Merge branch 'cassandra-2.2' into cassandra-3.0

commit: e8745effa0140202684f6c7f4fc6e5c18d96b295 [log] [tgz]
author: Mick Semb Wever <mck@apache.org> Tue Jul 14 22:06:20 2020 +0200
committer: Mick Semb Wever <mck@apache.org> Tue Jul 14 22:06:20 2020 +0200
tree: e69733855b6f8a82eb7c3ceec294772ae4d51a30
parent: 4f11cb9feebdb0017de1e1063cbf72b840567518 [diff]
parent: cd006d275aa9b6e937c6ebd036d4d27c4ed18dbe [diff]
diff --git a/.circleci/config-2_1.yml b/.circleci/config-2_1.yml
index e90ef4a..f2c4f50 100644
--- a/.circleci/config-2_1.yml
+++ b/.circleci/config-2_1.yml

@@ -29,6 +29,12 @@
     #exec_resource_class: xlarge
   parallelism: 1
 
+j8_medium_par_executor: &j8_medium_par_executor
+  executor:
+    name: java8-executor
+    #exec_resource_class: xlarge
+  parallelism: 1
+
 j8_seq_executor: &j8_seq_executor
   executor:
     name: java8-executor
@@ -60,6 +66,15 @@
             - utests_compression:
                 requires:
                   - start_utests_compression
+            - start_jvm_upgrade_dtest:
+                type: approval
+            - j8_dtest_jars_build:
+                requires:
+                  - build
+                  - start_jvm_upgrade_dtest
+            - j8_jvm_upgrade_dtests:
+                requires:
+                  - j8_dtest_jars_build
             # Java 8 dtests (on request)
             - start_j8_dtests:
                 type: approval
@@ -119,12 +134,25 @@
       - log_environment
       - clone_cassandra
       - build_cassandra
+      - run_eclipse_warnings
       - persist_to_workspace:
             root: /home/cassandra
             paths:
                 - cassandra
                 - .m2
 
+  j8_dtest_jars_build:
+    executor: java8-executor
+    parallelism: 1
+    steps:
+      - attach_workspace:
+          at: /home/cassandra
+      - build_cassandra_dtest_jars
+      - persist_to_workspace:
+          root: /home/cassandra
+          paths:
+            - dtest_jars
+
   j8_unit_tests:
     <<: *j8_par_executor
     steps:
@@ -135,7 +163,7 @@
       - run_parallel_junit_tests
 
   j8_jvm_dtests:
-    <<: *j8_small_par_executor
+    <<: *j8_seq_executor
     steps:
       - attach_workspace:
           at: /home/cassandra
@@ -146,6 +174,18 @@
       - run_parallel_junit_tests:
           classlistprefix: distributed
 
+  j8_jvm_upgrade_dtests:
+    <<: *j8_seq_executor
+    steps:
+      - attach_workspace:
+          at: /home/cassandra
+      - create_junit_containers:
+          classlistprefix: distributed
+          extra_filters: "| grep upgrade"
+      - log_environment
+      - run_parallel_junit_tests:
+          classlistprefix: distributed
+
   utests_long:
     <<: *j8_seq_executor
     steps:
@@ -271,6 +311,47 @@
           fi
         no_output_timeout: 15m
 
+  build_cassandra_dtest_jars:
+    steps:
+    - run:
+        name: Build Cassandra DTest jars
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          git remote add apache git://github.com/apache/cassandra.git
+          for branch in cassandra-2.2 cassandra-3.0 cassandra-3.11 trunk; do
+            # check out the correct cassandra version:
+            git remote set-branches --add apache '$branch'
+            git fetch --depth 1 apache $branch
+            git checkout $branch
+            # Loop to prevent failure due to maven-ant-tasks not downloading a jar..
+            for x in $(seq 1 3); do
+                ${ANT_HOME}/bin/ant clean jar dtest-jar
+                RETURN="$?"
+                if [ "${RETURN}" -eq "0" ]; then
+                    break
+                fi
+            done
+            # Exit, if we didn't build successfully
+            if [ "${RETURN}" -ne "0" ]; then
+                echo "Build failed with exit code: ${RETURN}"
+                exit ${RETURN}
+            fi
+          done
+          # and build the dtest-jar for the branch under test
+          git checkout origin/$CIRCLE_BRANCH
+          for x in $(seq 1 3); do
+              ${ANT_HOME}/bin/ant clean jar dtest-jar
+              RETURN="$?"
+              if [ "${RETURN}" -eq "0" ]; then
+                  break
+              fi
+          done
+          mkdir ~/dtest_jars
+          cp build/dtest*.jar ~/dtest_jars
+          ls -l ~/dtest_jars
+        no_output_timeout: 15m
+
   run_eclipse_warnings:
     steps:
     - run:
@@ -326,6 +407,9 @@
           export PATH=$JAVA_HOME/bin:$PATH
           time mv ~/cassandra /tmp
           cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
           ant clean <<parameters.target>>
         no_output_timeout: <<parameters.no_output_timeout>>
     - store_test_results:
@@ -358,6 +442,9 @@
           export PATH=$JAVA_HOME/bin:$PATH
           time mv ~/cassandra /tmp
           cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
           test_timeout=$(grep 'name="test.<<parameters.classlistprefix>>.timeout"' build.xml | awk -F'"' '{print $4}' || true)
           if [ -z "$test_timeout" ]; then
             test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')

diff --git a/.circleci/config-2_1.yml.high_res.patch b/.circleci/config-2_1.yml.high_res.patch
index 847a08f..3c85668 100644
--- a/.circleci/config-2_1.yml.high_res.patch
+++ b/.circleci/config-2_1.yml.high_res.patch

@@ -16,7 +16,13 @@
 ---
 >     exec_resource_class: xlarge
 >   parallelism: 2
-35c35
+35,36c35,36
+<     #exec_resource_class: xlarge
+<   parallelism: 1
+---
+>     exec_resource_class: xlarge
+>   parallelism: 2
+41c41
 <     #exec_resource_class: xlarge
 ---
 >     exec_resource_class: xlarge

diff --git a/.circleci/config.yml b/.circleci/config.yml
index d5efe4f..3c62b4a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml

@@ -1,5 +1,97 @@
 version: 2
 jobs:
+  j8_jvm_upgrade_dtests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine distributed Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
   build:
     docker:
     - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
@@ -53,6 +145,12 @@
               exit ${RETURN}
           fi
         no_output_timeout: 15m
+    - run:
+        name: Run eclipse-warnings
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          ant eclipse-warnings
     - persist_to_workspace:
         root: /home/cassandra
         paths:
@@ -75,374 +173,6 @@
     - CCM_HEAP_NEWSIZE: 256M
     - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
     - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_unit_tests:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 4
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine unit Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_jvm_dtests:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 1
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine distributed Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep -v upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  utests_long:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 1
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Run Unit Tests (long-test)
-        command: |
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          ant clean long-test
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  utests_compression:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 4
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine unit Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist-compression)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist-compression -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_dtests-with-vnodes:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 4
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Clone Cassandra dtest Repository (via git)
-        command: |
-          git clone --single-branch --branch $DTEST_BRANCH --depth 1 $DTEST_REPO ~/cassandra-dtest
-    - run:
-        name: Configure virtualenv and python Dependencies
-        command: |
-          # note, this should be super quick as all dependencies should be pre-installed in the docker image
-          # if additional dependencies were added to requirmeents.txt and the docker image hasn't been updated
-          # we'd have to install it here at runtime -- which will make things slow, so do yourself a favor and
-          # rebuild the docker image! (it automatically pulls the latest requirements.txt on build)
-          source ~/env/bin/activate
-          export PATH=$JAVA_HOME/bin:$PATH
-          pip3 install --upgrade -r ~/cassandra-dtest/requirements.txt
-          pip3 freeze
-    - run:
-        name: Determine Tests to Run (j8_with_vnodes)
-        no_output_timeout: 5m
-        command: "# reminder: this code (along with all the steps) is independently executed on every circle container\n# so the goal here is to get the circleci script to return the tests *this* container will run\n# which we do via the `circleci` cli tool.\n\ncd cassandra-dtest\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\n\nif [ -n '' ]; then\n  export \nfi\n\necho \"***Collected DTests (j8_with_vnodes)***\"\nset -eo pipefail && ./run_dtests.py --use-vnodes --skip-resource-intensive-tests --dtest-print-tests-only --dtest-print-tests-output=/tmp/all_dtest_tests_j8_with_vnodes_raw --cassandra-dir=../cassandra\nif [ -z '' ]; then\n  mv /tmp/all_dtest_tests_j8_with_vnodes_raw /tmp/all_dtest_tests_j8_with_vnodes\nelse\n  grep -e '' /tmp/all_dtest_tests_j8_with_vnodes_raw > /tmp/all_dtest_tests_j8_with_vnodes || { echo \"Filter did not match any tests! Exiting build.\"; exit 0; }\nfi\nset -eo pipefail && circleci tests split --split-by=timings --timings-type=classname /tmp/all_dtest_tests_j8_with_vnodes > /tmp/split_dtest_tests_j8_with_vnodes.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes.txt | tr '\\n' ' ' > /tmp/split_dtest_tests_j8_with_vnodes_final.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n"
-    - run:
-        name: Run dtests (j8_with_vnodes)
-        no_output_timeout: 15m
-        command: "echo \"cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\"\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\nif [ -n '' ]; then\n  export \nfi\n\njava -version\ncd ~/cassandra-dtest\nmkdir -p /tmp/dtest\n\necho \"env: $(env)\"\necho \"** done env\"\nmkdir -p /tmp/results/dtests\n# we need the \"set -o pipefail\" here so that the exit code that circleci will actually use is from pytest and not the exit code from tee\nexport SPLIT_TESTS=`cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt`\nset -o pipefail && cd ~/cassandra-dtest && pytest --use-vnodes --num-tokens=32 --skip-resource-intensive-tests --log-level=\"INFO\" --junit-xml=/tmp/results/dtests/pytest_result_j8_with_vnodes.xml -s --cassandra-dir=/home/cassandra/cassandra --keep-test-dir $SPLIT_TESTS 2>&1 | tee /tmp/dtest/stdout.txt\n"
-    - store_test_results:
-        path: /tmp/results
-    - store_artifacts:
-        path: /tmp/dtest
-        destination: dtest_j8_with_vnodes
-    - store_artifacts:
-        path: ~/cassandra-dtest/logs
-        destination: dtest_j8_with_vnodes_logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
   j8_dtests-no-vnodes:
     docker:
     - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
@@ -600,6 +330,455 @@
     - CCM_HEAP_NEWSIZE: 256M
     - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
     - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_unit_tests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 4
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine unit Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_dtests-with-vnodes:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 4
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Clone Cassandra dtest Repository (via git)
+        command: |
+          git clone --single-branch --branch $DTEST_BRANCH --depth 1 $DTEST_REPO ~/cassandra-dtest
+    - run:
+        name: Configure virtualenv and python Dependencies
+        command: |
+          # note, this should be super quick as all dependencies should be pre-installed in the docker image
+          # if additional dependencies were added to requirmeents.txt and the docker image hasn't been updated
+          # we'd have to install it here at runtime -- which will make things slow, so do yourself a favor and
+          # rebuild the docker image! (it automatically pulls the latest requirements.txt on build)
+          source ~/env/bin/activate
+          export PATH=$JAVA_HOME/bin:$PATH
+          pip3 install --upgrade -r ~/cassandra-dtest/requirements.txt
+          pip3 freeze
+    - run:
+        name: Determine Tests to Run (j8_with_vnodes)
+        no_output_timeout: 5m
+        command: "# reminder: this code (along with all the steps) is independently executed on every circle container\n# so the goal here is to get the circleci script to return the tests *this* container will run\n# which we do via the `circleci` cli tool.\n\ncd cassandra-dtest\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\n\nif [ -n '' ]; then\n  export \nfi\n\necho \"***Collected DTests (j8_with_vnodes)***\"\nset -eo pipefail && ./run_dtests.py --use-vnodes --skip-resource-intensive-tests --dtest-print-tests-only --dtest-print-tests-output=/tmp/all_dtest_tests_j8_with_vnodes_raw --cassandra-dir=../cassandra\nif [ -z '' ]; then\n  mv /tmp/all_dtest_tests_j8_with_vnodes_raw /tmp/all_dtest_tests_j8_with_vnodes\nelse\n  grep -e '' /tmp/all_dtest_tests_j8_with_vnodes_raw > /tmp/all_dtest_tests_j8_with_vnodes || { echo \"Filter did not match any tests! Exiting build.\"; exit 0; }\nfi\nset -eo pipefail && circleci tests split --split-by=timings --timings-type=classname /tmp/all_dtest_tests_j8_with_vnodes > /tmp/split_dtest_tests_j8_with_vnodes.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes.txt | tr '\\n' ' ' > /tmp/split_dtest_tests_j8_with_vnodes_final.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n"
+    - run:
+        name: Run dtests (j8_with_vnodes)
+        no_output_timeout: 15m
+        command: "echo \"cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\"\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\nif [ -n '' ]; then\n  export \nfi\n\njava -version\ncd ~/cassandra-dtest\nmkdir -p /tmp/dtest\n\necho \"env: $(env)\"\necho \"** done env\"\nmkdir -p /tmp/results/dtests\n# we need the \"set -o pipefail\" here so that the exit code that circleci will actually use is from pytest and not the exit code from tee\nexport SPLIT_TESTS=`cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt`\nset -o pipefail && cd ~/cassandra-dtest && pytest --use-vnodes --num-tokens=32 --skip-resource-intensive-tests --log-level=\"INFO\" --junit-xml=/tmp/results/dtests/pytest_result_j8_with_vnodes.xml -s --cassandra-dir=/home/cassandra/cassandra --keep-test-dir $SPLIT_TESTS 2>&1 | tee /tmp/dtest/stdout.txt\n"
+    - store_test_results:
+        path: /tmp/results
+    - store_artifacts:
+        path: /tmp/dtest
+        destination: dtest_j8_with_vnodes
+    - store_artifacts:
+        path: ~/cassandra-dtest/logs
+        destination: dtest_j8_with_vnodes_logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_jvm_dtests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine distributed Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep -v upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  utests_long:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Run Unit Tests (long-test)
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          ant clean long-test
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  utests_compression:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 4
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine unit Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist-compression)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist-compression -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_dtest_jars_build:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Build Cassandra DTest jars
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          git remote add apache git://github.com/apache/cassandra.git
+          for branch in cassandra-2.2 cassandra-3.0 cassandra-3.11 trunk; do
+            # check out the correct cassandra version:
+            git remote set-branches --add apache '$branch'
+            git fetch --depth 1 apache $branch
+            git checkout $branch
+            # Loop to prevent failure due to maven-ant-tasks not downloading a jar..
+            for x in $(seq 1 3); do
+                ${ANT_HOME}/bin/ant clean jar dtest-jar
+                RETURN="$?"
+                if [ "${RETURN}" -eq "0" ]; then
+                    break
+                fi
+            done
+            # Exit, if we didn't build successfully
+            if [ "${RETURN}" -ne "0" ]; then
+                echo "Build failed with exit code: ${RETURN}"
+                exit ${RETURN}
+            fi
+          done
+          # and build the dtest-jar for the branch under test
+          git checkout origin/$CIRCLE_BRANCH
+          for x in $(seq 1 3); do
+              ${ANT_HOME}/bin/ant clean jar dtest-jar
+              RETURN="$?"
+              if [ "${RETURN}" -eq "0" ]; then
+                  break
+              fi
+          done
+          mkdir ~/dtest_jars
+          cp build/dtest*.jar ~/dtest_jars
+          ls -l ~/dtest_jars
+        no_output_timeout: 15m
+    - persist_to_workspace:
+        root: /home/cassandra
+        paths:
+        - dtest_jars
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
 workflows:
   version: 2
   build_and_run_tests:
@@ -625,6 +804,15 @@
     - utests_compression:
         requires:
         - start_utests_compression
+    - start_jvm_upgrade_dtest:
+        type: approval
+    - j8_dtest_jars_build:
+        requires:
+        - build
+        - start_jvm_upgrade_dtest
+    - j8_jvm_upgrade_dtests:
+        requires:
+        - j8_dtest_jars_build
     - start_j8_dtests:
         type: approval
         requires:

diff --git a/.circleci/config.yml.HIGHRES b/.circleci/config.yml.HIGHRES
index 77a396d..81802ad 100644
--- a/.circleci/config.yml.HIGHRES
+++ b/.circleci/config.yml.HIGHRES

@@ -1,5 +1,97 @@
 version: 2
 jobs:
+  j8_jvm_upgrade_dtests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: xlarge
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine distributed Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
   build:
     docker:
     - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
@@ -53,6 +145,12 @@
               exit ${RETURN}
           fi
         no_output_timeout: 15m
+    - run:
+        name: Run eclipse-warnings
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          ant eclipse-warnings
     - persist_to_workspace:
         root: /home/cassandra
         paths:
@@ -75,374 +173,6 @@
     - CCM_HEAP_NEWSIZE: 512M
     - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
     - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_unit_tests:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: xlarge
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 100
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine unit Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 2048M
-    - CCM_HEAP_NEWSIZE: 512M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_jvm_dtests:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: xlarge
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 2
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine distributed Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep -v upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 2048M
-    - CCM_HEAP_NEWSIZE: 512M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  utests_long:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: xlarge
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 1
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Run Unit Tests (long-test)
-        command: |
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          ant clean long-test
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 2048M
-    - CCM_HEAP_NEWSIZE: 512M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  utests_compression:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: xlarge
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 100
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine unit Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist-compression)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist-compression -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 2048M
-    - CCM_HEAP_NEWSIZE: 512M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_dtests-with-vnodes:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: xlarge
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 100
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Clone Cassandra dtest Repository (via git)
-        command: |
-          git clone --single-branch --branch $DTEST_BRANCH --depth 1 $DTEST_REPO ~/cassandra-dtest
-    - run:
-        name: Configure virtualenv and python Dependencies
-        command: |
-          # note, this should be super quick as all dependencies should be pre-installed in the docker image
-          # if additional dependencies were added to requirmeents.txt and the docker image hasn't been updated
-          # we'd have to install it here at runtime -- which will make things slow, so do yourself a favor and
-          # rebuild the docker image! (it automatically pulls the latest requirements.txt on build)
-          source ~/env/bin/activate
-          export PATH=$JAVA_HOME/bin:$PATH
-          pip3 install --upgrade -r ~/cassandra-dtest/requirements.txt
-          pip3 freeze
-    - run:
-        name: Determine Tests to Run (j8_with_vnodes)
-        no_output_timeout: 5m
-        command: "# reminder: this code (along with all the steps) is independently executed on every circle container\n# so the goal here is to get the circleci script to return the tests *this* container will run\n# which we do via the `circleci` cli tool.\n\ncd cassandra-dtest\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\n\nif [ -n '' ]; then\n  export \nfi\n\necho \"***Collected DTests (j8_with_vnodes)***\"\nset -eo pipefail && ./run_dtests.py --use-vnodes --skip-resource-intensive-tests --dtest-print-tests-only --dtest-print-tests-output=/tmp/all_dtest_tests_j8_with_vnodes_raw --cassandra-dir=../cassandra\nif [ -z '' ]; then\n  mv /tmp/all_dtest_tests_j8_with_vnodes_raw /tmp/all_dtest_tests_j8_with_vnodes\nelse\n  grep -e '' /tmp/all_dtest_tests_j8_with_vnodes_raw > /tmp/all_dtest_tests_j8_with_vnodes || { echo \"Filter did not match any tests! Exiting build.\"; exit 0; }\nfi\nset -eo pipefail && circleci tests split --split-by=timings --timings-type=classname /tmp/all_dtest_tests_j8_with_vnodes > /tmp/split_dtest_tests_j8_with_vnodes.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes.txt | tr '\\n' ' ' > /tmp/split_dtest_tests_j8_with_vnodes_final.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n"
-    - run:
-        name: Run dtests (j8_with_vnodes)
-        no_output_timeout: 15m
-        command: "echo \"cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\"\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\nif [ -n '' ]; then\n  export \nfi\n\njava -version\ncd ~/cassandra-dtest\nmkdir -p /tmp/dtest\n\necho \"env: $(env)\"\necho \"** done env\"\nmkdir -p /tmp/results/dtests\n# we need the \"set -o pipefail\" here so that the exit code that circleci will actually use is from pytest and not the exit code from tee\nexport SPLIT_TESTS=`cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt`\nset -o pipefail && cd ~/cassandra-dtest && pytest --use-vnodes --num-tokens=32 --skip-resource-intensive-tests --log-level=\"INFO\" --junit-xml=/tmp/results/dtests/pytest_result_j8_with_vnodes.xml -s --cassandra-dir=/home/cassandra/cassandra --keep-test-dir $SPLIT_TESTS 2>&1 | tee /tmp/dtest/stdout.txt\n"
-    - store_test_results:
-        path: /tmp/results
-    - store_artifacts:
-        path: /tmp/dtest
-        destination: dtest_j8_with_vnodes
-    - store_artifacts:
-        path: ~/cassandra-dtest/logs
-        destination: dtest_j8_with_vnodes_logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 2048M
-    - CCM_HEAP_NEWSIZE: 512M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
   j8_dtests-no-vnodes:
     docker:
     - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
@@ -600,6 +330,455 @@
     - CCM_HEAP_NEWSIZE: 512M
     - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
     - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_unit_tests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: xlarge
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 100
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine unit Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_dtests-with-vnodes:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: xlarge
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 100
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Clone Cassandra dtest Repository (via git)
+        command: |
+          git clone --single-branch --branch $DTEST_BRANCH --depth 1 $DTEST_REPO ~/cassandra-dtest
+    - run:
+        name: Configure virtualenv and python Dependencies
+        command: |
+          # note, this should be super quick as all dependencies should be pre-installed in the docker image
+          # if additional dependencies were added to requirmeents.txt and the docker image hasn't been updated
+          # we'd have to install it here at runtime -- which will make things slow, so do yourself a favor and
+          # rebuild the docker image! (it automatically pulls the latest requirements.txt on build)
+          source ~/env/bin/activate
+          export PATH=$JAVA_HOME/bin:$PATH
+          pip3 install --upgrade -r ~/cassandra-dtest/requirements.txt
+          pip3 freeze
+    - run:
+        name: Determine Tests to Run (j8_with_vnodes)
+        no_output_timeout: 5m
+        command: "# reminder: this code (along with all the steps) is independently executed on every circle container\n# so the goal here is to get the circleci script to return the tests *this* container will run\n# which we do via the `circleci` cli tool.\n\ncd cassandra-dtest\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\n\nif [ -n '' ]; then\n  export \nfi\n\necho \"***Collected DTests (j8_with_vnodes)***\"\nset -eo pipefail && ./run_dtests.py --use-vnodes --skip-resource-intensive-tests --dtest-print-tests-only --dtest-print-tests-output=/tmp/all_dtest_tests_j8_with_vnodes_raw --cassandra-dir=../cassandra\nif [ -z '' ]; then\n  mv /tmp/all_dtest_tests_j8_with_vnodes_raw /tmp/all_dtest_tests_j8_with_vnodes\nelse\n  grep -e '' /tmp/all_dtest_tests_j8_with_vnodes_raw > /tmp/all_dtest_tests_j8_with_vnodes || { echo \"Filter did not match any tests! Exiting build.\"; exit 0; }\nfi\nset -eo pipefail && circleci tests split --split-by=timings --timings-type=classname /tmp/all_dtest_tests_j8_with_vnodes > /tmp/split_dtest_tests_j8_with_vnodes.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes.txt | tr '\\n' ' ' > /tmp/split_dtest_tests_j8_with_vnodes_final.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n"
+    - run:
+        name: Run dtests (j8_with_vnodes)
+        no_output_timeout: 15m
+        command: "echo \"cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\"\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\nif [ -n '' ]; then\n  export \nfi\n\njava -version\ncd ~/cassandra-dtest\nmkdir -p /tmp/dtest\n\necho \"env: $(env)\"\necho \"** done env\"\nmkdir -p /tmp/results/dtests\n# we need the \"set -o pipefail\" here so that the exit code that circleci will actually use is from pytest and not the exit code from tee\nexport SPLIT_TESTS=`cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt`\nset -o pipefail && cd ~/cassandra-dtest && pytest --use-vnodes --num-tokens=32 --skip-resource-intensive-tests --log-level=\"INFO\" --junit-xml=/tmp/results/dtests/pytest_result_j8_with_vnodes.xml -s --cassandra-dir=/home/cassandra/cassandra --keep-test-dir $SPLIT_TESTS 2>&1 | tee /tmp/dtest/stdout.txt\n"
+    - store_test_results:
+        path: /tmp/results
+    - store_artifacts:
+        path: /tmp/dtest
+        destination: dtest_j8_with_vnodes
+    - store_artifacts:
+        path: ~/cassandra-dtest/logs
+        destination: dtest_j8_with_vnodes_logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_jvm_dtests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: xlarge
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine distributed Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep -v upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  utests_long:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: xlarge
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Run Unit Tests (long-test)
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          ant clean long-test
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  utests_compression:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: xlarge
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 100
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine unit Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist-compression)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist-compression -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_dtest_jars_build:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Build Cassandra DTest jars
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          git remote add apache git://github.com/apache/cassandra.git
+          for branch in cassandra-2.2 cassandra-3.0 cassandra-3.11 trunk; do
+            # check out the correct cassandra version:
+            git remote set-branches --add apache '$branch'
+            git fetch --depth 1 apache $branch
+            git checkout $branch
+            # Loop to prevent failure due to maven-ant-tasks not downloading a jar..
+            for x in $(seq 1 3); do
+                ${ANT_HOME}/bin/ant clean jar dtest-jar
+                RETURN="$?"
+                if [ "${RETURN}" -eq "0" ]; then
+                    break
+                fi
+            done
+            # Exit, if we didn't build successfully
+            if [ "${RETURN}" -ne "0" ]; then
+                echo "Build failed with exit code: ${RETURN}"
+                exit ${RETURN}
+            fi
+          done
+          # and build the dtest-jar for the branch under test
+          git checkout origin/$CIRCLE_BRANCH
+          for x in $(seq 1 3); do
+              ${ANT_HOME}/bin/ant clean jar dtest-jar
+              RETURN="$?"
+              if [ "${RETURN}" -eq "0" ]; then
+                  break
+              fi
+          done
+          mkdir ~/dtest_jars
+          cp build/dtest*.jar ~/dtest_jars
+          ls -l ~/dtest_jars
+        no_output_timeout: 15m
+    - persist_to_workspace:
+        root: /home/cassandra
+        paths:
+        - dtest_jars
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 2048M
+    - CCM_HEAP_NEWSIZE: 512M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
 workflows:
   version: 2
   build_and_run_tests:
@@ -625,6 +804,15 @@
     - utests_compression:
         requires:
         - start_utests_compression
+    - start_jvm_upgrade_dtest:
+        type: approval
+    - j8_dtest_jars_build:
+        requires:
+        - build
+        - start_jvm_upgrade_dtest
+    - j8_jvm_upgrade_dtests:
+        requires:
+        - j8_dtest_jars_build
     - start_j8_dtests:
         type: approval
         requires:

diff --git a/.circleci/config.yml.LOWRES b/.circleci/config.yml.LOWRES
index d5efe4f..3c62b4a 100644
--- a/.circleci/config.yml.LOWRES
+++ b/.circleci/config.yml.LOWRES

@@ -1,5 +1,97 @@
 version: 2
 jobs:
+  j8_jvm_upgrade_dtests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine distributed Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
   build:
     docker:
     - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
@@ -53,6 +145,12 @@
               exit ${RETURN}
           fi
         no_output_timeout: 15m
+    - run:
+        name: Run eclipse-warnings
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          ant eclipse-warnings
     - persist_to_workspace:
         root: /home/cassandra
         paths:
@@ -75,374 +173,6 @@
     - CCM_HEAP_NEWSIZE: 256M
     - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
     - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_unit_tests:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 4
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine unit Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_jvm_dtests:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 1
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine distributed Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep -v upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  utests_long:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 1
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Run Unit Tests (long-test)
-        command: |
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          ant clean long-test
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  utests_compression:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 4
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Determine unit Tests to Run
-        command: |
-          # reminder: this code (along with all the steps) is independently executed on every circle container
-          # so the goal here is to get the circleci script to return the tests *this* container will run
-          # which we do via the `circleci` cli tool.
-
-          rm -fr ~/cassandra-dtest/upgrade_tests
-          echo "***java tests***"
-
-          # get all of our unit test filenames
-          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
-
-          # split up the unit tests into groups based on the number of containers we have
-          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
-          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
-          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
-        no_output_timeout: 15m
-    - run:
-        name: Log Environment Information
-        command: |
-          echo '*** id ***'
-          id
-          echo '*** cat /proc/cpuinfo ***'
-          cat /proc/cpuinfo
-          echo '*** free -m ***'
-          free -m
-          echo '*** df -m ***'
-          df -m
-          echo '*** ifconfig -a ***'
-          ifconfig -a
-          echo '*** uname -a ***'
-          uname -a
-          echo '*** mount ***'
-          mount
-          echo '*** env ***'
-          env
-          echo '*** java ***'
-          which java
-          java -version
-    - run:
-        name: Run Unit Tests (testclasslist-compression)
-        command: |
-          set -x
-          export PATH=$JAVA_HOME/bin:$PATH
-          time mv ~/cassandra /tmp
-          cd /tmp/cassandra
-          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
-          if [ -z "$test_timeout" ]; then
-            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
-          fi
-          ant testclasslist-compression -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
-        no_output_timeout: 15m
-    - store_test_results:
-        path: /tmp/cassandra/build/test/output/
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/output
-        destination: junitxml
-    - store_artifacts:
-        path: /tmp/cassandra/build/test/logs
-        destination: logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-  j8_dtests-with-vnodes:
-    docker:
-    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
-    resource_class: medium
-    working_directory: ~/
-    shell: /bin/bash -eo pipefail -l
-    parallelism: 4
-    steps:
-    - attach_workspace:
-        at: /home/cassandra
-    - run:
-        name: Clone Cassandra dtest Repository (via git)
-        command: |
-          git clone --single-branch --branch $DTEST_BRANCH --depth 1 $DTEST_REPO ~/cassandra-dtest
-    - run:
-        name: Configure virtualenv and python Dependencies
-        command: |
-          # note, this should be super quick as all dependencies should be pre-installed in the docker image
-          # if additional dependencies were added to requirmeents.txt and the docker image hasn't been updated
-          # we'd have to install it here at runtime -- which will make things slow, so do yourself a favor and
-          # rebuild the docker image! (it automatically pulls the latest requirements.txt on build)
-          source ~/env/bin/activate
-          export PATH=$JAVA_HOME/bin:$PATH
-          pip3 install --upgrade -r ~/cassandra-dtest/requirements.txt
-          pip3 freeze
-    - run:
-        name: Determine Tests to Run (j8_with_vnodes)
-        no_output_timeout: 5m
-        command: "# reminder: this code (along with all the steps) is independently executed on every circle container\n# so the goal here is to get the circleci script to return the tests *this* container will run\n# which we do via the `circleci` cli tool.\n\ncd cassandra-dtest\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\n\nif [ -n '' ]; then\n  export \nfi\n\necho \"***Collected DTests (j8_with_vnodes)***\"\nset -eo pipefail && ./run_dtests.py --use-vnodes --skip-resource-intensive-tests --dtest-print-tests-only --dtest-print-tests-output=/tmp/all_dtest_tests_j8_with_vnodes_raw --cassandra-dir=../cassandra\nif [ -z '' ]; then\n  mv /tmp/all_dtest_tests_j8_with_vnodes_raw /tmp/all_dtest_tests_j8_with_vnodes\nelse\n  grep -e '' /tmp/all_dtest_tests_j8_with_vnodes_raw > /tmp/all_dtest_tests_j8_with_vnodes || { echo \"Filter did not match any tests! Exiting build.\"; exit 0; }\nfi\nset -eo pipefail && circleci tests split --split-by=timings --timings-type=classname /tmp/all_dtest_tests_j8_with_vnodes > /tmp/split_dtest_tests_j8_with_vnodes.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes.txt | tr '\\n' ' ' > /tmp/split_dtest_tests_j8_with_vnodes_final.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n"
-    - run:
-        name: Run dtests (j8_with_vnodes)
-        no_output_timeout: 15m
-        command: "echo \"cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\"\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\nif [ -n '' ]; then\n  export \nfi\n\njava -version\ncd ~/cassandra-dtest\nmkdir -p /tmp/dtest\n\necho \"env: $(env)\"\necho \"** done env\"\nmkdir -p /tmp/results/dtests\n# we need the \"set -o pipefail\" here so that the exit code that circleci will actually use is from pytest and not the exit code from tee\nexport SPLIT_TESTS=`cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt`\nset -o pipefail && cd ~/cassandra-dtest && pytest --use-vnodes --num-tokens=32 --skip-resource-intensive-tests --log-level=\"INFO\" --junit-xml=/tmp/results/dtests/pytest_result_j8_with_vnodes.xml -s --cassandra-dir=/home/cassandra/cassandra --keep-test-dir $SPLIT_TESTS 2>&1 | tee /tmp/dtest/stdout.txt\n"
-    - store_test_results:
-        path: /tmp/results
-    - store_artifacts:
-        path: /tmp/dtest
-        destination: dtest_j8_with_vnodes
-    - store_artifacts:
-        path: ~/cassandra-dtest/logs
-        destination: dtest_j8_with_vnodes_logs
-    environment:
-    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - ANT_HOME: /usr/share/ant
-    - LANG: en_US.UTF-8
-    - KEEP_TEST_DIR: true
-    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
-    - PYTHONIOENCODING: utf-8
-    - PYTHONUNBUFFERED: true
-    - CASS_DRIVER_NO_EXTENSIONS: true
-    - CASS_DRIVER_NO_CYTHON: true
-    - CASSANDRA_SKIP_SYNC: true
-    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
-    - DTEST_BRANCH: master
-    - CCM_MAX_HEAP_SIZE: 1024M
-    - CCM_HEAP_NEWSIZE: 256M
-    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
-    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
   j8_dtests-no-vnodes:
     docker:
     - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
@@ -600,6 +330,455 @@
     - CCM_HEAP_NEWSIZE: 256M
     - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
     - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_unit_tests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 4
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine unit Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_dtests-with-vnodes:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 4
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Clone Cassandra dtest Repository (via git)
+        command: |
+          git clone --single-branch --branch $DTEST_BRANCH --depth 1 $DTEST_REPO ~/cassandra-dtest
+    - run:
+        name: Configure virtualenv and python Dependencies
+        command: |
+          # note, this should be super quick as all dependencies should be pre-installed in the docker image
+          # if additional dependencies were added to requirmeents.txt and the docker image hasn't been updated
+          # we'd have to install it here at runtime -- which will make things slow, so do yourself a favor and
+          # rebuild the docker image! (it automatically pulls the latest requirements.txt on build)
+          source ~/env/bin/activate
+          export PATH=$JAVA_HOME/bin:$PATH
+          pip3 install --upgrade -r ~/cassandra-dtest/requirements.txt
+          pip3 freeze
+    - run:
+        name: Determine Tests to Run (j8_with_vnodes)
+        no_output_timeout: 5m
+        command: "# reminder: this code (along with all the steps) is independently executed on every circle container\n# so the goal here is to get the circleci script to return the tests *this* container will run\n# which we do via the `circleci` cli tool.\n\ncd cassandra-dtest\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\n\nif [ -n '' ]; then\n  export \nfi\n\necho \"***Collected DTests (j8_with_vnodes)***\"\nset -eo pipefail && ./run_dtests.py --use-vnodes --skip-resource-intensive-tests --dtest-print-tests-only --dtest-print-tests-output=/tmp/all_dtest_tests_j8_with_vnodes_raw --cassandra-dir=../cassandra\nif [ -z '' ]; then\n  mv /tmp/all_dtest_tests_j8_with_vnodes_raw /tmp/all_dtest_tests_j8_with_vnodes\nelse\n  grep -e '' /tmp/all_dtest_tests_j8_with_vnodes_raw > /tmp/all_dtest_tests_j8_with_vnodes || { echo \"Filter did not match any tests! Exiting build.\"; exit 0; }\nfi\nset -eo pipefail && circleci tests split --split-by=timings --timings-type=classname /tmp/all_dtest_tests_j8_with_vnodes > /tmp/split_dtest_tests_j8_with_vnodes.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes.txt | tr '\\n' ' ' > /tmp/split_dtest_tests_j8_with_vnodes_final.txt\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n"
+    - run:
+        name: Run dtests (j8_with_vnodes)
+        no_output_timeout: 15m
+        command: "echo \"cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\"\ncat /tmp/split_dtest_tests_j8_with_vnodes_final.txt\n\nsource ~/env/bin/activate\nexport PATH=$JAVA_HOME/bin:$PATH\nif [ -n '' ]; then\n  export \nfi\n\njava -version\ncd ~/cassandra-dtest\nmkdir -p /tmp/dtest\n\necho \"env: $(env)\"\necho \"** done env\"\nmkdir -p /tmp/results/dtests\n# we need the \"set -o pipefail\" here so that the exit code that circleci will actually use is from pytest and not the exit code from tee\nexport SPLIT_TESTS=`cat /tmp/split_dtest_tests_j8_with_vnodes_final.txt`\nset -o pipefail && cd ~/cassandra-dtest && pytest --use-vnodes --num-tokens=32 --skip-resource-intensive-tests --log-level=\"INFO\" --junit-xml=/tmp/results/dtests/pytest_result_j8_with_vnodes.xml -s --cassandra-dir=/home/cassandra/cassandra --keep-test-dir $SPLIT_TESTS 2>&1 | tee /tmp/dtest/stdout.txt\n"
+    - store_test_results:
+        path: /tmp/results
+    - store_artifacts:
+        path: /tmp/dtest
+        destination: dtest_j8_with_vnodes
+    - store_artifacts:
+        path: ~/cassandra-dtest/logs
+        destination: dtest_j8_with_vnodes_logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_jvm_dtests:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine distributed Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/distributed/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/distributed/;;g" | grep "Test\.java$" | grep -v upgrade > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.distributed.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=distributed
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  utests_long:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Run Unit Tests (long-test)
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          ant clean long-test
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  utests_compression:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 4
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Determine unit Tests to Run
+        command: |
+          # reminder: this code (along with all the steps) is independently executed on every circle container
+          # so the goal here is to get the circleci script to return the tests *this* container will run
+          # which we do via the `circleci` cli tool.
+
+          rm -fr ~/cassandra-dtest/upgrade_tests
+          echo "***java tests***"
+
+          # get all of our unit test filenames
+          set -eo pipefail && circleci tests glob "$HOME/cassandra/test/unit/**/*.java" > /tmp/all_java_unit_tests.txt
+
+          # split up the unit tests into groups based on the number of containers we have
+          set -eo pipefail && circleci tests split --split-by=timings --timings-type=filename --index=${CIRCLE_NODE_INDEX} --total=${CIRCLE_NODE_TOTAL} /tmp/all_java_unit_tests.txt > /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt
+          set -eo pipefail && cat /tmp/java_tests_${CIRCLE_NODE_INDEX}.txt | sed "s;^/home/cassandra/cassandra/test/unit/;;g" | grep "Test\.java$"  > /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+          echo "** /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt"
+          cat /tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt
+        no_output_timeout: 15m
+    - run:
+        name: Log Environment Information
+        command: |
+          echo '*** id ***'
+          id
+          echo '*** cat /proc/cpuinfo ***'
+          cat /proc/cpuinfo
+          echo '*** free -m ***'
+          free -m
+          echo '*** df -m ***'
+          df -m
+          echo '*** ifconfig -a ***'
+          ifconfig -a
+          echo '*** uname -a ***'
+          uname -a
+          echo '*** mount ***'
+          mount
+          echo '*** env ***'
+          env
+          echo '*** java ***'
+          which java
+          java -version
+    - run:
+        name: Run Unit Tests (testclasslist-compression)
+        command: |
+          set -x
+          export PATH=$JAVA_HOME/bin:$PATH
+          time mv ~/cassandra /tmp
+          cd /tmp/cassandra
+          if [ -d ~/dtest_jars ]; then
+            cp ~/dtest_jars/dtest* /tmp/cassandra/build/
+          fi
+          test_timeout=$(grep 'name="test.unit.timeout"' build.xml | awk -F'"' '{print $4}' || true)
+          if [ -z "$test_timeout" ]; then
+            test_timeout=$(grep 'name="test.timeout"' build.xml | awk -F'"' '{print $4}')
+          fi
+          ant testclasslist-compression -Dtest.timeout="$test_timeout" -Dtest.classlistfile=/tmp/java_tests_${CIRCLE_NODE_INDEX}_final.txt  -Dtest.classlistprefix=unit
+        no_output_timeout: 15m
+    - store_test_results:
+        path: /tmp/cassandra/build/test/output/
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/output
+        destination: junitxml
+    - store_artifacts:
+        path: /tmp/cassandra/build/test/logs
+        destination: logs
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+  j8_dtest_jars_build:
+    docker:
+    - image: spod/cassandra-testing-ubuntu1810-java11-w-dependencies:20190306
+    resource_class: medium
+    working_directory: ~/
+    shell: /bin/bash -eo pipefail -l
+    parallelism: 1
+    steps:
+    - attach_workspace:
+        at: /home/cassandra
+    - run:
+        name: Build Cassandra DTest jars
+        command: |
+          export PATH=$JAVA_HOME/bin:$PATH
+          cd ~/cassandra
+          git remote add apache git://github.com/apache/cassandra.git
+          for branch in cassandra-2.2 cassandra-3.0 cassandra-3.11 trunk; do
+            # check out the correct cassandra version:
+            git remote set-branches --add apache '$branch'
+            git fetch --depth 1 apache $branch
+            git checkout $branch
+            # Loop to prevent failure due to maven-ant-tasks not downloading a jar..
+            for x in $(seq 1 3); do
+                ${ANT_HOME}/bin/ant clean jar dtest-jar
+                RETURN="$?"
+                if [ "${RETURN}" -eq "0" ]; then
+                    break
+                fi
+            done
+            # Exit, if we didn't build successfully
+            if [ "${RETURN}" -ne "0" ]; then
+                echo "Build failed with exit code: ${RETURN}"
+                exit ${RETURN}
+            fi
+          done
+          # and build the dtest-jar for the branch under test
+          git checkout origin/$CIRCLE_BRANCH
+          for x in $(seq 1 3); do
+              ${ANT_HOME}/bin/ant clean jar dtest-jar
+              RETURN="$?"
+              if [ "${RETURN}" -eq "0" ]; then
+                  break
+              fi
+          done
+          mkdir ~/dtest_jars
+          cp build/dtest*.jar ~/dtest_jars
+          ls -l ~/dtest_jars
+        no_output_timeout: 15m
+    - persist_to_workspace:
+        root: /home/cassandra
+        paths:
+        - dtest_jars
+    environment:
+    - JAVA8_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - ANT_HOME: /usr/share/ant
+    - LANG: en_US.UTF-8
+    - KEEP_TEST_DIR: true
+    - DEFAULT_DIR: /home/cassandra/cassandra-dtest
+    - PYTHONIOENCODING: utf-8
+    - PYTHONUNBUFFERED: true
+    - CASS_DRIVER_NO_EXTENSIONS: true
+    - CASS_DRIVER_NO_CYTHON: true
+    - CASSANDRA_SKIP_SYNC: true
+    - DTEST_REPO: git://github.com/apache/cassandra-dtest.git
+    - DTEST_BRANCH: master
+    - CCM_MAX_HEAP_SIZE: 1024M
+    - CCM_HEAP_NEWSIZE: 256M
+    - JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64
+    - JDK_HOME: /usr/lib/jvm/java-8-openjdk-amd64
 workflows:
   version: 2
   build_and_run_tests:
@@ -625,6 +804,15 @@
     - utests_compression:
         requires:
         - start_utests_compression
+    - start_jvm_upgrade_dtest:
+        type: approval
+    - j8_dtest_jars_build:
+        requires:
+        - build
+        - start_jvm_upgrade_dtest
+    - j8_jvm_upgrade_dtests:
+        requires:
+        - j8_dtest_jars_build
     - start_j8_dtests:
         type: approval
         requires:

diff --git a/.jenkins/Jenkinsfile b/.jenkins/Jenkinsfile
index 77c111c..2b371c0 100644
--- a/.jenkins/Jenkinsfile
+++ b/.jenkins/Jenkinsfile

@@ -153,6 +153,29 @@
                 }
               }
             }
+            stage('cqlsh') {
+              steps {
+                  warnError('Tests unstable') {
+                    build job: "${env.JOB_NAME}-cqlsh-tests"
+                  }
+              }
+              post {
+                success {
+                    warnError('missing test xml files') {
+                        script {
+                            copyTestResults('cqlsh-test')
+                        }
+                    }
+                }
+                unstable {
+                    warnError('missing test xml files') {
+                        script {
+                            copyTestResults('cqlsh-test')
+                        }
+                    }
+                }
+              }
+            }
           }
       }
       stage('Distributed Test') {
@@ -226,29 +249,6 @@
                 }
               }
             }
-            stage('dtest-offheap') {
-              steps {
-                  warnError('Tests unstable') {
-                    build job: "${env.JOB_NAME}-dtest-offheap"
-                  }
-              }
-              post {
-                success {
-                    warnError('missing test xml files') {
-                        script {
-                            copyTestResults('dtest-offheap')
-                        }
-                    }
-                }
-                unstable {
-                    warnError('missing test xml files') {
-                        script {
-                            copyTestResults('dtest-offheap')
-                        }
-                    }
-                }
-              }
-            }
           }
         }
     stage('Summary') {

diff --git a/.rat-excludes b/.rat-excludes
index d95b499..6df36a5 100644
--- a/.rat-excludes
+++ b/.rat-excludes

@@ -24,7 +24,6 @@
 doc/**
 build.properties.default
 test/data/**
-examples/pig/**
 examples/triggers/build.xml
 examples/triggers/conf/*
 examples/hadoop_word_count/conf/log4j.properties

diff --git a/CASSANDRA-14092.txt b/CASSANDRA-14092.txt
index 5ac872c..f95380b 100644
--- a/CASSANDRA-14092.txt
+++ b/CASSANDRA-14092.txt

@@ -17,7 +17,7 @@
   - CAP_NOWARN: same as previous, except that the client warning will not be emitted.
 
 These policies may be specified via the -Dcassandra.expiration_date_overflow_policy=POLICY
-startup option.
+startup option in the jvm.options configuration file.
 
 # Potential data loss on earlier versions
 
@@ -47,8 +47,8 @@
 timestamp, since tombstones may have been generated with the original timestamp.
 
 To find out if an SSTable has an entry with overflowed expiration, inspect it with the
-sstable2json tool and look for a negative "local deletion time" field. SSTables in this
-condition should be backed up immediately, as they are subject to data loss during
+'sstablemetadata' tool and look for a negative "Minimum local deletion time" field. SSTables
+in this condition should be backed up immediately, as they are subject to data loss during
 compaction.
 
 A "--reinsert-overflowed-ttl" option was added to scrub to rewrite SSTables containing
@@ -78,4 +78,4 @@
    - run "nodetool scrub --reinsert-overflowed-ttl <keyspace> <table>".
    - Re-enable compactions after verifying that scrub recovered the missing entries.
 
-See https://issues.apache.org/jira/browse/CASSANDRA-14092 for more details about this issue.
\ No newline at end of file
+See https://issues.apache.org/jira/browse/CASSANDRA-14092 for more details about this issue.

diff --git a/CHANGES.txt b/CHANGES.txt
index 02de7c1..5681007 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt

@@ -1,45 +1,145 @@
-2.2.17
+3.0.21
+ * Avoid hinted handoff per-host throttle being arounded to 0 in large cluster (CASSANDRA-15859)
+ * Avoid emitting empty range tombstones from RangeTombstoneList (CASSANDRA-15924)
+ * Avoid thread starvation, and improve compare-and-swap performance, in the slab allocators (CASSANDRA-15922)
+ * Fix broken KEYS 2i queries after DROP COMPACT STORAGE (CASSANDRA-15906)
+ * Add token to tombstone warning and error messages (CASSANDRA-15890)
+ * Fixed range read concurrency factor computation and capped as 10 times tpc cores (CASSANDRA-15752)
+ * Catch exception on bootstrap resume and init native transport (CASSANDRA-15863)
+ * Fix replica-side filtering returning stale data with CL > ONE (CASSANDRA-8272, CASSANDRA-8273)
+ * Fix duplicated row on 2.x upgrades when multi-rows range tombstones interact with collection ones (CASSANDRA-15805)
+ * Rely on snapshotted session infos on StreamResultFuture.maybeComplete to avoid race conditions (CASSANDRA-15667)
+ * EmptyType doesn't override writeValue so could attempt to write bytes when expected not to (CASSANDRA-15790)
+ * Fix index queries on partition key columns when some partitions contains only static data (CASSANDRA-13666)
+ * Avoid creating duplicate rows during major upgrades (CASSANDRA-15789)
+ * liveDiskSpaceUsed and totalDiskSpaceUsed get corrupted if IndexSummaryRedistribution gets interrupted (CASSANDRA-15674)
+ * Fix Debian init start/stop (CASSANDRA-15770)
+ * Fix infinite loop on index query paging in tables with clustering (CASSANDRA-14242)
+ * Fix chunk index overflow due to large sstable with small chunk length (CASSANDRA-15595)
+ * cqlsh return non-zero status when STDIN CQL fails (CASSANDRA-15623)
+ * Don't skip sstables in slice queries based only on local min/max/deletion timestamp (CASSANDRA-15690)
+ * Memtable memory allocations may deadlock (CASSANDRA-15367)
+ * Run evictFromMembership in GossipStage (CASSANDRA-15592)
+Merged from 2.2:
  * Fix nomenclature of allow and deny lists (CASSANDRA-15862)
  * Remove generated files from source artifact (CASSANDRA-15849)
  * Remove duplicated tools binaries from tarballs (CASSANDRA-15768)
  * Duplicate results with DISTINCT queries in mixed mode (CASSANDRA-15501)
  * Disable JMX rebinding (CASSANDRA-15653)
- * Fix Commit log replays when static column clustering keys are collections (CASSANDRA-14365)
- * Fix Red Hat init script on newer systemd versions (CASSANDRA-15273)
- * Allow EXTRA_CLASSPATH to work on tar/source installations (CASSANDRA-15567)
 Merged from 2.1:
  * Fix writing of snapshot manifest when the table has table-backed secondary indexes (CASSANDRA-10968)
  * Fix parse error in cqlsh COPY FROM and formatting for map of blobs (CASSANDRA-15679)
+ * Fix Commit log replays when static column clustering keys are collections (CASSANDRA-14365)
+ * Fix Red Hat init script on newer systemd versions (CASSANDRA-15273)
+ * Allow EXTRA_CLASSPATH to work on tar/source installations (CASSANDRA-15567)
 
 
-2.2.16
+3.0.20
+ * Run in-jvm upgrade dtests in circleci (CASSANDRA-15506)
+ * Include updates to static column in mutation size calculations (CASSANDRA-15293)
+ * Fix point-in-time recoevery ignoring timestamp of updates to static columns (CASSANDRA-15292)
+ * GC logs are also put under $CASSANDRA_LOG_DIR (CASSANDRA-14306)
+ * Fix sstabledump's position key value when partitions have multiple rows (CASSANDRA-14721)
+ * Avoid over-scanning data directories in LogFile.verify() (CASSANDRA-15364)
+ * Bump generations and document changes to system_distributed and system_traces in 3.0, 3.11
+   (CASSANDRA-15441)
+ * Fix system_traces creation timestamp; optimise system keyspace upgrades (CASSANDRA-15398)
+ * Fix various data directory prefix matching issues (CASSANDRA-13974)
+ * Minimize clustering values in metadata collector (CASSANDRA-15400)
+ * Avoid over-trimming of results in mixed mode clusters (CASSANDRA-15405)
+ * validate value sizes in LegacyLayout (CASSANDRA-15373)
+ * Ensure that tracing doesn't break connections in 3.x/4.0 mixed mode by default (CASSANDRA-15385)
+ * Make sure index summary redistribution does not start when compactions are paused (CASSANDRA-15265)
+ * Ensure legacy rows have primary key livenessinfo when they contain illegal cells (CASSANDRA-15365)
+ * Fix race condition when setting bootstrap flags (CASSANDRA-14878)
+ * Fix NativeLibrary.tryOpenDirectory callers for Windows (CASSANDRA-15426)
+Merged from 2.2:
  * Fix SELECT JSON output for empty blobs (CASSANDRA-15435)
  * In-JVM DTest: Set correct internode message version for upgrade test (CASSANDRA-15371)
- * In-JVM DTest: Support NodeTool in dtest
+ * In-JVM DTest: Support NodeTool in dtest (CASSANDRA-15429)
 
 
-2.2.15
+3.0.19
+ * Add ability to cap max negotiable protocol version (CASSANDRA-15193)
+ * Gossip tokens on startup if available (CASSANDRA-15335)
+ * Fix resource leak in CompressedSequentialWriter (CASSANDRA-15340)
+ * Fix merge which reverted CASSANDRA-14993 (CASSANDRA-15289)
+ * Fix LegacyLayout RangeTombstoneList IndexOutOfBoundsException when upgrading and RangeTombstone bounds are asymmetric (CASSANDRA-15172)
+ * Fix NPE when using allocate_tokens_for_keyspace on new DC/rack (CASSANDRA-14952)
+ * Filter sstables earlier when running cleanup (CASSANDRA-15100)
+ * Use mean row count instead of mean column count for index selectivity calculation (CASSANDRA-15259)
+ * Avoid updating unchanged gossip states (CASSANDRA-15097)
+ * Prevent recreation of previously dropped columns with a different kind (CASSANDRA-14948)
+ * Prevent client requests from blocking on executor task queue (CASSANDRA-15013)
+ * Toughen up column drop/recreate type validations (CASSANDRA-15204)
+ * LegacyLayout should handle paging states that cross a collection column (CASSANDRA-15201)
+ * Prevent RuntimeException when username or password is empty/null (CASSANDRA-15198)
+ * Multiget thrift query returns null records after digest mismatch (CASSANDRA-14812)
+ * Skipping illegal legacy cells can break reverse iteration of indexed partitions (CASSANDRA-15178)
+ * Handle paging states serialized with a different version than the session's (CASSANDRA-15176)
+ * Throw IOE instead of asserting on unsupporter peer versions (CASSANDRA-15066)
+ * Update token metadata when handling MOVING/REMOVING_TOKEN events (CASSANDRA-15120)
+ * Add ability to customize cassandra log directory using $CASSANDRA_LOG_DIR (CASSANDRA-15090)
+ * Skip cells with illegal column names when reading legacy sstables (CASSANDRA-15086)
+ * Fix assorted gossip races and add related runtime checks (CASSANDRA-15059)
+ * Fix mixed mode partition range scans with limit (CASSANDRA-15072)
+ * cassandra-stress works with frozen collections: list and set (CASSANDRA-14907)
+ * For nodetool listsnapshots output, put spaces between columns, and increase snapshot padding (CASSANDRA-14876)
+ * Fix handling FS errors on writing and reading flat files - LogTransaction and hints (CASSANDRA-15053)
+ * Avoid double closing the iterator to avoid overcounting the number of requests (CASSANDRA-15058)
+ * Improve `nodetool status -r` speed (CASSANDRA-14847)
+ * Improve merkle tree size and time on heap (CASSANDRA-14096)
+ * Add missing commands to nodetool-completion (CASSANDRA-14916)
+ * Anti-compaction temporarily corrupts sstable state for readers (CASSANDRA-15004)
+ Merged from 2.2:
  * Catch non-IOException in FileUtils.close to make sure that all resources are closed (CASSANDRA-15225)
  * Handle exceptions during authentication/authorization (CASSANDRA-15041)
- * Fix JDK7 compatibility broken in cassandra-2.2 (CASSANDRA-15050)
  * Support cross version messaging in in-jvm upgrade dtests (CASSANDRA-15078)
  * Fix index summary redistribution cancellation (CASSANDRA-15045)
  * Refactor Circle CI configuration (CASSANDRA-14806)
  * Fixing invalid CQL in security documentation (CASSANDRA-15020)
- * Make tools/bin/token-generator py2/3 compatible (CASSANDRA-15012)
  * Multi-version in-JVM dtests (CASSANDRA-14937)
  * Allow instance class loaders to be garbage collected for inJVM dtest (CASSANDRA-15170)
  * Add support for network topology and query tracing for inJVM dtest (CASSANDRA-15319)
 
 
-2.2.14
+3.0.18
+ * Severe concurrency issues in STCS,DTCS,TWCS,TMD.Topology,TypeParser
+ * Add a script to make running the cqlsh tests in cassandra repo easier (CASSANDRA-14951)
+ * If SizeEstimatesRecorder misses a 'onDropTable' notification, the size_estimates table will never be cleared for that table. (CASSANDRA-14905)
+ * Counters fail to increment in 2.1/2.2 to 3.X mixed version clusters (CASSANDRA-14958)
+ * Streaming needs to synchronise access to LifecycleTransaction (CASSANDRA-14554)
+ * Fix cassandra-stress write hang with default options (CASSANDRA-14616)
+ * Differentiate between slices and RTs when decoding legacy bounds (CASSANDRA-14919)
+ * CommitLogReplayer.handleReplayError should print stack traces (CASSANDRA-14589)
+ * Netty epoll IOExceptions caused by unclean client disconnects being logged at INFO (CASSANDRA-14909)
+ * Unfiltered.isEmpty conflicts with Row extends AbstractCollection.isEmpty (CASSANDRA-14588)
+ * RangeTombstoneList doesn't properly clean up mergeable or superseded rts in some cases (CASSANDRA-14894)
+ * Fix handling of collection tombstones for dropped columns from legacy sstables (CASSANDRA-14912)
+ * Throw exception if Columns serialized subset encode more columns than possible (CASSANDRA-14591)
+ * Drop/add column name with different Kind can result in corruption (CASSANDRA-14843)
+ * Fix missing rows when reading 2.1 SSTables with static columns in 3.0 (CASSANDRA-14873)
+ * Move TWCS message 'No compaction necessary for bucket size' to Trace level (CASSANDRA-14884)
+ * Sstable min/max metadata can cause data loss (CASSANDRA-14861)
+ * Dropped columns can cause reverse sstable iteration to return prematurely (CASSANDRA-14838)
+ * Legacy sstables with  multi block range tombstones create invalid bound sequences (CASSANDRA-14823)
+ * Expand range tombstone validation checks to multiple interim request stages (CASSANDRA-14824)
+ * Reverse order reads can return incomplete results (CASSANDRA-14803)
+ * Avoid calling iter.next() in a loop when notifying indexers about range tombstones (CASSANDRA-14794)
+ * Fix purging semi-expired RT boundaries in reversed iterators (CASSANDRA-14672)
+ * DESC order reads can fail to return the last Unfiltered in the partition (CASSANDRA-14766)
+ * Fix corrupted collection deletions for dropped columns in 3.0 <-> 2.{1,2} messages (CASSANDRA-14568)
+ * Fix corrupted static collection deletions in 3.0 <-> 2.{1,2} messages (CASSANDRA-14568)
+ * Handle failures in parallelAllSSTableOperation (cleanup/upgradesstables/etc) (CASSANDRA-14657)
+ * Improve TokenMetaData cache populating performance avoid long locking (CASSANDRA-14660)
+ * Backport: Flush netty client messages immediately (not by default) (CASSANDRA-13651)
+ * Fix static column order for SELECT * wildcard queries (CASSANDRA-14638)
+ * sstableloader should use discovered broadcast address to connect intra-cluster (CASSANDRA-14522)
+ * Fix reading columns with non-UTF names from schema (CASSANDRA-14468)
+ Merged from 2.2:
  * CircleCI docker image should bake in more dependencies (CASSANDRA-14985)
  * Don't enable client transports when bootstrap is pending (CASSANDRA-14525)
  * MigrationManager attempts to pull schema from different major version nodes (CASSANDRA-14928)
- * Don't skip entire sstables when reading backwards with mixed clustering column order
-   (CASSANDRA-14910)
- * Cannot perform slice reads in reverse direction against tables with clustering columns
-   in mixed order (CASSANDRA-14899)
  * Fix incorrect cqlsh results when selecting same columns multiple times (CASSANDRA-13262)
  * Returns null instead of NaN or Infinity in JSON strings (CASSANDRA-14377)
 Merged from 2.1:
@@ -47,86 +147,294 @@
  * Update release checksum algorithms to SHA-256, SHA-512 (CASSANDRA-14970)
 
 
-2.2.13
+3.0.17
+ * Fix corrupted static collection deletions in 3.0 -> 2.{1,2} messages (CASSANDRA-14568)
+ * Fix potential IndexOutOfBoundsException with counters (CASSANDRA-14167)
+ * Restore resumable hints delivery, backport CASSANDRA-11960 (CASSANDRA-14419)
+ * Always close RT markers returned by ReadCommand#executeLocally() (CASSANDRA-14515)
+ * Reverse order queries with range tombstones can cause data loss (CASSANDRA-14513)
+ * Fix regression of lagging commitlog flush log message (CASSANDRA-14451)
+ * Add Missing dependencies in pom-all (CASSANDRA-14422)
+ * Cleanup StartupClusterConnectivityChecker and PING Verb (CASSANDRA-14447)
+ * Fix deprecated repair error notifications from 3.x clusters to legacy JMX clients (CASSANDRA-13121)
+ * Cassandra not starting when using enhanced startup scripts in windows (CASSANDRA-14418)
+ * Fix progress stats and units in compactionstats (CASSANDRA-12244)
+ * Better handle missing partition columns in system_schema.columns (CASSANDRA-14379)
+ * Delay hints store excise by write timeout to avoid race with decommission (CASSANDRA-13740)
+ * Deprecate background repair and probablistic read_repair_chance table options
+   (CASSANDRA-13910)
+ * Add missed CQL keywords to documentation (CASSANDRA-14359)
+ * Fix unbounded validation compactions on repair / revert CASSANDRA-13797 (CASSANDRA-14332)
+ * Avoid deadlock when running nodetool refresh before node is fully up (CASSANDRA-14310)
+ * Handle all exceptions when opening sstables (CASSANDRA-14202)
+ * Handle incompletely written hint descriptors during startup (CASSANDRA-14080)
+ * Handle repeat open bound from SRP in read repair (CASSANDRA-14330)
+ * Respect max hint window when hinting for LWT (CASSANDRA-14215)
+ * Adding missing WriteType enum values to v3, v4, and v5 spec (CASSANDRA-13697)
+ * Don't regenerate bloomfilter and summaries on startup (CASSANDRA-11163)
+ * Fix NPE when performing comparison against a null frozen in LWT (CASSANDRA-14087)
+ * Log when SSTables are deleted (CASSANDRA-14302)
+ * Fix batch commitlog sync regression (CASSANDRA-14292)
+ * Write to pending endpoint when view replica is also base replica (CASSANDRA-14251)
+ * Chain commit log marker potential performance regression in batch commit mode (CASSANDRA-14194)
+ * Fully utilise specified compaction threads (CASSANDRA-14210)
+ * Pre-create deletion log records to finish compactions quicker (CASSANDRA-12763)
+Merged from 2.2:
  * Fix bug that prevented compaction of SSTables after full repairs (CASSANDRA-14423)
  * Incorrect counting of pending messages in OutboundTcpConnection (CASSANDRA-11551)
  * Fix compaction failure caused by reading un-flushed data (CASSANDRA-12743)
  * Use Bounds instead of Range for sstables in anticompaction (CASSANDRA-14411)
  * Fix JSON queries with IN restrictions and ORDER BY clause (CASSANDRA-14286)
  * CQL fromJson(null) throws NullPointerException (CASSANDRA-13891)
- * Fix query pager DEBUG log leak causing hit in paged reads throughput (CASSANDRA-14318)
  * Backport circleci yaml (CASSANDRA-14240)
 Merged from 2.1:
  * Check checksum before decompressing data (CASSANDRA-14284)
  * CVE-2017-5929 Security vulnerability in Logback warning in NEWS.txt (CASSANDRA-14183)
 
 
-2.2.12
+3.0.16
+ * Fix unit test failures in ViewComplexTest (CASSANDRA-14219)
+ * Add MinGW uname check to start scripts (CASSANDRA-12940)
+ * Protect against overflow of local expiration time (CASSANDRA-14092)
+ * Use the correct digest file and reload sstable metadata in nodetool verify (CASSANDRA-14217)
+ * Handle failure when mutating repaired status in Verifier (CASSANDRA-13933)
+ * Close socket on error during connect on OutboundTcpConnection (CASSANDRA-9630)
+ * Set encoding for javadoc generation (CASSANDRA-14154)
+ * Fix index target computation for dense composite tables with dropped compact storage (CASSANDRA-14104)
+ * Improve commit log chain marker updating (CASSANDRA-14108)
+ * Extra range tombstone bound creates double rows (CASSANDRA-14008)
+ * Fix SStable ordering by max timestamp in SinglePartitionReadCommand (CASSANDRA-14010)
+ * Accept role names containing forward-slash (CASSANDRA-14088)
+ * Optimize CRC check chance probability calculations (CASSANDRA-14094)
+ * Fix cleanup on keyspace with no replicas (CASSANDRA-13526)
+ * Fix updating base table rows with TTL not removing materialized view entries (CASSANDRA-14071)
+ * Reduce garbage created by DynamicSnitch (CASSANDRA-14091)
+ * More frequent commitlog chained markers (CASSANDRA-13987)
+ * Fix serialized size of DataLimits (CASSANDRA-14057)
+ * Add flag to allow dropping oversized read repair mutations (CASSANDRA-13975)
+ * Fix SSTableLoader logger message (CASSANDRA-14003)
+ * Fix repair race that caused gossip to block (CASSANDRA-13849)
+ * Tracing interferes with digest requests when using RandomPartitioner (CASSANDRA-13964)
+ * Add flag to disable materialized views, and warnings on creation (CASSANDRA-13959)
+ * Don't let user drop or generally break tables in system_distributed (CASSANDRA-13813)
+ * Provide a JMX call to sync schema with local storage (CASSANDRA-13954)
+ * Mishandling of cells for removed/dropped columns when reading legacy files (CASSANDRA-13939)
+ * Deserialise sstable metadata in nodetool verify (CASSANDRA-13922)
+Merged from 2.2:
  * Fix the inspectJvmOptions startup check (CASSANDRA-14112)
  * Fix race that prevents submitting compaction for a table when executor is full (CASSANDRA-13801)
  * Rely on the JVM to handle OutOfMemoryErrors (CASSANDRA-13006)
- * Grab refs during scrub/index redistribution/cleanup (CASSANDRA-13873)
 Merged from 2.1:
- * Protect against overflow of local expiration time (CASSANDRA-14092)
  * More PEP8 compliance for cqlsh (CASSANDRA-14021)
  * RPM package spec: fix permissions for installed jars and config files (CASSANDRA-14181)
 
 
-2.2.11
+3.0.15
+ * Improve TRUNCATE performance (CASSANDRA-13909)
+ * Implement short read protection on partition boundaries (CASSANDRA-13595)
+ * Fix ISE thrown by UPI.Serializer.hasNext() for some SELECT queries (CASSANDRA-13911)
+ * Filter header only commit logs before recovery (CASSANDRA-13918)
+ * AssertionError prepending to a list (CASSANDRA-13149)
+ * Fix support for SuperColumn tables (CASSANDRA-12373)
+ * Handle limit correctly on tables with strict liveness (CASSANDRA-13883)
+ * Fix missing original update in TriggerExecutor (CASSANDRA-13894)
+ * Remove non-rpc-ready nodes from counter leader candidates (CASSANDRA-13043)
+ * Improve short read protection performance (CASSANDRA-13794)
+ * Fix sstable reader to support range-tombstone-marker for multi-slices (CASSANDRA-13787)
+ * Fix short read protection for tables with no clustering columns (CASSANDRA-13880)
+ * Make isBuilt volatile in PartitionUpdate (CASSANDRA-13619)
+ * Prevent integer overflow of timestamps in CellTest and RowsTest (CASSANDRA-13866)
+ * Fix counter application order in short read protection (CASSANDRA-12872)
+ * Don't block RepairJob execution on validation futures (CASSANDRA-13797)
+ * Wait for all management tasks to complete before shutting down CLSM (CASSANDRA-13123)
+ * INSERT statement fails when Tuple type is used as clustering column with default DESC order (CASSANDRA-13717)
+ * Fix pending view mutations handling and cleanup batchlog when there are local and remote paired mutations (CASSANDRA-13069)
+ * Improve config validation and documentation on overflow and NPE (CASSANDRA-13622)
+ * Range deletes in a CAS batch are ignored (CASSANDRA-13655)
+ * Avoid assertion error when IndexSummary > 2G (CASSANDRA-12014)
+ * Change repair midpoint logging for tiny ranges (CASSANDRA-13603)
+ * Better handle corrupt final commitlog segment (CASSANDRA-11995)
+ * StreamingHistogram is not thread safe (CASSANDRA-13756)
+ * Fix MV timestamp issues (CASSANDRA-11500)
+ * Better tolerate improperly formatted bcrypt hashes (CASSANDRA-13626) 
+ * Fix race condition in read command serialization (CASSANDRA-13363)
+ * Enable segement creation before recovering commitlogs (CASSANDRA-13587)
+ * Fix AssertionError in short read protection (CASSANDRA-13747)
+ * Don't skip corrupted sstables on startup (CASSANDRA-13620)
+ * Fix the merging of cells with different user type versions (CASSANDRA-13776)
+ * Copy session properties on cqlsh.py do_login (CASSANDRA-13640)
+ * Potential AssertionError during ReadRepair of range tombstone and partition deletions (CASSANDRA-13719)
+ * Don't let stress write warmup data if n=0 (CASSANDRA-13773)
+ * Gossip thread slows down when using batch commit log (CASSANDRA-12966)
+ * Randomize batchlog endpoint selection with only 1 or 2 racks (CASSANDRA-12884)
+ * Fix digest calculation for counter cells (CASSANDRA-13750)
+ * Fix ColumnDefinition.cellValueType() for non-frozen collection and change SSTabledump to use type.toJSONString() (CASSANDRA-13573)
+ * Skip materialized view addition if the base table doesn't exist (CASSANDRA-13737)
+ * Drop table should remove corresponding entries in dropped_columns table (CASSANDRA-13730)
+ * Log warn message until legacy auth tables have been migrated (CASSANDRA-13371)
+ * Fix incorrect [2.1 <- 3.0] serialization of counter cells created in 2.0 (CASSANDRA-13691)
+ * Fix invalid writetime for null cells (CASSANDRA-13711)
+ * Fix ALTER TABLE statement to atomically propagate changes to the table and its MVs (CASSANDRA-12952)
+ * Fixed ambiguous output of nodetool tablestats command (CASSANDRA-13722)
+ * JMXEnabledThreadPoolExecutor with corePoolSize equal to maxPoolSize (Backport CASSANDRA-13329)
+ * Fix Digest mismatch Exception if hints file has UnknownColumnFamily (CASSANDRA-13696)
+ * Purge tombstones created by expired cells (CASSANDRA-13643)
+ * Make concat work with iterators that have different subsets of columns (CASSANDRA-13482)
+ * Set test.runners based on cores and memory size (CASSANDRA-13078)
+ * Allow different NUMACTL_ARGS to be passed in (CASSANDRA-13557)
+ * Allow native function calls in CQLSSTableWriter (CASSANDRA-12606)
+ * Fix secondary index queries on COMPACT tables (CASSANDRA-13627)
+ * Nodetool listsnapshots output is missing a newline, if there are no snapshots (CASSANDRA-13568)
+ * sstabledump reports incorrect usage for argument order (CASSANDRA-13532)
+Merged from 2.2:
  * Safely handle empty buffers when outputting to JSON (CASSANDRA-13868)
  * Copy session properties on cqlsh.py do_login (CASSANDRA-13847)
  * Fix load over calculated issue in IndexSummaryRedistribution (CASSANDRA-13738)
  * Fix compaction and flush exception not captured (CASSANDRA-13833)
- * Make BatchlogManagerMBean.forceBatchlogReplay() blocking (CASSANDRA-13809)
  * Uncaught exceptions in Netty pipeline (CASSANDRA-13649)
- * Prevent integer overflow on exabyte filesystems (CASSANDRA-13067) 
+ * Prevent integer overflow on exabyte filesystems (CASSANDRA-13067)
  * Fix queries with LIMIT and filtering on clustering columns (CASSANDRA-11223)
  * Fix potential NPE when resume bootstrap fails (CASSANDRA-13272)
  * Fix toJSONString for the UDT, tuple and collection types (CASSANDRA-13592)
  * Fix nested Tuples/UDTs validation (CASSANDRA-13646)
- * Remove unused max_value_size_in_mb config setting from yaml (CASSANDRA-13625
 Merged from 2.1:
- * Add storage port options to sstableloader (CASSANDRA-13844)
  * Remove stress-test target in CircleCI as it's not existing (CASSANDRA-13775)
  * Clone HeartBeatState when building gossip messages. Make its generation/version volatile (CASSANDRA-13700)
 
 
-2.2.10
+3.0.14
+ * Ensure int overflow doesn't occur when calculating large partition warning size (CASSANDRA-13172)
+ * Ensure consistent view of partition columns between coordinator and replica in ColumnFilter (CASSANDRA-13004)
+ * Failed unregistering mbean during drop keyspace (CASSANDRA-13346)
+ * nodetool scrub/cleanup/upgradesstables exit code is wrong (CASSANDRA-13542)
+ * Fix the reported number of sstable data files accessed per read (CASSANDRA-13120)
+ * Fix schema digest mismatch during rolling upgrades from versions before 3.0.12 (CASSANDRA-13559)
+ * Upgrade JNA version to 4.4.0 (CASSANDRA-13072)
+ * Interned ColumnIdentifiers should use minimal ByteBuffers (CASSANDRA-13533)
+ * ReverseIndexedReader may drop rows during 2.1 to 3.0 upgrade (CASSANDRA-13525)
+ * Fix repair process violating start/end token limits for small ranges (CASSANDRA-13052)
+ * Add storage port options to sstableloader (CASSANDRA-13518)
+ * Properly handle quoted index names in cqlsh DESCRIBE output (CASSANDRA-12847)
+ * Avoid reading static row twice from old format sstables (CASSANDRA-13236)
+ * Fix NPE in StorageService.excise() (CASSANDRA-13163)
+ * Expire OutboundTcpConnection messages by a single Thread (CASSANDRA-13265)
+ * Fail repair if insufficient responses received (CASSANDRA-13397)
+ * Fix SSTableLoader fail when the loaded table contains dropped columns (CASSANDRA-13276)
+ * Avoid name clashes in CassandraIndexTest (CASSANDRA-13427)
+ * Handling partially written hint files (CASSANDRA-12728)
+ * Interrupt replaying hints on decommission (CASSANDRA-13308)
+ * Fix schema version calculation for rolling upgrades (CASSANDRA-13441)
+Merged from 2.2:
  * Nodes started with join_ring=False should be able to serve requests when authentication is enabled (CASSANDRA-11381)
  * cqlsh COPY FROM: increment error count only for failures, not for attempts (CASSANDRA-13209)
- * nodetool upgradesstables should upgrade system tables (CASSANDRA-13119)
+
+
+3.0.13
+ * Make reading of range tombstones more reliable (CASSANDRA-12811)
+ * Fix startup problems due to schema tables not completely flushed (CASSANDRA-12213)
+ * Fix view builder bug that can filter out data on restart (CASSANDRA-13405)
+ * Fix 2i page size calculation when there are no regular columns (CASSANDRA-13400)
+ * Fix the conversion of 2.X expired rows without regular column data (CASSANDRA-13395)
+ * Fix hint delivery when using ext+internal IPs with prefer_local enabled (CASSANDRA-13020)
+ * Fix possible NPE on upgrade to 3.0/3.X in case of IO errors (CASSANDRA-13389)
+ * Legacy deserializer can create empty range tombstones (CASSANDRA-13341)
+ * Use the Kernel32 library to retrieve the PID on Windows and fix startup checks (CASSANDRA-13333)
+ * Fix code to not exchange schema across major versions (CASSANDRA-13274)
+ * Dropping column results in "corrupt" SSTable (CASSANDRA-13337)
+ * Bugs handling range tombstones in the sstable iterators (CASSANDRA-13340)
+ * Fix CONTAINS filtering for null collections (CASSANDRA-13246)
+ * Applying: Use a unique metric reservoir per test run when using Cassandra-wide metrics residing in MBeans (CASSANDRA-13216)
+ * Propagate row deletions in 2i tables on upgrade (CASSANDRA-13320)
+ * Slice.isEmpty() returns false for some empty slices (CASSANDRA-13305)
+ * Add formatted row output to assertEmpty in CQL Tester (CASSANDRA-13238)
+ * Legacy caching options can prevent 3.0 upgrade (CASSANDRA-13384)
+ * Nodetool upgradesstables/scrub/compact ignores system tables (CASSANDRA-13410)
+ * Fix NPE issue in StorageService (CASSANDRA-13060)
+Merged from 2.2:
  * Avoid starting gossiper in RemoveTest (CASSANDRA-13407)
  * Fix weightedSize() for row-cache reported by JMX and NodeTool (CASSANDRA-13393)
- * Fix JVM metric paths (CASSANDRA-13103)
  * Honor truststore-password parameter in cassandra-stress (CASSANDRA-12773)
  * Discard in-flight shadow round responses (CASSANDRA-12653)
  * Don't anti-compact repaired data to avoid inconsistencies (CASSANDRA-13153)
  * Wrong logger name in AnticompactionTask (CASSANDRA-13343)
+ * Commitlog replay may fail if last mutation is within 4 bytes of end of segment (CASSANDRA-13282)
  * Fix queries updating multiple time the same list (CASSANDRA-13130)
  * Fix GRANT/REVOKE when keyspace isn't specified (CASSANDRA-13053)
+Merged from 2.1:
+ * Fix 2ndary index queries on partition keys for tables with static columns CASSANDRA-13147
+ * Fix ParseError unhashable type list in cqlsh copy from (CASSANDRA-13364)
+
+
+3.0.12
+ * Prevent data loss on upgrade 2.1 - 3.0 by adding component separator to LogRecord absolute path (CASSANDRA-13294)
+ * Improve testing on macOS by eliminating sigar logging (CASSANDRA-13233)
+ * Cqlsh copy-from should error out when csv contains invalid data for collections (CASSANDRA-13071)
+ * Update c.yaml doc for offheap memtables (CASSANDRA-13179)
+ * Faster StreamingHistogram (CASSANDRA-13038)
+ * Legacy deserializer can create unexpected boundary range tombstones (CASSANDRA-13237)
+ * Remove unnecessary assertion from AntiCompactionTest (CASSANDRA-13070)
+ * Fix cqlsh COPY for dates before 1900 (CASSANDRA-13185)
+Merged from 2.2:
  * Avoid race on receiver by starting streaming sender thread after sending init message (CASSANDRA-12886)
  * Fix "multiple versions of ant detected..." when running ant test (CASSANDRA-13232)
  * Coalescing strategy sleeps too much (CASSANDRA-13090)
- * Make sure compaction stats are updated when compaction is interrupted (Backport from 3.0, CASSANDRA-12100)
  * Fix flaky LongLeveledCompactionStrategyTest (CASSANDRA-12202)
  * Fix failing COPY TO STDOUT (CASSANDRA-12497)
  * Fix ColumnCounter::countAll behaviour for reverse queries (CASSANDRA-13222)
  * Exceptions encountered calling getSeeds() breaks OTC thread (CASSANDRA-13018)
- * Commitlog replay may fail if last mutation is within 4 bytes of end of segment (CASSANDRA-13282)
 Merged from 2.1:
- * Fix 2ndary indexes on primary key columns to don't create expiring entries (CASSANDRA-13412)
- * Set javac encoding to utf-8 (CASSANDRA-13466)
- * Fix 2ndary index queries on partition keys for tables with static columns (CASSANDRA-13147)
- * Fix ParseError unhashable type list in cqlsh copy from (CASSANDRA-13364)
  * Remove unused repositories (CASSANDRA-13278)
  * Log stacktrace of uncaught exceptions (CASSANDRA-13108)
 
 
-2.2.9
+3.0.11
+ * Use keyspace replication settings on system.size_estimates table (CASSANDRA-9639)
+ * Add vm.max_map_count StartupCheck (CASSANDRA-13008)
+ * Hint related logging should include the IP address of the destination in addition to 
+   host ID (CASSANDRA-13205)
+ * Reloading logback.xml does not work (CASSANDRA-13173)
+ * Lightweight transactions temporarily fail after upgrade from 2.1 to 3.0 (CASSANDRA-13109)
+ * Duplicate rows after upgrading from 2.1.16 to 3.0.10/3.9 (CASSANDRA-13125)
+ * Fix UPDATE queries with empty IN restrictions (CASSANDRA-13152)
+ * Abort or retry on failed hints delivery (CASSANDRA-13124)
+ * Fix handling of partition with partition-level deletion plus
+   live rows in sstabledump (CASSANDRA-13177)
+ * Provide user workaround when system_schema.columns does not contain entries
+   for a table that's in system_schema.tables (CASSANDRA-13180)
+ * Dump threads when unit tests time out (CASSANDRA-13117)
+ * Better error when modifying function permissions without explicit keyspace (CASSANDRA-12925)
+ * Indexer is not correctly invoked when building indexes over sstables (CASSANDRA-13075)
+ * Read repair is not blocking repair to finish in foreground repair (CASSANDRA-13115)
+ * Stress daemon help is incorrect (CASSANDRA-12563)
+ * Remove ALTER TYPE support (CASSANDRA-12443)
+ * Fix assertion for certain legacy range tombstone pattern (CASSANDRA-12203)
+ * Set javac encoding to utf-8 (CASSANDRA-11077)
+ * Replace empty strings with null values if they cannot be converted (CASSANDRA-12794)
+ * Fixed flacky SSTableRewriterTest: check file counts before calling validateCFS (CASSANDRA-12348)
+ * Fix deserialization of 2.x DeletedCells (CASSANDRA-12620)
+ * Add parent repair session id to anticompaction log message (CASSANDRA-12186)
+ * Improve contention handling on failure to acquire MV lock for streaming and hints (CASSANDRA-12905)
+ * Fix DELETE and UPDATE queries with empty IN restrictions (CASSANDRA-12829)
+ * Mark MVs as built after successful bootstrap (CASSANDRA-12984)
+ * Estimated TS drop-time histogram updated with Cell.NO_DELETION_TIME (CASSANDRA-13040)
+ * Nodetool compactionstats fails with NullPointerException (CASSANDRA-13021)
+ * Thread local pools never cleaned up (CASSANDRA-13033)
+ * Set RPC_READY to false when draining or if a node is marked as shutdown (CASSANDRA-12781)
+ * Make sure sstables only get committed when it's safe to discard commit log records (CASSANDRA-12956)
+ * Reject default_time_to_live option when creating or altering MVs (CASSANDRA-12868)
+ * Nodetool should use a more sane max heap size (CASSANDRA-12739)
+ * LocalToken ensures token values are cloned on heap (CASSANDRA-12651)
+ * AnticompactionRequestSerializer serializedSize is incorrect (CASSANDRA-12934)
+ * Prevent reloading of logback.xml from UDF sandbox (CASSANDRA-12535)
+ * Reenable HeapPool (CASSANDRA-12900)
+Merged from 2.2:
+ * Fix JVM metric names (CASSANDRA-13103)
  * Fix negative mean latency metric (CASSANDRA-12876)
  * Use only one file pointer when creating commitlog segments (CASSANDRA-12539)
  * Fix speculative retry bugs (CASSANDRA-13009)
- * Fix handling of nulls and unsets in IN conditions (CASSANDRA-12981) 
+ * Fix handling of nulls and unsets in IN conditions (CASSANDRA-12981)
+ * Fix race causing infinite loop if Thrift server is stopped before it starts listening (CASSANDRA-12856)
+ * CompactionTasks now correctly drops sstables out of compaction when not enough disk space is available (CASSANDRA-12979)
  * Remove support for non-JavaScript UDFs (CASSANDRA-12883)
  * Fix DynamicEndpointSnitch noop in multi-datacenter situations (CASSANDRA-13074)
  * cqlsh copy-from: encode column names to avoid primary key parsing errors (CASSANDRA-12909)
@@ -138,6 +446,50 @@
  * cqlsh: fix DESC TYPES errors (CASSANDRA-12914)
  * Fix leak on skipped SSTables in sstableupgrade (CASSANDRA-12899)
  * Avoid blocking gossip during pending range calculation (CASSANDRA-12281)
+Merged from 2.1:
+ * Use portable stderr for java error in startup (CASSANDRA-13211)
+ * Fix Thread Leak in OutboundTcpConnection (CASSANDRA-13204)
+ * Coalescing strategy can enter infinite loop (CASSANDRA-13159)
+ * Upgrade netty version to fix memory leak with client encryption (CASSANDRA-13114)
+ * cqlsh copy-from: sort user type fields in csv (CASSANDRA-12959)
+
+
+3.0.10
+ * Disallow offheap_buffers memtable allocation (CASSANDRA-11039)
+ * Fix CommitLogSegmentManagerTest (CASSANDRA-12283)
+ * Pass root cause to CorruptBlockException when uncompression failed (CASSANDRA-12889)
+ * Fix partition count log during compaction (CASSANDRA-12184)
+ * Batch with multiple conditional updates for the same partition causes AssertionError (CASSANDRA-12867)
+ * Make AbstractReplicationStrategy extendable from outside its package (CASSANDRA-12788)
+ * Fix CommitLogTest.testDeleteIfNotDirty (CASSANDRA-12854)
+ * Don't tell users to turn off consistent rangemovements during rebuild. (CASSANDRA-12296)
+ * Avoid deadlock due to materialized view lock contention (CASSANDRA-12689)
+ * Fix for KeyCacheCqlTest flakiness (CASSANDRA-12801)
+ * Include SSTable filename in compacting large row message (CASSANDRA-12384)
+ * Fix potential socket leak (CASSANDRA-12329, CASSANDRA-12330)
+ * Fix ViewTest.testCompaction (CASSANDRA-12789)
+ * Improve avg aggregate functions (CASSANDRA-12417)
+ * Preserve quoted reserved keyword column names in MV creation (CASSANDRA-11803)
+ * nodetool stopdaemon errors out (CASSANDRA-12646)
+ * Split materialized view mutations on build to prevent OOM (CASSANDRA-12268)
+ * mx4j does not work in 3.0.8 (CASSANDRA-12274)
+ * Abort cqlsh copy-from in case of no answer after prolonged period of time (CASSANDRA-12740)
+ * Avoid sstable corrupt exception due to dropped static column (CASSANDRA-12582)
+ * Make stress use client mode to avoid checking commit log size on startup (CASSANDRA-12478)
+ * Fix exceptions with new vnode allocation (CASSANDRA-12715)
+ * Unify drain and shutdown processes (CASSANDRA-12509)
+ * Fix NPE in ComponentOfSlice.isEQ() (CASSANDRA-12706)
+ * Fix failure in LogTransactionTest (CASSANDRA-12632)
+ * Fix potentially incomplete non-frozen UDT values when querying with the
+   full primary key specified (CASSANDRA-12605)
+ * Skip writing MV mutations to commitlog on mutation.applyUnsafe() (CASSANDRA-11670)
+ * Establish consistent distinction between non-existing partition and NULL value for LWTs on static columns (CASSANDRA-12060)
+ * Extend ColumnIdentifier.internedInstances key to include the type that generated the byte buffer (CASSANDRA-12516)
+ * Backport CASSANDRA-10756 (race condition in NativeTransportService shutdown) (CASSANDRA-12472)
+ * If CF has no clustering columns, any row cache is full partition cache (CASSANDRA-12499)
+ * Correct log message for statistics of offheap memtable flush (CASSANDRA-12776)
+ * Explicitly set locale for string validation (CASSANDRA-12541,CASSANDRA-12542,CASSANDRA-12543,CASSANDRA-12545)
+Merged from 2.2:
  * Fix purgeability of tombstones with max timestamp (CASSANDRA-12792)
  * Fail repair if participant dies during sync or anticompaction (CASSANDRA-12901)
  * cqlsh COPY: unprotected pk values before converting them if not using prepared statements (CASSANDRA-12863)
@@ -151,24 +503,70 @@
  * Fix merkle tree depth calculation (CASSANDRA-12580)
  * Make Collections deserialization more robust (CASSANDRA-12618)
  * Better handle invalid system roles table (CASSANDRA-12700)
- * Split consistent range movement flag correction (CASSANDRA-12786)
- * CompactionTasks now correctly drops sstables out of compaction when not enough disk space is available (CASSANDRA-12979)
-Merged from 2.1:
- * Use portable stderr for java error in startup (CASSANDRA-13211)
- * Fix Thread Leak in OutboundTcpConnection (CASSANDRA-13204)
- * Coalescing strategy can enter infinite loop (CASSANDRA-13159)
- * Upgrade netty version to fix memory leak with client encryption (CASSANDRA-13114)
- * Fix paging for DISTINCT queries on partition keys and static columns (CASSANDRA-13017)
- * Fix race causing infinite loop if Thrift server is stopped before it starts listening (CASSANDRA-12856)
- * cqlsh copy-from: sort user type fields in csv (CASSANDRA-12959)
- * Don't skip sstables based on maxLocalDeletionTime (CASSANDRA-12765)
-
-
-2.2.8
  * Fix exceptions when enabling gossip on nodes that haven't joined the ring (CASSANDRA-12253)
  * Fix authentication problem when invoking cqlsh copy from a SOURCE command (CASSANDRA-12642)
  * Decrement pending range calculator jobs counter in finally block
   (CASSANDRA-12554)
+ * Split consistent range movement flag correction (CASSANDRA-12786)
+Merged from 2.1:
+ * Add system property to set the max number of native transport requests in queue (CASSANDRA-11363)
+ * Don't skip sstables based on maxLocalDeletionTime (CASSANDRA-12765)
+
+
+3.0.9
+ * Handle composite prefixes with final EOC=0 as in 2.x and refactor LegacyLayout.decodeBound (CASSANDRA-12423)
+ * Fix paging for 2.x to 3.x upgrades (CASSANDRA-11195)
+ * select_distinct_with_deletions_test failing on non-vnode environments (CASSANDRA-11126)
+ * Stack Overflow returned to queries while upgrading (CASSANDRA-12527)
+ * Fix legacy regex for temporary files from 2.2 (CASSANDRA-12565)
+ * Add option to state current gc_grace_seconds to tools/bin/sstablemetadata (CASSANDRA-12208)
+ * Fix file system race condition that may cause LogAwareFileLister to fail to classify files (CASSANDRA-11889)
+ * Fix file handle leaks due to simultaneous compaction/repair and
+   listing snapshots, calculating snapshot sizes, or making schema
+   changes (CASSANDRA-11594)
+ * Fix nodetool repair exits with 0 for some errors (CASSANDRA-12508)
+ * Do not shut down BatchlogManager twice during drain (CASSANDRA-12504)
+ * Disk failure policy should not be invoked on out of space (CASSANDRA-12385)
+ * Calculate last compacted key on startup (CASSANDRA-6216)
+ * Add schema to snapshot manifest, add USING TIMESTAMP clause to ALTER TABLE statements (CASSANDRA-7190)
+ * Fix clean interval not sent to commit log for empty memtable flush (CASSANDRA-12436)
+ * Fix potential resource leak in RMIServerSocketFactoryImpl (CASSANDRA-12331)
+ * Backport CASSANDRA-12002 (CASSANDRA-12177)
+ * Make sure compaction stats are updated when compaction is interrupted (CASSANDRA-12100)
+ * Fix potential bad messaging service message for paged range reads
+   within mixed-version 3.x clusters (CASSANDRA-12249)
+ * Change commitlog and sstables to track dirty and clean intervals (CASSANDRA-11828)
+ * NullPointerException during compaction on table with static columns (CASSANDRA-12336)
+ * Fixed ConcurrentModificationException when reading metrics in GraphiteReporter (CASSANDRA-11823)
+ * Fix upgrade of super columns on thrift (CASSANDRA-12335)
+ * Fixed flacky BlacklistingCompactionsTest, switched to fixed size types and increased corruption size (CASSANDRA-12359)
+ * Rerun ReplicationAwareTokenAllocatorTest on failure to avoid flakiness (CASSANDRA-12277)
+ * Exception when computing read-repair for range tombstones (CASSANDRA-12263)
+ * Lost counter writes in compact table and static columns (CASSANDRA-12219)
+ * AssertionError with MVs on updating a row that isn't indexed due to a null value (CASSANDRA-12247)
+ * Disable RR and speculative retry with EACH_QUORUM reads (CASSANDRA-11980)
+ * Add option to override compaction space check (CASSANDRA-12180)
+ * Faster startup by only scanning each directory for temporary files once (CASSANDRA-12114)
+ * Respond with v1/v2 protocol header when responding to driver that attempts
+   to connect with too low of a protocol version (CASSANDRA-11464)
+ * NullPointerExpception when reading/compacting table (CASSANDRA-11988)
+ * Fix problem with undeleteable rows on upgrade to new sstable format (CASSANDRA-12144)
+ * Fix paging logic for deleted partitions with static columns (CASSANDRA-12107)
+ * Wait until the message is being send to decide which serializer must be used (CASSANDRA-11393)
+ * Fix migration of static thrift column names with non-text comparators (CASSANDRA-12147)
+ * Fix upgrading sparse tables that are incorrectly marked as dense (CASSANDRA-11315)
+ * Fix reverse queries ignoring range tombstones (CASSANDRA-11733)
+ * Avoid potential race when rebuilding CFMetaData (CASSANDRA-12098)
+ * Avoid missing sstables when getting the canonical sstables (CASSANDRA-11996)
+ * Always select the live sstables when getting sstables in bounds (CASSANDRA-11944)
+ * Fix column ordering of results with static columns for Thrift requests in
+   a mixed 2.x/3.x cluster, also fix potential non-resolved duplication of
+   those static columns in query results (CASSANDRA-12123)
+ * Avoid digest mismatch with empty but static rows (CASSANDRA-12090)
+ * Fix EOF exception when altering column type (CASSANDRA-11820)
+ * Fix JsonTransformer output of partition with deletion info (CASSANDRA-12418)
+ * Fix NPE in SSTableLoader when specifying partial directory path (CASSANDRA-12609)
+Merged from 2.2:
  * Add local address entry in PropertyFileSnitch (CASSANDRA-11332)
  * cqlshlib tests: increase default execute timeout (CASSANDRA-12481)
  * Forward writes to replacement node when replace_address != broadcast_address (CASSANDRA-8523)
@@ -182,37 +580,68 @@
  * Add Sigar to classes included in clientutil.jar (CASSANDRA-11635)
  * Add decay to histograms and timers used for metrics (CASSANDRA-11752)
  * Fix hanging stream session (CASSANDRA-10992)
- * Add byteman support for testing (CASSANDRA-12377)
  * Fix INSERT JSON, fromJson() support of smallint, tinyint types (CASSANDRA-12371)
  * Restore JVM metric export for metric reporters (CASSANDRA-12312)
  * Release sstables of failed stream sessions only when outgoing transfers are finished (CASSANDRA-11345)
- * Revert CASSANDRA-11427 (CASSANDRA-12351)
  * Wait for tracing events before returning response and query at same consistency level client side (CASSANDRA-11465)
  * cqlsh copyutil should get host metadata by connected address (CASSANDRA-11979)
  * Fixed cqlshlib.test.remove_test_db (CASSANDRA-12214)
  * Synchronize ThriftServer::stop() (CASSANDRA-12105)
  * Use dedicated thread for JMX notifications (CASSANDRA-12146)
- * NPE when trying to remove purgable tombstones from result (CASSANDRA-12143)
  * Improve streaming synchronization and fault tolerance (CASSANDRA-11414)
  * MemoryUtil.getShort() should return an unsigned short also for architectures not supporting unaligned memory accesses (CASSANDRA-11973)
- * Don't write shadowed range tombstone (CASSANDRA-12030)
 Merged from 2.1:
- * Add system property to set the max number of native transport requests in queue (CASSANDRA-11363)
+ * Fix queries with empty ByteBuffer values in clustering column restrictions (CASSANDRA-12127)
  * Disable passing control to post-flush after flush failure to prevent data loss (CASSANDRA-11828)
  * Allow STCS-in-L0 compactions to reduce scope with LCS (CASSANDRA-12040)
  * cannot use cql since upgrading python to 2.7.11+ (CASSANDRA-11850)
- * Improve digest calculation in the presence of overlapping tombstones (CASSANDRA-11349)
  * Fix filtering on clustering columns when 2i is used (CASSANDRA-11907)
- * Account for partition deletions in tombstone histogram (CASSANDRA-12112)
 
 
-2.2.7
+3.0.8
+ * Fix potential race in schema during new table creation (CASSANDRA-12083)
+ * cqlsh: fix error handling in rare COPY FROM failure scenario (CASSANDRA-12070)
+ * Disable autocompaction during drain (CASSANDRA-11878)
+ * Add a metrics timer to MemtablePool and use it to track time spent blocked on memory in MemtableAllocator (CASSANDRA-11327)
+ * Fix upgrading schema with super columns with non-text subcomparators (CASSANDRA-12023)
+ * Add TimeWindowCompactionStrategy (CASSANDRA-9666)
+Merged from 2.2:
  * Allow nodetool info to run with readonly JMX access (CASSANDRA-11755)
  * Validate bloom_filter_fp_chance against lowest supported
    value when the table is created (CASSANDRA-11920)
- * RandomAccessReader: call isEOF() only when rebuffering, not for every read operation (CASSANDRA-12013)
  * Don't send erroneous NEW_NODE notifications on restart (CASSANDRA-11038)
  * StorageService shutdown hook should use a volatile variable (CASSANDRA-11984)
+Merged from 2.1:
+ * Avoid stalling paxos when the paxos state expires (CASSANDRA-12043)
+ * Remove finished incoming streaming connections from MessagingService (CASSANDRA-11854)
+ * Don't try to get sstables for non-repairing column families (CASSANDRA-12077)
+ * Avoid marking too many sstables as repaired (CASSANDRA-11696)
+ * Prevent select statements with clustering key > 64k (CASSANDRA-11882)
+ * Fix clock skew corrupting other nodes with paxos (CASSANDRA-11991)
+ * Remove distinction between non-existing static columns and existing but null in LWTs (CASSANDRA-9842)
+ * Cache local ranges when calculating repair neighbors (CASSANDRA-11934)
+ * Allow LWT operation on static column with only partition keys (CASSANDRA-10532)
+ * Create interval tree over canonical sstables to avoid missing sstables during streaming (CASSANDRA-11886)
+ * cqlsh COPY FROM: shutdown parent cluster after forking, to avoid corrupting SSL connections (CASSANDRA-11749)
+
+
+3.0.7
+ * Fix legacy serialization of Thrift-generated non-compound range tombstones
+   when communicating with 2.x nodes (CASSANDRA-11930)
+ * Fix Directories instantiations where CFS.initialDirectories should be used (CASSANDRA-11849)
+ * Avoid referencing DatabaseDescriptor in AbstractType (CASSANDRA-11912)
+ * Fix sstables not being protected from removal during index build (CASSANDRA-11905)
+ * cqlsh: Suppress stack trace from Read/WriteFailures (CASSANDRA-11032)
+ * Remove unneeded code to repair index summaries that have
+   been improperly down-sampled (CASSANDRA-11127)
+ * Avoid WriteTimeoutExceptions during commit log replay due to materialized
+   view lock contention (CASSANDRA-11891)
+ * Prevent OOM failures on SSTable corruption, improve tests for corruption detection (CASSANDRA-9530)
+ * Use CFS.initialDirectories when clearing snapshots (CASSANDRA-11705)
+ * Allow compaction strategies to disable early open (CASSANDRA-11754)
+ * Refactor Materialized View code (CASSANDRA-11475)
+ * Update Java Driver (CASSANDRA-11615)
+Merged from 2.2:
  * Persist local metadata earlier in startup sequence (CASSANDRA-11742)
  * Run CommitLog tests with different compression settings (CASSANDRA-9039)
  * cqlsh: fix tab completion for case-sensitive identifiers (CASSANDRA-11664)
@@ -220,13 +649,43 @@
  * Fix possible race condition in CommitLog.recover (CASSANDRA-11743)
  * Enable client encryption in sstableloader with cli options (CASSANDRA-11708)
  * Possible memory leak in NIODataInputStream (CASSANDRA-11867)
- * Fix commit log replay after out-of-order flush completion (CASSANDRA-9669)
  * Add seconds to cqlsh tracing session duration (CASSANDRA-11753)
- * Prohibit Reverse Counter type as part of the PK (CASSANDRA-9395)
+ * Prohibit Reversed Counter type as part of the PK (CASSANDRA-9395)
+Merged from 2.1:
+ * cqlsh: apply current keyspace to source command (CASSANDRA-11152)
+ * Backport CASSANDRA-11578 (CASSANDRA-11750)
+ * Clear out parent repair session if repair coordinator dies (CASSANDRA-11824)
+ * Set default streaming_socket_timeout_in_ms to 24 hours (CASSANDRA-11840)
+ * Do not consider local node a valid source during replace (CASSANDRA-11848)
+ * Add message dropped tasks to nodetool netstats (CASSANDRA-11855)
+ * Avoid holding SSTableReaders for duration of incremental repair (CASSANDRA-11739)
+
+
+3.0.6
+ * Disallow creating view with a static column (CASSANDRA-11602)
+ * Reduce the amount of object allocations caused by the getFunctions methods (CASSANDRA-11593)
+ * Potential error replaying commitlog with smallint/tinyint/date/time types (CASSANDRA-11618)
+ * Fix queries with filtering on counter columns (CASSANDRA-11629)
+ * Improve tombstone printing in sstabledump (CASSANDRA-11655)
+ * Fix paging for range queries where all clustering columns are specified (CASSANDRA-11669)
+ * Don't require HEAP_NEW_SIZE to be set when using G1 (CASSANDRA-11600)
+ * Fix sstabledump not showing cells after tombstone marker (CASSANDRA-11654)
+ * Ignore all LocalStrategy keyspaces for streaming and other related
+   operations (CASSANDRA-11627)
+ * Ensure columnfilter covers indexed columns for thrift 2i queries (CASSANDRA-11523)
+ * Only open one sstable scanner per sstable (CASSANDRA-11412)
+ * Option to specify ProtocolVersion in cassandra-stress (CASSANDRA-11410)
+ * ArithmeticException in avgFunctionForDecimal (CASSANDRA-11485)
+ * LogAwareFileLister should only use OLD sstable files in current folder to determine disk consistency (CASSANDRA-11470)
+ * Notify indexers of expired rows during compaction (CASSANDRA-11329)
+ * Properly respond with ProtocolError when a v1/v2 native protocol
+   header is received (CASSANDRA-11464)
+ * Validate that num_tokens and initial_token are consistent with one another (CASSANDRA-10120)
+Merged from 2.2:
+ * Fix commit log replay after out-of-order flush completion (CASSANDRA-9669)
  * cqlsh: correctly handle non-ascii chars in error messages (CASSANDRA-11626)
  * Exit JVM if JMX server fails to startup (CASSANDRA-11540)
  * Produce a heap dump when exiting on OOM (CASSANDRA-9861)
- * Avoid read repairing purgeable tombstones on range slices (CASSANDRA-11427)
  * Restore ability to filter on clustering columns when using a 2i (CASSANDRA-11510)
  * JSON datetime formatting needs timezone (CASSANDRA-11137)
  * Fix is_dense recalculation for Thrift-updated tables (CASSANDRA-11502)
@@ -236,54 +695,43 @@
  * cqlsh: COPY FROM should use regular inserts for single statement batches and
    report errors correctly if workers processes crash on initialization (CASSANDRA-11474)
  * Always close cluster with connection in CqlRecordWriter (CASSANDRA-11553)
- * Fix slice queries on ordered COMPACT tables (CASSANDRA-10988)
-Merged from 2.1:
- * Avoid stalling paxos when the paxos state expires (CASSANDRA-12043)
- * Remove finished incoming streaming connections from MessagingService (CASSANDRA-11854)
- * Don't try to get sstables for non-repairing column families (CASSANDRA-12077)
- * Prevent select statements with clustering key > 64k (CASSANDRA-11882)
- * Avoid marking too many sstables as repaired (CASSANDRA-11696)
- * Fix clock skew corrupting other nodes with paxos (CASSANDRA-11991)
- * Remove distinction between non-existing static columns and existing but null in LWTs (CASSANDRA-9842)
- * Support mlockall on IBM POWER arch (CASSANDRA-11576)
- * Cache local ranges when calculating repair neighbors (CASSANDRA-11933)
- * Allow LWT operation on static column with only partition keys (CASSANDRA-10532)
- * Create interval tree over canonical sstables to avoid missing sstables during streaming (CASSANDRA-11886)
- * cqlsh COPY FROM: shutdown parent cluster after forking, to avoid corrupting SSL connections (CASSANDRA-11749)
- * cqlsh: apply current keyspace to source command (CASSANDRA-11152)
- * Backport CASSANDRA-11578 (CASSANDRA-11750)
- * Clear out parent repair session if repair coordinator dies (CASSANDRA-11824)
- * Set default streaming_socket_timeout_in_ms to 24 hours (CASSANDRA-11840)
- * Do not consider local node a valid source during replace (CASSANDRA-11848)
- * Avoid holding SSTableReaders for duration of incremental repair (CASSANDRA-11739)
- * Add message dropped tasks to nodetool netstats (CASSANDRA-11855)
- * Don't compute expensive MaxPurgeableTimestamp until we've verified there's an 
-   expired tombstone (CASSANDRA-11834)
- * Add option to disable use of severity in DynamicEndpointSnitch (CASSANDRA-11737)
- * cqlsh COPY FROM fails for null values with non-prepared statements (CASSANDRA-11631)
- * Make cython optional in pylib/setup.py (CASSANDRA-11630)
- * Change order of directory searching for cassandra.in.sh to favor local one 
-   (CASSANDRA-11628)
- * cqlsh COPY FROM fails with []{} chars in UDT/tuple fields/values (CASSANDRA-11633)
- * clqsh: COPY FROM throws TypeError with Cython extensions enabled (CASSANDRA-11574)
- * cqlsh: COPY FROM ignores NULL values in conversion (CASSANDRA-11549)
- * (cqlsh) Fix potential COPY deadlock when parent process is terminating child
-   processes (CASSANDRA-11505)
- * Validate levels when building LeveledScanner to avoid overlaps with orphaned 
-   sstables (CASSANDRA-9935)
-
-
-2.2.6
  * Allow only DISTINCT queries with partition keys restrictions (CASSANDRA-11339)
  * CqlConfigHelper no longer requires both a keystore and truststore to work (CASSANDRA-11532)
  * Make deprecated repair methods backward-compatible with previous notification service (CASSANDRA-11430)
  * IncomingStreamingConnection version check message wrong (CASSANDRA-11462)
+Merged from 2.1:
+ * Support mlockall on IBM POWER arch (CASSANDRA-11576)
+ * Add option to disable use of severity in DynamicEndpointSnitch (CASSANDRA-11737)
+ * cqlsh COPY FROM fails for null values with non-prepared statements (CASSANDRA-11631)
+ * Make cython optional in pylib/setup.py (CASSANDRA-11630)
+ * Change order of directory searching for cassandra.in.sh to favor local one (CASSANDRA-11628)
+ * cqlsh COPY FROM fails with []{} chars in UDT/tuple fields/values (CASSANDRA-11633)
+ * clqsh: COPY FROM throws TypeError with Cython extensions enabled (CASSANDRA-11574)
+ * cqlsh: COPY FROM ignores NULL values in conversion (CASSANDRA-11549)
+ * Validate levels when building LeveledScanner to avoid overlaps with orphaned sstables (CASSANDRA-9935)
+
+
+3.0.5
+ * Fix rare NPE on schema upgrade from 2.x to 3.x (CASSANDRA-10943)
+ * Improve backoff policy for cqlsh COPY FROM (CASSANDRA-11320)
+ * Improve IF NOT EXISTS check in CREATE INDEX (CASSANDRA-11131)
+ * Upgrade ohc to 0.4.3
+ * Enable SO_REUSEADDR for JMX RMI server sockets (CASSANDRA-11093)
+ * Allocate merkletrees with the correct size (CASSANDRA-11390)
+ * Support streaming pre-3.0 sstables (CASSANDRA-10990)
+ * Add backpressure to compressed commit log (CASSANDRA-10971)
+ * SSTableExport supports secondary index tables (CASSANDRA-11330)
+ * Fix sstabledump to include missing info in debug output (CASSANDRA-11321)
+ * Establish and implement canonical bulk reading workload(s) (CASSANDRA-10331)
+ * Fix paging for IN queries on tables without clustering columns (CASSANDRA-11208)
+ * Remove recursive call from CompositesSearcher (CASSANDRA-11304)
+ * Fix filtering on non-primary key columns for queries without index (CASSANDRA-6377)
+ * Fix sstableloader fail when using materialized view (CASSANDRA-11275)
+Merged from 2.2:
  * DatabaseDescriptor should log stacktrace in case of Eception during seed provider creation (CASSANDRA-11312)
  * Use canonical path for directory in SSTable descriptor (CASSANDRA-10587)
  * Add cassandra-stress keystore option (CASSANDRA-9325)
- * Fix out-of-space error treatment in memtable flushing (CASSANDRA-11448).
  * Dont mark sstables as repairing with sub range repairs (CASSANDRA-11451)
- * Fix use of NullUpdater for 2i during compaction (CASSANDRA-11450)
  * Notify when sstables change after cancelling compaction (CASSANDRA-11373)
  * cqlsh: COPY FROM should check that explicit column names are valid (CASSANDRA-11333)
  * Add -Dcassandra.start_gossip startup option (CASSANDRA-10809)
@@ -291,16 +739,41 @@
  * Clarify that now() function is calculated on the coordinator node in CQL documentation (CASSANDRA-10900)
  * Fix bloom filter sizing with LCS (CASSANDRA-11344)
  * (cqlsh) Fix error when result is 0 rows with EXPAND ON (CASSANDRA-11092)
- * Fix intra-node serialization issue for multicolumn-restrictions (CASSANDRA-11196)
- * Non-obsoleting compaction operations over compressed files can impose rate limit on normal reads (CASSANDRA-11301)
  * Add missing newline at end of bin/cqlsh (CASSANDRA-11325)
  * Fix AE in nodetool cfstats (backport CASSANDRA-10859) (CASSANDRA-11297)
  * Unresolved hostname leads to replace being ignored (CASSANDRA-11210)
- * Fix filtering on non-primary key columns for thrift static column families
-   (CASSANDRA-6377)
  * Only log yaml config once, at startup (CASSANDRA-11217)
- * Preserve order for preferred SSL cipher suites (CASSANDRA-11164)
  * Reference leak with parallel repairs on the same table (CASSANDRA-11215)
+Merged from 2.1:
+ * Add a -j parameter to scrub/cleanup/upgradesstables to state how
+   many threads to use (CASSANDRA-11179)
+ * Backport CASSANDRA-10679 (CASSANDRA-9598)
+ * InvalidateKeys should have a weak ref to key cache (CASSANDRA-11176)
+ * COPY FROM on large datasets: fix progress report and debug performance (CASSANDRA-11053)
+
+3.0.4
+ * Preserve order for preferred SSL cipher suites (CASSANDRA-11164)
+ * MV should only query complex columns included in the view (CASSANDRA-11069)
+ * Failed aggregate creation breaks server permanently (CASSANDRA-11064)
+ * Add sstabledump tool (CASSANDRA-7464)
+ * Introduce backpressure for hints (CASSANDRA-10972)
+ * Fix ClusteringPrefix not being able to read tombstone range boundaries (CASSANDRA-11158)
+ * Prevent logging in sandboxed state (CASSANDRA-11033)
+ * Disallow drop/alter operations of UDTs used by UDAs (CASSANDRA-10721)
+ * Add query time validation method on Index (CASSANDRA-11043)
+ * Avoid potential AssertionError in mixed version cluster (CASSANDRA-11128)
+ * Properly handle hinted handoff after topology changes (CASSANDRA-5902)
+ * AssertionError when listing sstable files on inconsistent disk state (CASSANDRA-11156)
+ * Fix wrong rack counting and invalid conditions check for TokenAllocation
+   (CASSANDRA-11139)
+ * Avoid creating empty hint files (CASSANDRA-11090)
+ * Fix leak detection strong reference loop using weak reference (CASSANDRA-11120)
+ * Configurie BatchlogManager to stop delayed tasks on shutdown (CASSANDRA-11062)
+ * Hadoop integration is incompatible with Cassandra Driver 3.0.0 (CASSANDRA-11001)
+ * Add dropped_columns to the list of schema table so it gets handled
+   properly (CASSANDRA-11050)
+ * Fix NPE when using forceRepairRangeAsync without DC (CASSANDRA-11239)
+Merged from 2.2:
  * Range.compareTo() violates the contract of Comparable (CASSANDRA-11216)
  * Avoid NPE when serializing ErrorMessage with null message (CASSANDRA-11167)
  * Replacing an aggregate with a new version doesn't reset INITCOND (CASSANDRA-10840)
@@ -310,29 +783,24 @@
  * Handle adding fields to a UDT in SELECT JSON and toJson() (CASSANDRA-11146)
  * Better error message for cleanup (CASSANDRA-10991)
  * cqlsh pg-style-strings broken if line ends with ';' (CASSANDRA-11123)
- * Use cloned TokenMetadata in size estimates to avoid race against membership check
-   (CASSANDRA-10736)
  * Always persist upsampled index summaries (CASSANDRA-10512)
  * (cqlsh) Fix inconsistent auto-complete (CASSANDRA-10733)
  * Make SELECT JSON and toJson() threadsafe (CASSANDRA-11048)
  * Fix SELECT on tuple relations for mixed ASC/DESC clustering order (CASSANDRA-7281)
+ * Use cloned TokenMetadata in size estimates to avoid race against membership check
+   (CASSANDRA-10736)
  * (cqlsh) Support utf-8/cp65001 encoding on Windows (CASSANDRA-11030)
  * Fix paging on DISTINCT queries repeats result when first row in partition changes
    (CASSANDRA-10010)
+ * cqlsh: change default encoding to UTF-8 (CASSANDRA-11124)
 Merged from 2.1:
  * Checking if an unlogged batch is local is inefficient (CASSANDRA-11529)
- * Fix paging for COMPACT tables without clustering columns (CASSANDRA-11467)
- * Add a -j parameter to scrub/cleanup/upgradesstables to state how
-   many threads to use (CASSANDRA-11179)
- * Backport CASSANDRA-10679 (CASSANDRA-9598)
+ * Fix out-of-space error treatment in memtable flushing (CASSANDRA-11448).
  * Don't do defragmentation if reading from repaired sstables (CASSANDRA-10342)
  * Fix streaming_socket_timeout_in_ms not enforced (CASSANDRA-11286)
  * Avoid dropping message too quickly due to missing unit conversion (CASSANDRA-11302)
- * COPY FROM on large datasets: fix progress report and debug performance (CASSANDRA-11053)
- * InvalidateKeys should have a weak ref to key cache (CASSANDRA-11176)
  * Don't remove FailureDetector history on removeEndpoint (CASSANDRA-10371)
  * Only notify if repair status changed (CASSANDRA-11172)
- * Add partition key to TombstoneOverwhelmingException error message (CASSANDRA-10888)
  * Use logback setting for 'cassandra -v' command (CASSANDRA-10767)
  * Fix sstableloader to unthrottle streaming by default (CASSANDRA-9714)
  * Fix incorrect warning in 'nodetool status' (CASSANDRA-10176)
@@ -341,16 +809,50 @@
  * Gossiper#isEnabled is not thread safe (CASSANDRA-11116)
  * Avoid major compaction mixing repaired and unrepaired sstables in DTCS (CASSANDRA-11113)
  * Make it clear what DTCS timestamp_resolution is used for (CASSANDRA-11041)
- * test_bulk_round_trip_blogposts is failing occasionally (CASSANDRA-10938)
  * (cqlsh) Support timezone conversion using pytz (CASSANDRA-10397)
- * cqlsh: change default encoding to UTF-8 (CASSANDRA-11124)
+ * (cqlsh) Display milliseconds when datetime overflows (CASSANDRA-10625)
 
 
-2.2.5
+3.0.3
+ * Remove double initialization of newly added tables (CASSANDRA-11027)
+ * Filter keys searcher results by target range (CASSANDRA-11104)
+ * Fix deserialization of legacy read commands (CASSANDRA-11087)
+ * Fix incorrect computation of deletion time in sstable metadata (CASSANDRA-11102)
+ * Avoid memory leak when collecting sstable metadata (CASSANDRA-11026)
+ * Mutations do not block for completion under view lock contention (CASSANDRA-10779)
+ * Invalidate legacy schema tables when unloading them (CASSANDRA-11071)
+ * (cqlsh) handle INSERT and UPDATE statements with LWT conditions correctly
+   (CASSANDRA-11003)
+ * Fix DISTINCT queries in mixed version clusters (CASSANDRA-10762)
+ * Migrate build status for indexes along with legacy schema (CASSANDRA-11046)
+ * Ensure SSTables for legacy KEYS indexes can be read (CASSANDRA-11045)
+ * Added support for IBM zSystems architecture (CASSANDRA-11054)
+ * Update CQL documentation (CASSANDRA-10899)
+ * Check the column name, not cell name, for dropped columns when reading
+   legacy sstables (CASSANDRA-11018)
+ * Don't attempt to index clustering values of static rows (CASSANDRA-11021)
+ * Remove checksum files after replaying hints (CASSANDRA-10947)
+ * Support passing base table metadata to custom 2i validation (CASSANDRA-10924)
+ * Ensure stale index entries are purged during reads (CASSANDRA-11013)
+ * Fix AssertionError when removing from list using UPDATE (CASSANDRA-10954)
+ * Fix UnsupportedOperationException when reading old sstable with range
+   tombstone (CASSANDRA-10743)
+ * MV should use the maximum timestamp of the primary key (CASSANDRA-10910)
+ * Fix potential assertion error during compaction (CASSANDRA-10944)
+ * Fix counting of received sstables in streaming (CASSANDRA-10949)
+ * Implement hints compression (CASSANDRA-9428)
+ * Fix potential assertion error when reading static columns (CASSANDRA-10903)
+ * Avoid NoSuchElementException when executing empty batch (CASSANDRA-10711)
+ * Avoid building PartitionUpdate in toString (CASSANDRA-10897)
+ * Reduce heap spent when receiving many SSTables (CASSANDRA-10797)
+ * Add back support for 3rd party auth providers to bulk loader (CASSANDRA-10873)
+ * Eliminate the dependency on jgrapht for UDT resolution (CASSANDRA-10653)
+ * (Hadoop) Close Clusters and Sessions in Hadoop Input/Output classes (CASSANDRA-10837)
+ * Fix sstableloader not working with upper case keyspace name (CASSANDRA-10806)
+Merged from 2.2:
  * maxPurgeableTimestamp needs to check memtables too (CASSANDRA-9949)
  * Apply change to compaction throughput in real time (CASSANDRA-10025)
  * Fix potential NPE on ORDER BY queries with IN (CASSANDRA-10955)
- * Avoid over-fetching during the page of range queries (CASSANDRA-8521)
  * Start L0 STCS-compactions even if there is a L0 -> L1 compaction
    going (CASSANDRA-10979)
  * Make UUID LSB unique per process (CASSANDRA-7925)
@@ -365,7 +867,6 @@
  * Skip commit log and saved cache directories in SSTable version startup check (CASSANDRA-10902)
  * drop/alter user should be case sensitive (CASSANDRA-10817)
  * jemalloc detection fails due to quoting issues in regexv (CASSANDRA-10946)
- * Support counter-columns for native aggregates (sum,avg,max,min) (CASSANDRA-9977)
  * (cqlsh) show correct column names for empty result sets (CASSANDRA-9813)
  * Add new types to Stress (CASSANDRA-9556)
  * Add property to allow listening on broadcast interface (CASSANDRA-9748)
@@ -375,6 +876,7 @@
  * Verify tables in pseudo-system keyspaces at startup (CASSANDRA-10761)
  * (cqlsh) encode input correctly when saving history
 Merged from 2.1:
+ * test_bulk_round_trip_blogposts is failing occasionally (CASSANDRA-10938)
  * Fix isJoined return true only after becoming cluster member (CASANDRA-11007)
  * Fix bad gossip generation seen in long-running clusters (CASSANDRA-10969)
  * Avoid NPE when incremental repair fails (CASSANDRA-10909)
@@ -398,12 +900,42 @@
  * Make Stress compiles within eclipse (CASSANDRA-10807)
  * Cassandra Daemon should print JVM arguments (CASSANDRA-10764)
  * Allow cancellation of index summary redistribution (CASSANDRA-8805)
- * sstableloader will fail if there are collections in the schema tables (CASSANDRA-10700)
+
+
+3.0.2
+ * Fix upgrade data loss due to range tombstone deleting more data than then should
+   (CASSANDRA-10822)
+
+
+3.0.1
+ * Avoid MV race during node decommission (CASSANDRA-10674)
  * Disable reloading of GossipingPropertyFileSnitch (CASSANDRA-9474)
- * Fix Stress profile parsing on Windows (CASSANDRA-10808)
-
-
-2.2.4
+ * Handle single-column deletions correction in materialized views
+   when the column is part of the view primary key (CASSANDRA-10796)
+ * Fix issue with datadir migration on upgrade (CASSANDRA-10788)
+ * Fix bug with range tombstones on reverse queries and test coverage for
+   AbstractBTreePartition (CASSANDRA-10059)
+ * Remove 64k limit on collection elements (CASSANDRA-10374)
+ * Remove unclear Indexer.indexes() method (CASSANDRA-10690)
+ * Fix NPE on stream read error (CASSANDRA-10771)
+ * Normalize cqlsh DESC output (CASSANDRA-10431)
+ * Rejects partition range deletions when columns are specified (CASSANDRA-10739)
+ * Fix error when saving cached key for old format sstable (CASSANDRA-10778)
+ * Invalidate prepared statements on DROP INDEX (CASSANDRA-10758)
+ * Fix SELECT statement with IN restrictions on partition key,
+   ORDER BY and LIMIT (CASSANDRA-10729)
+ * Improve stress performance over 1k threads (CASSANDRA-7217)
+ * Wait for migration responses to complete before bootstrapping (CASSANDRA-10731)
+ * Unable to create a function with argument of type Inet (CASSANDRA-10741)
+ * Fix backward incompatibiliy in CqlInputFormat (CASSANDRA-10717)
+ * Correctly preserve deletion info on updated rows when notifying indexers
+   of single-row deletions (CASSANDRA-10694)
+ * Notify indexers of partition delete during cleanup (CASSANDRA-10685)
+ * Keep the file open in trySkipCache (CASSANDRA-10669)
+ * Updated trigger example (CASSANDRA-10257)
+Merged from 2.2:
+ * Verify tables in pseudo-system keyspaces at startup (CASSANDRA-10761)
+ * Fix IllegalArgumentException in DataOutputBuffer.reallocate for large buffers (CASSANDRA-10592)
  * Show CQL help in cqlsh in web browser (CASSANDRA-7225)
  * Serialize on disk the proper SSTable compression ratio (CASSANDRA-10775)
  * Reject index queries while the index is building (CASSANDRA-8505)
@@ -413,18 +945,9 @@
  * Fix SimpleDateType type compatibility (CASSANDRA-10027)
  * (Hadoop) fix splits calculation (CASSANDRA-10640)
  * (Hadoop) ensure that Cluster instances are always closed (CASSANDRA-10058)
- * (cqlsh) show partial trace if incomplete after max_trace_wait (CASSANDRA-7645)
- * Use most up-to-date version of schema for system tables (CASSANDRA-10652)
- * Deprecate memory_allocator in cassandra.yaml (CASSANDRA-10581,10628)
- * Expose phi values from failure detector via JMX and tweak debug
-   and trace logging (CASSANDRA-9526)
- * Fix RangeNamesQueryPager (CASSANDRA-10509)
- * Deprecate Pig support (CASSANDRA-10542)
- * Reduce contention getting instances of CompositeType (CASSANDRA-10433)
- * Fix IllegalArgumentException in DataOutputBuffer.reallocate for large buffers (CASSANDRA-10592)
 Merged from 2.1:
+ * Fix Stress profile parsing on Windows (CASSANDRA-10808)
  * Fix incremental repair hang when replica is down (CASSANDRA-10288)
- * Avoid writing range tombstones after END_OF_ROW marker (CASSANDRA-10791)
  * Optimize the way we check if a token is repaired in anticompaction (CASSANDRA-10768)
  * Add proper error handling to stream receiver (CASSANDRA-10774)
  * Warn or fail when changing cluster topology live (CASSANDRA-10243)
@@ -435,7 +958,6 @@
  * Properly reject counters as map keys (CASSANDRA-10760)
  * Fix the sstable-needs-cleanup check (CASSANDRA-10740)
  * (cqlsh) Print column names before COPY operation (CASSANDRA-8935)
- * Make paging logic consistent between searcher impls (CASSANDRA-10683)
  * Fix CompressedInputStream for proper cleanup (CASSANDRA-10012)
  * (cqlsh) Support counters in COPY commands (CASSANDRA-9043)
  * Try next replica if not possible to connect to primary replica on
@@ -443,17 +965,59 @@
  * Limit window size in DTCS (CASSANDRA-10280)
  * sstableloader does not use MAX_HEAP_SIZE env parameter (CASSANDRA-10188)
  * (cqlsh) Improve COPY TO performance and error handling (CASSANDRA-9304)
- * Don't remove level info when running upgradesstables (CASSANDRA-10692)
  * Create compression chunk for sending file only (CASSANDRA-10680)
- * Make buffered read size configurable (CASSANDRA-10249)
  * Forbid compact clustering column type changes in ALTER TABLE (CASSANDRA-8879)
  * Reject incremental repair with subrange repair (CASSANDRA-10422)
  * Add a nodetool command to refresh size_estimates (CASSANDRA-9579)
- * Shutdown compaction in drain to prevent leak (CASSANDRA-10079)
  * Invalidate cache after stream receive task is completed (CASSANDRA-10341)
  * Reject counter writes in CQLSSTableWriter (CASSANDRA-10258)
  * Remove superfluous COUNTER_MUTATION stage mapping (CASSANDRA-10605)
- * Improve json2sstable error reporting on nonexistent columns (CASSANDRA-10401)
+
+
+3.0
+ * Fix AssertionError while flushing memtable due to materialized views
+   incorrectly inserting empty rows (CASSANDRA-10614)
+ * Store UDA initcond as CQL literal in the schema table, instead of a blob (CASSANDRA-10650)
+ * Don't use -1 for the position of partition key in schema (CASSANDRA-10491)
+ * Fix distinct queries in mixed version cluster (CASSANDRA-10573)
+ * Skip sstable on clustering in names query (CASSANDRA-10571)
+ * Remove value skipping as it breaks read-repair (CASSANDRA-10655)
+ * Fix bootstrapping with MVs (CASSANDRA-10621)
+ * Make sure EACH_QUORUM reads are using NTS (CASSANDRA-10584)
+ * Fix MV replica filtering for non-NetworkTopologyStrategy (CASSANDRA-10634)
+ * (Hadoop) fix CIF describeSplits() not handling 0 size estimates (CASSANDRA-10600)
+ * Fix reading of legacy sstables (CASSANDRA-10590)
+ * Use CQL type names in schema metadata tables (CASSANDRA-10365)
+ * Guard batchlog replay against integer division by zero (CASSANDRA-9223)
+ * Fix bug when adding a column to thrift with the same name than a primary key (CASSANDRA-10608)
+ * Add client address argument to IAuthenticator::newSaslNegotiator (CASSANDRA-8068)
+ * Fix implementation of LegacyLayout.LegacyBoundComparator (CASSANDRA-10602)
+ * Don't use 'names query' read path for counters (CASSANDRA-10572)
+ * Fix backward compatibility for counters (CASSANDRA-10470)
+ * Remove memory_allocator paramter from cassandra.yaml (CASSANDRA-10581,10628)
+ * Execute the metadata reload task of all registered indexes on CFS::reload (CASSANDRA-10604)
+ * Fix thrift cas operations with defined columns (CASSANDRA-10576)
+ * Fix PartitionUpdate.operationCount()for updates with static column operations (CASSANDRA-10606)
+ * Fix thrift get() queries with defined columns (CASSANDRA-10586)
+ * Fix marking of indexes as built and removed (CASSANDRA-10601)
+ * Skip initialization of non-registered 2i instances, remove Index::getIndexName (CASSANDRA-10595)
+ * Fix batches on multiple tables (CASSANDRA-10554)
+ * Ensure compaction options are validated when updating KeyspaceMetadata (CASSANDRA-10569)
+ * Flatten Iterator Transformation Hierarchy (CASSANDRA-9975)
+ * Remove token generator (CASSANDRA-5261)
+ * RolesCache should not be created for any authenticator that does not requireAuthentication (CASSANDRA-10562)
+ * Fix LogTransaction checking only a single directory for files (CASSANDRA-10421)
+ * Fix handling of range tombstones when reading old format sstables (CASSANDRA-10360)
+ * Aggregate with Initial Condition fails with C* 3.0 (CASSANDRA-10367)
+Merged from 2.2:
+ * (cqlsh) show partial trace if incomplete after max_trace_wait (CASSANDRA-7645)
+ * Use most up-to-date version of schema for system tables (CASSANDRA-10652)
+ * Deprecate memory_allocator in cassandra.yaml (CASSANDRA-10581,10628)
+ * Expose phi values from failure detector via JMX and tweak debug
+   and trace logging (CASSANDRA-9526)
+ * Fix IllegalArgumentException in DataOutputBuffer.reallocate for large buffers (CASSANDRA-10592)
+Merged from 2.1:
+ * Shutdown compaction in drain to prevent leak (CASSANDRA-10079)
  * (cqlsh) fix COPY using wrong variable name for time_format (CASSANDRA-10633)
  * Do not run SizeEstimatesRecorder if a node is not a member of the ring (CASSANDRA-9912)
  * Improve handling of dead nodes in gossip (CASSANDRA-10298)
@@ -463,43 +1027,77 @@
  * Add validation method to PerRowSecondaryIndex (CASSANDRA-10092)
  * Support encrypted and plain traffic on the same port (CASSANDRA-10559)
  * Do STCS in DTCS windows (CASSANDRA-10276)
- * Don't try to get ancestors from half-renamed sstables (CASSANDRA-10501)
  * Avoid repetition of JVM_OPTS in debian package (CASSANDRA-10251)
  * Fix potential NPE from handling result of SIM.highestSelectivityIndex (CASSANDRA-10550)
  * Fix paging issues with partitions containing only static columns data (CASSANDRA-10381)
  * Fix conditions on static columns (CASSANDRA-10264)
  * AssertionError: attempted to delete non-existing file CommitLog (CASSANDRA-10377)
- * (cqlsh) Distinguish negative and positive infinity in output (CASSANDRA-10523)
- * (cqlsh) allow custom time_format for COPY TO (CASSANDRA-8970)
- * Don't allow startup if the node's rack has changed (CASSANDRA-10242)
  * Fix sorting for queries with an IN condition on partition key columns (CASSANDRA-10363)
 
 
-2.2.3
+3.0-rc2
+ * Fix SELECT DISTINCT queries between 2.2.2 nodes and 3.0 nodes (CASSANDRA-10473)
+ * Remove circular references in SegmentedFile (CASSANDRA-10543)
+ * Ensure validation of indexed values only occurs once per-partition (CASSANDRA-10536)
+ * Fix handling of static columns for range tombstones in thrift (CASSANDRA-10174)
+ * Support empty ColumnFilter for backward compatility on empty IN (CASSANDRA-10471)
+ * Remove Pig support (CASSANDRA-10542)
+ * Fix LogFile throws Exception when assertion is disabled (CASSANDRA-10522)
+ * Revert CASSANDRA-7486, make CMS default GC, move GC config to
+   conf/jvm.options (CASSANDRA-10403)
+ * Fix TeeingAppender causing some logs to be truncated/empty (CASSANDRA-10447)
+ * Allow EACH_QUORUM for reads (CASSANDRA-9602)
+ * Fix potential ClassCastException while upgrading (CASSANDRA-10468)
+ * Fix NPE in MVs on update (CASSANDRA-10503)
+ * Only include modified cell data in indexing deltas (CASSANDRA-10438)
+ * Do not load keyspace when creating sstable writer (CASSANDRA-10443)
+ * If node is not yet gossiping write all MV updates to batchlog only (CASSANDRA-10413)
+ * Re-populate token metadata after commit log recovery (CASSANDRA-10293)
+ * Provide additional metrics for materialized views (CASSANDRA-10323)
+ * Flush system schema tables after local schema changes (CASSANDRA-10429)
+Merged from 2.2:
+ * Reduce contention getting instances of CompositeType (CASSANDRA-10433)
+ * Fix the regression when using LIMIT with aggregates (CASSANDRA-10487)
  * Avoid NoClassDefFoundError during DataDescriptor initialization on windows (CASSANDRA-10412)
  * Preserve case of quoted Role & User names (CASSANDRA-10394)
  * cqlsh pg-style-strings broken (CASSANDRA-10484)
- * Make Hadoop CF splits more polite to custom orderered partitioners (CASSANDRA-10400)
- * Fix the regression when using LIMIT with aggregates (CASSANDRA-10487)
+ * cqlsh prompt includes name of keyspace after failed `use` statement (CASSANDRA-10369)
 Merged from 2.1:
- * Fix mmap file segment seeking to EOF (CASSANDRA-10478)
+ * (cqlsh) Distinguish negative and positive infinity in output (CASSANDRA-10523)
+ * (cqlsh) allow custom time_format for COPY TO (CASSANDRA-8970)
+ * Don't allow startup if the node's rack has changed (CASSANDRA-10242)
+ * (cqlsh) show partial trace if incomplete after max_trace_wait (CASSANDRA-7645)
  * Allow LOCAL_JMX to be easily overridden (CASSANDRA-10275)
  * Mark nodes as dead even if they've already left (CASSANDRA-10205)
- * Update internal python driver used by cqlsh (CASSANDRA-10161, CASSANDRA-10507)
 
 
-2.2.2
- * cqlsh prompt includes name of keyspace after failed `use` statement (CASSANDRA-10369)
+3.0.0-rc1
+ * Fix mixed version read request compatibility for compact static tables
+   (CASSANDRA-10373)
+ * Fix paging of DISTINCT with static and IN (CASSANDRA-10354)
+ * Allow MATERIALIZED VIEW's SELECT statement to restrict primary key
+   columns (CASSANDRA-9664)
+ * Move crc_check_chance out of compression options (CASSANDRA-9839)
+ * Fix descending iteration past end of BTreeSearchIterator (CASSANDRA-10301)
+ * Transfer hints to a different node on decommission (CASSANDRA-10198)
+ * Check partition keys for CAS operations during stmt validation (CASSANDRA-10338)
+ * Add custom query expressions to SELECT (CASSANDRA-10217)
+ * Fix minor bugs in MV handling (CASSANDRA-10362)
+ * Allow custom indexes with 0,1 or multiple target columns (CASSANDRA-10124)
+ * Improve MV schema representation (CASSANDRA-9921)
+ * Add flag to enable/disable coordinator batchlog for MV writes (CASSANDRA-10230)
+ * Update cqlsh COPY for new internal driver serialization interface (CASSANDRA-10318)
+ * Give index implementations more control over rebuild operations (CASSANDRA-10312)
+ * Update index file format (CASSANDRA-10314)
+ * Add "shadowable" row tombstones to deal with mv timestamp issues (CASSANDRA-10261)
+ * CFS.loadNewSSTables() broken for pre-3.0 sstables
+ * Cache selected index in read command to reduce lookups (CASSANDRA-10215)
+ * Small optimizations of sstable index serialization (CASSANDRA-10232)
+ * Support for both encrypted and unencrypted native transport connections (CASSANDRA-9590)
+Merged from 2.2:
  * Configurable page size in cqlsh (CASSANDRA-9855)
  * Defer default role manager setup until all nodes are on 2.2+ (CASSANDRA-9761)
- * Cancel transaction for sstables we wont redistribute index summary
-   for (CASSANDRA-10270)
- * Handle missing RoleManager in config after upgrade to 2.2 (CASSANDRA-10209) 
- * Retry snapshot deletion after compaction and gc on Windows (CASSANDRA-10222)
- * Fix failure to start with space in directory path on Windows (CASSANDRA-10239)
- * Fix repair hang when snapshot failed (CASSANDRA-10057)
- * Fall back to 1/4 commitlog volume for commitlog_total_space on small disks
-   (CASSANDRA-10199)
+ * Handle missing RoleManager in config after upgrade to 2.2 (CASSANDRA-10209)
 Merged from 2.1:
  * Bulk Loader API could not tolerate even node failure (CASSANDRA-10347)
  * Avoid misleading pushed notifications when multiple nodes
@@ -511,11 +1109,42 @@
  * Fix NPE in nodetool compactionhistory (CASSANDRA-9758)
  * (Pig) support BulkOutputFormat as a URL parameter (CASSANDRA-7410)
  * BATCH statement is broken in cqlsh (CASSANDRA-10272)
- * Added configurable warning threshold for GC duration (CASSANDRA-8907)
  * (cqlsh) Make cqlsh PEP8 Compliant (CASSANDRA-10066)
  * (cqlsh) Fix error when starting cqlsh with --debug (CASSANDRA-10282)
  * Scrub, Cleanup and Upgrade do not unmark compacting until all operations
    have completed, regardless of the occurence of exceptions (CASSANDRA-10274)
+
+
+3.0.0-beta2
+ * Fix columns returned by AbstractBtreePartitions (CASSANDRA-10220)
+ * Fix backward compatibility issue due to AbstractBounds serialization bug (CASSANDRA-9857)
+ * Fix startup error when upgrading nodes (CASSANDRA-10136)
+ * Base table PRIMARY KEY can be assumed to be NOT NULL in MV creation (CASSANDRA-10147)
+ * Improve batchlog write patch (CASSANDRA-9673)
+ * Re-apply MaterializedView updates on commitlog replay (CASSANDRA-10164)
+ * Require AbstractType.isByteOrderComparable declaration in constructor (CASSANDRA-9901)
+ * Avoid digest mismatch on upgrade to 3.0 (CASSANDRA-9554)
+ * Fix Materialized View builder when adding multiple MVs (CASSANDRA-10156)
+ * Choose better poolingOptions for protocol v4 in cassandra-stress (CASSANDRA-10182)
+ * Fix LWW bug affecting Materialized Views (CASSANDRA-10197)
+ * Ensures frozen sets and maps are always sorted (CASSANDRA-10162)
+ * Don't deadlock when flushing CFS backed custom indexes (CASSANDRA-10181)
+ * Fix double flushing of secondary index tables (CASSANDRA-10180)
+ * Fix incorrect handling of range tombstones in thrift (CASSANDRA-10046)
+ * Only use batchlog when paired materialized view replica is remote (CASSANDRA-10061)
+ * Reuse TemporalRow when updating multiple MaterializedViews (CASSANDRA-10060)
+ * Validate gc_grace_seconds for batchlog writes and MVs (CASSANDRA-9917)
+ * Fix sstablerepairedset (CASSANDRA-10132)
+Merged from 2.2:
+ * Cancel transaction for sstables we wont redistribute index summary
+   for (CASSANDRA-10270)
+ * Retry snapshot deletion after compaction and gc on Windows (CASSANDRA-10222)
+ * Fix failure to start with space in directory path on Windows (CASSANDRA-10239)
+ * Fix repair hang when snapshot failed (CASSANDRA-10057)
+ * Fall back to 1/4 commitlog volume for commitlog_total_space on small disks
+   (CASSANDRA-10199)
+Merged from 2.1:
+ * Added configurable warning threshold for GC duration (CASSANDRA-8907)
  * Fix handling of streaming EOF (CASSANDRA-10206)
  * Only check KeyCache when it is enabled
  * Change streaming_socket_timeout_in_ms default to 1 hour (CASSANDRA-8611)
@@ -531,8 +1160,39 @@
  * Fix race during construction of commit log (CASSANDRA-10049)
  * Fix LeveledCompactionStrategyTest (CASSANDRA-9757)
  * Fix broken UnbufferedDataOutputStreamPlus.writeUTF (CASSANDRA-10203)
+ * (cqlsh) default load-from-file encoding to utf-8 (CASSANDRA-9898)
+ * Avoid returning Permission.NONE when failing to query users table (CASSANDRA-10168)
  * (cqlsh) add CLEAR command (CASSANDRA-10086)
  * Support string literals as Role names for compatibility (CASSANDRA-10135)
+Merged from 2.1:
+ * Only check KeyCache when it is enabled
+ * Change streaming_socket_timeout_in_ms default to 1 hour (CASSANDRA-8611)
+ * (cqlsh) update list of CQL keywords (CASSANDRA-9232)
+
+
+3.0.0-beta1
+ * Redesign secondary index API (CASSANDRA-9459, 7771, 9041)
+ * Fix throwing ReadFailure instead of ReadTimeout on range queries (CASSANDRA-10125)
+ * Rewrite hinted handoff (CASSANDRA-6230)
+ * Fix query on static compact tables (CASSANDRA-10093)
+ * Fix race during construction of commit log (CASSANDRA-10049)
+ * Add option to only purge repaired tombstones (CASSANDRA-6434)
+ * Change authorization handling for MVs (CASSANDRA-9927)
+ * Add custom JMX enabled executor for UDF sandbox (CASSANDRA-10026)
+ * Fix row deletion bug for Materialized Views (CASSANDRA-10014)
+ * Support mixed-version clusters with Cassandra 2.1 and 2.2 (CASSANDRA-9704)
+ * Fix multiple slices on RowSearchers (CASSANDRA-10002)
+ * Fix bug in merging of collections (CASSANDRA-10001)
+ * Optimize batchlog replay to avoid full scans (CASSANDRA-7237)
+ * Repair improvements when using vnodes (CASSANDRA-5220)
+ * Disable scripted UDFs by default (CASSANDRA-9889)
+ * Bytecode inspection for Java-UDFs (CASSANDRA-9890)
+ * Use byte to serialize MT hash length (CASSANDRA-9792)
+ * Replace usage of Adler32 with CRC32 (CASSANDRA-8684)
+ * Fix migration to new format from 2.1 SSTable (CASSANDRA-10006)
+ * SequentialWriter should extend BufferedDataOutputStreamPlus (CASSANDRA-9500)
+ * Use the same repairedAt timestamp within incremental repair session (CASSANDRA-9111)
+Merged from 2.2:
  * Allow count(*) and count(1) to be use as normal aggregation (CASSANDRA-10114)
  * An NPE is thrown if the column name is unknown for an IN relation (CASSANDRA-10043)
  * Apply commit_failure_policy to more errors on startup (CASSANDRA-9749)
@@ -540,15 +1200,7 @@
  * Route gossip messages over dedicated socket (CASSANDRA-9237)
  * Add checksum to saved cache files (CASSANDRA-9265)
  * Log warning when using an aggregate without partition key (CASSANDRA-9737)
- * Avoid grouping sstables for anticompaction with DTCS (CASSANDRA-9900)
- * UDF / UDA execution time in trace (CASSANDRA-9723)
- * Fix broken internode SSL (CASSANDRA-9884)
 Merged from 2.1:
- * Change streaming_socket_timeout_in_ms default to 1 hour (CASSANDRA-8611)
- * (cqlsh) update list of CQL keywords (CASSANDRA-9232)
- * Avoid race condition during read repair (CASSANDRA-9460)
- * (cqlsh) default load-from-file encoding to utf-8 (CASSANDRA-9898)
- * Avoid returning Permission.NONE when failing to query users table (CASSANDRA-10168)
  * (cqlsh) Allow encoding to be set through command line (CASSANDRA-10004)
  * Add new JMX methods to change local compaction strategy (CASSANDRA-9965)
  * Write hints for paxos commits (CASSANDRA-7342)
@@ -558,6 +1210,41 @@
    when both exist (CASSANDRA-9777)
  * Release snapshot selfRef when doing snapshot repair (CASSANDRA-9998)
  * Cannot replace token does not exist - DN node removed as Fat Client (CASSANDRA-9871)
+Merged from 2.0:
+ * Don't cast expected bf size to an int (CASSANDRA-9959)
+ * Make getFullyExpiredSSTables less expensive (CASSANDRA-9882)
+
+
+3.0.0-alpha1
+ * Implement proper sandboxing for UDFs (CASSANDRA-9402)
+ * Simplify (and unify) cleanup of compaction leftovers (CASSANDRA-7066)
+ * Allow extra schema definitions in cassandra-stress yaml (CASSANDRA-9850)
+ * Metrics should use up to date nomenclature (CASSANDRA-9448)
+ * Change CREATE/ALTER TABLE syntax for compression (CASSANDRA-8384)
+ * Cleanup crc and adler code for java 8 (CASSANDRA-9650)
+ * Storage engine refactor (CASSANDRA-8099, 9743, 9746, 9759, 9781, 9808, 9825,
+   9848, 9705, 9859, 9867, 9874, 9828, 9801)
+ * Update Guava to 18.0 (CASSANDRA-9653)
+ * Bloom filter false positive ratio is not honoured (CASSANDRA-8413)
+ * New option for cassandra-stress to leave a ratio of columns null (CASSANDRA-9522)
+ * Change hinted_handoff_enabled yaml setting, JMX (CASSANDRA-9035)
+ * Add algorithmic token allocation (CASSANDRA-7032)
+ * Add nodetool command to replay batchlog (CASSANDRA-9547)
+ * Make file buffer cache independent of paths being read (CASSANDRA-8897)
+ * Remove deprecated legacy Hadoop code (CASSANDRA-9353)
+ * Decommissioned nodes will not rejoin the cluster (CASSANDRA-8801)
+ * Change gossip stabilization to use endpoit size (CASSANDRA-9401)
+ * Change default garbage collector to G1 (CASSANDRA-7486)
+ * Populate TokenMetadata early during startup (CASSANDRA-9317)
+ * Undeprecate cache recentHitRate (CASSANDRA-6591)
+ * Add support for selectively varint encoding fields (CASSANDRA-9499, 9865)
+ * Materialized Views (CASSANDRA-6477)
+Merged from 2.2:
+ * Avoid grouping sstables for anticompaction with DTCS (CASSANDRA-9900)
+ * UDF / UDA execution time in trace (CASSANDRA-9723)
+ * Fix broken internode SSL (CASSANDRA-9884)
+Merged from 2.1:
+ * Add new JMX methods to change local compaction strategy (CASSANDRA-9965)
  * Fix handling of enable/disable autocompaction (CASSANDRA-9899)
  * Add consistency level to tracing ouput (CASSANDRA-9827)
  * Remove repair snapshot leftover on startup (CASSANDRA-7357)
@@ -565,7 +1252,6 @@
  * Ensure atomicity inside thrift and stream session (CASSANDRA-7757)
  * Fix nodetool info error when the node is not joined (CASSANDRA-9031)
 Merged from 2.0:
- * Make getFullyExpiredSSTables less expensive (CASSANDRA-9882)
  * Log when messages are dropped due to cross_node_timeout (CASSANDRA-9793)
  * Don't track hotness when opening from snapshot for validation (CASSANDRA-9382)
 
@@ -582,7 +1268,7 @@
  * Handle corrupt files on startup (CASSANDRA-9686)
  * Fix clientutil jar and tests (CASSANDRA-9760)
  * (cqlsh) Allow the SSL protocol version to be specified through the
-   config file or environment variables (CASSANDRA-9544)
+    config file or environment variables (CASSANDRA-9544)
 Merged from 2.0:
  * Add tool to find why expired sstables are not getting dropped (CASSANDRA-10015)
  * Remove erroneous pending HH tasks from tpstats/jmx (CASSANDRA-9129)

diff --git a/NEWS.txt b/NEWS.txt
index 0f2d7ca..5a2ef51 100644
--- a/NEWS.txt
+++ b/NEWS.txt

@@ -42,134 +42,420 @@
 'sstableloader' tool. You can upgrade the file format of your snapshots
 using the provided 'sstableupgrade' tool.
 
-2.2.17
+3.0.21
 ======
 
 Upgrading
 ---------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
+    - Nothing specific to this release, but please see previous upgrading sections,
+      especially if you are upgrading from 2.2.
 
-2.2.16
+3.0.20
 ======
 
 Upgrading
 ---------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
+    - Nothing specific to this release, but please see previous upgrading sections,
+      especially if you are upgrading from 2.2.
 
-2.2.15
+3.0.19
 ======
 
 Upgrading
 ---------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
+    - repair_session_max_tree_depth setting has been added to cassandra.yaml to allow operators to reduce
+      merkle tree size if repair is creating too much heap pressure. See CASSANDRA-14096 for details.
+    - native_transport_max_negotiable_protocol_version has been added to cassandra.yaml to allow operators to
+      enforce an upper limit on the version of the native protocol that servers will negotiate with clients.
+      This can be used during upgrades from 2.1 to 3.0 to prevent errors due to incompatible paging state formats
+      between the two versions. See CASSANDRA-15193 for details.
 
-2.2.14
+
+3.0.18
 ======
 
 Upgrading
 ---------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
+    - The order of static columns in SELECT * has been fixed to match that of 2.0 and 2.1 - they are now sorted
+      alphabetically again, by their name, just like regular columns are. If you use prepared statements and
+      SELECT * queries, and have both simple and collection static columns in those tables, and are upgrading from an
+      earlier 3.0 version, then you might be affected by this change. Please see CASSANDRA-14638 for details.
 
-2.2.13
-======
-
-Upgrading
----------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
-
-2.2.12
-======
-
-Upgrading
----------
-    - See MAXIMUM TTL EXPIRATION DATE NOTICE above.
-    - Cassandra is now relying on the JVM options to properly shutdown on OutOfMemoryError. By default it will
-      rely on the OnOutOfMemoryError option as the ExitOnOutOfMemoryError and CrashOnOutOfMemoryError options
-      are not supported by the older 1.7 and 1.8 JVMs. A warning will be logged at startup if none of those JVM
-      options are used. See CASSANDRA-13006 for more details.
-    - Cassandra is not logging anymore by default an Heap histogram on OutOfMemoryError. To enable that behavior
-      set the 'cassandra.printHeapHistogramOnOutOfMemoryError' System property to 'true'. See CASSANDRA-13006
-      for more details.
-
-2.2.11
-======
-
-Upgrading
----------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
-
-2.2.10
-======
-
-Upgrading
----------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
-
-2.2.9
+3.0.17
 =====
 
 Upgrading
 ---------
-   - Compaction now correctly drops sstables out of CompactionTask when there
-     isn't enough disk space to perform the full compaction.  This should reduce
-     pending compaction tasks on systems with little remaining disk space.
+    - Materialized view users upgrading from 3.0.15 or later that have performed range movements (join, decommission, move, etc),
+      should run repair on the base tables, and subsequently on the views to ensure data affected by CASSANDRA-14251 is correctly
+      propagated to all replicas.
+    - Changes to bloom_filter_fp_chance will no longer take effect on existing sstables when the node is restarted. Only
+      compactions/upgradesstables regenerates bloom filters and Summaries sstable components. See CASSANDRA-11163
 
 Deprecation
 -----------
+    - Background read repair has been deprecated. dclocal_read_repair_chance and
+      read_repair_chance table options have been deprecated, and will be removed entirely in 4.0.
+      See CASSANDRA-13910 for details.
 
-(See note about the new feature User-Defined-Functions in 2.2.0.)
-
-Since the security manager added in 3.0 only allows Java and JavaScript
-UDFs to be run, UDFs for other languages are deprecated and support for
-non-Java and non-JavaScript UDFs is deprecated in 2.2 and has been removed
-in version 3.0.11.
-
-2.2.8
+3.0.16
 =====
 
 Upgrading
 ---------
-    - The ReversedType behaviour has been corrected for clustering columns of
-      BYTES type containing empty value. Scrub should be run on the existing
-      SSTables containing a descending clustering column of BYTES type to correct
-      their ordering. See CASSANDRA-12127 for more details.
+   - See MAXIMUM TTL EXPIRATION DATE NOTICE above.
+   - Cassandra is now relying on the JVM options to properly shutdown on OutOfMemoryError. By default it will
+     rely on the OnOutOfMemoryError option as the ExitOnOutOfMemoryError and CrashOnOutOfMemoryError options
+     are not supported by the older 1.7 and 1.8 JVMs. A warning will be logged at startup if none of those JVM
+     options are used. See CASSANDRA-13006 for more details.
+   - Cassandra is not logging anymore by default an Heap histogram on OutOfMemoryError. To enable that behavior
+     set the 'cassandra.printHeapHistogramOnOutOfMemoryError' System property to 'true'. See CASSANDRA-13006
+     for more details.
 
-2.2.7
+Materialized Views
+-------------------
+   - Following a discussion regarding concerns about the design and safety of Materialized Views, the C* development
+     community no longer recommends them for production use, and considers them experimental. Warnings messages will
+     now be logged when they are created. (See https://www.mail-archive.com/dev@cassandra.apache.org/msg11511.html)
+   - An 'enable_materialized_views' flag has been added to cassandra.yaml to allow operators to prevent creation of
+     views
+
+3.0.15
+=====
+
+Upgrading
+---------
+   - Nothing specific to this release, but please see previous upgrading sections,
+     especially if you are upgrading from 2.2.
+
+Compact Storage
+---------------
+    - Starting version 4.0, Thrift and COMPACT STORAGE is no longer supported.
+      'ALTER ... DROP COMPACT STORAGE' statement makes Compact Tables CQL-compatible,
+      exposing internal structure of Thrift/Compact Tables. You can find more details
+      on exposed internal structure under: 
+      http://cassandra.apache.org/doc/latest/cql/appendices.html#appendix-c-dropping-compact-storage
+
+      For uninterrupted cluster upgrades, drivers now support 'NO_COMPACT' startup option.
+      Supplying this flag will have same effect as 'DROP COMPACT STORAGE', but only for the
+      current connection.
+
+      In order to upgrade, clients supporting a non-compact schema view can be rolled out
+      gradually. When all the clients are updated 'ALTER ... DROP COMPACT STORAGE' can be
+      executed. After dropping compact storage, ’NO_COMPACT' option will have no effect
+      after that.
+
+Materialized Views
+-------------------
+    - Cassandra will no longer allow dropping columns on tables with Materialized Views.
+    - A change was made in the way the Materialized View timestamp is computed, which
+      may cause an old deletion to a base column which is view primary key (PK) column
+      to not be reflected in the view when repairing the base table post-upgrade. This
+      condition is only possible when a column deletion to an MV primary key (PK) column
+      not present in the base table PK (via UPDATE base SET view_pk_col = null or DELETE
+      view_pk_col FROM base) is missed before the upgrade and received by repair after the upgrade.
+      If such column deletions are done on a view PK column which is not a base PK, it's advisable
+      to run repair on the base table of all nodes prior to the upgrade. Alternatively it's possible
+      to fix potential inconsistencies by running repair on the views after upgrade or drop and
+      re-create the views. See CASSANDRA-11500 for more details.
+    - Removal of columns not selected in the Materialized View (via UPDATE base SET unselected_column
+      = null or DELETE unselected_column FROM base) may not be properly reflected in the view in some
+      situations so we advise against doing deletions on base columns not selected in views
+      until this is fixed on CASSANDRA-13826.
+
+3.0.14
+======
+
+Upgrading
+---------
+   - ALTER TABLE (ADD/DROP COLUMN) operations concurrent with a read might
+     result into data corruption (see CASSANDRA-13004 for more details).
+     Fixing this bug required a messaging protocol version bump. By default,
+     Cassandra 3.0.14 will use 3014 version for messaging.
+
+     Since Schema Migrations rely the on exact messaging protocol version
+     match between nodes, if you need schema changes during the upgrade
+     process, you have to start your nodes with `-Dcassandra.force_3_0_protocol_version=true`
+     first, in order to temporarily force a backwards compatible protocol.
+     After the whole cluster is upgraded to 3.0.14, do a rolling
+     restart of the cluster without setting that flag.
+
+     3.0.14 nodes with and withouot the flag set will be able to do schema
+     migrations with other 3.x and 3.0.x releases.
+
+     While running the cluster with the flag set to true on 3.0.14 (in
+     compatibility mode), avoid adding or removing any columns to/from
+     existing tables.
+
+     If your cluster can do without schema migrations during the upgrade
+     time, just start the cluster normally without setting aforementioned
+     flag.
+
+   - If performing a rolling upgrade from 3.0.13, there will be a schema mismatch caused
+     by a bug with the schema digest calculation in 3.0.13. This will cause unnecessary
+     but otherwise harmless schema updates, see CASSANDRA-13559 for more details.
+
+3.0.13
+======
+
+Upgrading
+---------
+   - The NativeAccessMBean isAvailable method will only return true if the
+     native library has been successfully linked. Previously it was returning
+     true if JNA could be found but was not taking into account link failures.
+
+3.0.12
+======
+
+Upgrading
+---------
+   - In 2.1, the default for otc_coalescing_strategy was 'DISABLED'.
+     In 2.2 and 3.0, it was changed to 'TIMEHORIZON', but that value was shown
+     to be a performance regression. The default for 3.11.0 and newer has
+     been reverted to 'DISABLED'. Users upgrading to Cassandra 3.0 should
+     consider setting otc_coalescing_strategy to 'DISABLED'.
+
+3.0.11
+======
+
+Upgrading
+---------
+   - Support for alter types of already defined tables and of UDTs fields has been disabled.
+     If it is necessary to return a different type, please use casting instead. See
+     CASSANDRA-12443 for more details.
+   - Specifying the default_time_to_live option when creating or altering a
+     materialized view was erroneously accepted (and ignored). It is now
+     properly rejected.
+   - Only Java and JavaScript are now supported UDF languages.
+     The sandbox in 3.0 already prevented the use of script languages except Java
+     and JavaScript.
+   - Compaction now correctly drops sstables out of CompactionTask when there
+     isn't enough disk space to perform the full compaction.  This should reduce
+     pending compaction tasks on systems with little remaining disk space.
+   - Primary ranges in the system.size_estimates table are now based on the keyspace
+     replication settings and adjacent ranges are no longer merged (CASSANDRA-9639).
+
+3.0.10
+======
+
+Upgrading
+---------
+   - memtable_allocation_type: offheap_buffers is no longer allowed to be specified in the 3.0 series.
+     This was an oversight that can cause segfaults. Offheap was re-introduced in 3.4 see CASSANDRA-11039
+     and CASSANDRA-9472 for details.
+
+3.0.9
+=====
+
+Upgrading
+---------
+   - The ReversedType behaviour has been corrected for clustering columns of
+     BYTES type containing empty value. Scrub should be run on the existing
+     SSTables containing a descending clustering column of BYTES type to correct
+     their ordering. See CASSANDRA-12127 for more details.
+
+3.0.8
+=====
+
+Upgrading
+---------
+   - Ec2MultiRegionSnitch will no longer automatically set broadcast_rpc_address
+     to the public instance IP if this property is defined on cassandra.yaml.
+
+3.0.7
+=====
+
+Upgrading
+---------
+   - A maximum size for SSTables values has been introduced, to prevent out of memory
+     exceptions when reading corrupt SSTables. This maximum size can be set via
+     max_value_size_in_mb in cassandra.yaml. The default is 256MB, which matches the default
+     value of native_transport_max_frame_size_in_mb. SSTables will be considered corrupt if
+     they contain values whose size exceeds this limit. See CASSANDRA-9530 for more details.
+
+Deprecation
+-----------
+   - DateTieredCompactionStrategy has been deprecated - new tables should use
+     TimeWindowCompactionStrategy. Note that migrating an existing DTCS-table to TWCS might
+     cause increased compaction load for a while after the migration so make sure you run
+     tests before migrating. Read CASSANDRA-9666 for background on this.
+
+New features
+------------
+   - TimeWindowCompactionStrategy has been added. This has proven to be a better approach
+     to time series compaction and new tables should use this instead of DTCS. See
+     CASSANDRA-9666 for details.
+
+3.0.6
 =====
 
 New features
 ------------
-    - JSON timestamps are now in UTC and contain the timezone information, see
-      CASSANDRA-11137 for more details.
+   - JSON timestamps are now in UTC and contain the timezone information, see
+     CASSANDRA-11137 for more details.
 
-Upgrading
----------
-    - Ec2MultiRegionSnitch will no longer automatically set broadcast_rpc_address
-      to the public instance IP if this property is defined on cassandra.yaml.
-
-
-2.2.6
+3.0.5
 =====
 
 Upgrading
 ---------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
+   - Nothing specific to this release, but please see previous versions upgrading section,
+     especially if you are upgrading from 2.2.
 
-2.2.5
+3.0.4
+=====
+
+New features
+------------
+   - sstabledump tool is added to be 3.0 version of former sstable2json. The tool only
+     supports v3.0+ SSTables. See tool's help for more detail.
+
+Upgrading
+---------
+   - Nothing specific to this release, but please see previous versions upgrading section,
+     especially if you are upgrading from 2.2.
+
+
+3.0.3
+=====
+
+New features
+------------
+   - Hinted handoff now supports compression. Reference cassandra.yaml:hints_compression.
+     Note: hints compression is currently disabled by default.
+
+Upgrading
+---------
+    - Nothing specific to 3.0.3 but please see previous versions upgrading section,
+      especially if you are upgrading from 2.2.
+
+
+3.0.1
 =====
 
 Upgrading
 ---------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
+   - The return value of SelectStatement::getLimit as been changed from DataLimits
+     to int.
+   - Custom index implementation should be aware that the method Indexer::indexes()
+     has been removed as its contract was misleading and all custom implementation
+     should have almost surely returned true inconditionally for that method.
+   - GC logging is now enabled by default (you can disable it in the jvm.options
+     file if you prefer).
+
+
+3.0
+===
+
+New features
+------------
+   - EACH_QUORUM is now a supported consistency level for read requests.
+   - Support for IN restrictions on any partition key component or clustering key
+     as well as support for EQ and IN multicolumn restrictions has been added to
+     UPDATE and DELETE statement.
+   - Support for single-column and multi-colum slice restrictions (>, >=, <= and <)
+     has been added to DELETE statements
+   - nodetool rebuild_index accepts the index argument without
+     the redundant table name
+   - Materialized Views, which allow for server-side denormalization, is now
+     available. Materialized views provide an alternative to secondary indexes
+     for non-primary key queries, and perform much better for indexing high
+     cardinality columns.
+     See http://www.datastax.com/dev/blog/new-in-cassandra-3-0-materialized-views
+   - Hinted handoff has been completely rewritten. Hints are now stored in flat
+     files, with less overhead for storage and more efficient dispatch.
+     See CASSANDRA-6230 for full details.
+   - Option to not purge unrepaired tombstones. To avoid users having data resurrected
+     if repair has not been run within gc_grace_seconds, an option has been added to
+     only allow tombstones from repaired sstables to be purged. To enable, set the
+     compaction option 'only_purge_repaired_tombstones':true but keep in mind that if
+     you do not run repair for a long time, you will keep all tombstones around which
+     can cause other problems.
+   - Enabled warning on GC taking longer than 1000ms. See
+     cassandra.yaml:gc_warn_threshold_in_ms
+
+Upgrading
+---------
+   - Clients must use the native protocol version 3 when upgrading from 2.2.X as
+     the native protocol version 4 is not compatible between 2.2.X and 3.Y. See
+     https://www.mail-archive.com/user@cassandra.apache.org/msg45381.html for details.
+   - A new argument of type InetAdress has been added to IAuthenticator::newSaslNegotiator,
+     representing the IP address of the client attempting authentication. It will be a breaking
+     change for any custom implementations.
+   - token-generator tool has been removed.
+   - Upgrade to 3.0 is supported from Cassandra 2.1 versions greater or equal to 2.1.9,
+     or Cassandra 2.2 versions greater or equal to 2.2.2. Upgrade from Cassandra 2.0 and
+     older versions is not supported.
+   - The 'memtable_allocation_type: offheap_objects' option has been removed. It should
+     be re-introduced in a future release and you can follow CASSANDRA-9472 to know more.
+   - Configuration parameter memory_allocator in cassandra.yaml has been removed.
+   - The native protocol versions 1 and 2 are not supported anymore.
+   - Max mutation size is now configurable via max_mutation_size_in_kb setting in
+     cassandra.yaml; the default is half the size commitlog_segment_size_in_mb * 1024.
+   - 3.0 requires Java 8u40 or later.
+   - Garbage collection options were moved from cassandra-env to jvm.options file.
+   - New transaction log files have been introduced to replace the compactions_in_progress
+     system table, temporary file markers (tmp and tmplink) and sstable ancerstors.
+     Therefore, compaction metadata no longer contains ancestors. Transaction log files
+     list sstable descriptors involved in compactions and other operations such as flushing
+     and streaming. Use the sstableutil tool to list any sstable files currently involved
+     in operations not yet completed, which previously would have been marked as temporary.
+     A transaction log file contains one sstable per line, with the prefix "add:" or "remove:".
+     They also contain a special line "commit", only inserted at the end when the transaction
+     is committed. On startup we use these files to cleanup any partial transactions that were
+     in progress when the process exited. If the commit line is found, we keep new sstables
+     (those with the "add" prefix) and delete the old sstables (those with the "remove" prefix),
+     vice-versa if the commit line is missing. Should you lose or delete these log files,
+     both old and new sstable files will be kept as live files, which will result in duplicated
+     sstables. These files are protected by incremental checksums so you should not manually
+     edit them. When restoring a full backup or moving sstable files, you should clean-up
+     any left over transactions and their temporary files first. You can use this command:
+      ===> sstableutil -c ks table
+     See CASSANDRA-7066 for full details.
+   - New write stages have been added for batchlog and materialized view mutations
+     you can set their size in cassandra.yaml
+   - User defined functions are now executed in a sandbox.
+     To use UDFs and UDAs, you have to enable them in cassandra.yaml.
+   - New SSTable version 'la' with improved bloom-filter false-positive handling
+     compared to previous version 'ka' used in 2.2 and 2.1. Running sstableupgrade
+     is not necessary but recommended.
+   - Before upgrading to 3.0, make sure that your cluster is in complete agreement
+     (schema versions outputted by `nodetool describecluster` are all the same).
+   - Schema metadata is now stored in the new `system_schema` keyspace, and
+     legacy `system.schema_*` tables are now gone; see CASSANDRA-6717 for details.
+   - Pig's support has been removed.
+   - Hadoop BulkOutputFormat and BulkRecordWriter have been removed; use
+     CqlBulkOutputFormat and CqlBulkRecordWriter instead.
+   - Hadoop ColumnFamilyInputFormat and ColumnFamilyOutputFormat have been removed;
+     use CqlInputFormat and CqlOutputFormat instead.
+   - Hadoop ColumnFamilyRecordReader and ColumnFamilyRecordWriter have been removed;
+     use CqlRecordReader and CqlRecordWriter instead.
+   - hinted_handoff_enabled in cassandra.yaml no longer supports a list of data centers.
+     To specify a list of excluded data centers when hinted_handoff_enabled is set to true,
+     use hinted_handoff_disabled_datacenters, see CASSANDRA-9035 for details.
+   - The `sstable_compression` and `chunk_length_kb` compression options have been deprecated.
+     The new options are `class` and `chunk_length_in_kb`. Disabling compression should now
+     be done by setting the new option `enabled` to `false`.
+   - The compression option `crc_check_chance` became a top-level table option, but is currently
+     enforced only against tables with enabled compression.
+   - Only map syntax is now allowed for caching options. ALL/NONE/KEYS_ONLY/ROWS_ONLY syntax
+     has been deprecated since 2.1.0 and is being removed in 3.0.0.
+   - The 'index_interval' option for 'CREATE TABLE' statements, which has been deprecated
+     since 2.1 and replaced with the 'min_index_interval' and 'max_index_interval' options,
+     has now been removed.
+   - The 'replicate_on_write' and 'populate_io_cache_on_flush' options for 'CREATE TABLE' statements,
+     which have been deprecated since 2.1, have also been removed.
+   - Batchlog entries are now stored in a new table - system.batches.
+     The old one has been deprecated.
+   - JMX methods set/getCompactionStrategyClass have been removed, use
+     set/getCompactionParameters or set/getCompactionParametersJson instead.
+   - SizeTieredCompactionStrategy parameter cold_reads_to_omit has been removed.
+   - The secondary index API has been comprehensively reworked. This will be a breaking
+     change for any custom index implementations, which should now look to implement
+     the new org.apache.cassandra.index.Index interface. New syntax has been added to create
+     and query row-based indexes, which are not explicitly linked to a single column in the
+     base table.
+
 
 2.2.4
 =====
@@ -191,8 +477,6 @@
       "rack1". To override this behaviour use -Dcassandra.ignore_rack=true and/or
       -Dcassandra.ignore_dc=true.
     - Reloading the configuration file of GossipingPropertyFileSnitch has been disabled.
-    - GC logging is now enabled by default (but you can disable it if you want by
-      commenting the relevant lines of the cassandra-env file).
 
 Upgrading
 ---------
@@ -222,21 +506,11 @@
 2.2.2
 =====
 
-Upgrading
----------
-    - Version 1 and 2 of the native protocol are now deprecated and support
-      will be removed in Cassandra 3.0. You are encouraged to upgrade to a
-      client driver using version 3 of the native protocol.
-
 Changed Defaults
 ----------------
    - commitlog_total_space_in_mb will use the smaller of 8192, and 1/4
      of the total space of the commitlog volume. (Before: always used
      8192)
-   - Incremental repair is on by default since 2.2.0, run full repairs by
-     providing the '-full' parameter to nodetool repair.
-   - Parallel repairs are the default since 2.2.0, run sequential repairs
-     by providing the '-seq' parameter to nodetool repair.
    - The following INFO logs were reduced to DEBUG level and will now show
      on debug.log instead of system.log:
       - Memtable flushing actions
@@ -258,11 +532,6 @@
 2.2.1
 =====
 
-Upgrading
----------
-    - Nothing specific to this release, but please see 2.2 if you are upgrading
-      from a previous version.
-
 New features
 ------------
    - COUNT(*) and COUNT(1) can be selected with other columns or functions
@@ -348,9 +617,6 @@
      3.0.  This will inherently be backwards-incompatible with any 2.2
      UDF that perform insecure operations such as opening a socket or
      writing to the filesystem.
-
-     Per the previous note about adding a security manager in 3.0, this security manager
-     means that non JavaScipt UDF's won't run, there for their use is deprecated.
      ************************************************************************
    - Row-cache is now fully off-heap.
    - jemalloc is now automatically preloaded and used on Linux and OS-X if

diff --git a/NOTICE.txt b/NOTICE.txt
index a71d822..a20994f 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt

@@ -74,3 +74,12 @@
 (https://github.com/snazy/ohc)
 Java Off-Heap-Cache, licensed under APLv2
 Copyright 2014-2015 Robert Stupp, Germany.
+
+Protocol buffers for varint encoding
+https://developers.google.com/protocol-buffers/
+Copyright 2008 Google Inc.  All rights reserved.
+BSD 3-clause
+
+ASM
+(http://asm.ow2.org/)
+Copyright (c) 2000-2011 INRIA, France Telecom

diff --git a/README.asc b/README.asc
index 6360fe9..8ae90e8 100644
--- a/README.asc
+++ b/README.asc

@@ -11,7 +11,7 @@
 
 Requirements
 ------------
-. Java >= 1.7 (OpenJDK and Oracle JVMS have been tested)
+. Java >= 1.8 (OpenJDK and Oracle JVMS have been tested)
 . Python 2.7 (for cqlsh)
 
 Getting started

diff --git a/bin/cassandra b/bin/cassandra
index 7a1dcbd..f887b31 100755
--- a/bin/cassandra
+++ b/bin/cassandra

@@ -31,10 +31,11 @@
 #   JVM_OPTS -- Additional arguments to the JVM for heap size, etc
 #   JVM_ON_OUT_OF_MEMORY_ERROR_OPT -- The OnOutOfMemoryError JVM option if specified
 #   CASSANDRA_CONF -- Directory containing Cassandra configuration files.
+#   CASSANDRA_LOG_DIR -- Directory containing logs(default: $CASSANDRA_HOME/logs).
 #
 # As a convenience, a fragment of shell is sourced in order to set one or
 # more of these variables. This so-called `include' can be placed in a 
-# number of locations and will be searched for in order. The lowest 
+# number of locations and will be searched for in order. The highest 
 # priority search path is the same directory as the startup script, and
 # since this is the location of the sample in the project tree, it should
 # almost work Out Of The Box.
@@ -109,7 +110,7 @@
 # avoid disk I/O. Even for the purpose of CPU efficiency, we don't
 # really have CPU<->data affinity anyway. Also, empirically test that numactl
 # works before trying to use it (CASSANDRA-3245).
-NUMACTL_ARGS="--interleave=all"
+NUMACTL_ARGS=${NUMACTL_ARGS:-"--interleave=all"}
 if which numactl >/dev/null 2>/dev/null && numactl $NUMACTL_ARGS ls / >/dev/null 2>/dev/null
 then
     NUMACTL="numactl $NUMACTL_ARGS"
@@ -126,9 +127,13 @@
     . "$CASSANDRA_CONF/cassandra-env.sh"
 fi
 
+if [ -z "$CASSANDRA_LOG_DIR" ]; then
+  CASSANDRA_LOG_DIR=$CASSANDRA_HOME/logs
+fi
+
 # Special-case path variables.
 case "`uname`" in
-    CYGWIN*) 
+    CYGWIN*|MINGW*) 
         CLASSPATH=`cygpath -p -w "$CLASSPATH"`
         CASSANDRA_CONF=`cygpath -p -w "$CASSANDRA_CONF"`
     ;;
@@ -190,7 +195,7 @@
     props="$3"
     class="$4"
     cassandra_parms="-Dlogback.configurationFile=logback.xml"
-    cassandra_parms="$cassandra_parms -Dcassandra.logdir=$CASSANDRA_HOME/logs"
+    cassandra_parms="$cassandra_parms -Dcassandra.logdir=$CASSANDRA_LOG_DIR"
     cassandra_parms="$cassandra_parms -Dcassandra.storagedir=$cassandra_storagedir"
 
     if [ "x$pidpath" != "x" ]; then

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index 3d0e056..2477766 100644
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py

@@ -70,8 +70,9 @@
 CQL_LIB_PREFIX = 'cassandra-driver-internal-only-'
 
 CASSANDRA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
-CASSANDRA_CQL_HTML_FALLBACK = 'https://cassandra.apache.org/doc/cql3/CQL-2.2.html'
+CASSANDRA_CQL_HTML_FALLBACK = 'https://cassandra.apache.org/doc/cql3/CQL-3.0.html'
 
+# default location of local CQL.html
 if os.path.exists(CASSANDRA_PATH + '/doc/cql3/CQL.html'):
     # default location of local CQL.html
     CASSANDRA_CQL_HTML = 'file://' + CASSANDRA_PATH + '/doc/cql3/CQL.html'
@@ -147,10 +148,12 @@
 
 from cassandra.auth import PlainTextAuthProvider
 from cassandra.cluster import Cluster
+from cassandra.marshal import int64_unpack
 from cassandra.metadata import (ColumnMetadata, KeyspaceMetadata,
                                 TableMetadata, protect_name, protect_names)
 from cassandra.policies import WhiteListRoundRobinPolicy
 from cassandra.query import SimpleStatement, ordered_dict_factory, TraceUnavailable
+from cassandra.util import datetime_from_timestamp
 
 # cqlsh should run correctly when run out of a Cassandra source tree,
 # out of an unpacked Cassandra tarball, and after a proper package install.
@@ -171,8 +174,6 @@
 
 DEFAULT_HOST = '127.0.0.1'
 DEFAULT_PORT = 9042
-DEFAULT_CQLVER = '3.3.1'
-DEFAULT_PROTOCOL_VERSION = 4
 DEFAULT_CONNECT_TIMEOUT_SECONDS = 5
 DEFAULT_REQUEST_TIMEOUT_SECONDS = 10
 
@@ -203,6 +204,7 @@
                                                     - one of the supported browsers in https://docs.python.org/2/library/webbrowser.html.
                                                     - browser path followed by %s, example: /usr/bin/google-chrome-stable %s""")
 parser.add_option('--ssl', action='store_true', help='Use SSL', default=False)
+parser.add_option('--no_compact', action='store_true', help='No Compact', default=False)
 parser.add_option("-u", "--username", help="Authenticate as user.")
 parser.add_option("-p", "--password", help="Authenticate using password.")
 parser.add_option('-k', '--keyspace', help='Authenticate to the given keyspace.')
@@ -212,9 +214,13 @@
 parser.add_option("--encoding", help="Specify a non-default encoding for output." +
                   " (Default: %s)" % (UTF8,))
 parser.add_option("--cqlshrc", help="Specify an alternative cqlshrc file location.")
-parser.add_option('--cqlversion', default=DEFAULT_CQLVER,
-                  help='Specify a particular CQL version (default: %default).'
+parser.add_option('--cqlversion', default=None,
+                  help='Specify a particular CQL version, '
+                       'by default the highest version supported by the server will be used.'
                        ' Examples: "3.0.3", "3.1.0"')
+parser.add_option("--protocol-version", type="int", default=None,
+                  help='Specify a specific protcol version otherwise the client will default and downgrade as necessary')
+
 parser.add_option("-e", "--execute", help='Execute the statement and quit.')
 parser.add_option("--connect-timeout", default=DEFAULT_CONNECT_TIMEOUT_SECONDS, dest='connect_timeout',
                   help='Specify the connection timeout in seconds (default: %default seconds).')
@@ -260,8 +266,8 @@
 # END history/config definition
 
 CQL_ERRORS = (
-    cassandra.AlreadyExists, cassandra.AuthenticationFailed, cassandra.InvalidRequest,
-    cassandra.Timeout, cassandra.Unauthorized, cassandra.OperationTimedOut,
+    cassandra.AlreadyExists, cassandra.AuthenticationFailed, cassandra.CoordinationFailure,
+    cassandra.InvalidRequest, cassandra.Timeout, cassandra.Unauthorized, cassandra.OperationTimedOut,
     cassandra.cluster.NoHostAvailable,
     cassandra.connection.ConnectionBusy, cassandra.connection.ProtocolError, cassandra.connection.ConnectionException,
     cassandra.protocol.ErrorMessage, cassandra.protocol.InternalError, cassandra.query.TraceUnavailable
@@ -333,12 +339,13 @@
                                   | "KEYSPACE" ksname=<keyspaceName>?
                                   | ( "COLUMNFAMILY" | "TABLE" ) cf=<columnFamilyName>
                                   | "INDEX" idx=<indexName>
+                                  | "MATERIALIZED" "VIEW" mv=<materializedViewName>
                                   | ( "COLUMNFAMILIES" | "TABLES" )
                                   | "FULL"? "SCHEMA"
                                   | "CLUSTER"
                                   | "TYPES"
                                   | "TYPE" ut=<userTypeName>
-                                  | (ksname=<keyspaceName> | cf=<columnFamilyName> | idx=<indexName>))
+                                  | (ksname=<keyspaceName> | cf=<columnFamilyName> | idx=<indexName> | mv=<materializedViewName>))
                     ;
 
 <consistencyCommand> ::= "CONSISTENCY" ( level=<consistencyLevel> )?
@@ -510,6 +517,10 @@
     pass
 
 
+class MaterializedViewNotFound(Exception):
+    pass
+
+
 class ObjectNotFound(Exception):
     pass
 
@@ -596,7 +607,7 @@
 
 def extend_cql_deserialization():
     """
-    The python driver returns BLOBs as string, but we expect them as bytearrays; therefore we change
+    The python driver returns BLOBs as string, but we expect them as bytearrays
     the implementation of cassandra.cqltypes.BytesType.deserialize.
 
     The deserializers package exists only when the driver has been compiled with cython extensions and
@@ -614,6 +625,25 @@
             del cassandra.deserializers.DesBytesType
 
     cassandra.cqltypes.BytesType.deserialize = staticmethod(lambda byts, protocol_version: bytearray(byts))
+
+    class DateOverFlowWarning(RuntimeWarning):
+        pass
+
+    # Native datetime types blow up outside of datetime.[MIN|MAX]_YEAR. We will fall back to an int timestamp
+    def deserialize_date_fallback_int(byts, protocol_version):
+        timestamp_ms = int64_unpack(byts)
+        try:
+            return datetime_from_timestamp(timestamp_ms / 1000.0)
+        except OverflowError:
+            warnings.warn(DateOverFlowWarning("Some timestamps are larger than Python datetime can represent. Timestamps are displayed in milliseconds from epoch."))
+            return timestamp_ms
+
+    cassandra.cqltypes.DateType.deserialize = staticmethod(deserialize_date_fallback_int)
+
+    if hasattr(cassandra, 'deserializers'):
+        del cassandra.deserializers.DesDateType
+
+    # Return cassandra.cqltypes.EMPTY instead of None for empty values
     cassandra.cqltypes.CassandraType.support_empty_values = True
 
 
@@ -676,8 +706,9 @@
     def __init__(self, hostname, port, color=False,
                  username=None, password=None, encoding=None, stdin=None, tty=True,
                  completekey=DEFAULT_COMPLETEKEY, browser=None, use_conn=None,
-                 cqlver=DEFAULT_CQLVER, keyspace=None,
+                 cqlver=None, keyspace=None,
                  tracing_enabled=False, expand_enabled=False,
+                 no_compact=False,
                  display_nanotime_format=DEFAULT_NANOTIME_FORMAT,
                  display_timestamp_format=DEFAULT_TIMESTAMP_FORMAT,
                  display_date_format=DEFAULT_DATE_FORMAT,
@@ -687,7 +718,7 @@
                  ssl=False,
                  single_statement=None,
                  request_timeout=DEFAULT_REQUEST_TIMEOUT_SECONDS,
-                 protocol_version=DEFAULT_PROTOCOL_VERSION,
+                 protocol_version=None,
                  connect_timeout=DEFAULT_CONNECT_TIMEOUT_SECONDS):
         cmd.Cmd.__init__(self, completekey=completekey)
         self.hostname = hostname
@@ -706,15 +737,19 @@
         if use_conn:
             self.conn = use_conn
         else:
-            self.conn = Cluster(contact_points=(self.hostname,), port=self.port, cql_version=cqlver,
-                                protocol_version=protocol_version,
-                                auth_provider=self.auth_provider,
+            kwargs = {}
+            if protocol_version is not None:
+                kwargs['protocol_version'] = protocol_version
+            if cqlver is not None:
+                kwargs['cql_version'] = cqlver
+            self.conn = Cluster(contact_points=(self.hostname,), port=self.port,
+                                auth_provider=self.auth_provider, no_compact=no_compact,
                                 ssl_options=sslhandling.ssl_settings(hostname, CONFIG_FILE) if ssl else None,
                                 load_balancing_policy=WhiteListRoundRobinPolicy([self.hostname]),
                                 control_connection_timeout=connect_timeout,
-                                connect_timeout=connect_timeout)
+                                connect_timeout=connect_timeout,
+                                **kwargs)
         self.owns_connection = not use_conn
-        self.set_expanded_cql_version(cqlver)
 
         if keyspace:
             self.session = self.conn.connect(keyspace)
@@ -738,6 +773,7 @@
         self.session.row_factory = ordered_dict_factory
         self.session.default_consistency_level = cassandra.ConsistencyLevel.ONE
         self.get_connection_versions()
+        self.set_expanded_cql_version(self.connection_versions['cql'])
 
         self.current_keyspace = keyspace
 
@@ -778,6 +814,10 @@
         self.single_statement = single_statement
 
     @property
+    def batch_mode(self):
+        return not self.tty
+
+    @property
     def is_using_utf8(self):
         # utf8 encodings from https://docs.python.org/{2,3}/library/codecs.html
         return self.encoding.replace('-', '_').lower() in ['utf', 'utf_8', 'u8', 'utf8', CP65001]
@@ -848,9 +888,9 @@
         result, = self.session.execute("select * from system.local where key = 'local'")
         vers = {
             'build': result['release_version'],
-            'protocol': result['native_protocol_version'],
             'cql': result['cql_version'],
         }
+        vers['protocol'] = self.conn.protocol_version
         self.connection_versions = vers
 
     def get_keyspace_names(self):
@@ -862,6 +902,12 @@
 
         return map(str, self.get_keyspace_meta(ksname).tables.keys())
 
+    def get_materialized_view_names(self, ksname=None):
+        if ksname is None:
+            ksname = self.current_keyspace
+
+        return map(str, self.get_keyspace_meta(ksname).views.keys())
+
     def get_index_names(self, ksname=None):
         if ksname is None:
             ksname = self.current_keyspace
@@ -966,6 +1012,15 @@
 
         return ksmeta.indexes[idxname]
 
+    def get_view_meta(self, ksname, viewname):
+        if ksname is None:
+            ksname = self.current_keyspace
+        ksmeta = self.get_keyspace_meta(ksname)
+
+        if viewname not in ksmeta.views:
+            raise MaterializedViewNotFound("Materialized view %r not found" % viewname)
+        return ksmeta.views[viewname]
+
     def get_object_meta(self, ks, name):
         if name is None:
             if ks and ks in self.conn.metadata.keyspaces:
@@ -985,6 +1040,8 @@
             return ksmeta.tables[name]
         elif name in ksmeta.indexes:
             return ksmeta.indexes[name]
+        elif name in ksmeta.views:
+            return ksmeta.views[name]
 
         raise ObjectNotFound("%r not found in keyspace %r" % (name, ks))
 
@@ -1226,7 +1283,22 @@
 
         return success
 
-    def parse_for_table_meta(self, query_string):
+    def parse_for_select_meta(self, query_string):
+        try:
+            parsed = cqlruleset.cql_parse(query_string)[1]
+        except IndexError:
+            return None
+        ks = self.cql_unprotect_name(parsed.get_binding('ksname', None))
+        name = self.cql_unprotect_name(parsed.get_binding('cfname', None))
+        try:
+            return self.get_table_meta(ks, name)
+        except ColumnFamilyNotFound:
+            try:
+                return self.get_view_meta(ks, name)
+            except MaterializedViewNotFound:
+                raise ObjectNotFound("%r not found in keyspace %r" % (name, ks))
+
+    def parse_for_update_meta(self, query_string):
         try:
             parsed = cqlruleset.cql_parse(query_string)[1]
         except IndexError:
@@ -1262,7 +1334,7 @@
             return False, None
 
         if statement.query_string[:6].lower() == 'select':
-            self.print_result(result, self.parse_for_table_meta(statement.query_string))
+            self.print_result(result, self.parse_for_select_meta(statement.query_string))
         elif statement.query_string.lower().startswith("list users") or statement.query_string.lower().startswith("list roles"):
             self.print_result(result, self.get_table_meta('system_auth', 'roles'))
         elif statement.query_string.lower().startswith("list"):
@@ -1270,7 +1342,7 @@
         elif result:
             # CAS INSERT/UPDATE
             self.writeresult("")
-            self.print_static_result(result.column_names, list(result), self.parse_for_table_meta(statement.query_string))
+            self.print_static_result(result.column_names, list(result), self.parse_for_update_meta(statement.query_string))
         self.flush_output()
         return True, future
 
@@ -1441,6 +1513,16 @@
         out.write(self.get_index_meta(ksname, idxname).export_as_string())
         out.write("\n")
 
+    def print_recreate_materialized_view(self, ksname, viewname, out):
+        """
+        Output CQL commands which should be pasteable back into a CQL session
+        to recreate the given materialized view.
+
+        Writes output to the given out stream.
+        """
+        out.write(self.get_view_meta(ksname, viewname).export_as_string())
+        out.write("\n")
+
     def print_recreate_object(self, ks, name, out):
         """
         Output CQL commands which should be pasteable back into a CQL session
@@ -1475,6 +1557,15 @@
         self.print_recreate_index(ksname, idxname, sys.stdout)
         print
 
+    def describe_materialized_view(self, ksname, viewname):
+        if ksname is None:
+            ksname = self.current_keyspace
+        if ksname is None:
+            raise NoKeyspaceError("No keyspace specified and no current keyspace")
+        print
+        self.print_recreate_materialized_view(ksname, viewname, sys.stdout)
+        print
+
     def describe_object(self, ks, name):
         print
         self.print_recreate_object(ks, name, sys.stdout)
@@ -1515,7 +1606,7 @@
         functions = filter(lambda f: f.name == functionname, ksmeta.functions.values())
         if len(functions) == 0:
             raise FunctionNotFound("User defined function %r not found" % functionname)
-        print "\n\n".join(func.as_cql_query(formatted=True) for func in functions)
+        print "\n\n".join(func.export_as_string() for func in functions)
         print
 
     def describe_aggregates(self, ksname):
@@ -1540,7 +1631,7 @@
         aggregates = filter(lambda f: f.name == aggregatename, ksmeta.aggregates.values())
         if len(aggregates) == 0:
             raise FunctionNotFound("User defined aggregate %r not found" % aggregatename)
-        print "\n\n".join(aggr.as_cql_query(formatted=True) for aggr in aggregates)
+        print "\n\n".join(aggr.export_as_string() for aggr in aggregates)
         print
 
     def describe_usertypes(self, ksname):
@@ -1566,7 +1657,7 @@
             usertype = ksmeta.user_types[typename]
         except KeyError:
             raise UserTypeNotFound("User type %r not found" % typename)
-        print usertype.as_cql_query(formatted=True)
+        print usertype.export_as_string()
         print
 
     def _columnize_unicode(self, name_list, quote=False):
@@ -1640,6 +1731,12 @@
           In some cases, there may be index metadata which is not representable
           and which will not be shown.
 
+        DESCRIBE MATERIALIZED VIEW <viewname>
+
+          Output the CQL command that could be used to recreate the given materialized view.
+          In some cases, there may be materialized view metadata which is not representable
+          and which will not be shown.
+
         DESCRIBE CLUSTER
 
           Output information about the connected Cassandra cluster, such as the
@@ -1683,7 +1780,8 @@
         DESCRIBE <objname>
 
           Output CQL commands that could be used to recreate the entire object schema,
-          where object can be either a keyspace or a table or an index (in this order).
+          where object can be either a keyspace or a table or an index or a materialized
+          view (in this order).
   """
         what = parsed.matched[1][1].lower()
         if what == 'functions':
@@ -1716,6 +1814,10 @@
             ks = self.cql_unprotect_name(parsed.get_binding('ksname', None))
             idx = self.cql_unprotect_name(parsed.get_binding('idxname', None))
             self.describe_index(ks, idx)
+        elif what == 'materialized' and parsed.matched[2][1].lower() == 'view':
+            ks = self.cql_unprotect_name(parsed.get_binding('ksname', None))
+            mv = self.cql_unprotect_name(parsed.get_binding('mvname'))
+            self.describe_materialized_view(ks, mv)
         elif what in ('columnfamilies', 'tables'):
             self.describe_columnfamilies(self.current_keyspace)
         elif what == 'types':
@@ -1735,6 +1837,8 @@
             name = self.cql_unprotect_name(parsed.get_binding('cfname'))
             if not name:
                 name = self.cql_unprotect_name(parsed.get_binding('idxname', None))
+            if not name:
+                name = self.cql_unprotect_name(parsed.get_binding('mvname', None))
             self.describe_object(ks, name)
     do_desc = do_describe
 
@@ -1792,7 +1896,7 @@
           SKIPROWS=0              - the number of rows to skip
           SKIPCOLS=''             - a comma separated list of column names to skip
           MAXPARSEERRORS=-1       - the maximum global number of parsing errors, -1 means no maximum
-          MAXINSERTERRORS=-1      - the maximum global number of insert errors, -1 means no maximum
+          MAXINSERTERRORS=1000    - the maximum global number of insert errors, -1 means no maximum
           ERRFILE=''              - a file where to store all rows that could not be imported, by default this is
                                     import_ks_table.err where <ks> is your keyspace and <table> is your table name.
           PREPAREDSTATEMENTS=True - whether to use prepared statements when importing, by default True. Set this to
@@ -1840,9 +1944,9 @@
 
         direction = parsed.get_binding('dir').upper()
         if direction == 'FROM':
-            task = ImportTask(self, ks, table, columns, fname, opts, DEFAULT_PROTOCOL_VERSION, CONFIG_FILE)
+            task = ImportTask(self, ks, table, columns, fname, opts, self.conn.protocol_version, CONFIG_FILE)
         elif direction == 'TO':
-            task = ExportTask(self, ks, table, columns, fname, opts, DEFAULT_PROTOCOL_VERSION, CONFIG_FILE)
+            task = ExportTask(self, ks, table, columns, fname, opts, self.conn.protocol_version, CONFIG_FILE)
         else:
             raise SyntaxError("Unknown direction %s" % direction)
 
@@ -2398,10 +2502,12 @@
     optvalues.debug = False
     optvalues.file = None
     optvalues.ssl = False
+    optvalues.no_compact = False
     optvalues.encoding = option_with_default(configs.get, 'ui', 'encoding', UTF8)
 
     optvalues.tty = option_with_default(configs.getboolean, 'ui', 'tty', sys.stdin.isatty())
-    optvalues.cqlversion = option_with_default(configs.get, 'cql', 'version', DEFAULT_CQLVER)
+    optvalues.cqlversion = option_with_default(configs.get, 'cql', 'version', None)
+    optvalues.protocol_version = option_with_default(configs.getint, 'protocol', 'version', None)
     optvalues.connect_timeout = option_with_default(configs.getint, 'connection', 'timeout', DEFAULT_CONNECT_TIMEOUT_SECONDS)
     optvalues.request_timeout = option_with_default(configs.getint, 'connection', 'request_timeout', DEFAULT_REQUEST_TIMEOUT_SECONDS)
     optvalues.execute = None
@@ -2445,11 +2551,11 @@
         else:
             options.color = should_use_color()
 
-    options.cqlversion, cqlvertup = full_cql_version(options.cqlversion)
-    if cqlvertup[0] < 3:
-        parser.error('%r is not a supported CQL version.' % options.cqlversion)
-    else:
-        options.cqlmodule = cql3handling
+    if options.cqlversion is not None:
+        options.cqlversion, cqlvertup = full_cql_version(options.cqlversion)
+        if cqlvertup[0] < 3:
+            parser.error('%r is not a supported CQL version.' % options.cqlversion)
+    options.cqlmodule = cql3handling
 
     try:
         port = int(port)
@@ -2553,8 +2659,10 @@
                       tty=options.tty,
                       completekey=options.completekey,
                       browser=options.browser,
+                      protocol_version=options.protocol_version,
                       cqlver=options.cqlversion,
                       keyspace=options.keyspace,
+                      no_compact=options.no_compact,
                       display_timestamp_format=options.time_format,
                       display_nanotime_format=options.nanotime_format,
                       display_date_format=options.date_format,
@@ -2577,8 +2685,8 @@
 
     shell.cmdloop()
     save_history()
-    batch_mode = options.file or options.execute
-    if batch_mode and shell.statement_error:
+
+    if shell.batch_mode and shell.statement_error:
         sys.exit(2)
 
 

diff --git a/bin/debug-cql b/bin/debug-cql
index ae9bfe4..00d4093 100755
--- a/bin/debug-cql
+++ b/bin/debug-cql

@@ -50,7 +50,7 @@
 
 # Special-case path variables.
 case "`uname`" in
-    CYGWIN*) 
+    CYGWIN*|MINGW*) 
         CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
         CASSANDRA_CONF="`cygpath -p -w "$CASSANDRA_CONF"`"
     ;;

diff --git a/bin/nodetool b/bin/nodetool
index b6a6fbf..b1cfba5 100755
--- a/bin/nodetool
+++ b/bin/nodetool

@@ -58,7 +58,9 @@
 # Run cassandra-env.sh to pick up JMX_PORT
 if [ -f "$CASSANDRA_CONF/cassandra-env.sh" ]; then
     JVM_OPTS_SAVE=$JVM_OPTS
+    MAX_HEAP_SIZE_SAVE=$MAX_HEAP_SIZE
     . "$CASSANDRA_CONF/cassandra-env.sh"
+    MAX_HEAP_SIZE=$MAX_HEAP_SIZE_SAVE
     JVM_OPTS=$JVM_OPTS_SAVE
 fi
 

diff --git a/bin/nodetool.bat b/bin/nodetool.bat
index 416aca5..1d3c4e5 100644
--- a/bin/nodetool.bat
+++ b/bin/nodetool.bat

@@ -26,7 +26,6 @@
 set CASSANDRA_PARAMS=%CASSANDRA_PARAMS% -Dcassandra.logdir="%CASSANDRA_HOME%\logs"

 set CASSANDRA_PARAMS=%CASSANDRA_PARAMS% -Dcassandra.storagedir="%CASSANDRA_HOME%\data"

 

-echo Starting NodeTool

 "%JAVA_HOME%\bin\java" -cp %CASSANDRA_CLASSPATH% %CASSANDRA_PARAMS% -Dlogback.configurationFile=logback-tools.xml org.apache.cassandra.tools.NodeTool %*

 goto finally

 


diff --git a/bin/sstablekeys b/bin/sstablekeys
deleted file mode 100755
index c0967ef..0000000
--- a/bin/sstablekeys
+++ /dev/null

@@ -1,66 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if [ "x$CASSANDRA_INCLUDE" = "x" ]; then
-    # Locations (in order) to use when searching for an include file.
-    for include in "`dirname "$0"`/cassandra.in.sh" \
-                   "$HOME/.cassandra.in.sh" \
-                   /usr/share/cassandra/cassandra.in.sh \
-                   /usr/local/share/cassandra/cassandra.in.sh \
-                   /opt/cassandra/cassandra.in.sh; do
-        if [ -r "$include" ]; then
-            . "$include"
-            break
-        fi
-    done
-elif [ -r "$CASSANDRA_INCLUDE" ]; then
-    . "$CASSANDRA_INCLUDE"
-fi
-
-# Use JAVA_HOME if set, otherwise look for java in PATH
-if [ -x "$JAVA_HOME/bin/java" ]; then
-    JAVA="$JAVA_HOME/bin/java"
-else
-    JAVA="`which java`"
-fi
-
-if [ "x$JAVA" = "x" ]; then
-    echo "Java executable not found (hint: set JAVA_HOME)" >&2
-    exit 1
-fi
-
-if [ -z "$CLASSPATH" ]; then
-    echo "You must set the CLASSPATH var" >&2
-    exit 1
-fi
-
-if [ "x$MAX_HEAP_SIZE" = "x" ]; then
-    MAX_HEAP_SIZE="256M"
-fi
-
-if [ $# -eq "0" ]; then
-    echo "Usage: `basename "$0"` <sstable>"
-    exit 2
-fi
-
-"$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" $JVM_OPTS -Xmx$MAX_HEAP_SIZE \
-        -Dcassandra.storagedir="$cassandra_storagedir" \
-        -Dlogback.configurationFile=logback-tools.xml \
-        org.apache.cassandra.tools.SSTableExport "$@" -e
-
-# vi:ai sw=4 ts=4 tw=0 et

diff --git a/bin/sstablekeys.bat b/bin/sstablekeys.bat
deleted file mode 100644
index 0d0cf95..0000000
--- a/bin/sstablekeys.bat
+++ /dev/null

@@ -1,41 +0,0 @@
-@REM

-@REM  Licensed to the Apache Software Foundation (ASF) under one or more

-@REM  contributor license agreements.  See the NOTICE file distributed with

-@REM  this work for additional information regarding copyright ownership.

-@REM  The ASF licenses this file to You under the Apache License, Version 2.0

-@REM  (the "License"); you may not use this file except in compliance with

-@REM  the License.  You may obtain a copy of the License at

-@REM

-@REM      http://www.apache.org/licenses/LICENSE-2.0

-@REM

-@REM  Unless required by applicable law or agreed to in writing, software

-@REM  distributed under the License is distributed on an "AS IS" BASIS,

-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-@REM  See the License for the specific language governing permissions and

-@REM  limitations under the License.

-

-@echo off

-if "%OS%" == "Windows_NT" setlocal

-

-pushd "%~dp0"

-call cassandra.in.bat

-

-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableExport

-if NOT DEFINED JAVA_HOME goto :err

-

-REM ***** JAVA options *****

-set JAVA_OPTS=^

- -Dlogback.configurationFile=logback-tools.xml

-

-set TOOLS_PARAMS=

-

-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %1 -e

-goto finally

-

-:err

-echo JAVA_HOME environment variable must be set!

-pause

-

-:finally

-

-ENDLOCAL


diff --git a/bin/sstableutil b/bin/sstableutil
new file mode 100755
index 0000000..7457834
--- /dev/null
+++ b/bin/sstableutil

@@ -0,0 +1,61 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ "x$CASSANDRA_INCLUDE" = "x" ]; then
+    # Locations (in order) to use when searching for an include file.
+    for include in "`dirname "$0"`/cassandra.in.sh" \
+                   "$HOME/.cassandra.in.sh" \
+                   /usr/share/cassandra/cassandra.in.sh \
+                   /usr/local/share/cassandra/cassandra.in.sh \
+                   /opt/cassandra/cassandra.in.sh; do
+        if [ -r "$include" ]; then
+            . "$include"
+            break
+        fi
+    done
+elif [ -r "$CASSANDRA_INCLUDE" ]; then
+    . "$CASSANDRA_INCLUDE"
+fi
+
+# Use JAVA_HOME if set, otherwise look for java in PATH
+if [ -x "$JAVA_HOME/bin/java" ]; then
+    JAVA="$JAVA_HOME/bin/java"
+else
+    JAVA="`which java`"
+fi
+
+if [ "x$JAVA" = "x" ]; then
+    echo "Java executable not found (hint: set JAVA_HOME)" >&2
+    exit 1
+fi
+
+if [ -z "$CLASSPATH" ]; then
+    echo "You must set the CLASSPATH var" >&2
+    exit 1
+fi
+
+if [ "x$MAX_HEAP_SIZE" = "x" ]; then
+    MAX_HEAP_SIZE="256M"
+fi
+
+"$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" $JVM_OPTS -Xmx$MAX_HEAP_SIZE \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
+        org.apache.cassandra.tools.StandaloneSSTableUtil "$@"
+
+# vi:ai sw=4 ts=4 tw=0 et

diff --git a/bin/sstableutil.bat b/bin/sstableutil.bat
new file mode 100644
index 0000000..bc3eb8a
--- /dev/null
+++ b/bin/sstableutil.bat

@@ -0,0 +1,41 @@
+@REM

+@REM  Licensed to the Apache Software Foundation (ASF) under one or more

+@REM  contributor license agreements.  See the NOTICE file distributed with

+@REM  this work for additional information regarding copyright ownership.

+@REM  The ASF licenses this file to You under the Apache License, Version 2.0

+@REM  (the "License"); you may not use this file except in compliance with

+@REM  the License.  You may obtain a copy of the License at

+@REM

+@REM      http://www.apache.org/licenses/LICENSE-2.0

+@REM

+@REM  Unless required by applicable law or agreed to in writing, software

+@REM  distributed under the License is distributed on an "AS IS" BASIS,

+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+@REM  See the License for the specific language governing permissions and

+@REM  limitations under the License.

+

+@echo off

+if "%OS%" == "Windows_NT" setlocal

+

+pushd "%~dp0"

+call cassandra.in.bat

+

+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.StandaloneSSTableUtil

+if NOT DEFINED JAVA_HOME goto :err

+

+REM ***** JAVA options *****

+set JAVA_OPTS=^

+ -Dlogback.configurationFile=logback-tools.xml

+

+set TOOLS_PARAMS=

+

+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %*

+goto finally

+

+:err

+echo JAVA_HOME environment variable must be set!

+pause

+

+:finally

+

+ENDLOCAL


diff --git a/bin/sstableverify b/bin/sstableverify
index 892750b..6b296cf 100755
--- a/bin/sstableverify
+++ b/bin/sstableverify

@@ -17,11 +17,12 @@
 # limitations under the License.
 
 if [ "x$CASSANDRA_INCLUDE" = "x" ]; then
-    for include in /usr/share/cassandra/cassandra.in.sh \
-                   /usr/local/share/cassandra/cassandra.in.sh \
-                   /opt/cassandra/cassandra.in.sh \
+    # Locations (in order) to use when searching for an include file.
+    for include in "`dirname "$0"`/cassandra.in.sh" \
                    "$HOME/.cassandra.in.sh" \
-                   "`dirname "$0"`/cassandra.in.sh"; do
+                   /usr/share/cassandra/cassandra.in.sh \
+                   /usr/local/share/cassandra/cassandra.in.sh \
+                   /opt/cassandra/cassandra.in.sh; do
         if [ -r "$include" ]; then
             . "$include"
             break

diff --git a/build.xml b/build.xml
index 8bed062..d50007e 100644
--- a/build.xml
+++ b/build.xml

@@ -25,7 +25,7 @@
     <property name="debuglevel" value="source,lines,vars"/>
 
     <!-- default version and SCM information -->
-    <property name="base.version" value="2.2.17"/>
+    <property name="base.version" value="3.0.21"/>
     <property name="scm.connection" value="scm:https://gitbox.apache.org/repos/asf/cassandra.git"/>
     <property name="scm.developerConnection" value="scm:https://gitbox.apache.org/repos/asf/cassandra.git"/>
     <property name="scm.url" value="https://gitbox.apache.org/repos/asf?p=cassandra.git;a=tree"/>
@@ -60,22 +60,20 @@
     <property name="test.classlistprefix" value="unit"/>
     <property name="benchmark.name" value=""/>
     <property name="test.methods" value=""/>
-    <property name="test.runners" value="1"/>
     <property name="test.unit.src" value="${test.dir}/unit"/>
     <property name="test.long.src" value="${test.dir}/long"/>
     <property name="test.burn.src" value="${test.dir}/burn"/>
     <property name="test.microbench.src" value="${test.dir}/microbench"/>
-    <property name="test.pig.src" value="${test.dir}/pig"/>
     <property name="test.distributed.src" value="${test.dir}/distributed"/>
-    <property name="test.distributed.listfile" value = "ant-jvm-dtest-list"/>
+    <property name="test.distributed.listfile" value="ant-jvm-dtest-list"/>
+    <property name="test.distributed.upgrade.listfile" value="ant-jvm-dtest-upgrade-list"/>
+    <property name="test.distributed.upgrade.package" value="org.apache.cassandra.distributed.upgrade"/>
     <property name="dist.dir" value="${build.dir}/dist"/>
     <property name="tmp.dir" value="${java.io.tmpdir}"/>
-	
+
     <property name="source.version" value="1.8"/>
-    <property name="source.test.version" value="1.8"/>
     <property name="target.version" value="1.8"/>
-    <property name="target.test.version" value="1.8"/>
-	
+
     <condition property="version" value="${base.version}">
       <isset property="release"/>
     </condition>
@@ -83,7 +81,7 @@
     <property name="version.properties.dir"
               value="${build.src.resources}/org/apache/cassandra/config/" />
     <property name="final.name" value="${ant.project.name}-${version}"/>
- 
+
     <!-- details of what version of Maven ANT Tasks to fetch -->
     <property name="maven-ant-tasks.version" value="2.1.3" />
     <property name="maven-ant-tasks.local" value="${user.home}/.m2/repository/org/apache/maven/maven-ant-tasks"/>
@@ -108,13 +106,9 @@
     <!-- default for cql tests. Can be override by -Dcassandra.test.use_prepared=false -->
     <property name="cassandra.test.use_prepared" value="true" />
 
-    <!-- https://cobertura.sourceforge.net/ -->
-    <property name="cobertura.version" value="2.0.3"/>
-    <property name="cobertura.build.dir" value="${build.dir}/cobertura"/>
-    <property name="cobertura.report.dir" value="${cobertura.build.dir}/report"/>
-    <property name="cobertura.classes.dir" value="${cobertura.build.dir}/classes"/>
-    <property name="cobertura.datafile" value="${cobertura.build.dir}/cobertura.ser"/>
-    
+    <!-- skip flushing schema tables during tests -->
+    <property name="cassandra.test.flush_local_schema_changes" value="false" />
+
     <!-- https://www.eclemma.org/jacoco/ -->
     <property name="jacoco.export.dir" value="${build.dir}/jacoco/" />
     <property name="jacoco.partials.dir" value="${jacoco.export.dir}/partials" />
@@ -143,12 +137,25 @@
       <format property="YEAR" pattern="yyyy"/>
     </tstamp>
 
+    <!-- Check if all tests are being run or just one. If it's all tests don't spam the console with test output.
+         If it's an individual test print the output from the test under the assumption someone is debugging the test
+         and wants to know what is going on without having to context switch to the log file that is generated.
+         Debug level output still needs to be retrieved from the log file.  -->
+    <script language="javascript">
+        if (project.getProperty("cassandra.keepBriefBrief") == null)
+        {
+            if (project.getProperty("test.name").equals("*Test"))
+                project.setProperty("cassandra.keepBriefBrief", "true");
+            else
+                project.setProperty("cassandra.keepBriefBrief", "false");
+        }
+    </script>
+
     <!--
          Add all the dependencies.
     -->
     <path id="maven-ant-tasks.classpath" path="${build.dir}/maven-ant-tasks-${maven-ant-tasks.version}.jar" />
     <path id="cassandra.classpath">
-        <pathelement location="${cobertura.classes.dir}"/>
         <pathelement location="${build.classes.main}" />
         <pathelement location="${build.classes.thrift}" />
         <fileset dir="${build.lib}">
@@ -162,10 +169,6 @@
           <exclude name="**/ant-*.jar"/>
         </fileset>
     </path>
-	
-	<path id="cobertura.classpath">
-		<pathelement location="${cobertura.classes.dir}"/>
-	</path>
 
   <macrodef name="create-javadoc">
     <attribute name="destdir"/>
@@ -174,7 +177,7 @@
       <javadoc destdir="@{destdir}" author="true" version="true" use="true"
         windowtitle="${ant.project.name} API" classpathref="cassandra.classpath"
         bottom="Copyright &amp;copy; ${YEAR} The Apache Software Foundation"
-        useexternalfile="yes"
+        useexternalfile="yes" encoding="UTF-8"
         maxmemory="256m">
         <filesets/>
       </javadoc>
@@ -200,7 +203,6 @@
     <target name="clean" description="Remove all locally created artifacts">
         <delete dir="${build.test.dir}" />
         <delete dir="${build.classes}" />
-        <delete dir="${cobertura.classes.dir}" />
         <delete dir="${build.src.gen-java}" />
         <delete dir="${version.properties.dir}" />
         <delete dir="${jacoco.export.dir}" />
@@ -220,7 +222,7 @@
                 srcfile="${build.src.java}/org/apache/cassandra/cql3/Cql.g"
                 targetfile="${build.src.gen-java}/org/apache/cassandra/cql3/Cql.tokens"/>
     </target>
- 
+
     <target name="gen-cql3-grammar" depends="check-gen-cql3-grammar" unless="cql3current">
       <echo>Building Grammar ${build.src.java}/org/apache/cassandra/cql3/Cql.g  ...</echo>
       <java classname="org.antlr.Tool"
@@ -364,7 +366,7 @@
           <dependency groupId="org.xerial.snappy" artifactId="snappy-java" version="1.1.1.7"/>
           <dependency groupId="net.jpountz.lz4" artifactId="lz4" version="1.3.0"/>
           <dependency groupId="com.ning" artifactId="compress-lzf" version="0.8.4"/>
-          <dependency groupId="com.google.guava" artifactId="guava" version="16.0"/>
+          <dependency groupId="com.google.guava" artifactId="guava" version="18.0"/>
           <dependency groupId="commons-cli" artifactId="commons-cli" version="1.1"/>
           <dependency groupId="commons-codec" artifactId="commons-codec" version="1.2"/>
           <dependency groupId="org.apache.commons" artifactId="commons-lang3" version="3.1"/>
@@ -386,7 +388,7 @@
           <dependency groupId="com.googlecode.json-simple" artifactId="json-simple" version="1.1"/>
           <dependency groupId="com.boundary" artifactId="high-scale-lib" version="1.0.6"/>
           <dependency groupId="com.github.jbellis" artifactId="jamm" version="0.3.0"/>
-	  	  <dependency groupId="com.github.tjake" artifactId="crc32ex" version="0.1.1"/>
+
           <dependency groupId="com.thinkaurelius.thrift" artifactId="thrift-server" version="0.3.7">
             <exclusion groupId="org.slf4j" artifactId="slf4j-log4j12"/>
           </dependency>
@@ -409,26 +411,15 @@
           <dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster" version="1.0.3">
 		    <exclusion groupId="asm" artifactId="asm"/> <!-- this is the outdated version 3.1 -->
           </dependency>
-          <dependency groupId="org.apache.pig" artifactId="pig" version="0.12.1">
-          	<exclusion groupId="org.mortbay.jetty" artifactId="jetty"/>
-            <exclusion groupId="hsqldb" artifactId="hsqldb"/> <!-- outdated version -->
-            <exclusion groupId="antlr" artifactId="antlr"/> <!-- outdated version -->
-          </dependency>
-          <dependency groupId="net.java.dev.jna" artifactId="jna" version="4.0.0"/>
+          <dependency groupId="net.java.dev.jna" artifactId="jna" version="4.2.2"/>
 
-          <dependency groupId="net.sourceforge.cobertura" artifactId="cobertura" version="${cobertura.version}">
-            <exclusion groupId="xerces" artifactId="xercesImpl"/>
-          	<exclusion groupId="org.mortbay.jetty" artifactId="jetty"/> <!-- older version, also via hadoop-core + pig -->
-          	<exclusion groupId="org.mortbay.jetty" artifactId="jetty-util"/> <!-- older version, also via hadoop-core + pig -->
-            <exclusion groupId="org.apache.ant" artifactId="ant"/> <!-- older version 1.8.3 -->
-          </dependency>
           <dependency groupId="org.jacoco" artifactId="org.jacoco.agent" version="${jacoco.version}"/>
           <dependency groupId="org.jacoco" artifactId="org.jacoco.ant" version="${jacoco.version}"/>
 
-          <dependency groupId="org.jboss.byteman" artifactId="byteman-install" version="${byteman.version}" scope="test"/>
-          <dependency groupId="org.jboss.byteman" artifactId="byteman" version="${byteman.version}" scope="test"/>
-          <dependency groupId="org.jboss.byteman" artifactId="byteman-submit" version="${byteman.version}" scope="test"/>
-          <dependency groupId="org.jboss.byteman" artifactId="byteman-bmunit" version="${byteman.version}" scope="test"/>
+          <dependency groupId="org.jboss.byteman" artifactId="byteman-install" version="${byteman.version}"/>
+          <dependency groupId="org.jboss.byteman" artifactId="byteman" version="${byteman.version}"/>
+          <dependency groupId="org.jboss.byteman" artifactId="byteman-submit" version="${byteman.version}"/>
+          <dependency groupId="org.jboss.byteman" artifactId="byteman-bmunit" version="${byteman.version}"/>
 
           <dependency groupId="net.bytebuddy" artifactId="byte-buddy" version="${bytebuddy.version}" />
           <dependency groupId="net.bytebuddy" artifactId="byte-buddy-agent" version="${bytebuddy.version}" />
@@ -446,18 +437,17 @@
           <dependency groupId="io.netty" artifactId="netty-all" version="4.0.44.Final" />
           <dependency groupId="com.google.code.findbugs" artifactId="jsr305" version="2.0.2" />
           <dependency groupId="com.clearspring.analytics" artifactId="stream" version="2.5.2" />
-          <!-- TODO CASSANDRA-9543
-          <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" version="2.1.5" classifier="shaded" />
-          -->
+          <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" version="3.0.1" classifier="shaded" />
           <dependency groupId="org.eclipse.jdt.core.compiler" artifactId="ecj" version="4.4.2" />
-          <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core" version="0.3.4" />
-          <dependency groupId="net.sf.supercsv" artifactId="super-csv" version="2.1.0" />
-	      <dependency groupId="net.ju-n.compile-command-annotations" artifactId="compile-command-annotations" version="1.2.0" />
+          <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core" version="0.4.3" />
+          <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core-j8" version="0.4.3" />
+          <dependency groupId="net.ju-n.compile-command-annotations" artifactId="compile-command-annotations" version="1.2.0" />
           <dependency groupId="org.fusesource" artifactId="sigar" version="1.6.4">
           	<exclusion groupId="log4j" artifactId="log4j"/>
           </dependency>
           <dependency groupId="joda-time" artifactId="joda-time" version="2.4" />
-        	
+          <dependency groupId="org.ow2.asm" artifactId="asm" version="5.0.4" />
+
         </dependencyManagement>
         <developer id="adelapena" name="Andres de la Peña"/>
         <developer id="alakshman" name="Avinash Lakshman"/>
@@ -486,6 +476,7 @@
         <developer id="jmckenzie" name="Josh McKenzie"/>
         <developer id="johan" name="Johan Oskarsson"/>
         <developer id="junrao" name="Jun Rao"/>
+        <developer id="jzhuang" name="Jay Zhuang"/>
         <developer id="kohlisankalp" name="Sankalp Kohli"/>
         <developer id="marcuse" name="Marcus Eriksson"/>
         <developer id="mck" name="Michael Semb Wever"/>
@@ -495,6 +486,7 @@
         <developer id="pmalik" name="Prashant Malik"/>
         <developer id="rstupp" name="Robert Stupp"/>
         <developer id="scode" name="Peter Schuller"/>
+        <developer id="beobal" name="Sam Tunnicliffe"/>
         <developer id="slebresne" name="Sylvain Lebresne"/>
         <developer id="stefania" name="Stefania Alborghetti"/>
         <developer id="tylerhobbs" name="Tyler Hobbs"/>
@@ -516,14 +508,12 @@
         <dependency groupId="org.apache.rat" artifactId="apache-rat"/>
         <dependency groupId="org.apache.hadoop" artifactId="hadoop-core"/>
       	<dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster"/>
-        <dependency groupId="org.apache.pig" artifactId="pig"/>
       	<dependency groupId="com.google.code.findbugs" artifactId="jsr305"/>
         <dependency groupId="org.antlr" artifactId="antlr"/>
-        <!-- TODO CASSANDRA-9543
         <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" classifier="shaded"/>
-        -->
         <dependency groupId="org.eclipse.jdt.core.compiler" artifactId="ecj"/>
-        <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core"/>
+        <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core" version="0.4.3" />
+        <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core-j8" version="0.4.3" />
         <dependency groupId="org.openjdk.jmh" artifactId="jmh-core"/>
         <dependency groupId="org.openjdk.jmh" artifactId="jmh-generator-annprocess"/>
         <dependency groupId="net.ju-n.compile-command-annotations" artifactId="compile-command-annotations"/>
@@ -538,18 +528,7 @@
                 version="${version}"/>
         <dependency groupId="junit" artifactId="junit"/>
         <dependency groupId="org.mockito" artifactId="mockito-core" />
-        <dependency groupId="org.apache.pig" artifactId="pig">
-          <exclusion groupId="xmlenc" artifactId="xmlenc"/>
-          <exclusion groupId="tomcat" artifactId="jasper-runtime"/>
-          <exclusion groupId="tomcat" artifactId="jasper-compiler"/>
-          <exclusion groupId="org.eclipse.jdt" artifactId="core"/>
-          <exclusion groupId="net.sf.kosmosfs" artifactId="kfs"/>
-          <exclusion groupId="hsqldb" artifactId="hsqldb"/>
-          <exclusion groupId="antlr" artifactId="antlr"/>
-        </dependency>
-        <!-- TODO CASSANDRA-9543
         <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" classifier="shaded"/>
-        -->
         <dependency groupId="org.eclipse.jdt.core.compiler" artifactId="ecj"/>
         <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core"/>
         <dependency groupId="org.openjdk.jmh" artifactId="jmh-core"/>
@@ -563,13 +542,12 @@
         <parent groupId="org.apache.cassandra"
                 artifactId="cassandra-parent"
                 version="${version}"/>
-        <dependency groupId="net.sourceforge.cobertura" artifactId="cobertura"/>
         <dependency groupId="org.jacoco" artifactId="org.jacoco.agent"/>
-        <dependency groupId="org.jacoco" artifactId="org.jacoco.ant"/>
-        <dependency groupId="org.jboss.byteman" artifactId="byteman-install" scope="test"/>
-        <dependency groupId="org.jboss.byteman" artifactId="byteman" scope="test"/>
-        <dependency groupId="org.jboss.byteman" artifactId="byteman-submit" scope="test"/>
-        <dependency groupId="org.jboss.byteman" artifactId="byteman-bmunit" scope="test"/>
+        <dependency groupId="org.jacoco" artifactId="org.jacoco.ant" />
+        <dependency groupId="org.jboss.byteman" artifactId="byteman-install"/>
+        <dependency groupId="org.jboss.byteman" artifactId="byteman"/>
+        <dependency groupId="org.jboss.byteman" artifactId="byteman-submit"/>
+        <dependency groupId="org.jboss.byteman" artifactId="byteman-bmunit"/>
       </artifact:pom>
 
       <artifact:pom id="test-deps-pom"
@@ -577,7 +555,6 @@
         <parent groupId="org.apache.cassandra"
                 artifactId="cassandra-parent"
                 version="${version}"/>
-        <!-- do NOT remove this, it breaks pig-test -->
         <dependency groupId="joda-time" artifactId="joda-time"/>
       </artifact:pom>
 
@@ -611,12 +588,12 @@
         <dependency groupId="com.boundary" artifactId="high-scale-lib"/>
         <dependency groupId="org.yaml" artifactId="snakeyaml"/>
         <dependency groupId="org.mindrot" artifactId="jbcrypt"/>
+        <dependency groupId="io.airlift" artifactId="airline"/>
         <dependency groupId="io.dropwizard.metrics" artifactId="metrics-core"/>
         <dependency groupId="io.dropwizard.metrics" artifactId="metrics-jvm"/>
         <dependency groupId="com.addthis.metrics" artifactId="reporter-config3"/>
         <dependency groupId="com.thinkaurelius.thrift" artifactId="thrift-server"/>
         <dependency groupId="com.clearspring.analytics" artifactId="stream"/>
-        <dependency groupId="net.sf.supercsv" artifactId="super-csv"/>
 
         <dependency groupId="ch.qos.logback" artifactId="logback-core"/>
         <dependency groupId="ch.qos.logback" artifactId="logback-classic"/>
@@ -627,25 +604,23 @@
         <!-- don't need hadoop classes to run, but if you use the hadoop stuff -->
         <dependency groupId="org.apache.hadoop" artifactId="hadoop-core" optional="true"/>
         <dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster" optional="true"/>
-        <dependency groupId="org.apache.pig" artifactId="pig" optional="true"/>
-        <!-- TODO CASSANDRA-9543
+
+        <!-- don't need the Java Driver to run, but if you use the hadoop stuff or UDFs -->
         <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" classifier="shaded" optional="true"/>
-        -->
 
         <!-- don't need jna to run, but nice to have -->
         <dependency groupId="net.java.dev.jna" artifactId="jna"/>
-        
+
         <!-- don't need jamm unless running a server in which case it needs to be a -javagent to be used anyway -->
         <dependency groupId="com.github.jbellis" artifactId="jamm"/>
-		<dependency groupId="com.github.tjake" artifactId="crc32ex"/>
+
         <dependency groupId="io.netty" artifactId="netty-all"/>
-      	
-      	<dependency groupId="joda-time" artifactId="joda-time"/>
-
+        <dependency groupId="joda-time" artifactId="joda-time"/>
         <dependency groupId="org.fusesource" artifactId="sigar"/>
-
         <dependency groupId="org.eclipse.jdt.core.compiler" artifactId="ecj"/>
         <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core"/>
+        <dependency groupId="org.caffinitas.ohc" artifactId="ohc-core-j8"/>
+        <dependency groupId="org.ow2.asm" artifactId="asm"/>
       </artifact:pom>
       <artifact:pom id="thrift-pom"
                     artifactId="cassandra-thrift"
@@ -671,7 +646,6 @@
         <scm connection="${scm.connection}" developerConnection="${scm.developerConnection}" url="${scm.url}"/>
   <dependency groupId="com.google.guava" artifactId="guava"/>
       </artifact:pom>
-      
     </target>
 
     <target name="maven-ant-tasks-retrieve-build" depends="maven-declare-dependencies" unless="without.maven">
@@ -701,8 +675,7 @@
       </copy>
       <!-- code coverage tools -->
       <artifact:dependencies pomRefId="coverage-deps-pom"
-                             filesetId="coverage-dependency-jars"
-                             pathId="cobertura.classpath">
+                             filesetId="coverage-dependency-jars">
           <remoteRepository refid="central"/>
       </artifact:dependencies>
       <copy todir="${build.dir.lib}/jars">
@@ -721,8 +694,8 @@
     <target name="maven-ant-tasks-retrieve-test" depends="maven-ant-tasks-init">
       <artifact:dependencies pomRefId="test-deps-pom"
                              filesetId="test-dependency-jars"
-                             sourcesFilesetId="test-dependency-sources" 
-                             cacheDependencyRefs="true" 
+                             sourcesFilesetId="test-dependency-sources"
+                             cacheDependencyRefs="true"
                              dependencyRefsBuildFile="${build.dir}/test-dependencies.xml">
         <remoteRepository refid="apache"/>
         <remoteRepository refid="central"/>
@@ -737,25 +710,6 @@
       </copy>
     </target>
 
-    <target name="maven-ant-tasks-retrieve-pig-test" depends="maven-ant-tasks-init">
-      <artifact:dependencies pomRefId="test-deps-pom"
-                             filesetId="test-dependency-jars"
-                             sourcesFilesetId="test-dependency-sources"
-                             cacheDependencyRefs="true"
-                             dependencyRefsBuildFile="${build.dir}/test-dependencies.xml">
-        <remoteRepository refid="apache"/>
-        <remoteRepository refid="central"/>
-      </artifact:dependencies>
-      <copy todir="${build.dir.lib}/jars">
-        <fileset refid="test-dependency-jars"/>
-        <mapper type="flatten"/>
-      </copy>
-      <copy todir="${build.dir.lib}/sources">
-        <fileset refid="test-dependency-sources"/>
-        <mapper type="flatten"/>
-      </copy>
-    </target>
-
     <!--
        Generate thrift code.  We have targets to build java because
        Cassandra depends on it, and python because that is what the system
@@ -817,7 +771,7 @@
             description="Run in test mode.  Not for production use!">
       <java classname="org.apache.cassandra.service.CassandraDaemon" fork="true">
         <classpath>
-          <path refid="cassandra.classpath"/>  
+          <path refid="cassandra.classpath"/>
           <pathelement location="${test.conf}"/>
         </classpath>
         <jvmarg value="-Dstorage-config=${test.conf}"/>
@@ -838,22 +792,20 @@
         <echo message="${ant.project.name}: ${ant.file}"/>
         <!-- Order matters! -->
         <javac fork="true"
-               debug="true" debuglevel="${debuglevel}"
-               encoding="utf-8"
+               debug="true" debuglevel="${debuglevel}" encoding="utf-8"
                destdir="${build.classes.thrift}" includeantruntime="false" source="${source.version}" target="${target.version}"
                memorymaximumsize="512M">
             <src path="${interface.thrift.dir}/gen-java"/>
             <classpath refid="cassandra.classpath"/>
         </javac>
         <javac fork="true"
-               debug="true" debuglevel="${debuglevel}"
-               encoding="utf-8"
+               debug="true" debuglevel="${debuglevel}" encoding="utf-8"
                destdir="${build.classes.main}" includeantruntime="false" source="${source.version}" target="${target.version}"
                memorymaximumsize="512M">
             <src path="${build.src.java}"/>
             <src path="${build.src.gen-java}"/>
-        	<compilerarg value="-XDignore.symbol.file"/>
-                <compilerarg value="-Xbootclasspath/p:${build.src.jdkoverride}"/>
+            <compilerarg value="-XDignore.symbol.file"/>
+            <compilerarg value="-Xbootclasspath/p:${build.src.jdkoverride}"/>
             <classpath refid="cassandra.classpath"/>
         </javac>
         <antcall target="createVersionPropFile"/>
@@ -898,7 +850,7 @@
 	<target name="write-poms" unless="without.maven">
 	    <antcall target="_write-poms" />
 	</target>
-	
+
     <!--
         The jar target makes cassandra.jar output.
     -->
@@ -1032,7 +984,7 @@
       </create-javadoc>
       <jar jarfile="${build.dir}/${ant.project.name}-clientutil-${version}-javadoc.jar"
            basedir="${javadoc.jars.dir}/clientutil"/>
-      <!-- javadoc task always rebuilds so might as well remove the generated docs to prevent 
+      <!-- javadoc task always rebuilds so might as well remove the generated docs to prevent
            being pulled into the distribution by accident -->
       <delete quiet="true" dir="${javadoc.jars.dir}"/>
     </target>
@@ -1123,7 +1075,6 @@
             <include name="*.jar" />
         </fileset>
       </copy>
-
       <tar compression="gzip" longfile="gnu"
         destfile="${build.dir}/${final.name}-bin.tar.gz">
 
@@ -1218,19 +1169,18 @@
     <javac
      debug="true"
      debuglevel="${debuglevel}"
-     encoding="utf-8"
      destdir="${test.classes}"
      includeantruntime="true"
-     source="${source.test.version}" 
-     target="${target.test.version}">
+     source="${source.version}"
+     target="${target.version}"
+     encoding="utf-8">
      <classpath>
         <path refid="cassandra.classpath"/>
      </classpath>
-	 <compilerarg value="-XDignore.symbol.file"/>
+     <compilerarg value="-XDignore.symbol.file"/>
      <src path="${test.unit.src}"/>
      <src path="${test.long.src}"/>
      <src path="${test.burn.src}"/>
-     <src path="${test.pig.src}"/>
      <src path="${test.microbench.src}"/>
      <src path="${test.distributed.src}"/>
     </javac>
@@ -1256,20 +1206,21 @@
     <attribute name="filelist" default="" />
     <attribute name="poffset" default="0"/>
     <attribute name="testtag" default=""/>
-    
     <attribute name="usejacoco" default="no"/>
     <attribute name="showoutput" default="false"/>
+
     <sequential>
       <condition property="additionalagent"
                  value="-javaagent:${build.dir.lib}/jars/jacocoagent.jar=destfile=${jacoco.partialexecfile}"
                  else="">
         <istrue value="${usejacoco}"/>
       </condition>
+      <!-- use https://github.com/krummas/jstackjunit to get thread dumps when unit tests time out -->
+      <taskdef name="junit-timeout" classname="org.krummas.junit.JStackJUnitTask" classpath="lib/jstackjunit-0.0.1.jar"/>
       <mkdir dir="${build.test.dir}/cassandra"/>
       <mkdir dir="${build.test.dir}/output"/>
       <mkdir dir="${build.test.dir}/output/@{testtag}"/>
-      <junit fork="on" forkmode="@{forkmode}" failureproperty="testfailed" maxmemory="1024m" timeout="@{timeout}" showoutput="@{showoutput}">
-        <sysproperty key="net.sourceforge.cobertura.datafile" file="${cobertura.datafile}"/>
+      <junit-timeout fork="on" forkmode="@{forkmode}" failureproperty="testfailed" maxmemory="1024m" timeout="@{timeout}" showoutput="@{showoutput}">
         <formatter classname="org.apache.cassandra.CassandraXMLJUnitResultFormatter" extension=".xml" usefile="true"/>
         <formatter classname="org.apache.cassandra.CassandraBriefJUnitResultFormatter" usefile="false"/>
         <jvmarg value="-Dstorage-config=${test.conf}"/>
@@ -1289,16 +1240,18 @@
         <jvmarg value="-XX:SoftRefLRUPolicyMSPerMB=0" />
         <jvmarg value="-Dcassandra.memtable_row_overhead_computation_step=100"/>
         <jvmarg value="-Dcassandra.test.use_prepared=${cassandra.test.use_prepared}"/>
-	    <jvmarg value="-Dcassandra.test.offsetseed=@{poffset}"/>
+        <jvmarg value="-Dcassandra.test.offsetseed=@{poffset}"/>
         <jvmarg value="-Dcassandra.test.sstableformatdevelopment=true"/>
-        <jvmarg value="-Dcassandra.testtag=@{testtag}"/>
         <!-- The first time SecureRandom initializes can be slow if it blocks on /dev/random -->
         <jvmarg value="-Djava.security.egd=file:/dev/urandom" />
+        <jvmarg value="-Dcassandra.testtag=@{testtag}"/>
+        <jvmarg value="-Dcassandra.keepBriefBrief=${cassandra.keepBriefBrief}" />
+        <jvmarg value="-Dcassandra.strict.runtime.checks=true" />
 	<optjvmargs/>
         <classpath>
+          <pathelement path="${java.class.path}"/>
           <path refid="cassandra.classpath" />
           <pathelement location="${test.classes}"/>
-          <path refid="cobertura.classpath"/>
           <pathelement location="${test.conf}"/>
           <fileset dir="${test.lib}">
             <include name="**/*.jar" />
@@ -1308,7 +1261,7 @@
             <fileset dir="@{inputdir}" includes="@{filter}" excludes="@{exclude}"/>
             <filelist dir="@{inputdir}" files="@{filelist}"/>
         </batchtest>
-      </junit>
+      </junit-timeout>
       <delete quiet="true" failonerror="false" dir="${build.test.dir}/cassandra/commitlog:@{poffset}"/>
       <delete quiet="true" failonerror="false" dir="${build.test.dir}/cassandra/data:@{poffset}"/>
       <delete quiet="true" failonerror="false" dir="${build.test.dir}/cassandra/saved_caches:@{poffset}"/>
@@ -1359,20 +1312,17 @@
   </target>
 
   <target name="testold" depends="build-test" description="Execute unit tests">
-    <testmacro inputdir="${test.unit.src}" exclude="**/pig/*.java" timeout="${test.timeout}">
+    <testmacro inputdir="${test.unit.src}" timeout="${test.timeout}">
       <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
       <jvmarg value="-Dinvalid-legacy-sstable-root=${test.data}/invalid-legacy-sstables"/>
-      <jvmarg value="-Dcorrupt-sstable-root=${test.data}/corrupt-sstables"/>
       <jvmarg value="-Dmigration-sstable-root=${test.data}/migration-sstables"/>
       <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
       <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
       <jvmarg value="-Dcassandra.skip_sync=true" />
     </testmacro>
-    <fileset dir="${test.unit.src}">
-        <exclude name="**/pig/*.java" />
-    </fileset>
+    <fileset dir="${test.unit.src}" />
   </target>
-  
+
   <!-- Will not generate a junit report or fail on error since it is called in parallel for test-compression
        That is taken care of by testparallel -->
   <macrodef name="testlist">
@@ -1382,7 +1332,6 @@
       <testmacrohelper inputdir="${test.dir}/${test.classlistprefix}" filelist="@{test.file.list}" poffset="@{testlist.offset}" exclude="**/*.java" timeout="${test.timeout}">
         <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
         <jvmarg value="-Dinvalid-legacy-sstable-root=${test.data}/invalid-legacy-sstables"/>
-        <jvmarg value="-Dcorrupt-sstable-root=${test.data}/corrupt-sstables"/>
         <jvmarg value="-Dmigration-sstable-root=${test.data}/migration-sstables"/>
         <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
         <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
@@ -1400,14 +1349,13 @@
     <sequential>
       <property name="compressed_yaml" value="${build.test.dir}/cassandra.compressed.yaml"/>
       <concat destfile="${compressed_yaml}">
-        <fileset file="${test.conf}/cassandra.yaml"/>
-        <fileset file="${test.conf}/commitlog_compression.yaml"/>
+          <fileset file="${test.conf}/cassandra.yaml"/>
+          <fileset file="${test.conf}/commitlog_compression.yaml"/>
       </concat>
       <testmacrohelper inputdir="${test.unit.src}" filelist="@{test.file.list}" poffset="@{testlist.offset}"
                        exclude="**/*.java" timeout="${test.timeout}" testtag="compression">
         <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
         <jvmarg value="-Dinvalid-legacy-sstable-root=${test.data}/invalid-legacy-sstables"/>
-        <jvmarg value="-Dcorrupt-sstable-root=${test.data}/corrupt-sstables"/>
         <jvmarg value="-Dmigration-sstable-root=${test.data}/migration-sstables"/>
         <jvmarg value="-Dcassandra.test.compression=true"/>
         <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
@@ -1420,7 +1368,7 @@
   </macrodef>
 
   <!--
-    Run named ant task with jacoco, such as "ant jacoco-run -Dtaskname=pig-test"
+    Run named ant task with jacoco, such as "ant jacoco-run -Dtaskname=test"
     the target run must enable the jacoco agent if usejacoco is 'yes' -->
   <target name="jacoco-run" description="run named task with jacoco instrumentation">
     <condition property="runtask" value="${taskname}" else="test">
@@ -1435,18 +1383,25 @@
     ant testsome -Dtest.name=org.apache.cassandra.service.StorageServiceServerTest -Dtest.methods=testRegularMode,testGetAllRangesEmpty
   -->
   <target name="testsome" depends="build-test" description="Execute specific unit tests" >
-    <testmacro inputdir="${test.unit.src}" exclude="**/pig/*.java" timeout="${test.timeout}">
+    <testmacro inputdir="${test.unit.src}" timeout="${test.timeout}">
       <test name="${test.name}" methods="${test.methods}" outfile="build/test/output/TEST-${test.name}-${test.methods}"/>
       <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
       <jvmarg value="-Dinvalid-legacy-sstable-root=${test.data}/invalid-legacy-sstables"/>
-      <jvmarg value="-Dcorrupt-sstable-root=${test.data}/corrupt-sstables"/>
       <jvmarg value="-Dmigration-sstable-root=${test.data}/migration-sstables"/>
       <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
       <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
       <jvmarg value="-Dcassandra.skip_sync=true" />
     </testmacro>
   </target>
-    
+
+  <!-- Use this with an FQDN for test class, and a csv list of methods like this:
+    ant burn-testsome -Dtest.name=org.apache.cassandra.utils.memory.LongBufferPoolTest -Dtest.methods=testAllocate
+  -->
+  <target name="burn-testsome" depends="build-test" description="Execute specific burn unit tests" >
+    <testmacro inputdir="${test.burn.src}" timeout="${test.burn.timeout}">
+      <test name="${test.name}" methods="${test.methods}"/>
+    </testmacro>
+  </target>
   <target name="test-compression" depends="build-test" description="Execute unit tests with sstable compression enabled">
     <property name="compressed_yaml" value="${build.test.dir}/cassandra.compressed.yaml"/>
     <concat destfile="${compressed_yaml}">
@@ -1454,7 +1409,7 @@
       <fileset file="${test.conf}/commitlog_compression.yaml"/>
     </concat>
     <path id="all-test-classes-path">
-      <fileset dir="${test.unit.src}" excludes="**/pig/*.java" includes="**/${test.name}.java" />
+      <fileset dir="${test.unit.src}" includes="**/${test.name}.java" />
       <fileset dir="${test.distributed.src}" includes="**/${test.name}.java" />
     </path>
     <property name="all-test-classes" refid="all-test-classes-path"/>
@@ -1467,12 +1422,12 @@
       <jvmarg value="-Dcassandra.test-serialization-writes=True"/>
     </testmacro>
   </target>
-  
+
   <target name="msg-ser-test" depends="build-test" description="Tests message serializations">
       <testmacro inputdir="${test.unit.src}" timeout="${test.timeout}"
                filter="**/SerializationsTest.java"/>
   </target>
-  
+
   <target name="msg-ser-test-7" depends="build-test" description="Generates message serializations">
     <testmacro inputdir="${test.unit.src}"
         timeout="${test.timeout}" filter="**/SerializationsTest.java">
@@ -1519,7 +1474,6 @@
         <classpath>
           <path refid="cassandra.classpath" />
           <pathelement location="${test.classes}"/>
-          <path refid="cobertura.classpath"/>
           <pathelement location="${test.conf}"/>
           <fileset dir="${test.lib}">
             <include name="**/*.jar" />
@@ -1550,7 +1504,6 @@
       <mkdir dir="${build.test.dir}/cassandra"/>
       <mkdir dir="${build.test.dir}/output"/>
       <junit fork="on" forkmode="once" failureproperty="testfailed" maxmemory="1024m" timeout="${test.timeout}">
-        <sysproperty key="net.sourceforge.cobertura.datafile" file="${cobertura.datafile}"/>
         <formatter type="brief" usefile="false"/>
         <jvmarg value="-Dstorage-config=${test.conf}"/>
         <jvmarg value="-Djava.awt.headless=true"/>
@@ -1563,7 +1516,6 @@
         <classpath>
           <path refid="cassandra.classpath" />
           <pathelement location="${test.classes}"/>
-          <path refid="cobertura.classpath"/>
           <pathelement location="${test.conf}"/>
           <fileset dir="${test.lib}">
             <include name="**/*.jar" />
@@ -1574,12 +1526,6 @@
     </sequential>
   </target>
 
-  <target name="pig-test" depends="build-test,maven-ant-tasks-retrieve-pig-test" description="Excute Pig tests">
-    <testmacro inputdir="${test.pig.src}"
-               timeout="1200000">
-    </testmacro>
-  </target>
-
   <!-- Use JaCoCo ant extension without needing externally saved lib -->
   <target name="jacoco-init" depends="maven-ant-tasks-init">
     <artifact:dependencies pathId="jacocoant.classpath">
@@ -1624,41 +1570,6 @@
     <delete dir="${jacoco.export.dir}"/>
   </target>
 
-  <!-- instruments the classes to later create code coverage reports -->
-  <target name="cobertura-instrument" depends="build,build-test">
-    <taskdef resource="tasks.properties">
-      <classpath refid="cobertura.classpath"/>
-      <classpath refid="cassandra.classpath"/>
-    </taskdef>
-
-    <delete file="${cobertura.datafile}"/>
-
-    <cobertura-instrument todir="${cobertura.classes.dir}" datafile="${cobertura.datafile}">
-      <ignore regex="ch.qos.logback.*"/>
-
-      <fileset dir="${build.classes.main}">
-        <include name="**/*.class"/>
-        <exclude name="**/*Test.class"/>
-        <exclude name="**/*TestCase.class"/>
-        <exclude name="**/test/*.class"/>
-        <!-- cobertura modifies the serialVersionUID of classes. Some of our unit tests rely on backward
-        wire compatability of these classes.  It was easier to exlude them from instrumentation than to
-        force their serialVersionUIDs. -->
-        <exclude name="**/*Token.class"/>
-        <exclude name="${cobertura.excludes}"/>
-      </fileset>
-
-    </cobertura-instrument>
-  </target>
-
-  <!-- create both html and xml code coverage reports -->
-  <target name="cobertura-report">
-    <cobertura-report format="html" destdir="${cobertura.report.dir}" srcdir="${build.src.java}"
-      datafile="${cobertura.datafile}"/>
-    <cobertura-report format="xml" destdir="${cobertura.report.dir}" srcdir="${build.src.java}"
-      datafile="${cobertura.datafile}"/>
-  </target>
-
   <!--
     License audit tool
   -->
@@ -1670,7 +1581,7 @@
   </target>
 
   <target name="rat-check" depends="rat-init">
-    <rat:report xmlns:rat="antlib:org.apache.rat.anttasks"  
+    <rat:report xmlns:rat="antlib:org.apache.rat.anttasks"
                 reportFile="${build.dir}/rat-report.log">
       <fileset dir="."  excludesfile=".rat-excludes" />
     </rat:report>
@@ -1775,12 +1686,25 @@
   <scriptdef name="testparallelhelper" language="javascript">
     <attribute name="testdelegate"/>
     <![CDATA[
-        var Integer = java.lang.Integer;
         sep = project.getProperty("path.separator");
         all = project.getProperty("all-test-classes").split(sep);
-        dir = project.getProperty("test.unit.src");
+        runners = project.getProperty("test.runners")
+        cores = project.getProperty("cores.count")
+        mem = project.getProperty("mem.size")
 
-        numRunners = parseInt(project.getProperty("test.runners"));
+        numRunners = 1
+        if (runners != null) // there's test.runners override
+            numRunners = parseInt(runners) || 1;
+        else if (cores != null && mem != null) // only if cores and memory size is set
+            numRunners = Math.min(Math.floor(Math.sqrt(parseInt(cores) || 1)),
+                                  Math.floor((parseInt(mem) || 1)/(4*1024*1024*1024)));
+
+        if (numRunners < 1)
+            numRunners = 1
+
+        var echo = project.createTask("echo");
+        echo.setMessage("Number of test runners: " + numRunners);
+        echo.perform();
 
         var p = project.createTask('parallel');
         p.setThreadCount(numRunners);
@@ -1802,9 +1726,44 @@
     ]]>
   </scriptdef>
 
-  <target name="test" depends="eclipse-warnings,build-test" description="Parallel Test Runner">
+  <target name="get-cores">
+    <property environment="env"/>
+    <!-- support for Windows -->
+    <condition property="cores.count" value="${env.NUMBER_OF_PROCESSORS}">
+      <os family="windows" />
+    </condition>
+    <!-- support for Linux and Solaris (package SUNWgnu-coreutils is required) -->
+    <exec executable="nproc" outputproperty="cores.count" os="Linux,SunOS,Solaris" failifexecutionfails="false">
+      <arg value="--all"/>
+    </exec>
+    <!-- support for Mac OS X -->
+    <exec executable="sysctl" outputproperty="cores.count" os="Mac,Mac OS X,Darwin" failifexecutionfails="false">
+      <arg value="-n"/>
+      <arg value="hw.ncpu"/>
+    </exec>
+    <echo message="Number of cores: ${cores.count}"/>
+  </target>
+
+  <target name="get-mem">
+    <condition property="mem.size" value="unknown">
+      <os family="windows" />
+    </condition>
+    <!-- support for Linux and Solaris (package SUNWgnu-coreutils is required) -->
+    <exec executable="bash" outputproperty="mem.size" os="Linux,SunOS,Solaris" failifexecutionfails="false">
+      <arg value="-c"/>
+      <arg value="free -b | grep Mem: | awk '{print $2}'"/>
+    </exec>
+    <!-- support for Mac OS X -->
+    <exec executable="sysctl" outputproperty="mem.size" os="Mac,Mac OS X,Darwin" failifexecutionfails="false">
+      <arg value="-n"/>
+      <arg value="hw.memsize"/>
+    </exec>
+    <echo message="Mem size : ${mem.size}"/>
+  </target>
+
+  <target name="test" depends="eclipse-warnings,build-test,get-cores,get-mem" description="Parallel Test Runner">
     <path id="all-test-classes-path">
-      <fileset dir="${test.unit.src}" includes="**/${test.name}.java" excludes="**/pig/*.java **/distributed/test/UpgradeTest*.java" />
+      <fileset dir="${test.unit.src}" includes="**/${test.name}.java" excludes="**/distributed/test/UpgradeTest*.java" />
     </path>
     <property name="all-test-classes" refid="all-test-classes-path"/>
     <testparallel testdelegate="testlist"/>
@@ -1820,32 +1779,19 @@
     <property name="all-test-classes" refid="all-test-classes-path"/>
     <testparallel testdelegate="testlist"/>
   </target>
-  <target name="testclasslist-compression" depends="build-test" description="Parallel-run compression tests given in file -Dtest.classlistfile (one-class-per-line, e.g. org/apache/cassandra/db/SomeTest.java)">
-    <path id="all-test-classes-path">
-      <fileset dir="${test.unit.src}" includesfile="${test.classlistfile}"/>
-    </path>
-    <property name="all-test-classes" refid="all-test-classes-path"/>
-    <testparallel testdelegate="testlist-compression"/>
-  </target>
-
-  <target name="test-distributed" depends="build-test" description="Execute unit tests">
-    <testmacro inputdir="${test.distributed.src}" timeout="${test.distributed.timeout}" forkmode="once" showoutput="true" filter="**/test/*Test.java">
-      <jvmarg value="-Dlogback.configurationFile=test/conf/logback-dtest.xml"/>
-      <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
-      <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
-      <jvmarg value="-Dcassandra.skip_sync=true" />
-      <jvmarg value="-XX:MaxMetaspaceSize=256M" />
-      <jvmarg value="-XX:SoftRefLRUPolicyMSPerMB=0" />
-      <jvmarg value="-XX:+HeapDumpOnOutOfMemoryError" />
-      <jvmarg value="-XX:HeapDumpPath=build/test/oom.hprof" />
-    </testmacro>
+  <target name="testclasslist-compression" depends="build-test" description="Parallel-run tests given in file -Dtest.classlistfile (one-class-per-line, e.g. org/apache/cassandra/db/SomeTest.java)">
+      <path id="all-test-classes-path">
+          <fileset dir="${test.dir}/${test.classlistprefix}" includesfile="${test.classlistfile}"/>
+      </path>
+      <property name="all-test-classes" refid="all-test-classes-path"/>
+      <testparallel testdelegate="testlist-compression"/>
   </target>
 
   <!-- In-JVM dtest targets -->
   <target name="list-jvm-dtests" depends="build-test">
     <java classname="org.apache.cassandra.distributed.test.TestLocator" fork="no">
           <classpath>
-              <path refid="cassandra.classpath"/>
+              <path refid="cassandra.classpath" />
               <pathelement location="${test.classes}"/>
               <pathelement location="${test.conf}"/>
               <fileset dir="${test.lib}">
@@ -1880,6 +1826,38 @@
     </testmacro>
   </target>
 
+  <!-- In-JVM upgrade dtests -->
+  <target name="list-jvm-upgrade-dtests" depends="build-test">
+    <java classname="org.apache.cassandra.distributed.test.TestLocator" fork="no">
+          <classpath>
+              <path refid="cassandra.classpath" />
+              <pathelement location="${test.classes}"/>
+              <pathelement location="${test.conf}"/>
+              <fileset dir="${test.lib}">
+                  <include name="**/*.jar" />
+              </fileset>
+          </classpath>
+          <arg value="${test.distributed.upgrade.listfile}"/>
+          <arg value="${test.distributed.upgrade.package}"/>
+    </java>
+  </target>
+
+  <target name="test-jvm-upgrade-dtest-forking" depends="list-jvm-upgrade-dtests" description="Execute In-JVM 'distributed' upgrade tests" >
+    <chmod file="${test.distributed.upgrade.listfile}" perm="+x"/>
+    <exec executable="./${test.distributed.upgrade.listfile}" failonerror="true"/>
+    <delete file="${test.distributed.upgrade.listfile}"/>
+  </target>
+
+  <target name="test-jvm-upgrade-dtest" depends="build-test" description="Execute in-jvm dtests">
+    <testmacro inputdir="${test.distributed.src}" timeout="${test.distributed.timeout}" forkmode="once" showoutput="true" filter="**/upgrade/*Test.java">
+      <jvmarg value="-Dlogback.configurationFile=test/conf/logback-dtest.xml"/>
+      <jvmarg value="-Dcassandra.ring_delay_ms=10000"/>
+      <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
+      <jvmarg value="-Dcassandra.skip_sync=true" />
+      <jvmarg value="-XX:MaxMetaspaceSize=512M"/>
+    </testmacro>
+  </target>
+
   <!-- Use this with an FQDN for test class, and a csv list of methods like this:
       ant test-jvm-dtest-some -Dtest.name=org.apache.cassandra.distributed.test.ResourceLeakTest -Dtest.methods=looperTest
     -->
@@ -1901,7 +1879,6 @@
           <classpath>
               <path refid="cassandra.classpath" />
               <pathelement location="${test.classes}"/>
-              <path refid="cobertura.classpath"/>
               <pathelement location="${test.conf}"/>
               <fileset dir="${test.lib}">
                   <include name="**/*.jar" />
@@ -1911,6 +1888,31 @@
       </java>
   </target>
 
+  <!-- run arbitrary mains in tests, for example to run the long running memory tests with lots of memory pressure
+      ant run-main -Dmainclass=org.apache.cassandra.utils.memory.LongBufferPoolTest -Dvmargs="-Xmx30m -XX:-UseGCOverheadLimit"
+  -->
+  <target name="run-main" depends="build-test">
+      <property name="mainclass" value="" />
+      <property name="vmargs" value="" />
+      <property name="args" value="" />
+      <java classname="${mainclass}"
+            fork="true"
+            failonerror="true">
+          <jvmarg value="-server" />
+          <jvmarg value="-ea" />
+          <jvmarg line="${vmargs}" />
+          <arg line="${args}" />
+          <classpath>
+              <path refid="cassandra.classpath" />
+              <pathelement location="${test.classes}"/>
+              <pathelement location="${test.conf}"/>
+              <fileset dir="${test.lib}">
+                  <include name="**/*.jar" />
+              </fileset>
+          </classpath>
+      </java>
+  </target>
+
   <!-- Generate IDEA project description files -->
   <target name="generate-idea-files" depends="build-test" description="Generate IDEA files">
     <mkdir dir=".idea"/>
@@ -1956,7 +1958,6 @@
   <classpathentry kind="src" path="interface/thrift/gen-java"/>
   <classpathentry kind="src" output="build/test/classes" path="test/unit"/>
   <classpathentry kind="src" output="build/test/classes" path="test/long"/>
-  <classpathentry kind="src" output="build/test/classes" path="test/pig"/>
   <classpathentry kind="src" output="build/test/classes" path="test/distributed"/>
   <classpathentry kind="src" output="build/test/classes" path="test/resources" />
   <classpathentry kind="src" path="tools/stress/src"/>
@@ -1964,8 +1965,9 @@
   <classpathentry kind="output" path="build/classes/main"/>
   <classpathentry kind="lib" path="build/classes/thrift" sourcepath="interface/thrift/gen-java/"/>
   <classpathentry kind="lib" path="test/conf"/>
+  <classpathentry kind="lib" path="${java.home}/../lib/tools.jar"/>
 ]]>
-	</echo>	  
+	</echo>
   	<path id="eclipse-project-libs-path">
   	 <fileset dir="lib">
   	    <include name="**/*.jar" />
@@ -1979,27 +1981,27 @@
   		var File = java.io.File;
   		var FilenameUtils = Packages.org.apache.commons.io.FilenameUtils;
   		jars = project.getProperty("eclipse-project-libs").split(project.getProperty("path.separator"));
-  		
+
   		cp = "";
   	    for (i=0; i< jars.length; i++) {
   	       srcjar = FilenameUtils.getBaseName(jars[i]) + '-sources.jar';
   		   srcdir = FilenameUtils.concat(project.getProperty("build.dir.lib"), 'sources');
   		   srcfile = new File(FilenameUtils.concat(srcdir, srcjar));
-  		
+
   		   cp += ' <classpathentry kind="lib" path="' + jars[i] + '"';
   		   if (srcfile.exists()) {
   		      cp += ' sourcepath="' + srcfile.getAbsolutePath() + '"';
   		   }
   		   cp += '/>\n';
   		}
-  		
+
   		cp += '</classpath>';
-  	    
+
   		echo = project.createTask("echo");
   	    echo.setMessage(cp);
   		echo.setFile(new File(".classpath"));
   		echo.setAppend(true);
-  	    echo.perform();	     
+  	    echo.perform();
   	]]> </script>
     <mkdir dir=".settings" />
   </target>
@@ -2019,30 +2021,29 @@
   </target>
 
 
-  <target name="eclipse-warnings" depends="build" description="Run eclipse compiler code analysis">        
+  <target name="eclipse-warnings" depends="build" description="Run eclipse compiler code analysis">
         <property name="ecj.log.dir" value="${build.dir}/ecj" />
         <property name="ecj.warnings.file" value="${ecj.log.dir}/eclipse_compiler_checks.txt"/>
-        <delete dir="${ecj.log.dir}" />
         <mkdir  dir="${ecj.log.dir}" />
 
-        <property name="ecj.properties" value="${basedir}/eclipse_compiler.properties" />                
+        <property name="ecj.properties" value="${basedir}/eclipse_compiler.properties" />
 
-        <echo message="Running Eclipse Code Analysis.  Output logged to ${ecj.warnings.file}" />        
-        
-	<java 
+        <echo message="Running Eclipse Code Analysis.  Output logged to ${ecj.warnings.file}" />
+
+	<java
 	    jar="${build.dir.lib}/jars/ecj-${ecj.version}.jar"
             fork="true"
 	    failonerror="true"
-            maxmemory="512m"> 
+            maxmemory="512m">
             <arg value="-source"/>
-	    <arg value="${source.version}" /> 
+	    <arg value="${source.version}" />
 	    <arg value="-target"/>
-	    <arg value="${target.version}" /> 
+	    <arg value="${target.version}" />
 	    <arg value="-d" />
             <arg value="none" />
-	    <arg value="-proc:none" /> 
+	    <arg value="-proc:none" />
             <arg value="-log" />
-            <arg value="${ecj.warnings.file}" /> 
+            <arg value="${ecj.warnings.file}" />
             <arg value="-properties" />
             <arg value="${ecj.properties}" />
             <arg value="-cp" />
@@ -2050,19 +2051,19 @@
             <arg value="${build.src.java}" />
         </java>
   </target>
-  
+
 
   <!-- Installs artifacts to local Maven repository -->
   <target name="mvn-install"
           depends="maven-declare-dependencies,jar,sources-jar,javadoc-jar"
           description="Installs the artifacts in the Maven Local Repository">
-          
+
     <!-- the parent -->
     <install pomFile="${build.dir}/${final.name}-parent.pom"
              file="${build.dir}/${final.name}-parent.pom"
              packaging="pom"/>
 
-    <!-- the cassandra-thrift jar -->  
+    <!-- the cassandra-thrift jar -->
     <install pomFile="${build.dir}/${ant.project.name}-thrift-${version}.pom"
              file="${build.dir}/${ant.project.name}-thrift-${version}.jar"/>
     <install pomFile="${build.dir}/${ant.project.name}-thrift-${version}.pom"
@@ -2072,7 +2073,7 @@
              file="${build.dir}/${ant.project.name}-thrift-${version}-javadoc.jar"
              classifier="javadoc"/>
 
-    <!-- the cassandra-clientutil jar -->  
+    <!-- the cassandra-clientutil jar -->
     <install pomFile="${build.dir}/${ant.project.name}-clientutil-${version}.pom"
              file="${build.dir}/${ant.project.name}-clientutil-${version}.jar"/>
     <install pomFile="${build.dir}/${ant.project.name}-clientutil-${version}.pom"
@@ -2098,13 +2099,13 @@
           depends="mvn-install,artifacts"
           if="release"
           description="Publishes the artifacts to the Maven repository">
-          
+
     <!-- the parent -->
     <deploy pomFile="${build.dir}/${final.name}-parent.pom"
             file="${build.dir}/${final.name}-parent.pom"
             packaging="pom"/>
-          
-    <!-- the cassandra-thrift jar -->  
+
+    <!-- the cassandra-thrift jar -->
     <deploy pomFile="${build.dir}/${ant.project.name}-thrift-${version}.pom"
             file="${build.dir}/${ant.project.name}-thrift-${version}.jar"/>
     <deploy pomFile="${build.dir}/${ant.project.name}-thrift-${version}.pom"
@@ -2114,7 +2115,7 @@
             file="${build.dir}/${ant.project.name}-thrift-${version}-javadoc.jar"
             classifier="javadoc"/>
 
-    <!-- the cassandra-clientutil jar -->  
+    <!-- the cassandra-clientutil jar -->
     <deploy pomFile="${build.dir}/${ant.project.name}-clientutil-${version}.pom"
             file="${build.dir}/${ant.project.name}-clientutil-${version}.jar"/>
     <deploy pomFile="${build.dir}/${ant.project.name}-clientutil-${version}.pom"
@@ -2139,4 +2140,5 @@
     <sign-dist file="${build.dir}/${final.name}-src.tar.gz" />
 
   </target>
+
 </project>

diff --git a/conf/cassandra-env.ps1 b/conf/cassandra-env.ps1
index 7b4a632..74511f0 100644
--- a/conf/cassandra-env.ps1
+++ b/conf/cassandra-env.ps1

@@ -1,482 +1,490 @@
-#

-# Licensed to the Apache Software Foundation (ASF) under one or more

-# contributor license agreements.  See the NOTICE file distributed with

-# this work for additional information regarding copyright ownership.

-# The ASF licenses this file to You under the Apache License, Version 2.0

-# (the "License"); you may not use this file except in compliance with

-# the License.  You may obtain a copy of the License at

-#

-#     http://www.apache.org/licenses/LICENSE-2.0

-#

-# Unless required by applicable law or agreed to in writing, software

-# distributed under the License is distributed on an "AS IS" BASIS,

-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-# See the License for the specific language governing permissions and

-# limitations under the License.

-

-# NOTE: All param tuning can be done in the SetCassandraEnvironment Function below

-

-#-----------------------------------------------------------------------------

-Function SetCassandraHome()

-{

-    if (! $env:CASSANDRA_HOME)

-    {

-        $cwd = [System.IO.Directory]::GetCurrentDirectory()

-        $cwd = Split-Path $cwd -parent

-        $env:CASSANDRA_HOME = $cwd -replace "\\", "/"

-    }

-}

-

-#-----------------------------------------------------------------------------

-Function SetCassandraMain()

-{

-    if (! $env:CASSANDRA_MAIN)

-    {

-        $env:CASSANDRA_MAIN="org.apache.cassandra.service.CassandraDaemon"

-    }

-}

-

-#-----------------------------------------------------------------------------

-Function BuildClassPath

-{

-    $cp = """$env:CASSANDRA_HOME\conf"""

-    foreach ($file in Get-ChildItem "$env:CASSANDRA_HOME\lib\*.jar")

-    {

-        $file = $file -replace "\\", "/"

-        $cp = $cp + ";" + """$file"""

-    }

-

-    # Add build/classes/main so it works in development

-    $cp = $cp + ";" + """$env:CASSANDRA_HOME\build\classes\main"";""$env:CASSANDRA_HOME\build\classes\thrift"""

-    $env:CLASSPATH=$cp

-}

-

-#-----------------------------------------------------------------------------

-Function CalculateHeapSizes

-{

-    # Check if swapping is enabled on the host and warn if so - reference CASSANDRA-7316

-

-    $osInfo = Get-WmiObject -class "Win32_computersystem"

-    $autoPage = $osInfo.AutomaticManagedPageFile

-

-    if ($autoPage)

-    {

-        echo "*---------------------------------------------------------------------*"

-        echo "*---------------------------------------------------------------------*"

-        echo ""

-        echo "    WARNING!  Automatic page file configuration detected."

-        echo "    It is recommended that you disable swap when running Cassandra"

-        echo "    for performance and stability reasons."

-        echo ""

-        echo "*---------------------------------------------------------------------*"

-        echo "*---------------------------------------------------------------------*"

-    }

-    else

-    {

-        $pageFileInfo = Get-WmiObject -class "Win32_PageFileSetting" -EnableAllPrivileges

-        $pageFileCount = $PageFileInfo.Count

-        if ($pageFileInfo)

-        {

-            $files = @()

-            $sizes = @()

-            $hasSizes = $FALSE

-

-            # PageFileCount isn't populated and obj comes back as single if there's only 1

-            if ([string]::IsNullOrEmpty($PageFileCount))

-            {

-                $PageFileCount = 1

-                $files += $PageFileInfo.Name

-                if ($PageFileInfo.MaximumSize -ne 0)

-                {

-                    $hasSizes = $TRUE

-                    $sizes += $PageFileInfo.MaximumSize

-                }

-            }

-            else

-            {

-                for ($i = 0; $i -le $PageFileCount; $i++)

-                {

-                    $files += $PageFileInfo[$i].Name

-                    if ($PageFileInfo[$i].MaximumSize -ne 0)

-                    {

-                        $hasSizes = $TRUE

-                        $sizes += $PageFileInfo[$i].MaximumSize

-                    }

-                }

-            }

-

-            echo "*---------------------------------------------------------------------*"

-            echo "*---------------------------------------------------------------------*"

-            echo ""

-            echo "    WARNING!  $PageFileCount swap file(s) detected"

-            for ($i = 0; $i -lt $PageFileCount; $i++)

-            {

-                $toPrint = "        Name: " + $files[$i]

-                if ($hasSizes)

-                {

-                    $toPrint = $toPrint + " Size: " + $sizes[$i]

-                    $toPrint = $toPrint -replace [Environment]::NewLine, ""

-                }

-                echo $toPrint

-            }

-            echo "    It is recommended that you disable swap when running Cassandra"

-            echo "    for performance and stability reasons."

-            echo ""

-            echo "*---------------------------------------------------------------------*"

-            echo "*---------------------------------------------------------------------*"

-        }

-    }

-

-    # Validate that we need to run this function and that our config is good

-    if ($env:MAX_HEAP_SIZE -and $env:HEAP_NEWSIZE)

-    {

-        return

-    }

-    if (($env:MAX_HEAP_SIZE -and !$env:HEAP_NEWSIZE) -or (!$env:MAX_HEAP_SIZE -and $env:HEAP_NEWSIZE))

-    {

-        echo "Please set or unset MAX_HEAP_SIZE and HEAP_NEWSIZE in pairs.  Aborting startup."

-        exit 1

-    }

-

-    $memObject = Get-WMIObject -class win32_physicalmemory

-    if ($memObject -eq $null)

-    {

-        echo "WARNING!  Could not determine system memory.  Defaulting to 2G heap, 512M newgen.  Manually override in conf\cassandra-env.ps1 for different heap values."

-        $env:MAX_HEAP_SIZE = "2048M"

-        $env:HEAP_NEWSIZE = "512M"

-        return

-    }

-

-    $memory = ($memObject | Measure-Object Capacity -Sum).sum

-    $memoryMB = [Math]::Truncate($memory / (1024*1024))

-

-    $cpu = gwmi Win32_ComputerSystem | Select-Object NumberOfLogicalProcessors

-    $systemCores = $cpu.NumberOfLogicalProcessors

-

-    # set max heap size based on the following

-    # max(min(1/2 ram, 1024MB), min(1/4 ram, 8GB))

-    # calculate 1/2 ram and cap to 1024MB

-    # calculate 1/4 ram and cap to 8192MB

-    # pick the max

-    $halfMem = [Math]::Truncate($memoryMB / 2)

-    $quarterMem = [Math]::Truncate($halfMem / 2)

-

-    if ($halfMem -gt 1024)

-    {

-        $halfMem = 1024

-    }

-    if ($quarterMem -gt 8192)

-    {

-        $quarterMem = 8192

-    }

-

-    $maxHeapMB = ""

-    if ($halfMem -gt $quarterMem)

-    {

-        $maxHeapMB = $halfMem

-    }

-    else

-    {

-        $maxHeapMB = $quarterMem

-    }

-    $env:MAX_HEAP_SIZE = [System.Convert]::ToString($maxHeapMB) + "M"

-

-    # Young gen: min(max_sensible_per_modern_cpu_core * num_cores, 1/4

-    $maxYGPerCore = 100

-    $maxYGTotal = $maxYGPerCore * $systemCores

-    $desiredYG = [Math]::Truncate($maxHeapMB / 4)

-

-    if ($desiredYG -gt $maxYGTotal)

-    {

-        $env:HEAP_NEWSIZE = [System.Convert]::ToString($maxYGTotal) + "M"

-    }

-    else

-    {

-        $env:HEAP_NEWSIZE = [System.Convert]::ToString($desiredYG) + "M"

-    }

-}

-

-#-----------------------------------------------------------------------------

-Function SetJsr223Env

-{

-    $cp = $env:CLASSPATH

-    foreach ($jsrDir in Get-ChildItem -Path "$env:CASSANDRA_HOME\lib\jsr223")

-    {

-        foreach ($file in Get-ChildItem -Path "$env:CASSANDRA_HOME\lib\jsr223\$jsrDir\*.jar")

-        {

-            $file = $file -replace "\\", "/"

-			$cp = $cp + ";" + """$file"""

-        }

-    }

-    $env:CLASSPATH=$cp

-

-	# JSR223/JRuby - set ruby lib directory

-	if (Test-Path "$env:CASSANDRA_HOME\lib\jsr223\jruby\ruby")

-	{

-		$env:CASSANDRA_PARAMS=$env:CASSANDRA_PARAMS + " -Djruby.lib=$env:CASSANDRA_HOME\lib\jsr223\jruby"

-	}

-	# JSR223/JRuby - set ruby JNI libraries root directory

-	if (Test-Path "$env:CASSANDRA_HOME\lib\jsr223\jruby\jni")

-	{

-		$env:CASSANDRA_PARAMS=$env:CASSANDRA_PARAMS + " -Djffi.boot.library.path=$env:CASSANDRA_HOME\lib\jsr223\jruby\jni"

-	}

-	# JSR223/Jython - set python.home system property

-	if (Test-Path "$env:CASSANDRA_HOME\lib\jsr223\jython\jython.jar")

-	{

-		$env:CASSANDRA_PARAMS=$env:CASSANDRA_PARAMS + " -Dpython.home=$env:CASSANDRA_HOME\lib\jsr223\jython"

-	}

-	# JSR223/Scala - necessary system property

-	if (Test-Path "$env:CASSANDRA_HOME\lib\jsr223\scala\scala-compiler.jar")

-	{

-		$env:CASSANDRA_PARAMS=$env:CASSANDRA_PARAMS + " -Dscala.usejavacp=true"

-	}

-}

-

-#-----------------------------------------------------------------------------

-Function ParseJVMInfo

-{

-    # grab info about the JVM

-    $pinfo = New-Object System.Diagnostics.ProcessStartInfo

-    $pinfo.FileName = "$env:JAVA_BIN"

-    $pinfo.RedirectStandardError = $true

-    $pinfo.RedirectStandardOutput = $true

-    $pinfo.UseShellExecute = $false

-    $pinfo.Arguments = "-d64 -version"

-    $p = New-Object System.Diagnostics.Process

-    $p.StartInfo = $pinfo

-    $p.Start() | Out-Null

-    $p.WaitForExit()

-    $stderr = $p.StandardError.ReadToEnd()

-

-    $env:JVM_ARCH = "64-bit"

-

-    if ($stderr.Contains("Error"))

-    {

-        # 32-bit JVM. re-run w/out -d64

-        echo "Failed 64-bit check. Re-running to get version from 32-bit"

-        $pinfo.Arguments = "-version"

-        $p = New-Object System.Diagnostics.Process

-        $p.StartInfo = $pinfo

-        $p.Start() | Out-Null

-        $p.WaitForExit()

-        $stderr = $p.StandardError.ReadToEnd()

-        $env:JVM_ARCH = "32-bit"

-    }

-

-    $sa = $stderr.Split("""")

-    $env:JVM_VERSION = $sa[1]

-

-    if ($stderr.Contains("OpenJDK"))

-    {

-        $env:JVM_VENDOR = "OpenJDK"

-    }

-    elseif ($stderr.Contains("Java(TM)"))

-    {

-        $env:JVM_VENDOR = "Oracle"

-    }

-    else

-    {

-        $JVM_VENDOR = "other"

-    }

-

-    $pa = $sa[1].Split("_")

-    $env:JVM_PATCH_VERSION=$pa[1]

-}

-

-#-----------------------------------------------------------------------------

-Function SetCassandraEnvironment

-{

-    if (Test-Path Env:\JAVA_HOME)

-    {

-        $env:JAVA_BIN = "$env:JAVA_HOME\bin\java.exe"

-    }

-    elseif (Get-Command "java.exe")

-    {

-        $env:JAVA_BIN = "java.exe"

-    }

-    else

-    {

-        echo "ERROR!  No JAVA_HOME set and could not find java.exe in the path."

-        exit

-    }

-    SetCassandraHome

-    $env:CASSANDRA_CONF = "$env:CASSANDRA_HOME\conf"

-    $env:CASSANDRA_PARAMS="-Dcassandra -Dlogback.configurationFile=logback.xml"

-

-    $logdir = "$env:CASSANDRA_HOME\logs"

-    $storagedir = "$env:CASSANDRA_HOME\data"

-    $env:CASSANDRA_PARAMS = $env:CASSANDRA_PARAMS + " -Dcassandra.logdir=""$logdir"" -Dcassandra.storagedir=""$storagedir"""

-

-    SetCassandraMain

-    BuildClassPath

-    SetJsr223Env

-

-    # Override these to set the amount of memory to allocate to the JVM at

-    # start-up. For production use you may wish to adjust this for your

-    # environment. MAX_HEAP_SIZE is the total amount of memory dedicated

-    # to the Java heap; HEAP_NEWSIZE refers to the size of the young

-    # generation. Both MAX_HEAP_SIZE and HEAP_NEWSIZE should be either set

-    # or not (if you set one, set the other).

-    #

-    # The main trade-off for the young generation is that the larger it

-    # is, the longer GC pause times will be. The shorter it is, the more

-    # expensive GC will be (usually).

-    #

-    # The example HEAP_NEWSIZE assumes a modern 8-core+ machine for decent

-    # times. If in doubt, and if you do not particularly want to tweak, go

-    # 100 MB per physical CPU core.

-

-    #$env:MAX_HEAP_SIZE="4096M"

-    #$env:HEAP_NEWSIZE="800M"

-    CalculateHeapSizes

-

-    ParseJVMInfo

-    # Add sigar env - see Cassandra-7838

-    $env:JVM_OPTS = "$env:JVM_OPTS -Djava.library.path=""$env:CASSANDRA_HOME\lib\sigar-bin"""

-

-    # Confirm we're on high performance power plan, warn if not

-    # Change to $true to suppress this warning

-    $suppressPowerWarning = $false

-    if (!$suppressPowerWarning)

-    {

-        $currentProfile = powercfg /GETACTIVESCHEME

-        if (!$currentProfile.Contains("High performance"))

-        {

-            echo "*---------------------------------------------------------------------*"

-            echo "*---------------------------------------------------------------------*"

-            echo ""

-            echo "    WARNING! Detected a power profile other than High Performance."

-            echo "    Performance of this node will suffer."

-            echo "    Modify conf\cassandra.env.ps1 to suppress this warning."

-            echo ""

-            echo "*---------------------------------------------------------------------*"

-            echo "*---------------------------------------------------------------------*"

-        }

-    }

-

-    # add the jamm javaagent

-    if (($env:JVM_VENDOR -ne "OpenJDK") -or ($env:JVM_VERSION.CompareTo("1.6.0") -eq 1) -or

-        (($env:JVM_VERSION -eq "1.6.0") -and ($env:JVM_PATCH_VERSION.CompareTo("22") -eq 1)))

-    {

-        $env:JVM_OPTS = "$env:JVM_OPTS -javaagent:""$env:CASSANDRA_HOME\lib\jamm-0.3.0.jar"""

-    }

-

-    # enable assertions.  disabling this in production will give a modest

-    # performance benefit (around 5%).

-    $env:JVM_OPTS = "$env:JVM_OPTS -ea"

-

-    # Specifies the default port over which Cassandra will be available for

-    # JMX connections.

-    $JMX_PORT="7199"

-

-    # store in env to check if it's avail in verification

-    $env:JMX_PORT=$JMX_PORT

-

-    # some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+CMSClassUnloadingEnabled"

-

-    # enable thread priorities, primarily so we can give periodic tasks

-    # a lower priority to avoid interfering with client workload

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseThreadPriorities"

-    # allows lowering thread priority without being root on linux - probably

-    # not necessary on Windows but doesn't harm anything.

-    # see http://tech.stolsvik.com/2010/01/linux-java-thread-priorities-workar

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:ThreadPriorityPolicy=42"

-

-    # min and max heap sizes should be set to the same value to avoid

-    # stop-the-world GC pauses during resize.

-    $env:JVM_OPTS="$env:JVM_OPTS -Xms$env:MAX_HEAP_SIZE"

-    $env:JVM_OPTS="$env:JVM_OPTS -Xmx$env:MAX_HEAP_SIZE"

-    $env:JVM_OPTS="$env:JVM_OPTS -Xmn$env:HEAP_NEWSIZE"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+HeapDumpOnOutOfMemoryError"

-

-    # stop the jvm on OutOfMemoryError as it can result in some data corruption

-    # uncomment the preferred option

-    # ExitOnOutOfMemoryError and CrashOnOutOfMemoryError require a JRE greater or equals to 1.7 update 101 or 1.8 update 92

-    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+ExitOnOutOfMemoryError"

-    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+CrashOnOutOfMemoryError"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:OnOutOfMemoryError=""taskkill /F /PID %p"""

-

-    # print an heap histogram on OutOfMemoryError

-    # $env:JVM_OPTS="$env:JVM_OPTS -Dcassandra.printHeapHistogramOnOutOfMemoryError=true"

-

-    # Per-thread stack size.

-    $env:JVM_OPTS="$env:JVM_OPTS -Xss256k"

-

-    # Larger interned string table, for gossip's benefit (CASSANDRA-6410)

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:StringTableSize=1000003"

-

-    # GC tuning options

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseParNewGC"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseConcMarkSweepGC"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+CMSParallelRemarkEnabled"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:SurvivorRatio=8"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:MaxTenuringThreshold=1"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:CMSInitiatingOccupancyFraction=75"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseCMSInitiatingOccupancyOnly"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PerfDisableSharedMem"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseTLAB"

-    if (($env:JVM_VERSION.CompareTo("1.7") -eq 1) -and ($env:JVM_ARCH -eq "64-Bit"))

-    {

-        $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseCondCardMark"

-    }

-    if ( (($env:JVM_VERSION.CompareTo("1.7") -ge 0) -and ($env:JVM_PATCH_VERSION.CompareTo("60") -ge 0)) -or

-         ($env:JVM_VERSION.CompareTo("1.8") -ge 0))

-    {

-        $env:JVM_OPTS="$env:JVM_OPTS -XX:+CMSParallelInitialMarkEnabled -XX:+CMSEdenChunksRecordAlways"

-    }

-

-    # GC logging options

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintGCDetails"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintGCDateStamps"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintHeapAtGC"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintTenuringDistribution"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintGCApplicationStoppedTime"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PrintPromotionFailure"

-    # $env:JVM_OPTS="$env:JVM_OPTS -XX:PrintFLSStatistics=1"

-

-    $env:JVM_OPTS="$env:JVM_OPTS -Xloggc:""$env:CASSANDRA_HOME/logs/gc.log"""

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseGCLogFileRotation"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:NumberOfGCLogFiles=10"

-    $env:JVM_OPTS="$env:JVM_OPTS -XX:GCLogFileSize=10M"

-    # if using version before JDK 6u34 or 7u2 use this instead of log rotation

-    # $currentDate = (Get-Date).ToString('yyyy.MM.dd')

-    # $env:JVM_OPTS="$env:JVM_OPTS -Xloggc:$env:CASSANDRA_HOME/logs/gc-$currentDate.log"

-

-    # Configure the following for JEMallocAllocator and if jemalloc is not available in the system

-    # library path.

-    # set LD_LIBRARY_PATH=<JEMALLOC_HOME>/lib/

-    # $env:JVM_OPTS="$env:JVM_OPTS -Djava.library.path=<JEMALLOC_HOME>/lib/"

-

-    # uncomment to have Cassandra JVM listen for remote debuggers/profilers on port 1414

-    # $env:JVM_OPTS="$env:JVM_OPTS -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=1414"

-

-    # Prefer binding to IPv4 network intefaces (when net.ipv6.bindv6only=1). See

-    # http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6342561 (short version:

-    # comment out this entry to enable IPv6 support).

-    $env:JVM_OPTS="$env:JVM_OPTS -Djava.net.preferIPv4Stack=true"

-

-    # jmx: metrics and administration interface

-    #

-    # add this if you're having trouble connecting:

-    # $env:JVM_OPTS="$env:JVM_OPTS -Djava.rmi.server.hostname=<public name>"

-    #

-    # see

-    # https://blogs.oracle.com/jmxetc/entry/troubleshooting_connection_problems_in_jconsole

-    # for more on configuring JMX through firewalls, etc. (Short version:

-    # get it working with no firewall first.)

-    #

-    # Due to potential security exploits, Cassandra ships with JMX accessible

-    # *only* from localhost.  To enable remote JMX connections, uncomment lines below

-    # with authentication and ssl enabled. See https://wiki.apache.org/cassandra/JmxSecurity 

-    #

-    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.port=$JMX_PORT"

-    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.ssl=false"

-    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.authenticate=true"

-    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.password.file=C:/jmxremote.password"

-    $env:JVM_OPTS="$env:JVM_OPTS -Dcassandra.jmx.local.port=$JMX_PORT -XX:+DisableExplicitGC"

-

-    $env:JVM_OPTS="$env:JVM_OPTS $env:JVM_EXTRA_OPTS"

-

-    #$env:JVM_OPTS="$env:JVM_OPTS -XX:+UnlockCommercialFeatures -XX:+FlightRecorder"

-}

+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: All param tuning can be done in the SetCassandraEnvironment Function below
+
+#-----------------------------------------------------------------------------
+Function SetCassandraHome()
+{
+    if (! $env:CASSANDRA_HOME)
+    {
+        $cwd = [System.IO.Directory]::GetCurrentDirectory()
+        $cwd = Split-Path $cwd -parent
+        $env:CASSANDRA_HOME = $cwd -replace "\\", "/"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function SetCassandraMain()
+{
+    if (! $env:CASSANDRA_MAIN)
+    {
+        $env:CASSANDRA_MAIN="org.apache.cassandra.service.CassandraDaemon"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function BuildClassPath
+{
+    $cp = """$env:CASSANDRA_HOME\conf"""
+    foreach ($file in Get-ChildItem "$env:CASSANDRA_HOME\lib\*.jar")
+    {
+        $file = $file -replace "\\", "/"
+        $cp = $cp + ";" + """$file"""
+    }
+
+    # Add build/classes/main so it works in development
+    $cp = $cp + ";" + """$env:CASSANDRA_HOME\build\classes\main"";""$env:CASSANDRA_HOME\build\classes\thrift"""
+    $env:CLASSPATH=$cp
+}
+
+#-----------------------------------------------------------------------------
+Function CalculateHeapSizes
+{
+    # Check if swapping is enabled on the host and warn if so - reference CASSANDRA-7316
+
+    $osInfo = Get-WmiObject -class "Win32_computersystem"
+    $autoPage = $osInfo.AutomaticManagedPageFile
+
+    if ($autoPage)
+    {
+        echo "*---------------------------------------------------------------------*"
+        echo "*---------------------------------------------------------------------*"
+        echo ""
+        echo "    WARNING!  Automatic page file configuration detected."
+        echo "    It is recommended that you disable swap when running Cassandra"
+        echo "    for performance and stability reasons."
+        echo ""
+        echo "*---------------------------------------------------------------------*"
+        echo "*---------------------------------------------------------------------*"
+    }
+    else
+    {
+        $pageFileInfo = Get-WmiObject -class "Win32_PageFileSetting" -EnableAllPrivileges
+        $pageFileCount = $PageFileInfo.Count
+        if ($pageFileInfo)
+        {
+            $files = @()
+            $sizes = @()
+            $hasSizes = $FALSE
+
+            # PageFileCount isn't populated and obj comes back as single if there's only 1
+            if ([string]::IsNullOrEmpty($PageFileCount))
+            {
+                $PageFileCount = 1
+                $files += $PageFileInfo.Name
+                if ($PageFileInfo.MaximumSize -ne 0)
+                {
+                    $hasSizes = $TRUE
+                    $sizes += $PageFileInfo.MaximumSize
+                }
+            }
+            else
+            {
+                for ($i = 0; $i -le $PageFileCount; $i++)
+                {
+                    $files += $PageFileInfo[$i].Name
+                    if ($PageFileInfo[$i].MaximumSize -ne 0)
+                    {
+                        $hasSizes = $TRUE
+                        $sizes += $PageFileInfo[$i].MaximumSize
+                    }
+                }
+            }
+
+            echo "*---------------------------------------------------------------------*"
+            echo "*---------------------------------------------------------------------*"
+            echo ""
+            echo "    WARNING!  $PageFileCount swap file(s) detected"
+            for ($i = 0; $i -lt $PageFileCount; $i++)
+            {
+                $toPrint = "        Name: " + $files[$i]
+                if ($hasSizes)
+                {
+                    $toPrint = $toPrint + " Size: " + $sizes[$i]
+                    $toPrint = $toPrint -replace [Environment]::NewLine, ""
+                }
+                echo $toPrint
+            }
+            echo "    It is recommended that you disable swap when running Cassandra"
+            echo "    for performance and stability reasons."
+            echo ""
+            echo "*---------------------------------------------------------------------*"
+            echo "*---------------------------------------------------------------------*"
+        }
+    }
+
+    # Validate that we need to run this function and that our config is good
+    if ($env:MAX_HEAP_SIZE -and $env:HEAP_NEWSIZE)
+    {
+        return
+    }
+
+    if ((($env:MAX_HEAP_SIZE -and !$env:HEAP_NEWSIZE) -or (!$env:MAX_HEAP_SIZE -and $env:HEAP_NEWSIZE)) -and ($using_cms -eq $true))
+    {
+        echo "Please set or unset MAX_HEAP_SIZE and HEAP_NEWSIZE in pairs.  Aborting startup."
+        exit 1
+    }
+
+    $memObject = Get-WMIObject -class win32_physicalmemory
+    if ($memObject -eq $null)
+    {
+        echo "WARNING!  Could not determine system memory.  Defaulting to 2G heap, 512M newgen.  Manually override in conf\jvm.options for different heap values."
+        $env:MAX_HEAP_SIZE = "2048M"
+        $env:HEAP_NEWSIZE = "512M"
+        return
+    }
+
+    $memory = ($memObject | Measure-Object Capacity -Sum).sum
+    $memoryMB = [Math]::Truncate($memory / (1024*1024))
+
+    $cpu = gwmi Win32_ComputerSystem | Select-Object NumberOfLogicalProcessors
+    $systemCores = $cpu.NumberOfLogicalProcessors
+
+    # set max heap size based on the following
+    # max(min(1/2 ram, 1024MB), min(1/4 ram, 8GB))
+    # calculate 1/2 ram and cap to 1024MB
+    # calculate 1/4 ram and cap to 8192MB
+    # pick the max
+    $halfMem = [Math]::Truncate($memoryMB / 2)
+    $quarterMem = [Math]::Truncate($halfMem / 2)
+
+    if ($halfMem -gt 1024)
+    {
+        $halfMem = 1024
+    }
+    if ($quarterMem -gt 8192)
+    {
+        $quarterMem = 8192
+    }
+
+    $maxHeapMB = ""
+    if ($halfMem -gt $quarterMem)
+    {
+        $maxHeapMB = $halfMem
+    }
+    else
+    {
+        $maxHeapMB = $quarterMem
+    }
+    $env:MAX_HEAP_SIZE = [System.Convert]::ToString($maxHeapMB) + "M"
+
+    # Young gen: min(max_sensible_per_modern_cpu_core * num_cores, 1/4
+    $maxYGPerCore = 100
+    $maxYGTotal = $maxYGPerCore * $systemCores
+    $desiredYG = [Math]::Truncate($maxHeapMB / 4)
+
+    if ($desiredYG -gt $maxYGTotal)
+    {
+        $env:HEAP_NEWSIZE = [System.Convert]::ToString($maxYGTotal) + "M"
+    }
+    else
+    {
+        $env:HEAP_NEWSIZE = [System.Convert]::ToString($desiredYG) + "M"
+    }
+}
+
+#-----------------------------------------------------------------------------
+Function ParseJVMInfo
+{
+    # grab info about the JVM
+    $pinfo = New-Object System.Diagnostics.ProcessStartInfo
+    $pinfo.FileName = "$env:JAVA_BIN"
+    $pinfo.RedirectStandardError = $true
+    $pinfo.RedirectStandardOutput = $true
+    $pinfo.UseShellExecute = $false
+    $pinfo.Arguments = "-d64 -version"
+    $p = New-Object System.Diagnostics.Process
+    $p.StartInfo = $pinfo
+    $p.Start() | Out-Null
+    $p.WaitForExit()
+    $stderr = $p.StandardError.ReadToEnd()
+
+    $env:JVM_ARCH = "64-bit"
+
+    if ($stderr.Contains("Error"))
+    {
+        # 32-bit JVM. re-run w/out -d64
+        echo "Failed 64-bit check. Re-running to get version from 32-bit"
+        $pinfo.Arguments = "-version"
+        $p = New-Object System.Diagnostics.Process
+        $p.StartInfo = $pinfo
+        $p.Start() | Out-Null
+        $p.WaitForExit()
+        $stderr = $p.StandardError.ReadToEnd()
+        $env:JVM_ARCH = "32-bit"
+    }
+
+    $sa = $stderr.Split("""")
+    $env:JVM_VERSION = $sa[1]
+
+    if ($stderr.Contains("OpenJDK"))
+    {
+        $env:JVM_VENDOR = "OpenJDK"
+    }
+    elseif ($stderr.Contains("Java(TM)"))
+    {
+        $env:JVM_VENDOR = "Oracle"
+    }
+    else
+    {
+        $JVM_VENDOR = "other"
+    }
+
+    $pa = $sa[1].Split("_")
+    $subVersion = $pa[1]
+    # Deal with -b (build) versions
+    if ($subVersion -contains '-')
+    {
+        $patchAndBuild = $subVersion.Split("-")
+        $subVersion = $patchAndBuild[0]
+    }
+    $env:JVM_PATCH_VERSION = $subVersion
+}
+
+#-----------------------------------------------------------------------------
+Function SetCassandraEnvironment
+{
+    if (Test-Path Env:\JAVA_HOME)
+    {
+        $env:JAVA_BIN = "$env:JAVA_HOME\bin\java.exe"
+    }
+    elseif (Get-Command "java.exe")
+    {
+        $env:JAVA_BIN = "java.exe"
+    }
+    else
+    {
+        echo "ERROR!  No JAVA_HOME set and could not find java.exe in the path."
+        exit
+    }
+    SetCassandraHome
+    $env:CASSANDRA_CONF = "$env:CASSANDRA_HOME\conf"
+    $env:CASSANDRA_PARAMS="-Dcassandra -Dlogback.configurationFile=logback.xml"
+
+    $logdir = "$env:CASSANDRA_HOME\logs"
+    $storagedir = "$env:CASSANDRA_HOME\data"
+    $env:CASSANDRA_PARAMS = $env:CASSANDRA_PARAMS + " -Dcassandra.logdir=""$logdir"" -Dcassandra.storagedir=""$storagedir"""
+
+    SetCassandraMain
+    BuildClassPath
+
+    # Override these to set the amount of memory to allocate to the JVM at
+    # start-up. For production use you may wish to adjust this for your
+    # environment. MAX_HEAP_SIZE is the total amount of memory dedicated
+    # to the Java heap. HEAP_NEWSIZE refers to the size of the young
+    # generation. Both MAX_HEAP_SIZE and HEAP_NEWSIZE should be either set
+    # or not (if you set one, set the other).
+    #
+    # The main trade-off for the young generation is that the larger it
+    # is, the longer GC pause times will be. The shorter it is, the more
+    # expensive GC will be (usually).
+    #
+    # The example HEAP_NEWSIZE assumes a modern 8-core+ machine for decent
+    # times. If in doubt, and if you do not particularly want to tweak, go
+    # 100 MB per physical CPU core.
+
+    #GC log path has to be defined here since it needs to find CASSANDRA_HOME
+    $env:JVM_OPTS="$env:JVM_OPTS -Xloggc:""$env:CASSANDRA_HOME/logs/gc.log"""
+
+    # Read user-defined JVM options from jvm.options file
+    $content = Get-Content "$env:CASSANDRA_CONF\jvm.options"
+    for ($i = 0; $i -lt $content.Count; $i++)
+    {
+        $line = $content[$i]
+        if ($line.StartsWith("-"))
+        {
+            $env:JVM_OPTS = "$env:JVM_OPTS $line"
+        }
+    }
+
+    $defined_xmn = $env:JVM_OPTS -like '*Xmn*'
+    $defined_xmx = $env:JVM_OPTS -like '*Xmx*'
+    $defined_xms = $env:JVM_OPTS -like '*Xms*'
+    $using_cms = $env:JVM_OPTS -like '*UseConcMarkSweepGC*'
+
+    #$env:MAX_HEAP_SIZE="4096M"
+    #$env:HEAP_NEWSIZE="800M"
+    CalculateHeapSizes
+
+    ParseJVMInfo
+
+    # We only set -Xms and -Xmx if they were not defined on jvm.options file
+    # If defined, both Xmx and Xms should be defined together.
+    if (($defined_xmx -eq $false) -and ($defined_xms -eq $false))
+    {
+        $env:JVM_OPTS="$env:JVM_OPTS -Xms$env:MAX_HEAP_SIZE"
+        $env:JVM_OPTS="$env:JVM_OPTS -Xmx$env:MAX_HEAP_SIZE"
+    }
+    elseif (($defined_xmx -eq $false) -or ($defined_xms -eq $false))
+    {
+        echo "Please set or unset -Xmx and -Xms flags in pairs on jvm.options file."
+        exit
+    }
+
+    # We only set -Xmn flag if it was not defined in jvm.options file
+    # and if the CMS GC is being used
+    # If defined, both Xmn and Xmx should be defined together.
+    if (($defined_xmn -eq $true) -and ($defined_xmx -eq $false))
+    {
+        echo "Please set or unset -Xmx and -Xmn flags in pairs on jvm.options file."
+        exit
+    }
+    elseif (($defined_xmn -eq $false) -and ($using_cms -eq $true))
+    {
+        $env:JVM_OPTS="$env:JVM_OPTS -Xmn$env:HEAP_NEWSIZE"
+    }
+
+    if (($env:JVM_ARCH -eq "64-Bit") -and ($using_cms -eq $true))
+    {
+        $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseCondCardMark"
+    }
+
+    # Add sigar env - see Cassandra-7838
+    $env:JVM_OPTS = "$env:JVM_OPTS -Djava.library.path=""$env:CASSANDRA_HOME\lib\sigar-bin"""
+
+    # Confirm we're on high performance power plan, warn if not
+    # Change to $true to suppress this warning
+    $suppressPowerWarning = $false
+    if (!$suppressPowerWarning)
+    {
+        $currentProfile = powercfg /GETACTIVESCHEME
+        if (!$currentProfile.Contains("High performance"))
+        {
+            echo "*---------------------------------------------------------------------*"
+            echo "*---------------------------------------------------------------------*"
+            echo ""
+            echo "    WARNING! Detected a power profile other than High Performance."
+            echo "    Performance of this node will suffer."
+            echo "    Modify conf\cassandra.env.ps1 to suppress this warning."
+            echo ""
+            echo "*---------------------------------------------------------------------*"
+            echo "*---------------------------------------------------------------------*"
+        }
+    }
+
+    # provides hints to the JIT compiler
+    $env:JVM_OPTS = "$env:JVM_OPTS -XX:CompileCommandFile=""$env:CASSANDRA_CONF\hotspot_compiler"""
+
+    # add the jamm javaagent
+    if (($env:JVM_VENDOR -ne "OpenJDK") -or ($env:JVM_VERSION.CompareTo("1.6.0") -eq 1) -or
+        (($env:JVM_VERSION -eq "1.6.0") -and ($env:JVM_PATCH_VERSION.CompareTo("22") -eq 1)))
+    {
+        $env:JVM_OPTS = "$env:JVM_OPTS -javaagent:""$env:CASSANDRA_HOME\lib\jamm-0.3.0.jar"""
+    }
+
+    # set jvm HeapDumpPath with CASSANDRA_HEAPDUMP_DIR
+    if ($env:CASSANDRA_HEAPDUMP_DIR)
+    {
+        $unixTimestamp = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds
+        $env:JVM_OPTS="$env:JVM_OPTS -XX:HeapDumpPath=""$env:CASSANDRA_HEAPDUMP_DIR\cassandra-$unixTimestamp-pid$pid.hprof"""
+    }
+
+    if ($env:JVM_VERSION.CompareTo("1.8.0") -eq -1 -or [convert]::ToInt32($env:JVM_PATCH_VERSION) -lt 40)
+    {
+        echo "Cassandra 3.0 and later require Java 8u40 or later."
+        exit
+    }
+
+    # enable assertions.  disabling this in production will give a modest
+    # performance benefit (around 5%).
+    $env:JVM_OPTS = "$env:JVM_OPTS -ea"
+
+    # Specifies the default port over which Cassandra will be available for
+    # JMX connections.
+    $JMX_PORT="7199"
+
+    # store in env to check if it's avail in verification
+    $env:JMX_PORT=$JMX_PORT
+
+    # enable thread priorities, primarily so we can give periodic tasks
+    # a lower priority to avoid interfering with client workload
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseThreadPriorities"
+    # allows lowering thread priority without being root on linux - probably
+    # not necessary on Windows but doesn't harm anything.
+    # see http://tech.stolsvik.com/2010/01/linux-java-thread-priorities-workar
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:ThreadPriorityPolicy=42"
+
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+HeapDumpOnOutOfMemoryError"
+
+    # stop the jvm on OutOfMemoryError as it can result in some data corruption
+    # uncomment the preferred option
+    # ExitOnOutOfMemoryError and CrashOnOutOfMemoryError require a JRE greater or equals to 1.7 update 101 or 1.8 update 92
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+ExitOnOutOfMemoryError"
+    # $env:JVM_OPTS="$env:JVM_OPTS -XX:+CrashOnOutOfMemoryError"
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:OnOutOfMemoryError=""taskkill /F /PID %p"""
+
+    # print an heap histogram on OutOfMemoryError
+    # $env:JVM_OPTS="$env:JVM_OPTS -Dcassandra.printHeapHistogramOnOutOfMemoryError=true"
+
+    # Per-thread stack size.
+    $env:JVM_OPTS="$env:JVM_OPTS -Xss256k"
+
+    # Larger interned string table, for gossip's benefit (CASSANDRA-6410)
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:StringTableSize=1000003"
+
+    # Make sure all memory is faulted and zeroed on startup.
+    # This helps prevent soft faults in containers and makes
+    # transparent hugepage allocation more effective.
+    #$env:JVM_OPTS="$env:JVM_OPTS -XX:+AlwaysPreTouch"
+
+    # Biased locking does not benefit Cassandra.
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:-UseBiasedLocking"
+
+    # Enable thread-local allocation blocks and allow the JVM to automatically
+    # resize them at runtime.
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+UseTLAB -XX:+ResizeTLAB"
+
+    # http://www.evanjones.ca/jvm-mmap-pause.html
+    $env:JVM_OPTS="$env:JVM_OPTS -XX:+PerfDisableSharedMem"
+
+    # Configure the following for JEMallocAllocator and if jemalloc is not available in the system
+    # library path.
+    # set LD_LIBRARY_PATH=<JEMALLOC_HOME>/lib/
+    # $env:JVM_OPTS="$env:JVM_OPTS -Djava.library.path=<JEMALLOC_HOME>/lib/"
+
+    # uncomment to have Cassandra JVM listen for remote debuggers/profilers on port 1414
+    # $env:JVM_OPTS="$env:JVM_OPTS -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=1414"
+
+    # Prefer binding to IPv4 network intefaces (when net.ipv6.bindv6only=1). See
+    # http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6342561 (short version:
+    # comment out this entry to enable IPv6 support).
+    $env:JVM_OPTS="$env:JVM_OPTS -Djava.net.preferIPv4Stack=true"
+
+    # jmx: metrics and administration interface
+    #
+    # add this if you're having trouble connecting:
+    # $env:JVM_OPTS="$env:JVM_OPTS -Djava.rmi.server.hostname=<public name>"
+    #
+    # see
+    # https://blogs.oracle.com/jmxetc/entry/troubleshooting_connection_problems_in_jconsole
+    # for more on configuring JMX through firewalls, etc. (Short version:
+    # get it working with no firewall first.)
+    #
+    # Due to potential security exploits, Cassandra ships with JMX accessible
+    # *only* from localhost.  To enable remote JMX connections, uncomment lines below
+    # with authentication and ssl enabled. See https://wiki.apache.org/cassandra/JmxSecurity
+    #
+    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.port=$JMX_PORT"
+    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.ssl=false"
+    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.authenticate=true"
+    #$env:JVM_OPTS="$env:JVM_OPTS -Dcom.sun.management.jmxremote.password.file=C:/jmxremote.password"
+    $env:JVM_OPTS="$env:JVM_OPTS -Dcassandra.jmx.local.port=$JMX_PORT -XX:+DisableExplicitGC"
+
+    $env:JVM_OPTS="$env:JVM_OPTS $env:JVM_EXTRA_OPTS"
+
+    #$env:JVM_OPTS="$env:JVM_OPTS -XX:+UnlockCommercialFeatures -XX:+FlightRecorder"
+}

diff --git a/conf/cassandra-env.sh b/conf/cassandra-env.sh
index b394d5f..aa92eac 100644
--- a/conf/cassandra-env.sh
+++ b/conf/cassandra-env.sh

@@ -87,24 +87,21 @@
 }
 
 # Determine the sort of JVM we'll be running on.
-
 java_ver_output=`"${JAVA:-java}" -version 2>&1`
-
 jvmver=`echo "$java_ver_output" | grep '[openjdk|java] version' | awk -F'"' 'NR==1 {print $2}'`
 JVM_VERSION=${jvmver%_*}
 JVM_PATCH_VERSION=${jvmver#*_}
 
-if [ "$JVM_VERSION" \< "1.7" ] ; then
-    echo "Cassandra 2.0 and later require Java 7u25 or later."
+if [ "$JVM_VERSION" \< "1.8" ] ; then
+    echo "Cassandra 3.0 and later require Java 8u40 or later."
     exit 1;
 fi
 
-if [ "$JVM_VERSION" \< "1.8" ] && [ "$JVM_PATCH_VERSION" -lt 25 ] ; then
-    echo "Cassandra 2.0 and later require Java 7u25 or later."
+if [ "$JVM_VERSION" \< "1.8" ] && [ "$JVM_PATCH_VERSION" -lt 40 ] ; then
+    echo "Cassandra 3.0 and later require Java 8u40 or later."
     exit 1;
 fi
 
-
 jvm=`echo "$java_ver_output" | grep -A 1 'java version' | awk 'NR==2 {print $1}'`
 case "$jvm" in
     OpenJDK)
@@ -124,11 +121,40 @@
         ;;
 esac
 
+# Sets the path where logback and GC logs are written.
+if [ "x$CASSANDRA_LOG_DIR" = "x" ] ; then
+    CASSANDRA_LOG_DIR="$CASSANDRA_HOME/logs"
+fi
+
+#GC log path has to be defined here because it needs to access CASSANDRA_HOME
+JVM_OPTS="$JVM_OPTS -Xloggc:${CASSANDRA_LOG_DIR}/gc.log"
+
+# Here we create the arguments that will get passed to the jvm when
+# starting cassandra.
+
+# Read user-defined JVM options from jvm.options file
+JVM_OPTS_FILE=$CASSANDRA_CONF/jvm.options
+for opt in `grep "^-" $JVM_OPTS_FILE`
+do
+  JVM_OPTS="$JVM_OPTS $opt"
+done
+
+# Check what parameters were defined on jvm.options file to avoid conflicts
+echo $JVM_OPTS | grep -q Xmn
+DEFINED_XMN=$?
+echo $JVM_OPTS | grep -q Xmx
+DEFINED_XMX=$?
+echo $JVM_OPTS | grep -q Xms
+DEFINED_XMS=$?
+echo $JVM_OPTS | grep -q UseConcMarkSweepGC
+USING_CMS=$?
+echo $JVM_OPTS | grep -q UseG1GC
+USING_G1=$?
 
 # Override these to set the amount of memory to allocate to the JVM at
 # start-up. For production use you may wish to adjust this for your
 # environment. MAX_HEAP_SIZE is the total amount of memory dedicated
-# to the Java heap; HEAP_NEWSIZE refers to the size of the young
+# to the Java heap. HEAP_NEWSIZE refers to the size of the young
 # generation. Both MAX_HEAP_SIZE and HEAP_NEWSIZE should be either set
 # or not (if you set one, set the other).
 #
@@ -146,40 +172,73 @@
 # Set this to control the amount of arenas per-thread in glibc
 #export MALLOC_ARENA_MAX=4
 
-if [ "x$MAX_HEAP_SIZE" = "x" ] && [ "x$HEAP_NEWSIZE" = "x" ]; then
+# only calculate the size if it's not set manually
+if [ "x$MAX_HEAP_SIZE" = "x" ] && [ "x$HEAP_NEWSIZE" = "x" -o $USING_G1 -eq 0 ]; then
     calculate_heap_sizes
-else
-    if [ "x$MAX_HEAP_SIZE" = "x" ] ||  [ "x$HEAP_NEWSIZE" = "x" ]; then
-        echo "please set or unset MAX_HEAP_SIZE and HEAP_NEWSIZE in pairs (see cassandra-env.sh)"
-        exit 1
-    fi
+elif [ "x$MAX_HEAP_SIZE" = "x" ] ||  [ "x$HEAP_NEWSIZE" = "x" -a $USING_G1 -ne 0 ]; then
+    echo "please set or unset MAX_HEAP_SIZE and HEAP_NEWSIZE in pairs when using CMS GC (see cassandra-env.sh)"
+    exit 1
 fi
 
-if [ "x$MALLOC_ARENA_MAX" = "x" ]
-then
+if [ "x$MALLOC_ARENA_MAX" = "x" ] ; then
     export MALLOC_ARENA_MAX=4
 fi
 
+# We only set -Xms and -Xmx if they were not defined on jvm.options file
+# If defined, both Xmx and Xms should be defined together.
+if [ $DEFINED_XMX -ne 0 ] && [ $DEFINED_XMS -ne 0 ]; then
+     JVM_OPTS="$JVM_OPTS -Xms${MAX_HEAP_SIZE}"
+     JVM_OPTS="$JVM_OPTS -Xmx${MAX_HEAP_SIZE}"
+elif [ $DEFINED_XMX -ne 0 ] || [ $DEFINED_XMS -ne 0 ]; then
+     echo "Please set or unset -Xmx and -Xms flags in pairs on jvm.options file."
+     exit 1
+fi
 
-# Specifies the default port over which Cassandra will be available for
-# JMX connections.
-# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
-JMX_PORT="7199"
+# We only set -Xmn flag if it was not defined in jvm.options file
+# and if the CMS GC is being used
+# If defined, both Xmn and Xmx should be defined together.
+if [ $DEFINED_XMN -eq 0 ] && [ $DEFINED_XMX -ne 0 ]; then
+    echo "Please set or unset -Xmx and -Xmn flags in pairs on jvm.options file."
+    exit 1
+elif [ $DEFINED_XMN -ne 0 ] && [ $USING_CMS -eq 0 ]; then
+    JVM_OPTS="$JVM_OPTS -Xmn${HEAP_NEWSIZE}"
+fi
 
-
-# Here we create the arguments that will get passed to the jvm when
-# starting cassandra.
+if [ "$JVM_ARCH" = "64-Bit" ] && [ $USING_CMS -eq 0 ]; then
+    JVM_OPTS="$JVM_OPTS -XX:+UseCondCardMark"
+fi
 
 # enable assertions.  disabling this in production will give a modest
 # performance benefit (around 5%).
 JVM_OPTS="$JVM_OPTS -ea"
 
+# Per-thread stack size.
+JVM_OPTS="$JVM_OPTS -Xss256k"
+
+# Make sure all memory is faulted and zeroed on startup.
+# This helps prevent soft faults in containers and makes
+# transparent hugepage allocation more effective.
+JVM_OPTS="$JVM_OPTS -XX:+AlwaysPreTouch"
+
+# Biased locking does not benefit Cassandra.
+JVM_OPTS="$JVM_OPTS -XX:-UseBiasedLocking"
+
+# Larger interned string table, for gossip's benefit (CASSANDRA-6410)
+JVM_OPTS="$JVM_OPTS -XX:StringTableSize=1000003"
+
+# Enable thread-local allocation blocks and allow the JVM to automatically
+# resize them at runtime.
+JVM_OPTS="$JVM_OPTS -XX:+UseTLAB -XX:+ResizeTLAB"
+
+# http://www.evanjones.ca/jvm-mmap-pause.html
+JVM_OPTS="$JVM_OPTS -XX:+PerfDisableSharedMem"
+
+# provides hints to the JIT compiler
+JVM_OPTS="$JVM_OPTS -XX:CompileCommandFile=$CASSANDRA_CONF/hotspot_compiler"
+
 # add the jamm javaagent
 JVM_OPTS="$JVM_OPTS -javaagent:$CASSANDRA_HOME/lib/jamm-0.3.0.jar"
 
-# some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
-JVM_OPTS="$JVM_OPTS -XX:+CMSClassUnloadingEnabled"
-
 # enable thread priorities, primarily so we can give periodic tasks
 # a lower priority to avoid interfering with client workload
 JVM_OPTS="$JVM_OPTS -XX:+UseThreadPriorities"
@@ -187,28 +246,17 @@
 # http://tech.stolsvik.com/2010/01/linux-java-thread-priorities-workaround.html
 JVM_OPTS="$JVM_OPTS -XX:ThreadPriorityPolicy=42"
 
-# min and max heap sizes should be set to the same value to avoid
-# stop-the-world GC pauses during resize, and so that we can lock the
-# heap in memory on startup to prevent any of it from being swapped
-# out.
-JVM_OPTS="$JVM_OPTS -Xms${MAX_HEAP_SIZE}"
-JVM_OPTS="$JVM_OPTS -Xmx${MAX_HEAP_SIZE}"
-JVM_OPTS="$JVM_OPTS -Xmn${HEAP_NEWSIZE}"
-JVM_OPTS="$JVM_OPTS -XX:+HeapDumpOnOutOfMemoryError"
-
 # set jvm HeapDumpPath with CASSANDRA_HEAPDUMP_DIR
+JVM_OPTS="$JVM_OPTS -XX:+HeapDumpOnOutOfMemoryError"
 if [ "x$CASSANDRA_HEAPDUMP_DIR" != "x" ]; then
     JVM_OPTS="$JVM_OPTS -XX:HeapDumpPath=$CASSANDRA_HEAPDUMP_DIR/cassandra-`date +%s`-pid$$.hprof"
 fi
 
-
-startswith() { [ "${1#$2}" != "$1" ]; }
-
 # stop the jvm on OutOfMemoryError as it can result in some data corruption
 # uncomment the preferred option
+# ExitOnOutOfMemoryError and CrashOnOutOfMemoryError require a JRE greater or equals to 1.7 update 101 or 1.8 update 92
 # For OnOutOfMemoryError we cannot use the JVM_OPTS variables because bash commands split words
 # on white spaces without taking quotes into account
-# ExitOnOutOfMemoryError and CrashOnOutOfMemoryError require a JRE greater or equals to 1.7 update 101 or 1.8 update 92
 # JVM_OPTS="$JVM_OPTS -XX:+ExitOnOutOfMemoryError"
 # JVM_OPTS="$JVM_OPTS -XX:+CrashOnOutOfMemoryError"
 JVM_ON_OUT_OF_MEMORY_ERROR_OPT="-XX:OnOutOfMemoryError=kill -9 %p"
@@ -216,50 +264,6 @@
 # print an heap histogram on OutOfMemoryError
 # JVM_OPTS="$JVM_OPTS -Dcassandra.printHeapHistogramOnOutOfMemoryError=true"
 
-# Per-thread stack size.
-JVM_OPTS="$JVM_OPTS -Xss256k"
-
-# Larger interned string table, for gossip's benefit (CASSANDRA-6410)
-JVM_OPTS="$JVM_OPTS -XX:StringTableSize=1000003"
-
-# GC tuning options
-JVM_OPTS="$JVM_OPTS -XX:+UseParNewGC" 
-JVM_OPTS="$JVM_OPTS -XX:+UseConcMarkSweepGC" 
-JVM_OPTS="$JVM_OPTS -XX:+CMSParallelRemarkEnabled" 
-JVM_OPTS="$JVM_OPTS -XX:SurvivorRatio=8" 
-JVM_OPTS="$JVM_OPTS -XX:MaxTenuringThreshold=1"
-JVM_OPTS="$JVM_OPTS -XX:CMSInitiatingOccupancyFraction=75"
-JVM_OPTS="$JVM_OPTS -XX:+UseCMSInitiatingOccupancyOnly"
-JVM_OPTS="$JVM_OPTS -XX:+UseTLAB"
-JVM_OPTS="$JVM_OPTS -XX:+PerfDisableSharedMem"
-JVM_OPTS="$JVM_OPTS -XX:CompileCommandFile=$CASSANDRA_CONF/hotspot_compiler"
-JVM_OPTS="$JVM_OPTS -XX:CMSWaitDuration=10000"
-
-# note: bash evals '1.7.x' as > '1.7' so this is really a >= 1.7 jvm check
-if { [ "$JVM_VERSION" \> "1.7" ] && [ "$JVM_VERSION" \< "1.8.0" ] && [ "$JVM_PATCH_VERSION" -ge "60" ]; } || [ "$JVM_VERSION" \> "1.8" ] ; then
-    JVM_OPTS="$JVM_OPTS -XX:+CMSParallelInitialMarkEnabled -XX:+CMSEdenChunksRecordAlways -XX:CMSWaitDuration=10000"
-fi
-
-if [ "$JVM_ARCH" = "64-Bit" ] ; then
-    JVM_OPTS="$JVM_OPTS -XX:+UseCondCardMark"
-fi
-
-# GC logging options
-JVM_OPTS="$JVM_OPTS -XX:+PrintGCDetails"
-JVM_OPTS="$JVM_OPTS -XX:+PrintGCDateStamps"
-JVM_OPTS="$JVM_OPTS -XX:+PrintHeapAtGC"
-JVM_OPTS="$JVM_OPTS -XX:+PrintTenuringDistribution"
-JVM_OPTS="$JVM_OPTS -XX:+PrintGCApplicationStoppedTime"
-JVM_OPTS="$JVM_OPTS -XX:+PrintPromotionFailure"
-#JVM_OPTS="$JVM_OPTS -XX:PrintFLSStatistics=1"
-
-JVM_OPTS="$JVM_OPTS -Xloggc:${CASSANDRA_HOME}/logs/gc.log"
-JVM_OPTS="$JVM_OPTS -XX:+UseGCLogFileRotation"
-JVM_OPTS="$JVM_OPTS -XX:NumberOfGCLogFiles=10"
-JVM_OPTS="$JVM_OPTS -XX:GCLogFileSize=10M"
-# if using version before JDK 6u34 or 7u2 use this instead of log rotation
-# JVM_OPTS="$JVM_OPTS -Xloggc:/var/log/cassandra/gc-`date +%s`.log"
-
 # uncomment to have Cassandra JVM listen for remote debuggers/profilers on port 1414
 # JVM_OPTS="$JVM_OPTS -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=1414"
 
@@ -290,6 +294,11 @@
     LOCAL_JMX=yes
 fi
 
+# Specifies the default port over which Cassandra will be available for
+# JMX connections.
+# For security reasons, you should not expose this port to the internet.  Firewall it if needed.
+JMX_PORT="7199"
+
 if [ "$LOCAL_JMX" = "yes" ]; then
   JVM_OPTS="$JVM_OPTS -Dcassandra.jmx.local.port=$JMX_PORT -XX:+DisableExplicitGC"
 else

diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index c1c268c..c321a72 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml

@@ -24,6 +24,17 @@
 # multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
 num_tokens: 256
 
+# Triggers automatic allocation of num_tokens tokens for this node. The allocation
+# algorithm attempts to choose tokens in a way that optimizes replicated load over
+# the nodes in the datacenter for the replication strategy used by the specified
+# keyspace.
+#
+# The load assigned to each node will be close to proportional to its number of
+# vnodes.
+#
+# Only supported with the Murmur3Partitioner.
+# allocate_tokens_for_keyspace: KEYSPACE
+
 # initial_token allows you to specify tokens manually.  While you can use # it with
 # vnodes (num_tokens > 1, above) -- in which case you should provide a 
 # comma-separated list -- it's primarily used when adding nodes # to legacy clusters 
@@ -31,25 +42,49 @@
 # initial_token:
 
 # See http://wiki.apache.org/cassandra/HintedHandoff
-# May either be "true" or "false" to enable globally, or contain a list
-# of data centers to enable per-datacenter.
-# hinted_handoff_enabled: DC1,DC2
+# May either be "true" or "false" to enable globally
 hinted_handoff_enabled: true
+# When hinted_handoff_enabled is true, a black list of data centers that will not
+# perform hinted handoff
+#hinted_handoff_disabled_datacenters:
+#    - DC1
+#    - DC2
 # this defines the maximum amount of time a dead host will have hints
 # generated.  After it has been dead this long, new hints for it will not be
 # created until it has been seen alive and gone down again.
 max_hint_window_in_ms: 10800000 # 3 hours
+
 # Maximum throttle in KBs per second, per delivery thread.  This will be
 # reduced proportionally to the number of nodes in the cluster.  (If there
 # are two nodes in the cluster, each delivery thread will use the maximum
 # rate; if there are three, each will throttle to half of the maximum,
 # since we expect two nodes to be delivering hints simultaneously.)
 hinted_handoff_throttle_in_kb: 1024
+
 # Number of threads with which to deliver hints;
 # Consider increasing this number when you have multi-dc deployments, since
 # cross-dc handoff tends to be slower
 max_hints_delivery_threads: 2
 
+# Directory where Cassandra should store hints.
+# If not set, the default directory is $CASSANDRA_HOME/data/hints.
+# hints_directory: /var/lib/cassandra/hints
+
+# How often hints should be flushed from the internal buffers to disk.
+# Will *not* trigger fsync.
+hints_flush_period_in_ms: 10000
+
+# Maximum size for a single hints file, in megabytes.
+max_hints_file_size_in_mb: 128
+
+# Compression to apply to the hint files. If omitted, hints files
+# will be written uncompressed. LZ4, Snappy, and Deflate compressors
+# are supported.
+#hints_compression:
+#   - class_name: LZ4Compressor
+#     parameters:
+#         -
+
 # Maximum throttle in KBs per second, total. This will be
 # reduced proportionally to the number of nodes in the cluster.
 batchlog_replay_throttle_in_kb: 1024
@@ -60,7 +95,7 @@
 #
 # - AllowAllAuthenticator performs no checks - set it to disable authentication.
 # - PasswordAuthenticator relies on username/password pairs to authenticate
-#   users. It keeps usernames and hashed passwords in system_auth.credentials table.
+#   users. It keeps usernames and hashed passwords in system_auth.roles table.
 #   Please increase system_auth keyspace replication factor if you use this authenticator.
 #   If using PasswordAuthenticator, CassandraRoleManager must also be used (see below)
 authenticator: AllowAllAuthenticator
@@ -70,7 +105,7 @@
 # CassandraAuthorizer}.
 #
 # - AllowAllAuthorizer allows any action to any user - set it to disable authorization.
-# - CassandraAuthorizer stores permissions in system_auth.permissions table. Please
+# - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please
 #   increase system_auth keyspace replication factor if you use this authorizer.
 authorizer: AllowAllAuthorizer
 
@@ -265,19 +300,26 @@
 #
 # the other option is "periodic" where writes may be acked immediately
 # and the CommitLog is simply synced every commitlog_sync_period_in_ms
-# milliseconds. 
+# milliseconds.
 commitlog_sync: periodic
 commitlog_sync_period_in_ms: 10000
 
 # The size of the individual commitlog file segments.  A commitlog
 # segment may be archived, deleted, or recycled once all the data
 # in it (potentially from each columnfamily in the system) has been
-# flushed to sstables.  
+# flushed to sstables.
 #
 # The default size is 32, which is almost always fine, but if you are
 # archiving commitlog segments (see commitlog_archiving.properties),
 # then you probably want a finer granularity of archiving; 8 or 16 MB
 # is reasonable.
+# Max mutation size is also configurable via max_mutation_size_in_kb setting in
+# cassandra.yaml. The default is half the size commitlog_segment_size_in_mb * 1024.
+# This should be positive and less than 2048.
+#
+# NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must
+# be set to at least twice the size of max_mutation_size_in_kb / 1024
+#
 commitlog_segment_size_in_mb: 32
 
 # Compression to apply to the commit log. If omitted, the commit log
@@ -316,11 +358,28 @@
 concurrent_writes: 32
 concurrent_counter_writes: 32
 
-# Total memory to use for sstable-reading buffers.  Defaults to
-# the smaller of 1/4 of heap or 512MB.
+# For materialized view writes, as there is a read involved, so this should
+# be limited by the less of concurrent reads or concurrent writes.
+concurrent_materialized_view_writes: 32
+
+# Maximum memory to use for pooling sstable buffers. Defaults to the smaller
+# of 1/4 of heap or 512MB. This pool is allocated off-heap, so is in addition
+# to the memory allocated for heap. Memory is only allocated as needed.
 # file_cache_size_in_mb: 512
 
-# Total permitted memory to use for memtables. Cassandra will stop 
+# Flag indicating whether to allocate on or off heap when the sstable buffer
+# pool is exhausted, that is when it has exceeded the maximum memory
+# file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request.
+
+# buffer_pool_use_heap_if_exhausted: true
+
+# The strategy for optimizing disk read
+# Possible values are:
+# ssd (for solid state disks, the default)
+# spinning (for spinning disks)
+# disk_optimization_strategy: ssd
+
+# Total permitted memory to use for memtables. Cassandra will stop
 # accepting writes when the limit is exceeded until a flush completes,
 # and will trigger a flush based on memtable_cleanup_threshold
 # If omitted, Cassandra will set both to 1/4 the size of the heap.
@@ -339,10 +398,26 @@
 # Specify the way Cassandra allocates and manages memtable memory.
 # Options are:
 #   heap_buffers:    on heap nio buffers
-#   offheap_buffers: off heap (direct) nio buffers
-#   offheap_objects: native memory, eliminating nio buffer heap overhead
+#
+# Note: offheap_buffers are not supported in Cassandra 3.0 - 3.3.
+# They have been re-introduced in Cassandra 3.4. For details see
+# https://issues.apache.org/jira/browse/CASSANDRA-9472 and
+# https://issues.apache.org/jira/browse/CASSANDRA-11039
 memtable_allocation_type: heap_buffers
 
+# Limits the maximum Merkle tree depth to avoid consuming too much
+# memory during repairs.
+#
+# The default setting of 18 generates trees of maximum size around
+# 50 MiB / tree. If you are running out of memory during repairs consider
+# lowering this to 15 (~6 MiB / tree) or lower, but try not to lower it
+# too much past that or you will lose too much resolution and stream
+# too much redundant data during repair. Cannot be set lower than 10.
+#
+# For more details see https://issues.apache.org/jira/browse/CASSANDRA-14096.
+#
+# repair_session_max_tree_depth: 18
+
 # Total space to use for commit logs on disk.
 #
 # If space gets above this value, Cassandra will flush every dirty CF
@@ -440,6 +515,14 @@
 # port for the CQL native transport to listen for clients on
 # For security reasons, you should not expose this port to the internet.  Firewall it if needed.
 native_transport_port: 9042
+# Enabling native transport encryption in client_encryption_options allows you to either use
+# encryption for the standard port or to use a dedicated, additional port along with the unencrypted
+# standard native_transport_port.
+# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption
+# for native_transport_port. Setting native_transport_port_ssl to a different value
+# from native_transport_port will use encryption for native_transport_port_ssl while
+# keeping native_transport_port unencrypted.
+# native_transport_port_ssl: 9142
 # The maximum threads for handling requests when the native transport is used.
 # This is similar to rpc_max_threads though the default differs slightly (and
 # there is no native_transport_min_threads, idle threads will always be stopped
@@ -447,7 +530,8 @@
 # native_transport_max_threads: 128
 #
 # The maximum size of allowed frame. Frame (requests) larger than this will
-# be rejected as invalid. The default is 256MB.
+# be rejected as invalid. The default is 256MB. If you're changing this parameter,
+# you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048.
 # native_transport_max_frame_size_in_mb: 256
 
 # The maximum number of concurrent client connections.
@@ -869,15 +953,22 @@
 # If unset, all GC Pauses greater than gc_log_threshold_in_ms will log at
 # INFO level
 # Adjust the threshold based on your application throughput requirement
-# gc_warn_threshold_in_ms: 1000
+gc_warn_threshold_in_ms: 1000
 
 # UDFs (user defined functions) are disabled by default.
-# As of Cassandra 2.2, there is no security manager or anything else in place that
-# prevents execution of evil code. CASSANDRA-9402 will fix this issue for Cassandra 3.0.
-# This will inherently be backwards-incompatible with any 2.2 UDF that perform insecure
-# operations such as opening a socket or writing to the filesystem.
+# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code.
 enable_user_defined_functions: false
 
+# Enables scripted UDFs (JavaScript UDFs).
+# Java UDFs are always enabled, if enable_user_defined_functions is true.
+# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider.
+# This option has no effect, if enable_user_defined_functions is false.
+enable_scripted_user_defined_functions: false
+
+# Enables materialized view creation on this node.
+# Materialized views are considered experimental and are not recommended for production use.
+enable_materialized_views: true
+
 # The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation.
 # Lowering this value on Windows can provide much tighter latency and better throughput, however
 # some virtualized environments may see a negative performance impact from changing this setting
@@ -885,6 +976,11 @@
 # setting.
 windows_timer_interval: 1
 
+# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
+# early. Any value size larger than this threshold will result into marking an SSTable
+# as corrupted. This should be positive and less than 2048.
+# max_value_size_in_mb: 256
+
 # Coalescing Strategies #
 # Coalescing multiples messages turns out to significantly boost message processing throughput (think doubling or more).
 # On bare metal, the floor for packet processing throughput is high enough that many applications won't notice, but in
@@ -911,3 +1007,12 @@
 
 # Do not try to coalesce messages if we already got that many messages. This should be more than 2 and less than 128.
 # otc_coalescing_enough_coalesced_messages: 8
+
+# How many milliseconds to wait between two expiration runs on the backlog (queue) of the OutboundTcpConnection.
+# Expiration is done if messages are piling up in the backlog. Droppable messages are expired to free the memory
+# taken by expired messages. The interval should be between 0 and 1000, and in most installations the default value
+# will be appropriate. A smaller value could potentially expire messages slightly sooner at the expense of more CPU
+# time and queue contention while iterating the backlog of messages.
+# An interval of 0 disables any wait time, which is the behavior of former Cassandra versions.
+#
+# otc_backlog_expiration_interval_ms: 200
\ No newline at end of file

diff --git a/conf/cqlshrc.sample b/conf/cqlshrc.sample
index 462dcc6..cb02b04 100644
--- a/conf/cqlshrc.sample
+++ b/conf/cqlshrc.sample

@@ -202,7 +202,7 @@
 ; maxparseerrors = -1
 
 ;; The maximum global number of insert errors to ignore, -1 means unlimited
-; maxinserterrors = -1
+; maxinserterrors = 1000
 
 ;; A file to store all rows that could not be imported, by default this is
 ;; import_<ks>_<table>.err where <ks> is your keyspace and <table> is your table name.

diff --git a/conf/jvm.options b/conf/jvm.options
new file mode 100644
index 0000000..eb2ad19
--- /dev/null
+++ b/conf/jvm.options

@@ -0,0 +1,119 @@
+###########################################################################
+#                             jvm.options                                 #
+#                                                                         #
+# - all flags defined here will be used by cassandra to startup the JVM   #
+# - one flag should be specified per line                                 #
+# - lines that do not start with '-' will be ignored                      #
+# - only static flags are accepted (no variables or parameters)           #
+# - dynamic flags will be appended to these on cassandra-env              #
+###########################################################################
+
+#################
+# HEAP SETTINGS #
+#################
+
+# Heap size is automatically calculated by cassandra-env based on this
+# formula: max(min(1/2 ram, 1024MB), min(1/4 ram, 8GB))
+# That is:
+# - calculate 1/2 ram and cap to 1024MB
+# - calculate 1/4 ram and cap to 8192MB
+# - pick the max
+#
+# For production use you may wish to adjust this for your environment.
+# If that's the case, uncomment the -Xmx and Xms options below to override the
+# automatic calculation of JVM heap memory.
+#
+# It is recommended to set min (-Xms) and max (-Xmx) heap sizes to
+# the same value to avoid stop-the-world GC pauses during resize, and
+# so that we can lock the heap in memory on startup to prevent any
+# of it from being swapped out.
+#-Xms4G
+#-Xmx4G
+
+# Young generation size is automatically calculated by cassandra-env
+# based on this formula: min(100 * num_cores, 1/4 * heap size)
+#
+# The main trade-off for the young generation is that the larger it
+# is, the longer GC pause times will be. The shorter it is, the more
+# expensive GC will be (usually).
+#
+# It is not recommended to set the young generation size if using the
+# G1 GC, since that will override the target pause-time goal.
+# More info: http://www.oracle.com/technetwork/articles/java/g1gc-1984535.html
+#
+# The example below assumes a modern 8-core+ machine for decent
+# times. If in doubt, and if you do not particularly want to tweak, go
+# 100 MB per physical CPU core.
+#-Xmn800M
+
+###################################
+# EXPIRATION DATE OVERFLOW POLICY #
+###################################
+
+# Defines how to handle INSERT requests with TTL exceeding the maximum supported expiration date:
+# * REJECT: this is the default policy and will reject any requests with expiration date timestamp after 2038-01-19T03:14:06+00:00.
+# * CAP: any insert with TTL expiring after 2038-01-19T03:14:06+00:00 will expire on 2038-01-19T03:14:06+00:00 and the client will receive a warning.
+# * CAP_NOWARN: same as previous, except that the client warning will not be emitted.
+#
+#-Dcassandra.expiration_date_overflow_policy=REJECT
+
+#################
+#  GC SETTINGS  #
+#################
+
+### CMS Settings
+
+-XX:+UseParNewGC
+-XX:+UseConcMarkSweepGC
+-XX:+CMSParallelRemarkEnabled
+-XX:SurvivorRatio=8
+-XX:MaxTenuringThreshold=1
+-XX:CMSInitiatingOccupancyFraction=75
+-XX:+UseCMSInitiatingOccupancyOnly
+-XX:CMSWaitDuration=10000
+-XX:+CMSParallelInitialMarkEnabled
+-XX:+CMSEdenChunksRecordAlways
+# some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
+-XX:+CMSClassUnloadingEnabled
+
+### G1 Settings (experimental, comment previous section and uncomment section below to enable)
+
+## Use the Hotspot garbage-first collector.
+#-XX:+UseG1GC
+#
+## Have the JVM do less remembered set work during STW, instead
+## preferring concurrent GC. Reduces p99.9 latency.
+#-XX:G1RSetUpdatingPauseTimePercent=5
+#
+## Main G1GC tunable: lowering the pause target will lower throughput and vise versa.
+## 200ms is the JVM default and lowest viable setting
+## 1000ms increases throughput. Keep it smaller than the timeouts in cassandra.yaml.
+#-XX:MaxGCPauseMillis=500
+
+## Optional G1 Settings
+
+# Save CPU time on large (>= 16GB) heaps by delaying region scanning
+# until the heap is 70% full. The default in Hotspot 8u40 is 40%.
+#-XX:InitiatingHeapOccupancyPercent=70
+
+# For systems with > 8 cores, the default ParallelGCThreads is 5/8 the number of logical cores.
+# Otherwise equal to the number of cores when 8 or less.
+# Machines with > 10 cores should try setting these to <= full cores.
+#-XX:ParallelGCThreads=16
+# By default, ConcGCThreads is 1/4 of ParallelGCThreads.
+# Setting both to the same value can reduce STW durations.
+#-XX:ConcGCThreads=16
+
+### GC logging options -- uncomment to enable
+
+-XX:+PrintGCDetails
+-XX:+PrintGCDateStamps
+-XX:+PrintHeapAtGC
+-XX:+PrintTenuringDistribution
+-XX:+PrintGCApplicationStoppedTime
+-XX:+PrintPromotionFailure
+#-XX:PrintFLSStatistics=1
+#-Xloggc:/var/log/cassandra/gc.log
+-XX:+UseGCLogFileRotation
+-XX:NumberOfGCLogFiles=10
+-XX:GCLogFileSize=10M

diff --git a/conf/logback.xml b/conf/logback.xml
index a47740d..9f1e49a 100644
--- a/conf/logback.xml
+++ b/conf/logback.xml

@@ -24,7 +24,8 @@
 
 <configuration scan="true">
   <jmxConfigurator />
-  <shutdownHook class="ch.qos.logback.core.hook.DelayingShutdownHook"/>
+
+  <!-- No shutdown hook; we run it ourselves in StorageService after shutdown -->
 
   <!-- SYSTEMLOG rolling file appender to system.log (INFO level) -->
 

diff --git a/conf/metrics-reporter-config-sample.yaml b/conf/metrics-reporter-config-sample.yaml
index 34b1d7e..54f2657 100644
--- a/conf/metrics-reporter-config-sample.yaml
+++ b/conf/metrics-reporter-config-sample.yaml

@@ -35,7 +35,7 @@
         - "^org.apache.cassandra.metrics.ClientRequest.+" # includes ClientRequestMetrics
         - "^org.apache.cassandra.metrics.CommitLog.+"
         - "^org.apache.cassandra.metrics.Compaction.+"
-        - "^org.apache.cassandra.metrics.DroppedMetrics.+"
+        - "^org.apache.cassandra.metrics.DroppedMessage.+"
         - "^org.apache.cassandra.metrics.ReadRepair.+"
         - "^org.apache.cassandra.metrics.Storage.+"
         - "^org.apache.cassandra.metrics.ThreadPools.+"

diff --git a/debian/cassandra-tools.install b/debian/cassandra-tools.install
index 6df21f3..8806344 100644
--- a/debian/cassandra-tools.install
+++ b/debian/cassandra-tools.install

@@ -1,7 +1,7 @@
-tools/bin/json2sstable usr/bin
-tools/bin/sstable2json usr/bin
+tools/bin/sstableexpiredblockers usr/bin
 tools/bin/sstablelevelreset usr/bin
 tools/bin/sstablemetadata usr/bin
+tools/bin/sstableofflinerelevel usr/bin
 tools/bin/sstablerepairedset usr/bin
 tools/bin/sstablesplit usr/bin
-tools/bin/token-generator usr/bin
+tools/bin/sstabledump usr/bin

diff --git a/debian/cassandra.in.sh b/debian/cassandra.in.sh
index 9f69ac9..8fcaf9c 100644
--- a/debian/cassandra.in.sh
+++ b/debian/cassandra.in.sh

@@ -4,6 +4,10 @@
 
 CASSANDRA_HOME=/usr/share/cassandra
 
+# the default location for commitlogs, sstables, and saved caches
+# if not set in cassandra.yaml
+cassandra_storagedir=/var/lib/cassandra
+
 # The java classpath (required)
 if [ -n "$CLASSPATH" ]; then
     CLASSPATH=$CLASSPATH:$CASSANDRA_CONF

diff --git a/debian/cassandra.install b/debian/cassandra.install
index e8da5e9..50db32d 100644
--- a/debian/cassandra.install
+++ b/debian/cassandra.install

@@ -5,6 +5,7 @@
 conf/cassandra-topology.properties etc/cassandra
 conf/logback.xml etc/cassandra
 conf/logback-tools.xml etc/cassandra
+conf/jvm.options etc/cassandra
 conf/hotspot_compiler etc/cassandra
 conf/triggers/* etc/cassandra/triggers
 debian/cassandra.in.sh usr/share/cassandra
@@ -12,7 +13,7 @@
 debian/cassandra-sysctl.conf etc/sysctl.d
 bin/cassandra usr/sbin
 bin/nodetool usr/bin
-bin/sstablekeys usr/bin
+bin/sstableutil usr/bin
 bin/sstableloader usr/bin
 bin/cqlsh usr/bin
 bin/cqlsh.py usr/bin

diff --git a/debian/changelog b/debian/changelog
index bd880ce..2b0f10f 100644
--- a/debian/changelog
+++ b/debian/changelog

@@ -1,116 +1,158 @@
-cassandra (2.2.17) unstable; urgency=medium
+cassandra (3.0.21) UNRELEASED; urgency=medium
 
   * New release
 
- -- Mick Semb Wever <mck@apache.org>  Tue, 14 Jul 2020 21:30:34 +0200
+ -- Michael Shuler <mshuler@apache.org>  Fri, 14 Feb 2020 18:43:25 -0600
 
-cassandra (2.2.16) unstable; urgency=medium
+cassandra (3.0.20) unstable; urgency=medium
 
   * New release
 
- -- Mick Semb Wever <mck@apache.org>  Mon, 10 Feb 2020 10:52:49 +0100
+ -- Mick Semb Wever <mck@apache.org>  Mon, 10 Feb 2020 22:53:30 +0100
 
-cassandra (2.2.15) unstable; urgency=medium
+cassandra (3.0.19) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <mshuler@apache.org>  Thu, 24 Oct 2019 08:50:00 -0500
+ -- Michael Shuler <mshuler@apache.org>  Thu, 24 Oct 2019 08:56:34 -0500
 
-cassandra (2.2.14) unstable; urgency=medium
+cassandra (3.0.18) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <mshuler@apache.org>  Fri, 01 Feb 2019 12:34:51 -0600
+ -- Michael Shuler <mshuler@apache.org>  Fri, 01 Feb 2019 12:37:12 -0600
 
-cassandra (2.2.13) unstable; urgency=medium
+cassandra (3.0.17) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <michael@pbandjelly.org>  Mon, 02 Jul 2018 13:26:07 -0500
+ -- Michael Shuler <michael@pbandjelly.org>  Mon, 02 Jul 2018 13:29:49 -0500
 
-cassandra (2.2.12) unstable; urgency=medium
+cassandra (3.0.16) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <michael@pbandjelly.org>  Thu, 05 Oct 2017 10:21:40 -0500
+ -- Michael Shuler <michael@pbandjelly.org>  Tue, 10 Oct 2017 17:13:31 -0500
 
-cassandra (2.2.11) unstable; urgency=medium
+cassandra (3.0.15) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <michael@pbandjelly.org>  Mon, 26 Jun 2017 18:45:38 -0500
+ -- Michael Shuler <michael@pbandjelly.org>  Mon, 26 Jun 2017 18:47:08 -0500
 
-cassandra (2.2.10) unstable; urgency=medium
+cassandra (3.0.14) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <michael@pbandjelly.org>  Wed, 21 Jun 2017 14:07:47 -0500
+ -- Michael Shuler <michael@pbandjelly.org>  Fri, 14 Apr 2017 20:01:02 -0500
 
-cassandra (2.2.9) unstable; urgency=medium
+cassandra (3.0.13) unstable; urgency=medium
 
   * New release
 
- -- Michael Shuler <michael@pbandjelly.org>  Wed, 15 Feb 2017 18:12:32 -0600
+ -- Michael Shuler <michael@pbandjelly.org>  Tue, 11 Apr 2017 12:52:26 -0500
 
-cassandra (2.2.8) unstable; urgency=medium
+cassandra (3.0.12) unstable; urgency=medium
+
+  * New release
+
+ -- Michael Shuler <michael@pbandjelly.org>  Tue, 07 Mar 2017 09:28:21 -0600
+
+cassandra (3.0.11) unstable; urgency=medium
+
+  * New release
+
+ -- Michael Shuler <michael@pbandjelly.org>  Wed, 15 Feb 2017 18:15:14 -0600
+
+cassandra (3.0.10) unstable; urgency=medium
+
+  * New release
+
+ -- Michael Shuler <michael@pbandjelly.org>  Mon, 31 Oct 2016 10:33:44 -0500
+
+cassandra (3.0.9) unstable; urgency=medium
+
+  * New release
+
+ -- Michael Shuler <mshuler@apache.org>  Tue, 16 Aug 2016 18:02:20 -0500
+
+cassandra (3.0.8) unstable; urgency=medium
 
   * New release 
 
- -- Michael Shuler <michael@pbandjelly.org>  Fri, 23 Sep 2016 16:35:25 -0500
+ -- Jake Luciani <jake@apache.org>  Tue, 28 Jun 2016 20:12:22 -0400
 
-cassandra (2.2.7) unstable; urgency=medium
-
-  * New release 
-
- -- Jake Luciani <jake@apache.org>  Tue, 28 Jun 2016 20:08:44 -0400
-
-cassandra (2.2.6) unstable; urgency=medium
+cassandra (3.0.7) unstable; urgency=medium
 
   * New release
 
- -- Jake Luciani <jake@apache.org>  Fri, 15 Apr 2016 09:47:38 -0400
+ -- Jake Luciani <jake@apache.org>  Mon, 06 Jun 2016 14:27:28 -0400
 
-cassandra (2.2.5) unstable; urgency=medium
-
-  * New release 
-
- -- Jake Luciani <jake@apache.org>  Tue, 26 Jan 2016 09:15:04 -0500
-
-cassandra (2.2.4) unstable; urgency=medium
+cassandra (3.0.6) unstable; urgency=medium
 
   * New release
 
- -- Jake Luciani <jake@apache.org>  Wed, 02 Dec 2015 10:52:18 -0500
+ -- Jake Luciani <jake@apache.org>  Tue, 03 May 2016 09:29:31 -0400
 
-cassandra (2.2.3) unstable; urgency=medium
-
-  * New release 
-
- -- Jake Luciani <jake@apache.org>  Mon, 12 Oct 2015 09:49:15 -0400
-
-cassandra (2.2.2) unstable; urgency=medium
-
-  * New release 
-
- -- Jake Luciani <jake@apache.org>  Thu, 01 Oct 2015 09:37:57 -0400
-
-cassandra (2.2.1) unstable; urgency=medium
+cassandra (3.0.5) unstable; urgency=medium
 
   * New release
 
- -- Jake Luciani <jake@apache.org>  Tue, 25 Aug 2015 13:22:45 -0400
+ -- Jake Luciani <jake@apache.org>  Sat, 02 Apr 2016 07:57:16 -0400
 
-cassandra (2.2.0) unstable; urgency=medium
+cassandra (3.0.4) unstable; urgency=medium
 
   * New release
 
- -- Jake Luciani <jake@apache.org>  Fri, 17 Jul 2015 13:06:22 -0400
+ -- Jake Luciani <jake@apache.org>  Mon, 29 Feb 2016 10:36:33 -0500
 
-cassandra (2.2.0~rc2) unstable; urgency=medium
+cassandra (3.0.3) unstable; urgency=medium
 
-  * New RC release
+  * New release 
 
- -- Jake Luciani <jake@apache.org>  Tue, 30 Jun 2015 09:43:38 -0400
+ -- Jake Luciani <jake@apache.org>  Wed, 03 Feb 2016 08:54:57 -0500
+
+cassandra (3.0.1) unstable; urgency=medium
+
+  * New release
+
+ -- Jake Luciani <jake@apache.org>  Fri, 04 Dec 2015 15:56:02 -0500
+
+cassandra (3.0.0) unstable; urgency=medium
+
+  * New release 
+
+ -- Jake Luciani <jake@apache.org>  Fri, 06 Nov 2015 14:37:07 -0500
+
+cassandra (3.0.0~rc2) unstable; urgency=medium
+
+  * New release candidate 
+
+ -- Jake Luciani <jake@apache.org>  Fri, 16 Oct 2015 16:02:24 -0400
+
+cassandra (3.0.0~rc1) unstable; urgency=medium
+
+  * New release candidate
+
+ -- Jake Luciani <jake@apache.org>  Sat, 19 Sep 2015 16:01:59 -0400
+
+cassandra (3.0.0~beta2) unstable; urgency=medium
+
+  * New beta release
+
+ -- Jake Luciani <jake@apache.org>  Fri, 04 Sep 2015 19:06:25 -0400
+
+cassandra (3.0.0~beta1) unstable; urgency=medium
+
+  * New beta release
+
+ -- Jake Luciani <jake@apache.org>  Fri, 21 Aug 2015 22:33:04 -0400
+
+cassandra (3.0.0~alpha1) unstable; urgency=medium
+
+  * New alpha release
+
+ -- Jake Luciani <jake@apache.org>  Thu, 30 Jul 2015 09:21:15 -0400
 
 cassandra (2.2.0~rc1) unstable; urgency=medium
 

diff --git a/debian/control b/debian/control
index 95c3c07..6a0128e 100644
--- a/debian/control
+++ b/debian/control

@@ -11,7 +11,7 @@
 
 Package: cassandra
 Architecture: all
-Depends: openjdk-7-jre-headless | java7-runtime, adduser, python (>= 2.7), ${misc:Depends}
+Depends: openjdk-8-jre-headless | java8-runtime, adduser, python (>= 2.7), ${misc:Depends}
 Recommends: ntp | time-daemon
 Suggests: cassandra-tools
 Conflicts: apache-cassandra1

diff --git a/debian/init b/debian/init
index 72417ae..0518cd2 100644
--- a/debian/init
+++ b/debian/init

@@ -21,6 +21,7 @@
 CONFDIR=/etc/cassandra
 WAIT_FOR_START=10
 CASSANDRA_HOME=/usr/share/cassandra
+CASSANDRA_CONF=$CONFDIR
 FD_LIMIT=100000
 
 [ -e /usr/share/cassandra/apache-cassandra.jar ] || exit 0
@@ -97,7 +98,7 @@
     #   1 if daemon was already stopped
     #   2 if daemon could not be stopped
     #   other if a failure occurred
-    start-stop-daemon -K -p "$PIDFILE" -R TERM/30/KILL/5 >/dev/null
+    start-stop-daemon -K -u cassandra -p "$PIDFILE" -R TERM/30/KILL/5 >/dev/null
     RET=$?
     rm -f "$PIDFILE"
     return $RET

diff --git a/debian/nodetool-completion b/debian/nodetool-completion
index 7dc35de..f6f3d5b 100644
--- a/debian/nodetool-completion
+++ b/debian/nodetool-completion

@@ -53,25 +53,31 @@
         _get_comp_words_by_ref cur prev
 
         local shopt='
-            cfstats
-            compactionstats
+            bootstrap
             compactionhistory
+            compactionstats
             decommission
             describecluster
             disablebackup
             disablebinary
             disablegossip
             disablehandoff
+            disablehintsfordc
             disablethrift
             drain
             enablebackup
             enablebinary
             enablegossip
             enablehandoff
+            enablehintsfordc
             enablethrift
+            failuredetector
+            gcstats
             getcompactionthroughput
+            getinterdcstreamthroughput
             getlogginglevels
             getstreamthroughput
+            gettraceprobability
             gossipinfo
             help
             invalidatecountercache
@@ -82,21 +88,30 @@
             pausehandoff
             proxyhistograms
             rangekeysample
+            refreshsizeestimates
+            reloadlocalschema
             reloadtriggers
+            replaybatchlog
             resetlocalschema
             resumehandoff
             ring
+            sethintedhandoffthrottlekb
+            setinterdcstreamthroughput
             setlogginglevel
             status
+            statusbackup
             statusbinary
+            statusgossip
+            statushandoff
             statusthrift
             stopdaemon
+            tablestats
             tpstats
             version
             '
 
         local lngopt='
-            cfhistograms
+            assassinate
             cleanup
             clearsnapshot
             compact
@@ -124,13 +139,14 @@
             settraceprobability
             snapshot
             stop
-            taketoken
+            tablehistograms
+            toppartitions
             truncatehints
             upgradesstables
-            '
+            verify
+	    '
 
         local optwks='
-            cfhistograms
             cleanup
             clearsnapshot
             compact
@@ -145,6 +161,9 @@
             scrub
             setcompactionthreshold
             snapshot
+            tablehistograms
+            toppartitions
+            verify
             '
 
         local optwcfs='
@@ -155,7 +174,9 @@
             flush
             repair
             scrub
+            toppartitions
             upgradesstables
+            verify
             '
 
         if [[ $COMP_CWORD -eq 1 ]] ; then
@@ -177,14 +198,14 @@
                         return 0
                         ;;
                     stop)
-                        COMPREPLY=( $(compgen -W "COMPACTION VALIDATION CLEANUP SCRUB INDEX_BUILD" -- "${cur}") )
+                        COMPREPLY=( $(compgen -W "COMPACTION VALIDATION CLEANUP SCRUB VERIFY INDEX_BUILD" -- "${cur}") )
                         return 0
                         ;;
                     info)
                         COMPREPLY=( $(compgen -W "-T --tokens" -- "${cur}") )
                         return 0
                         ;;
-                    rebuild)
+                    rebuild|disablehintsfordc|enablehintsfordc)
                         show_datacenters "${cur}"
                         return 0
                         ;;
@@ -197,7 +218,7 @@
             fi
         elif [[ $COMP_CWORD -eq 3 ]] ; then
             case "${COMP_WORDS[1]}" in
-                cfhistograms|cleanup|compact|flush|getcompactionthreshold|getendpoints|getsstables|rebuild_index|refresh|repair|scrub|setcompactionthreshold)
+                cleanup|compact|flush|getcompactionthreshold|getendpoints|getsstables|rebuild_index|refresh|repair|scrub|setcompactionthreshold|tablehistograms|toppartitions|verify)
                     show_cfs ${prev} ${cur}
                     return 0
                     ;;

diff --git a/debian/patches/002cassandra_logdir_fix.dpatch b/debian/patches/002cassandra_logdir_fix.dpatch
old mode 100644
new mode 100755
index cca337c..2a3a47d
--- a/debian/patches/002cassandra_logdir_fix.dpatch
+++ b/debian/patches/002cassandra_logdir_fix.dpatch

@@ -6,26 +6,26 @@
 
 @DPATCH@
 diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' cassandra~/bin/cassandra cassandra/bin/cassandra
---- cassandra~/bin/cassandra	2015-10-27 14:15:10.718076265 -0500
-+++ cassandra/bin/cassandra	2015-10-27 14:23:10.000000000 -0500
-@@ -139,7 +139,7 @@
-     props="$3"
-     class="$4"
-     cassandra_parms="-Dlogback.configurationFile=logback.xml"
--    cassandra_parms="$cassandra_parms -Dcassandra.logdir=$CASSANDRA_HOME/logs"
-+    cassandra_parms="$cassandra_parms -Dcassandra.logdir=/var/log/cassandra"
-     cassandra_parms="$cassandra_parms -Dcassandra.storagedir=$cassandra_storagedir"
+--- cassandra~/bin/cassandra	2019-06-27 09:35:32.000000000 -0500
++++ cassandra/bin/cassandra	2019-06-27 09:43:28.756343141 -0500
+@@ -127,7 +127,7 @@
+ fi
  
-     if [ "x$pidpath" != "x" ]; then
+ if [ -z "$CASSANDRA_LOG_DIR" ]; then
+-  CASSANDRA_LOG_DIR=$CASSANDRA_HOME/logs
++  CASSANDRA_LOG_DIR=/var/log/cassandra
+ fi
+ 
+ # Special-case path variables.
 diff -urNad '--exclude=CVS' '--exclude=.svn' '--exclude=.git' '--exclude=.arch' '--exclude=.hg' '--exclude=_darcs' '--exclude=.bzr' cassandra~/conf/cassandra-env.sh cassandra/conf/cassandra-env.sh
---- cassandra~/conf/cassandra-env.sh	2015-10-27 14:20:22.990840135 -0500
-+++ cassandra/conf/cassandra-env.sh	2015-10-27 14:24:03.210202234 -0500
-@@ -288,7 +288,7 @@
- JVM_OPTS="$JVM_OPTS -XX:+PrintPromotionFailure"
- #JVM_OPTS="$JVM_OPTS -XX:PrintFLSStatistics=1"
- 
--JVM_OPTS="$JVM_OPTS -Xloggc:${CASSANDRA_HOME}/logs/gc.log"
-+JVM_OPTS="$JVM_OPTS -Xloggc:/var/log/cassandra/gc.log"
- JVM_OPTS="$JVM_OPTS -XX:+UseGCLogFileRotation"
- JVM_OPTS="$JVM_OPTS -XX:NumberOfGCLogFiles=10"
- JVM_OPTS="$JVM_OPTS -XX:GCLogFileSize=10M"
+--- cassandra~/conf/cassandra-env.sh	2019-06-27 09:35:32.000000000 -0500
++++ cassandra/conf/cassandra-env.sh	2019-06-27 09:42:25.747715490 -0500
+@@ -123,7 +123,7 @@ esac
+
+ # Sets the path where logback and GC logs are written.
+ if [ "x$CASSANDRA_LOG_DIR" = "x" ] ; then
+-    CASSANDRA_LOG_DIR="$CASSANDRA_HOME/logs"
++    CASSANDRA_LOG_DIR="/var/log/cassandra/"
+ fi
+
+ #GC log path has to be defined here because it needs to access CASSANDRA_HOME
\ No newline at end of file

diff --git a/doc/cql3/CQL.textile b/doc/cql3/CQL.textile
index 33db748..6a05c4f 100644
--- a/doc/cql3/CQL.textile
+++ b/doc/cql3/CQL.textile

@@ -1,6 +1,6 @@
 <link rel="StyleSheet" href="CQL.css" type="text/css" media="screen">
 
-h1. Cassandra Query Language (CQL) v3.3.1
+h1. Cassandra Query Language (CQL) v3.4.0
 
 
 
@@ -346,7 +346,7 @@
 
 h4(#compactionOptions). Compaction options
 
-The @compaction@ property must at least define the @'class'@ sub-option, that defines the compaction strategy class to use. The default supported class are @'SizeTieredCompactionStrategy'@, @'LeveledCompactionStrategy'@ and @'DateTieredCompactionStrategy'@. Custom strategy can be provided by specifying the full class name as a "string constant":#constants. The rest of the sub-options depends on the chosen class. The sub-options supported by the default classes are:
+The @compaction@ property must at least define the @'class'@ sub-option, that defines the compaction strategy class to use. The default supported class are @'SizeTieredCompactionStrategy'@, @'LeveledCompactionStrategy'@, @'DateTieredCompactionStrategy'@ and @'TimeWindowCompactionStrategy'@. Custom strategy can be provided by specifying the full class name as a "string constant":#constants. The rest of the sub-options depends on the chosen class. The sub-options supported by the default classes are:
 
 |_. option                         |_. supported compaction strategy |_. default    |_. description |
 | @enabled@                        | _all_                           | true         | A boolean denoting whether compaction should be enabled or not.|
@@ -362,16 +362,20 @@
 | @timestamp_resolution@           | DateTieredCompactionStrategy    | MICROSECONDS | The timestamp resolution used when inserting data, could be MILLISECONDS, MICROSECONDS etc (should be understandable by Java TimeUnit) - don't change this unless you do mutations with USING TIMESTAMP <non_microsecond_timestamps> (or equivalent directly in the client)|
 | @base_time_seconds@              | DateTieredCompactionStrategy    | 60           | The base size of the time windows. |
 | @max_sstable_age_days@           | DateTieredCompactionStrategy    | 365          | SSTables only containing data that is older than this will never be compacted. |
+| @timestamp_resolution@           | TimeWindowCompactionStrategy    | MICROSECONDS | The timestamp resolution used when inserting data, could be MILLISECONDS, MICROSECONDS etc (should be understandable by Java TimeUnit) - don't change this unless you do mutations with USING TIMESTAMP <non_microsecond_timestamps> (or equivalent directly in the client)|
+| @compaction_window_unit@         | TimeWindowCompactionStrategy    | DAYS         | The Java TimeUnit used for the window size, set in conjunction with @compaction_window_size@. Must be one of DAYS, HOURS, MINUTES |
+| @compaction_window_size@         | TimeWindowCompactionStrategy    | 1            | The number of @compaction_window_unit@ units that make up a time window. |
 
 
 h4(#compressionOptions). Compression options
 
 For the @compression@ property, the following sub-options are available:
 
-|_. option              |_. default        |_. description |
-| @sstable_compression@ | LZ4Compressor    | The compression algorithm to use. Default compressor are: LZ4Compressor, SnappyCompressor and DeflateCompressor. Use an empty string (@''@) to disable compression. Custom compressor can be provided by specifying the full class name as a "string constant":#constants.|
-| @chunk_length_kb@     | 64KB             | On disk SSTables are compressed by block (to allow random reads). This defines the size (in KB) of said block. Bigger values may improve the compression rate, but increases the minimum size of data to be read from disk for a read |
-| @crc_check_chance@    | 1.0              | When compression is enabled, each compressed block includes a checksum of that block for the purpose of detecting disk bitrot and avoiding the propagation of corruption to other replica. This option defines the probability with which those checksums are checked during read. By default they are always checked. Set to 0 to disable checksum checking and to 0.5 for instance to check them every other read|
+|_. option                 |_. default        |_. description |
+| @class@                  | LZ4Compressor    | The compression algorithm to use. Default compressor are: LZ4Compressor, SnappyCompressor and DeflateCompressor. Use @'enabled' : false@ to disable compression. Custom compressor can be provided by specifying the full class name as a "string constant":#constants.|
+| @enabled@                | true             | By default compression is enabled. To disable it, set @enabled@ to @false@
+| @chunk_length_in_kb@     | 64KB             | On disk SSTables are compressed by block (to allow random reads). This defines the size (in KB) of said block. Bigger values may improve the compression rate, but increases the minimum size of data to be read from disk for a read |
+| @crc_check_chance@       | 1.0              | When compression is enabled, each compressed block includes a checksum of that block for the purpose of detecting disk bitrot and avoiding the propagation of corruption to other replica. This option defines the probability with which those checksums are checked during read. By default they are always checked. Set to 0 to disable checksum checking and to 0.5 for instance to check them every other read|
 
 h4(#cachingOptions). Caching options
 
@@ -393,8 +397,7 @@
 bc(syntax).. 
 <alter-table-stmt> ::= ALTER (TABLE | COLUMNFAMILY) <tablename> <instruction>
 
-<instruction> ::= ALTER <identifier> TYPE <type>
-                | ADD   <identifier> <type>
+<instruction> ::= ADD   <identifier> <type>
                 | DROP  <identifier>
                 | WITH  <option> ( AND <option> )*
 p. 
@@ -402,7 +405,6 @@
 
 bc(sample).. 
 ALTER TABLE addamsFamily
-ALTER lastKnownLocation TYPE uuid;
 
 ALTER TABLE addamsFamily
 ADD gravesite varchar;
@@ -411,10 +413,9 @@
 WITH comment = 'A most excellent and useful column family'
  AND read_repair_chance = 0.2;
 p. 
-The @ALTER@ statement is used to manipulate table definitions. It allows for adding new columns, dropping existing ones, changing the type of existing columns, or updating the table options. As with table creation, @ALTER COLUMNFAMILY@ is allowed as an alias for @ALTER TABLE@.
+The @ALTER@ statement is used to manipulate table definitions. It allows for adding new columns, dropping existing ones, or updating the table options. As with table creation, @ALTER COLUMNFAMILY@ is allowed as an alias for @ALTER TABLE@.
 
 The @<tablename>@ is the table name optionally preceded by the keyspace name.  The @<instruction>@ defines the alteration to perform:
-* @ALTER@: Update the type of a given defined column. Note that the type of the "clustering columns":#createTablepartitionClustering cannot be modified as it induces the on-disk ordering of rows. Columns on which a "secondary index":#createIndexStmt is defined have the same restriction. Other columns are free from those restrictions (no validation of existing data is performed), but it is usually a bad idea to change the type to a non-compatible one, unless no data have been inserted for that column yet, as this could confuse CQL drivers/tools.
 * @ADD@: Adds a new column to the table. The @<identifier>@ for the new column must not conflict with an existing column. Moreover, columns cannot be added to tables defined with the @COMPACT STORAGE@ option.
 * @DROP@: Removes a column from the table. Dropped columns will immediately become unavailable in the queries and will not be included in compacted sstables in the future. If a column is readded, queries won't return values written before the column was last dropped. It is assumed that timestamps represent actual time, so if this is not your case, you should NOT readd previously dropped columns. Columns can't be dropped from tables defined with the @COMPACT STORAGE@ option.
 * @WITH@: Allows to update the options of the table. The "supported @<option>@":#createTableOptions (and syntax) are the same as for the @CREATE TABLE@ statement except that @COMPACT STORAGE@ is not supported. Note that setting any @compaction@ sub-options has the effect of erasing all previous @compaction@ options, so you  need to re-specify all the sub-options if you want to keep them. The same note applies to the set of @compression@ sub-options.
@@ -492,6 +493,68 @@
 
 If the index does not exists, the statement will return an error, unless @IF EXISTS@ is used in which case the operation is a no-op.
 
+
+h3(#createMVStmt). CREATE MATERIALIZED VIEW
+
+__Syntax:__
+
+bc(syntax).. 
+<create-table-stmt> ::= CREATE MATERIALIZED VIEW ( IF NOT EXISTS )? <viewname> AS
+                          SELECT ( '(' <identifier> ( ',' <identifier> ) * ')' | '*' )
+                          FROM <tablename>
+                          ( WHERE <where-clause> )?
+                          PRIMARY KEY '(' <partition-key> ( ',' <identifier> )* ')'
+                          ( WITH <option> ( AND <option>)* )?
+p. 
+__Sample:__
+
+bc(sample).. 
+CREATE MATERIALIZED VIEW monkeySpecies_by_population AS
+    SELECT *
+    FROM monkeySpecies
+    WHERE population IS NOT NULL AND species IS NOT NULL
+    PRIMARY KEY (population, species)
+    WITH comment='Allow query by population instead of species';
+p. 
+The @CREATE MATERIALIZED VIEW@ statement creates a new materialized view. Each such view is a set of _rows_ which corresponds to rows which are present in the underlying, or base, table specified in the @SELECT@ statement. A materialized view cannot be directly updated, but updates to the base table will cause corresponding updates in the view.
+
+Attempting to create an already existing materialized view will return an error unless the @IF NOT EXISTS@ option is used. If it is used, the statement will be a no-op if the materialized view already exists.
+
+h4(#createMVWhere). @WHERE@ Clause
+
+The @<where-clause>@ is similar to the "where clause of a @SELECT@ statement":#selectWhere, with a few differences.  First, the where clause must contain an expression that disallows @NULL@ values in columns in the view's primary key.  If no other restriction is desired, this can be accomplished with an @IS NOT NULL@ expression.  Second, only columns which are in the base table's primary key may be restricted with expressions other than @IS NOT NULL@.  (Note that this second restriction may be lifted in the future.)
+
+h4. MV Limitations
+
+__Note:__
+Removal of columns not selected in the Materialized View (via `UPDATE base SET unselected_column = null` or `DELETE unselected_column FROM base`) may shadow missed updates to other columns received by hints or repair.
+For this reason, we advise against doing deletions on base columns not selected in views until this is fixed on CASSANDRA-13826.
+
+h3(#alterMVStmt). ALTER MATERIALIZED VIEW
+
+__Syntax:__
+
+bc(syntax). <alter-materialized-view-stmt> ::= ALTER MATERIALIZED VIEW <viewname>
+                                                 WITH <option> ( AND <option> )*
+
+p.
+The @ALTER MATERIALIZED VIEW@ statement allows options to be update; these options are the same as <a href="#createTableOptions">@CREATE TABLE@'s options</a>.
+
+
+h3(#dropMVStmt). DROP MATERIALIZED VIEW
+
+__Syntax:__
+
+bc(syntax). <drop-materialized-stmt> ::= DROP MATERIALIZED VIEW ( IF EXISTS )? <tablename>
+
+__Sample:__
+
+bc(sample). DROP MATERIALIZED VIEW monkeySpecies_by_population;
+
+The @DROP MATERIALIZED VIEW@ statement is used to drop an existing materialized view.
+
+If the materialized view does not exists, the statement will return an error, unless @IF EXISTS@ is used in which case the operation is a no-op.
+
 h3(#createTypeStmt). CREATE TYPE
 
 __Syntax:__
@@ -538,23 +601,18 @@
 bc(syntax).. 
 <alter-type-stmt> ::= ALTER TYPE <typename> <instruction>
 
-<instruction> ::= ALTER <field-name> TYPE <type>
-                | ADD <field-name> <type>
+<instruction> ::= ADD <field-name> <type>
                 | RENAME <field-name> TO <field-name> ( AND <field-name> TO <field-name> )*
 p. 
 __Sample:__
 
 bc(sample).. 
-ALTER TYPE address ALTER zip TYPE varint
-
 ALTER TYPE address ADD country text
 
 ALTER TYPE address RENAME zip TO zipcode AND street_name TO street
 p. 
 The @ALTER TYPE@ statement is used to manipulate type definitions. It allows for adding new fields, renaming existing fields, or changing the type of existing fields.
 
-When altering the type of a column, the new type must be compatible with the previous type.
-
 h3(#dropTypeStmt). DROP TYPE
 
 __Syntax:__
@@ -822,8 +880,11 @@
 <where-clause> ::= <relation> ( AND <relation> )*
 
 <relation> ::= <identifier> '=' <term>
-             | <identifier> IN '(' ( <term> ( ',' <term> )* )? ')'
+             | '(' <identifier> (',' <identifier>)* ')' '=' <term-tuple>
+             | <identifier> IN '(' ( <term> ( ',' <term>)* )? ')'
              | <identifier> IN <variable>
+             | '(' <identifier> (',' <identifier>)* ')' IN '(' ( <term-tuple> ( ',' <term-tuple>)* )? ')'
+             | '(' <identifier> (',' <identifier>)* ')' IN <variable>
 
 <option> ::= TIMESTAMP <integer>
            | TTL <integer>
@@ -839,11 +900,11 @@
 
 UPDATE UserActions SET total = total + 2 WHERE user = B70DE1D0-9908-4AE3-BE34-5573E5B09F14 AND action = 'click';
 p. 
-The @UPDATE@ statement writes one or more columns for a given row in a table. The @<where-clause>@ is used to select the row to update and must include all columns composing the @PRIMARY KEY@ (the @IN@ relation is only supported for the last column of the partition key). Other columns values are specified through @<assignment>@ after the @SET@ keyword.
+The @UPDATE@ statement writes one or more columns for a given row in a table. The @<where-clause>@ is used to select the row to update and must include all columns composing the @PRIMARY KEY@. Other columns values are specified through @<assignment>@ after the @SET@ keyword.
 
-Note that unlike in SQL, @UPDATE@ does not check the prior existence of the row by default (except through the use of @<condition>@, see below): the row is created if none existed before, and updated otherwise. Furthermore, there is no mean to know which of creation or update happened.
+Note that unlike in SQL, @UPDATE@ does not check the prior existence of the row by default (except through the use of @<condition>@, see below): the row is created if none existed before, and updated otherwise. Furthermore, there are no means to know whether a creation or update occurred.
 
-It is however possible to use the conditions on some columns through @IF@, in which case the row will not be updated unless such condition are met. But please note that using @IF@ conditions will incur a non negligible performance cost (internally, Paxos will be used) so this should be used sparingly.
+It is however possible to use the conditions on some columns through @IF@, in which case the row will not be updated unless the conditions are met. But, please note that using @IF@ conditions will incur a non-negligible performance cost (internally, Paxos will be used) so this should be used sparingly.
 
 In an @UPDATE@ statement, all updates within the same partition key are applied atomically and in isolation.
 
@@ -853,9 +914,9 @@
 
 h4(#updateOptions). @<options>@
 
-The @UPDATE@ and @INSERT@ statements allows to specify the following options for the insertion:
+The @UPDATE@ and @INSERT@ statements support the following options:
 * @TIMESTAMP@: sets the timestamp for the operation. If not specified, the coordinator will use the current time (in microseconds) at the start of statement execution as the timestamp. This is usually a suitable default.
-* @TTL@: allows to specify an optional Time To Live (in seconds) for the inserted values. If set, the inserted values are automatically removed from the database after the specified time. Note that the TTL concerns the inserted values, not the column themselves. This means that any subsequent update of the column will also reset the TTL (to whatever TTL is specified in that update). By default, values never expire. A TTL of 0 or a negative one is equivalent to no TTL.
+* @TTL@: specifies an optional Time To Live (in seconds) for the inserted values. If set, the inserted values are automatically removed from the database after the specified time. Note that the TTL concerns the inserted values, not the columns themselves. This means that any subsequent update of the column will also reset the TTL (to whatever TTL is specified in that update). By default, values never expire. A TTL of 0 or a negative value is equivalent to no TTL.
 
 
 h3(#deleteStmt). DELETE
@@ -873,16 +934,20 @@
 
 <where-clause> ::= <relation> ( AND <relation> )*
 
-<relation> ::= <identifier> '=' <term>
-             | <identifier> IN '(' ( <term> ( ',' <term> )* )? ')'
+<relation> ::= <identifier> <op> <term>
+             | '(' <identifier> (',' <identifier>)* ')' <op> <term-tuple>
+             | <identifier> IN '(' ( <term> ( ',' <term>)* )? ')'
              | <identifier> IN <variable>
+             | '(' <identifier> (',' <identifier>)* ')' IN '(' ( <term-tuple> ( ',' <term-tuple>)* )? ')'
+             | '(' <identifier> (',' <identifier>)* ')' IN <variable>
 
-<condition> ::= <identifier> <op> <term>
+<op> ::= '=' | '<' | '>' | '<=' | '>='
+
+<condition> ::= <identifier> (<op> | '!=') <term>
               | <identifier> IN (<variable> | '(' ( <term> ( ',' <term> )* )? ')')
-              | <identifier> '[' <term> ']' <op> <term>
+              | <identifier> '[' <term> ']' (<op> | '!=') <term>
               | <identifier> '[' <term> ']' IN <term>
 
-<op> ::= '<' | '<=' | '=' | '!=' | '>=' | '>'
 p. 
 __Sample:__
 
@@ -891,13 +956,13 @@
 
 DELETE phone FROM Users WHERE userid IN (C73DE1D3-AF08-40F3-B124-3FF3E5109F22, B70DE1D0-9908-4AE3-BE34-5573E5B09F14);
 p. 
-The @DELETE@ statement deletes columns and rows. If column names are provided directly after the @DELETE@ keyword, only those columns are deleted from the row indicated by the @<where-clause>@ (the @id[value]@ syntax in @<selection>@ is for collection, please refer to the "collection section":#collections for more details).  Otherwise whole rows are removed. The @<where-clause>@ allows to specify the key for the row(s) to delete (the @IN@ relation is only supported for the last column of the partition key).
+The @DELETE@ statement deletes columns and rows. If column names are provided directly after the @DELETE@ keyword, only those columns are deleted from the row indicated by the @<where-clause>@ (the @id[value]@ syntax in @<selection>@ is for collection, please refer to the "collection section":#collections for more details).  Otherwise, whole rows are removed. The @<where-clause>@ specifies which rows are to be deleted.  Multiple rows may be deleted with one statement by using an @IN@ clause.  A range of rows may be deleted using an inequality operator (such as @>=@).
 
-@DELETE@ supports the @TIMESTAMP@ options with the same semantic that in the "@UPDATE@":#updateStmt statement.
+@DELETE@ supports the @TIMESTAMP@ option with the same semantics as the "@UPDATE@":#updateStmt statement.
 
 In a @DELETE@ statement, all deletions within the same partition key are applied atomically and in isolation.
 
-A @DELETE@ operation application can be conditioned using @IF@ like for @UPDATE@ and @INSERT@. But please not that as for the later, this will incur a non negligible performance cost (internally, Paxos will be used) and so should be used sparingly.
+A @DELETE@ operation can be conditional through the use of an @IF@ clause, similar to @UPDATE@ and @INSERT@ statements. However, as with @INSERT@ and @UPDATE@ statements, this will incur a non-negligible performance cost (internally, Paxos will be used) and so should be used sparingly.
 
 
 h3(#batchStmt). BATCH
@@ -1014,7 +1079,7 @@
 
 h4(#selectSelection). @<select-clause>@
 
-The @<select-clause>@ determines which columns needs to be queried and returned in the result-set. It consists of either the comma-separated list of <selector> or the wildcard character (@*@) to select all the columns defined for the table.
+The @<select-clause>@ determines which columns needs to be queried and returned in the result-set. It consists of either the comma-separated list of <selector> or the wildcard character (@*@) to select all the columns defined for the table. Please note that for wildcard @SELECT@ queries the order of columns returned is not specified and is not guaranteed to be stable between Cassandra versions.
 
 A @<selector>@ is either a column name to retrieve or a @<function>@ of one or more @<term>@s. The function allowed are the same as for @<term>@ and are described in the "function section":#functions. In addition to these generic functions, the @WRITETIME@ (resp. @TTL@) function allows to select the timestamp of when the column was inserted (resp. the time to live (in seconds) for the column (or null if the column has no expiration set)).
 
@@ -1910,7 +1975,7 @@
 
 h2(#udfs). User-Defined Functions
 
-User-defined functions allow execution of user-provided code in Cassandra. By default, Cassandra supports defining functions in _Java_ and _JavaScript_. Support for other JSR 223 compliant scripting languages (such as Python, Ruby, and Scala) can be added by adding a JAR to the classpath.
+User-defined functions allow execution of user-provided code in Cassandra. By default, Cassandra supports defining functions in _Java_ and _JavaScript_. Support for other JSR 223 compliant scripting languages (such as Python, Ruby, and Scala) has been removed in 3.0.11.
 
 UDFs are part of the Cassandra schema.  As such, they are automatically propagated to all nodes in the cluster.
 
@@ -2128,6 +2193,7 @@
 | @INSERT@       | yes |
 | @INT@          | no  |
 | @INTO@         | yes |
+| @IS@           | yes |
 | @JSON@         | no  |
 | @KEY@          | no  |
 | @KEYS@         | no  |
@@ -2138,6 +2204,7 @@
 | @LIST@         | no  |
 | @LOGIN@        | no  |
 | @MAP@          | no  |
+| @MATERIALIZED@ | yes |
 | @MODIFY@       | yes |
 | @NAN@          | yes |
 | @NOLOGIN@      | no  |
@@ -2192,6 +2259,7 @@
 | @VALUES@       | no  |
 | @VARCHAR@      | no  |
 | @VARINT@       | no  |
+| @VIEW@         | yes |
 | @WHERE@        | yes |
 | @WITH@         | yes |
 | @WRITETIME@    | no  |
@@ -2208,12 +2276,16 @@
 | @enum@      |
 | @interval@  |
 | @macaddr@   |
-| @smallint@  |
 
 h2(#changes). Changes
 
 The following describes the changes in each version of CQL.
 
+h3. 3.4.0
+
+* Support for "materialized views":#createMVStmt
+* "@DELETE@":#deleteStmt support for inequality expressions and @IN@ restrictions on any primary key columns
+* "@UPDATE@":#updateStmt support for @IN@ restrictions on any primary key columns
 
 h3. 3.3.1
 

diff --git a/doc/native_protocol_v1.spec b/doc/native_protocol_v1.spec
deleted file mode 100644
index 9c9b6b5..0000000
--- a/doc/native_protocol_v1.spec
+++ /dev/null

@@ -1,746 +0,0 @@
-
-                             CQL BINARY PROTOCOL v1
-
-
-Table of Contents
-
-  1. Overview
-  2. Frame header
-    2.1. version
-    2.2. flags
-    2.3. stream
-    2.4. opcode
-    2.5. length
-  3. Notations
-  4. Messages
-    4.1. Requests
-      4.1.1. STARTUP
-      4.1.2. CREDENTIALS
-      4.1.3. OPTIONS
-      4.1.4. QUERY
-      4.1.5. PREPARE
-      4.1.6. EXECUTE
-      4.1.7. REGISTER
-    4.2. Responses
-      4.2.1. ERROR
-      4.2.2. READY
-      4.2.3. AUTHENTICATE
-      4.2.4. SUPPORTED
-      4.2.5. RESULT
-        4.2.5.1. Void
-        4.2.5.2. Rows
-        4.2.5.3. Set_keyspace
-        4.2.5.4. Prepared
-        4.2.5.5. Schema_change
-      4.2.6. EVENT
-  5. Compression
-  6. Data Type Serialization Formats
-  7. Error codes
-
-
-1. Overview
-
-  The CQL binary protocol is a frame based protocol. Frames are defined as:
-
-      0         8        16        24        32
-      +---------+---------+---------+---------+
-      | version |  flags  | stream  | opcode  |
-      +---------+---------+---------+---------+
-      |                length                 |
-      +---------+---------+---------+---------+
-      |                                       |
-      .            ...  body ...              .
-      .                                       .
-      .                                       .
-      +----------------------------------------
-
-  The protocol is big-endian (network byte order).
-
-  Each frame contains a fixed size header (8 bytes) followed by a variable size
-  body. The header is described in Section 2. The content of the body depends
-  on the header opcode value (the body can in particular be empty for some
-  opcode values). The list of allowed opcode is defined Section 2.3 and the
-  details of each corresponding message is described Section 4.
-
-  The protocol distinguishes 2 types of frames: requests and responses. Requests
-  are those frame sent by the clients to the server, response are the ones sent
-  by the server. Note however that while communication are initiated by the
-  client with the server responding to request, the protocol may likely add
-  server pushes in the future, so responses does not obligatory come right after
-  a client request.
-
-  Note to client implementors: clients library should always assume that the
-  body of a given frame may contain more data than what is described in this
-  document. It will however always be safe to ignore the remaining of the frame
-  body in such cases. The reason is that this may allow to sometimes extend the
-  protocol with optional features without needing to change the protocol
-  version.
-
-
-2. Frame header
-
-2.1. version
-
-  The version is a single byte that indicate both the direction of the message
-  (request or response) and the version of the protocol in use. The up-most bit
-  of version is used to define the direction of the message: 0 indicates a
-  request, 1 indicates a responses. This can be useful for protocol analyzers to
-  distinguish the nature of the packet from the direction which it is moving.
-  The rest of that byte is the protocol version (1 for the protocol defined in
-  this document). In other words, for this version of the protocol, version will
-  have one of:
-    0x01    Request frame for this protocol version
-    0x81    Response frame for this protocol version
-
-
-2.2. flags
-
-  Flags applying to this frame. The flags have the following meaning (described
-  by the mask that allow to select them):
-    0x01: Compression flag. If set, the frame body is compressed. The actual
-          compression to use should have been set up beforehand through the
-          Startup message (which thus cannot be compressed; Section 4.1.1).
-    0x02: Tracing flag. For a request frame, this indicate the client requires
-          tracing of the request. Note that not all requests support tracing.
-          Currently, only QUERY, PREPARE and EXECUTE queries support tracing.
-          Other requests will simply ignore the tracing flag if set. If a
-          request support tracing and the tracing flag was set, the response to
-          this request will have the tracing flag set and contain tracing
-          information.
-          If a response frame has the tracing flag set, its body contains
-          a tracing ID. The tracing ID is a [uuid] and is the first thing in
-          the frame body. The rest of the body will then be the usual body
-          corresponding to the response opcode.
-
-  The rest of the flags is currently unused and ignored.
-
-2.3. stream
-
-  A frame has a stream id (one signed byte). When sending request messages, this
-  stream id must be set by the client to a positive byte (negative stream id
-  are reserved for streams initiated by the server; currently all EVENT messages
-  (section 4.2.6) have a streamId of -1). If a client sends a request message
-  with the stream id X, it is guaranteed that the stream id of the response to
-  that message will be X.
-
-  This allow to deal with the asynchronous nature of the protocol. If a client
-  sends multiple messages simultaneously (without waiting for responses), there
-  is no guarantee on the order of the responses. For instance, if the client
-  writes REQ_1, REQ_2, REQ_3 on the wire (in that order), the server might
-  respond to REQ_3 (or REQ_2) first. Assigning different stream id to these 3
-  requests allows the client to distinguish to which request an received answer
-  respond to. As there can only be 128 different simultaneous stream, it is up
-  to the client to reuse stream id.
-
-  Note that clients are free to use the protocol synchronously (i.e. wait for
-  the response to REQ_N before sending REQ_N+1). In that case, the stream id
-  can be safely set to 0. Clients should also feel free to use only a subset of
-  the 128 maximum possible stream ids if it is simpler for those
-  implementation.
-
-2.4. opcode
-
-  An integer byte that distinguish the actual message:
-    0x00    ERROR
-    0x01    STARTUP
-    0x02    READY
-    0x03    AUTHENTICATE
-    0x04    CREDENTIALS
-    0x05    OPTIONS
-    0x06    SUPPORTED
-    0x07    QUERY
-    0x08    RESULT
-    0x09    PREPARE
-    0x0A    EXECUTE
-    0x0B    REGISTER
-    0x0C    EVENT
-
-  Messages are described in Section 4.
-
-
-2.5. length
-
-  A 4 byte integer representing the length of the body of the frame (note:
-  currently a frame is limited to 256MB in length).
-
-
-3. Notations
-
-  To describe the layout of the frame body for the messages in Section 4, we
-  define the following:
-
-    [int]          A 4 byte integer
-    [short]        A 2 byte unsigned integer
-    [string]       A [short] n, followed by n bytes representing an UTF-8
-                   string.
-    [long string]  An [int] n, followed by n bytes representing an UTF-8 string.
-    [uuid]         A 16 bytes long uuid.
-    [string list]  A [short] n, followed by n [string].
-    [bytes]        A [int] n, followed by n bytes if n >= 0. If n < 0,
-                   no byte should follow and the value represented is `null`.
-    [short bytes]  A [short] n, followed by n bytes if n >= 0.
-
-    [option]       A pair of <id><value> where <id> is a [short] representing
-                   the option id and <value> depends on that option (and can be
-                   of size 0). The supported id (and the corresponding <value>)
-                   will be described when this is used.
-    [option list]  A [short] n, followed by n [option].
-    [inet]         An address (ip and port) to a node. It consists of one
-                   [byte] n, that represents the address size, followed by n
-                   [byte] representing the IP address (in practice n can only be
-                   either 4 (IPv4) or 16 (IPv6)), following by one [int]
-                   representing the port.
-    [consistency]  A consistency level specification. This is a [short]
-                   representing a consistency level with the following
-                   correspondance:
-                     0x0000    ANY
-                     0x0001    ONE
-                     0x0002    TWO
-                     0x0003    THREE
-                     0x0004    QUORUM
-                     0x0005    ALL
-                     0x0006    LOCAL_QUORUM
-                     0x0007    EACH_QUORUM
-                     0x000A    LOCAL_ONE
-
-    [string map]      A [short] n, followed by n pair <k><v> where <k> and <v>
-                      are [string].
-    [string multimap] A [short] n, followed by n pair <k><v> where <k> is a
-                      [string] and <v> is a [string list].
-
-
-4. Messages
-
-4.1. Requests
-
-  Note that outside of their normal responses (described below), all requests
-  can get an ERROR message (Section 4.2.1) as response.
-
-4.1.1. STARTUP
-
-  Initialize the connection. The server will respond by either a READY message
-  (in which case the connection is ready for queries) or an AUTHENTICATE message
-  (in which case credentials will need to be provided using CREDENTIALS).
-
-  This must be the first message of the connection, except for OPTIONS that can
-  be sent before to find out the options supported by the server. Once the
-  connection has been initialized, a client should not send any more STARTUP
-  message.
-
-  The body is a [string map] of options. Possible options are:
-    - "CQL_VERSION": the version of CQL to use. This option is mandatory and
-      currenty, the only version supported is "3.0.0". Note that this is
-      different from the protocol version.
-    - "COMPRESSION": the compression algorithm to use for frames (See section 5).
-      This is optional, if not specified no compression will be used.
-
-
-4.1.2. CREDENTIALS
-
-  Provides credentials information for the purpose of identification. This
-  message comes as a response to an AUTHENTICATE message from the server, but
-  can be use later in the communication to change the authentication
-  information.
-
-  The body is a list of key/value informations. It is a [short] n, followed by n
-  pair of [string]. These key/value pairs are passed as is to the Cassandra
-  IAuthenticator and thus the detail of which informations is needed depends on
-  that authenticator.
-
-  The response to a CREDENTIALS is a READY message (or an ERROR message).
-
-
-4.1.3. OPTIONS
-
-  Asks the server to return what STARTUP options are supported. The body of an
-  OPTIONS message should be empty and the server will respond with a SUPPORTED
-  message.
-
-
-4.1.4. QUERY
-
-  Performs a CQL query. The body of the message consists of a CQL query as a [long
-  string] followed by the [consistency] for the operation.
-
-  Note that the consistency is ignored by some queries (USE, CREATE, ALTER,
-  TRUNCATE, ...).
-
-  The server will respond to a QUERY message with a RESULT message, the content
-  of which depends on the query.
-
-
-4.1.5. PREPARE
-
-  Prepare a query for later execution (through EXECUTE). The body consists of
-  the CQL query to prepare as a [long string].
-
-  The server will respond with a RESULT message with a `prepared` kind (0x0004,
-  see Section 4.2.5).
-
-
-4.1.6. EXECUTE
-
-  Executes a prepared query. The body of the message must be:
-    <id><n><value_1>....<value_n><consistency>
-  where:
-    - <id> is the prepared query ID. It's the [short bytes] returned as a
-      response to a PREPARE message.
-    - <n> is a [short] indicating the number of following values.
-    - <value_1>...<value_n> are the [bytes] to use for bound variables in the
-      prepared query.
-    - <consistency> is the [consistency] level for the operation.
-
-  Note that the consistency is ignored by some (prepared) queries (USE, CREATE,
-  ALTER, TRUNCATE, ...).
-
-  The response from the server will be a RESULT message.
-
-
-4.1.7. REGISTER
-
-  Register this connection to receive some type of events. The body of the
-  message is a [string list] representing the event types to register to. See
-  section 4.2.6 for the list of valid event types.
-
-  The response to a REGISTER message will be a READY message.
-
-  Please note that if a client driver maintains multiple connections to a
-  Cassandra node and/or connections to multiple nodes, it is advised to
-  dedicate a handful of connections to receive events, but to *not* register
-  for events on all connections, as this would only result in receiving
-  multiple times the same event messages, wasting bandwidth.
-
-
-4.2. Responses
-
-  This section describes the content of the frame body for the different
-  responses. Please note that to make room for future evolution, clients should
-  support extra informations (that they should simply discard) to the one
-  described in this document at the end of the frame body.
-
-4.2.1. ERROR
-
-  Indicates an error processing a request. The body of the message will be an
-  error code ([int]) followed by a [string] error message. Then, depending on
-  the exception, more content may follow. The error codes are defined in
-  Section 7, along with their additional content if any.
-
-
-4.2.2. READY
-
-  Indicates that the server is ready to process queries. This message will be
-  sent by the server either after a STARTUP message if no authentication is
-  required, or after a successful CREDENTIALS message.
-
-  The body of a READY message is empty.
-
-
-4.2.3. AUTHENTICATE
-
-  Indicates that the server require authentication. This will be sent following
-  a STARTUP message and must be answered by a CREDENTIALS message from the
-  client to provide authentication informations.
-
-  The body consists of a single [string] indicating the full class name of the
-  IAuthenticator in use.
-
-
-4.2.4. SUPPORTED
-
-  Indicates which startup options are supported by the server. This message
-  comes as a response to an OPTIONS message.
-
-  The body of a SUPPORTED message is a [string multimap]. This multimap gives
-  for each of the supported STARTUP options, the list of supported values.
-
-
-4.2.5. RESULT
-
-  The result to a query (QUERY, PREPARE or EXECUTE messages).
-
-  The first element of the body of a RESULT message is an [int] representing the
-  `kind` of result. The rest of the body depends on the kind. The kind can be
-  one of:
-    0x0001    Void: for results carrying no information.
-    0x0002    Rows: for results to select queries, returning a set of rows.
-    0x0003    Set_keyspace: the result to a `use` query.
-    0x0004    Prepared: result to a PREPARE message.
-    0x0005    Schema_change: the result to a schema altering query.
-
-  The body for each kind (after the [int] kind) is defined below.
-
-
-4.2.5.1. Void
-
-  The rest of the body for a Void result is empty. It indicates that a query was
-  successful without providing more information.
-
-
-4.2.5.2. Rows
-
-  Indicates a set of rows. The rest of body of a Rows result is:
-    <metadata><rows_count><rows_content>
-  where:
-    - <metadata> is composed of:
-        <flags><columns_count><global_table_spec>?<col_spec_1>...<col_spec_n>
-      where:
-        - <flags> is an [int]. The bits of <flags> provides information on the
-          formatting of the remaining informations. A flag is set if the bit
-          corresponding to its `mask` is set. Supported flags are, given there
-          mask:
-            0x0001    Global_tables_spec: if set, only one table spec (keyspace
-                      and table name) is provided as <global_table_spec>. If not
-                      set, <global_table_spec> is not present.
-        - <columns_count> is an [int] representing the number of columns selected
-          by the query this result is of. It defines the number of <col_spec_i>
-          elements in and the number of element for each row in <rows_content>.
-        - <global_table_spec> is present if the Global_tables_spec is set in
-          <flags>. If present, it is composed of two [string] representing the
-          (unique) keyspace name and table name the columns return are of.
-        - <col_spec_i> specifies the columns returned in the query. There is
-          <column_count> such column specification that are composed of:
-            (<ksname><tablename>)?<column_name><type>
-          The initial <ksname> and <tablename> are two [string] are only present
-          if the Global_tables_spec flag is not set. The <column_name> is a
-          [string] and <type> is an [option] that correspond to the column name
-          and type. The option for <type> is either a native type (see below),
-          in which case the option has no value, or a 'custom' type, in which
-          case the value is a [string] representing the full qualified class
-          name of the type represented. Valid option ids are:
-            0x0000    Custom: the value is a [string], see above.
-            0x0001    Ascii
-            0x0002    Bigint
-            0x0003    Blob
-            0x0004    Boolean
-            0x0005    Counter
-            0x0006    Decimal
-            0x0007    Double
-            0x0008    Float
-            0x0009    Int
-            0x000A    Text
-            0x000B    Timestamp
-            0x000C    Uuid
-            0x000D    Varchar
-            0x000E    Varint
-            0x000F    Timeuuid
-            0x0010    Inet
-            0x0020    List: the value is an [option], representing the type
-                            of the elements of the list.
-            0x0021    Map: the value is two [option], representing the types of the
-                           keys and values of the map
-            0x0022    Set: the value is an [option], representing the type
-                            of the elements of the set
-    - <rows_count> is an [int] representing the number of rows present in this
-      result. Those rows are serialized in the <rows_content> part.
-    - <rows_content> is composed of <row_1>...<row_m> where m is <rows_count>.
-      Each <row_i> is composed of <value_1>...<value_n> where n is
-      <columns_count> and where <value_j> is a [bytes] representing the value
-      returned for the jth column of the ith row. In other words, <rows_content>
-      is composed of (<rows_count> * <columns_count>) [bytes].
-
-
-4.2.5.3. Set_keyspace
-
-  The result to a `use` query. The body (after the kind [int]) is a single
-  [string] indicating the name of the keyspace that has been set.
-
-
-4.2.5.4. Prepared
-
-  The result to a PREPARE message. The rest of the body of a Prepared result is:
-    <id><metadata>
-  where:
-    - <id> is [short bytes] representing the prepared query ID.
-    - <metadata> is defined exactly as for a Rows RESULT (See section 4.2.5.2).
-
-  Note that prepared query ID return is global to the node on which the query
-  has been prepared. It can be used on any connection to that node and this
-  until the node is restarted (after which the query must be reprepared).
-
-4.2.5.5. Schema_change
-
-  The result to a schema altering query (creation/update/drop of a
-  keyspace/table/index). The body (after the kind [int]) is composed of 3
-  [string]:
-    <change><keyspace><table>
-  where:
-    - <change> describe the type of change that has occured. It can be one of
-      "CREATED", "UPDATED" or "DROPPED".
-    - <keyspace> is the name of the affected keyspace or the keyspace of the
-      affected table.
-    - <table> is the name of the affected table. <table> will be empty (i.e.
-      the empty string "") if the change was affecting a keyspace and not a
-      table.
-
-  Note that queries to create and drop an index are considered changes
-  updating the table the index is on.  Queries that create, alter, or drop
-  user-defined types (availble in Cassandra 2.1+) are considered changes
-  updating the keyspace the type is defined in.
-
-
-4.2.6. EVENT
-
-  And event pushed by the server. A client will only receive events for the
-  type it has REGISTER to. The body of an EVENT message will start by a
-  [string] representing the event type. The rest of the message depends on the
-  event type. The valid event types are:
-    - "TOPOLOGY_CHANGE": events related to change in the cluster topology.
-      Currently, events are sent when new nodes are added to the cluster, and
-      when nodes are removed. The body of the message (after the event type)
-      consists of a [string] and an [inet], corresponding respectively to the
-      type of change ("NEW_NODE", "REMOVED_NODE", or "MOVED_NODE") followed
-      by the address of the new/removed/moved node.
-    - "STATUS_CHANGE": events related to change of node status. Currently,
-      up/down events are sent. The body of the message (after the event type)
-      consists of a [string] and an [inet], corresponding respectively to the
-      type of status change ("UP" or "DOWN") followed by the address of the
-      concerned node.
-    - "SCHEMA_CHANGE": events related to schema change. The body of the message
-      (after the event type) consists of 3 [string] corresponding respectively
-      to the type of schema change ("CREATED", "UPDATED" or "DROPPED"),
-      followed by the name of the affected keyspace and the name of the
-      affected table within that keyspace. For changes that affect a keyspace
-      directly, the table name will be empty (i.e. the empty string "").
-      Changes to user-defined types (available in Cassandra 2.1+) will result
-      in an "UPDATED" change for the keyspace containing the type, and the
-      table name will be empty.
-
-  All EVENT message have a streamId of -1 (Section 2.3).
-
-  Please note that "NEW_NODE" and "UP" events are sent based on internal Gossip
-  communication and as such may be sent a short delay before the binary
-  protocol server on the newly up node is fully started. Clients are thus
-  advise to wait a short time before trying to connect to the node (1 seconds
-  should be enough), otherwise they may experience a connection refusal at
-  first.
-
-  It is possible for the same event to be sent multiple times. Therefore,
-  a client library should ignore the same event if it has already been notified
-  of a change.
-
-5. Compression
-
-  Frame compression is supported by the protocol, but then only the frame body
-  is compressed (the frame header should never be compressed).
-
-  Before being used, client and server must agree on a compression algorithm to
-  use, which is done in the STARTUP message. As a consequence, a STARTUP message
-  must never be compressed.  However, once the STARTUP frame has been received
-  by the server can be compressed (including the response to the STARTUP
-  request). Frame do not have to be compressed however, even if compression has
-  been agreed upon (a server may only compress frame above a certain size at its
-  discretion). A frame body should be compressed if and only if the compressed
-  flag (see Section 2.2) is set.
-
-
-6. Data Type Serialization Formats
-
-  This sections describes the serialization formats for all CQL data types
-  supported by Cassandra through the native protocol.  These serialization
-  formats should be used by client drivers to encode values for EXECUTE
-  messages.  Cassandra will use these formats when returning values in
-  RESULT messages.
-
-  All values are represented as [bytes] in EXECUTE and RESULT messages.
-  The [bytes] format includes an int prefix denoting the length of the value.
-  For that reason, the serialization formats described here will not include
-  a length component.
-
-  For legacy compatibility reasons, note that most non-string types support
-  "empty" values (i.e. a value with zero length).  An empty value is distinct
-  from NULL, which is encoded with a negative length.
-
-  As with the rest of the native protocol, all encodings are big-endian.
-
-6.1. ascii
-
-  A sequence of bytes in the ASCII range [0, 127].  Bytes with values outside of
-  this range will result in a validation error.
-
-6.2 bigint
-
-  An eight-byte two's complement integer.
-
-6.3 blob
-
-  Any sequence of bytes.
-
-6.4 boolean
-
-  A single byte.  A value of 0 denotes "false"; any other value denotes "true".
-  (However, it is recommended that a value of 1 be used to represent "true".)
-
-6.5 decimal
-
-  The decimal format represents an arbitrary-precision number.  It contains an
-  [int] "scale" component followed by a varint encoding (see section 6.17)
-  of the unscaled value.  The encoded value represents "<unscaled>E<-scale>".
-  In other words, "<unscaled> * 10 ^ (-1 * <scale>)".
-
-6.6 double
-
-  An eight-byte floating point number in the IEEE 754 binary64 format.
-
-6.7 float
-
-  An four-byte floating point number in the IEEE 754 binary32 format.
-
-6.8 inet
-
-  A 4 byte or 16 byte sequence denoting an IPv4 or IPv6 address, respectively.
-
-6.9 int
-
-  A four-byte two's complement integer.
-
-6.10 list
-
-  A [short] n indicating the number of elements in the list, followed by n
-  elements.  Each element is [short bytes] representing the serialized value.
-
-6.11 map
-
-  A [short] n indicating the number of key/value pairs in the map, followed by
-  n entries.  Each entry is composed of two [short bytes] representing the key
-  and value.
-
-6.12 set
-
-  A [short] n indicating the number of elements in the set, followed by n
-  elements.  Each element is [short bytes] representing the serialized value.
-
-6.13 text
-
-  A sequence of bytes conforming to the UTF-8 specifications.
-
-6.14 timestamp
-
-  An eight-byte two's complement integer representing a millisecond-precision
-  offset from the unix epoch (00:00:00, January 1st, 1970).  Negative values
-  represent a negative offset from the epoch.
-
-6.15 uuid
-
-  A 16 byte sequence representing any valid UUID as defined by RFC 4122.
-
-6.16 varchar
-
-  An alias of the "text" type.
-
-6.17 varint
-
-  A variable-length two's complement encoding of a signed integer.
-
-  The following examples may help implementors of this spec:
-
-  Value | Encoding
-  ------|---------
-      0 |     0x00
-      1 |     0x01
-    127 |     0x7F
-    128 |   0x0080
-     -1 |     0xFF
-   -128 |     0x80
-   -129 |   0xFF7F
-
-  Note that positive numbers must use a most-significant byte with a value
-  less than 0x80, because a most-significant bit of 1 indicates a negative
-  value.  Implementors should pad positive values that have a MSB >= 0x80
-  with a leading 0x00 byte.
-
-6.18 timeuuid
-
-  A 16 byte sequence representing a version 1 UUID as defined by RFC 4122.
-
-
-7. Error codes
-
-  The supported error codes are described below:
-    0x0000    Server error: something unexpected happened. This indicates a
-              server-side bug.
-    0x000A    Protocol error: some client message triggered a protocol
-              violation (for instance a QUERY message is sent before a STARTUP
-              one has been sent)
-    0x0100    Bad credentials: CREDENTIALS request failed because Cassandra
-              did not accept the provided credentials.
-
-    0x1000    Unavailable exception. The rest of the ERROR message body will be
-                <cl><required><alive>
-              where:
-                <cl> is the [consistency] level of the query having triggered
-                     the exception.
-                <required> is an [int] representing the number of node that
-                           should be alive to respect <cl>
-                <alive> is an [int] representing the number of replica that
-                        were known to be alive when the request has been
-                        processed (since an unavailable exception has been
-                        triggered, there will be <alive> < <required>)
-    0x1001    Overloaded: the request cannot be processed because the
-              coordinator node is overloaded
-    0x1002    Is_bootstrapping: the request was a read request but the
-              coordinator node is bootstrapping
-    0x1003    Truncate_error: error during a truncation error.
-    0x1100    Write_timeout: Timeout exception during a write request. The rest
-              of the ERROR message body will be
-                <cl><received><blockfor><writeType>
-              where:
-                <cl> is the [consistency] level of the query having triggered
-                     the exception.
-                <received> is an [int] representing the number of nodes having
-                           acknowledged the request.
-                <blockfor> is an [int] representing the number of replica whose
-                           acknowledgement is required to achieve <cl>.
-                <writeType> is a [string] that describe the type of the write
-                            that timeouted. The value of that string can be one
-                            of:
-                             - "SIMPLE": the write was a non-batched
-                               non-counter write.
-                             - "BATCH": the write was a (logged) batch write.
-                               If this type is received, it means the batch log
-                               has been successfully written (otherwise a
-                               "BATCH_LOG" type would have been send instead).
-                             - "UNLOGGED_BATCH": the write was an unlogged
-                               batch. Not batch log write has been attempted.
-                             - "COUNTER": the write was a counter write
-                               (batched or not).
-                             - "BATCH_LOG": the timeout occured during the
-                               write to the batch log when a (logged) batch
-                               write was requested.
-    0x1200    Read_timeout: Timeout exception during a read request. The rest
-              of the ERROR message body will be
-                <cl><received><blockfor><data_present>
-              where:
-                <cl> is the [consistency] level of the query having triggered
-                     the exception.
-                <received> is an [int] representing the number of nodes having
-                           answered the request.
-                <blockfor> is an [int] representing the number of replicas whose
-                           response is required to achieve <cl>. Please note that it
-                           is possible to have <received> >= <blockfor> if
-                           <data_present> is false. And also in the (unlikely)
-                           case were <cl> is achieved but the coordinator node
-                           timeout while waiting for read-repair
-                           acknowledgement.
-                <data_present> is a single byte. If its value is 0, it means
-                               the replica that was asked for data has not
-                               responded. Otherwise, the value is != 0.
-
-    0x2000    Syntax_error: The submitted query has a syntax error.
-    0x2100    Unauthorized: The logged user doesn't have the right to perform
-              the query.
-    0x2200    Invalid: The query is syntactically correct but invalid.
-    0x2300    Config_error: The query is invalid because of some configuration issue
-    0x2400    Already_exists: The query attempted to create a keyspace or a
-              table that was already existing. The rest of the ERROR message
-              body will be <ks><table> where:
-                <ks> is a [string] representing either the keyspace that
-                     already exists, or the keyspace in which the table that
-                     already exists is.
-                <table> is a [string] representing the name of the table that
-                        already exists. If the query was attempting to create a
-                        keyspace, <table> will be present but will be the empty
-                        string.
-    0x2500    Unprepared: Can be thrown while a prepared statement tries to be
-              executed if the provide prepared statement ID is not known by
-              this host. The rest of the ERROR message body will be [short
-              bytes] representing the unknown ID.

diff --git a/doc/native_protocol_v2.spec b/doc/native_protocol_v2.spec
deleted file mode 100644
index b9cc51f..0000000
--- a/doc/native_protocol_v2.spec
+++ /dev/null

@@ -1,954 +0,0 @@
-
-                             CQL BINARY PROTOCOL v2
-
-
-Table of Contents
-
-  1. Overview
-  2. Frame header
-    2.1. version
-    2.2. flags
-    2.3. stream
-    2.4. opcode
-    2.5. length
-  3. Notations
-  4. Messages
-    4.1. Requests
-      4.1.1. STARTUP
-      4.1.2. AUTH_RESPONSE
-      4.1.3. OPTIONS
-      4.1.4. QUERY
-      4.1.5. PREPARE
-      4.1.6. EXECUTE
-      4.1.7. BATCH
-      4.1.8. REGISTER
-    4.2. Responses
-      4.2.1. ERROR
-      4.2.2. READY
-      4.2.3. AUTHENTICATE
-      4.2.4. SUPPORTED
-      4.2.5. RESULT
-        4.2.5.1. Void
-        4.2.5.2. Rows
-        4.2.5.3. Set_keyspace
-        4.2.5.4. Prepared
-        4.2.5.5. Schema_change
-      4.2.6. EVENT
-      4.2.7. AUTH_CHALLENGE
-      4.2.8. AUTH_SUCCESS
-  5. Compression
-  6. Data Type Serialization Formats
-  7. Result paging
-  8. Error codes
-  9. Changes from v1
-
-
-1. Overview
-
-  The CQL binary protocol is a frame based protocol. Frames are defined as:
-
-      0         8        16        24        32
-      +---------+---------+---------+---------+
-      | version |  flags  | stream  | opcode  |
-      +---------+---------+---------+---------+
-      |                length                 |
-      +---------+---------+---------+---------+
-      |                                       |
-      .            ...  body ...              .
-      .                                       .
-      .                                       .
-      +----------------------------------------
-
-  The protocol is big-endian (network byte order).
-
-  Each frame contains a fixed size header (8 bytes) followed by a variable size
-  body. The header is described in Section 2. The content of the body depends
-  on the header opcode value (the body can in particular be empty for some
-  opcode values). The list of allowed opcode is defined Section 2.3 and the
-  details of each corresponding message is described Section 4.
-
-  The protocol distinguishes 2 types of frames: requests and responses. Requests
-  are those frame sent by the clients to the server, response are the ones sent
-  by the server. Note however that the protocol supports server pushes (events)
-  so responses does not necessarily come right after a client request.
-
-  Note to client implementors: clients library should always assume that the
-  body of a given frame may contain more data than what is described in this
-  document. It will however always be safe to ignore the remaining of the frame
-  body in such cases. The reason is that this may allow to sometimes extend the
-  protocol with optional features without needing to change the protocol
-  version.
-
-
-
-2. Frame header
-
-2.1. version
-
-  The version is a single byte that indicate both the direction of the message
-  (request or response) and the version of the protocol in use. The up-most bit
-  of version is used to define the direction of the message: 0 indicates a
-  request, 1 indicates a responses. This can be useful for protocol analyzers to
-  distinguish the nature of the packet from the direction which it is moving.
-  The rest of that byte is the protocol version (2 for the protocol defined in
-  this document). In other words, for this version of the protocol, version will
-  have one of:
-    0x02    Request frame for this protocol version
-    0x82    Response frame for this protocol version
-
-  Please note that the while every message ship with the version, only one version
-  of messages is accepted on a given connection. In other words, the first message
-  exchanged (STARTUP) sets the version for the connection for the lifetime of this
-  connection.
-
-  This document describe the version 2 of the protocol. For the changes made since
-  version 1, see Section 9.
-
-
-2.2. flags
-
-  Flags applying to this frame. The flags have the following meaning (described
-  by the mask that allow to select them):
-    0x01: Compression flag. If set, the frame body is compressed. The actual
-          compression to use should have been set up beforehand through the
-          Startup message (which thus cannot be compressed; Section 4.1.1).
-    0x02: Tracing flag. For a request frame, this indicate the client requires
-          tracing of the request. Note that not all requests support tracing.
-          Currently, only QUERY, PREPARE and EXECUTE queries support tracing.
-          Other requests will simply ignore the tracing flag if set. If a
-          request support tracing and the tracing flag was set, the response to
-          this request will have the tracing flag set and contain tracing
-          information.
-          If a response frame has the tracing flag set, its body contains
-          a tracing ID. The tracing ID is a [uuid] and is the first thing in
-          the frame body. The rest of the body will then be the usual body
-          corresponding to the response opcode.
-
-  The rest of the flags is currently unused and ignored.
-
-2.3. stream
-
-  A frame has a stream id (one signed byte). When sending request messages, this
-  stream id must be set by the client to a positive byte (negative stream id
-  are reserved for streams initiated by the server; currently all EVENT messages
-  (section 4.2.6) have a streamId of -1). If a client sends a request message
-  with the stream id X, it is guaranteed that the stream id of the response to
-  that message will be X.
-
-  This allow to deal with the asynchronous nature of the protocol. If a client
-  sends multiple messages simultaneously (without waiting for responses), there
-  is no guarantee on the order of the responses. For instance, if the client
-  writes REQ_1, REQ_2, REQ_3 on the wire (in that order), the server might
-  respond to REQ_3 (or REQ_2) first. Assigning different stream id to these 3
-  requests allows the client to distinguish to which request an received answer
-  respond to. As there can only be 128 different simultaneous stream, it is up
-  to the client to reuse stream id.
-
-  Note that clients are free to use the protocol synchronously (i.e. wait for
-  the response to REQ_N before sending REQ_N+1). In that case, the stream id
-  can be safely set to 0. Clients should also feel free to use only a subset of
-  the 128 maximum possible stream ids if it is simpler for those
-  implementation.
-
-2.4. opcode
-
-  An integer byte that distinguish the actual message:
-    0x00    ERROR
-    0x01    STARTUP
-    0x02    READY
-    0x03    AUTHENTICATE
-    0x05    OPTIONS
-    0x06    SUPPORTED
-    0x07    QUERY
-    0x08    RESULT
-    0x09    PREPARE
-    0x0A    EXECUTE
-    0x0B    REGISTER
-    0x0C    EVENT
-    0x0D    BATCH
-    0x0E    AUTH_CHALLENGE
-    0x0F    AUTH_RESPONSE
-    0x10    AUTH_SUCCESS
-
-  Messages are described in Section 4.
-
-  (Note that there is no 0x04 message in this version of the protocol)
-
-
-2.5. length
-
-  A 4 byte integer representing the length of the body of the frame (note:
-  currently a frame is limited to 256MB in length).
-
-
-3. Notations
-
-  To describe the layout of the frame body for the messages in Section 4, we
-  define the following:
-
-    [int]          A 4 byte integer
-    [short]        A 2 byte unsigned integer
-    [string]       A [short] n, followed by n bytes representing an UTF-8
-                   string.
-    [long string]  An [int] n, followed by n bytes representing an UTF-8 string.
-    [uuid]         A 16 bytes long uuid.
-    [string list]  A [short] n, followed by n [string].
-    [bytes]        A [int] n, followed by n bytes if n >= 0. If n < 0,
-                   no byte should follow and the value represented is `null`.
-    [short bytes]  A [short] n, followed by n bytes if n >= 0.
-
-    [option]       A pair of <id><value> where <id> is a [short] representing
-                   the option id and <value> depends on that option (and can be
-                   of size 0). The supported id (and the corresponding <value>)
-                   will be described when this is used.
-    [option list]  A [short] n, followed by n [option].
-    [inet]         An address (ip and port) to a node. It consists of one
-                   [byte] n, that represents the address size, followed by n
-                   [byte] representing the IP address (in practice n can only be
-                   either 4 (IPv4) or 16 (IPv6)), following by one [int]
-                   representing the port.
-    [consistency]  A consistency level specification. This is a [short]
-                   representing a consistency level with the following
-                   correspondance:
-                     0x0000    ANY
-                     0x0001    ONE
-                     0x0002    TWO
-                     0x0003    THREE
-                     0x0004    QUORUM
-                     0x0005    ALL
-                     0x0006    LOCAL_QUORUM
-                     0x0007    EACH_QUORUM
-                     0x0008    SERIAL
-                     0x0009    LOCAL_SERIAL
-                     0x000A    LOCAL_ONE
-
-    [string map]      A [short] n, followed by n pair <k><v> where <k> and <v>
-                      are [string].
-    [string multimap] A [short] n, followed by n pair <k><v> where <k> is a
-                      [string] and <v> is a [string list].
-
-
-4. Messages
-
-4.1. Requests
-
-  Note that outside of their normal responses (described below), all requests
-  can get an ERROR message (Section 4.2.1) as response.
-
-4.1.1. STARTUP
-
-  Initialize the connection. The server will respond by either a READY message
-  (in which case the connection is ready for queries) or an AUTHENTICATE message
-  (in which case credentials will need to be provided using AUTH_RESPONSE).
-
-  This must be the first message of the connection, except for OPTIONS that can
-  be sent before to find out the options supported by the server. Once the
-  connection has been initialized, a client should not send any more STARTUP
-  message.
-
-  The body is a [string map] of options. Possible options are:
-    - "CQL_VERSION": the version of CQL to use. This option is mandatory and
-      currenty, the only version supported is "3.0.0". Note that this is
-      different from the protocol version.
-    - "COMPRESSION": the compression algorithm to use for frames (See section 5).
-      This is optional, if not specified no compression will be used.
-
-
-4.1.2. AUTH_RESPONSE
-
-  Answers a server authentication challenge.
-
-  Authentication in the protocol is SASL based. The server sends authentication
-  challenges (a bytes token) to which the client answer with this message. Those
-  exchanges continue until the server accepts the authentication by sending a
-  AUTH_SUCCESS message after a client AUTH_RESPONSE. It is however that client that
-  initiate the exchange by sending an initial AUTH_RESPONSE in response to a
-  server AUTHENTICATE request.
-
-  The body of this message is a single [bytes] token. The details of what this
-  token contains (and when it can be null/empty, if ever) depends on the actual
-  authenticator used.
-
-  The response to a AUTH_RESPONSE is either a follow-up AUTH_CHALLENGE message,
-  an AUTH_SUCCESS message or an ERROR message.
-
-
-4.1.3. OPTIONS
-
-  Asks the server to return what STARTUP options are supported. The body of an
-  OPTIONS message should be empty and the server will respond with a SUPPORTED
-  message.
-
-
-4.1.4. QUERY
-
-  Performs a CQL query. The body of the message must be:
-    <query><query_parameters>
-  where <query> is a [long string] representing the query and
-  <query_parameters> must be
-    <consistency><flags>[<n><value_1>...<value_n>][<result_page_size>][<paging_state>][<serial_consistency>]
-  where:
-    - <consistency> is the [consistency] level for the operation.
-    - <flags> is a [byte] whose bits define the options for this query and
-      in particular influence what the remainder of the message contains.
-      A flag is set if the bit corresponding to its `mask` is set. Supported
-      flags are, given there mask:
-        0x01: Values. In that case, a [short] <n> followed by <n> [bytes]
-              values are provided. Those value are used for bound variables in
-              the query.
-        0x02: Skip_metadata. If present, the Result Set returned as a response
-              to that query (if any) will have the NO_METADATA flag (see
-              Section 4.2.5.2).
-        0x04: Page_size. In that case, <result_page_size> is an [int]
-              controlling the desired page size of the result (in CQL3 rows).
-              See the section on paging (Section 7) for more details.
-        0x08: With_paging_state. If present, <paging_state> should be present.
-              <paging_state> is a [bytes] value that should have been returned
-              in a result set (Section 4.2.5.2). If provided, the query will be
-              executed but starting from a given paging state. This also to
-              continue paging on a different node from the one it has been
-              started (See Section 7 for more details).
-        0x10: With serial consistency. If present, <serial_consistency> should be
-              present. <serial_consistency> is the [consistency] level for the
-              serial phase of conditional updates. That consitency can only be
-              either SERIAL or LOCAL_SERIAL and if not present, it defaults to
-              SERIAL. This option will be ignored for anything else that a
-              conditional update/insert.
-
-  Note that the consistency is ignored by some queries (USE, CREATE, ALTER,
-  TRUNCATE, ...).
-
-  The server will respond to a QUERY message with a RESULT message, the content
-  of which depends on the query.
-
-
-4.1.5. PREPARE
-
-  Prepare a query for later execution (through EXECUTE). The body consists of
-  the CQL query to prepare as a [long string].
-
-  The server will respond with a RESULT message with a `prepared` kind (0x0004,
-  see Section 4.2.5).
-
-
-4.1.6. EXECUTE
-
-  Executes a prepared query. The body of the message must be:
-    <id><query_parameters>
-  where <id> is the prepared query ID. It's the [short bytes] returned as a
-  response to a PREPARE message. As for <query_parameters>, it has the exact
-  same definition than in QUERY (see Section 4.1.4).
-
-  The response from the server will be a RESULT message.
-
-
-4.1.7. BATCH
-
-  Allows executing a list of queries (prepared or not) as a batch (note that
-  only DML statements are accepted in a batch). The body of the message must
-  be:
-    <type><n><query_1>...<query_n><consistency>
-  where:
-    - <type> is a [byte] indicating the type of batch to use:
-        - If <type> == 0, the batch will be "logged". This is equivalent to a
-          normal CQL3 batch statement.
-        - If <type> == 1, the batch will be "unlogged".
-        - If <type> == 2, the batch will be a "counter" batch (and non-counter
-          statements will be rejected).
-    - <n> is a [short] indicating the number of following queries.
-    - <query_1>...<query_n> are the queries to execute. A <query_i> must be of the
-      form:
-        <kind><string_or_id><n><value_1>...<value_n>
-      where:
-       - <kind> is a [byte] indicating whether the following query is a prepared
-         one or not. <kind> value must be either 0 or 1.
-       - <string_or_id> depends on the value of <kind>. If <kind> == 0, it should be
-         a [long string] query string (as in QUERY, the query string might contain
-         bind markers). Otherwise (that is, if <kind> == 1), it should be a
-         [short bytes] representing a prepared query ID.
-       - <n> is a [short] indicating the number (possibly 0) of following values.
-       - <value_1>...<value_n> are the [bytes] to use for bound variables.
-    - <consistency> is the [consistency] level for the operation.
-
-  The server will respond with a RESULT message with a `Void` kind (0x0001,
-  see Section 4.2.5).
-
-
-4.1.8. REGISTER
-
-  Register this connection to receive some type of events. The body of the
-  message is a [string list] representing the event types to register to. See
-  section 4.2.6 for the list of valid event types.
-
-  The response to a REGISTER message will be a READY message.
-
-  Please note that if a client driver maintains multiple connections to a
-  Cassandra node and/or connections to multiple nodes, it is advised to
-  dedicate a handful of connections to receive events, but to *not* register
-  for events on all connections, as this would only result in receiving
-  multiple times the same event messages, wasting bandwidth.
-
-
-4.2. Responses
-
-  This section describes the content of the frame body for the different
-  responses. Please note that to make room for future evolution, clients should
-  support extra informations (that they should simply discard) to the one
-  described in this document at the end of the frame body.
-
-4.2.1. ERROR
-
-  Indicates an error processing a request. The body of the message will be an
-  error code ([int]) followed by a [string] error message. Then, depending on
-  the exception, more content may follow. The error codes are defined in
-  Section 8, along with their additional content if any.
-
-
-4.2.2. READY
-
-  Indicates that the server is ready to process queries. This message will be
-  sent by the server either after a STARTUP message if no authentication is
-  required, or after a successful CREDENTIALS message.
-
-  The body of a READY message is empty.
-
-
-4.2.3. AUTHENTICATE
-
-  Indicates that the server require authentication, and which authentication
-  mechanism to use.
-
-  The authentication is SASL based and thus consists on a number of server
-  challenges (AUTH_CHALLENGE, Section 4.2.7) followed by client responses
-  (AUTH_RESPONSE, Section 4.1.2). The Initial exchange is however boostrapped
-  by an initial client response. The details of that exchange (including how
-  much challenge-response pair are required) are specific to the authenticator
-  in use. The exchange ends when the server sends an AUTH_SUCCESS message or
-  an ERROR message.
-
-  This message will be sent following a STARTUP message if authentication is
-  required and must be answered by a AUTH_RESPONSE message from the client.
-
-  The body consists of a single [string] indicating the full class name of the
-  IAuthenticator in use.
-
-
-4.2.4. SUPPORTED
-
-  Indicates which startup options are supported by the server. This message
-  comes as a response to an OPTIONS message.
-
-  The body of a SUPPORTED message is a [string multimap]. This multimap gives
-  for each of the supported STARTUP options, the list of supported values.
-
-
-4.2.5. RESULT
-
-  The result to a query (QUERY, PREPARE, EXECUTE or BATCH messages).
-
-  The first element of the body of a RESULT message is an [int] representing the
-  `kind` of result. The rest of the body depends on the kind. The kind can be
-  one of:
-    0x0001    Void: for results carrying no information.
-    0x0002    Rows: for results to select queries, returning a set of rows.
-    0x0003    Set_keyspace: the result to a `use` query.
-    0x0004    Prepared: result to a PREPARE message.
-    0x0005    Schema_change: the result to a schema altering query.
-
-  The body for each kind (after the [int] kind) is defined below.
-
-
-4.2.5.1. Void
-
-  The rest of the body for a Void result is empty. It indicates that a query was
-  successful without providing more information.
-
-
-4.2.5.2. Rows
-
-  Indicates a set of rows. The rest of body of a Rows result is:
-    <metadata><rows_count><rows_content>
-  where:
-    - <metadata> is composed of:
-        <flags><columns_count>[<paging_state>][<global_table_spec>?<col_spec_1>...<col_spec_n>]
-      where:
-        - <flags> is an [int]. The bits of <flags> provides information on the
-          formatting of the remaining informations. A flag is set if the bit
-          corresponding to its `mask` is set. Supported flags are, given there
-          mask:
-            0x0001    Global_tables_spec: if set, only one table spec (keyspace
-                      and table name) is provided as <global_table_spec>. If not
-                      set, <global_table_spec> is not present.
-            0x0002    Has_more_pages: indicates whether this is not the last
-                      page of results and more should be retrieve. If set, the
-                      <paging_state> will be present. The <paging_state> is a
-                      [bytes] value that should be used in QUERY/EXECUTE to
-                      continue paging and retrieve the remained of the result for
-                      this query (See Section 7 for more details).
-            0x0004    No_metadata: if set, the <metadata> is only composed of
-                      these <flags>, the <column_count> and optionally the
-                      <paging_state> (depending on the Has_more_pages flage) but
-                      no other information (so no <global_table_spec> nor <col_spec_i>).
-                      This will only ever be the case if this was requested
-                      during the query (see QUERY and RESULT messages).
-        - <columns_count> is an [int] representing the number of columns selected
-          by the query this result is of. It defines the number of <col_spec_i>
-          elements in and the number of element for each row in <rows_content>.
-        - <global_table_spec> is present if the Global_tables_spec is set in
-          <flags>. If present, it is composed of two [string] representing the
-          (unique) keyspace name and table name the columns return are of.
-        - <col_spec_i> specifies the columns returned in the query. There is
-          <column_count> such column specifications that are composed of:
-            (<ksname><tablename>)?<name><type>
-          The initial <ksname> and <tablename> are two [string] are only present
-          if the Global_tables_spec flag is not set. The <column_name> is a
-          [string] and <type> is an [option] that correspond to the description
-          (what this description is depends a bit on the context: in results to
-          selects, this will be either the user chosen alias or the selection used
-          (often a colum name, but it can be a function call too). In results to
-          a PREPARE, this will be either the name of the bind variable corresponding
-          or the column name for the variable if it is "anonymous") and type of
-          the corresponding result. The option for <type> is either a native
-          type (see below), in which case the option has no value, or a
-          'custom' type, in which case the value is a [string] representing
-          the full qualified class name of the type represented. Valid option
-          ids are:
-            0x0000    Custom: the value is a [string], see above.
-            0x0001    Ascii
-            0x0002    Bigint
-            0x0003    Blob
-            0x0004    Boolean
-            0x0005    Counter
-            0x0006    Decimal
-            0x0007    Double
-            0x0008    Float
-            0x0009    Int
-            0x000A    Text
-            0x000B    Timestamp
-            0x000C    Uuid
-            0x000D    Varchar
-            0x000E    Varint
-            0x000F    Timeuuid
-            0x0010    Inet
-            0x0020    List: the value is an [option], representing the type
-                            of the elements of the list.
-            0x0021    Map: the value is two [option], representing the types of the
-                           keys and values of the map
-            0x0022    Set: the value is an [option], representing the type
-                            of the elements of the set
-    - <rows_count> is an [int] representing the number of rows present in this
-      result. Those rows are serialized in the <rows_content> part.
-    - <rows_content> is composed of <row_1>...<row_m> where m is <rows_count>.
-      Each <row_i> is composed of <value_1>...<value_n> where n is
-      <columns_count> and where <value_j> is a [bytes] representing the value
-      returned for the jth column of the ith row. In other words, <rows_content>
-      is composed of (<rows_count> * <columns_count>) [bytes].
-
-
-4.2.5.3. Set_keyspace
-
-  The result to a `use` query. The body (after the kind [int]) is a single
-  [string] indicating the name of the keyspace that has been set.
-
-
-4.2.5.4. Prepared
-
-  The result to a PREPARE message. The rest of the body of a Prepared result is:
-    <id><metadata><result_metadata>
-  where:
-    - <id> is [short bytes] representing the prepared query ID.
-    - <metadata> is defined exactly as for a Rows RESULT (See section 4.2.5.2; you
-      can however assume that the Has_more_pages flag is always off) and
-      is the specification for the variable bound in this prepare statement.
-    - <result_metadata> is defined exactly as <metadata> but correspond to the
-      metadata for the resultSet that execute this query will yield. Note that
-      <result_metadata> may be empty (have the No_metadata flag and 0 columns, See
-      section 4.2.5.2) and will be for any query that is not a Select. There is
-      in fact never a guarantee that this will non-empty so client should protect
-      themselves accordingly. The presence of this information is an
-      optimization that allows to later execute the statement that has been
-      prepared without requesting the metadata (Skip_metadata flag in EXECUTE).
-      Clients can safely discard this metadata if they do not want to take
-      advantage of that optimization.
-
-  Note that prepared query ID return is global to the node on which the query
-  has been prepared. It can be used on any connection to that node and this
-  until the node is restarted (after which the query must be reprepared).
-
-4.2.5.5. Schema_change
-
-  The result to a schema altering query (creation/update/drop of a
-  keyspace/table/index). The body (after the kind [int]) is composed of 3
-  [string]:
-    <change><keyspace><table>
-  where:
-    - <change> describe the type of change that has occured. It can be one of
-      "CREATED", "UPDATED" or "DROPPED".
-    - <keyspace> is the name of the affected keyspace or the keyspace of the
-      affected table.
-    - <table> is the name of the affected table. <table> will be empty (i.e.
-      the empty string "") if the change was affecting a keyspace and not a
-      table.
-
-  Note that queries to create and drop an index are considered changes
-  updating the table the index is on.  Queries that create, alter, or drop
-  user-defined types (availble in Cassandra 2.1+) are considered changes
-  updating the keyspace the type is defined in.
-
-
-4.2.6. EVENT
-
-  And event pushed by the server. A client will only receive events for the
-  type it has REGISTER to. The body of an EVENT message will start by a
-  [string] representing the event type. The rest of the message depends on the
-  event type. The valid event types are:
-    - "TOPOLOGY_CHANGE": events related to change in the cluster topology.
-      Currently, events are sent when new nodes are added to the cluster, and
-      when nodes are removed. The body of the message (after the event type)
-      consists of a [string] and an [inet], corresponding respectively to the
-      type of change ("NEW_NODE", "REMOVED_NODE", or "MOVED_NODE") followed
-      by the address of the new/removed/moved node.
-    - "STATUS_CHANGE": events related to change of node status. Currently,
-      up/down events are sent. The body of the message (after the event type)
-      consists of a [string] and an [inet], corresponding respectively to the
-      type of status change ("UP" or "DOWN") followed by the address of the
-      concerned node.
-    - "SCHEMA_CHANGE": events related to schema change. The body of the message
-      (after the event type) consists of 3 [string] corresponding respectively
-      to the type of schema change ("CREATED", "UPDATED" or "DROPPED"),
-      followed by the name of the affected keyspace and the name of the
-      affected table within that keyspace. For changes that affect a keyspace
-      directly, the table name will be empty (i.e. the empty string "").
-      Changes to user-defined types (available in Cassandra 2.1+) will result
-      in an "UPDATED" change for the keyspace containing the type, and the
-      table name will be empty.
-
-  All EVENT message have a streamId of -1 (Section 2.3).
-
-  Please note that "NEW_NODE" and "UP" events are sent based on internal Gossip
-  communication and as such may be sent a short delay before the binary
-  protocol server on the newly up node is fully started. Clients are thus
-  advise to wait a short time before trying to connect to the node (1 seconds
-  should be enough), otherwise they may experience a connection refusal at
-  first.
-
-  It is possible for the same event to be sent multiple times. Therefore,
-  a client library should ignore the same event if it has already been notified
-  of a change.
-
-4.2.7. AUTH_CHALLENGE
-
-  A server authentication challenge (see AUTH_RESPONSE (Section 4.1.2) for more
-  details).
-
-  The body of this message is a single [bytes] token. The details of what this
-  token contains (and when it can be null/empty, if ever) depends on the actual
-  authenticator used.
-
-  Clients are expected to answer the server challenge by an AUTH_RESPONSE
-  message.
-
-4.2.7. AUTH_SUCCESS
-
-  Indicate the success of the authentication phase. See Section 4.2.3 for more
-  details.
-
-  The body of this message is a single [bytes] token holding final information
-  from the server that the client may require to finish the authentication
-  process. What that token contains and whether it can be null depends on the
-  actual authenticator used.
-
-
-5. Compression
-
-  Frame compression is supported by the protocol, but then only the frame body
-  is compressed (the frame header should never be compressed).
-
-  Before being used, client and server must agree on a compression algorithm to
-  use, which is done in the STARTUP message. As a consequence, a STARTUP message
-  must never be compressed.  However, once the STARTUP frame has been received
-  by the server can be compressed (including the response to the STARTUP
-  request). Frame do not have to be compressed however, even if compression has
-  been agreed upon (a server may only compress frame above a certain size at its
-  discretion). A frame body should be compressed if and only if the compressed
-  flag (see Section 2.2) is set.
-
-  As of this version 2 of the protocol, the following compressions are available:
-    - lz4 (https://code.google.com/p/lz4/). In that, note that the 4 first bytes
-      of the body will be the uncompressed length (followed by the compressed
-      bytes).
-    - snappy (https://code.google.com/p/snappy/). This compression might not be
-      available as it depends on a native lib (server-side) that might not be
-      avaivable on some installation.
-
-
-6. Data Type Serialization Formats
-
-  This sections describes the serialization formats for all CQL data types
-  supported by Cassandra through the native protocol.  These serialization
-  formats should be used by client drivers to encode values for EXECUTE
-  messages.  Cassandra will use these formats when returning values in
-  RESULT messages.
-
-  All values are represented as [bytes] in EXECUTE and RESULT messages.
-  The [bytes] format includes an int prefix denoting the length of the value.
-  For that reason, the serialization formats described here will not include
-  a length component.
-
-  For legacy compatibility reasons, note that most non-string types support
-  "empty" values (i.e. a value with zero length).  An empty value is distinct
-  from NULL, which is encoded with a negative length.
-
-  As with the rest of the native protocol, all encodings are big-endian.
-
-6.1. ascii
-
-  A sequence of bytes in the ASCII range [0, 127].  Bytes with values outside of
-  this range will result in a validation error.
-
-6.2 bigint
-
-  An eight-byte two's complement integer.
-
-6.3 blob
-
-  Any sequence of bytes.
-
-6.4 boolean
-
-  A single byte.  A value of 0 denotes "false"; any other value denotes "true".
-  (However, it is recommended that a value of 1 be used to represent "true".)
-
-6.5 decimal
-
-  The decimal format represents an arbitrary-precision number.  It contains an
-  [int] "scale" component followed by a varint encoding (see section 6.17)
-  of the unscaled value.  The encoded value represents "<unscaled>E<-scale>".
-  In other words, "<unscaled> * 10 ^ (-1 * <scale>)".
-
-6.6 double
-
-  An eight-byte floating point number in the IEEE 754 binary64 format.
-
-6.7 float
-
-  An four-byte floating point number in the IEEE 754 binary32 format.
-
-6.8 inet
-
-  A 4 byte or 16 byte sequence denoting an IPv4 or IPv6 address, respectively.
-
-6.9 int
-
-  A four-byte two's complement integer.
-
-6.10 list
-
-  A [short] n indicating the number of elements in the list, followed by n
-  elements.  Each element is [short bytes] representing the serialized value.
-
-6.11 map
-
-  A [short] n indicating the number of key/value pairs in the map, followed by
-  n entries.  Each entry is composed of two [short bytes] representing the key
-  and value.
-
-6.12 set
-
-  A [short] n indicating the number of elements in the set, followed by n
-  elements.  Each element is [short bytes] representing the serialized value.
-
-6.13 text
-
-  A sequence of bytes conforming to the UTF-8 specifications.
-
-6.14 timestamp
-
-  An eight-byte two's complement integer representing a millisecond-precision
-  offset from the unix epoch (00:00:00, January 1st, 1970).  Negative values
-  represent a negative offset from the epoch.
-
-6.15 uuid
-
-  A 16 byte sequence representing any valid UUID as defined by RFC 4122.
-
-6.16 varchar
-
-  An alias of the "text" type.
-
-6.17 varint
-
-  A variable-length two's complement encoding of a signed integer.
-
-  The following examples may help implementors of this spec:
-
-  Value | Encoding
-  ------|---------
-      0 |     0x00
-      1 |     0x01
-    127 |     0x7F
-    128 |   0x0080
-    129 |   0x0081
-     -1 |     0xFF
-   -128 |     0x80
-   -129 |   0xFF7F
-
-  Note that positive numbers must use a most-significant byte with a value
-  less than 0x80, because a most-significant bit of 1 indicates a negative
-  value.  Implementors should pad positive values that have a MSB >= 0x80
-  with a leading 0x00 byte.
-
-6.18 timeuuid
-
-  A 16 byte sequence representing a version 1 UUID as defined by RFC 4122.
-
-
-7. Result paging
-
-  The protocol allows for paging the result of queries. For that, the QUERY and
-  EXECUTE messages have a <result_page_size> value that indicate the desired
-  page size in CQL3 rows.
-
-  If a positive value is provided for <result_page_size>, the result set of the
-  RESULT message returned for the query will contain at most the
-  <result_page_size> first rows of the query result. If that first page of result
-  contains the full result set for the query, the RESULT message (of kind `Rows`)
-  will have the Has_more_pages flag *not* set. However, if some results are not
-  part of the first response, the Has_more_pages flag will be set and the result
-  will contain a <paging_state> value. In that case, the <paging_state> value
-  should be used in a QUERY or EXECUTE message (that has the *same* query than
-  the original one or the behavior is undefined) to retrieve the next page of
-  results.
-
-  Only CQL3 queries that return a result set (RESULT message with a Rows `kind`)
-  support paging. For other type of queries, the <result_page_size> value is
-  ignored.
-
-  Note to client implementors:
-  - While <result_page_size> can be as low as 1, it will likely be detrimental
-    to performance to pick a value too low. A value below 100 is probably too
-    low for most use cases.
-  - Clients should not rely on the actual size of the result set returned to
-    decide if there is more result to fetch or not. Instead, they should always
-    check the Has_more_pages flag (unless they did not enabled paging for the query
-    obviously). Clients should also not assert that no result will have more than
-    <result_page_size> results. While the current implementation always respect
-    the exact value of <result_page_size>, we reserve ourselves the right to return
-    slightly smaller or bigger pages in the future for performance reasons.
-
-
-8. Error codes
-
-  The supported error codes are described below:
-    0x0000    Server error: something unexpected happened. This indicates a
-              server-side bug.
-    0x000A    Protocol error: some client message triggered a protocol
-              violation (for instance a QUERY message is sent before a STARTUP
-              one has been sent)
-    0x0100    Bad credentials: CREDENTIALS request failed because Cassandra
-              did not accept the provided credentials.
-
-    0x1000    Unavailable exception. The rest of the ERROR message body will be
-                <cl><required><alive>
-              where:
-                <cl> is the [consistency] level of the query having triggered
-                     the exception.
-                <required> is an [int] representing the number of node that
-                           should be alive to respect <cl>
-                <alive> is an [int] representing the number of replica that
-                        were known to be alive when the request has been
-                        processed (since an unavailable exception has been
-                        triggered, there will be <alive> < <required>)
-    0x1001    Overloaded: the request cannot be processed because the
-              coordinator node is overloaded
-    0x1002    Is_bootstrapping: the request was a read request but the
-              coordinator node is bootstrapping
-    0x1003    Truncate_error: error during a truncation error.
-    0x1100    Write_timeout: Timeout exception during a write request. The rest
-              of the ERROR message body will be
-                <cl><received><blockfor><writeType>
-              where:
-                <cl> is the [consistency] level of the query having triggered
-                     the exception.
-                <received> is an [int] representing the number of nodes having
-                           acknowledged the request.
-                <blockfor> is an [int] representing the number of replica whose
-                           acknowledgement is required to achieve <cl>.
-                <writeType> is a [string] that describe the type of the write
-                            that timeouted. The value of that string can be one
-                            of:
-                             - "SIMPLE": the write was a non-batched
-                               non-counter write.
-                             - "BATCH": the write was a (logged) batch write.
-                               If this type is received, it means the batch log
-                               has been successfully written (otherwise a
-                               "BATCH_LOG" type would have been send instead).
-                             - "UNLOGGED_BATCH": the write was an unlogged
-                               batch. Not batch log write has been attempted.
-                             - "COUNTER": the write was a counter write
-                               (batched or not).
-                             - "BATCH_LOG": the timeout occured during the
-                               write to the batch log when a (logged) batch
-                               write was requested.
-    0x1200    Read_timeout: Timeout exception during a read request. The rest
-              of the ERROR message body will be
-                <cl><received><blockfor><data_present>
-              where:
-                <cl> is the [consistency] level of the query having triggered
-                     the exception.
-                <received> is an [int] representing the number of nodes having
-                           answered the request.
-                <blockfor> is an [int] representing the number of replica whose
-                           response is required to achieve <cl>. Please note that it
-                           is possible to have <received> >= <blockfor> if
-                           <data_present> is false. And also in the (unlikely)
-                           case were <cl> is achieved but the coordinator node
-                           timeout while waiting for read-repair
-                           acknowledgement.
-                <data_present> is a single byte. If its value is 0, it means
-                               the replica that was asked for data has not
-                               responded. Otherwise, the value is != 0.
-
-    0x2000    Syntax_error: The submitted query has a syntax error.
-    0x2100    Unauthorized: The logged user doesn't have the right to perform
-              the query.
-    0x2200    Invalid: The query is syntactically correct but invalid.
-    0x2300    Config_error: The query is invalid because of some configuration issue
-    0x2400    Already_exists: The query attempted to create a keyspace or a
-              table that was already existing. The rest of the ERROR message
-              body will be <ks><table> where:
-                <ks> is a [string] representing either the keyspace that
-                     already exists, or the keyspace in which the table that
-                     already exists is.
-                <table> is a [string] representing the name of the table that
-                        already exists. If the query was attempting to create a
-                        keyspace, <table> will be present but will be the empty
-                        string.
-    0x2500    Unprepared: Can be thrown while a prepared statement tries to be
-              executed if the provide prepared statement ID is not known by
-              this host. The rest of the ERROR message body will be [short
-              bytes] representing the unknown ID.
-
-9. Changes from v1
-  * Protocol is versioned to allow old client connects to a newer server, if a
-    newer client connects to an older server, it needs to check if it gets a
-    ProtocolException on connection and try connecting with a lower version.
-  * A query can now have bind variables even though the statement is not
-    prepared; see Section 4.1.4.
-  * A new BATCH message allows to batch a set of queries (prepared or not); see 
-    Section 4.1.7.
-  * Authentication now uses SASL. Concretely, the CREDENTIALS message has been
-    removed and replaced by a server/client challenges/responses exchanges (done
-    through the new AUTH_RESPONSE/AUTH_CHALLENGE messages). See Section 4.2.3 for
-    details.
-  * Query paging has been added (Section 7): QUERY and EXECUTE message have an
-    additional <result_page_size> [int] and <paging_state> [bytes], and
-    the Rows kind of RESULT message has an additional flag and <paging_state> 
-    value. Note that paging is optional, and a client that do not want to handle
-    can simply avoid including the Page_size flag and parameter in QUERY and
-    EXECUTE.
-  * QUERY and EXECUTE statements can request for the metadata to be skipped in
-    the result set returned (for efficiency reasons) if said metadata are known
-    in advance. Furthermore, the result to a PREPARE (section 4.2.5.4) now
-    includes the metadata for the result of executing the statement just
-    prepared (though those metadata will be empty for non SELECT statements).

diff --git a/doc/native_protocol_v3.spec b/doc/native_protocol_v3.spec
index 087e138..0d7f94d 100644
--- a/doc/native_protocol_v3.spec
+++ b/doc/native_protocol_v3.spec

@@ -65,7 +65,7 @@
   Each frame contains a fixed size header (9 bytes) followed by a variable size
   body. The header is described in Section 2. The content of the body depends
   on the header opcode value (the body can in particular be empty for some
-  opcode values). The list of allowed opcode is defined Section 2.3 and the
+  opcode values). The list of allowed opcode is defined Section 2.4 and the
   details of each corresponding message is described Section 4.
 
   The protocol distinguishes 2 types of frames: requests and responses. Requests
@@ -921,6 +921,9 @@
     <result_page_size> results. While the current implementation always respect
     the exact value of <result_page_size>, we reserve ourselves the right to return
     slightly smaller or bigger pages in the future for performance reasons.
+  - The <paging_state> is specific to a protocol version and drivers should not
+    send a <paging_state> returned by a node using the protocol v3 to query a node
+    using the protocol v4 for instance.
 
 
 9. Error codes
@@ -976,6 +979,8 @@
                              - "BATCH_LOG": the timeout occured during the
                                write to the batch log when a (logged) batch
                                write was requested.
+                             - "CAS": the timeout occured during the Compare And Set
+                               write/update.
     0x1200    Read_timeout: Timeout exception during a read request. The rest
               of the ERROR message body will be
                 <cl><received><blockfor><data_present>

diff --git a/doc/native_protocol_v4.spec b/doc/native_protocol_v4.spec
index 187ff80..8beb77b 100644
--- a/doc/native_protocol_v4.spec
+++ b/doc/native_protocol_v4.spec

@@ -65,7 +65,7 @@
   Each frame contains a fixed size header (9 bytes) followed by a variable size
   body. The header is described in Section 2. The content of the body depends
   on the header opcode value (the body can in particular be empty for some
-  opcode values). The list of allowed opcodes is defined in Section 2.3 and the
+  opcode values). The list of allowed opcodes is defined in Section 2.4 and the
   details of each corresponding message are described Section 4.
 
   The protocol distinguishes two types of frames: requests and responses. Requests
@@ -271,6 +271,13 @@
       different from the protocol version.
     - "COMPRESSION": the compression algorithm to use for frames (See section 5).
       This is optional; if not specified no compression will be used.
+    - "NO_COMPACT": whether or not connection has to be established in compatibility
+      mode. This mode will make all Thrift and Compact Tables to be exposed as if
+      they were CQL Tables. This is optional; if not specified, the option will
+      not be used.
+    - "THROW_ON_OVERLOAD": In case of server overloaded with too many requests, by default the server puts
+            back pressure on the client connection. Instead, the server can send an OverloadedException error message back to
+            the client if this option is set to true.
 
 
 4.1.2. AUTH_RESPONSE
@@ -1004,8 +1011,8 @@
     the exact value of <result_page_size>, we reserve the right to return
     slightly smaller or bigger pages in the future for performance reasons.
   - The <paging_state> is specific to a protocol version and drivers should not
-    send a <paging_state> returned by a node using protocol v3 to query a node
-    using protocol v4 for instance.
+    send a <paging_state> returned by a node using the protocol v3 to query a node
+    using the protocol v4 for instance.
 
 
 9. Error codes
@@ -1131,6 +1138,8 @@
                              - "BATCH_LOG": the failure occured during the
                                write to the batch log when a (logged) batch
                                write was requested.
+                             - "CAS": the timeout occured during the Compare And Set
+                               write/update.
 
     0x2000    Syntax_error: The submitted query has a syntax error.
     0x2100    Unauthorized: The logged user doesn't have the right to perform
@@ -1169,3 +1178,4 @@
   * The <paging_state> returned in the v4 protocol is not compatible with the v3
     protocol. In other words, a <paging_state> returned by a node using protocol v4
     should not be used to query a node using protocol v3 (and vice-versa).
+  * Added THROW_ON_OVERLOAD startup option (Section 4.1.1).

diff --git a/examples/hadoop_word_count/README.txt b/examples/hadoop_word_count/README.txt
deleted file mode 100644
index e336b89..0000000
--- a/examples/hadoop_word_count/README.txt
+++ /dev/null

@@ -1,50 +0,0 @@
-Introduction
-============
-
-WordCount hadoop example: Inserts a bunch of words across multiple rows,
-and counts them, with RandomPartitioner. The word_count_counters example sums
-the value of counter columns for a key.
-
-The scripts in bin/ assume you are running with cwd of examples/word_count.
-
-
-Running
-=======
-
-First build and start a Cassandra server with the default configuration*. Ensure that the Thrift
-interface is enabled, either by setting start_rpc:true in cassandra.yaml or by running
-`nodetool enablethrift` after startup.
-Once Cassandra has started and the Thrift interface is available, run
-
-contrib/word_count$ ant
-contrib/word_count$ bin/word_count_setup
-contrib/word_count$ bin/word_count
-contrib/word_count$ bin/word_count_counters
-
-In order to view the results in Cassandra, one can use bin/cqlsh and
-perform the following operations:
-$ bin/cqlsh localhost
-> use wordcount;
-> select * from output_words;
-
-The output of the word count can now be configured. In the bin/word_count
-file, you can specify the OUTPUT_REDUCER. The two options are 'filesystem'
-and 'cassandra'. The filesystem option outputs to the /tmp/word_count*
-directories. The cassandra option outputs to the 'output_words' column family
-in the 'wordcount' keyspace.  'cassandra' is the default.
-
-Read the code in src/ for more details.
-
-The word_count_counters example sums the counter columns for a row. The output
-is written to a text file in /tmp/word_count_counters.
-
-*It is recommended to turn off vnodes when running Cassandra with hadoop.
-This is done by setting "num_tokens: 1" in cassandra.yaml. If you want to
-point wordcount at a real cluster, modify the seed and listenaddress
-settings accordingly.
-
-
-Troubleshooting
-===============
-
-word_count uses conf/logback.xml to log to wc.out.

diff --git a/examples/hadoop_word_count/bin/word_count b/examples/hadoop_word_count/bin/word_count
deleted file mode 100755
index 34534d7..0000000
--- a/examples/hadoop_word_count/bin/word_count
+++ /dev/null

@@ -1,61 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cwd=`dirname $0`
-
-# Cassandra class files.
-if [ ! -d $cwd/../../../build/classes/main ]; then
-    echo "Unable to locate cassandra class files" >&2
-    exit 1
-fi
-
-# word_count Jar.
-if [ ! -e $cwd/../build/word_count.jar ]; then
-    echo "Unable to locate word_count jar" >&2
-    exit 1
-fi
-
-CLASSPATH=$CLASSPATH:$cwd/../conf
-CLASSPATH=$CLASSPATH:$cwd/../build/word_count.jar
-CLASSPATH=$CLASSPATH:$cwd/../../../build/classes/main
-CLASSPATH=$CLASSPATH:$cwd/../../../build/classes/thrift
-for jar in $cwd/../build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-for jar in $cwd/../../../lib/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-for jar in $cwd/../../../build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-
-if [ -x $JAVA_HOME/bin/java ]; then
-    JAVA=$JAVA_HOME/bin/java
-else
-    JAVA=`which java`
-fi
-
-if [ "x$JAVA" = "x" ]; then
-    echo "Java executable not found (hint: set JAVA_HOME)" >&2
-    exit 1
-fi
-
-OUTPUT_REDUCER=cassandra
-
-#echo $CLASSPATH
-"$JAVA" -Xmx1G -ea -cp "$CLASSPATH" WordCount output_reducer=$OUTPUT_REDUCER

diff --git a/examples/hadoop_word_count/bin/word_count_counters b/examples/hadoop_word_count/bin/word_count_counters
deleted file mode 100755
index 122565d..0000000
--- a/examples/hadoop_word_count/bin/word_count_counters
+++ /dev/null

@@ -1,59 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cwd=`dirname $0`
-
-# Cassandra class files.
-if [ ! -d $cwd/../../../build/classes/main ]; then
-    echo "Unable to locate cassandra class files" >&2
-    exit 1
-fi
-
-# word_count Jar.
-if [ ! -e $cwd/../build/word_count.jar ]; then
-    echo "Unable to locate word_count jar" >&2
-    exit 1
-fi
-
-CLASSPATH=$CLASSPATH:$cwd/../conf
-CLASSPATH=$CLASSPATH:$cwd/../build/word_count.jar
-CLASSPATH=$CLASSPATH:$cwd/../../../build/classes/main
-CLASSPATH=$CLASSPATH:$cwd/../../../build/classes/thrift
-for jar in $cwd/../build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-for jar in $cwd/../../../lib/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-for jar in $cwd/../../../build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-
-if [ -x $JAVA_HOME/bin/java ]; then
-    JAVA=$JAVA_HOME/bin/java
-else
-    JAVA=`which java`
-fi
-
-if [ "x$JAVA" = "x" ]; then
-    echo "Java executable not found (hint: set JAVA_HOME)" >&2
-    exit 1
-fi
-
-#echo $CLASSPATH
-"$JAVA" -Xmx1G -ea -cp "$CLASSPATH" WordCountCounters

diff --git a/examples/hadoop_word_count/bin/word_count_setup b/examples/hadoop_word_count/bin/word_count_setup
deleted file mode 100755
index 6e5650f..0000000
--- a/examples/hadoop_word_count/bin/word_count_setup
+++ /dev/null

@@ -1,61 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cwd=`dirname $0`
-
-# Cassandra class files.
-if [ ! -d $cwd/../../../build/classes/main ]; then
-    echo "Unable to locate cassandra class files" >&2
-    exit 1
-fi
-
-# word_count Jar.
-if [ ! -e $cwd/../build/word_count.jar ]; then
-    echo "Unable to locate word_count jar" >&2
-    exit 1
-fi
-
-CLASSPATH=$CLASSPATH:$cwd/../build/word_count.jar
-CLASSPATH=$CLASSPATH:.:$cwd/../../../build/classes/main
-CLASSPATH=$CLASSPATH:.:$cwd/../../../build/classes/thrift
-for jar in $cwd/../build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-for jar in $cwd/../../../lib/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-for jar in $cwd/../../../build/lib/jars/*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-
-if [ -x $JAVA_HOME/bin/java ]; then
-    JAVA=$JAVA_HOME/bin/java
-else
-    JAVA=`which java`
-fi
-
-if [ "x$JAVA" = "x" ]; then
-    echo "Java executable not found (hint: set JAVA_HOME)" >&2
-    exit 1
-fi
-
-HOST=localhost
-PORT=9160
-FRAMED=true
-
-"$JAVA" -Xmx1G -ea -Dcassandra.host=$HOST -Dcassandra.port=$PORT -Dcassandra.framed=$FRAMED -cp "$CLASSPATH" WordCountSetup

diff --git a/examples/hadoop_word_count/build.xml b/examples/hadoop_word_count/build.xml
deleted file mode 100644
index 939e1b3..0000000
--- a/examples/hadoop_word_count/build.xml
+++ /dev/null

@@ -1,113 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- ~ Licensed to the Apache Software Foundation (ASF) under one
- ~ or more contributor license agreements.  See the NOTICE file
- ~ distributed with this work for additional information
- ~ regarding copyright ownership.  The ASF licenses this file
- ~ to you under the Apache License, Version 2.0 (the
- ~ "License"); you may not use this file except in compliance
- ~ with the License.  You may obtain a copy of the License at
- ~
- ~    http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing,
- ~ software distributed under the License is distributed on an
- ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- ~ KIND, either express or implied.  See the License for the
- ~ specific language governing permissions and limitations
- ~ under the License.
- -->
-<project default="jar" name="word_count" xmlns:ivy="antlib:org.apache.ivy.ant">
-    <property name="cassandra.dir" value="../.." />
-    <property name="cassandra.dir.lib" value="${cassandra.dir}/lib" />
-    <property name="cassandra.classes" value="${cassandra.dir}/build/classes" />
-    <property name="build.src" value="${basedir}/src" />
-    <property name="build.dir" value="${basedir}/build" />
-    <property name="ivy.lib.dir" value="${build.dir}/lib" />
-    <property name="build.classes" value="${build.dir}/classes" />
-    <property name="final.name" value="word_count" />
-    <property name="ivy.version" value="2.1.0" />
-    <property name="ivy.url"
-              value="http://repo2.maven.org/maven2/org/apache/ivy/ivy" />
-
-    <condition property="ivy.jar.exists">
-        <available file="${build.dir}/ivy-${ivy.version}.jar" />
-    </condition>
-
-    <path id="autoivy.classpath">
-        <fileset dir="${ivy.lib.dir}">
-            <include name="**/*.jar" />
-        </fileset>
-        <pathelement location="${build.dir}/ivy-${ivy.version}.jar"/>
-    </path>
-
-    <path id="wordcount.build.classpath">
-        <fileset dir="${ivy.lib.dir}">
-            <include name="**/*.jar" />
-        </fileset>
-        <!-- cassandra dependencies -->
-        <fileset dir="${cassandra.dir.lib}">
-            <include name="**/*.jar" />
-        </fileset>
-        <fileset dir="${cassandra.dir}/build/lib/jars">
-            <include name="**/*.jar" />
-        </fileset>
-        <pathelement location="${cassandra.classes}/main" />
-        <pathelement location="${cassandra.classes}/thrift" />
-    </path>
-
-    <target name="init">
-        <mkdir dir="${build.classes}" />
-    </target>
-
-    <target depends="init,ivy-retrieve-build" name="build">
-        <javac destdir="${build.classes}">
-            <src path="${build.src}" />
-            <classpath refid="wordcount.build.classpath" />
-        </javac>
-    </target>
-
-    <target name="jar" depends="build">
-        <mkdir dir="${build.classes}/META-INF" />
-        <jar jarfile="${build.dir}/${final.name}.jar">
-           <fileset dir="${build.classes}" />
-           <fileset dir="${cassandra.classes}/main" />
-           <fileset dir="${cassandra.classes}/thrift" />
-           <fileset dir="${cassandra.dir}">
-               <include name="lib/**/*.jar" />
-           </fileset>
-           <zipfileset dir="${cassandra.dir}/build/lib/jars/" prefix="lib">
-               <include name="**/*.jar" />
-           </zipfileset>
-           <fileset file="${basedir}/cassandra.yaml" />
-        </jar>
-    </target>
-
-    <target name="clean">
-        <delete dir="${build.dir}" />
-    </target>
-
-    <!--
-        Ivy Specific targets
-            to fetch Ivy and this project's dependencies
-    -->
-	<target name="ivy-download" unless="ivy.jar.exists">
-      <echo>Downloading Ivy...</echo>
-      <mkdir dir="${build.dir}" />
-      <get src="${ivy.url}/${ivy.version}/ivy-${ivy.version}.jar"
-           dest="${build.dir}/ivy-${ivy.version}.jar" usetimestamp="true" />
-    </target>
-
-    <target name="ivy-init" depends="ivy-download" unless="ivy.initialized">
-      <mkdir dir="${ivy.lib.dir}"/>
-      <taskdef resource="org/apache/ivy/ant/antlib.xml"
-               uri="antlib:org.apache.ivy.ant"
-               classpathref="autoivy.classpath"/>
-      <property name="ivy.initialized" value="true"/>
-    </target>
-
-    <target name="ivy-retrieve-build" depends="ivy-init">
-      <ivy:retrieve type="jar,source" sync="true"
-             pattern="${ivy.lib.dir}/[type]s/[artifact]-[revision].[ext]" />
-    </target>
-</project>

diff --git a/examples/hadoop_word_count/conf/logback.xml b/examples/hadoop_word_count/conf/logback.xml
deleted file mode 100644
index 443bd1c..0000000
--- a/examples/hadoop_word_count/conf/logback.xml
+++ /dev/null

@@ -1,42 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements.  See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership.  The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License.  You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied.  See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<configuration scan="true">
-
-  <jmxConfigurator />
-
-  <appender name="FILE" class="ch.qos.logback.core.FileAppender">
-    <file>wc.out</file>
-    <encoder>
-      <pattern>%-5level [%thread] %date{ISO8601} %F:%L - %msg%n</pattern>
-    </encoder>
-  </appender>
-
-  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
-    <encoder>
-      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
-    </encoder>
-  </appender>
-
-  <root level="INFO">
-    <appender-ref ref="FILE" />
-    <appender-ref ref="STDOUT" />
-  </root>
-
-</configuration>

diff --git a/examples/hadoop_word_count/ivy.xml b/examples/hadoop_word_count/ivy.xml
deleted file mode 100644
index 2016eb8..0000000
--- a/examples/hadoop_word_count/ivy.xml
+++ /dev/null

@@ -1,24 +0,0 @@
-<!--
- ~ Licensed to the Apache Software Foundation (ASF) under one
- ~ or more contributor license agreements.  See the NOTICE file
- ~ distributed with this work for additional information
- ~ regarding copyright ownership.  The ASF licenses this file
- ~ to you under the Apache License, Version 2.0 (the
- ~ "License"); you may not use this file except in compliance
- ~ with the License.  You may obtain a copy of the License at
- ~
- ~    http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing,
- ~ software distributed under the License is distributed on an
- ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- ~ KIND, either express or implied.  See the License for the
- ~ specific language governing permissions and limitations
- ~ under the License.
- -->
-<ivy-module version="2.0">
-    <info organisation="apache-cassandra" module="word-count"/>
-    <dependencies>
-        <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.0.3"/>
-    </dependencies>
-</ivy-module>

diff --git a/examples/hadoop_word_count/src/WordCount.java b/examples/hadoop_word_count/src/WordCount.java
deleted file mode 100644
index d092f1f..0000000
--- a/examples/hadoop_word_count/src/WordCount.java
+++ /dev/null

@@ -1,222 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.hadoop.*;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-/**
- * This counts the occurrences of words in ColumnFamily Standard1, that has a single column (that we care about)
- * "text" containing a sequence of words.
- *
- * For each word, we output the total number of occurrences across all texts.
- *
- * When outputting to Cassandra, we write the word counts as a {word, count} column/value pair,
- * with a row key equal to the name of the source column we read the words from.
- */
-public class WordCount extends Configured implements Tool
-{
-    private static final Logger logger = LoggerFactory.getLogger(WordCount.class);
-
-    static final String KEYSPACE = "wordcount";
-    static final String COLUMN_FAMILY = "input_words";
-
-    static final String OUTPUT_REDUCER_VAR = "output_reducer";
-    static final String OUTPUT_COLUMN_FAMILY = "output_words";
-    private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";
-
-    private static final String CONF_COLUMN_NAME = "columnname";
-
-    public static void main(String[] args) throws Exception
-    {
-        // Let ToolRunner handle generic command-line options
-        ToolRunner.run(new Configuration(), new WordCount(), args);
-        System.exit(0);
-    }
-
-    public static class TokenizerMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>, Text, IntWritable>
-    {
-        private final static IntWritable one = new IntWritable(1);
-        private Text word = new Text();
-        private ByteBuffer sourceColumn;
-
-        protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context)
-        throws IOException, InterruptedException
-        {
-        }
-
-        public void map(ByteBuffer key, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column> columns, Context context) throws IOException, InterruptedException
-        {
-            for (ColumnFamilyRecordReader.Column column : columns.values())
-            {
-                String name  = ByteBufferUtil.string(column.name);
-                String value = null;
-                
-                if (name.contains("int"))
-                    value = String.valueOf(ByteBufferUtil.toInt(column.value));
-                else
-                    value = ByteBufferUtil.string(column.value);
-                               
-                logger.debug("read {}:{}={} from {}",
-                             new Object[] {ByteBufferUtil.string(key), name, value, context.getInputSplit()});
-
-                StringTokenizer itr = new StringTokenizer(value);
-                while (itr.hasMoreTokens())
-                {
-                    word.set(itr.nextToken());
-                    context.write(word, one);
-                }
-            }
-        }
-    }
-
-    public static class ReducerToFilesystem extends Reducer<Text, IntWritable, Text, IntWritable>
-    {
-        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
-        {
-            int sum = 0;
-            for (IntWritable val : values)
-                sum += val.get();
-            context.write(key, new IntWritable(sum));
-        }
-    }
-
-    public static class ReducerToCassandra extends Reducer<Text, IntWritable, ByteBuffer, List<Mutation>>
-    {
-        private ByteBuffer outputKey;
-
-        protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
-        throws IOException, InterruptedException
-        {
-            outputKey = ByteBufferUtil.bytes(context.getConfiguration().get(CONF_COLUMN_NAME));
-        }
-
-        public void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
-        {
-            int sum = 0;
-            for (IntWritable val : values)
-                sum += val.get();
-            context.write(outputKey, Collections.singletonList(getMutation(word, sum)));
-        }
-
-        private static Mutation getMutation(Text word, int sum)
-        {
-            org.apache.cassandra.thrift.Column c = new org.apache.cassandra.thrift.Column();
-            c.setName(Arrays.copyOf(word.getBytes(), word.getLength()));
-            c.setValue(ByteBufferUtil.bytes(sum));
-            c.setTimestamp(System.currentTimeMillis());
-
-            Mutation m = new Mutation();
-            m.setColumn_or_supercolumn(new ColumnOrSuperColumn());
-            m.column_or_supercolumn.setColumn(c);
-            return m;
-        }
-    }
-
-    public int run(String[] args) throws Exception
-    {
-        String outputReducerType = "filesystem";
-        if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR))
-        {
-            String[] s = args[0].split("=");
-            if (s != null && s.length == 2)
-                outputReducerType = s[1];
-        }
-        logger.info("output reducer type: " + outputReducerType);
-
-        // use a smaller page size that doesn't divide the row count evenly to exercise the paging logic better
-        ConfigHelper.setRangeBatchSize(getConf(), 99);
-
-        for (int i = 0; i < WordCountSetup.TEST_COUNT; i++)
-        {
-            String columnName = "text" + i;
-
-            Job job = new Job(getConf(), "wordcount");
-            job.setJarByClass(WordCount.class);
-            job.setMapperClass(TokenizerMapper.class);
-
-            if (outputReducerType.equalsIgnoreCase("filesystem"))
-            {
-                job.setCombinerClass(ReducerToFilesystem.class);
-                job.setReducerClass(ReducerToFilesystem.class);
-                job.setOutputKeyClass(Text.class);
-                job.setOutputValueClass(IntWritable.class);
-                FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
-            }
-            else
-            {
-                job.setReducerClass(ReducerToCassandra.class);
-
-                job.setMapOutputKeyClass(Text.class);
-                job.setMapOutputValueClass(IntWritable.class);
-                job.setOutputKeyClass(ByteBuffer.class);
-                job.setOutputValueClass(List.class);
-
-                job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
-
-                ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
-                job.getConfiguration().set(CONF_COLUMN_NAME, "sum");
-            }
-
-            job.setInputFormatClass(ColumnFamilyInputFormat.class);
-
-            ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
-            ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
-            ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
-            ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
-            SlicePredicate predicate = new SlicePredicate().setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
-            ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);
-
-            if (i == 4)
-            {
-                IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("int4"), IndexOperator.EQ, ByteBufferUtil.bytes(0));
-                ConfigHelper.setInputRange(job.getConfiguration(), Arrays.asList(expr));
-            }
-
-            if (i == 5)
-            {
-                // this will cause the predicate to be ignored in favor of scanning everything as a wide row
-                ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true);
-            }
-
-            ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost");
-            ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner");
-
-            job.waitForCompletion(true);
-        }
-        return 0;
-    }
-}

diff --git a/examples/hadoop_word_count/src/WordCountCounters.java b/examples/hadoop_word_count/src/WordCountCounters.java
deleted file mode 100644
index 98c8579..0000000
--- a/examples/hadoop_word_count/src/WordCountCounters.java
+++ /dev/null

@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.SortedMap;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
-import org.apache.cassandra.hadoop.ColumnFamilyRecordReader;
-import org.apache.cassandra.hadoop.ConfigHelper;
-import org.apache.cassandra.thrift.SlicePredicate;
-import org.apache.cassandra.thrift.SliceRange;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-/**
- * This sums the word count stored in the input_words_count ColumnFamily for the key "key-if-verse1".
- *
- * Output is written to a text file.
- */
-public class WordCountCounters extends Configured implements Tool
-{
-    private static final Logger logger = LoggerFactory.getLogger(WordCountCounters.class);
-
-    static final String COUNTER_COLUMN_FAMILY = "input_words_count";
-    private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count_counters";
-
-
-    public static void main(String[] args) throws Exception
-    {
-        // Let ToolRunner handle generic command-line options
-        ToolRunner.run(new Configuration(), new WordCountCounters(), args);
-        System.exit(0);
-    }
-
-    public static class SumMapper extends Mapper<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>, Text, LongWritable>
-    {
-        public void map(ByteBuffer key, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column> columns, Context context) throws IOException, InterruptedException
-        {
-            long sum = 0;
-            for (ColumnFamilyRecordReader.Column column : columns.values())
-            {
-                logger.debug("read " + key + ":" + ByteBufferUtil.string(column.name) + " from " + context.getInputSplit());
-                sum += ByteBufferUtil.toLong(column.value);
-            }
-            context.write(new Text(ByteBufferUtil.string(key)), new LongWritable(sum));
-        }
-    }
-
-    public int run(String[] args) throws Exception
-    {
-        Job job = new Job(getConf(), "wordcountcounters");
-        job.setJarByClass(WordCountCounters.class);
-        job.setMapperClass(SumMapper.class);
-
-        job.setOutputKeyClass(Text.class);
-        job.setOutputValueClass(LongWritable.class);
-        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
-
-
-        job.setInputFormatClass(ColumnFamilyInputFormat.class);
-
-        ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160");
-        ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost");
-        ConfigHelper.setInputPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.Murmur3Partitioner");
-        ConfigHelper.setInputColumnFamily(job.getConfiguration(), WordCount.KEYSPACE, WordCountCounters.COUNTER_COLUMN_FAMILY);
-        SlicePredicate predicate = new SlicePredicate().setSlice_range(
-                                                                        new SliceRange().
-                                                                        setStart(ByteBufferUtil.EMPTY_BYTE_BUFFER).
-                                                                        setFinish(ByteBufferUtil.EMPTY_BYTE_BUFFER).
-                                                                        setCount(100));
-        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);
-
-        job.waitForCompletion(true);
-        return 0;
-    }
-}

diff --git a/examples/hadoop_word_count/src/WordCountSetup.java b/examples/hadoop_word_count/src/WordCountSetup.java
deleted file mode 100644
index 0ef5341..0000000
--- a/examples/hadoop_word_count/src/WordCountSetup.java
+++ /dev/null

@@ -1,239 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.thrift.TException;
-import org.apache.thrift.protocol.TBinaryProtocol;
-import org.apache.thrift.protocol.TProtocol;
-import org.apache.thrift.transport.TFramedTransport;
-import org.apache.thrift.transport.TSocket;
-import org.apache.thrift.transport.TTransport;
-import org.apache.thrift.transport.TTransportException;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.util.concurrent.Uninterruptibles;
-
-public class WordCountSetup
-{
-    private static final Logger logger = LoggerFactory.getLogger(WordCountSetup.class);
-
-    public static final int TEST_COUNT = 6;
-
-    public static void main(String[] args) throws Exception
-    {
-        Cassandra.Iface client = createConnection();
-
-        setupKeyspace(client);
-
-        client.set_keyspace(WordCount.KEYSPACE);
-
-        Map<ByteBuffer, Map<String, List<Mutation>>> mutationMap;
-        Column c;
-
-        // text0: no rows
-
-        // text1: 1 row, 1 word
-        c = new Column()
-            .setName(ByteBufferUtil.bytes("text1"))
-            .setValue(ByteBufferUtil.bytes("word1"))
-            .setTimestamp(System.currentTimeMillis());
-        mutationMap = getMutationMap(ByteBufferUtil.bytes("key0"), WordCount.COLUMN_FAMILY, c);
-        client.batch_mutate(mutationMap, ConsistencyLevel.ONE);
-        logger.info("added text1");
-
-        // text1: 1 row, 2 word
-        c = new Column()
-            .setName(ByteBufferUtil.bytes("text2"))
-            .setValue(ByteBufferUtil.bytes("word1 word2"))
-            .setTimestamp(System.currentTimeMillis());
-        mutationMap = getMutationMap(ByteBufferUtil.bytes("key0"), WordCount.COLUMN_FAMILY, c);
-        client.batch_mutate(mutationMap, ConsistencyLevel.ONE);
-        logger.info("added text2");
-
-        // text3: 1000 rows, 1 word
-        mutationMap = new HashMap<ByteBuffer, Map<String, List<Mutation>>>();
-        for (int i = 0; i < 1000; i++)
-        {
-            c = new Column()
-                .setName(ByteBufferUtil.bytes("text3"))
-                .setValue(ByteBufferUtil.bytes("word1"))
-                .setTimestamp(System.currentTimeMillis());
-            addToMutationMap(mutationMap, ByteBufferUtil.bytes("key" + i), WordCount.COLUMN_FAMILY, c);
-        }
-        client.batch_mutate(mutationMap, ConsistencyLevel.ONE);
-        logger.info("added text3");
-
-        // text4: 1000 rows, 1 word, one column to filter on
-        mutationMap = new HashMap<ByteBuffer, Map<String, List<Mutation>>>();
-        for (int i = 0; i < 1000; i++)
-        {
-            Column c1 = new Column()
-                       .setName(ByteBufferUtil.bytes("text4"))
-                       .setValue(ByteBufferUtil.bytes("word1"))
-                       .setTimestamp(System.currentTimeMillis());
-            Column c2 = new Column()
-                       .setName(ByteBufferUtil.bytes("int4"))
-                       .setValue(ByteBufferUtil.bytes(i % 4))
-                       .setTimestamp(System.currentTimeMillis());
-            ByteBuffer key = ByteBufferUtil.bytes("key" + i);
-            addToMutationMap(mutationMap, key, WordCount.COLUMN_FAMILY, c1);
-            addToMutationMap(mutationMap, key, WordCount.COLUMN_FAMILY, c2);
-        }
-        client.batch_mutate(mutationMap, ConsistencyLevel.ONE);
-        logger.info("added text4");
-
-        // sentence data for the counters
-        final ByteBuffer key = ByteBufferUtil.bytes("key-if-verse1");
-        final ColumnParent colParent = new ColumnParent(WordCountCounters.COUNTER_COLUMN_FAMILY);
-        for (String sentence : sentenceData())
-        {
-            client.add(key,
-                       colParent,
-                       new CounterColumn(ByteBufferUtil.bytes(sentence),
-                                         (long) sentence.split("\\s").length),
-                       ConsistencyLevel.ONE);
-        }
-        logger.info("added key-if-verse1");
-
-        System.exit(0);
-    }
-
-    private static Map<ByteBuffer, Map<String, List<Mutation>>> getMutationMap(ByteBuffer key, String cf, Column c)
-    {
-        Map<ByteBuffer, Map<String, List<Mutation>>> mutationMap = new HashMap<ByteBuffer, Map<String, List<Mutation>>>();
-        addToMutationMap(mutationMap, key, cf, c);
-        return mutationMap;
-    }
-
-    private static void addToMutationMap(Map<ByteBuffer, Map<String, List<Mutation>>> mutationMap, ByteBuffer key, String cf, Column c)
-    {
-        Map<String, List<Mutation>> cfMutation = mutationMap.get(key);
-        if (cfMutation == null)
-        {
-            cfMutation = new HashMap<String, List<Mutation>>();
-            mutationMap.put(key, cfMutation);
-        }
-
-        List<Mutation> mutationList = cfMutation.get(cf);
-        if (mutationList == null)
-        {
-            mutationList = new ArrayList<Mutation>();
-            cfMutation.put(cf, mutationList);
-        }
-
-        ColumnOrSuperColumn cc = new ColumnOrSuperColumn();
-        Mutation m = new Mutation();
-
-        cc.setColumn(c);
-        m.setColumn_or_supercolumn(cc);
-        mutationList.add(m);
-    }
-
-    private static void setupKeyspace(Cassandra.Iface client) throws TException, InvalidRequestException, SchemaDisagreementException
-    {
-        List<CfDef> cfDefList = new ArrayList<CfDef>();
-        CfDef input = new CfDef(WordCount.KEYSPACE, WordCount.COLUMN_FAMILY);
-        input.setComparator_type("AsciiType");
-        input.setColumn_metadata(Arrays.asList(new ColumnDef(ByteBufferUtil.bytes("text1"), "AsciiType"),
-                                               new ColumnDef(ByteBufferUtil.bytes("text2"), "AsciiType"),
-                                               new ColumnDef(ByteBufferUtil.bytes("text3"), "AsciiType"),
-                                               new ColumnDef(ByteBufferUtil.bytes("text4"), "AsciiType"),
-                                               new ColumnDef(ByteBufferUtil.bytes("int4"), "Int32Type").setIndex_name("int4idx").setIndex_type(IndexType.KEYS)));
-        cfDefList.add(input);
-
-        CfDef output = new CfDef(WordCount.KEYSPACE, WordCount.OUTPUT_COLUMN_FAMILY);
-        output.setComparator_type("AsciiType");
-        output.setDefault_validation_class("Int32Type");
-        cfDefList.add(output);
-
-        CfDef counterInput = new CfDef(WordCount.KEYSPACE, WordCountCounters.COUNTER_COLUMN_FAMILY);
-        counterInput.setComparator_type("UTF8Type");
-        counterInput.setDefault_validation_class("CounterColumnType");
-        cfDefList.add(counterInput);
-
-        KsDef ksDef = new KsDef(WordCount.KEYSPACE, "org.apache.cassandra.locator.SimpleStrategy", cfDefList);
-        ksDef.putToStrategy_options("replication_factor", "1");
-        client.system_add_keyspace(ksDef);
-
-        int magnitude = getNumberOfHosts(client);
-        Uninterruptibles.sleepUninterruptibly(magnitude, TimeUnit.SECONDS);
-    }
-
-    private static int getNumberOfHosts(Cassandra.Iface client)
-            throws InvalidRequestException, UnavailableException, TimedOutException, TException
-    {
-        client.set_keyspace("system");
-        SlicePredicate predicate = new SlicePredicate();
-        SliceRange sliceRange = new SliceRange();
-        sliceRange.setStart(new byte[0]);
-        sliceRange.setFinish(new byte[0]);
-        predicate.setSlice_range(sliceRange);
-
-        KeyRange keyrRange = new KeyRange();
-        keyrRange.setStart_key(new byte[0]);
-        keyrRange.setEnd_key(new byte[0]);
-        //keyrRange.setCount(100);
-
-        ColumnParent parent = new ColumnParent("peers");
-
-        List<KeySlice> ls = client.get_range_slices(parent, predicate, keyrRange, ConsistencyLevel.ONE);
-
-        return ls.size();
-    }
-
-    private static Cassandra.Iface createConnection() throws TTransportException
-    {
-        if (System.getProperty("cassandra.host") == null || System.getProperty("cassandra.port") == null)
-        {
-            logger.warn("cassandra.host or cassandra.port is not defined, using default");
-        }
-        return createConnection(System.getProperty("cassandra.host", "localhost"),
-                                Integer.valueOf(System.getProperty("cassandra.port", "9160")));
-    }
-
-    private static Cassandra.Client createConnection(String host, Integer port) throws TTransportException
-    {
-        TSocket socket = new TSocket(host, port);
-        TTransport trans = new TFramedTransport(socket);
-        trans.open();
-        TProtocol protocol = new TBinaryProtocol(trans);
-
-        return new Cassandra.Client(protocol);
-    }
-
-    private static String[] sentenceData()
-    {   // Public domain context, source http://en.wikisource.org/wiki/If%E2%80%94
-        return new String[]{
-                "If you can keep your head when all about you",
-                "Are losing theirs and blaming it on you",
-                "If you can trust yourself when all men doubt you,",
-                "But make allowance for their doubting too:",
-                "If you can wait and not be tired by waiting,",
-                "Or being lied about, don’t deal in lies,",
-                "Or being hated, don’t give way to hating,",
-                "And yet don’t look too good, nor talk too wise;"
-        };
-    }
-}

diff --git a/examples/pig/README.txt b/examples/pig/README.txt
deleted file mode 100644
index 1553a9f..0000000
--- a/examples/pig/README.txt
+++ /dev/null

@@ -1,118 +0,0 @@
-A Pig storage class that reads all columns from a given ColumnFamily, or writes
-properly formatted results into a ColumnFamily.
-
-Getting Started
-===============
-
-First build and start a Cassandra server with the default
-configuration and set the PIG_HOME and JAVA_HOME environment
-variables to the location of a Pig >= 0.7.0 install and your Java
-install. 
-
-If you would like to run using the Hadoop backend, you should
-also set PIG_CONF_DIR to the location of your Hadoop config.
-
-Finally, set the following as environment variables (uppercase,
-underscored), or as Hadoop configuration variables (lowercase, dotted):
-* PIG_INITIAL_ADDRESS or cassandra.thrift.address : initial address to connect to
-* PIG_RPC_PORT or cassandra.thrift.port : the port thrift is listening on
-* PIG_PARTITIONER or cassandra.partitioner.class : cluster partitioner
-
-For example, against a local node with the default settings, you'd use:
-export PIG_INITIAL_ADDRESS=localhost
-export PIG_RPC_PORT=9160
-export PIG_PARTITIONER=org.apache.cassandra.dht.Murmur3Partitioner
-
-These properties can be overridden with the following if you use different clusters
-for input and output:
-* PIG_INPUT_INITIAL_ADDRESS : initial address to connect to for reading
-* PIG_INPUT_RPC_PORT : the port thrift is listening on for reading
-* PIG_INPUT_PARTITIONER : cluster partitioner for reading
-* PIG_OUTPUT_INITIAL_ADDRESS : initial address to connect to for writing
-* PIG_OUTPUT_RPC_PORT : the port thrift is listening on for writing
-* PIG_OUTPUT_PARTITIONER : cluster partitioner for writing
-
-CassandraStorage
-================
-
-The CassandraStorage class is for any non-CQL3 ColumnFamilies you may have.  For CQL3 support, refer to the CqlNativeStorage section.
-
-examples/pig$ bin/pig_cassandra -x local example-script.pig
-
-This will run the test script against your Cassandra instance
-and will assume that there is a MyKeyspace/MyColumnFamily with some
-data in it. It will run in local mode (see pig docs for more info).
-
-If you'd like to get to a 'grunt>' shell prompt, run:
-
-examples/pig$ bin/pig_cassandra -x local
-
-Once the 'grunt>' shell has loaded, try a simple program like the
-following, which will determine the top 50 column names:
-
-grunt> rows = LOAD 'cassandra://MyKeyspace/MyColumnFamily' USING CassandraStorage();
-grunt> cols = FOREACH rows GENERATE flatten(columns);
-grunt> colnames = FOREACH cols GENERATE $0;
-grunt> namegroups = GROUP colnames BY (chararray) $0;
-grunt> namecounts = FOREACH namegroups GENERATE COUNT($1), group;
-grunt> orderednames = ORDER namecounts BY $0 DESC;
-grunt> topnames = LIMIT orderednames 50;
-grunt> dump topnames;
-
-Slices on columns can also be specified:
-grunt> rows = LOAD 'cassandra://MyKeyspace/MyColumnFamily?slice_start=C2&slice_end=C4&limit=1&reversed=true' USING CassandraStorage();
-
-Binary values for slice_start and slice_end can be escaped such as '\u0255'
-
-Outputting to Cassandra requires the same format from input, so the simplest example is:
-
-grunt> rows = LOAD 'cassandra://MyKeyspace/MyColumnFamily' USING CassandraStorage();
-grunt> STORE rows into 'cassandra://MyKeyspace/MyColumnFamily' USING CassandraStorage();
-
-Which will copy the ColumnFamily.  Note that the destination ColumnFamily must
-already exist for this to work.
-
-See the example in test/ to see how schema is inferred.
-
-Advanced Options for CassandraStorage
-=====================================
-
-The following environment variables default to false but can be set to true to enable them:
-
-PIG_WIDEROW_INPUT:  this enables loading of rows with many columns without
-                    incurring memory pressure.  All columns will be in a bag and indexes are not
-                    supported.  This can also be set in the LOAD url by adding
-                    the 'widerows=true' parameter.
-
-PIG_USE_SECONDARY:  this allows easy use of secondary indexes within your
-                    script, by appending every index to the schema as 'index_$name', allowing
-                    filtering of loaded rows with a statement like "FILTER rows BY index_color eq
-                    'blue'" if you have an index called 'color' defined.  This
-                    can also be set in the LOAD url by adding the
-                    'use_secondary=true' parameter.
-
-PIG_INPUT_SPLIT_SIZE: this sets the split size passed to Hadoop, controlling
-                      the amount of mapper tasks created.  This can also be set in the LOAD url by
-                      adding the 'split_size=X' parameter, where X is an integer amount for the size.
-
-CqlNativeStorage
-================
-
-The CqlNativeStorage class is somewhat similar to CassandraStorage, but it can work with CQL3-defined ColumnFamilies.  The main difference is in the URL format:
-
-cql://[username:password@]<keyspace>/<columnfamily>
-                    [?[page_size=<size>][&columns=<col1,col2>][&output_query=<prepared_statement>]
-                    [&where_clause=<clause>][&split_size=<size>][&partitioner=<partitioner>][&use_secondary=true|false]
-                    [&init_address=<host>][&native_port=<native_port>][&core_conns=<core_conns>]
-                    [&max_conns=<max_conns>][&min_simult_reqs=<min_simult_reqs>][&max_simult_reqs=<max_simult_reqs>]
-                    [&native_timeout=<native_timeout>][&native_read_timeout=<native_read_timeout>][&rec_buff_size=<rec_buff_size>]
-                    [&send_buff_size=<send_buff_size>][&solinger=<solinger>][&tcp_nodelay=<tcp_nodelay>][&reuse_address=<reuse_address>]
-                    [&keep_alive=<keep_alive>][&auth_provider=<auth_provider>][&trust_store_path=<trust_store_path>]
-                    [&key_store_path=<key_store_path>][&trust_store_password=<trust_store_password>]
-                    [&key_store_password=<key_store_password>][&cipher_suites=<cipher_suites>][&input_cql=<input_cql>]
-                    [columns=<columns>][where_clause=<where_clause>]]
-Which in grunt, the simplest example would look like:
-
-grunt> rows = LOAD 'cql://MyKeyspace/MyColumnFamily' USING CqlNativeStorage();
-
-CqlNativeStorage handles wide rows automatically and thus has no separate flag for this.

diff --git a/examples/pig/bin/pig_cassandra b/examples/pig/bin/pig_cassandra
deleted file mode 100755
index 5d98888..0000000
--- a/examples/pig/bin/pig_cassandra
+++ /dev/null

@@ -1,46 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cwd=`dirname $0`
-cassandra_home="$cwd/../../../"
-
-# general jars.
-for jar in $cassandra_home/lib/*.jar $cassandra_home/build/lib/jars/*.jar $cassandra_home/build/apache-cassandra*.jar; do
-    CLASSPATH=$CLASSPATH:$jar
-done
-
-if [ "x$PIG_HOME" = "x" ]; then
-    echo "PIG_HOME not set: requires Pig >= 0.7.0" >&2
-    exit 1
-fi
-
-# pig jar.
-for jar in $PIG_HOME/*.jar; do
-   PIG_JAR=$jar
-done
-echo "Using $PIG_JAR."
-if [ ! -e $PIG_JAR ]; then
-    echo "Unable to locate Pig jar" >&2
-    exit 1
-fi
-
-CLASSPATH=$CLASSPATH:$PIG_JAR
-
-export PIG_CLASSPATH=$PIG_CLASSPATH:$CLASSPATH
-export PIG_OPTS="$PIG_OPTS -Dudf.import.list=org.apache.cassandra.hadoop.pig"
-$PIG_HOME/bin/pig $*

diff --git a/examples/pig/example-populate-cql.txt b/examples/pig/example-populate-cql.txt
deleted file mode 100644
index 8e23582..0000000
--- a/examples/pig/example-populate-cql.txt
+++ /dev/null

@@ -1,24 +0,0 @@
-CREATE KEYSPACE libdata WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
-USE libdata;
-
-CREATE TABLE libout ("STABR" TEXT, "FSCSKEY" TEXT, "FSCS_SEQ" TEXT,
-                 "LIBID" TEXT, "LIBNAME" TEXT, "ADDRESS" TEXT, "CITY" TEXT,
-                 "ZIP" TEXT, "ZIP4" TEXT, "CNTY" TEXT, "PHONE" TEXT, "C_OUT_TY" TEXT,
-                 "C_MSA" TEXT, "SQ_FEET" INT, "F_SQ_FT" TEXT, "L_NUM_BM" INT,
-                 "F_BKMOB" TEXT, "HOURS" INT, "F_HOURS" TEXT, "WKS_OPEN" INT,
-                 "F_WKSOPN" TEXT, "YR_SUB" INT, "STATSTRU" INT, "STATNAME" INT,
-                 "STATADDR" INT, "LONGITUD" FLOAT, "LATITUDE" FLOAT, "FIPSST" INT,
-                 "FIPSCO" INT, "FIPSPLAC" INT, "CNTYPOP" INT, "LOCALE" TEXT,
-                 "CENTRACT" FLOAT, "CENBLOCK" INT, "CDCODE" TEXT, "MAT_CENT" TEXT,
-                 "MAT_TYPE" INT, "CBSA" INT, "MICROF" TEXT,
-                 PRIMARY KEY ("FSCSKEY", "FSCS_SEQ"));
-
-COPY libout ("STABR","FSCSKEY","FSCS_SEQ","LIBID","LIBNAME",
-                 "ADDRESS","CITY","ZIP","ZIP4","CNTY","PHONE","C_OUT_TY",
-                 "C_MSA","SQ_FEET","F_SQ_FT","L_NUM_BM","F_BKMOB","HOURS",
-                 "F_HOURS","WKS_OPEN","F_WKSOPN","YR_SUB","STATSTRU","STATNAME",
-                 "STATADDR","LONGITUD","LATITUDE","FIPSST","FIPSCO","FIPSPLAC",
-                 "CNTYPOP","LOCALE","CENTRACT","CENBLOCK","CDCODE","MAT_CENT",
-                 "MAT_TYPE","CBSA","MICROF") FROM 'libdata.csv' WITH HEADER=TRUE;
-                 
-CREATE TABLE libsqft (year INT, state TEXT, sqft BIGINT, PRIMARY KEY (year, state));

diff --git a/examples/pig/example-script-cql.pig b/examples/pig/example-script-cql.pig
deleted file mode 100644
index ef11130..0000000
--- a/examples/pig/example-script-cql.pig
+++ /dev/null

@@ -1,11 +0,0 @@
--- CqlNativeStorage
-libdata = LOAD 'cql://libdata/libout' USING CqlNativeStorage();
-book_by_mail = FILTER libdata BY C_OUT_TY == 'BM';
-
-libdata_buildings = FILTER libdata BY SQ_FEET > 0;
-state_flat = FOREACH libdata_buildings GENERATE STABR AS State,SQ_FEET AS SquareFeet;
-state_grouped = GROUP state_flat BY State;
-state_footage = FOREACH state_grouped GENERATE group AS State, SUM(state_flat.SquareFeet) AS TotalFeet:int;
-
-insert_format= FOREACH state_footage GENERATE TOTUPLE(TOTUPLE('year',2011),TOTUPLE('state',State)),TOTUPLE(TotalFeet);
-STORE insert_format INTO 'cql://libdata/libsqft?output_query=UPDATE%20libdata.libsqft%20SET%20sqft%20%3D%20%3F' USING CqlNativeStorage;
\ No newline at end of file

diff --git a/examples/pig/example-script.pig b/examples/pig/example-script.pig
deleted file mode 100644
index d47fc28..0000000
--- a/examples/pig/example-script.pig
+++ /dev/null

@@ -1,9 +0,0 @@
--- CassandraStorage
-rows = LOAD 'cassandra://MyKeyspace/MyColumnFamily' USING CassandraStorage();
-cols = FOREACH rows GENERATE flatten(columns);
-colnames = FOREACH cols GENERATE $0;
-namegroups = GROUP colnames BY (chararray) $0;
-namecounts = FOREACH namegroups GENERATE COUNT($1), group;
-orderednames = ORDER namecounts BY $0 DESC;
-topnames = LIMIT orderednames 50;
-dump topnames;
\ No newline at end of file

diff --git a/examples/pig/libdata.csv b/examples/pig/libdata.csv
deleted file mode 100644
index f9bd0b6..0000000
--- a/examples/pig/libdata.csv
+++ /dev/null

@@ -1,200 +0,0 @@
-KS,KS0189,002,KS0037,CALDWELL PUBLIC LIBRARY,120 S. MAIN ST.,CALDWELL,67022,1414,SUMNER,6208456879,CE,NO,4500,R_11,0,R_11,1200,R_11,50,R_11,2012,00,00,00,-97.606964,37.03116,20,191,09900,23835,43,9623,3078,2004,0,1,48620,0

-CA,CA0152,004,M748,SOLIZ LIBRARY,2820 JOURDAN ST.,OXNARD,93036,1611,VENTURA,8054854515,BR,CC,3030,R_11,0,R_11,1227,R_11,52,R_11,2012,00,00,00,-119.168424,34.2372,06,111,22370,831126,21,50.02999878,2005,0626,0,1,37100,0

-WV,WV0023,002,22,MORGANTOWN PUBLIC LIBRARY,373 SPRUCE STREET,MORGANTOWN,26505,5564,MONONGALIA,3042917425,CE,NO,24140,R_11,0,R_11,3016,R_11,52,R_11,2012,00,00,00,-79.954276,39.630204,54,061,55756,98613,13,107,1011,5401,0,1,34060,0

-IL,IL0603,002,3067300,VESPASIAN WARNER PUBLIC LIBRARY DISTRICT,310 NORTH QUINCY STREET,CLINTON,61727,1300,DE WITT,2179355174,CE,NO,36000,R_11,0,R_11,3172,R_11,-1,U_11,2012,00,00,00,-88.961745,40.154608,17,039,15001,16528,32,9717,2032,1713,0,1,14010,0

-NC,NC0032,017,C-FORSYTH-Y,YWCA BEST CHOICE CENTER,1031 HIGHLAND AVE,WINSTON-SALEM,27101,3108,FORSYTH,3367220597,BR,CC,750,R_11,0,R_11,312,R_11,52,R_11,2012,00,00,00,-80.233147,36.108588,37,067,75000,354454,12,6,2018,3712,0,1,49180,0

-NY,NY0578,002,7200446540,TUXEDO PARK LIBRARY,227 ROUTE 17,TUXEDO PARK,10987,4405,ORANGE,8453512207,CE,NC,9000,R_11,0,R_11,3094,R_11,52,R_11,2012,00,00,00,-74.18461,41.193192,36,071,00000,374438,41,149,3005,3618,0,5,35620,0

-IL,IL0508,002,3063300,STEELEVILLE AREA PUBLIC LIBRARY DISTRICT,625 SOUTH SPARTA STREET,STEELEVILLE,62288,2147,RANDOLPH,6189659732,CE,NO,4998,R_11,0,R_11,2630,R_11,-1,U_11,2012,00,00,00,-89.658669,38.003021,17,157,72468,33262,32,9511,4060,1712,0,1,0,.

-NY,NY0601,002,7200592060,SUNSHINE HALL FREE LIBRARY,14 PROCTOR ROAD,ELDRED,12732,0157,SULLIVAN,8455576258,CE,NO,2300,R_11,0,R_11,1250,R_11,52,R_11,2012,00,00,00,-74.882528,41.526061,36,105,00000,77079,42,9524,1021,3619,0,1,0,.

-FL,FL0001,012,FL0001-012,REDDICK PUBLIC LIBRARY,15150 N.W. GAINESVILLE ROAD,REDDICK,32686,3221,MARION,3524382566,BR,NC,2496,R_11,0,R_11,1228,R_11,52,R_11,2012,00,00,00,-82.197543,29.369123,12,083,59675,332472,42,2,2056,1205,0,1,36100,0

-NC,NC0010,009,R-HYCONEECHEE-CC,CARRBORO CYBRARY,100 N GREENSBORO ST,CARRBORO,27510,2016,ORANGE,9199187387,BR,NC,1060,R_11,0,R_11,1560,R_11,52,R_11,2012,00,00,15,-79.072256,35.910375,37,135,10620,135418,21,107.0500031,3021,3704,0,1,20500,0

-WV,WV0077,004,076B,COAL RIVER,494 JOHN SLACK CIRCLE,RACINE,25165,M,BOONE,3048378437,BR,NO,1200,R_11,0,R_11,1820,R_11,52,R_11,2012,00,00,00,-81.656444,38.145905,54,005,00000,24459,41,9582,3074,5403,X,14,16620,0

-TX,TX0279,002,360,SHEPHERD PUBLIC LIBRARY,30 N LIBERTY ST,SHEPHERD,77371,2460,SAN JACINTO,9366283515,CE,NC,2668,R_11,0,R_11,1626,R_11,52,R_11,2012,00,00,00,-94.997618,30.498398,48,407,67424,26876,42,2001.01001,1011,4808,0,1,0,.

-MI,MI0261,002,MI0261-002,SHIAWASSEE DISTRICT LIBRARY,502 WEST MAIN STREET,OWOSSO,48867,2687,SHIAWASSEE,9897255134,CE,NO,8400,R_11,0,R_11,2868,R_11,52,R_11,2012,00,00,00,-84.177198,42.997558,26,155,61940,69934,32,306,1007,2604,0,1,37020,1

-NE,NE0210,002,235T,SPENCER TOWNSHIP LIBRARY,110 WEST MAIN STREET,SPENCER,68777,0189,BOYD,4025891331,CE,NO,1750,R_11,0,R_11,546,R_11,52,R_11,2012,00,00,00,-98.701432,42.874026,31,015,46275,2081,43,9758,1234,3103,0,1,0,.

-ND,ND0012,002,ND0012-002,CARNEGIE REGIONAL LIBRARY,49 WEST 7TH STREET,GRAFTON,58237,1050,WALSH,7013522754,CE,CC,8692,R_11,0,R_11,2000,R_11,50,R_11,2012,00,00,00,-97.41183,48.417585,38,099,31820,11032,33,9580,1077,3800,0,1,0,.

-CT,CT0193,002,3900,EASTFORD PUBLIC LIBRARY,179 EASTFORD RD.,EASTFORD,06242,0908,WINDHAM,8609740125,CE,NO,1512,R_11,0,R_11,1216,R_11,52,R_11,2012,00,00,00,-72.080012,41.901001,09,015,00000,118205,42,9022,1049,0902,0,1,49340,0

-IL,IL0421,002,3053900,PECATONICA PUBLIC LIBRARY DISTRICT,400 WEST ELEVENTH STREET,PECATONICA,61063,9173,WINNEBAGO,8152392616,CE,NC,11800,R_11,0,R_11,2080,R_11,-1,U_11,2012,00,00,00,-89.365175,42.304798,17,201,58408,293660,42,43,2010,1716,0,1,40420,0

-AR,AR0019,002,AR019-002,LYDA MILLER PUBLIC LIBRARY,2609 HIGHWAY 367 S,BALD KNOB,72010,0287,WHITE,5017245452,BR,NO,2100,R_11,0,R_11,-1,U_11,52,R_11,2012,00,06,00,-91.563371,35.315974,05,145,03280,78089,32,706,4050,0502,0,1,42620,1

-TX,TX0473,002,513,ELGIN PUBLIC LIBRARY,404 N MAIN ST,ELGIN,78621,2625,BASTROP,5122815678,CE,NC,12738,R_11,0,R_11,1983,R_11,52,R_11,2012,00,00,00,-97.371621,30.350555,48,021,23044,75028,32,9502,2001,4810,0,1,12420,0

-MA,MA0164,002,MALDEN,MALDEN PUBLIC LIBRARY,36 SALEM ST.,MALDEN,02148,5208,MIDDLESEX,7813240218,CE,NC,44000,R_11,0,R_11,3126,R_11,52,R_11,2012,00,00,00,-71.066333,42.427158,25,017,37875,1522050,21,3416,5007,2505,0,1,14460,0

-WI,WI0409,002,WI0409,LOWELL PUBLIC LIBRARY,105 N. RIVER ST.,LOWELL,53557,0015,DODGE,9209275700,CE,NO,900,R_11,0,R_11,1040,R_11,52,R_11,2012,00,00,00,-88.79318,43.345758,55,027,00000,88721,42,9618,1101,5505,X,15,13180,1

-PA,PA0413,002,901630213,BURGETTSTOWN COMMUNITY LIBRARY,2 KERR ST,BURGETTSTOWN,15021,1127,WASHINGTON,7249479780,CE,NC,6781,R_11,0,R_11,2392,R_11,52,R_11,2012,00,00,00,-80.389683,40.379256,42,125,10224,208170,42,7127,1030,4218,0,1,38300,0

-NE,NE0091,002,101C,GILBERT PUBLIC LIBRARY,628 SECOND STREET,FRIEND,68359,1308,SALINE,4029475081,CE,NO,1678,R_11,0,R_11,1196,R_11,52,R_11,2012,00,00,00,-97.285879,40.653724,31,151,17775,14398,43,9607,2058,3103,0,1,0,.

-KS,KS0028,002,KS0160,KANOPOLIS PUBLIC LIBRARY,221 N. KANSAS,KANOPOLIS,67454,0205,ELLSWORTH,7854723053,CE,NO,897,R_11,0,R_11,832,R_11,52,R_11,2012,00,00,00,-98.157901,38.711743,20,053,35950,6478,42,866,2294,2001,0,1,0,.

-IL,IL0098,059,3010857,PORTAGE-CRAGIN BRANCH,5108 WEST BELMONT AVENUE,CHICAGO,60641,4206,COOK,3127440152,BR,CC,14000,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-87.754563,41.938765,17,031,14000,5214098,11,1510.01001,2008,1704,0,1,16980,0

-PA,PA0083,002,905620545,SHEFFIELD TOWNSHIP LIBRARY,20 LEATHER ST,SHEFFIELD,16347,0607,WARREN,8149683439,CE,NO,1131,R_11,0,R_11,1352,R_11,52,R_11,2012,00,00,00,-79.033263,41.701956,42,123,69984,41480,42,9712,3019,4205,0,1,47620,1

-UT,UT0072,001,UT0072-001,SALEM CITY LIBRARY,59 SOUTH MAIN STREET,SALEM,84653,9601,UTAH,8014232622,CE,NC,4150,R_11,0,R_11,2216,R_11,52,R_11,2012,00,00,00,-111.673401,40.053702,49,049,65770,530104,32,104.0699997,2016,4903,0,1,39340,0

-NJ,NJ0048,002,NJ0048-002,ORADELL PUBLIC LIBRARY,375 KINDERKAMACK ROAD,ORADELL,07649,2122,BERGEN,2012622613,CE,NC,16000,R_11,0,R_11,3016,R_11,52,R_11,2012,00,00,00,-74.031827,40.953183,34,003,54990,912773,21,400.019989,4006,3405,0,1,35620,0

-WI,WI0191,002,WI0191,MENOMONEE FALLS PUBLIC LIBRARY,W156N8436 PILGRIM RD.,MENOMONEE FALLS,53051,3140,WAUKESHA,2625328930,CE,NC,52951,R_11,0,R_11,3328,R_11,52,R_11,2012,00,00,00,-88.103683,43.170852,55,133,51000,390701,21,2001.030029,3023,5505,0,1,33340,0

-NC,NC0033,002,C-FRANKLIN-F,FRANKLIN COUNTY LIBRARY,906 N MAIN ST,LOUISBURG,27549,2199,FRANKLIN,9194962111,CE,NC,6000,R_11,0,R_11,2704,R_11,52,R_11,2012,00,00,15,-78.296077,36.112774,37,069,39360,61181,32,603.0100098,1018,3701,0,1,39580,0

-FL,FL0042,006,FL0042-006,DUNBAR JUPITER HAMMON PUBLIC LIBRARY,3095 BLOUNT STREET,FORT MYERS,33916,4100,LEE,2395334150,BR,NC,11400,R_11,0,R_11,1992,R_11,52,R_11,2012,00,00,00,-81.84986,26.642837,12,071,24125,631602,13,5.019999981,2011,1219,0,1,15980,0

-LA,LA0038,005,LA0038,VERNON PARISH LIBRARY BOOKMOBILE,1401 NOLAN TRACE,LEESVILLE,71446,8378,VERNON,3372392027,BS,NO,-3,U_11,1,R_11,1248,R_11,52,R_11,2012,00,00,00,-93.133117,31.328122,22,115,00000,52178,43,9501,1052,2204,2,15,22860,1

-FL,FL0058,002,FL0058-002,NORTH PALM BEACH PUBLIC LIBRARY,303 ANCHORAGE DRIVE,NORTH PALM BEACH,33408,4990,PALM BEACH,5618413383,CE,NC,24893,R_11,0,R_11,3024,R_11,52,R_11,2012,00,00,00,-80.057184,26.810361,12,099,49600,1337512,21,7.019999981,1015,1218,0,1,33100,0

-MO,MO0037,002,MO0036-002,UNIVERSITY CITY PUBLIC LIBRARY,6701 DELMAR BLVD,UNIVERSITY CITY,63130,3199,ST. LOUIS,3147273150,CE,NC,43000,R_11,0,R_11,3744,R_11,52,R_11,2012,00,00,00,-90.308719,38.656351,29,189,75220,999321,21,2161,2010,2901,0,1,41180,0

-TX,TX0582,002,575,MCMULLEN MEMORIAL LIBRARY,900 N MAIN ST,HUNTINGTON,75949,M,ANGELINA,9368764516,CE,NO,2500,R_11,0,R_11,1792,R_11,52,R_11,2012,00,00,00,-94.572496,31.285469,48,005,35492,87276,42,13,1016,4801,0,1,31260,1

-CT,CT0109,002,9902,ATWATER MEMORIAL LIBRARY,1720 FOXON RD.,NORTH BRANFORD,06471,0258,NEW HAVEN,2033156020,CE,NC,13000,R_11,0,R_11,2830,R_11,52,R_11,2012,00,00,00,-72.765106,41.328739,09,009,00000,862812,21,1861,1010,0903,0,1,35300,0

-AZ,AZ0064,011,BR-GEASA,GEASA-MARANA BRANCH LIBRARY,13370 LON ADAMS ROAD,MARANA,85653,9050,PIMA,5205945255,BR,NC,2900,R_11,0,R_11,2034,R_11,52,R_11,2012,00,00,00,-111.208324,32.449152,04,019,44270,987573,42,44.29999924,2025,0401,0,1,46060,0

-TX,TX0216,005,30.53,THEODORE JOHNS BRANCH LIBRARY,4255 FANNETT RD,BEAUMONT,77705,M,JEFFERSON,4098425223,BR,CC,11970,R_11,0,R_11,3132,R_11,52,R_11,2012,00,00,00,-94.129488,30.045033,48,245,07000,253160,12,22,2023,4814,0,1,13140,0

-LA,LA0044,012,49,GENEALOGY BRANCH & ADM. OFFICE,200 EAST MULBERRY STREET,AMITE,70422,2524,TANGIPAHOA,9857487559,BR,NO,8000,R_11,0,R_11,1716,R_11,52,R_11,2012,00,00,00,-90.506645,30.728051,22,105,01885,122519,32,9534,3091,2205,0,1,25220,0

-MD,MD0024,005,MD0024-005,POCOMOKE BRANCH,301 MARKET ST.,POCOMOKE CITY,21851,1113,WORCESTER,4109570878,BR,NO,6728,R_11,0,R_11,2136,R_11,52,R_11,2012,00,00,00,-75.566713,38.074163,24,047,62475,51446,32,9515,3032,2401,0,1,41540,0

-IL,IL0318,003,3043000,MARION CARNEGIE LIBRARY,206 SOUTH MARKET STREET,MARION,62959,2516,WILLIAMSON,6189935935,CE,NO,21000,R_11,0,R_11,3129,R_11,-1,U_11,2012,00,00,00,-88.927162,37.729836,17,199,46916,66680,33,211,3029,1712,0,1,16060,0

-IA,IA0307,002,RD,FONTANELLE PUBLIC LIBRARY,303 WASHINGTON,FONTANELLE,50846,0387,ADAIR,6417454981,CE,NO,2000,R_11,0,R_11,1040,R_11,52,R_11,2012,00,00,00,-94.563977,41.289591,19,001,28290,7572,43,9603,4125,1903,0,1,0,.

-PR,PR0010,001,BP0009,MOROVIS ELECTRONIC MUNICIPAL LIBRARY,"JOSE DEL RIO ST., CORNER OF BETANCE",MOROVIS,00687,M,MOROVIS,7878626161,CE,NO,-1,U_11,0,R_11,-1,U_11,-1,U_11,2012,00,00,00,-66.427046,18.340777,72,101,29555,32753,21,9553,3018,7298,X,14,41980,0

-OH,OH0188,002,288C,IDA RUPP PUBLIC LIBRARY,310 MADISON ST.,PORT CLINTON,43452,1921,OTTAWA,4197323212,CE,NC,17075,R_11,0,R_11,3119,R_11,52,R_11,2012,00,00,00,-82.941131,41.510222,39,123,64150,41435,31,506,2000,3909,0,1,38840,1

-MD,MD0008,004,MD0008-004,NORTH EAST BRANCH,106 W. CECIL AVE.,NORTH EAST,21901,3506,CECIL,4109966269,BR,NO,2800,R_11,0,R_11,2600,R_11,52,R_11,2012,00,00,00,-75.945126,39.60096,24,015,56450,101628,31,309.0299988,1067,2401,0,1,37980,0

-LA,LA0055,007,59,DELMONT GARDENS BRANCH,3351 LORRAINE STREET,BATON ROUGE,70805,2724,EAST BATON ROUGE,2253547040,BR,CC,19267,R_11,0,R_11,3432,R_11,52,R_11,2012,00,00,00,-91.156007,30.496636,22,033,05000,441602,12,3,1023,2202,0,1,12940,0

-VA,VA0014,005,VA0014-005,PORTER BRANCH,2001 PARKWAY BOULEVARD,STAFFORD,22554,3972,STAFFORD,5406594909,BR,NO,23000,R_11,0,R_11,-1,U_11,-1,U_11,2012,00,00,00,-77.447472,38.464856,51,179,00000,132246,21,102.1399994,2007,5101,0,1,47900,0

-WI,WI0198,002,WI0198,MILTON PUBLIC LIBRARY,"430 E. HIGH ST., #100",MILTON,53563,1579,ROCK,6088687462,CE,NC,8000,R_11,0,R_11,2756,R_11,52,R_11,2012,00,00,00,-88.944389,42.77298,55,105,52200,160067,23,31,4016,5501,0,1,27500,0

-IA,IA0544,002,IA0563,WASHTA PUBLIC LIBRARY,100 S 5TH AVE,WASHTA,51061,1016,CHEROKEE,7124476546,CE,NO,1353,R_11,0,R_11,1222,R_11,52,R_11,2012,00,00,15,-95.720114,42.57533,19,035,82380,12029,43,803,2316,1904,0,1,0,.

-NH,NH0174,002,NH895,GILMAN LIBRARY,100 MAIN STREET,ALTON,03809,4618,BELKNAP,6038752550,CE,NO,6996,R_11,0,R_11,1872,R_11,52,R_11,2012,00,00,00,-71.213211,43.452041,33,001,00980,60189,42,9665,2060,3301,0,1,29060,1

-CT,CT0175,002,15900,WETHERSFIELD PUBLIC LIBRARY,515 SILAS DEANE HIGHWAY,WETHERSFIELD,06109,2216,HARTFORD,8605292665,CE,NC,32877,R_11,0,R_11,2968,R_11,52,R_11,2012,00,00,00,-72.663363,41.711653,09,003,84970,896248,21,4922,1007,0901,0,1,25540,0

-NH,NH0030,002,NH89126,MAXFIELD PUBLIC LIBRARY,8 ROUTE 129,LOUDON,03307,0814,MERRIMACK,6037985153,CE,NO,6800,R_11,0,R_11,1768,R_11,52,R_11,2012,00,00,00,-71.467819,43.286141,33,013,43300,146614,42,360,2046,3302,0,1,18180,1

-NY,NY0005,015,0800000000,NORTH PARK BRANCH LIBRARY,975 HERTEL AVENUE,BUFFALO,14216,2201,ERIE,7168753748,BR,CC,3967,R_11,0,R_11,1568,R_11,52,R_11,2012,00,00,00,-78.871968,42.94775,36,029,11000,919627,11,50,1005,3626,0,1,15380,0

-OK,OK0018,002,24,CHANDLER PUBLIC LIBRARY,1021 S MANVEL,CHANDLER,74834,3853,LINCOLN,4052583204,CE,NO,2500,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-96.880778,35.700289,40,081,13500,34293,32,9617,4057,4003,0,1,36420,0

-ND,ND0037,002,ND0037-002,HANKINSON PUBLIC LIBRARY,315 MAIN AVENUE S,HANKINSON,58041,0244,RICHLAND,7012427929,CE,NO,400,R_11,0,R_11,1198,R_11,51,R_11,2012,00,00,00,-96.902081,46.069547,38,077,34900,16261,43,9714,2037,3800,0,1,47420,1

-IN,IN0207,009,8412,TRAFALGAR BRANCH,424 TOWER STREET,TRAFALGAR,46181,8876,JOHNSON,3178789560,BR,NO,19000,R_11,0,R_11,3172,R_11,52,R_11,2012,00,00,00,-86.148687,39.413997,18,081,76310,141439,41,6114,2051,1809,0,1,26900,0

-KY,KY0071,003,MARION,MARION COUNTY PUBLIC LIBRARY,201 EAST MAIN STREET,LEBANON,40033,1133,MARION,2706924698,BS,NO,-3,U_11,1,R_11,1012,R_11,44,R_11,2012,00,00,00,-85.250194,37.570737,21,155,44344,20049,32,9702,2017,2101,0,1,0,.

-NY,NY0562,033,6800310000,LEFRAK CITY,98-30 57 AVENUE,CORONA,11368,4696,QUEENS,7185927677,BR,CC,18000,R_11,0,R_11,1820,R_11,52,R_11,2012,00,00,00,-73.861226,40.738002,36,081,51000,2256438,11,455,3000,3614,4,15,35620,0

-NC,NC0014,003,R-PETTIGREW-P,PERQUIMANS COUNTY LIBRARY,110 W ACADEMY ST,HERTFORD,27944,1306,PERQUIMANS,2524265319,BR,NO,5320,R_11,0,R_11,2278,R_11,52,R_11,2012,00,00,15,-76.471096,36.188915,37,143,30900,13458,42,9202.009766,3019,3701,0,1,21020,1

-IL,IL0140,002,3022800,DIVERNON TOWNSHIP LIBRARY,221 SOUTH SECOND STREET,DIVERNON,62530,0140,SANGAMON,2176283813,CE,NO,1300,R_11,0,R_11,1560,R_11,-1,U_11,2012,00,00,00,-89.65852,39.565293,17,167,20045,198949,41,33,4122,1718,0,1,44100,0

-GA,GA0010,005,6282PICKENS,PICKENS COUNTY PUBLIC LIBRARY,100 LIBRARY LANE,JASPER,30143,1364,PICKENS,7066925411,BR,NO,11000,R_11,0,R_11,3320,R_11,52,R_11,2012,00,00,00,-84.420188,34.467478,13,227,41932,29431,32,502,1040,1309,2,14,12060,0

-MN,MN0085,002,S1310,BLOOMING PRAIRIE BRANCH LIBRARY,138 HIGHWAY AVE SOUTH,BLOOMING PRAIRIE,55917,0187,STEELE,5075837750,BR,NO,3000,R_11,0,R_11,2213,R_11,52,R_11,2012,00,00,00,-93.049227,43.866202,27,147,06580,36541,43,9608,4004,2701,0,5,36940,1

-MD,MD0003,039,MD0003,SOUTHEAST ANCHOR BRANCH,3601 EASTERN AVENUE,BALTIMORE,21224,M,BALTIMORE CITY,4103961580,BR,CC,27000,R_11,0,R_11,2527,R_11,52,R_11,2012,00,00,00,-76.567069,39.286587,24,510,04000,620216,11,2609,1010,2403,0,1,12580,0

-TX,TX0012,002,109,DENTON PUBLIC LIBRARY,502 OAKLAND ST,DENTON,76201,3102,DENTON,9403498750,CE,NC,22876,R_11,0,R_11,3161,R_11,52,R_11,2012,00,00,00,-97.129918,33.219673,48,121,19972,685601,12,206.0099945,3025,4826,0,1,19100,0

-MT,MT0044,002,747,LAUREL PUBLIC LIBRARY,720 WEST 3RD ST,LAUREL,59044,0068,YELLOWSTONE,4066284961,CE,NO,6000,R_11,0,R_11,2338,R_11,51,R_11,2012,00,00,00,-108.780888,45.671408,30,111,42700,149907,31,19.01000023,2007,3000,0,1,13740,0

-NJ,NJ0194,008,1225F,ISELIN BRANCH,1081 GREEN ST.,ISELIN,08830,2171,MIDDLESEX,7327267073,BR,NC,9000,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-74.313243,40.571154,34,023,34470,816618,21,26.03000069,2000,3406,0,1,35620,0

-FL,FL0005,003,FL0005-003,ALAFAYA BRANCH,12000 EAST COLONIAL DRIVE,ORLANDO,32826,4705,ORANGE,4078357323,BR,CC,12000,R_11,0,R_11,3150,R_11,52,R_11,2012,00,00,00,-81.203356,28.566308,12,095,00410,1171018,21,167.2700043,1000,1209,0,1,36740,0

-LA,LA0002,003,10,MONTGOMERY BRANCH,940 CADDO STREET,MONTGOMERY,71454,0157,GRANT,3186463660,BR,NO,770,R_11,0,R_11,1456,R_11,52,R_11,2012,00,00,00,-92.889812,31.667523,22,043,51620,22115,42,202,1127,2205,0,1,10780,0

-IA,IA0400,002,VT,CRESCO PUBLIC LIBRARY,320 N ELM ST,CRESCO,52136,1452,HOWARD,5635472540,CE,NO,7800,R_11,0,R_11,3120,R_11,52,R_11,2012,00,00,15,-92.11626,43.374078,19,089,17220,9533,33,9601,4021,1901,0,1,0,.

-IL,IL0031,002,3003100,BARCLAY PUBLIC LIBRARY DISTRICT,220 SOUTH MAIN STREET,WARRENSBURG,62573,0349,MACON,2176723621,CE,NO,5000,R_11,0,R_11,2600,R_11,-1,U_11,2012,00,00,00,-89.062322,39.929914,17,115,78916,110616,41,28,1137,1713,0,1,19500,0

-CT,CT0022,002,2100,DAVID M. HUNT LIBRARY,63 MAIN ST.,CANAAN,06031,0217,LITCHFIELD,8608247424,CE,NO,3600,R_11,0,R_11,1456,R_11,52,R_11,2012,00,00,00,-73.328793,42.026425,09,005,10870,188923,42,2602,1020,0905,0,1,45860,1

-CA,CA0119,002,M715,CROWELL PUBLIC LIBRARY,1890 HUNTINGTON DR.,SAN MARINO,91108,2595,LOS ANGELES,6263000777,CE,NC,29000,R_11,0,R_11,3224,R_11,52,R_11,2012,00,00,00,-118.112501,34.119348,06,037,68224,9889025,21,4641,4005,0627,0,1,31080,0

-MN,MN0043,009,M0715,NEW BRIGHTON LIBRARY,400 10TH STREET NW,NEW BRIGHTON,55112,6806,RAMSEY,6517246002,BR,NC,6250,R_11,0,R_11,294,R_11,9,R_11,2012,02,00,00,-93.191665,45.065114,27,123,45430,515336,21,412,1003,2704,0,1,33460,0

-GA,GA0008,018,GA0008,TYBEE,403 BUTLER AVENUE,TYBEE ISLAND,31328,9719,CHATHAM,9127867733,BR,NC,4000,R_11,0,R_11,1528,R_11,52,R_11,2012,00,00,00,-80.842618,32.008883,13,051,78036,271896,31,111.0299988,2022,1301,0,1,42340,0

-NH,NH0042,002,NH89137,WADLEIGH MEMORIAL LIBRARY,49 NASHUA STREET,MILFORD,03055,3753,HILLSBOROUGH,6036732408,CE,NO,13500,R_11,0,R_11,2923,R_11,52,R_11,2012,00,00,00,-71.648234,42.835511,33,011,47940,401842,22,162.0099945,5021,3302,0,1,31700,0

-NY,NY0069,002,7800020610,FREE LIBRARY OF THE BELMONT LITERARY AND HIST,2 WILLETS AVENUE,BELMONT,14813,1025,ALLEGANY,5852685308,CE,NO,8000,R_11,0,R_11,1000,R_11,50,R_11,2012,00,00,00,-78.034633,42.223293,36,003,05815,48744,42,9506,3060,3623,0,1,0,.

-IL,IL0243,002,3034900,HIGHWOOD PUBLIC LIBRARY,102 HIGHWOOD AVENUE,HIGHWOOD,60040,1520,LAKE,8474325404,CE,NC,10363,R_11,0,R_11,2128,R_11,-1,U_11,2012,00,00,00,-87.810707,42.201979,17,097,34865,701575,21,8652,1007,1710,0,1,16980,0

-VA,VA0005,008,VA0005-008,WESTOVER BRANCH LIBRARY,1644 N.MCKINLEY STREET,ARLINGTON,22205,2855,ARLINGTON,7032285260,BR,NC,16403,R_11,0,R_11,-1,U_11,-1,U_11,2012,00,00,00,-77.140244,38.888422,51,013,03000,216118,12,1010,4003,5108,4,15,47900,0

-AZ,AZ0111,002,CE-COPPER,COPPER QUEEN LIBRARY,6 MAIN STREET,BISBEE,85603,9901,COCHISE,5204324232,CE,NO,9000,R_11,0,R_11,2236,R_11,52,R_11,2012,00,00,00,-109.915161,31.441616,04,003,06260,132770,33,10,2015,0402,0,1,43420,0

-SD,SD0042,002,SD0042-002,DOROTHEE PIKE MEMORIAL LIBRARY,225 N MAIN AVE,LAKE PRESTON,57249,0036,KINGSBURY,6058474843,CE,NO,392,R_11,0,R_11,988,R_11,52,R_11,2012,00,00,15,-97.377319,44.362653,46,077,35500,5152,43,9581,1191,4600,0,1,0,.

-WI,WI0322,002,WI0322,TOMAHAWK PUBLIC LIBRARY,300 W. LINCOLN AVE.,TOMAHAWK,54487,1202,LINCOLN,7154532455,CE,NO,11465,R_11,0,R_11,2596,R_11,52,R_11,2012,00,00,00,-89.733724,45.473925,55,069,80125,28492,32,9602,2040,5507,0,1,32980,1

-NY,NY0146,002,2400113890,PECK MEMORIAL LIBRARY,24 MAIN STREET,MARATHON,13803,0325,CORTLAND,6078496135,CE,NO,3052,R_11,0,R_11,1490,R_11,52,R_11,2012,00,00,00,-76.033345,42.441233,36,023,45392,49619,42,9711,3064,3622,0,1,18660,1

-DC,DC0001,017,SOW,SOUTHWEST BRANCH LIBRARY,"900 WESLEY PLACE, SW",WASHINGTON,20024,4212,DIST OF COLUMBIA,2027244752,BR,CC,22032,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-77.016219,38.879185,11,001,50000,619020,11,102,2044,1198,0,1,47900,0

-OR,OR0140,001,OR0140,LANE LIBRARY DISTRICT,64 W. OREGON AVE.,CRESWELL,97426,0366,LANE,5038953053,CE,NC,4950,R_11,0,R_11,2300,R_11,52,R_11,2012,00,00,00,-123.019529,43.917861,41,039,16950,353481,31,11.01000023,3086,4104,0,1,21660,0

-IL,IL0445,004,3056601,YORK STREET BRANCH LIBRARY,639 YORK STREET,QUINCY,62301,3918,ADAMS,2172231309,BR,NO,2775,R_11,0,R_11,2020,R_11,52,R_11,2012,00,00,00,-91.40595,39.929353,17,001,62367,67169,33,7,2054,1718,0,1,39500,1

-OH,OH0226,003,326B01,BROOKFIELD BRANCH LIBRARY,7032 GROVE ST,BROOKFIELD,44403,0052,TRUMBULL,3304488134,BR,NC,4200,R_11,0,R_11,2652,R_11,52,R_11,2012,00,00,00,-80.565232,41.232504,39,155,09200,208874,21,9314,4006,3913,0,1,49660,0

-WI,WI0052,002,WI0052,CAMPBELLSPORT PUBLIC LIBRARY,220 N. HELENA,CAMPBELLSPORT,53010,0405,FOND DU LAC,9205338534,CE,NC,10440,R_11,0,R_11,2020,R_11,52,R_11,2012,00,00,00,-88.276351,43.599297,55,039,12325,101856,42,422,3028,5506,0,1,22540,0

-AR,AR0001,017,AR0001-13,OLEY E. ROOKER LIBRARY,11 OTTER CREEK CT.,LITTLE ROCK,72210,M,PULASKI,5019075991,BR,NC,13550,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-92.427498,34.659253,05,119,41000,386822,41,42.20000076,3009,0502,0,1,30780,0

-GA,GA0022,042,GA0022,NORTHEAST-SPRUILL OAKS REGIONAL  LIBRARY,9560 SPRUILL ROAD,ALPHARETTA,30022,5550,FULTON,7703608820,BR,NC,25000,R_11,0,R_11,2882,R_11,52,R_11,2012,00,00,00,-84.22258,34.01515,13,121,42425,949580,21,114.25,2005,1306,0,1,12060,0

-ME,ME0247,002,93,NORTH GORHAM PUBLIC LIBRARY,2 STANDISH NECK ROAD,GORHAM,04038,2469,CUMBERLAND,2078922575,CE,NO,1200,R_11,0,R_11,750,R_11,52,R_11,2012,00,00,00,-70.45554,43.798963,23,005,00000,282669,32,41,2006,2301,0,1,38860,0

-ME,ME0079,002,83,FORT KENT PUBLIC LIBRARY,1 MONUMENT SQUARE,FORT KENT,04743,1216,AROOSTOOK,2078343048,CE,NO,3153,R_11,0,R_11,1579,R_11,52,R_11,2012,00,00,00,-68.595746,47.250002,23,003,25720,71388,43,9506,3020,2302,4,14,0,.

-MI,MI0336,002,MI0336-002,THOMAS TOWNSHIP LIBRARY,8207 SHIELDS DRIVE,SAGINAW,48609,4814,SAGINAW,9897813770,CE,CC,14360,R_11,0,R_11,2732,R_11,52,R_11,2012,00,00,00,-84.074739,43.41882,26,145,73560,198990,22,120.0299988,1011,2604,0,1,40980,0

-TN,TN0133,006,KN001-05,FARRAGUT BRANCH LIBRARY,417 N. CAMPBELL STATION ROAD,KNOXVILLE,37934,2708,KNOX,8657771750,BR,CC,10100,R_11,0,R_11,2860,R_11,52,R_11,2012,00,00,00,-84.168446,35.88913,47,093,25760,436877,21,58.09000015,2020,4702,0,1,28940,0

-TX,TX0160,002,245,RED WALLER COMMUNITY LIBRARY,109 MELTON ST,MALAKOFF,75148,9347,HENDERSON,9034891818,CE,NO,4800,R_11,0,R_11,1500,R_11,50,R_11,2012,00,00,00,-96.014132,32.170392,48,213,46224,78753,33,9510,2059,4805,0,1,11980,1

-LA,LA0058,008,LA0058,MILTON H. LATTER MEMORIAL BRANCH,5120 ST. CHARLES AVENUE,NEW ORLEANS,70115,4941,ORLEANS,5045962625,BR,CC,9000,R_11,0,R_11,2704,R_11,52,R_11,2012,00,00,00,-90.109987,29.926997,22,071,55000,360341,11,108,1015,2202,0,1,35380,0

-WV,WV0020,005,019C,BOOKMOBILE,3100 EMERSON AVE.,PARKERSBURG,26104,2414,WOOD,3044204587,BS,NC,-3,U_11,1,R_11,1040,R_11,52,R_11,2012,00,00,00,-81.53833,39.28578,54,107,62140,86837,13,3,2003,5401,0,1,37620,0

-CA,CA0019,004,M615,CARNEGIE TECHNOLOGY CENTER BRANCH,420 HEBER AVE.,CALEXICO,92231,2559,IMPERIAL,7603575525,BR,NC,4000,R_11,0,R_11,708,R_11,52,R_11,2012,00,00,00,-115.493508,32.669062,06,025,09710,175897,31,121,3011,0651,0,1,20940,0

-PA,PA0385,026,926510006,LAWNCREST BRANCH LIBRARY,6098 RISING SUN AVENUE,PHILADELPHIA,19111,6009,PHILADELPHIA,2156850549,BR,CC,7184,R_11,0,R_11,-1,U_11,-1,U_11,2012,00,00,00,-75.099695,40.045973,42,101,60000,1538567,11,305.019989,1004,4213,0,1,37980,0

-NY,NY0306,002,3600414840,C. W. CLARK MEMORIAL LIBRARY,160 NORTH MAIN STREET,ORISKANY FALLS,13425,0670,ONEIDA,3158217850,CE,NC,3141,R_11,0,R_11,2060,R_11,52,R_11,2012,00,00,00,-75.459544,42.940144,36,065,55376,234155,42,256,2068,3622,0,1,46540,0

-NC,NC0061,005,C-UNION-W,WAXHAW LIBRARY,509 S PROVIDENCE ST,WAXHAW,28173,9339,UNION,7048433131,BR,NC,2900,R_11,0,R_11,1489,R_11,52,R_11,2012,00,00,15,-80.741342,34.922191,37,179,71460,205193,41,210.1399994,1009,3709,0,1,16740,0

-IL,IL0194,002,3029200,FRANKLIN PARK PUBLIC LIBRARY DISTRICT,10311 GRAND AVENUE,FRANKLIN PARK,60131,2225,COOK,8474556016,CE,NC,25000,R_11,0,R_11,3424,R_11,-1,U_11,2012,00,00,00,-87.882502,41.930234,17,031,27702,5214098,21,8114.02002,2009,1705,0,1,16980,0

-OH,OH0116,004,216B02,BALTIMORE BRANCH LIBRARY,205 E. MARKET ST.,BALTIMORE,43105,1326,FAIRFIELD,7408628505,BR,NC,3866,R_11,0,R_11,1632,R_11,51,R_11,2012,00,00,00,-82.600057,39.84531,39,045,03758,147355,31,303,3030,3915,0,1,18140,0

-MI,MI0360,002,MI0360-002,WATERVLIET DISTRICT LIBRARY,333 NORTH MAIN STREET,WATERVLIET,49098,9750,BERRIEN,2694636382,CE,NC,10272,R_11,0,R_11,2340,R_11,52,R_11,2012,00,00,00,-86.259586,42.19054,26,021,84500,156489,31,103,4027,2606,0,1,35660,0

-AL,AL0108,019,AL0108-019,BIRMINGHAM PUBLIC - TITUSVILLE BRANCH,26TH AVENUE SW,BIRMINGHAM,35211,2909,JEFFERSON,2053221140,BR,CC,9000,R_11,0,R_11,2385,R_11,52,R_11,2012,00,00,00,-86.833206,33.491538,01,073,07000,658967,12,51.00999832,3026,0107,4,15,13820,0

-TX,TX0443,005,86.66,SOUTH REGIONAL LIBRARY,2101 LAKE ROBBINS DR,THE WOODLANDS,77380,M,MONTGOMERY,9364427727,BR,NC,30000,R_11,0,R_11,3141,R_11,52,R_11,2012,00,00,00,-95.465652,30.162471,48,339,72656,471704,23,6917,1033,4808,0,1,26420,0

-FL,FL0012,042,FL0012-042,TAMARAC BRANCH,8701 W. COMMERCIAL BLVD.,TAMARAC,33351,4310,BROWARD,9547202282,BR,NC,30000,R_11,0,R_11,2672,R_11,52,R_11,2012,00,00,00,-80.267013,26.193663,12,011,39550,1784956,21,601.1400146,1004,1220,0,1,33100,0

-PA,PA0007,002,901300753,EVA K BOWLBY PUBLIC LIBRARY,311 NORTH WEST ST,WAYNESBURG,15370,1238,GREENE,7246279776,CE,NO,9791,R_11,0,R_11,2401,R_11,52,R_11,2012,00,00,00,-80.191465,39.899297,42,059,81832,38424,32,9706,3034,4218,0,1,0,.

-IN,IN0068,012,2713,CENTRE TOWNSHIP BRANCH,1150 EAST KERN ROAD,SOUTH BEND,46614,6472,ST JOSEPH,5742513700,BR,CC,17160,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-86.232966,41.607649,18,141,71000,266661,12,119,2011,1802,0,1,43780,0

-MI,MI0284,002,MI0284-002,REED CITY PUBLIC LIBRARY,410 WEST UPTON AVENUE,REED CITY,49677,1152,OSCEOLA,2318322131,CE,NO,3649,R_11,0,R_11,1872,R_11,52,R_11,2012,00,00,00,-85.516822,43.875995,26,133,67820,23455,33,9706,2012,2604,0,1,0,.

-VT,VT0042,002,CORNWALL,CORNWALL FREE PUBLIC,2629 RT 30,CORNWALL,05753,9299,ADDISON,8024623615,CE,NO,315,R_11,0,R_11,1664,R_11,52,R_11,2012,00,00,00,-73.209627,43.961155,50,001,00000,36785,41,9609,1014,5000,0,1,0,.

-TX,TX0065,002,157,LEE PUBLIC LIBRARY,312 W PACIFIC AVE,GLADEWATER,75647,2135,GREGG,9038452640,CE,NC,5500,R_11,0,R_11,2048,R_11,52,R_11,2012,00,00,00,-94.946153,32.535873,48,183,29660,122325,31,102,3008,4801,0,1,30980,0

-CT,CT0105,002,9500,PUBLIC LIBRARY OF NEW LONDON,63 HUNTINGTON ST.,NEW LONDON,06320,6194,NEW LONDON,8604471411,CE,CC,22000,R_11,0,R_11,2678,R_11,52,R_11,2012,00,00,00,-72.099541,41.354992,09,011,52280,274091,13,6905,2026,0902,0,1,35980,0

-NJ,NJ0195,009,1300G,MARLBORO LIBRARY,1 LIBRARY COURT,MARLBORO,07746,1102,MONMOUTH,7325369406,BR,NC,18100,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-74.262612,40.330578,34,025,63900,630092,21,8097.009766,2021,3406,0,1,35620,0

-IL,IL0037,002,3003700,BEDFORD PARK PUBLIC LIBRARY DISTRICT,7816 WEST 65TH PLACE,BEDFORD PARK,60501,1914,COOK,7084586826,CE,NC,17000,R_11,0,R_11,3016,R_11,-1,U_11,2012,00,00,00,-87.816069,41.772696,17,031,04572,5214098,21,8205.009766,1026,1703,0,1,16980,0

-KS,KS0191,002,KS0059,CONWAY SPRINGS CITY LIBRARY,210 W. SPRINGS AVE,CONWAY SPRINGS,67031,0183,SUMNER,6204562859,CE,NO,980,R_11,0,R_11,1040,R_11,52,R_11,2012,00,00,00,-97.63682,37.390358,20,191,15325,23835,42,9622,3026,2004,0,1,48620,0

-NH,NH0055,002,NH89149,NEWFIELDS PUBLIC LIBRARY,76 MAIN STREET,NEWFIELDS,03856,0200,ROCKINGHAM,6037788169,CE,NO,-1,U_11,0,R_11,1456,R_11,52,R_11,2012,00,00,00,-70.932892,43.041363,33,015,00000,296085,31,675.0100098,2014,3301,0,1,14460,0

-WI,WI0308,002,WI0308,JEAN M. THOMSEN MEMORIAL LIBRARY,105 N. GERSHWIN ST.,STETSONVILLE,54480,0099,TAYLOR,7156782892,CE,NO,3990,R_11,0,R_11,1772,R_11,52,R_11,2012,00,00,00,-90.311907,45.077181,55,119,77100,20759,42,9606,3046,5507,0,1,0,.

-FL,FL0011,019,FL0011-019,SUNTREE/VIERA PUBLIC LIBRARY,902 JORDAN BLASS DRIVE,MELBOURNE,32940,3350,BREVARD,3212554404,BR,NO,15000,R_11,0,R_11,2716,R_11,-1,U_11,2012,00,00,00,-80.679796,28.215339,12,009,00000,544322,21,641.2700195,4005,1208,0,1,37340,0

-NY,NY0071,002,7800021050,ESSENTIAL CLUB FREE LIBRARY,11 PRATT STREET,CANASERAGA,14822,9728,ALLEGANY,6075456443,CE,NO,2500,R_11,0,R_11,1144,R_11,52,R_11,2012,00,00,00,-77.775745,42.461616,36,003,12177,48744,42,9501,1037,3623,0,1,0,.

-WI,WI0134,002,WI0134,HILLSBORO PUBLIC LIBRARY,819 HIGH AVE.,HILLSBORO,54634,0468,VERNON,6084892192,CE,NO,6800,R_11,0,R_11,1716,R_11,52,R_11,2012,00,00,00,-90.341447,43.652202,55,123,34825,30043,43,9601,2032,5503,0,1,0,.

-FL,FL0021,002,FL0021-002,COLUMBIA COUNTY PUBLIC LIBRARY,308 N.W. COLUMBIA AVE.,LAKE CITY,32055,2891,COLUMBIA,3867581018,CE,NO,15400,R_11,0,R_11,2942,R_11,52,R_11,2012,00,00,00,-82.63798,30.192204,12,023,37775,67383,33,1105,2048,1203,0,1,29380,1

-FL,FL0001,005,FL0001-005,BELLEVIEW PUBLIC LIBRARY,13145 SE HWY 484,BELLEVIEW,34420,5831,MARION,3524382500,BR,NC,16000,R_11,0,R_11,2614,R_11,52,R_11,2012,00,00,00,-82.065647,29.028237,12,083,00000,332472,23,11.02000046,2051,1211,0,1,36100,0

-TN,TN0135,015,NA001-14,MADISON BRANCH LIBRARY,610 GALLATIN PIKE SOUTH,MADISON,37115,2192,DAVIDSON,6158625868,BR,CC,20680,R_11,0,R_11,2600,R_11,52,R_11,2012,00,00,00,-86.71406,36.257013,47,037,52006,635592,11,107.0100021,4001,4705,0,1,34980,0

-NH,NH0220,002,NH8995,WOODSVILLE FREE PUBLIC LIBRARY,14 SCHOOL LANE,WOODSVILLE,03785,1227,GRAFTON,6037473483,CE,NO,1360,IP10,0,R_11,1092,R_11,52,R_11,2012,00,00,00,-72.036754,44.150391,33,009,87140,88954,43,9606,3035,3302,0,1,17200,1

-NY,NY0651,002,8000580910,BROOKHAVEN FREE,273 BEAVER DAM RD,BROOKHAVEN,11719,9607,SUFFOLK,6312861923,CE,NC,5047,R_11,0,R_11,3140,R_11,52,R_11,2012,00,00,00,-72.911466,40.773631,36,103,09000,1500338,21,1592.040039,3015,3601,0,1,35620,0

-IL,IL0480,002,3060300,SCHILLER PARK PUBLIC LIBRARY,4200 OLD RIVER ROAD,SCHILLER PARK,60176,1630,COOK,8476780433,CE,NC,12500,R_11,0,R_11,3328,R_11,-1,U_11,2012,00,00,00,-87.860692,41.955696,17,031,68081,5214098,21,7708,1039,1705,0,1,16980,0

-NY,NY0004,019,0400300000,CROWN HEIGHTS,560 NEW YORK AVENUE,BROOKLYN,11225,5296,KINGS,7187731180,BR,CC,9280,R_11,0,R_11,2392,R_11,52,R_11,2012,00,00,00,-73.947776,40.661073,36,047,51000,2540230,11,804,1001,3609,0,1,35620,0

-OH,OH0039,011,139BK,STARK COUNTY DISTRICT LIBRARY,715 MARKET AVE.,CANTON,44702,1080,STARK,3304520665,BS,CC,-3,U_11,5,R_11,5976,R_11,52,R_11,2012,00,00,00,-81.373216,40.803018,39,151,12000,374328,13,7001,1048,3907,4,16,15940,0

-GA,GA0036,008,GA0036,MARION COUNTY PUBLIC LIBRARY,123 EAST 5TH AVE.,BUENNA VISTA,31803,0391,MARION,2296496385,BR,NO,4311,R_11,0,R_11,1588,R_11,52,R_11,2012,00,00,00,-84.522819,32.318596,13,197,11728,8706,42,9202,3058,1302,0,1,17980,0

-NJ,NJ0195,006,1300D,HAZLET MEMORIAL LIBRAY,251 MIDDLE ROAD,HAZLET,07730,1904,MONMOUTH,7322647164,BR,NC,10200,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-74.176972,40.427274,34,025,00000,630092,21,8021,5017,3406,0,1,35620,0

-SC,SC8004,006,SC8004-002,NINETY-SIX BRANCH LIBRARY,100 SOUTH CAMBRIDGE STREET,NINETY-SIX,29666,1111,GREENWOOD,8645434749,BR,NO,3750,R_11,0,R_11,2314,R_11,52,R_11,2012,00,00,00,-82.023766,34.175093,45,047,50290,69739,42,9707.019531,4019,4503,0,1,24940,1

-WI,WI0382,002,WI0382,DRESSER PUBLIC LIBRARY,117 S. CENTRAL AVE.,DRESSER,54009,0547,POLK,7157552944,CE,NO,1274,R_11,0,R_11,1250,R_11,50,R_11,2012,00,00,00,-92.633499,45.355307,55,095,20850,43984,42,9607,3074,5507,0,1,0,.

-OK,OK0061,002,65,MAYSVILLE PUBLIC LIBRARY,508 WILLIAMS,MAYSVILLE,73057,0878,GARVIN,4058674748,CE,NO,3125,R_11,0,R_11,2184,R_11,52,R_11,2012,00,00,00,-97.40843,34.817807,40,049,47150,27387,42,6816,1114,4004,0,1,0,.

-CA,CA0027,006,M623,ATLANTIC BRANCH,2269 S. ATLANTIC BLVD,COMMERCE,90040,3955,LOS ANGELES,3237801176,BR,NC,3852,R_11,0,R_11,2080,R_11,52,R_11,2012,00,00,00,-118.164562,34.005171,06,037,14974,9889025,21,5323.02002,4024,0640,0,1,31080,0

-NJ,NJ0114,005,0705C,FRANKLIN BRANCH,192 DODD STREET,EAST ORANGE,07017,2107,ESSEX,9732667053,BR,NC,500,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-74.21016,40.781824,34,013,19390,786754,21,100,1001,3410,0,1,35620,0

-OH,OH0177,002,277C,MORLEY LIBRARY,184 PHELPS ST.,PAINESVILLE,44077,3926,LAKE,4403523383,CE,NC,17500,R_11,0,R_11,3951,R_11,52,R_11,2012,00,00,00,-81.243767,41.727571,39,085,59416,229873,21,2045,1008,3914,0,1,17460,0

-NY,NY0420,002,5600221120,CARTHAGE FREE LIBRARY,412 BUDD STREET,CARTHAGE,13619,1206,JEFFERSON,3154932620,CE,NO,4800,R_11,0,R_11,2392,R_11,52,R_11,2012,00,00,00,-75.609727,43.979609,36,045,12683,118163,33,609,3023,3621,0,1,48060,0

-KY,KY0073,003,MARTIN,RUFUS M. REED PUBLIC LIBRARY,1442 RIVERFRONT ROAD,LOVELY,41231,0359,MARTIN,6063956500,BR,NO,3400,R_11,0,R_11,2080,R_11,52,R_11,2012,00,00,00,-82.367849,37.78319,21,159,00000,12839,42,9503,1003,2105,X,14,0,.

-MT,MT0051,005,1216,FRENCHTOWN SCHOOL AND COMMUNITY LIBRARY  (MISSOULA CO.),16495 MAIN STREET,FRENCHTOWN,59834,M,MISSOULA,4066262730,BR,NO,4500,R_11,0,R_11,1040,R_11,52,R_11,2012,00,14,00,-114.227048,47.011982,30,063,29350,110114,42,16,1029,3000,0,1,33540,0

-PA,PA0147,002,908561265,MARY S BIESECKER PUB LIBRARY,230 S ROSINA AVE,SOMERSET,15501,1937,SOMERSET,8144454011,CE,NC,3531,R_11,0,R_11,2345,R_11,52,R_11,2012,00,00,00,-79.082912,40.00724,42,111,71776,77382,32,210,3002,4212,0,1,43740,1

-IA,IA0251,002,MM,ALGONA PUBLIC LIBRARY,210 N PHILLIPS,ALGONA,50511,1735,KOSSUTH,5152955476,CE,NO,16000,R_11,0,R_11,2912,R_11,52,R_11,2012,00,00,15,-94.226822,43.070044,19,109,01135,15369,33,9505,2042,1904,0,1,0,.

-NY,NY0004,061,0400300000,WINDSOR TERRACE,160 EAST 5TH STREET,BROOKLYN,11218,1702,KINGS,7186869707,BR,CC,7500,R_11,0,R_11,2392,R_11,52,R_11,2012,00,00,00,-73.97657,40.648829,36,047,51000,2540230,11,500,2004,3609,0,1,35620,0

-IN,IN0177,002,7284,BLOOMFIELD-EASTERN GREENE COUNTY PUBLIC LIBRARY,125 SOUTH FRANKLIN STREET,BLOOMFIELD,47424,1406,GREENE,8123844125,CE,NO,13400,R_11,0,R_11,2808,R_11,52,R_11,2012,00,00,00,-86.93852,39.02559,18,055,05716,33061,32,9554,4026,1808,0,1,0,.

-NJ,NJ0223,002,NJ0223-002,BUTLER PUBLIC LIBRARY,1 ACE ROAD,BUTLER,07045,1375,MORRIS,2018383262,CE,NC,6324,R_11,0,R_11,2659,R_11,52,R_11,2012,00,00,00,-74.342983,41.003358,34,027,09040,495941,21,405,1007,3411,0,1,35620,0

-TX,TX0247,011,329.299,CINCO RANCH BRANCH LIBRARY,2620 COMMERCIAL CENTER BLVD,KATY,77494,6407,FORT BEND,2813951311,BR,NC,33500,R_11,0,R_11,2524,R_11,52,R_11,2012,00,00,00,-95.787502,29.725205,48,157,00000,607952,41,6731.02002,1089,4822,2,15,26420,0

-MN,MN0035,015,M0213,ANOKA: RUM RIVER,4201 6TH AVE NW,ANOKA,55303,M,ANOKA,7635764695,BR,NC,30150,R_11,0,R_11,2312,R_11,52,R_11,2012,00,00,15,-93.380862,45.22981,27,003,01720,333226,21,505.0499878,3007,2706,0,1,33460,0

-OH,OH0002,019,102B17,HIGHLAND SQUARE BRANCH LIBRARY,807 W MARKET ST,AKRON,44303,1010,SUMMIT,3303762927,BR,CC,11500,R_11,0,R_11,2964,R_11,52,R_11,2012,00,06,15,-81.543313,41.096492,39,153,01000,541227,12,5073,3001,3913,0,1,10420,0

-MA,MA0015,002,ATHOL,ATHOL PUBLIC LIBRARY,568 MAIN STEET,ATHOL,01331,1888,WORCESTER,9782499515,CE,NO,8000,R_11,0,R_11,2350,R_11,52,R_11,2012,00,00,15,-72.228447,42.593893,25,027,02515,803785,32,7031,1026,2502,4,14,49340,0

-CA,CA0105,022,M701,SOUTHGATE LIBRARY,6132 66TH AVE.,SACRAMENTO,95823,2706,SACRAMENTO,9162642700,BR,CC,12000,R_11,0,R_11,1980,R_11,49,R_11,2012,00,00,00,-121.433101,38.492443,06,067,24498,1436262,21,50.02000046,1004,0607,0,1,40900,0

-CT,CT0124,002,11000,PLAINVILLE PUBLIC LIBRARY,56 EAST MAIN ST.,PLAINVILLE,06062,1934,HARTFORD,8607931446,CE,NC,30000,R_11,0,R_11,3016,R_11,52,R_11,2012,00,00,00,-72.865298,41.671845,09,003,00000,896248,21,4205,4005,0905,0,1,25540,0

-TX,TX0069,002,160,LIBRARY OF GRAHAM,910 CHERRY ST,GRAHAM,76450,3547,YOUNG,9405490600,CE,NO,14000,R_11,0,R_11,2184,R_11,52,R_11,2012,00,00,00,-98.587919,33.099243,48,503,30392,18385,33,9505,2005,4819,0,1,0,.

-MI,MI0322,002,MI0322-002,SAINT CLAIR SHORES PUBLIC LIBRARY,22500 ELEVEN MILE ROAD,SAINT CLAIR SHORES,48081,1399,MACOMB,5867719020,CE,NC,30000,R_11,0,R_11,3092,R_11,52,R_11,2012,00,00,00,-82.887251,42.496575,26,099,70760,842881,21,2511,1001,2609,0,1,19820,0

-TX,TX0183,002,269,WARD COUNTY LIBRARY,409 S DWIGHT ST,MONAHANS,79756,4609,WARD,4329433332,CE,NO,15000,R_11,0,R_11,2452,R_11,52,R_11,2012,00,00,00,-102.887829,31.59298,48,475,48936,10713,32,9502,1052,4823,0,5,0,.

-AL,AL0248,001,506-003,SOUTHSIDE PUBLIC LIBRARY,2255 HIGHWAY 77,SOUTHSIDE,35907,7905,ETOWAH,2564420105,CE,NO,1000,R_11,0,R_11,1404,R_11,52,R_11,2012,00,00,00,-86.022239,33.918787,01,055,71832,104298,23,105.0100021,3044,0104,0,1,23460,0

-ME,ME0296,002,268,SWANS ISLAND PUBLIC LIBRARY,451 ATLANTIC ROAD,SWAN'S ISLAND,04685,0012,HANCOCK,2075264330,CE,NO,3500,R_11,0,R_11,1200,R_11,52,R_11,2012,00,00,00,-68.417378,44.174678,23,009,00000,54551,43,9662,2136,2302,0,1,0,.

-KS,KS0200,002,KS0020,BELLE PLAINE PUBLIC LIBRARY,222 W. 5TH,BELLE PLAINE,67013,0700,SUMNER,6204883431,CE,NO,1987,R_11,0,R_11,1508,R_11,52,R_11,2012,00,00,00,-97.281473,37.393656,20,191,05500,23835,42,9621,4011,2004,0,1,48620,0

-CA,CA0144,006,M740,SUTTER BRANCH LIBRARY,2147 CALIFORNIA ST.,SUTTER,95982,2546,SUTTER,5307550485,BR,NC,970,R_11,0,R_11,996,R_11,52,R_11,2012,00,00,00,-121.748948,39.158979,06,101,77378,94951,23,508,2033,0603,0,1,49700,0

-DC,DC0001,012,NOE,NORTHEAST BRANCH LIBRARY,"330 7TH STREET, NE",WASHINGTON,20002,6104,DIST OF COLUMBIA,2026983320,BR,CC,13900,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-76.996192,38.893985,11,001,50000,619020,11,83.01999664,1014,1198,0,1,47900,0

-KY,KY0109,007,WARREN,GRAHAM DRIVE COMMUNITY BRANCH,305 GRAHAM DRIVE,BOWLING GREEN,42101,1140,WARREN,2707811441,BR,CC,1627,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-86.441796,37.003255,21,227,08902,115491,13,102,3031,2102,0,1,14540,0

-FL,FL0025,015,FL0025-015,KENDALL BRANCH LIBRARY,9101 S.W. 97 AVENUE,MIAMI,33176,1985,MIAMI-DADE,3052790520,BR,CC,14000,R_11,0,R_11,2559,R_11,52,R_11,2012,00,00,00,-80.350799,25.685623,12,086,36100,2565440,13,84.05000305,1009,1227,0,1,33100,0

-PA,PA0532,002,902024075,SENIOR BUS,1301 BEAVER AVENUE,PITTSBURGH,15233,2342,ALLEGHENY,4123211853,BS,CC,-3,U_11,1,R_11,403,R_11,50,R_11,2012,00,00,00,-80.028932,40.453027,42,003,61000,1227442,11,9806,1011,4214,0,1,38300,0

-NJ,NJ0204,002,NJ0204-002,FAIR HAVEN PUBLIC LIBRARY,748 RIVER ROAD,FAIR HAVEN,07704,3397,MONMOUTH,2017475031,CE,NC,1750,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-74.035853,40.364273,34,025,22440,630092,21,8037,5000,3404,0,1,35620,0

-VI,VI0002,002,VI0002-002,ELAINE I. SPRAUVE PUBLIC LIBRARY,EST. ENIGHED,CRUZ BAY,00830,M,ST. JOHN,3407766359,BR,NO,1200,R_11,0,R_11,-1,U_11,-1,U_11,2012,00,00,00,-64.740289,18.338647,78,020,00000,-1,41,9501,3092,7898,X,14,0,.

-NC,NC0104,001,C-CHATHAM-BS,CHATHAM COUNTY BOOKMOBILE,500 N 2ND AVE,SILER CITY,27344,3123,CHATHAM,9195427725,BS,NC,-3,U_11,1,R_11,1000,R_11,50,R_11,2012,00,14,15,-79.46469,35.727411,37,037,61860,65280,32,204.0200043,1049,3702,0,1,20500,0

-TX,TX0402,003,49.362,SOUTHMOST BRANCH LIBRARY,4320 SOUTHMOST BLVD,BROWNSVILLE,78521,M,CAMERON,9565481055,BR,CC,20000,R_11,0,R_11,2602,R_11,52,R_11,2012,00,00,00,-97.449958,25.896555,48,061,10768,412577,12,133.0599976,1008,4834,0,1,15180,0

-OH,OH0163,008,OH0163-008,BUCKEYE LAKE BRANCH,41 W FIRST ST,BUCKEYE LAKE,43008,1708,LICKING,7409280472,BR,NO,1200,R_11,0,R_11,2433,R_11,52,R_11,2012,00,00,00,-82.482686,39.933065,39,089,09890,167194,23,7583,3012,3912,0,1,18140,0

-GA,GA0023,006,6632HILTON,IDA HILTON PUBLIC LIBRARY,1105 NORTHWAY,DARIEN,31305,1227,MCINTOSH,9124272124,BR,NO,6000,R_11,0,R_11,2110,R_11,-1,U_11,2012,00,00,00,-81.417136,31.394249,13,191,00000,14229,31,1102,1079,1301,X,14,15260,0

-LA,LA0057,015,60,ROSEDALE BRANCH,4036 JEFFERSON HIGHWAY,OLD JEFFERSON,70121,1630,JEFFERSON,5048384350,BR,NC,7138,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-90.168164,29.961761,22,051,38145,433523,21,245,1006,2201,0,1,35380,0

-WI,WI0059,002,WI0059,CHIPPEWA FALLS PUBLIC LIBRARY,105 W. CENTRAL ST.,CHIPPEWA FALLS,54729,2397,CHIPPEWA,7157231146,CE,NC,18025,R_11,0,R_11,2682,R_11,52,R_11,2012,00,00,00,-91.393911,44.935456,55,017,14575,62857,23,103,1021,5503,0,1,20740,0

-WI,WI0065,002,WI0065,COLFAX PUBLIC LIBRARY,613 MAIN ST.,COLFAX,54730,0525,DUNN,7159624334,CE,NO,1800,R_11,0,R_11,1820,R_11,52,R_11,2012,00,00,00,-91.727195,44.999765,55,033,16275,44021,42,9703,3057,5503,0,1,32860,1

-DC,DC0001,016,SOE,SOUTHEAST BRANCH LIBRARY,"403 7TH STREET, SE",WASHINGTON,20003,2700,DIST OF COLUMBIA,2026983377,BR,CC,6431,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-76.996189,38.88387,11,001,50000,619020,11,65,2000,1198,0,1,47900,0

-MI,MI0376,002,MI0376-002,GEORGE W. SPINDLER MEMORIAL LIBRARY,186 NORTH MAIN STREET,WOODLAND,48897,0068,BARRY,2693674694,CE,NO,1738,R_11,0,R_11,1040,R_11,52,R_11,2012,00,00,00,-85.13415,42.728953,26,015,88420,58936,42,101,2030,2603,0,1,24340,0

-WV,WV0040,002,39,CLARKSBURG-HARRISON CO. PUBLIC LIBRARY,404 W. PIKE STREET,CLARKSBURG,26301,2712,HARRISON,3046272236,CE,NO,30442,R_11,0,R_11,3484,R_11,52,R_11,2012,00,00,00,-80.340592,39.280797,54,033,15628,69298,32,301,1055,5401,0,1,17220,1

-IL,IL0524,002,3064900,THREE RIVERS PUBLIC LIBRARY DISTRICT,25207 WEST CHANNON DRIVE,CHANNAHON,60410,0300,WILL,8154676200,CE,NC,9200,R_11,0,R_11,3464,R_11,-1,U_11,2012,00,00,00,-88.224651,41.434087,17,197,12476,680584,21,8833.049805,1044,1716,0,1,16980,0

-IA,IA0048,002,BW,BERTHA BARTLETT PUBLIC LIBRARY,503 BROAD ST,STORY CITY,50248,1133,STORY,5157332685,CE,NO,8000,R_11,0,R_11,2738,R_11,52,R_11,2012,00,00,15,-93.597291,42.187095,19,169,75675,90816,31,106,1023,1904,0,1,11180,0

-MT,MT0011,003,1634,BUTTE-SILVER BOW PUBLIC LIBRARY SOUTH BRANCH,3100 HARRISON AVE,BUTTE,59701,M,SILVER BOW,4067233361,BR,NO,1200,R_11,0,R_11,1008,R_11,28,R_11,2012,02,00,00,-112.508,45.979868,30,093,11397,34362,33,7,1022,3000,0,1,15580,1

-NM,NM0110,001,NM0110-001,PUEBLO DE ABIQUIU LIBRARY & CULTURAL CENTER,COUNTY RD. #187 BUILDING #29,ABIQUIU,87510,0838,RIO ARRIBA,5056854884,CE,NO,400,R_11,0,R_11,1329,R_11,52,R_11,2012,00,00,00,-106.231605,36.156557,35,039,00000,40353,42,4,2205,3503,X,14,21580,1

-OR,OR0072,002,OR0072,SCAPPOOSE PUBLIC LIBRARY,52469 SE 2ND ST,SCAPPOOSE,97056,0400,COLUMBIA,5035437123,CE,NC,11020,R_11,0,R_11,2190,R_11,52,R_11,2012,00,00,00,-122.876339,45.756527,41,009,65500,49357,32,9709,2004,4101,0,1,38900,0

-TN,TN0132,005,CH001-04,EASTGATE BRANCH LIBRARY,5705 MARLIN ROAD,CHATTANOOGA,37411,5508,HAMILTON,4237575310,BR,CC,9084,R_11,0,R_11,2340,R_11,52,R_11,2012,00,00,00,-85.211325,35.007946,47,065,14000,340870,12,30,2044,4703,0,1,16860,0

-UT,UT0013,002,CACHCO1501,RICHMOND PUBLIC LIBRARY,38 WEST MAIN STREET,RICHMOND,84333,1409,CACHE,4352585525,CE,NO,2700,R_11,0,R_11,1565,R_11,52,R_11,2012,00,00,00,-111.809843,41.922642,49,005,63680,114559,41,1.00999999,3061,4901,0,1,30860,0

-IN,IN0123,002,5092,LOGANSPORT-CASS COUNTY PUBLIC LIBRARY,616 EAST BROADWAY,LOGANSPORT,46947,3155,CASS,5747536383,CE,NO,40448,R_11,0,R_11,3900,R_11,52,R_11,2012,00,00,00,-86.362725,40.754552,18,017,44658,38810,32,9515,2020,1804,0,1,30900,1

-WV,WV0082,002,81,GREENBRIER COUNTY PUBLIC LIBRARY,152 ROBERT W. MCCORMICK DRIVE,LEWISBURG,24901,1312,GREENBRIER,3046477568,CE,NO,13500,R_11,0,R_11,3068,R_11,52,R_11,2012,00,00,00,-80.448962,37.803713,54,025,46636,35710,33,9506,1041,5403,4,14,0,.

-ME,ME0042,002,43,CARIBOU PUBLIC LIBRARY,30 HIGH STREET,CARIBOU,04736,2796,AROOSTOOK,2074934214,CE,NO,11000,R_11,0,R_11,2548,R_11,52,R_11,2012,00,00,00,-68.010616,46.860615,23,003,10565,71388,33,9513,3015,2302,0,1,0,.

-MA,MA0091,002,ERVING,ERVING PUBLIC LIBRARY,17 MOORE ST.,ERVING,01344,9717,FRANKLIN,4134233348,CE,NO,1865,R_11,0,R_11,1044,R_11,52,R_11,2012,00,00,00,-72.490752,42.584079,25,011,41410,71646,32,404,2075,2502,0,1,24640,1

-CA,CA0068,026,M664,GEORGE SOUTH MERCED BRANCH,401 LESHER DRIVE,MERCED,95340,6572,MERCED,2097253909,BR,CC,1200,R_11,0,R_11,1300,R_11,52,R_11,2012,00,00,00,-120.504007,37.293699,06,047,46898,259966,13,15.02999973,1021,0616,0,1,32900,0

-MI,MI0240,012,MI0240-012,NORTH MUSKEGON BRANCH LIBRARY,1522 RUDDIMAN DRIVE,NORTH MUSKEGON,49445,3038,MUSKEGON,2317446080,BR,NC,3700,R_11,0,R_11,2496,R_11,52,R_11,2012,00,00,00,-86.267321,43.256368,26,121,58640,170021,22,15,2009,2602,0,1,34740,0

-VA,VA0076,002,VA0076-002,SHENANDOAH COUNTY LIBRARY,514 STONEY CREEK BLVD.,EDINBURG,22824,9142,SHENANDOAH,5409848200,CE,NO,13000,R_11,0,R_11,-1,U_11,-1,U_11,2012,00,00,00,-78.569018,38.825883,51,171,25008,42270,42,406,2037,5106,0,1,0,.

-NJ,NJ0188,002,NJ0188-002,SAYREVILLE FREE PUBLIC LIBRARY,1050 WASHINGTON ROAD,PARLIN,08859,1061,MIDDLESEX,2017270212,CE,NC,19500,R_11,0,R_11,3172,R_11,52,R_11,2012,00,00,00,-74.315701,40.464356,34,023,65790,816618,21,73.01000214,2000,3406,0,1,35620,0

-NY,NY0251,002,3200482455,ALICE CURTIS DESMOND AND HAMILTON FISH LIBRARY,472 ROUTE 403,GARRISON,10524,9803,PUTNAM,8454243020,CE,NC,9467,R_11,0,R_11,2258,R_11,52,R_11,2012,00,00,00,-73.94238,41.375377,36,079,00000,99916,41,108,2025,3618,0,1,35620,0

-GA,GA0004,015,GA0004,GORDON PUBLIC LIBRARY,284 MILLEDGEVILLE ROAD,GORDON,31031,0336,WILKINSON,4786285352,BR,NO,3385,R_11,0,R_11,2524,R_11,52,R_11,2012,00,00,00,-83.32934,32.882452,13,319,33980,9453,42,9603,3018,1308,0,1,0,.

-NJ,NJ0284,006,1900D,SUSSEX-WANTAGE BRANCH,69 ROUTE 639,SUSSEX,07461,2301,SUSSEX,9738753940,BR,NC,1400,R_11,0,R_11,-1,U_11,52,R_11,2012,00,00,00,-74.608678,41.210574,34,037,71670,148517,32,3712,1004,3405,4,16,35620,0

-NE,NE0038,002,040C,BRUNSWICK PUBLIC LIBRARY,303 FRANKLIN ST.,BRUNSWICK,68720,0011,ANTELOPE,4028422105,CE,NO,810,R_11,0,R_11,832,R_11,52,R_11,2012,00,00,00,-97.97071,42.292652,31,003,00000,6631,43,9796,2236,3103,0,1,0,.

-KS,KS0155,002,KS0111,GOODLAND PUBLIC LIBRARY,812 BROADWAY,GOODLAND,67735,3037,SHERMAN,7858995461,CE,NO,13500,R_11,0,R_11,2810,R_11,52,R_11,2012,00,00,00,-101.710437,39.351002,20,181,26875,6039,33,4537,4002,2001,0,1,0,.

-MI,MI0361,019,MI0361-002,WAYNE COUNTY PUBLIC LIBRARY,30555 MICHIGAN AVENUE,WESTLAND,48186,5310,WAYNE,7347277310,CE,NC,7000,R_11,0,R_11,2172,R_11,52,R_11,2012,00,00,00,-83.340555,42.286454,26,163,86000,1801789,21,5687,4017,2613,0,1,19820,0

-TX,TX0583,002,577,DILLEY PUBLIC LIBRARY,231 W FM 117,DILLEY,78017,3503,FRIO,8309651951,CE,NO,2340,R_11,0,R_11,2000,R_11,50,R_11,2012,00,00,00,-99.170073,28.67396,48,163,20428,17367,33,9503,1253,4823,0,1,0,.

-GA,GA0022,026,GA0022,PEACHTREE BRANCH LIBRARY,"1315 PEACHTREE STREET, N.E.",ATLANTA,30309,7515,FULTON,4048857830,BR,CC,10000,R_11,0,R_11,1806,R_11,52,R_11,2012,00,00,00,-84.38501,33.790479,13,121,04000,949580,11,4,1013,1305,0,1,12060,0

-IN,IN0180,003,7531,ELLETTSVILLE BRANCH,600 WEST TEMPERANCE,ELLETTSVILLE,47429,1324,MONROE,8123492515,BR,NC,12000,R_11,0,R_11,3328,R_11,52,R_11,2012,00,00,00,-86.625077,39.234002,18,105,20800,140063,23,13.02999973,1050,1809,0,1,14020,0

-WI,WI0023,002,WI0023,BELLEVILLE PUBLIC LIBRARY,130 S. VINE ST.,BELLEVILLE,53508,0140,DANE,6084241812,CE,NC,4600,R_11,0,R_11,2652,R_11,52,R_11,2012,00,00,00,-89.53399,42.858436,55,025,06300,496456,42,126,3121,5502,0,1,31540,0

-CA,CA0146,004,M742,EL RETIRO BRANCH LIBRARY,126 VISTA DEL PARQUE,REDONDO BEACH,90277,6112,LOS ANGELES,3103750922,BR,NC,4158,R_11,0,R_11,2222,R_11,52,R_11,2012,00,00,00,-118.379754,33.815298,06,037,80000,9889025,12,6513.040039,2005,0633,0,1,31080,0

-TX,TX0099,040,189.360,MCGOVERN-STELLA LINK BRANCH LIBRARY,7405 STELLA LINK RD,HOUSTON,77025,M,HARRIS,8323932630,BR,CC,20393,R_11,0,R_11,2098,R_11,50,R_11,2012,00,00,00,-95.440039,29.698416,48,201,35000,4173695,11,4130,2005,4807,0,1,26420,0


diff --git a/examples/pig/test/populate-cli.txt b/examples/pig/test/populate-cli.txt
deleted file mode 100644
index b2dda58..0000000
--- a/examples/pig/test/populate-cli.txt
+++ /dev/null

@@ -1,134 +0,0 @@
-create keyspace PigTest with placement_strategy = 'org.apache.cassandra.locator.SimpleStrategy' and strategy_options={replication_factor:1};
-use PigTest;
-create column family SomeApp with
-key_validation_class = UTF8Type and
-default_validation_class = LexicalUUIDType and
-comparator = UTF8Type and
-column_metadata =
-[
-    {column_name: name, validation_class: UTF8Type, index_type: KEYS},
-    {column_name: vote_type, validation_class: UTF8Type},
-    {column_name: rating, validation_class: Int32Type},
-    {column_name: score, validation_class: LongType},
-    {column_name: percent, validation_class: FloatType},
-    {column_name: atomic_weight, validation_class: DoubleType},
-    {column_name: created, validation_class: DateType},
-];
-
-create column family CopyOfSomeApp with
-key_validation_class = UTF8Type and
-default_validation_class = LexicalUUIDType and
-comparator = UTF8Type and
-column_metadata =
-[
-    {column_name: name, validation_class: UTF8Type, index_type: KEYS},
-    {column_name: vote_type, validation_class: UTF8Type},
-    {column_name: rating, validation_class: Int32Type},
-    {column_name: score, validation_class: LongType},
-    {column_name: percent, validation_class: FloatType},
-    {column_name: atomic_weight, validation_class: DoubleType},
-    {column_name: created, validation_class: DateType},
-];
-
-set SomeApp['foo']['name'] = 'User Foo';
-set SomeApp['foo']['vote_type'] = 'like';
-set SomeApp['foo']['rating'] = 8;
-set SomeApp['foo']['score'] = 125000;
-set SomeApp['foo']['percent'] = '85.0';
-set SomeApp['foo']['atomic_weight'] = '2.7182818284590451';
-set SomeApp['foo']['created'] = 1335890877;
-
-set SomeApp['bar']['name'] = 'User Bar';
-set SomeApp['bar']['vote_type'] = 'like';
-set SomeApp['bar']['rating'] = 9;
-set SomeApp['bar']['score'] = 15000;
-set SomeApp['bar']['percent'] = '35.0';
-set SomeApp['bar']['atomic_weight'] = '3.1415926535897931';
-set SomeApp['bar']['created'] = 1335890877;
-
-set SomeApp['baz']['name'] = 'User Baz';
-set SomeApp['baz']['vote_type'] = 'dislike';
-set SomeApp['baz']['rating'] = 3;
-set SomeApp['baz']['score'] = 512000;
-set SomeApp['baz']['percent'] = '95.3';
-set SomeApp['baz']['atomic_weight'] = '1.61803399';
-set SomeApp['baz']['created'] = 1335890877;
-set SomeApp['baz']['extra1'] = lexicaluuid();
-set SomeApp['baz']['extra2'] = lexicaluuid();
-set SomeApp['baz']['extra3'] = lexicaluuid();
-
-set SomeApp['qux']['name'] = 'User Qux';
-set SomeApp['qux']['vote_type'] = 'dislike';
-set SomeApp['qux']['rating'] = 2;
-set SomeApp['qux']['score'] = 12000;
-set SomeApp['qux']['percent'] = '64.7';
-set SomeApp['qux']['atomic_weight'] = '0.660161815846869';
-set SomeApp['qux']['created'] = 1335890877;
-set SomeApp['qux']['extra1'] = lexicaluuid();
-set SomeApp['qux']['extra2'] = lexicaluuid();
-set SomeApp['qux']['extra3'] = lexicaluuid();
-set SomeApp['qux']['extra4'] = lexicaluuid();
-set SomeApp['qux']['extra5'] = lexicaluuid();
-set SomeApp['qux']['extra6'] = lexicaluuid();
-set SomeApp['qux']['extra7'] = lexicaluuid();
-
-create column family U8 with
-    key_validation_class = UTF8Type and
-    comparator = UTF8Type;
-
-create column family Bytes with
-    key_validation_class = BytesType and
-    comparator = UTF8Type;
-
-set U8['foo']['x'] = ascii('Z');
-set Bytes[ascii('foo')]['x'] = ascii('Z');
-
-create column family CC with
-    key_validation_class = UTF8Type and
-    default_validation_class=CounterColumnType
-    and comparator=UTF8Type;
-
-incr CC['chuck']['kick'];
-incr CC['chuck']['kick'];
-incr CC['chuck']['kick'];
-incr CC['chuck']['fist'];
-
-create column family Compo
-    with key_validation_class = UTF8Type
-    and default_validation_class = UTF8Type
-    and comparator = 'CompositeType(UTF8Type,UTF8Type)';
-
-set Compo['punch']['bruce:lee'] = 'ouch';
-set Compo['punch']['bruce:bruce'] = 'hunh?';
-set Compo['kick']['bruce:lee'] = 'oww';
-set Compo['kick']['bruce:bruce'] = 'watch it, mate';
-
-create column family CompoInt
-    with key_validation_class = UTF8Type
-    and default_validation_class = UTF8Type
-    and comparator = 'CompositeType(LongType,LongType)';
-
-set CompoInt['clock']['1:0'] = 'z';
-set CompoInt['clock']['1:30'] = 'zzzz';
-set CompoInt['clock']['2:30'] = 'daddy?';
-set CompoInt['clock']['6:30'] = 'coffee...';
-
-create column family CompoIntCopy
-    with key_validation_class = UTF8Type
-    and default_validation_class = UTF8Type
-    and comparator = 'CompositeType(LongType,LongType)';
-
-create column family CompoKey
-    with key_validation_class = 'CompositeType(UTF8Type,LongType)'
-    and default_validation_class = UTF8Type
-    and comparator = LongType;
-
-set CompoKey['clock:10']['1'] = 'z';
-set CompoKey['clock:20']['1'] = 'zzzz';
-set CompoKey['clock:30']['2'] = 'daddy?';
-set CompoKey['clock:40']['6'] = 'coffee...';
-
-create column family CompoKeyCopy
-    with key_validation_class = 'CompositeType(UTF8Type,LongType)'
-    and default_validation_class = UTF8Type
-    and comparator = LongType;

diff --git a/examples/pig/test/populate-cql.txt b/examples/pig/test/populate-cql.txt
deleted file mode 100644
index 00b81db..0000000
--- a/examples/pig/test/populate-cql.txt
+++ /dev/null

@@ -1,33 +0,0 @@
-CREATE KEYSPACE cql3ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };
-USE cql3ks;
-
-CREATE TABLE test (a int PRIMARY KEY, b int);
-CREATE TABLE moredata (x int PRIMARY KEY, y int);
-INSERT INTO test (a,b) VALUES (1,1);
-INSERT INTO test (a,b) VALUES (2,2);
-INSERT INTO test (a,b) VALUES (3,3);
-INSERT INTO moredata (x, y) VALUES (4,4);
-INSERT INTO moredata (x, y) VALUES (5,5);
-INSERT INTO moredata (x, y) VALUES (6,6);
-
-CREATE TABLE compotable (a int, b int, c text, d text, PRIMARY KEY (a,b,c));
-INSERT INTO compotable (a, b , c , d ) VALUES ( 1,1,'One','match');
-INSERT INTO compotable (a, b , c , d ) VALUES ( 2,2,'Two','match');
-INSERT INTO compotable (a, b , c , d ) VALUES ( 3,3,'Three','match');
-INSERT INTO compotable (a, b , c , d ) VALUES ( 4,4,'Four','match');
-
-create table compmore (id int PRIMARY KEY, x int, y int, z text, data text);
-INSERT INTO compmore (id, x, y, z,data) VALUES (1,5,6,'Fix','nomatch');
-INSERT INTO compmore (id, x, y, z,data) VALUES (2,6,5,'Sive','nomatch');
-INSERT INTO compmore (id, x, y, z,data) VALUES (3,7,7,'Seven','match');
-INSERT INTO compmore (id, x, y, z,data) VALUES (4,8,8,'Eight','match');
-INSERT INTO compmore (id, x, y, z,data) VALUES (5,9,10,'Ninen','nomatch');
-
-
-CREATE TABLE collectiontable(m text PRIMARY KEY, n map<text, text>);
-UPDATE collectiontable SET n['key1'] = 'value1' WHERE m = 'book1';
-UPDATE collectiontable SET n['key2'] = 'value2' WHERE m = 'book2';
-UPDATE collectiontable SET n['key3'] = 'value3' WHERE m = 'book3';
-UPDATE collectiontable SET n['key4'] = 'value4' WHERE m = 'book4';
-
-

diff --git a/examples/pig/test/test_cql_storage.pig b/examples/pig/test/test_cql_storage.pig
deleted file mode 100644
index 822748e..0000000
--- a/examples/pig/test/test_cql_storage.pig
+++ /dev/null

@@ -1,14 +0,0 @@
-moretestvalues= LOAD 'cql://cql3ks/moredata/' USING CqlNativeStorage;
-insertformat= FOREACH moretestvalues GENERATE TOTUPLE(TOTUPLE('a',x)),TOTUPLE(y);
-STORE insertformat INTO 'cql://cql3ks/test?output_query=UPDATE+cql3ks.test+set+b+%3D+%3F' USING CqlNativeStorage;
-
--- composite key
-moredata = load 'cql://cql3ks/compmore' USING CqlNativeStorage;
-insertformat = FOREACH moredata GENERATE TOTUPLE (TOTUPLE('a',x),TOTUPLE('b',y), TOTUPLE('c',z)),TOTUPLE(data);
-STORE insertformat INTO 'cql://cql3ks/compotable?output_query=UPDATE%20cql3ks.compotable%20SET%20d%20%3D%20%3F' USING CqlNativeStorage;
-
--- collection column
-collectiontable = LOAD 'cql://cql3ks/collectiontable/' USING CqlNativeStorage;
--- recs= (((m,kk)),((map,(m,mm),(n,nn))))
-recs= FOREACH collectiontable GENERATE TOTUPLE(TOTUPLE('m', m) ), TOTUPLE(TOTUPLE('map', TOTUPLE('m', 'mm'), TOTUPLE('n', 'nn')));
-store recs INTO 'cql://cql3ks/collectiontable?output_query=update+cql3ks.collectiontable+set+n+%3D+%3F' USING CqlNativeStorage();

diff --git a/examples/pig/test/test_storage.pig b/examples/pig/test/test_storage.pig
deleted file mode 100644
index 026cb02..0000000
--- a/examples/pig/test/test_storage.pig
+++ /dev/null

@@ -1,85 +0,0 @@
-rows = LOAD 'cassandra://PigTest/SomeApp' USING CassandraStorage();
--- full copy
-STORE rows INTO 'cassandra://PigTest/CopyOfSomeApp' USING CassandraStorage();
--- single tuple
-onecol = FOREACH rows GENERATE key, percent;
-STORE onecol INTO 'cassandra://PigTest/CopyOfSomeApp' USING CassandraStorage();
--- bag only
-other = FOREACH rows GENERATE key, columns;
-STORE other INTO 'cassandra://PigTest/CopyOfSomeApp' USING CassandraStorage();
-
-
--- filter
-likes = FILTER rows by vote_type.value eq 'like' and rating.value > 5;
-dislikes_extras = FILTER rows by vote_type.value eq 'dislike' AND COUNT(columns) > 0;
-
--- store these too
-STORE likes INTO 'cassandra://PigTest/CopyOfSomeApp' USING CassandraStorage();
-STORE dislikes_extras INTO 'cassandra://PigTest/CopyOfSomeApp' USING CassandraStorage();
-
--- filter to fully visible rows (no uuid columns) and dump
-visible = FILTER rows BY COUNT(columns) == 0;
-dump visible;
-
-
-
--- test key types with a join
-U8 = load 'cassandra://PigTest/U8' using CassandraStorage();
-Bytes = load 'cassandra://PigTest/Bytes' using CassandraStorage();
-
--- cast key to chararray
-b = foreach Bytes generate (chararray)key, columns;
-
--- key in Bytes is a bytearray, U8 chararray
-a = join Bytes by key, U8 by key;
-dump a
-
--- key should now be cast into a chararray
-c = join b by (chararray)key, U8 by (chararray)key;
-dump c
-
-
---
---  Test counter column family support
---
-CC = load 'cassandra://PigTest/CC' using CassandraStorage();
-
-total_hits = foreach CC generate key, SUM(columns.value);
-
-dump total_hits;
-
---
---  Test CompositeType
---
-
-compo = load 'cassandra://PigTest/Compo' using CassandraStorage();
-
-compo = foreach compo generate key as method, flatten(columns);
-
-lee = filter compo by columns::name == ('bruce','lee');
-
-dump lee;
-
-night = load 'cassandra://PigTest/CompoInt' using CassandraStorage();
-night = foreach night generate flatten(columns);
-night = foreach night generate (int)columns::name.$0+(double)columns::name.$1/60 as hour, columns::value as noise;
-
--- What happens at the darkest hour?
-darkest = filter night by hour > 2 and hour < 5;
-
-dump darkest;
-
-compo_int_rows = LOAD 'cassandra://PigTest/CompoInt' USING CassandraStorage();
-STORE compo_int_rows INTO 'cassandra://PigTest/CompoIntCopy' USING CassandraStorage();
-
---
---  Test CompositeKey
---
-
-compokeys = load 'cassandra://PigTest/CompoKey' using CassandraStorage();
-compokeys = filter compokeys by key.$1 == 40;
-
-dump compokeys;
-
-compo_key_rows = LOAD 'cassandra://PigTest/CompoKey' USING CassandraStorage();
-STORE compo_key_rows INTO 'cassandra://PigTest/CompoKeyCopy' USING CassandraStorage();

diff --git a/examples/triggers/README.txt b/examples/triggers/README.txt
index a99fa98..e5f1ecf 100644
--- a/examples/triggers/README.txt
+++ b/examples/triggers/README.txt

@@ -1,11 +1,8 @@
-Cassandra Trigger's Example:
-=========================
+Cassandra Trigger Example:
+==========================
 
-InvertedIndex class will create a inverted index of 
-RowKey:ColumnName:Value to Value:ColumnName:RowKey
-
-NOTE: This example is limited to append-only workloads, 
-	  doesn't delete indexes on deletes. 
+The AuditTrigger class will create a basic audit of
+activity on a table.
 
 Installation:
 ============
@@ -13,9 +10,27 @@
 run "ant jar"
 Copy build/trigger-example.jar to <cassandra_conf>/triggers/
 Copy conf/* to <cassandra_home>/conf/
-Create column family configured in InvertedIndex.properties 
-    Example: Keyspace1.InvertedIndex as configured in InvertedIndex.properties
-Configure trigger on the table.
-    Example: CREATE TRIGGER test1 ON "Keyspace1"."Standard1"
-                 USING 'org.apache.cassandra.triggers.InvertedIndex';
-Start inserting data to the column family which has the triggers. 
+
+Create the keyspace and table configured in AuditTrigger.properties:
+    CREATE KEYSPACE test WITH REPLICATION =
+        { 'class' : 'SimpleStrategy', 'replication_factor' : '1' };
+    CREATE TABLE test.audit (key timeuuid, keyspace_name text,
+        table_name text, primary_key text, PRIMARY KEY(key));
+
+Create a table to add the trigger to:
+    CREATE TABLE test.test (key text, value text, PRIMARY KEY(key));
+    Note: The example currently only handles non-composite partition keys
+
+Configure the trigger on the table:
+    CREATE TRIGGER test1 ON test.test
+        USING 'org.apache.cassandra.triggers.AuditTrigger';
+
+Start inserting data to the table that has the trigger. For each
+partition added to the table an entry should appear in the 'audit' table:
+    INSERT INTO test.test (key, value) values ('1', '1');
+    SELECT * FROM test.audit;
+
+    key                                  | keyspace_name | primary_key | table_name
+   --------------------------------------+---------------+-------------+------------
+    7dc75b60-770f-11e5-9019-033d8af33e6f |          test |           1 |       test
+

diff --git a/examples/triggers/conf/AuditTrigger.properties b/examples/triggers/conf/AuditTrigger.properties
new file mode 100644
index 0000000..7f122de
--- /dev/null
+++ b/examples/triggers/conf/AuditTrigger.properties

@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+keyspace=test
+table=audit

diff --git a/examples/triggers/conf/InvertedIndex.properties b/examples/triggers/conf/InvertedIndex.properties
deleted file mode 100644
index ea49a86..0000000
--- a/examples/triggers/conf/InvertedIndex.properties
+++ /dev/null

@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-keyspace=Keyspace1
-table=InvertedIndex

diff --git a/examples/triggers/src/org/apache/cassandra/triggers/AuditTrigger.java b/examples/triggers/src/org/apache/cassandra/triggers/AuditTrigger.java
new file mode 100644
index 0000000..7739450
--- /dev/null
+++ b/examples/triggers/src/org/apache/cassandra/triggers/AuditTrigger.java

@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.triggers;
+
+import java.io.InputStream;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Properties;
+
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+public class AuditTrigger implements ITrigger
+{
+    private Properties properties = loadProperties();
+
+    public Collection<Mutation> augment(Partition update)
+    {
+        String auditKeyspace = properties.getProperty("keyspace");
+        String auditTable = properties.getProperty("table");
+
+        RowUpdateBuilder audit = new RowUpdateBuilder(Schema.instance.getCFMetaData(auditKeyspace, auditTable),
+                                                      FBUtilities.timestampMicros(),
+                                                      UUIDGen.getTimeUUID());
+
+        audit.add("keyspace_name", update.metadata().ksName);
+        audit.add("table_name", update.metadata().cfName);
+        audit.add("primary_key", update.metadata().getKeyValidator().getString(update.partitionKey().getKey()));
+
+        return Collections.singletonList(audit.build());
+    }
+
+    private static Properties loadProperties()
+    {
+        Properties properties = new Properties();
+        InputStream stream = AuditTrigger.class.getClassLoader().getResourceAsStream("AuditTrigger.properties");
+        try
+        {
+            properties.load(stream);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+        finally
+        {
+            FileUtils.closeQuietly(stream);
+        }
+        return properties;
+    }
+}

diff --git a/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java b/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java
deleted file mode 100644
index 2053387..0000000
--- a/examples/triggers/src/org/apache/cassandra/triggers/InvertedIndex.java
+++ /dev/null

@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.triggers;
-
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Properties;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.io.util.FileUtils;
-
-public class InvertedIndex implements ITrigger
-{
-    private static final Logger logger = LoggerFactory.getLogger(InvertedIndex.class);
-    private Properties properties = loadProperties();
-
-    public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
-    {
-        List<Mutation> mutations = new ArrayList<>(update.getColumnCount());
-
-        String indexKeySpace = properties.getProperty("keyspace");
-        String indexColumnFamily = properties.getProperty("table");
-        for (Cell cell : update)
-        {
-            // Skip the row marker and other empty values, since they lead to an empty key.
-            if (cell.value().remaining() > 0)
-            {
-                Mutation mutation = new Mutation(indexKeySpace, cell.value());
-                mutation.add(indexColumnFamily, cell.name(), key, System.currentTimeMillis());
-                mutations.add(mutation);
-            }
-        }
-
-        return mutations;
-    }
-
-    private static Properties loadProperties()
-    {
-        Properties properties = new Properties();
-        InputStream stream = InvertedIndex.class.getClassLoader().getResourceAsStream("InvertedIndex.properties");
-        try
-        {
-            properties.load(stream);
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
-        }
-        finally
-        {
-            FileUtils.closeQuietly(stream);
-        }
-        logger.info("loaded property file, InvertedIndex.properties");
-        return properties;
-    }
-}

diff --git a/ide/idea-iml-file.xml b/ide/idea-iml-file.xml
index 338d780..bb9155d 100644
--- a/ide/idea-iml-file.xml
+++ b/ide/idea-iml-file.xml

@@ -18,7 +18,7 @@
   -->
 
 <module type="JAVA_MODULE" version="4">
-    <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_7" inherit-compiler-output="false">
+    <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8" inherit-compiler-output="false">
         <output url="file://$MODULE_DIR$/.idea/out/main" />
         <output-test url="file://$MODULE_DIR$/.idea/out/test" />
         <exclude-output />
@@ -32,9 +32,7 @@
             <sourceFolder url="file://$MODULE_DIR$/test/long" isTestSource="true" />
             <sourceFolder url="file://$MODULE_DIR$/test/microbench" isTestSource="true" />
             <sourceFolder url="file://$MODULE_DIR$/test/burn" isTestSource="true" />
-            <sourceFolder url="file://$MODULE_DIR$/test/pig" isTestSource="true" />
-<!--              We don't include the distributed source since it has a language level of 8, and there is no clean way to set a different language level for tests in Idea without creating a separate module -->
-<!--            <sourceFolder url="file://$MODULE_DIR$/test/distributed" isTestSource="true" /> -->
+            <sourceFolder url="file://$MODULE_DIR$/test/distributed" isTestSource="true" />
             <sourceFolder url="file://$MODULE_DIR$/test/resources" type="java-test-resource" />
             <excludeFolder url="file://$MODULE_DIR$/.idea" />
             <excludeFolder url="file://$MODULE_DIR$/.settings" />

diff --git a/ide/idea/inspectionProfiles/Project_Default.xml b/ide/idea/inspectionProfiles/Project_Default.xml
index a609ae3..2edbb70 100644
--- a/ide/idea/inspectionProfiles/Project_Default.xml
+++ b/ide/idea/inspectionProfiles/Project_Default.xml

@@ -56,6 +56,7 @@
     <inspection_tool class="JavaLangImport" enabled="true" level="WARNING" enabled_by_default="true" />
     <inspection_tool class="KeySetIterationMayUseEntrySet" enabled="true" level="WARNING" enabled_by_default="true" />
     <inspection_tool class="LengthOneStringInIndexOf" enabled="true" level="WARNING" enabled_by_default="true" />
+    <inspection_tool class="LengthOneStringsInConcatenation" enabled="true" level="WARNING" enabled_by_default="true" />
     <inspection_tool class="LoggerInitializedWithForeignClass" enabled="false" level="WARNING" enabled_by_default="false">
       <option name="loggerClassName" value="org.apache.log4j.Logger,org.slf4j.LoggerFactory,org.apache.commons.logging.LogFactory,java.util.logging.Logger" />
       <option name="loggerFactoryMethodName" value="getLogger,getLogger,getLog,getLogger" />

diff --git a/ide/idea/workspace.xml b/ide/idea/workspace.xml
index 4826f6c..9cf2d96 100644
--- a/ide/idea/workspace.xml
+++ b/ide/idea/workspace.xml

@@ -168,7 +168,7 @@
       <option name="MAIN_CLASS_NAME" value="" />
       <option name="METHOD_NAME" value="" />
       <option name="TEST_OBJECT" value="class" />
-      <option name="VM_PARAMETERS" value="-Dcassandra.config=file://$PROJECT_DIR$/test/conf/cassandra.yaml -Dlogback.configurationFile=file://$PROJECT_DIR$/test/conf/logback-test.xml -Dcassandra.logdir=$PROJECT_DIR$/build/test/logs -ea -XX:MaxMetaspaceSize=256M -XX:SoftRefLRUPolicyMSPerMB=0" />
+      <option name="VM_PARAMETERS" value="-Dcassandra.config=file://$PROJECT_DIR$/test/conf/cassandra.yaml -Dlogback.configurationFile=file://$PROJECT_DIR$/test/conf/logback-test.xml -Dcassandra.logdir=$PROJECT_DIR$/build/test/logs -ea -XX:MaxMetaspaceSize=256M -XX:SoftRefLRUPolicyMSPerMB=0 -Dcassandra.strict.runtime.checks=true" />
       <option name="PARAMETERS" value="" />
       <option name="WORKING_DIRECTORY" value="" />
       <option name="ENV_VARIABLES" />

diff --git a/lib/asm-5.0.4.jar b/lib/asm-5.0.4.jar
new file mode 100644
index 0000000..cdb283d
--- /dev/null
+++ b/lib/asm-5.0.4.jar
Binary files differ

diff --git a/lib/cassandra-driver-core-2.2.0-rc2-SNAPSHOT-20150617-shaded.jar b/lib/cassandra-driver-core-2.2.0-rc2-SNAPSHOT-20150617-shaded.jar
deleted file mode 100644
index 7d971df..0000000
--- a/lib/cassandra-driver-core-2.2.0-rc2-SNAPSHOT-20150617-shaded.jar
+++ /dev/null
Binary files differ

diff --git a/lib/cassandra-driver-core-3.0.1-shaded.jar b/lib/cassandra-driver-core-3.0.1-shaded.jar
new file mode 100644
index 0000000..bc269a0
--- /dev/null
+++ b/lib/cassandra-driver-core-3.0.1-shaded.jar
Binary files differ

diff --git a/lib/cassandra-driver-internal-only-3.11.0-bb96859b.zip b/lib/cassandra-driver-internal-only-3.11.0-bb96859b.zip
new file mode 100644
index 0000000..d31abc3
--- /dev/null
+++ b/lib/cassandra-driver-internal-only-3.11.0-bb96859b.zip
Binary files differ

diff --git a/lib/cassandra-driver-internal-only-3.5.0.post0-d8d0456.zip b/lib/cassandra-driver-internal-only-3.5.0.post0-d8d0456.zip
deleted file mode 100644
index 7d23b48..0000000
--- a/lib/cassandra-driver-internal-only-3.5.0.post0-d8d0456.zip
+++ /dev/null
Binary files differ

diff --git a/lib/crc32ex-0.1.1.jar b/lib/crc32ex-0.1.1.jar
deleted file mode 100644
index 4ba70ff..0000000
--- a/lib/crc32ex-0.1.1.jar
+++ /dev/null
Binary files differ

diff --git a/lib/guava-16.0.jar b/lib/guava-16.0.jar
deleted file mode 100644
index 7afcb10..0000000
--- a/lib/guava-16.0.jar
+++ /dev/null
Binary files differ

diff --git a/lib/guava-18.0.jar b/lib/guava-18.0.jar
new file mode 100644
index 0000000..8f89e49
--- /dev/null
+++ b/lib/guava-18.0.jar
Binary files differ

diff --git a/lib/jna-4.0.0.jar b/lib/jna-4.0.0.jar
deleted file mode 100644
index 9038048..0000000
--- a/lib/jna-4.0.0.jar
+++ /dev/null
Binary files differ

diff --git a/lib/jna-4.2.2.jar b/lib/jna-4.2.2.jar
new file mode 100644
index 0000000..a943670
--- /dev/null
+++ b/lib/jna-4.2.2.jar
Binary files differ

diff --git a/lib/jsr223/clojure/README.txt b/lib/jsr223/clojure/README.txt
deleted file mode 100644
index 7ed7551..0000000
--- a/lib/jsr223/clojure/README.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-Apache Cassandra User-Defined-Functions JSR 223 scripting
-=========================================================
-
-Unfortunately the JSR-223 support provided by the project https://github.com/ato/clojure-jsr223
-and the related ones do not provide compileable script support.
-
-The JSR-223 javax.script.Compilable implementation takes source file names or readers but not script sources
-as all other JSR-223 implementations do.

diff --git a/lib/jsr223/groovy/README.txt b/lib/jsr223/groovy/README.txt
deleted file mode 100644
index 09fef93..0000000
--- a/lib/jsr223/groovy/README.txt
+++ /dev/null

@@ -1,35 +0,0 @@
-Apache Cassandra User-Defined-Functions JSR 223 scripting
-=========================================================
-
-Using JSR-223 capable Groovy
-
-Tested with version 2.3.6
-
-Installation
-------------
-
-1. Download Groovy binary release
-2. Unpack the downloaded archive into a temporary directory
-3. Copy the jar groovy-all-2.3.6-indy.jar from the Groovy embeddable directory to $CASSANDRA_HOME/lib/jsr223/groovy
-   "indy" means "invokedynamic" and is a JVM instruction for scripting languages new to Java 7.
-4. Restart your Cassandra daemon if it's already running
-
-Cassandra log should contain a line like this:
-  INFO  10:49:45 Found scripting engine Groovy Scripting Engine 2.0 - Groovy 2.3.6 - language names: [groovy, Groovy]
-Such a line appears when you already have scripted UDFs in your system or add a scripted UDF for the first time (see below).
-
-Smoke Test
-----------
-
-To test Groovy functionality, open cqlsh and execute the following command:
-  CREATE OR REPLACE FUNCTION foobar ( input text ) RETURNS text LANGUAGE groovy AS 'return "foo";' ;
-
-If you get the error
-  code=2200 [Invalid query] message="Invalid language groovy for 'foobar'"
-Groovy for Apache Cassandra has not been installed correctly.
-
-Notes / Java7 invokedynamic
----------------------------
-
-Groovy provides jars that support invokedynamic bytecode instruction. These jars are whose ending with
-"-indy.jar".

diff --git a/lib/jsr223/jaskell/README.txt b/lib/jsr223/jaskell/README.txt
deleted file mode 100644
index 53e942e..0000000
--- a/lib/jsr223/jaskell/README.txt
+++ /dev/null

@@ -1,5 +0,0 @@
-Apache Cassandra User-Defined-Functions JSR 223 scripting
-=========================================================
-
-Unfortunately Jaskell JSR-223 support is quite old and the Jaskell engine seems to be quite
-unsupported. If you find a solution, please open a ticket at Apache Cassandra JIRA.

diff --git a/lib/jsr223/jruby/README.txt b/lib/jsr223/jruby/README.txt
deleted file mode 100644
index cbc12dc..0000000
--- a/lib/jsr223/jruby/README.txt
+++ /dev/null

@@ -1,54 +0,0 @@
-Apache Cassandra User-Defined-Functions JSR 223 scripting
-=========================================================
-
-Using JSR-223 capable JRuby
-
-Tested with version 1.7.15
-
-Installation
-------------
-
-1. Download JRuby binary release
-2. Unpack the downloaded archive into a temporary directory
-3. Copy everything from the JRuby lib directory to $CASSANDRA_HOME/lib/jsr223/jruby
-4. Restart your Cassandra daemon if it's already running
-
-Cassandra log should contain a line like this:
-  INFO  10:29:03 Found scripting engine JSR 223 JRuby Engine 1.7.15 - ruby jruby 1.7.15 - language names: [ruby, jruby]
-Such a line appears when you already have scripted UDFs in your system or add a scripted UDF for the first time (see below).
-
-
-Smoke Test
-----------
-
-To test JRuby functionality, open cqlsh and execute the following command:
-  CREATE OR REPLACE FUNCTION foobar ( input text ) RETURNS text LANGUAGE ruby AS 'return "foo";' ;
-
-If you get the error
-  code=2200 [Invalid query] message="Invalid language ruby for 'foobar'"
-JRuby for Apache Cassandra has not been installed correctly.
-
-
-Ruby require/include
---------------------
-
-You can use Ruby require and include in your scripts as in the following example:
-
-
-CREATE OR REPLACE FUNCTION foobar ( input text ) RETURNS text LANGUAGE ruby AS '
-require "bigdecimal"
-require "bigdecimal/math"
-
-include BigMath
-
-a = BigDecimal((PI(100)/2).to_s)
-
-return "foo " + a.to_s;
-' ;
-
-
-Notes / Java7 invokedynamic
----------------------------
-
-See JRuby wiki pages https://github.com/jruby/jruby/wiki/ConfiguringJRuby and
-https://github.com/jruby/jruby/wiki/PerformanceTuning for more information and optimization tips.

diff --git a/lib/jsr223/jython/README.txt b/lib/jsr223/jython/README.txt
deleted file mode 100644
index bef3c83..0000000
--- a/lib/jsr223/jython/README.txt
+++ /dev/null

@@ -1,33 +0,0 @@
-Apache Cassandra User-Defined-Functions JSR 223 scripting
-=========================================================
-
-Using JSR-223 capable Jython
-
-Tested with version 2.3.5
-
-Installation
-------------
-
-1. Download Jython binary release
-2. Unpack the downloaded archive into a temporary directory
-3. Copy the jar jython.jar from the Jython directory to $CASSANDRA_HOME/lib/jsr223/jython
-4. Restart your Cassandra daemon if it's already running
-
-Cassandra log should contain a line like this:
-  INFO  10:58:18 Found scripting engine jython 2.5.3 - python 2.5 - language names: [python, jython]
-Such a line appears when you already have scripted UDFs in your system or add a scripted UDF for the first time (see below).
-
-Smoke Test
-----------
-
-To test Jython functionality, open cqlsh and execute the following command:
-  CREATE OR REPLACE FUNCTION foobar ( input text ) RETURNS text LANGUAGE python AS '''foo''' ;
-
-If you get the error
-  code=2200 [Invalid query] message="Invalid language python for 'foobar'"
-Jython for Apache Cassandra has not been installed correctly.
-
-Notes / Java7 invokedynamic
----------------------------
-
-Jython currently targets Java6 only. They want to switch to Java7 + invokedynamic in Jython 3.

diff --git a/lib/jsr223/scala/README.txt b/lib/jsr223/scala/README.txt
deleted file mode 100644
index 7f5d6fe..0000000
--- a/lib/jsr223/scala/README.txt
+++ /dev/null

@@ -1,37 +0,0 @@
-Apache Cassandra User-Defined-Functions JSR 223 scripting
-=========================================================
-
-Using JSR-223 capable Scala
-
-Tested with version 2.11.2
-
-Installation
-------------
-
-1. Download Scala binary release
-2. Unpack the downloaded archive into a temporary directory
-3. Copy the following jars from the Scala lib directory to $CASSANDRA_HOME/lib/jsr223/scala
-   scala-compiler.jar
-   scala-library.jar
-   scala-reflect.jar
-4. Restart your Cassandra daemon if it's already running
-
-Cassandra log should contain a line like this:
-  INFO  11:42:35 Found scripting engine Scala Interpreter 1.0 - Scala version 2.11.2 - language names: [scala]
-Such a line appears when you already have scripted UDFs in your system or add a scripted UDF for the first time (see below).
-
-Smoke Test
-----------
-
-To test Scala functionality, open cqlsh and execute the following command:
-  CREATE OR REPLACE FUNCTION foobar ( input text ) RETURNS text LANGUAGE scala AS 'return "foo";' ;
-
-If you get the error
-  code=2200 [Invalid query] message="Invalid language scala for 'foobar'"
-Scala for Apache Cassandra has not been installed correctly.
-
-Notes / Java7 invokedynamic
----------------------------
-
-Scala 2.10 has Java6 support only. 2.11 has experimental invokedynamic support (use at your own risk!).
-2.12 introduces an upgrade directly to Java8 - see https://stackoverflow.com/questions/14285894/advantages-of-scala-emitting-bytecode-for-the-jvm-1-7
\ No newline at end of file

diff --git a/lib/jstackjunit-0.0.1.jar b/lib/jstackjunit-0.0.1.jar
new file mode 100644
index 0000000..ba39976
--- /dev/null
+++ b/lib/jstackjunit-0.0.1.jar
Binary files differ

diff --git a/lib/licenses/asm-5.0.4.txt b/lib/licenses/asm-5.0.4.txt
new file mode 100644
index 0000000..c5aba7b
--- /dev/null
+++ b/lib/licenses/asm-5.0.4.txt

@@ -0,0 +1,29 @@
+Copyright (c) 2000-2011 INRIA, France Telecom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holders nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGE.

diff --git a/lib/licenses/cassandra-driver-2.1.3.txt b/lib/licenses/cassandra-driver-3.0.1.txt
similarity index 100%
rename from lib/licenses/cassandra-driver-2.1.3.txt
rename to lib/licenses/cassandra-driver-3.0.1.txt


diff --git a/lib/licenses/crc32ex-0.1.1.txt b/lib/licenses/crc32ex-0.1.1.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/licenses/crc32ex-0.1.1.txt
+++ /dev/null

@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

diff --git a/lib/licenses/guava-16.0.txt b/lib/licenses/guava-18.0.txt
similarity index 100%
rename from lib/licenses/guava-16.0.txt
rename to lib/licenses/guava-18.0.txt


diff --git a/lib/licenses/jna-4.0.0.txt b/lib/licenses/jna-4.2.2.txt
similarity index 100%
rename from lib/licenses/jna-4.0.0.txt
rename to lib/licenses/jna-4.2.2.txt


diff --git a/lib/licenses/jstackjunit-0.0.1.txt b/lib/licenses/jstackjunit-0.0.1.txt
new file mode 100644
index 0000000..d5c4984
--- /dev/null
+++ b/lib/licenses/jstackjunit-0.0.1.txt

@@ -0,0 +1,209 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+THIRD-PARTY DEPENDENCIES
+========================
+Convenience copies of some third-party dependencies are distributed with
+Apache Cassandra as Java jar files in lib/. Licensing information for
+these files can be found in the lib/licenses directory.

diff --git a/lib/licenses/ohc-0.3.4.txt b/lib/licenses/ohc-0.4.3.txt
similarity index 100%
rename from lib/licenses/ohc-0.3.4.txt
rename to lib/licenses/ohc-0.4.3.txt


diff --git a/lib/licenses/super-csv-2.1.0.txt b/lib/licenses/super-csv-2.1.0.txt
deleted file mode 100644
index d645695..0000000
--- a/lib/licenses/super-csv-2.1.0.txt
+++ /dev/null

@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

diff --git a/lib/ohc-core-0.3.4.jar b/lib/ohc-core-0.3.4.jar
deleted file mode 100644
index 0773e78..0000000
--- a/lib/ohc-core-0.3.4.jar
+++ /dev/null
Binary files differ

diff --git a/lib/ohc-core-0.4.3.jar b/lib/ohc-core-0.4.3.jar
new file mode 100644
index 0000000..2f75a9d
--- /dev/null
+++ b/lib/ohc-core-0.4.3.jar
Binary files differ

diff --git a/lib/ohc-core-j8-0.3.4.jar b/lib/ohc-core-j8-0.3.4.jar
deleted file mode 100644
index faa102f..0000000
--- a/lib/ohc-core-j8-0.3.4.jar
+++ /dev/null
Binary files differ

diff --git a/lib/ohc-core-j8-0.4.3.jar b/lib/ohc-core-j8-0.4.3.jar
new file mode 100644
index 0000000..7f2007e
--- /dev/null
+++ b/lib/ohc-core-j8-0.4.3.jar
Binary files differ

diff --git a/lib/super-csv-2.1.0.jar b/lib/super-csv-2.1.0.jar
deleted file mode 100644
index 6a85716..0000000
--- a/lib/super-csv-2.1.0.jar
+++ /dev/null
Binary files differ

diff --git a/pylib/cassandra-cqlsh-tests.sh b/pylib/cassandra-cqlsh-tests.sh
new file mode 100755
index 0000000..8174636
--- /dev/null
+++ b/pylib/cassandra-cqlsh-tests.sh

@@ -0,0 +1,107 @@
+#!/bin/bash -x
+
+################################
+#
+# Prep
+#
+################################
+
+WORKSPACE=$1
+
+if [ "${WORKSPACE}" = "" ]; then
+    echo "Specify Cassandra source directory"
+    exit
+fi
+
+export PYTHONIOENCODING="utf-8"
+export PYTHONUNBUFFERED=true
+export CASS_DRIVER_NO_EXTENSIONS=true
+export CASS_DRIVER_NO_CYTHON=true
+export CCM_MAX_HEAP_SIZE="2048M"
+export CCM_HEAP_NEWSIZE="200M"
+export CCM_CONFIG_DIR=${WORKSPACE}/.ccm
+export NUM_TOKENS="32"
+export CASSANDRA_DIR=${WORKSPACE}
+export TESTSUITE_NAME="cqlshlib.python2.jdk8"
+
+# Loop to prevent failure due to maven-ant-tasks not downloading a jar..
+for x in $(seq 1 3); do
+    ant -buildfile ${CASSANDRA_DIR}/build.xml realclean jar
+    RETURN="$?"
+    if [ "${RETURN}" -eq "0" ]; then
+        break
+    fi
+done
+# Exit, if we didn't build successfully
+if [ "${RETURN}" -ne "0" ]; then
+    echo "Build failed with exit code: ${RETURN}"
+    exit ${RETURN}
+fi
+
+# Set up venv with dtest dependencies
+set -e # enable immediate exit if venv setup fails
+virtualenv --python=python2 --no-site-packages venv
+source venv/bin/activate
+pip install -r ${CASSANDRA_DIR}/pylib/requirements.txt
+pip freeze
+
+if [ "$cython" = "yes" ]; then
+    TESTSUITE_NAME="${TESTSUITE_NAME}.cython"
+    pip install "Cython>=0.20,<0.25"
+    cd pylib/; python setup.py build_ext --inplace
+    cd ${WORKSPACE}
+else
+    TESTSUITE_NAME="${TESTSUITE_NAME}.no_cython"
+fi
+
+################################
+#
+# Main
+#
+################################
+
+ccm remove test || true # in case an old ccm cluster is left behind
+ccm create test -n 1 --install-dir=${CASSANDRA_DIR}
+ccm updateconf "enable_user_defined_functions: true"
+
+version_from_build=$(ccm node1 versionfrombuild)
+export pre_or_post_cdc=$(python -c """from distutils.version import LooseVersion
+print \"postcdc\" if LooseVersion(\"${version_from_build}\") >= \"3.8\" else \"precdc\"
+""")
+case "${pre_or_post_cdc}" in
+    postcdc)
+        ccm updateconf "cdc_enabled: true"
+        ;;
+    precdc)
+        :
+        ;;
+    *)
+        echo "${pre_or_post_cdc}" is an invalid value.
+        exit 1
+        ;;
+esac
+
+ccm start --wait-for-binary-proto
+
+cd ${CASSANDRA_DIR}/pylib/cqlshlib/
+
+set +e # disable immediate exit from this point
+nosetests
+
+ccm remove
+# hack around --xunit-prefix-with-testsuite-name not being available in nose 1.3.7
+sed -i "s/testsuite name=\"nosetests\"/testsuite name=\"${TESTSUITE_NAME}\"/g" nosetests.xml
+sed -i "s/testcase classname=\"cqlshlib./testcase classname=\"${TESTSUITE_NAME}./g" nosetests.xml
+mv nosetests.xml ${WORKSPACE}/cqlshlib.xml
+
+################################
+#
+# Clean
+#
+################################
+
+# /virtualenv
+deactivate
+
+# Exit cleanly for usable "Unstable" status
+exit 0

diff --git a/pylib/cqlshlib/copyutil.py b/pylib/cqlshlib/copyutil.py
index b28d96d..6a45153 100644
--- a/pylib/cqlshlib/copyutil.py
+++ b/pylib/cqlshlib/copyutil.py

@@ -37,16 +37,17 @@
 from collections import defaultdict, namedtuple
 from decimal import Decimal
 from Queue import Queue
-from random import randrange
+from random import randint
 from StringIO import StringIO
 from select import select
 from uuid import UUID
 from util import profile_on, profile_off
 
-from cassandra.cluster import Cluster
+from cassandra import OperationTimedOut
+from cassandra.cluster import Cluster, DefaultConnection
 from cassandra.cqltypes import ReversedType, UserType
 from cassandra.metadata import protect_name, protect_names, protect_value
-from cassandra.policies import RetryPolicy, WhiteListRoundRobinPolicy, DCAwareRoundRobinPolicy
+from cassandra.policies import RetryPolicy, WhiteListRoundRobinPolicy, DCAwareRoundRobinPolicy, FallthroughRetryPolicy
 from cassandra.query import BatchStatement, BatchType, SimpleStatement, tuple_factory
 from cassandra.util import Date, Time
 
@@ -356,12 +357,23 @@
         copy_options['skiprows'] = int(opts.pop('skiprows', '0'))
         copy_options['skipcols'] = opts.pop('skipcols', '')
         copy_options['maxparseerrors'] = int(opts.pop('maxparseerrors', '-1'))
-        copy_options['maxinserterrors'] = int(opts.pop('maxinserterrors', '-1'))
+        copy_options['maxinserterrors'] = int(opts.pop('maxinserterrors', '1000'))
         copy_options['errfile'] = safe_normpath(opts.pop('errfile', 'import_%s_%s.err' % (self.ks, self.table,)))
         copy_options['ratefile'] = safe_normpath(opts.pop('ratefile', ''))
         copy_options['maxoutputsize'] = int(opts.pop('maxoutputsize', '-1'))
         copy_options['preparedstatements'] = bool(opts.pop('preparedstatements', 'true').lower() == 'true')
 
+        # Hidden properties, they do not appear in the documentation but can be set in config files
+        # or on the cmd line but w/o completion
+        copy_options['maxinflightmessages'] = int(opts.pop('maxinflightmessages', '512'))
+        copy_options['maxbackoffattempts'] = int(opts.pop('maxbackoffattempts', '12'))
+        copy_options['maxpendingchunks'] = int(opts.pop('maxpendingchunks', '24'))
+        # set requesttimeout to a value high enough so that maxbatchsize rows will never timeout if the server
+        # responds: here we set it to 1 sec per 10 rows but no less than 60 seconds
+        copy_options['requesttimeout'] = int(opts.pop('requesttimeout', max(60, 1 * copy_options['maxbatchsize'] / 10)))
+        # set childtimeout higher than requesttimeout so that child processes have a chance to report request timeouts
+        copy_options['childtimeout'] = int(opts.pop('childtimeout', copy_options['requesttimeout'] + 30))
+
         self.check_options(copy_options)
         return CopyOptions(copy=copy_options, dialect=dialect_options, unrecognized=opts)
 
@@ -1185,9 +1197,21 @@
         if not self.fname:
             self.send_stdin_rows()
 
+        child_timeout = self.options.copy['childtimeout']
+        last_recv_num_records = 0
+        last_recv_time = time.time()
+
         while self.feeding_result is None or self.receive_meter.total_records < self.feeding_result.sent:
             self.receive_results()
 
+            if self.feeding_result is not None:
+                if self.receive_meter.total_records != last_recv_num_records:
+                    last_recv_num_records = self.receive_meter.total_records
+                    last_recv_time = time.time()
+                elif (time.time() - last_recv_time) > child_timeout:
+                    self.shell.printerr("No records inserted in {} seconds, aborting".format(child_timeout))
+                    break
+
             if self.error_handler.max_exceeded() or not self.all_processes_running():
                 break
 
@@ -1258,6 +1282,7 @@
         self.send_meter = RateMeter(log_fcn=None, update_interval=1)
         self.ingest_rate = options.copy['ingestrate']
         self.num_worker_processes = options.copy['numprocesses']
+        self.max_pending_chunks = options.copy['maxpendingchunks']
         self.chunk_id = 0
         self.parent_cluster = parent_cluster
 
@@ -1295,10 +1320,22 @@
         reader = self.reader
         reader.start()
         channels = self.worker_channels
+        max_pending_chunks = self.max_pending_chunks
         sent = 0
+        failed_attempts = 0
 
         while not reader.exhausted:
-            for ch in channels:
+            channels_eligible = filter(lambda c: c.num_pending() < max_pending_chunks, channels)
+            if not channels_eligible:
+                failed_attempts += 1
+                delay = randint(1, pow(2, failed_attempts))
+                printdebugmsg("All workers busy, sleeping for %d second(s)" % (delay,))
+                time.sleep(delay)
+                continue
+            elif failed_attempts > 0:
+                failed_attempts = 0
+
+            for ch in channels_eligible:
                 try:
                     max_rows = self.ingest_rate - self.send_meter.current_record
                     if max_rows <= 0:
@@ -1428,14 +1465,13 @@
         this maximum is per query.
         To back-off we should wait a random number of seconds
         between 0 and 2^c - 1, where c is the number of total failures.
-        randrange() excludes the last value, so we drop the -1.
 
         :return : the number of seconds to wait for, -1 if we should not retry
         """
         if retry_num >= self.max_attempts:
             return -1
 
-        delay = randrange(0, pow(2, retry_num + 1))
+        delay = randint(0, pow(2, retry_num + 1) - 1)
         return delay
 
 
@@ -1845,11 +1881,30 @@
 
         def split(val, sep=','):
             """
-            Split into a list of values whenever we encounter a separator but
+            Split "val" into a list of values whenever the separator "sep" is found, but
             ignore separators inside parentheses or single quotes, except for the two
-            outermost parentheses, which will be ignored. We expect val to be at least
-            2 characters long (the two outer parentheses).
+            outermost parentheses, which will be ignored. This method is called when parsing composite
+            types, "val" should be at least 2 characters long, the first char should be an
+            open parenthesis and the last char should be a matching closing parenthesis. We could also
+            check exactly which parenthesis type depending on the caller, but I don't want to enforce
+            too many checks that don't necessarily provide any additional benefits, and risk breaking
+            data that could previously be imported, even if strictly speaking it is incorrect CQL.
+            For example, right now we accept sets that start with '[' and ']', I don't want to break this
+            by enforcing '{' and '}' in a minor release.
             """
+            def is_open_paren(cc):
+                return cc == '{' or cc == '[' or cc == '('
+
+            def is_close_paren(cc):
+                return cc == '}' or cc == ']' or cc == ')'
+
+            def paren_match(c1, c2):
+                return (c1 == '{' and c2 == '}') or (c1 == '[' and c2 == ']') or (c1 == '(' and c2 == ')')
+
+            if len(val) < 2 or not paren_match(val[0], val[-1]):
+                raise ParseError('Invalid composite string, it should start and end with matching parentheses: {}'
+                                 .format(val))
+
             ret = []
             last = 1
             level = 0
@@ -1858,9 +1913,9 @@
                 if c == '\'':
                     quote = not quote
                 elif not quote:
-                    if c == '{' or c == '[' or c == '(':
+                    if is_open_paren(c):
                         level += 1
-                    elif c == '}' or c == ']' or c == ')':
+                    elif is_close_paren(c):
                         level -= 1
                     elif c == sep and level == 1:
                         ret.append(val[last:i])
@@ -1885,7 +1940,13 @@
 
             m = p.match(val)
             if not m:
-                raise ValueError("can't interpret %r as a date with this format: %s" % (val, self.date_time_format))
+                try:
+                    # in case of overflow COPY TO prints dates as milliseconds from the epoch, see
+                    # deserialize_date_fallback_int in cqlsh.py
+                    return int(val)
+                except ValueError:
+                    raise ValueError("can't interpret %r as a date with format %s or as int" % (val,
+                                                                                                self.date_time_format))
 
             # https://docs.python.org/2/library/time.html#time.struct_time
             tval = time.struct_time((int(m.group(1)), int(m.group(2)), int(m.group(3)),  # year, month, day
@@ -2017,6 +2078,13 @@
             try:
                 return c(v) if v != self.nullval else self.get_null_val()
             except Exception, e:
+                # if we could not convert an empty string, then self.nullval has been set to a marker
+                # because the user needs to import empty strings, except that the converters for some types
+                # will fail to convert an empty string, in this case the null value should be inserted
+                # see CASSANDRA-12794
+                if v == '':
+                    return self.get_null_val()
+
                 if self.debug:
                     traceback.print_exc()
                 raise ParseError("Failed to parse %s : %s" % (val, e.message))
@@ -2116,24 +2184,58 @@
 
 class FastTokenAwarePolicy(DCAwareRoundRobinPolicy):
     """
-    Send to any replicas attached to the query, or else fall back to DCAwareRoundRobinPolicy
+    Send to any replicas attached to the query, or else fall back to DCAwareRoundRobinPolicy. Perform
+    exponential back-off if too many in flight requests to all replicas are already in progress.
     """
 
-    def __init__(self, local_dc='', used_hosts_per_remote_dc=0):
-        DCAwareRoundRobinPolicy.__init__(self, local_dc, used_hosts_per_remote_dc)
+    def __init__(self, parent):
+        DCAwareRoundRobinPolicy.__init__(self, parent.local_dc, 0)
+        self.max_backoff_attempts = parent.max_backoff_attempts
+        self.max_inflight_messages = parent.max_inflight_messages
 
     def make_query_plan(self, working_keyspace=None, query=None):
         """
         Extend TokenAwarePolicy.make_query_plan() so that we choose the same replicas in preference
-        and most importantly we avoid repeating the (slow) bisect
+        and most importantly we avoid repeating the (slow) bisect. We also implement a backoff policy
+        by sleeping an exponentially larger delay in case all connections to eligible replicas have
+        too many in flight requests.
         """
-        replicas = query.replicas if hasattr(query, 'replicas') else []
-        for r in replicas:
-            yield r
+        connections = ConnectionWrapper.connections
+        replicas = list(query.replicas) if hasattr(query, 'replicas') else []
+        replicas.extend([r for r in DCAwareRoundRobinPolicy.make_query_plan(self, working_keyspace, query)
+                        if r not in replicas])
 
-        for r in DCAwareRoundRobinPolicy.make_query_plan(self, working_keyspace, query):
-            if r not in replicas:
-                yield r
+        if replicas:
+            def replica_is_not_overloaded(r):
+                if r.address in connections:
+                    conn = connections[r.address]
+                    return conn.in_flight < min(conn.max_request_id, self.max_inflight_messages)
+                return True
+
+            for i in xrange(self.max_backoff_attempts):
+                for r in filter(replica_is_not_overloaded, replicas):
+                    yield r
+
+                # the back-off starts at 10 ms (0.01) and it can go up to to 2^max_backoff_attempts,
+                # which is currently 12, so 2^12 = 4096 = ~40 seconds when dividing by 0.01
+                delay = randint(1, pow(2, i + 1)) * 0.01
+                printdebugmsg("All replicas busy, sleeping for %d second(s)..." % (delay,))
+                time.sleep(delay)
+
+            printdebugmsg("Replicas too busy, given up")
+
+
+class ConnectionWrapper(DefaultConnection):
+    """
+    A wrapper to the driver default connection that helps in keeping track of messages in flight.
+    The newly created connection is registered into a global dictionary so that FastTokenAwarePolicy
+    is able to determine if a connection has too many in flight requests.
+    """
+    connections = {}
+
+    def __init__(self, *args, **kwargs):
+        DefaultConnection.__init__(self, *args, **kwargs)
+        self.connections[self.host] = self
 
 
 class ImportProcess(ChildProcess):
@@ -2151,6 +2253,10 @@
         self.min_batch_size = options.copy['minbatchsize']
         self.max_batch_size = options.copy['maxbatchsize']
         self.use_prepared_statements = options.copy['preparedstatements']
+        self.max_inflight_messages = options.copy['maxinflightmessages']
+        self.max_backoff_attempts = options.copy['maxbackoffattempts']
+        self.request_timeout = options.copy['requesttimeout']
+
         self.dialect_options = options.dialect
         self._session = None
         self.query = None
@@ -2166,16 +2272,17 @@
                 cql_version=self.cql_version,
                 protocol_version=self.protocol_version,
                 auth_provider=self.auth_provider,
-                load_balancing_policy=FastTokenAwarePolicy(local_dc=self.local_dc),
+                load_balancing_policy=FastTokenAwarePolicy(self),
                 ssl_options=ssl_settings(self.hostname, self.config_file) if self.ssl else None,
-                default_retry_policy=ExpBackoffRetryPolicy(self),
+                default_retry_policy=FallthroughRetryPolicy(),  # we throw on timeouts and retry in the error callback
                 compression=None,
                 control_connection_timeout=self.connect_timeout,
                 connect_timeout=self.connect_timeout,
-                idle_heartbeat_interval=0)
+                idle_heartbeat_interval=0,
+                connection_class=ConnectionWrapper)
 
             self._session = cluster.connect(self.ks)
-            self._session.default_timeout = None
+            self._session.default_timeout = self.request_timeout
         return self._session
 
     def run(self):
@@ -2258,6 +2365,10 @@
                         future = session.execute_async(statement)
                         future.add_callbacks(callback=result_callback, callback_args=(batch, chunk),
                                              errback=err_callback, errback_args=(batch, chunk, replicas))
+                    # do not handle else case, if a statement could not be created, the exception is handled
+                    # in self.wrap_make_statement and the error is reported, if a failure is injected that
+                    # causes the statement to be None, then we should not report the error so that we can test
+                    # the parent process handling missing batches from child processes
 
             except Exception, exc:
                 self.report_error(exc, chunk, chunk['rows'])
@@ -2272,8 +2383,8 @@
                 return None
 
         def make_statement_with_failures(query, conv, chunk, batch, replicas):
-            failed_batch = self.maybe_inject_failures(batch)
-            if failed_batch:
+            failed_batch, apply_failure = self.maybe_inject_failures(batch)
+            if apply_failure:
                 return failed_batch
             return make_statement(query, conv, chunk, batch, replicas)
 
@@ -2363,10 +2474,12 @@
 
     def maybe_inject_failures(self, batch):
         """
-        Examine self.test_failures and see if token_range is either a token range
-        supposed to cause a failure (failing_range) or to terminate the worker process
-        (exit_range). If not then call prepare_export_query(), which implements the
-        normal behavior.
+        Examine self.test_failures and see if the batch is a batch
+        supposed to cause a failure (failing_batch), or to terminate the worker process
+        (exit_batch), or not to be sent (unsent_batch).
+
+        @return any statement that will cause a failure or None if the statement should not be sent
+        plus a boolean indicating if a failure should be applied at all
         """
         if 'failing_batch' in self.test_failures:
             failing_batch = self.test_failures['failing_batch']
@@ -2374,14 +2487,19 @@
                 if batch['attempts'] < failing_batch['failures']:
                     statement = SimpleStatement("INSERT INTO badtable (a, b) VALUES (1, 2)",
                                                 consistency_level=self.consistency_level)
-                    return statement
+                    return statement, True  # use this statement, which will cause an error
 
         if 'exit_batch' in self.test_failures:
             exit_batch = self.test_failures['exit_batch']
             if exit_batch['id'] == batch['id']:
                 sys.exit(1)
 
-        return None  # carry on as normal
+        if 'unsent_batch' in self.test_failures:
+            unsent_batch = self.test_failures['unsent_batch']
+            if unsent_batch['id'] == batch['id']:
+                return None, True  # do not send this batch, which will cause missing acks in the parent process
+
+        return None, False  # carry on as normal, do not apply any failures
 
     @staticmethod
     def make_batch(batch_id, rows, attempts=1):
@@ -2442,6 +2560,8 @@
         self.update_chunk(batch['rows'], chunk)
 
     def err_callback(self, response, batch, chunk, replicas):
+        if isinstance(response, OperationTimedOut) and chunk['imported'] == chunk['num_rows_sent']:
+            return  # occasionally the driver sends false timeouts for rows already processed (PYTHON-652)
         err_is_final = batch['attempts'] >= self.max_attempts
         self.report_error(response, chunk, batch['rows'], batch['attempts'], err_is_final)
         if not err_is_final:

diff --git a/pylib/cqlshlib/cql3handling.py b/pylib/cqlshlib/cql3handling.py
index 8224ad9..0f42e6e 100644
--- a/pylib/cqlshlib/cql3handling.py
+++ b/pylib/cqlshlib/cql3handling.py

@@ -35,8 +35,8 @@
         return 'Unexpected table structure; may not translate correctly to CQL. ' + self.msg
 
 
-SYSTEM_KEYSPACES = ('system', 'system_traces', 'system_auth', 'system_distributed')
-NONALTERBALE_KEYSPACES = ('system')
+SYSTEM_KEYSPACES = ('system', 'system_schema', 'system_traces', 'system_auth', 'system_distributed')
+NONALTERBALE_KEYSPACES = ('system', 'system_schema')
 
 
 class Cql3ParsingRuleSet(CqlParsingRuleSet):
@@ -58,7 +58,7 @@
         # (CQL3 option name, schema_columnfamilies column name (or None if same),
         #  list of known map keys)
         ('compaction', 'compaction_strategy_options',
-            ('class', 'max_threshold', 'tombstone_compaction_interval', 'tombstone_threshold', 'enabled', 'unchecked_tombstone_compaction')),
+            ('class', 'max_threshold', 'tombstone_compaction_interval', 'tombstone_threshold', 'enabled', 'unchecked_tombstone_compaction', 'only_purge_repaired_tombstones')),
         ('compression', 'compression_parameters',
             ('sstable_compression', 'chunk_length_kb', 'crc_check_chance')),
         ('caching', None,
@@ -240,6 +240,7 @@
 <schemaChangeStatement> ::= <createKeyspaceStatement>
                           | <createColumnFamilyStatement>
                           | <createIndexStatement>
+                          | <createMaterializedViewStatement>
                           | <createUserTypeStatement>
                           | <createFunctionStatement>
                           | <createAggregateStatement>
@@ -247,6 +248,7 @@
                           | <dropKeyspaceStatement>
                           | <dropColumnFamilyStatement>
                           | <dropIndexStatement>
+                          | <dropMaterializedViewStatement>
                           | <dropUserTypeStatement>
                           | <dropFunctionStatement>
                           | <dropAggregateStatement>
@@ -294,6 +296,8 @@
 
 <columnFamilyName> ::= ( ksname=<cfOrKsName> dot="." )? cfname=<cfOrKsName> ;
 
+<materializedViewName> ::= ( ksname=<cfOrKsName> dot="." )? mvname=<cfOrKsName> ;
+
 <userTypeName> ::= ( ksname=<cfOrKsName> dot="." )? utname=<cfOrKsName> ;
 
 <keyspaceName> ::= ksname=<cfOrKsName> ;
@@ -514,6 +518,13 @@
             opts.add('min_threshold')
             opts.add('max_window_size_seconds')
             opts.add('timestamp_resolution')
+        elif csc == 'TimeWindowCompactionStrategy':
+            opts.add('compaction_window_unit')
+            opts.add('compaction_window_size')
+            opts.add('min_threshold')
+            opts.add('max_threshold')
+            opts.add('timestamp_resolution')
+
         return map(escape_value, opts)
     return ()
 
@@ -573,6 +584,7 @@
 
 
 completer_for('columnFamilyName', 'ksname')(cf_ks_name_completer)
+completer_for('materializedViewName', 'ksname')(cf_ks_name_completer)
 
 
 def cf_ks_dot_completer(ctxt, cass):
@@ -583,6 +595,7 @@
 
 
 completer_for('columnFamilyName', 'dot')(cf_ks_dot_completer)
+completer_for('materializedViewName', 'dot')(cf_ks_dot_completer)
 
 
 @completer_for('columnFamilyName', 'cfname')
@@ -599,6 +612,20 @@
     return map(maybe_escape_name, cfnames)
 
 
+@completer_for('materializedViewName', 'mvname')
+def mv_name_completer(ctxt, cass):
+    ks = ctxt.get_binding('ksname', None)
+    if ks is not None:
+        ks = dequote_name(ks)
+    try:
+        mvnames = cass.get_materialized_view_names(ks)
+    except Exception:
+        if ks is None:
+            return ()
+        raise
+    return map(maybe_escape_name, mvnames)
+
+
 completer_for('userTypeName', 'ksname')(cf_ks_name_completer)
 
 completer_for('userTypeName', 'dot')(cf_ks_dot_completer)
@@ -656,7 +683,7 @@
 <useStatement> ::= "USE" <keyspaceName>
                  ;
 <selectStatement> ::= "SELECT" ( "JSON" )? <selectClause>
-                        "FROM" cf=<columnFamilyName>
+                        "FROM" (cf=<columnFamilyName> | mv=<materializedViewName>)
                           ( "WHERE" <whereClause> )?
                           ( "ORDER" "BY" <orderByClause> ( "," <orderByClause> )* )?
                           ( "LIMIT" limit=<wholenumber> )?
@@ -889,7 +916,7 @@
                ;
 <conditions> ::=  <condition> ( "AND" <condition> )*
                ;
-<condition> ::= <cident> ( "[" <term> "]" )? ( ( "=" | "<" | ">" | "<=" | ">=" | "!=" ) <term>
+<condition> ::= <cident> ( "[" <term> "]" )? (("=" | "<" | ">" | "<=" | ">=" | "!=") <term>
                                              | "IN" "(" <term> ( "," <term> )* ")")
               ;
 '''
@@ -1156,6 +1183,11 @@
                                ( "USING" <stringLiteral> ( "WITH" "OPTIONS" "=" <mapLiteral> )? )?
                          ;
 
+<createMaterializedViewStatement> ::= "CREATE" "MATERIALIZED" "VIEW" ("IF" "NOT" "EXISTS")? <materializedViewName>?
+                                      "AS" <selectStatement>
+                                      "PRIMARY" "KEY" <pkDef>
+                                    ;
+
 <createUserTypeStatement> ::= "CREATE" "TYPE" ( ks=<nonSystemKeyspaceName> dot="." )? typename=<cfOrKsName> "(" newcol=<cident> <storageType>
                                 ( "," [newcolname]=<cident> <storageType> )*
                             ")"
@@ -1216,6 +1248,9 @@
 <dropIndexStatement> ::= "DROP" "INDEX" ("IF" "EXISTS")? idx=<indexName>
                        ;
 
+<dropMaterializedViewStatement> ::= "DROP" "MATERIALIZED" "VIEW" ("IF" "EXISTS")? mv=<materializedViewName>
+                                  ;
+
 <dropUserTypeStatement> ::= "DROP" "TYPE" ut=<userTypeName>
                           ;
 
@@ -1259,8 +1294,7 @@
 <alterTableStatement> ::= "ALTER" wat=( "COLUMNFAMILY" | "TABLE" ) cf=<columnFamilyName>
                                <alterInstructions>
                         ;
-<alterInstructions> ::= "ALTER" existcol=<cident> "TYPE" <storageType>
-                      | "ADD" newcol=<cident> <storageType> ("static")?
+<alterInstructions> ::= "ADD" newcol=<cident> <storageType> ("static")?
                       | "DROP" existcol=<cident>
                       | "WITH" <cfamProperty> ( "AND" <cfamProperty> )*
                       | "RENAME" existcol=<cident> "TO" newcol=<cident>
@@ -1270,8 +1304,7 @@
 <alterUserTypeStatement> ::= "ALTER" "TYPE" ut=<userTypeName>
                                <alterTypeInstructions>
                              ;
-<alterTypeInstructions> ::= "ALTER" existcol=<cident> "TYPE" <storageType>
-                           | "ADD" newcol=<cident> <storageType>
+<alterTypeInstructions> ::= "ADD" newcol=<cident> <storageType>
                            | "RENAME" existcol=<cident> "TO" newcol=<cident>
                               ( "AND" existcol=<cident> "TO" newcol=<cident> )*
                            ;

diff --git a/pylib/cqlshlib/cqlhandling.py b/pylib/cqlshlib/cqlhandling.py
index a8a0ba8..51d9726 100644
--- a/pylib/cqlshlib/cqlhandling.py
+++ b/pylib/cqlshlib/cqlhandling.py

@@ -35,7 +35,8 @@
     available_compaction_classes = (
         'LeveledCompactionStrategy',
         'SizeTieredCompactionStrategy',
-        'DateTieredCompactionStrategy'
+        'DateTieredCompactionStrategy',
+        'TimeWindowCompactionStrategy'
     )
 
     replication_strategies = (

diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py
index f95bfff..53ba478 100644
--- a/pylib/cqlshlib/formatting.py
+++ b/pylib/cqlshlib/formatting.py

@@ -261,7 +261,15 @@
     ret_dt = datetime_from_timestamp(seconds).replace(tzinfo=UTC())
     if timezone:
         ret_dt = ret_dt.astimezone(timezone)
-    return ret_dt.strftime(time_format)
+    try:
+        return ret_dt.strftime(time_format)
+    except ValueError:
+        # CASSANDRA-13185: if the date cannot be formatted as a string, return a string with the milliseconds
+        # since the epoch. cqlsh does the exact same thing for values below datetime.MINYEAR (1) or above
+        # datetime.MAXYEAR (9999). Some versions of strftime() also have problems for dates between MIN_YEAR and 1900.
+        # cqlsh COPY assumes milliseconds from the epoch if it fails to parse a datetime string, and so it is
+        # able to correctly import timestamps exported as milliseconds since the epoch.
+        return '%d' % (seconds * 1000.0)
 
 
 @formatter_for('Date')

diff --git a/pylib/cqlshlib/helptopics.py b/pylib/cqlshlib/helptopics.py
index 279063b..46cd156 100644
--- a/pylib/cqlshlib/helptopics.py
+++ b/pylib/cqlshlib/helptopics.py

@@ -16,7 +16,6 @@
 
 
 class CQL3HelpTopics(object):
-
     def get_help_topics(self):
         return [t[5:] for t in dir(self) if t.startswith('help_')]
 
@@ -145,6 +144,15 @@
     def help_drop_trigger(self):
         return 'dropTriggerStmt'
 
+    def help_create_materialized_view(self):
+        return 'createMVStmt'
+
+    def help_alter_materialized_view(self):
+        return 'alterMVStmt'
+
+    def help_drop_materialized_view(self):
+        return 'dropMVStmt'
+
     def help_keywords(self):
         return 'appendixA'
 
@@ -160,6 +168,15 @@
     def help_list_users(self):
         return 'listUsersStmt'
 
+    def help_create_role(self):
+        return 'createRoleStmt'
+
+    def help_drop_role(self):
+        return 'dropRoleStmt'
+
+    def help_list_roles(self):
+        return 'listRolesStmt'
+
     def help_permissions(self):
         return 'permissions'
 

diff --git a/pylib/cqlshlib/setup.cfg b/pylib/cqlshlib/setup.cfg
new file mode 100644
index 0000000..6c523ee
--- /dev/null
+++ b/pylib/cqlshlib/setup.cfg

@@ -0,0 +1,4 @@
+[nosetests]
+verbosity=3
+detailed-errors=1
+with-xunit=1

diff --git a/pylib/cqlshlib/test/cassconnect.py b/pylib/cqlshlib/test/cassconnect.py
index 71f7565..501850c 100644
--- a/pylib/cqlshlib/test/cassconnect.py
+++ b/pylib/cqlshlib/test/cassconnect.py

@@ -24,15 +24,13 @@
 
 test_keyspace_init = os.path.join(rundir, 'test_keyspace_init.cql')
 
-def get_cassandra_connection(cql_version=cqlsh.DEFAULT_CQLVER):
-    if cql_version is None:
-        cql_version = cqlsh.DEFAULT_CQLVER
+def get_cassandra_connection(cql_version=None):
     conn = cql((TEST_HOST,), TEST_PORT, cql_version=cql_version, load_balancing_policy=policy)
     # until the cql lib does this for us
     conn.cql_version = cql_version
     return conn
 
-def get_cassandra_cursor(cql_version=cqlsh.DEFAULT_CQLVER):
+def get_cassandra_cursor(cql_version=None):
     return get_cassandra_connection(cql_version=cql_version).cursor()
 
 TEST_KEYSPACES_CREATED = []
@@ -83,7 +81,7 @@
         c.execute('DROP KEYSPACE %s' % quote_name(TEST_KEYSPACES_CREATED.pop(-1)))
 
 @contextlib.contextmanager
-def cassandra_connection(cql_version=cqlsh.DEFAULT_CQLVER):
+def cassandra_connection(cql_version=None):
     """
     Make a Cassandra CQL connection with the given CQL version and get a cursor
     for it, and optionally connect to a given keyspace.

diff --git a/pylib/cqlshlib/test/test_cqlsh_completion.py b/pylib/cqlshlib/test/test_cqlsh_completion.py
index 19bd092..75198b6 100644
--- a/pylib/cqlshlib/test/test_cqlsh_completion.py
+++ b/pylib/cqlshlib/test/test_cqlsh_completion.py

@@ -42,7 +42,7 @@
 class CqlshCompletionCase(BaseTestCase):
 
     def setUp(self):
-        self.cqlsh_runner = testrun_cqlsh(cqlver=cqlsh.DEFAULT_CQLVER, env={'COLUMNS': '100000'})
+        self.cqlsh_runner = testrun_cqlsh(cqlver=None, env={'COLUMNS': '100000'})
         self.cqlsh = self.cqlsh_runner.__enter__()
 
     def tearDown(self):
@@ -406,7 +406,7 @@
                                      'utf8_with_special_chars',
                                      'system_traces.', 'songs',
                                      'system_auth.', 'system_distributed.',
-                                     'system_traces.',
+                                     'system_schema.', 'system_traces.',
                                      '"' + self.cqlsh.keyspace + '".'],
                             other_choices_ok=True)
         self.trycompletions('DELETE FROM twenty_rows_composite_table ',
@@ -523,7 +523,7 @@
         self.trycompletions('DROP ',
                             choices=['AGGREGATE', 'COLUMNFAMILY', 'FUNCTION',
                                      'INDEX', 'KEYSPACE', 'ROLE', 'TABLE',
-                                     'TRIGGER', 'TYPE', 'USER'])
+                                     'TRIGGER', 'TYPE', 'USER', 'MATERIALIZED'])
 
     def test_complete_in_drop_keyspace(self):
         self.trycompletions('DROP K', immediate='EYSPACE ')
@@ -617,7 +617,8 @@
                             + "{'class': '",
                             choices=['SizeTieredCompactionStrategy',
                                      'LeveledCompactionStrategy',
-                                     'DateTieredCompactionStrategy'])
+                                     'DateTieredCompactionStrategy',
+                                     'TimeWindowCompactionStrategy'])
         self.trycompletions(prefix + " new_table (col_a int PRIMARY KEY) WITH compaction = "
                             + "{'class': 'S",
                             immediate="izeTieredCompactionStrategy'")
@@ -637,7 +638,8 @@
                                      'min_sstable_size', 'min_threshold',
                                      'tombstone_compaction_interval',
                                      'tombstone_threshold',
-                                     'unchecked_tombstone_compaction'])
+                                     'unchecked_tombstone_compaction',
+                                     'only_purge_repaired_tombstones'])
         self.trycompletions(prefix + " new_table (col_a int PRIMARY KEY) WITH compaction = "
                             + "{'class': 'SizeTieredCompactionStrategy'}",
                             choices=[';', 'AND'])
@@ -658,7 +660,15 @@
                                      'timestamp_resolution', 'min_threshold', 'class', 'max_threshold',
                                      'tombstone_compaction_interval', 'tombstone_threshold',
                                      'enabled', 'unchecked_tombstone_compaction',
-                                     'max_window_size_seconds'])
+                                     'max_window_size_seconds', 'only_purge_repaired_tombstones'])
+        self.trycompletions(prefix + " new_table (col_a int PRIMARY KEY) WITH compaction = "
+                            + "{'class': 'TimeWindowCompactionStrategy', '",
+                            choices=['compaction_window_unit', 'compaction_window_size',
+                                     'timestamp_resolution', 'min_threshold', 'class', 'max_threshold',
+                                     'tombstone_compaction_interval', 'tombstone_threshold',
+                                     'enabled', 'unchecked_tombstone_compaction',
+                                     'only_purge_repaired_tombstones'])
+
 
     def test_complete_in_create_columnfamily(self):
         self.trycompletions('CREATE C', choices=['COLUMNFAMILY', 'CUSTOM'])

diff --git a/pylib/cqlshlib/test/test_cqlsh_output.py b/pylib/cqlshlib/test/test_cqlsh_output.py
index e47b981..50849d4 100644
--- a/pylib/cqlshlib/test/test_cqlsh_output.py
+++ b/pylib/cqlshlib/test/test_cqlsh_output.py

@@ -67,13 +67,6 @@
                                  'Actually got:      %s\ncolor code:        %s'
                                  % (tags, coloredtext.colored_version(), coloredtext.colortags()))
 
-    def assertCqlverQueriesGiveColoredOutput(self, queries_and_expected_outputs,
-                                             cqlver=(cqlsh.DEFAULT_CQLVER,), **kwargs):
-        if not isinstance(cqlver, (tuple, list)):
-            cqlver = (cqlver,)
-        for ver in cqlver:
-            self.assertQueriesGiveColoredOutput(queries_and_expected_outputs, cqlver=ver, **kwargs)
-
     def assertQueriesGiveColoredOutput(self, queries_and_expected_outputs, **kwargs):
         """
         Allow queries and expected output to be specified in structured tuples,
@@ -133,7 +126,7 @@
                 self.assertHasColors(c.read_to_next_prompt())
 
     def test_count_output(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ('select count(*) from has_all_types;', """
              count
              MMMMM
@@ -198,7 +191,7 @@
             (1 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
         q = 'select COUNT(*) FROM twenty_rows_composite_table limit 1000000;'
         self.assertQueriesGiveColoredOutput((
@@ -214,10 +207,10 @@
             (1 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_static_cf_output(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select a, b from twenty_rows_table where a in ('1', '13', '2');", """
              a  | b
              RR   MM
@@ -234,7 +227,7 @@
             (3 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
         self.assertQueriesGiveColoredOutput((
             ('select * from dynamic_columns;', """
@@ -257,11 +250,11 @@
             (5 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_empty_cf_output(self):
         # we print the header after CASSANDRA-6910
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ('select * from empty_table;', """
              lonelykey | lonelycol
              RRRRRRRRR   MMMMMMMMM
@@ -270,7 +263,7 @@
 
             (0 rows)
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
         q = 'select * from has_all_types where num = 999;'
 
@@ -284,7 +277,7 @@
 
             (0 rows)
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_columnless_key_output(self):
         q = "select a from twenty_rows_table where a in ('1', '2', '-9192');"
@@ -304,10 +297,10 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_numeric_output(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ('''select intcol, bigintcol, varintcol \
                   from has_all_types \
                  where num in (0, 1, 2, 3, 4);''', """
@@ -353,7 +346,7 @@
             (5 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_timestamp_output(self):
         self.assertQueriesGiveColoredOutput((
@@ -390,7 +383,7 @@
             pass
 
     def test_boolean_output(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ('select num, booleancol from has_all_types where num in (0, 1, 2, 3);', """
              num | booleancol
              RRR   MMMMMMMMMM
@@ -409,11 +402,11 @@
             (4 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_null_output(self):
         # column with metainfo but no values
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select k, c, notthere from undefined_values_table where k in ('k1', 'k2');", """
              k  | c  | notthere
              R    M    MMMMMMMM
@@ -428,7 +421,7 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
         # all-columns, including a metainfo column has no values (cql3)
         self.assertQueriesGiveColoredOutput((
@@ -446,10 +439,10 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_string_output_ascii(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select * from ascii_with_special_chars where k in (0, 1, 2, 3);", r"""
              k | val
              R   MMM
@@ -468,7 +461,7 @@
             (4 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_string_output_utf8(self):
         # many of these won't line up visually here, to keep the source code
@@ -477,7 +470,7 @@
         # terminals, but the color-checking machinery here will still treat
         # it as one character, so those won't seem to line up visually either.
 
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select * from utf8_with_special_chars where k in (0, 1, 2, 3, 4, 5, 6);", u"""
              k | val
              R   MMM
@@ -502,10 +495,10 @@
             (7 rows)
             nnnnnnnn
             """.encode('utf-8')),
-        ), cqlver=cqlsh.DEFAULT_CQLVER, env={'LANG': 'en_US.UTF-8'})
+        ), env={'LANG': 'en_US.UTF-8'})
 
     def test_blob_output(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select num, blobcol from has_all_types where num in (0, 1, 2, 3);", r"""
              num | blobcol
              RRR   MMMMMMM
@@ -524,10 +517,10 @@
             (4 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_prompt(self):
-        with testrun_cqlsh(tty=True, keyspace=None, cqlver=cqlsh.DEFAULT_CQLVER) as c:
+        with testrun_cqlsh(tty=True, keyspace=None) as c:
             self.assertTrue(c.output_header.splitlines()[-1].endswith('cqlsh> '))
 
             c.send('\n')
@@ -559,8 +552,7 @@
                              "RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR")
 
     def test_describe_keyspace_output(self):
-        fullcqlver = cqlsh.DEFAULT_CQLVER
-        with testrun_cqlsh(tty=True, cqlver=fullcqlver) as c:
+        with testrun_cqlsh(tty=True) as c:
             ks = get_keyspace()
             qks = quote_name(ks)
             for cmd in ('describe keyspace', 'desc keyspace'):
@@ -568,7 +560,7 @@
                     for semicolon in ('', ';'):
                         fullcmd = cmd + (' ' if givename else '') + givename + semicolon
                         desc = c.cmd_and_response(fullcmd)
-                        self.check_describe_keyspace_output(desc, givename or qks, fullcqlver)
+                        self.check_describe_keyspace_output(desc, givename or qks)
 
             # try to actually execute that last keyspace description, with a
             # new keyspace name
@@ -577,7 +569,7 @@
             statements = split_cql_commands(copy_desc)
             do_drop = True
 
-            with cassandra_cursor(cql_version=fullcqlver) as curs:
+            with cassandra_cursor() as curs:
                 try:
                     for stmt in statements:
                         cqlshlog.debug('TEST EXEC: %s' % stmt)
@@ -587,7 +579,7 @@
                     if do_drop:
                         curs.execute('drop keyspace %s' % quote_name(new_ks_name))
 
-    def check_describe_keyspace_output(self, output, qksname, fullcqlver):
+    def check_describe_keyspace_output(self, output, qksname):
         expected_bits = [r'(?im)^CREATE KEYSPACE %s WITH\b' % re.escape(qksname),
                          r';\s*$',
                          r'\breplication = {\'class\':']
@@ -619,10 +611,11 @@
                 varcharcol text,
                 varintcol varint
             ) WITH bloom_filter_fp_chance = 0.01
-                AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
+                AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
                 AND comment = ''
-                AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'}
-                AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
+                AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
+                AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
+                AND crc_check_chance = 1.0
                 AND dclocal_read_repair_chance = 0.1
                 AND default_time_to_live = 0
                 AND gc_grace_seconds = 864000
@@ -630,11 +623,11 @@
                 AND memtable_flush_period_in_ms = 0
                 AND min_index_interval = 128
                 AND read_repair_chance = 0.0
-                AND speculative_retry = '99.0PERCENTILE';
+                AND speculative_retry = '99PERCENTILE';
 
         """ % quote_name(get_keyspace()))
 
-        with testrun_cqlsh(tty=True, cqlver=cqlsh.DEFAULT_CQLVER) as c:
+        with testrun_cqlsh(tty=True) as c:
             for cmdword in ('describe table', 'desc columnfamily'):
                 for semicolon in (';', ''):
                     output = c.cmd_and_response('%s has_all_types%s' % (cmdword, semicolon))
@@ -652,7 +645,7 @@
 
         ks = get_keyspace()
 
-        with testrun_cqlsh(tty=True, keyspace=None, cqlver=cqlsh.DEFAULT_CQLVER) as c:
+        with testrun_cqlsh(tty=True, keyspace=None) as c:
 
             # when not in a keyspace
             for cmdword in ('DESCRIBE COLUMNFAMILIES', 'desc tables'):
@@ -703,7 +696,7 @@
             \n
         '''
 
-        with testrun_cqlsh(tty=True, keyspace=None, cqlver=cqlsh.DEFAULT_CQLVER) as c:
+        with testrun_cqlsh(tty=True, keyspace=None) as c:
 
             # not in a keyspace
             for semicolon in ('', ';'):
@@ -791,7 +784,7 @@
         pass
 
     def test_user_types_output(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select addresses from users;", r"""
              addresses
              MMMMMMMMM
@@ -806,8 +799,8 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
-        self.assertCqlverQueriesGiveColoredOutput((
+        ))
+        self.assertQueriesGiveColoredOutput((
             ("select phone_numbers from users;", r"""
              phone_numbers
              MMMMMMMMMMMMM
@@ -822,10 +815,10 @@
             (2 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))
 
     def test_user_types_with_collections(self):
-        self.assertCqlverQueriesGiveColoredOutput((
+        self.assertQueriesGiveColoredOutput((
             ("select info from songs;", r"""
              info
              MMMM
@@ -838,8 +831,8 @@
             (1 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
-        self.assertCqlverQueriesGiveColoredOutput((
+        ))
+        self.assertQueriesGiveColoredOutput((
             ("select tags from songs;", r"""
              tags
              MMMM
@@ -852,4 +845,4 @@
             (1 rows)
             nnnnnnnn
             """),
-        ), cqlver=cqlsh.DEFAULT_CQLVER)
+        ))

diff --git a/pylib/requirements.txt b/pylib/requirements.txt
new file mode 100644
index 0000000..a9b6217
--- /dev/null
+++ b/pylib/requirements.txt

@@ -0,0 +1,21 @@
+# See python driver docs: futures and six have to be installed before
+# cythonizing the driver, perhaps only on old pips.
+# http://datastax.github.io/python-driver/installation.html#cython-based-extensions
+futures
+six
+-e git+https://github.com/datastax/python-driver.git@cassandra-test#egg=cassandra-driver
+# Used ccm version is tracked by cassandra-test branch in ccm repo. Please create a PR there for fixes or upgrades to new releases.
+-e git+https://github.com/riptano/ccm.git@cassandra-test#egg=ccm
+cql
+decorator
+docopt
+enum34
+flaky
+mock
+nose
+nose-test-select
+parse
+pycodestyle
+psutil
+pycassa
+thrift==0.9.3

diff --git a/redhat/cassandra.spec b/redhat/cassandra.spec
index 07c3dc5..2a83b60 100644
--- a/redhat/cassandra.spec
+++ b/redhat/cassandra.spec

@@ -22,8 +22,9 @@
 BuildRoot:     %{_tmppath}/%{relname}root-%(%{__id_u} -n)
 
 BuildRequires: ant >= 1.9
+BuildRequires: ant-junit >= 1.9
 
-Requires:      jre >= 1.7.0
+Requires:      jre >= 1.8.0
 Requires:      python(abi) >= 2.7
 Requires(pre): user(cassandra)
 Requires(pre): group(cassandra)
@@ -61,6 +62,7 @@
 mkdir -p %{buildroot}/var/lib/%{username}/commitlog
 mkdir -p %{buildroot}/var/lib/%{username}/data
 mkdir -p %{buildroot}/var/lib/%{username}/saved_caches
+mkdir -p %{buildroot}/var/lib/%{username}/hints
 mkdir -p %{buildroot}/var/run/%{username}
 mkdir -p %{buildroot}/var/log/%{username}
 ( cd pylib && python2.7 setup.py install --no-compile --root %{buildroot}; )
@@ -68,16 +70,18 @@
 # patches for data and log paths
 patch -p1 < debian/patches/001cassandra_yaml_dirs.dpatch
 patch -p1 < debian/patches/002cassandra_logdir_fix.dpatch
+# uncomment hints_directory path
+sed -i 's/^# hints_directory:/hints_directory:/' conf/cassandra.yaml
 
 # remove batch, powershell, and other files not being installed
-rm conf/*.ps1
-rm bin/*.bat
-rm bin/*.orig
-rm bin/*.ps1
-rm bin/cassandra.in.sh
-rm lib/sigar-bin/*winnt*  # strip segfaults on dll..
-rm tools/bin/*.bat
-rm tools/bin/cassandra.in.sh
+rm -f conf/*.ps1
+rm -f bin/*.bat
+rm -f bin/*.orig
+rm -f bin/*.ps1
+rm -f bin/cassandra.in.sh
+rm -f lib/sigar-bin/*winnt*  # strip segfaults on dll..
+rm -f tools/bin/*.bat
+rm -f tools/bin/cassandra.in.sh
 
 # copy default configs
 cp -pr conf/* %{buildroot}/%{_sysconfdir}/%{username}/default.conf/
@@ -120,10 +124,10 @@
 %attr(755,root,root) %{_bindir}/cqlsh.py
 %attr(755,root,root) %{_bindir}/debug-cql
 %attr(755,root,root) %{_bindir}/nodetool
-%attr(755,root,root) %{_bindir}/sstablekeys
 %attr(755,root,root) %{_bindir}/sstableloader
 %attr(755,root,root) %{_bindir}/sstablescrub
 %attr(755,root,root) %{_bindir}/sstableupgrade
+%attr(755,root,root) %{_bindir}/sstableutil
 %attr(755,root,root) %{_bindir}/sstableverify
 %attr(755,root,root) %{_bindir}/stop-server
 %attr(755,root,root) %{_sbindir}/cassandra
@@ -161,16 +165,14 @@
 This package contains extra tools for working with Cassandra clusters.
 
 %files tools
+%attr(755,root,root) %{_bindir}/sstabledump
 %attr(755,root,root) %{_bindir}/cassandra-stressd
-%attr(755,root,root) %{_bindir}/json2sstable
-%attr(755,root,root) %{_bindir}/sstable2json
 %attr(755,root,root) %{_bindir}/sstableexpiredblockers
 %attr(755,root,root) %{_bindir}/sstablelevelreset
 %attr(755,root,root) %{_bindir}/sstablemetadata
 %attr(755,root,root) %{_bindir}/sstableofflinerelevel
 %attr(755,root,root) %{_bindir}/sstablerepairedset
 %attr(755,root,root) %{_bindir}/sstablesplit
-%attr(755,root,root) %{_bindir}/token-generator
 
 
 %changelog

diff --git a/src/java/org/apache/cassandra/auth/AllowAllAuthenticator.java b/src/java/org/apache/cassandra/auth/AllowAllAuthenticator.java
index bc00c3e..7b21dc6 100644
--- a/src/java/org/apache/cassandra/auth/AllowAllAuthenticator.java
+++ b/src/java/org/apache/cassandra/auth/AllowAllAuthenticator.java

@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.auth;
 
+import java.net.InetAddress;
 import java.util.Collections;
 import java.util.Map;
 import java.util.Set;
@@ -46,7 +47,7 @@
     {
     }
 
-    public SaslNegotiator newSaslNegotiator()
+    public SaslNegotiator newSaslNegotiator(InetAddress clientAddress)
     {
         return AUTHENTICATOR_INSTANCE;
     }

diff --git a/src/java/org/apache/cassandra/auth/AuthKeyspace.java b/src/java/org/apache/cassandra/auth/AuthKeyspace.java
index 199b6e2..d91b014 100644
--- a/src/java/org/apache/cassandra/auth/AuthKeyspace.java
+++ b/src/java/org/apache/cassandra/auth/AuthKeyspace.java

@@ -17,20 +17,30 @@
  */
 package org.apache.cassandra.auth;
 
-import java.util.Arrays;
-import java.util.List;
 import java.util.concurrent.TimeUnit;
 
-import com.google.common.collect.ImmutableMap;
-
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.Tables;
 
-public class AuthKeyspace
+public final class AuthKeyspace
 {
+    private AuthKeyspace()
+    {
+    }
+
     public static final String NAME = "system_auth";
 
+    /**
+     * Generation is used as a timestamp for automatic table creation on startup.
+     * If you make any changes to the tables below, make sure to increment the
+     * generation and document your change here.
+     *
+     * gen 0: original definition in 3.0
+     */
+    public static final long GENERATION = 0;
+
     public static final String ROLES = "roles";
     public static final String ROLE_MEMBERS = "role_members";
     public static final String ROLE_PERMISSIONS = "role_permissions";
@@ -82,9 +92,8 @@
                          .gcGraceSeconds((int) TimeUnit.DAYS.toSeconds(90));
     }
 
-    public static KSMetaData definition()
+    public static KeyspaceMetadata metadata()
     {
-        List<CFMetaData> tables = Arrays.asList(Roles, RoleMembers, RolePermissions, ResourceRoleIndex);
-        return new KSMetaData(NAME, SimpleStrategy.class, ImmutableMap.of("replication_factor", "1"), true, tables);
+        return KeyspaceMetadata.create(NAME, KeyspaceParams.simple(1), Tables.of(Roles, RoleMembers, RolePermissions, ResourceRoleIndex));
     }
 }

diff --git a/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java b/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java
index 68d4303..c9d7bbc 100644
--- a/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java
+++ b/src/java/org/apache/cassandra/auth/CassandraAuthorizer.java

@@ -422,7 +422,7 @@
                             return resource.applicablePermissions().contains(Permission.valueOf(s));
                         }
                     };
-                    SetSerializer<String> serializer = SetSerializer.getInstance(UTF8Serializer.instance);
+                    SetSerializer<String> serializer = SetSerializer.getInstance(UTF8Serializer.instance, UTF8Type.instance);
                     Set<String> originalPerms = serializer.deserialize(row.getBytes("permissions"));
                     Set<String> filteredPerms = ImmutableSet.copyOf(Iterables.filter(originalPerms, isApplicable));
                     insertStatement.execute(QueryState.forInternalCalls(),

diff --git a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java
index 1e5ea8a..aa98b7e 100644
--- a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java
+++ b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java

@@ -39,6 +39,7 @@
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.transport.messages.ResultMessage;
@@ -166,24 +167,16 @@
         {
             legacySelectUserStatement = prepareLegacySelectUserStatement();
 
-            scheduleSetupTask(new Callable<Void>()
-            {
-                public Void call() throws Exception
-                {
-                    convertLegacyData();
-                    return null;
-                }
+            scheduleSetupTask(() -> {
+                convertLegacyData();
+                return null;
             });
         }
         else
         {
-            scheduleSetupTask(new Callable<Void>()
-            {
-                public Void call() throws Exception
-                {
-                    setupDefaultRole();
-                    return null;
-                }
+            scheduleSetupTask(() -> {
+                setupDefaultRole();
+                return null;
             });
         }
     }
@@ -489,7 +482,7 @@
     {
         try
         {
-            return QueryProcessor.parseStatement(String.format(template, keyspace, table)).prepare().statement;
+            return QueryProcessor.parseStatement(String.format(template, keyspace, table)).prepare(ClientState.forInternalCalls()).statement;
         }
         catch (RequestValidationException e)
         {

diff --git a/src/java/org/apache/cassandra/auth/FunctionResource.java b/src/java/org/apache/cassandra/auth/FunctionResource.java
index 1421541..01a4de5 100644
--- a/src/java/org/apache/cassandra/auth/FunctionResource.java
+++ b/src/java/org/apache/cassandra/auth/FunctionResource.java

@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Optional;
 import java.util.Set;
 
 import com.google.common.base.Joiner;
@@ -31,9 +32,9 @@
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.cql3.functions.Functions;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
  * IResource implementation representing functions.
@@ -146,6 +147,9 @@
      */
     public static FunctionResource functionFromCql(String keyspace, String name, List<CQL3Type.Raw> argTypes)
     {
+        if (keyspace == null)
+            throw new InvalidRequestException("In this context function name must be " +
+                                              "explictly qualified by a keyspace");
         List<AbstractType<?>> abstractTypes = new ArrayList<>();
         for (CQL3Type.Raw cqlType : argTypes)
             abstractTypes.add(cqlType.prepare(keyspace).getType());
@@ -244,7 +248,7 @@
             case KEYSPACE:
                 return Schema.instance.getKeyspaces().contains(keyspace);
             case FUNCTION:
-                return Functions.find(getFunctionName(), argTypes) != null;
+                return Schema.instance.findFunction(getFunctionName(), argTypes).isPresent();
         }
         throw new AssertionError();
     }
@@ -258,9 +262,9 @@
                 return COLLECTION_LEVEL_PERMISSIONS;
             case FUNCTION:
             {
-                Function function = Functions.find(getFunctionName(), argTypes);
-                assert function != null : "Unable to find function object for resource " + toString();
-                return function.isAggregate() ? AGGREGATE_FUNCTION_PERMISSIONS : SCALAR_FUNCTION_PERMISSIONS;
+                Optional<Function> function = Schema.instance.findFunction(getFunctionName(), argTypes);
+                assert function.isPresent() : "Unable to find function object for resource " + toString();
+                return function.get().isAggregate() ? AGGREGATE_FUNCTION_PERMISSIONS : SCALAR_FUNCTION_PERMISSIONS;
             }
         }
         throw new AssertionError();

diff --git a/src/java/org/apache/cassandra/auth/IAuthenticator.java b/src/java/org/apache/cassandra/auth/IAuthenticator.java
index 24792f6..ccbdb75 100644
--- a/src/java/org/apache/cassandra/auth/IAuthenticator.java
+++ b/src/java/org/apache/cassandra/auth/IAuthenticator.java

@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.auth;
 
+import java.net.InetAddress;
 import java.util.Map;
 import java.util.Set;
 
@@ -56,10 +57,12 @@
      * Provide a SASL handler to perform authentication for an single connection. SASL
      * is a stateful protocol, so a new instance must be used for each authentication
      * attempt.
+     * @param clientAddress the IP address of the client whom we wish to authenticate, or null
+     *                      if an internal client (one not connected over the remote transport).
      * @return org.apache.cassandra.auth.IAuthenticator.SaslNegotiator implementation
      * (see {@link org.apache.cassandra.auth.PasswordAuthenticator.PlainTextSaslAuthenticator})
      */
-    SaslNegotiator newSaslNegotiator();
+    SaslNegotiator newSaslNegotiator(InetAddress clientAddress);
 
     /**
      * For implementations which support the Thrift login method that accepts arbitrary

diff --git a/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java b/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java
index b7250a8..602fea4 100644
--- a/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java
+++ b/src/java/org/apache/cassandra/auth/PasswordAuthenticator.java

@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.auth;
 
+import java.net.InetAddress;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Map;
@@ -61,7 +62,7 @@
     public static final String USERNAME_KEY = "username";
     public static final String PASSWORD_KEY = "password";
 
-    private static final byte NUL = 0;
+    static final byte NUL = 0;
     private SelectStatement authenticateStatement;
 
     public static final String LEGACY_CREDENTIALS_TABLE = "credentials";
@@ -73,6 +74,20 @@
         return true;
     }
 
+    protected static boolean checkpw(String password, String hash)
+    {
+        try
+        {
+            return BCrypt.checkpw(password, hash);
+        }
+        catch (Exception e)
+        {
+            // Improperly formatted hashes may cause BCrypt.checkpw to throw, so trap any other exception as a failure
+            logger.warn("Error: invalid password hash encountered, rejecting user", e);
+            return false;
+        }
+    }
+
     private AuthenticatedUser authenticate(String username, String password) throws AuthenticationException
     {
         try
@@ -148,7 +163,7 @@
         return authenticate(username, password);
     }
 
-    public SaslNegotiator newSaslNegotiator()
+    public SaslNegotiator newSaslNegotiator(InetAddress clientAddress)
     {
         return new PlainTextSaslAuthenticator();
     }
@@ -161,7 +176,7 @@
                                                                                                 Lists.newArrayList(ByteBufferUtil.bytes(username))));
         UntypedResultSet result = UntypedResultSet.create(rows.result);
 
-        if ((result.isEmpty() || !result.one().has(SALTED_HASH)) || !BCrypt.checkpw(password, result.one().getString(SALTED_HASH)))
+        if ((result.isEmpty() || !result.one().has(SALTED_HASH)) || !checkpw(password, result.one().getString(SALTED_HASH)))
             throw new AuthenticationException("Username and/or password are incorrect");
 
         return new AuthenticatedUser(username);
@@ -216,7 +231,7 @@
             byte[] user = null;
             byte[] pass = null;
             int end = bytes.length;
-            for (int i = bytes.length - 1 ; i >= 0; i--)
+            for (int i = bytes.length - 1; i >= 0; i--)
             {
                 if (bytes[i] == NUL)
                 {
@@ -224,13 +239,16 @@
                         pass = Arrays.copyOfRange(bytes, i + 1, end);
                     else if (user == null)
                         user = Arrays.copyOfRange(bytes, i + 1, end);
+                    else
+                        throw new AuthenticationException("Credential format error: username or password is empty or contains NUL(\\0) character");
+
                     end = i;
                 }
             }
 
-            if (user == null)
+            if (user == null || user.length == 0)
                 throw new AuthenticationException("Authentication ID must not be null");
-            if (pass == null)
+            if (pass == null || pass.length == 0)
                 throw new AuthenticationException("Password must not be null");
 
             username = new String(user, StandardCharsets.UTF_8);

diff --git a/src/java/org/apache/cassandra/auth/RoleResource.java b/src/java/org/apache/cassandra/auth/RoleResource.java
index e994233..89665f4 100644
--- a/src/java/org/apache/cassandra/auth/RoleResource.java
+++ b/src/java/org/apache/cassandra/auth/RoleResource.java

@@ -96,9 +96,9 @@
      */
     public static RoleResource fromName(String name)
     {
-        String[] parts = StringUtils.split(name, '/');
+        String[] parts = StringUtils.split(name, "/", 2);
 
-        if (!parts[0].equals(ROOT_NAME) || parts.length > 2)
+        if (!parts[0].equals(ROOT_NAME))
             throw new IllegalArgumentException(String.format("%s is not a valid role resource name", name));
 
         if (parts.length == 1)

diff --git a/src/java/org/apache/cassandra/auth/RolesCache.java b/src/java/org/apache/cassandra/auth/RolesCache.java
index c781ee0..a8dae21 100644
--- a/src/java/org/apache/cassandra/auth/RolesCache.java
+++ b/src/java/org/apache/cassandra/auth/RolesCache.java

@@ -103,7 +103,7 @@
 
     private LoadingCache<RoleResource, Set<RoleResource>> initCache(LoadingCache<RoleResource, Set<RoleResource>> existing)
     {
-        if (DatabaseDescriptor.getAuthenticator() instanceof AllowAllAuthenticator)
+        if (!DatabaseDescriptor.getAuthenticator().requireAuthentication())
             return null;
 
         if (DatabaseDescriptor.getRolesValidity() <= 0)

diff --git a/src/java/org/apache/cassandra/batchlog/Batch.java b/src/java/org/apache/cassandra/batchlog/Batch.java
new file mode 100644
index 0000000..e91e3ca
--- /dev/null
+++ b/src/java/org/apache/cassandra/batchlog/Batch.java

@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDSerializer;
+
+import static org.apache.cassandra.db.TypeSizes.sizeof;
+import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
+
+public final class Batch
+{
+    public static final Serializer serializer = new Serializer();
+
+    public final UUID id;
+    public final long creationTime; // time of batch creation (in microseconds)
+
+    // one of these will always be empty
+    final Collection<Mutation> decodedMutations;
+    final Collection<ByteBuffer> encodedMutations;
+
+    private Batch(UUID id, long creationTime, Collection<Mutation> decodedMutations, Collection<ByteBuffer> encodedMutations)
+    {
+        this.id = id;
+        this.creationTime = creationTime;
+
+        this.decodedMutations = decodedMutations;
+        this.encodedMutations = encodedMutations;
+    }
+
+    /**
+     * Creates a 'local' batch - with all enclosed mutations in decoded form (as Mutation instances)
+     */
+    public static Batch createLocal(UUID id, long creationTime, Collection<Mutation> mutations)
+    {
+        return new Batch(id, creationTime, mutations, Collections.emptyList());
+    }
+
+    /**
+     * Creates a 'remote' batch - with all enclosed mutations in encoded form (as ByteBuffer instances)
+     *
+     * The mutations will always be encoded using the current messaging version.
+     */
+    public static Batch createRemote(UUID id, long creationTime, Collection<ByteBuffer> mutations)
+    {
+        return new Batch(id, creationTime, Collections.<Mutation>emptyList(), mutations);
+    }
+
+    /**
+     * Count of the mutations in the batch.
+     */
+    public int size()
+    {
+        return decodedMutations.size() + encodedMutations.size();
+    }
+
+    static final class Serializer implements IVersionedSerializer<Batch>
+    {
+        public long serializedSize(Batch batch, int version)
+        {
+            assert batch.encodedMutations.isEmpty() : "attempted to serialize a 'remote' batch";
+
+            long size = UUIDSerializer.serializer.serializedSize(batch.id, version);
+            size += sizeof(batch.creationTime);
+
+            size += sizeofUnsignedVInt(batch.decodedMutations.size());
+            for (Mutation mutation : batch.decodedMutations)
+            {
+                int mutationSize = (int) Mutation.serializer.serializedSize(mutation, version);
+                size += sizeofUnsignedVInt(mutationSize);
+                size += mutationSize;
+            }
+
+            return size;
+        }
+
+        public void serialize(Batch batch, DataOutputPlus out, int version) throws IOException
+        {
+            assert batch.encodedMutations.isEmpty() : "attempted to serialize a 'remote' batch";
+
+            UUIDSerializer.serializer.serialize(batch.id, out, version);
+            out.writeLong(batch.creationTime);
+
+            out.writeUnsignedVInt(batch.decodedMutations.size());
+            for (Mutation mutation : batch.decodedMutations)
+            {
+                out.writeUnsignedVInt(Mutation.serializer.serializedSize(mutation, version));
+                Mutation.serializer.serialize(mutation, out, version);
+            }
+        }
+
+        public Batch deserialize(DataInputPlus in, int version) throws IOException
+        {
+            UUID id = UUIDSerializer.serializer.deserialize(in, version);
+            long creationTime = in.readLong();
+
+            /*
+             * If version doesn't match the current one, we cannot not just read the encoded mutations verbatim,
+             * so we decode them instead, to deal with compatibility.
+             */
+            return version == MessagingService.current_version
+                 ? createRemote(id, creationTime, readEncodedMutations(in))
+                 : createLocal(id, creationTime, decodeMutations(in, version));
+        }
+
+        private static Collection<ByteBuffer> readEncodedMutations(DataInputPlus in) throws IOException
+        {
+            int count = (int) in.readUnsignedVInt();
+
+            ArrayList<ByteBuffer> mutations = new ArrayList<>(count);
+            for (int i = 0; i < count; i++)
+                mutations.add(ByteBufferUtil.readWithVIntLength(in));
+
+            return mutations;
+        }
+
+        private static Collection<Mutation> decodeMutations(DataInputPlus in, int version) throws IOException
+        {
+            int count = (int) in.readUnsignedVInt();
+
+            ArrayList<Mutation> mutations = new ArrayList<>(count);
+            for (int i = 0; i < count; i++)
+            {
+                in.readUnsignedVInt(); // skip mutation size
+                mutations.add(Mutation.serializer.deserialize(in, version));
+            }
+
+            return mutations;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/batchlog/BatchRemoveVerbHandler.java b/src/java/org/apache/cassandra/batchlog/BatchRemoveVerbHandler.java
new file mode 100644
index 0000000..3c3fcec
--- /dev/null
+++ b/src/java/org/apache/cassandra/batchlog/BatchRemoveVerbHandler.java

@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.util.UUID;
+
+import org.apache.cassandra.net.IVerbHandler;
+import org.apache.cassandra.net.MessageIn;
+
+public final class BatchRemoveVerbHandler implements IVerbHandler<UUID>
+{
+    public void doVerb(MessageIn<UUID> message, int id)
+    {
+        BatchlogManager.remove(message.payload);
+    }
+}

diff --git a/src/java/org/apache/cassandra/batchlog/BatchStoreVerbHandler.java b/src/java/org/apache/cassandra/batchlog/BatchStoreVerbHandler.java
new file mode 100644
index 0000000..4bc878c
--- /dev/null
+++ b/src/java/org/apache/cassandra/batchlog/BatchStoreVerbHandler.java

@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import org.apache.cassandra.db.WriteResponse;
+import org.apache.cassandra.net.IVerbHandler;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessagingService;
+
+public final class BatchStoreVerbHandler implements IVerbHandler<Batch>
+{
+    public void doVerb(MessageIn<Batch> message, int id)
+    {
+        BatchlogManager.store(message.payload);
+        MessagingService.instance().sendReply(WriteResponse.createMessage(), id, message.from);
+    }
+}

diff --git a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
new file mode 100644
index 0000000..71d60e7
--- /dev/null
+++ b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java

@@ -0,0 +1,591 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
+import com.google.common.util.concurrent.RateLimiter;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.WriteType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.WriteFailureException;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.hints.Hint;
+import org.apache.cassandra.hints.HintsService;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.WriteResponseHandler;
+import org.apache.cassandra.utils.ExecutorUtils;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MBeanWrapper;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static com.google.common.collect.Iterables.transform;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternalWithPaging;
+
+public class BatchlogManager implements BatchlogManagerMBean
+{
+    public static final String MBEAN_NAME = "org.apache.cassandra.db:type=BatchlogManager";
+    private static final long REPLAY_INTERVAL = 10 * 1000; // milliseconds
+    static final int DEFAULT_PAGE_SIZE = 128;
+
+    private static final Logger logger = LoggerFactory.getLogger(BatchlogManager.class);
+    public static final BatchlogManager instance = new BatchlogManager();
+    public static final long BATCHLOG_REPLAY_TIMEOUT = Long.getLong("cassandra.batchlog.replay_timeout_in_ms", DatabaseDescriptor.getWriteRpcTimeout() * 2);
+
+    private volatile long totalBatchesReplayed = 0; // no concurrency protection necessary as only written by replay thread.
+    private volatile UUID lastReplayedUuid = UUIDGen.minTimeUUID(0);
+
+    // Single-thread executor service for scheduling and serializing log replay.
+    private final ScheduledExecutorService batchlogTasks;
+
+    public BatchlogManager()
+    {
+        ScheduledThreadPoolExecutor executor = new DebuggableScheduledThreadPoolExecutor("BatchlogTasks");
+        executor.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);
+        batchlogTasks = executor;
+    }
+
+    public void start()
+    {
+        MBeanWrapper.instance.registerMBean(this, MBEAN_NAME);
+
+        batchlogTasks.scheduleWithFixedDelay(this::replayFailedBatches,
+                                             StorageService.RING_DELAY,
+                                             REPLAY_INTERVAL,
+                                             TimeUnit.MILLISECONDS);
+    }
+
+    public void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
+    {
+        ExecutorUtils.shutdownAndWait(timeout, unit, batchlogTasks);
+    }
+
+    public static void remove(UUID id)
+    {
+        new Mutation(PartitionUpdate.fullPartitionDelete(SystemKeyspace.Batches,
+                                                         UUIDType.instance.decompose(id),
+                                                         FBUtilities.timestampMicros(),
+                                                         FBUtilities.nowInSeconds()))
+            .apply();
+    }
+
+    public static void store(Batch batch)
+    {
+        store(batch, true);
+    }
+
+    public static void store(Batch batch, boolean durableWrites)
+    {
+        RowUpdateBuilder builder =
+            new RowUpdateBuilder(SystemKeyspace.Batches, batch.creationTime, batch.id)
+                .clustering()
+                .add("version", MessagingService.current_version);
+
+        for (ByteBuffer mutation : batch.encodedMutations)
+            builder.addListEntry("mutations", mutation);
+
+        for (Mutation mutation : batch.decodedMutations)
+        {
+            try (DataOutputBuffer buffer = new DataOutputBuffer())
+            {
+                Mutation.serializer.serialize(mutation, buffer, MessagingService.current_version);
+                builder.addListEntry("mutations", buffer.buffer());
+            }
+            catch (IOException e)
+            {
+                // shouldn't happen
+                throw new AssertionError(e);
+            }
+        }
+
+        builder.build().apply(durableWrites);
+    }
+
+    @VisibleForTesting
+    public int countAllBatches()
+    {
+        String query = String.format("SELECT count(*) FROM %s.%s", SystemKeyspace.NAME, SystemKeyspace.BATCHES);
+        UntypedResultSet results = executeInternal(query);
+        if (results == null || results.isEmpty())
+            return 0;
+
+        return (int) results.one().getLong("count");
+    }
+
+    public long getTotalBatchesReplayed()
+    {
+        return totalBatchesReplayed;
+    }
+
+    public void forceBatchlogReplay() throws Exception
+    {
+        startBatchlogReplay().get();
+    }
+
+    public Future<?> startBatchlogReplay()
+    {
+        // If a replay is already in progress this request will be executed after it completes.
+        return batchlogTasks.submit(this::replayFailedBatches);
+    }
+
+    void performInitialReplay() throws InterruptedException, ExecutionException
+    {
+        // Invokes initial replay. Used for testing only.
+        batchlogTasks.submit(this::replayFailedBatches).get();
+    }
+
+    private void replayFailedBatches()
+    {
+        logger.trace("Started replayFailedBatches");
+
+        // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
+        // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
+        int endpointsCount = StorageService.instance.getTokenMetadata().getAllEndpoints().size();
+        if (endpointsCount <= 0)
+        {
+            logger.trace("Replay cancelled as there are no peers in the ring.");
+            return;
+        }
+        int throttleInKB = DatabaseDescriptor.getBatchlogReplayThrottleInKB() / endpointsCount;
+        RateLimiter rateLimiter = RateLimiter.create(throttleInKB == 0 ? Double.MAX_VALUE : throttleInKB * 1024);
+
+        UUID limitUuid = UUIDGen.maxTimeUUID(System.currentTimeMillis() - getBatchlogTimeout());
+        ColumnFamilyStore store = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES);
+        int pageSize = calculatePageSize(store);
+        // There cannot be any live content where token(id) <= token(lastReplayedUuid) as every processed batch is
+        // deleted, but the tombstoned content may still be present in the tables. To avoid walking over it we specify
+        // token(id) > token(lastReplayedUuid) as part of the query.
+        String query = String.format("SELECT id, mutations, version FROM %s.%s WHERE token(id) > token(?) AND token(id) <= token(?)",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.BATCHES);
+        UntypedResultSet batches = executeInternalWithPaging(query, pageSize, lastReplayedUuid, limitUuid);
+        processBatchlogEntries(batches, pageSize, rateLimiter);
+        lastReplayedUuid = limitUuid;
+        logger.trace("Finished replayFailedBatches");
+    }
+
+    // read less rows (batches) per page if they are very large
+    static int calculatePageSize(ColumnFamilyStore store)
+    {
+        double averageRowSize = store.getMeanPartitionSize();
+        if (averageRowSize <= 0)
+            return DEFAULT_PAGE_SIZE;
+
+        return (int) Math.max(1, Math.min(DEFAULT_PAGE_SIZE, 4 * 1024 * 1024 / averageRowSize));
+    }
+
+    private void processBatchlogEntries(UntypedResultSet batches, int pageSize, RateLimiter rateLimiter)
+    {
+        int positionInPage = 0;
+        ArrayList<ReplayingBatch> unfinishedBatches = new ArrayList<>(pageSize);
+
+        Set<InetAddress> hintedNodes = new HashSet<>();
+        Set<UUID> replayedBatches = new HashSet<>();
+
+        // Sending out batches for replay without waiting for them, so that one stuck batch doesn't affect others
+        for (UntypedResultSet.Row row : batches)
+        {
+            UUID id = row.getUUID("id");
+            int version = row.getInt("version");
+            try
+            {
+                ReplayingBatch batch = new ReplayingBatch(id, version, row.getList("mutations", BytesType.instance));
+                if (batch.replay(rateLimiter, hintedNodes) > 0)
+                {
+                    unfinishedBatches.add(batch);
+                }
+                else
+                {
+                    remove(id); // no write mutations were sent (either expired or all CFs involved truncated).
+                    ++totalBatchesReplayed;
+                }
+            }
+            catch (IOException e)
+            {
+                logger.warn("Skipped batch replay of {} due to {}", id, e);
+                remove(id);
+            }
+
+            if (++positionInPage == pageSize)
+            {
+                // We have reached the end of a batch. To avoid keeping more than a page of mutations in memory,
+                // finish processing the page before requesting the next row.
+                finishAndClearBatches(unfinishedBatches, hintedNodes, replayedBatches);
+                positionInPage = 0;
+            }
+        }
+
+        finishAndClearBatches(unfinishedBatches, hintedNodes, replayedBatches);
+
+        // to preserve batch guarantees, we must ensure that hints (if any) have made it to disk, before deleting the batches
+        HintsService.instance.flushAndFsyncBlockingly(transform(hintedNodes, StorageService.instance::getHostIdForEndpoint));
+
+        // once all generated hints are fsynced, actually delete the batches
+        replayedBatches.forEach(BatchlogManager::remove);
+    }
+
+    private void finishAndClearBatches(ArrayList<ReplayingBatch> batches, Set<InetAddress> hintedNodes, Set<UUID> replayedBatches)
+    {
+        // schedule hints for timed out deliveries
+        for (ReplayingBatch batch : batches)
+        {
+            batch.finish(hintedNodes);
+            replayedBatches.add(batch.id);
+        }
+
+        totalBatchesReplayed += batches.size();
+        batches.clear();
+    }
+
+    public static long getBatchlogTimeout()
+    {
+        return BATCHLOG_REPLAY_TIMEOUT; // enough time for the actual write + BM removal mutation
+    }
+
+    private static class ReplayingBatch
+    {
+        private final UUID id;
+        private final long writtenAt;
+        private final List<Mutation> mutations;
+        private final int replayedBytes;
+
+        private List<ReplayWriteResponseHandler<Mutation>> replayHandlers;
+
+        ReplayingBatch(UUID id, int version, List<ByteBuffer> serializedMutations) throws IOException
+        {
+            this.id = id;
+            this.writtenAt = UUIDGen.unixTimestamp(id);
+            this.mutations = new ArrayList<>(serializedMutations.size());
+            this.replayedBytes = addMutations(version, serializedMutations);
+        }
+
+        public int replay(RateLimiter rateLimiter, Set<InetAddress> hintedNodes) throws IOException
+        {
+            logger.trace("Replaying batch {}", id);
+
+            if (mutations.isEmpty())
+                return 0;
+
+            int gcgs = gcgs(mutations);
+            if (TimeUnit.MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds())
+                return 0;
+
+            replayHandlers = sendReplays(mutations, writtenAt, hintedNodes);
+
+            rateLimiter.acquire(replayedBytes); // acquire afterwards, to not mess up ttl calculation.
+
+            return replayHandlers.size();
+        }
+
+        public void finish(Set<InetAddress> hintedNodes)
+        {
+            for (int i = 0; i < replayHandlers.size(); i++)
+            {
+                ReplayWriteResponseHandler<Mutation> handler = replayHandlers.get(i);
+                try
+                {
+                    handler.get();
+                }
+                catch (WriteTimeoutException|WriteFailureException e)
+                {
+                    logger.trace("Failed replaying a batched mutation to a node, will write a hint");
+                    logger.trace("Failure was : {}", e.getMessage());
+                    // writing hints for the rest to hints, starting from i
+                    writeHintsForUndeliveredEndpoints(i, hintedNodes);
+                    return;
+                }
+            }
+        }
+
+        private int addMutations(int version, List<ByteBuffer> serializedMutations) throws IOException
+        {
+            int ret = 0;
+            for (ByteBuffer serializedMutation : serializedMutations)
+            {
+                ret += serializedMutation.remaining();
+                try (DataInputBuffer in = new DataInputBuffer(serializedMutation, true))
+                {
+                    addMutation(Mutation.serializer.deserialize(in, version));
+                }
+            }
+
+            return ret;
+        }
+
+        // Remove CFs that have been truncated since. writtenAt and SystemTable#getTruncatedAt() both return millis.
+        // We don't abort the replay entirely b/c this can be considered a success (truncated is same as delivered then
+        // truncated.
+        private void addMutation(Mutation mutation)
+        {
+            for (UUID cfId : mutation.getColumnFamilyIds())
+                if (writtenAt <= SystemKeyspace.getTruncatedAt(cfId))
+                    mutation = mutation.without(cfId);
+
+            if (!mutation.isEmpty())
+                mutations.add(mutation);
+        }
+
+        private void writeHintsForUndeliveredEndpoints(int startFrom, Set<InetAddress> hintedNodes)
+        {
+            int gcgs = gcgs(mutations);
+
+            // expired
+            if (TimeUnit.MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds())
+                return;
+
+            for (int i = startFrom; i < replayHandlers.size(); i++)
+            {
+                ReplayWriteResponseHandler<Mutation> handler = replayHandlers.get(i);
+                Mutation undeliveredMutation = mutations.get(i);
+
+                if (handler != null)
+                {
+                    hintedNodes.addAll(handler.undelivered);
+                    HintsService.instance.write(transform(handler.undelivered, StorageService.instance::getHostIdForEndpoint),
+                                                Hint.create(undeliveredMutation, writtenAt));
+                }
+            }
+        }
+
+        private static List<ReplayWriteResponseHandler<Mutation>> sendReplays(List<Mutation> mutations,
+                                                                              long writtenAt,
+                                                                              Set<InetAddress> hintedNodes)
+        {
+            List<ReplayWriteResponseHandler<Mutation>> handlers = new ArrayList<>(mutations.size());
+            for (Mutation mutation : mutations)
+            {
+                ReplayWriteResponseHandler<Mutation> handler = sendSingleReplayMutation(mutation, writtenAt, hintedNodes);
+                if (handler != null)
+                    handlers.add(handler);
+            }
+            return handlers;
+        }
+
+        /**
+         * We try to deliver the mutations to the replicas ourselves if they are alive and only resort to writing hints
+         * when a replica is down or a write request times out.
+         *
+         * @return direct delivery handler to wait on or null, if no live nodes found
+         */
+        private static ReplayWriteResponseHandler<Mutation> sendSingleReplayMutation(final Mutation mutation,
+                                                                                     long writtenAt,
+                                                                                     Set<InetAddress> hintedNodes)
+        {
+            Set<InetAddress> liveEndpoints = new HashSet<>();
+            String ks = mutation.getKeyspaceName();
+            Token tk = mutation.key().getToken();
+
+            for (InetAddress endpoint : StorageService.instance.getNaturalAndPendingEndpoints(ks, tk))
+            {
+                if (endpoint.equals(FBUtilities.getBroadcastAddress()))
+                {
+                    mutation.apply();
+                }
+                else if (FailureDetector.instance.isAlive(endpoint))
+                {
+                    liveEndpoints.add(endpoint); // will try delivering directly instead of writing a hint.
+                }
+                else
+                {
+                    hintedNodes.add(endpoint);
+                    HintsService.instance.write(StorageService.instance.getHostIdForEndpoint(endpoint),
+                                                Hint.create(mutation, writtenAt));
+                }
+            }
+
+            if (liveEndpoints.isEmpty())
+                return null;
+
+            ReplayWriteResponseHandler<Mutation> handler = new ReplayWriteResponseHandler<>(liveEndpoints);
+            MessageOut<Mutation> message = mutation.createMessage();
+            for (InetAddress endpoint : liveEndpoints)
+                MessagingService.instance().sendRR(message, endpoint, handler, false);
+            return handler;
+        }
+
+        private static int gcgs(Collection<Mutation> mutations)
+        {
+            int gcgs = Integer.MAX_VALUE;
+            for (Mutation mutation : mutations)
+                gcgs = Math.min(gcgs, mutation.smallestGCGS());
+            return gcgs;
+        }
+
+        /**
+         * A wrapper of WriteResponseHandler that stores the addresses of the endpoints from
+         * which we did not receive a successful reply.
+         */
+        private static class ReplayWriteResponseHandler<T> extends WriteResponseHandler<T>
+        {
+            private final Set<InetAddress> undelivered = Collections.newSetFromMap(new ConcurrentHashMap<>());
+
+            ReplayWriteResponseHandler(Collection<InetAddress> writeEndpoints)
+            {
+                super(writeEndpoints, Collections.<InetAddress>emptySet(), null, null, null, WriteType.UNLOGGED_BATCH);
+                undelivered.addAll(writeEndpoints);
+            }
+
+            @Override
+            protected int totalBlockFor()
+            {
+                return this.naturalEndpoints.size();
+            }
+
+            @Override
+            public void response(MessageIn<T> m)
+            {
+                boolean removed = undelivered.remove(m == null ? FBUtilities.getBroadcastAddress() : m.from);
+                assert removed;
+                super.response(m);
+            }
+        }
+    }
+
+    public static class EndpointFilter
+    {
+        private final String localRack;
+        private final Multimap<String, InetAddress> endpoints;
+
+        public EndpointFilter(String localRack, Multimap<String, InetAddress> endpoints)
+        {
+            this.localRack = localRack;
+            this.endpoints = endpoints;
+        }
+
+        /**
+         * @return list of candidates for batchlog hosting. If possible these will be two nodes from different racks.
+         */
+        public Collection<InetAddress> filter()
+        {
+            // special case for single-node data centers
+            if (endpoints.values().size() == 1)
+                return endpoints.values();
+
+            // strip out dead endpoints and localhost
+            ListMultimap<String, InetAddress> validated = ArrayListMultimap.create();
+            for (Map.Entry<String, InetAddress> entry : endpoints.entries())
+                if (isValid(entry.getValue()))
+                    validated.put(entry.getKey(), entry.getValue());
+
+            if (validated.size() <= 2)
+                return validated.values();
+
+            if (validated.size() - validated.get(localRack).size() >= 2)
+            {
+                // we have enough endpoints in other racks
+                validated.removeAll(localRack);
+            }
+
+            if (validated.keySet().size() == 1)
+            {
+                /*
+                 * we have only 1 `other` rack to select replicas from (whether it be the local rack or a single non-local rack)
+                 * pick two random nodes from there; we are guaranteed to have at least two nodes in the single remaining rack
+                 * because of the preceding if block.
+                 */
+                List<InetAddress> otherRack = Lists.newArrayList(validated.values());
+                shuffle(otherRack);
+                return otherRack.subList(0, 2);
+            }
+
+            // randomize which racks we pick from if more than 2 remaining
+            Collection<String> racks;
+            if (validated.keySet().size() == 2)
+            {
+                racks = validated.keySet();
+            }
+            else
+            {
+                racks = Lists.newArrayList(validated.keySet());
+                shuffle((List<String>) racks);
+            }
+
+            // grab a random member of up to two racks
+            List<InetAddress> result = new ArrayList<>(2);
+            for (String rack : Iterables.limit(racks, 2))
+            {
+                List<InetAddress> rackMembers = validated.get(rack);
+                result.add(rackMembers.get(getRandomInt(rackMembers.size())));
+            }
+
+            return result;
+        }
+
+        @VisibleForTesting
+        protected boolean isValid(InetAddress input)
+        {
+            return !input.equals(FBUtilities.getBroadcastAddress()) && FailureDetector.instance.isAlive(input);
+        }
+
+        @VisibleForTesting
+        protected int getRandomInt(int bound)
+        {
+            return ThreadLocalRandom.current().nextInt(bound);
+        }
+
+        @VisibleForTesting
+        protected void shuffle(List<?> list)
+        {
+            Collections.shuffle(list);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/batchlog/BatchlogManagerMBean.java b/src/java/org/apache/cassandra/batchlog/BatchlogManagerMBean.java
new file mode 100644
index 0000000..371b6a8
--- /dev/null
+++ b/src/java/org/apache/cassandra/batchlog/BatchlogManagerMBean.java

@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+public interface BatchlogManagerMBean
+{
+    /**
+     * Counts all batches currently in the batchlog.
+     *
+     * @return total batch count
+     */
+    public int countAllBatches();
+
+    /**
+     * @return total count of batches replayed since node start
+     */
+    public long getTotalBatchesReplayed();
+
+    /**
+     * Forces batchlog replay. Blocks until completion.
+     */
+    public void forceBatchlogReplay() throws Exception;
+}

diff --git a/src/java/org/apache/cassandra/batchlog/LegacyBatchlogMigrator.java b/src/java/org/apache/cassandra/batchlog/LegacyBatchlogMigrator.java
new file mode 100644
index 0000000..dd19f19
--- /dev/null
+++ b/src/java/org/apache/cassandra/batchlog/LegacyBatchlogMigrator.java

@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.exceptions.WriteFailureException;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.AbstractWriteResponseHandler;
+import org.apache.cassandra.service.WriteResponseHandler;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+public final class LegacyBatchlogMigrator
+{
+    private static final Logger logger = LoggerFactory.getLogger(LegacyBatchlogMigrator.class);
+
+    private LegacyBatchlogMigrator()
+    {
+        // static class
+    }
+
+    @SuppressWarnings("deprecation")
+    public static void migrate()
+    {
+        ColumnFamilyStore store = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.LEGACY_BATCHLOG);
+
+        // nothing to migrate
+        if (store.isEmpty())
+            return;
+
+        logger.info("Migrating legacy batchlog to new storage");
+
+        int convertedBatches = 0;
+        String query = String.format("SELECT id, data, written_at, version FROM %s.%s",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.LEGACY_BATCHLOG);
+
+        int pageSize = BatchlogManager.calculatePageSize(store);
+
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, pageSize);
+        for (UntypedResultSet.Row row : rows)
+        {
+            if (apply(row, convertedBatches))
+                convertedBatches++;
+        }
+
+        if (convertedBatches > 0)
+            Keyspace.openAndGetStore(SystemKeyspace.LegacyBatchlog).truncateBlocking();
+    }
+
+    @SuppressWarnings("deprecation")
+    public static boolean isLegacyBatchlogMutation(Mutation mutation)
+    {
+        return mutation.getKeyspaceName().equals(SystemKeyspace.NAME)
+            && mutation.getPartitionUpdate(SystemKeyspace.LegacyBatchlog.cfId) != null;
+    }
+
+    @SuppressWarnings("deprecation")
+    public static void handleLegacyMutation(Mutation mutation)
+    {
+        PartitionUpdate update = mutation.getPartitionUpdate(SystemKeyspace.LegacyBatchlog.cfId);
+        logger.trace("Applying legacy batchlog mutation {}", update);
+        update.forEach(row -> apply(UntypedResultSet.Row.fromInternalRow(update.metadata(), update.partitionKey(), row), -1));
+    }
+
+    private static boolean apply(UntypedResultSet.Row row, long counter)
+    {
+        UUID id = row.getUUID("id");
+        long timestamp = id.version() == 1 ? UUIDGen.unixTimestamp(id) : row.getLong("written_at");
+        int version = row.has("version") ? row.getInt("version") : MessagingService.VERSION_12;
+
+        if (id.version() != 1)
+            id = UUIDGen.getTimeUUID(timestamp, counter);
+
+        logger.trace("Converting mutation at {}", timestamp);
+
+        try (DataInputBuffer in = new DataInputBuffer(row.getBytes("data"), false))
+        {
+            int numMutations = in.readInt();
+            List<Mutation> mutations = new ArrayList<>(numMutations);
+            for (int i = 0; i < numMutations; i++)
+                mutations.add(Mutation.serializer.deserialize(in, version));
+
+            BatchlogManager.store(Batch.createLocal(id, TimeUnit.MILLISECONDS.toMicros(timestamp), mutations));
+            return true;
+        }
+        catch (Throwable t)
+        {
+            logger.error("Failed to convert mutation {} at timestamp {}", id, timestamp, t);
+            return false;
+        }
+    }
+
+    public static void syncWriteToBatchlog(WriteResponseHandler<?> handler, Batch batch, Collection<InetAddress> endpoints)
+    throws WriteTimeoutException, WriteFailureException
+    {
+        for (InetAddress target : endpoints)
+        {
+            logger.trace("Sending legacy batchlog store request {} to {} for {} mutations", batch.id, target, batch.size());
+
+            int targetVersion = MessagingService.instance().getVersion(target);
+            MessagingService.instance().sendRR(getStoreMutation(batch, targetVersion).createMessage(MessagingService.Verb.MUTATION),
+                                               target,
+                                               handler,
+                                               false);
+        }
+    }
+
+    public static void asyncRemoveFromBatchlog(Collection<InetAddress> endpoints, UUID uuid)
+    {
+        AbstractWriteResponseHandler<IMutation> handler = new WriteResponseHandler<>(endpoints,
+                                                                                     Collections.<InetAddress>emptyList(),
+                                                                                     ConsistencyLevel.ANY,
+                                                                                     Keyspace.open(SystemKeyspace.NAME),
+                                                                                     null,
+                                                                                     WriteType.SIMPLE);
+        Mutation mutation = getRemoveMutation(uuid);
+
+        for (InetAddress target : endpoints)
+        {
+            logger.trace("Sending legacy batchlog remove request {} to {}", uuid, target);
+            MessagingService.instance().sendRR(mutation.createMessage(MessagingService.Verb.MUTATION), target, handler, false);
+        }
+    }
+
+    static void store(Batch batch, int version)
+    {
+        getStoreMutation(batch, version).apply();
+    }
+
+    @SuppressWarnings("deprecation")
+    static Mutation getStoreMutation(Batch batch, int version)
+    {
+        return new RowUpdateBuilder(SystemKeyspace.LegacyBatchlog, batch.creationTime, batch.id)
+               .clustering()
+               .add("written_at", new Date(batch.creationTime / 1000))
+               .add("data", getSerializedMutations(version, batch.decodedMutations))
+               .add("version", version)
+               .build();
+    }
+
+    @SuppressWarnings("deprecation")
+    private static Mutation getRemoveMutation(UUID uuid)
+    {
+        return new Mutation(PartitionUpdate.fullPartitionDelete(SystemKeyspace.LegacyBatchlog,
+                                                                UUIDType.instance.decompose(uuid),
+                                                                FBUtilities.timestampMicros(),
+                                                                FBUtilities.nowInSeconds()));
+    }
+
+    private static ByteBuffer getSerializedMutations(int version, Collection<Mutation> mutations)
+    {
+        try (DataOutputBuffer buf = new DataOutputBuffer())
+        {
+            buf.writeInt(mutations.size());
+            for (Mutation mutation : mutations)
+                Mutation.serializer.serialize(mutation, buf, version);
+            return buf.buffer();
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cache/AutoSavingCache.java b/src/java/org/apache/cassandra/cache/AutoSavingCache.java
index 2c6820e..3da6352 100644
--- a/src/java/org/apache/cassandra/cache/AutoSavingCache.java
+++ b/src/java/org/apache/cassandra/cache/AutoSavingCache.java

@@ -42,10 +42,11 @@
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.io.util.ChecksummedRandomAccessReader.CorruptFileException;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.Pair;
@@ -75,14 +76,16 @@
      *
      * Since cache versions match exactly and there is no partial fallback just add
      * a minor version letter.
+     *
+     * Sticking with "d" is fine for 3.0 since it has never been released or used by another version
      */
-    private static final String CURRENT_VERSION = "ca";
+    private static final String CURRENT_VERSION = "d";
 
     private static volatile IStreamFactory streamFactory = new IStreamFactory()
     {
         public InputStream getInputStream(File dataPath, File crcPath) throws IOException
         {
-            return ChecksummedRandomAccessReader.open(dataPath, crcPath);
+            return new ChecksummedRandomAccessReader.Builder(dataPath, crcPath).build();
         }
 
         public OutputStream getOutputStream(File dataPath, File crcPath)
@@ -166,7 +169,7 @@
                             cacheType);
                 es.shutdown();
             }
-        }, MoreExecutors.sameThreadExecutor());
+        }, MoreExecutors.directExecutor());
 
         return cacheLoad;
     }
@@ -181,11 +184,11 @@
         File crcPath = getCacheCrcPath(CURRENT_VERSION);
         if (dataPath.exists() && crcPath.exists())
         {
-            DataInputStream in = null;
+            DataInputStreamPlus in = null;
             try
             {
                 logger.info(String.format("reading saved cache %s", dataPath));
-                in = new DataInputStream(new LengthAvailableInputStream(new BufferedInputStream(streamFactory.getInputStream(dataPath, crcPath)), dataPath.length()));
+                in = new DataInputStreamPlus(new LengthAvailableInputStream(new BufferedInputStream(streamFactory.getInputStream(dataPath, crcPath)), dataPath.length()));
 
                 //Check the schema has not changed since CFs are looked up by name which is ambiguous
                 UUID schemaVersion = new UUID(in.readLong(), in.readLong());
@@ -298,11 +301,11 @@
             else
                 type = OperationType.UNKNOWN;
 
-            info = new CompactionInfo(CFMetaData.denseCFMetaData(SystemKeyspace.NAME, cacheType.toString(), BytesType.instance),
+            info = new CompactionInfo(CFMetaData.createFake(SystemKeyspace.NAME, cacheType.toString()),
                                       type,
                                       0,
                                       keysEstimate,
-                                      "keys",
+                                      Unit.KEYS,
                                       UUIDGen.getTimeUUID());
         }
 
@@ -318,7 +321,6 @@
             return info.forProgress(keysWritten, Math.max(keysWritten, keysEstimate));
         }
 
-        @SuppressWarnings("resource")
         public void saveCache()
         {
             logger.trace("Deleting old {} files.", cacheType);
@@ -332,56 +334,42 @@
 
             long start = System.nanoTime();
 
-            WrappedDataOutputStreamPlus writer = null;
             Pair<File, File> cacheFilePaths = tempCacheFiles();
-            try
+            try (WrappedDataOutputStreamPlus writer = new WrappedDataOutputStreamPlus(streamFactory.getOutputStream(cacheFilePaths.left, cacheFilePaths.right)))
             {
-                try
+
+                //Need to be able to check schema version because CF names are ambiguous
+                UUID schemaVersion = Schema.instance.getVersion();
+                if (schemaVersion == null)
                 {
-                    writer = new WrappedDataOutputStreamPlus(streamFactory.getOutputStream(cacheFilePaths.left, cacheFilePaths.right));
+                    Schema.instance.updateVersion();
+                    schemaVersion = Schema.instance.getVersion();
                 }
-                catch (FileNotFoundException e)
+                writer.writeLong(schemaVersion.getMostSignificantBits());
+                writer.writeLong(schemaVersion.getLeastSignificantBits());
+
+                while (keyIterator.hasNext())
                 {
-                    throw new RuntimeException(e);
+                    K key = keyIterator.next();
+
+                    ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreIncludingIndexes(key.ksAndCFName);
+                    if (cfs == null)
+                        continue; // the table or 2i has been dropped.
+
+                    cacheLoader.serialize(key, writer, cfs);
+
+                    keysWritten++;
+                    if (keysWritten >= keysEstimate)
+                        break;
                 }
-
-                try
-                {
-                    //Need to be able to check schema version because CF names are ambiguous
-                    UUID schemaVersion = Schema.instance.getVersion();
-                    if (schemaVersion == null)
-                    {
-                        Schema.instance.updateVersion();
-                        schemaVersion = Schema.instance.getVersion();
-                    }
-                    writer.writeLong(schemaVersion.getMostSignificantBits());
-                    writer.writeLong(schemaVersion.getLeastSignificantBits());
-
-                    while (keyIterator.hasNext())
-                    {
-                        K key = keyIterator.next();
-
-                        ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreIncludingIndexes(key.ksAndCFName);
-                        if (cfs == null)
-                            continue; // the table or 2i has been dropped.
-
-                        cacheLoader.serialize(key, writer, cfs);
-
-                        keysWritten++;
-                        if (keysWritten >= keysEstimate)
-                            break;
-                    }
-                }
-                catch (IOException e)
-                {
-                    throw new FSWriteError(e, cacheFilePaths.left);
-                }
-
             }
-            finally
+            catch (FileNotFoundException e)
             {
-                if (writer != null)
-                    FileUtils.closeQuietly(writer);
+                throw new RuntimeException(e);
+            }
+            catch (IOException e)
+            {
+                throw new FSWriteError(e, cacheFilePaths.left);
             }
 
             File cacheFile = getCacheDataPath(CURRENT_VERSION);
@@ -433,12 +421,17 @@
                 logger.warn("Could not list files in {}", savedCachesDir);
             }
         }
+
+        public boolean isGlobal()
+        {
+            return false;
+        }
     }
 
     public interface CacheSerializer<K extends CacheKey, V>
     {
         void serialize(K key, DataOutputPlus out, ColumnFamilyStore cfs) throws IOException;
 
-        Future<Pair<K, V>> deserialize(DataInputStream in, ColumnFamilyStore cfs) throws IOException;
+        Future<Pair<K, V>> deserialize(DataInputPlus in, ColumnFamilyStore cfs) throws IOException;
     }
 }

diff --git a/src/java/org/apache/cassandra/cache/CachingOptions.java b/src/java/org/apache/cassandra/cache/CachingOptions.java
deleted file mode 100644
index 1c82f55..0000000
--- a/src/java/org/apache/cassandra/cache/CachingOptions.java
+++ /dev/null

@@ -1,285 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cache;
-
-
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import org.apache.commons.lang3.StringUtils;
-
-import org.apache.cassandra.exceptions.ConfigurationException;
-import static org.apache.cassandra.utils.FBUtilities.fromJsonMap;
-
-/*
-CQL: { 'keys' : 'ALL|NONE', 'rows_per_partition': '200|NONE|ALL' }
- */
-public class CachingOptions
-{
-    public static final CachingOptions KEYS_ONLY = new CachingOptions(new KeyCache(KeyCache.Type.ALL), new RowCache(RowCache.Type.NONE));
-    public static final CachingOptions ALL = new CachingOptions(new KeyCache(KeyCache.Type.ALL), new RowCache(RowCache.Type.ALL));
-    public static final CachingOptions ROWS_ONLY = new CachingOptions(new KeyCache(KeyCache.Type.NONE), new RowCache(RowCache.Type.ALL));
-    public static final CachingOptions NONE = new CachingOptions(new KeyCache(KeyCache.Type.NONE), new RowCache(RowCache.Type.NONE));
-
-    public final KeyCache keyCache;
-    public final RowCache rowCache;
-    private static final Set<String> legacyOptions = new HashSet<>(Arrays.asList("ALL", "NONE", "KEYS_ONLY", "ROWS_ONLY"));
-
-    public CachingOptions(KeyCache kc, RowCache rc)
-    {
-        this.keyCache = kc;
-        this.rowCache = rc;
-    }
-
-    public static CachingOptions fromString(String cache) throws ConfigurationException
-    {
-        if (legacyOptions.contains(cache.toUpperCase()))
-            return fromLegacyOption(cache.toUpperCase());
-        return fromMap(fromJsonMap(cache));
-    }
-
-    public static CachingOptions fromMap(Map<String, String> cacheConfig) throws ConfigurationException
-    {
-        validateCacheConfig(cacheConfig);
-        if (!cacheConfig.containsKey("keys") && !cacheConfig.containsKey("rows_per_partition"))
-            return CachingOptions.NONE;
-        if (!cacheConfig.containsKey("keys"))
-            return new CachingOptions(new KeyCache(KeyCache.Type.NONE), RowCache.fromString(cacheConfig.get("rows_per_partition")));
-        if (!cacheConfig.containsKey("rows_per_partition"))
-            return CachingOptions.KEYS_ONLY;
-
-        return new CachingOptions(KeyCache.fromString(cacheConfig.get("keys")), RowCache.fromString(cacheConfig.get("rows_per_partition")));
-    }
-
-    private static void validateCacheConfig(Map<String, String> cacheConfig) throws ConfigurationException
-    {
-        for (Map.Entry<String, String> entry : cacheConfig.entrySet())
-        {
-            String value = entry.getValue().toUpperCase();
-            if (entry.getKey().equals("keys"))
-            {
-                if (!(value.equals("ALL") || value.equals("NONE")))
-                {
-                    throw new ConfigurationException("'keys' can only have values 'ALL' or 'NONE', but was '" + value + "'");
-                }
-            }
-            else if (entry.getKey().equals("rows_per_partition"))
-            {
-                if (!(value.equals("ALL") || value.equals("NONE") || StringUtils.isNumeric(value)))
-                {
-                    throw new ConfigurationException("'rows_per_partition' can only have values 'ALL', 'NONE' or be numeric, but was '" + value + "'.");
-                }
-            }
-            else
-                throw new ConfigurationException("Only supported CachingOptions parameters are 'keys' and 'rows_per_partition', but was '" + entry.getKey() + "'");
-        }
-    }
-
-    @Override
-    public String toString()
-    {
-        return String.format("{\"keys\":\"%s\", \"rows_per_partition\":\"%s\"}", keyCache.toString(), rowCache.toString());
-    }
-
-    private static CachingOptions fromLegacyOption(String cache)
-    {
-        if (cache.equals("ALL"))
-            return ALL;
-        if (cache.equals("KEYS_ONLY"))
-            return KEYS_ONLY;
-        if (cache.equals("ROWS_ONLY"))
-            return ROWS_ONLY;
-        return NONE;
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
-
-        CachingOptions o2 = (CachingOptions) o;
-
-        if (!keyCache.equals(o2.keyCache)) return false;
-        if (!rowCache.equals(o2.rowCache)) return false;
-
-        return true;
-    }
-
-    @Override
-    public int hashCode()
-    {
-        int result = keyCache.hashCode();
-        result = 31 * result + rowCache.hashCode();
-        return result;
-    }
-
-    // FIXME: move to ThriftConversion
-    public static CachingOptions fromThrift(String caching, String cellsPerRow) throws ConfigurationException
-    {
-
-        RowCache rc = new RowCache(RowCache.Type.NONE);
-        KeyCache kc = new KeyCache(KeyCache.Type.ALL);
-        // if we get a caching string from thrift it is legacy, "ALL", "KEYS_ONLY" etc, fromString handles those
-        if (caching != null)
-        {
-            CachingOptions givenOptions = CachingOptions.fromString(caching);
-            rc = givenOptions.rowCache;
-            kc = givenOptions.keyCache;
-        }
-        // if we get cells_per_row from thrift, it is either "ALL" or "<number of cells to cache>".
-        if (cellsPerRow != null && rc.isEnabled())
-            rc = RowCache.fromString(cellsPerRow);
-        return new CachingOptions(kc, rc);
-    }
-
-    // FIXME: move to ThriftConversion
-    public String toThriftCaching()
-    {
-        if (rowCache.isEnabled() && keyCache.isEnabled())
-            return "ALL";
-        if (rowCache.isEnabled())
-            return "ROWS_ONLY";
-        if (keyCache.isEnabled())
-            return "KEYS_ONLY";
-        return "NONE";
-    }
-
-    // FIXME: move to ThriftConversion
-    public String toThriftCellsPerRow()
-    {
-        if (rowCache.cacheFullPartitions())
-            return "ALL";
-        return String.valueOf(rowCache.rowsToCache);
-    }
-
-    public static class KeyCache
-    {
-        public final Type type;
-        public KeyCache(Type type)
-        {
-            this.type = type;
-        }
-
-        public enum Type
-        {
-            ALL, NONE
-        }
-        public static KeyCache fromString(String keyCache)
-        {
-            return new KeyCache(Type.valueOf(keyCache.toUpperCase()));
-        }
-
-        public boolean isEnabled()
-        {
-            return type == Type.ALL;
-        }
-
-        @Override
-        public boolean equals(Object o)
-        {
-            if (this == o) return true;
-            if (o == null || getClass() != o.getClass()) return false;
-
-            KeyCache keyCache = (KeyCache) o;
-
-            if (type != keyCache.type) return false;
-
-            return true;
-        }
-
-        @Override
-        public int hashCode()
-        {
-            return type.hashCode();
-        }
-        @Override
-        public String toString()
-        {
-            return type.toString();
-        }
-    }
-
-    public static class RowCache
-    {
-        public final Type type;
-        public final int rowsToCache;
-
-        public RowCache(Type type)
-        {
-            this(type, (type == Type.ALL) ? Integer.MAX_VALUE : 0);
-        }
-        public RowCache(Type type, int rowsToCache)
-        {
-            this.type = type;
-            this.rowsToCache = rowsToCache;
-        }
-
-        public enum Type
-        {
-            ALL, NONE, HEAD
-        }
-
-        public static RowCache fromString(String rowCache)
-        {
-            if (rowCache == null || rowCache.equalsIgnoreCase("none"))
-                return new RowCache(Type.NONE, 0);
-            else if (rowCache.equalsIgnoreCase("all"))
-                return new RowCache(Type.ALL, Integer.MAX_VALUE);
-            return new RowCache(Type.HEAD, Integer.parseInt(rowCache));
-        }
-        public boolean isEnabled()
-        {
-            return (type == Type.ALL) || (type == Type.HEAD);
-        }
-        public boolean cacheFullPartitions()
-        {
-            return type == Type.ALL;
-        }
-        @Override
-        public String toString()
-        {
-            if (type == Type.ALL) return "ALL";
-            if (type == Type.NONE) return "NONE";
-            return String.valueOf(rowsToCache);
-        }
-
-        @Override
-        public boolean equals(Object o)
-        {
-            if (this == o) return true;
-            if (o == null || getClass() != o.getClass()) return false;
-
-            RowCache rowCache = (RowCache) o;
-
-            if (rowsToCache != rowCache.rowsToCache) return false;
-            if (type != rowCache.type) return false;
-
-            return true;
-        }
-
-        @Override
-        public int hashCode()
-        {
-            int result = type.hashCode();
-            result = 31 * result + rowsToCache;
-            return result;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/cache/CounterCacheKey.java b/src/java/org/apache/cassandra/cache/CounterCacheKey.java
index 68856eb..8b173bf 100644
--- a/src/java/org/apache/cassandra/cache/CounterCacheKey.java
+++ b/src/java/org/apache/cassandra/cache/CounterCacheKey.java

@@ -20,27 +20,42 @@
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.utils.*;
 
 public final class CounterCacheKey extends CacheKey
 {
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new CounterCacheKey(null, ByteBufferUtil.EMPTY_BYTE_BUFFER, CellNames.simpleDense(ByteBuffer.allocate(1))));
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new CounterCacheKey(null, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBuffer.allocate(1)));
 
     public final byte[] partitionKey;
     public final byte[] cellName;
 
-    private CounterCacheKey(Pair<String, String> ksAndCFName, ByteBuffer partitionKey, CellName cellName)
+    public CounterCacheKey(Pair<String, String> ksAndCFName, ByteBuffer partitionKey, ByteBuffer cellName)
     {
         super(ksAndCFName);
         this.partitionKey = ByteBufferUtil.getArray(partitionKey);
-        this.cellName = ByteBufferUtil.getArray(cellName.toByteBuffer());
+        this.cellName = ByteBufferUtil.getArray(cellName);
     }
 
-    public static CounterCacheKey create(Pair<String, String> ksAndCFName, ByteBuffer partitionKey, CellName cellName)
+    public static CounterCacheKey create(Pair<String, String> ksAndCFName, ByteBuffer partitionKey, Clustering clustering, ColumnDefinition c, CellPath path)
     {
-        return new CounterCacheKey(ksAndCFName, partitionKey, cellName);
+        return new CounterCacheKey(ksAndCFName, partitionKey, makeCellName(clustering, c, path));
+    }
+
+    private static ByteBuffer makeCellName(Clustering clustering, ColumnDefinition c, CellPath path)
+    {
+        int cs = clustering.size();
+        ByteBuffer[] values = new ByteBuffer[cs + 1 + (path == null ? 0 : path.size())];
+        for (int i = 0; i < cs; i++)
+            values[i] = clustering.get(i);
+        values[cs] = c.name.bytes;
+        if (path != null)
+            for (int i = 0; i < path.size(); i++)
+                values[cs + 1 + i] = path.get(i);
+        return CompositeType.build(values);
     }
 
     public long unsharedHeapSize()

diff --git a/src/java/org/apache/cassandra/cache/OHCProvider.java b/src/java/org/apache/cassandra/cache/OHCProvider.java
index ab2745a..1ea2b78 100644
--- a/src/java/org/apache/cassandra/cache/OHCProvider.java
+++ b/src/java/org/apache/cassandra/cache/OHCProvider.java

@@ -17,21 +17,17 @@
  */
 package org.apache.cassandra.cache;
 
-import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.channels.WritableByteChannel;
 import java.util.Iterator;
 
-import com.google.common.base.Function;
-
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.Memory;
-import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.db.partitions.CachedPartition;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputBufferFixed;
+import org.apache.cassandra.io.util.RebufferingInputStream;
 import org.apache.cassandra.utils.Pair;
 import org.caffinitas.ohc.OHCache;
 import org.caffinitas.ohc.OHCacheBuilder;
@@ -42,8 +38,8 @@
     {
         OHCacheBuilder<RowCacheKey, IRowCacheEntry> builder = OHCacheBuilder.newBuilder();
         builder.capacity(DatabaseDescriptor.getRowCacheSizeInMB() * 1024 * 1024)
-               .keySerializer(new KeySerializer())
-               .valueSerializer(new ValueSerializer())
+               .keySerializer(KeySerializer.instance)
+               .valueSerializer(ValueSerializer.instance)
                .throwOOME(true);
 
         return new OHCacheAdapter(builder.build());
@@ -70,7 +66,7 @@
 
         public void put(RowCacheKey key, IRowCacheEntry value)
         {
-            ohCache.put(key, value);
+            ohCache.put(key,  value);
         }
 
         public boolean putIfAbsent(RowCacheKey key, IRowCacheEntry value)
@@ -126,27 +122,48 @@
 
     private static class KeySerializer implements org.caffinitas.ohc.CacheSerializer<RowCacheKey>
     {
-        public void serialize(RowCacheKey rowCacheKey, DataOutput dataOutput) throws IOException
+        private static KeySerializer instance = new KeySerializer();
+        public void serialize(RowCacheKey rowCacheKey, ByteBuffer buf)
         {
-            dataOutput.writeUTF(rowCacheKey.ksAndCFName.left);
-            dataOutput.writeUTF(rowCacheKey.ksAndCFName.right);
-            dataOutput.writeInt(rowCacheKey.key.length);
-            dataOutput.write(rowCacheKey.key);
+            @SuppressWarnings("resource")
+            DataOutputBuffer dataOutput = new DataOutputBufferFixed(buf);
+            try
+            {
+                dataOutput.writeUTF(rowCacheKey.ksAndCFName.left);
+                dataOutput.writeUTF(rowCacheKey.ksAndCFName.right);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+            buf.putInt(rowCacheKey.key.length);
+            buf.put(rowCacheKey.key);
         }
 
-        public RowCacheKey deserialize(DataInput dataInput) throws IOException
+        public RowCacheKey deserialize(ByteBuffer buf)
         {
-            String ksName = dataInput.readUTF();
-            String cfName = dataInput.readUTF();
-            byte[] key = new byte[dataInput.readInt()];
-            dataInput.readFully(key);
+            @SuppressWarnings("resource")
+            DataInputBuffer dataInput = new DataInputBuffer(buf, false);
+            String ksName = null;
+            String cfName = null;
+            try
+            {
+                ksName = dataInput.readUTF();
+                cfName = dataInput.readUTF();
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+            byte[] key = new byte[buf.getInt()];
+            buf.get(key);
             return new RowCacheKey(Pair.create(ksName, cfName), key);
         }
 
         public int serializedSize(RowCacheKey rowCacheKey)
         {
-            return TypeSizes.NATIVE.sizeof(rowCacheKey.ksAndCFName.left)
-                    + TypeSizes.NATIVE.sizeof(rowCacheKey.ksAndCFName.right)
+            return TypeSizes.sizeof(rowCacheKey.ksAndCFName.left)
+                    + TypeSizes.sizeof(rowCacheKey.ksAndCFName.right)
                     + 4
                     + rowCacheKey.key.length;
         }
@@ -154,132 +171,50 @@
 
     private static class ValueSerializer implements org.caffinitas.ohc.CacheSerializer<IRowCacheEntry>
     {
-        public void serialize(IRowCacheEntry entry, DataOutput out) throws IOException
+        private static ValueSerializer instance = new ValueSerializer();
+        public void serialize(IRowCacheEntry entry, ByteBuffer buf)
         {
             assert entry != null; // unlike CFS we don't support nulls, since there is no need for that in the cache
-            boolean isSentinel = entry instanceof RowCacheSentinel;
-            out.writeBoolean(isSentinel);
-            if (isSentinel)
-                out.writeLong(((RowCacheSentinel) entry).sentinelId);
-            else
-                ColumnFamily.serializer.serialize((ColumnFamily) entry, new DataOutputPlusAdapter(out), MessagingService.current_version);
+            try (DataOutputBufferFixed out = new DataOutputBufferFixed(buf))
+            {
+                boolean isSentinel = entry instanceof RowCacheSentinel;
+                out.writeBoolean(isSentinel);
+                if (isSentinel)
+                    out.writeLong(((RowCacheSentinel) entry).sentinelId);
+                else
+                    CachedPartition.cacheSerializer.serialize((CachedPartition)entry, out);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
         }
 
-        public IRowCacheEntry deserialize(DataInput in) throws IOException
+        @SuppressWarnings("resource")
+        public IRowCacheEntry deserialize(ByteBuffer buf)
         {
-            boolean isSentinel = in.readBoolean();
-            if (isSentinel)
-                return new RowCacheSentinel(in.readLong());
-            return ColumnFamily.serializer.deserialize(in, MessagingService.current_version);
+            try
+            {
+                RebufferingInputStream in = new DataInputBuffer(buf, false);
+                boolean isSentinel = in.readBoolean();
+                if (isSentinel)
+                    return new RowCacheSentinel(in.readLong());
+                return CachedPartition.cacheSerializer.deserialize(in);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
         }
 
         public int serializedSize(IRowCacheEntry entry)
         {
-            TypeSizes typeSizes = TypeSizes.NATIVE;
-            int size = typeSizes.sizeof(true);
+            int size = TypeSizes.sizeof(true);
             if (entry instanceof RowCacheSentinel)
-                size += typeSizes.sizeof(((RowCacheSentinel) entry).sentinelId);
+                size += TypeSizes.sizeof(((RowCacheSentinel) entry).sentinelId);
             else
-                size += ColumnFamily.serializer.serializedSize((ColumnFamily) entry, typeSizes, MessagingService.current_version);
+                size += CachedPartition.cacheSerializer.serializedSize((CachedPartition) entry);
             return size;
         }
     }
-
-    static class DataOutputPlusAdapter implements DataOutputPlus
-    {
-        private final DataOutput out;
-
-        public void write(byte[] b) throws IOException
-        {
-            out.write(b);
-        }
-
-        public void write(byte[] b, int off, int len) throws IOException
-        {
-            out.write(b, off, len);
-        }
-
-        public void write(int b) throws IOException
-        {
-            out.write(b);
-        }
-
-        public void writeBoolean(boolean v) throws IOException
-        {
-            out.writeBoolean(v);
-        }
-
-        public void writeByte(int v) throws IOException
-        {
-            out.writeByte(v);
-        }
-
-        public void writeBytes(String s) throws IOException
-        {
-            out.writeBytes(s);
-        }
-
-        public void writeChar(int v) throws IOException
-        {
-            out.writeChar(v);
-        }
-
-        public void writeChars(String s) throws IOException
-        {
-            out.writeChars(s);
-        }
-
-        public void writeDouble(double v) throws IOException
-        {
-            out.writeDouble(v);
-        }
-
-        public void writeFloat(float v) throws IOException
-        {
-            out.writeFloat(v);
-        }
-
-        public void writeInt(int v) throws IOException
-        {
-            out.writeInt(v);
-        }
-
-        public void writeLong(long v) throws IOException
-        {
-            out.writeLong(v);
-        }
-
-        public void writeShort(int v) throws IOException
-        {
-            out.writeShort(v);
-        }
-
-        public void writeUTF(String s) throws IOException
-        {
-            out.writeUTF(s);
-        }
-
-        public DataOutputPlusAdapter(DataOutput out)
-        {
-            this.out = out;
-        }
-
-        public void write(ByteBuffer buffer) throws IOException
-        {
-            if (buffer.hasArray())
-                out.write(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
-            else
-                throw new UnsupportedOperationException("IMPLEMENT ME");
-        }
-
-        public void write(Memory memory, long offset, long length) throws IOException
-        {
-            throw new UnsupportedOperationException("IMPLEMENT ME");
-        }
-
-        public <R> R applyToChannel(Function<WritableByteChannel, R> c) throws IOException
-        {
-            throw new UnsupportedOperationException("IMPLEMENT ME");
-        }
-    }
 }

diff --git a/src/java/org/apache/cassandra/cache/SerializingCache.java b/src/java/org/apache/cassandra/cache/SerializingCache.java
index 01d70b4..3651a0c 100644
--- a/src/java/org/apache/cassandra/cache/SerializingCache.java
+++ b/src/java/org/apache/cassandra/cache/SerializingCache.java

@@ -26,12 +26,11 @@
 import com.googlecode.concurrentlinkedhashmap.ConcurrentLinkedHashMap;
 import com.googlecode.concurrentlinkedhashmap.EvictionListener;
 import com.googlecode.concurrentlinkedhashmap.Weigher;
-import org.apache.cassandra.db.TypeSizes;
+
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.util.MemoryInputStream;
 import org.apache.cassandra.io.util.MemoryOutputStream;
-import org.apache.cassandra.utils.vint.EncodedDataInputStream;
-import org.apache.cassandra.utils.vint.EncodedDataOutputStream;
+import org.apache.cassandra.io.util.WrappedDataOutputStreamPlus;
 
 /**
  * Serializes cache values off-heap.
@@ -39,7 +38,6 @@
 public class SerializingCache<K, V> implements ICache<K, V>
 {
     private static final Logger logger = LoggerFactory.getLogger(SerializingCache.class);
-    private static final TypeSizes ENCODED_TYPE_SIZES = TypeSizes.VINT;
 
     private static final int DEFAULT_CONCURENCY_LEVEL = 64;
 
@@ -88,7 +86,7 @@
     {
         try
         {
-            return serializer.deserialize(new EncodedDataInputStream(new MemoryInputStream(mem)));
+            return serializer.deserialize(new MemoryInputStream(mem));
         }
         catch (IOException e)
         {
@@ -99,7 +97,7 @@
 
     private RefCountedMemory serialize(V value)
     {
-        long serializedSize = serializer.serializedSize(value, ENCODED_TYPE_SIZES);
+        long serializedSize = serializer.serializedSize(value);
         if (serializedSize > Integer.MAX_VALUE)
             throw new IllegalArgumentException("Unable to allocate " + serializedSize + " bytes");
 
@@ -115,7 +113,7 @@
 
         try
         {
-            serializer.serialize(value, new EncodedDataOutputStream(new MemoryOutputStream(freeableMemory)));
+            serializer.serialize(value, new WrappedDataOutputStreamPlus(new MemoryOutputStream(freeableMemory)));
         }
         catch (IOException e)
         {

diff --git a/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java b/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java
index f540322..1119295 100644
--- a/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java
+++ b/src/java/org/apache/cassandra/cache/SerializingCacheProvider.java

@@ -17,15 +17,14 @@
  */
 package org.apache.cassandra.cache;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.partitions.CachedPartition;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.net.MessagingService;
 
 public class SerializingCacheProvider implements CacheProvider<RowCacheKey, IRowCacheEntry>
 {
@@ -45,24 +44,25 @@
             if (isSentinel)
                 out.writeLong(((RowCacheSentinel) entry).sentinelId);
             else
-                ColumnFamily.serializer.serialize((ColumnFamily) entry, out, MessagingService.current_version);
+                CachedPartition.cacheSerializer.serialize((CachedPartition)entry, out);
         }
 
-        public IRowCacheEntry deserialize(DataInput in) throws IOException
+        public IRowCacheEntry deserialize(DataInputPlus in) throws IOException
         {
             boolean isSentinel = in.readBoolean();
             if (isSentinel)
                 return new RowCacheSentinel(in.readLong());
-            return ColumnFamily.serializer.deserialize(in, MessagingService.current_version);
+
+            return CachedPartition.cacheSerializer.deserialize(in);
         }
 
-        public long serializedSize(IRowCacheEntry entry, TypeSizes typeSizes)
+        public long serializedSize(IRowCacheEntry entry)
         {
-            int size = typeSizes.sizeof(true);
+            int size = TypeSizes.sizeof(true);
             if (entry instanceof RowCacheSentinel)
-                size += typeSizes.sizeof(((RowCacheSentinel) entry).sentinelId);
+                size += TypeSizes.sizeof(((RowCacheSentinel) entry).sentinelId);
             else
-                size += ColumnFamily.serializer.serializedSize((ColumnFamily) entry, typeSizes, MessagingService.current_version);
+                size += CachedPartition.cacheSerializer.serializedSize((CachedPartition) entry);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/concurrent/AbstractLocalAwareExecutorService.java b/src/java/org/apache/cassandra/concurrent/AbstractLocalAwareExecutorService.java
index 088b43e..4b1fe05 100644
--- a/src/java/org/apache/cassandra/concurrent/AbstractLocalAwareExecutorService.java
+++ b/src/java/org/apache/cassandra/concurrent/AbstractLocalAwareExecutorService.java

@@ -29,8 +29,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.tracing.TraceState;
-import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.concurrent.SimpleCondition;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 
@@ -165,10 +166,15 @@
             }
             catch (Throwable t)
             {
-                JVMStabilityInspector.inspectThrowable(t);
-                logger.warn("Uncaught exception on thread {}: {}", Thread.currentThread(), t);
+                logger.error("Uncaught exception on thread {}", Thread.currentThread(), t);
                 result = t;
                 failure = true;
+                if (t instanceof CorruptSSTableException)
+                    FileUtils.handleCorruptSSTable((CorruptSSTableException) t);
+                else if (t instanceof FSError)
+                    FileUtils.handleFSError((FSError) t);
+                else
+                    JVMStabilityInspector.inspectThrowable(t);
             }
             finally
             {
@@ -203,7 +209,8 @@
 
         public T get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
         {
-            await(timeout, unit);
+            if (!await(timeout, unit))
+                throw new TimeoutException();
             Object result = this.result;
             if (failure)
                 throw new ExecutionException((Throwable) result);

diff --git a/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java b/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java
index ea0715c..3b9d2ff 100644
--- a/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutor.java

@@ -47,7 +47,7 @@
         {
             if (executor.isShutdown())
             {
-                if (!StorageService.instance.isInShutdownHook())
+                if (!StorageService.instance.isShutdown())
                     throw new RejectedExecutionException("ScheduledThreadPoolExecutor has shut down.");
 
                 //Give some notification to the caller the task isn't going to run

diff --git a/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java b/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java
index af41513..b54fa3f 100644
--- a/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java

@@ -39,13 +39,7 @@
     public InfiniteLoopExecutor(String name, InterruptibleRunnable runnable)
     {
         this.runnable = runnable;
-        this.thread = new Thread(new Runnable()
-        {
-            public void run()
-            {
-                loop();
-            }
-        }, name);
+        this.thread = new Thread(this::loop, name);
         this.thread.setDaemon(true);
     }
 

diff --git a/src/java/org/apache/cassandra/concurrent/JMXEnabledThreadPoolExecutor.java b/src/java/org/apache/cassandra/concurrent/JMXEnabledThreadPoolExecutor.java
index 377442b..ae51aff 100644
--- a/src/java/org/apache/cassandra/concurrent/JMXEnabledThreadPoolExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/JMXEnabledThreadPoolExecutor.java

@@ -51,6 +51,11 @@
         this(1, Integer.MAX_VALUE, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), new NamedThreadFactory(threadPoolName, priority), "internal");
     }
 
+    public JMXEnabledThreadPoolExecutor(NamedThreadFactory threadFactory, String jmxPath)
+    {
+        this(1, Integer.MAX_VALUE, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), threadFactory, jmxPath);
+    }
+
     public JMXEnabledThreadPoolExecutor(int corePoolSize,
             long keepAliveTime,
             TimeUnit unit,

diff --git a/src/java/org/apache/cassandra/concurrent/NamedThreadFactory.java b/src/java/org/apache/cassandra/concurrent/NamedThreadFactory.java
index a084571..27aa344 100644
--- a/src/java/org/apache/cassandra/concurrent/NamedThreadFactory.java
+++ b/src/java/org/apache/cassandra/concurrent/NamedThreadFactory.java

@@ -20,6 +20,9 @@
 import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import io.netty.util.concurrent.FastThreadLocal;
+import io.netty.util.concurrent.FastThreadLocalThread;
+
 /**
  * This class is an implementation of the <i>ThreadFactory</i> interface. This
  * is useful to give Java threads meaningful names which is useful when using
@@ -31,7 +34,7 @@
     private static volatile String globalPrefix;
     public static void setGlobalPrefix(String prefix) { globalPrefix = prefix; }
 
-    protected final String id;
+    public final String id;
     private final int priority;
     private final ClassLoader contextClassLoader;
     private final ThreadGroup threadGroup;
@@ -59,12 +62,28 @@
     {
         String name = id + ':' + n.getAndIncrement();
         String prefix = globalPrefix;
-        Thread thread = new Thread(threadGroup, runnable, prefix != null ? prefix + name : name);
-        thread.setDaemon(true);
+        Thread thread = new FastThreadLocalThread(threadGroup, threadLocalDeallocator(runnable), prefix != null ? prefix + name : name);
         thread.setPriority(priority);
+        thread.setDaemon(true);
         if (contextClassLoader != null)
             thread.setContextClassLoader(contextClassLoader);
         return thread;
     }
 
+    /**
+     * Ensures that {@link FastThreadLocal#remove() FastThreadLocal.remove()} is called when the {@link Runnable#run()}
+     * method of the given {@link Runnable} instance completes to ensure cleanup of {@link FastThreadLocal} instances.
+     * This is especially important for direct byte buffers allocated locally for a thread.
+     */
+    public static Runnable threadLocalDeallocator(Runnable r)
+    {
+        return () ->
+        {
+            try {
+                r.run();
+            } finally {
+                FastThreadLocal.removeAll();
+            }
+        };
+    }
 }

diff --git a/src/java/org/apache/cassandra/concurrent/SEPExecutor.java b/src/java/org/apache/cassandra/concurrent/SEPExecutor.java
index 8b12b82..d5c7b14 100644
--- a/src/java/org/apache/cassandra/concurrent/SEPExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/SEPExecutor.java

@@ -174,7 +174,11 @@
             long current = permits.get();
             int workPermits = workPermits(current);
             if (permits.compareAndSet(current, updateWorkPermits(current, workPermits + 1)))
-                return;
+            {
+                if (shuttingDown && workPermits + 1 == maxWorkers)
+                    shutdown.signalAll();
+                break;
+            }
         }
     }
 
@@ -206,7 +210,7 @@
     {
         shuttingDown = true;
         pool.executors.remove(this);
-        if (getActiveCount() == 0)
+        if (getActiveCount() == 0 && getPendingTasks() == 0)
             shutdown.signalAll();
 
         // release metrics
@@ -219,6 +223,8 @@
         List<Runnable> aborted = new ArrayList<>();
         while (takeTaskPermit())
             aborted.add(tasks.poll());
+        if (getActiveCount() == 0)
+            shutdown.signalAll();
         return aborted;
     }
 

diff --git a/src/java/org/apache/cassandra/concurrent/SEPWorker.java b/src/java/org/apache/cassandra/concurrent/SEPWorker.java
index edc31da..f7eb47a 100644
--- a/src/java/org/apache/cassandra/concurrent/SEPWorker.java
+++ b/src/java/org/apache/cassandra/concurrent/SEPWorker.java

@@ -98,7 +98,6 @@
                 // if we do have tasks assigned, nobody will change our state so we can simply set it to WORKING
                 // (which is also a state that will never be interrupted externally)
                 set(Work.WORKING);
-                boolean shutdown;
                 while (true)
                 {
                     // before we process any task, we maybe schedule a new worker _to our executor only_; this
@@ -111,19 +110,13 @@
                     task = null;
 
                     // if we're shutting down, or we fail to take a permit, we don't perform any more work
-                    if ((shutdown = assigned.shuttingDown) || !assigned.takeTaskPermit())
+                    if (!assigned.takeTaskPermit())
                         break;
                     task = assigned.tasks.poll();
                 }
 
                 // return our work permit, and maybe signal shutdown
                 assigned.returnWorkPermit();
-                if (shutdown)
-                {
-                    if (assigned.getActiveCount() == 0)
-                        assigned.shutdown.signalAll();
-                    return;
-                }
                 assigned = null;
 
                 // try to immediately reassign ourselves some work; if we fail, start spinning
@@ -134,23 +127,25 @@
         catch (Throwable t)
         {
             JVMStabilityInspector.inspectThrowable(t);
-            while (true)
-            {
-                if (get().assigned != null)
-                {
-                    assigned = get().assigned;
-                    set(Work.WORKING);
-                }
-                if (assign(Work.STOPPED, true))
-                    break;
-            }
-            if (assigned != null)
-                assigned.returnWorkPermit();
             if (task != null)
                 logger.error("Failed to execute task, unexpected exception killed worker: {}", t);
             else
                 logger.error("Unexpected exception killed worker: {}", t);
         }
+        finally
+        {
+            if (assigned != null)
+                assigned.returnWorkPermit();
+
+            do
+            {
+                if (get().assigned != null)
+                {
+                    get().assigned.returnWorkPermit();
+                    set(Work.WORKING);
+                }
+            } while (!assign(Work.STOPPED, true));
+        }
     }
 
     // try to assign this worker the provided work

diff --git a/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java b/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java
index 50cc5a3..e6a0df7 100644
--- a/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java
+++ b/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java

@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.concurrent;
 
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentSkipListMap;
@@ -107,16 +108,17 @@
             schedule(Work.SPINNING);
     }
 
-    public LocalAwareExecutorService newExecutor(int maxConcurrency, int maxQueuedTasks, String jmxPath, String name)
+    public synchronized LocalAwareExecutorService newExecutor(int maxConcurrency, int maxQueuedTasks, String jmxPath, String name)
     {
         SEPExecutor executor = new SEPExecutor(this, maxConcurrency, maxQueuedTasks, jmxPath, name);
         executors.add(executor);
         return executor;
     }
 
-    public void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException
+    public synchronized void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException
     {
         shuttingDown = true;
+        List<SEPExecutor> executors = new ArrayList<>(this.executors);
         for (SEPExecutor executor : executors)
             executor.shutdownNow();
 
@@ -127,7 +129,7 @@
             executor.shutdown.await(until - System.nanoTime(), TimeUnit.NANOSECONDS);
     }
 
-    void terminateWorkers()
+    private void terminateWorkers()
     {
         assert shuttingDown;
 

diff --git a/src/java/org/apache/cassandra/concurrent/Stage.java b/src/java/org/apache/cassandra/concurrent/Stage.java
index 4e3700f..ccb1565 100644
--- a/src/java/org/apache/cassandra/concurrent/Stage.java
+++ b/src/java/org/apache/cassandra/concurrent/Stage.java

@@ -27,6 +27,7 @@
     READ,
     MUTATION,
     COUNTER_MUTATION,
+    VIEW_MUTATION,
     GOSSIP,
     REQUEST_RESPONSE,
     ANTI_ENTROPY,
@@ -60,6 +61,7 @@
                 return "internal";
             case MUTATION:
             case COUNTER_MUTATION:
+            case VIEW_MUTATION:
             case READ:
             case REQUEST_RESPONSE:
             case READ_REPAIR:

diff --git a/src/java/org/apache/cassandra/concurrent/StageManager.java b/src/java/org/apache/cassandra/concurrent/StageManager.java
index 01bfb7c..457e801 100644
--- a/src/java/org/apache/cassandra/concurrent/StageManager.java
+++ b/src/java/org/apache/cassandra/concurrent/StageManager.java

@@ -48,6 +48,7 @@
     {
         stages.put(Stage.MUTATION, multiThreadedLowSignalStage(Stage.MUTATION, getConcurrentWriters()));
         stages.put(Stage.COUNTER_MUTATION, multiThreadedLowSignalStage(Stage.COUNTER_MUTATION, getConcurrentCounterWriters()));
+        stages.put(Stage.VIEW_MUTATION, multiThreadedLowSignalStage(Stage.VIEW_MUTATION, getConcurrentViewWriters()));
         stages.put(Stage.READ, multiThreadedLowSignalStage(Stage.READ, getConcurrentReaders()));
         stages.put(Stage.REQUEST_RESPONSE, multiThreadedLowSignalStage(Stage.REQUEST_RESPONSE, FBUtilities.getAvailableProcessors()));
         stages.put(Stage.INTERNAL_RESPONSE, multiThreadedStage(Stage.INTERNAL_RESPONSE, FBUtilities.getAvailableProcessors()));

diff --git a/src/java/org/apache/cassandra/config/CFMetaData.java b/src/java/org/apache/cassandra/config/CFMetaData.java
index 67a1c8c..19f744e 100644
--- a/src/java/org/apache/cassandra/config/CFMetaData.java
+++ b/src/java/org/apache/cassandra/config/CFMetaData.java

@@ -17,43 +17,46 @@
  */
 package org.apache.cassandra.config;
 
-import java.io.DataInput;
+import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import javax.annotation.Nullable;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.MoreObjects;
 import com.google.common.base.Objects;
-import com.google.common.base.Strings;
-import com.google.common.collect.*;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
 import org.apache.cassandra.cql3.statements.CFStatement;
 import org.apache.cassandra.cql3.statements.CreateTableStatement;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.compaction.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.io.compress.LZ4Compressor;
-import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.schema.LegacySchemaTables;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
-import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.schema.*;
+import org.apache.cassandra.utils.*;
 import org.github.jamm.Unmetered;
 
 /**
@@ -62,107 +65,14 @@
 @Unmetered
 public final class CFMetaData
 {
+    public enum Flag
+    {
+        SUPER, COUNTER, DENSE, COMPOUND
+    }
+
     private static final Logger logger = LoggerFactory.getLogger(CFMetaData.class);
 
-    public final static double DEFAULT_READ_REPAIR_CHANCE = 0.0;
-    public final static double DEFAULT_DCLOCAL_READ_REPAIR_CHANCE = 0.1;
-    public final static int DEFAULT_GC_GRACE_SECONDS = 864000;
-    public final static int DEFAULT_MIN_COMPACTION_THRESHOLD = 4;
-    public final static int DEFAULT_MAX_COMPACTION_THRESHOLD = 32;
-    public final static Class<? extends AbstractCompactionStrategy> DEFAULT_COMPACTION_STRATEGY_CLASS = SizeTieredCompactionStrategy.class;
-    public final static CachingOptions DEFAULT_CACHING_STRATEGY = CachingOptions.KEYS_ONLY;
-    public final static int DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
-    public final static SpeculativeRetry DEFAULT_SPECULATIVE_RETRY = new SpeculativeRetry(SpeculativeRetry.RetryType.PERCENTILE, 0.99);
-    public final static int DEFAULT_MIN_INDEX_INTERVAL = 128;
-    public final static int DEFAULT_MAX_INDEX_INTERVAL = 2048;
-
-    // Note that this is the default only for user created tables
-    public final static String DEFAULT_COMPRESSOR = LZ4Compressor.class.getCanonicalName();
-
-    // Note that this need to come *before* any CFMetaData is defined so before the compile below.
-    private static final Comparator<ColumnDefinition> regularColumnComparator = new Comparator<ColumnDefinition>()
-    {
-        public int compare(ColumnDefinition def1, ColumnDefinition def2)
-        {
-            return ByteBufferUtil.compareUnsigned(def1.name.bytes, def2.name.bytes);
-        }
-    };
-
-    public static class SpeculativeRetry
-    {
-        public enum RetryType
-        {
-            NONE, CUSTOM, PERCENTILE, ALWAYS
-        }
-
-        public final RetryType type;
-        public final double value;
-
-        private SpeculativeRetry(RetryType type, double value)
-        {
-            this.type = type;
-            this.value = value;
-        }
-
-        public static SpeculativeRetry fromString(String retry) throws ConfigurationException
-        {
-            String name = retry.toUpperCase();
-            try
-            {
-                if (name.endsWith(RetryType.PERCENTILE.toString()))
-                {
-                    double value = Double.parseDouble(name.substring(0, name.length() - 10));
-                    if (value > 100 || value < 0)
-                        throw new ConfigurationException("PERCENTILE should be between 0 and 100, but was " + value);
-                    return new SpeculativeRetry(RetryType.PERCENTILE, (value / 100));
-                }
-                else if (name.endsWith("MS"))
-                {
-                    double value = Double.parseDouble(name.substring(0, name.length() - 2));
-                    return new SpeculativeRetry(RetryType.CUSTOM, value);
-                }
-                else
-                {
-                    return new SpeculativeRetry(RetryType.valueOf(name), 0);
-                }
-            }
-            catch (IllegalArgumentException e)
-            {
-                // ignore to throw the below exception.
-            }
-            throw new ConfigurationException("invalid speculative_retry type: " + retry);
-        }
-
-        @Override
-        public boolean equals(Object obj)
-        {
-            if (!(obj instanceof SpeculativeRetry))
-                return false;
-            SpeculativeRetry rhs = (SpeculativeRetry) obj;
-            return Objects.equal(type, rhs.type) && Objects.equal(value, rhs.value);
-        }
-
-        @Override
-        public int hashCode()
-        {
-            return Objects.hashCode(type, value);
-        }
-
-        @Override
-        public String toString()
-        {
-            switch (type)
-            {
-            case PERCENTILE:
-                // TODO switch to BigDecimal so round-tripping isn't lossy
-                return (value * 100) + "PERCENTILE";
-            case CUSTOM:
-                return value + "ms";
-            default:
-                return type.toString();
-            }
-        }
-    }
+    public static final Serializer serializer = new Serializer();
 
     //REQUIRED
     public final UUID cfId;                           // internal id, never exposed to user
@@ -170,28 +80,29 @@
     public final String cfName;                       // name of this column family
     public final Pair<String, String> ksAndCFName;
     public final byte[] ksAndCFBytes;
-    public final ColumnFamilyType cfType;             // standard, super
-    public volatile CellNameType comparator;          // bytes, long, timeuuid, utf8, etc.
 
-    //OPTIONAL
-    private volatile String comment = "";
-    private volatile double readRepairChance = DEFAULT_READ_REPAIR_CHANCE;
-    private volatile double dcLocalReadRepairChance = DEFAULT_DCLOCAL_READ_REPAIR_CHANCE;
-    private volatile int gcGraceSeconds = DEFAULT_GC_GRACE_SECONDS;
-    private volatile AbstractType<?> defaultValidator = BytesType.instance;
-    private volatile AbstractType<?> keyValidator = BytesType.instance;
-    private volatile int minCompactionThreshold = DEFAULT_MIN_COMPACTION_THRESHOLD;
-    private volatile int maxCompactionThreshold = DEFAULT_MAX_COMPACTION_THRESHOLD;
-    private volatile Double bloomFilterFpChance = null;
-    private volatile CachingOptions caching = DEFAULT_CACHING_STRATEGY;
-    private volatile int minIndexInterval = DEFAULT_MIN_INDEX_INTERVAL;
-    private volatile int maxIndexInterval = DEFAULT_MAX_INDEX_INTERVAL;
-    private volatile int memtableFlushPeriod = 0;
-    private volatile int defaultTimeToLive = DEFAULT_DEFAULT_TIME_TO_LIVE;
-    private volatile SpeculativeRetry speculativeRetry = DEFAULT_SPECULATIVE_RETRY;
-    private volatile Map<ColumnIdentifier, Long> droppedColumns = new HashMap<>();
-    private volatile Map<String, TriggerDefinition> triggers = new HashMap<>();
-    private volatile boolean isPurged = false;
+    private final boolean isCounter;
+    private final boolean isView;
+    private final boolean isIndex;
+
+    public volatile ClusteringComparator comparator;  // bytes, long, timeuuid, utf8, etc. This is built directly from clusteringColumns
+    public final IPartitioner partitioner;            // partitioner the table uses
+    private volatile AbstractType<?> keyValidator;
+
+    private final Serializers serializers;
+
+    // non-final, for now
+    private volatile ImmutableSet<Flag> flags;
+    private volatile boolean isDense;
+    private volatile boolean isCompound;
+    private volatile boolean isSuper;
+
+    public volatile TableParams params = TableParams.DEFAULT;
+
+    private volatile Map<ByteBuffer, DroppedColumn> droppedColumns = new HashMap<>();
+    private volatile Triggers triggers = Triggers.none();
+    private volatile Indexes indexes = Indexes.none();
+
     /*
      * All CQL3 columns definition are stored in the columnMetadata map.
      * On top of that, we keep separated collection of each kind of definition, to
@@ -199,113 +110,399 @@
      * clustering key ones, those list are ordered by the "component index" of the
      * elements.
      */
-    public static final String DEFAULT_KEY_ALIAS = "key";
-    public static final String DEFAULT_COLUMN_ALIAS = "column";
-    public static final String DEFAULT_VALUE_ALIAS = "value";
-
-    // We call dense a CF for which each component of the comparator is a clustering column, i.e. no
-    // component is used to store a regular column names. In other words, non-composite static "thrift"
-    // and CQL3 CF are *not* dense.
-    private volatile Boolean isDense; // null means "we don't know and need to infer from other data"
-
     private volatile Map<ByteBuffer, ColumnDefinition> columnMetadata = new HashMap<>();
     private volatile List<ColumnDefinition> partitionKeyColumns;  // Always of size keyValidator.componentsCount, null padded if necessary
     private volatile List<ColumnDefinition> clusteringColumns;    // Of size comparator.componentsCount or comparator.componentsCount -1, null padded if necessary
-    private volatile SortedSet<ColumnDefinition> regularColumns;  // We use a sorted set so iteration is of predictable order (for SELECT for instance)
-    private volatile SortedSet<ColumnDefinition> staticColumns;   // Same as above
+    private volatile PartitionColumns partitionColumns;           // Always non-PK, non-clustering columns
+
+    // For dense tables, this alias the single non-PK column the table contains (since it can only have one). We keep
+    // that as convenience to access that column more easily (but we could replace calls by partitionColumns().iterator().next()
+    // for those tables in practice).
     private volatile ColumnDefinition compactValueColumn;
 
-    public volatile Class<? extends AbstractCompactionStrategy> compactionStrategyClass = DEFAULT_COMPACTION_STRATEGY_CLASS;
-    public volatile Map<String, String> compactionStrategyOptions = new HashMap<>();
-
-    public volatile CompressionParameters compressionParameters = new CompressionParameters(null);
-
-    // attribute setters that return the modified CFMetaData instance
-    public CFMetaData comment(String prop) {comment = Strings.nullToEmpty(prop); return this;}
-    public CFMetaData readRepairChance(double prop) {readRepairChance = prop; return this;}
-    public CFMetaData dcLocalReadRepairChance(double prop) {dcLocalReadRepairChance = prop; return this;}
-    public CFMetaData gcGraceSeconds(int prop) {gcGraceSeconds = prop; return this;}
-    public CFMetaData defaultValidator(AbstractType<?> prop) {defaultValidator = prop; return this;}
-    public CFMetaData keyValidator(AbstractType<?> prop) {keyValidator = prop; return this;}
-    public CFMetaData minCompactionThreshold(int prop) {minCompactionThreshold = prop; return this;}
-    public CFMetaData maxCompactionThreshold(int prop) {maxCompactionThreshold = prop; return this;}
-    public CFMetaData compactionStrategyClass(Class<? extends AbstractCompactionStrategy> prop) {compactionStrategyClass = prop; return this;}
-    public CFMetaData compactionStrategyOptions(Map<String, String> prop) {compactionStrategyOptions = prop; return this;}
-    public CFMetaData compressionParameters(CompressionParameters prop) {compressionParameters = prop; return this;}
-    public CFMetaData bloomFilterFpChance(double prop) {bloomFilterFpChance = prop; return this;}
-    public CFMetaData caching(CachingOptions prop) {caching = prop; return this;}
-    public CFMetaData minIndexInterval(int prop) {minIndexInterval = prop; return this;}
-    public CFMetaData maxIndexInterval(int prop) {maxIndexInterval = prop; return this;}
-    public CFMetaData memtableFlushPeriod(int prop) {memtableFlushPeriod = prop; return this;}
-    public CFMetaData defaultTimeToLive(int prop) {defaultTimeToLive = prop; return this;}
-    public CFMetaData speculativeRetry(SpeculativeRetry prop) {speculativeRetry = prop; return this;}
-    public CFMetaData droppedColumns(Map<ColumnIdentifier, Long> cols) {droppedColumns = cols; return this;}
-    public CFMetaData triggers(Map<String, TriggerDefinition> prop) {triggers = prop; return this;}
-    public CFMetaData isDense(Boolean prop) {isDense = prop; return this;}
+    private volatile Set<ColumnDefinition> hiddenColumns;
 
     /**
-     * Create new ColumnFamily metadata with generated random ID.
-     * When loading from existing schema, use CFMetaData
+     * These two columns are "virtual" (e.g. not persisted together with schema).
      *
-     * @param keyspace keyspace name
-     * @param name column family name
-     * @param comp default comparator
-     */
-    public CFMetaData(String keyspace, String name, ColumnFamilyType type, CellNameType comp)
+     * They are stored here to avoid re-creating during SELECT and UPDATE queries, where
+     * they are used to allow presenting supercolumn families in the CQL-compatible
+     * format. See {@link SuperColumnCompatibility} for more details.
+     **/
+    private volatile ColumnDefinition superCfKeyColumn;
+    private volatile ColumnDefinition superCfValueColumn;
+
+    /** Caches a non-compact version of the metadata for compact tables to be used with the NO_COMPACT protocol option. */
+    private volatile CFMetaData nonCompactCopy = null;
+
+    public boolean isSuperColumnKeyColumn(ColumnDefinition cd)
     {
-        this(keyspace, name, type, comp, null);
+        return cd.name.equals(superCfKeyColumn.name);
     }
 
-    public CFMetaData(String keyspace, String name, ColumnFamilyType type, CellNameType comp, UUID id)
+    public boolean isSuperColumnValueColumn(ColumnDefinition cd)
     {
-        cfId = id != null ? id : UUIDGen.getTimeUUID();
-        ksName = keyspace;
-        cfName = name;
+        return cd.name.equals(superCfValueColumn.name);
+    }
+
+    public ColumnDefinition superColumnValueColumn()
+    {
+        return superCfValueColumn;
+    }
+
+    public ColumnDefinition superColumnKeyColumn() { return superCfKeyColumn; }
+
+    /*
+     * All of these methods will go away once CFMetaData becomes completely immutable.
+     */
+    public CFMetaData params(TableParams params)
+    {
+        this.params = params;
+        return this;
+    }
+
+    public CFMetaData bloomFilterFpChance(double prop)
+    {
+        params = TableParams.builder(params).bloomFilterFpChance(prop).build();
+        return this;
+    }
+
+    public CFMetaData caching(CachingParams prop)
+    {
+        params = TableParams.builder(params).caching(prop).build();
+        return this;
+    }
+
+    public CFMetaData comment(String prop)
+    {
+        params = TableParams.builder(params).comment(prop).build();
+        return this;
+    }
+
+    public CFMetaData compaction(CompactionParams prop)
+    {
+        params = TableParams.builder(params).compaction(prop).build();
+        return this;
+    }
+
+    public CFMetaData compression(CompressionParams prop)
+    {
+        params = TableParams.builder(params).compression(prop).build();
+        return this;
+    }
+
+    public CFMetaData dcLocalReadRepairChance(double prop)
+    {
+        params = TableParams.builder(params).dcLocalReadRepairChance(prop).build();
+        return this;
+    }
+
+    public CFMetaData defaultTimeToLive(int prop)
+    {
+        params = TableParams.builder(params).defaultTimeToLive(prop).build();
+        return this;
+    }
+
+    public CFMetaData gcGraceSeconds(int prop)
+    {
+        params = TableParams.builder(params).gcGraceSeconds(prop).build();
+        return this;
+    }
+
+    public CFMetaData maxIndexInterval(int prop)
+    {
+        params = TableParams.builder(params).maxIndexInterval(prop).build();
+        return this;
+    }
+
+    public CFMetaData memtableFlushPeriod(int prop)
+    {
+        params = TableParams.builder(params).memtableFlushPeriodInMs(prop).build();
+        return this;
+    }
+
+    public CFMetaData minIndexInterval(int prop)
+    {
+        params = TableParams.builder(params).minIndexInterval(prop).build();
+        return this;
+    }
+
+    public CFMetaData readRepairChance(double prop)
+    {
+        params = TableParams.builder(params).readRepairChance(prop).build();
+        return this;
+    }
+
+    public CFMetaData crcCheckChance(double prop)
+    {
+        params = TableParams.builder(params).crcCheckChance(prop).build();
+        return this;
+    }
+
+    public CFMetaData speculativeRetry(SpeculativeRetryParam prop)
+    {
+        params = TableParams.builder(params).speculativeRetry(prop).build();
+        return this;
+    }
+
+    public CFMetaData extensions(Map<String, ByteBuffer> extensions)
+    {
+        params = TableParams.builder(params).extensions(extensions).build();
+        return this;
+    }
+
+    public CFMetaData droppedColumns(Map<ByteBuffer, DroppedColumn> cols)
+    {
+        droppedColumns = cols;
+        return this;
+    }
+
+    public CFMetaData triggers(Triggers prop)
+    {
+        triggers = prop;
+        return this;
+    }
+
+    public CFMetaData indexes(Indexes indexes)
+    {
+        this.indexes = indexes;
+        return this;
+    }
+
+    private CFMetaData(String keyspace,
+                       String name,
+                       UUID cfId,
+                       boolean isSuper,
+                       boolean isCounter,
+                       boolean isDense,
+                       boolean isCompound,
+                       boolean isView,
+                       List<ColumnDefinition> partitionKeyColumns,
+                       List<ColumnDefinition> clusteringColumns,
+                       PartitionColumns partitionColumns,
+                       IPartitioner partitioner,
+                       ColumnDefinition superCfKeyColumn,
+                       ColumnDefinition superCfValueColumn)
+    {
+        this.cfId = cfId;
+        this.ksName = keyspace;
+        this.cfName = name;
         ksAndCFName = Pair.create(keyspace, name);
         byte[] ksBytes = FBUtilities.toWriteUTFBytes(ksName);
         byte[] cfBytes = FBUtilities.toWriteUTFBytes(cfName);
         ksAndCFBytes = Arrays.copyOf(ksBytes, ksBytes.length + cfBytes.length);
         System.arraycopy(cfBytes, 0, ksAndCFBytes, ksBytes.length, cfBytes.length);
 
-        cfType = type;
-        comparator = comp;
+        this.isDense = isSuper ? (isDense || SuperColumnCompatibility.recalculateIsDense(partitionColumns.regulars)) : isDense;
+
+        this.isCompound = isCompound;
+        this.isSuper = isSuper;
+        this.isCounter = isCounter;
+        this.isView = isView;
+
+        EnumSet<Flag> flags = EnumSet.noneOf(Flag.class);
+        if (isSuper)
+            flags.add(Flag.SUPER);
+        if (isCounter)
+            flags.add(Flag.COUNTER);
+        if (isDense)
+            flags.add(Flag.DENSE);
+        if (isCompound)
+            flags.add(Flag.COMPOUND);
+        this.flags = Sets.immutableEnumSet(flags);
+
+        isIndex = cfName.contains(".");
+
+        assert partitioner != null : "This assertion failure is probably due to accessing Schema.instance " +
+                                     "from client-mode tools - See CASSANDRA-8143.";
+        this.partitioner = partitioner;
+
+        // A compact table should always have a clustering
+        assert isCQLTable() || !clusteringColumns.isEmpty() : String.format("For table %s.%s, isDense=%b, isCompound=%b, clustering=%s", ksName, cfName, isDense, isCompound, clusteringColumns);
+
+        this.partitionKeyColumns = partitionKeyColumns;
+        this.clusteringColumns = clusteringColumns;
+        this.partitionColumns = partitionColumns;
+
+        this.superCfKeyColumn = superCfKeyColumn;
+        this.superCfValueColumn = superCfValueColumn;
+
+        //This needs to happen before serializers are set
+        //because they use comparator.subtypes()
+        rebuild();
+
+        this.serializers = new Serializers(this);
     }
 
-    public static CFMetaData denseCFMetaData(String keyspace, String name, AbstractType<?> comp, AbstractType<?> subcc)
+    // This rebuild informations that are intrinsically duplicate of the table definition but
+    // are kept because they are often useful in a different format.
+    private void rebuild()
     {
-        CellNameType cellNameType = CellNames.fromAbstractType(makeRawAbstractType(comp, subcc), true);
-        return new CFMetaData(keyspace, name, subcc == null ? ColumnFamilyType.Standard : ColumnFamilyType.Super, cellNameType);
+        // A non-compact copy will be created lazily
+        this.nonCompactCopy = null;
+
+        if (isCompactTable())
+        {
+            this.compactValueColumn = isSuper() ?
+                                      SuperColumnCompatibility.getCompactValueColumn(partitionColumns) :
+                                      CompactTables.getCompactValueColumn(partitionColumns);
+        }
+
+        Map<ByteBuffer, ColumnDefinition> newColumnMetadata = Maps.newHashMapWithExpectedSize(partitionKeyColumns.size() + clusteringColumns.size() + partitionColumns.size());
+
+        if (isSuper() && isDense())
+        {
+            CompactTables.DefaultNames defaultNames = SuperColumnCompatibility.columnNameGenerator(partitionKeyColumns, clusteringColumns, partitionColumns);
+            if (superCfKeyColumn == null)
+                superCfKeyColumn = SuperColumnCompatibility.getSuperCfKeyColumn(this, clusteringColumns, defaultNames);
+            if (superCfValueColumn == null)
+                superCfValueColumn = SuperColumnCompatibility.getSuperCfValueColumn(this, partitionColumns, superCfKeyColumn, defaultNames);
+
+            for (ColumnDefinition def : partitionKeyColumns)
+                newColumnMetadata.put(def.name.bytes, def);
+            newColumnMetadata.put(clusteringColumns.get(0).name.bytes, clusteringColumns.get(0));
+            newColumnMetadata.put(superCfKeyColumn.name.bytes, SuperColumnCompatibility.getSuperCfSschemaRepresentation(superCfKeyColumn));
+            newColumnMetadata.put(superCfValueColumn.name.bytes, superCfValueColumn);
+            newColumnMetadata.put(compactValueColumn.name.bytes, compactValueColumn);
+            clusteringColumns = Arrays.asList(clusteringColumns().get(0));
+            partitionColumns = PartitionColumns.of(compactValueColumn);
+        }
+        else
+        {
+            for (ColumnDefinition def : partitionKeyColumns)
+                newColumnMetadata.put(def.name.bytes, def);
+            for (ColumnDefinition def : clusteringColumns)
+                newColumnMetadata.put(def.name.bytes, def);
+            for (ColumnDefinition def : partitionColumns)
+                newColumnMetadata.put(def.name.bytes, def);
+        }
+        this.columnMetadata = newColumnMetadata;
+
+        List<AbstractType<?>> keyTypes = extractTypes(partitionKeyColumns);
+        this.keyValidator = keyTypes.size() == 1 ? keyTypes.get(0) : CompositeType.getInstance(keyTypes);
+
+        if (isSuper())
+            this.comparator = new ClusteringComparator(clusteringColumns.get(0).type);
+        else
+            this.comparator = new ClusteringComparator(extractTypes(clusteringColumns));
+
+        Set<ColumnDefinition> hiddenColumns;
+        if (isCompactTable() && isDense && CompactTables.hasEmptyCompactValue(this))
+        {
+            hiddenColumns = Collections.singleton(compactValueColumn);
+        }
+        else if (isCompactTable() && !isDense && !isSuper)
+        {
+            hiddenColumns = Sets.newHashSetWithExpectedSize(clusteringColumns.size() + 1);
+            hiddenColumns.add(compactValueColumn);
+            hiddenColumns.addAll(clusteringColumns);
+
+        }
+        else
+        {
+            hiddenColumns = Collections.emptySet();
+        }
+        this.hiddenColumns = hiddenColumns;
     }
 
-    public static CFMetaData sparseCFMetaData(String keyspace, String name, AbstractType<?> comp)
+    public Indexes getIndexes()
     {
-        CellNameType cellNameType = CellNames.fromAbstractType(comp, false);
-        return new CFMetaData(keyspace, name, ColumnFamilyType.Standard, cellNameType);
+        return indexes;
     }
 
-    public static CFMetaData denseCFMetaData(String keyspace, String name, AbstractType<?> comp)
+    public static CFMetaData create(String ksName,
+                                    String name,
+                                    UUID cfId,
+                                    boolean isDense,
+                                    boolean isCompound,
+                                    boolean isSuper,
+                                    boolean isCounter,
+                                    boolean isView,
+                                    List<ColumnDefinition> columns,
+                                    IPartitioner partitioner)
     {
-        return denseCFMetaData(keyspace, name, comp, null);
+        List<ColumnDefinition> partitions = new ArrayList<>();
+        List<ColumnDefinition> clusterings = new ArrayList<>();
+        PartitionColumns.Builder builder = PartitionColumns.builder();
+
+        for (ColumnDefinition column : columns)
+        {
+            switch (column.kind)
+            {
+                case PARTITION_KEY:
+                    partitions.add(column);
+                    break;
+                case CLUSTERING:
+                    clusterings.add(column);
+                    break;
+                default:
+                    builder.add(column);
+                    break;
+            }
+        }
+
+        Collections.sort(partitions);
+        Collections.sort(clusterings);
+
+        return new CFMetaData(ksName,
+                              name,
+                              cfId,
+                              isSuper,
+                              isCounter,
+                              isDense,
+                              isCompound,
+                              isView,
+                              partitions,
+                              clusterings,
+                              builder.build(),
+                              partitioner,
+                              null,
+                              null);
     }
 
-    public static AbstractType<?> makeRawAbstractType(AbstractType<?> comparator, AbstractType<?> subComparator)
+    private static List<AbstractType<?>> extractTypes(List<ColumnDefinition> clusteringColumns)
     {
-        return subComparator == null ? comparator : CompositeType.getInstance(Arrays.asList(comparator, subComparator));
+        List<AbstractType<?>> types = new ArrayList<>(clusteringColumns.size());
+        for (ColumnDefinition def : clusteringColumns)
+            types.add(def.type);
+        return types;
     }
 
-    public Map<String, TriggerDefinition> getTriggers()
+    public Set<Flag> flags()
+    {
+        return flags;
+    }
+
+    /**
+     * There is a couple of places in the code where we need a CFMetaData object and don't have one readily available
+     * and know that only the keyspace and name matter. This creates such "fake" metadata. Use only if you know what
+     * you're doing.
+     */
+    public static CFMetaData createFake(String keyspace, String name)
+    {
+        return CFMetaData.Builder.create(keyspace, name).addPartitionKey("key", BytesType.instance).build();
+    }
+
+    public Triggers getTriggers()
     {
         return triggers;
     }
 
+    // Compiles a system metadata
     public static CFMetaData compile(String cql, String keyspace)
     {
         CFStatement parsed = (CFStatement)QueryProcessor.parseStatement(cql);
         parsed.prepareKeyspace(keyspace);
-        CreateTableStatement statement = (CreateTableStatement) parsed.prepare().statement;
-        CFMetaData cfm = newSystemMetadata(keyspace, statement.columnFamily(), "", statement.comparator);
-        statement.applyPropertiesTo(cfm);
-        return cfm.rebuild();
+        CreateTableStatement statement = (CreateTableStatement) ((CreateTableStatement.RawStatement) parsed).prepare(Types.none()).statement;
+
+        return statement.metadataBuilder()
+                        .withId(generateLegacyCfId(keyspace, statement.columnFamily()))
+                        .build()
+                        .params(statement.params())
+                        .readRepairChance(0.0)
+                        .dcLocalReadRepairChance(0.0)
+                        .gcGraceSeconds(0)
+                        .memtableFlushPeriod((int) TimeUnit.HOURS.toMillis(1));
     }
 
     /**
@@ -319,59 +516,59 @@
         return UUID.nameUUIDFromBytes(ArrayUtils.addAll(ksName.getBytes(), cfName.getBytes()));
     }
 
-    private static CFMetaData newSystemMetadata(String keyspace, String cfName, String comment, CellNameType comparator)
+    public CFMetaData reloadIndexMetadataProperties(CFMetaData parent)
     {
-        return new CFMetaData(keyspace, cfName, ColumnFamilyType.Standard, comparator, generateLegacyCfId(keyspace, cfName))
-                             .comment(comment)
-                             .readRepairChance(0)
-                             .dcLocalReadRepairChance(0)
-                             .gcGraceSeconds(0)
-                             .memtableFlushPeriod(3600 * 1000);
+        TableParams.Builder indexParams = TableParams.builder(parent.params);
+
+        // Depends on parent's cache setting, turn on its index CF's cache.
+        // Row caching is never enabled; see CASSANDRA-5732
+        if (parent.params.caching.cacheKeys())
+            indexParams.caching(CachingParams.CACHE_KEYS);
+        else
+            indexParams.caching(CachingParams.CACHE_NOTHING);
+
+        indexParams.readRepairChance(0.0)
+                   .dcLocalReadRepairChance(0.0)
+                   .gcGraceSeconds(0);
+
+        return params(indexParams.build());
     }
 
     /**
-     * Creates CFMetaData for secondary index CF.
-     * Secondary index CF has the same CF ID as parent's.
-     *
-     * @param parent Parent CF where secondary index is created
-     * @param info Column definition containing secondary index definition
-     * @param indexComparator Comparator for secondary index
-     * @return CFMetaData for secondary index
+     * Returns a cached non-compact version of this table. Cached version has to be invalidated
+     * every time the table is rebuilt.
      */
-    public static CFMetaData newIndexMetadata(CFMetaData parent, ColumnDefinition info, CellNameType indexComparator)
+    public CFMetaData asNonCompact()
     {
-        // Depends on parent's cache setting, turn on its index CF's cache.
-        // Row caching is never enabled; see CASSANDRA-5732
-        CachingOptions indexCaching = parent.getCaching().keyCache.isEnabled()
-                                    ? CachingOptions.KEYS_ONLY
-                                    : CachingOptions.NONE;
+        assert isCompactTable() : "Can't get non-compact version of a CQL table";
 
-        return new CFMetaData(parent.ksName, parent.indexColumnFamilyName(info), ColumnFamilyType.Standard, indexComparator, parent.cfId)
-                             .keyValidator(info.type)
-                             .readRepairChance(0.0)
-                             .dcLocalReadRepairChance(0.0)
-                             .gcGraceSeconds(0)
-                             .caching(indexCaching)
-                             .speculativeRetry(parent.speculativeRetry)
-                             .compactionStrategyClass(parent.compactionStrategyClass)
-                             .compactionStrategyOptions(parent.compactionStrategyOptions)
-                             .reloadSecondaryIndexMetadata(parent)
-                             .rebuild();
-    }
+        // Note that this is racy, but re-computing the non-compact copy a few times on first uses isn't a big deal so
+        // we don't bother.
+        if (nonCompactCopy == null)
+        {
+            nonCompactCopy = copyOpts(new CFMetaData(ksName,
+                                                     cfName,
+                                                     cfId,
+                                                     false,
+                                                     isCounter,
+                                                     false,
+                                                     true,
+                                                     isView,
+                                                     copy(partitionKeyColumns),
+                                                     copy(clusteringColumns),
+                                                     copy(partitionColumns),
+                                                     partitioner,
+                                                     superCfKeyColumn,
+                                                     superCfValueColumn),
+                                      this);
+        }
 
-    public CFMetaData reloadSecondaryIndexMetadata(CFMetaData parent)
-    {
-        minCompactionThreshold(parent.minCompactionThreshold);
-        maxCompactionThreshold(parent.maxCompactionThreshold);
-        compactionStrategyClass(parent.compactionStrategyClass);
-        compactionStrategyOptions(parent.compactionStrategyOptions);
-        compressionParameters(parent.compressionParameters);
-        return this;
+        return nonCompactCopy;
     }
 
     public CFMetaData copy()
     {
-        return copyOpts(new CFMetaData(ksName, cfName, cfType, comparator, cfId), this);
+        return copy(cfId);
     }
 
     /**
@@ -382,39 +579,86 @@
      */
     public CFMetaData copy(UUID newCfId)
     {
-        return copyOpts(new CFMetaData(ksName, cfName, cfType, comparator, newCfId), this);
+        return copyOpts(new CFMetaData(ksName,
+                                       cfName,
+                                       newCfId,
+                                       isSuper(),
+                                       isCounter(),
+                                       isDense(),
+                                       isCompound(),
+                                       isView(),
+                                       copy(partitionKeyColumns),
+                                       copy(clusteringColumns),
+                                       copy(partitionColumns),
+                                       partitioner,
+                                       superCfKeyColumn,
+                                       superCfValueColumn),
+                        this);
+    }
+
+    public CFMetaData copy(IPartitioner partitioner)
+    {
+        return copyOpts(new CFMetaData(ksName,
+                                       cfName,
+                                       cfId,
+                                       isSuper,
+                                       isCounter,
+                                       isDense,
+                                       isCompound,
+                                       isView,
+                                       copy(partitionKeyColumns),
+                                       copy(clusteringColumns),
+                                       copy(partitionColumns),
+                                       partitioner,
+                                       superCfKeyColumn,
+                                       superCfValueColumn),
+                        this);
+    }
+
+    public CFMetaData copyWithNewCompactValueType(AbstractType<?> type)
+    {
+        assert isDense && compactValueColumn.type instanceof EmptyType && partitionColumns.size() == 1;
+        return copyOpts(new CFMetaData(ksName,
+                                       cfName,
+                                       cfId,
+                                       isSuper,
+                                       isCounter,
+                                       isDense,
+                                       isCompound,
+                                       isView,
+                                       copy(partitionKeyColumns),
+                                       copy(clusteringColumns),
+                                       PartitionColumns.of(compactValueColumn.withNewType(type)),
+                                       partitioner,
+                                       superCfKeyColumn,
+                                       superCfValueColumn),
+                        this);
+    }
+
+
+    private static List<ColumnDefinition> copy(List<ColumnDefinition> l)
+    {
+        List<ColumnDefinition> copied = new ArrayList<>(l.size());
+        for (ColumnDefinition cd : l)
+            copied.add(cd.copy());
+        return copied;
+    }
+
+    private static PartitionColumns copy(PartitionColumns columns)
+    {
+        PartitionColumns.Builder newColumns = PartitionColumns.builder();
+        for (ColumnDefinition cd : columns)
+            newColumns.add(cd.copy());
+        return newColumns.build();
     }
 
     @VisibleForTesting
     public static CFMetaData copyOpts(CFMetaData newCFMD, CFMetaData oldCFMD)
     {
-        List<ColumnDefinition> clonedColumns = new ArrayList<>(oldCFMD.allColumns().size());
-        for (ColumnDefinition cd : oldCFMD.allColumns())
-            clonedColumns.add(cd.copy());
-
-        return newCFMD.addAllColumnDefinitions(clonedColumns)
-                      .comment(oldCFMD.comment)
-                      .readRepairChance(oldCFMD.readRepairChance)
-                      .dcLocalReadRepairChance(oldCFMD.dcLocalReadRepairChance)
-                      .gcGraceSeconds(oldCFMD.gcGraceSeconds)
-                      .defaultValidator(oldCFMD.defaultValidator)
-                      .keyValidator(oldCFMD.keyValidator)
-                      .minCompactionThreshold(oldCFMD.minCompactionThreshold)
-                      .maxCompactionThreshold(oldCFMD.maxCompactionThreshold)
-                      .compactionStrategyClass(oldCFMD.compactionStrategyClass)
-                      .compactionStrategyOptions(new HashMap<>(oldCFMD.compactionStrategyOptions))
-                      .compressionParameters(oldCFMD.compressionParameters.copy())
-                      .bloomFilterFpChance(oldCFMD.getBloomFilterFpChance())
-                      .caching(oldCFMD.caching)
-                      .defaultTimeToLive(oldCFMD.defaultTimeToLive)
-                      .minIndexInterval(oldCFMD.minIndexInterval)
-                      .maxIndexInterval(oldCFMD.maxIndexInterval)
-                      .speculativeRetry(oldCFMD.speculativeRetry)
-                      .memtableFlushPeriod(oldCFMD.memtableFlushPeriod)
+        return newCFMD.params(oldCFMD.params)
                       .droppedColumns(new HashMap<>(oldCFMD.droppedColumns))
-                      .triggers(new HashMap<>(oldCFMD.triggers))
-                      .isDense(oldCFMD.isDense)
-                      .rebuild();
+                      .triggers(oldCFMD.triggers)
+                      .indexes(oldCFMD.indexes);
     }
 
     /**
@@ -425,28 +669,23 @@
      *
      * @return name of the index ColumnFamily
      */
-    public String indexColumnFamilyName(ColumnDefinition info)
+    public String indexColumnFamilyName(IndexMetadata info)
     {
         // TODO simplify this when info.index_name is guaranteed to be set
-        return cfName + Directories.SECONDARY_INDEX_NAME_SEPARATOR + (info.getIndexName() == null ? ByteBufferUtil.bytesToHex(info.name.bytes) : info.getIndexName());
-    }
-
-    public String getComment()
-    {
-        return comment;
-    }
-
-    public boolean isSuper()
-    {
-        return cfType == ColumnFamilyType.Super;
+        return cfName + Directories.SECONDARY_INDEX_NAME_SEPARATOR + info.name;
     }
 
     /**
-     * The '.' char is the only way to identify if the CFMetadata is for a secondary index
+     * true if this CFS contains secondary index data.
      */
-    public boolean isSecondaryIndex()
+    public boolean isIndex()
     {
-        return cfName.contains(".");
+        return isIndex;
+    }
+
+    public DecoratedKey decorateKey(ByteBuffer key)
+    {
+        return partitioner.decorateKey(key);
     }
 
     public Map<ByteBuffer, ColumnDefinition> getColumnMetadata()
@@ -460,39 +699,26 @@
      */
     public String getParentColumnFamilyName()
     {
-        return isSecondaryIndex() ? cfName.substring(0, cfName.indexOf('.')) : null;
-    }
-
-    public double getReadRepairChance()
-    {
-        return readRepairChance;
-    }
-
-    public double getDcLocalReadRepairChance()
-    {
-        return dcLocalReadRepairChance;
+        return isIndex ? cfName.substring(0, cfName.indexOf('.')) : null;
     }
 
     public ReadRepairDecision newReadRepairDecision()
     {
         double chance = ThreadLocalRandom.current().nextDouble();
-        if (getReadRepairChance() > chance)
+        if (params.readRepairChance > chance)
             return ReadRepairDecision.GLOBAL;
 
-        if (getDcLocalReadRepairChance() > chance)
+        if (params.dcLocalReadRepairChance > chance)
             return ReadRepairDecision.DC_LOCAL;
 
         return ReadRepairDecision.NONE;
     }
 
-    public int getGcGraceSeconds()
+    public AbstractType<?> getColumnDefinitionNameComparator(ColumnDefinition.Kind kind)
     {
-        return gcGraceSeconds;
-    }
-
-    public AbstractType<?> getDefaultValidator()
-    {
-        return defaultValidator;
+        return (isSuper() && kind == ColumnDefinition.Kind.REGULAR) || (isStaticCompactTable() && kind == ColumnDefinition.Kind.STATIC)
+             ? thriftColumnNameType()
+             : UTF8Type.instance;
     }
 
     public AbstractType<?> getKeyValidator()
@@ -500,36 +726,44 @@
         return keyValidator;
     }
 
-    public Integer getMinCompactionThreshold()
-    {
-        return minCompactionThreshold;
-    }
-
-    public Integer getMaxCompactionThreshold()
-    {
-        return maxCompactionThreshold;
-    }
-
-    public CompressionParameters compressionParameters()
-    {
-        return compressionParameters;
-    }
-
     public Collection<ColumnDefinition> allColumns()
     {
         return columnMetadata.values();
     }
 
+    private Iterator<ColumnDefinition> nonPkColumnIterator()
+    {
+        final boolean noNonPkColumns = isCompactTable() && CompactTables.hasEmptyCompactValue(this) && !isSuper();
+        if (noNonPkColumns)
+        {
+            return Collections.<ColumnDefinition>emptyIterator();
+        }
+        else if (isStaticCompactTable())
+        {
+            return partitionColumns.statics.selectOrderIterator();
+        }
+        else if (isSuper())
+        {
+            if (isDense)
+                return Iterators.forArray(superCfKeyColumn, superCfValueColumn);
+            else
+                return Iterators.filter(partitionColumns.iterator(), (c) -> !c.type.isCollection());
+        }
+        else
+            return partitionColumns().selectOrderIterator();
+    }
+
     // An iterator over all column definitions but that respect the order of a SELECT *.
+    // This also hides the clustering/regular columns for a non-CQL3 non-dense table for backward compatibility
+    // sake (those are accessible through thrift but not through CQL currently) and exposes the key and value
+    // columns for supercolumn family.
     public Iterator<ColumnDefinition> allColumnsInSelectOrder()
     {
         return new AbstractIterator<ColumnDefinition>()
         {
             private final Iterator<ColumnDefinition> partitionKeyIter = partitionKeyColumns.iterator();
-            private final Iterator<ColumnDefinition> clusteringIter = clusteringColumns.iterator();
-            private boolean valueDone;
-            private final Iterator<ColumnDefinition> staticIter = staticColumns.iterator();
-            private final Iterator<ColumnDefinition> regularIter = regularColumns.iterator();
+            private final Iterator<ColumnDefinition> clusteringIter = isStaticCompactTable() ? Collections.<ColumnDefinition>emptyIterator() : clusteringColumns.iterator();
+            private final Iterator<ColumnDefinition> otherColumns = nonPkColumnIterator();
 
             protected ColumnDefinition computeNext()
             {
@@ -539,26 +773,16 @@
                 if (clusteringIter.hasNext())
                     return clusteringIter.next();
 
-                if (staticIter.hasNext())
-                    return staticIter.next();
-
-                if (compactValueColumn != null && !valueDone)
-                {
-                    valueDone = true;
-                    // If the compactValueColumn is empty, this means we have a dense table but
-                    // with only a PK. As far as selects are concerned, we should ignore the value.
-                    if (compactValueColumn.name.bytes.hasRemaining())
-                        return compactValueColumn;
-                }
-
-                if (regularIter.hasNext())
-                    return regularIter.next();
-
-                return endOfData();
+                return otherColumns.hasNext() ? otherColumns.next() : endOfData();
             }
         };
     }
 
+    public Iterable<ColumnDefinition> primaryKeyColumns()
+    {
+        return Iterables.concat(partitionKeyColumns, clusteringColumns);
+    }
+
     public List<ColumnDefinition> partitionKeyColumns()
     {
         return partitionKeyColumns;
@@ -569,19 +793,9 @@
         return clusteringColumns;
     }
 
-    public Set<ColumnDefinition> regularColumns()
+    public PartitionColumns partitionColumns()
     {
-        return regularColumns;
-    }
-
-    public Set<ColumnDefinition> staticColumns()
-    {
-        return staticColumns;
-    }
-
-    public Iterable<ColumnDefinition> regularAndStaticColumns()
-    {
-        return Iterables.concat(staticColumns, regularColumns);
+        return partitionColumns;
     }
 
     public ColumnDefinition compactValueColumn()
@@ -589,61 +803,60 @@
         return compactValueColumn;
     }
 
-    // TODO: we could use CType for key validation too to make this unnecessary but
-    // it's unclear it would be a win overall
-    public CType getKeyValidatorAsCType()
+    public ClusteringComparator getKeyValidatorAsClusteringComparator()
     {
-        return keyValidator instanceof CompositeType
-             ? new CompoundCType(((CompositeType) keyValidator).types)
-             : new SimpleCType(keyValidator);
+        boolean isCompound = keyValidator instanceof CompositeType;
+        List<AbstractType<?>> types = isCompound
+                                    ? ((CompositeType) keyValidator).types
+                                    : Collections.<AbstractType<?>>singletonList(keyValidator);
+        return new ClusteringComparator(types);
     }
 
-    public double getBloomFilterFpChance()
+    public static ByteBuffer serializePartitionKey(ClusteringPrefix keyAsClustering)
     {
-        // we disallow bFFPC==null starting in 1.2.1 but tolerated it before that
-        return (bloomFilterFpChance == null || bloomFilterFpChance == 0)
-               ? compactionStrategyClass == LeveledCompactionStrategy.class ? 0.1 : 0.01
-               : bloomFilterFpChance;
+        // TODO: we should stop using Clustering for partition keys. Maybe we can add
+        // a few methods to DecoratedKey so we don't have to (note that while using a Clustering
+        // allows to use buildBound(), it's actually used for partition keys only when every restriction
+        // is an equal, so we could easily create a specific method for keys for that.
+        if (keyAsClustering.size() == 1)
+            return keyAsClustering.get(0);
+
+        ByteBuffer[] values = new ByteBuffer[keyAsClustering.size()];
+        for (int i = 0; i < keyAsClustering.size(); i++)
+            values[i] = keyAsClustering.get(i);
+        return CompositeType.build(values);
     }
 
-    public CachingOptions getCaching()
-    {
-        return caching;
-    }
-
-    public int getMinIndexInterval()
-    {
-        return minIndexInterval;
-    }
-
-    public int getMaxIndexInterval()
-    {
-        return maxIndexInterval;
-    }
-
-    public SpeculativeRetry getSpeculativeRetry()
-    {
-        return speculativeRetry;
-    }
-
-    public int getMemtableFlushPeriod()
-    {
-        return memtableFlushPeriod;
-    }
-
-    public int getDefaultTimeToLive()
-    {
-        return defaultTimeToLive;
-    }
-
-    public Map<ColumnIdentifier, Long> getDroppedColumns()
+    public Map<ByteBuffer, DroppedColumn> getDroppedColumns()
     {
         return droppedColumns;
     }
 
-    public Boolean getIsDense()
+    public ColumnDefinition getDroppedColumnDefinition(ByteBuffer name)
     {
-        return isDense;
+        return getDroppedColumnDefinition(name, false);
+    }
+
+    /**
+     * Returns a "fake" ColumnDefinition corresponding to the dropped column {@code name}
+     * of {@code null} if there is no such dropped column.
+     *
+     * @param name - the column name
+     * @param isStatic - whether the column was a static column, if known
+     */
+    public ColumnDefinition getDroppedColumnDefinition(ByteBuffer name, boolean isStatic)
+    {
+        DroppedColumn dropped = droppedColumns.get(name);
+        if (dropped == null)
+            return null;
+
+        // We need the type for deserialization purpose. If we don't have the type however,
+        // it means that it's a dropped column from before 3.0, and in that case using
+        // BytesType is fine for what we'll be using it for, even if that's a hack.
+        AbstractType<?> type = dropped.type == null ? BytesType.instance : dropped.type;
+        return isStatic
+               ? ColumnDefinition.staticDef(this, name, type)
+               : ColumnDefinition.regularDef(this, name, type);
     }
 
     @Override
@@ -658,32 +871,16 @@
         CFMetaData other = (CFMetaData) o;
 
         return Objects.equal(cfId, other.cfId)
+            && Objects.equal(flags, other.flags)
             && Objects.equal(ksName, other.ksName)
             && Objects.equal(cfName, other.cfName)
-            && Objects.equal(cfType, other.cfType)
+            && Objects.equal(params, other.params)
             && Objects.equal(comparator, other.comparator)
-            && Objects.equal(comment, other.comment)
-            && Objects.equal(readRepairChance, other.readRepairChance)
-            && Objects.equal(dcLocalReadRepairChance, other.dcLocalReadRepairChance)
-            && Objects.equal(gcGraceSeconds, other.gcGraceSeconds)
-            && Objects.equal(defaultValidator, other.defaultValidator)
             && Objects.equal(keyValidator, other.keyValidator)
-            && Objects.equal(minCompactionThreshold, other.minCompactionThreshold)
-            && Objects.equal(maxCompactionThreshold, other.maxCompactionThreshold)
             && Objects.equal(columnMetadata, other.columnMetadata)
-            && Objects.equal(compactionStrategyClass, other.compactionStrategyClass)
-            && Objects.equal(compactionStrategyOptions, other.compactionStrategyOptions)
-            && Objects.equal(compressionParameters, other.compressionParameters)
-            && Objects.equal(getBloomFilterFpChance(), other.getBloomFilterFpChance())
-            && Objects.equal(memtableFlushPeriod, other.memtableFlushPeriod)
-            && Objects.equal(caching, other.caching)
-            && Objects.equal(defaultTimeToLive, other.defaultTimeToLive)
-            && Objects.equal(minIndexInterval, other.minIndexInterval)
-            && Objects.equal(maxIndexInterval, other.maxIndexInterval)
-            && Objects.equal(speculativeRetry, other.speculativeRetry)
             && Objects.equal(droppedColumns, other.droppedColumns)
             && Objects.equal(triggers, other.triggers)
-            && Objects.equal(isDense, other.isDense);
+            && Objects.equal(indexes, other.indexes);
     }
 
     @Override
@@ -693,52 +890,23 @@
             .append(cfId)
             .append(ksName)
             .append(cfName)
-            .append(cfType)
+            .append(flags)
             .append(comparator)
-            .append(comment)
-            .append(readRepairChance)
-            .append(dcLocalReadRepairChance)
-            .append(gcGraceSeconds)
-            .append(defaultValidator)
+            .append(params)
             .append(keyValidator)
-            .append(minCompactionThreshold)
-            .append(maxCompactionThreshold)
             .append(columnMetadata)
-            .append(compactionStrategyClass)
-            .append(compactionStrategyOptions)
-            .append(compressionParameters)
-            .append(getBloomFilterFpChance())
-            .append(memtableFlushPeriod)
-            .append(caching)
-            .append(defaultTimeToLive)
-            .append(minIndexInterval)
-            .append(maxIndexInterval)
-            .append(speculativeRetry)
             .append(droppedColumns)
             .append(triggers)
-            .append(isDense)
+            .append(indexes)
             .toHashCode();
     }
 
-    public AbstractType<?> getValueValidator(CellName cellName)
-    {
-        ColumnDefinition def = getColumnDefinition(cellName);
-        return def == null ? defaultValidator : def.type;
-    }
-
-    /**
-     * Updates this object in place to match the definition in the system schema tables.
-     * @return true if any columns were added, removed, or altered; otherwise, false is returned
-     */
-    public boolean reload()
-    {
-        return apply(LegacySchemaTables.createTableFromName(ksName, cfName));
-    }
-
     /**
      * Updates CFMetaData in-place to match cfm
      *
-     * @return true if any columns were added, removed, or altered; otherwise, false is returned
+     * @return true if any change was made which impacts queries/updates on the table,
+     *         e.g. any columns or indexes were added, removed, or altered; otherwise, false is returned.
+     *         Used to determine whether prepared statements against this table need to be re-prepared.
      * @throws ConfigurationException if ks/cf names or cf ids didn't match
      */
     @VisibleForTesting
@@ -746,69 +914,43 @@
     {
         logger.debug("applying {} to {}", cfm, this);
 
-        validateCompatility(cfm);
+        validateCompatibility(cfm);
 
-        // TODO: this method should probably return a new CFMetaData so that
-        // 1) we can keep comparator final
-        // 2) updates are applied atomically
-        comparator = cfm.comparator;
+        partitionKeyColumns = cfm.partitionKeyColumns;
+        clusteringColumns = cfm.clusteringColumns;
+
+        boolean changeAffectsStatements = !partitionColumns.equals(cfm.partitionColumns);
+        partitionColumns = cfm.partitionColumns;
+        superCfKeyColumn = cfm.superCfKeyColumn;
+        superCfValueColumn = cfm.superCfValueColumn;
+
+        isDense = cfm.isDense;
+        isCompound = cfm.isCompound;
+        isSuper = cfm.isSuper;
+
+        flags = cfm.flags;
+
+        rebuild();
 
         // compaction thresholds are checked by ThriftValidation. We shouldn't be doing
         // validation on the apply path; it's too late for that.
 
-        comment = Strings.nullToEmpty(cfm.comment);
-        readRepairChance = cfm.readRepairChance;
-        dcLocalReadRepairChance = cfm.dcLocalReadRepairChance;
-        gcGraceSeconds = cfm.gcGraceSeconds;
-        defaultValidator = cfm.defaultValidator;
-        keyValidator = cfm.keyValidator;
-        minCompactionThreshold = cfm.minCompactionThreshold;
-        maxCompactionThreshold = cfm.maxCompactionThreshold;
-
-        bloomFilterFpChance = cfm.getBloomFilterFpChance();
-        caching = cfm.caching;
-        minIndexInterval = cfm.minIndexInterval;
-        maxIndexInterval = cfm.maxIndexInterval;
-        memtableFlushPeriod = cfm.memtableFlushPeriod;
-        defaultTimeToLive = cfm.defaultTimeToLive;
-        speculativeRetry = cfm.speculativeRetry;
+        params = cfm.params;
 
         if (!cfm.droppedColumns.isEmpty())
             droppedColumns = cfm.droppedColumns;
 
-        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(columnMetadata, cfm.columnMetadata);
-        // columns that are no longer needed
-        for (ColumnDefinition cd : columnDiff.entriesOnlyOnLeft().values())
-            removeColumnDefinition(cd);
-        // newly added columns
-        for (ColumnDefinition cd : columnDiff.entriesOnlyOnRight().values())
-            addColumnDefinition(cd);
-        // old columns with updated attributes
-        for (ByteBuffer name : columnDiff.entriesDiffering().keySet())
-        {
-            ColumnDefinition oldDef = columnMetadata.get(name);
-            ColumnDefinition def = cfm.columnMetadata.get(name);
-            addOrReplaceColumnDefinition(oldDef.apply(def));
-        }
-
-        compactionStrategyClass = cfm.compactionStrategyClass;
-        compactionStrategyOptions = cfm.compactionStrategyOptions;
-
-        compressionParameters = cfm.compressionParameters;
-
         triggers = cfm.triggers;
 
-        isDense(cfm.isDense);
+        changeAffectsStatements |= !indexes.equals(cfm.indexes);
+        indexes = cfm.indexes;
 
-        rebuild();
         logger.debug("application result is {}", this);
 
-        return !columnDiff.entriesOnlyOnLeft().isEmpty() ||
-               !columnDiff.entriesOnlyOnRight().isEmpty() ||
-               !columnDiff.entriesDiffering().isEmpty();
+        return changeAffectsStatements;
     }
 
-    public void validateCompatility(CFMetaData cfm) throws ConfigurationException
+    public void validateCompatibility(CFMetaData cfm) throws ConfigurationException
     {
         // validate
         if (!cfm.ksName.equals(ksName))
@@ -820,66 +962,27 @@
         if (!cfm.cfId.equals(cfId))
             throw new ConfigurationException(String.format("Column family ID mismatch (found %s; expected %s)",
                                                            cfm.cfId, cfId));
-
-        if (cfm.cfType != cfType)
-            throw new ConfigurationException(String.format("Column family types do not match (found %s; expected %s).", cfm.cfType, cfType));
-
-        if (!cfm.comparator.isCompatibleWith(comparator))
-            throw new ConfigurationException(String.format("Column family comparators do not match or are not compatible (found %s; expected %s).", cfm.comparator.getClass().getSimpleName(), comparator.getClass().getSimpleName()));
     }
 
-    public static void validateCompactionOptions(Class<? extends AbstractCompactionStrategy> strategyClass, Map<String, String> options) throws ConfigurationException
-    {
-        try
-        {
-            if (options == null)
-                return;
-
-            Map<?,?> unknownOptions = (Map) strategyClass.getMethod("validateOptions", Map.class).invoke(null, options);
-            if (!unknownOptions.isEmpty())
-                throw new ConfigurationException(String.format("Properties specified %s are not understood by %s", unknownOptions.keySet(), strategyClass.getSimpleName()));
-        }
-        catch (NoSuchMethodException e)
-        {
-            logger.warn("Compaction Strategy {} does not have a static validateOptions method. Validation ignored", strategyClass.getName());
-        }
-        catch (InvocationTargetException e)
-        {
-            if (e.getTargetException() instanceof ConfigurationException)
-                throw (ConfigurationException) e.getTargetException();
-            throw new ConfigurationException("Failed to validate compaction options: " + options);
-        }
-        catch (ConfigurationException e)
-        {
-            throw e;
-        }
-        catch (Exception e)
-        {
-            throw new ConfigurationException("Failed to validate compaction options: " + options);
-        }
-    }
 
     public static Class<? extends AbstractCompactionStrategy> createCompactionStrategy(String className) throws ConfigurationException
     {
         className = className.contains(".") ? className : "org.apache.cassandra.db.compaction." + className;
         Class<AbstractCompactionStrategy> strategyClass = FBUtilities.classForName(className, "compaction strategy");
-        if (className.equals(WrappingCompactionStrategy.class.getName()))
-            throw new ConfigurationException("You can't set WrappingCompactionStrategy as the compaction strategy!");
         if (!AbstractCompactionStrategy.class.isAssignableFrom(strategyClass))
             throw new ConfigurationException(String.format("Specified compaction strategy class (%s) is not derived from AbstractReplicationStrategy", className));
 
         return strategyClass;
     }
 
-    public static AbstractCompactionStrategy createCompactionStrategyInstance(Class<? extends AbstractCompactionStrategy> compactionStrategyClass,
-                                                                              ColumnFamilyStore cfs,
-                                                                              Map<String, String> compactionStrategyOptions)
+    public static AbstractCompactionStrategy createCompactionStrategyInstance(ColumnFamilyStore cfs,
+                                                                              CompactionParams compactionParams)
     {
         try
         {
             Constructor<? extends AbstractCompactionStrategy> constructor =
-                compactionStrategyClass.getConstructor(ColumnFamilyStore.class, Map.class);
-            return constructor.newInstance(cfs, compactionStrategyOptions);
+                compactionParams.klass().getConstructor(ColumnFamilyStore.class, Map.class);
+            return constructor.newInstance(cfs, compactionParams.options());
         }
         catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException e)
         {
@@ -887,18 +990,12 @@
         }
     }
 
-    @Deprecated
-    public AbstractCompactionStrategy createCompactionStrategyInstance(ColumnFamilyStore cfs)
-    {
-        return createCompactionStrategyInstance(compactionStrategyClass, cfs, compactionStrategyOptions);
-    }
-
     /**
      * Returns the ColumnDefinition for {@code name}.
      */
     public ColumnDefinition getColumnDefinition(ColumnIdentifier name)
     {
-        return columnMetadata.get(name.bytes);
+        return getColumnDefinition(name.bytes);
     }
 
     // In general it is preferable to work with ColumnIdentifier to make it
@@ -910,98 +1007,16 @@
         return columnMetadata.get(name);
     }
 
-    /**
-     * Returns a ColumnDefinition given a cell name.
-     */
-    public ColumnDefinition getColumnDefinition(CellName cellName)
+    // Returns only columns that are supposed to be visible through CQL layer
+    public ColumnDefinition getColumnDefinitionForCQL(ColumnIdentifier name)
     {
-        ColumnIdentifier id = cellName.cql3ColumnName(this);
-        ColumnDefinition def = id == null
-                             ? getColumnDefinition(cellName.toByteBuffer())  // Means a dense layout, try the full column name
-                             : getColumnDefinition(id);
-
-        // It's possible that the def is a PRIMARY KEY or COMPACT_VALUE one in case a concrete cell
-        // name conflicts with a CQL column name, which can happen in 2 cases:
-        // 1) because the user inserted a cell through Thrift that conflicts with a default "alias" used
-        //    by CQL for thrift tables (see #6892).
-        // 2) for COMPACT STORAGE tables with a single utf8 clustering column, the cell name can be anything,
-        //    including a CQL column name (without this being a problem).
-        // In any case, this is fine, this just mean that columnDefinition is not the ColumnDefinition we are
-        // looking for.
-        return def != null && def.isPartOfCellName() ? def : null;
+        return getColumnDefinitionForCQL(name.bytes);
     }
 
-    public ColumnDefinition getColumnDefinitionForIndex(String indexName)
+    public ColumnDefinition getColumnDefinitionForCQL(ByteBuffer name)
     {
-        for (ColumnDefinition def : allColumns())
-        {
-            if (indexName.equals(def.getIndexName()))
-                return def;
-        }
-        return null;
-    }
-
-    /**
-     * Convert a null index_name to appropriate default name according to column status
-     */
-    public void addDefaultIndexNames() throws ConfigurationException
-    {
-        // if this is ColumnFamily update we need to add previously defined index names to the existing columns first
-        UUID cfId = Schema.instance.getId(ksName, cfName);
-        if (cfId != null)
-        {
-            CFMetaData cfm = Schema.instance.getCFMetaData(cfId);
-
-            for (ColumnDefinition newDef : allColumns())
-            {
-                if (!cfm.columnMetadata.containsKey(newDef.name.bytes) || newDef.getIndexType() == null)
-                    continue;
-
-                String oldIndexName = cfm.getColumnDefinition(newDef.name).getIndexName();
-
-                if (oldIndexName == null)
-                    continue;
-
-                if (newDef.getIndexName() != null && !oldIndexName.equals(newDef.getIndexName()))
-                    throw new ConfigurationException("Can't modify index name: was '" + oldIndexName + "' changed to '" + newDef.getIndexName() + "'.");
-
-                newDef.setIndexName(oldIndexName);
-            }
-        }
-
-        Set<String> existingNames = existingIndexNames(null);
-        for (ColumnDefinition column : allColumns())
-        {
-            if (column.getIndexType() != null && column.getIndexName() == null)
-            {
-                String baseName = getDefaultIndexName(cfName, column.name);
-                String indexName = baseName;
-                int i = 0;
-                while (existingNames.contains(indexName))
-                    indexName = baseName + '_' + (++i);
-                column.setIndexName(indexName);
-            }
-        }
-    }
-
-    public static String getDefaultIndexName(String cfName, ColumnIdentifier columnName)
-    {
-        return (cfName + "_" + columnName + "_idx").replaceAll("\\W", "");
-    }
-
-    public Iterator<OnDiskAtom> getOnDiskIterator(FileDataInput in, Version version)
-    {
-        return getOnDiskIterator(in, ColumnSerializer.Flag.LOCAL, Integer.MIN_VALUE, version);
-    }
-
-    public Iterator<OnDiskAtom> getOnDiskIterator(FileDataInput in, ColumnSerializer.Flag flag, int expireBefore, Version version)
-    {
-        return version.getSSTableFormat().getOnDiskIterator(in, flag, expireBefore, this, version);
-    }
-
-    public AtomDeserializer getOnDiskDeserializer(DataInput in, Version version)
-    {
-        return new AtomDeserializer(comparator, in, ColumnSerializer.Flag.LOCAL, Integer.MIN_VALUE, version);
+        ColumnDefinition cd = getColumnDefinition(name);
+        return hiddenColumns.contains(cd) ? null : cd;
     }
 
     public static boolean isNameValid(String name)
@@ -1009,11 +1024,6 @@
         return name != null && !name.isEmpty() && name.length() <= Schema.NAME_LENGTH && name.matches("\\w+");
     }
 
-    public static boolean isIndexNameValid(String name)
-    {
-        return name != null && !name.isEmpty() && name.matches("\\w+");
-    }
-
     public CFMetaData validate() throws ConfigurationException
     {
         rebuild();
@@ -1023,8 +1033,7 @@
         if (!isNameValid(cfName))
             throw new ConfigurationException(String.format("ColumnFamily name must not be empty, more than %s characters long, or contain non-alphanumeric-underscore characters (got \"%s\")", Schema.NAME_LENGTH, cfName));
 
-        if (cfType == null)
-            throw new ConfigurationException(String.format("Invalid column family type for %s", cfName));
+        params.validate();
 
         for (int i = 0; i < comparator.size(); i++)
         {
@@ -1035,11 +1044,11 @@
             throw new ConfigurationException("CounterColumnType is not a valid key validator");
 
         // Mixing counter with non counter columns is not supported (#2614)
-        if (defaultValidator instanceof CounterColumnType)
+        if (isCounter())
         {
-            for (ColumnDefinition def : regularAndStaticColumns())
-                if (!(def.type instanceof CounterColumnType))
-                    throw new ConfigurationException("Cannot add a non counter column (" + def.name + ") in a counter column family");
+            for (ColumnDefinition def : partitionColumns())
+                if (!(def.type instanceof CounterColumnType) && (!isSuper() || isSuperColumnValueColumn(def)))
+                    throw new ConfigurationException("Cannot add a non counter column (" + def + ") in a counter column family");
         }
         else
         {
@@ -1048,122 +1057,38 @@
                     throw new ConfigurationException("Cannot add a counter column (" + def.name + ") in a non counter column family");
         }
 
+        if (!indexes.isEmpty() && isSuper())
+            throw new ConfigurationException("Secondary indexes are not supported on super column families");
+
         // initialize a set of names NOT in the CF under consideration
-        Set<String> indexNames = existingIndexNames(cfName);
-        for (ColumnDefinition c : allColumns())
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ksName);
+
+        Set<String> indexNames = ksm == null ? new HashSet<>() : ksm.existingIndexNames(cfName);
+        for (IndexMetadata index : indexes)
         {
-            if (c.getIndexType() == null)
-            {
-                if (c.getIndexName() != null)
-                    throw new ConfigurationException("Index name cannot be set without index type");
-            }
-            else
-            {
-                if (cfType == ColumnFamilyType.Super)
-                    throw new ConfigurationException("Secondary indexes are not supported on super column families");
-                if (!isIndexNameValid(c.getIndexName()))
-                    throw new ConfigurationException("Illegal index name " + c.getIndexName());
-                // check index names against this CF _and_ globally
-                if (indexNames.contains(c.getIndexName()))
-                    throw new ConfigurationException("Duplicate index name " + c.getIndexName());
-                indexNames.add(c.getIndexName());
+            // check index names against this CF _and_ globally
+            if (indexNames.contains(index.name))
+                throw new ConfigurationException("Duplicate index name " + index.name);
+            indexNames.add(index.name);
 
-                if (c.getIndexType() == IndexType.CUSTOM)
-                {
-                    if (c.getIndexOptions() == null || !c.hasIndexOption(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME))
-                        throw new ConfigurationException("Required index option missing: " + SecondaryIndex.CUSTOM_INDEX_OPTION_NAME);
-                }
-
-                // This method validates the column metadata but does not intialize the index
-                SecondaryIndex.createInstance(null, c);
-            }
+            index.validate(this);
         }
 
-        validateCompactionThresholds();
-
-        if (bloomFilterFpChance != null && bloomFilterFpChance == 0)
-            throw new ConfigurationException("Zero false positives is impossible; bloom filter false positive chance bffpc must be 0 < bffpc <= 1");
-
-        validateIndexIntervalThresholds();
-
         return this;
     }
 
-    private static Set<String> existingIndexNames(String cfToExclude)
+    // The comparator to validate the definition name with thrift.
+    public AbstractType<?> thriftColumnNameType()
     {
-        Set<String> indexNames = new HashSet<>();
-        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-            if (cfToExclude == null || !cfs.name.equals(cfToExclude))
-                for (ColumnDefinition cd : cfs.metadata.allColumns())
-                    indexNames.add(cd.getIndexName());
-        return indexNames;
-    }
-
-    private void validateCompactionThresholds() throws ConfigurationException
-    {
-        if (maxCompactionThreshold == 0)
+        if (isSuper())
         {
-            logger.warn("Disabling compaction by setting max or min compaction has been deprecated, " +
-                    "set the compaction strategy option 'enabled' to 'false' instead");
-            return;
+            ColumnDefinition def = compactValueColumn();
+            assert def != null && def.type instanceof MapType;
+            return ((MapType)def.type).nameComparator();
         }
 
-        if (minCompactionThreshold <= 1)
-            throw new ConfigurationException(String.format("Min compaction threshold cannot be less than 2 (got %d).", minCompactionThreshold));
-
-        if (minCompactionThreshold > maxCompactionThreshold)
-            throw new ConfigurationException(String.format("Min compaction threshold (got %d) cannot be greater than max compaction threshold (got %d)",
-                                                            minCompactionThreshold, maxCompactionThreshold));
-    }
-
-    private void validateIndexIntervalThresholds() throws ConfigurationException
-    {
-        if (minIndexInterval <= 0)
-            throw new ConfigurationException(String.format("Min index interval must be greater than 0 (got %d).", minIndexInterval));
-        if (maxIndexInterval < minIndexInterval)
-            throw new ConfigurationException(String.format("Max index interval (%d) must be greater than the min index " +
-                                                           "interval (%d).", maxIndexInterval, minIndexInterval));
-    }
-
-    public boolean isPurged()
-    {
-        return isPurged;
-    }
-
-    void markPurged()
-    {
-        isPurged = true;
-    }
-
-    // The comparator to validate the definition name.
-
-    public AbstractType<?> getColumnDefinitionComparator(ColumnDefinition def)
-    {
-        return getComponentComparator(def.isOnAllComponents() ? null : def.position(), def.kind);
-    }
-
-    public AbstractType<?> getComponentComparator(Integer componentIndex, ColumnDefinition.Kind kind)
-    {
-        switch (kind)
-        {
-            case REGULAR:
-                if (componentIndex == null)
-                    return comparator.asAbstractType();
-
-                AbstractType<?> t = comparator.subtype(componentIndex);
-                assert t != null : "Non-sensical component index";
-                return t;
-            default:
-                // CQL3 column names are UTF8
-                return UTF8Type.instance;
-        }
-    }
-
-    public CFMetaData addAllColumnDefinitions(Collection<ColumnDefinition> defs)
-    {
-        for (ColumnDefinition def : defs)
-            addOrReplaceColumnDefinition(def);
-        return this;
+        assert isStaticCompactTable();
+        return clusteringColumns.get(0).type;
     }
 
     public CFMetaData addColumnDefinition(ColumnDefinition def) throws ConfigurationException
@@ -1178,252 +1103,128 @@
     // know this cannot happen.
     public CFMetaData addOrReplaceColumnDefinition(ColumnDefinition def)
     {
-        if (def.kind == ColumnDefinition.Kind.REGULAR)
-            comparator.addCQL3Column(def.name);
-        columnMetadata.put(def.name.bytes, def);
+        // Adds the definition and rebuild what is necessary. We could call rebuild() but it's not too hard to
+        // only rebuild the necessary bits.
+        switch (def.kind)
+        {
+            case PARTITION_KEY:
+                partitionKeyColumns.set(def.position(), def);
+                break;
+            case CLUSTERING:
+                clusteringColumns.set(def.position(), def);
+                break;
+            case REGULAR:
+            case STATIC:
+                PartitionColumns.Builder builder = PartitionColumns.builder();
+                for (ColumnDefinition column : partitionColumns)
+                    if (!column.name.equals(def.name))
+                        builder.add(column);
+                builder.add(def);
+                partitionColumns = builder.build();
+                // If dense, we must have modified the compact value since that's the only one we can have.
+                if (isDense())
+                    this.compactValueColumn = def;
+                break;
+        }
+        this.columnMetadata.put(def.name.bytes, def);
         return this;
     }
 
     public boolean removeColumnDefinition(ColumnDefinition def)
     {
-        if (def.kind == ColumnDefinition.Kind.REGULAR)
-            comparator.removeCQL3Column(def.name);
-        return columnMetadata.remove(def.name.bytes) != null;
+        assert !def.isPartitionKey();
+        boolean removed = columnMetadata.remove(def.name.bytes) != null;
+        if (removed)
+            partitionColumns = partitionColumns.without(def);
+        return removed;
     }
 
-    public void addTriggerDefinition(TriggerDefinition def) throws InvalidRequestException
+    /**
+     * Adds the column definition as a dropped column, recording the drop with the provided timestamp.
+     */
+    public void recordColumnDrop(ColumnDefinition def, long deleteTimestamp)
     {
-        if (containsTriggerDefinition(def))
-            throw new InvalidRequestException(
-                String.format("Cannot create trigger %s, a trigger with the same name already exists", def.name));
-        triggers.put(def.name, def);
+        recordColumnDrop(def, deleteTimestamp, true);
     }
 
-    public boolean containsTriggerDefinition(TriggerDefinition def)
+    @VisibleForTesting
+    public void recordColumnDrop(ColumnDefinition def, long deleteTimestamp, boolean preserveKind)
     {
-        return triggers.containsKey(def.name);
-    }
-
-    public boolean removeTrigger(String name)
-    {
-        return triggers.remove(name) != null;
-    }
-
-    public void recordColumnDrop(ColumnDefinition def)
-    {
-        assert !def.isOnAllComponents();
-        droppedColumns.put(def.name, FBUtilities.timestampMicros());
+        droppedColumns.put(def.name.bytes, new DroppedColumn(def.name.toString(), preserveKind ? def.kind : null, def.type, deleteTimestamp));
     }
 
     public void renameColumn(ColumnIdentifier from, ColumnIdentifier to) throws InvalidRequestException
     {
-        ColumnDefinition def = getColumnDefinition(from);
+        ColumnDefinition def = getColumnDefinitionForCQL(from);
+
         if (def == null)
             throw new InvalidRequestException(String.format("Cannot rename unknown column %s in keyspace %s", from, cfName));
 
         if (getColumnDefinition(to) != null)
             throw new InvalidRequestException(String.format("Cannot rename column %s to %s in keyspace %s; another column of that name already exist", from, to, cfName));
 
-        if (def.isPartOfCellName())
+        if (def.isPartOfCellName(isCQLTable(), isSuper()) && !isDense())
         {
             throw new InvalidRequestException(String.format("Cannot rename non PRIMARY KEY part %s", from));
         }
-        else if (def.isIndexed())
+
+        if (!getIndexes().isEmpty())
         {
-            throw new InvalidRequestException(String.format("Cannot rename column %s because it is secondary indexed", from));
+            ColumnFamilyStore store = Keyspace.openAndGetStore(this);
+            Set<IndexMetadata> dependentIndexes = store.indexManager.getDependentIndexes(def);
+            if (!dependentIndexes.isEmpty())
+                throw new InvalidRequestException(String.format("Cannot rename column %s because it has " +
+                                                                "dependent secondary indexes (%s)",
+                                                                from,
+                                                                dependentIndexes.stream()
+                                                                                .map(i -> i.name)
+                                                                                .collect(Collectors.joining(","))));
         }
 
-        ColumnDefinition newDef = def.withNewName(to);
-        // don't call addColumnDefinition/removeColumnDefition because we want to avoid recomputing
-        // the CQL3 cfDef between those two operation
-        addOrReplaceColumnDefinition(newDef);
-        removeColumnDefinition(def);
-    }
-
-    public CFMetaData rebuild()
-    {
-        if (isDense == null)
-            isDense(calculateIsDense(comparator.asAbstractType(), allColumns()));
-
-        List<ColumnDefinition> pkCols = nullInitializedList(keyValidator.componentsCount());
-        List<ColumnDefinition> ckCols = nullInitializedList(comparator.clusteringPrefixSize());
-        // We keep things sorted to get consistent/predictable order in select queries
-        SortedSet<ColumnDefinition> regCols = new TreeSet<>(regularColumnComparator);
-        SortedSet<ColumnDefinition> statCols = new TreeSet<>(regularColumnComparator);
-        ColumnDefinition compactCol = null;
-
-        for (ColumnDefinition def : allColumns())
+        if (isSuper() && isDense())
         {
-            switch (def.kind)
+            if (isSuperColumnKeyColumn(def))
             {
-                case PARTITION_KEY:
-                    assert !(def.isOnAllComponents() && keyValidator instanceof CompositeType);
-                    pkCols.set(def.position(), def);
-                    break;
-                case CLUSTERING_COLUMN:
-                    assert !(def.isOnAllComponents() && comparator.isCompound());
-                    ckCols.set(def.position(), def);
-                    break;
-                case REGULAR:
-                    regCols.add(def);
-                    break;
-                case STATIC:
-                    statCols.add(def);
-                    break;
-                case COMPACT_VALUE:
-                    assert compactCol == null : "There shouldn't be more than one compact value defined: got " + compactCol + " and " + def;
-                    compactCol = def;
-                    break;
+                columnMetadata.remove(superCfKeyColumn.name.bytes);
+                superCfKeyColumn = superCfKeyColumn.withNewName(to);
+                columnMetadata.put(superCfKeyColumn.name.bytes, SuperColumnCompatibility.getSuperCfSschemaRepresentation(superCfKeyColumn));
             }
-        }
-
-        // Now actually assign the correct value. This is not atomic, but then again, updating CFMetaData is never atomic anyway.
-        partitionKeyColumns = addDefaultKeyAliases(pkCols);
-        clusteringColumns = addDefaultColumnAliases(ckCols);
-        regularColumns = regCols;
-        staticColumns = statCols;
-        compactValueColumn = addDefaultValueAlias(compactCol);
-        return this;
-    }
-
-    private List<ColumnDefinition> addDefaultKeyAliases(List<ColumnDefinition> pkCols)
-    {
-        for (int i = 0; i < pkCols.size(); i++)
-        {
-            if (pkCols.get(i) == null)
+            else if (isSuperColumnValueColumn(def))
             {
-                Integer idx = null;
-                AbstractType<?> type = keyValidator;
-                if (keyValidator instanceof CompositeType)
-                {
-                    idx = i;
-                    type = ((CompositeType)keyValidator).types.get(i);
-                }
-                // For compatibility sake, we call the first alias 'key' rather than 'key1'. This
-                // is inconsistent with column alias, but it's probably not worth risking breaking compatibility now.
-                ByteBuffer name = ByteBufferUtil.bytes(i == 0 ? DEFAULT_KEY_ALIAS : DEFAULT_KEY_ALIAS + (i + 1));
-                ColumnDefinition newDef = ColumnDefinition.partitionKeyDef(this, name, type, idx);
-                addOrReplaceColumnDefinition(newDef);
-                pkCols.set(i, newDef);
+                columnMetadata.remove(superCfValueColumn.name.bytes);
+                superCfValueColumn = superCfValueColumn.withNewName(to);
+                columnMetadata.put(superCfValueColumn.name.bytes, superCfValueColumn);
             }
-        }
-        return pkCols;
-    }
-
-    private List<ColumnDefinition> addDefaultColumnAliases(List<ColumnDefinition> ckCols)
-    {
-        for (int i = 0; i < ckCols.size(); i++)
-        {
-            if (ckCols.get(i) == null)
-            {
-                Integer idx;
-                AbstractType<?> type;
-                if (comparator.isCompound())
-                {
-                    idx = i;
-                    type = comparator.subtype(i);
-                }
-                else
-                {
-                    idx = null;
-                    type = comparator.asAbstractType();
-                }
-                ByteBuffer name = ByteBufferUtil.bytes(DEFAULT_COLUMN_ALIAS + (i + 1));
-                ColumnDefinition newDef = ColumnDefinition.clusteringKeyDef(this, name, type, idx);
-                addOrReplaceColumnDefinition(newDef);
-                ckCols.set(i, newDef);
-            }
-        }
-        return ckCols;
-    }
-
-    private ColumnDefinition addDefaultValueAlias(ColumnDefinition compactValueDef)
-    {
-        if (comparator.isDense())
-        {
-            if (compactValueDef != null)
-                return compactValueDef;
-
-            ColumnDefinition newDef = ColumnDefinition.compactValueDef(this, ByteBufferUtil.bytes(DEFAULT_VALUE_ALIAS), defaultValidator);
-            addOrReplaceColumnDefinition(newDef);
-            return newDef;
+            else
+                addOrReplaceColumnDefinition(def.withNewName(to));
         }
         else
         {
-            assert compactValueDef == null;
-            return null;
-        }
-    }
-
-    /*
-     * We call dense a CF for which each component of the comparator is a clustering column, i.e. no
-     * component is used to store a regular column names. In other words, non-composite static "thrift"
-     * and CQL3 CF are *not* dense.
-     * We save whether the table is dense or not during table creation through CQL, but we don't have this
-     * information for table just created through thrift, nor for table prior to CASSANDRA-7744, so this
-     * method does its best to infer whether the table is dense or not based on other elements.
-     */
-    public static boolean calculateIsDense(AbstractType<?> comparator, Collection<ColumnDefinition> defs)
-    {
-        /*
-         * As said above, this method is only here because we need to deal with thrift upgrades.
-         * Once a CF has been "upgraded", i.e. we've rebuilt and save its CQL3 metadata at least once,
-         * then we'll have saved the "is_dense" value and will be good to go.
-         *
-         * But non-upgraded thrift CF (and pre-7744 CF) will have no value for "is_dense", so we need
-         * to infer that information without relying on it in that case. And for the most part this is
-         * easy, a CF that has at least one REGULAR definition is not dense. But the subtlety is that not
-         * having a REGULAR definition may not mean dense because of CQL3 definitions that have only the
-         * PRIMARY KEY defined.
-         *
-         * So we need to recognize those special case CQL3 table with only a primary key. If we have some
-         * clustering columns, we're fine as said above. So the only problem is that we cannot decide for
-         * sure if a CF without REGULAR columns nor CLUSTERING_COLUMN definition is meant to be dense, or if it
-         * has been created in CQL3 by say:
-         *    CREATE TABLE test (k int PRIMARY KEY)
-         * in which case it should not be dense. However, we can limit our margin of error by assuming we are
-         * in the latter case only if the comparator is exactly CompositeType(UTF8Type).
-         */
-        boolean hasRegular = false;
-        int maxClusteringIdx = -1;
-        for (ColumnDefinition def : defs)
-        {
-            switch (def.kind)
-            {
-                case CLUSTERING_COLUMN:
-                    maxClusteringIdx = Math.max(maxClusteringIdx, def.position());
-                    break;
-                case REGULAR:
-                    hasRegular = true;
-                    break;
-            }
+            addOrReplaceColumnDefinition(def.withNewName(to));
         }
 
-        return maxClusteringIdx >= 0
-             ? maxClusteringIdx == comparator.componentsCount() - 1
-             : !hasRegular && !isCQL3OnlyPKComparator(comparator);
+
+        // removeColumnDefinition doesn't work for partition key (expectedly) but renaming one is fine so we still
+        // want to update columnMetadata.
+        if (def.isPartitionKey())
+            columnMetadata.remove(def.name.bytes);
+        else
+            removeColumnDefinition(def);
     }
 
-    private static boolean isCQL3OnlyPKComparator(AbstractType<?> comparator)
+    public boolean isCQLTable()
     {
-        if (!(comparator instanceof CompositeType))
-            return false;
-
-        CompositeType ct = (CompositeType)comparator;
-        return ct.types.size() == 1 && ct.types.get(0) instanceof UTF8Type;
+        return !isSuper() && !isDense() && isCompound();
     }
 
-    public boolean isCQL3Table()
+    public boolean isCompactTable()
     {
-        return !isSuper() && !comparator.isDense() && comparator.isCompound();
+        return !isCQLTable();
     }
 
-    private static <T> List<T> nullInitializedList(int size)
+    public boolean isStaticCompactTable()
     {
-        List<T> l = new ArrayList<>(size);
-        for (int i = 0; i < size; ++i)
-            l.add(null);
-        return l;
+        return !isSuper() && !isDense() && !isCompound();
     }
 
     /**
@@ -1431,36 +1232,111 @@
      */
     public boolean isThriftCompatible()
     {
-        // Super CF are always "thrift compatible". But since they may have defs with a componentIndex != null,
-        // we have to special case here.
-        if (isSuper())
-            return true;
-
-        for (ColumnDefinition def : allColumns())
-        {
-            // Non-REGULAR ColumnDefinition are not "thrift compatible" per-se, but it's ok because they hold metadata
-            // this is only of use to CQL3, so we will just skip them in toThrift.
-            if (def.kind == ColumnDefinition.Kind.REGULAR && !def.isThriftCompatible())
-                return false;
-        }
-
-        // The table might also have no REGULAR columns (be PK-only), but still be "thrift incompatible". See #7832.
-        if (isCQL3OnlyPKComparator(comparator.asAbstractType()) && !isDense)
-            return false;
-
-        return true;
-    }
-
-    public boolean isCounter()
-    {
-        return defaultValidator.isCounter();
+        return isCompactTable();
     }
 
     public boolean hasStaticColumns()
     {
-        return !staticColumns.isEmpty();
+        return !partitionColumns.statics.isEmpty();
     }
 
+    public boolean hasCollectionColumns()
+    {
+        for (ColumnDefinition def : partitionColumns())
+            if (def.type instanceof CollectionType && def.type.isMultiCell())
+                return true;
+        return false;
+    }
+
+    public boolean hasComplexColumns()
+    {
+        for (ColumnDefinition def : partitionColumns())
+            if (def.isComplex())
+                return true;
+        return false;
+    }
+
+    public boolean hasDroppedCollectionColumns()
+    {
+        for (DroppedColumn def : getDroppedColumns().values())
+            if (def.type instanceof CollectionType && def.type.isMultiCell())
+                return true;
+        return false;
+    }
+
+    public boolean isSuper()
+    {
+        return isSuper;
+    }
+
+    public boolean isCounter()
+    {
+        return isCounter;
+    }
+
+    // We call dense a CF for which each component of the comparator is a clustering column, i.e. no
+    // component is used to store a regular column names. In other words, non-composite static "thrift"
+    // and CQL3 CF are *not* dense.
+    public boolean isDense()
+    {
+        return isDense;
+    }
+
+    public boolean isCompound()
+    {
+        return isCompound;
+    }
+
+    public boolean isView()
+    {
+        return isView;
+    }
+
+    /**
+     * A table with strict liveness filters/ignores rows without PK liveness info,
+     * effectively tying the row liveness to its primary key liveness.
+     *
+     * Currently this is only used by views with normal base column as PK column
+     * so updates to other columns do not make the row live when the base column
+     * is not live. See CASSANDRA-11500.
+     */
+    public boolean enforceStrictLiveness()
+    {
+        return isView && Keyspace.open(ksName).viewManager.getByName(cfName).enforceStrictLiveness();
+    }
+
+    public Serializers serializers()
+    {
+        return serializers;
+    }
+
+    public AbstractType<?> makeLegacyDefaultValidator()
+    {
+        if (isCounter())
+            return CounterColumnType.instance;
+        else if (isCompactTable())
+            return isSuper() ? ((MapType)compactValueColumn().type).valueComparator() : compactValueColumn().type;
+        else
+            return BytesType.instance;
+    }
+
+    public static Set<Flag> flagsFromStrings(Set<String> strings)
+    {
+        return strings.stream()
+                      .map(String::toUpperCase)
+                      .map(Flag::valueOf)
+                      .collect(Collectors.toSet());
+    }
+
+    public static Set<String> flagsToStrings(Set<Flag> flags)
+    {
+        return flags.stream()
+                    .map(Flag::toString)
+                    .map(String::toLowerCase)
+                    .collect(Collectors.toSet());
+    }
+
+
     @Override
     public String toString()
     {
@@ -1468,30 +1344,279 @@
             .append("cfId", cfId)
             .append("ksName", ksName)
             .append("cfName", cfName)
-            .append("cfType", cfType)
+            .append("flags", flags)
+            .append("params", params)
             .append("comparator", comparator)
-            .append("comment", comment)
-            .append("readRepairChance", readRepairChance)
-            .append("dcLocalReadRepairChance", dcLocalReadRepairChance)
-            .append("gcGraceSeconds", gcGraceSeconds)
-            .append("defaultValidator", defaultValidator)
+            .append("partitionColumns", partitionColumns)
+            .append("partitionKeyColumns", partitionKeyColumns)
+            .append("clusteringColumns", clusteringColumns)
             .append("keyValidator", keyValidator)
-            .append("minCompactionThreshold", minCompactionThreshold)
-            .append("maxCompactionThreshold", maxCompactionThreshold)
             .append("columnMetadata", columnMetadata.values())
-            .append("compactionStrategyClass", compactionStrategyClass)
-            .append("compactionStrategyOptions", compactionStrategyOptions)
-            .append("compressionParameters", compressionParameters.asThriftOptions())
-            .append("bloomFilterFpChance", getBloomFilterFpChance())
-            .append("memtableFlushPeriod", memtableFlushPeriod)
-            .append("caching", caching)
-            .append("defaultTimeToLive", defaultTimeToLive)
-            .append("minIndexInterval", minIndexInterval)
-            .append("maxIndexInterval", maxIndexInterval)
-            .append("speculativeRetry", speculativeRetry)
             .append("droppedColumns", droppedColumns)
-            .append("triggers", triggers.values())
-            .append("isDense", isDense)
+            .append("triggers", triggers)
+            .append("indexes", indexes)
             .toString();
     }
+
+    public static class Builder
+    {
+        private final String keyspace;
+        private final String table;
+        private final boolean isDense;
+        private final boolean isCompound;
+        private final boolean isSuper;
+        private final boolean isCounter;
+        private final boolean isView;
+        private Optional<IPartitioner> partitioner;
+
+        private UUID tableId;
+
+        private final List<Pair<ColumnIdentifier, AbstractType>> partitionKeys = new ArrayList<>();
+        private final List<Pair<ColumnIdentifier, AbstractType>> clusteringColumns = new ArrayList<>();
+        private final List<Pair<ColumnIdentifier, AbstractType>> staticColumns = new ArrayList<>();
+        private final List<Pair<ColumnIdentifier, AbstractType>> regularColumns = new ArrayList<>();
+
+        private Builder(String keyspace, String table, boolean isDense, boolean isCompound, boolean isSuper, boolean isCounter, boolean isView)
+        {
+            this.keyspace = keyspace;
+            this.table = table;
+            this.isDense = isDense;
+            this.isCompound = isCompound;
+            this.isSuper = isSuper;
+            this.isCounter = isCounter;
+            this.isView = isView;
+            this.partitioner = Optional.empty();
+        }
+
+        public static Builder create(String keyspace, String table)
+        {
+            return create(keyspace, table, false, true, false);
+        }
+
+        public static Builder create(String keyspace, String table, boolean isDense, boolean isCompound, boolean isCounter)
+        {
+            return create(keyspace, table, isDense, isCompound, false, isCounter);
+        }
+
+        public static Builder create(String keyspace, String table, boolean isDense, boolean isCompound, boolean isSuper, boolean isCounter)
+        {
+            return new Builder(keyspace, table, isDense, isCompound, isSuper, isCounter, false);
+        }
+
+        public static Builder createView(String keyspace, String table)
+        {
+            return new Builder(keyspace, table, false, true, false, false, true);
+        }
+
+        public static Builder createDense(String keyspace, String table, boolean isCompound, boolean isCounter)
+        {
+            return create(keyspace, table, true, isCompound, isCounter);
+        }
+
+        public static Builder createSuper(String keyspace, String table, boolean isCounter)
+        {
+            return create(keyspace, table, true, true, true, isCounter);
+        }
+
+        public Builder withPartitioner(IPartitioner partitioner)
+        {
+            this.partitioner = Optional.ofNullable(partitioner);
+            return this;
+        }
+
+        public Builder withId(UUID tableId)
+        {
+            this.tableId = tableId;
+            return this;
+        }
+
+        public Builder addPartitionKey(String name, AbstractType type)
+        {
+            return addPartitionKey(ColumnIdentifier.getInterned(name, false), type);
+        }
+
+        public Builder addPartitionKey(ColumnIdentifier name, AbstractType type)
+        {
+            this.partitionKeys.add(Pair.create(name, type));
+            return this;
+        }
+
+        public Builder addClusteringColumn(String name, AbstractType type)
+        {
+            return addClusteringColumn(ColumnIdentifier.getInterned(name, false), type);
+        }
+
+        public Builder addClusteringColumn(ColumnIdentifier name, AbstractType type)
+        {
+            this.clusteringColumns.add(Pair.create(name, type));
+            return this;
+        }
+
+        public Builder addRegularColumn(String name, AbstractType type)
+        {
+            return addRegularColumn(ColumnIdentifier.getInterned(name, false), type);
+        }
+
+        public Builder addRegularColumn(ColumnIdentifier name, AbstractType type)
+        {
+            this.regularColumns.add(Pair.create(name, type));
+            return this;
+        }
+
+        public boolean hasRegulars()
+        {
+            return !this.regularColumns.isEmpty();
+        }
+
+        public Builder addStaticColumn(String name, AbstractType type)
+        {
+            return addStaticColumn(ColumnIdentifier.getInterned(name, false), type);
+        }
+
+        public Builder addStaticColumn(ColumnIdentifier name, AbstractType type)
+        {
+            this.staticColumns.add(Pair.create(name, type));
+            return this;
+        }
+
+        public Set<String> usedColumnNames()
+        {
+            Set<String> usedNames = new HashSet<>();
+            for (Pair<ColumnIdentifier, AbstractType> p : partitionKeys)
+                usedNames.add(p.left.toString());
+            for (Pair<ColumnIdentifier, AbstractType> p : clusteringColumns)
+                usedNames.add(p.left.toString());
+            for (Pair<ColumnIdentifier, AbstractType> p : staticColumns)
+                usedNames.add(p.left.toString());
+            for (Pair<ColumnIdentifier, AbstractType> p : regularColumns)
+                usedNames.add(p.left.toString());
+            return usedNames;
+        }
+
+        public CFMetaData build()
+        {
+            if (tableId == null)
+                tableId = UUIDGen.getTimeUUID();
+
+            List<ColumnDefinition> partitions = new ArrayList<>(partitionKeys.size());
+            List<ColumnDefinition> clusterings = new ArrayList<>(clusteringColumns.size());
+            PartitionColumns.Builder builder = PartitionColumns.builder();
+
+            for (int i = 0; i < partitionKeys.size(); i++)
+            {
+                Pair<ColumnIdentifier, AbstractType> p = partitionKeys.get(i);
+                partitions.add(new ColumnDefinition(keyspace, table, p.left, p.right, i, ColumnDefinition.Kind.PARTITION_KEY));
+            }
+
+            for (int i = 0; i < clusteringColumns.size(); i++)
+            {
+                Pair<ColumnIdentifier, AbstractType> p = clusteringColumns.get(i);
+                clusterings.add(new ColumnDefinition(keyspace, table, p.left, p.right, i, ColumnDefinition.Kind.CLUSTERING));
+            }
+
+            for (Pair<ColumnIdentifier, AbstractType> p : regularColumns)
+                builder.add(new ColumnDefinition(keyspace, table, p.left, p.right, ColumnDefinition.NO_POSITION, ColumnDefinition.Kind.REGULAR));
+
+            for (Pair<ColumnIdentifier, AbstractType> p : staticColumns)
+                builder.add(new ColumnDefinition(keyspace, table, p.left, p.right, ColumnDefinition.NO_POSITION, ColumnDefinition.Kind.STATIC));
+
+            return new CFMetaData(keyspace,
+                                  table,
+                                  tableId,
+                                  isSuper,
+                                  isCounter,
+                                  isDense,
+                                  isCompound,
+                                  isView,
+                                  partitions,
+                                  clusterings,
+                                  builder.build(),
+                                  partitioner.orElseGet(DatabaseDescriptor::getPartitioner),
+                                  null,
+                                  null);
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(CFMetaData metadata, DataOutputPlus out, int version) throws IOException
+        {
+            UUIDSerializer.serializer.serialize(metadata.cfId, out, version);
+        }
+
+        public CFMetaData deserialize(DataInputPlus in, int version) throws IOException
+        {
+            UUID cfId = UUIDSerializer.serializer.deserialize(in, version);
+            CFMetaData metadata = Schema.instance.getCFMetaData(cfId);
+            if (metadata == null)
+            {
+                String message = String.format("Couldn't find table for cfId %s. If a table was just " +
+                        "created, this is likely due to the schema not being fully propagated.  Please wait for schema " +
+                        "agreement on table creation.", cfId);
+                throw new UnknownColumnFamilyException(message, cfId);
+            }
+
+            return metadata;
+        }
+
+        public long serializedSize(CFMetaData metadata, int version)
+        {
+            return UUIDSerializer.serializer.serializedSize(metadata.cfId, version);
+        }
+    }
+
+    public static class DroppedColumn
+    {
+        // we only allow dropping REGULAR columns, from CQL-native tables, so the names are always of UTF8Type
+        public final String name;
+        public final AbstractType<?> type;
+
+        // drop timestamp, in microseconds, yet with millisecond granularity
+        public final long droppedTime;
+
+        @Nullable
+        public final ColumnDefinition.Kind kind;
+
+        public DroppedColumn(String name, ColumnDefinition.Kind kind, AbstractType<?> type, long droppedTime)
+        {
+            this.name = name;
+            this.kind = kind;
+            this.type = type;
+            this.droppedTime = droppedTime;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o)
+                return true;
+
+            if (!(o instanceof DroppedColumn))
+                return false;
+
+            DroppedColumn dc = (DroppedColumn) o;
+
+            return name.equals(dc.name)
+                && kind == dc.kind
+                && type.equals(dc.type)
+                && droppedTime == dc.droppedTime;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(name, kind, type, droppedTime);
+        }
+
+        @Override
+        public String toString()
+        {
+            return MoreObjects.toStringHelper(this)
+                              .add("name", name)
+                              .add("kind", kind)
+                              .add("type", type)
+                              .add("droppedTime", droppedTime)
+                              .toString();
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/config/ColumnDefinition.java b/src/java/org/apache/cassandra/config/ColumnDefinition.java
index b33718f..6f7f749 100644
--- a/src/java/org/apache/cassandra/config/ColumnDefinition.java
+++ b/src/java/org/apache/cassandra/config/ColumnDefinition.java

@@ -23,87 +23,123 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Function;
 import com.google.common.base.Objects;
-import com.google.common.collect.Lists;
+import com.google.common.collect.Collections2;
 
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.serializers.MarshalException;
+import org.github.jamm.Unmetered;
 
-public class ColumnDefinition extends ColumnSpecification
+@Unmetered
+public class ColumnDefinition extends ColumnSpecification implements Comparable<ColumnDefinition>
 {
+    public static final Comparator<Object> asymmetricColumnDataComparator =
+        (a, b) -> ((ColumnData) a).column().compareTo((ColumnDefinition) b);
+
+    public static final int NO_POSITION = -1;
+
+    public enum ClusteringOrder
+    {
+        ASC, DESC, NONE
+    }
+
     /*
      * The type of CQL3 column this definition represents.
-     * There is 3 main type of CQL3 columns: those parts of the partition key,
-     * those parts of the clustering key and the other, regular ones.
-     * But when COMPACT STORAGE is used, there is by design only one regular
-     * column, whose name is not stored in the data contrarily to the column of
-     * type REGULAR. Hence the COMPACT_VALUE type to distinguish it below.
+     * There is 4 main type of CQL3 columns: those parts of the partition key,
+     * those parts of the clustering columns and amongst the others, regular and
+     * static ones.
      *
      * Note that thrift only knows about definitions of type REGULAR (and
-     * the ones whose componentIndex == null).
+     * the ones whose position == NO_POSITION (-1)).
      */
     public enum Kind
     {
+        // NOTE: if adding a new type, must modify comparisonOrder
         PARTITION_KEY,
-        CLUSTERING_COLUMN,
+        CLUSTERING,
         REGULAR,
-        STATIC,
-        COMPACT_VALUE
+        STATIC;
+
+        public boolean isPrimaryKeyKind()
+        {
+            return this == PARTITION_KEY || this == CLUSTERING;
+        }
+
     }
 
     public final Kind kind;
 
-    private String indexName;
-    private IndexType indexType;
-    private Map<String,String> indexOptions;
-
     /*
-     * If the column comparator is a composite type, indicates to which
-     * component this definition refers to. If null, the definition refers to
-     * the full column name.
+     * If the column is a partition key or clustering column, its position relative to
+     * other columns of the same kind. Otherwise,  NO_POSITION (-1).
+     *
+     * Note that partition key and clustering columns are numbered separately so
+     * the first clustering column is 0.
      */
-    private final Integer componentIndex;
+    private final int position;
 
-    public static ColumnDefinition partitionKeyDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    private final Comparator<CellPath> cellPathComparator;
+    private final Comparator<Object> asymmetricCellPathComparator;
+    private final Comparator<? super Cell> cellComparator;
+
+    /**
+     * These objects are compared frequently, so we encode several of their comparison components
+     * into a single long value so that this can be done efficiently
+     */
+    private final long comparisonOrder;
+
+    private static long comparisonOrder(Kind kind, boolean isComplex, long position, ColumnIdentifier name)
     {
-        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.PARTITION_KEY);
+        assert position >= 0 && position < 1 << 12;
+        return   (((long) kind.ordinal()) << 61)
+               | (isComplex ? 1L << 60 : 0)
+               | (position << 48)
+               | (name.prefixComparison >>> 16);
     }
 
-    public static ColumnDefinition partitionKeyDef(String ksName, String cfName, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition partitionKeyDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> type, int position)
     {
-        return new ColumnDefinition(ksName, cfName, new ColumnIdentifier(name, UTF8Type.instance), validator, null, null, null, componentIndex, Kind.PARTITION_KEY);
+        return new ColumnDefinition(cfm, name, type, position, Kind.PARTITION_KEY);
     }
 
-    public static ColumnDefinition clusteringKeyDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition partitionKeyDef(String ksName, String cfName, String name, AbstractType<?> type, int position)
     {
-        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.CLUSTERING_COLUMN);
+        return new ColumnDefinition(ksName, cfName, ColumnIdentifier.getInterned(name, true), type, position, Kind.PARTITION_KEY);
     }
 
-    public static ColumnDefinition regularDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition clusteringDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> type, int position)
     {
-        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.REGULAR);
+        return new ColumnDefinition(cfm, name, type, position, Kind.CLUSTERING);
     }
 
-    public static ColumnDefinition staticDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex)
+    public static ColumnDefinition clusteringDef(String ksName, String cfName, String name, AbstractType<?> type, int position)
     {
-        return new ColumnDefinition(cfm, name, validator, componentIndex, Kind.STATIC);
+        return new ColumnDefinition(ksName, cfName, ColumnIdentifier.getInterned(name, true),  type, position, Kind.CLUSTERING);
     }
 
-    public static ColumnDefinition compactValueDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator)
+    public static ColumnDefinition regularDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> type)
     {
-        return new ColumnDefinition(cfm, name, validator, null, Kind.COMPACT_VALUE);
+        return new ColumnDefinition(cfm, name, type, NO_POSITION, Kind.REGULAR);
     }
 
-    public ColumnDefinition(CFMetaData cfm, ByteBuffer name, AbstractType<?> validator, Integer componentIndex, Kind kind)
+    public static ColumnDefinition regularDef(String ksName, String cfName, String name, AbstractType<?> type)
+    {
+        return new ColumnDefinition(ksName, cfName, ColumnIdentifier.getInterned(name, true), type, NO_POSITION, Kind.REGULAR);
+    }
+
+    public static ColumnDefinition staticDef(CFMetaData cfm, ByteBuffer name, AbstractType<?> type)
+    {
+        return new ColumnDefinition(cfm, name, type, NO_POSITION, Kind.STATIC);
+    }
+
+    public ColumnDefinition(CFMetaData cfm, ByteBuffer name, AbstractType<?> type, int position, Kind kind)
     {
         this(cfm.ksName,
              cfm.cfName,
-             new ColumnIdentifier(name, cfm.getComponentComparator(componentIndex, kind)),
-             validator,
-             null,
-             null,
-             null,
-             componentIndex,
+             ColumnIdentifier.getInterned(name, cfm.getColumnDefinitionNameComparator(kind)),
+             type,
+             position,
              kind);
     }
 
@@ -111,39 +147,62 @@
     public ColumnDefinition(String ksName,
                             String cfName,
                             ColumnIdentifier name,
-                            AbstractType<?> validator,
-                            IndexType indexType,
-                            Map<String, String> indexOptions,
-                            String indexName,
-                            Integer componentIndex,
+                            AbstractType<?> type,
+                            int position,
                             Kind kind)
     {
-        super(ksName, cfName, name, validator);
-        assert name != null && validator != null;
+        super(ksName, cfName, name, type);
+        assert name != null && type != null && kind != null;
+        assert (position == NO_POSITION) == !kind.isPrimaryKeyKind(); // The position really only make sense for partition and clustering columns (and those must have one),
+                                                                      // so make sure we don't sneak it for something else since it'd breaks equals()
         this.kind = kind;
-        this.indexName = indexName;
-        this.componentIndex = componentIndex;
-        this.setIndexType(indexType, indexOptions);
+        this.position = position;
+        this.cellPathComparator = makeCellPathComparator(kind, type);
+        this.cellComparator = cellPathComparator == null ? ColumnData.comparator : (a, b) -> cellPathComparator.compare(a.path(), b.path());
+        this.asymmetricCellPathComparator = cellPathComparator == null ? null : (a, b) -> cellPathComparator.compare(((Cell)a).path(), (CellPath) b);
+        this.comparisonOrder = comparisonOrder(kind, isComplex(), Math.max(0, position), name);
+    }
+
+    private static Comparator<CellPath> makeCellPathComparator(Kind kind, AbstractType<?> type)
+    {
+        if (kind.isPrimaryKeyKind() || !type.isCollection() || !type.isMultiCell())
+            return null;
+
+        CollectionType collection = (CollectionType) type;
+
+        return new Comparator<CellPath>()
+        {
+            public int compare(CellPath path1, CellPath path2)
+            {
+                if (path1.size() == 0 || path2.size() == 0)
+                {
+                    if (path1 == CellPath.BOTTOM)
+                        return path2 == CellPath.BOTTOM ? 0 : -1;
+                    if (path1 == CellPath.TOP)
+                        return path2 == CellPath.TOP ? 0 : 1;
+                    return path2 == CellPath.BOTTOM ? 1 : -1;
+                }
+
+                // This will get more complicated once we have non-frozen UDT and nested collections
+                assert path1.size() == 1 && path2.size() == 1;
+                return collection.nameComparator().compare(path1.get(0), path2.get(0));
+            }
+        };
     }
 
     public ColumnDefinition copy()
     {
-        return new ColumnDefinition(ksName, cfName, name, type, indexType, indexOptions, indexName, componentIndex, kind);
+        return new ColumnDefinition(ksName, cfName, name, type, position, kind);
     }
 
     public ColumnDefinition withNewName(ColumnIdentifier newName)
     {
-        return new ColumnDefinition(ksName, cfName, newName, type, indexType, indexOptions, indexName, componentIndex, kind);
+        return new ColumnDefinition(ksName, cfName, newName, type, position, kind);
     }
 
     public ColumnDefinition withNewType(AbstractType<?> newType)
     {
-        return new ColumnDefinition(ksName, cfName, name, newType, indexType, indexOptions, indexName, componentIndex, kind);
-    }
-
-    public boolean isOnAllComponents()
-    {
-        return componentIndex == null;
+        return new ColumnDefinition(ksName, cfName, name, newType, position, kind);
     }
 
     public boolean isPartitionKey()
@@ -153,7 +212,7 @@
 
     public boolean isClusteringColumn()
     {
-        return kind == Kind.CLUSTERING_COLUMN;
+        return kind == Kind.CLUSTERING;
     }
 
     public boolean isStatic()
@@ -166,17 +225,17 @@
         return kind == Kind.REGULAR;
     }
 
-    public boolean isCompactValue()
+    public ClusteringOrder clusteringOrder()
     {
-        return kind == Kind.COMPACT_VALUE;
+        if (!isClusteringColumn())
+            return ClusteringOrder.NONE;
+
+        return type.isReversed() ? ClusteringOrder.DESC : ClusteringOrder.ASC;
     }
 
-    // The componentIndex. This never return null however for convenience sake:
-    // if componentIndex == null, this return 0. So caller should first check
-    // isOnAllComponents() to distinguish if that's a possibility.
     public int position()
     {
-        return componentIndex == null ? 0 : componentIndex;
+        return position;
     }
 
     @Override
@@ -195,16 +254,13 @@
             && Objects.equal(name, cd.name)
             && Objects.equal(type, cd.type)
             && Objects.equal(kind, cd.kind)
-            && Objects.equal(componentIndex, cd.componentIndex)
-            && Objects.equal(indexName, cd.indexName)
-            && Objects.equal(indexType, cd.indexType)
-            && Objects.equal(indexOptions, cd.indexOptions);
+            && Objects.equal(position, cd.position);
     }
 
     @Override
     public int hashCode()
     {
-        return Objects.hashCode(ksName, cfName, name, type, kind, componentIndex, indexName, indexType, indexOptions);
+        return Objects.hashCode(ksName, cfName, name, type, kind, position);
     }
 
     @Override
@@ -214,106 +270,30 @@
                       .add("name", name)
                       .add("type", type)
                       .add("kind", kind)
-                      .add("componentIndex", componentIndex)
-                      .add("indexName", indexName)
-                      .add("indexType", indexType)
+                      .add("position", position)
                       .toString();
     }
 
-    public boolean isThriftCompatible()
-    {
-        return kind == ColumnDefinition.Kind.REGULAR && componentIndex == null;
-    }
-
     public boolean isPrimaryKeyColumn()
     {
-        return kind == Kind.PARTITION_KEY || kind == Kind.CLUSTERING_COLUMN;
+        return kind.isPrimaryKeyKind();
     }
 
     /**
      * Whether the name of this definition is serialized in the cell nane, i.e. whether
      * it's not just a non-stored CQL metadata.
      */
-    public boolean isPartOfCellName()
+    public boolean isPartOfCellName(boolean isCQL3Table, boolean isSuper)
     {
-        return kind == Kind.REGULAR || kind == Kind.STATIC;
-    }
-
-    public ColumnDefinition apply(ColumnDefinition def)  throws ConfigurationException
-    {
-        assert kind == def.kind && Objects.equal(componentIndex, def.componentIndex);
-
-        if (getIndexType() != null && def.getIndexType() != null)
-        {
-            // If an index is set (and not drop by this update), the validator shouldn't be change to a non-compatible one
-            // (and we want true comparator compatibility, not just value one, since the validator is used by LocalPartitioner to order index rows)
-            if (!def.type.isCompatibleWith(type))
-                throw new ConfigurationException(String.format("Cannot modify validator to a non-order-compatible one for column %s since an index is set", name));
-
-            assert getIndexName() != null;
-            if (!getIndexName().equals(def.getIndexName()))
-                throw new ConfigurationException("Cannot modify index name: " + def.getIndexName());
-        }
-
-        return new ColumnDefinition(ksName,
-                                    cfName,
-                                    name,
-                                    def.type,
-                                    def.getIndexType(),
-                                    def.getIndexOptions(),
-                                    def.getIndexName(),
-                                    componentIndex,
-                                    kind);
-    }
-
-    public String getIndexName()
-    {
-        return indexName;
-    }
-
-    public ColumnDefinition setIndexName(String indexName)
-    {
-        this.indexName = indexName;
-        return this;
-    }
-
-    public ColumnDefinition setIndexType(IndexType indexType, Map<String,String> indexOptions)
-    {
-        this.indexType = indexType;
-        this.indexOptions = indexOptions;
-        return this;
-    }
-
-    public ColumnDefinition setIndex(String indexName, IndexType indexType, Map<String,String> indexOptions)
-    {
-        return setIndexName(indexName).setIndexType(indexType, indexOptions);
-    }
-
-    public boolean isIndexed()
-    {
-        return indexType != null;
-    }
-
-    public IndexType getIndexType()
-    {
-        return indexType;
-    }
-
-    public Map<String,String> getIndexOptions()
-    {
-        return indexOptions;
-    }
-
-    /**
-     * Checks if the index option with the specified name has been specified.
-     *
-     * @param name index option name
-     * @return <code>true</code> if the index option with the specified name has been specified, <code>false</code>
-     * otherwise.
-     */
-    public boolean hasIndexOption(String name)
-    {
-        return indexOptions != null && indexOptions.containsKey(name);
+        // When converting CQL3 tables to thrift, any regular or static column ends up in the cell name.
+        // When it's a compact table however, the REGULAR definition is the name for the cell value of "dynamic"
+        // column (so it's not part of the cell name) and it's static columns that ends up in the cell name.
+        if (isCQL3Table)
+            return kind == Kind.REGULAR || kind == Kind.STATIC;
+        else if (isSuper)
+            return kind == Kind.REGULAR;
+        else
+            return kind == Kind.STATIC;
     }
 
     /**
@@ -322,9 +302,9 @@
      * @param definitions the column definitions to convert.
      * @return the column identifiers corresponding to the specified definitions
      */
-    public static List<ColumnIdentifier> toIdentifiers(List<ColumnDefinition> definitions)
+    public static Collection<ColumnIdentifier> toIdentifiers(Collection<ColumnDefinition> definitions)
     {
-        return Lists.transform(definitions, new Function<ColumnDefinition, ColumnIdentifier>()
+        return Collections2.transform(definitions, new Function<ColumnDefinition, ColumnIdentifier>()
         {
             @Override
             public ColumnIdentifier apply(ColumnDefinition columnDef)
@@ -333,4 +313,101 @@
             }
         });
     }
+
+    public int compareTo(ColumnDefinition other)
+    {
+        if (this == other)
+            return 0;
+
+        if (comparisonOrder != other.comparisonOrder)
+            return Long.compare(comparisonOrder, other.comparisonOrder);
+
+        return this.name.compareTo(other.name);
+    }
+
+    public Comparator<CellPath> cellPathComparator()
+    {
+        return cellPathComparator;
+    }
+
+    public Comparator<Object> asymmetricCellPathComparator()
+    {
+        return asymmetricCellPathComparator;
+    }
+
+    public Comparator<? super Cell> cellComparator()
+    {
+        return cellComparator;
+    }
+
+    public boolean isComplex()
+    {
+        return cellPathComparator != null;
+    }
+
+    public boolean isSimple()
+    {
+        return !isComplex();
+    }
+
+    public CellPath.Serializer cellPathSerializer()
+    {
+        // Collections are our only complex so far, so keep it simple
+        return CollectionType.cellPathSerializer;
+    }
+
+    public void validateCellValue(ByteBuffer value)
+    {
+        type.validateCellValue(value);
+    }
+
+    public void validateCellPath(CellPath path)
+    {
+        if (!isComplex())
+            throw new MarshalException("Only complex cells should have a cell path");
+
+        assert type instanceof CollectionType;
+        ((CollectionType)type).nameComparator().validate(path.get(0));
+    }
+
+    public static String toCQLString(Iterable<ColumnDefinition> defs)
+    {
+        return toCQLString(defs.iterator());
+    }
+
+    public static String toCQLString(Iterator<ColumnDefinition> defs)
+    {
+        if (!defs.hasNext())
+            return "";
+
+        StringBuilder sb = new StringBuilder();
+        sb.append(defs.next().name);
+        while (defs.hasNext())
+            sb.append(", ").append(defs.next().name);
+        return sb.toString();
+    }
+
+    /**
+     * The type of the cell values for cell belonging to this column.
+     *
+     * This is the same than the column type, except for non-frozen collections where it's the 'valueComparator'
+     * of the collection.
+     * 
+     * This method should not be used to get value type of non-frozon UDT.
+     */
+    public AbstractType<?> cellValueType()
+    {
+        assert !(type instanceof UserType && type.isMultiCell());
+        return type instanceof CollectionType && type.isMultiCell()
+                ? ((CollectionType)type).valueComparator()
+                : type;
+    }
+
+
+    public boolean isCounterColumn()
+    {
+        if (type instanceof CollectionType) // for thrift
+            return ((CollectionType) type).valueComparator().isCounter();
+        return type.isCounter();
+    }
 }

diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index ab79a08..6003bd1 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java

@@ -17,8 +17,6 @@
  */
 package org.apache.cassandra.config;
 
-import java.io.IOException;
-import java.io.StringReader;
 import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
 import java.util.ArrayList;
@@ -27,7 +25,7 @@
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.concurrent.TimeUnit;
-import com.google.common.base.Supplier;
+import java.util.function.Supplier;
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.Sets;
@@ -37,9 +35,6 @@
 
 import org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions;
 import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.supercsv.io.CsvListReader;
-import org.supercsv.prefs.CsvPreference;
 
 /**
  * A class that contains configuration properties for the cassandra node it runs within.
@@ -71,10 +66,10 @@
     public String partitioner;
 
     public Boolean auto_bootstrap = true;
-    public volatile boolean hinted_handoff_enabled_global = true;
-    public String hinted_handoff_enabled;
-    public Set<String> hinted_handoff_enabled_by_dc = Sets.newConcurrentHashSet();
+    public volatile boolean hinted_handoff_enabled = true;
+    public Set<String> hinted_handoff_disabled_datacenters = Sets.newConcurrentHashSet();
     public volatile Integer max_hint_window_in_ms = 3 * 3600 * 1000; // three hours
+    public String hints_directory;
 
     public ParameterizedClass seed_provider;
     public DiskAccessMode disk_access_mode = DiskAccessMode.auto;
@@ -85,6 +80,8 @@
     /* initial token in the ring */
     public String initial_token;
     public Integer num_tokens = 1;
+    /** Triggers automatic allocation of tokens if set, using the replication strategy of the referenced keyspace */
+    public String allocate_tokens_for_keyspace = null;
 
     public volatile Long request_timeout_in_ms = 10000L;
 
@@ -109,6 +106,7 @@
     public Integer concurrent_reads = 32;
     public Integer concurrent_writes = 32;
     public Integer concurrent_counter_writes = 32;
+    public Integer concurrent_materialized_view_writes = 32;
 
     @Deprecated
     public Integer concurrent_replicates = null;
@@ -118,6 +116,9 @@
     public Integer memtable_offheap_space_in_mb;
     public Float memtable_cleanup_threshold = null;
 
+    // Limit the maximum depth of repair session merkle trees
+    public volatile Integer repair_session_max_tree_depth = 18;
+
     public Integer storage_port = 7000;
     public Integer ssl_storage_port = 7001;
     public String listen_address;
@@ -147,13 +148,24 @@
 
     public Boolean start_native_transport = false;
     public Integer native_transport_port = 9042;
+    public Integer native_transport_port_ssl = null;
     public Integer native_transport_max_threads = 128;
     public Integer native_transport_max_frame_size_in_mb = 256;
     public volatile Long native_transport_max_concurrent_connections = -1L;
     public volatile Long native_transport_max_concurrent_connections_per_ip = -1L;
+    public boolean native_transport_flush_in_batches_legacy = true;
+    public volatile long native_transport_max_concurrent_requests_in_bytes_per_ip = -1L;
+    public volatile long native_transport_max_concurrent_requests_in_bytes = -1L;
+    public Integer native_transport_max_negotiable_protocol_version = Integer.MIN_VALUE;
 
     @Deprecated
     public Integer thrift_max_message_length_in_mb = 16;
+    /**
+     * Max size of values in SSTables, in MegaBytes.
+     * Default is the same as the native protocol frame limit: 256Mb.
+     * See AbstractType for how it is used.
+     */
+    public Integer max_value_size_in_mb = 256;
 
     public Integer thrift_framed_transport_size_in_mb = 15;
     public Boolean snapshot_before_compaction = false;
@@ -167,6 +179,7 @@
     public Integer concurrent_compactors;
     public volatile Integer compaction_throughput_mb_per_sec = 16;
     public volatile Integer compaction_large_partition_warning_threshold_mb = 100;
+    public Integer min_free_space_per_drive_in_mb = 50;
 
     /**
      * @deprecated retry support removed on CASSANDRA-10992
@@ -190,7 +203,9 @@
     public int commitlog_segment_size_in_mb = 32;
     public ParameterizedClass commitlog_compression;
     public int commitlog_max_compression_buffers_in_pool = 3;
- 
+
+    public Integer max_mutation_size_in_kb;
+
     @Deprecated
     public int commitlog_periodic_queue_size = -1;
 
@@ -216,7 +231,10 @@
 
     public int hinted_handoff_throttle_in_kb = 1024;
     public int batchlog_replay_throttle_in_kb = 1024;
-    public int max_hints_delivery_threads = 1;
+    public int max_hints_delivery_threads = 2;
+    public int hints_flush_period_in_ms = 10000;
+    public int max_hints_file_size_in_mb = 128;
+    public ParameterizedClass hints_compression;
     public int sstable_preemptive_open_interval_in_mb = 50;
 
     public volatile boolean incremental_backups = false;
@@ -236,13 +254,18 @@
     public volatile int counter_cache_save_period = 7200;
     public volatile int counter_cache_keys_to_save = Integer.MAX_VALUE;
 
-    @Deprecated
-    public String memory_allocator;
-
     private static boolean isClientMode = false;
     private static Supplier<Config> overrideLoadConfig = null;
 
-    public Integer file_cache_size_in_mb;
+    public Integer file_cache_size_in_mb = 512;
+
+    public boolean buffer_pool_use_heap_if_exhausted = true;
+
+    public DiskOptimizationStrategy disk_optimization_strategy = DiskOptimizationStrategy.ssd;
+
+    public double disk_optimization_estimate_percentile = 0.95;
+
+    public double disk_optimization_page_cross_chance = 0.1;
 
     public boolean inter_dc_tcp_nodelay = true;
 
@@ -263,9 +286,6 @@
     public int gc_log_threshold_in_ms = 200;
     public int gc_warn_threshold_in_ms = 0;
 
-    private static final CsvPreference STANDARD_SURROUNDING_SPACES_NEED_QUOTES = new CsvPreference.Builder(CsvPreference.STANDARD_PREFERENCE)
-                                                                                                  .surroundingSpacesNeedQuotes(true).build();
-
     // TTL for different types of trace events.
     public int tracetype_query_ttl = (int) TimeUnit.DAYS.toSeconds(1);
     public int tracetype_repair_ttl = (int) TimeUnit.DAYS.toSeconds(7);
@@ -287,9 +307,49 @@
     public int otc_coalescing_window_us = otc_coalescing_window_us_default;
     public int otc_coalescing_enough_coalesced_messages = 8;
 
+    /**
+     * Backlog expiration interval in milliseconds for the OutboundTcpConnection.
+     */
+    public static final int otc_backlog_expiration_interval_ms_default = 200;
+    public volatile int otc_backlog_expiration_interval_ms = otc_backlog_expiration_interval_ms_default;
+ 
     public int windows_timer_interval = 0;
 
     public boolean enable_user_defined_functions = false;
+    public boolean enable_scripted_user_defined_functions = false;
+
+    public boolean enable_materialized_views = true;
+
+    /**
+     * Optionally disable asynchronous UDF execution.
+     * Disabling asynchronous UDF execution also implicitly disables the security-manager!
+     * By default, async UDF execution is enabled to be able to detect UDFs that run too long / forever and be
+     * able to fail fast - i.e. stop the Cassandra daemon, which is currently the only appropriate approach to
+     * "tell" a user that there's something really wrong with the UDF.
+     * When you disable async UDF execution, users MUST pay attention to read-timeouts since these may indicate
+     * UDFs that run too long or forever - and this can destabilize the cluster.
+     */
+    public boolean enable_user_defined_functions_threads = true;
+    /**
+     * Time in milliseconds after a warning will be emitted to the log and to the client that a UDF runs too long.
+     * (Only valid, if enable_user_defined_functions_threads==true)
+     */
+    public long user_defined_function_warn_timeout = 500;
+    /**
+     * Time in milliseconds after a fatal UDF run-time situation is detected and action according to
+     * user_function_timeout_policy will take place.
+     * (Only valid, if enable_user_defined_functions_threads==true)
+     */
+    public long user_defined_function_fail_timeout = 1500;
+    /**
+     * Defines what to do when a UDF ran longer than user_defined_function_fail_timeout.
+     * Possible options are:
+     * - 'die' - i.e. it is able to emit a warning to the client before the Cassandra Daemon will shut down.
+     * - 'die_immediate' - shut down C* daemon immediately (effectively prevent the chance that the client will receive a warning).
+     * - 'ignore' - just log - the most dangerous option.
+     * (Only valid, if enable_user_defined_functions_threads==true)
+     */
+    public UserFunctionTimeoutPolicy user_function_timeout_policy = UserFunctionTimeoutPolicy.die;
 
     public static boolean getOutboundBindAny()
     {
@@ -301,6 +361,25 @@
         outboundBindAny = value;
     }
 
+    /**
+     * If true, when rows with duplicate clustering keys are detected during a read or compaction
+     * a snapshot will be taken. In the read case, each a snapshot request will be issued to each
+     * replica involved in the query, for compaction the snapshot will be created locally.
+     * These are limited at the replica level so that only a single snapshot per-day can be taken
+     * via this method.
+     *
+     * This requires check_for_duplicate_rows_during_reads and/or check_for_duplicate_rows_during_compaction
+     * below to be enabled
+     */
+    public volatile boolean snapshot_on_duplicate_row_detection = false;
+
+    /**
+     * If these are enabled duplicate keys will get logged, and if snapshot_on_duplicate_row_detection
+     * is enabled, the table will get snapshotted for offline investigation
+     */
+    public volatile boolean check_for_duplicate_rows_during_reads = true;
+    public volatile boolean check_for_duplicate_rows_during_compaction = true;
+
     public static boolean isClientMode()
     {
         return isClientMode;
@@ -321,51 +400,17 @@
         overrideLoadConfig = loadConfig;
     }
 
-    public void configHintedHandoff() throws ConfigurationException
-    {
-        if (hinted_handoff_enabled != null && !hinted_handoff_enabled.isEmpty())
-        {
-            if (hinted_handoff_enabled.equalsIgnoreCase("true"))
-            {
-                hinted_handoff_enabled_global = true;
-            }
-            else if (hinted_handoff_enabled.equalsIgnoreCase("false"))
-            {
-                hinted_handoff_enabled_global = false;
-            }
-            else
-            {
-                try
-                {
-                    hinted_handoff_enabled_by_dc.addAll(parseHintedHandoffEnabledDCs(hinted_handoff_enabled));
-                }
-                catch (IOException e)
-                {
-                    throw new ConfigurationException("Invalid hinted_handoff_enabled parameter " + hinted_handoff_enabled, e);
-                }
-            }
-        }
-    }
-
-    public static List<String> parseHintedHandoffEnabledDCs(final String dcNames) throws IOException
-    {
-        try (final CsvListReader csvListReader = new CsvListReader(new StringReader(dcNames), STANDARD_SURROUNDING_SPACES_NEED_QUOTES))
-        {
-        	return csvListReader.read();
-        }
-    }
-
-    public static enum CommitLogSync
+    public enum CommitLogSync
     {
         periodic,
         batch
     }
-    public static enum InternodeCompression
+    public enum InternodeCompression
     {
         all, none, dc
     }
 
-    public static enum DiskAccessMode
+    public enum DiskAccessMode
     {
         auto,
         mmap,
@@ -373,7 +418,7 @@
         standard,
     }
 
-    public static enum MemtableAllocationType
+    public enum MemtableAllocationType
     {
         unslabbed_heap_buffers,
         heap_buffers,
@@ -381,7 +426,7 @@
         offheap_objects
     }
 
-    public static enum DiskFailurePolicy
+    public enum DiskFailurePolicy
     {
         best_effort,
         stop,
@@ -390,7 +435,7 @@
         die
     }
 
-    public static enum CommitFailurePolicy
+    public enum CommitFailurePolicy
     {
         stop,
         stop_commit,
@@ -398,11 +443,24 @@
         die,
     }
 
-    public static enum RequestSchedulerId
+    public enum UserFunctionTimeoutPolicy
+    {
+        ignore,
+        die,
+        die_immediate
+    }
+
+    public enum RequestSchedulerId
     {
         keyspace
     }
 
+    public enum DiskOptimizationStrategy
+    {
+        ssd,
+        spinning
+    }
+
     private static final List<String> SENSITIVE_KEYS = new ArrayList<String>() {{
         add("client_encryption_options");
         add("server_encryption_options");

diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 41a09e9..4b732c2 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java

@@ -52,6 +52,7 @@
 import org.apache.cassandra.scheduler.NoScheduler;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.thrift.ThriftServer;
+import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.memory.*;
 
@@ -106,6 +107,9 @@
 
     private static boolean daemonInitialized;
 
+    // turns some warnings into exceptions for testing
+    private static final boolean strictRuntimeChecks = Boolean.getBoolean("cassandra.strict.runtime.checks");
+
     public static boolean isDaemonInitialized()
     {
         return daemonInitialized;
@@ -399,22 +403,22 @@
         /* phi convict threshold for FailureDetector */
         if (conf.phi_convict_threshold < 5 || conf.phi_convict_threshold > 16)
         {
-            throw new ConfigurationException("phi_convict_threshold must be between 5 and 16", false);
+            throw new ConfigurationException("phi_convict_threshold must be between 5 and 16, but was " + conf.phi_convict_threshold, false);
         }
 
         /* Thread per pool */
         if (conf.concurrent_reads != null && conf.concurrent_reads < 2)
         {
-            throw new ConfigurationException("concurrent_reads must be at least 2", false);
+            throw new ConfigurationException("concurrent_reads must be at least 2, but was " + conf.concurrent_reads, false);
         }
 
-        if (conf.concurrent_writes != null && conf.concurrent_writes < 2)
+        if (conf.concurrent_writes != null && conf.concurrent_writes < 2 && System.getProperty("cassandra.test.fail_mv_locks_count", "").isEmpty())
         {
-            throw new ConfigurationException("concurrent_writes must be at least 2", false);
+            throw new ConfigurationException("concurrent_writes must be at least 2, but was " + conf.concurrent_writes, false);
         }
 
         if (conf.concurrent_counter_writes != null && conf.concurrent_counter_writes < 2)
-            throw new ConfigurationException("concurrent_counter_writes must be at least 2", false);
+            throw new ConfigurationException("concurrent_counter_writes must be at least 2, but was " + conf.concurrent_counter_writes, false);
 
         if (conf.concurrent_replicates != null)
             logger.warn("concurrent_replicates has been deprecated and should be removed from cassandra.yaml");
@@ -422,31 +426,36 @@
         if (conf.file_cache_size_in_mb == null)
             conf.file_cache_size_in_mb = Math.min(512, (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576)));
 
-        if (conf.memory_allocator != null)
-            logger.warn("memory_allocator has been deprecated and should be removed from cassandra.yaml");
-
         if (conf.memtable_offheap_space_in_mb == null)
             conf.memtable_offheap_space_in_mb = (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576));
         if (conf.memtable_offheap_space_in_mb < 0)
-            throw new ConfigurationException("memtable_offheap_space_in_mb must be positive", false);
+            throw new ConfigurationException("memtable_offheap_space_in_mb must be positive, but was " + conf.memtable_offheap_space_in_mb, false);
         // for the moment, we default to twice as much on-heap space as off-heap, as heap overhead is very large
         if (conf.memtable_heap_space_in_mb == null)
             conf.memtable_heap_space_in_mb = (int) (Runtime.getRuntime().maxMemory() / (4 * 1048576));
         if (conf.memtable_heap_space_in_mb <= 0)
-            throw new ConfigurationException("memtable_heap_space_in_mb must be positive", false);
+            throw new ConfigurationException("memtable_heap_space_in_mb must be positive, but was " + conf.memtable_heap_space_in_mb, false);
         logger.info("Global memtable on-heap threshold is enabled at {}MB", conf.memtable_heap_space_in_mb);
         if (conf.memtable_offheap_space_in_mb == 0)
             logger.info("Global memtable off-heap threshold is disabled, HeapAllocator will be used instead");
         else
             logger.info("Global memtable off-heap threshold is enabled at {}MB", conf.memtable_offheap_space_in_mb);
 
+        if (conf.repair_session_max_tree_depth < 10)
+            throw new ConfigurationException("repair_session_max_tree_depth should not be < 10, but was " + conf.repair_session_max_tree_depth);
+        if (conf.repair_session_max_tree_depth > 20)
+            logger.warn("repair_session_max_tree_depth of " + conf.repair_session_max_tree_depth + " > 20 could lead to excessive memory usage");
+
         applyAddressConfig(config);
 
         if (conf.thrift_framed_transport_size_in_mb <= 0)
-            throw new ConfigurationException("thrift_framed_transport_size_in_mb must be positive", false);
+            throw new ConfigurationException("thrift_framed_transport_size_in_mb must be positive, but was " + conf.thrift_framed_transport_size_in_mb, false);
 
         if (conf.native_transport_max_frame_size_in_mb <= 0)
-            throw new ConfigurationException("native_transport_max_frame_size_in_mb must be positive", false);
+            throw new ConfigurationException("native_transport_max_frame_size_in_mb must be positive, but was " + conf.native_transport_max_frame_size_in_mb, false);
+        else if (conf.native_transport_max_frame_size_in_mb >= 2048)
+            throw new ConfigurationException("native_transport_max_frame_size_in_mb must be smaller than 2048, but was "
+                    + conf.native_transport_max_frame_size_in_mb, false);
 
         // fail early instead of OOMing (see CASSANDRA-8116)
         if (ThriftServer.HSHA.equals(conf.rpc_server_type) && conf.rpc_max_threads == Integer.MAX_VALUE)
@@ -462,6 +471,17 @@
         {
             throw new ConfigurationException("Missing endpoint_snitch directive", false);
         }
+
+        if (conf.native_transport_max_concurrent_requests_in_bytes <= 0)
+        {
+            conf.native_transport_max_concurrent_requests_in_bytes = Runtime.getRuntime().maxMemory() / 10;
+        }
+
+        if (conf.native_transport_max_concurrent_requests_in_bytes_per_ip <= 0)
+        {
+            conf.native_transport_max_concurrent_requests_in_bytes_per_ip = Runtime.getRuntime().maxMemory() / 40;
+        }
+
         snitch = createEndpointSnitch(conf.endpoint_snitch);
         EndpointSnitchInfo.create();
 
@@ -527,6 +547,14 @@
             conf.commitlog_directory += File.separator + "commitlog";
         }
 
+        if (conf.hints_directory == null)
+        {
+            conf.hints_directory = System.getProperty("cassandra.storagedir", null);
+            if (conf.hints_directory == null)
+                throw new ConfigurationException("hints_directory is missing and -Dcassandra.storagedir is not set", false);
+            conf.hints_directory += File.separator + "hints";
+        }
+
         if (conf.commitlog_total_space_in_mb == null)
         {
             int preferredSize = 8192;
@@ -561,7 +589,7 @@
                 throw new ConfigurationException("saved_caches_directory is missing and -Dcassandra.storagedir is not set", false);
             conf.saved_caches_directory += File.separator + "saved_caches";
         }
-        if (conf.data_file_directories == null)
+        if (conf.data_file_directories == null || conf.data_file_directories.length == 0)
         {
             String defaultDataDir = System.getProperty("cassandra.storagedir", null);
             if (defaultDataDir == null)
@@ -573,8 +601,12 @@
         /* data file and commit log directories. they get created later, when they're needed. */
         for (String datadir : conf.data_file_directories)
         {
+            if (datadir == null)
+                throw new ConfigurationException("data_file_directories must not contain empty entry", false);
             if (datadir.equals(conf.commitlog_directory))
                 throw new ConfigurationException("commitlog_directory must not be the same as any data_file_directories", false);
+            if (datadir.equals(conf.hints_directory))
+                throw new ConfigurationException("hints_directory must not be the same as any data_file_directories", false);
             if (datadir.equals(conf.saved_caches_directory))
                 throw new ConfigurationException("saved_caches_directory must not be the same as any data_file_directories", false);
 
@@ -596,38 +628,49 @@
 
         if (conf.commitlog_directory.equals(conf.saved_caches_directory))
             throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory", false);
+        if (conf.commitlog_directory.equals(conf.hints_directory))
+            throw new ConfigurationException("hints_directory must not be the same as the commitlog_directory", false);
+        if (conf.hints_directory.equals(conf.saved_caches_directory))
+            throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false);
 
         if (conf.memtable_flush_writers == null)
             conf.memtable_flush_writers = Math.min(8, Math.max(2, Math.min(FBUtilities.getAvailableProcessors(), conf.data_file_directories.length)));
 
         if (conf.memtable_flush_writers < 1)
-            throw new ConfigurationException("memtable_flush_writers must be at least 1", false);
+            throw new ConfigurationException("memtable_flush_writers must be at least 1, but was " + conf.memtable_flush_writers, false);
 
         if (conf.memtable_cleanup_threshold == null)
             conf.memtable_cleanup_threshold = (float) (1.0 / (1 + conf.memtable_flush_writers));
 
         if (conf.memtable_cleanup_threshold < 0.01f)
-            throw new ConfigurationException("memtable_cleanup_threshold must be >= 0.01", false);
+            throw new ConfigurationException("memtable_cleanup_threshold must be >= 0.01, but was " + conf.memtable_cleanup_threshold, false);
         if (conf.memtable_cleanup_threshold > 0.99f)
-            throw new ConfigurationException("memtable_cleanup_threshold must be <= 0.99", false);
+            throw new ConfigurationException("memtable_cleanup_threshold must be <= 0.99, but was " + conf.memtable_cleanup_threshold, false);
         if (conf.memtable_cleanup_threshold < 0.1f)
-            logger.warn("memtable_cleanup_threshold is set very low, which may cause performance degradation");
+            logger.warn("memtable_cleanup_threshold is set very low [{}], which may cause performance degradation", conf.memtable_cleanup_threshold);
 
         if (conf.concurrent_compactors == null)
             conf.concurrent_compactors = Math.min(8, Math.max(2, Math.min(FBUtilities.getAvailableProcessors(), conf.data_file_directories.length)));
 
         if (conf.concurrent_compactors <= 0)
-            throw new ConfigurationException("concurrent_compactors should be strictly greater than 0", false);
-
-        if (conf.initial_token != null)
-            for (String token : tokensFromString(conf.initial_token))
-                partitioner.getTokenFactory().validate(token);
+            throw new ConfigurationException("concurrent_compactors should be strictly greater than 0, but was " + conf.concurrent_compactors, false);
 
         if (conf.num_tokens == null)
-        	conf.num_tokens = 1;
+            conf.num_tokens = 1;
         else if (conf.num_tokens > MAX_NUM_TOKENS)
             throw new ConfigurationException(String.format("A maximum number of %d tokens per node is supported", MAX_NUM_TOKENS), false);
 
+        if (conf.initial_token != null)
+        {
+            Collection<String> tokens = tokensFromString(conf.initial_token);
+            if (tokens.size() != conf.num_tokens)
+                throw new ConfigurationException("The number of initial tokens (by initial_token) specified is different from num_tokens value", false);
+
+            for (String token : tokens)
+                partitioner.getTokenFactory().validate(token);
+        }
+
+
         try
         {
             // if key_cache_size_in_mb option was set to "auto" then size of the cache should be "min(5% of Heap (in MB), 100MB)
@@ -694,6 +737,54 @@
         if (seedProvider.getSeeds().size() == 0)
             throw new ConfigurationException("The seed provider lists no seeds.", false);
 
+        if (conf.user_defined_function_fail_timeout < 0)
+            throw new ConfigurationException("user_defined_function_fail_timeout must not be negative", false);
+        if (conf.user_defined_function_warn_timeout < 0)
+            throw new ConfigurationException("user_defined_function_warn_timeout must not be negative", false);
+
+        if (conf.user_defined_function_fail_timeout < conf.user_defined_function_warn_timeout)
+            throw new ConfigurationException("user_defined_function_warn_timeout must less than user_defined_function_fail_timeout", false);
+
+        if (conf.commitlog_segment_size_in_mb <= 0)
+            throw new ConfigurationException("commitlog_segment_size_in_mb must be positive, but was "
+                    + conf.commitlog_segment_size_in_mb, false);
+        else if (conf.commitlog_segment_size_in_mb >= 2048)
+            throw new ConfigurationException("commitlog_segment_size_in_mb must be smaller than 2048, but was "
+                    + conf.commitlog_segment_size_in_mb, false);
+
+        if (conf.max_mutation_size_in_kb == null)
+            conf.max_mutation_size_in_kb = conf.commitlog_segment_size_in_mb * 1024 / 2;
+        else if (conf.commitlog_segment_size_in_mb * 1024 < 2 * conf.max_mutation_size_in_kb)
+            throw new ConfigurationException("commitlog_segment_size_in_mb must be at least twice the size of max_mutation_size_in_kb / 1024", false);
+
+        // native transport encryption options
+        if (conf.native_transport_port_ssl != null
+            && conf.native_transport_port_ssl.intValue() != conf.native_transport_port.intValue()
+            && !conf.client_encryption_options.enabled)
+        {
+            throw new ConfigurationException("Encryption must be enabled in client_encryption_options for native_transport_port_ssl", false);
+        }
+
+        // If max protocol version has been set, just validate it's within an acceptable range
+        if (conf.native_transport_max_negotiable_protocol_version != Integer.MIN_VALUE)
+        {
+            if (conf.native_transport_max_negotiable_protocol_version < Server.MIN_SUPPORTED_VERSION
+                || conf.native_transport_max_negotiable_protocol_version > Server.CURRENT_VERSION)
+            {
+                throw new ConfigurationException(String.format("Invalid setting for native_transport_max_negotiable_version (%d); " +
+                                                               "Values between %s and %s are supported",
+                                                               conf.native_transport_max_negotiable_protocol_version,
+                                                               Server.MIN_SUPPORTED_VERSION,
+                                                               Server.CURRENT_VERSION));
+            }
+        }
+
+        if (conf.max_value_size_in_mb == null || conf.max_value_size_in_mb <= 0)
+            throw new ConfigurationException("max_value_size_in_mb must be positive", false);
+        else if (conf.max_value_size_in_mb >= 2048)
+            throw new ConfigurationException("max_value_size_in_mb must be smaller than 2048, but was "
+                    + conf.max_value_size_in_mb, false);
+
         if (conf.otc_coalescing_enough_coalesced_messages > 128)
             throw new ConfigurationException("otc_coalescing_enough_coalesced_messages must be smaller than 128", false);
 
@@ -816,6 +907,16 @@
         return conf.thrift_framed_transport_size_in_mb * 1024 * 1024;
     }
 
+    public static int getMaxValueSize()
+    {
+        return conf.max_value_size_in_mb * 1024 * 1024;
+    }
+
+    public static void setMaxValueSize(int maxValueSizeInBytes)
+    {
+        conf.max_value_size_in_mb = maxValueSizeInBytes / 1024 / 1024;
+    }
+
     /**
      * Creates all storage-related directories.
      */
@@ -827,18 +928,18 @@
                 throw new ConfigurationException("At least one DataFileDirectory must be specified", false);
 
             for (String dataFileDirectory : conf.data_file_directories)
-            {
                 FileUtils.createDirectory(dataFileDirectory);
-            }
 
             if (conf.commitlog_directory == null)
                 throw new ConfigurationException("commitlog_directory must be specified", false);
-
             FileUtils.createDirectory(conf.commitlog_directory);
 
+            if (conf.hints_directory == null)
+                throw new ConfigurationException("hints_directory must be specified", false);
+            FileUtils.createDirectory(conf.hints_directory);
+
             if (conf.saved_caches_directory == null)
                 throw new ConfigurationException("saved_caches_directory must be specified", false);
-
             FileUtils.createDirectory(conf.saved_caches_directory);
         }
         catch (ConfigurationException e)
@@ -861,10 +962,12 @@
         return paritionerName;
     }
 
-    /* For tests ONLY, don't use otherwise or all hell will break loose */
-    public static void setPartitioner(IPartitioner newPartitioner)
+    /* For tests ONLY, don't use otherwise or all hell will break loose. Tests should restore value at the end. */
+    public static IPartitioner setPartitionerUnsafe(IPartitioner newPartitioner)
     {
+        IPartitioner old = partitioner;
         partitioner = newPartitioner;
+        return old;
     }
 
     public static IEndpointSnitch getEndpointSnitch()
@@ -931,6 +1034,11 @@
         return tokensFromString(System.getProperty("cassandra.initial_token", conf.initial_token));
     }
 
+    public static String getAllocateTokensForKeyspace()
+    {
+        return System.getProperty("cassandra.allocate_tokens_for_keyspace", conf.allocate_tokens_for_keyspace);
+    }
+
     public static Collection<String> tokensFromString(String tokenString)
     {
         List<String> tokens = new ArrayList<String>();
@@ -957,7 +1065,7 @@
         }
         catch (UnknownHostException e)
         {
-            throw new RuntimeException("Replacement host name could not be resolved or scope_id was specified for a global IPv6 address", e);
+            throw new RuntimeException("Replacement ost name could not be resolved or scope_id was specified for a global IPv6 address", e);
         }
     }
 
@@ -1095,6 +1203,7 @@
             case READ:
                 return getReadRpcTimeout();
             case RANGE_SLICE:
+            case PAGED_RANGE:
                 return getRangeRpcTimeout();
             case TRUNCATE:
                 return getTruncateRpcTimeout();
@@ -1103,6 +1212,9 @@
             case PAXOS_COMMIT:
             case PAXOS_PREPARE:
             case PAXOS_PROPOSE:
+            case HINT:
+            case BATCH_STORE:
+            case BATCH_REMOVE:
                 return getWriteRpcTimeout();
             case COUNTER_MUTATION:
                 return getCounterWriteRpcTimeout();
@@ -1149,6 +1261,11 @@
         return conf.concurrent_counter_writes;
     }
 
+    public static int getConcurrentViewWriters()
+    {
+        return conf.concurrent_materialized_view_writes;
+    }
+
     public static int getFlushWriters()
     {
             return conf.memtable_flush_writers;
@@ -1169,7 +1286,12 @@
         conf.compaction_throughput_mb_per_sec = value;
     }
 
-    public static int getCompactionLargePartitionWarningThreshold() { return conf.compaction_large_partition_warning_threshold_mb * 1024 * 1024; }
+    public static long getCompactionLargePartitionWarningThreshold() { return conf.compaction_large_partition_warning_threshold_mb * 1024L * 1024L; }
+
+    public static long getMinFreeSpacePerDriveInBytes()
+    {
+        return conf.min_free_space_per_drive_in_mb * 1024L * 1024L;
+    }
 
     public static boolean getDisableSTCSInL0()
     {
@@ -1221,6 +1343,11 @@
         return conf.commitlog_max_compression_buffers_in_pool;
     }
 
+    public static int getMaxMutationSize()
+    {
+        return conf.max_mutation_size_in_kb * 1024;
+    }
+
     public static int getTombstoneWarnThreshold()
     {
         return conf.tombstone_warn_threshold;
@@ -1362,6 +1489,23 @@
         return Integer.parseInt(System.getProperty("cassandra.native_transport_port", conf.native_transport_port.toString()));
     }
 
+    @VisibleForTesting
+    public static void setNativeTransportPort(int port)
+    {
+        conf.native_transport_port = port;
+    }
+
+    public static int getNativeTransportPortSSL()
+    {
+        return conf.native_transport_port_ssl == null ? getNativeTransportPort() : conf.native_transport_port_ssl;
+    }
+
+    @VisibleForTesting
+    public static void setNativeTransportPortSSL(Integer port)
+    {
+        conf.native_transport_port_ssl = port;
+    }
+
     public static Integer getNativeTransportMaxThreads()
     {
         return conf.native_transport_max_threads;
@@ -1391,6 +1535,16 @@
         conf.native_transport_max_concurrent_connections_per_ip = native_transport_max_concurrent_connections_per_ip;
     }
 
+    public static boolean useNativeTransportLegacyFlusher()
+    {
+        return conf.native_transport_flush_in_batches_legacy;
+    }
+
+    public static int getNativeProtocolMaxVersionOverride()
+    {
+        return conf.native_transport_max_negotiable_protocol_version;
+    }
+
     public static double getCommitLogSyncBatchWindow()
     {
         return conf.commitlog_sync_batch_window_in_ms;
@@ -1401,6 +1555,26 @@
         conf.commitlog_sync_batch_window_in_ms = windowMillis;
     }
 
+    public static long getNativeTransportMaxConcurrentRequestsInBytesPerIp()
+    {
+        return conf.native_transport_max_concurrent_requests_in_bytes_per_ip;
+    }
+
+    public static void setNativeTransportMaxConcurrentRequestsInBytesPerIp(long maxConcurrentRequestsInBytes)
+    {
+        conf.native_transport_max_concurrent_requests_in_bytes_per_ip = maxConcurrentRequestsInBytes;
+    }
+
+    public static long getNativeTransportMaxConcurrentRequestsInBytes()
+    {
+        return conf.native_transport_max_concurrent_requests_in_bytes;
+    }
+
+    public static void setNativeTransportMaxConcurrentRequestsInBytes(long maxConcurrentRequestsInBytes)
+    {
+        conf.native_transport_max_concurrent_requests_in_bytes = maxConcurrentRequestsInBytes;
+    }
+
     public static int getCommitLogSyncPeriod()
     {
         return conf.commitlog_sync_period_in_ms;
@@ -1475,9 +1649,15 @@
     }
 
     @VisibleForTesting
-    public static void setAutoSnapshot(boolean autoSnapshot) {
+    public static void setAutoSnapshot(boolean autoSnapshot)
+    {
         conf.auto_snapshot = autoSnapshot;
     }
+    @VisibleForTesting
+    public static boolean getAutoSnapshot()
+    {
+        return conf.auto_snapshot;
+    }
 
     public static boolean isAutoBootstrap()
     {
@@ -1486,47 +1666,27 @@
 
     public static void setHintedHandoffEnabled(boolean hintedHandoffEnabled)
     {
-        conf.hinted_handoff_enabled_global = hintedHandoffEnabled;
-        conf.hinted_handoff_enabled_by_dc.clear();
-    }
-
-    public static void setHintedHandoffEnabled(final String dcNames)
-    {
-        List<String> dcNameList;
-        try
-        {
-            dcNameList = Config.parseHintedHandoffEnabledDCs(dcNames);
-        }
-        catch (IOException e)
-        {
-            throw new IllegalArgumentException("Could not read csv of dcs for hinted handoff enable. " + dcNames, e);
-        }
-
-        if (dcNameList.isEmpty())
-            throw new IllegalArgumentException("Empty list of Dcs for hinted handoff enable");
-
-        conf.hinted_handoff_enabled_by_dc.clear();
-        conf.hinted_handoff_enabled_by_dc.addAll(dcNameList);
+        conf.hinted_handoff_enabled = hintedHandoffEnabled;
     }
 
     public static boolean hintedHandoffEnabled()
     {
-        return conf.hinted_handoff_enabled_global;
+        return conf.hinted_handoff_enabled;
     }
 
-    public static Set<String> hintedHandoffEnabledByDC()
+    public static Set<String> hintedHandoffDisabledDCs()
     {
-        return Collections.unmodifiableSet(conf.hinted_handoff_enabled_by_dc);
+        return conf.hinted_handoff_disabled_datacenters;
     }
 
-    public static boolean shouldHintByDC()
+    public static void enableHintsForDC(String dc)
     {
-        return !conf.hinted_handoff_enabled_by_dc.isEmpty();
+        conf.hinted_handoff_disabled_datacenters.remove(dc);
     }
 
-    public static boolean hintedHandoffEnabled(final String dcName)
+    public static void disableHintsForDC(String dc)
     {
-        return conf.hinted_handoff_enabled_by_dc.contains(dcName);
+        conf.hinted_handoff_disabled_datacenters.add(dc);
     }
 
     public static void setMaxHintWindow(int ms)
@@ -1539,6 +1699,11 @@
         return conf.max_hint_window_in_ms;
     }
 
+    public static File getHintsDirectory()
+    {
+        return new File(conf.hints_directory);
+    }
+
     public static File getSerializedCachePath(CacheService.CacheType cacheType, String version, String extension)
     {
         String name = cacheType.toString()
@@ -1599,11 +1764,31 @@
         conf.hinted_handoff_throttle_in_kb = throttleInKB;
     }
 
-    public static int getMaxHintsThread()
+    public static int getMaxHintsDeliveryThreads()
     {
         return conf.max_hints_delivery_threads;
     }
 
+    public static int getHintsFlushPeriodInMS()
+    {
+        return conf.hints_flush_period_in_ms;
+    }
+
+    public static long getMaxHintsFileSize()
+    {
+        return conf.max_hints_file_size_in_mb * 1024L * 1024L;
+    }
+
+    public static ParameterizedClass getHintsCompression()
+    {
+        return conf.hints_compression;
+    }
+
+    public static void setHintsCompression(ParameterizedClass parameterizedClass)
+    {
+        conf.hints_compression = parameterizedClass;
+    }
+
     public static boolean isIncrementalBackupsEnabled()
     {
         return conf.incremental_backups;
@@ -1619,6 +1804,38 @@
         return conf.file_cache_size_in_mb;
     }
 
+    public static boolean getBufferPoolUseHeapIfExhausted()
+    {
+        return conf.buffer_pool_use_heap_if_exhausted;
+    }
+
+    public static Config.DiskOptimizationStrategy getDiskOptimizationStrategy()
+    {
+        return conf.disk_optimization_strategy;
+    }
+
+    @VisibleForTesting
+    public static void setDiskOptimizationStrategy(Config.DiskOptimizationStrategy strategy)
+    {
+        conf.disk_optimization_strategy = strategy;
+    }
+
+    public static double getDiskOptimizationEstimatePercentile()
+    {
+        return conf.disk_optimization_estimate_percentile;
+    }
+
+    public static double getDiskOptimizationPageCrossChance()
+    {
+        return conf.disk_optimization_page_cross_chance;
+    }
+
+    @VisibleForTesting
+    public static void setDiskOptimizationPageCrossChance(double chance)
+    {
+        conf.disk_optimization_page_cross_chance = chance;
+    }
+
     public static long getTotalCommitlogSpaceInMB()
     {
         return conf.commitlog_total_space_in_mb;
@@ -1776,18 +1993,37 @@
             case heap_buffers:
                 return new SlabPool(heapLimit, 0, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
             case offheap_buffers:
-                if (!FileUtils.isCleanerAvailable())
+                throw new ConfigurationException("offheap_buffers are not available in 3.0. They will be re-introduced in a future release, see https://issues.apache.org/jira/browse/CASSANDRA-9472 for details");
+
+                /*if (!FileUtils.isCleanerAvailable())
                 {
                     throw new IllegalStateException("Could not free direct byte buffer: offheap_buffers is not a safe memtable_allocation_type without this ability, please adjust your config. This feature is only guaranteed to work on an Oracle JVM. Refusing to start.");
                 }
-                return new SlabPool(heapLimit, offHeapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
+                return new SlabPool(heapLimit, offHeapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());*/
             case offheap_objects:
-                return new NativePool(heapLimit, offHeapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
+                throw new ConfigurationException("offheap_objects are not available in 3.0. They will be re-introduced in a future release, see https://issues.apache.org/jira/browse/CASSANDRA-9472 for details");
+                // return new NativePool(heapLimit, offHeapLimit, conf.memtable_cleanup_threshold, new ColumnFamilyStore.FlushLargestColumnFamily());
             default:
                 throw new AssertionError();
         }
     }
 
+    public static int getRepairSessionMaxTreeDepth()
+    {
+        return conf.repair_session_max_tree_depth;
+    }
+
+    public static void setRepairSessionMaxTreeDepth(int depth)
+    {
+        if (depth < 10)
+            throw new ConfigurationException("Cannot set repair_session_max_tree_depth to " + depth +
+                                             " which is < 10, doing nothing");
+        else if (depth > 20)
+            logger.warn("repair_session_max_tree_depth of " + depth + " > 20 could lead to excessive memory usage");
+
+        conf.repair_session_max_tree_depth = depth;
+    }
+
     public static boolean getOutboundBindAny()
     {
         return Config.outboundBindAny || conf.listen_on_broadcast_address;
@@ -1844,14 +2080,74 @@
         conf.otc_coalescing_enough_coalesced_messages = otc_coalescing_enough_coalesced_messages;
     }
 
+    public static int getOtcBacklogExpirationInterval()
+    {
+        return conf.otc_backlog_expiration_interval_ms;
+    }
+
+    public static void setOtcBacklogExpirationInterval(int intervalInMillis)
+    {
+        conf.otc_backlog_expiration_interval_ms = intervalInMillis;
+    }
+ 
+    public static int getWindowsTimerInterval()
+    {
+        return conf.windows_timer_interval;
+    }
+
     public static boolean enableUserDefinedFunctions()
     {
         return conf.enable_user_defined_functions;
     }
 
-    public static int getWindowsTimerInterval()
+    public static boolean enableScriptedUserDefinedFunctions()
     {
-        return conf.windows_timer_interval;
+        return conf.enable_scripted_user_defined_functions;
+    }
+
+    public static void enableScriptedUserDefinedFunctions(boolean enableScriptedUserDefinedFunctions)
+    {
+        conf.enable_scripted_user_defined_functions = enableScriptedUserDefinedFunctions;
+    }
+
+    public static boolean enableUserDefinedFunctionsThreads()
+    {
+        return conf.enable_user_defined_functions_threads;
+    }
+
+    public static long getUserDefinedFunctionWarnTimeout()
+    {
+        return conf.user_defined_function_warn_timeout;
+    }
+
+    public static void setUserDefinedFunctionWarnTimeout(long userDefinedFunctionWarnTimeout)
+    {
+        conf.user_defined_function_warn_timeout = userDefinedFunctionWarnTimeout;
+    }
+
+    public static boolean enableMaterializedViews()
+    {
+        return conf.enable_materialized_views;
+    }
+
+    public static long getUserDefinedFunctionFailTimeout()
+    {
+        return conf.user_defined_function_fail_timeout;
+    }
+
+    public static void setUserDefinedFunctionFailTimeout(long userDefinedFunctionFailTimeout)
+    {
+        conf.user_defined_function_fail_timeout = userDefinedFunctionFailTimeout;
+    }
+
+    public static Config.UserFunctionTimeoutPolicy getUserFunctionTimeoutPolicy()
+    {
+        return conf.user_function_timeout_policy;
+    }
+
+    public static void setUserFunctionTimeoutPolicy(Config.UserFunctionTimeoutPolicy userFunctionTimeoutPolicy)
+    {
+        conf.user_function_timeout_policy = userFunctionTimeoutPolicy;
     }
 
     public static long getGCLogThreshold()
@@ -1864,4 +2160,39 @@
         return conf.gc_warn_threshold_in_ms;
     }
 
+    public static boolean strictRuntimeChecks()
+    {
+        return strictRuntimeChecks;
+    }
+
+    public static boolean snapshotOnDuplicateRowDetection()
+    {
+        return conf.snapshot_on_duplicate_row_detection;
+    }
+
+    public static void setSnapshotOnDuplicateRowDetection(boolean enabled)
+    {
+        conf.snapshot_on_duplicate_row_detection = enabled;
+    }
+
+    public static boolean checkForDuplicateRowsDuringReads()
+    {
+        return conf.check_for_duplicate_rows_during_reads;
+    }
+
+    public static void setCheckForDuplicateRowsDuringReads(boolean enabled)
+    {
+        conf.check_for_duplicate_rows_during_reads = enabled;
+    }
+
+    public static boolean checkForDuplicateRowsDuringCompaction()
+    {
+        return conf.check_for_duplicate_rows_during_compaction;
+    }
+
+    public static void setCheckForDuplicateRowsDuringCompaction(boolean enabled)
+    {
+        conf.check_for_duplicate_rows_during_compaction = enabled;
+    }
+
 }

diff --git a/src/java/org/apache/cassandra/config/IndexType.java b/src/java/org/apache/cassandra/config/IndexType.java
deleted file mode 100644
index d39dccb..0000000
--- a/src/java/org/apache/cassandra/config/IndexType.java
+++ /dev/null

@@ -1,25 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.config;
-
-public enum IndexType
-{
-    KEYS,
-    CUSTOM,
-    COMPOSITES
-}

diff --git a/src/java/org/apache/cassandra/config/KSMetaData.java b/src/java/org/apache/cassandra/config/KSMetaData.java
deleted file mode 100644
index 1537aae..0000000
--- a/src/java/org/apache/cassandra/config/KSMetaData.java
+++ /dev/null

@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.config;
-
-import java.util.*;
-
-import com.google.common.base.Objects;
-
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.*;
-import org.apache.cassandra.service.StorageService;
-
-public final class KSMetaData
-{
-    public final String name;
-    public final Class<? extends AbstractReplicationStrategy> strategyClass;
-    public final Map<String, String> strategyOptions;
-    private final Map<String, CFMetaData> cfMetaData;
-    public final boolean durableWrites;
-
-    public final UTMetaData userTypes;
-
-    public KSMetaData(String name,
-                      Class<? extends AbstractReplicationStrategy> strategyClass,
-                      Map<String, String> strategyOptions,
-                      boolean durableWrites)
-    {
-        this(name, strategyClass, strategyOptions, durableWrites, Collections.<CFMetaData>emptyList(), new UTMetaData());
-    }
-
-    public KSMetaData(String name,
-                      Class<? extends AbstractReplicationStrategy> strategyClass,
-                      Map<String, String> strategyOptions,
-                      boolean durableWrites,
-                      Iterable<CFMetaData> cfDefs)
-    {
-        this(name, strategyClass, strategyOptions, durableWrites, cfDefs, new UTMetaData());
-    }
-
-    private KSMetaData(String name,
-                       Class<? extends AbstractReplicationStrategy> strategyClass,
-                       Map<String, String> strategyOptions,
-                       boolean durableWrites,
-                       Iterable<CFMetaData> cfDefs,
-                       UTMetaData userTypes)
-    {
-        this.name = name;
-        this.strategyClass = strategyClass == null ? NetworkTopologyStrategy.class : strategyClass;
-        this.strategyOptions = strategyOptions;
-        Map<String, CFMetaData> cfmap = new HashMap<>();
-        for (CFMetaData cfm : cfDefs)
-            cfmap.put(cfm.cfName, cfm);
-        this.cfMetaData = Collections.unmodifiableMap(cfmap);
-        this.durableWrites = durableWrites;
-        this.userTypes = userTypes;
-    }
-
-    // For new user created keyspaces (through CQL)
-    public static KSMetaData newKeyspace(String name, String strategyName, Map<String, String> options, boolean durableWrites) throws ConfigurationException
-    {
-        Class<? extends AbstractReplicationStrategy> cls = AbstractReplicationStrategy.getClass(strategyName);
-        if (cls.equals(LocalStrategy.class))
-            throw new ConfigurationException("Unable to use given strategy class: LocalStrategy is reserved for internal use.");
-
-        return newKeyspace(name, cls, options, durableWrites, Collections.<CFMetaData>emptyList());
-    }
-
-    public static KSMetaData newKeyspace(String name, Class<? extends AbstractReplicationStrategy> strategyClass, Map<String, String> options, boolean durablesWrites, Iterable<CFMetaData> cfDefs)
-    {
-        return new KSMetaData(name, strategyClass, options, durablesWrites, cfDefs, new UTMetaData());
-    }
-
-    public KSMetaData cloneWithTableRemoved(CFMetaData table)
-    {
-        // clone ksm but do not include the new table
-        List<CFMetaData> newTables = new ArrayList<>(cfMetaData().values());
-        newTables.remove(table);
-        assert newTables.size() == cfMetaData().size() - 1;
-        return cloneWith(newTables, userTypes);
-    }
-
-    public KSMetaData cloneWithTableAdded(CFMetaData table)
-    {
-        // clone ksm but include the new table
-        List<CFMetaData> newTables = new ArrayList<>(cfMetaData().values());
-        newTables.add(table);
-        assert newTables.size() == cfMetaData().size() + 1;
-        return cloneWith(newTables, userTypes);
-    }
-
-    public KSMetaData cloneWith(Iterable<CFMetaData> tables, UTMetaData types)
-    {
-        return new KSMetaData(name, strategyClass, strategyOptions, durableWrites, tables, types);
-    }
-
-    public static KSMetaData testMetadata(String name, Class<? extends AbstractReplicationStrategy> strategyClass, Map<String, String> strategyOptions, CFMetaData... cfDefs)
-    {
-        return new KSMetaData(name, strategyClass, strategyOptions, true, Arrays.asList(cfDefs));
-    }
-
-    public static KSMetaData testMetadataNotDurable(String name, Class<? extends AbstractReplicationStrategy> strategyClass, Map<String, String> strategyOptions, CFMetaData... cfDefs)
-    {
-        return new KSMetaData(name, strategyClass, strategyOptions, false, Arrays.asList(cfDefs));
-    }
-
-    @Override
-    public int hashCode()
-    {
-        return Objects.hashCode(name, strategyClass, strategyOptions, cfMetaData, durableWrites, userTypes);
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-
-        if (!(o instanceof KSMetaData))
-            return false;
-
-        KSMetaData other = (KSMetaData) o;
-
-        return Objects.equal(name, other.name)
-            && Objects.equal(strategyClass, other.strategyClass)
-            && Objects.equal(strategyOptions, other.strategyOptions)
-            && Objects.equal(cfMetaData, other.cfMetaData)
-            && Objects.equal(durableWrites, other.durableWrites)
-            && Objects.equal(userTypes, other.userTypes);
-    }
-
-    public Map<String, CFMetaData> cfMetaData()
-    {
-        return cfMetaData;
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this)
-                      .add("name", name)
-                      .add("strategyClass", strategyClass.getSimpleName())
-                      .add("strategyOptions", strategyOptions)
-                      .add("cfMetaData", cfMetaData)
-                      .add("durableWrites", durableWrites)
-                      .add("userTypes", userTypes)
-                      .toString();
-    }
-
-    public static Map<String,String> optsWithRF(final Integer rf)
-    {
-        return Collections.singletonMap("replication_factor", rf.toString());
-    }
-
-    public KSMetaData validate() throws ConfigurationException
-    {
-        if (!CFMetaData.isNameValid(name))
-            throw new ConfigurationException(String.format("Keyspace name must not be empty, more than %s characters long, or contain non-alphanumeric-underscore characters (got \"%s\")", Schema.NAME_LENGTH, name));
-
-        // Attempt to instantiate the ARS, which will throw a ConfigException if the strategy_options aren't fully formed
-        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
-        IEndpointSnitch eps = DatabaseDescriptor.getEndpointSnitch();
-        AbstractReplicationStrategy.validateReplicationStrategy(name, strategyClass, tmd, eps, strategyOptions);
-
-        for (CFMetaData cfm : cfMetaData.values())
-            cfm.validate();
-
-        return this;
-    }
-}

diff --git a/src/java/org/apache/cassandra/config/ParameterizedClass.java b/src/java/org/apache/cassandra/config/ParameterizedClass.java
index 6b7af63..6c7996a 100644
--- a/src/java/org/apache/cassandra/config/ParameterizedClass.java
+++ b/src/java/org/apache/cassandra/config/ParameterizedClass.java

@@ -17,14 +17,17 @@
  */
 package org.apache.cassandra.config;
 
-import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 
 import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
 
 public class ParameterizedClass
 {
+    public static final String CLASS_NAME = "class_name";
+    public static final String PARAMETERS = "parameters";
+
     public String class_name;
     public Map<String, String> parameters;
 
@@ -35,10 +38,10 @@
     }
 
     @SuppressWarnings("unchecked")
-    public ParameterizedClass(LinkedHashMap<String, ?> p)
+    public ParameterizedClass(Map<String, ?> p)
     {
-        this((String)p.get("class_name"),
-                p.containsKey("parameters") ? (Map<String, String>)((List<?>)p.get("parameters")).get(0) : null);
+        this((String)p.get(CLASS_NAME),
+             p.containsKey(PARAMETERS) ? (Map<String, String>)((List<?>)p.get(PARAMETERS)).get(0) : null);
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/config/Schema.java b/src/java/org/apache/cassandra/config/Schema.java
index 2cd7611..6d91d8d 100644
--- a/src/java/org/apache/cassandra/config/Schema.java
+++ b/src/java/org/apache/cassandra/config/Schema.java

@@ -20,6 +20,7 @@
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.*;
+import java.util.stream.Collectors;
 
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
@@ -28,18 +29,19 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.AuthKeyspace;
-import org.apache.cassandra.cql3.functions.Functions;
-import org.apache.cassandra.cql3.functions.UDAggregate;
-import org.apache.cassandra.cql3.functions.UDFunction;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.functions.*;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.UserType;
-import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.locator.LocalStrategy;
 import org.apache.cassandra.repair.SystemDistributedKeyspace;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.tracing.TraceKeyspace;
 import org.apache.cassandra.utils.ConcurrentBiMap;
@@ -52,6 +54,14 @@
 
     public static final Schema instance = new Schema();
 
+    /* system keyspace names (the ones with LocalStrategy replication strategy) */
+    public static final Set<String> LOCAL_SYSTEM_KEYSPACE_NAMES =
+        ImmutableSet.of(SystemKeyspace.NAME, SchemaKeyspace.NAME);
+
+    /* replicate system keyspace names (the ones with a "true" replication strategy) */
+    public static final Set<String> REPLICATED_SYSTEM_KEYSPACE_NAMES =
+        ImmutableSet.of(TraceKeyspace.NAME, AuthKeyspace.NAME, SystemDistributedKeyspace.NAME);
+
     /**
      * longest permissible KS or CF name.  Our main concern is that filename not be more than 255 characters;
      * the filename will contain both the KS and CF names. Since non-schema-name components only take up
@@ -61,7 +71,7 @@
     public static final int NAME_LENGTH = 48;
 
     /* metadata map for faster keyspace lookup */
-    private final Map<String, KSMetaData> keyspaces = new NonBlockingHashMap<>();
+    private final Map<String, KeyspaceMetadata> keyspaces = new NonBlockingHashMap<>();
 
     /* Keyspace objects, one per keyspace. Only one instance should ever exist for any given keyspace. */
     private final Map<String, Keyspace> keyspaceInstances = new NonBlockingHashMap<>();
@@ -74,9 +84,6 @@
     // 59adb24e-f3cd-3e02-97f0-5b395827453f
     public static final UUID emptyVersion;
 
-    private static final ImmutableSet<String> replicatedSystemKeyspaceNames = ImmutableSet.of(TraceKeyspace.NAME,
-                                                                                              AuthKeyspace.NAME,
-                                                                                              SystemDistributedKeyspace.NAME);
 
     static
     {
@@ -95,7 +102,27 @@
      */
     public Schema()
     {
-        load(SystemKeyspace.definition());
+        if (!Config.isClientMode())
+        {
+            load(SchemaKeyspace.metadata());
+            load(SystemKeyspace.metadata());
+        }
+    }
+
+    /**
+     * @return whether or not the keyspace is a really system one (w/ LocalStrategy, unmodifiable, hardcoded)
+     */
+    public static boolean isLocalSystemKeyspace(String keyspaceName)
+    {
+        return LOCAL_SYSTEM_KEYSPACE_NAMES.contains(keyspaceName.toLowerCase());
+    }
+
+    /**
+     * @return whether or not the keyspace is a replicated system keyspace (trace, auth, sys-ditributed)
+     */
+    public static boolean isReplicatedSystemKeyspace(String keyspaceName)
+    {
+        return REPLICATED_SYSTEM_KEYSPACE_NAMES.contains(keyspaceName.toLowerCase());
     }
 
     /**
@@ -114,7 +141,7 @@
      */
     public Schema loadFromDisk(boolean updateVersion)
     {
-        load(LegacySchemaTables.readSchemaFromSystemTables());
+        load(SchemaKeyspace.fetchNonSystemKeyspaces());
         if (updateVersion)
             updateVersion();
         return this;
@@ -127,11 +154,9 @@
      *
      * @return self to support chaining calls
      */
-    public Schema load(Collection<KSMetaData> keyspaceDefs)
+    public Schema load(Iterable<KeyspaceMetadata> keyspaceDefs)
     {
-        for (KSMetaData def : keyspaceDefs)
-            load(def);
-
+        keyspaceDefs.forEach(this::load);
         return this;
     }
 
@@ -142,13 +167,11 @@
      *
      * @return self to support chaining calls
      */
-    public Schema load(KSMetaData keyspaceDef)
+    public Schema load(KeyspaceMetadata keyspaceDef)
     {
-        for (CFMetaData cfm : keyspaceDef.cfMetaData().values())
-            load(cfm);
-
-        setKeyspaceDefinition(keyspaceDef);
-
+        keyspaceDef.tables.forEach(this::load);
+        keyspaceDef.views.forEach(this::load);
+        setKeyspaceMetadata(keyspaceDef);
         return this;
     }
 
@@ -204,11 +227,13 @@
         if (baseCFS == null)
             return null;
 
-        SecondaryIndex index = baseCFS.indexManager.getIndexByName(cfName);
+        Index index = baseCFS.indexManager.getIndexByName(cfName.substring(indexOfSeparator + 1, cfName.length()));
         if (index == null)
             return null;
 
-        return index.getIndexCfs();
+        //Shouldn't ask for a backing table if there is none so just throw?
+        //Or should it return null?
+        return index.getBackingTable().get();
     }
 
     public ColumnFamilyStore getColumnFamilyStoreInstance(UUID cfId)
@@ -254,7 +279,7 @@
      *
      * @param ksm The keyspace definition to remove
      */
-    public void clearKeyspaceDefinition(KSMetaData ksm)
+    public void clearKeyspaceMetadata(KeyspaceMetadata ksm)
     {
         keyspaces.remove(ksm.name);
     }
@@ -272,8 +297,11 @@
     public CFMetaData getCFMetaData(String keyspaceName, String cfName)
     {
         assert keyspaceName != null;
-        KSMetaData ksm = keyspaces.get(keyspaceName);
-        return (ksm == null) ? null : ksm.cfMetaData().get(cfName);
+
+        KeyspaceMetadata ksm = keyspaces.get(keyspaceName);
+        return ksm == null
+             ? null
+             : ksm.getTableOrViewNullable(cfName);
     }
 
     /**
@@ -294,6 +322,13 @@
         return getCFMetaData(descriptor.ksname, descriptor.cfname);
     }
 
+    public ViewDefinition getView(String keyspaceName, String viewName)
+    {
+        assert keyspaceName != null;
+        KeyspaceMetadata ksm = keyspaces.get(keyspaceName);
+        return (ksm == null) ? null : ksm.views.getNullable(viewName);
+    }
+
     /**
      * Get metadata about keyspace by its name
      *
@@ -301,7 +336,7 @@
      *
      * @return The keyspace metadata or null if it wasn't found
      */
-    public KSMetaData getKSMetaData(String keyspaceName)
+    public KeyspaceMetadata getKSMetaData(String keyspaceName)
     {
         assert keyspaceName != null;
         return keyspaces.get(keyspaceName);
@@ -309,7 +344,7 @@
 
     private Set<String> getNonSystemKeyspacesSet()
     {
-        return Sets.difference(keyspaces.keySet(), Collections.singleton(SystemKeyspace.NAME));
+        return Sets.difference(keyspaces.keySet(), LOCAL_SYSTEM_KEYSPACE_NAMES);
     }
 
     /**
@@ -323,11 +358,34 @@
     }
 
     /**
+     * @return a collection of keyspaces that do not use LocalStrategy for replication
+     */
+    public List<String> getNonLocalStrategyKeyspaces()
+    {
+        return keyspaces.values().stream()
+                .filter(keyspace -> keyspace.params.replication.klass != LocalStrategy.class)
+                .map(keyspace -> keyspace.name)
+                .collect(Collectors.toList());
+    }
+
+    /**
      * @return collection of the user defined keyspaces
      */
     public List<String> getUserKeyspaces()
     {
-        return ImmutableList.copyOf(Sets.difference(getNonSystemKeyspacesSet(), replicatedSystemKeyspaceNames));
+        return ImmutableList.copyOf(Sets.difference(getNonSystemKeyspacesSet(), REPLICATED_SYSTEM_KEYSPACE_NAMES));
+    }
+
+    public Keyspaces getReplicatedKeyspaces()
+    {
+        Keyspaces.Builder builder = Keyspaces.builder();
+
+        keyspaces.values()
+                 .stream()
+                 .filter(k -> !Schema.isLocalSystemKeyspace(k.name))
+                 .forEach(builder::add);
+
+        return builder.build();
     }
 
     /**
@@ -337,12 +395,12 @@
      *
      * @return metadata about ColumnFamilies the belong to the given keyspace
      */
-    public Map<String, CFMetaData> getKeyspaceMetaData(String keyspaceName)
+    public Iterable<CFMetaData> getTablesAndViews(String keyspaceName)
     {
         assert keyspaceName != null;
-        KSMetaData ksm = keyspaces.get(keyspaceName);
+        KeyspaceMetadata ksm = keyspaces.get(keyspaceName);
         assert ksm != null;
-        return ksm.cfMetaData();
+        return ksm.tablesAndViews();
     }
 
     /**
@@ -353,12 +411,14 @@
         return keyspaces.keySet();
     }
 
-    /**
-     * @return collection of the metadata about all keyspaces registered in the system (system and non-system)
-     */
-    public Collection<KSMetaData> getKeyspaceDefinitions()
+    public Keyspaces getKeyspaces(Set<String> includedKeyspaceNames)
     {
-        return keyspaces.values();
+        Keyspaces.Builder builder = Keyspaces.builder();
+        keyspaces.values()
+                 .stream()
+                 .filter(k -> includedKeyspaceNames.contains(k.name))
+                 .forEach(builder::add);
+        return builder.build();
     }
 
     /**
@@ -366,10 +426,14 @@
      *
      * @param ksm The metadata about keyspace
      */
-    public void setKeyspaceDefinition(KSMetaData ksm)
+    public void setKeyspaceMetadata(KeyspaceMetadata ksm)
     {
         assert ksm != null;
+
         keyspaces.put(ksm.name, ksm);
+        Keyspace keyspace = getKeyspaceInstance(ksm.name);
+        if (keyspace != null)
+            keyspace.setMetadata(ksm);
     }
 
     /* ColumnFamily query/control methods */
@@ -423,14 +487,80 @@
     }
 
     /**
+     * Load individual View Definition to the schema
+     * (to make View lookup faster)
+     *
+     * @param view The View definition to load
+     */
+    public void load(ViewDefinition view)
+    {
+        CFMetaData cfm = view.metadata;
+        Pair<String, String> key = Pair.create(cfm.ksName, cfm.cfName);
+
+        if (cfIdMap.containsKey(key))
+            throw new RuntimeException(String.format("Attempting to load already loaded view %s.%s", cfm.ksName, cfm.cfName));
+
+        logger.debug("Adding {} to cfIdMap", cfm);
+        cfIdMap.put(key, cfm.cfId);
+    }
+
+    /**
      * Used for ColumnFamily data eviction out from the schema
      *
      * @param cfm The ColumnFamily Definition to evict
      */
-    public void purge(CFMetaData cfm)
+    public void unload(CFMetaData cfm)
     {
         cfIdMap.remove(Pair.create(cfm.ksName, cfm.cfName));
-        cfm.markPurged();
+    }
+
+    /**
+     * Used for View eviction from the schema
+     *
+     * @param view The view definition to evict
+     */
+    private void unload(ViewDefinition view)
+    {
+        cfIdMap.remove(Pair.create(view.ksName, view.viewName));
+    }
+
+    /* Function helpers */
+
+    /**
+     * Get all function overloads with the specified name
+     *
+     * @param name fully qualified function name
+     * @return an empty list if the keyspace or the function name are not found;
+     *         a non-empty collection of {@link Function} otherwise
+     */
+    public Collection<Function> getFunctions(FunctionName name)
+    {
+        if (!name.hasKeyspace())
+            throw new IllegalArgumentException(String.format("Function name must be fully quallified: got %s", name));
+
+        KeyspaceMetadata ksm = getKSMetaData(name.keyspace);
+        return ksm == null
+             ? Collections.emptyList()
+             : ksm.functions.get(name);
+    }
+
+    /**
+     * Find the function with the specified name
+     *
+     * @param name fully qualified function name
+     * @param argTypes function argument types
+     * @return an empty {@link Optional} if the keyspace or the function name are not found;
+     *         a non-empty optional of {@link Function} otherwise
+     */
+    public Optional<Function> findFunction(FunctionName name, List<AbstractType<?>> argTypes)
+    {
+        if (!name.hasKeyspace())
+            throw new IllegalArgumentException(String.format("Function name must be fully quallified: got %s", name));
+
+        KeyspaceMetadata ksm = getKSMetaData(name.keyspace);
+        return ksm == null
+             ? Optional.empty()
+             : ksm.functions.find(name, argTypes);
     }
 
     /* Version control */
@@ -449,7 +579,7 @@
      */
     public void updateVersion()
     {
-        version = LegacySchemaTables.calculateSchemaDigest();
+        version = SchemaKeyspace.calculateSchemaDigest();
         SystemKeyspace.updateSchemaVersion(version);
     }
 
@@ -469,16 +599,16 @@
     {
         for (String keyspaceName : getNonSystemKeyspaces())
         {
-            KSMetaData ksm = getKSMetaData(keyspaceName);
-            for (CFMetaData cfm : ksm.cfMetaData().values())
-                purge(cfm);
-            clearKeyspaceDefinition(ksm);
+            KeyspaceMetadata ksm = getKSMetaData(keyspaceName);
+            ksm.tables.forEach(this::unload);
+            ksm.views.forEach(this::unload);
+            clearKeyspaceMetadata(ksm);
         }
 
         updateVersionAndAnnounce();
     }
 
-    public void addKeyspace(KSMetaData ksm)
+    public void addKeyspace(KeyspaceMetadata ksm)
     {
         assert getKSMetaData(ksm.name) == null;
         load(ksm);
@@ -487,34 +617,28 @@
         MigrationManager.instance.notifyCreateKeyspace(ksm);
     }
 
-    public void updateKeyspace(String ksName)
+    public void updateKeyspace(String ksName, KeyspaceParams newParams)
     {
-        KSMetaData oldKsm = getKSMetaData(ksName);
-        assert oldKsm != null;
-        KSMetaData newKsm = LegacySchemaTables.createKeyspaceFromName(ksName).cloneWith(oldKsm.cfMetaData().values(), oldKsm.userTypes);
-
-        setKeyspaceDefinition(newKsm);
-        Keyspace.open(ksName).setMetadata(newKsm);
-
-        MigrationManager.instance.notifyUpdateKeyspace(newKsm);
+        KeyspaceMetadata ksm = update(ksName, ks -> ks.withSwapped(newParams));
+        MigrationManager.instance.notifyUpdateKeyspace(ksm);
     }
 
     public void dropKeyspace(String ksName)
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(ksName);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ksName);
         String snapshotName = Keyspace.getTimestampedSnapshotName(ksName);
 
-        CompactionManager.instance.interruptCompactionFor(ksm.cfMetaData().values(), true);
+        CompactionManager.instance.interruptCompactionFor(ksm.tablesAndViews(), true);
 
         Keyspace keyspace = Keyspace.open(ksm.name);
 
         // remove all cfs from the keyspace instance.
         List<UUID> droppedCfs = new ArrayList<>();
-        for (CFMetaData cfm : ksm.cfMetaData().values())
+        for (CFMetaData cfm : ksm.tablesAndViews())
         {
             ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfm.cfName);
 
-            purge(cfm);
+            unload(cfm);
 
             if (DatabaseDescriptor.isAutoSnapshot())
                 cfs.snapshot(snapshotName);
@@ -525,9 +649,9 @@
 
         // remove the keyspace from the static instances.
         Keyspace.clear(ksm.name);
-        clearKeyspaceDefinition(ksm);
+        clearKeyspaceMetadata(ksm);
 
-        keyspace.writeOrder.awaitNewBarrier();
+        Keyspace.writeOrder.awaitNewBarrier();
 
         // force a new segment in the CL
         CommitLog.instance.forceRecycleAllSegments(droppedCfs);
@@ -538,144 +662,175 @@
     public void addTable(CFMetaData cfm)
     {
         assert getCFMetaData(cfm.ksName, cfm.cfName) == null;
-        KSMetaData ksm = getKSMetaData(cfm.ksName).cloneWithTableAdded(cfm);
 
-        logger.info("Loading {}", cfm);
-
-        load(cfm);
-
-        // make sure it's init-ed w/ the old definitions first,
-        // since we're going to call initCf on the new one manually
-        Keyspace.open(cfm.ksName);
-
-        // init the new CF before switching the KSM to the new one
+        // Make sure the keyspace is initialized
+        // and init the new CF before switching the KSM to the new one
         // to avoid races as in CASSANDRA-10761
         Keyspace.open(cfm.ksName).initCf(cfm, true);
-        setKeyspaceDefinition(ksm);
+        // Update the keyspaces map with the updated metadata
+        update(cfm.ksName, ks -> ks.withSwapped(ks.tables.with(cfm)));
+        // Update the table ID <-> table name map (cfIdMap)
+        load(cfm);
         MigrationManager.instance.notifyCreateColumnFamily(cfm);
     }
 
-    public void updateTable(String ksName, String tableName)
+    public void updateTable(CFMetaData table)
     {
-        CFMetaData cfm = getCFMetaData(ksName, tableName);
-        assert cfm != null;
-        boolean columnsDidChange = cfm.reload();
+        CFMetaData current = getCFMetaData(table.ksName, table.cfName);
+        assert current != null;
+        boolean changeAffectsStatements = current.apply(table);
 
-        Keyspace keyspace = Keyspace.open(cfm.ksName);
-        keyspace.getColumnFamilyStore(cfm.cfName).reload();
-        MigrationManager.instance.notifyUpdateColumnFamily(cfm, columnsDidChange);
+        Keyspace keyspace = Keyspace.open(current.ksName);
+        keyspace.getColumnFamilyStore(current.cfName).reload();
+        MigrationManager.instance.notifyUpdateColumnFamily(current, changeAffectsStatements);
     }
 
     public void dropTable(String ksName, String tableName)
     {
-        KSMetaData ksm = getKSMetaData(ksName);
-        assert ksm != null;
+        KeyspaceMetadata oldKsm = getKSMetaData(ksName);
+        assert oldKsm != null;
         ColumnFamilyStore cfs = Keyspace.open(ksName).getColumnFamilyStore(tableName);
         assert cfs != null;
 
+        // make sure all the indexes are dropped, or else.
+        cfs.indexManager.markAllIndexesRemoved();
+
         // reinitialize the keyspace.
-        CFMetaData cfm = ksm.cfMetaData().get(tableName);
+        CFMetaData cfm = oldKsm.tables.get(tableName).get();
+        KeyspaceMetadata newKsm = oldKsm.withSwapped(oldKsm.tables.without(tableName));
 
-        purge(cfm);
-        setKeyspaceDefinition(ksm.cloneWithTableRemoved(cfm));
+        unload(cfm);
+        setKeyspaceMetadata(newKsm);
 
-        CompactionManager.instance.interruptCompactionFor(Arrays.asList(cfm), true);
+        CompactionManager.instance.interruptCompactionFor(Collections.singleton(cfm), true);
 
         if (DatabaseDescriptor.isAutoSnapshot())
             cfs.snapshot(Keyspace.getTimestampedSnapshotName(cfs.name));
-        Keyspace.open(ksm.name).dropCf(cfm.cfId);
+        Keyspace.open(ksName).dropCf(cfm.cfId);
         MigrationManager.instance.notifyDropColumnFamily(cfm);
 
         CommitLog.instance.forceRecycleAllSegments(Collections.singleton(cfm.cfId));
     }
 
+    public void addView(ViewDefinition view)
+    {
+        assert getCFMetaData(view.ksName, view.viewName) == null;
+
+        Keyspace keyspace = Keyspace.open(view.ksName);
+
+        // Make sure the keyspace is initialized and initialize the table.
+        keyspace.initCf(view.metadata, true);
+        // Update the keyspaces map with the updated metadata
+        update(view.ksName, ks -> ks.withSwapped(ks.views.with(view)));
+        // Update the table ID <-> table name map (cfIdMap)
+        load(view);
+
+        keyspace.viewManager.reload();
+        MigrationManager.instance.notifyCreateView(view);
+    }
+
+    public void updateView(ViewDefinition view)
+    {
+        ViewDefinition current = getKSMetaData(view.ksName).views.get(view.viewName).get();
+        boolean changeAffectsStatements = current.metadata.apply(view.metadata);
+
+        Keyspace keyspace = Keyspace.open(current.ksName);
+        keyspace.getColumnFamilyStore(current.viewName).reload();
+        Keyspace.open(current.ksName).viewManager.update(current.viewName);
+        MigrationManager.instance.notifyUpdateView(current, changeAffectsStatements);
+    }
+
+    public void dropView(String ksName, String viewName)
+    {
+        KeyspaceMetadata oldKsm = getKSMetaData(ksName);
+        assert oldKsm != null;
+        ColumnFamilyStore cfs = Keyspace.open(ksName).getColumnFamilyStore(viewName);
+        assert cfs != null;
+
+        // make sure all the indexes are dropped, or else.
+        cfs.indexManager.markAllIndexesRemoved();
+
+        // reinitialize the keyspace.
+        ViewDefinition view = oldKsm.views.get(viewName).get();
+        KeyspaceMetadata newKsm = oldKsm.withSwapped(oldKsm.views.without(viewName));
+
+        unload(view);
+        setKeyspaceMetadata(newKsm);
+
+        CompactionManager.instance.interruptCompactionFor(Collections.singleton(view.metadata), true);
+
+        if (DatabaseDescriptor.isAutoSnapshot())
+            cfs.snapshot(Keyspace.getTimestampedSnapshotName(cfs.name));
+        Keyspace.open(ksName).dropCf(view.metadata.cfId);
+        Keyspace.open(ksName).viewManager.reload();
+        MigrationManager.instance.notifyDropView(view);
+
+        CommitLog.instance.forceRecycleAllSegments(Collections.singleton(view.metadata.cfId));
+    }
+
     public void addType(UserType ut)
     {
-        KSMetaData ksm = getKSMetaData(ut.keyspace);
-        assert ksm != null;
-
-        logger.info("Loading {}", ut);
-
-        ksm.userTypes.addType(ut);
-
+        update(ut.keyspace, ks -> ks.withSwapped(ks.types.with(ut)));
         MigrationManager.instance.notifyCreateUserType(ut);
     }
 
     public void updateType(UserType ut)
     {
-        KSMetaData ksm = getKSMetaData(ut.keyspace);
-        assert ksm != null;
-
-        logger.info("Updating {}", ut);
-
-        ksm.userTypes.addType(ut);
-
+        update(ut.keyspace, ks -> ks.withSwapped(ks.types.without(ut.name).with(ut)));
         MigrationManager.instance.notifyUpdateUserType(ut);
     }
 
     public void dropType(UserType ut)
     {
-        KSMetaData ksm = getKSMetaData(ut.keyspace);
-        assert ksm != null;
-
-        ksm.userTypes.removeType(ut);
-
+        update(ut.keyspace, ks -> ks.withSwapped(ks.types.without(ut.name)));
         MigrationManager.instance.notifyDropUserType(ut);
     }
 
     public void addFunction(UDFunction udf)
     {
-        logger.info("Loading {}", udf);
-
-        Functions.addOrReplaceFunction(udf);
-
+        update(udf.name().keyspace, ks -> ks.withSwapped(ks.functions.with(udf)));
         MigrationManager.instance.notifyCreateFunction(udf);
     }
 
     public void updateFunction(UDFunction udf)
     {
-        logger.info("Updating {}", udf);
-
-        Functions.addOrReplaceFunction(udf);
-
+        update(udf.name().keyspace, ks -> ks.withSwapped(ks.functions.without(udf.name(), udf.argTypes()).with(udf)));
         MigrationManager.instance.notifyUpdateFunction(udf);
     }
 
     public void dropFunction(UDFunction udf)
     {
-        logger.info("Drop {}", udf);
-
-        // TODO: this is kind of broken as this remove all overloads of the function name
-        Functions.removeFunction(udf.name(), udf.argTypes());
-
+        update(udf.name().keyspace, ks -> ks.withSwapped(ks.functions.without(udf.name(), udf.argTypes())));
         MigrationManager.instance.notifyDropFunction(udf);
     }
 
-    public void addAggregate(UDAggregate udf)
+    public void addAggregate(UDAggregate uda)
     {
-        logger.info("Loading {}", udf);
-
-        Functions.addOrReplaceFunction(udf);
-
-        MigrationManager.instance.notifyCreateAggregate(udf);
+        update(uda.name().keyspace, ks -> ks.withSwapped(ks.functions.with(uda)));
+        MigrationManager.instance.notifyCreateAggregate(uda);
     }
 
-    public void updateAggregate(UDAggregate udf)
+    public void updateAggregate(UDAggregate uda)
     {
-        logger.info("Updating {}", udf);
-
-        Functions.addOrReplaceFunction(udf);
-
-        MigrationManager.instance.notifyUpdateAggregate(udf);
+        update(uda.name().keyspace, ks -> ks.withSwapped(ks.functions.without(uda.name(), uda.argTypes()).with(uda)));
+        MigrationManager.instance.notifyUpdateAggregate(uda);
     }
 
-    public void dropAggregate(UDAggregate udf)
+    public void dropAggregate(UDAggregate uda)
     {
-        logger.info("Drop {}", udf);
+        update(uda.name().keyspace, ks -> ks.withSwapped(ks.functions.without(uda.name(), uda.argTypes())));
+        MigrationManager.instance.notifyDropAggregate(uda);
+    }
 
-        // TODO: this is kind of broken as this remove all overloads of the function name
-        Functions.removeFunction(udf.name(), udf.argTypes());
+    private synchronized KeyspaceMetadata update(String keyspaceName, java.util.function.Function<KeyspaceMetadata, KeyspaceMetadata> transformation)
+    {
+        KeyspaceMetadata current = getKSMetaData(keyspaceName);
+        if (current == null)
+            throw new IllegalStateException(String.format("Keyspace %s doesn't exist", keyspaceName));
 
-        MigrationManager.instance.notifyDropAggregate(udf);
+        KeyspaceMetadata transformed = transformation.apply(current);
+        setKeyspaceMetadata(transformed);
+
+        return transformed;
     }
 }

diff --git a/src/java/org/apache/cassandra/config/TriggerDefinition.java b/src/java/org/apache/cassandra/config/TriggerDefinition.java
deleted file mode 100644
index 6a84379..0000000
--- a/src/java/org/apache/cassandra/config/TriggerDefinition.java
+++ /dev/null

@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- *  with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.config;
-
-import com.google.common.base.Objects;
-
-public class TriggerDefinition
-{
-    public static final String CLASS = "class";
-
-    public final String name;
-
-    // For now, the only supported option is 'class'.
-    // Proper trigger parametrization will be added later.
-    public final String classOption;
-
-    public TriggerDefinition(String name, String classOption)
-    {
-        this.name = name;
-        this.classOption = classOption;
-    }
-
-    public static TriggerDefinition create(String name, String classOption)
-    {
-        return new TriggerDefinition(name, classOption);
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-
-        if (!(o instanceof TriggerDefinition))
-            return false;
-
-        TriggerDefinition td = (TriggerDefinition) o;
-
-        return Objects.equal(name, td.name) && Objects.equal(classOption, td.classOption);
-    }
-
-    @Override
-    public int hashCode()
-    {
-        return Objects.hashCode(name, classOption);
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this).add("name", name).add("classOption", classOption).toString();
-    }
-}

diff --git a/src/java/org/apache/cassandra/config/UTMetaData.java b/src/java/org/apache/cassandra/config/UTMetaData.java
deleted file mode 100644
index 08cedee..0000000
--- a/src/java/org/apache/cassandra/config/UTMetaData.java
+++ /dev/null

@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.config;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Defined (and loaded) user types.
- *
- * In practice, because user types are global, we have only one instance of
- * this class that retrieve through the Schema class.
- */
-public final class UTMetaData
-{
-    private final Map<ByteBuffer, UserType> userTypes;
-
-    public UTMetaData()
-    {
-        this(new HashMap<ByteBuffer, UserType>());
-    }
-
-    public UTMetaData(Map<ByteBuffer, UserType> types)
-    {
-        this.userTypes = types;
-    }
-
-    public UserType getType(ByteBuffer typeName)
-    {
-        return userTypes.get(typeName);
-    }
-
-    public Map<ByteBuffer, UserType> getAllTypes()
-    {
-        // Copy to avoid concurrent modification while iterating. Not intended to be called on a critical path anyway
-        return new HashMap<>(userTypes);
-    }
-
-    // This is *not* thread safe but is only called in Schema that is synchronized.
-    public void addType(UserType type)
-    {
-        UserType old = userTypes.get(type.name);
-        assert old == null || type.isCompatibleWith(old);
-        userTypes.put(type.name, type);
-    }
-
-    // Same remarks than for addType
-    public void removeType(UserType type)
-    {
-        userTypes.remove(type.name);
-    }
-
-    public boolean equals(Object that)
-    {
-        if (!(that instanceof UTMetaData))
-            return false;
-        return userTypes.equals(((UTMetaData) that).userTypes);
-    }
-}

diff --git a/src/java/org/apache/cassandra/config/ViewDefinition.java b/src/java/org/apache/cassandra/config/ViewDefinition.java
new file mode 100644
index 0000000..5300f56
--- /dev/null
+++ b/src/java/org/apache/cassandra/config/ViewDefinition.java

@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.config;
+
+import java.util.List;
+import java.util.Objects;
+import java.util.UUID;
+import java.util.stream.Collectors;
+
+import org.antlr.runtime.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.statements.SelectStatement;
+import org.apache.cassandra.db.view.View;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+
+public class ViewDefinition
+{
+    public final String ksName;
+    public final String viewName;
+    public final UUID baseTableId;
+    public final String baseTableName;
+    public final boolean includeAllColumns;
+    public final CFMetaData metadata;
+
+    public SelectStatement.RawStatement select;
+    public String whereClause;
+
+    public ViewDefinition(ViewDefinition def)
+    {
+        this(def.ksName, def.viewName, def.baseTableId, def.baseTableName, def.includeAllColumns, def.select, def.whereClause, def.metadata);
+    }
+
+    /**
+     * @param viewName          Name of the view
+     * @param baseTableId       Internal ID of the table which this view is based off of
+     * @param includeAllColumns Whether to include all columns or not
+     */
+    public ViewDefinition(String ksName, String viewName, UUID baseTableId, String baseTableName, boolean includeAllColumns, SelectStatement.RawStatement select, String whereClause, CFMetaData metadata)
+    {
+        this.ksName = ksName;
+        this.viewName = viewName;
+        this.baseTableId = baseTableId;
+        this.baseTableName = baseTableName;
+        this.includeAllColumns = includeAllColumns;
+        this.select = select;
+        this.whereClause = whereClause;
+        this.metadata = metadata;
+    }
+
+    /**
+     * @return true if the view specified by this definition will include the column, false otherwise
+     */
+    public boolean includes(ColumnIdentifier column)
+    {
+        return metadata.getColumnDefinition(column) != null;
+    }
+
+    public ViewDefinition copy()
+    {
+        return new ViewDefinition(ksName, viewName, baseTableId, baseTableName, includeAllColumns, select, whereClause, metadata.copy());
+    }
+
+    public CFMetaData baseTableMetadata()
+    {
+        return Schema.instance.getCFMetaData(baseTableId);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof ViewDefinition))
+            return false;
+
+        ViewDefinition other = (ViewDefinition) o;
+        return Objects.equals(ksName, other.ksName)
+               && Objects.equals(viewName, other.viewName)
+               && Objects.equals(baseTableId, other.baseTableId)
+               && Objects.equals(includeAllColumns, other.includeAllColumns)
+               && Objects.equals(whereClause, other.whereClause)
+               && Objects.equals(metadata, other.metadata);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return new HashCodeBuilder(29, 1597)
+               .append(ksName)
+               .append(viewName)
+               .append(baseTableId)
+               .append(includeAllColumns)
+               .append(whereClause)
+               .append(metadata)
+               .toHashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return new ToStringBuilder(this)
+               .append("ksName", ksName)
+               .append("viewName", viewName)
+               .append("baseTableId", baseTableId)
+               .append("baseTableName", baseTableName)
+               .append("includeAllColumns", includeAllColumns)
+               .append("whereClause", whereClause)
+               .append("metadata", metadata)
+               .toString();
+    }
+
+    /**
+     * Replace the column {@param from} with {@param to} in this materialized view definition's partition,
+     * clustering, or included columns.
+     */
+    public void renameColumn(ColumnIdentifier from, ColumnIdentifier to)
+    {
+        metadata.renameColumn(from, to);
+
+        // convert whereClause to Relations, rename ids in Relations, then convert back to whereClause
+        List<Relation> relations = whereClauseToRelations(whereClause);
+        ColumnIdentifier.Raw fromRaw = new ColumnIdentifier.Literal(from.toString(), true);
+        ColumnIdentifier.Raw toRaw = new ColumnIdentifier.Literal(to.toString(), true);
+        List<Relation> newRelations = relations.stream()
+                .map(r -> r.renameIdentifier(fromRaw, toRaw))
+                .collect(Collectors.toList());
+
+        this.whereClause = View.relationsToWhereClause(newRelations);
+        String rawSelect = View.buildSelectStatement(baseTableName, metadata.allColumns(), whereClause);
+        this.select = (SelectStatement.RawStatement) QueryProcessor.parseStatement(rawSelect);
+    }
+
+    private static List<Relation> whereClauseToRelations(String whereClause)
+    {
+        try
+        {
+            List<Relation> relations = CQLFragmentParser.parseAnyUnhandled(CqlParser::whereClause, whereClause).build().relations;
+
+            return relations;
+        }
+        catch (RecognitionException | SyntaxException exc)
+        {
+            throw new RuntimeException("Unexpected error parsing materialized view's where clause while handling column rename: ", exc);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
index dc691c4..435377c 100644
--- a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
+++ b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java

@@ -24,11 +24,13 @@
 import java.io.IOException;
 import java.net.URL;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
 
-import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
 import com.google.common.io.ByteStreams;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -36,6 +38,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.yaml.snakeyaml.TypeDescription;
 import org.yaml.snakeyaml.Yaml;
+import org.yaml.snakeyaml.constructor.Constructor;
 import org.yaml.snakeyaml.error.YAMLException;
 import org.yaml.snakeyaml.introspector.MissingProperty;
 import org.yaml.snakeyaml.introspector.Property;
@@ -50,7 +53,7 @@
     /**
      * Inspect the classpath to find storage configuration file
      */
-    static URL getStorageConfigURL() throws ConfigurationException
+    private static URL getStorageConfigURL() throws ConfigurationException
     {
         String configUrl = System.getProperty("cassandra.config");
         if (configUrl == null)
@@ -70,19 +73,26 @@
             {
                 String required = "file:" + File.separator + File.separator;
                 if (!configUrl.startsWith(required))
-                    throw new ConfigurationException("Expecting URI in variable: [cassandra.config].  Please prefix the file with " + required + File.separator +
-                            " for local files or " + required + "<server>" + File.separator + " for remote files. Aborting. If you are executing this from an external tool, it needs to set Config.setClientMode(true) to avoid loading configuration.");
+                    throw new ConfigurationException(String.format(
+                        "Expecting URI in variable: [cassandra.config]. Found[%s]. Please prefix the file with [%s%s] for local " +
+                        "files and [%s<server>%s] for remote files. If you are executing this from an external tool, it needs " +
+                        "to set Config.setClientMode(true) to avoid loading configuration.",
+                        configUrl, required, File.separator, required, File.separator));
                 throw new ConfigurationException("Cannot locate " + configUrl + ".  If this is a local file, please confirm you've provided " + required + File.separator + " as a URI prefix.");
             }
         }
 
+        logger.info("Configuration location: {}", url);
+
         return url;
     }
 
+    private static final URL storageConfigURL = getStorageConfigURL();
+
     @Override
     public Config loadConfig() throws ConfigurationException
     {
-        return loadConfig(getStorageConfigURL());
+        return loadConfig(storageConfigURL);
     }
 
     public Config loadConfig(URL url) throws ConfigurationException
@@ -101,15 +111,11 @@
                 throw new AssertionError(e);
             }
 
-            org.yaml.snakeyaml.constructor.Constructor constructor = new org.yaml.snakeyaml.constructor.Constructor(Config.class);
-            TypeDescription seedDesc = new TypeDescription(ParameterizedClass.class);
-            seedDesc.putMapPropertyType("parameters", String.class, String.class);
-            constructor.addTypeDescription(seedDesc);
+            Constructor constructor = new CustomConstructor(Config.class);
             MissingPropertiesChecker propertiesChecker = new MissingPropertiesChecker();
             constructor.setPropertyUtils(propertiesChecker);
             Yaml yaml = new Yaml(constructor);
             Config result = yaml.loadAs(new ByteArrayInputStream(configBytes), Config.class);
-            result.configHintedHandoff();
             propertiesChecker.check();
             return result;
         }
@@ -119,6 +125,42 @@
         }
     }
 
+    static class CustomConstructor extends Constructor
+    {
+        CustomConstructor(Class<?> theRoot)
+        {
+            super(theRoot);
+
+            TypeDescription seedDesc = new TypeDescription(ParameterizedClass.class);
+            seedDesc.putMapPropertyType("parameters", String.class, String.class);
+            addTypeDescription(seedDesc);
+        }
+
+        @Override
+        protected List<Object> createDefaultList(int initSize)
+        {
+            return Lists.newCopyOnWriteArrayList();
+        }
+
+        @Override
+        protected Map<Object, Object> createDefaultMap()
+        {
+            return Maps.newConcurrentMap();
+        }
+
+        @Override
+        protected Set<Object> createDefaultSet(int initSize)
+        {
+            return Sets.newConcurrentHashSet();
+        }
+
+        @Override
+        protected Set<Object> createDefaultSet()
+        {
+            return Sets.newConcurrentHashSet();
+        }
+    }
+
     private static class MissingPropertiesChecker extends PropertyUtils
     {
         private final Set<String> missingProperties = new HashSet<>();

diff --git a/src/java/org/apache/cassandra/cql3/AbstractConditions.java b/src/java/org/apache/cassandra/cql3/AbstractConditions.java
new file mode 100644
index 0000000..530d2b1
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/AbstractConditions.java

@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.List;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.functions.Function;
+
+/**
+ * Base class for <code>Conditions</code> classes.
+ *
+ */
+abstract class AbstractConditions implements Conditions
+{
+    public void addFunctionsTo(List<Function> functions)
+    {
+    }
+
+    public Iterable<ColumnDefinition> getColumns()
+    {
+        return null;
+    }
+
+    public boolean isEmpty()
+    {
+        return false;
+    }
+
+    public boolean appliesToStaticColumns()
+    {
+        return false;
+    }
+
+    public boolean appliesToRegularColumns()
+    {
+        return false;
+    }
+
+    public boolean isIfExists()
+    {
+        return false;
+    }
+
+    public boolean isIfNotExists()
+    {
+        return false;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/AbstractMarker.java b/src/java/org/apache/cassandra/cql3/AbstractMarker.java
index d11b8e2..14170b1 100644
--- a/src/java/org/apache/cassandra/cql3/AbstractMarker.java
+++ b/src/java/org/apache/cassandra/cql3/AbstractMarker.java

@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.cql3;
 
-import java.util.Collections;
+import java.util.List;
 
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.db.marshal.CollectionType;
@@ -48,17 +48,16 @@
         return true;
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return Collections.emptySet();
     }
 
     /**
      * A parsed, but non prepared, bind marker.
      */
-    public static class Raw implements Term.Raw
+    public static class Raw extends Term.Raw
     {
-        protected final int bindIndex;
+        private final int bindIndex;
 
         public Raw(int bindIndex)
         {
@@ -85,7 +84,39 @@
         }
 
         @Override
-        public String toString()
+        public String getText()
+        {
+            return "?";
+        }
+
+        public int bindIndex()
+        {
+            return bindIndex;
+        }
+    }
+
+    /** A MultiColumnRaw version of AbstractMarker.Raw */
+    public static abstract class MultiColumnRaw extends Term.MultiColumnRaw
+    {
+        protected final int bindIndex;
+
+        public MultiColumnRaw(int bindIndex)
+        {
+            this.bindIndex = bindIndex;
+        }
+
+        public NonTerminal prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
+        {
+            throw new AssertionError("MultiColumnRaw..prepare() requires a list of receivers");
+        }
+
+        public AssignmentTestable.TestResult testAssignment(String keyspace, ColumnSpecification receiver)
+        {
+            return AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE;
+        }
+
+        @Override
+        public String getText()
         {
             return "?";
         }
@@ -113,7 +144,7 @@
         @Override
         public AbstractMarker prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            return new Lists.Marker(bindIndex, makeInReceiver(receiver));
+            return new Lists.Marker(bindIndex(), makeInReceiver(receiver));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/Attributes.java b/src/java/org/apache/cassandra/cql3/Attributes.java
index 84f423a..832d0a7 100644
--- a/src/java/org/apache/cassandra/cql3/Attributes.java
+++ b/src/java/org/apache/cassandra/cql3/Attributes.java

@@ -18,23 +18,16 @@
 package org.apache.cassandra.cql3;
 
 import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.concurrent.TimeUnit;
-
-import com.google.common.collect.Iterables;
-import com.google.common.annotations.VisibleForTesting;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import java.util.List;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.ExpiringCell;
+import org.apache.cassandra.db.ExpirationDateOverflowHandling;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.NoSpamLogger;
 
 /**
  * Utility class for the Parser to gather attributes for modification
@@ -42,40 +35,13 @@
  */
 public class Attributes
 {
-    private static final int EXPIRATION_OVERFLOW_WARNING_INTERVAL_MINUTES = Integer.getInteger("cassandra.expiration_overflow_warning_interval_minutes", 5);
-
-    private static final Logger logger = LoggerFactory.getLogger(Attributes.class);
-
-    public enum ExpirationDateOverflowPolicy
-    {
-        REJECT, CAP
-    }
-
-    @VisibleForTesting
-    public static ExpirationDateOverflowPolicy policy;
-
-    static {
-        String policyAsString = System.getProperty("cassandra.expiration_date_overflow_policy", ExpirationDateOverflowPolicy.REJECT.name());
-        try
-        {
-            policy = ExpirationDateOverflowPolicy.valueOf(policyAsString.toUpperCase());
-        }
-        catch (RuntimeException e)
-        {
-            logger.warn("Invalid expiration date overflow policy: {}. Using default: {}", policyAsString, ExpirationDateOverflowPolicy.REJECT.name());
-            policy = ExpirationDateOverflowPolicy.REJECT;
-        }
-    }
-
-    public static final String MAXIMUM_EXPIRATION_DATE_EXCEEDED_WARNING = "Request on table {}.{} with {}ttl of {} seconds exceeds maximum supported expiration " +
-                                                                          "date of 2038-01-19T03:14:06+00:00 and will have its expiration capped to that date. " +
-                                                                          "In order to avoid this use a lower TTL or upgrade to a version where this limitation " +
-                                                                          "is fixed. See CASSANDRA-14092 for more details.";
-
-    public static final String MAXIMUM_EXPIRATION_DATE_EXCEEDED_REJECT_MESSAGE = "Request on table %s.%s with %sttl of %d seconds exceeds maximum supported expiration " +
-                                                                                 "date of 2038-01-19T03:14:06+00:00. In order to avoid this use a lower TTL, change " +
-                                                                                 "the expiration date overflow policy or upgrade to a version where this limitation " +
-                                                                                 "is fixed. See CASSANDRA-14092 for more details.";
+    /**
+     * If this limit is ever raised, make sure @{@link Integer#MAX_VALUE} is not allowed,
+     * as this is used as a flag to represent expired liveness.
+     *
+     * See {@link org.apache.cassandra.db.LivenessInfo#EXPIRED_LIVENESS_TTL}
+     */
+    public static final int MAX_TTL = 20 * 365 * 24 * 60 * 60; // 20 years in seconds
 
     private final Term timestamp;
     private final Term timeToLive;
@@ -91,16 +57,12 @@
         this.timeToLive = timeToLive;
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        if (timestamp != null && timeToLive != null)
-            return Iterables.concat(timestamp.getFunctions(), timeToLive.getFunctions());
-        else if (timestamp != null)
-            return timestamp.getFunctions();
-        else if (timeToLive != null)
-            return timeToLive.getFunctions();
-        else
-            return Collections.emptySet();
+        if (timestamp != null)
+            timestamp.addFunctionsTo(functions);
+        if (timeToLive != null)
+            timeToLive.addFunctionsTo(functions);
     }
 
     public boolean isTimestampSet()
@@ -141,8 +103,8 @@
     {
         if (timeToLive == null)
         {
-            maybeApplyExpirationDateOverflowPolicy(metadata, metadata.getDefaultTimeToLive(), true);
-            return metadata.getDefaultTimeToLive();
+            ExpirationDateOverflowHandling.maybeApplyExpirationDateOverflowPolicy(metadata, metadata.params.defaultTimeToLive, true);
+            return metadata.params.defaultTimeToLive;
         }
 
         ByteBuffer tval = timeToLive.bindAndGet(options);
@@ -165,10 +127,10 @@
         if (ttl < 0)
             throw new InvalidRequestException("A TTL must be greater or equal to 0, but was " + ttl);
 
-        if (ttl > ExpiringCell.MAX_TTL)
-            throw new InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", ttl, ExpiringCell.MAX_TTL));
+        if (ttl > MAX_TTL)
+            throw new InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", ttl, MAX_TTL));
 
-        maybeApplyExpirationDateOverflowPolicy(metadata, ttl, false);
+        ExpirationDateOverflowHandling.maybeApplyExpirationDateOverflowPolicy(metadata, ttl, false);
 
         return ttl;
     }
@@ -203,33 +165,4 @@
             return new ColumnSpecification(ksName, cfName, new ColumnIdentifier("[ttl]", true), Int32Type.instance);
         }
     }
-
-    public static void maybeApplyExpirationDateOverflowPolicy(CFMetaData metadata, int ttl, boolean isDefaultTTL) throws InvalidRequestException
-    {
-        if (ttl == 0)
-            return;
-
-        // Check for localExpirationTime overflow (CASSANDRA-14092)
-        int nowInSecs = (int)(System.currentTimeMillis() / 1000);
-        if (ttl + nowInSecs < 0)
-        {
-            switch (policy)
-            {
-                case CAP:
-                    /**
-                     * Capping at this stage is basically not rejecting the request. The actual capping is done
-                     * by {@link org.apache.cassandra.db.BufferExpiringCell#computeLocalExpirationTime(int)},
-                     * which converts the negative TTL to {@link org.apache.cassandra.db.BufferExpiringCell#MAX_DELETION_TIME}
-                     */
-                    NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, EXPIRATION_OVERFLOW_WARNING_INTERVAL_MINUTES,
-                                     TimeUnit.MINUTES, MAXIMUM_EXPIRATION_DATE_EXCEEDED_WARNING,
-                                     metadata.ksName, metadata.cfName, isDefaultTTL? "default " : "", ttl);
-                    return;
-
-                default: //REJECT
-                    throw new InvalidRequestException(String.format(MAXIMUM_EXPIRATION_DATE_EXCEEDED_REJECT_MESSAGE, metadata.ksName, metadata.cfName,
-                                                                    isDefaultTTL? "default " : "", ttl));
-            }
-        }
-    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/CQL3Row.java b/src/java/org/apache/cassandra/cql3/CQL3Row.java
deleted file mode 100644
index e3e76d1..0000000
--- a/src/java/org/apache/cassandra/cql3/CQL3Row.java
+++ /dev/null

@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.cassandra.db.Cell;
-
-public interface CQL3Row
-{
-    public ByteBuffer getClusteringColumn(int i);
-    public Cell getColumn(ColumnIdentifier name);
-    public List<Cell> getMultiCellColumn(ColumnIdentifier name);
-
-    public interface Builder
-    {
-        public RowIterator group(Iterator<Cell> cells);
-    }
-
-    public interface RowIterator extends Iterator<CQL3Row>
-    {
-        public CQL3Row getStaticRow();
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/CQL3Type.java b/src/java/org/apache/cassandra/cql3/CQL3Type.java
index 1823108..95524d9 100644
--- a/src/java/org/apache/cassandra/cql3/CQL3Type.java
+++ b/src/java/org/apache/cassandra/cql3/CQL3Type.java

@@ -17,18 +17,23 @@
  */
 package org.apache.cassandra.cql3;
 
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.cassandra.config.KSMetaData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.Types;
+import org.apache.cassandra.serializers.CollectionSerializer;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 public interface CQL3Type
 {
@@ -37,6 +42,15 @@
     public boolean isCollection();
     public AbstractType<?> getType();
 
+    /**
+     * Generates CQL literal from a binary value of this type.
+     *
+     * @param buffer the value to convert to a CQL literal. This value must be
+     * serialized with {@code version} of the native protocol.
+     * @param version the native protocol version in which {@code buffer} is encoded.
+     */
+    public String toCQLLiteral(ByteBuffer buffer, int version);
+
     public enum Native implements CQL3Type
     {
         ASCII       (AsciiType.instance),
@@ -44,21 +58,22 @@
         BLOB        (BytesType.instance),
         BOOLEAN     (BooleanType.instance),
         COUNTER     (CounterColumnType.instance),
+        DATE        (SimpleDateType.instance),
         DECIMAL     (DecimalType.instance),
         DOUBLE      (DoubleType.instance),
+        EMPTY       (EmptyType.instance),
         FLOAT       (FloatType.instance),
         INET        (InetAddressType.instance),
         INT         (Int32Type.instance),
         SMALLINT    (ShortType.instance),
         TEXT        (UTF8Type.instance),
+        TIME        (TimeType.instance),
         TIMESTAMP   (TimestampType.instance),
+        TIMEUUID    (TimeUUIDType.instance),
         TINYINT     (ByteType.instance),
         UUID        (UUIDType.instance),
         VARCHAR     (UTF8Type.instance),
-        VARINT      (IntegerType.instance),
-        TIMEUUID    (TimeUUIDType.instance),
-        DATE        (SimpleDateType.instance),
-        TIME        (TimeType.instance);
+        VARINT      (IntegerType.instance);
 
         private final AbstractType<?> type;
 
@@ -77,6 +92,18 @@
             return type;
         }
 
+        /**
+         * Delegate to
+         * {@link org.apache.cassandra.serializers.TypeSerializer#toCQLLiteral(ByteBuffer)}
+         * for native types as most CQL literal representations work fine with the default
+         * {@link org.apache.cassandra.serializers.TypeSerializer#toString(Object)}
+         * {@link org.apache.cassandra.serializers.TypeSerializer#deserialize(ByteBuffer)} implementations.
+         */
+        public String toCQLLiteral(ByteBuffer buffer, int version)
+        {
+            return type.getSerializer().toCQLLiteral(buffer);
+        }
+
         @Override
         public String toString()
         {
@@ -108,6 +135,12 @@
             return type;
         }
 
+        public String toCQLLiteral(ByteBuffer buffer, int version)
+        {
+            // *always* use the 'blob' syntax to express custom types in CQL
+            return Native.BLOB.toCQLLiteral(buffer, version);
+        }
+
         @Override
         public final boolean equals(Object o)
         {
@@ -127,7 +160,7 @@
         @Override
         public String toString()
         {
-            return "'" + type + "'";
+            return "'" + type + '\'';
         }
     }
 
@@ -150,6 +183,65 @@
             return true;
         }
 
+        public String toCQLLiteral(ByteBuffer buffer, int version)
+        {
+            if (buffer == null)
+                return "null";
+
+            StringBuilder target = new StringBuilder();
+            buffer = buffer.duplicate();
+            int size = CollectionSerializer.readCollectionSize(buffer, version);
+
+            switch (type.kind)
+            {
+                case LIST:
+                    CQL3Type elements = ((ListType) type).getElementsType().asCQL3Type();
+                    target.append('[');
+                    generateSetOrListCQLLiteral(buffer, version, target, size, elements);
+                    target.append(']');
+                    break;
+                case SET:
+                    elements = ((SetType) type).getElementsType().asCQL3Type();
+                    target.append('{');
+                    generateSetOrListCQLLiteral(buffer, version, target, size, elements);
+                    target.append('}');
+                    break;
+                case MAP:
+                    target.append('{');
+                    generateMapCQLLiteral(buffer, version, target, size);
+                    target.append('}');
+                    break;
+            }
+            return target.toString();
+        }
+
+        private void generateMapCQLLiteral(ByteBuffer buffer, int version, StringBuilder target, int size)
+        {
+            CQL3Type keys = ((MapType) type).getKeysType().asCQL3Type();
+            CQL3Type values = ((MapType) type).getValuesType().asCQL3Type();
+            for (int i = 0; i < size; i++)
+            {
+                if (i > 0)
+                    target.append(", ");
+                ByteBuffer element = CollectionSerializer.readValue(buffer, version);
+                target.append(keys.toCQLLiteral(element, version));
+                target.append(": ");
+                element = CollectionSerializer.readValue(buffer, version);
+                target.append(values.toCQLLiteral(element, version));
+            }
+        }
+
+        private static void generateSetOrListCQLLiteral(ByteBuffer buffer, int version, StringBuilder target, int size, CQL3Type elements)
+        {
+            for (int i = 0; i < size; i++)
+            {
+                if (i > 0)
+                    target.append(", ");
+                ByteBuffer element = CollectionSerializer.readValue(buffer, version);
+                target.append(elements.toCQLLiteral(element, version));
+            }
+        }
+
         @Override
         public final boolean equals(Object o)
         {
@@ -189,9 +281,9 @@
                 default:
                     throw new AssertionError();
             }
-            sb.append(">");
+            sb.append('>');
             if (isFrozen)
-                sb.append(">");
+                sb.append('>');
             return sb.toString();
         }
     }
@@ -223,6 +315,49 @@
             return type;
         }
 
+        public String toCQLLiteral(ByteBuffer buffer, int version)
+        {
+            if (buffer == null)
+                return "null";
+
+
+            StringBuilder target = new StringBuilder();
+            buffer = buffer.duplicate();
+            target.append('{');
+            for (int i = 0; i < type.size(); i++)
+            {
+                // we allow the input to have less fields than declared so as to support field addition.
+                if (!buffer.hasRemaining())
+                    break;
+
+                if (buffer.remaining() < 4)
+                    throw new MarshalException(String.format("Not enough bytes to read size of %dth field %s", i, type.fieldName(i)));
+
+                int size = buffer.getInt();
+
+                if (i > 0)
+                    target.append(", ");
+
+                target.append(ColumnIdentifier.maybeQuote(type.fieldNameAsString(i)));
+                target.append(": ");
+
+                // size < 0 means null value
+                if (size < 0)
+                {
+                    target.append("null");
+                    continue;
+                }
+
+                if (buffer.remaining() < size)
+                    throw new MarshalException(String.format("Not enough bytes to read %dth field %s", i, type.fieldName(i)));
+
+                ByteBuffer field = ByteBufferUtil.readBytes(buffer, size);
+                target.append(type.fieldType(i).asCQL3Type().toCQLLiteral(field, version));
+            }
+            target.append('}');
+            return target.toString();
+        }
+
         @Override
         public final boolean equals(Object o)
         {
@@ -242,7 +377,7 @@
         @Override
         public String toString()
         {
-            return name;
+            return "frozen<" + ColumnIdentifier.maybeQuote(name) + '>';
         }
     }
 
@@ -270,6 +405,48 @@
             return type;
         }
 
+        public String toCQLLiteral(ByteBuffer buffer, int version)
+        {
+            if (buffer == null)
+                return "null";
+
+            StringBuilder target = new StringBuilder();
+            buffer = buffer.duplicate();
+            target.append('(');
+            boolean first = true;
+            for (int i = 0; i < type.size(); i++)
+            {
+                // we allow the input to have less fields than declared so as to support field addition.
+                if (!buffer.hasRemaining())
+                    break;
+
+                if (buffer.remaining() < 4)
+                    throw new MarshalException(String.format("Not enough bytes to read size of %dth component", i));
+
+                int size = buffer.getInt();
+
+                if (first)
+                    first = false;
+                else
+                    target.append(", ");
+
+                // size < 0 means null value
+                if (size < 0)
+                {
+                    target.append("null");
+                    continue;
+                }
+
+                if (buffer.remaining() < size)
+                    throw new MarshalException(String.format("Not enough bytes to read %dth component", i));
+
+                ByteBuffer field = ByteBufferUtil.readBytes(buffer, size);
+                target.append(type.type(i).asCQL3Type().toCQLLiteral(field, version));
+            }
+            target.append(')');
+            return target.toString();
+        }
+
         @Override
         public final boolean equals(Object o)
         {
@@ -290,14 +467,14 @@
         public String toString()
         {
             StringBuilder sb = new StringBuilder();
-            sb.append("tuple<");
+            sb.append("frozen<tuple<");
             for (int i = 0; i < type.size(); i++)
             {
                 if (i > 0)
                     sb.append(", ");
                 sb.append(type.type(i).asCQL3Type());
             }
-            sb.append(">");
+            sb.append(">>");
             return sb.toString();
         }
     }
@@ -341,7 +518,25 @@
             throw new InvalidRequestException(message);
         }
 
-        public abstract CQL3Type prepare(String keyspace) throws InvalidRequestException;
+        public CQL3Type prepare(String keyspace)
+        {
+            KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspace);
+            if (ksm == null)
+                throw new ConfigurationException(String.format("Keyspace %s doesn't exist", keyspace));
+            return prepare(keyspace, ksm.types);
+        }
+
+        public abstract CQL3Type prepare(String keyspace, Types udts) throws InvalidRequestException;
+
+        public CQL3Type prepareInternal(String keyspace, Types udts) throws InvalidRequestException
+        {
+            return prepare(keyspace, udts);
+        }
+
+        public boolean referencesUserType(String name)
+        {
+            return false;
+        }
 
         public static Raw from(CQL3Type type)
         {
@@ -381,14 +576,14 @@
 
         private static class RawType extends Raw
         {
-            private CQL3Type type;
+            private final CQL3Type type;
 
             private RawType(CQL3Type type)
             {
                 this.type = type;
             }
 
-            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            public CQL3Type prepare(String keyspace, Types udts) throws InvalidRequestException
             {
                 return type;
             }
@@ -442,13 +637,26 @@
                 return true;
             }
 
-            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            public CQL3Type prepare(String keyspace, Types udts) throws InvalidRequestException
+            {
+                return prepare(keyspace, udts, false);
+            }
+
+            public CQL3Type prepareInternal(String keyspace, Types udts)
+            {
+                return prepare(keyspace, udts, true);
+            }
+
+            public CQL3Type prepare(String keyspace, Types udts, boolean isInternal) throws InvalidRequestException
             {
                 assert values != null : "Got null values type for a collection";
 
                 if (!frozen && values.supportsFreezing() && !values.frozen)
                     throw new InvalidRequestException("Non-frozen collections are not allowed inside collections: " + this);
-                if (values.isCounter())
+
+                // we represent Thrift supercolumns as maps, internally, and we do allow counters in supercolumns. Thus,
+                // for internal type parsing (think schema) we have to make an exception and allow counters as (map) values
+                if (values.isCounter() && !isInternal)
                     throw new InvalidRequestException("Counters are not allowed inside collections: " + this);
 
                 if (keys != null)
@@ -462,16 +670,21 @@
                 switch (kind)
                 {
                     case LIST:
-                        return new Collection(ListType.getInstance(values.prepare(keyspace).getType(), !frozen));
+                        return new Collection(ListType.getInstance(values.prepare(keyspace, udts).getType(), !frozen));
                     case SET:
-                        return new Collection(SetType.getInstance(values.prepare(keyspace).getType(), !frozen));
+                        return new Collection(SetType.getInstance(values.prepare(keyspace, udts).getType(), !frozen));
                     case MAP:
                         assert keys != null : "Got null keys type for a collection";
-                        return new Collection(MapType.getInstance(keys.prepare(keyspace).getType(), values.prepare(keyspace).getType(), !frozen));
+                        return new Collection(MapType.getInstance(keys.prepare(keyspace, udts).getType(), values.prepare(keyspace, udts).getType(), !frozen));
                 }
                 throw new AssertionError();
             }
 
+            public boolean referencesUserType(String name)
+            {
+                return (keys != null && keys.referencesUserType(name)) || values.referencesUserType(name);
+            }
+
             @Override
             public String toString()
             {
@@ -479,9 +692,9 @@
                 String end = frozen ? ">" : "";
                 switch (kind)
                 {
-                    case LIST: return start + "list<" + values + ">" + end;
-                    case SET:  return start + "set<" + values + ">" + end;
-                    case MAP:  return start + "map<" + keys + ", " + values + ">" + end;
+                    case LIST: return start + "list<" + values + '>' + end;
+                    case SET:  return start + "set<" + values + '>' + end;
+                    case MAP:  return start + "map<" + keys + ", " + values + '>' + end;
                 }
                 throw new AssertionError();
             }
@@ -511,13 +724,13 @@
                 return false;
             }
 
-            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            public CQL3Type prepare(String keyspace, Types udts) throws InvalidRequestException
             {
                 if (name.hasKeyspace())
                 {
                     // The provided keyspace is the one of the current statement this is part of. If it's different from the keyspace of
                     // the UTName, we reject since we want to limit user types to their own keyspace (see #6643)
-                    if (keyspace != null && !SystemKeyspace.NAME.equals(name.getKeyspace()) && !keyspace.equals(name.getKeyspace()))
+                    if (!keyspace.equals(name.getKeyspace()))
                         throw new InvalidRequestException(String.format("Statement on keyspace %s cannot refer to a user type in keyspace %s; "
                                                                         + "user types can only be used in the keyspace they are defined in",
                                                                         keyspace, name.getKeyspace()));
@@ -527,10 +740,7 @@
                     name.setKeyspace(keyspace);
                 }
 
-                KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
-                if (ksm == null)
-                    throw new InvalidRequestException("Unknown keyspace " + name.getKeyspace());
-                UserType type = ksm.userTypes.getType(name.getUserTypeName());
+                UserType type = udts.getNullable(name.getUserTypeName());
                 if (type == null)
                     throw new InvalidRequestException("Unknown type " + name);
 
@@ -540,6 +750,11 @@
                 return new UserDefined(name.toString(), type);
             }
 
+            public boolean referencesUserType(String name)
+            {
+                return this.name.getStringTypeName().equals(name);
+            }
+
             protected boolean supportsFreezing()
             {
                 return true;
@@ -574,14 +789,13 @@
             public void freeze() throws InvalidRequestException
             {
                 for (CQL3Type.Raw t : types)
-                {
                     if (t.supportsFreezing())
                         t.freeze();
-                }
+
                 frozen = true;
             }
 
-            public CQL3Type prepare(String keyspace) throws InvalidRequestException
+            public CQL3Type prepare(String keyspace, Types udts) throws InvalidRequestException
             {
                 if (!frozen)
                     freeze();
@@ -592,11 +806,16 @@
                     if (t.isCounter())
                         throw new InvalidRequestException("Counters are not allowed inside tuples");
 
-                    ts.add(t.prepare(keyspace).getType());
+                    ts.add(t.prepare(keyspace, udts).getType());
                 }
                 return new Tuple(new TupleType(ts));
             }
 
+            public boolean referencesUserType(String name)
+            {
+                return types.stream().anyMatch(t -> t.referencesUserType(name));
+            }
+
             @Override
             public String toString()
             {
@@ -608,7 +827,7 @@
                         sb.append(", ");
                     sb.append(types.get(i));
                 }
-                sb.append(">");
+                sb.append('>');
                 return sb.toString();
             }
         }

diff --git a/src/java/org/apache/cassandra/cql3/CQLFragmentParser.java b/src/java/org/apache/cassandra/cql3/CQLFragmentParser.java
new file mode 100644
index 0000000..d6f4732
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/CQLFragmentParser.java

@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import org.antlr.runtime.ANTLRStringStream;
+import org.antlr.runtime.CharStream;
+import org.antlr.runtime.CommonTokenStream;
+import org.antlr.runtime.RecognitionException;
+import org.antlr.runtime.TokenStream;
+import org.apache.cassandra.exceptions.SyntaxException;
+
+/**
+ * Helper class to encapsulate common code that calls one of the generated methods in {@code CqlParser}.
+ */
+public final class CQLFragmentParser
+{
+
+    @FunctionalInterface
+    public interface CQLParserFunction<R>
+    {
+        R parse(CqlParser parser) throws RecognitionException;
+    }
+
+    public static <R> R parseAny(CQLParserFunction<R> parserFunction, String input, String meaning)
+    {
+        try
+        {
+            return parseAnyUnhandled(parserFunction, input);
+        }
+        catch (RuntimeException re)
+        {
+            throw new SyntaxException(String.format("Failed parsing %s: [%s] reason: %s %s",
+                                                    meaning,
+                                                    input,
+                                                    re.getClass().getSimpleName(),
+                                                    re.getMessage()));
+        }
+        catch (RecognitionException e)
+        {
+            throw new SyntaxException("Invalid or malformed " + meaning + ": " + e.getMessage());
+        }
+    }
+
+    /**
+     * Just call a parser method in {@link CqlParser} - does not do any error handling.
+     */
+    public static <R> R parseAnyUnhandled(CQLParserFunction<R> parserFunction, String input) throws RecognitionException
+    {
+        // Lexer and parser
+        ErrorCollector errorCollector = new ErrorCollector(input);
+        CharStream stream = new ANTLRStringStream(input);
+        CqlLexer lexer = new CqlLexer(stream);
+        lexer.addErrorListener(errorCollector);
+
+        TokenStream tokenStream = new CommonTokenStream(lexer);
+        CqlParser parser = new CqlParser(tokenStream);
+        parser.addErrorListener(errorCollector);
+
+        // Parse the query string to a statement instance
+        R r = parserFunction.parse(parser);
+
+        // The errorCollector has queue up any errors that the lexer and parser may have encountered
+        // along the way, if necessary, we turn the last error into exceptions here.
+        errorCollector.throwFirstSyntaxError();
+
+        return r;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/ColumnCondition.java
index 3412e71..c3a3af7 100644
--- a/src/java/org/apache/cassandra/cql3/ColumnCondition.java
+++ b/src/java/org/apache/cassandra/cql3/ColumnCondition.java

@@ -20,19 +20,12 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import com.google.common.base.Predicate;
-import com.google.common.collect.Iterables;
 import com.google.common.collect.Iterators;
 
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.Term.Terminal;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.transport.Server;
@@ -45,7 +38,6 @@
  */
 public class ColumnCondition
 {
-
     public final ColumnDefinition column;
 
     // For collection, when testing the equality of a specific element, null otherwise.
@@ -68,6 +60,12 @@
             assert this.inValues == null;
     }
 
+    // Public for SuperColumn tables support only
+    public Term value()
+    {
+        return value;
+    }
+
     public static ColumnCondition condition(ColumnDefinition column, Term value, Operator op)
     {
         return new ColumnCondition(column, null, value, null, op);
@@ -98,18 +96,16 @@
         return new ColumnCondition(column, collectionElement, inMarker, null, Operator.IN);
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        Iterable<Function> iter = Collections.emptyList();
         if (collectionElement != null)
-           iter = Iterables.concat(iter, collectionElement.getFunctions());
+           collectionElement.addFunctionsTo(functions);
         if (value != null)
-            iter = Iterables.concat(iter, value.getFunctions());
+           value.addFunctionsTo(functions);
         if (inValues != null)
             for (Term value : inValues)
                 if (value != null)
-                    iter = Iterables.concat(iter, value.getFunctions());
-        return iter;
+                    value.addFunctionsTo(functions);
     }
 
     /**
@@ -161,20 +157,19 @@
         /**
          * Validates whether this condition applies to {@code current}.
          */
-        public abstract boolean appliesTo(Composite rowPrefix, ColumnFamily current, long now) throws InvalidRequestException;
+        public abstract boolean appliesTo(Row row) throws InvalidRequestException;
 
         public ByteBuffer getCollectionElementValue()
         {
             return null;
         }
 
-        protected boolean isSatisfiedByValue(ByteBuffer value, Cell c, AbstractType<?> type, Operator operator, long now) throws InvalidRequestException
+        protected boolean isSatisfiedByValue(ByteBuffer value, Cell c, AbstractType<?> type, Operator operator) throws InvalidRequestException
         {
-            ByteBuffer columnValue = (c == null || !c.isLive(now)) ? null : c.value();
-            return compareWithOperator(operator, type, value, columnValue);
+            return compareWithOperator(operator, type, value, c == null ? null : c.value());
         }
 
-        /** Returns true if the operator is satisfied (i.e. "value operator otherValue == true"), false otherwise. */
+        /** Returns true if the operator is satisfied (i.e. "otherValue operator value == true"), false otherwise. */
         protected boolean compareWithOperator(Operator operator, AbstractType<?> type, ByteBuffer value, ByteBuffer otherValue) throws InvalidRequestException
         {
             if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
@@ -196,41 +191,33 @@
                 // the condition value is not null, so only NEQ can return true
                 return operator == Operator.NEQ;
             }
-            int comparison = type.compare(otherValue, value);
-            switch (operator)
-            {
-                case EQ:
-                    return comparison == 0;
-                case LT:
-                    return comparison < 0;
-                case LTE:
-                    return comparison <= 0;
-                case GT:
-                    return comparison > 0;
-                case GTE:
-                    return comparison >= 0;
-                case NEQ:
-                    return comparison != 0;
-                default:
-                    // we shouldn't get IN, CONTAINS, or CONTAINS KEY here
-                    throw new AssertionError();
-            }
+            return operator.isSatisfiedBy(type, otherValue, value);
         }
+    }
 
-        protected Iterator<Cell> collectionColumns(CellName collection, ColumnFamily cf, final long now)
-        {
-            // We are testing for collection equality, so we need to have the expected values *and* only those.
-            ColumnSlice[] collectionSlice = new ColumnSlice[]{ collection.slice() };
-            // Filter live columns, this makes things simpler afterwards
-            return Iterators.filter(cf.iterator(collectionSlice), new Predicate<Cell>()
-            {
-                public boolean apply(Cell c)
-                {
-                    // we only care about live columns
-                    return c.isLive(now);
-                }
-            });
-        }
+    private static Cell getCell(Row row, ColumnDefinition column)
+    {
+        // If we're asking for a given cell, and we didn't got any row from our read, it's
+        // the same as not having said cell.
+        return row == null ? null : row.getCell(column);
+    }
+
+    private static Cell getCell(Row row, ColumnDefinition column, CellPath path)
+    {
+        // If we're asking for a given cell, and we didn't got any row from our read, it's
+        // the same as not having said cell.
+        return row == null ? null : row.getCell(column, path);
+    }
+
+    private static Iterator<Cell> getCells(Row row, ColumnDefinition column)
+    {
+        // If we're asking for a complex cells, and we didn't got any row from our read, it's
+        // the same as not having any cells for that column.
+        if (row == null)
+            return Collections.<Cell>emptyIterator();
+
+        ComplexColumnData complexData = row.getComplexColumnData(column);
+        return complexData == null ? Collections.<Cell>emptyIterator() : complexData.iterator();
     }
 
     /**
@@ -248,10 +235,9 @@
             this.value = condition.value.bindAndGet(options);
         }
 
-        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, long now) throws InvalidRequestException
+        public boolean appliesTo(Row row) throws InvalidRequestException
         {
-            CellName name = current.metadata().comparator.create(rowPrefix, column);
-            return isSatisfiedByValue(value, current.getColumn(name), column.type, operator, now);
+            return isSatisfiedByValue(value, getCell(row, column), column.type, operator);
         }
     }
 
@@ -291,12 +277,12 @@
             }
         }
 
-        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, long now) throws InvalidRequestException
+        public boolean appliesTo(Row row) throws InvalidRequestException
         {
-            CellName name = current.metadata().comparator.create(rowPrefix, column);
+            Cell c = getCell(row, column);
             for (ByteBuffer value : inValues)
             {
-                if (isSatisfiedByValue(value, current.getColumn(name), column.type, Operator.EQ, now))
+                if (isSatisfiedByValue(value, c, column.type, Operator.EQ))
                     return true;
             }
             return false;
@@ -318,7 +304,7 @@
             this.value = condition.value.bindAndGet(options);
         }
 
-        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        public boolean appliesTo(Row row) throws InvalidRequestException
         {
             if (collectionElement == null)
                 throw new InvalidRequestException("Invalid null value for " + (column.type instanceof MapType ? "map" : "list") + " element access");
@@ -328,14 +314,13 @@
                 MapType mapType = (MapType) column.type;
                 if (column.type.isMultiCell())
                 {
-                    Cell cell = current.getColumn(current.metadata().comparator.create(rowPrefix, column, collectionElement));
-                    return isSatisfiedByValue(value, cell, mapType.getValuesType(), operator, now);
+                    Cell cell = getCell(row, column, CellPath.create(collectionElement));
+                    return isSatisfiedByValue(value, cell, ((MapType) column.type).getValuesType(), operator);
                 }
                 else
                 {
-                    Cell cell = current.getColumn(current.metadata().comparator.create(rowPrefix, column));
-                    ByteBuffer mapElementValue = cell.isLive(now) ? mapType.getSerializer().getSerializedValue(cell.value(), collectionElement, mapType.getKeysType())
-                                                                  : null;
+                    Cell cell = getCell(row, column);
+                    ByteBuffer mapElementValue = mapType.getSerializer().getSerializedValue(cell.value(), collectionElement, mapType.getKeysType());
                     return compareWithOperator(operator, mapType.getValuesType(), value, mapElementValue);
                 }
             }
@@ -344,16 +329,13 @@
             ListType listType = (ListType) column.type;
             if (column.type.isMultiCell())
             {
-                ByteBuffer columnValue = getListItem(
-                        collectionColumns(current.metadata().comparator.create(rowPrefix, column), current, now),
-                        getListIndex(collectionElement));
-                return compareWithOperator(operator, listType.getElementsType(), value, columnValue);
+                ByteBuffer columnValue = getListItem(getCells(row, column), getListIndex(collectionElement));
+                return compareWithOperator(operator, ((ListType)column.type).getElementsType(), value, columnValue);
             }
             else
             {
-                Cell cell = current.getColumn(current.metadata().comparator.create(rowPrefix, column));
-                ByteBuffer listElementValue = cell.isLive(now) ? listType.getSerializer().getElement(cell.value(), getListIndex(collectionElement))
-                                                               : null;
+                Cell cell = getCell(row, column);
+                ByteBuffer listElementValue = listType.getSerializer().getElement(cell.value(), getListIndex(collectionElement));
                 return compareWithOperator(operator, listType.getElementsType(), value, listElementValue);
             }
         }
@@ -412,33 +394,31 @@
             }
         }
 
-        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        public boolean appliesTo(Row row) throws InvalidRequestException
         {
             if (collectionElement == null)
                 throw new InvalidRequestException("Invalid null value for " + (column.type instanceof MapType ? "map" : "list") + " element access");
 
-            CellNameType nameType = current.metadata().comparator;
             if (column.type instanceof MapType)
             {
                 MapType mapType = (MapType) column.type;
                 AbstractType<?> valueType = mapType.getValuesType();
                 if (column.type.isMultiCell())
                 {
-                    CellName name = nameType.create(rowPrefix, column, collectionElement);
-                    Cell item = current.getColumn(name);
+                    Cell item = getCell(row, column, CellPath.create(collectionElement));
                     for (ByteBuffer value : inValues)
                     {
-                        if (isSatisfiedByValue(value, item, valueType, Operator.EQ, now))
+                        if (isSatisfiedByValue(value, item, valueType, Operator.EQ))
                             return true;
                     }
                     return false;
                 }
                 else
                 {
-                    Cell cell = current.getColumn(nameType.create(rowPrefix, column));
-                    ByteBuffer mapElementValue  = null;
-                    if (cell != null && cell.isLive(now))
-                        mapElementValue =  mapType.getSerializer().getSerializedValue(cell.value(), collectionElement, mapType.getKeysType());
+                    Cell cell = getCell(row, column);
+                    ByteBuffer mapElementValue = cell == null
+                                               ? null
+                                               : mapType.getSerializer().getSerializedValue(cell.value(), collectionElement, mapType.getKeysType());
                     for (ByteBuffer value : inValues)
                     {
                         if (value == null)
@@ -458,9 +438,7 @@
             AbstractType<?> elementsType = listType.getElementsType();
             if (column.type.isMultiCell())
             {
-                ByteBuffer columnValue = ElementAccessBound.getListItem(
-                        collectionColumns(nameType.create(rowPrefix, column), current, now),
-                        ElementAccessBound.getListIndex(collectionElement));
+                ByteBuffer columnValue = ElementAccessBound.getListItem(getCells(row, column), ElementAccessBound.getListIndex(collectionElement));
 
                 for (ByteBuffer value : inValues)
                 {
@@ -470,10 +448,10 @@
             }
             else
             {
-                Cell cell = current.getColumn(nameType.create(rowPrefix, column));
-                ByteBuffer listElementValue = null;
-                if (cell != null && cell.isLive(now))
-                    listElementValue = listType.getSerializer().getElement(cell.value(), ElementAccessBound.getListIndex(collectionElement));
+                Cell cell = getCell(row, column);
+                ByteBuffer listElementValue = cell == null
+                                            ? null
+                                            : listType.getSerializer().getElement(cell.value(), ElementAccessBound.getListIndex(collectionElement));
 
                 for (ByteBuffer value : inValues)
                 {
@@ -504,13 +482,13 @@
             this.value = condition.value.bind(options);
         }
 
-        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        public boolean appliesTo(Row row) throws InvalidRequestException
         {
             CollectionType type = (CollectionType)column.type;
 
             if (type.isMultiCell())
             {
-                Iterator<Cell> iter = collectionColumns(current.metadata().comparator.create(rowPrefix, column), current, now);
+                Iterator<Cell> iter = getCells(row, column);
                 if (value == null)
                 {
                     if (operator == Operator.EQ)
@@ -525,16 +503,20 @@
             }
 
             // frozen collections
-            Cell cell = current.getColumn(current.metadata().comparator.create(rowPrefix, column));
+            Cell cell = getCell(row, column);
             if (value == null)
             {
                 if (operator == Operator.EQ)
-                    return cell == null || !cell.isLive(now);
+                    return cell == null;
                 else if (operator == Operator.NEQ)
-                    return cell != null && cell.isLive(now);
+                    return cell != null;
                 else
                     throw new InvalidRequestException(String.format("Invalid comparison with null for operator \"%s\"", operator));
             }
+            else if (cell == null) // cell is null but condition has a value
+            {
+                return false;
+            }
 
             // make sure we use v3 serialization format for comparison
             ByteBuffer conditionValue;
@@ -576,7 +558,7 @@
                     return (operator == Operator.GT) || (operator == Operator.GTE) || (operator == Operator.NEQ);
 
                 // for lists we use the cell value; for sets we use the cell name
-                ByteBuffer cellValue = isSet? iter.next().name().collectionElement() : iter.next().value();
+                ByteBuffer cellValue = isSet ? iter.next().path().get(0) : iter.next().value();
                 int comparison = type.compare(cellValue, conditionIter.next());
                 if (comparison != 0)
                     return evaluateComparisonWithOperator(comparison, operator);
@@ -634,7 +616,7 @@
                 Cell c = iter.next();
 
                 // compare the keys
-                int comparison = type.getKeysType().compare(c.name().collectionElement(), conditionEntry.getKey());
+                int comparison = type.getKeysType().compare(c.path().get(0), conditionEntry.getKey());
                 if (comparison != 0)
                     return evaluateComparisonWithOperator(comparison, operator);
 
@@ -729,29 +711,27 @@
             }
         }
 
-        public boolean appliesTo(Composite rowPrefix, ColumnFamily current, final long now) throws InvalidRequestException
+        public boolean appliesTo(Row row) throws InvalidRequestException
         {
             CollectionType type = (CollectionType)column.type;
-            CellName name = current.metadata().comparator.create(rowPrefix, column);
             if (type.isMultiCell())
             {
                 // copy iterator contents so that we can properly reuse them for each comparison with an IN value
-                List<Cell> cells = newArrayList(collectionColumns(name, current, now));
                 for (Term.Terminal value : inValues)
                 {
-                    if (CollectionBound.valueAppliesTo(type, cells.iterator(), value, Operator.EQ))
+                    if (CollectionBound.valueAppliesTo(type, getCells(row, column), value, Operator.EQ))
                         return true;
                 }
                 return false;
             }
             else
             {
-                Cell cell = current.getColumn(name);
+                Cell cell = getCell(row, column);
                 for (Term.Terminal value : inValues)
                 {
                     if (value == null)
                     {
-                        if (cell == null || !cell.isLive(now))
+                        if (cell == null)
                             return true;
                     }
                     else if (type.compare(value.get(Server.VERSION_3), cell.value()) == 0)

diff --git a/src/java/org/apache/cassandra/cql3/ColumnConditions.java b/src/java/org/apache/cassandra/cql3/ColumnConditions.java
new file mode 100644
index 0000000..5ec4cb4
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/ColumnConditions.java

@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.functions.Function;
+import org.apache.cassandra.cql3.statements.CQL3CasRequest;
+import org.apache.cassandra.db.Clustering;
+
+/**
+ * A set of <code>ColumnCondition</code>s.
+ *
+ */
+public final class ColumnConditions extends AbstractConditions
+{
+    /**
+     * The conditions on regular columns.
+     */
+    private final List<ColumnCondition> columnConditions;
+
+    /**
+     * The conditions on static columns
+     */
+    private final List<ColumnCondition> staticConditions;
+
+    /**
+     * Creates a new <code>ColumnConditions</code> instance for the specified builder.
+     */
+    private ColumnConditions(Builder builder)
+    {
+        this.columnConditions = builder.columnConditions;
+        this.staticConditions = builder.staticConditions;
+    }
+
+    @Override
+    public boolean appliesToStaticColumns()
+    {
+        return !staticConditions.isEmpty();
+    }
+
+    @Override
+    public boolean appliesToRegularColumns()
+    {
+        return !columnConditions.isEmpty();
+    }
+
+    @Override
+    public Collection<ColumnDefinition> getColumns()
+    {
+        return Stream.concat(columnConditions.stream(), staticConditions.stream())
+                     .map(e -> e.column)
+                     .collect(Collectors.toList());
+    }
+
+    @Override
+    public boolean isEmpty()
+    {
+        return columnConditions.isEmpty() && staticConditions.isEmpty();
+    }
+
+    /**
+     * Adds the conditions to the specified CAS request.
+     *
+     * @param request the request
+     * @param clustering the clustering prefix
+     * @param options the query options
+     */
+    public void addConditionsTo(CQL3CasRequest request,
+                                Clustering clustering,
+                                QueryOptions options)
+    {
+        if (!columnConditions.isEmpty())
+            request.addConditions(clustering, columnConditions, options);
+        if (!staticConditions.isEmpty())
+            request.addConditions(Clustering.STATIC_CLUSTERING, staticConditions, options);
+    }
+
+    @Override
+    public void addFunctionsTo(List<Function> functions)
+    {
+        columnConditions.forEach(p -> p.addFunctionsTo(functions));
+        staticConditions.forEach(p -> p.addFunctionsTo(functions));
+    }
+
+    // Public for SuperColumn tables support only
+    public Collection<ColumnCondition> columnConditions()
+    {
+        return this.columnConditions;
+    }
+
+    /**
+     * Creates a new <code>Builder</code> for <code>ColumnConditions</code>.
+     * @return a new <code>Builder</code> for <code>ColumnConditions</code>
+     */
+    public static Builder newBuilder()
+    {
+        return new Builder();
+    }
+
+    /**
+     * A <code>Builder</code> for <code>ColumnConditions</code>.
+     *
+     */
+    public static final class Builder
+    {
+        /**
+         * The conditions on regular columns.
+         */
+        private List<ColumnCondition> columnConditions = Collections.emptyList();
+
+        /**
+         * The conditions on static columns
+         */
+        private List<ColumnCondition> staticConditions = Collections.emptyList();
+
+        /**
+         * Adds the specified <code>ColumnCondition</code> to this set of conditions.
+         * @param condition the condition to add
+         */
+        public Builder add(ColumnCondition condition)
+        {
+            List<ColumnCondition> conds = null;
+            if (condition.column.isStatic())
+            {
+                if (staticConditions.isEmpty())
+                    staticConditions = new ArrayList<>();
+                conds = staticConditions;
+            }
+            else
+            {
+                if (columnConditions.isEmpty())
+                    columnConditions = new ArrayList<>();
+                conds = columnConditions;
+            }
+            conds.add(condition);
+            return this;
+        }
+
+        public ColumnConditions build()
+        {
+            return new ColumnConditions(this);
+        }
+
+        private Builder()
+        {
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java b/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java
index 823af94..5d4e992 100644
--- a/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java
+++ b/src/java/org/apache/cassandra/cql3/ColumnIdentifier.java

@@ -20,6 +20,11 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 import java.util.Locale;
+import java.util.concurrent.ConcurrentMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.google.common.collect.MapMaker;
 
 import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.config.CFMetaData;
@@ -28,9 +33,8 @@
 import org.apache.cassandra.cql3.selection.Selector;
 import org.apache.cassandra.cql3.selection.SimpleSelector;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.memory.AbstractAllocator;
@@ -39,29 +43,129 @@
  * Represents an identifer for a CQL column definition.
  * TODO : should support light-weight mode without text representation for when not interned
  */
-public class ColumnIdentifier extends org.apache.cassandra.cql3.selection.Selectable implements IMeasurableMemory
+public class ColumnIdentifier extends Selectable implements IMeasurableMemory, Comparable<ColumnIdentifier>
 {
+    private static final Pattern PATTERN_DOUBLE_QUOTE = Pattern.compile("\"", Pattern.LITERAL);
+
     public final ByteBuffer bytes;
     private final String text;
+    /**
+     * since these objects are compared frequently, we stash an efficiently compared prefix of the bytes, in the expectation
+     * that the majority of comparisons can be answered by this value only
+     */
+    public final long prefixComparison;
+    private final boolean interned;
 
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new ColumnIdentifier("", true));
+    private static final Pattern UNQUOTED_IDENTIFIER = Pattern.compile("[a-z][a-z0-9_]*");
+
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new ColumnIdentifier(ByteBufferUtil.EMPTY_BYTE_BUFFER, "", false));
+
+    private static final ConcurrentMap<InternedKey, ColumnIdentifier> internedInstances = new MapMaker().weakValues().makeMap();
+
+    private static final class InternedKey
+    {
+        private final AbstractType<?> type;
+        private final ByteBuffer bytes;
+
+        InternedKey(AbstractType<?> type, ByteBuffer bytes)
+        {
+            this.type = type;
+            this.bytes = bytes;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o)
+                return true;
+
+            if (o == null || getClass() != o.getClass())
+                return false;
+
+            InternedKey that = (InternedKey) o;
+            return bytes.equals(that.bytes) && type.equals(that.type);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return bytes.hashCode() + 31 * type.hashCode();
+        }
+    }
+
+    private static long prefixComparison(ByteBuffer bytes)
+    {
+        long prefix = 0;
+        ByteBuffer read = bytes.duplicate();
+        int i = 0;
+        while (read.hasRemaining() && i < 8)
+        {
+            prefix <<= 8;
+            prefix |= read.get() & 0xFF;
+            i++;
+        }
+        prefix <<= (8 - i) * 8;
+        // by flipping the top bit (==Integer.MIN_VALUE), we ensure that signed comparison gives the same result
+        // as an unsigned without the bit flipped
+        prefix ^= Long.MIN_VALUE;
+        return prefix;
+    }
 
     public ColumnIdentifier(String rawText, boolean keepCase)
     {
         this.text = keepCase ? rawText : rawText.toLowerCase(Locale.US);
         this.bytes = ByteBufferUtil.bytes(this.text);
+        this.prefixComparison = prefixComparison(bytes);
+        this.interned = false;
     }
 
     public ColumnIdentifier(ByteBuffer bytes, AbstractType<?> type)
     {
-        this.bytes = bytes;
-        this.text = type.getString(bytes);
+        this(bytes, type.getString(bytes), false);
     }
 
     public ColumnIdentifier(ByteBuffer bytes, String text)
     {
+        this(bytes, text, false);
+    }
+
+    private ColumnIdentifier(ByteBuffer bytes, String text, boolean interned)
+    {
         this.bytes = bytes;
         this.text = text;
+        this.interned = interned;
+        this.prefixComparison = prefixComparison(bytes);
+    }
+
+    public static ColumnIdentifier getInterned(ByteBuffer bytes, AbstractType<?> type)
+    {
+        return getInterned(type, bytes, type.getString(bytes));
+    }
+
+    public static ColumnIdentifier getInterned(String rawText, boolean keepCase)
+    {
+        String text = keepCase ? rawText : rawText.toLowerCase(Locale.US);
+        ByteBuffer bytes = ByteBufferUtil.bytes(text);
+        return getInterned(UTF8Type.instance, bytes, text);
+    }
+
+    public static ColumnIdentifier getInterned(AbstractType<?> type, ByteBuffer bytes, String text)
+    {
+        bytes = ByteBufferUtil.minimalBufferFor(bytes);
+
+        InternedKey key = new InternedKey(type, bytes);
+        ColumnIdentifier id = internedInstances.get(key);
+        if (id != null)
+            return id;
+
+        ColumnIdentifier created = new ColumnIdentifier(bytes, text, true);
+        ColumnIdentifier previous = internedInstances.putIfAbsent(key, created);
+        return previous == null ? created : previous;
+    }
+
+    public boolean isInterned()
+    {
+        return interned;
     }
 
     @Override
@@ -73,8 +177,6 @@
     @Override
     public final boolean equals(Object o)
     {
-        // Note: it's worth checking for reference equality since we intern those
-        // in SparseCellNameType
         if (this == o)
             return true;
 
@@ -90,6 +192,15 @@
         return text;
     }
 
+    /**
+     * Returns a string representation of the identifier that is safe to use directly in CQL queries.
+     * In necessary, the string will be double-quoted, and any quotes inside the string will be escaped.
+     */
+    public String toCQLString()
+    {
+        return maybeQuote(text);
+    }
+
     public long unsharedHeapSize()
     {
         return EMPTY_SIZE
@@ -106,30 +217,52 @@
 
     public ColumnIdentifier clone(AbstractAllocator allocator)
     {
-        return new ColumnIdentifier(allocator.clone(bytes), text);
+        return interned ? this : new ColumnIdentifier(allocator.clone(bytes), text, false);
     }
 
     public Selector.Factory newSelectorFactory(CFMetaData cfm, List<ColumnDefinition> defs) throws InvalidRequestException
     {
-        ColumnDefinition def = cfm.getColumnDefinition(this);
+        ColumnDefinition def = cfm.getColumnDefinitionForCQL(this);
         if (def == null)
             throw new InvalidRequestException(String.format("Undefined name %s in selection clause", this));
 
         return SimpleSelector.newFactory(def, addAndGetIndex(def, defs));
     }
 
+    public int compareTo(ColumnIdentifier that)
+    {
+        int c = Long.compare(this.prefixComparison, that.prefixComparison);
+        if (c != 0)
+            return c;
+        if (this == that)
+            return 0;
+        return ByteBufferUtil.compareUnsigned(this.bytes, that.bytes);
+    }
+
     /**
      * Because Thrift-created tables may have a non-text comparator, we cannot determine the proper 'key' until
      * we know the comparator. ColumnIdentifier.Raw is a placeholder that can be converted to a real ColumnIdentifier
      * once the comparator is known with prepare(). This should only be used with identifiers that are actual
      * column names. See CASSANDRA-8178 for more background.
      */
-    public static class Raw implements Selectable.Raw
+    public static interface Raw extends Selectable.Raw
+    {
+
+        public ColumnIdentifier prepare(CFMetaData cfm);
+
+        /**
+         * Returns a string representation of the identifier that is safe to use directly in CQL queries.
+         * In necessary, the string will be double-quoted, and any quotes inside the string will be escaped.
+         */
+        public String toCQLString();
+    }
+
+    public static class Literal implements Raw
     {
         private final String rawText;
         private final String text;
 
-        public Raw(String rawText, boolean keepCase)
+        public Literal(String rawText, boolean keepCase)
         {
             this.rawText = rawText;
             this.text =  keepCase ? rawText : rawText.toLowerCase(Locale.US);
@@ -137,20 +270,22 @@
 
         public ColumnIdentifier prepare(CFMetaData cfm)
         {
-            AbstractType<?> comparator = cfm.comparator.asAbstractType();
-            if (cfm.getIsDense() || comparator instanceof CompositeType || comparator instanceof UTF8Type)
-                return new ColumnIdentifier(text, true);
+            if (!cfm.isStaticCompactTable())
+                return getInterned(text, true);
 
-            // We have a Thrift-created table with a non-text comparator.  We need to parse column names with the comparator
-            // to get the correct ByteBuffer representation.  However, this doesn't apply to key aliases, so we need to
-            // make a special check for those and treat them normally.  See CASSANDRA-8178.
+            AbstractType<?> thriftColumnNameType = cfm.thriftColumnNameType();
+            if (thriftColumnNameType instanceof UTF8Type)
+                return getInterned(text, true);
+
+            // We have a Thrift-created table with a non-text comparator. Check if we have a match column, otherwise assume we should use
+            // thriftColumnNameType
             ByteBuffer bufferName = ByteBufferUtil.bytes(text);
-            for (ColumnDefinition def : cfm.partitionKeyColumns())
+            for (ColumnDefinition def : cfm.allColumns())
             {
                 if (def.name.bytes.equals(bufferName))
-                    return new ColumnIdentifier(text, true);
+                    return def.name;
             }
-            return new ColumnIdentifier(comparator.fromString(rawText), text);
+            return getInterned(thriftColumnNameType, thriftColumnNameType.fromString(rawText), text);
         }
 
         public boolean processesSelection()
@@ -167,9 +302,10 @@
         @Override
         public final boolean equals(Object o)
         {
-            if(!(o instanceof ColumnIdentifier.Raw))
+            if(!(o instanceof Literal))
                 return false;
-            ColumnIdentifier.Raw that = (ColumnIdentifier.Raw)o;
+
+            Literal that = (Literal) o;
             return text.equals(that.text);
         }
 
@@ -178,5 +314,64 @@
         {
             return text;
         }
+
+        public String toCQLString()
+        {
+            return maybeQuote(text);
+        }
+    }
+
+    public static class ColumnIdentifierValue implements Raw
+    {
+        private final ColumnIdentifier identifier;
+
+        public ColumnIdentifierValue(ColumnIdentifier identifier)
+        {
+            this.identifier = identifier;
+        }
+
+        public ColumnIdentifier prepare(CFMetaData cfm)
+        {
+            return identifier;
+        }
+
+        public boolean processesSelection()
+        {
+            return false;
+        }
+
+        @Override
+        public final int hashCode()
+        {
+            return identifier.hashCode();
+        }
+
+        @Override
+        public final boolean equals(Object o)
+        {
+            if(!(o instanceof ColumnIdentifierValue))
+                return false;
+            ColumnIdentifierValue that = (ColumnIdentifierValue) o;
+            return identifier.equals(that.identifier);
+        }
+
+        @Override
+        public String toString()
+        {
+            return identifier.toString();
+        }
+
+        public String toCQLString()
+        {
+            return maybeQuote(identifier.text);
+        }
+    }
+
+    public static String maybeQuote(String text)
+    {
+        if (UNQUOTED_IDENTIFIER.matcher(text).matches() && !ReservedKeywords.isReserved(text))
+            return text;
+
+        return '"' + PATTERN_DOUBLE_QUOTE.matcher(text).replaceAll(Matcher.quoteReplacement("\"\"")) + '"';
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/Conditions.java b/src/java/org/apache/cassandra/cql3/Conditions.java
new file mode 100644
index 0000000..16fa4aa
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/Conditions.java

@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.List;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.functions.Function;
+import org.apache.cassandra.cql3.statements.CQL3CasRequest;
+import org.apache.cassandra.db.Clustering;
+
+/**
+ * Conditions that can be applied to a mutation statement.
+ *
+ */
+public interface Conditions
+{
+    /**
+     * An EMPTY condition
+     */
+    static final Conditions EMPTY_CONDITION = ColumnConditions.newBuilder().build();
+
+    /**
+     * IF EXISTS condition
+     */
+    static final Conditions IF_EXISTS_CONDITION = new IfExistsCondition();
+
+    /**
+     * IF NOT EXISTS condition
+     */
+    static final Conditions IF_NOT_EXISTS_CONDITION = new IfNotExistsCondition();
+
+    /**
+     * Adds the functions used by the conditions to the specified list.
+     * @param functions the list to add to
+     */
+    void addFunctionsTo(List<Function> functions);
+
+    /**
+     * Returns the column definitions to which apply the conditions.
+     * @return the column definitions to which apply the conditions.
+     */
+    Iterable<ColumnDefinition> getColumns();
+
+    /**
+     * Checks if this <code>Conditions</code> is empty.
+     * @return <code>true</code> if this <code>Conditions</code> is empty, <code>false</code> otherwise.
+     */
+    boolean isEmpty();
+
+    /**
+     * Checks if this is a IF EXIST condition.
+     * @return <code>true</code> if this is a IF EXIST condition, <code>false</code> otherwise.
+     */
+    boolean isIfExists();
+
+    /**
+     * Checks if this is a IF NOT EXIST condition.
+     * @return <code>true</code> if this is a IF NOT EXIST condition, <code>false</code> otherwise.
+     */
+    boolean isIfNotExists();
+
+    /**
+     * Checks if some of the conditions apply to static columns.
+     *
+     * @return <code>true</code> if some of the conditions apply to static columns, <code>false</code> otherwise.
+     */
+    boolean appliesToStaticColumns();
+
+    /**
+     * Checks if some of the conditions apply to regular columns.
+     *
+     * @return <code>true</code> if some of the conditions apply to regular columns, <code>false</code> otherwise.
+     */
+    boolean appliesToRegularColumns();
+
+    /**
+     * Adds the conditions to the specified CAS request.
+     *
+     * @param request the request
+     * @param clustering the clustering prefix
+     * @param options the query options
+     */
+    public void addConditionsTo(CQL3CasRequest request,
+                                Clustering clustering,
+                                QueryOptions options);
+}

diff --git a/src/java/org/apache/cassandra/cql3/Constants.java b/src/java/org/apache/cassandra/cql3/Constants.java
index 07b848c..f37d900 100644
--- a/src/java/org/apache/cassandra/cql3/Constants.java
+++ b/src/java/org/apache/cassandra/cql3/Constants.java

@@ -23,8 +23,6 @@
 import org.slf4j.LoggerFactory;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.CounterColumnType;
@@ -48,7 +46,7 @@
 
     public static final Value UNSET_VALUE = new Value(ByteBufferUtil.UNSET_BYTE_BUFFER);
 
-    public static final Term.Raw NULL_LITERAL = new Term.Raw()
+    private static class NullLiteral extends Term.Raw
     {
         public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
@@ -65,12 +63,13 @@
                  : AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE;
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
-            return "null";
+            return "NULL";
         }
-    };
+    }
+
+    public static final NullLiteral NULL_LITERAL = new NullLiteral();
 
     public static final Term.Terminal NULL_VALUE = new Value(null)
     {
@@ -88,7 +87,7 @@
         }
     };
 
-    public static class Literal implements Term.Raw
+    public static class Literal extends Term.Raw
     {
         private final Type type;
         private final String text;
@@ -144,9 +143,12 @@
                 validator = ((ReversedType<?>) validator).baseType;
             try
             {
-                // BytesType doesn't want it's input prefixed by '0x'.
-                if (type == Type.HEX && validator instanceof BytesType)
-                    return validator.fromString(text.substring(2));
+                if (type == Type.HEX)
+                    // Note that validator could be BytesType, but it could also be a custom type, so
+                    // we hardcode BytesType (rather than using 'validator') in the call below.
+                    // Further note that BytesType doesn't want it's input prefixed by '0x', hence the substring.
+                    return BytesType.instance.fromString(text.substring(2));
+
                 if (validator instanceof CounterColumnType)
                     return LongType.instance.fromString(text);
                 return validator.fromString(text);
@@ -157,11 +159,6 @@
             }
         }
 
-        public String getRawText()
-        {
-            return text;
-        }
-
         public AssignmentTestable.TestResult testAssignment(String keyspace, ColumnSpecification receiver)
         {
             CQL3Type receiverType = receiver.type.asCQL3Type();
@@ -199,6 +196,7 @@
                         case FLOAT:
                         case INT:
                         case SMALLINT:
+                        case TIME:
                         case TIMESTAMP:
                         case TINYINT:
                         case VARINT:
@@ -240,8 +238,12 @@
             return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
         }
 
-        @Override
-        public String toString()
+        public String getRawText()
+        {
+            return text;
+        }
+
+        public String getText()
         {
             return type == Type.STRING ? String.format("'%s'", text) : text;
         }
@@ -279,7 +281,8 @@
 
     public static class Marker extends AbstractMarker
     {
-        protected Marker(int bindIndex, ColumnSpecification receiver)
+        // Constructor is public only for the SuperColumn tables support
+        public Marker(int bindIndex, ColumnSpecification receiver)
         {
             super(bindIndex, receiver);
             assert !receiver.type.isCollection();
@@ -319,14 +322,13 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             ByteBuffer value = t.bindAndGet(params.options);
-            if (value != ByteBufferUtil.UNSET_BYTE_BUFFER) // use reference equality and not object equality
-            {
-                CellName cname = cf.getComparator().create(prefix, column);
-                cf.addColumn(value == null ? params.makeTombstone(cname) : params.makeColumn(cname, value));
-            }
+            if (value == null)
+                params.addTombstone(column);
+            else if (value != ByteBufferUtil.UNSET_BYTE_BUFFER) // use reference equality and not object equality
+                params.addCell(column, value);
         }
     }
 
@@ -337,7 +339,7 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             ByteBuffer bytes = t.bindAndGet(params.options);
             if (bytes == null)
@@ -346,8 +348,7 @@
                 return;
 
             long increment = ByteBufferUtil.toLong(bytes);
-            CellName cname = cf.getComparator().create(prefix, column);
-            cf.addColumn(params.makeCounter(cname, increment));
+            params.addCounter(column, increment);
         }
     }
 
@@ -358,7 +359,7 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             ByteBuffer bytes = t.bindAndGet(params.options);
             if (bytes == null)
@@ -370,8 +371,7 @@
             if (increment == Long.MIN_VALUE)
                 throw new InvalidRequestException("The negation of " + increment + " overflows supported counter precision (signed 8 bytes integer)");
 
-            CellName cname = cf.getComparator().create(prefix, column);
-            cf.addColumn(params.makeCounter(cname, -increment));
+            params.addCounter(column, -increment);
         }
     }
 
@@ -384,13 +384,12 @@
             super(column, null);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
-            CellName cname = cf.getComparator().create(prefix, column);
             if (column.type.isMultiCell())
-                cf.addAtom(params.makeRangeTombstone(cname.slice()));
+                params.setComplexDeletionTime(column);
             else
-                cf.addColumn(params.makeTombstone(cname));
+                params.addTombstone(column);
         }
-    };
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/Cql.g b/src/java/org/apache/cassandra/cql3/Cql.g
index 035e704..0234327 100644
--- a/src/java/org/apache/cassandra/cql3/Cql.g
+++ b/src/java/org/apache/cassandra/cql3/Cql.g

@@ -39,6 +39,7 @@
 
     import org.apache.cassandra.auth.*;
     import org.apache.cassandra.cql3.*;
+    import org.apache.cassandra.cql3.restrictions.CustomIndexExpression;
     import org.apache.cassandra.cql3.statements.*;
     import org.apache.cassandra.cql3.selection.*;
     import org.apache.cassandra.cql3.functions.*;
@@ -233,43 +234,46 @@
 
 cqlStatement returns [ParsedStatement stmt]
     @after{ if (stmt != null) stmt.setBoundVariables(bindVariables); }
-    : st1= selectStatement             { $stmt = st1; }
-    | st2= insertStatement             { $stmt = st2; }
-    | st3= updateStatement             { $stmt = st3; }
-    | st4= batchStatement              { $stmt = st4; }
-    | st5= deleteStatement             { $stmt = st5; }
-    | st6= useStatement                { $stmt = st6; }
-    | st7= truncateStatement           { $stmt = st7; }
-    | st8= createKeyspaceStatement     { $stmt = st8; }
-    | st9= createTableStatement        { $stmt = st9; }
-    | st10=createIndexStatement        { $stmt = st10; }
-    | st11=dropKeyspaceStatement       { $stmt = st11; }
-    | st12=dropTableStatement          { $stmt = st12; }
-    | st13=dropIndexStatement          { $stmt = st13; }
-    | st14=alterTableStatement         { $stmt = st14; }
-    | st15=alterKeyspaceStatement      { $stmt = st15; }
-    | st16=grantPermissionsStatement   { $stmt = st16; }
-    | st17=revokePermissionsStatement  { $stmt = st17; }
-    | st18=listPermissionsStatement    { $stmt = st18; }
-    | st19=createUserStatement         { $stmt = st19; }
-    | st20=alterUserStatement          { $stmt = st20; }
-    | st21=dropUserStatement           { $stmt = st21; }
-    | st22=listUsersStatement          { $stmt = st22; }
-    | st23=createTriggerStatement      { $stmt = st23; }
-    | st24=dropTriggerStatement        { $stmt = st24; }
-    | st25=createTypeStatement         { $stmt = st25; }
-    | st26=alterTypeStatement          { $stmt = st26; }
-    | st27=dropTypeStatement           { $stmt = st27; }
-    | st28=createFunctionStatement     { $stmt = st28; }
-    | st29=dropFunctionStatement       { $stmt = st29; }
-    | st30=createAggregateStatement    { $stmt = st30; }
-    | st31=dropAggregateStatement      { $stmt = st31; }
-    | st32=createRoleStatement         { $stmt = st32; }
-    | st33=alterRoleStatement          { $stmt = st33; }
-    | st34=dropRoleStatement           { $stmt = st34; }
-    | st35=listRolesStatement          { $stmt = st35; }
-    | st36=grantRoleStatement          { $stmt = st36; }
-    | st37=revokeRoleStatement         { $stmt = st37; }
+    : st1= selectStatement                 { $stmt = st1; }
+    | st2= insertStatement                 { $stmt = st2; }
+    | st3= updateStatement                 { $stmt = st3; }
+    | st4= batchStatement                  { $stmt = st4; }
+    | st5= deleteStatement                 { $stmt = st5; }
+    | st6= useStatement                    { $stmt = st6; }
+    | st7= truncateStatement               { $stmt = st7; }
+    | st8= createKeyspaceStatement         { $stmt = st8; }
+    | st9= createTableStatement            { $stmt = st9; }
+    | st10=createIndexStatement            { $stmt = st10; }
+    | st11=dropKeyspaceStatement           { $stmt = st11; }
+    | st12=dropTableStatement              { $stmt = st12; }
+    | st13=dropIndexStatement              { $stmt = st13; }
+    | st14=alterTableStatement             { $stmt = st14; }
+    | st15=alterKeyspaceStatement          { $stmt = st15; }
+    | st16=grantPermissionsStatement       { $stmt = st16; }
+    | st17=revokePermissionsStatement      { $stmt = st17; }
+    | st18=listPermissionsStatement        { $stmt = st18; }
+    | st19=createUserStatement             { $stmt = st19; }
+    | st20=alterUserStatement              { $stmt = st20; }
+    | st21=dropUserStatement               { $stmt = st21; }
+    | st22=listUsersStatement              { $stmt = st22; }
+    | st23=createTriggerStatement          { $stmt = st23; }
+    | st24=dropTriggerStatement            { $stmt = st24; }
+    | st25=createTypeStatement             { $stmt = st25; }
+    | st26=alterTypeStatement              { $stmt = st26; }
+    | st27=dropTypeStatement               { $stmt = st27; }
+    | st28=createFunctionStatement         { $stmt = st28; }
+    | st29=dropFunctionStatement           { $stmt = st29; }
+    | st30=createAggregateStatement        { $stmt = st30; }
+    | st31=dropAggregateStatement          { $stmt = st31; }
+    | st32=createRoleStatement             { $stmt = st32; }
+    | st33=alterRoleStatement              { $stmt = st33; }
+    | st34=dropRoleStatement               { $stmt = st34; }
+    | st35=listRolesStatement              { $stmt = st35; }
+    | st36=grantRoleStatement              { $stmt = st36; }
+    | st37=revokeRoleStatement             { $stmt = st37; }
+    | st38=createMaterializedViewStatement { $stmt = st38; }
+    | st39=dropMaterializedViewStatement   { $stmt = st39; }
+    | st40=alterMaterializedViewStatement  { $stmt = st40; }
     ;
 
 /*
@@ -306,7 +310,8 @@
                                                                              isDistinct,
                                                                              allowFiltering,
                                                                              isJson);
-          $expr = new SelectStatement.RawStatement(cf, params, sclause, wclause, limit);
+          WhereClause where = wclause == null ? WhereClause.empty() : wclause.build();
+          $expr = new SelectStatement.RawStatement(cf, params, sclause, where, limit);
       }
     ;
 
@@ -317,7 +322,7 @@
 
 selector returns [RawSelector s]
     @init{ ColumnIdentifier alias = null; }
-    : us=unaliasedSelector (K_AS c=ident { alias = c; })? { $s = new RawSelector(us, alias); }
+    : us=unaliasedSelector (K_AS c=noncol_ident { alias = c; })? { $s = new RawSelector(us, alias); }
     ;
 
 unaliasedSelector returns [Selectable.Raw s]
@@ -342,9 +347,19 @@
     | i=INTEGER { if (!i.getText().equals("1")) addRecognitionError("Only COUNT(1) is supported, got COUNT(" + i.getText() + ")");}
     ;
 
-whereClause returns [List<Relation> clause]
-    @init{ $clause = new ArrayList<Relation>(); }
-    : relation[$clause] (K_AND relation[$clause])*
+whereClause returns [WhereClause.Builder clause]
+    @init{ $clause = new WhereClause.Builder(); }
+    : relationOrExpression[$clause] (K_AND relationOrExpression[$clause])*
+    ;
+
+relationOrExpression [WhereClause.Builder clause]
+    : relation[$clause]
+    | customIndexExpression[$clause]
+    ;
+
+customIndexExpression [WhereClause.Builder clause]
+    @init{IndexName name = new IndexName();}
+    : 'expr(' idxName[name] ',' t=term ')' { clause.add(new CustomIndexExpression(name, t));}
     ;
 
 orderByClause[Map<ColumnIdentifier.Raw, Boolean> orderings]
@@ -399,7 +414,7 @@
 jsonValue returns [Json.Raw value]
     :
     | s=STRING_LITERAL { $value = new Json.Literal($s.text); }
-    | ':' id=ident     { $value = newJsonBindVariables(id); }
+    | ':' id=noncol_ident     { $value = newJsonBindVariables(id); }
     | QMARK            { $value = newJsonBindVariables(null); }
     ;
 
@@ -434,7 +449,7 @@
           return new UpdateStatement.ParsedUpdate(cf,
                                                   attrs,
                                                   operations,
-                                                  wclause,
+                                                  wclause.build(),
                                                   conditions == null ? Collections.<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>>emptyList() : conditions,
                                                   ifExists);
      }
@@ -468,7 +483,7 @@
           return new DeleteStatement.Parsed(cf,
                                             attrs,
                                             columnDeletions,
-                                            wclause,
+                                            wclause.build(),
                                             conditions == null ? Collections.<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>>emptyList() : conditions,
                                             ifExists);
       }
@@ -599,8 +614,8 @@
       fn=functionName
       '('
         (
-          k=ident v=comparatorType { argsNames.add(k); argsTypes.add(v); }
-          ( ',' k=ident v=comparatorType { argsNames.add(k); argsTypes.add(v); } )*
+          k=noncol_ident v=comparatorType { argsNames.add(k); argsTypes.add(v); }
+          ( ',' k=noncol_ident v=comparatorType { argsNames.add(k); argsTypes.add(v); } )*
         )?
       ')'
       ( (K_RETURNS K_NULL) | (K_CALLED { calledOnNullInput=true; })) K_ON K_NULL K_INPUT
@@ -637,7 +652,7 @@
  */
 createKeyspaceStatement returns [CreateKeyspaceStatement expr]
     @init {
-        KSPropDefs attrs = new KSPropDefs();
+        KeyspaceAttributes attrs = new KeyspaceAttributes();
         boolean ifNotExists = false;
     }
     : K_CREATE K_KEYSPACE (K_IF K_NOT K_EXISTS { ifNotExists = true; } )? ks=keyspaceName
@@ -660,7 +675,7 @@
 
 cfamDefinition[CreateTableStatement.RawStatement expr]
     : '(' cfamColumns[expr] ( ',' cfamColumns[expr]? )* ')'
-      ( K_WITH cfamProperty[expr] ( K_AND cfamProperty[expr] )*)?
+      ( K_WITH cfamProperty[expr.properties] ( K_AND cfamProperty[expr.properties] )*)?
     ;
 
 cfamColumns[CreateTableStatement.RawStatement expr]
@@ -674,15 +689,15 @@
     | '(' { List<ColumnIdentifier> l = new ArrayList<ColumnIdentifier>(); } k1=ident { l.add(k1); } ( ',' kn=ident { l.add(kn); } )* ')' { $expr.addKeyAliases(l); }
     ;
 
-cfamProperty[CreateTableStatement.RawStatement expr]
-    : property[expr.properties]
-    | K_COMPACT K_STORAGE { $expr.setCompactStorage(); }
-    | K_CLUSTERING K_ORDER K_BY '(' cfamOrdering[expr] (',' cfamOrdering[expr])* ')'
+cfamProperty[CFProperties props]
+    : property[props.properties]
+    | K_COMPACT K_STORAGE { $props.setCompactStorage(); }
+    | K_CLUSTERING K_ORDER K_BY '(' cfamOrdering[props] (',' cfamOrdering[props])* ')'
     ;
 
-cfamOrdering[CreateTableStatement.RawStatement expr]
+cfamOrdering[CFProperties props]
     @init{ boolean reversed=false; }
-    : k=ident (K_ASC | K_DESC { reversed=true;} ) { $expr.setOrdering(k, reversed); }
+    : k=ident (K_ASC | K_DESC { reversed=true;} ) { $props.setOrdering(k, reversed); }
     ;
 
 
@@ -701,7 +716,7 @@
     ;
 
 typeColumns[CreateTypeStatement expr]
-    : k=ident v=comparatorType { $expr.addDefinition(k, v); }
+    : k=noncol_ident v=comparatorType { $expr.addDefinition(k, v); }
     ;
 
 
@@ -714,21 +729,50 @@
         IndexPropDefs props = new IndexPropDefs();
         boolean ifNotExists = false;
         IndexName name = new IndexName();
+        List<IndexTarget.Raw> targets = new ArrayList<>();
     }
     : K_CREATE (K_CUSTOM { props.isCustom = true; })? K_INDEX (K_IF K_NOT K_EXISTS { ifNotExists = true; } )?
-        (idxName[name])? K_ON cf=columnFamilyName '(' id=indexIdent ')'
+        (idxName[name])? K_ON cf=columnFamilyName '(' (indexIdent[targets] (',' indexIdent[targets])*)? ')'
         (K_USING cls=STRING_LITERAL { props.customClass = $cls.text; })?
         (K_WITH properties[props])?
-      { $expr = new CreateIndexStatement(cf, name, id, props, ifNotExists); }
+      { $expr = new CreateIndexStatement(cf, name, targets, props, ifNotExists); }
     ;
 
-indexIdent returns [IndexTarget.Raw id]
-    : c=cident                   { $id = IndexTarget.Raw.valuesOf(c); }
-    | K_KEYS '(' c=cident ')'    { $id = IndexTarget.Raw.keysOf(c); }
-    | K_ENTRIES '(' c=cident ')' { $id = IndexTarget.Raw.keysAndValuesOf(c); }
-    | K_FULL '(' c=cident ')'    { $id = IndexTarget.Raw.fullCollection(c); }
+indexIdent [List<IndexTarget.Raw> targets]
+    : c=cident                   { $targets.add(IndexTarget.Raw.simpleIndexOn(c)); }
+    | K_VALUES '(' c=cident ')'  { $targets.add(IndexTarget.Raw.valuesOf(c)); }
+    | K_KEYS '(' c=cident ')'    { $targets.add(IndexTarget.Raw.keysOf(c)); }
+    | K_ENTRIES '(' c=cident ')' { $targets.add(IndexTarget.Raw.keysAndValuesOf(c)); }
+    | K_FULL '(' c=cident ')'    { $targets.add(IndexTarget.Raw.fullCollection(c)); }
     ;
 
+/**
+ * CREATE MATERIALIZED VIEW <viewName> AS
+ *  SELECT <columns>
+ *  FROM <CF>
+ *  WHERE <pkColumns> IS NOT NULL
+ *  PRIMARY KEY (<pkColumns>)
+ *  WITH <property> = <value> AND ...;
+ */
+createMaterializedViewStatement returns [CreateViewStatement expr]
+    @init {
+        boolean ifNotExists = false;
+        List<ColumnIdentifier.Raw> partitionKeys = new ArrayList<>();
+        List<ColumnIdentifier.Raw> compositeKeys = new ArrayList<>();
+    }
+    : K_CREATE K_MATERIALIZED K_VIEW (K_IF K_NOT K_EXISTS { ifNotExists = true; })? cf=columnFamilyName K_AS
+        K_SELECT sclause=selectClause K_FROM basecf=columnFamilyName
+        (K_WHERE wclause=whereClause)?
+        K_PRIMARY K_KEY (
+        '(' '(' k1=cident { partitionKeys.add(k1); } ( ',' kn=cident { partitionKeys.add(kn); } )* ')' ( ',' c1=cident { compositeKeys.add(c1); } )* ')'
+    |   '(' k1=cident { partitionKeys.add(k1); } ( ',' cn=cident { compositeKeys.add(cn); } )* ')'
+        )
+        {
+             WhereClause where = wclause == null ? WhereClause.empty() : wclause.build();
+             $expr = new CreateViewStatement(cf, basecf, sclause, where, partitionKeys, compositeKeys, ifNotExists);
+        }
+        ( K_WITH cfamProperty[expr.properties] ( K_AND cfamProperty[expr.properties] )*)?
+    ;
 
 /**
  * CREATE TRIGGER triggerName ON columnFamily USING 'triggerClass';
@@ -737,7 +781,7 @@
     @init {
         boolean ifNotExists = false;
     }
-    : K_CREATE K_TRIGGER (K_IF K_NOT K_EXISTS { ifNotExists = true; } )? (name=cident)
+    : K_CREATE K_TRIGGER (K_IF K_NOT K_EXISTS { ifNotExists = true; } )? (name=noncol_ident)
         K_ON cf=columnFamilyName K_USING cls=STRING_LITERAL
       { $expr = new CreateTriggerStatement(cf, name.toString(), $cls.text, ifNotExists); }
     ;
@@ -747,7 +791,7 @@
  */
 dropTriggerStatement returns [DropTriggerStatement expr]
      @init { boolean ifExists = false; }
-    : K_DROP K_TRIGGER (K_IF K_EXISTS { ifExists = true; } )? (name=cident) K_ON cf=columnFamilyName
+    : K_DROP K_TRIGGER (K_IF K_EXISTS { ifExists = true; } )? (name=noncol_ident) K_ON cf=columnFamilyName
       { $expr = new DropTriggerStatement(cf, name.toString(), ifExists); }
     ;
 
@@ -755,7 +799,7 @@
  * ALTER KEYSPACE <KS> WITH <property> = <value>;
  */
 alterKeyspaceStatement returns [AlterKeyspaceStatement expr]
-    @init { KSPropDefs attrs = new KSPropDefs(); }
+    @init { KeyspaceAttributes attrs = new KeyspaceAttributes(); }
     : K_ALTER K_KEYSPACE ks=keyspaceName
         K_WITH properties[attrs] { $expr = new AlterKeyspaceStatement(ks, attrs); }
     ;
@@ -771,24 +815,40 @@
 alterTableStatement returns [AlterTableStatement expr]
     @init {
         AlterTableStatement.Type type = null;
-        CFPropDefs props = new CFPropDefs();
-        Map<ColumnIdentifier.Raw, ColumnIdentifier.Raw> renames = new HashMap<ColumnIdentifier.Raw, ColumnIdentifier.Raw>();
+        TableAttributes attrs = new TableAttributes();
+        Map<ColumnIdentifier.Raw, ColumnIdentifier> renames = new HashMap<ColumnIdentifier.Raw, ColumnIdentifier>();
         boolean isStatic = false;
+        Long dropTimestamp = null;
     }
     : K_ALTER K_COLUMNFAMILY cf=columnFamilyName
-          ( K_ALTER id=cident K_TYPE v=comparatorType { type = AlterTableStatement.Type.ALTER; }
-          | K_ADD   id=cident v=comparatorType ({ isStatic=true; } K_STATIC)? { type = AlterTableStatement.Type.ADD; }
-          | K_DROP  id=cident                         { type = AlterTableStatement.Type.DROP; }
-          | K_WITH  properties[props]                 { type = AlterTableStatement.Type.OPTS; }
-          | K_RENAME                                  { type = AlterTableStatement.Type.RENAME; }
-               id1=cident K_TO toId1=cident { renames.put(id1, toId1); }
-               ( K_AND idn=cident K_TO toIdn=cident { renames.put(idn, toIdn); } )*
+          ( K_ALTER id=cident K_TYPE v=comparatorType { type = AlterTableStatement.Type.ALTER;  }
+          | K_ADD   aid=ident {id=new ColumnIdentifier.ColumnIdentifierValue(aid);} v=comparatorType ({ isStatic=true; } K_STATIC)? { type = AlterTableStatement.Type.ADD; }
+          | K_DROP  id=cident                               { type = AlterTableStatement.Type.DROP; }
+          | K_DROP  id=cident K_USING K_TIMESTAMP t=INTEGER { type = AlterTableStatement.Type.DROP;
+                                                              dropTimestamp = Long.parseLong(Constants.Literal.integer($t.text).getText()); }
+          | K_DROP  K_COMPACT K_STORAGE                     { type = AlterTableStatement.Type.DROP_COMPACT_STORAGE; }
+          | K_WITH  properties[attrs]                       { type = AlterTableStatement.Type.OPTS; }
+          | K_RENAME                                        { type = AlterTableStatement.Type.RENAME; }
+               id1=cident K_TO toId1=ident { renames.put(id1, toId1); }
+               ( K_AND idn=cident K_TO toIdn=ident { renames.put(idn, toIdn); } )*
           )
     {
-        $expr = new AlterTableStatement(cf, type, id, v, props, renames, isStatic);
+        $expr = new AlterTableStatement(cf, type, id, v, attrs, renames, isStatic, dropTimestamp);
     }
     ;
 
+alterMaterializedViewStatement returns [AlterViewStatement expr]
+    @init {
+        TableAttributes attrs = new TableAttributes();
+    }
+    : K_ALTER K_MATERIALIZED K_VIEW name=columnFamilyName
+          K_WITH properties[attrs]
+    {
+        $expr = new AlterViewStatement(name, attrs);
+    }
+    ;
+    
+
 /**
  * ALTER TYPE <name> ALTER <field> TYPE <newtype>;
  * ALTER TYPE <name> ADD <field> <newtype>;
@@ -796,12 +856,12 @@
  */
 alterTypeStatement returns [AlterTypeStatement expr]
     : K_ALTER K_TYPE name=userTypeName
-          ( K_ALTER f=ident K_TYPE v=comparatorType { $expr = AlterTypeStatement.alter(name, f, v); }
-          | K_ADD   f=ident v=comparatorType        { $expr = AlterTypeStatement.addition(name, f, v); }
+          ( K_ALTER f=noncol_ident K_TYPE v=comparatorType { $expr = AlterTypeStatement.alter(name, f, v); }
+          | K_ADD   f=noncol_ident v=comparatorType        { $expr = AlterTypeStatement.addition(name, f, v); }
           | K_RENAME
                { Map<ColumnIdentifier, ColumnIdentifier> renames = new HashMap<ColumnIdentifier, ColumnIdentifier>(); }
-                 id1=ident K_TO toId1=ident { renames.put(id1, toId1); }
-                 ( K_AND idn=ident K_TO toIdn=ident { renames.put(idn, toIdn); } )*
+                 id1=noncol_ident K_TO toId1=noncol_ident { renames.put(id1, toId1); }
+                 ( K_AND idn=noncol_ident K_TO toIdn=noncol_ident { renames.put(idn, toIdn); } )*
                { $expr = AlterTypeStatement.renames(name, renames); }
           )
     ;
@@ -841,6 +901,15 @@
     ;
 
 /**
+ * DROP MATERIALIZED VIEW [IF EXISTS] <view_name>
+ */
+dropMaterializedViewStatement returns [DropViewStatement expr]
+    @init { boolean ifExists = false; }
+    : K_DROP K_MATERIALIZED K_VIEW (K_IF K_EXISTS { ifExists = true; } )? cf=columnFamilyName
+      { $expr = new DropViewStatement(cf, ifExists); }
+    ;
+
+/**
   * TRUNCATE <CF>;
   */
 truncateStatement returns [TruncateStatement stmt]
@@ -1101,14 +1170,25 @@
 // Column Identifiers.  These need to be treated differently from other
 // identifiers because the underlying comparator is not necessarily text. See
 // CASSANDRA-8178 for details.
+// Also, we need to support the internal of the super column map (for backward
+// compatibility) which is empty (we only want to allow this is queries, not for
+// creating table or other).
 cident returns [ColumnIdentifier.Raw id]
-    : t=IDENT              { $id = new ColumnIdentifier.Raw($t.text, false); }
-    | t=QUOTED_NAME        { $id = new ColumnIdentifier.Raw($t.text, true); }
-    | k=unreserved_keyword { $id = new ColumnIdentifier.Raw(k, false); }
+    : t=IDENT              { $id = new ColumnIdentifier.Literal($t.text, false); }
+    | t=QUOTED_NAME        { $id = new ColumnIdentifier.Literal($t.text, true); }
+    | k=unreserved_keyword { $id = new ColumnIdentifier.Literal(k, false); }
+    | EMPTY_QUOTED_NAME    { $id = new ColumnIdentifier.Literal("", false); }
     ;
 
-// Identifiers that do not refer to columns or where the comparator is known to be text
+// Column identifiers where the comparator is known to be text
 ident returns [ColumnIdentifier id]
+    : t=IDENT              { $id = ColumnIdentifier.getInterned($t.text, false); }
+    | t=QUOTED_NAME        { $id = ColumnIdentifier.getInterned($t.text, true); }
+    | k=unreserved_keyword { $id = ColumnIdentifier.getInterned(k, false); }
+    ;
+
+// Identifiers that do not refer to columns
+noncol_ident returns [ColumnIdentifier id]
     : t=IDENT              { $id = new ColumnIdentifier($t.text, false); }
     | t=QUOTED_NAME        { $id = new ColumnIdentifier($t.text, true); }
     | k=unreserved_keyword { $id = new ColumnIdentifier(k, false); }
@@ -1131,7 +1211,7 @@
     ;
 
 userTypeName returns [UTName name]
-    : (ks=ident '.')? ut=non_type_ident { return new UTName(ks, ut); }
+    : (ks=noncol_ident '.')? ut=non_type_ident { return new UTName(ks, ut); }
     ;
 
 userOrRoleName returns [RoleName name]
@@ -1207,7 +1287,7 @@
     @init{ Map<ColumnIdentifier, Term.Raw> m = new HashMap<ColumnIdentifier, Term.Raw>(); }
     @after{ $ut = new UserTypes.Literal(m); }
     // We don't allow empty literals because that conflicts with sets/maps and is currently useless since we don't allow empty user types
-    : '{' k1=ident ':' v1=term { m.put(k1, v1); } ( ',' kn=ident ':' vn=term { m.put(kn, vn); } )* '}'
+    : '{' k1=noncol_ident ':' v1=term { m.put(k1, v1); } ( ',' kn=noncol_ident ':' vn=term { m.put(kn, vn); } )* '}'
     ;
 
 tupleLiteral returns [Tuples.Literal tt]
@@ -1222,19 +1302,21 @@
     | u=usertypeLiteral    { $value = u; }
     | t=tupleLiteral       { $value = t; }
     | K_NULL               { $value = Constants.NULL_LITERAL; }
-    | ':' id=ident         { $value = newBindVariables(id); }
+    | ':' id=noncol_ident  { $value = newBindVariables(id); }
     | QMARK                { $value = newBindVariables(null); }
     ;
 
 intValue returns [Term.Raw value]
     :
     | t=INTEGER     { $value = Constants.Literal.integer($t.text); }
-    | ':' id=ident  { $value = newBindVariables(id); }
+    | ':' id=noncol_ident  { $value = newBindVariables(id); }
     | QMARK         { $value = newBindVariables(null); }
     ;
 
 functionName returns [FunctionName s]
-    : (ks=keyspaceName '.')? f=allowedFunctionName   { $s = new FunctionName(ks, f); }
+     // antlr might try to recover and give a null for f. It will still error out in the end, but FunctionName
+     // wouldn't be happy with that so we should bypass this for now or we'll have a weird user-facing error
+    : (ks=keyspaceName '.')? f=allowedFunctionName   { $s = f == null ? null : new FunctionName(ks, f); }
     ;
 
 allowedFunctionName returns [String s]
@@ -1330,8 +1412,8 @@
     ;
 
 property[PropertyDefinitions props]
-    : k=ident '=' simple=propertyValue { try { $props.addProperty(k.toString(), simple); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } }
-    | k=ident '=' map=mapLiteral { try { $props.addProperty(k.toString(), convertPropertyMap(map)); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } }
+    : k=noncol_ident '=' simple=propertyValue { try { $props.addProperty(k.toString(), simple); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } }
+    | k=noncol_ident '=' map=mapLiteral { try { $props.addProperty(k.toString(), convertPropertyMap(map)); } catch (SyntaxException e) { addRecognitionError(e.getMessage()); } }
     ;
 
 propertyValue returns [String str]
@@ -1348,8 +1430,9 @@
     | '!=' { $op = Operator.NEQ; }
     ;
 
-relation[List<Relation> clauses]
+relation[WhereClause.Builder clauses]
     : name=cident type=relationType t=term { $clauses.add(new SingleColumnRelation(name, type, t)); }
+    | name=cident K_IS K_NOT K_NULL { $clauses.add(new SingleColumnRelation(name, Operator.IS_NOT, Constants.NULL_LITERAL)); }
     | K_TOKEN l=tupleOfIdentifiers type=relationType t=term
         { $clauses.add(new TokenRelation(l, type, t)); }
     | name=cident K_IN marker=inMarker
@@ -1384,7 +1467,7 @@
 
 inMarker returns [AbstractMarker.INRaw marker]
     : QMARK { $marker = newINBindVariables(null); }
-    | ':' name=ident { $marker = newINBindVariables(name); }
+    | ':' name=noncol_ident { $marker = newINBindVariables(name); }
     ;
 
 tupleOfIdentifiers returns [List<ColumnIdentifier.Raw> ids]
@@ -1404,7 +1487,7 @@
 
 markerForTuple returns [Tuples.Raw marker]
     : QMARK { $marker = newTupleBindVariables(null); }
-    | ':' name=ident { $marker = newTupleBindVariables(name); }
+    | ':' name=noncol_ident { $marker = newTupleBindVariables(name); }
     ;
 
 tupleOfMarkersForTuples returns [List<Tuples.Raw> markers]
@@ -1414,7 +1497,7 @@
 
 inMarkerForTuple returns [Tuples.INRaw marker]
     : QMARK { $marker = newTupleINBindVariables(null); }
-    | ':' name=ident { $marker = newTupleINBindVariables(name); }
+    | ':' name=noncol_ident { $marker = newTupleINBindVariables(name); }
     ;
 
 comparatorType returns [CQL3Type.Raw t]
@@ -1558,6 +1641,8 @@
     ;
 
 // Case-insensitive keywords
+// When adding a new reserved keyword, add entry to o.a.c.cql3.ReservedKeywords as well
+// When adding a new unreserved keyword, add entry to list above
 K_SELECT:      S E L E C T;
 K_FROM:        F R O M;
 K_AS:          A S;
@@ -1589,6 +1674,8 @@
 K_KEYSPACES:   K E Y S P A C E S;
 K_COLUMNFAMILY:( C O L U M N F A M I L Y
                  | T A B L E );
+K_MATERIALIZED:M A T E R I A L I Z E D;
+K_VIEW:        V I E W;
 K_INDEX:       I N D E X;
 K_CUSTOM:      C U S T O M;
 K_ON:          O N;
@@ -1612,6 +1699,7 @@
 K_ALLOW:       A L L O W;
 K_FILTERING:   F I L T E R I N G;
 K_IF:          I F;
+K_IS:          I S;
 K_CONTAINS:    C O N T A I N S;
 
 K_GRANT:       G R A N T;
@@ -1741,6 +1829,10 @@
       )
     ;
 
+EMPTY_QUOTED_NAME
+    : '\"' '\"'
+    ;
+
 QUOTED_NAME
     @init{ StringBuilder b = new StringBuilder(); }
     @after{ setText(b.toString()); }

diff --git a/src/java/org/apache/cassandra/cql3/IfExistsCondition.java b/src/java/org/apache/cassandra/cql3/IfExistsCondition.java
new file mode 100644
index 0000000..a24d8c0
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/IfExistsCondition.java

@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.apache.cassandra.cql3.statements.CQL3CasRequest;
+import org.apache.cassandra.db.Clustering;
+
+final class IfExistsCondition extends AbstractConditions
+{
+    @Override
+    public void addConditionsTo(CQL3CasRequest request, Clustering clustering, QueryOptions options)
+    {
+        request.addExist(clustering);
+    }
+
+    @Override
+    public boolean isIfExists()
+    {
+        return true;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/IfNotExistsCondition.java b/src/java/org/apache/cassandra/cql3/IfNotExistsCondition.java
new file mode 100644
index 0000000..05cb864
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/IfNotExistsCondition.java

@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.apache.cassandra.cql3.statements.CQL3CasRequest;
+import org.apache.cassandra.db.Clustering;
+
+final class IfNotExistsCondition extends AbstractConditions
+{
+    @Override
+    public void addConditionsTo(CQL3CasRequest request, Clustering clustering, QueryOptions options)
+    {
+        request.addNotExist(clustering);
+    }
+
+    @Override
+    public boolean isIfNotExists()
+    {
+        return true;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/Json.java b/src/java/org/apache/cassandra/cql3/Json.java
index 78fbd08..ab02fb6 100644
--- a/src/java/org/apache/cassandra/cql3/Json.java
+++ b/src/java/org/apache/cassandra/cql3/Json.java

@@ -77,7 +77,7 @@
 
         public Prepared prepareAndCollectMarkers(CFMetaData metadata, Collection<ColumnDefinition> receivers, VariableSpecifications boundNames)
         {
-            return new PreparedLiteral(metadata.ksName, parseJson(text, receivers));
+            return new PreparedLiteral(parseJson(text, receivers));
         }
     }
 
@@ -97,7 +97,7 @@
         public Prepared prepareAndCollectMarkers(CFMetaData metadata, Collection<ColumnDefinition> receivers, VariableSpecifications boundNames)
         {
             boundNames.add(bindIndex, makeReceiver(metadata));
-            return new PreparedMarker(metadata.ksName, bindIndex, receivers);
+            return new PreparedMarker(bindIndex, receivers);
         }
 
         private ColumnSpecification makeReceiver(CFMetaData metadata)
@@ -111,27 +111,7 @@
      */
     public static abstract class Prepared
     {
-        private final String keyspace;
-
-        protected Prepared(String keyspace)
-        {
-            this.keyspace = keyspace;
-        }
-
-        protected abstract Term.Raw getRawTermForColumn(ColumnDefinition def);
-
-        public Term getPrimaryKeyValueForColumn(ColumnDefinition def)
-        {
-            // Note that we know we don't have to call collectMarkerSpecification since it has already been collected
-            return getRawTermForColumn(def).prepare(keyspace, def);
-        }
-
-        public Operation getSetOperationForColumn(ColumnDefinition def)
-        {
-            // Note that we know we don't have to call collectMarkerSpecification on the operation since we have
-            // already collected all we need.
-            return new Operation.SetValue(getRawTermForColumn(def)).prepare(keyspace, def);
-        }
+        public abstract Term.Raw getRawTermForColumn(ColumnDefinition def);
     }
 
     /**
@@ -141,13 +121,12 @@
     {
         private final Map<ColumnIdentifier, Term> columnMap;
 
-        public PreparedLiteral(String keyspace, Map<ColumnIdentifier, Term> columnMap)
+        public PreparedLiteral(Map<ColumnIdentifier, Term> columnMap)
         {
-            super(keyspace);
             this.columnMap = columnMap;
         }
 
-        protected Term.Raw getRawTermForColumn(ColumnDefinition def)
+        public Term.Raw getRawTermForColumn(ColumnDefinition def)
         {
             Term value = columnMap.get(def.name);
             return value == null ? Constants.NULL_LITERAL : new ColumnValue(value);
@@ -162,16 +141,15 @@
         private final int bindIndex;
         private final Collection<ColumnDefinition> columns;
 
-        public PreparedMarker(String keyspace, int bindIndex, Collection<ColumnDefinition> columns)
+        public PreparedMarker(int bindIndex, Collection<ColumnDefinition> columns)
         {
-            super(keyspace);
             this.bindIndex = bindIndex;
             this.columns = columns;
         }
 
-        protected DelayedColumnValue getRawTermForColumn(ColumnDefinition def)
+        public RawDelayedColumnValue getRawTermForColumn(ColumnDefinition def)
         {
-            return new DelayedColumnValue(this, def);
+            return new RawDelayedColumnValue(this, def);
         }
     }
 
@@ -181,7 +159,7 @@
      * Note that this is intrinsically an already prepared term, but this still implements Term.Raw so that we can
      * easily use it to create raw operations.
      */
-    private static class ColumnValue implements Term.Raw
+    private static class ColumnValue extends Term.Raw
     {
         private final Term term;
 
@@ -201,19 +179,22 @@
         {
             return TestResult.NOT_ASSIGNABLE;
         }
+
+        public String getText()
+        {
+            return term.toString();
+        }
     }
 
     /**
-     * A NonTerminal for a single column.
-     *
-     * As with {@code ColumnValue}, this is intrinsically a prepared term but implements Terms.Raw for convenience.
+     * A Raw term for a single column. Like ColumnValue, this is intrinsically already prepared.
      */
-    private static class DelayedColumnValue extends Term.NonTerminal implements Term.Raw
+    private static class RawDelayedColumnValue extends Term.Raw
     {
         private final PreparedMarker marker;
         private final ColumnDefinition column;
 
-        public DelayedColumnValue(PreparedMarker prepared, ColumnDefinition column)
+        public RawDelayedColumnValue(PreparedMarker prepared, ColumnDefinition column)
         {
             this.marker = prepared;
             this.column = column;
@@ -222,7 +203,7 @@
         @Override
         public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            return this;
+            return new DelayedColumnValue(marker, column);
         }
 
         @Override
@@ -231,6 +212,26 @@
             return TestResult.WEAKLY_ASSIGNABLE;
         }
 
+        public String getText()
+        {
+            return marker.toString();
+        }
+    }
+
+    /**
+     * A NonTerminal for a single column. As with {@code ColumnValue}, this is intrinsically a prepared.
+     */
+    private static class DelayedColumnValue extends Term.NonTerminal
+    {
+        private final PreparedMarker marker;
+        private final ColumnDefinition column;
+
+        public DelayedColumnValue(PreparedMarker prepared, ColumnDefinition column)
+        {
+            this.marker = prepared;
+            this.column = column;
+        }
+
         @Override
         public void collectMarkerSpecification(VariableSpecifications boundNames)
         {
@@ -251,9 +252,8 @@
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Collections.emptyList();
         }
     }
 

diff --git a/src/java/org/apache/cassandra/cql3/Lists.java b/src/java/org/apache/cassandra/cql3/Lists.java
index cc75476..065f74a 100644
--- a/src/java/org/apache/cassandra/cql3/Lists.java
+++ b/src/java/org/apache/cassandra/cql3/Lists.java

@@ -21,18 +21,16 @@
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
 
-import org.apache.cassandra.config.CFMetaData;
+import com.google.common.annotations.VisibleForTesting;
+
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CompositesBuilder;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.ListType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -40,7 +38,6 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
 
 /**
@@ -60,7 +57,7 @@
         return new ColumnSpecification(column.ksName, column.cfName, new ColumnIdentifier("value(" + column.name + ")", true), ((ListType)column.type).getElementsType());
     }
 
-    public static class Literal implements Term.Raw
+    public static class Literal extends Term.Raw
     {
         private final List<Term.Raw> elements;
 
@@ -74,7 +71,7 @@
             validateAssignableTo(keyspace, receiver);
 
             ColumnSpecification valueSpec = Lists.valueSpecOf(receiver);
-            List<Term> values = new ArrayList<Term>(elements.size());
+            List<Term> values = new ArrayList<>(elements.size());
             boolean allTerminal = true;
             for (Term.Raw rt : elements)
             {
@@ -118,10 +115,9 @@
             return AssignmentTestable.TestResult.testAll(keyspace, valueSpec, elements);
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
-            return elements.toString();
+            return elements.stream().map(Term.Raw::getText).collect(Collectors.joining(", ", "[", "]"));
         }
     }
 
@@ -216,20 +212,14 @@
                 if (bytes == ByteBufferUtil.UNSET_BYTE_BUFFER)
                     return UNSET_VALUE;
 
-                // We don't support value > 64K because the serialization format encode the length as an unsigned short.
-                if (bytes.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-                    throw new InvalidRequestException(String.format("List value is too long. List values are limited to %d bytes but %d bytes value provided",
-                                                                    FBUtilities.MAX_UNSIGNED_SHORT,
-                                                                    bytes.remaining()));
-
                 buffers.add(bytes);
             }
             return new Value(buffers);
         }
 
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Terms.getFunctions(elements);
+            Terms.addFunctions(elements, functions);
         }
     }
 
@@ -255,18 +245,17 @@
         }
     }
 
-    /*
+    /**
      * For prepend, we need to be able to generate unique but decreasing time
-     * UUID, which is a bit challenging. To do that, given a time in milliseconds,
-     * we adds a number representing the 100-nanoseconds precision and make sure
-     * that within the same millisecond, that number is always decreasing. We
-     * do rely on the fact that the user will only provide decreasing
-     * milliseconds timestamp for that purpose.
+     * UUIDs, which is a bit challenging. To do that, given a time in milliseconds,
+     * we add a number representing the 100-nanoseconds precision and make sure
+     * that within the same millisecond, that number is always decreasing.
      */
-    private static class PrecisionTime
+    static class PrecisionTime
     {
         // Our reference time (1 jan 2010, 00:00:00) in milliseconds.
         private static final long REFERENCE_TIME = 1262304000000L;
+        static final int MAX_NANOS = 9999;
         private static final AtomicReference<PrecisionTime> last = new AtomicReference<>(new PrecisionTime(Long.MAX_VALUE, 0));
 
         public final long millis;
@@ -278,21 +267,52 @@
             this.nanos = nanos;
         }
 
-        static PrecisionTime getNext(long millis)
+        static PrecisionTime getNext(long millis, int count)
         {
+            if (count == 0)
+                return last.get();
+
             while (true)
             {
                 PrecisionTime current = last.get();
 
-                assert millis <= current.millis;
-                PrecisionTime next = millis < current.millis
-                    ? new PrecisionTime(millis, 9999)
-                    : new PrecisionTime(millis, Math.max(0, current.nanos - 1));
+                final PrecisionTime next;
+                if (millis < current.millis)
+                {
+                    next = new PrecisionTime(millis, MAX_NANOS - count);
+                }
+                else
+                {
+                    // in addition to being at the same millisecond, we handle the unexpected case of the millis parameter
+                    // being in the past. That could happen if the System.currentTimeMillis() not operating montonically
+                    // or if one thread is just a really big loser in the compareAndSet game of life.
+                    long millisToUse = millis <= current.millis ? millis : current.millis;
+
+                    // if we will go below zero on the nanos, decrement the millis by one
+                    final int nanosToUse;
+                    if (current.nanos - count >= 0)
+                    {
+                        nanosToUse = current.nanos - count;
+                    }
+                    else
+                    {
+                        nanosToUse = MAX_NANOS - count;
+                        millisToUse -= 1;
+                    }
+
+                    next = new PrecisionTime(millisToUse, nanosToUse);
+                }
 
                 if (last.compareAndSet(current, next))
                     return next;
             }
         }
+
+        @VisibleForTesting
+        static void set(long millis, int nanos)
+        {
+            last.set(new PrecisionTime(millis, nanos));
+        }
     }
 
     public static class Setter extends Operation
@@ -302,20 +322,28 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             Term.Terminal value = t.bind(params.options);
-            if (column.type.isMultiCell() && value != UNSET_VALUE)
-            {
-                // delete + append
-                CellName name = cf.getComparator().create(prefix, column);
-                cf.addAtom(params.makeTombstoneForOverwrite(name.slice()));
-            }
-            if (value != UNSET_VALUE)
-                Appender.doAppend(cf, prefix, column, params, value);
+            if (value == UNSET_VALUE)
+                return;
+
+            // delete + append
+            if (column.type.isMultiCell())
+                params.setComplexDeletionTimeForOverwrite(column);
+            Appender.doAppend(value, column, params);
         }
     }
 
+    private static int existingSize(Row row, ColumnDefinition column)
+    {
+        if (row == null)
+            return 0;
+
+        ComplexColumnData complexData = row.getComplexColumnData(column);
+        return complexData == null ? 0 : complexData.cellsCount();
+    }
+
     public static class SetterByIndex extends Operation
     {
         private final Term idx;
@@ -339,7 +367,7 @@
             idx.collectMarkerSpecification(boundNames);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             // we should not get here for frozen lists
             assert column.type.isMultiCell() : "Attempted to set an individual element on a frozen list";
@@ -352,27 +380,22 @@
             if (index == ByteBufferUtil.UNSET_BYTE_BUFFER)
                 throw new InvalidRequestException("Invalid unset value for list index");
 
-            List<Cell> existingList = params.getPrefetchedList(rowKey, column.name, cf);
+            Row existingRow = params.getPrefetchedRow(partitionKey, params.currentClustering());
+            int existingSize = existingSize(existingRow, column);
             int idx = ByteBufferUtil.toInt(index);
-            if (existingList == null || existingList.size() == 0)
+            if (existingSize == 0)
                 throw new InvalidRequestException("Attempted to set an element on a list which is null");
-            if (idx < 0 || idx >= existingList.size())
-                throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, existingList.size()));
+            if (idx < 0 || idx >= existingSize)
+                throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, existingSize));
 
-            CellName elementName = existingList.get(idx).name();
+            CellPath elementPath = existingRow.getComplexColumnData(column).getCellByIndex(idx).path();
             if (value == null)
             {
-                cf.addColumn(params.makeTombstone(elementName));
+                params.addTombstone(column, elementPath);
             }
             else if (value != ByteBufferUtil.UNSET_BYTE_BUFFER)
             {
-                // We don't support value > 64K because the serialization format encode the length as an unsigned short.
-                if (value.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-                    throw new InvalidRequestException(String.format("List value is too long. List values are limited to %d bytes but %d bytes value provided",
-                                                                    FBUtilities.MAX_UNSIGNED_SHORT,
-                                                                    value.remaining()));
-
-                cf.addColumn(params.makeColumn(elementName, value));
+                params.addCell(column, elementPath, value);
             }
         }
     }
@@ -384,15 +407,14 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to append to a frozen list";
             Term.Terminal value = t.bind(params.options);
-            if (value != UNSET_VALUE)
-                doAppend(cf, prefix, column, params, value);
+            doAppend(value, column, params);
         }
 
-        static void doAppend(ColumnFamily cf, Composite prefix, ColumnDefinition column, UpdateParameters params, Term.Terminal value) throws InvalidRequestException
+        static void doAppend(Term.Terminal value, ColumnDefinition column, UpdateParameters params) throws InvalidRequestException
         {
             if (column.type.isMultiCell())
             {
@@ -404,17 +426,16 @@
                 for (ByteBuffer buffer : ((Value) value).elements)
                 {
                     ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes());
-                    cf.addColumn(params.makeColumn(cf.getComparator().create(prefix, column, uuid), buffer));
+                    params.addCell(column, CellPath.create(uuid), buffer);
                 }
             }
             else
             {
                 // for frozen lists, we're overwriting the whole cell value
-                CellName name = cf.getComparator().create(prefix, column);
                 if (value == null)
-                    cf.addAtom(params.makeTombstone(name));
+                    params.addTombstone(column);
                 else
-                    cf.addColumn(params.makeColumn(name, value.get(Server.CURRENT_VERSION)));
+                    params.addCell(column, value.get(Server.CURRENT_VERSION));
             }
         }
     }
@@ -426,21 +447,31 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to prepend to a frozen list";
             Term.Terminal value = t.bind(params.options);
             if (value == null || value == UNSET_VALUE)
                 return;
 
-            long time = PrecisionTime.REFERENCE_TIME - (System.currentTimeMillis() - PrecisionTime.REFERENCE_TIME);
-
             List<ByteBuffer> toAdd = ((Value) value).elements;
-            for (int i = toAdd.size() - 1; i >= 0; i--)
+            final int totalCount = toAdd.size();
+
+            // we have to obey MAX_NANOS per batch - in the unlikely event a client has decided to prepend a list with
+            // an insane number of entries.
+            PrecisionTime pt = null;
+            int remainingInBatch = 0;
+            for (int i = totalCount - 1; i >= 0; i--)
             {
-                PrecisionTime pt = PrecisionTime.getNext(time);
-                ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes(pt.millis, pt.nanos));
-                cf.addColumn(params.makeColumn(cf.getComparator().create(prefix, column, uuid), toAdd.get(i)));
+                if (remainingInBatch == 0)
+                {
+                    long time = PrecisionTime.REFERENCE_TIME - (System.currentTimeMillis() - PrecisionTime.REFERENCE_TIME);
+                    remainingInBatch = Math.min(PrecisionTime.MAX_NANOS, i) + 1;
+                    pt = PrecisionTime.getNext(time, remainingInBatch);
+                }
+
+                ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes(pt.millis, (pt.nanos + remainingInBatch--)));
+                params.addCell(column, CellPath.create(uuid), toAdd.get(i));
             }
         }
     }
@@ -458,30 +489,27 @@
             return true;
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete from a frozen list";
-            List<Cell> existingList = params.getPrefetchedList(rowKey, column.name, cf);
+
             // We want to call bind before possibly returning to reject queries where the value provided is not a list.
             Term.Terminal value = t.bind(params.options);
 
-            if (existingList == null)
-                throw new InvalidRequestException("Attempted to delete an element from a list which is null");
-            if (existingList.isEmpty())
-                return;
-
-            if (value == null || value == UNSET_VALUE)
+            Row existingRow = params.getPrefetchedRow(partitionKey, params.currentClustering());
+            ComplexColumnData complexData = existingRow == null ? null : existingRow.getComplexColumnData(column);
+            if (value == null || value == UNSET_VALUE || complexData == null)
                 return;
 
             // Note: below, we will call 'contains' on this toDiscard list for each element of existingList.
             // Meaning that if toDiscard is big, converting it to a HashSet might be more efficient. However,
             // the read-before-write this operation requires limits its usefulness on big lists, so in practice
             // toDiscard will be small and keeping a list will be more efficient.
-            List<ByteBuffer> toDiscard = ((Value) value).elements;
-            for (Cell cell : existingList)
+            List<ByteBuffer> toDiscard = ((Value)value).elements;
+            for (Cell cell : complexData)
             {
                 if (toDiscard.contains(cell.value()))
-                    cf.addColumn(params.makeTombstone(cell.name()));
+                    params.addTombstone(column, cell.path());
             }
         }
     }
@@ -499,7 +527,7 @@
             return true;
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete an item by index from a frozen list";
             Term.Terminal index = t.bind(params.options);
@@ -508,16 +536,15 @@
             if (index == Constants.UNSET_VALUE)
                 return;
 
-            List<Cell> existingList = params.getPrefetchedList(rowKey, column.name, cf);
-
+            Row existingRow = params.getPrefetchedRow(partitionKey, params.currentClustering());
+            int existingSize = existingSize(existingRow, column);
             int idx = ByteBufferUtil.toInt(index.get(params.options.getProtocolVersion()));
-            if (existingList == null || existingList.size() == 0)
+            if (existingSize == 0)
                 throw new InvalidRequestException("Attempted to delete an element from a list which is null");
-            if (idx < 0 || idx >= existingList.size())
-                throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, existingList.size()));
+            if (idx < 0 || idx >= existingSize)
+                throw new InvalidRequestException(String.format("List index %d out of bound, list has size %d", idx, existingSize));
 
-            CellName elementName = existingList.get(idx).name();
-            cf.addColumn(params.makeTombstone(elementName));
+            params.addTombstone(column, existingRow.getComplexColumnData(column).getCellByIndex(idx).path());
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/Maps.java b/src/java/org/apache/cassandra/cql3/Maps.java
index 5bb3a48..4b6f0fe 100644
--- a/src/java/org/apache/cassandra/cql3/Maps.java
+++ b/src/java/org/apache/cassandra/cql3/Maps.java

@@ -21,21 +21,18 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
-
-import com.google.common.collect.Iterables;
+import java.util.stream.Collectors;
 
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -55,7 +52,7 @@
         return new ColumnSpecification(column.ksName, column.cfName, new ColumnIdentifier("value(" + column.name + ")", true), ((MapType)column.type).getValuesType());
     }
 
-    public static class Literal implements Term.Raw
+    public static class Literal extends Term.Raw
     {
         public final List<Pair<Term.Raw, Term.Raw>> entries;
 
@@ -130,18 +127,11 @@
             return res;
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
-            StringBuilder sb = new StringBuilder();
-            sb.append("{");
-            for (int i = 0; i < entries.size(); i++)
-            {
-                if (i > 0) sb.append(", ");
-                sb.append(entries.get(i).left).append(":").append(entries.get(i).right);
-            }
-            sb.append("}");
-            return sb.toString();
+            return entries.stream()
+                    .map(entry -> String.format("%s: %s", entry.left.getText(), entry.right.getText()))
+                    .collect(Collectors.joining(", ", "{", "}"));
         }
     }
 
@@ -232,14 +222,11 @@
             {
                 // We don't support values > 64K because the serialization format encode the length as an unsigned short.
                 ByteBuffer keyBytes = entry.getKey().bindAndGet(options);
+
                 if (keyBytes == null)
                     throw new InvalidRequestException("null is not supported inside collections");
                 if (keyBytes == ByteBufferUtil.UNSET_BYTE_BUFFER)
                     throw new InvalidRequestException("unset value is not supported for map keys");
-                if (keyBytes.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-                    throw new InvalidRequestException(String.format("Map key is too long. Map keys are limited to %d bytes but %d bytes keys provided",
-                                                                    FBUtilities.MAX_UNSIGNED_SHORT,
-                                                                    keyBytes.remaining()));
 
                 ByteBuffer valueBytes = entry.getValue().bindAndGet(options);
                 if (valueBytes == null)
@@ -247,20 +234,15 @@
                 if (valueBytes == ByteBufferUtil.UNSET_BYTE_BUFFER)
                     return UNSET_VALUE;
 
-                if (valueBytes.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-                    throw new InvalidRequestException(String.format("Map value is too long. Map values are limited to %d bytes but %d bytes value provided",
-                                                                    FBUtilities.MAX_UNSIGNED_SHORT,
-                                                                    valueBytes.remaining()));
-
                 buffers.put(keyBytes, valueBytes);
             }
             return new Value(buffers);
         }
 
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Iterables.concat(Terms.getFunctions(elements.keySet()),
-                                    Terms.getFunctions(elements.values()));
+            Terms.addFunctions(elements.keySet(), functions);
+            Terms.addFunctions(elements.values(), functions);
         }
     }
 
@@ -290,17 +272,16 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             Term.Terminal value = t.bind(params.options);
-            if (column.type.isMultiCell() && value != UNSET_VALUE)
-            {
-                // delete + put
-                CellName name = cf.getComparator().create(prefix, column);
-                cf.addAtom(params.makeTombstoneForOverwrite(name.slice()));
-            }
-            if (value != UNSET_VALUE)
-                Putter.doPut(cf, prefix, column, params, value);
+            if (value == UNSET_VALUE)
+                return;
+
+            // delete + put
+            if (column.type.isMultiCell())
+                params.setComplexDeletionTimeForOverwrite(column);
+            Putter.doPut(value, column, params);
         }
     }
 
@@ -321,7 +302,7 @@
             k.collectMarkerSpecification(boundNames);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to set a value for a single key on a frozen map";
             ByteBuffer key = k.bindAndGet(params.options);
@@ -331,25 +312,104 @@
             if (key == ByteBufferUtil.UNSET_BYTE_BUFFER)
                 throw new InvalidRequestException("Invalid unset map key");
 
-            CellName cellName = cf.getComparator().create(prefix, column, key);
+            CellPath path = CellPath.create(key);
 
             if (value == null)
             {
-                cf.addColumn(params.makeTombstone(cellName));
+                params.addTombstone(column, path);
             }
             else if (value != ByteBufferUtil.UNSET_BYTE_BUFFER)
             {
-                // We don't support value > 64K because the serialization format encode the length as an unsigned short.
-                if (value.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-                    throw new InvalidRequestException(String.format("Map value is too long. Map values are limited to %d bytes but %d bytes value provided",
-                                                                    FBUtilities.MAX_UNSIGNED_SHORT,
-                                                                    value.remaining()));
-
-                cf.addColumn(params.makeColumn(cellName, value));
+                params.addCell(column, path, value);
             }
         }
     }
 
+    // Currently only used internally counters support in SuperColumn families.
+    // Addition on the element level inside the collections are otherwise not supported in the CQL.
+    public static class AdderByKey extends Operation
+    {
+        private final Term k;
+
+        public AdderByKey(ColumnDefinition column, Term t, Term k)
+        {
+            super(column, t);
+            this.k = k;
+        }
+
+        @Override
+        public void collectMarkerSpecification(VariableSpecifications boundNames)
+        {
+            super.collectMarkerSpecification(boundNames);
+            k.collectMarkerSpecification(boundNames);
+        }
+
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
+        {
+            assert column.type.isMultiCell() : "Attempted to set a value for a single key on a frozen map";
+
+            ByteBuffer key = k.bindAndGet(params.options);
+            ByteBuffer value = t.bindAndGet(params.options);
+
+            if (key == null)
+                throw new InvalidRequestException("Invalid null map key");
+            if (key == ByteBufferUtil.UNSET_BYTE_BUFFER)
+                throw new InvalidRequestException("Invalid unset map key");
+
+            if (value == null)
+                throw new InvalidRequestException("Invalid null value for counter increment");
+            if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
+                return;
+
+            long increment = ByteBufferUtil.toLong(value);
+            params.addCounter(column, increment, CellPath.create(key));
+        }
+    }
+
+    // Currently only used internally counters support in SuperColumn families.
+    // Addition on the element level inside the collections are otherwise not supported in the CQL.
+    public static class SubtracterByKey extends Operation
+    {
+        private final Term k;
+
+        public SubtracterByKey(ColumnDefinition column, Term t, Term k)
+        {
+            super(column, t);
+            this.k = k;
+        }
+
+        @Override
+        public void collectMarkerSpecification(VariableSpecifications boundNames)
+        {
+            super.collectMarkerSpecification(boundNames);
+            k.collectMarkerSpecification(boundNames);
+        }
+
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
+        {
+            assert column.type.isMultiCell() : "Attempted to set a value for a single key on a frozen map";
+
+            ByteBuffer key = k.bindAndGet(params.options);
+            ByteBuffer value = t.bindAndGet(params.options);
+
+            if (key == null)
+                throw new InvalidRequestException("Invalid null map key");
+            if (key == ByteBufferUtil.UNSET_BYTE_BUFFER)
+                throw new InvalidRequestException("Invalid unset map key");
+
+            if (value == null)
+                throw new InvalidRequestException("Invalid null value for counter increment");
+            if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
+                return;
+
+            long increment = ByteBufferUtil.toLong(value);
+            if (increment == Long.MIN_VALUE)
+                throw new InvalidRequestException("The negation of " + increment + " overflows supported counter precision (signed 8 bytes integer)");
+
+            params.addCounter(column, -increment, CellPath.create(key));
+        }
+    }
+
     public static class Putter extends Operation
     {
         public Putter(ColumnDefinition column, Term t)
@@ -357,15 +417,15 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to add items to a frozen map";
             Term.Terminal value = t.bind(params.options);
             if (value != UNSET_VALUE)
-                doPut(cf, prefix, column, params, value);
+                doPut(value, column, params);
         }
 
-        static void doPut(ColumnFamily cf, Composite prefix, ColumnDefinition column, UpdateParameters params, Term.Terminal value) throws InvalidRequestException
+        static void doPut(Term.Terminal value, ColumnDefinition column, UpdateParameters params) throws InvalidRequestException
         {
             if (column.type.isMultiCell())
             {
@@ -374,19 +434,15 @@
 
                 Map<ByteBuffer, ByteBuffer> elements = ((Value) value).map;
                 for (Map.Entry<ByteBuffer, ByteBuffer> entry : elements.entrySet())
-                {
-                    CellName cellName = cf.getComparator().create(prefix, column, entry.getKey());
-                    cf.addColumn(params.makeColumn(cellName, entry.getValue()));
-                }
+                    params.addCell(column, CellPath.create(entry.getKey()), entry.getValue());
             }
             else
             {
                 // for frozen maps, we're overwriting the whole cell
-                CellName cellName = cf.getComparator().create(prefix, column);
                 if (value == null)
-                    cf.addAtom(params.makeTombstone(cellName));
+                    params.addTombstone(column);
                 else
-                    cf.addColumn(params.makeColumn(cellName, value.get(Server.CURRENT_VERSION)));
+                    params.addCell(column, value.get(Server.CURRENT_VERSION));
             }
         }
     }
@@ -398,7 +454,7 @@
             super(column, k);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete a single key in a frozen map";
             Term.Terminal key = t.bind(params.options);
@@ -407,8 +463,7 @@
             if (key == Constants.UNSET_VALUE)
                 throw new InvalidRequestException("Invalid unset map key");
 
-            CellName cellName = cf.getComparator().create(prefix, column, key.get(params.options.getProtocolVersion()));
-            cf.addColumn(params.makeTombstone(cellName));
+            params.addTombstone(column, CellPath.create(key.get(params.options.getProtocolVersion())));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/MultiColumnRelation.java b/src/java/org/apache/cassandra/cql3/MultiColumnRelation.java
index b54bdd0..1bfac3f 100644
--- a/src/java/org/apache/cassandra/cql3/MultiColumnRelation.java
+++ b/src/java/org/apache/cassandra/cql3/MultiColumnRelation.java

@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
@@ -26,6 +27,7 @@
 import org.apache.cassandra.cql3.Term.Raw;
 import org.apache.cassandra.cql3.restrictions.MultiColumnRestriction;
 import org.apache.cassandra.cql3.restrictions.Restriction;
+import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction;
 import org.apache.cassandra.cql3.statements.Bound;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
@@ -114,11 +116,17 @@
      * For non-IN relations, returns the Tuples.Literal or Tuples.Raw marker for a single tuple.
      * @return a Tuples.Literal for non-IN relations or Tuples.Raw marker for a single tuple.
      */
-    private Term.MultiColumnRaw getValue()
+    public Term.MultiColumnRaw getValue()
     {
         return relationType == Operator.IN ? inMarker : valuesOrMarker;
     }
 
+    public List<? extends Term.Raw> getInValues()
+    {
+        assert relationType == Operator.IN;
+        return inValues;
+    }
+
     @Override
     public boolean isMultiColumn()
     {
@@ -131,7 +139,7 @@
     {
         List<ColumnDefinition> receivers = receivers(cfm);
         Term term = toTerm(receivers, getValue(), cfm.ksName, boundNames);
-        return new MultiColumnRestriction.EQ(receivers, term);
+        return new MultiColumnRestriction.EQRestriction(receivers, term);
     }
 
     @Override
@@ -143,9 +151,9 @@
         if (terms == null)
         {
             Term term = toTerm(receivers, getValue(), cfm.ksName, boundNames);
-            return new MultiColumnRestriction.InWithMarker(receivers, (AbstractMarker) term);
+            return new MultiColumnRestriction.InRestrictionWithMarker(receivers, (AbstractMarker) term);
         }
-        return new MultiColumnRestriction.InWithValues(receivers, terms);
+        return new MultiColumnRestriction.InRestrictionWithValues(receivers, terms);
     }
 
     @Override
@@ -155,8 +163,8 @@
                                               boolean inclusive) throws InvalidRequestException
     {
         List<ColumnDefinition> receivers = receivers(cfm);
-        Term term = toTerm(receivers(cfm), getValue(), cfm.ksName, boundNames);
-        return new MultiColumnRestriction.Slice(receivers, bound, inclusive, term);
+        Term term = toTerm(receivers, getValue(), cfm.ksName, boundNames);
+        return new MultiColumnRestriction.SliceRestriction(receivers, bound, inclusive, term);
     }
 
     @Override
@@ -164,7 +172,15 @@
                                                  VariableSpecifications boundNames,
                                                  boolean isKey) throws InvalidRequestException
     {
-        throw invalidRequest("%s cannot be used for Multi-column relations", operator());
+        throw invalidRequest("%s cannot be used for multi-column relations", operator());
+    }
+
+    @Override
+    protected Restriction newIsNotRestriction(CFMetaData cfm,
+                                              VariableSpecifications boundNames) throws InvalidRequestException
+    {
+        // this is currently disallowed by the grammar
+        throw new AssertionError(String.format("%s cannot be used for multi-column relations", operator()));
     }
 
     @Override
@@ -198,6 +214,15 @@
         return names;
     }
 
+    public Relation renameIdentifier(ColumnIdentifier.Raw from, ColumnIdentifier.Raw to)
+    {
+        if (!entities.contains(from))
+            return this;
+
+        List<ColumnIdentifier.Raw> newEntities = entities.stream().map(e -> e.equals(from) ? to : e).collect(Collectors.toList());
+        return new MultiColumnRelation(newEntities, operator(), valuesOrMarker, inValues, inMarker);
+    }
+
     @Override
     public String toString()
     {
@@ -214,4 +239,65 @@
                       .append(valuesOrMarker)
                       .toString();
     }
-}
\ No newline at end of file
+
+    @Override
+    public Relation toSuperColumnAdapter()
+    {
+        return new SuperColumnMultiColumnRelation(entities, relationType, valuesOrMarker, inValues, inMarker);
+    }
+
+    /**
+     * Required for SuperColumn compatibility, in order to map the SuperColumn key restrictions from the regular
+     * column to the collection key one.
+     */
+    private class SuperColumnMultiColumnRelation extends MultiColumnRelation
+    {
+        private SuperColumnMultiColumnRelation(List<ColumnIdentifier.Raw> entities, Operator relationType, MultiColumnRaw valuesOrMarker, List<? extends MultiColumnRaw> inValues, Tuples.INRaw inMarker)
+        {
+            super(entities, relationType, valuesOrMarker, inValues, inMarker);
+        }
+
+        @Override
+        protected Restriction newSliceRestriction(CFMetaData cfm,
+                                                  VariableSpecifications boundNames,
+                                                  Bound bound,
+                                                  boolean inclusive) throws InvalidRequestException
+        {
+            assert cfm.isSuper() && cfm.isDense();
+            List<ColumnDefinition> receivers = receivers(cfm);
+            Term term = toTerm(receivers, getValue(), cfm.ksName, boundNames);
+            return new SingleColumnRestriction.SuperColumnMultiSliceRestriction(receivers.get(0), bound, inclusive, term);
+        }
+
+        @Override
+        protected Restriction newEQRestriction(CFMetaData cfm,
+                                               VariableSpecifications boundNames) throws InvalidRequestException
+        {
+            assert cfm.isSuper() && cfm.isDense();
+            List<ColumnDefinition> receivers = receivers(cfm);
+            Term term = toTerm(receivers, getValue(), cfm.ksName, boundNames);
+            return new SingleColumnRestriction.SuperColumnMultiEQRestriction(receivers.get(0), term);
+        }
+
+        @Override
+        protected List<ColumnDefinition> receivers(CFMetaData cfm) throws InvalidRequestException
+        {
+            assert cfm.isSuper() && cfm.isDense();
+            List<ColumnDefinition> names = new ArrayList<>(getEntities().size());
+
+            for (ColumnIdentifier.Raw raw : getEntities())
+            {
+                ColumnDefinition def = toColumnDefinition(cfm, raw);
+
+                checkTrue(def.isClusteringColumn() ||
+                          cfm.isSuperColumnKeyColumn(def),
+                          "Multi-column relations can only be applied to clustering columns but was applied to: %s", def.name);
+
+                checkFalse(names.contains(def), "Column \"%s\" appeared twice in a relation: %s", def.name, this);
+
+                names.add(def);
+            }
+            return names;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/Operation.java b/src/java/org/apache/cassandra/cql3/Operation.java
index 4701a96..4b8d5ba 100644
--- a/src/java/org/apache/cassandra/cql3/Operation.java
+++ b/src/java/org/apache/cassandra/cql3/Operation.java

@@ -17,13 +17,11 @@
  */
 package org.apache.cassandra.cql3;
 
-import java.nio.ByteBuffer;
-import java.util.Collections;
+import java.util.List;
 
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
@@ -57,9 +55,10 @@
         this.t = t;
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return t != null ? t.getFunctions() : Collections.<Function>emptySet();
+        if (t != null)
+            t.addFunctionsTo(functions);
     }
 
     /**
@@ -86,12 +85,10 @@
     /**
      * Execute the operation.
      *
-     * @param rowKey row key for the update.
-     * @param cf the column family to which to add the updates generated by this operation.
-     * @param prefix the prefix that identify the CQL3 row this operation applies to.
+     * @param partitionKey partition key for the update.
      * @param params parameters of the update.
      */
-    public abstract void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException;
+    public abstract void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException;
 
     /**
      * A parsed raw UPDATE operation.
@@ -195,6 +192,11 @@
             // it's stupid and 2) the result would seem random to the user.
             return false;
         }
+
+        public Term.Raw value()
+        {
+            return value;
+        }
     }
 
     public static class SetElement implements RawUpdate
@@ -244,6 +246,72 @@
         }
     }
 
+    // Currently only used internally counters support in SuperColumn families.
+    // Addition on the element level inside the collections are otherwise not supported in the CQL.
+    public static class ElementAddition implements RawUpdate
+    {
+        private final Term.Raw selector;
+        private final Term.Raw value;
+
+        public ElementAddition(Term.Raw selector, Term.Raw value)
+        {
+            this.selector = selector;
+            this.value = value;
+        }
+
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
+        {
+            assert receiver.type instanceof MapType;
+            Term k = selector.prepare(keyspace, Maps.keySpecOf(receiver));
+            Term v = value.prepare(keyspace, Maps.valueSpecOf(receiver));
+
+            return new Maps.AdderByKey(receiver, v, k);
+        }
+
+        protected String toString(ColumnSpecification column)
+        {
+            return String.format("%s = %s + %s", column.name, column.name, value);
+        }
+
+        public boolean isCompatibleWith(RawUpdate other)
+        {
+            return !(other instanceof SetValue);
+        }
+    }
+
+    // Currently only used internally counters support in SuperColumn families.
+    // Addition on the element level inside the collections are otherwise not supported in the CQL.
+    public static class ElementSubtraction implements RawUpdate
+    {
+        private final Term.Raw selector;
+        private final Term.Raw value;
+
+        public  ElementSubtraction(Term.Raw selector, Term.Raw value)
+        {
+            this.selector = selector;
+            this.value = value;
+        }
+
+        public Operation prepare(String keyspace, ColumnDefinition receiver) throws InvalidRequestException
+        {
+            assert receiver.type instanceof MapType;
+            Term k = selector.prepare(keyspace, Maps.keySpecOf(receiver));
+            Term v = value.prepare(keyspace, Maps.valueSpecOf(receiver));
+
+            return new Maps.SubtracterByKey(receiver, v, k);
+        }
+
+        protected String toString(ColumnSpecification column)
+        {
+            return String.format("%s = %s + %s", column.name, column.name, value);
+        }
+
+        public boolean isCompatibleWith(RawUpdate other)
+        {
+            return !(other instanceof SetValue);
+        }
+    }
+
     public static class Addition implements RawUpdate
     {
         private final Term.Raw value;
@@ -287,6 +355,11 @@
         {
             return !(other instanceof SetValue);
         }
+
+        public Term.Raw value()
+        {
+            return value;
+        }
     }
 
     public static class Substraction implements RawUpdate
@@ -335,6 +408,11 @@
         {
             return !(other instanceof SetValue);
         }
+
+        public Term.Raw value()
+        {
+            return value;
+        }
     }
 
     public static class Prepend implements RawUpdate

diff --git a/src/java/org/apache/cassandra/cql3/Operations.java b/src/java/org/apache/cassandra/cql3/Operations.java
new file mode 100644
index 0000000..a9451d7
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/Operations.java

@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.cassandra.cql3.functions.Function;
+import org.apache.cassandra.cql3.statements.StatementType;
+
+import com.google.common.collect.Iterators;
+
+/**
+ * A set of <code>Operation</code>s.
+ *
+ */
+public final class Operations implements Iterable<Operation>
+{
+    /**
+     * The type of statement.
+     */
+    private final StatementType type;
+
+    /**
+     * The operations on regular columns.
+     */
+    private final List<Operation> regularOperations = new ArrayList<>();
+
+    /**
+     * The operations on static columns.
+     */
+    private final List<Operation> staticOperations = new ArrayList<>();
+
+    public Operations(StatementType type)
+    {
+        this.type = type;
+    }
+
+    /**
+     * Checks if some of the operations apply to static columns.
+     *
+     * @return <code>true</code> if some of the operations apply to static columns, <code>false</code> otherwise.
+     */
+    public boolean appliesToStaticColumns()
+    {
+        return !staticOperations.isEmpty();
+    }
+
+    /**
+     * Checks if some of the operations apply to regular columns.
+     *
+     * @return <code>true</code> if some of the operations apply to regular columns, <code>false</code> otherwise.
+     */
+    public boolean appliesToRegularColumns()
+    {
+     // If we have regular operations, this applies to regular columns.
+        // Otherwise, if the statement is a DELETE and staticOperations is also empty, this means we have no operations,
+        // which for a DELETE means a full row deletion. Which means the operation applies to all columns and regular ones in particular.
+        return !regularOperations.isEmpty() || (type.isDelete() && staticOperations.isEmpty());
+    }
+
+    /**
+     * Returns the operation on regular columns.
+     * @return the operation on regular columns
+     */
+    public List<Operation> regularOperations()
+    {
+        return regularOperations;
+    }
+
+    /**
+     * Returns the operation on static columns.
+     * @return the operation on static columns
+     */
+    public List<Operation> staticOperations()
+    {
+        return staticOperations;
+    }
+
+    /**
+     * Adds the specified <code>Operation</code> to this set of operations.
+     * @param operation the operation to add
+     */
+    public void add(Operation operation)
+    {
+        if (operation.column.isStatic())
+            staticOperations.add(operation);
+        else
+            regularOperations.add(operation);
+    }
+
+    /**
+     * Checks if one of the operations requires a read.
+     *
+     * @return <code>true</code> if one of the operations requires a read, <code>false</code> otherwise.
+     */
+    public boolean requiresRead()
+    {
+        // Lists SET operation incurs a read.
+        for (Operation operation : this)
+            if (operation.requiresRead())
+                return true;
+
+        return false;
+    }
+
+    /**
+     * Checks if this <code>Operations</code> is empty.
+     * @return <code>true</code> if this <code>Operations</code> is empty, <code>false</code> otherwise.
+     */
+    public boolean isEmpty()
+    {
+        return staticOperations.isEmpty() && regularOperations.isEmpty();
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public Iterator<Operation> iterator()
+    {
+        return Iterators.concat(staticOperations.iterator(), regularOperations.iterator());
+    }
+
+    public void addFunctionsTo(List<Function> functions)
+    {
+        regularOperations.forEach(p -> p.addFunctionsTo(functions));
+        staticOperations.forEach(p -> p.addFunctionsTo(functions));
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/Operator.java b/src/java/org/apache/cassandra/cql3/Operator.java
index 86bcbd3..7b28a30 100644
--- a/src/java/org/apache/cassandra/cql3/Operator.java
+++ b/src/java/org/apache/cassandra/cql3/Operator.java

@@ -20,6 +20,15 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.SetType;
 
 public enum Operator
 {
@@ -38,12 +47,6 @@
         {
             return "<";
         }
-
-        @Override
-        public Operator reverse()
-        {
-            return GT;
-        }
     },
     LTE(3)
     {
@@ -52,12 +55,6 @@
         {
             return "<=";
         }
-
-        @Override
-        public Operator reverse()
-        {
-            return GTE;
-        }
     },
     GTE(1)
     {
@@ -66,12 +63,6 @@
         {
             return ">=";
         }
-
-        @Override
-        public Operator reverse()
-        {
-            return LTE;
-        }
     },
     GT(2)
     {
@@ -80,12 +71,6 @@
         {
             return ">";
         }
-
-        @Override
-        public Operator reverse()
-        {
-            return LT;
-        }
     },
     IN(7)
     {
@@ -108,6 +93,14 @@
         {
             return "!=";
         }
+    },
+    IS_NOT(9)
+    {
+        @Override
+        public String toString()
+        {
+            return "IS NOT";
+        }
     };
 
     /**
@@ -135,6 +128,11 @@
         output.writeInt(b);
     }
 
+    public int getValue()
+    {
+        return b;
+    }
+
     /**
      * Deserializes a <code>Operator</code> instance from the specified input.
      *
@@ -152,19 +150,63 @@
           throw new IOException(String.format("Cannot resolve Relation.Type from binary representation: %s", b));
     }
 
+    /**
+     * Whether 2 values satisfy this operator (given the type they should be compared with).
+     *
+     * @throws AssertionError for CONTAINS and CONTAINS_KEY as this doesn't support those operators yet
+     */
+    public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteBuffer rightOperand)
+    {
+        switch (this)
+        {
+            case EQ:
+                return type.compareForCQL(leftOperand, rightOperand) == 0;
+            case LT:
+                return type.compareForCQL(leftOperand, rightOperand) < 0;
+            case LTE:
+                return type.compareForCQL(leftOperand, rightOperand) <= 0;
+            case GT:
+                return type.compareForCQL(leftOperand, rightOperand) > 0;
+            case GTE:
+                return type.compareForCQL(leftOperand, rightOperand) >= 0;
+            case NEQ:
+                return type.compareForCQL(leftOperand, rightOperand) != 0;
+            case IN:
+                List inValues = ((List) ListType.getInstance(type, false).getSerializer().deserialize(rightOperand));
+                return inValues.contains(type.getSerializer().deserialize(leftOperand));
+            case CONTAINS:
+                if (type instanceof ListType)
+                {
+                    List list = (List) type.getSerializer().deserialize(leftOperand);
+                    return list.contains(((ListType) type).getElementsType().getSerializer().deserialize(rightOperand));
+                }
+                else if (type instanceof SetType)
+                {
+                    Set set = (Set) type.getSerializer().deserialize(leftOperand);
+                    return set.contains(((SetType) type).getElementsType().getSerializer().deserialize(rightOperand));
+                }
+                else  // MapType
+                {
+                    Map map = (Map) type.getSerializer().deserialize(leftOperand);
+                    return map.containsValue(((MapType) type).getValuesType().getSerializer().deserialize(rightOperand));
+                }
+            case CONTAINS_KEY:
+                Map map = (Map) type.getSerializer().deserialize(leftOperand);
+                return map.containsKey(((MapType) type).getKeysType().getSerializer().deserialize(rightOperand));
+            default:
+                // we shouldn't get CONTAINS, CONTAINS KEY, or IS NOT here
+                throw new AssertionError();
+        }
+    }
+
+    public int serializedSize()
+    {
+        return 4;
+    }
+
     @Override
     public String toString()
     {
          return this.name();
     }
-
-    /**
-     * Returns the reverse operator if this one.
-     *
-     * @return the reverse operator of this one.
-     */
-    public Operator reverse()
-    {
-        return this;
-    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/QueryOptions.java b/src/java/org/apache/cassandra/cql3/QueryOptions.java
index da705e0..a062567 100644
--- a/src/java/org/apache/cassandra/cql3/QueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/QueryOptions.java

@@ -51,14 +51,9 @@
     // A cache of bind values parsed as JSON, see getJsonColumnValue for details.
     private List<Map<ColumnIdentifier, Term>> jsonValuesCache;
 
-    public static QueryOptions fromProtocolV1(ConsistencyLevel consistency, List<ByteBuffer> values)
+    public static QueryOptions fromThrift(ConsistencyLevel consistency, List<ByteBuffer> values)
     {
-        return new DefaultQueryOptions(consistency, values, false, SpecificOptions.DEFAULT, Server.VERSION_1);
-    }
-
-    public static QueryOptions fromProtocolV2(ConsistencyLevel consistency, List<ByteBuffer> values)
-    {
-        return new DefaultQueryOptions(consistency, values, false, SpecificOptions.DEFAULT, Server.VERSION_2);
+        return new DefaultQueryOptions(consistency, values, false, SpecificOptions.DEFAULT, Server.VERSION_3);
     }
 
     public static QueryOptions forInternalCalls(ConsistencyLevel consistency, List<ByteBuffer> values)
@@ -71,11 +66,6 @@
         return new DefaultQueryOptions(ConsistencyLevel.ONE, values, false, SpecificOptions.DEFAULT, Server.VERSION_3);
     }
 
-    public static QueryOptions fromPreV3Batch(ConsistencyLevel consistency)
-    {
-        return new DefaultQueryOptions(consistency, Collections.<ByteBuffer>emptyList(), false, SpecificOptions.DEFAULT, Server.VERSION_2);
-    }
-
     public static QueryOptions forProtocolVersion(int protocolVersion)
     {
         return new DefaultQueryOptions(null, null, true, null, protocolVersion);
@@ -88,7 +78,7 @@
 
     public static QueryOptions create(ConsistencyLevel consistency, List<ByteBuffer> values, boolean skipMetadata, int pageSize, PagingState pagingState, ConsistencyLevel serialConsistency, int protocolVersion)
     {
-        return new DefaultQueryOptions(consistency, values, skipMetadata, new SpecificOptions(pageSize, pagingState, serialConsistency, -1L), protocolVersion);
+        return new DefaultQueryOptions(consistency, values, skipMetadata, new SpecificOptions(pageSize, pagingState, serialConsistency, Long.MIN_VALUE), protocolVersion);
     }
 
     public static QueryOptions addColumnSpecifications(QueryOptions options, List<ColumnSpecification> columnSpecs)
@@ -414,8 +404,6 @@
 
         public QueryOptions decode(ByteBuf body, int version)
         {
-            assert version >= 2;
-
             ConsistencyLevel consistency = CBUtil.readConsistencyLevel(body);
             EnumSet<Flag> flags = Flag.deserialize((int)body.readByte());
 
@@ -443,7 +431,7 @@
             if (!flags.isEmpty())
             {
                 int pageSize = flags.contains(Flag.PAGE_SIZE) ? body.readInt() : -1;
-                PagingState pagingState = flags.contains(Flag.PAGING_STATE) ? PagingState.deserialize(CBUtil.readValue(body)) : null;
+                PagingState pagingState = flags.contains(Flag.PAGING_STATE) ? PagingState.deserialize(CBUtil.readValue(body), version) : null;
                 ConsistencyLevel serialConsistency = flags.contains(Flag.SERIAL_CONSISTENCY) ? CBUtil.readConsistencyLevel(body) : ConsistencyLevel.SERIAL;
                 long timestamp = Long.MIN_VALUE;
                 if (flags.contains(Flag.TIMESTAMP))
@@ -462,8 +450,6 @@
 
         public void encode(QueryOptions options, ByteBuf dest, int version)
         {
-            assert version >= 2;
-
             CBUtil.writeConsistencyLevel(options.getConsistency(), dest);
 
             EnumSet<Flag> flags = gatherFlags(options);
@@ -474,7 +460,7 @@
             if (flags.contains(Flag.PAGE_SIZE))
                 dest.writeInt(options.getPageSize());
             if (flags.contains(Flag.PAGING_STATE))
-                CBUtil.writeValue(options.getPagingState().serialize(), dest);
+                CBUtil.writeValue(options.getPagingState().serialize(version), dest);
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
                 CBUtil.writeConsistencyLevel(options.getSerialConsistency(), dest);
             if (flags.contains(Flag.TIMESTAMP))
@@ -499,7 +485,7 @@
             if (flags.contains(Flag.PAGE_SIZE))
                 size += 4;
             if (flags.contains(Flag.PAGING_STATE))
-                size += CBUtil.sizeOfValue(options.getPagingState().serialize());
+                size += CBUtil.sizeOfValue(options.getPagingState().serializedSize(version));
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
                 size += CBUtil.sizeOfConsistencyLevel(options.getSerialConsistency());
             if (flags.contains(Flag.TIMESTAMP))

diff --git a/src/java/org/apache/cassandra/cql3/QueryProcessor.java b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
index a0afda7..af751b0 100644
--- a/src/java/org/apache/cassandra/cql3/QueryProcessor.java
+++ b/src/java/org/apache/cassandra/cql3/QueryProcessor.java

@@ -37,27 +37,29 @@
 import com.googlecode.concurrentlinkedhashmap.EvictionListener;
 import org.antlr.runtime.*;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.cql3.functions.Functions;
 import org.apache.cassandra.cql3.statements.*;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionIterators;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.metrics.CQLMetrics;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.service.pager.QueryPager;
-import org.apache.cassandra.service.pager.QueryPagers;
 import org.apache.cassandra.thrift.ThriftClientState;
 import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.*;
 import org.github.jamm.MemoryMeter;
 
 public class QueryProcessor implements QueryHandler
 {
-    public static final CassandraVersion CQL_VERSION = new CassandraVersion("3.3.1");
+    public static final CassandraVersion CQL_VERSION = new CassandraVersion("3.4.0");
 
     public static final QueryProcessor instance = new QueryProcessor();
 
@@ -194,28 +196,6 @@
         }
     }
 
-    public static void validateCellNames(Iterable<CellName> cellNames, CellNameType type) throws InvalidRequestException
-    {
-        for (CellName name : cellNames)
-            validateCellName(name, type);
-    }
-
-    public static void validateCellName(CellName name, CellNameType type) throws InvalidRequestException
-    {
-        validateComposite(name, type);
-        if (name.isEmpty())
-            throw new InvalidRequestException("Invalid empty value for clustering column of COMPACT TABLE");
-    }
-
-    public static void validateComposite(Composite name, CType type) throws InvalidRequestException
-    {
-        long serializedSize = type.serializer().serializedSize(name, TypeSizes.NATIVE);
-        if (serializedSize > Cell.MAX_NAME_LENGTH)
-            throw new InvalidRequestException(String.format("The sum of all clustering columns is too long (%s > %s)",
-                                                            serializedSize,
-                                                            Cell.MAX_NAME_LENGTH));
-    }
-
     public ResultMessage processStatement(CQLStatement statement, QueryState queryState, QueryOptions options)
     throws RequestExecutionException, RequestValidationException
     {
@@ -280,6 +260,11 @@
     @VisibleForTesting
     public static QueryOptions makeInternalOptions(ParsedStatement.Prepared prepared, Object[] values)
     {
+        return makeInternalOptions(prepared, values, ConsistencyLevel.ONE);
+    }
+
+    private static QueryOptions makeInternalOptions(ParsedStatement.Prepared prepared, Object[] values, ConsistencyLevel cl)
+    {
         if (prepared.boundNames.size() != values.length)
             throw new IllegalArgumentException(String.format("Invalid number of values. Expecting %d but got %d", prepared.boundNames.size(), values.length));
 
@@ -290,7 +275,7 @@
             AbstractType type = prepared.boundNames.get(i).type;
             boundValues.add(value instanceof ByteBuffer || value == null ? (ByteBuffer)value : type.decompose(value));
         }
-        return QueryOptions.forInternalCalls(boundValues);
+        return QueryOptions.forInternalCalls(cl, boundValues);
     }
 
     public static ParsedStatement.Prepared prepareInternal(String query) throws RequestValidationException
@@ -316,6 +301,24 @@
             return null;
     }
 
+    public static UntypedResultSet execute(String query, ConsistencyLevel cl, QueryState state, Object... values)
+    throws RequestExecutionException
+    {
+        try
+        {
+            ParsedStatement.Prepared prepared = prepareInternal(query);
+            ResultMessage result = prepared.statement.execute(state, makeInternalOptions(prepared, values, cl));
+            if (result instanceof ResultMessage.Rows)
+                return UntypedResultSet.create(((ResultMessage.Rows)result).result);
+            else
+                return null;
+        }
+        catch (RequestValidationException e)
+        {
+            throw new RuntimeException("Error validating " + query, e);
+        }
+    }
+
     public static UntypedResultSet executeInternalWithPaging(String query, int pageSize, Object... values)
     {
         ParsedStatement.Prepared prepared = prepareInternal(query);
@@ -323,7 +326,7 @@
             throw new IllegalArgumentException("Only SELECTs can be paged");
 
         SelectStatement select = (SelectStatement)prepared.statement;
-        QueryPager pager = QueryPagers.localPager(select.getPageableCommand(makeInternalOptions(prepared, values)));
+        QueryPager pager = select.getQuery(makeInternalOptions(prepared, values), FBUtilities.nowInSeconds()).getPager(null, Server.CURRENT_VERSION);
         return UntypedResultSet.create(select, pager, pageSize);
     }
 
@@ -347,37 +350,29 @@
      * Note that this only make sense for Selects so this only accept SELECT statements and is only useful in rare
      * cases.
      */
-    public static UntypedResultSet executeInternalWithNow(long now, String query, Object... values)
+    public static UntypedResultSet executeInternalWithNow(int nowInSec, String query, Object... values)
     {
-        try
-        {
-            ParsedStatement.Prepared prepared = prepareInternal(query);
-            assert prepared.statement instanceof SelectStatement;
-            SelectStatement select = (SelectStatement)prepared.statement;
-            ResultMessage result = select.executeInternal(internalQueryState(), makeInternalOptions(prepared, values), now);
-            assert result instanceof ResultMessage.Rows;
-            return UntypedResultSet.create(((ResultMessage.Rows)result).result);
-        }
-        catch (RequestExecutionException e)
-        {
-            throw new RuntimeException(e);
-        }
-        catch (RequestValidationException e)
-        {
-            throw new RuntimeException("Error validating query " + query, e);
-        }
+        ParsedStatement.Prepared prepared = prepareInternal(query);
+        assert prepared.statement instanceof SelectStatement;
+        SelectStatement select = (SelectStatement)prepared.statement;
+        ResultMessage result = select.executeInternal(internalQueryState(), makeInternalOptions(prepared, values), nowInSec);
+        assert result instanceof ResultMessage.Rows;
+        return UntypedResultSet.create(((ResultMessage.Rows)result).result);
     }
 
-    public static UntypedResultSet resultify(String query, Row row)
+    public static UntypedResultSet resultify(String query, RowIterator partition)
     {
-        return resultify(query, Collections.singletonList(row));
+        return resultify(query, PartitionIterators.singletonIterator(partition));
     }
 
-    public static UntypedResultSet resultify(String query, List<Row> rows)
+    public static UntypedResultSet resultify(String query, PartitionIterator partitions)
     {
-        SelectStatement ss = (SelectStatement) getStatement(query, null).statement;
-        ResultSet cqlRows = ss.process(rows);
-        return UntypedResultSet.create(cqlRows);
+        try (PartitionIterator iter = partitions)
+        {
+            SelectStatement ss = (SelectStatement) getStatement(query, null).statement;
+            ResultSet cqlRows = ss.process(iter, FBUtilities.nowInSeconds());
+            return UntypedResultSet.create(cqlRows);
+        }
     }
 
     public ResultMessage.Prepared prepare(String query,
@@ -524,31 +519,14 @@
             ((CFStatement)statement).prepareKeyspace(clientState);
 
         Tracing.trace("Preparing statement");
-        return statement.prepare();
+        return statement.prepare(clientState);
     }
 
     public static ParsedStatement parseStatement(String queryStr) throws SyntaxException
     {
         try
         {
-            // Lexer and parser
-            ErrorCollector errorCollector = new ErrorCollector(queryStr);
-            CharStream stream = new ANTLRStringStream(queryStr);
-            CqlLexer lexer = new CqlLexer(stream);
-            lexer.addErrorListener(errorCollector);
-
-            TokenStream tokenStream = new CommonTokenStream(lexer);
-            CqlParser parser = new CqlParser(tokenStream);
-            parser.addErrorListener(errorCollector);
-
-            // Parse the query string to a statement instance
-            ParsedStatement statement = parser.query();
-
-            // The errorCollector has queue up any errors that the lexer and parser may have encountered
-            // along the way, if necessary, we turn the last error into exceptions here.
-            errorCollector.throwFirstSyntaxError();
-
-            return statement;
+            return CQLFragmentParser.parseAnyUnhandled(CqlParser::query, queryStr);
         }
         catch (CassandraException ce)
         {
@@ -637,28 +615,26 @@
 
         public void onCreateFunction(String ksName, String functionName, List<AbstractType<?>> argTypes)
         {
-            if (Functions.getOverloadCount(new FunctionName(ksName, functionName)) > 1)
-            {
-                // in case there are other overloads, we have to remove all overloads since argument type
-                // matching may change (due to type casting)
-                removeAllInvalidPreparedStatementsForFunction(ksName, functionName);
-            }
+            onCreateFunctionInternal(ksName, functionName, argTypes);
         }
 
         public void onCreateAggregate(String ksName, String aggregateName, List<AbstractType<?>> argTypes)
         {
-            if (Functions.getOverloadCount(new FunctionName(ksName, aggregateName)) > 1)
-            {
-                // in case there are other overloads, we have to remove all overloads since argument type
-                // matching may change (due to type casting)
-                removeAllInvalidPreparedStatementsForFunction(ksName, aggregateName);
-            }
+            onCreateFunctionInternal(ksName, aggregateName, argTypes);
         }
 
-        public void onUpdateColumnFamily(String ksName, String cfName, boolean columnsDidChange)
+        private static void onCreateFunctionInternal(String ksName, String functionName, List<AbstractType<?>> argTypes)
+        {
+            // in case there are other overloads, we have to remove all overloads since argument type
+            // matching may change (due to type casting)
+            if (Schema.instance.getKSMetaData(ksName).functions.get(new FunctionName(ksName, functionName)).size() > 1)
+                removeAllInvalidPreparedStatementsForFunction(ksName, functionName);
+        }
+
+        public void onUpdateColumnFamily(String ksName, String cfName, boolean affectsStatements)
         {
             logger.trace("Column definitions for {}.{} changed, invalidating related prepared statements", ksName, cfName);
-            if (columnsDidChange)
+            if (affectsStatements)
                 removeInvalidPreparedStatements(ksName, cfName);
         }
 
@@ -702,7 +678,7 @@
             removeAllInvalidPreparedStatementsForFunction(ksName, aggregateName);
         }
 
-        private void removeAllInvalidPreparedStatementsForFunction(String ksName, String functionName)
+        private static void removeAllInvalidPreparedStatementsForFunction(String ksName, String functionName)
         {
             removeInvalidPreparedStatementsForFunction(internalStatements.values().iterator(), ksName, functionName);
             removeInvalidPreparedStatementsForFunction(preparedStatements.values().iterator(), ksName, functionName);
@@ -713,21 +689,8 @@
                                                                        final String ksName,
                                                                        final String functionName)
         {
-            final Predicate<Function> matchesFunction = new Predicate<Function>()
-            {
-                public boolean apply(Function f)
-                {
-                    return ksName.equals(f.name().keyspace) && functionName.equals(f.name().name);
-                }
-            };
-
-            Iterators.removeIf(statements, new Predicate<ParsedStatement.Prepared>()
-            {
-                public boolean apply(ParsedStatement.Prepared statement)
-                {
-                    return Iterables.any(statement.statement.getFunctions(), matchesFunction);
-                }
-            });
+            Predicate<Function> matchesFunction = f -> ksName.equals(f.name().keyspace) && functionName.equals(f.name().name);
+            Iterators.removeIf(statements, statement -> Iterables.any(statement.statement.getFunctions(), matchesFunction));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/Relation.java b/src/java/org/apache/cassandra/cql3/Relation.java
index 1337096..005d984 100644
--- a/src/java/org/apache/cassandra/cql3/Relation.java
+++ b/src/java/org/apache/cassandra/cql3/Relation.java

@@ -26,6 +26,7 @@
 import org.apache.cassandra.cql3.statements.Bound;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.UnrecognizedEntityException;
+import sun.reflect.generics.reflectiveObjects.NotImplementedException;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -39,6 +40,16 @@
     }
 
     /**
+     * Returns the raw value for this relation, or null if this is an IN relation.
+     */
+    public abstract Term.Raw getValue();
+
+    /**
+     * Returns the list of raw IN values for this relation, or null if this is not an IN relation.
+     */
+    public abstract List<? extends Term.Raw> getInValues();
+
+    /**
      * Checks if this relation apply to multiple columns.
      *
      * @return <code>true</code> if this relation apply to multiple columns, <code>false</code> otherwise.
@@ -132,11 +143,21 @@
             case IN: return newINRestriction(cfm, boundNames);
             case CONTAINS: return newContainsRestriction(cfm, boundNames, false);
             case CONTAINS_KEY: return newContainsRestriction(cfm, boundNames, true);
+            case IS_NOT: return newIsNotRestriction(cfm, boundNames);
             default: throw invalidRequest("Unsupported \"!=\" relation: %s", this);
         }
     }
 
     /**
+     * Required for SuperColumn compatibility, creates an adapter Relation that remaps all restrictions required for
+     * SuperColumn tables.
+     */
+    public Relation toSuperColumnAdapter()
+    {
+        throw invalidRequest("Unsupported operation (" + this + ") on super column family");
+    }
+
+    /**
      * Creates a new EQ restriction instance.
      *
      * @param cfm the Column Family meta data
@@ -186,6 +207,9 @@
                                                           VariableSpecifications boundNames,
                                                           boolean isKey) throws InvalidRequestException;
 
+    protected abstract Restriction newIsNotRestriction(CFMetaData cfm,
+                                                       VariableSpecifications boundNames) throws InvalidRequestException;
+
     /**
      * Converts the specified <code>Raw</code> into a <code>Term</code>.
      * @param receivers the columns to which the values must be associated at
@@ -239,11 +263,20 @@
                                                         ColumnIdentifier.Raw entity) throws InvalidRequestException
     {
         ColumnIdentifier identifier = entity.prepare(cfm);
-        ColumnDefinition def = cfm.getColumnDefinition(identifier);
+        ColumnDefinition def = cfm.getColumnDefinitionForCQL(identifier);
 
         if (def == null)
             throw new UnrecognizedEntityException(identifier, this);
 
         return def;
     }
+
+    /**
+     * Renames an identifier in this Relation, if applicable.
+     * @param from the old identifier
+     * @param to the new identifier
+     * @return this object, if the old identifier is not in the set of entities that this relation covers; otherwise
+     *         a new Relation with "from" replaced by "to" is returned.
+     */
+    public abstract Relation renameIdentifier(ColumnIdentifier.Raw from, ColumnIdentifier.Raw to);
 }

diff --git a/src/java/org/apache/cassandra/cql3/ReservedKeywords.java b/src/java/org/apache/cassandra/cql3/ReservedKeywords.java
new file mode 100644
index 0000000..ee052a7
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/ReservedKeywords.java

@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.util.Set;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
+
+class ReservedKeywords
+{
+    @VisibleForTesting
+    static final String[] reservedKeywords = new String[]
+                                                     {
+                                                     "SELECT",
+                                                     "FROM",
+                                                     "WHERE",
+                                                     "AND",
+                                                     "ENTRIES",
+                                                     "FULL",
+                                                     "INSERT",
+                                                     "UPDATE",
+                                                     "WITH",
+                                                     "LIMIT",
+                                                     "USING",
+                                                     "USE",
+                                                     "SET",
+                                                     "BEGIN",
+                                                     "UNLOGGED",
+                                                     "BATCH",
+                                                     "APPLY",
+                                                     "TRUNCATE",
+                                                     "DELETE",
+                                                     "IN",
+                                                     "CREATE",
+                                                     "KEYSPACE",
+                                                     "SCHEMA",
+                                                     "COLUMNFAMILY",
+                                                     "TABLE",
+                                                     "MATERIALIZED",
+                                                     "VIEW",
+                                                     "INDEX",
+                                                     "ON",
+                                                     "TO",
+                                                     "DROP",
+                                                     "PRIMARY",
+                                                     "INTO",
+                                                     "ALTER",
+                                                     "RENAME",
+                                                     "ADD",
+                                                     "ORDER",
+                                                     "BY",
+                                                     "ASC",
+                                                     "DESC",
+                                                     "ALLOW",
+                                                     "IF",
+                                                     "IS",
+                                                     "GRANT",
+                                                     "OF",
+                                                     "REVOKE",
+                                                     "MODIFY",
+                                                     "AUTHORIZE",
+                                                     "DESCRIBE",
+                                                     "EXECUTE",
+                                                     "NORECURSIVE",
+                                                     "TOKEN",
+                                                     "NULL",
+                                                     "NOT",
+                                                     "NAN",
+                                                     "INFINITY",
+                                                     "OR",
+                                                     "REPLACE" };
+
+    private static final Set<String> reservedSet = ImmutableSet.copyOf(reservedKeywords);
+
+    static boolean isReserved(String text)
+    {
+        return reservedSet.contains(text.toUpperCase());
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/ResultSet.java b/src/java/org/apache/cassandra/cql3/ResultSet.java
index 16f0d1b..57ec796 100644
--- a/src/java/org/apache/cassandra/cql3/ResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/ResultSet.java

@@ -335,7 +335,7 @@
 
                 PagingState state = null;
                 if (flags.contains(Flag.HAS_MORE_PAGES))
-                    state = PagingState.deserialize(CBUtil.readValue(body));
+                    state = PagingState.deserialize(CBUtil.readValue(body), version);
 
                 if (flags.contains(Flag.NO_METADATA))
                     return new ResultMetadata(flags, null, columnCount, state);
@@ -375,7 +375,7 @@
                 dest.writeInt(m.columnCount);
 
                 if (hasMorePages)
-                    CBUtil.writeValue(m.pagingState.serialize(), dest);
+                    CBUtil.writeValue(m.pagingState.serialize(version), dest);
 
                 if (!noMetadata)
                 {
@@ -407,7 +407,7 @@
 
                 int size = 8;
                 if (hasMorePages)
-                    size += CBUtil.sizeOfValue(m.pagingState.serialize());
+                    size += CBUtil.sizeOfValue(m.pagingState.serializedSize(version));
 
                 if (!noMetadata)
                 {

diff --git a/src/java/org/apache/cassandra/cql3/Sets.java b/src/java/org/apache/cassandra/cql3/Sets.java
index 093f1dc..622bb23 100644
--- a/src/java/org/apache/cassandra/cql3/Sets.java
+++ b/src/java/org/apache/cassandra/cql3/Sets.java

@@ -21,23 +21,18 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
-
-import com.google.common.base.Joiner;
+import java.util.stream.Collectors;
 
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.MapType;
-import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * Static helper methods and classes for sets.
@@ -51,7 +46,7 @@
         return new ColumnSpecification(column.ksName, column.cfName, new ColumnIdentifier("value(" + column.name + ")", true), ((SetType)column.type).getElementsType());
     }
 
-    public static class Literal implements Term.Raw
+    public static class Literal extends Term.Raw
     {
         private final List<Term.Raw> elements;
 
@@ -127,10 +122,9 @@
             return AssignmentTestable.TestResult.testAll(keyspace, valueSpec, elements);
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
-            return "{" + Joiner.on(", ").join(elements) + "}";
+            return elements.stream().map(Term.Raw::getText).collect(Collectors.joining(", ", "{", "}"));
         }
     }
 
@@ -216,20 +210,14 @@
                 if (bytes == ByteBufferUtil.UNSET_BYTE_BUFFER)
                     return UNSET_VALUE;
 
-                // We don't support value > 64K because the serialization format encode the length as an unsigned short.
-                if (bytes.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-                    throw new InvalidRequestException(String.format("Set value is too long. Set values are limited to %d bytes but %d bytes value provided",
-                                                                    FBUtilities.MAX_UNSIGNED_SHORT,
-                                                                    bytes.remaining()));
-
                 buffers.add(bytes);
             }
             return new Value(buffers);
         }
 
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Terms.getFunctions(elements);
+            Terms.addFunctions(elements, functions);
         }
     }
 
@@ -259,17 +247,16 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             Term.Terminal value = t.bind(params.options);
-            if (column.type.isMultiCell() && value != UNSET_VALUE)
-            {
-                // delete + add
-                CellName name = cf.getComparator().create(prefix, column);
-                cf.addAtom(params.makeTombstoneForOverwrite(name.slice()));
-            }
-            if (value != UNSET_VALUE)
-                Adder.doAdd(cf, prefix, column, params, value);
+            if (value == UNSET_VALUE)
+                return;
+
+            // delete + add
+            if (column.type.isMultiCell())
+                params.setComplexDeletionTimeForOverwrite(column);
+            Adder.doAdd(value, column, params);
         }
     }
 
@@ -280,15 +267,15 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to add items to a frozen set";
             Term.Terminal value = t.bind(params.options);
             if (value != UNSET_VALUE)
-                doAdd(cf, prefix, column, params, value);
+                doAdd(value, column, params);
         }
 
-        static void doAdd(ColumnFamily cf, Composite prefix, ColumnDefinition column, UpdateParameters params, Term.Terminal value) throws InvalidRequestException
+        static void doAdd(Term.Terminal value, ColumnDefinition column, UpdateParameters params) throws InvalidRequestException
         {
             if (column.type.isMultiCell())
             {
@@ -299,18 +286,17 @@
                 {
                     if (bb == ByteBufferUtil.UNSET_BYTE_BUFFER)
                         continue;
-                    CellName cellName = cf.getComparator().create(prefix, column, bb);
-                    cf.addColumn(params.makeColumn(cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+
+                    params.addCell(column, CellPath.create(bb), ByteBufferUtil.EMPTY_BYTE_BUFFER);
                 }
             }
             else
             {
                 // for frozen sets, we're overwriting the whole cell
-                CellName cellName = cf.getComparator().create(prefix, column);
                 if (value == null)
-                    cf.addAtom(params.makeTombstone(cellName));
+                    params.addTombstone(column);
                 else
-                    cf.addColumn(params.makeColumn(cellName, value.get(Server.CURRENT_VERSION)));
+                    params.addCell(column, value.get(Server.CURRENT_VERSION));
             }
         }
     }
@@ -323,7 +309,7 @@
             super(column, t);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to remove items from a frozen set";
 
@@ -337,7 +323,7 @@
                                       : Collections.singleton(value.get(params.options.getProtocolVersion()));
 
             for (ByteBuffer bb : toDiscard)
-                cf.addColumn(params.makeTombstone(cf.getComparator().create(prefix, column, bb)));
+                params.addTombstone(column, CellPath.create(bb));
         }
     }
 
@@ -348,15 +334,14 @@
             super(column, k);
         }
 
-        public void execute(ByteBuffer rowKey, ColumnFamily cf, Composite prefix, UpdateParameters params) throws InvalidRequestException
+        public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete a single element in a frozen set";
             Term.Terminal elt = t.bind(params.options);
             if (elt == null)
                 throw new InvalidRequestException("Invalid null set element");
 
-            CellName cellName = cf.getComparator().create(prefix, column, elt.get(params.options.getProtocolVersion()));
-            cf.addColumn(params.makeTombstone(cellName));
+            params.addTombstone(column, CellPath.create(elt.get(params.options.getProtocolVersion())));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
index dbae5f0..455ae0c 100644
--- a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
+++ b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java

@@ -40,7 +40,7 @@
  * a value (term). For example, <key> > "start" or "colname1" = "somevalue".
  *
  */
-public final class SingleColumnRelation extends Relation
+public class SingleColumnRelation extends Relation
 {
     private final ColumnIdentifier.Raw entity;
     private final Term.Raw mapKey;
@@ -54,6 +54,9 @@
         this.relationType = type;
         this.value = value;
         this.inValues = inValues;
+
+        if (type == Operator.IS_NOT)
+            assert value == Constants.NULL_LITERAL;
     }
 
     /**
@@ -81,6 +84,16 @@
         this(entity, null, type, value);
     }
 
+    public Term.Raw getValue()
+    {
+        return value;
+    }
+
+    public List<? extends Term.Raw> getInValues()
+    {
+        return inValues;
+    }
+
     public static SingleColumnRelation createInRelation(ColumnIdentifier.Raw entity, List<Term.Raw> inValues)
     {
         return new SingleColumnRelation(entity, null, Operator.IN, null, inValues);
@@ -120,6 +133,13 @@
         }
     }
 
+    public Relation renameIdentifier(ColumnIdentifier.Raw from, ColumnIdentifier.Raw to)
+    {
+        return entity.equals(from)
+               ? new SingleColumnRelation(to, mapKey, operator(), value, inValues)
+               : this;
+    }
+
     @Override
     public String toString()
     {
@@ -140,13 +160,13 @@
         ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
         if (mapKey == null)
         {
-            Term term = toTerm(toReceivers(columnDef), value, cfm.ksName, boundNames);
-            return new SingleColumnRestriction.EQ(columnDef, term);
+            Term term = toTerm(toReceivers(columnDef, cfm.isDense()), value, cfm.ksName, boundNames);
+            return new SingleColumnRestriction.EQRestriction(columnDef, term);
         }
-        List<? extends ColumnSpecification> receivers = toReceivers(columnDef);
+        List<? extends ColumnSpecification> receivers = toReceivers(columnDef, cfm.isDense());
         Term entryKey = toTerm(Collections.singletonList(receivers.get(0)), mapKey, cfm.ksName, boundNames);
         Term entryValue = toTerm(Collections.singletonList(receivers.get(1)), value, cfm.ksName, boundNames);
-        return new SingleColumnRestriction.Contains(columnDef, entryKey, entryValue);
+        return new SingleColumnRestriction.ContainsRestriction(columnDef, entryKey, entryValue);
     }
 
     @Override
@@ -154,14 +174,14 @@
                                            VariableSpecifications boundNames) throws InvalidRequestException
     {
         ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
-        List<? extends ColumnSpecification> receivers = toReceivers(columnDef);
+        List<? extends ColumnSpecification> receivers = toReceivers(columnDef, cfm.isDense());
         List<Term> terms = toTerms(receivers, inValues, cfm.ksName, boundNames);
         if (terms == null)
         {
             Term term = toTerm(receivers, value, cfm.ksName, boundNames);
-            return new SingleColumnRestriction.InWithMarker(columnDef, (Lists.Marker) term);
+            return new SingleColumnRestriction.InRestrictionWithMarker(columnDef, (Lists.Marker) term);
         }
-        return new SingleColumnRestriction.InWithValues(columnDef, terms);
+        return new SingleColumnRestriction.InRestrictionWithValues(columnDef, terms);
     }
 
     @Override
@@ -171,8 +191,8 @@
                                               boolean inclusive) throws InvalidRequestException
     {
         ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
-        Term term = toTerm(toReceivers(columnDef), value, cfm.ksName, boundNames);
-        return new SingleColumnRestriction.Slice(columnDef, bound, inclusive, term);
+        Term term = toTerm(toReceivers(columnDef, cfm.isDense()), value, cfm.ksName, boundNames);
+        return new SingleColumnRestriction.SliceRestriction(columnDef, bound, inclusive, term);
     }
 
     @Override
@@ -181,18 +201,29 @@
                                                  boolean isKey) throws InvalidRequestException
     {
         ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
-        Term term = toTerm(toReceivers(columnDef), value, cfm.ksName, boundNames);
-        return new SingleColumnRestriction.Contains(columnDef, term, isKey);
+        Term term = toTerm(toReceivers(columnDef, cfm.isDense()), value, cfm.ksName, boundNames);
+        return new SingleColumnRestriction.ContainsRestriction(columnDef, term, isKey);
+    }
+
+    @Override
+    protected Restriction newIsNotRestriction(CFMetaData cfm,
+                                              VariableSpecifications boundNames) throws InvalidRequestException
+    {
+        ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
+        // currently enforced by the grammar
+        assert value == Constants.NULL_LITERAL : "Expected null literal for IS NOT relation: " + this.toString();
+        return new SingleColumnRestriction.IsNotNullRestriction(columnDef);
     }
 
     /**
      * Returns the receivers for this relation.
      * @param columnDef the column definition
+     * @param isDense whether the table is a dense one
      *
      * @return the receivers for the specified relation.
      * @throws InvalidRequestException if the relation is invalid
      */
-    private List<? extends ColumnSpecification> toReceivers(ColumnDefinition columnDef) throws InvalidRequestException
+    private List<? extends ColumnSpecification> toReceivers(ColumnDefinition columnDef, boolean isDense) throws InvalidRequestException
     {
         ColumnSpecification receiver = columnDef;
 
@@ -218,6 +249,7 @@
         }
 
         checkFalse(isContainsKey() && !(receiver.type instanceof MapType), "Cannot use CONTAINS KEY on non-map column %s", receiver.name);
+        checkFalse(isContains() && !(receiver.type.isCollection()), "Cannot use CONTAINS on non-collection column %s", receiver.name);
 
         if (mapKey != null)
         {
@@ -271,4 +303,78 @@
     {
         return isEQ() || (isIN() && inValues != null && inValues.size() == 1);
     }
+
+    @Override
+    public Relation toSuperColumnAdapter()
+    {
+        return new SuperColumnSingleColumnRelation(entity, mapKey, relationType, value);
+    }
+
+    /**
+     * Required for SuperColumn compatibility, in order to map the SuperColumn key restrictions from the regular
+     * column to the collection key one.
+     */
+    private class SuperColumnSingleColumnRelation extends SingleColumnRelation
+    {
+        SuperColumnSingleColumnRelation(ColumnIdentifier.Raw entity, Raw mapKey, Operator type, Raw value)
+        {
+            super(entity, mapKey, type, value, inValues);
+        }
+
+        @Override
+        public Restriction newSliceRestriction(CFMetaData cfm,
+                                               VariableSpecifications boundNames,
+                                               Bound bound,
+                                               boolean inclusive) throws InvalidRequestException
+        {
+            ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
+            if (cfm.isSuperColumnKeyColumn(columnDef))
+            {
+                Term term = toTerm(toReceivers(columnDef, cfm.isDense()), value, cfm.ksName, boundNames);
+                return new SingleColumnRestriction.SuperColumnKeySliceRestriction(cfm.superColumnKeyColumn(), bound, inclusive, term);
+            }
+            else
+            {
+                return super.newSliceRestriction(cfm, boundNames, bound, inclusive);
+            }
+        }
+
+        @Override
+        protected Restriction newEQRestriction(CFMetaData cfm,
+                                               VariableSpecifications boundNames) throws InvalidRequestException
+        {
+            ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
+            if (cfm.isSuperColumnKeyColumn(columnDef))
+            {
+                Term term = toTerm(toReceivers(columnDef, cfm.isDense()), value, cfm.ksName, boundNames);
+                return new SingleColumnRestriction.SuperColumnKeyEQRestriction(cfm.superColumnKeyColumn(), term);
+            }
+            else
+            {
+                return super.newEQRestriction(cfm, boundNames);
+            }
+        }
+
+        @Override
+        protected Restriction newINRestriction(CFMetaData cfm,
+                                               VariableSpecifications boundNames) throws InvalidRequestException
+        {
+            ColumnDefinition columnDef = toColumnDefinition(cfm, entity);
+            if (cfm.isSuperColumnKeyColumn(columnDef))
+            {
+                List<? extends ColumnSpecification> receivers = Collections.singletonList(cfm.superColumnKeyColumn());
+                List<Term> terms = toTerms(receivers, inValues, cfm.ksName, boundNames);
+                if (terms == null)
+                {
+                    Term term = toTerm(receivers, value, cfm.ksName, boundNames);
+                    return new SingleColumnRestriction.SuperColumnKeyINRestrictionWithMarkers(cfm.superColumnKeyColumn(), (Lists.Marker) term);
+                }
+                return new SingleColumnRestriction.SuperColumnKeyINRestrictionWithValues(cfm.superColumnKeyColumn(), terms);
+            }
+            else
+            {
+                return super.newINRestriction(cfm, boundNames);
+            }
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/SuperColumnCompatibility.java b/src/java/org/apache/cassandra/cql3/SuperColumnCompatibility.java
new file mode 100644
index 0000000..d4c14df
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/SuperColumnCompatibility.java

@@ -0,0 +1,765 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.restrictions.Restriction;
+import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction;
+import org.apache.cassandra.cql3.restrictions.TermSlice;
+import org.apache.cassandra.cql3.selection.Selection;
+import org.apache.cassandra.cql3.statements.Bound;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.Columns;
+import org.apache.cassandra.db.CompactTables;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+import static org.apache.cassandra.cql3.statements.SelectStatement.getComponents;
+
+/**
+ * Class incapsulating the helper logic to handle SELECT / UPDATE / INSERT special-cases related
+ * to SuperColumn tables in applicable scenarios.
+ *
+ * SuperColumn families have a special layout and are represented as a Map internally. These tables
+ * have two special columns (called `column2` and `value` by default):
+ *
+ *   * `column2`, {@link CFMetaData#superCfValueColumn}, a key of the SuperColumn map, exposed as a
+ *   REGULAR column, but stored in schema tables as a CLUSTERING column to make a distinction from
+ *   the SC value column in case of renames.
+ *   * `value`, {@link CFMetaData#compactValueColumn()}, a value of the SuperColumn map, exposed and
+ *   stored as a REGULAR column
+ *
+ * These columns have to be translated to this internal representation as key and value, correspondingly.
+ *
+ * In CQL terms, the SuperColumn families is encoded with:
+ *
+ *   CREATE TABLE super (
+ *      key [key_validation_class],
+ *      super_column_name [comparator],
+ *      [column_metadata_1] [type1],
+ *      ...,
+ *      [column_metadata_n] [type1],
+ *      "" map<[sub_comparator], [default_validation_class]>
+ *      PRIMARY KEY (key, super_column_name)
+ *   )
+ *
+ * In other words, every super column is encoded by a row. That row has one column for each defined
+ * "column_metadata", but it also has a special map column (whose name is the empty string as this is
+ * guaranteed to never conflict with a user-defined "column_metadata") which stores the super column
+ * "dynamic" sub-columns.
+ *
+ * On write path, `column2` and `value` columns are translated to the key and value of the
+ * underlying map. During the read, the inverse conversion is done. Deletes are converted into
+ * discards by the key in the underlying map. Counters are handled by translating an update to a
+ * counter update with a cell path. See {@link SuperColumnRestrictions} for the details.
+ *
+ * Since non-dense SuperColumn families do not modify the contents of the internal map through in CQL
+ * and do not expose this via CQL either, reads, writes and deletes are handled normally.
+ *
+ * Sidenote: a _dense_ SuperColumn Familiy is the one that has no added REGULAR columns.
+ */
+public class SuperColumnCompatibility
+{
+    // We use an empty value for the 1) this can't conflict with a user-defined column and 2) this actually
+    // validate with any comparator which makes it convenient for columnDefinitionComparator().
+    public static final ByteBuffer SUPER_COLUMN_MAP_COLUMN = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+    public static final String SUPER_COLUMN_MAP_COLUMN_STR = UTF8Type.instance.compose(SUPER_COLUMN_MAP_COLUMN);
+
+    /**
+     * Dense flag might have been incorrectly set if the node was upgraded from 2.x before CASSANDRA-12373.
+     *
+     * For 3.x created tables, the flag is set correctly in ThriftConversion code.
+     */
+    public static boolean recalculateIsDense(Columns columns)
+    {
+        return columns.size() == 1 && columns.getComplex(0).name.toString().isEmpty();
+    }
+
+    /**
+     * For _dense_ SuperColumn Families, the supercolumn key column has to be translated to the collection subselection
+     * query in order to avoid reading an entire collection and then filtering out the results.
+     */
+    public static ColumnFilter getColumnFilter(CFMetaData cfm, QueryOptions queryOptions, SuperColumnRestrictions restrictions)
+    {
+        assert cfm.isSuper() && cfm.isDense();
+
+        ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+        builder.add(cfm.compactValueColumn());
+
+        if (restrictions.keySliceRestriction != null)
+        {
+            SingleColumnRestriction.SuperColumnKeySliceRestriction restriction = restrictions.keySliceRestriction;
+            TermSlice slice = restriction.slice;
+
+            ByteBuffer start = slice.hasBound(Bound.START) ? slice.bound(Bound.START).bindAndGet(queryOptions) : null;
+            ByteBuffer end = slice.hasBound(Bound.END) ? slice.bound(Bound.END).bindAndGet(queryOptions) : null;
+
+            builder.slice(cfm.compactValueColumn(),
+                          start == null ? CellPath.BOTTOM : CellPath.create(start),
+                          end == null ? CellPath.TOP : CellPath.create(end));
+        }
+        else if (restrictions.keyEQRestriction != null)
+        {
+            SingleColumnRestriction.SuperColumnKeyEQRestriction restriction = restrictions.keyEQRestriction;
+            ByteBuffer value = restriction.bindValue(queryOptions);
+            builder.select(cfm.compactValueColumn(), CellPath.create(value));
+        }
+        else if (restrictions.keyINRestriction != null)
+        {
+            SingleColumnRestriction.SuperColumnKeyINRestriction cast = restrictions.keyINRestriction;
+            Set<ByteBuffer> keyINRestrictionValues = new TreeSet<ByteBuffer>(((MapType) cfm.compactValueColumn().type).getKeysType());
+            keyINRestrictionValues.addAll(cast.getValues(queryOptions));
+
+            for (ByteBuffer value : keyINRestrictionValues)
+                builder.select(cfm.compactValueColumn(), CellPath.create(value));
+        }
+        else if (restrictions.multiEQRestriction != null)
+        {
+            SingleColumnRestriction.SuperColumnMultiEQRestriction restriction = restrictions.multiEQRestriction;
+            ByteBuffer value = restriction.secondValue;
+            builder.select(cfm.compactValueColumn(), CellPath.create(value));
+        }
+
+        return builder.build();
+    }
+
+    /**
+     * For _dense_ SuperColumn Families.
+     *
+     * On read path, instead of writing row per map, we have to write a row per key/value pair in map.
+     *
+     * For example:
+     *
+     *   | partition-key | clustering-key | { key1: value1, key2: value2 } |
+     *
+     * Will be translated to:
+     *
+     *   | partition-key | clustering-key | key1 | value1 |
+     *   | partition-key | clustering-key | key2 | value2 |
+     *
+     */
+    public static void processPartition(CFMetaData cfm, Selection selection, RowIterator partition, Selection.ResultSetBuilder result, int protocolVersion,
+                                        SuperColumnRestrictions restrictions, QueryOptions queryOptions)
+    {
+        assert cfm.isDense();
+        ByteBuffer[] keyComponents = getComponents(cfm, partition.partitionKey());
+
+        int nowInSeconds = FBUtilities.nowInSeconds();
+        while (partition.hasNext())
+        {
+            Row row = partition.next();
+
+            ComplexColumnData ccd = row.getComplexColumnData(cfm.compactValueColumn());
+
+            if (ccd == null)
+                continue;
+
+            Iterator<Cell> cellIter = ccd.iterator();
+
+            outer:
+            while (cellIter.hasNext())
+            {
+                Cell cell = cellIter.next();
+                ByteBuffer superColumnKey = cell.path().get(0);
+
+                if (restrictions != null)
+                {
+                    // Slice on SuperColumn key
+                    if (restrictions.keySliceRestriction != null)
+                    {
+                        for (Bound bound : Bound.values())
+                        {
+                            if (restrictions.keySliceRestriction.hasBound(bound) &&
+                                !restrictions.keySliceRestriction.isInclusive(bound))
+                            {
+                                ByteBuffer excludedValue = restrictions.keySliceRestriction.bindValue(queryOptions);
+                                if (excludedValue.equals(superColumnKey))
+                                    continue outer;
+                            }
+                        }
+                    }
+
+                    // Multi-column restriction on clustering+SuperColumn key
+                    if (restrictions.multiSliceRestriction != null &&
+                        cfm.comparator.compare(row.clustering(), new Clustering(restrictions.multiSliceRestriction.firstValue)) == 0)
+                    {
+                        AbstractType t = ((MapType) cfm.compactValueColumn().type).getKeysType();
+                        int cmp = t.compare(superColumnKey, restrictions.multiSliceRestriction.secondValue);
+
+                        if ((cmp == 0 && !restrictions.multiSliceRestriction.trueInclusive) ||     // EQ
+                            (restrictions.multiSliceRestriction.hasBound(Bound.END) && cmp > 0) || // LT
+                            (restrictions.multiSliceRestriction.hasBound(Bound.START) && cmp < 0)) // GT
+                            continue outer;
+                    }
+                }
+
+                result.newRow(protocolVersion);
+
+                for (ColumnDefinition def : selection.getColumns())
+                {
+                    if (cfm.isSuperColumnKeyColumn(def))
+                    {
+                        result.add(superColumnKey);
+                    }
+                    else if (cfm.isSuperColumnValueColumn(def))
+                    {
+                        result.add(cell, nowInSeconds);
+                    }
+                    else
+                    {
+                        switch (def.kind)
+                        {
+                            case PARTITION_KEY:
+                                result.add(keyComponents[def.position()]);
+                                break;
+                            case CLUSTERING:
+                                result.add(row.clustering().get(def.position()));
+                                break;
+                            case REGULAR:
+                            case STATIC:
+                                throw new AssertionError(String.format("Invalid column '%s' found in SuperColumn table", def.name.toString()));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * For _dense_ SuperColumn Families.
+     *
+     * On the write path, we have to do combine the columns into a key/value pair:
+     *
+     * So inserting a row:
+     *
+     *     | partition-key | clustering-key | key1 | value1 |
+     *
+     * Would result into:
+     *
+     *     | partition-key | clustering-key | {key1: value1} |
+     *
+     * or adding / overwriting the value for `key1`.
+     */
+    public static void prepareInsertOperations(CFMetaData cfm,
+                                               List<ColumnIdentifier.Raw> columnNames,
+                                               WhereClause.Builder whereClause,
+                                               List<Term.Raw> columnValues,
+                                               VariableSpecifications boundNames,
+                                               Operations operations)
+    {
+        List<ColumnDefinition> defs = new ArrayList<>(columnNames.size());
+        for (int i = 0; i < columnNames.size(); i++)
+        {
+            ColumnIdentifier id = columnNames.get(i).prepare(cfm);
+            defs.add(cfm.getColumnDefinition(id));
+        }
+
+        prepareInsertOperations(cfm, defs, boundNames, columnValues, whereClause, operations);
+    }
+
+    /**
+     * For _dense_ SuperColumn Families.
+     *
+     * {@link #prepareInsertOperations(CFMetaData, List, VariableSpecifications, List, WhereClause.Builder, Operations)},
+     * but for INSERT JSON queries
+     */
+    public static void prepareInsertJSONOperations(CFMetaData cfm,
+                                                   List<ColumnDefinition> defs,
+                                                   VariableSpecifications boundNames,
+                                                   Json.Prepared prepared,
+                                                   WhereClause.Builder whereClause,
+                                                   Operations operations)
+    {
+        List<Term.Raw> columnValues = new ArrayList<>(defs.size());
+        for (ColumnDefinition def : defs)
+            columnValues.add(prepared.getRawTermForColumn(def));
+
+        prepareInsertOperations(cfm, defs, boundNames, columnValues, whereClause, operations);
+    }
+
+    private static void prepareInsertOperations(CFMetaData cfm,
+                                                List<ColumnDefinition> defs,
+                                                VariableSpecifications boundNames,
+                                                List<Term.Raw> columnValues,
+                                                WhereClause.Builder whereClause,
+                                                Operations operations)
+    {
+        assert cfm.isDense();
+        assert defs.size() == columnValues.size();
+
+        Term.Raw superColumnKey = null;
+        Term.Raw superColumnValue = null;
+
+        for (int i = 0, size = defs.size(); i < size; i++)
+        {
+            ColumnDefinition def = defs.get(i);
+            Term.Raw raw = columnValues.get(i);
+
+            if (cfm.isSuperColumnKeyColumn(def))
+            {
+                superColumnKey = raw;
+                collectMarkerSpecifications(raw, boundNames, def);
+            }
+            else if (cfm.isSuperColumnValueColumn(def))
+            {
+                superColumnValue = raw;
+                collectMarkerSpecifications(raw, boundNames, def);
+            }
+            else if (def.isPrimaryKeyColumn())
+            {
+                whereClause.add(new SingleColumnRelation(new ColumnIdentifier.ColumnIdentifierValue(def.name), Operator.EQ, raw));
+            }
+            else
+            {
+                throw invalidRequest("Invalid column {} in where clause");
+            }
+        }
+
+        checkTrue(superColumnValue != null,
+                  "Column value is mandatory for SuperColumn tables");
+        checkTrue(superColumnKey != null,
+                  "Column key is mandatory for SuperColumn tables");
+
+        Operation operation = new Operation.SetElement(superColumnKey, superColumnValue).prepare(cfm.ksName, cfm.compactValueColumn());
+        operations.add(operation);
+    }
+
+    /**
+     * Collect the marker specifications for the bound columns manually, since the operations on a column are
+     * converted to the operations on the collection element.
+     */
+    private static void collectMarkerSpecifications(Term.Raw raw, VariableSpecifications boundNames, ColumnDefinition def)
+    {
+        if (raw instanceof AbstractMarker.Raw)
+            boundNames.add(((AbstractMarker.Raw) raw).bindIndex(), def);
+    }
+
+    /**
+     * For _dense_ SuperColumn Families.
+     *
+     * During UPDATE operation, the update by clustering (with correponding relation in WHERE clause)
+     * has to be substituted with an update to the map that backs the given SuperColumn.
+     *
+     * For example, an update such as:
+     *
+     *     UPDATE ... SET value = 'value1' WHERE key = 'pk' AND column1 = 'ck' AND column2 = 'mk'
+     *
+     * Will update the value under key 'mk' in the map, backing the SuperColumn, located in the row
+     * with clustering 'ck' in the partition with key 'pk'.
+     */
+    public static WhereClause prepareUpdateOperations(CFMetaData cfm,
+                                                      WhereClause whereClause,
+                                                      List<Pair<ColumnIdentifier.Raw, Operation.RawUpdate>> updates,
+                                                      VariableSpecifications boundNames,
+                                                      Operations operations)
+    {
+        assert cfm.isDense();
+        Term.Raw superColumnKey = null;
+        Term.Raw superColumnValue = null;
+
+        List<Relation> newRelations = new ArrayList<>(whereClause.relations.size());
+        for (int i = 0; i < whereClause.relations.size(); i++)
+        {
+            SingleColumnRelation relation = (SingleColumnRelation) whereClause.relations.get(i);
+            ColumnIdentifier id = relation.getEntity().prepare(cfm);
+            ColumnDefinition def = cfm.getColumnDefinition(id);
+
+            if (cfm.isSuperColumnKeyColumn(def))
+            {
+                superColumnKey = relation.getValue();
+                collectMarkerSpecifications(superColumnKey, boundNames, def);
+            }
+            else
+            {
+                newRelations.add(relation);
+            }
+        }
+
+        checkTrue(superColumnKey != null,
+                  "Column key is mandatory for SuperColumn tables");
+
+        for (Pair<ColumnIdentifier.Raw, Operation.RawUpdate> entry : updates)
+        {
+            ColumnIdentifier id = entry.left.prepare(cfm);
+            ColumnDefinition def = cfm.getColumnDefinition(id);
+
+            if (!cfm.isSuperColumnValueColumn(def))
+                throw invalidRequest("Column `%s` of type `%s` found in SET part", def.name, def.type.asCQL3Type());
+
+            Operation operation;
+
+            if (entry.right instanceof Operation.Addition)
+            {
+                Operation.Addition op = (Operation.Addition) entry.right;
+                superColumnValue = op.value();
+
+                operation = new Operation.ElementAddition(superColumnKey, superColumnValue).prepare(cfm.ksName, cfm.compactValueColumn());
+            }
+            else if (entry.right instanceof Operation.Substraction)
+            {
+                Operation.Substraction op = (Operation.Substraction) entry.right;
+                superColumnValue = op.value();
+
+                operation = new Operation.ElementSubtraction(superColumnKey, superColumnValue).prepare(cfm.ksName, cfm.compactValueColumn());
+            }
+            else if (entry.right instanceof Operation.SetValue)
+            {
+                Operation.SetValue op = (Operation.SetValue) entry.right;
+                superColumnValue = op.value();
+
+                operation = new Operation.SetElement(superColumnKey, superColumnValue).prepare(cfm.ksName, cfm.compactValueColumn());
+            }
+            else
+            {
+                throw invalidRequest("Invalid operation `%s` on column `%s` of type `%s` found in SET part", entry.right, def.name, def.type.asCQL3Type());
+            }
+
+            collectMarkerSpecifications(superColumnValue, boundNames, def);
+            operations.add(operation);
+        }
+
+        checkTrue(superColumnValue != null,
+                  "Column value is mandatory for SuperColumn tables");
+
+        return newRelations.size() != whereClause.relations.size() ? whereClause.copy(newRelations) : whereClause;
+    }
+
+    /**
+     * Rebuilds LWT conditions on SuperColumn _value_ column.
+     *
+     * Conditions have to be changed to correspond the internal representation of SuperColumn value, since it's not
+     * a separate column, but a value in a hidden compact value column.
+     */
+    public static Conditions rebuildLWTColumnConditions(Conditions conditions, CFMetaData cfm, WhereClause whereClause)
+    {
+        if (conditions.isEmpty() || conditions.isIfExists() || conditions.isIfNotExists())
+            return conditions;
+
+        ColumnConditions.Builder builder = ColumnConditions.newBuilder();
+        Collection<ColumnCondition> columnConditions = ((ColumnConditions) conditions).columnConditions();
+
+        Pair<ColumnDefinition, Relation> superColumnKeyRelation = SuperColumnCompatibility.getSuperColumnKeyRelation(whereClause.relations, cfm);
+
+        checkNotNull(superColumnKeyRelation,
+                     "Lightweight transactions on SuperColumn tables are only supported with supplied SuperColumn key");
+
+        for (ColumnCondition columnCondition : columnConditions)
+        {
+            checkTrue(cfm.isSuperColumnValueColumn(columnCondition.column),
+                      "Lightweight transactions are only supported on the value column of SuperColumn tables");
+
+            Term.Raw value = superColumnKeyRelation.right.getValue();
+            Term collectionElemnt = value instanceof AbstractMarker.Raw ?
+                                    new Constants.Marker(((AbstractMarker.Raw) value).bindIndex(),
+                                                         superColumnKeyRelation.left) :
+                                    value.prepare(cfm.ksName, superColumnKeyRelation.left);
+            builder.add(ColumnCondition.condition(cfm.compactValueColumn(),
+                                                  collectionElemnt,
+                                                  columnCondition.value(), columnCondition.operator));
+        }
+
+        return builder.build();
+    }
+
+    /**
+     * Returns a relation on the SuperColumn key
+     */
+    private static Pair<ColumnDefinition, Relation> getSuperColumnKeyRelation(List<Relation> relations, CFMetaData cfm)
+    {
+        for (int i = 0; i < relations.size(); i++)
+        {
+            SingleColumnRelation relation = (SingleColumnRelation) relations.get(i);
+            ColumnIdentifier id = relation.getEntity().prepare(cfm);
+            ColumnDefinition def = cfm.getColumnDefinition(id);
+
+            if (cfm.isSuperColumnKeyColumn(def))
+                return Pair.create(def, relation);
+        }
+        return null;
+    }
+
+    /**
+     * For _dense_ SuperColumn Families.
+     *
+     * Delete, when the "regular" columns are present, have to be translated into
+     * deletion of value in the internal map by key.
+     *
+     * For example, delete such as:
+     *
+     *     DELETE FROM ... WHERE key = 'pk' AND column1 = 'ck' AND column2 = 'mk'
+     *
+     * Will delete a value under 'mk' from the map, located in the row with clustering key 'ck' in the partition
+     * with key 'pk'.
+     */
+    public static WhereClause prepareDeleteOperations(CFMetaData cfm,
+                                                      WhereClause whereClause,
+                                                      VariableSpecifications boundNames,
+                                                      Operations operations)
+    {
+        assert cfm.isDense();
+        List<Relation> newRelations = new ArrayList<>(whereClause.relations.size());
+
+        for (int i = 0; i < whereClause.relations.size(); i++)
+        {
+            Relation orig = whereClause.relations.get(i);
+
+            checkFalse(orig.isMultiColumn(),
+                       "Multi-column relations cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", orig);
+            checkFalse(orig.onToken(),
+                       "Token relations cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", orig);
+
+            SingleColumnRelation relation = (SingleColumnRelation) orig;
+            ColumnIdentifier id = relation.getEntity().prepare(cfm);
+            ColumnDefinition def = cfm.getColumnDefinition(id);
+
+            if (cfm.isSuperColumnKeyColumn(def))
+            {
+                Term.Raw value = relation.getValue();
+
+                if (value instanceof AbstractMarker.Raw)
+                    boundNames.add(((AbstractMarker.Raw) value).bindIndex(), def);
+
+                Operation operation = new Maps.DiscarderByKey(cfm.compactValueColumn(), value.prepare(cfm.ksName, def));
+                operations.add(operation);
+            }
+            else
+            {
+                newRelations.add(relation);
+            }
+        }
+
+        return newRelations.size() != whereClause.relations.size() ? whereClause.copy(newRelations) : whereClause;
+    }
+
+    /**
+     * Create a column name generator for SuperColumns
+     */
+    public static CompactTables.DefaultNames columnNameGenerator(List<ColumnDefinition> partitionKeyColumns,
+                                                                 List<ColumnDefinition> clusteringColumns,
+                                                                 PartitionColumns partitionColumns)
+    {
+        Set<String> names = new HashSet<>();
+        // If the clustering column was renamed, the supercolumn key's default nname still can't be `column1` (SuperColumn
+        // key renames are handled separately by looking up an existing column).
+        names.add("column1");
+        for (ColumnDefinition columnDefinition: partitionKeyColumns)
+            names.add(columnDefinition.name.toString());
+        for (ColumnDefinition columnDefinition: clusteringColumns)
+            names.add(columnDefinition.name.toString());
+        for (ColumnDefinition columnDefinition: partitionColumns)
+            names.add(columnDefinition.name.toString());
+
+        return CompactTables.defaultNameGenerator(names);
+    }
+
+    /**
+     * Find a SuperColumn key column if it's available (for example, when it was renamed) or create one with a default name.
+     */
+    public static ColumnDefinition getSuperCfKeyColumn(CFMetaData cfm, List<ColumnDefinition> clusteringColumns, CompactTables.DefaultNames defaultNames)
+    {
+        assert cfm.isDense();
+
+        MapType mapType = (MapType) cfm.compactValueColumn().type;
+        // Pre CASSANDRA-12373 3.x-created supercolumn family
+        if (clusteringColumns.size() == 1)
+        {
+            // create a new one with a default name
+            ColumnIdentifier identifier = ColumnIdentifier.getInterned(defaultNames.defaultClusteringName(), true);
+            return new ColumnDefinition(cfm.ksName, cfm.cfName, identifier, mapType.getKeysType(), ColumnDefinition.NO_POSITION, ColumnDefinition.Kind.REGULAR);
+        }
+
+        // Upgrade path: table created in 2.x, handle pre-created columns and/or renames.
+        assert clusteringColumns.size() == 2 : clusteringColumns;
+        ColumnDefinition cd = clusteringColumns.get(1);
+
+        assert cd.type.equals(mapType.getKeysType()) : cd.type + " != " + mapType.getKeysType();
+        return new ColumnDefinition(cfm.ksName, cfm.cfName, cd.name, mapType.getKeysType(), ColumnDefinition.NO_POSITION, ColumnDefinition.Kind.REGULAR);
+    }
+
+    /**
+     * Find a SuperColumn value column if it's available (for example, when it was renamed) or create one with a default name.
+     */
+    public static ColumnDefinition getSuperCfValueColumn(CFMetaData cfm, PartitionColumns partitionColumns, ColumnDefinition superCfKeyColumn, CompactTables.DefaultNames defaultNames)
+    {
+        assert cfm.isDense();
+
+        MapType mapType = (MapType) cfm.compactValueColumn().type;
+        for (ColumnDefinition def: partitionColumns.regulars)
+        {
+            if (!def.name.bytes.equals(SUPER_COLUMN_MAP_COLUMN) && def.type.equals(mapType.getValuesType()) && !def.equals(superCfKeyColumn))
+                return def;
+        }
+
+        ColumnIdentifier identifier = ColumnIdentifier.getInterned(defaultNames.defaultCompactValueName(), true);
+        return new ColumnDefinition(cfm.ksName, cfm.cfName, identifier, mapType.getValuesType(), ColumnDefinition.NO_POSITION, ColumnDefinition.Kind.REGULAR);
+    }
+
+    /**
+     * SuperColumn key is stored in {@link CFMetaData#columnMetadata} as a clustering column (to make sure we can make
+     * a distinction between the SuperColumn key and SuperColumn value columns, especially when they have the same type
+     * and were renamed), but exposed as {@link CFMetaData#superCfKeyColumn} as a regular column to be compatible with
+     * the storage engine.
+     *
+     * This remapping is necessary to facilitate the column metadata part.
+     */
+    public static ColumnDefinition getSuperCfSschemaRepresentation(ColumnDefinition superCfKeyColumn)
+    {
+        return new ColumnDefinition(superCfKeyColumn.ksName, superCfKeyColumn.cfName, superCfKeyColumn.name, superCfKeyColumn.type, 1, ColumnDefinition.Kind.CLUSTERING);
+    }
+
+    public static boolean isSuperColumnMapColumn(ColumnDefinition column)
+    {
+        return column.isRegular() && column.name.bytes.equals(SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN);
+    }
+
+    public static ColumnDefinition getCompactValueColumn(PartitionColumns columns)
+    {
+        for (ColumnDefinition column : columns.regulars)
+        {
+            if (isSuperColumnMapColumn(column))
+                return column;
+        }
+        throw new AssertionError("Invalid super column table definition, no 'dynamic' map column");
+    }
+
+    /**
+     * Restrictions are the trickiest part of the SuperColumn integration.
+     * See specific docs on each field. For the purpose of this doc, the "default" column names are used,
+     * `column2` and `value`. Detailed description and semantics of these fields can be found in this class'
+     * header comment.
+     */
+    public static class SuperColumnRestrictions
+    {
+        /**
+         * Restrictions in the form of:
+         *   ... AND (column1, column2) > ('value1', 1)
+         * Multi-column restrictions. `column1` will be handled normally by the clustering bounds,
+         * and `column2` value has to be "saved" and filtered out in `processPartition`, as there's no
+         * direct mapping of multi-column restrictions to clustering + cell path. The first row
+         * is special-cased to make sure the semantics of multi-column restrictions are preserved.
+         */
+        private final SingleColumnRestriction.SuperColumnMultiSliceRestriction multiSliceRestriction;
+
+        /**
+         * Restrictions in the form of:
+         *   ... AND (column1, column2) = ('value1', 1)
+         * Multi-column restriction with EQ does have a direct mapping: `column1` will be handled
+         * normally by the clustering bounds, and the `column2` will be special-cased by the
+         * {@link #getColumnFilter(CFMetaData, QueryOptions, SuperColumnRestrictions)} as a collection path lookup.
+         */
+        private final SingleColumnRestriction.SuperColumnMultiEQRestriction multiEQRestriction;
+
+        /**
+         * Restrictions in the form of:
+         *   ... AND column2 >= 5
+         * For non-filtering cases (when the preceding clustering column and a partition key are
+         * restricted), will be handled in {@link #getColumnFilter(CFMetaData, QueryOptions, SuperColumnRestrictions)}
+         * like an inclusive bounds lookup.
+         *
+         * For the restrictions taking a form of
+         *   ... AND column2 > 5
+         * (non-inclusive ones), the items that match `=` will be filtered out
+         * by {@link #processPartition(CFMetaData, Selection, RowIterator, Selection.ResultSetBuilder, int, SuperColumnRestrictions, QueryOptions)}
+         *
+         * Unfortunately, there are no good ways to do it other than here:
+         * {@link RowFilter} can't be used in this case, since the complex collection cells are not yet rows by that
+         * point.
+         * {@link ColumnFilter} (which is used for inclusive slices) can't be changed to support exclusive slices as it would
+         * require a protocol change in order to add a Kind. So exclusive slices are a combination of inclusive plus
+         * an ad-hoc filter.
+         */
+        private final SingleColumnRestriction.SuperColumnKeySliceRestriction keySliceRestriction;
+
+        /**
+         * Restrictions in the form of:
+         *   ... AND column2 IN (1, 2, 3)
+         * For non-filtering cases (when the preceeding clustering column and a partition key are
+         * restricted), are handled in {@link #getColumnFilter(CFMetaData, QueryOptions, SuperColumnRestrictions)} by
+         * adding multiple collection paths to the {@link ColumnFilter}
+         */
+        private final SingleColumnRestriction.SuperColumnKeyINRestriction keyINRestriction;
+
+        /**
+         * Restrictions in the form of:
+         *   ... AND column2 = 1
+         * For non-filtering cases (when the preceeding clustering column and a partition key are
+         * restricted), will be handled by converting the restriction to the column filter on
+         * the collection key in {@link #getColumnFilter(CFMetaData, QueryOptions, SuperColumnRestrictions)}
+         */
+        private final SingleColumnRestriction.SuperColumnKeyEQRestriction keyEQRestriction;
+
+        public SuperColumnRestrictions(Iterator<Restriction> restrictions)
+        {
+            // In order to keep the fields final, assignments have to be done outside the loop
+            SingleColumnRestriction.SuperColumnMultiSliceRestriction multiSliceRestriction = null;
+            SingleColumnRestriction.SuperColumnKeySliceRestriction keySliceRestriction = null;
+            SingleColumnRestriction.SuperColumnKeyINRestriction keyINRestriction = null;
+            SingleColumnRestriction.SuperColumnMultiEQRestriction multiEQRestriction = null;
+            SingleColumnRestriction.SuperColumnKeyEQRestriction keyEQRestriction = null;
+
+            while (restrictions.hasNext())
+            {
+                Restriction restriction = restrictions.next();
+
+                if (restriction instanceof SingleColumnRestriction.SuperColumnMultiSliceRestriction)
+                    multiSliceRestriction = (SingleColumnRestriction.SuperColumnMultiSliceRestriction) restriction;
+                else if (restriction instanceof SingleColumnRestriction.SuperColumnKeySliceRestriction)
+                    keySliceRestriction = (SingleColumnRestriction.SuperColumnKeySliceRestriction) restriction;
+                else if (restriction instanceof SingleColumnRestriction.SuperColumnKeyINRestriction)
+                    keyINRestriction = (SingleColumnRestriction.SuperColumnKeyINRestriction) restriction;
+                else if (restriction instanceof SingleColumnRestriction.SuperColumnMultiEQRestriction)
+                    multiEQRestriction = (SingleColumnRestriction.SuperColumnMultiEQRestriction) restriction;
+                else if (restriction instanceof SingleColumnRestriction.SuperColumnKeyEQRestriction)
+                    keyEQRestriction = (SingleColumnRestriction.SuperColumnKeyEQRestriction) restriction;
+            }
+
+            this.multiSliceRestriction = multiSliceRestriction;
+            this.keySliceRestriction = keySliceRestriction;
+            this.keyINRestriction = keyINRestriction;
+            this.multiEQRestriction = multiEQRestriction;
+            this.keyEQRestriction = keyEQRestriction;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/Term.java b/src/java/org/apache/cassandra/cql3/Term.java
index 6fa0c76..5ae9c18 100644
--- a/src/java/org/apache/cassandra/cql3/Term.java
+++ b/src/java/org/apache/cassandra/cql3/Term.java

@@ -70,7 +70,7 @@
      */
     public abstract boolean containsBindMarker();
 
-    Iterable<Function> getFunctions();
+    public void addFunctionsTo(List<Function> functions);
 
     /**
      * A parsed, non prepared (thus untyped) term.
@@ -81,7 +81,7 @@
      *   - a function call
      *   - a marker
      */
-    public interface Raw extends AssignmentTestable
+    public abstract class Raw implements AssignmentTestable
     {
         /**
          * This method validates this RawTerm is valid for provided column
@@ -93,12 +93,23 @@
          * case this RawTerm describe a list index or a map key, etc...
          * @return the prepared term.
          */
-        public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException;
+        public abstract Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException;
+
+        /**
+         * @return a String representation of the raw term that can be used when reconstructing a CQL query string.
+         */
+        public abstract String getText();
+
+        @Override
+        public String toString()
+        {
+            return getText();
+        }
     }
 
-    public interface MultiColumnRaw extends Raw
+    public abstract class MultiColumnRaw extends Term.Raw
     {
-        public Term prepare(String keyspace, List<? extends ColumnSpecification> receiver) throws InvalidRequestException;
+        public abstract Term prepare(String keyspace, List<? extends ColumnSpecification> receiver) throws InvalidRequestException;
     }
 
     /**
@@ -120,9 +131,8 @@
         public void collectMarkerSpecification(VariableSpecifications boundNames) {}
         public Terminal bind(QueryOptions options) { return this; }
 
-        public Set<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Collections.emptySet();
         }
 
         // While some NonTerminal may not have bind markers, no Term can be Terminal

diff --git a/src/java/org/apache/cassandra/cql3/Terms.java b/src/java/org/apache/cassandra/cql3/Terms.java
index 0b049b9..7d3948a 100644
--- a/src/java/org/apache/cassandra/cql3/Terms.java
+++ b/src/java/org/apache/cassandra/cql3/Terms.java

@@ -17,29 +17,24 @@
  */
 package org.apache.cassandra.cql3;
 
-import java.util.Collections;
-
-import com.google.common.collect.Iterables;
+import java.nio.ByteBuffer;
+import java.util.List;
 
 import org.apache.cassandra.cql3.functions.Function;
+import org.apache.cassandra.db.marshal.AbstractType;
 
 public class Terms
 {
-
-    private static com.google.common.base.Function<Term, Iterable<Function>> TO_FUNCTION_ITERABLE =
-    new com.google.common.base.Function<Term, Iterable<Function>>()
+    public static void addFunctions(Iterable<Term> terms, List<Function> functions)
     {
-        public Iterable<Function> apply(Term term)
-        {
-            return term.getFunctions();
-        }
-    };
+        if (terms != null)
+            terms.forEach(t -> t.addFunctionsTo(functions));
+    }
 
-    public static Iterable<Function> getFunctions(Iterable<Term> terms)
+    public static ByteBuffer asBytes(String keyspace, String term, AbstractType type)
     {
-        if (terms == null)
-            return Collections.emptySet();
-
-        return Iterables.concat(Iterables.transform(terms, TO_FUNCTION_ITERABLE));
+        ColumnSpecification receiver = new ColumnSpecification(keyspace, "--dummy--", new ColumnIdentifier("(dummy)", true), type);
+        Term.Raw rawTerm = CQLFragmentParser.parseAny(CqlParser::term, term, "CQL term");
+        return rawTerm.prepare(keyspace, receiver).bindAndGet(QueryOptions.DEFAULT);
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/TokenRelation.java b/src/java/org/apache/cassandra/cql3/TokenRelation.java
index 46a812c..2c13b19 100644
--- a/src/java/org/apache/cassandra/cql3/TokenRelation.java
+++ b/src/java/org/apache/cassandra/cql3/TokenRelation.java

@@ -20,6 +20,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import com.google.common.base.Joiner;
 
@@ -30,7 +31,6 @@
 import org.apache.cassandra.cql3.restrictions.TokenRestriction;
 import org.apache.cassandra.cql3.statements.Bound;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.service.StorageService;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkContainsNoDuplicates;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkContainsOnly;
@@ -64,12 +64,22 @@
         return true;
     }
 
+    public Term.Raw getValue()
+    {
+        return value;
+    }
+
+    public List<? extends Term.Raw> getInValues()
+    {
+        return null;
+    }
+
     @Override
     protected Restriction newEQRestriction(CFMetaData cfm, VariableSpecifications boundNames) throws InvalidRequestException
     {
         List<ColumnDefinition> columnDefs = getColumnDefinitions(cfm);
         Term term = toTerm(toReceivers(cfm, columnDefs), value, cfm.ksName, boundNames);
-        return new TokenRestriction.EQ(cfm.getKeyValidatorAsCType(), columnDefs, term);
+        return new TokenRestriction.EQRestriction(cfm, columnDefs, term);
     }
 
     @Override
@@ -86,7 +96,7 @@
     {
         List<ColumnDefinition> columnDefs = getColumnDefinitions(cfm);
         Term term = toTerm(toReceivers(cfm, columnDefs), value, cfm.ksName, boundNames);
-        return new TokenRestriction.Slice(cfm.getKeyValidatorAsCType(), columnDefs, bound, inclusive, term);
+        return new TokenRestriction.SliceRestriction(cfm, columnDefs, bound, inclusive, term);
     }
 
     @Override
@@ -96,6 +106,12 @@
     }
 
     @Override
+    protected Restriction newIsNotRestriction(CFMetaData cfm, VariableSpecifications boundNames) throws InvalidRequestException
+    {
+        throw invalidRequest("%s cannot be used with the token function", operator());
+    }
+
+    @Override
     protected Term toTerm(List<? extends ColumnSpecification> receivers,
                           Raw raw,
                           String keyspace,
@@ -106,6 +122,15 @@
         return term;
     }
 
+    public Relation renameIdentifier(ColumnIdentifier.Raw from, ColumnIdentifier.Raw to)
+    {
+        if (!entities.contains(from))
+            return this;
+
+        List<ColumnIdentifier.Raw> newEntities = entities.stream().map(e -> e.equals(from) ? to : e).collect(Collectors.toList());
+        return new TokenRelation(newEntities, operator(), value);
+    }
+
     @Override
     public String toString()
     {
@@ -159,6 +184,6 @@
         return Collections.singletonList(new ColumnSpecification(firstColumn.ksName,
                                                                  firstColumn.cfName,
                                                                  new ColumnIdentifier("partition key token", true),
-                                                                 StorageService.getPartitioner().getTokenValidator()));
+                                                                 cfm.partitioner.getTokenValidator()));
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/Tuples.java b/src/java/org/apache/cassandra/cql3/Tuples.java
index 89fecd0..c7564d3 100644
--- a/src/java/org/apache/cassandra/cql3/Tuples.java
+++ b/src/java/org/apache/cassandra/cql3/Tuples.java

@@ -21,6 +21,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -46,14 +47,14 @@
         return new ColumnSpecification(column.ksName,
                                        column.cfName,
                                        new ColumnIdentifier(String.format("%s[%d]", column.name, component), true),
-                                       ((TupleType)column.type).type(component));
+                                       (getTupleType(column.type)).type(component));
     }
 
     /**
      * A raw, literal tuple.  When prepared, this will become a Tuples.Value or Tuples.DelayedValue, depending
      * on whether the tuple holds NonTerminals.
      */
-    public static class Literal implements Term.MultiColumnRaw
+    public static class Literal extends Term.MultiColumnRaw
     {
         private final List<Term.Raw> elements;
 
@@ -76,7 +77,7 @@
 
                 values.add(value);
             }
-            DelayedValue value = new DelayedValue((TupleType)receiver.type, values);
+            DelayedValue value = new DelayedValue(getTupleType(receiver.type), values);
             return allTerminal ? value.bind(QueryOptions.DEFAULT) : value;
         }
 
@@ -103,10 +104,10 @@
 
         private void validateAssignableTo(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            if (!(receiver.type instanceof TupleType))
+            if (!checkIfTupleType(receiver.type))
                 throw new InvalidRequestException(String.format("Invalid tuple type literal for %s of type %s", receiver.name, receiver.type.asCQL3Type()));
 
-            TupleType tt = (TupleType)receiver.type;
+            TupleType tt = getTupleType(receiver.type);
             for (int i = 0; i < elements.size(); i++)
             {
                 if (i >= tt.size())
@@ -133,10 +134,9 @@
             }
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
-            return tupleToString(elements);
+            return elements.stream().map(Term.Raw::getText).collect(Collectors.joining(", ", "(", ")"));
         }
     }
 
@@ -199,8 +199,6 @@
 
         private ByteBuffer[] bindInternal(QueryOptions options) throws InvalidRequestException
         {
-            int version = options.getProtocolVersion();
-
             ByteBuffer[] buffers = new ByteBuffer[elements.size()];
             for (int i = 0; i < elements.size(); i++)
             {
@@ -208,10 +206,6 @@
                 // Since A tuple value is always written in its entirety Cassandra can't preserve a pre-existing value by 'not setting' the new value. Reject the query.
                 if (buffers[i] == ByteBufferUtil.UNSET_BYTE_BUFFER)
                     throw new InvalidRequestException(String.format("Invalid unset value for tuple field number %d", i));
-                // Inside tuples, we must force the serialization of collections to v3 whatever protocol
-                // version is in use since we're going to store directly that serialized value.
-                if (version < 3 && type.type(i).isCollection())
-                    buffers[i] = ((CollectionType)type.type(i)).getSerializer().reserializeToV3(buffers[i]);
             }
             return buffers;
         }
@@ -234,9 +228,9 @@
             return tupleToString(elements);
         }
 
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Terms.getFunctions(elements);
+            Terms.addFunctions(elements, functions);
         }
     }
 
@@ -262,7 +256,7 @@
                 List<?> l = type.getSerializer().deserializeForNativeProtocol(value, options.getProtocolVersion());
 
                 assert type.getElementsType() instanceof TupleType;
-                TupleType tupleType = (TupleType) type.getElementsType();
+                TupleType tupleType = Tuples.getTupleType(type.getElementsType());
 
                 // type.split(bytes)
                 List<List<ByteBuffer>> elements = new ArrayList<>(l.size());
@@ -293,7 +287,7 @@
      * For example, "SELECT ... WHERE (col1, col2) > ?".
      * }
      */
-    public static class Raw extends AbstractMarker.Raw implements Term.MultiColumnRaw
+    public static class Raw extends AbstractMarker.MultiColumnRaw
     {
         public Raw(int bindIndex)
         {
@@ -323,18 +317,12 @@
         {
             return new Tuples.Marker(bindIndex, makeReceiver(receivers));
         }
-
-        @Override
-        public AbstractMarker prepare(String keyspace, ColumnSpecification receiver)
-        {
-            throw new AssertionError("Tuples.Raw.prepare() requires a list of receivers");
-        }
     }
 
     /**
      * A raw marker for an IN list of tuples, like "SELECT ... WHERE (a, b, c) IN ?"
      */
-    public static class INRaw extends AbstractMarker.Raw implements MultiColumnRaw
+    public static class INRaw extends AbstractMarker.MultiColumnRaw
     {
         public INRaw(int bindIndex)
         {
@@ -368,12 +356,6 @@
         {
             return new InMarker(bindIndex, makeInReceiver(receivers));
         }
-
-        @Override
-        public AbstractMarker prepare(String keyspace, ColumnSpecification receiver)
-        {
-            throw new AssertionError("Tuples.INRaw.prepare() requires a list of receivers");
-        }
     }
 
     /**
@@ -393,7 +375,7 @@
             ByteBuffer value = options.getValues().get(bindIndex);
             if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
                 throw new InvalidRequestException(String.format("Invalid unset value for tuple %s", receiver.name));
-            return value == null ? null : Value.fromSerialized(value, (TupleType)receiver.type);
+            return value == null ? null : Value.fromSerialized(value, getTupleType(receiver.type));
         }
     }
 
@@ -430,4 +412,16 @@
         sb.append(')');
         return sb.toString();
     }
+
+    public static boolean checkIfTupleType(AbstractType<?> tuple)
+    {
+        return (tuple instanceof TupleType) ||
+               (tuple instanceof ReversedType && ((ReversedType) tuple).baseType instanceof TupleType);
+
+    }
+
+    public static TupleType getTupleType(AbstractType<?> tuple)
+    {
+        return (tuple instanceof ReversedType ? ((TupleType) ((ReversedType) tuple).baseType) : (TupleType)tuple);
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/TypeCast.java b/src/java/org/apache/cassandra/cql3/TypeCast.java
index 561a158..890b34f 100644
--- a/src/java/org/apache/cassandra/cql3/TypeCast.java
+++ b/src/java/org/apache/cassandra/cql3/TypeCast.java

@@ -20,7 +20,7 @@
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
-public class TypeCast implements Term.Raw
+public class TypeCast extends Term.Raw
 {
     private final CQL3Type.Raw type;
     private final Term.Raw term;
@@ -58,8 +58,7 @@
             return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
     }
 
-    @Override
-    public String toString()
+    public String getText()
     {
         return "(" + type + ")" + term;
     }

diff --git a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
index e8d610d..dc4237d 100644
--- a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java

@@ -22,11 +22,21 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import com.google.common.collect.AbstractIterator;
+import com.google.common.annotations.VisibleForTesting;
 
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.utils.AbstractIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.statements.SelectStatement;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.FBUtilities;
 
 /** a utility for doing internal cql-based queries */
 public abstract class UntypedResultSet implements Iterable<UntypedResultSet.Row>
@@ -46,6 +56,20 @@
         return new FromPager(select, pager, pageSize);
     }
 
+    /**
+     * This method is intended for testing purposes, since it executes query on cluster
+     * and not on the local node only.
+     */
+    @VisibleForTesting
+    public static UntypedResultSet create(SelectStatement select,
+                                          ConsistencyLevel cl,
+                                          ClientState clientState,
+                                          QueryPager pager,
+                                          int pageSize)
+    {
+        return new FromDistributedPager(select, cl, clientState, pager, pageSize);
+    }
+
     public boolean isEmpty()
     {
         return size() == 0;
@@ -99,6 +123,71 @@
         }
     }
 
+    /**
+     * Pager that calls `execute` rather than `executeInternal`
+     */
+    private static class FromDistributedPager extends UntypedResultSet
+    {
+        private final SelectStatement select;
+        private final ConsistencyLevel cl;
+        private final ClientState clientState;
+        private final QueryPager pager;
+        private final int pageSize;
+        private final List<ColumnSpecification> metadata;
+
+        private FromDistributedPager(SelectStatement select,
+                                     ConsistencyLevel cl,
+                                     ClientState clientState,
+                                     QueryPager pager, int pageSize)
+        {
+            this.select = select;
+            this.cl = cl;
+            this.clientState = clientState;
+            this.pager = pager;
+            this.pageSize = pageSize;
+            this.metadata = select.getResultMetadata().requestNames();
+        }
+
+        public int size()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Row one()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Iterator<Row> iterator()
+        {
+            return new AbstractIterator<Row>()
+            {
+                private Iterator<List<ByteBuffer>> currentPage;
+
+                protected Row computeNext()
+                {
+                    int nowInSec = FBUtilities.nowInSeconds();
+                    while (currentPage == null || !currentPage.hasNext())
+                    {
+                        if (pager.isExhausted())
+                            return endOfData();
+
+                        try (PartitionIterator iter = pager.fetchPage(pageSize, cl, clientState))
+                        {
+                            currentPage = select.process(iter, nowInSec).rows.iterator();
+                        }
+                    }
+                    return new Row(metadata, currentPage.next());
+                }
+            };
+        }
+
+        public List<ColumnSpecification> metadata()
+        {
+            return metadata;
+        }
+    }
+
     private static class FromResultList extends UntypedResultSet
     {
         private final List<Map<String, ByteBuffer>> cqlRows;
@@ -174,11 +263,16 @@
 
                 protected Row computeNext()
                 {
+                    int nowInSec = FBUtilities.nowInSeconds();
                     while (currentPage == null || !currentPage.hasNext())
                     {
                         if (pager.isExhausted())
                             return endOfData();
-                        currentPage = select.process(pager.fetchPage(pageSize)).rows.iterator();
+
+                        try (ReadOrderGroup orderGroup = pager.startOrderGroup(); PartitionIterator iter = pager.fetchPageInternal(pageSize, orderGroup))
+                        {
+                            currentPage = select.process(iter, nowInSec).rows.iterator();
+                        }
                     }
                     return new Row(metadata, currentPage.next());
                 }
@@ -208,6 +302,37 @@
                 data.put(names.get(i).name.toString(), columns.get(i));
         }
 
+        public static Row fromInternalRow(CFMetaData metadata, DecoratedKey key, org.apache.cassandra.db.rows.Row row)
+        {
+            Map<String, ByteBuffer> data = new HashMap<>();
+
+            ByteBuffer[] keyComponents = SelectStatement.getComponents(metadata, key);
+            for (ColumnDefinition def : metadata.partitionKeyColumns())
+                data.put(def.name.toString(), keyComponents[def.position()]);
+
+            Clustering clustering = row.clustering();
+            for (ColumnDefinition def : metadata.clusteringColumns())
+                data.put(def.name.toString(), clustering.get(def.position()));
+
+            for (ColumnDefinition def : metadata.partitionColumns())
+            {
+                if (def.isSimple())
+                {
+                    Cell cell = row.getCell(def);
+                    if (cell != null)
+                        data.put(def.name.toString(), cell.value());
+                }
+                else
+                {
+                    ComplexColumnData complexData = row.getComplexColumnData(def);
+                    if (complexData != null)
+                        data.put(def.name.toString(), ((CollectionType)def.type).serializeForNativeProtocol(def, complexData.iterator(), Server.VERSION_3));
+                }
+            }
+
+            return new Row(data);
+        }
+
         public boolean has(String column)
         {
             // Note that containsKey won't work because we may have null values
@@ -292,6 +417,34 @@
             return raw == null ? null : MapType.getInstance(keyType, valueType, true).compose(raw);
         }
 
+        public Map<String, String> getTextMap(String column)
+        {
+            return getMap(column, UTF8Type.instance, UTF8Type.instance);
+        }
+
+        public <T> Set<T> getFrozenSet(String column, AbstractType<T> type)
+        {
+            ByteBuffer raw = data.get(column);
+            return raw == null ? null : SetType.getInstance(type, false).compose(raw);
+        }
+
+        public <T> List<T> getFrozenList(String column, AbstractType<T> type)
+        {
+            ByteBuffer raw = data.get(column);
+            return raw == null ? null : ListType.getInstance(type, false).compose(raw);
+        }
+
+        public <K, V> Map<K, V> getFrozenMap(String column, AbstractType<K> keyType, AbstractType<V> valueType)
+        {
+            ByteBuffer raw = data.get(column);
+            return raw == null ? null : MapType.getInstance(keyType, valueType, false).compose(raw);
+        }
+
+        public Map<String, String> getFrozenTextMap(String column)
+        {
+            return getFrozenMap(column, UTF8Type.instance, UTF8Type.instance);
+        }
+
         public List<ColumnSpecification> getColumns()
         {
             return columns;

diff --git a/src/java/org/apache/cassandra/cql3/UpdateParameters.java b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
index 65edef7..7d09506 100644
--- a/src/java/org/apache/cassandra/cql3/UpdateParameters.java
+++ b/src/java/org/apache/cassandra/cql3/UpdateParameters.java

@@ -18,40 +18,61 @@
 package org.apache.cassandra.cql3;
 
 import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
 import java.util.Map;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.ColumnSlice;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.FBUtilities;
 
 /**
- * A simple container that simplify passing parameters for collections methods.
+ * Groups the parameters of an update query, and make building updates easier.
  */
 public class UpdateParameters
 {
     public final CFMetaData metadata;
+    public final PartitionColumns updatedColumns;
     public final QueryOptions options;
-    public final long timestamp;
+
+    private final int nowInSec;
+    private final long timestamp;
     private final int ttl;
-    public final int localDeletionTime;
+
+    private final DeletionTime deletionTime;
 
     // For lists operation that require a read-before-write. Will be null otherwise.
-    private final Map<ByteBuffer, CQL3Row> prefetchedLists;
+    private final Map<DecoratedKey, Partition> prefetchedRows;
 
-    public UpdateParameters(CFMetaData metadata, QueryOptions options, long timestamp, int ttl, Map<ByteBuffer, CQL3Row> prefetchedLists)
+    private Row.Builder staticBuilder;
+    private Row.Builder regularBuilder;
+
+    // The builder currently in use. Will alias either staticBuilder or regularBuilder, which are themselves built lazily.
+    private Row.Builder builder;
+
+    public UpdateParameters(CFMetaData metadata,
+                            PartitionColumns updatedColumns,
+                            QueryOptions options,
+                            long timestamp,
+                            int ttl,
+                            Map<DecoratedKey, Partition> prefetchedRows)
     throws InvalidRequestException
     {
         this.metadata = metadata;
+        this.updatedColumns = updatedColumns;
         this.options = options;
+
+        this.nowInSec = FBUtilities.nowInSeconds();
         this.timestamp = timestamp;
         this.ttl = ttl;
-        this.localDeletionTime = (int)(System.currentTimeMillis() / 1000);
-        this.prefetchedLists = prefetchedLists;
+
+        this.deletionTime = new DeletionTime(timestamp, nowInSec);
+
+        this.prefetchedRows = prefetchedRows;
 
         // We use MIN_VALUE internally to mean the absence of of timestamp (in Selection, in sstable stats, ...), so exclude
         // it to avoid potential confusion.
@@ -59,71 +80,164 @@
             throw new InvalidRequestException(String.format("Out of bound timestamp, must be in [%d, %d]", Long.MIN_VALUE + 1, Long.MAX_VALUE));
     }
 
-    public Cell makeColumn(CellName name, ByteBuffer value) throws InvalidRequestException
+    public void newRow(Clustering clustering) throws InvalidRequestException
     {
-        QueryProcessor.validateCellName(name, metadata.comparator);
-        return AbstractCell.create(name, value, timestamp, ttl, metadata);
+        if (metadata.isDense() && !metadata.isCompound())
+        {
+            // If it's a COMPACT STORAGE table with a single clustering column, the clustering value is
+            // translated in Thrift to the full Thrift column name, and for backward compatibility we
+            // don't want to allow that to be empty (even though this would be fine for the storage engine).
+            assert clustering.size() == 1;
+            ByteBuffer value = clustering.get(0);
+            if (value == null || !value.hasRemaining())
+                throw new InvalidRequestException("Invalid empty or null value for column " + metadata.clusteringColumns().get(0).name);
+        }
+
+        if (clustering == Clustering.STATIC_CLUSTERING)
+        {
+            if (staticBuilder == null)
+                staticBuilder = BTreeRow.unsortedBuilder(nowInSec);
+            builder = staticBuilder;
+        }
+        else
+        {
+            if (regularBuilder == null)
+                regularBuilder = BTreeRow.unsortedBuilder(nowInSec);
+            builder = regularBuilder;
+        }
+
+        builder.newRow(clustering);
     }
 
-     public Cell makeCounter(CellName name, long delta) throws InvalidRequestException
-     {
-         QueryProcessor.validateCellName(name, metadata.comparator);
-         return new BufferCounterUpdateCell(name, delta, FBUtilities.timestampMicros());
-     }
-
-    public Cell makeTombstone(CellName name) throws InvalidRequestException
+    public Clustering currentClustering()
     {
-        QueryProcessor.validateCellName(name, metadata.comparator);
-        return new BufferDeletedCell(name, localDeletionTime, timestamp);
+        return builder.clustering();
     }
 
-    public RangeTombstone makeRangeTombstone(ColumnSlice slice) throws InvalidRequestException
+    public void addPrimaryKeyLivenessInfo()
     {
-        QueryProcessor.validateComposite(slice.start, metadata.comparator);
-        QueryProcessor.validateComposite(slice.finish, metadata.comparator);
-        return new RangeTombstone(slice.start, slice.finish, timestamp, localDeletionTime);
+        builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(metadata, timestamp, ttl, nowInSec));
     }
 
-    public RangeTombstone makeTombstoneForOverwrite(ColumnSlice slice) throws InvalidRequestException
+    public void addRowDeletion()
     {
-        QueryProcessor.validateComposite(slice.start, metadata.comparator);
-        QueryProcessor.validateComposite(slice.finish, metadata.comparator);
-        return new RangeTombstone(slice.start, slice.finish, timestamp - 1, localDeletionTime);
+        // For compact tables, at the exclusion of the static row (of static compact tables), each row ever has a single column,
+        // the "compact" one. As such, deleting the row or deleting that single cell is equivalent. We favor the later however
+        // because that makes it easier when translating back to the old format layout (for thrift and pre-3.0 backward
+        // compatibility) as we don't have to special case for the row deletion. This is also in line with what we used to do pre-3.0.
+        if (metadata.isCompactTable() && builder.clustering() != Clustering.STATIC_CLUSTERING && !metadata.isSuper())
+            addTombstone(metadata.compactValueColumn());
+        else
+            builder.addRowDeletion(Row.Deletion.regular(deletionTime));
+    }
+
+    public void addTombstone(ColumnDefinition column) throws InvalidRequestException
+    {
+        addTombstone(column, null);
+    }
+
+    public void addTombstone(ColumnDefinition column, CellPath path) throws InvalidRequestException
+    {
+        builder.addCell(BufferCell.tombstone(column, timestamp, nowInSec, path));
+    }
+
+    public void addCell(ColumnDefinition column, ByteBuffer value) throws InvalidRequestException
+    {
+        addCell(column, null, value);
+    }
+
+    public void addCell(ColumnDefinition column, CellPath path, ByteBuffer value) throws InvalidRequestException
+    {
+        Cell cell = ttl == LivenessInfo.NO_TTL
+                  ? BufferCell.live(metadata, column, timestamp, value, path)
+                  : BufferCell.expiring(column, timestamp, ttl, nowInSec, value, path);
+        builder.addCell(cell);
+    }
+
+    public void addCounter(ColumnDefinition column, long increment) throws InvalidRequestException
+    {
+        addCounter(column, increment, null);
+    }
+
+    public void addCounter(ColumnDefinition column, long increment, CellPath path) throws InvalidRequestException
+    {
+        assert ttl == LivenessInfo.NO_TTL;
+
+        // Because column is a counter, we need the value to be a CounterContext. However, we're only creating a
+        // "counter update", which is a temporary state until we run into 'CounterMutation.updateWithCurrentValue()'
+        // which does the read-before-write and sets the proper CounterId, clock and updated value.
+        //
+        // We thus create a "fake" local shard here. The clock used doesn't matter as this is just a temporary
+        // state that will be replaced when processing the mutation in CounterMutation, but the reason we use a 'local'
+        // shard is due to the merging rules: if a user includes multiple updates to the same counter in a batch, those
+        // multiple updates will be merged in the PartitionUpdate *before* they even reach CounterMutation. So we need
+        // such update to be added together, and that's what a local shard gives us.
+        //
+        // We set counterid to a special value to differentiate between regular pre-2.0 local shards from pre-2.1 era
+        // and "counter update" temporary state cells. Please see CounterContext.createUpdate() for further details.
+        builder.addCell(BufferCell.live(metadata, column, timestamp, CounterContext.instance().createUpdate(increment), path));
+    }
+
+    public void setComplexDeletionTime(ColumnDefinition column)
+    {
+        builder.addComplexDeletion(column, deletionTime);
+    }
+
+    public void setComplexDeletionTimeForOverwrite(ColumnDefinition column)
+    {
+        builder.addComplexDeletion(column, new DeletionTime(deletionTime.markedForDeleteAt() - 1, deletionTime.localDeletionTime()));
+    }
+
+    public Row buildRow()
+    {
+        Row built = builder.build();
+        builder = null; // Resetting to null just so we quickly bad usage where we forget to call newRow() after that.
+        return built;
+    }
+
+    public DeletionTime deletionTime()
+    {
+        return deletionTime;
+    }
+
+    public RangeTombstone makeRangeTombstone(ClusteringComparator comparator, Clustering clustering)
+    {
+        return makeRangeTombstone(Slice.make(comparator, clustering));
+    }
+
+    public RangeTombstone makeRangeTombstone(Slice slice)
+    {
+        return new RangeTombstone(slice, deletionTime);
     }
 
     /**
-     * Returns the prefetched list with the already performed modifications.
-     * <p>If no modification have yet been performed this method will return the fetched list.
-     * If some modifications (updates or deletions) have already been done the list returned
-     * will be the result of the merge of the fetched list and of the pending mutations.</p>
+     * Returns the prefetched row with the already performed modifications.
+     * <p>If no modification have yet been performed this method will return the fetched row or {@code null} if
+     * the row does not exist. If some modifications (updates or deletions) have already been done the row returned
+     * will be the result of the merge of the fetched row and of the pending mutations.</p>
      *
-     * @param rowKey the row key
-     * @param cql3ColumnName the column name
-     * @param cf the pending modifications
-     * @return the prefetched list with the already performed modifications
+     * @param key the partition key
+     * @param clustering the row clustering
+     * @return the prefetched row with the already performed modifications
      */
-    public List<Cell> getPrefetchedList(ByteBuffer rowKey, ColumnIdentifier cql3ColumnName, ColumnFamily cf)
+    public Row getPrefetchedRow(DecoratedKey key, Clustering clustering)
     {
-        if (prefetchedLists == null)
-            return Collections.emptyList();
+        if (prefetchedRows == null)
+            return null;
 
-        CQL3Row row = prefetchedLists.get(rowKey);
+        Partition partition = prefetchedRows.get(key);
+        Row prefetchedRow = partition == null ? null : partition.searchIterator(ColumnFilter.selection(partition.columns()), false).next(clustering);
 
-        List<Cell> cql3List = row == null ? Collections.<Cell>emptyList() : row.getMultiCellColumn(cql3ColumnName);
+        // We need to apply the pending mutations to return the row in its current state
+        Row pendingMutations = builder.copy().build();
 
-        if (!cf.isEmpty())
-        {
-            ColumnFamily currentCf = cf.cloneMe();
+        if (pendingMutations.isEmpty())
+            return prefetchedRow;
 
-            for (Cell c : cql3List)
-                currentCf.addColumn(c);
+        if (prefetchedRow == null)
+            return pendingMutations;
 
-            CFMetaData cfm = currentCf.metadata();
-            CQL3Row.RowIterator iterator = cfm.comparator.CQL3RowBuilder(cfm, timestamp).group(currentCf.iterator());
-            // We can only update one CQ3Row per partition key at a time (we don't allow IN for clustering key)
-            cql3List = iterator.hasNext() ? iterator.next().getMultiCellColumn(cql3ColumnName) : null;
-        }
-
-        return (cql3List == null) ? Collections.<Cell>emptyList() : cql3List;
+        return Rows.merge(prefetchedRow, pendingMutations, nowInSec)
+                   .purge(DeletionPurger.PURGE_ALL, nowInSec, metadata.enforceStrictLiveness());
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/UserTypes.java b/src/java/org/apache/cassandra/cql3/UserTypes.java
index 6766d07..68a0513 100644
--- a/src/java/org/apache/cassandra/cql3/UserTypes.java
+++ b/src/java/org/apache/cassandra/cql3/UserTypes.java

@@ -21,11 +21,9 @@
 import java.util.*;
 
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.marshal.CollectionType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /**
@@ -44,7 +42,7 @@
                                        ut.fieldType(field));
     }
 
-    public static class Literal implements Term.Raw
+    public static class Literal extends Term.Raw
     {
         public final Map<ColumnIdentifier, Term.Raw> entries;
 
@@ -120,8 +118,7 @@
             }
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
             StringBuilder sb = new StringBuilder();
             sb.append("{");
@@ -129,7 +126,7 @@
             while (iter.hasNext())
             {
                 Map.Entry<ColumnIdentifier, Term.Raw> entry = iter.next();
-                sb.append(entry.getKey()).append(":").append(entry.getValue());
+                sb.append(entry.getKey()).append(": ").append(entry.getValue().getText());
                 if (iter.hasNext())
                     sb.append(", ");
             }
@@ -150,9 +147,9 @@
             this.values = values;
         }
 
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Terms.getFunctions(values);
+            Terms.addFunctions(values, functions);
         }
 
         public boolean containsBindMarker()
@@ -171,8 +168,6 @@
 
         private ByteBuffer[] bindInternal(QueryOptions options) throws InvalidRequestException
         {
-            int version = options.getProtocolVersion();
-
             ByteBuffer[] buffers = new ByteBuffer[values.size()];
             for (int i = 0; i < type.size(); i++)
             {
@@ -180,10 +175,6 @@
                 // Since A UDT value is always written in its entirety Cassandra can't preserve a pre-existing value by 'not setting' the new value. Reject the query.
                 if (buffers[i] == ByteBufferUtil.UNSET_BYTE_BUFFER)
                     throw new InvalidRequestException(String.format("Invalid unset value for field '%s' of user defined type %s", type.fieldNameAsString(i), type.getNameAsString()));
-                // Inside UDT values, we must force the serialization of collections to v3 whatever protocol
-                // version is in use since we're going to store directly that serialized value.
-                if (version < Server.VERSION_3 && type.fieldType(i).isCollection() && buffers[i] != null)
-                    buffers[i] = ((CollectionType)type.fieldType(i)).getSerializer().reserializeToV3(buffers[i]);
             }
             return buffers;
         }

diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
new file mode 100644
index 0000000..c56c8e0
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java

@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.util.List;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.cql3.restrictions.CustomIndexExpression;
+
+public final class WhereClause
+{
+
+    private static final WhereClause EMPTY = new WhereClause(new Builder());
+
+    public final List<Relation> relations;
+    public final List<CustomIndexExpression> expressions;
+
+    private WhereClause(Builder builder)
+    {
+        this(builder.relations.build(), builder.expressions.build());
+    }
+
+    private WhereClause(List<Relation> relations, List<CustomIndexExpression> expressions)
+    {
+        this.relations = relations;
+        this.expressions = expressions;
+    }
+
+    public static WhereClause empty()
+    {
+        return EMPTY;
+    }
+
+    public WhereClause copy(List<Relation> newRelations)
+    {
+        return new WhereClause(newRelations, expressions);
+    }
+
+    public boolean containsCustomExpressions()
+    {
+        return !expressions.isEmpty();
+    }
+
+    public static final class Builder
+    {
+        ImmutableList.Builder<Relation> relations = new ImmutableList.Builder<>();
+        ImmutableList.Builder<CustomIndexExpression> expressions = new ImmutableList.Builder<>();
+
+        public Builder add(Relation relation)
+        {
+            relations.add(relation);
+            return this;
+        }
+
+        public Builder add(CustomIndexExpression expression)
+        {
+            expressions.add(expression);
+            return this;
+        }
+
+        public WhereClause build()
+        {
+            return new WhereClause(this);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/AbstractFunction.java b/src/java/org/apache/cassandra/cql3/functions/AbstractFunction.java
index b77f4d5..0cf11a5 100644
--- a/src/java/org/apache/cassandra/cql3/functions/AbstractFunction.java
+++ b/src/java/org/apache/cassandra/cql3/functions/AbstractFunction.java

@@ -20,7 +20,6 @@
 import java.util.List;
 
 import com.google.common.base.Objects;
-import com.google.common.collect.ImmutableSet;
 
 import org.apache.cassandra.cql3.AssignmentTestable;
 import org.apache.cassandra.cql3.ColumnSpecification;
@@ -69,9 +68,9 @@
             && Objects.equal(this.returnType, that.returnType);
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return ImmutableSet.<Function>of(this);
+        functions.add(this);
     }
 
     public boolean hasReferenceTo(Function function)

diff --git a/src/java/org/apache/cassandra/cql3/functions/AggregateFcts.java b/src/java/org/apache/cassandra/cql3/functions/AggregateFcts.java
index cca6156..441fa58 100644
--- a/src/java/org/apache/cassandra/cql3/functions/AggregateFcts.java
+++ b/src/java/org/apache/cassandra/cql3/functions/AggregateFcts.java

@@ -19,25 +19,71 @@
 
 import java.math.BigDecimal;
 import java.math.BigInteger;
+import java.math.RoundingMode;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ByteType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.db.marshal.DecimalType;
-import org.apache.cassandra.db.marshal.DoubleType;
-import org.apache.cassandra.db.marshal.FloatType;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.db.marshal.ShortType;
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
  * Factory methods for aggregate functions.
  */
 public abstract class AggregateFcts
 {
+    public static Collection<AggregateFunction> all()
+    {
+        Collection<AggregateFunction> functions = new ArrayList<>();
+
+        functions.add(countRowsFunction);
+
+        // sum for primitives
+        functions.add(sumFunctionForByte);
+        functions.add(sumFunctionForShort);
+        functions.add(sumFunctionForInt32);
+        functions.add(sumFunctionForLong);
+        functions.add(sumFunctionForFloat);
+        functions.add(sumFunctionForDouble);
+        functions.add(sumFunctionForDecimal);
+        functions.add(sumFunctionForVarint);
+        functions.add(sumFunctionForCounter);
+
+        // avg for primitives
+        functions.add(avgFunctionForByte);
+        functions.add(avgFunctionForShort);
+        functions.add(avgFunctionForInt32);
+        functions.add(avgFunctionForLong);
+        functions.add(avgFunctionForFloat);
+        functions.add(avgFunctionForDouble);
+        functions.add(avgFunctionForDecimal);
+        functions.add(avgFunctionForVarint);
+        functions.add(avgFunctionForCounter);
+
+        // count, max, and min for all standard types
+        for (CQL3Type type : CQL3Type.Native.values())
+        {
+            if (type != CQL3Type.Native.VARCHAR) // varchar and text both mapping to UTF8Type
+            {
+                functions.add(AggregateFcts.makeCountFunction(type.getType()));
+                if (type != CQL3Type.Native.COUNTER)
+                {
+                    functions.add(AggregateFcts.makeMaxFunction(type.getType()));
+                    functions.add(AggregateFcts.makeMinFunction(type.getType()));
+                }
+                else
+                {
+                    functions.add(AggregateFcts.maxFunctionForCounter);
+                    functions.add(AggregateFcts.minFunctionForCounter);
+                }
+            }
+        }
+
+        return functions;
+    }
+
     /**
      * Checks if the specified function is the count rows (e.g. COUNT(*) or COUNT(1)) function.
      *
@@ -69,7 +115,7 @@
 
                         public ByteBuffer compute(int protocolVersion)
                         {
-                            return ((LongType) returnType()).decompose(Long.valueOf(count));
+                            return LongType.instance.decompose(count);
                         }
 
                         public void addInput(int protocolVersion, List<ByteBuffer> values)
@@ -110,7 +156,7 @@
                             if (value == null)
                                 return;
 
-                            BigDecimal number = ((BigDecimal) argTypes().get(0).compose(value));
+                            BigDecimal number = DecimalType.instance.compose(value);
                             sum = sum.add(number);
                         }
                     };
@@ -127,22 +173,19 @@
                 {
                     return new Aggregate()
                     {
-                        private BigDecimal sum = BigDecimal.ZERO;
+                        private BigDecimal avg = BigDecimal.ZERO;
 
                         private int count;
 
                         public void reset()
                         {
                             count = 0;
-                            sum = BigDecimal.ZERO;
+                            avg = BigDecimal.ZERO;
                         }
 
                         public ByteBuffer compute(int protocolVersion)
                         {
-                            if (count == 0)
-                                return ((DecimalType) returnType()).decompose(BigDecimal.ZERO);
-
-                            return ((DecimalType) returnType()).decompose(sum.divide(BigDecimal.valueOf(count)));
+                            return DecimalType.instance.decompose(avg);
                         }
 
                         public void addInput(int protocolVersion, List<ByteBuffer> values)
@@ -153,13 +196,16 @@
                                 return;
 
                             count++;
-                            BigDecimal number = ((BigDecimal) argTypes().get(0).compose(value));
-                            sum = sum.add(number);
+                            BigDecimal number = DecimalType.instance.compose(value);
+
+                            // avg = avg + (value - sum) / count.
+                            avg = avg.add(number.subtract(avg).divide(BigDecimal.valueOf(count), RoundingMode.HALF_EVEN));
                         }
                     };
                 }
             };
 
+
     /**
      * The SUM function for varint values.
      */
@@ -189,7 +235,7 @@
                             if (value == null)
                                 return;
 
-                            BigInteger number = ((BigInteger) argTypes().get(0).compose(value));
+                            BigInteger number = IntegerType.instance.compose(value);
                             sum = sum.add(number);
                         }
                     };
@@ -219,9 +265,9 @@
                         public ByteBuffer compute(int protocolVersion)
                         {
                             if (count == 0)
-                                return ((IntegerType) returnType()).decompose(BigInteger.ZERO);
+                                return IntegerType.instance.decompose(BigInteger.ZERO);
 
-                            return ((IntegerType) returnType()).decompose(sum.divide(BigInteger.valueOf(count)));
+                            return IntegerType.instance.decompose(sum.divide(BigInteger.valueOf(count)));
                         }
 
                         public void addInput(int protocolVersion, List<ByteBuffer> values)
@@ -283,35 +329,11 @@
             {
                 public Aggregate newAggregate()
                 {
-                    return new Aggregate()
+                    return new AvgAggregate(ByteType.instance)
                     {
-                        private byte sum;
-
-                        private int count;
-
-                        public void reset()
+                        public ByteBuffer compute(int protocolVersion) throws InvalidRequestException
                         {
-                            count = 0;
-                            sum = 0;
-                        }
-
-                        public ByteBuffer compute(int protocolVersion)
-                        {
-                            int avg = count == 0 ? 0 : sum / count;
-
-                            return ((ByteType) returnType()).decompose((byte) avg);
-                        }
-
-                        public void addInput(int protocolVersion, List<ByteBuffer> values)
-                        {
-                            ByteBuffer value = values.get(0);
-
-                            if (value == null)
-                                return;
-
-                            count++;
-                            Number number = ((Number) argTypes().get(0).compose(value));
-                            sum += number.byteValue();
+                            return ByteType.instance.decompose((byte) computeInternal());
                         }
                     };
                 }
@@ -361,35 +383,11 @@
             {
                 public Aggregate newAggregate()
                 {
-                    return new Aggregate()
+                    return new AvgAggregate(ShortType.instance)
                     {
-                        private short sum;
-
-                        private int count;
-
-                        public void reset()
-                        {
-                            count = 0;
-                            sum = 0;
-                        }
-
                         public ByteBuffer compute(int protocolVersion)
                         {
-                            int avg = count == 0 ? 0 : sum / count;
-
-                            return ((ShortType) returnType()).decompose((short) avg);
-                        }
-
-                        public void addInput(int protocolVersion, List<ByteBuffer> values)
-                        {
-                            ByteBuffer value = values.get(0);
-
-                            if (value == null)
-                                return;
-
-                            count++;
-                            Number number = ((Number) argTypes().get(0).compose(value));
-                            sum += number.shortValue();
+                            return ShortType.instance.decompose((short) computeInternal());
                         }
                     };
                 }
@@ -439,35 +437,11 @@
             {
                 public Aggregate newAggregate()
                 {
-                    return new Aggregate()
+                    return new AvgAggregate(Int32Type.instance)
                     {
-                        private int sum;
-
-                        private int count;
-
-                        public void reset()
-                        {
-                            count = 0;
-                            sum = 0;
-                        }
-
                         public ByteBuffer compute(int protocolVersion)
                         {
-                            int avg = count == 0 ? 0 : sum / count;
-
-                            return ((Int32Type) returnType()).decompose(avg);
-                        }
-
-                        public void addInput(int protocolVersion, List<ByteBuffer> values)
-                        {
-                            ByteBuffer value = values.get(0);
-
-                            if (value == null)
-                                return;
-
-                            count++;
-                            Number number = ((Number) argTypes().get(0).compose(value));
-                            sum += number.intValue();
+                            return Int32Type.instance.decompose((int) computeInternal());
                         }
                     };
                 }
@@ -493,7 +467,13 @@
             {
                 public Aggregate newAggregate()
                 {
-                    return new LongAvgAggregate();
+                    return new AvgAggregate(LongType.instance)
+                    {
+                        public ByteBuffer compute(int protocolVersion)
+                        {
+                            return LongType.instance.decompose(computeInternal());
+                        }
+                    };
                 }
             };
 
@@ -541,35 +521,11 @@
             {
                 public Aggregate newAggregate()
                 {
-                    return new Aggregate()
+                    return new FloatAvgAggregate(FloatType.instance)
                     {
-                        private float sum;
-
-                        private int count;
-
-                        public void reset()
+                        public ByteBuffer compute(int protocolVersion) throws InvalidRequestException
                         {
-                            count = 0;
-                            sum = 0;
-                        }
-
-                        public ByteBuffer compute(int protocolVersion)
-                        {
-                            float avg = count == 0 ? 0 : sum / count;
-
-                            return ((FloatType) returnType()).decompose(avg);
-                        }
-
-                        public void addInput(int protocolVersion, List<ByteBuffer> values)
-                        {
-                            ByteBuffer value = values.get(0);
-
-                            if (value == null)
-                                return;
-
-                            count++;
-                            Number number = ((Number) argTypes().get(0).compose(value));
-                            sum += number.floatValue();
+                            return FloatType.instance.decompose((float) computeInternal());
                         }
                     };
                 }
@@ -610,6 +566,95 @@
                     };
                 }
             };
+    /**
+     * Average aggregate for floating point umbers, using double arithmetics and Kahan's algorithm
+     * to calculate sum by default, switching to BigDecimal on sum overflow. Resulting number is
+     * converted to corresponding representation by concrete implementations.
+     */
+    private static abstract class FloatAvgAggregate implements AggregateFunction.Aggregate
+    {
+        private double sum;
+        private double compensation;
+        private double simpleSum;
+
+        private int count;
+
+        private BigDecimal bigSum = null;
+        private boolean overflow = false;
+
+        private final AbstractType numberType;
+
+        public FloatAvgAggregate(AbstractType numberType)
+        {
+            this.numberType = numberType;
+        }
+
+        public void reset()
+        {
+            sum = 0;
+            compensation = 0;
+            simpleSum = 0;
+
+            count = 0;
+            bigSum = null;
+            overflow = false;
+        }
+
+        public double computeInternal()
+        {
+            if (count == 0)
+                return 0d;
+
+            if (overflow)
+            {
+                return bigSum.divide(BigDecimal.valueOf(count), RoundingMode.HALF_EVEN).doubleValue();
+            }
+            else
+            {
+                // correctly compute final sum if it's NaN from consequently
+                // adding same-signed infinite values.
+                double tmp = sum + compensation;
+                if (Double.isNaN(tmp) && Double.isInfinite(simpleSum))
+                    sum = simpleSum;
+                else
+                    sum = tmp;
+
+                return sum / count;
+            }
+        }
+
+        public void addInput(int protocolVersion, List<ByteBuffer> values)
+        {
+            ByteBuffer value = values.get(0);
+
+            if (value == null)
+                return;
+
+            count++;
+
+            double number = ((Number) numberType.compose(value)).doubleValue();
+
+            if (overflow)
+            {
+                bigSum = bigSum.add(BigDecimal.valueOf(number));
+            }
+            else
+            {
+                simpleSum += number;
+                double prev = sum;
+                double tmp = number - compensation;
+                double rounded = sum + tmp;
+                compensation = (rounded - sum) - tmp;
+                sum = rounded;
+
+                if (Double.isInfinite(sum) && !Double.isInfinite(number))
+                {
+                    overflow = true;
+                    bigSum = BigDecimal.valueOf(prev).add(BigDecimal.valueOf(number));
+                }
+            }
+        }
+    }
 
     /**
      * AVG function for double values.
@@ -619,35 +664,11 @@
             {
                 public Aggregate newAggregate()
                 {
-                    return new Aggregate()
+                    return new FloatAvgAggregate(DoubleType.instance)
                     {
-                        private double sum;
-
-                        private int count;
-
-                        public void reset()
+                        public ByteBuffer compute(int protocolVersion) throws InvalidRequestException
                         {
-                            count = 0;
-                            sum = 0;
-                        }
-
-                        public ByteBuffer compute(int protocolVersion)
-                        {
-                            double avg = count == 0 ? 0 : sum / count;
-
-                            return ((DoubleType) returnType()).decompose(avg);
-                        }
-
-                        public void addInput(int protocolVersion, List<ByteBuffer> values)
-                        {
-                            ByteBuffer value = values.get(0);
-
-                            if (value == null)
-                                return;
-
-                            count++;
-                            Number number = ((Number) argTypes().get(0).compose(value));
-                            sum += number.doubleValue();
+                            return DoubleType.instance.decompose(computeInternal());
                         }
                     };
                 }
@@ -657,25 +678,107 @@
      * The SUM function for counter column values.
      */
     public static final AggregateFunction sumFunctionForCounter =
-            new NativeAggregateFunction("sum", CounterColumnType.instance, CounterColumnType.instance)
-            {
-                public Aggregate newAggregate()
-                {
-                    return new LongSumAggregate();
-                }
-            };
+    new NativeAggregateFunction("sum", CounterColumnType.instance, CounterColumnType.instance)
+    {
+        public Aggregate newAggregate()
+        {
+            return new LongSumAggregate();
+        }
+    };
 
     /**
      * AVG function for counter column values.
      */
     public static final AggregateFunction avgFunctionForCounter =
-            new NativeAggregateFunction("avg", CounterColumnType.instance, CounterColumnType.instance)
+    new NativeAggregateFunction("avg", CounterColumnType.instance, CounterColumnType.instance)
+    {
+        public Aggregate newAggregate()
+        {
+            return new AvgAggregate(LongType.instance)
             {
-                public Aggregate newAggregate()
+                public ByteBuffer compute(int protocolVersion) throws InvalidRequestException
                 {
-                    return new LongAvgAggregate();
+                    return CounterColumnType.instance.decompose(computeInternal());
                 }
             };
+        }
+    };
+
+    /**
+     * The MIN function for counter column values.
+     */
+    public static final AggregateFunction minFunctionForCounter =
+    new NativeAggregateFunction("min", CounterColumnType.instance, CounterColumnType.instance)
+    {
+        public Aggregate newAggregate()
+        {
+            return new Aggregate()
+            {
+                private Long min;
+
+                public void reset()
+                {
+                    min = null;
+                }
+
+                public ByteBuffer compute(int protocolVersion)
+                {
+                    return min != null ? LongType.instance.decompose(min) : null;
+                }
+
+                public void addInput(int protocolVersion, List<ByteBuffer> values)
+                {
+                    ByteBuffer value = values.get(0);
+
+                    if (value == null)
+                        return;
+
+                    long lval = LongType.instance.compose(value);
+
+                    if (min == null || lval < min)
+                        min = lval;
+                }
+            };
+        }
+    };
+
+    /**
+     * MAX function for counter column values.
+     */
+    public static final AggregateFunction maxFunctionForCounter =
+    new NativeAggregateFunction("max", CounterColumnType.instance, CounterColumnType.instance)
+    {
+        public Aggregate newAggregate()
+        {
+            return new Aggregate()
+            {
+                private Long max;
+
+                public void reset()
+                {
+                    max = null;
+                }
+
+                public ByteBuffer compute(int protocolVersion)
+                {
+                    return max != null ? LongType.instance.decompose(max) : null;
+                }
+
+                public void addInput(int protocolVersion, List<ByteBuffer> values)
+                {
+                    ByteBuffer value = values.get(0);
+
+                    if (value == null)
+                        return;
+
+                    long lval = LongType.instance.compose(value);
+
+                    if (max == null || lval > max)
+                        max = lval;
+                }
+            };
+        }
+    };
 
     /**
      * Creates a MAX function for the specified type.
@@ -825,23 +928,43 @@
         }
     }
 
-    private static class LongAvgAggregate implements AggregateFunction.Aggregate
+    /**
+     * Average aggregate class, collecting the sum using long arithmetics, falling back
+     * to BigInteger on long overflow. Resulting number is converted to corresponding
+     * representation by concrete implementations.
+     */
+    private static abstract class AvgAggregate implements AggregateFunction.Aggregate
     {
         private long sum;
-
         private int count;
+        private BigInteger bigSum = null;
+        private boolean overflow = false;
+
+        private final AbstractType numberType;
+
+        public AvgAggregate(AbstractType type)
+        {
+            this.numberType = type;
+        }
 
         public void reset()
         {
             count = 0;
-            sum = 0;
+            sum = 0L;
+            overflow = false;
+            bigSum = null;
         }
 
-        public ByteBuffer compute(int protocolVersion)
+        long computeInternal()
         {
-            long avg = count == 0 ? 0 : sum / count;
-
-            return LongType.instance.decompose(avg);
+            if (overflow)
+            {
+                return bigSum.divide(BigInteger.valueOf(count)).longValue();
+            }
+            else
+            {
+                return count == 0 ? 0 : (sum / count);
+            }
         }
 
         public void addInput(int protocolVersion, List<ByteBuffer> values)
@@ -852,8 +975,22 @@
                 return;
 
             count++;
-            Number number = LongType.instance.compose(value);
-            sum += number.longValue();
+            long number = ((Number) numberType.compose(value)).longValue();
+            if (overflow)
+            {
+                bigSum = bigSum.add(BigInteger.valueOf(number));
+            }
+            else
+            {
+                long prev = sum;
+                sum += number;
+
+                if (((prev ^ sum) & (number ^ sum)) < 0)
+                {
+                    overflow = true;
+                    bigSum = BigInteger.valueOf(prev).add(BigInteger.valueOf(number));
+                }
+            }
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/functions/BytesConversionFcts.java b/src/java/org/apache/cassandra/cql3/functions/BytesConversionFcts.java
index ddb33fc..d9c6a52 100644
--- a/src/java/org/apache/cassandra/cql3/functions/BytesConversionFcts.java
+++ b/src/java/org/apache/cassandra/cql3/functions/BytesConversionFcts.java

@@ -18,8 +18,11 @@
 package org.apache.cassandra.cql3.functions;
 
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
+import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.UTF8Type;
@@ -29,6 +32,27 @@
 
 public abstract class BytesConversionFcts
 {
+    public static Collection<Function> all()
+    {
+        Collection<Function> functions = new ArrayList<>();
+
+        // because text and varchar ends up being synonymous, our automatic makeToBlobFunction doesn't work
+        // for varchar, so we special case it below. We also skip blob for obvious reasons.
+        for (CQL3Type type : CQL3Type.Native.values())
+        {
+            if (type != CQL3Type.Native.VARCHAR && type != CQL3Type.Native.BLOB)
+            {
+                functions.add(makeToBlobFunction(type.getType()));
+                functions.add(makeFromBlobFunction(type.getType()));
+            }
+        }
+
+        functions.add(VarcharAsBlobFct);
+        functions.add(BlobAsVarcharFct);
+
+        return functions;
+    }
+
     // Most of the XAsBlob and blobAsX functions are basically no-op since everything is
     // bytes internally. They only "trick" the type system.
     public static Function makeToBlobFunction(AbstractType<?> fromType)
@@ -74,7 +98,7 @@
         }
     };
 
-    public static final Function BlobAsVarcharFact = new NativeScalarFunction("blobasvarchar", UTF8Type.instance, BytesType.instance)
+    public static final Function BlobAsVarcharFct = new NativeScalarFunction("blobasvarchar", UTF8Type.instance, BytesType.instance)
     {
         public ByteBuffer execute(int protocolVersion, List<ByteBuffer> parameters)
         {

diff --git a/src/java/org/apache/cassandra/cql3/functions/Function.java b/src/java/org/apache/cassandra/cql3/functions/Function.java
index ed6e2a7..f93f14b 100644
--- a/src/java/org/apache/cassandra/cql3/functions/Function.java
+++ b/src/java/org/apache/cassandra/cql3/functions/Function.java

@@ -44,7 +44,7 @@
      */
     public boolean isAggregate();
 
-    public Iterable<Function> getFunctions();
+    public void addFunctionsTo(List<Function> functions);
 
     public boolean hasReferenceTo(Function function);
 }

diff --git a/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java b/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
index 323f1bb..4ccd4b2 100644
--- a/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
+++ b/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java

@@ -20,8 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
-
-import com.google.common.collect.Iterables;
+import java.util.stream.Collectors;
 
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.statements.RequestValidations;
@@ -41,9 +40,10 @@
         this.terms = terms;
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return Iterables.concat(Terms.getFunctions(terms), fun.getFunctions());
+        Terms.addFunctions(terms, functions);
+        fun.addFunctionsTo(functions);
     }
 
     public void collectMarkerSpecification(VariableSpecifications boundNames)
@@ -112,7 +112,7 @@
         throw new AssertionError();
     }
 
-    public static class Raw implements Term.Raw
+    public static class Raw extends Term.Raw
     {
         private FunctionName name;
         private final List<Term.Raw> terms;
@@ -125,7 +125,7 @@
 
         public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException
         {
-            Function fun = Functions.get(keyspace, name, terms, receiver.ksName, receiver.cfName, receiver.type);
+            Function fun = FunctionResolver.get(keyspace, name, terms, receiver.ksName, receiver.cfName, receiver.type);
             if (fun == null)
                 throw new InvalidRequestException(String.format("Unknown function %s called", name));
             if (fun.isAggregate())
@@ -147,7 +147,7 @@
             List<Term> parameters = new ArrayList<>(terms.size());
             for (int i = 0; i < terms.size(); i++)
             {
-                Term t = terms.get(i).prepare(keyspace, Functions.makeArgSpec(receiver.ksName, receiver.cfName, scalarFun, i));
+                Term t = terms.get(i).prepare(keyspace, FunctionResolver.makeArgSpec(receiver.ksName, receiver.cfName, scalarFun, i));
                 parameters.add(t);
             }
 
@@ -162,7 +162,7 @@
             // later with a more helpful error message that if we were to return false here.
             try
             {
-                Function fun = Functions.get(keyspace, name, terms, receiver.ksName, receiver.cfName, receiver.type);
+                Function fun = FunctionResolver.get(keyspace, name, terms, receiver.ksName, receiver.cfName, receiver.type);
 
                 // Because fromJson() can return whatever type the receiver is, we'll always get EXACT_MATCH.  To
                 // handle potentially ambiguous function calls with fromJson() as an argument, always return
@@ -183,18 +183,9 @@
             }
         }
 
-        @Override
-        public String toString()
+        public String getText()
         {
-            StringBuilder sb = new StringBuilder();
-            sb.append(name).append("(");
-            for (int i = 0; i < terms.size(); i++)
-            {
-                if (i > 0)
-                    sb.append(", ");
-                sb.append(terms.get(i));
-            }
-            return sb.append(")").toString();
+            return name + terms.stream().map(Term.Raw::getText).collect(Collectors.joining(", ", "(", ")"));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/functions/FunctionResolver.java b/src/java/org/apache/cassandra/cql3/functions/FunctionResolver.java
new file mode 100644
index 0000000..be2daae
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/FunctionResolver.java

@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.functions;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+import static java.util.stream.Collectors.joining;
+
+public final class FunctionResolver
+{
+    private FunctionResolver()
+    {
+    }
+
+    // We special case the token function because that's the only function whose argument types actually
+    // depend on the table on which the function is called. Because it's the sole exception, it's easier
+    // to handle it as a special case.
+    private static final FunctionName TOKEN_FUNCTION_NAME = FunctionName.nativeFunction("token");
+
+    public static ColumnSpecification makeArgSpec(String receiverKs, String receiverCf, Function fun, int i)
+    {
+        return new ColumnSpecification(receiverKs,
+                                       receiverCf,
+                                       new ColumnIdentifier("arg" + i + '(' + fun.name().toString().toLowerCase() + ')', true),
+                                       fun.argTypes().get(i));
+    }
+
+    /**
+     * @param keyspace the current keyspace
+     * @param name the name of the function
+     * @param providedArgs the arguments provided for the function call
+     * @param receiverKs the receiver's keyspace
+     * @param receiverCf the receiver's table
+     * @param receiverType if the receiver type is known (during inserts, for example), this should be the type of
+     *                     the receiver
+     * @throws InvalidRequestException
+     */
+    public static Function get(String keyspace,
+                               FunctionName name,
+                               List<? extends AssignmentTestable> providedArgs,
+                               String receiverKs,
+                               String receiverCf,
+                               AbstractType<?> receiverType)
+    throws InvalidRequestException
+    {
+        if (name.equalsNativeFunction(TOKEN_FUNCTION_NAME))
+            return new TokenFct(Schema.instance.getCFMetaData(receiverKs, receiverCf));
+
+        // The toJson() function can accept any type of argument, so instances of it are not pre-declared.  Instead,
+        // we create new instances as needed while handling selectors (which is the only place that toJson() is supported,
+        // due to needing to know the argument types in advance).
+        if (name.equalsNativeFunction(ToJsonFct.NAME))
+            throw new InvalidRequestException("toJson() may only be used within the selection clause of SELECT statements");
+
+        // Similarly, we can only use fromJson when we know the receiver type (such as inserts)
+        if (name.equalsNativeFunction(FromJsonFct.NAME))
+        {
+            if (receiverType == null)
+                throw new InvalidRequestException("fromJson() cannot be used in the selection clause of a SELECT statement");
+            return FromJsonFct.getInstance(receiverType);
+        }
+
+        Collection<Function> candidates;
+        if (!name.hasKeyspace())
+        {
+            // function name not fully qualified
+            candidates = new ArrayList<>();
+            // add 'SYSTEM' (native) candidates
+            candidates.addAll(Schema.instance.getFunctions(name.asNativeFunction()));
+            // add 'current keyspace' candidates
+            candidates.addAll(Schema.instance.getFunctions(new FunctionName(keyspace, name.name)));
+        }
+        else
+        {
+            // function name is fully qualified (keyspace + name)
+            candidates = Schema.instance.getFunctions(name);
+        }
+
+        if (candidates.isEmpty())
+            return null;
+
+        // Fast path if there is only one choice
+        if (candidates.size() == 1)
+        {
+            Function fun = candidates.iterator().next();
+            validateTypes(keyspace, fun, providedArgs, receiverKs, receiverCf);
+            return fun;
+        }
+
+        List<Function> compatibles = null;
+        for (Function toTest : candidates)
+        {
+            AssignmentTestable.TestResult r = matchAguments(keyspace, toTest, providedArgs, receiverKs, receiverCf);
+            switch (r)
+            {
+                case EXACT_MATCH:
+                    // We always favor exact matches
+                    return toTest;
+                case WEAKLY_ASSIGNABLE:
+                    if (compatibles == null)
+                        compatibles = new ArrayList<>();
+                    compatibles.add(toTest);
+                    break;
+            }
+        }
+
+        if (compatibles == null || compatibles.isEmpty())
+            throw new InvalidRequestException(String.format("Invalid call to function %s, none of its type signatures match (known type signatures: %s)",
+                                                            name, format(candidates)));
+
+        if (compatibles.size() > 1)
+            throw new InvalidRequestException(String.format("Ambiguous call to function %s (can be matched by following signatures: %s): use type casts to disambiguate",
+                        name, format(compatibles)));
+
+        return compatibles.get(0);
+    }
+
+    // This method and matchArguments are somewhat duplicate, but this method allows us to provide more precise errors in the common
+    // case where there is no override for a given function. This is thus probably worth the minor code duplication.
+    private static void validateTypes(String keyspace,
+                                      Function fun,
+                                      List<? extends AssignmentTestable> providedArgs,
+                                      String receiverKs,
+                                      String receiverCf)
+    throws InvalidRequestException
+    {
+        if (providedArgs.size() != fun.argTypes().size())
+            throw new InvalidRequestException(String.format("Invalid number of arguments in call to function %s: %d required but %d provided", fun.name(), fun.argTypes().size(), providedArgs.size()));
+
+        for (int i = 0; i < providedArgs.size(); i++)
+        {
+            AssignmentTestable provided = providedArgs.get(i);
+
+            // If the concrete argument is a bind variables, it can have any type.
+            // We'll validate the actually provided value at execution time.
+            if (provided == null)
+                continue;
+
+            ColumnSpecification expected = makeArgSpec(receiverKs, receiverCf, fun, i);
+            if (!provided.testAssignment(keyspace, expected).isAssignable())
+                throw new InvalidRequestException(String.format("Type error: %s cannot be passed as argument %d of function %s of type %s", provided, i, fun.name(), expected.type.asCQL3Type()));
+        }
+    }
+
+    private static AssignmentTestable.TestResult matchAguments(String keyspace,
+                                                               Function fun,
+                                                               List<? extends AssignmentTestable> providedArgs,
+                                                               String receiverKs,
+                                                               String receiverCf)
+    {
+        if (providedArgs.size() != fun.argTypes().size())
+            return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
+
+        // It's an exact match if all are exact match, but is not assignable as soon as any is non assignable.
+        AssignmentTestable.TestResult res = AssignmentTestable.TestResult.EXACT_MATCH;
+        for (int i = 0; i < providedArgs.size(); i++)
+        {
+            AssignmentTestable provided = providedArgs.get(i);
+            if (provided == null)
+            {
+                res = AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE;
+                continue;
+            }
+
+            ColumnSpecification expected = makeArgSpec(receiverKs, receiverCf, fun, i);
+            AssignmentTestable.TestResult argRes = provided.testAssignment(keyspace, expected);
+            if (argRes == AssignmentTestable.TestResult.NOT_ASSIGNABLE)
+                return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
+            if (argRes == AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE)
+                res = AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE;
+        }
+        return res;
+    }
+
+    private static String format(Collection<Function> funs)
+    {
+        return funs.stream().map(Function::toString).collect(joining(", "));
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/Functions.java b/src/java/org/apache/cassandra/cql3/functions/Functions.java
deleted file mode 100644
index 0f1af19..0000000
--- a/src/java/org/apache/cassandra/cql3/functions/Functions.java
+++ /dev/null

@@ -1,405 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.functions;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.CopyOnWriteArrayList;
-
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.service.MigrationListener;
-import org.apache.cassandra.service.MigrationManager;
-
-public abstract class Functions
-{
-    // We special case the token function because that's the only function whose argument types actually
-    // depend on the table on which the function is called. Because it's the sole exception, it's easier
-    // to handle it as a special case.
-    private static final FunctionName TOKEN_FUNCTION_NAME = FunctionName.nativeFunction("token");
-
-    private Functions() {}
-
-    private static final ConcurrentMap<FunctionName, List<Function>> declared = new ConcurrentHashMap<>();
-
-    static
-    {
-        declare(AggregateFcts.countRowsFunction);
-        declare(TimeFcts.nowFct);
-        declare(TimeFcts.minTimeuuidFct);
-        declare(TimeFcts.maxTimeuuidFct);
-        declare(TimeFcts.dateOfFct);
-        declare(TimeFcts.unixTimestampOfFct);
-        declare(TimeFcts.timeUuidtoDate);
-        declare(TimeFcts.timeUuidToTimestamp);
-        declare(TimeFcts.timeUuidToUnixTimestamp);
-        declare(TimeFcts.timestampToDate);
-        declare(TimeFcts.timestampToUnixTimestamp);
-        declare(TimeFcts.dateToTimestamp);
-        declare(TimeFcts.dateToUnixTimestamp);
-        declare(UuidFcts.uuidFct);
-
-        for (CQL3Type type : CQL3Type.Native.values())
-        {
-            // Note: because text and varchar ends up being synonymous, our automatic makeToBlobFunction doesn't work
-            // for varchar, so we special case it below. We also skip blob for obvious reasons.
-            if (type != CQL3Type.Native.VARCHAR && type != CQL3Type.Native.BLOB)
-            {
-                declare(BytesConversionFcts.makeToBlobFunction(type.getType()));
-                declare(BytesConversionFcts.makeFromBlobFunction(type.getType()));
-            }
-        }
-        declare(BytesConversionFcts.VarcharAsBlobFct);
-        declare(BytesConversionFcts.BlobAsVarcharFact);
-
-        for (CQL3Type type : CQL3Type.Native.values())
-        {
-            // special case varchar to avoid duplicating functions for UTF8Type
-            if (type != CQL3Type.Native.VARCHAR)
-            {
-                declare(AggregateFcts.makeCountFunction(type.getType()));
-                declare(AggregateFcts.makeMaxFunction(type.getType()));
-                declare(AggregateFcts.makeMinFunction(type.getType()));
-            }
-        }
-        declare(AggregateFcts.sumFunctionForByte);
-        declare(AggregateFcts.sumFunctionForShort);
-        declare(AggregateFcts.sumFunctionForInt32);
-        declare(AggregateFcts.sumFunctionForLong);
-        declare(AggregateFcts.sumFunctionForFloat);
-        declare(AggregateFcts.sumFunctionForDouble);
-        declare(AggregateFcts.sumFunctionForDecimal);
-        declare(AggregateFcts.sumFunctionForVarint);
-        declare(AggregateFcts.sumFunctionForCounter);
-        declare(AggregateFcts.avgFunctionForByte);
-        declare(AggregateFcts.avgFunctionForShort);
-        declare(AggregateFcts.avgFunctionForInt32);
-        declare(AggregateFcts.avgFunctionForLong);
-        declare(AggregateFcts.avgFunctionForFloat);
-        declare(AggregateFcts.avgFunctionForDouble);
-        declare(AggregateFcts.avgFunctionForVarint);
-        declare(AggregateFcts.avgFunctionForDecimal);
-        declare(AggregateFcts.avgFunctionForCounter);
-
-        MigrationManager.instance.register(new FunctionsMigrationListener());
-    }
-
-    private static void declare(Function fun)
-    {
-        synchronized (declared)
-        {
-            List<Function> functions = declared.get(fun.name());
-            if (functions == null)
-            {
-                functions = new CopyOnWriteArrayList<>();
-                List<Function> existing = declared.putIfAbsent(fun.name(), functions);
-                if (existing != null)
-                    functions = existing;
-            }
-            functions.add(fun);
-        }
-    }
-
-    public static ColumnSpecification makeArgSpec(String receiverKs, String receiverCf, Function fun, int i)
-    {
-        return new ColumnSpecification(receiverKs,
-                                       receiverCf,
-                                       new ColumnIdentifier("arg" + i + '(' + fun.name().toString().toLowerCase() + ')', true),
-                                       fun.argTypes().get(i));
-    }
-
-    public static int getOverloadCount(FunctionName name)
-    {
-        return find(name).size();
-    }
-
-    /**
-     * @param keyspace the current keyspace
-     * @param name the name of the function
-     * @param providedArgs the arguments provided for the function call
-     * @param receiverKs the receiver's keyspace
-     * @param receiverCf the receiver's table
-     * @param receiverType if the receiver type is known (during inserts, for example), this should be the type of
-     *                     the receiver
-     * @throws InvalidRequestException
-     */
-    public static Function get(String keyspace,
-                               FunctionName name,
-                               List<? extends AssignmentTestable> providedArgs,
-                               String receiverKs,
-                               String receiverCf,
-                               AbstractType<?> receiverType)
-    throws InvalidRequestException
-    {
-        if (name.equalsNativeFunction(TOKEN_FUNCTION_NAME))
-            return new TokenFct(Schema.instance.getCFMetaData(receiverKs, receiverCf));
-
-        // The toJson() function can accept any type of argument, so instances of it are not pre-declared.  Instead,
-        // we create new instances as needed while handling selectors (which is the only place that toJson() is supported,
-        // due to needing to know the argument types in advance).
-        if (name.equalsNativeFunction(ToJsonFct.NAME))
-            throw new InvalidRequestException("toJson() may only be used within the selection clause of SELECT statements");
-
-        // Similarly, we can only use fromJson when we know the receiver type (such as inserts)
-        if (name.equalsNativeFunction(FromJsonFct.NAME))
-        {
-            if (receiverType == null)
-                throw new InvalidRequestException("fromJson() cannot be used in the selection clause of a SELECT statement");
-            return FromJsonFct.getInstance(receiverType);
-        }
-
-        List<Function> candidates;
-        if (!name.hasKeyspace())
-        {
-            // function name not fully qualified
-            candidates = new ArrayList<>();
-            // add 'SYSTEM' (native) candidates
-            candidates.addAll(find(name.asNativeFunction()));
-            // add 'current keyspace' candidates
-            candidates.addAll(find(new FunctionName(keyspace, name.name)));
-        }
-        else
-            // function name is fully qualified (keyspace + name)
-            candidates = find(name);
-
-        if (candidates.isEmpty())
-            return null;
-
-        // Fast path if there is only one choice
-        if (candidates.size() == 1)
-        {
-            Function fun = candidates.get(0);
-            validateTypes(keyspace, fun, providedArgs, receiverKs, receiverCf);
-            return fun;
-        }
-
-        List<Function> compatibles = null;
-        for (Function toTest : candidates)
-        {
-            AssignmentTestable.TestResult r = matchAguments(keyspace, toTest, providedArgs, receiverKs, receiverCf);
-            switch (r)
-            {
-                case EXACT_MATCH:
-                    // We always favor exact matches
-                    return toTest;
-                case WEAKLY_ASSIGNABLE:
-                    if (compatibles == null)
-                        compatibles = new ArrayList<>();
-                    compatibles.add(toTest);
-                    break;
-            }
-        }
-
-        if (compatibles == null || compatibles.isEmpty())
-            throw new InvalidRequestException(String.format("Invalid call to function %s, none of its type signatures match (known type signatures: %s)",
-                                                            name, toString(candidates)));
-
-        if (compatibles.size() > 1)
-            throw new InvalidRequestException(String.format("Ambiguous call to function %s (can be matched by following signatures: %s): use type casts to disambiguate",
-                        name, toString(compatibles)));
-
-        return compatibles.get(0);
-    }
-
-    public static List<Function> find(FunctionName name)
-    {
-        List<Function> functions = declared.get(name);
-        return functions != null ? functions : Collections.<Function>emptyList();
-    }
-
-    public static Function find(FunctionName name, List<AbstractType<?>> argTypes)
-    {
-        assert name.hasKeyspace() : "function name not fully qualified";
-        for (Function f : find(name))
-        {
-            if (typeEquals(f.argTypes(), argTypes))
-                return f;
-        }
-        return null;
-    }
-
-    // This method and matchArguments are somewhat duplicate, but this method allows us to provide more precise errors in the common
-    // case where there is no override for a given function. This is thus probably worth the minor code duplication.
-    private static void validateTypes(String keyspace,
-                                      Function fun,
-                                      List<? extends AssignmentTestable> providedArgs,
-                                      String receiverKs,
-                                      String receiverCf)
-    throws InvalidRequestException
-    {
-        if (providedArgs.size() != fun.argTypes().size())
-            throw new InvalidRequestException(String.format("Invalid number of arguments in call to function %s: %d required but %d provided", fun.name(), fun.argTypes().size(), providedArgs.size()));
-
-        for (int i = 0; i < providedArgs.size(); i++)
-        {
-            AssignmentTestable provided = providedArgs.get(i);
-
-            // If the concrete argument is a bind variables, it can have any type.
-            // We'll validate the actually provided value at execution time.
-            if (provided == null)
-                continue;
-
-            ColumnSpecification expected = makeArgSpec(receiverKs, receiverCf, fun, i);
-            if (!provided.testAssignment(keyspace, expected).isAssignable())
-                throw new InvalidRequestException(String.format("Type error: %s cannot be passed as argument %d of function %s of type %s", provided, i, fun.name(), expected.type.asCQL3Type()));
-        }
-    }
-
-    private static AssignmentTestable.TestResult matchAguments(String keyspace,
-                                                               Function fun,
-                                                               List<? extends AssignmentTestable> providedArgs,
-                                                               String receiverKs,
-                                                               String receiverCf)
-    {
-        if (providedArgs.size() != fun.argTypes().size())
-            return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
-
-        // It's an exact match if all are exact match, but is not assignable as soon as any is non assignable.
-        AssignmentTestable.TestResult res = AssignmentTestable.TestResult.EXACT_MATCH;
-        for (int i = 0; i < providedArgs.size(); i++)
-        {
-            AssignmentTestable provided = providedArgs.get(i);
-            if (provided == null)
-            {
-                res = AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE;
-                continue;
-            }
-
-            ColumnSpecification expected = makeArgSpec(receiverKs, receiverCf, fun, i);
-            AssignmentTestable.TestResult argRes = provided.testAssignment(keyspace, expected);
-            if (argRes == AssignmentTestable.TestResult.NOT_ASSIGNABLE)
-                return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
-            if (argRes == AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE)
-                res = AssignmentTestable.TestResult.WEAKLY_ASSIGNABLE;
-        }
-        return res;
-    }
-
-    private static String toString(List<Function> funs)
-    {
-        StringBuilder sb = new StringBuilder();
-        for (int i = 0; i < funs.size(); i++)
-        {
-            if (i > 0) sb.append(", ");
-            sb.append(funs.get(i));
-        }
-        return sb.toString();
-    }
-
-    public static void addOrReplaceFunction(AbstractFunction fun)
-    {
-        // We shouldn't get there unless that function don't exist
-        removeFunction(fun.name(), fun.argTypes());
-        declare(fun);
-    }
-
-    // Same remarks than for addFunction
-    public static void removeFunction(FunctionName name, List<AbstractType<?>> argTypes)
-    {
-        assert name.hasKeyspace() : "function name " + name + " not fully qualified";
-        synchronized (declared)
-        {
-            List<Function> functions = find(name);
-            for (int i = 0; i < functions.size(); i++)
-            {
-                Function f = functions.get(i);
-                if (!typeEquals(f.argTypes(), argTypes))
-                    continue;
-                assert !f.isNative();
-                functions.remove(i);
-                if (functions.isEmpty())
-                    declared.remove(name);
-                return;
-            }
-        }
-    }
-
-    public static List<Function> getReferencesTo(Function old)
-    {
-        List<Function> references = new ArrayList<>();
-        for (List<Function> functions : declared.values())
-            for (Function function : functions)
-                if (function.hasReferenceTo(old))
-                    references.add(function);
-        return references;
-    }
-
-    public static Collection<Function> all()
-    {
-        List<Function> all = new ArrayList<>();
-        for (List<Function> functions : declared.values())
-            all.addAll(functions);
-        return all;
-    }
-
-    /*
-     * We need to compare the CQL3 representation of the type because comparing
-     * the AbstractType will fail for example if a UDT has been changed.
-     * Reason is that UserType.equals() takes the field names and types into account.
-     * Example CQL sequence that would fail when comparing AbstractType:
-     *    CREATE TYPE foo ...
-     *    CREATE FUNCTION bar ( par foo ) RETURNS foo ...
-     *    ALTER TYPE foo ADD ...
-     * or
-     *    ALTER TYPE foo ALTER ...
-     * or
-     *    ALTER TYPE foo RENAME ...
-     */
-    public static boolean typeEquals(AbstractType<?> t1, AbstractType<?> t2)
-    {
-        return t1.asCQL3Type().toString().equals(t2.asCQL3Type().toString());
-    }
-
-    public static boolean typeEquals(List<AbstractType<?>> t1, List<AbstractType<?>> t2)
-    {
-        if (t1.size() != t2.size())
-            return false;
-        for (int i = 0; i < t1.size(); i ++)
-            if (!typeEquals(t1.get(i), t2.get(i)))
-                return false;
-        return true;
-    }
-
-    public static int typeHashCode(AbstractType<?> t)
-    {
-        return t.asCQL3Type().toString().hashCode();
-    }
-
-    public static int typeHashCode(List<AbstractType<?>> types)
-    {
-        int h = 0;
-        for (AbstractType<?> type : types)
-            h = h * 31 + typeHashCode(type);
-        return h;
-    }
-
-    private static class FunctionsMigrationListener extends MigrationListener
-    {
-        public void onUpdateUserType(String ksName, String typeName) {
-            for (Function function : all())
-                if (function instanceof UDFunction)
-                    ((UDFunction)function).userTypeUpdated(ksName, typeName);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/functions/JavaBasedUDFunction.java b/src/java/org/apache/cassandra/cql3/functions/JavaBasedUDFunction.java
new file mode 100644
index 0000000..660d494
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/JavaBasedUDFunction.java

@@ -0,0 +1,654 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.functions;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.invoke.MethodHandle;
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.MethodType;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.net.*;
+import java.nio.ByteBuffer;
+import java.security.*;
+import java.security.cert.Certificate;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.io.ByteStreams;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.eclipse.jdt.core.compiler.IProblem;
+import org.eclipse.jdt.internal.compiler.*;
+import org.eclipse.jdt.internal.compiler.Compiler;
+import org.eclipse.jdt.internal.compiler.classfmt.ClassFileReader;
+import org.eclipse.jdt.internal.compiler.classfmt.ClassFormatException;
+import org.eclipse.jdt.internal.compiler.env.ICompilationUnit;
+import org.eclipse.jdt.internal.compiler.env.INameEnvironment;
+import org.eclipse.jdt.internal.compiler.env.NameEnvironmentAnswer;
+import org.eclipse.jdt.internal.compiler.impl.CompilerOptions;
+import org.eclipse.jdt.internal.compiler.problem.DefaultProblemFactory;
+
+final class JavaBasedUDFunction extends UDFunction
+{
+    private static final String BASE_PACKAGE = "org.apache.cassandra.cql3.udf.gen";
+
+    static final Logger logger = LoggerFactory.getLogger(JavaBasedUDFunction.class);
+
+    private static final AtomicInteger classSequence = new AtomicInteger();
+
+    // use a JVM standard ExecutorService as DebuggableThreadPoolExecutor references internal
+    // classes, which triggers AccessControlException from the UDF sandbox
+    private static final UDFExecutorService executor =
+        new UDFExecutorService(new NamedThreadFactory("UserDefinedFunctions",
+                                                      Thread.MIN_PRIORITY,
+                                                      udfClassLoader,
+                                                      new SecurityThreadGroup("UserDefinedFunctions", null, UDFunction::initializeThread)),
+                               "userfunction");
+
+    private static final EcjTargetClassLoader targetClassLoader = new EcjTargetClassLoader();
+
+    private static final UDFByteCodeVerifier udfByteCodeVerifier = new UDFByteCodeVerifier();
+
+    private static final ProtectionDomain protectionDomain;
+
+    private static final IErrorHandlingPolicy errorHandlingPolicy = DefaultErrorHandlingPolicies.proceedWithAllProblems();
+    private static final IProblemFactory problemFactory = new DefaultProblemFactory(Locale.ENGLISH);
+    private static final CompilerOptions compilerOptions;
+
+    /**
+     * Poor man's template - just a text file splitted at '#' chars.
+     * Each string at an even index is a constant string (just copied),
+     * each string at an odd index is an 'instruction'.
+     */
+    private static final String[] javaSourceTemplate;
+
+    static
+    {
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/Class", "forName");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/Class", "getClassLoader");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/Class", "getResource");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/Class", "getResourceAsStream");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "clearAssertionStatus");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getResource");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getResourceAsStream");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getResources");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getSystemClassLoader");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getSystemResource");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getSystemResourceAsStream");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "getSystemResources");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "loadClass");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "setClassAssertionStatus");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "setDefaultAssertionStatus");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/lang/ClassLoader", "setPackageAssertionStatus");
+        udfByteCodeVerifier.addDisallowedMethodCall("java/nio/ByteBuffer", "allocateDirect");
+        for (String ia : new String[]{"java/net/InetAddress", "java/net/Inet4Address", "java/net/Inet6Address"})
+        {
+            // static method, probably performing DNS lookups (despite SecurityManager)
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "getByAddress");
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "getAllByName");
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "getByName");
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "getLocalHost");
+            // instance methods, probably performing DNS lookups (despite SecurityManager)
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "getHostName");
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "getCanonicalHostName");
+            // ICMP PING
+            udfByteCodeVerifier.addDisallowedMethodCall(ia, "isReachable");
+        }
+        udfByteCodeVerifier.addDisallowedClass("java/net/NetworkInterface");
+        udfByteCodeVerifier.addDisallowedClass("java/net/SocketException");
+
+        Map<String, String> settings = new HashMap<>();
+        settings.put(CompilerOptions.OPTION_LineNumberAttribute,
+                     CompilerOptions.GENERATE);
+        settings.put(CompilerOptions.OPTION_SourceFileAttribute,
+                     CompilerOptions.DISABLED);
+        settings.put(CompilerOptions.OPTION_ReportDeprecation,
+                     CompilerOptions.IGNORE);
+        settings.put(CompilerOptions.OPTION_Source,
+                     CompilerOptions.VERSION_1_8);
+        settings.put(CompilerOptions.OPTION_TargetPlatform,
+                     CompilerOptions.VERSION_1_8);
+
+        compilerOptions = new CompilerOptions(settings);
+        compilerOptions.parseLiteralExpressionsAsConstants = true;
+
+        try (InputStream input = JavaBasedUDFunction.class.getResource("JavaSourceUDF.txt").openConnection().getInputStream())
+        {
+            ByteArrayOutputStream output = new ByteArrayOutputStream();
+            FBUtilities.copy(input, output, Long.MAX_VALUE);
+            String template = output.toString();
+
+            StringTokenizer st = new StringTokenizer(template, "#");
+            javaSourceTemplate = new String[st.countTokens()];
+            for (int i = 0; st.hasMoreElements(); i++)
+                javaSourceTemplate[i] = st.nextToken();
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        CodeSource codeSource;
+        try
+        {
+            codeSource = new CodeSource(new URL("udf", "localhost", 0, "/java", new URLStreamHandler()
+            {
+                protected URLConnection openConnection(URL u)
+                {
+                    return null;
+                }
+            }), (Certificate[])null);
+        }
+        catch (MalformedURLException e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        protectionDomain = new ProtectionDomain(codeSource, ThreadAwareSecurityManager.noPermissions, targetClassLoader, null);
+    }
+
+    private final JavaUDF javaUDF;
+
+    JavaBasedUDFunction(FunctionName name, List<ColumnIdentifier> argNames, List<AbstractType<?>> argTypes,
+                        AbstractType<?> returnType, boolean calledOnNullInput, String body)
+    {
+        super(name, argNames, argTypes, UDHelper.driverTypes(argTypes),
+              returnType, UDHelper.driverType(returnType), calledOnNullInput, "java", body);
+
+        // javaParamTypes is just the Java representation for argTypes resp. argCodecs
+        Class<?>[] javaParamTypes = UDHelper.javaTypes(argCodecs, calledOnNullInput);
+        // javaReturnType is just the Java representation for returnType resp. returnCodec
+        Class<?> javaReturnType = UDHelper.asJavaClass(returnCodec);
+
+        // put each UDF in a separate package to prevent cross-UDF code access
+        String pkgName = BASE_PACKAGE + '.' + generateClassName(name, 'p');
+        String clsName = generateClassName(name, 'C');
+
+        String executeInternalName = generateClassName(name, 'x');
+
+        StringBuilder javaSourceBuilder = new StringBuilder();
+        int lineOffset = 1;
+        for (int i = 0; i < javaSourceTemplate.length; i++)
+        {
+            String s = javaSourceTemplate[i];
+
+            // strings at odd indexes are 'instructions'
+            if ((i & 1) == 1)
+            {
+                switch (s)
+                {
+                    case "package_name":
+                        s = pkgName;
+                        break;
+                    case "class_name":
+                        s = clsName;
+                        break;
+                    case "body":
+                        lineOffset = countNewlines(javaSourceBuilder);
+                        s = body;
+                        break;
+                    case "arguments":
+                        s = generateArguments(javaParamTypes, argNames);
+                        break;
+                    case "argument_list":
+                        s = generateArgumentList(javaParamTypes, argNames);
+                        break;
+                    case "return_type":
+                        s = javaSourceName(javaReturnType);
+                        break;
+                    case "execute_internal_name":
+                        s = executeInternalName;
+                        break;
+                }
+            }
+
+            javaSourceBuilder.append(s);
+        }
+
+        String targetClassName = pkgName + '.' + clsName;
+
+        String javaSource = javaSourceBuilder.toString();
+
+        logger.trace("Compiling Java source UDF '{}' as class '{}' using source:\n{}", name, targetClassName, javaSource);
+
+        try
+        {
+            EcjCompilationUnit compilationUnit = new EcjCompilationUnit(javaSource, targetClassName);
+
+            org.eclipse.jdt.internal.compiler.Compiler compiler = new Compiler(compilationUnit,
+                                                                               errorHandlingPolicy,
+                                                                               compilerOptions,
+                                                                               compilationUnit,
+                                                                               problemFactory);
+            compiler.compile(new ICompilationUnit[]{ compilationUnit });
+
+            if (compilationUnit.problemList != null && !compilationUnit.problemList.isEmpty())
+            {
+                boolean fullSource = false;
+                StringBuilder problems = new StringBuilder();
+                for (IProblem problem : compilationUnit.problemList)
+                {
+                    long ln = problem.getSourceLineNumber() - lineOffset;
+                    if (ln < 1L)
+                    {
+                        if (problem.isError())
+                        {
+                            // if generated source around UDF source provided by the user is buggy,
+                            // this code is appended.
+                            problems.append("GENERATED SOURCE ERROR: line ")
+                                    .append(problem.getSourceLineNumber())
+                                    .append(" (in generated source): ")
+                                    .append(problem.getMessage())
+                                    .append('\n');
+                            fullSource = true;
+                        }
+                    }
+                    else
+                    {
+                        problems.append("Line ")
+                                .append(Long.toString(ln))
+                                .append(": ")
+                                .append(problem.getMessage())
+                                .append('\n');
+                    }
+                }
+
+                if (fullSource)
+                    throw new InvalidRequestException("Java source compilation failed:\n" + problems + "\n generated source:\n" + javaSource);
+                else
+                    throw new InvalidRequestException("Java source compilation failed:\n" + problems);
+            }
+
+            // Verify the UDF bytecode against use of probably dangerous code
+            Set<String> errors = udfByteCodeVerifier.verify(targetClassLoader.classData(targetClassName));
+            String validDeclare = "not allowed method declared: " + executeInternalName + '(';
+            String validCall = "call to " + targetClassName.replace('.', '/') + '.' + executeInternalName + "()";
+            for (Iterator<String> i = errors.iterator(); i.hasNext();)
+            {
+                String error = i.next();
+                // we generate a random name of the private, internal execute method, which is detected by the byte-code verifier
+                if (error.startsWith(validDeclare) || error.equals(validCall))
+                {
+                    i.remove();
+                }
+            }
+            if (!errors.isEmpty())
+                throw new InvalidRequestException("Java UDF validation failed: " + errors);
+
+            // Load the class and create a new instance of it
+            Thread thread = Thread.currentThread();
+            ClassLoader orig = thread.getContextClassLoader();
+            try
+            {
+                thread.setContextClassLoader(UDFunction.udfClassLoader);
+                // Execute UDF intiialization from UDF class loader
+
+                Class cls = Class.forName(targetClassName, false, targetClassLoader);
+
+                // Count only non-synthetic methods, so code coverage instrumentation doesn't cause a miscount
+                int nonSyntheticMethodCount = 0;
+                for (Method m : cls.getDeclaredMethods())
+                {
+                    if (!m.isSynthetic())
+                    {
+                        nonSyntheticMethodCount += 1;
+                    }
+                }
+
+                if (nonSyntheticMethodCount != 2 || cls.getDeclaredConstructors().length != 1)
+                    throw new InvalidRequestException("Check your source to not define additional Java methods or constructors");
+                MethodType methodType = MethodType.methodType(void.class)
+                                                  .appendParameterTypes(TypeCodec.class, TypeCodec[].class);
+                MethodHandle ctor = MethodHandles.lookup().findConstructor(cls, methodType);
+                this.javaUDF = (JavaUDF) ctor.invokeWithArguments(returnCodec, argCodecs);
+            }
+            finally
+            {
+                thread.setContextClassLoader(orig);
+            }
+        }
+        catch (InvocationTargetException e)
+        {
+            // in case of an ITE, use the cause
+            throw new InvalidRequestException(String.format("Could not compile function '%s' from Java source: %s", name, e.getCause()));
+        }
+        catch (VirtualMachineError e)
+        {
+            throw e;
+        }
+        catch (Throwable e)
+        {
+            throw new InvalidRequestException(String.format("Could not compile function '%s' from Java source: %s", name, e));
+        }
+    }
+
+    protected ExecutorService executor()
+    {
+        return executor;
+    }
+
+    protected ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> params)
+    {
+        return javaUDF.executeImpl(protocolVersion, params);
+    }
+
+
+    private static int countNewlines(StringBuilder javaSource)
+    {
+        int ln = 0;
+        for (int i = 0; i < javaSource.length(); i++)
+            if (javaSource.charAt(i) == '\n')
+                ln++;
+        return ln;
+    }
+
+    private static String generateClassName(FunctionName name, char prefix)
+    {
+        String qualifiedName = name.toString();
+
+        StringBuilder sb = new StringBuilder(qualifiedName.length() + 10);
+        sb.append(prefix);
+        for (int i = 0; i < qualifiedName.length(); i++)
+        {
+            char c = qualifiedName.charAt(i);
+            if (Character.isJavaIdentifierPart(c))
+                sb.append(c);
+            else
+                sb.append(Integer.toHexString(((short)c)&0xffff));
+        }
+        sb.append('_')
+          .append(ThreadLocalRandom.current().nextInt() & 0xffffff)
+          .append('_')
+          .append(classSequence.incrementAndGet());
+        return sb.toString();
+    }
+
+    private static String javaSourceName(Class<?> type)
+    {
+        String n = type.getName();
+        return n.startsWith("java.lang.") ? type.getSimpleName() : n;
+    }
+
+    private static String generateArgumentList(Class<?>[] paramTypes, List<ColumnIdentifier> argNames)
+    {
+        // initial builder size can just be a guess (prevent temp object allocations)
+        StringBuilder code = new StringBuilder(32 * paramTypes.length);
+        for (int i = 0; i < paramTypes.length; i++)
+        {
+            if (i > 0)
+                code.append(", ");
+            code.append(javaSourceName(paramTypes[i]))
+                .append(' ')
+                .append(argNames.get(i));
+        }
+        return code.toString();
+    }
+
+    private static String generateArguments(Class<?>[] paramTypes, List<ColumnIdentifier> argNames)
+    {
+        StringBuilder code = new StringBuilder(64 * paramTypes.length);
+        for (int i = 0; i < paramTypes.length; i++)
+        {
+            if (i > 0)
+                code.append(",\n");
+
+            if (logger.isTraceEnabled())
+                code.append("            /* parameter '").append(argNames.get(i)).append("' */\n");
+
+            code
+                // cast to Java type
+                .append("            (").append(javaSourceName(paramTypes[i])).append(") ")
+                // generate object representation of input parameter (call UDFunction.compose)
+                .append(composeMethod(paramTypes[i])).append("(protocolVersion, ").append(i).append(", params.get(").append(i).append("))");
+        }
+        return code.toString();
+    }
+
+    private static String composeMethod(Class<?> type)
+    {
+        return (type.isPrimitive()) ? ("super.compose_" + type.getName()) : "super.compose";
+    }
+
+    // Java source UDFs are a very simple compilation task, which allows us to let one class implement
+    // all interfaces required by ECJ.
+    static final class EcjCompilationUnit implements ICompilationUnit, ICompilerRequestor, INameEnvironment
+    {
+        List<IProblem> problemList;
+        private final String className;
+        private final char[] sourceCode;
+
+        EcjCompilationUnit(String sourceCode, String className)
+        {
+            this.className = className;
+            this.sourceCode = sourceCode.toCharArray();
+        }
+
+        // ICompilationUnit
+
+        @Override
+        public char[] getFileName()
+        {
+            return sourceCode;
+        }
+
+        @Override
+        public char[] getContents()
+        {
+            return sourceCode;
+        }
+
+        @Override
+        public char[] getMainTypeName()
+        {
+            int dot = className.lastIndexOf('.');
+            return ((dot > 0) ? className.substring(dot + 1) : className).toCharArray();
+        }
+
+        @Override
+        public char[][] getPackageName()
+        {
+            StringTokenizer izer = new StringTokenizer(className, ".");
+            char[][] result = new char[izer.countTokens() - 1][];
+            for (int i = 0; i < result.length; i++)
+                result[i] = izer.nextToken().toCharArray();
+            return result;
+        }
+
+        @Override
+        public boolean ignoreOptionalProblems()
+        {
+            return false;
+        }
+
+        // ICompilerRequestor
+
+        @Override
+        public void acceptResult(CompilationResult result)
+        {
+            if (result.hasErrors())
+            {
+                IProblem[] problems = result.getProblems();
+                if (problemList == null)
+                    problemList = new ArrayList<>(problems.length);
+                Collections.addAll(problemList, problems);
+            }
+            else
+            {
+                ClassFile[] classFiles = result.getClassFiles();
+                for (ClassFile classFile : classFiles)
+                    targetClassLoader.addClass(className, classFile.getBytes());
+            }
+        }
+
+        // INameEnvironment
+
+        @Override
+        public NameEnvironmentAnswer findType(char[][] compoundTypeName)
+        {
+            StringBuilder result = new StringBuilder();
+            for (int i = 0; i < compoundTypeName.length; i++)
+            {
+                if (i > 0)
+                    result.append('.');
+                result.append(compoundTypeName[i]);
+            }
+            return findType(result.toString());
+        }
+
+        @Override
+        public NameEnvironmentAnswer findType(char[] typeName, char[][] packageName)
+        {
+            StringBuilder result = new StringBuilder();
+            int i = 0;
+            for (; i < packageName.length; i++)
+            {
+                if (i > 0)
+                    result.append('.');
+                result.append(packageName[i]);
+            }
+            if (i > 0)
+                result.append('.');
+            result.append(typeName);
+            return findType(result.toString());
+        }
+
+        private NameEnvironmentAnswer findType(String className)
+        {
+            if (className.equals(this.className))
+            {
+                return new NameEnvironmentAnswer(this, null);
+            }
+
+            String resourceName = className.replace('.', '/') + ".class";
+
+            try (InputStream is = UDFunction.udfClassLoader.getResourceAsStream(resourceName))
+            {
+                if (is != null)
+                {
+                    byte[] classBytes = ByteStreams.toByteArray(is);
+                    char[] fileName = className.toCharArray();
+                    ClassFileReader classFileReader = new ClassFileReader(classBytes, fileName, true);
+                    return new NameEnvironmentAnswer(classFileReader, null);
+                }
+            }
+            catch (IOException | ClassFormatException exc)
+            {
+                throw new RuntimeException(exc);
+            }
+            return null;
+        }
+
+        private boolean isPackage(String result)
+        {
+            if (result.equals(this.className))
+                return false;
+            String resourceName = result.replace('.', '/') + ".class";
+            try (InputStream is = UDFunction.udfClassLoader.getResourceAsStream(resourceName))
+            {
+                return is == null;
+            }
+            catch (IOException e)
+            {
+                // we are here, since close on is failed. That means it was not null
+                return false;
+            }
+        }
+
+        @Override
+        public boolean isPackage(char[][] parentPackageName, char[] packageName)
+        {
+            StringBuilder result = new StringBuilder();
+            int i = 0;
+            if (parentPackageName != null)
+                for (; i < parentPackageName.length; i++)
+                {
+                    if (i > 0)
+                        result.append('.');
+                    result.append(parentPackageName[i]);
+                }
+
+            if (Character.isUpperCase(packageName[0]) && !isPackage(result.toString()))
+                return false;
+            if (i > 0)
+                result.append('.');
+            result.append(packageName);
+
+            return isPackage(result.toString());
+        }
+
+        @Override
+        public void cleanup()
+        {
+        }
+    }
+
+    static final class EcjTargetClassLoader extends SecureClassLoader
+    {
+        EcjTargetClassLoader()
+        {
+            super(UDFunction.udfClassLoader);
+        }
+
+        // This map is usually empty.
+        // It only contains data *during* UDF compilation but not during runtime.
+        //
+        // addClass() is invoked by ECJ after successful compilation of the generated Java source.
+        // loadClass(targetClassName) is invoked by buildUDF() after ECJ returned from successful compilation.
+        //
+        private final Map<String, byte[]> classes = new ConcurrentHashMap<>();
+
+        void addClass(String className, byte[] classData)
+        {
+            classes.put(className, classData);
+        }
+
+        byte[] classData(String className)
+        {
+            return classes.get(className);
+        }
+
+        protected Class<?> findClass(String name) throws ClassNotFoundException
+        {
+            // remove the class binary - it's only used once - so it's wasting heap
+            byte[] classData = classes.remove(name);
+
+            if (classData != null)
+                return defineClass(name, classData, 0, classData.length, protectionDomain);
+
+            return getParent().loadClass(name);
+        }
+
+        protected PermissionCollection getPermissions(CodeSource codesource)
+        {
+            return ThreadAwareSecurityManager.noPermissions;
+        }
+    }}

diff --git a/src/java/org/apache/cassandra/cql3/functions/JavaSourceUDFFactory.java b/src/java/org/apache/cassandra/cql3/functions/JavaSourceUDFFactory.java
deleted file mode 100644
index 515c947..0000000
--- a/src/java/org/apache/cassandra/cql3/functions/JavaSourceUDFFactory.java
+++ /dev/null

@@ -1,536 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.functions;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.invoke.MethodHandle;
-import java.lang.invoke.MethodHandles;
-import java.lang.invoke.MethodType;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.StringTokenizer;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.google.common.io.ByteStreams;
-
-import org.apache.cassandra.utils.FBUtilities;
-import org.eclipse.jdt.core.compiler.IProblem;
-import org.eclipse.jdt.internal.compiler.*;
-import org.eclipse.jdt.internal.compiler.Compiler;
-import org.eclipse.jdt.internal.compiler.classfmt.ClassFileReader;
-import org.eclipse.jdt.internal.compiler.classfmt.ClassFormatException;
-import org.eclipse.jdt.internal.compiler.env.ICompilationUnit;
-import org.eclipse.jdt.internal.compiler.env.INameEnvironment;
-import org.eclipse.jdt.internal.compiler.env.NameEnvironmentAnswer;
-import org.eclipse.jdt.internal.compiler.impl.CompilerOptions;
-import org.eclipse.jdt.internal.compiler.problem.DefaultProblemFactory;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.datastax.driver.core.DataType;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-
-/**
- * Java source UDF code generation.
- */
-public final class JavaSourceUDFFactory
-{
-    private static final String GENERATED_PACKAGE = "org.apache.cassandra.cql3.udf.gen";
-
-    static final Logger logger = LoggerFactory.getLogger(JavaSourceUDFFactory.class);
-
-    private static final AtomicInteger classSequence = new AtomicInteger();
-
-    private static final ClassLoader baseClassLoader = Thread.currentThread().getContextClassLoader();
-    private static final EcjTargetClassLoader targetClassLoader = new EcjTargetClassLoader();
-    private static final IErrorHandlingPolicy errorHandlingPolicy = DefaultErrorHandlingPolicies.proceedWithAllProblems();
-    private static final IProblemFactory problemFactory = new DefaultProblemFactory(Locale.ENGLISH);
-    private static final CompilerOptions compilerOptions;
-
-    /**
-     * Poor man's template - just a text file splitted at '#' chars.
-     * Each string at an even index is a constant string (just copied),
-     * each string at an odd index is an 'instruction'.
-     */
-    private static final String[] javaSourceTemplate;
-
-    static
-    {
-        Map<String, String> settings = new HashMap<>();
-        settings.put(CompilerOptions.OPTION_LineNumberAttribute,
-                     CompilerOptions.GENERATE);
-        settings.put(CompilerOptions.OPTION_SourceFileAttribute,
-                     CompilerOptions.DISABLED);
-        settings.put(CompilerOptions.OPTION_ReportDeprecation,
-                     CompilerOptions.IGNORE);
-        settings.put(CompilerOptions.OPTION_Source,
-                     CompilerOptions.VERSION_1_7);
-        settings.put(CompilerOptions.OPTION_TargetPlatform,
-                     CompilerOptions.VERSION_1_7);
-
-        compilerOptions = new CompilerOptions(settings);
-        compilerOptions.parseLiteralExpressionsAsConstants = true;
-
-        try (InputStream input = JavaSourceUDFFactory.class.getResource("JavaSourceUDF.txt").openConnection().getInputStream())
-        {
-            ByteArrayOutputStream output = new ByteArrayOutputStream();
-            FBUtilities.copy(input, output, Long.MAX_VALUE);
-            String template = output.toString();
-
-            StringTokenizer st = new StringTokenizer(template, "#");
-            javaSourceTemplate = new String[st.countTokens()];
-            for (int i = 0; st.hasMoreElements(); i++)
-                javaSourceTemplate[i] = st.nextToken();
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    static UDFunction buildUDF(FunctionName name,
-                               List<ColumnIdentifier> argNames,
-                               List<AbstractType<?>> argTypes,
-                               AbstractType<?> returnType,
-                               boolean calledOnNullInput,
-                               String body)
-    throws InvalidRequestException
-    {
-        // argDataTypes is just the C* internal argTypes converted to the Java Driver DataType
-        DataType[] argDataTypes = UDHelper.driverTypes(argTypes);
-        // returnDataType is just the C* internal returnType converted to the Java Driver DataType
-        DataType returnDataType = UDHelper.driverType(returnType);
-        // javaParamTypes is just the Java representation for argTypes resp. argDataTypes
-        Class<?>[] javaParamTypes = UDHelper.javaTypes(argDataTypes, calledOnNullInput);
-        // javaReturnType is just the Java representation for returnType resp. returnDataType
-        Class<?> javaReturnType = returnDataType.asJavaClass();
-
-        String clsName = generateClassName(name);
-
-        StringBuilder javaSourceBuilder = new StringBuilder();
-        int lineOffset = 1;
-        for (int i = 0; i < javaSourceTemplate.length; i++)
-        {
-            String s = javaSourceTemplate[i];
-
-            // strings at odd indexes are 'instructions'
-            if ((i & 1) == 1)
-            {
-                switch (s)
-                {
-                    case "class_name":
-                        s = clsName;
-                        break;
-                    case "body":
-                        lineOffset = countNewlines(javaSourceBuilder);
-                        s = body;
-                        break;
-                    case "arguments":
-                        s = generateArguments(javaParamTypes, argNames);
-                        break;
-                    case "argument_list":
-                        s = generateArgumentList(javaParamTypes, argNames);
-                        break;
-                    case "return_type":
-                        s = javaSourceName(javaReturnType);
-                        break;
-                }
-            }
-
-            javaSourceBuilder.append(s);
-        }
-
-        String targetClassName = GENERATED_PACKAGE + '.' + clsName;
-
-        String javaSource = javaSourceBuilder.toString();
-
-        logger.trace("Compiling Java source UDF '{}' as class '{}' using source:\n{}", name, targetClassName, javaSource);
-
-        try
-        {
-            EcjCompilationUnit compilationUnit = new EcjCompilationUnit(javaSource, targetClassName);
-
-            Compiler compiler = new Compiler(compilationUnit,
-                                             errorHandlingPolicy,
-                                             compilerOptions,
-                                             compilationUnit,
-                                             problemFactory);
-            compiler.compile(new ICompilationUnit[]{ compilationUnit });
-
-            if (compilationUnit.problemList != null && !compilationUnit.problemList.isEmpty())
-            {
-                boolean fullSource = false;
-                StringBuilder problems = new StringBuilder();
-                for (IProblem problem : compilationUnit.problemList)
-                {
-                    long ln = problem.getSourceLineNumber() - lineOffset;
-                    if (ln < 1L)
-                    {
-                        if (problem.isError())
-                        {
-                            // if generated source around UDF source provided by the user is buggy,
-                            // this code is appended.
-                            problems.append("GENERATED SOURCE ERROR: line ")
-                                    .append(problem.getSourceLineNumber())
-                                    .append(" (in generated source): ")
-                                    .append(problem.getMessage())
-                                    .append('\n');
-                            fullSource = true;
-                        }
-                    }
-                    else
-                    {
-                        problems.append("Line ")
-                                .append(Long.toString(ln))
-                                .append(": ")
-                                .append(problem.getMessage())
-                                .append('\n');
-                    }
-                }
-
-                if (fullSource)
-                    throw new InvalidRequestException("Java source compilation failed:\n" + problems + "\n generated source:\n" + javaSource);
-                else
-                    throw new InvalidRequestException("Java source compilation failed:\n" + problems);
-            }
-
-            Class cls = targetClassLoader.loadClass(targetClassName);
-
-            // Count only non-synthetic methods, so code coverage instrumentation doesn't cause a miscount
-            int nonSyntheticMethodCount = 0;
-            for (Method m : cls.getDeclaredMethods())
-            {
-                if (!m.isSynthetic())
-                {
-                    nonSyntheticMethodCount += 1;
-                }
-            }
-
-            if (nonSyntheticMethodCount != 2 || cls.getDeclaredConstructors().length != 1)
-                throw new InvalidRequestException("Check your source to not define additional Java methods or constructors");
-            MethodType methodType = MethodType.methodType(void.class)
-                                              .appendParameterTypes(FunctionName.class, List.class, List.class, DataType[].class,
-                                                                    AbstractType.class, DataType.class,
-                                                                    boolean.class, String.class);
-            MethodHandle ctor = MethodHandles.lookup().findConstructor(cls, methodType);
-            return (UDFunction) ctor.invokeWithArguments(name, argNames, argTypes, argDataTypes,
-                                                         returnType, returnDataType,
-                                                         calledOnNullInput, body);
-        }
-        catch (InvocationTargetException e)
-        {
-            // in case of an ITE, use the cause
-            throw new InvalidRequestException(String.format("Could not compile function '%s' from Java source: %s", name, e.getCause()));
-        }
-        catch (VirtualMachineError e)
-        {
-            throw e;
-        }
-        catch (Throwable e)
-        {
-            throw new InvalidRequestException(String.format("Could not compile function '%s' from Java source: %s", name, e));
-        }
-    }
-
-    private static int countNewlines(StringBuilder javaSource)
-    {
-        int ln = 0;
-        for (int i = 0; i < javaSource.length(); i++)
-            if (javaSource.charAt(i) == '\n')
-                ln++;
-        return ln;
-    }
-
-    private static String generateClassName(FunctionName name)
-    {
-        String qualifiedName = name.toString();
-
-        StringBuilder sb = new StringBuilder(qualifiedName.length() + 10);
-        sb.append('C');
-        for (int i = 0; i < qualifiedName.length(); i++)
-        {
-            char c = qualifiedName.charAt(i);
-            if (Character.isJavaIdentifierPart(c))
-                sb.append(c);
-        }
-        sb.append('_')
-          .append(classSequence.incrementAndGet());
-        return sb.toString();
-    }
-
-    private static String javaSourceName(Class<?> type)
-    {
-        String n = type.getName();
-        return n.startsWith("java.lang.") ? type.getSimpleName() : n;
-    }
-
-    private static String generateArgumentList(Class<?>[] paramTypes, List<ColumnIdentifier> argNames)
-    {
-        // initial builder size can just be a guess (prevent temp object allocations)
-        StringBuilder code = new StringBuilder(32 * paramTypes.length);
-        for (int i = 0; i < paramTypes.length; i++)
-        {
-            if (i > 0)
-                code.append(", ");
-            code.append(javaSourceName(paramTypes[i]))
-                .append(' ')
-                .append(argNames.get(i));
-        }
-        return code.toString();
-    }
-
-    private static String generateArguments(Class<?>[] paramTypes, List<ColumnIdentifier> argNames)
-    {
-        StringBuilder code = new StringBuilder(64 * paramTypes.length);
-        for (int i = 0; i < paramTypes.length; i++)
-        {
-            if (i > 0)
-                code.append(",\n");
-
-            if (logger.isTraceEnabled())
-                code.append("                /* parameter '").append(argNames.get(i)).append("' */\n");
-
-            code
-                // cast to Java type
-                .append("                (").append(javaSourceName(paramTypes[i])).append(") ")
-                // generate object representation of input parameter (call UDFunction.compose)
-                .append(composeMethod(paramTypes[i])).append("(protocolVersion, ").append(i).append(", params.get(").append(i).append("))");
-        }
-        return code.toString();
-    }
-
-    private static String composeMethod(Class<?> type)
-    {
-        return (type.isPrimitive()) ? ("compose_" + type.getName()) : "compose";
-    }
-
-    // Java source UDFs are a very simple compilation task, which allows us to let one class implement
-    // all interfaces required by ECJ.
-    static final class EcjCompilationUnit implements ICompilationUnit, ICompilerRequestor, INameEnvironment
-    {
-        List<IProblem> problemList;
-        private final String className;
-        private final char[] sourceCode;
-
-        EcjCompilationUnit(String sourceCode, String className)
-        {
-            this.className = className;
-            this.sourceCode = sourceCode.toCharArray();
-        }
-
-        // ICompilationUnit
-
-        @Override
-        public char[] getFileName()
-        {
-            return sourceCode;
-        }
-
-        @Override
-        public char[] getContents()
-        {
-            return sourceCode;
-        }
-
-        @Override
-        public char[] getMainTypeName()
-        {
-            int dot = className.lastIndexOf('.');
-            return ((dot > 0) ? className.substring(dot + 1) : className).toCharArray();
-        }
-
-        @Override
-        public char[][] getPackageName()
-        {
-            StringTokenizer izer = new StringTokenizer(className, ".");
-            char[][] result = new char[izer.countTokens() - 1][];
-            for (int i = 0; i < result.length; i++)
-                result[i] = izer.nextToken().toCharArray();
-            return result;
-        }
-
-        @Override
-        public boolean ignoreOptionalProblems()
-        {
-            return false;
-        }
-
-        // ICompilerRequestor
-
-        @Override
-        public void acceptResult(CompilationResult result)
-        {
-            if (result.hasErrors())
-            {
-                IProblem[] problems = result.getProblems();
-                if (problemList == null)
-                    problemList = new ArrayList<>(problems.length);
-                Collections.addAll(problemList, problems);
-            }
-            else
-            {
-                ClassFile[] classFiles = result.getClassFiles();
-                for (ClassFile classFile : classFiles)
-                    targetClassLoader.addClass(className, classFile.getBytes());
-            }
-        }
-
-        // INameEnvironment
-
-        @Override
-        public NameEnvironmentAnswer findType(char[][] compoundTypeName)
-        {
-            StringBuilder result = new StringBuilder();
-            for (int i = 0; i < compoundTypeName.length; i++)
-            {
-                if (i > 0)
-                    result.append('.');
-                result.append(compoundTypeName[i]);
-            }
-            return findType(result.toString());
-        }
-
-        @Override
-        public NameEnvironmentAnswer findType(char[] typeName, char[][] packageName)
-        {
-            StringBuilder result = new StringBuilder();
-            int i = 0;
-            for (; i < packageName.length; i++)
-            {
-                if (i > 0)
-                    result.append('.');
-                result.append(packageName[i]);
-            }
-            if (i > 0)
-                result.append('.');
-            result.append(typeName);
-            return findType(result.toString());
-        }
-
-        private NameEnvironmentAnswer findType(String className)
-        {
-            if (className.equals(this.className))
-            {
-                return new NameEnvironmentAnswer(this, null);
-            }
-
-            String resourceName = className.replace('.', '/') + ".class";
-
-            try (InputStream is = baseClassLoader.getResourceAsStream(resourceName))
-            {
-                if (is != null)
-                {
-                    byte[] classBytes = ByteStreams.toByteArray(is);
-                    char[] fileName = className.toCharArray();
-                    ClassFileReader classFileReader = new ClassFileReader(classBytes, fileName, true);
-                    return new NameEnvironmentAnswer(classFileReader, null);
-                }
-            }
-            catch (IOException | ClassFormatException exc)
-            {
-                throw new RuntimeException(exc);
-            }
-            return null;
-        }
-
-        private boolean isPackage(String result)
-        {
-            if (result.equals(this.className))
-                return false;
-            String resourceName = result.replace('.', '/') + ".class";
-            try (InputStream is = baseClassLoader.getResourceAsStream(resourceName))
-            {
-                return is == null;
-            }
-            catch (IOException e)
-            {
-                // we are here, since close on is failed. That means it was not null
-                return false;
-            }
-        }
-
-        @Override
-        public boolean isPackage(char[][] parentPackageName, char[] packageName)
-        {
-            StringBuilder result = new StringBuilder();
-            int i = 0;
-            if (parentPackageName != null)
-                for (; i < parentPackageName.length; i++)
-                {
-                    if (i > 0)
-                        result.append('.');
-                    result.append(parentPackageName[i]);
-                }
-
-            if (Character.isUpperCase(packageName[0]) && !isPackage(result.toString()))
-                return false;
-            if (i > 0)
-                result.append('.');
-            result.append(packageName);
-
-            return isPackage(result.toString());
-        }
-
-        @Override
-        public void cleanup()
-        {
-        }
-    }
-
-    static final class EcjTargetClassLoader extends ClassLoader
-    {
-        // This map is usually empty.
-        // It only contains data *during* UDF compilation but not during runtime.
-        //
-        // addClass() is invoked by ECJ after successful compilation of the generated Java source.
-        // loadClass(targetClassName) is invoked by buildUDF() after ECJ returned from successful compilation.
-        //
-        private final Map<String, byte[]> classes = new ConcurrentHashMap<>();
-
-        EcjTargetClassLoader()
-        {
-            super(baseClassLoader);
-        }
-
-        public void addClass(String className, byte[] classData)
-        {
-            classes.put(className, classData);
-        }
-
-        protected Class<?> findClass(String name) throws ClassNotFoundException
-        {
-            // remove the class binary - it's only used once - so it's wasting heap
-            byte[] classData = classes.remove(name);
-
-            return classData != null ? defineClass(name, classData, 0, classData.length)
-                                     : super.findClass(name);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/functions/JavaUDF.java b/src/java/org/apache/cassandra/cql3/functions/JavaUDF.java
new file mode 100644
index 0000000..fcfd21c
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/JavaUDF.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.functions;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+
+/**
+ * Base class for all Java UDFs.
+ * Used to separate internal classes like {@link UDFunction} from user provided code.
+ * Only references <b>to</b> this class (and generated implementations) are allowed -
+ * references from this class back to C* code are not allowed (except argument/return type information).
+ */
+public abstract class JavaUDF
+{
+    private final TypeCodec<Object> returnCodec;
+    private final TypeCodec<Object>[] argCodecs;
+
+    protected JavaUDF(TypeCodec<Object> returnCodec, TypeCodec<Object>[] argCodecs)
+    {
+        this.returnCodec = returnCodec;
+        this.argCodecs = argCodecs;
+    }
+
+    protected abstract ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params);
+
+    protected Object compose(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        return UDFunction.compose(argCodecs, protocolVersion, argIndex, value);
+    }
+
+    protected ByteBuffer decompose(int protocolVersion, Object value)
+    {
+        return UDFunction.decompose(returnCodec, protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected float compose_float(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (float) UDHelper.deserialize(TypeCodec.cfloat(), protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected double compose_double(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (double) UDHelper.deserialize(TypeCodec.cdouble(), protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected byte compose_byte(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (byte) UDHelper.deserialize(TypeCodec.tinyInt(), protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected short compose_short(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (short) UDHelper.deserialize(TypeCodec.smallInt(), protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected int compose_int(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (int) UDHelper.deserialize(TypeCodec.cint(), protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected long compose_long(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (long) UDHelper.deserialize(TypeCodec.bigint(), protocolVersion, value);
+    }
+
+    // do not remove - used by generated Java UDFs
+    protected boolean compose_boolean(int protocolVersion, int argIndex, ByteBuffer value)
+    {
+        assert value != null && value.remaining() > 0;
+        return (boolean) UDHelper.deserialize(TypeCodec.cboolean(), protocolVersion, value);
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/ScriptBasedUDF.java b/src/java/org/apache/cassandra/cql3/functions/ScriptBasedUDF.java
deleted file mode 100644
index 2d46934..0000000
--- a/src/java/org/apache/cassandra/cql3/functions/ScriptBasedUDF.java
+++ /dev/null

@@ -1,155 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.functions;
-
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import javax.script.Bindings;
-import javax.script.Compilable;
-import javax.script.CompiledScript;
-import javax.script.ScriptEngine;
-import javax.script.ScriptEngineFactory;
-import javax.script.ScriptEngineManager;
-import javax.script.ScriptException;
-import javax.script.SimpleBindings;
-
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.exceptions.FunctionExecutionException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-
-public class ScriptBasedUDF extends UDFunction
-{
-    static final Map<String, Compilable> scriptEngines = new HashMap<>();
-
-    static {
-        ScriptEngineManager scriptEngineManager = new ScriptEngineManager();
-        for (ScriptEngineFactory scriptEngineFactory : scriptEngineManager.getEngineFactories())
-        {
-            ScriptEngine scriptEngine = scriptEngineFactory.getScriptEngine();
-            boolean compilable = scriptEngine instanceof Compilable;
-            if (compilable)
-            {
-                logger.info("Found scripting engine {} {} - {} {} - language names: {}",
-                            scriptEngineFactory.getEngineName(), scriptEngineFactory.getEngineVersion(),
-                            scriptEngineFactory.getLanguageName(), scriptEngineFactory.getLanguageVersion(),
-                            scriptEngineFactory.getNames());
-                for (String name : scriptEngineFactory.getNames())
-                    scriptEngines.put(name, (Compilable) scriptEngine);
-            }
-        }
-    }
-
-    private final CompiledScript script;
-
-    ScriptBasedUDF(FunctionName name,
-                   List<ColumnIdentifier> argNames,
-                   List<AbstractType<?>> argTypes,
-                   AbstractType<?> returnType,
-                   boolean calledOnNullInput,
-                   String language,
-                   String body)
-    throws InvalidRequestException
-    {
-        super(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
-
-        if (!"JavaScript".equalsIgnoreCase(language))
-            logger.warn("Support for UDFs using '" + language + "' has been deprecated and removed in 3.0. If '" +
-                        language + "' actually is JavaScript, change the language used in CREATE/ALTER FUNCTION to " +
-                        "'javascript'.");
-
-        Compilable scriptEngine = scriptEngines.get(language);
-        if (scriptEngine == null)
-            throw new InvalidRequestException(String.format("Invalid language '%s' for function '%s'", language, name));
-
-        try
-        {
-            this.script = scriptEngine.compile(body);
-        }
-        catch (RuntimeException | ScriptException e)
-        {
-            logger.info("Failed to compile function '{}' for language {}: ", name, language, e);
-            throw new InvalidRequestException(
-                    String.format("Failed to compile function '%s' for language %s: %s", name, language, e));
-        }
-    }
-
-    public ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> parameters) throws InvalidRequestException
-    {
-        Object[] params = new Object[argTypes.size()];
-        for (int i = 0; i < params.length; i++)
-            params[i] = compose(protocolVersion, i, parameters.get(i));
-
-        try
-        {
-            Bindings bindings = new SimpleBindings();
-            for (int i = 0; i < params.length; i++)
-                bindings.put(argNames.get(i).toString(), params[i]);
-
-            Object result = script.eval(bindings);
-            if (result == null)
-                return null;
-
-            Class<?> javaReturnType = returnDataType.asJavaClass();
-            Class<?> resultType = result.getClass();
-            if (!javaReturnType.isAssignableFrom(resultType))
-            {
-                if (result instanceof Number)
-                {
-                    Number rNumber = (Number) result;
-                    if (javaReturnType == Integer.class)
-                        result = rNumber.intValue();
-                    else if (javaReturnType == Short.class)
-                        result = rNumber.shortValue();
-                    else if (javaReturnType == Byte.class)
-                        result = rNumber.byteValue();
-                    else if (javaReturnType == Long.class)
-                        result = rNumber.longValue();
-                    else if (javaReturnType == Float.class)
-                        result = rNumber.floatValue();
-                    else if (javaReturnType == Double.class)
-                        result = rNumber.doubleValue();
-                    else if (javaReturnType == BigInteger.class)
-                    {
-                        if (rNumber instanceof BigDecimal)
-                            result = ((BigDecimal)rNumber).toBigInteger();
-                        else if (rNumber instanceof Double || rNumber instanceof Float)
-                            result = new BigDecimal(rNumber.toString()).toBigInteger();
-                        else
-                            result = BigInteger.valueOf(rNumber.longValue());
-                    }
-                    else if (javaReturnType == BigDecimal.class)
-                        // String c'tor of BigDecimal is more accurate than valueOf(double)
-                        result = new BigDecimal(rNumber.toString());
-                }
-            }
-
-            return decompose(protocolVersion, result);
-        }
-        catch (RuntimeException | ScriptException e)
-        {
-            logger.trace("Execution of UDF '{}' failed", name, e);
-            throw FunctionExecutionException.create(this, e);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/functions/ScriptBasedUDFunction.java b/src/java/org/apache/cassandra/cql3/functions/ScriptBasedUDFunction.java
new file mode 100644
index 0000000..47deafa
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/ScriptBasedUDFunction.java

@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.functions;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.*;
+import java.nio.ByteBuffer;
+import java.security.*;
+import java.security.cert.Certificate;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+import javax.script.*;
+
+import jdk.nashorn.api.scripting.ClassFilter;
+import jdk.nashorn.api.scripting.NashornScriptEngine;
+import jdk.nashorn.api.scripting.NashornScriptEngineFactory;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+final class ScriptBasedUDFunction extends UDFunction
+{
+    private static final ProtectionDomain protectionDomain;
+    private static final AccessControlContext accessControlContext;
+
+    //
+    // For scripted UDFs we have to rely on the security mechanisms of the scripting engine and
+    // SecurityManager - especially SecurityManager.checkPackageAccess(). Unlike Java-UDFs, strict checking
+    // of class access via the UDF class loader is not possible, since e.g. Nashorn builds its own class loader
+    // (jdk.nashorn.internal.runtime.ScriptLoader / jdk.nashorn.internal.runtime.NashornLoader) configured with
+    // a system class loader.
+    //
+    private static final String[] allowedPackagesArray =
+    {
+    // following required by jdk.nashorn.internal.objects.Global.initJavaAccess()
+    "",
+    "com",
+    "edu",
+    "java",
+    "javax",
+    "javafx",
+    "org",
+    // following required by Nashorn runtime
+    "java.lang",
+    "java.lang.invoke",
+    "java.lang.reflect",
+    "java.nio.charset",
+    "java.util",
+    "java.util.concurrent",
+    "javax.script",
+    "sun.reflect",
+    "jdk.internal.org.objectweb.asm.commons",
+    "jdk.nashorn.internal.runtime",
+    "jdk.nashorn.internal.runtime.linker",
+    // following required by Java Driver
+    "java.math",
+    "java.nio",
+    "java.text",
+    "com.google.common.base",
+    "com.google.common.collect",
+    "com.google.common.reflect",
+    // following required by UDF
+    "com.datastax.driver.core",
+    "com.datastax.driver.core.utils"
+    };
+
+    // use a JVM standard ExecutorService as DebuggableThreadPoolExecutor references internal
+    // classes, which triggers AccessControlException from the UDF sandbox
+    private static final UDFExecutorService executor =
+        new UDFExecutorService(new NamedThreadFactory("UserDefinedScriptFunctions",
+                                                      Thread.MIN_PRIORITY,
+                                                      udfClassLoader,
+                                                      new SecurityThreadGroup("UserDefinedScriptFunctions",
+                                                                              Collections.unmodifiableSet(new HashSet<>(Arrays.asList(allowedPackagesArray))),
+                                                                              UDFunction::initializeThread)),
+                               "userscripts");
+
+    private static final ClassFilter classFilter = clsName -> secureResource(clsName.replace('.', '/') + ".class");
+
+    private static final NashornScriptEngine scriptEngine;
+
+
+    static
+    {
+        ScriptEngineManager scriptEngineManager = new ScriptEngineManager();
+        ScriptEngine engine = scriptEngineManager.getEngineByName("nashorn");
+        NashornScriptEngineFactory factory = engine != null ? (NashornScriptEngineFactory) engine.getFactory() : null;
+        scriptEngine = factory != null ? (NashornScriptEngine) factory.getScriptEngine(new String[]{}, udfClassLoader, classFilter) : null;
+
+        try
+        {
+            protectionDomain = new ProtectionDomain(new CodeSource(new URL("udf", "localhost", 0, "/script", new URLStreamHandler()
+            {
+                protected URLConnection openConnection(URL u)
+                {
+                    return null;
+                }
+            }), (Certificate[]) null), ThreadAwareSecurityManager.noPermissions);
+        }
+        catch (MalformedURLException e)
+        {
+            throw new RuntimeException(e);
+        }
+        accessControlContext = new AccessControlContext(new ProtectionDomain[]{ protectionDomain });
+    }
+
+    private final CompiledScript script;
+
+    ScriptBasedUDFunction(FunctionName name,
+                          List<ColumnIdentifier> argNames,
+                          List<AbstractType<?>> argTypes,
+                          AbstractType<?> returnType,
+                          boolean calledOnNullInput,
+                          String language,
+                          String body)
+    {
+        super(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
+
+        if (!"JavaScript".equalsIgnoreCase(language) || scriptEngine == null)
+            throw new InvalidRequestException(String.format("Invalid language '%s' for function '%s'", language, name));
+
+        // execute compilation with no-permissions to prevent evil code e.g. via "static code blocks" / "class initialization"
+        try
+        {
+            this.script = AccessController.doPrivileged((PrivilegedExceptionAction<CompiledScript>) () -> scriptEngine.compile(body),
+                                                        accessControlContext);
+        }
+        catch (PrivilegedActionException x)
+        {
+            Throwable e = x.getCause();
+            logger.info("Failed to compile function '{}' for language {}: ", name, language, e);
+            throw new InvalidRequestException(
+                                             String.format("Failed to compile function '%s' for language %s: %s", name, language, e));
+        }
+    }
+
+    protected ExecutorService executor()
+    {
+        return executor;
+    }
+
+    public ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> parameters)
+    {
+        Object[] params = new Object[argTypes.size()];
+        for (int i = 0; i < params.length; i++)
+            params[i] = compose(protocolVersion, i, parameters.get(i));
+
+        ScriptContext scriptContext = new SimpleScriptContext();
+        scriptContext.setAttribute("javax.script.filename", this.name.toString(), ScriptContext.ENGINE_SCOPE);
+        Bindings bindings = scriptContext.getBindings(ScriptContext.ENGINE_SCOPE);
+        for (int i = 0; i < params.length; i++)
+            bindings.put(argNames.get(i).toString(), params[i]);
+
+        Object result;
+        try
+        {
+            // How to prevent Class.forName() _without_ "help" from the script engine ?
+            // NOTE: Nashorn enforces a special permission to allow class-loading, which is not granted - so it's fine.
+
+            result = script.eval(scriptContext);
+        }
+        catch (ScriptException e)
+        {
+            throw new RuntimeException(e);
+        }
+        if (result == null)
+            return null;
+
+        Class<?> javaReturnType = UDHelper.asJavaClass(returnCodec);
+        Class<?> resultType = result.getClass();
+        if (!javaReturnType.isAssignableFrom(resultType))
+        {
+            if (result instanceof Number)
+            {
+                Number rNumber = (Number) result;
+                if (javaReturnType == Integer.class)
+                    result = rNumber.intValue();
+                else if (javaReturnType == Long.class)
+                    result = rNumber.longValue();
+                else if (javaReturnType == Short.class)
+                    result = rNumber.shortValue();
+                else if (javaReturnType == Byte.class)
+                    result = rNumber.byteValue();
+                else if (javaReturnType == Float.class)
+                    result = rNumber.floatValue();
+                else if (javaReturnType == Double.class)
+                    result = rNumber.doubleValue();
+                else if (javaReturnType == BigInteger.class)
+                {
+                    if (javaReturnType == Integer.class)
+                        result = rNumber.intValue();
+                    else if (javaReturnType == Short.class)
+                        result = rNumber.shortValue();
+                    else if (javaReturnType == Byte.class)
+                        result = rNumber.byteValue();
+                    else if (javaReturnType == Long.class)
+                        result = rNumber.longValue();
+                    else if (javaReturnType == Float.class)
+                        result = rNumber.floatValue();
+                    else if (javaReturnType == Double.class)
+                        result = rNumber.doubleValue();
+                    else if (javaReturnType == BigInteger.class)
+                    {
+                        if (rNumber instanceof BigDecimal)
+                            result = ((BigDecimal) rNumber).toBigInteger();
+                        else if (rNumber instanceof Double || rNumber instanceof Float)
+                            result = new BigDecimal(rNumber.toString()).toBigInteger();
+                        else
+                            result = BigInteger.valueOf(rNumber.longValue());
+                    }
+                    else if (javaReturnType == BigDecimal.class)
+                        // String c'tor of BigDecimal is more accurate than valueOf(double)
+                        result = new BigDecimal(rNumber.toString());
+                }
+                else if (javaReturnType == BigDecimal.class)
+                    // String c'tor of BigDecimal is more accurate than valueOf(double)
+                    result = new BigDecimal(rNumber.toString());
+            }
+        }
+
+        return decompose(protocolVersion, result);
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/SecurityThreadGroup.java b/src/java/org/apache/cassandra/cql3/functions/SecurityThreadGroup.java
new file mode 100644
index 0000000..8f50dc8
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/SecurityThreadGroup.java

@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.functions;
+
+import java.util.Set;
+
+/**
+ * Used by {@link ThreadAwareSecurityManager} to determine whether access-control checks needs to be performed.
+ */
+public final class SecurityThreadGroup extends ThreadGroup
+{
+    private final Set<String> allowedPackages;
+    private final ThreadInitializer threadInitializer;
+
+    public SecurityThreadGroup(String name, Set<String> allowedPackages, ThreadInitializer threadInitializer)
+    {
+        super(name);
+        this.allowedPackages = allowedPackages;
+        this.threadInitializer = threadInitializer;
+    }
+
+    public void initializeThread()
+    {
+        threadInitializer.initializeThread();
+    }
+
+    public boolean isPackageAllowed(String pkg)
+    {
+        return allowedPackages == null || allowedPackages.contains(pkg);
+    }
+
+    @FunctionalInterface
+    interface ThreadInitializer
+    {
+        void initializeThread();
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/ThreadAwareSecurityManager.java b/src/java/org/apache/cassandra/cql3/functions/ThreadAwareSecurityManager.java
new file mode 100644
index 0000000..13d1945
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/ThreadAwareSecurityManager.java

@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.functions;
+
+import java.security.AccessControlException;
+import java.security.AllPermission;
+import java.security.CodeSource;
+import java.security.Permission;
+import java.security.PermissionCollection;
+import java.security.Permissions;
+import java.security.Policy;
+import java.security.ProtectionDomain;
+import java.util.Collections;
+import java.util.Enumeration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import ch.qos.logback.classic.LoggerContext;
+import ch.qos.logback.classic.spi.TurboFilterList;
+import ch.qos.logback.classic.turbo.ReconfigureOnChangeFilter;
+import ch.qos.logback.classic.turbo.TurboFilter;
+
+/**
+ * Custom {@link SecurityManager} and {@link Policy} implementation that only performs access checks
+ * if explicitly enabled.
+ * <p>
+ * This implementation gives no measurable performance panalty
+ * (see <a href="http://cstar.datastax.com/tests/id/1d461628-12ba-11e5-918f-42010af0688f">see cstar test</a>).
+ * This is better than the penalty of 1 to 3 percent using a standard {@code SecurityManager} with an <i>allow all</i> policy.
+ * </p>
+ */
+public final class ThreadAwareSecurityManager extends SecurityManager
+{
+    static final PermissionCollection noPermissions = new PermissionCollection()
+    {
+        public void add(Permission permission)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public boolean implies(Permission permission)
+        {
+            return false;
+        }
+
+        public Enumeration<Permission> elements()
+        {
+            return Collections.emptyEnumeration();
+        }
+    };
+
+    private static final RuntimePermission CHECK_MEMBER_ACCESS_PERMISSION = new RuntimePermission("accessDeclaredMembers");
+    private static final RuntimePermission MODIFY_THREAD_PERMISSION = new RuntimePermission("modifyThread");
+    private static final RuntimePermission MODIFY_THREADGROUP_PERMISSION = new RuntimePermission("modifyThreadGroup");
+
+    private static volatile boolean installed;
+
+    public static void install()
+    {
+        if (installed)
+            return;
+        System.setSecurityManager(new ThreadAwareSecurityManager());
+
+        // The default logback configuration in conf/logback.xml allows reloading the
+        // configuration when the configuration file has changed (every 60 seconds by default).
+        // This requires logback to use file I/O APIs. But file I/O is not allowed from UDFs.
+        // I.e. if logback decides to check for a modification of the config file while
+        // executiing a sandbox thread, the UDF execution and therefore the whole request
+        // execution will fail with an AccessControlException.
+        // To work around this, a custom ReconfigureOnChangeFilter is installed, that simply
+        // prevents this configuration file check and possible reload of the configration,
+        // while executing sandboxed UDF code.
+        Logger l = LoggerFactory.getLogger(ThreadAwareSecurityManager.class);
+        ch.qos.logback.classic.Logger logbackLogger = (ch.qos.logback.classic.Logger) l;
+        LoggerContext ctx = logbackLogger.getLoggerContext();
+
+        TurboFilterList turboFilterList = ctx.getTurboFilterList();
+        for (int i = 0; i < turboFilterList.size(); i++)
+        {
+            TurboFilter turboFilter = turboFilterList.get(i);
+            if (turboFilter instanceof ReconfigureOnChangeFilter)
+            {
+                ReconfigureOnChangeFilter reconfigureOnChangeFilter = (ReconfigureOnChangeFilter) turboFilter;
+                turboFilterList.set(i, new SMAwareReconfigureOnChangeFilter(reconfigureOnChangeFilter));
+                break;
+            }
+        }
+
+        installed = true;
+    }
+
+    /**
+     * The purpose of this class is to prevent logback from checking for config file change,
+     * if the current thread is executing a sandboxed thread to avoid {@link AccessControlException}s.
+     */
+    private static class SMAwareReconfigureOnChangeFilter extends ReconfigureOnChangeFilter
+    {
+        SMAwareReconfigureOnChangeFilter(ReconfigureOnChangeFilter reconfigureOnChangeFilter)
+        {
+            setRefreshPeriod(reconfigureOnChangeFilter.getRefreshPeriod());
+            setName(reconfigureOnChangeFilter.getName());
+            setContext(reconfigureOnChangeFilter.getContext());
+            if (reconfigureOnChangeFilter.isStarted())
+            {
+                reconfigureOnChangeFilter.stop();
+                start();
+            }
+        }
+
+        protected boolean changeDetected(long now)
+        {
+            if (isSecuredThread())
+                return false;
+            return super.changeDetected(now);
+        }
+    }
+
+    static
+    {
+        //
+        // Use own security policy to be easier (and faster) since the C* has no fine grained permissions.
+        // Either code has access to everything or code has access to nothing (UDFs).
+        // This also removes the burden to maintain and configure policy files for production, unit tests etc.
+        //
+        // Note: a permission is only granted, if there is no objector. This means that
+        // AccessController/AccessControlContext collect all applicable ProtectionDomains - only if none of these
+        // applicable ProtectionDomains denies access, the permission is granted.
+        // A ProtectionDomain can have its origin at an oridinary code-source or provided via a
+        // AccessController.doPrivileded() call.
+        //
+        Policy.setPolicy(new Policy()
+        {
+            public PermissionCollection getPermissions(CodeSource codesource)
+            {
+                // contract of getPermissions() methods is to return a _mutable_ PermissionCollection
+
+                Permissions perms = new Permissions();
+
+                if (codesource == null || codesource.getLocation() == null)
+                    return perms;
+
+                switch (codesource.getLocation().getProtocol())
+                {
+                    case "file":
+                        // All JARs and class files reside on the file system - we can safely
+                        // assume that these classes are "good".
+                        perms.add(new AllPermission());
+                        return perms;
+                }
+
+                return perms;
+            }
+
+            public PermissionCollection getPermissions(ProtectionDomain domain)
+            {
+                return getPermissions(domain.getCodeSource());
+            }
+
+            public boolean implies(ProtectionDomain domain, Permission permission)
+            {
+                CodeSource codesource = domain.getCodeSource();
+                if (codesource == null || codesource.getLocation() == null)
+                    return false;
+
+                switch (codesource.getLocation().getProtocol())
+                {
+                    case "file":
+                        // All JARs and class files reside on the file system - we can safely
+                        // assume that these classes are "good".
+                        return true;
+                }
+
+                return false;
+            }
+        });
+    }
+
+    private static final ThreadLocal<Boolean> initializedThread = new ThreadLocal<>();
+
+    private ThreadAwareSecurityManager()
+    {
+    }
+
+    private static boolean isSecuredThread()
+    {
+        ThreadGroup tg = Thread.currentThread().getThreadGroup();
+        if (!(tg instanceof SecurityThreadGroup))
+            return false;
+        Boolean threadInitialized = initializedThread.get();
+        if (threadInitialized == null)
+        {
+            initializedThread.set(false);
+            ((SecurityThreadGroup) tg).initializeThread();
+            initializedThread.set(true);
+            threadInitialized = true;
+        }
+        return threadInitialized;
+    }
+
+    public void checkAccess(Thread t)
+    {
+        // need to override since the default implementation only checks the permission if the current thread's
+        // in the root-thread-group
+
+        if (isSecuredThread())
+            throw new AccessControlException("access denied: " + MODIFY_THREAD_PERMISSION, MODIFY_THREAD_PERMISSION);
+        super.checkAccess(t);
+    }
+
+    public void checkAccess(ThreadGroup g)
+    {
+        // need to override since the default implementation only checks the permission if the current thread's
+        // in the root-thread-group
+
+        if (isSecuredThread())
+            throw new AccessControlException("access denied: " + MODIFY_THREADGROUP_PERMISSION, MODIFY_THREADGROUP_PERMISSION);
+        super.checkAccess(g);
+    }
+
+    public void checkPermission(Permission perm)
+    {
+        if (!isSecuredThread())
+            return;
+
+        // required by JavaDriver 2.2.0-rc3 and 3.0.0-a2 or newer
+        // code in com.datastax.driver.core.CodecUtils uses Guava stuff, which in turns requires this permission
+        if (CHECK_MEMBER_ACCESS_PERMISSION.equals(perm))
+            return;
+
+        super.checkPermission(perm);
+    }
+
+    public void checkPermission(Permission perm, Object context)
+    {
+        if (isSecuredThread())
+            super.checkPermission(perm, context);
+    }
+
+    public void checkPackageAccess(String pkg)
+    {
+        if (!isSecuredThread())
+            return;
+
+        if (!((SecurityThreadGroup) Thread.currentThread().getThreadGroup()).isPackageAllowed(pkg))
+        {
+            RuntimePermission perm = new RuntimePermission("accessClassInPackage." + pkg);
+            throw new AccessControlException("access denied: " + perm, perm);
+        }
+
+        super.checkPackageAccess(pkg);
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/TimeFcts.java b/src/java/org/apache/cassandra/cql3/functions/TimeFcts.java
index a4623cd..93d6d3b 100644
--- a/src/java/org/apache/cassandra/cql3/functions/TimeFcts.java
+++ b/src/java/org/apache/cassandra/cql3/functions/TimeFcts.java

@@ -18,9 +18,11 @@
 package org.apache.cassandra.cql3.functions;
 
 import java.nio.ByteBuffer;
+import java.util.Collection;
 import java.util.Date;
 import java.util.List;
 
+import com.google.common.collect.ImmutableList;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,6 +35,22 @@
 {
     public static Logger logger = LoggerFactory.getLogger(TimeFcts.class);
 
+    public static Collection<Function> all()
+    {
+        return ImmutableList.of(nowFct,
+                                minTimeuuidFct,
+                                maxTimeuuidFct,
+                                dateOfFct,
+                                unixTimestampOfFct,
+                                timeUuidtoDate,
+                                timeUuidToTimestamp,
+                                timeUuidToUnixTimestamp,
+                                timestampToUnixTimestamp,
+                                timestampToDate,
+                                dateToUnixTimestamp,
+                                dateToTimestamp);
+    }
+
     public static final Function nowFct = new NativeScalarFunction("now", TimeUUIDType.instance)
     {
         public ByteBuffer execute(int protocolVersion, List<ByteBuffer> parameters)

diff --git a/src/java/org/apache/cassandra/cql3/functions/TokenFct.java b/src/java/org/apache/cassandra/cql3/functions/TokenFct.java
index 9d50a97..283ac0b 100644
--- a/src/java/org/apache/cassandra/cql3/functions/TokenFct.java
+++ b/src/java/org/apache/cassandra/cql3/functions/TokenFct.java

@@ -22,22 +22,17 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.composites.CBuilder;
+import org.apache.cassandra.db.CBuilder;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.service.StorageService;
 
 public class TokenFct extends NativeScalarFunction
 {
-    // The actual token function depends on the partitioner used
-    private static final IPartitioner partitioner = StorageService.getPartitioner();
-
     private final CFMetaData cfm;
 
     public TokenFct(CFMetaData cfm)
     {
-        super("token", partitioner.getTokenValidator(), getKeyTypes(cfm));
+        super("token", cfm.partitioner.getTokenValidator(), getKeyTypes(cfm));
         this.cfm = cfm;
     }
 
@@ -52,7 +47,7 @@
 
     public ByteBuffer execute(int protocolVersion, List<ByteBuffer> parameters) throws InvalidRequestException
     {
-        CBuilder builder = cfm.getKeyValidatorAsCType().builder();
+        CBuilder builder = CBuilder.create(cfm.getKeyValidatorAsClusteringComparator());
         for (int i = 0; i < parameters.size(); i++)
         {
             ByteBuffer bb = parameters.get(i);
@@ -60,6 +55,6 @@
                 return null;
             builder.add(bb);
         }
-        return partitioner.getTokenFactory().toByteArray(partitioner.getToken(builder.build().toByteBuffer()));
+        return cfm.partitioner.getTokenFactory().toByteArray(cfm.partitioner.getToken(CFMetaData.serializePartitionKey(builder.build())));
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/functions/UDAggregate.java b/src/java/org/apache/cassandra/cql3/functions/UDAggregate.java
index 5f4d107..96e19de 100644
--- a/src/java/org/apache/cassandra/cql3/functions/UDAggregate.java
+++ b/src/java/org/apache/cassandra/cql3/functions/UDAggregate.java

@@ -21,12 +21,12 @@
 import java.util.*;
 
 import com.google.common.base.Objects;
-import com.google.common.collect.ImmutableSet;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.Functions;
 import org.apache.cassandra.tracing.Tracing;
 
 /**
@@ -55,7 +55,8 @@
         this.initcond = initcond;
     }
 
-    public static UDAggregate create(FunctionName name,
+    public static UDAggregate create(Functions functions,
+                                     FunctionName name,
                                      List<AbstractType<?>> argTypes,
                                      AbstractType<?> returnType,
                                      FunctionName stateFunc,
@@ -71,8 +72,8 @@
         return new UDAggregate(name,
                                argTypes,
                                returnType,
-                               resolveScalar(name, stateFunc, stateTypes),
-                               finalFunc != null ? resolveScalar(name, finalFunc, finalTypes) : null,
+                               resolveScalar(functions, name, stateFunc, stateTypes),
+                               finalFunc != null ? resolveScalar(functions, name, finalFunc, finalTypes) : null,
                                initcond);
     }
 
@@ -99,14 +100,17 @@
         return stateFunction == function || finalFunction == function;
     }
 
-    public Iterable<Function> getFunctions()
+    @Override
+    public void addFunctionsTo(List<Function> functions)
     {
-        if (stateFunction == null)
-            return Collections.emptySet();
-        if (finalFunction != null)
-            return ImmutableSet.of(this, stateFunction, finalFunction);
-        else
-            return ImmutableSet.of(this, stateFunction);
+        functions.add(this);
+        if (stateFunction != null)
+        {
+            stateFunction.addFunctionsTo(functions);
+
+            if (finalFunction != null)
+                finalFunction.addFunctionsTo(functions);
+        }
     }
 
     public boolean isAggregate()
@@ -162,7 +166,7 @@
                 {
                     UDFunction udf = (UDFunction)stateFunction;
                     if (udf.isCallableWrtNullable(fArgs))
-                        state = udf.executeUserDefined(protocolVersion, fArgs);
+                        state = udf.execute(protocolVersion, fArgs);
                 }
                 else
                 {
@@ -192,16 +196,21 @@
         };
     }
 
-    private static ScalarFunction resolveScalar(FunctionName aName, FunctionName fName, List<AbstractType<?>> argTypes) throws InvalidRequestException
+    private static ScalarFunction resolveScalar(Functions functions, FunctionName aName, FunctionName fName, List<AbstractType<?>> argTypes) throws InvalidRequestException
     {
-        Function func = Functions.find(fName, argTypes);
-        if (func == null)
+        Optional<Function> fun = functions.find(fName, argTypes);
+        if (!fun.isPresent())
             throw new InvalidRequestException(String.format("Referenced state function '%s %s' for aggregate '%s' does not exist",
-                                                            fName, Arrays.toString(UDHelper.driverTypes(argTypes)), aName));
-        if (!(func instanceof ScalarFunction))
+                                                            fName,
+                                                            Arrays.toString(UDHelper.driverTypes(argTypes)),
+                                                            aName));
+
+        if (!(fun.get() instanceof ScalarFunction))
             throw new InvalidRequestException(String.format("Referenced state function '%s %s' for aggregate '%s' is not a scalar function",
-                                                            fName, Arrays.toString(UDHelper.driverTypes(argTypes)), aName));
-        return (ScalarFunction) func;
+                                                            fName,
+                                                            Arrays.toString(UDHelper.driverTypes(argTypes)),
+                                                            aName));
+        return (ScalarFunction) fun.get();
     }
 
     @Override
@@ -212,8 +221,8 @@
 
         UDAggregate that = (UDAggregate) o;
         return Objects.equal(name, that.name)
-            && Functions.typeEquals(argTypes, that.argTypes)
-            && Functions.typeEquals(returnType, that.returnType)
+            && Functions.typesMatch(argTypes, that.argTypes)
+            && Functions.typesMatch(returnType, that.returnType)
             && Objects.equal(stateFunction, that.stateFunction)
             && Objects.equal(finalFunction, that.finalFunction)
             && Objects.equal(stateType, that.stateType)

diff --git a/src/java/org/apache/cassandra/cql3/functions/UDFByteCodeVerifier.java b/src/java/org/apache/cassandra/cql3/functions/UDFByteCodeVerifier.java
new file mode 100644
index 0000000..1314af3
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/UDFByteCodeVerifier.java

@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.functions;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+import org.objectweb.asm.ClassReader;
+import org.objectweb.asm.ClassVisitor;
+import org.objectweb.asm.FieldVisitor;
+import org.objectweb.asm.Handle;
+import org.objectweb.asm.MethodVisitor;
+import org.objectweb.asm.Opcodes;
+
+/**
+ * Verifies Java UDF byte code.
+ * Checks for disallowed method calls (e.g. {@code Object.finalize()}),
+ * additional code in the constructor,
+ * use of {@code synchronized} blocks,
+ * too many methods.
+ */
+public final class UDFByteCodeVerifier
+{
+
+    public static final String JAVA_UDF_NAME = JavaUDF.class.getName().replace('.', '/');
+    public static final String OBJECT_NAME = Object.class.getName().replace('.', '/');
+    public static final String CTOR_SIG = "(Lcom/datastax/driver/core/TypeCodec;[Lcom/datastax/driver/core/TypeCodec;)V";
+
+    private final Set<String> disallowedClasses = new HashSet<>();
+    private final Multimap<String, String> disallowedMethodCalls = HashMultimap.create();
+    private final List<String> disallowedPackages = new ArrayList<>();
+
+    public UDFByteCodeVerifier()
+    {
+        addDisallowedMethodCall(OBJECT_NAME, "clone");
+        addDisallowedMethodCall(OBJECT_NAME, "finalize");
+        addDisallowedMethodCall(OBJECT_NAME, "notify");
+        addDisallowedMethodCall(OBJECT_NAME, "notifyAll");
+        addDisallowedMethodCall(OBJECT_NAME, "wait");
+    }
+
+    public UDFByteCodeVerifier addDisallowedClass(String clazz)
+    {
+        disallowedClasses.add(clazz);
+        return this;
+    }
+
+    public UDFByteCodeVerifier addDisallowedMethodCall(String clazz, String method)
+    {
+        disallowedMethodCalls.put(clazz, method);
+        return this;
+    }
+
+    public UDFByteCodeVerifier addDisallowedPackage(String pkg)
+    {
+        disallowedPackages.add(pkg);
+        return this;
+    }
+
+    public Set<String> verify(byte[] bytes)
+    {
+        Set<String> errors = new TreeSet<>(); // it's a TreeSet for unit tests
+        ClassVisitor classVisitor = new ClassVisitor(Opcodes.ASM5)
+        {
+            public FieldVisitor visitField(int access, String name, String desc, String signature, Object value)
+            {
+                errors.add("field declared: " + name);
+                return null;
+            }
+
+            public MethodVisitor visitMethod(int access, String name, String desc, String signature, String[] exceptions)
+            {
+                if ("<init>".equals(name) && CTOR_SIG.equals(desc))
+                {
+                    if (Opcodes.ACC_PUBLIC != access)
+                        errors.add("constructor not public");
+                    // allowed constructor - JavaUDF(TypeCodec returnCodec, TypeCodec[] argCodecs)
+                    return new ConstructorVisitor(errors);
+                }
+                if ("executeImpl".equals(name) && "(ILjava/util/List;)Ljava/nio/ByteBuffer;".equals(desc))
+                {
+                    if (Opcodes.ACC_PROTECTED != access)
+                        errors.add("executeImpl not protected");
+                    // the executeImpl method - ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+                    return new ExecuteImplVisitor(errors);
+                }
+                if ("<clinit>".equals(name))
+                {
+                    errors.add("static initializer declared");
+                }
+                else
+                {
+                    errors.add("not allowed method declared: " + name + desc);
+                    return new ExecuteImplVisitor(errors);
+                }
+                return null;
+            }
+
+            public void visit(int version, int access, String name, String signature, String superName, String[] interfaces)
+            {
+                if (!JAVA_UDF_NAME.equals(superName))
+                {
+                    errors.add("class does not extend " + JavaUDF.class.getName());
+                }
+                if (access != (Opcodes.ACC_PUBLIC | Opcodes.ACC_FINAL | Opcodes.ACC_SUPER))
+                {
+                    errors.add("class not public final");
+                }
+                super.visit(version, access, name, signature, superName, interfaces);
+            }
+
+            public void visitInnerClass(String name, String outerName, String innerName, int access)
+            {
+                errors.add("class declared as inner class");
+                super.visitInnerClass(name, outerName, innerName, access);
+            }
+        };
+
+        ClassReader classReader = new ClassReader(bytes);
+        classReader.accept(classVisitor, ClassReader.SKIP_DEBUG);
+
+        return errors;
+    }
+
+    private class ExecuteImplVisitor extends MethodVisitor
+    {
+        private final Set<String> errors;
+
+        ExecuteImplVisitor(Set<String> errors)
+        {
+            super(Opcodes.ASM5);
+            this.errors = errors;
+        }
+
+        public void visitMethodInsn(int opcode, String owner, String name, String desc, boolean itf)
+        {
+            if (disallowedClasses.contains(owner))
+            {
+                errorDisallowed(owner, name);
+            }
+            Collection<String> disallowed = disallowedMethodCalls.get(owner);
+            if (disallowed != null && disallowed.contains(name))
+            {
+                errorDisallowed(owner, name);
+            }
+            if (!JAVA_UDF_NAME.equals(owner))
+            {
+                for (String pkg : disallowedPackages)
+                {
+                    if (owner.startsWith(pkg))
+                        errorDisallowed(owner, name);
+                }
+            }
+            super.visitMethodInsn(opcode, owner, name, desc, itf);
+        }
+
+        private void errorDisallowed(String owner, String name)
+        {
+            errors.add("call to " + owner.replace('/', '.') + '.' + name + "()");
+        }
+
+        public void visitInsn(int opcode)
+        {
+            switch (opcode)
+            {
+                case Opcodes.MONITORENTER:
+                case Opcodes.MONITOREXIT:
+                    errors.add("use of synchronized");
+                    break;
+            }
+            super.visitInsn(opcode);
+        }
+    }
+
+    private static class ConstructorVisitor extends MethodVisitor
+    {
+        private final Set<String> errors;
+
+        ConstructorVisitor(Set<String> errors)
+        {
+            super(Opcodes.ASM5);
+            this.errors = errors;
+        }
+
+        public void visitInvokeDynamicInsn(String name, String desc, Handle bsm, Object... bsmArgs)
+        {
+            errors.add("Use of invalid method instruction in constructor");
+            super.visitInvokeDynamicInsn(name, desc, bsm, bsmArgs);
+        }
+
+        public void visitMethodInsn(int opcode, String owner, String name, String desc, boolean itf)
+        {
+            if (!(Opcodes.INVOKESPECIAL == opcode && JAVA_UDF_NAME.equals(owner) && "<init>".equals(name) && CTOR_SIG.equals(desc)))
+            {
+                errors.add("initializer declared");
+            }
+            super.visitMethodInsn(opcode, owner, name, desc, itf);
+        }
+
+        public void visitInsn(int opcode)
+        {
+            if (Opcodes.RETURN != opcode)
+            {
+                errors.add("initializer declared");
+            }
+            super.visitInsn(opcode);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/UDFExecutorService.java b/src/java/org/apache/cassandra/cql3/functions/UDFExecutorService.java
new file mode 100644
index 0000000..5e08ad8
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/functions/UDFExecutorService.java

@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.functions;
+
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Executor service which exposes stats via JMX, but which doesn't reference
+ * internal classes in its beforeExecute & afterExecute methods as these are
+ * forbidden by the UDF execution sandbox
+ */
+final class UDFExecutorService extends JMXEnabledThreadPoolExecutor
+{
+    private static int KEEPALIVE = Integer.getInteger("cassandra.udf_executor_thread_keepalive_ms", 30000);
+
+    UDFExecutorService(NamedThreadFactory threadFactory, String jmxPath)
+    {
+        super(FBUtilities.getAvailableProcessors(),
+              KEEPALIVE,
+              TimeUnit.MILLISECONDS,
+              new LinkedBlockingQueue<>(),
+              threadFactory,
+              jmxPath);
+    }
+
+    protected void afterExecute(Runnable r, Throwable t)
+    {
+    }
+
+    protected void beforeExecute(Thread t, Runnable r)
+    {
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/functions/UDFunction.java b/src/java/org/apache/cassandra/cql3/functions/UDFunction.java
index 1e5cea6..27f9eb8 100644
--- a/src/java/org/apache/cassandra/cql3/functions/UDFunction.java
+++ b/src/java/org/apache/cassandra/cql3/functions/UDFunction.java

@@ -17,25 +17,45 @@
  */
 package org.apache.cassandra.cql3.functions;
 
+import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadMXBean;
+import java.net.InetAddress;
+import java.net.URL;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import com.google.common.base.Objects;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.datastax.driver.core.DataType;
-import com.datastax.driver.core.ProtocolVersion;
+import com.datastax.driver.core.TypeCodec;
 import com.datastax.driver.core.UserType;
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.exceptions.FunctionExecutionException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.Functions;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * Base class for User Defined Functions.
@@ -44,15 +64,118 @@
 {
     protected static final Logger logger = LoggerFactory.getLogger(UDFunction.class);
 
+    static final ThreadMXBean threadMXBean = ManagementFactory.getThreadMXBean();
+
     protected final List<ColumnIdentifier> argNames;
 
     protected final String language;
     protected final String body;
 
-    protected final DataType[] argDataTypes;
-    protected final DataType returnDataType;
+    protected final TypeCodec<Object>[] argCodecs;
+    protected final TypeCodec<Object> returnCodec;
     protected final boolean calledOnNullInput;
 
+    //
+    // Access to classes is controlled via allow and disallow lists.
+    //
+    // When a class is requested (both during compilation and runtime),
+    // the allowedPatterns array is searched first, whether the
+    // requested name matches one of the patterns. If not, nothing is
+    // returned from the class-loader - meaning ClassNotFoundException
+    // during runtime and "type could not resolved" during compilation.
+    //
+    // If an allowed pattern has been found, the disallowedPatterns
+    // array is searched for a match. If a match is found, class-loader
+    // rejects access. Otherwise the class/resource can be loaded.
+    //
+    private static final String[] allowedPatterns =
+    {
+    "com/datastax/driver/core/",
+    "com/google/common/reflect/TypeToken",
+    "java/io/IOException.class",
+    "java/io/Serializable.class",
+    "java/lang/",
+    "java/math/",
+    "java/net/InetAddress.class",
+    "java/net/Inet4Address.class",
+    "java/net/Inet6Address.class",
+    "java/net/UnknownHostException.class", // req'd by InetAddress
+    "java/net/NetworkInterface.class", // req'd by InetAddress
+    "java/net/SocketException.class", // req'd by InetAddress
+    "java/nio/Buffer.class",
+    "java/nio/ByteBuffer.class",
+    "java/text/",
+    "java/time/",
+    "java/util/",
+    "org/apache/cassandra/cql3/functions/JavaUDF.class",
+    "org/apache/cassandra/exceptions/",
+    };
+    // Only need to disallow a pattern, if it would otherwise be allowed via allowedPatterns
+    private static final String[] disallowedPatterns =
+    {
+    "com/datastax/driver/core/Cluster.class",
+    "com/datastax/driver/core/Metrics.class",
+    "com/datastax/driver/core/NettyOptions.class",
+    "com/datastax/driver/core/Session.class",
+    "com/datastax/driver/core/Statement.class",
+    "com/datastax/driver/core/TimestampGenerator.class", // indirectly covers ServerSideTimestampGenerator + ThreadLocalMonotonicTimestampGenerator
+    "java/lang/Compiler.class",
+    "java/lang/InheritableThreadLocal.class",
+    "java/lang/Package.class",
+    "java/lang/Process.class",
+    "java/lang/ProcessBuilder.class",
+    "java/lang/ProcessEnvironment.class",
+    "java/lang/ProcessImpl.class",
+    "java/lang/Runnable.class",
+    "java/lang/Runtime.class",
+    "java/lang/Shutdown.class",
+    "java/lang/Thread.class",
+    "java/lang/ThreadGroup.class",
+    "java/lang/ThreadLocal.class",
+    "java/lang/instrument/",
+    "java/lang/invoke/",
+    "java/lang/management/",
+    "java/lang/ref/",
+    "java/lang/reflect/",
+    "java/util/ServiceLoader.class",
+    "java/util/Timer.class",
+    "java/util/concurrent/",
+    "java/util/function/",
+    "java/util/jar/",
+    "java/util/logging/",
+    "java/util/prefs/",
+    "java/util/spi/",
+    "java/util/stream/",
+    "java/util/zip/",
+    };
+
+    static boolean secureResource(String resource)
+    {
+        while (resource.startsWith("/"))
+            resource = resource.substring(1);
+
+        for (String allowed : allowedPatterns)
+            if (resource.startsWith(allowed))
+            {
+
+                // resource is in allowedPatterns, let's see if it is not explicitly disallowed
+                for (String disallowed : disallowedPatterns)
+                    if (resource.startsWith(disallowed))
+                    {
+                        logger.trace("access denied: resource {}", resource);
+                        return false;
+                    }
+
+                return true;
+            }
+
+        logger.trace("access denied: resource {}", resource);
+        return false;
+    }
+
+    // setup the UDF class loader with no parent class loader so that we have full control about what class/resource UDF uses
+    static final ClassLoader udfClassLoader = new UDFClassLoader();
+
     protected UDFunction(FunctionName name,
                          List<ColumnIdentifier> argNames,
                          List<AbstractType<?>> argTypes,
@@ -80,8 +203,8 @@
         this.argNames = argNames;
         this.language = language;
         this.body = body;
-        this.argDataTypes = argDataTypes;
-        this.returnDataType = returnDataType;
+        this.argCodecs = UDHelper.codecsFor(argDataTypes);
+        this.returnCodec = UDHelper.codecFor(returnDataType);
         this.calledOnNullInput = calledOnNullInput;
     }
 
@@ -92,15 +215,15 @@
                                     boolean calledOnNullInput,
                                     String language,
                                     String body)
-    throws InvalidRequestException
     {
-        if (!DatabaseDescriptor.enableUserDefinedFunctions())
-            throw new InvalidRequestException("User-defined-functions are disabled in cassandra.yaml - set enable_user_defined_functions=true to enable if you are aware of the security risks");
+        UDFunction.assertUdfsEnabled(language);
 
         switch (language)
         {
-            case "java": return JavaSourceUDFFactory.buildUDF(name, argNames, argTypes, returnType, calledOnNullInput, body);
-            default: return new ScriptBasedUDF(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
+            case "java":
+                return new JavaBasedUDFunction(name, argNames, argTypes, returnType, calledOnNullInput, body);
+            default:
+                return new ScriptBasedUDFunction(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
         }
     }
 
@@ -120,11 +243,16 @@
                                                   boolean calledOnNullInput,
                                                   String language,
                                                   String body,
-                                                  final InvalidRequestException reason)
+                                                  InvalidRequestException reason)
     {
         return new UDFunction(name, argNames, argTypes, returnType, calledOnNullInput, language, body)
         {
-            public ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> parameters) throws InvalidRequestException
+            protected ExecutorService executor()
+            {
+                return Executors.newSingleThreadExecutor();
+            }
+
+            public ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> parameters)
             {
                 throw new InvalidRequestException(String.format("Function '%s' exists but hasn't been loaded successfully "
                                                                 + "for the following reason: %s. Please see the server log for details",
@@ -134,20 +262,172 @@
         };
     }
 
-    public final ByteBuffer execute(int protocolVersion, List<ByteBuffer> parameters) throws InvalidRequestException
+    public final ByteBuffer execute(int protocolVersion, List<ByteBuffer> parameters)
     {
-        if (!DatabaseDescriptor.enableUserDefinedFunctions())
-            throw new InvalidRequestException("User-defined-functions are disabled in cassandra.yaml - set enable_user_defined_functions=true to enable if you are aware of the security risks");
+        assertUdfsEnabled(language);
 
         if (!isCallableWrtNullable(parameters))
             return null;
 
         long tStart = System.nanoTime();
-        ByteBuffer result = executeUserDefined(protocolVersion, parameters);
-        Tracing.trace("Executed UDF {} in {}\u03bcs", name(), (System.nanoTime() - tStart) / 1000);
-        return result;
+        parameters = makeEmptyParametersNull(parameters);
+
+        try
+        {
+            // Using async UDF execution is expensive (adds about 100us overhead per invocation on a Core-i7 MBPr).
+            ByteBuffer result = DatabaseDescriptor.enableUserDefinedFunctionsThreads()
+                                ? executeAsync(protocolVersion, parameters)
+                                : executeUserDefined(protocolVersion, parameters);
+
+            Tracing.trace("Executed UDF {} in {}\u03bcs", name(), (System.nanoTime() - tStart) / 1000);
+            return result;
+        }
+        catch (InvalidRequestException e)
+        {
+            throw e;
+        }
+        catch (Throwable t)
+        {
+            logger.trace("Invocation of user-defined function '{}' failed", this, t);
+            if (t instanceof VirtualMachineError)
+                throw (VirtualMachineError) t;
+            throw FunctionExecutionException.create(this, t);
+        }
     }
 
+    public static void assertUdfsEnabled(String language)
+    {
+        if (!DatabaseDescriptor.enableUserDefinedFunctions())
+            throw new InvalidRequestException("User-defined functions are disabled in cassandra.yaml - set enable_user_defined_functions=true to enable");
+        if (!"java".equalsIgnoreCase(language) && !DatabaseDescriptor.enableScriptedUserDefinedFunctions())
+            throw new InvalidRequestException("Scripted user-defined functions are disabled in cassandra.yaml - set enable_scripted_user_defined_functions=true to enable if you are aware of the security risks");
+    }
+
+    static void initializeThread()
+    {
+        // Get the TypeCodec stuff in Java Driver initialized.
+        // This is to get the classes loaded outside of the restricted sandbox's security context of a UDF.
+        TypeCodec.inet().format(InetAddress.getLoopbackAddress());
+        TypeCodec.ascii().format("");
+    }
+
+    private static final class ThreadIdAndCpuTime extends CompletableFuture<Object>
+    {
+        long threadId;
+        long cpuTime;
+
+        ThreadIdAndCpuTime()
+        {
+            // Looks weird?
+            // This call "just" links this class to java.lang.management - otherwise UDFs (script UDFs) might fail due to
+            //      java.security.AccessControlException: access denied: ("java.lang.RuntimePermission" "accessClassInPackage.java.lang.management")
+            // because class loading would be deferred until setup() is executed - but setup() is called with
+            // limited privileges.
+            threadMXBean.getCurrentThreadCpuTime();
+        }
+
+        void setup()
+        {
+            this.threadId = Thread.currentThread().getId();
+            this.cpuTime = threadMXBean.getCurrentThreadCpuTime();
+            complete(null);
+        }
+    }
+
+    private ByteBuffer executeAsync(int protocolVersion, List<ByteBuffer> parameters)
+    {
+        ThreadIdAndCpuTime threadIdAndCpuTime = new ThreadIdAndCpuTime();
+
+        Future<ByteBuffer> future = executor().submit(() -> {
+            threadIdAndCpuTime.setup();
+            return executeUserDefined(protocolVersion, parameters);
+        });
+
+        try
+        {
+            if (DatabaseDescriptor.getUserDefinedFunctionWarnTimeout() > 0)
+                try
+                {
+                    return future.get(DatabaseDescriptor.getUserDefinedFunctionWarnTimeout(), TimeUnit.MILLISECONDS);
+                }
+                catch (TimeoutException e)
+                {
+
+                    // log and emit a warning that UDF execution took long
+                    String warn = String.format("User defined function %s ran longer than %dms", this, DatabaseDescriptor.getUserDefinedFunctionWarnTimeout());
+                    logger.warn(warn);
+                    ClientWarn.instance.warn(warn);
+                }
+
+            // retry with difference of warn-timeout to fail-timeout
+            return future.get(DatabaseDescriptor.getUserDefinedFunctionFailTimeout() - DatabaseDescriptor.getUserDefinedFunctionWarnTimeout(), TimeUnit.MILLISECONDS);
+        }
+        catch (InterruptedException e)
+        {
+            Thread.currentThread().interrupt();
+            throw new RuntimeException(e);
+        }
+        catch (ExecutionException e)
+        {
+            Throwable c = e.getCause();
+            if (c instanceof RuntimeException)
+                throw (RuntimeException) c;
+            throw new RuntimeException(c);
+        }
+        catch (TimeoutException e)
+        {
+            // retry a last time with the difference of UDF-fail-timeout to consumed CPU time (just in case execution hit a badly timed GC)
+            try
+            {
+                //The threadIdAndCpuTime shouldn't take a long time to be set so this should return immediately
+                threadIdAndCpuTime.get(1, TimeUnit.SECONDS);
+
+                long cpuTimeMillis = threadMXBean.getThreadCpuTime(threadIdAndCpuTime.threadId) - threadIdAndCpuTime.cpuTime;
+                cpuTimeMillis /= 1000000L;
+
+                return future.get(Math.max(DatabaseDescriptor.getUserDefinedFunctionFailTimeout() - cpuTimeMillis, 0L),
+                                  TimeUnit.MILLISECONDS);
+            }
+            catch (InterruptedException e1)
+            {
+                Thread.currentThread().interrupt();
+                throw new RuntimeException(e);
+            }
+            catch (ExecutionException e1)
+            {
+                Throwable c = e.getCause();
+                if (c instanceof RuntimeException)
+                    throw (RuntimeException) c;
+                throw new RuntimeException(c);
+            }
+            catch (TimeoutException e1)
+            {
+                TimeoutException cause = new TimeoutException(String.format("User defined function %s ran longer than %dms%s",
+                                                                            this,
+                                                                            DatabaseDescriptor.getUserDefinedFunctionFailTimeout(),
+                                                                            DatabaseDescriptor.getUserFunctionTimeoutPolicy() == Config.UserFunctionTimeoutPolicy.ignore
+                                                                            ? "" : " - will stop Cassandra VM"));
+                FunctionExecutionException fe = FunctionExecutionException.create(this, cause);
+                JVMStabilityInspector.userFunctionTimeout(cause);
+                throw fe;
+            }
+        }
+    }
+
+    private List<ByteBuffer> makeEmptyParametersNull(List<ByteBuffer> parameters)
+    {
+        List<ByteBuffer> r = new ArrayList<>(parameters.size());
+        for (int i = 0; i < parameters.size(); i++)
+        {
+            ByteBuffer param = parameters.get(i);
+            r.add(UDHelper.isNullOrEmpty(argTypes.get(i), param)
+                  ? null : param);
+        }
+        return r;
+    }
+
+    protected abstract ExecutorService executor();
+
     public boolean isCallableWrtNullable(List<ByteBuffer> parameters)
     {
         if (!calledOnNullInput)
@@ -157,7 +437,7 @@
         return true;
     }
 
-    protected abstract ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> parameters) throws InvalidRequestException;
+    protected abstract ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> parameters);
 
     public boolean isAggregate()
     {
@@ -190,77 +470,38 @@
     }
 
     /**
-     * Used by UDF implementations (both Java code generated by {@link org.apache.cassandra.cql3.functions.JavaSourceUDFFactory}
-     * and script executor {@link org.apache.cassandra.cql3.functions.ScriptBasedUDF}) to convert the C*
+     * Used by UDF implementations (both Java code generated by {@link JavaBasedUDFunction}
+     * and script executor {@link ScriptBasedUDFunction}) to convert the C*
      * serialized representation to the Java object representation.
      *
      * @param protocolVersion the native protocol version used for serialization
-     * @param argIndex index of the UDF input argument
+     * @param argIndex        index of the UDF input argument
      */
     protected Object compose(int protocolVersion, int argIndex, ByteBuffer value)
     {
-        return UDHelper.isNullOrEmpty(argTypes.get(argIndex), value) ? null : argDataTypes[argIndex].deserialize(value, ProtocolVersion.fromInt(protocolVersion));
+        return compose(argCodecs, protocolVersion, argIndex, value);
     }
 
-    // do not remove - used by generated Java UDFs
-    protected float compose_float(int protocolVersion, int argIndex, ByteBuffer value)
+    protected static Object compose(TypeCodec<Object>[] codecs, int protocolVersion, int argIndex, ByteBuffer value)
     {
-        assert value != null && value.remaining() > 0;
-        return (float)DataType.cfloat().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
-    }
-
-    // do not remove - used by generated Java UDFs
-    protected double compose_double(int protocolVersion, int argIndex, ByteBuffer value)
-    {
-        assert value != null && value.remaining() > 0;
-        return (double)DataType.cdouble().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
-    }
-
-    // do not remove - used by generated Java UDFs
-    protected byte compose_byte(int protocolVersion, int argIndex, ByteBuffer value)
-    {
-        assert value != null && value.remaining() > 0;
-        return (byte)DataType.tinyint().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
-    }
-
-    // do not remove - used by generated Java UDFs
-    protected short compose_short(int protocolVersion, int argIndex, ByteBuffer value)
-    {
-        assert value != null && value.remaining() > 0;
-        return (short)DataType.smallint().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
-    }
-
-    // do not remove - used by generated Java UDFs
-    protected int compose_int(int protocolVersion, int argIndex, ByteBuffer value)
-    {
-        assert value != null && value.remaining() > 0;
-        return (int)DataType.cint().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
-    }
-
-    // do not remove - used by generated Java UDFs
-    protected long compose_long(int protocolVersion, int argIndex, ByteBuffer value)
-    {
-        assert value != null && value.remaining() > 0;
-        return (long)DataType.bigint().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
-    }
-
-    // do not remove - used by generated Java UDFs
-    protected boolean compose_boolean(int protocolVersion, int argIndex, ByteBuffer value)
-    {
-        assert value != null && value.remaining() > 0;
-        return (boolean) DataType.cboolean().deserialize(value, ProtocolVersion.fromInt(protocolVersion));
+        return value == null ? null : UDHelper.deserialize(codecs[argIndex], protocolVersion, value);
     }
 
     /**
-     * Used by UDF implementations (both Java code generated by {@link org.apache.cassandra.cql3.functions.JavaSourceUDFFactory}
-     * and script executor {@link org.apache.cassandra.cql3.functions.ScriptBasedUDF}) to convert the Java
+     * Used by UDF implementations (both Java code generated by {@link JavaBasedUDFunction}
+     * and script executor {@link ScriptBasedUDFunction}) to convert the Java
      * object representation for the return value to the C* serialized representation.
      *
      * @param protocolVersion the native protocol version used for serialization
      */
     protected ByteBuffer decompose(int protocolVersion, Object value)
     {
-        return value == null ? null : returnDataType.serialize(value, ProtocolVersion.fromInt(protocolVersion));
+        return decompose(returnCodec, protocolVersion, value);
+    }
+
+    protected static ByteBuffer decompose(TypeCodec<Object> codec, int protocolVersion, Object value)
+    {
+        return value == null ? null : UDHelper.serialize(codec, protocolVersion, value);
     }
 
     @Override
@@ -272,8 +513,8 @@
         UDFunction that = (UDFunction)o;
         return Objects.equal(name, that.name)
             && Objects.equal(argNames, that.argNames)
-            && Functions.typeEquals(argTypes, that.argTypes)
-            && Functions.typeEquals(returnType, that.returnType)
+            && Functions.typesMatch(argTypes, that.argTypes)
+            && Functions.typesMatch(returnType, that.returnType)
             && Objects.equal(language, that.language)
             && Objects.equal(body, that.body);
     }
@@ -288,21 +529,21 @@
     {
         boolean updated = false;
 
-        for (int i = 0; i < argDataTypes.length; i++)
+        for (int i = 0; i < argCodecs.length; i++)
         {
-            DataType dataType = argDataTypes[i];
+            DataType dataType = argCodecs[i].getCqlType();
             if (dataType instanceof UserType)
             {
                 UserType userType = (UserType) dataType;
                 if (userType.getKeyspace().equals(ksName) && userType.getTypeName().equals(typeName))
                 {
-                    KSMetaData ksm = Schema.instance.getKSMetaData(ksName);
+                    KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ksName);
                     assert ksm != null;
 
-                    org.apache.cassandra.db.marshal.UserType ut = ksm.userTypes.getType(ByteBufferUtil.bytes(typeName));
+                    org.apache.cassandra.db.marshal.UserType ut = ksm.types.get(ByteBufferUtil.bytes(typeName)).get();
 
                     DataType newUserType = UDHelper.driverType(ut);
-                    argDataTypes[i] = newUserType;
+                    argCodecs[i] = UDHelper.codecFor(newUserType);
 
                     argTypes.set(i, ut);
 
@@ -314,4 +555,41 @@
         if (updated)
             MigrationManager.announceNewFunction(this, true);
     }
+
+    private static class UDFClassLoader extends ClassLoader
+    {
+        // insecureClassLoader is the C* class loader
+        static final ClassLoader insecureClassLoader = Thread.currentThread().getContextClassLoader();
+
+        public URL getResource(String name)
+        {
+            if (!secureResource(name))
+                return null;
+            return insecureClassLoader.getResource(name);
+        }
+
+        protected URL findResource(String name)
+        {
+            return getResource(name);
+        }
+
+        public Enumeration<URL> getResources(String name)
+        {
+            return Collections.emptyEnumeration();
+        }
+
+        protected Class<?> findClass(String name) throws ClassNotFoundException
+        {
+            if (!secureResource(name.replace('.', '/') + ".class"))
+                throw new ClassNotFoundException(name);
+            return insecureClassLoader.loadClass(name);
+        }
+
+        public Class<?> loadClass(String name) throws ClassNotFoundException
+        {
+            if (!secureResource(name.replace('.', '/') + ".class"))
+                throw new ClassNotFoundException(name);
+            return super.loadClass(name);
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/functions/UDHelper.java b/src/java/org/apache/cassandra/cql3/functions/UDHelper.java
index d1d12e6..45c734f 100644
--- a/src/java/org/apache/cassandra/cql3/functions/UDHelper.java
+++ b/src/java/org/apache/cassandra/cql3/functions/UDHelper.java

@@ -23,9 +23,14 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
+import com.datastax.driver.core.CodecRegistry;
 import com.datastax.driver.core.DataType;
+import com.datastax.driver.core.ProtocolVersion;
+import com.datastax.driver.core.TypeCodec;
+import com.datastax.driver.core.exceptions.InvalidTypeException;
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.transport.Server;
 
 /**
  * Helper class for User Defined Functions + Aggregates.
@@ -33,15 +38,17 @@
 public final class UDHelper
 {
     // TODO make these c'tors and methods public in Java-Driver - see https://datastax-oss.atlassian.net/browse/JAVA-502
-    static final MethodHandle methodParseOne;
+    private static final MethodHandle methodParseOne;
+    private static final CodecRegistry codecRegistry;
     static
     {
         try
         {
-            Class<?> cls = Class.forName("com.datastax.driver.core.CassandraTypeParser");
-            Method m = cls.getDeclaredMethod("parseOne", String.class);
+            Class<?> cls = Class.forName("com.datastax.driver.core.DataTypeClassNameParser");
+            Method m = cls.getDeclaredMethod("parseOne", String.class, ProtocolVersion.class, CodecRegistry.class);
             m.setAccessible(true);
             methodParseOne = MethodHandles.lookup().unreflect(m);
+            codecRegistry = new CodecRegistry();
         }
         catch (Exception e)
         {
@@ -49,6 +56,19 @@
         }
     }
 
+    static TypeCodec<Object>[] codecsFor(DataType[] dataType)
+    {
+        TypeCodec<Object>[] codecs = new TypeCodec[dataType.length];
+        for (int i = 0; i < dataType.length; i++)
+            codecs[i] = codecFor(dataType[i]);
+        return codecs;
+    }
+
+    static TypeCodec<Object> codecFor(DataType dataType)
+    {
+        return codecRegistry.codecFor(dataType);
+    }
+
     /**
      * Construct an array containing the Java classes for the given Java Driver {@link com.datastax.driver.core.DataType}s.
      *
@@ -56,12 +76,12 @@
      * @param calledOnNullInput whether to allow {@code null} as an argument value
      * @return array of same size with UDF arguments
      */
-    public static Class<?>[] javaTypes(DataType[] dataTypes, boolean calledOnNullInput)
+    public static Class<?>[] javaTypes(TypeCodec<Object>[] dataTypes, boolean calledOnNullInput)
     {
         Class<?>[] paramTypes = new Class[dataTypes.length];
         for (int i = 0; i < paramTypes.length; i++)
         {
-            Class<?> clazz = dataTypes[i].asJavaClass();
+            Class<?> clazz = asJavaClass(dataTypes[i]);
             if (!calledOnNullInput)
             {
                 // only care about classes that can be used in a data type
@@ -108,7 +128,9 @@
         CQL3Type cqlType = abstractType.asCQL3Type();
         try
         {
-            return (DataType) methodParseOne.invoke(cqlType.getType().toString());
+            return (DataType) methodParseOne.invoke(cqlType.getType().toString(),
+                                                    ProtocolVersion.fromInt(Server.CURRENT_VERSION),
+                                                    codecRegistry);
         }
         catch (RuntimeException | Error e)
         {
@@ -121,6 +143,24 @@
         }
     }
 
+    public static Object deserialize(TypeCodec<?> codec, int protocolVersion, ByteBuffer value)
+    {
+        return codec.deserialize(value, ProtocolVersion.fromInt(protocolVersion));
+    }
+
+    public static ByteBuffer serialize(TypeCodec<?> codec, int protocolVersion, Object value)
+    {
+        if (!codec.getJavaType().getRawType().isAssignableFrom(value.getClass()))
+            throw new InvalidTypeException("Invalid value for CQL type " + codec.getCqlType().getName().toString());
+
+        return ((TypeCodec)codec).serialize(value, ProtocolVersion.fromInt(protocolVersion));
+    }
+
+    public static Class<?> asJavaClass(TypeCodec<?> codec)
+    {
+        return codec.getJavaType().getRawType();
+    }
+
     public static boolean isNullOrEmpty(AbstractType<?> type, ByteBuffer bb)
     {
         return bb == null ||

diff --git a/src/java/org/apache/cassandra/cql3/functions/UuidFcts.java b/src/java/org/apache/cassandra/cql3/functions/UuidFcts.java
index 0aa3ac4..32adbdc 100644
--- a/src/java/org/apache/cassandra/cql3/functions/UuidFcts.java
+++ b/src/java/org/apache/cassandra/cql3/functions/UuidFcts.java

@@ -18,14 +18,18 @@
 package org.apache.cassandra.cql3.functions;
 
 import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.UUID;
+import java.util.*;
 
 import org.apache.cassandra.db.marshal.UUIDType;
 import org.apache.cassandra.serializers.UUIDSerializer;
 
 public abstract class UuidFcts
 {
+    public static Collection<Function> all()
+    {
+        return Collections.singleton(uuidFct);
+    }
+
     public static final Function uuidFct = new NativeScalarFunction("uuid", UUIDType.instance)
     {
         public ByteBuffer execute(int protocolVersion, List<ByteBuffer> parameters)

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/AbstractPrimaryKeyRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/AbstractPrimaryKeyRestrictions.java
index 51c3e26..f1b5a50 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/AbstractPrimaryKeyRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/AbstractPrimaryKeyRestrictions.java

@@ -18,12 +18,12 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.nio.ByteBuffer;
-import java.util.List;
+import java.util.*;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.composites.CType;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
@@ -34,17 +34,17 @@
     /**
      * The composite type.
      */
-    protected final CType ctype;
+    protected final ClusteringComparator comparator;
 
-    public AbstractPrimaryKeyRestrictions(CType ctype)
+    public AbstractPrimaryKeyRestrictions(ClusteringComparator comparator)
     {
-        this.ctype = ctype;
+        this.comparator = comparator;
     }
 
     @Override
-    public List<ByteBuffer> bounds(CFMetaData cfm, Bound b, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> bounds(Bound b, QueryOptions options) throws InvalidRequestException
     {
-        return values(cfm, options);
+        return values(options);
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/AbstractRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/AbstractRestriction.java
index 4093780..df04331 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/AbstractRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/AbstractRestriction.java

@@ -17,19 +17,11 @@
  */
 package org.apache.cassandra.cql3.restrictions;
 
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-
-import org.apache.cassandra.cql3.ColumnSpecification;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import static org.apache.cassandra.cql3.statements.RequestValidations.checkBindValueSet;
-import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
-import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
+import org.apache.cassandra.db.MultiCBuilder;
+
+import org.apache.cassandra.config.ColumnDefinition;
 
 /**
  * Base class for <code>Restriction</code>s
@@ -73,15 +65,21 @@
     }
 
     @Override
+    public boolean isNotNull()
+    {
+        return false;
+    }
+
+    @Override
     public boolean hasBound(Bound b)
     {
         return true;
     }
 
     @Override
-    public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options)
+    public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
     {
-        return appendTo(cfm, builder, options);
+        return appendTo(builder, options);
     }
 
     @Override
@@ -90,21 +88,6 @@
         return true;
     }
 
-    public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-    {
-        return false;
-    }
-
-    protected static ByteBuffer validateIndexedValue(ColumnSpecification columnSpec,
-                                                     ByteBuffer value)
-                                                     throws InvalidRequestException
-    {
-        checkNotNull(value, "Unsupported null value for column %s", columnSpec.name);
-        checkBindValueSet(value, "Unsupported unset value for column %s", columnSpec.name);
-        checkFalse(value.remaining() > 0xFFFF, "Index expression values may not be larger than 64K");
-        return value;
-    }
-
     /**
      * Reverses the specified bound if the column type is a reversed one.
      *

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
new file mode 100644
index 0000000..eb91928
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java

@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.restrictions;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+
+public class CustomIndexExpression
+{
+    private final ColumnIdentifier valueColId = new ColumnIdentifier("custom index expression", false);
+
+    public final IndexName targetIndex;
+    public final Term.Raw valueRaw;
+
+    private Term value;
+
+    public CustomIndexExpression(IndexName targetIndex, Term.Raw value)
+    {
+        this.targetIndex = targetIndex;
+        this.valueRaw = value;
+    }
+
+    public void prepareValue(CFMetaData cfm, AbstractType<?> expressionType, VariableSpecifications boundNames)
+    {
+        ColumnSpecification spec = new ColumnSpecification(cfm.ksName, cfm.ksName, valueColId, expressionType);
+        value = valueRaw.prepare(cfm.ksName, spec);
+        value.collectMarkerSpecification(boundNames);
+    }
+
+    public void addToRowFilter(RowFilter filter,
+                               CFMetaData cfm,
+                               QueryOptions options)
+    {
+        filter.addCustomIndexExpression(cfm,
+                                        cfm.getIndexes()
+                                           .get(targetIndex.getIdx())
+                                           .orElseThrow(() -> IndexRestrictions.indexNotFound(targetIndex, cfm)),
+                                        value.bindAndGet(options));
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ForwardingPrimaryKeyRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ForwardingPrimaryKeyRestrictions.java
index 76d0233..71305b9 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/ForwardingPrimaryKeyRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/ForwardingPrimaryKeyRestrictions.java

@@ -19,17 +19,18 @@
 
 import java.nio.ByteBuffer;
 import java.util.List;
+import java.util.NavigableSet;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.MultiCBuilder;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 
 /**
  * A <code>PrimaryKeyRestrictions</code> which forwards all its method calls to another 
@@ -45,9 +46,9 @@
     protected abstract PrimaryKeyRestrictions getDelegate();
 
     @Override
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return getDelegate().getFunctions();
+        getDelegate().addFunctionsTo(functions);
     }
 
     @Override
@@ -81,39 +82,39 @@
     }
 
     @Override
-    public List<ByteBuffer> values(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
     {
-        return getDelegate().values(cfm, options);
+        return getDelegate().values(options);
     }
 
     @Override
-    public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+    public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
     {
-        return getDelegate().appendTo(cfm, builder, options);
+        return getDelegate().appendTo(builder, options);
     }
 
     @Override
-    public List<Composite> valuesAsComposites(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Clustering> valuesAsClustering(QueryOptions options) throws InvalidRequestException
     {
-        return getDelegate().valuesAsComposites(cfm, options);
+        return getDelegate().valuesAsClustering(options);
     }
 
     @Override
-    public List<ByteBuffer> bounds(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> bounds(Bound bound, QueryOptions options) throws InvalidRequestException
     {
-        return getDelegate().bounds(cfm, bound, options);
+        return getDelegate().bounds(bound, options);
     }
 
     @Override
-    public List<Composite> boundsAsComposites(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Slice.Bound> boundsAsClustering(Bound bound, QueryOptions options) throws InvalidRequestException
     {
-        return getDelegate().boundsAsComposites(cfm, bound, options);
+        return getDelegate().boundsAsClustering(bound, options);
     }
 
     @Override
-    public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options)
+    public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
     {
-        return getDelegate().appendBoundTo(cfm, builder, bound, options);
+        return getDelegate().appendBoundTo(builder, bound, options);
     }
 
     @Override
@@ -165,6 +166,12 @@
     }
 
     @Override
+    public boolean isNotNull()
+    {
+        return getDelegate().isNotNull();
+    }
+
+    @Override
     public boolean isMultiColumn()
     {
         return getDelegate().isMultiColumn();
@@ -177,16 +184,8 @@
     }
 
     @Override
-    public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                     SecondaryIndexManager indexManager,
-                                     QueryOptions options) throws InvalidRequestException
+    public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException
     {
-        getDelegate().addIndexExpressionTo(expressions, indexManager, options);
-    }
-
-    @Override
-    public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-    {
-        return getDelegate().isNotReturningAnyRows(cfm, options);
+        getDelegate().addRowFilterTo(filter, indexManager, options);
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
new file mode 100644
index 0000000..c7f6b5f
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java

@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.restrictions;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.IndexName;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+public class IndexRestrictions
+{
+    public static final String INDEX_NOT_FOUND = "Invalid index expression, index %s not found for %s.%s";
+    public static final String INVALID_INDEX = "Target index %s cannot be used to query %s.%s";
+    public static final String CUSTOM_EXPRESSION_NOT_SUPPORTED = "Index %s does not support custom expressions";
+    public static final String NON_CUSTOM_INDEX_IN_EXPRESSION = "Only CUSTOM indexes may be used in custom index expressions, %s is not valid";
+    public static final String MULTIPLE_EXPRESSIONS = "Multiple custom index expressions in a single query are not supported";
+
+    private final List<Restrictions> regularRestrictions = new ArrayList<>();
+    private final List<CustomIndexExpression> customExpressions = new ArrayList<>();
+
+    public void add(Restrictions restrictions)
+    {
+        regularRestrictions.add(restrictions);
+    }
+
+    public void add(CustomIndexExpression expression)
+    {
+        customExpressions.add(expression);
+    }
+
+    public boolean isEmpty()
+    {
+        return regularRestrictions.isEmpty() && customExpressions.isEmpty();
+    }
+
+    public List<Restrictions> getRestrictions()
+    {
+        return regularRestrictions;
+    }
+
+    public List<CustomIndexExpression> getCustomIndexExpressions()
+    {
+        return customExpressions;
+    }
+
+    static InvalidRequestException invalidIndex(IndexName indexName, CFMetaData cfm)
+    {
+        return new InvalidRequestException(String.format(INVALID_INDEX, indexName.getIdx(), cfm.ksName, cfm.cfName));
+    }
+
+    static InvalidRequestException indexNotFound(IndexName indexName, CFMetaData cfm)
+    {
+        return new InvalidRequestException(String.format(INDEX_NOT_FOUND,indexName.getIdx(), cfm.ksName, cfm.cfName));
+    }
+
+    static InvalidRequestException nonCustomIndexInExpression(IndexName indexName)
+    {
+        return new InvalidRequestException(String.format(NON_CUSTOM_INDEX_IN_EXPRESSION, indexName.getIdx()));
+    }
+
+    static InvalidRequestException customExpressionNotSupported(IndexName indexName)
+    {
+        return new InvalidRequestException(String.format(CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName.getIdx()));
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
index 44f25ec..9d33bb1 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java

@@ -20,17 +20,17 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.Term.Terminal;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.MultiCBuilder;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
@@ -112,38 +112,35 @@
     @Override
     public final boolean hasSupportingIndex(SecondaryIndexManager indexManager)
     {
-        for (ColumnDefinition columnDef : columnDefs)
-        {
-            SecondaryIndex index = indexManager.getIndexForColumn(columnDef.name.bytes);
-            if (index != null && isSupportedBy(index))
-                return true;
-        }
+        for (Index index : indexManager.listIndexes())
+           if (isSupportedBy(index))
+               return true;
         return false;
     }
 
     /**
      * Check if this type of restriction is supported for by the specified index.
-     * @param index the Secondary index
+     * @param index the secondary index
      *
      * @return <code>true</code> this type of restriction is supported by the specified index,
      * <code>false</code> otherwise.
      */
-    protected abstract boolean isSupportedBy(SecondaryIndex index);
+    protected abstract boolean isSupportedBy(Index index);
 
-    public static class EQ  extends MultiColumnRestriction
+    public static class EQRestriction extends MultiColumnRestriction
     {
         protected final Term value;
 
-        public EQ(List<ColumnDefinition> columnDefs, Term value)
+        public EQRestriction(List<ColumnDefinition> columnDefs, Term value)
         {
             super(columnDefs);
             this.value = value;
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return value.getFunctions();
+            value.addFunctionsTo(functions);
         }
 
         @Override
@@ -160,13 +157,16 @@
         }
 
         @Override
-        protected boolean isSupportedBy(SecondaryIndex index)
+        protected boolean isSupportedBy(Index index)
         {
-            return index.supportsOperator(Operator.EQ);
+            for(ColumnDefinition column : columnDefs)
+                if (index.supportsExpression(column, Operator.EQ))
+                    return true;
+            return false;
         }
 
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
             Tuples.Value t = ((Tuples.Value) value.bind(options));
             List<ByteBuffer> values = t.getElements();
@@ -179,9 +179,7 @@
         }
 
         @Override
-        public final void addIndexExpressionTo(List<IndexExpression> expressions,
-                                               SecondaryIndexManager indexManager,
-                                               QueryOptions options) throws InvalidRequestException
+        public final void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexMananger, QueryOptions options) throws InvalidRequestException
         {
             Tuples.Value t = ((Tuples.Value) value.bind(options));
             List<ByteBuffer> values = t.getElements();
@@ -189,30 +187,25 @@
             for (int i = 0, m = columnDefs.size(); i < m; i++)
             {
                 ColumnDefinition columnDef = columnDefs.get(i);
-                ByteBuffer component = validateIndexedValue(columnDef, values.get(i));
-                expressions.add(new IndexExpression(columnDef.name.bytes, Operator.EQ, component));
+                filter.add(columnDef, Operator.EQ, values.get(i));
             }
         }
-
-        @Override
-        public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-        {
-            // Dense non-compound tables do not accept empty ByteBuffers. By consequence, we know that
-            // any query with an EQ restriction containing an empty value will not return any results.
-            return !cfm.comparator.isCompound()
-                    && !((Tuples.Value) value.bind(options)).getElements().get(0).hasRemaining();
-        }
     }
 
-    public abstract static class IN extends MultiColumnRestriction
+    public abstract static class INRestriction extends MultiColumnRestriction
     {
+        public INRestriction(List<ColumnDefinition> columnDefs)
+        {
+            super(columnDefs);
+        }
+
         /**
          * {@inheritDoc}
          */
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
-            List<List<ByteBuffer>> splitInValues = filterValuesIfNeeded(cfm, splitValues(options));
+            List<List<ByteBuffer>> splitInValues = splitValues(options);
             builder.addAllElementsToAll(splitInValues);
 
             if (builder.containsNull())
@@ -220,29 +213,6 @@
             return builder;
         }
 
-        private List<List<ByteBuffer>> filterValuesIfNeeded(CFMetaData cfm, List<List<ByteBuffer>> splitInValues)
-        {
-            if (cfm.comparator.isCompound())
-                return splitInValues;
-
-            // Dense non-compound tables do not accept empty ByteBuffers. By consequence, we know that we can
-            // ignore any IN value which is an empty byte buffer an which otherwise will trigger an error.
-
-            // As some List implementations do not support remove, we copy the list to be on the safe side.
-            List<List<ByteBuffer>> filteredValues = new ArrayList<>(splitInValues.size());
-            for (List<ByteBuffer> values : splitInValues)
-            {
-                if (values.get(0).hasRemaining())
-                    filteredValues.add(values);
-            }
-            return filteredValues;
-        }
-
-        public IN(List<ColumnDefinition> columnDefs)
-        {
-            super(columnDefs);
-        }
-
         @Override
         public boolean isIN()
         {
@@ -257,15 +227,18 @@
         }
 
         @Override
-        protected boolean isSupportedBy(SecondaryIndex index)
+        protected boolean isSupportedBy(Index index)
         {
-            return index.supportsOperator(Operator.IN);
+            for (ColumnDefinition column: columnDefs)
+                if (index.supportsExpression(column, Operator.IN))
+                    return true;
+            return false;
         }
 
         @Override
-        public final void addIndexExpressionTo(List<IndexExpression> expressions,
-                                               SecondaryIndexManager indexManager,
-                                               QueryOptions options) throws InvalidRequestException
+        public final void addRowFilterTo(RowFilter filter,
+                                         SecondaryIndexManager indexManager,
+                                         QueryOptions options) throws InvalidRequestException
         {
             List<List<ByteBuffer>> splitInValues = splitValues(options);
             checkTrue(splitInValues.size() == 1, "IN restrictions are not supported on indexed columns");
@@ -274,8 +247,7 @@
             for (int i = 0, m = columnDefs.size(); i < m; i++)
             {
                 ColumnDefinition columnDef = columnDefs.get(i);
-                ByteBuffer component = validateIndexedValue(columnDef, values.get(i));
-                expressions.add(new IndexExpression(columnDef.name.bytes, Operator.EQ, component));
+                filter.add(columnDef, Operator.EQ, values.get(i));
             }
         }
 
@@ -286,20 +258,20 @@
      * An IN restriction that has a set of terms for in values.
      * For example: "SELECT ... WHERE (a, b, c) IN ((1, 2, 3), (4, 5, 6))" or "WHERE (a, b, c) IN (?, ?)"
      */
-    public static class InWithValues extends MultiColumnRestriction.IN
+    public static class InRestrictionWithValues extends INRestriction
     {
         protected final List<Term> values;
 
-        public InWithValues(List<ColumnDefinition> columnDefs, List<Term> values)
+        public InRestrictionWithValues(List<ColumnDefinition> columnDefs, List<Term> values)
         {
             super(columnDefs);
             this.values = values;
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Terms.getFunctions(values);
+            Terms.addFunctions(values, functions);
         }
 
         @Override
@@ -325,20 +297,19 @@
      * An IN restriction that uses a single marker for a set of IN values that are tuples.
      * For example: "SELECT ... WHERE (a, b, c) IN ?"
      */
-    public static class InWithMarker extends MultiColumnRestriction.IN
+    public static class InRestrictionWithMarker extends INRestriction
     {
         protected final AbstractMarker marker;
 
-        public InWithMarker(List<ColumnDefinition> columnDefs, AbstractMarker marker)
+        public InRestrictionWithMarker(List<ColumnDefinition> columnDefs, AbstractMarker marker)
         {
             super(columnDefs);
             this.marker = marker;
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Collections.emptySet();
         }
 
         @Override
@@ -357,16 +328,16 @@
         }
     }
 
-    public static class Slice extends MultiColumnRestriction
+    public static class SliceRestriction extends MultiColumnRestriction
     {
         private final TermSlice slice;
 
-        public Slice(List<ColumnDefinition> columnDefs, Bound bound, boolean inclusive, Term term)
+        public SliceRestriction(List<ColumnDefinition> columnDefs, Bound bound, boolean inclusive, Term term)
         {
             this(columnDefs, TermSlice.newInstance(bound, inclusive, term));
         }
 
-        Slice(List<ColumnDefinition> columnDefs, TermSlice slice)
+        SliceRestriction(List<ColumnDefinition> columnDefs, TermSlice slice)
         {
             super(columnDefs);
             this.slice = slice;
@@ -379,13 +350,13 @@
         }
 
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
             throw new UnsupportedOperationException();
         }
 
         @Override
-        public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options)
+        public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
         {
             boolean reversed = getFirstColumn().isReversedType();
 
@@ -446,9 +417,12 @@
         }
 
         @Override
-        protected boolean isSupportedBy(SecondaryIndex index)
+        protected boolean isSupportedBy(Index index)
         {
-            return slice.isSupportedBy(index);
+            for(ColumnDefinition def : columnDefs)
+                if (slice.isSupportedBy(def, index))
+                    return true;
+            return false;
         }
 
         @Override
@@ -458,9 +432,9 @@
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return slice.getFunctions();
+            slice.addFunctionsTo(functions);
         }
 
         @Override
@@ -492,16 +466,16 @@
                        "More than one restriction was found for the end bound on %s",
                        getColumnsInCommons(otherRestriction));
 
-            Slice otherSlice = (Slice) otherRestriction;
+            SliceRestriction otherSlice = (SliceRestriction) otherRestriction;
             List<ColumnDefinition> newColumnDefs = columnDefs.size() >= otherSlice.columnDefs.size() ?  columnDefs : otherSlice.columnDefs;
 
-            return new Slice(newColumnDefs, slice.merge(otherSlice.slice));
+            return new SliceRestriction(newColumnDefs, slice.merge(otherSlice.slice));
         }
 
         @Override
-        public final void addIndexExpressionTo(List<IndexExpression> expressions,
-                                               SecondaryIndexManager indexManager,
-                                               QueryOptions options) throws InvalidRequestException
+        public final void addRowFilterTo(RowFilter filter,
+                                         SecondaryIndexManager indexManager,
+                                         QueryOptions options) throws InvalidRequestException
         {
             throw invalidRequest("Multi-column slice restrictions cannot be used for filtering.");
         }
@@ -535,19 +509,63 @@
             return Collections.singletonList(terminal.get(options.getProtocolVersion()));
         }
 
-        @Override
-        public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-        {
-            // Dense non-compound tables do not accept empty ByteBuffers. By consequence, we know that
-            // any query with a slice restriction with an empty value for the END bound will not return any results.
-            return !cfm.comparator.isCompound()
-                    && hasBound(Bound.END)
-                    && !componentBounds(Bound.END, options).get(0).hasRemaining();
-        }
-
         private boolean hasComponent(Bound b, int index, EnumMap<Bound, List<ByteBuffer>> componentBounds)
         {
             return componentBounds.get(b).size() > index;
         }
     }
+
+    public static class NotNullRestriction extends MultiColumnRestriction
+    {
+        public NotNullRestriction(List<ColumnDefinition> columnDefs)
+        {
+            super(columnDefs);
+            assert columnDefs.size() == 1;
+        }
+
+        @Override
+        public void addFunctionsTo(List<Function> functions)
+        {
+        }
+
+        @Override
+        public boolean isNotNull()
+        {
+            return true;
+        }
+
+        @Override
+        public String toString()
+        {
+            return "IS NOT NULL";
+        }
+
+        @Override
+        public Restriction doMergeWith(Restriction otherRestriction) throws InvalidRequestException
+        {
+            throw invalidRequest("%s cannot be restricted by a relation if it includes an IS NOT NULL clause",
+                                 getColumnsInCommons(otherRestriction));
+        }
+
+        @Override
+        protected boolean isSupportedBy(Index index)
+        {
+            for(ColumnDefinition column : columnDefs)
+                if (index.supportsExpression(column, Operator.IS_NOT))
+                    return true;
+            return false;
+        }
+
+        @Override
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
+        {
+            throw new UnsupportedOperationException("Cannot use IS NOT NULL restriction for slicing");
+        }
+
+        @Override
+        public final void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexMananger, QueryOptions options) throws InvalidRequestException
+        {
+            throw new UnsupportedOperationException("Secondary indexes do not support IS NOT NULL restrictions");
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSet.java
index 5136fee..860d3f0 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSet.java

@@ -25,11 +25,11 @@
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.composites.Composite.EOC;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.utils.btree.BTreeSet;
 
 /**
  * A set of single column restrictions on a primary key part (partition key or clustering key).
@@ -61,18 +61,26 @@
      */
     private boolean contains;
 
-    public PrimaryKeyRestrictionSet(CType ctype)
+    /**
+     * <code>true</code> if the restrictions corresponding to a partition key, <code>false</code> if it's clustering columns.
+     */
+    private boolean isPartitionKey;
+
+    public PrimaryKeyRestrictionSet(ClusteringComparator comparator, boolean isPartitionKey)
     {
-        super(ctype);
+        super(comparator);
+
         this.restrictions = new RestrictionSet();
         this.eq = true;
+        this.isPartitionKey = isPartitionKey;
     }
 
     private PrimaryKeyRestrictionSet(PrimaryKeyRestrictionSet primaryKeyRestrictions,
                                      Restriction restriction) throws InvalidRequestException
     {
-        super(primaryKeyRestrictions.ctype);
+        super(primaryKeyRestrictions.comparator);
         this.restrictions = primaryKeyRestrictions.restrictions.addRestriction(restriction);
+        this.isPartitionKey = primaryKeyRestrictions.isPartitionKey;
 
         if (restriction.isSlice() || primaryKeyRestrictions.isSlice())
             this.slice = true;
@@ -84,6 +92,15 @@
             this.eq = true;
     }
 
+    private List<ByteBuffer> toByteBuffers(SortedSet<? extends ClusteringPrefix> clusterings)
+    {
+        // It's currently a tad hard to follow that this is only called for partition key so we should fix that
+        List<ByteBuffer> l = new ArrayList<>(clusterings.size());
+        for (ClusteringPrefix clustering : clusterings)
+            l.add(CFMetaData.serializePartitionKey(clustering));
+        return l;
+    }
+
     @Override
     public boolean isSlice()
     {
@@ -121,9 +138,9 @@
     }
 
     @Override
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return restrictions.getFunctions();
+        restrictions.addFunctionsTo(functions);
     }
 
     @Override
@@ -141,17 +158,17 @@
     }
 
     @Override
-    public List<Composite> valuesAsComposites(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Clustering> valuesAsClustering(QueryOptions options) throws InvalidRequestException
     {
-        return filterAndSort(appendTo(cfm, new CompositesBuilder(ctype), options).build());
+        return appendTo(MultiCBuilder.create(comparator), options).build();
     }
 
     @Override
-    public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+    public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
     {
         for (Restriction r : restrictions)
         {
-            r.appendTo(cfm, builder, options);
+            r.appendTo(builder, options);
             if (builder.hasMissingElements())
                 break;
         }
@@ -159,19 +176,15 @@
     }
 
     @Override
-    public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options)
+    public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
     {
         throw new UnsupportedOperationException();
     }
 
     @Override
-    public List<Composite> boundsAsComposites(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Slice.Bound> boundsAsClustering(Bound bound, QueryOptions options) throws InvalidRequestException
     {
-        CompositesBuilder builder = new CompositesBuilder(ctype);
-        // The end-of-component of composite doesn't depend on whether the
-        // component type is reversed or not (i.e. the ReversedType is applied
-        // to the component comparator but not to the end-of-component itself),
-        // it only depends on whether the slice is reversed
+        MultiCBuilder builder = MultiCBuilder.create(comparator);
         int keyPosition = 0;
         for (Restriction r : restrictions)
         {
@@ -182,133 +195,41 @@
 
             if (r.isSlice())
             {
-                r.appendBoundTo(cfm, builder, bound, options);
-
-                // Since CASSANDRA-7281, the composites might not end with the same components and it is possible
-                // that one of the composites is an empty one. Unfortunatly, AbstractCType will always sort
-                // Composites.EMPTY before all the other components due to its EOC, even if it is not the desired
-                // behaviour in some cases. To avoid that problem the code will use normal composites for the empty
-                // ones until the composites are properly sorted. They will then be replaced by Composites.EMPTY as
-                // it is what is expected by the intra-node serialization.
-                // It is clearly a hack but it does not make a lot of sense to refactor 2.2 for that as the problem is
-                // already solved in 3.0.
-                List<Composite> composites = filterAndSort(setEocs(r, bound, builder.build()));
-
-                for (Composite c : composites)
-                    if (c.isEmpty())
-                        return normalizeEmptyComposites(composites);
-
-                return composites;
+                r.appendBoundTo(builder, bound, options);
+                return builder.buildBoundForSlice(bound.isStart(),
+                                                  r.isInclusive(bound),
+                                                  r.isInclusive(bound.reverse()),
+                                                  r.getColumnDefs());
             }
 
-            r.appendBoundTo(cfm, builder, bound, options);
+            r.appendBoundTo(builder, bound, options);
 
             if (builder.hasMissingElements())
-                return Collections.emptyList();
+                return BTreeSet.empty(comparator);
 
             keyPosition = r.getLastColumn().position() + 1;
         }
-        // Means no relation at all or everything was an equal
-        // Note: if the builder is "full", there is no need to use the end-of-component bit. For columns selection,
-        // it would be harmless to do it. However, we use this method got the partition key too. And when a query
-        // with 2ndary index is done, and with the the partition provided with an EQ, we'll end up here, and in that
-        // case using the eoc would be bad, since for the random partitioner we have no guarantee that
-        // prefix.end() will sort after prefix (see #5240).
-        EOC eoc = !builder.hasRemaining() ? EOC.NONE : (bound.isEnd() ? EOC.END : EOC.START);
-        return filterAndSort(builder.buildWithEOC(eoc));
-    }
 
-    /**
-     * Removes duplicates and sort the specified composites.
-     *
-     * @param composites the composites to filter and sort
-     * @return the composites sorted and without duplicates
-     */
-    private List<Composite> filterAndSort(List<Composite> composites)
-    {
-        if (composites.size() <= 1)
-            return composites;
-
-        TreeSet<Composite> set = new TreeSet<Composite>(ctype);
-        set.addAll(composites);
-
-        return new ArrayList<>(set);
-    }
-
-    private List<Composite> normalizeEmptyComposites(List<Composite> composites)
-    {
-        List<Composite> transformed = new ArrayList<>(composites.size());
-        for (Composite c : composites)
-            transformed.add(c.isEmpty() ? Composites.EMPTY : c);
-        return transformed;
-    }
-
-    /**
-     * Sets EOCs for the composites returned by the specified slice restriction for the given bound.
-     *
-     * @param r the slice restriction
-     * @param bound the bound
-     * @param composites the composites
-     * @return the composites with their EOCs properly set
-     */
-    private List<Composite> setEocs(Restriction r, Bound bound, List<Composite> composites)
-    {
-        List<Composite> list = new ArrayList<>(composites.size());
-
-        // The first column of the slice might not be the first clustering column (e.g. clustering_0 = ? AND (clustering_1, clustering_2) >= (?, ?)
-        int offset = r.getFirstColumn().position();
-
-        for (int i = 0, m = composites.size(); i < m; i++)
-        {
-            Composite composite = composites.get(i);
-
-            // Handle the no bound case
-            if (composite.size() == offset)
-            {
-                list.add(composite.withEOC(bound.isEnd() ? EOC.END : EOC.START));
-                continue;
-            }
-
-            // In the case of mixed order columns, we will have some extra slices where the columns change directions.
-            // For example: if we have clustering_0 DESC and clustering_1 ASC a slice like (clustering_0, clustering_1) > (1, 2)
-            // will produce 2 slices: [EMPTY, 1.START] and [1.2.END, 1.END]
-            // So, the END bound will return 2 composite with the same values 1
-            if (composite.size() <= r.getLastColumn().position() && i < m - 1 && composite.equals(composites.get(i + 1)))
-            {
-                list.add(composite.withEOC(EOC.START));
-                list.add(composites.get(i++).withEOC(EOC.END));
-                continue;
-            }
-
-            // Handle the normal bounds
-            ColumnDefinition column = r.getColumnDefs().get(composite.size() - 1 - offset);
-            Bound b = reverseBoundIfNeeded(column, bound);
-
-            Composite.EOC eoc = eocFor(r, bound, b);
-            list.add(composite.withEOC(eoc));
-        }
-
-        return list;
+        // Everything was an equal (or there was nothing)
+        return builder.buildBound(bound.isStart(), true);
     }
 
     @Override
-    public List<ByteBuffer> values(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
     {
-        return Composites.toByteBuffers(valuesAsComposites(cfm, options));
+        if (!isPartitionKey)
+            throw new UnsupportedOperationException();
+
+        return toByteBuffers(valuesAsClustering(options));
     }
 
     @Override
-    public List<ByteBuffer> bounds(CFMetaData cfm, Bound b, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> bounds(Bound b, QueryOptions options) throws InvalidRequestException
     {
-        return Composites.toByteBuffers(boundsAsComposites(cfm, b, options));
-    }
+        if (!isPartitionKey)
+            throw new UnsupportedOperationException();
 
-    private static Composite.EOC eocFor(Restriction r, Bound eocBound, Bound inclusiveBound)
-    {
-        if (eocBound.isStart())
-            return r.isInclusive(inclusiveBound) ? Composite.EOC.NONE : Composite.EOC.END;
-
-        return r.isInclusive(inclusiveBound) ? Composite.EOC.END : Composite.EOC.START;
+        return toByteBuffers(boundsAsClustering(b, options));
     }
 
     @Override
@@ -334,26 +255,18 @@
     }
 
     @Override
-    public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                     SecondaryIndexManager indexManager,
-                                     QueryOptions options) throws InvalidRequestException
+    public void addRowFilterTo(RowFilter filter,
+                               SecondaryIndexManager indexManager,
+                               QueryOptions options) throws InvalidRequestException
     {
-        Boolean clusteringColumns = null;
         int position = 0;
 
         for (Restriction restriction : restrictions)
         {
-            ColumnDefinition columnDef = restriction.getFirstColumn();
-
-            // PrimaryKeyRestrictionSet contains only one kind of column, either partition key or clustering columns.
-            // Therefore we only need to check the column kind once. All the other columns will be of the same kind.
-            if (clusteringColumns == null)
-                clusteringColumns = columnDef.isClusteringColumn() ? Boolean.TRUE : Boolean.FALSE;
-
             // We ignore all the clustering columns that can be handled by slices.
-            if (!clusteringColumns || handleInFilter(restriction, position) || restriction.hasSupportingIndex(indexManager))
+            if (isPartitionKey || handleInFilter(restriction, position) || restriction.hasSupportingIndex(indexManager))
             {
-                restriction.addIndexExpressionTo(expressions, indexManager, options);
+                restriction.addRowFilterTo(filter, indexManager, options);
                 continue;
             }
 
@@ -398,17 +311,6 @@
         return false;
     }
 
-    @Override
-    public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-    {
-        for (Restriction restriction : restrictions)
-        {
-            if (restriction.isNotReturningAnyRows(cfm, options))
-                return true;
-        }
-        return false;
-    }
-
     private boolean handleInFilter(Restriction restriction, int index)
     {
         return restriction.isContains() || index != restriction.getFirstColumn().position();

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictions.java
index 5e5e3f5..2f9cd7b 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictions.java

@@ -19,11 +19,12 @@
 
 import java.nio.ByteBuffer;
 import java.util.List;
+import java.util.NavigableSet;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.Slice;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
@@ -35,11 +36,11 @@
     @Override
     public PrimaryKeyRestrictions mergeWith(Restriction restriction) throws InvalidRequestException;
 
-    public List<ByteBuffer> values(CFMetaData cfm, QueryOptions options) throws InvalidRequestException;
+    public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException;
 
-    public List<Composite> valuesAsComposites(CFMetaData cfm, QueryOptions options) throws InvalidRequestException;
+    public NavigableSet<Clustering> valuesAsClustering(QueryOptions options) throws InvalidRequestException;
 
-    public List<ByteBuffer> bounds(CFMetaData cfm, Bound b, QueryOptions options) throws InvalidRequestException;
+    public List<ByteBuffer> bounds(Bound b, QueryOptions options) throws InvalidRequestException;
 
-    public List<Composite> boundsAsComposites(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException;
+    public NavigableSet<Slice.Bound> boundsAsClustering(Bound bound, QueryOptions options) throws InvalidRequestException;
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java b/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
index 9df100a..987fd30 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java

@@ -19,15 +19,14 @@
 
 import java.util.List;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.MultiCBuilder;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 
 /**
  * A restriction/clause on a column.
@@ -42,6 +41,7 @@
     public boolean isEQ();
     public boolean isIN();
     public boolean isContains();
+    public boolean isNotNull();
     public boolean isMultiColumn();
 
     /**
@@ -63,11 +63,11 @@
     public List<ColumnDefinition> getColumnDefs();
 
     /**
-     * Return an Iterable over all of the functions (both native and user-defined) used by any component
-     * of the restriction
-     * @return functions all functions found (may contain duplicates)
+     * Adds all functions (native and user-defined) used by any component of the restriction
+     * to the specified list.
+     * @param functions the list to add to
      */
-    public Iterable<Function> getFunctions();
+    void addFunctionsTo(List<Function> functions);
 
     /**
      * Checks if the specified bound is set or not.
@@ -105,46 +105,34 @@
     public boolean hasSupportingIndex(SecondaryIndexManager indexManager);
 
     /**
-     * Adds to the specified list the <code>IndexExpression</code>s corresponding to this <code>Restriction</code>.
+     * Adds to the specified row filter the expressions corresponding to this <code>Restriction</code>.
      *
-     * @param expressions the list to add the <code>IndexExpression</code>s to
+     * @param filter the row filter to add expressions to
      * @param indexManager the secondary index manager
      * @param options the query options
-     * @throws InvalidRequestException if this <code>Restriction</code> cannot be converted into 
-     * <code>IndexExpression</code>s
+     * @throws InvalidRequestException if this <code>Restriction</code> cannot be converted into a row filter
      */
-    public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                     SecondaryIndexManager indexManager,
-                                     QueryOptions options)
-                                     throws InvalidRequestException;
+    public void addRowFilterTo(RowFilter filter,
+                               SecondaryIndexManager indexManager,
+                               QueryOptions options)
+                               throws InvalidRequestException;
 
     /**
      * Appends the values of this <code>Restriction</code> to the specified builder.
      *
-     * @param cfm the table metadata
-     * @param builder the <code>CompositesBuilder</code> to append to.
+     * @param builder the <code>MultiCBuilder</code> to append to.
      * @param options the query options
-     * @return the <code>CompositesBuilder</code>
+     * @return the <code>MultiCBuilder</code>
      */
-    public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options);
+    public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options);
 
     /**
      * Appends the values of the <code>Restriction</code> for the specified bound to the specified builder.
      *
-     * @param cfm the table metadata
-     * @param builder the <code>CompositesBuilder</code> to append to.
+     * @param builder the <code>MultiCBuilder</code> to append to.
      * @param bound the bound
      * @param options the query options
-     * @return the <code>CompositesBuilder</code>
+     * @return the <code>MultiCBuilder</code>
      */
-    public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options);
-
-    /**
-     * Checks if this restriction will prevent the query to return any rows.
-     *
-     * @param cfm the table metadata
-     * @param options the query options
-     * @return {@code true} if this restriction will prevent the query to return any rows, {@false} otherwise
-     */
-    public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options);
+    public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options);
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
index 676ed13..9aeea69 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java

@@ -19,15 +19,13 @@
 
 import java.util.*;
 
-import com.google.common.collect.Iterables;
-
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction.Contains;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction.ContainsRestriction;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 
 /**
  * Sets of column restrictions.
@@ -66,12 +64,10 @@
     }
 
     @Override
-    public final void addIndexExpressionTo(List<IndexExpression> expressions,
-                                           SecondaryIndexManager indexManager,
-                                           QueryOptions options) throws InvalidRequestException
+    public final void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException
     {
         for (Restriction restriction : restrictions.values())
-            restriction.addIndexExpressionTo(expressions, indexManager, options);
+            restriction.addRowFilterTo(filter, indexManager, options);
     }
 
     @Override
@@ -81,18 +77,18 @@
     }
 
     @Override
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        com.google.common.base.Function<Restriction, Iterable<Function>> transform =
-            new com.google.common.base.Function<Restriction, Iterable<Function>>()
+        Restriction previous = null;
+        for (Restriction restriction : restrictions.values())
         {
-            public Iterable<Function> apply(Restriction restriction)
+            // For muti-column restriction, we can have multiple time the same restriction.
+            if (!restriction.equals(previous))
             {
-                return restriction.getFunctions();
+                previous = restriction;
+                restriction.addFunctionsTo(functions);
             }
-        };
-
-        return Iterables.concat(Iterables.transform(restrictions.values(), transform));
+        }
     }
 
     @Override
@@ -245,7 +241,7 @@
         {
             if (restriction.isContains())
             {
-                Contains contains = (Contains) restriction;
+                ContainsRestriction contains = (ContainsRestriction) restriction;
                 numberOfContains += (contains.numberOfValues() + contains.numberOfKeys() + contains.numberOfEntries());
             }
         }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java
index ab81bf7..5fa3170 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java

@@ -23,9 +23,9 @@
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 
 /**
  * Sets of restrictions
@@ -39,11 +39,11 @@
     public Collection<ColumnDefinition> getColumnDefs();
 
     /**
-     * Return an Iterable over all of the functions (both native and user-defined) used by any component
-     * of the restrictions
-     * @return functions all functions found (may contain duplicates)
+     * Adds all functions (native and user-defined) used by any component of the restriction
+     * to the specified list.
+     * @param functions the list to add to
      */
-    public Iterable<Function> getFunctions();
+    public void addFunctionsTo(List<Function> functions);
 
     /**
      * Check if the restriction is on indexed columns.
@@ -54,18 +54,14 @@
     public boolean hasSupportingIndex(SecondaryIndexManager indexManager);
 
     /**
-     * Adds to the specified list the <code>IndexExpression</code>s corresponding to this <code>Restriction</code>.
+     * Adds to the specified row filter the expressions corresponding to this <code>Restrictions</code>.
      *
-     * @param expressions the list to add the <code>IndexExpression</code>s to
+     * @param filter the row filter to add expressions to
      * @param indexManager the secondary index manager
      * @param options the query options
-     * @throws InvalidRequestException if this <code>Restriction</code> cannot be converted into
-     * <code>IndexExpression</code>s
+     * @throws InvalidRequestException if this <code>Restrictions</code> cannot be converted into a row filter
      */
-    public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                     SecondaryIndexManager indexManager,
-                                     QueryOptions options)
-                                     throws InvalidRequestException;
+    public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException;
 
     /**
      * Checks if this <code>PrimaryKeyRestrictionSet</code> is empty or not.

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ReversedPrimaryKeyRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ReversedPrimaryKeyRestrictions.java
deleted file mode 100644
index 9107acd..0000000
--- a/src/java/org/apache/cassandra/cql3/restrictions/ReversedPrimaryKeyRestrictions.java
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.restrictions;
-
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.QueryOptions;
-import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-
-/**
- * <code>PrimaryKeyRestrictions</code> decorator that reverse the slices.
- */
-final class ReversedPrimaryKeyRestrictions extends ForwardingPrimaryKeyRestrictions
-{
-    /**
-     * The decorated restrictions.
-     */
-    private PrimaryKeyRestrictions restrictions;
-
-    public ReversedPrimaryKeyRestrictions(PrimaryKeyRestrictions restrictions)
-    {
-        this.restrictions = restrictions;
-    }
-
-    @Override
-    public PrimaryKeyRestrictions mergeWith(Restriction restriction) throws InvalidRequestException
-    {
-        return new ReversedPrimaryKeyRestrictions(this.restrictions.mergeWith(restriction));
-    }
-
-    @Override
-    public List<ByteBuffer> bounds(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
-    {
-        List<ByteBuffer> buffers = restrictions.bounds(cfm, bound.reverse(), options);
-        Collections.reverse(buffers);
-        return buffers;
-    }
-
-    @Override
-    public List<Composite> boundsAsComposites(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
-    {
-        List<Composite> composites = restrictions.boundsAsComposites(cfm, bound.reverse(), options);
-        Collections.reverse(composites);
-        return composites;
-    }
-
-    @Override
-    public boolean isInclusive(Bound bound)
-    {
-        return this.restrictions.isInclusive(bound.reverse());
-    }
-
-    @Override
-    protected PrimaryKeyRestrictions getDelegate()
-    {
-        return this.restrictions;
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
index 1f4960b..5985962 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java

@@ -20,20 +20,17 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.Term.Terminal;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.MultiCBuilder;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkBindValueSet;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
@@ -73,8 +70,11 @@
     @Override
     public boolean hasSupportingIndex(SecondaryIndexManager indexManager)
     {
-        SecondaryIndex index = indexManager.getIndexForColumn(columnDef.name.bytes);
-        return index != null && isSupportedBy(index);
+        for (Index index : indexManager.listIndexes())
+            if (isSupportedBy(index))
+                return true;
+
+        return false;
     }
 
     @Override
@@ -112,26 +112,26 @@
     /**
      * Check if this type of restriction is supported by the specified index.
      *
-     * @param index the Secondary index
+     * @param index the secondary index
      * @return <code>true</code> this type of restriction is supported by the specified index,
      * <code>false</code> otherwise.
      */
-    protected abstract boolean isSupportedBy(SecondaryIndex index);
+    protected abstract boolean isSupportedBy(Index index);
 
-    public static final class EQ extends SingleColumnRestriction
+    public static class EQRestriction extends SingleColumnRestriction
     {
-        private final Term value;
+        public final Term value;
 
-        public EQ(ColumnDefinition columnDef, Term value)
+        public EQRestriction(ColumnDefinition columnDef, Term value)
         {
             super(columnDef);
             this.value = value;
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return value.getFunctions();
+            value.addFunctionsTo(functions);
         }
 
         @Override
@@ -143,20 +143,19 @@
         @Override
         MultiColumnRestriction toMultiColumnRestriction()
         {
-            return new MultiColumnRestriction.EQ(Collections.singletonList(columnDef), value);
+            return new MultiColumnRestriction.EQRestriction(Collections.singletonList(columnDef), value);
         }
 
         @Override
-        public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                         SecondaryIndexManager indexManager,
-                                         QueryOptions options) throws InvalidRequestException
+        public void addRowFilterTo(RowFilter filter,
+                                   SecondaryIndexManager indexManager,
+                                   QueryOptions options)
         {
-            ByteBuffer buffer = validateIndexedValue(columnDef, value.bindAndGet(options));
-            expressions.add(new IndexExpression(columnDef.name.bytes, Operator.EQ, buffer));
+            filter.add(columnDef, Operator.EQ, value.bindAndGet(options));
         }
 
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
             builder.addElementToAll(value.bindAndGet(options));
             checkFalse(builder.containsNull(), "Invalid null value in condition for column %s", columnDef.name);
@@ -177,25 +176,15 @@
         }
 
         @Override
-        protected boolean isSupportedBy(SecondaryIndex index)
+        protected boolean isSupportedBy(Index index)
         {
-            return index.supportsOperator(Operator.EQ);
-        }
-
-        @Override
-        public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-        {
-            assert columnDef.isClusteringColumn();
-
-            // Dense non-compound tables do not accept empty ByteBuffers. By consequence, we know that
-            // any query with an EQ restriction containing an empty value will not return any results.
-            return !cfm.comparator.isCompound() && !value.bindAndGet(options).hasRemaining();
+            return index.supportsExpression(columnDef, Operator.EQ);
         }
     }
 
-    public static abstract class IN extends SingleColumnRestriction
+    public static abstract class INRestriction extends SingleColumnRestriction
     {
-        public IN(ColumnDefinition columnDef)
+        public INRestriction(ColumnDefinition columnDef)
         {
             super(columnDef);
         }
@@ -213,60 +202,39 @@
         }
 
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
-            List<ByteBuffer> values = filterValuesIfNeeded(cfm, getValues(options));
-
-            builder.addEachElementToAll(values);
+            builder.addEachElementToAll(getValues(options));
             checkFalse(builder.containsNull(), "Invalid null value in condition for column %s", columnDef.name);
             checkFalse(builder.containsUnset(), "Invalid unset value for column %s", columnDef.name);
             return builder;
         }
 
-        private List<ByteBuffer> filterValuesIfNeeded(CFMetaData cfm, List<ByteBuffer> values)
-        {
-            if (!columnDef.isClusteringColumn() || cfm.comparator.isCompound())
-                return values;
-
-            // Dense non-compound tables do not accept empty ByteBuffers. By consequence, we know that we can
-            // ignore any IN value which is an empty byte buffer an which otherwise will trigger an error.
-
-            // As some List implementations do not support remove, we copy the list to be on the safe side.
-            List<ByteBuffer> filteredValues = new ArrayList<>(values.size());
-            for (ByteBuffer value : values)
-            {
-                if (value.hasRemaining())
-                    filteredValues.add(value);
-            }
-            return filteredValues;
-        }
-
         @Override
-        public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                         SecondaryIndexManager indexManager,
-                                         QueryOptions options) throws InvalidRequestException
+        public void addRowFilterTo(RowFilter filter,
+                                   SecondaryIndexManager indexManager,
+                                   QueryOptions options) throws InvalidRequestException
         {
             List<ByteBuffer> values = getValues(options);
             checkTrue(values.size() == 1, "IN restrictions are not supported on indexed columns");
 
-            ByteBuffer value = validateIndexedValue(columnDef, values.get(0));
-            expressions.add(new IndexExpression(columnDef.name.bytes, Operator.EQ, value));
+            filter.add(columnDef, Operator.EQ, values.get(0));
         }
 
         @Override
-        protected final boolean isSupportedBy(SecondaryIndex index)
+        protected final boolean isSupportedBy(Index index)
         {
-            return index.supportsOperator(Operator.IN);
+            return index.supportsExpression(columnDef, Operator.IN);
         }
 
         protected abstract List<ByteBuffer> getValues(QueryOptions options) throws InvalidRequestException;
     }
 
-    public static class InWithValues extends IN
+    public static class InRestrictionWithValues extends INRestriction
     {
         protected final List<Term> values;
 
-        public InWithValues(ColumnDefinition columnDef, List<Term> values)
+        public InRestrictionWithValues(ColumnDefinition columnDef, List<Term> values)
         {
             super(columnDef);
             this.values = values;
@@ -275,13 +243,13 @@
         @Override
         MultiColumnRestriction toMultiColumnRestriction()
         {
-            return new MultiColumnRestriction.InWithValues(Collections.singletonList(columnDef), values);
+            return new MultiColumnRestriction.InRestrictionWithValues(Collections.singletonList(columnDef), values);
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Terms.getFunctions(values);
+            Terms.addFunctions(values, functions);
         }
 
         @Override
@@ -300,26 +268,25 @@
         }
     }
 
-    public static class InWithMarker extends IN
+    public static class InRestrictionWithMarker extends INRestriction
     {
         protected final AbstractMarker marker;
 
-        public InWithMarker(ColumnDefinition columnDef, AbstractMarker marker)
+        public InRestrictionWithMarker(ColumnDefinition columnDef, AbstractMarker marker)
         {
             super(columnDef);
             this.marker = marker;
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Collections.emptySet();
         }
 
         @Override
         MultiColumnRestriction toMultiColumnRestriction()
         {
-            return new MultiColumnRestriction.InWithMarker(Collections.singletonList(columnDef), marker);
+            return new MultiColumnRestriction.InRestrictionWithMarker(Collections.singletonList(columnDef), marker);
         }
 
         @Override
@@ -339,26 +306,26 @@
         }
     }
 
-    public static final class Slice extends SingleColumnRestriction
+    public static class SliceRestriction extends SingleColumnRestriction
     {
-        private final TermSlice slice;
+        public final TermSlice slice;
 
-        public Slice(ColumnDefinition columnDef, Bound bound, boolean inclusive, Term term)
+        public SliceRestriction(ColumnDefinition columnDef, Bound bound, boolean inclusive, Term term)
         {
             super(columnDef);
             slice = TermSlice.newInstance(bound, inclusive, term);
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return slice.getFunctions();
+            slice.addFunctionsTo(functions);
         }
 
         @Override
         MultiColumnRestriction toMultiColumnRestriction()
         {
-            return new MultiColumnRestriction.Slice(Collections.singletonList(columnDef), slice);
+            return new MultiColumnRestriction.SliceRestriction(Collections.singletonList(columnDef), slice);
         }
 
         @Override
@@ -368,7 +335,7 @@
         }
 
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
             throw new UnsupportedOperationException();
         }
@@ -380,7 +347,7 @@
         }
 
         @Override
-        public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options)
+        public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
         {
             Bound b = reverseBoundIfNeeded(getFirstColumn(), bound);
 
@@ -406,7 +373,7 @@
                       "Column \"%s\" cannot be restricted by both an equality and an inequality relation",
                       columnDef.name);
 
-            SingleColumnRestriction.Slice otherSlice = (SingleColumnRestriction.Slice) otherRestriction;
+            SingleColumnRestriction.SliceRestriction otherSlice = (SingleColumnRestriction.SliceRestriction) otherRestriction;
 
             checkFalse(hasBound(Bound.START) && otherSlice.hasBound(Bound.START),
                        "More than one restriction was found for the start bound on %s", columnDef.name);
@@ -414,33 +381,21 @@
             checkFalse(hasBound(Bound.END) && otherSlice.hasBound(Bound.END),
                        "More than one restriction was found for the end bound on %s", columnDef.name);
 
-            return new Slice(columnDef,  slice.merge(otherSlice.slice));
+            return new SliceRestriction(columnDef,  slice.merge(otherSlice.slice));
         }
 
         @Override
-        public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                         SecondaryIndexManager indexManager,
-                                         QueryOptions options) throws InvalidRequestException
+        public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException
         {
             for (Bound b : Bound.values())
-            {
                 if (hasBound(b))
-                {
-                    ByteBuffer value = validateIndexedValue(columnDef, slice.bound(b).bindAndGet(options));
-                    Operator op = slice.getIndexOperator(b);
-                    // If the underlying comparator for name is reversed, we need to reverse the IndexOperator: user operation
-                    // always refer to the "forward" sorting even if the clustering order is reversed, but the 2ndary code does
-                    // use the underlying comparator as is.
-                    op = columnDef.isReversedType() ? op.reverse() : op;
-                    expressions.add(new IndexExpression(columnDef.name.bytes, op, value));
-                }
-            }
+                    filter.add(columnDef, slice.getIndexOperator(b), slice.bound(b).bindAndGet(options));
         }
 
         @Override
-        protected boolean isSupportedBy(SecondaryIndex index)
+        protected boolean isSupportedBy(Index index)
         {
-            return slice.isSupportedBy(index);
+            return slice.isSupportedBy(columnDef, index);
         }
 
         @Override
@@ -449,19 +404,7 @@
             return String.format("SLICE%s", slice);
         }
 
-        @Override
-        public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-        {
-            assert columnDef.isClusteringColumn();
-
-            // Dense non-compound tables do not accept empty ByteBuffers. By consequence, we know that
-            // any query with a slice restriction with an empty value for the END bound will not return any results.
-            return !cfm.comparator.isCompound()
-                    && hasBound(Bound.END)
-                    && !slice.bound(Bound.END).bindAndGet(options).hasRemaining();
-        }
-
-        private Slice(ColumnDefinition columnDef, TermSlice slice)
+        SliceRestriction(ColumnDefinition columnDef, TermSlice slice)
         {
             super(columnDef);
             this.slice = slice;
@@ -469,14 +412,14 @@
     }
 
     // This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
-    public static final class Contains extends SingleColumnRestriction
+    public static final class ContainsRestriction extends SingleColumnRestriction
     {
         private List<Term> values = new ArrayList<>(); // for CONTAINS
         private List<Term> keys = new ArrayList<>(); // for CONTAINS_KEY
         private List<Term> entryKeys = new ArrayList<>(); // for map[key] = value
         private List<Term> entryValues = new ArrayList<>(); // for map[key] = value
 
-        public Contains(ColumnDefinition columnDef, Term t, boolean isKey)
+        public ContainsRestriction(ColumnDefinition columnDef, Term t, boolean isKey)
         {
             super(columnDef);
             if (isKey)
@@ -485,7 +428,7 @@
                 values.add(t);
         }
 
-        public Contains(ColumnDefinition columnDef, Term mapKey, Term mapValue)
+        public ContainsRestriction(ColumnDefinition columnDef, Term mapKey, Term mapValue)
         {
             super(columnDef);
             entryKeys.add(mapKey);
@@ -505,7 +448,7 @@
         }
 
         @Override
-        public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         {
             throw new UnsupportedOperationException();
         }
@@ -523,48 +466,42 @@
                       "Collection column %s can only be restricted by CONTAINS, CONTAINS KEY, or map-entry equality",
                       columnDef.name);
 
-            SingleColumnRestriction.Contains newContains = new Contains(columnDef);
+            SingleColumnRestriction.ContainsRestriction newContains = new ContainsRestriction(columnDef);
 
             copyKeysAndValues(this, newContains);
-            copyKeysAndValues((Contains) otherRestriction, newContains);
+            copyKeysAndValues((ContainsRestriction) otherRestriction, newContains);
 
             return newContains;
         }
 
         @Override
-        public void addIndexExpressionTo(List<IndexExpression> expressions,
-                                         SecondaryIndexManager indexManager,
-                                         QueryOptions options)
-                                         throws InvalidRequestException
+        public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException
         {
-            addExpressionsFor(expressions, bindAndGet(values, options), Operator.CONTAINS);
-            addExpressionsFor(expressions, bindAndGet(keys, options), Operator.CONTAINS_KEY);
-            addExpressionsFor(expressions, entries(options), Operator.EQ);
-        }
+            for (ByteBuffer value : bindAndGet(values, options))
+                filter.add(columnDef, Operator.CONTAINS, value);
+            for (ByteBuffer key : bindAndGet(keys, options))
+                filter.add(columnDef, Operator.CONTAINS_KEY, key);
 
-        private void addExpressionsFor(List<IndexExpression> target, List<ByteBuffer> values,
-                                       Operator op) throws InvalidRequestException
-        {
-            for (ByteBuffer value : values)
-            {
-                validateIndexedValue(columnDef, value);
-                target.add(new IndexExpression(columnDef.name.bytes, op, value));
-            }
+            List<ByteBuffer> eks = bindAndGet(entryKeys, options);
+            List<ByteBuffer> evs = bindAndGet(entryValues, options);
+            assert eks.size() == evs.size();
+            for (int i = 0; i < eks.size(); i++)
+                filter.addMapEquality(columnDef, eks.get(i), Operator.EQ, evs.get(i));
         }
 
         @Override
-        protected boolean isSupportedBy(SecondaryIndex index)
+        protected boolean isSupportedBy(Index index)
         {
             boolean supported = false;
 
             if (numberOfValues() > 0)
-                supported |= index.supportsOperator(Operator.CONTAINS);
+                supported |= index.supportsExpression(columnDef, Operator.CONTAINS);
 
             if (numberOfKeys() > 0)
-                supported |= index.supportsOperator(Operator.CONTAINS_KEY);
+                supported |= index.supportsExpression(columnDef, Operator.CONTAINS_KEY);
 
             if (numberOfEntries() > 0)
-                supported |= index.supportsOperator(Operator.EQ);
+                supported |= index.supportsExpression(columnDef, Operator.EQ);
 
             return supported;
         }
@@ -585,12 +522,12 @@
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Iterables.concat(Terms.getFunctions(values),
-                                    Terms.getFunctions(keys),
-                                    Terms.getFunctions(entryKeys),
-                                    Terms.getFunctions(entryValues));
+            Terms.addFunctions(values, functions);
+            Terms.addFunctions(keys, functions);
+            Terms.addFunctions(entryKeys, functions);
+            Terms.addFunctions(entryValues, functions);
         }
 
         @Override
@@ -606,7 +543,7 @@
         }
 
         @Override
-        public CompositesBuilder appendBoundTo(CFMetaData cfm, CompositesBuilder builder, Bound bound, QueryOptions options)
+        public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
         {
             throw new UnsupportedOperationException();
         }
@@ -617,20 +554,6 @@
             throw new UnsupportedOperationException();
         }
 
-        private List<ByteBuffer> entries(QueryOptions options) throws InvalidRequestException
-        {
-            List<ByteBuffer> entryBuffers = new ArrayList<>(entryKeys.size());
-            List<ByteBuffer> keyBuffers = bindAndGet(entryKeys, options);
-            List<ByteBuffer> valueBuffers = bindAndGet(entryValues, options);
-            for (int i = 0; i < entryKeys.size(); i++)
-            {
-                if (valueBuffers.get(i) == null)
-                    throw new InvalidRequestException("Unsupported null value for map-entry equality");
-                entryBuffers.add(CompositeType.build(keyBuffers.get(i), valueBuffers.get(i)));
-            }
-            return entryBuffers;
-        }
-
         /**
          * Binds the query options to the specified terms and returns the resulting values.
          *
@@ -653,7 +576,7 @@
          * @param from the <code>Contains</code> to copy from
          * @param to the <code>Contains</code> to copy to
          */
-        private static void copyKeysAndValues(Contains from, Contains to)
+        private static void copyKeysAndValues(ContainsRestriction from, ContainsRestriction to)
         {
             to.values.addAll(from.values);
             to.keys.addAll(from.keys);
@@ -661,9 +584,264 @@
             to.entryValues.addAll(from.entryValues);
         }
 
-        private Contains(ColumnDefinition columnDef)
+        private ContainsRestriction(ColumnDefinition columnDef)
         {
             super(columnDef);
         }
     }
+
+    public static final class IsNotNullRestriction extends SingleColumnRestriction
+    {
+        public IsNotNullRestriction(ColumnDefinition columnDef)
+        {
+            super(columnDef);
+        }
+
+        @Override
+        public void addFunctionsTo(List<Function> functions)
+        {
+        }
+
+        @Override
+        public boolean isNotNull()
+        {
+            return true;
+        }
+
+        @Override
+        MultiColumnRestriction toMultiColumnRestriction()
+        {
+            return new MultiColumnRestriction.NotNullRestriction(Collections.singletonList(columnDef));
+        }
+
+        @Override
+        public void addRowFilterTo(RowFilter filter,
+                                   SecondaryIndexManager indexManager,
+                                   QueryOptions options)
+        {
+            throw new UnsupportedOperationException("Secondary indexes do not support IS NOT NULL restrictions");
+        }
+
+        @Override
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
+        {
+            throw new UnsupportedOperationException("Cannot use IS NOT NULL restriction for slicing");
+        }
+
+        @Override
+        public String toString()
+        {
+            return "IS NOT NULL";
+        }
+
+        @Override
+        public Restriction doMergeWith(Restriction otherRestriction) throws InvalidRequestException
+        {
+            throw invalidRequest("%s cannot be restricted by a relation if it includes an IS NOT NULL", columnDef.name);
+        }
+
+        @Override
+        protected boolean isSupportedBy(Index index)
+        {
+            return index.supportsExpression(columnDef, Operator.IS_NOT);
+        }
+    }
+
+    /**
+     * Super Column Compatibiltiy
+     */
+
+    public static class SuperColumnMultiEQRestriction extends EQRestriction
+    {
+        public ByteBuffer firstValue;
+        public ByteBuffer secondValue;
+
+        public SuperColumnMultiEQRestriction(ColumnDefinition columnDef, Term value)
+        {
+            super(columnDef, value);
+        }
+
+        @Override
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
+        {
+            Term term = value.bind(options);
+
+            assert (term instanceof Tuples.Value);
+            firstValue = ((Tuples.Value)term).getElements().get(0);
+            secondValue = ((Tuples.Value)term).getElements().get(1);
+
+            builder.addElementToAll(firstValue);
+            checkFalse(builder.containsNull(), "Invalid null value in condition for column %s", columnDef.name);
+            checkFalse(builder.containsUnset(), "Invalid unset value for column %s", columnDef.name);
+            return builder;
+        }
+    }
+
+    public static class SuperColumnMultiSliceRestriction extends SliceRestriction
+    {
+        public ByteBuffer firstValue;
+        public ByteBuffer secondValue;
+
+        // These are here to avoid polluting SliceRestriction
+        public final Bound bound;
+        public final boolean trueInclusive;
+        public SuperColumnMultiSliceRestriction(ColumnDefinition columnDef, Bound bound, boolean inclusive, Term term)
+        {
+            super(columnDef, bound, true, term);
+            this.bound = bound;
+            this.trueInclusive = inclusive;
+
+        }
+
+        @Override
+        public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
+        {
+            Bound b = reverseBoundIfNeeded(getFirstColumn(), bound);
+
+            if (!hasBound(b))
+                return builder;
+
+            Term term = slice.bound(b);
+
+            assert (term instanceof Tuples.Value);
+            firstValue = ((Tuples.Value)term).getElements().get(0);
+            secondValue = ((Tuples.Value)term).getElements().get(1);
+
+            checkBindValueSet(firstValue, "Invalid unset value for column %s", columnDef.name);
+            checkBindValueSet(secondValue, "Invalid unset value for column %s", columnDef.name);
+            return builder.addElementToAll(firstValue);
+
+        }
+    }
+
+    public static final class SuperColumnKeyEQRestriction extends EQRestriction
+    {
+        public SuperColumnKeyEQRestriction(ColumnDefinition columnDef, Term value)
+        {
+            super(columnDef, value);
+        }
+
+        public ByteBuffer bindValue(QueryOptions options)
+        {
+            return value.bindAndGet(options);
+        }
+
+        @Override
+        public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
+        {
+            // no-op
+            return builder;
+        }
+
+        @Override
+        public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException
+        {
+            // no-op
+        }
+    }
+
+    public static abstract class SuperColumnKeyINRestriction extends INRestriction
+    {
+        public SuperColumnKeyINRestriction(ColumnDefinition columnDef)
+        {
+            super(columnDef);
+        }
+
+        @Override
+        public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
+        {
+            // no-op
+            return builder;
+        }
+
+        @Override
+        public void addRowFilterTo(RowFilter filter,
+                                   SecondaryIndexManager indexManager,
+                                   QueryOptions options) throws InvalidRequestException
+        {
+            // no-op
+        }
+
+        public void addFunctionsTo(List<Function> functions)
+        {
+            // no-op
+        }
+
+        MultiColumnRestriction toMultiColumnRestriction()
+        {
+            // no-op
+            return null;
+        }
+
+        public abstract List<ByteBuffer> getValues(QueryOptions options) throws InvalidRequestException;
+    }
+
+    public static class SuperColumnKeyINRestrictionWithMarkers extends SuperColumnKeyINRestriction
+    {
+        protected final AbstractMarker marker;
+
+        public SuperColumnKeyINRestrictionWithMarkers(ColumnDefinition columnDef, AbstractMarker marker)
+        {
+            super(columnDef);
+            this.marker = marker;
+        }
+
+        public List<ByteBuffer> getValues(QueryOptions options) throws InvalidRequestException
+        {
+            Terminal term = marker.bind(options);
+            checkNotNull(term, "Invalid null value for column %s", columnDef.name);
+            checkFalse(term == Constants.UNSET_VALUE, "Invalid unset value for column %s", columnDef.name);
+            Term.MultiItemTerminal lval = (Term.MultiItemTerminal) term;
+            return lval.getElements();
+        }
+    }
+
+    public static class SuperColumnKeyINRestrictionWithValues extends SuperColumnKeyINRestriction
+    {
+        private final List<Term> values;
+
+        public SuperColumnKeyINRestrictionWithValues(ColumnDefinition columnDef, List<Term> values)
+        {
+            super(columnDef);
+            this.values = values;
+        }
+
+        public List<ByteBuffer> getValues(QueryOptions options) throws InvalidRequestException
+        {
+            List<ByteBuffer> buffers = new ArrayList<>(values.size());
+            for (Term value : values)
+                buffers.add(value.bindAndGet(options));
+            return buffers;
+        }
+    }
+
+    public static class SuperColumnKeySliceRestriction extends SliceRestriction
+    {
+        // These are here to avoid polluting SliceRestriction
+        private Term term;
+
+        public SuperColumnKeySliceRestriction(ColumnDefinition columnDef, Bound bound, boolean inclusive, Term term)
+        {
+            super(columnDef, bound, inclusive, term);
+            this.term = term;
+        }
+
+        public ByteBuffer bindValue(QueryOptions options)
+        {
+            return term.bindAndGet(options);
+        }
+
+        @Override
+        public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOptions options)
+        {
+            // no-op
+            return builder;
+        }
+
+        @Override
+        public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options) throws InvalidRequestException
+        {
+            // no-op
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index 2c396c4..d7d6f48 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java

@@ -22,21 +22,26 @@
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
+import org.apache.cassandra.cql3.statements.StatementType;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.btree.BTreeSet;
 
-import static org.apache.cassandra.config.ColumnDefinition.toIdentifiers;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
@@ -52,6 +57,11 @@
             "this query despite the performance unpredictability, use ALLOW FILTERING";
 
     /**
+     * The type of statement
+     */
+    private final StatementType type;
+
+    /**
      * The Column Family meta data
      */
     public final CFMetaData cfm;
@@ -77,10 +87,12 @@
      */
     private boolean hasRegularColumnsRestriction = false;
 
+    private Set<ColumnDefinition> notNullColumns;
+
     /**
-     * The restrictions used to build the index expressions
+     * The restrictions used to build the row filter
      */
-    private final List<Restrictions> indexRestrictions = new ArrayList<>();
+    private final IndexRestrictions indexRestrictions = new IndexRestrictions();
 
     /**
      * <code>true</code> if the secondary index need to be queried, <code>false</code> otherwise
@@ -95,62 +107,115 @@
     /**
      * Creates a new empty <code>StatementRestrictions</code>.
      *
+     * @param type the type of statement
      * @param cfm the column family meta data
      * @return a new empty <code>StatementRestrictions</code>.
      */
-    public static StatementRestrictions empty(CFMetaData cfm)
+    public static StatementRestrictions empty(StatementType type, CFMetaData cfm)
     {
-        return new StatementRestrictions(cfm);
+        return new StatementRestrictions(type, cfm);
     }
 
-    private StatementRestrictions(CFMetaData cfm)
+    private StatementRestrictions(StatementType type, CFMetaData cfm)
     {
+        this.type = type;
         this.cfm = cfm;
-        this.partitionKeyRestrictions = new PrimaryKeyRestrictionSet(cfm.getKeyValidatorAsCType());
-        this.clusteringColumnsRestrictions = new PrimaryKeyRestrictionSet(cfm.comparator);
+        this.partitionKeyRestrictions = new PrimaryKeyRestrictionSet(cfm.getKeyValidatorAsClusteringComparator(), true);
+        this.clusteringColumnsRestrictions = new PrimaryKeyRestrictionSet(cfm.comparator, false);
         this.nonPrimaryKeyRestrictions = new RestrictionSet();
+        this.notNullColumns = new HashSet<>();
     }
 
-    public StatementRestrictions(CFMetaData cfm,
-                                 List<Relation> whereClause,
+    public StatementRestrictions(StatementType type,
+                                 CFMetaData cfm,
+                                 WhereClause whereClause,
                                  VariableSpecifications boundNames,
                                  boolean selectsOnlyStaticColumns,
                                  boolean selectACollection,
-                                 boolean allowFiltering)
+                                 boolean allowFiltering,
+                                 boolean forView) throws InvalidRequestException
     {
+        this.type = type;
         this.cfm = cfm;
-        this.partitionKeyRestrictions = new PrimaryKeyRestrictionSet(cfm.getKeyValidatorAsCType());
-        this.clusteringColumnsRestrictions = new PrimaryKeyRestrictionSet(cfm.comparator);
+        this.partitionKeyRestrictions = new PrimaryKeyRestrictionSet(cfm.getKeyValidatorAsClusteringComparator(), true);
+        this.clusteringColumnsRestrictions = new PrimaryKeyRestrictionSet(cfm.comparator, false);
         this.nonPrimaryKeyRestrictions = new RestrictionSet();
+        this.notNullColumns = new HashSet<>();
 
         /*
-         * WHERE clause. For a given entity, rules are: - EQ relation conflicts with anything else (including a 2nd EQ)
-         * - Can't have more than one LT(E) relation (resp. GT(E) relation) - IN relation are restricted to row keys
-         * (for now) and conflicts with anything else (we could allow two IN for the same entity but that doesn't seem
-         * very useful) - The value_alias cannot be restricted in any way (we don't support wide rows with indexed value
-         * in CQL so far)
+         * WHERE clause. For a given entity, rules are:
+         *   - EQ relation conflicts with anything else (including a 2nd EQ)
+         *   - Can't have more than one LT(E) relation (resp. GT(E) relation)
+         *   - IN relation are restricted to row keys (for now) and conflicts with anything else (we could
+         *     allow two IN for the same entity but that doesn't seem very useful)
+         *   - The value_alias cannot be restricted in any way (we don't support wide rows with indexed value
+         *     in CQL so far)
          */
-        for (Relation relation : whereClause)
-            addRestriction(relation.toRestriction(cfm, boundNames));
+        for (Relation relation : whereClause.relations)
+        {
+            if (relation.operator() == Operator.IS_NOT)
+            {
+                if (!forView)
+                    throw new InvalidRequestException("Unsupported restriction: " + relation);
 
-        SecondaryIndexManager secondaryIndexManager = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName).indexManager;
-        boolean hasQueriableClusteringColumnIndex = clusteringColumnsRestrictions.hasSupportingIndex(secondaryIndexManager);
-        boolean hasQueriableIndex = hasQueriableClusteringColumnIndex
-                || partitionKeyRestrictions.hasSupportingIndex(secondaryIndexManager)
-                || nonPrimaryKeyRestrictions.hasSupportingIndex(secondaryIndexManager);
+                for (ColumnDefinition def : relation.toRestriction(cfm, boundNames).getColumnDefs())
+                    this.notNullColumns.add(def);
+            }
+            else
+            {
+                if (cfm.isSuper() && cfm.isDense() && !relation.onToken())
+                    addRestriction(relation.toSuperColumnAdapter().toRestriction(cfm, boundNames));
+                else
+                    addRestriction(relation.toRestriction(cfm, boundNames));
+            }
+        }
+
+        boolean hasQueriableClusteringColumnIndex = false;
+        boolean hasQueriableIndex = false;
+
+        if (type.allowUseOfSecondaryIndices())
+        {
+            ColumnFamilyStore cfs = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
+            SecondaryIndexManager secondaryIndexManager = cfs.indexManager;
+
+            if (whereClause.containsCustomExpressions())
+                processCustomIndexExpressions(whereClause.expressions, boundNames, secondaryIndexManager);
+
+            hasQueriableClusteringColumnIndex = clusteringColumnsRestrictions.hasSupportingIndex(secondaryIndexManager);
+            hasQueriableIndex = !indexRestrictions.getCustomIndexExpressions().isEmpty()
+                    || hasQueriableClusteringColumnIndex
+                    || partitionKeyRestrictions.hasSupportingIndex(secondaryIndexManager)
+                    || nonPrimaryKeyRestrictions.hasSupportingIndex(secondaryIndexManager);
+        }
 
         // At this point, the select statement if fully constructed, but we still have a few things to validate
         processPartitionKeyRestrictions(hasQueriableIndex);
 
         // Some but not all of the partition key columns have been specified;
-        // hence we need turn these restrictions into index expressions.
+        // hence we need turn these restrictions into a row filter.
         if (usesSecondaryIndexing)
             indexRestrictions.add(partitionKeyRestrictions);
 
-        checkFalse(selectsOnlyStaticColumns && hasClusteringColumnsRestriction(),
-                   "Cannot restrict clustering columns when selecting only static columns");
+        if (selectsOnlyStaticColumns && hasClusteringColumnsRestriction())
+        {
+            // If the only updated/deleted columns are static, then we don't need clustering columns.
+            // And in fact, unless it is an INSERT, we reject if clustering colums are provided as that
+            // suggest something unintended. For instance, given:
+            //   CREATE TABLE t (k int, v int, s int static, PRIMARY KEY (k, v))
+            // it can make sense to do:
+            //   INSERT INTO t(k, v, s) VALUES (0, 1, 2)
+            // but both
+            //   UPDATE t SET s = 3 WHERE k = 0 AND v = 1
+            //   DELETE v FROM t WHERE k = 0 AND v = 1
+            // sounds like you don't really understand what your are doing.
+            if (type.isDelete() || type.isUpdate())
+                throw invalidRequest("Invalid restrictions on clustering columns since the %s statement modifies only static columns",
+                                     type);
+            if (type.isSelect())
+                throw invalidRequest("Cannot restrict clustering columns when selecting only static columns");
+        }
 
-        processClusteringColumnsRestrictions(hasQueriableIndex, selectACollection);
+        processClusteringColumnsRestrictions(hasQueriableIndex, selectsOnlyStaticColumns, selectACollection, forView);
 
         // Covers indexes on the first clustering column (among others).
         if (isKeyRange && hasQueriableClusteringColumnIndex)
@@ -165,17 +230,26 @@
         // there is restrictions not covered by the PK.
         if (!nonPrimaryKeyRestrictions.isEmpty())
         {
-            if (!hasQueriableIndex)
+            if (!type.allowNonPrimaryKeyInWhereClause())
             {
-                // Filtering for non-index query is only supported for thrift static CFs
-                if (cfm.comparator.isDense() ||  cfm.comparator.isCompound())
-                    throw invalidRequest("Predicates on non-primary-key columns (%s) are not yet supported for non secondary index queries",
-                                         Joiner.on(", ").join(toIdentifiers(nonPrimaryKeyRestrictions.getColumnDefs())));
+                Collection<ColumnIdentifier> nonPrimaryKeyColumns =
+                        ColumnDefinition.toIdentifiers(nonPrimaryKeyRestrictions.getColumnDefs());
 
-                if (!allowFiltering)
-                    throw invalidRequest(REQUIRES_ALLOW_FILTERING_MESSAGE);
+                throw invalidRequest("Non PRIMARY KEY columns found in where clause: %s ",
+                                     Joiner.on(", ").join(nonPrimaryKeyColumns));
             }
-            usesSecondaryIndexing = true;
+            if (hasQueriableIndex)
+            {
+                usesSecondaryIndexing = true;
+            }
+            else if (!allowFiltering && !cfm.isSuper())
+            {
+                throw invalidRequest(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+            }
+
+            checkFalse(clusteringColumnsRestrictions.isEmpty() && cfm.isSuper(),
+                       "Filtering is not supported on SuperColumn tables");
+
             indexRestrictions.add(nonPrimaryKeyRestrictions);
         }
 
@@ -183,7 +257,7 @@
             validateSecondaryIndexSelections(selectsOnlyStaticColumns);
     }
 
-    private void addRestriction(Restriction restriction) throws InvalidRequestException
+    private void addRestriction(Restriction restriction)
     {
         if (restriction.isMultiColumn())
             clusteringColumnsRestrictions = clusteringColumnsRestrictions.mergeWith(restriction);
@@ -193,14 +267,14 @@
             addSingleColumnRestriction((SingleColumnRestriction) restriction);
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return Iterables.concat(partitionKeyRestrictions.getFunctions(),
-                                clusteringColumnsRestrictions.getFunctions(),
-                                nonPrimaryKeyRestrictions.getFunctions());
+        partitionKeyRestrictions.addFunctionsTo(functions);
+        clusteringColumnsRestrictions.addFunctionsTo(functions);
+        nonPrimaryKeyRestrictions.addFunctionsTo(functions);
     }
 
-    private void addSingleColumnRestriction(SingleColumnRestriction restriction) throws InvalidRequestException
+    private void addSingleColumnRestriction(SingleColumnRestriction restriction)
     {
         ColumnDefinition def = restriction.columnDef;
         if (def.isPartitionKey())
@@ -218,6 +292,56 @@
     }
 
     /**
+     * Returns the non-PK column that are restricted.  If includeNotNullRestrictions is true, columns that are restricted
+     * by an IS NOT NULL restriction will be included, otherwise they will not be included (unless another restriction
+     * applies to them).
+     */
+    public Set<ColumnDefinition> nonPKRestrictedColumns(boolean includeNotNullRestrictions)
+    {
+        Set<ColumnDefinition> columns = new HashSet<>();
+        for (Restrictions r : indexRestrictions.getRestrictions())
+        {
+            for (ColumnDefinition def : r.getColumnDefs())
+                if (!def.isPrimaryKeyColumn())
+                    columns.add(def);
+        }
+
+        if (includeNotNullRestrictions)
+        {
+            for (ColumnDefinition def : notNullColumns)
+            {
+                if (!def.isPrimaryKeyColumn())
+                    columns.add(def);
+            }
+        }
+
+        return columns;
+    }
+
+    /**
+     * @return the set of columns that have an IS NOT NULL restriction on them
+     */
+    public Set<ColumnDefinition> notNullColumns()
+    {
+        return notNullColumns;
+    }
+
+    /**
+     * @return true if column is restricted by some restriction, false otherwise
+     */
+    public boolean isRestricted(ColumnDefinition column)
+    {
+        if (notNullColumns.contains(column))
+            return true;
+        else if (column.isPartitionKey())
+            return partitionKeyRestrictions.getColumnDefs().contains(column);
+        else if (column.isClusteringColumn())
+            return clusteringColumnsRestrictions.getColumnDefs().contains(column);
+        else
+            return nonPrimaryKeyRestrictions.getColumnDefs().contains(column);
+    }
+
+    /**
      * Checks if the restrictions on the partition key is an IN restriction.
      *
      * @return <code>true</code> the restrictions on the partition key is an IN restriction, <code>false</code>
@@ -253,8 +377,19 @@
         return this.usesSecondaryIndexing;
     }
 
-    private void processPartitionKeyRestrictions(boolean hasQueriableIndex) throws InvalidRequestException
+    private void processPartitionKeyRestrictions(boolean hasQueriableIndex)
     {
+        if (!type.allowPartitionKeyRanges())
+        {
+            checkFalse(partitionKeyRestrictions.isOnToken(),
+                       "The token function cannot be used in WHERE clauses for %s statements", type);
+
+            if (hasUnrestrictedPartitionKeyComponents())
+                throw invalidRequest("Some partition key parts are missing: %s",
+                                     Joiner.on(", ").join(getPartitionKeyUnrestrictedComponents()));
+        }
+        else
+        {
         // If there is a queriable index, no special condition are required on the other restrictions.
         // But we still need to know 2 things:
         // - If we don't have a queriable index, is the query ok
@@ -264,17 +399,18 @@
         if (partitionKeyRestrictions.isOnToken())
             isKeyRange = true;
 
-        if (hasPartitionKeyUnrestrictedComponents())
-        {
-            if (!partitionKeyRestrictions.isEmpty())
+            if (hasUnrestrictedPartitionKeyComponents())
             {
-                if (!hasQueriableIndex)
-                    throw invalidRequest("Partition key parts: %s must be restricted as other parts are",
-                                         Joiner.on(", ").join(getPartitionKeyUnrestrictedComponents()));
-            }
+                if (!partitionKeyRestrictions.isEmpty())
+                {
+                    if (!hasQueriableIndex)
+                        throw invalidRequest("Partition key parts: %s must be restricted as other parts are",
+                                             Joiner.on(", ").join(getPartitionKeyUnrestrictedComponents()));
+                }
 
-            isKeyRange = true;
-            usesSecondaryIndexing = hasQueriableIndex;
+                isKeyRange = true;
+                usesSecondaryIndexing = hasQueriableIndex;
+            }
         }
     }
 
@@ -282,7 +418,7 @@
      * Checks if the partition key has some unrestricted components.
      * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
      */
-    private boolean hasPartitionKeyUnrestrictedComponents()
+    private boolean hasUnrestrictedPartitionKeyComponents()
     {
         return partitionKeyRestrictions.size() <  cfm.partitionKeyColumns().size();
     }
@@ -305,7 +441,7 @@
      * Returns the partition key components that are not restricted.
      * @return the partition key components that are not restricted.
      */
-    private List<ColumnIdentifier> getPartitionKeyUnrestrictedComponents()
+    private Collection<ColumnIdentifier> getPartitionKeyUnrestrictedComponents()
     {
         List<ColumnDefinition> list = new ArrayList<>(cfm.partitionKeyColumns());
         list.removeAll(partitionKeyRestrictions.getColumnDefs());
@@ -313,44 +449,82 @@
     }
 
     /**
+     * Checks if the restrictions on the partition key are token restrictions.
+     *
+     * @return <code>true</code> if the restrictions on the partition key are token restrictions,
+     * <code>false</code> otherwise.
+     */
+    public boolean isPartitionKeyRestrictionsOnToken()
+    {
+        return partitionKeyRestrictions.isOnToken();
+    }
+
+    /**
+     * Checks if restrictions on the clustering key have IN restrictions.
+     *
+     * @return <code>true</code> if the restrictions on the clustering key have IN restrictions,
+     * <code>false</code> otherwise.
+     */
+    public boolean clusteringKeyRestrictionsHasIN()
+    {
+        return clusteringColumnsRestrictions.isIN();
+    }
+
+    /**
      * Processes the clustering column restrictions.
      *
      * @param hasQueriableIndex <code>true</code> if some of the queried data are indexed, <code>false</code> otherwise
+     * @param selectsOnlyStaticColumns <code>true</code> if the selected or modified columns are all statics,
+     * <code>false</code> otherwise.
      * @param selectACollection <code>true</code> if the query should return a collection column
-     * @throws InvalidRequestException if the request is invalid
      */
     private void processClusteringColumnsRestrictions(boolean hasQueriableIndex,
-                                                      boolean selectACollection) throws InvalidRequestException
+                                                      boolean selectsOnlyStaticColumns,
+                                                      boolean selectACollection,
+                                                      boolean forView) throws InvalidRequestException
     {
         validateClusteringRestrictions(hasQueriableIndex);
 
-        checkFalse(clusteringColumnsRestrictions.isIN() && selectACollection,
-                   "Cannot restrict clustering columns by IN relations when a collection is selected by the query");
-        checkFalse(clusteringColumnsRestrictions.isContains() && !hasQueriableIndex,
-                   "Cannot restrict clustering columns by a CONTAINS relation without a secondary index");
+        checkFalse(!type.allowClusteringColumnSlices() && clusteringColumnsRestrictions.isSlice(),
+                   "Slice restrictions are not supported on the clustering columns in %s statements", type);
 
-        if (hasClusteringColumnsRestriction() && clusteringRestrictionsNeedFiltering())
+        if (!type.allowClusteringColumnSlices()
+               && (!cfm.isCompactTable() || (cfm.isCompactTable() && !hasClusteringColumnsRestriction())))
         {
-            if (hasQueriableIndex)
+            if (!selectsOnlyStaticColumns && hasUnrestrictedClusteringColumns())
+                throw invalidRequest("Some clustering keys are missing: %s",
+                                     Joiner.on(", ").join(getUnrestrictedClusteringColumns()));
+        }
+        else
+        {
+            checkFalse(clusteringColumnsRestrictions.isIN() && selectACollection,
+                       "Cannot restrict clustering columns by IN relations when a collection is selected by the query");
+            checkFalse(clusteringColumnsRestrictions.isContains() && !hasQueriableIndex,
+                       "Cannot restrict clustering columns by a CONTAINS relation without a secondary index");
+
+            if (hasClusteringColumnsRestriction() && clusteringRestrictionsNeedFiltering())
             {
-                usesSecondaryIndexing = true;
-                return;
-            }
-
-            List<ColumnDefinition> clusteringColumns = cfm.clusteringColumns();
-            List<ColumnDefinition> restrictedColumns = new LinkedList<>(clusteringColumnsRestrictions.getColumnDefs());
-
-            for (int i = 0, m = restrictedColumns.size(); i < m; i++)
-            {
-                ColumnDefinition clusteringColumn = clusteringColumns.get(i);
-                ColumnDefinition restrictedColumn = restrictedColumns.get(i);
-
-                if (!clusteringColumn.equals(restrictedColumn))
+                if (hasQueriableIndex || forView)
                 {
-                    throw invalidRequest(
-                              "PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
-                              restrictedColumn.name,
-                              clusteringColumn.name);
+                    usesSecondaryIndexing = true;
+                    return;
+                }
+
+                List<ColumnDefinition> clusteringColumns = cfm.clusteringColumns();
+                List<ColumnDefinition> restrictedColumns = new LinkedList<>(clusteringColumnsRestrictions.getColumnDefs());
+
+                for (int i = 0, m = restrictedColumns.size(); i < m; i++)
+                {
+                    ColumnDefinition clusteringColumn = clusteringColumns.get(i);
+                    ColumnDefinition restrictedColumn = restrictedColumns.get(i);
+
+                    if (!clusteringColumn.equals(restrictedColumn))
+                    {
+                        throw invalidRequest(
+                           "PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
+                            restrictedColumn.name,
+                            clusteringColumn.name);
+                    }
                 }
             }
         }
@@ -367,9 +541,8 @@
         if (hasQueriableIndex)
             return;
 
-        Iterator<Restriction> iter = ((PrimaryKeyRestrictionSet)clusteringColumnsRestrictions).iterator();
+        Iterator<Restriction> iter = ((PrimaryKeyRestrictionSet) clusteringColumnsRestrictions).iterator();
         Restriction previousRestriction = null;
-
         while (iter.hasNext())
         {
             Restriction restriction = iter.next();
@@ -394,17 +567,76 @@
         return ((PrimaryKeyRestrictionSet) clusteringColumnsRestrictions).needsFiltering();
     }
 
-    public List<IndexExpression> getIndexExpressions(SecondaryIndexManager indexManager,
-                                                     QueryOptions options) throws InvalidRequestException
+    /**
+     * Returns the clustering columns that are not restricted.
+     * @return the clustering columns that are not restricted.
+     */
+    private Collection<ColumnIdentifier> getUnrestrictedClusteringColumns()
     {
-        if (!usesSecondaryIndexing || indexRestrictions.isEmpty())
-            return Collections.emptyList();
+        List<ColumnDefinition> missingClusteringColumns = new ArrayList<>(cfm.clusteringColumns());
+        missingClusteringColumns.removeAll(new LinkedList<>(clusteringColumnsRestrictions.getColumnDefs()));
+        return ColumnDefinition.toIdentifiers(missingClusteringColumns);
+    }
 
-        List<IndexExpression> expressions = new ArrayList<>();
-        for (Restrictions restrictions : indexRestrictions)
-            restrictions.addIndexExpressionTo(expressions, indexManager, options);
+    /**
+     * Checks if some clustering columns are not restricted.
+     * @return <code>true</code> if some clustering columns are not restricted, <code>false</code> otherwise.
+     */
+    private boolean hasUnrestrictedClusteringColumns()
+    {
+        return cfm.clusteringColumns().size() != clusteringColumnsRestrictions.size();
+    }
 
-        return expressions;
+    private void processCustomIndexExpressions(List<CustomIndexExpression> expressions,
+                                               VariableSpecifications boundNames,
+                                               SecondaryIndexManager indexManager)
+    {
+        if (!MessagingService.instance().areAllNodesAtLeast30())
+            throw new InvalidRequestException("Please upgrade all nodes to at least 3.0 before using custom index expressions");
+
+        if (expressions.size() > 1)
+            throw new InvalidRequestException(IndexRestrictions.MULTIPLE_EXPRESSIONS);
+
+        CustomIndexExpression expression = expressions.get(0);
+
+        CFName cfName = expression.targetIndex.getCfName();
+        if (cfName.hasKeyspace()
+            && !expression.targetIndex.getKeyspace().equals(cfm.ksName))
+            throw IndexRestrictions.invalidIndex(expression.targetIndex, cfm);
+
+        if (cfName.getColumnFamily() != null && !cfName.getColumnFamily().equals(cfm.cfName))
+            throw IndexRestrictions.invalidIndex(expression.targetIndex, cfm);
+
+        if (!cfm.getIndexes().has(expression.targetIndex.getIdx()))
+            throw IndexRestrictions.indexNotFound(expression.targetIndex, cfm);
+
+        Index index = indexManager.getIndex(cfm.getIndexes().get(expression.targetIndex.getIdx()).get());
+
+        if (!index.getIndexMetadata().isCustom())
+            throw IndexRestrictions.nonCustomIndexInExpression(expression.targetIndex);
+
+        AbstractType<?> expressionType = index.customExpressionValueType();
+        if (expressionType == null)
+            throw IndexRestrictions.customExpressionNotSupported(expression.targetIndex);
+
+        expression.prepareValue(cfm, expressionType, boundNames);
+
+        indexRestrictions.add(expression);
+    }
+
+    public RowFilter getRowFilter(SecondaryIndexManager indexManager, QueryOptions options)
+    {
+        if (indexRestrictions.isEmpty())
+            return RowFilter.NONE;
+
+        RowFilter filter = RowFilter.create();
+        for (Restrictions restrictions : indexRestrictions.getRestrictions())
+            restrictions.addRowFilterTo(filter, indexManager, options);
+
+        for (CustomIndexExpression expression : indexRestrictions.getCustomIndexExpressions())
+            expression.addToRowFilter(filter, cfm, options);
+
+        return filter;
     }
 
     /**
@@ -412,11 +644,10 @@
      *
      * @param options the query options
      * @return the partition keys for which the data is requested.
-     * @throws InvalidRequestException if the partition keys cannot be retrieved
      */
-    public Collection<ByteBuffer> getPartitionKeys(final QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> getPartitionKeys(final QueryOptions options)
     {
-        return partitionKeyRestrictions.values(cfm, options);
+        return partitionKeyRestrictions.values(options);
     }
 
     /**
@@ -425,18 +656,16 @@
      * @param b the boundary type
      * @param options the query options
      * @return the specified bound of the partition key
-     * @throws InvalidRequestException if the boundary cannot be retrieved
      */
-    private ByteBuffer getPartitionKeyBound(Bound b, QueryOptions options) throws InvalidRequestException
+    private ByteBuffer getPartitionKeyBound(Bound b, QueryOptions options)
     {
         // Deal with unrestricted partition key components (special-casing is required to deal with 2i queries on the
-        // first
-        // component of a composite partition key).
-        if (hasPartitionKeyUnrestrictedComponents())
+        // first component of a composite partition key).
+        if (hasUnrestrictedPartitionKeyComponents())
             return ByteBufferUtil.EMPTY_BYTE_BUFFER;
 
         // We deal with IN queries for keys in other places, so we know buildBound will return only one result
-        return partitionKeyRestrictions.bounds(cfm, b, options).get(0);
+        return partitionKeyRestrictions.bounds(b, options).get(0);
     }
 
     /**
@@ -444,11 +673,10 @@
      *
      * @param options the query options
      * @return the partition key bounds
-     * @throws InvalidRequestException if the query is invalid
      */
-    public AbstractBounds<RowPosition> getPartitionKeyBounds(QueryOptions options) throws InvalidRequestException
+    public AbstractBounds<PartitionPosition> getPartitionKeyBounds(QueryOptions options)
     {
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = cfm.partitioner;
 
         if (partitionKeyRestrictions.isOnToken())
         {
@@ -458,14 +686,14 @@
         return getPartitionKeyBounds(p, options);
     }
 
-    private AbstractBounds<RowPosition> getPartitionKeyBounds(IPartitioner p,
-                                                              QueryOptions options) throws InvalidRequestException
+    private AbstractBounds<PartitionPosition> getPartitionKeyBounds(IPartitioner p,
+                                                                    QueryOptions options)
     {
         ByteBuffer startKeyBytes = getPartitionKeyBound(Bound.START, options);
         ByteBuffer finishKeyBytes = getPartitionKeyBound(Bound.END, options);
 
-        RowPosition startKey = RowPosition.ForKey.get(startKeyBytes, p);
-        RowPosition finishKey = RowPosition.ForKey.get(finishKeyBytes, p);
+        PartitionPosition startKey = PartitionPosition.ForKey.get(startKeyBytes, p);
+        PartitionPosition finishKey = PartitionPosition.ForKey.get(finishKeyBytes, p);
 
         if (startKey.compareTo(finishKey) > 0 && !finishKey.isMinimum())
             return null;
@@ -482,9 +710,8 @@
                 : new ExcludingBounds<>(startKey, finishKey);
     }
 
-    private AbstractBounds<RowPosition> getPartitionKeyBoundsForTokenRestrictions(IPartitioner p,
-                                                                                  QueryOptions options)
-                                                                                          throws InvalidRequestException
+    private AbstractBounds<PartitionPosition> getPartitionKeyBoundsForTokenRestrictions(IPartitioner p,
+                                                                                        QueryOptions options)
     {
         Token startToken = getTokenBound(Bound.START, options, p);
         Token endToken = getTokenBound(Bound.END, options, p);
@@ -507,34 +734,23 @@
                 && (cmp > 0 || (cmp == 0 && (!includeStart || !includeEnd))))
             return null;
 
-        RowPosition start = includeStart ? startToken.minKeyBound() : startToken.maxKeyBound();
-        RowPosition end = includeEnd ? endToken.maxKeyBound() : endToken.minKeyBound();
+        PartitionPosition start = includeStart ? startToken.minKeyBound() : startToken.maxKeyBound();
+        PartitionPosition end = includeEnd ? endToken.maxKeyBound() : endToken.minKeyBound();
 
         return new Range<>(start, end);
     }
 
-    private Token getTokenBound(Bound b, QueryOptions options, IPartitioner p) throws InvalidRequestException
+    private Token getTokenBound(Bound b, QueryOptions options, IPartitioner p)
     {
         if (!partitionKeyRestrictions.hasBound(b))
             return p.getMinimumToken();
 
-        ByteBuffer value = partitionKeyRestrictions.bounds(cfm, b, options).get(0);
+        ByteBuffer value = partitionKeyRestrictions.bounds(b, options).get(0);
         checkNotNull(value, "Invalid null token value");
         return p.getTokenFactory().fromByteArray(value);
     }
 
     /**
-     * Checks if the query does not contains any restriction on the clustering columns.
-     *
-     * @return <code>true</code> if the query does not contains any restriction on the clustering columns,
-     * <code>false</code> otherwise.
-     */
-    public boolean hasNoClusteringColumnsRestriction()
-    {
-        return clusteringColumnsRestrictions.isEmpty();
-    }
-
-    /**
      * Checks if the query has some restrictions on the clustering columns.
      *
      * @return <code>true</code> if the query has some restrictions on the clustering columns,
@@ -545,44 +761,21 @@
         return !clusteringColumnsRestrictions.isEmpty();
     }
 
-    // For non-composite slices, we don't support internally the difference between exclusive and
-    // inclusive bounds, so we deal with it manually.
-    public boolean isNonCompositeSliceWithExclusiveBounds()
-    {
-        return !cfm.comparator.isCompound()
-                && clusteringColumnsRestrictions.isSlice()
-                && (!clusteringColumnsRestrictions.isInclusive(Bound.START) || !clusteringColumnsRestrictions.isInclusive(Bound.END));
-    }
-
     /**
-     * Returns the requested clustering columns as <code>Composite</code>s.
+     * Returns the requested clustering columns.
      *
      * @param options the query options
-     * @return the requested clustering columns as <code>Composite</code>s
-     * @throws InvalidRequestException if the query is not valid
+     * @return the requested clustering columns
      */
-    public List<Composite> getClusteringColumnsAsComposites(QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Clustering> getClusteringColumns(QueryOptions options)
     {
-        return clusteringColumnsRestrictions.valuesAsComposites(cfm, options);
-    }
+        // If this is a names command and the table is a static compact one, then as far as CQL is concerned we have
+        // only a single row which internally correspond to the static parts. In which case we want to return an empty
+        // set (since that's what ClusteringIndexNamesFilter expects).
+        if (cfm.isStaticCompactTable())
+            return BTreeSet.empty(cfm.comparator);
 
-    /**
-     * Returns the bounds (start or end) of the clustering columns as <code>Composites</code>.
-     *
-     * @param b the bound type
-     * @param options the query options
-     * @return the bounds (start or end) of the clustering columns as <code>Composites</code>
-     * @throws InvalidRequestException if the request is not valid
-     */
-    public List<Composite> getClusteringColumnsBoundsAsComposites(Bound b,
-                                                                  QueryOptions options) throws InvalidRequestException
-    {
-        List<Composite> bounds = clusteringColumnsRestrictions.boundsAsComposites(cfm, b, options);
-        for (Composite c : bounds) {
-            if (!c.isEmpty())
-                QueryProcessor.validateComposite(c, cfm.comparator);
-        }
-        return bounds;
+        return clusteringColumnsRestrictions.valuesAsClustering(options);
     }
 
     /**
@@ -591,11 +784,10 @@
      * @param b the bound type
      * @param options the query options
      * @return the bounds (start or end) of the clustering columns
-     * @throws InvalidRequestException if the request is not valid
      */
-    public List<ByteBuffer> getClusteringColumnsBounds(Bound b, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Slice.Bound> getClusteringColumnsBounds(Bound b, QueryOptions options)
     {
-        return clusteringColumnsRestrictions.bounds(cfm, b, options);
+        return clusteringColumnsRestrictions.boundsAsClustering(b, options);
     }
 
     /**
@@ -617,15 +809,13 @@
      */
     public boolean isColumnRange()
     {
-        // Due to CASSANDRA-5762, we always do a slice for CQL3 tables (not dense, composite).
-        // Static CF (non dense but non composite) never entails a column slice however
-        if (!cfm.comparator.isDense())
-            return cfm.comparator.isCompound();
-
-        // Otherwise (i.e. for compact table where we don't have a row marker anyway and thus don't care about
-        // CASSANDRA-5762),
+        // For static compact tables we want to ignore the fake clustering column (note that if we weren't special casing,
+        // this would mean a 'SELECT *' on a static compact table would query whole partitions, even though we'll only return
+        // the static part as far as CQL is concerned. This is thus mostly an optimization to use the query-by-name path).
+        int numberOfClusteringColumns = cfm.isStaticCompactTable() ? 0 : cfm.clusteringColumns().size();
         // it is a range query if it has at least one the column alias for which no relation is defined or is not EQ.
-        return clusteringColumnsRestrictions.size() < cfm.clusteringColumns().size() || clusteringColumnsRestrictions.isSlice();
+        return clusteringColumnsRestrictions.size() < numberOfClusteringColumns
+            || (!clusteringColumnsRestrictions.isEQ() && !clusteringColumnsRestrictions.isIN());
     }
 
     /**
@@ -634,40 +824,60 @@
      */
     public boolean needFiltering()
     {
-        int numberOfRestrictedColumns = 0;
-        for (Restrictions restrictions : indexRestrictions)
-            numberOfRestrictedColumns += restrictions.size();
+        int numberOfRestrictions = indexRestrictions.getCustomIndexExpressions().size();
+        for (Restrictions restrictions : indexRestrictions.getRestrictions())
+            numberOfRestrictions += restrictions.size();
 
-        return numberOfRestrictedColumns > 1
-                || (numberOfRestrictedColumns == 0 && !clusteringColumnsRestrictions.isEmpty())
-                || (numberOfRestrictedColumns != 0
+        return numberOfRestrictions > 1
+                || (numberOfRestrictions == 0 && !clusteringColumnsRestrictions.isEmpty())
+                || (numberOfRestrictions != 0
                         && nonPrimaryKeyRestrictions.hasMultipleContains());
     }
 
-    private void validateSecondaryIndexSelections(boolean selectsOnlyStaticColumns) throws InvalidRequestException
+    private void validateSecondaryIndexSelections(boolean selectsOnlyStaticColumns)
     {
         checkFalse(keyIsInRelation(),
                    "Select on indexed columns and with IN clause for the PRIMARY KEY are not supported");
         // When the user only select static columns, the intent is that we don't query the whole partition but just
         // the static parts. But 1) we don't have an easy way to do that with 2i and 2) since we don't support index on
-        // static columns
-        // so far, 2i means that you've restricted a non static column, so the query is somewhat non-sensical.
-        checkFalse(selectsOnlyStaticColumns, "Queries using 2ndary indexes don't support selecting only static columns");
+        // static columns so far, 2i means that you've restricted a non static column, so the query is somewhat
+        // non-sensical.
+        // Note: an exception is if the index is a KEYS one. Which can happen if the user had a KEYS index on
+        // a compact table, and subsequently DROP COMPACT STORAGE on that table. After which, the KEYS index will still
+        // work, but queries will effectively be only on now-static columns and we should let this work.
+        checkFalse(selectsOnlyStaticColumns && !hasKeysIndex(cfm),
+                   "Queries using 2ndary indexes don't support selecting only static columns");
     }
 
-    public void reverse()
+    private boolean hasKeysIndex(CFMetaData cfm)
     {
-        clusteringColumnsRestrictions = new ReversedPrimaryKeyRestrictions(clusteringColumnsRestrictions);
+        return Iterables.any(cfm.getIndexes(), i -> i.kind == IndexMetadata.Kind.KEYS);
     }
 
     /**
-     * Checks if the query will never return any rows.
+     * Checks that all the primary key columns (partition key and clustering columns) are restricted by an equality
+     * relation ('=' or 'IN').
      *
-     * @param options the query options
-     * @return {@code true} if the query will never return any rows, {@false} otherwise
+     * @return <code>true</code> if all the primary key columns are restricted by an equality relation.
      */
-    public boolean isNotReturningAnyRows(QueryOptions options)
+    public boolean hasAllPKColumnsRestrictedByEqualities()
     {
-        return clusteringColumnsRestrictions.isNotReturningAnyRows(cfm, options);
+        return !isPartitionKeyRestrictionsOnToken()
+               && !hasUnrestrictedPartitionKeyComponents()
+               && (partitionKeyRestrictions.isEQ() || partitionKeyRestrictions.isIN())
+               && !hasUnrestrictedClusteringColumns()
+               && (clusteringColumnsRestrictions.isEQ() || clusteringColumnsRestrictions.isIN());
+    }
+
+
+    private SuperColumnCompatibility.SuperColumnRestrictions cached;
+    public SuperColumnCompatibility.SuperColumnRestrictions getSuperColumnRestrictions()
+    {
+        assert cfm.isSuper() && cfm.isDense();
+
+        if (cached == null)
+            cached = new SuperColumnCompatibility.SuperColumnRestrictions(Iterators.concat(((PrimaryKeyRestrictionSet) clusteringColumnsRestrictions).iterator(),
+                                                                                           nonPrimaryKeyRestrictions.iterator()));
+        return cached;
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TermSlice.java b/src/java/org/apache/cassandra/cql3/restrictions/TermSlice.java
index d082cc3..4b13877 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TermSlice.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TermSlice.java

@@ -17,17 +17,16 @@
  */
 package org.apache.cassandra.cql3.restrictions;
 
-import java.util.Collections;
+import java.util.List;
 
-import com.google.common.collect.Iterables;
-
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.cql3.Term;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.index.Index;
 
-final class TermSlice
+public final class TermSlice
 {
     /**
      * The slice boundaries.
@@ -152,33 +151,30 @@
     /**
      * Check if this <code>TermSlice</code> is supported by the specified index.
      *
-     * @param index the Secondary index
+     * @param index the secondary index
      * @return <code>true</code> this type of <code>TermSlice</code> is supported by the specified index,
      * <code>false</code> otherwise.
      */
-    public boolean isSupportedBy(SecondaryIndex index)
+    public boolean isSupportedBy(ColumnDefinition column, Index index)
     {
         boolean supported = false;
 
         if (hasBound(Bound.START))
-            supported |= isInclusive(Bound.START) ? index.supportsOperator(Operator.GTE)
-                    : index.supportsOperator(Operator.GT);
+            supported |= isInclusive(Bound.START) ? index.supportsExpression(column, Operator.GTE)
+                    : index.supportsExpression(column, Operator.GT);
         if (hasBound(Bound.END))
-            supported |= isInclusive(Bound.END) ? index.supportsOperator(Operator.LTE)
-                    : index.supportsOperator(Operator.LT);
+            supported |= isInclusive(Bound.END) ? index.supportsExpression(column, Operator.LTE)
+                    : index.supportsExpression(column, Operator.LT);
 
         return supported;
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        if (hasBound(Bound.START) && hasBound(Bound.END))
-            return Iterables.concat(bound(Bound.START).getFunctions(), bound(Bound.END).getFunctions());
-        else if (hasBound(Bound.START))
-            return bound(Bound.START).getFunctions();
-        else if (hasBound(Bound.END))
-            return bound(Bound.END).getFunctions();
-        else
-            return Collections.emptySet();
+        if (hasBound(Bound.START))
+            bound(Bound.START).addFunctionsTo(functions);
+
+        if (hasBound(Bound.END))
+            bound(Bound.END).addFunctionsTo(functions);
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
index 18444ec..3258b26 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java

@@ -18,22 +18,19 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
 
 import com.google.common.collect.BoundType;
 import com.google.common.collect.ImmutableRangeSet;
 import com.google.common.collect.Range;
 import com.google.common.collect.RangeSet;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.service.StorageService;
 
 import static org.apache.cassandra.cql3.statements.Bound.END;
 import static org.apache.cassandra.cql3.statements.Bound.START;
@@ -54,9 +51,9 @@
     private TokenRestriction tokenRestriction;
 
     /**
-     * The partitioner
+     * Partitioner to manage tokens, extracted from tokenRestriction metadata.
      */
-    private static final IPartitioner partitioner = StorageService.getPartitioner();
+    private final IPartitioner partitioner;
 
     @Override
     protected PrimaryKeyRestrictions getDelegate()
@@ -76,16 +73,17 @@
     {
         this.restrictions = restrictions;
         this.tokenRestriction = tokenRestriction;
+        this.partitioner = tokenRestriction.metadata.partitioner;
     }
 
     @Override
-    public List<ByteBuffer> values(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
     {
-        return filter(cfm, restrictions.values(cfm, options), options);
+        return filter(restrictions.values(options), options);
     }
 
     @Override
-    public List<Composite> valuesAsComposites(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Clustering> valuesAsClustering(QueryOptions options) throws InvalidRequestException
     {
         throw new UnsupportedOperationException();
     }
@@ -112,30 +110,29 @@
     }
 
     @Override
-    public List<ByteBuffer> bounds(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
+    public List<ByteBuffer> bounds(Bound bound, QueryOptions options) throws InvalidRequestException
     {
-        return tokenRestriction.bounds(cfm, bound, options);
+        return tokenRestriction.bounds(bound, options);
     }
 
     @Override
-    public List<Composite> boundsAsComposites(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Slice.Bound> boundsAsClustering(Bound bound, QueryOptions options) throws InvalidRequestException
     {
-        return tokenRestriction.boundsAsComposites(cfm, bound, options);
+        return tokenRestriction.boundsAsClustering(bound, options);
     }
 
     /**
      * Filter the values returned by the restriction.
      *
-     * @param cfm the table metadata
      * @param values the values returned by the decorated restriction
      * @param options the query options
      * @return the values matching the token restriction
      * @throws InvalidRequestException if the request is invalid
      */
-    private List<ByteBuffer> filter(CFMetaData cfm, List<ByteBuffer> values, QueryOptions options) throws InvalidRequestException
+    private List<ByteBuffer> filter(List<ByteBuffer> values, QueryOptions options) throws InvalidRequestException
     {
-        RangeSet<Token> rangeSet = tokenRestriction.isSlice() ? toRangeSet(cfm, tokenRestriction, options)
-                                                              : toRangeSet(tokenRestriction.values(cfm, options));
+        RangeSet<Token> rangeSet = tokenRestriction.isSlice() ? toRangeSet(tokenRestriction, options)
+                                                              : toRangeSet(tokenRestriction.values(options));
 
         return filterWithRangeSet(rangeSet, values);
     }
@@ -147,7 +144,7 @@
      * @param values the restricted values
      * @return the values for which the tokens are not included within the specified range.
      */
-    private static List<ByteBuffer> filterWithRangeSet(RangeSet<Token> tokens, List<ByteBuffer> values)
+    private List<ByteBuffer> filterWithRangeSet(RangeSet<Token> tokens, List<ByteBuffer> values)
     {
         List<ByteBuffer> remaining = new ArrayList<>();
 
@@ -169,7 +166,7 @@
      * @param buffers the token restriction values
      * @return the range set corresponding to the specified list
      */
-    private static RangeSet<Token> toRangeSet(List<ByteBuffer> buffers)
+    private RangeSet<Token> toRangeSet(List<ByteBuffer> buffers)
     {
         ImmutableRangeSet.Builder<Token> builder = ImmutableRangeSet.builder();
 
@@ -182,24 +179,23 @@
     /**
      * Converts the specified slice into a range set.
      *
-     * @param cfm the table metadata
      * @param slice the slice to convert
      * @param options the query option
      * @return the range set corresponding to the specified slice
      * @throws InvalidRequestException if the request is invalid
      */
-    private static RangeSet<Token> toRangeSet(CFMetaData cfm, TokenRestriction slice, QueryOptions options) throws InvalidRequestException
+    private RangeSet<Token> toRangeSet(TokenRestriction slice, QueryOptions options) throws InvalidRequestException
     {
         if (slice.hasBound(START))
         {
-            Token start = deserializeToken(slice.bounds(cfm, START, options).get(0));
+            Token start = deserializeToken(slice.bounds(START, options).get(0));
 
             BoundType startBoundType = toBoundType(slice.isInclusive(START));
 
             if (slice.hasBound(END))
             {
                 BoundType endBoundType = toBoundType(slice.isInclusive(END));
-                Token end = deserializeToken(slice.bounds(cfm, END, options).get(0));
+                Token end = deserializeToken(slice.bounds(END, options).get(0));
 
                 if (start.equals(end) && (BoundType.OPEN == startBoundType || BoundType.OPEN == endBoundType))
                     return ImmutableRangeSet.of();
@@ -218,22 +214,17 @@
             return ImmutableRangeSet.of(Range.downTo(start,
                                                      startBoundType));
         }
-        Token end = deserializeToken(slice.bounds(cfm, END, options).get(0));
+        Token end = deserializeToken(slice.bounds(END, options).get(0));
         return ImmutableRangeSet.of(Range.upTo(end, toBoundType(slice.isInclusive(END))));
     }
 
-    public boolean isNotReturningAnyRows(CFMetaData cfm, QueryOptions options)
-    {
-        return false;
-    }
-
     /**
      * Deserializes the token corresponding to the specified buffer.
      *
      * @param buffer the buffer
      * @return the token corresponding to the specified buffer
      */
-    private static Token deserializeToken(ByteBuffer buffer)
+    private Token deserializeToken(ByteBuffer buffer)
     {
         return partitioner.getTokenFactory().fromByteArray(buffer);
     }

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
index 97c55c4..14d2cb7 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java

@@ -18,8 +18,7 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
 
 import com.google.common.base.Joiner;
 
@@ -29,12 +28,12 @@
 import org.apache.cassandra.cql3.Term;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.MultiCBuilder;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -48,16 +47,18 @@
      */
     protected final List<ColumnDefinition> columnDefs;
 
+    final CFMetaData metadata;
+
     /**
      * Creates a new <code>TokenRestriction</code> that apply to the specified columns.
      *
-     * @param ctype the composite type
      * @param columnDefs the definition of the columns to which apply the token restriction
      */
-    public TokenRestriction(CType ctype, List<ColumnDefinition> columnDefs)
+    public TokenRestriction(CFMetaData metadata, List<ColumnDefinition> columnDefs)
     {
-        super(ctype);
+        super(metadata.getKeyValidatorAsClusteringComparator());
         this.columnDefs = columnDefs;
+        this.metadata = metadata;
     }
 
     @Override
@@ -91,27 +92,25 @@
     }
 
     @Override
-    public final void addIndexExpressionTo(List<IndexExpression> expressions,
-                                     SecondaryIndexManager indexManager,
-                                     QueryOptions options)
+    public void addRowFilterTo(RowFilter filter, SecondaryIndexManager indexManager, QueryOptions options)
     {
         throw new UnsupportedOperationException("Index expression cannot be created for token restriction");
     }
 
     @Override
-    public CompositesBuilder appendTo(CFMetaData cfm, CompositesBuilder builder, QueryOptions options)
+    public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
     {
         throw new UnsupportedOperationException();
     }
 
     @Override
-    public List<Composite> valuesAsComposites(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Clustering> valuesAsClustering(QueryOptions options) throws InvalidRequestException
     {
         throw new UnsupportedOperationException();
     }
 
     @Override
-    public List<Composite> boundsAsComposites(CFMetaData cfm, Bound bound, QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Slice.Bound> boundsAsClustering(Bound bound, QueryOptions options) throws InvalidRequestException
     {
         throw new UnsupportedOperationException();
     }
@@ -153,16 +152,16 @@
         if (restriction instanceof PrimaryKeyRestrictions)
             return (PrimaryKeyRestrictions) restriction;
 
-        return new PrimaryKeyRestrictionSet(ctype).mergeWith(restriction);
+        return new PrimaryKeyRestrictionSet(comparator, true).mergeWith(restriction);
     }
 
-    public static final class EQ extends TokenRestriction
+    public static final class EQRestriction extends TokenRestriction
     {
         private final Term value;
 
-        public EQ(CType ctype, List<ColumnDefinition> columnDefs, Term value)
+        public EQRestriction(CFMetaData cfm, List<ColumnDefinition> columnDefs, Term value)
         {
-            super(ctype, columnDefs);
+            super(cfm, columnDefs);
             this.value = value;
         }
 
@@ -173,9 +172,9 @@
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return value.getFunctions();
+            value.addFunctionsTo(functions);
         }
 
         @Override
@@ -186,19 +185,19 @@
         }
 
         @Override
-        public List<ByteBuffer> values(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
             return Collections.singletonList(value.bindAndGet(options));
         }
     }
 
-    public static class Slice extends TokenRestriction
+    public static class SliceRestriction extends TokenRestriction
     {
         private final TermSlice slice;
 
-        public Slice(CType ctype, List<ColumnDefinition> columnDefs, Bound bound, boolean inclusive, Term term)
+        public SliceRestriction(CFMetaData cfm, List<ColumnDefinition> columnDefs, Bound bound, boolean inclusive, Term term)
         {
-            super(ctype, columnDefs);
+            super(cfm, columnDefs);
             slice = TermSlice.newInstance(bound, inclusive, term);
         }
 
@@ -209,7 +208,7 @@
         }
 
         @Override
-        public List<ByteBuffer> values(CFMetaData cfm, QueryOptions options) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
         {
             throw new UnsupportedOperationException();
         }
@@ -221,15 +220,15 @@
         }
 
         @Override
-        public List<ByteBuffer> bounds(CFMetaData cfm, Bound b, QueryOptions options) throws InvalidRequestException
+        public List<ByteBuffer> bounds(Bound b, QueryOptions options) throws InvalidRequestException
         {
             return Collections.singletonList(slice.bound(b).bindAndGet(options));
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return slice.getFunctions();
+            slice.addFunctionsTo(functions);
         }
 
         @Override
@@ -246,7 +245,7 @@
                 throw invalidRequest("Columns \"%s\" cannot be restricted by both an equality and an inequality relation",
                                      getColumnNamesAsString());
 
-            TokenRestriction.Slice otherSlice = (TokenRestriction.Slice) otherRestriction;
+            TokenRestriction.SliceRestriction otherSlice = (TokenRestriction.SliceRestriction) otherRestriction;
 
             if (hasBound(Bound.START) && otherSlice.hasBound(Bound.START))
                 throw invalidRequest("More than one restriction was found for the start bound on %s",
@@ -256,7 +255,7 @@
                 throw invalidRequest("More than one restriction was found for the end bound on %s",
                                      getColumnNamesAsString());
 
-            return new Slice(ctype, columnDefs,  slice.merge(otherSlice.slice));
+            return new SliceRestriction(metadata, columnDefs,  slice.merge(otherSlice.slice));
         }
 
         @Override
@@ -264,10 +263,9 @@
         {
             return String.format("SLICE%s", slice);
         }
-
-        private Slice(CType ctype, List<ColumnDefinition> columnDefs, TermSlice slice)
+        private SliceRestriction(CFMetaData cfm, List<ColumnDefinition> columnDefs, TermSlice slice)
         {
-            super(ctype, columnDefs);
+            super(cfm, columnDefs);
             this.slice = slice;
         }
     }

diff --git a/src/java/org/apache/cassandra/cql3/selection/AbstractFunctionSelector.java b/src/java/org/apache/cassandra/cql3/selection/AbstractFunctionSelector.java
index bf1234f..c48b93c 100644
--- a/src/java/org/apache/cassandra/cql3/selection/AbstractFunctionSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/AbstractFunctionSelector.java

@@ -21,8 +21,6 @@
 import java.util.Arrays;
 import java.util.List;
 
-import com.google.common.collect.Iterables;
-
 import org.apache.commons.lang3.text.StrBuilder;
 
 import org.apache.cassandra.cql3.functions.AggregateFcts;
@@ -85,9 +83,10 @@
                     mapping.addMapping(resultsColumn, tmpMapping.getMappings().values());
             }
 
-            public Iterable<Function> getFunctions()
+            public void addFunctionsTo(List<Function> functions)
             {
-                return Iterables.concat(fun.getFunctions(), factories.getFunctions());
+                fun.addFunctionsTo(functions);
+                factories.addFunctionsTo(functions);
             }
 
             public Selector newInstance() throws InvalidRequestException

diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
index ee134ee..653a86a 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java

@@ -77,7 +77,7 @@
         public Selector.Factory newSelectorFactory(CFMetaData cfm,
                                                    List<ColumnDefinition> defs) throws InvalidRequestException
         {
-            ColumnDefinition def = cfm.getColumnDefinition(id);
+            ColumnDefinition def = cfm.getColumnDefinitionForCQL(id);
             if (def == null)
                 throw new InvalidRequestException(String.format("Undefined name %s in selection clause", id));
             if (def.isPrimaryKeyColumn())
@@ -148,7 +148,7 @@
             if (functionName.equalsNativeFunction(ToJsonFct.NAME))
                 fun = ToJsonFct.getInstance(factories.getReturnTypes());
             else
-                fun = Functions.get(cfm.ksName, functionName, factories.newInstances(), cfm.ksName, cfm.cfName, null);
+                fun = FunctionResolver.get(cfm.ksName, functionName, factories.newInstances(), cfm.ksName, cfm.cfName, null);
 
             if (fun == null)
                 throw new InvalidRequestException(String.format("Unknown function '%s'", functionName));

diff --git a/src/java/org/apache/cassandra/cql3/selection/Selection.java b/src/java/org/apache/cassandra/cql3/selection/Selection.java
index 72aec27..510e11c 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selection.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selection.java

@@ -30,9 +30,7 @@
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.CounterCell;
-import org.apache.cassandra.db.ExpiringCell;
+import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -86,7 +84,7 @@
      */
     public boolean containsStaticColumns()
     {
-        if (!cfm.hasStaticColumns())
+        if (cfm.isStaticCompactTable() || !cfm.hasStaticColumns())
             return false;
 
         if (isWildcard())
@@ -123,9 +121,6 @@
      */
     public boolean containsACollection()
     {
-        if (!cfm.comparator.hasCollections())
-            return false;
-
         for (ColumnDefinition def : getColumns())
             if (def.type.isCollection() && def.type.isMultiCell())
                 return true;
@@ -199,9 +194,8 @@
         return columns.size() - 1;
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        return Collections.emptySet();
     }
 
     private static boolean processesSelection(List<RawSelector> rawSelectors)
@@ -284,9 +278,9 @@
         return columnMapping;
     }
 
-    public ResultSetBuilder resultSetBuilder(long now, boolean isJson) throws InvalidRequestException
+    public ResultSetBuilder resultSetBuilder(boolean isJons) throws InvalidRequestException
     {
-        return new ResultSetBuilder(now, isJson);
+        return new ResultSetBuilder(isJons);
     }
 
     public abstract boolean isAggregate();
@@ -324,18 +318,22 @@
         List<ByteBuffer> current;
         final long[] timestamps;
         final int[] ttls;
-        final long now;
 
         private final boolean isJson;
 
-        private ResultSetBuilder(long now, boolean isJson) throws InvalidRequestException
+        private ResultSetBuilder(boolean isJson) throws InvalidRequestException
         {
             this.resultSet = new ResultSet(getResultMetadata(isJson).copy(), new ArrayList<List<ByteBuffer>>());
             this.selectors = newSelectors();
             this.timestamps = collectTimestamps ? new long[columns.size()] : null;
             this.ttls = collectTTLs ? new int[columns.size()] : null;
-            this.now = now;
             this.isJson = isJson;
+
+            // We use MIN_VALUE to indicate no timestamp and -1 for no ttl
+            if (timestamps != null)
+                Arrays.fill(timestamps, Long.MIN_VALUE);
+            if (ttls != null)
+                Arrays.fill(ttls, -1);
         }
 
         public void add(ByteBuffer v)
@@ -343,25 +341,37 @@
             current.add(v);
         }
 
-        public void add(Cell c)
+        public void add(Cell c, int nowInSec)
         {
-            current.add(isDead(c) ? null : value(c));
+            if (c == null)
+            {
+                current.add(null);
+                return;
+            }
+
+            current.add(value(c));
+
             if (timestamps != null)
-            {
-                timestamps[current.size() - 1] = isDead(c) ? Long.MIN_VALUE : c.timestamp();
-            }
+                timestamps[current.size() - 1] = c.timestamp();
+
             if (ttls != null)
-            {
-                int ttl = -1;
-                if (!isDead(c) && c instanceof ExpiringCell)
-                    ttl = c.getLocalDeletionTime() - (int) (now / 1000);
-                ttls[current.size() - 1] = ttl;
-            }
+                ttls[current.size() - 1] = remainingTTL(c, nowInSec);
         }
 
-        private boolean isDead(Cell c)
+        private int remainingTTL(Cell c, int nowInSec)
         {
-            return c == null || !c.isLive(now);
+            if (!c.isExpiring())
+                return -1;
+
+            int remaining = c.localDeletionTime() - nowInSec;
+            return remaining >= 0 ? remaining : -1;
+        }
+
+        private ByteBuffer value(Cell c)
+        {
+            return c.isCounterCell()
+                 ? ByteBufferUtil.bytes(CounterContext.instance().total(c.value()))
+                 : c.value();
         }
 
         public void newRow(int protocolVersion) throws InvalidRequestException
@@ -376,6 +386,12 @@
                 }
             }
             current = new ArrayList<>(columns.size());
+
+            // Timestamps and TTLs are arrays per row, we must null them out between row
+            if (timestamps != null)
+                Arrays.fill(timestamps, Long.MIN_VALUE);
+            if (ttls != null)
+                Arrays.fill(ttls, -1);
         }
 
         public ResultSet build(int protocolVersion) throws InvalidRequestException
@@ -438,13 +454,6 @@
             jsonRow.add(UTF8Type.instance.getSerializer().serialize(sb.toString()));
             return jsonRow;
         }
-
-        private ByteBuffer value(Cell c)
-        {
-            return (c instanceof CounterCell)
-                ? ByteBufferUtil.bytes(CounterContext.instance().total(c.value()))
-                : c.value();
-        }
     }
 
     private static interface Selectors
@@ -547,9 +556,9 @@
         }
 
         @Override
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return factories.getFunctions();
+            factories.addFunctionsTo(functions);
         }
 
         @Override

diff --git a/src/java/org/apache/cassandra/cql3/selection/SelectionColumnMapping.java b/src/java/org/apache/cassandra/cql3/selection/SelectionColumnMapping.java
index 8636f19..4cfdefb 100644
--- a/src/java/org/apache/cassandra/cql3/selection/SelectionColumnMapping.java
+++ b/src/java/org/apache/cassandra/cql3/selection/SelectionColumnMapping.java

@@ -21,6 +21,7 @@
 package org.apache.cassandra.cql3.selection;
 
 import java.util.*;
+import java.util.stream.Collectors;
 
 import com.google.common.base.Function;
 import com.google.common.base.Joiner;
@@ -109,44 +110,20 @@
 
     public String toString()
     {
-        final Function<ColumnDefinition, String> getDefName = new Function<ColumnDefinition, String>()
-        {
-            public String apply(ColumnDefinition def)
-            {
-                return def.name.toString();
-            }
-        };
-        Function<Map.Entry<ColumnSpecification, Collection<ColumnDefinition>>, String> mappingEntryToString =
-        new Function<Map.Entry<ColumnSpecification, Collection<ColumnDefinition>>, String>(){
-            public String apply(Map.Entry<ColumnSpecification, Collection<ColumnDefinition>> entry)
-            {
-                StringBuilder builder = new StringBuilder();
-                builder.append(entry.getKey().name.toString());
-                builder.append(":[");
-                builder.append(Joiner.on(',').join(Iterables.transform(entry.getValue(), getDefName)));
-                builder.append("]");
-                return builder.toString();
-            }
-        };
-
-        Function<ColumnSpecification, String> colSpecToString = new Function<ColumnSpecification, String>()
-        {
-            public String apply(ColumnSpecification columnSpecification)
-            {
-                return columnSpecification.name.toString();
-            }
-        };
-
-        StringBuilder builder = new StringBuilder();
-        builder.append("{ Columns:[");
-        builder.append(Joiner.on(",")
-                             .join(Iterables.transform(columnSpecifications, colSpecToString)));
-        builder.append("], Mappings:[");
-        builder.append(Joiner.on(", ")
-                             .join(Iterables.transform(columnMappings.asMap().entrySet(),
-                                                       mappingEntryToString)));
-        builder.append("] }");
-        return builder.toString();
+        return columnMappings.asMap()
+                             .entrySet()
+                             .stream()
+                             .map(entry ->
+                                  entry.getValue()
+                                       .stream()
+                                       .map(colDef -> colDef.name.toString())
+                                       .collect(Collectors.joining(", ", entry.getKey().name.toString() + ":[", "]")))
+                             .collect(Collectors.joining(", ",
+                                                         columnSpecifications.stream()
+                                                                             .map(colSpec -> colSpec.name.toString())
+                                                                             .collect(Collectors.joining(", ",
+                                                                                                         "{ Columns:[",
+                                                                                                         "], Mappings:{")),
+                                                         "} }"));
     }
-
 }

diff --git a/src/java/org/apache/cassandra/cql3/selection/Selector.java b/src/java/org/apache/cassandra/cql3/selection/Selector.java
index 7b818b5..7249d22 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selector.java

@@ -18,7 +18,7 @@
 package org.apache.cassandra.cql3.selection;
 
 import java.nio.ByteBuffer;
-import java.util.Collections;
+import java.util.List;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cql3.AssignmentTestable;
@@ -43,9 +43,8 @@
      */
     public static abstract class Factory
     {
-        public Iterable<Function> getFunctions()
+        public void addFunctionsTo(List<Function> functions)
         {
-            return Collections.emptySet();
         }
 
         /**
@@ -59,7 +58,7 @@
         {
             return new ColumnSpecification(cfm.ksName,
                                            cfm.cfName,
-                                           new ColumnIdentifier(getColumnName(), true),
+                                           ColumnIdentifier.getInterned(getColumnName(), true),
                                            getReturnType());
         }
 

diff --git a/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java b/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java
index fbbfbb5..97a1198 100644
--- a/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java
+++ b/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java

@@ -19,7 +19,6 @@
 
 import java.util.*;
 
-import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 
 import org.apache.cassandra.config.CFMetaData;
@@ -89,13 +88,9 @@
         }
     }
 
-    public Iterable<Function> getFunctions()
+    public void addFunctionsTo(List<Function> functions)
     {
-        Iterable<Function> functions = Collections.emptySet();
-        for (Factory factory : factories)
-            if (factory != null)
-                functions = Iterables.concat(functions, factory.getFunctions());
-        return functions;
+        factories.forEach(p -> p.addFunctionsTo(functions));
     }
 
     /**

diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java
index 50c3f00..3ca4d72 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterKeyspaceStatement.java

@@ -18,23 +18,22 @@
 package org.apache.cassandra.cql3.statements;
 
 import org.apache.cassandra.auth.Permission;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.locator.AbstractReplicationStrategy;
+import org.apache.cassandra.locator.LocalStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
 
 public class AlterKeyspaceStatement extends SchemaAlteringStatement
 {
     private final String name;
-    private final KSPropDefs attrs;
+    private final KeyspaceAttributes attrs;
 
-    public AlterKeyspaceStatement(String name, KSPropDefs attrs)
+    public AlterKeyspaceStatement(String name, KeyspaceAttributes attrs)
     {
         super();
         this.name = name;
@@ -54,44 +53,38 @@
 
     public void validate(ClientState state) throws RequestValidationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(name);
         if (ksm == null)
             throw new InvalidRequestException("Unknown keyspace " + name);
-        if (ksm.name.equalsIgnoreCase(SystemKeyspace.NAME))
+        if (Schema.isLocalSystemKeyspace(ksm.name))
             throw new InvalidRequestException("Cannot alter system keyspace");
 
         attrs.validate();
 
         if (attrs.getReplicationStrategyClass() == null && !attrs.getReplicationOptions().isEmpty())
-        {
             throw new ConfigurationException("Missing replication strategy class");
-        }
-        else if (attrs.getReplicationStrategyClass() != null)
+
+        if (attrs.getReplicationStrategyClass() != null)
         {
             // The strategy is validated through KSMetaData.validate() in announceKeyspaceUpdate below.
             // However, for backward compatibility with thrift, this doesn't validate unexpected options yet,
             // so doing proper validation here.
-            AbstractReplicationStrategy.validateReplicationStrategy(name,
-                                                                    AbstractReplicationStrategy.getClass(attrs.getReplicationStrategyClass()),
-                                                                    StorageService.instance.getTokenMetadata(),
-                                                                    DatabaseDescriptor.getEndpointSnitch(),
-                                                                    attrs.getReplicationOptions());
+            KeyspaceParams params = attrs.asAlteredKeyspaceParams(ksm.params);
+            params.validate(name);
+            if (params.replication.klass.equals(LocalStrategy.class))
+                throw new ConfigurationException("Unable to use given strategy class: LocalStrategy is reserved for internal use.");
         }
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name);
+        KeyspaceMetadata oldKsm = Schema.instance.getKSMetaData(name);
         // In the (very) unlikely case the keyspace was dropped since validate()
-        if (ksm == null)
+        if (oldKsm == null)
             throw new InvalidRequestException("Unknown keyspace " + name);
 
-        MigrationManager.announceKeyspaceUpdate(attrs.asKSMetadataUpdate(ksm), isLocalOnly);
-        return true;
-    }
-
-    public Event.SchemaChange changeEvent()
-    {
+        KeyspaceMetadata newKsm = oldKsm.withSwapped(attrs.asAlteredKeyspaceParams(oldKsm.params));
+        MigrationManager.announceKeyspaceUpdate(newKsm, isLocalOnly);
         return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, keyspace());
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java
index f4a7b39..193c24c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterTableStatement.java

@@ -17,51 +17,65 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Iterables;
 
 import org.apache.cassandra.auth.Permission;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.CFName;
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.EmptyType;
+import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Indexes;
+import org.apache.cassandra.schema.TableParams;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
 
 import static org.apache.cassandra.thrift.ThriftValidation.validateColumnFamily;
 
 public class AlterTableStatement extends SchemaAlteringStatement
 {
-    public static enum Type
+    public enum Type
     {
-        ADD, ALTER, DROP, OPTS, RENAME
+        ADD, ALTER, DROP, DROP_COMPACT_STORAGE, OPTS, RENAME
     }
 
     public final Type oType;
     public final CQL3Type.Raw validator;
     public final ColumnIdentifier.Raw rawColumnName;
-    private final CFPropDefs cfProps;
-    private final Map<ColumnIdentifier.Raw, ColumnIdentifier.Raw> renames;
+    private final TableAttributes attrs;
+    private final Map<ColumnIdentifier.Raw, ColumnIdentifier> renames;
     private final boolean isStatic; // Only for ALTER ADD
+    private final Long deleteTimestamp;
 
     public AlterTableStatement(CFName name,
                                Type type,
                                ColumnIdentifier.Raw columnName,
                                CQL3Type.Raw validator,
-                               CFPropDefs cfProps,
-                               Map<ColumnIdentifier.Raw, ColumnIdentifier.Raw> renames,
-                               boolean isStatic)
+                               TableAttributes attrs,
+                               Map<ColumnIdentifier.Raw, ColumnIdentifier> renames,
+                               boolean isStatic,
+                               Long deleteTimestamp)
     {
         super(name);
         this.oType = type;
         this.rawColumnName = columnName;
         this.validator = validator; // used only for ADD/ALTER commands
-        this.cfProps = cfProps;
+        this.attrs = attrs;
         this.renames = renames;
         this.isStatic = isStatic;
+        this.deleteTimestamp = deleteTimestamp;
     }
 
     public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
@@ -74,32 +88,59 @@
         // validated in announceMigration()
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
         CFMetaData meta = validateColumnFamily(keyspace(), columnFamily());
-        CFMetaData cfm = meta.copy();
+        if (meta.isView())
+            throw new InvalidRequestException("Cannot use ALTER TABLE on Materialized View");
+
+        CFMetaData cfm;
 
         CQL3Type validator = this.validator == null ? null : this.validator.prepare(keyspace());
         ColumnIdentifier columnName = null;
         ColumnDefinition def = null;
         if (rawColumnName != null)
         {
-            columnName = rawColumnName.prepare(cfm);
-            def = cfm.getColumnDefinition(columnName);
+            columnName = rawColumnName.prepare(meta);
+            def = meta.getColumnDefinition(columnName);
         }
-        if (cfProps.getId() != null)
-            throw new ConfigurationException("Cannot alter table id.");
+
+        List<ViewDefinition> viewUpdates = null;
+        Iterable<ViewDefinition> views = View.findAll(keyspace(), columnFamily());
 
         switch (oType)
         {
+            case ALTER:
+                // We do not support altering of types and only allow this to for people who have already one
+                // through the upgrade of 2.x CQL-created SSTables with Thrift writes, affected by CASSANDRA-15778.
+                if (meta.isDense()
+                    && meta.compactValueColumn().equals(def)
+                    && meta.compactValueColumn().type instanceof EmptyType
+                    && validator != null)
+                {
+                    if (validator.getType() instanceof BytesType)
+                    {
+                        cfm = meta.copyWithNewCompactValueType(validator.getType());
+                        break;
+                    }
+
+                    throw new InvalidRequestException(String.format("Compact value type can only be changed to BytesType, but %s was given.",
+                                                                    validator.getType()));
+                }
+                else
+                {
+                    throw new InvalidRequestException("Altering of types is not allowed");
+                }
             case ADD:
                 assert columnName != null;
-                if (cfm.comparator.isDense())
+                if (meta.isDense())
                     throw new InvalidRequestException("Cannot add new column to a COMPACT STORAGE table");
 
+                cfm = meta.copy();
+
                 if (isStatic)
                 {
-                    if (!cfm.comparator.isCompound())
+                    if (!cfm.isCompound())
                         throw new InvalidRequestException("Static columns are not allowed in COMPACT STORAGE tables");
                     if (cfm.clusteringColumns().isEmpty())
                         throw new InvalidRequestException("Static columns are only useful (and thus allowed) if the table has at least one clustering column");
@@ -110,146 +151,94 @@
                     switch (def.kind)
                     {
                         case PARTITION_KEY:
-                        case CLUSTERING_COLUMN:
+                        case CLUSTERING:
                             throw new InvalidRequestException(String.format("Invalid column name %s because it conflicts with a PRIMARY KEY part", columnName));
                         default:
                             throw new InvalidRequestException(String.format("Invalid column name %s because it conflicts with an existing column", columnName));
                     }
                 }
 
-                // Cannot re-add a dropped counter column. See #7831.
-                if (meta.isCounter() && meta.getDroppedColumns().containsKey(columnName))
-                    throw new InvalidRequestException(String.format("Cannot re-add previously dropped counter column %s", columnName));
-
                 AbstractType<?> type = validator.getType();
                 if (type.isCollection() && type.isMultiCell())
                 {
-                    if (!cfm.comparator.supportCollections())
-                        throw new InvalidRequestException("Cannot use non-frozen collections with a non-composite PRIMARY KEY");
+                    if (!cfm.isCompound())
+                        throw new InvalidRequestException("Cannot use non-frozen collections in COMPACT STORAGE tables");
                     if (cfm.isSuper())
                         throw new InvalidRequestException("Cannot use non-frozen collections with super column families");
+                }
 
-                    // If there used to be a collection column with the same name (that has been dropped), it will
-                    // still be appear in the ColumnToCollectionType because or reasons explained on #6276. The same
-                    // reason mean that we can't allow adding a new collection with that name (see the ticket for details).
-                    if (cfm.comparator.hasCollections())
+                ColumnDefinition toAdd = isStatic
+                                       ? ColumnDefinition.staticDef(cfm, columnName.bytes, type)
+                                       : ColumnDefinition.regularDef(cfm, columnName.bytes, type);
+
+                CFMetaData.DroppedColumn droppedColumn = meta.getDroppedColumns().get(columnName.bytes);
+                if (null != droppedColumn)
+                {
+                    if (droppedColumn.kind != toAdd.kind)
                     {
-                        CollectionType previous = cfm.comparator.collectionType() == null ? null : cfm.comparator.collectionType().defined.get(columnName.bytes);
-                        if (previous != null && !type.isCompatibleWith(previous))
-                            throw new InvalidRequestException(String.format("Cannot add a collection with the name %s " +
-                                        "because a collection with the same name and a different type has already been used in the past", columnName));
+                        String message =
+                            String.format("Cannot re-add previously dropped column '%s' of kind %s, incompatible with previous kind %s",
+                                          columnName,
+                                          toAdd.kind,
+                                          droppedColumn.kind == null ? "UNKNOWN" : droppedColumn.kind);
+                        throw new InvalidRequestException(message);
                     }
 
-                    cfm.comparator = cfm.comparator.addOrUpdateCollection(columnName, (CollectionType)type);
+                    // After #8099, not safe to re-add columns of incompatible types - until *maybe* deser logic with dropped
+                    // columns is pushed deeper down the line. The latter would still be problematic in cases of schema races.
+                    if (!type.isValueCompatibleWith(droppedColumn.type))
+                    {
+                        String message =
+                            String.format("Cannot re-add previously dropped column '%s' of type %s, incompatible with previous type %s",
+                                          columnName,
+                                          type.asCQL3Type(),
+                                          droppedColumn.type.asCQL3Type());
+                        throw new InvalidRequestException(message);
+                    }
+
+                    // Cannot re-add a dropped counter column. See #7831.
+                    if (meta.isCounter())
+                        throw new InvalidRequestException(String.format("Cannot re-add previously dropped counter column %s", columnName));
                 }
 
-                Integer componentIndex = cfm.comparator.isCompound() ? cfm.comparator.clusteringPrefixSize() : null;
-                cfm.addColumnDefinition(isStatic
-                                        ? ColumnDefinition.staticDef(cfm, columnName.bytes, type, componentIndex)
-                                        : ColumnDefinition.regularDef(cfm, columnName.bytes, type, componentIndex));
-                break;
+                cfm.addColumnDefinition(toAdd);
 
-            case ALTER:
-                assert columnName != null;
-                if (def == null)
-                    throw new InvalidRequestException(String.format("Column %s was not found in table %s", columnName, columnFamily()));
-
-                AbstractType<?> validatorType = validator.getType();
-                switch (def.kind)
+                // Adding a column to a table which has an include all view requires the column to be added to the view as well
+                if (!isStatic)
                 {
-                    case PARTITION_KEY:
-                        if (validatorType instanceof CounterColumnType)
-                            throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", columnName));
-                        if (cfm.getKeyValidator() instanceof CompositeType)
+                    for (ViewDefinition view : views)
+                    {
+                        if (view.includeAllColumns)
                         {
-                            List<AbstractType<?>> oldTypes = ((CompositeType) cfm.getKeyValidator()).types;
-                            if (!validatorType.isValueCompatibleWith(oldTypes.get(def.position())))
-                                throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
-                                                                               columnName,
-                                                                               oldTypes.get(def.position()).asCQL3Type(),
-                                                                               validator));
-
-                            List<AbstractType<?>> newTypes = new ArrayList<AbstractType<?>>(oldTypes);
-                            newTypes.set(def.position(), validatorType);
-                            cfm.keyValidator(CompositeType.getInstance(newTypes));
+                            ViewDefinition viewCopy = view.copy();
+                            viewCopy.metadata.addColumnDefinition(ColumnDefinition.regularDef(viewCopy.metadata, columnName.bytes, type));
+                            if (viewUpdates == null)
+                                viewUpdates = new ArrayList<>();
+                            viewUpdates.add(viewCopy);
                         }
-                        else
-                        {
-                            if (!validatorType.isValueCompatibleWith(cfm.getKeyValidator()))
-                                throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
-                                                                               columnName,
-                                                                               cfm.getKeyValidator().asCQL3Type(),
-                                                                               validator));
-                            cfm.keyValidator(validatorType);
-                        }
-                        break;
-                    case CLUSTERING_COLUMN:
-                        if (!cfm.isCQL3Table())
-                            throw new InvalidRequestException(String.format("Cannot alter clustering column %s in a non-CQL3 table", columnName));
-
-                        AbstractType<?> oldType = cfm.comparator.subtype(def.position());
-                        // Note that CFMetaData.validateCompatibility already validate the change we're about to do. However, the error message it
-                        // sends is a bit cryptic for a CQL3 user, so validating here for a sake of returning a better error message
-                        // Do note that we need isCompatibleWith here, not just isValueCompatibleWith.
-                        if (!validatorType.isCompatibleWith(oldType))
-                            throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are not order-compatible.",
-                                                                           columnName,
-                                                                           oldType.asCQL3Type(),
-                                                                           validator));
-
-                        cfm.comparator = cfm.comparator.setSubtype(def.position(), validatorType);
-                        break;
-                    case COMPACT_VALUE:
-                        // See below
-                        if (!validatorType.isValueCompatibleWith(cfm.getDefaultValidator()))
-                            throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
-                                                                           columnName,
-                                                                           cfm.getDefaultValidator().asCQL3Type(),
-                                                                           validator));
-                        cfm.defaultValidator(validatorType);
-                        break;
-                    case REGULAR:
-                    case STATIC:
-                        // Thrift allows to change a column validator so CFMetaData.validateCompatibility will let it slide
-                        // if we change to an incompatible type (contrarily to the comparator case). But we don't want to
-                        // allow it for CQL3 (see #5882) so validating it explicitly here. We only care about value compatibility
-                        // though since we won't compare values (except when there is an index, but that is validated by
-                        // ColumnDefinition already).
-                        if (!validatorType.isValueCompatibleWith(def.type))
-                            throw new ConfigurationException(String.format("Cannot change %s from type %s to type %s: types are incompatible.",
-                                                                           columnName,
-                                                                           def.type.asCQL3Type(),
-                                                                           validator));
-
-                        // For collections, if we alter the type, we need to update the comparator too since it includes
-                        // the type too (note that isValueCompatibleWith above has validated that the new type doesn't
-                        // change the underlying sorting order, but we still don't want to have a discrepancy between the type
-                        // in the comparator and the one in the ColumnDefinition as that would be dodgy).
-                        if (validatorType.isCollection() && validatorType.isMultiCell())
-                            cfm.comparator = cfm.comparator.addOrUpdateCollection(def.name, (CollectionType)validatorType);
-
-                        break;
+                    }
                 }
-                // In any case, we update the column definition
-                cfm.addOrReplaceColumnDefinition(def.withNewType(validatorType));
                 break;
 
             case DROP:
                 assert columnName != null;
-                if (!cfm.isCQL3Table())
+                if (!meta.isCQLTable())
                     throw new InvalidRequestException("Cannot drop columns from a non-CQL3 table");
+
                 if (def == null)
                     throw new InvalidRequestException(String.format("Column %s was not found in table %s", columnName, columnFamily()));
 
+                cfm = meta.copy();
+
                 switch (def.kind)
                 {
                     case PARTITION_KEY:
-                    case CLUSTERING_COLUMN:
+                    case CLUSTERING:
                         throw new InvalidRequestException(String.format("Cannot drop PRIMARY KEY part %s", columnName));
                     case REGULAR:
                     case STATIC:
                         ColumnDefinition toDelete = null;
-                        for (ColumnDefinition columnDef : cfm.regularAndStaticColumns())
+                        for (ColumnDefinition columnDef : cfm.partitionColumns())
                         {
                             if (columnDef.name.equals(columnName))
                             {
@@ -259,33 +248,93 @@
                         }
                         assert toDelete != null;
                         cfm.removeColumnDefinition(toDelete);
-                        cfm.recordColumnDrop(toDelete);
+                        cfm.recordColumnDrop(toDelete, deleteTimestamp == null ? queryState.getTimestamp() : deleteTimestamp);
                         break;
                 }
+
+                // If the dropped column is required by any secondary indexes
+                // we reject the operation, as the indexes must be dropped first
+                Indexes allIndexes = cfm.getIndexes();
+                if (!allIndexes.isEmpty())
+                {
+                    ColumnFamilyStore store = Keyspace.openAndGetStore(cfm);
+                    Set<IndexMetadata> dependentIndexes = store.indexManager.getDependentIndexes(def);
+                    if (!dependentIndexes.isEmpty())
+                        throw new InvalidRequestException(String.format("Cannot drop column %s because it has " +
+                                                                        "dependent secondary indexes (%s)",
+                                                                        def,
+                                                                        dependentIndexes.stream()
+                                                                                        .map(i -> i.name)
+                                                                                        .collect(Collectors.joining(","))));
+                }
+
+                if (!Iterables.isEmpty(views))
+                    throw new InvalidRequestException(String.format("Cannot drop column %s on base table %s with materialized views.",
+                                                                    columnName.toString(),
+                                                                    columnFamily()));
+                break;
+            case DROP_COMPACT_STORAGE:
+                if (!meta.isCompactTable())
+                    throw new InvalidRequestException("Cannot DROP COMPACT STORAGE on table without COMPACT STORAGE");
+
+                cfm = meta.asNonCompact();
                 break;
             case OPTS:
-                if (cfProps == null)
+                if (attrs == null)
                     throw new InvalidRequestException("ALTER TABLE WITH invoked, but no parameters found");
+                attrs.validate();
 
-                cfProps.validate();
+                cfm = meta.copy();
 
-                if (meta.isCounter() && cfProps.getDefaultTimeToLive() > 0)
+                TableParams params = attrs.asAlteredTableParams(cfm.params);
+
+                if (!Iterables.isEmpty(views) && params.gcGraceSeconds == 0)
+                {
+                    throw new InvalidRequestException("Cannot alter gc_grace_seconds of the base table of a " +
+                                                      "materialized view to 0, since this value is used to TTL " +
+                                                      "undelivered updates. Setting gc_grace_seconds too low might " +
+                                                      "cause undelivered updates to expire " +
+                                                      "before being replayed.");
+                }
+
+                if (meta.isCounter() && params.defaultTimeToLive > 0)
                     throw new InvalidRequestException("Cannot set default_time_to_live on a table with counters");
 
-                cfProps.applyToCFMetadata(cfm);
+                cfm.params(params);
+
                 break;
             case RENAME:
-                for (Map.Entry<ColumnIdentifier.Raw, ColumnIdentifier.Raw> entry : renames.entrySet())
+                cfm = meta.copy();
+
+                for (Map.Entry<ColumnIdentifier.Raw, ColumnIdentifier> entry : renames.entrySet())
                 {
                     ColumnIdentifier from = entry.getKey().prepare(cfm);
-                    ColumnIdentifier to = entry.getValue().prepare(cfm);
+                    ColumnIdentifier to = entry.getValue();
+
                     cfm.renameColumn(from, to);
+
+                    // If the view includes a renamed column, it must be renamed in the view table and the definition.
+                    for (ViewDefinition view : views)
+                    {
+                        if (!view.includes(from)) continue;
+
+                        ViewDefinition viewCopy = view.copy();
+                        ColumnIdentifier viewFrom = entry.getKey().prepare(viewCopy.metadata);
+                        ColumnIdentifier viewTo = entry.getValue();
+                        viewCopy.renameColumn(viewFrom, viewTo);
+
+                        if (viewUpdates == null)
+                            viewUpdates = new ArrayList<>();
+                        viewUpdates.add(viewCopy);
+                    }
                 }
                 break;
+            default:
+                throw new InvalidRequestException("Can not alter table: unknown option type " + oType);
         }
 
-        MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
-        return true;
+        MigrationManager.announceColumnFamilyUpdate(cfm, viewUpdates, isLocalOnly);
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 
     public String toString()
@@ -296,9 +345,4 @@
                              rawColumnName,
                              validator);
     }
-
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
-    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java
index 9203cf9..f2e1578 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterTypeStatement.java

@@ -23,11 +23,12 @@
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.composites.CellNames;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
 
 public abstract class AlterTypeStatement extends SchemaAlteringStatement
@@ -36,7 +37,6 @@
 
     protected AlterTypeStatement(UTName name)
     {
-        super();
         this.name = name;
     }
 
@@ -50,16 +50,16 @@
             throw new InvalidRequestException("You need to be logged in a keyspace or use a fully qualified user type name");
     }
 
-    protected abstract UserType makeUpdatedType(UserType toUpdate) throws InvalidRequestException;
+    protected abstract UserType makeUpdatedType(UserType toUpdate, KeyspaceMetadata ksm) throws InvalidRequestException;
 
     public static AlterTypeStatement addition(UTName name, ColumnIdentifier fieldName, CQL3Type.Raw type)
     {
-        return new AddOrAlter(name, true, fieldName, type);
+        return new Add(name, fieldName, type);
     }
 
     public static AlterTypeStatement alter(UTName name, ColumnIdentifier fieldName, CQL3Type.Raw type)
     {
-        return new AddOrAlter(name, false, fieldName, type);
+        throw new InvalidRequestException("Altering of types is not allowed");
     }
 
     public static AlterTypeStatement renames(UTName name, Map<ColumnIdentifier, ColumnIdentifier> renames)
@@ -78,35 +78,29 @@
         // It doesn't really change anything anyway.
     }
 
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
-    }
-
     @Override
     public String keyspace()
     {
         return name.getKeyspace();
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(name.getKeyspace());
         if (ksm == null)
             throw new InvalidRequestException(String.format("Cannot alter type in unknown keyspace %s", name.getKeyspace()));
 
-        UserType toUpdate = ksm.userTypes.getType(name.getUserTypeName());
-        // Shouldn't happen, unless we race with a drop
-        if (toUpdate == null)
-            throw new InvalidRequestException(String.format("No user type named %s exists.", name));
+        UserType toUpdate =
+            ksm.types.get(name.getUserTypeName())
+                     .orElseThrow(() -> new InvalidRequestException(String.format("No user type named %s exists.", name)));
 
-        UserType updated = makeUpdatedType(toUpdate);
+        UserType updated = makeUpdatedType(toUpdate, ksm);
 
         // Now, we need to announce the type update to basically change it for new tables using this type,
         // but we also need to find all existing user types and CF using it and change them.
         MigrationManager.announceTypeUpdate(updated, isLocalOnly);
 
-        for (CFMetaData cfm : ksm.cfMetaData().values())
+        for (CFMetaData cfm : ksm.tables)
         {
             CFMetaData copy = cfm.copy();
             boolean modified = false;
@@ -116,8 +110,18 @@
                 MigrationManager.announceColumnFamilyUpdate(copy, isLocalOnly);
         }
 
+        for (ViewDefinition view : ksm.views)
+        {
+            ViewDefinition copy = view.copy();
+            boolean modified = false;
+            for (ColumnDefinition def : copy.metadata.allColumns())
+                modified |= updateDefinition(copy.metadata, def, toUpdate.keyspace, toUpdate.name, updated);
+            if (modified)
+                MigrationManager.announceViewUpdate(copy, isLocalOnly);
+        }
+
         // Other user types potentially using the updated type
-        for (UserType ut : ksm.userTypes.getAllTypes().values())
+        for (UserType ut : ksm.types)
         {
             // Re-updating the type we've just updated would be harmless but useless so we avoid it.
             // Besides, we use the occasion to drop the old version of the type if it's a type rename
@@ -131,7 +135,7 @@
             if (upd != null)
                 MigrationManager.announceTypeUpdate((UserType) upd, isLocalOnly);
         }
-        return true;
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
     }
 
     private static int getIdxOfField(UserType type, ColumnIdentifier field)
@@ -150,28 +154,6 @@
 
         // We need to update this validator ...
         cfm.addOrReplaceColumnDefinition(def.withNewType(t));
-
-        // ... but if it's part of the comparator or key validator, we need to go update those too.
-        switch (def.kind)
-        {
-            case PARTITION_KEY:
-                cfm.keyValidator(updateWith(cfm.getKeyValidator(), keyspace, toReplace, updated));
-                break;
-            case CLUSTERING_COLUMN:
-                cfm.comparator = CellNames.fromAbstractType(updateWith(cfm.comparator.asAbstractType(), keyspace, toReplace, updated), cfm.comparator.isDense());
-                break;
-            default:
-                // If it's a collection, we still want to modify the comparator because the collection is aliased in it
-                if (def.type instanceof CollectionType && def.type.isMultiCell())
-                {
-                    t = updateWith(cfm.comparator.asAbstractType(), keyspace, toReplace, updated);
-                    // If t == null, all relevant comparators were updated via updateWith, which reaches into types and
-                    // collections
-                    if (t != null)
-                        cfm.comparator = CellNames.fromAbstractType(t, cfm.comparator.isDense());
-                }
-                break;
-        }
         return true;
     }
 
@@ -203,23 +185,6 @@
             List<AbstractType<?>> updatedTypes = updateTypes(ct.types, keyspace, toReplace, updated);
             return updatedTypes == null ? null : CompositeType.getInstance(updatedTypes);
         }
-        else if (type instanceof ColumnToCollectionType)
-        {
-            ColumnToCollectionType ctct = (ColumnToCollectionType)type;
-            Map<ByteBuffer, CollectionType> updatedTypes = null;
-            for (Map.Entry<ByteBuffer, CollectionType> entry : ctct.defined.entrySet())
-            {
-                AbstractType<?> t = updateWith(entry.getValue(), keyspace, toReplace, updated);
-                if (t == null)
-                    continue;
-
-                if (updatedTypes == null)
-                    updatedTypes = new HashMap<>(ctct.defined);
-
-                updatedTypes.put(entry.getKey(), (CollectionType)t);
-            }
-            return updatedTypes == null ? null : ColumnToCollectionType.getInstance(updatedTypes);
-        }
         else if (type instanceof CollectionType)
         {
             if (type instanceof ListType)
@@ -271,21 +236,28 @@
         return updatedTypes;
     }
 
-    private static class AddOrAlter extends AlterTypeStatement
+    protected void checkTypeNotUsedByAggregate(KeyspaceMetadata ksm)
     {
-        private final boolean isAdd;
+        ksm.functions.udas().filter(aggregate -> aggregate.initialCondition() != null && aggregate.stateType().referencesUserType(name.getStringTypeName()))
+                     .findAny()
+                     .ifPresent((aggregate) -> {
+                         throw new InvalidRequestException(String.format("Cannot alter user type %s as it is still used as an INITCOND by aggregate %s", name, aggregate));
+                     });
+    }
+
+    private static class Add extends AlterTypeStatement
+    {
         private final ColumnIdentifier fieldName;
         private final CQL3Type.Raw type;
 
-        public AddOrAlter(UTName name, boolean isAdd, ColumnIdentifier fieldName, CQL3Type.Raw type)
+        public Add(UTName name, ColumnIdentifier fieldName, CQL3Type.Raw type)
         {
             super(name);
-            this.isAdd = isAdd;
             this.fieldName = fieldName;
             this.type = type;
         }
 
-        private UserType doAdd(UserType toUpdate) throws InvalidRequestException
+        protected UserType makeUpdatedType(UserType toUpdate, KeyspaceMetadata ksm) throws InvalidRequestException
         {
             if (getIdxOfField(toUpdate, fieldName) >= 0)
                 throw new InvalidRequestException(String.format("Cannot add new field %s to type %s: a field of the same name already exists", fieldName, name));
@@ -295,7 +267,7 @@
             newNames.add(fieldName.bytes);
 
             AbstractType<?> addType = type.prepare(keyspace()).getType();
-            if (addType.references(toUpdate))
+            if (addType.referencesUserType(toUpdate.getNameAsString()))
                 throw new InvalidRequestException(String.format("Cannot add new field %s of type %s to type %s as this would create a circular reference", fieldName, type, name));
 
             List<AbstractType<?>> newTypes = new ArrayList<>(toUpdate.size() + 1);
@@ -304,28 +276,6 @@
 
             return new UserType(toUpdate.keyspace, toUpdate.name, newNames, newTypes);
         }
-
-        private UserType doAlter(UserType toUpdate) throws InvalidRequestException
-        {
-            int idx = getIdxOfField(toUpdate, fieldName);
-            if (idx < 0)
-                throw new InvalidRequestException(String.format("Unknown field %s in type %s", fieldName, name));
-
-            AbstractType<?> previous = toUpdate.fieldType(idx);
-            if (!type.prepare(keyspace()).getType().isCompatibleWith(previous))
-                throw new InvalidRequestException(String.format("Type %s is incompatible with previous type %s of field %s in user type %s", type, previous.asCQL3Type(), fieldName, name));
-
-            List<ByteBuffer> newNames = new ArrayList<>(toUpdate.fieldNames());
-            List<AbstractType<?>> newTypes = new ArrayList<>(toUpdate.fieldTypes());
-            newTypes.set(idx, type.prepare(keyspace()).getType());
-
-            return new UserType(toUpdate.keyspace, toUpdate.name, newNames, newTypes);
-        }
-
-        protected UserType makeUpdatedType(UserType toUpdate) throws InvalidRequestException
-        {
-            return isAdd ? doAdd(toUpdate) : doAlter(toUpdate);
-        }
     }
 
     private static class Renames extends AlterTypeStatement
@@ -338,8 +288,10 @@
             this.renames = renames;
         }
 
-        protected UserType makeUpdatedType(UserType toUpdate) throws InvalidRequestException
+        protected UserType makeUpdatedType(UserType toUpdate, KeyspaceMetadata ksm) throws InvalidRequestException
         {
+            checkTypeNotUsedByAggregate(ksm);
+
             List<ByteBuffer> newNames = new ArrayList<>(toUpdate.fieldNames());
             List<AbstractType<?>> newTypes = new ArrayList<>(toUpdate.fieldTypes());
 

diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterViewStatement.java
new file mode 100644
index 0000000..ea87cfd
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterViewStatement.java

@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.cql3.CFName;
+import org.apache.cassandra.db.view.View;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.transport.Event;
+
+import static org.apache.cassandra.thrift.ThriftValidation.validateColumnFamily;
+
+public class AlterViewStatement extends SchemaAlteringStatement
+{
+    private final TableAttributes attrs;
+
+    public AlterViewStatement(CFName name, TableAttributes attrs)
+    {
+        super(name);
+        this.attrs = attrs;
+    }
+
+    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
+    {
+        CFMetaData baseTable = View.findBaseTable(keyspace(), columnFamily());
+        if (baseTable != null)
+            state.hasColumnFamilyAccess(keyspace(), baseTable.cfName, Permission.ALTER);
+    }
+
+    public void validate(ClientState state)
+    {
+        // validated in announceMigration()
+    }
+
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
+    {
+        CFMetaData meta = validateColumnFamily(keyspace(), columnFamily());
+        if (!meta.isView())
+            throw new InvalidRequestException("Cannot use ALTER MATERIALIZED VIEW on Table");
+
+        ViewDefinition viewCopy = Schema.instance.getView(keyspace(), columnFamily()).copy();
+
+        if (attrs == null)
+            throw new InvalidRequestException("ALTER MATERIALIZED VIEW WITH invoked, but no parameters found");
+
+        attrs.validate();
+
+        TableParams params = attrs.asAlteredTableParams(viewCopy.metadata.params);
+        if (params.gcGraceSeconds == 0)
+        {
+            throw new InvalidRequestException("Cannot alter gc_grace_seconds of a materialized view to 0, since this " +
+                                              "value is used to TTL undelivered updates. Setting gc_grace_seconds too " +
+                                              "low might cause undelivered updates to expire before being replayed.");
+        }
+
+        if (params.defaultTimeToLive > 0)
+        {
+            throw new InvalidRequestException("Cannot set or alter default_time_to_live for a materialized view. " +
+                                              "Data in a materialized view always expire at the same time than " +
+                                              "the corresponding data in the parent table.");
+        }
+
+        viewCopy.metadata.params(params);
+
+        MigrationManager.announceViewUpdate(viewCopy, isLocalOnly);
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
+    }
+
+    public String toString()
+    {
+        return String.format("AlterViewStatement(name=%s)", cfName);
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/AuthenticationStatement.java b/src/java/org/apache/cassandra/cql3/statements/AuthenticationStatement.java
index 151e4f0..30ab6b0 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AuthenticationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AuthenticationStatement.java

@@ -31,7 +31,7 @@
 public abstract class AuthenticationStatement extends ParsedStatement implements CQLStatement
 {
     @Override
-    public Prepared prepare()
+    public Prepared prepare(ClientState clientState)
     {
         return new Prepared(this);
     }

diff --git a/src/java/org/apache/cassandra/cql3/statements/AuthorizationStatement.java b/src/java/org/apache/cassandra/cql3/statements/AuthorizationStatement.java
index 098e22c..fa2a993 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AuthorizationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AuthorizationStatement.java

@@ -32,7 +32,7 @@
 public abstract class AuthorizationStatement extends ParsedStatement implements CQLStatement
 {
     @Override
-    public Prepared prepare()
+    public Prepared prepare(ClientState clientState)
     {
         return new Prepared(this);
     }

diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
index 76e389b..1c3cfa6 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java

@@ -23,26 +23,26 @@
 
 import com.google.common.base.Function;
 import com.google.common.collect.Iterables;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.helpers.MessageFormatter;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.RowIterator;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.ClientWarn;
-import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.*;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.NoSpamLogger;
 import org.apache.cassandra.utils.Pair;
 
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+
 /**
  * A <code>BATCH</code> statement parsed from a CQL query.
  */
@@ -56,11 +56,27 @@
     private final int boundTerms;
     public final Type type;
     private final List<ModificationStatement> statements;
+
+    // Columns modified for each table (keyed by the table ID)
+    private final Map<UUID, PartitionColumns> updatedColumns;
+    // Columns on which there is conditions. Note that if there is any, then the batch can only be on a single partition (and thus table).
+    private final PartitionColumns conditionColumns;
+
+    private final boolean updatesRegularRows;
+    private final boolean updatesStaticRow;
     private final Attributes attrs;
     private final boolean hasConditions;
     private static final Logger logger = LoggerFactory.getLogger(BatchStatement.class);
-    private static final String unloggedBatchWarning = "Unlogged batch covering {} partitions detected against table{} {}. " +
-                                                       "You should use a logged batch for atomicity, or asynchronous writes for performance.";
+
+    private static final String UNLOGGED_BATCH_WARNING = "Unlogged batch covering {} partitions detected " +
+                                                         "against table{} {}. You should use a logged batch for " +
+                                                         "atomicity, or asynchronous writes for performance.";
+
+    private static final String LOGGED_BATCH_LOW_GCGS_WARNING = "Executing a LOGGED BATCH on table{} {}, configured with a " +
+                                                                "gc_grace_seconds of 0. The gc_grace_seconds is used to TTL " +
+                                                                "batchlog entries, so setting gc_grace_seconds too low on " +
+                                                                "tables involved in an atomic batch might cause batchlog " +
+                                                                "entries to expire before being replayed.";
 
     /**
      * Creates a new BatchStatement from a list of statements and a
@@ -72,22 +88,41 @@
      */
     public BatchStatement(int boundTerms, Type type, List<ModificationStatement> statements, Attributes attrs)
     {
-        boolean hasConditions = false;
-        for (ModificationStatement statement : statements)
-            hasConditions |= statement.hasConditions();
-
         this.boundTerms = boundTerms;
         this.type = type;
         this.statements = statements;
         this.attrs = attrs;
+
+        boolean hasConditions = false;
+        MultiTableColumnsBuilder regularBuilder = new MultiTableColumnsBuilder();
+        PartitionColumns.Builder conditionBuilder = PartitionColumns.builder();
+        boolean updateRegular = false;
+        boolean updateStatic = false;
+
+        for (ModificationStatement stmt : statements)
+        {
+            regularBuilder.addAll(stmt.cfm, stmt.updatedColumns());
+            updateRegular |= stmt.updatesRegularRows();
+            if (stmt.hasConditions())
+            {
+                hasConditions = true;
+                conditionBuilder.addAll(stmt.conditionColumns());
+                updateStatic |= stmt.updatesStaticRow();
+            }
+        }
+
+        this.updatedColumns = regularBuilder.build();
+        this.conditionColumns = conditionBuilder.build();
+        this.updatesRegularRows = updateRegular;
+        this.updatesStaticRow = updateStatic;
         this.hasConditions = hasConditions;
     }
 
     public Iterable<org.apache.cassandra.cql3.functions.Function> getFunctions()
     {
-        Iterable<org.apache.cassandra.cql3.functions.Function> functions = attrs.getFunctions();
+        List<org.apache.cassandra.cql3.functions.Function> functions = new ArrayList<>();
         for (ModificationStatement statement : statements)
-            functions = Iterables.concat(functions, statement.getFunctions());
+            statement.addFunctionsTo(functions);
         return functions;
     }
 
@@ -113,7 +148,8 @@
         {
             if (hasConditions)
                 throw new InvalidRequestException("Cannot provide custom timestamp for conditional BATCH");
-            if (type == Type.COUNTER)
+
+            if (isCounter())
                 throw new InvalidRequestException("Cannot provide custom timestamp for counter BATCH");
         }
 
@@ -128,10 +164,10 @@
             if (timestampSet && statement.isTimestampSet())
                 throw new InvalidRequestException("Timestamp must be set either on BATCH or individual statements");
 
-            if (type == Type.COUNTER && !statement.isCounter())
+            if (isCounter() && !statement.isCounter())
                 throw new InvalidRequestException("Cannot include non-counter statement in a counter batch");
 
-            if (type == Type.LOGGED && statement.isCounter())
+            if (isLogged() && statement.isCounter())
                 throw new InvalidRequestException("Cannot include a counter statement in a logged batch");
 
             if (statement.isCounter())
@@ -157,6 +193,16 @@
         }
     }
 
+    private boolean isCounter()
+    {
+        return type == Type.COUNTER;
+    }
+
+    private boolean isLogged()
+    {
+        return type == Type.LOGGED;
+    }
+
     // The batch itself will be validated in either Parsed#prepare() - for regular CQL3 batches,
     //   or in QueryProcessor.processBatch() - for native protocol batches.
     public void validate(ClientState state) throws InvalidRequestException
@@ -173,71 +219,40 @@
     private Collection<? extends IMutation> getMutations(BatchQueryOptions options, boolean local, long now)
     throws RequestExecutionException, RequestValidationException
     {
-        Map<String, Map<ByteBuffer, IMutation>> mutations = new HashMap<>();
+        Set<String> tablesWithZeroGcGs = null;
+        UpdatesCollector collector = new UpdatesCollector(updatedColumns, updatedRows());
         for (int i = 0; i < statements.size(); i++)
         {
             ModificationStatement statement = statements.get(i);
+            if (isLogged() && statement.cfm.params.gcGraceSeconds == 0)
+            {
+                if (tablesWithZeroGcGs == null)
+                    tablesWithZeroGcGs = new HashSet<>();
+                tablesWithZeroGcGs.add(String.format("%s.%s", statement.cfm.ksName, statement.cfm.cfName));
+            }
             QueryOptions statementOptions = options.forStatement(i);
             long timestamp = attrs.getTimestamp(now, statementOptions);
-            addStatementMutations(statement, statementOptions, local, timestamp, mutations);
+            statement.addUpdates(collector, statementOptions, local, timestamp);
         }
-        return unzipMutations(mutations);
+
+        if (tablesWithZeroGcGs != null)
+        {
+            String suffix = tablesWithZeroGcGs.size() == 1 ? "" : "s";
+            NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.MINUTES, LOGGED_BATCH_LOW_GCGS_WARNING,
+                             suffix, tablesWithZeroGcGs);
+            ClientWarn.instance.warn(MessageFormatter.arrayFormat(LOGGED_BATCH_LOW_GCGS_WARNING, new Object[] { suffix, tablesWithZeroGcGs })
+                                                     .getMessage());
+        }
+
+        collector.validateIndexedColumns();
+        return collector.toMutations();
     }
 
-    private Collection<? extends IMutation> unzipMutations(Map<String, Map<ByteBuffer, IMutation>> mutations)
+    private int updatedRows()
     {
-
-        // The case where all statement where on the same keyspace is pretty common
-        if (mutations.size() == 1)
-            return mutations.values().iterator().next().values();
-
-
-        List<IMutation> ms = new ArrayList<>();
-        for (Map<ByteBuffer, IMutation> ksMap : mutations.values())
-            ms.addAll(ksMap.values());
-
-        return ms;
-    }
-
-    private void addStatementMutations(ModificationStatement statement,
-                                       QueryOptions options,
-                                       boolean local,
-                                       long now,
-                                       Map<String, Map<ByteBuffer, IMutation>> mutations)
-    throws RequestExecutionException, RequestValidationException
-    {
-        String ksName = statement.keyspace();
-        Map<ByteBuffer, IMutation> ksMap = mutations.get(ksName);
-        if (ksMap == null)
-        {
-            ksMap = new HashMap<>();
-            mutations.put(ksName, ksMap);
-        }
-
-        // The following does the same than statement.getMutations(), but we inline it here because
-        // we don't want to recreate mutations every time as this is particularly inefficient when applying
-        // multiple batch to the same partition (see #6737).
-        List<ByteBuffer> keys = statement.buildPartitionKeyNames(options);
-        Composite clusteringPrefix = statement.createClusteringPrefix(options);
-        UpdateParameters params = statement.makeUpdateParameters(keys, clusteringPrefix, options, local, now);
-
-        for (ByteBuffer key : keys)
-        {
-            IMutation mutation = ksMap.get(key);
-            Mutation mut;
-            if (mutation == null)
-            {
-                mut = new Mutation(ksName, key);
-                mutation = statement.cfm.isCounter() ? new CounterMutation(mut, options.getConsistency()) : mut;
-                ksMap.put(key, mutation);
-            }
-            else
-            {
-                mut = statement.cfm.isCounter() ? ((CounterMutation) mutation).getMutation() : (Mutation) mutation;
-            }
-
-            statement.addUpdateForKey(mut.addOrGet(statement.cfm), key, clusteringPrefix, params);
-        }
+        // Note: it's possible for 2 statements to actually apply to the same row, but that's just an estimation
+        // for sizing our PartitionUpdate backing array, so it's good enough.
+        return statements.size();
     }
 
     /**
@@ -245,64 +260,60 @@
      *
      * @param cfs ColumnFamilies that will store the batch's mutations.
      */
-    public static void verifyBatchSize(Iterable<ColumnFamily> cfs) throws InvalidRequestException
+    public static void verifyBatchSize(Iterable<PartitionUpdate> updates) throws InvalidRequestException
     {
         long size = 0;
         long warnThreshold = DatabaseDescriptor.getBatchSizeWarnThreshold();
         long failThreshold = DatabaseDescriptor.getBatchSizeFailThreshold();
 
-        for (ColumnFamily cf : cfs)
-            size += cf.dataSize();
+        for (PartitionUpdate update : updates)
+            size += update.dataSize();
 
         if (size > warnThreshold)
         {
-            Set<String> ksCfPairs = new HashSet<>();
-            for (ColumnFamily cf : cfs)
-                ksCfPairs.add(String.format("%s.%s", cf.metadata().ksName, cf.metadata().cfName));
+            Set<String> tableNames = new HashSet<>();
+            for (PartitionUpdate update : updates)
+                tableNames.add(String.format("%s.%s", update.metadata().ksName, update.metadata().cfName));
 
             String format = "Batch of prepared statements for {} is of size {}, exceeding specified threshold of {} by {}.{}";
             if (size > failThreshold)
             {
-                Tracing.trace(format, ksCfPairs, size, failThreshold, size - failThreshold, " (see batch_size_fail_threshold_in_kb)");
-                logger.error(format, ksCfPairs, size, failThreshold, size - failThreshold, " (see batch_size_fail_threshold_in_kb)");
+                Tracing.trace(format, tableNames, size, failThreshold, size - failThreshold, " (see batch_size_fail_threshold_in_kb)");
+                logger.error(format, tableNames, size, failThreshold, size - failThreshold, " (see batch_size_fail_threshold_in_kb)");
                 throw new InvalidRequestException("Batch too large");
             }
             else if (logger.isWarnEnabled())
             {
-                logger.warn(format, ksCfPairs, size, warnThreshold, size - warnThreshold, "");
+                logger.warn(format, tableNames, size, warnThreshold, size - warnThreshold, "");
             }
-            ClientWarn.instance.warn(MessageFormatter.arrayFormat(format, new Object[]{ ksCfPairs, size, warnThreshold, size - warnThreshold, "" }).getMessage());
+            ClientWarn.instance.warn(MessageFormatter.arrayFormat(format, new Object[] {tableNames, size, warnThreshold, size - warnThreshold, ""}).getMessage());
         }
     }
 
-    private void verifyBatchType(Collection<? extends IMutation> mutations)
+    private void verifyBatchType(Iterable<PartitionUpdate> updates)
     {
-        if (type != Type.LOGGED && mutations.size() > 1)
+        if (!isLogged() && Iterables.size(updates) > 1)
         {
-            Set<String> ksCfPairs = new HashSet<>();
-            Set<ByteBuffer> keySet = new HashSet<>();
+            Set<DecoratedKey> keySet = new HashSet<>();
+            Set<String> tableNames = new HashSet<>();
 
-            for (IMutation im : mutations)
+            for (PartitionUpdate update : updates)
             {
-                keySet.add(im.key());
+                keySet.add(update.partitionKey());
 
-                for (ColumnFamily cf : im.getColumnFamilies())
-                    ksCfPairs.add(String.format("%s.%s", cf.metadata().ksName, cf.metadata().cfName));
+                tableNames.add(String.format("%s.%s", update.metadata().ksName, update.metadata().cfName));
             }
 
             // CASSANDRA-11529: log only if we have more than a threshold of keys, this was also suggested in the
             // original ticket that introduced this warning, CASSANDRA-9282
             if (keySet.size() > DatabaseDescriptor.getUnloggedBatchAcrossPartitionsWarnThreshold())
             {
-                NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.MINUTES, unloggedBatchWarning,
-                                 keySet.size(), ksCfPairs.size() == 1 ? "" : "s", ksCfPairs);
 
-                ClientWarn.instance.warn(MessageFormatter.arrayFormat(unloggedBatchWarning,
-                                                                  new Object[]{
-                                                                              keySet.size(),
-                                                                              ksCfPairs.size() == 1 ? "" : "s",
-                                                                              ksCfPairs
-                                                                  }).getMessage());
+                NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.MINUTES, UNLOGGED_BATCH_WARNING,
+                                 keySet.size(), tableNames.size() == 1 ? "" : "s", tableNames);
+
+                ClientWarn.instance.warn(MessageFormatter.arrayFormat(UNLOGGED_BATCH_WARNING, new Object[]{keySet.size(),
+                                                    tableNames.size() == 1 ? "" : "s", tableNames}).getMessage());
             }
         }
     }
@@ -335,19 +346,22 @@
 
     private void executeWithoutConditions(Collection<? extends IMutation> mutations, ConsistencyLevel cl) throws RequestExecutionException, RequestValidationException
     {
-        // Extract each collection of cfs from it's IMutation and then lazily concatenate all of them into a single Iterable.
-        Iterable<ColumnFamily> cfs = Iterables.concat(Iterables.transform(mutations, new Function<IMutation, Collection<ColumnFamily>>()
+        if (mutations.isEmpty())
+            return;
+
+        // Extract each collection of updates from it's IMutation and then lazily concatenate all of them into a single Iterable.
+        Iterable<PartitionUpdate> updates = Iterables.concat(Iterables.transform(mutations, new Function<IMutation, Collection<PartitionUpdate>>()
         {
-            public Collection<ColumnFamily> apply(IMutation im)
+            public Collection<PartitionUpdate> apply(IMutation im)
             {
-                return im.getColumnFamilies();
+                return im.getPartitionUpdates();
             }
         }));
 
-        verifyBatchSize(cfs);
-        verifyBatchType(mutations);
+        verifyBatchSize(updates);
+        verifyBatchType(updates);
 
-        boolean mutateAtomic = (type == Type.LOGGED && mutations.size() > 1);
+        boolean mutateAtomic = (isLogged() && mutations.size() > 1);
         StorageProxy.mutateWithTriggers(mutations, cl, mutateAtomic);
     }
 
@@ -358,27 +372,26 @@
         CQL3CasRequest casRequest = p.left;
         Set<ColumnDefinition> columnsWithConditions = p.right;
 
-        ColumnFamily result = StorageProxy.cas(casRequest.cfm.ksName,
-                                               casRequest.cfm.cfName,
-                                               casRequest.key,
-                                               casRequest,
-                                               options.getSerialConsistency(),
-                                               options.getConsistency(),
-                                               state.getClientState());
+        String ksName = casRequest.cfm.ksName;
+        String tableName = casRequest.cfm.cfName;
 
-        return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(casRequest.cfm.ksName,
-                                                                              casRequest.key,
-                                                                              casRequest.cfm.cfName,
-                                                                              result,
-                                                                              columnsWithConditions,
-                                                                              true,
-                                                                              options.forStatement(0)));
+        try (RowIterator result = StorageProxy.cas(ksName,
+                                                   tableName,
+                                                   casRequest.key,
+                                                   casRequest,
+                                                   options.getSerialConsistency(),
+                                                   options.getConsistency(),
+                                                   state.getClientState()))
+        {
+            return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(ksName, tableName, result, columnsWithConditions, true, options.forStatement(0)));
+        }
     }
 
+
     private Pair<CQL3CasRequest,Set<ColumnDefinition>> makeCasRequest(BatchQueryOptions options, QueryState state)
     {
         long now = state.getTimestamp();
-        ByteBuffer key = null;
+        DecoratedKey key = null;
         CQL3CasRequest casRequest = null;
         Set<ColumnDefinition> columnsWithConditions = new LinkedHashSet<>();
 
@@ -388,29 +401,52 @@
             QueryOptions statementOptions = options.forStatement(i);
             long timestamp = attrs.getTimestamp(now, statementOptions);
             List<ByteBuffer> pks = statement.buildPartitionKeyNames(statementOptions);
-            if (pks.size() > 1)
+            if (statement.getRestrictions().keyIsInRelation())
                 throw new IllegalArgumentException("Batch with conditions cannot span multiple partitions (you cannot use IN on the partition key)");
             if (key == null)
             {
-                key = pks.get(0);
-                casRequest = new CQL3CasRequest(statement.cfm, key, true);
+                key = statement.cfm.decorateKey(pks.get(0));
+                casRequest = new CQL3CasRequest(statement.cfm, key, true, conditionColumns, updatesRegularRows, updatesStaticRow);
             }
-            else if (!key.equals(pks.get(0)))
+            else if (!key.getKey().equals(pks.get(0)))
             {
                 throw new InvalidRequestException("Batch with conditions cannot span multiple partitions");
             }
 
-            Composite clusteringPrefix = statement.createClusteringPrefix(statementOptions);
-            if (statement.hasConditions())
+            checkFalse(statement.getRestrictions().clusteringKeyRestrictionsHasIN(),
+                       "IN on the clustering key columns is not supported with conditional %s",
+                       statement.type.isUpdate()? "updates" : "deletions");
+
+            if (statement.hasSlices())
             {
-                statement.addConditions(clusteringPrefix, casRequest, statementOptions);
-                // As soon as we have a ifNotExists, we set columnsWithConditions to null so that everything is in the resultSet
-                if (statement.hasIfNotExistCondition() || statement.hasIfExistCondition())
-                    columnsWithConditions = null;
-                else if (columnsWithConditions != null)
-                    Iterables.addAll(columnsWithConditions, statement.getColumnsWithConditions());
+                // All of the conditions require meaningful Clustering, not Slices
+                assert !statement.hasConditions();
+
+                Slices slices = statement.createSlices(statementOptions);
+                // If all the ranges were invalid we do not need to do anything.
+                if (slices.isEmpty())
+                    continue;
+
+                for (Slice slice : slices)
+                {
+                    casRequest.addRangeDeletion(slice, statement, statementOptions, timestamp);
+                }
+
             }
-            casRequest.addRowUpdate(clusteringPrefix, statement, statementOptions, timestamp);
+            else
+            {
+                Clustering clustering = Iterables.getOnlyElement(statement.createClustering(statementOptions));
+                if (statement.hasConditions())
+                {
+                    statement.addConditions(clustering, casRequest, statementOptions);
+                    // As soon as we have a ifNotExists, we set columnsWithConditions to null so that everything is in the resultSet
+                    if (statement.hasIfNotExistCondition() || statement.hasIfExistCondition())
+                        columnsWithConditions = null;
+                    else if (columnsWithConditions != null)
+                        Iterables.addAll(columnsWithConditions, statement.getColumnsWithConditions());
+                }
+                casRequest.addRowUpdate(clustering, statement, statementOptions, timestamp);
+            }
         }
 
         return Pair.create(casRequest, columnsWithConditions);
@@ -445,20 +481,13 @@
         CQL3CasRequest request = p.left;
         Set<ColumnDefinition> columnsWithConditions = p.right;
 
-        ColumnFamily result = ModificationStatement.casInternal(request, state);
+        String ksName = request.cfm.ksName;
+        String tableName = request.cfm.cfName;
 
-        return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(request.cfm.ksName,
-                                                                              request.key,
-                                                                              request.cfm.cfName,
-                                                                              result,
-                                                                              columnsWithConditions,
-                                                                              true,
-                                                                              options.forStatement(0)));
-    }
-
-    public interface BatchVariables
-    {
-        public List<ByteBuffer> getVariablesForStatement(int statementInBatch);
+        try (RowIterator result = ModificationStatement.casInternal(request, state))
+        {
+            return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(ksName, tableName, result, columnsWithConditions, true, options.forStatement(0)));
+        }
     }
 
     public String toString()
@@ -487,7 +516,7 @@
                 statement.prepareKeyspace(state);
         }
 
-        public ParsedStatement.Prepared prepare() throws InvalidRequestException
+        public ParsedStatement.Prepared prepare(ClientState clientState) throws InvalidRequestException
         {
             VariableSpecifications boundNames = getBoundVariables();
 
@@ -508,7 +537,7 @@
                     haveMultipleCFs = !firstKS.equals(parsed.keyspace()) || !firstCF.equals(parsed.columnFamily());
                 }
 
-                statements.add(parsed.prepare(boundNames));
+                statements.add(parsed.prepare(boundNames, clientState));
             }
 
             Attributes prepAttrs = attrs.prepare("[batch]", "[batch]");
@@ -525,4 +554,28 @@
             return new ParsedStatement.Prepared(batchStatement, boundNames, partitionKeyBindIndexes);
         }
     }
+
+    private static class MultiTableColumnsBuilder
+    {
+        private final Map<UUID, PartitionColumns.Builder> perTableBuilders = new HashMap<>();
+
+        public void addAll(CFMetaData table, PartitionColumns columns)
+        {
+            PartitionColumns.Builder builder = perTableBuilders.get(table.cfId);
+            if (builder == null)
+            {
+                builder = PartitionColumns.builder();
+                perTableBuilders.put(table.cfId, builder);
+            }
+            builder.addAll(columns);
+        }
+
+        public Map<UUID, PartitionColumns> build()
+        {
+            Map<UUID, PartitionColumns> m = new HashMap<>();
+            for (Map.Entry<UUID, PartitionColumns.Builder> p : perTableBuilders.entrySet())
+                m.put(p.getKey(), p.getValue().build());
+            return m;
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java b/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java
deleted file mode 100644
index 4db4fb7..0000000
--- a/src/java/org/apache/cassandra/cql3/statements/CFPropDefs.java
+++ /dev/null

@@ -1,255 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.statements;
-
-import java.util.*;
-
-import org.apache.cassandra.cache.CachingOptions;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.CFMetaData.SpeculativeRetry;
-import org.apache.cassandra.db.ExpiringCell;
-import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.utils.BloomCalculations;
-
-public class CFPropDefs extends PropertyDefinitions
-{
-    public static final String KW_COMMENT = "comment";
-    public static final String KW_READREPAIRCHANCE = "read_repair_chance";
-    public static final String KW_DCLOCALREADREPAIRCHANCE = "dclocal_read_repair_chance";
-    public static final String KW_GCGRACESECONDS = "gc_grace_seconds";
-    public static final String KW_MINCOMPACTIONTHRESHOLD = "min_threshold";
-    public static final String KW_MAXCOMPACTIONTHRESHOLD = "max_threshold";
-    public static final String KW_CACHING = "caching";
-    public static final String KW_DEFAULT_TIME_TO_LIVE = "default_time_to_live";
-    public static final String KW_MIN_INDEX_INTERVAL = "min_index_interval";
-    public static final String KW_MAX_INDEX_INTERVAL = "max_index_interval";
-    public static final String KW_SPECULATIVE_RETRY = "speculative_retry";
-    public static final String KW_BF_FP_CHANCE = "bloom_filter_fp_chance";
-    public static final String KW_MEMTABLE_FLUSH_PERIOD = "memtable_flush_period_in_ms";
-
-    public static final String KW_COMPACTION = "compaction";
-    public static final String KW_COMPRESSION = "compression";
-
-    public static final String KW_ID = "id";
-
-    public static final String COMPACTION_STRATEGY_CLASS_KEY = "class";
-
-    public static final Set<String> keywords = new HashSet<>();
-    public static final Set<String> obsoleteKeywords = new HashSet<>();
-
-    static
-    {
-        keywords.add(KW_COMMENT);
-        keywords.add(KW_READREPAIRCHANCE);
-        keywords.add(KW_DCLOCALREADREPAIRCHANCE);
-        keywords.add(KW_GCGRACESECONDS);
-        keywords.add(KW_CACHING);
-        keywords.add(KW_DEFAULT_TIME_TO_LIVE);
-        keywords.add(KW_MIN_INDEX_INTERVAL);
-        keywords.add(KW_MAX_INDEX_INTERVAL);
-        keywords.add(KW_SPECULATIVE_RETRY);
-        keywords.add(KW_BF_FP_CHANCE);
-        keywords.add(KW_COMPACTION);
-        keywords.add(KW_COMPRESSION);
-        keywords.add(KW_MEMTABLE_FLUSH_PERIOD);
-        keywords.add(KW_ID);
-
-        obsoleteKeywords.add("index_interval");
-        obsoleteKeywords.add("replicate_on_write");
-        obsoleteKeywords.add("populate_io_cache_on_flush");
-    }
-
-    private Class<? extends AbstractCompactionStrategy> compactionStrategyClass = null;
-
-    public void validate() throws ConfigurationException, SyntaxException
-    {
-        // Skip validation if the comapction strategy class is already set as it means we've alreayd
-        // prepared (and redoing it would set strategyClass back to null, which we don't want)
-        if (compactionStrategyClass != null)
-            return;
-
-        validate(keywords, obsoleteKeywords);
-
-        try
-        {
-            getId();
-        }
-        catch (IllegalArgumentException e)
-        {
-            throw new ConfigurationException("Invalid table id", e);
-        }
-
-        Map<String, String> compactionOptions = getCompactionOptions();
-        if (!compactionOptions.isEmpty())
-        {
-            String strategy = compactionOptions.get(COMPACTION_STRATEGY_CLASS_KEY);
-            if (strategy == null)
-                throw new ConfigurationException("Missing sub-option '" + COMPACTION_STRATEGY_CLASS_KEY + "' for the '" + KW_COMPACTION + "' option.");
-
-            compactionStrategyClass = CFMetaData.createCompactionStrategy(strategy);
-            compactionOptions.remove(COMPACTION_STRATEGY_CLASS_KEY);
-
-            CFMetaData.validateCompactionOptions(compactionStrategyClass, compactionOptions);
-        }
-
-        Map<String, String> compressionOptions = getCompressionOptions();
-        if (!compressionOptions.isEmpty())
-        {
-            String sstableCompressionClass = compressionOptions.get(CompressionParameters.SSTABLE_COMPRESSION);
-            if (sstableCompressionClass == null)
-                throw new ConfigurationException("Missing sub-option '" + CompressionParameters.SSTABLE_COMPRESSION + "' for the '" + KW_COMPRESSION + "' option.");
-
-            Integer chunkLength = CompressionParameters.DEFAULT_CHUNK_LENGTH;
-            if (compressionOptions.containsKey(CompressionParameters.CHUNK_LENGTH_KB))
-                chunkLength = CompressionParameters.parseChunkLength(compressionOptions.get(CompressionParameters.CHUNK_LENGTH_KB));
-
-            Map<String, String> remainingOptions = new HashMap<>(compressionOptions);
-            remainingOptions.remove(CompressionParameters.SSTABLE_COMPRESSION);
-            remainingOptions.remove(CompressionParameters.CHUNK_LENGTH_KB);
-            CompressionParameters cp = new CompressionParameters(sstableCompressionClass, chunkLength, remainingOptions);
-            cp.validate();
-        }
-
-        validateMinimumInt(KW_DEFAULT_TIME_TO_LIVE, 0, CFMetaData.DEFAULT_DEFAULT_TIME_TO_LIVE);
-        Integer defaultTimeToLive = getInt(KW_DEFAULT_TIME_TO_LIVE, 0);
-        if (defaultTimeToLive > ExpiringCell.MAX_TTL)
-            throw new ConfigurationException(String.format("%s must be less than or equal to %d (got %s)",
-                                                           KW_DEFAULT_TIME_TO_LIVE,
-                                                           ExpiringCell.MAX_TTL,
-                                                           defaultTimeToLive));
-
-        Integer minIndexInterval = getInt(KW_MIN_INDEX_INTERVAL, null);
-        Integer maxIndexInterval = getInt(KW_MAX_INDEX_INTERVAL, null);
-        if (minIndexInterval != null && minIndexInterval < 1)
-            throw new ConfigurationException(KW_MIN_INDEX_INTERVAL + " must be greater than 0");
-        if (maxIndexInterval != null && minIndexInterval != null && maxIndexInterval < minIndexInterval)
-            throw new ConfigurationException(KW_MAX_INDEX_INTERVAL + " must be greater than " + KW_MIN_INDEX_INTERVAL);
-
-        SpeculativeRetry.fromString(getString(KW_SPECULATIVE_RETRY, SpeculativeRetry.RetryType.NONE.name()));
-    }
-
-    public Class<? extends AbstractCompactionStrategy> getCompactionStrategy()
-    {
-        return compactionStrategyClass;
-    }
-
-    public Map<String, String> getCompactionOptions() throws SyntaxException
-    {
-        Map<String, String> compactionOptions = getMap(KW_COMPACTION);
-        if (compactionOptions == null)
-            return Collections.emptyMap();
-        return compactionOptions;
-    }
-
-    public Map<String, String> getCompressionOptions() throws SyntaxException
-    {
-        Map<String, String> compressionOptions = getMap(KW_COMPRESSION);
-        if (compressionOptions == null)
-            return Collections.emptyMap();
-        return compressionOptions;
-    }
-    public CachingOptions getCachingOptions() throws SyntaxException, ConfigurationException
-    {
-        CachingOptions options = null;
-        Object val = properties.get(KW_CACHING);
-        if (val == null)
-            return null;
-        else if (val instanceof Map)
-            options = CachingOptions.fromMap(getMap(KW_CACHING));
-        else if (val instanceof String) // legacy syntax
-        {
-            options = CachingOptions.fromString(getSimple(KW_CACHING));
-            logger.warn("Setting caching options with deprecated syntax.");
-        }
-        return options;
-    }
-
-    public Integer getDefaultTimeToLive() throws SyntaxException
-    {
-        return getInt(KW_DEFAULT_TIME_TO_LIVE, 0);
-    }
-
-    public UUID getId() throws SyntaxException
-    {
-        String id = getSimple(KW_ID);
-        return id != null ? UUID.fromString(id) : null;
-    }
-
-    public void applyToCFMetadata(CFMetaData cfm) throws ConfigurationException, SyntaxException
-    {
-        if (hasProperty(KW_COMMENT))
-            cfm.comment(getString(KW_COMMENT, ""));
-
-        cfm.readRepairChance(getDouble(KW_READREPAIRCHANCE, cfm.getReadRepairChance()));
-        cfm.dcLocalReadRepairChance(getDouble(KW_DCLOCALREADREPAIRCHANCE, cfm.getDcLocalReadRepairChance()));
-        cfm.gcGraceSeconds(getInt(KW_GCGRACESECONDS, cfm.getGcGraceSeconds()));
-        int minCompactionThreshold = toInt(KW_MINCOMPACTIONTHRESHOLD, getCompactionOptions().get(KW_MINCOMPACTIONTHRESHOLD), cfm.getMinCompactionThreshold());
-        int maxCompactionThreshold = toInt(KW_MAXCOMPACTIONTHRESHOLD, getCompactionOptions().get(KW_MAXCOMPACTIONTHRESHOLD), cfm.getMaxCompactionThreshold());
-        if (minCompactionThreshold <= 0 || maxCompactionThreshold <= 0)
-            throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been deprecated, set the compaction option 'enabled' to false instead.");
-        cfm.minCompactionThreshold(minCompactionThreshold);
-        cfm.maxCompactionThreshold(maxCompactionThreshold);
-        cfm.defaultTimeToLive(getInt(KW_DEFAULT_TIME_TO_LIVE, cfm.getDefaultTimeToLive()));
-        cfm.speculativeRetry(CFMetaData.SpeculativeRetry.fromString(getString(KW_SPECULATIVE_RETRY, cfm.getSpeculativeRetry().toString())));
-        cfm.memtableFlushPeriod(getInt(KW_MEMTABLE_FLUSH_PERIOD, cfm.getMemtableFlushPeriod()));
-        cfm.minIndexInterval(getInt(KW_MIN_INDEX_INTERVAL, cfm.getMinIndexInterval()));
-        cfm.maxIndexInterval(getInt(KW_MAX_INDEX_INTERVAL, cfm.getMaxIndexInterval()));
-
-        if (compactionStrategyClass != null)
-        {
-            cfm.compactionStrategyClass(compactionStrategyClass);
-            cfm.compactionStrategyOptions(new HashMap<>(getCompactionOptions()));
-        }
-
-        double bloomFilterFpChance = getDouble(KW_BF_FP_CHANCE, cfm.getBloomFilterFpChance());
-        double minBloomFilterFpChanceValue = BloomCalculations.minSupportedBloomFilterFpChance();
-        if (bloomFilterFpChance <=  minBloomFilterFpChanceValue || bloomFilterFpChance > 1)
-        {
-            throw new ConfigurationException(String.format(
-                    "%s must be larger than %s and less than or equal to 1.0 (got %s)",
-                    KW_BF_FP_CHANCE,
-                    minBloomFilterFpChanceValue,
-                    bloomFilterFpChance));
-        }
-        cfm.bloomFilterFpChance(bloomFilterFpChance);
-
-        if (!getCompressionOptions().isEmpty())
-            cfm.compressionParameters(CompressionParameters.create(getCompressionOptions()));
-        CachingOptions cachingOptions = getCachingOptions();
-        if (cachingOptions != null)
-            cfm.caching(cachingOptions);
-    }
-
-    @Override
-    public String toString()
-    {
-        return String.format("CFPropDefs(%s)", properties.toString());
-    }
-
-    private void validateMinimumInt(String field, int minimumValue, int defaultValue) throws SyntaxException, ConfigurationException
-    {
-        Integer val = getInt(field, null);
-        if (val != null && val < minimumValue)
-            throw new ConfigurationException(String.format("%s cannot be smaller than %s, (default %s)",
-                                                            field, minimumValue, defaultValue));
-
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/statements/CFProperties.java b/src/java/org/apache/cassandra/cql3/statements/CFProperties.java
new file mode 100644
index 0000000..92dd994
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/CFProperties.java

@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ReversedType;
+
+public class CFProperties
+{
+    public final TableAttributes properties = new TableAttributes();
+    final Map<ColumnIdentifier, Boolean> definedOrdering = new LinkedHashMap<>(); // Insertion ordering is important
+    boolean useCompactStorage = false;
+
+    public void validate()
+    {
+        properties.validate();
+    }
+
+    public void setOrdering(ColumnIdentifier alias, boolean reversed)
+    {
+        definedOrdering.put(alias, reversed);
+    }
+
+    public void setCompactStorage()
+    {
+        useCompactStorage = true;
+    }
+
+    public AbstractType getReversableType(ColumnIdentifier targetIdentifier, AbstractType<?> type)
+    {
+        if (!definedOrdering.containsKey(targetIdentifier))
+        {
+            return type;
+        }
+        return definedOrdering.get(targetIdentifier) ? ReversedType.getInstance(type) : type;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
index 081a14e..e14ae6c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java

@@ -20,13 +20,16 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-import org.apache.cassandra.cql3.*;
+import com.google.common.collect.*;
+
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.Composite;
 import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.service.CASRequest;
 import org.apache.cassandra.utils.Pair;
@@ -36,62 +39,101 @@
  */
 public class CQL3CasRequest implements CASRequest
 {
-    final CFMetaData cfm;
-    final ByteBuffer key;
-    final long now;
-    final boolean isBatch;
+    public final CFMetaData cfm;
+    public final DecoratedKey key;
+    public final boolean isBatch;
+    private final PartitionColumns conditionColumns;
+    private final boolean updatesRegularRows;
+    private final boolean updatesStaticRow;
+    private boolean hasExists; // whether we have an exist or if not exist condition
 
-    // We index RowCondition by the prefix of the row they applied to for 2 reasons:
-    //   1) this allows to keep things sorted to build the ColumnSlice array below
+    // Conditions on the static row. We keep it separate from 'conditions' as most things related to the static row are
+    // special cases anyway.
+    private RowCondition staticConditions;
+    // We index RowCondition by the clustering of the row they applied to for 2 reasons:
+    //   1) this allows to keep things sorted to build the read command below
     //   2) this allows to detect when contradictory conditions are set (not exists with some other conditions on the same row)
-    private final SortedMap<Composite, RowCondition> conditions;
+    private final TreeMap<Clustering, RowCondition> conditions;
 
     private final List<RowUpdate> updates = new ArrayList<>();
+    private final List<RangeDeletion> rangeDeletions = new ArrayList<>();
 
-    public CQL3CasRequest(CFMetaData cfm, ByteBuffer key, boolean isBatch)
+    public CQL3CasRequest(CFMetaData cfm,
+                          DecoratedKey key,
+                          boolean isBatch,
+                          PartitionColumns conditionColumns,
+                          boolean updatesRegularRows,
+                          boolean updatesStaticRow)
     {
         this.cfm = cfm;
-        // When checking if conditions apply, we want to use a fixed reference time for a whole request to check
-        // for expired cells. Note that this is unrelated to the cell timestamp.
-        this.now = System.currentTimeMillis();
         this.key = key;
         this.conditions = new TreeMap<>(cfm.comparator);
         this.isBatch = isBatch;
+        this.conditionColumns = conditionColumns;
+        this.updatesRegularRows = updatesRegularRows;
+        this.updatesStaticRow = updatesStaticRow;
     }
 
-    public void addRowUpdate(Composite prefix, ModificationStatement stmt, QueryOptions options, long timestamp)
+    public void addRowUpdate(Clustering clustering, ModificationStatement stmt, QueryOptions options, long timestamp)
     {
-        updates.add(new RowUpdate(prefix, stmt, options, timestamp));
+        updates.add(new RowUpdate(clustering, stmt, options, timestamp));
     }
 
-    public void addNotExist(Composite prefix) throws InvalidRequestException
+    public void addRangeDeletion(Slice slice, ModificationStatement stmt, QueryOptions options, long timestamp)
     {
-        RowCondition previous = conditions.put(prefix, new NotExistCondition(prefix, now));
-        if (previous != null && !(previous instanceof NotExistCondition))
+        rangeDeletions.add(new RangeDeletion(slice, stmt, options, timestamp));
+    }
+
+    public void addNotExist(Clustering clustering) throws InvalidRequestException
+    {
+        addExistsCondition(clustering, new NotExistCondition(clustering), true);
+    }
+
+    public void addExist(Clustering clustering) throws InvalidRequestException
+    {
+        addExistsCondition(clustering, new ExistCondition(clustering), false);
+    }
+
+    private void addExistsCondition(Clustering clustering, RowCondition condition, boolean isNotExist)
+    {
+        assert condition instanceof ExistCondition || condition instanceof NotExistCondition;
+        RowCondition previous = getConditionsForRow(clustering);
+        if (previous != null)
         {
-            // these should be prevented by the parser, but it doesn't hurt to check
-            if (previous instanceof ExistCondition)
-                throw new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row");
+            if (previous.getClass().equals(condition.getClass()))
+            {
+                // We can get here if a BATCH has 2 different statements on the same row with the same "exist" condition.
+                // For instance (assuming 'k' is the full PK):
+                //   BEGIN BATCH
+                //      INSERT INTO t(k, v1) VALUES (0, 'foo') IF NOT EXISTS;
+                //      INSERT INTO t(k, v2) VALUES (0, 'bar') IF NOT EXISTS;
+                //   APPLY BATCH;
+                // Of course, those can be trivially rewritten by the user as a single INSERT statement, but we still don't
+                // want this to be a problem (see #12867 in particular), so we simply return (the condition itself has
+                // already be set).
+                assert hasExists; // We shouldn't have a previous condition unless hasExists has been set already.
+                return;
+            }
             else
-                throw new InvalidRequestException("Cannot mix IF conditions and IF NOT EXISTS for the same row");
+            {
+                // these should be prevented by the parser, but it doesn't hurt to check
+                throw (previous instanceof NotExistCondition || previous instanceof ExistCondition)
+                    ? new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row")
+                    : new InvalidRequestException("Cannot mix IF conditions and IF " + (isNotExist ? "NOT " : "") + "EXISTS for the same row");
+            }
         }
+
+        setConditionsForRow(clustering, condition);
+        hasExists = true;
     }
 
-    public void addExist(Composite prefix) throws InvalidRequestException
+    public void addConditions(Clustering clustering, Collection<ColumnCondition> conds, QueryOptions options) throws InvalidRequestException
     {
-        RowCondition previous = conditions.put(prefix, new ExistCondition(prefix, now));
-        // this should be prevented by the parser, but it doesn't hurt to check
-        if (previous instanceof NotExistCondition)
-            throw new InvalidRequestException("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row");
-    }
-
-    public void addConditions(Composite prefix, Collection<ColumnCondition> conds, QueryOptions options) throws InvalidRequestException
-    {
-        RowCondition condition = conditions.get(prefix);
+        RowCondition condition = getConditionsForRow(clustering);
         if (condition == null)
         {
-            condition = new ColumnsConditions(prefix, now);
-            conditions.put(prefix, condition);
+            condition = new ColumnsConditions(clustering);
+            setConditionsForRow(clustering, condition);
         }
         else if (!(condition instanceof ColumnsConditions))
         {
@@ -100,25 +142,75 @@
         ((ColumnsConditions)condition).addConditions(conds, options);
     }
 
-    public IDiskAtomFilter readFilter()
+    private RowCondition getConditionsForRow(Clustering clustering)
     {
-        assert !conditions.isEmpty();
-        ColumnSlice[] slices = new ColumnSlice[conditions.size()];
-        int i = 0;
-        // We always read CQL rows entirely as on CAS failure we want to be able to distinguish between "row exists
-        // but all values for which there were conditions are null" and "row doesn't exists", and we can't rely on the
-        // row marker for that (see #6623)
-        for (Composite prefix : conditions.keySet())
-            slices[i++] = prefix.slice();
-
-        int toGroup = cfm.comparator.isDense() ? -1 : cfm.clusteringColumns().size();
-        slices = ColumnSlice.deoverlapSlices(slices, cfm.comparator);
-        assert ColumnSlice.validateSlices(slices, cfm.comparator, false);
-        return new SliceQueryFilter(slices, false, slices.length, toGroup);
+        return clustering == Clustering.STATIC_CLUSTERING ? staticConditions : conditions.get(clustering);
     }
 
-    public boolean appliesTo(ColumnFamily current) throws InvalidRequestException
+    private void setConditionsForRow(Clustering clustering, RowCondition condition)
     {
+        if (clustering == Clustering.STATIC_CLUSTERING)
+        {
+            assert staticConditions == null;
+            staticConditions = condition;
+        }
+        else
+        {
+            RowCondition previous = conditions.put(clustering, condition);
+            assert previous == null;
+        }
+    }
+
+    private PartitionColumns columnsToRead()
+    {
+        PartitionColumns allColumns = cfm.partitionColumns();
+
+        // If we update static row, we won't have any conditions on regular rows.
+        // If we update regular row, we have to fetch all regular rows (which would satisfy column condition) and
+        // static rows that take part in column condition.
+        // In both cases, we're fetching enough rows to distinguish between "all conditions are nulls" and "row does not exist".
+        // We have to do this as we can't rely on row marker for that (see #6623)
+        Columns statics = updatesStaticRow ? allColumns.statics : conditionColumns.statics;
+        Columns regulars = updatesRegularRows ? allColumns.regulars : conditionColumns.regulars;
+        return new PartitionColumns(statics, regulars);
+    }
+
+    public SinglePartitionReadCommand readCommand(int nowInSec)
+    {
+        assert staticConditions != null || !conditions.isEmpty();
+
+        // Fetch all columns, but query only the selected ones
+        ColumnFilter columnFilter = ColumnFilter.selection(columnsToRead());
+
+        // With only a static condition, we still want to make the distinction between a non-existing partition and one
+        // that exists (has some live data) but has not static content. So we query the first live row of the partition.
+        if (conditions.isEmpty())
+            return SinglePartitionReadCommand.create(cfm,
+                                                     nowInSec,
+                                                     columnFilter,
+                                                     RowFilter.NONE,
+                                                     DataLimits.cqlLimits(1),
+                                                     key,
+                                                     new ClusteringIndexSliceFilter(Slices.ALL, false));
+
+        ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(conditions.navigableKeySet(), false);
+        return SinglePartitionReadCommand.create(cfm, nowInSec, key, columnFilter, filter);
+    }
+
+    /**
+     * Checks whether the conditions represented by this object applies provided the current state of the partition on
+     * which those conditions are.
+     *
+     * @param current the partition with current data corresponding to these conditions. More precisely, this must be
+     * the result of executing the command returned by {@link #readCommand}. This can be empty but it should not be
+     * {@code null}.
+     * @return whether the conditions represented by this object applies or not.
+     */
+    public boolean appliesTo(FilteredPartition current) throws InvalidRequestException
+    {
+        if (staticConditions != null && !staticConditions.appliesTo(current))
+            return false;
+
         for (RowCondition condition : conditions.values())
         {
             if (!condition.appliesTo(current))
@@ -127,16 +219,28 @@
         return true;
     }
 
-    public ColumnFamily makeUpdates(ColumnFamily current) throws InvalidRequestException
+    private PartitionColumns updatedColumns()
     {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfm);
+        PartitionColumns.Builder builder = PartitionColumns.builder();
         for (RowUpdate upd : updates)
-            upd.applyUpdates(current, cf);
+            builder.addAll(upd.stmt.updatedColumns());
+        return builder.build();
+    }
+
+    public PartitionUpdate makeUpdates(FilteredPartition current) throws InvalidRequestException
+    {
+        PartitionUpdate update = new PartitionUpdate(cfm, key, updatedColumns(), conditions.size());
+        for (RowUpdate upd : updates)
+            upd.applyUpdates(current, update);
+        for (RangeDeletion upd : rangeDeletions)
+            upd.applyUpdates(current, update);
+
+        Keyspace.openAndGetStore(cfm).indexManager.validate(update);
 
         if (isBatch)
-            BatchStatement.verifyBatchSize(Collections.singleton(cf));
+            BatchStatement.verifyBatchSize(Collections.singleton(update));
 
-        return cf;
+        return update;
     }
 
     /**
@@ -147,89 +251,85 @@
      */
     private class RowUpdate
     {
-        private final Composite rowPrefix;
+        private final Clustering clustering;
         private final ModificationStatement stmt;
         private final QueryOptions options;
         private final long timestamp;
 
-        private RowUpdate(Composite rowPrefix, ModificationStatement stmt, QueryOptions options, long timestamp)
+        private RowUpdate(Clustering clustering, ModificationStatement stmt, QueryOptions options, long timestamp)
         {
-            this.rowPrefix = rowPrefix;
+            this.clustering = clustering;
             this.stmt = stmt;
             this.options = options;
             this.timestamp = timestamp;
         }
 
-        public void applyUpdates(ColumnFamily current, ColumnFamily updates) throws InvalidRequestException
+        public void applyUpdates(FilteredPartition current, PartitionUpdate updates) throws InvalidRequestException
         {
-            Map<ByteBuffer, CQL3Row> map = null;
-            if (stmt.requiresRead())
-            {
-                // Uses the "current" values read by Paxos for lists operation that requires a read
-                Iterator<CQL3Row> iter = cfm.comparator.CQL3RowBuilder(cfm, now).group(current.iterator(new ColumnSlice[]{ rowPrefix.slice() }));
-                if (iter.hasNext())
-                {
-                    map = Collections.singletonMap(key, iter.next());
-                    assert !iter.hasNext() : "We shoudn't be updating more than one CQL row per-ModificationStatement";
-                }
-            }
+            Map<DecoratedKey, Partition> map = stmt.requiresRead() ? Collections.<DecoratedKey, Partition>singletonMap(key, current) : null;
+            UpdateParameters params = new UpdateParameters(cfm, updates.columns(), options, timestamp, stmt.getTimeToLive(options), map);
+            stmt.addUpdateForKey(updates, clustering, params);
+        }
+    }
 
-            UpdateParameters params = new UpdateParameters(cfm, options, timestamp, stmt.getTimeToLive(options), map);
-            stmt.addUpdateForKey(updates, key, rowPrefix, params);
+    private class RangeDeletion
+    {
+        private final Slice slice;
+        private final ModificationStatement stmt;
+        private final QueryOptions options;
+        private final long timestamp;
+
+        private RangeDeletion(Slice slice, ModificationStatement stmt, QueryOptions options, long timestamp)
+        {
+            this.slice = slice;
+            this.stmt = stmt;
+            this.options = options;
+            this.timestamp = timestamp;
+        }
+
+        public void applyUpdates(FilteredPartition current, PartitionUpdate updates) throws InvalidRequestException
+        {
+            Map<DecoratedKey, Partition> map = stmt.requiresRead() ? Collections.<DecoratedKey, Partition>singletonMap(key, current) : null;
+            UpdateParameters params = new UpdateParameters(cfm, updates.columns(), options, timestamp, stmt.getTimeToLive(options), map);
+            stmt.addUpdateForKey(updates, slice, params);
         }
     }
 
     private static abstract class RowCondition
     {
-        public final Composite rowPrefix;
-        protected final long now;
+        public final Clustering clustering;
 
-        protected RowCondition(Composite rowPrefix, long now)
+        protected RowCondition(Clustering clustering)
         {
-            this.rowPrefix = rowPrefix;
-            this.now = now;
+            this.clustering = clustering;
         }
 
-        public abstract boolean appliesTo(ColumnFamily current) throws InvalidRequestException;
+        public abstract boolean appliesTo(FilteredPartition current) throws InvalidRequestException;
     }
 
     private static class NotExistCondition extends RowCondition
     {
-        private NotExistCondition(Composite rowPrefix, long now)
+        private NotExistCondition(Clustering clustering)
         {
-            super(rowPrefix, now);
+            super(clustering);
         }
 
-        public boolean appliesTo(ColumnFamily current)
+        public boolean appliesTo(FilteredPartition current)
         {
-            if (current == null)
-                return true;
-
-            Iterator<Cell> iter = current.iterator(new ColumnSlice[]{ rowPrefix.slice() });
-            while (iter.hasNext())
-                if (iter.next().isLive(now))
-                    return false;
-            return true;
+            return current.getRow(clustering) == null;
         }
     }
 
     private static class ExistCondition extends RowCondition
     {
-        private ExistCondition(Composite rowPrefix, long now)
+        private ExistCondition(Clustering clustering)
         {
-            super (rowPrefix, now);
+            super(clustering);
         }
 
-        public boolean appliesTo(ColumnFamily current)
+        public boolean appliesTo(FilteredPartition current)
         {
-            if (current == null)
-                return false;
-
-            Iterator<Cell> iter = current.iterator(new ColumnSlice[]{ rowPrefix.slice() });
-            while (iter.hasNext())
-                if (iter.next().isLive(now))
-                    return true;
-            return false;
+            return current.getRow(clustering) != null;
         }
     }
 
@@ -237,9 +337,9 @@
     {
         private final Multimap<Pair<ColumnIdentifier, ByteBuffer>, ColumnCondition.Bound> conditions = HashMultimap.create();
 
-        private ColumnsConditions(Composite rowPrefix, long now)
+        private ColumnsConditions(Clustering clustering)
         {
-            super(rowPrefix, now);
+            super(clustering);
         }
 
         public void addConditions(Collection<ColumnCondition> conds, QueryOptions options) throws InvalidRequestException
@@ -251,14 +351,14 @@
             }
         }
 
-        public boolean appliesTo(ColumnFamily current) throws InvalidRequestException
+        public boolean appliesTo(FilteredPartition current) throws InvalidRequestException
         {
-            if (current == null)
-                return conditions.isEmpty();
-
+            Row row = current.getRow(clustering);
             for (ColumnCondition.Bound condition : conditions.values())
-                if (!condition.appliesTo(rowPrefix, current, now))
+            {
+                if (!condition.appliesTo(row))
                     return false;
+            }
             return true;
         }
     }

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateAggregateStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateAggregateStatement.java
index 5ee7e33..9d91693 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateAggregateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateAggregateStatement.java

@@ -20,6 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Objects;
 import java.util.List;
 
 import org.apache.cassandra.auth.*;
@@ -29,11 +30,13 @@
 import org.apache.cassandra.cql3.functions.*;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
+import org.apache.cassandra.transport.Server;
 
 /**
  * A {@code CREATE AGGREGATE} statement parsed from a CQL query.
@@ -50,9 +53,6 @@
     private final List<CQL3Type.Raw> argRawTypes;
     private final Term.Raw ival;
 
-    private UDAggregate udAggregate;
-    private boolean replaced;
-
     private List<AbstractType<?>> argTypes;
     private AbstractType<?> returnType;
     private ScalarFunction stateFunction;
@@ -78,7 +78,7 @@
         this.ifNotExists = ifNotExists;
     }
 
-    public Prepared prepare()
+    public Prepared prepare(ClientState clientState)
     {
         argTypes = new ArrayList<>(argRawTypes.size());
         for (CQL3Type.Raw rawType : argRawTypes)
@@ -88,7 +88,7 @@
 
         List<AbstractType<?>> stateArgs = stateArguments(stateType, argTypes);
 
-        Function f = Functions.find(stateFunc, stateArgs);
+        Function f = Schema.instance.findFunction(stateFunc, stateArgs).orElse(null);
         if (!(f instanceof ScalarFunction))
             throw new InvalidRequestException("State function " + stateFuncSig(stateFunc, stateTypeRaw, argRawTypes) + " does not exist or is not a scalar function");
         stateFunction = (ScalarFunction)f;
@@ -100,7 +100,7 @@
         if (finalFunc != null)
         {
             List<AbstractType<?>> finalArgs = Collections.<AbstractType<?>>singletonList(stateType);
-            f = Functions.find(finalFunc, finalArgs);
+            f = Schema.instance.findFunction(finalFunc, finalArgs).orElse(null);
             if (!(f instanceof ScalarFunction))
                 throw new InvalidRequestException("Final function " + finalFunc + '(' + stateTypeRaw + ") does not exist or is not a scalar function");
             finalFunction = (ScalarFunction) f;
@@ -113,13 +113,30 @@
 
         if (ival != null)
         {
-            ColumnSpecification receiver = new ColumnSpecification(functionName.keyspace, "--dummy--", new ColumnIdentifier("(aggregate_initcond)", true), stateType);
-            initcond = ival.prepare(functionName.keyspace, receiver).bindAndGet(QueryOptions.DEFAULT);
+            initcond = Terms.asBytes(functionName.keyspace, ival.toString(), stateType);
+
+            if (initcond != null)
+            {
+                try
+                {
+                    stateType.validate(initcond);
+                }
+                catch (MarshalException e)
+                {
+                    throw new InvalidRequestException(String.format("Invalid value for INITCOND of type %s%s", stateType.asCQL3Type(),
+                                                                    e.getMessage() == null ? "" : String.format(" (%s)", e.getMessage())));
+                }
+            }
+
+            // Sanity check that converts the initcond to a CQL literal and parse it back to avoid getting in CASSANDRA-11064.
+            String initcondAsCql = stateType.asCQL3Type().toCQLLiteral(initcond, Server.CURRENT_VERSION);
+            assert Objects.equals(initcond, Terms.asBytes(functionName.keyspace, initcondAsCql, stateType));
+
             if (Constants.NULL_LITERAL != ival && UDHelper.isNullOrEmpty(stateType, initcond))
                 throw new InvalidRequestException("INITCOND must not be empty for all types except TEXT, ASCII, BLOB");
         }
 
-        return super.prepare();
+        return super.prepare(clientState);
     }
 
     private AbstractType<?> prepareType(String typeName, CQL3Type.Raw rawType)
@@ -169,19 +186,17 @@
 
     public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
     {
-        if (Functions.find(functionName, argTypes) != null && orReplace)
+        if (Schema.instance.findFunction(functionName, argTypes).isPresent() && orReplace)
             state.ensureHasPermission(Permission.ALTER, FunctionResource.function(functionName.keyspace,
                                                                                   functionName.name,
                                                                                   argTypes));
         else
             state.ensureHasPermission(Permission.CREATE, FunctionResource.keyspace(functionName.keyspace));
 
-        for (Function referencedFunction : stateFunction.getFunctions())
-            state.ensureHasPermission(Permission.EXECUTE, referencedFunction);
+        state.ensureHasPermission(Permission.EXECUTE, stateFunction);
 
         if (finalFunction != null)
-            for (Function referencedFunction : finalFunction.getFunctions())
-                state.ensureHasPermission(Permission.EXECUTE, referencedFunction);
+            state.ensureHasPermission(Permission.EXECUTE, finalFunction);
     }
 
     public void validate(ClientState state) throws InvalidRequestException
@@ -193,20 +208,14 @@
             throw new InvalidRequestException(String.format("Cannot add aggregate '%s' to non existing keyspace '%s'.", functionName.name, functionName.keyspace));
     }
 
-    public Event.SchemaChange changeEvent()
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
-        return new Event.SchemaChange(replaced ? Event.SchemaChange.Change.UPDATED : Event.SchemaChange.Change.CREATED,
-                                      Event.SchemaChange.Target.AGGREGATE,
-                                      udAggregate.name().keyspace, udAggregate.name().name, AbstractType.asCQLTypeStringList(udAggregate.argTypes()));
-    }
-
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
-    {
-        Function old = Functions.find(functionName, argTypes);
-        if (old != null)
+        Function old = Schema.instance.findFunction(functionName, argTypes).orElse(null);
+        boolean replaced = old != null;
+        if (replaced)
         {
             if (ifNotExists)
-                return false;
+                return null;
             if (!orReplace)
                 throw new InvalidRequestException(String.format("Function %s already exists", old));
             if (!(old instanceof AggregateFunction))
@@ -224,15 +233,13 @@
         if (!stateFunction.isCalledOnNullInput() && initcond == null)
             throw new InvalidRequestException(String.format("Cannot create aggregate %s without INITCOND because state function %s does not accept 'null' arguments", functionName, stateFunc));
 
-        udAggregate = new UDAggregate(functionName, argTypes, returnType,
-                                                  stateFunction,
-                                                  finalFunction,
-                                                  initcond);
-        replaced = old != null;
+        UDAggregate udAggregate = new UDAggregate(functionName, argTypes, returnType, stateFunction, finalFunction, initcond);
 
         MigrationManager.announceNewAggregate(udAggregate, isLocalOnly);
 
-        return true;
+        return new Event.SchemaChange(replaced ? Event.SchemaChange.Change.UPDATED : Event.SchemaChange.Change.CREATED,
+                                      Event.SchemaChange.Target.AGGREGATE,
+                                      udAggregate.name().keyspace, udAggregate.name().name, AbstractType.asCQLTypeStringList(udAggregate.argTypes()));
     }
 
     private static String stateFuncSig(FunctionName stateFuncName, CQL3Type.Raw stateTypeRaw, List<CQL3Type.Raw> argRawTypes)

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateFunctionStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateFunctionStatement.java
index 77e41ed..dfe522b 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateFunctionStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateFunctionStatement.java

@@ -29,6 +29,7 @@
 import org.apache.cassandra.cql3.functions.*;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.Functions;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.QueryState;
@@ -53,8 +54,6 @@
 
     private List<AbstractType<?>> argTypes;
     private AbstractType<?> returnType;
-    private UDFunction udFunction;
-    private boolean replaced;
 
     public CreateFunctionStatement(FunctionName functionName,
                                    String language,
@@ -77,7 +76,7 @@
         this.ifNotExists = ifNotExists;
     }
 
-    public Prepared prepare() throws InvalidRequestException
+    public Prepared prepare(ClientState clientState) throws InvalidRequestException
     {
         if (new HashSet<>(argNames).size() != argNames.size())
             throw new InvalidRequestException(String.format("duplicate argument names for given function %s with argument names %s",
@@ -88,7 +87,7 @@
             argTypes.add(prepareType("arguments", rawType));
 
         returnType = prepareType("return type", rawReturnType);
-        return super.prepare();
+        return super.prepare(clientState);
     }
 
     public void prepareKeyspace(ClientState state) throws InvalidRequestException
@@ -120,7 +119,7 @@
 
     public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
     {
-        if (Functions.find(functionName, argTypes) != null && orReplace)
+        if (Schema.instance.findFunction(functionName, argTypes).isPresent() && orReplace)
             state.ensureHasPermission(Permission.ALTER, FunctionResource.function(functionName.keyspace,
                                                                                   functionName.name,
                                                                                   argTypes));
@@ -130,8 +129,7 @@
 
     public void validate(ClientState state) throws InvalidRequestException
     {
-        if (!DatabaseDescriptor.enableUserDefinedFunctions())
-            throw new InvalidRequestException("User-defined-functions are disabled in cassandra.yaml - set enable_user_defined_functions=true to enable if you are aware of the security risks");
+        UDFunction.assertUdfsEnabled(language);
 
         if (ifNotExists && orReplace)
             throw new InvalidRequestException("Cannot use both 'OR REPLACE' and 'IF NOT EXISTS' directives");
@@ -140,20 +138,14 @@
             throw new InvalidRequestException(String.format("Cannot add function '%s' to non existing keyspace '%s'.", functionName.name, functionName.keyspace));
     }
 
-    public Event.SchemaChange changeEvent()
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
-        return new Event.SchemaChange(replaced ? Event.SchemaChange.Change.UPDATED : Event.SchemaChange.Change.CREATED,
-                                      Event.SchemaChange.Target.FUNCTION,
-                                      udFunction.name().keyspace, udFunction.name().name, AbstractType.asCQLTypeStringList(udFunction.argTypes()));
-    }
-
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
-    {
-        Function old = Functions.find(functionName, argTypes);
-        if (old != null)
+        Function old = Schema.instance.findFunction(functionName, argTypes).orElse(null);
+        boolean replaced = old != null;
+        if (replaced)
         {
             if (ifNotExists)
-                return false;
+                return null;
             if (!orReplace)
                 throw new InvalidRequestException(String.format("Function %s already exists", old));
             if (!(old instanceof ScalarFunction))
@@ -162,20 +154,18 @@
                 throw new InvalidRequestException(String.format("Function %s can only be replaced with %s", old,
                                                                 calledOnNullInput ? "CALLED ON NULL INPUT" : "RETURNS NULL ON NULL INPUT"));
 
-            if (!Functions.typeEquals(old.returnType(), returnType))
+            if (!Functions.typesMatch(old.returnType(), returnType))
                 throw new InvalidRequestException(String.format("Cannot replace function %s, the new return type %s is not compatible with the return type %s of existing function",
                                                                 functionName, returnType.asCQL3Type(), old.returnType().asCQL3Type()));
         }
 
-        this.udFunction = UDFunction.create(functionName, argNames, argTypes, returnType, calledOnNullInput, language, body);
-        this.replaced = old != null;
-
-        // add function to registry to prevent duplicate compilation on coordinator during migration
-        Functions.addOrReplaceFunction(udFunction);
+        UDFunction udFunction = UDFunction.create(functionName, argNames, argTypes, returnType, calledOnNullInput, language, body);
 
         MigrationManager.announceNewFunction(udFunction, isLocalOnly);
 
-        return true;
+        return new Event.SchemaChange(replaced ? Event.SchemaChange.Change.UPDATED : Event.SchemaChange.Change.CREATED,
+                                      Event.SchemaChange.Target.FUNCTION,
+                                      udFunction.name().keyspace, udFunction.name().name, AbstractType.asCQLTypeStringList(udFunction.argTypes()));
     }
 
     private AbstractType<?> prepareType(String typeName, CQL3Type.Raw rawType)

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java
index d93c0a7..e0b9b02 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateIndexStatement.java

@@ -17,23 +17,30 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.util.Collections;
-import java.util.Map;
+import java.util.*;
 
-import com.google.common.collect.ImmutableMap;
+import com.google.common.base.Optional;
+import com.google.common.base.Strings;
+import com.google.common.collect.Iterables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.IndexType;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.CFName;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.IndexName;
 import org.apache.cassandra.db.marshal.MapType;
-import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Indexes;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 
@@ -43,19 +50,19 @@
     private static final Logger logger = LoggerFactory.getLogger(CreateIndexStatement.class);
 
     private final String indexName;
-    private final IndexTarget.Raw rawTarget;
+    private final List<IndexTarget.Raw> rawTargets;
     private final IndexPropDefs properties;
     private final boolean ifNotExists;
 
     public CreateIndexStatement(CFName name,
                                 IndexName indexName,
-                                IndexTarget.Raw target,
+                                List<IndexTarget.Raw> targets,
                                 IndexPropDefs properties,
                                 boolean ifNotExists)
     {
         super(name);
         this.indexName = indexName.getIdx();
-        this.rawTarget = target;
+        this.rawTargets = targets;
         this.properties = properties;
         this.ifNotExists = ifNotExists;
     }
@@ -68,91 +75,107 @@
     public void validate(ClientState state) throws RequestValidationException
     {
         CFMetaData cfm = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
+
         if (cfm.isCounter())
             throw new InvalidRequestException("Secondary indexes are not supported on counter tables");
 
-        IndexTarget target = rawTarget.prepare(cfm);
-        ColumnDefinition cd = cfm.getColumnDefinition(target.column);
+        if (cfm.isView())
+            throw new InvalidRequestException("Secondary indexes are not supported on materialized views");
 
-        if (cd == null)
-            throw new InvalidRequestException("No column definition found for column " + target.column);
+        if (cfm.isCompactTable() && !cfm.isStaticCompactTable())
+            throw new InvalidRequestException("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
 
-        boolean isMap = cd.type instanceof MapType;
-        boolean isFrozenCollection = cd.type.isCollection() && !cd.type.isMultiCell();
+        List<IndexTarget> targets = new ArrayList<>(rawTargets.size());
+        for (IndexTarget.Raw rawTarget : rawTargets)
+            targets.add(rawTarget.prepare(cfm));
 
-        if (isFrozenCollection)
+        if (targets.isEmpty() && !properties.isCustom)
+            throw new InvalidRequestException("Only CUSTOM indexes can be created without specifying a target column");
+
+        if (targets.size() > 1)
+            validateTargetsForMultiColumnIndex(targets);
+
+        for (IndexTarget target : targets)
         {
-            validateForFrozenCollection(target);
-        }
-        else
-        {
-            validateNotFullIndex(target);
-            validateIsValuesIndexIfTargetColumnNotCollection(cd, target);
-            validateTargetColumnIsMapIfIndexInvolvesKeys(isMap, target);
-        }
+            ColumnDefinition cd = cfm.getColumnDefinitionForCQL(target.column);
 
-        if (cd.getIndexType() != null)
-        {
-            IndexTarget.TargetType prevType = IndexTarget.TargetType.fromColumnDefinition(cd);
-            if (isMap && target.type != prevType)
+            if (cd == null)
+                throw new InvalidRequestException("No column definition found for column " + target.column);
+
+            // TODO: we could lift that limitation
+            if (cfm.isCompactTable())
             {
-                String msg = "Cannot create index on %s(%s): an index on %s(%s) already exists and indexing " +
-                             "a map on more than one dimension at the same time is not currently supported";
-                throw new InvalidRequestException(String.format(msg,
-                                                                target.type, target.column,
-                                                                prevType, target.column));
+                if (cd.isPrimaryKeyColumn())
+                    throw new InvalidRequestException("Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
+                if (cfm.compactValueColumn().equals(cd))
+                    throw new InvalidRequestException("Secondary indexes are not supported on compact value column of COMPACT STORAGE tables");
             }
 
-            if (ifNotExists)
-                return;
+            // It would be possible to support 2ndary index on static columns (but not without modifications of at least ExtendedFilter and
+            // CompositesIndex) and maybe we should, but that means a query like:
+            //     SELECT * FROM foo WHERE static_column = 'bar'
+            // would pull the full partition every time the static column of partition is 'bar', which sounds like offering a
+            // fair potential for foot-shooting, so I prefer leaving that to a follow up ticket once we have identified cases where
+            // such indexing is actually useful.
+            if (!cfm.isCompactTable() && cd.isStatic())
+                throw new InvalidRequestException("Secondary indexes are not allowed on static columns");
+
+            if (cd.kind == ColumnDefinition.Kind.PARTITION_KEY && cfm.getKeyValidatorAsClusteringComparator().size() == 1)
+                throw new InvalidRequestException(String.format("Cannot create secondary index on partition key column %s", target.column));
+
+            boolean isMap = cd.type instanceof MapType;
+            boolean isFrozenCollection = cd.type.isCollection() && !cd.type.isMultiCell();
+            if (isFrozenCollection)
+            {
+                validateForFrozenCollection(target);
+            }
             else
-                throw new InvalidRequestException("Index already exists");
+            {
+                validateNotFullIndex(target);
+                validateIsSimpleIndexIfTargetColumnNotCollection(cd, target);
+                validateTargetColumnIsMapIfIndexInvolvesKeys(isMap, target);
+            }
+        }
+
+        if (!Strings.isNullOrEmpty(indexName))
+        {
+            if (Schema.instance.getKSMetaData(keyspace()).existingIndexNames(null).contains(indexName))
+            {
+                if (ifNotExists)
+                    return;
+                else
+                    throw new InvalidRequestException(String.format("Index %s already exists", indexName));
+            }
         }
 
         properties.validate();
-
-        // TODO: we could lift that limitation
-        if ((cfm.comparator.isDense() || !cfm.comparator.isCompound()) && cd.isPrimaryKeyColumn())
-            throw new InvalidRequestException("Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
-
-        if (cd.kind == ColumnDefinition.Kind.COMPACT_VALUE)
-            throw new InvalidRequestException("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
-
-        // It would be possible to support 2ndary index on static columns (but not without modifications of at least ExtendedFilter and
-        // CompositesIndex) and maybe we should, but that means a query like:
-        //     SELECT * FROM foo WHERE static_column = 'bar'
-        // would pull the full partition every time the static column of partition is 'bar', which sounds like offering a
-        // fair potential for foot-shooting, so I prefer leaving that to a follow up ticket once we have identified cases where
-        // such indexing is actually useful.
-        if (cd.isStatic())
-            throw new InvalidRequestException("Secondary indexes are not allowed on static columns");
-
-        if (cd.kind == ColumnDefinition.Kind.PARTITION_KEY && cd.isOnAllComponents())
-            throw new InvalidRequestException(String.format("Cannot create secondary index on partition key column %s", target.column));
     }
 
     private void validateForFrozenCollection(IndexTarget target) throws InvalidRequestException
     {
-        if (target.type != IndexTarget.TargetType.FULL)
-            throw new InvalidRequestException(String.format("Cannot create index on %s of frozen<map> column %s", target.type, target.column));
+        if (target.type != IndexTarget.Type.FULL)
+            throw new InvalidRequestException(String.format("Cannot create %s() index on frozen column %s. " +
+                                                            "Frozen collections only support full() indexes",
+                                                            target.type, target.column));
     }
 
     private void validateNotFullIndex(IndexTarget target) throws InvalidRequestException
     {
-        if (target.type == IndexTarget.TargetType.FULL)
+        if (target.type == IndexTarget.Type.FULL)
             throw new InvalidRequestException("full() indexes can only be created on frozen collections");
     }
 
-    private void validateIsValuesIndexIfTargetColumnNotCollection(ColumnDefinition cd, IndexTarget target) throws InvalidRequestException
+    private void validateIsSimpleIndexIfTargetColumnNotCollection(ColumnDefinition cd, IndexTarget target) throws InvalidRequestException
     {
-        if (!cd.type.isCollection() && target.type != IndexTarget.TargetType.VALUES)
-            throw new InvalidRequestException(String.format("Cannot create index on %s of column %s; only non-frozen collections support %s indexes",
-                                                            target.type, target.column, target.type));
+        if (!cd.type.isCollection() && target.type != IndexTarget.Type.SIMPLE)
+            throw new InvalidRequestException(String.format("Cannot create %s() index on %s. " +
+                                                            "Non-collection columns support only simple indexes",
+                                                            target.type.toString(), target.column));
     }
 
     private void validateTargetColumnIsMapIfIndexInvolvesKeys(boolean isMap, IndexTarget target) throws InvalidRequestException
     {
-        if (target.type == IndexTarget.TargetType.KEYS || target.type == IndexTarget.TargetType.KEYS_AND_VALUES)
+        if (target.type == IndexTarget.Type.KEYS || target.type == IndexTarget.Type.KEYS_AND_VALUES)
         {
             if (!isMap)
                 throw new InvalidRequestException(String.format("Cannot create index on %s of column %s with non-map type",
@@ -160,43 +183,72 @@
         }
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
+    private void validateTargetsForMultiColumnIndex(List<IndexTarget> targets)
+    {
+        if (!properties.isCustom)
+            throw new InvalidRequestException("Only CUSTOM indexes support multiple columns");
+
+        Set<ColumnIdentifier> columns = new HashSet<>();
+        for (IndexTarget target : targets)
+            if (!columns.add(target.column))
+                throw new InvalidRequestException("Duplicate column " + target.column + " in index target list");
+    }
+
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
         CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).copy();
-        IndexTarget target = rawTarget.prepare(cfm);
-        logger.trace("Updating column {} definition for index {}", target.column, indexName);
-        ColumnDefinition cd = cfm.getColumnDefinition(target.column);
+        List<IndexTarget> targets = new ArrayList<>(rawTargets.size());
+        for (IndexTarget.Raw rawTarget : rawTargets)
+            targets.add(rawTarget.prepare(cfm));
 
-        if (cd.getIndexType() != null && ifNotExists)
-            return false;
+        String acceptedName = indexName;
+        if (Strings.isNullOrEmpty(acceptedName))
+        {
+            acceptedName = Indexes.getAvailableIndexName(keyspace(),
+                                                         columnFamily(),
+                                                         targets.size() == 1 ? targets.get(0).column.toString() : null);
+        }
 
+        if (Schema.instance.getKSMetaData(keyspace()).existingIndexNames(null).contains(acceptedName))
+        {
+            if (ifNotExists)
+                return null;
+            else
+                throw new InvalidRequestException(String.format("Index %s already exists", acceptedName));
+        }
+
+        IndexMetadata.Kind kind;
+        Map<String, String> indexOptions;
         if (properties.isCustom)
         {
-            cd.setIndexType(IndexType.CUSTOM, properties.getOptions());
-        }
-        else if (cfm.comparator.isCompound())
-        {
-            Map<String, String> options = Collections.emptyMap();
-            // For now, we only allow indexing values for collections, but we could later allow
-            // to also index map keys, so we record that this is the values we index to make our
-            // lives easier then.
-            if (cd.type.isCollection() && cd.type.isMultiCell())
-                options = ImmutableMap.of(target.type.indexOption(), "");
-            cd.setIndexType(IndexType.COMPOSITES, options);
+            kind = IndexMetadata.Kind.CUSTOM;
+            indexOptions = properties.getOptions();
         }
         else
         {
-            cd.setIndexType(IndexType.KEYS, Collections.<String, String>emptyMap());
+            indexOptions = Collections.emptyMap();
+            kind = cfm.isCompound() ? IndexMetadata.Kind.COMPOSITES : IndexMetadata.Kind.KEYS;
         }
 
-        cd.setIndexName(indexName);
-        cfm.addDefaultIndexNames();
-        MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
-        return true;
-    }
+        IndexMetadata index = IndexMetadata.fromIndexTargets(cfm, targets, acceptedName, kind, indexOptions);
 
-    public Event.SchemaChange changeEvent()
-    {
+        // check to disallow creation of an index which duplicates an existing one in all but name
+        Optional<IndexMetadata> existingIndex = Iterables.tryFind(cfm.getIndexes(), existing -> existing.equalsWithoutName(index));
+        if (existingIndex.isPresent())
+        {
+            if (ifNotExists)
+                return null;
+            else
+                throw new InvalidRequestException(String.format("Index %s is a duplicate of existing index %s",
+                                                                index.name,
+                                                                existingIndex.get().name));
+        }
+
+        logger.trace("Updating index definition for {}", indexName);
+        cfm.indexes(cfm.getIndexes().with(index));
+
+        MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
+
         // Creating an index is akin to updating the CF
         return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java
index a3e27e4..787dc73 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateKeyspaceStatement.java

@@ -21,7 +21,9 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.locator.AbstractReplicationStrategy;
+import org.apache.cassandra.locator.LocalStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
@@ -30,7 +32,7 @@
 public class CreateKeyspaceStatement extends SchemaAlteringStatement
 {
     private final String name;
-    private final KSPropDefs attrs;
+    private final KeyspaceAttributes attrs;
     private final boolean ifNotExists;
 
     /**
@@ -40,7 +42,7 @@
      * @param name the name of the keyspace to create
      * @param attrs map of the raw keyword arguments that followed the <code>WITH</code> keyword.
      */
-    public CreateKeyspaceStatement(String name, KSPropDefs attrs, boolean ifNotExists)
+    public CreateKeyspaceStatement(String name, KeyspaceAttributes attrs, boolean ifNotExists)
     {
         super();
         this.name = name;
@@ -84,33 +86,28 @@
         // The strategy is validated through KSMetaData.validate() in announceNewKeyspace below.
         // However, for backward compatibility with thrift, this doesn't validate unexpected options yet,
         // so doing proper validation here.
-        AbstractReplicationStrategy.validateReplicationStrategy(name,
-                                                                AbstractReplicationStrategy.getClass(attrs.getReplicationStrategyClass()),
-                                                                StorageService.instance.getTokenMetadata(),
-                                                                DatabaseDescriptor.getEndpointSnitch(),
-                                                                attrs.getReplicationOptions());
+        KeyspaceParams params = attrs.asNewKeyspaceParams();
+        params.validate(name);
+        if (params.replication.klass.equals(LocalStrategy.class))
+            throw new ConfigurationException("Unable to use given strategy class: LocalStrategy is reserved for internal use.");
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(name, attrs.asNewKeyspaceParams());
         try
         {
-            MigrationManager.announceNewKeyspace(attrs.asKSMetadata(name), isLocalOnly);
-            return true;
+            MigrationManager.announceNewKeyspace(ksm, isLocalOnly);
+            return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, keyspace());
         }
         catch (AlreadyExistsException e)
         {
             if (ifNotExists)
-                return false;
+                return null;
             throw e;
         }
     }
 
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, keyspace());
-    }
-
     protected void grantPermissionsToCreator(QueryState state)
     {
         try

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java
index e761674..9f14194 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateTableStatement.java

@@ -26,60 +26,48 @@
 
 import org.apache.cassandra.auth.*;
 import org.apache.cassandra.config.*;
-import org.apache.cassandra.cql3.CFName;
-import org.apache.cassandra.cql3.CQL3Type;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.schema.Types;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
-/** A <code>CREATE TABLE</code> parsed from a CQL query statement. */
+/** A {@code CREATE TABLE} parsed from a CQL query statement. */
 public class CreateTableStatement extends SchemaAlteringStatement
 {
-    public CellNameType comparator;
-    private AbstractType<?> defaultValidator;
-    private AbstractType<?> keyValidator;
+    private List<AbstractType<?>> keyTypes;
+    private List<AbstractType<?>> clusteringTypes;
 
-    private final List<ByteBuffer> keyAliases = new ArrayList<ByteBuffer>();
-    private final List<ByteBuffer> columnAliases = new ArrayList<ByteBuffer>();
-    private ByteBuffer valueAlias;
+    private final Map<ByteBuffer, CollectionType> collections = new HashMap<>();
+
+    private final List<ColumnIdentifier> keyAliases = new ArrayList<>();
+    private final List<ColumnIdentifier> columnAliases = new ArrayList<>();
 
     private boolean isDense;
+    private boolean isCompound;
+    private boolean hasCounters;
 
     // use a TreeMap to preserve ordering across JDK versions (see CASSANDRA-9492)
-    private final Map<ColumnIdentifier, AbstractType> columns = new TreeMap<>(new Comparator<ColumnIdentifier>()
-    {
-        public int compare(ColumnIdentifier o1, ColumnIdentifier o2)
-        {
-            return o1.bytes.compareTo(o2.bytes);
-        }
-    });
+    private final Map<ColumnIdentifier, AbstractType> columns = new TreeMap<>((o1, o2) -> o1.bytes.compareTo(o2.bytes));
+
     private final Set<ColumnIdentifier> staticColumns;
-    private final CFPropDefs properties;
+    private final TableParams params;
     private final boolean ifNotExists;
     private final UUID id;
 
-    public CreateTableStatement(CFName name, CFPropDefs properties, boolean ifNotExists, Set<ColumnIdentifier> staticColumns, UUID id)
+    public CreateTableStatement(CFName name, TableParams params, boolean ifNotExists, Set<ColumnIdentifier> staticColumns, UUID id)
     {
         super(name);
-        this.properties = properties;
+        this.params = params;
         this.ifNotExists = ifNotExists;
         this.staticColumns = staticColumns;
         this.id = id;
-
-        if (!this.properties.hasProperty(CFPropDefs.KW_COMPRESSION) && CFMetaData.DEFAULT_COMPRESSOR != null)
-            this.properties.addProperty(CFPropDefs.KW_COMPRESSION,
-                                        new HashMap<String, String>()
-                                        {{
-                                            put(CompressionParameters.SSTABLE_COMPRESSION, CFMetaData.DEFAULT_COMPRESSOR);
-                                        }});
     }
 
     public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
@@ -92,42 +80,21 @@
         // validated in announceMigration()
     }
 
-    // Column definitions
-    private List<ColumnDefinition> getColumns(CFMetaData cfm)
-    {
-        List<ColumnDefinition> columnDefs = new ArrayList<>(columns.size());
-        Integer componentIndex = comparator.isCompound() ? comparator.clusteringPrefixSize() : null;
-        for (Map.Entry<ColumnIdentifier, AbstractType> col : columns.entrySet())
-        {
-            ColumnIdentifier id = col.getKey();
-            columnDefs.add(staticColumns.contains(id)
-                           ? ColumnDefinition.staticDef(cfm, col.getKey().bytes, col.getValue(), componentIndex)
-                           : ColumnDefinition.regularDef(cfm, col.getKey().bytes, col.getValue(), componentIndex));
-        }
-
-        return columnDefs;
-    }
-
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
         try
         {
             MigrationManager.announceNewColumnFamily(getCFMetaData(), isLocalOnly);
-            return true;
+            return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
         }
         catch (AlreadyExistsException e)
         {
             if (ifNotExists)
-                return false;
+                return null;
             throw e;
         }
     }
 
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
-    }
-
     protected void grantPermissionsToCreator(QueryState state)
     {
         try
@@ -144,69 +111,73 @@
         }
     }
 
+    public CFMetaData.Builder metadataBuilder()
+    {
+        CFMetaData.Builder builder = CFMetaData.Builder.create(keyspace(), columnFamily(), isDense, isCompound, hasCounters);
+        builder.withId(id);
+        for (int i = 0; i < keyAliases.size(); i++)
+            builder.addPartitionKey(keyAliases.get(i), keyTypes.get(i));
+        for (int i = 0; i < columnAliases.size(); i++)
+            builder.addClusteringColumn(columnAliases.get(i), clusteringTypes.get(i));
+
+        boolean isStaticCompact = !isDense && !isCompound;
+        for (Map.Entry<ColumnIdentifier, AbstractType> entry : columns.entrySet())
+        {
+            ColumnIdentifier name = entry.getKey();
+            // Note that for "static" no-clustering compact storage we use static for the defined columns
+            if (staticColumns.contains(name) || isStaticCompact)
+                builder.addStaticColumn(name, entry.getValue());
+            else
+                builder.addRegularColumn(name, entry.getValue());
+        }
+
+        boolean isCompactTable = isDense || !isCompound;
+        if (isCompactTable)
+        {
+            CompactTables.DefaultNames names = CompactTables.defaultNameGenerator(builder.usedColumnNames());
+            // Compact tables always have a clustering and a single regular value.
+            if (isStaticCompact)
+            {
+                builder.addClusteringColumn(names.defaultClusteringName(), UTF8Type.instance);
+                builder.addRegularColumn(names.defaultCompactValueName(), hasCounters ? CounterColumnType.instance : BytesType.instance);
+            }
+            else if (isDense && !builder.hasRegulars())
+            {
+                // Even for dense, we might not have our regular column if it wasn't part of the declaration. If
+                // that's the case, add it but with a specific EmptyType so we can recognize that case later
+                builder.addRegularColumn(names.defaultCompactValueName(), EmptyType.instance);
+            }
+        }
+
+        return builder;
+    }
+
     /**
      * Returns a CFMetaData instance based on the parameters parsed from this
-     * <code>CREATE</code> statement, or defaults where applicable.
+     * {@code CREATE} statement, or defaults where applicable.
      *
      * @return a CFMetaData instance corresponding to the values parsed from this statement
      * @throws InvalidRequestException on failure to validate parsed parameters
      */
-    public CFMetaData getCFMetaData() throws RequestValidationException
+    public CFMetaData getCFMetaData()
     {
-        CFMetaData newCFMD;
-        newCFMD = new CFMetaData(keyspace(),
-                                 columnFamily(),
-                                 ColumnFamilyType.Standard,
-                                 comparator,
-                                 id);
-        applyPropertiesTo(newCFMD);
-        return newCFMD;
+        return metadataBuilder().build().params(params);
     }
 
-    public void applyPropertiesTo(CFMetaData cfmd) throws RequestValidationException
+    public TableParams params()
     {
-        cfmd.defaultValidator(defaultValidator)
-            .keyValidator(keyValidator)
-            .addAllColumnDefinitions(getColumns(cfmd))
-            .isDense(isDense);
-
-        addColumnMetadataFromAliases(cfmd, keyAliases, keyValidator, ColumnDefinition.Kind.PARTITION_KEY);
-        addColumnMetadataFromAliases(cfmd, columnAliases, comparator.asAbstractType(), ColumnDefinition.Kind.CLUSTERING_COLUMN);
-        if (valueAlias != null)
-            addColumnMetadataFromAliases(cfmd, Collections.singletonList(valueAlias), defaultValidator, ColumnDefinition.Kind.COMPACT_VALUE);
-
-        properties.applyToCFMetadata(cfmd);
+        return params;
     }
 
-    private void addColumnMetadataFromAliases(CFMetaData cfm, List<ByteBuffer> aliases, AbstractType<?> comparator, ColumnDefinition.Kind kind)
-    {
-        if (comparator instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType)comparator;
-            for (int i = 0; i < aliases.size(); ++i)
-                if (aliases.get(i) != null)
-                    cfm.addOrReplaceColumnDefinition(new ColumnDefinition(cfm, aliases.get(i), ct.types.get(i), i, kind));
-        }
-        else
-        {
-            assert aliases.size() <= 1;
-            if (!aliases.isEmpty() && aliases.get(0) != null)
-                cfm.addOrReplaceColumnDefinition(new ColumnDefinition(cfm, aliases.get(0), comparator, null, kind));
-        }
-    }
-
-
     public static class RawStatement extends CFStatement
     {
         private final Map<ColumnIdentifier, CQL3Type.Raw> definitions = new HashMap<>();
-        public final CFPropDefs properties = new CFPropDefs();
+        public final CFProperties properties = new CFProperties();
 
-        private final List<List<ColumnIdentifier>> keyAliases = new ArrayList<List<ColumnIdentifier>>();
-        private final List<ColumnIdentifier> columnAliases = new ArrayList<ColumnIdentifier>();
-        private final Map<ColumnIdentifier, Boolean> definedOrdering = new LinkedHashMap<ColumnIdentifier, Boolean>(); // Insertion ordering is important
-        private final Set<ColumnIdentifier> staticColumns = new HashSet<ColumnIdentifier>();
+        private final List<List<ColumnIdentifier>> keyAliases = new ArrayList<>();
+        private final List<ColumnIdentifier> columnAliases = new ArrayList<>();
+        private final Set<ColumnIdentifier> staticColumns = new HashSet<>();
 
-        private boolean useCompactStorage;
         private final Multiset<ColumnIdentifier> definedNames = HashMultiset.create(1);
 
         private final boolean ifNotExists;
@@ -220,7 +191,15 @@
         /**
          * Transform this raw statement into a CreateTableStatement.
          */
-        public ParsedStatement.Prepared prepare() throws RequestValidationException
+        public ParsedStatement.Prepared prepare(ClientState clientState) throws RequestValidationException
+        {
+            KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspace());
+            if (ksm == null)
+                throw new ConfigurationException(String.format("Keyspace %s doesn't exist", keyspace()));
+            return prepare(ksm.types);
+        }
+
+        public ParsedStatement.Prepared prepare(Types udts) throws RequestValidationException
         {
             // Column family name
             if (!columnFamily().matches("\\w+"))
@@ -234,184 +213,125 @@
 
             properties.validate();
 
-            CreateTableStatement stmt = new CreateTableStatement(cfName, properties, ifNotExists, staticColumns, properties.getId());
+            TableParams params = properties.properties.asNewTableParams();
 
-            boolean hasCounters = false;
-            Map<ByteBuffer, CollectionType> definedMultiCellCollections = null;
+            CreateTableStatement stmt = new CreateTableStatement(cfName, params, ifNotExists, staticColumns, properties.properties.getId());
+
             for (Map.Entry<ColumnIdentifier, CQL3Type.Raw> entry : definitions.entrySet())
             {
                 ColumnIdentifier id = entry.getKey();
-                CQL3Type pt = entry.getValue().prepare(keyspace());
-                if (pt.isCollection() && ((CollectionType) pt.getType()).isMultiCell())
-                {
-                    if (definedMultiCellCollections == null)
-                        definedMultiCellCollections = new HashMap<>();
-                    definedMultiCellCollections.put(id.bytes, (CollectionType) pt.getType());
-                }
-                else if (entry.getValue().isCounter())
-                    hasCounters = true;
-
+                CQL3Type pt = entry.getValue().prepare(keyspace(), udts);
+                if (pt.isCollection() && ((CollectionType)pt.getType()).isMultiCell())
+                    stmt.collections.put(id.bytes, (CollectionType)pt.getType());
+                if (entry.getValue().isCounter())
+                    stmt.hasCounters = true;
                 stmt.columns.put(id, pt.getType()); // we'll remove what is not a column below
             }
 
             if (keyAliases.isEmpty())
                 throw new InvalidRequestException("No PRIMARY KEY specifed (exactly one required)");
-            else if (keyAliases.size() > 1)
+            if (keyAliases.size() > 1)
                 throw new InvalidRequestException("Multiple PRIMARY KEYs specifed (exactly one required)");
-            else if (hasCounters && properties.getDefaultTimeToLive() > 0)
+            if (stmt.hasCounters && params.defaultTimeToLive > 0)
                 throw new InvalidRequestException("Cannot set default_time_to_live on a table with counters");
 
             List<ColumnIdentifier> kAliases = keyAliases.get(0);
-
-            List<AbstractType<?>> keyTypes = new ArrayList<AbstractType<?>>(kAliases.size());
+            stmt.keyTypes = new ArrayList<>(kAliases.size());
             for (ColumnIdentifier alias : kAliases)
             {
-                stmt.keyAliases.add(alias.bytes);
+                stmt.keyAliases.add(alias);
                 AbstractType<?> t = getTypeAndRemove(stmt.columns, alias);
                 if (t.asCQL3Type().getType() instanceof CounterColumnType)
                     throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", alias));
                 if (staticColumns.contains(alias))
                     throw new InvalidRequestException(String.format("Static column %s cannot be part of the PRIMARY KEY", alias));
-                keyTypes.add(t);
+                stmt.keyTypes.add(t);
             }
-            stmt.keyValidator = keyTypes.size() == 1 ? keyTypes.get(0) : CompositeType.getInstance(keyTypes);
 
-            // Dense means that no part of the comparator stores a CQL column name. This means
-            // COMPACT STORAGE with at least one columnAliases (otherwise it's a thrift "static" CF).
-            stmt.isDense = useCompactStorage && !columnAliases.isEmpty();
-
+            stmt.clusteringTypes = new ArrayList<>(columnAliases.size());
             // Handle column aliases
-            if (columnAliases.isEmpty())
+            for (ColumnIdentifier t : columnAliases)
             {
-                if (useCompactStorage)
+                stmt.columnAliases.add(t);
+
+                AbstractType<?> type = getTypeAndRemove(stmt.columns, t);
+                if (type.asCQL3Type().getType() instanceof CounterColumnType)
+                    throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", t));
+                if (staticColumns.contains(t))
+                    throw new InvalidRequestException(String.format("Static column %s cannot be part of the PRIMARY KEY", t));
+                stmt.clusteringTypes.add(type);
+            }
+
+            // We've handled anything that is not a rpimary key so stmt.columns only contains NON-PK columns. So
+            // if it's a counter table, make sure we don't have non-counter types
+            if (stmt.hasCounters)
+            {
+                for (AbstractType<?> type : stmt.columns.values())
+                    if (!type.isCounter())
+                        throw new InvalidRequestException("Cannot mix counter and non counter columns in the same table");
+            }
+
+            boolean useCompactStorage = properties.useCompactStorage;
+            // Dense means that on the thrift side, no part of the "thrift column name" stores a "CQL/metadata column name".
+            // This means COMPACT STORAGE with at least one clustering type (otherwise it's a thrift "static" CF).
+            stmt.isDense = useCompactStorage && !stmt.clusteringTypes.isEmpty();
+            // Compound means that on the thrift side, the "thrift column name" is a composite one. It's the case unless
+            // we use compact storage COMPACT STORAGE and we have either no clustering columns (thrift "static" CF) or
+            // only one of them (if more than one, it's a "dense composite").
+            stmt.isCompound = !(useCompactStorage && stmt.clusteringTypes.size() <= 1);
+
+            // For COMPACT STORAGE, we reject any "feature" that we wouldn't be able to translate back to thrift.
+            if (useCompactStorage)
+            {
+                if (!stmt.collections.isEmpty())
+                    throw new InvalidRequestException("Non-frozen collection types are not supported with COMPACT STORAGE");
+                if (!staticColumns.isEmpty())
+                    throw new InvalidRequestException("Static columns are not supported in COMPACT STORAGE tables");
+
+                if (stmt.clusteringTypes.isEmpty())
                 {
-                    // There should remain some column definition since it is a non-composite "static" CF
+                    // It's a thrift "static CF" so there should be some columns definition
                     if (stmt.columns.isEmpty())
                         throw new InvalidRequestException("No definition found that is not part of the PRIMARY KEY");
-
-                    if (definedMultiCellCollections != null)
-                        throw new InvalidRequestException("Non-frozen collection types are not supported with COMPACT STORAGE");
-
-                    stmt.comparator = new SimpleSparseCellNameType(UTF8Type.instance);
                 }
-                else
+
+                if (stmt.isDense)
                 {
-                    stmt.comparator = definedMultiCellCollections == null
-                                    ? new CompoundSparseCellNameType(Collections.<AbstractType<?>>emptyList())
-                                    : new CompoundSparseCellNameType.WithCollection(Collections.<AbstractType<?>>emptyList(), ColumnToCollectionType.getInstance(definedMultiCellCollections));
-                }
-            }
-            else
-            {
-                // If we use compact storage and have only one alias, it is a
-                // standard "dynamic" CF, otherwise it's a composite
-                if (useCompactStorage && columnAliases.size() == 1)
-                {
-                    if (definedMultiCellCollections != null)
-                        throw new InvalidRequestException("Collection types are not supported with COMPACT STORAGE");
-
-                    ColumnIdentifier alias = columnAliases.get(0);
-                    if (staticColumns.contains(alias))
-                        throw new InvalidRequestException(String.format("Static column %s cannot be part of the PRIMARY KEY", alias));
-
-                    stmt.columnAliases.add(alias.bytes);
-                    AbstractType<?> at = getTypeAndRemove(stmt.columns, alias);
-                    if (at.asCQL3Type().getType() instanceof CounterColumnType)
-                        throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", stmt.columnAliases.get(0)));
-                    stmt.comparator = new SimpleDenseCellNameType(at);
-                }
-                else
-                {
-                    List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(columnAliases.size() + 1);
-                    for (ColumnIdentifier t : columnAliases)
-                    {
-                        stmt.columnAliases.add(t.bytes);
-
-                        AbstractType<?> type = getTypeAndRemove(stmt.columns, t);
-                        if (type.asCQL3Type().getType() instanceof CounterColumnType)
-                            throw new InvalidRequestException(String.format("counter type is not supported for PRIMARY KEY part %s", t));
-                        if (staticColumns.contains(t))
-                            throw new InvalidRequestException(String.format("Static column %s cannot be part of the PRIMARY KEY", t));
-                        types.add(type);
-                    }
-
-                    if (useCompactStorage)
-                    {
-                        if (definedMultiCellCollections != null)
-                            throw new InvalidRequestException("Collection types are not supported with COMPACT STORAGE");
-
-                        stmt.comparator = new CompoundDenseCellNameType(types);
-                    }
-                    else
-                    {
-                        stmt.comparator = definedMultiCellCollections == null
-                                        ? new CompoundSparseCellNameType(types)
-                                        : new CompoundSparseCellNameType.WithCollection(types, ColumnToCollectionType.getInstance(definedMultiCellCollections));
-                    }
-                }
-            }
-
-            if (!staticColumns.isEmpty())
-            {
-                // Only CQL3 tables can have static columns
-                if (useCompactStorage)
-                    throw new InvalidRequestException("Static columns are not supported in COMPACT STORAGE tables");
-                // Static columns only make sense if we have at least one clustering column. Otherwise everything is static anyway
-                if (columnAliases.isEmpty())
-                    throw new InvalidRequestException("Static columns are only useful (and thus allowed) if the table has at least one clustering column");
-            }
-
-            if (useCompactStorage && !stmt.columnAliases.isEmpty())
-            {
-                if (stmt.columns.isEmpty())
-                {
-                    // The only value we'll insert will be the empty one, so the default validator don't matter
-                    stmt.defaultValidator = BytesType.instance;
-                    // We need to distinguish between
-                    //   * I'm upgrading from thrift so the valueAlias is null
-                    //   * I've defined my table with only a PK (and the column value will be empty)
-                    // So, we use an empty valueAlias (rather than null) for the second case
-                    stmt.valueAlias = ByteBufferUtil.EMPTY_BYTE_BUFFER;
-                }
-                else
-                {
+                    // We can have no columns (only the PK), but we can't have more than one.
                     if (stmt.columns.size() > 1)
                         throw new InvalidRequestException(String.format("COMPACT STORAGE with composite PRIMARY KEY allows no more than one column not part of the PRIMARY KEY (got: %s)", StringUtils.join(stmt.columns.keySet(), ", ")));
-
-                    Map.Entry<ColumnIdentifier, AbstractType> lastEntry = stmt.columns.entrySet().iterator().next();
-                    stmt.defaultValidator = lastEntry.getValue();
-                    stmt.valueAlias = lastEntry.getKey().bytes;
-                    stmt.columns.remove(lastEntry.getKey());
+                }
+                else
+                {
+                    // we are in the "static" case, so we need at least one column defined. For non-compact however, having
+                    // just the PK is fine.
+                    if (stmt.columns.isEmpty())
+                        throw new InvalidRequestException("COMPACT STORAGE with non-composite PRIMARY KEY require one column not part of the PRIMARY KEY, none given");
                 }
             }
             else
             {
-                // For compact, we are in the "static" case, so we need at least one column defined. For non-compact however, having
-                // just the PK is fine since we have CQL3 row marker.
-                if (useCompactStorage && stmt.columns.isEmpty())
-                    throw new InvalidRequestException("COMPACT STORAGE with non-composite PRIMARY KEY require one column not part of the PRIMARY KEY, none given");
-
-                // There is no way to insert/access a column that is not defined for non-compact storage, so
-                // the actual validator don't matter much (except that we want to recognize counter CF as limitation apply to them).
-                stmt.defaultValidator = !stmt.columns.isEmpty() && (stmt.columns.values().iterator().next() instanceof CounterColumnType)
-                    ? CounterColumnType.instance
-                    : BytesType.instance;
+                if (stmt.clusteringTypes.isEmpty() && !staticColumns.isEmpty())
+                {
+                    // Static columns only make sense if we have at least one clustering column. Otherwise everything is static anyway
+                    if (columnAliases.isEmpty())
+                        throw new InvalidRequestException("Static columns are only useful (and thus allowed) if the table has at least one clustering column");
+                }
             }
 
-
             // If we give a clustering order, we must explicitly do so for all aliases and in the order of the PK
-            if (!definedOrdering.isEmpty())
+            if (!properties.definedOrdering.isEmpty())
             {
-                if (definedOrdering.size() > columnAliases.size())
+                if (properties.definedOrdering.size() > columnAliases.size())
                     throw new InvalidRequestException("Only clustering key columns can be defined in CLUSTERING ORDER directive");
 
                 int i = 0;
-                for (ColumnIdentifier id : definedOrdering.keySet())
+                for (ColumnIdentifier id : properties.definedOrdering.keySet())
                 {
                     ColumnIdentifier c = columnAliases.get(i);
                     if (!id.equals(c))
                     {
-                        if (definedOrdering.containsKey(c))
+                        if (properties.definedOrdering.containsKey(c))
                             throw new InvalidRequestException(String.format("The order of columns in the CLUSTERING ORDER directive must be the one of the clustering key (%s must appear before %s)", c, id));
                         else
                             throw new InvalidRequestException(String.format("Missing CLUSTERING ORDER for column %s", c));
@@ -432,7 +352,7 @@
                 throw new InvalidRequestException(String.format("Invalid collection type for PRIMARY KEY component %s", t));
 
             columns.remove(t);
-            Boolean isReversed = definedOrdering.get(t);
+            Boolean isReversed = properties.definedOrdering.get(t);
             return isReversed != null && isReversed ? ReversedType.getInstance(type) : type;
         }
 
@@ -453,15 +373,5 @@
         {
             columnAliases.add(alias);
         }
-
-        public void setOrdering(ColumnIdentifier alias, boolean reversed)
-        {
-            definedOrdering.put(alias, reversed);
-        }
-
-        public void setCompactStorage()
-        {
-            useCompactStorage = true;
-        }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java
index ef2f263..5d29996 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateTriggerStatement.java

@@ -22,14 +22,16 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.cql3.CFName;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.TriggerMetadata;
+import org.apache.cassandra.schema.Triggers;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.triggers.TriggerExecutor;
@@ -57,7 +59,10 @@
 
     public void validate(ClientState state) throws RequestValidationException
     {
-        ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
+        CFMetaData cfm = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
+        if (cfm.isView())
+            throw new InvalidRequestException("Cannot CREATE TRIGGER against a materialized view");
+
         try
         {
             TriggerExecutor.instance.loadTriggerInstance(triggerClass);
@@ -68,24 +73,22 @@
         }
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException, InvalidRequestException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws ConfigurationException, InvalidRequestException
     {
         CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).copy();
+        Triggers triggers = cfm.getTriggers();
 
-        TriggerDefinition triggerDefinition = TriggerDefinition.create(triggerName, triggerClass);
-
-        if (!ifNotExists || !cfm.containsTriggerDefinition(triggerDefinition))
+        if (triggers.get(triggerName).isPresent())
         {
-            cfm.addTriggerDefinition(triggerDefinition);
-            logger.info("Adding trigger with name {} and class {}", triggerName, triggerClass);
-            MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
-            return true;
+            if (ifNotExists)
+                return null;
+            else
+                throw new InvalidRequestException(String.format("Trigger %s already exists", triggerName));
         }
-        return false;
-    }
 
-    public Event.SchemaChange changeEvent()
-    {
+        cfm.triggers(triggers.with(TriggerMetadata.create(triggerName, triggerClass)));
+        logger.info("Adding trigger with name {} and class {}", triggerName, triggerClass);
+        MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
         return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java
index 82c2808..e7f8feb 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateTypeStatement.java

@@ -27,8 +27,10 @@
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
 
 public class CreateTypeStatement extends SchemaAlteringStatement
@@ -65,11 +67,11 @@
 
     public void validate(ClientState state) throws RequestValidationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(name.getKeyspace());
         if (ksm == null)
             throw new InvalidRequestException(String.format("Cannot add type in unknown keyspace %s", name.getKeyspace()));
 
-        if (ksm.userTypes.getType(name.getUserTypeName()) != null && !ifNotExists)
+        if (ksm.types.get(name.getUserTypeName()).isPresent() && !ifNotExists)
             throw new InvalidRequestException(String.format("A user type of name %s already exists", name));
 
         for (CQL3Type.Raw type : columnTypes)
@@ -92,11 +94,6 @@
         }
     }
 
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
-    }
-
     @Override
     public String keyspace()
     {
@@ -116,18 +113,18 @@
         return new UserType(name.getKeyspace(), name.getUserTypeName(), names, types);
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(name.getKeyspace());
         assert ksm != null; // should haven't validate otherwise
 
         // Can happen with ifNotExists
-        if (ksm.userTypes.getType(name.getUserTypeName()) != null)
-            return false;
+        if (ksm.types.get(name.getUserTypeName()).isPresent())
+            return null;
 
         UserType type = createType();
         checkForDuplicateNames(type);
         MigrationManager.announceNewType(type, isLocalOnly);
-        return true;
+        return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateViewStatement.java
new file mode 100644
index 0000000..cce954f
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateViewStatement.java

@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Iterables;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.cql3.selection.RawSelector;
+import org.apache.cassandra.cql3.selection.Selectable;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.db.view.View;
+import org.apache.cassandra.exceptions.AlreadyExistsException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.thrift.ThriftValidation;
+import org.apache.cassandra.transport.Event;
+
+public class CreateViewStatement extends SchemaAlteringStatement
+{
+    private static final Logger logger = LoggerFactory.getLogger(CreateViewStatement.class);
+
+    private final CFName baseName;
+    private final List<RawSelector> selectClause;
+    private final WhereClause whereClause;
+    private final List<ColumnIdentifier.Raw> partitionKeys;
+    private final List<ColumnIdentifier.Raw> clusteringKeys;
+    public final CFProperties properties = new CFProperties();
+    private final boolean ifNotExists;
+
+    public CreateViewStatement(CFName viewName,
+                               CFName baseName,
+                               List<RawSelector> selectClause,
+                               WhereClause whereClause,
+                               List<ColumnIdentifier.Raw> partitionKeys,
+                               List<ColumnIdentifier.Raw> clusteringKeys,
+                               boolean ifNotExists)
+    {
+        super(viewName);
+        this.baseName = baseName;
+        this.selectClause = selectClause;
+        this.whereClause = whereClause;
+        this.partitionKeys = partitionKeys;
+        this.clusteringKeys = clusteringKeys;
+        this.ifNotExists = ifNotExists;
+    }
+
+
+    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
+    {
+        if (!baseName.hasKeyspace())
+            baseName.setKeyspace(keyspace(), true);
+        state.hasColumnFamilyAccess(keyspace(), baseName.getColumnFamily(), Permission.ALTER);
+    }
+
+    public void validate(ClientState state) throws RequestValidationException
+    {
+        // We do validation in announceMigration to reduce doubling up of work
+    }
+
+    private interface AddColumn {
+        void add(ColumnIdentifier identifier, AbstractType<?> type);
+    }
+
+    private void add(CFMetaData baseCfm, Iterable<ColumnIdentifier> columns, AddColumn adder)
+    {
+        for (ColumnIdentifier column : columns)
+        {
+            AbstractType<?> type = baseCfm.getColumnDefinition(column).type;
+            if (properties.definedOrdering.containsKey(column))
+            {
+                boolean desc = properties.definedOrdering.get(column);
+                if (!desc && type.isReversed())
+                {
+                    type = ((ReversedType)type).baseType;
+                }
+                else if (desc && !type.isReversed())
+                {
+                    type = ReversedType.getInstance(type);
+                }
+            }
+            adder.add(column, type);
+        }
+    }
+
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
+    {
+        if (!DatabaseDescriptor.enableMaterializedViews())
+        {
+            throw new InvalidRequestException("Materialized views are disabled. Enable in cassandra.yaml to use.");
+        }
+
+        // We need to make sure that:
+        //  - primary key includes all columns in base table's primary key
+        //  - make sure that the select statement does not have anything other than columns
+        //    and their names match the base table's names
+        //  - make sure that primary key does not include any collections
+        //  - make sure there is no where clause in the select statement
+        //  - make sure there is not currently a table or view
+        //  - make sure baseTable gcGraceSeconds > 0
+
+        properties.validate();
+
+        if (properties.useCompactStorage)
+            throw new InvalidRequestException("Cannot use 'COMPACT STORAGE' when defining a materialized view");
+
+        // We enforce the keyspace because if the RF is different, the logic to wait for a
+        // specific replica would break
+        if (!baseName.getKeyspace().equals(keyspace()))
+            throw new InvalidRequestException("Cannot create a materialized view on a table in a separate keyspace");
+
+        CFMetaData cfm = ThriftValidation.validateColumnFamily(baseName.getKeyspace(), baseName.getColumnFamily());
+
+        if (cfm.isCounter())
+            throw new InvalidRequestException("Materialized views are not supported on counter tables");
+        if (cfm.isSuper())
+            throw new InvalidRequestException("Materialized views are not supported on SuperColumn tables");
+        if (cfm.isView())
+            throw new InvalidRequestException("Materialized views cannot be created against other materialized views");
+
+        if (cfm.params.gcGraceSeconds == 0)
+        {
+            throw new InvalidRequestException(String.format("Cannot create materialized view '%s' for base table " +
+                                                            "'%s' with gc_grace_seconds of 0, since this value is " +
+                                                            "used to TTL undelivered updates. Setting gc_grace_seconds" +
+                                                            " too low might cause undelivered updates to expire " +
+                                                            "before being replayed.", cfName.getColumnFamily(),
+                                                            baseName.getColumnFamily()));
+        }
+
+        Set<ColumnIdentifier> included = new HashSet<>();
+        for (RawSelector selector : selectClause)
+        {
+            Selectable.Raw selectable = selector.selectable;
+            if (selectable instanceof Selectable.WithFieldSelection.Raw)
+                throw new InvalidRequestException("Cannot select out a part of type when defining a materialized view");
+            if (selectable instanceof Selectable.WithFunction.Raw)
+                throw new InvalidRequestException("Cannot use function when defining a materialized view");
+            if (selectable instanceof Selectable.WritetimeOrTTL.Raw)
+                throw new InvalidRequestException("Cannot use function when defining a materialized view");
+            ColumnIdentifier identifier = (ColumnIdentifier) selectable.prepare(cfm);
+            if (selector.alias != null)
+                throw new InvalidRequestException(String.format("Cannot alias column '%s' as '%s' when defining a materialized view", identifier.toString(), selector.alias.toString()));
+
+            ColumnDefinition cdef = cfm.getColumnDefinition(identifier);
+
+            if (cdef == null)
+                throw new InvalidRequestException("Unknown column name detected in CREATE MATERIALIZED VIEW statement : "+identifier);
+
+            included.add(identifier);
+        }
+
+        Set<ColumnIdentifier.Raw> targetPrimaryKeys = new HashSet<>();
+        for (ColumnIdentifier.Raw identifier : Iterables.concat(partitionKeys, clusteringKeys))
+        {
+            if (!targetPrimaryKeys.add(identifier))
+                throw new InvalidRequestException("Duplicate entry found in PRIMARY KEY: "+identifier);
+
+            ColumnDefinition cdef = cfm.getColumnDefinition(identifier.prepare(cfm));
+
+            if (cdef == null)
+                throw new InvalidRequestException("Unknown column name detected in CREATE MATERIALIZED VIEW statement : "+identifier);
+
+            if (cfm.getColumnDefinition(identifier.prepare(cfm)).type.isMultiCell())
+                throw new InvalidRequestException(String.format("Cannot use MultiCell column '%s' in PRIMARY KEY of materialized view", identifier));
+
+            if (cdef.isStatic())
+                throw new InvalidRequestException(String.format("Cannot use Static column '%s' in PRIMARY KEY of materialized view", identifier));
+        }
+
+        // build the select statement
+        Map<ColumnIdentifier.Raw, Boolean> orderings = Collections.emptyMap();
+        SelectStatement.Parameters parameters = new SelectStatement.Parameters(orderings, false, true, false);
+        SelectStatement.RawStatement rawSelect = new SelectStatement.RawStatement(baseName, parameters, selectClause, whereClause, null);
+
+        ClientState state = ClientState.forInternalCalls();
+        state.setKeyspace(keyspace());
+
+        rawSelect.prepareKeyspace(state);
+        rawSelect.setBoundVariables(getBoundVariables());
+
+        ParsedStatement.Prepared prepared = rawSelect.prepare(true, queryState.getClientState());
+        SelectStatement select = (SelectStatement) prepared.statement;
+        StatementRestrictions restrictions = select.getRestrictions();
+
+        if (!prepared.boundNames.isEmpty())
+            throw new InvalidRequestException("Cannot use query parameters in CREATE MATERIALIZED VIEW statements");
+
+        if (!restrictions.nonPKRestrictedColumns(false).isEmpty())
+        {
+            throw new InvalidRequestException(String.format(
+                    "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view " +
+                    "creation (got restrictions on: %s)",
+                    restrictions.nonPKRestrictedColumns(false).stream().map(def -> def.name.toString()).collect(Collectors.joining(", "))));
+        }
+
+        String whereClauseText = View.relationsToWhereClause(whereClause.relations);
+
+        Set<ColumnIdentifier> basePrimaryKeyCols = new HashSet<>();
+        for (ColumnDefinition definition : Iterables.concat(cfm.partitionKeyColumns(), cfm.clusteringColumns()))
+            basePrimaryKeyCols.add(definition.name);
+
+        List<ColumnIdentifier> targetClusteringColumns = new ArrayList<>();
+        List<ColumnIdentifier> targetPartitionKeys = new ArrayList<>();
+
+        // This is only used as an intermediate state; this is to catch whether multiple non-PK columns are used
+        boolean hasNonPKColumn = false;
+        for (ColumnIdentifier.Raw raw : partitionKeys)
+            hasNonPKColumn |= getColumnIdentifier(cfm, basePrimaryKeyCols, hasNonPKColumn, raw, targetPartitionKeys, restrictions);
+
+        for (ColumnIdentifier.Raw raw : clusteringKeys)
+            hasNonPKColumn |= getColumnIdentifier(cfm, basePrimaryKeyCols, hasNonPKColumn, raw, targetClusteringColumns, restrictions);
+
+        // We need to include all of the primary key columns from the base table in order to make sure that we do not
+        // overwrite values in the view. We cannot support "collapsing" the base table into a smaller number of rows in
+        // the view because if we need to generate a tombstone, we have no way of knowing which value is currently being
+        // used in the view and whether or not to generate a tombstone. In order to not surprise our users, we require
+        // that they include all of the columns. We provide them with a list of all of the columns left to include.
+        boolean missingClusteringColumns = false;
+        StringBuilder columnNames = new StringBuilder();
+        List<ColumnIdentifier> includedColumns = new ArrayList<>();
+        for (ColumnDefinition def : cfm.allColumns())
+        {
+            ColumnIdentifier identifier = def.name;
+            boolean includeDef = included.isEmpty() || included.contains(identifier);
+
+            if (includeDef && def.isStatic())
+            {
+                throw new InvalidRequestException(String.format("Unable to include static column '%s' which would be included by Materialized View SELECT * statement", identifier));
+            }
+
+            if (includeDef && !targetClusteringColumns.contains(identifier) && !targetPartitionKeys.contains(identifier))
+            {
+                includedColumns.add(identifier);
+            }
+            if (!def.isPrimaryKeyColumn()) continue;
+
+            if (!targetClusteringColumns.contains(identifier) && !targetPartitionKeys.contains(identifier))
+            {
+                if (missingClusteringColumns)
+                    columnNames.append(',');
+                else
+                    missingClusteringColumns = true;
+                columnNames.append(identifier);
+            }
+        }
+        if (missingClusteringColumns)
+            throw new InvalidRequestException(String.format("Cannot create Materialized View %s without primary key columns from base %s (%s)",
+                                                            columnFamily(), baseName.getColumnFamily(), columnNames.toString()));
+
+        if (targetPartitionKeys.isEmpty())
+            throw new InvalidRequestException("Must select at least a column for a Materialized View");
+
+        if (targetClusteringColumns.isEmpty())
+            throw new InvalidRequestException("No columns are defined for Materialized View other than primary key");
+
+        TableParams params = properties.properties.asNewTableParams();
+
+        if (params.defaultTimeToLive > 0)
+        {
+            throw new InvalidRequestException("Cannot set default_time_to_live for a materialized view. " +
+                                              "Data in a materialized view always expire at the same time than " +
+                                              "the corresponding data in the parent table.");
+        }
+
+        CFMetaData.Builder cfmBuilder = CFMetaData.Builder.createView(keyspace(), columnFamily());
+        add(cfm, targetPartitionKeys, cfmBuilder::addPartitionKey);
+        add(cfm, targetClusteringColumns, cfmBuilder::addClusteringColumn);
+        add(cfm, includedColumns, cfmBuilder::addRegularColumn);
+        cfmBuilder.withId(properties.properties.getId());
+
+        CFMetaData viewCfm = cfmBuilder.build().params(params);
+        ViewDefinition definition = new ViewDefinition(keyspace(),
+                                                       columnFamily(),
+                                                       Schema.instance.getId(keyspace(), baseName.getColumnFamily()),
+                                                       baseName.getColumnFamily(),
+                                                       included.isEmpty(),
+                                                       rawSelect,
+                                                       whereClauseText,
+                                                       viewCfm);
+
+        logger.warn("Creating materialized view {} for {}.{}. " +
+                    "Materialized views are experimental and are not recommended for production use.",
+                    definition.viewName, cfm.ksName, cfm.cfName);
+
+        try
+        {
+            ClientWarn.instance.warn("Materialized views are experimental and are not recommended for production use.");
+            MigrationManager.announceNewView(definition, isLocalOnly);
+            return new Event.SchemaChange(Event.SchemaChange.Change.CREATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
+        }
+        catch (AlreadyExistsException e)
+        {
+            if (ifNotExists)
+                return null;
+            throw e;
+        }
+    }
+
+    private static boolean getColumnIdentifier(CFMetaData cfm,
+                                               Set<ColumnIdentifier> basePK,
+                                               boolean hasNonPKColumn,
+                                               ColumnIdentifier.Raw raw,
+                                               List<ColumnIdentifier> columns,
+                                               StatementRestrictions restrictions)
+    {
+        ColumnIdentifier identifier = raw.prepare(cfm);
+        ColumnDefinition def = cfm.getColumnDefinition(identifier);
+
+        boolean isPk = basePK.contains(identifier);
+        if (!isPk && hasNonPKColumn)
+            throw new InvalidRequestException(String.format("Cannot include more than one non-primary key column '%s' in materialized view primary key", identifier));
+
+        // We don't need to include the "IS NOT NULL" filter on a non-composite partition key
+        // because we will never allow a single partition key to be NULL
+        boolean isSinglePartitionKey = cfm.getColumnDefinition(identifier).isPartitionKey()
+                                       && cfm.partitionKeyColumns().size() == 1;
+        if (!isSinglePartitionKey && !restrictions.isRestricted(def))
+            throw new InvalidRequestException(String.format("Primary key column '%s' is required to be filtered by 'IS NOT NULL'", identifier));
+
+        columns.add(identifier);
+        return !isPk;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
index 397928d..a0919d7 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java

@@ -17,143 +17,173 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.List;
 
-import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.cql3.restrictions.Restriction;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.Pair;
 
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
+
 /**
  * A <code>DELETE</code> parsed from a CQL query statement.
  */
 public class DeleteStatement extends ModificationStatement
 {
-    private DeleteStatement(StatementType type, int boundTerms, CFMetaData cfm, Attributes attrs)
+    private DeleteStatement(int boundTerms,
+                            CFMetaData cfm,
+                            Operations operations,
+                            StatementRestrictions restrictions,
+                            Conditions conditions,
+                            Attributes attrs)
     {
-        super(type, boundTerms, cfm, attrs);
+        super(StatementType.DELETE, boundTerms, cfm, operations, restrictions, conditions, attrs);
     }
 
-    public boolean requireFullClusteringKey()
-    {
-        return false;
-    }
-
-    public void addUpdateForKey(ColumnFamily cf, ByteBuffer key, Composite prefix, UpdateParameters params)
+    @Override
+    public void addUpdateForKey(PartitionUpdate update, Clustering clustering, UpdateParameters params)
     throws InvalidRequestException
     {
-        List<Operation> deletions = getOperations();
+        List<Operation> regularDeletions = getRegularOperations();
+        List<Operation> staticDeletions = getStaticOperations();
 
-        if (deletions.isEmpty())
+        if (regularDeletions.isEmpty() && staticDeletions.isEmpty())
         {
-            // We delete the slice selected by the prefix.
-            // However, for performance reasons, we distinguish 2 cases:
-            //   - It's a full internal row delete
-            //   - It's a full cell name (i.e it's a dense layout and the prefix is full)
-            if (prefix.isEmpty())
+            // We're not deleting any specific columns so it's either a full partition deletion ....
+            if (clustering.size() == 0)
             {
-                // No columns specified, delete the row
-                cf.delete(new DeletionInfo(params.timestamp, params.localDeletionTime));
+                update.addPartitionDeletion(params.deletionTime());
             }
-            else if (cfm.comparator.isDense() && prefix.size() == cfm.clusteringColumns().size())
+            // ... or a row deletion ...
+            else if (clustering.size() == cfm.clusteringColumns().size())
             {
-                cf.addAtom(params.makeTombstone(cfm.comparator.create(prefix, null)));
+                params.newRow(clustering);
+                params.addRowDeletion();
+                update.add(params.buildRow());
             }
+            // ... or a range of rows deletion.
             else
             {
-                cf.addAtom(params.makeRangeTombstone(prefix.slice()));
+                update.add(params.makeRangeTombstone(cfm.comparator, clustering));
             }
         }
         else
         {
-            for (Operation op : deletions)
-                op.execute(key, cf, prefix, params);
+            if (!regularDeletions.isEmpty())
+            {
+                // if the clustering size is zero but there are some clustering columns, it means that it's a
+                // range deletion (the full partition) in which case we need to throw an error as range deletion
+                // do not support specific columns
+                checkFalse(clustering.size() == 0 && cfm.clusteringColumns().size() != 0,
+                           "Range deletions are not supported for specific columns");
+
+                params.newRow(clustering);
+
+                for (Operation op : regularDeletions)
+                    op.execute(update.partitionKey(), params);
+                update.add(params.buildRow());
+            }
+
+            if (!staticDeletions.isEmpty())
+            {
+                params.newRow(Clustering.STATIC_CLUSTERING);
+                for (Operation op : staticDeletions)
+                    op.execute(update.partitionKey(), params);
+                update.add(params.buildRow());
+            }
         }
     }
 
-    protected void validateWhereClauseForConditions() throws InvalidRequestException
+    @Override
+    public void addUpdateForKey(PartitionUpdate update, Slice slice, UpdateParameters params)
     {
-        boolean onlyHasConditionsOnStaticColumns = hasStaticConditions() && !hasRegularConditions();
+        List<Operation> regularDeletions = getRegularOperations();
+        List<Operation> staticDeletions = getStaticOperations();
 
-        // In general, we can't delete specific columns if not all clustering columns have been specified.
-        // However, if we delete only static colums, it's fine since we won't really use the prefix anyway.
-        Iterator<ColumnDefinition> iterator = appliesOnlyToStaticColumns()
-                                              ? cfm.partitionKeyColumns().iterator()
-                                              : Iterators.concat(cfm.partitionKeyColumns().iterator(), cfm.clusteringColumns().iterator());
-        while (iterator.hasNext())
-        {
-            ColumnDefinition def = iterator.next();
-            Restriction restriction = processedKeys.get(def.name);
-            if (restriction == null || !(restriction.isEQ() || restriction.isIN()))
-            {
-                if (onlyHasConditionsOnStaticColumns)
-                {
-                    for (Operation oper : getOperations())
-                    {
-                        if (!oper.column.isStatic())
-                        {
-                            throw new InvalidRequestException(String.format("Primary key column '%s' must be specified in order to delete column '%s'",
-                                                                            def.name,
-                                                                            oper.column.name));
-                        }
-                    }
-                }
+        checkTrue(regularDeletions.isEmpty() && staticDeletions.isEmpty(),
+                  "Range deletions are not supported for specific columns");
 
-                throw new InvalidRequestException(
-                        String.format("DELETE statements must restrict all %s KEY columns with equality relations in order " +
-                                      "to use IF conditions%s, but column '%s' is not restricted",
-                                      onlyHasConditionsOnStaticColumns ? "PARTITION" : "PRIMARY",
-                                      onlyHasConditionsOnStaticColumns ? " on static columns" : "", def.name));
-            }
-        }
+        update.add(params.makeRangeTombstone(slice));
     }
 
     public static class Parsed extends ModificationStatement.Parsed
     {
         private final List<Operation.RawDeletion> deletions;
-        private final List<Relation> whereClause;
+        private WhereClause whereClause;
 
         public Parsed(CFName name,
                       Attributes.Raw attrs,
                       List<Operation.RawDeletion> deletions,
-                      List<Relation> whereClause,
+                      WhereClause whereClause,
                       List<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>> conditions,
                       boolean ifExists)
         {
-            super(name, attrs, conditions, false, ifExists);
+            super(name, StatementType.DELETE, attrs, conditions, false, ifExists);
             this.deletions = deletions;
             this.whereClause = whereClause;
         }
 
-        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+
+        @Override
+        protected ModificationStatement prepareInternal(CFMetaData cfm,
+                                                        VariableSpecifications boundNames,
+                                                        Conditions conditions,
+                                                        Attributes attrs)
         {
-            DeleteStatement stmt = new DeleteStatement(ModificationStatement.StatementType.DELETE, boundNames.size(), cfm, attrs);
+            Operations operations = new Operations(type);
 
-            for (Operation.RawDeletion deletion : deletions)
+            if (cfm.isSuper() && cfm.isDense())
             {
-                ColumnIdentifier id = deletion.affectedColumn().prepare(cfm);
-                ColumnDefinition def = cfm.getColumnDefinition(id);
-                if (def == null)
-                    throw new InvalidRequestException(String.format("Unknown identifier %s", id));
+                conditions = SuperColumnCompatibility.rebuildLWTColumnConditions(conditions, cfm, whereClause);
+                whereClause = SuperColumnCompatibility.prepareDeleteOperations(cfm, whereClause, boundNames, operations);
+            }
+            else
+            {
+                for (Operation.RawDeletion deletion : deletions)
+                {
+                    ColumnDefinition def = getColumnDefinition(cfm, deletion.affectedColumn());
 
-                // For compact, we only have one value except the key, so the only form of DELETE that make sense is without a column
-                // list. However, we support having the value name for coherence with the static/sparse case
-                if (def.isPrimaryKeyColumn())
-                    throw new InvalidRequestException(String.format("Invalid identifier %s for deletion (should not be a PRIMARY KEY part)", def.name));
+                    // For compact, we only have one value except the key, so the only form of DELETE that make sense is without a column
+                    // list. However, we support having the value name for coherence with the static/sparse case
+                    checkFalse(def.isPrimaryKeyColumn(), "Invalid identifier %s for deletion (should not be a PRIMARY KEY part)", def.name);
 
-                Operation op = deletion.prepare(cfm.ksName, def);
-                op.collectMarkerSpecification(boundNames);
-                stmt.addOperation(op);
+                    Operation op = deletion.prepare(cfm.ksName, def);
+                    op.collectMarkerSpecification(boundNames);
+                    operations.add(op);
+                }
             }
 
-            stmt.processWhereClause(whereClause, boundNames);
+            StatementRestrictions restrictions = newRestrictions(cfm,
+                                                                 boundNames,
+                                                                 operations,
+                                                                 whereClause,
+                                                                 conditions);
+
+            DeleteStatement stmt = new DeleteStatement(boundNames.size(),
+                                                       cfm,
+                                                       operations,
+                                                       restrictions,
+                                                       conditions,
+                                                       attrs);
+
+            if (stmt.hasConditions() && !restrictions.hasAllPKColumnsRestrictedByEqualities())
+            {
+                checkFalse(operations.appliesToRegularColumns(),
+                           "DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns");
+
+                // All primary keys must be specified, unless this has static column restrictions
+                checkFalse(conditions.appliesToRegularColumns(),
+                           "DELETE statements must restrict all PRIMARY KEY columns with equality relations" +
+                           " in order to use IF condition on non static columns");
+            }
+
             return stmt;
         }
     }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropAggregateStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropAggregateStatement.java
index 2d5ea70..ae8ad8c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropAggregateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropAggregateStatement.java

@@ -18,9 +18,11 @@
 package org.apache.cassandra.cql3.statements;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
 import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.functions.*;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -29,11 +31,12 @@
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 
 /**
- * A <code>DROP AGGREGATE</code> statement parsed from a CQL query.
+ * A {@code DROP AGGREGATE} statement parsed from a CQL query.
  */
 public final class DropAggregateStatement extends SchemaAlteringStatement
 {
@@ -42,8 +45,6 @@
     private final List<CQL3Type.Raw> argRawTypes;
     private final boolean argsPresent;
 
-    private Function old;
-
     public DropAggregateStatement(FunctionName functionName,
                                   List<CQL3Type.Raw> argRawTypes,
                                   boolean argsPresent,
@@ -77,15 +78,9 @@
     {
     }
 
-    public Event.SchemaChange changeEvent()
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
-        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.AGGREGATE,
-                                      old.name().keyspace, old.name().name, AbstractType.asCQLTypeStringList(old.argTypes()));
-    }
-
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
-    {
-        List<Function> olds = Functions.find(functionName);
+        Collection<Function> olds = Schema.instance.getFunctions(functionName);
 
         if (!argsPresent && olds != null && olds.size() > 1)
             throw new InvalidRequestException(String.format("'DROP AGGREGATE %s' matches multiple function definitions; " +
@@ -94,18 +89,21 @@
                                                             "'DESCRIBE AGGREGATE %s' command to find all overloads",
                                                             functionName, functionName, functionName));
 
-        List<AbstractType<?>> argTypes = new ArrayList<>(argRawTypes.size());
-        for (CQL3Type.Raw rawType : argRawTypes)
-            argTypes.add(prepareType("arguments", rawType));
-
-        Function old;
+        Function old = null;
         if (argsPresent)
         {
-            old = Functions.find(functionName, argTypes);
+            if (Schema.instance.getKSMetaData(functionName.keyspace) != null)
+            {
+                List<AbstractType<?>> argTypes = new ArrayList<>(argRawTypes.size());
+                for (CQL3Type.Raw rawType : argRawTypes)
+                    argTypes.add(prepareType("arguments", rawType));
+
+                old = Schema.instance.findFunction(functionName, argTypes).orElse(null);
+            }
             if (old == null || !(old instanceof AggregateFunction))
             {
                 if (ifExists)
-                    return false;
+                    return null;
                 // just build a nicer error message
                 StringBuilder sb = new StringBuilder();
                 for (CQL3Type.Raw rawType : argRawTypes)
@@ -120,24 +118,23 @@
         }
         else
         {
-            if (olds == null || olds.isEmpty() || !(olds.get(0) instanceof AggregateFunction))
+            if (olds == null || olds.isEmpty() || !(olds.iterator().next() instanceof AggregateFunction))
             {
                 if (ifExists)
-                    return false;
+                    return null;
                 throw new InvalidRequestException(String.format("Cannot drop non existing aggregate '%s'", functionName));
             }
-            old = olds.get(0);
+            old = olds.iterator().next();
         }
 
         if (old.isNative())
             throw new InvalidRequestException(String.format("Cannot drop aggregate '%s' because it is a " +
                                                             "native (built-in) function", functionName));
 
-        this.old = old;
-
         MigrationManager.announceAggregateDrop((UDAggregate)old, isLocalOnly);
+        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.AGGREGATE,
+                                      old.name().keyspace, old.name().name, AbstractType.asCQLTypeStringList(old.argTypes()));
 
-        return true;
     }
 
     private AbstractType<?> prepareType(String typeName, CQL3Type.Raw rawType)

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropFunctionStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropFunctionStatement.java
index edd8a65..8845a82 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropFunctionStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropFunctionStatement.java

@@ -18,25 +18,29 @@
 package org.apache.cassandra.cql3.statements;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 
 import com.google.common.base.Joiner;
 
 import org.apache.cassandra.auth.FunctionResource;
 import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.functions.*;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 
 /**
- * A <code>DROP FUNCTION</code> statement parsed from a CQL query.
+ * A {@code DROP FUNCTION} statement parsed from a CQL query.
  */
 public final class DropFunctionStatement extends SchemaAlteringStatement
 {
@@ -45,7 +49,6 @@
     private final List<CQL3Type.Raw> argRawTypes;
     private final boolean argsPresent;
 
-    private Function old;
     private List<AbstractType<?>> argTypes;
 
     public DropFunctionStatement(FunctionName functionName,
@@ -60,22 +63,26 @@
     }
 
     @Override
-    public Prepared prepare() throws InvalidRequestException
+    public Prepared prepare(ClientState clientState) throws InvalidRequestException
     {
-        argTypes = new ArrayList<>(argRawTypes.size());
-        for (CQL3Type.Raw rawType : argRawTypes)
+        if (Schema.instance.getKSMetaData(functionName.keyspace) != null)
         {
-            if (rawType.isFrozen())
-                throw new InvalidRequestException("The function arguments should not be frozen; remove the frozen<> modifier");
+            argTypes = new ArrayList<>(argRawTypes.size());
+            for (CQL3Type.Raw rawType : argRawTypes)
+            {
+                if (rawType.isFrozen())
+                    throw new InvalidRequestException("The function arguments should not be frozen; remove the frozen<> modifier");
 
-            // UDT are not supported non frozen but we do not allow the frozen keyword for argument. So for the moment we
-            // freeze them here
-            if (!rawType.canBeNonFrozen())
-                rawType.freeze();
+                // UDT are not supported non frozen but we do not allow the frozen keyword for argument. So for the moment we
+                // freeze them here
+                if (!rawType.canBeNonFrozen())
+                    rawType.freeze();
 
-            argTypes.add(rawType.prepare(functionName.keyspace).getType());
+                argTypes.add(rawType.prepare(functionName.keyspace).getType());
+            }
         }
-        return super.prepare();
+
+        return super.prepare(clientState);
     }
 
     @Override
@@ -90,7 +97,6 @@
         ThriftValidation.validateKeyspaceNotSystem(functionName.keyspace);
     }
 
-    @Override
     public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
     {
         Function function = findFunction();
@@ -110,10 +116,9 @@
         }
     }
 
-    @Override
     public void validate(ClientState state)
     {
-        List<Function> olds = Functions.find(functionName);
+        Collection<Function> olds = Schema.instance.getFunctions(functionName);
 
         if (!argsPresent && olds != null && olds.size() > 1)
             throw new InvalidRequestException(String.format("'DROP FUNCTION %s' matches multiple function definitions; " +
@@ -123,32 +128,26 @@
                                                             functionName, functionName, functionName));
     }
 
-    @Override
-    public Event.SchemaChange changeEvent()
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException
     {
-        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.FUNCTION,
-                                      old.name().keyspace, old.name().name, AbstractType.asCQLTypeStringList(old.argTypes()));
-    }
-
-    @Override
-    public boolean announceMigration(boolean isLocalOnly) throws RequestValidationException
-    {
-        old = findFunction();
+        Function old = findFunction();
         if (old == null)
         {
             if (ifExists)
-                return false;
+                return null;
             else
                 throw new InvalidRequestException(getMissingFunctionError());
         }
 
-        List<Function> references = Functions.getReferencesTo(old);
-        if (!references.isEmpty())
-            throw new InvalidRequestException(String.format("Function '%s' still referenced by %s", old, references));
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(old.name().keyspace);
+        Collection<UDAggregate> referrers = ksm.functions.aggregatesUsingFunction(old);
+        if (!referrers.isEmpty())
+            throw new InvalidRequestException(String.format("Function '%s' still referenced by %s", old, referrers));
 
         MigrationManager.announceFunctionDrop((UDFunction) old, isLocalOnly);
 
-        return true;
+        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.FUNCTION,
+                                      old.name().keyspace, old.name().name, AbstractType.asCQLTypeStringList(old.argTypes()));
     }
 
     private String getMissingFunctionError()
@@ -158,24 +157,21 @@
         sb.append(functionName);
         if (argsPresent)
             sb.append(Joiner.on(", ").join(argRawTypes));
-        sb.append("'");
+        sb.append('\'');
         return sb.toString();
     }
 
-    private String typeKeyspace(CQL3Type.Raw rawType)
-    {
-        String ks = rawType.keyspace();
-        if (ks != null)
-            return ks;
-        return functionName.keyspace;
-    }
-
     private Function findFunction()
     {
         Function old;
         if (argsPresent)
         {
-            old = Functions.find(functionName, argTypes);
+            if (argTypes == null)
+            {
+                return null;
+            }
+
+            old = Schema.instance.findFunction(functionName, argTypes).orElse(null);
             if (old == null || !(old instanceof ScalarFunction))
             {
                 return null;
@@ -183,11 +179,11 @@
         }
         else
         {
-            List<Function> olds = Functions.find(functionName);
-            if (olds == null || olds.isEmpty() || !(olds.get(0) instanceof ScalarFunction))
+            Collection<Function> olds = Schema.instance.getFunctions(functionName);
+            if (olds == null || olds.isEmpty() || !(olds.iterator().next() instanceof ScalarFunction))
                 return null;
 
-            old = olds.get(0);
+            old = olds.iterator().next();
         }
         return old;
     }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java
index 0d33e57..35aee3c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropIndexStatement.java

@@ -19,12 +19,12 @@
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.IndexName;
+import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.db.KeyspaceNotDefinedException;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.service.QueryState;
@@ -36,9 +36,6 @@
     public final String indexName;
     public final boolean ifExists;
 
-    // initialized in announceMigration()
-    private String indexedCF;
-
     public DropIndexStatement(IndexName indexName, boolean ifExists)
     {
         super(indexName.getCfName());
@@ -48,23 +45,13 @@
 
     public String columnFamily()
     {
-        if (indexedCF != null)
-            return indexedCF;
-
-        try
-        {
-            CFMetaData cfm = findIndexedCF();
-            return cfm == null ? null : cfm.cfName;
-        }
-        catch (InvalidRequestException ire)
-        {
-            throw new RuntimeException(ire);
-        }
+        CFMetaData cfm = lookupIndexedTable();
+        return cfm == null ? null : cfm.cfName;
     }
 
     public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
     {
-        CFMetaData cfm = findIndexedCF();
+        CFMetaData cfm = lookupIndexedTable();
         if (cfm == null)
             return;
 
@@ -73,70 +60,54 @@
 
     public void validate(ClientState state)
     {
-        // validated in findIndexedCf()
-    }
-
-    public Event.SchemaChange changeEvent()
-    {
-        // Dropping an index is akin to updating the CF
-        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
+        // validated in lookupIndexedTable()
     }
 
     @Override
     public ResultMessage execute(QueryState state, QueryOptions options) throws RequestValidationException
     {
-        announceMigration(false);
-        return indexedCF == null ? null : new ResultMessage.SchemaChange(changeEvent());
+        Event.SchemaChange ce = announceMigration(state, false);
+        return ce == null ? null : new ResultMessage.SchemaChange(ce);
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
     {
-        CFMetaData cfm = findIndexedCF();
+        CFMetaData cfm = lookupIndexedTable();
         if (cfm == null)
-            return false;
+            return null;
 
-        CFMetaData updatedCfm = updateCFMetadata(cfm);
-        indexedCF = updatedCfm.cfName;
+        CFMetaData updatedCfm = cfm.copy();
+        updatedCfm.indexes(updatedCfm.getIndexes().without(indexName));
         MigrationManager.announceColumnFamilyUpdate(updatedCfm, isLocalOnly);
-        return true;
+        // Dropping an index is akin to updating the CF
+        // Note that we shouldn't call columnFamily() at this point because the index has been dropped and the call to lookupIndexedTable()
+        // in that method would now throw.
+        return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, cfm.ksName, cfm.cfName);
     }
 
-    private CFMetaData updateCFMetadata(CFMetaData cfm)
+    /**
+     * The table for which the index should be dropped, or null if the index doesn't exist
+     *
+     * @return the metadata for the table containing the dropped index, or {@code null}
+     * if the index to drop cannot be found but "IF EXISTS" is set on the statement.
+     *
+     * @throws InvalidRequestException if the index cannot be found and "IF EXISTS" is not
+     * set on the statement.
+     */
+    private CFMetaData lookupIndexedTable()
     {
-        ColumnDefinition column = findIndexedColumn(cfm);
-        assert column != null;
-        CFMetaData cloned = cfm.copy();
-        ColumnDefinition toChange = cloned.getColumnDefinition(column.name);
-        assert toChange.getIndexName() != null && toChange.getIndexName().equals(indexName);
-        toChange.setIndexName(null);
-        toChange.setIndexType(null, null);
-        return cloned;
-    }
-
-    private CFMetaData findIndexedCF() throws InvalidRequestException
-    {
-        KSMetaData ksm = Schema.instance.getKSMetaData(keyspace());
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspace());
         if (ksm == null)
             throw new KeyspaceNotDefinedException("Keyspace " + keyspace() + " does not exist");
-        for (CFMetaData cfm : ksm.cfMetaData().values())
-        {
-            if (findIndexedColumn(cfm) != null)
-                return cfm;
-        }
 
-        if (ifExists)
-            return null;
-        else
-            throw new InvalidRequestException("Index '" + indexName + "' could not be found in any of the tables of keyspace '" + keyspace() + '\'');
-    }
-
-    private ColumnDefinition findIndexedColumn(CFMetaData cfm)
-    {
-        for (ColumnDefinition column : cfm.allColumns())
-        {
-            if (column.getIndexType() != null && column.getIndexName() != null && column.getIndexName().equals(indexName))
-                return column;
-        }
-        return null;
+        return ksm.findIndexedTable(indexName)
+                  .orElseGet(() -> {
+                      if (ifExists)
+                          return null;
+                      else
+                          throw new InvalidRequestException(String.format("Index '%s' could not be found in any " +
+                                                                          "of the tables of keyspace '%s'",
+                                                                          indexName, keyspace()));
+                  });
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java
index ba6b917..9ba68a6 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropKeyspaceStatement.java

@@ -24,6 +24,7 @@
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 
@@ -55,23 +56,18 @@
         return keyspace;
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws ConfigurationException
     {
         try
         {
             MigrationManager.announceKeyspaceDrop(keyspace, isLocalOnly);
-            return true;
+            return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, keyspace());
         }
         catch(ConfigurationException e)
         {
             if (ifExists)
-                return false;
+                return null;
             throw e;
         }
     }
-
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, keyspace());
-    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java
index e690c3e4..5641185 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropTableStatement.java

@@ -18,12 +18,17 @@
 package org.apache.cassandra.cql3.statements;
 
 import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CFName;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
 
 public class DropTableStatement extends SchemaAlteringStatement
@@ -54,23 +59,46 @@
         // validated in announceMigration()
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws ConfigurationException
     {
         try
         {
+            KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspace());
+            if (ksm == null)
+                throw new ConfigurationException(String.format("Cannot drop table in unknown keyspace '%s'", keyspace()));
+            CFMetaData cfm = ksm.getTableOrViewNullable(columnFamily());
+            if (cfm != null)
+            {
+                if (cfm.isView())
+                    throw new InvalidRequestException("Cannot use DROP TABLE on Materialized View");
+
+                boolean rejectDrop = false;
+                StringBuilder messageBuilder = new StringBuilder();
+                for (ViewDefinition def : ksm.views)
+                {
+                    if (def.baseTableId.equals(cfm.cfId))
+                    {
+                        if (rejectDrop)
+                            messageBuilder.append(',');
+                        rejectDrop = true;
+                        messageBuilder.append(def.viewName);
+                    }
+                }
+                if (rejectDrop)
+                {
+                    throw new InvalidRequestException(String.format("Cannot drop table when materialized views still depend on it (%s.{%s})",
+                                                                    keyspace(),
+                                                                    messageBuilder.toString()));
+                }
+            }
             MigrationManager.announceColumnFamilyDrop(keyspace(), columnFamily(), isLocalOnly);
-            return true;
+            return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
         }
         catch (ConfigurationException e)
         {
             if (ifExists)
-                return false;
+                return null;
             throw e;
         }
     }
-
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
-    }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java
index 8267b4e..162c736 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropTriggerStatement.java

@@ -27,8 +27,10 @@
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.schema.Triggers;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 
@@ -57,22 +59,22 @@
         ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws ConfigurationException, InvalidRequestException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws ConfigurationException, InvalidRequestException
     {
         CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), columnFamily()).copy();
-        if (cfm.removeTrigger(triggerName))
-        {
-            logger.info("Dropping trigger with name {}", triggerName);
-            MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
-            return true;
-        }
-        if (!ifExists)
-            throw new InvalidRequestException(String.format("Trigger %s was not found", triggerName));
-        return false;
-    }
+        Triggers triggers = cfm.getTriggers();
 
-    public Event.SchemaChange changeEvent()
-    {
+        if (!triggers.get(triggerName).isPresent())
+        {
+            if (ifExists)
+                return null;
+            else
+                throw new InvalidRequestException(String.format("Trigger %s was not found", triggerName));
+        }
+
+        logger.info("Dropping trigger with name {}", triggerName);
+        cfm.triggers(triggers.without(triggerName));
+        MigrationManager.announceColumnFamilyUpdate(cfm, isLocalOnly);
         return new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java
index 6993ea3..cd6daae 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropTypeStatement.java

@@ -21,11 +21,12 @@
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.cql3.functions.Functions;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event;
 
 public class DropTypeStatement extends SchemaAlteringStatement
@@ -35,7 +36,6 @@
 
     public DropTypeStatement(UTName name, boolean ifExists)
     {
-        super();
         this.name = name;
         this.ifExists = ifExists;
     }
@@ -54,7 +54,7 @@
 
     public void validate(ClientState state) throws RequestValidationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(name.getKeyspace());
         if (ksm == null)
         {
             if (ifExists)
@@ -63,8 +63,7 @@
                 throw new InvalidRequestException(String.format("Cannot drop type in unknown keyspace %s", name.getKeyspace()));
         }
 
-        UserType old = ksm.userTypes.getType(name.getUserTypeName());
-        if (old == null)
+        if (!ksm.types.get(name.getUserTypeName()).isPresent())
         {
             if (ifExists)
                 return;
@@ -79,72 +78,24 @@
         // we drop and 2) existing tables referencing the type (maybe in a nested
         // way).
 
-        for (Function function : Functions.all())
+        for (Function function : ksm.functions)
         {
-            if (isUsedBy(function.returnType()))
+            if (function.returnType().referencesUserType(name.getStringTypeName()))
                 throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by function %s", name, function));
+
             for (AbstractType<?> argType : function.argTypes())
-                if (isUsedBy(argType))
+                if (argType.referencesUserType(name.getStringTypeName()))
                     throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by function %s", name, function));
         }
 
-        for (KSMetaData ksm2 : Schema.instance.getKeyspaceDefinitions())
-        {
-            for (UserType ut : ksm2.userTypes.getAllTypes().values())
-            {
-                if (ut.keyspace.equals(name.getKeyspace()) && ut.name.equals(name.getUserTypeName()))
-                    continue;
-                if (isUsedBy(ut))
-                    throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by user type %s", name, ut.asCQL3Type()));
-            }
+        for (UserType ut : ksm.types)
+            if (!ut.name.equals(name.getUserTypeName()) && ut.referencesUserType(name.getStringTypeName()))
+                throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by user type %s", name, ut.getNameAsString()));
 
-            for (CFMetaData cfm : ksm2.cfMetaData().values())
-                for (ColumnDefinition def : cfm.allColumns())
-                    if (isUsedBy(def.type))
-                        throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by table %s.%s", name, cfm.ksName, cfm.cfName));
-        }
-    }
-
-    private boolean isUsedBy(AbstractType<?> toCheck) throws RequestValidationException
-    {
-        if (toCheck instanceof UserType)
-        {
-            UserType ut = (UserType)toCheck;
-            if (name.getKeyspace().equals(ut.keyspace) && name.getUserTypeName().equals(ut.name))
-                return true;
-
-            for (AbstractType<?> subtype : ut.fieldTypes())
-                if (isUsedBy(subtype))
-                    return true;
-        }
-        else if (toCheck instanceof CompositeType)
-        {
-            CompositeType ct = (CompositeType)toCheck;
-            for (AbstractType<?> subtype : ct.types)
-                if (isUsedBy(subtype))
-                    return true;
-        }
-        else if (toCheck instanceof ColumnToCollectionType)
-        {
-            for (CollectionType collection : ((ColumnToCollectionType)toCheck).defined.values())
-                if (isUsedBy(collection))
-                    return true;
-        }
-        else if (toCheck instanceof CollectionType)
-        {
-            if (toCheck instanceof ListType)
-                return isUsedBy(((ListType)toCheck).getElementsType());
-            else if (toCheck instanceof SetType)
-                return isUsedBy(((SetType)toCheck).getElementsType());
-            else
-                return isUsedBy(((MapType)toCheck).getKeysType()) || isUsedBy(((MapType)toCheck).getValuesType());
-        }
-        return false;
-    }
-
-    public Event.SchemaChange changeEvent()
-    {
-        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
+        for (CFMetaData cfm : ksm.tablesAndViews())
+            for (ColumnDefinition def : cfm.allColumns())
+                if (def.type.referencesUserType(name.getStringTypeName()))
+                    throw new InvalidRequestException(String.format("Cannot drop user type %s as it is still used by table %s.%s", name, cfm.ksName, cfm.cfName));
     }
 
     @Override
@@ -153,18 +104,18 @@
         return name.getKeyspace();
     }
 
-    public boolean announceMigration(boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(name.getKeyspace());
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(name.getKeyspace());
         if (ksm == null)
-            return false; // do not assert (otherwise IF EXISTS case fails)
+            return null; // do not assert (otherwise IF EXISTS case fails)
 
-        UserType toDrop = ksm.userTypes.getType(name.getUserTypeName());
+        UserType toDrop = ksm.types.getNullable(name.getUserTypeName());
         // Can be null with ifExists
         if (toDrop == null)
-            return false;
+            return null;
 
         MigrationManager.announceTypeDrop(toDrop, isLocalOnly);
-        return true;
+        return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TYPE, keyspace(), name.getStringTypeName());
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/DropViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropViewStatement.java
new file mode 100644
index 0000000..b65c1b0
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/DropViewStatement.java

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.cql3.CFName;
+import org.apache.cassandra.db.view.View;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.transport.Event;
+
+public class DropViewStatement extends SchemaAlteringStatement
+{
+    public final boolean ifExists;
+
+    public DropViewStatement(CFName cf, boolean ifExists)
+    {
+        super(cf);
+        this.ifExists = ifExists;
+    }
+
+    public void checkAccess(ClientState state) throws UnauthorizedException, InvalidRequestException
+    {
+        CFMetaData baseTable = View.findBaseTable(keyspace(), columnFamily());
+        if (baseTable != null)
+            state.hasColumnFamilyAccess(keyspace(), baseTable.cfName, Permission.ALTER);
+    }
+
+    public void validate(ClientState state)
+    {
+        // validated in findIndexedCf()
+    }
+
+    public Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws InvalidRequestException, ConfigurationException
+    {
+        try
+        {
+//            ViewDefinition view = Schema.instance.getViewDefinition(keyspace(), columnFamily());
+//            if (view == null)
+//            {
+//                if (Schema.instance.getCFMetaData(keyspace(), columnFamily()) != null)
+//                    throw new ConfigurationException(String.format("Cannot drop table '%s' in keyspace '%s'.", columnFamily(), keyspace()));
+//
+//                throw new ConfigurationException(String.format("Cannot drop non existing materialized view '%s' in keyspace '%s'.", columnFamily(), keyspace()));
+//            }
+//
+//            CFMetaData baseCfm = Schema.instance.getCFMetaData(view.baseTableId);
+//            if (baseCfm == null)
+//            {
+//                if (ifExists)
+//                    throw new ConfigurationException(String.format("Cannot drop materialized view '%s' in keyspace '%s' without base CF.", columnFamily(), keyspace()));
+//                else
+//                    throw new InvalidRequestException(String.format("View '%s' could not be found in any of the tables of keyspace '%s'", cfName, keyspace()));
+//            }
+
+            MigrationManager.announceViewDrop(keyspace(), columnFamily(), isLocalOnly);
+            return new Event.SchemaChange(Event.SchemaChange.Change.DROPPED, Event.SchemaChange.Target.TABLE, keyspace(), columnFamily());
+        }
+        catch (ConfigurationException e)
+        {
+            if (ifExists)
+                return null;
+            throw e;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/IndexPropDefs.java b/src/java/org/apache/cassandra/cql3/statements/IndexPropDefs.java
index 6790611..b8ce7ec 100644
--- a/src/java/org/apache/cassandra/cql3/statements/IndexPropDefs.java
+++ b/src/java/org/apache/cassandra/cql3/statements/IndexPropDefs.java

@@ -19,8 +19,9 @@
 
 import java.util.*;
 
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.SyntaxException;
 
 public class IndexPropDefs extends PropertyDefinitions
 {
@@ -50,9 +51,14 @@
         if (!isCustom && !properties.isEmpty())
             throw new InvalidRequestException("Cannot specify options for a non-CUSTOM index");
 
-        if (getRawOptions().containsKey(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME))
+        if (getRawOptions().containsKey(IndexTarget.CUSTOM_INDEX_OPTION_NAME))
             throw new InvalidRequestException(String.format("Cannot specify %s as a CUSTOM option",
-                                                            SecondaryIndex.CUSTOM_INDEX_OPTION_NAME));
+                                                            IndexTarget.CUSTOM_INDEX_OPTION_NAME));
+
+        if (getRawOptions().containsKey(IndexTarget.TARGET_OPTION_NAME))
+            throw new InvalidRequestException(String.format("Cannot specify %s as a CUSTOM option",
+                                                            IndexTarget.TARGET_OPTION_NAME));
+
     }
 
     public Map<String, String> getRawOptions() throws SyntaxException
@@ -64,7 +70,7 @@
     public Map<String, String> getOptions() throws SyntaxException
     {
         Map<String, String> options = new HashMap<>(getRawOptions());
-        options.put(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, customClass);
+        options.put(IndexTarget.CUSTOM_INDEX_OPTION_NAME, customClass);
         return options;
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/IndexTarget.java b/src/java/org/apache/cassandra/cql3/statements/IndexTarget.java
index d602388..8cdf2c8 100644
--- a/src/java/org/apache/cassandra/cql3/statements/IndexTarget.java
+++ b/src/java/org/apache/cassandra/cql3/statements/IndexTarget.java

@@ -17,64 +17,111 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.util.Map;
+import java.util.regex.Pattern;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 
 public class IndexTarget
 {
-    public final ColumnIdentifier column;
-    public final TargetType type;
+    public static final String TARGET_OPTION_NAME = "target";
+    public static final String CUSTOM_INDEX_OPTION_NAME = "class_name";
 
-    private IndexTarget(ColumnIdentifier column, TargetType type)
+    /**
+     * The name of the option used to specify that the index is on the collection keys.
+     */
+    public static final String INDEX_KEYS_OPTION_NAME = "index_keys";
+
+    /**
+     * The name of the option used to specify that the index is on the collection (map) entries.
+     */
+    public static final String INDEX_ENTRIES_OPTION_NAME = "index_keys_and_values";
+
+    /**
+     * Regex for *unquoted* column names, anything which does not match this pattern must be a quoted name
+     */
+    private static final Pattern COLUMN_IDENTIFIER_PATTERN = Pattern.compile("[a-z_0-9]+");
+
+    public final ColumnIdentifier column;
+    public final boolean quoteName;
+    public final Type type;
+
+    public IndexTarget(ColumnIdentifier column, Type type)
     {
         this.column = column;
         this.type = type;
+
+        // if the column name contains anything other than lower case alphanumerics
+        // or underscores, then it must be quoted when included in the target string
+        quoteName = !COLUMN_IDENTIFIER_PATTERN.matcher(column.toString()).matches();
+    }
+
+    public String asCqlString(CFMetaData cfm)
+    {
+        if (!cfm.getColumnDefinition(column).type.isCollection())
+            return column.toCQLString();
+
+        return String.format("%s(%s)", type.toString(), column.toCQLString());
     }
 
     public static class Raw
     {
         private final ColumnIdentifier.Raw column;
-        private final TargetType type;
+        private final Type type;
 
-        private Raw(ColumnIdentifier.Raw column, TargetType type)
+        private Raw(ColumnIdentifier.Raw column, Type type)
         {
             this.column = column;
             this.type = type;
         }
 
+        public static Raw simpleIndexOn(ColumnIdentifier.Raw c)
+        {
+            return new Raw(c, Type.SIMPLE);
+        }
+
         public static Raw valuesOf(ColumnIdentifier.Raw c)
         {
-            return new Raw(c, TargetType.VALUES);
+            return new Raw(c, Type.VALUES);
         }
 
         public static Raw keysOf(ColumnIdentifier.Raw c)
         {
-            return new Raw(c, TargetType.KEYS);
+            return new Raw(c, Type.KEYS);
         }
 
         public static Raw keysAndValuesOf(ColumnIdentifier.Raw c)
         {
-            return new Raw(c, TargetType.KEYS_AND_VALUES);
+            return new Raw(c, Type.KEYS_AND_VALUES);
         }
 
         public static Raw fullCollection(ColumnIdentifier.Raw c)
         {
-            return new Raw(c, TargetType.FULL);
+            return new Raw(c, Type.FULL);
         }
 
         public IndexTarget prepare(CFMetaData cfm)
         {
-            return new IndexTarget(column.prepare(cfm), type);
+            // Until we've prepared the target column, we can't be certain about the target type
+            // because (for backwards compatibility) an index on a collection's values uses the
+            // same syntax as an index on a regular column (i.e. the 'values' in
+            // 'CREATE INDEX on table(values(collection));' is optional). So we correct the target type
+            // when the target column is a collection & the target type is SIMPLE.
+            ColumnIdentifier colId = column.prepare(cfm);
+            ColumnDefinition columnDef = cfm.getColumnDefinition(colId);
+            if (columnDef == null)
+                throw new InvalidRequestException("No column definition found for column " + colId);
+
+            Type actualType = (type == Type.SIMPLE && columnDef.type.isCollection()) ? Type.VALUES : type;
+            return new IndexTarget(colId, actualType);
         }
     }
 
-    public static enum TargetType
+    public static enum Type
     {
-        VALUES, KEYS, KEYS_AND_VALUES, FULL;
+        VALUES, KEYS, KEYS_AND_VALUES, FULL, SIMPLE;
 
         public String toString()
         {
@@ -83,32 +130,26 @@
                 case KEYS: return "keys";
                 case KEYS_AND_VALUES: return "entries";
                 case FULL: return "full";
-                default: return "values";
+                case VALUES: return "values";
+                case SIMPLE: return "";
+                default: return "";
             }
         }
 
-        public String indexOption()
+        public static Type fromString(String s)
         {
-            switch (this)
-            {
-                case KEYS: return SecondaryIndex.INDEX_KEYS_OPTION_NAME;
-                case KEYS_AND_VALUES: return SecondaryIndex.INDEX_ENTRIES_OPTION_NAME;
-                case VALUES: return SecondaryIndex.INDEX_VALUES_OPTION_NAME;
-                default: throw new AssertionError();
-            }
-        }
-
-        public static TargetType fromColumnDefinition(ColumnDefinition cd)
-        {
-            Map<String, String> options = cd.getIndexOptions();
-            if (options.containsKey(SecondaryIndex.INDEX_KEYS_OPTION_NAME))
-                return KEYS;
-            else if (options.containsKey(SecondaryIndex.INDEX_ENTRIES_OPTION_NAME))
-                return KEYS_AND_VALUES;
-            else if (cd.type.isCollection() && !cd.type.isMultiCell())
-                return FULL;
-            else
+            if ("".equals(s))
+                return SIMPLE;
+            else if ("values".equals(s))
                 return VALUES;
+            else if ("keys".equals(s))
+                return KEYS;
+            else if ("entries".equals(s))
+                return KEYS_AND_VALUES;
+            else if ("full".equals(s))
+                return FULL;
+
+            throw new AssertionError("Unrecognized index target type " + s);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/KSPropDefs.java b/src/java/org/apache/cassandra/cql3/statements/KSPropDefs.java
deleted file mode 100644
index 7c05435..0000000
--- a/src/java/org/apache/cassandra/cql3/statements/KSPropDefs.java
+++ /dev/null

@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3.statements;
-
-import java.util.*;
-
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.*;
-
-public class KSPropDefs extends PropertyDefinitions
-{
-    public static final String KW_DURABLE_WRITES = "durable_writes";
-    public static final String KW_REPLICATION = "replication";
-
-    public static final String REPLICATION_STRATEGY_CLASS_KEY = "class";
-
-    public static final Set<String> keywords = new HashSet<>();
-    public static final Set<String> obsoleteKeywords = new HashSet<>();
-
-    static
-    {
-        keywords.add(KW_DURABLE_WRITES);
-        keywords.add(KW_REPLICATION);
-    }
-
-    private String strategyClass;
-
-    public void validate() throws SyntaxException
-    {
-        // Skip validation if the strategy class is already set as it means we've alreayd
-        // prepared (and redoing it would set strategyClass back to null, which we don't want)
-        if (strategyClass != null)
-            return;
-
-        validate(keywords, obsoleteKeywords);
-
-        Map<String, String> replicationOptions = getReplicationOptions();
-        if (!replicationOptions.isEmpty())
-        {
-            strategyClass = replicationOptions.get(REPLICATION_STRATEGY_CLASS_KEY);
-            replicationOptions.remove(REPLICATION_STRATEGY_CLASS_KEY);
-        }
-    }
-
-    public Map<String, String> getReplicationOptions() throws SyntaxException
-    {
-        Map<String, String> replicationOptions = getMap(KW_REPLICATION);
-        if (replicationOptions == null)
-            return Collections.emptyMap();
-        return replicationOptions;
-    }
-
-    public String getReplicationStrategyClass()
-    {
-        return strategyClass;
-    }
-
-    public KSMetaData asKSMetadata(String ksName) throws RequestValidationException
-    {
-        return KSMetaData.newKeyspace(ksName, getReplicationStrategyClass(), getReplicationOptions(), getBoolean(KW_DURABLE_WRITES, true));
-    }
-
-    public KSMetaData asKSMetadataUpdate(KSMetaData old) throws RequestValidationException
-    {
-        String sClass = strategyClass;
-        Map<String, String> sOptions = getReplicationOptions();
-        if (sClass == null)
-        {
-            sClass = old.strategyClass.getName();
-            sOptions = old.strategyOptions;
-        }
-        return KSMetaData.newKeyspace(old.name, sClass, sOptions, getBoolean(KW_DURABLE_WRITES, old.durableWrites));
-    }
-}

diff --git a/src/java/org/apache/cassandra/cql3/statements/KeyspaceAttributes.java b/src/java/org/apache/cassandra/cql3/statements/KeyspaceAttributes.java
new file mode 100644
index 0000000..db6b0d6
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/KeyspaceAttributes.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.util.*;
+
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.KeyspaceParams.Option;
+import org.apache.cassandra.schema.ReplicationParams;
+
+public final class KeyspaceAttributes extends PropertyDefinitions
+{
+    private static final Set<String> validKeywords;
+    private static final Set<String> obsoleteKeywords;
+
+    static
+    {
+        ImmutableSet.Builder<String> validBuilder = ImmutableSet.builder();
+        for (Option option : Option.values())
+            validBuilder.add(option.toString());
+        validKeywords = validBuilder.build();
+        obsoleteKeywords = ImmutableSet.of();
+    }
+
+    public void validate()
+    {
+        validate(validKeywords, obsoleteKeywords);
+    }
+
+    public String getReplicationStrategyClass()
+    {
+        return getAllReplicationOptions().get(ReplicationParams.CLASS);
+    }
+
+    public Map<String, String> getReplicationOptions()
+    {
+        Map<String, String> replication = new HashMap<>(getAllReplicationOptions());
+        replication.remove(ReplicationParams.CLASS);
+        return replication;
+    }
+
+    public Map<String, String> getAllReplicationOptions()
+    {
+        Map<String, String> replication = getMap(Option.REPLICATION.toString());
+        return replication == null
+             ? Collections.emptyMap()
+             : replication;
+    }
+
+    public KeyspaceParams asNewKeyspaceParams()
+    {
+        boolean durableWrites = getBoolean(Option.DURABLE_WRITES.toString(), KeyspaceParams.DEFAULT_DURABLE_WRITES);
+        return KeyspaceParams.create(durableWrites, getAllReplicationOptions());
+    }
+
+    public KeyspaceParams asAlteredKeyspaceParams(KeyspaceParams previous)
+    {
+        boolean durableWrites = getBoolean(Option.DURABLE_WRITES.toString(), previous.durableWrites);
+        ReplicationParams replication = getReplicationStrategyClass() == null
+                                      ? previous.replication
+                                      : ReplicationParams.fromMap(getAllReplicationOptions());
+        return new KeyspaceParams(durableWrites, replication);
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index 293bc07..65fa948 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java

@@ -21,23 +21,24 @@
 import java.util.*;
 
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.ColumnIdentifier.Raw;
 import org.apache.cassandra.cql3.functions.Function;
-import org.apache.cassandra.cql3.restrictions.Restriction;
-import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
 import org.apache.cassandra.cql3.selection.Selection;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CompositesBuilder;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
+import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
@@ -46,80 +47,121 @@
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.triggers.TriggerExecutor;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.UUIDGen;
+
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
-import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkNull;
 
 /*
  * Abstract parent class of individual modifications, i.e. INSERT, UPDATE and DELETE.
  */
 public abstract class ModificationStatement implements CQLStatement
 {
+    protected static final Logger logger = LoggerFactory.getLogger(ModificationStatement.class);
+
+    public static final String CUSTOM_EXPRESSIONS_NOT_ALLOWED =
+        "Custom index expressions cannot be used in WHERE clauses for UPDATE or DELETE statements";
+
     private static final ColumnIdentifier CAS_RESULT_COLUMN = new ColumnIdentifier("[applied]", false);
 
-    public static enum StatementType { INSERT, UPDATE, DELETE }
-    public final StatementType type;
+    protected final StatementType type;
 
     private final int boundTerms;
     public final CFMetaData cfm;
-    public final Attributes attrs;
+    private final Attributes attrs;
 
-    protected final Map<ColumnIdentifier, Restriction> processedKeys = new HashMap<>();
-    private final List<Operation> columnOperations = new ArrayList<Operation>();
+    private final StatementRestrictions restrictions;
 
-    // Separating normal and static conditions makes things somewhat easier
-    private List<ColumnCondition> columnConditions;
-    private List<ColumnCondition> staticConditions;
-    private boolean ifNotExists;
-    private boolean ifExists;
+    private final Operations operations;
 
-    private boolean hasNoClusteringColumns = true;
+    private final PartitionColumns updatedColumns;
 
-    private boolean setsStaticColumns;
-    private boolean setsRegularColumns;
+    private final Conditions conditions;
 
-    private final com.google.common.base.Function<ColumnCondition, ColumnDefinition> getColumnForCondition =
-      new com.google.common.base.Function<ColumnCondition, ColumnDefinition>()
-    {
-        public ColumnDefinition apply(ColumnCondition cond)
-        {
-            return cond.column;
-        }
-    };
+    private final PartitionColumns conditionColumns;
 
-    public ModificationStatement(StatementType type, int boundTerms, CFMetaData cfm, Attributes attrs)
+    private final PartitionColumns requiresRead;
+
+    public ModificationStatement(StatementType type,
+                                 int boundTerms,
+                                 CFMetaData cfm,
+                                 Operations operations,
+                                 StatementRestrictions restrictions,
+                                 Conditions conditions,
+                                 Attributes attrs)
     {
         this.type = type;
         this.boundTerms = boundTerms;
         this.cfm = cfm;
+        this.restrictions = restrictions;
+        this.operations = operations;
+        this.conditions = conditions;
         this.attrs = attrs;
+
+        if (!conditions.isEmpty())
+        {
+            checkFalse(cfm.isCounter(), "Conditional updates are not supported on counter tables");
+            checkFalse(attrs.isTimestampSet(), "Cannot provide custom timestamp for conditional updates");
+        }
+
+        PartitionColumns.Builder conditionColumnsBuilder = PartitionColumns.builder();
+        Iterable<ColumnDefinition> columns = conditions.getColumns();
+        if (columns != null)
+            conditionColumnsBuilder.addAll(columns);
+
+        PartitionColumns.Builder updatedColumnsBuilder = PartitionColumns.builder();
+        PartitionColumns.Builder requiresReadBuilder = PartitionColumns.builder();
+        for (Operation operation : operations)
+        {
+            updatedColumnsBuilder.add(operation.column);
+            // If the operation requires a read-before-write and we're doing a conditional read, we want to read
+            // the affected column as part of the read-for-conditions paxos phase (see #7499).
+            if (operation.requiresRead())
+            {
+                conditionColumnsBuilder.add(operation.column);
+                requiresReadBuilder.add(operation.column);
+            }
+        }
+
+        PartitionColumns modifiedColumns = updatedColumnsBuilder.build();
+        // Compact tables have not row marker. So if we don't actually update any particular column,
+        // this means that we're only updating the PK, which we allow if only those were declared in
+        // the definition. In that case however, we do went to write the compactValueColumn (since again
+        // we can't use a "row marker") so add it automatically.
+        if (cfm.isCompactTable() && modifiedColumns.isEmpty() && updatesRegularRows())
+            modifiedColumns = cfm.partitionColumns();
+
+        this.updatedColumns = modifiedColumns;
+        this.conditionColumns = conditionColumnsBuilder.build();
+        this.requiresRead = requiresReadBuilder.build();
+    }
+
+    public StatementRestrictions getRestrictions()
+    {
+        return restrictions;
     }
 
     public Iterable<Function> getFunctions()
     {
-        List<Iterable<Function>> iterables = new LinkedList<>();
-        for (Restriction restriction : processedKeys.values())
-            iterables.add(restriction.getFunctions());
-
-        if (columnOperations != null)
-            for (Operation operation : columnOperations)
-                iterables.add(operation.getFunctions());
-
-        if (columnConditions != null)
-            for (ColumnCondition condition : columnConditions)
-                iterables.add(condition.getFunctions());
-
-        if (staticConditions != null)
-            for (ColumnCondition condition : staticConditions)
-                iterables.add(condition.getFunctions());
-
-        return Iterables.concat(iterables);
+        List<Function> functions = new ArrayList<>();
+        addFunctionsTo(functions);
+        return functions;
     }
 
-    public abstract boolean requireFullClusteringKey();
-    public abstract void addUpdateForKey(ColumnFamily updates, ByteBuffer key, Composite prefix, UpdateParameters params) throws InvalidRequestException;
+    public void addFunctionsTo(List<Function> functions)
+    {
+        attrs.addFunctionsTo(functions);
+        restrictions.addFunctionsTo(functions);
+        operations.addFunctionsTo(functions);
+        conditions.addFunctionsTo(functions);
+    }
+
+    public abstract void addUpdateForKey(PartitionUpdate update, Clustering clustering, UpdateParameters params);
+
+    public abstract void addUpdateForKey(PartitionUpdate update, Slice slice, UpdateParameters params);
 
     public int getBoundTerms()
     {
@@ -141,6 +183,11 @@
         return cfm.isCounter();
     }
 
+    public boolean isView()
+    {
+        return cfm.isView();
+    }
+
     public long getTimestamp(long now, QueryOptions options) throws InvalidRequestException
     {
         return attrs.getTimestamp(now, options);
@@ -164,298 +211,139 @@
         if (hasConditions())
             state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.SELECT);
 
+        // MV updates need to get the current state from the table, and might update the views
+        // Require Permission.SELECT on the base table, and Permission.MODIFY on the views
+        Iterator<ViewDefinition> views = View.findAll(keyspace(), columnFamily()).iterator();
+        if (views.hasNext())
+        {
+            state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.SELECT);
+            do
+            {
+                state.hasColumnFamilyAccess(keyspace(), views.next().viewName, Permission.MODIFY);
+            } while (views.hasNext());
+        }
+
         for (Function function : getFunctions())
             state.ensureHasPermission(Permission.EXECUTE, function);
     }
 
     public void validate(ClientState state) throws InvalidRequestException
     {
-        if (hasConditions() && attrs.isTimestampSet())
-            throw new InvalidRequestException("Cannot provide custom timestamp for conditional updates");
-
-        if (isCounter() && attrs.isTimestampSet())
-            throw new InvalidRequestException("Cannot provide custom timestamp for counter updates");
-
-        if (isCounter() && attrs.isTimeToLiveSet())
-            throw new InvalidRequestException("Cannot provide custom TTL for counter updates");
+        checkFalse(hasConditions() && attrs.isTimestampSet(), "Cannot provide custom timestamp for conditional updates");
+        checkFalse(isCounter() && attrs.isTimestampSet(), "Cannot provide custom timestamp for counter updates");
+        checkFalse(isCounter() && attrs.isTimeToLiveSet(), "Cannot provide custom TTL for counter updates");
+        checkFalse(isView(), "Cannot directly modify a materialized view");
     }
 
-    public void addOperation(Operation op)
+    public PartitionColumns updatedColumns()
     {
-        if (op.column.isStatic())
-            setsStaticColumns = true;
-        else
-            setsRegularColumns = true;
-        columnOperations.add(op);
+        return updatedColumns;
     }
 
-    public List<Operation> getOperations()
+    public PartitionColumns conditionColumns()
     {
-        return columnOperations;
+        return conditionColumns;
+    }
+
+    public boolean updatesRegularRows()
+    {
+        // We're updating regular rows if all the clustering columns are provided.
+        // Note that the only case where we're allowed not to provide clustering
+        // columns is if we set some static columns, and in that case no clustering
+        // columns should be given. So in practice, it's enough to check if we have
+        // either the table has no clustering or if it has at least one of them set.
+        return cfm.clusteringColumns().isEmpty() || restrictions.hasClusteringColumnsRestriction();
+    }
+
+    public boolean updatesStaticRow()
+    {
+        return operations.appliesToStaticColumns();
+    }
+
+    public List<Operation> getRegularOperations()
+    {
+        return operations.regularOperations();
+    }
+
+    public List<Operation> getStaticOperations()
+    {
+        return operations.staticOperations();
+    }
+
+    public Iterable<Operation> allOperations()
+    {
+        return operations;
     }
 
     public Iterable<ColumnDefinition> getColumnsWithConditions()
     {
-        if (ifNotExists || ifExists)
-            return null;
-
-        return Iterables.concat(columnConditions == null ? Collections.<ColumnDefinition>emptyList() : Iterables.transform(columnConditions, getColumnForCondition),
-                                staticConditions == null ? Collections.<ColumnDefinition>emptyList() : Iterables.transform(staticConditions, getColumnForCondition));
-    }
-
-    public void addCondition(ColumnCondition cond)
-    {
-        List<ColumnCondition> conds = null;
-        if (cond.column.isStatic())
-        {
-            setsStaticColumns = true;
-            if (staticConditions == null)
-                staticConditions = new ArrayList<ColumnCondition>();
-            conds = staticConditions;
-        }
-        else
-        {
-            setsRegularColumns = true;
-            if (columnConditions == null)
-                columnConditions = new ArrayList<ColumnCondition>();
-            conds = columnConditions;
-        }
-        conds.add(cond);
-    }
-
-    public void setIfNotExistCondition()
-    {
-        ifNotExists = true;
+         return conditions.getColumns();
     }
 
     public boolean hasIfNotExistCondition()
     {
-        return ifNotExists;
-    }
-
-    public void setIfExistCondition()
-    {
-        ifExists = true;
+        return conditions.isIfNotExists();
     }
 
     public boolean hasIfExistCondition()
     {
-        return ifExists;
-    }
-
-    public boolean hasStaticConditions()
-    {
-        return staticConditions != null && !staticConditions.isEmpty();
-    }
-
-    public boolean hasRegularConditions()
-    {
-        return columnConditions != null && !columnConditions.isEmpty();
-    }
-
-    private void addKeyValues(ColumnDefinition def, Restriction values) throws InvalidRequestException
-    {
-        if (def.kind == ColumnDefinition.Kind.CLUSTERING_COLUMN)
-            hasNoClusteringColumns = false;
-        if (processedKeys.put(def.name, values) != null)
-            throw new InvalidRequestException(String.format("Multiple definitions found for PRIMARY KEY part %s", def.name));
-    }
-
-    public void addKeyValue(ColumnDefinition def, Term value) throws InvalidRequestException
-    {
-        addKeyValues(def, new SingleColumnRestriction.EQ(def, value));
-    }
-
-    public void processWhereClause(List<Relation> whereClause, VariableSpecifications names) throws InvalidRequestException
-    {
-        for (Relation relation : whereClause)
-        {
-            if (relation.isMultiColumn())
-            {
-                throw new InvalidRequestException(
-                        String.format("Multi-column relations cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", relation));
-            }
-
-            if (relation.onToken())
-            {
-                throw new InvalidRequestException(String.format("Token relations cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", relation));
-            }
-
-            SingleColumnRelation rel = (SingleColumnRelation) relation;
-
-            if (rel.onToken())
-                throw new InvalidRequestException(String.format("The token function cannot be used in WHERE clauses for UPDATE and DELETE statements: %s", relation));
-
-            ColumnIdentifier id = rel.getEntity().prepare(cfm);
-            ColumnDefinition def = cfm.getColumnDefinition(id);
-            if (def == null)
-                throw new InvalidRequestException(String.format("Unknown key identifier %s", id));
-
-            switch (def.kind)
-            {
-                case PARTITION_KEY:
-                case CLUSTERING_COLUMN:
-                    Restriction restriction;
-
-                    if (rel.isEQ() || (def.isPartitionKey() && rel.isIN()))
-                    {
-                        restriction = rel.toRestriction(cfm, names);
-                    }
-                    else
-                    {
-                        throw new InvalidRequestException(String.format("Invalid operator %s for PRIMARY KEY part %s", rel.operator(), def.name));
-                    }
-
-                    addKeyValues(def, restriction);
-                    break;
-                default:
-                    throw new InvalidRequestException(String.format("Non PRIMARY KEY %s found in where clause", def.name));
-            }
-        }
+        return conditions.isIfExists();
     }
 
     public List<ByteBuffer> buildPartitionKeyNames(QueryOptions options)
     throws InvalidRequestException
     {
-        CompositesBuilder keyBuilder = new CompositesBuilder(cfm.getKeyValidatorAsCType());
-        for (ColumnDefinition def : cfm.partitionKeyColumns())
-        {
-            Restriction r = checkNotNull(processedKeys.get(def.name), "Missing mandatory PRIMARY KEY part %s", def.name);
-            r.appendTo(cfm, keyBuilder, options);
-        }
+        List<ByteBuffer> partitionKeys = restrictions.getPartitionKeys(options);
+        for (ByteBuffer key : partitionKeys)
+            QueryProcessor.validateKey(key);
 
-        return Lists.transform(filterAndSort(keyBuilder.build()), new com.google.common.base.Function<Composite, ByteBuffer>()
-        {
-            @Override
-            public ByteBuffer apply(Composite composite)
-            {
-                ByteBuffer byteBuffer = composite.toByteBuffer();
-                ThriftValidation.validateKey(cfm, byteBuffer);
-                return byteBuffer;
-            }
-        });
+        return partitionKeys;
     }
 
-    public Composite createClusteringPrefix(QueryOptions options)
+    public NavigableSet<Clustering> createClustering(QueryOptions options)
     throws InvalidRequestException
     {
-        // If the only updated/deleted columns are static, then we don't need clustering columns.
-        // And in fact, unless it is an INSERT, we reject if clustering colums are provided as that
-        // suggest something unintended. For instance, given:
-        //   CREATE TABLE t (k int, v int, s int static, PRIMARY KEY (k, v))
-        // it can make sense to do:
-        //   INSERT INTO t(k, v, s) VALUES (0, 1, 2)
-        // but both
-        //   UPDATE t SET s = 3 WHERE k = 0 AND v = 1
-        //   DELETE v FROM t WHERE k = 0 AND v = 1
-        // sounds like you don't really understand what your are doing.
-        if (appliesOnlyToStaticColumns())
-        {
-            // If we set no non-static columns, then it's fine not to have clustering columns
-            if (hasNoClusteringColumns)
-                return cfm.comparator.staticPrefix();
+        if (appliesOnlyToStaticColumns() && !restrictions.hasClusteringColumnsRestriction())
+            return FBUtilities.singleton(CBuilder.STATIC_BUILDER.build(), cfm.comparator);
 
-            // If we do have clustering columns however, then either it's an INSERT and the query is valid
-            // but we still need to build a proper prefix, or it's not an INSERT, and then we want to reject
-            // (see above)
-            if (type != StatementType.INSERT)
-            {
-                for (ColumnDefinition def : cfm.clusteringColumns())
-                    if (processedKeys.get(def.name) != null)
-                        throw new InvalidRequestException(String.format("Invalid restriction on clustering column %s since the %s statement modifies only static columns", def.name, type));
-                // we should get there as it contradicts hasNoClusteringColumns == false
-                throw new AssertionError();
-            }
-        }
-
-        return createClusteringPrefixBuilderInternal(options);
+        return restrictions.getClusteringColumns(options);
     }
 
     /**
      * Checks that the modification only apply to static columns.
      * @return <code>true</code> if the modification only apply to static columns, <code>false</code> otherwise.
      */
-    protected boolean appliesOnlyToStaticColumns()
+    private boolean appliesOnlyToStaticColumns()
     {
-        return setsStaticColumns && !appliesToRegularColumns();
+        return appliesOnlyToStaticColumns(operations, conditions);
     }
 
     /**
-     * Checks that the modification apply to regular columns.
-     * @return <code>true</code> if the modification apply to regular columns, <code>false</code> otherwise.
+     * Checks that the specified operations and conditions only apply to static columns.
+     * @return <code>true</code> if the specified operations and conditions only apply to static columns,
+     * <code>false</code> otherwise.
      */
-    protected boolean appliesToRegularColumns()
+    public static boolean appliesOnlyToStaticColumns(Operations operation, Conditions conditions)
     {
-        // If we have regular operations, this applies to regular columns.
-        // Otherwise, if the statement is a DELETE and columnOperations is empty, this means we have no operations,
-        // which for a DELETE means a full row deletion. Which means the operation applies to all columns and regular ones in particular.
-        return setsRegularColumns || (type == StatementType.DELETE && columnOperations.isEmpty());
-    }
-
-    private Composite createClusteringPrefixBuilderInternal(QueryOptions options)
-    throws InvalidRequestException
-    {
-        CompositesBuilder builder = new CompositesBuilder(cfm.comparator);
-        ColumnDefinition firstEmptyKey = null;
-        for (ColumnDefinition def : cfm.clusteringColumns())
-        {
-            Restriction r = processedKeys.get(def.name);
-            if (r == null)
-            {
-                firstEmptyKey = def;
-                checkFalse(requireFullClusteringKey() && !cfm.comparator.isDense() && cfm.comparator.isCompound(), 
-                           "Missing mandatory PRIMARY KEY part %s", def.name);
-            }
-            else if (firstEmptyKey != null)
-            {
-                throw invalidRequest("Missing PRIMARY KEY part %s since %s is set", firstEmptyKey.name, def.name);
-            }
-            else
-            {
-                r.appendTo(cfm, builder, options);
-            }
-        }
-        return builder.build().get(0); // We only allow IN for row keys so far
-    }
-
-    /**
-     * Removes duplicates and sort the specified composites.
-     *
-     * @param composites the composites to filter and sort
-     * @return the composites sorted and without duplicates
-     */
-    private List<Composite> filterAndSort(List<Composite> composites)
-    {
-        if (composites.size() <= 1)
-            return composites;
-
-        TreeSet<Composite> set = new TreeSet<Composite>(cfm.getKeyValidatorAsCType());
-        set.addAll(composites);
-        return new ArrayList<>(set);
-    }
-
-    protected ColumnDefinition getFirstEmptyKey()
-    {
-        for (ColumnDefinition def : cfm.clusteringColumns())
-        {
-            if (processedKeys.get(def.name) == null)
-                return def;
-        }
-        return null;
+        return !operation.appliesToRegularColumns() && !conditions.appliesToRegularColumns()
+                && (operation.appliesToStaticColumns() || conditions.appliesToStaticColumns());
     }
 
     public boolean requiresRead()
     {
         // Lists SET operation incurs a read.
-        for (Operation op : columnOperations)
+        for (Operation op : allOperations())
             if (op.requiresRead())
                 return true;
 
         return false;
     }
 
-    protected Map<ByteBuffer, CQL3Row> readRequiredRows(Collection<ByteBuffer> partitionKeys, Composite clusteringPrefix, boolean local, ConsistencyLevel cl)
-    throws RequestExecutionException, RequestValidationException
+    private Map<DecoratedKey, Partition> readRequiredLists(Collection<ByteBuffer> partitionKeys,
+                                                           ClusteringIndexFilter filter,
+                                                           DataLimits limits,
+                                                           boolean local,
+                                                           ConsistencyLevel cl)
     {
         if (!requiresRead())
             return null;
@@ -469,35 +357,41 @@
             throw new InvalidRequestException(String.format("Write operation require a read but consistency %s is not supported on reads", cl));
         }
 
-        ColumnSlice[] slices = new ColumnSlice[]{ clusteringPrefix.slice() };
-        List<ReadCommand> commands = new ArrayList<ReadCommand>(partitionKeys.size());
-        long now = System.currentTimeMillis();
+        List<SinglePartitionReadCommand> commands = new ArrayList<>(partitionKeys.size());
+        int nowInSec = FBUtilities.nowInSeconds();
         for (ByteBuffer key : partitionKeys)
-            commands.add(new SliceFromReadCommand(keyspace(),
-                                                  key,
-                                                  columnFamily(),
-                                                  now,
-                                                  new SliceQueryFilter(slices, false, Integer.MAX_VALUE)));
+            commands.add(SinglePartitionReadCommand.create(cfm,
+                                                           nowInSec,
+                                                           ColumnFilter.selection(this.requiresRead),
+                                                           RowFilter.NONE,
+                                                           limits,
+                                                           cfm.decorateKey(key),
+                                                           filter));
 
-        List<Row> rows = local
-                       ? SelectStatement.readLocally(keyspace(), commands)
-                       : StorageProxy.read(commands, cl);
+        SinglePartitionReadCommand.Group group = new SinglePartitionReadCommand.Group(commands, DataLimits.NONE);
 
-        Map<ByteBuffer, CQL3Row> map = new HashMap<ByteBuffer, CQL3Row>();
-        for (Row row : rows)
+        if (local)
         {
-            if (row.cf == null || row.cf.isEmpty())
-                continue;
-
-            CQL3Row.RowIterator iter = cfm.comparator.CQL3RowBuilder(cfm, now).group(row.cf.getSortedColumns().iterator());
-            if(iter.getStaticRow() != null) {
-                map.put(row.key.getKey(), iter.getStaticRow());
-            }
-            if (iter.hasNext())
+            try (ReadOrderGroup orderGroup = group.startOrderGroup(); PartitionIterator iter = group.executeInternal(orderGroup))
             {
-                map.put(row.key.getKey(), iter.next());
-                // We can only update one CQ3Row per partition key at a time (we don't allow IN for clustering key)
-                assert !iter.hasNext();
+                return asMaterializedMap(iter);
+            }
+        }
+
+        try (PartitionIterator iter = group.execute(cl, null))
+        {
+            return asMaterializedMap(iter);
+        }
+    }
+
+    private Map<DecoratedKey, Partition> asMaterializedMap(PartitionIterator iterator)
+    {
+        Map<DecoratedKey, Partition> map = new HashMap<>();
+        while (iterator.hasNext())
+        {
+            try (RowIterator partition = iterator.next())
+            {
+                map.put(partition.partitionKey(), FilteredPartition.create(partition));
             }
         }
         return map;
@@ -505,10 +399,14 @@
 
     public boolean hasConditions()
     {
-        return ifNotExists
-            || ifExists
-            || (columnConditions != null && !columnConditions.isEmpty())
-            || (staticConditions != null && !staticConditions.isEmpty());
+        return !conditions.isEmpty();
+    }
+
+    public boolean hasSlices()
+    {
+        return type.allowClusteringColumnSlices()
+               && getRestrictions().hasClusteringColumnsRestriction()
+               && getRestrictions().isColumnRange();
     }
 
     public ResultMessage execute(QueryState queryState, QueryOptions options)
@@ -517,9 +415,6 @@
         if (options.getConsistency() == null)
             throw new InvalidRequestException("Invalid empty consistency level");
 
-        if (hasConditions() && options.getProtocolVersion() == 1)
-            throw new InvalidRequestException("Conditional updates are not supported by the protocol version in use. You need to upgrade to a driver using the native protocol v2.");
-
         return hasConditions()
              ? executeWithCondition(queryState, options)
              : executeWithoutCondition(queryState, options);
@@ -546,71 +441,63 @@
     {
         CQL3CasRequest request = makeCasRequest(queryState, options);
 
-        ColumnFamily result = StorageProxy.cas(keyspace(),
-                                               columnFamily(),
-                                               request.key,
-                                               request,
-                                               options.getSerialConsistency(),
-                                               options.getConsistency(),
-                                               queryState.getClientState());
-        return new ResultMessage.Rows(buildCasResultSet(request.key, result, options));
+        try (RowIterator result = StorageProxy.cas(keyspace(),
+                                                   columnFamily(),
+                                                   request.key,
+                                                   request,
+                                                   options.getSerialConsistency(),
+                                                   options.getConsistency(),
+                                                   queryState.getClientState()))
+        {
+            return new ResultMessage.Rows(buildCasResultSet(result, options));
+        }
     }
 
     private CQL3CasRequest makeCasRequest(QueryState queryState, QueryOptions options)
     {
         List<ByteBuffer> keys = buildPartitionKeyNames(options);
         // We don't support IN for CAS operation so far
-        if (keys.size() > 1)
-            throw new InvalidRequestException("IN on the partition key is not supported with conditional updates");
+        checkFalse(restrictions.keyIsInRelation(),
+                   "IN on the partition key is not supported with conditional %s",
+                   type.isUpdate()? "updates" : "deletions");
 
-        ByteBuffer key = keys.get(0);
+        DecoratedKey key = cfm.decorateKey(keys.get(0));
         long now = options.getTimestamp(queryState);
-        Composite prefix = createClusteringPrefix(options);
 
-        CQL3CasRequest request = new CQL3CasRequest(cfm, key, false);
-        addConditions(prefix, request, options);
-        request.addRowUpdate(prefix, this, options, now);
+        checkFalse(restrictions.clusteringKeyRestrictionsHasIN(),
+                   "IN on the clustering key columns is not supported with conditional %s",
+                    type.isUpdate()? "updates" : "deletions");
+
+        Clustering clustering = Iterables.getOnlyElement(createClustering(options));
+        CQL3CasRequest request = new CQL3CasRequest(cfm, key, false, conditionColumns(), updatesRegularRows(), updatesStaticRow());
+
+        addConditions(clustering, request, options);
+        request.addRowUpdate(clustering, this, options, now);
+
         return request;
     }
 
-    public void addConditions(Composite clusteringPrefix, CQL3CasRequest request, QueryOptions options) throws InvalidRequestException
+    public void addConditions(Clustering clustering, CQL3CasRequest request, QueryOptions options) throws InvalidRequestException
     {
-        if (ifNotExists)
-        {
-            // If we use ifNotExists, if the statement applies to any non static columns, then the condition is on the row of the non-static
-            // columns and the prefix should be the clusteringPrefix. But if only static columns are set, then the ifNotExists apply to the existence
-            // of any static columns and we should use the prefix for the "static part" of the partition.
-            request.addNotExist(clusteringPrefix);
-        }
-        else if (ifExists)
-        {
-            request.addExist(clusteringPrefix);
-        }
-        else
-        {
-            if (columnConditions != null)
-                request.addConditions(clusteringPrefix, columnConditions, options);
-            if (staticConditions != null)
-                request.addConditions(cfm.comparator.staticPrefix(), staticConditions, options);
-        }
+        conditions.addConditionsTo(request, clustering, options);
     }
 
-    private ResultSet buildCasResultSet(ByteBuffer key, ColumnFamily cf, QueryOptions options) throws InvalidRequestException
+    private ResultSet buildCasResultSet(RowIterator partition, QueryOptions options) throws InvalidRequestException
     {
-        return buildCasResultSet(keyspace(), key, columnFamily(), cf, getColumnsWithConditions(), false, options);
+        return buildCasResultSet(keyspace(), columnFamily(), partition, getColumnsWithConditions(), false, options);
     }
 
-    public static ResultSet buildCasResultSet(String ksName, ByteBuffer key, String cfName, ColumnFamily cf, Iterable<ColumnDefinition> columnsWithConditions, boolean isBatch, QueryOptions options)
+    public static ResultSet buildCasResultSet(String ksName, String tableName, RowIterator partition, Iterable<ColumnDefinition> columnsWithConditions, boolean isBatch, QueryOptions options)
     throws InvalidRequestException
     {
-        boolean success = cf == null;
+        boolean success = partition == null;
 
-        ColumnSpecification spec = new ColumnSpecification(ksName, cfName, CAS_RESULT_COLUMN, BooleanType.instance);
+        ColumnSpecification spec = new ColumnSpecification(ksName, tableName, CAS_RESULT_COLUMN, BooleanType.instance);
         ResultSet.ResultMetadata metadata = new ResultSet.ResultMetadata(Collections.singletonList(spec));
         List<List<ByteBuffer>> rows = Collections.singletonList(Collections.singletonList(BooleanType.instance.decompose(success)));
 
         ResultSet rs = new ResultSet(metadata, rows);
-        return success ? rs : merge(rs, buildCasFailureResultSet(key, cf, columnsWithConditions, isBatch, options));
+        return success ? rs : merge(rs, buildCasFailureResultSet(partition, columnsWithConditions, isBatch, options));
     }
 
     private static ResultSet merge(ResultSet left, ResultSet right)
@@ -636,10 +523,10 @@
         return new ResultSet(new ResultSet.ResultMetadata(specs), rows);
     }
 
-    private static ResultSet buildCasFailureResultSet(ByteBuffer key, ColumnFamily cf, Iterable<ColumnDefinition> columnsWithConditions, boolean isBatch, QueryOptions options)
+    private static ResultSet buildCasFailureResultSet(RowIterator partition, Iterable<ColumnDefinition> columnsWithConditions, boolean isBatch, QueryOptions options)
     throws InvalidRequestException
     {
-        CFMetaData cfm = cf.metadata();
+        CFMetaData cfm = partition.metadata();
         Selection selection;
         if (columnsWithConditions == null)
         {
@@ -657,15 +544,23 @@
                 defs.addAll(cfm.partitionKeyColumns());
                 defs.addAll(cfm.clusteringColumns());
             }
-            for (ColumnDefinition def : columnsWithConditions)
-                defs.add(def);
-            selection = Selection.forColumns(cfm, new ArrayList<>(defs));
 
+
+            if (cfm.isSuper() && cfm.isDense())
+            {
+                defs.add(cfm.superColumnValueColumn());
+            }
+            else
+            {
+                for (ColumnDefinition def : columnsWithConditions)
+                    defs.add(def);
+            }
+
+            selection = Selection.forColumns(cfm, new ArrayList<>(defs));
         }
 
-        long now = System.currentTimeMillis();
-        Selection.ResultSetBuilder builder = selection.resultSetBuilder(now, false);
-        SelectStatement.forSelection(cfm, selection).processColumnFamily(key, cf, options, now, builder);
+        Selection.ResultSetBuilder builder = selection.resultSetBuilder(false);
+        SelectStatement.forSelection(cfm, selection).processPartition(partition, options, builder, FBUtilities.nowInSeconds());
 
         return builder.build(options.getProtocolVersion());
     }
@@ -694,32 +589,30 @@
     public ResultMessage executeInternalWithCondition(QueryState state, QueryOptions options) throws RequestValidationException, RequestExecutionException
     {
         CQL3CasRequest request = makeCasRequest(state, options);
-        ColumnFamily result = casInternal(request, state);
-        return new ResultMessage.Rows(buildCasResultSet(request.key, result, options));
+        try (RowIterator result = casInternal(request, state))
+        {
+            return new ResultMessage.Rows(buildCasResultSet(result, options));
+        }
     }
 
-    static ColumnFamily casInternal(CQL3CasRequest request, QueryState state)
+    static RowIterator casInternal(CQL3CasRequest request, QueryState state)
     {
         UUID ballot = UUIDGen.getTimeUUIDFromMicros(state.getTimestamp());
-        CFMetaData metadata = Schema.instance.getCFMetaData(request.cfm.ksName, request.cfm.cfName);
 
-        ReadCommand readCommand = ReadCommand.create(request.cfm.ksName, request.key, request.cfm.cfName, request.now, request.readFilter());
-        Keyspace keyspace = Keyspace.open(request.cfm.ksName);
-
-        Row row = readCommand.getRow(keyspace);
-        ColumnFamily current = row.cf;
-        if (current == null)
-            current = ArrayBackedSortedColumns.factory.create(metadata);
-
-        if (!request.appliesTo(current))
+        SinglePartitionReadCommand readCommand = request.readCommand(FBUtilities.nowInSeconds());
+        FilteredPartition current;
+        try (ReadOrderGroup orderGroup = readCommand.startOrderGroup(); PartitionIterator iter = readCommand.executeInternal(orderGroup))
         {
-            return current;
+            current = FilteredPartition.create(PartitionIterators.getOnlyElement(iter, readCommand));
         }
 
-        ColumnFamily updates = request.makeUpdates(current);
-        updates = TriggerExecutor.instance.execute(request.key, updates);
+        if (!request.appliesTo(current))
+            return current.rowIterator();
 
-        Commit proposal = Commit.newProposal(request.key, ballot, updates);
+        PartitionUpdate updates = request.makeUpdates(current);
+        updates = TriggerExecutor.instance.execute(updates);
+
+        Commit proposal = Commit.newProposal(ballot, updates);
         proposal.makeMutation().apply();
         return null;
     }
@@ -732,135 +625,294 @@
      * @param now the current timestamp in microseconds to use if no timestamp is user provided.
      *
      * @return list of the mutations
-     * @throws InvalidRequestException on invalid requests
      */
     private Collection<? extends IMutation> getMutations(QueryOptions options, boolean local, long now)
-    throws RequestExecutionException, RequestValidationException
+    {
+        UpdatesCollector collector = new UpdatesCollector(Collections.singletonMap(cfm.cfId, updatedColumns), 1);
+        addUpdates(collector, options, local, now);
+        collector.validateIndexedColumns();
+
+        return collector.toMutations();
+    }
+
+    final void addUpdates(UpdatesCollector collector,
+                          QueryOptions options,
+                          boolean local,
+                          long now)
     {
         List<ByteBuffer> keys = buildPartitionKeyNames(options);
-        Composite clusteringPrefix = createClusteringPrefix(options);
 
-        UpdateParameters params = makeUpdateParameters(keys, clusteringPrefix, options, local, now);
-
-        Collection<IMutation> mutations = new ArrayList<IMutation>(keys.size());
-        for (ByteBuffer key: keys)
+        if (hasSlices())
         {
-            ThriftValidation.validateKey(cfm, key);
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfm);
-            addUpdateForKey(cf, key, clusteringPrefix, params);
-            Mutation mut = new Mutation(cfm.ksName, key, cf);
+            Slices slices = createSlices(options);
 
-            mutations.add(isCounter() ? new CounterMutation(mut, options.getConsistency()) : mut);
+            // If all the ranges were invalid we do not need to do anything.
+            if (slices.isEmpty())
+                return;
+
+            UpdateParameters params = makeUpdateParameters(keys,
+                                                           new ClusteringIndexSliceFilter(slices, false),
+                                                           options,
+                                                           DataLimits.NONE,
+                                                           local,
+                                                           now);
+            for (ByteBuffer key : keys)
+            {
+                ThriftValidation.validateKey(cfm, key);
+                DecoratedKey dk = cfm.decorateKey(key);
+
+                PartitionUpdate upd = collector.getPartitionUpdate(cfm, dk, options.getConsistency());
+
+                for (Slice slice : slices)
+                    addUpdateForKey(upd, slice, params);
+            }
         }
-        return mutations;
+        else
+        {
+            NavigableSet<Clustering> clusterings = createClustering(options);
+
+            // If some of the restrictions were unspecified (e.g. empty IN restrictions) we do not need to do anything.
+            if (restrictions.hasClusteringColumnsRestriction() && clusterings.isEmpty())
+                return;
+
+            UpdateParameters params = makeUpdateParameters(keys, clusterings, options, local, now);
+
+            for (ByteBuffer key : keys)
+            {
+                ThriftValidation.validateKey(cfm, key);
+                DecoratedKey dk = cfm.decorateKey(key);
+
+                PartitionUpdate upd = collector.getPartitionUpdate(cfm, dk, options.getConsistency());
+
+                if (!restrictions.hasClusteringColumnsRestriction())
+                {
+                    addUpdateForKey(upd, Clustering.EMPTY, params);
+                }
+                else
+                {
+                    for (Clustering clustering : clusterings)
+                    {
+                       for (ByteBuffer c : clustering.getRawValues())
+                       {
+                           if (c != null && c.remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
+                               throw new InvalidRequestException(String.format("Key length of %d is longer than maximum of %d",
+                                                                               clustering.dataSize(),
+                                                                               FBUtilities.MAX_UNSIGNED_SHORT));
+                       }
+
+                        addUpdateForKey(upd, clustering, params);
+                    }
+                }
+            }
+        }
     }
 
-    public UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
-                                                 Composite prefix,
-                                                 QueryOptions options,
-                                                 boolean local,
-                                                 long now)
-    throws RequestExecutionException, RequestValidationException
+    Slices createSlices(QueryOptions options)
+    {
+        SortedSet<Slice.Bound> startBounds = restrictions.getClusteringColumnsBounds(Bound.START, options);
+        SortedSet<Slice.Bound> endBounds = restrictions.getClusteringColumnsBounds(Bound.END, options);
+
+        return toSlices(startBounds, endBounds);
+    }
+
+    private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
+                                                  NavigableSet<Clustering> clusterings,
+                                                  QueryOptions options,
+                                                  boolean local,
+                                                  long now)
+    {
+        if (clusterings.contains(Clustering.STATIC_CLUSTERING))
+            return makeUpdateParameters(keys,
+                                        new ClusteringIndexSliceFilter(Slices.ALL, false),
+                                        options,
+                                        DataLimits.cqlLimits(1),
+                                        local,
+                                        now);
+
+        return makeUpdateParameters(keys,
+                                    new ClusteringIndexNamesFilter(clusterings, false),
+                                    options,
+                                    DataLimits.NONE,
+                                    local,
+                                    now);
+    }
+
+    private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
+                                                  ClusteringIndexFilter filter,
+                                                  QueryOptions options,
+                                                  DataLimits limits,
+                                                  boolean local,
+                                                  long now)
     {
         // Some lists operation requires reading
-        Map<ByteBuffer, CQL3Row> rows = readRequiredRows(keys, prefix, local, options.getConsistency());
-        return new UpdateParameters(cfm, options, getTimestamp(now, options), getTimeToLive(options), rows);
+        Map<DecoratedKey, Partition> lists = readRequiredLists(keys, filter, limits, local, options.getConsistency());
+        return new UpdateParameters(cfm, updatedColumns(), options, getTimestamp(now, options), getTimeToLive(options), lists);
     }
 
-    /**
-     * If there are conditions on the statement, this is called after the where clause and conditions have been
-     * processed to check that they are compatible.
-     * @throws InvalidRequestException
-     */
-    protected void validateWhereClauseForConditions() throws InvalidRequestException
+    private Slices toSlices(SortedSet<Slice.Bound> startBounds, SortedSet<Slice.Bound> endBounds)
     {
-        //  no-op by default
+        assert startBounds.size() == endBounds.size();
+
+        Slices.Builder builder = new Slices.Builder(cfm.comparator);
+
+        Iterator<Slice.Bound> starts = startBounds.iterator();
+        Iterator<Slice.Bound> ends = endBounds.iterator();
+
+        while (starts.hasNext())
+        {
+            Slice slice = Slice.make(starts.next(), ends.next());
+            if (!slice.isEmpty(cfm.comparator))
+            {
+                builder.add(slice);
+            }
+        }
+
+        return builder.build();
     }
 
     public static abstract class Parsed extends CFStatement
     {
-        protected final Attributes.Raw attrs;
-        protected final List<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>> conditions;
+        protected final StatementType type;
+        private final Attributes.Raw attrs;
+        private final List<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>> conditions;
         private final boolean ifNotExists;
         private final boolean ifExists;
 
-        protected Parsed(CFName name, Attributes.Raw attrs, List<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>> conditions, boolean ifNotExists, boolean ifExists)
+        protected Parsed(CFName name,
+                         StatementType type,
+                         Attributes.Raw attrs,
+                         List<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>> conditions,
+                         boolean ifNotExists,
+                         boolean ifExists)
         {
             super(name);
+            this.type = type;
             this.attrs = attrs;
             this.conditions = conditions == null ? Collections.<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>>emptyList() : conditions;
             this.ifNotExists = ifNotExists;
             this.ifExists = ifExists;
         }
 
-        public ParsedStatement.Prepared prepare() throws InvalidRequestException
+        public ParsedStatement.Prepared prepare(ClientState clientState)
         {
             VariableSpecifications boundNames = getBoundVariables();
-            ModificationStatement statement = prepare(boundNames);
-            CFMetaData cfm = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
-            return new ParsedStatement.Prepared(statement, boundNames, boundNames.getPartitionKeyBindIndexes(cfm));
+            ModificationStatement statement = prepare(boundNames, clientState);
+            return new ParsedStatement.Prepared(statement, boundNames, boundNames.getPartitionKeyBindIndexes(statement.cfm));
         }
 
-        public ModificationStatement prepare(VariableSpecifications boundNames) throws InvalidRequestException
+        public ModificationStatement prepare(VariableSpecifications boundNames, ClientState clientState)
         {
-            CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
+            CFMetaData metadata = ThriftValidation.validateColumnFamilyWithCompactMode(keyspace(), columnFamily(), clientState.isNoCompactMode());
 
             Attributes preparedAttributes = attrs.prepare(keyspace(), columnFamily());
             preparedAttributes.collectMarkerSpecification(boundNames);
 
-            ModificationStatement stmt = prepareInternal(metadata, boundNames, preparedAttributes);
+            Conditions preparedConditions = prepareConditions(metadata, boundNames);
 
-            if (ifNotExists || ifExists || !conditions.isEmpty())
-            {
-                if (stmt.isCounter())
-                    throw new InvalidRequestException("Conditional updates are not supported on counter tables");
-
-                if (attrs.timestamp != null)
-                    throw new InvalidRequestException("Cannot provide custom timestamp for conditional updates");
-
-                if (ifNotExists)
-                {
-                    // To have both 'IF NOT EXISTS' and some other conditions doesn't make sense.
-                    // So far this is enforced by the parser, but let's assert it for sanity if ever the parse changes.
-                    assert conditions.isEmpty();
-                    assert !ifExists;
-                    stmt.setIfNotExistCondition();
-                }
-                else if (ifExists)
-                {
-                    assert conditions.isEmpty();
-                    assert !ifNotExists;
-                    stmt.setIfExistCondition();
-                }
-                else
-                {
-                    for (Pair<ColumnIdentifier.Raw, ColumnCondition.Raw> entry : conditions)
-                    {
-                        ColumnIdentifier id = entry.left.prepare(metadata);
-                        ColumnDefinition def = metadata.getColumnDefinition(id);
-                        if (def == null)
-                            throw new InvalidRequestException(String.format("Unknown identifier %s", id));
-
-                        ColumnCondition condition = entry.right.prepare(keyspace(), def);
-                        condition.collectMarkerSpecification(boundNames);
-
-                        switch (def.kind)
-                        {
-                            case PARTITION_KEY:
-                            case CLUSTERING_COLUMN:
-                                throw new InvalidRequestException(String.format("PRIMARY KEY column '%s' cannot have IF conditions", id));
-                            default:
-                                stmt.addCondition(condition);
-                                break;
-                        }
-                    }
-                }
-
-                stmt.validateWhereClauseForConditions();
-            }
-            return stmt;
+            return prepareInternal(metadata,
+                                   boundNames,
+                                   preparedConditions,
+                                   preparedAttributes);
         }
 
-        protected abstract ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException;
+        /**
+         * Returns the column conditions.
+         *
+         * @param metadata the column family meta data
+         * @param boundNames the bound names
+         * @return the column conditions.
+         */
+        private Conditions prepareConditions(CFMetaData metadata, VariableSpecifications boundNames)
+        {
+            // To have both 'IF EXISTS'/'IF NOT EXISTS' and some other conditions doesn't make sense.
+            // So far this is enforced by the parser, but let's assert it for sanity if ever the parse changes.
+            if (ifExists)
+            {
+                assert conditions.isEmpty();
+                assert !ifNotExists;
+                return Conditions.IF_EXISTS_CONDITION;
+            }
+
+            if (ifNotExists)
+            {
+                assert conditions.isEmpty();
+                assert !ifExists;
+                return Conditions.IF_NOT_EXISTS_CONDITION;
+            }
+
+            if (conditions.isEmpty())
+                return Conditions.EMPTY_CONDITION;
+
+            return prepareColumnConditions(metadata, boundNames);
+        }
+
+        /**
+         * Returns the column conditions.
+         *
+         * @param metadata the column family meta data
+         * @param boundNames the bound names
+         * @return the column conditions.
+         */
+        private ColumnConditions prepareColumnConditions(CFMetaData metadata, VariableSpecifications boundNames)
+        {
+            checkNull(attrs.timestamp, "Cannot provide custom timestamp for conditional updates");
+
+            ColumnConditions.Builder builder = ColumnConditions.newBuilder();
+
+            for (Pair<ColumnIdentifier.Raw, ColumnCondition.Raw> entry : conditions)
+            {
+                ColumnIdentifier id = entry.left.prepare(metadata);
+                ColumnDefinition def = metadata.getColumnDefinitionForCQL(id);
+                checkNotNull(def, "Unknown identifier %s in IF conditions", id);
+
+                ColumnCondition condition = entry.right.prepare(keyspace(), def);
+                condition.collectMarkerSpecification(boundNames);
+
+                checkFalse(def.isPrimaryKeyColumn(), "PRIMARY KEY column '%s' cannot have IF conditions", id);
+                builder.add(condition);
+            }
+            return builder.build();
+        }
+
+        protected abstract ModificationStatement prepareInternal(CFMetaData cfm,
+                                                                 VariableSpecifications boundNames,
+                                                                 Conditions conditions,
+                                                                 Attributes attrs);
+
+        /**
+         * Creates the restrictions.
+         *
+         * @param cfm the column family meta data
+         * @param boundNames the bound names
+         * @param operations the column operations
+         * @param where the where clause
+         * @param conditions the conditions
+         * @return the restrictions
+         */
+        protected StatementRestrictions newRestrictions(CFMetaData cfm,
+                                                        VariableSpecifications boundNames,
+                                                        Operations operations,
+                                                        WhereClause where,
+                                                        Conditions conditions)
+        {
+            if (where.containsCustomExpressions())
+                throw new InvalidRequestException(CUSTOM_EXPRESSIONS_NOT_ALLOWED);
+
+            boolean applyOnlyToStaticColumns = appliesOnlyToStaticColumns(operations, conditions);
+            return new StatementRestrictions(type, cfm, where, boundNames, applyOnlyToStaticColumns, false, false, false);
+        }
+
+        /**
+         * Retrieves the <code>ColumnDefinition</code> corresponding to the specified raw <code>ColumnIdentifier</code>.
+         *
+         * @param cfm the column family meta data
+         * @param rawId the raw <code>ColumnIdentifier</code>
+         * @return the <code>ColumnDefinition</code> corresponding to the specified raw <code>ColumnIdentifier</code>
+         */
+        protected static ColumnDefinition getColumnDefinition(CFMetaData cfm, Raw rawId)
+        {
+            ColumnIdentifier id = rawId.prepare(cfm);
+            return checkNotNull(cfm.getColumnDefinitionForCQL(id), "Unknown identifier %s", id);
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/ParsedStatement.java b/src/java/org/apache/cassandra/cql3/statements/ParsedStatement.java
index 539a957..01a1b5e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ParsedStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ParsedStatement.java

@@ -23,6 +23,7 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.service.ClientState;
 
 public abstract class ParsedStatement
 {
@@ -39,7 +40,12 @@
         this.variables = new VariableSpecifications(boundNames);
     }
 
-    public abstract Prepared prepare() throws RequestValidationException;
+    public void setBoundVariables(VariableSpecifications variables)
+    {
+        this.variables = variables;
+    }
+
+    public abstract Prepared prepare(ClientState clientState) throws RequestValidationException;
 
     public static class Prepared
     {

diff --git a/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java b/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java
index a477df6..e7ecb14 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SchemaAlteringStatement.java

@@ -18,6 +18,8 @@
 package org.apache.cassandra.cql3.statements;
 
 import org.apache.cassandra.auth.AuthenticatedUser;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CFName;
 import org.apache.cassandra.cql3.CQLStatement;
 import org.apache.cassandra.cql3.QueryOptions;
@@ -25,9 +27,12 @@
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.transport.messages.ResultMessage;
 
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+
 /**
  * Abstract class for statements that alter the schema.
  */
@@ -60,13 +65,36 @@
     }
 
     @Override
-    public Prepared prepare()
+    public Prepared prepare(ClientState clientState)
     {
+        // We don't allow schema changes in no-compact mode on compact tables because it feels like unnecessary
+        // complication: applying the change on the non compact version of the table might be unsafe (the table is
+        // still compact in general), and applying it to the compact version in a no-compact connection feels
+        // confusing/unintuitive. If user want to alter the compact version, they can simply do so in a normal
+        // connection; if they want to alter the non-compact version, they should finish their transition and properly
+        // DROP COMPACT STORAGE on the table before doing so.
+        if (isColumnFamilyLevel && clientState.isNoCompactMode())
+        {
+            CFMetaData table = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
+            if (table.isCompactTable())
+            {
+                throw invalidRequest("Cannot alter schema of compact table %s.%s from a connection in NO-COMPACT mode",
+                                     table.ksName, table.cfName);
+            }
+            else if (table.isView())
+            {
+                CFMetaData baseTable = Schema.instance.getView(table.ksName, table.cfName).baseTableMetadata();
+                if (baseTable.isCompactTable())
+                    throw new InvalidRequestException(String.format("Cannot ALTER schema of view %s.%s on compact table %s from "
+                                                                    + "a connection in NO-COMPACT mode",
+                                                                    table.ksName, table.cfName,
+                                                                    baseTable.ksName, baseTable.cfName));
+            }
+        }
+
         return new Prepared(this);
     }
 
-    public abstract Event.SchemaChange changeEvent();
-
     /**
      * Schema alteration may result in a new database object (keyspace, table, role, function) being created capable of
      * having permissions GRANTed on it. The creator of the object (the primary role assigned to the AuthenticatedUser
@@ -80,29 +108,29 @@
 
     /**
      * Announces the migration to other nodes in the cluster.
-     * @return true if the execution of this statement resulted in a schema change, false otherwise (when IF NOT EXISTS
-     * is used, for example)
+     *
+     * @return the schema change event corresponding to the execution of this statement, or {@code null} if no schema change
+     * has occurred (when IF NOT EXISTS is used, for example)
+     *
      * @throws RequestValidationException
      */
-    public abstract boolean announceMigration(boolean isLocalOnly) throws RequestValidationException;
+    protected abstract Event.SchemaChange announceMigration(QueryState queryState, boolean isLocalOnly) throws RequestValidationException;
 
     public ResultMessage execute(QueryState state, QueryOptions options) throws RequestValidationException
     {
         // If an IF [NOT] EXISTS clause was used, this may not result in an actual schema change.  To avoid doing
         // extra work in the drivers to handle schema changes, we return an empty message in this case. (CASSANDRA-7600)
-        boolean didChangeSchema = announceMigration(false);
-        if (!didChangeSchema)
+        Event.SchemaChange ce = announceMigration(state, false);
+        if (ce == null)
             return new ResultMessage.Void();
 
-        Event.SchemaChange ce = changeEvent();
-
         // when a schema alteration results in a new db object being created, we grant permissions on the new
         // object to the user performing the request if:
         // * the user is not anonymous
         // * the configured IAuthorizer supports granting of permissions (not all do, AllowAllAuthorizer doesn't and
         //   custom external implementations may not)
         AuthenticatedUser user = state.getClientState().getUser();
-        if (user != null && !user.isAnonymous() && ce != null && ce.change == Event.SchemaChange.Change.CREATED)
+        if (user != null && !user.isAnonymous() && ce.change == Event.SchemaChange.Change.CREATED)
         {
             try
             {
@@ -114,16 +142,12 @@
             }
         }
 
-        return ce == null ? new ResultMessage.Void() : new ResultMessage.SchemaChange(ce);
+        return new ResultMessage.SchemaChange(ce);
     }
 
     public ResultMessage executeInternal(QueryState state, QueryOptions options)
     {
-        boolean didChangeSchema = announceMigration(true);
-        if (!didChangeSchema)
-            return new ResultMessage.Void();
-
-        Event.SchemaChange ce = changeEvent();
+        Event.SchemaChange ce = announceMigration(state, true);
         return ce == null ? new ResultMessage.Void() : new ResultMessage.SchemaChange(ce);
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index 729cf83..30c4458 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java

@@ -18,41 +18,80 @@
 package org.apache.cassandra.cql3.statements;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.SortedSet;
 
 import com.google.common.base.Objects;
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.CFName;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
+import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.cql3.WhereClause;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
 import org.apache.cassandra.cql3.selection.RawSelector;
 import org.apache.cassandra.cql3.selection.Selection;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadOrderGroup;
+import org.apache.cassandra.db.ReadQuery;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.marshal.CollectionType;
 import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.exceptions.UnrecognizedEntityException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.service.StorageProxy;
-import org.apache.cassandra.service.pager.Pageable;
+import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.service.pager.QueryPager;
-import org.apache.cassandra.service.pager.QueryPagers;
 import org.apache.cassandra.thrift.ThriftValidation;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -78,7 +117,6 @@
     private static final Logger logger = LoggerFactory.getLogger(SelectStatement.class);
 
     private static final int DEFAULT_COUNT_PAGE_SIZE = 10000;
-
     private final int boundTerms;
     public final CFMetaData cfm;
     public final Parameters parameters;
@@ -94,6 +132,8 @@
      */
     private final Comparator<List<ByteBuffer>> orderingComparator;
 
+    private final ColumnFilter queriedColumns;
+
     // Used by forSelection below
     private static final Parameters defaultParameters = new Parameters(Collections.<ColumnIdentifier.Raw, Boolean>emptyMap(), false, false, false);
 
@@ -114,13 +154,49 @@
         this.orderingComparator = orderingComparator;
         this.parameters = parameters;
         this.limit = limit;
+        this.queriedColumns = gatherQueriedColumns();
     }
 
     public Iterable<Function> getFunctions()
     {
-        return Iterables.concat(selection.getFunctions(),
-                                restrictions.getFunctions(),
-                                limit != null ? limit.getFunctions() : Collections.<Function>emptySet());
+        List<Function> functions = new ArrayList<>();
+        addFunctionsTo(functions);
+        return functions;
+    }
+
+    private void addFunctionsTo(List<Function> functions)
+    {
+        selection.addFunctionsTo(functions);
+        restrictions.addFunctionsTo(functions);
+
+        if (limit != null)
+            limit.addFunctionsTo(functions);
+    }
+
+    // Note that the queried columns internally is different from the one selected by the
+    // user as it also include any column for which we have a restriction on.
+    private ColumnFilter gatherQueriedColumns()
+    {
+        if (selection.isWildcard())
+            return ColumnFilter.all(cfm);
+
+        ColumnFilter.Builder builder = ColumnFilter.allColumnsBuilder(cfm);
+        // Adds all selected columns
+        for (ColumnDefinition def : selection.getColumns())
+            if (!def.isPrimaryKeyColumn())
+                builder.add(def);
+        // as well as any restricted column (so we can actually apply the restriction)
+        builder.addAll(restrictions.nonPKRestrictedColumns(true));
+        return builder.build();
+    }
+
+    /**
+     * The columns to fetch internally for this SELECT statement (which can be more than the one selected by the
+     * user as it also include any restricted column in particular).
+     */
+    public ColumnFilter queriedColumns()
+    {
+        return queriedColumns;
     }
 
     // Creates a simple select based on the given selection.
@@ -132,7 +208,7 @@
                                    0,
                                    defaultParameters,
                                    selection,
-                                   StatementRestrictions.empty(cfm),
+                                   StatementRestrictions.empty(StatementType.SELECT, cfm),
                                    false,
                                    null,
                                    null);
@@ -150,7 +226,17 @@
 
     public void checkAccess(ClientState state) throws InvalidRequestException, UnauthorizedException
     {
-        state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.SELECT);
+        if (cfm.isView())
+        {
+            CFMetaData baseTable = View.findBaseTable(keyspace(), columnFamily());
+            if (baseTable != null)
+                state.hasColumnFamilyAccess(keyspace(), baseTable.cfName, Permission.SELECT);
+        }
+        else
+        {
+            state.hasColumnFamilyAccess(keyspace(), columnFamily(), Permission.SELECT);
+        }
+
         for (Function function : getFunctions())
             state.ensureHasPermission(Permission.EXECUTE, function);
     }
@@ -167,34 +253,17 @@
 
         cl.validateForRead(keyspace());
 
-        int limit = getLimit(options);
-        long now = System.currentTimeMillis();
-        Pageable command = getPageableCommand(options, limit, now);
+        int nowInSec = FBUtilities.nowInSeconds();
+        int userLimit = getLimit(options);
+        ReadQuery query = getQuery(options, nowInSec, userLimit);
+
         int pageSize = getPageSize(options);
 
-        if (pageSize <= 0 || command == null || !QueryPagers.mayNeedPaging(command, pageSize))
-            return execute(command, options, limit, now, state);
+        if (pageSize <= 0 || query.limits().count() <= pageSize)
+            return execute(query, options, state, nowInSec, userLimit);
 
-        QueryPager pager = QueryPagers.pager(command, cl, state.getClientState(), options.getPagingState());
-        return execute(pager, options, limit, now, pageSize);
-    }
-
-    private Pageable getPageableCommand(QueryOptions options, int limit, long now) throws RequestValidationException
-    {
-        if (restrictions.isNotReturningAnyRows(options))
-            return null;
-
-        int limitForQuery = updateLimitForQuery(limit);
-        if (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing())
-            return getRangeCommand(options, limitForQuery, now);
-
-        List<ReadCommand> commands = getSliceCommands(options, limitForQuery, now);
-        return commands == null ? null : new Pageable.ReadCommands(commands, limitForQuery);
-    }
-
-    public Pageable getPageableCommand(QueryOptions options) throws RequestValidationException
-    {
-        return getPageableCommand(options, getLimit(options), System.currentTimeMillis());
+        QueryPager pager = query.getPager(options.getPagingState(), options.getProtocolVersion());
+        return execute(Pager.forDistributedQuery(pager, cl, state.getClientState()), options, pageSize, nowInSec, userLimit);
     }
 
     private int getPageSize(QueryOptions options)
@@ -210,46 +279,129 @@
         return  pageSize;
     }
 
-    private ResultMessage.Rows execute(Pageable command, QueryOptions options, int limit, long now, QueryState state)
-    throws RequestValidationException, RequestExecutionException
+    public ReadQuery getQuery(QueryOptions options, int nowInSec) throws RequestValidationException
     {
-        List<Row> rows;
-        if (command == null)
-        {
-            rows = Collections.<Row>emptyList();
-        }
-        else
-        {
-            rows = command instanceof Pageable.ReadCommands
-                 ? StorageProxy.read(((Pageable.ReadCommands)command).commands, options.getConsistency(), state.getClientState())
-                 : StorageProxy.getRangeSlice((RangeSliceCommand)command, options.getConsistency());
-        }
-
-        return processResults(rows, options, limit, now);
+        return getQuery(options, nowInSec, getLimit(options));
     }
 
-    private ResultMessage.Rows execute(QueryPager pager, QueryOptions options, int limit, long now, int pageSize)
-    throws RequestValidationException, RequestExecutionException
+    public ReadQuery getQuery(QueryOptions options, int nowInSec, int userLimit) throws RequestValidationException
+    {
+        DataLimits limit = getDataLimits(userLimit);
+        if (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing())
+            return getRangeCommand(options, limit, nowInSec);
+
+        return getSliceCommands(options, limit, nowInSec);
+    }
+
+    private ResultMessage.Rows execute(ReadQuery query,
+                                       QueryOptions options,
+                                       QueryState state,
+                                       int nowInSec,
+                                       int userLimit) throws RequestValidationException, RequestExecutionException
+    {
+        try (PartitionIterator data = query.execute(options.getConsistency(), state.getClientState()))
+        {
+            return processResults(data, options, nowInSec, userLimit);
+        }
+    }
+
+    // Simple wrapper class to avoid some code duplication
+    private static abstract class Pager
+    {
+        protected QueryPager pager;
+
+        protected Pager(QueryPager pager)
+        {
+            this.pager = pager;
+        }
+
+        public static Pager forInternalQuery(QueryPager pager, ReadOrderGroup orderGroup)
+        {
+            return new InternalPager(pager, orderGroup);
+        }
+
+        public static Pager forDistributedQuery(QueryPager pager, ConsistencyLevel consistency, ClientState clientState)
+        {
+            return new NormalPager(pager, consistency, clientState);
+        }
+
+        public boolean isExhausted()
+        {
+            return pager.isExhausted();
+        }
+
+        public PagingState state()
+        {
+            return pager.state();
+        }
+
+        public abstract PartitionIterator fetchPage(int pageSize);
+
+        public static class NormalPager extends Pager
+        {
+            private final ConsistencyLevel consistency;
+            private final ClientState clientState;
+
+            private NormalPager(QueryPager pager, ConsistencyLevel consistency, ClientState clientState)
+            {
+                super(pager);
+                this.consistency = consistency;
+                this.clientState = clientState;
+            }
+
+            public PartitionIterator fetchPage(int pageSize)
+            {
+                return pager.fetchPage(pageSize, consistency, clientState);
+            }
+        }
+
+        public static class InternalPager extends Pager
+        {
+            private final ReadOrderGroup orderGroup;
+
+            private InternalPager(QueryPager pager, ReadOrderGroup orderGroup)
+            {
+                super(pager);
+                this.orderGroup = orderGroup;
+            }
+
+            public PartitionIterator fetchPage(int pageSize)
+            {
+                return pager.fetchPageInternal(pageSize, orderGroup);
+            }
+        }
+    }
+
+    private ResultMessage.Rows execute(Pager pager,
+                                       QueryOptions options,
+                                       int pageSize,
+                                       int nowInSec,
+                                       int userLimit) throws RequestValidationException, RequestExecutionException
     {
         if (selection.isAggregate())
-            return pageAggregateQuery(pager, options, pageSize, now);
+            return pageAggregateQuery(pager, options, pageSize, nowInSec);
 
         // We can't properly do post-query ordering if we page (see #6722)
         checkFalse(needsPostQueryOrdering(),
-                   "Cannot page queries with both ORDER BY and a IN restriction on the partition key;"
-                   + " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
+                  "Cannot page queries with both ORDER BY and a IN restriction on the partition key;"
+                  + " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
 
-        List<Row> page = pager.fetchPage(pageSize);
-        ResultMessage.Rows msg = processResults(page, options, limit, now);
+        ResultMessage.Rows msg;
+        try (PartitionIterator page = pager.fetchPage(pageSize))
+        {
+            msg = processResults(page, options, nowInSec, userLimit);
+        }
 
+        // Please note that the isExhausted state of the pager only gets updated when we've closed the page, so this
+        // shouldn't be moved inside the 'try' above.
         if (!pager.isExhausted())
             msg.result.metadata.setHasMorePages(pager.state());
 
         return msg;
     }
 
-    private ResultMessage.Rows pageAggregateQuery(QueryPager pager, QueryOptions options, int pageSize, long now)
-            throws RequestValidationException, RequestExecutionException
+    private ResultMessage.Rows pageAggregateQuery(Pager pager, QueryOptions options, int pageSize, int nowInSec)
+    throws RequestValidationException, RequestExecutionException
     {
         if (!restrictions.hasPartitionKeyRestrictions())
         {
@@ -262,66 +414,63 @@
             ClientWarn.instance.warn("Aggregation query used on multiple partition keys (IN restriction)");
         }
 
-        Selection.ResultSetBuilder result = selection.resultSetBuilder(now, parameters.isJson);
+        Selection.ResultSetBuilder result = selection.resultSetBuilder(parameters.isJson);
         while (!pager.isExhausted())
         {
-            for (Row row : pager.fetchPage(pageSize))
+            try (PartitionIterator iter = pager.fetchPage(pageSize))
             {
-                // Not columns match the query, skip
-                if (row.cf == null)
-                    continue;
-
-                processColumnFamily(row.key.getKey(), row.cf, options, now, result);
+                while (iter.hasNext())
+                {
+                    try (RowIterator partition = iter.next())
+                    {
+                        processPartition(partition, options, result, nowInSec);
+                    }
+                }
             }
         }
         return new ResultMessage.Rows(result.build(options.getProtocolVersion()));
     }
 
-    public ResultMessage.Rows processResults(List<Row> rows, QueryOptions options, int limit, long now) throws RequestValidationException
+    private ResultMessage.Rows processResults(PartitionIterator partitions,
+                                              QueryOptions options,
+                                              int nowInSec,
+                                              int userLimit) throws RequestValidationException
     {
-        ResultSet rset = process(rows, options, limit, now);
+        ResultSet rset = process(partitions, options, nowInSec, userLimit);
         return new ResultMessage.Rows(rset);
     }
 
-    static List<Row> readLocally(String keyspaceName, List<ReadCommand> cmds)
-    {
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        List<Row> rows = new ArrayList<Row>(cmds.size());
-        for (ReadCommand cmd : cmds)
-            rows.add(cmd.getRow(keyspace));
-        return rows;
-    }
-
     public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options) throws RequestExecutionException, RequestValidationException
     {
-        return executeInternal(state, options, System.currentTimeMillis());
+        return executeInternal(state, options, FBUtilities.nowInSeconds());
     }
 
-    public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options, long now) throws RequestExecutionException, RequestValidationException
+    public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options, int nowInSec) throws RequestExecutionException, RequestValidationException
     {
-        int limit = getLimit(options);
-        Pageable command = getPageableCommand(options, limit, now);
+        int userLimit = getLimit(options);
+        ReadQuery query = getQuery(options, nowInSec, userLimit);
         int pageSize = getPageSize(options);
 
-        if (pageSize <= 0 || command == null || !QueryPagers.mayNeedPaging(command, pageSize))
+        try (ReadOrderGroup orderGroup = query.startOrderGroup())
         {
-            List<Row> rows = command == null
-                             ? Collections.<Row>emptyList()
-                             : (command instanceof Pageable.ReadCommands
-                                ? readLocally(keyspace(), ((Pageable.ReadCommands)command).commands)
-                                : ((RangeSliceCommand)command).executeLocally());
-
-            return processResults(rows, options, limit, now);
+            if (pageSize <= 0 || query.limits().count() <= pageSize)
+            {
+                try (PartitionIterator data = query.executeInternal(orderGroup))
+                {
+                    return processResults(data, options, nowInSec, userLimit);
+                }
+            }
+            else
+            {
+                QueryPager pager = query.getPager(options.getPagingState(), options.getProtocolVersion());
+                return execute(Pager.forInternalQuery(pager, orderGroup), options, pageSize, nowInSec, userLimit);
+            }
         }
-
-        QueryPager pager = QueryPagers.localPager(command);
-        return execute(pager, options, limit, now, pageSize);
     }
 
-    public ResultSet process(List<Row> rows) throws InvalidRequestException
+    public ResultSet process(PartitionIterator partitions, int nowInSec) throws InvalidRequestException
     {
-        QueryOptions options = QueryOptions.DEFAULT;
-        return process(rows, options, getLimit(options), System.currentTimeMillis());
+        return process(partitions, QueryOptions.DEFAULT, nowInSec, getLimit(QueryOptions.DEFAULT));
     }
 
     public String keyspace()
@@ -350,398 +499,319 @@
         return restrictions;
     }
 
-    private List<ReadCommand> getSliceCommands(QueryOptions options, int limit, long now) throws RequestValidationException
+    private ReadQuery getSliceCommands(QueryOptions options, DataLimits limit, int nowInSec) throws RequestValidationException
     {
         Collection<ByteBuffer> keys = restrictions.getPartitionKeys(options);
+        if (keys.isEmpty())
+            return ReadQuery.EMPTY;
 
-        List<ReadCommand> commands = new ArrayList<>(keys.size());
-
-        IDiskAtomFilter filter = makeFilter(options, limit);
+        ClusteringIndexFilter filter = makeClusteringIndexFilter(options);
         if (filter == null)
-            return null;
+            return ReadQuery.EMPTY;
+
+        RowFilter rowFilter = getRowFilter(options);
 
         // Note that we use the total limit for every key, which is potentially inefficient.
         // However, IN + LIMIT is not a very sensible choice.
+        List<SinglePartitionReadCommand> commands = new ArrayList<>(keys.size());
         for (ByteBuffer key : keys)
         {
             QueryProcessor.validateKey(key);
-            // We should not share the slice filter amongst the commands (hence the cloneShallow), due to
-            // SliceQueryFilter not being immutable due to its columnCounter used by the lastCounted() method
-            // (this is fairly ugly and we should change that but that's probably not a tiny refactor to do that cleanly)
-            commands.add(ReadCommand.create(keyspace(), ByteBufferUtil.clone(key), columnFamily(), now, filter.cloneShallow()));
+            DecoratedKey dk = cfm.decorateKey(ByteBufferUtil.clone(key));
+            ColumnFilter cf = (cfm.isSuper() && cfm.isDense()) ? SuperColumnCompatibility.getColumnFilter(cfm, options, restrictions.getSuperColumnRestrictions()) : queriedColumns;
+            commands.add(SinglePartitionReadCommand.create(cfm, nowInSec, cf, rowFilter, limit, dk, filter));
         }
 
-        return commands;
-    }
-
-    private RangeSliceCommand getRangeCommand(QueryOptions options, int limit, long now) throws RequestValidationException
-    {
-        IDiskAtomFilter filter = makeFilter(options, limit);
-        if (filter == null)
-            return null;
-
-        List<IndexExpression> expressions = getValidatedIndexExpressions(options);
-        // The LIMIT provided by the user is the number of CQL row he wants returned.
-        // We want to have getRangeSlice to count the number of columns, not the number of keys.
-        AbstractBounds<RowPosition> keyBounds = restrictions.getPartitionKeyBounds(options);
-        return keyBounds == null
-             ? null
-             : new RangeSliceCommand(keyspace(), columnFamily(), now,  filter, keyBounds, expressions, limit, !parameters.isDistinct, false);
-    }
-
-    private ColumnSlice makeStaticSlice()
-    {
-        // Note: we could use staticPrefix.start() for the start bound, but EMPTY gives us the
-        // same effect while saving a few CPU cycles.
-        return isReversed
-             ? new ColumnSlice(cfm.comparator.staticPrefix().end(), Composites.EMPTY)
-             : new ColumnSlice(Composites.EMPTY, cfm.comparator.staticPrefix().end());
-    }
-
-    private IDiskAtomFilter makeFilter(QueryOptions options, int limit)
-    throws InvalidRequestException
-    {
-        int toGroup = cfm.comparator.isDense() ? -1 : cfm.clusteringColumns().size();
-        if (parameters.isDistinct)
-        {
-            // For distinct, we only care about fetching the beginning of each partition. If we don't have
-            // static columns, we in fact only care about the first cell, so we query only that (we don't "group").
-            // If we do have static columns, we do need to fetch the first full group (to have the static columns values).
-
-            // See the comments on IGNORE_TOMBSTONED_PARTITIONS and CASSANDRA-8490 for why we use a special value for
-            // DISTINCT queries on the partition key only.
-            toGroup = selection.containsStaticColumns() ? toGroup : SliceQueryFilter.IGNORE_TOMBSTONED_PARTITIONS;
-            return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 1, toGroup);
-        }
-        else if (restrictions.isColumnRange())
-        {
-            List<Composite> startBounds = restrictions.getClusteringColumnsBoundsAsComposites(Bound.START, options);
-            List<Composite> endBounds = restrictions.getClusteringColumnsBoundsAsComposites(Bound.END, options);
-            assert startBounds.size() == endBounds.size();
-
-            // Handles fetching static columns. Note that for 2i, the filter is just used to restrict
-            // the part of the index to query so adding the static slice would be useless and confusing.
-            // For 2i, static columns are retrieve in CompositesSearcher with each index hit.
-            ColumnSlice staticSlice = selection.containsStaticColumns() && !restrictions.usesSecondaryIndexing()
-                                    ? makeStaticSlice()
-                                    : null;
-
-            // The case where startBounds == 1 is common enough that it's worth optimizing
-            if (startBounds.size() == 1)
-            {
-                ColumnSlice slice = new ColumnSlice(startBounds.get(0), endBounds.get(0));
-                if (slice.isAlwaysEmpty(cfm.comparator, isReversed))
-                    return staticSlice == null ? null : sliceFilter(staticSlice, limit, toGroup);
-
-                if (staticSlice == null)
-                    return sliceFilter(slice, limit, toGroup);
-
-                if (isReversed)
-                    return slice.includes(cfm.comparator.reverseComparator(), staticSlice.start)
-                            ? sliceFilter(new ColumnSlice(slice.start, staticSlice.finish), limit, toGroup)
-                            : sliceFilter(new ColumnSlice[]{ slice, staticSlice }, limit, toGroup);
-                else
-                    return slice.includes(cfm.comparator, staticSlice.finish)
-                            ? sliceFilter(new ColumnSlice(staticSlice.start, slice.finish), limit, toGroup)
-                            : sliceFilter(new ColumnSlice[]{ staticSlice, slice }, limit, toGroup);
-            }
-
-            List<ColumnSlice> l = new ArrayList<ColumnSlice>(startBounds.size());
-            for (int i = 0; i < startBounds.size(); i++)
-            {
-                ColumnSlice slice = new ColumnSlice(startBounds.get(i), endBounds.get(i));
-                if (!slice.isAlwaysEmpty(cfm.comparator, isReversed))
-                    l.add(slice);
-            }
-
-            if (l.isEmpty())
-                return staticSlice == null ? null : sliceFilter(staticSlice, limit, toGroup);
-            if (staticSlice == null)
-                return sliceFilter(l.toArray(new ColumnSlice[l.size()]), limit, toGroup);
-
-            // The slices should not overlap. We know the slices built from startBounds/endBounds don't, but if there is
-            // a static slice, it could overlap with the 2nd slice. Check for it and correct if that's the case
-            ColumnSlice[] slices;
-            if (isReversed)
-            {
-                if (l.get(l.size() - 1).includes(cfm.comparator.reverseComparator(), staticSlice.start))
-                {
-                    slices = l.toArray(new ColumnSlice[l.size()]);
-                    slices[slices.length-1] = new ColumnSlice(slices[slices.length-1].start, Composites.EMPTY);
-                }
-                else
-                {
-                    slices = l.toArray(new ColumnSlice[l.size()+1]);
-                    slices[slices.length-1] = staticSlice;
-                }
-            }
-            else
-            {
-                if (l.get(0).includes(cfm.comparator, staticSlice.finish))
-                {
-                    slices = new ColumnSlice[l.size()];
-                    slices[0] = new ColumnSlice(Composites.EMPTY, l.get(0).finish);
-                    for (int i = 1; i < l.size(); i++)
-                        slices[i] = l.get(i);
-                }
-                else
-                {
-                    slices = new ColumnSlice[l.size()+1];
-                    slices[0] = staticSlice;
-                    for (int i = 0; i < l.size(); i++)
-                        slices[i+1] = l.get(i);
-                }
-            }
-            return sliceFilter(slices, limit, toGroup);
-        }
-        else
-        {
-            SortedSet<CellName> cellNames = getRequestedColumns(options);
-            if (cellNames == null) // in case of IN () for the last column of the key
-                return null;
-            QueryProcessor.validateCellNames(cellNames, cfm.comparator);
-            return new NamesQueryFilter(cellNames, true);
-        }
-    }
-
-    private SliceQueryFilter sliceFilter(ColumnSlice slice, int limit, int toGroup)
-    {
-        return sliceFilter(new ColumnSlice[]{ slice }, limit, toGroup);
-    }
-
-    private SliceQueryFilter sliceFilter(ColumnSlice[] slices, int limit, int toGroup)
-    {
-        assert ColumnSlice.validateSlices(slices, cfm.comparator, isReversed) : String.format("Invalid slices: " + Arrays.toString(slices) + (isReversed ? " (reversed)" : ""));
-        return new SliceQueryFilter(slices, isReversed, limit, toGroup);
+        return new SinglePartitionReadCommand.Group(commands, limit);
     }
 
     /**
-     * May be used by custom QueryHandler implementations
+     * Returns the slices fetched by this SELECT, assuming an internal call (no bound values in particular).
+     * <p>
+     * Note that if the SELECT intrinsically selects rows by names, we convert them into equivalent slices for
+     * the purpose of this method. This is used for MVs to restrict what needs to be read when we want to read
+     * everything that could be affected by a given view (and so, if the view SELECT statement has restrictions
+     * on the clustering columns, we can restrict what we read).
      */
-    public int getLimit(QueryOptions options) throws InvalidRequestException
+    public Slices clusteringIndexFilterAsSlices()
     {
+        QueryOptions options = QueryOptions.forInternalCalls(Collections.emptyList());
+        ClusteringIndexFilter filter = makeClusteringIndexFilter(options);
+        if (filter instanceof ClusteringIndexSliceFilter)
+            return ((ClusteringIndexSliceFilter)filter).requestedSlices();
+
+        Slices.Builder builder = new Slices.Builder(cfm.comparator);
+        for (Clustering clustering: ((ClusteringIndexNamesFilter)filter).requestedRows())
+            builder.add(Slice.make(clustering));
+        return builder.build();
+    }
+
+    /**
+     * Returns a read command that can be used internally to query all the rows queried by this SELECT for a
+     * give key (used for materialized views).
+     */
+    public SinglePartitionReadCommand internalReadForView(DecoratedKey key, int nowInSec)
+    {
+        QueryOptions options = QueryOptions.forInternalCalls(Collections.emptyList());
+        ClusteringIndexFilter filter = makeClusteringIndexFilter(options);
+        RowFilter rowFilter = getRowFilter(options);
+        return SinglePartitionReadCommand.create(cfm, nowInSec, queriedColumns, rowFilter, DataLimits.NONE, key, filter);
+    }
+
+    /**
+     * The {@code RowFilter} for this SELECT, assuming an internal call (no bound values in particular).
+     */
+    public RowFilter rowFilterForInternalCalls()
+    {
+        return getRowFilter(QueryOptions.forInternalCalls(Collections.emptyList()));
+    }
+
+    private ReadQuery getRangeCommand(QueryOptions options, DataLimits limit, int nowInSec) throws RequestValidationException
+    {
+        ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options);
+        if (clusteringIndexFilter == null)
+            return ReadQuery.EMPTY;
+
+        RowFilter rowFilter = getRowFilter(options);
+
+        // The LIMIT provided by the user is the number of CQL row he wants returned.
+        // We want to have getRangeSlice to count the number of columns, not the number of keys.
+        AbstractBounds<PartitionPosition> keyBounds = restrictions.getPartitionKeyBounds(options);
+        if (keyBounds == null)
+            return ReadQuery.EMPTY;
+
+        PartitionRangeReadCommand command =
+            PartitionRangeReadCommand.create(false, cfm, nowInSec, queriedColumns, rowFilter, limit, new DataRange(keyBounds, clusteringIndexFilter));
+
+        // If there's a secondary index that the command can use, have it validate the request parameters.
+        command.maybeValidateIndex();
+
+        return command;
+    }
+
+    private ClusteringIndexFilter makeClusteringIndexFilter(QueryOptions options)
+    throws InvalidRequestException
+    {
+        if (parameters.isDistinct)
+        {
+            // We need to be able to distinguish between partition having live rows and those that don't. But
+            // doing so is not trivial since "having a live row" depends potentially on
+            //   1) when the query is performed, due to TTLs
+            //   2) how thing reconcile together between different nodes
+            // so that it's hard to really optimize properly internally. So to keep it simple, we simply query
+            // for the first row of the partition and hence uses Slices.ALL. We'll limit it to the first live
+            // row however in getLimit().
+            return new ClusteringIndexSliceFilter(Slices.ALL, false);
+        }
+
+        if (restrictions.isColumnRange())
+        {
+            Slices slices = makeSlices(options);
+            if (slices == Slices.NONE && !selection.containsStaticColumns())
+                return null;
+
+            return new ClusteringIndexSliceFilter(slices, isReversed);
+        }
+        else
+        {
+            NavigableSet<Clustering> clusterings = getRequestedRows(options);
+            // We can have no clusterings if either we're only selecting the static columns, or if we have
+            // a 'IN ()' for clusterings. In that case, we still want to query if some static columns are
+            // queried. But we're fine otherwise.
+            if (clusterings.isEmpty() && queriedColumns.fetchedColumns().statics.isEmpty())
+                return null;
+
+            return new ClusteringIndexNamesFilter(clusterings, isReversed);
+        }
+    }
+
+    private Slices makeSlices(QueryOptions options)
+    throws InvalidRequestException
+    {
+        SortedSet<Slice.Bound> startBounds = restrictions.getClusteringColumnsBounds(Bound.START, options);
+        SortedSet<Slice.Bound> endBounds = restrictions.getClusteringColumnsBounds(Bound.END, options);
+        assert startBounds.size() == endBounds.size();
+
+        // The case where startBounds == 1 is common enough that it's worth optimizing
+        if (startBounds.size() == 1)
+        {
+            Slice.Bound start = startBounds.first();
+            Slice.Bound end = endBounds.first();
+            return cfm.comparator.compare(start, end) > 0
+                 ? Slices.NONE
+                 : Slices.with(cfm.comparator, Slice.make(start, end));
+        }
+
+        Slices.Builder builder = new Slices.Builder(cfm.comparator, startBounds.size());
+        Iterator<Slice.Bound> startIter = startBounds.iterator();
+        Iterator<Slice.Bound> endIter = endBounds.iterator();
+        while (startIter.hasNext() && endIter.hasNext())
+        {
+            Slice.Bound start = startIter.next();
+            Slice.Bound end = endIter.next();
+
+            // Ignore slices that are nonsensical
+            if (cfm.comparator.compare(start, end) > 0)
+                continue;
+
+            builder.add(start, end);
+        }
+
+        return builder.build();
+    }
+
+    private DataLimits getDataLimits(int userLimit)
+    {
+        int cqlRowLimit = DataLimits.NO_LIMIT;
+
+        // If we aggregate, the limit really apply to the number of rows returned to the user, not to what is queried, and
+        // since in practice we currently only aggregate at top level (we have no GROUP BY support yet), we'll only ever
+        // return 1 result and can therefore basically ignore the user LIMIT in this case.
+        // Whenever we support GROUP BY, we'll have to add a new DataLimits kind that knows how things are grouped and is thus
+        // able to apply the user limit properly.
+        // If we do post ordering we need to get all the results sorted before we can trim them.
+        if (!selection.isAggregate() && !needsPostQueryOrdering())
+            cqlRowLimit = userLimit;
+
+        if (parameters.isDistinct)
+            return cqlRowLimit == DataLimits.NO_LIMIT ? DataLimits.DISTINCT_NONE : DataLimits.distinctLimits(cqlRowLimit);
+
+        return cqlRowLimit == DataLimits.NO_LIMIT ? DataLimits.NONE : DataLimits.cqlLimits(cqlRowLimit);
+    }
+
+    /**
+     * Returns the limit specified by the user.
+     * May be used by custom QueryHandler implementations
+     *
+     * @return the limit specified by the user or <code>DataLimits.NO_LIMIT</code> if no value 
+     * as been specified.
+     */
+    public int getLimit(QueryOptions options)
+    {
+        int userLimit = DataLimits.NO_LIMIT;
+
         if (limit != null)
         {
             ByteBuffer b = checkNotNull(limit.bindAndGet(options), "Invalid null value of limit");
             // treat UNSET limit value as 'unlimited'
-            if (b == UNSET_BYTE_BUFFER)
-                return Integer.MAX_VALUE;
-            try
+            if (b != UNSET_BYTE_BUFFER)
             {
-                Int32Type.instance.validate(b);
-                int l = Int32Type.instance.compose(b);
-                checkTrue(l > 0, "LIMIT must be strictly positive");
-                return l;
-            }
-            catch (MarshalException e)
-            {
-                throw new InvalidRequestException("Invalid limit value");
+                try
+                {
+                    Int32Type.instance.validate(b);
+                    userLimit = Int32Type.instance.compose(b);
+                    checkTrue(userLimit > 0, "LIMIT must be strictly positive");
+                }
+                catch (MarshalException e)
+                {
+                    throw new InvalidRequestException("Invalid limit value");
+                }
             }
         }
-        return Integer.MAX_VALUE;
+        return userLimit;
     }
 
-    private int updateLimitForQuery(int limit)
-    {
-        // If the query is for an aggregate, we do not want to limit the number of rows retrieved. The LIMIT
-        // clause apply to the number of rows returned to the user and not to the number of rows retrieved.
-        if (selection.isAggregate())
-            return Integer.MAX_VALUE;
-        // Internally, we don't support exclusive bounds for slices. Instead, we query one more element if necessary
-        // and exclude it later (in processColumnFamily)
-        return restrictions.isNonCompositeSliceWithExclusiveBounds() && limit != Integer.MAX_VALUE
-             ? limit + 1
-             : limit;
-    }
-
-    private SortedSet<CellName> getRequestedColumns(QueryOptions options) throws InvalidRequestException
+    private NavigableSet<Clustering> getRequestedRows(QueryOptions options) throws InvalidRequestException
     {
         // Note: getRequestedColumns don't handle static columns, but due to CASSANDRA-5762
         // we always do a slice for CQL3 tables, so it's ok to ignore them here
         assert !restrictions.isColumnRange();
-        SortedSet<CellName> columns = new TreeSet<CellName>(cfm.comparator);
-        for (Composite composite : restrictions.getClusteringColumnsAsComposites(options))
-            columns.addAll(addSelectedColumns(composite));
-        return columns;
-    }
-
-    private SortedSet<CellName> addSelectedColumns(Composite prefix)
-    {
-        if (cfm.comparator.isDense())
-        {
-            return FBUtilities.singleton(cfm.comparator.create(prefix, null), cfm.comparator);
-        }
-        else
-        {
-            SortedSet<CellName> columns = new TreeSet<CellName>(cfm.comparator);
-
-            // We need to query the selected column as well as the marker
-            // column (for the case where the row exists but has no columns outside the PK)
-            // Two exceptions are "static CF" (non-composite non-compact CF) and "super CF"
-            // that don't have marker and for which we must query all columns instead
-            if (cfm.comparator.isCompound() && !cfm.isSuper())
-            {
-                // marker
-                columns.add(cfm.comparator.rowMarker(prefix));
-
-                // selected columns
-                for (ColumnDefinition def : selection.getColumns())
-                    if (def.isRegular() || def.isStatic())
-                        columns.add(cfm.comparator.create(prefix, def));
-            }
-            else
-            {
-                // We now that we're not composite so we can ignore static columns
-                for (ColumnDefinition def : cfm.regularColumns())
-                    columns.add(cfm.comparator.create(prefix, def));
-            }
-            return columns;
-        }
+        return restrictions.getClusteringColumns(options);
     }
 
     /**
      * May be used by custom QueryHandler implementations
      */
-    public List<IndexExpression> getValidatedIndexExpressions(QueryOptions options) throws InvalidRequestException
+    public RowFilter getRowFilter(QueryOptions options) throws InvalidRequestException
     {
-        if (!restrictions.usesSecondaryIndexing())
-            return Collections.emptyList();
-
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(columnFamily());
         SecondaryIndexManager secondaryIndexManager = cfs.indexManager;
-
-        List<IndexExpression> expressions = restrictions.getIndexExpressions(secondaryIndexManager, options);
-        secondaryIndexManager.validateIndexSearchersForQuery(expressions);
-
-        return expressions;
+        RowFilter filter = restrictions.getRowFilter(secondaryIndexManager, options);
+        return filter;
     }
 
-    private CellName makeExclusiveSliceBound(Bound bound, CellNameType type, QueryOptions options) throws InvalidRequestException
+    private ResultSet process(PartitionIterator partitions,
+                              QueryOptions options,
+                              int nowInSec,
+                              int userLimit) throws InvalidRequestException
     {
-        // clusteringColumnBounds may reverse bound if clustering order is reversed
-        // but areRequestedBoundsInclusive checks for Restriction::isInclusive and never
-        // reverses the order. In order to avoid inconsistencies and check inclusive
-        // bounds correctly, we need to check for column order and reverse it. See CASSANDRA-10988
-        if (restrictions.areRequestedBoundsInclusive(reverseBoundIfNeeded(bound)))
-            return null;
-
-        // We can only reach that if the table is dense non-compound tables.
-        // By consequence, we know that the table is a COMPACT table with only one clustering column.
-        ByteBuffer value = restrictions.getClusteringColumnsBounds(bound, options).get(0);
-
-        // Dense non-compound tables do not accept empty ByteBuffers. By consequence, if the slice value is empty
-        // we know that we can treat the slice as inclusive.
-        return value.hasRemaining() ? type.makeCellName(value) : null;
-    }
-
-    /**
-     * Reverses the specified bound if the non-compound clustering column is a reversed one.
-     * @param bound bound to reverse
-     * @return the bound reversed if the column type was a reversed one or the original bound
-     */
-    private Bound reverseBoundIfNeeded(Bound bound)
-    {
-        assert !cfm.comparator.isCompound();
-
-        List<ColumnDefinition> columnDefs = cfm.clusteringColumns();
-        return columnDefs.get(columnDefs.size() - 1).isReversedType() ? bound.reverse() : bound;
-    }
-
-    private Iterator<Cell> applySliceRestriction(final Iterator<Cell> cells, final QueryOptions options) throws InvalidRequestException
-    {
-        final CellNameType type = cfm.comparator;
-
-        final CellName excludedStart = makeExclusiveSliceBound(Bound.START, type, options);
-        final CellName excludedEnd = makeExclusiveSliceBound(Bound.END, type, options);
-
-        return Iterators.filter(cells, new Predicate<Cell>()
+        Selection.ResultSetBuilder result = selection.resultSetBuilder(parameters.isJson);
+        while (partitions.hasNext())
         {
-            public boolean apply(Cell c)
+            try (RowIterator partition = partitions.next())
             {
-                // For dynamic CF, the column could be out of the requested bounds (because we don't support strict bounds internally (unless
-                // the comparator is composite that is)), filter here
-                return !((excludedStart != null && type.compare(c.name(), excludedStart) == 0)
-                            || (excludedEnd != null && type.compare(c.name(), excludedEnd) == 0));
+                processPartition(partition, options, result, nowInSec);
             }
-        });
-    }
-
-    private ResultSet process(List<Row> rows, QueryOptions options, int limit, long now) throws InvalidRequestException
-    {
-        Selection.ResultSetBuilder result = selection.resultSetBuilder(now, parameters.isJson);
-        for (Row row : rows)
-        {
-            // Not columns match the query, skip
-            if (row.cf == null)
-                continue;
-
-            processColumnFamily(row.key.getKey(), row.cf, options, now, result);
         }
 
         ResultSet cqlRows = result.build(options.getProtocolVersion());
 
         orderResults(cqlRows);
 
-        // Internal calls always return columns in the comparator order, even when reverse was set
-        if (isReversed)
-            cqlRows.reverse();
+        cqlRows.trim(userLimit);
 
-        // Trim result if needed to respect the user limit
-        cqlRows.trim(limit);
         return cqlRows;
     }
 
-    // Used by ModificationStatement for CAS operations
-    void processColumnFamily(ByteBuffer key, ColumnFamily cf, QueryOptions options, long now, Selection.ResultSetBuilder result)
-    throws InvalidRequestException
+    public static ByteBuffer[] getComponents(CFMetaData cfm, DecoratedKey dk)
     {
-        CFMetaData cfm = cf.metadata();
-        ByteBuffer[] keyComponents = null;
+        ByteBuffer key = dk.getKey();
         if (cfm.getKeyValidator() instanceof CompositeType)
         {
-            keyComponents = ((CompositeType)cfm.getKeyValidator()).split(key);
+            return ((CompositeType)cfm.getKeyValidator()).split(key);
         }
         else
         {
-            keyComponents = new ByteBuffer[]{ key };
+            return new ByteBuffer[]{ key };
+        }
+    }
+
+    // Used by ModificationStatement for CAS operations
+    void processPartition(RowIterator partition, QueryOptions options, Selection.ResultSetBuilder result, int nowInSec)
+    throws InvalidRequestException
+    {
+        if (cfm.isSuper() && cfm.isDense())
+        {
+            SuperColumnCompatibility.processPartition(cfm, selection, partition, result, options.getProtocolVersion(), restrictions.getSuperColumnRestrictions(), options);
+            return;
         }
 
-        Iterator<Cell> cells = cf.getSortedColumns().iterator();
-        if (restrictions.isNonCompositeSliceWithExclusiveBounds())
-            cells = applySliceRestriction(cells, options);
-
         int protocolVersion = options.getProtocolVersion();
-        CQL3Row.RowIterator iter = cfm.comparator.CQL3RowBuilder(cfm, now).group(cells);
 
-        // If there is static columns but there is no non-static row,
-        // and the select was a full partition selection (i.e. there was no condition on clustering or regular columns),
-        // we want to include the static columns in the result set (and we're done).
-        CQL3Row staticRow = iter.getStaticRow();
-        if (staticRow != null && !iter.hasNext() && !restrictions.hasClusteringColumnsRestriction() && !restrictions.hasRegularColumnsRestriction())
+        ByteBuffer[] keyComponents = getComponents(cfm, partition.partitionKey());
+
+        Row staticRow = partition.staticRow();
+        // If there is no rows, and there's no restriction on clustering/regular columns,
+        // then provided the select was a full partition selection (either by partition key and/or by static column),
+        // we want to include static columns and we're done.
+        if (!partition.hasNext())
         {
-            result.newRow(protocolVersion);
-            for (ColumnDefinition def : selection.getColumns())
+            if (!staticRow.isEmpty()
+                && (!restrictions.hasClusteringColumnsRestriction() || cfm.isStaticCompactTable())
+                && !restrictions.hasRegularColumnsRestriction())
             {
-                switch (def.kind)
+                result.newRow(protocolVersion);
+                for (ColumnDefinition def : selection.getColumns())
                 {
-                    case PARTITION_KEY:
-                        result.add(keyComponents[def.position()]);
-                        break;
-                    case STATIC:
-                        addValue(result, def, staticRow, options);
-                        break;
-                    default:
-                        result.add((ByteBuffer)null);
+                    switch (def.kind)
+                    {
+                        case PARTITION_KEY:
+                            result.add(keyComponents[def.position()]);
+                            break;
+                        case STATIC:
+                            addValue(result, def, staticRow, nowInSec, protocolVersion);
+                            break;
+                        default:
+                            result.add((ByteBuffer)null);
+                    }
                 }
             }
             return;
         }
 
-        while (iter.hasNext())
+        while (partition.hasNext())
         {
-            CQL3Row cql3Row = iter.next();
-
-            // Respect requested order
+            Row row = partition.next();
             result.newRow(protocolVersion);
             // Respect selection order
             for (ColumnDefinition def : selection.getColumns())
@@ -751,42 +821,36 @@
                     case PARTITION_KEY:
                         result.add(keyComponents[def.position()]);
                         break;
-                    case CLUSTERING_COLUMN:
-                        result.add(cql3Row.getClusteringColumn(def.position()));
-                        break;
-                    case COMPACT_VALUE:
-                        result.add(cql3Row.getColumn(null));
+                    case CLUSTERING:
+                        result.add(row.clustering().get(def.position()));
                         break;
                     case REGULAR:
-                        addValue(result, def, cql3Row, options);
+                        addValue(result, def, row, nowInSec, protocolVersion);
                         break;
                     case STATIC:
-                        addValue(result, def, staticRow, options);
+                        addValue(result, def, staticRow, nowInSec, protocolVersion);
                         break;
                 }
             }
         }
     }
 
-    private static void addValue(Selection.ResultSetBuilder result, ColumnDefinition def, CQL3Row row, QueryOptions options)
+    private static void addValue(Selection.ResultSetBuilder result, ColumnDefinition def, Row row, int nowInSec, int protocolVersion)
     {
-        if (row == null)
+        if (def.isComplex())
         {
-            result.add((ByteBuffer)null);
-            return;
+            // Collections are the only complex types we have so far
+            assert def.type.isCollection() && def.type.isMultiCell();
+            ComplexColumnData complexData = row.getComplexColumnData(def);
+            if (complexData == null)
+                result.add((ByteBuffer)null);
+            else
+                result.add(((CollectionType)def.type).serializeForNativeProtocol(def, complexData.iterator(), protocolVersion));
         }
-
-        if (def.type.isMultiCell())
+        else
         {
-            List<Cell> cells = row.getMultiCellColumn(def.name);
-            ByteBuffer buffer = cells == null
-                             ? null
-                             : ((CollectionType)def.type).serializeForNativeProtocol(def, cells, options.getProtocolVersion());
-            result.add(buffer);
-            return;
+            result.add(row.getCell(def), nowInSec);
         }
-
-        result.add(row.getColumn(def.name));
     }
 
     private boolean needsPostQueryOrdering()
@@ -808,30 +872,35 @@
 
     public static class RawStatement extends CFStatement
     {
-        private final Parameters parameters;
-        private final List<RawSelector> selectClause;
-        private final List<Relation> whereClause;
-        private final Term.Raw limit;
+        public final Parameters parameters;
+        public final List<RawSelector> selectClause;
+        public final WhereClause whereClause;
+        public final Term.Raw limit;
 
-        public RawStatement(CFName cfName, Parameters parameters, List<RawSelector> selectClause, List<Relation> whereClause, Term.Raw limit)
+        public RawStatement(CFName cfName, Parameters parameters, List<RawSelector> selectClause, WhereClause whereClause, Term.Raw limit)
         {
             super(cfName);
             this.parameters = parameters;
             this.selectClause = selectClause;
-            this.whereClause = whereClause == null ? Collections.<Relation>emptyList() : whereClause;
+            this.whereClause = whereClause;
             this.limit = limit;
         }
 
-        public ParsedStatement.Prepared prepare() throws InvalidRequestException
+        public ParsedStatement.Prepared prepare(ClientState clientState) throws InvalidRequestException
         {
-            CFMetaData cfm = ThriftValidation.validateColumnFamily(keyspace(), columnFamily());
+            return prepare(false, clientState);
+        }
+
+        public ParsedStatement.Prepared prepare(boolean forView, ClientState clientState) throws InvalidRequestException
+        {
+            CFMetaData cfm = ThriftValidation.validateColumnFamilyWithCompactMode(keyspace(), columnFamily(), clientState.isNoCompactMode());
             VariableSpecifications boundNames = getBoundVariables();
 
             Selection selection = selectClause.isEmpty()
                                   ? Selection.wildcard(cfm)
                                   : Selection.fromSelectors(cfm, selectClause);
 
-            StatementRestrictions restrictions = prepareRestrictions(cfm, boundNames, selection);
+            StatementRestrictions restrictions = prepareRestrictions(cfm, boundNames, selection, forView);
 
             if (parameters.isDistinct)
                 validateDistinctSelection(cfm, selection, restrictions);
@@ -841,14 +910,14 @@
 
             if (!parameters.orderings.isEmpty())
             {
+                assert !forView;
                 verifyOrderingIsAllowed(restrictions);
                 orderingComparator = getOrderingComparator(cfm, selection, restrictions, parameters.isJson);
                 isReversed = isReversed(cfm);
+                if (isReversed)
+                    orderingComparator = Collections.reverseOrder(orderingComparator);
             }
 
-            if (isReversed)
-                restrictions.reverse();
-
             checkNeedsFiltering(restrictions);
 
             SelectStatement stmt = new SelectStatement(cfm,
@@ -874,16 +943,19 @@
          */
         private StatementRestrictions prepareRestrictions(CFMetaData cfm,
                                                           VariableSpecifications boundNames,
-                                                          Selection selection) throws InvalidRequestException
+                                                          Selection selection,
+                                                          boolean forView) throws InvalidRequestException
         {
             try
             {
-                return new StatementRestrictions(cfm,
+                return new StatementRestrictions(StatementType.SELECT,
+                                                 cfm,
                                                  whereClause,
                                                  boundNames,
                                                  selection.containsOnlyStaticColumns(),
                                                  selection.containsACollection(),
-                                                 parameters.allowFiltering);
+                                                 parameters.allowFiltering,
+                                                 forView);
             }
             catch (UnrecognizedEntityException e)
             {
@@ -915,8 +987,9 @@
                                                       StatementRestrictions restrictions)
                                                       throws InvalidRequestException
         {
-            checkFalse(restrictions.hasClusteringColumnsRestriction() || restrictions.hasNonPrimaryKeyRestrictions(),
-                       "SELECT DISTINCT with WHERE clause only supports restriction by partition key.");
+            checkFalse(restrictions.hasClusteringColumnsRestriction() ||
+                       (restrictions.hasNonPrimaryKeyRestrictions() && !restrictions.nonPKRestrictedColumns(true).stream().allMatch(ColumnDefinition::isStatic)),
+                       "SELECT DISTINCT with WHERE clause only supports restriction by partition key and/or static columns.");
 
             Collection<ColumnDefinition> requestedColumns = selection.getColumns();
             for (ColumnDefinition def : requestedColumns)
@@ -957,7 +1030,7 @@
             for (ColumnIdentifier.Raw raw : parameters.orderings.keySet())
             {
                 ColumnIdentifier identifier = raw.prepare(cfm);
-                ColumnDefinition orderingColumn = cfm.getColumnDefinition(identifier);
+                ColumnDefinition orderingColumn = cfm.getColumnDefinitionForCQL(identifier);
                 idToSort.add(orderingIndexes.get(orderingColumn));
                 sorters.add(orderingColumn.type);
             }
@@ -974,7 +1047,7 @@
             for (ColumnIdentifier.Raw raw : parameters.orderings.keySet())
             {
                 ColumnIdentifier column = raw.prepare(cfm);
-                final ColumnDefinition def = cfm.getColumnDefinition(column);
+                final ColumnDefinition def = cfm.getColumnDefinitionForCQL(column);
                 if (def == null)
                     handleUnrecognizedOrderingColumn(column);
                 selection.addColumnForOrdering(def);
@@ -991,7 +1064,7 @@
                 ColumnIdentifier column = entry.getKey().prepare(cfm);
                 boolean reversed = entry.getValue();
 
-                ColumnDefinition def = cfm.getColumnDefinition(column);
+                ColumnDefinition def = cfm.getColumnDefinitionForCQL(column);
                 if (def == null)
                     handleUnrecognizedOrderingColumn(column);
 
@@ -1031,45 +1104,9 @@
             {
                 // We will potentially filter data if either:
                 //  - Have more than one IndexExpression
-                //  - Have no index expression and the column filter is not the identity
-                checkFalse(restrictions.needFiltering(),
-                           StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+                //  - Have no index expression and the row filter is not the identity
+                checkFalse(restrictions.needFiltering(), StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
             }
-
-            // We don't internally support exclusive slice bounds on non-composite tables. To deal with it we do an
-            // inclusive slice and remove post-query the value that shouldn't be returned. One problem however is that
-            // if there is a user limit, that limit may make the query return before the end of the slice is reached,
-            // in which case, once we'll have removed bound post-query, we might end up with less results than
-            // requested which would be incorrect. For single-partition query, this is not a problem, we just ask for
-            // one more result (see updateLimitForQuery()) since that's enough to compensate for that problem. For key
-            // range however, each returned row may include one result that will have to be trimmed, so we would have
-            // to bump the query limit by N where N is the number of rows we will return, but we don't know that in
-            // advance. So, since we currently don't have a good way to handle such query, we refuse it (#7059) rather
-            // than answering with something that is wrong.
-            if (restrictions.isNonCompositeSliceWithExclusiveBounds() && restrictions.isKeyRange() && limit != null)
-            {
-                SingleColumnRelation rel = findInclusiveClusteringRelationForCompact(restrictions.cfm);
-                throw invalidRequest("The query requests a restriction of rows with a strict bound (%s) over a range of partitions. "
-                                   + "This is not supported by the underlying storage engine for COMPACT tables if a LIMIT is provided. "
-                                   + "Please either make the condition non strict (%s) or remove the user LIMIT", rel, rel.withNonStrictOperator());
-            }
-        }
-
-        private SingleColumnRelation findInclusiveClusteringRelationForCompact(CFMetaData cfm)
-        {
-            for (Relation r : whereClause)
-            {
-                // We only call this when sliceRestriction != null, i.e. for compact table with non composite comparator,
-                // so it can't be a MultiColumnRelation.
-                SingleColumnRelation rel = (SingleColumnRelation)r;
-
-                if (cfm.getColumnDefinition(rel.getEntity().prepare(cfm)).isClusteringColumn()
-                        && (rel.operator() == Operator.GT || rel.operator() == Operator.LT))
-                    return rel;
-            }
-
-            // We're not supposed to call this method unless we know this can't happen
-            throw new AssertionError();
         }
 
         private boolean containsAlias(final ColumnIdentifier name)

diff --git a/src/java/org/apache/cassandra/cql3/statements/StatementType.java b/src/java/org/apache/cassandra/cql3/statements/StatementType.java
new file mode 100644
index 0000000..d399931
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/StatementType.java

@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+public enum StatementType
+{
+    INSERT
+    {
+        @Override
+        public boolean allowClusteringColumnSlices()
+        {
+            return false;
+        }
+    },
+    UPDATE
+    {
+
+        @Override
+        public boolean allowClusteringColumnSlices()
+        {
+            return false;
+        }
+    },
+    DELETE
+    {
+    },
+    SELECT
+    {
+        @Override
+        public boolean allowPartitionKeyRanges()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean allowNonPrimaryKeyInWhereClause()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean allowUseOfSecondaryIndices()
+        {
+            return true;
+        }
+    };
+
+    /**
+     * Checks if this type is an insert.
+     * @return <code>true</code> if this type is an insert, <code>false</code> otherwise.
+     */
+    public boolean isInsert()
+    {
+        return this == INSERT;
+    }
+
+    /**
+     * Checks if this type is an update.
+     * @return <code>true</code> if this type is an update, <code>false</code> otherwise.
+     */
+    public boolean isUpdate()
+    {
+        return this == UPDATE;
+    }
+
+    /**
+     * Checks if this type is a delete.
+     * @return <code>true</code> if this type is a delete, <code>false</code> otherwise.
+     */
+    public boolean isDelete()
+    {
+        return this == DELETE;
+    }
+
+    /**
+     * Checks if this type is a select.
+     * @return <code>true</code> if this type is a select, <code>false</code> otherwise.
+     */
+    public boolean isSelect()
+    {
+        return this == SELECT;
+    }
+
+    /**
+     * Checks this statement allow the where clause to contains missing partition key components or token relation.
+     * @return <code>true</code> if this statement allow the where clause to contains missing partition key components
+     * or token relation, <code>false</code> otherwise.
+     */
+    public boolean allowPartitionKeyRanges()
+    {
+        return false;
+    }
+
+    /**
+     * Checks this type of statement allow the where clause to contains clustering column slices.
+     * @return <code>true</code> if this type of statement allow the where clause to contains clustering column slices,
+     * <code>false</code> otherwise.
+     */
+    public boolean allowClusteringColumnSlices()
+    {
+        return true;
+    }
+
+    /**
+     * Checks if this type of statement allow non primary key in the where clause.
+     * @return <code>true</code> if this type of statement allow non primary key in the where clause,
+     * <code>false</code> otherwise.
+     */
+    public boolean allowNonPrimaryKeyInWhereClause()
+    {
+        return false;
+    }
+
+    /**
+     * Checks if this type of statement allow the use of secondary indices.
+     * @return <code>true</code> if this type of statement allow the use of secondary indices,
+     * <code>false</code> otherwise.
+     */
+    public boolean allowUseOfSecondaryIndices()
+    {
+        return false;
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/cql3/statements/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/TableAttributes.java
new file mode 100644
index 0000000..595fdb3
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/TableAttributes.java

@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.schema.*;
+import org.apache.cassandra.schema.TableParams.Option;
+import org.apache.cassandra.service.ClientWarn;
+
+import static java.lang.String.format;
+
+public final class TableAttributes extends PropertyDefinitions
+{
+    private static final String KW_ID = "id";
+    private static final Set<String> validKeywords;
+    private static final Set<String> obsoleteKeywords;
+
+    private static boolean loggedReadRepairChanceDeprecationWarnings;
+
+    static
+    {
+        ImmutableSet.Builder<String> validBuilder = ImmutableSet.builder();
+        for (Option option : Option.values())
+            validBuilder.add(option.toString());
+        validBuilder.add(KW_ID);
+        validKeywords = validBuilder.build();
+        obsoleteKeywords = ImmutableSet.of();
+    }
+
+    public void validate()
+    {
+        validate(validKeywords, obsoleteKeywords);
+        build(TableParams.builder()).validate();
+    }
+
+    public TableParams asNewTableParams()
+    {
+        return build(TableParams.builder());
+    }
+
+    public TableParams asAlteredTableParams(TableParams previous)
+    {
+        if (getId() != null)
+            throw new ConfigurationException("Cannot alter table id.");
+        return build(TableParams.builder(previous));
+    }
+
+    public UUID getId() throws ConfigurationException
+    {
+        String id = getSimple(KW_ID);
+        try
+        {
+            return id != null ? UUID.fromString(id) : null;
+        }
+        catch (IllegalArgumentException e)
+        {
+            throw new ConfigurationException("Invalid table id", e);
+        }
+    }
+
+    private TableParams build(TableParams.Builder builder)
+    {
+        if (hasOption(Option.BLOOM_FILTER_FP_CHANCE))
+            builder.bloomFilterFpChance(getDouble(Option.BLOOM_FILTER_FP_CHANCE));
+
+        if (hasOption(Option.CACHING))
+            builder.caching(CachingParams.fromMap(getMap(Option.CACHING)));
+
+        if (hasOption(Option.COMMENT))
+            builder.comment(getString(Option.COMMENT));
+
+        if (hasOption(Option.COMPACTION))
+            builder.compaction(CompactionParams.fromMap(getMap(Option.COMPACTION)));
+
+        if (hasOption(Option.COMPRESSION))
+        {
+            //crc_check_chance was "promoted" from a compression property to a top-level-property after #9839
+            //so we temporarily accept it to be defined as a compression option, to maintain backwards compatibility
+            Map<String, String> compressionOpts = getMap(Option.COMPRESSION);
+            if (compressionOpts.containsKey(Option.CRC_CHECK_CHANCE.toString().toLowerCase()))
+            {
+                Double crcCheckChance = getDeprecatedCrcCheckChance(compressionOpts);
+                builder.crcCheckChance(crcCheckChance);
+            }
+            builder.compression(CompressionParams.fromMap(getMap(Option.COMPRESSION)));
+        }
+
+        if (hasOption(Option.DCLOCAL_READ_REPAIR_CHANCE))
+        {
+            double chance = getDouble(Option.DCLOCAL_READ_REPAIR_CHANCE);
+
+            if (chance != 0.0)
+            {
+                ClientWarn.instance.warn("dclocal_read_repair_chance table option has been deprecated and will be removed in version 4.0");
+                maybeLogReadRepairChanceDeprecationWarning();
+            }
+
+            builder.dcLocalReadRepairChance(chance);
+        }
+
+        if (hasOption(Option.DEFAULT_TIME_TO_LIVE))
+            builder.defaultTimeToLive(getInt(Option.DEFAULT_TIME_TO_LIVE));
+
+        if (hasOption(Option.GC_GRACE_SECONDS))
+            builder.gcGraceSeconds(getInt(Option.GC_GRACE_SECONDS));
+
+        if (hasOption(Option.MAX_INDEX_INTERVAL))
+            builder.maxIndexInterval(getInt(Option.MAX_INDEX_INTERVAL));
+
+        if (hasOption(Option.MEMTABLE_FLUSH_PERIOD_IN_MS))
+            builder.memtableFlushPeriodInMs(getInt(Option.MEMTABLE_FLUSH_PERIOD_IN_MS));
+
+        if (hasOption(Option.MIN_INDEX_INTERVAL))
+            builder.minIndexInterval(getInt(Option.MIN_INDEX_INTERVAL));
+
+        if (hasOption(Option.READ_REPAIR_CHANCE))
+        {
+            double chance = getDouble(Option.READ_REPAIR_CHANCE);
+
+            if (chance != 0.0)
+            {
+                ClientWarn.instance.warn("read_repair_chance table option has been deprecated and will be removed in version 4.0");
+                maybeLogReadRepairChanceDeprecationWarning();
+            }
+
+            builder.readRepairChance(chance);
+        }
+
+        if (hasOption(Option.SPECULATIVE_RETRY))
+            builder.speculativeRetry(SpeculativeRetryParam.fromString(getString(Option.SPECULATIVE_RETRY)));
+
+        if (hasOption(Option.CRC_CHECK_CHANCE))
+            builder.crcCheckChance(getDouble(Option.CRC_CHECK_CHANCE));
+
+        return builder.build();
+    }
+
+    private void maybeLogReadRepairChanceDeprecationWarning()
+    {
+        if (!loggedReadRepairChanceDeprecationWarnings)
+        {
+            logger.warn("dclocal_read_repair_chance and read_repair_chance table options have been deprecated and will be removed in version 4.0");
+            loggedReadRepairChanceDeprecationWarnings = true;
+        }
+    }
+
+    private Double getDeprecatedCrcCheckChance(Map<String, String> compressionOpts)
+    {
+        String value = compressionOpts.get(Option.CRC_CHECK_CHANCE.toString().toLowerCase());
+        try
+        {
+            return Double.parseDouble(value);
+        }
+        catch (NumberFormatException e)
+        {
+            throw new SyntaxException(String.format("Invalid double value %s for crc_check_chance.'", value));
+        }
+    }
+
+    private double getDouble(Option option)
+    {
+        String value = getString(option);
+
+        try
+        {
+            return Double.parseDouble(value);
+        }
+        catch (NumberFormatException e)
+        {
+            throw new SyntaxException(format("Invalid double value %s for '%s'", value, option));
+        }
+    }
+
+    private int getInt(Option option)
+    {
+        String value = getString(option);
+
+        try
+        {
+            return Integer.parseInt(value);
+        }
+        catch (NumberFormatException e)
+        {
+            throw new SyntaxException(String.format("Invalid integer value %s for '%s'", value, option));
+        }
+    }
+
+    private String getString(Option option)
+    {
+        String value = getSimple(option.toString());
+        if (value == null)
+            throw new IllegalStateException(format("Option '%s' is absent", option));
+        return value;
+    }
+
+    private Map<String, String> getMap(Option option)
+    {
+        Map<String, String> value = getMap(option.toString());
+        if (value == null)
+            throw new IllegalStateException(format("Option '%s' is absent", option));
+        return value;
+    }
+
+    private boolean hasOption(Option option)
+    {
+        return hasProperty(option.toString());
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java b/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
index 9234a79..b697910 100644
--- a/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java

@@ -21,6 +21,8 @@
 import java.util.concurrent.TimeoutException;
 
 import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
@@ -43,7 +45,7 @@
         return 0;
     }
 
-    public Prepared prepare() throws InvalidRequestException
+    public Prepared prepare(ClientState clientState) throws InvalidRequestException
     {
         return new Prepared(this);
     }
@@ -62,6 +64,10 @@
     {
         try
         {
+            CFMetaData metaData = Schema.instance.getCFMetaData(keyspace(), columnFamily());
+            if (metaData.isView())
+                throw new InvalidRequestException("Cannot TRUNCATE materialized view directly; must truncate base table instead");
+
             StorageProxy.truncateBlocking(keyspace(), columnFamily());
         }
         catch (UnavailableException | TimeoutException | IOException e)

diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
index 517d842..641b6bb 100644
--- a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java

@@ -17,21 +17,24 @@
  */
 package org.apache.cassandra.cql3.statements;
 
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.Collections;
+import java.util.List;
 
-import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.CompactTables;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
-import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+import static com.google.common.collect.Lists.newArrayList;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkContainsNoDuplicates;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
 
 /**
  * An <code>UPDATE</code> statement parsed from a CQL query statement.
@@ -41,9 +44,15 @@
 {
     private static final Constants.Value EMPTY = new Constants.Value(ByteBufferUtil.EMPTY_BYTE_BUFFER);
 
-    private UpdateStatement(StatementType type, int boundTerms, CFMetaData cfm, Attributes attrs)
+    private UpdateStatement(StatementType type,
+                            int boundTerms,
+                            CFMetaData cfm,
+                            Operations operations,
+                            StatementRestrictions restrictions,
+                            Conditions conditions,
+                            Attributes attrs)
     {
-        super(type, boundTerms, cfm, attrs);
+        super(type, boundTerms, cfm, operations, restrictions, conditions, attrs);
     }
 
     public boolean requireFullClusteringKey()
@@ -51,100 +60,55 @@
         return true;
     }
 
-    public void addUpdateForKey(ColumnFamily cf,
-                                ByteBuffer key,
-                                Composite prefix,
-                                UpdateParameters params) throws InvalidRequestException
+    @Override
+    public void addUpdateForKey(PartitionUpdate update, Clustering clustering, UpdateParameters params)
     {
-        addUpdateForKey(cf, key, prefix, params, true);
+        if (updatesRegularRows())
+        {
+            params.newRow(clustering);
+
+            // We update the row timestamp (ex-row marker) only on INSERT (#6782)
+            // Further, COMPACT tables semantic differs from "CQL3" ones in that a row exists only if it has
+            // a non-null column, so we don't want to set the row timestamp for them.
+            if (type.isInsert() && cfm.isCQLTable())
+                params.addPrimaryKeyLivenessInfo();
+
+            List<Operation> updates = getRegularOperations();
+
+            // For compact table, when we translate it to thrift, we don't have a row marker. So we don't accept an insert/update
+            // that only sets the PK unless the is no declared non-PK columns (in the latter we just set the value empty).
+
+            // For a dense layout, when we translate it to thrift, we don't have a row marker. So we don't accept an insert/update
+            // that only sets the PK unless the is no declared non-PK columns (which we recognize because in that case the compact
+            // value is of type "EmptyType").
+            if ((cfm.isCompactTable() && !cfm.isSuper()) && updates.isEmpty())
+            {
+                checkTrue(CompactTables.hasEmptyCompactValue(cfm),
+                          "Column %s is mandatory for this COMPACT STORAGE table",
+                          cfm.compactValueColumn().name);
+
+                updates = Collections.<Operation>singletonList(new Constants.Setter(cfm.compactValueColumn(), EMPTY));
+            }
+
+            for (Operation op : updates)
+                op.execute(update.partitionKey(), params);
+
+            update.add(params.buildRow());
+        }
+
+        if (updatesStaticRow())
+        {
+            params.newRow(Clustering.STATIC_CLUSTERING);
+            for (Operation op : getStaticOperations())
+                op.execute(update.partitionKey(), params);
+            update.add(params.buildRow());
+        }
     }
 
-    public void addUpdateForKey(ColumnFamily cf,
-                                ByteBuffer key,
-                                Composite prefix,
-                                UpdateParameters params,
-                                boolean validateIndexedColumns) throws InvalidRequestException
+    @Override
+    public void addUpdateForKey(PartitionUpdate update, Slice slice, UpdateParameters params)
     {
-        // Inserting the CQL row marker (see #4361)
-        // We always need to insert a marker for INSERT, because of the following situation:
-        //   CREATE TABLE t ( k int PRIMARY KEY, c text );
-        //   INSERT INTO t(k, c) VALUES (1, 1)
-        //   DELETE c FROM t WHERE k = 1;
-        //   SELECT * FROM t;
-        // The last query should return one row (but with c == null). Adding the marker with the insert make sure
-        // the semantic is correct (while making sure a 'DELETE FROM t WHERE k = 1' does remove the row entirely)
-        //
-        // We do not insert the marker for UPDATE however, as this amount to updating the columns in the WHERE
-        // clause which is inintuitive (#6782)
-        //
-        // We never insert markers for Super CF as this would confuse the thrift side.
-        if (type == StatementType.INSERT && cfm.isCQL3Table() && !prefix.isStatic())
-            cf.addColumn(params.makeColumn(cfm.comparator.rowMarker(prefix), ByteBufferUtil.EMPTY_BYTE_BUFFER));
-
-        List<Operation> updates = getOperations();
-
-        if (cfm.comparator.isDense())
-        {
-            if (prefix.isEmpty())
-                throw new InvalidRequestException(String.format("Missing PRIMARY KEY part %s", cfm.clusteringColumns().get(0).name));
-
-            // An empty name for the compact value is what we use to recognize the case where there is not column
-            // outside the PK, see CreateStatement.
-            if (!cfm.compactValueColumn().name.bytes.hasRemaining())
-            {
-                // There is no column outside the PK. So no operation could have passed through validation
-                assert updates.isEmpty();
-                new Constants.Setter(cfm.compactValueColumn(), EMPTY).execute(key, cf, prefix, params);
-            }
-            else
-            {
-                // dense means we don't have a row marker, so don't accept to set only the PK. See CASSANDRA-5648.
-                if (updates.isEmpty())
-                    throw new InvalidRequestException(String.format("Column %s is mandatory for this COMPACT STORAGE table", cfm.compactValueColumn().name));
-
-                for (Operation update : updates)
-                    update.execute(key, cf, prefix, params);
-            }
-        }
-        else
-        {
-            for (Operation update : updates)
-                update.execute(key, cf, prefix, params);
-        }
-
-        // validateIndexedColumns trigger a call to Keyspace.open() which we want to be able to avoid in some case
-        //(e.g. when using CQLSSTableWriter)
-        if (validateIndexedColumns)
-            validateIndexedColumns(key, cf);
-    }
-
-    /**
-     * Checks if the values of the indexed columns are valid.
-     *
-     * @param key row key for the column family
-     * @param cf the column family
-     * @throws InvalidRequestException if one of the values of the indexed columns is not valid
-     */
-    private void validateIndexedColumns(ByteBuffer key, ColumnFamily cf)
-    {
-        SecondaryIndexManager indexManager = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfId).indexManager;
-        if (indexManager.hasIndexes())
-        {
-            for (Cell cell : cf)
-            {
-                // Indexed values must be validated by any applicable index. See CASSANDRA-3057/4240/8081 for more details
-                SecondaryIndex failedIndex = indexManager.validate(key, cell);
-                if (failedIndex != null)
-                {
-                    throw invalidRequest(String.format("Can't index column value of size %d for index %s on %s.%s",
-                                                       cell.value().remaining(),
-                                                       failedIndex.getIndexName(),
-                                                       cfm.ksName,
-                                                       cfm.cfName));
-                }
-            }
-            indexManager.validateRowLevelIndexes(key, cf);
-        }
+        throw new UnsupportedOperationException();
     }
 
     public static class ParsedInsert extends ModificationStatement.Parsed
@@ -167,57 +131,78 @@
                             List<Term.Raw> columnValues,
                             boolean ifNotExists)
         {
-            super(name, attrs, null, ifNotExists, false);
+            super(name, StatementType.INSERT, attrs, null, ifNotExists, false);
             this.columnNames = columnNames;
             this.columnValues = columnValues;
         }
 
-        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+        @Override
+        protected ModificationStatement prepareInternal(CFMetaData cfm,
+                                                        VariableSpecifications boundNames,
+                                                        Conditions conditions,
+                                                        Attributes attrs)
         {
-            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.INSERT, boundNames.size(), cfm, attrs);
 
             // Created from an INSERT
-            if (stmt.isCounter())
-                throw new InvalidRequestException("INSERT statements are not allowed on counter tables, use UPDATE instead");
+            checkFalse(cfm.isCounter(), "INSERT statements are not allowed on counter tables, use UPDATE instead");
 
-            if (columnNames == null)
-                throw new InvalidRequestException("Column names for INSERT must be provided when using VALUES");
-            if (columnNames.isEmpty())
-                throw new InvalidRequestException("No columns provided to INSERT");
-            if (columnNames.size() != columnValues.size())
-                throw new InvalidRequestException("Unmatched column names/values");
+            checkFalse(columnNames == null, "Column names for INSERT must be provided when using VALUES");
+            checkFalse(columnNames.isEmpty(), "No columns provided to INSERT");
+            checkFalse(columnNames.size() != columnValues.size(), "Unmatched column names/values");
+            checkContainsNoDuplicates(columnNames, "The column names contains duplicates");
 
-            String ks = keyspace();
-            for (int i = 0; i < columnNames.size(); i++)
+            WhereClause.Builder whereClause = new WhereClause.Builder();
+            Operations operations = new Operations(type);
+            boolean hasClusteringColumnsSet = false;
+
+            if (cfm.isSuper() && cfm.isDense())
             {
-                ColumnIdentifier id = columnNames.get(i).prepare(cfm);
-                ColumnDefinition def = cfm.getColumnDefinition(id);
-                if (def == null)
-                    throw new InvalidRequestException(String.format("Unknown identifier %s", id));
+                // SuperColumn familiy updates are always row-level
+                hasClusteringColumnsSet = true;
+                SuperColumnCompatibility.prepareInsertOperations(cfm, columnNames, whereClause, columnValues, boundNames, operations);
+            }
+            else
+            {
+                for (int i = 0; i < columnNames.size(); i++)
+                {
+                    ColumnDefinition def = getColumnDefinition(cfm, columnNames.get(i));
 
-                for (int j = 0; j < i; j++)
-                {
-                    ColumnIdentifier otherId = columnNames.get(j).prepare(cfm);
-                    if (id.equals(otherId))
-                        throw new InvalidRequestException(String.format("Multiple definitions found for column %s", id));
-                }
+                    if (def.isClusteringColumn())
+                        hasClusteringColumnsSet = true;
 
-                Term.Raw value = columnValues.get(i);
-                if (def.isPrimaryKeyColumn())
-                {
-                    Term t = value.prepare(ks, def);
-                    t.collectMarkerSpecification(boundNames);
-                    stmt.addKeyValue(def, t);
-                }
-                else
-                {
-                    Operation operation = new Operation.SetValue(value).prepare(ks, def);
-                    operation.collectMarkerSpecification(boundNames);
-                    stmt.addOperation(operation);
+                    Term.Raw value = columnValues.get(i);
+
+                    if (def.isPrimaryKeyColumn())
+                    {
+                        whereClause.add(new SingleColumnRelation(columnNames.get(i), Operator.EQ, value));
+                    }
+                    else
+                    {
+                        Operation operation = new Operation.SetValue(value).prepare(cfm.ksName, def);
+                        operation.collectMarkerSpecification(boundNames);
+                        operations.add(operation);
+                    }
                 }
             }
 
-            return stmt;
+            boolean applyOnlyToStaticColumns = appliesOnlyToStaticColumns(operations, conditions) && !hasClusteringColumnsSet;
+
+            StatementRestrictions restrictions = new StatementRestrictions(type,
+                                                                           cfm,
+                                                                           whereClause.build(),
+                                                                           boundNames,
+                                                                           applyOnlyToStaticColumns,
+                                                                           false,
+                                                                           false,
+                                                                           false);
+
+            return new UpdateStatement(type,
+                                       boundNames.size(),
+                                       cfm,
+                                       operations,
+                                       restrictions,
+                                       conditions,
+                                       attrs);
         }
     }
 
@@ -230,28 +215,69 @@
 
         public ParsedInsertJson(CFName name, Attributes.Raw attrs, Json.Raw jsonValue, boolean ifNotExists)
         {
-            super(name, attrs, null, ifNotExists, false);
+            super(name, StatementType.INSERT, attrs, null, ifNotExists, false);
             this.jsonValue = jsonValue;
         }
 
-        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+        @Override
+        protected ModificationStatement prepareInternal(CFMetaData cfm,
+                                                        VariableSpecifications boundNames,
+                                                        Conditions conditions,
+                                                        Attributes attrs)
         {
-            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.INSERT, boundNames.size(), cfm, attrs);
-            if (stmt.isCounter())
-                throw new InvalidRequestException("INSERT statements are not allowed on counter tables, use UPDATE instead");
+            checkFalse(cfm.isCounter(), "INSERT statements are not allowed on counter tables, use UPDATE instead");
 
-            Collection<ColumnDefinition> defs = cfm.allColumns();
+            List<ColumnDefinition> defs = newArrayList(cfm.allColumnsInSelectOrder());
             Json.Prepared prepared = jsonValue.prepareAndCollectMarkers(cfm, defs, boundNames);
 
-            for (ColumnDefinition def : defs)
+            WhereClause.Builder whereClause = new WhereClause.Builder();
+            Operations operations = new Operations(type);
+            boolean hasClusteringColumnsSet = false;
+
+            if (cfm.isSuper() && cfm.isDense())
             {
-                if (def.isPrimaryKeyColumn())
-                    stmt.addKeyValue(def, prepared.getPrimaryKeyValueForColumn(def));
-                else
-                    stmt.addOperation(prepared.getSetOperationForColumn(def));
+                hasClusteringColumnsSet = true;
+                SuperColumnCompatibility.prepareInsertJSONOperations(cfm, defs, boundNames, prepared, whereClause, operations);
+            }
+            else
+            {
+                for (ColumnDefinition def : defs)
+                {
+                    if (def.isClusteringColumn())
+                        hasClusteringColumnsSet = true;
+
+                    Term.Raw raw = prepared.getRawTermForColumn(def);
+                    if (def.isPrimaryKeyColumn())
+                    {
+                        whereClause.add(new SingleColumnRelation(new ColumnIdentifier.ColumnIdentifierValue(def.name), Operator.EQ, raw));
+                    }
+                    else
+                    {
+                        Operation operation = new Operation.SetValue(raw).prepare(cfm.ksName, def);
+                        operation.collectMarkerSpecification(boundNames);
+                        operations.add(operation);
+                    }
+                }
             }
 
-            return stmt;
+            boolean applyOnlyToStaticColumns = appliesOnlyToStaticColumns(operations, conditions) && !hasClusteringColumnsSet;
+
+            StatementRestrictions restrictions = new StatementRestrictions(type,
+                                                                           cfm,
+                                                                           whereClause.build(),
+                                                                           boundNames,
+                                                                           applyOnlyToStaticColumns,
+                                                                           false,
+                                                                           false,
+                                                                           false);
+
+            return new UpdateStatement(type,
+                                       boundNames.size(),
+                                       cfm,
+                                       operations,
+                                       restrictions,
+                                       conditions,
+                                       attrs);
         }
     }
 
@@ -259,7 +285,7 @@
     {
         // Provided for an UPDATE
         private final List<Pair<ColumnIdentifier.Raw, Operation.RawUpdate>> updates;
-        private final List<Relation> whereClause;
+        private WhereClause whereClause;
 
         /**
          * Creates a new UpdateStatement from a column family name, columns map, consistency
@@ -274,41 +300,55 @@
         public ParsedUpdate(CFName name,
                             Attributes.Raw attrs,
                             List<Pair<ColumnIdentifier.Raw, Operation.RawUpdate>> updates,
-                            List<Relation> whereClause,
+                            WhereClause whereClause,
                             List<Pair<ColumnIdentifier.Raw, ColumnCondition.Raw>> conditions,
                             boolean ifExists)
         {
-            super(name, attrs, conditions, false, ifExists);
+            super(name, StatementType.UPDATE, attrs, conditions, false, ifExists);
             this.updates = updates;
             this.whereClause = whereClause;
         }
 
-        protected ModificationStatement prepareInternal(CFMetaData cfm, VariableSpecifications boundNames, Attributes attrs) throws InvalidRequestException
+        @Override
+        protected ModificationStatement prepareInternal(CFMetaData cfm,
+                                                        VariableSpecifications boundNames,
+                                                        Conditions conditions,
+                                                        Attributes attrs)
         {
-            UpdateStatement stmt = new UpdateStatement(ModificationStatement.StatementType.UPDATE, boundNames.size(), cfm, attrs);
+            Operations operations = new Operations(type);
 
-            for (Pair<ColumnIdentifier.Raw, Operation.RawUpdate> entry : updates)
+            if (cfm.isSuper() && cfm.isDense())
             {
-                ColumnDefinition def = cfm.getColumnDefinition(entry.left.prepare(cfm));
-                if (def == null)
-                    throw new InvalidRequestException(String.format("Unknown identifier %s", entry.left));
-
-                Operation operation = entry.right.prepare(keyspace(), def);
-                operation.collectMarkerSpecification(boundNames);
-
-                switch (def.kind)
+                conditions = SuperColumnCompatibility.rebuildLWTColumnConditions(conditions, cfm, whereClause);
+                whereClause = SuperColumnCompatibility.prepareUpdateOperations(cfm, whereClause, updates, boundNames, operations);
+            }
+            else
+            {
+                for (Pair<ColumnIdentifier.Raw, Operation.RawUpdate> entry : updates)
                 {
-                    case PARTITION_KEY:
-                    case CLUSTERING_COLUMN:
-                        throw new InvalidRequestException(String.format("PRIMARY KEY part %s found in SET part", entry.left));
-                    default:
-                        stmt.addOperation(operation);
-                        break;
+                    ColumnDefinition def = getColumnDefinition(cfm, entry.left);
+
+                    checkFalse(def.isPrimaryKeyColumn(), "PRIMARY KEY part %s found in SET part", def.name);
+
+                    Operation operation = entry.right.prepare(cfm.ksName, def);
+                    operation.collectMarkerSpecification(boundNames);
+                    operations.add(operation);
                 }
             }
+            
+            StatementRestrictions restrictions = newRestrictions(cfm,
+                                                                 boundNames,
+                                                                 operations,
+                                                                 whereClause,
+                                                                 conditions);
 
-            stmt.processWhereClause(whereClause, boundNames);
-            return stmt;
+            return new UpdateStatement(type,
+                                       boundNames.size(),
+                                       cfm,
+                                       operations,
+                                       restrictions,
+                                       conditions,
+                                       attrs);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java
new file mode 100644
index 0000000..1d65a78
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java

@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.statements;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+
+/**
+ * Utility class to collect updates.
+ *
+ * <p>In a batch statement we don't want to recreate mutations every time as this is particularly inefficient when
+ * applying multiple batch to the same partition (see #6737). </p>
+ *
+ */
+final class UpdatesCollector
+{
+    /**
+     * The columns that will be updated for each table (keyed by the table ID).
+     */
+    private final Map<UUID, PartitionColumns> updatedColumns;
+
+    /**
+     * The estimated number of updated row.
+     */
+    private final int updatedRows;
+
+    /**
+     * The mutations per keyspace.
+     */
+    private final Map<String, Map<ByteBuffer, IMutation>> mutations = new HashMap<>();
+
+    public UpdatesCollector(Map<UUID, PartitionColumns> updatedColumns, int updatedRows)
+    {
+        super();
+        this.updatedColumns = updatedColumns;
+        this.updatedRows = updatedRows;
+    }
+
+    /**
+     * Gets the <code>PartitionUpdate</code> for the specified column family and key. If the update does not
+     * exist it will be created.
+     *
+     * @param cfm the column family meta data
+     * @param dk the partition key
+     * @param consistency the consistency level
+     * @return the <code>PartitionUpdate</code> for the specified column family and key
+     */
+    public PartitionUpdate getPartitionUpdate(CFMetaData cfm, DecoratedKey dk, ConsistencyLevel consistency)
+    {
+        Mutation mut = getMutation(cfm, dk, consistency);
+        PartitionUpdate upd = mut.get(cfm);
+        if (upd == null)
+        {
+            PartitionColumns columns = updatedColumns.get(cfm.cfId);
+            assert columns != null;
+            upd = new PartitionUpdate(cfm, dk, columns, updatedRows);
+            mut.add(upd);
+        }
+        return upd;
+    }
+
+    /**
+     * Check all partition updates contain only valid values for any
+     * indexed columns.
+     */
+    public void validateIndexedColumns()
+    {
+        for (Map<ByteBuffer, IMutation> perKsMutations : mutations.values())
+            for (IMutation mutation : perKsMutations.values())
+                for (PartitionUpdate update : mutation.getPartitionUpdates())
+                    Keyspace.openAndGetStore(update.metadata()).indexManager.validate(update);
+    }
+
+    private Mutation getMutation(CFMetaData cfm, DecoratedKey dk, ConsistencyLevel consistency)
+    {
+        String ksName = cfm.ksName;
+        IMutation mutation = keyspaceMap(ksName).get(dk.getKey());
+        if (mutation == null)
+        {
+            Mutation mut = new Mutation(ksName, dk);
+            mutation = cfm.isCounter() ? new CounterMutation(mut, consistency) : mut;
+            keyspaceMap(ksName).put(dk.getKey(), mutation);
+            return mut;
+        }
+        return cfm.isCounter() ? ((CounterMutation) mutation).getMutation() : (Mutation) mutation;
+    }
+
+    /**
+     * Returns a collection containing all the mutations.
+     * @return a collection containing all the mutations.
+     */
+    public Collection<IMutation> toMutations()
+    {
+        // The case where all statement where on the same keyspace is pretty common
+        if (mutations.size() == 1)
+            return mutations.values().iterator().next().values();
+
+        List<IMutation> ms = new ArrayList<>();
+        for (Map<ByteBuffer, IMutation> ksMap : mutations.values())
+            ms.addAll(ksMap.values());
+
+        return ms;
+    }
+
+    /**
+     * Returns the key-mutation mappings for the specified keyspace.
+     *
+     * @param ksName the keyspace name
+     * @return the key-mutation mappings for the specified keyspace.
+     */
+    private Map<ByteBuffer, IMutation> keyspaceMap(String ksName)
+    {
+        Map<ByteBuffer, IMutation> ksMap = mutations.get(ksName);
+        if (ksMap == null)
+        {
+            ksMap = new HashMap<>();
+            mutations.put(ksName, ksMap);
+        }
+        return ksMap;
+    }
+}

diff --git a/src/java/org/apache/cassandra/cql3/statements/UseStatement.java b/src/java/org/apache/cassandra/cql3/statements/UseStatement.java
index fe3d518..e4685cc 100644
--- a/src/java/org/apache/cassandra/cql3/statements/UseStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/UseStatement.java

@@ -39,7 +39,7 @@
         return 0;
     }
 
-    public Prepared prepare() throws InvalidRequestException
+    public Prepared prepare(ClientState clientState) throws InvalidRequestException
     {
         return new Prepared(this);
     }

diff --git a/src/java/org/apache/cassandra/db/AbstractCell.java b/src/java/org/apache/cassandra/db/AbstractCell.java
deleted file mode 100644
index bd63985..0000000
--- a/src/java/org/apache/cassandra/db/AbstractCell.java
+++ /dev/null

@@ -1,236 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOError;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.util.Iterator;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-
-public abstract class AbstractCell implements Cell
-{
-    public static Iterator<OnDiskAtom> onDiskIterator(final DataInput in,
-                                                      final ColumnSerializer.Flag flag,
-                                                      final int expireBefore,
-                                                      final Version version,
-                                                      final CellNameType type)
-    {
-        return new AbstractIterator<OnDiskAtom>()
-        {
-            protected OnDiskAtom computeNext()
-            {
-                OnDiskAtom atom;
-                try
-                {
-                    atom = type.onDiskAtomSerializer().deserializeFromSSTable(in, flag, expireBefore, version);
-                }
-                catch (IOException e)
-                {
-                    throw new IOError(e);
-                }
-                if (atom == null)
-                    return endOfData();
-
-                return atom;
-            }
-        };
-    }
-
-    public boolean isLive()
-    {
-        return true;
-    }
-
-    public boolean isLive(long now)
-    {
-        return true;
-    }
-
-    public int cellDataSize()
-    {
-        return name().dataSize() + value().remaining() + TypeSizes.NATIVE.sizeof(timestamp());
-    }
-
-    public int serializedSize(CellNameType type, TypeSizes typeSizes)
-    {
-        /*
-         * Size of a column is =
-         *   size of a name (short + length of the string)
-         * + 1 byte to indicate if the column has been deleted
-         * + 8 bytes for timestamp
-         * + 4 bytes which basically indicates the size of the byte array
-         * + entire byte array.
-        */
-        int valueSize = value().remaining();
-        return ((int)type.cellSerializer().serializedSize(name(), typeSizes)) + 1 + typeSizes.sizeof(timestamp()) + typeSizes.sizeof(valueSize) + valueSize;
-    }
-
-    public int serializationFlags()
-    {
-        return 0;
-    }
-
-    public Cell diff(Cell cell)
-    {
-        if (timestamp() < cell.timestamp())
-            return cell;
-        return null;
-    }
-
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name().toByteBuffer().duplicate());
-        digest.update(value().duplicate());
-
-        FBUtilities.updateWithLong(digest, timestamp());
-        FBUtilities.updateWithByte(digest, serializationFlags());
-    }
-
-    public int getLocalDeletionTime()
-    {
-        return Integer.MAX_VALUE;
-    }
-
-    public Cell reconcile(Cell cell)
-    {
-        long ts1 = timestamp(), ts2 = cell.timestamp();
-        if (ts1 != ts2)
-            return ts1 < ts2 ? cell : this;
-        if (isLive() != cell.isLive())
-            return isLive() ? cell : this;
-        return value().compareTo(cell.value()) < 0 ? cell : this;
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        return this == o || (o instanceof Cell && equals((Cell) o));
-    }
-
-    public boolean equals(Cell cell)
-    {
-        return timestamp() == cell.timestamp() && name().equals(cell.name()) && value().equals(cell.value())
-               && serializationFlags() == cell.serializationFlags();
-    }
-
-    public int hashCode()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public String getString(CellNameType comparator)
-    {
-        return String.format("%s:%b:%d@%d",
-                             comparator.getString(name()),
-                             !isLive(),
-                             value().remaining(),
-                             timestamp());
-    }
-
-    public void validateName(CFMetaData metadata) throws MarshalException
-    {
-        metadata.comparator.validate(name());
-    }
-
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-
-        AbstractType<?> valueValidator = metadata.getValueValidator(name());
-        if (valueValidator != null)
-            valueValidator.validateCellValue(value());
-    }
-
-    public static Cell create(CellName name, ByteBuffer value, long timestamp, int ttl, CFMetaData metadata)
-    {
-        if (ttl <= 0)
-            ttl = metadata.getDefaultTimeToLive();
-
-        return ttl > 0
-                ? new BufferExpiringCell(name, value, timestamp, ttl)
-                : new BufferCell(name, value, timestamp);
-    }
-
-    public Cell diffCounter(Cell cell)
-    {
-        assert this instanceof CounterCell : "Wrong class type: " + getClass();
-
-        if (timestamp() < cell.timestamp())
-            return cell;
-
-        // Note that if at that point, cell can't be a tombstone. Indeed,
-        // cell is the result of merging us with other nodes results, and
-        // merging a CounterCell with a tombstone never return a tombstone
-        // unless that tombstone timestamp is greater that the CounterCell
-        // one.
-        assert cell instanceof CounterCell : "Wrong class type: " + cell.getClass();
-
-        if (((CounterCell) this).timestampOfLastDelete() < ((CounterCell) cell).timestampOfLastDelete())
-            return cell;
-
-        CounterContext.Relationship rel = CounterCell.contextManager.diff(cell.value(), value());
-        return (rel == CounterContext.Relationship.GREATER_THAN || rel == CounterContext.Relationship.DISJOINT) ? cell : null;
-    }
-
-    /** This is temporary until we start creating Cells of the different type (buffer vs. native) */
-    public Cell reconcileCounter(Cell cell)
-    {
-        assert this instanceof CounterCell : "Wrong class type: " + getClass();
-
-        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
-        if (cell instanceof DeletedCell)
-            return cell;
-
-        assert (cell instanceof CounterCell) : "Wrong class type: " + cell.getClass();
-
-        // live < live last delete
-        if (timestamp() < ((CounterCell) cell).timestampOfLastDelete())
-            return cell;
-
-        long timestampOfLastDelete = ((CounterCell) this).timestampOfLastDelete();
-
-        // live last delete > live
-        if (timestampOfLastDelete > cell.timestamp())
-            return this;
-
-        // live + live. return one of the cells if its context is a superset of the other's, or merge them otherwise
-        ByteBuffer context = CounterCell.contextManager.merge(value(), cell.value());
-        if (context == value() && timestamp() >= cell.timestamp() && timestampOfLastDelete >= ((CounterCell) cell).timestampOfLastDelete())
-            return this;
-        else if (context == cell.value() && cell.timestamp() >= timestamp() && ((CounterCell) cell).timestampOfLastDelete() >= timestampOfLastDelete)
-            return cell;
-        else // merge clocks and timestamps.
-            return new BufferCounterCell(name(),
-                                         context,
-                                         Math.max(timestamp(), cell.timestamp()),
-                                         Math.max(timestampOfLastDelete, ((CounterCell) cell).timestampOfLastDelete()));
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/db/AbstractClusteringPrefix.java b/src/java/org/apache/cassandra/db/AbstractClusteringPrefix.java
new file mode 100644
index 0000000..2631b46
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/AbstractClusteringPrefix.java

@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.Objects;
+
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+
+public abstract class AbstractClusteringPrefix implements ClusteringPrefix
+{
+    protected static final ByteBuffer[] EMPTY_VALUES_ARRAY = new ByteBuffer[0];
+
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new Clustering(EMPTY_VALUES_ARRAY));
+
+    protected final Kind kind;
+    protected final ByteBuffer[] values;
+
+    protected AbstractClusteringPrefix(Kind kind, ByteBuffer[] values)
+    {
+        this.kind = kind;
+        this.values = values;
+    }
+
+    public Kind kind()
+    {
+        return kind;
+    }
+
+    public ClusteringPrefix clustering()
+    {
+        return this;
+    }
+
+    public int size()
+    {
+        return values.length;
+    }
+
+    public ByteBuffer get(int i)
+    {
+        return values[i];
+    }
+
+    public ByteBuffer[] getRawValues()
+    {
+        return values;
+    }
+
+    public int dataSize()
+    {
+        int size = 0;
+        for (int i = 0; i < size(); i++)
+        {
+            ByteBuffer bb = get(i);
+            size += bb == null ? 0 : bb.remaining();
+        }
+        return size;
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        for (int i = 0; i < size(); i++)
+        {
+            ByteBuffer bb = get(i);
+            if (bb != null)
+                digest.update(bb.duplicate());
+        }
+        FBUtilities.updateWithByte(digest, kind().ordinal());
+    }
+
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(values);
+    }
+
+    public long unsharedHeapSizeExcludingData()
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapExcludingData(values);
+    }
+
+    @Override
+    public final int hashCode()
+    {
+        int result = 31;
+        for (int i = 0; i < size(); i++)
+            result += 31 * Objects.hashCode(get(i));
+        return 31 * result + Objects.hashCode(kind());
+    }
+
+    @Override
+    public final boolean equals(Object o)
+    {
+        if(!(o instanceof ClusteringPrefix))
+            return false;
+
+        ClusteringPrefix that = (ClusteringPrefix)o;
+        if (this.kind() != that.kind() || this.size() != that.size())
+            return false;
+
+        for (int i = 0; i < size(); i++)
+            if (!Objects.equals(this.get(i), that.get(i)))
+                return false;
+
+        return true;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/AbstractNativeCell.java b/src/java/org/apache/cassandra/db/AbstractNativeCell.java
deleted file mode 100644
index 35e320d..0000000
--- a/src/java/org/apache/cassandra/db/AbstractNativeCell.java
+++ /dev/null

@@ -1,716 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.security.MessageDigest;
-
-import net.nicoulaj.compilecommand.annotations.Inline;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.FastByteOperations;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.*;
-
-
-/**
- * <pre>
- * {@code
- * Packs a CellName AND a Cell into one off-heap representation.
- * Layout is:
- *
- * Note we store the ColumnIdentifier in full as bytes. This seems an okay tradeoff for now, as we just
- * look it back up again when we need to, and in the near future we hope to switch to ints, longs or
- * UUIDs representing column identifiers on disk, at which point we can switch that here as well.
- *
- * [timestamp][value offset][name size]][name extra][name offset deltas][cell names][value][Descendants]
- * [   8b    ][     4b     ][    2b   ][     1b    ][     each 2b      ][ arb < 64k][ arb ][ arbitrary ]
- *
- * descendants: any overriding classes will put their state here
- * name offsets are deltas from their base offset, and don't include the first offset, or the end position of the final entry,
- * i.e. there will be size - 1 entries, and each is a delta that is added to the offset of the position of the first name
- * (which is always CELL_NAME_OFFSETS_OFFSET + (2 * (size - 1))). The length of the final name fills up any remaining
- * space upto the value offset
- * name extra:  lowest 2 bits indicate the clustering size delta (i.e. how many name items are NOT part of the clustering key)
- *              the next 2 bits indicate the CellNameType
- *              the next bit indicates if the column is a static or clustered/dynamic column
- * }
- * </pre>
- */
-public abstract class AbstractNativeCell extends AbstractCell implements CellName
-{
-    static final int TIMESTAMP_OFFSET = 4;
-    private static final int VALUE_OFFSET_OFFSET = 12;
-    private static final int CELL_NAME_SIZE_OFFSET = 16;
-    private static final int CELL_NAME_EXTRA_OFFSET = 18;
-    private static final int CELL_NAME_OFFSETS_OFFSET = 19;
-    private static final int CELL_NAME_SIZE_DELTA_MASK = 3;
-    private static final int CELL_NAME_TYPE_SHIFT = 2;
-    private static final int CELL_NAME_TYPE_MASK = 7;
-
-    private static enum NameType
-    {
-        COMPOUND_DENSE(0 << 2), COMPOUND_SPARSE(1 << 2), COMPOUND_SPARSE_STATIC(2 << 2), SIMPLE_DENSE(3 << 2), SIMPLE_SPARSE(4 << 2);
-        static final NameType[] TYPES = NameType.values();
-        final int bits;
-
-        NameType(int bits)
-        {
-            this.bits = bits;
-        }
-
-        static NameType typeOf(CellName name)
-        {
-            if (name instanceof CompoundDenseCellName)
-            {
-                assert !name.isStatic();
-                return COMPOUND_DENSE;
-            }
-
-            if (name instanceof CompoundSparseCellName)
-                return name.isStatic() ? COMPOUND_SPARSE_STATIC : COMPOUND_SPARSE;
-
-            if (name instanceof SimpleDenseCellName)
-            {
-                assert !name.isStatic();
-                return SIMPLE_DENSE;
-            }
-
-            if (name instanceof SimpleSparseCellName)
-            {
-                assert !name.isStatic();
-                return SIMPLE_SPARSE;
-            }
-
-            if (name instanceof NativeCell)
-                return ((NativeCell) name).nametype();
-
-            throw new AssertionError();
-        }
-    }
-
-    private final long peer; // peer is assigned by peer updater in setPeer method
-
-    AbstractNativeCell()
-    {
-        peer = -1;
-    }
-
-    public AbstractNativeCell(NativeAllocator allocator, OpOrder.Group writeOp, Cell copyOf)
-    {
-        int size = sizeOf(copyOf);
-        peer = allocator.allocate(size, writeOp);
-
-        MemoryUtil.setInt(peer, size);
-        construct(copyOf);
-    }
-
-    protected int sizeOf(Cell cell)
-    {
-        int size = CELL_NAME_OFFSETS_OFFSET + Math.max(0, cell.name().size() - 1) * 2 + cell.value().remaining();
-        CellName name = cell.name();
-        for (int i = 0; i < name.size(); i++)
-            size += name.get(i).remaining();
-        return size;
-    }
-
-    protected void construct(Cell from)
-    {
-        setLong(TIMESTAMP_OFFSET, from.timestamp());
-        CellName name = from.name();
-        int nameSize = name.size();
-        int offset = CELL_NAME_SIZE_OFFSET;
-        setShort(offset, (short) nameSize);
-        assert nameSize - name.clusteringSize() <= 2;
-        byte cellNameExtraBits = (byte) ((nameSize - name.clusteringSize()) | NameType.typeOf(name).bits);
-        setByte(offset += 2, cellNameExtraBits);
-        offset += 1;
-        short cellNameDelta = 0;
-        for (int i = 1; i < nameSize; i++)
-        {
-            cellNameDelta += name.get(i - 1).remaining();
-            setShort(offset, cellNameDelta);
-            offset += 2;
-        }
-        for (int i = 0; i < nameSize; i++)
-        {
-            ByteBuffer bb = name.get(i);
-            setBytes(offset, bb);
-            offset += bb.remaining();
-        }
-        setInt(VALUE_OFFSET_OFFSET, offset);
-        setBytes(offset, from.value());
-    }
-
-    // the offset at which to read the short that gives the names
-    private int nameDeltaOffset(int i)
-    {
-        return CELL_NAME_OFFSETS_OFFSET + ((i - 1) * 2);
-    }
-
-    int valueStartOffset()
-    {
-        return getInt(VALUE_OFFSET_OFFSET);
-    }
-
-    private int valueEndOffset()
-    {
-        return (int) (internalSize() - postfixSize());
-    }
-
-    protected int postfixSize()
-    {
-        return 0;
-    }
-
-    @Override
-    public ByteBuffer value()
-    {
-        long offset = valueStartOffset();
-        return getByteBuffer(offset, (int) (internalSize() - (postfixSize() + offset))).order(ByteOrder.BIG_ENDIAN);
-    }
-
-    private int clusteringSizeDelta()
-    {
-        return getByte(CELL_NAME_EXTRA_OFFSET) & CELL_NAME_SIZE_DELTA_MASK;
-    }
-
-    public boolean isStatic()
-    {
-        return nametype() == NameType.COMPOUND_SPARSE_STATIC;
-    }
-
-    NameType nametype()
-    {
-        return NameType.TYPES[(((int) this.getByte(CELL_NAME_EXTRA_OFFSET)) >> CELL_NAME_TYPE_SHIFT) & CELL_NAME_TYPE_MASK];
-    }
-
-    public long minTimestamp()
-    {
-        return timestamp();
-    }
-
-    public long maxTimestamp()
-    {
-        return timestamp();
-    }
-
-    public int clusteringSize()
-    {
-        return size() - clusteringSizeDelta();
-    }
-
-    @Override
-    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
-    {
-        switch (nametype())
-        {
-            case SIMPLE_SPARSE:
-                return getIdentifier(metadata, get(clusteringSize()));
-            case COMPOUND_SPARSE_STATIC:
-            case COMPOUND_SPARSE:
-                ByteBuffer buffer = get(clusteringSize());
-                if (buffer.remaining() == 0)
-                    return CompoundSparseCellNameType.rowMarkerId;
-
-                return getIdentifier(metadata, buffer);
-            case SIMPLE_DENSE:
-            case COMPOUND_DENSE:
-                return null;
-            default:
-                throw new AssertionError();
-        }
-    }
-
-    public ByteBuffer collectionElement()
-    {
-        return isCollectionCell() ? get(size() - 1) : null;
-    }
-
-    // we always have a collection element if our clustering size is 2 less than our total size,
-    // and we never have one otherwiss
-    public boolean isCollectionCell()
-    {
-        return clusteringSizeDelta() == 2;
-    }
-
-    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
-    {
-        switch (nametype())
-        {
-            case SIMPLE_DENSE:
-            case COMPOUND_DENSE:
-                return type.compare(this, other) == 0;
-            case COMPOUND_SPARSE_STATIC:
-            case COMPOUND_SPARSE:
-                int clusteringSize = clusteringSize();
-                if (clusteringSize != other.clusteringSize() || other.isStatic() != isStatic())
-                    return false;
-                for (int i = 0; i < clusteringSize; i++)
-                    if (type.subtype(i).compare(get(i), other.get(i)) != 0)
-                        return false;
-                return true;
-            case SIMPLE_SPARSE:
-                return true;
-            default:
-                throw new AssertionError();
-        }
-    }
-
-    public int size()
-    {
-        return getShort(CELL_NAME_SIZE_OFFSET);
-    }
-
-    public boolean isEmpty()
-    {
-        return size() == 0;
-    }
-
-    public ByteBuffer get(int i)
-    {
-        return get(i, null);
-    }
-
-    private ByteBuffer get(int i, AbstractAllocator copy)
-    {
-        // remember to take dense/sparse into account, and only return EOC when not dense
-        int size = size();
-        assert i >= 0 && i < size();
-        int cellNamesOffset = nameDeltaOffset(size);
-        int startDelta = i == 0 ? 0 : getShort(nameDeltaOffset(i));
-        int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
-        int length = endDelta - startDelta;
-        if (copy == null)
-            return getByteBuffer(cellNamesOffset + startDelta, length).order(ByteOrder.BIG_ENDIAN);
-        ByteBuffer result = copy.allocate(length);
-        FastByteOperations.UnsafeOperations.copy(null, peer + cellNamesOffset + startDelta, result, 0, length);
-        return result;
-    }
-
-    private static final ThreadLocal<byte[]> BUFFER = new ThreadLocal<byte[]>()
-    {
-        protected byte[] initialValue()
-        {
-            return new byte[256];
-        }
-    };
-
-    protected void writeComponentTo(MessageDigest digest, int i, boolean includeSize)
-    {
-        // remember to take dense/sparse into account, and only return EOC when not dense
-        int size = size();
-        assert i >= 0 && i < size();
-        int cellNamesOffset = nameDeltaOffset(size);
-        int startDelta = i == 0 ? 0 : getShort(nameDeltaOffset(i));
-        int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
-
-        int componentStart = cellNamesOffset + startDelta;
-        int count = endDelta - startDelta;
-
-        if (includeSize)
-            FBUtilities.updateWithShort(digest, count);
-
-        writeMemoryTo(digest, componentStart, count);
-    }
-
-    protected void writeMemoryTo(MessageDigest digest, int from, int count)
-    {
-        // only batch if we have more than 16 bytes remaining to transfer, otherwise fall-back to single-byte updates
-        int i = 0, batchEnd = count - 16;
-        if (i < batchEnd)
-        {
-            byte[] buffer = BUFFER.get();
-            while (i < batchEnd)
-            {
-                int transfer = Math.min(count - i, 256);
-                getBytes(from + i, buffer, 0, transfer);
-                digest.update(buffer, 0, transfer);
-                i += transfer;
-            }
-        }
-        while (i < count)
-            digest.update(getByte(from + i++));
-    }
-
-    public EOC eoc()
-    {
-        return EOC.NONE;
-    }
-
-    public Composite withEOC(EOC eoc)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Composite start()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Composite end()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public ColumnSlice slice()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public boolean isPrefixOf(CType type, Composite c)
-    {
-        if (size() > c.size() || isStatic() != c.isStatic())
-            return false;
-
-        for (int i = 0; i < size(); i++)
-        {
-            if (type.subtype(i).compare(get(i), c.get(i)) != 0)
-                return false;
-        }
-        return true;
-    }
-
-    public ByteBuffer toByteBuffer()
-    {
-        // for simple sparse we just return our one name buffer
-        switch (nametype())
-        {
-            case SIMPLE_DENSE:
-            case SIMPLE_SPARSE:
-                return get(0);
-            case COMPOUND_DENSE:
-            case COMPOUND_SPARSE_STATIC:
-            case COMPOUND_SPARSE:
-                // This is the legacy format of composites.
-                // See org.apache.cassandra.db.marshal.CompositeType for details.
-                ByteBuffer result = ByteBuffer.allocate(cellDataSize());
-                if (isStatic())
-                    ByteBufferUtil.writeShortLength(result, CompositeType.STATIC_MARKER);
-
-                for (int i = 0; i < size(); i++)
-                {
-                    ByteBuffer bb = get(i);
-                    ByteBufferUtil.writeShortLength(result, bb.remaining());
-                    result.put(bb);
-                    result.put((byte) 0);
-                }
-                result.flip();
-                return result;
-            default:
-                throw new AssertionError();
-        }
-    }
-
-    protected void updateWithName(MessageDigest digest)
-    {
-        // for simple sparse we just return our one name buffer
-        switch (nametype())
-        {
-            case SIMPLE_DENSE:
-            case SIMPLE_SPARSE:
-                writeComponentTo(digest, 0, false);
-                break;
-
-            case COMPOUND_DENSE:
-            case COMPOUND_SPARSE_STATIC:
-            case COMPOUND_SPARSE:
-                // This is the legacy format of composites.
-                // See org.apache.cassandra.db.marshal.CompositeType for details.
-                if (isStatic())
-                    FBUtilities.updateWithShort(digest, CompositeType.STATIC_MARKER);
-
-                for (int i = 0; i < size(); i++)
-                {
-                    writeComponentTo(digest, i, true);
-                    digest.update((byte) 0);
-                }
-                break;
-
-            default:
-                throw new AssertionError();
-        }
-    }
-
-    protected void updateWithValue(MessageDigest digest)
-    {
-        int offset = valueStartOffset();
-        int length = valueEndOffset() - offset;
-        writeMemoryTo(digest, offset, length);
-    }
-
-    @Override // this is the NAME dataSize, only!
-    public int dataSize()
-    {
-        switch (nametype())
-        {
-            case SIMPLE_DENSE:
-            case SIMPLE_SPARSE:
-                return valueStartOffset() - nameDeltaOffset(size());
-            case COMPOUND_DENSE:
-            case COMPOUND_SPARSE_STATIC:
-            case COMPOUND_SPARSE:
-                int size = size();
-                return valueStartOffset() - nameDeltaOffset(size) + 3 * size + (isStatic() ? 2 : 0);
-            default:
-                throw new AssertionError();
-        }
-    }
-
-    public boolean equals(Object obj)
-    {
-        if (obj == this)
-            return true;
-        if (obj instanceof CellName)
-            return equals((CellName) obj);
-        if (obj instanceof Cell)
-            return equals((Cell) obj);
-        return false;
-    }
-
-    public boolean equals(CellName that)
-    {
-        int size = this.size();
-        if (size != that.size())
-            return false;
-
-        for (int i = 0 ; i < size ; i++)
-            if (!get(i).equals(that.get(i)))
-                return false;
-        return true;
-    }
-
-    private static final ByteBuffer[] EMPTY = new ByteBuffer[0];
-
-    @Override
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        ByteBuffer[] r;
-        switch (nametype())
-        {
-            case SIMPLE_DENSE:
-                return CellNames.simpleDense(get(0, allocator));
-
-            case COMPOUND_DENSE:
-                r = new ByteBuffer[size()];
-                for (int i = 0; i < r.length; i++)
-                    r[i] = get(i, allocator);
-                return CellNames.compositeDense(r);
-
-            case COMPOUND_SPARSE_STATIC:
-            case COMPOUND_SPARSE:
-                int clusteringSize = clusteringSize();
-                r = clusteringSize == 0 ? EMPTY : new ByteBuffer[clusteringSize()];
-                for (int i = 0; i < clusteringSize; i++)
-                    r[i] = get(i, allocator);
-
-                ByteBuffer nameBuffer = get(r.length);
-                ColumnIdentifier name;
-
-                if (nameBuffer.remaining() == 0)
-                {
-                    name = CompoundSparseCellNameType.rowMarkerId;
-                }
-                else
-                {
-                    name = getIdentifier(cfm, nameBuffer);
-                }
-
-                if (clusteringSizeDelta() == 2)
-                {
-                    ByteBuffer element = allocator.clone(get(size() - 1));
-                    return CellNames.compositeSparseWithCollection(r, element, name, isStatic());
-                }
-                return CellNames.compositeSparse(r, name, isStatic());
-
-            case SIMPLE_SPARSE:
-                return CellNames.simpleSparse(getIdentifier(cfm, get(0)));
-        }
-        throw new IllegalStateException();
-    }
-
-    private static ColumnIdentifier getIdentifier(CFMetaData cfMetaData, ByteBuffer name)
-    {
-        ColumnDefinition def = cfMetaData.getColumnDefinition(name);
-        if (def != null)
-        {
-            return def.name;
-        }
-        else
-        {
-            // it's safe to simply grab based on clusteringPrefixSize() as we are only called if not a dense type
-            AbstractType<?> type = cfMetaData.comparator.subtype(cfMetaData.comparator.clusteringPrefixSize());
-            return new ColumnIdentifier(HeapAllocator.instance.clone(name), type);
-        }
-    }
-
-    @Override
-    public Cell withUpdatedName(CellName newName)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public Cell withUpdatedTimestamp(long newTimestamp)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    protected long internalSize()
-    {
-        return MemoryUtil.getInt(peer);
-    }
-
-    private void checkPosition(long offset, long size)
-    {
-        assert size >= 0;
-        assert peer > 0 : "Memory was freed";
-        assert offset >= 0 && offset + size <= internalSize() : String.format("Illegal range: [%d..%d), size: %s", offset, offset + size, internalSize());
-    }
-
-    protected final void setByte(long offset, byte b)
-    {
-        checkPosition(offset, 1);
-        MemoryUtil.setByte(peer + offset, b);
-    }
-
-    protected final void setShort(long offset, short s)
-    {
-        checkPosition(offset, 1);
-        MemoryUtil.setShort(peer + offset, s);
-    }
-
-    protected final void setInt(long offset, int l)
-    {
-        checkPosition(offset, 4);
-        MemoryUtil.setInt(peer + offset, l);
-    }
-
-    protected final void setLong(long offset, long l)
-    {
-        checkPosition(offset, 8);
-        MemoryUtil.setLong(peer + offset, l);
-    }
-
-    protected final void setBytes(long offset, ByteBuffer buffer)
-    {
-        int start = buffer.position();
-        int count = buffer.limit() - start;
-        if (count == 0)
-            return;
-
-        checkPosition(offset, count);
-        MemoryUtil.setBytes(peer + offset, buffer);
-    }
-
-    protected final byte getByte(long offset)
-    {
-        checkPosition(offset, 1);
-        return MemoryUtil.getByte(peer + offset);
-    }
-
-    protected final void getBytes(long offset, byte[] trg, int trgOffset, int count)
-    {
-        checkPosition(offset, count);
-        MemoryUtil.getBytes(peer + offset, trg, trgOffset, count);
-    }
-
-    protected final int getShort(long offset)
-    {
-        checkPosition(offset, 2);
-        return MemoryUtil.getShort(peer + offset);
-    }
-
-    protected final int getInt(long offset)
-    {
-        checkPosition(offset, 4);
-        return MemoryUtil.getInt(peer + offset);
-    }
-
-    protected final long getLong(long offset)
-    {
-        checkPosition(offset, 8);
-        return MemoryUtil.getLong(peer + offset);
-    }
-
-    protected final ByteBuffer getByteBuffer(long offset, int length)
-    {
-        checkPosition(offset, length);
-        return MemoryUtil.getByteBuffer(peer + offset, length);
-    }
-
-    // requires isByteOrderComparable to be true. Compares the name components only; ; may need to compare EOC etc still
-    @Inline
-    public final int compareTo(final Composite that)
-    {
-        if (isStatic() != that.isStatic())
-        {
-            // Static sorts before non-static no matter what, except for empty which
-            // always sort first
-            if (isEmpty())
-                return that.isEmpty() ? 0 : -1;
-            if (that.isEmpty())
-                return 1;
-            return isStatic() ? -1 : 1;
-        }
-
-        int size = size();
-        int size2 = that.size();
-        int minSize = Math.min(size, size2);
-        int startDelta = 0;
-        int cellNamesOffset = nameDeltaOffset(size);
-        for (int i = 0 ; i < minSize ; i++)
-        {
-            int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
-            long offset = peer + cellNamesOffset + startDelta;
-            int length = endDelta - startDelta;
-            int cmp = FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(i));
-            if (cmp != 0)
-                return cmp;
-            startDelta = endDelta;
-        }
-
-        EOC eoc = that.eoc();
-        if (size == size2)
-            return this.eoc().compareTo(eoc);
-
-        return size < size2 ? this.eoc().prefixComparisonResult : -eoc.prefixComparisonResult;
-    }
-
-    public final int compareToSimple(final Composite that)
-    {
-        assert size() == 1 && that.size() == 1;
-        int length = valueStartOffset() - nameDeltaOffset(1);
-        long offset = peer + nameDeltaOffset(1);
-        return FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(0));
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/AbstractRangeCommand.java b/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
deleted file mode 100644
index 8bcb5b3..0000000
--- a/src/java/org/apache/cassandra/db/AbstractRangeCommand.java
+++ /dev/null

@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.util.List;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.index.*;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.service.IReadCommand;
-
-public abstract class AbstractRangeCommand implements IReadCommand
-{
-    public final String keyspace;
-    public final String columnFamily;
-    public final long timestamp;
-
-    public final AbstractBounds<RowPosition> keyRange;
-    public final IDiskAtomFilter predicate;
-    public final List<IndexExpression> rowFilter;
-
-    public final SecondaryIndexSearcher searcher;
-
-    public AbstractRangeCommand(String keyspace, String columnFamily, long timestamp, AbstractBounds<RowPosition> keyRange, IDiskAtomFilter predicate, List<IndexExpression> rowFilter)
-    {
-        this.keyspace = keyspace;
-        this.columnFamily = columnFamily;
-        this.timestamp = timestamp;
-        this.keyRange = keyRange;
-        this.predicate = predicate;
-        this.rowFilter = rowFilter;
-        SecondaryIndexManager indexManager = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily).indexManager;
-        this.searcher = indexManager.getHighestSelectivityIndexSearcher(rowFilter);
-    }
-
-    public boolean requiresScanningAllRanges()
-    {
-        return searcher != null && searcher.requiresScanningAllRanges(rowFilter);
-    }
-
-    public List<Row> postReconciliationProcessing(List<Row> rows)
-    {
-        return searcher == null ? trim(rows) : trim(searcher.postReconciliationProcessing(rowFilter, rows));
-    }
-
-    private List<Row> trim(List<Row> rows)
-    {
-        if (countCQL3Rows() || ignoredTombstonedPartitions())
-            return rows;
-        else
-            return rows.size() > limit() ? rows.subList(0, limit()) : rows;
-    }
-
-    public String getKeyspace()
-    {
-        return keyspace;
-    }
-
-    public abstract MessageOut<? extends AbstractRangeCommand> createMessage();
-    public abstract AbstractRangeCommand forSubRange(AbstractBounds<RowPosition> range);
-    public abstract AbstractRangeCommand withUpdatedLimit(int newLimit);
-
-    public abstract int limit();
-    public abstract boolean countCQL3Rows();
-
-    /**
-     * Returns true if tombstoned partitions should not be included in results or count towards the limit.
-     * See CASSANDRA-8490 for more details on why this is needed (and done this way).
-     * */
-    public boolean ignoredTombstonedPartitions()
-    {
-        if (!(predicate instanceof SliceQueryFilter))
-            return false;
-
-        return ((SliceQueryFilter) predicate).compositesToGroup == SliceQueryFilter.IGNORE_TOMBSTONED_PARTITIONS;
-    }
-
-    public abstract List<Row> executeLocally();
-
-    public long getTimeout()
-    {
-        return DatabaseDescriptor.getRangeRpcTimeout();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/AbstractReadCommandBuilder.java b/src/java/org/apache/cassandra/db/AbstractReadCommandBuilder.java
new file mode 100644
index 0000000..a7f3319
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/AbstractReadCommandBuilder.java

@@ -0,0 +1,354 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.utils.FBUtilities;
+
+public abstract class AbstractReadCommandBuilder
+{
+    protected final ColumnFamilyStore cfs;
+    protected int nowInSeconds;
+
+    private int cqlLimit = -1;
+    private int pagingLimit = -1;
+    protected boolean reversed = false;
+
+    protected Set<ColumnIdentifier> columns;
+    protected final RowFilter filter = RowFilter.create();
+
+    private Slice.Bound lowerClusteringBound;
+    private Slice.Bound upperClusteringBound;
+
+    private NavigableSet<Clustering> clusterings;
+
+    // Use Util.cmd() instead of this ctor directly
+    AbstractReadCommandBuilder(ColumnFamilyStore cfs)
+    {
+        this.cfs = cfs;
+        this.nowInSeconds = FBUtilities.nowInSeconds();
+    }
+
+    public AbstractReadCommandBuilder withNowInSeconds(int nowInSec)
+    {
+        this.nowInSeconds = nowInSec;
+        return this;
+    }
+
+    public AbstractReadCommandBuilder fromIncl(Object... values)
+    {
+        assert lowerClusteringBound == null && clusterings == null;
+        this.lowerClusteringBound = Slice.Bound.create(cfs.metadata.comparator, true, true, values);
+        return this;
+    }
+
+    public AbstractReadCommandBuilder fromExcl(Object... values)
+    {
+        assert lowerClusteringBound == null && clusterings == null;
+        this.lowerClusteringBound = Slice.Bound.create(cfs.metadata.comparator, true, false, values);
+        return this;
+    }
+
+    public AbstractReadCommandBuilder toIncl(Object... values)
+    {
+        assert upperClusteringBound == null && clusterings == null;
+        this.upperClusteringBound = Slice.Bound.create(cfs.metadata.comparator, false, true, values);
+        return this;
+    }
+
+    public AbstractReadCommandBuilder toExcl(Object... values)
+    {
+        assert upperClusteringBound == null && clusterings == null;
+        this.upperClusteringBound = Slice.Bound.create(cfs.metadata.comparator, false, false, values);
+        return this;
+    }
+
+    public AbstractReadCommandBuilder includeRow(Object... values)
+    {
+        assert lowerClusteringBound == null && upperClusteringBound == null;
+
+        if (this.clusterings == null)
+            this.clusterings = new TreeSet<>(cfs.metadata.comparator);
+
+        this.clusterings.add(cfs.metadata.comparator.make(values));
+        return this;
+    }
+
+    public AbstractReadCommandBuilder reverse()
+    {
+        this.reversed = true;
+        return this;
+    }
+
+    public AbstractReadCommandBuilder withLimit(int newLimit)
+    {
+        this.cqlLimit = newLimit;
+        return this;
+    }
+
+    public AbstractReadCommandBuilder withPagingLimit(int newLimit)
+    {
+        this.pagingLimit = newLimit;
+        return this;
+    }
+
+    public AbstractReadCommandBuilder columns(String... columns)
+    {
+        if (this.columns == null)
+            this.columns = new HashSet<>();
+
+        for (String column : columns)
+            this.columns.add(ColumnIdentifier.getInterned(column, true));
+        return this;
+    }
+
+    private ByteBuffer bb(Object value, AbstractType<?> type)
+    {
+        return value instanceof ByteBuffer ? (ByteBuffer)value : ((AbstractType)type).decompose(value);
+    }
+
+    private AbstractType<?> forValues(AbstractType<?> collectionType)
+    {
+        assert collectionType instanceof CollectionType;
+        CollectionType ct = (CollectionType)collectionType;
+        switch (ct.kind)
+        {
+            case LIST:
+            case MAP:
+                return ct.valueComparator();
+            case SET:
+                return ct.nameComparator();
+        }
+        throw new AssertionError();
+    }
+
+    private AbstractType<?> forKeys(AbstractType<?> collectionType)
+    {
+        assert collectionType instanceof CollectionType;
+        CollectionType ct = (CollectionType)collectionType;
+        switch (ct.kind)
+        {
+            case LIST:
+            case MAP:
+                return ct.nameComparator();
+        }
+        throw new AssertionError();
+    }
+
+    @VisibleForTesting
+    public AbstractReadCommandBuilder filterOn(String column, Operator op, Object value)
+    {
+        ColumnDefinition def = cfs.metadata.getColumnDefinition(ColumnIdentifier.getInterned(column, true));
+        assert def != null;
+
+        AbstractType<?> type = def.type;
+        if (op == Operator.CONTAINS)
+            type = forValues(type);
+        else if (op == Operator.CONTAINS_KEY)
+            type = forKeys(type);
+
+        this.filter.add(def, op, bb(value, type));
+        return this;
+    }
+
+    protected ColumnFilter makeColumnFilter()
+    {
+        if (columns == null || columns.isEmpty())
+            return ColumnFilter.all(cfs.metadata);
+
+        ColumnFilter.Builder filter = ColumnFilter.selectionBuilder();
+        for (ColumnIdentifier column : columns)
+            filter.add(cfs.metadata.getColumnDefinition(column));
+        return filter.build();
+    }
+
+    protected ClusteringIndexFilter makeFilter()
+    {
+        // StatementRestrictions.isColumnRange() returns false for static compact tables, which means
+        // SelectStatement.makeClusteringIndexFilter uses a names filter with no clusterings for static
+        // compact tables, here we reproduce this behavior (CASSANDRA-11223). Note that this code is only
+        // called by tests.
+        if (cfs.metadata.isStaticCompactTable())
+            return new ClusteringIndexNamesFilter(new TreeSet<>(cfs.metadata.comparator), reversed);
+
+        if (clusterings != null)
+        {
+            return new ClusteringIndexNamesFilter(clusterings, reversed);
+        }
+        else
+        {
+            Slice slice = Slice.make(lowerClusteringBound == null ? Slice.Bound.BOTTOM : lowerClusteringBound,
+                                     upperClusteringBound == null ? Slice.Bound.TOP : upperClusteringBound);
+            return new ClusteringIndexSliceFilter(Slices.with(cfs.metadata.comparator, slice), reversed);
+        }
+    }
+
+    protected DataLimits makeLimits()
+    {
+        DataLimits limits = cqlLimit < 0 ? DataLimits.NONE : DataLimits.cqlLimits(cqlLimit);
+        if (pagingLimit >= 0)
+            limits = limits.forPaging(pagingLimit);
+        return limits;
+    }
+
+    public abstract ReadCommand build();
+
+    public static class SinglePartitionBuilder extends AbstractReadCommandBuilder
+    {
+        private final DecoratedKey partitionKey;
+
+        public SinglePartitionBuilder(ColumnFamilyStore cfs, DecoratedKey key)
+        {
+            super(cfs);
+            this.partitionKey = key;
+        }
+
+        @Override
+        public ReadCommand build()
+        {
+            return SinglePartitionReadCommand.create(cfs.metadata, nowInSeconds, makeColumnFilter(), filter, makeLimits(), partitionKey, makeFilter());
+        }
+    }
+
+    public static class SinglePartitionSliceBuilder extends AbstractReadCommandBuilder
+    {
+        private final DecoratedKey partitionKey;
+        private Slices.Builder sliceBuilder;
+
+        public SinglePartitionSliceBuilder(ColumnFamilyStore cfs, DecoratedKey key)
+        {
+            super(cfs);
+            this.partitionKey = key;
+            sliceBuilder = new Slices.Builder(cfs.getComparator());
+        }
+
+        public SinglePartitionSliceBuilder addSlice(Slice slice)
+        {
+            sliceBuilder.add(slice);
+            return this;
+        }
+
+        @Override
+        protected ClusteringIndexFilter makeFilter()
+        {
+            return new ClusteringIndexSliceFilter(sliceBuilder.build(), reversed);
+        }
+
+        @Override
+        public ReadCommand build()
+        {
+            return SinglePartitionReadCommand.create(cfs.metadata, nowInSeconds, makeColumnFilter(), filter, makeLimits(), partitionKey, makeFilter());
+        }
+    }
+
+    public static class PartitionRangeBuilder extends AbstractReadCommandBuilder
+    {
+        private DecoratedKey startKey;
+        private boolean startInclusive;
+        private DecoratedKey endKey;
+        private boolean endInclusive;
+
+        public PartitionRangeBuilder(ColumnFamilyStore cfs)
+        {
+            super(cfs);
+        }
+
+        public PartitionRangeBuilder fromKeyIncl(Object... values)
+        {
+            assert startKey == null;
+            this.startInclusive = true;
+            this.startKey = makeKey(cfs.metadata, values);
+            return this;
+        }
+
+        public PartitionRangeBuilder fromKeyExcl(Object... values)
+        {
+            assert startKey == null;
+            this.startInclusive = false;
+            this.startKey = makeKey(cfs.metadata, values);
+            return this;
+        }
+
+        public PartitionRangeBuilder toKeyIncl(Object... values)
+        {
+            assert endKey == null;
+            this.endInclusive = true;
+            this.endKey = makeKey(cfs.metadata, values);
+            return this;
+        }
+
+        public PartitionRangeBuilder toKeyExcl(Object... values)
+        {
+            assert endKey == null;
+            this.endInclusive = false;
+            this.endKey = makeKey(cfs.metadata, values);
+            return this;
+        }
+
+        @Override
+        public ReadCommand build()
+        {
+            PartitionPosition start = startKey;
+            if (start == null)
+            {
+                start = cfs.getPartitioner().getMinimumToken().maxKeyBound();
+                startInclusive = false;
+            }
+            PartitionPosition end = endKey;
+            if (end == null)
+            {
+                end = cfs.getPartitioner().getMinimumToken().maxKeyBound();
+                endInclusive = true;
+            }
+
+            AbstractBounds<PartitionPosition> bounds;
+            if (startInclusive && endInclusive)
+                bounds = new Bounds<>(start, end);
+            else if (startInclusive && !endInclusive)
+                bounds = new IncludingExcludingBounds<>(start, end);
+            else if (!startInclusive && endInclusive)
+                bounds = new Range<>(start, end);
+            else
+                bounds = new ExcludingBounds<>(start, end);
+
+            return PartitionRangeReadCommand.create(false, cfs.metadata, nowInSeconds, makeColumnFilter(), filter, makeLimits(), new DataRange(bounds, makeFilter()));
+        }
+
+        static DecoratedKey makeKey(CFMetaData metadata, Object... partitionKey)
+        {
+            if (partitionKey.length == 1 && partitionKey[0] instanceof DecoratedKey)
+                return (DecoratedKey)partitionKey[0];
+
+            ByteBuffer key = CFMetaData.serializePartitionKey(metadata.getKeyValidatorAsClusteringComparator().make(partitionKey));
+            return metadata.decorateKey(key);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java b/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
deleted file mode 100644
index 1beb982..0000000
--- a/src/java/org/apache/cassandra/db/ArrayBackedSortedColumns.java
+++ /dev/null

@@ -1,774 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.util.*;
-
-import com.google.common.base.Function;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.utils.BatchRemoveIterator;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.SearchIterator;
-
-/**
- * A ColumnFamily backed by an array.
- * This implementation is not synchronized and should only be used when
- * thread-safety is not required. This implementation makes sense when the
- * main operations performed are iterating over the cells and adding cells
- * (especially if insertion is in sorted order).
- */
-public class ArrayBackedSortedColumns extends ColumnFamily
-{
-    private static final Cell[] EMPTY_ARRAY = new Cell[0];
-    private static final int MINIMAL_CAPACITY = 10;
-
-    private final boolean reversed;
-
-    private DeletionInfo deletionInfo;
-    private Cell[] cells;
-    private int size;
-    private int sortedSize;
-    private volatile boolean isSorted;
-
-    public static final ColumnFamily.Factory<ArrayBackedSortedColumns> factory = new Factory<ArrayBackedSortedColumns>()
-    {
-        public ArrayBackedSortedColumns create(CFMetaData metadata, boolean insertReversed, int initialCapacity)
-        {
-            return new ArrayBackedSortedColumns(metadata, insertReversed, initialCapacity == 0 ? EMPTY_ARRAY : new Cell[initialCapacity], 0, 0);
-        }
-    };
-
-    private ArrayBackedSortedColumns(CFMetaData metadata, boolean reversed, Cell[] cells, int size, int sortedSize)
-    {
-        super(metadata);
-        this.reversed = reversed;
-        this.deletionInfo = DeletionInfo.live();
-        this.cells = cells;
-        this.size = size;
-        this.sortedSize = sortedSize;
-        this.isSorted = size == sortedSize;
-    }
-
-    protected ArrayBackedSortedColumns(CFMetaData metadata, boolean reversed)
-    {
-        this(metadata, reversed, EMPTY_ARRAY, 0, 0);
-    }
-
-    private ArrayBackedSortedColumns(ArrayBackedSortedColumns original)
-    {
-        super(original.metadata);
-        this.reversed = original.reversed;
-        this.deletionInfo = DeletionInfo.live(); // this is INTENTIONALLY not set to original.deletionInfo.
-        this.cells = Arrays.copyOf(original.cells, original.size);
-        this.size = original.size;
-        this.sortedSize = original.sortedSize;
-        this.isSorted = original.isSorted;
-    }
-
-    public static ArrayBackedSortedColumns localCopy(ColumnFamily original, AbstractAllocator allocator)
-    {
-        ArrayBackedSortedColumns copy = new ArrayBackedSortedColumns(original.metadata, false, new Cell[original.getColumnCount()], 0, 0);
-        for (Cell cell : original)
-            copy.internalAdd(cell.localCopy(original.metadata, allocator));
-        copy.sortedSize = copy.size; // internalAdd doesn't update sortedSize.
-        copy.delete(original);
-        return copy;
-    }
-
-    public ColumnFamily.Factory getFactory()
-    {
-        return factory;
-    }
-
-    public ColumnFamily cloneMe()
-    {
-        return new ArrayBackedSortedColumns(this);
-    }
-
-    public boolean isInsertReversed()
-    {
-        return reversed;
-    }
-
-    public BatchRemoveIterator<Cell> batchRemoveIterator()
-    {
-        maybeSortCells();
-
-        return new BatchRemoveIterator<Cell>()
-        {
-            private final Iterator<Cell> iter = iterator();
-            private BitSet removedIndexes = new BitSet(size);
-            private int idx = -1;
-            private boolean shouldCallNext = false;
-            private boolean isCommitted = false;
-            private boolean removedAnything = false;
-
-            public void commit()
-            {
-                if (isCommitted)
-                    throw new IllegalStateException();
-                isCommitted = true;
-
-                if (!removedAnything)
-                    return;
-
-                int retainedCount = 0;
-                int clearIdx, setIdx = -1;
-
-                // shift all [clearIdx, setIdx) segments to the left, skipping any removed columns
-                while (true)
-                {
-                    clearIdx = removedIndexes.nextClearBit(setIdx + 1);
-                    if (clearIdx >= size)
-                        break; // nothing left to retain
-
-                    setIdx = removedIndexes.nextSetBit(clearIdx + 1);
-                    if (setIdx < 0)
-                        setIdx = size; // no removals past retainIdx - copy all remaining cells
-
-                    if (retainedCount != clearIdx)
-                        System.arraycopy(cells, clearIdx, cells, retainedCount, setIdx - clearIdx);
-
-                    retainedCount += (setIdx - clearIdx);
-                }
-
-                for (int i = retainedCount; i < size; i++)
-                    cells[i] = null;
-
-                size = sortedSize = retainedCount;
-            }
-
-            public boolean hasNext()
-            {
-                return iter.hasNext();
-            }
-
-            public Cell next()
-            {
-                idx++;
-                shouldCallNext = false;
-                return iter.next();
-            }
-
-            public void remove()
-            {
-                if (shouldCallNext)
-                    throw new IllegalStateException();
-
-                removedIndexes.set(reversed ? size - idx - 1 : idx);
-                removedAnything = true;
-                shouldCallNext = true;
-            }
-        };
-    }
-
-    private Comparator<Composite> internalComparator()
-    {
-        return reversed ? getComparator().reverseComparator() : getComparator();
-    }
-
-    private void maybeSortCells()
-    {
-        if (!isSorted)
-            sortCells();
-    }
-
-    /**
-     * synchronized so that concurrent (read-only) accessors don't mess the internal state.
-     */
-    private synchronized void sortCells()
-    {
-        if (isSorted)
-            return; // Just sorted by a previous call
-
-        Comparator<Cell> comparator = reversed
-                                    ? getComparator().columnReverseComparator()
-                                    : getComparator().columnComparator(false);
-
-        // Sort the unsorted segment - will still potentially contain duplicate (non-reconciled) cells
-        Arrays.sort(cells, sortedSize, size, comparator);
-
-        // Determine the merge start position for that segment
-        int pos = binarySearch(0, sortedSize, cells[sortedSize].name(), internalComparator());
-        if (pos < 0)
-            pos = -pos - 1;
-
-        // Copy [pos, lastSortedCellIndex] cells into a separate array
-        Cell[] leftCopy = pos == sortedSize
-                        ? EMPTY_ARRAY
-                        : Arrays.copyOfRange(cells, pos, sortedSize);
-
-        // Store the beginning (inclusive) and the end (exclusive) indexes of the right segment
-        int rightStart = sortedSize;
-        int rightEnd = size;
-
-        // 'Trim' the sizes to what's left without the leftCopy
-        size = sortedSize = pos;
-
-        // Merge the cells from both segments. When adding from the left segment we can rely on it not having any
-        // duplicate cells, and thus omit the comparison with the previously entered cell - we'll never need to reconcile.
-        int l = 0, r = rightStart;
-        while (l < leftCopy.length && r < rightEnd)
-        {
-            int cmp = comparator.compare(leftCopy[l], cells[r]);
-            if (cmp < 0)
-                append(leftCopy[l++]);
-            else if (cmp == 0)
-                append(leftCopy[l++].reconcile(cells[r++]));
-            else
-                appendOrReconcile(cells[r++]);
-        }
-        while (l < leftCopy.length)
-            append(leftCopy[l++]);
-        while (r < rightEnd)
-            appendOrReconcile(cells[r++]);
-
-        // Nullify the remainder of the array (in case we had duplicate cells that got reconciled)
-        for (int i = size; i < rightEnd; i++)
-            cells[i] = null;
-
-        // Fully sorted at this point
-        isSorted = true;
-    }
-
-    private void appendOrReconcile(Cell cell)
-    {
-        if (size > 0 && cells[size - 1].name().equals(cell.name()))
-            reconcileWith(size - 1, cell);
-        else
-            append(cell);
-    }
-
-    private void append(Cell cell)
-    {
-        cells[size] = cell;
-        size++;
-        sortedSize++;
-    }
-
-    public Cell getColumn(CellName name)
-    {
-        maybeSortCells();
-        int pos = binarySearch(name);
-        return pos >= 0 ? cells[pos] : null;
-    }
-
-    /**
-      * Adds a cell, assuming that:
-      * - it's non-gc-able (if a tombstone) or not a tombstone
-      * - it has a more recent timestamp than any partition/range tombstone shadowing it
-      * - it sorts *strictly after* the current-last cell in the array.
-      */
-    public void maybeAppendColumn(Cell cell, DeletionInfo.InOrderTester tester, int gcBefore)
-    {
-        if (cell.getLocalDeletionTime() >= gcBefore && !tester.isDeleted(cell))
-            appendColumn(cell);
-    }
-
-    /**
-     * Adds a cell, assuming that it sorts *strictly after* the current-last cell in the array.
-     */
-    public void appendColumn(Cell cell)
-    {
-        internalAdd(cell);
-        sortedSize++;
-    }
-
-    public void addColumn(Cell cell)
-    {
-        if (size == 0)
-        {
-            internalAdd(cell);
-            sortedSize++;
-            return;
-        }
-
-        if (!isSorted)
-        {
-            internalAdd(cell);
-            return;
-        }
-
-        int c = internalComparator().compare(cells[size - 1].name(), cell.name());
-        if (c < 0)
-        {
-            // Append to the end
-            internalAdd(cell);
-            sortedSize++;
-        }
-        else if (c == 0)
-        {
-            // Resolve against the last cell
-            reconcileWith(size - 1, cell);
-        }
-        else
-        {
-            int pos = binarySearch(cell.name());
-            if (pos >= 0) // Reconcile with an existing cell
-            {
-                reconcileWith(pos, cell);
-            }
-            else
-            {
-                internalAdd(cell); // Append to the end, making cells unsorted from now on
-                isSorted = false;
-            }
-        }
-    }
-
-    public void addAll(ColumnFamily other)
-    {
-        delete(other.deletionInfo());
-
-        if (!other.hasColumns())
-            return;
-
-        // In reality, with ABSC being the only remaining container (aside from ABTC), other will aways be ABSC.
-        if (size == 0 && other instanceof ArrayBackedSortedColumns)
-        {
-            fastAddAll((ArrayBackedSortedColumns) other);
-        }
-        else
-        {
-            Iterator<Cell> iterator = reversed ? other.reverseIterator() : other.iterator();
-            while (iterator.hasNext())
-                addColumn(iterator.next());
-        }
-    }
-
-    // Fast path, when this ABSC is empty.
-    private void fastAddAll(ArrayBackedSortedColumns other)
-    {
-        if (other.isInsertReversed() == isInsertReversed())
-        {
-            cells = Arrays.copyOf(other.cells, other.cells.length);
-            size = other.size;
-            sortedSize = other.sortedSize;
-            isSorted = other.isSorted;
-        }
-        else
-        {
-            if (cells.length < other.getColumnCount())
-                cells = new Cell[Math.max(MINIMAL_CAPACITY, other.getColumnCount())];
-            Iterator<Cell> iterator = reversed ? other.reverseIterator() : other.iterator();
-            while (iterator.hasNext())
-                cells[size++] = iterator.next();
-            sortedSize = size;
-            isSorted = true;
-        }
-    }
-
-    /**
-     * Add a cell to the array, 'resizing' it first if necessary (if it doesn't fit).
-     */
-    private void internalAdd(Cell cell)
-    {
-        if (cells.length == size)
-            cells = Arrays.copyOf(cells, Math.max(MINIMAL_CAPACITY, size * 3 / 2 + 1));
-        cells[size++] = cell;
-    }
-
-    /**
-     * Remove the cell at a given index, shifting the rest of the array to the left if needed.
-     * Please note that we mostly remove from the end, so the shifting should be rare.
-     */
-    private void internalRemove(int index)
-    {
-        int moving = size - index - 1;
-        if (moving > 0)
-            System.arraycopy(cells, index + 1, cells, index, moving);
-        cells[--size] = null;
-    }
-
-    /**
-     * Reconcile with a cell at position i.
-     * Assume that i is a valid position.
-     */
-    private void reconcileWith(int i, Cell cell)
-    {
-        cells[i] = cell.reconcile(cells[i]);
-    }
-
-    private int binarySearch(CellName name)
-    {
-        return binarySearch(0, size, name, internalComparator());
-    }
-
-    /**
-     * Simple binary search for a given cell name.
-     * The return value has the exact same meaning that the one of Collections.binarySearch().
-     * (We don't use Collections.binarySearch() directly because it would require us to create
-     * a fake Cell (as well as an Cell comparator) to do the search, which is ugly.
-     */
-    private int binarySearch(int fromIndex, int toIndex, Composite name, Comparator<Composite> comparator)
-    {
-        int low = fromIndex;
-        int mid = toIndex;
-        int high = mid - 1;
-        int result = -1;
-        while (low <= high)
-        {
-            mid = (low + high) >> 1;
-            if ((result = comparator.compare(name, cells[mid].name())) > 0)
-                low = mid + 1;
-            else if (result == 0)
-                return mid;
-            else
-                high = mid - 1;
-        }
-        return -mid - (result < 0 ? 1 : 2);
-    }
-
-    public Collection<Cell> getSortedColumns()
-    {
-        return new CellCollection(reversed);
-    }
-
-    public Collection<Cell> getReverseSortedColumns()
-    {
-        return new CellCollection(!reversed);
-    }
-
-    public int getColumnCount()
-    {
-        maybeSortCells();
-        return size;
-    }
-
-    public boolean hasColumns()
-    {
-        return size > 0;
-    }
-
-    public void clear()
-    {
-        setDeletionInfo(DeletionInfo.live());
-        for (int i = 0; i < size; i++)
-            cells[i] = null;
-        size = sortedSize = 0;
-        isSorted = true;
-    }
-
-    public DeletionInfo deletionInfo()
-    {
-        return deletionInfo;
-    }
-
-    public void delete(DeletionTime delTime)
-    {
-        deletionInfo.add(delTime);
-    }
-
-    public void delete(DeletionInfo newInfo)
-    {
-        deletionInfo.add(newInfo);
-    }
-
-    protected void delete(RangeTombstone tombstone)
-    {
-        deletionInfo.add(tombstone, getComparator());
-    }
-
-    public void setDeletionInfo(DeletionInfo newInfo)
-    {
-        deletionInfo = newInfo;
-    }
-
-    /**
-     * Purges any tombstones with a local deletion time before gcBefore.
-     * @param gcBefore a timestamp (in seconds) before which tombstones should be purged
-     */
-    public void purgeTombstones(int gcBefore)
-    {
-        deletionInfo.purge(gcBefore);
-    }
-
-    public Iterable<CellName> getColumnNames()
-    {
-        return Iterables.transform(new CellCollection(false), new Function<Cell, CellName>()
-        {
-            public CellName apply(Cell cell)
-            {
-                return cell.name();
-            }
-        });
-    }
-
-    public Iterator<Cell> iterator(ColumnSlice[] slices)
-    {
-        maybeSortCells();
-        return slices.length == 1
-             ? slice(slices[0], reversed, null)
-             : new SlicesIterator(slices, reversed);
-    }
-
-    public Iterator<Cell> reverseIterator(ColumnSlice[] slices)
-    {
-        maybeSortCells();
-        return slices.length == 1
-             ? slice(slices[0], !reversed, null)
-             : new SlicesIterator(slices, !reversed);
-    }
-
-    public SearchIterator<CellName, Cell> searchIterator()
-    {
-        maybeSortCells();
-
-        return new SearchIterator<CellName, Cell>()
-        {
-            // the first index that we could find the next key at, i.e. one larger
-            // than the last key's location
-            private int i = 0;
-
-            // We assume a uniform distribution of keys,
-            // so we keep track of how many keys were skipped to satisfy last lookup, and only look at twice that
-            // many keys for next lookup initially, extending to whole range only if we couldn't find it in that subrange
-            private int range = size / 2;
-
-            public boolean hasNext()
-            {
-                return i < size;
-            }
-
-            public Cell next(CellName name)
-            {
-                if (!isSorted || !hasNext())
-                    throw new IllegalStateException();
-
-                // optimize for runs of sequential matches, as in CollationController
-                // checking to see if we've found the desired cells yet (CASSANDRA-6933)
-                int c = metadata.comparator.compare(name, cells[i].name());
-                if (c <= 0)
-                    return c < 0 ? null : cells[i++];
-
-                // use range to manually force a better bsearch "pivot" by breaking it into two calls:
-                // first for i..i+range, then i+range..size if necessary.
-                // https://issues.apache.org/jira/browse/CASSANDRA-6933?focusedCommentId=13958264&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13958264
-                int limit = Math.min(size, i + range);
-                int i2 = binarySearch(i + 1, limit, name, internalComparator());
-                if (-1 - i2 == limit)
-                    i2 = binarySearch(limit, size, name, internalComparator());
-                // i2 can't be zero since we already checked cells[i] above
-                if (i2 > 0)
-                {
-                    range = i2 - i;
-                    i = i2 + 1;
-                    return cells[i2];
-                }
-                i2 = -1 - i2;
-                range = i2 - i;
-                i = i2;
-                return null;
-            }
-        };
-    }
-
-    private class SlicesIterator extends AbstractIterator<Cell>
-    {
-        private final ColumnSlice[] slices;
-        private final boolean invert;
-
-        private int idx = 0;
-        private int previousSliceEnd;
-        private Iterator<Cell> currentSlice;
-
-        public SlicesIterator(ColumnSlice[] slices, boolean invert)
-        {
-            this.slices = slices;
-            this.invert = invert;
-            previousSliceEnd = invert ? size : 0;
-        }
-
-        protected Cell computeNext()
-        {
-            if (currentSlice == null)
-            {
-                if (idx >= slices.length)
-                    return endOfData();
-                currentSlice = slice(slices[idx++], invert, this);
-            }
-
-            if (currentSlice.hasNext())
-                return currentSlice.next();
-
-            currentSlice = null;
-            return computeNext();
-        }
-    }
-
-    /**
-     * @return a sub-range of our cells as an Iterator, between the provided composites (inclusive)
-     *
-     * @param slice  The slice with the inclusive start and finish bounds
-     * @param invert If the sort order of our collection is opposite to the desired sort order of the result;
-     *               this results in swapping the start/finish (since they are provided based on the desired
-     *               sort order, not our sort order), to normalise to our sort order, and a backwards iterator is returned
-     * @param iter   If this slice is part of a multi-slice, the iterator will be updated to ensure cells are visited only once
-     */
-    private Iterator<Cell> slice(ColumnSlice slice, boolean invert, SlicesIterator iter)
-    {
-        Composite start = invert ? slice.finish : slice.start;
-        Composite finish = invert ? slice.start : slice.finish;
-
-        int lowerBound = 0, upperBound = size;
-        if (iter != null)
-        {
-            if (invert)
-                upperBound = iter.previousSliceEnd;
-            else
-                lowerBound = iter.previousSliceEnd;
-        }
-
-        if (!start.isEmpty())
-        {
-            lowerBound = binarySearch(lowerBound, upperBound, start, internalComparator());
-            if (lowerBound < 0)
-                lowerBound = -lowerBound - 1;
-        }
-
-        if (!finish.isEmpty())
-        {
-            upperBound = binarySearch(lowerBound, upperBound, finish, internalComparator());
-            upperBound = upperBound < 0
-                       ? -upperBound - 1
-                       : upperBound + 1; // upperBound is exclusive for the iterators
-        }
-
-        // If we're going backwards (wrt our sort order) we store the startIdx and use it as our upper bound next round
-        if (iter != null)
-            iter.previousSliceEnd = invert ? lowerBound : upperBound;
-
-        return invert
-             ? new BackwardsCellIterator(lowerBound, upperBound)
-             : new ForwardsCellIterator(lowerBound, upperBound);
-    }
-
-    private final class BackwardsCellIterator implements Iterator<Cell>
-    {
-        private int idx, end;
-        private boolean shouldCallNext = true;
-
-        // lowerBound inclusive, upperBound exclusive
-        private BackwardsCellIterator(int lowerBound, int upperBound)
-        {
-            idx = upperBound - 1;
-            end = lowerBound - 1;
-        }
-
-        public boolean hasNext()
-        {
-            return idx > end;
-        }
-
-        public Cell next()
-        {
-            try
-            {
-                shouldCallNext = false;
-                return cells[idx--];
-            }
-            catch (ArrayIndexOutOfBoundsException e)
-            {
-                NoSuchElementException ne = new NoSuchElementException(e.getMessage());
-                ne.initCause(e);
-                throw ne;
-            }
-        }
-
-        public void remove()
-        {
-            if (shouldCallNext)
-                throw new IllegalStateException();
-            shouldCallNext = true;
-            internalRemove(idx + 1);
-            sortedSize--;
-        }
-    }
-
-    private final class ForwardsCellIterator implements Iterator<Cell>
-    {
-        private int idx, end;
-        private boolean shouldCallNext = true;
-
-        // lowerBound inclusive, upperBound exclusive
-        private ForwardsCellIterator(int lowerBound, int upperBound)
-        {
-            idx = lowerBound;
-            end = upperBound;
-        }
-
-        public boolean hasNext()
-        {
-            return idx < end;
-        }
-
-        public Cell next()
-        {
-            try
-            {
-                shouldCallNext = false;
-                return cells[idx++];
-            }
-            catch (ArrayIndexOutOfBoundsException e)
-            {
-                NoSuchElementException ne = new NoSuchElementException(e.getMessage());
-                ne.initCause(e);
-                throw ne;
-            }
-        }
-
-        public void remove()
-        {
-            if (shouldCallNext)
-                throw new IllegalStateException();
-            shouldCallNext = true;
-            internalRemove(--idx);
-            sortedSize--;
-            end--;
-        }
-    }
-
-    private final class CellCollection extends AbstractCollection<Cell>
-    {
-        private final boolean invert;
-
-        private CellCollection(boolean invert)
-        {
-            this.invert = invert;
-        }
-
-        public int size()
-        {
-            return getColumnCount();
-        }
-
-        public Iterator<Cell> iterator()
-        {
-            maybeSortCells();
-            return invert
-                 ? new BackwardsCellIterator(0, size)
-                 : new ForwardsCellIterator(0, size);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/AtomDeserializer.java b/src/java/org/apache/cassandra/db/AtomDeserializer.java
deleted file mode 100644
index 74f1946..0000000
--- a/src/java/org/apache/cassandra/db/AtomDeserializer.java
+++ /dev/null

@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.sstable.format.Version;
-
-/**
- * Helper class to deserialize OnDiskAtom efficiently.
- *
- * More precisely, this class is used by the low-level readers
- * (IndexedSliceReader and SSTableNamesIterator) to ensure we don't
- * do more work than necessary (i.e. we don't allocate/deserialize
- * objects for things we don't care about).
- */
-public class AtomDeserializer
-{
-    private final CellNameType type;
-    private final CellNameType.Deserializer nameDeserializer;
-    private final DataInput in;
-    private final ColumnSerializer.Flag flag;
-    private final int expireBefore;
-    private final Version version;
-
-    // The "flag" for the next name (which correspond to the "masks" in ColumnSerializer) if it has been
-    // read already, Integer.MIN_VALUE otherwise;
-    private int nextFlags = Integer.MIN_VALUE;
-
-    public AtomDeserializer(CellNameType type, DataInput in, ColumnSerializer.Flag flag, int expireBefore, Version version)
-    {
-        this.type = type;
-        this.nameDeserializer = type.newDeserializer(in);
-        this.in = in;
-        this.flag = flag;
-        this.expireBefore = expireBefore;
-        this.version = version;
-    }
-
-    /**
-     * Whether or not there is more atom to read.
-     */
-    public boolean hasNext() throws IOException
-    {
-        return nameDeserializer.hasNext();
-    }
-
-    /**
-     * Whether or not some atom has been read but not processed (neither readNext() nor
-     * skipNext() has been called for that atom) yet.
-     */
-    public boolean hasUnprocessed() throws IOException
-    {
-        return nameDeserializer.hasUnprocessed();
-    }
-
-    /**
-     * Compare the provided composite to the next atom to read on disk.
-     *
-     * This will not read/deserialize the whole atom but only what is necessary for the
-     * comparison. Whenever we know what to do with this atom (read it or skip it),
-     * readNext or skipNext should be called.
-     */
-    public int compareNextTo(Composite composite) throws IOException
-    {
-        return nameDeserializer.compareNextTo(composite);
-    }
-
-    /**
-     * Returns whether the next atom is a range tombstone or not.
-     *
-     * Please note that this should only be called after compareNextTo() has been called.
-     */
-    public boolean nextIsRangeTombstone() throws IOException
-    {
-        nextFlags = in.readUnsignedByte();
-        return (nextFlags & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0;
-    }
-
-    /**
-     * Returns the next atom.
-     */
-    public OnDiskAtom readNext() throws IOException
-    {
-        Composite name = nameDeserializer.readNext();
-        assert !name.isEmpty(); // This would imply hasNext() hasn't been called
-
-        nextFlags = nextFlags == Integer.MIN_VALUE ? in.readUnsignedByte() : nextFlags;
-        OnDiskAtom atom = (nextFlags & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0
-                        ? type.rangeTombstoneSerializer().deserializeBody(in, name, version)
-                        : type.columnSerializer().deserializeColumnBody(in, (CellName)name, nextFlags, flag, expireBefore);
-        nextFlags = Integer.MIN_VALUE;
-        return atom;
-    }
-
-    /**
-     * Skips the next atom.
-     */
-    public void skipNext() throws IOException
-    {
-        nameDeserializer.skipNext();
-        nextFlags = nextFlags == Integer.MIN_VALUE ? in.readUnsignedByte() : nextFlags;
-        if ((nextFlags & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0)
-            type.rangeTombstoneSerializer().skipBody(in, version);
-        else
-            type.columnSerializer().skipColumnBody(in, nextFlags);
-        nextFlags = Integer.MIN_VALUE;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java b/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
deleted file mode 100644
index f5b7712..0000000
--- a/src/java/org/apache/cassandra/db/AtomicBTreeColumns.java
+++ /dev/null

@@ -1,586 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.util.AbstractCollection;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
-import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
-
-import com.google.common.base.Function;
-import com.google.common.base.Functions;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.utils.*;
-import org.apache.cassandra.utils.SearchIterator;
-import org.apache.cassandra.utils.btree.BTree;
-import org.apache.cassandra.utils.btree.BTreeSearchIterator;
-import org.apache.cassandra.utils.btree.UpdateFunction;
-import org.apache.cassandra.utils.concurrent.Locks;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.HeapAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-import org.apache.cassandra.utils.memory.NativePool;
-
-import static org.apache.cassandra.db.index.SecondaryIndexManager.Updater;
-
-/**
- * A thread-safe and atomic ISortedColumns implementation.
- * Operations (in particular addAll) on this implemenation are atomic and
- * isolated (in the sense of ACID). Typically a addAll is guaranteed that no
- * other thread can see the state where only parts but not all columns have
- * been added.
- * <p>
- * WARNING: removing element through getSortedColumns().iterator() is *not* supported
- * </p>
- */
-public class AtomicBTreeColumns extends ColumnFamily
-{
-    static final long EMPTY_SIZE = ObjectSizes.measure(new AtomicBTreeColumns(CFMetaData.denseCFMetaData("keyspace", "table", BytesType.instance), null))
-            + ObjectSizes.measure(new Holder(null, null));
-
-    // Reserved values for wasteTracker field. These values must not be consecutive (see avoidReservedValues)
-    private static final int TRACKER_NEVER_WASTED = 0;
-    private static final int TRACKER_PESSIMISTIC_LOCKING = Integer.MAX_VALUE;
-
-    // The granularity with which we track wasted allocation/work; we round up
-    private static final int ALLOCATION_GRANULARITY_BYTES = 1024;
-    // The number of bytes we have to waste in excess of our acceptable realtime rate of waste (defined below)
-    private static final long EXCESS_WASTE_BYTES = 10 * 1024 * 1024L;
-    private static final int EXCESS_WASTE_OFFSET = (int) (EXCESS_WASTE_BYTES / ALLOCATION_GRANULARITY_BYTES);
-    // Note this is a shift, because dividing a long time and then picking the low 32 bits doesn't give correct rollover behavior
-    private static final int CLOCK_SHIFT = 17;
-    // CLOCK_GRANULARITY = 1^9ns >> CLOCK_SHIFT == 132us == (1/7.63)ms
-
-    /**
-     * (clock + allocation) granularity are combined to give us an acceptable (waste) allocation rate that is defined by
-     * the passage of real time of ALLOCATION_GRANULARITY_BYTES/CLOCK_GRANULARITY, or in this case 7.63Kb/ms, or 7.45Mb/s
-     *
-     * in wasteTracker we maintain within EXCESS_WASTE_OFFSET before the current time; whenever we waste bytes
-     * we increment the current value if it is within this window, and set it to the min of the window plus our waste
-     * otherwise.
-     */
-    private volatile int wasteTracker = TRACKER_NEVER_WASTED;
-
-    private static final AtomicIntegerFieldUpdater<AtomicBTreeColumns> wasteTrackerUpdater = AtomicIntegerFieldUpdater.newUpdater(AtomicBTreeColumns.class, "wasteTracker");
-
-    private static final Function<Cell, CellName> NAME = new Function<Cell, CellName>()
-    {
-        public CellName apply(Cell column)
-        {
-            return column.name();
-        }
-    };
-
-    public static final Factory<AtomicBTreeColumns> factory = new Factory<AtomicBTreeColumns>()
-    {
-        public AtomicBTreeColumns create(CFMetaData metadata, boolean insertReversed, int initialCapacity)
-        {
-            if (insertReversed)
-                throw new IllegalArgumentException();
-            return new AtomicBTreeColumns(metadata);
-        }
-    };
-
-    private static final DeletionInfo LIVE = DeletionInfo.live();
-    // This is a small optimization: DeletionInfo is mutable, but we know that we will always copy it in that class,
-    // so we can safely alias one DeletionInfo.live() reference and avoid some allocations.
-    private static final Holder EMPTY = new Holder(BTree.empty(), LIVE);
-
-    private volatile Holder ref;
-
-    private static final AtomicReferenceFieldUpdater<AtomicBTreeColumns, Holder> refUpdater = AtomicReferenceFieldUpdater.newUpdater(AtomicBTreeColumns.class, Holder.class, "ref");
-
-    private AtomicBTreeColumns(CFMetaData metadata)
-    {
-        this(metadata, EMPTY);
-    }
-
-    private AtomicBTreeColumns(CFMetaData metadata, Holder holder)
-    {
-        super(metadata);
-        this.ref = holder;
-    }
-
-    public Factory getFactory()
-    {
-        return factory;
-    }
-
-    public ColumnFamily cloneMe()
-    {
-        return new AtomicBTreeColumns(metadata, ref);
-    }
-
-    public DeletionInfo deletionInfo()
-    {
-        return ref.deletionInfo;
-    }
-
-    public void delete(DeletionTime delTime)
-    {
-        delete(new DeletionInfo(delTime));
-    }
-
-    protected void delete(RangeTombstone tombstone)
-    {
-        delete(new DeletionInfo(tombstone, getComparator()));
-    }
-
-    public SearchIterator<CellName, Cell> searchIterator()
-    {
-        return new BTreeSearchIterator<>(ref.tree, asymmetricComparator());
-    }
-
-    public void delete(DeletionInfo info)
-    {
-        if (info.isLive())
-            return;
-
-        // Keeping deletion info for max markedForDeleteAt value
-        while (true)
-        {
-            Holder current = ref;
-            DeletionInfo curDelInfo = current.deletionInfo;
-            DeletionInfo newDelInfo = info.mayModify(curDelInfo) ? curDelInfo.copy().add(info) : curDelInfo;
-            if (refUpdater.compareAndSet(this, current, current.with(newDelInfo)))
-                break;
-        }
-    }
-
-    public void setDeletionInfo(DeletionInfo newInfo)
-    {
-        ref = ref.with(newInfo);
-    }
-
-    public void purgeTombstones(int gcBefore)
-    {
-        while (true)
-        {
-            Holder current = ref;
-            if (!current.deletionInfo.hasPurgeableTombstones(gcBefore))
-                break;
-
-            DeletionInfo purgedInfo = current.deletionInfo.copy();
-            purgedInfo.purge(gcBefore);
-            if (refUpdater.compareAndSet(this, current, current.with(purgedInfo)))
-                break;
-        }
-    }
-
-    /**
-     * This is only called by Memtable.resolve, so only AtomicBTreeColumns needs to implement it.
-     *
-     * @return the difference in size seen after merging the given columns
-     */
-    public ColumnUpdater addAllWithSizeDelta(final ColumnFamily cm, MemtableAllocator allocator, OpOrder.Group writeOp, Updater indexer)
-    {
-        ColumnUpdater updater = new ColumnUpdater(this, cm.metadata, allocator, writeOp, indexer);
-        DeletionInfo inputDeletionInfoCopy = null;
-
-        boolean monitorOwned = false;
-        try
-        {
-            if (usePessimisticLocking())
-            {
-                Locks.monitorEnterUnsafe(this);
-                monitorOwned = true;
-            }
-            while (true)
-            {
-                Holder current = ref;
-                updater.ref = current;
-                updater.reset();
-
-                DeletionInfo deletionInfo;
-                if (cm.deletionInfo().mayModify(current.deletionInfo))
-                {
-                    if (inputDeletionInfoCopy == null)
-                        inputDeletionInfoCopy = cm.deletionInfo().copy(HeapAllocator.instance);
-
-                    deletionInfo = current.deletionInfo.copy().add(inputDeletionInfoCopy);
-                    updater.allocated(deletionInfo.unsharedHeapSize() - current.deletionInfo.unsharedHeapSize());
-                }
-                else
-                {
-                    deletionInfo = current.deletionInfo;
-                }
-
-                Object[] tree = BTree.update(current.tree, metadata.comparator.columnComparator(Memtable.MEMORY_POOL instanceof NativePool), cm, cm.getColumnCount(), true, updater);
-
-                if (tree != null && refUpdater.compareAndSet(this, current, new Holder(tree, deletionInfo)))
-                {
-                    indexer.updateRowLevelIndexes();
-                    updater.finish();
-                    return updater;
-                }
-                else if (!monitorOwned)
-                {
-                    boolean shouldLock = usePessimisticLocking();
-                    if (!shouldLock)
-                    {
-                        shouldLock = updateWastedAllocationTracker(updater.heapSize);
-                    }
-                    if (shouldLock)
-                    {
-                        Locks.monitorEnterUnsafe(this);
-                        monitorOwned = true;
-                    }
-                }
-            }
-        }
-        finally
-        {
-            if (monitorOwned)
-                Locks.monitorExitUnsafe(this);
-        }
-    }
-
-    boolean usePessimisticLocking()
-    {
-        return wasteTracker == TRACKER_PESSIMISTIC_LOCKING;
-    }
-
-    /**
-     * Update the wasted allocation tracker state based on newly wasted allocation information
-     *
-     * @param wastedBytes the number of bytes wasted by this thread
-     * @return true if the caller should now proceed with pessimistic locking because the waste limit has been reached
-     */
-    private boolean updateWastedAllocationTracker(long wastedBytes) {
-        // Early check for huge allocation that exceeds the limit
-        if (wastedBytes < EXCESS_WASTE_BYTES)
-        {
-            // We round up to ensure work < granularity are still accounted for
-            int wastedAllocation = ((int) (wastedBytes + ALLOCATION_GRANULARITY_BYTES - 1)) / ALLOCATION_GRANULARITY_BYTES;
-
-            int oldTrackerValue;
-            while (TRACKER_PESSIMISTIC_LOCKING != (oldTrackerValue = wasteTracker))
-            {
-                // Note this time value has an arbitrary offset, but is a constant rate 32 bit counter (that may wrap)
-                int time = (int) (System.nanoTime() >>> CLOCK_SHIFT);
-                int delta = oldTrackerValue - time;
-                if (oldTrackerValue == TRACKER_NEVER_WASTED || delta >= 0 || delta < -EXCESS_WASTE_OFFSET)
-                    delta = -EXCESS_WASTE_OFFSET;
-                delta += wastedAllocation;
-                if (delta >= 0)
-                    break;
-                if (wasteTrackerUpdater.compareAndSet(this, oldTrackerValue, avoidReservedValues(time + delta)))
-                    return false;
-            }
-        }
-        // We have definitely reached our waste limit so set the state if it isn't already
-        wasteTrackerUpdater.set(this, TRACKER_PESSIMISTIC_LOCKING);
-        // And tell the caller to proceed with pessimistic locking
-        return true;
-    }
-
-    private static int avoidReservedValues(int wasteTracker)
-    {
-        if (wasteTracker == TRACKER_NEVER_WASTED || wasteTracker == TRACKER_PESSIMISTIC_LOCKING)
-            return wasteTracker + 1;
-        return wasteTracker;
-    }
-
-    // no particular reason not to implement these next methods, we just haven't needed them yet
-
-    public void addColumn(Cell column)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void maybeAppendColumn(Cell cell, DeletionInfo.InOrderTester tester, int gcBefore)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void appendColumn(Cell cell)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void addAll(ColumnFamily cf)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void clear()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Cell getColumn(CellName name)
-    {
-        return (Cell) BTree.find(ref.tree, asymmetricComparator(), name);
-    }
-
-    private Comparator<Object> asymmetricComparator()
-    {
-        return metadata.comparator.asymmetricColumnComparator(Memtable.MEMORY_POOL instanceof NativePool);
-    }
-
-    public Iterable<CellName> getColumnNames()
-    {
-        return collection(false, NAME);
-    }
-
-    public Collection<Cell> getSortedColumns()
-    {
-        return collection(true, Functions.<Cell>identity());
-    }
-
-    public Collection<Cell> getReverseSortedColumns()
-    {
-        return collection(false, Functions.<Cell>identity());
-    }
-
-    private <V> Collection<V> collection(final boolean forwards, final Function<Cell, V> f)
-    {
-        final Holder ref = this.ref;
-        return new AbstractCollection<V>()
-        {
-            public Iterator<V> iterator()
-            {
-                return Iterators.transform(BTree.<Cell>slice(ref.tree, forwards), f);
-            }
-
-            public int size()
-            {
-                return BTree.slice(ref.tree, true).count();
-            }
-        };
-    }
-
-    public int getColumnCount()
-    {
-        return BTree.slice(ref.tree, true).count();
-    }
-
-    public boolean hasColumns()
-    {
-        return !BTree.isEmpty(ref.tree);
-    }
-
-    public Iterator<Cell> iterator(ColumnSlice[] slices)
-    {
-        return slices.length == 1
-             ? slice(ref.tree, asymmetricComparator(), slices[0].start, slices[0].finish, true)
-             : new SliceIterator(ref.tree, asymmetricComparator(), true, slices);
-    }
-
-    public Iterator<Cell> reverseIterator(ColumnSlice[] slices)
-    {
-        return slices.length == 1
-             ? slice(ref.tree, asymmetricComparator(), slices[0].finish, slices[0].start, false)
-             : new SliceIterator(ref.tree, asymmetricComparator(), false, slices);
-    }
-
-    public boolean isInsertReversed()
-    {
-        return false;
-    }
-
-    public BatchRemoveIterator<Cell> batchRemoveIterator()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    private static final class Holder
-    {
-        final DeletionInfo deletionInfo;
-        // the btree of columns
-        final Object[] tree;
-
-        Holder(Object[] tree, DeletionInfo deletionInfo)
-        {
-            this.tree = tree;
-            this.deletionInfo = deletionInfo;
-        }
-
-        Holder with(DeletionInfo info)
-        {
-            return new Holder(this.tree, info);
-        }
-    }
-
-    // the function we provide to the btree utilities to perform any column replacements
-    static final class ColumnUpdater implements UpdateFunction<Cell>
-    {
-        final AtomicBTreeColumns updating;
-        final CFMetaData metadata;
-        final MemtableAllocator allocator;
-        final OpOrder.Group writeOp;
-        final Updater indexer;
-        Holder ref;
-        long dataSize;
-        long heapSize;
-        long colUpdateTimeDelta = Long.MAX_VALUE;
-        final MemtableAllocator.DataReclaimer reclaimer;
-        List<Cell> inserted; // TODO: replace with walk of aborted BTree
-        long minTimestamp = Long.MAX_VALUE;
-
-        private ColumnUpdater(AtomicBTreeColumns updating, CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group writeOp, Updater indexer)
-        {
-            this.updating = updating;
-            this.allocator = allocator;
-            this.writeOp = writeOp;
-            this.indexer = indexer;
-            this.metadata = metadata;
-            this.reclaimer = allocator.reclaimer();
-        }
-
-        public Cell apply(Cell insert)
-        {
-            indexer.insert(insert);
-            insert = insert.localCopy(metadata, allocator, writeOp);
-            this.dataSize += insert.cellDataSize();
-            this.heapSize += insert.unsharedHeapSizeExcludingData();
-            if (inserted == null)
-                inserted = new ArrayList<>();
-            inserted.add(insert);
-            minTimestamp = Math.min(minTimestamp, insert.timestamp());
-            return insert;
-        }
-
-        public Cell apply(Cell existing, Cell update)
-        {
-            Cell reconciled = existing.reconcile(update);
-            indexer.update(existing, reconciled);
-            // pick the smallest timestamp because we want to be consistent with the logic applied when inserting
-            // a cell in apply(Cell insert) above. For example given 3 timestamps where T3 < T2 < T1 then we want
-            // [apply(T1) -> apply(T2) -> apply(T3)] and [apply(T3) -> apply(T2) -> apply(T1)] to both return the
-            // smallest value T3, see CompactionControllerTest.testMaxPurgeableTimestamp()
-            minTimestamp = Math.min(minTimestamp, update.timestamp());
-            if (existing != reconciled)
-            {
-                reconciled = reconciled.localCopy(metadata, allocator, writeOp);
-                dataSize += reconciled.cellDataSize() - existing.cellDataSize();
-                heapSize += reconciled.unsharedHeapSizeExcludingData() - existing.unsharedHeapSizeExcludingData();
-                if (inserted == null)
-                    inserted = new ArrayList<>();
-                inserted.add(reconciled);
-                discard(existing);
-                //Getting the minimum delta for an update containing multiple columns
-                colUpdateTimeDelta =  Math.min(Math.abs(existing.timestamp()  - update.timestamp()), colUpdateTimeDelta);
-            }
-            return reconciled;
-        }
-
-        protected void reset()
-        {
-            this.dataSize = 0;
-            this.heapSize = 0;
-            if (inserted != null)
-            {
-                for (Cell cell : inserted)
-                    abort(cell);
-                inserted.clear();
-            }
-            reclaimer.cancel();
-            minTimestamp = Long.MAX_VALUE;
-        }
-
-        protected void abort(Cell abort)
-        {
-            reclaimer.reclaimImmediately(abort);
-        }
-
-        protected void discard(Cell discard)
-        {
-            reclaimer.reclaim(discard);
-        }
-
-        public boolean abortEarly()
-        {
-            return updating.ref != ref;
-        }
-
-        public void allocated(long heapSize)
-        {
-            this.heapSize += heapSize;
-        }
-
-        protected void finish()
-        {
-            allocator.onHeap().adjust(heapSize, writeOp);
-            reclaimer.commit();
-        }
-    }
-
-    private static class SliceIterator extends AbstractIterator<Cell>
-    {
-        private final Object[] btree;
-        private final boolean forwards;
-        private final Comparator<Object> comparator;
-        private final ColumnSlice[] slices;
-
-        private int idx = 0;
-        private Iterator<Cell> currentSlice;
-
-        SliceIterator(Object[] btree, Comparator<Object> comparator, boolean forwards, ColumnSlice[] slices)
-        {
-            this.btree = btree;
-            this.comparator = comparator;
-            this.slices = slices;
-            this.forwards = forwards;
-        }
-
-        protected Cell computeNext()
-        {
-            while (currentSlice != null || idx < slices.length)
-            {
-                if (currentSlice == null)
-                {
-                    ColumnSlice slice = slices[idx++];
-                    if (forwards)
-                        currentSlice = slice(btree, comparator, slice.start, slice.finish, true);
-                    else
-                        currentSlice = slice(btree, comparator, slice.finish, slice.start, false);
-                }
-
-                if (currentSlice.hasNext())
-                    return currentSlice.next();
-
-                currentSlice = null;
-            }
-
-            return endOfData();
-        }
-    }
-
-    private static Iterator<Cell> slice(Object[] btree, Comparator<Object> comparator, Composite start, Composite finish, boolean forwards)
-    {
-        return BTree.slice(btree,
-                           comparator,
-                           start.isEmpty() ? null : start,
-                           true,
-                           finish.isEmpty() ? null : finish,
-                           true,
-                           forwards);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/BatchlogManager.java b/src/java/org/apache/cassandra/db/BatchlogManager.java
deleted file mode 100644
index 40f8ce0..0000000
--- a/src/java/org/apache/cassandra/db/BatchlogManager.java
+++ /dev/null

@@ -1,540 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.net.InetAddress;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.*;
-import java.util.concurrent.atomic.AtomicLong;
-import javax.management.ObjectName;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.*;
-import com.google.common.util.concurrent.RateLimiter;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.WriteFailureException;
-import org.apache.cassandra.exceptions.WriteTimeoutException;
-import org.apache.cassandra.gms.FailureDetector;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageProxy;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.service.WriteResponseHandler;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ExecutorUtils;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.MBeanWrapper;
-import org.apache.cassandra.utils.WrappedRunnable;
-import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
-
-public class BatchlogManager implements BatchlogManagerMBean
-{
-    private static final String MBEAN_NAME = "org.apache.cassandra.db:type=BatchlogManager";
-    private static final long REPLAY_INTERVAL = 60 * 1000; // milliseconds
-    private static final int PAGE_SIZE = 128; // same as HHOM, for now, w/out using any heuristics. TODO: set based on avg batch size.
-
-    private static final Logger logger = LoggerFactory.getLogger(BatchlogManager.class);
-    public static final BatchlogManager instance = new BatchlogManager();
-
-    private final AtomicLong totalBatchesReplayed = new AtomicLong();
-
-    // Single-thread executor service for scheduling and serializing log replay.
-    private static final ScheduledExecutorService batchlogTasks = new DebuggableScheduledThreadPoolExecutor("BatchlogTasks");
-
-    public void start()
-    {
-        MBeanWrapper mbs = MBeanWrapper.instance;
-        try
-        {
-            mbs.registerMBean(this, new ObjectName(MBEAN_NAME));
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
-        }
-
-        Runnable runnable = new WrappedRunnable()
-        {
-            public void runMayThrow() throws ExecutionException, InterruptedException
-            {
-                replayAllFailedBatches();
-            }
-        };
-
-        batchlogTasks.scheduleWithFixedDelay(runnable, StorageService.RING_DELAY, REPLAY_INTERVAL, TimeUnit.MILLISECONDS);
-    }
-
-    public static void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
-    {
-        ExecutorUtils.shutdownAndWait(timeout, unit, batchlogTasks);
-    }
-
-    public int countAllBatches()
-    {
-        String query = String.format("SELECT count(*) FROM %s.%s", SystemKeyspace.NAME, SystemKeyspace.BATCHLOG);
-        return (int) executeInternal(query).one().getLong("count");
-    }
-
-    public long getTotalBatchesReplayed()
-    {
-        return totalBatchesReplayed.longValue();
-    }
-
-    public void forceBatchlogReplay() throws Exception
-    {
-        startBatchlogReplay().get();
-    }
-
-    public Future<?> startBatchlogReplay()
-    {
-        Runnable runnable = new WrappedRunnable()
-        {
-            public void runMayThrow() throws ExecutionException, InterruptedException
-            {
-                replayAllFailedBatches();
-            }
-        };
-        // If a replay is already in progress this request will be executed after it completes.
-        return batchlogTasks.submit(runnable);
-    }
-
-    public static Mutation getBatchlogMutationFor(Collection<Mutation> mutations, UUID uuid, int version)
-    {
-        return getBatchlogMutationFor(mutations, uuid, version, FBUtilities.timestampMicros());
-    }
-
-    @VisibleForTesting
-    static Mutation getBatchlogMutationFor(Collection<Mutation> mutations, UUID uuid, int version, long now)
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(SystemKeyspace.Batchlog);
-        CFRowAdder adder = new CFRowAdder(cf, SystemKeyspace.Batchlog.comparator.builder().build(), now);
-        adder.add("data", serializeMutations(mutations, version))
-             .add("written_at", new Date(now / 1000))
-             .add("version", version);
-        return new Mutation(SystemKeyspace.NAME, UUIDType.instance.decompose(uuid), cf);
-    }
-
-    private static ByteBuffer serializeMutations(Collection<Mutation> mutations, int version)
-    {
-        try (DataOutputBuffer buf = new DataOutputBuffer())
-        {
-            buf.writeInt(mutations.size());
-            for (Mutation mutation : mutations)
-                Mutation.serializer.serialize(mutation, buf, version);
-            return buf.buffer();
-        }
-        catch (IOException e)
-        {
-            throw new AssertionError(); // cannot happen.
-        }
-    }
-
-    private void replayAllFailedBatches() throws ExecutionException, InterruptedException
-    {
-        logger.trace("Started replayAllFailedBatches");
-
-        // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
-        // max rate is scaled by the number of nodes in the cluster (same as for HHOM - see CASSANDRA-5272).
-        int throttleInKB = DatabaseDescriptor.getBatchlogReplayThrottleInKB() / StorageService.instance.getTokenMetadata().getAllEndpoints().size();
-        RateLimiter rateLimiter = RateLimiter.create(throttleInKB == 0 ? Double.MAX_VALUE : throttleInKB * 1024);
-
-        UntypedResultSet page = executeInternal(String.format("SELECT id, data, written_at, version FROM %s.%s LIMIT %d",
-                                                              SystemKeyspace.NAME,
-                                                              SystemKeyspace.BATCHLOG,
-                                                              PAGE_SIZE));
-
-        while (!page.isEmpty())
-        {
-            UUID id = processBatchlogPage(page, rateLimiter);
-
-            if (page.size() < PAGE_SIZE)
-                break; // we've exhausted the batchlog, next query would be empty.
-
-            page = executeInternal(String.format("SELECT id, data, written_at, version FROM %s.%s WHERE token(id) > token(?) LIMIT %d",
-                                                 SystemKeyspace.NAME,
-                                                 SystemKeyspace.BATCHLOG,
-                                                 PAGE_SIZE),
-                                   id);
-        }
-
-        cleanup();
-
-        logger.trace("Finished replayAllFailedBatches");
-    }
-
-    private void deleteBatch(UUID id)
-    {
-        Mutation mutation = new Mutation(SystemKeyspace.NAME, UUIDType.instance.decompose(id));
-        mutation.delete(SystemKeyspace.BATCHLOG, FBUtilities.timestampMicros());
-        mutation.apply();
-    }
-
-    private UUID processBatchlogPage(UntypedResultSet page, RateLimiter rateLimiter)
-    {
-        UUID id = null;
-        ArrayList<Batch> batches = new ArrayList<>(page.size());
-
-        // Sending out batches for replay without waiting for them, so that one stuck batch doesn't affect others
-        for (UntypedResultSet.Row row : page)
-        {
-            id = row.getUUID("id");
-            long writtenAt = row.getLong("written_at");
-            // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-            long timeout = getBatchlogTimeout();
-            if (System.currentTimeMillis() < writtenAt + timeout)
-                continue; // not ready to replay yet, might still get a deletion.
-
-            int version = row.has("version") ? row.getInt("version") : MessagingService.VERSION_12;
-            Batch batch = new Batch(id, writtenAt, row.getBytes("data"), version);
-            try
-            {
-                if (batch.replay(rateLimiter) > 0)
-                {
-                    batches.add(batch);
-                }
-                else
-                {
-                    deleteBatch(id); // no write mutations were sent (either expired or all CFs involved truncated).
-                    totalBatchesReplayed.incrementAndGet();
-                }
-            }
-            catch (IOException e)
-            {
-                logger.warn("Skipped batch replay of {} due to {}", id, e);
-                deleteBatch(id);
-            }
-        }
-
-        // now waiting for all batches to complete their processing
-        // schedule hints for timed out deliveries
-        for (Batch batch : batches)
-        {
-            batch.finish();
-            deleteBatch(batch.id);
-        }
-
-        totalBatchesReplayed.addAndGet(batches.size());
-
-        return id;
-    }
-
-    public long getBatchlogTimeout()
-    {
-        return DatabaseDescriptor.getWriteRpcTimeout() * 2; // enough time for the actual write + BM removal mutation
-    }
-
-    private static class Batch
-    {
-        private final UUID id;
-        private final long writtenAt;
-        private final ByteBuffer data;
-        private final int version;
-
-        private List<ReplayWriteResponseHandler<Mutation>> replayHandlers;
-
-        public Batch(UUID id, long writtenAt, ByteBuffer data, int version)
-        {
-            this.id = id;
-            this.writtenAt = writtenAt;
-            this.data = data;
-            this.version = version;
-        }
-
-        public int replay(RateLimiter rateLimiter) throws IOException
-        {
-            logger.trace("Replaying batch {}", id);
-
-            List<Mutation> mutations = replayingMutations();
-
-            if (mutations.isEmpty())
-                return 0;
-
-            int ttl = calculateHintTTL(mutations);
-            if (ttl <= 0)
-                return 0;
-
-            replayHandlers = sendReplays(mutations, writtenAt, ttl);
-
-            rateLimiter.acquire(data.remaining()); // acquire afterwards, to not mess up ttl calculation.
-
-            return replayHandlers.size();
-        }
-
-        public void finish()
-        {
-            for (int i = 0; i < replayHandlers.size(); i++)
-            {
-                ReplayWriteResponseHandler<Mutation> handler = replayHandlers.get(i);
-                try
-                {
-                    handler.get();
-                }
-                catch (WriteTimeoutException|WriteFailureException e)
-                {
-                    logger.trace("Failed replaying a batched mutation to a node, will write a hint");
-                    logger.trace("Failure was : {}", e.getMessage());
-                    // writing hints for the rest to hints, starting from i
-                    writeHintsForUndeliveredEndpoints(i);
-                    return;
-                }
-            }
-        }
-
-        private List<Mutation> replayingMutations() throws IOException
-        {
-            DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(data));
-            int size = in.readInt();
-            List<Mutation> mutations = new ArrayList<>(size);
-            for (int i = 0; i < size; i++)
-            {
-                Mutation mutation = Mutation.serializer.deserialize(in, version);
-
-                // Remove CFs that have been truncated since. writtenAt and SystemTable#getTruncatedAt() both return millis.
-                // We don't abort the replay entirely b/c this can be considered a success (truncated is same as delivered then
-                // truncated.
-                for (UUID cfId : mutation.getColumnFamilyIds())
-                    if (writtenAt <= SystemKeyspace.getTruncatedAt(cfId))
-                        mutation = mutation.without(cfId);
-
-                if (!mutation.isEmpty())
-                    mutations.add(mutation);
-            }
-            return mutations;
-        }
-
-        private void writeHintsForUndeliveredEndpoints(int startFrom)
-        {
-            try
-            {
-                // Here we deserialize mutations 2nd time from byte buffer.
-                // but this is ok, because timeout on batch direct delivery is rare
-                // (it can happen only several seconds until node is marked dead)
-                // so trading some cpu to keep less objects
-                List<Mutation> replayingMutations = replayingMutations();
-                for (int i = startFrom; i < replayHandlers.size(); i++)
-                {
-                    Mutation undeliveredMutation = replayingMutations.get(i);
-                    int ttl = calculateHintTTL(replayingMutations);
-                    ReplayWriteResponseHandler<Mutation> handler = replayHandlers.get(i);
-
-                    if (ttl > 0 && handler != null)
-                        for (InetAddress endpoint : handler.undelivered)
-                            StorageProxy.writeHintForMutation(undeliveredMutation, writtenAt, ttl, endpoint);
-                }
-            }
-            catch (IOException e)
-            {
-                logger.error("Cannot schedule hints for undelivered batch", e);
-            }
-        }
-
-        private List<ReplayWriteResponseHandler<Mutation>> sendReplays(List<Mutation> mutations, long writtenAt, int ttl)
-        {
-            List<ReplayWriteResponseHandler<Mutation>> handlers = new ArrayList<>(mutations.size());
-            for (Mutation mutation : mutations)
-            {
-                ReplayWriteResponseHandler<Mutation> handler = sendSingleReplayMutation(mutation, writtenAt, ttl);
-                if (handler != null)
-                    handlers.add(handler);
-            }
-            return handlers;
-        }
-
-        /**
-         * We try to deliver the mutations to the replicas ourselves if they are alive and only resort to writing hints
-         * when a replica is down or a write request times out.
-         *
-         * @return direct delivery handler to wait on or null, if no live nodes found
-         */
-        private ReplayWriteResponseHandler<Mutation> sendSingleReplayMutation(final Mutation mutation, long writtenAt, int ttl)
-        {
-            Set<InetAddress> liveEndpoints = new HashSet<>();
-            String ks = mutation.getKeyspaceName();
-            Token tk = StorageService.getPartitioner().getToken(mutation.key());
-
-            for (InetAddress endpoint : Iterables.concat(StorageService.instance.getNaturalEndpoints(ks, tk),
-                                                         StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, ks)))
-            {
-                if (endpoint.equals(FBUtilities.getBroadcastAddress()))
-                    mutation.apply();
-                else if (FailureDetector.instance.isAlive(endpoint))
-                    liveEndpoints.add(endpoint); // will try delivering directly instead of writing a hint.
-                else
-                    StorageProxy.writeHintForMutation(mutation, writtenAt, ttl, endpoint);
-            }
-
-            if (liveEndpoints.isEmpty())
-                return null;
-
-            ReplayWriteResponseHandler<Mutation> handler = new ReplayWriteResponseHandler<>(liveEndpoints);
-            MessageOut<Mutation> message = mutation.createMessage();
-            for (InetAddress endpoint : liveEndpoints)
-                MessagingService.instance().sendRR(message, endpoint, handler, false);
-            return handler;
-        }
-
-        /*
-         * Calculate ttl for the mutations' hints (and reduce ttl by the time the mutations spent in the batchlog).
-         * This ensures that deletes aren't "undone" by an old batch replay.
-         */
-        private int calculateHintTTL(Collection<Mutation> mutations)
-        {
-            int unadjustedTTL = Integer.MAX_VALUE;
-            for (Mutation mutation : mutations)
-                unadjustedTTL = Math.min(unadjustedTTL, HintedHandOffManager.calculateHintTTL(mutation));
-            return unadjustedTTL - (int) TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - writtenAt);
-        }
-
-        /**
-         * A wrapper of WriteResponseHandler that stores the addresses of the endpoints from
-         * which we did not receive a successful reply.
-         */
-        private static class ReplayWriteResponseHandler<T> extends WriteResponseHandler<T>
-        {
-            private final Set<InetAddress> undelivered = Collections.newSetFromMap(new ConcurrentHashMap<InetAddress, Boolean>());
-
-            public ReplayWriteResponseHandler(Collection<InetAddress> writeEndpoints)
-            {
-                super(writeEndpoints, Collections.<InetAddress>emptySet(), null, null, null, WriteType.UNLOGGED_BATCH);
-                undelivered.addAll(writeEndpoints);
-            }
-
-            @Override
-            protected int totalBlockFor()
-            {
-                return this.naturalEndpoints.size();
-            }
-
-            @Override
-            public void response(MessageIn<T> m)
-            {
-                boolean removed = undelivered.remove(m == null ? FBUtilities.getBroadcastAddress() : m.from);
-                assert removed;
-                super.response(m);
-            }
-        }
-    }
-
-    // force flush + compaction to reclaim space from the replayed batches
-    private void cleanup() throws ExecutionException, InterruptedException
-    {
-        ColumnFamilyStore cfs = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHLOG);
-        cfs.forceBlockingFlush();
-        Collection<Descriptor> descriptors = new ArrayList<>();
-        for (SSTableReader sstr : cfs.getSSTables())
-            descriptors.add(sstr.descriptor);
-        if (!descriptors.isEmpty()) // don't pollute the logs if there is nothing to compact.
-            CompactionManager.instance.submitUserDefined(cfs, descriptors, Integer.MAX_VALUE).get();
-    }
-
-    public static class EndpointFilter
-    {
-        private final String localRack;
-        private final Multimap<String, InetAddress> endpoints;
-
-        public EndpointFilter(String localRack, Multimap<String, InetAddress> endpoints)
-        {
-            this.localRack = localRack;
-            this.endpoints = endpoints;
-        }
-
-        /**
-         * @return list of candidates for batchlog hosting. If possible these will be two nodes from different racks.
-         */
-        public Collection<InetAddress> filter()
-        {
-            // special case for single-node data centers
-            if (endpoints.values().size() == 1)
-                return endpoints.values();
-
-            // strip out dead endpoints and localhost
-            ListMultimap<String, InetAddress> validated = ArrayListMultimap.create();
-            for (Map.Entry<String, InetAddress> entry : endpoints.entries())
-                if (isValid(entry.getValue()))
-                    validated.put(entry.getKey(), entry.getValue());
-
-            if (validated.size() <= 2)
-                return validated.values();
-
-            if (validated.size() - validated.get(localRack).size() >= 2)
-            {
-                // we have enough endpoints in other racks
-                validated.removeAll(localRack);
-            }
-
-            if (validated.keySet().size() == 1)
-            {
-                // we have only 1 `other` rack
-                // pick up to two random nodes from there
-                List<InetAddress> otherRack = validated.get(validated.keySet().iterator().next());
-                Collections.shuffle(otherRack);
-                return Lists.newArrayList(Iterables.limit(otherRack, 2));
-            }
-
-            // randomize which racks we pick from if more than 2 remaining
-            Collection<String> racks;
-            if (validated.keySet().size() == 2)
-            {
-                racks = validated.keySet();
-            }
-            else
-            {
-                racks = Lists.newArrayList(validated.keySet());
-                Collections.shuffle((List) racks);
-            }
-
-            // grab a random member of up to two racks
-            List<InetAddress> result = new ArrayList<>(2);
-            for (String rack : Iterables.limit(racks, 2))
-            {
-                List<InetAddress> rackMembers = validated.get(rack);
-                result.add(rackMembers.get(getRandomInt(rackMembers.size())));
-            }
-
-            return result;
-        }
-
-        @VisibleForTesting
-        protected boolean isValid(InetAddress input)
-        {
-            return !input.equals(FBUtilities.getBroadcastAddress()) && FailureDetector.instance.isAlive(input);
-        }
-
-        @VisibleForTesting
-        protected int getRandomInt(int bound)
-        {
-            return ThreadLocalRandom.current().nextInt(bound);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/BatchlogManagerMBean.java b/src/java/org/apache/cassandra/db/BatchlogManagerMBean.java
deleted file mode 100644
index 5ddf232..0000000
--- a/src/java/org/apache/cassandra/db/BatchlogManagerMBean.java
+++ /dev/null

@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-public interface BatchlogManagerMBean
-{
-    /**
-     * Counts all batches currently in the batchlog.
-     *
-     * @return total batch count
-     */
-    public int countAllBatches();
-
-    /**
-     * @return total count of batches replayed since node start
-     */
-    public long getTotalBatchesReplayed();
-
-    /**
-     * Forces batchlog replay. Blocks until completion.
-     */
-    public void forceBatchlogReplay() throws Exception;
-}

diff --git a/src/java/org/apache/cassandra/db/BufferCell.java b/src/java/org/apache/cassandra/db/BufferCell.java
deleted file mode 100644
index ee5fe41..0000000
--- a/src/java/org/apache/cassandra/db/BufferCell.java
+++ /dev/null

@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-public class BufferCell extends AbstractCell
-{
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCell(CellNames.simpleDense(ByteBuffer.allocate(1))));
-
-    protected final CellName name;
-    protected final ByteBuffer value;
-    protected final long timestamp;
-
-    BufferCell(CellName name)
-    {
-        this(name, ByteBufferUtil.EMPTY_BYTE_BUFFER);
-    }
-
-    public BufferCell(CellName name, ByteBuffer value)
-    {
-        this(name, value, 0);
-    }
-
-    public BufferCell(CellName name, ByteBuffer value, long timestamp)
-    {
-        assert name != null;
-        assert value != null;
-
-        this.name = name;
-        this.value = value;
-        this.timestamp = timestamp;
-    }
-
-    @Override
-    public Cell withUpdatedName(CellName newName)
-    {
-        return new BufferCell(newName, value, timestamp);
-    }
-
-    @Override
-    public Cell withUpdatedTimestamp(long newTimestamp)
-    {
-        return new BufferCell(name, value, newTimestamp);
-    }
-
-    @Override
-    public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public CellName name() {
-        return name;
-    }
-
-    @Override
-    public ByteBuffer value() {
-        return value;
-    }
-
-    @Override
-    public long timestamp() {
-        return timestamp;
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return EMPTY_SIZE + name.unsharedHeapSizeExcludingData() + ObjectSizes.sizeOnHeapExcludingData(value);
-    }
-
-    @Override
-    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferCell(name.copy(metadata, allocator), allocator.clone(value), timestamp);
-    }
-
-    @Override
-    public Cell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/BufferCounterCell.java b/src/java/org/apache/cassandra/db/BufferCounterCell.java
deleted file mode 100644
index 827182a..0000000
--- a/src/java/org/apache/cassandra/db/BufferCounterCell.java
+++ /dev/null

@@ -1,176 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-public class BufferCounterCell extends BufferCell implements CounterCell
-{
-    private final long timestampOfLastDelete;
-
-    public BufferCounterCell(CellName name, ByteBuffer value, long timestamp)
-    {
-        this(name, value, timestamp, Long.MIN_VALUE);
-    }
-
-    public BufferCounterCell(CellName name, ByteBuffer value, long timestamp, long timestampOfLastDelete)
-    {
-        super(name, value, timestamp);
-        this.timestampOfLastDelete = timestampOfLastDelete;
-    }
-
-    public static CounterCell create(CellName name, ByteBuffer value, long timestamp, long timestampOfLastDelete, ColumnSerializer.Flag flag)
-    {
-        if (flag == ColumnSerializer.Flag.FROM_REMOTE || (flag == ColumnSerializer.Flag.LOCAL && contextManager.shouldClearLocal(value)))
-            value = contextManager.clearAllLocal(value);
-        return new BufferCounterCell(name, value, timestamp, timestampOfLastDelete);
-    }
-
-    // For use by tests of compatibility with pre-2.1 counter only.
-    public static CounterCell createLocal(CellName name, long value, long timestamp, long timestampOfLastDelete)
-    {
-        return new BufferCounterCell(name, contextManager.createLocal(value), timestamp, timestampOfLastDelete);
-    }
-
-    @Override
-    public Cell withUpdatedName(CellName newName)
-    {
-        return new BufferCounterCell(newName, value, timestamp, timestampOfLastDelete);
-    }
-
-    @Override
-    public long timestampOfLastDelete()
-    {
-        return timestampOfLastDelete;
-    }
-
-    @Override
-    public long total()
-    {
-        return contextManager.total(value);
-    }
-
-    @Override
-    public int cellDataSize()
-    {
-        // A counter column adds 8 bytes for timestampOfLastDelete to Cell.
-        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(timestampOfLastDelete);
-    }
-
-    @Override
-    public int serializedSize(CellNameType type, TypeSizes typeSizes)
-    {
-        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(timestampOfLastDelete);
-    }
-
-    @Override
-    public Cell diff(Cell cell)
-    {
-        return diffCounter(cell);
-    }
-
-    /*
-     * We have to special case digest creation for counter column because
-     * we don't want to include the information about which shard of the
-     * context is a delta or not, since this information differs from node to
-     * node.
-     */
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name().toByteBuffer().duplicate());
-        // We don't take the deltas into account in a digest
-        contextManager.updateDigest(digest, value());
-
-        FBUtilities.updateWithLong(digest, timestamp);
-        FBUtilities.updateWithByte(digest, serializationFlags());
-        FBUtilities.updateWithLong(digest, timestampOfLastDelete);
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        return reconcileCounter(cell);
-    }
-
-    @Override
-    public boolean hasLegacyShards()
-    {
-        return contextManager.hasLegacyShards(value);
-    }
-
-    @Override
-    public CounterCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferCounterCell(name.copy(metadata, allocator), allocator.clone(value), timestamp, timestampOfLastDelete);
-    }
-
-    @Override
-    public CounterCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public String getString(CellNameType comparator)
-    {
-        return String.format("%s:false:%s@%d!%d",
-                             comparator.getString(name()),
-                             contextManager.toString(value()),
-                             timestamp(),
-                             timestampOfLastDelete);
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.COUNTER_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-        // We cannot use the value validator as for other columns as the CounterColumnType validate a long,
-        // which is not the internal representation of counters
-        contextManager.validateContext(value());
-    }
-
-    @Override
-    public Cell markLocalToBeCleared()
-    {
-        ByteBuffer marked = contextManager.markLocalToBeCleared(value());
-        return marked == value() ? this : new BufferCounterCell(name(), marked, timestamp(), timestampOfLastDelete);
-    }
-
-    @Override
-    public boolean equals(Cell cell)
-    {
-        return super.equals(cell) && timestampOfLastDelete == ((CounterCell) cell).timestampOfLastDelete();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/BufferCounterUpdateCell.java b/src/java/org/apache/cassandra/db/BufferCounterUpdateCell.java
deleted file mode 100644
index f7df3ea..0000000
--- a/src/java/org/apache/cassandra/db/BufferCounterUpdateCell.java
+++ /dev/null

@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-public class BufferCounterUpdateCell extends BufferCell implements CounterUpdateCell
-{
-    public BufferCounterUpdateCell(CellName name, long value, long timestamp)
-    {
-        this(name, ByteBufferUtil.bytes(value), timestamp);
-    }
-
-    public BufferCounterUpdateCell(CellName name, ByteBuffer value, long timestamp)
-    {
-        super(name, value, timestamp);
-    }
-
-    @Override
-    public Cell withUpdatedName(CellName newName)
-    {
-        return new BufferCounterUpdateCell(newName, value, timestamp);
-    }
-
-    public long delta()
-    {
-        return value().getLong(value.position());
-    }
-
-    @Override
-    public Cell diff(Cell cell)
-    {
-        // Diff is used during reads, but we should never read those columns
-        throw new UnsupportedOperationException("This operation is unsupported on CounterUpdateCell.");
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
-        if (cell instanceof DeletedCell)
-            return cell;
-
-        assert cell instanceof CounterUpdateCell : "Wrong class type.";
-
-        // The only time this could happen is if a batch ships two increments for the same cell. Hence we simply sum the deltas.
-        return new BufferCounterUpdateCell(name, delta() + ((CounterUpdateCell) cell).delta(), Math.max(timestamp, cell.timestamp()));
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.COUNTER_UPDATE_MASK;
-    }
-
-    @Override
-    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public Cell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public String getString(CellNameType comparator)
-    {
-        return String.format("%s:%s@%d", comparator.getString(name()), ByteBufferUtil.toLong(value), timestamp());
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/BufferDeletedCell.java b/src/java/org/apache/cassandra/db/BufferDeletedCell.java
deleted file mode 100644
index 3762e1f..0000000
--- a/src/java/org/apache/cassandra/db/BufferDeletedCell.java
+++ /dev/null

@@ -1,124 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-public class BufferDeletedCell extends BufferCell implements DeletedCell
-{
-    public BufferDeletedCell(CellName name, int localDeletionTime, long timestamp)
-    {
-        this(name, ByteBufferUtil.bytes(localDeletionTime), timestamp);
-    }
-
-    public BufferDeletedCell(CellName name, ByteBuffer value, long timestamp)
-    {
-        super(name, value, timestamp);
-    }
-
-    @Override
-    public Cell withUpdatedName(CellName newName)
-    {
-        return new BufferDeletedCell(newName, value, timestamp);
-    }
-
-    @Override
-    public Cell withUpdatedTimestamp(long newTimestamp)
-    {
-        return new BufferDeletedCell(name, value, newTimestamp);
-    }
-
-    @Override
-    public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public boolean isLive()
-    {
-        return false;
-    }
-
-    @Override
-    public boolean isLive(long now)
-    {
-        return false;
-    }
-
-    @Override
-    public int getLocalDeletionTime()
-    {
-       return value().getInt(value.position());
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        if (cell instanceof DeletedCell)
-            return super.reconcile(cell);
-        return cell.reconcile(this);
-    }
-
-    @Override
-    public DeletedCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferDeletedCell(name.copy(metadata, allocator), allocator.clone(value), timestamp);
-    }
-
-    @Override
-    public DeletedCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.DELETION_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-        if (value().remaining() != 4)
-            throw new MarshalException("A tombstone value should be 4 bytes long");
-        if (getLocalDeletionTime() < 0)
-            throw new MarshalException("The local deletion time should not be negative");
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(name().toByteBuffer().duplicate());
-
-        FBUtilities.updateWithLong(digest, timestamp());
-        FBUtilities.updateWithByte(digest, serializationFlags());
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/BufferExpiringCell.java b/src/java/org/apache/cassandra/db/BufferExpiringCell.java
deleted file mode 100644
index c71cd88..0000000
--- a/src/java/org/apache/cassandra/db/BufferExpiringCell.java
+++ /dev/null

@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-public class BufferExpiringCell extends BufferCell implements ExpiringCell
-{
-    public static final int MAX_DELETION_TIME = Integer.MAX_VALUE - 1;
-
-    private final int localExpirationTime;
-    private final int timeToLive;
-
-    public BufferExpiringCell(CellName name, ByteBuffer value, long timestamp, int timeToLive)
-    {
-        super(name, value, timestamp);
-        assert timeToLive > 0 : timeToLive;
-        this.timeToLive = timeToLive;
-        this.localExpirationTime = computeLocalExpirationTime(timeToLive);
-    }
-
-    public BufferExpiringCell(CellName name, ByteBuffer value, long timestamp, int timeToLive, int localExpirationTime)
-    {
-        super(name, value, timestamp);
-        assert timeToLive > 0 : timeToLive;
-        this.timeToLive = timeToLive;
-        this.localExpirationTime = localExpirationTime;
-    }
-
-    public int getTimeToLive()
-    {
-        return timeToLive;
-    }
-
-    @Override
-    public Cell withUpdatedName(CellName newName)
-    {
-        return new BufferExpiringCell(newName, value(), timestamp(), timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public Cell withUpdatedTimestamp(long newTimestamp)
-    {
-        return new BufferExpiringCell(name(), value(), newTimestamp, timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime)
-    {
-        return new BufferExpiringCell(name(), value(), newTimestamp, timeToLive, newLocalDeletionTime);
-    }
-
-    @Override
-    public int cellDataSize()
-    {
-        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(localExpirationTime) + TypeSizes.NATIVE.sizeof(timeToLive);
-    }
-
-    @Override
-    public int serializedSize(CellNameType type, TypeSizes typeSizes)
-    {
-        /*
-         * An expired column adds to a Cell :
-         *    4 bytes for the localExpirationTime
-         *  + 4 bytes for the timeToLive
-        */
-        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(localExpirationTime) + typeSizes.sizeof(timeToLive);
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        super.updateDigest(digest);
-        FBUtilities.updateWithInt(digest, timeToLive);
-    }
-
-    @Override
-    public int getLocalDeletionTime()
-    {
-        return localExpirationTime;
-    }
-
-    @Override
-    public ExpiringCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferExpiringCell(name.copy(metadata, allocator), allocator.clone(value), timestamp, timeToLive, localExpirationTime);
-    }
-
-    @Override
-    public ExpiringCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public String getString(CellNameType comparator)
-    {
-        return String.format("%s!%d", super.getString(comparator), timeToLive);
-    }
-
-    @Override
-    public boolean isLive()
-    {
-        return isLive(System.currentTimeMillis());
-    }
-
-    @Override
-    public boolean isLive(long now)
-    {
-        return (int) (now / 1000) < getLocalDeletionTime();
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.EXPIRATION_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        super.validateFields(metadata);
-
-        if (timeToLive <= 0)
-            throw new MarshalException("A column TTL should be > 0, but was " + timeToLive);
-        if (localExpirationTime < 0)
-            throw new MarshalException("The local expiration time should not be negative but was " + localExpirationTime);
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        long ts1 = timestamp(), ts2 = cell.timestamp();
-        if (ts1 != ts2)
-            return ts1 < ts2 ? cell : this;
-        // we should prefer tombstones
-        if (cell instanceof DeletedCell)
-            return cell;
-        int c = value().compareTo(cell.value());
-        if (c != 0)
-            return c < 0 ? cell : this;
-        // If we have same timestamp and value, prefer the longest ttl
-        if (cell instanceof ExpiringCell)
-        {
-            int let1 = localExpirationTime, let2 = cell.getLocalDeletionTime();
-            if (let1 < let2)
-                return cell;
-        }
-        return this;
-    }
-
-    @Override
-    public boolean equals(Cell cell)
-    {
-        if (!super.equals(cell))
-            return false;
-        ExpiringCell that = (ExpiringCell) cell;
-        return getLocalDeletionTime() == that.getLocalDeletionTime() && getTimeToLive() == that.getTimeToLive();
-    }
-
-    /** @return Either a DeletedCell, or an ExpiringCell. */
-    public static Cell create(CellName name, ByteBuffer value, long timestamp, int timeToLive, int localExpirationTime, int expireBefore, ColumnSerializer.Flag flag)
-    {
-        // CASSANDRA-14092 may have written rows with negative localExpirationTime, so we don't turn them into tombstones yet
-        // to be able to recover them with scrub.
-        if (localExpirationTime < 0 || localExpirationTime >= expireBefore || flag == ColumnSerializer.Flag.PRESERVE_SIZE)
-            return new BufferExpiringCell(name, value, timestamp, timeToLive, localExpirationTime);
-        // The column is now expired, we can safely return a simple tombstone. Note that
-        // as long as the expiring column and the tombstone put together live longer than GC grace seconds,
-        // we'll fulfil our responsibility to repair.  See discussion at
-        // http://cassandra-user-incubator-apache-org.3065146.n2.nabble.com/repair-compaction-and-tombstone-rows-td7583481.html
-        return new BufferDeletedCell(name, localExpirationTime - timeToLive, timestamp);
-    }
-
-    /**
-     * This method computes the {@link #localExpirationTime}, maybe capping to the maximum representable value
-     * which is {@link #MAX_DELETION_TIME}.
-     *
-     * Please note that the {@link org.apache.cassandra.cql3.Attributes.ExpirationDateOverflowPolicy} is applied
-     * during {@link org.apache.cassandra.cql3.Attributes#maybeApplyExpirationDateOverflowPolicy(CFMetaData, int, boolean)},
-     * so if the request was not denied it means it's expiration date should be capped.
-     *
-     * See CASSANDRA-14092
-     */
-    private int computeLocalExpirationTime(int timeToLive)
-    {
-        int localExpirationTime =  (int) (System.currentTimeMillis() / 1000) + timeToLive;
-        return localExpirationTime >= 0? localExpirationTime : MAX_DELETION_TIME;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/CBuilder.java b/src/java/org/apache/cassandra/db/CBuilder.java
new file mode 100644
index 0000000..94feb93
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/CBuilder.java

@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+
+/**
+ * Allows to build ClusteringPrefixes, either Clustering or Slice.Bound.
+ */
+public abstract class CBuilder
+{
+    public static CBuilder STATIC_BUILDER = new CBuilder()
+    {
+        public int count()
+        {
+            return 0;
+        }
+
+        public int remainingCount()
+        {
+            return 0;
+        }
+
+        public ClusteringComparator comparator()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public CBuilder add(ByteBuffer value)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public CBuilder add(Object value)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Clustering build()
+        {
+            return Clustering.STATIC_CLUSTERING;
+        }
+
+        public Slice.Bound buildBound(boolean isStart, boolean isInclusive)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Slice buildSlice()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Clustering buildWith(ByteBuffer value)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Clustering buildWith(List<ByteBuffer> newValues)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Slice.Bound buildBoundWith(ByteBuffer value, boolean isStart, boolean isInclusive)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public Slice.Bound buildBoundWith(List<ByteBuffer> newValues, boolean isStart, boolean isInclusive)
+        {
+            throw new UnsupportedOperationException();
+        }
+    };
+
+    public static CBuilder create(ClusteringComparator comparator)
+    {
+        return new ArrayBackedBuilder(comparator);
+    }
+
+    public abstract int count();
+    public abstract int remainingCount();
+    public abstract ClusteringComparator comparator();
+    public abstract CBuilder add(ByteBuffer value);
+    public abstract CBuilder add(Object value);
+    public abstract Clustering build();
+    public abstract Slice.Bound buildBound(boolean isStart, boolean isInclusive);
+    public abstract Slice buildSlice();
+    public abstract Clustering buildWith(ByteBuffer value);
+    public abstract Clustering buildWith(List<ByteBuffer> newValues);
+    public abstract Slice.Bound buildBoundWith(ByteBuffer value, boolean isStart, boolean isInclusive);
+    public abstract Slice.Bound buildBoundWith(List<ByteBuffer> newValues, boolean isStart, boolean isInclusive);
+
+    private static class ArrayBackedBuilder extends CBuilder
+    {
+        private final ClusteringComparator type;
+        private final ByteBuffer[] values;
+        private int size;
+        private boolean built;
+
+        public ArrayBackedBuilder(ClusteringComparator type)
+        {
+            this.type = type;
+            this.values = new ByteBuffer[type.size()];
+        }
+
+        public int count()
+        {
+            return size;
+        }
+
+        public int remainingCount()
+        {
+            return values.length - size;
+        }
+
+        public ClusteringComparator comparator()
+        {
+            return type;
+        }
+
+        public CBuilder add(ByteBuffer value)
+        {
+            if (isDone())
+                throw new IllegalStateException();
+            values[size++] = value;
+            return this;
+        }
+
+        public CBuilder add(Object value)
+        {
+            return add(((AbstractType)type.subtype(size)).decompose(value));
+        }
+
+        private boolean isDone()
+        {
+            return remainingCount() == 0 || built;
+        }
+
+        public Clustering build()
+        {
+            // We don't allow to add more element to a builder that has been built so
+            // that we don't have to copy values.
+            built = true;
+
+            // Currently, only dense table can leave some clustering column out (see #7990)
+            return size == 0 ? Clustering.EMPTY : new Clustering(values);
+        }
+
+        public Slice.Bound buildBound(boolean isStart, boolean isInclusive)
+        {
+            // We don't allow to add more element to a builder that has been built so
+            // that we don't have to copy values (even though we have to do it in most cases).
+            built = true;
+
+            if (size == 0)
+                return isStart ? Slice.Bound.BOTTOM : Slice.Bound.TOP;
+
+            return Slice.Bound.create(Slice.Bound.boundKind(isStart, isInclusive),
+                                      size == values.length ? values : Arrays.copyOfRange(values, 0, size));
+        }
+
+        public Slice buildSlice()
+        {
+            // We don't allow to add more element to a builder that has been built so
+            // that we don't have to copy values.
+            built = true;
+
+            if (size == 0)
+                return Slice.ALL;
+
+            return Slice.make(buildBound(true, true), buildBound(false, true));
+        }
+
+        public Clustering buildWith(ByteBuffer value)
+        {
+            assert size+1 <= type.size();
+
+            ByteBuffer[] newValues = Arrays.copyOf(values, type.size());
+            newValues[size] = value;
+            return new Clustering(newValues);
+        }
+
+        public Clustering buildWith(List<ByteBuffer> newValues)
+        {
+            assert size + newValues.size() <= type.size();
+            ByteBuffer[] buffers = Arrays.copyOf(values, type.size());
+            int newSize = size;
+            for (ByteBuffer value : newValues)
+                buffers[newSize++] = value;
+
+            return new Clustering(buffers);
+        }
+
+        public Slice.Bound buildBoundWith(ByteBuffer value, boolean isStart, boolean isInclusive)
+        {
+            ByteBuffer[] newValues = Arrays.copyOf(values, size+1);
+            newValues[size] = value;
+            return Slice.Bound.create(Slice.Bound.boundKind(isStart, isInclusive), newValues);
+        }
+
+        public Slice.Bound buildBoundWith(List<ByteBuffer> newValues, boolean isStart, boolean isInclusive)
+        {
+            ByteBuffer[] buffers = Arrays.copyOf(values, size + newValues.size());
+            int newSize = size;
+            for (ByteBuffer value : newValues)
+                buffers[newSize++] = value;
+
+            return Slice.Bound.create(Slice.Bound.boundKind(isStart, isInclusive), buffers);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/CFRowAdder.java b/src/java/org/apache/cassandra/db/CFRowAdder.java
deleted file mode 100644
index 6fab8d5..0000000
--- a/src/java/org/apache/cassandra/db/CFRowAdder.java
+++ /dev/null

@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.db.marshal.ListType;
-import org.apache.cassandra.db.marshal.MapType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.UUIDGen;
-
-/**
- * Convenience object to populate a given CQL3 row in a ColumnFamily object.
- *
- * This is meant for when performance is not of the utmost importance. When
- * performance matters, it might be worth allocating such builder.
- */
-public class CFRowAdder
-{
-    public final ColumnFamily cf;
-    public final Composite prefix;
-    public final long timestamp;
-    public final int ttl;
-    private final int ldt;
-
-    public CFRowAdder(ColumnFamily cf, Composite prefix, long timestamp)
-    {
-        this(cf, prefix, timestamp, 0);
-    }
-
-    public CFRowAdder(ColumnFamily cf, Composite prefix, long timestamp, int ttl)
-    {
-        this.cf = cf;
-        this.prefix = prefix;
-        this.timestamp = timestamp;
-        this.ttl = ttl;
-        this.ldt = (int) (System.currentTimeMillis() / 1000);
-
-        // If a CQL3 table, add the row marker
-        if (cf.metadata().isCQL3Table() && !prefix.isStatic())
-            cf.addColumn(new BufferCell(cf.getComparator().rowMarker(prefix), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp));
-    }
-
-    public CFRowAdder add(String cql3ColumnName, Object value)
-    {
-        ColumnDefinition def = getDefinition(cql3ColumnName);
-        return add(cf.getComparator().create(prefix, def), def, value);
-    }
-
-    public CFRowAdder resetCollection(String cql3ColumnName)
-    {
-        ColumnDefinition def = getDefinition(cql3ColumnName);
-        assert def.type.isCollection() && def.type.isMultiCell();
-        Composite name = cf.getComparator().create(prefix, def);
-        cf.addAtom(new RangeTombstone(name.start(), name.end(), timestamp - 1, ldt));
-        return this;
-    }
-
-    public CFRowAdder addMapEntry(String cql3ColumnName, Object key, Object value)
-    {
-        ColumnDefinition def = getDefinition(cql3ColumnName);
-        assert def.type instanceof MapType;
-        MapType mt = (MapType)def.type;
-        CellName name = cf.getComparator().create(prefix, def, mt.getKeysType().decompose(key));
-        return add(name, def, value);
-    }
-
-    public CFRowAdder addListEntry(String cql3ColumnName, Object value)
-    {
-        ColumnDefinition def = getDefinition(cql3ColumnName);
-        assert def.type instanceof ListType;
-        CellName name = cf.getComparator().create(prefix, def, ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes()));
-        return add(name, def, value);
-    }
-
-    private ColumnDefinition getDefinition(String name)
-    {
-        return cf.metadata().getColumnDefinition(new ColumnIdentifier(name, false));
-    }
-
-    private CFRowAdder add(CellName name, ColumnDefinition def, Object value)
-    {
-        if (value == null)
-        {
-            cf.addColumn(new BufferDeletedCell(name, ldt, timestamp));
-        }
-        else
-        {
-            AbstractType valueType = def.type.isCollection()
-                                   ? ((CollectionType) def.type).valueComparator()
-                                   : def.type;
-            ByteBuffer valueBytes = value instanceof ByteBuffer ? (ByteBuffer)value : valueType.decompose(value);
-            if (ttl == 0)
-                cf.addColumn(new BufferCell(name, valueBytes, timestamp));
-            else
-                cf.addColumn(new BufferExpiringCell(name, valueBytes, timestamp, ttl));
-        }
-        return this;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/Cell.java b/src/java/org/apache/cassandra/db/Cell.java
deleted file mode 100644
index 274f369..0000000
--- a/src/java/org/apache/cassandra/db/Cell.java
+++ /dev/null

@@ -1,71 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-/**
- * Cell is immutable, which prevents all kinds of confusion in a multithreaded environment.
- */
-public interface Cell extends OnDiskAtom
-{
-    public static final int MAX_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT;
-
-    public Cell withUpdatedName(CellName newName);
-
-    public Cell withUpdatedTimestamp(long newTimestamp);
-
-    public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime);
-
-    @Override
-    public CellName name();
-
-    public ByteBuffer value();
-
-    public boolean isLive();
-
-    public boolean isLive(long now);
-
-    public int cellDataSize();
-
-    // returns the size of the Cell and all references on the heap, excluding any costs associated with byte arrays
-    // that would be allocated by a localCopy, as these will be accounted for by the allocator
-    public long unsharedHeapSizeExcludingData();
-
-    public int serializedSize(CellNameType type, TypeSizes typeSizes);
-
-    public int serializationFlags();
-
-    public Cell diff(Cell cell);
-
-    public Cell reconcile(Cell cell);
-
-    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator);
-
-    public Cell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
-
-    public String getString(CellNameType comparator);
-}

diff --git a/src/java/org/apache/cassandra/db/Clusterable.java b/src/java/org/apache/cassandra/db/Clusterable.java
new file mode 100644
index 0000000..62ab9dc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Clusterable.java

@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+/**
+ * Common class for objects that are identified by a clustering prefix, and can be thus sorted by a
+ * {@link ClusteringComparator}.
+ */
+public interface Clusterable
+{
+    public ClusteringPrefix clustering();
+}

diff --git a/src/java/org/apache/cassandra/db/Clustering.java b/src/java/org/apache/cassandra/db/Clustering.java
new file mode 100644
index 0000000..62af0f1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Clustering.java

@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * The clustering column values for a row.
+ * <p>
+ * A {@code Clustering} is a {@code ClusteringPrefix} that must always be "complete", i.e. have
+ * as many values as there is clustering columns in the table it is part of. It is the clustering
+ * prefix used by rows.
+ * <p>
+ * Note however that while it's size must be equal to the table clustering size, a clustering can have
+ * {@code null} values, and this mostly for thrift backward compatibility (in practice, if a value is null,
+ * all of the following ones will be too because that's what thrift allows, but it's never assumed by the
+ * code so we could start generally allowing nulls for clustering columns if we wanted to).
+ */
+public class Clustering extends AbstractClusteringPrefix
+{
+    public static final Serializer serializer = new Serializer();
+
+    /**
+     * The special cased clustering used by all static rows. It is a special case in the
+     * sense that it's always empty, no matter how many clustering columns the table has.
+     */
+    public static final Clustering STATIC_CLUSTERING = new Clustering(EMPTY_VALUES_ARRAY)
+    {
+        @Override
+        public Kind kind()
+        {
+            return Kind.STATIC_CLUSTERING;
+        }
+
+        @Override
+        public String toString()
+        {
+            return "STATIC";
+        }
+
+        @Override
+        public String toString(CFMetaData metadata)
+        {
+            return toString();
+        }
+    };
+
+    /** Empty clustering for tables having no clustering columns. */
+    public static final Clustering EMPTY = new Clustering(EMPTY_VALUES_ARRAY)
+    {
+        @Override
+        public String toString(CFMetaData metadata)
+        {
+            return "EMPTY";
+        }
+    };
+
+    public Clustering(ByteBuffer... values)
+    {
+        super(Kind.CLUSTERING, values);
+    }
+
+    public Kind kind()
+    {
+        return Kind.CLUSTERING;
+    }
+
+    public ClusteringPrefix minimize()
+    {
+        if (!ByteBufferUtil.canMinimize(values))
+            return this;
+        return new Clustering(ByteBufferUtil.minimizeBuffers(values));
+    }
+
+    public Clustering copy(AbstractAllocator allocator)
+    {
+        // Important for STATIC_CLUSTERING (but no point in being wasteful in general).
+        if (size() == 0)
+            return this;
+
+        ByteBuffer[] newValues = new ByteBuffer[size()];
+        for (int i = 0; i < size(); i++)
+            newValues[i] = values[i] == null ? null : allocator.clone(values[i]);
+        return new Clustering(newValues);
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < size(); i++)
+        {
+            ColumnDefinition c = metadata.clusteringColumns().get(i);
+            sb.append(i == 0 ? "" : ", ").append(c.name).append('=').append(get(i) == null ? "null" : c.type.getString(get(i)));
+        }
+        return sb.toString();
+    }
+
+    public String toCQLString(CFMetaData metadata)
+    {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < size(); i++)
+        {
+            ColumnDefinition c = metadata.clusteringColumns().get(i);
+            sb.append(i == 0 ? "" : ", ").append(c.type.getString(get(i)));
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Serializer for Clustering object.
+     * <p>
+     * Because every clustering in a given table must have the same size (ant that size cannot actually change once the table
+     * has been defined), we don't record that size.
+     */
+    public static class Serializer
+    {
+        public void serialize(Clustering clustering, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException
+        {
+            assert clustering != STATIC_CLUSTERING : "We should never serialize a static clustering";
+            assert clustering.size() == types.size() : "Invalid clustering for the table: " + clustering;
+            ClusteringPrefix.serializer.serializeValuesWithoutSize(clustering, out, version, types);
+        }
+
+        public ByteBuffer serialize(Clustering clustering, int version, List<AbstractType<?>> types)
+        {
+            try (DataOutputBuffer buffer = new DataOutputBuffer((int)serializedSize(clustering, version, types)))
+            {
+                serialize(clustering, buffer, version, types);
+                return buffer.buffer();
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException("Writting to an in-memory buffer shouldn't trigger an IOException", e);
+            }
+        }
+
+        public long serializedSize(Clustering clustering, int version, List<AbstractType<?>> types)
+        {
+            return ClusteringPrefix.serializer.valuesWithoutSizeSerializedSize(clustering, version, types);
+        }
+
+        public Clustering deserialize(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException
+        {
+            if (types.isEmpty())
+                return EMPTY;
+
+            ByteBuffer[] values = ClusteringPrefix.serializer.deserializeValuesWithoutSize(in, types.size(), version, types);
+            return new Clustering(values);
+        }
+
+        public Clustering deserialize(ByteBuffer in, int version, List<AbstractType<?>> types)
+        {
+            try (DataInputBuffer buffer = new DataInputBuffer(in, true))
+            {
+                return deserialize(buffer, version, types);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException("Reading from an in-memory buffer shouldn't trigger an IOException", e);
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java
new file mode 100644
index 0000000..f3411cf
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java

@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Objects;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FastByteOperations;
+
+import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
+
+/**
+ * A comparator of clustering prefixes (or more generally of {@link Clusterable}}.
+ * <p>
+ * This is essentially just a composite comparator that the clustering values of the provided
+ * clustering prefixes in lexicographical order, with each component being compared based on
+ * the type of the clustering column this is a value of.
+ */
+public class ClusteringComparator implements Comparator<Clusterable>
+{
+    private final List<AbstractType<?>> clusteringTypes;
+
+    private final Comparator<IndexInfo> indexComparator;
+    private final Comparator<IndexInfo> indexReverseComparator;
+    private final Comparator<Clusterable> reverseComparator;
+
+    private final Comparator<Row> rowComparator = (r1, r2) -> compare(r1.clustering(), r2.clustering());
+
+    public ClusteringComparator(AbstractType<?>... clusteringTypes)
+    {
+        this(ImmutableList.copyOf(clusteringTypes));
+    }
+
+    public ClusteringComparator(List<AbstractType<?>> clusteringTypes)
+    {
+        // copy the list to ensure despatch is monomorphic
+        this.clusteringTypes = ImmutableList.copyOf(clusteringTypes);
+
+        this.indexComparator = (o1, o2) -> ClusteringComparator.this.compare(o1.lastName, o2.lastName);
+        this.indexReverseComparator = (o1, o2) -> ClusteringComparator.this.compare(o1.firstName, o2.firstName);
+        this.reverseComparator = (c1, c2) -> ClusteringComparator.this.compare(c2, c1);
+        for (AbstractType<?> type : clusteringTypes)
+            type.checkComparable(); // this should already be enforced by CFMetaData.rebuild, but we check again for other constructors
+    }
+
+    /**
+     * The number of clustering columns for the table this is the comparator of.
+     */
+    public int size()
+    {
+        return clusteringTypes.size();
+    }
+
+    /**
+     * The "subtypes" of this clustering comparator, that is the types of the clustering
+     * columns for the table this is a comparator of.
+     */
+    public List<AbstractType<?>> subtypes()
+    {
+        return clusteringTypes;
+    }
+
+    /**
+     * Returns the type of the ith clustering column of the table.
+     */
+    public AbstractType<?> subtype(int i)
+    {
+        return clusteringTypes.get(i);
+    }
+
+    /**
+     * Creates a row clustering based on the clustering values.
+     * <p>
+     * Every argument can either be a {@code ByteBuffer}, in which case it is used as-is, or a object
+     * corresponding to the type of the corresponding clustering column, in which case it will be
+     * converted to a byte buffer using the column type.
+     *
+     * @param values the values to use for the created clustering. There should be exactly {@code size()}
+     * values which must be either byte buffers or of the type the column expect.
+     *
+     * @return the newly created clustering.
+     */
+    public Clustering make(Object... values)
+    {
+        if (values.length != size())
+            throw new IllegalArgumentException(String.format("Invalid number of components, expecting %d but got %d", size(), values.length));
+
+        CBuilder builder = CBuilder.create(this);
+        for (Object val : values)
+        {
+            if (val instanceof ByteBuffer)
+                builder.add((ByteBuffer) val);
+            else
+                builder.add(val);
+        }
+        return builder.build();
+    }
+
+    public int compare(Clusterable c1, Clusterable c2)
+    {
+        return compare(c1.clustering(), c2.clustering());
+    }
+
+    public int compare(ClusteringPrefix c1, ClusteringPrefix c2)
+    {
+        int s1 = c1.size();
+        int s2 = c2.size();
+        int minSize = Math.min(s1, s2);
+
+        for (int i = 0; i < minSize; i++)
+        {
+            int cmp = compareComponent(i, c1.get(i), c2.get(i));
+            if (cmp != 0)
+                return cmp;
+        }
+
+        if (s1 == s2)
+            return ClusteringPrefix.Kind.compare(c1.kind(), c2.kind());
+
+        return s1 < s2 ? c1.kind().comparedToClustering : -c2.kind().comparedToClustering;
+    }
+
+    public int compare(Clustering c1, Clustering c2)
+    {
+        for (int i = 0; i < size(); i++)
+        {
+            int cmp = compareComponent(i, c1.get(i), c2.get(i));
+            if (cmp != 0)
+                return cmp;
+        }
+        return 0;
+    }
+
+    public int compareComponent(int i, ByteBuffer v1, ByteBuffer v2)
+    {
+        if (v1 == null)
+            return v2 == null ? 0 : -1;
+        if (v2 == null)
+            return 1;
+
+        return clusteringTypes.get(i).compare(v1, v2);
+    }
+
+    /**
+     * Returns whether this clustering comparator is compatible with the provided one,
+     * that is if the provided one can be safely replaced by this new one.
+     *
+     * @param previous the previous comparator that we want to replace and test
+     * compatibility with.
+     *
+     * @return whether {@code previous} can be safely replaced by this comparator.
+     */
+    public boolean isCompatibleWith(ClusteringComparator previous)
+    {
+        if (this == previous)
+            return true;
+
+        // Extending with new components is fine, shrinking is not
+        if (size() < previous.size())
+            return false;
+
+        for (int i = 0; i < previous.size(); i++)
+        {
+            AbstractType<?> tprev = previous.subtype(i);
+            AbstractType<?> tnew = subtype(i);
+            if (!tnew.isCompatibleWith(tprev))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Validates the provided prefix for corrupted data.
+     *
+     * @param clustering the clustering prefix to validate.
+     *
+     * @throws MarshalException if {@code clustering} contains some invalid data.
+     */
+    public void validate(ClusteringPrefix clustering)
+    {
+        for (int i = 0; i < clustering.size(); i++)
+        {
+            ByteBuffer value = clustering.get(i);
+            if (value != null)
+                subtype(i).validate(value);
+        }
+    }
+
+    /**
+     * A comparator for rows.
+     *
+     * A {@code Row} is a {@code Clusterable} so {@code ClusteringComparator} can be used
+     * to compare rows directly, but when we know we deal with rows (and not {@code Clusterable} in
+     * general), this is a little faster because by knowing we compare {@code Clustering} objects,
+     * we know that 1) they all have the same size and 2) they all have the same kind.
+     */
+    public Comparator<Row> rowComparator()
+    {
+        return rowComparator;
+    }
+
+    public Comparator<IndexInfo> indexComparator(boolean reversed)
+    {
+        return reversed ? indexReverseComparator : indexComparator;
+    }
+
+    public Comparator<Clusterable> reversed()
+    {
+        return reverseComparator;
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("comparator(%s)", Joiner.on(", ").join(clusteringTypes));
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof ClusteringComparator))
+            return false;
+
+        ClusteringComparator that = (ClusteringComparator)o;
+        return this.clusteringTypes.equals(that.clusteringTypes);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(clusteringTypes);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
new file mode 100644
index 0000000..c1000e4
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java

@@ -0,0 +1,529 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.*;
+
+import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * A clustering prefix is the unit of what a {@link ClusteringComparator} can compare.
+ * <p>
+ * It holds values for the clustering columns of a table (potentially only a prefix of all of them) and has
+ * a "kind" that allows us to implement slices with inclusive and exclusive bounds.
+ * <p>
+ * In practice, {@code ClusteringPrefix} is just the common parts to its 3 main subtype: {@link Clustering} and
+ * {@link Slice.Bound}/{@link RangeTombstone.Bound}, where:
+ *   1) {@code Clustering} represents the clustering values for a row, i.e. the values for it's clustering columns.
+ *   2) {@code Slice.Bound} represents a bound (start or end) of a slice (of rows).
+ *   3) {@code RangeTombstoneBoundMarker.Bound} represents a range tombstone marker "bound".
+ * See those classes for more details.
+ */
+public interface ClusteringPrefix extends IMeasurableMemory, Clusterable
+{
+    public static final Serializer serializer = new Serializer();
+
+    /**
+     * The kind of clustering prefix this actually is.
+     *
+     * The kind {@code STATIC_CLUSTERING} is only implemented by {@link Clustering#STATIC_CLUSTERING} and {@code CLUSTERING} is
+     * implemented by the {@link Clustering} class. The rest is used by {@link Slice.Bound} and {@link RangeTombstone.Bound}.
+     */
+    public enum Kind
+    {
+        // WARNING: the ordering of that enum matters because we use ordinal() in the serialization
+
+        EXCL_END_BOUND              (0, -1),
+        INCL_START_BOUND            (0, -1),
+        EXCL_END_INCL_START_BOUNDARY(0, -1),
+        STATIC_CLUSTERING           (1, -1),
+        CLUSTERING                  (2,  0),
+        INCL_END_EXCL_START_BOUNDARY(3,  1),
+        INCL_END_BOUND              (3,  1),
+        EXCL_START_BOUND            (3,  1);
+
+        private final int comparison;
+
+        /**
+         * Return the comparison of this kind to CLUSTERING.
+         * For bounds/boundaries, this basically tells us if we sort before or after our clustering values.
+         */
+        public final int comparedToClustering;
+
+        private Kind(int comparison, int comparedToClustering)
+        {
+            this.comparison = comparison;
+            this.comparedToClustering = comparedToClustering;
+        }
+
+        /**
+         * Compares the 2 provided kind.
+         * <p>
+         * Note: this should be used instead of {@link #compareTo} when comparing clustering prefixes. We do
+         * not override that latter method because it is final for an enum.
+         */
+        public static int compare(Kind k1, Kind k2)
+        {
+            return Integer.compare(k1.comparison, k2.comparison);
+        }
+
+        /**
+         * Returns the inverse of the current kind.
+         * <p>
+         * This invert both start into end (and vice-versa) and inclusive into exclusive (and vice-versa).
+         *
+         * @return the invert of this kind. For instance, if this kind is an exlusive start, this return
+         * an inclusive end.
+         */
+        public Kind invert()
+        {
+            switch (this)
+            {
+                case EXCL_START_BOUND:              return INCL_END_BOUND;
+                case INCL_START_BOUND:              return EXCL_END_BOUND;
+                case EXCL_END_BOUND:                return INCL_START_BOUND;
+                case INCL_END_BOUND:                return EXCL_START_BOUND;
+                case EXCL_END_INCL_START_BOUNDARY:  return INCL_END_EXCL_START_BOUNDARY;
+                case INCL_END_EXCL_START_BOUNDARY:  return EXCL_END_INCL_START_BOUNDARY;
+                default:                            return this;
+            }
+        }
+
+        public boolean isBound()
+        {
+            switch (this)
+            {
+                case INCL_START_BOUND:
+                case INCL_END_BOUND:
+                case EXCL_START_BOUND:
+                case EXCL_END_BOUND:
+                    return true;
+            }
+            return false;
+        }
+
+        public boolean isBoundary()
+        {
+            switch (this)
+            {
+                case INCL_END_EXCL_START_BOUNDARY:
+                case EXCL_END_INCL_START_BOUNDARY:
+                    return true;
+            }
+            return false;
+        }
+
+        public boolean isStart()
+        {
+            switch (this)
+            {
+                case INCL_START_BOUND:
+                case EXCL_END_INCL_START_BOUNDARY:
+                case INCL_END_EXCL_START_BOUNDARY:
+                case EXCL_START_BOUND:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        public boolean isEnd()
+        {
+            switch (this)
+            {
+                case INCL_END_BOUND:
+                case EXCL_END_INCL_START_BOUNDARY:
+                case INCL_END_EXCL_START_BOUNDARY:
+                case EXCL_END_BOUND:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        public boolean isOpen(boolean reversed)
+        {
+            return isBoundary() || (reversed ? isEnd() : isStart());
+        }
+
+        public boolean isClose(boolean reversed)
+        {
+            return isBoundary() || (reversed ? isStart() : isEnd());
+        }
+
+        public Kind closeBoundOfBoundary(boolean reversed)
+        {
+            assert isBoundary();
+            return reversed
+                 ? (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND)
+                 : (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND);
+        }
+
+        public Kind openBoundOfBoundary(boolean reversed)
+        {
+            assert isBoundary();
+            return reversed
+                 ? (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND)
+                 : (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND);
+        }
+    }
+
+    public Kind kind();
+
+    /**
+     * The number of values in this prefix.
+     *
+     * There can't be more values that the this is a prefix of has of clustering columns.
+     *
+     * @return the number of values in this prefix.
+     */
+    public int size();
+
+    /**
+     * Retrieves the ith value of this prefix.
+     *
+     * @param i the index of the value to retrieve. Must be such that {@code 0 <= i < size()}.
+     *
+     * @return the ith value of this prefix. Note that a value can be {@code null}.
+     */
+    public ByteBuffer get(int i);
+
+    /**
+     * Adds the data of this clustering prefix to the provided digest.
+     *
+     * @param digest the digest to which to add this prefix.
+     */
+    public void digest(MessageDigest digest);
+
+    /**
+     * The size of the data hold by this prefix.
+     *
+     * @return the size of the data hold by this prefix (this is not the size of the object in memory, just
+     * the size of the data it stores).
+     */
+    public int dataSize();
+
+    /**
+     * Generates a proper string representation of the prefix.
+     *
+     * @param metadata the metadata for the table the clustering prefix is of.
+     * @return a human-readable string representation fo this prefix.
+     */
+    public String toString(CFMetaData metadata);
+
+    /**
+     * The values of this prefix as an array.
+     * <p>
+     * Please note that this may or may not require an array creation. So 1) you should *not*
+     * modify the returned array and 2) it's more efficient to use {@link #size()} and
+     * {@link #get} unless you actually need an array.
+     *
+     * @return the values for this prefix as an array.
+     */
+    public ByteBuffer[] getRawValues();
+
+    /**
+     * If the prefix contains byte buffers that can be minimized (see {@link ByteBufferUtil#minimalBufferFor(ByteBuffer)}),
+     * this will return a copy of the prefix with minimized values, otherwise it returns itself.
+     */
+    public ClusteringPrefix minimize();
+
+    public static class Serializer
+    {
+        public void serialize(ClusteringPrefix clustering, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException
+        {
+            // We shouldn't serialize static clusterings
+            assert clustering.kind() != Kind.STATIC_CLUSTERING;
+            if (clustering.kind() == Kind.CLUSTERING)
+            {
+                out.writeByte(clustering.kind().ordinal());
+                Clustering.serializer.serialize((Clustering)clustering, out, version, types);
+            }
+            else
+            {
+                RangeTombstone.Bound.serializer.serialize((RangeTombstone.Bound)clustering, out, version, types);
+            }
+        }
+
+        public ClusteringPrefix deserialize(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException
+        {
+            Kind kind = Kind.values()[in.readByte()];
+            // We shouldn't serialize static clusterings
+            assert kind != Kind.STATIC_CLUSTERING;
+            if (kind == Kind.CLUSTERING)
+                return Clustering.serializer.deserialize(in, version, types);
+            else
+                return RangeTombstone.Bound.serializer.deserializeValues(in, kind, version, types);
+        }
+
+        public long serializedSize(ClusteringPrefix clustering, int version, List<AbstractType<?>> types)
+        {
+            // We shouldn't serialize static clusterings
+            assert clustering.kind() != Kind.STATIC_CLUSTERING;
+            if (clustering.kind() == Kind.CLUSTERING)
+                return 1 + Clustering.serializer.serializedSize((Clustering)clustering, version, types);
+            else
+                return RangeTombstone.Bound.serializer.serializedSize((RangeTombstone.Bound)clustering, version, types);
+        }
+
+        void serializeValuesWithoutSize(ClusteringPrefix clustering, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException
+        {
+            int offset = 0;
+            int clusteringSize = clustering.size();
+            // serialize in batches of 32, to avoid garbage when deserializing headers
+            while (offset < clusteringSize)
+            {
+                // we micro-batch the headers, so that we can incur fewer method calls,
+                // and generate no garbage on deserialization;
+                // we piggyback on vint encoding so that, typically, only 1 byte is used per 32 clustering values,
+                // i.e. more than we ever expect to see
+                int limit = Math.min(clusteringSize, offset + 32);
+                out.writeUnsignedVInt(makeHeader(clustering, offset, limit));
+                while (offset < limit)
+                {
+                    ByteBuffer v = clustering.get(offset);
+                    if (v != null && v.hasRemaining())
+                        types.get(offset).writeValue(v, out);
+                    offset++;
+                }
+            }
+        }
+
+        long valuesWithoutSizeSerializedSize(ClusteringPrefix clustering, int version, List<AbstractType<?>> types)
+        {
+            long result = 0;
+            int offset = 0;
+            int clusteringSize = clustering.size();
+            while (offset < clusteringSize)
+            {
+                int limit = Math.min(clusteringSize, offset + 32);
+                result += TypeSizes.sizeofUnsignedVInt(makeHeader(clustering, offset, limit));
+                offset = limit;
+            }
+            for (int i = 0; i < clusteringSize; i++)
+            {
+                ByteBuffer v = clustering.get(i);
+                if (v == null || !v.hasRemaining())
+                    continue; // handled in the header
+
+                result += types.get(i).writtenLength(v);
+            }
+            return result;
+        }
+
+        ByteBuffer[] deserializeValuesWithoutSize(DataInputPlus in, int size, int version, List<AbstractType<?>> types) throws IOException
+        {
+            // Callers of this method should handle the case where size = 0 (in all case we want to return a special value anyway).
+            assert size > 0;
+            ByteBuffer[] values = new ByteBuffer[size];
+            int offset = 0;
+            while (offset < size)
+            {
+                long header = in.readUnsignedVInt();
+                int limit = Math.min(size, offset + 32);
+                while (offset < limit)
+                {
+                    values[offset] = isNull(header, offset)
+                                ? null
+                                : (isEmpty(header, offset) ? ByteBufferUtil.EMPTY_BYTE_BUFFER : types.get(offset).readValue(in, DatabaseDescriptor.getMaxValueSize()));
+                    offset++;
+                }
+            }
+            return values;
+        }
+
+        /**
+         * Whatever the type of a given clustering column is, its value can always be either empty or null. So we at least need to distinguish those
+         * 2 values, and because we want to be able to store fixed width values without appending their (fixed) size first, we need a way to encode
+         * empty values too. So for that, every clustering prefix includes a "header" that contains 2 bits per element in the prefix. For each element,
+         * those 2 bits encode whether the element is null, empty, or none of those.
+         */
+        private static long makeHeader(ClusteringPrefix clustering, int offset, int limit)
+        {
+            long header = 0;
+            for (int i = offset ; i < limit ; i++)
+            {
+                ByteBuffer v = clustering.get(i);
+                // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition
+                if (v == null)
+                    header |= (1L << (i * 2) + 1);
+                else if (!v.hasRemaining())
+                    header |= (1L << (i * 2));
+            }
+            return header;
+        }
+
+        // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition
+        private static boolean isNull(long header, int i)
+        {
+            long mask = 1L << (i * 2) + 1;
+            return (header & mask) != 0;
+        }
+
+        // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition
+        private static boolean isEmpty(long header, int i)
+        {
+            long mask = 1L << (i * 2);
+            return (header & mask) != 0;
+        }
+    }
+
+    /**
+     * Helper class that makes the deserialization of clustering prefixes faster.
+     * <p>
+     * The main reason for this is that when we deserialize rows from sstables, there is many cases where we have
+     * a bunch of rows to skip at the beginning of an index block because those rows are before the requested slice.
+     * This class make sure we can answer the question "is the next row on disk before the requested slice" with as
+     * little work as possible. It does that by providing a comparison method that deserialize only what is needed
+     * to decide of the comparison.
+     */
+    public static class Deserializer
+    {
+        private final ClusteringComparator comparator;
+        private final DataInputPlus in;
+        private final SerializationHeader serializationHeader;
+
+        private boolean nextIsRow;
+        private long nextHeader;
+
+        private int nextSize;
+        private ClusteringPrefix.Kind nextKind;
+        private int deserializedSize;
+        private ByteBuffer[] nextValues;
+
+        public Deserializer(ClusteringComparator comparator, DataInputPlus in, SerializationHeader header)
+        {
+            this.comparator = comparator;
+            this.in = in;
+            this.serializationHeader = header;
+        }
+
+        public void prepare(int flags, int extendedFlags) throws IOException
+        {
+            if (UnfilteredSerializer.isStatic(extendedFlags))
+                throw new IOException("Corrupt flags value for clustering prefix (isStatic flag set): " + flags);
+
+            this.nextIsRow = UnfilteredSerializer.kind(flags) == Unfiltered.Kind.ROW;
+            this.nextKind = nextIsRow ? Kind.CLUSTERING : ClusteringPrefix.Kind.values()[in.readByte()];
+            this.nextSize = nextIsRow ? comparator.size() : in.readUnsignedShort();
+            this.deserializedSize = 0;
+
+            // The point of the deserializer is that some of the clustering prefix won't actually be used (because they are not
+            // within the bounds of the query), and we want to reduce allocation for them. So we only reuse the values array
+            // between elements if 1) we haven't returned the previous element (if we have, nextValues will be null) and 2)
+            // nextValues is of the proper size. Note that the 2nd condition may not hold for range tombstone bounds, but all
+            // rows have a fixed size clustering, so we'll still save in the common case.
+            if (nextValues == null || nextValues.length != nextSize)
+                this.nextValues = new ByteBuffer[nextSize];
+        }
+
+        public int compareNextTo(Slice.Bound bound) throws IOException
+        {
+            if (bound == Slice.Bound.TOP)
+                return -1;
+
+            for (int i = 0; i < bound.size(); i++)
+            {
+                if (!hasComponent(i))
+                    return nextKind.comparedToClustering;
+
+                int cmp = comparator.compareComponent(i, nextValues[i], bound.get(i));
+                if (cmp != 0)
+                    return cmp;
+            }
+
+            if (bound.size() == nextSize)
+                return Kind.compare(nextKind, bound.kind());
+
+            // We know that we'll have exited already if nextSize < bound.size
+            return -bound.kind().comparedToClustering;
+        }
+
+        private boolean hasComponent(int i) throws IOException
+        {
+            if (i >= nextSize)
+                return false;
+
+            while (deserializedSize <= i)
+                deserializeOne();
+
+            return true;
+        }
+
+        private boolean deserializeOne() throws IOException
+        {
+            if (deserializedSize == nextSize)
+                return false;
+
+            if ((deserializedSize % 32) == 0)
+                nextHeader = in.readUnsignedVInt();
+
+            int i = deserializedSize++;
+            nextValues[i] = Serializer.isNull(nextHeader, i)
+                          ? null
+                          : (Serializer.isEmpty(nextHeader, i) ? ByteBufferUtil.EMPTY_BYTE_BUFFER : serializationHeader.clusteringTypes().get(i).readValue(in, DatabaseDescriptor.getMaxValueSize()));
+            return true;
+        }
+
+        private void deserializeAll() throws IOException
+        {
+            while (deserializeOne())
+                continue;
+        }
+
+        public RangeTombstone.Bound deserializeNextBound() throws IOException
+        {
+            assert !nextIsRow;
+            deserializeAll();
+            RangeTombstone.Bound bound = new RangeTombstone.Bound(nextKind, nextValues);
+            nextValues = null;
+            return bound;
+        }
+
+        public Clustering deserializeNextClustering() throws IOException
+        {
+            assert nextIsRow;
+            deserializeAll();
+            Clustering clustering = new Clustering(nextValues);
+            nextValues = null;
+            return clustering;
+        }
+
+        public ClusteringPrefix.Kind skipNext() throws IOException
+        {
+            for (int i = deserializedSize; i < nextSize; i++)
+            {
+                if ((i % 32) == 0)
+                    nextHeader = in.readUnsignedVInt();
+                if (!Serializer.isNull(nextHeader, i) && !Serializer.isEmpty(nextHeader, i))
+                    serializationHeader.clusteringTypes().get(i).skipValue(in);
+            }
+            deserializedSize = nextSize;
+            return nextKind;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/CollationController.java b/src/java/org/apache/cassandra/db/CollationController.java
deleted file mode 100644
index 5be3bd2..0000000
--- a/src/java/org/apache/cassandra/db/CollationController.java
+++ /dev/null

@@ -1,332 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.Closeable;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.TreeSet;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.concurrent.Stage;
-import org.apache.cassandra.concurrent.StageManager;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.SearchIterator;
-import org.apache.cassandra.utils.memory.HeapAllocator;
-
-public class CollationController
-{
-    private final ColumnFamilyStore cfs;
-    private final QueryFilter filter;
-    private final int gcBefore;
-
-    private int sstablesIterated = 0;
-
-    public CollationController(ColumnFamilyStore cfs, QueryFilter filter, int gcBefore)
-    {
-        this.cfs = cfs;
-        this.filter = filter;
-        this.gcBefore = gcBefore;
-    }
-
-    public ColumnFamily getTopLevelColumns(boolean copyOnHeap)
-    {
-        return filter.filter instanceof NamesQueryFilter
-               && cfs.metadata.getDefaultValidator() != CounterColumnType.instance
-               ? collectTimeOrderedData(copyOnHeap)
-               : collectAllData(copyOnHeap);
-    }
-
-    /**
-     * Collects data in order of recency, using the sstable maxtimestamp data.
-     * Once we have data for all requests columns that is newer than the newest remaining maxtimestamp,
-     * we stop.
-     */
-    private ColumnFamily collectTimeOrderedData(boolean copyOnHeap)
-    {
-        final ColumnFamily container = ArrayBackedSortedColumns.factory.create(cfs.metadata, filter.filter.isReversed());
-        List<OnDiskAtomIterator> iterators = new ArrayList<>();
-        boolean isEmpty = true;
-        Tracing.trace("Acquiring sstable references");
-        ColumnFamilyStore.ViewFragment view = cfs.select(cfs.viewFilter(filter.key));
-        DeletionInfo returnDeletionInfo = container.deletionInfo();
-
-        try
-        {
-            Tracing.trace("Merging memtable contents");
-            for (Memtable memtable : view.memtables)
-            {
-                ColumnFamily cf = memtable.getColumnFamily(filter.key);
-                if (cf != null)
-                {
-                    filter.delete(container.deletionInfo(), cf);
-                    isEmpty = false;
-                    Iterator<Cell> iter = filter.getIterator(cf);
-                    while (iter.hasNext())
-                    {
-                        Cell cell = iter.next();
-                        if (copyOnHeap)
-                            cell = cell.localCopy(cfs.metadata, HeapAllocator.instance);
-                        container.addColumn(cell);
-                    }
-                }
-            }
-
-            // avoid changing the filter columns of the original filter
-            // (reduceNameFilter removes columns that are known to be irrelevant)
-            NamesQueryFilter namesFilter = (NamesQueryFilter) filter.filter;
-            TreeSet<CellName> filterColumns = new TreeSet<>(namesFilter.columns);
-            QueryFilter reducedFilter = new QueryFilter(filter.key, filter.cfName, namesFilter.withUpdatedColumns(filterColumns), filter.timestamp);
-
-            /* add the SSTables on disk */
-            Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
-            boolean onlyUnrepaired = true;
-            // read sorted sstables
-            for (SSTableReader sstable : view.sstables)
-            {
-                // if we've already seen a row tombstone with a timestamp greater
-                // than the most recent update to this sstable, we're done, since the rest of the sstables
-                // will also be older
-                if (sstable.getMaxTimestamp() < returnDeletionInfo.getTopLevelDeletion().markedForDeleteAt)
-                    break;
-
-                long currentMaxTs = sstable.getMaxTimestamp();
-                reduceNameFilter(reducedFilter, container, currentMaxTs);
-                if (((NamesQueryFilter) reducedFilter.filter).columns.isEmpty())
-                    break;
-                if (sstable.isRepaired())
-                    onlyUnrepaired = false;
-                Tracing.trace("Merging data from sstable {}", sstable.descriptor.generation);
-                sstable.incrementReadCount();
-                OnDiskAtomIterator iter = reducedFilter.getSSTableColumnIterator(sstable);
-                iterators.add(iter);
-                isEmpty = false;
-                if (iter.getColumnFamily() != null)
-                {
-                    container.delete(iter.getColumnFamily());
-                    sstablesIterated++;
-                    while (iter.hasNext())
-                        container.addAtom(iter.next());
-                }
-            }
-
-            // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently)
-            // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower)
-            if (isEmpty)
-                return null;
-
-            // do a final collate.  toCollate is boilerplate required to provide a CloseableIterator
-            ColumnFamily returnCF = container.cloneMeShallow();
-            Tracing.trace("Collating all results");
-            filter.collateOnDiskAtom(returnCF, container.iterator(), gcBefore);
-
-            // "hoist up" the requested data into a more recent sstable
-            if (sstablesIterated > cfs.getMinimumCompactionThreshold()
-                && onlyUnrepaired
-                && !cfs.isAutoCompactionDisabled()
-                && cfs.getCompactionStrategy().shouldDefragment())
-            {
-                // !!WARNING!!   if we stop copying our data to a heap-managed object,
-                //               we will need to track the lifetime of this mutation as well
-                Tracing.trace("Defragmenting requested data");
-                final Mutation mutation = new Mutation(cfs.keyspace.getName(), filter.key.getKey(), returnCF.cloneMe());
-                StageManager.getStage(Stage.MUTATION).execute(new Runnable()
-                {
-                    public void run()
-                    {
-                        // skipping commitlog and index updates is fine since we're just de-fragmenting existing data
-                        Keyspace.open(mutation.getKeyspaceName()).apply(mutation, false, false);
-                    }
-                });
-            }
-
-            // Caller is responsible for final removeDeletedCF.  This is important for cacheRow to work correctly:
-            return returnCF;
-        }
-        finally
-        {
-            for (OnDiskAtomIterator iter : iterators)
-                FileUtils.closeQuietly(iter);
-        }
-    }
-
-    /**
-     * remove columns from @param filter where we already have data in @param container newer than @param sstableTimestamp
-     */
-    private void reduceNameFilter(QueryFilter filter, ColumnFamily container, long sstableTimestamp)
-    {
-        if (container == null)
-            return;
-
-        SearchIterator<CellName, Cell> searchIter = container.searchIterator();
-        for (Iterator<CellName> iterator = ((NamesQueryFilter) filter.filter).columns.iterator(); iterator.hasNext() && searchIter.hasNext(); )
-        {
-            CellName filterColumn = iterator.next();
-            Cell cell = searchIter.next(filterColumn);
-            if (cell != null && cell.timestamp() > sstableTimestamp)
-                iterator.remove();
-        }
-    }
-
-    /**
-     * Collects data the brute-force way: gets an iterator for the filter in question
-     * from every memtable and sstable, then merges them together.
-     */
-    private ColumnFamily collectAllData(boolean copyOnHeap)
-    {
-        Tracing.trace("Acquiring sstable references");
-        ColumnFamilyStore.ViewFragment view = cfs.select(cfs.viewFilter(filter.key));
-        List<Iterator<? extends OnDiskAtom>> iterators = new ArrayList<>(Iterables.size(view.memtables) + view.sstables.size());
-        ColumnFamily returnCF = ArrayBackedSortedColumns.factory.create(cfs.metadata, filter.filter.isReversed());
-        DeletionInfo returnDeletionInfo = returnCF.deletionInfo();
-        try
-        {
-            Tracing.trace("Merging memtable tombstones");
-            for (Memtable memtable : view.memtables)
-            {
-                final ColumnFamily cf = memtable.getColumnFamily(filter.key);
-                if (cf != null)
-                {
-                    filter.delete(returnDeletionInfo, cf);
-                    Iterator<Cell> iter = filter.getIterator(cf);
-                    if (copyOnHeap)
-                    {
-                        iter = Iterators.transform(iter, new Function<Cell, Cell>()
-                        {
-                            public Cell apply(Cell cell)
-                            {
-                                return cell.localCopy(cf.metadata, HeapAllocator.instance);
-                            }
-                        });
-                    }
-                    iterators.add(iter);
-                }
-            }
-
-            /*
-             * We can't eliminate full sstables based on the timestamp of what we've already read like
-             * in collectTimeOrderedData, but we still want to eliminate sstable whose maxTimestamp < mostRecentTombstone
-             * we've read. We still rely on the sstable ordering by maxTimestamp since if
-             *   maxTimestamp_s1 > maxTimestamp_s0,
-             * we're guaranteed that s1 cannot have a row tombstone such that
-             *   timestamp(tombstone) > maxTimestamp_s0
-             * since we necessarily have
-             *   timestamp(tombstone) <= maxTimestamp_s1
-             * In other words, iterating in maxTimestamp order allow to do our mostRecentTombstone elimination
-             * in one pass, and minimize the number of sstables for which we read a rowTombstone.
-             */
-            Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
-            List<SSTableReader> skippedSSTables = null;
-            long minTimestamp = Long.MAX_VALUE;
-            int nonIntersectingSSTables = 0;
-
-            for (SSTableReader sstable : view.sstables)
-            {
-                minTimestamp = Math.min(minTimestamp, sstable.getMinTimestamp());
-                // if we've already seen a row tombstone with a timestamp greater
-                // than the most recent update to this sstable, we can skip it
-                if (sstable.getMaxTimestamp() < returnDeletionInfo.getTopLevelDeletion().markedForDeleteAt)
-                    break;
-
-                if (!filter.shouldInclude(sstable))
-                {
-                    nonIntersectingSSTables++;
-                    if (skippedSSTables == null)
-                        skippedSSTables = new ArrayList<>();
-                    skippedSSTables.add(sstable);
-                    continue;
-                }
-
-                sstable.incrementReadCount();
-                OnDiskAtomIterator iter = filter.getSSTableColumnIterator(sstable);
-                iterators.add(iter);
-                if (iter.getColumnFamily() != null)
-                {
-                    ColumnFamily cf = iter.getColumnFamily();
-                    returnCF.delete(cf);
-                    sstablesIterated++;
-                }
-            }
-
-            int includedDueToTombstones = 0;
-            // Check for row tombstone in the skipped sstables
-            if (skippedSSTables != null)
-            {
-                for (SSTableReader sstable : skippedSSTables)
-                {
-                    if (sstable.getMaxTimestamp() <= minTimestamp)
-                        continue;
-
-                    sstable.incrementReadCount();
-                    OnDiskAtomIterator iter = filter.getSSTableColumnIterator(sstable);
-                    ColumnFamily cf = iter.getColumnFamily();
-                    // we are only interested in row-level tombstones here, and only if markedForDeleteAt is larger than minTimestamp
-                    if (cf != null && cf.deletionInfo().getTopLevelDeletion().markedForDeleteAt > minTimestamp)
-                    {
-                        includedDueToTombstones++;
-                        iterators.add(iter);
-                        returnCF.delete(cf.deletionInfo().getTopLevelDeletion());
-                        sstablesIterated++;
-                    }
-                    else
-                    {
-                        FileUtils.closeQuietly(iter);
-                    }
-                }
-            }
-
-            if (Tracing.isTracing())
-                Tracing.trace("Skipped {}/{} non-slice-intersecting sstables, included {} due to tombstones",
-                              nonIntersectingSSTables, view.sstables.size(), includedDueToTombstones);
-
-            // we need to distinguish between "there is no data at all for this row" (BF will let us rebuild that efficiently)
-            // and "there used to be data, but it's gone now" (we should cache the empty CF so we don't need to rebuild that slower)
-            if (iterators.isEmpty())
-                return null;
-
-            Tracing.trace("Merging data from memtables and {} sstables", sstablesIterated);
-            filter.collateOnDiskAtom(returnCF, iterators, gcBefore);
-
-            // Caller is responsible for final removeDeletedCF.  This is important for cacheRow to work correctly:
-            return returnCF;
-        }
-        finally
-        {
-            for (Object iter : iterators)
-                if (iter instanceof Closeable)
-                    FileUtils.closeQuietly((Closeable) iter);
-        }
-    }
-
-    public int getSstablesIterated()
-    {
-        return sstablesIterated;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/ColumnFamily.java b/src/java/org/apache/cassandra/db/ColumnFamily.java
deleted file mode 100644
index 1532439..0000000
--- a/src/java/org/apache/cassandra/db/ColumnFamily.java
+++ /dev/null

@@ -1,565 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.UUID;
-
-import com.google.common.collect.ImmutableMap;
-import org.apache.commons.lang3.builder.HashCodeBuilder;
-
-import org.apache.cassandra.cache.IRowCacheEntry;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.filter.ColumnCounter;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.io.sstable.ColumnNameHelper;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.*;
-
-/**
- * A sorted map of columns.
- * This represents the backing map of a colum family.
- *
- * Whether the implementation is thread safe or not is left to the
- * implementing classes.
- */
-public abstract class ColumnFamily implements Iterable<Cell>, IRowCacheEntry
-{
-    /* The column serializer for this Column Family. Create based on config. */
-    public static final ColumnFamilySerializer serializer = new ColumnFamilySerializer();
-
-    protected final CFMetaData metadata;
-
-    protected ColumnFamily(CFMetaData metadata)
-    {
-        assert metadata != null;
-        this.metadata = metadata;
-    }
-
-    public <T extends ColumnFamily> T cloneMeShallow(ColumnFamily.Factory<T> factory, boolean reversedInsertOrder)
-    {
-        T cf = factory.create(metadata, reversedInsertOrder);
-        cf.delete(this);
-        return cf;
-    }
-
-    public ColumnFamily cloneMeShallow()
-    {
-        return cloneMeShallow(false);
-    }
-
-    public ColumnFamily cloneMeShallow(boolean reversed)
-    {
-        return cloneMeShallow(getFactory(), reversed);
-    }
-
-    public ColumnFamilyType getType()
-    {
-        return metadata.cfType;
-    }
-
-    public int liveCQL3RowCount(long now)
-    {
-        ColumnCounter counter = getComparator().isDense()
-                              ? new ColumnCounter(now)
-                              : new ColumnCounter.GroupByPrefix(now, getComparator(), metadata.clusteringColumns().size(), true);
-        return counter.countAll(this).live();
-    }
-
-    /**
-     * Clones the column map.
-     */
-    public abstract ColumnFamily cloneMe();
-
-    public UUID id()
-    {
-        return metadata.cfId;
-    }
-
-    /**
-     * @return The CFMetaData for this row
-     */
-    public CFMetaData metadata()
-    {
-        return metadata;
-    }
-
-    public void addColumn(CellName name, ByteBuffer value, long timestamp)
-    {
-        addColumn(name, value, timestamp, 0);
-    }
-
-    public void addColumn(CellName name, ByteBuffer value, long timestamp, int timeToLive)
-    {
-        assert !metadata().isCounter();
-        Cell cell = AbstractCell.create(name, value, timestamp, timeToLive, metadata());
-        addColumn(cell);
-    }
-
-    public void addCounter(CellName name, long value)
-    {
-        addColumn(new BufferCounterUpdateCell(name, value, FBUtilities.timestampMicros()));
-    }
-
-    public void addTombstone(CellName name, ByteBuffer localDeletionTime, long timestamp)
-    {
-        addColumn(new BufferDeletedCell(name, localDeletionTime, timestamp));
-    }
-
-    public void addTombstone(CellName name, int localDeletionTime, long timestamp)
-    {
-        addColumn(new BufferDeletedCell(name, localDeletionTime, timestamp));
-    }
-
-    public void addAtom(OnDiskAtom atom)
-    {
-        if (atom instanceof Cell)
-        {
-            addColumn((Cell)atom);
-        }
-        else
-        {
-            assert atom instanceof RangeTombstone;
-            delete((RangeTombstone)atom);
-        }
-    }
-
-    /**
-     * Clear this column family, removing all columns and deletion info.
-     */
-    public abstract void clear();
-
-    /**
-     * Returns a {@link DeletionInfo.InOrderTester} for the deletionInfo() of
-     * this column family. Please note that for ThreadSafe implementation of ColumnFamily,
-     * this tester will remain valid even if new tombstones are added to this ColumnFamily
-     * *as long as said addition is done in comparator order*. For AtomicSortedColumns,
-     * the tester will correspond to the state of when this method is called.
-     */
-    public DeletionInfo.InOrderTester inOrderDeletionTester()
-    {
-        return deletionInfo().inOrderTester();
-    }
-
-    /**
-     * Returns the factory used for this ISortedColumns implementation.
-     */
-    public abstract Factory getFactory();
-
-    public abstract DeletionInfo deletionInfo();
-    public abstract void setDeletionInfo(DeletionInfo info);
-
-    public abstract void delete(DeletionInfo info);
-    public abstract void delete(DeletionTime deletionTime);
-    protected abstract void delete(RangeTombstone tombstone);
-
-    public abstract SearchIterator<CellName, Cell> searchIterator();
-
-    /**
-     * Purges top-level and range tombstones whose localDeletionTime is older than gcBefore.
-     * @param gcBefore a timestamp (in seconds) before which tombstones should be purged
-     */
-    public abstract void purgeTombstones(int gcBefore);
-
-    /**
-     * Adds a cell to this cell map.
-     * If a cell with the same name is already present in the map, it will
-     * be replaced by the newly added cell.
-     */
-    public abstract void addColumn(Cell cell);
-
-    /**
-     * Adds a cell if it's non-gc-able and isn't shadowed by a partition/range tombstone with a higher timestamp.
-     * Requires that the cell to add is sorted strictly after the last cell in the container.
-     */
-    public abstract void maybeAppendColumn(Cell cell, DeletionInfo.InOrderTester tester, int gcBefore);
-
-    /**
-     * Appends a cell. Requires that the cell to add is sorted strictly after the last cell in the container.
-     */
-    public abstract void appendColumn(Cell cell);
-
-    /**
-     * Adds all the columns of a given column map to this column map.
-     * This is equivalent to:
-     *   <code>
-     *   for (Cell c : cm)
-     *      addColumn(c, ...);
-     *   </code>
-     *  but is potentially faster.
-     */
-    public abstract void addAll(ColumnFamily cm);
-
-    /**
-     * Get a column given its name, returning null if the column is not
-     * present.
-     */
-    public abstract Cell getColumn(CellName name);
-
-    /**
-     * Returns an iterable with the names of columns in this column map in the same order
-     * as the underlying columns themselves.
-     */
-    public abstract Iterable<CellName> getColumnNames();
-
-    /**
-     * Returns the columns of this column map as a collection.
-     * The columns in the returned collection should be sorted as the columns
-     * in this map.
-     */
-    public abstract Collection<Cell> getSortedColumns();
-
-    /**
-     * Returns the columns of this column map as a collection.
-     * The columns in the returned collection should be sorted in reverse
-     * order of the columns in this map.
-     */
-    public abstract Collection<Cell> getReverseSortedColumns();
-
-    /**
-     * Returns the number of columns in this map.
-     */
-    public abstract int getColumnCount();
-
-    /**
-     * Returns whether or not there are any columns present.
-     */
-    public abstract boolean hasColumns();
-
-    /**
-     * Returns true if this contains no columns or deletion info
-     */
-    public boolean isEmpty()
-    {
-        return deletionInfo().isLive() && !hasColumns();
-    }
-
-    /**
-     * Returns an iterator over the columns of this map that returns only the matching @param slices.
-     * The provided slices must be in order and must be non-overlapping.
-     */
-    public abstract Iterator<Cell> iterator(ColumnSlice[] slices);
-
-    /**
-     * Returns a reversed iterator over the columns of this map that returns only the matching @param slices.
-     * The provided slices must be in reversed order and must be non-overlapping.
-     */
-    public abstract Iterator<Cell> reverseIterator(ColumnSlice[] slices);
-
-    /**
-     * Returns if this map only support inserts in reverse order.
-     */
-    public abstract boolean isInsertReversed();
-
-    /**
-     * If `columns` has any tombstones (top-level or range tombstones), they will be applied to this set of columns.
-     */
-    public void delete(ColumnFamily columns)
-    {
-        delete(columns.deletionInfo());
-    }
-
-    /*
-     * This function will calculate the difference between 2 column families.
-     * The external input is assumed to be a superset of internal.
-     */
-    public ColumnFamily diff(ColumnFamily cfComposite)
-    {
-        assert cfComposite.id().equals(id());
-        ColumnFamily cfDiff = ArrayBackedSortedColumns.factory.create(metadata);
-        cfDiff.delete(cfComposite.deletionInfo());
-
-        // (don't need to worry about cfNew containing Columns that are shadowed by
-        // the delete tombstone, since cfNew was generated by CF.resolve, which
-        // takes care of those for us.)
-        for (Cell cellExternal : cfComposite)
-        {
-            CellName cName = cellExternal.name();
-            Cell cellInternal = getColumn(cName);
-            if (cellInternal == null)
-            {
-                cfDiff.addColumn(cellExternal);
-            }
-            else
-            {
-                Cell cellDiff = cellInternal.diff(cellExternal);
-                if (cellDiff != null)
-                {
-                    cfDiff.addColumn(cellDiff);
-                }
-            }
-        }
-
-        cfDiff.setDeletionInfo(deletionInfo().diff(cfComposite.deletionInfo()));
-
-        if (!cfDiff.isEmpty())
-            return cfDiff;
-        
-        return null;
-    }
-
-    public long dataSize()
-    {
-        long size = 0;
-        for (Cell cell : this)
-            size += cell.cellDataSize();
-        return size;
-    }
-
-    public long maxTimestamp()
-    {
-        long maxTimestamp = deletionInfo().maxTimestamp();
-        for (Cell cell : this)
-            maxTimestamp = Math.max(maxTimestamp, cell.timestamp());
-        return maxTimestamp;
-    }
-
-    @Override
-    public int hashCode()
-    {
-        HashCodeBuilder builder = new HashCodeBuilder(373, 75437)
-                .append(metadata)
-                .append(deletionInfo());
-        for (Cell cell : this)
-            builder.append(cell);
-        return builder.toHashCode();
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-        if (o == null || !(o instanceof ColumnFamily))
-            return false;
-
-        ColumnFamily comparison = (ColumnFamily) o;
-
-        return metadata.equals(comparison.metadata)
-               && deletionInfo().equals(comparison.deletionInfo())
-               && ByteBufferUtil.compareUnsigned(digest(this), digest(comparison)) == 0;
-    }
-
-    @Override
-    public String toString()
-    {
-        StringBuilder sb = new StringBuilder("ColumnFamily(");
-        sb.append(metadata.cfName);
-
-        if (isMarkedForDelete())
-            sb.append(" -").append(deletionInfo()).append("-");
-
-        sb.append(" [").append(CellNames.getColumnsString(getComparator(), this)).append("])");
-        return sb.toString();
-    }
-
-    public static ByteBuffer digest(ColumnFamily cf)
-    {
-        MessageDigest digest = FBUtilities.threadLocalMD5Digest();
-        if (cf != null)
-            cf.updateDigest(digest);
-        return ByteBuffer.wrap(digest.digest());
-    }
-
-    public void updateDigest(MessageDigest digest)
-    {
-        for (Cell cell : this)
-            cell.updateDigest(digest);
-
-        deletionInfo().updateDigest(digest);
-    }
-
-    public static ColumnFamily diff(ColumnFamily cf1, ColumnFamily cf2)
-    {
-        if (cf1 == null)
-            return cf2;
-        return cf1.diff(cf2);
-    }
-
-    public ColumnStats getColumnStats()
-    {
-        // note that we default to MIN_VALUE/MAX_VALUE here to be able to override them later in this method
-        // we are checking row/range tombstones and actual cells - there should always be data that overrides
-        // these with actual values
-        ColumnStats.MinLongTracker minTimestampTracker = new ColumnStats.MinLongTracker(Long.MIN_VALUE);
-        ColumnStats.MaxLongTracker maxTimestampTracker = new ColumnStats.MaxLongTracker(Long.MAX_VALUE);
-        StreamingHistogram tombstones = new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE);
-        ColumnStats.MaxIntTracker maxDeletionTimeTracker = new ColumnStats.MaxIntTracker(Integer.MAX_VALUE);
-        List<ByteBuffer> minColumnNamesSeen = Collections.emptyList();
-        List<ByteBuffer> maxColumnNamesSeen = Collections.emptyList();
-        boolean hasLegacyCounterShards = false;
-
-        if (deletionInfo().getTopLevelDeletion().localDeletionTime < Integer.MAX_VALUE)
-        {
-            tombstones.update(deletionInfo().getTopLevelDeletion().localDeletionTime);
-            maxDeletionTimeTracker.update(deletionInfo().getTopLevelDeletion().localDeletionTime);
-            minTimestampTracker.update(deletionInfo().getTopLevelDeletion().markedForDeleteAt);
-            maxTimestampTracker.update(deletionInfo().getTopLevelDeletion().markedForDeleteAt);
-        }
-        Iterator<RangeTombstone> it = deletionInfo().rangeIterator();
-        while (it.hasNext())
-        {
-            RangeTombstone rangeTombstone = it.next();
-            tombstones.update(rangeTombstone.getLocalDeletionTime());
-            minTimestampTracker.update(rangeTombstone.timestamp());
-            maxTimestampTracker.update(rangeTombstone.timestamp());
-            maxDeletionTimeTracker.update(rangeTombstone.getLocalDeletionTime());
-            minColumnNamesSeen = ColumnNameHelper.minComponents(minColumnNamesSeen, rangeTombstone.min, metadata.comparator);
-            maxColumnNamesSeen = ColumnNameHelper.maxComponents(maxColumnNamesSeen, rangeTombstone.max, metadata.comparator);
-        }
-
-        for (Cell cell : this)
-        {
-            minTimestampTracker.update(cell.timestamp());
-            maxTimestampTracker.update(cell.timestamp());
-            maxDeletionTimeTracker.update(cell.getLocalDeletionTime());
-
-            int deletionTime = cell.getLocalDeletionTime();
-            if (deletionTime < Integer.MAX_VALUE)
-                tombstones.update(deletionTime);
-            minColumnNamesSeen = ColumnNameHelper.minComponents(minColumnNamesSeen, cell.name(), metadata.comparator);
-            maxColumnNamesSeen = ColumnNameHelper.maxComponents(maxColumnNamesSeen, cell.name(), metadata.comparator);
-            if (cell instanceof CounterCell)
-                hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) cell).hasLegacyShards();
-        }
-        return new ColumnStats(getColumnCount(),
-                               minTimestampTracker.get(),
-                               maxTimestampTracker.get(),
-                               maxDeletionTimeTracker.get(),
-                               tombstones,
-                               minColumnNamesSeen,
-                               maxColumnNamesSeen,
-                               hasLegacyCounterShards);
-    }
-
-    public boolean isMarkedForDelete()
-    {
-        return !deletionInfo().isLive();
-    }
-
-    /**
-     * @return the comparator whose sorting order the contained columns conform to
-     */
-    public CellNameType getComparator()
-    {
-        return metadata.comparator;
-    }
-
-    public boolean hasOnlyTombstones(long now)
-    {
-        for (Cell cell : this)
-            if (cell.isLive(now))
-                return false;
-        return true;
-    }
-
-    public Iterator<Cell> iterator()
-    {
-        return getSortedColumns().iterator();
-    }
-
-    public Iterator<Cell> reverseIterator()
-    {
-        return getReverseSortedColumns().iterator();
-    }
-
-    public Map<CellName, ByteBuffer> asMap()
-    {
-        ImmutableMap.Builder<CellName, ByteBuffer> builder = ImmutableMap.builder();
-        for (Cell cell : this)
-            builder.put(cell.name(), cell.value());
-        return builder.build();
-    }
-
-    public static ColumnFamily fromBytes(ByteBuffer bytes)
-    {
-        if (bytes == null)
-            return null;
-
-        try
-        {
-            return serializer.deserialize(new DataInputStream(ByteBufferUtil.inputStream(bytes)),
-                                                              ArrayBackedSortedColumns.factory,
-                                                              ColumnSerializer.Flag.LOCAL,
-                                                              MessagingService.current_version);
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public ByteBuffer toBytes()
-    {
-        try (DataOutputBuffer out = new DataOutputBuffer())
-        {
-            serializer.serialize(this, out, MessagingService.current_version);
-            return ByteBuffer.wrap(out.getData(), 0, out.getLength());
-        }
-    }
-
-
-    /**
-     * @return an iterator where the removes are carried out once everything has been iterated
-     */
-    public abstract BatchRemoveIterator<Cell> batchRemoveIterator();
-
-    public abstract static class Factory <T extends ColumnFamily>
-    {
-        /**
-         * Returns a (initially empty) column map whose columns are sorted
-         * according to the provided comparator.
-         * The {@code insertReversed} flag is an hint on how we expect insertion to be perfomed,
-         * either in sorted or reverse sorted order. This is used by ArrayBackedSortedColumns to
-         * allow optimizing for both forward and reversed slices. This does not matter for ThreadSafeSortedColumns.
-         * Note that this is only an hint on how we expect to do insertion, this does not change the map sorting.
-         */
-        public abstract T create(CFMetaData metadata, boolean insertReversed, int initialCapacity);
-
-        public T create(CFMetaData metadata, boolean insertReversed)
-        {
-            return create(metadata, insertReversed, 0);
-        }
-
-        public T create(CFMetaData metadata)
-        {
-            return create(metadata, false);
-        }
-
-        public T create(String keyspace, String cfName)
-        {
-            return create(Schema.instance.getCFMetaData(keyspace, cfName));
-        }
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java b/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
deleted file mode 100644
index 928c21f..0000000
--- a/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
+++ /dev/null

@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.UUID;
-
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.io.ISSTableSerializer;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.UUIDSerializer;
-
-public class ColumnFamilySerializer implements IVersionedSerializer<ColumnFamily>, ISSTableSerializer<ColumnFamily>
-{
-    /*
-     * Serialized ColumnFamily format:
-     *
-     * [serialized for intra-node writes only, e.g. returning a query result]
-     * <cf nullability boolean: false if the cf is null>
-     * <cf id>
-     *
-     * [in sstable only]
-     * <column bloom filter>
-     * <sparse column index, start/finish columns every ColumnIndexSizeInKB of data>
-     *
-     * [always present]
-     * <local deletion time>
-     * <client-provided deletion time>
-     * <column count>
-     * <columns, serialized individually>
-    */
-    public void serialize(ColumnFamily cf, DataOutputPlus out, int version)
-    {
-        try
-        {
-            if (cf == null)
-            {
-                out.writeBoolean(false);
-                return;
-            }
-
-            out.writeBoolean(true);
-            serializeCfId(cf.id(), out, version);
-            cf.getComparator().deletionInfoSerializer().serialize(cf.deletionInfo(), out, version);
-            ColumnSerializer columnSerializer = cf.getComparator().columnSerializer();
-            int count = cf.getColumnCount();
-            out.writeInt(count);
-            int written = 0;
-            for (Cell cell : cf)
-            {
-                columnSerializer.serialize(cell, out);
-                written++;
-            }
-            assert count == written: "Table had " + count + " columns, but " + written + " written";
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public ColumnFamily deserialize(DataInput in, int version) throws IOException
-    {
-        return deserialize(in, ColumnSerializer.Flag.LOCAL, version);
-    }
-
-    public ColumnFamily deserialize(DataInput in, ColumnSerializer.Flag flag, int version) throws IOException
-    {
-        return deserialize(in, ArrayBackedSortedColumns.factory, flag, version);
-    }
-
-    public ColumnFamily deserialize(DataInput in, ColumnFamily.Factory factory, ColumnSerializer.Flag flag, int version) throws IOException
-    {
-        if (!in.readBoolean())
-            return null;
-
-        ColumnFamily cf = factory.create(Schema.instance.getCFMetaData(deserializeCfId(in, version)));
-
-        if (cf.metadata().isSuper() && version < MessagingService.VERSION_20)
-        {
-            SuperColumns.deserializerSuperColumnFamily(in, cf, flag, version);
-        }
-        else
-        {
-            cf.delete(cf.getComparator().deletionInfoSerializer().deserialize(in, version));
-
-            ColumnSerializer columnSerializer = cf.getComparator().columnSerializer();
-            int size = in.readInt();
-            for (int i = 0; i < size; ++i)
-                cf.addColumn(columnSerializer.deserialize(in, flag));
-        }
-        return cf;
-    }
-
-    public long contentSerializedSize(ColumnFamily cf, TypeSizes typeSizes, int version)
-    {
-        long size = cf.getComparator().deletionInfoSerializer().serializedSize(cf.deletionInfo(), typeSizes, version);
-        size += typeSizes.sizeof(cf.getColumnCount());
-        ColumnSerializer columnSerializer = cf.getComparator().columnSerializer();
-        for (Cell cell : cf)
-            size += columnSerializer.serializedSize(cell, typeSizes);
-        return size;
-    }
-
-    public long serializedSize(ColumnFamily cf, TypeSizes typeSizes, int version)
-    {
-        if (cf == null)
-        {
-            return typeSizes.sizeof(false);
-        }
-        else
-        {
-            return typeSizes.sizeof(true)  /* nullness bool */
-                 + cfIdSerializedSize(cf.id(), typeSizes, version)  /* id */
-                 + contentSerializedSize(cf, typeSizes, version);
-        }
-    }
-
-    public long serializedSize(ColumnFamily cf, int version)
-    {
-        return serializedSize(cf, TypeSizes.NATIVE, version);
-    }
-
-    public void serializeForSSTable(ColumnFamily cf, DataOutputPlus out)
-    {
-        // Column families shouldn't be written directly to disk, use ColumnIndex.Builder instead
-        throw new UnsupportedOperationException();
-    }
-
-    public ColumnFamily deserializeFromSSTable(DataInput in, Version version)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void serializeCfId(UUID cfId, DataOutputPlus out, int version) throws IOException
-    {
-        UUIDSerializer.serializer.serialize(cfId, out, version);
-    }
-
-    public UUID deserializeCfId(DataInput in, int version) throws IOException
-    {
-        UUID cfId = UUIDSerializer.serializer.deserialize(in, version);
-        if (Schema.instance.getCF(cfId) == null)
-            throw new UnknownColumnFamilyException("Couldn't find cfId=" + cfId, cfId);
-
-        return cfId;
-    }
-
-    public int cfIdSerializedSize(UUID cfId, TypeSizes typeSizes, int version)
-    {
-        return typeSizes.sizeof(cfId);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 7f625e1..70c14c0 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java

@@ -17,8 +17,6 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.*;
-import java.lang.management.ManagementFactory;
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
@@ -29,7 +27,6 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.regex.Pattern;
-
 import javax.management.*;
 import javax.management.openmbean.*;
 
@@ -38,62 +35,97 @@
 import com.google.common.base.Throwables;
 import com.google.common.collect.*;
 import com.google.common.util.concurrent.*;
-
-import org.apache.cassandra.db.lifecycle.SSTableIntervalTree;
-import org.apache.cassandra.db.lifecycle.View;
-import org.apache.cassandra.db.lifecycle.Tracker;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.io.FSWriteError;
-import org.json.simple.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import com.clearspring.analytics.stream.Counter;
 import org.apache.cassandra.cache.*;
 import org.apache.cassandra.concurrent.*;
 import org.apache.cassandra.config.*;
-import org.apache.cassandra.config.CFMetaData.SpeculativeRetry;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.compaction.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.view.TableViews;
+import org.apache.cassandra.db.lifecycle.*;
+import org.apache.cassandra.db.partitions.CachedPartition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.CellPath;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.FSReadError;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 import org.apache.cassandra.io.sstable.format.*;
-import org.apache.cassandra.io.sstable.metadata.CompactionMetadata;
-import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.metrics.ColumnFamilyMetrics;
-import org.apache.cassandra.metrics.ColumnFamilyMetrics.Sampler;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.metrics.TableMetrics.Sampler;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.streaming.StreamLockfile;
-import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.*;
-import org.apache.cassandra.utils.concurrent.*;
 import org.apache.cassandra.utils.TopKSampler.SamplerResult;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.concurrent.Refs;
 import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-import com.clearspring.analytics.stream.Counter;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
 
 import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
 import static org.apache.cassandra.utils.ExecutorUtils.shutdown;
 import static org.apache.cassandra.utils.Throwables.maybeFail;
+import static org.apache.cassandra.utils.Throwables.merge;
 
 public class ColumnFamilyStore implements ColumnFamilyStoreMBean
 {
+    // The directories which will be searched for sstables on cfs instantiation.
+    private static volatile Directories.DataDirectory[] initialDirectories = Directories.dataDirectories;
+
+    /**
+     * A hook to add additional directories to initialDirectories.
+     * Any additional directories should be added prior to ColumnFamilyStore instantiation on startup
+     *
+     * Since the directories used by a given table are determined by the compaction strategy,
+     * it's possible for sstables to be written to directories specified outside of cassandra.yaml.
+     * By adding additional directories to initialDirectories, sstables in these extra locations are
+     * made discoverable on sstable instantiation.
+     */
+    public static synchronized void addInitialDirectories(Directories.DataDirectory[] newDirectories)
+    {
+        assert newDirectories != null;
+
+        Set<Directories.DataDirectory> existing = Sets.newHashSet(initialDirectories);
+
+        List<Directories.DataDirectory> replacementList = Lists.newArrayList(initialDirectories);
+        for (Directories.DataDirectory directory: newDirectories)
+        {
+            if (!existing.contains(directory))
+            {
+                replacementList.add(directory);
+            }
+        }
+
+        Directories.DataDirectory[] replacementArray = new Directories.DataDirectory[replacementList.size()];
+        replacementList.toArray(replacementArray);
+        initialDirectories = replacementArray;
+    }
+
+    public static Directories.DataDirectory[] getInitialDirectories()
+    {
+        Directories.DataDirectory[] src = initialDirectories;
+        return Arrays.copyOf(src, src.length);
+    }
+
     private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyStore.class);
 
     private static final ExecutorService flushExecutor = new JMXEnabledThreadPoolExecutor(DatabaseDescriptor.getFlushWriters(),
@@ -111,10 +143,6 @@
                                                                                               new NamedThreadFactory("MemtablePostFlush"),
                                                                                               "internal");
 
-    // If a flush fails with an error the post-flush is never allowed to continue. This stores the error that caused it
-    // to be able to show an error on following flushes instead of blindly continuing.
-    private static volatile FSWriteError previousFlushFailure = null;
-
     private static final ExecutorService reclaimExecutor = new JMXEnabledThreadPoolExecutor(1,
                                                                                             StageManager.KEEPALIVE,
                                                                                             TimeUnit.SECONDS,
@@ -155,14 +183,12 @@
         }
     }
 
-    @VisibleForTesting
-    public static volatile ColumnFamilyStore discardFlushResults;
-
     public final Keyspace keyspace;
     public final String name;
     public final CFMetaData metadata;
-    public final IPartitioner partitioner;
     private final String mbeanName;
+    @Deprecated
+    private final String oldMBeanName;
     private volatile boolean valid = true;
 
     /**
@@ -182,18 +208,23 @@
     private final AtomicInteger fileIndexGenerator = new AtomicInteger(0);
 
     public final SecondaryIndexManager indexManager;
+    public final TableViews viewManager;
 
     /* These are locally held copies to be changed from the config during runtime */
-    private volatile DefaultInteger minCompactionThreshold;
-    private volatile DefaultInteger maxCompactionThreshold;
-    private final WrappingCompactionStrategy compactionStrategyWrapper;
+    private volatile DefaultValue<Integer> minCompactionThreshold;
+    private volatile DefaultValue<Integer> maxCompactionThreshold;
+    private volatile DefaultValue<Double> crcCheckChance;
 
-    public final Directories directories;
+    private final CompactionStrategyManager compactionStrategyManager;
 
-    public final ColumnFamilyMetrics metric;
+    private volatile Directories directories;
+
+    public final TableMetrics metric;
     public volatile long sampleLatencyNanos;
     private final ScheduledFuture<?> latencyCalculator;
 
+    private volatile boolean compactionSpaceCheck = true;
+
     public static void shutdownPostFlushExecutor() throws InterruptedException
     {
         postFlushExecutor.shutdown();
@@ -213,12 +244,16 @@
         // only update these runtime-modifiable settings if they have not been modified.
         if (!minCompactionThreshold.isModified())
             for (ColumnFamilyStore cfs : concatWithIndexes())
-                cfs.minCompactionThreshold = new DefaultInteger(metadata.getMinCompactionThreshold());
+                cfs.minCompactionThreshold = new DefaultValue(metadata.params.compaction.minCompactionThreshold());
         if (!maxCompactionThreshold.isModified())
             for (ColumnFamilyStore cfs : concatWithIndexes())
-                cfs.maxCompactionThreshold = new DefaultInteger(metadata.getMaxCompactionThreshold());
+                cfs.maxCompactionThreshold = new DefaultValue(metadata.params.compaction.maxCompactionThreshold());
+        if (!crcCheckChance.isModified())
+            for (ColumnFamilyStore cfs : concatWithIndexes())
+                cfs.crcCheckChance = new DefaultValue(metadata.params.crcCheckChance);
 
-        compactionStrategyWrapper.maybeReloadCompactionStrategy(metadata);
+        compactionStrategyManager.maybeReload(metadata);
+        directories = compactionStrategyManager.getDirectories();
 
         scheduleFlush();
 
@@ -232,7 +267,7 @@
 
     void scheduleFlush()
     {
-        int period = metadata.getMemtableFlushPeriod();
+        int period = metadata.params.memtableFlushPeriodInMs;
         if (period > 0)
         {
             logger.trace("scheduling flush in {} ms", period);
@@ -291,11 +326,9 @@
     {
         try
         {
-            Map<String, String> optionsCopy = new HashMap<>(options);
-            Class<? extends AbstractCompactionStrategy> compactionStrategyClass = CFMetaData.createCompactionStrategy(optionsCopy.get("class"));
-            optionsCopy.remove("class");
-            CFMetaData.validateCompactionOptions(compactionStrategyClass, optionsCopy);
-            compactionStrategyWrapper.setNewLocalCompactionStrategy(compactionStrategyClass, optionsCopy);
+            CompactionParams compactionParams = CompactionParams.fromMap(options);
+            compactionParams.validate();
+            compactionStrategyManager.setNewLocalCompactionStrategy(compactionParams);
         }
         catch (Throwable t)
         {
@@ -307,39 +340,20 @@
 
     public Map<String, String> getCompactionParameters()
     {
-        Map<String, String> options = new HashMap<>(compactionStrategyWrapper.options);
-        options.put("class", compactionStrategyWrapper.getName());
-        return options;
-    }
-
-    public void setCompactionStrategyClass(String compactionStrategyClass)
-    {
-        try
-        {
-            metadata.compactionStrategyClass = CFMetaData.createCompactionStrategy(compactionStrategyClass);
-            compactionStrategyWrapper.maybeReloadCompactionStrategy(metadata);
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IllegalArgumentException(e.getMessage());
-        }
-    }
-
-    public String getCompactionStrategyClass()
-    {
-        return metadata.compactionStrategyClass.getName();
+        return compactionStrategyManager.getCompactionParams().asMap();
     }
 
     public Map<String,String> getCompressionParameters()
     {
-        return metadata.compressionParameters().asThriftOptions();
+        return metadata.params.compression.asMap();
     }
 
     public void setCompressionParameters(Map<String,String> opts)
     {
         try
         {
-            metadata.compressionParameters = CompressionParameters.create(opts);
+            metadata.compression(CompressionParams.fromMap(opts));
+            metadata.params.compression.validate();
         }
         catch (ConfigurationException e)
         {
@@ -347,53 +361,38 @@
         }
     }
 
-    public void setCrcCheckChance(double crcCheckChance)
-    {
-        try
-        {
-            for (SSTableReader sstable : keyspace.getAllSSTables())
-                if (sstable.compression)
-                    sstable.getCompressionMetadata().parameters.setCrcCheckChance(crcCheckChance);
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IllegalArgumentException(e.getMessage());
-        }
-    }
-
-    public ColumnFamilyStore(Keyspace keyspace,
+    private ColumnFamilyStore(Keyspace keyspace,
                              String columnFamilyName,
-                             IPartitioner partitioner,
                              int generation,
                              CFMetaData metadata,
                              Directories directories,
                              boolean loadSSTables)
     {
-        this(keyspace, columnFamilyName, partitioner, generation, metadata, directories, loadSSTables, true);
+        this(keyspace, columnFamilyName, generation, metadata, directories, loadSSTables, true);
     }
 
 
     @VisibleForTesting
     public ColumnFamilyStore(Keyspace keyspace,
                               String columnFamilyName,
-                              IPartitioner partitioner,
                               int generation,
                               CFMetaData metadata,
                               Directories directories,
                               boolean loadSSTables,
                               boolean registerBookkeeping)
     {
+        assert directories != null;
         assert metadata != null : "null metadata for " + keyspace + ":" + columnFamilyName;
 
         this.keyspace = keyspace;
-        name = columnFamilyName;
         this.metadata = metadata;
-        this.minCompactionThreshold = new DefaultInteger(metadata.getMinCompactionThreshold());
-        this.maxCompactionThreshold = new DefaultInteger(metadata.getMaxCompactionThreshold());
-        this.partitioner = partitioner;
-        this.directories = directories;
-        this.indexManager = new SecondaryIndexManager(this);
-        this.metric = new ColumnFamilyMetrics(this);
+        name = columnFamilyName;
+        minCompactionThreshold = new DefaultValue<>(metadata.params.compaction.minCompactionThreshold());
+        maxCompactionThreshold = new DefaultValue<>(metadata.params.compaction.maxCompactionThreshold());
+        crcCheckChance = new DefaultValue<>(metadata.params.crcCheckChance);
+        indexManager = new SecondaryIndexManager(this);
+        viewManager = keyspace.viewManager.forTable(metadata);
+        metric = new TableMetrics(this);
         fileIndexGenerator.set(generation);
         sampleLatencyNanos = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getReadRpcTimeout() / 2);
 
@@ -408,48 +407,60 @@
         // scan for sstables corresponding to this cf and load them
         if (data.loadsstables)
         {
-            Directories.SSTableLister sstableFiles = directories.sstableLister().skipTemporary(true);
-            Collection<SSTableReader> sstables = SSTableReader.openAll(sstableFiles.list().entrySet(), metadata, this.partitioner);
+            Directories.SSTableLister sstableFiles = directories.sstableLister(Directories.OnTxnErr.IGNORE).skipTemporary(true);
+            Collection<SSTableReader> sstables = SSTableReader.openAll(sstableFiles.list().entrySet(), metadata);
             data.addInitialSSTables(sstables);
         }
 
         // compaction strategy should be created after the CFS has been prepared
-        this.compactionStrategyWrapper = new WrappingCompactionStrategy(this);
+        compactionStrategyManager = new CompactionStrategyManager(this);
+        this.directories = compactionStrategyManager.getDirectories();
 
         if (maxCompactionThreshold.value() <= 0 || minCompactionThreshold.value() <=0)
         {
             logger.warn("Disabling compaction strategy by setting compaction thresholds to 0 is deprecated, set the compaction option 'enabled' to 'false' instead.");
-            this.compactionStrategyWrapper.disable();
+            this.compactionStrategyManager.disable();
         }
 
         // create the private ColumnFamilyStores for the secondary column indexes
-        for (ColumnDefinition info : metadata.allColumns())
-        {
-            if (info.getIndexType() != null)
-                indexManager.addIndexedColumn(info);
-        }
+        for (IndexMetadata info : metadata.getIndexes())
+            indexManager.addIndex(info);
 
         if (registerBookkeeping)
         {
             // register the mbean
-            String type = this.partitioner instanceof LocalPartitioner ? "IndexColumnFamilies" : "ColumnFamilies";
-            mbeanName = "org.apache.cassandra.db:type=" + type + ",keyspace=" + this.keyspace.getName() + ",columnfamily=" + name;
-            MBeanWrapper.instance.registerMBean(this, mbeanName);
-            logger.trace("retryPolicy for {} is {}", name, this.metadata.getSpeculativeRetry());
+            mbeanName = String.format("org.apache.cassandra.db:type=%s,keyspace=%s,table=%s",
+                                         isIndex() ? "IndexTables" : "Tables",
+                                         keyspace.getName(), name);
+            oldMBeanName = String.format("org.apache.cassandra.db:type=%s,keyspace=%s,columnfamily=%s",
+                                         isIndex() ? "IndexColumnFamilies" : "ColumnFamilies",
+                                         keyspace.getName(), name);
+            try
+            {
+                ObjectName[] objectNames = {new ObjectName(mbeanName), new ObjectName(oldMBeanName)};
+                for (ObjectName objectName : objectNames)
+                {
+                    MBeanWrapper.instance.registerMBean(this, objectName);
+                }
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+            logger.trace("retryPolicy for {} is {}", name, this.metadata.params.speculativeRetry);
             latencyCalculator = ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(new Runnable()
             {
                 public void run()
                 {
-                    SpeculativeRetry retryPolicy = ColumnFamilyStore.this.metadata.getSpeculativeRetry();
-                    switch (retryPolicy.type)
+                    SpeculativeRetryParam retryPolicy = ColumnFamilyStore.this.metadata.params.speculativeRetry;
+                    switch (retryPolicy.kind())
                     {
                         case PERCENTILE:
                             // get percentile in nanos
-                            sampleLatencyNanos = (long) (metric.coordinatorReadLatency.getSnapshot().getValue(retryPolicy.value));
+                            sampleLatencyNanos = (long) (metric.coordinatorReadLatency.getSnapshot().getValue(retryPolicy.threshold()));
                             break;
                         case CUSTOM:
-                            // convert to nanos, since configuration is in millisecond
-                            sampleLatencyNanos = (long) (retryPolicy.value * 1000d * 1000d);
+                            sampleLatencyNanos = (long) retryPolicy.threshold();
                             break;
                         default:
                             sampleLatencyNanos = Long.MAX_VALUE;
@@ -462,9 +473,31 @@
         {
             latencyCalculator = ScheduledExecutors.optionalTasks.schedule(Runnables.doNothing(), 0, TimeUnit.NANOSECONDS);
             mbeanName = null;
+            oldMBeanName= null;
         }
     }
 
+    public Directories getDirectories()
+    {
+        return directories;
+    }
+
+    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, int sstableLevel, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
+    {
+        MetadataCollector collector = new MetadataCollector(metadata.comparator).sstableLevel(sstableLevel);
+        return createSSTableMultiWriter(descriptor, keyCount, repairedAt, collector, header, lifecycleNewTracker);
+    }
+
+    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, MetadataCollector metadataCollector, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
+    {
+        return getCompactionStrategyManager().createSSTableMultiWriter(descriptor, keyCount, repairedAt, metadataCollector, header, lifecycleNewTracker);
+    }
+
+    public boolean supportsEarlyOpen()
+    {
+        return compactionStrategyManager.supportsEarlyOpen();
+    }
+
     /** call when dropping or renaming a CF. Performs mbean housekeeping and invalidates CFS to other operations */
     public void invalidate()
     {
@@ -491,9 +524,12 @@
         }
 
         latencyCalculator.cancel(false);
+        compactionStrategyManager.shutdown();
         SystemKeyspace.removeTruncationRecord(metadata.cfId);
+
         data.dropSSTables();
-        indexManager.invalidate();
+        LifecycleTransaction.waitForDeletions();
+        indexManager.invalidateAllIndexesBlocking();
 
         invalidateCaches();
     }
@@ -507,11 +543,15 @@
         data.removeUnreadableSSTables(directory);
     }
 
-    void unregisterMBean()
+    void unregisterMBean() throws MalformedObjectNameException
     {
-        if (MBeanWrapper.instance.isRegistered(mbeanName)) {
-            MBeanWrapper.instance.unregisterMBean(mbeanName);
+        ObjectName[] objectNames = {new ObjectName(mbeanName), new ObjectName(oldMBeanName)};
+        for (ObjectName objectName : objectNames)
+        {
+            if (MBeanWrapper.instance.isRegistered(objectName))
+                MBeanWrapper.instance.unregisterMBean(objectName);
         }
+
         // unregister metrics
         metric.release();
     }
@@ -519,18 +559,17 @@
 
     public static ColumnFamilyStore createColumnFamilyStore(Keyspace keyspace, CFMetaData metadata, boolean loadSSTables)
     {
-        return createColumnFamilyStore(keyspace, metadata.cfName, StorageService.getPartitioner(), metadata, loadSSTables);
+        return createColumnFamilyStore(keyspace, metadata.cfName, metadata, loadSSTables);
     }
 
     public static synchronized ColumnFamilyStore createColumnFamilyStore(Keyspace keyspace,
                                                                          String columnFamily,
-                                                                         IPartitioner partitioner,
                                                                          CFMetaData metadata,
                                                                          boolean loadSSTables)
     {
         // get the max generation number, to prevent generation conflicts
-        Directories directories = new Directories(metadata);
-        Directories.SSTableLister lister = directories.sstableLister().includeBackups(true);
+        Directories directories = new Directories(metadata, initialDirectories);
+        Directories.SSTableLister lister = directories.sstableLister(Directories.OnTxnErr.IGNORE).includeBackups(true);
         List<Integer> generations = new ArrayList<Integer>();
         for (Map.Entry<Descriptor, Set<Component>> entry : lister.list().entrySet())
         {
@@ -543,7 +582,7 @@
         Collections.sort(generations);
         int value = (generations.size() > 0) ? (generations.get(generations.size() - 1)) : 0;
 
-        return new ColumnFamilyStore(keyspace, columnFamily, partitioner, value, metadata, directories, loadSSTables);
+        return new ColumnFamilyStore(keyspace, columnFamily, value, metadata, directories, loadSSTables);
     }
 
     /**
@@ -552,46 +591,29 @@
      */
     public static void scrubDataDirectories(CFMetaData metadata)
     {
-        Directories directories = new Directories(metadata);
+        Directories directories = new Directories(metadata, initialDirectories);
+        Set<File> cleanedDirectories = new HashSet<>();
 
-        // clear ephemeral snapshots that were not properly cleared last session (CASSANDRA-7357)
+         // clear ephemeral snapshots that were not properly cleared last session (CASSANDRA-7357)
         clearEphemeralSnapshots(directories);
 
-        // remove any left-behind SSTables from failed/stalled streaming
-        FileFilter filter = new FileFilter()
-        {
-            public boolean accept(File pathname)
-            {
-                return pathname.getPath().endsWith(StreamLockfile.FILE_EXT);
-            }
-        };
-        for (File dir : directories.getCFDirectories())
-        {
-            File[] lockfiles = dir.listFiles(filter);
-            // lock files can be null if I/O error happens
-            if (lockfiles == null || lockfiles.length == 0)
-                continue;
-            logger.info("Removing SSTables from failed streaming session. Found {} files to cleanup.", lockfiles.length);
+        directories.removeTemporaryDirectories();
 
-            for (File lockfile : lockfiles)
-            {
-                StreamLockfile streamLockfile = new StreamLockfile(lockfile);
-                streamLockfile.cleanup();
-                streamLockfile.delete();
-            }
-        }
+        logger.trace("Removing temporary or obsoleted files from unfinished operations for table {}", metadata.cfName);
+        LifecycleTransaction.removeUnfinishedLeftovers(metadata);
 
-        logger.trace("Removing compacted SSTable files from {} (see http://wiki.apache.org/cassandra/MemtableSSTable)", metadata.cfName);
-
-        for (Map.Entry<Descriptor,Set<Component>> sstableFiles : directories.sstableLister().list().entrySet())
+        logger.trace("Further extra check for orphan sstable files for {}", metadata.cfName);
+        for (Map.Entry<Descriptor,Set<Component>> sstableFiles : directories.sstableLister(Directories.OnTxnErr.IGNORE).list().entrySet())
         {
             Descriptor desc = sstableFiles.getKey();
+            File directory = desc.directory;
             Set<Component> components = sstableFiles.getValue();
 
-            if (desc.type.isTemporary)
+            if (!cleanedDirectories.contains(directory))
             {
-                SSTable.delete(desc, components);
-                continue;
+                cleanedDirectories.add(directory);
+                for (File tmpFile : desc.getTemporaryFiles())
+                    tmpFile.delete();
             }
 
             File dataFile = new File(desc.filenameFor(Component.DATA));
@@ -603,7 +625,9 @@
             logger.warn("Removing orphans for {}: {}", desc, components);
             for (Component component : components)
             {
-                FileUtils.deleteWithConfirm(desc.filenameFor(component));
+                File file = new File(desc.filenameFor(component));
+                if (file.exists())
+                    FileUtils.deleteWithConfirm(desc.filenameFor(component));
             }
         }
 
@@ -621,107 +645,12 @@
         }
 
         // also clean out any index leftovers.
-        for (ColumnDefinition def : metadata.allColumns())
-        {
-            if (def.isIndexed())
+        for (IndexMetadata index : metadata.getIndexes())
+            if (!index.isCustom())
             {
-                CellNameType indexComparator = SecondaryIndex.getIndexComparator(metadata, def);
-                if (indexComparator != null)
-                {
-                    CFMetaData indexMetadata = CFMetaData.newIndexMetadata(metadata, def, indexComparator);
-                    scrubDataDirectories(indexMetadata);
-                }
+                CFMetaData indexMetadata = CassandraIndex.indexCfsMetadata(metadata, index);
+                scrubDataDirectories(indexMetadata);
             }
-        }
-    }
-
-    /**
-     * Replacing compacted sstables is atomic as far as observers of Tracker are concerned, but not on the
-     * filesystem: first the new sstables are renamed to "live" status (i.e., the tmp marker is removed), then
-     * their ancestors are removed.
-     *
-     * If an unclean shutdown happens at the right time, we can thus end up with both the new ones and their
-     * ancestors "live" in the system.  This is harmless for normal data, but for counters it can cause overcounts.
-     *
-     * To prevent this, we record sstables being compacted in the system keyspace.  If we find unfinished
-     * compactions, we remove the new ones (since those may be incomplete -- under LCS, we may create multiple
-     * sstables from any given ancestor).
-     */
-    public static void removeUnfinishedCompactionLeftovers(CFMetaData metadata, Map<Integer, UUID> unfinishedCompactions)
-    {
-        Directories directories = new Directories(metadata);
-        Set<Integer> allGenerations = new HashSet<>();
-        for (Descriptor desc : directories.sstableLister().list().keySet())
-            allGenerations.add(desc.generation);
-
-        // sanity-check unfinishedCompactions
-        Set<Integer> unfinishedGenerations = unfinishedCompactions.keySet();
-        if (!allGenerations.containsAll(unfinishedGenerations))
-        {
-            HashSet<Integer> missingGenerations = new HashSet<>(unfinishedGenerations);
-            missingGenerations.removeAll(allGenerations);
-            logger.trace("Unfinished compactions of {}.{} reference missing sstables of generations {}",
-                         metadata.ksName, metadata.cfName, missingGenerations);
-        }
-
-        // remove new sstables from compactions that didn't complete, and compute
-        // set of ancestors that shouldn't exist anymore
-        Set<Integer> completedAncestors = new HashSet<>();
-        for (Map.Entry<Descriptor, Set<Component>> sstableFiles : directories.sstableLister().skipTemporary(true).list().entrySet())
-        {
-            // we rename the Data component last - if it does not exist as a final file, we should ignore this sstable and
-            // it will be removed during startup
-            if (!sstableFiles.getValue().contains(Component.DATA))
-                continue;
-
-            Descriptor desc = sstableFiles.getKey();
-
-            Set<Integer> ancestors;
-            try
-            {
-                CompactionMetadata compactionMetadata = (CompactionMetadata) desc.getMetadataSerializer().deserialize(desc, MetadataType.COMPACTION);
-                ancestors = compactionMetadata.ancestors;
-            }
-            catch (IOException e)
-            {
-                throw new FSReadError(e, desc.filenameFor(Component.STATS));
-            }
-            catch (NullPointerException e)
-            {
-                throw new FSReadError(e, "Failed to remove unfinished compaction leftovers (file: " + desc.filenameFor(Component.STATS) + ").  See log for details.");
-            }
-
-            if (!ancestors.isEmpty()
-                && unfinishedGenerations.containsAll(ancestors)
-                && allGenerations.containsAll(ancestors))
-            {
-                // any of the ancestors would work, so we'll just lookup the compaction task ID with the first one
-                UUID compactionTaskID = unfinishedCompactions.get(ancestors.iterator().next());
-                assert compactionTaskID != null;
-                logger.trace("Going to delete unfinished compaction product {}", desc);
-                SSTable.delete(desc, sstableFiles.getValue());
-                SystemKeyspace.finishCompaction(compactionTaskID);
-            }
-            else
-            {
-                completedAncestors.addAll(ancestors);
-            }
-        }
-
-        // remove old sstables from compactions that did complete
-        for (Map.Entry<Descriptor, Set<Component>> sstableFiles : directories.sstableLister().list().entrySet())
-        {
-            Descriptor desc = sstableFiles.getKey();
-            if (completedAncestors.contains(desc.generation))
-            {
-                // if any of the ancestors were participating in a compaction, finish that compaction
-                logger.trace("Going to delete leftover compaction ancestor {}", desc);
-                SSTable.delete(desc, sstableFiles.getValue());
-                UUID compactionTaskID = unfinishedCompactions.get(desc.generation);
-                if (compactionTaskID != null)
-                    SystemKeyspace.finishCompaction(unfinishedCompactions.get(desc.generation));
-            }
-        }
     }
 
     /**
@@ -730,7 +659,7 @@
      * @param ksName The keyspace name
      * @param cfName The columnFamily name
      */
-    public static synchronized void loadNewSSTables(String ksName, String cfName)
+    public static void loadNewSSTables(String ksName, String cfName)
     {
         /** ks/cf existence checks will be done by open and getCFS methods for us */
         Keyspace keyspace = Keyspace.open(ksName);
@@ -744,20 +673,18 @@
     {
         logger.info("Loading new SSTables for {}/{}...", keyspace.getName(), name);
 
-        Set<Descriptor> currentDescriptors = new HashSet<Descriptor>();
-        for (SSTableReader sstable : data.getView().sstables)
+        Set<Descriptor> currentDescriptors = new HashSet<>();
+        for (SSTableReader sstable : getSSTables(SSTableSet.CANONICAL))
             currentDescriptors.add(sstable.descriptor);
         Set<SSTableReader> newSSTables = new HashSet<>();
 
-        Directories.SSTableLister lister = directories.sstableLister().skipTemporary(true);
+        Directories.SSTableLister lister = getDirectories().sstableLister(Directories.OnTxnErr.IGNORE).skipTemporary(true);
         for (Map.Entry<Descriptor, Set<Component>> entry : lister.list().entrySet())
         {
             Descriptor descriptor = entry.getKey();
 
             if (currentDescriptors.contains(descriptor))
                 continue; // old (initialized) SSTable found, skipping
-            if (descriptor.type.isTemporary) // in the process of being written
-                continue;
 
             if (!descriptor.isCompatible())
                 throw new RuntimeException(String.format("Can't open incompatible SSTable! Current version %s, found file: %s",
@@ -772,7 +699,8 @@
             }
             catch (IOException e)
             {
-                SSTableReader.logOpenException(entry.getKey(), e);
+                FileUtils.handleCorruptSSTable(new CorruptSSTableException(e, entry.getKey().filenameFor(Component.STATS)));
+                logger.error("Cannot read sstable {}; other IO error, skipping table", entry, e);
                 continue;
             }
 
@@ -786,8 +714,8 @@
                                                descriptor.ksname,
                                                descriptor.cfname,
                                                fileIndexGenerator.incrementAndGet(),
-                                               Descriptor.Type.FINAL,
-                                               descriptor.formatType);
+                                               descriptor.formatType,
+                                               descriptor.digestComponent);
             }
             while (new File(newDescriptor.filenameFor(Component.DATA)).exists());
 
@@ -797,11 +725,24 @@
             SSTableReader reader;
             try
             {
-                reader = SSTableReader.open(newDescriptor, entry.getValue(), metadata, partitioner);
+                reader = SSTableReader.open(newDescriptor, entry.getValue(), metadata);
             }
-            catch (IOException e)
+            catch (CorruptSSTableException ex)
             {
-                SSTableReader.logOpenException(entry.getKey(), e);
+                FileUtils.handleCorruptSSTable(ex);
+                logger.error("Corrupt sstable {}; skipping table", entry, ex);
+                continue;
+            }
+            catch (FSError ex)
+            {
+                FileUtils.handleFSError(ex);
+                logger.error("Cannot read sstable {}; file system error, skipping table", entry, ex);
+                continue;
+            }
+            catch (IOException ex)
+            {
+                FileUtils.handleCorruptSSTable(new CorruptSSTableException(ex, entry.getKey().filenameFor(Component.DATA)));
+                logger.error("Cannot read sstable {}; other IO error, skipping table", entry, ex);
                 continue;
             }
             newSSTables.add(reader);
@@ -818,7 +759,7 @@
         try (Refs<SSTableReader> refs = Refs.ref(newSSTables))
         {
             data.addSSTables(newSSTables);
-            indexManager.maybeBuildSecondaryIndexes(newSSTables, indexManager.allIndexesNames());
+            indexManager.buildAllIndexesBlocking(newSSTables);
         }
 
         logger.info("Done loading load new SSTables for {}/{}", keyspace.getName(), name);
@@ -835,41 +776,44 @@
 
         Set<String> indexes = new HashSet<String>(Arrays.asList(idxNames));
 
-        Collection<SSTableReader> sstables = cfs.getSSTables();
-
+        Iterable<SSTableReader> sstables = cfs.getSSTables(SSTableSet.CANONICAL);
         try (Refs<SSTableReader> refs = Refs.ref(sstables))
         {
-            cfs.indexManager.setIndexRemoved(indexes);
-            logger.info(String.format("User Requested secondary index re-build for %s/%s indexes", ksName, cfName));
-            cfs.indexManager.maybeBuildSecondaryIndexes(sstables, indexes);
-            cfs.indexManager.setIndexBuilt(indexes);
+            logger.info("User Requested secondary index re-build for {}/{} indexes: {}", ksName, cfName, Joiner.on(',').join(idxNames));
+            cfs.indexManager.rebuildIndexesBlocking(refs, indexes);
         }
     }
 
+    @Deprecated
     public String getColumnFamilyName()
     {
+        return getTableName();
+    }
+
+    public String getTableName()
+    {
         return name;
     }
 
-    public String getTempSSTablePath(File directory)
+    public String getSSTablePath(File directory)
     {
-        return getTempSSTablePath(directory, DatabaseDescriptor.getSSTableFormat().info.getLatestVersion(), DatabaseDescriptor.getSSTableFormat());
+        return getSSTablePath(directory, DatabaseDescriptor.getSSTableFormat().info.getLatestVersion(), DatabaseDescriptor.getSSTableFormat());
     }
 
-    public String getTempSSTablePath(File directory, SSTableFormat.Type format)
+    public String getSSTablePath(File directory, SSTableFormat.Type format)
     {
-        return getTempSSTablePath(directory, format.info.getLatestVersion(), format);
+        return getSSTablePath(directory, format.info.getLatestVersion(), format);
     }
 
-    private String getTempSSTablePath(File directory, Version version, SSTableFormat.Type format)
+    private String getSSTablePath(File directory, Version version, SSTableFormat.Type format)
     {
         Descriptor desc = new Descriptor(version,
                                          directory,
                                          keyspace.getName(),
                                          name,
                                          fileIndexGenerator.incrementAndGet(),
-                                         Descriptor.Type.TEMP,
-                                         format);
+                                         format,
+                                         Component.digestFor(BigFormat.latestVersion.uncompressedChecksumType()));
         return desc.filenameFor(Component.DATA);
     }
 
@@ -899,33 +843,12 @@
     {
         synchronized (data)
         {
-            if (previousFlushFailure != null)
-                throw new IllegalStateException("A flush previously failed with the error below. To prevent data loss, "
-                                              + "no flushes can be carried out until the node is restarted.",
-                                                previousFlushFailure);
             logFlush();
             Flush flush = new Flush(false);
-            ListenableFutureTask<Void> flushTask = ListenableFutureTask.create(flush, null);
-            flushExecutor.execute(flushTask);
+            flushExecutor.execute(flush);
             ListenableFutureTask<ReplayPosition> task = ListenableFutureTask.create(flush.postFlush);
             postFlushExecutor.execute(task);
-
-            @SuppressWarnings("unchecked")
-            ListenableFuture<ReplayPosition> future =
-                    // If either of the two tasks errors out, resulting future must also error out.
-                    // Combine the two futures and only return post-flush result after both have completed.
-                    // Note that flushTask will always yield null, but Futures.allAsList is
-                    // order preserving, which is why the transform function returns the result
-                    // from item 1 in it's input list (i.e. what was yielded by task).
-                    Futures.transform(Futures.allAsList(flushTask, task),
-                                      new Function<List<Object>, ReplayPosition>()
-                                      {
-                                          public ReplayPosition apply(List<Object> input)
-                                          {
-                                              return (ReplayPosition) input.get(1);
-                                          }
-                                      });
-            return future;
+            return task;
         }
     }
 
@@ -941,16 +864,13 @@
         onHeapTotal += memtable.getAllocator().onHeap().owns();
         offHeapTotal += memtable.getAllocator().offHeap().owns();
 
-        for (SecondaryIndex index : indexManager.getIndexes())
+        for (ColumnFamilyStore indexCfs : indexManager.getAllIndexColumnFamilyStores())
         {
-            if (index.getIndexCfs() != null)
-            {
-                MemtableAllocator allocator = index.getIndexCfs().getTracker().getView().getCurrentMemtable().getAllocator();
-                onHeapRatio += allocator.onHeap().ownershipRatio();
-                offHeapRatio += allocator.offHeap().ownershipRatio();
-                onHeapTotal += allocator.onHeap().owns();
-                offHeapTotal += allocator.offHeap().owns();
-            }
+            MemtableAllocator allocator = indexCfs.getTracker().getView().getCurrentMemtable().getAllocator();
+            onHeapRatio += allocator.onHeap().ownershipRatio();
+            offHeapRatio += allocator.offHeap().ownershipRatio();
+            onHeapTotal += allocator.onHeap().owns();
+            offHeapTotal += allocator.offHeap().owns();
         }
 
         logger.debug("Enqueuing flush of {}: {}", name, String.format("%d (%.0f%%) on-heap, %d (%.0f%%) off-heap",
@@ -1025,47 +945,17 @@
      */
     private final class PostFlush implements Callable<ReplayPosition>
     {
-        final boolean flushSecondaryIndexes;
-        final OpOrder.Barrier writeBarrier;
         final CountDownLatch latch = new CountDownLatch(1);
-        final ReplayPosition commitLogUpperBound;
+        volatile Throwable flushFailure = null;
         final List<Memtable> memtables;
-        final List<SSTableReader> readers;
 
-        private PostFlush(boolean flushSecondaryIndexes, OpOrder.Barrier writeBarrier, ReplayPosition commitLogUpperBound,
-                          List<Memtable> memtables, List<SSTableReader> readers)
+        private PostFlush(List<Memtable> memtables)
         {
-            this.writeBarrier = writeBarrier;
-            this.flushSecondaryIndexes = flushSecondaryIndexes;
-            this.commitLogUpperBound = commitLogUpperBound;
             this.memtables = memtables;
-            this.readers = readers;
         }
 
         public ReplayPosition call()
         {
-            if (discardFlushResults == ColumnFamilyStore.this)
-                return commitLogUpperBound;
-
-            writeBarrier.await();
-
-            /**
-             * we can flush 2is as soon as the barrier completes, as they will be consistent with (or ahead of) the
-             * flushed memtables and CL position, which is as good as we can guarantee.
-             * TODO: SecondaryIndex should support setBarrier(), so custom implementations can co-ordinate exactly
-             * with CL as we do with memtables/CFS-backed SecondaryIndexes.
-             */
-
-            if (flushSecondaryIndexes)
-            {
-                for (SecondaryIndex index : indexManager.getIndexesNotBackedByCfs())
-                {
-                    // flush any non-cfs backed indexes
-                    logger.info("Flushing SecondaryIndex {}", index);
-                    index.forceBlockingFlush();
-                }
-            }
-
             try
             {
                 // we wait on the latch for the commitLogUpperBound to be set, and so that waiters
@@ -1077,16 +967,20 @@
                 throw new IllegalStateException();
             }
 
-            CommitLog.instance.discardCompletedSegments(metadata.cfId, commitLogUpperBound);
-            for (int i = 0 ; i < memtables.size() ; i++)
+            ReplayPosition commitLogUpperBound = ReplayPosition.NONE;
+            // If a flush errored out but the error was ignored, make sure we don't discard the commit log.
+            if (flushFailure == null && !memtables.isEmpty())
             {
-                Memtable memtable = memtables.get(i);
-                SSTableReader reader = readers.get(i);
-                memtable.cfs.data.permitCompactionOfFlushed(reader);
-                memtable.cfs.compactionStrategyWrapper.replaceFlushed(memtable, reader);
+                Memtable memtable = memtables.get(0);
+                commitLogUpperBound = memtable.getCommitLogUpperBound();
+                CommitLog.instance.discardCompletedSegments(metadata.cfId, memtable.getCommitLogLowerBound(), commitLogUpperBound);
             }
+
             metric.pendingFlushes.dec();
 
+            if (flushFailure != null)
+                Throwables.propagate(flushFailure);
+
             return commitLogUpperBound;
         }
     }
@@ -1103,7 +997,6 @@
     {
         final OpOrder.Barrier writeBarrier;
         final List<Memtable> memtables = new ArrayList<>();
-        final List<SSTableReader> readers = new ArrayList<>();
         final PostFlush postFlush;
         final boolean truncate;
 
@@ -1145,7 +1038,7 @@
             // since this happens after wiring up the commitLogUpperBound, we also know all operations with earlier
             // replay positions have also completed, i.e. the memtables are done and ready to flush
             writeBarrier.issue();
-            postFlush = new PostFlush(!truncate, writeBarrier, commitLogUpperBound.get(), memtables, readers);
+            postFlush = new PostFlush(memtables);
         }
 
         public void run()
@@ -1155,61 +1048,43 @@
             writeBarrier.markBlocking();
             writeBarrier.await();
 
-            // mark all memtables as flushing, removing them from the live memtable list, and
-            // remove any memtables that are already clean from the set we need to flush
-            Iterator<Memtable> iter = memtables.iterator();
-            while (iter.hasNext())
-            {
-                Memtable memtable = iter.next();
+            // mark all memtables as flushing, removing them from the live memtable list
+            for (Memtable memtable : memtables)
                 memtable.cfs.data.markFlushing(memtable);
-                if (memtable.isClean() || truncate)
-                {
-                    memtable.cfs.data.replaceFlushed(memtable, null);
-                    memtable.cfs.compactionStrategyWrapper.replaceFlushed(memtable, null);
-                    reclaim(memtable);
-                    iter.remove();
-                }
-            }
-
-            if (memtables.isEmpty())
-            {
-                postFlush.latch.countDown();
-                return;
-            }
 
             metric.memtableSwitchCount.inc();
 
             try
             {
+                boolean flushNonCf2i = true;
                 for (Memtable memtable : memtables)
                 {
-                    // flush the memtable
-                    SSTableReader reader = memtable.flush();
-                    memtable.cfs.data.replaceFlushed(memtable, reader);
+                    Collection<SSTableReader> readers = Collections.emptyList();
+                    if (!memtable.isClean() && !truncate)
+                    {
+                        // TODO: SecondaryIndex should support setBarrier(), so custom implementations can co-ordinate exactly
+                        // with CL as we do with memtables/CFS-backed SecondaryIndexes.
+                        if (flushNonCf2i)
+                        {
+                            indexManager.flushAllNonCFSBackedIndexesBlocking();
+                            flushNonCf2i = false;
+                        }
+                        readers = memtable.flush();
+                    }
+                    memtable.cfs.replaceFlushed(memtable, readers);
                     reclaim(memtable);
-                    readers.add(reader);
                 }
-
-                // signal the post-flush we've done our work
-                // Note: This should not be done in case of error. Read more below.
-                postFlush.latch.countDown();
             }
-            catch (FSWriteError e)
+            catch (Throwable e)
             {
                 JVMStabilityInspector.inspectThrowable(e);
-                // The call above may kill the process or the transports, or ignore the error.
-                // In any case we should not be passing on control to post-flush as a subsequent succeeding flush
-                // could mask the error and:
-                //   - let the commit log discard unpersisted data, resulting in data loss
-                //   - let truncations proceed, with the possibility of resurrecting the unflushed data
-                //   - let snapshots succeed with incomplete data
-
-                // Not passing control on means that all flushes from the moment of failure cannot complete
-                // (including snapshots).
-                // If the disk failure policy is ignore, this will cause memtables and the commit log to grow
-                // unboundedly until the node eventually fails.
-                previousFlushFailure = e;
-                throw e;
+                // If we weren't killed, try to continue work but do not allow CommitLog to be discarded.
+                postFlush.flushFailure = e;
+            }
+            finally
+            {
+                // signal the post-flush we've done our work
+                postFlush.latch.countDown();
             }
         }
 
@@ -1246,21 +1121,6 @@
         }
     }
 
-    @VisibleForTesting
-    // this method should ONLY be used for testing commit log behaviour; it discards the current memtable
-    // contents without marking the commit log clean, and prevents any proceeding flushes from marking
-    // the commit log as done, however they *will* terminate (unlike under typical failures) to ensure progress is made
-    public void simulateFailedFlush()
-    {
-        discardFlushResults = this;
-        data.markFlushing(data.switchMemtable(false, new Memtable(new AtomicReference<>(CommitLog.instance.getContext()), this)));
-    }
-
-    public void resumeFlushing()
-    {
-        discardFlushResults = null;
-    }
-
     /**
      * Finds the largest memtable, as a percentage of *either* on- or off-heap memory limits, and immediately
      * queues it for flushing. If the memtable selected is flushed before this completes, no work is done.
@@ -1285,14 +1145,11 @@
                 onHeap += current.getAllocator().onHeap().ownershipRatio();
                 offHeap += current.getAllocator().offHeap().ownershipRatio();
 
-                for (SecondaryIndex index : cfs.indexManager.getIndexes())
+                for (ColumnFamilyStore indexCfs : cfs.indexManager.getAllIndexColumnFamilyStores())
                 {
-                    if (index.getIndexCfs() != null)
-                    {
-                        MemtableAllocator allocator = index.getIndexCfs().getTracker().getView().getCurrentMemtable().getAllocator();
-                        onHeap += allocator.onHeap().ownershipRatio();
-                        offHeap += allocator.offHeap().ownershipRatio();
-                    }
+                    MemtableAllocator allocator = indexCfs.getTracker().getView().getCurrentMemtable().getAllocator();
+                    onHeap += allocator.onHeap().ownershipRatio();
+                    offHeap += allocator.offHeap().ownershipRatio();
                 }
 
                 float ratio = Math.max(onHeap, offHeap);
@@ -1313,7 +1170,7 @@
                 float flushingOnHeap = Memtable.MEMORY_POOL.onHeap.reclaimingRatio();
                 float flushingOffHeap = Memtable.MEMORY_POOL.offHeap.reclaimingRatio();
                 float thisOnHeap = largest.getAllocator().onHeap().ownershipRatio();
-                float thisOffHeap = largest.getAllocator().onHeap().ownershipRatio();
+                float thisOffHeap = largest.getAllocator().offHeap().ownershipRatio();
                 logger.debug("Flushing largest {} to free up room. Used total: {}, live: {}, flushing: {}, this: {}",
                             largest.cfs, ratio(usedOnHeap, usedOffHeap), ratio(liveOnHeap, liveOffHeap),
                             ratio(flushingOnHeap, flushingOffHeap), ratio(thisOnHeap, thisOffHeap));
@@ -1333,7 +1190,7 @@
             return;
 
         RowCacheKey cacheKey = new RowCacheKey(metadata.ksAndCFName, key);
-        invalidateCachedRow(cacheKey);
+        invalidateCachedPartition(cacheKey);
     }
 
     /**
@@ -1343,108 +1200,33 @@
      * param @ key - key for update/insert
      * param @ columnFamily - columnFamily changes
      */
-    public void apply(DecoratedKey key, ColumnFamily columnFamily, SecondaryIndexManager.Updater indexer, OpOrder.Group opGroup, ReplayPosition replayPosition)
+    public void apply(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, ReplayPosition replayPosition)
+
     {
         long start = System.nanoTime();
         Memtable mt = data.getMemtableFor(opGroup, replayPosition);
-        final long timeDelta = mt.put(key, columnFamily, indexer, opGroup);
-        maybeUpdateRowCache(key);
-        metric.samplers.get(Sampler.WRITES).addSample(key.getKey(), key.hashCode(), 1);
-        metric.writeLatency.addNano(System.nanoTime() - start);
-        // CASSANDRA-11117 - certain resolution paths on memtable put can result in very
-        // large time deltas, either through a variety of sentinel timestamps (used for empty values, ensuring
-        // a minimal write, etc). This limits the time delta to the max value the histogram
-        // can bucket correctly. This also filters the Long.MAX_VALUE case where there was no previous value
-        // to update.
-        if(timeDelta < Long.MAX_VALUE)
-            metric.colUpdateTimeDeltaHistogram.update(Math.min(18165375903306L, timeDelta));
-    }
-
-    /**
-     * Purges gc-able top-level and range tombstones, returning `cf` if there are any columns or tombstones left,
-     * null otherwise.
-     * @param gcBefore a timestamp (in seconds); tombstones with a localDeletionTime before this will be purged
-     */
-    public static ColumnFamily removeDeletedCF(ColumnFamily cf, int gcBefore)
-    {
-        // purge old top-level and range tombstones
-        cf.purgeTombstones(gcBefore);
-
-        // if there are no columns or tombstones left, return null
-        return !cf.hasColumns() && !cf.isMarkedForDelete() ? null : cf;
-    }
-
-    /**
-     * Removes deleted columns and purges gc-able tombstones.
-     * @return an updated `cf` if any columns or tombstones remain, null otherwise
-     */
-    public static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore)
-    {
-        return removeDeleted(cf, gcBefore, SecondaryIndexManager.nullUpdater);
-    }
-
-    /*
-     This is complicated because we need to preserve deleted columns and columnfamilies
-     until they have been deleted for at least GC_GRACE_IN_SECONDS.  But, we do not need to preserve
-     their contents; just the object itself as a "tombstone" that can be used to repair other
-     replicas that do not know about the deletion.
-     */
-    public static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
-    {
-        if (cf == null)
+        try
         {
-            return null;
+            long timeDelta = mt.put(update, indexer, opGroup);
+            DecoratedKey key = update.partitionKey();
+            maybeUpdateRowCache(key);
+            metric.samplers.get(Sampler.WRITES).addSample(key.getKey(), key.hashCode(), 1);
+            metric.writeLatency.addNano(System.nanoTime() - start);
+            // CASSANDRA-11117 - certain resolution paths on memtable put can result in very
+            // large time deltas, either through a variety of sentinel timestamps (used for empty values, ensuring
+            // a minimal write, etc). This limits the time delta to the max value the histogram
+            // can bucket correctly. This also filters the Long.MAX_VALUE case where there was no previous value
+            // to update.
+            if(timeDelta < Long.MAX_VALUE)
+                metric.colUpdateTimeDeltaHistogram.update(Math.min(18165375903306L, timeDelta));
+        }
+        catch (RuntimeException e)
+        {
+            throw new RuntimeException(e.getMessage()
+                                                     + " for ks: "
+                                                     + keyspace.getName() + ", table: " + name, e);
         }
 
-        return removeDeletedCF(removeDeletedColumnsOnly(cf, gcBefore, indexer), gcBefore);
-    }
-
-    /**
-     * Removes only per-cell tombstones, cells that are shadowed by a row-level or range tombstone, or
-     * columns that have been dropped from the schema (for CQL3 tables only).
-     * @return the updated ColumnFamily
-     */
-    public static ColumnFamily removeDeletedColumnsOnly(ColumnFamily cf, int gcBefore, SecondaryIndexManager.Updater indexer)
-    {
-        BatchRemoveIterator<Cell> iter = cf.batchRemoveIterator();
-        DeletionInfo.InOrderTester tester = cf.inOrderDeletionTester();
-        boolean hasDroppedColumns = !cf.metadata.getDroppedColumns().isEmpty();
-        while (iter.hasNext())
-        {
-            Cell c = iter.next();
-            // remove columns if
-            // (a) the column itself is gcable or
-            // (b) the column is shadowed by a CF tombstone
-            // (c) the column has been dropped from the CF schema (CQL3 tables only)
-            if (c.getLocalDeletionTime() < gcBefore || tester.isDeleted(c) || (hasDroppedColumns && isDroppedColumn(c, cf.metadata())))
-            {
-                iter.remove();
-                indexer.remove(c);
-            }
-        }
-        iter.commit();
-        return cf;
-    }
-
-    // returns true if
-    // 1. this column has been dropped from schema and
-    // 2. if it has been re-added since then, this particular column was inserted before the last drop
-    private static boolean isDroppedColumn(Cell c, CFMetaData meta)
-    {
-        Long droppedAt = meta.getDroppedColumns().get(c.name().cql3ColumnName(meta));
-        return droppedAt != null && c.timestamp() <= droppedAt;
-    }
-
-    private void removeDroppedColumns(ColumnFamily cf)
-    {
-        if (cf == null || cf.metadata.getDroppedColumns().isEmpty())
-            return;
-
-        BatchRemoveIterator<Cell> iter = cf.batchRemoveIterator();
-        while (iter.hasNext())
-            if (isDroppedColumn(iter.next(), metadata))
-                iter.remove();
-        iter.commit();
     }
 
     /**
@@ -1452,7 +1234,7 @@
      * @return sstables whose key range overlaps with that of the given sstables, not including itself.
      * (The given sstables may or may not overlap with each other.)
      */
-    public Collection<SSTableReader> getOverlappingSSTables(Iterable<SSTableReader> sstables)
+    public Collection<SSTableReader> getOverlappingLiveSSTables(Iterable<SSTableReader> sstables)
     {
         logger.trace("Checking for sstables overlapping {}", sstables);
 
@@ -1461,18 +1243,12 @@
         if (!sstables.iterator().hasNext())
             return ImmutableSet.of();
 
-
+        View view = data.getView();
 
         List<SSTableReader> sortedByFirst = Lists.newArrayList(sstables);
-        Collections.sort(sortedByFirst, new Comparator<SSTableReader>()
-        {
-            @Override
-            public int compare(SSTableReader o1, SSTableReader o2)
-            {
-                return o1.first.compareTo(o2.first);
-            }
-        });
-        List<Interval<RowPosition, SSTableReader>> intervals = new ArrayList<>();
+        Collections.sort(sortedByFirst, (o1, o2) -> o1.first.compareTo(o2.first));
+
+        List<AbstractBounds<PartitionPosition>> bounds = new ArrayList<>();
         DecoratedKey first = null, last = null;
         /*
         normalize the intervals covered by the sstables
@@ -1499,18 +1275,17 @@
                 }
                 else
                 {
-                    intervals.add(Interval.<RowPosition, SSTableReader>create(first, last));
+                    bounds.add(AbstractBounds.bounds(first, true, last, true));
                     first = sstable.first;
                     last = sstable.last;
                 }
             }
         }
-        intervals.add(Interval.<RowPosition, SSTableReader>create(first, last));
-        SSTableIntervalTree tree = data.getView().intervalTree;
+        bounds.add(AbstractBounds.bounds(first, true, last, true));
         Set<SSTableReader> results = new HashSet<>();
 
-        for (Interval<RowPosition, SSTableReader> interval : intervals)
-            results.addAll(tree.search(interval));
+        for (AbstractBounds<PartitionPosition> bound : bounds)
+            Iterables.addAll(results, view.liveSSTablesInBounds(bound.left, bound.right));
 
         return Sets.difference(results, ImmutableSet.copyOf(sstables));
     }
@@ -1518,11 +1293,11 @@
     /**
      * like getOverlappingSSTables, but acquires references before returning
      */
-    public Refs<SSTableReader> getAndReferenceOverlappingSSTables(Iterable<SSTableReader> sstables)
+    public Refs<SSTableReader> getAndReferenceOverlappingLiveSSTables(Iterable<SSTableReader> sstables)
     {
         while (true)
         {
-            Iterable<SSTableReader> overlapped = getOverlappingSSTables(sstables);
+            Iterable<SSTableReader> overlapped = getOverlappingLiveSSTables(sstables);
             Refs<SSTableReader> refs = Refs.tryRef(overlapped);
             if (refs != null)
                 return refs;
@@ -1610,12 +1385,13 @@
         return CompactionManager.instance.performCleanup(ColumnFamilyStore.this, jobs);
     }
 
-    public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs) throws ExecutionException, InterruptedException
+    public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, int jobs) throws ExecutionException, InterruptedException
     {
-        return scrub(disableSnapshot, skipCorrupted, false, checkData, reinsertOverflowedTTLRows, jobs);
+        return scrub(disableSnapshot, skipCorrupted, reinsertOverflowedTTL, false, checkData, jobs);
     }
 
-    public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted, boolean alwaysFail, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs) throws ExecutionException, InterruptedException
+    @VisibleForTesting
+    public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boolean skipCorrupted, boolean reinsertOverflowedTTL, boolean alwaysFail, boolean checkData, int jobs) throws ExecutionException, InterruptedException
     {
         // skip snapshot creation during scrub, SEE JIRA 5891
         if(!disableSnapshot)
@@ -1623,7 +1399,7 @@
 
         try
         {
-            return CompactionManager.instance.performScrub(ColumnFamilyStore.this, skipCorrupted, checkData, reinsertOverflowedTTLRows, jobs);
+            return CompactionManager.instance.performScrub(ColumnFamilyStore.this, skipCorrupted, checkData, reinsertOverflowedTTL, jobs);
         }
         catch(Throwable t)
         {
@@ -1642,25 +1418,19 @@
      */
     public boolean rebuildOnFailedScrub(Throwable failure)
     {
-        if (!isIndex())
-            return false;
-
-        SecondaryIndex index = null;
-        if (metadata.cfName.contains(Directories.SECONDARY_INDEX_NAME_SEPARATOR))
-        {
-            String[] parts = metadata.cfName.split("\\" + Directories.SECONDARY_INDEX_NAME_SEPARATOR, 2);
-            ColumnFamilyStore parentCfs = keyspace.getColumnFamilyStore(parts[0]);
-            index = parentCfs.indexManager.getIndexByName(metadata.cfName);
-            assert index != null;
-        }
-
-        if (index == null)
+        if (!isIndex() || !SecondaryIndexManager.isIndexColumnFamilyStore(this))
             return false;
 
         truncateBlocking();
 
         logger.warn("Rebuilding index for {} because of <{}>", name, failure.getMessage());
-        index.getBaseCfs().rebuildSecondaryIndex(index.getIndexName());
+
+        ColumnFamilyStore parentCfs = SecondaryIndexManager.getParentCfs(this);
+        assert parentCfs.indexManager.getAllIndexColumnFamilyStores().contains(this);
+
+        String indexName = SecondaryIndexManager.getIndexName(this);
+
+        parentCfs.rebuildSecondaryIndex(indexName);
         return true;
     }
 
@@ -1680,14 +1450,16 @@
         maybeFail(data.dropSSTables(Predicates.in(sstables), compactionType, null));
     }
 
+    void replaceFlushed(Memtable memtable, Collection<SSTableReader> sstables)
+    {
+        compactionStrategyManager.replaceFlushed(memtable, sstables);
+    }
+
     public boolean isValid()
     {
         return valid;
     }
 
-
-
-
     /**
      * Package protected for access from the CompactionManager.
      */
@@ -1696,296 +1468,53 @@
         return data;
     }
 
-    public Collection<SSTableReader> getSSTables()
+    public Set<SSTableReader> getLiveSSTables()
     {
-        return data.getSSTables();
+        return data.getView().liveSSTables();
     }
 
-    public Iterable<SSTableReader> getPermittedToCompactSSTables()
+    public Iterable<SSTableReader> getSSTables(SSTableSet sstableSet)
     {
-        return data.getPermittedToCompact();
+        return data.getView().select(sstableSet);
     }
 
-    public Set<SSTableReader> getUncompactingSSTables()
+    public Iterable<SSTableReader> getUncompactingSSTables()
     {
         return data.getUncompacting();
     }
 
-    public ColumnFamily getColumnFamily(DecoratedKey key,
-                                        Composite start,
-                                        Composite finish,
-                                        boolean reversed,
-                                        int limit,
-                                        long timestamp)
-    {
-        return getColumnFamily(QueryFilter.getSliceFilter(key, name, start, finish, reversed, limit, timestamp));
-    }
-
-    /**
-     * Fetch the row and columns given by filter.key if it is in the cache; if not, read it from disk and cache it
-     *
-     * If row is cached, and the filter given is within its bounds, we return from cache, otherwise from disk
-     *
-     * If row is not cached, we figure out what filter is "biggest", read that from disk, then
-     * filter the result and either cache that or return it.
-     *
-     * @param cfId the column family to read the row from
-     * @param filter the columns being queried.
-     * @return the requested data for the filter provided
-     */
-    private ColumnFamily getThroughCache(UUID cfId, QueryFilter filter)
-    {
-        assert isRowCacheEnabled()
-               : String.format("Row cache is not enabled on table [" + name + "]");
-
-        RowCacheKey key = new RowCacheKey(metadata.ksAndCFName, filter.key);
-
-        // attempt a sentinel-read-cache sequence.  if a write invalidates our sentinel, we'll return our
-        // (now potentially obsolete) data, but won't cache it. see CASSANDRA-3862
-        // TODO: don't evict entire rows on writes (#2864)
-        IRowCacheEntry cached = CacheService.instance.rowCache.get(key);
-        if (cached != null)
-        {
-            if (cached instanceof RowCacheSentinel)
-            {
-                // Some other read is trying to cache the value, just do a normal non-caching read
-                Tracing.trace("Row cache miss (race)");
-                metric.rowCacheMiss.inc();
-                return getTopLevelColumns(filter, Integer.MIN_VALUE);
-            }
-
-            ColumnFamily cachedCf = (ColumnFamily)cached;
-            if (isFilterFullyCoveredBy(filter.filter, cachedCf, filter.timestamp))
-            {
-                metric.rowCacheHit.inc();
-                Tracing.trace("Row cache hit");
-                ColumnFamily result = filterColumnFamily(cachedCf, filter);
-                metric.updateSSTableIterated(0);
-                return result;
-            }
-
-            metric.rowCacheHitOutOfRange.inc();
-            Tracing.trace("Ignoring row cache as cached value could not satisfy query");
-            return getTopLevelColumns(filter, Integer.MIN_VALUE);
-        }
-
-        metric.rowCacheMiss.inc();
-        Tracing.trace("Row cache miss");
-        RowCacheSentinel sentinel = new RowCacheSentinel();
-        boolean sentinelSuccess = CacheService.instance.rowCache.putIfAbsent(key, sentinel);
-        ColumnFamily data = null;
-        ColumnFamily toCache = null;
-        try
-        {
-            // If we are explicitely asked to fill the cache with full partitions, we go ahead and query the whole thing
-            if (metadata.getCaching().rowCache.cacheFullPartitions())
-            {
-                data = getTopLevelColumns(QueryFilter.getIdentityFilter(filter.key, name, filter.timestamp), Integer.MIN_VALUE);
-                toCache = data;
-                Tracing.trace("Populating row cache with the whole partition");
-                if (sentinelSuccess && toCache != null)
-                    CacheService.instance.rowCache.replace(key, sentinel, toCache);
-                return filterColumnFamily(data, filter);
-            }
-
-            // Otherwise, if we want to cache the result of the query we're about to do, we must make sure this query
-            // covers what needs to be cached. And if the user filter does not satisfy that, we sometimes extend said
-            // filter so we can populate the cache but only if:
-            //   1) we can guarantee it is a strict extension, i.e. that we will still fetch the data asked by the user.
-            //   2) the extension does not make us query more than getRowsPerPartitionToCache() (as a mean to limit the
-            //      amount of extra work we'll do on a user query for the purpose of populating the cache).
-            //
-            // In practice, we can only guarantee those 2 points if the filter is one that queries the head of the
-            // partition (and if that filter actually counts CQL3 rows since that's what we cache and it would be
-            // bogus to compare the filter count to the 'rows to cache' otherwise).
-            if (filter.filter.isHeadFilter() && filter.filter.countCQL3Rows(metadata.comparator))
-            {
-                SliceQueryFilter sliceFilter = (SliceQueryFilter)filter.filter;
-                int rowsToCache = metadata.getCaching().rowCache.rowsToCache;
-
-                SliceQueryFilter cacheSlice = readFilterForCache();
-                QueryFilter cacheFilter = new QueryFilter(filter.key, name, cacheSlice, filter.timestamp);
-
-                // If the filter count is less than the number of rows cached, we simply extend it to make sure we do cover the
-                // number of rows to cache, and if that count is greater than the number of rows to cache, we simply filter what
-                // needs to be cached afterwards.
-                if (sliceFilter.count < rowsToCache)
-                {
-                    toCache = getTopLevelColumns(cacheFilter, Integer.MIN_VALUE);
-                    if (toCache != null)
-                    {
-                        Tracing.trace("Populating row cache ({} rows cached)", cacheSlice.lastCounted());
-                        data = filterColumnFamily(toCache, filter);
-                    }
-                }
-                else
-                {
-                    data = getTopLevelColumns(filter, Integer.MIN_VALUE);
-                    if (data != null)
-                    {
-                        // The filter limit was greater than the number of rows to cache. But, if the filter had a non-empty
-                        // finish bound, we may have gotten less than what needs to be cached, in which case we shouldn't cache it
-                        // (otherwise a cache hit would assume the whole partition is cached which is not the case).
-                        if (sliceFilter.finish().isEmpty() || sliceFilter.lastCounted() >= rowsToCache)
-                        {
-                            toCache = filterColumnFamily(data, cacheFilter);
-                            Tracing.trace("Caching {} rows (out of {} requested)", cacheSlice.lastCounted(), sliceFilter.count);
-                        }
-                        else
-                        {
-                            Tracing.trace("Not populating row cache, not enough rows fetched ({} fetched but {} required for the cache)", sliceFilter.lastCounted(), rowsToCache);
-                        }
-                    }
-                }
-
-                if (sentinelSuccess && toCache != null)
-                    CacheService.instance.rowCache.replace(key, sentinel, toCache);
-                return data;
-            }
-            else
-            {
-                Tracing.trace("Fetching data but not populating cache as query does not query from the start of the partition");
-                return getTopLevelColumns(filter, Integer.MIN_VALUE);
-            }
-        }
-        finally
-        {
-            if (sentinelSuccess && toCache == null)
-                invalidateCachedRow(key);
-        }
-    }
-
-    public SliceQueryFilter readFilterForCache()
-    {
-        // We create a new filter everytime before for now SliceQueryFilter is unfortunatly mutable.
-        return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, metadata.getCaching().rowCache.rowsToCache, metadata.clusteringColumns().size());
-    }
-
-    public boolean isFilterFullyCoveredBy(IDiskAtomFilter filter, ColumnFamily cachedCf, long now)
+    public boolean isFilterFullyCoveredBy(ClusteringIndexFilter filter, DataLimits limits, CachedPartition cached, int nowInSec)
     {
         // We can use the cached value only if we know that no data it doesn't contain could be covered
         // by the query filter, that is if:
         //   1) either the whole partition is cached
-        //   2) or we can ensure than any data the filter selects are in the cached partition
+        //   2) or we can ensure than any data the filter selects is in the cached partition
 
-        // When counting rows to decide if the whole row is cached, we should be careful with expiring
-        // columns: if we use a timestamp newer than the one that was used when populating the cache, we might
-        // end up deciding the whole partition is cached when it's really not (just some rows expired since the
-        // cf was cached). This is the reason for Integer.MIN_VALUE below.
-        boolean wholePartitionCached = cachedCf.liveCQL3RowCount(Integer.MIN_VALUE) < metadata.getCaching().rowCache.rowsToCache;
+        // We can guarantee that a partition is fully cached if the number of rows it contains is less than
+        // what we're caching. Wen doing that, we should be careful about expiring cells: we should count
+        // something expired that wasn't when the partition was cached, or we could decide that the whole
+        // partition is cached when it's not. This is why we use CachedPartition#cachedLiveRows.
+        if (cached.cachedLiveRows() < metadata.params.caching.rowsPerPartitionToCache())
+            return true;
 
-        // Contrarily to the "wholePartitionCached" check above, we do want isFullyCoveredBy to take the
-        // timestamp of the query into account when dealing with expired columns. Otherwise, we could think
-        // the cached partition has enough live rows to satisfy the filter when it doesn't because some
-        // are now expired.
-        return wholePartitionCached || filter.isFullyCoveredBy(cachedCf, now);
+        // If the whole partition isn't cached, then we must guarantee that the filter cannot select data that
+        // is not in the cache. We can guarantee that if either the filter is a "head filter" and the cached
+        // partition has more live rows that queried (where live rows refers to the rows that are live now),
+        // or if we can prove that everything the filter selects is in the cached partition based on its content.
+        return (filter.isHeadFilter() && limits.hasEnoughLiveData(cached,
+                                                                  nowInSec,
+                                                                  filter.selectsAllPartition(),
+                                                                  metadata.enforceStrictLiveness()))
+                || filter.isFullyCoveredBy(cached);
     }
 
-    public int gcBefore(long now)
+    public int gcBefore(int nowInSec)
     {
-        return (int) (now / 1000) - metadata.getGcGraceSeconds();
-    }
-
-    /**
-     * get a list of columns starting from a given column, in a specified order.
-     * only the latest version of a column is returned.
-     * @return null if there is no data and no tombstones; otherwise a ColumnFamily
-     */
-    public ColumnFamily getColumnFamily(QueryFilter filter)
-    {
-        assert name.equals(filter.getColumnFamilyName()) : filter.getColumnFamilyName();
-
-        ColumnFamily result = null;
-
-        long start = System.nanoTime();
-        try
-        {
-            int gcBefore = gcBefore(filter.timestamp);
-            if (isRowCacheEnabled())
-            {
-                assert !isIndex(); // CASSANDRA-5732
-                UUID cfId = metadata.cfId;
-
-                ColumnFamily cached = getThroughCache(cfId, filter);
-                if (cached == null)
-                {
-                    logger.trace("cached row is empty");
-                    return null;
-                }
-
-                result = cached;
-            }
-            else
-            {
-                ColumnFamily cf = getTopLevelColumns(filter, gcBefore);
-
-                if (cf == null)
-                    return null;
-
-                result = removeDeletedCF(cf, gcBefore);
-            }
-
-            removeDroppedColumns(result);
-
-            if (filter.filter instanceof SliceQueryFilter)
-            {
-                // Log the number of tombstones scanned on single key queries
-                metric.tombstoneScannedHistogram.update(((SliceQueryFilter) filter.filter).lastTombstones());
-                metric.liveScannedHistogram.update(((SliceQueryFilter) filter.filter).lastLive());
-            }
-        }
-        finally
-        {
-            metric.readLatency.addNano(System.nanoTime() - start);
-        }
-
-        return result;
-    }
-
-    /**
-     *  Filter a cached row, which will not be modified by the filter, but may be modified by throwing out
-     *  tombstones that are no longer relevant.
-     *  The returned column family won't be thread safe.
-     */
-    ColumnFamily filterColumnFamily(ColumnFamily cached, QueryFilter filter)
-    {
-        if (cached == null)
-            return null;
-
-        ColumnFamily cf = cached.cloneMeShallow(ArrayBackedSortedColumns.factory, filter.filter.isReversed());
-        int gcBefore = gcBefore(filter.timestamp);
-        filter.collateOnDiskAtom(cf, filter.getIterator(cached), gcBefore);
-        return removeDeletedCF(cf, gcBefore);
-    }
-
-    public Set<SSTableReader> getUnrepairedSSTables()
-    {
-        Set<SSTableReader> unRepairedSSTables = new HashSet<>(getSSTables());
-        Iterator<SSTableReader> sstableIterator = unRepairedSSTables.iterator();
-        while(sstableIterator.hasNext())
-        {
-            SSTableReader sstable = sstableIterator.next();
-            if (sstable.isRepaired())
-                sstableIterator.remove();
-        }
-        return unRepairedSSTables;
-    }
-
-    public Set<SSTableReader> getRepairedSSTables()
-    {
-        Set<SSTableReader> repairedSSTables = new HashSet<>(getSSTables());
-        Iterator<SSTableReader> sstableIterator = repairedSSTables.iterator();
-        while(sstableIterator.hasNext())
-        {
-            SSTableReader sstable = sstableIterator.next();
-            if (!sstable.isRepaired())
-                sstableIterator.remove();
-        }
-        return repairedSSTables;
+        return nowInSec - metadata.params.gcGraceSeconds;
     }
 
     @SuppressWarnings("resource")
-    public RefViewFragment selectAndReference(Function<View, List<SSTableReader>> filter)
+    public RefViewFragment selectAndReference(Function<View, Iterable<SSTableReader>> filter)
     {
         long failingSince = -1L;
         while (true)
@@ -2011,92 +1540,21 @@
         }
     }
 
-    public ViewFragment select(Function<View, List<SSTableReader>> filter)
+    public ViewFragment select(Function<View, Iterable<SSTableReader>> filter)
     {
         View view = data.getView();
-        List<SSTableReader> sstables = view.intervalTree.isEmpty()
-                                       ? Collections.<SSTableReader>emptyList()
-                                       : filter.apply(view);
+        List<SSTableReader> sstables = Lists.newArrayList(filter.apply(view));
         return new ViewFragment(sstables, view.getAllMemtables());
     }
 
-
-    /**
-     * @return a ViewFragment containing the sstables and memtables that may need to be merged
-     * for the given @param key, according to the interval tree
-     */
-    public Function<View, List<SSTableReader>> viewFilter(final DecoratedKey key)
-    {
-        assert !key.isMinimum();
-        return new Function<View, List<SSTableReader>>()
-        {
-            public List<SSTableReader> apply(View view)
-            {
-                return compactionStrategyWrapper.filterSSTablesForReads(view.intervalTree.search(key));
-            }
-        };
-    }
-
-    /**
-     * @return a ViewFragment containing the sstables and memtables that may need to be merged
-     * for rows within @param rowBounds, inclusive, according to the interval tree.
-     */
-    public Function<View, List<SSTableReader>> viewFilter(final AbstractBounds<RowPosition> rowBounds)
-    {
-        assert !AbstractBounds.strictlyWrapsAround(rowBounds.left, rowBounds.right);
-        return new Function<View, List<SSTableReader>>()
-        {
-            public List<SSTableReader> apply(View view)
-            {
-                // Note that View.sstablesInBounds always includes it's bound while rowBounds may not. This is ok however
-                // because the fact we restrict the sstables returned by this function is an optimization in the first
-                // place and the returned sstables will (almost) never cover *exactly* rowBounds anyway. It's also
-                // *very* unlikely that a sstable is included *just* because we consider one of the bound inclusively
-                // instead of exclusively, so the performance impact is negligible in practice.
-                return view.sstablesInBounds(rowBounds.left, rowBounds.right);
-            }
-        };
-    }
-
-    /**
-     * @return a ViewFragment containing the sstables and memtables that may need to be merged
-     * for rows for all of @param rowBoundsCollection, inclusive, according to the interval tree.
-     */
-    public Function<View, List<SSTableReader>> viewFilter(final Collection<AbstractBounds<RowPosition>> rowBoundsCollection, final boolean includeRepaired)
-    {
-        assert AbstractBounds.noneStrictlyWrapsAround(rowBoundsCollection);
-        return new Function<View, List<SSTableReader>>()
-        {
-            public List<SSTableReader> apply(View view)
-            {
-                Set<SSTableReader> sstables = Sets.newHashSet();
-                for (AbstractBounds<RowPosition> rowBounds : rowBoundsCollection)
-                {
-                    // Note that View.sstablesInBounds always includes it's bound while rowBounds may not. This is ok however
-                    // because the fact we restrict the sstables returned by this function is an optimization in the first
-                    // place and the returned sstables will (almost) never cover *exactly* rowBounds anyway. It's also
-                    // *very* unlikely that a sstable is included *just* because we consider one of the bound inclusively
-                    // instead of exclusively, so the performance impact is negligible in practice.
-                    for (SSTableReader sstable : view.sstablesInBounds(rowBounds.left, rowBounds.right))
-                    {
-                        if (includeRepaired || !sstable.isRepaired())
-                            sstables.add(sstable);
-                    }
-                }
-
-                logger.trace("ViewFilter for {}/{} sstables", sstables.size(), getSSTables().size());
-                return ImmutableList.copyOf(sstables);
-            }
-        };
-    }
-
+    // WARNING: this returns the set of LIVE sstables only, which may be only partially written
     public List<String> getSSTablesForKey(String key)
     {
-        DecoratedKey dk = partitioner.decorateKey(metadata.getKeyValidator().fromString(key));
+        DecoratedKey dk = decorateKey(metadata.getKeyValidator().fromString(key));
         try (OpOrder.Group op = readOrdering.start())
         {
             List<String> files = new ArrayList<>();
-            for (SSTableReader sstr : select(viewFilter(dk)).sstables)
+            for (SSTableReader sstr : select(View.select(SSTableSet.LIVE, dk)).sstables)
             {
                 // check if the key actually exists in this sstable, without updating cache and stats
                 if (sstr.getPosition(dk, SSTableReader.Operator.EQ, false) != null)
@@ -2106,20 +1564,6 @@
         }
     }
 
-    public ColumnFamily getTopLevelColumns(QueryFilter filter, int gcBefore)
-    {
-        Tracing.trace("Executing single-partition query on {}", name);
-        CollationController controller = new CollationController(this, filter, gcBefore);
-        ColumnFamily columns;
-        try (OpOrder.Group op = readOrdering.start())
-        {
-            columns = controller.getTopLevelColumns(Memtable.MEMORY_POOL.needToCopyOnHeap());
-        }
-        if (columns != null)
-            metric.samplers.get(Sampler.READS).addSample(filter.key.getKey(), filter.key.hashCode(), 1);
-        metric.updateSSTableIterated(controller.getSstablesIterated());
-        return columns;
-    }
 
     public void beginLocalSampling(String sampler, int capacity)
     {
@@ -2144,6 +1588,16 @@
                 samplerResults.cardinality, result});
     }
 
+    public boolean isCompactionDiskSpaceCheckEnabled()
+    {
+        return compactionSpaceCheck;
+    }
+
+    public void compactionDiskSpaceCheck(boolean enable)
+    {
+        compactionSpaceCheck = enable;
+    }
+
     public void cleanupCache()
     {
         Collection<Range<Token>> ranges = StorageService.instance.getLocalRanges(keyspace.getName());
@@ -2152,9 +1606,9 @@
              keyIter.hasNext(); )
         {
             RowCacheKey key = keyIter.next();
-            DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.key));
+            DecoratedKey dk = decorateKey(ByteBuffer.wrap(key.key));
             if (key.ksAndCFName.equals(metadata.ksAndCFName) && !Range.isInRanges(dk.getToken(), ranges))
-                invalidateCachedRow(dk);
+                invalidateCachedPartition(dk);
         }
 
         if (metadata.isCounter())
@@ -2163,254 +1617,14 @@
                  keyIter.hasNext(); )
             {
                 CounterCacheKey key = keyIter.next();
-                DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.partitionKey));
+                DecoratedKey dk = decorateKey(ByteBuffer.wrap(key.partitionKey));
                 if (key.ksAndCFName.equals(metadata.ksAndCFName) && !Range.isInRanges(dk.getToken(), ranges))
                     CacheService.instance.counterCache.remove(key);
             }
         }
     }
 
-    public static abstract class AbstractScanIterator extends AbstractIterator<Row> implements CloseableIterator<Row>
-    {
-        public boolean needsFiltering()
-        {
-            return true;
-        }
-    }
-
-    /**
-      * Iterate over a range of rows and columns from memtables/sstables.
-      *
-      * @param range The range of keys and columns within those keys to fetch
-     */
-    @SuppressWarnings("resource")
-    private AbstractScanIterator getSequentialIterator(final DataRange range, long now)
-    {
-        assert !(range.keyRange() instanceof Range) || !((Range<?>)range.keyRange()).isWrapAround() || range.keyRange().right.isMinimum() : range.keyRange();
-
-        final ViewFragment view = select(viewFilter(range.keyRange()));
-        Tracing.trace("Executing seq scan across {} sstables for {}", view.sstables.size(), range.keyRange().getString(metadata.getKeyValidator()));
-
-        final CloseableIterator<Row> iterator = RowIteratorFactory.getIterator(view.memtables, view.sstables, range, this, now);
-
-        // todo this could be pushed into SSTableScanner
-        return new AbstractScanIterator()
-        {
-            protected Row computeNext()
-            {
-                while (true)
-                {
-                    // pull a row out of the iterator
-                    if (!iterator.hasNext())
-                        return endOfData();
-
-                    Row current = iterator.next();
-                    DecoratedKey key = current.key;
-
-                    if (!range.stopKey().isMinimum() && range.stopKey().compareTo(key) < 0)
-                        return endOfData();
-
-                    // skipping outside of assigned range
-                    if (!range.contains(key))
-                        continue;
-
-                    if (logger.isTraceEnabled())
-                        logger.trace("scanned {}", metadata.getKeyValidator().getString(key.getKey()));
-
-                    return current;
-                }
-            }
-
-            public void close() throws IOException
-            {
-                iterator.close();
-            }
-        };
-    }
-
-    @VisibleForTesting
-    public List<Row> getRangeSlice(final AbstractBounds<RowPosition> range,
-                                   List<IndexExpression> rowFilter,
-                                   IDiskAtomFilter columnFilter,
-                                   int maxResults)
-    {
-        return getRangeSlice(range, rowFilter, columnFilter, maxResults, System.currentTimeMillis());
-    }
-
-    public List<Row> getRangeSlice(final AbstractBounds<RowPosition> range,
-                                   List<IndexExpression> rowFilter,
-                                   IDiskAtomFilter columnFilter,
-                                   int maxResults,
-                                   long now)
-    {
-        return getRangeSlice(makeExtendedFilter(range, columnFilter, rowFilter, maxResults, false, false, now));
-    }
-
-    /**
-     * Allows generic range paging with the slice column filter.
-     * Typically, suppose we have rows A, B, C ... Z having each some columns in [1, 100].
-     * And suppose we want to page through the query that for all rows returns the columns
-     * within [25, 75]. For that, we need to be able to do a range slice starting at (row r, column c)
-     * and ending at (row Z, column 75), *but* that only return columns in [25, 75].
-     * That is what this method allows. The columnRange is the "window" of  columns we are interested
-     * in each row, and columnStart (resp. columnEnd) is the start (resp. end) for the first
-     * (resp. last) requested row.
-     */
-    public ExtendedFilter makeExtendedFilter(AbstractBounds<RowPosition> keyRange,
-                                             SliceQueryFilter columnRange,
-                                             Composite columnStart,
-                                             Composite columnStop,
-                                             List<IndexExpression> rowFilter,
-                                             int maxResults,
-                                             boolean countCQL3Rows,
-                                             long now)
-    {
-        DataRange dataRange = new DataRange.Paging(keyRange, columnRange, columnStart, columnStop, metadata);
-        return ExtendedFilter.create(this, dataRange, rowFilter, maxResults, countCQL3Rows, now);
-    }
-
-    public List<Row> getRangeSlice(AbstractBounds<RowPosition> range,
-                                   List<IndexExpression> rowFilter,
-                                   IDiskAtomFilter columnFilter,
-                                   int maxResults,
-                                   long now,
-                                   boolean countCQL3Rows,
-                                   boolean isPaging)
-    {
-        return getRangeSlice(makeExtendedFilter(range, columnFilter, rowFilter, maxResults, countCQL3Rows, isPaging, now));
-    }
-
-    public ExtendedFilter makeExtendedFilter(AbstractBounds<RowPosition> range,
-                                             IDiskAtomFilter columnFilter,
-                                             List<IndexExpression> rowFilter,
-                                             int maxResults,
-                                             boolean countCQL3Rows,
-                                             boolean isPaging,
-                                             long timestamp)
-    {
-        DataRange dataRange;
-        if (isPaging)
-        {
-            assert columnFilter instanceof SliceQueryFilter;
-            SliceQueryFilter sfilter = (SliceQueryFilter)columnFilter;
-            assert sfilter.slices.length == 1;
-            // create a new SliceQueryFilter that selects all cells, but pass the original slice start and finish
-            // through to DataRange.Paging to be used on the first and last partitions
-            SliceQueryFilter newFilter = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, sfilter.isReversed(), sfilter.count);
-            dataRange = new DataRange.Paging(range, newFilter, sfilter.start(), sfilter.finish(), metadata);
-        }
-        else
-        {
-            dataRange = new DataRange(range, columnFilter);
-        }
-        return ExtendedFilter.create(this, dataRange, rowFilter, maxResults, countCQL3Rows, timestamp);
-    }
-
-    public List<Row> getRangeSlice(ExtendedFilter filter)
-    {
-        long start = System.nanoTime();
-        try (OpOrder.Group op = readOrdering.start())
-        {
-            return filter(getSequentialIterator(filter.dataRange, filter.timestamp), filter);
-        }
-        finally
-        {
-            metric.rangeLatency.addNano(System.nanoTime() - start);
-        }
-    }
-
-    @VisibleForTesting
-    public List<Row> search(AbstractBounds<RowPosition> range,
-                            List<IndexExpression> clause,
-                            IDiskAtomFilter dataFilter,
-                            int maxResults)
-    {
-        return search(range, clause, dataFilter, maxResults, System.currentTimeMillis());
-    }
-
-    public List<Row> search(AbstractBounds<RowPosition> range,
-                            List<IndexExpression> clause,
-                            IDiskAtomFilter dataFilter,
-                            int maxResults,
-                            long now)
-    {
-        return search(makeExtendedFilter(range, dataFilter, clause, maxResults, false, false, now));
-    }
-
-    public List<Row> search(ExtendedFilter filter)
-    {
-        Tracing.trace("Executing indexed scan for {}", filter.dataRange.keyRange().getString(metadata.getKeyValidator()));
-        return indexManager.search(filter);
-    }
-
-    public List<Row> filter(AbstractScanIterator rowIterator, ExtendedFilter filter)
-    {
-        logger.trace("Filtering {} for rows matching {}", rowIterator, filter);
-        List<Row> rows = new ArrayList<Row>();
-        int columnsCount = 0;
-        int total = 0, matched = 0;
-        boolean ignoreTombstonedPartitions = filter.ignoreTombstonedPartitions();
-
-        try
-        {
-            while (rowIterator.hasNext() && matched < filter.maxRows() && columnsCount < filter.maxColumns())
-            {
-                // get the raw columns requested, and additional columns for the expressions if necessary
-                Row rawRow = rowIterator.next();
-                total++;
-                ColumnFamily data = rawRow.cf;
-
-                if (rowIterator.needsFiltering())
-                {
-                    IDiskAtomFilter extraFilter = filter.getExtraFilter(rawRow.key, data);
-                    if (extraFilter != null)
-                    {
-                        ColumnFamily cf = filter.cfs.getColumnFamily(new QueryFilter(rawRow.key, name, extraFilter, filter.timestamp));
-                        if (cf != null)
-                            data.addAll(cf);
-                    }
-
-                    removeDroppedColumns(data);
-
-                    if (!filter.isSatisfiedBy(rawRow.key, data, null, null))
-                        continue;
-
-                    logger.trace("{} satisfies all filter expressions", data);
-                    // cut the resultset back to what was requested, if necessary
-                    data = filter.prune(rawRow.key, data);
-                }
-                else
-                {
-                    removeDroppedColumns(data);
-                }
-
-                rows.add(new Row(rawRow.key, data));
-                if (!ignoreTombstonedPartitions || !data.hasOnlyTombstones(filter.timestamp))
-                    matched++;
-
-                if (data != null)
-                    columnsCount += filter.lastCounted(data);
-                // Update the underlying filter to avoid querying more columns per slice than necessary and to handle paging
-                filter.updateFilter(columnsCount);
-            }
-
-            return rows;
-        }
-        finally
-        {
-            try
-            {
-                rowIterator.close();
-                Tracing.trace("Scanned {} rows and matched {}", total, matched);
-            }
-            catch (IOException e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
-    }
-
-    public CellNameType getComparator()
+    public ClusteringComparator getComparator()
     {
         return metadata.comparator;
     }
@@ -2429,13 +1643,10 @@
         final JSONArray filesJSONArr = new JSONArray();
         for (ColumnFamilyStore cfs : concatWithIndexes())
         {
-            try (RefViewFragment currentView = cfs.selectAndReference(CANONICAL_SSTABLES))
+            try (RefViewFragment currentView = cfs.selectAndReference(View.select(SSTableSet.CANONICAL, (x) -> predicate == null || predicate.apply(x))))
             {
                 for (SSTableReader ssTable : currentView.sstables)
                 {
-                    if (predicate != null && !predicate.apply(ssTable))
-                        continue;
-
                     File snapshotDirectory = Directories.getSnapshotDirectory(ssTable.descriptor, snapshotName);
                     ssTable.createLinks(snapshotDirectory.getPath()); // hard links
                     filesJSONArr.add(ssTable.descriptor.relativeFilenameFor(Component.DATA));
@@ -2447,6 +1658,10 @@
             }
         }
         writeSnapshotManifest(filesJSONArr, snapshotName);
+
+        if (!Schema.isLocalSystemKeyspace(metadata.ksName) && !Schema.isReplicatedSystemKeyspace(metadata.ksName))
+            writeSnapshotSchema(snapshotName);
+
         if (ephemeral)
             createEphemeralSnapshotMarkerFile(snapshotName);
         return snapshottedSSTables;
@@ -2454,7 +1669,7 @@
 
     private void writeSnapshotManifest(final JSONArray filesJSONArr, final String snapshotName)
     {
-        final File manifestFile = directories.getSnapshotManifestFile(snapshotName);
+        final File manifestFile = getDirectories().getSnapshotManifestFile(snapshotName);
 
         try
         {
@@ -2474,9 +1689,30 @@
         }
     }
 
+    private void writeSnapshotSchema(final String snapshotName)
+    {
+        final File schemaFile = getDirectories().getSnapshotSchemaFile(snapshotName);
+
+        try
+        {
+            if (!schemaFile.getParentFile().exists())
+                schemaFile.getParentFile().mkdirs();
+
+            try (PrintStream out = new PrintStream(schemaFile))
+            {
+                for (String s: ColumnFamilyStoreCQLHelper.dumpReCreateStatements(metadata))
+                    out.println(s);
+            }
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, schemaFile);
+        }
+    }
+
     private void createEphemeralSnapshotMarkerFile(final String snapshot)
     {
-        final File ephemeralSnapshotMarker = directories.getNewEphemeralSnapshotMarkerFile(snapshot);
+        final File ephemeralSnapshotMarker = getDirectories().getNewEphemeralSnapshotMarkerFile(snapshot);
 
         try
         {
@@ -2507,9 +1743,9 @@
     public Refs<SSTableReader> getSnapshotSSTableReader(String tag) throws IOException
     {
         Map<Integer, SSTableReader> active = new HashMap<>();
-        for (SSTableReader sstable : data.getView().sstables)
+        for (SSTableReader sstable : getSSTables(SSTableSet.CANONICAL))
             active.put(sstable.descriptor.generation, sstable);
-        Map<Descriptor, Set<Component>> snapshots = directories.sstableLister().snapshots(tag).list();
+        Map<Descriptor, Set<Component>> snapshots = getDirectories().sstableLister(Directories.OnTxnErr.IGNORE).snapshots(tag).list();
         Refs<SSTableReader> refs = new Refs<>();
         try
         {
@@ -2522,8 +1758,8 @@
                 {
                     if (logger.isTraceEnabled())
                         logger.trace("using snapshot sstable {}", entries.getKey());
-                    // open without tracking hotness
-                    sstable = SSTableReader.open(entries.getKey(), entries.getValue(), metadata, partitioner, true, false);
+                    // open offline so we don't modify components or track hotness.
+                    sstable = SSTableReader.open(entries.getKey(), entries.getValue(), metadata, true, true);
                     refs.tryRef(sstable);
                     // release the self ref as we never add the snapshot sstable to DataTracker where it is otherwise released
                     sstable.selfRef().release();
@@ -2566,12 +1802,12 @@
 
     public boolean snapshotExists(String snapshotName)
     {
-        return directories.snapshotExists(snapshotName);
+        return getDirectories().snapshotExists(snapshotName);
     }
 
     public long getSnapshotCreationTime(String snapshotName)
     {
-        return directories.snapshotCreationTime(snapshotName);
+        return getDirectories().snapshotCreationTime(snapshotName);
     }
 
     /**
@@ -2582,7 +1818,7 @@
      */
     public void clearSnapshot(String snapshotName)
     {
-        List<File> snapshotDirs = directories.getCFDirectories();
+        List<File> snapshotDirs = getDirectories().getCFDirectories();
         Directories.clearSnapshot(snapshotName, snapshotDirs);
     }
     /**
@@ -2592,29 +1828,23 @@
      */
     public Map<String, Pair<Long,Long>> getSnapshotDetails()
     {
-        return directories.getSnapshotDetails();
-    }
-
-    public boolean hasUnreclaimedSpace()
-    {
-        return metric.liveDiskSpaceUsed.getCount() < metric.totalDiskSpaceUsed.getCount();
+        return getDirectories().getSnapshotDetails();
     }
 
     /**
-     * @return the cached row for @param key if it is already present in the cache.
-     * That is, unlike getThroughCache, it will not readAndCache the row if it is not present, nor
+     * @return the cached partition for @param key if it is already present in the cache.
+     * Not that this will not readAndCache the parition if it is not present, nor
      * are these calls counted in cache statistics.
      *
-     * Note that this WILL cause deserialization of a SerializingCache row, so if all you
-     * need to know is whether a row is present or not, use containsCachedRow instead.
+     * Note that this WILL cause deserialization of a SerializingCache partition, so if all you
+     * need to know is whether a partition is present or not, use containsCachedParition instead.
      */
-    public ColumnFamily getRawCachedRow(DecoratedKey key)
+    public CachedPartition getRawCachedPartition(DecoratedKey key)
     {
         if (!isRowCacheEnabled())
             return null;
-
         IRowCacheEntry cached = CacheService.instance.rowCache.getInternal(new RowCacheKey(metadata.ksAndCFName, key));
-        return cached == null || cached instanceof RowCacheSentinel ? null : (ColumnFamily)cached;
+        return cached == null || cached instanceof RowCacheSentinel ? null : (CachedPartition)cached;
     }
 
     private void invalidateCaches()
@@ -2632,14 +1862,13 @@
              keyIter.hasNext(); )
         {
             RowCacheKey key = keyIter.next();
-            DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.key));
+            DecoratedKey dk = decorateKey(ByteBuffer.wrap(key.key));
             if (key.ksAndCFName.equals(metadata.ksAndCFName) && Bounds.isInBounds(dk.getToken(), boundsToInvalidate))
             {
-                invalidateCachedRow(dk);
+                invalidateCachedPartition(dk);
                 invalidatedKeys++;
             }
         }
-
         return invalidatedKeys;
     }
 
@@ -2650,7 +1879,7 @@
              keyIter.hasNext(); )
         {
             CounterCacheKey key = keyIter.next();
-            DecoratedKey dk = partitioner.decorateKey(ByteBuffer.wrap(key.partitionKey));
+            DecoratedKey dk = decorateKey(ByteBuffer.wrap(key.partitionKey));
             if (key.ksAndCFName.equals(metadata.ksAndCFName) && Bounds.isInBounds(dk.getToken(), boundsToInvalidate))
             {
                 CacheService.instance.counterCache.remove(key);
@@ -2663,37 +1892,36 @@
     /**
      * @return true if @param key is contained in the row cache
      */
-    public boolean containsCachedRow(DecoratedKey key)
+    public boolean containsCachedParition(DecoratedKey key)
     {
         return CacheService.instance.rowCache.getCapacity() != 0 && CacheService.instance.rowCache.containsKey(new RowCacheKey(metadata.ksAndCFName, key));
     }
 
-    public void invalidateCachedRow(RowCacheKey key)
+    public void invalidateCachedPartition(RowCacheKey key)
     {
         CacheService.instance.rowCache.remove(key);
     }
 
-    public void invalidateCachedRow(DecoratedKey key)
+    public void invalidateCachedPartition(DecoratedKey key)
     {
-        UUID cfId = Schema.instance.getId(keyspace.getName(), this.name);
-        if (cfId == null)
-            return; // secondary index
+        if (!Schema.instance.hasCF(metadata.ksAndCFName))
+            return; //2i don't cache rows
 
-        invalidateCachedRow(new RowCacheKey(metadata.ksAndCFName, key));
+        invalidateCachedPartition(new RowCacheKey(metadata.ksAndCFName, key));
     }
 
-    public ClockAndCount getCachedCounter(ByteBuffer partitionKey, CellName cellName)
+    public ClockAndCount getCachedCounter(ByteBuffer partitionKey, Clustering clustering, ColumnDefinition column, CellPath path)
     {
         if (CacheService.instance.counterCache.getCapacity() == 0L) // counter cache disabled.
             return null;
-        return CacheService.instance.counterCache.get(CounterCacheKey.create(metadata.ksAndCFName, partitionKey, cellName));
+        return CacheService.instance.counterCache.get(CounterCacheKey.create(metadata.ksAndCFName, partitionKey, clustering, column, path));
     }
 
-    public void putCachedCounter(ByteBuffer partitionKey, CellName cellName, ClockAndCount clockAndCount)
+    public void putCachedCounter(ByteBuffer partitionKey, Clustering clustering, ColumnDefinition column, CellPath path, ClockAndCount clockAndCount)
     {
         if (CacheService.instance.counterCache.getCapacity() == 0L) // counter cache disabled.
             return;
-        CacheService.instance.counterCache.put(CounterCacheKey.create(metadata.ksAndCFName, partitionKey, cellName), clockAndCount);
+        CacheService.instance.counterCache.put(CounterCacheKey.create(metadata.ksAndCFName, partitionKey, clustering, column, path), clockAndCount);
     }
 
     public void forceMajorCompaction() throws InterruptedException, ExecutionException
@@ -2719,7 +1947,7 @@
 
     public Iterable<DecoratedKey> keySamples(Range<Token> range)
     {
-        try (RefViewFragment view = selectAndReference(CANONICAL_SSTABLES))
+        try (RefViewFragment view = selectAndReference(View.selectFunction(SSTableSet.CANONICAL)))
         {
             Iterable<DecoratedKey>[] samples = new Iterable[view.sstables.size()];
             int i = 0;
@@ -2733,7 +1961,7 @@
 
     public long estimatedKeysForRange(Range<Token> range)
     {
-        try (RefViewFragment view = selectAndReference(CANONICAL_SSTABLES))
+        try (RefViewFragment view = selectAndReference(View.selectFunction(SSTableSet.CANONICAL)))
         {
             long count = 0;
             for (SSTableReader sstable : view.sstables)
@@ -2757,11 +1985,9 @@
                 public Void call()
                 {
                     cfs.data.reset(new Memtable(new AtomicReference<>(ReplayPosition.NONE), cfs));
-                    cfs.getCompactionStrategy().shutdown();
-                    cfs.getCompactionStrategy().startup();
                     return null;
                 }
-            }, true);
+            }, true, false);
         }
     }
 
@@ -2770,18 +1996,6 @@
      */
     public void truncateBlocking()
     {
-        truncateBlocking(DatabaseDescriptor.isAutoSnapshot());
-    }
-
-    /**
-     * Truncate deletes the column family's data with no expensive tombstone creation,
-     * optionally snapshotting the data.
-     *
-     * @param takeSnapshot whether or not to take a snapshot <code>true</code> if snapshot should be taken,
-     *                     <code>false</code> otherwise
-     */
-    public void truncateBlocking(final boolean takeSnapshot)
-    {
         // We have two goals here:
         // - truncate should delete everything written before truncate was invoked
         // - but not delete anything that isn't part of the snapshot we create.
@@ -2799,27 +2013,29 @@
         final long truncatedAt;
         final ReplayPosition replayAfter;
 
-        if (keyspace.getMetadata().durableWrites || takeSnapshot)
+        if (keyspace.getMetadata().params.durableWrites || DatabaseDescriptor.isAutoSnapshot())
         {
             replayAfter = forceBlockingFlush();
+            viewManager.forceBlockingFlush();
         }
         else
         {
             // just nuke the memtable data w/o writing to disk first
-            Future<ReplayPosition> replayAfterFuture;
-            synchronized (data)
+            viewManager.dumpMemtables();
+            try
             {
-                final Flush flush = new Flush(true);
-                flushExecutor.execute(flush);
-                replayAfterFuture = postFlushExecutor.submit(flush.postFlush);
+                replayAfter = dumpMemtable().get();
             }
-            replayAfter = FBUtilities.waitOnFuture(replayAfterFuture);
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
         }
 
         long now = System.currentTimeMillis();
         // make sure none of our sstables are somehow in the future (clock drift, perhaps)
         for (ColumnFamilyStore cfs : concatWithIndexes())
-            for (SSTableReader sstable : cfs.data.getSSTables())
+            for (SSTableReader sstable : cfs.getLiveSSTables())
                 now = Math.max(now, sstable.maxDataAge);
         truncatedAt = now;
 
@@ -2830,13 +2046,13 @@
                 logger.debug("Discarding sstable data for truncated CF + indexes");
                 data.notifyTruncated(truncatedAt);
 
-                if (takeSnapshot)
+                if (DatabaseDescriptor.isAutoSnapshot())
                     snapshot(Keyspace.getTimestampedSnapshotName(name));
 
                 discardSSTables(truncatedAt);
 
-                for (SecondaryIndex index : indexManager.getIndexes())
-                    index.truncateBlocking(truncatedAt);
+                indexManager.truncateAllIndexesBlocking(truncatedAt);
+                viewManager.truncateBlocking(replayAfter, truncatedAt);
 
                 SystemKeyspace.saveTruncationRecord(ColumnFamilyStore.this, truncatedAt, replayAfter);
                 logger.trace("cleaning out row cache");
@@ -2844,11 +2060,24 @@
             }
         };
 
-        runWithCompactionsDisabled(Executors.callable(truncateRunnable), true);
+        runWithCompactionsDisabled(Executors.callable(truncateRunnable), true, true);
         logger.trace("truncate complete");
     }
 
-    public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation)
+    /**
+     * Drops current memtable without flushing to disk. This should only be called when truncating a column family which is not durable.
+     */
+    public Future<ReplayPosition> dumpMemtable()
+    {
+        synchronized (data)
+        {
+            final Flush flush = new Flush(true);
+            flushExecutor.execute(flush);
+            return postFlushExecutor.submit(flush.postFlush);
+        }
+    }
+
+    public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation, boolean interruptViews)
     {
         // synchronize so that concurrent invocations don't re-enable compactions partway through unexpectedly,
         // and so we only run one major compaction at a time
@@ -2856,17 +2085,19 @@
         {
             logger.trace("Cancelling in-progress compactions for {}", metadata.cfName);
 
-            Iterable<ColumnFamilyStore> selfWithIndexes = concatWithIndexes();
-            for (ColumnFamilyStore cfs : selfWithIndexes)
-                cfs.getCompactionStrategy().pause();
-            try
+            Iterable<ColumnFamilyStore> selfWithAuxiliaryCfs = interruptViews
+                                                               ? Iterables.concat(concatWithIndexes(), viewManager.allViewsCfs())
+                                                               : concatWithIndexes();
+
+            try (CompactionManager.CompactionPauser pause = CompactionManager.instance.pauseGlobalCompaction();
+                 CompactionManager.CompactionPauser pausedStrategies = pauseCompactionStrategies(selfWithAuxiliaryCfs))
             {
                 // interrupt in-progress compactions
-                CompactionManager.instance.interruptCompactionForCFs(selfWithIndexes, interruptValidation);
-                CompactionManager.instance.waitForCessation(selfWithIndexes);
+                CompactionManager.instance.interruptCompactionForCFs(selfWithAuxiliaryCfs, interruptValidation);
+                CompactionManager.instance.waitForCessation(selfWithAuxiliaryCfs);
 
                 // doublecheck that we finished, instead of timing out
-                for (ColumnFamilyStore cfs : selfWithIndexes)
+                for (ColumnFamilyStore cfs : selfWithAuxiliaryCfs)
                 {
                     if (!cfs.getTracker().getCompacting().isEmpty())
                     {
@@ -2886,12 +2117,43 @@
                     throw new RuntimeException(e);
                 }
             }
-            finally
+        }
+    }
+
+    private static CompactionManager.CompactionPauser pauseCompactionStrategies(Iterable<ColumnFamilyStore> toPause)
+    {
+        ArrayList<ColumnFamilyStore> successfullyPaused = new ArrayList<>();
+        try
+        {
+            for (ColumnFamilyStore cfs : toPause)
             {
-                for (ColumnFamilyStore cfs : selfWithIndexes)
-                    cfs.getCompactionStrategy().resume();
+                successfullyPaused.ensureCapacity(successfullyPaused.size() + 1); // to avoid OOM:ing after pausing the strategies
+                cfs.getCompactionStrategyManager().pause();
+                successfullyPaused.add(cfs);
+            }
+            return () -> maybeFail(resumeAll(null, toPause));
+        }
+        catch (Throwable t)
+        {
+            resumeAll(t, successfullyPaused);
+            throw t;
+        }
+    }
+
+    private static Throwable resumeAll(Throwable accumulate, Iterable<ColumnFamilyStore> cfss)
+    {
+        for (ColumnFamilyStore cfs : cfss)
+        {
+            try
+            {
+                cfs.getCompactionStrategyManager().resume();
+            }
+            catch (Throwable t)
+            {
+                accumulate = merge(accumulate, t);
             }
         }
+        return accumulate;
     }
 
     public LifecycleTransaction markAllCompacting(final OperationType operationType)
@@ -2901,16 +2163,15 @@
             public LifecycleTransaction call() throws Exception
             {
                 assert data.getCompacting().isEmpty() : data.getCompacting();
-                Iterable<SSTableReader> sstables = getPermittedToCompactSSTables();
+                Iterable<SSTableReader> sstables = getLiveSSTables();
                 sstables = AbstractCompactionStrategy.filterSuspectSSTables(sstables);
-                sstables = ImmutableList.copyOf(sstables);
                 LifecycleTransaction modifier = data.tryModify(sstables, operationType);
                 assert modifier != null: "something marked things compacting while compactions are disabled";
                 return modifier;
             }
         };
 
-        return runWithCompactionsDisabled(callable, false);
+        return runWithCompactionsDisabled(callable, false, false);
     }
 
 
@@ -2927,7 +2188,7 @@
     {
         // we don't use CompactionStrategy.pause since we don't want users flipping that on and off
         // during runWithCompactionsDisabled
-        this.compactionStrategyWrapper.disable();
+        compactionStrategyManager.disable();
     }
 
     public void enableAutoCompaction()
@@ -2942,7 +2203,7 @@
     @VisibleForTesting
     public void enableAutoCompaction(boolean waitForFutures)
     {
-        this.compactionStrategyWrapper.enable();
+        compactionStrategyManager.enable();
         List<Future<?>> futures = CompactionManager.instance.submitBackground(this);
         if (waitForFutures)
             FBUtilities.waitOnFutures(futures);
@@ -2950,7 +2211,7 @@
 
     public boolean isAutoCompactionDisabled()
     {
-        return !this.compactionStrategyWrapper.isEnabled();
+        return !this.compactionStrategyManager.isEnabled();
     }
 
     /*
@@ -2962,9 +2223,33 @@
        - get/set memtime
      */
 
-    public AbstractCompactionStrategy getCompactionStrategy()
+    public CompactionStrategyManager getCompactionStrategyManager()
     {
-        return compactionStrategyWrapper;
+        return compactionStrategyManager;
+    }
+
+    public void setCrcCheckChance(double crcCheckChance)
+    {
+        try
+        {
+            TableParams.builder().crcCheckChance(crcCheckChance).build().validate();
+            for (ColumnFamilyStore cfs : concatWithIndexes())
+            {
+                cfs.crcCheckChance.set(crcCheckChance);
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                    sstable.setCrcCheckChance(crcCheckChance);
+            }
+        }
+        catch (ConfigurationException e)
+        {
+            throw new IllegalArgumentException(e.getMessage());
+        }
+    }
+
+
+    public Double getCrcCheckChance()
+    {
+        return crcCheckChance.value();
     }
 
     public void setCompactionThresholds(int minThreshold, int maxThreshold)
@@ -3015,7 +2300,7 @@
     {
         long sum = 0;
         long count = 0;
-        for (SSTableReader sstable : getSSTables())
+        for (SSTableReader sstable : getSSTables(SSTableSet.CANONICAL))
         {
             long n = sstable.getEstimatedColumnCount().count();
             sum += sstable.getEstimatedColumnCount().mean() * n;
@@ -3024,40 +2309,63 @@
         return count > 0 ? (int) (sum / count) : 0;
     }
 
+    public double getMeanPartitionSize()
+    {
+        long sum = 0;
+        long count = 0;
+        for (SSTableReader sstable : getSSTables(SSTableSet.CANONICAL))
+        {
+            long n = sstable.getEstimatedPartitionSize().count();
+            sum += sstable.getEstimatedPartitionSize().mean() * n;
+            count += n;
+        }
+        return count > 0 ? sum * 1.0 / count : 0;
+    }
+
     public long estimateKeys()
     {
         long n = 0;
-        for (SSTableReader sstable : getSSTables())
+        for (SSTableReader sstable : getSSTables(SSTableSet.CANONICAL))
             n += sstable.estimatedKeys();
         return n;
     }
 
+    public IPartitioner getPartitioner()
+    {
+        return metadata.partitioner;
+    }
+
+    public DecoratedKey decorateKey(ByteBuffer key)
+    {
+        return metadata.decorateKey(key);
+    }
+
     /** true if this CFS contains secondary index data */
     public boolean isIndex()
     {
-        return partitioner instanceof LocalPartitioner;
+        return metadata.isIndex();
     }
 
     public Iterable<ColumnFamilyStore> concatWithIndexes()
     {
         // we return the main CFS first, which we rely on for simplicity in switchMemtable(), for getting the
         // latest replay position
-        return Iterables.concat(Collections.singleton(this), indexManager.getIndexesBackedByCfs());
+        return Iterables.concat(Collections.singleton(this), indexManager.getAllIndexColumnFamilyStores());
     }
 
     public List<String> getBuiltIndexes()
     {
-       return indexManager.getBuiltIndexes();
+       return indexManager.getBuiltIndexNames();
     }
 
     public int getUnleveledSSTables()
     {
-        return this.compactionStrategyWrapper.getUnleveledSSTables();
+        return this.compactionStrategyManager.getUnleveledSSTables();
     }
 
     public int[] getSSTableCountPerLevel()
     {
-        return compactionStrategyWrapper.getSSTableCountPerLevel();
+        return compactionStrategyManager.getSSTableCountPerLevel();
     }
 
     public static class ViewFragment
@@ -3095,13 +2403,15 @@
 
     public boolean isEmpty()
     {
-        View view = data.getView();
-        return view.sstables.isEmpty() && view.getCurrentMemtable().getOperations() == 0 && view.liveMemtables.size() <= 1 && view.flushingMemtables.size() == 0;
+        return data.getView().isEmpty();
     }
 
     public boolean isRowCacheEnabled()
     {
-        return metadata.getCaching().rowCache.isEnabled() && CacheService.instance.rowCache.getCapacity() > 0;
+
+        boolean retval = metadata.params.caching.cacheRows() && CacheService.instance.rowCache.getCapacity() > 0;
+        assert(!retval || !isIndex());
+        return retval;
     }
 
     public boolean isCounterCacheEnabled()
@@ -3111,7 +2421,7 @@
 
     public boolean isKeyCacheEnabled()
     {
-        return metadata.getCaching().keyCache.isEnabled() && CacheService.instance.keyCache.getCapacity() > 0;
+        return metadata.params.caching.cacheKeys() && CacheService.instance.keyCache.getCapacity() > 0;
     }
 
     /**
@@ -3128,7 +2438,7 @@
 
         List<SSTableReader> truncatedSSTables = new ArrayList<>();
 
-        for (SSTableReader sstable : getSSTables())
+        for (SSTableReader sstable : getSSTables(SSTableSet.LIVE))
         {
             if (!sstable.newSince(truncatedAt))
                 truncatedSSTables.add(sstable);
@@ -3144,9 +2454,9 @@
         long allColumns = 0;
         int localTime = (int)(System.currentTimeMillis()/1000);
 
-        for (SSTableReader sstable : getSSTables())
+        for (SSTableReader sstable : getSSTables(SSTableSet.CANONICAL))
         {
-            allDroppable += sstable.getDroppableTombstonesBefore(localTime - sstable.metadata.getGcGraceSeconds());
+            allDroppable += sstable.getDroppableTombstonesBefore(localTime - sstable.metadata.params.gcGraceSeconds);
             allColumns += sstable.getEstimatedColumnCount().mean() * sstable.getEstimatedColumnCount().count();
         }
         return allColumns > 0 ? allDroppable / allColumns : 0;
@@ -3154,7 +2464,7 @@
 
     public long trueSnapshotsSize()
     {
-        return directories.trueSnapshotsSize();
+        return getDirectories().trueSnapshotsSize();
     }
 
     @VisibleForTesting
@@ -3163,38 +2473,6 @@
         fileIndexGenerator.set(0);
     }
 
-    // returns the "canonical" version of any current sstable, i.e. if an sstable is being replaced and is only partially
-    // visible to reads, this sstable will be returned as its original entirety, and its replacement will not be returned
-    // (even if it completely replaces it)
-    public static final Function<View, List<SSTableReader>> CANONICAL_SSTABLES = new Function<View, List<SSTableReader>>()
-    {
-        public List<SSTableReader> apply(View view)
-        {
-            List<SSTableReader> sstables = new ArrayList<>();
-            for (SSTableReader sstable : view.compacting)
-                if (sstable.openReason != SSTableReader.OpenReason.EARLY)
-                    sstables.add(sstable);
-            for (SSTableReader sstable : view.sstables)
-                if (!view.compacting.contains(sstable) && sstable.openReason != SSTableReader.OpenReason.EARLY)
-                    sstables.add(sstable);
-            return sstables;
-        }
-    };
-
-    public static final Function<View, List<SSTableReader>> UNREPAIRED_SSTABLES = new Function<View, List<SSTableReader>>()
-    {
-        public List<SSTableReader> apply(View view)
-        {
-            List<SSTableReader> sstables = new ArrayList<>();
-            for (SSTableReader sstable : CANONICAL_SSTABLES.apply(view))
-            {
-                if (!sstable.isRepaired())
-                    sstables.add(sstable);
-            }
-            return sstables;
-        }
-    };
-
     /**
      * Returns a ColumnFamilyStore by cfId if it exists, null otherwise
      * Differently from others, this method does not throw exception if the table does not exist.
@@ -3231,4 +2509,9 @@
 
         return keyspace.getColumnFamilyStore(id);
     }
+
+    public static TableMetrics metricsFor(UUID tableId)
+    {
+        return getIfExists(tableId).metric;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStoreCQLHelper.java b/src/java/org/apache/cassandra/db/ColumnFamilyStoreCQLHelper.java
new file mode 100644
index 0000000..5dc9324
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStoreCQLHelper.java

@@ -0,0 +1,442 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.atomic.*;
+import java.util.function.*;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.statements.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.schema.*;
+import org.apache.cassandra.utils.*;
+
+/**
+ * Helper methods to represent CFMetadata and related objects in CQL format
+ */
+public class ColumnFamilyStoreCQLHelper
+{
+    public static List<String> dumpReCreateStatements(CFMetaData metadata)
+    {
+        List<String> l = new ArrayList<>();
+        // Types come first, as table can't be created without them
+        l.addAll(ColumnFamilyStoreCQLHelper.getUserTypesAsCQL(metadata));
+        // Record re-create schema statements
+        l.add(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(metadata, true));
+        // Dropped columns (and re-additions)
+        l.addAll(ColumnFamilyStoreCQLHelper.getDroppedColumnsAsCQL(metadata));
+        // Indexes applied as last, since otherwise they may interfere with column drops / re-additions
+        l.addAll(ColumnFamilyStoreCQLHelper.getIndexesAsCQL(metadata));
+        return l;
+    }
+
+    private static List<ColumnDefinition> getClusteringColumns(CFMetaData metadata)
+    {
+        List<ColumnDefinition> cds = new ArrayList<>(metadata.clusteringColumns().size());
+
+        if (!metadata.isStaticCompactTable())
+            for (ColumnDefinition cd : metadata.clusteringColumns())
+                cds.add(cd);
+
+        return cds;
+    }
+
+    private static List<ColumnDefinition> getPartitionColumns(CFMetaData metadata)
+    {
+        List<ColumnDefinition> cds = new ArrayList<>(metadata.partitionColumns().size());
+
+        for (ColumnDefinition cd : metadata.partitionColumns().statics)
+            cds.add(cd);
+
+        if (metadata.isDense())
+        {
+            // remove an empty type
+            for (ColumnDefinition cd : metadata.partitionColumns().withoutStatics())
+                if (!cd.type.equals(EmptyType.instance))
+                    cds.add(cd);
+        }
+        // "regular" columns are not exposed for static compact tables
+        else if (!metadata.isStaticCompactTable())
+        {
+            for (ColumnDefinition cd : metadata.partitionColumns().withoutStatics())
+                cds.add(cd);
+        }
+
+        return cds;
+    }
+
+    /**
+     * Build a CQL String representation of Column Family Metadata
+     */
+    @VisibleForTesting
+    public static String getCFMetadataAsCQL(CFMetaData metadata, boolean includeDroppedColumns)
+    {
+        StringBuilder sb = new StringBuilder();
+        if (!isCqlCompatible(metadata))
+        {
+            sb.append(String.format("/*\nWarning: Table %s.%s omitted because it has constructs not compatible with CQL (was created via legacy API).\n",
+                                    metadata.ksName,
+                                    metadata.cfName));
+            sb.append("\nApproximate structure, for reference:");
+            sb.append("\n(this should not be used to reproduce this schema)\n\n");
+        }
+
+        sb.append("CREATE TABLE IF NOT EXISTS ");
+        sb.append(quoteIdentifier(metadata.ksName)).append('.').append(quoteIdentifier(metadata.cfName)).append(" (");
+
+        List<ColumnDefinition> partitionKeyColumns = metadata.partitionKeyColumns();
+        List<ColumnDefinition> clusteringColumns = getClusteringColumns(metadata);
+        List<ColumnDefinition> partitionColumns = getPartitionColumns(metadata);
+
+        Consumer<StringBuilder> cdCommaAppender = commaAppender("\n\t");
+        sb.append("\n\t");
+        for (ColumnDefinition cfd: partitionKeyColumns)
+        {
+            cdCommaAppender.accept(sb);
+            sb.append(toCQL(cfd));
+            if (partitionKeyColumns.size() == 1 && clusteringColumns.size() == 0)
+                sb.append(" PRIMARY KEY");
+        }
+
+        for (ColumnDefinition cfd: clusteringColumns)
+        {
+            cdCommaAppender.accept(sb);
+            sb.append(toCQL(cfd));
+        }
+
+        for (ColumnDefinition cfd: partitionColumns)
+        {
+            cdCommaAppender.accept(sb);
+            sb.append(toCQL(cfd, metadata.isStaticCompactTable()));
+        }
+
+        if (includeDroppedColumns)
+        {
+            for (Map.Entry<ByteBuffer, CFMetaData.DroppedColumn> entry: metadata.getDroppedColumns().entrySet())
+            {
+                if (metadata.getColumnDefinition(entry.getKey()) != null)
+                    continue;
+
+                CFMetaData.DroppedColumn droppedColumn = entry.getValue();
+                cdCommaAppender.accept(sb);
+                sb.append(quoteIdentifier(droppedColumn.name));
+                sb.append(' ');
+                sb.append(droppedColumn.type.asCQL3Type().toString());
+            }
+        }
+
+        if (clusteringColumns.size() > 0 || partitionKeyColumns.size() > 1)
+        {
+            sb.append(",\n\tPRIMARY KEY (");
+            if (partitionKeyColumns.size() > 1)
+            {
+                sb.append("(");
+                Consumer<StringBuilder> pkCommaAppender = commaAppender(" ");
+                for (ColumnDefinition cfd : partitionKeyColumns)
+                {
+                    pkCommaAppender.accept(sb);
+                    sb.append(quoteIdentifier(cfd.name.toString()));
+                }
+                sb.append(")");
+            }
+            else
+            {
+                sb.append(quoteIdentifier(partitionKeyColumns.get(0).name.toString()));
+            }
+
+            for (ColumnDefinition cfd : metadata.clusteringColumns())
+                sb.append(", ").append(quoteIdentifier(cfd.name.toString()));
+
+            sb.append(')');
+        }
+        sb.append(")\n\t");
+        sb.append("WITH ");
+
+        sb.append("ID = ").append(metadata.cfId).append("\n\tAND ");
+
+        if (metadata.isCompactTable())
+            sb.append("COMPACT STORAGE\n\tAND ");
+
+        if (clusteringColumns.size() > 0)
+        {
+            sb.append("CLUSTERING ORDER BY (");
+
+            Consumer<StringBuilder> cOrderCommaAppender = commaAppender(" ");
+            for (ColumnDefinition cd : clusteringColumns)
+            {
+                cOrderCommaAppender.accept(sb);
+                sb.append(quoteIdentifier(cd.name.toString())).append(' ').append(cd.clusteringOrder().toString());
+            }
+            sb.append(")\n\tAND ");
+        }
+
+        sb.append(toCQL(metadata.params));
+        sb.append(";");
+
+        if (!isCqlCompatible(metadata))
+        {
+            sb.append("\n*/");
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Build a CQL String representation of User Types used in the given Column Family.
+     *
+     * Type order is ensured as types are built incrementally: from the innermost (most nested)
+     * to the outermost.
+     */
+    @VisibleForTesting
+    public static List<String> getUserTypesAsCQL(CFMetaData metadata)
+    {
+        List<UserType> types = new ArrayList<>();
+        Set<UserType> typeSet = new HashSet<>();
+        for (ColumnDefinition cd: Iterables.concat(metadata.partitionKeyColumns(), metadata.clusteringColumns(), metadata.partitionColumns()))
+        {
+            AbstractType type = cd.type;
+            if (type instanceof UserType)
+                resolveUserType((UserType) type, typeSet, types);
+        }
+
+        List<String> typeStrings = new ArrayList<>();
+        for (UserType type: types)
+            typeStrings.add(toCQL(type));
+        return typeStrings;
+    }
+
+    /**
+     * Build a CQL String representation of Dropped Columns in the given Column Family.
+     *
+     * If the column was dropped once, but is now re-created `ADD` will be appended accordingly.
+     */
+    @VisibleForTesting
+    public static List<String> getDroppedColumnsAsCQL(CFMetaData metadata)
+    {
+        List<String> droppedColumns = new ArrayList<>();
+
+        for (Map.Entry<ByteBuffer, CFMetaData.DroppedColumn> entry: metadata.getDroppedColumns().entrySet())
+        {
+            CFMetaData.DroppedColumn column = entry.getValue();
+            droppedColumns.add(toCQLDrop(metadata.ksName, metadata.cfName, column));
+            if (metadata.getColumnDefinition(entry.getKey()) != null)
+                droppedColumns.add(toCQLAdd(metadata.ksName, metadata.cfName, metadata.getColumnDefinition(entry.getKey())));
+        }
+
+        return droppedColumns;
+    }
+
+    /**
+     * Build a CQL String representation of Indexes on columns in the given Column Family
+     */
+    @VisibleForTesting
+    public static List<String> getIndexesAsCQL(CFMetaData metadata)
+    {
+        List<String> indexes = new ArrayList<>();
+        for (IndexMetadata indexMetadata: metadata.getIndexes())
+            indexes.add(toCQL(metadata.ksName, metadata.cfName, indexMetadata));
+        return indexes;
+    }
+
+    private static String toCQL(String keyspace, String cf, IndexMetadata indexMetadata)
+    {
+        if (indexMetadata.isCustom())
+        {
+            Map<String, String> options = new HashMap<>();
+            indexMetadata.options.forEach((k, v) -> {
+                if (!k.equals(IndexTarget.TARGET_OPTION_NAME) && !k.equals(IndexTarget.CUSTOM_INDEX_OPTION_NAME))
+                    options.put(k, v);
+            });
+
+            return String.format("CREATE CUSTOM INDEX %s ON %s.%s (%s) USING '%s'%s;",
+                                 quoteIdentifier(indexMetadata.name),
+                                 quoteIdentifier(keyspace),
+                                 quoteIdentifier(cf),
+                                 indexMetadata.options.get(IndexTarget.TARGET_OPTION_NAME),
+                                 indexMetadata.options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME),
+                                 options.isEmpty() ? "" : " WITH OPTIONS " + toCQL(options));
+        }
+        else
+        {
+            return String.format("CREATE INDEX %s ON %s.%s (%s);",
+                                 quoteIdentifier(indexMetadata.name),
+                                 quoteIdentifier(keyspace),
+                                 quoteIdentifier(cf),
+                                 indexMetadata.options.get(IndexTarget.TARGET_OPTION_NAME));
+        }
+    }
+    private static String toCQL(UserType userType)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append(String.format("CREATE TYPE %s.%s(",
+                                quoteIdentifier(userType.keyspace),
+                                quoteIdentifier(userType.getNameAsString())));
+
+        Consumer<StringBuilder> commaAppender = commaAppender(" ");
+        for (int i = 0; i < userType.size(); i++)
+        {
+            commaAppender.accept(sb);
+            sb.append(String.format("%s %s",
+                                    userType.fieldNameAsString(i),
+                                    userType.fieldType(i).asCQL3Type()));
+        }
+        sb.append(");");
+        return sb.toString();
+    }
+
+    private static String toCQL(TableParams tableParams)
+    {
+        StringBuilder builder = new StringBuilder();
+
+        builder.append("bloom_filter_fp_chance = ").append(tableParams.bloomFilterFpChance);
+        builder.append("\n\tAND dclocal_read_repair_chance = ").append(tableParams.dcLocalReadRepairChance);
+        builder.append("\n\tAND crc_check_chance = ").append(tableParams.crcCheckChance);
+        builder.append("\n\tAND default_time_to_live = ").append(tableParams.defaultTimeToLive);
+        builder.append("\n\tAND gc_grace_seconds = ").append(tableParams.gcGraceSeconds);
+        builder.append("\n\tAND min_index_interval = ").append(tableParams.minIndexInterval);
+        builder.append("\n\tAND max_index_interval = ").append(tableParams.maxIndexInterval);
+        builder.append("\n\tAND memtable_flush_period_in_ms = ").append(tableParams.memtableFlushPeriodInMs);
+        builder.append("\n\tAND read_repair_chance = ").append(tableParams.readRepairChance);
+        builder.append("\n\tAND speculative_retry = '").append(tableParams.speculativeRetry).append("'");
+        builder.append("\n\tAND comment = ").append(singleQuote(tableParams.comment));
+        builder.append("\n\tAND caching = ").append(toCQL(tableParams.caching.asMap()));
+        builder.append("\n\tAND compaction = ").append(toCQL(tableParams.compaction.asMap()));
+        builder.append("\n\tAND compression = ").append(toCQL(tableParams.compression.asMap()));
+
+        builder.append("\n\tAND extensions = { ");
+        for (Map.Entry<String, ByteBuffer> entry : tableParams.extensions.entrySet())
+        {
+            builder.append(singleQuote(entry.getKey()));
+            builder.append(": ");
+            builder.append("0x" + ByteBufferUtil.bytesToHex(entry.getValue()));
+        }
+        builder.append(" }");
+        return builder.toString();
+    }
+
+    private static String toCQL(Map<?, ?> map)
+    {
+        StringBuilder builder = new StringBuilder("{ ");
+
+        boolean isFirst = true;
+        for (Map.Entry entry: map.entrySet())
+        {
+            if (isFirst)
+                isFirst = false;
+            else
+                builder.append(", ");
+            builder.append(singleQuote(entry.getKey().toString()));
+            builder.append(": ");
+            builder.append(singleQuote(entry.getValue().toString()));
+        }
+
+        builder.append(" }");
+        return builder.toString();
+    }
+
+    private static String toCQL(ColumnDefinition cd)
+    {
+        return toCQL(cd, false);
+    }
+
+    private static String toCQL(ColumnDefinition cd, boolean isStaticCompactTable)
+    {
+        return String.format("%s %s%s",
+                             quoteIdentifier(cd.name.toString()),
+                             cd.type.asCQL3Type().toString(),
+                             cd.isStatic() && !isStaticCompactTable ? " static" : "");
+    }
+
+    private static String toCQLAdd(String keyspace, String cf, ColumnDefinition cd)
+    {
+        return String.format("ALTER TABLE %s.%s ADD %s %s%s;",
+                             quoteIdentifier(keyspace),
+                             quoteIdentifier(cf),
+                             quoteIdentifier(cd.name.toString()),
+                             cd.type.asCQL3Type().toString(),
+                             cd.isStatic() ? " static" : "");
+    }
+
+    private static String toCQLDrop(String keyspace, String cf, CFMetaData.DroppedColumn droppedColumn)
+    {
+        return String.format("ALTER TABLE %s.%s DROP %s USING TIMESTAMP %s;",
+                             quoteIdentifier(keyspace),
+                             quoteIdentifier(cf),
+                             quoteIdentifier(droppedColumn.name),
+                             droppedColumn.droppedTime);
+    }
+
+    private static void resolveUserType(UserType type, Set<UserType> typeSet, List<UserType> types)
+    {
+        for (AbstractType subType: type.fieldTypes())
+            if (!typeSet.contains(subType) && subType instanceof UserType)
+                resolveUserType((UserType) subType, typeSet, types);
+
+        if (!typeSet.contains(type))
+        {
+            UserType t = type;
+            typeSet.add(t);
+            types.add(t);
+        }
+    }
+
+    private static String singleQuote(String s)
+    {
+        return String.format("'%s'", s.replaceAll("'", "''"));
+    }
+
+    private static Consumer<StringBuilder> commaAppender(String afterComma)
+    {
+        AtomicBoolean isFirst = new AtomicBoolean(true);
+        return new Consumer<StringBuilder>()
+        {
+            public void accept(StringBuilder stringBuilder)
+            {
+                if (!isFirst.getAndSet(false))
+                    stringBuilder.append(',').append(afterComma);
+            }
+        };
+    }
+
+    private static String quoteIdentifier(String id)
+    {
+        return ColumnIdentifier.maybeQuote(id);
+    }
+
+    /**
+     * Whether or not the given metadata is compatible / representable with CQL Language
+     */
+    public static boolean isCqlCompatible(CFMetaData metaData)
+    {
+        if (metaData.isSuper())
+            return false;
+
+        if (metaData.isCompactTable()
+            && metaData.partitionColumns().withoutStatics().size() > 1
+            && metaData.clusteringColumns().size() >= 1)
+            return false;
+
+        return true;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
index b4511d0..a74316e 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java

@@ -32,8 +32,11 @@
     /**
      * @return the name of the column family
      */
+    @Deprecated
     public String getColumnFamilyName();
 
+    public String getTableName();
+
     /**
      * force a major compaction of this column family
      *
@@ -85,18 +88,6 @@
      */
     public void setCompactionParameters(Map<String, String> options);
     public Map<String, String> getCompactionParameters();
-    /**
-     * Sets the compaction strategy by class name
-     * @param className the name of the compaction strategy class
-     */
-    @Deprecated
-    public void setCompactionStrategyClass(String className);
-
-    /**
-     * Gets the compaction strategy class name
-     */
-    @Deprecated
-    public String getCompactionStrategyClass();
 
     /**
      * Get the compression parameters
@@ -170,4 +161,14 @@
      * @return top <i>count</i> items for the sampler since beginLocalSampling was called
      */
     public CompositeData finishLocalSampling(String sampler, int count) throws OpenDataException;
+
+    /*
+        Is Compaction space check enabled
+     */
+    public boolean isCompactionDiskSpaceCheckEnabled();
+
+    /*
+       Enable/Disable compaction space check
+     */
+    public void compactionDiskSpaceCheck(boolean enable);
 }

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyType.java b/src/java/org/apache/cassandra/db/ColumnFamilyType.java
deleted file mode 100644
index 51e8b63..0000000
--- a/src/java/org/apache/cassandra/db/ColumnFamilyType.java
+++ /dev/null

@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-/**
- * column family type enum
- */
-public enum ColumnFamilyType
-{
-    Standard,
-    Super;
-
-    public static ColumnFamilyType create(String name)
-    {
-        try
-        {
-            // TODO thrift optional parameter in CfDef is leaking down here which it shouldn't
-            return name == null ? ColumnFamilyType.Standard : ColumnFamilyType.valueOf(name);
-        }
-        catch (IllegalArgumentException e)
-        {
-            return null;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/ColumnIndex.java b/src/java/org/apache/cassandra/db/ColumnIndex.java
index 8f147cc..ede3f79 100644
--- a/src/java/org/apache/cassandra/db/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/db/ColumnIndex.java

@@ -18,30 +18,40 @@
 package org.apache.cassandra.db;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.*;
 
 import com.google.common.annotations.VisibleForTesting;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.sstable.IndexHelper;
-import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.SequentialWriter;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class ColumnIndex
 {
+    public final long partitionHeaderLength;
     public final List<IndexHelper.IndexInfo> columnsIndex;
 
-    private static final ColumnIndex EMPTY = new ColumnIndex(Collections.<IndexHelper.IndexInfo>emptyList());
+    private static final ColumnIndex EMPTY = new ColumnIndex(-1, Collections.<IndexHelper.IndexInfo>emptyList());
 
-    private ColumnIndex(List<IndexHelper.IndexInfo> columnsIndex)
+    private ColumnIndex(long partitionHeaderLength, List<IndexHelper.IndexInfo> columnsIndex)
     {
         assert columnsIndex != null;
 
+        this.partitionHeaderLength = partitionHeaderLength;
         this.columnsIndex = columnsIndex;
     }
 
+    public static ColumnIndex writeAndBuildIndex(UnfilteredRowIterator iterator, SequentialWriter output, SerializationHeader header, Version version) throws IOException
+    {
+        assert !iterator.isEmpty() && version.storeRows();
+
+        Builder builder = new Builder(iterator, output, header, version.correspondingMessagingVersion());
+        return builder.build();
+    }
+
     @VisibleForTesting
     public static ColumnIndex nothing()
     {
@@ -52,212 +62,117 @@
      * Help to create an index for a column family based on size of columns,
      * and write said columns to disk.
      */
-    public static class Builder
+    private static class Builder
     {
-        private final ColumnIndex result;
-        private final long indexOffset;
+        private final UnfilteredRowIterator iterator;
+        private final SequentialWriter writer;
+        private final SerializationHeader header;
+        private final int version;
+
+        private final List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<>();
+        private final long initialPosition;
+        private long headerLength = -1;
+
         private long startPosition = -1;
-        private long endPosition = 0;
-        private long blockSize;
-        private OnDiskAtom firstColumn;
-        private OnDiskAtom lastColumn;
-        private OnDiskAtom lastBlockClosing;
-        private final DataOutputPlus output;
-        private final RangeTombstone.Tracker tombstoneTracker;
-        private int atomCount;
-        private final ByteBuffer key;
-        private final DeletionInfo deletionInfo; // only used for serializing and calculating row header size
 
-        private final OnDiskAtom.SerializerForWriting atomSerializer;
+        private int written;
+        private long previousRowStart;
 
-        public Builder(ColumnFamily cf,
-                       ByteBuffer key,
-                       DataOutputPlus output)
+        private ClusteringPrefix firstClustering;
+        private ClusteringPrefix lastClustering;
+
+        private DeletionTime openMarker;
+
+        public Builder(UnfilteredRowIterator iterator,
+                       SequentialWriter writer,
+                       SerializationHeader header,
+                       int version)
         {
-            this(cf, key, output, cf.getComparator().onDiskAtomSerializer());
+            this.iterator = iterator;
+            this.writer = writer;
+            this.header = header;
+            this.version = version;
+            this.initialPosition = writer.position();
         }
 
-        public Builder(ColumnFamily cf,
-                ByteBuffer key,
-                DataOutputPlus output,
-                OnDiskAtom.SerializerForWriting serializer)
+        private void writePartitionHeader(UnfilteredRowIterator iterator) throws IOException
         {
-            assert cf != null;
-            assert key != null;
-            assert output != null;
-
-            this.key = key;
-            deletionInfo = cf.deletionInfo();
-            this.indexOffset = rowHeaderSize(key, deletionInfo);
-            this.result = new ColumnIndex(new ArrayList<IndexHelper.IndexInfo>());
-            this.output = output;
-            this.tombstoneTracker = new RangeTombstone.Tracker(cf.getComparator());
-            this.atomSerializer = serializer;
+            ByteBufferUtil.writeWithShortLength(iterator.partitionKey().getKey(), writer);
+            DeletionTime.serializer.serialize(iterator.partitionLevelDeletion(), writer);
+            if (header.hasStatic())
+                UnfilteredSerializer.serializer.serializeStaticRow(iterator.staticRow(), header, writer, version);
         }
 
-        /**
-         * Returns the number of bytes between the beginning of the row and the
-         * first serialized column.
-         */
-        private static long rowHeaderSize(ByteBuffer key, DeletionInfo delInfo)
+        public ColumnIndex build() throws IOException
         {
-            TypeSizes typeSizes = TypeSizes.NATIVE;
-            // TODO fix constantSize when changing the nativeconststs.
-            int keysize = key.remaining();
-            return typeSizes.sizeof((short) keysize) + keysize          // Row key
-                 + DeletionTime.serializer.serializedSize(delInfo.getTopLevelDeletion(), typeSizes);
+            writePartitionHeader(iterator);
+            this.headerLength = writer.position() - initialPosition;
+
+            while (iterator.hasNext())
+                add(iterator.next());
+
+            return close();
         }
 
-        public RangeTombstone.Tracker tombstoneTracker()
+        private long currentPosition()
         {
-            return tombstoneTracker;
+            return writer.position() - initialPosition;
         }
 
-        public int writtenAtomCount()
+        private void addIndexBlock()
         {
-            return atomCount + tombstoneTracker.writtenAtom();
+            IndexHelper.IndexInfo cIndexInfo = new IndexHelper.IndexInfo(firstClustering,
+                                                                         lastClustering,
+                                                                         startPosition,
+                                                                         currentPosition() - startPosition,
+                                                                         openMarker);
+            columnsIndex.add(cIndexInfo);
+            firstClustering = null;
         }
 
-        /**
-         * Serializes the index into in-memory structure with all required components
-         * such as Bloom Filter, index block size, IndexInfo list
-         *
-         * @param cf Column family to create index for
-         *
-         * @return information about index - it's Bloom Filter, block size and IndexInfo list
-         */
-        public ColumnIndex build(ColumnFamily cf) throws IOException
+        private void add(Unfiltered unfiltered) throws IOException
         {
-            // cf has disentangled the columns and range tombstones, we need to re-interleave them in comparator order
-            Comparator<Composite> comparator = cf.getComparator();
-            DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester();
-            Iterator<RangeTombstone> rangeIter = cf.deletionInfo().rangeIterator();
-            RangeTombstone tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
+            long pos = currentPosition();
 
-            for (Cell c : cf)
+            if (firstClustering == null)
             {
-                while (tombstone != null && comparator.compare(c.name(), tombstone.min) >= 0)
-                {
-                    // skip range tombstones that are shadowed by partition tombstones
-                    if (!cf.deletionInfo().getTopLevelDeletion().isDeleted(tombstone))
-                        add(tombstone);
-                    tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
-                }
-
-                // We can skip any cell if it's shadowed by a tombstone already. This is a more
-                // general case than was handled by CASSANDRA-2589.
-                if (!tester.isDeleted(c))
-                    add(c);
+                // Beginning of an index block. Remember the start and position
+                firstClustering = unfiltered.clustering();
+                startPosition = pos;
             }
 
-            while (tombstone != null)
+            UnfilteredSerializer.serializer.serialize(unfiltered, header, writer, pos - previousRowStart, version);
+            lastClustering = unfiltered.clustering();
+            previousRowStart = pos;
+            ++written;
+
+            if (unfiltered.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
             {
-                add(tombstone);
-                tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
+                RangeTombstoneMarker marker = (RangeTombstoneMarker)unfiltered;
+                openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null;
             }
-            finishAddingAtoms();
-            ColumnIndex index = build();
-
-            maybeWriteEmptyRowHeader();
-
-            return index;
-        }
-
-        /**
-         * The important distinction wrt build() is that we may be building for a row that ends up
-         * being compacted away entirely, i.e., the input consists only of expired tombstones (or
-         * columns shadowed by expired tombstone).  Thus, it is the caller's responsibility
-         * to decide whether to write the header for an empty row.
-         */
-        public ColumnIndex buildForCompaction(Iterator<OnDiskAtom> columns) throws IOException
-        {
-            while (columns.hasNext())
-            {
-                OnDiskAtom c =  columns.next();
-                add(c);
-            }
-            finishAddingAtoms();
-
-            return build();
-        }
-
-        public void add(OnDiskAtom column) throws IOException
-        {
-            atomCount++;
-
-            if (firstColumn == null)
-            {
-                firstColumn = column;
-                startPosition = endPosition;
-                // TODO: have that use the firstColumn as min + make sure we optimize that on read
-                endPosition += tombstoneTracker.writeOpenedMarkers(firstColumn.name(), output, atomSerializer);
-                blockSize = 0; // We don't count repeated tombstone marker in the block size, to avoid a situation
-                               // where we wouldn't make any progress because a block is filled by said marker
-
-                maybeWriteRowHeader();
-            }
-
-            if (tombstoneTracker.update(column, false))
-            {
-                long size = tombstoneTracker.writeUnwrittenTombstones(output, atomSerializer);
-                size += atomSerializer.serializedSizeForSSTable(column);
-                endPosition += size;
-                blockSize += size;
-
-                atomSerializer.serializeForSSTable(column, output);
-            }
-
-            lastColumn = column;
 
             // if we hit the column index size that we have to index after, go ahead and index it.
-            if (blockSize >= DatabaseDescriptor.getColumnIndexSize())
-            {
-                IndexHelper.IndexInfo cIndexInfo = new IndexHelper.IndexInfo(firstColumn.name(), column.name(), indexOffset + startPosition, endPosition - startPosition);
-                result.columnsIndex.add(cIndexInfo);
-                firstColumn = null;
-                lastBlockClosing = column;
-            }
+            if (currentPosition() - startPosition >= DatabaseDescriptor.getColumnIndexSize())
+                addIndexBlock();
+
         }
 
-        private void maybeWriteRowHeader() throws IOException
+        private ColumnIndex close() throws IOException
         {
-            if (lastColumn == null)
-            {
-                ByteBufferUtil.writeWithShortLength(key, output);
-                DeletionTime.serializer.serialize(deletionInfo.getTopLevelDeletion(), output);
-            }
-        }
+            UnfilteredSerializer.serializer.writeEndOfPartition(writer);
 
-        public void finishAddingAtoms() throws IOException
-        {
-            long size = tombstoneTracker.writeUnwrittenTombstones(output, atomSerializer);
-            endPosition += size;
-            blockSize += size;
-        }
-
-        public ColumnIndex build()
-        {
-            assert !tombstoneTracker.hasUnwrittenTombstones();  // finishAddingAtoms must be called before building.
-            // all columns were GC'd after all
-            if (lastColumn == null)
+            // It's possible we add no rows, just a top level deletion
+            if (written == 0)
                 return ColumnIndex.EMPTY;
 
             // the last column may have fallen on an index boundary already.  if not, index it explicitly.
-            if (result.columnsIndex.isEmpty() || lastBlockClosing != lastColumn)
-            {
-                IndexHelper.IndexInfo cIndexInfo = new IndexHelper.IndexInfo(firstColumn.name(), lastColumn.name(), indexOffset + startPosition, endPosition - startPosition);
-                result.columnsIndex.add(cIndexInfo);
-            }
+            if (firstClustering != null)
+                addIndexBlock();
 
             // we should always have at least one computed index block, but we only write it out if there is more than that.
-            assert result.columnsIndex.size() > 0;
-            return result;
-        }
-
-        public void maybeWriteEmptyRowHeader() throws IOException
-        {
-            if (!deletionInfo.isLive())
-                maybeWriteRowHeader();
+            assert columnsIndex.size() > 0 && headerLength >= 0;
+            return new ColumnIndex(headerLength, columnsIndex);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/ColumnSerializer.java b/src/java/org/apache/cassandra/db/ColumnSerializer.java
deleted file mode 100644
index 8e7026c..0000000
--- a/src/java/org/apache/cassandra/db/ColumnSerializer.java
+++ /dev/null

@@ -1,188 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.FSReadError;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class ColumnSerializer implements ISerializer<Cell>
-{
-    public final static int DELETION_MASK        = 0x01;
-    public final static int EXPIRATION_MASK      = 0x02;
-    public final static int COUNTER_MASK         = 0x04;
-    public final static int COUNTER_UPDATE_MASK  = 0x08;
-    public final static int RANGE_TOMBSTONE_MASK = 0x10;
-
-    /**
-     * Flag affecting deserialization behavior.
-     *  - LOCAL: for deserialization of local data (Expired columns are
-     *      converted to tombstones (to gain disk space)).
-     *  - FROM_REMOTE: for deserialization of data received from remote hosts
-     *      (Expired columns are converted to tombstone and counters have
-     *      their delta cleared)
-     *  - PRESERVE_SIZE: used when no transformation must be performed, i.e,
-     *      when we must ensure that deserializing and reserializing the
-     *      result yield the exact same bytes. Streaming uses this.
-     */
-    public static enum Flag
-    {
-        LOCAL, FROM_REMOTE, PRESERVE_SIZE;
-    }
-
-    private final CellNameType type;
-
-    public ColumnSerializer(CellNameType type)
-    {
-        this.type = type;
-    }
-
-    public void serialize(Cell cell, DataOutputPlus out) throws IOException
-    {
-        assert !cell.name().isEmpty();
-        type.cellSerializer().serialize(cell.name(), out);
-        try
-        {
-            out.writeByte(cell.serializationFlags());
-            if (cell instanceof CounterCell)
-            {
-                out.writeLong(((CounterCell) cell).timestampOfLastDelete());
-            }
-            else if (cell instanceof ExpiringCell)
-            {
-                out.writeInt(((ExpiringCell) cell).getTimeToLive());
-                out.writeInt(cell.getLocalDeletionTime());
-            }
-            out.writeLong(cell.timestamp());
-            ByteBufferUtil.writeWithLength(cell.value(), out);
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public Cell deserialize(DataInput in) throws IOException
-    {
-        return deserialize(in, Flag.LOCAL);
-    }
-
-    /*
-     * For counter columns, we must know when we deserialize them if what we
-     * deserialize comes from a remote host. If it does, then we must clear
-     * the delta.
-     */
-    public Cell deserialize(DataInput in, ColumnSerializer.Flag flag) throws IOException
-    {
-        return deserialize(in, flag, Integer.MIN_VALUE);
-    }
-
-    public Cell deserialize(DataInput in, ColumnSerializer.Flag flag, int expireBefore) throws IOException
-    {
-        CellName name = type.cellSerializer().deserialize(in);
-
-        int b = in.readUnsignedByte();
-        return deserializeColumnBody(in, name, b, flag, expireBefore);
-    }
-
-    Cell deserializeColumnBody(DataInput in, CellName name, int mask, ColumnSerializer.Flag flag, int expireBefore) throws IOException
-    {
-        if ((mask & COUNTER_MASK) != 0)
-        {
-            long timestampOfLastDelete = in.readLong();
-            long ts = in.readLong();
-            ByteBuffer value = ByteBufferUtil.readWithLength(in);
-            return BufferCounterCell.create(name, value, ts, timestampOfLastDelete, flag);
-        }
-        else if ((mask & EXPIRATION_MASK) != 0)
-        {
-            int ttl = in.readInt();
-            int expiration = in.readInt();
-            long ts = in.readLong();
-            ByteBuffer value = ByteBufferUtil.readWithLength(in);
-            return BufferExpiringCell.create(name, value, ts, ttl, expiration, expireBefore, flag);
-        }
-        else
-        {
-            long ts = in.readLong();
-            ByteBuffer value = ByteBufferUtil.readWithLength(in);
-            return (mask & COUNTER_UPDATE_MASK) != 0
-                   ? new BufferCounterUpdateCell(name, value, ts)
-                   : ((mask & DELETION_MASK) == 0
-                      ? new BufferCell(name, value, ts)
-                      : new BufferDeletedCell(name, value, ts));
-        }
-    }
-
-    void skipColumnBody(DataInput in, int mask) throws IOException
-    {
-        if ((mask & COUNTER_MASK) != 0)
-            FileUtils.skipBytesFully(in, 16);
-        else if ((mask & EXPIRATION_MASK) != 0)
-            FileUtils.skipBytesFully(in, 16);
-        else
-            FileUtils.skipBytesFully(in, 8);
-
-        int length = in.readInt();
-        FileUtils.skipBytesFully(in, length);
-    }
-
-    public long serializedSize(Cell cell, TypeSizes typeSizes)
-    {
-        return cell.serializedSize(type, typeSizes);
-    }
-
-    public static class CorruptColumnException extends IOException
-    {
-        public CorruptColumnException(String s)
-        {
-            super(s);
-        }
-
-        public static CorruptColumnException create(DataInput in, ByteBuffer name)
-        {
-            assert name.remaining() <= 0;
-            String format = "invalid column name length %d%s";
-            String details = "";
-            if (in instanceof FileDataInput)
-            {
-                FileDataInput fdis = (FileDataInput)in;
-                long remaining;
-                try
-                {
-                    remaining = fdis.bytesRemaining();
-                }
-                catch (IOException e)
-                {
-                    throw new FSReadError(e, fdis.getPath());
-                }
-                details = String.format(" (%s, %d bytes remaining)", fdis.getPath(), remaining);
-            }
-            return new CorruptColumnException(String.format(format, name.remaining(), details));
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/Columns.java b/src/java/org/apache/cassandra/db/Columns.java
new file mode 100644
index 0000000..18e17d7
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Columns.java

@@ -0,0 +1,668 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.Predicate;
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterators;
+
+import net.nicoulaj.compilecommand.annotations.DontInline;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.SearchIterator;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSearchIterator;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+/**
+ * An immutable and sorted list of (non-PK) columns for a given table.
+ * <p>
+ * Note that in practice, it will either store only static columns, or only regular ones. When
+ * we need both type of columns, we use a {@link PartitionColumns} object.
+ */
+public class Columns extends AbstractCollection<ColumnDefinition> implements Collection<ColumnDefinition>
+{
+    public static final Serializer serializer = new Serializer();
+    public static final Columns NONE = new Columns(BTree.empty(), 0);
+
+    private static final ColumnDefinition FIRST_COMPLEX_STATIC =
+        new ColumnDefinition("",
+                             "",
+                             ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance),
+                             SetType.getInstance(UTF8Type.instance, true),
+                             ColumnDefinition.NO_POSITION,
+                             ColumnDefinition.Kind.STATIC);
+
+    private static final ColumnDefinition FIRST_COMPLEX_REGULAR =
+        new ColumnDefinition("",
+                             "",
+                             ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance),
+                             SetType.getInstance(UTF8Type.instance, true),
+                             ColumnDefinition.NO_POSITION,
+                             ColumnDefinition.Kind.REGULAR);
+
+    private final Object[] columns;
+    private final int complexIdx; // Index of the first complex column
+
+    private Columns(Object[] columns, int complexIdx)
+    {
+        assert complexIdx <= BTree.size(columns);
+        this.columns = columns;
+        this.complexIdx = complexIdx;
+    }
+
+    private Columns(Object[] columns)
+    {
+        this(columns, findFirstComplexIdx(columns));
+    }
+
+    /**
+     * Creates a {@code Columns} holding only the one column provided.
+     *
+     * @param c the column for which to create a {@code Columns} object.
+     *
+     * @return the newly created {@code Columns} containing only {@code c}.
+     */
+    public static Columns of(ColumnDefinition c)
+    {
+        return new Columns(BTree.singleton(c), c.isComplex() ? 0 : 1);
+    }
+
+    /**
+     * Returns a new {@code Columns} object holing the same columns than the provided set.
+     *
+     * @param s the set from which to create the new {@code Columns}.
+     * @return the newly created {@code Columns} containing the columns from {@code s}.
+     */
+    public static Columns from(Collection<ColumnDefinition> s)
+    {
+        Object[] tree = BTree.<ColumnDefinition>builder(Comparator.naturalOrder()).addAll(s).build();
+        return new Columns(tree, findFirstComplexIdx(tree));
+    }
+
+    private static int findFirstComplexIdx(Object[] tree)
+    {
+        if (BTree.isEmpty(tree))
+            return 0;
+
+        int size = BTree.size(tree);
+        ColumnDefinition last = BTree.findByIndex(tree, size - 1);
+        return last.isSimple()
+             ? size
+             : BTree.ceilIndex(tree, Comparator.naturalOrder(), last.isStatic() ? FIRST_COMPLEX_STATIC : FIRST_COMPLEX_REGULAR);
+    }
+
+    /**
+     * Whether this columns is empty.
+     *
+     * @return whether this columns is empty.
+     */
+    public boolean isEmpty()
+    {
+        return BTree.isEmpty(columns);
+    }
+
+    /**
+     * The number of simple columns in this object.
+     *
+     * @return the number of simple columns in this object.
+     */
+    public int simpleColumnCount()
+    {
+        return complexIdx;
+    }
+
+    /**
+     * The number of complex columns (non-frozen collections, udts, ...) in this object.
+     *
+     * @return the number of complex columns in this object.
+     */
+    public int complexColumnCount()
+    {
+        return BTree.size(columns) - complexIdx;
+    }
+
+    /**
+     * The total number of columns in this object.
+     *
+     * @return the total number of columns in this object.
+     */
+    public int size()
+    {
+        return BTree.size(columns);
+    }
+
+    /**
+     * Whether this objects contains simple columns.
+     *
+     * @return whether this objects contains simple columns.
+     */
+    public boolean hasSimple()
+    {
+        return complexIdx > 0;
+    }
+
+    /**
+     * Whether this objects contains complex columns.
+     *
+     * @return whether this objects contains complex columns.
+     */
+    public boolean hasComplex()
+    {
+        return complexIdx < BTree.size(columns);
+    }
+
+    /**
+     * Returns the ith simple column of this object.
+     *
+     * @param i the index for the simple column to fectch. This must
+     * satisfy {@code 0 <= i < simpleColumnCount()}.
+     *
+     * @return the {@code i}th simple column in this object.
+     */
+    public ColumnDefinition getSimple(int i)
+    {
+        return BTree.findByIndex(columns, i);
+    }
+
+    /**
+     * Returns the ith complex column of this object.
+     *
+     * @param i the index for the complex column to fectch. This must
+     * satisfy {@code 0 <= i < complexColumnCount()}.
+     *
+     * @return the {@code i}th complex column in this object.
+     */
+    public ColumnDefinition getComplex(int i)
+    {
+        return BTree.findByIndex(columns, complexIdx + i);
+    }
+
+    /**
+     * The index of the provided simple column in this object (if it contains
+     * the provided column).
+     *
+     * @param c the simple column for which to return the index of.
+     *
+     * @return the index for simple column {@code c} if it is contains in this
+     * object
+     */
+    public int simpleIdx(ColumnDefinition c)
+    {
+        return BTree.findIndex(columns, Comparator.naturalOrder(), c);
+    }
+
+    /**
+     * The index of the provided complex column in this object (if it contains
+     * the provided column).
+     *
+     * @param c the complex column for which to return the index of.
+     *
+     * @return the index for complex column {@code c} if it is contains in this
+     * object
+     */
+    public int complexIdx(ColumnDefinition c)
+    {
+        return BTree.findIndex(columns, Comparator.naturalOrder(), c) - complexIdx;
+    }
+
+    /**
+     * Whether the provided column is contained by this object.
+     *
+     * @param c the column to check presence of.
+     *
+     * @return whether {@code c} is contained by this object.
+     */
+    public boolean contains(ColumnDefinition c)
+    {
+        return BTree.findIndex(columns, Comparator.naturalOrder(), c) >= 0;
+    }
+
+    /**
+     * Returns the result of merging this {@code Columns} object with the
+     * provided one.
+     *
+     * @param other the other {@code Columns} to merge this object with.
+     *
+     * @return the result of merging/taking the union of {@code this} and
+     * {@code other}. The returned object may be one of the operand and that
+     * operand is a subset of the other operand.
+     */
+    public Columns mergeTo(Columns other)
+    {
+        if (this == other || other == NONE)
+            return this;
+        if (this == NONE)
+            return other;
+
+        Object[] tree = BTree.<ColumnDefinition>merge(this.columns, other.columns, Comparator.naturalOrder(),
+                                                      UpdateFunction.noOp());
+        if (tree == this.columns)
+            return this;
+        if (tree == other.columns)
+            return other;
+
+        return new Columns(tree, findFirstComplexIdx(tree));
+    }
+
+    /**
+     * Whether this object is a superset of the provided other {@code Columns object}.
+     *
+     * @param other the other object to test for inclusion in this object.
+     *
+     * @return whether all the columns of {@code other} are contained by this object.
+     */
+    public boolean containsAll(Collection<?> other)
+    {
+        if (other == this)
+            return true;
+        if (other.size() > this.size())
+            return false;
+
+        BTreeSearchIterator<ColumnDefinition, ColumnDefinition> iter = BTree.slice(columns, Comparator.naturalOrder(), BTree.Dir.ASC);
+        for (Object def : other)
+            if (iter.next((ColumnDefinition) def) == null)
+                return false;
+        return true;
+    }
+
+    /**
+     * Iterator over the simple columns of this object.
+     *
+     * @return an iterator over the simple columns of this object.
+     */
+    public Iterator<ColumnDefinition> simpleColumns()
+    {
+        return BTree.iterator(columns, 0, complexIdx - 1, BTree.Dir.ASC);
+    }
+
+    /**
+     * Iterator over the complex columns of this object.
+     *
+     * @return an iterator over the complex columns of this object.
+     */
+    public Iterator<ColumnDefinition> complexColumns()
+    {
+        return BTree.iterator(columns, complexIdx, BTree.size(columns) - 1, BTree.Dir.ASC);
+    }
+
+    /**
+     * Iterator over all the columns of this object.
+     *
+     * @return an iterator over all the columns of this object.
+     */
+    public BTreeSearchIterator<ColumnDefinition, ColumnDefinition> iterator()
+    {
+        return BTree.<ColumnDefinition, ColumnDefinition>slice(columns, Comparator.naturalOrder(), BTree.Dir.ASC);
+    }
+
+    /**
+     * An iterator that returns the columns of this object in "select" order (that
+     * is in global alphabetical order, where the "normal" iterator returns simple
+     * columns first and the complex second).
+     *
+     * @return an iterator returning columns in alphabetical order.
+     */
+    public Iterator<ColumnDefinition> selectOrderIterator()
+    {
+        // In wildcard selection, we want to return all columns in alphabetical order,
+        // irregarding of whether they are complex or not
+        return Iterators.<ColumnDefinition>
+                         mergeSorted(ImmutableList.of(simpleColumns(), complexColumns()),
+                                     (s, c) ->
+                                     {
+                                         assert !s.kind.isPrimaryKeyKind();
+                                         return s.name.bytes.compareTo(c.name.bytes);
+                                     });
+    }
+
+    /**
+     * Returns the equivalent of those columns but with the provided column removed.
+     *
+     * @param column the column to remove.
+     *
+     * @return newly allocated columns containing all the columns of {@code this} expect
+     * for {@code column}.
+     */
+    public Columns without(ColumnDefinition column)
+    {
+        if (!contains(column))
+            return this;
+
+        Object[] newColumns = BTree.<ColumnDefinition>transformAndFilter(columns, (c) -> c.equals(column) ? null : c);
+        return new Columns(newColumns);
+    }
+
+    /**
+     * Returns a predicate to test whether columns are included in this {@code Columns} object,
+     * assuming that tes tested columns are passed to the predicate in sorted order.
+     *
+     * @return a predicate to test the inclusion of sorted columns in this object.
+     */
+    public Predicate<ColumnDefinition> inOrderInclusionTester()
+    {
+        SearchIterator<ColumnDefinition, ColumnDefinition> iter = BTree.slice(columns, Comparator.naturalOrder(), BTree.Dir.ASC);
+        return column -> iter.next(column) != null;
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        for (ColumnDefinition c : this)
+            digest.update(c.name.bytes.duplicate());
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (other == this)
+            return true;
+        if (!(other instanceof Columns))
+            return false;
+
+        Columns that = (Columns)other;
+        return this.complexIdx == that.complexIdx && BTree.equals(this.columns, that.columns);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(complexIdx, BTree.hashCode(columns));
+    }
+
+    @Override
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder("[");
+        boolean first = true;
+        for (ColumnDefinition def : this)
+        {
+            if (first) first = false; else sb.append(" ");
+            sb.append(def.name);
+        }
+        return sb.append("]").toString();
+    }
+
+    public static class Serializer
+    {
+        public void serialize(Columns columns, DataOutputPlus out) throws IOException
+        {
+            out.writeUnsignedVInt(columns.size());
+            for (ColumnDefinition column : columns)
+                ByteBufferUtil.writeWithVIntLength(column.name.bytes, out);
+        }
+
+        public long serializedSize(Columns columns)
+        {
+            long size = TypeSizes.sizeofUnsignedVInt(columns.size());
+            for (ColumnDefinition column : columns)
+                size += ByteBufferUtil.serializedSizeWithVIntLength(column.name.bytes);
+            return size;
+        }
+
+        public Columns deserialize(DataInputPlus in, CFMetaData metadata) throws IOException
+        {
+            int length = (int)in.readUnsignedVInt();
+            BTree.Builder<ColumnDefinition> builder = BTree.builder(Comparator.naturalOrder());
+            builder.auto(false);
+            for (int i = 0; i < length; i++)
+            {
+                ByteBuffer name = ByteBufferUtil.readWithVIntLength(in);
+                ColumnDefinition column = metadata.getColumnDefinition(name);
+
+                if (column == null)
+                {
+                    // If we don't find the definition, it could be we have data for a dropped column, and we shouldn't
+                    // fail deserialization because of that. So we grab a "fake" ColumnDefinition that ensure proper
+                    // deserialization. The column will be ignore later on anyway.
+                    column = metadata.getDroppedColumnDefinition(name);
+                    if (column == null)
+                        throw new RuntimeException("Unknown column " + UTF8Type.instance.getString(name) + " during deserialization");
+                }
+                builder.add(column);
+            }
+            return new Columns(builder.build());
+        }
+
+        /**
+         * If both ends have a pre-shared superset of the columns we are serializing, we can send them much
+         * more efficiently. Both ends must provide the identically same set of columns.
+         */
+        public void serializeSubset(Collection<ColumnDefinition> columns, Columns superset, DataOutputPlus out) throws IOException
+        {
+            /**
+             * We weight this towards small sets, and sets where the majority of items are present, since
+             * we expect this to mostly be used for serializing result sets.
+             *
+             * For supersets with fewer than 64 columns, we encode a bitmap of *missing* columns,
+             * which equates to a zero (single byte) when all columns are present, and otherwise
+             * a positive integer that can typically be vint encoded efficiently.
+             *
+             * If we have 64 or more columns, we cannot neatly perform a bitmap encoding, so we just switch
+             * to a vint encoded set of deltas, either adding or subtracting (whichever is most efficient).
+             * We indicate this switch by sending our bitmap with every bit set, i.e. -1L
+             */
+            int columnCount = columns.size();
+            int supersetCount = superset.size();
+            if (columnCount == supersetCount)
+            {
+                out.writeUnsignedVInt(0);
+            }
+            else if (supersetCount < 64)
+            {
+                out.writeUnsignedVInt(encodeBitmap(columns, superset, supersetCount));
+            }
+            else
+            {
+                serializeLargeSubset(columns, columnCount, superset, supersetCount, out);
+            }
+        }
+
+        public long serializedSubsetSize(Collection<ColumnDefinition> columns, Columns superset)
+        {
+            int columnCount = columns.size();
+            int supersetCount = superset.size();
+            if (columnCount == supersetCount)
+            {
+                return TypeSizes.sizeofUnsignedVInt(0);
+            }
+            else if (supersetCount < 64)
+            {
+                return TypeSizes.sizeofUnsignedVInt(encodeBitmap(columns, superset, supersetCount));
+            }
+            else
+            {
+                return serializeLargeSubsetSize(columns, columnCount, superset, supersetCount);
+            }
+        }
+
+        public Columns deserializeSubset(Columns superset, DataInputPlus in) throws IOException
+        {
+            long encoded = in.readUnsignedVInt();
+            if (encoded == 0L)
+            {
+                return superset;
+            }
+            else if (superset.size() >= 64)
+            {
+                return deserializeLargeSubset(in, superset, (int) encoded);
+            }
+            else
+            {
+                BTree.Builder<ColumnDefinition> builder = BTree.builder(Comparator.naturalOrder());
+                int firstComplexIdx = 0;
+                for (ColumnDefinition column : superset)
+                {
+                    if ((encoded & 1) == 0)
+                    {
+                        builder.add(column);
+                        if (column.isSimple())
+                            ++firstComplexIdx;
+                    }
+                    encoded >>>= 1;
+                }
+                if (encoded != 0)
+                    throw new IOException("Invalid Columns subset bytes; too many bits set:" + Long.toBinaryString(encoded));
+                return new Columns(builder.build(), firstComplexIdx);
+            }
+        }
+
+        // encodes a 1 bit for every *missing* column, on the assumption presence is more common,
+        // and because this is consistent with encoding 0 to represent all present
+        private static long encodeBitmap(Collection<ColumnDefinition> columns, Columns superset, int supersetCount)
+        {
+            long bitmap = 0L;
+            BTreeSearchIterator<ColumnDefinition, ColumnDefinition> iter = superset.iterator();
+            // the index we would encounter next if all columns are present
+            int expectIndex = 0;
+            for (ColumnDefinition column : columns)
+            {
+                if (iter.next(column) == null)
+                    throw new IllegalStateException(columns + " is not a subset of " + superset);
+
+                int currentIndex = iter.indexOfCurrent();
+                int count = currentIndex - expectIndex;
+                // (1L << count) - 1 gives us count bits set at the bottom of the register
+                // so << expectIndex moves these bits to start at expectIndex, which is where our missing portion
+                // begins (assuming count > 0; if not, we're adding 0 bits, so it's a no-op)
+                bitmap |= ((1L << count) - 1) << expectIndex;
+                expectIndex = currentIndex + 1;
+            }
+            int count = supersetCount - expectIndex;
+            bitmap |= ((1L << count) - 1) << expectIndex;
+            return bitmap;
+        }
+
+        @DontInline
+        private void serializeLargeSubset(Collection<ColumnDefinition> columns, int columnCount, Columns superset, int supersetCount, DataOutputPlus out) throws IOException
+        {
+            // write flag indicating we're in lengthy mode
+            out.writeUnsignedVInt(supersetCount - columnCount);
+            BTreeSearchIterator<ColumnDefinition, ColumnDefinition> iter = superset.iterator();
+            if (columnCount < supersetCount / 2)
+            {
+                // write present columns
+                for (ColumnDefinition column : columns)
+                {
+                    if (iter.next(column) == null)
+                        throw new IllegalStateException();
+                    out.writeUnsignedVInt(iter.indexOfCurrent());
+                }
+            }
+            else
+            {
+                // write missing columns
+                int prev = -1;
+                for (ColumnDefinition column : columns)
+                {
+                    if (iter.next(column) == null)
+                        throw new IllegalStateException();
+                    int cur = iter.indexOfCurrent();
+                    while (++prev != cur)
+                        out.writeUnsignedVInt(prev);
+                }
+                while (++prev != supersetCount)
+                    out.writeUnsignedVInt(prev);
+            }
+        }
+
+        @DontInline
+        private Columns deserializeLargeSubset(DataInputPlus in, Columns superset, int delta) throws IOException
+        {
+            int supersetCount = superset.size();
+            int columnCount = supersetCount - delta;
+
+            BTree.Builder<ColumnDefinition> builder = BTree.builder(Comparator.naturalOrder());
+            if (columnCount < supersetCount / 2)
+            {
+                for (int i = 0 ; i < columnCount ; i++)
+                {
+                    int idx = (int) in.readUnsignedVInt();
+                    builder.add(BTree.findByIndex(superset.columns, idx));
+                }
+            }
+            else
+            {
+                Iterator<ColumnDefinition> iter = superset.iterator();
+                int idx = 0;
+                int skipped = 0;
+                while (true)
+                {
+                    int nextMissingIndex = skipped < delta ? (int)in.readUnsignedVInt() : supersetCount;
+                    while (idx < nextMissingIndex)
+                    {
+                        ColumnDefinition def = iter.next();
+                        builder.add(def);
+                        idx++;
+                    }
+                    if (idx == supersetCount)
+                        break;
+                    iter.next();
+                    idx++;
+                    skipped++;
+                }
+            }
+            return new Columns(builder.build());
+        }
+
+        @DontInline
+        private int serializeLargeSubsetSize(Collection<ColumnDefinition> columns, int columnCount, Columns superset, int supersetCount)
+        {
+            // write flag indicating we're in lengthy mode
+            int size = TypeSizes.sizeofUnsignedVInt(supersetCount - columnCount);
+            BTreeSearchIterator<ColumnDefinition, ColumnDefinition> iter = superset.iterator();
+            if (columnCount < supersetCount / 2)
+            {
+                // write present columns
+                for (ColumnDefinition column : columns)
+                {
+                    if (iter.next(column) == null)
+                        throw new IllegalStateException();
+                    size += TypeSizes.sizeofUnsignedVInt(iter.indexOfCurrent());
+                }
+            }
+            else
+            {
+                // write missing columns
+                int prev = -1;
+                for (ColumnDefinition column : columns)
+                {
+                    if (iter.next(column) == null)
+                        throw new IllegalStateException();
+                    int cur = iter.indexOfCurrent();
+                    while (++prev != cur)
+                        size += TypeSizes.sizeofUnsignedVInt(prev);
+                }
+                while (++prev != supersetCount)
+                    size += TypeSizes.sizeofUnsignedVInt(prev);
+            }
+            return size;
+        }
+
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/CompactTables.java b/src/java/org/apache/cassandra/db/CompactTables.java
new file mode 100644
index 0000000..9da4d94
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/CompactTables.java

@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.EmptyType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+
+/**
+ * Small utility methods pertaining to the encoding of COMPACT STORAGE tables.
+ *
+ * COMPACT STORAGE tables exists mainly for the sake of encoding internally thrift tables (as well as
+ * exposing those tables through CQL). Note that due to these constraints, the internal representation
+ * of compact tables does *not* correspond exactly to their CQL definition.
+ *
+ * The internal layout of such tables is such that it can encode any thrift table. That layout is as follow:
+ *   CREATE TABLE compact (
+ *      key [key_validation_class],
+ *      [column_metadata_1] [type1] static,
+ *      ...,
+ *      [column_metadata_n] [type1] static,
+ *      column [comparator],
+ *      value [default_validation_class]
+ *      PRIMARY KEY (key, column)
+ *   )
+ * More specifically, the table:
+ *  - always has a clustering column and a regular value, which are used to store the "dynamic" thrift columns name and value.
+ *    Those are always present because we have no way to know in advance if "dynamic" columns will be inserted or not. Note
+ *    that when declared from CQL, compact tables may not have any clustering: in that case, we still have a clustering
+ *    defined internally, it is just ignored as far as interacting from CQL is concerned.
+ *  - have a static column for every "static" column defined in the thrift "column_metadata". Note that when declaring a compact
+ *    table from CQL without any clustering (but some non-PK columns), the columns ends up static internally even though they are
+ *    not in the declaration
+ *
+ * On variation is that if the table comparator is a CompositeType, then the underlying table will have one clustering column by
+ * element of the CompositeType, but the rest of the layout is as above.
+ *
+ * SuperColumn families handling and detailed format description can be found in {@code SuperColumnCompatibility}.
+ */
+public abstract class CompactTables
+{
+    private CompactTables() {}
+
+    public static ColumnDefinition getCompactValueColumn(PartitionColumns columns)
+    {
+        assert columns.regulars.simpleColumnCount() == 1 && columns.regulars.complexColumnCount() == 0;
+        return columns.regulars.getSimple(0);
+    }
+
+    public static AbstractType<?> columnDefinitionComparator(String kind, boolean isSuper, AbstractType<?> rawComparator, AbstractType<?> rawSubComparator)
+    {
+        if (!"regular".equals(kind))
+            return UTF8Type.instance;
+
+        return isSuper ? rawSubComparator : rawComparator;
+    }
+
+    public static boolean hasEmptyCompactValue(CFMetaData metadata)
+    {
+        return metadata.compactValueColumn().type instanceof EmptyType;
+    }
+
+    public static DefaultNames defaultNameGenerator(Set<String> usedNames)
+    {
+        return new DefaultNames(new HashSet<String>(usedNames));
+    }
+
+    public static DefaultNames defaultNameGenerator(Iterable<ColumnDefinition> defs)
+    {
+        Set<String> usedNames = new HashSet<>();
+        for (ColumnDefinition def : defs)
+            usedNames.add(def.name.toString());
+        return new DefaultNames(usedNames);
+    }
+
+    public static class DefaultNames
+    {
+        private static final String DEFAULT_PARTITION_KEY_NAME = "key";
+        private static final String DEFAULT_CLUSTERING_NAME = "column";
+        private static final String DEFAULT_COMPACT_VALUE_NAME = "value";
+
+        private final Set<String> usedNames;
+        private int partitionIndex = 0;
+        private int clusteringIndex = 1;
+        private int compactIndex = 0;
+
+        private DefaultNames(Set<String> usedNames)
+        {
+            this.usedNames = usedNames;
+        }
+
+        public String defaultPartitionKeyName()
+        {
+            while (true)
+            {
+                // For compatibility sake, we call the first alias 'key' rather than 'key1'. This
+                // is inconsistent with column alias, but it's probably not worth risking breaking compatibility now.
+                String candidate = partitionIndex == 0 ? DEFAULT_PARTITION_KEY_NAME : DEFAULT_PARTITION_KEY_NAME + (partitionIndex + 1);
+                ++partitionIndex;
+                if (usedNames.add(candidate))
+                    return candidate;
+            }
+        }
+
+        public String defaultClusteringName()
+        {
+            while (true)
+            {
+                String candidate = DEFAULT_CLUSTERING_NAME + clusteringIndex;
+                ++clusteringIndex;
+                if (usedNames.add(candidate))
+                    return candidate;
+            }
+        }
+
+        public String defaultCompactValueName()
+        {
+            while (true)
+            {
+                String candidate = compactIndex == 0 ? DEFAULT_COMPACT_VALUE_NAME : DEFAULT_COMPACT_VALUE_NAME + compactIndex;
+                ++compactIndex;
+                if (usedNames.add(candidate))
+                    return candidate;
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/Conflicts.java b/src/java/org/apache/cassandra/db/Conflicts.java
new file mode 100644
index 0000000..fa0e819
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Conflicts.java

@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.context.CounterContext;
+
+public abstract class Conflicts
+{
+    private Conflicts() {}
+
+    public enum Resolution { LEFT_WINS, MERGE, RIGHT_WINS };
+
+    public static Resolution resolveRegular(long leftTimestamp,
+                                            boolean leftLive,
+                                            int leftLocalDeletionTime,
+                                            ByteBuffer leftValue,
+                                            long rightTimestamp,
+                                            boolean rightLive,
+                                            int rightLocalDeletionTime,
+                                            ByteBuffer rightValue)
+    {
+        if (leftTimestamp != rightTimestamp)
+            return leftTimestamp < rightTimestamp ? Resolution.RIGHT_WINS : Resolution.LEFT_WINS;
+
+        if (leftLive != rightLive)
+            return leftLive ? Resolution.RIGHT_WINS : Resolution.LEFT_WINS;
+
+        int c = leftValue.compareTo(rightValue);
+        if (c < 0)
+            return Resolution.RIGHT_WINS;
+        else if (c > 0)
+            return Resolution.LEFT_WINS;
+
+        // Prefer the longest ttl if relevant
+        return leftLocalDeletionTime < rightLocalDeletionTime ? Resolution.RIGHT_WINS : Resolution.LEFT_WINS;
+    }
+
+    public static Resolution resolveCounter(long leftTimestamp,
+                                            boolean leftLive,
+                                            ByteBuffer leftValue,
+                                            long rightTimestamp,
+                                            boolean rightLive,
+                                            ByteBuffer rightValue)
+    {
+        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
+        if (!leftLive)
+            // left is a tombstone: it has precedence over right if either right is not a tombstone, or left has a greater timestamp
+            return rightLive || leftTimestamp > rightTimestamp ? Resolution.LEFT_WINS : Resolution.RIGHT_WINS;
+
+        // If right is a tombstone, since left isn't one, it has precedence
+        if (!rightLive)
+            return Resolution.RIGHT_WINS;
+
+        return Resolution.MERGE;
+    }
+
+    public static ByteBuffer mergeCounterValues(ByteBuffer left, ByteBuffer right)
+    {
+        return CounterContext.instance().merge(left, right);
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
index 85ec0f3..ab4243f 100644
--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java

@@ -183,6 +183,14 @@
     public List<InetAddress> filterForQuery(Keyspace keyspace, List<InetAddress> liveEndpoints, ReadRepairDecision readRepair)
     {
         /*
+         * If we are doing an each quorum query, we have to make sure that the endpoints we select
+         * provide a quorum for each data center. If we are not using a NetworkTopologyStrategy,
+         * we should fall through and grab a quorum in the replication strategy.
+         */
+        if (this == EACH_QUORUM && keyspace.getReplicationStrategy() instanceof NetworkTopologyStrategy)
+            return filterForEachQuorum(keyspace, liveEndpoints, readRepair);
+
+        /*
          * Endpoints are expected to be restricted to live replicas, sorted by snitch preference.
          * For LOCAL_QUORUM, move local-DC replicas in front first as we need them there whether
          * we do read repair (since the first replica gets the data read) or not (since we'll take
@@ -217,6 +225,37 @@
         }
     }
 
+    private List<InetAddress> filterForEachQuorum(Keyspace keyspace, List<InetAddress> liveEndpoints, ReadRepairDecision readRepair)
+    {
+        NetworkTopologyStrategy strategy = (NetworkTopologyStrategy) keyspace.getReplicationStrategy();
+
+        // quickly drop out if read repair is GLOBAL, since we just use all of the live endpoints
+        if (readRepair == ReadRepairDecision.GLOBAL)
+            return liveEndpoints;
+
+        Map<String, List<InetAddress>> dcsEndpoints = new HashMap<>();
+        for (String dc: strategy.getDatacenters())
+            dcsEndpoints.put(dc, new ArrayList<>());
+
+        for (InetAddress add : liveEndpoints)
+        {
+            String dc = DatabaseDescriptor.getEndpointSnitch().getDatacenter(add);
+            dcsEndpoints.get(dc).add(add);
+        }
+
+        List<InetAddress> waitSet = new ArrayList<>();
+        for (Map.Entry<String, List<InetAddress>> dcEndpoints : dcsEndpoints.entrySet())
+        {
+            List<InetAddress> dcEndpoint = dcEndpoints.getValue();
+            if (readRepair == ReadRepairDecision.DC_LOCAL && dcEndpoints.getKey().equals(DatabaseDescriptor.getLocalDataCenter()))
+                waitSet.addAll(dcEndpoint);
+            else
+                waitSet.addAll(dcEndpoint.subList(0, Math.min(localQuorumFor(keyspace, dcEndpoints.getKey()), dcEndpoint.size())));
+        }
+
+        return waitSet;
+    }
+
     public boolean isSufficientLiveNodes(Keyspace keyspace, Iterable<InetAddress> liveEndpoints)
     {
         switch (this)
@@ -282,7 +321,7 @@
                         int dcBlockFor = localQuorumFor(keyspace, entry.getKey());
                         int dcLive = entry.getValue();
                         if (dcLive < dcBlockFor)
-                            throw new UnavailableException(this, dcBlockFor, dcLive);
+                            throw new UnavailableException(this, entry.getKey(), dcBlockFor, dcLive);
                     }
                     break;
                 }
@@ -304,8 +343,6 @@
         {
             case ANY:
                 throw new InvalidRequestException("ANY ConsistencyLevel is only supported for writes");
-            case EACH_QUORUM:
-                throw new InvalidRequestException("EACH_QUORUM ConsistencyLevel is only supported for writes");
         }
     }
 

diff --git a/src/java/org/apache/cassandra/db/CounterCell.java b/src/java/org/apache/cassandra/db/CounterCell.java
deleted file mode 100644
index cda1200..0000000
--- a/src/java/org/apache/cassandra/db/CounterCell.java
+++ /dev/null

@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-/**
- * A column that represents a partitioned counter.
- */
-public interface CounterCell extends Cell
-{
-    static final CounterContext contextManager = CounterContext.instance();
-
-    public long timestampOfLastDelete();
-
-    public long total();
-
-    public boolean hasLegacyShards();
-
-    public Cell markLocalToBeCleared();
-
-    CounterCell localCopy(CFMetaData metadata, AbstractAllocator allocator);
-
-    CounterCell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
-}

diff --git a/src/java/org/apache/cassandra/db/CounterMutation.java b/src/java/org/apache/cassandra/db/CounterMutation.java
index 58717b4..8aafa5c 100644
--- a/src/java/org/apache/cassandra/db/CounterMutation.java
+++ b/src/java/org/apache/cassandra/db/CounterMutation.java

@@ -17,9 +17,7 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.Lock;
@@ -27,20 +25,26 @@
 import com.google.common.base.Function;
 import com.google.common.base.Objects;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.PeekingIterator;
 import com.google.common.util.concurrent.Striped;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.btree.BTreeSet;
 
 public class CounterMutation implements IMutation
 {
@@ -67,9 +71,9 @@
         return mutation.getColumnFamilyIds();
     }
 
-    public Collection<ColumnFamily> getColumnFamilies()
+    public Collection<PartitionUpdate> getPartitionUpdates()
     {
-        return mutation.getColumnFamilies();
+        return mutation.getPartitionUpdates();
     }
 
     public Mutation getMutation()
@@ -77,7 +81,7 @@
         return mutation;
     }
 
-    public ByteBuffer key()
+    public DecoratedKey key()
     {
         return mutation.key();
     }
@@ -111,19 +115,14 @@
         Mutation result = new Mutation(getKeyspaceName(), key());
         Keyspace keyspace = Keyspace.open(getKeyspaceName());
 
-        int count = 0;
-        for (ColumnFamily cf : getColumnFamilies())
-            count += cf.getColumnCount();
-
-        List<Lock> locks = new ArrayList<>(count);
-        Tracing.trace("Acquiring {} counter locks", count);
+        List<Lock> locks = new ArrayList<>();
+        Tracing.trace("Acquiring counter locks");
         try
         {
             grabCounterLocks(keyspace, locks);
-            for (ColumnFamily cf : getColumnFamilies())
-                result.add(processModifications(cf));
+            for (PartitionUpdate upd : getPartitionUpdates())
+                result.add(processModifications(upd));
             result.apply();
-            updateCounterCache(result, keyspace);
             return result;
         }
         finally
@@ -156,145 +155,149 @@
     /**
      * Returns a wrapper for the Striped#bulkGet() call (via Keyspace#counterLocksFor())
      * Striped#bulkGet() depends on Object#hashCode(), so here we make sure that the cf id and the partition key
-     * all get to be part of the hashCode() calculation, not just the cell name.
+     * all get to be part of the hashCode() calculation.
      */
     private Iterable<Object> getCounterLockKeys()
     {
-        return Iterables.concat(Iterables.transform(getColumnFamilies(), new Function<ColumnFamily, Iterable<Object>>()
+        return Iterables.concat(Iterables.transform(getPartitionUpdates(), new Function<PartitionUpdate, Iterable<Object>>()
         {
-            public Iterable<Object> apply(final ColumnFamily cf)
+            public Iterable<Object> apply(final PartitionUpdate update)
             {
-                return Iterables.transform(cf, new Function<Cell, Object>()
+                return Iterables.concat(Iterables.transform(update, new Function<Row, Iterable<Object>>()
                 {
-                    public Object apply(Cell cell)
+                    public Iterable<Object> apply(final Row row)
                     {
-                        return Objects.hashCode(cf.id(), key(), cell.name());
+                        return Iterables.concat(Iterables.transform(row, new Function<ColumnData, Object>()
+                        {
+                            public Object apply(final ColumnData data)
+                            {
+                                return Objects.hashCode(update.metadata().cfId, key(), row.clustering(), data.column());
+                            }
+                        }));
                     }
-                });
+                }));
             }
         }));
     }
 
-    // Replaces all the CounterUpdateCell-s with updated regular CounterCell-s
-    private ColumnFamily processModifications(ColumnFamily changesCF)
+    private PartitionUpdate processModifications(PartitionUpdate changes)
     {
-        ColumnFamilyStore cfs = Keyspace.open(getKeyspaceName()).getColumnFamilyStore(changesCF.id());
+        ColumnFamilyStore cfs = Keyspace.open(getKeyspaceName()).getColumnFamilyStore(changes.metadata().cfId);
 
-        ColumnFamily resultCF = changesCF.cloneMeShallow();
-
-        List<CounterUpdateCell> counterUpdateCells = new ArrayList<>(changesCF.getColumnCount());
-        for (Cell cell : changesCF)
-        {
-            if (cell instanceof CounterUpdateCell)
-                counterUpdateCells.add((CounterUpdateCell)cell);
-            else
-                resultCF.addColumn(cell);
-        }
-
-        if (counterUpdateCells.isEmpty())
-            return resultCF; // only DELETEs
-
-        ClockAndCount[] currentValues = getCurrentValues(counterUpdateCells, cfs);
-        for (int i = 0; i < counterUpdateCells.size(); i++)
-        {
-            ClockAndCount currentValue = currentValues[i];
-            CounterUpdateCell update = counterUpdateCells.get(i);
-
-            long clock = currentValue.clock + 1L;
-            long count = currentValue.count + update.delta();
-
-            resultCF.addColumn(new BufferCounterCell(update.name(),
-                                                     CounterContext.instance().createGlobal(CounterId.getLocalId(), clock, count),
-                                                     update.timestamp()));
-        }
-
-        return resultCF;
-    }
-
-    // Attempt to load the current values(s) from cache. If that fails, read the rest from the cfs.
-    private ClockAndCount[] getCurrentValues(List<CounterUpdateCell> counterUpdateCells, ColumnFamilyStore cfs)
-    {
-        ClockAndCount[] currentValues = new ClockAndCount[counterUpdateCells.size()];
-        int remaining = counterUpdateCells.size();
+        List<PartitionUpdate.CounterMark> marks = changes.collectCounterMarks();
 
         if (CacheService.instance.counterCache.getCapacity() != 0)
         {
-            Tracing.trace("Fetching {} counter values from cache", counterUpdateCells.size());
-            remaining = getCurrentValuesFromCache(counterUpdateCells, cfs, currentValues);
-            if (remaining == 0)
-                return currentValues;
+            Tracing.trace("Fetching {} counter values from cache", marks.size());
+            updateWithCurrentValuesFromCache(marks, cfs);
+            if (marks.isEmpty())
+                return changes;
         }
 
-        Tracing.trace("Reading {} counter values from the CF", remaining);
-        getCurrentValuesFromCFS(counterUpdateCells, cfs, currentValues);
+        Tracing.trace("Reading {} counter values from the CF", marks.size());
+        updateWithCurrentValuesFromCFS(marks, cfs);
 
-        return currentValues;
+        // What's remain is new counters
+        for (PartitionUpdate.CounterMark mark : marks)
+            updateWithCurrentValue(mark, ClockAndCount.BLANK, cfs);
+
+        return changes;
+    }
+
+    private void updateWithCurrentValue(PartitionUpdate.CounterMark mark, ClockAndCount currentValue, ColumnFamilyStore cfs)
+    {
+        long clock = currentValue.clock + 1L;
+        long count = currentValue.count + CounterContext.instance().total(mark.value());
+
+        mark.setValue(CounterContext.instance().createGlobal(CounterId.getLocalId(), clock, count));
+
+        // Cache the newly updated value
+        cfs.putCachedCounter(key().getKey(), mark.clustering(), mark.column(), mark.path(), ClockAndCount.create(clock, count));
     }
 
     // Returns the count of cache misses.
-    private int getCurrentValuesFromCache(List<CounterUpdateCell> counterUpdateCells,
-                                          ColumnFamilyStore cfs,
-                                          ClockAndCount[] currentValues)
+    private void updateWithCurrentValuesFromCache(List<PartitionUpdate.CounterMark> marks, ColumnFamilyStore cfs)
     {
-        int cacheMisses = 0;
-        for (int i = 0; i < counterUpdateCells.size(); i++)
+        Iterator<PartitionUpdate.CounterMark> iter = marks.iterator();
+        while (iter.hasNext())
         {
-            ClockAndCount cached = cfs.getCachedCounter(key(), counterUpdateCells.get(i).name());
+            PartitionUpdate.CounterMark mark = iter.next();
+            ClockAndCount cached = cfs.getCachedCounter(key().getKey(), mark.clustering(), mark.column(), mark.path());
             if (cached != null)
-                currentValues[i] = cached;
-            else
-                cacheMisses++;
+            {
+                updateWithCurrentValue(mark, cached, cfs);
+                iter.remove();
+            }
         }
-        return cacheMisses;
     }
 
     // Reads the missing current values from the CFS.
-    private void getCurrentValuesFromCFS(List<CounterUpdateCell> counterUpdateCells,
-                                         ColumnFamilyStore cfs,
-                                         ClockAndCount[] currentValues)
+    private void updateWithCurrentValuesFromCFS(List<PartitionUpdate.CounterMark> marks, ColumnFamilyStore cfs)
     {
-        SortedSet<CellName> names = new TreeSet<>(cfs.metadata.comparator);
-        for (int i = 0; i < currentValues.length; i++)
-            if (currentValues[i] == null)
-                names.add(counterUpdateCells.get(i).name());
-
-        ReadCommand cmd = new SliceByNamesReadCommand(getKeyspaceName(), key(), cfs.metadata.cfName, Long.MIN_VALUE, new NamesQueryFilter(names));
-        Row row = cmd.getRow(cfs.keyspace);
-        ColumnFamily cf = row == null ? null : row.cf;
-
-        for (int i = 0; i < currentValues.length; i++)
+        ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+        BTreeSet.Builder<Clustering> names = BTreeSet.builder(cfs.metadata.comparator);
+        for (PartitionUpdate.CounterMark mark : marks)
         {
-            if (currentValues[i] != null)
-                continue;
-
-            Cell cell = cf == null ? null : cf.getColumn(counterUpdateCells.get(i).name());
-            if (cell == null || !cell.isLive()) // absent or a tombstone.
-                currentValues[i] = ClockAndCount.BLANK;
+            if (mark.clustering() != Clustering.STATIC_CLUSTERING)
+                names.add(mark.clustering());
+            if (mark.path() == null)
+                builder.add(mark.column());
             else
-                currentValues[i] = CounterContext.instance().getLocalClockAndCount(cell.value());
+                builder.select(mark.column(), mark.path());
+        }
+
+        int nowInSec = FBUtilities.nowInSeconds();
+        ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(names.build(), false);
+        SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(cfs.metadata, nowInSec, key(), builder.build(), filter);
+        PeekingIterator<PartitionUpdate.CounterMark> markIter = Iterators.peekingIterator(marks.iterator());
+        try (OpOrder.Group op = cfs.readOrdering.start(); RowIterator partition = UnfilteredRowIterators.filter(cmd.queryMemtableAndDisk(cfs, op), nowInSec))
+        {
+            updateForRow(markIter, partition.staticRow(), cfs);
+
+            while (partition.hasNext())
+            {
+                if (!markIter.hasNext())
+                    return;
+
+                updateForRow(markIter, partition.next(), cfs);
+            }
         }
     }
 
-    private void updateCounterCache(Mutation applied, Keyspace keyspace)
+    private int compare(Clustering c1, Clustering c2, ColumnFamilyStore cfs)
     {
-        if (CacheService.instance.counterCache.getCapacity() == 0)
+        if (c1 == Clustering.STATIC_CLUSTERING)
+            return c2 == Clustering.STATIC_CLUSTERING ? 0 : -1;
+        if (c2 == Clustering.STATIC_CLUSTERING)
+            return 1;
+
+        return cfs.getComparator().compare(c1, c2);
+    }
+
+    private void updateForRow(PeekingIterator<PartitionUpdate.CounterMark> markIter, Row row, ColumnFamilyStore cfs)
+    {
+        int cmp = 0;
+        // If the mark is before the row, we have no value for this mark, just consume it
+        while (markIter.hasNext() && (cmp = compare(markIter.peek().clustering(), row.clustering(), cfs)) < 0)
+            markIter.next();
+
+        if (!markIter.hasNext())
             return;
 
-        for (ColumnFamily cf : applied.getColumnFamilies())
+        while (cmp == 0)
         {
-            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cf.id());
-            for (Cell cell : cf)
-                if (cell instanceof CounterCell)
-                    cfs.putCachedCounter(key(), cell.name(), CounterContext.instance().getLocalClockAndCount(cell.value()));
-        }
-    }
+            PartitionUpdate.CounterMark mark = markIter.next();
+            Cell cell = mark.path() == null ? row.getCell(mark.column()) : row.getCell(mark.column(), mark.path());
+            if (cell != null)
+            {
+                updateWithCurrentValue(mark, CounterContext.instance().getLocalClockAndCount(cell.value()), cfs);
+                markIter.remove();
+            }
+            if (!markIter.hasNext())
+                return;
 
-    public void addAll(IMutation m)
-    {
-        if (!(m instanceof CounterMutation))
-            throw new IllegalArgumentException();
-        CounterMutation cm = (CounterMutation)m;
-        mutation.addAll(cm.mutation);
+            cmp = compare(markIter.peek().clustering(), row.clustering(), cfs);
+        }
     }
 
     public long getTimeout()
@@ -321,7 +324,7 @@
             out.writeUTF(cm.consistency.name());
         }
 
-        public CounterMutation deserialize(DataInput in, int version) throws IOException
+        public CounterMutation deserialize(DataInputPlus in, int version) throws IOException
         {
             Mutation m = Mutation.serializer.deserialize(in, version);
             ConsistencyLevel consistency = Enum.valueOf(ConsistencyLevel.class, in.readUTF());
@@ -331,7 +334,7 @@
         public long serializedSize(CounterMutation cm, int version)
         {
             return Mutation.serializer.serializedSize(cm.mutation, version)
-                 + TypeSizes.NATIVE.sizeof(cm.consistency.name());
+                 + TypeSizes.sizeof(cm.consistency.name());
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java b/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java
index 4dd8ac3..f89480a 100644
--- a/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/CounterMutationVerbHandler.java

@@ -49,7 +49,7 @@
         {
             public void run()
             {
-                MessagingService.instance().sendReply(new WriteResponse().createMessage(), id, message.from);
+                MessagingService.instance().sendReply(WriteResponse.createMessage(), id, message.from);
             }
         });
     }

diff --git a/src/java/org/apache/cassandra/db/CounterUpdateCell.java b/src/java/org/apache/cassandra/db/CounterUpdateCell.java
deleted file mode 100644
index 58ac365..0000000
--- a/src/java/org/apache/cassandra/db/CounterUpdateCell.java
+++ /dev/null

@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-/**
- * A counter update while it hasn't been applied yet by the leader replica.
- *
- * Contains a single counter update. When applied by the leader replica, this
- * is transformed to a relevant CounterCell. This Cell is a temporary data
- * structure that should never be stored inside a memtable or an sstable.
- */
-public interface CounterUpdateCell extends Cell
-{
-    public long delta();
-}

diff --git a/src/java/org/apache/cassandra/db/DataRange.java b/src/java/org/apache/cassandra/db/DataRange.java
index 1e6f8c8..f6776c4 100644
--- a/src/java/org/apache/cassandra/db/DataRange.java
+++ b/src/java/org/apache/cassandra/db/DataRange.java

@@ -6,7 +6,6 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
@@ -17,302 +16,439 @@
  */
 package org.apache.cassandra.db;
 
+import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
 
-import com.google.common.base.Objects;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.Composites;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.*;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
- * Groups key range and column filter for range queries.
- *
- * The main "trick" of this class is that the column filter can only
- * be obtained by providing the row key on which the column filter will
- * be applied (which we always know before actually querying the columns).
- *
- * This allows the paging DataRange to return a filter for most rows but a
- * potentially different ones for the starting and stopping key. Could
- * allow more fancy stuff in the future too, like column filters that
- * depend on the actual key value :)
+ * Groups both the range of partitions to query, and the clustering index filter to
+ * apply for each partition (for a (partition) range query).
+ * <p>
+ * The main "trick" is that the clustering index filter can only be obtained by
+ * providing the partition key on which the filter will be applied. This is
+ * necessary when paging range queries, as we might need a different filter
+ * for the starting key than for other keys (because the previous page we had
+ * queried may have ended in the middle of a partition).
  */
 public class DataRange
 {
-    protected final AbstractBounds<RowPosition> keyRange;
-    protected IDiskAtomFilter columnFilter;
-    protected final boolean selectFullRow;
+    public static final Serializer serializer = new Serializer();
 
-    public DataRange(AbstractBounds<RowPosition> range, IDiskAtomFilter columnFilter)
+    protected final AbstractBounds<PartitionPosition> keyRange;
+    protected final ClusteringIndexFilter clusteringIndexFilter;
+
+    /**
+     * Creates a {@code DataRange} given a range of partition keys and a clustering index filter. The
+     * return {@code DataRange} will return the same filter for all keys.
+     *
+     * @param range the range over partition keys to use.
+     * @param clusteringIndexFilter the clustering index filter to use.
+     */
+    public DataRange(AbstractBounds<PartitionPosition> range, ClusteringIndexFilter clusteringIndexFilter)
     {
         this.keyRange = range;
-        this.columnFilter = columnFilter;
-        this.selectFullRow = columnFilter instanceof SliceQueryFilter
-                           ? isFullRowSlice((SliceQueryFilter)columnFilter)
-                           : false;
+        this.clusteringIndexFilter = clusteringIndexFilter;
     }
 
-    public static boolean isFullRowSlice(SliceQueryFilter filter)
-    {
-        return filter.slices.length == 1
-            && filter.start().isEmpty()
-            && filter.finish().isEmpty()
-            && filter.count == Integer.MAX_VALUE;
-    }
-
+    /**
+     * Creates a {@code DataRange} to query all data (over the whole ring).
+     *
+     * @param partitioner the partitioner in use for the table.
+     *
+     * @return the newly create {@code DataRange}.
+     */
     public static DataRange allData(IPartitioner partitioner)
     {
         return forTokenRange(new Range<Token>(partitioner.getMinimumToken(), partitioner.getMinimumToken()));
     }
 
-    public static DataRange forTokenRange(Range<Token> keyRange)
+    /**
+     * Creates a {@code DataRange} to query all rows over the provided token range.
+     *
+     * @param tokenRange the (partition key) token range to query.
+     *
+     * @return the newly create {@code DataRange}.
+     */
+    public static DataRange forTokenRange(Range<Token> tokenRange)
     {
-        return forKeyRange(Range.makeRowRange(keyRange));
+        return forKeyRange(Range.makeRowRange(tokenRange));
     }
 
-    public static DataRange forKeyRange(Range<RowPosition> keyRange)
+    /**
+     * Creates a {@code DataRange} to query all rows over the provided key range.
+     *
+     * @param keyRange the (partition key) range to query.
+     *
+     * @return the newly create {@code DataRange}.
+     */
+    public static DataRange forKeyRange(Range<PartitionPosition> keyRange)
     {
-        return new DataRange(keyRange, new IdentityQueryFilter());
+        return new DataRange(keyRange, new ClusteringIndexSliceFilter(Slices.ALL, false));
     }
 
-    public AbstractBounds<RowPosition> keyRange()
+    /**
+     * Creates a {@code DataRange} to query all partitions of the ring using the provided
+     * clustering index filter.
+     *
+     * @param partitioner the partitioner in use for the table queried.
+     * @param filter the clustering index filter to use.
+     *
+     * @return the newly create {@code DataRange}.
+     */
+    public static DataRange allData(IPartitioner partitioner, ClusteringIndexFilter filter)
+    {
+        return new DataRange(Range.makeRowRange(new Range<Token>(partitioner.getMinimumToken(), partitioner.getMinimumToken())), filter);
+    }
+
+    /**
+     * The range of partition key queried by this {@code DataRange}.
+     *
+     * @return the range of partition key queried by this {@code DataRange}.
+     */
+    public AbstractBounds<PartitionPosition> keyRange()
     {
         return keyRange;
     }
 
-    public RowPosition startKey()
+    /**
+     * The start of the partition key range queried by this {@code DataRange}.
+     *
+     * @return the start of the partition key range queried by this {@code DataRange}.
+     */
+    public PartitionPosition startKey()
     {
         return keyRange.left;
     }
 
-    public RowPosition stopKey()
+    /**
+     * The end of the partition key range queried by this {@code DataRange}.
+     *
+     * @return the end of the partition key range queried by this {@code DataRange}.
+     */
+    public PartitionPosition stopKey()
     {
         return keyRange.right;
     }
 
     /**
-     * Returns true if tombstoned partitions should not be included in results or count towards the limit.
-     * See CASSANDRA-8490 for more details on why this is needed (and done this way).
-     * */
-    public boolean ignoredTombstonedPartitions()
+     * Whether the underlying clustering index filter is a names filter or not.
+     *
+     * @return Whether the underlying clustering index filter is a names filter or not.
+     */
+    public boolean isNamesQuery()
     {
-        if (!(columnFilter instanceof SliceQueryFilter))
-            return false;
-
-        return ((SliceQueryFilter) columnFilter).compositesToGroup == SliceQueryFilter.IGNORE_TOMBSTONED_PARTITIONS;
+        return clusteringIndexFilter instanceof ClusteringIndexNamesFilter;
     }
 
-    // Whether the bounds of this DataRange actually wraps around.
+    /**
+     * Whether the data range is for a paged request or not.
+     *
+     * @return true if for paging, false otherwise
+     */
+    public boolean isPaging()
+    {
+        return false;
+    }
+
+    /**
+     * Whether the range queried by this {@code DataRange} actually wraps around.
+     *
+     * @return whether the range queried by this {@code DataRange} actually wraps around.
+     */
     public boolean isWrapAround()
     {
-        // On range can ever wrap
+        // Only range can ever wrap
         return keyRange instanceof Range && ((Range<?>)keyRange).isWrapAround();
     }
 
-    public boolean contains(RowPosition pos)
+    /**
+     * Whether the provided ring position is covered by this {@code DataRange}.
+     *
+     * @return whether the provided ring position is covered by this {@code DataRange}.
+     */
+    public boolean contains(PartitionPosition pos)
     {
         return keyRange.contains(pos);
     }
 
-    public int getLiveCount(ColumnFamily data, long now)
+    /**
+     * Whether this {@code DataRange} queries everything (has no restriction neither on the
+     * partition queried, nor within the queried partition).
+     *
+     * @return Whether this {@code DataRange} queries everything.
+     */
+    public boolean isUnrestricted()
     {
-        return columnFilter instanceof SliceQueryFilter
-             ? ((SliceQueryFilter)columnFilter).lastCounted()
-             : columnFilter.getLiveCount(data, now);
+        return startKey().isMinimum() && stopKey().isMinimum() && clusteringIndexFilter.selectsAllPartition();
     }
 
-    public boolean selectsFullRowFor(ByteBuffer rowKey)
+    public boolean selectsAllPartition()
     {
-        return selectFullRow;
+        return clusteringIndexFilter.selectsAllPartition();
     }
 
     /**
-     * Returns a column filter that should be used for a particular row key.  Note that in the case of paging,
-     * slice starts and ends may change depending on the row key.
+     * Whether the underlying {@code ClusteringIndexFilter} is reversed or not.
+     *
+     * @return whether the underlying {@code ClusteringIndexFilter} is reversed or not.
      */
-    public IDiskAtomFilter columnFilter(ByteBuffer rowKey)
+    public boolean isReversed()
     {
-        return columnFilter;
+        return clusteringIndexFilter.isReversed();
     }
 
     /**
-     * Sets a new limit on the number of (grouped) cells to fetch. This is currently only used when the query limit applies
-     * to CQL3 rows.
+     * The clustering index filter to use for the provided key.
+     * <p>
+     * This may or may not be the same filter for all keys (that is, paging range
+     * use a different filter for their start key).
+     *
+     * @param key the partition key for which we want the clustering index filter.
+     *
+     * @return the clustering filter to use for {@code key}.
      */
-    public void updateColumnsLimit(int count)
+    public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key)
     {
-        columnFilter.updateColumnsLimit(count);
+        return clusteringIndexFilter;
     }
 
+    /**
+     * Returns a new {@code DataRange} for use when paging {@code this} range.
+     *
+     * @param range the range of partition keys to query.
+     * @param comparator the comparator for the table queried.
+     * @param lastReturned the clustering for the last result returned by the previous page, i.e. the result we want to start our new page
+     * from. This last returned <b>must</b> correspond to left bound of {@code range} (in other words, {@code range.left} must be the
+     * partition key for that {@code lastReturned} result).
+     * @param inclusive whether or not we want to include the {@code lastReturned} in the newly returned page of results.
+     *
+     * @return a new {@code DataRange} suitable for paging {@code this} range given the {@code lastRetuned} result of the previous page.
+     */
+    public DataRange forPaging(AbstractBounds<PartitionPosition> range, ClusteringComparator comparator, Clustering lastReturned, boolean inclusive)
+    {
+        return new Paging(range, clusteringIndexFilter, comparator, lastReturned, inclusive);
+    }
+
+    /**
+     * Returns a new {@code DataRange} equivalent to {@code this} one but restricted to the provided sub-range.
+     *
+     * @param range the sub-range to use for the newly returned data range. Note that assumes that {@code range} is a proper
+     * sub-range of the initial range but doesn't validate it. You should make sure to only provided sub-ranges however or this
+     * might throw off the paging case (see Paging.forSubRange()).
+     *
+     * @return a new {@code DataRange} using {@code range} as partition key range and the clustering index filter filter from {@code this}.
+     */
+    public DataRange forSubRange(AbstractBounds<PartitionPosition> range)
+    {
+        return new DataRange(range, clusteringIndexFilter);
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        return String.format("range=%s pfilter=%s", keyRange.getString(metadata.getKeyValidator()), clusteringIndexFilter.toString(metadata));
+    }
+
+    public String toCQLString(CFMetaData metadata)
+    {
+        if (isUnrestricted())
+            return "UNRESTRICTED";
+
+        StringBuilder sb = new StringBuilder();
+
+        boolean needAnd = false;
+        if (!startKey().isMinimum())
+        {
+            appendClause(startKey(), sb, metadata, true, keyRange.isStartInclusive());
+            needAnd = true;
+        }
+        if (!stopKey().isMinimum())
+        {
+            if (needAnd)
+                sb.append(" AND ");
+            appendClause(stopKey(), sb, metadata, false, keyRange.isEndInclusive());
+            needAnd = true;
+        }
+
+        String filterString = clusteringIndexFilter.toCQLString(metadata);
+        if (!filterString.isEmpty())
+            sb.append(needAnd ? " AND " : "").append(filterString);
+
+        return sb.toString();
+    }
+
+    private void appendClause(PartitionPosition pos, StringBuilder sb, CFMetaData metadata, boolean isStart, boolean isInclusive)
+    {
+        sb.append("token(");
+        sb.append(ColumnDefinition.toCQLString(metadata.partitionKeyColumns()));
+        sb.append(") ").append(getOperator(isStart, isInclusive)).append(" ");
+        if (pos instanceof DecoratedKey)
+        {
+            sb.append("token(");
+            appendKeyString(sb, metadata.getKeyValidator(), ((DecoratedKey)pos).getKey());
+            sb.append(")");
+        }
+        else
+        {
+            sb.append(((Token.KeyBound)pos).getToken());
+        }
+    }
+
+    private static String getOperator(boolean isStart, boolean isInclusive)
+    {
+        return isStart
+             ? (isInclusive ? ">=" : ">")
+             : (isInclusive ? "<=" : "<");
+    }
+
+    // TODO: this is reused in SinglePartitionReadCommand but this should not really be here. Ideally
+    // we need a more "native" handling of composite partition keys.
+    public static void appendKeyString(StringBuilder sb, AbstractType<?> type, ByteBuffer key)
+    {
+        if (type instanceof CompositeType)
+        {
+            CompositeType ct = (CompositeType)type;
+            ByteBuffer[] values = ct.split(key);
+            for (int i = 0; i < ct.types.size(); i++)
+                sb.append(i == 0 ? "" : ", ").append(ct.types.get(i).getString(values[i]));
+        }
+        else
+        {
+            sb.append(type.getString(key));
+        }
+    }
+
+    /**
+     * Specialized {@code DataRange} used for the paging case.
+     * <p>
+     * It uses the clustering of the last result of the previous page to restrict the filter on the
+     * first queried partition (the one for that last result) so it only fetch results that follow that
+     * last result. In other words, this makes sure this resume paging where we left off.
+     */
     public static class Paging extends DataRange
     {
-        // The slice of columns that we want to fetch for each row, ignoring page start/end issues.
-        private final SliceQueryFilter sliceFilter;
+        private final ClusteringComparator comparator;
+        private final Clustering lastReturned;
+        private final boolean inclusive;
 
-        private final CFMetaData cfm;
-
-        private final Comparator<Composite> comparator;
-
-        // used to restrict the start of the slice for the first partition in the range
-        private final Composite firstPartitionColumnStart;
-
-        // used to restrict the end of the slice for the last partition in the range
-        private final Composite lastPartitionColumnFinish;
-
-        // tracks the last key that we updated the filter for to avoid duplicating work
-        private ByteBuffer lastKeyFilterWasUpdatedFor;
-
-        private Paging(AbstractBounds<RowPosition> range, SliceQueryFilter filter, Composite firstPartitionColumnStart,
-                       Composite lastPartitionColumnFinish, CFMetaData cfm, Comparator<Composite> comparator)
+        private Paging(AbstractBounds<PartitionPosition> range,
+                       ClusteringIndexFilter filter,
+                       ClusteringComparator comparator,
+                       Clustering lastReturned,
+                       boolean inclusive)
         {
             super(range, filter);
 
             // When using a paging range, we don't allow wrapped ranges, as it's unclear how to handle them properly.
-            // This is ok for now since we only need this in range slice queries, and the range are "unwrapped" in that case.
+            // This is ok for now since we only need this in range queries, and the range are "unwrapped" in that case.
             assert !(range instanceof Range) || !((Range<?>)range).isWrapAround() || range.right.isMinimum() : range;
+            assert lastReturned != null;
 
-            this.sliceFilter = filter;
-            this.cfm = cfm;
             this.comparator = comparator;
-            this.firstPartitionColumnStart = firstPartitionColumnStart;
-            this.lastPartitionColumnFinish = lastPartitionColumnFinish;
-            this.lastKeyFilterWasUpdatedFor = null;
-        }
-
-        public Paging(AbstractBounds<RowPosition> range, SliceQueryFilter filter, Composite columnStart, Composite columnFinish, CFMetaData cfm)
-        {
-            this(range, filter, columnStart, columnFinish, cfm, filter.isReversed() ? cfm.comparator.reverseComparator() : cfm.comparator);
+            this.lastReturned = lastReturned;
+            this.inclusive = inclusive;
         }
 
         @Override
-        public boolean selectsFullRowFor(ByteBuffer rowKey)
+        public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key)
         {
-            // If we initial filter is not the full filter, don't bother
-            if (!selectFullRow)
-                return false;
-
-            if (!equals(startKey(), rowKey) && !equals(stopKey(), rowKey))
-                return true;
-
-            return isFullRowSlice((SliceQueryFilter)columnFilter(rowKey));
-        }
-
-        private boolean equals(RowPosition pos, ByteBuffer rowKey)
-        {
-            return pos instanceof DecoratedKey && ((DecoratedKey)pos).getKey().equals(rowKey);
+            return key.equals(startKey())
+                 ? clusteringIndexFilter.forPaging(comparator, lastReturned, inclusive)
+                 : clusteringIndexFilter;
         }
 
         @Override
-        public IDiskAtomFilter columnFilter(ByteBuffer rowKey)
+        public DataRange forSubRange(AbstractBounds<PartitionPosition> range)
         {
-            /*
-             * We have that ugly hack that for slice queries, when we ask for
-             * the live count, we reach into the query filter to get the last
-             * counter number of columns to avoid recounting.
-             * Maybe we should just remove that hack, but in the meantime, we
-             * need to keep a reference the last returned filter.
-             */
-            if (equals(startKey(), rowKey) || equals(stopKey(), rowKey))
+            // This is called for subrange of the initial range. So either it's the beginning of the initial range,
+            // and we need to preserver lastReturned, or it's not, and we don't care about it anymore.
+            return range.left.equals(keyRange().left)
+                 ? new Paging(range, clusteringIndexFilter, comparator, lastReturned, inclusive)
+                 : new DataRange(range, clusteringIndexFilter);
+        }
+
+        /**
+         * @return the last Clustering that was returned (in the previous page)
+         */
+        public Clustering getLastReturned()
+        {
+            return lastReturned;
+        }
+
+        @Override
+        public boolean isPaging()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean isUnrestricted()
+        {
+            return false;
+        }
+
+        @Override
+        public String toString(CFMetaData metadata)
+        {
+            return String.format("range=%s (paging) pfilter=%s lastReturned=%s (%s)",
+                                 keyRange.getString(metadata.getKeyValidator()),
+                                 clusteringIndexFilter.toString(metadata),
+                                 lastReturned.toString(metadata),
+                                 inclusive ? "included" : "excluded");
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(DataRange range, DataOutputPlus out, int version, CFMetaData metadata) throws IOException
+        {
+            AbstractBounds.rowPositionSerializer.serialize(range.keyRange, out, version);
+            ClusteringIndexFilter.serializer.serialize(range.clusteringIndexFilter, out, version);
+            boolean isPaging = range instanceof Paging;
+            out.writeBoolean(isPaging);
+            if (isPaging)
             {
-                if (!rowKey.equals(lastKeyFilterWasUpdatedFor))
-                {
-                    this.lastKeyFilterWasUpdatedFor = rowKey;
-                    columnFilter = sliceFilter.withUpdatedSlices(slicesForKey(rowKey));
-                }
+                Clustering.serializer.serialize(((Paging)range).lastReturned, out, version, metadata.comparator.subtypes());
+                out.writeBoolean(((Paging)range).inclusive);
+            }
+        }
+
+        public DataRange deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+        {
+            AbstractBounds<PartitionPosition> range = AbstractBounds.rowPositionSerializer.deserialize(in, metadata.partitioner, version);
+            ClusteringIndexFilter filter = ClusteringIndexFilter.serializer.deserialize(in, version, metadata);
+            if (in.readBoolean())
+            {
+                ClusteringComparator comparator = metadata.comparator;
+                Clustering lastReturned = Clustering.serializer.deserialize(in, version, comparator.subtypes());
+                boolean inclusive = in.readBoolean();
+                return new Paging(range, filter, comparator, lastReturned, inclusive);
             }
             else
             {
-                columnFilter = sliceFilter;
+                return new DataRange(range, filter);
             }
-
-            return columnFilter;
         }
 
-        /** Returns true if the slice includes static columns, false otherwise. */
-        private boolean sliceIncludesStatics(ColumnSlice slice, boolean reversed, CFMetaData cfm)
+        public long serializedSize(DataRange range, int version, CFMetaData metadata)
         {
-            return cfm.hasStaticColumns() &&
-                   slice.includes(reversed ? cfm.comparator.reverseComparator() : cfm.comparator, cfm.comparator.staticPrefix().end());
-        }
+            long size = AbstractBounds.rowPositionSerializer.serializedSize(range.keyRange, version)
+                      + ClusteringIndexFilter.serializer.serializedSize(range.clusteringIndexFilter, version)
+                      + 1; // isPaging boolean
 
-        private ColumnSlice[] slicesForKey(ByteBuffer key)
-        {
-            // Also note that firstPartitionColumnStart and lastPartitionColumnFinish, when used, only "restrict" the filter slices,
-            // it doesn't expand on them. As such, we can ignore the case where they are empty and we do
-            // as it screw up with the logic below (see #6592)
-            Composite newStart = equals(startKey(), key) && !firstPartitionColumnStart.isEmpty() ? firstPartitionColumnStart : null;
-            Composite newFinish = equals(stopKey(), key) && !lastPartitionColumnFinish.isEmpty() ? lastPartitionColumnFinish : null;
-
-            // in the common case, we'll have the same number of slices
-            List<ColumnSlice> newSlices = new ArrayList<>(sliceFilter.slices.length);
-
-            // Check our slices to see if any fall before the page start (in which case they can be removed) or
-            // if they contain the page start (in which case they should start from the page start).  However, if the
-            // slices would include static columns, we need to ensure they are also fetched, and so a separate
-            // slice for the static columns may be required.
-            // Note that if the query is reversed, we can't handle statics by simply adding a separate slice here, so
-            // the reversed case is handled by SliceFromReadCommand instead. See CASSANDRA-8502 for more details.
-            for (ColumnSlice slice : sliceFilter.slices)
+            if (range instanceof Paging)
             {
-                if (newStart != null)
-                {
-                    if (slice.isBefore(comparator, newStart))
-                    {
-                        if (!sliceFilter.reversed && sliceIncludesStatics(slice, false, cfm))
-                            newSlices.add(new ColumnSlice(Composites.EMPTY, cfm.comparator.staticPrefix().end()));
-
-                        continue;
-                    }
-
-                    if (slice.includes(comparator, newStart))
-                    {
-                        if (!sliceFilter.reversed && sliceIncludesStatics(slice, false, cfm) && !newStart.equals(Composites.EMPTY))
-                            newSlices.add(new ColumnSlice(Composites.EMPTY, cfm.comparator.staticPrefix().end()));
-
-                        slice = new ColumnSlice(newStart, slice.finish);
-                    }
-
-                    // once we see a slice that either includes the page start or is after it, we can stop checking
-                    // against the page start (because the slices are ordered)
-                    newStart = null;
-                }
-
-                assert newStart == null;
-                if (newFinish != null && !slice.isBefore(comparator, newFinish))
-                {
-                    if (slice.includes(comparator, newFinish))
-                        newSlices.add(new ColumnSlice(slice.start, newFinish));
-                    // In any case, we're done
-                    break;
-                }
-                newSlices.add(slice);
+                size += Clustering.serializer.serializedSize(((Paging)range).lastReturned, version, metadata.comparator.subtypes());
+                size += 1; // inclusive boolean
             }
-
-            return newSlices.toArray(new ColumnSlice[newSlices.size()]);
-        }
-
-        @Override
-        public void updateColumnsLimit(int count)
-        {
-            columnFilter.updateColumnsLimit(count);
-            sliceFilter.updateColumnsLimit(count);
-        }
-
-        @Override
-        public String toString()
-        {
-            return Objects.toStringHelper(this)
-                          .add("keyRange", keyRange)
-                          .add("sliceFilter", sliceFilter)
-                          .add("columnFilter", columnFilter)
-                          .add("firstPartitionColumnStart", firstPartitionColumnStart == null ? "null" : cfm.comparator.getString(firstPartitionColumnStart))
-                          .add("lastPartitionColumnFinish", lastPartitionColumnFinish == null ? "null" : cfm.comparator.getString(lastPartitionColumnFinish))
-                          .toString();
+            return size;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java
index cc62a15..92d6414 100644
--- a/src/java/org/apache/cassandra/db/DecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/DecoratedKey.java

@@ -36,7 +36,7 @@
  * if this matters, you can subclass RP to use a stronger hash, or use a non-lossy tokenization scheme (as in the
  * OrderPreservingPartitioner classes).
  */
-public abstract class DecoratedKey implements RowPosition, FilterKey
+public abstract class DecoratedKey implements PartitionPosition, FilterKey
 {
     public static final Comparator<DecoratedKey> comparator = new Comparator<DecoratedKey>()
     {
@@ -72,7 +72,7 @@
         return ByteBufferUtil.compareUnsigned(getKey(), other.getKey()) == 0; // we compare faster than BB.equals for array backed BB
     }
 
-    public int compareTo(RowPosition pos)
+    public int compareTo(PartitionPosition pos)
     {
         if (this == pos)
             return 0;
@@ -86,7 +86,7 @@
         return cmp == 0 ? ByteBufferUtil.compareUnsigned(getKey(), otherKey.getKey()) : cmp;
     }
 
-    public static int compareTo(IPartitioner partitioner, ByteBuffer key, RowPosition position)
+    public static int compareTo(IPartitioner partitioner, ByteBuffer key, PartitionPosition position)
     {
         // delegate to Token.KeyBound if needed
         if (!(position instanceof DecoratedKey))
@@ -113,9 +113,9 @@
         return false;
     }
 
-    public RowPosition.Kind kind()
+    public PartitionPosition.Kind kind()
     {
-        return RowPosition.Kind.ROW_KEY;
+        return PartitionPosition.Kind.ROW_KEY;
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java b/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java
index 51d15b4..8b3e121 100644
--- a/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/DefinitionsUpdateVerbHandler.java

@@ -24,9 +24,10 @@
 
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.net.IVerbHandler;
 import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.utils.WrappedRunnable;
 
 /**
@@ -45,9 +46,9 @@
 
         StageManager.getStage(Stage.MIGRATION).submit(new WrappedRunnable()
         {
-            public void runMayThrow() throws Exception
+            public void runMayThrow() throws ConfigurationException
             {
-                LegacySchemaTables.mergeSchema(message.payload);
+                SchemaKeyspace.mergeSchemaAndAnnounceVersion(message.payload);
             }
         });
     }

diff --git a/src/java/org/apache/cassandra/db/DeletedCell.java b/src/java/org/apache/cassandra/db/DeletedCell.java
deleted file mode 100644
index 998c409..0000000
--- a/src/java/org/apache/cassandra/db/DeletedCell.java
+++ /dev/null

@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-public interface DeletedCell extends Cell
-{
-    DeletedCell localCopy(CFMetaData metadata, AbstractAllocator allocator);
-
-    DeletedCell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
-}

diff --git a/src/java/org/apache/cassandra/db/DeletionInfo.java b/src/java/org/apache/cassandra/db/DeletionInfo.java
index 048324a..5bec812 100644
--- a/src/java/org/apache/cassandra/db/DeletionInfo.java
+++ b/src/java/org/apache/cassandra/db/DeletionInfo.java

@@ -17,453 +17,58 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
-import java.io.IOException;
-import java.security.MessageDigest;
-import java.util.Comparator;
 import java.util.Iterator;
 
-import com.google.common.base.Objects;
-import com.google.common.collect.Iterators;
-
 import org.apache.cassandra.cache.IMeasurableMemory;
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.db.rows.EncodingStats;
 import org.apache.cassandra.utils.memory.AbstractAllocator;
 
 /**
- * A combination of a top-level (or row) tombstone and range tombstones describing the deletions
- * within a {@link ColumnFamily} (or row).
+ * A combination of a top-level (partition) tombstone and range tombstones describing the deletions
+ * within a partition.
+ * <p>
+ * Note that in practice {@link MutableDeletionInfo} is the only concrete implementation of this, however
+ * different parts of the code will return either {@code DeletionInfo} or {@code MutableDeletionInfo} based
+ * on whether it can/should be mutated or not.
+ * <p>
+ * <b>Warning:</b> do not ever cast a {@code DeletionInfo} into a {@code MutableDeletionInfo} to mutate it!!!
+ * TODO: it would be safer to have 2 actual implementation of DeletionInfo, one mutable and one that isn't (I'm
+ * just lazy right this minute).
  */
-public class DeletionInfo implements IMeasurableMemory
+public interface DeletionInfo extends IMeasurableMemory
 {
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new DeletionInfo(0, 0));
-
-    /**
-     * This represents a deletion of the entire row.  We can't represent this within the RangeTombstoneList, so it's
-     * kept separately.  This also slightly optimizes the common case of a full row deletion.
-     */
-    private DeletionTime topLevel;
-
-    /**
-     * A list of range tombstones within the row.  This is left as null if there are no range tombstones
-     * (to save an allocation (since it's a common case).
-     */
-    private RangeTombstoneList ranges;
-
-    /**
-     * Creates a DeletionInfo with only a top-level (row) tombstone.
-     * @param markedForDeleteAt the time after which the entire row should be considered deleted
-     * @param localDeletionTime what time the deletion write was applied locally (for purposes of
-     *                          purging the tombstone after gc_grace_seconds).
-     */
-    public DeletionInfo(long markedForDeleteAt, int localDeletionTime)
-    {
-        // Pre-1.1 node may return MIN_VALUE for non-deleted container, but the new default is MAX_VALUE
-        // (see CASSANDRA-3872)
-        this(new DeletionTime(markedForDeleteAt, localDeletionTime == Integer.MIN_VALUE ? Integer.MAX_VALUE : localDeletionTime));
-    }
-
-    public DeletionInfo(DeletionTime topLevel)
-    {
-        this(topLevel, null);
-    }
-
-    public DeletionInfo(Composite start, Composite end, Comparator<Composite> comparator, long markedForDeleteAt, int localDeletionTime)
-    {
-        this(DeletionTime.LIVE, new RangeTombstoneList(comparator, 1));
-        ranges.add(start, end, markedForDeleteAt, localDeletionTime);
-    }
-
-    public DeletionInfo(RangeTombstone rangeTombstone, Comparator<Composite> comparator)
-    {
-        this(rangeTombstone.min, rangeTombstone.max, comparator, rangeTombstone.data.markedForDeleteAt, rangeTombstone.data.localDeletionTime);
-    }
-
-    private DeletionInfo(DeletionTime topLevel, RangeTombstoneList ranges)
-    {
-        this.topLevel = topLevel;
-        this.ranges = ranges;
-    }
-
-    /**
-     * Returns a new DeletionInfo that has no top-level tombstone or any range tombstones.
-     */
-    public static DeletionInfo live()
-    {
-        return new DeletionInfo(DeletionTime.LIVE);
-    }
-
-    public DeletionInfo copy()
-    {
-        return new DeletionInfo(topLevel, ranges == null ? null : ranges.copy());
-    }
-
-    public DeletionInfo copy(AbstractAllocator allocator)
-    {
-
-        RangeTombstoneList rangesCopy = null;
-        if (ranges != null)
-             rangesCopy = ranges.copy(allocator);
-
-        return new DeletionInfo(topLevel, rangesCopy);
-    }
+    // Note that while MutableDeletionInfo.live() is mutable, we expose it here as a non-mutable DeletionInfo so sharing is fine.
+    public static final DeletionInfo LIVE = MutableDeletionInfo.live();
 
     /**
      * Returns whether this DeletionInfo is live, that is deletes no columns.
      */
-    public boolean isLive()
-    {
-        return topLevel.isLive() && (ranges == null || ranges.isEmpty());
-    }
+    public boolean isLive();
 
-    /**
-     * Return whether a given cell is deleted by the container having this deletion info.
-     *
-     * @param cell the cell to check.
-     * @return true if the cell is deleted, false otherwise
-     */
-    public boolean isDeleted(Cell cell)
-    {
-        // We do rely on this test: if topLevel.markedForDeleteAt is MIN_VALUE, we should not
-        // consider the column deleted even if timestamp=MIN_VALUE, otherwise this break QueryFilter.isRelevant
-        if (isLive())
-            return false;
-
-        if (cell.timestamp() <= topLevel.markedForDeleteAt)
-            return true;
-
-        // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
-        if (!topLevel.isLive() && cell instanceof CounterCell)
-            return true;
-
-        return ranges != null && ranges.isDeleted(cell);
-    }
-
-    /**
-     * Returns a new {@link InOrderTester} in forward order.
-     */
-    public InOrderTester inOrderTester()
-    {
-        return inOrderTester(false);
-    }
-
-    /**
-     * Returns a new {@link InOrderTester} given the order in which
-     * columns will be passed to it.
-     */
-    public InOrderTester inOrderTester(boolean reversed)
-    {
-        return new InOrderTester(reversed);
-    }
-
-    /**
-     * Purge every tombstones that are older than {@code gcbefore}.
-     *
-     * @param gcBefore timestamp (in seconds) before which tombstones should be purged
-     */
-    public void purge(int gcBefore)
-    {
-        topLevel = topLevel.localDeletionTime < gcBefore ? DeletionTime.LIVE : topLevel;
-
-        if (ranges != null)
-        {
-            ranges.purge(gcBefore);
-            if (ranges.isEmpty())
-                ranges = null;
-        }
-    }
-
-    /**
-     * Evaluates difference between this deletion info and superset for read repair
-     *
-     * @return the difference between the two, or LIVE if no difference
-     */
-    public DeletionInfo diff(DeletionInfo superset)
-    {
-        RangeTombstoneList rangeDiff = superset.ranges == null || superset.ranges.isEmpty()
-                                     ? null
-                                     : ranges == null ? superset.ranges : ranges.diff(superset.ranges);
-
-        return topLevel.markedForDeleteAt != superset.topLevel.markedForDeleteAt || rangeDiff != null
-             ? new DeletionInfo(superset.topLevel, rangeDiff)
-             : DeletionInfo.live();
-    }
-
-
-    /**
-     * Digests deletion info. Used to trigger read repair on mismatch.
-     */
-    public void updateDigest(MessageDigest digest)
-    {
-        if (topLevel.markedForDeleteAt != Long.MIN_VALUE)
-            digest.update(ByteBufferUtil.bytes(topLevel.markedForDeleteAt));
-
-        if (ranges != null)
-            ranges.updateDigest(digest);
-    }
-
-    /**
-     * Returns true if {@code purge} would remove the top-level tombstone or any of the range
-     * tombstones, false otherwise.
-     * @param gcBefore timestamp (in seconds) before which tombstones should be purged
-     */
-    public boolean hasPurgeableTombstones(int gcBefore)
-    {
-        if (topLevel.localDeletionTime < gcBefore)
-            return true;
-
-        return ranges != null && ranges.hasPurgeableTombstones(gcBefore);
-    }
-
-    /**
-     * Potentially replaces the top-level tombstone with another, keeping whichever has the higher markedForDeleteAt
-     * timestamp.
-     * @param newInfo
-     */
-    public void add(DeletionTime newInfo)
-    {
-        if (topLevel.markedForDeleteAt < newInfo.markedForDeleteAt)
-            topLevel = newInfo;
-    }
-
-    public void add(RangeTombstone tombstone, Comparator<Composite> comparator)
-    {
-        if (ranges == null)
-            ranges = new RangeTombstoneList(comparator, 1);
-
-        ranges.add(tombstone);
-    }
-
-    /**
-     * Combines another DeletionInfo with this one and returns the result.  Whichever top-level tombstone
-     * has the higher markedForDeleteAt timestamp will be kept, along with its localDeletionTime.  The
-     * range tombstones will be combined.
-     *
-     * @return this object.
-     */
-    public DeletionInfo add(DeletionInfo newInfo)
-    {
-        add(newInfo.topLevel);
-
-        if (ranges == null)
-            ranges = newInfo.ranges == null ? null : newInfo.ranges.copy();
-        else if (newInfo.ranges != null)
-            ranges.addAll(newInfo.ranges);
-
-        return this;
-    }
-
-    /**
-     * Returns the minimum timestamp in any of the range tombstones or the top-level tombstone.
-     */
-    public long minTimestamp()
-    {
-        return ranges == null
-             ? topLevel.markedForDeleteAt
-             : Math.min(topLevel.markedForDeleteAt, ranges.minMarkedAt());
-    }
-
-    /**
-     * Returns the maximum timestamp in any of the range tombstones or the top-level tombstone.
-     */
-    public long maxTimestamp()
-    {
-        return ranges == null
-             ? topLevel.markedForDeleteAt
-             : Math.max(topLevel.markedForDeleteAt, ranges.maxMarkedAt());
-    }
-
-    /**
-     * Returns the top-level (or "row") tombstone.
-     */
-    public DeletionTime getTopLevelDeletion()
-    {
-        return topLevel;
-    }
+    public DeletionTime getPartitionDeletion();
 
     // Use sparingly, not the most efficient thing
-    public Iterator<RangeTombstone> rangeIterator()
-    {
-        return ranges == null ? Iterators.<RangeTombstone>emptyIterator() : ranges.iterator();
-    }
+    public Iterator<RangeTombstone> rangeIterator(boolean reversed);
 
-    public Iterator<RangeTombstone> rangeIterator(Composite start, Composite finish)
-    {
-        return ranges == null ? Iterators.<RangeTombstone>emptyIterator() : ranges.iterator(start, finish);
-    }
+    public Iterator<RangeTombstone> rangeIterator(Slice slice, boolean reversed);
 
-    public RangeTombstone rangeCovering(Composite name)
-    {
-        return ranges == null ? null : ranges.search(name);
-    }
+    public RangeTombstone rangeCovering(Clustering name);
 
-    public int dataSize()
-    {
-        int size = TypeSizes.NATIVE.sizeof(topLevel.markedForDeleteAt);
-        return size + (ranges == null ? 0 : ranges.dataSize());
-    }
+    public void collectStats(EncodingStats.Collector collector);
 
-    public boolean hasRanges()
-    {
-        return ranges != null && !ranges.isEmpty();
-    }
+    public int dataSize();
 
-    public int rangeCount()
-    {
-        return hasRanges() ? ranges.size() : 0;
-    }
+    public boolean hasRanges();
+
+    public int rangeCount();
+
+    public long maxTimestamp();
 
     /**
      * Whether this deletion info may modify the provided one if added to it.
      */
-    public boolean mayModify(DeletionInfo delInfo)
-    {
-        return topLevel.compareTo(delInfo.topLevel) > 0 || hasRanges();
-    }
+    public boolean mayModify(DeletionInfo delInfo);
 
-    @Override
-    public String toString()
-    {
-        if (ranges == null || ranges.isEmpty())
-            return String.format("{%s}", topLevel);
-        else
-            return String.format("{%s, ranges=%s}", topLevel, rangesAsString());
-    }
-
-    private String rangesAsString()
-    {
-        assert !ranges.isEmpty();
-        StringBuilder sb = new StringBuilder();
-        CType type = (CType)ranges.comparator();
-        assert type != null;
-        Iterator<RangeTombstone> iter = rangeIterator();
-        while (iter.hasNext())
-        {
-            RangeTombstone i = iter.next();
-            sb.append("[");
-            sb.append(type.getString(i.min)).append("-");
-            sb.append(type.getString(i.max)).append(", ");
-            sb.append(i.data);
-            sb.append("]");
-        }
-        return sb.toString();
-    }
-
-    // Updates all the timestamp of the deletion contained in this DeletionInfo to be {@code timestamp}.
-    public void updateAllTimestamp(long timestamp)
-    {
-        if (topLevel.markedForDeleteAt != Long.MIN_VALUE)
-            topLevel = new DeletionTime(timestamp, topLevel.localDeletionTime);
-
-        if (ranges != null)
-            ranges.updateAllTimestamp(timestamp);
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if(!(o instanceof DeletionInfo))
-            return false;
-        DeletionInfo that = (DeletionInfo)o;
-        return topLevel.equals(that.topLevel) && Objects.equal(ranges, that.ranges);
-    }
-
-    @Override
-    public final int hashCode()
-    {
-        return Objects.hashCode(topLevel, ranges);
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return EMPTY_SIZE + topLevel.unsharedHeapSize() + (ranges == null ? 0 : ranges.unsharedHeapSize());
-    }
-
-    public static class Serializer implements IVersionedSerializer<DeletionInfo>
-    {
-        private final RangeTombstoneList.Serializer rtlSerializer;
-
-        public Serializer(CType type)
-        {
-            this.rtlSerializer = new RangeTombstoneList.Serializer(type);
-        }
-
-        public void serialize(DeletionInfo info, DataOutputPlus out, int version) throws IOException
-        {
-            DeletionTime.serializer.serialize(info.topLevel, out);
-            rtlSerializer.serialize(info.ranges, out, version);
-        }
-
-        public DeletionInfo deserialize(DataInput in, int version) throws IOException
-        {
-            DeletionTime topLevel = DeletionTime.serializer.deserialize(in);
-            RangeTombstoneList ranges = rtlSerializer.deserialize(in, version);
-            return new DeletionInfo(topLevel, ranges);
-        }
-
-        public long serializedSize(DeletionInfo info, TypeSizes typeSizes, int version)
-        {
-            long size = DeletionTime.serializer.serializedSize(info.topLevel, typeSizes);
-            return size + rtlSerializer.serializedSize(info.ranges, typeSizes, version);
-        }
-
-        public long serializedSize(DeletionInfo info, int version)
-        {
-            return serializedSize(info, TypeSizes.NATIVE, version);
-        }
-    }
-
-    /**
-     * This object allow testing whether a given column (name/timestamp) is deleted
-     * or not by this DeletionInfo, assuming that the columns given to this
-     * object are passed in forward or reversed comparator sorted order.
-     *
-     * This is more efficient that calling DeletionInfo.isDeleted() repeatedly
-     * in that case.
-     */
-    public class InOrderTester
-    {
-        /*
-         * Note that because because range tombstone are added to this DeletionInfo while we iterate,
-         * `ranges` may be null initially and we need to wait for the first range to create the tester (once
-         * created the test will pick up new tombstones however). We are guaranteed that a range tombstone
-         * will be added *before* we test any column that it may delete, so this is ok.
-         */
-        private RangeTombstoneList.InOrderTester tester;
-        private final boolean reversed;
-
-        private InOrderTester(boolean reversed)
-        {
-            this.reversed = reversed;
-        }
-
-        public boolean isDeleted(Cell cell)
-        {
-            if (cell.timestamp() <= topLevel.markedForDeleteAt)
-                return true;
-
-            // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
-            if (!topLevel.isLive() && cell instanceof CounterCell)
-                return true;
-
-            /*
-             * We don't optimize the reversed case for now because RangeTombstoneList
-             * is always in forward sorted order.
-             */
-            if (reversed)
-                 return DeletionInfo.this.isDeleted(cell);
-
-            // Maybe create the tester if we hadn't yet and we now have some ranges (see above).
-            if (tester == null && ranges != null)
-                tester = ranges.inOrderTester();
-
-            return tester != null && tester.isDeleted(cell);
-        }
-    }
+    public MutableDeletionInfo mutableCopy();
+    public DeletionInfo copy(AbstractAllocator allocator);
 }

diff --git a/src/java/org/apache/cassandra/db/DeletionPurger.java b/src/java/org/apache/cassandra/db/DeletionPurger.java
new file mode 100644
index 0000000..d368b69
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/DeletionPurger.java

@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+public interface DeletionPurger
+{
+    public static final DeletionPurger PURGE_ALL = (ts, ldt) -> true;
+
+    public boolean shouldPurge(long timestamp, int localDeletionTime);
+
+    public default boolean shouldPurge(DeletionTime dt)
+    {
+        return !dt.isLive() && shouldPurge(dt.markedForDeleteAt(), dt.localDeletionTime());
+    }
+
+    public default boolean shouldPurge(LivenessInfo liveness, int nowInSec)
+    {
+        return !liveness.isLive(nowInSec) && shouldPurge(liveness.timestamp(), liveness.localExpirationTime());
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/DeletionTime.java b/src/java/org/apache/cassandra/db/DeletionTime.java
index c10a15f..652689c 100644
--- a/src/java/org/apache/cassandra/db/DeletionTime.java
+++ b/src/java/org/apache/cassandra/db/DeletionTime.java

@@ -17,21 +17,21 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
+import java.security.MessageDigest;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Objects;
 
 import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ObjectSizes;
-import org.codehaus.jackson.annotate.JsonIgnore;
 
 /**
- * A top-level (row) tombstone.
+ * Information on deletion of a storage engine object.
  */
 public class DeletionTime implements Comparable<DeletionTime>, IMeasurableMemory
 {
@@ -42,36 +42,50 @@
      */
     public static final DeletionTime LIVE = new DeletionTime(Long.MIN_VALUE, Integer.MAX_VALUE);
 
-    /**
-     * A timestamp (typically in microseconds since the unix epoch, although this is not enforced) after which
-     * data should be considered deleted. If set to Long.MIN_VALUE, this implies that the data has not been marked
-     * for deletion at all.
-     */
-    public final long markedForDeleteAt;
-
-    /**
-     * The local server timestamp, in seconds since the unix epoch, at which this tombstone was created. This is
-     * only used for purposes of purging the tombstone after gc_grace_seconds have elapsed.
-     */
-    public final int localDeletionTime;
-
     public static final Serializer serializer = new Serializer();
 
-    @VisibleForTesting
+    private final long markedForDeleteAt;
+    private final int localDeletionTime;
+
     public DeletionTime(long markedForDeleteAt, int localDeletionTime)
     {
-        assert localDeletionTime >= 0 : localDeletionTime;
         this.markedForDeleteAt = markedForDeleteAt;
         this.localDeletionTime = localDeletionTime;
     }
 
     /**
+     * A timestamp (typically in microseconds since the unix epoch, although this is not enforced) after which
+     * data should be considered deleted. If set to Long.MIN_VALUE, this implies that the data has not been marked
+     * for deletion at all.
+     */
+    public long markedForDeleteAt()
+    {
+        return markedForDeleteAt;
+    }
+
+    /**
+     * The local server timestamp, in seconds since the unix epoch, at which this tombstone was created. This is
+     * only used for purposes of purging the tombstone after gc_grace_seconds have elapsed.
+     */
+    public int localDeletionTime()
+    {
+        return localDeletionTime;
+    }
+
+    /**
      * Returns whether this DeletionTime is live, that is deletes no columns.
      */
-    @JsonIgnore
     public boolean isLive()
     {
-        return markedForDeleteAt == Long.MIN_VALUE && localDeletionTime == Integer.MAX_VALUE;
+        return markedForDeleteAt() == Long.MIN_VALUE && localDeletionTime() == Integer.MAX_VALUE;
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        // localDeletionTime is basically a metadata of the deletion time that tells us when it's ok to purge it.
+        // It's thus intrinsically a local information and shouldn't be part of the digest (which exists for
+        // cross-nodes comparisons).
+        FBUtilities.updateWithLong(digest, markedForDeleteAt());
     }
 
     @Override
@@ -80,48 +94,58 @@
         if(!(o instanceof DeletionTime))
             return false;
         DeletionTime that = (DeletionTime)o;
-        return markedForDeleteAt == that.markedForDeleteAt && localDeletionTime == that.localDeletionTime;
+        return markedForDeleteAt() == that.markedForDeleteAt() && localDeletionTime() == that.localDeletionTime();
     }
 
     @Override
     public final int hashCode()
     {
-        return Objects.hashCode(markedForDeleteAt, localDeletionTime);
+        return Objects.hashCode(markedForDeleteAt(), localDeletionTime());
     }
 
     @Override
     public String toString()
     {
-        return String.format("deletedAt=%d, localDeletion=%d", markedForDeleteAt, localDeletionTime);
+        return String.format("deletedAt=%d, localDeletion=%d", markedForDeleteAt(), localDeletionTime());
     }
 
     public int compareTo(DeletionTime dt)
     {
-        if (markedForDeleteAt < dt.markedForDeleteAt)
+        if (markedForDeleteAt() < dt.markedForDeleteAt())
             return -1;
-        else if (markedForDeleteAt > dt.markedForDeleteAt)
+        else if (markedForDeleteAt() > dt.markedForDeleteAt())
             return 1;
-        else if (localDeletionTime < dt.localDeletionTime)
+        else if (localDeletionTime() < dt.localDeletionTime())
             return -1;
-        else if (localDeletionTime > dt.localDeletionTime)
+        else if (localDeletionTime() > dt.localDeletionTime())
             return 1;
         else
             return 0;
     }
 
-    public boolean isGcAble(int gcBefore)
-    {
-        return localDeletionTime < gcBefore;
-    }
-
-    public boolean isDeleted(OnDiskAtom atom)
-    {
-        return atom.timestamp() <= markedForDeleteAt;
-    }
-
     public boolean supersedes(DeletionTime dt)
     {
-        return this.markedForDeleteAt > dt.markedForDeleteAt;
+        return markedForDeleteAt() > dt.markedForDeleteAt() || (markedForDeleteAt() == dt.markedForDeleteAt() && localDeletionTime() > dt.localDeletionTime());
+    }
+
+    public boolean deletes(LivenessInfo info)
+    {
+        return deletes(info.timestamp());
+    }
+
+    public boolean deletes(Cell cell)
+    {
+        return deletes(cell.timestamp());
+    }
+
+    public boolean deletes(long timestamp)
+    {
+        return timestamp <= markedForDeleteAt();
+    }
+
+    public int dataSize()
+    {
+        return 12;
     }
 
     public long unsharedHeapSize()
@@ -133,11 +157,11 @@
     {
         public void serialize(DeletionTime delTime, DataOutputPlus out) throws IOException
         {
-            out.writeInt(delTime.localDeletionTime);
-            out.writeLong(delTime.markedForDeleteAt);
+            out.writeInt(delTime.localDeletionTime());
+            out.writeLong(delTime.markedForDeleteAt());
         }
 
-        public DeletionTime deserialize(DataInput in) throws IOException
+        public DeletionTime deserialize(DataInputPlus in) throws IOException
         {
             int ldt = in.readInt();
             long mfda = in.readLong();
@@ -146,15 +170,15 @@
                  : new DeletionTime(mfda, ldt);
         }
 
-        public void skip(DataInput in) throws IOException
+        public void skip(DataInputPlus in) throws IOException
         {
-            FileUtils.skipBytesFully(in, 4 + 8);
+            in.skipBytesFully(4 + 8);
         }
 
-        public long serializedSize(DeletionTime delTime, TypeSizes typeSizes)
+        public long serializedSize(DeletionTime delTime)
         {
-            return typeSizes.sizeof(delTime.localDeletionTime)
-                 + typeSizes.sizeof(delTime.markedForDeleteAt);
+            return TypeSizes.sizeof(delTime.localDeletionTime())
+                 + TypeSizes.sizeof(delTime.markedForDeleteAt());
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java
index 5e02cd7..0f3b2b6 100644
--- a/src/java/org/apache/cassandra/db/Directories.java
+++ b/src/java/org/apache/cassandra/db/Directories.java

@@ -26,11 +26,14 @@
 import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.nio.file.SimpleFileVisitor;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.BiFunction;
+import java.util.function.Consumer;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Predicate;
@@ -44,6 +47,8 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.FSDiskFullWriteError;
 import org.apache.cassandra.io.FSError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.FileUtils;
@@ -91,9 +96,11 @@
 
     public static final String BACKUPS_SUBDIR = "backups";
     public static final String SNAPSHOT_SUBDIR = "snapshots";
+    public static final String TMP_SUBDIR = "tmp";
     public static final String SECONDARY_INDEX_NAME_SEPARATOR = ".";
 
     public static final DataDirectory[] dataDirectories;
+
     static
     {
         String[] locations = DatabaseDescriptor.getAllDataFileLocations();
@@ -140,7 +147,7 @@
     {
         X, W, XW, R, XR, RW, XRW;
 
-        private FileAction()
+        FileAction()
         {
         }
 
@@ -176,30 +183,36 @@
     }
 
     private final CFMetaData metadata;
+    private final DataDirectory[] paths;
     private final File[] dataPaths;
 
+    public Directories(final CFMetaData metadata)
+    {
+        this(metadata, dataDirectories);
+    }
     /**
      * Create Directories of given ColumnFamily.
      * SSTable directories are created under data_directories defined in cassandra.yaml if not exist at this time.
      *
      * @param metadata metadata of ColumnFamily
      */
-    public Directories(final CFMetaData metadata)
+    public Directories(final CFMetaData metadata, DataDirectory[] paths)
     {
         this.metadata = metadata;
+        this.paths = paths;
 
         String cfId = ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(metadata.cfId));
         int idx = metadata.cfName.indexOf(SECONDARY_INDEX_NAME_SEPARATOR);
         String cfName = idx >= 0 ? metadata.cfName.substring(0, idx) : metadata.cfName;
         String indexNameWithDot = idx >= 0 ? metadata.cfName.substring(idx) : null;
 
-        this.dataPaths = new File[dataDirectories.length];
+        this.dataPaths = new File[paths.length];
         // If upgraded from version less than 2.1, use existing directories
         String oldSSTableRelativePath = join(metadata.ksName, cfName);
-        for (int i = 0; i < dataDirectories.length; ++i)
+        for (int i = 0; i < paths.length; ++i)
         {
             // check if old SSTable directory exists
-            dataPaths[i] = new File(dataDirectories[i].location, oldSSTableRelativePath);
+            dataPaths[i] = new File(paths[i].location, oldSSTableRelativePath);
         }
         boolean olderDirectoryExists = Iterables.any(Arrays.asList(dataPaths), new Predicate<File>()
         {
@@ -212,13 +225,13 @@
         {
             // use 2.1+ style
             String newSSTableRelativePath = join(metadata.ksName, cfName + '-' + cfId);
-            for (int i = 0; i < dataDirectories.length; ++i)
-                dataPaths[i] = new File(dataDirectories[i].location, newSSTableRelativePath);
+            for (int i = 0; i < paths.length; ++i)
+                dataPaths[i] = new File(paths[i].location, newSSTableRelativePath);
         }
         // if index, then move to its own directory
         if (indexNameWithDot != null)
         {
-            for (int i = 0; i < dataDirectories.length; ++i)
+            for (int i = 0; i < paths.length; ++i)
                 dataPaths[i] = new File(dataPaths[i], indexNameWithDot);
         }
 
@@ -275,8 +288,13 @@
     {
         if (dataDirectory != null)
             for (File dir : dataPaths)
-                if (dir.getAbsolutePath().startsWith(dataDirectory.location.getAbsolutePath()))
+            {
+                // Note that we must compare absolute paths (not canonical) here since keyspace directories might be symlinks
+                Path dirPath = Paths.get(dir.getAbsolutePath());
+                Path locationPath = Paths.get(dataDirectory.location.getAbsolutePath());
+                if (dirPath.startsWith(locationPath))
                     return dir;
+            }
         return null;
     }
 
@@ -313,6 +331,34 @@
     }
 
     /**
+     * Returns a temporary subdirectory on allowed data directory
+     * that _currently_ has {@code writeSize} bytes as usable space.
+     * This method does not create the temporary directory.
+     *
+     * @throws IOError if all directories are disallowed.
+     */
+    public File getTemporaryWriteableDirectoryAsFile(long writeSize)
+    {
+        File location = getLocationForDisk(getWriteableLocation(writeSize));
+        if (location == null)
+            return null;
+        return new File(location, TMP_SUBDIR);
+    }
+
+    public void removeTemporaryDirectories()
+    {
+        for (File dataDir : dataPaths)
+        {
+            File tmpDir = new File(dataDir, TMP_SUBDIR);
+            if (tmpDir.exists())
+            {
+                logger.debug("Removing temporary directory {}", tmpDir);
+                FileUtils.deleteRecursive(tmpDir);
+            }
+        }
+    }
+
+    /**
      * Returns an allowed data directory that _currently_ has {@code writeSize} bytes as usable space.
      *
      * @throws IOError if all directories are disallowed.
@@ -325,7 +371,7 @@
 
         // pick directories with enough space and so that resulting sstable dirs aren't disallowed for writes.
         boolean tooBig = false;
-        for (DataDirectory dataDir : dataDirectories)
+        for (DataDirectory dataDir : paths)
         {
             if (DisallowedDirectories.isUnwritable(getLocationForDisk(dataDir)))
             {
@@ -346,7 +392,7 @@
 
         if (candidates.isEmpty())
             if (tooBig)
-                throw new RuntimeException("Insufficient disk space to write " + writeSize + " bytes");
+                throw new FSDiskFullWriteError(new IOException("Insufficient disk space to write " + writeSize + " bytes"), "");
             else
                 throw new FSWriteError(new IOException("All configured data directories have been disallowed as unwritable for erroring out"), "");
 
@@ -391,7 +437,7 @@
         long writeSize = expectedTotalWriteSize / estimatedSSTables;
         long totalAvailable = 0L;
 
-        for (DataDirectory dataDir : dataDirectories)
+        for (DataDirectory dataDir : paths)
         {
             if (DisallowedDirectories.isUnwritable(getLocationForDisk(dataDir)))
                   continue;
@@ -438,6 +484,12 @@
         return new File(snapshotDir, "manifest.json");
     }
 
+    public File getSnapshotSchemaFile(String snapshotName)
+    {
+        File snapshotDir = getSnapshotDirectory(getDirectoryForNewSSTables(), snapshotName);
+        return new File(snapshotDir, "schema.cql");
+    }
+
     public File getNewEphemeralSnapshotMarkerFile(String snapshotName)
     {
         File snapshotDir = new File(getWriteableLocationAsFile(1L), join(SNAPSHOT_SUBDIR, snapshotName));
@@ -466,11 +518,6 @@
         }
     }
 
-    public SSTableLister sstableLister()
-    {
-        return new SSTableLister();
-    }
-
     public static class DataDirectory
     {
         public final File location;
@@ -482,7 +529,25 @@
 
         public long getAvailableSpace()
         {
-            return FileUtils.getUsableSpace(location);
+            long availableSpace = FileUtils.getUsableSpace(location) - DatabaseDescriptor.getMinFreeSpacePerDriveInBytes();
+            return availableSpace > 0 ? availableSpace : 0;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            DataDirectory that = (DataDirectory) o;
+
+            return location.equals(that.location);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return location.hashCode();
         }
     }
 
@@ -518,8 +583,41 @@
         }
     }
 
+    /** The type of files that can be listed by SSTableLister, we never return txn logs,
+     * use LifecycleTransaction.getFiles() if you need txn logs. */
+    public enum FileType
+    {
+        /** A permanent sstable file that is safe to use. */
+        FINAL,
+
+        /** A temporary sstable file that will soon be deleted. */
+        TEMPORARY,
+
+        /** A transaction log file (contains information on final and temporary files). */
+        TXN_LOG;
+    }
+
+    /**
+     * How to handle a failure to read a txn log file. Note that we will try a few
+     * times before giving up.
+     **/
+    public enum OnTxnErr
+    {
+        /** Throw the exception */
+        THROW,
+
+        /** Ignore the problematic parts of the txn log file */
+        IGNORE
+    }
+
+    public SSTableLister sstableLister(OnTxnErr onTxnErr)
+    {
+        return new SSTableLister(onTxnErr);
+    }
+
     public class SSTableLister
     {
+        private final OnTxnErr onTxnErr;
         private boolean skipTemporary;
         private boolean includeBackups;
         private boolean onlyBackups;
@@ -528,6 +626,11 @@
         private boolean filtered;
         private String snapshotName;
 
+        private SSTableLister(OnTxnErr onTxnErr)
+        {
+            this.onTxnErr = onTxnErr;
+        }
+
         public SSTableLister skipTemporary(boolean b)
         {
             if (filtered)
@@ -593,49 +696,72 @@
 
                 if (snapshotName != null)
                 {
-                    getSnapshotDirectory(location, snapshotName).listFiles(getFilter());
+                    LifecycleTransaction.getFiles(getSnapshotDirectory(location, snapshotName).toPath(), getFilter(), onTxnErr);
                     continue;
                 }
 
                 if (!onlyBackups)
-                    location.listFiles(getFilter());
+                    LifecycleTransaction.getFiles(location.toPath(), getFilter(), onTxnErr);
 
                 if (includeBackups)
-                    getBackupsDirectory(location).listFiles(getFilter());
+                    LifecycleTransaction.getFiles(getBackupsDirectory(location).toPath(), getFilter(), onTxnErr);
             }
+
             filtered = true;
         }
 
-        private FileFilter getFilter()
+        private BiFunction<File, FileType, Boolean> getFilter()
         {
-            return new FileFilter()
+            // This function always return false since it adds to the components map
+            return (file, type) ->
             {
-                // This function always return false since accepts adds to the components map
-                public boolean accept(File file)
+                switch (type)
                 {
-                    if (file.isDirectory())
+                    case TXN_LOG:
+                        return false;
+                    case TEMPORARY:
+                        if (skipTemporary)
+                            return false;
+
+                    case FINAL:
+                        Pair<Descriptor, Component> pair = SSTable.tryComponentFromFilename(file.getParentFile(), file.getName());
+                        if (pair == null)
+                            return false;
+
+                        // we are only interested in the SSTable files that belong to the specific ColumnFamily
+                        if (!pair.left.ksname.equals(metadata.ksName) || !pair.left.cfname.equals(metadata.cfName))
+                            return false;
+
+                        Set<Component> previous = components.get(pair.left);
+                        if (previous == null)
+                        {
+                            previous = new HashSet<>();
+                            components.put(pair.left, previous);
+                        }
+                        else if (pair.right.type == Component.Type.DIGEST)
+                        {
+                            if (pair.right != pair.left.digestComponent)
+                            {
+                                // Need to update the DIGEST component as it might be set to another
+                                // digest type as a guess. This may happen if the first component is
+                                // not the DIGEST (but the Data component for example), so the digest
+                                // type is _guessed_ from the Version.
+                                // Although the Version explicitly defines the digest type, it doesn't
+                                // seem to be true under all circumstances. Generated sstables from a
+                                // post 2.1.8 snapshot produced Digest.sha1 files although Version
+                                // defines Adler32.
+                                // TL;DR this piece of code updates the digest component to be "correct".
+                                components.remove(pair.left);
+                                Descriptor updated = pair.left.withDigestComponent(pair.right);
+                                components.put(updated, previous);
+                            }
+                        }
+                        previous.add(pair.right);
+                        nbFiles++;
                         return false;
 
-                    Pair<Descriptor, Component> pair = SSTable.tryComponentFromFilename(file.getParentFile(), file.getName());
-                    if (pair == null)
-                        return false;
-
-                    // we are only interested in the SSTable files that belong to the specific ColumnFamily
-                    if (!pair.left.ksname.equals(metadata.ksName) || !pair.left.cfname.equals(metadata.cfName))
-                        return false;
-
-                    if (skipTemporary && pair.left.type.isTemporary)
-                        return false;
-
-                    Set<Component> previous = components.get(pair.left);
-                    if (previous == null)
-                    {
-                        previous = new HashSet<>();
-                        components.put(pair.left, previous);
-                    }
-                    previous.add(pair.right);
-                    nbFiles++;
-                    return false;
+                    default:
+                        throw new AssertionError();
                 }
             };
         }
@@ -790,11 +916,17 @@
         return visitor.getAllocatedSize();
     }
 
-    // Recursively finds all the sub directories in the KS directory.
     public static List<File> getKSChildDirectories(String ksName)
     {
+        return getKSChildDirectories(ksName, dataDirectories);
+
+    }
+
+    // Recursively finds all the sub directories in the KS directory.
+    public static List<File> getKSChildDirectories(String ksName, DataDirectory[] directories)
+    {
         List<File> result = new ArrayList<>();
-        for (DataDirectory dataDirectory : dataDirectories)
+        for (DataDirectory dataDirectory : directories)
         {
             File ksDir = new File(dataDirectory.location, ksName);
             File[] cfDirs = ksDir.listFiles();
@@ -870,7 +1002,7 @@
         {
             super();
             Builder<String> builder = ImmutableSet.builder();
-            for (File file : sstableLister().listFiles())
+            for (File file : sstableLister(Directories.OnTxnErr.THROW).listFiles())
                 builder.add(file.getName());
             alive = builder.build();
         }

diff --git a/src/java/org/apache/cassandra/db/DisallowedDirectories.java b/src/java/org/apache/cassandra/db/DisallowedDirectories.java
index c0518e2..75b5e79 100644
--- a/src/java/org/apache/cassandra/db/DisallowedDirectories.java
+++ b/src/java/org/apache/cassandra/db/DisallowedDirectories.java

@@ -25,6 +25,8 @@
 import java.util.Set;
 import java.util.concurrent.CopyOnWriteArraySet;
 
+import com.google.common.annotations.VisibleForTesting;
+
 import org.apache.cassandra.utils.MBeanWrapper;
 
 public class DisallowedDirectories implements DisallowedDirectoriesMBean
@@ -89,6 +91,17 @@
     }
 
     /**
+     * Testing only!
+     * Clear the set of unwritable directories.
+     */
+    @VisibleForTesting
+    public static void clearUnwritableUnsafe()
+    {
+        instance.unwritableDirectories.clear();
+    }
+
+
+    /**
      * Tells whether or not the directory is disallowed for reads.
      * @return whether or not the directory is disallowed for reads.
      */

diff --git a/src/java/org/apache/cassandra/db/EmptyIterators.java b/src/java/org/apache/cassandra/db/EmptyIterators.java
new file mode 100644
index 0000000..6bf8fff
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/EmptyIterators.java

@@ -0,0 +1,214 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.util.NoSuchElementException;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.partitions.BasePartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+
+public class EmptyIterators
+{
+
+    private static class EmptyBasePartitionIterator<R extends BaseRowIterator<?>> implements BasePartitionIterator<R>
+    {
+        EmptyBasePartitionIterator()
+        {
+        }
+
+        public void close()
+        {
+        }
+
+        public boolean hasNext()
+        {
+            return false;
+        }
+
+        public R next()
+        {
+            throw new NoSuchElementException();
+        }
+    }
+
+    private static class EmptyUnfilteredPartitionIterator extends EmptyBasePartitionIterator<UnfilteredRowIterator> implements UnfilteredPartitionIterator
+    {
+        final CFMetaData metadata;
+        final boolean isForThrift;
+
+        public EmptyUnfilteredPartitionIterator(CFMetaData metadata, boolean isForThrift)
+        {
+            this.metadata = metadata;
+            this.isForThrift = isForThrift;
+        }
+
+        public boolean isForThrift()
+        {
+            return isForThrift;
+        }
+
+        public CFMetaData metadata()
+        {
+            return metadata;
+        }
+    }
+
+    private static class EmptyPartitionIterator extends EmptyBasePartitionIterator<RowIterator> implements PartitionIterator
+    {
+        public static final EmptyPartitionIterator instance = new EmptyPartitionIterator();
+        private EmptyPartitionIterator()
+        {
+            super();
+        }
+    }
+
+    private static class EmptyBaseRowIterator<U extends Unfiltered> implements BaseRowIterator<U>
+    {
+        final PartitionColumns columns;
+        final CFMetaData metadata;
+        final DecoratedKey partitionKey;
+        final boolean isReverseOrder;
+        final Row staticRow;
+
+        EmptyBaseRowIterator(PartitionColumns columns, CFMetaData metadata, DecoratedKey partitionKey, boolean isReverseOrder, Row staticRow)
+        {
+            this.columns = columns;
+            this.metadata = metadata;
+            this.partitionKey = partitionKey;
+            this.isReverseOrder = isReverseOrder;
+            this.staticRow = staticRow;
+        }
+
+        public CFMetaData metadata()
+        {
+            return metadata;
+        }
+
+        public boolean isReverseOrder()
+        {
+            return isReverseOrder;
+        }
+
+        public PartitionColumns columns()
+        {
+            return columns;
+        }
+
+        public DecoratedKey partitionKey()
+        {
+            return partitionKey;
+        }
+
+        public Row staticRow()
+        {
+            return staticRow;
+        }
+
+        public void close()
+        {
+        }
+
+        public boolean isEmpty()
+        {
+            return staticRow == Rows.EMPTY_STATIC_ROW;
+        }
+
+        public boolean hasNext()
+        {
+            return false;
+        }
+
+        public U next()
+        {
+            throw new NoSuchElementException();
+        }
+    }
+
+    private static class EmptyUnfilteredRowIterator extends EmptyBaseRowIterator<Unfiltered> implements UnfilteredRowIterator
+    {
+        final DeletionTime partitionLevelDeletion;
+        public EmptyUnfilteredRowIterator(PartitionColumns columns, CFMetaData metadata, DecoratedKey partitionKey,
+                                          boolean isReverseOrder, Row staticRow, DeletionTime partitionLevelDeletion)
+        {
+            super(columns, metadata, partitionKey, isReverseOrder, staticRow);
+            this.partitionLevelDeletion = partitionLevelDeletion;
+        }
+
+        public boolean isEmpty()
+        {
+            return partitionLevelDeletion == DeletionTime.LIVE && super.isEmpty();
+        }
+
+        public DeletionTime partitionLevelDeletion()
+        {
+            return partitionLevelDeletion;
+        }
+
+        public EncodingStats stats()
+        {
+            return EncodingStats.NO_STATS;
+        }
+    }
+
+    private static class EmptyRowIterator extends EmptyBaseRowIterator<Row> implements RowIterator
+    {
+        public EmptyRowIterator(CFMetaData metadata, DecoratedKey partitionKey, boolean isReverseOrder, Row staticRow)
+        {
+            super(PartitionColumns.NONE, metadata, partitionKey, isReverseOrder, staticRow);
+        }
+    }
+
+    public static UnfilteredPartitionIterator unfilteredPartition(CFMetaData metadata, boolean isForThrift)
+    {
+        return new EmptyUnfilteredPartitionIterator(metadata, isForThrift);
+    }
+
+    public static PartitionIterator partition()
+    {
+        return EmptyPartitionIterator.instance;
+    }
+
+    // this method is the only one that can return a non-empty iterator, but it still has no rows, so it seems cleanest to keep it here
+    public static UnfilteredRowIterator unfilteredRow(CFMetaData metadata, DecoratedKey partitionKey, boolean isReverseOrder, Row staticRow, DeletionTime partitionDeletion)
+    {
+        PartitionColumns columns = PartitionColumns.NONE;
+        if (!staticRow.isEmpty())
+            columns = new PartitionColumns(Columns.from(staticRow.columns()), Columns.NONE);
+        else
+            staticRow = Rows.EMPTY_STATIC_ROW;
+
+        if (partitionDeletion.isLive())
+            partitionDeletion = DeletionTime.LIVE;
+
+        return new EmptyUnfilteredRowIterator(columns, metadata, partitionKey, isReverseOrder, staticRow, partitionDeletion);
+    }
+
+    public static UnfilteredRowIterator unfilteredRow(CFMetaData metadata, DecoratedKey partitionKey, boolean isReverseOrder)
+    {
+        return new EmptyUnfilteredRowIterator(PartitionColumns.NONE, metadata, partitionKey, isReverseOrder, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE);
+    }
+
+    public static RowIterator row(CFMetaData metadata, DecoratedKey partitionKey, boolean isReverseOrder)
+    {
+        return new EmptyRowIterator(metadata, partitionKey, isReverseOrder, Rows.EMPTY_STATIC_ROW);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java b/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java
new file mode 100644
index 0000000..852dcb1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ExpirationDateOverflowHandling.java

@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.slf4j.helpers.MessageFormatter;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.utils.NoSpamLogger;
+
+public class ExpirationDateOverflowHandling
+{
+    private static final Logger logger = LoggerFactory.getLogger(Attributes.class);
+
+    private static final int EXPIRATION_OVERFLOW_WARNING_INTERVAL_MINUTES = Integer.getInteger("cassandra.expiration_overflow_warning_interval_minutes", 5);
+
+    public enum ExpirationDateOverflowPolicy
+    {
+        REJECT, CAP_NOWARN, CAP
+    }
+
+    @VisibleForTesting
+    public static ExpirationDateOverflowPolicy policy;
+
+    static {
+        String policyAsString = System.getProperty("cassandra.expiration_date_overflow_policy", ExpirationDateOverflowPolicy.REJECT.name());
+        try
+        {
+            policy = ExpirationDateOverflowPolicy.valueOf(policyAsString.toUpperCase());
+        }
+        catch (RuntimeException e)
+        {
+            logger.warn("Invalid expiration date overflow policy: {}. Using default: {}", policyAsString, ExpirationDateOverflowPolicy.REJECT.name());
+            policy = ExpirationDateOverflowPolicy.REJECT;
+        }
+    }
+
+    public static final String MAXIMUM_EXPIRATION_DATE_EXCEEDED_WARNING = "Request on table {}.{} with {}ttl of {} seconds exceeds maximum supported expiration " +
+                                                                          "date of 2038-01-19T03:14:06+00:00 and will have its expiration capped to that date. " +
+                                                                          "In order to avoid this use a lower TTL or upgrade to a version where this limitation " +
+                                                                          "is fixed. See CASSANDRA-14092 for more details.";
+
+    public static final String MAXIMUM_EXPIRATION_DATE_EXCEEDED_REJECT_MESSAGE = "Request on table %s.%s with %sttl of %d seconds exceeds maximum supported expiration " +
+                                                                                 "date of 2038-01-19T03:14:06+00:00. In order to avoid this use a lower TTL, change " +
+                                                                                 "the expiration date overflow policy or upgrade to a version where this limitation " +
+                                                                                 "is fixed. See CASSANDRA-14092 for more details.";
+
+    public static void maybeApplyExpirationDateOverflowPolicy(CFMetaData metadata, int ttl, boolean isDefaultTTL) throws InvalidRequestException
+    {
+        if (ttl == BufferCell.NO_TTL)
+            return;
+
+        // Check for localExpirationTime overflow (CASSANDRA-14092)
+        int nowInSecs = (int)(System.currentTimeMillis() / 1000);
+        if (ttl + nowInSecs < 0)
+        {
+            switch (policy)
+            {
+                case CAP:
+                    ClientWarn.instance.warn(MessageFormatter.arrayFormat(MAXIMUM_EXPIRATION_DATE_EXCEEDED_WARNING, new Object[] { metadata.ksName,
+                                                                                                                                   metadata.cfName,
+                                                                                                                                   isDefaultTTL? "default " : "", ttl })
+                                                             .getMessage());
+                case CAP_NOWARN:
+                    /**
+                     * Capping at this stage is basically not rejecting the request. The actual capping is done
+                     * by {@link #computeLocalExpirationTime(int, int)}, which converts the negative TTL
+                     * to {@link org.apache.cassandra.db.BufferExpiringCell#MAX_DELETION_TIME}
+                     */
+                    NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, EXPIRATION_OVERFLOW_WARNING_INTERVAL_MINUTES, TimeUnit.MINUTES, MAXIMUM_EXPIRATION_DATE_EXCEEDED_WARNING,
+                                     metadata.ksName, metadata.cfName, isDefaultTTL? "default " : "", ttl);
+                    return;
+
+                default:
+                    throw new InvalidRequestException(String.format(MAXIMUM_EXPIRATION_DATE_EXCEEDED_REJECT_MESSAGE, metadata.ksName, metadata.cfName,
+                                                                    isDefaultTTL? "default " : "", ttl));
+            }
+        }
+    }
+
+    /**
+     * This method computes the {@link Cell#localDeletionTime()}, maybe capping to the maximum representable value
+     * which is {@link Cell#MAX_DELETION_TIME}.
+     *
+     * Please note that the {@link ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy} is applied
+     * during {@link ExpirationDateOverflowHandling#maybeApplyExpirationDateOverflowPolicy(CFMetaData, int, boolean)},
+     * so if the request was not denied it means its expiration date should be capped.
+     *
+     * See CASSANDRA-14092
+     */
+    public static int computeLocalExpirationTime(int nowInSec, int timeToLive)
+    {
+        int localExpirationTime = nowInSec + timeToLive;
+        return localExpirationTime >= 0? localExpirationTime : Cell.MAX_DELETION_TIME;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ExpiringCell.java b/src/java/org/apache/cassandra/db/ExpiringCell.java
deleted file mode 100644
index 5fc0f94..0000000
--- a/src/java/org/apache/cassandra/db/ExpiringCell.java
+++ /dev/null

@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-
-/**
- * Alternative to Cell that have an expiring time.
- * ExpiringCell is immutable (as Cell is).
- *
- * Note that ExpiringCell does not override Cell.getMarkedForDeleteAt,
- * which means that it's in the somewhat unintuitive position of being deleted (after its expiration)
- * without having a time-at-which-it-became-deleted.  (Because ttl is a server-side measurement,
- * we can't mix it with the timestamp field, which is client-supplied and whose resolution we
- * can't assume anything about.)
- */
-public interface ExpiringCell extends Cell
-{
-    public static final int MAX_TTL = 20 * 365 * 24 * 60 * 60; // 20 years in seconds
-
-    public int getTimeToLive();
-
-    ExpiringCell localCopy(CFMetaData metadata, AbstractAllocator allocator);
-
-    ExpiringCell localCopy(CFMetaData metaData, MemtableAllocator allocator, OpOrder.Group opGroup);
-}

diff --git a/src/java/org/apache/cassandra/db/HintedHandOffManager.java b/src/java/org/apache/cassandra/db/HintedHandOffManager.java
index 7a570d2..e26f658 100644
--- a/src/java/org/apache/cassandra/db/HintedHandOffManager.java
+++ b/src/java/org/apache/cassandra/db/HintedHandOffManager.java

@@ -17,615 +17,60 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.*;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ImmutableSortedSet;
-import com.google.common.collect.Lists;
-import com.google.common.util.concurrent.RateLimiter;
-import com.google.common.util.concurrent.Uninterruptibles;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
-import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
-import org.apache.cassandra.concurrent.NamedThreadFactory;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.WriteFailureException;
-import org.apache.cassandra.exceptions.WriteTimeoutException;
-import org.apache.cassandra.gms.ApplicationState;
-import org.apache.cassandra.gms.FailureDetector;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.metrics.HintedHandoffMetrics;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageProxy;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.service.WriteResponseHandler;
-import org.apache.cassandra.utils.*;
-import org.cliffc.high_scale_lib.NonBlockingHashSet;
 import java.util.List;
 
+import org.apache.cassandra.hints.HintsService;
 import org.apache.cassandra.utils.MBeanWrapper;
 
-import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
-import static org.apache.cassandra.utils.ExecutorUtils.shutdown;
-
 /**
- * The hint schema looks like this:
+ * A proxy class that implement the deprecated legacy HintedHandoffManagerMBean interface.
  *
- * CREATE TABLE hints (
- *   target_id uuid,
- *   hint_id timeuuid,
- *   message_version int,
- *   mutation blob,
- *   PRIMARY KEY (target_id, hint_id, message_version)
- * ) WITH COMPACT STORAGE;
- *
- * Thus, for each node in the cluster we treat its uuid as the partition key; each hint is a logical row
- * (physical composite column) containing the mutation to replay and associated metadata.
- *
- * When FailureDetector signals that a node that was down is back up, we page through
- * the hinted mutations and send them over one at a time, waiting for
- * hinted_handoff_throttle_delay in between each.
- *
- * deliverHints is also exposed to JMX so it can be run manually if FD ever misses
- * its cue somehow.
+ * TODO: remove in 4.0.
  */
-
-public class HintedHandOffManager implements HintedHandOffManagerMBean
+@SuppressWarnings("deprecation")
+@Deprecated
+public final class HintedHandOffManager implements HintedHandOffManagerMBean
 {
-    public static final String MBEAN_NAME = "org.apache.cassandra.db:type=HintedHandoffManager";
     public static final HintedHandOffManager instance = new HintedHandOffManager();
 
-    private static final Logger logger = LoggerFactory.getLogger(HintedHandOffManager.class);
-    private static final int PAGE_SIZE = 128;
-    private static final int LARGE_NUMBER = 65536; // 64k nodes ought to be enough for anybody.
+    public static final String MBEAN_NAME = "org.apache.cassandra.db:type=HintedHandoffManager";
 
-    public final HintedHandoffMetrics metrics = new HintedHandoffMetrics();
-
-    private volatile boolean hintedHandOffPaused = false;
-
-    static final int maxHintTTL = Integer.parseInt(System.getProperty("cassandra.maxHintTTL", String.valueOf(Integer.MAX_VALUE)));
-
-    private final NonBlockingHashSet<InetAddress> queuedDeliveries = new NonBlockingHashSet<>();
-
-    // To keep metrics consistent with earlier versions, where periodic tasks were run on a shared executor,
-    // we run them on this executor and so keep counts separate from those for hint delivery tasks. See CASSANDRA-9129
-    private final DebuggableScheduledThreadPoolExecutor executor =
-        new DebuggableScheduledThreadPoolExecutor(1, new NamedThreadFactory("HintedHandoffManager", Thread.MIN_PRIORITY));
-
-    // Non-scheduled executor to run the actual hint delivery tasks.
-    // Per CASSANDRA-9129, this is where the values displayed in nodetool tpstats
-    // and via the HintedHandoff mbean are obtained.
-    private final ThreadPoolExecutor hintDeliveryExecutor =
-        new JMXEnabledThreadPoolExecutor(
-            DatabaseDescriptor.getMaxHintsThread(),
-            Integer.MAX_VALUE,
-            TimeUnit.SECONDS,
-            new LinkedBlockingQueue<Runnable>(),
-            new NamedThreadFactory("HintedHandoff", Thread.MIN_PRIORITY),
-            "internal");
-
-    private final ColumnFamilyStore hintStore = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.HINTS);
-
-    /**
-     * Returns a mutation representing a Hint to be sent to <code>targetId</code>
-     * as soon as it becomes available again.
-     */
-    public Mutation hintFor(Mutation mutation, long now, int ttl, Pair<InetAddress, UUID> target)
+    private HintedHandOffManager()
     {
-        assert ttl > 0;
-
-        InetAddress endpoint = target.left;
-        UUID targetId = target.right;
-
-        metrics.incrCreatedHints(endpoint);
-
-        UUID hintId = UUIDGen.getTimeUUID();
-        // serialize the hint with id and version as a composite column name
-        CellName name = SystemKeyspace.Hints.comparator.makeCellName(hintId, MessagingService.current_version);
-        ByteBuffer value = ByteBuffer.wrap(FBUtilities.serialize(mutation, Mutation.serializer, MessagingService.current_version));
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(Schema.instance.getCFMetaData(SystemKeyspace.NAME, SystemKeyspace.HINTS));
-        cf.addColumn(name, value, now, ttl);
-        return new Mutation(SystemKeyspace.NAME, UUIDType.instance.decompose(targetId), cf);
     }
 
-    /*
-     * determine the TTL for the hint Mutation
-     * this is set at the smallest GCGraceSeconds for any of the CFs in the RM
-     * this ensures that deletes aren't "undone" by delivery of an old hint
-     */
-    public static int calculateHintTTL(Mutation mutation)
-    {
-        int ttl = maxHintTTL;
-        for (ColumnFamily cf : mutation.getColumnFamilies())
-            ttl = Math.min(ttl, cf.metadata().getGcGraceSeconds());
-        return ttl;
-    }
-
-
-    public void start()
+    public void registerMBean()
     {
         MBeanWrapper.instance.registerMBean(this, MBEAN_NAME);
-        logger.trace("Created HHOM instance, registered MBean.");
-
-        Runnable runnable = new Runnable()
-        {
-            public void run()
-            {
-                scheduleAllDeliveries();
-                metrics.log();
-            }
-        };
-        executor.scheduleWithFixedDelay(runnable, 10, 10, TimeUnit.MINUTES);
     }
 
-    private static void deleteHint(ByteBuffer tokenBytes, CellName columnName, long timestamp)
+    public void deleteHintsForEndpoint(String host)
     {
-        Mutation mutation = new Mutation(SystemKeyspace.NAME, tokenBytes);
-        mutation.delete(SystemKeyspace.HINTS, columnName, timestamp);
-        mutation.applyUnsafe(); // don't bother with commitlog since we're going to flush as soon as we're done with delivery
+        HintsService.instance.deleteAllHintsForEndpoint(host);
     }
 
-    public void deleteHintsForEndpoint(final String ipOrHostname)
+    public void truncateAllHints()
     {
-        try
-        {
-            InetAddress endpoint = InetAddress.getByName(ipOrHostname);
-            deleteHintsForEndpoint(endpoint);
-        }
-        catch (UnknownHostException e)
-        {
-            logger.warn("Unable to find {}, not a hostname or ipaddr of a node", ipOrHostname);
-            throw new RuntimeException(e);
-        }
+        HintsService.instance.deleteAllHints();
     }
 
-    public void deleteHintsForEndpoint(final InetAddress endpoint)
-    {
-        if (!StorageService.instance.getTokenMetadata().isMember(endpoint))
-            return;
-
-        UUID hostId = StorageService.instance.getTokenMetadata().getHostId(endpoint);
-        if (hostId == null)
-            return;
-
-        ByteBuffer hostIdBytes = ByteBuffer.wrap(UUIDGen.decompose(hostId));
-        final Mutation mutation = new Mutation(SystemKeyspace.NAME, hostIdBytes);
-        mutation.delete(SystemKeyspace.HINTS, System.currentTimeMillis());
-
-        // execute asynchronously to avoid blocking caller (which may be processing gossip)
-        Runnable runnable = new Runnable()
-        {
-            public void run()
-            {
-                try
-                {
-                    logger.info("Deleting any stored hints for {}", endpoint);
-                    mutation.apply();
-                    hintStore.forceBlockingFlush();
-                    compact();
-                }
-                catch (Exception e)
-                {
-                    JVMStabilityInspector.inspectThrowable(e);
-                    logger.warn("Could not delete hints for {}: {}", endpoint, e);
-                }
-            }
-        };
-        executor.submit(runnable);
-    }
-
-    //foobar
-    public void truncateAllHints() throws ExecutionException, InterruptedException
-    {
-        Runnable runnable = new Runnable()
-        {
-            public void run()
-            {
-                try
-                {
-                    logger.info("Truncating all stored hints.");
-                    Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.HINTS).truncateBlocking();
-                }
-                catch (Exception e)
-                {
-                    logger.warn("Could not truncate all hints.", e);
-                }
-            }
-        };
-        executor.submit(runnable).get();
-    }
-
-    @VisibleForTesting
-    protected synchronized void compact()
-    {
-        ArrayList<Descriptor> descriptors = new ArrayList<>();
-        for (SSTable sstable : hintStore.getTracker().getUncompacting())
-            descriptors.add(sstable.descriptor);
-
-        if (descriptors.isEmpty())
-            return;
-
-        try
-        {
-            CompactionManager.instance.submitUserDefined(hintStore, descriptors, (int) (System.currentTimeMillis() / 1000)).get();
-        }
-        catch (InterruptedException | ExecutionException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    private static boolean pagingFinished(ColumnFamily hintColumnFamily, Composite startColumn)
-    {
-        // done if no hints found or the start column (same as last column processed in previous iteration) is the only one
-        return hintColumnFamily == null
-               || (!startColumn.isEmpty() && hintColumnFamily.getSortedColumns().size() == 1 && hintColumnFamily.getColumn((CellName)startColumn) != null);
-    }
-
-    private int waitForSchemaAgreement(InetAddress endpoint) throws TimeoutException
-    {
-        Gossiper gossiper = Gossiper.instance;
-        int waited = 0;
-        // first, wait for schema to be gossiped.
-        while (gossiper.getEndpointStateForEndpoint(endpoint) != null && gossiper.getEndpointStateForEndpoint(endpoint).getApplicationState(ApplicationState.SCHEMA) == null)
-        {
-            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
-            waited += 1000;
-            if (waited > 2 * StorageService.RING_DELAY)
-                throw new TimeoutException("Didin't receive gossiped schema from " + endpoint + " in " + 2 * StorageService.RING_DELAY + "ms");
-        }
-        if (gossiper.getEndpointStateForEndpoint(endpoint) == null)
-            throw new TimeoutException("Node " + endpoint + " vanished while waiting for agreement");
-        waited = 0;
-        // then wait for the correct schema version.
-        // usually we use DD.getDefsVersion, which checks the local schema uuid as stored in the system keyspace.
-        // here we check the one in gossip instead; this serves as a canary to warn us if we introduce a bug that
-        // causes the two to diverge (see CASSANDRA-2946)
-        while (gossiper.getEndpointStateForEndpoint(endpoint) != null && !gossiper.getEndpointStateForEndpoint(endpoint).getApplicationState(ApplicationState.SCHEMA).value.equals(
-                gossiper.getEndpointStateForEndpoint(FBUtilities.getBroadcastAddress()).getApplicationState(ApplicationState.SCHEMA).value))
-        {
-            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
-            waited += 1000;
-            if (waited > 2 * StorageService.RING_DELAY)
-                throw new TimeoutException("Could not reach schema agreement with " + endpoint + " in " + 2 * StorageService.RING_DELAY + "ms");
-        }
-        if (gossiper.getEndpointStateForEndpoint(endpoint) == null)
-            throw new TimeoutException("Node " + endpoint + " vanished while waiting for agreement");
-        logger.trace("schema for {} matches local schema", endpoint);
-        return waited;
-    }
-
-    private void deliverHintsToEndpoint(InetAddress endpoint)
-    {
-        if (hintStore.isEmpty())
-            return; // nothing to do, don't confuse users by logging a no-op handoff
-
-        // check if hints delivery has been paused
-        if (hintedHandOffPaused)
-        {
-            logger.trace("Hints delivery process is paused, aborting");
-            return;
-        }
-
-        logger.trace("Checking remote({}) schema before delivering hints", endpoint);
-        try
-        {
-            waitForSchemaAgreement(endpoint);
-        }
-        catch (TimeoutException e)
-        {
-            return;
-        }
-
-        if (!FailureDetector.instance.isAlive(endpoint))
-        {
-            logger.trace("Endpoint {} died before hint delivery, aborting", endpoint);
-            return;
-        }
-
-        doDeliverHintsToEndpoint(endpoint);
-    }
-
-    /*
-     * 1. Get the key of the endpoint we need to handoff
-     * 2. For each column, deserialize the mutation and send it to the endpoint
-     * 3. Delete the column if the write was successful
-     * 4. Force a flush
-     */
-    private void doDeliverHintsToEndpoint(InetAddress endpoint)
-    {
-        // find the hints for the node using its token.
-        UUID hostId = Gossiper.instance.getHostId(endpoint);
-        logger.info("Started hinted handoff for host: {} with IP: {}", hostId, endpoint);
-        final ByteBuffer hostIdBytes = ByteBuffer.wrap(UUIDGen.decompose(hostId));
-        DecoratedKey epkey =  StorageService.getPartitioner().decorateKey(hostIdBytes);
-
-        final AtomicInteger rowsReplayed = new AtomicInteger(0);
-        Composite startColumn = Composites.EMPTY;
-
-        int pageSize = calculatePageSize();
-        logger.trace("Using pageSize of {}", pageSize);
-
-        // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
-        // max rate is scaled by the number of nodes in the cluster (CASSANDRA-5272).
-        int throttleInKB = DatabaseDescriptor.getHintedHandoffThrottleInKB()
-                           / (StorageService.instance.getTokenMetadata().getAllEndpoints().size() - 1);
-        RateLimiter rateLimiter = RateLimiter.create(throttleInKB == 0 ? Double.MAX_VALUE : throttleInKB * 1024);
-
-        delivery:
-        while (true)
-        {
-            long now = System.currentTimeMillis();
-            QueryFilter filter = QueryFilter.getSliceFilter(epkey,
-                                                            SystemKeyspace.HINTS,
-                                                            startColumn,
-                                                            Composites.EMPTY,
-                                                            false,
-                                                            pageSize,
-                                                            now);
-
-            ColumnFamily hintsPage = ColumnFamilyStore.removeDeleted(hintStore.getColumnFamily(filter), (int) (now / 1000));
-
-            if (pagingFinished(hintsPage, startColumn))
-            {
-                logger.info("Finished hinted handoff of {} rows to endpoint {}", rowsReplayed, endpoint);
-                break;
-            }
-
-            // check if node is still alive and we should continue delivery process
-            if (!FailureDetector.instance.isAlive(endpoint))
-            {
-                logger.info("Endpoint {} died during hint delivery; aborting ({} delivered)", endpoint, rowsReplayed);
-                break;
-            }
-
-            List<WriteResponseHandler<Mutation>> responseHandlers = Lists.newArrayList();
-            for (final Cell hint : hintsPage)
-            {
-                // check if hints delivery has been paused during the process
-                if (hintedHandOffPaused)
-                {
-                    logger.trace("Hints delivery process is paused, aborting");
-                    break delivery;
-                }
-
-                // Skip tombstones:
-                // if we iterate quickly enough, it's possible that we could request a new page in the same millisecond
-                // in which the local deletion timestamp was generated on the last column in the old page, in which
-                // case the hint will have no columns (since it's deleted) but will still be included in the resultset
-                // since (even with gcgs=0) it's still a "relevant" tombstone.
-                if (!hint.isLive())
-                    continue;
-
-                startColumn = hint.name();
-
-                int version = Int32Type.instance.compose(hint.name().get(1));
-                DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(hint.value()));
-                Mutation mutation;
-                try
-                {
-                    mutation = Mutation.serializer.deserialize(in, version);
-                }
-                catch (UnknownColumnFamilyException e)
-                {
-                    logger.trace("Skipping delivery of hint for deleted table", e);
-                    deleteHint(hostIdBytes, hint.name(), hint.timestamp());
-                    continue;
-                }
-                catch (IOException e)
-                {
-                    throw new AssertionError(e);
-                }
-
-                for (UUID cfId : mutation.getColumnFamilyIds())
-                {
-                    if (hint.timestamp() <= SystemKeyspace.getTruncatedAt(cfId))
-                    {
-                        logger.trace("Skipping delivery of hint for truncated table {}", cfId);
-                        mutation = mutation.without(cfId);
-                    }
-                }
-
-                if (mutation.isEmpty())
-                {
-                    deleteHint(hostIdBytes, hint.name(), hint.timestamp());
-                    continue;
-                }
-
-                MessageOut<Mutation> message = mutation.createMessage();
-                rateLimiter.acquire(message.serializedSize(MessagingService.current_version));
-                Runnable callback = new Runnable()
-                {
-                    public void run()
-                    {
-                        rowsReplayed.incrementAndGet();
-                        deleteHint(hostIdBytes, hint.name(), hint.timestamp());
-                    }
-                };
-                WriteResponseHandler<Mutation> responseHandler = new WriteResponseHandler<>(endpoint, WriteType.SIMPLE, callback);
-                MessagingService.instance().sendRR(message, endpoint, responseHandler, false);
-                responseHandlers.add(responseHandler);
-            }
-
-            for (WriteResponseHandler<Mutation> handler : responseHandlers)
-            {
-                try
-                {
-                    handler.get();
-                }
-                catch (WriteTimeoutException|WriteFailureException e)
-                {
-                    logger.info("Failed replaying hints to {}; aborting ({} delivered), error : {}",
-                        endpoint, rowsReplayed, e.getMessage());
-                    break delivery;
-                }
-            }
-        }
-
-        // Flush all the tombstones to disk
-        hintStore.forceBlockingFlush();
-    }
-
-    // read less columns (mutations) per page if they are very large
-    private int calculatePageSize()
-    {
-        int meanColumnCount = hintStore.getMeanColumns();
-        if (meanColumnCount <= 0)
-            return PAGE_SIZE;
-
-        int averageColumnSize = (int) (hintStore.metric.meanRowSize.getValue() / meanColumnCount);
-        if (averageColumnSize <= 0)
-            return PAGE_SIZE;
-
-        // page size of 1 does not allow actual paging b/c of >= behavior on startColumn
-        return Math.max(2, Math.min(PAGE_SIZE, 4 * 1024 * 1024 / averageColumnSize));
-    }
-
-    /**
-     * Attempt delivery to any node for which we have hints.  Necessary since we can generate hints even for
-     * nodes which are never officially down/failed.
-     */
-    private void scheduleAllDeliveries()
-    {
-        logger.trace("Started scheduleAllDeliveries");
-
-        // Force a major compaction to get rid of the tombstones and expired hints. Do it once, before we schedule any
-        // individual replay, to avoid N - 1 redundant individual compactions (when N is the number of nodes with hints
-        // to deliver to).
-        compact();
-
-        IPartitioner p = StorageService.getPartitioner();
-        RowPosition minPos = p.getMinimumToken().minKeyBound();
-        Range<RowPosition> range = new Range<>(minPos, minPos);
-        IDiskAtomFilter filter = new NamesQueryFilter(ImmutableSortedSet.<CellName>of());
-        List<Row> rows = hintStore.getRangeSlice(range, null, filter, Integer.MAX_VALUE, System.currentTimeMillis());
-        for (Row row : rows)
-        {
-            UUID hostId = UUIDGen.getUUID(row.key.getKey());
-            InetAddress target = StorageService.instance.getTokenMetadata().getEndpointForHostId(hostId);
-            // token may have since been removed (in which case we have just read back a tombstone)
-            if (target != null)
-                scheduleHintDelivery(target, false);
-        }
-
-        logger.trace("Finished scheduleAllDeliveries");
-    }
-
-    /*
-     * This method is used to deliver hints to a particular endpoint.
-     * When we learn that some endpoint is back up we deliver the data
-     * to him via an event driven mechanism.
-    */
-    public void scheduleHintDelivery(final InetAddress to, final boolean precompact)
-    {
-        // We should not deliver hints to the same host in 2 different threads
-        if (!queuedDeliveries.add(to))
-            return;
-
-        logger.trace("Scheduling delivery of Hints to {}", to);
-
-        hintDeliveryExecutor.execute(new Runnable()
-        {
-            public void run()
-            {
-                try
-                {
-                    // If it's an individual node hint replay (triggered by Gossip or via JMX), and not the global scheduled replay
-                    // (every 10 minutes), force a major compaction to get rid of the tombstones and expired hints.
-                    if (precompact)
-                        compact();
-
-                    deliverHintsToEndpoint(to);
-                }
-                finally
-                {
-                    queuedDeliveries.remove(to);
-                }
-            }
-        });
-    }
-
-    public void scheduleHintDelivery(String to) throws UnknownHostException
-    {
-        scheduleHintDelivery(InetAddress.getByName(to), true);
-    }
-
-    public void pauseHintsDelivery(boolean b)
-    {
-        hintedHandOffPaused = b;
-    }
-
+    // TODO
     public List<String> listEndpointsPendingHints()
     {
-        Token.TokenFactory tokenFactory = StorageService.getPartitioner().getTokenFactory();
-
-        // Extract the keys as strings to be reported.
-        LinkedList<String> result = new LinkedList<>();
-        for (Row row : getHintsSlice(1))
-        {
-            if (row.cf != null) //ignore removed rows
-                result.addFirst(tokenFactory.toString(row.key.getToken()));
-        }
-        return result;
+        throw new UnsupportedOperationException();
     }
 
-    private List<Row> getHintsSlice(int columnCount)
+    // TODO
+    public void scheduleHintDelivery(String host)
     {
-        // Get count # of columns...
-        SliceQueryFilter predicate = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY,
-                                                          false,
-                                                          columnCount);
-
-        // From keys "" to ""...
-        IPartitioner partitioner = StorageService.getPartitioner();
-        RowPosition minPos = partitioner.getMinimumToken().minKeyBound();
-        Range<RowPosition> range = new Range<>(minPos, minPos);
-
-        try
-        {
-            RangeSliceCommand cmd = new RangeSliceCommand(SystemKeyspace.NAME,
-                                                          SystemKeyspace.HINTS,
-                                                          System.currentTimeMillis(),
-                                                          predicate,
-                                                          range,
-                                                          null,
-                                                          LARGE_NUMBER);
-            return StorageProxy.getRangeSlice(cmd, ConsistencyLevel.ONE);
-        }
-        catch (Exception e)
-        {
-            logger.info("HintsCF getEPPendingHints timed out.");
-            throw new RuntimeException(e);
-        }
+        throw new UnsupportedOperationException();
     }
 
-    @VisibleForTesting
-    public void shutdownAndWait(long timeout, TimeUnit units) throws InterruptedException, TimeoutException
+    public void pauseHintsDelivery(boolean doPause)
     {
-        shutdown(executor, hintDeliveryExecutor);
-        awaitTermination(timeout, units, executor, hintDeliveryExecutor);
+        if (doPause)
+            HintsService.instance.pauseDispatch();
+        else
+            HintsService.instance.resumeDispatch();
     }
 }

diff --git a/src/java/org/apache/cassandra/db/HintedHandOffManagerMBean.java b/src/java/org/apache/cassandra/db/HintedHandOffManagerMBean.java
index bbb2a14..9ba425e 100644
--- a/src/java/org/apache/cassandra/db/HintedHandOffManagerMBean.java
+++ b/src/java/org/apache/cassandra/db/HintedHandOffManagerMBean.java

@@ -21,6 +21,7 @@
 import java.util.List;
 import java.util.concurrent.ExecutionException;
 
+@Deprecated
 public interface HintedHandOffManagerMBean
 {
     /**

diff --git a/src/java/org/apache/cassandra/db/IMutation.java b/src/java/org/apache/cassandra/db/IMutation.java
index 44df104..aad35c3 100644
--- a/src/java/org/apache/cassandra/db/IMutation.java
+++ b/src/java/org/apache/cassandra/db/IMutation.java

@@ -21,13 +21,14 @@
 import java.util.Collection;
 import java.util.UUID;
 
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+
 public interface IMutation
 {
     public String getKeyspaceName();
     public Collection<UUID> getColumnFamilyIds();
-    public ByteBuffer key();
+    public DecoratedKey key();
     public long getTimeout();
     public String toString(boolean shallow);
-    public void addAll(IMutation m);
-    public Collection<ColumnFamily> getColumnFamilies();
+    public Collection<PartitionUpdate> getPartitionUpdates();
 }

diff --git a/src/java/org/apache/cassandra/db/IndexExpression.java b/src/java/org/apache/cassandra/db/IndexExpression.java
deleted file mode 100644
index bdb74ce..0000000
--- a/src/java/org/apache/cassandra/db/IndexExpression.java
+++ /dev/null

@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- *  with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import com.google.common.base.Objects;
-
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public final class IndexExpression
-{
-    public final ByteBuffer column;
-    public final Operator operator;
-    public final ByteBuffer value;
-
-    public IndexExpression(ByteBuffer column, Operator operator, ByteBuffer value)
-    {
-        this.column = column;
-        this.operator = operator;
-        this.value = value;
-    }
-
-    /**
-     * Checks if the operator of this <code>IndexExpression</code> is a <code>CONTAINS</code> operator.
-     *
-     * @return <code>true</code> if the operator of this <code>IndexExpression</code> is a <code>CONTAINS</code>
-     * operator, <code>false</code> otherwise.
-     */
-    public boolean isContains()
-    {
-        return Operator.CONTAINS == operator;
-    }
-
-    /**
-     * Checks if the operator of this <code>IndexExpression</code> is a <code>CONTAINS_KEY</code> operator.
-     *
-     * @return <code>true</code> if the operator of this <code>IndexExpression</code> is a <code>CONTAINS_KEY</code>
-     * operator, <code>false</code> otherwise.
-     */
-    public boolean isContainsKey()
-    {
-        return Operator.CONTAINS_KEY == operator;
-    }
-
-    @Override
-    public String toString()
-    {
-        return String.format("%s %s %s", ByteBufferUtil.bytesToHex(column), operator, ByteBufferUtil.bytesToHex(value));
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-
-        if (!(o instanceof IndexExpression))
-            return false;
-
-        IndexExpression ie = (IndexExpression) o;
-
-        return Objects.equal(this.column, ie.column)
-            && Objects.equal(this.operator, ie.operator)
-            && Objects.equal(this.value, ie.value);
-    }
-
-    @Override
-    public int hashCode()
-    {
-        return Objects.hashCode(column, operator, value);
-    }
-
-    /**
-     * Write the serialized version of this <code>IndexExpression</code> to the specified output.
-     *
-     * @param output the output to write to
-     * @throws IOException if an I/O problem occurs while writing to the specified output
-     */
-    public void writeTo(DataOutputPlus output) throws IOException
-    {
-        ByteBufferUtil.writeWithShortLength(column, output);
-        operator.writeTo(output);
-        ByteBufferUtil.writeWithShortLength(value, output);
-    }
-
-    /**
-     * Deserializes an <code>IndexExpression</code> instance from the specified input. 
-     *
-     * @param input the input to read from 
-     * @return the <code>IndexExpression</code> instance deserialized
-     * @throws IOException if a problem occurs while deserializing the <code>IndexExpression</code> instance.
-     */
-    public static IndexExpression readFrom(DataInput input) throws IOException
-    {
-        return new IndexExpression(ByteBufferUtil.readWithShortLength(input),
-                                   Operator.readFrom(input),
-                                   ByteBufferUtil.readWithShortLength(input));
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java
index c126f5c..3d3e037 100644
--- a/src/java/org/apache/cassandra/db/Keyspace.java
+++ b/src/java/org/apache/cassandra/db/Keyspace.java

@@ -20,41 +20,49 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.Future;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.Lock;
 
 import com.google.common.base.Function;
 import com.google.common.collect.Iterables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.view.ViewManager;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.service.pager.QueryPagers;
-import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.metrics.KeyspaceMetrics;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 /**
  * It represents a Keyspace.
  */
 public class Keyspace
 {
-    private static final int DEFAULT_PAGE_SIZE = 10000;
-
     private static final Logger logger = LoggerFactory.getLogger(Keyspace.class);
 
     private static final String TEST_FAIL_WRITES_KS = System.getProperty("cassandra.test.fail_writes_ks", "");
     private static final boolean TEST_FAIL_WRITES = !TEST_FAIL_WRITES_KS.isEmpty();
+    private static int TEST_FAIL_MV_LOCKS_COUNT = Integer.getInteger("cassandra.test.fail_mv_locks_count", 0);
 
     public final KeyspaceMetrics metric;
 
@@ -66,12 +74,16 @@
             DatabaseDescriptor.createAllDirectories();
     }
 
-    public final OpOrder writeOrder = new OpOrder();
+    private volatile KeyspaceMetadata metadata;
+
+    //OpOrder is defined globally since we need to order writes across
+    //Keyspaces in the case of Views (batchlog of view mutations)
+    public static final OpOrder writeOrder = new OpOrder();
 
     /* ColumnFamilyStore per column family */
     private final ConcurrentMap<UUID, ColumnFamilyStore> columnFamilyStores = new ConcurrentHashMap<>();
-    private volatile KSMetaData metadata;
     private volatile AbstractReplicationStrategy replicationStrategy;
+    public final ViewManager viewManager;
 
     public static final Function<String,Keyspace> keyspaceTransformer = new Function<String, Keyspace>()
     {
@@ -82,6 +94,7 @@
     };
 
     private static volatile boolean initialized = false;
+
     public static void setInitialized()
     {
         initialized = true;
@@ -89,7 +102,7 @@
 
     public static Keyspace open(String keyspaceName)
     {
-        assert initialized || keyspaceName.equals(SystemKeyspace.NAME);
+        assert initialized || Schema.isLocalSystemKeyspace(keyspaceName);
         return open(keyspaceName, Schema.instance, true);
     }
 
@@ -141,6 +154,11 @@
         }
     }
 
+    public static ColumnFamilyStore openAndGetStore(CFMetaData cfm)
+    {
+        return open(cfm.ksName).getColumnFamilyStore(cfm.cfId);
+    }
+
     /**
      * Removes every SSTable in the directory from the appropriate Tracker's view.
      * @param directory the unreadable directory, possibly with SSTables in it, but not necessarily.
@@ -157,6 +175,17 @@
         }
     }
 
+    public void setMetadata(KeyspaceMetadata metadata)
+    {
+        this.metadata = metadata;
+        createReplicationStrategy(metadata);
+    }
+
+    public KeyspaceMetadata getMetadata()
+    {
+        return metadata;
+    }
+
     public Collection<ColumnFamilyStore> getColumnFamilyStores()
     {
         return Collections.unmodifiableCollection(columnFamilyStores.values());
@@ -178,6 +207,11 @@
         return cfs;
     }
 
+    public boolean hasColumnFamilyStore(UUID id)
+    {
+        return columnFamilyStores.containsKey(id);
+    }
+
     /**
      * Take a snapshot of the specific column family, or the entire set of column families
      * if columnFamily is null with a given timestamp
@@ -242,18 +276,18 @@
      */
     public static void clearSnapshot(String snapshotName, String keyspace)
     {
-        List<File> snapshotDirs = Directories.getKSChildDirectories(keyspace);
+        List<File> snapshotDirs = Directories.getKSChildDirectories(keyspace, ColumnFamilyStore.getInitialDirectories());
         Directories.clearSnapshot(snapshotName, snapshotDirs);
     }
 
     /**
      * @return A list of open SSTableReaders
      */
-    public List<SSTableReader> getAllSSTables()
+    public List<SSTableReader> getAllSSTables(SSTableSet sstableSet)
     {
         List<SSTableReader> list = new ArrayList<>(columnFamilyStores.size());
         for (ColumnFamilyStore cfStore : columnFamilyStores.values())
-            list.addAll(cfStore.getSSTables());
+            Iterables.addAll(list, cfStore.getSSTables(sstableSet));
         return list;
     }
 
@@ -264,43 +298,35 @@
         createReplicationStrategy(metadata);
 
         this.metric = new KeyspaceMetrics(this);
-        for (CFMetaData cfm : new ArrayList<>(metadata.cfMetaData().values()))
+        this.viewManager = new ViewManager(this);
+        for (CFMetaData cfm : metadata.tablesAndViews())
         {
             logger.trace("Initializing {}.{}", getName(), cfm.cfName);
             initCf(cfm, loadSSTables);
         }
+        this.viewManager.reload();
     }
 
-    private Keyspace(KSMetaData metadata)
+    private Keyspace(KeyspaceMetadata metadata)
     {
         this.metadata = metadata;
         createReplicationStrategy(metadata);
         this.metric = new KeyspaceMetrics(this);
+        this.viewManager = new ViewManager(this);
     }
 
-    public static Keyspace mockKS(KSMetaData metadata)
+    public static Keyspace mockKS(KeyspaceMetadata metadata)
     {
         return new Keyspace(metadata);
     }
 
-    private void createReplicationStrategy(KSMetaData ksm)
+    private void createReplicationStrategy(KeyspaceMetadata ksm)
     {
         replicationStrategy = AbstractReplicationStrategy.createReplicationStrategy(ksm.name,
-                                                                                    ksm.strategyClass,
+                                                                                    ksm.params.replication.klass,
                                                                                     StorageService.instance.getTokenMetadata(),
                                                                                     DatabaseDescriptor.getEndpointSnitch(),
-                                                                                    ksm.strategyOptions);
-    }
-
-    public void setMetadata(KSMetaData ksm)
-    {
-        this.metadata = ksm;
-        createReplicationStrategy(ksm);
-    }
-
-    public KSMetaData getMetadata()
-    {
-        return metadata;
+                                                                                    ksm.params.replication.options);
     }
 
     // best invoked on the compaction mananger.
@@ -311,7 +337,7 @@
         if (cfs == null)
             return;
 
-        cfs.getCompactionStrategy().shutdown();
+        cfs.getCompactionStrategyManager().shutdown();
         CompactionManager.instance.interruptCompactionForCFs(cfs.concatWithIndexes(), true);
         // wait for any outstanding reads/writes that might affect the CFS
         cfs.keyspace.writeOrder.awaitNewBarrier();
@@ -349,21 +375,65 @@
             // re-initializing an existing CF.  This will happen if you cleared the schema
             // on this node and it's getting repopulated from the rest of the cluster.
             assert cfs.name.equals(metadata.cfName);
-            cfs.metadata.reload();
             cfs.reload();
         }
     }
 
-    public Row getRow(QueryFilter filter)
+    public CompletableFuture<?> applyFuture(Mutation mutation, boolean writeCommitLog, boolean updateIndexes)
     {
-        ColumnFamilyStore cfStore = getColumnFamilyStore(filter.getColumnFamilyName());
-        ColumnFamily columnFamily = cfStore.getColumnFamily(filter);
-        return new Row(filter.key, columnFamily);
+        return applyInternal(mutation, writeCommitLog, updateIndexes, true, true, new CompletableFuture<>());
     }
 
-    public void apply(Mutation mutation, boolean writeCommitLog)
+    public CompletableFuture<?> applyFuture(Mutation mutation, boolean writeCommitLog, boolean updateIndexes, boolean isDroppable,
+                                            boolean isDeferrable)
     {
-        apply(mutation, writeCommitLog, true);
+        return applyInternal(mutation, writeCommitLog, updateIndexes, isDroppable, isDeferrable, new CompletableFuture<>());
+    }
+
+    public void apply(Mutation mutation, boolean writeCommitLog, boolean updateIndexes)
+    {
+        apply(mutation, writeCommitLog, updateIndexes, true);
+    }
+
+    public void apply(final Mutation mutation,
+                      final boolean writeCommitLog)
+    {
+        apply(mutation, writeCommitLog, true, true);
+    }
+
+    /**
+     * If apply is blocking, apply must not be deferred
+     * Otherwise there is a race condition where ALL mutation workers are beeing blocked ending
+     * in a complete deadlock of the mutation stage. See CASSANDRA-12689.
+     *
+     * @param mutation       the row to write.  Must not be modified after calling apply, since commitlog append
+     *                       may happen concurrently, depending on the CL Executor type.
+     * @param writeCommitLog false to disable commitlog append entirely
+     * @param updateIndexes  false to disable index updates (used by CollationController "defragmenting")
+     * @param isDroppable    true if this should throw WriteTimeoutException if it does not acquire lock within write_request_timeout_in_ms
+     * @throws ExecutionException
+     */
+    public void apply(final Mutation mutation,
+                      final boolean writeCommitLog,
+                      boolean updateIndexes,
+                      boolean isDroppable)
+    {
+        applyInternal(mutation, writeCommitLog, updateIndexes, isDroppable, false, null);
+    }
+
+    /**
+     * Compatibility method that keeps <bold>isClReplay</bold> flag.
+     * @deprecated Use {@link this#applyFuture(Mutation, boolean, boolean, boolean, boolean)} instead
+     */
+    @Deprecated
+    public CompletableFuture<?> apply(final Mutation mutation,
+                                       final boolean writeCommitLog,
+                                       boolean updateIndexes,
+                                       boolean isClReplay,
+                                       boolean isDeferrable,
+                                       CompletableFuture<?> future)
+    {
+        return applyInternal(mutation, writeCommitLog, updateIndexes, !isClReplay, isDeferrable, future != null? future : new CompletableFuture<>());
     }
 
     /**
@@ -373,12 +443,93 @@
      *                       may happen concurrently, depending on the CL Executor type.
      * @param writeCommitLog false to disable commitlog append entirely
      * @param updateIndexes  false to disable index updates (used by CollationController "defragmenting")
+     * @param isDroppable    true if this should throw WriteTimeoutException if it does not acquire lock within write_request_timeout_in_ms
+     * @param isDeferrable   true if caller is not waiting for future to complete, so that future may be deferred
      */
-    public void apply(Mutation mutation, boolean writeCommitLog, boolean updateIndexes)
+    private CompletableFuture<?> applyInternal(final Mutation mutation,
+                                               final boolean writeCommitLog,
+                                               boolean updateIndexes,
+                                               boolean isDroppable,
+                                               boolean isDeferrable,
+                                               CompletableFuture<?> future)
     {
         if (TEST_FAIL_WRITES && metadata.name.equals(TEST_FAIL_WRITES_KS))
             throw new RuntimeException("Testing write failures");
 
+        boolean requiresViewUpdate = updateIndexes && viewManager.updatesAffectView(Collections.singleton(mutation), false);
+
+        Lock lock = null;
+        if (requiresViewUpdate)
+        {
+            mutation.viewLockAcquireStart.compareAndSet(0L, System.currentTimeMillis());
+            while (true)
+            {
+                if (TEST_FAIL_MV_LOCKS_COUNT == 0)
+                    lock = ViewManager.acquireLockFor(mutation.key().getKey());
+                else
+                    TEST_FAIL_MV_LOCKS_COUNT--;
+
+                if (lock == null)
+                {
+                    //throw WTE only if request is droppable
+                    if (isDroppable && (System.currentTimeMillis() - mutation.createdAt) > DatabaseDescriptor.getWriteRpcTimeout())
+                    {
+                        logger.trace("Could not acquire lock for {}", ByteBufferUtil.bytesToHex(mutation.key().getKey()));
+                        Tracing.trace("Could not acquire MV lock");
+                        if (future != null)
+                        {
+                            future.completeExceptionally(new WriteTimeoutException(WriteType.VIEW, ConsistencyLevel.LOCAL_ONE, 0, 1));
+                            return future;
+                        }
+                        else
+                        {
+                            throw new WriteTimeoutException(WriteType.VIEW, ConsistencyLevel.LOCAL_ONE, 0, 1);
+                        }
+                    }
+                    else if (isDeferrable)
+                    {
+                        //This view update can't happen right now. so rather than keep this thread busy
+                        // we will re-apply ourself to the queue and try again later
+                        final CompletableFuture<?> mark = future;
+                        StageManager.getStage(Stage.MUTATION).execute(() ->
+                                applyInternal(mutation, writeCommitLog, true, isDroppable, true, mark)
+                        );
+
+                        return future;
+                    }
+                    else
+                    {
+                        // Retry lock on same thread, if mutation is not deferrable.
+                        // Mutation is not deferrable, if applied from MutationStage and caller is waiting for future to finish
+                        // If blocking caller defers future, this may lead to deadlock situation with all MutationStage workers
+                        // being blocked by waiting for futures which will never be processed as all workers are blocked
+                        try
+                        {
+                            // Wait a little bit before retrying to lock
+                            Thread.sleep(10);
+                        }
+                        catch (InterruptedException e)
+                        {
+                            // Just continue
+                        }
+                        // continue in while loop
+                    }
+                }
+                else
+                {
+                    long acquireTime = System.currentTimeMillis() - mutation.viewLockAcquireStart.get();
+                    // Metrics are only collected for droppable write operations
+                    // Bulk non-droppable operations (e.g. commitlog replay, hint delivery) are not measured
+                    if (isDroppable)
+                    {
+                        for (UUID cfid : mutation.getColumnFamilyIds())
+                            columnFamilyStores.get(cfid).metric.viewLockAcquireTime.update(acquireTime, TimeUnit.MILLISECONDS);
+                    }
+                    break;
+                }
+            }
+        }
+        int nowInSec = FBUtilities.nowInSeconds();
         try (OpOrder.Group opGroup = writeOrder.start())
         {
             // write the mutation to the commitlog and memtables
@@ -389,22 +540,50 @@
                 replayPosition = CommitLog.instance.add(mutation);
             }
 
-            DecoratedKey key = StorageService.getPartitioner().decorateKey(mutation.key());
-            for (ColumnFamily cf : mutation.getColumnFamilies())
+            for (PartitionUpdate upd : mutation.getPartitionUpdates())
             {
-                ColumnFamilyStore cfs = columnFamilyStores.get(cf.id());
+                ColumnFamilyStore cfs = columnFamilyStores.get(upd.metadata().cfId);
                 if (cfs == null)
                 {
-                    logger.error("Attempting to mutate non-existant table {}", cf.id());
+                    logger.error("Attempting to mutate non-existant table {} ({}.{})", upd.metadata().cfId, upd.metadata().ksName, upd.metadata().cfName);
                     continue;
                 }
+                AtomicLong baseComplete = new AtomicLong(Long.MAX_VALUE);
 
-                Tracing.trace("Adding to {} memtable", cf.metadata().cfName);
-                SecondaryIndexManager.Updater updater = updateIndexes
-                                                      ? cfs.indexManager.updaterFor(key, cf, opGroup)
-                                                      : SecondaryIndexManager.nullUpdater;
-                cfs.apply(key, cf, updater, opGroup, replayPosition);
+                if (requiresViewUpdate)
+                {
+                    try
+                    {
+                        Tracing.trace("Creating materialized view mutations from base table replica");
+                        viewManager.forTable(upd.metadata()).pushViewReplicaUpdates(upd, writeCommitLog, baseComplete);
+                    }
+                    catch (Throwable t)
+                    {
+                        JVMStabilityInspector.inspectThrowable(t);
+                        logger.error(String.format("Unknown exception caught while attempting to update MaterializedView! %s.%s",
+                                     upd.metadata().ksName, upd.metadata().cfName), t);
+                        throw t;
+                    }
+                }
+
+                Tracing.trace("Adding to {} memtable", upd.metadata().cfName);
+                UpdateTransaction indexTransaction = updateIndexes
+                                                     ? cfs.indexManager.newUpdateTransaction(upd, opGroup, nowInSec)
+                                                     : UpdateTransaction.NO_OP;
+                cfs.apply(upd, indexTransaction, opGroup, replayPosition);
+                if (requiresViewUpdate)
+                    baseComplete.set(System.currentTimeMillis());
             }
+
+            if (future != null) {
+                future.complete(null);
+            }
+            return future;
+        }
+        finally
+        {
+            if (lock != null)
+                lock.unlock();
         }
     }
 
@@ -413,34 +592,6 @@
         return replicationStrategy;
     }
 
-    /**
-     * @param key row to index
-     * @param cfs ColumnFamily to index row in
-     * @param idxNames columns to index, in comparator order
-     */
-    public static void indexRow(DecoratedKey key, ColumnFamilyStore cfs, Set<String> idxNames)
-    {
-        if (logger.isTraceEnabled())
-            logger.trace("Indexing row {} ", cfs.metadata.getKeyValidator().getString(key.getKey()));
-
-        Set<SecondaryIndex> indexes = cfs.indexManager.getIndexesByNames(idxNames);
-
-        Iterator<ColumnFamily> pager = QueryPagers.pageRowLocally(cfs, key.getKey(), DEFAULT_PAGE_SIZE);
-        while (pager.hasNext())
-        {
-            try (OpOrder.Group opGroup = cfs.keyspace.writeOrder.start()) {
-                ColumnFamily cf = pager.next();
-                ColumnFamily cf2 = cf.cloneMeShallow();
-                for (Cell cell : cf)
-                {
-                    if (cfs.indexManager.indexes(cell.name(), indexes))
-                        cf2.addColumn(cell);
-                }
-                cfs.indexManager.indexRow(key.getKey(), cf2, opGroup);
-            }
-        }
-    }
-
     public List<Future<?>> flush()
     {
         List<Future<?>> futures = new ArrayList<>(columnFamilyStores.size());
@@ -449,7 +600,9 @@
         return futures;
     }
 
-    public Iterable<ColumnFamilyStore> getValidColumnFamilies(boolean allowIndexes, boolean autoAddIndexes, String... cfNames) throws IOException
+    public Iterable<ColumnFamilyStore> getValidColumnFamilies(boolean allowIndexes,
+                                                              boolean autoAddIndexes,
+                                                              String... cfNames) throws IOException
     {
         Set<ColumnFamilyStore> valid = new HashSet<>();
 
@@ -460,65 +613,57 @@
             {
                 valid.add(cfStore);
                 if (autoAddIndexes)
-                {
-                    for (SecondaryIndex si : cfStore.indexManager.getIndexes())
-                    {
-                        if (si.getIndexCfs() != null) {
-                            logger.info("adding secondary index {} to operation", si.getIndexName());
-                            valid.add(si.getIndexCfs());
-                        }
-                    }
-
-                }
+                    valid.addAll(getIndexColumnFamilyStores(cfStore));
             }
             return valid;
         }
-        // filter out interesting stores
+
+        // include the specified stores and possibly the stores of any of their indexes
         for (String cfName : cfNames)
         {
-            //if the CF name is an index, just flush the CF that owns the index
-            String baseCfName = cfName;
-            String idxName = null;
-            if (cfName.contains(".")) // secondary index
+            if (SecondaryIndexManager.isIndexColumnFamily(cfName))
             {
-                if(!allowIndexes)
+                if (!allowIndexes)
                 {
                     logger.warn("Operation not allowed on secondary Index table ({})", cfName);
                     continue;
                 }
+                String baseName = SecondaryIndexManager.getParentCfsName(cfName);
+                String indexName = SecondaryIndexManager.getIndexName(cfName);
 
-                String[] parts = cfName.split("\\.", 2);
-                baseCfName = parts[0];
-                idxName = parts[1];
-            }
+                ColumnFamilyStore baseCfs = getColumnFamilyStore(baseName);
+                Index index = baseCfs.indexManager.getIndexByName(indexName);
+                if (index == null)
+                    throw new IllegalArgumentException(String.format("Invalid index specified: %s/%s.",
+                                                                     baseCfs.metadata.cfName,
+                                                                     indexName));
 
-            ColumnFamilyStore cfStore = getColumnFamilyStore(baseCfName);
-            if (idxName != null)
-            {
-                Collection< SecondaryIndex > indexes = cfStore.indexManager.getIndexesByNames(new HashSet<>(Arrays.asList(cfName)));
-                if (indexes.isEmpty())
-                    throw new IllegalArgumentException(String.format("Invalid index specified: %s/%s.", baseCfName, idxName));
-                else
-                    valid.add(Iterables.get(indexes, 0).getIndexCfs());
+                if (index.getBackingTable().isPresent())
+                    valid.add(index.getBackingTable().get());
             }
             else
             {
+                ColumnFamilyStore cfStore = getColumnFamilyStore(cfName);
                 valid.add(cfStore);
-                if(autoAddIndexes)
-                {
-                    for(SecondaryIndex si : cfStore.indexManager.getIndexes())
-                    {
-                        if (si.getIndexCfs() != null) {
-                            logger.info("adding secondary index {} to operation", si.getIndexName());
-                            valid.add(si.getIndexCfs());
-                        }
-                    }
-                }
+                if (autoAddIndexes)
+                    valid.addAll(getIndexColumnFamilyStores(cfStore));
             }
         }
+
         return valid;
     }
 
+    private Set<ColumnFamilyStore> getIndexColumnFamilyStores(ColumnFamilyStore baseCfs)
+    {
+        Set<ColumnFamilyStore> stores = new HashSet<>();
+        for (ColumnFamilyStore indexCfs : baseCfs.indexManager.getAllIndexColumnFamilyStores())
+        {
+            logger.info("adding secondary index table {} to operation", indexCfs.metadata.cfName);
+            stores.add(indexCfs);
+        }
+        return stores;
+    }
+
     public static Iterable<Keyspace> all()
     {
         return Iterables.transform(Schema.instance.getKeyspaces(), keyspaceTransformer);
@@ -529,9 +674,14 @@
         return Iterables.transform(Schema.instance.getNonSystemKeyspaces(), keyspaceTransformer);
     }
 
+    public static Iterable<Keyspace> nonLocalStrategy()
+    {
+        return Iterables.transform(Schema.instance.getNonLocalStrategyKeyspaces(), keyspaceTransformer);
+    }
+
     public static Iterable<Keyspace> system()
     {
-        return Iterables.transform(Collections.singleton(SystemKeyspace.NAME), keyspaceTransformer);
+        return Iterables.transform(Schema.LOCAL_SYSTEM_KEYSPACE_NAMES, keyspaceTransformer);
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/db/LegacyLayout.java b/src/java/org/apache/cassandra/db/LegacyLayout.java
new file mode 100644
index 0000000..8492de5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/LegacyLayout.java

@@ -0,0 +1,2793 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.io.IOError;
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
+import org.apache.cassandra.utils.AbstractIterator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import com.google.common.collect.PeekingIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.collect.Iterables.all;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+/**
+ * Functions to deal with the old format.
+ */
+public abstract class LegacyLayout
+{
+    private static final Logger logger = LoggerFactory.getLogger(LegacyLayout.class);
+    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1L, TimeUnit.MINUTES);
+
+    public final static int MAX_CELL_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT;
+
+    public final static int STATIC_PREFIX = 0xFFFF;
+
+    public final static int DELETION_MASK        = 0x01;
+    public final static int EXPIRATION_MASK      = 0x02;
+    public final static int COUNTER_MASK         = 0x04;
+    public final static int COUNTER_UPDATE_MASK  = 0x08;
+    private final static int RANGE_TOMBSTONE_MASK = 0x10;
+
+    // Used in decodeBound if the number of components in the legacy bound is greater than the clustering size,
+    // indicating a complex column deletion (i.e. a collection tombstone), but the referenced column is either
+    // not present in the current table metadata, or is not currently a complex column. In that case, we'll
+    // check the dropped columns for the table which should contain the previous column definition. If that
+    // previous definition is also not complex (indicating that the column may have been dropped and re-added
+    // with different types multiple times), we use this fake definition to ensure that the complex deletion
+    // can be safely processed. This resulting deletion should be filtered out of any row created by a
+    // CellGrouper by the dropped column check, but this gives us an extra level of confidence as that check
+    // is timestamp based and so is fallible in the face of clock drift.
+    private static final ColumnDefinition INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN =
+        new ColumnDefinition("",
+                             "",
+                             ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance),
+                             SetType.getInstance(UTF8Type.instance, true),
+                             ColumnDefinition.NO_POSITION,
+                             ColumnDefinition.Kind.REGULAR);
+
+    private LegacyLayout() {}
+
+    public static AbstractType<?> makeLegacyComparator(CFMetaData metadata)
+    {
+        ClusteringComparator comparator = metadata.comparator;
+        if (!metadata.isCompound())
+        {
+            assert comparator.size() == 1;
+            return comparator.subtype(0);
+        }
+
+        boolean hasCollections = metadata.hasCollectionColumns() || metadata.hasDroppedCollectionColumns();
+        List<AbstractType<?>> types = new ArrayList<>(comparator.size() + (metadata.isDense() ? 0 : 1) + (hasCollections ? 1 : 0));
+
+        types.addAll(comparator.subtypes());
+
+        if (!metadata.isDense())
+        {
+            types.add(UTF8Type.instance);
+
+            if (hasCollections)
+            {
+                Map<ByteBuffer, CollectionType> defined = new HashMap<>();
+
+                for (CFMetaData.DroppedColumn def : metadata.getDroppedColumns().values())
+                    if (def.type instanceof CollectionType && def.type.isMultiCell())
+                        defined.put(bytes(def.name), (CollectionType) def.type);
+
+                for (ColumnDefinition def : metadata.partitionColumns())
+                    if (def.type instanceof CollectionType && def.type.isMultiCell())
+                        defined.put(def.name.bytes, (CollectionType) def.type);
+
+                types.add(ColumnToCollectionType.getInstance(defined));
+            }
+        }
+        return CompositeType.getInstance(types);
+    }
+
+    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer cellname)
+    throws UnknownColumnException
+    {
+        assert cellname != null;
+        if (metadata.isSuper())
+        {
+            assert superColumnName != null;
+            return decodeForSuperColumn(metadata, new Clustering(superColumnName), cellname);
+        }
+
+        assert superColumnName == null;
+        return decodeCellName(metadata, cellname);
+    }
+
+    private static LegacyCellName decodeForSuperColumn(CFMetaData metadata, Clustering clustering, ByteBuffer subcol)
+    {
+        ColumnDefinition def = metadata.getColumnDefinition(subcol);
+        if (def != null)
+        {
+            // it's a statically defined subcolumn
+            return new LegacyCellName(clustering, def, null);
+        }
+
+        def = metadata.compactValueColumn();
+        assert def != null && def.type instanceof MapType;
+        return new LegacyCellName(clustering, def, subcol);
+    }
+
+    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname) throws UnknownColumnException
+    {
+        return decodeCellName(metadata, cellname, false);
+    }
+
+    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname, boolean readAllAsDynamic) throws UnknownColumnException
+    {
+        Clustering clustering = decodeClustering(metadata, cellname);
+
+        if (metadata.isSuper())
+            return decodeForSuperColumn(metadata, clustering, CompositeType.extractComponent(cellname, 1));
+
+        if (metadata.isDense() || (metadata.isCompactTable() && readAllAsDynamic))
+            return new LegacyCellName(clustering, metadata.compactValueColumn(), null);
+
+        ByteBuffer column = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size()) : cellname;
+        if (column == null)
+        {
+            // Tables for composite 2ndary indexes used to be compound but dense, but we've transformed them into regular tables
+            // (non compact ones) but with no regular column (i.e. we only care about the clustering). So we'll get here
+            // in that case, and what we want to return is basically a row marker.
+            if (metadata.partitionColumns().isEmpty())
+                return new LegacyCellName(clustering, null, null);
+
+            // Otherwise, we shouldn't get there
+            throw new IllegalArgumentException("No column name component found in cell name");
+        }
+
+        // Row marker, this is ok
+        if (!column.hasRemaining())
+            return new LegacyCellName(clustering, null, null);
+
+        ColumnDefinition def = metadata.getColumnDefinition(column);
+
+        if (metadata.isCompactTable())
+        {
+            if (def == null || def.isPrimaryKeyColumn())
+                // If it's a compact table, it means the column is in fact a "dynamic" one
+                return new LegacyCellName(new Clustering(column), metadata.compactValueColumn(), null);
+        }
+        else if (def == null)
+        {
+            throw new UnknownColumnException(metadata, column);
+        }
+
+        ByteBuffer collectionElement = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size() + 1) : null;
+        if (collectionElement != null && def.type instanceof CollectionType)
+        {
+            ((CollectionType)def.type).nameComparator().validateIfFixedSize(collectionElement);
+        }
+
+        // Note that because static compact columns are translated to static defs in the new world order, we need to force a static
+        // clustering if the definition is static (as it might not be in this case).
+        return new LegacyCellName(def.isStatic() ? Clustering.STATIC_CLUSTERING : clustering, def, collectionElement);
+    }
+
+    public static LegacyBound decodeSliceBound(CFMetaData metadata, ByteBuffer bound, boolean isStart)
+    {
+        return decodeBound(metadata, bound, isStart, false);
+    }
+
+    public static LegacyBound decodeTombstoneBound(CFMetaData metadata, ByteBuffer bound, boolean isStart)
+    {
+        return decodeBound(metadata, bound, isStart, true);
+    }
+
+    private static LegacyBound decodeBound(CFMetaData metadata, ByteBuffer bound, boolean isStart, boolean isDeletion)
+    {
+        if (!bound.hasRemaining())
+            return isStart ? LegacyBound.BOTTOM : LegacyBound.TOP;
+
+        if (!metadata.isCompound())
+        {
+            // The non compound case is a lot easier, in that there is no EOC nor collection to worry about, so dealing
+            // with that first.
+            metadata.comparator.subtype(0).validateIfFixedSize(bound);
+            return new LegacyBound(isStart ? Slice.Bound.inclusiveStartOf(bound) : Slice.Bound.inclusiveEndOf(bound), false, null);
+        }
+
+        int clusteringSize = metadata.comparator.size();
+
+        boolean isStatic = metadata.isCompound() && CompositeType.isStaticName(bound);
+        List<ByteBuffer> components = CompositeType.splitName(bound);
+        byte eoc = CompositeType.lastEOC(bound);
+        for (int i=0; i<Math.min(clusteringSize, components.size()); i++)
+        {
+            metadata.comparator.subtype(i).validateIfFixedSize(components.get(i));
+        }
+
+        // if the bound we have decoded is static, 2.2 format requires there to be N empty clusterings
+        assert !isStatic ||
+                (components.size() >= clusteringSize
+                        && all(components.subList(0, clusteringSize), ByteBufferUtil.EMPTY_BYTE_BUFFER::equals));
+
+        ColumnDefinition collectionName = null;
+        if (components.size() > clusteringSize)
+        {
+            // For a deletion, there can be more components than the clustering size only in the case this is the
+            // bound of a collection range tombstone. In such a case, there is exactly one more component, and that
+            // component is the name of the collection being deleted, since we do not support collection range deletions.
+            // If the bound is not part of a deletion, it is from slice query filter. The column name may be:
+            //   - a valid, non-collection column; in this case we expect a single extra component
+            //   - an empty buffer, representing a row marker; in this case we also expect a single extra empty component
+            //   - a valid collection column and the first part of a cell path; in this case we expect exactly two extra components
+            // In any of these slice cases, these items are unnecessary for the bound we construct,
+            // so we can simply remove them, after corroborating we have encountered one of these scenario.
+            assert !metadata.isCompactTable() : toDebugHex(components);
+
+            // In all cases, the element straight after the clusterings should contain the name of a column.
+            if (components.size() > clusteringSize + 1)
+            {
+                // we accept bounds from paging state that occur inside a complex column - in this case, we expect
+                // two excess components, the first of which is a column name, the second a key into the collection
+                if (isDeletion)
+                    throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": deletion can have at most one extra component");
+
+                if (clusteringSize + 2 != components.size())
+                    throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": complex slices require exactly two extra components");
+
+                // decode simply to verify that we have (or may have had) a complex column; we assume the collection key is valid
+                decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic);
+                components.remove(clusteringSize + 1);
+            }
+            else if (isDeletion)
+            {
+                collectionName = decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic);
+            }
+            else if (components.get(clusteringSize).hasRemaining())
+            {
+                decodeBoundVerifySimpleColumn(metadata, components, clusteringSize, isStatic);
+            }
+            components.remove(clusteringSize);
+        }
+
+        boolean isInclusive;
+        if (isStart)
+        {
+            isInclusive = eoc <= 0;
+        }
+        else
+        {
+            isInclusive = eoc >= 0;
+
+            // for an end bound, if we only have a prefix of all the components and the final EOC is zero,
+            // then it should only match up to the prefix but no further, that is, it is an inclusive bound
+            // of the exact prefix but an exclusive bound of anything beyond it, so adding an empty
+            // composite value ensures this behavior, see CASSANDRA-12423 for more details
+            if (eoc == 0 && components.size() < clusteringSize)
+            {
+                components.add(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+                isInclusive = false;
+            }
+        }
+
+        Slice.Bound.Kind boundKind = Slice.Bound.boundKind(isStart, isInclusive);
+        Slice.Bound sb = Slice.Bound.create(boundKind, components.toArray(new ByteBuffer[components.size()]));
+        return new LegacyBound(sb, isStatic, collectionName);
+    }
+
+    // finds the simple column definition associated with components.get(clusteringSize)
+    // if no such columns exists, or ever existed, we throw an exception; if we do not know, we return a dummy column definition
+    private static ColumnDefinition decodeBoundLookupComplexColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic)
+    {
+        ByteBuffer columnNameBytes = components.get(clusteringSize);
+        ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes);
+        if (columnName == null || !columnName.isComplex())
+        {
+            columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic);
+            // if no record of the column having ever existed is found, something is badly wrong
+            if (columnName == null)
+                throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected complex column at position " + clusteringSize);
+
+            // if we do have a record of dropping this column but it wasn't previously complex, use a fake
+            // column definition for safety (see the comment on the constant declaration for details)
+            if (!columnName.isComplex())
+                columnName = INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN;
+        }
+
+        return columnName;
+    }
+
+    // finds the simple column definition associated with components.get(clusteringSize)
+    // if no such columns exists, and definitely never existed, we throw an exception
+    private static void decodeBoundVerifySimpleColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic)
+    {
+        ByteBuffer columnNameBytes = components.get(clusteringSize);
+        ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes);
+        if (columnName == null || !columnName.isSimple())
+        {
+            columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic);
+            // if no record of the column having ever existed is found, something is badly wrong
+            if (columnName == null)
+                throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected simple column at position " + clusteringSize);
+        }
+    }
+
+    private static String toDebugHex(Collection<ByteBuffer> buffers)
+    {
+        return buffers.stream().map(ByteBufferUtil::bytesToHex).collect(Collectors.joining());
+    }
+
+    public static ByteBuffer encodeBound(CFMetaData metadata, Slice.Bound bound, boolean isStart)
+    {
+        if (bound == Slice.Bound.BOTTOM || bound == Slice.Bound.TOP || metadata.comparator.size() == 0)
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+        ClusteringPrefix clustering = bound.clustering();
+
+        if (!metadata.isCompound())
+        {
+            assert clustering.size() == 1;
+            return clustering.get(0);
+        }
+
+        CompositeType ctype = CompositeType.getInstance(metadata.comparator.subtypes());
+        CompositeType.Builder builder = ctype.builder();
+        for (int i = 0; i < clustering.size(); i++)
+            builder.add(clustering.get(i));
+
+        if (isStart)
+            return bound.isInclusive() ? builder.build() : builder.buildAsEndOfRange();
+        else
+            return bound.isInclusive() ? builder.buildAsEndOfRange() : builder.build();
+    }
+
+    public static ByteBuffer encodeCellName(CFMetaData metadata, ClusteringPrefix clustering, ByteBuffer columnName, ByteBuffer collectionElement)
+    {
+        boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;
+
+        if (!metadata.isCompound())
+        {
+            if (isStatic)
+                return columnName;
+
+            assert clustering.size() == 1 : "Expected clustering size to be 1, but was " + clustering.size();
+            return clustering.get(0);
+        }
+
+        // We use comparator.size() rather than clustering.size() because of static clusterings
+        int clusteringSize = metadata.comparator.size();
+        int size = clusteringSize + (metadata.isDense() ? 0 : 1) + (collectionElement == null ? 0 : 1);
+        if (metadata.isSuper())
+            size = clusteringSize + 1;
+        ByteBuffer[] values = new ByteBuffer[size];
+        for (int i = 0; i < clusteringSize; i++)
+        {
+            if (isStatic)
+            {
+                values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+                continue;
+            }
+
+            ByteBuffer v = clustering.get(i);
+            // we can have null (only for dense compound tables for backward compatibility reasons) but that
+            // means we're done and should stop there as far as building the composite is concerned.
+            if (v == null)
+                return CompositeType.build(Arrays.copyOfRange(values, 0, i));
+
+            values[i] = v;
+        }
+
+        if (metadata.isSuper())
+        {
+            // We need to set the "column" (in thrift terms) name, i.e. the value corresponding to the subcomparator.
+            // What it is depends if this a cell for a declared "static" column or a "dynamic" column part of the
+            // super-column internal map.
+            assert columnName != null; // This should never be null for supercolumns, see decodeForSuperColumn() above
+            values[clusteringSize] = columnName.equals(SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN)
+                                   ? collectionElement
+                                   : columnName;
+        }
+        else
+        {
+            if (!metadata.isDense())
+                values[clusteringSize] = columnName;
+            if (collectionElement != null)
+                values[clusteringSize + 1] = collectionElement;
+        }
+
+        return CompositeType.build(isStatic, values);
+    }
+
+    public static Clustering decodeClustering(CFMetaData metadata, ByteBuffer value)
+    {
+        int csize = metadata.comparator.size();
+        if (csize == 0)
+            return Clustering.EMPTY;
+
+        if (metadata.isCompound() && CompositeType.isStaticName(value))
+            return Clustering.STATIC_CLUSTERING;
+
+        List<ByteBuffer> components = metadata.isCompound()
+                                    ? CompositeType.splitName(value)
+                                    : Collections.singletonList(value);
+
+        for (int i=0; i<Math.min(csize, components.size()); i++)
+        {
+            AbstractType<?> type = metadata.comparator.subtype(i);
+            type.validateIfFixedSize(components.get(i));
+        }
+        return new Clustering(components.subList(0, Math.min(csize, components.size())).toArray(new ByteBuffer[csize]));
+    }
+
+    public static ByteBuffer encodeClustering(CFMetaData metadata, ClusteringPrefix clustering)
+    {
+        if (clustering.size() == 0)
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+        if (!metadata.isCompound())
+        {
+            assert clustering.size() == 1;
+            return clustering.get(0);
+        }
+
+        ByteBuffer[] values = new ByteBuffer[clustering.size()];
+        for (int i = 0; i < clustering.size(); i++)
+            values[i] = clustering.get(i);
+        return CompositeType.build(values);
+    }
+
+    /**
+     * The maximum number of cells to include per partition when converting to the old format.
+     * <p>
+     * We already apply the limit during the actual query, but for queries that counts cells and not rows (thrift queries
+     * and distinct queries as far as old nodes are concerned), we may still include a little bit more than requested
+     * because {@link DataLimits} always include full rows. So if the limit ends in the middle of a queried row, the
+     * full row will be part of our result. This would confuse old nodes however so we make sure to truncate it to
+     * what's expected before writting it on the wire.
+     *
+     * @param command the read commmand for which to determine the maximum cells per partition. This can be {@code null}
+     * in which case {@code Integer.MAX_VALUE} is returned.
+     * @return the maximum number of cells per partition that should be enforced according to the read command if
+     * post-query limitation are in order (see above). This will be {@code Integer.MAX_VALUE} if no such limits are
+     * necessary.
+     */
+    private static int maxLiveCellsPerPartition(ReadCommand command)
+    {
+        if (command == null)
+            return Integer.MAX_VALUE;
+
+        DataLimits limits = command.limits();
+
+        // There is 2 types of DISTINCT queries: those that includes only the partition key, and those that include static columns.
+        // On old nodes, the latter expects the first row in term of CQL count, which is what we already have and there is no additional
+        // limit to apply. The former however expect only one cell per partition and rely on it (See CASSANDRA-10762).
+        if (limits.isDistinct())
+            return command.columnFilter().fetchedColumns().statics.isEmpty() ? 1 : Integer.MAX_VALUE;
+
+        switch (limits.kind())
+        {
+            case THRIFT_LIMIT:
+            case SUPER_COLUMN_COUNTING_LIMIT:
+                return limits.perPartitionCount();
+            default:
+                return Integer.MAX_VALUE;
+        }
+    }
+
+    // For serializing to old wire format
+    public static LegacyUnfilteredPartition fromUnfilteredRowIterator(ReadCommand command, UnfilteredRowIterator iterator)
+    {
+        // we need to extract the range tombstone so materialize the partition. Since this is
+        // used for the on-wire format, this is not worst than it used to be.
+        final ImmutableBTreePartition partition = ImmutableBTreePartition.create(iterator);
+        DeletionInfo info = partition.deletionInfo();
+        Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> pair = fromRowIterator(partition.metadata(), partition.iterator(), partition.staticRow());
+
+        LegacyLayout.LegacyRangeTombstoneList rtl = pair.left;
+
+        // Processing the cell iterator results in the LegacyRangeTombstoneList being populated, so we do this
+        // before we use the LegacyRangeTombstoneList at all
+        List<LegacyLayout.LegacyCell> cells = Lists.newArrayList(pair.right);
+
+        int maxCellsPerPartition = maxLiveCellsPerPartition(command);
+        cells = maybeTrimLiveCells(cells, maxCellsPerPartition, command);
+
+        // The LegacyRangeTombstoneList already has range tombstones for the single-row deletions and complex
+        // deletions.  Go through our normal range tombstones and add then to the LegacyRTL so that the range
+        // tombstones all get merged and sorted properly.
+        if (info.hasRanges())
+        {
+            Iterator<RangeTombstone> rangeTombstoneIterator = info.rangeIterator(false);
+            while (rangeTombstoneIterator.hasNext())
+            {
+                RangeTombstone rt = rangeTombstoneIterator.next();
+                Slice slice = rt.deletedSlice();
+                LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(slice.start(), false, null);
+                LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(slice.end(), false, null);
+                rtl.add(start, end, rt.deletionTime().markedForDeleteAt(), rt.deletionTime().localDeletionTime());
+            }
+        }
+
+        return new LegacyUnfilteredPartition(info.getPartitionDeletion(), rtl, cells);
+    }
+
+    private static List<LegacyCell> maybeTrimLiveCells(List<LegacyCell> cells, int maxLiveCells, ReadCommand command)
+    {
+        if (null == command || maxLiveCells >= cells.size())
+            return cells;
+
+        int nowInSec = command.nowInSec();
+        int live = 0;
+        int dead = 0;
+
+        for (int i = 0; i < cells.size() && live < maxLiveCells; i++)
+        {
+            if (cells.get(i).isLive(nowInSec))
+                live++;
+            else
+                dead++;
+        }
+
+        return cells.subList(0, live + dead);
+    }
+
+    public static void serializeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, DataOutputPlus out, int version) throws IOException
+    {
+        assert version < MessagingService.VERSION_30;
+
+        out.writeBoolean(true);
+
+        LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition);
+
+        UUIDSerializer.serializer.serialize(partition.metadata().cfId, out, version);
+        DeletionTime.serializer.serialize(legacyPartition.partitionDeletion, out);
+
+        legacyPartition.rangeTombstones.serialize(out, partition.metadata());
+
+        // begin cell serialization
+        out.writeInt(legacyPartition.cells.size());
+        for (LegacyLayout.LegacyCell cell : legacyPartition.cells)
+        {
+            ByteBufferUtil.writeWithShortLength(cell.name.encode(partition.metadata()), out);
+            out.writeByte(cell.serializationFlags());
+            if (cell.isExpiring())
+            {
+                out.writeInt(cell.ttl);
+                out.writeInt(cell.localDeletionTime);
+            }
+            else if (cell.isTombstone())
+            {
+                out.writeLong(cell.timestamp);
+                out.writeInt(TypeSizes.sizeof(cell.localDeletionTime));
+                out.writeInt(cell.localDeletionTime);
+                continue;
+            }
+            else if (cell.isCounterUpdate())
+            {
+                out.writeLong(cell.timestamp);
+                long count = CounterContext.instance().getUpdateCount(cell.value);
+                ByteBufferUtil.writeWithLength(ByteBufferUtil.bytes(count), out);
+                continue;
+            }
+            else if (cell.isCounter())
+            {
+                out.writeLong(Long.MIN_VALUE);  // timestampOfLastDelete (not used, and MIN_VALUE is the default)
+            }
+
+            out.writeLong(cell.timestamp);
+            ByteBufferUtil.writeWithLength(cell.value, out);
+        }
+    }
+
+    // For the old wire format
+    // Note: this can return null if an empty partition is serialized!
+    public static UnfilteredRowIterator deserializeLegacyPartition(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException
+    {
+        assert version < MessagingService.VERSION_30;
+
+        // This is only used in mutation, and mutation have never allowed "null" column families
+        boolean present = in.readBoolean();
+        if (!present)
+            return null;
+
+        CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
+        LegacyDeletionInfo info = LegacyDeletionInfo.deserialize(metadata, in);
+        int size = in.readInt();
+        Iterator<LegacyCell> cells = deserializeCells(metadata, in, flag, size);
+        SerializationHelper helper = new SerializationHelper(metadata, version, flag);
+        return onWireCellstoUnfilteredRowIterator(metadata, metadata.partitioner.decorateKey(key), info, cells, false, helper);
+    }
+
+    // For the old wire format
+    public static long serializedSizeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, int version)
+    {
+        assert version < MessagingService.VERSION_30;
+
+        if (partition.isEmpty())
+            return TypeSizes.sizeof(false);
+
+        long size = TypeSizes.sizeof(true);
+
+        LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition);
+
+        size += UUIDSerializer.serializer.serializedSize(partition.metadata().cfId, version);
+        size += DeletionTime.serializer.serializedSize(legacyPartition.partitionDeletion);
+        size += legacyPartition.rangeTombstones.serializedSize(partition.metadata());
+
+        // begin cell serialization
+        size += TypeSizes.sizeof(legacyPartition.cells.size());
+        for (LegacyLayout.LegacyCell cell : legacyPartition.cells)
+        {
+            size += ByteBufferUtil.serializedSizeWithShortLength(cell.name.encode(partition.metadata()));
+            size += 1;  // serialization flags
+            if (cell.isExpiring())
+            {
+                size += TypeSizes.sizeof(cell.ttl);
+                size += TypeSizes.sizeof(cell.localDeletionTime);
+            }
+            else if (cell.isTombstone())
+            {
+                size += TypeSizes.sizeof(cell.timestamp);
+                // localDeletionTime replaces cell.value as the body
+                size += TypeSizes.sizeof(TypeSizes.sizeof(cell.localDeletionTime));
+                size += TypeSizes.sizeof(cell.localDeletionTime);
+                continue;
+            }
+            else if (cell.isCounterUpdate())
+            {
+                size += TypeSizes.sizeof(cell.timestamp);
+                long count = CounterContext.instance().getUpdateCount(cell.value);
+                size += ByteBufferUtil.serializedSizeWithLength(ByteBufferUtil.bytes(count));
+                continue;
+            }
+            else if (cell.isCounter())
+            {
+                size += TypeSizes.sizeof(Long.MIN_VALUE);  // timestampOfLastDelete
+            }
+
+            size += TypeSizes.sizeof(cell.timestamp);
+            size += ByteBufferUtil.serializedSizeWithLength(cell.value);
+        }
+
+        return size;
+    }
+
+    // For thrift sake
+    public static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata,
+                                                                DecoratedKey key,
+                                                                LegacyDeletionInfo delInfo,
+                                                                Iterator<LegacyCell> cells)
+    {
+        SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL);
+        return toUnfilteredRowIterator(metadata, key, delInfo, cells, false, helper);
+    }
+
+    // For deserializing old wire format
+    public static UnfilteredRowIterator onWireCellstoUnfilteredRowIterator(CFMetaData metadata,
+                                                                           DecoratedKey key,
+                                                                           LegacyDeletionInfo delInfo,
+                                                                           Iterator<LegacyCell> cells,
+                                                                           boolean reversed,
+                                                                           SerializationHelper helper)
+    {
+
+        // If the table is a static compact, the "column_metadata" are now internally encoded as
+        // static. This has already been recognized by decodeCellName, but it means the cells
+        // provided are not in the expected order (the "static" cells are not necessarily at the front).
+        // So sort them to make sure toUnfilteredRowIterator works as expected.
+        // Further, if the query is reversed, then the on-wire format still has cells in non-reversed
+        // order, but we need to have them reverse in the final UnfilteredRowIterator. So reverse them.
+        if (metadata.isStaticCompactTable() || reversed)
+        {
+            List<LegacyCell> l = new ArrayList<>();
+            Iterators.addAll(l, cells);
+            Collections.sort(l, legacyCellComparator(metadata, reversed));
+            cells = l.iterator();
+        }
+
+        return toUnfilteredRowIterator(metadata, key, delInfo, cells, reversed, helper);
+    }
+
+    private static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata,
+                                                                 DecoratedKey key,
+                                                                 LegacyDeletionInfo delInfo,
+                                                                 Iterator<LegacyCell> cells,
+                                                                 boolean reversed,
+                                                                 SerializationHelper helper)
+    {
+        // A reducer that basically does nothing, we know the 2 merged iterators can't have conflicting atoms (since we merge cells with range tombstones).
+        MergeIterator.Reducer<LegacyAtom, LegacyAtom> reducer = new MergeIterator.Reducer<LegacyAtom, LegacyAtom>()
+        {
+            private LegacyAtom atom;
+
+            public void reduce(int idx, LegacyAtom current)
+            {
+                // We're merging cell with range tombstones, so we should always only have a single atom to reduce.
+                assert atom == null;
+                atom = current;
+            }
+
+            protected LegacyAtom getReduced()
+            {
+                return atom;
+            }
+
+            protected void onKeyChange()
+            {
+                atom = null;
+            }
+        };
+        List<Iterator<LegacyAtom>> iterators = Arrays.asList(asLegacyAtomIterator(cells), asLegacyAtomIterator(delInfo.inRowRangeTombstones()));
+        PeekingIterator<LegacyAtom> atoms = Iterators.peekingIterator(MergeIterator.get(iterators, legacyAtomComparator(metadata), reducer));
+
+        // Check if we have some static
+        Row staticRow = atoms.hasNext() && atoms.peek().isStatic()
+                      ? getNextRow(CellGrouper.staticGrouper(metadata, helper), atoms)
+                      : Rows.EMPTY_STATIC_ROW;
+
+        Iterator<Row> rows = convertToRows(new CellGrouper(metadata, helper), atoms);
+        Iterator<RangeTombstone> ranges = delInfo.deletionInfo.rangeIterator(reversed);
+        return new RowAndDeletionMergeIterator(metadata,
+                                               key,
+                                               delInfo.deletionInfo.getPartitionDeletion(),
+                                               ColumnFilter.all(metadata),
+                                               staticRow,
+                                               reversed,
+                                               EncodingStats.NO_STATS,
+                                               rows,
+                                               ranges,
+                                               true);
+    }
+
+    public static Row extractStaticColumns(CFMetaData metadata, DataInputPlus in, Columns statics) throws IOException
+    {
+        assert !statics.isEmpty();
+        assert metadata.isCompactTable();
+
+        if (metadata.isSuper())
+            // TODO: there is in practice nothing to do here, but we need to handle the column_metadata for super columns somewhere else
+            throw new UnsupportedOperationException();
+
+        Set<ByteBuffer> columnsToFetch = new HashSet<>(statics.size());
+        for (ColumnDefinition column : statics)
+            columnsToFetch.add(column.name.bytes);
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+        builder.newRow(Clustering.STATIC_CLUSTERING);
+
+        boolean foundOne = false;
+        LegacyAtom atom;
+        while ((atom = readLegacyAtomSkippingUnknownColumn(metadata,in)) != null)
+        {
+            if (atom.isCell())
+            {
+                LegacyCell cell = atom.asCell();
+                if (!columnsToFetch.contains(cell.name.encode(metadata)))
+                    continue;
+
+                foundOne = true;
+                cell.name.column.type.validateIfFixedSize(cell.value);
+                builder.addCell(new BufferCell(cell.name.column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, null));
+            }
+            else
+            {
+                LegacyRangeTombstone tombstone = atom.asRangeTombstone();
+                // TODO: we need to track tombstones and potentially ignore cells that are
+                // shadowed (or even better, replace them by tombstones).
+                throw new UnsupportedOperationException();
+            }
+        }
+
+        return foundOne ? builder.build() : Rows.EMPTY_STATIC_ROW;
+    }
+
+    private static LegacyAtom readLegacyAtomSkippingUnknownColumn(CFMetaData metadata, DataInputPlus in)
+    throws IOException
+    {
+        while (true)
+        {
+            try
+            {
+                return readLegacyAtom(metadata, in, false);
+            }
+            catch (UnknownColumnException e)
+            {
+                // Simply skip, as the method name implies.
+            }
+        }
+
+    }
+
+    private static Row getNextRow(CellGrouper grouper, PeekingIterator<? extends LegacyAtom> cells)
+    {
+        if (!cells.hasNext())
+            return null;
+
+        grouper.reset();
+        while (cells.hasNext() && grouper.addAtom(cells.peek()))
+        {
+            // We've added the cell already in the grouper, so just skip it
+            cells.next();
+        }
+        return grouper.getRow();
+    }
+
+    @SuppressWarnings("unchecked")
+    private static Iterator<LegacyAtom> asLegacyAtomIterator(Iterator<? extends LegacyAtom> iter)
+    {
+        return (Iterator<LegacyAtom>)iter;
+    }
+
+    private static Iterator<Row> convertToRows(final CellGrouper grouper, final PeekingIterator<LegacyAtom> atoms)
+    {
+        return new AbstractIterator<Row>()
+        {
+            protected Row computeNext()
+            {
+                if (!atoms.hasNext())
+                    return endOfData();
+
+                return getNextRow(grouper, atoms);
+            }
+        };
+    }
+
+    public static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final RowIterator iterator)
+    {
+        return fromRowIterator(iterator.metadata(), iterator, iterator.staticRow());
+    }
+
+    private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final CFMetaData metadata, final Iterator<Row> iterator, final Row staticRow)
+    {
+        LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10);
+        Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>()
+        {
+            private Iterator<LegacyCell> currentRow = initializeRow();
+
+            private Iterator<LegacyCell> initializeRow()
+            {
+                if (staticRow == null || staticRow.isEmpty())
+                    return Collections.<LegacyLayout.LegacyCell>emptyIterator();
+
+                Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, staticRow);
+                deletions.addAll(row.left);
+                return row.right;
+            }
+
+            protected LegacyCell computeNext()
+            {
+                while (true)
+                {
+                    if (currentRow.hasNext())
+                        return currentRow.next();
+
+                    if (!iterator.hasNext())
+                        return endOfData();
+
+                    Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, iterator.next());
+                    deletions.addAll(row.left);
+                    currentRow = row.right;
+                }
+            }
+        };
+
+        return Pair.create(deletions, cells);
+    }
+
+    private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRow(final CFMetaData metadata, final Row row)
+    {
+        // convert any complex deletions or row deletion into normal range tombstones so that we can build and send a proper RangeTombstoneList
+        // to legacy nodes
+        LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10);
+
+        if (!row.deletion().isLive())
+        {
+            Clustering clustering = row.clustering();
+            Slice.Bound startBound = Slice.Bound.inclusiveStartOf(clustering);
+            Slice.Bound endBound = Slice.Bound.inclusiveEndOf(clustering);
+
+            LegacyBound start = new LegacyLayout.LegacyBound(startBound, false, null);
+            LegacyBound end = new LegacyLayout.LegacyBound(endBound, false, null);
+
+            deletions.add(start, end, row.deletion().time().markedForDeleteAt(), row.deletion().time().localDeletionTime());
+        }
+
+        for (ColumnData cd : row)
+        {
+            ColumnDefinition col = cd.column();
+            if (col.isSimple())
+                continue;
+
+            DeletionTime delTime = ((ComplexColumnData)cd).complexDeletion();
+            if (!delTime.isLive())
+            {
+                Clustering clustering = row.clustering();
+                boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;
+                assert isStatic == col.isStatic();
+
+                Slice.Bound startBound = isStatic
+                        ? LegacyDeletionInfo.staticBound(metadata, true)
+                        : Slice.Bound.inclusiveStartOf(clustering);
+                Slice.Bound endBound = isStatic
+                        ? LegacyDeletionInfo.staticBound(metadata, false)
+                        : Slice.Bound.inclusiveEndOf(clustering);
+
+                LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(startBound, isStatic, col);
+                LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(endBound, isStatic, col);
+
+                deletions.add(start, end, delTime.markedForDeleteAt(), delTime.localDeletionTime());
+            }
+        }
+
+        Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>()
+        {
+            private final Iterator<Cell> cells = row.cellsInLegacyOrder(metadata, false).iterator();
+            // we don't have (and shouldn't have) row markers for compact tables.
+            private boolean hasReturnedRowMarker = metadata.isCompactTable();
+
+            protected LegacyCell computeNext()
+            {
+                if (!hasReturnedRowMarker)
+                {
+                    hasReturnedRowMarker = true;
+
+                    // don't include a row marker if there's no timestamp on the primary key; this is the 3.0+ equivalent
+                    // of a row marker
+                    if (!row.primaryKeyLivenessInfo().isEmpty())
+                    {
+                        LegacyCellName cellName = new LegacyCellName(row.clustering(), null, null);
+                        LivenessInfo info = row.primaryKeyLivenessInfo();
+                        return new LegacyCell(info.isExpiring() ? LegacyCell.Kind.EXPIRING : LegacyCell.Kind.REGULAR, cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER, info.timestamp(), info.localExpirationTime(), info.ttl());
+                    }
+                }
+
+                if (!cells.hasNext())
+                    return endOfData();
+
+                return makeLegacyCell(row.clustering(), cells.next());
+            }
+        };
+        return Pair.create(deletions, cells);
+    }
+
+    private static LegacyCell makeLegacyCell(Clustering clustering, Cell cell)
+    {
+        LegacyCell.Kind kind;
+        if (cell.isCounterCell())
+            kind = LegacyCell.Kind.COUNTER;
+        else if (cell.isTombstone())
+            kind = LegacyCell.Kind.DELETED;
+        else if (cell.isExpiring())
+            kind = LegacyCell.Kind.EXPIRING;
+        else
+            kind = LegacyCell.Kind.REGULAR;
+
+        CellPath path = cell.path();
+        assert path == null || path.size() == 1;
+        LegacyCellName name = new LegacyCellName(clustering, cell.column(), path == null ? null : path.get(0));
+        return new LegacyCell(kind, name, cell.value(), cell.timestamp(), cell.localDeletionTime(), cell.ttl());
+    }
+
+    public static RowIterator toRowIterator(final CFMetaData metadata,
+                                            final DecoratedKey key,
+                                            final Iterator<LegacyCell> cells,
+                                            final int nowInSec)
+    {
+        SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL);
+        return UnfilteredRowIterators.filter(toUnfilteredRowIterator(metadata, key, LegacyDeletionInfo.live(), cells, false, helper), nowInSec);
+    }
+
+    public static Comparator<LegacyCell> legacyCellComparator(CFMetaData metadata)
+    {
+        return legacyCellComparator(metadata, false);
+    }
+
+    public static Comparator<LegacyCell> legacyCellComparator(final CFMetaData metadata, final boolean reversed)
+    {
+        final Comparator<LegacyCellName> cellNameComparator = legacyCellNameComparator(metadata, reversed);
+        return new Comparator<LegacyCell>()
+        {
+            public int compare(LegacyCell cell1, LegacyCell cell2)
+            {
+                LegacyCellName c1 = cell1.name;
+                LegacyCellName c2 = cell2.name;
+
+                int c = cellNameComparator.compare(c1, c2);
+                if (c != 0)
+                    return c;
+
+                // The actual sorting when the cellname is equal doesn't matter, we just want to make
+                // sure the cells are not considered equal.
+                if (cell1.timestamp != cell2.timestamp)
+                    return cell1.timestamp < cell2.timestamp ? -1 : 1;
+
+                if (cell1.localDeletionTime != cell2.localDeletionTime)
+                    return cell1.localDeletionTime < cell2.localDeletionTime ? -1 : 1;
+
+                return cell1.value.compareTo(cell2.value);
+            }
+        };
+    }
+
+    // Note that this doesn't exactly compare cells as they were pre-3.0 because within a row they sort columns like
+    // in 3.0, that is, with simple columns before complex columns. In other words, this comparator makes sure cells
+    // are in the proper order to convert them to actual 3.0 rows.
+    public static Comparator<LegacyCellName> legacyCellNameComparator(final CFMetaData metadata, final boolean reversed)
+    {
+        return new Comparator<LegacyCellName>()
+        {
+            public int compare(LegacyCellName c1, LegacyCellName c2)
+            {
+                // Compare clustering first
+                if (c1.clustering == Clustering.STATIC_CLUSTERING)
+                {
+                    if (c2.clustering != Clustering.STATIC_CLUSTERING)
+                        return -1;
+                }
+                else if (c2.clustering == Clustering.STATIC_CLUSTERING)
+                {
+                    return 1;
+                }
+                else
+                {
+                    int c = metadata.comparator.compare(c1.clustering, c2.clustering);
+                    if (c != 0)
+                        return reversed ? -c : c;
+                }
+
+                // Note that when reversed, we only care about the clustering being reversed, so it's ok
+                // not to take reversed into account below.
+
+                // Then check the column name
+                if (c1.column != c2.column)
+                {
+                    // A null for the column means it's a row marker
+                    if (c1.column == null)
+                        return -1;
+                    if (c2.column == null)
+                        return 1;
+
+                    assert c1.column.isRegular() || c1.column.isStatic();
+                    assert c2.column.isRegular() || c2.column.isStatic();
+                    int cmp = c1.column.compareTo(c2.column);
+                    if (cmp != 0)
+                        return cmp;
+                }
+
+                assert (c1.collectionElement == null) == (c2.collectionElement == null);
+
+                if (c1.collectionElement != null)
+                {
+                    AbstractType<?> colCmp = ((CollectionType)c1.column.type).nameComparator();
+                    return colCmp.compare(c1.collectionElement, c2.collectionElement);
+                }
+                return 0;
+            }
+        };
+    }
+
+    private static boolean equalValues(ClusteringPrefix c1, ClusteringPrefix c2, ClusteringComparator comparator)
+    {
+        assert c1.size() == c2.size();
+        for (int i = 0; i < c1.size(); i++)
+        {
+            if (comparator.compareComponent(i, c1.get(i), c2.get(i)) != 0)
+                return false;
+        }
+        return true;
+    }
+
+    static Comparator<LegacyAtom> legacyAtomComparator(CFMetaData metadata)
+    {
+        return (o1, o2) ->
+        {
+            // First we want to compare by clustering, but we have to be careful with range tombstone, because
+            // we can have collection deletion and we want those to sort properly just before the column they
+            // delete, not before the whole row.
+            // We also want to special case static so they sort before any non-static. Note in particular that
+            // this special casing is important in the case of one of the Atom being Slice.Bound.BOTTOM: we want
+            // it to sort after the static as we deal with static first in toUnfilteredAtomIterator and having
+            // Slice.Bound.BOTTOM first would mess that up (note that static deletion is handled through a specific
+            // static tombstone, see LegacyDeletionInfo.add()).
+            if (o1.isStatic() != o2.isStatic())
+                return o1.isStatic() ? -1 : 1;
+
+            ClusteringPrefix c1 = o1.clustering();
+            ClusteringPrefix c2 = o2.clustering();
+
+            int clusteringComparison;
+            if (c1.size() != c2.size() || (o1.isCell() == o2.isCell()) || !equalValues(c1, c2, metadata.comparator))
+            {
+                clusteringComparison = metadata.comparator.compare(c1, c2);
+            }
+            else
+            {
+                // one is a cell and one is a range tombstone, and both have the same prefix size (that is, the
+                // range tombstone is either a row deletion or a collection deletion).
+                LegacyRangeTombstone rt = o1.isCell() ? o2.asRangeTombstone() : o1.asRangeTombstone();
+                clusteringComparison = rt.isCollectionTombstone()
+                                       ? 0
+                                       : metadata.comparator.compare(c1, c2);
+            }
+
+            // Note that if both are range tombstones and have the same clustering, then they are equal.
+            if (clusteringComparison != 0)
+                return clusteringComparison;
+
+            if (o1.isCell())
+            {
+                LegacyCell cell1 = o1.asCell();
+                if (o2.isCell())
+                {
+                    LegacyCell cell2 = o2.asCell();
+                    // Check for row marker cells
+                    if (cell1.name.column == null)
+                        return cell2.name.column == null ? 0 : -1;
+                    return cell2.name.column == null ? 1 : cell1.name.column.compareTo(cell2.name.column);
+                }
+
+                LegacyRangeTombstone rt2 = o2.asRangeTombstone();
+                assert rt2.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality
+                if (cell1.name.column == null)
+                    return -1;
+                int cmp = cell1.name.column.compareTo(rt2.start.collectionName);
+                // If both are for the same column, then the RT should come first
+                return cmp == 0 ? 1 : cmp;
+            }
+            else
+            {
+                assert o2.isCell();
+                LegacyCell cell2 = o2.asCell();
+
+                LegacyRangeTombstone rt1 = o1.asRangeTombstone();
+                assert rt1.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality
+
+                if (cell2.name.column == null)
+                    return 1;
+
+                int cmp = rt1.start.collectionName.compareTo(cell2.name.column);
+                // If both are for the same column, then the RT should come first
+                return cmp == 0 ? -1 : cmp;
+            }
+        };
+    }
+
+    public static LegacyAtom readLegacyAtom(CFMetaData metadata, DataInputPlus in, boolean readAllAsDynamic)
+    throws IOException, UnknownColumnException
+    {
+        ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in);
+        if (!cellname.hasRemaining())
+            return null; // END_OF_ROW
+
+        try
+        {
+            int b = in.readUnsignedByte();
+            return (b & RANGE_TOMBSTONE_MASK) != 0
+                   ? readLegacyRangeTombstoneBody(metadata, in, cellname)
+                   : readLegacyCellBody(metadata, in, cellname, b, SerializationHelper.Flag.LOCAL, readAllAsDynamic);
+        }
+        catch (UnknownColumnException e)
+        {
+            // We legitimately can get here in 2 cases:
+            // 1) for system tables, because we've unceremoniously removed columns (without registering them as dropped)
+            // 2) for dropped columns.
+            // In any other case, there is a mismatch between the schema and the data, and we complain loudly in
+            // that case. Note that if we are in a legit case of an unknown column, we want to simply skip that cell,
+            // but we don't do this here and re-throw the exception because the calling code sometimes has to know
+            // about this happening. This does mean code calling this method should handle this case properly.
+            if (!metadata.ksName.equals(SystemKeyspace.NAME) && metadata.getDroppedColumnDefinition(e.columnName) == null)
+                throw new IllegalStateException(String.format("Got cell for unknown column %s in sstable of %s.%s: " +
+                                                              "This suggest a problem with the schema which doesn't list " +
+                                                              "this column. Even if that column was dropped, it should have " +
+                                                              "been listed as such", metadata.ksName, metadata.cfName, UTF8Type.instance.compose(e.columnName)), e);
+
+            throw e;
+        }
+    }
+
+    public static LegacyCell readLegacyCell(CFMetaData metadata, DataInput in, SerializationHelper.Flag flag) throws IOException, UnknownColumnException
+    {
+        ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in);
+        int b = in.readUnsignedByte();
+        return readLegacyCellBody(metadata, in, cellname, b, flag, false);
+    }
+
+    public static LegacyCell readLegacyCellBody(CFMetaData metadata, DataInput in, ByteBuffer cellname, int mask, SerializationHelper.Flag flag, boolean readAllAsDynamic)
+    throws IOException, UnknownColumnException
+    {
+        // Note that we want to call decodeCellName only after we've deserialized other parts, since it can throw
+        // and we want to throw only after having deserialized the full cell.
+        if ((mask & COUNTER_MASK) != 0)
+        {
+            in.readLong(); // timestampOfLastDelete: this has been unused for a long time so we ignore it
+            long ts = in.readLong();
+            ByteBuffer value = ByteBufferUtil.readWithLength(in);
+            if (flag == SerializationHelper.Flag.FROM_REMOTE || (flag == SerializationHelper.Flag.LOCAL && CounterContext.instance().shouldClearLocal(value)))
+                value = CounterContext.instance().clearAllLocal(value);
+            return new LegacyCell(LegacyCell.Kind.COUNTER, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL);
+        }
+        else if ((mask & EXPIRATION_MASK) != 0)
+        {
+            int ttl = in.readInt();
+            int expiration = in.readInt();
+            long ts = in.readLong();
+            ByteBuffer value = ByteBufferUtil.readWithLength(in);
+            return new LegacyCell(LegacyCell.Kind.EXPIRING, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, expiration, ttl);
+        }
+        else
+        {
+            long ts = in.readLong();
+            ByteBuffer value = ByteBufferUtil.readWithLength(in);
+            LegacyCellName name = decodeCellName(metadata, cellname, readAllAsDynamic);
+            return (mask & COUNTER_UPDATE_MASK) != 0
+                ? new LegacyCell(LegacyCell.Kind.COUNTER, name, CounterContext.instance().createUpdate(ByteBufferUtil.toLong(value)), ts, Cell.NO_DELETION_TIME, Cell.NO_TTL)
+                : ((mask & DELETION_MASK) == 0
+                        ? new LegacyCell(LegacyCell.Kind.REGULAR, name, value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL)
+                        : new LegacyCell(LegacyCell.Kind.DELETED, name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ts, ByteBufferUtil.toInt(value), Cell.NO_TTL));
+        }
+    }
+
+    public static LegacyRangeTombstone readLegacyRangeTombstoneBody(CFMetaData metadata, DataInputPlus in, ByteBuffer boundname) throws IOException
+    {
+        LegacyBound min = decodeTombstoneBound(metadata, boundname, true);
+        LegacyBound max = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false);
+        DeletionTime dt = DeletionTime.serializer.deserialize(in);
+        return new LegacyRangeTombstone(min, max, dt);
+    }
+
+    public static Iterator<LegacyCell> deserializeCells(final CFMetaData metadata,
+                                                        final DataInput in,
+                                                        final SerializationHelper.Flag flag,
+                                                        final int size)
+    {
+        return new AbstractIterator<LegacyCell>()
+        {
+            private int i = 0;
+
+            protected LegacyCell computeNext()
+            {
+                if (i >= size)
+                    return endOfData();
+
+                ++i;
+                try
+                {
+                    return readLegacyCell(metadata, in, flag);
+                }
+                catch (UnknownColumnException e)
+                {
+                    // We can get there if we read a cell for a dropped column, and if that is the case,
+                    // then simply ignore the cell is fine. But also not that we ignore if it's the
+                    // system keyspace because for those table we actually remove columns without registering
+                    // them in the dropped columns
+                    if (metadata.ksName.equals(SystemKeyspace.NAME) || metadata.getDroppedColumnDefinition(e.columnName) != null)
+                        return computeNext();
+                    else
+                        throw new IOError(e);
+                }
+                catch (IOException e)
+                {
+                    throw new IOError(e);
+                }
+            }
+        };
+    }
+
+    public static class CellGrouper
+    {
+        /**
+         * The fake TTL used for expired rows that have been compacted.
+         */
+        private static final int FAKE_TTL = 1;
+
+        public final CFMetaData metadata;
+        private final boolean isStatic;
+        private final SerializationHelper helper;
+        private final Row.Builder builder;
+        private Clustering clustering;
+
+        private LegacyRangeTombstone rowDeletion;
+        private LegacyRangeTombstone collectionDeletion;
+
+        /**
+         * Used to track if we need to add pk liveness info (row marker) when removing invalid legacy cells.
+         *
+         * In 2.1 these invalid cells existed but were not queryable, in this case specifically because they
+         * represented values for clustering key columns that were written as data cells.
+         *
+         * However, the presence (or not) of such cells on an otherwise empty CQL row (or partition) would decide
+         * if an empty result row were returned for the CQL row (or partition).  To maintain this behaviour we
+         * insert a row marker containing the liveness info of these invalid cells iff we have no other data
+         * on the row.
+         *
+         * See also CASSANDRA-15365
+         */
+        private boolean hasValidCells = false;
+        private LivenessInfo invalidLivenessInfo = null;
+
+        public CellGrouper(CFMetaData metadata, SerializationHelper helper)
+        {
+            this(metadata, helper, false);
+        }
+
+        private CellGrouper(CFMetaData metadata, SerializationHelper helper, boolean isStatic)
+        {
+            this.metadata = metadata;
+            this.isStatic = isStatic;
+            this.helper = helper;
+            // We cannot use a sorted builder because we don't have exactly the same ordering in 3.0 and pre-3.0. More precisely, within a row, we
+            // store all simple columns before the complex ones in 3.0, which we use to sort everything sorted by the column name before. Note however
+            // that the unsorted builder won't have to reconcile cells, so the exact value we pass for nowInSec doesn't matter.
+            this.builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+        }
+
+        public static CellGrouper staticGrouper(CFMetaData metadata, SerializationHelper helper)
+        {
+            return new CellGrouper(metadata, helper, true);
+        }
+
+        public void reset()
+        {
+            this.clustering = null;
+            this.rowDeletion = null;
+            this.collectionDeletion = null;
+            this.invalidLivenessInfo = null;
+            this.hasValidCells = false;
+        }
+
+        /**
+         * Try adding the provided atom to the currently grouped row.
+         *
+         * @param atom the new atom to try to add. This <b>must</b> be a "row" atom, that is either a cell or a legacy
+         *             range tombstone that covers only one row (row deletion) or a subset of it (collection
+         *             deletion). Meaning that legacy range tombstone covering multiple rows (that should be handled as
+         *             legit range tombstone in the new storage engine) should be handled separately. Atoms should also
+         *             be provided in proper clustering order.
+         * @return {@code true} if the provided atom has been "consumed" by this grouper (this does _not_ mean the
+         *          atom has been "used" by the grouper as the grouper will skip some shadowed atoms for instance, just
+         *          that {@link #getRow()} shouldn't be called just yet if there is more atom in the atom iterator we're
+         *          grouping). {@code false} otherwise, that is if the row currently built by this grouper is done
+         *          _without_ the provided atom being "consumed" (and so {@link #getRow()} should be called and the
+         *          grouper resetted, after which the provided atom should be provided again).
+         */
+        public boolean addAtom(LegacyAtom atom)
+        {
+            assert atom.isRowAtom(metadata) : "Unexpected non in-row legacy range tombstone " + atom;
+            return atom.isCell()
+                 ? addCell(atom.asCell())
+                 : addRangeTombstone(atom.asRangeTombstone());
+        }
+
+        private boolean addCell(LegacyCell cell)
+        {
+            if (clustering == null)
+            {
+                clustering = cell.name.clustering;
+                assert !isStatic || clustering == Clustering.STATIC_CLUSTERING;
+                builder.newRow(clustering);
+            }
+            else if (!clustering.equals(cell.name.clustering))
+            {
+                return false;
+            }
+
+            // Ignore shadowed cells
+            if (rowDeletion != null && rowDeletion.deletionTime.deletes(cell.timestamp))
+                return true;
+
+            ColumnDefinition column = cell.name.column;
+            if (column == null)
+            {
+                // It's the row marker
+                assert !cell.value.hasRemaining();
+                // In 2.1, the row marker expired cell might have been converted into a deleted one by compaction.
+                // If we do not set the primary key liveness info for this row and it does not contains any regular columns
+                // the row will be empty. To avoid that, we reuse the localDeletionTime but use a fake TTL.
+                // The only time in 2.x that we actually delete a row marker is in 2i tables, so in that case we do
+                // want to actually propagate the row deletion. (CASSANDRA-13320)
+                if (!cell.isTombstone())
+                    builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, cell.ttl, cell.localDeletionTime));
+                else if (metadata.isIndex())
+                    builder.addRowDeletion(Row.Deletion.regular(new DeletionTime(cell.timestamp, cell.localDeletionTime)));
+                else
+                    builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, FAKE_TTL, cell.localDeletionTime));
+                hasValidCells = true;
+            }
+            else if (column.isPrimaryKeyColumn() && metadata.isCQLTable())
+            {
+                // SSTables generated offline and side-loaded may include invalid cells which have the column name
+                // of a primary key column. So that we don't fail when encountering these cells, we treat them the
+                // same way as 2.1 did, namely we include their clusterings in the new CQL row, but drop the invalid
+                // column part of the cell
+                noSpamLogger.warn("Illegal cell name for CQL3 table {}.{}. {} is defined as a primary key column",
+                                  metadata.ksName, metadata.cfName, column.name);
+
+                if (invalidLivenessInfo != null)
+                {
+                    // when we have several invalid cells we follow the logic in LivenessInfo#supersedes when picking the PKLI to keep:
+                    LivenessInfo newInvalidLiveness = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime);
+                    if (newInvalidLiveness.supersedes(invalidLivenessInfo))
+                        invalidLivenessInfo = newInvalidLiveness;
+                }
+                else
+                {
+                    invalidLivenessInfo = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime);
+                }
+                return true;
+            }
+            else
+            {
+                if (collectionDeletion != null && collectionDeletion.start.collectionName.name.equals(column.name) && collectionDeletion.deletionTime.deletes(cell.timestamp))
+                    return true;
+
+                if (helper.includes(column))
+                {
+                    hasValidCells = true;
+                    CellPath path = null;
+                    if (column.isComplex())
+                    {
+                        // Recalling startOfComplexColumn for every cell is a big inefficient, but it's ok in practice
+                        // and it's simpler. And since 1) this only matter for super column selection in thrift in
+                        // practice and 2) is only used during upgrade, it's probably worth keeping things simple.
+                        helper.startOfComplexColumn(column);
+                        path = cell.name.collectionElement == null ? null : CellPath.create(cell.name.collectionElement);
+                        if (!helper.includes(path))
+                            return true;
+                    }
+                    column.type.validateIfFixedSize(cell.value);
+                    Cell c = new BufferCell(column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, path);
+                    if (!helper.isDropped(c, column.isComplex()))
+                        builder.addCell(c);
+                    if (column.isComplex())
+                    {
+                        helper.endOfComplexColumn();
+                    }
+                }
+            }
+            return true;
+        }
+
+        private boolean addRangeTombstone(LegacyRangeTombstone tombstone)
+        {
+            if (tombstone.isRowDeletion(metadata))
+            {
+                return addRowTombstone(tombstone);
+            }
+            else
+            {
+                // The isRowAtom() assertion back in addAtom would have already triggered otherwise, but spelling it
+                // out nonetheless.
+                assert tombstone.isCollectionTombstone();
+                return addCollectionTombstone(tombstone);
+            }
+        }
+
+        private boolean addRowTombstone(LegacyRangeTombstone tombstone)
+        {
+            if (clustering != null)
+            {
+                // If we're already in the row, there might be a chance that there were two range tombstones
+                // written, as 2.x storage format does not guarantee just one range tombstone, unlike 3.x.
+                // We have to make sure that clustering matches, which would mean that tombstone is for the
+                // same row.
+                if (clustering.equals(tombstone.start.getAsClustering(metadata)))
+                {
+                    // If the tombstone superceeds the previous delete, we discard the previous one.
+                    // This assumes that we are building the row from a sane source (ie, this row deletion
+                    // does not delete anything already added to the builder). See CASSANDRA-15789 for details
+                    if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime))
+                    {
+                        builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime));
+                        rowDeletion = tombstone;
+                        hasValidCells = true;
+                    }
+                    return true;
+                }
+
+                // different clustering -> new row
+                return false;
+            }
+
+            clustering = tombstone.start.getAsClustering(metadata);
+            builder.newRow(clustering);
+            builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime));
+            rowDeletion = tombstone;
+            hasValidCells = true;
+
+            return true;
+        }
+
+        private boolean addCollectionTombstone(LegacyRangeTombstone tombstone)
+        {
+            // If the collection tombstone is not included in the query (which technically would only apply to thrift
+            // queries since CQL one "fetch" everything), we can skip it (so return), but we're problably still within
+            // the current row so we return `true`. Technically, it is possible that tombstone belongs to another row
+            // that the row currently grouped, but as we ignore it, returning `true` is ok in that case too.
+            if (!helper.includes(tombstone.start.collectionName))
+                return true; // see CASSANDRA-13109
+
+            // The helper needs to be informed about the current complex column identifier before
+            // it can perform the comparison between the recorded drop time and the RT deletion time.
+            // If the RT has been superceded by a drop, we still return true as we don't want the
+            // grouper to terminate yet.
+            helper.startOfComplexColumn(tombstone.start.collectionName);
+            if (helper.isDroppedComplexDeletion(tombstone.deletionTime))
+                return true;
+
+            if (clustering == null)
+            {
+                clustering = tombstone.start.getAsClustering(metadata);
+                builder.newRow(clustering);
+            }
+            else if (!clustering.equals(tombstone.start.getAsClustering(metadata)))
+            {
+                return false;
+            }
+
+            builder.addComplexDeletion(tombstone.start.collectionName, tombstone.deletionTime);
+            if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime))
+                collectionDeletion = tombstone;
+            hasValidCells = true;
+
+            return true;
+        }
+
+        /**
+         * Whether the provided range tombstone starts strictly after the current row of the cell grouper (if no row is
+         * currently started, this return false).
+         */
+        public boolean startsAfterCurrentRow(LegacyRangeTombstone rangeTombstone)
+        {
+            return clustering != null && metadata.comparator.compare(rangeTombstone.start.bound, clustering) > 0;
+        }
+
+        /**
+         * The clustering of the current row of the cell grouper, or {@code null} if no row is currently started.
+         */
+        public Clustering currentRowClustering()
+        {
+            return clustering;
+        }
+
+        /**
+         * Generates the row currently grouped by this grouper and reset it for the following row.
+         * <p>
+         * Note that the only correct way to call this is when either all the atom we're trying to group has been
+         * consumed, or when {@link #addAtom(LegacyAtom)} returns {@code false}.
+         *
+         * @return the current row that has been grouped, or {@code null} in the rare case where all the atoms
+         * "consumed" by {@link #addAtom(LegacyAtom)} for this row were skipped (we skip atoms under a few conditions).
+         */
+        public Row getRow()
+        {
+            if (!hasValidCells && invalidLivenessInfo != null)
+                builder.addPrimaryKeyLivenessInfo(invalidLivenessInfo);
+            return builder.build();
+        }
+    }
+
+    public static class LegacyUnfilteredPartition
+    {
+        public final DeletionTime partitionDeletion;
+        public final LegacyRangeTombstoneList rangeTombstones;
+        public final List<LegacyCell> cells;
+
+        private LegacyUnfilteredPartition(DeletionTime partitionDeletion, LegacyRangeTombstoneList rangeTombstones, List<LegacyCell> cells)
+        {
+            this.partitionDeletion = partitionDeletion;
+            this.rangeTombstones = rangeTombstones;
+            this.cells = cells;
+        }
+
+        public void digest(CFMetaData metadata, MessageDigest digest)
+        {
+            for (LegacyCell cell : cells)
+            {
+                digest.update(cell.name.encode(metadata).duplicate());
+
+                if (cell.isCounter())
+                    CounterContext.instance().updateDigest(digest, cell.value);
+                else
+                    digest.update(cell.value.duplicate());
+
+                FBUtilities.updateWithLong(digest, cell.timestamp);
+                FBUtilities.updateWithByte(digest, cell.serializationFlags());
+
+                if (cell.isExpiring())
+                    FBUtilities.updateWithInt(digest, cell.ttl);
+
+                if (cell.isCounter())
+                {
+                    // Counters used to have the timestampOfLastDelete field, which we stopped using long ago and has been hard-coded
+                    // to Long.MIN_VALUE but was still taken into account in 2.2 counter digests (to maintain backward compatibility
+                    // in the first place).
+                    FBUtilities.updateWithLong(digest, Long.MIN_VALUE);
+                }
+            }
+
+            if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE)
+                digest.update(ByteBufferUtil.bytes(partitionDeletion.markedForDeleteAt()));
+
+            if (!rangeTombstones.isEmpty())
+                rangeTombstones.updateDigest(digest);
+        }
+    }
+
+    public static class LegacyCellName
+    {
+        public final Clustering clustering;
+        public final ColumnDefinition column;
+        public final ByteBuffer collectionElement;
+
+        @VisibleForTesting
+        public LegacyCellName(Clustering clustering, ColumnDefinition column, ByteBuffer collectionElement)
+        {
+            this.clustering = clustering;
+            this.column = column;
+            this.collectionElement = collectionElement;
+        }
+
+        public static LegacyCellName create(Clustering clustering, ColumnDefinition column)
+        {
+            return new LegacyCellName(clustering, column, null);
+        }
+
+        public ByteBuffer encode(CFMetaData metadata)
+        {
+            return encodeCellName(metadata, clustering, column == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : column.name.bytes, collectionElement);
+        }
+
+        public ByteBuffer superColumnSubName()
+        {
+            assert collectionElement != null;
+            return collectionElement;
+        }
+
+        public ByteBuffer superColumnName()
+        {
+            return clustering.get(0);
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            for (int i = 0; i < clustering.size(); i++)
+                sb.append(i > 0 ? ":" : "").append(clustering.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(clustering.get(i)));
+            return String.format("Cellname(clustering=%s, column=%s, collElt=%s)", sb.toString(), column == null ? "null" : column.name, collectionElement == null ? "null" : ByteBufferUtil.bytesToHex(collectionElement));
+        }
+    }
+
+    public static class LegacyBound
+    {
+        public static final LegacyBound BOTTOM = new LegacyBound(Slice.Bound.BOTTOM, false, null);
+        public static final LegacyBound TOP = new LegacyBound(Slice.Bound.TOP, false, null);
+
+        public final Slice.Bound bound;
+        public final boolean isStatic;
+        public final ColumnDefinition collectionName;
+
+        public LegacyBound(Slice.Bound bound, boolean isStatic, ColumnDefinition collectionName)
+        {
+            this.bound = bound;
+            this.isStatic = isStatic;
+            this.collectionName = collectionName;
+        }
+
+        public Clustering getAsClustering(CFMetaData metadata)
+        {
+            if (isStatic)
+                return Clustering.STATIC_CLUSTERING;
+
+            assert bound.size() == metadata.comparator.size();
+            ByteBuffer[] values = new ByteBuffer[bound.size()];
+            for (int i = 0; i < bound.size(); i++)
+                values[i] = bound.get(i);
+            return new Clustering(values);
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append(bound.kind()).append('(');
+            for (int i = 0; i < bound.size(); i++)
+                sb.append(i > 0 ? ":" : "").append(bound.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(bound.get(i)));
+            sb.append(')');
+            return String.format("Bound(%s, collection=%s)", sb.toString(), collectionName == null ? "null" : collectionName.name);
+        }
+    }
+
+    public interface LegacyAtom
+    {
+        public boolean isCell();
+
+        // note that for static atoms, LegacyCell and LegacyRangeTombstone behave differently here:
+        //  - LegacyCell returns the modern Clustering.STATIC_CLUSTERING
+        //  - LegacyRangeTombstone returns the 2.2 bound (i.e. N empty ByteBuffer, where N is number of clusterings)
+        // in LegacyDeletionInfo.add(), we split any LRT with a static bound out into the inRowRangeTombstones collection
+        // these are merged with regular row cells, in the CellGrouper, and their clustering is obtained via start.bound.getAsClustering
+        // (also, it should be impossibly to issue raw static row deletions anyway)
+        public ClusteringPrefix clustering();
+        public boolean isStatic();
+
+        public LegacyCell asCell();
+        public LegacyRangeTombstone asRangeTombstone();
+
+        /**
+         * Whether the atom is one that becomes part of a {@link Row} in the new storage engine, meaning it is either
+         * as cell or a legacy range tombstone that covers a single row, or parts of one.
+         */
+        public boolean isRowAtom(CFMetaData metadata);
+    }
+
+    /**
+     * A legacy cell.
+     * <p>
+     * This is used as a temporary object to facilitate dealing with the legacy format, this
+     * is not meant to be optimal.
+     */
+    public static class LegacyCell implements LegacyAtom
+    {
+        private final static int DELETION_MASK        = 0x01;
+        private final static int EXPIRATION_MASK      = 0x02;
+        private final static int COUNTER_MASK         = 0x04;
+        private final static int COUNTER_UPDATE_MASK  = 0x08;
+        private final static int RANGE_TOMBSTONE_MASK = 0x10;
+
+        public enum Kind { REGULAR, EXPIRING, DELETED, COUNTER }
+
+        public final Kind kind;
+
+        public final LegacyCellName name;
+        public final ByteBuffer value;
+
+        public final long timestamp;
+        public final int localDeletionTime;
+        public final int ttl;
+
+        @VisibleForTesting
+        public LegacyCell(Kind kind, LegacyCellName name, ByteBuffer value, long timestamp, int localDeletionTime, int ttl)
+        {
+            this.kind = kind;
+            this.name = name;
+            this.value = value;
+            this.timestamp = timestamp;
+            this.localDeletionTime = localDeletionTime;
+            this.ttl = ttl;
+        }
+
+        public static LegacyCell regular(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp)
+        throws UnknownColumnException
+        {
+            return new LegacyCell(Kind.REGULAR, decodeCellName(metadata, superColumnName, name), value, timestamp, Cell.NO_DELETION_TIME, Cell.NO_TTL);
+        }
+
+        public static LegacyCell expiring(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp, int ttl, int nowInSec)
+        throws UnknownColumnException
+        {
+            /*
+             * CASSANDRA-14092: Max expiration date capping is maybe performed here, expiration overflow policy application
+             * is done at {@link org.apache.cassandra.thrift.ThriftValidation#validateTtl(CFMetaData, Column)}
+             */
+            return new LegacyCell(Kind.EXPIRING, decodeCellName(metadata, superColumnName, name), value, timestamp, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl), ttl);
+        }
+
+        public static LegacyCell tombstone(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long timestamp, int nowInSec)
+        throws UnknownColumnException
+        {
+            return new LegacyCell(Kind.DELETED, decodeCellName(metadata, superColumnName, name), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, nowInSec, LivenessInfo.NO_TTL);
+        }
+
+        public static LegacyCell counterUpdate(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long value)
+        throws UnknownColumnException
+        {
+            // See UpdateParameters.addCounter() for more details on this
+            ByteBuffer counterValue = CounterContext.instance().createUpdate(value);
+            return counter(decodeCellName(metadata, superColumnName, name), counterValue);
+        }
+
+        public static LegacyCell counter(LegacyCellName name, ByteBuffer value)
+        {
+            return new LegacyCell(Kind.COUNTER, name, value, FBUtilities.timestampMicros(), Cell.NO_DELETION_TIME, Cell.NO_TTL);
+        }
+
+        public byte serializationFlags()
+        {
+            if (isExpiring())
+                return EXPIRATION_MASK;
+            if (isTombstone())
+                return DELETION_MASK;
+            if (isCounterUpdate())
+                return COUNTER_UPDATE_MASK;
+            if (isCounter())
+                return COUNTER_MASK;
+            return 0;
+        }
+
+        public boolean isCounterUpdate()
+        {
+            // See UpdateParameters.addCounter() for more details on this
+            return isCounter() && CounterContext.instance().isUpdate(value);
+        }
+
+        public ClusteringPrefix clustering()
+        {
+            return name.clustering;
+        }
+
+        public boolean isStatic()
+        {
+            return name.clustering == Clustering.STATIC_CLUSTERING;
+        }
+
+        public boolean isCell()
+        {
+            return true;
+        }
+
+        public LegacyCell asCell()
+        {
+            return this;
+        }
+
+        public LegacyRangeTombstone asRangeTombstone()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public boolean isRowAtom(CFMetaData metaData)
+        {
+            return true;
+        }
+
+        public boolean isCounter()
+        {
+            return kind == Kind.COUNTER;
+        }
+
+        public boolean isExpiring()
+        {
+            return kind == Kind.EXPIRING;
+        }
+
+        public boolean isTombstone()
+        {
+            return kind == Kind.DELETED;
+        }
+
+        public boolean isLive(int nowInSec)
+        {
+            if (isTombstone())
+                return false;
+
+            return !isExpiring() || nowInSec < localDeletionTime;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("LegacyCell(%s, name=%s, v=%s, ts=%s, ldt=%s, ttl=%s)", kind, name, ByteBufferUtil.bytesToHex(value), timestamp, localDeletionTime, ttl);
+        }
+    }
+
+    /**
+     * A legacy range tombstone.
+     * <p>
+     * This is used as a temporary object to facilitate dealing with the legacy format, this
+     * is not meant to be optimal.
+     */
+    public static class LegacyRangeTombstone implements LegacyAtom
+    {
+        public final LegacyBound start;
+        public final LegacyBound stop;
+        public final DeletionTime deletionTime;
+
+        public LegacyRangeTombstone(LegacyBound start, LegacyBound stop, DeletionTime deletionTime)
+        {
+            // Because of the way RangeTombstoneList work, we can have a tombstone where only one of
+            // the bound has a collectionName. That happens if we have a big tombstone A (spanning one
+            // or multiple rows) and a collection tombstone B. In that case, RangeTombstoneList will
+            // split this into 3 RTs: the first one from the beginning of A to the beginning of B,
+            // then B, then a third one from the end of B to the end of A. To make this simpler, if
+            // we detect that case we transform the 1st and 3rd tombstone so they don't end in the middle
+            // of a row (which is still correct).
+            if ((start.collectionName == null) != (stop.collectionName == null))
+            {
+                if (start.collectionName == null)
+                    stop = new LegacyBound(Slice.Bound.inclusiveEndOf(stop.bound.values), stop.isStatic, null);
+                else
+                    start = new LegacyBound(Slice.Bound.inclusiveStartOf(start.bound.values), start.isStatic, null);
+            }
+            else if (!Objects.equals(start.collectionName, stop.collectionName))
+            {
+                // We're in the similar but slightly more complex case where on top of the big tombstone
+                // A, we have 2 (or more) collection tombstones B and C within A. So we also end up with
+                // a tombstone that goes between the end of B and the start of C.
+                start = new LegacyBound(start.bound, start.isStatic, null);
+                stop = new LegacyBound(stop.bound, stop.isStatic, null);
+            }
+
+            this.start = start;
+            this.stop = stop;
+            this.deletionTime = deletionTime;
+        }
+
+        /** @see LegacyAtom#clustering for static inconsistencies explained */
+        public ClusteringPrefix clustering()
+        {
+            return start.bound;
+        }
+
+        public LegacyRangeTombstone withNewStart(LegacyBound newStart)
+        {
+            return new LegacyRangeTombstone(newStart, stop, deletionTime);
+        }
+
+        public LegacyRangeTombstone withNewStart(Slice.Bound newStart)
+        {
+            return withNewStart(new LegacyBound(newStart, start.isStatic, null));
+        }
+
+        public LegacyRangeTombstone withNewEnd(LegacyBound newStop)
+        {
+            return new LegacyRangeTombstone(start, newStop, deletionTime);
+        }
+
+        public LegacyRangeTombstone withNewEnd(Slice.Bound newEnd)
+        {
+            return withNewEnd(new LegacyBound(newEnd, stop.isStatic, null));
+        }
+
+        public boolean isCell()
+        {
+            return false;
+        }
+
+        public boolean isStatic()
+        {
+            return start.isStatic || stop.isStatic;
+        }
+
+        public LegacyCell asCell()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public LegacyRangeTombstone asRangeTombstone()
+        {
+            return this;
+        }
+
+        @Override
+        public boolean isRowAtom(CFMetaData metadata)
+        {
+            return isCollectionTombstone() || isRowDeletion(metadata);
+        }
+
+        public boolean isCollectionTombstone()
+        {
+            return start.collectionName != null;
+        }
+
+        public boolean isRowDeletion(CFMetaData metadata)
+        {
+            if (start.collectionName != null
+                || stop.collectionName != null
+                || start.bound.size() != metadata.comparator.size()
+                || stop.bound.size() != metadata.comparator.size())
+                return false;
+
+            for (int i = 0; i < start.bound.size(); i++)
+                if (!Objects.equals(start.bound.get(i), stop.bound.get(i)))
+                    return false;
+            return true;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("RT(%s-%s, %s)", start, stop, deletionTime);
+        }
+    }
+
+    public static class LegacyDeletionInfo
+    {
+        public final MutableDeletionInfo deletionInfo;
+        public final List<LegacyRangeTombstone> inRowTombstones = new ArrayList<>();
+
+        private LegacyDeletionInfo(MutableDeletionInfo deletionInfo)
+        {
+            this.deletionInfo = deletionInfo;
+        }
+
+        public static LegacyDeletionInfo live()
+        {
+            return new LegacyDeletionInfo(MutableDeletionInfo.live());
+        }
+
+        public void add(DeletionTime topLevel)
+        {
+            deletionInfo.add(topLevel);
+        }
+
+        private static Slice.Bound staticBound(CFMetaData metadata, boolean isStart)
+        {
+            // In pre-3.0 nodes, static row started by a clustering with all empty values so we
+            // preserve that here. Note that in practice, it doesn't really matter since the rest
+            // of the code will ignore the bound for RT that have their static flag set.
+            ByteBuffer[] values = new ByteBuffer[metadata.comparator.size()];
+            for (int i = 0; i < values.length; i++)
+                values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+            return isStart
+                 ? Slice.Bound.inclusiveStartOf(values)
+                 : Slice.Bound.inclusiveEndOf(values);
+        }
+
+        public void add(CFMetaData metadata, LegacyRangeTombstone tombstone)
+        {
+            if (metadata.hasStaticColumns())
+            {
+                /*
+                 * For table having static columns we have to deal with the following cases:
+                 *  1. the end of the tombstone is static (in which case either the start is static or is BOTTOM, which is the same
+                 *     for our consideration). This mean that either the range only delete the static row, or that it's a collection
+                 *     tombstone of a static collection. In both case, we just add the tombstone to the inRowTombstones.
+                 *  2. only the start is static. There is then 2 subcase: either the start is inclusive, and that mean we include the
+                 *     static row and more (so we add an inRowTombstone for the static and deal with the rest normally). Or the start
+                 *     is exclusive, and that means we explicitely exclude the static (in which case we can just add the tombstone
+                 *     as if it started at BOTTOM).
+                 *  3. none of the bound are static but the start is BOTTOM. This means we intended to delete the static row so we
+                 *     need to add it to the inRowTombstones (and otherwise handle the range normally).
+                 */
+                if (tombstone.stop.isStatic)
+                {
+                    // If the start is BOTTOM, we replace it by the beginning of the starting row so as to not confuse the
+                    // RangeTombstone.isRowDeletion() method
+                    if (tombstone.start == LegacyBound.BOTTOM)
+                        tombstone = tombstone.withNewStart(new LegacyBound(staticBound(metadata, true), true, null));
+                    inRowTombstones.add(tombstone);
+                    return;
+                }
+
+                if (tombstone.start.isStatic)
+                {
+                    if (tombstone.start.bound.isInclusive())
+                        inRowTombstones.add(tombstone.withNewEnd(new LegacyBound(staticBound(metadata, false), true, null)));
+
+                    tombstone = tombstone.withNewStart(LegacyBound.BOTTOM);
+                }
+                else if (tombstone.start == LegacyBound.BOTTOM)
+                {
+                    inRowTombstones.add(new LegacyRangeTombstone(new LegacyBound(staticBound(metadata, true), true, null),
+                                                                 new LegacyBound(staticBound(metadata, false), true, null),
+                                                                 tombstone.deletionTime));
+                }
+            }
+
+            if (tombstone.isCollectionTombstone() || tombstone.isRowDeletion(metadata))
+                inRowTombstones.add(tombstone);
+            else
+                add(metadata, new RangeTombstone(Slice.make(tombstone.start.bound, tombstone.stop.bound), tombstone.deletionTime));
+        }
+
+        public void add(CFMetaData metadata, RangeTombstone tombstone)
+        {
+            deletionInfo.add(tombstone, metadata.comparator);
+        }
+
+        public Iterator<LegacyRangeTombstone> inRowRangeTombstones()
+        {
+            return inRowTombstones.iterator();
+        }
+
+        public static LegacyDeletionInfo deserialize(CFMetaData metadata, DataInputPlus in) throws IOException
+        {
+            DeletionTime topLevel = DeletionTime.serializer.deserialize(in);
+
+            int rangeCount = in.readInt();
+            if (rangeCount == 0)
+                return new LegacyDeletionInfo(new MutableDeletionInfo(topLevel));
+
+            LegacyDeletionInfo delInfo = new LegacyDeletionInfo(new MutableDeletionInfo(topLevel));
+            for (int i = 0; i < rangeCount; i++)
+            {
+                LegacyBound start = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), true);
+                LegacyBound end = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false);
+                int delTime =  in.readInt();
+                long markedAt = in.readLong();
+
+                delInfo.add(metadata, new LegacyRangeTombstone(start, end, new DeletionTime(markedAt, delTime)));
+            }
+            return delInfo;
+        }
+    }
+
+    /**
+     * A helper class for LegacyRangeTombstoneList.  This replaces the Comparator<Composite> that RTL used before 3.0.
+     */
+    private static class LegacyBoundComparator implements Comparator<LegacyBound>
+    {
+        ClusteringComparator clusteringComparator;
+
+        public LegacyBoundComparator(ClusteringComparator clusteringComparator)
+        {
+            this.clusteringComparator = clusteringComparator;
+        }
+
+        public int compare(LegacyBound a, LegacyBound b)
+        {
+            // In the legacy sorting, BOTTOM comes before anything else
+            if (a == LegacyBound.BOTTOM)
+                return b == LegacyBound.BOTTOM ? 0 : -1;
+            if (b == LegacyBound.BOTTOM)
+                return 1;
+
+            // Excluding BOTTOM, statics are always before anything else.
+            if (a.isStatic != b.isStatic)
+                return a.isStatic ? -1 : 1;
+
+            // We have to be careful with bound comparison because of collections. Namely, if the 2 bounds represent the
+            // same prefix, then we should take the collectionName into account before taking the bounds kind
+            // (ClusteringPrefix.Kind). This means we can't really call ClusteringComparator.compare() directly.
+            // For instance, if
+            //    a is (bound=INCL_START_BOUND('x'), collectionName='d')
+            //    b is (bound=INCL_END_BOUND('x'),   collectionName='c')
+            // Ten b < a since the element 'c' of collection 'x' comes before element 'd', but calling
+            // clusteringComparator.compare(a.bound, b.bound) returns -1.
+            // See CASSANDRA-13125 for details.
+            int sa = a.bound.size();
+            int sb = b.bound.size();
+            for (int i = 0; i < Math.min(sa, sb); i++)
+            {
+                int cmp = clusteringComparator.compareComponent(i, a.bound.get(i), b.bound.get(i));
+                if (cmp != 0)
+                    return cmp;
+            }
+
+            if (sa != sb)
+                return sa < sb ? a.bound.kind().comparedToClustering : -b.bound.kind().comparedToClustering;
+
+            // Both bound represent the same prefix, compare the collection names
+            // If one has a collection name and the other doesn't, the other comes before as it points to the beginning of the row.
+            if ((a.collectionName == null) != (b.collectionName == null))
+                return a.collectionName == null ? -1 : 1;
+
+            // If they both have a collection, compare that first
+            if (a.collectionName != null)
+            {
+                int cmp = UTF8Type.instance.compare(a.collectionName.name.bytes, b.collectionName.name.bytes);
+                if (cmp != 0)
+                    return cmp;
+            }
+
+            // Lastly, if everything so far is equal, compare their clustering kind
+            return ClusteringPrefix.Kind.compare(a.bound.kind(), b.bound.kind());
+        }
+    }
+
+    /**
+     * Almost an entire copy of RangeTombstoneList from C* 2.1.  The main difference is that LegacyBoundComparator
+     * is used in place of Comparator<Composite> (because Composite doesn't exist any more).
+     *
+     * This class is needed to allow us to convert single-row deletions and complex deletions into range tombstones
+     * and properly merge them into the normal set of range tombstones.
+     */
+    public static class LegacyRangeTombstoneList
+    {
+        private final LegacyBoundComparator comparator;
+
+        // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could
+        // use a List for starts and ends, but having arrays everywhere is almost simpler.
+        LegacyBound[] starts;
+        LegacyBound[] ends;
+        private long[] markedAts;
+        private int[] delTimes;
+
+        private int size;
+
+        private LegacyRangeTombstoneList(LegacyBoundComparator comparator, LegacyBound[] starts, LegacyBound[] ends, long[] markedAts, int[] delTimes, int size)
+        {
+            assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length;
+            this.comparator = comparator;
+            this.starts = starts;
+            this.ends = ends;
+            this.markedAts = markedAts;
+            this.delTimes = delTimes;
+            this.size = size;
+        }
+
+        public LegacyRangeTombstoneList(LegacyBoundComparator comparator, int capacity)
+        {
+            this(comparator, new LegacyBound[capacity], new LegacyBound[capacity], new long[capacity], new int[capacity], 0);
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append('[');
+            for (int i = 0; i < size; i++)
+            {
+                if (i > 0)
+                    sb.append(',');
+                sb.append('(').append(starts[i]).append(", ").append(ends[i]).append(')');
+            }
+            return sb.append(']').toString();
+        }
+
+        public boolean isEmpty()
+        {
+            return size == 0;
+        }
+
+        public int size()
+        {
+            return size;
+        }
+
+        /**
+         * Adds a new range tombstone.
+         *
+         * This method will be faster if the new tombstone sort after all the currently existing ones (this is a common use case),
+         * but it doesn't assume it.
+         */
+        public void add(LegacyBound start, LegacyBound end, long markedAt, int delTime)
+        {
+            if (isEmpty())
+            {
+                addInternal(0, start, end, markedAt, delTime);
+                return;
+            }
+
+            int c = comparator.compare(ends[size-1], start);
+
+            // Fast path if we add in sorted order
+            if (c <= 0)
+            {
+                addInternal(size, start, end, markedAt, delTime);
+            }
+            else
+            {
+                // Note: insertFrom expect i to be the insertion point in term of interval ends
+                int pos = Arrays.binarySearch(ends, 0, size, start, comparator);
+                insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime);
+            }
+        }
+
+        /*
+         * Inserts a new element starting at index i. This method assumes that:
+         *    ends[i-1] <= start <= ends[i]
+         *
+         * A RangeTombstoneList is a list of range [s_0, e_0]...[s_n, e_n] such that:
+         *   - s_i <= e_i
+         *   - e_i <= s_i+1
+         *   - if s_i == e_i and e_i == s_i+1 then s_i+1 < e_i+1
+         * Basically, range are non overlapping except for their bound and in order. And while
+         * we allow ranges with the same value for the start and end, we don't allow repeating
+         * such range (so we can't have [0, 0][0, 0] even though it would respect the first 2
+         * conditions).
+         *
+         */
+
+        /**
+         * Adds all the range tombstones of {@code tombstones} to this RangeTombstoneList.
+         */
+        public void addAll(LegacyRangeTombstoneList tombstones)
+        {
+            if (tombstones.isEmpty())
+                return;
+
+            if (isEmpty())
+            {
+                copyArrays(tombstones, this);
+                return;
+            }
+
+            /*
+             * We basically have 2 techniques we can use here: either we repeatedly call add() on tombstones values,
+             * or we do a merge of both (sorted) lists. If this lists is bigger enough than the one we add, then
+             * calling add() will be faster, otherwise it's merging that will be faster.
+             *
+             * Let's note that during memtables updates, it might not be uncommon that a new update has only a few range
+             * tombstones, while the CF we're adding it to (the one in the memtable) has many. In that case, using add() is
+             * likely going to be faster.
+             *
+             * In other cases however, like when diffing responses from multiple nodes, the tombstone lists we "merge" will
+             * be likely sized, so using add() might be a bit inefficient.
+             *
+             * Roughly speaking (this ignore the fact that updating an element is not exactly constant but that's not a big
+             * deal), if n is the size of this list and m is tombstones size, merging is O(n+m) while using add() is O(m*log(n)).
+             *
+             * But let's not crank up a logarithm computation for that. Long story short, merging will be a bad choice only
+             * if this list size is lot bigger that the other one, so let's keep it simple.
+             */
+            if (size > 10 * tombstones.size)
+            {
+                for (int i = 0; i < tombstones.size; i++)
+                    add(tombstones.starts[i], tombstones.ends[i], tombstones.markedAts[i], tombstones.delTimes[i]);
+            }
+            else
+            {
+                int i = 0;
+                int j = 0;
+                while (i < size && j < tombstones.size)
+                {
+                    if (comparator.compare(tombstones.starts[j], ends[i]) <= 0)
+                    {
+                        insertFrom(i, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]);
+                        j++;
+                    }
+                    else
+                    {
+                        i++;
+                    }
+                }
+                // Addds the remaining ones from tombstones if any (note that addInternal will increment size if relevant).
+                for (; j < tombstones.size; j++)
+                    addInternal(size, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]);
+            }
+        }
+
+        private static void copyArrays(LegacyRangeTombstoneList src, LegacyRangeTombstoneList dst)
+        {
+            dst.grow(src.size);
+            System.arraycopy(src.starts, 0, dst.starts, 0, src.size);
+            System.arraycopy(src.ends, 0, dst.ends, 0, src.size);
+            System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size);
+            System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size);
+            dst.size = src.size;
+        }
+
+        private void insertFrom(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime)
+        {
+            while (i < size)
+            {
+                assert i == 0 || comparator.compare(ends[i-1], start) <= 0;
+
+                int c = comparator.compare(start, ends[i]);
+                assert c <= 0;
+                if (c == 0)
+                {
+                    // If start == ends[i], then we can insert from the next one (basically the new element
+                    // really start at the next element), except for the case where starts[i] == ends[i].
+                    // In this latter case, if we were to move to next element, we could end up with ...[x, x][x, x]...
+                    if (comparator.compare(starts[i], ends[i]) == 0)
+                    {
+                        // The current element cover a single value which is equal to the start of the inserted
+                        // element. If the inserted element overwrites the current one, just remove the current
+                        // (it's included in what we insert) and proceed with the insert.
+                        if (markedAt > markedAts[i])
+                        {
+                            removeInternal(i);
+                            continue;
+                        }
+
+                        // Otherwise (the current singleton interval override the new one), we want to leave the
+                        // current element and move to the next, unless start == end since that means the new element
+                        // is in fact fully covered by the current one (so we're done)
+                        if (comparator.compare(start, end) == 0)
+                            return;
+                    }
+                    i++;
+                    continue;
+                }
+
+                // Do we overwrite the current element?
+                if (markedAt > markedAts[i])
+                {
+                    // We do overwrite.
+
+                    // First deal with what might come before the newly added one.
+                    if (comparator.compare(starts[i], start) < 0)
+                    {
+                        addInternal(i, starts[i], start, markedAts[i], delTimes[i]);
+                        i++;
+                        // We don't need to do the following line, but in spirit that's what we want to do
+                        // setInternal(i, start, ends[i], markedAts, delTime])
+                    }
+
+                    // now, start <= starts[i]
+
+                    // Does the new element stops before/at the current one,
+                    int endCmp = comparator.compare(end, starts[i]);
+                    if (endCmp <= 0)
+                    {
+                        // Here start <= starts[i] and end <= starts[i]
+                        // This means the current element is before the current one. However, one special
+                        // case is if end == starts[i] and starts[i] == ends[i]. In that case,
+                        // the new element entirely overwrite the current one and we can just overwrite
+                        if (endCmp == 0 && comparator.compare(starts[i], ends[i]) == 0)
+                            setInternal(i, start, end, markedAt, delTime);
+                        else
+                            addInternal(i, start, end, markedAt, delTime);
+                        return;
+                    }
+
+                    // Do we overwrite the current element fully?
+                    int cmp = comparator.compare(ends[i], end);
+                    if (cmp <= 0)
+                    {
+                        // We do overwrite fully:
+                        // update the current element until it's end and continue
+                        // on with the next element (with the new inserted start == current end).
+
+                        // If we're on the last element, we can optimize
+                        if (i == size-1)
+                        {
+                            setInternal(i, start, end, markedAt, delTime);
+                            return;
+                        }
+
+                        setInternal(i, start, ends[i], markedAt, delTime);
+                        if (cmp == 0)
+                            return;
+
+                        start = ends[i];
+                        i++;
+                    }
+                    else
+                    {
+                        // We don't ovewrite fully. Insert the new interval, and then update the now next
+                        // one to reflect the not overwritten parts. We're then done.
+                        addInternal(i, start, end, markedAt, delTime);
+                        i++;
+                        setInternal(i, end, ends[i], markedAts[i], delTimes[i]);
+                        return;
+                    }
+                }
+                else
+                {
+                    // we don't overwrite the current element
+
+                    // If the new interval starts before the current one, insert that new interval
+                    if (comparator.compare(start, starts[i]) < 0)
+                    {
+                        // If we stop before the start of the current element, just insert the new
+                        // interval and we're done; otherwise insert until the beginning of the
+                        // current element
+                        if (comparator.compare(end, starts[i]) <= 0)
+                        {
+                            addInternal(i, start, end, markedAt, delTime);
+                            return;
+                        }
+                        addInternal(i, start, starts[i], markedAt, delTime);
+                        i++;
+                    }
+
+                    // After that, we're overwritten on the current element but might have
+                    // some residual parts after ...
+
+                    // ... unless we don't extend beyond it.
+                    if (comparator.compare(end, ends[i]) <= 0)
+                        return;
+
+                    start = ends[i];
+                    i++;
+                }
+            }
+
+            // If we got there, then just insert the remainder at the end
+            addInternal(i, start, end, markedAt, delTime);
+        }
+
+        private int capacity()
+        {
+            return starts.length;
+        }
+
+        private void addInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime)
+        {
+            assert i >= 0;
+
+            if (size == capacity())
+                growToFree(i);
+            else if (i < size)
+                moveElements(i);
+
+            setInternal(i, start, end, markedAt, delTime);
+            size++;
+        }
+
+        private void removeInternal(int i)
+        {
+            assert i >= 0;
+
+            System.arraycopy(starts, i+1, starts, i, size - i - 1);
+            System.arraycopy(ends, i+1, ends, i, size - i - 1);
+            System.arraycopy(markedAts, i+1, markedAts, i, size - i - 1);
+            System.arraycopy(delTimes, i+1, delTimes, i, size - i - 1);
+
+            --size;
+            starts[size] = null;
+            ends[size] = null;
+        }
+
+        /*
+         * Grow the arrays, leaving index i "free" in the process.
+         */
+        private void growToFree(int i)
+        {
+            int newLength = (capacity() * 3) / 2 + 1;
+            grow(i, newLength);
+        }
+
+        /*
+         * Grow the arrays to match newLength capacity.
+         */
+        private void grow(int newLength)
+        {
+            if (capacity() < newLength)
+                grow(-1, newLength);
+        }
+
+        private void grow(int i, int newLength)
+        {
+            starts = grow(starts, size, newLength, i);
+            ends = grow(ends, size, newLength, i);
+            markedAts = grow(markedAts, size, newLength, i);
+            delTimes = grow(delTimes, size, newLength, i);
+        }
+
+        private static LegacyBound[] grow(LegacyBound[] a, int size, int newLength, int i)
+        {
+            if (i < 0 || i >= size)
+                return Arrays.copyOf(a, newLength);
+
+            LegacyBound[] newA = new LegacyBound[newLength];
+            System.arraycopy(a, 0, newA, 0, i);
+            System.arraycopy(a, i, newA, i+1, size - i);
+            return newA;
+        }
+
+        private static long[] grow(long[] a, int size, int newLength, int i)
+        {
+            if (i < 0 || i >= size)
+                return Arrays.copyOf(a, newLength);
+
+            long[] newA = new long[newLength];
+            System.arraycopy(a, 0, newA, 0, i);
+            System.arraycopy(a, i, newA, i+1, size - i);
+            return newA;
+        }
+
+        private static int[] grow(int[] a, int size, int newLength, int i)
+        {
+            if (i < 0 || i >= size)
+                return Arrays.copyOf(a, newLength);
+
+            int[] newA = new int[newLength];
+            System.arraycopy(a, 0, newA, 0, i);
+            System.arraycopy(a, i, newA, i+1, size - i);
+            return newA;
+        }
+
+        /*
+         * Move elements so that index i is "free", assuming the arrays have at least one free slot at the end.
+         */
+        private void moveElements(int i)
+        {
+            if (i >= size)
+                return;
+
+            System.arraycopy(starts, i, starts, i+1, size - i);
+            System.arraycopy(ends, i, ends, i+1, size - i);
+            System.arraycopy(markedAts, i, markedAts, i+1, size - i);
+            System.arraycopy(delTimes, i, delTimes, i+1, size - i);
+            // we set starts[i] to null to indicate the position is now empty, so that we update boundaryHeapSize
+            // when we set it
+            starts[i] = null;
+        }
+
+        private void setInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime)
+        {
+            starts[i] = start;
+            ends[i] = end;
+            markedAts[i] = markedAt;
+            delTimes[i] = delTime;
+        }
+
+        public void updateDigest(MessageDigest digest)
+        {
+            ByteBuffer longBuffer = ByteBuffer.allocate(8);
+            for (int i = 0; i < size; i++)
+            {
+                for (int j = 0; j < starts[i].bound.size(); j++)
+                    digest.update(starts[i].bound.get(j).duplicate());
+                if (starts[i].collectionName != null)
+                    digest.update(starts[i].collectionName.name.bytes.duplicate());
+                for (int j = 0; j < ends[i].bound.size(); j++)
+                    digest.update(ends[i].bound.get(j).duplicate());
+                if (ends[i].collectionName != null)
+                    digest.update(ends[i].collectionName.name.bytes.duplicate());
+
+                longBuffer.putLong(0, markedAts[i]);
+                digest.update(longBuffer.array(), 0, 8);
+            }
+        }
+
+        public void serialize(DataOutputPlus out, CFMetaData metadata) throws IOException
+        {
+            out.writeInt(size);
+            if (size == 0)
+                return;
+
+            if (metadata.isCompound())
+                serializeCompound(out, metadata.isDense());
+            else
+                serializeSimple(out);
+        }
+
+        private void serializeCompound(DataOutputPlus out, boolean isDense) throws IOException
+        {
+            List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes());
+
+            if (!isDense)
+                types.add(UTF8Type.instance);
+
+            CompositeType type = CompositeType.getInstance(types);
+
+            for (int i = 0; i < size; i++)
+            {
+                LegacyBound start = starts[i];
+                LegacyBound end = ends[i];
+
+                CompositeType.Builder startBuilder = type.builder(start.isStatic);
+                CompositeType.Builder endBuilder = type.builder(end.isStatic);
+                for (int j = 0; j < start.bound.clustering().size(); j++)
+                {
+                    startBuilder.add(start.bound.get(j));
+                    endBuilder.add(end.bound.get(j));
+                }
+
+                if (start.collectionName != null)
+                    startBuilder.add(start.collectionName.name.bytes);
+                if (end.collectionName != null)
+                    endBuilder.add(end.collectionName.name.bytes);
+
+                ByteBufferUtil.writeWithShortLength(startBuilder.build(), out);
+                ByteBufferUtil.writeWithShortLength(endBuilder.buildAsEndOfRange(), out);
+
+                out.writeInt(delTimes[i]);
+                out.writeLong(markedAts[i]);
+            }
+        }
+
+        private void serializeSimple(DataOutputPlus out) throws IOException
+        {
+            List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes());
+            assert types.size() == 1 : types;
+
+            for (int i = 0; i < size; i++)
+            {
+                LegacyBound start = starts[i];
+                LegacyBound end = ends[i];
+
+                ClusteringPrefix startClustering = start.bound.clustering();
+                ClusteringPrefix endClustering = end.bound.clustering();
+
+                assert startClustering.size() == 1;
+                assert endClustering.size() == 1;
+
+                ByteBufferUtil.writeWithShortLength(startClustering.get(0), out);
+                ByteBufferUtil.writeWithShortLength(endClustering.get(0), out);
+
+                out.writeInt(delTimes[i]);
+                out.writeLong(markedAts[i]);
+            }
+        }
+
+        public long serializedSize(CFMetaData metadata)
+        {
+            long size = 0;
+            size += TypeSizes.sizeof(this.size);
+
+            if (this.size == 0)
+                return size;
+
+            if (metadata.isCompound())
+                return size + serializedSizeCompound(metadata.isDense());
+            else
+                return size + serializedSizeSimple();
+        }
+
+        private long serializedSizeCompound(boolean isDense)
+        {
+            long size = 0;
+            List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes());
+            if (!isDense)
+                types.add(UTF8Type.instance);
+            CompositeType type = CompositeType.getInstance(types);
+
+            for (int i = 0; i < this.size; i++)
+            {
+                LegacyBound start = starts[i];
+                LegacyBound end = ends[i];
+
+                CompositeType.Builder startBuilder = type.builder();
+                CompositeType.Builder endBuilder = type.builder();
+                for (int j = 0; j < start.bound.size(); j++)
+                    startBuilder.add(start.bound.get(j));
+                for (int j = 0; j < end.bound.size(); j++)
+                    endBuilder.add(end.bound.get(j));
+
+                if (start.collectionName != null)
+                    startBuilder.add(start.collectionName.name.bytes);
+                if (end.collectionName != null)
+                    endBuilder.add(end.collectionName.name.bytes);
+
+                size += ByteBufferUtil.serializedSizeWithShortLength(startBuilder.build());
+                size += ByteBufferUtil.serializedSizeWithShortLength(endBuilder.buildAsEndOfRange());
+
+                size += TypeSizes.sizeof(delTimes[i]);
+                size += TypeSizes.sizeof(markedAts[i]);
+            }
+            return size;
+        }
+
+        private long serializedSizeSimple()
+        {
+            long size = 0;
+            List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes());
+            assert types.size() == 1 : types;
+
+            for (int i = 0; i < this.size; i++)
+            {
+                LegacyBound start = starts[i];
+                LegacyBound end = ends[i];
+
+                ClusteringPrefix startClustering = start.bound.clustering();
+                ClusteringPrefix endClustering = end.bound.clustering();
+
+                assert startClustering.size() == 1;
+                assert endClustering.size() == 1;
+
+                size += ByteBufferUtil.serializedSizeWithShortLength(startClustering.get(0));
+                size += ByteBufferUtil.serializedSizeWithShortLength(endClustering.get(0));
+
+                size += TypeSizes.sizeof(delTimes[i]);
+                size += TypeSizes.sizeof(markedAts[i]);
+            }
+            return size;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/LivenessInfo.java b/src/java/org/apache/cassandra/db/LivenessInfo.java
new file mode 100644
index 0000000..f6c9b62
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/LivenessInfo.java

@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.Objects;
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Stores the information relating to the liveness of the primary key columns of a row.
+ * <p>
+ * A {@code LivenessInfo} can first be empty. If it isn't, it contains at least a timestamp,
+ * which is the timestamp for the row primary key columns. On top of that, the info can be
+ * ttl'ed, in which case the {@code LivenessInfo} also has both a ttl and a local expiration time.
+ * <p>
+ * Please note that if a liveness info is ttl'ed, that expiration is <b>only</b> an expiration
+ * of the liveness info itself (so, of the timestamp), and once the info expires it becomes
+ * {@code EMPTY}. But if a row has a liveness info which expires, the rest of the row data is
+ * unaffected (of course, the rest of said row data might be ttl'ed on its own but this is
+ * separate).
+ */
+public class LivenessInfo
+{
+    public static final long NO_TIMESTAMP = Long.MIN_VALUE;
+    public static final int NO_TTL = Cell.NO_TTL;
+    /**
+     * Used as flag for representing an expired liveness.
+     *
+     * TTL per request is at most 20 yrs, so this shouldn't conflict
+     * (See {@link org.apache.cassandra.cql3.Attributes#MAX_TTL})
+     */
+    public static final int EXPIRED_LIVENESS_TTL = Integer.MAX_VALUE;
+    public static final int NO_EXPIRATION_TIME = Cell.NO_DELETION_TIME;
+
+    public static final LivenessInfo EMPTY = new LivenessInfo(NO_TIMESTAMP);
+
+    protected final long timestamp;
+
+    protected LivenessInfo(long timestamp)
+    {
+        this.timestamp = timestamp;
+    }
+
+    public static LivenessInfo create(CFMetaData metadata, long timestamp, int nowInSec)
+    {
+        int defaultTTL = metadata.params.defaultTimeToLive;
+        if (defaultTTL != NO_TTL)
+            return expiring(timestamp, defaultTTL, nowInSec);
+
+        return new LivenessInfo(timestamp);
+    }
+
+    public static LivenessInfo expiring(long timestamp, int ttl, int nowInSec)
+    {
+        assert ttl != EXPIRED_LIVENESS_TTL;
+        return new ExpiringLivenessInfo(timestamp, ttl, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl));
+    }
+
+    public static LivenessInfo create(CFMetaData metadata, long timestamp, int ttl, int nowInSec)
+    {
+        return ttl == NO_TTL
+             ? create(metadata, timestamp, nowInSec)
+             : expiring(timestamp, ttl, nowInSec);
+    }
+
+    // Note that this ctor ignores the default table ttl and takes the expiration time, not the current time.
+    // Use when you know that's what you want.
+    public static LivenessInfo create(long timestamp, int ttl, int localExpirationTime)
+    {
+        if (ttl == EXPIRED_LIVENESS_TTL)
+            return new ExpiredLivenessInfo(timestamp, ttl, localExpirationTime);
+        return ttl == NO_TTL ? new LivenessInfo(timestamp) : new ExpiringLivenessInfo(timestamp, ttl, localExpirationTime);
+    }
+
+    /**
+     * Whether this liveness info is empty (has no timestamp).
+     *
+     * @return whether this liveness info is empty or not.
+     */
+    public boolean isEmpty()
+    {
+        return timestamp == NO_TIMESTAMP;
+    }
+
+    /**
+     * The timestamp for this liveness info.
+     *
+     * @return the liveness info timestamp (or {@link #NO_TIMESTAMP} if the info is empty).
+     */
+    public long timestamp()
+    {
+        return timestamp;
+    }
+
+    /**
+     * Whether the info has a ttl.
+     */
+    public boolean isExpiring()
+    {
+        return false;
+    }
+
+    /**
+     * The ttl (if any) on the row primary key columns or {@link #NO_TTL} if it is not
+     * expiring.
+     *
+     * Please note that this value is the TTL that was set originally and is thus not
+     * changing.
+     */
+    public int ttl()
+    {
+        return NO_TTL;
+    }
+
+    /**
+     * The expiration time (in seconds) if the info is expiring ({@link #NO_EXPIRATION_TIME} otherwise).
+     *
+     */
+    public int localExpirationTime()
+    {
+        return NO_EXPIRATION_TIME;
+    }
+
+    /**
+     * Whether that info is still live.
+     *
+     * A {@code LivenessInfo} is live if it is either not expiring, or if its expiration time if after
+     * {@code nowInSec}.
+     *
+     * @param nowInSec the current time in seconds.
+     * @return whether this liveness info is live or not.
+     */
+    public boolean isLive(int nowInSec)
+    {
+        return !isEmpty();
+    }
+
+    /**
+     * Adds this liveness information to the provided digest.
+     *
+     * @param digest the digest to add this liveness information to.
+     */
+    public void digest(MessageDigest digest)
+    {
+        FBUtilities.updateWithLong(digest, timestamp());
+    }
+
+    /**
+     * Validate the data contained by this liveness information.
+     *
+     * @throws MarshalException if some of the data is corrupted.
+     */
+    public void validate()
+    {
+    }
+
+    /**
+     * The size of the (useful) data this liveness information contains.
+     *
+     * @return the size of the data this liveness information contains.
+     */
+    public int dataSize()
+    {
+        return TypeSizes.sizeof(timestamp());
+    }
+
+    /**
+     * Whether this liveness information supersedes another one (that is
+     * whether is has a greater timestamp than the other or not).
+     *
+     * </br>
+     *
+     * If timestamps are the same and none of them are expired livenessInfo,
+     * livenessInfo with greater TTL supersedes another. It also means, if timestamps are the same,
+     * ttl superseders no-ttl. This is the same rule as {@link Conflicts#resolveRegular}
+     *
+     * If timestamps are the same and one of them is expired livenessInfo. Expired livenessInfo
+     * supersedes, ie. tombstone supersedes.
+     *
+     * If timestamps are the same and both of them are expired livenessInfo(Ideally it shouldn't happen),
+     * greater localDeletionTime wins.
+     *
+     * @param other
+     *            the {@code LivenessInfo} to compare this info to.
+     *
+     * @return whether this {@code LivenessInfo} supersedes {@code other}.
+     */
+    public boolean supersedes(LivenessInfo other)
+    {
+        if (timestamp != other.timestamp)
+            return timestamp > other.timestamp;
+        if (isExpired() ^ other.isExpired())
+            return isExpired();
+        if (isExpiring() == other.isExpiring())
+            return localExpirationTime() > other.localExpirationTime();
+        return isExpiring();
+    }
+
+    protected boolean isExpired()
+    {
+        return false;
+    }
+
+    /**
+     * Returns a copy of this liveness info updated with the provided timestamp.
+     *
+     * @param newTimestamp the timestamp for the returned info.
+     * @return if this liveness info has a timestamp, a copy of it with {@code newTimestamp}
+     * as timestamp. If it has no timestamp however, this liveness info is returned
+     * unchanged.
+     */
+    public LivenessInfo withUpdatedTimestamp(long newTimestamp)
+    {
+        return new LivenessInfo(newTimestamp);
+    }
+
+    public LivenessInfo withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime)
+    {
+        return LivenessInfo.create(newTimestamp, ttl(), newLocalDeletionTime);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("[ts=%d]", timestamp);
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if(!(other instanceof LivenessInfo))
+            return false;
+
+        LivenessInfo that = (LivenessInfo)other;
+        return this.timestamp() == that.timestamp()
+            && this.ttl() == that.ttl()
+            && this.localExpirationTime() == that.localExpirationTime();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(timestamp(), ttl(), localExpirationTime());
+    }
+
+    /**
+     * Effectively acts as a PK tombstone. This is used for Materialized Views to shadow
+     * updated entries while co-existing with row tombstones.
+     *
+     * See {@link org.apache.cassandra.db.view.ViewUpdateGenerator#deleteOldEntryInternal}.
+     */
+    private static class ExpiredLivenessInfo extends ExpiringLivenessInfo
+    {
+        private ExpiredLivenessInfo(long timestamp, int ttl, int localExpirationTime)
+        {
+            super(timestamp, ttl, localExpirationTime);
+            assert ttl == EXPIRED_LIVENESS_TTL;
+            assert timestamp != NO_TIMESTAMP;
+        }
+
+        @Override
+        public boolean isExpired()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean isLive(int nowInSec)
+        {
+            // used as tombstone to shadow entire PK
+            return false;
+        }
+
+        @Override
+        public LivenessInfo withUpdatedTimestamp(long newTimestamp)
+        {
+            return new ExpiredLivenessInfo(newTimestamp, ttl(), localExpirationTime());
+        }
+    }
+
+    private static class ExpiringLivenessInfo extends LivenessInfo
+    {
+        private final int ttl;
+        private final int localExpirationTime;
+
+        private ExpiringLivenessInfo(long timestamp, int ttl, int localExpirationTime)
+        {
+            super(timestamp);
+            assert ttl != NO_TTL && localExpirationTime != NO_EXPIRATION_TIME;
+            this.ttl = ttl;
+            this.localExpirationTime = localExpirationTime;
+        }
+
+        @Override
+        public int ttl()
+        {
+            return ttl;
+        }
+
+        @Override
+        public int localExpirationTime()
+        {
+            return localExpirationTime;
+        }
+
+        @Override
+        public boolean isExpiring()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean isLive(int nowInSec)
+        {
+            return nowInSec < localExpirationTime;
+        }
+
+        @Override
+        public void digest(MessageDigest digest)
+        {
+            super.digest(digest);
+            FBUtilities.updateWithInt(digest, localExpirationTime);
+            FBUtilities.updateWithInt(digest, ttl);
+        }
+
+        @Override
+        public void validate()
+        {
+            if (ttl < 0)
+                throw new MarshalException("A TTL should not be negative");
+            if (localExpirationTime < 0)
+                throw new MarshalException("A local expiration time should not be negative");
+        }
+
+        @Override
+        public int dataSize()
+        {
+            return super.dataSize()
+                 + TypeSizes.sizeof(ttl)
+                 + TypeSizes.sizeof(localExpirationTime);
+
+        }
+
+        @Override
+        public LivenessInfo withUpdatedTimestamp(long newTimestamp)
+        {
+            return new ExpiringLivenessInfo(newTimestamp, ttl, localExpirationTime);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("[ts=%d ttl=%d, let=%d]", timestamp, ttl, localExpirationTime);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java
index b17b3fc..041ac2e 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java

@@ -18,12 +18,9 @@
 package org.apache.cassandra.db;
 
 import java.io.File;
-import java.util.AbstractMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.concurrent.ConcurrentNavigableMap;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.TimeUnit;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicReference;
 
@@ -33,20 +30,32 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.commitlog.IntervalSet;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableTxnWriter;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.*;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.MemtablePool;
 
 public class Memtable implements Comparable<Memtable>
 {
@@ -60,7 +69,7 @@
     private final AtomicLong liveDataSize = new AtomicLong(0);
     private final AtomicLong currentOperations = new AtomicLong(0);
 
-    // the write barrier for directing writes to this memtable during a switch
+    // the write barrier for directing writes to this memtable or the next during a switch
     private volatile OpOrder.Barrier writeBarrier;
     // the precise upper bound of ReplayPosition owned by this memtable
     private volatile AtomicReference<ReplayPosition> commitLogUpperBound;
@@ -82,10 +91,10 @@
         }
     }
 
-    // We index the memtable by RowPosition only for the purpose of being able
+    // We index the memtable by PartitionPosition only for the purpose of being able
     // to select key range using Token.KeyBound. However put() ensures that we
     // actually only store DecoratedKey.
-    private final ConcurrentNavigableMap<RowPosition, AtomicBTreeColumns> rows = new ConcurrentSkipListMap<>();
+    private final ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> partitions = new ConcurrentSkipListMap<>();
     public final ColumnFamilyStore cfs;
     private final long creationNano = System.nanoTime();
 
@@ -95,8 +104,12 @@
     // Record the comparator of the CFS at the creation of the memtable. This
     // is only used when a user update the CF comparator, to know if the
     // memtable was created with the new or old comparator.
-    public final CellNameType initialComparator;
+    public final ClusteringComparator initialComparator;
 
+    private final ColumnsCollector columnsCollector;
+    private final StatsCollector statsCollector = new StatsCollector();
+
+    // only to be used by init(), to setup the very first memtable for the cfs
     public Memtable(AtomicReference<ReplayPosition> commitLogLowerBound, ColumnFamilyStore cfs)
     {
         this.cfs = cfs;
@@ -104,6 +117,7 @@
         this.allocator = MEMORY_POOL.newAllocator();
         this.initialComparator = cfs.metadata.comparator;
         this.cfs.scheduleFlush();
+        this.columnsCollector = new ColumnsCollector(cfs.metadata.partitionColumns());
     }
 
     // ONLY to be used for testing, to create a mock Memtable
@@ -113,6 +127,7 @@
         this.initialComparator = metadata.comparator;
         this.cfs = null;
         this.allocator = null;
+        this.columnsCollector = new ColumnsCollector(metadata.partitionColumns());
     }
 
     public MemtableAllocator getAllocator()
@@ -179,6 +194,11 @@
         return commitLogLowerBound.get();
     }
 
+    public ReplayPosition getCommitLogUpperBound()
+    {
+        return commitLogUpperBound.get();
+    }
+
     public boolean isLive()
     {
         return allocator.isLive();
@@ -186,7 +206,7 @@
 
     public boolean isClean()
     {
-        return rows.isEmpty();
+        return partitions.isEmpty();
     }
 
     public boolean mayContainDataBefore(ReplayPosition position)
@@ -199,7 +219,7 @@
      */
     public boolean isExpired()
     {
-        int period = cfs.metadata.getMemtableFlushPeriod();
+        int period = cfs.metadata.params.memtableFlushPeriodInMs;
         return period > 0 && (System.nanoTime() - creationNano >= TimeUnit.MILLISECONDS.toNanos(period));
     }
 
@@ -209,55 +229,40 @@
      *
      * replayPosition should only be null if this is a secondary index, in which case it is *expected* to be null
      */
-    long put(DecoratedKey key, ColumnFamily cf, SecondaryIndexManager.Updater indexer, OpOrder.Group opGroup)
+    long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
     {
-        AtomicBTreeColumns previous = rows.get(key);
+        AtomicBTreePartition previous = partitions.get(update.partitionKey());
 
         long initialSize = 0;
         if (previous == null)
         {
-            AtomicBTreeColumns empty = cf.cloneMeShallow(AtomicBTreeColumns.factory, false);
-            final DecoratedKey cloneKey = allocator.clone(key, opGroup);
+            final DecoratedKey cloneKey = allocator.clone(update.partitionKey(), opGroup);
+            AtomicBTreePartition empty = new AtomicBTreePartition(cfs.metadata, cloneKey, allocator);
             // We'll add the columns later. This avoids wasting works if we get beaten in the putIfAbsent
-            previous = rows.putIfAbsent(cloneKey, empty);
+            previous = partitions.putIfAbsent(cloneKey, empty);
             if (previous == null)
             {
                 previous = empty;
                 // allocate the row overhead after the fact; this saves over allocating and having to free after, but
                 // means we can overshoot our declared limit.
-                int overhead = (int) (key.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
+                int overhead = (int) (cloneKey.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
                 allocator.onHeap().allocate(overhead, opGroup);
                 initialSize = 8;
             }
-            else
-            {
-                allocator.reclaimer().reclaimImmediately(cloneKey);
-            }
         }
 
-        final AtomicBTreeColumns.ColumnUpdater updater = previous.addAllWithSizeDelta(cf, allocator, opGroup, indexer);
-        minTimestamp = Math.min(minTimestamp, updater.minTimestamp);
-        liveDataSize.addAndGet(initialSize + updater.dataSize);
-        currentOperations.addAndGet(cf.getColumnCount() + (cf.isMarkedForDelete() ? 1 : 0) + cf.deletionInfo().rangeCount());
-        return updater.colUpdateTimeDelta;
-    }
-
-    // for debugging
-    public String contents()
-    {
-        StringBuilder builder = new StringBuilder();
-        builder.append("{");
-        for (Map.Entry<RowPosition, AtomicBTreeColumns> entry : rows.entrySet())
-        {
-            builder.append(entry.getKey()).append(": ").append(entry.getValue()).append(", ");
-        }
-        builder.append("}");
-        return builder.toString();
+        long[] pair = previous.addAllWithSizeDelta(update, opGroup, indexer);
+        minTimestamp = Math.min(minTimestamp, previous.stats().minTimestamp);
+        liveDataSize.addAndGet(initialSize + pair[0]);
+        columnsCollector.update(update.columns());
+        statsCollector.update(update.stats());
+        currentOperations.addAndGet(update.operationCount());
+        return pair[1];
     }
 
     public int partitionCount()
     {
-        return rows.size();
+        return partitions.size();
     }
 
     public String toString()
@@ -267,63 +272,58 @@
                              100 * allocator.onHeap().ownershipRatio(), 100 * allocator.offHeap().ownershipRatio());
     }
 
-    /**
-     * @param startWith Include data in the result from and including this key and to the end of the memtable
-     * @return An iterator of entries with the data from the start key
-     */
-    public Iterator<Map.Entry<DecoratedKey, ColumnFamily>> getEntryIterator(final RowPosition startWith, final RowPosition stopAt)
+    public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFilter columnFilter, final DataRange dataRange, final boolean isForThrift)
     {
-        return new Iterator<Map.Entry<DecoratedKey, ColumnFamily>>()
+        AbstractBounds<PartitionPosition> keyRange = dataRange.keyRange();
+
+        boolean startIsMin = keyRange.left.isMinimum();
+        boolean stopIsMin = keyRange.right.isMinimum();
+
+        boolean isBound = keyRange instanceof Bounds;
+        boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
+        boolean includeStop = isBound || keyRange instanceof Range;
+        Map<PartitionPosition, AtomicBTreePartition> subMap;
+        if (startIsMin)
+            subMap = stopIsMin ? partitions : partitions.headMap(keyRange.right, includeStop);
+        else
+            subMap = stopIsMin
+                   ? partitions.tailMap(keyRange.left, includeStart)
+                   : partitions.subMap(keyRange.left, includeStart, keyRange.right, includeStop);
+
+        int minLocalDeletionTime = Integer.MAX_VALUE;
+
+        // avoid iterating over the memtable if we purge all tombstones
+        if (cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones())
+            minLocalDeletionTime = findMinLocalDeletionTime(subMap.entrySet().iterator());
+
+        final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter = subMap.entrySet().iterator();
+
+        return new MemtableUnfilteredPartitionIterator(cfs, iter, isForThrift, minLocalDeletionTime, columnFilter, dataRange);
+    }
+
+    private int findMinLocalDeletionTime(Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iterator)
+    {
+        int minLocalDeletionTime = Integer.MAX_VALUE;
+        while (iterator.hasNext())
         {
-            private Iterator<? extends Map.Entry<? extends RowPosition, AtomicBTreeColumns>> iter = stopAt.isMinimum()
-                    ? rows.tailMap(startWith).entrySet().iterator()
-                    : rows.subMap(startWith, true, stopAt, true).entrySet().iterator();
-
-            private Map.Entry<? extends RowPosition, ? extends ColumnFamily> currentEntry;
-
-            public boolean hasNext()
-            {
-                return iter.hasNext();
-            }
-
-            public Map.Entry<DecoratedKey, ColumnFamily> next()
-            {
-                Map.Entry<? extends RowPosition, ? extends ColumnFamily> entryRowPosition = iter.next();
-                // Actual stored key should be true DecoratedKey
-                assert entryRowPosition.getKey() instanceof DecoratedKey;
-                @SuppressWarnings("unchecked") // Object cast is required since otherwise we can't turn RowPosition into DecoratedKey
-                Map.Entry<DecoratedKey, ColumnFamily> entry = (Map.Entry<DecoratedKey, ColumnFamily>) entryRowPosition;
-                if (MEMORY_POOL.needToCopyOnHeap())
-                {
-                    DecoratedKey key = entry.getKey();
-                    key = new BufferDecoratedKey(key.getToken(), HeapAllocator.instance.clone(key.getKey()));
-                    ColumnFamily cells = ArrayBackedSortedColumns.localCopy(entry.getValue(), HeapAllocator.instance);
-                    entry = new AbstractMap.SimpleImmutableEntry<>(key, cells);
-                }
-                // Store the reference to the current entry so that remove() can update the current size.
-                currentEntry = entry;
-                return entry;
-            }
-
-            public void remove()
-            {
-                iter.remove();
-                liveDataSize.addAndGet(-currentEntry.getValue().dataSize());
-                currentEntry = null;
-            }
-        };
+            Map.Entry<PartitionPosition, AtomicBTreePartition> entry = iterator.next();
+            minLocalDeletionTime = Math.min(minLocalDeletionTime, entry.getValue().stats().minLocalDeletionTime);
+        }
+        return minLocalDeletionTime;
     }
 
-    public ColumnFamily getColumnFamily(DecoratedKey key)
+    public Partition getPartition(DecoratedKey key)
     {
-        return rows.get(key);
+        return partitions.get(key);
     }
 
-    public SSTableReader flush()
+    public Collection<SSTableReader> flush()
     {
         long estimatedSize = estimatedSize();
-        Directories.DataDirectory dataDirectory = cfs.directories.getWriteableLocation(estimatedSize);
-        File sstableDirectory = cfs.directories.getLocationForDisk(dataDirectory);
+        Directories.DataDirectory dataDirectory = cfs.getDirectories().getWriteableLocation(estimatedSize);
+        if (dataDirectory == null)
+            throw new RuntimeException("Insufficient disk space to write " + estimatedSize + " bytes");
+        File sstableDirectory = cfs.getDirectories().getLocationForDisk(dataDirectory);
         assert sstableDirectory != null : "Flush task is not bound to any disk";
         return writeSortedContents(sstableDirectory);
     }
@@ -333,10 +333,19 @@
         return minTimestamp;
     }
 
+    /**
+     * For testing only. Give this memtable too big a size to make it always fail flushing.
+     */
+    @VisibleForTesting
+    public void makeUnflushable()
+    {
+        liveDataSize.addAndGet(1L * 1024 * 1024 * 1024 * 1024 * 1024);
+    }
+
     private long estimatedSize()
     {
         long keySize = 0;
-        for (RowPosition key : rows.keySet())
+        for (PartitionPosition key : partitions.keySet())
         {
             //  make sure we don't write non-sensical keys
             assert key instanceof DecoratedKey;
@@ -348,71 +357,94 @@
                        * 1.2); // bloom filter and row index overhead
     }
 
-    private SSTableReader writeSortedContents(File sstableDirectory)
+    private Collection<SSTableReader> writeSortedContents(File sstableDirectory)
     {
-        logger.info("Writing {}", Memtable.this.toString());
+        boolean isBatchLogTable = cfs.name.equals(SystemKeyspace.BATCHES) && cfs.keyspace.getName().equals(SystemKeyspace.NAME);
 
-        SSTableReader ssTable;
-        // errors when creating the writer that may leave empty temp files.
-        try (SSTableWriter writer = createFlushWriter(cfs.getTempSSTablePath(sstableDirectory)))
+        logger.debug("Writing {}", Memtable.this.toString());
+
+        Collection<SSTableReader> ssTables;
+        try (SSTableTxnWriter writer = createFlushWriter(cfs.getSSTablePath(sstableDirectory), columnsCollector.get(), statsCollector.get()))
         {
             boolean trackContention = logger.isTraceEnabled();
             int heavilyContendedRowCount = 0;
             // (we can't clear out the map as-we-go to free up memory,
             //  since the memtable is being used for queries in the "pending flush" category)
-            for (Map.Entry<RowPosition, AtomicBTreeColumns> entry : rows.entrySet())
+            for (AtomicBTreePartition partition : partitions.values())
             {
-                AtomicBTreeColumns cf = entry.getValue();
+                // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
+                // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local,
+                // we don't need to preserve tombstones for repair. So if both operation are in this
+                // memtable (which will almost always be the case if there is no ongoing failure), we can
+                // just skip the entry (CASSANDRA-4667).
+                if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
+                    continue;
 
-                if (cf.isMarkedForDelete() && cf.hasColumns())
-                {
-                    // When every node is up, there's no reason to write batchlog data out to sstables
-                    // (which in turn incurs cost like compaction) since the BL write + delete cancel each other out,
-                    // and BL data is strictly local, so we don't need to preserve tombstones for repair.
-                    // If we have a data row + row level tombstone, then writing it is effectively an expensive no-op so we skip it.
-                    // See CASSANDRA-4667.
-                    if (cfs.name.equals(SystemKeyspace.BATCHLOG) && cfs.keyspace.getName().equals(SystemKeyspace.NAME))
-                        continue;
-                }
-
-                if (trackContention && cf.usePessimisticLocking())
+                if (trackContention && partition.useLock())
                     heavilyContendedRowCount++;
 
-                if (!cf.isEmpty())
-                    writer.append((DecoratedKey)entry.getKey(), cf);
+                if (!partition.isEmpty())
+                {
+                    try (UnfilteredRowIterator iter = partition.unfilteredIterator())
+                    {
+                        writer.append(iter);
+                    }
+                }
             }
 
             if (writer.getFilePointer() > 0)
             {
                 logger.debug(String.format("Completed flushing %s (%s) for commitlog position %s",
                                            writer.getFilename(),
-                                           FBUtilities.prettyPrintMemory(writer.getOnDiskFilePointer()),
+                                           FBUtilities.prettyPrintMemory(writer.getFilePointer()),
                                            commitLogUpperBound));
 
-                // temp sstables should contain non-repaired data.
-                ssTable = writer.finish(true);
+                // sstables should contain non-repaired data.
+                ssTables = writer.finish(true);
             }
             else
             {
                 logger.debug("Completed flushing {}; nothing needed to be retained.  Commitlog position was {}",
                              writer.getFilename(), commitLogUpperBound);
                 writer.abort();
-                ssTable = null;
+                ssTables = Collections.emptyList();
             }
 
             if (heavilyContendedRowCount > 0)
-                logger.trace(String.format("High update contention in %d/%d partitions of %s ", heavilyContendedRowCount, rows.size(), Memtable.this.toString()));
+                logger.trace(String.format("High update contention in %d/%d partitions of %s ", heavilyContendedRowCount, partitions.size(), Memtable.this.toString()));
 
-            return ssTable;
+            return ssTables;
         }
     }
 
-    private SSTableWriter createFlushWriter(String filename)
+    @SuppressWarnings("resource") // log and writer closed by SSTableTxnWriter
+    public SSTableTxnWriter createFlushWriter(String filename,
+                                              PartitionColumns columns,
+                                              EncodingStats stats)
     {
-        MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.metadata.comparator)
-                                                     .commitLogLowerBound(commitLogLowerBound.get())
-                                                     .commitLogUpperBound(commitLogUpperBound.get());
-        return SSTableWriter.create(Descriptor.fromFilename(filename), (long) rows.size(), ActiveRepairService.UNREPAIRED_SSTABLE, cfs.metadata, cfs.partitioner, sstableMetadataCollector);
+        // we operate "offline" here, as we expose the resulting reader consciously when done
+        // (although we may want to modify this behaviour in future, to encapsulate full flush behaviour in LifecycleTransaction)
+        LifecycleTransaction txn = null;
+        try
+        {
+            txn = LifecycleTransaction.offline(OperationType.FLUSH);
+            MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.metadata.comparator)
+                    .commitLogIntervals(new IntervalSet(commitLogLowerBound.get(), commitLogUpperBound.get()));
+
+            return new SSTableTxnWriter(txn,
+                                        cfs.createSSTableMultiWriter(Descriptor.fromFilename(filename),
+                                                                     (long) partitions.size(),
+                                                                     ActiveRepairService.UNREPAIRED_SSTABLE,
+                                                                     sstableMetadataCollector,
+                                                                     new SerializationHeader(true, cfs.metadata, columns, stats),
+                                                                     txn));
+        }
+        catch (Throwable t)
+        {
+            if (txn != null)
+                txn.close();
+            throw t;
+        }
     }
 
     private static int estimateRowOverhead(final int count)
@@ -422,17 +454,132 @@
         {
             int rowOverhead;
             MemtableAllocator allocator = MEMORY_POOL.newAllocator();
-            ConcurrentNavigableMap<RowPosition, Object> rows = new ConcurrentSkipListMap<>();
+            ConcurrentNavigableMap<PartitionPosition, Object> partitions = new ConcurrentSkipListMap<>();
             final Object val = new Object();
-            for (int i = 0; i < count; i++)
-                rows.put(allocator.clone(new BufferDecoratedKey(new LongToken(i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group), val);
-            double avgSize = ObjectSizes.measureDeep(rows) / (double) count;
+            for (int i = 0 ; i < count ; i++)
+                partitions.put(allocator.clone(new BufferDecoratedKey(new LongToken(i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group), val);
+            double avgSize = ObjectSizes.measureDeep(partitions) / (double) count;
             rowOverhead = (int) ((avgSize - Math.floor(avgSize)) < 0.05 ? Math.floor(avgSize) : Math.ceil(avgSize));
             rowOverhead -= ObjectSizes.measureDeep(new LongToken(0));
-            rowOverhead += AtomicBTreeColumns.EMPTY_SIZE;
+            rowOverhead += AtomicBTreePartition.EMPTY_SIZE;
             allocator.setDiscarding();
             allocator.setDiscarded();
             return rowOverhead;
         }
     }
+
+    public static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator
+    {
+        private final ColumnFamilyStore cfs;
+        private final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter;
+        private final boolean isForThrift;
+        private final int minLocalDeletionTime;
+        private final ColumnFilter columnFilter;
+        private final DataRange dataRange;
+
+        public MemtableUnfilteredPartitionIterator(ColumnFamilyStore cfs, Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter, boolean isForThrift, int minLocalDeletionTime, ColumnFilter columnFilter, DataRange dataRange)
+        {
+            this.cfs = cfs;
+            this.iter = iter;
+            this.isForThrift = isForThrift;
+            this.minLocalDeletionTime = minLocalDeletionTime;
+            this.columnFilter = columnFilter;
+            this.dataRange = dataRange;
+        }
+
+        public boolean isForThrift()
+        {
+            return isForThrift;
+        }
+
+        public int getMinLocalDeletionTime()
+        {
+            return minLocalDeletionTime;
+        }
+
+        public CFMetaData metadata()
+        {
+            return cfs.metadata;
+        }
+
+        public boolean hasNext()
+        {
+            return iter.hasNext();
+        }
+
+        public UnfilteredRowIterator next()
+        {
+            Map.Entry<PartitionPosition, AtomicBTreePartition> entry = iter.next();
+            // Actual stored key should be true DecoratedKey
+            assert entry.getKey() instanceof DecoratedKey;
+            DecoratedKey key = (DecoratedKey)entry.getKey();
+            ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key);
+            return filter.getUnfilteredRowIterator(columnFilter, entry.getValue());
+        }
+    }
+
+    private static class ColumnsCollector
+    {
+        private final HashMap<ColumnDefinition, AtomicBoolean> predefined = new HashMap<>();
+        private final ConcurrentSkipListSet<ColumnDefinition> extra = new ConcurrentSkipListSet<>();
+        ColumnsCollector(PartitionColumns columns)
+        {
+            for (ColumnDefinition def : columns.statics)
+                predefined.put(def, new AtomicBoolean());
+            for (ColumnDefinition def : columns.regulars)
+                predefined.put(def, new AtomicBoolean());
+        }
+
+        public void update(PartitionColumns columns)
+        {
+            for (ColumnDefinition s : columns.statics)
+                update(s);
+            for (ColumnDefinition r : columns.regulars)
+                update(r);
+        }
+
+        private void update(ColumnDefinition definition)
+        {
+            AtomicBoolean present = predefined.get(definition);
+            if (present != null)
+            {
+                if (!present.get())
+                    present.set(true);
+            }
+            else
+            {
+                extra.add(definition);
+            }
+        }
+
+        public PartitionColumns get()
+        {
+            PartitionColumns.Builder builder = PartitionColumns.builder();
+            for (Map.Entry<ColumnDefinition, AtomicBoolean> e : predefined.entrySet())
+                if (e.getValue().get())
+                    builder.add(e.getKey());
+            return builder.addAll(extra).build();
+        }
+    }
+
+    private static class StatsCollector
+    {
+        private final AtomicReference<EncodingStats> stats = new AtomicReference<>(EncodingStats.NO_STATS);
+
+        public void update(EncodingStats newStats)
+        {
+            while (true)
+            {
+                EncodingStats current = stats.get();
+                EncodingStats updated = current.mergeWith(newStats);
+                if (stats.compareAndSet(current, updated))
+                    return;
+            }
+        }
+
+        public EncodingStats get()
+        {
+            return stats.get();
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java b/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java
index ab934c6..3666b27 100644
--- a/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/MigrationRequestVerbHandler.java

@@ -26,7 +26,7 @@
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.service.MigrationManager;
 
 /**
@@ -41,7 +41,7 @@
     {
         logger.trace("Received migration request from {}.", message.from);
         MessageOut<Collection<Mutation>> response = new MessageOut<>(MessagingService.Verb.INTERNAL_RESPONSE,
-                                                                     LegacySchemaTables.convertSchemaToMutations(),
+                                                                     SchemaKeyspace.convertSchemaToMutations(),
                                                                      MigrationManager.MigrationsSerializer.instance);
         MessagingService.instance().sendReply(response, id, message.from);
     }

diff --git a/src/java/org/apache/cassandra/db/MultiCBuilder.java b/src/java/org/apache/cassandra/db/MultiCBuilder.java
new file mode 100644
index 0000000..7c77ab0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/MultiCBuilder.java

@@ -0,0 +1,379 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.NavigableSet;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+/**
+ * Builder that allow to build multiple Clustering/Slice.Bound at the same time.
+ */
+public class MultiCBuilder
+{
+    /**
+     * The table comparator.
+     */
+    private final ClusteringComparator comparator;
+
+    /**
+     * The elements of the clusterings
+     */
+    private final List<List<ByteBuffer>> elementsList = new ArrayList<>();
+
+    /**
+     * The number of elements that have been added.
+     */
+    private int size;
+
+    /**
+     * <code>true</code> if the clusterings have been build, <code>false</code> otherwise.
+     */
+    private boolean built;
+
+    /**
+     * <code>true</code> if the clusterings contains some <code>null</code> elements.
+     */
+    private boolean containsNull;
+
+    /**
+     * <code>true</code> if the composites contains some <code>unset</code> elements.
+     */
+    private boolean containsUnset;
+
+    /**
+     * <code>true</code> if some empty collection have been added.
+     */
+    private boolean hasMissingElements;
+
+    private MultiCBuilder(ClusteringComparator comparator)
+    {
+        this.comparator = comparator;
+    }
+
+    /**
+     * Creates a new empty {@code MultiCBuilder}.
+     */
+    public static MultiCBuilder create(ClusteringComparator comparator)
+    {
+        return new MultiCBuilder(comparator);
+    }
+
+    /**
+     * Checks if this builder is empty.
+     *
+     * @return <code>true</code> if this builder is empty, <code>false</code> otherwise.
+     */
+    private boolean isEmpty()
+    {
+        return elementsList.isEmpty();
+    }
+
+    /**
+     * Adds the specified element to all the clusterings.
+     * <p>
+     * If this builder contains 2 clustering: A-B and A-C a call to this method to add D will result in the clusterings:
+     * A-B-D and A-C-D.
+     * </p>
+     *
+     * @param value the value of the next element
+     * @return this <code>MulitCBuilder</code>
+     */
+    public MultiCBuilder addElementToAll(ByteBuffer value)
+    {
+        checkUpdateable();
+
+        if (isEmpty())
+            elementsList.add(new ArrayList<ByteBuffer>());
+
+        for (int i = 0, m = elementsList.size(); i < m; i++)
+        {
+            if (value == null)
+                containsNull = true;
+            if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
+                containsUnset = true;
+
+            elementsList.get(i).add(value);
+        }
+        size++;
+        return this;
+    }
+
+    /**
+     * Adds individually each of the specified elements to the end of all of the existing clusterings.
+     * <p>
+     * If this builder contains 2 clusterings: A-B and A-C a call to this method to add D and E will result in the 4
+     * clusterings: A-B-D, A-B-E, A-C-D and A-C-E.
+     * </p>
+     *
+     * @param values the elements to add
+     * @return this <code>CompositeBuilder</code>
+     */
+    public MultiCBuilder addEachElementToAll(List<ByteBuffer> values)
+    {
+        checkUpdateable();
+
+        if (isEmpty())
+            elementsList.add(new ArrayList<ByteBuffer>());
+
+        if (values.isEmpty())
+        {
+            hasMissingElements = true;
+        }
+        else
+        {
+            for (int i = 0, m = elementsList.size(); i < m; i++)
+            {
+                List<ByteBuffer> oldComposite = elementsList.remove(0);
+
+                for (int j = 0, n = values.size(); j < n; j++)
+                {
+                    List<ByteBuffer> newComposite = new ArrayList<>(oldComposite);
+                    elementsList.add(newComposite);
+
+                    ByteBuffer value = values.get(j);
+
+                    if (value == null)
+                        containsNull = true;
+                    if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
+                        containsUnset = true;
+
+                    newComposite.add(values.get(j));
+                }
+            }
+        }
+        size++;
+        return this;
+    }
+
+    /**
+     * Adds individually each of the specified list of elements to the end of all of the existing composites.
+     * <p>
+     * If this builder contains 2 composites: A-B and A-C a call to this method to add [[D, E], [F, G]] will result in the 4
+     * composites: A-B-D-E, A-B-F-G, A-C-D-E and A-C-F-G.
+     * </p>
+     *
+     * @param values the elements to add
+     * @return this <code>CompositeBuilder</code>
+     */
+    public MultiCBuilder addAllElementsToAll(List<List<ByteBuffer>> values)
+    {
+        checkUpdateable();
+
+        if (isEmpty())
+            elementsList.add(new ArrayList<ByteBuffer>());
+
+        if (values.isEmpty())
+        {
+            hasMissingElements = true;
+        }
+        else
+        {
+            for (int i = 0, m = elementsList.size(); i < m; i++)
+            {
+                List<ByteBuffer> oldComposite = elementsList.remove(0);
+
+                for (int j = 0, n = values.size(); j < n; j++)
+                {
+                    List<ByteBuffer> newComposite = new ArrayList<>(oldComposite);
+                    elementsList.add(newComposite);
+
+                    List<ByteBuffer> value = values.get(j);
+
+                    if (value.contains(null))
+                        containsNull = true;
+                    if (value.contains(ByteBufferUtil.UNSET_BYTE_BUFFER))
+                        containsUnset = true;
+
+                    newComposite.addAll(value);
+                }
+            }
+            size += values.get(0).size();
+        }
+        return this;
+    }
+
+    /**
+     * Returns the number of elements that can be added to the clusterings.
+     *
+     * @return the number of elements that can be added to the clusterings.
+     */
+    public int remainingCount()
+    {
+        return comparator.size() - size;
+    }
+
+    /**
+     * Checks if the clusterings contains null elements.
+     *
+     * @return <code>true</code> if the clusterings contains <code>null</code> elements, <code>false</code> otherwise.
+     */
+    public boolean containsNull()
+    {
+        return containsNull;
+    }
+
+    /**
+     * Checks if the clusterings contains unset elements.
+     *
+     * @return <code>true</code> if the clusterings contains <code>unset</code> elements, <code>false</code> otherwise.
+     */
+    public boolean containsUnset()
+    {
+        return containsUnset;
+    }
+
+    /**
+     * Checks if some empty list of values have been added
+     * @return <code>true</code> if the clusterings have some missing elements, <code>false</code> otherwise.
+     */
+    public boolean hasMissingElements()
+    {
+        return hasMissingElements;
+    }
+
+    /**
+     * Builds the <code>clusterings</code>.
+     *
+     * @return the clusterings
+     */
+    public NavigableSet<Clustering> build()
+    {
+        built = true;
+
+        if (hasMissingElements)
+            return BTreeSet.empty(comparator);
+
+        CBuilder builder = CBuilder.create(comparator);
+
+        if (elementsList.isEmpty())
+            return BTreeSet.of(builder.comparator(), builder.build());
+
+        BTreeSet.Builder<Clustering> set = BTreeSet.builder(builder.comparator());
+        for (int i = 0, m = elementsList.size(); i < m; i++)
+        {
+            List<ByteBuffer> elements = elementsList.get(i);
+            set.add(builder.buildWith(elements));
+        }
+        return set.build();
+    }
+
+    /**
+     * Builds the <code>Slice.Bound</code>s for slice restrictions.
+     *
+     * @param isStart specify if the bound is a start one
+     * @param isInclusive specify if the bound is inclusive or not
+     * @param isOtherBoundInclusive specify if the other bound is inclusive or not
+     * @param columnDefs the columns of the slice restriction
+     * @return the <code>Slice.Bound</code>s
+     */
+    public NavigableSet<Slice.Bound> buildBoundForSlice(boolean isStart,
+                                                        boolean isInclusive,
+                                                        boolean isOtherBoundInclusive,
+                                                        List<ColumnDefinition> columnDefs)
+    {
+        built = true;
+
+        if (hasMissingElements)
+            return BTreeSet.empty(comparator);
+
+        CBuilder builder = CBuilder.create(comparator);
+
+        if (elementsList.isEmpty())
+            return BTreeSet.of(comparator, builder.buildBound(isStart, isInclusive));
+
+        // Use a TreeSet to sort and eliminate duplicates
+        BTreeSet.Builder<Slice.Bound> set = BTreeSet.builder(comparator);
+
+        // The first column of the slice might not be the first clustering column (e.g. clustering_0 = ? AND (clustering_1, clustering_2) >= (?, ?)
+        int offset = columnDefs.get(0).position();
+
+        for (int i = 0, m = elementsList.size(); i < m; i++)
+        {
+            List<ByteBuffer> elements = elementsList.get(i);
+
+            // Handle the no bound case
+            if (elements.size() == offset)
+            {
+                set.add(builder.buildBoundWith(elements, isStart, true));
+                continue;
+            }
+
+            // In the case of mixed order columns, we will have some extra slices where the columns change directions.
+            // For example: if we have clustering_0 DESC and clustering_1 ASC a slice like (clustering_0, clustering_1) > (1, 2)
+            // will produce 2 slices: [BOTTOM, 1) and (1.2, 1]
+            // So, the END bound will return 2 bounds with the same values 1
+            ColumnDefinition lastColumn = columnDefs.get(columnDefs.size() - 1);
+            if (elements.size() <= lastColumn.position() && i < m - 1 && elements.equals(elementsList.get(i + 1)))
+            {
+                set.add(builder.buildBoundWith(elements, isStart, false));
+                set.add(builder.buildBoundWith(elementsList.get(i++), isStart, true));
+                continue;
+            }
+
+            // Handle the normal bounds
+            ColumnDefinition column = columnDefs.get(elements.size() - 1 - offset);
+            set.add(builder.buildBoundWith(elements, isStart, column.isReversedType() ? isOtherBoundInclusive : isInclusive));
+        }
+        return set.build();
+    }
+
+    public NavigableSet<Slice.Bound> buildBound(boolean isStart, boolean isInclusive)
+    {
+        built = true;
+
+        if (hasMissingElements)
+            return BTreeSet.empty(comparator);
+
+        CBuilder builder = CBuilder.create(comparator);
+
+        if (elementsList.isEmpty())
+            return BTreeSet.of(comparator, builder.buildBound(isStart, isInclusive));
+
+        // Use a TreeSet to sort and eliminate duplicates
+        BTreeSet.Builder<Slice.Bound> set = BTreeSet.builder(comparator);
+
+        for (int i = 0, m = elementsList.size(); i < m; i++)
+        {
+            List<ByteBuffer> elements = elementsList.get(i);
+            set.add(builder.buildBoundWith(elements, isStart, isInclusive));
+        }
+        return set.build();
+    }
+
+    /**
+     * Checks if some elements can still be added to the clusterings.
+     *
+     * @return <code>true</code> if it is possible to add more elements to the clusterings, <code>false</code> otherwise.
+     */
+    public boolean hasRemaining()
+    {
+        return remainingCount() > 0;
+    }
+
+    private void checkUpdateable()
+    {
+        if (!hasRemaining() || built)
+            throw new IllegalStateException("this builder cannot be updated anymore");
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/MutableDeletionInfo.java b/src/java/org/apache/cassandra/db/MutableDeletionInfo.java
new file mode 100644
index 0000000..d01b1d1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/MutableDeletionInfo.java

@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.Iterator;
+
+import com.google.common.base.Objects;
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A mutable implementation of {@code DeletionInfo}.
+ */
+public class MutableDeletionInfo implements DeletionInfo
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new MutableDeletionInfo(0, 0));
+
+    /**
+     * This represents a deletion of the entire partition. We can't represent this within the RangeTombstoneList, so it's
+     * kept separately. This also slightly optimizes the common case of a full partition deletion.
+     */
+    private DeletionTime partitionDeletion;
+
+    /**
+     * A list of range tombstones within the partition.  This is left as null if there are no range tombstones
+     * (to save an allocation (since it's a common case).
+     */
+    private RangeTombstoneList ranges;
+
+    /**
+     * Creates a DeletionInfo with only a top-level (row) tombstone.
+     * @param markedForDeleteAt the time after which the entire row should be considered deleted
+     * @param localDeletionTime what time the deletion write was applied locally (for purposes of
+     *                          purging the tombstone after gc_grace_seconds).
+     */
+    public MutableDeletionInfo(long markedForDeleteAt, int localDeletionTime)
+    {
+        // Pre-1.1 node may return MIN_VALUE for non-deleted container, but the new default is MAX_VALUE
+        // (see CASSANDRA-3872)
+        this(new DeletionTime(markedForDeleteAt, localDeletionTime == Integer.MIN_VALUE ? Integer.MAX_VALUE : localDeletionTime));
+    }
+
+    public MutableDeletionInfo(DeletionTime partitionDeletion)
+    {
+        this(partitionDeletion, null);
+    }
+
+    public MutableDeletionInfo(DeletionTime partitionDeletion, RangeTombstoneList ranges)
+    {
+        this.partitionDeletion = partitionDeletion;
+        this.ranges = ranges;
+    }
+
+    /**
+     * Returns a new DeletionInfo that has no top-level tombstone or any range tombstones.
+     */
+    public static MutableDeletionInfo live()
+    {
+        return new MutableDeletionInfo(DeletionTime.LIVE);
+    }
+
+    public MutableDeletionInfo mutableCopy()
+    {
+        return new MutableDeletionInfo(partitionDeletion, ranges == null ? null : ranges.copy());
+    }
+
+    public MutableDeletionInfo copy(AbstractAllocator allocator)
+    {
+        RangeTombstoneList rangesCopy = null;
+        if (ranges != null)
+             rangesCopy = ranges.copy(allocator);
+
+        return new MutableDeletionInfo(partitionDeletion, rangesCopy);
+    }
+
+    /**
+     * Returns whether this DeletionInfo is live, that is deletes no columns.
+     */
+    public boolean isLive()
+    {
+        return partitionDeletion.isLive() && (ranges == null || ranges.isEmpty());
+    }
+
+    /**
+     * Potentially replaces the top-level tombstone with another, keeping whichever has the higher markedForDeleteAt
+     * timestamp.
+     * @param newInfo the deletion time to add to this deletion info.
+     */
+    public void add(DeletionTime newInfo)
+    {
+        if (newInfo.supersedes(partitionDeletion))
+            partitionDeletion = newInfo;
+    }
+
+    public void add(RangeTombstone tombstone, ClusteringComparator comparator)
+    {
+        if (ranges == null)
+            ranges = new RangeTombstoneList(comparator, 1);
+
+        ranges.add(tombstone);
+    }
+
+    /**
+     * Combines another DeletionInfo with this one and returns the result.  Whichever top-level tombstone
+     * has the higher markedForDeleteAt timestamp will be kept, along with its localDeletionTime.  The
+     * range tombstones will be combined.
+     *
+     * @return this object.
+     */
+    public DeletionInfo add(DeletionInfo newInfo)
+    {
+        add(newInfo.getPartitionDeletion());
+
+        // We know MutableDeletionInfo is the only impelementation and we're not mutating it, it's just to get access to the
+        // RangeTombstoneList directly.
+        assert newInfo instanceof MutableDeletionInfo;
+        RangeTombstoneList newRanges = ((MutableDeletionInfo)newInfo).ranges;
+
+        if (ranges == null)
+            ranges = newRanges == null ? null : newRanges.copy();
+        else if (newRanges != null)
+            ranges.addAll(newRanges);
+
+        return this;
+    }
+
+    public DeletionTime getPartitionDeletion()
+    {
+        return partitionDeletion;
+    }
+
+    // Use sparingly, not the most efficient thing
+    public Iterator<RangeTombstone> rangeIterator(boolean reversed)
+    {
+        return ranges == null ? Iterators.<RangeTombstone>emptyIterator() : ranges.iterator(reversed);
+    }
+
+    public Iterator<RangeTombstone> rangeIterator(Slice slice, boolean reversed)
+    {
+        return ranges == null ? Iterators.<RangeTombstone>emptyIterator() : ranges.iterator(slice, reversed);
+    }
+
+    public RangeTombstone rangeCovering(Clustering name)
+    {
+        return ranges == null ? null : ranges.search(name);
+    }
+
+    public int dataSize()
+    {
+        int size = TypeSizes.sizeof(partitionDeletion.markedForDeleteAt());
+        return size + (ranges == null ? 0 : ranges.dataSize());
+    }
+
+    public boolean hasRanges()
+    {
+        return ranges != null && !ranges.isEmpty();
+    }
+
+    public int rangeCount()
+    {
+        return hasRanges() ? ranges.size() : 0;
+    }
+
+    public long maxTimestamp()
+    {
+        return ranges == null ? partitionDeletion.markedForDeleteAt() : Math.max(partitionDeletion.markedForDeleteAt(), ranges.maxMarkedAt());
+    }
+
+    /**
+     * Whether this deletion info may modify the provided one if added to it.
+     */
+    public boolean mayModify(DeletionInfo delInfo)
+    {
+        return partitionDeletion.compareTo(delInfo.getPartitionDeletion()) > 0 || hasRanges();
+    }
+
+    @Override
+    public String toString()
+    {
+        if (ranges == null || ranges.isEmpty())
+            return String.format("{%s}", partitionDeletion);
+        else
+            return String.format("{%s, ranges=%s}", partitionDeletion, rangesAsString());
+    }
+
+    private String rangesAsString()
+    {
+        assert !ranges.isEmpty();
+        StringBuilder sb = new StringBuilder();
+        ClusteringComparator cc = ranges.comparator();
+        Iterator<RangeTombstone> iter = rangeIterator(false);
+        while (iter.hasNext())
+        {
+            RangeTombstone i = iter.next();
+            sb.append(i.deletedSlice().toString(cc));
+            sb.append('@');
+            sb.append(i.deletionTime());
+        }
+        return sb.toString();
+    }
+
+    // Updates all the timestamp of the deletion contained in this DeletionInfo to be {@code timestamp}.
+    public DeletionInfo updateAllTimestamp(long timestamp)
+    {
+        if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE)
+            partitionDeletion = new DeletionTime(timestamp, partitionDeletion.localDeletionTime());
+
+        if (ranges != null)
+            ranges.updateAllTimestamp(timestamp);
+        return this;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if(!(o instanceof MutableDeletionInfo))
+            return false;
+        MutableDeletionInfo that = (MutableDeletionInfo)o;
+        return partitionDeletion.equals(that.partitionDeletion) && Objects.equal(ranges, that.ranges);
+    }
+
+    @Override
+    public final int hashCode()
+    {
+        return Objects.hashCode(partitionDeletion, ranges);
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE + partitionDeletion.unsharedHeapSize() + (ranges == null ? 0 : ranges.unsharedHeapSize());
+    }
+
+    public void collectStats(EncodingStats.Collector collector)
+    {
+        collector.update(partitionDeletion);
+        if (ranges != null)
+            ranges.collectStats(collector);
+    }
+
+    public static Builder builder(DeletionTime partitionLevelDeletion, ClusteringComparator comparator, boolean reversed)
+    {
+        return new Builder(partitionLevelDeletion, comparator, reversed);
+    }
+
+    /**
+     * Builds DeletionInfo object from (in order) range tombstone markers.
+     */
+    public static class Builder
+    {
+        private final MutableDeletionInfo deletion;
+        private final ClusteringComparator comparator;
+
+        private final boolean reversed;
+
+        private RangeTombstoneMarker openMarker;
+
+        private Builder(DeletionTime partitionLevelDeletion, ClusteringComparator comparator, boolean reversed)
+        {
+            this.deletion = new MutableDeletionInfo(partitionLevelDeletion);
+            this.comparator = comparator;
+            this.reversed = reversed;
+        }
+
+        public void add(RangeTombstoneMarker marker)
+        {
+            // We need to start by the close case in case that's a boundary
+
+            if (marker.isClose(reversed))
+            {
+                DeletionTime openDeletion = openMarker.openDeletionTime(reversed);
+                assert marker.closeDeletionTime(reversed).equals(openDeletion);
+
+                Slice.Bound open = openMarker.openBound(reversed);
+                Slice.Bound close = marker.closeBound(reversed);
+
+                Slice slice = reversed ? Slice.make(close, open) : Slice.make(open, close);
+                deletion.add(new RangeTombstone(slice, openDeletion), comparator);
+            }
+
+            if (marker.isOpen(reversed))
+            {
+                openMarker = marker;
+            }
+        }
+
+        public MutableDeletionInfo build()
+        {
+            return deletion;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/Mutation.java b/src/java/org/apache/cassandra/db/Mutation.java
index 8079ef8..7ed69c0 100644
--- a/src/java/org/apache/cassandra/db/Mutation.java
+++ b/src/java/org/apache/cassandra/db/Mutation.java

@@ -17,19 +17,23 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.atomic.AtomicLong;
 
+import com.google.common.base.Throwables;
+import com.google.common.util.concurrent.Uninterruptibles;
 import org.apache.commons.lang3.StringUtils;
-
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.SerializationHelper;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -51,43 +55,52 @@
     // when we remove it, also restore SerializationsTest.testMutationRead to not regenerate new Mutations each test
     private final String keyspaceName;
 
-    private final ByteBuffer key;
+    private final DecoratedKey key;
     // map of column family id to mutations for that column family.
-    private final Map<UUID, ColumnFamily> modifications;
+    private final Map<UUID, PartitionUpdate> modifications;
 
-    public Mutation(String keyspaceName, ByteBuffer key)
+    // Time at which this mutation was instantiated
+    public final long createdAt = System.currentTimeMillis();
+    // keep track of when mutation has started waiting for a MV partition lock
+    public final AtomicLong viewLockAcquireStart = new AtomicLong(0);
+
+    public Mutation(String keyspaceName, DecoratedKey key)
     {
-        this(keyspaceName, key, new HashMap<UUID, ColumnFamily>());
+        this(keyspaceName, key, new HashMap<>());
     }
 
-    public Mutation(String keyspaceName, ByteBuffer key, ColumnFamily cf)
+    public Mutation(PartitionUpdate update)
     {
-        this(keyspaceName, key, Collections.singletonMap(cf.id(), cf));
+        this(update.metadata().ksName, update.partitionKey(), Collections.singletonMap(update.metadata().cfId, update));
     }
 
-    public Mutation(String keyspaceName, Row row)
-    {
-        this(keyspaceName, row.key.getKey(), row.cf);
-    }
-
-    protected Mutation(String keyspaceName, ByteBuffer key, Map<UUID, ColumnFamily> modifications)
+    protected Mutation(String keyspaceName, DecoratedKey key, Map<UUID, PartitionUpdate> modifications)
     {
         this.keyspaceName = keyspaceName;
         this.key = key;
         this.modifications = modifications;
     }
 
-    public Mutation(ByteBuffer key, ColumnFamily cf)
-    {
-        this(cf.metadata().ksName, key, cf);
-    }
-
     public Mutation copy()
     {
-        Mutation copy = new Mutation(keyspaceName, key, new HashMap<>(modifications));
+        return new Mutation(keyspaceName, key, new HashMap<>(modifications));
+    }
+
+    public Mutation without(Set<UUID> cfIds)
+    {
+        if (cfIds.isEmpty())
+            return this;
+
+        Mutation copy = copy();
+        copy.modifications.keySet().removeAll(cfIds);
         return copy;
     }
 
+    public Mutation without(UUID cfId)
+    {
+        return without(Collections.singleton(cfId));
+    }
+
     public String getKeyspaceName()
     {
         return keyspaceName;
@@ -98,53 +111,35 @@
         return modifications.keySet();
     }
 
-    public ByteBuffer key()
+    public DecoratedKey key()
     {
         return key;
     }
 
-    public Collection<ColumnFamily> getColumnFamilies()
+    public Collection<PartitionUpdate> getPartitionUpdates()
     {
         return modifications.values();
     }
 
-    public ColumnFamily getColumnFamily(UUID cfId)
+    public PartitionUpdate getPartitionUpdate(UUID cfId)
     {
         return modifications.get(cfId);
     }
 
-    /*
-     * Specify a column family name and the corresponding column
-     * family object.
-     * param @ cf - column family name
-     * param @ columnFamily - the column family.
-     */
-    public void add(ColumnFamily columnFamily)
+    public Mutation add(PartitionUpdate update)
     {
-        assert columnFamily != null;
-        ColumnFamily prev = modifications.put(columnFamily.id(), columnFamily);
+        assert update != null;
+        assert update.partitionKey().getPartitioner() == key.getPartitioner();
+        PartitionUpdate prev = modifications.put(update.metadata().cfId, update);
         if (prev != null)
             // developer error
-            throw new IllegalArgumentException("Table " + columnFamily + " already has modifications in this mutation: " + prev);
+            throw new IllegalArgumentException("Table " + update.metadata().cfName + " already has modifications in this mutation: " + prev);
+        return this;
     }
 
-    /**
-     * @return the ColumnFamily in this Mutation corresponding to @param cfName, creating an empty one if necessary.
-     */
-    public ColumnFamily addOrGet(String cfName)
+    public PartitionUpdate get(CFMetaData cfm)
     {
-        return addOrGet(Schema.instance.getCFMetaData(keyspaceName, cfName));
-    }
-
-    public ColumnFamily addOrGet(CFMetaData cfm)
-    {
-        ColumnFamily cf = modifications.get(cfm.cfId);
-        if (cf == null)
-        {
-            cf = ArrayBackedSortedColumns.factory.create(cfm);
-            modifications.put(cfm.cfId, cf);
-        }
-        return cf;
+        return modifications.get(cfm.cfId);
     }
 
     public boolean isEmpty()
@@ -152,56 +147,72 @@
         return modifications.isEmpty();
     }
 
-    public void add(String cfName, CellName name, ByteBuffer value, long timestamp, int timeToLive)
+    /**
+     * Creates a new mutation that merges all the provided mutations.
+     *
+     * @param mutations the mutations to merge together. All mutation must be
+     * on the same keyspace and partition key. There should also be at least one
+     * mutation.
+     * @return a mutation that contains all the modifications contained in {@code mutations}.
+     *
+     * @throws IllegalArgumentException if not all the mutations are on the same
+     * keyspace and key.
+     */
+    public static Mutation merge(List<Mutation> mutations)
     {
-        addOrGet(cfName).addColumn(name, value, timestamp, timeToLive);
-    }
+        assert !mutations.isEmpty();
 
-    public void addCounter(String cfName, CellName name, long value)
-    {
-        addOrGet(cfName).addCounter(name, value);
-    }
+        if (mutations.size() == 1)
+            return mutations.get(0);
 
-    public void add(String cfName, CellName name, ByteBuffer value, long timestamp)
-    {
-        add(cfName, name, value, timestamp, 0);
-    }
-
-    public void delete(String cfName, long timestamp)
-    {
-        int localDeleteTime = (int) (System.currentTimeMillis() / 1000);
-        addOrGet(cfName).delete(new DeletionInfo(timestamp, localDeleteTime));
-    }
-
-    public void delete(String cfName, CellName name, long timestamp)
-    {
-        int localDeleteTime = (int) (System.currentTimeMillis() / 1000);
-        addOrGet(cfName).addTombstone(name, localDeleteTime, timestamp);
-    }
-
-    public void deleteRange(String cfName, Composite start, Composite end, long timestamp)
-    {
-        int localDeleteTime = (int) (System.currentTimeMillis() / 1000);
-        addOrGet(cfName).addAtom(new RangeTombstone(start, end, timestamp, localDeleteTime));
-    }
-
-    public void addAll(IMutation m)
-    {
-        if (!(m instanceof Mutation))
-            throw new IllegalArgumentException();
-
-        Mutation mutation = (Mutation)m;
-        if (!keyspaceName.equals(mutation.keyspaceName) || !key.equals(mutation.key))
-            throw new IllegalArgumentException();
-
-        for (Map.Entry<UUID, ColumnFamily> entry : mutation.modifications.entrySet())
+        Set<UUID> updatedTables = new HashSet<>();
+        String ks = null;
+        DecoratedKey key = null;
+        for (Mutation mutation : mutations)
         {
-            // It's slighty faster to assume the key wasn't present and fix if
-            // not in the case where it wasn't there indeed.
-            ColumnFamily cf = modifications.put(entry.getKey(), entry.getValue());
-            if (cf != null)
-                entry.getValue().addAll(cf);
+            updatedTables.addAll(mutation.modifications.keySet());
+            if (ks != null && !ks.equals(mutation.keyspaceName))
+                throw new IllegalArgumentException();
+            if (key != null && !key.equals(mutation.key))
+                throw new IllegalArgumentException();
+            ks = mutation.keyspaceName;
+            key = mutation.key;
         }
+
+        List<PartitionUpdate> updates = new ArrayList<>(mutations.size());
+        Map<UUID, PartitionUpdate> modifications = new HashMap<>(updatedTables.size());
+        for (UUID table : updatedTables)
+        {
+            for (Mutation mutation : mutations)
+            {
+                PartitionUpdate upd = mutation.modifications.get(table);
+                if (upd != null)
+                    updates.add(upd);
+            }
+
+            if (updates.isEmpty())
+                continue;
+
+            modifications.put(table, updates.size() == 1 ? updates.get(0) : PartitionUpdate.merge(updates));
+            updates.clear();
+        }
+        return new Mutation(ks, key, modifications);
+    }
+
+    public CompletableFuture<?> applyFuture()
+    {
+        Keyspace ks = Keyspace.open(keyspaceName);
+        return ks.applyFuture(this, Keyspace.open(keyspaceName).getMetadata().params.durableWrites, true);
+    }
+
+    public void apply(boolean durableWrites, boolean isDroppable)
+    {
+        Keyspace.open(keyspaceName).apply(this, durableWrites, true, isDroppable);
+    }
+
+    public void apply(boolean durableWrites)
+    {
+        apply(durableWrites, true);
     }
 
     /*
@@ -210,13 +221,12 @@
      */
     public void apply()
     {
-        Keyspace ks = Keyspace.open(keyspaceName);
-        ks.apply(this, ks.getMetadata().durableWrites);
+        apply(Keyspace.open(keyspaceName).getMetadata().params.durableWrites);
     }
 
     public void applyUnsafe()
     {
-        Keyspace.open(keyspaceName).apply(this, false);
+        apply(false);
     }
 
     public MessageOut<Mutation> createMessage()
@@ -234,6 +244,14 @@
         return DatabaseDescriptor.getWriteRpcTimeout();
     }
 
+    public int smallestGCGS()
+    {
+        int gcgs = Integer.MAX_VALUE;
+        for (PartitionUpdate update : getPartitionUpdates())
+            gcgs = Math.min(gcgs, update.metadata().params.gcGraceSeconds);
+        return gcgs;
+    }
+
     public String toString()
     {
         return toString(false);
@@ -243,7 +261,7 @@
     {
         StringBuilder buff = new StringBuilder("Mutation(");
         buff.append("keyspace='").append(keyspaceName).append('\'');
-        buff.append(", key='").append(ByteBufferUtil.bytesToHex(key)).append('\'');
+        buff.append(", key='").append(ByteBufferUtil.bytesToHex(key.getKey())).append('\'');
         buff.append(", modifications=[");
         if (shallow)
         {
@@ -256,19 +274,12 @@
             buff.append(StringUtils.join(cfnames, ", "));
         }
         else
-            buff.append(StringUtils.join(modifications.values(), ", "));
+        {
+            buff.append("\n  ").append(StringUtils.join(modifications.values(), "\n  ")).append("\n");
+        }
         return buff.append("])").toString();
     }
 
-    public Mutation without(UUID cfId)
-    {
-        Mutation mutation = new Mutation(keyspaceName, key);
-        for (Map.Entry<UUID, ColumnFamily> entry : modifications.entrySet())
-            if (!entry.getKey().equals(cfId))
-                mutation.add(entry.getValue());
-        return mutation;
-    }
-
     public static class MutationSerializer implements IVersionedSerializer<Mutation>
     {
         public void serialize(Mutation mutation, DataOutputPlus out, int version) throws IOException
@@ -276,74 +287,85 @@
             if (version < MessagingService.VERSION_20)
                 out.writeUTF(mutation.getKeyspaceName());
 
-            ByteBufferUtil.writeWithShortLength(mutation.key(), out);
-
             /* serialize the modifications in the mutation */
             int size = mutation.modifications.size();
-            out.writeInt(size);
-            assert size > 0;
-            for (Map.Entry<UUID, ColumnFamily> entry : mutation.modifications.entrySet())
-                ColumnFamily.serializer.serialize(entry.getValue(), out, version);
-        }
 
-        public Mutation deserialize(DataInput in, int version, ColumnSerializer.Flag flag) throws IOException
-        {
-            String keyspaceName = null; // will always be set from cf.metadata but javac isn't smart enough to see that
-            if (version < MessagingService.VERSION_20)
-                keyspaceName = in.readUTF();
-
-            ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
-            int size = in.readInt();
-            assert size > 0;
-
-            Map<UUID, ColumnFamily> modifications;
-            if (size == 1)
+            if (version < MessagingService.VERSION_30)
             {
-                ColumnFamily cf = deserializeOneCf(in, version, flag);
-                modifications = Collections.singletonMap(cf.id(), cf);
-                keyspaceName = cf.metadata().ksName;
+                ByteBufferUtil.writeWithShortLength(mutation.key().getKey(), out);
+                out.writeInt(size);
             }
             else
             {
-                modifications = new HashMap<UUID, ColumnFamily>(size);
-                for (int i = 0; i < size; ++i)
-                {
-                    ColumnFamily cf = deserializeOneCf(in, version, flag);
-                    modifications.put(cf.id(), cf);
-                    keyspaceName = cf.metadata().ksName;
-                }
+                out.writeUnsignedVInt(size);
             }
 
-            return new Mutation(keyspaceName, key, modifications);
+            assert size > 0;
+            for (Map.Entry<UUID, PartitionUpdate> entry : mutation.modifications.entrySet())
+                PartitionUpdate.serializer.serialize(entry.getValue(), out, version);
         }
 
-        private ColumnFamily deserializeOneCf(DataInput in, int version, ColumnSerializer.Flag flag) throws IOException
+        public Mutation deserialize(DataInputPlus in, int version, SerializationHelper.Flag flag) throws IOException
         {
-            ColumnFamily cf = ColumnFamily.serializer.deserialize(in, ArrayBackedSortedColumns.factory, flag, version);
-            // We don't allow Mutation with null column family, so we should never get null back.
-            assert cf != null;
-            return cf;
+            if (version < MessagingService.VERSION_20)
+                in.readUTF(); // read pre-2.0 keyspace name
+
+            ByteBuffer key = null;
+            int size;
+            if (version < MessagingService.VERSION_30)
+            {
+                key = ByteBufferUtil.readWithShortLength(in);
+                size = in.readInt();
+            }
+            else
+            {
+                size = (int)in.readUnsignedVInt();
+            }
+
+            assert size > 0;
+
+            PartitionUpdate update = PartitionUpdate.serializer.deserialize(in, version, flag, key);
+            if (size == 1)
+                return new Mutation(update);
+
+            Map<UUID, PartitionUpdate> modifications = new HashMap<>(size);
+            DecoratedKey dk = update.partitionKey();
+
+            modifications.put(update.metadata().cfId, update);
+            for (int i = 1; i < size; ++i)
+            {
+                update = PartitionUpdate.serializer.deserialize(in, version, flag, dk);
+                modifications.put(update.metadata().cfId, update);
+            }
+
+            return new Mutation(update.metadata().ksName, dk, modifications);
         }
 
-        public Mutation deserialize(DataInput in, int version) throws IOException
+        public Mutation deserialize(DataInputPlus in, int version) throws IOException
         {
-            return deserialize(in, version, ColumnSerializer.Flag.FROM_REMOTE);
+            return deserialize(in, version, SerializationHelper.Flag.FROM_REMOTE);
         }
 
         public long serializedSize(Mutation mutation, int version)
         {
-            TypeSizes sizes = TypeSizes.NATIVE;
             int size = 0;
 
             if (version < MessagingService.VERSION_20)
-                size += sizes.sizeof(mutation.getKeyspaceName());
+                size += TypeSizes.sizeof(mutation.getKeyspaceName());
 
-            int keySize = mutation.key().remaining();
-            size += sizes.sizeof((short) keySize) + keySize;
+            if (version < MessagingService.VERSION_30)
+            {
+                int keySize = mutation.key().getKey().remaining();
+                size += TypeSizes.sizeof((short) keySize) + keySize;
+                size += TypeSizes.sizeof(mutation.modifications.size());
+            }
+            else
+            {
+                size += TypeSizes.sizeofUnsignedVInt(mutation.modifications.size());
+            }
 
-            size += sizes.sizeof(mutation.modifications.size());
-            for (Map.Entry<UUID,ColumnFamily> entry : mutation.modifications.entrySet())
-                size += ColumnFamily.serializer.serializedSize(entry.getValue(), TypeSizes.NATIVE, version);
+            for (Map.Entry<UUID, PartitionUpdate> entry : mutation.modifications.entrySet())
+                size += PartitionUpdate.serializer.serializedSize(entry.getValue(), version);
 
             return size;
         }

diff --git a/src/java/org/apache/cassandra/db/MutationVerbHandler.java b/src/java/org/apache/cassandra/db/MutationVerbHandler.java
index 3baa93e..5888438 100644
--- a/src/java/org/apache/cassandra/db/MutationVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/MutationVerbHandler.java

@@ -18,46 +18,69 @@
 package org.apache.cassandra.db;
 
 import java.io.DataInputStream;
-import java.io.IOError;
 import java.io.IOException;
 import java.net.InetAddress;
 
+import org.apache.cassandra.batchlog.LegacyBatchlogMigrator;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.io.util.FastByteArrayInputStream;
 import org.apache.cassandra.net.*;
 import org.apache.cassandra.tracing.Tracing;
 
 public class MutationVerbHandler implements IVerbHandler<Mutation>
 {
-    private static final boolean TEST_FAIL_WRITES = System.getProperty("cassandra.test.fail_writes", "false").equalsIgnoreCase("true");
+    private void reply(int id, InetAddress replyTo)
+    {
+        Tracing.trace("Enqueuing response to {}", replyTo);
+        MessagingService.instance().sendReply(WriteResponse.createMessage(), id, replyTo);
+    }
+
+    private void failed()
+    {
+        Tracing.trace("Payload application resulted in WriteTimeout, not replying");
+    }
 
     public void doVerb(MessageIn<Mutation> message, int id)  throws IOException
     {
-            // Check if there were any forwarding headers in this message
-            byte[] from = message.parameters.get(Mutation.FORWARD_FROM);
-            InetAddress replyTo;
-            if (from == null)
+        // Check if there were any forwarding headers in this message
+        byte[] from = message.parameters.get(Mutation.FORWARD_FROM);
+        InetAddress replyTo;
+        if (from == null)
+        {
+            replyTo = message.from;
+            byte[] forwardBytes = message.parameters.get(Mutation.FORWARD_TO);
+            if (forwardBytes != null)
+                forwardToLocalNodes(message.payload, message.verb, forwardBytes, message.from);
+        }
+        else
+        {
+            replyTo = InetAddress.getByAddress(from);
+        }
+
+        try
+        {
+            if (message.version < MessagingService.VERSION_30 && LegacyBatchlogMigrator.isLegacyBatchlogMutation(message.payload))
             {
-                replyTo = message.from;
-                byte[] forwardBytes = message.parameters.get(Mutation.FORWARD_TO);
-                if (forwardBytes != null)
-                    forwardToLocalNodes(message.payload, message.verb, forwardBytes, message.from);
+                LegacyBatchlogMigrator.handleLegacyMutation(message.payload);
+                reply(id, replyTo);
             }
             else
-            {
-                replyTo = InetAddress.getByAddress(from);
-            }
-
-            message.payload.apply();
-            WriteResponse response = new WriteResponse();
-            Tracing.trace("Enqueuing response to {}", replyTo);
-            MessagingService.instance().sendReply(response.createMessage(), id, replyTo);
+                message.payload.applyFuture().thenAccept(o -> reply(id, replyTo)).exceptionally(wto -> {
+                    failed();
+                    return null;
+                });
+        }
+        catch (WriteTimeoutException wto)
+        {
+            failed();
+        }
     }
 
     /**
      * Older version (< 1.0) will not send this message at all, hence we don't
      * need to check the version of the data.
      */
-    private void forwardToLocalNodes(Mutation mutation, MessagingService.Verb verb, byte[] forwardBytes, InetAddress from) throws IOException
+    private static void forwardToLocalNodes(Mutation mutation, MessagingService.Verb verb, byte[] forwardBytes, InetAddress from) throws IOException
     {
         try (DataInputStream in = new DataInputStream(new FastByteArrayInputStream(forwardBytes)))
         {

diff --git a/src/java/org/apache/cassandra/db/NativeCell.java b/src/java/org/apache/cassandra/db/NativeCell.java
deleted file mode 100644
index dac5674..0000000
--- a/src/java/org/apache/cassandra/db/NativeCell.java
+++ /dev/null

@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-import org.apache.cassandra.utils.memory.NativeAllocator;
-
-public class NativeCell extends AbstractNativeCell
-{
-    private static final long SIZE = ObjectSizes.measure(new NativeCell());
-
-    NativeCell()
-    {}
-
-    public NativeCell(NativeAllocator allocator, OpOrder.Group writeOp, Cell copyOf)
-    {
-        super(allocator, writeOp, copyOf);
-    }
-
-    @Override
-    public CellName name()
-    {
-        return this;
-    }
-
-    @Override
-    public long timestamp()
-    {
-        return getLong(TIMESTAMP_OFFSET);
-    }
-
-    @Override
-    public Cell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferCell(copy(metadata, allocator), allocator.clone(value()), timestamp());
-    }
-
-    @Override
-    public Cell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        updateWithName(digest);  // name
-        updateWithValue(digest); // value
-
-        FBUtilities.updateWithLong(digest, timestamp());
-        FBUtilities.updateWithByte(digest, serializationFlags());
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return SIZE;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return SIZE;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/NativeCounterCell.java b/src/java/org/apache/cassandra/db/NativeCounterCell.java
deleted file mode 100644
index c16cc44..0000000
--- a/src/java/org/apache/cassandra/db/NativeCounterCell.java
+++ /dev/null

@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-import org.apache.cassandra.utils.memory.NativeAllocator;
-
-public class NativeCounterCell extends NativeCell implements CounterCell
-{
-    private static final long SIZE = ObjectSizes.measure(new NativeCounterCell());
-
-    private NativeCounterCell()
-    {}
-
-    public NativeCounterCell(NativeAllocator allocator, OpOrder.Group writeOp, CounterCell copyOf)
-    {
-        super(allocator, writeOp, copyOf);
-    }
-
-    @Override
-    protected void construct(Cell from)
-    {
-        super.construct(from);
-        setLong(internalSize() - 8, ((CounterCell) from).timestampOfLastDelete());
-    }
-
-    @Override
-    protected int postfixSize()
-    {
-        return 8;
-    }
-
-    @Override
-    protected int sizeOf(Cell cell)
-    {
-        return 8 + super.sizeOf(cell);
-    }
-
-    @Override
-    public long timestampOfLastDelete()
-    {
-        return getLong(internalSize() - 8);
-    }
-
-    @Override
-    public long total()
-    {
-        return contextManager.total(value());
-    }
-
-    @Override
-    public boolean hasLegacyShards()
-    {
-        return contextManager.hasLegacyShards(value());
-    }
-
-    @Override
-    public Cell markLocalToBeCleared()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public Cell diff(Cell cell)
-    {
-        return diffCounter(cell);
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        return reconcileCounter(cell);
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.COUNTER_MASK;
-    }
-
-    @Override
-    public int cellDataSize()
-    {
-        // A counter column adds 8 bytes for timestampOfLastDelete to Cell.
-        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(timestampOfLastDelete());
-    }
-
-    @Override
-    public int serializedSize(CellNameType type, TypeSizes typeSizes)
-    {
-        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(timestampOfLastDelete());
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-        // We cannot use the value validator as for other columns as the CounterColumnType validate a long,
-        // which is not the internal representation of counters
-        contextManager.validateContext(value());
-    }
-
-    /*
-     * We have to special case digest creation for counter column because
-     * we don't want to include the information about which shard of the
-     * context is a delta or not, since this information differs from node to
-     * node.
-     */
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        updateWithName(digest);
-
-        // We don't take the deltas into account in a digest
-        contextManager.updateDigest(digest, value());
-
-        FBUtilities.updateWithLong(digest, timestamp());
-        FBUtilities.updateWithByte(digest, serializationFlags());
-        FBUtilities.updateWithLong(digest, timestampOfLastDelete());
-    }
-
-    @Override
-    public String getString(CellNameType comparator)
-    {
-        return String.format("%s(%s:false:%s@%d!%d)",
-                             getClass().getSimpleName(),
-                             comparator.getString(name()),
-                             contextManager.toString(value()),
-                             timestamp(),
-                             timestampOfLastDelete());
-    }
-
-    @Override
-    public CounterCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferCounterCell(copy(metadata, allocator), allocator.clone(value()), timestamp(), timestampOfLastDelete());
-    }
-
-    @Override
-    public CounterCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return SIZE;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return SIZE;
-    }
-
-    @Override
-    public boolean equals(Cell cell)
-    {
-        return super.equals(cell) && timestampOfLastDelete() == ((CounterCell) cell).timestampOfLastDelete();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/NativeDeletedCell.java b/src/java/org/apache/cassandra/db/NativeDeletedCell.java
deleted file mode 100644
index 6bdef43..0000000
--- a/src/java/org/apache/cassandra/db/NativeDeletedCell.java
+++ /dev/null

@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemoryUtil;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-import org.apache.cassandra.utils.memory.NativeAllocator;
-
-public class NativeDeletedCell extends NativeCell implements DeletedCell
-{
-    private static final long SIZE = ObjectSizes.measure(new NativeDeletedCell());
-
-    private NativeDeletedCell()
-    {}
-
-    public NativeDeletedCell(NativeAllocator allocator, OpOrder.Group writeOp, DeletedCell copyOf)
-    {
-        super(allocator, writeOp, copyOf);
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        if (cell instanceof DeletedCell)
-            return super.reconcile(cell);
-        return cell.reconcile(this);
-    }
-
-    @Override
-    public boolean isLive()
-    {
-        return false;
-    }
-
-    @Override
-    public boolean isLive(long now)
-    {
-        return false;
-    }
-
-    @Override
-    public int getLocalDeletionTime()
-    {
-        int v = getInt(valueStartOffset());
-        return MemoryUtil.INVERTED_ORDER ? Integer.reverseBytes(v) : v;
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.DELETION_MASK;
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        validateName(metadata);
-
-        if ((int) (internalSize() - valueStartOffset()) != 4)
-            throw new MarshalException("A tombstone value should be 4 bytes long");
-        if (getLocalDeletionTime() < 0)
-            throw new MarshalException("The local deletion time should not be negative");
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        updateWithName(digest);
-        FBUtilities.updateWithLong(digest, timestamp());
-        FBUtilities.updateWithByte(digest, serializationFlags());
-    }
-
-    @Override
-    public DeletedCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferDeletedCell(copy(metadata, allocator), allocator.clone(value()), timestamp());
-    }
-
-    @Override
-    public DeletedCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return SIZE;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return SIZE;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/NativeExpiringCell.java b/src/java/org/apache/cassandra/db/NativeExpiringCell.java
deleted file mode 100644
index 6369536..0000000
--- a/src/java/org/apache/cassandra/db/NativeExpiringCell.java
+++ /dev/null

@@ -1,190 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-import org.apache.cassandra.utils.memory.NativeAllocator;
-
-public class NativeExpiringCell extends NativeCell implements ExpiringCell
-{
-    private static final long SIZE = ObjectSizes.measure(new NativeExpiringCell());
-
-    private NativeExpiringCell()
-    {}
-
-    public NativeExpiringCell(NativeAllocator allocator, OpOrder.Group writeOp, ExpiringCell copyOf)
-    {
-        super(allocator, writeOp, copyOf);
-    }
-
-    @Override
-    protected int sizeOf(Cell cell)
-    {
-        return super.sizeOf(cell) + 8;
-    }
-
-    @Override
-    protected void construct(Cell from)
-    {
-        ExpiringCell expiring = (ExpiringCell) from;
-
-        setInt(internalSize() - 4, expiring.getTimeToLive());
-        setInt(internalSize() - 8, expiring.getLocalDeletionTime());
-        super.construct(from);
-    }
-
-    @Override
-    protected int postfixSize()
-    {
-        return 8;
-    }
-
-    @Override
-    public int getTimeToLive()
-    {
-        return getInt(internalSize() - 4);
-    }
-
-    @Override
-    public int getLocalDeletionTime()
-    {
-        return getInt(internalSize() - 8);
-    }
-
-    @Override
-    public boolean isLive()
-    {
-        return isLive(System.currentTimeMillis());
-    }
-
-    @Override
-    public boolean isLive(long now)
-    {
-        return (int) (now / 1000) < getLocalDeletionTime();
-    }
-
-    @Override
-    public int serializationFlags()
-    {
-        return ColumnSerializer.EXPIRATION_MASK;
-    }
-
-    @Override
-    public int cellDataSize()
-    {
-        return super.cellDataSize() + TypeSizes.NATIVE.sizeof(getLocalDeletionTime()) + TypeSizes.NATIVE.sizeof(getTimeToLive());
-    }
-
-    @Override
-    public int serializedSize(CellNameType type, TypeSizes typeSizes)
-    {
-        /*
-         * An expired column adds to a Cell :
-         *    4 bytes for the localExpirationTime
-         *  + 4 bytes for the timeToLive
-        */
-        return super.serializedSize(type, typeSizes) + typeSizes.sizeof(getLocalDeletionTime()) + typeSizes.sizeof(getTimeToLive());
-    }
-
-    @Override
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        super.validateFields(metadata);
-
-        if (getTimeToLive() <= 0)
-            throw new MarshalException("A column TTL should be > 0");
-        if (getLocalDeletionTime() < 0)
-            throw new MarshalException("The local expiration time should not be negative");
-    }
-
-    @Override
-    public void updateDigest(MessageDigest digest)
-    {
-        super.updateDigest(digest);
-        FBUtilities.updateWithInt(digest, getTimeToLive());
-    }
-
-    @Override
-    public Cell reconcile(Cell cell)
-    {
-        long ts1 = timestamp(), ts2 = cell.timestamp();
-        if (ts1 != ts2)
-            return ts1 < ts2 ? cell : this;
-        // we should prefer tombstones
-        if (cell instanceof DeletedCell)
-            return cell;
-        int c = value().compareTo(cell.value());
-        if (c != 0)
-            return c < 0 ? cell : this;
-        // If we have same timestamp and value, prefer the longest ttl
-        if (cell instanceof ExpiringCell)
-        {
-            int let1 = getLocalDeletionTime(), let2 = cell.getLocalDeletionTime();
-            if (let1 < let2)
-                return cell;
-        }
-        return this;
-    }
-
-    public boolean equals(Cell cell)
-    {
-        if (!super.equals(cell))
-            return false;
-        ExpiringCell that = (ExpiringCell) cell;
-        return getLocalDeletionTime() == that.getLocalDeletionTime() && getTimeToLive() == that.getTimeToLive();
-    }
-
-    @Override
-    public String getString(CellNameType comparator)
-    {
-        return String.format("%s(%s!%d)", getClass().getSimpleName(), super.getString(comparator), getTimeToLive());
-    }
-
-    @Override
-    public ExpiringCell localCopy(CFMetaData metadata, AbstractAllocator allocator)
-    {
-        return new BufferExpiringCell(name().copy(metadata, allocator), allocator.clone(value()), timestamp(), getTimeToLive(), getLocalDeletionTime());
-    }
-
-    @Override
-    public ExpiringCell localCopy(CFMetaData metadata, MemtableAllocator allocator, OpOrder.Group opGroup)
-    {
-        return allocator.clone(this, metadata, opGroup);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return SIZE;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return SIZE;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/OnDiskAtom.java b/src/java/org/apache/cassandra/db/OnDiskAtom.java
deleted file mode 100644
index 2a9c39f..0000000
--- a/src/java/org/apache/cassandra/db/OnDiskAtom.java
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.*;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.ISSTableSerializer;
-import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.serializers.MarshalException;
-
-public interface OnDiskAtom
-{
-    public Composite name();
-
-    /**
-     * For a standard column, this is the same as timestamp().
-     * For a super column, this is the min/max column timestamp of the sub columns.
-     */
-    public long timestamp();
-    public int getLocalDeletionTime(); // for tombstone GC, so int is sufficient granularity
-
-    public void validateFields(CFMetaData metadata) throws MarshalException;
-    public void updateDigest(MessageDigest digest);
-
-    public interface SerializerForWriting
-    {
-        public void serializeForSSTable(OnDiskAtom atom, DataOutputPlus out) throws IOException;
-        public long serializedSizeForSSTable(OnDiskAtom atom);
-    }
-
-    public static class Serializer implements ISSTableSerializer<OnDiskAtom>, SerializerForWriting
-    {
-        private final CellNameType type;
-
-        public Serializer(CellNameType type)
-        {
-            this.type = type;
-        }
-
-        public void serializeForSSTable(OnDiskAtom atom, DataOutputPlus out) throws IOException
-        {
-            if (atom instanceof Cell)
-            {
-                type.columnSerializer().serialize((Cell)atom, out);
-            }
-            else
-            {
-                assert atom instanceof RangeTombstone;
-                type.rangeTombstoneSerializer().serializeForSSTable((RangeTombstone)atom, out);
-            }
-        }
-
-        public OnDiskAtom deserializeFromSSTable(DataInput in, Version version) throws IOException
-        {
-            return deserializeFromSSTable(in, ColumnSerializer.Flag.LOCAL, Integer.MIN_VALUE, version);
-        }
-
-        public OnDiskAtom deserializeFromSSTable(DataInput in, ColumnSerializer.Flag flag, int expireBefore, Version version) throws IOException
-        {
-            Composite name = type.serializer().deserialize(in);
-            if (name.isEmpty())
-            {
-                // SSTableWriter.END_OF_ROW
-                return null;
-            }
-
-            int b = in.readUnsignedByte();
-            if ((b & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0)
-                return type.rangeTombstoneSerializer().deserializeBody(in, name, version);
-            else
-                return type.columnSerializer().deserializeColumnBody(in, (CellName)name, b, flag, expireBefore);
-        }
-
-        public long serializedSizeForSSTable(OnDiskAtom atom)
-        {
-            if (atom instanceof Cell)
-            {
-                return type.columnSerializer().serializedSize((Cell)atom, TypeSizes.NATIVE);
-            }
-            else
-            {
-                assert atom instanceof RangeTombstone;
-                return type.rangeTombstoneSerializer().serializedSizeForSSTable((RangeTombstone)atom);
-            }
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/PagedRangeCommand.java b/src/java/org/apache/cassandra/db/PagedRangeCommand.java
deleted file mode 100644
index 40ef88e..0000000
--- a/src/java/org/apache/cassandra/db/PagedRangeCommand.java
+++ /dev/null

@@ -1,224 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-
-public class PagedRangeCommand extends AbstractRangeCommand
-{
-    public static final IVersionedSerializer<PagedRangeCommand> serializer = new Serializer();
-
-    public final Composite start;
-    public final Composite stop;
-    public final int limit;
-    private final boolean countCQL3Rows;
-
-    public PagedRangeCommand(String keyspace,
-                             String columnFamily,
-                             long timestamp,
-                             AbstractBounds<RowPosition> keyRange,
-                             SliceQueryFilter predicate,
-                             Composite start,
-                             Composite stop,
-                             List<IndexExpression> rowFilter,
-                             int limit,
-                             boolean countCQL3Rows)
-    {
-        super(keyspace, columnFamily, timestamp, keyRange, predicate, rowFilter);
-        this.start = start;
-        this.stop = stop;
-        this.limit = limit;
-        this.countCQL3Rows = countCQL3Rows;
-    }
-
-    public MessageOut<PagedRangeCommand> createMessage()
-    {
-        return new MessageOut<>(MessagingService.Verb.PAGED_RANGE, this, serializer);
-    }
-
-    public AbstractRangeCommand forSubRange(AbstractBounds<RowPosition> subRange)
-    {
-        Composite newStart = subRange.left.equals(keyRange.left) ? start : ((SliceQueryFilter)predicate).start();
-        Composite newStop = subRange.right.equals(keyRange.right) ? stop : ((SliceQueryFilter)predicate).finish();
-        return new PagedRangeCommand(keyspace,
-                                     columnFamily,
-                                     timestamp,
-                                     subRange,
-                                     ((SliceQueryFilter) predicate).cloneShallow(),
-                                     newStart,
-                                     newStop,
-                                     rowFilter,
-                                     limit,
-                                     countCQL3Rows);
-    }
-
-    public AbstractRangeCommand withUpdatedLimit(int newLimit)
-    {
-        return new PagedRangeCommand(keyspace,
-                                     columnFamily,
-                                     timestamp,
-                                     keyRange,
-                                     ((SliceQueryFilter) predicate).cloneShallow(),
-                                     start,
-                                     stop,
-                                     rowFilter,
-                                     newLimit,
-                                     countCQL3Rows);
-    }
-
-    public int limit()
-    {
-        return limit;
-    }
-
-    public boolean countCQL3Rows()
-    {
-        return countCQL3Rows;
-    }
-
-    public List<Row> executeLocally()
-    {
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
-
-        ExtendedFilter exFilter = cfs.makeExtendedFilter(keyRange, (SliceQueryFilter)predicate, start, stop, rowFilter, limit, countCQL3Rows(), timestamp);
-        if (cfs.indexManager.hasIndexFor(rowFilter))
-            return cfs.search(exFilter);
-        else
-            return cfs.getRangeSlice(exFilter);
-    }
-
-    @Override
-    public String toString()
-    {
-        return String.format("PagedRange(%s, %s, %d, %s, %s, %s, %s, %s, %d)", keyspace, columnFamily, timestamp, keyRange, predicate, start, stop, rowFilter, limit);
-    }
-
-    private static class Serializer implements IVersionedSerializer<PagedRangeCommand>
-    {
-        public void serialize(PagedRangeCommand cmd, DataOutputPlus out, int version) throws IOException
-        {
-            out.writeUTF(cmd.keyspace);
-            out.writeUTF(cmd.columnFamily);
-            out.writeLong(cmd.timestamp);
-
-            MessagingService.validatePartitioner(cmd.keyRange);
-            AbstractBounds.rowPositionSerializer.serialize(cmd.keyRange, out, version);
-
-            CFMetaData metadata = Schema.instance.getCFMetaData(cmd.keyspace, cmd.columnFamily);
-
-            // SliceQueryFilter (the count is not used)
-            SliceQueryFilter filter = (SliceQueryFilter)cmd.predicate;
-            metadata.comparator.sliceQueryFilterSerializer().serialize(filter, out, version);
-
-            // The start and stop of the page
-            metadata.comparator.serializer().serialize(cmd.start, out);
-            metadata.comparator.serializer().serialize(cmd.stop, out);
-
-            out.writeInt(cmd.rowFilter.size());
-            for (IndexExpression expr : cmd.rowFilter)
-            {
-                expr.writeTo(out);;
-            }
-
-            out.writeInt(cmd.limit);
-            if (version >= MessagingService.VERSION_21)
-                out.writeBoolean(cmd.countCQL3Rows);
-        }
-
-        public PagedRangeCommand deserialize(DataInput in, int version) throws IOException
-        {
-            String keyspace = in.readUTF();
-            String columnFamily = in.readUTF();
-            long timestamp = in.readLong();
-
-            AbstractBounds<RowPosition> keyRange =
-                    AbstractBounds.rowPositionSerializer.deserialize(in, MessagingService.globalPartitioner(), version);
-
-            CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
-            if (metadata == null)
-            {
-                String message = String.format("Got paged range command for nonexistent table %s.%s.  If the table was just " +
-                        "created, this is likely due to the schema not being fully propagated.  Please wait for schema " +
-                        "agreement on table creation." , keyspace, columnFamily);
-                throw new UnknownColumnFamilyException(message, null);
-            }
-
-            SliceQueryFilter predicate = metadata.comparator.sliceQueryFilterSerializer().deserialize(in, version);
-
-            Composite start = metadata.comparator.serializer().deserialize(in);
-            Composite stop =  metadata.comparator.serializer().deserialize(in);
-
-            int filterCount = in.readInt();
-            List<IndexExpression> rowFilter = new ArrayList<IndexExpression>(filterCount);
-            for (int i = 0; i < filterCount; i++)
-            {
-                rowFilter.add(IndexExpression.readFrom(in));
-            }
-
-            int limit = in.readInt();
-            boolean countCQL3Rows = version >= MessagingService.VERSION_21
-                                  ? in.readBoolean()
-                                  : predicate.compositesToGroup >= 0 || predicate.count != 1; // See #6857
-            return new PagedRangeCommand(keyspace, columnFamily, timestamp, keyRange, predicate, start, stop, rowFilter, limit, countCQL3Rows);
-        }
-
-        public long serializedSize(PagedRangeCommand cmd, int version)
-        {
-            long size = 0;
-
-            size += TypeSizes.NATIVE.sizeof(cmd.keyspace);
-            size += TypeSizes.NATIVE.sizeof(cmd.columnFamily);
-            size += TypeSizes.NATIVE.sizeof(cmd.timestamp);
-
-            size += AbstractBounds.rowPositionSerializer.serializedSize(cmd.keyRange, version);
-
-            CFMetaData metadata = Schema.instance.getCFMetaData(cmd.keyspace, cmd.columnFamily);
-
-            size += metadata.comparator.sliceQueryFilterSerializer().serializedSize((SliceQueryFilter)cmd.predicate, version);
-
-            size += metadata.comparator.serializer().serializedSize(cmd.start, TypeSizes.NATIVE);
-            size += metadata.comparator.serializer().serializedSize(cmd.stop, TypeSizes.NATIVE);
-
-            size += TypeSizes.NATIVE.sizeof(cmd.rowFilter.size());
-            for (IndexExpression expr : cmd.rowFilter)
-            {
-                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.column);
-                size += TypeSizes.NATIVE.sizeof(expr.operator.ordinal());
-                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.value);
-            }
-
-            size += TypeSizes.NATIVE.sizeof(cmd.limit);
-            if (version >= MessagingService.VERSION_21)
-                size += TypeSizes.NATIVE.sizeof(cmd.countCQL3Rows);
-            return size;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/PartitionColumns.java b/src/java/org/apache/cassandra/db/PartitionColumns.java
new file mode 100644
index 0000000..bf4ac43
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/PartitionColumns.java

@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.*;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+import static java.util.Comparator.naturalOrder;
+
+/**
+ * Columns (or a subset of the columns) that a partition contains.
+ * This mainly groups both static and regular columns for convenience.
+ */
+public class PartitionColumns implements Iterable<ColumnDefinition>
+{
+    public static PartitionColumns NONE = new PartitionColumns(Columns.NONE, Columns.NONE);
+
+    public final Columns statics;
+    public final Columns regulars;
+
+    public PartitionColumns(Columns statics, Columns regulars)
+    {
+        assert statics != null && regulars != null;
+        this.statics = statics;
+        this.regulars = regulars;
+    }
+
+    public static PartitionColumns of(ColumnDefinition column)
+    {
+        return new PartitionColumns(column.isStatic() ? Columns.of(column) : Columns.NONE,
+                                    column.isStatic() ? Columns.NONE : Columns.of(column));
+    }
+
+    public PartitionColumns without(ColumnDefinition column)
+    {
+        return new PartitionColumns(column.isStatic() ? statics.without(column) : statics,
+                                    column.isStatic() ? regulars : regulars.without(column));
+    }
+
+    public PartitionColumns withoutStatics()
+    {
+        return statics.isEmpty() ? this : new PartitionColumns(Columns.NONE, regulars);
+    }
+
+    public PartitionColumns mergeTo(PartitionColumns that)
+    {
+        if (this == that)
+            return this;
+        Columns statics = this.statics.mergeTo(that.statics);
+        Columns regulars = this.regulars.mergeTo(that.regulars);
+        if (statics == this.statics && regulars == this.regulars)
+            return this;
+        if (statics == that.statics && regulars == that.regulars)
+            return that;
+        return new PartitionColumns(statics, regulars);
+    }
+
+    public boolean isEmpty()
+    {
+        return statics.isEmpty() && regulars.isEmpty();
+    }
+
+    public Columns columns(boolean isStatic)
+    {
+        return isStatic ? statics : regulars;
+    }
+
+    public boolean contains(ColumnDefinition column)
+    {
+        return column.isStatic() ? statics.contains(column) : regulars.contains(column);
+    }
+
+    public boolean includes(PartitionColumns columns)
+    {
+        return statics.containsAll(columns.statics) && regulars.containsAll(columns.regulars);
+    }
+
+    public Iterator<ColumnDefinition> iterator()
+    {
+        return Iterators.concat(statics.iterator(), regulars.iterator());
+    }
+
+    public Iterator<ColumnDefinition> selectOrderIterator()
+    {
+        return Iterators.concat(statics.selectOrderIterator(), regulars.selectOrderIterator());
+    }
+
+    /** * Returns the total number of static and regular columns. */
+    public int size()
+    {
+        return regulars.size() + statics.size();
+    }
+
+    @Override
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("[").append(statics).append(" | ").append(regulars).append("]");
+        return sb.toString();
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (!(other instanceof PartitionColumns))
+            return false;
+
+        PartitionColumns that = (PartitionColumns)other;
+        return this.statics.equals(that.statics)
+            && this.regulars.equals(that.regulars);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(statics, regulars);
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static class Builder
+    {
+        // Note that we do want to use sorted sets because we want the column definitions to be compared
+        // through compareTo, not equals. The former basically check it's the same column name, while the latter
+        // check it's the same object, including the same type.
+        private BTreeSet.Builder<ColumnDefinition> regularColumns;
+        private BTreeSet.Builder<ColumnDefinition> staticColumns;
+
+        public Builder add(ColumnDefinition c)
+        {
+            if (c.isStatic())
+            {
+                if (staticColumns == null)
+                    staticColumns = BTreeSet.builder(naturalOrder());
+                staticColumns.add(c);
+            }
+            else
+            {
+                assert c.isRegular();
+                if (regularColumns == null)
+                    regularColumns = BTreeSet.builder(naturalOrder());
+                regularColumns.add(c);
+            }
+            return this;
+        }
+
+        public Builder addAll(Iterable<ColumnDefinition> columns)
+        {
+            for (ColumnDefinition c : columns)
+                add(c);
+            return this;
+        }
+
+        public Builder addAll(PartitionColumns columns)
+        {
+            if (regularColumns == null && !columns.regulars.isEmpty())
+                regularColumns = BTreeSet.builder(naturalOrder());
+
+            for (ColumnDefinition c : columns.regulars)
+                regularColumns.add(c);
+
+            if (staticColumns == null && !columns.statics.isEmpty())
+                staticColumns = BTreeSet.builder(naturalOrder());
+
+            for (ColumnDefinition c : columns.statics)
+                staticColumns.add(c);
+
+            return this;
+        }
+
+        public PartitionColumns build()
+        {
+            return new PartitionColumns(staticColumns == null ? Columns.NONE : Columns.from(staticColumns.build()),
+                                        regularColumns == null ? Columns.NONE : Columns.from(regularColumns.build()));
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/PartitionPosition.java b/src/java/org/apache/cassandra/db/PartitionPosition.java
new file mode 100644
index 0000000..ac5258d
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/PartitionPosition.java

@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public interface PartitionPosition extends RingPosition<PartitionPosition>
+{
+    public static enum Kind
+    {
+        // Only add new values to the end of the enum, the ordinal is used
+        // during serialization
+        ROW_KEY, MIN_BOUND, MAX_BOUND;
+
+        private static final Kind[] allKinds = Kind.values();
+
+        static Kind fromOrdinal(int ordinal)
+        {
+            return allKinds[ordinal];
+        }
+    }
+
+    public static final class ForKey
+    {
+        public static PartitionPosition get(ByteBuffer key, IPartitioner p)
+        {
+            return key == null || key.remaining() == 0 ? p.getMinimumToken().minKeyBound() : p.decorateKey(key);
+        }
+    }
+
+    public static final RowPositionSerializer serializer = new RowPositionSerializer();
+
+    public Kind kind();
+    public boolean isMinimum();
+
+    public static class RowPositionSerializer implements IPartitionerDependentSerializer<PartitionPosition>
+    {
+        /*
+         * We need to be able to serialize both Token.KeyBound and
+         * DecoratedKey. To make this compact, we first write a byte whose
+         * meaning is:
+         *   - 0: DecoratedKey
+         *   - 1: a 'minimum' Token.KeyBound
+         *   - 2: a 'maximum' Token.KeyBound
+         * In the case of the DecoratedKey, we then serialize the key (the
+         * token is recreated on the other side). In the other cases, we then
+         * serialize the token.
+         */
+        public void serialize(PartitionPosition pos, DataOutputPlus out, int version) throws IOException
+        {
+            Kind kind = pos.kind();
+            out.writeByte(kind.ordinal());
+            if (kind == Kind.ROW_KEY)
+                ByteBufferUtil.writeWithShortLength(((DecoratedKey)pos).getKey(), out);
+            else
+                Token.serializer.serialize(pos.getToken(), out, version);
+        }
+
+        public PartitionPosition deserialize(DataInput in, IPartitioner p, int version) throws IOException
+        {
+            Kind kind = Kind.fromOrdinal(in.readByte());
+            if (kind == Kind.ROW_KEY)
+            {
+                ByteBuffer k = ByteBufferUtil.readWithShortLength(in);
+                return p.decorateKey(k);
+            }
+            else
+            {
+                Token t = Token.serializer.deserialize(in, p, version);
+                return kind == Kind.MIN_BOUND ? t.minKeyBound() : t.maxKeyBound();
+            }
+        }
+
+        public long serializedSize(PartitionPosition pos, int version)
+        {
+            Kind kind = pos.kind();
+            int size = 1; // 1 byte for enum
+            if (kind == Kind.ROW_KEY)
+            {
+                int keySize = ((DecoratedKey)pos).getKey().remaining();
+                size += TypeSizes.sizeof((short) keySize) + keySize;
+            }
+            else
+            {
+                size += Token.serializer.serializedSize(pos.getToken(), version);
+            }
+            return size;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
new file mode 100644
index 0000000..1da66c1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java

@@ -0,0 +1,451 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.BaseRowIterator;
+import org.apache.cassandra.db.transform.RTBoundValidator;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.pager.*;
+import org.apache.cassandra.thrift.ThriftResultsMerger;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A read command that selects a (part of a) range of partitions.
+ */
+public class PartitionRangeReadCommand extends ReadCommand
+{
+    protected static final SelectionDeserializer selectionDeserializer = new Deserializer();
+
+    private final DataRange dataRange;
+    private int oldestUnrepairedTombstone = Integer.MAX_VALUE;
+
+    private PartitionRangeReadCommand(boolean isDigest,
+                                      int digestVersion,
+                                      boolean isForThrift,
+                                      CFMetaData metadata,
+                                      int nowInSec,
+                                      ColumnFilter columnFilter,
+                                      RowFilter rowFilter,
+                                      DataLimits limits,
+                                      DataRange dataRange,
+                                      IndexMetadata index)
+    {
+        super(Kind.PARTITION_RANGE, isDigest, digestVersion, isForThrift, metadata, nowInSec, columnFilter, rowFilter, limits, index);
+        this.dataRange = dataRange;
+    }
+
+    public static PartitionRangeReadCommand create(boolean isForThrift,
+                                                   CFMetaData metadata,
+                                                   int nowInSec,
+                                                   ColumnFilter columnFilter,
+                                                   RowFilter rowFilter,
+                                                   DataLimits limits,
+                                                   DataRange dataRange)
+    {
+        return new PartitionRangeReadCommand(false,
+                                             0,
+                                             isForThrift,
+                                             metadata,
+                                             nowInSec,
+                                             columnFilter,
+                                             rowFilter,
+                                             limits,
+                                             dataRange,
+                                             findIndex(metadata, rowFilter));
+    }
+
+    /**
+     * Creates a new read command that query all the data in the table.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     *
+     * @return a newly created read command that queries everything in the table.
+     */
+    public static PartitionRangeReadCommand allDataRead(CFMetaData metadata, int nowInSec)
+    {
+        return new PartitionRangeReadCommand(false, 0, false,
+                                             metadata,
+                                             nowInSec,
+                                             ColumnFilter.all(metadata),
+                                             RowFilter.NONE,
+                                             DataLimits.NONE,
+                                             DataRange.allData(metadata.partitioner),
+                                             null);
+    }
+
+    public DataRange dataRange()
+    {
+        return dataRange;
+    }
+
+    public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key)
+    {
+        return dataRange.clusteringIndexFilter(key);
+    }
+
+    public boolean isNamesQuery()
+    {
+        return dataRange.isNamesQuery();
+    }
+
+    public PartitionRangeReadCommand forSubRange(AbstractBounds<PartitionPosition> range)
+    {
+        return new PartitionRangeReadCommand(isDigestQuery(),
+                                             digestVersion(),
+                                             isForThrift(),
+                                             metadata(),
+                                             nowInSec(),
+                                             columnFilter(),
+                                             rowFilter(),
+                                             limits(),
+                                             dataRange().forSubRange(range),
+                                             indexMetadata());
+    }
+
+    public PartitionRangeReadCommand copy()
+    {
+        return new PartitionRangeReadCommand(isDigestQuery(),
+                                             digestVersion(),
+                                             isForThrift(),
+                                             metadata(),
+                                             nowInSec(),
+                                             columnFilter(),
+                                             rowFilter(),
+                                             limits(),
+                                             dataRange(),
+                                             indexMetadata());
+    }
+
+    public PartitionRangeReadCommand copyAsDigestQuery()
+    {
+        return new PartitionRangeReadCommand(true,
+                                             digestVersion(),
+                                             isForThrift(),
+                                             metadata(),
+                                             nowInSec(),
+                                             columnFilter(),
+                                             rowFilter(),
+                                             limits(),
+                                             dataRange(),
+                                             indexMetadata());
+    }
+
+    public PartitionRangeReadCommand withUpdatedDataRange(DataRange newDataRange)
+    {
+        return new PartitionRangeReadCommand(isDigestQuery(),
+                                             digestVersion(),
+                                             isForThrift(),
+                                             metadata(),
+                                             nowInSec(),
+                                             columnFilter(),
+                                             rowFilter(),
+                                             limits(),
+                                             newDataRange,
+                                             indexMetadata());
+    }
+
+    public PartitionRangeReadCommand withUpdatedLimitsAndDataRange(DataLimits newLimits, DataRange newDataRange)
+    {
+        return new PartitionRangeReadCommand(isDigestQuery(),
+                                             digestVersion(),
+                                             isForThrift(),
+                                             metadata(),
+                                             nowInSec(),
+                                             columnFilter(),
+                                             rowFilter(),
+                                             newLimits,
+                                             newDataRange,
+                                             indexMetadata());
+    }
+
+    public long getTimeout()
+    {
+        return DatabaseDescriptor.getRangeRpcTimeout();
+    }
+
+    public boolean isReversed()
+    {
+        return dataRange.isReversed();
+    }
+
+    public boolean selectsKey(DecoratedKey key)
+    {
+        if (!dataRange().contains(key))
+            return false;
+
+        return rowFilter().partitionKeyRestrictionsAreSatisfiedBy(key, metadata().getKeyValidator());
+    }
+
+    public boolean selectsClustering(DecoratedKey key, Clustering clustering)
+    {
+        if (clustering == Clustering.STATIC_CLUSTERING)
+            return !columnFilter().fetchedColumns().statics.isEmpty();
+
+        if (!dataRange().clusteringIndexFilter(key).selects(clustering))
+            return false;
+        return rowFilter().clusteringKeyRestrictionsAreSatisfiedBy(clustering);
+    }
+
+    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState) throws RequestExecutionException
+    {
+        return StorageProxy.getRangeSlice(this, consistency);
+    }
+
+    public QueryPager getPager(PagingState pagingState, int protocolVersion)
+    {
+            return new PartitionRangeQueryPager(this, pagingState, protocolVersion);
+    }
+
+    protected void recordLatency(TableMetrics metric, long latencyNanos)
+    {
+        metric.rangeLatency.addNano(latencyNanos);
+    }
+
+    @VisibleForTesting
+    public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadOrderGroup orderGroup)
+    {
+        ColumnFamilyStore.ViewFragment view = cfs.select(View.selectLive(dataRange().keyRange()));
+        Tracing.trace("Executing seq scan across {} sstables for {}", view.sstables.size(), dataRange().keyRange().getString(metadata().getKeyValidator()));
+
+        // fetch data from current memtable, historical memtables, and SSTables in the correct order.
+        final List<UnfilteredPartitionIterator> iterators = new ArrayList<>(Iterables.size(view.memtables) + view.sstables.size());
+
+        try
+        {
+            for (Memtable memtable : view.memtables)
+            {
+                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
+                Memtable.MemtableUnfilteredPartitionIterator iter = memtable.makePartitionIterator(columnFilter(), dataRange(), isForThrift());
+
+                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
+                UnfilteredPartitionIterator iterator = isForThrift() ? ThriftResultsMerger.maybeWrap(iter, metadata(), nowInSec()) : iter;
+                iterators.add(RTBoundValidator.validate(iterator, RTBoundValidator.Stage.MEMTABLE, false));
+
+                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, iter.getMinLocalDeletionTime());
+            }
+
+            SSTableReadsListener readCountUpdater = newReadCountUpdater();
+            for (SSTableReader sstable : view.sstables)
+            {
+                @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
+                UnfilteredPartitionIterator iter = sstable.getScanner(columnFilter(), dataRange(), isForThrift(), readCountUpdater);
+
+                if (isForThrift())
+                    iter = ThriftResultsMerger.maybeWrap(iter, metadata(), nowInSec());
+
+                iterators.add(RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false));
+
+                if (!sstable.isRepaired())
+                    oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
+            }
+            return iterators.isEmpty() ? EmptyIterators.unfilteredPartition(metadata(), isForThrift())
+                                       : checkCacheFilter(UnfilteredPartitionIterators.mergeLazily(iterators, nowInSec()), cfs);
+        }
+        catch (RuntimeException | Error e)
+        {
+            try
+            {
+                FBUtilities.closeAll(iterators);
+            }
+            catch (Exception suppressed)
+            {
+                e.addSuppressed(suppressed);
+            }
+
+            throw e;
+        }
+    }
+
+    /**
+     * Creates a new {@code SSTableReadsListener} to update the SSTables read counts.
+     * @return a new {@code SSTableReadsListener} to update the SSTables read counts.
+     */
+    private static SSTableReadsListener newReadCountUpdater()
+    {
+        return new SSTableReadsListener()
+                {
+                    @Override
+                    public void onScanningStarted(SSTableReader sstable)
+                    {
+                        sstable.incrementReadCount();
+                    }
+                };
+    }
+
+    @Override
+    protected int oldestUnrepairedTombstone()
+    {
+        return oldestUnrepairedTombstone;
+    }
+
+    private UnfilteredPartitionIterator checkCacheFilter(UnfilteredPartitionIterator iter, final ColumnFamilyStore cfs)
+    {
+        class CacheFilter extends Transformation
+        {
+            @Override
+            public BaseRowIterator applyToPartition(BaseRowIterator iter)
+            {
+                // Note that we rely on the fact that until we actually advance 'iter', no really costly operation is actually done
+                // (except for reading the partition key from the index file) due to the call to mergeLazily in queryStorage.
+                DecoratedKey dk = iter.partitionKey();
+
+                // Check if this partition is in the rowCache and if it is, if  it covers our filter
+                CachedPartition cached = cfs.getRawCachedPartition(dk);
+                ClusteringIndexFilter filter = dataRange().clusteringIndexFilter(dk);
+
+                if (cached != null && cfs.isFilterFullyCoveredBy(filter, limits(), cached, nowInSec()))
+                {
+                    // We won't use 'iter' so close it now.
+                    iter.close();
+
+                    return filter.getUnfilteredRowIterator(columnFilter(), cached);
+                }
+
+                return iter;
+            }
+        }
+        return Transformation.apply(iter, new CacheFilter());
+    }
+
+    public MessageOut<ReadCommand> createMessage(int version)
+    {
+        return dataRange().isPaging()
+             ? new MessageOut<>(MessagingService.Verb.PAGED_RANGE, this, pagedRangeSerializer)
+             : new MessageOut<>(MessagingService.Verb.RANGE_SLICE, this, rangeSliceSerializer);
+    }
+
+    protected void appendCQLWhereClause(StringBuilder sb)
+    {
+        if (dataRange.isUnrestricted() && rowFilter().isEmpty())
+            return;
+
+        sb.append(" WHERE ");
+        // We put the row filter first because the data range can end by "ORDER BY"
+        if (!rowFilter().isEmpty())
+        {
+            sb.append(rowFilter());
+            if (!dataRange.isUnrestricted())
+                sb.append(" AND ");
+        }
+        if (!dataRange.isUnrestricted())
+            sb.append(dataRange.toCQLString(metadata()));
+    }
+
+    /**
+     * Allow to post-process the result of the query after it has been reconciled on the coordinator
+     * but before it is passed to the CQL layer to return the ResultSet.
+     *
+     * See CASSANDRA-8717 for why this exists.
+     */
+    public PartitionIterator postReconciliationProcessing(PartitionIterator result)
+    {
+        ColumnFamilyStore cfs = Keyspace.open(metadata().ksName).getColumnFamilyStore(metadata().cfName);
+        Index index = getIndex(cfs);
+        return index == null ? result : index.postProcessorFor(this).apply(result, this);
+    }
+
+    @Override
+    public boolean selectsFullPartition()
+    {
+        return metadata().isStaticCompactTable() ||
+               (dataRange.selectsAllPartition() && !rowFilter().hasExpressionOnClusteringOrRegularColumns());
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Read(%s.%s columns=%s rowfilter=%s limits=%s %s)",
+                             metadata().ksName,
+                             metadata().cfName,
+                             columnFilter(),
+                             rowFilter(),
+                             limits(),
+                             dataRange().toString(metadata()));
+    }
+
+    protected void serializeSelection(DataOutputPlus out, int version) throws IOException
+    {
+        DataRange.serializer.serialize(dataRange(), out, version, metadata());
+    }
+
+    protected long selectionSerializedSize(int version)
+    {
+        return DataRange.serializer.serializedSize(dataRange(), version, metadata());
+    }
+
+    /*
+     * We are currently using PartitionRangeReadCommand for most index queries, even if they are explicitly restricted
+     * to a single partition key. Return true if that is the case.
+     *
+     * See CASSANDRA-11617 and CASSANDRA-11872 for details.
+     */
+    public boolean isLimitedToOnePartition()
+    {
+        return dataRange.keyRange instanceof Bounds
+            && dataRange.startKey().kind() == PartitionPosition.Kind.ROW_KEY
+            && dataRange.startKey().equals(dataRange.stopKey());
+    }
+
+    private static class Deserializer extends SelectionDeserializer
+    {
+        public ReadCommand deserialize(DataInputPlus in,
+                                       int version,
+                                       boolean isDigest,
+                                       int digestVersion,
+                                       boolean isForThrift,
+                                       CFMetaData metadata,
+                                       int nowInSec,
+                                       ColumnFilter columnFilter,
+                                       RowFilter rowFilter,
+                                       DataLimits limits,
+                                       IndexMetadata index)
+        throws IOException
+        {
+            DataRange range = DataRange.serializer.deserialize(in, version, metadata);
+            return new PartitionRangeReadCommand(isDigest, digestVersion, isForThrift, metadata, nowInSec, columnFilter, rowFilter, limits, range, index);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/RangeSliceCommand.java b/src/java/org/apache/cassandra/db/RangeSliceCommand.java
deleted file mode 100644
index 664eeee..0000000
--- a/src/java/org/apache/cassandra/db/RangeSliceCommand.java
+++ /dev/null

@@ -1,246 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.google.common.base.Objects;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.pager.Pageable;
-
-public class RangeSliceCommand extends AbstractRangeCommand implements Pageable
-{
-    public static final RangeSliceCommandSerializer serializer = new RangeSliceCommandSerializer();
-
-    public final int maxResults;
-    public final boolean countCQL3Rows;
-    public final boolean isPaging;
-
-    public RangeSliceCommand(String keyspace,
-                             String columnFamily,
-                             long timestamp,
-                             IDiskAtomFilter predicate,
-                             AbstractBounds<RowPosition> range,
-                             int maxResults)
-    {
-        this(keyspace, columnFamily, timestamp, predicate, range, null, maxResults, false, false);
-    }
-
-    public RangeSliceCommand(String keyspace,
-                             String columnFamily,
-                             long timestamp,
-                             IDiskAtomFilter predicate,
-                             AbstractBounds<RowPosition> range,
-                             List<IndexExpression> row_filter,
-                             int maxResults)
-    {
-        this(keyspace, columnFamily, timestamp, predicate, range, row_filter, maxResults, false, false);
-    }
-
-    public RangeSliceCommand(String keyspace,
-                             String columnFamily,
-                             long timestamp,
-                             IDiskAtomFilter predicate,
-                             AbstractBounds<RowPosition> range,
-                             List<IndexExpression> rowFilter,
-                             int maxResults,
-                             boolean countCQL3Rows,
-                             boolean isPaging)
-    {
-        super(keyspace, columnFamily, timestamp, range, predicate, rowFilter);
-        this.maxResults = maxResults;
-        this.countCQL3Rows = countCQL3Rows;
-        this.isPaging = isPaging;
-    }
-
-    public MessageOut<RangeSliceCommand> createMessage()
-    {
-        return new MessageOut<>(MessagingService.Verb.RANGE_SLICE, this, serializer);
-    }
-
-    public AbstractRangeCommand forSubRange(AbstractBounds<RowPosition> subRange)
-    {
-        return new RangeSliceCommand(keyspace,
-                                     columnFamily,
-                                     timestamp,
-                                     predicate.cloneShallow(),
-                                     subRange,
-                                     rowFilter,
-                                     maxResults,
-                                     countCQL3Rows,
-                                     isPaging);
-    }
-
-    public AbstractRangeCommand withUpdatedLimit(int newLimit)
-    {
-        return new RangeSliceCommand(keyspace,
-                                     columnFamily,
-                                     timestamp,
-                                     predicate.cloneShallow(),
-                                     keyRange,
-                                     rowFilter,
-                                     newLimit,
-                                     countCQL3Rows,
-                                     isPaging);
-    }
-
-    public int limit()
-    {
-        return maxResults;
-    }
-
-    public boolean countCQL3Rows()
-    {
-        return countCQL3Rows;
-    }
-
-    public List<Row> executeLocally()
-    {
-        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
-
-        ExtendedFilter exFilter = cfs.makeExtendedFilter(keyRange, predicate, rowFilter, maxResults, countCQL3Rows, isPaging, timestamp);
-        if (cfs.indexManager.hasIndexFor(rowFilter))
-            return cfs.search(exFilter);
-        else
-            return cfs.getRangeSlice(exFilter);
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this)
-                      .add("keyspace", keyspace)
-                      .add("columnFamily", columnFamily)
-                      .add("predicate", predicate)
-                      .add("keyRange", keyRange)
-                      .add("rowFilter", rowFilter)
-                      .add("maxResults", maxResults)
-                      .add("counterCQL3Rows", countCQL3Rows)
-                      .add("timestamp", timestamp)
-                      .toString();
-    }
-}
-
-class RangeSliceCommandSerializer implements IVersionedSerializer<RangeSliceCommand>
-{
-    public void serialize(RangeSliceCommand sliceCommand, DataOutputPlus out, int version) throws IOException
-    {
-        out.writeUTF(sliceCommand.keyspace);
-        out.writeUTF(sliceCommand.columnFamily);
-        out.writeLong(sliceCommand.timestamp);
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(sliceCommand.keyspace, sliceCommand.columnFamily);
-
-        metadata.comparator.diskAtomFilterSerializer().serialize(sliceCommand.predicate, out, version);
-
-        if (sliceCommand.rowFilter == null)
-        {
-            out.writeInt(0);
-        }
-        else
-        {
-            out.writeInt(sliceCommand.rowFilter.size());
-            for (IndexExpression expr : sliceCommand.rowFilter)
-            {
-                expr.writeTo(out);
-            }
-        }
-        MessagingService.validatePartitioner(sliceCommand.keyRange);
-        AbstractBounds.rowPositionSerializer.serialize(sliceCommand.keyRange, out, version);
-        out.writeInt(sliceCommand.maxResults);
-        out.writeBoolean(sliceCommand.countCQL3Rows);
-        out.writeBoolean(sliceCommand.isPaging);
-    }
-
-    public RangeSliceCommand deserialize(DataInput in, int version) throws IOException
-    {
-        String keyspace = in.readUTF();
-        String columnFamily = in.readUTF();
-        long timestamp = in.readLong();
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
-        if (metadata == null)
-        {
-            String message = String.format("Got range slice command for nonexistent table %s.%s.  If the table was just " +
-                    "created, this is likely due to the schema not being fully propagated.  Please wait for schema " +
-                    "agreement on table creation." , keyspace, columnFamily);
-            throw new UnknownColumnFamilyException(message, null);
-        }
-
-        IDiskAtomFilter predicate = metadata.comparator.diskAtomFilterSerializer().deserialize(in, version);
-
-        List<IndexExpression> rowFilter;
-        int filterCount = in.readInt();
-        rowFilter = new ArrayList<>(filterCount);
-        for (int i = 0; i < filterCount; i++)
-        {
-            rowFilter.add(IndexExpression.readFrom(in));
-        }
-        AbstractBounds<RowPosition> range = AbstractBounds.rowPositionSerializer.deserialize(in, MessagingService.globalPartitioner(), version);
-
-        int maxResults = in.readInt();
-        boolean countCQL3Rows = in.readBoolean();
-        boolean isPaging = in.readBoolean();
-        return new RangeSliceCommand(keyspace, columnFamily, timestamp, predicate, range, rowFilter, maxResults, countCQL3Rows, isPaging);
-    }
-
-    public long serializedSize(RangeSliceCommand rsc, int version)
-    {
-        long size = TypeSizes.NATIVE.sizeof(rsc.keyspace);
-        size += TypeSizes.NATIVE.sizeof(rsc.columnFamily);
-        size += TypeSizes.NATIVE.sizeof(rsc.timestamp);
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(rsc.keyspace, rsc.columnFamily);
-
-        IDiskAtomFilter filter = rsc.predicate;
-
-        size += metadata.comparator.diskAtomFilterSerializer().serializedSize(filter, version);
-
-        if (rsc.rowFilter == null)
-        {
-            size += TypeSizes.NATIVE.sizeof(0);
-        }
-        else
-        {
-            size += TypeSizes.NATIVE.sizeof(rsc.rowFilter.size());
-            for (IndexExpression expr : rsc.rowFilter)
-            {
-                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.column);
-                size += TypeSizes.NATIVE.sizeof(expr.operator.ordinal());
-                size += TypeSizes.NATIVE.sizeofWithShortLength(expr.value);
-            }
-        }
-        size += AbstractBounds.rowPositionSerializer.serializedSize(rsc.keyRange, version);
-        size += TypeSizes.NATIVE.sizeof(rsc.maxResults);
-        size += TypeSizes.NATIVE.sizeof(rsc.countCQL3Rows);
-        size += TypeSizes.NATIVE.sizeof(rsc.isPaging);
-        return size;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/RangeSliceReply.java b/src/java/org/apache/cassandra/db/RangeSliceReply.java
deleted file mode 100644
index ed1f523..0000000
--- a/src/java/org/apache/cassandra/db/RangeSliceReply.java
+++ /dev/null

@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.lang3.StringUtils;
-
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FastByteArrayInputStream;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-
-public class RangeSliceReply
-{
-    public static final RangeSliceReplySerializer serializer = new RangeSliceReplySerializer();
-
-    public final List<Row> rows;
-
-    public RangeSliceReply(List<Row> rows)
-    {
-        this.rows = rows;
-    }
-
-    public MessageOut<RangeSliceReply> createMessage()
-    {
-        return new MessageOut<RangeSliceReply>(MessagingService.Verb.REQUEST_RESPONSE, this, serializer);
-    }
-
-    @Override
-    public String toString()
-    {
-        return "RangeSliceReply{" +
-               "rows=" + StringUtils.join(rows, ",") +
-               '}';
-    }
-
-    public static RangeSliceReply read(byte[] body, int version) throws IOException
-    {
-        try (DataInputStream dis = new DataInputStream(new FastByteArrayInputStream(body)))
-        {
-            return serializer.deserialize(dis, version);
-        }
-    }
-
-    private static class RangeSliceReplySerializer implements IVersionedSerializer<RangeSliceReply>
-    {
-        public void serialize(RangeSliceReply rsr, DataOutputPlus out, int version) throws IOException
-        {
-            out.writeInt(rsr.rows.size());
-            for (Row row : rsr.rows)
-                Row.serializer.serialize(row, out, version);
-        }
-
-        public RangeSliceReply deserialize(DataInput in, int version) throws IOException
-        {
-            int rowCount = in.readInt();
-            List<Row> rows = new ArrayList<Row>(rowCount);
-            for (int i = 0; i < rowCount; i++)
-                rows.add(Row.serializer.deserialize(in, version));
-            return new RangeSliceReply(rows);
-        }
-
-        public long serializedSize(RangeSliceReply rsr, int version)
-        {
-            int size = TypeSizes.NATIVE.sizeof(rsr.rows.size());
-            for (Row row : rsr.rows)
-                size += Row.serializer.serializedSize(row, version);
-            return size;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/RangeSliceVerbHandler.java b/src/java/org/apache/cassandra/db/RangeSliceVerbHandler.java
new file mode 100644
index 0000000..55826f5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/RangeSliceVerbHandler.java

@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.apache.cassandra.io.IVersionedSerializer;
+
+public class RangeSliceVerbHandler extends ReadCommandVerbHandler
+{
+    @Override
+    protected IVersionedSerializer<ReadResponse> serializer()
+    {
+        return ReadResponse.rangeSliceSerializer;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/RangeTombstone.java b/src/java/org/apache/cassandra/db/RangeTombstone.java
index eecf801..4a26581 100644
--- a/src/java/org/apache/cassandra/db/RangeTombstone.java
+++ b/src/java/org/apache/cassandra/db/RangeTombstone.java

@@ -17,381 +17,209 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
-import java.security.MessageDigest;
-import java.util.*;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Objects;
 
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.OnDiskAtom.Serializer;
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.ISSTableSerializer;
-import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.Interval;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
 
-public class RangeTombstone extends Interval<Composite, DeletionTime> implements OnDiskAtom
+
+/**
+ * A range tombstone is a tombstone that covers a slice/range of rows.
+ * <p>
+ * Note that in most of the storage engine, a range tombstone is actually represented by its separated
+ * opening and closing bound, see {@link RangeTombstoneMarker}. So in practice, this is only used when
+ * full partitions are materialized in memory in a {@code Partition} object, and more precisely through
+ * the use of a {@code RangeTombstoneList} in a {@code DeletionInfo} object.
+ */
+public class RangeTombstone
 {
-    public RangeTombstone(Composite start, Composite stop, long markedForDeleteAt, int localDeletionTime)
-    {
-        this(start, stop, new DeletionTime(markedForDeleteAt, localDeletionTime));
-    }
+    private final Slice slice;
+    private final DeletionTime deletion;
 
-    public RangeTombstone(Composite start, Composite stop, DeletionTime delTime)
+    public RangeTombstone(Slice slice, DeletionTime deletion)
     {
-        super(start, stop, delTime);
-    }
-
-    public Composite name()
-    {
-        return min;
-    }
-
-    public int getLocalDeletionTime()
-    {
-        return data.localDeletionTime;
-    }
-
-    public long timestamp()
-    {
-        return data.markedForDeleteAt;
-    }
-
-    public void validateFields(CFMetaData metadata) throws MarshalException
-    {
-        metadata.comparator.validate(min);
-        metadata.comparator.validate(max);
-    }
-
-    public void updateDigest(MessageDigest digest)
-    {
-        digest.update(min.toByteBuffer().duplicate());
-        digest.update(max.toByteBuffer().duplicate());
-
-        try (DataOutputBuffer buffer = new DataOutputBuffer())
-        {
-            buffer.writeLong(data.markedForDeleteAt);
-            digest.update(buffer.getData(), 0, buffer.getLength());
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
+        this.slice = slice;
+        this.deletion = deletion;
     }
 
     /**
-     * This tombstone supersedes another one if it is more recent and cover a
-     * bigger range than rt.
+     * The slice of rows that is deleted by this range tombstone.
+     *
+     * @return the slice of rows that is deleted by this range tombstone.
      */
-    public boolean supersedes(RangeTombstone rt, Comparator<Composite> comparator)
+    public Slice deletedSlice()
     {
-        if (rt.data.markedForDeleteAt > data.markedForDeleteAt)
+        return slice;
+    }
+
+    /**
+     * The deletion time for this (range) tombstone.
+     *
+     * @return the deletion time for this range tombstone.
+     */
+    public DeletionTime deletionTime()
+    {
+        return deletion;
+    }
+
+    public String toString(ClusteringComparator comparator)
+    {
+        return slice.toString(comparator) + '@' + deletion;
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if(!(other instanceof RangeTombstone))
             return false;
 
-        return comparator.compare(min, rt.min) <= 0 && comparator.compare(max, rt.max) >= 0;
+        RangeTombstone that = (RangeTombstone)other;
+        return this.deletedSlice().equals(that.deletedSlice())
+            && this.deletionTime().equals(that.deletionTime());
     }
 
-    public boolean includes(Comparator<Composite> comparator, Composite name)
+    @Override
+    public int hashCode()
     {
-        return comparator.compare(name, min) >= 0 && comparator.compare(name, max) <= 0;
+        return Objects.hash(deletedSlice(), deletionTime());
     }
 
     /**
-     * Tracks opened RangeTombstones when iterating over a partition.
+     * The bound of a range tombstone.
      * <p>
-     * This tracker must be provided all the atoms of a given partition in
-     * order (to the {@code update} method). Given this, it keeps enough
-     * information to be able to decide if one of an atom is deleted (shadowed)
-     * by a previously open RT. One the tracker can prove a given range
-     * tombstone cannot be useful anymore (that is, as soon as we've seen an
-     * atom that is after the end of that RT), it discards this RT. In other
-     * words, the maximum memory used by this object should be proportional to
-     * the maximum number of RT that can be simultaneously open (and this
-     * should fairly low in practice).
+     * This is the same than for a slice but it includes "boundaries" between ranges. A boundary simply condensed
+     * a close and an opening "bound" into a single object. There is 2 main reasons for these "shortcut" boundaries:
+     *   1) When merging multiple iterators having range tombstones (that are represented by their start and end markers),
+     *      we need to know when a range is close on an iterator, if it is reopened right away. Otherwise, we cannot
+     *      easily produce the markers on the merged iterators within risking to fail the sorting guarantees of an
+     *      iterator. See this comment for more details: https://goo.gl/yyB5mR.
+     *   2) This saves some storage space.
      */
-    public static class Tracker
+    public static class Bound extends Slice.Bound
     {
-        private final Comparator<Composite> comparator;
+        public static final Serializer serializer = new Serializer();
 
-        // A list the currently open RTs. We keep the list sorted in order of growing end bounds as for a
-        // new atom, this allows to efficiently find the RTs that are now useless (if any). Also note that because
-        // atom are passed to the tracker in order, any RT that is tracked can be assumed as opened, i.e. we
-        // never have to test the RTs start since it's always assumed to be less than what we have.
-        // Also note that this will store expired RTs (#7810). Those will be of type ExpiredRangeTombstone and
-        // will be ignored by writeOpenedMarker.
-        private final List<RangeTombstone> openedTombstones = new LinkedList<>();
+        /** The smallest start bound, i.e. the one that starts before any row. */
+        public static final Bound BOTTOM = new Bound(Kind.INCL_START_BOUND, EMPTY_VALUES_ARRAY);
+        /** The biggest end bound, i.e. the one that ends after any row. */
+        public static final Bound TOP = new Bound(Kind.INCL_END_BOUND, EMPTY_VALUES_ARRAY);
 
-        // Holds tombstones that are processed but not yet written out. Delaying the write allows us to remove
-        // duplicate / completely covered tombstones.
-        // Sorted in open order (to be written in that order).
-        private final Set<RangeTombstone> unwrittenTombstones = new LinkedHashSet<>();
-
-        // Total number of atoms written by writeOpenedMarker().
-        private int atomCount;
-
-        /**
-         * Creates a new tracker given the table comparator.
-         *
-         * @param comparator the comparator for the table this will track atoms
-         * for. The tracker assumes that atoms will be later provided to the
-         * tracker in {@code comparator} order.
-         */
-        public Tracker(Comparator<Composite> comparator)
+        public Bound(Kind kind, ByteBuffer[] values)
         {
-            this.comparator = comparator;
+            super(kind, values);
+            assert values.length > 0 || !kind.isBoundary();
         }
 
-        /**
-         * Computes the RangeTombstone that are needed at the beginning of an index
-         * block starting with {@code firstColumn}.
-         *
-         * @return the total serialized size of said tombstones and write them to
-         * {@code out} it if isn't null.
-         */
-        public long writeOpenedMarkers(Composite startPos, DataOutputPlus out, OnDiskAtom.SerializerForWriting atomSerializer) throws IOException
+        public boolean isBoundary()
         {
-            long size = 0;
+            return kind.isBoundary();
+        }
 
-            for (RangeTombstone rt : openedTombstones)
+        public boolean isOpen(boolean reversed)
+        {
+            return kind.isOpen(reversed);
+        }
+
+        public boolean isClose(boolean reversed)
+        {
+            return kind.isClose(reversed);
+        }
+
+        public static RangeTombstone.Bound inclusiveOpen(boolean reversed, ByteBuffer[] boundValues)
+        {
+            return new Bound(reversed ? Kind.INCL_END_BOUND : Kind.INCL_START_BOUND, boundValues);
+        }
+
+        public static RangeTombstone.Bound exclusiveOpen(boolean reversed, ByteBuffer[] boundValues)
+        {
+            return new Bound(reversed ? Kind.EXCL_END_BOUND : Kind.EXCL_START_BOUND, boundValues);
+        }
+
+        public static RangeTombstone.Bound inclusiveClose(boolean reversed, ByteBuffer[] boundValues)
+        {
+            return new Bound(reversed ? Kind.INCL_START_BOUND : Kind.INCL_END_BOUND, boundValues);
+        }
+
+        public static RangeTombstone.Bound exclusiveClose(boolean reversed, ByteBuffer[] boundValues)
+        {
+            return new Bound(reversed ? Kind.EXCL_START_BOUND : Kind.EXCL_END_BOUND, boundValues);
+        }
+
+        public static RangeTombstone.Bound inclusiveCloseExclusiveOpen(boolean reversed, ByteBuffer[] boundValues)
+        {
+            return new Bound(reversed ? Kind.EXCL_END_INCL_START_BOUNDARY : Kind.INCL_END_EXCL_START_BOUNDARY, boundValues);
+        }
+
+        public static RangeTombstone.Bound exclusiveCloseInclusiveOpen(boolean reversed, ByteBuffer[] boundValues)
+        {
+            return new Bound(reversed ? Kind.INCL_END_EXCL_START_BOUNDARY : Kind.EXCL_END_INCL_START_BOUNDARY, boundValues);
+        }
+
+        public static RangeTombstone.Bound fromSliceBound(Slice.Bound sliceBound)
+        {
+            return new RangeTombstone.Bound(sliceBound.kind(), sliceBound.getRawValues());
+        }
+
+        public RangeTombstone.Bound copy(AbstractAllocator allocator)
+        {
+            ByteBuffer[] newValues = new ByteBuffer[size()];
+            for (int i = 0; i < size(); i++)
+                newValues[i] = allocator.clone(get(i));
+            return new Bound(kind(), newValues);
+        }
+
+        public ClusteringPrefix minimize()
+        {
+            if (!ByteBufferUtil.canMinimize(values))
+                return this;
+            return new Bound(kind, ByteBufferUtil.minimizeBuffers(values));
+        }
+
+        @Override
+        public Bound withNewKind(Kind kind)
+        {
+            return new Bound(kind, values);
+        }
+
+        public static class Serializer
+        {
+            public void serialize(RangeTombstone.Bound bound, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException
             {
-                if (rt instanceof ExpiredRangeTombstone || comparator.compare(rt.max, startPos) < 0)
-                    continue;
-
-                size += writeTombstone(rt, out, atomSerializer);
-            }
-            return size;
-        }
-
-        /**
-         * Writes out all tombstones that have been accepted after the previous call of this method.
-         * Tombstones are not written immediately to permit redundant ones to be skipped.
-         *
-         * @return the serialized size of written tombstones
-         */
-        public long writeUnwrittenTombstones(DataOutputPlus out, OnDiskAtom.SerializerForWriting atomSerializer) throws IOException
-        {
-            long size = 0;
-            for (RangeTombstone rt : unwrittenTombstones)
-            {
-                size += writeTombstone(rt, out, atomSerializer);
-            }
-            unwrittenTombstones.clear();
-            return size;
-        }
-
-        private long writeTombstone(RangeTombstone rt, DataOutputPlus out, OnDiskAtom.SerializerForWriting atomSerializer)
-                throws IOException
-        {
-            long size = atomSerializer.serializedSizeForSSTable(rt);
-            atomCount++;
-            if (out != null)
-                atomSerializer.serializeForSSTable(rt, out);
-            return size;
-        }
-
-        /**
-         * The total number of atoms written by calls to the above methods.
-         */
-        public int writtenAtom()
-        {
-            return atomCount;
-        }
-
-        /**
-         * Update this tracker given an {@code atom}.
-         * <p>
-         * This method first test if some range tombstone can be discarded due
-         * to the knowledge of that new atom. Then, if it's a range tombstone,
-         * it adds it to the tracker.
-         * <p>
-         * Note that this method should be called on *every* atom of a partition for
-         * the tracker to work as efficiently as possible (#9486).
-         */
-        public boolean update(OnDiskAtom atom, boolean isExpired)
-        {
-            // Get rid of now useless RTs
-            ListIterator<RangeTombstone> iterator = openedTombstones.listIterator();
-            while (iterator.hasNext())
-            {
-                // If this tombstone stops before the new atom, it is now useless since it cannot cover this or any future
-                // atoms. Otherwise, if a RT ends after the new atom, then we know that's true of any following atom too
-                // since maxOrderingSet is sorted by end bounds
-                RangeTombstone t = iterator.next();
-                if (comparator.compare(atom.name(), t.max) > 0)
-                {
-                    iterator.remove();
-                    // The iterator may still be in the unwrittenTombstones list. That's ok, it still needs to be written
-                    // but it can't influence anything else.
-                }
-                else
-                {
-                    // If the atom is a RT, we'll add it next and for that we want to start by looking at the atom we just
-                    // returned, so rewind the iterator.
-                    iterator.previous();
-                    break;
-                }
+                out.writeByte(bound.kind().ordinal());
+                out.writeShort(bound.size());
+                ClusteringPrefix.serializer.serializeValuesWithoutSize(bound, out, version, types);
             }
 
-            // If it's a RT, adds it.
-            if (atom instanceof RangeTombstone)
+            public long serializedSize(RangeTombstone.Bound bound, int version, List<AbstractType<?>> types)
             {
-                RangeTombstone toAdd = (RangeTombstone)atom;
-
-                // We want to maintain openedTombstones in end bounds order so we find where to insert the new element
-                // and add it. While doing so, we also check if that new tombstone fully shadow or is fully shadowed
-                // by an existing tombstone so we avoid tracking more tombstone than necessary (and we know this will
-                // at least happend for start-of-index-block repeated range tombstones).
-                while (iterator.hasNext())
-                {
-                    RangeTombstone existing = iterator.next();
-                    int cmp = comparator.compare(toAdd.max, existing.max);
-                    if (cmp > 0)
-                    {
-                        // the new one covers more than the existing one. If the new one happens to also supersedes
-                        // the existing one, remove the existing one. In any case, we're not done yet.
-                        if (!existing.data.supersedes(toAdd.data))
-                        {
-                            iterator.remove();
-                            // If the existing one starts at the same position as the new, it does not need to be written
-                            // (it won't have been yet).
-                            if (comparator.compare(toAdd.min, existing.min) == 0)
-                                unwrittenTombstones.remove(existing);
-                        }
-                    }
-                    else
-                    {
-                        // the new one is included in the existing one. If the new one supersedes the existing one,
-                        // then we add the new one (and if the new one ends like the existing one, we can actually remove
-                        // the existing one), otherwise we can actually ignore it. In any case, we're done.
-                        if (!toAdd.data.supersedes(existing.data))
-                            return false;
-
-                        if (cmp == 0)
-                        {
-                            iterator.remove();
-                            // If the existing one starts at the same position as the new, it does not need to be written
-                            // (it won't have been yet).
-                            if (comparator.compare(toAdd.min, existing.min) == 0)
-                                unwrittenTombstones.remove(existing);
-                        }
-                        else
-                        {
-                            iterator.previous();
-                        }
-                        // Found the insert position for the new tombstone
-                        break;
-                    }
-                }
-
-                if (isExpired)
-                    iterator.add(new ExpiredRangeTombstone(toAdd));
-                else
-                {
-                    iterator.add(toAdd);
-                    unwrittenTombstones.add(toAdd);
-                }
-                return false;
+                return 1 // kind ordinal
+                     + TypeSizes.sizeof((short)bound.size())
+                     + ClusteringPrefix.serializer.valuesWithoutSizeSerializedSize(bound, version, types);
             }
-            // Caller should write cell.
-            return true;
-        }
 
-        /**
-         * Tests if the provided column is deleted by one of the tombstone
-         * tracked by this tracker.
-         * <p>
-         * This method should be called on columns in the same order than for the update()
-         * method. Note that this method does not update the tracker so the update() method
-         * should still be called on {@code column} (it doesn't matter if update is called
-         * before or after this call).
-         */
-        public boolean isDeleted(Cell cell)
-        {
-            // We know every tombstone kept are "open", start before the column. So the
-            // column is deleted if any of the tracked tombstone ends after the column
-            // (this will be the case of every RT if update() has been called before this
-            // method, but we might have a few RT to skip otherwise) and the RT deletion is
-            // actually more recent than the column timestamp.
-            for (RangeTombstone tombstone : openedTombstones)
+            public RangeTombstone.Bound deserialize(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException
             {
-                if (comparator.compare(cell.name(), tombstone.max) <= 0
-                    && tombstone.timestamp() >= cell.timestamp())
-                    return true;
+                Kind kind = Kind.values()[in.readByte()];
+                return deserializeValues(in, kind, version, types);
             }
-            return false;
-        }
 
-        public boolean hasUnwrittenTombstones()
-        {
-            return !unwrittenTombstones.isEmpty();
-        }
-
-        /**
-         * The tracker needs to track expired range tombstone but keep tracks that they are
-         * expired, so this is what this class is used for.
-         */
-        private static class ExpiredRangeTombstone extends RangeTombstone
-        {
-            private ExpiredRangeTombstone(RangeTombstone tombstone)
+            public RangeTombstone.Bound deserializeValues(DataInputPlus in, Kind kind, int version,
+                    List<AbstractType<?>> types) throws IOException
             {
-                super(tombstone.min, tombstone.max, tombstone.data);
+                int size = in.readUnsignedShort();
+                if (size == 0)
+                    return kind.isStart() ? BOTTOM : TOP;
+
+                ByteBuffer[] values = ClusteringPrefix.serializer.deserializeValuesWithoutSize(in, size, version, types);
+                return new RangeTombstone.Bound(kind, values);
             }
         }
     }
-
-    public static class Serializer implements ISSTableSerializer<RangeTombstone>
-    {
-        private final CType type;
-
-        public Serializer(CType type)
-        {
-            this.type = type;
-        }
-
-        public void serializeForSSTable(RangeTombstone t, DataOutputPlus out) throws IOException
-        {
-            type.serializer().serialize(t.min, out);
-            out.writeByte(ColumnSerializer.RANGE_TOMBSTONE_MASK);
-            type.serializer().serialize(t.max, out);
-            DeletionTime.serializer.serialize(t.data, out);
-        }
-
-        public RangeTombstone deserializeFromSSTable(DataInput in, Version version) throws IOException
-        {
-            Composite min = type.serializer().deserialize(in);
-
-            int b = in.readUnsignedByte();
-            assert (b & ColumnSerializer.RANGE_TOMBSTONE_MASK) != 0;
-            return deserializeBody(in, min, version);
-        }
-
-        public RangeTombstone deserializeBody(DataInput in, Composite min, Version version) throws IOException
-        {
-            Composite max = type.serializer().deserialize(in);
-            DeletionTime dt = DeletionTime.serializer.deserialize(in);
-            // If the max equals the min.end(), we can avoid keeping an extra ByteBuffer in memory by using
-            // min.end() instead of max
-            Composite minEnd = min.end();
-            max = minEnd.equals(max) ? minEnd : max;
-            return new RangeTombstone(min, max, dt);
-        }
-
-        public void skipBody(DataInput in, Version version) throws IOException
-        {
-            type.serializer().skip(in);
-            DeletionTime.serializer.skip(in);
-        }
-
-        public long serializedSizeForSSTable(RangeTombstone t)
-        {
-            TypeSizes typeSizes = TypeSizes.NATIVE;
-            return type.serializer().serializedSize(t.min, typeSizes)
-                 + 1 // serialization flag
-                 + type.serializer().serializedSize(t.max, typeSizes)
-                 + DeletionTime.serializer.serializedSize(t.data, typeSizes);
-        }
-    }
 }

diff --git a/src/java/org/apache/cassandra/db/RangeTombstoneList.java b/src/java/org/apache/cassandra/db/RangeTombstoneList.java
index 37f1ef4..ad91e72 100644
--- a/src/java/org/apache/cassandra/db/RangeTombstoneList.java
+++ b/src/java/org/apache/cassandra/db/RangeTombstoneList.java

@@ -17,26 +17,16 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
-import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.security.MessageDigest;
 import java.util.Arrays;
-import java.util.Comparator;
+import java.util.Collections;
 import java.util.Iterator;
 
-import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.utils.AbstractIterator;
 import com.google.common.collect.Iterators;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.cache.IMeasurableMemory;
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.memory.AbstractAllocator;
 
@@ -58,23 +48,21 @@
  */
 public class RangeTombstoneList implements Iterable<RangeTombstone>, IMeasurableMemory
 {
-    private static final Logger logger = LoggerFactory.getLogger(RangeTombstoneList.class);
-
     private static long EMPTY_SIZE = ObjectSizes.measure(new RangeTombstoneList(null, 0));
 
-    private final Comparator<Composite> comparator;
+    private final ClusteringComparator comparator;
 
     // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could
     // use a List for starts and ends, but having arrays everywhere is almost simpler.
-    private Composite[] starts;
-    private Composite[] ends;
+    private Slice.Bound[] starts;
+    private Slice.Bound[] ends;
     private long[] markedAts;
     private int[] delTimes;
 
     private long boundaryHeapSize;
     private int size;
 
-    private RangeTombstoneList(Comparator<Composite> comparator, Composite[] starts, Composite[] ends, long[] markedAts, int[] delTimes, long boundaryHeapSize, int size)
+    private RangeTombstoneList(ClusteringComparator comparator, Slice.Bound[] starts, Slice.Bound[] ends, long[] markedAts, int[] delTimes, long boundaryHeapSize, int size)
     {
         assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length;
         this.comparator = comparator;
@@ -86,9 +74,9 @@
         this.boundaryHeapSize = boundaryHeapSize;
     }
 
-    public RangeTombstoneList(Comparator<Composite> comparator, int capacity)
+    public RangeTombstoneList(ClusteringComparator comparator, int capacity)
     {
-        this(comparator, new Composite[capacity], new Composite[capacity], new long[capacity], new int[capacity], 0, 0);
+        this(comparator, new Slice.Bound[capacity], new Slice.Bound[capacity], new long[capacity], new int[capacity], 0, 0);
     }
 
     public boolean isEmpty()
@@ -101,7 +89,7 @@
         return size;
     }
 
-    public Comparator<Composite> comparator()
+    public ClusteringComparator comparator()
     {
         return comparator;
     }
@@ -119,27 +107,36 @@
     public RangeTombstoneList copy(AbstractAllocator allocator)
     {
         RangeTombstoneList copy =  new RangeTombstoneList(comparator,
-                                      new Composite[size],
-                                      new Composite[size],
-                                      Arrays.copyOf(markedAts, size),
-                                      Arrays.copyOf(delTimes, size),
-                                      boundaryHeapSize, size);
+                                                          new Slice.Bound[size],
+                                                          new Slice.Bound[size],
+                                                          Arrays.copyOf(markedAts, size),
+                                                          Arrays.copyOf(delTimes, size),
+                                                          boundaryHeapSize, size);
 
 
         for (int i = 0; i < size; i++)
         {
-            assert !(starts[i] instanceof AbstractNativeCell || ends[i] instanceof AbstractNativeCell); //this should never happen
-
-            copy.starts[i] = starts[i].copy(null, allocator);
-            copy.ends[i] = ends[i].copy(null, allocator);
+            copy.starts[i] = clone(starts[i], allocator);
+            copy.ends[i] = clone(ends[i], allocator);
         }
 
         return copy;
     }
 
+    private static Slice.Bound clone(Slice.Bound bound, AbstractAllocator allocator)
+    {
+        ByteBuffer[] values = new ByteBuffer[bound.size()];
+        for (int i = 0; i < values.length; i++)
+            values[i] = allocator.clone(bound.get(i));
+        return new Slice.Bound(bound.kind(), values);
+    }
+
     public void add(RangeTombstone tombstone)
     {
-        add(tombstone.min, tombstone.max, tombstone.data.markedForDeleteAt, tombstone.data.localDeletionTime);
+        add(tombstone.deletedSlice().start(),
+            tombstone.deletedSlice().end(),
+            tombstone.deletionTime().markedForDeleteAt(),
+            tombstone.deletionTime().localDeletionTime());
     }
 
     /**
@@ -148,7 +145,7 @@
      * This method will be faster if the new tombstone sort after all the currently existing ones (this is a common use case),
      * but it doesn't assume it.
      */
-    public void add(Composite start, Composite end, long markedAt, int delTime)
+    public void add(Slice.Bound start, Slice.Bound end, long markedAt, int delTime)
     {
         if (isEmpty())
         {
@@ -159,7 +156,7 @@
         int c = comparator.compare(ends[size-1], start);
 
         // Fast path if we add in sorted order
-        if (c < 0)
+        if (c <= 0)
         {
             addInternal(size, start, end, markedAt, delTime);
         }
@@ -167,7 +164,7 @@
         {
             // Note: insertFrom expect i to be the insertion point in term of interval ends
             int pos = Arrays.binarySearch(ends, 0, size, start, comparator);
-            insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime);
+            insertFrom((pos >= 0 ? pos+1 : -pos-1), start, end, markedAt, delTime);
         }
         boundaryHeapSize += start.unsharedHeapSize() + end.unsharedHeapSize();
     }
@@ -215,7 +212,7 @@
             int j = 0;
             while (i < size && j < tombstones.size)
             {
-                if (comparator.compare(tombstones.starts[j], ends[i]) <= 0)
+                if (comparator.compare(tombstones.starts[j], ends[i]) < 0)
                 {
                     insertFrom(i, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]);
                     j++;
@@ -235,55 +232,46 @@
      * Returns whether the given name/timestamp pair is deleted by one of the tombstone
      * of this RangeTombstoneList.
      */
-    public boolean isDeleted(Cell cell)
+    public boolean isDeleted(Clustering clustering, Cell cell)
     {
-        int idx = searchInternal(cell.name(), 0);
+        int idx = searchInternal(clustering, 0, size);
         // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
-        return idx >= 0 && (cell instanceof CounterCell || markedAts[idx] >= cell.timestamp());
-    }
-
-    /**
-     * Returns a new {@link InOrderTester}.
-     */
-    InOrderTester inOrderTester()
-    {
-        return new InOrderTester();
+        return idx >= 0 && (cell.isCounterCell() || markedAts[idx] >= cell.timestamp());
     }
 
     /**
      * Returns the DeletionTime for the tombstone overlapping {@code name} (there can't be more than one),
      * or null if {@code name} is not covered by any tombstone.
      */
-    public DeletionTime searchDeletionTime(Composite name)
+    public DeletionTime searchDeletionTime(Clustering name)
     {
-        int idx = searchInternal(name, 0);
+        int idx = searchInternal(name, 0, size);
         return idx < 0 ? null : new DeletionTime(markedAts[idx], delTimes[idx]);
     }
 
-    public RangeTombstone search(Composite name)
+    public RangeTombstone search(Clustering name)
     {
-        int idx = searchInternal(name, 0);
+        int idx = searchInternal(name, 0, size);
         return idx < 0 ? null : rangeTombstone(idx);
     }
 
     /*
      * Return is the index of the range covering name if name is covered. If the return idx is negative,
      * no range cover name and -idx-1 is the index of the first range whose start is greater than name.
+     *
+     * Note that bounds are not in the range if they fall on its boundary.
      */
-    private int searchInternal(Composite name, int startIdx)
+    private int searchInternal(ClusteringPrefix name, int startIdx, int endIdx)
     {
         if (isEmpty())
             return -1;
 
-        int pos = Arrays.binarySearch(starts, startIdx, size, name, comparator);
+        int pos = Arrays.binarySearch(starts, startIdx, endIdx, name, comparator);
         if (pos >= 0)
         {
-            // We're exactly on an interval start. The one subtility is that we need to check if
-            // the previous is not equal to us and doesn't have a higher marked at
-            if (pos > 0 && comparator.compare(name, ends[pos-1]) == 0 && markedAts[pos-1] > markedAts[pos])
-                return pos-1;
-            else
-                return pos;
+            // Equality only happens for bounds (as used by forward/reverseIterator), and bounds are equal only if they
+            // are the same or complementary, in either case the bound itself is not part of the range.
+            return -pos - 1;
         }
         else
         {
@@ -292,30 +280,22 @@
             if (idx < 0)
                 return -1;
 
-            return comparator.compare(name, ends[idx]) <= 0 ? idx : -idx-2;
+            return comparator.compare(name, ends[idx]) < 0 ? idx : -idx-2;
         }
     }
 
     public int dataSize()
     {
-        int dataSize = TypeSizes.NATIVE.sizeof(size);
+        int dataSize = TypeSizes.sizeof(size);
         for (int i = 0; i < size; i++)
         {
             dataSize += starts[i].dataSize() + ends[i].dataSize();
-            dataSize += TypeSizes.NATIVE.sizeof(markedAts[i]);
-            dataSize += TypeSizes.NATIVE.sizeof(delTimes[i]);
+            dataSize += TypeSizes.sizeof(markedAts[i]);
+            dataSize += TypeSizes.sizeof(delTimes[i]);
         }
         return dataSize;
     }
 
-    public long minMarkedAt()
-    {
-        long min = Long.MAX_VALUE;
-        for (int i = 0; i < size; i++)
-            min = Math.min(min, markedAts[i]);
-        return min;
-    }
-
     public long maxMarkedAt()
     {
         long max = Long.MIN_VALUE;
@@ -324,77 +304,105 @@
         return max;
     }
 
+    public void collectStats(EncodingStats.Collector collector)
+    {
+        for (int i = 0; i < size; i++)
+        {
+            collector.updateTimestamp(markedAts[i]);
+            collector.updateLocalDeletionTime(delTimes[i]);
+        }
+    }
+
     public void updateAllTimestamp(long timestamp)
     {
         for (int i = 0; i < size; i++)
             markedAts[i] = timestamp;
     }
 
-    /**
-     * Removes all range tombstones whose local deletion time is older than gcBefore.
-     */
-    public void purge(int gcBefore)
-    {
-        int j = 0;
-        for (int i = 0; i < size; i++)
-        {
-            if (delTimes[i] >= gcBefore)
-                setInternal(j++, starts[i], ends[i], markedAts[i], delTimes[i]);
-        }
-        size = j;
-    }
-
-    /**
-     * Returns whether {@code purge(gcBefore)} would remove something or not.
-     */
-    public boolean hasPurgeableTombstones(int gcBefore)
-    {
-        for (int i = 0; i < size; i++)
-        {
-            if (delTimes[i] < gcBefore)
-                return true;
-        }
-        return false;
-    }
-
     private RangeTombstone rangeTombstone(int idx)
     {
-        return new RangeTombstone(starts[idx], ends[idx], markedAts[idx], delTimes[idx]);
+        return new RangeTombstone(Slice.make(starts[idx], ends[idx]), new DeletionTime(markedAts[idx], delTimes[idx]));
+    }
+
+    private RangeTombstone rangeTombstoneWithNewStart(int idx, Slice.Bound newStart)
+    {
+        return new RangeTombstone(Slice.make(newStart, ends[idx]), new DeletionTime(markedAts[idx], delTimes[idx]));
+    }
+
+    private RangeTombstone rangeTombstoneWithNewEnd(int idx, Slice.Bound newEnd)
+    {
+        return new RangeTombstone(Slice.make(starts[idx], newEnd), new DeletionTime(markedAts[idx], delTimes[idx]));
+    }
+
+    private RangeTombstone rangeTombstoneWithNewBounds(int idx, Slice.Bound newStart, Slice.Bound newEnd)
+    {
+        return new RangeTombstone(Slice.make(newStart, newEnd), new DeletionTime(markedAts[idx], delTimes[idx]));
     }
 
     public Iterator<RangeTombstone> iterator()
     {
-        return new AbstractIterator<RangeTombstone>()
-        {
-            private int idx;
-
-            protected RangeTombstone computeNext()
-            {
-                if (idx >= size)
-                    return endOfData();
-
-                return rangeTombstone(idx++);
-            }
-        };
+        return iterator(false);
     }
 
-    public Iterator<RangeTombstone> iterator(Composite from, Composite till)
+    public Iterator<RangeTombstone> iterator(boolean reversed)
     {
-        int startIdx = from.isEmpty() ? 0 : searchInternal(from, 0);
+        return reversed
+             ? new AbstractIterator<RangeTombstone>()
+             {
+                 private int idx = size - 1;
+
+                 protected RangeTombstone computeNext()
+                 {
+                     if (idx < 0)
+                         return endOfData();
+
+                     return rangeTombstone(idx--);
+                 }
+             }
+             : new AbstractIterator<RangeTombstone>()
+             {
+                 private int idx;
+
+                 protected RangeTombstone computeNext()
+                 {
+                     if (idx >= size)
+                         return endOfData();
+
+                     return rangeTombstone(idx++);
+                 }
+             };
+    }
+
+    public Iterator<RangeTombstone> iterator(final Slice slice, boolean reversed)
+    {
+        return reversed ? reverseIterator(slice) : forwardIterator(slice);
+    }
+
+    private Iterator<RangeTombstone> forwardIterator(final Slice slice)
+    {
+        int startIdx = slice.start() == Slice.Bound.BOTTOM ? 0 : searchInternal(slice.start(), 0, size);
         final int start = startIdx < 0 ? -startIdx-1 : startIdx;
 
         if (start >= size)
-            return Iterators.<RangeTombstone>emptyIterator();
+            return Collections.emptyIterator();
 
-        int finishIdx = till.isEmpty() ? size : searchInternal(till, start);
-        // if stopIdx is the first range after 'till' we care only until the previous range
+        int finishIdx = slice.end() == Slice.Bound.TOP ? size - 1 : searchInternal(slice.end(), start, size);
+        // if stopIdx is the first range after 'slice.end()' we care only until the previous range
         final int finish = finishIdx < 0 ? -finishIdx-2 : finishIdx;
 
-        // Note: the following is true because we know 'from' is before 'till' in sorted order.
         if (start > finish)
-            return Iterators.<RangeTombstone>emptyIterator();
-        else if (start == finish)
-            return Iterators.<RangeTombstone>singletonIterator(rangeTombstone(start));
+            return Collections.emptyIterator();
+
+        if (start == finish)
+        {
+            // We want to make sure the range are stricly included within the queried slice as this
+            // make it easier to combine things when iterating over successive slices.
+            Slice.Bound s = comparator.compare(starts[start], slice.start()) < 0 ? slice.start() : starts[start];
+            Slice.Bound e = comparator.compare(slice.end(), ends[start]) < 0 ? slice.end() : ends[start];
+            if (Slice.isEmpty(comparator, s, e))
+                return Collections.emptyIterator();
+            return Iterators.<RangeTombstone>singletonIterator(rangeTombstoneWithNewBounds(start, s, e));
+        }
 
         return new AbstractIterator<RangeTombstone>()
         {
@@ -405,77 +413,66 @@
                 if (idx >= size || idx > finish)
                     return endOfData();
 
+                // We want to make sure the range are stricly included within the queried slice as this
+                // make it easier to combine things when iterating over successive slices. This means that
+                // for the first and last range we might have to "cut" the range returned.
+                if (idx == start && comparator.compare(starts[idx], slice.start()) < 0)
+                    return rangeTombstoneWithNewStart(idx++, slice.start());
+                if (idx == finish && comparator.compare(slice.end(), ends[idx]) < 0)
+                    return rangeTombstoneWithNewEnd(idx++, slice.end());
                 return rangeTombstone(idx++);
             }
         };
     }
 
-    /**
-     * Evaluates a diff between superset (known to be all merged tombstones) and this list for read repair
-     *
-     * @return null if there is no difference
-     */
-    public RangeTombstoneList diff(RangeTombstoneList superset)
+    private Iterator<RangeTombstone> reverseIterator(final Slice slice)
     {
-        if (isEmpty())
-            return superset;
+        int startIdx = slice.end() == Slice.Bound.TOP ? size - 1 : searchInternal(slice.end(), 0, size);
+        // if startIdx is the first range after 'slice.end()' we care only until the previous range
+        final int start = startIdx < 0 ? -startIdx-2 : startIdx;
 
-        RangeTombstoneList diff = null;
+        if (start < 0)
+            return Collections.emptyIterator();
 
-        int j = 0; // index to iterate through our own list
-        for (int i = 0; i < superset.size; i++)
+        int finishIdx = slice.start() == Slice.Bound.BOTTOM ? 0 : searchInternal(slice.start(), 0, start + 1);  // include same as finish
+        // if stopIdx is the first range after 'slice.end()' we care only until the previous range
+        final int finish = finishIdx < 0 ? -finishIdx-1 : finishIdx;
+
+        if (start < finish)
+            return Collections.emptyIterator();
+
+        if (start == finish)
         {
-            // we can assume that this list is a subset of the superset list
-            while (j < size && comparator.compare(starts[j], superset.starts[i]) < 0)
-                j++;
-
-            if (j >= size)
-            {
-                // we're at the end of our own list, add the remainder of the superset to the diff
-                if (i < superset.size)
-                {
-                    if (diff == null)
-                        diff = new RangeTombstoneList(comparator, superset.size - i);
-
-                    for(int k = i; k < superset.size; k++)
-                        diff.add(superset.starts[k], superset.ends[k], superset.markedAts[k], superset.delTimes[k]);
-                }
-                return diff;
-            }
-
-            // we don't care about local deletion time here, because it doesn't matter for read repair
-            if (!starts[j].equals(superset.starts[i])
-                || !ends[j].equals(superset.ends[i])
-                || markedAts[j] != superset.markedAts[i])
-            {
-                if (diff == null)
-                    diff = new RangeTombstoneList(comparator, Math.min(8, superset.size - i));
-                diff.add(superset.starts[i], superset.ends[i], superset.markedAts[i], superset.delTimes[i]);
-            }
+            // We want to make sure the range are stricly included within the queried slice as this
+            // make it easier to combine things when iterator over successive slices.
+            Slice.Bound s = comparator.compare(starts[start], slice.start()) < 0 ? slice.start() : starts[start];
+            Slice.Bound e = comparator.compare(slice.end(), ends[start]) < 0 ? slice.end() : ends[start];
+            if (Slice.isEmpty(comparator, s, e))
+                return Collections.emptyIterator();
+            return Iterators.<RangeTombstone>singletonIterator(rangeTombstoneWithNewBounds(start, s, e));
         }
 
-        return diff;
-    }
-    
-    /**
-     * Calculates digest for triggering read repair on mismatch
-     */
-    public void updateDigest(MessageDigest digest)
-    {
-        ByteBuffer longBuffer = ByteBuffer.allocate(8);
-        for (int i = 0; i < size; i++)
+        return new AbstractIterator<RangeTombstone>()
         {
-            for (int j = 0; j < starts[i].size(); j++)
-                digest.update(starts[i].get(j).duplicate());
-            for (int j = 0; j < ends[i].size(); j++)
-                digest.update(ends[i].get(j).duplicate());
+            private int idx = start;
 
-            longBuffer.putLong(0, markedAts[i]);
-            digest.update(longBuffer.array(), 0, 8);
-        }
+            protected RangeTombstone computeNext()
+            {
+                if (idx < 0 || idx < finish)
+                    return endOfData();
+
+                // We want to make sure the range are stricly included within the queried slice as this
+                // make it easier to combine things when iterator over successive slices. This means that
+                // for the first and last range we might have to "cut" the range returned.
+                if (idx == start && comparator.compare(slice.end(), ends[idx]) < 0)
+                    return rangeTombstoneWithNewEnd(idx--, slice.end());
+                if (idx == finish && comparator.compare(starts[idx], slice.start()) < 0)
+                    return rangeTombstoneWithNewStart(idx--, slice.start());
+                return rangeTombstone(idx--);
+            }
+        };
     }
 
-
     @Override
     public boolean equals(Object o)
     {
@@ -484,7 +481,7 @@
         RangeTombstoneList that = (RangeTombstoneList)o;
         if (size != that.size)
             return false;
-        
+
         for (int i = 0; i < size; i++)
         {
             if (!starts[i].equals(that.starts[i]))
@@ -525,51 +522,26 @@
 
     /*
      * Inserts a new element starting at index i. This method assumes that:
-     *    ends[i-1] <= start <= ends[i]
+     *    ends[i-1] <= start < ends[i]
+     * (note that start can be equal to ends[i-1] in the case where we have a boundary, i.e. for instance
+     * ends[i-1] is the exclusive end of X and start is the inclusive start of X).
      *
      * A RangeTombstoneList is a list of range [s_0, e_0]...[s_n, e_n] such that:
-     *   - s_i <= e_i
+     *   - s_i is a start bound and e_i is a end bound
+     *   - s_i < e_i
      *   - e_i <= s_i+1
-     *   - if s_i == e_i and e_i == s_i+1 then s_i+1 < e_i+1
-     * Basically, range are non overlapping except for their bound and in order. And while
-     * we allow ranges with the same value for the start and end, we don't allow repeating
-     * such range (so we can't have [0, 0][0, 0] even though it would respect the first 2
-     * conditions).
-     *
+     * Basically, range are non overlapping and in order.
      */
-    private void insertFrom(int i, Composite start, Composite end, long markedAt, int delTime)
+    private void insertFrom(int i, Slice.Bound start, Slice.Bound end, long markedAt, int delTime)
     {
         while (i < size)
         {
+            assert start.isStart() && end.isEnd();
             assert i == 0 || comparator.compare(ends[i-1], start) <= 0;
+            assert comparator.compare(start, ends[i]) < 0;
 
-            int c = comparator.compare(start, ends[i]);
-            assert c <= 0;
-            if (c == 0)
-            {
-                // If start == ends[i], then we can insert from the next one (basically the new element
-                // really start at the next element), except for the case where starts[i] == ends[i].
-                // In this latter case, if we were to move to next element, we could end up with ...[x, x][x, x]...
-                if (comparator.compare(starts[i], ends[i]) == 0)
-                {
-                    // The current element cover a single value which is equal to the start of the inserted
-                    // element. If the inserted element overwrites the current one, just remove the current
-                    // (it's included in what we insert) and proceed with the insert.
-                    if (markedAt > markedAts[i])
-                    {
-                        removeInternal(i);
-                        continue;
-                    }
-
-                    // Otherwise (the current singleton interval override the new one), we want to leave the
-                    // current element and move to the next, unless start == end since that means the new element
-                    // is in fact fully covered by the current one (so we're done)
-                    if (comparator.compare(start, end) == 0)
-                        return;
-                }
-                i++;
-                continue;
-            }
+            if (Slice.isEmpty(comparator, start, end))
+                return;
 
             // Do we overwrite the current element?
             if (markedAt > markedAts[i])
@@ -579,26 +551,24 @@
                 // First deal with what might come before the newly added one.
                 if (comparator.compare(starts[i], start) < 0)
                 {
-                    addInternal(i, starts[i], start, markedAts[i], delTimes[i]);
-                    i++;
-                    // We don't need to do the following line, but in spirit that's what we want to do
-                    // setInternal(i, start, ends[i], markedAts, delTime])
+                    Slice.Bound newEnd = start.invert();
+                    if (!Slice.isEmpty(comparator, starts[i], newEnd))
+                    {
+                        addInternal(i, starts[i], start.invert(), markedAts[i], delTimes[i]);
+                        i++;
+                        setInternal(i, start, ends[i], markedAts[i], delTimes[i]);
+                    }
                 }
 
                 // now, start <= starts[i]
 
-                // Does the new element stops before/at the current one,
+                // Does the new element stops before the current one,
                 int endCmp = comparator.compare(end, starts[i]);
-                if (endCmp <= 0)
+                if (endCmp < 0)
                 {
-                    // Here start <= starts[i] and end <= starts[i]
-                    // This means the current element is before the current one. However, one special
-                    // case is if end == starts[i] and starts[i] == ends[i]. In that case,
-                    // the new element entirely overwrite the current one and we can just overwrite
-                    if (endCmp == 0 && comparator.compare(starts[i], ends[i]) == 0)
-                        setInternal(i, start, end, markedAt, delTime);
-                    else
-                        addInternal(i, start, end, markedAt, delTime);
+                    // Here start <= starts[i] and end < starts[i]
+                    // This means the current element is before the current one.
+                    addInternal(i, start, end, markedAt, delTime);
                     return;
                 }
 
@@ -607,30 +577,33 @@
                 if (cmp <= 0)
                 {
                     // We do overwrite fully:
-                    // update the current element until it's end and continue
-                    // on with the next element (with the new inserted start == current end).
+                    // update the current element until it's end and continue on with the next element (with the new inserted start == current end).
 
-                    // If we're on the last element, we can optimize
-                    if (i == size-1)
+                    // If we're on the last element, or if we stop before the next start, we set the current element and are done
+                    // Note that the comparison below is inclusive: if a end equals a start, this means they form a boundary, or
+                    // in other words that they are for the same element but one is inclusive while the other exclusive. In which case we know
+                    // we're good with the next element
+                    if (i == size-1 || comparator.compare(end, starts[i+1]) <= 0)
                     {
                         setInternal(i, start, end, markedAt, delTime);
                         return;
                     }
 
-                    setInternal(i, start, ends[i], markedAt, delTime);
-                    if (cmp == 0)
-                        return;
-
-                    start = ends[i];
+                    setInternal(i, start, starts[i+1].invert(), markedAt, delTime);
+                    start = starts[i+1];
                     i++;
                 }
                 else
                 {
-                    // We don't ovewrite fully. Insert the new interval, and then update the now next
+                    // We don't overwrite fully. Insert the new interval, and then update the now next
                     // one to reflect the not overwritten parts. We're then done.
                     addInternal(i, start, end, markedAt, delTime);
                     i++;
-                    setInternal(i, end, ends[i], markedAts[i], delTimes[i]);
+                    Slice.Bound newStart = end.invert();
+                    if (!Slice.isEmpty(comparator, newStart, ends[i]))
+                    {
+                        setInternal(i, newStart, ends[i], markedAts[i], delTimes[i]);
+                    }
                     return;
                 }
             }
@@ -641,16 +614,19 @@
                 // If the new interval starts before the current one, insert that new interval
                 if (comparator.compare(start, starts[i]) < 0)
                 {
-                    // If we stop before the start of the current element, just insert the new
-                    // interval and we're done; otherwise insert until the beginning of the
-                    // current element
+                    // If we stop before the start of the current element, just insert the new interval and we're done;
+                    // otherwise insert until the beginning of the current element
                     if (comparator.compare(end, starts[i]) <= 0)
                     {
                         addInternal(i, start, end, markedAt, delTime);
                         return;
                     }
-                    addInternal(i, start, starts[i], markedAt, delTime);
-                    i++;
+                    Slice.Bound newEnd = starts[i].invert();
+                    if (!Slice.isEmpty(comparator, start, newEnd))
+                    {
+                        addInternal(i, start, newEnd, markedAt, delTime);
+                        i++;
+                    }
                 }
 
                 // After that, we're overwritten on the current element but might have
@@ -660,7 +636,7 @@
                 if (comparator.compare(end, ends[i]) <= 0)
                     return;
 
-                start = ends[i];
+                start = ends[i].invert();
                 i++;
             }
         }
@@ -677,7 +653,7 @@
     /*
      * Adds the new tombstone at index i, growing and/or moving elements to make room for it.
      */
-    private void addInternal(int i, Composite start, Composite end, long markedAt, int delTime)
+    private void addInternal(int i, Slice.Bound start, Slice.Bound end, long markedAt, int delTime)
     {
         assert i >= 0;
 
@@ -690,20 +666,6 @@
         size++;
     }
 
-    private void removeInternal(int i)
-    {
-        assert i >= 0;
-
-        System.arraycopy(starts, i+1, starts, i, size - i - 1);
-        System.arraycopy(ends, i+1, ends, i, size - i - 1);
-        System.arraycopy(markedAts, i+1, markedAts, i, size - i - 1);
-        System.arraycopy(delTimes, i+1, delTimes, i, size - i - 1);
-
-        --size;
-        starts[size] = null;
-        ends[size] = null;
-    }
-
     /*
      * Grow the arrays, leaving index i "free" in the process.
      */
@@ -730,12 +692,12 @@
         delTimes = grow(delTimes, size, newLength, i);
     }
 
-    private static Composite[] grow(Composite[] a, int size, int newLength, int i)
+    private static Slice.Bound[] grow(Slice.Bound[] a, int size, int newLength, int i)
     {
         if (i < 0 || i >= size)
             return Arrays.copyOf(a, newLength);
 
-        Composite[] newA = new Composite[newLength];
+        Slice.Bound[] newA = new Slice.Bound[newLength];
         System.arraycopy(a, 0, newA, 0, i);
         System.arraycopy(a, i, newA, i+1, size - i);
         return newA;
@@ -780,7 +742,7 @@
         starts[i] = null;
     }
 
-    private void setInternal(int i, Composite start, Composite end, long markedAt, int delTime)
+    private void setInternal(int i, Slice.Bound start, Slice.Bound end, long markedAt, int delTime)
     {
         if (starts[i] != null)
             boundaryHeapSize -= starts[i].unsharedHeapSize() + ends[i].unsharedHeapSize();
@@ -801,140 +763,4 @@
                 + ObjectSizes.sizeOfArray(markedAts)
                 + ObjectSizes.sizeOfArray(delTimes);
     }
-
-    public static class Serializer implements IVersionedSerializer<RangeTombstoneList>
-    {
-        private final CType type;
-
-        public Serializer(CType type)
-        {
-            this.type = type;
-        }
-
-        public void serialize(RangeTombstoneList tombstones, DataOutputPlus out, int version) throws IOException
-        {
-            if (tombstones == null)
-            {
-                out.writeInt(0);
-                return;
-            }
-
-            out.writeInt(tombstones.size);
-            for (int i = 0; i < tombstones.size; i++)
-            {
-                type.serializer().serialize(tombstones.starts[i], out);
-                type.serializer().serialize(tombstones.ends[i], out);
-                out.writeInt(tombstones.delTimes[i]);
-                out.writeLong(tombstones.markedAts[i]);
-            }
-        }
-
-        public RangeTombstoneList deserialize(DataInput in, int version) throws IOException
-        {
-            int size = in.readInt();
-            if (size == 0)
-                return null;
-
-            RangeTombstoneList tombstones = new RangeTombstoneList(type, size);
-
-            for (int i = 0; i < size; i++)
-            {
-                Composite start = type.serializer().deserialize(in);
-                Composite end = type.serializer().deserialize(in);
-                int delTime =  in.readInt();
-                long markedAt = in.readLong();
-
-                if (version >= MessagingService.VERSION_20)
-                {
-                    tombstones.setInternal(i, start, end, markedAt, delTime);
-                }
-                else
-                {
-                    /*
-                     * The old implementation used to have range sorted by left value, but with potentially
-                     * overlapping range. So we need to use the "slow" path.
-                     */
-                    tombstones.add(start, end, markedAt, delTime);
-                }
-            }
-
-            // The "slow" path take care of updating the size, but not the fast one
-            if (version >= MessagingService.VERSION_20)
-                tombstones.size = size;
-            return tombstones;
-        }
-
-        public long serializedSize(RangeTombstoneList tombstones, TypeSizes typeSizes, int version)
-        {
-            if (tombstones == null)
-                return typeSizes.sizeof(0);
-
-            long size = typeSizes.sizeof(tombstones.size);
-            for (int i = 0; i < tombstones.size; i++)
-            {
-                size += type.serializer().serializedSize(tombstones.starts[i], typeSizes);
-                size += type.serializer().serializedSize(tombstones.ends[i], typeSizes);
-                size += typeSizes.sizeof(tombstones.delTimes[i]);
-                size += typeSizes.sizeof(tombstones.markedAts[i]);
-            }
-            return size;
-        }
-
-        public long serializedSize(RangeTombstoneList tombstones, int version)
-        {
-            return serializedSize(tombstones, TypeSizes.NATIVE, version);
-        }
-    }
-
-    /**
-     * This object allow testing whether a given column (name/timestamp) is deleted
-     * or not by this RangeTombstoneList, assuming that the column given to this
-     * object are passed in (comparator) sorted order.
-     *
-     * This is more efficient that calling RangeTombstoneList.isDeleted() repeatedly
-     * in that case since we're able to take the sorted nature of the RangeTombstoneList
-     * into account.
-     */
-    public class InOrderTester
-    {
-        private int idx;
-
-        public boolean isDeleted(Cell cell)
-        {
-            CellName name = cell.name();
-            long timestamp = cell.timestamp();
-
-            while (idx < size)
-            {
-                int cmp = comparator.compare(name, starts[idx]);
-
-                if (cmp < 0)
-                {
-                    return false;
-                }
-                else if (cmp == 0)
-                {
-                    // No matter what the counter cell's timestamp is, a tombstone always takes precedence. See CASSANDRA-7346.
-                    if (cell instanceof CounterCell)
-                        return true;
-
-                    // As for searchInternal, we need to check the previous end
-                    if (idx > 0 && comparator.compare(name, ends[idx-1]) == 0 && markedAts[idx-1] > markedAts[idx])
-                        return markedAts[idx-1] >= timestamp;
-                    else
-                        return markedAts[idx] >= timestamp;
-                }
-                else
-                {
-                    if (comparator.compare(name, ends[idx]) <= 0)
-                        return markedAts[idx] >= timestamp || cell instanceof CounterCell;
-                    else
-                        idx++;
-                }
-            }
-
-            return false;
-        }
-    }
-
 }

diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index cd86336..81b6803 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java

@@ -17,160 +17,1703 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.Predicate;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
+import javax.annotation.Nullable;
+
+import com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.RTBoundCloser;
+import org.apache.cassandra.db.transform.RTBoundValidator;
+import org.apache.cassandra.db.transform.RTBoundValidator.Stage;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.IndexNotAvailableException;
+import org.apache.cassandra.io.ForwardingVersionedSerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.metrics.TableMetrics;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.IReadCommand;
-import org.apache.cassandra.service.RowDataResolver;
-import org.apache.cassandra.service.pager.Pageable;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.UnknownIndexException;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
 
-public abstract class ReadCommand implements IReadCommand, Pageable
+/**
+ * General interface for storage-engine read commands (common to both range and
+ * single partition commands).
+ * <p>
+ * This contains all the informations needed to do a local read.
+ */
+public abstract class ReadCommand implements ReadQuery
 {
-    public enum Type
+    protected static final Logger logger = LoggerFactory.getLogger(ReadCommand.class);
+
+    public static final IVersionedSerializer<ReadCommand> serializer = new Serializer();
+
+    // For READ verb: will either dispatch on 'serializer' for 3.0 or 'legacyReadCommandSerializer' for earlier version.
+    // Can be removed (and replaced by 'serializer') once we drop pre-3.0 backward compatibility.
+    public static final IVersionedSerializer<ReadCommand> readSerializer = new ForwardingVersionedSerializer<ReadCommand>()
+    {
+        protected IVersionedSerializer<ReadCommand> delegate(int version)
+        {
+            return version < MessagingService.VERSION_30
+                    ? legacyReadCommandSerializer : serializer;
+        }
+    };
+
+    // For RANGE_SLICE verb: will either dispatch on 'serializer' for 3.0 or 'legacyRangeSliceCommandSerializer' for earlier version.
+    // Can be removed (and replaced by 'serializer') once we drop pre-3.0 backward compatibility.
+    public static final IVersionedSerializer<ReadCommand> rangeSliceSerializer = new ForwardingVersionedSerializer<ReadCommand>()
+    {
+        protected IVersionedSerializer<ReadCommand> delegate(int version)
+        {
+            return version < MessagingService.VERSION_30
+                    ? legacyRangeSliceCommandSerializer : serializer;
+        }
+    };
+
+    // For PAGED_RANGE verb: will either dispatch on 'serializer' for 3.0 or 'legacyPagedRangeCommandSerializer' for earlier version.
+    // Can be removed (and replaced by 'serializer') once we drop pre-3.0 backward compatibility.
+    public static final IVersionedSerializer<ReadCommand> pagedRangeSerializer = new ForwardingVersionedSerializer<ReadCommand>()
+    {
+        protected IVersionedSerializer<ReadCommand> delegate(int version)
+        {
+            return version < MessagingService.VERSION_30
+                    ? legacyPagedRangeCommandSerializer : serializer;
+        }
+    };
+
+    public static final IVersionedSerializer<ReadCommand> legacyRangeSliceCommandSerializer = new LegacyRangeSliceCommandSerializer();
+    public static final IVersionedSerializer<ReadCommand> legacyPagedRangeCommandSerializer = new LegacyPagedRangeCommandSerializer();
+    public static final IVersionedSerializer<ReadCommand> legacyReadCommandSerializer = new LegacyReadCommandSerializer();
+
+    private final Kind kind;
+    private final CFMetaData metadata;
+    private final int nowInSec;
+
+    private final ColumnFilter columnFilter;
+    private final RowFilter rowFilter;
+    private final DataLimits limits;
+
+    private final boolean isDigestQuery;
+    // if a digest query, the version for which the digest is expected. Ignored if not a digest.
+    private int digestVersion;
+    private final boolean isForThrift;
+
+    @Nullable
+    private final IndexMetadata index;
+
+    protected static abstract class SelectionDeserializer
+    {
+        public abstract ReadCommand deserialize(DataInputPlus in,
+                                                int version,
+                                                boolean isDigest,
+                                                int digestVersion,
+                                                boolean isForThrift,
+                                                CFMetaData metadata,
+                                                int nowInSec,
+                                                ColumnFilter columnFilter,
+                                                RowFilter rowFilter,
+                                                DataLimits limits,
+                                                IndexMetadata index) throws IOException;
+    }
+
+    protected enum Kind
+    {
+        SINGLE_PARTITION (SinglePartitionReadCommand.selectionDeserializer),
+        PARTITION_RANGE  (PartitionRangeReadCommand.selectionDeserializer);
+
+        private final SelectionDeserializer selectionDeserializer;
+
+        Kind(SelectionDeserializer selectionDeserializer)
+        {
+            this.selectionDeserializer = selectionDeserializer;
+        }
+    }
+
+    protected ReadCommand(Kind kind,
+                          boolean isDigestQuery,
+                          int digestVersion,
+                          boolean isForThrift,
+                          CFMetaData metadata,
+                          int nowInSec,
+                          ColumnFilter columnFilter,
+                          RowFilter rowFilter,
+                          DataLimits limits,
+                          IndexMetadata index)
+    {
+        this.kind = kind;
+        this.isDigestQuery = isDigestQuery;
+        this.digestVersion = digestVersion;
+        this.isForThrift = isForThrift;
+        this.metadata = metadata;
+        this.nowInSec = nowInSec;
+        this.columnFilter = columnFilter;
+        this.rowFilter = rowFilter;
+        this.limits = limits;
+        this.index = index;
+    }
+
+    protected abstract void serializeSelection(DataOutputPlus out, int version) throws IOException;
+    protected abstract long selectionSerializedSize(int version);
+
+    public abstract boolean isLimitedToOnePartition();
+
+    /**
+     * The metadata for the table queried.
+     *
+     * @return the metadata for the table queried.
+     */
+    public CFMetaData metadata()
+    {
+        return metadata;
+    }
+
+    /**
+     * The time in seconds to use as "now" for this query.
+     * <p>
+     * We use the same time as "now" for the whole query to avoid considering different
+     * values as expired during the query, which would be buggy (would throw of counting amongst other
+     * things).
+     *
+     * @return the time (in seconds) to use as "now".
+     */
+    public int nowInSec()
+    {
+        return nowInSec;
+    }
+
+    /**
+     * The configured timeout for this command.
+     *
+     * @return the configured timeout for this command.
+     */
+    public abstract long getTimeout();
+
+    /**
+     * A filter on which (non-PK) columns must be returned by the query.
+     *
+     * @return which columns must be fetched by this query.
+     */
+    public ColumnFilter columnFilter()
+    {
+        return columnFilter;
+    }
+
+    /**
+     * Filters/Resrictions on CQL rows.
+     * <p>
+     * This contains the restrictions that are not directly handled by the
+     * {@code ClusteringIndexFilter}. More specifically, this includes any non-PK column
+     * restrictions and can include some PK columns restrictions when those can't be
+     * satisfied entirely by the clustering index filter (because not all clustering columns
+     * have been restricted for instance). If there is 2ndary indexes on the table,
+     * one of this restriction might be handled by a 2ndary index.
+     *
+     * @return the filter holding the expression that rows must satisfy.
+     */
+    public RowFilter rowFilter()
+    {
+        return rowFilter;
+    }
+
+    /**
+     * The limits set on this query.
+     *
+     * @return the limits set on this query.
+     */
+    public DataLimits limits()
+    {
+        return limits;
+    }
+
+    /**
+     * Whether this query is a digest one or not.
+     *
+     * @return Whether this query is a digest query.
+     */
+    public boolean isDigestQuery()
+    {
+        return isDigestQuery;
+    }
+
+    /**
+     * If the query is a digest one, the requested digest version.
+     *
+     * @return the requested digest version if the query is a digest. Otherwise, this can return
+     * anything.
+     */
+    public int digestVersion()
+    {
+        return digestVersion;
+    }
+
+    /**
+     * Sets the digest version, for when digest for that command is requested.
+     * <p>
+     * Note that we allow setting this independently of setting the command as a digest query as
+     * this allows us to use the command as a carrier of the digest version even if we only call
+     * setIsDigestQuery on some copy of it.
+     *
+     * @param digestVersion the version for the digest is this command is used for digest query..
+     * @return this read command.
+     */
+    public ReadCommand setDigestVersion(int digestVersion)
+    {
+        this.digestVersion = digestVersion;
+        return this;
+    }
+
+    /**
+     * Whether this query is for thrift or not.
+     *
+     * @return whether this query is for thrift.
+     */
+    public boolean isForThrift()
+    {
+        return isForThrift;
+    }
+
+    /**
+     * Index (metadata) chosen for this query. Can be null.
+     *
+     * @return index (metadata) chosen for this query
+     */
+    @Nullable
+    public IndexMetadata indexMetadata()
+    {
+        return index;
+    }
+
+    /**
+     * The clustering index filter this command to use for the provided key.
+     * <p>
+     * Note that that method should only be called on a key actually queried by this command
+     * and in practice, this will almost always return the same filter, but for the sake of
+     * paging, the filter on the first key of a range command might be slightly different.
+     *
+     * @param key a partition key queried by this command.
+     *
+     * @return the {@code ClusteringIndexFilter} to use for the partition of key {@code key}.
+     */
+    public abstract ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key);
+
+    /**
+     * Returns a copy of this command.
+     *
+     * @return a copy of this command.
+     */
+    public abstract ReadCommand copy();
+
+    /**
+     * Returns a copy of this command with isDigestQuery set to true.
+     */
+    public abstract ReadCommand copyAsDigestQuery();
+
+    protected abstract UnfilteredPartitionIterator queryStorage(ColumnFamilyStore cfs, ReadOrderGroup orderGroup);
+
+    protected abstract int oldestUnrepairedTombstone();
+
+    /**
+     * Whether the underlying {@code ClusteringIndexFilter} is reversed or not.
+     *
+     * @return whether the underlying {@code ClusteringIndexFilter} is reversed or not.
+     */
+    public abstract boolean isReversed();
+
+    public ReadResponse createResponse(UnfilteredPartitionIterator iterator)
+    {
+        // validate that the sequence of RT markers is correct: open is followed by close, deletion times for both
+        // ends equal, and there are no dangling RT bound in any partition.
+        iterator = RTBoundValidator.validate(iterator, Stage.PROCESSED, true);
+
+        return isDigestQuery()
+             ? ReadResponse.createDigestResponse(iterator, this)
+             : ReadResponse.createDataResponse(iterator, this);
+    }
+
+    long indexSerializedSize(int version)
+    {
+        return null != index
+             ? IndexMetadata.serializer.serializedSize(index, version)
+             : 0;
+    }
+
+    public Index getIndex(ColumnFamilyStore cfs)
+    {
+        return null != index
+             ? cfs.indexManager.getIndex(index)
+             : null;
+    }
+
+    static IndexMetadata findIndex(CFMetaData table, RowFilter rowFilter)
+    {
+        if (table.getIndexes().isEmpty() || rowFilter.isEmpty())
+            return null;
+
+        ColumnFamilyStore cfs = Keyspace.openAndGetStore(table);
+
+        Index index = cfs.indexManager.getBestIndexFor(rowFilter);
+
+        return null != index
+             ? index.getIndexMetadata()
+             : null;
+    }
+
+    /**
+     * If the index manager for the CFS determines that there's an applicable
+     * 2i that can be used to execute this command, call its (optional)
+     * validation method to check that nothing in this command's parameters
+     * violates the implementation specific validation rules.
+     */
+    public void maybeValidateIndex()
+    {
+        Index index = getIndex(Keyspace.openAndGetStore(metadata));
+        if (null != index)
+            index.validate(this);
+    }
+
+    /**
+     * Executes this command on the local host.
+     *
+     * @param orderGroup the operation group spanning this command
+     *
+     * @return an iterator over the result of executing this command locally.
+     */
+    @SuppressWarnings("resource") // The result iterator is closed upon exceptions (we know it's fine to potentially not close the intermediary
+                                  // iterators created inside the try as long as we do close the original resultIterator), or by closing the result.
+    public UnfilteredPartitionIterator executeLocally(ReadOrderGroup orderGroup)
+    {
+        long startTimeNanos = System.nanoTime();
+
+        ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata());
+        Index index = getIndex(cfs);
+
+        Index.Searcher searcher = null;
+        if (index != null)
+        {
+            if (!cfs.indexManager.isIndexQueryable(index))
+                throw new IndexNotAvailableException(index);
+
+            searcher = index.searcherFor(this);
+            Tracing.trace("Executing read on {}.{} using index {}", cfs.metadata.ksName, cfs.metadata.cfName, index.getIndexMetadata().name);
+        }
+
+        UnfilteredPartitionIterator iterator = (null == searcher) ? queryStorage(cfs, orderGroup) : searcher.search(orderGroup);
+        iterator = RTBoundValidator.validate(iterator, Stage.MERGED, false);
+
+        try
+        {
+            iterator = RTBoundValidator.validate(withoutPurgeableTombstones(iterator, cfs), Stage.PURGED, false);
+
+            iterator = withMetricsRecording(iterator, cfs.metric, startTimeNanos);
+
+            // If we've used a 2ndary index, we know the result already satisfy the primary expression used, so
+            // no point in checking it again.
+            RowFilter filter = (null == searcher) ? rowFilter() : index.getPostIndexQueryFilter(rowFilter());
+
+            /*
+             * TODO: We'll currently do filtering by the rowFilter here because it's convenient. However,
+             * we'll probably want to optimize by pushing it down the layer (like for dropped columns) as it
+             * would be more efficient (the sooner we discard stuff we know we don't care, the less useless
+             * processing we do on it).
+             */
+            iterator = filter.filter(iterator, nowInSec());
+
+            // apply the limits/row counter; this transformation is stopping and would close the iterator as soon
+            // as the count is observed; if that happens in the middle of an open RT, its end bound will not be included.
+            iterator = limits().filter(iterator, nowInSec(), selectsFullPartition());
+
+            // because of the above, we need to append an aritifical end bound if the source iterator was stopped short by a counter.
+            return RTBoundCloser.close(iterator);
+        }
+        catch (RuntimeException | Error e)
+        {
+            iterator.close();
+            throw e;
+        }
+    }
+
+    protected abstract void recordLatency(TableMetrics metric, long latencyNanos);
+
+    public PartitionIterator executeInternal(ReadOrderGroup orderGroup)
+    {
+        return UnfilteredPartitionIterators.filter(executeLocally(orderGroup), nowInSec());
+    }
+
+    public ReadOrderGroup startOrderGroup()
+    {
+        return ReadOrderGroup.forCommand(this);
+    }
+
+    /**
+     * Wraps the provided iterator so that metrics on what is scanned by the command are recorded.
+     * This also log warning/trow TombstoneOverwhelmingException if appropriate.
+     */
+    private UnfilteredPartitionIterator withMetricsRecording(UnfilteredPartitionIterator iter, final TableMetrics metric, final long startTimeNanos)
+    {
+        class MetricRecording extends Transformation<UnfilteredRowIterator>
+        {
+            private final int failureThreshold = DatabaseDescriptor.getTombstoneFailureThreshold();
+            private final int warningThreshold = DatabaseDescriptor.getTombstoneWarnThreshold();
+
+            private final boolean respectTombstoneThresholds = !Schema.isLocalSystemKeyspace(ReadCommand.this.metadata().ksName);
+            private final boolean enforceStrictLiveness = metadata.enforceStrictLiveness();
+
+            private int liveRows = 0;
+            private int tombstones = 0;
+
+            private DecoratedKey currentKey;
+
+            @Override
+            public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator iter)
+            {
+                currentKey = iter.partitionKey();
+                return Transformation.apply(iter, this);
+            }
+
+            @Override
+            public Row applyToStatic(Row row)
+            {
+                return applyToRow(row);
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                if (row.hasLiveData(ReadCommand.this.nowInSec(), enforceStrictLiveness))
+                    ++liveRows;
+
+                for (Cell cell : row.cells())
+                {
+                    if (!cell.isLive(ReadCommand.this.nowInSec()))
+                        countTombstone(row.clustering());
+                }
+                return row;
+            }
+
+            @Override
+            public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+            {
+                countTombstone(marker.clustering());
+                return marker;
+            }
+
+            private void countTombstone(ClusteringPrefix clustering)
+            {
+                ++tombstones;
+                if (tombstones > failureThreshold && respectTombstoneThresholds)
+                {
+                    String query = ReadCommand.this.toCQLString();
+                    Tracing.trace("Scanned over {} tombstones for query {}; query aborted (see tombstone_failure_threshold)", failureThreshold, query);
+                    throw new TombstoneOverwhelmingException(tombstones, query, ReadCommand.this.metadata(), currentKey, clustering);
+                }
+            }
+
+            @Override
+            public void onClose()
+            {
+                recordLatency(metric, System.nanoTime() - startTimeNanos);
+
+                metric.tombstoneScannedHistogram.update(tombstones);
+                metric.liveScannedHistogram.update(liveRows);
+
+                boolean warnTombstones = tombstones > warningThreshold && respectTombstoneThresholds;
+                if (warnTombstones)
+                {
+                    String msg = String.format(
+                            "Read %d live rows and %d tombstone cells for query %1.512s; token %s (see tombstone_warn_threshold)",
+                            liveRows, tombstones, ReadCommand.this.toCQLString(), currentKey.getToken());
+                    ClientWarn.instance.warn(msg);
+                    logger.warn(msg);
+                }
+
+                Tracing.trace("Read {} live and {} tombstone cells{}", liveRows, tombstones, (warnTombstones ? " (see tombstone_warn_threshold)" : ""));
+            }
+        };
+
+        return Transformation.apply(iter, new MetricRecording());
+    }
+
+    /**
+     * Creates a message for this command.
+     */
+    public abstract MessageOut<ReadCommand> createMessage(int version);
+
+    protected abstract void appendCQLWhereClause(StringBuilder sb);
+
+    // Skip purgeable tombstones. We do this because it's safe to do (post-merge of the memtable and sstable at least), it
+    // can save us some bandwith, and avoid making us throw a TombstoneOverwhelmingException for purgeable tombstones (which
+    // are to some extend an artefact of compaction lagging behind and hence counting them is somewhat unintuitive).
+    protected UnfilteredPartitionIterator withoutPurgeableTombstones(UnfilteredPartitionIterator iterator, ColumnFamilyStore cfs)
+    {
+        final boolean isForThrift = iterator.isForThrift();
+        class WithoutPurgeableTombstones extends PurgeFunction
+        {
+            public WithoutPurgeableTombstones()
+            {
+                super(isForThrift,
+                      nowInSec(),
+                      cfs.gcBefore(nowInSec()),
+                      oldestUnrepairedTombstone(),
+                      cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones(),
+                      cfs.metadata.enforceStrictLiveness());
+            }
+
+            protected Predicate<Long> getPurgeEvaluator()
+            {
+                return time -> true;
+            }
+        }
+        return Transformation.apply(iterator, new WithoutPurgeableTombstones());
+    }
+
+    /**
+     * Recreate the CQL string corresponding to this query.
+     * <p>
+     * Note that in general the returned string will not be exactly the original user string, first
+     * because there isn't always a single syntax for a given query,  but also because we don't have
+     * all the information needed (we know the non-PK columns queried but not the PK ones as internally
+     * we query them all). So this shouldn't be relied too strongly, but this should be good enough for
+     * debugging purpose which is what this is for.
+     */
+    public String toCQLString()
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("SELECT ").append(columnFilter());
+        sb.append(" FROM ").append(metadata().ksName).append('.').append(metadata.cfName);
+        appendCQLWhereClause(sb);
+
+        if (limits() != DataLimits.NONE)
+            sb.append(' ').append(limits());
+        return sb.toString();
+    }
+
+    private static class Serializer implements IVersionedSerializer<ReadCommand>
+    {
+        private static int digestFlag(boolean isDigest)
+        {
+            return isDigest ? 0x01 : 0;
+        }
+
+        private static boolean isDigest(int flags)
+        {
+            return (flags & 0x01) != 0;
+        }
+
+        private static int thriftFlag(boolean isForThrift)
+        {
+            return isForThrift ? 0x02 : 0;
+        }
+
+        private static boolean isForThrift(int flags)
+        {
+            return (flags & 0x02) != 0;
+        }
+
+        private static int indexFlag(boolean hasIndex)
+        {
+            return hasIndex ? 0x04 : 0;
+        }
+
+        private static boolean hasIndex(int flags)
+        {
+            return (flags & 0x04) != 0;
+        }
+
+        public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
+        {
+            assert version >= MessagingService.VERSION_30;
+
+            out.writeByte(command.kind.ordinal());
+            out.writeByte(digestFlag(command.isDigestQuery()) | thriftFlag(command.isForThrift()) | indexFlag(null != command.index));
+            if (command.isDigestQuery())
+                out.writeUnsignedVInt(command.digestVersion());
+            CFMetaData.serializer.serialize(command.metadata(), out, version);
+            out.writeInt(command.nowInSec());
+            ColumnFilter.serializer.serialize(command.columnFilter(), out, version);
+            RowFilter.serializer.serialize(command.rowFilter(), out, version);
+            DataLimits.serializer.serialize(command.limits(), out, version);
+            if (null != command.index)
+                IndexMetadata.serializer.serialize(command.index, out, version);
+
+            command.serializeSelection(out, version);
+        }
+
+        public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
+        {
+            assert version >= MessagingService.VERSION_30;
+
+            Kind kind = Kind.values()[in.readByte()];
+            int flags = in.readByte();
+            boolean isDigest = isDigest(flags);
+            boolean isForThrift = isForThrift(flags);
+            boolean hasIndex = hasIndex(flags);
+            int digestVersion = isDigest ? (int)in.readUnsignedVInt() : 0;
+            CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
+            int nowInSec = in.readInt();
+            ColumnFilter columnFilter = ColumnFilter.serializer.deserialize(in, version, metadata);
+            RowFilter rowFilter = RowFilter.serializer.deserialize(in, version, metadata);
+            DataLimits limits = DataLimits.serializer.deserialize(in, version);
+            IndexMetadata index = hasIndex ? deserializeIndexMetadata(in, version, metadata) : null;
+
+            return kind.selectionDeserializer.deserialize(in, version, isDigest, digestVersion, isForThrift, metadata, nowInSec, columnFilter, rowFilter, limits, index);
+        }
+
+        private IndexMetadata deserializeIndexMetadata(DataInputPlus in, int version, CFMetaData cfm) throws IOException
+        {
+            try
+            {
+                return IndexMetadata.serializer.deserialize(in, version, cfm);
+            }
+            catch (UnknownIndexException e)
+            {
+                String message = String.format("Couldn't find a defined index on %s.%s with the id %s. " +
+                                               "If an index was just created, this is likely due to the schema not " +
+                                               "being fully propagated. Local read will proceed without using the " +
+                                               "index. Please wait for schema agreement after index creation.",
+                                               cfm.ksName, cfm.cfName, e.indexId.toString());
+                logger.info(message);
+                return null;
+            }
+        }
+
+        public long serializedSize(ReadCommand command, int version)
+        {
+            assert version >= MessagingService.VERSION_30;
+
+            return 2 // kind + flags
+                 + (command.isDigestQuery() ? TypeSizes.sizeofUnsignedVInt(command.digestVersion()) : 0)
+                 + CFMetaData.serializer.serializedSize(command.metadata(), version)
+                 + TypeSizes.sizeof(command.nowInSec())
+                 + ColumnFilter.serializer.serializedSize(command.columnFilter(), version)
+                 + RowFilter.serializer.serializedSize(command.rowFilter(), version)
+                 + DataLimits.serializer.serializedSize(command.limits(), version)
+                 + command.selectionSerializedSize(version)
+                 + command.indexSerializedSize(version);
+        }
+    }
+
+    private enum LegacyType
     {
         GET_BY_NAMES((byte)1),
         GET_SLICES((byte)2);
 
         public final byte serializedValue;
 
-        private Type(byte b)
+        LegacyType(byte b)
         {
             this.serializedValue = b;
         }
 
-        public static Type fromSerializedValue(byte b)
+        public static LegacyType fromPartitionFilterKind(ClusteringIndexFilter.Kind kind)
+        {
+            return kind == ClusteringIndexFilter.Kind.SLICE
+                   ? GET_SLICES
+                   : GET_BY_NAMES;
+        }
+
+        public static LegacyType fromSerializedValue(byte b)
         {
             return b == 1 ? GET_BY_NAMES : GET_SLICES;
         }
     }
 
-    public static final ReadCommandSerializer serializer = new ReadCommandSerializer();
-
-    public MessageOut<ReadCommand> createMessage()
+    /**
+     * Serializer for pre-3.0 RangeSliceCommands.
+     */
+    private static class LegacyRangeSliceCommandSerializer implements IVersionedSerializer<ReadCommand>
     {
-        return new MessageOut<>(MessagingService.Verb.READ, this, serializer);
-    }
-
-    public final String ksName;
-    public final String cfName;
-    public final ByteBuffer key;
-    public final long timestamp;
-    private boolean isDigestQuery = false;
-    protected final Type commandType;
-
-    protected ReadCommand(String ksName, ByteBuffer key, String cfName, long timestamp, Type cmdType)
-    {
-        this.ksName = ksName;
-        this.key = key;
-        this.cfName = cfName;
-        this.timestamp = timestamp;
-        this.commandType = cmdType;
-    }
-
-    public static ReadCommand create(String ksName, ByteBuffer key, String cfName, long timestamp, IDiskAtomFilter filter)
-    {
-        if (filter instanceof SliceQueryFilter)
-            return new SliceFromReadCommand(ksName, key, cfName, timestamp, (SliceQueryFilter)filter);
-        else
-            return new SliceByNamesReadCommand(ksName, key, cfName, timestamp, (NamesQueryFilter)filter);
-    }
-
-    public boolean isDigestQuery()
-    {
-        return isDigestQuery;
-    }
-
-    public ReadCommand setIsDigestQuery(boolean isDigestQuery)
-    {
-        this.isDigestQuery = isDigestQuery;
-        return this;
-    }
-
-    public String getColumnFamilyName()
-    {
-        return cfName;
-    }
-
-    public abstract ReadCommand copy();
-
-    public abstract Row getRow(Keyspace keyspace);
-
-    public abstract IDiskAtomFilter filter();
-
-    public String getKeyspace()
-    {
-        return ksName;
-    }
-
-    // maybeGenerateRetryCommand is used to generate a retry for short reads
-    public ReadCommand maybeGenerateRetryCommand(RowDataResolver resolver, Row row)
-    {
-        return null;
-    }
-
-    // maybeTrim removes columns from a response that is too long
-    public Row maybeTrim(Row row)
-    {
-        return row;
-    }
-
-    public long getTimeout()
-    {
-        return DatabaseDescriptor.getReadRpcTimeout();
-    }
-}
-
-class ReadCommandSerializer implements IVersionedSerializer<ReadCommand>
-{
-    public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
-    {
-        out.writeByte(command.commandType.serializedValue);
-        switch (command.commandType)
+        public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
         {
-            case GET_BY_NAMES:
-                SliceByNamesReadCommand.serializer.serialize(command, out, version);
-                break;
-            case GET_SLICES:
-                SliceFromReadCommand.serializer.serialize(command, out, version);
-                break;
-            default:
-                throw new AssertionError();
+            assert version < MessagingService.VERSION_30;
+
+            PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
+            assert !rangeCommand.dataRange().isPaging();
+
+            // convert pre-3.0 incompatible names filters to slice filters
+            rangeCommand = maybeConvertNamesToSlice(rangeCommand);
+
+            CFMetaData metadata = rangeCommand.metadata();
+
+            out.writeUTF(metadata.ksName);
+            out.writeUTF(metadata.cfName);
+            out.writeLong(rangeCommand.nowInSec() * 1000L);  // convert from seconds to millis
+
+            // begin DiskAtomFilterSerializer.serialize()
+            if (rangeCommand.isNamesQuery())
+            {
+                out.writeByte(1);  // 0 for slices, 1 for names
+                ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter;
+                LegacyReadCommandSerializer.serializeNamesFilter(rangeCommand, filter, out);
+            }
+            else
+            {
+                out.writeByte(0);  // 0 for slices, 1 for names
+
+                // slice filter serialization
+                ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
+
+                boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
+                LegacyReadCommandSerializer.serializeSlices(out, filter.requestedSlices(), filter.isReversed(), makeStaticSlice, metadata);
+
+                out.writeBoolean(filter.isReversed());
+
+                // limit
+                DataLimits limits = rangeCommand.limits();
+                if (limits.isDistinct())
+                    out.writeInt(1);
+                else
+                    out.writeInt(LegacyReadCommandSerializer.updateLimitForQuery(rangeCommand.limits().count(), filter.requestedSlices()));
+
+                int compositesToGroup;
+                boolean selectsStatics = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
+                if (limits.kind() == DataLimits.Kind.THRIFT_LIMIT)
+                    compositesToGroup = -1;
+                else if (limits.isDistinct() && !selectsStatics)
+                    compositesToGroup = -2;  // for DISTINCT queries (CASSANDRA-8490)
+                else
+                    compositesToGroup = metadata.isDense() ? -1 : metadata.clusteringColumns().size();
+
+                out.writeInt(compositesToGroup);
+            }
+
+            serializeRowFilter(out, rangeCommand.rowFilter());
+            AbstractBounds.rowPositionSerializer.serialize(rangeCommand.dataRange().keyRange(), out, version);
+
+            // maxResults
+            out.writeInt(rangeCommand.limits().count());
+
+            // countCQL3Rows
+            if (rangeCommand.isForThrift() || rangeCommand.limits().perPartitionCount() == 1)  // if for Thrift or DISTINCT
+                out.writeBoolean(false);
+            else
+                out.writeBoolean(true);
+
+            // isPaging
+            out.writeBoolean(false);
+        }
+
+        public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
+        {
+            assert version < MessagingService.VERSION_30;
+
+            String keyspace = in.readUTF();
+            String columnFamily = in.readUTF();
+
+            CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
+            if (metadata == null)
+            {
+                String message = String.format("Got legacy range command for nonexistent table %s.%s.", keyspace, columnFamily);
+                throw new UnknownColumnFamilyException(message, null);
+            }
+
+            int nowInSec = (int) (in.readLong() / 1000);  // convert from millis to seconds
+
+            ClusteringIndexFilter filter;
+            ColumnFilter selection;
+            int compositesToGroup = 0;
+            int perPartitionLimit = -1;
+            byte readType = in.readByte();  // 0 for slices, 1 for names
+            if (readType == 1)
+            {
+                Pair<ColumnFilter, ClusteringIndexNamesFilter> selectionAndFilter = LegacyReadCommandSerializer.deserializeNamesSelectionAndFilter(in, metadata);
+                selection = selectionAndFilter.left;
+                filter = selectionAndFilter.right;
+            }
+            else
+            {
+                Pair<ClusteringIndexSliceFilter, Boolean> p = LegacyReadCommandSerializer.deserializeSlicePartitionFilter(in, metadata);
+                filter = p.left;
+                perPartitionLimit = in.readInt();
+                compositesToGroup = in.readInt();
+                selection = getColumnSelectionForSlice(p.right, compositesToGroup, metadata);
+            }
+
+            RowFilter rowFilter = deserializeRowFilter(in, metadata);
+
+            AbstractBounds<PartitionPosition> keyRange = AbstractBounds.rowPositionSerializer.deserialize(in, metadata.partitioner, version);
+            int maxResults = in.readInt();
+
+            boolean countCQL3Rows = in.readBoolean();  // countCQL3Rows (not needed)
+            in.readBoolean();  // isPaging (not needed)
+
+            boolean selectsStatics = (!selection.fetchedColumns().statics.isEmpty() || filter.selects(Clustering.STATIC_CLUSTERING));
+            // We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
+            // we can easily detect the case because compositeToGroup is -2 and that's the only case it can be that. The latter one is slightly less
+            // direct, but we know that on 2.1/2.2 queries, DISTINCT queries are the only CQL queries that have countCQL3Rows to false so we use
+            // that fact.
+            boolean isDistinct = compositesToGroup == -2 || (compositesToGroup != -1 && !countCQL3Rows);
+            DataLimits limits;
+            if (isDistinct)
+                limits = DataLimits.distinctLimits(maxResults);
+            else if (compositesToGroup == -1)
+                limits = DataLimits.thriftLimits(maxResults, perPartitionLimit);
+            else if (metadata.isStaticCompactTable())
+                limits = DataLimits.legacyCompactStaticCqlLimits(maxResults);
+            else
+                limits = DataLimits.cqlLimits(maxResults);
+
+            return PartitionRangeReadCommand.create(true, metadata, nowInSec, selection, rowFilter, limits, new DataRange(keyRange, filter));
+        }
+
+        static void serializeRowFilter(DataOutputPlus out, RowFilter rowFilter) throws IOException
+        {
+            ArrayList<RowFilter.Expression> indexExpressions = Lists.newArrayList(rowFilter.iterator());
+            out.writeInt(indexExpressions.size());
+            for (RowFilter.Expression expression : indexExpressions)
+            {
+                ByteBufferUtil.writeWithShortLength(expression.column().name.bytes, out);
+                expression.operator().writeTo(out);
+                ByteBufferUtil.writeWithShortLength(expression.getIndexValue(), out);
+            }
+        }
+
+        static RowFilter deserializeRowFilter(DataInputPlus in, CFMetaData metadata) throws IOException
+        {
+            int numRowFilters = in.readInt();
+            if (numRowFilters == 0)
+                return RowFilter.NONE;
+
+            RowFilter rowFilter = RowFilter.create(numRowFilters);
+            for (int i = 0; i < numRowFilters; i++)
+            {
+                ByteBuffer columnName = ByteBufferUtil.readWithShortLength(in);
+                ColumnDefinition column = metadata.getColumnDefinition(columnName);
+                Operator op = Operator.readFrom(in);
+                ByteBuffer indexValue = ByteBufferUtil.readWithShortLength(in);
+                rowFilter.add(column, op, indexValue);
+            }
+            return rowFilter;
+        }
+
+        static long serializedRowFilterSize(RowFilter rowFilter)
+        {
+            long size = TypeSizes.sizeof(0);  // rowFilterCount
+            for (RowFilter.Expression expression : rowFilter)
+            {
+                size += ByteBufferUtil.serializedSizeWithShortLength(expression.column().name.bytes);
+                size += TypeSizes.sizeof(0);  // operator int value
+                size += ByteBufferUtil.serializedSizeWithShortLength(expression.getIndexValue());
+            }
+            return size;
+        }
+
+        public long serializedSize(ReadCommand command, int version)
+        {
+            assert version < MessagingService.VERSION_30;
+            assert command.kind == Kind.PARTITION_RANGE;
+
+            PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
+            rangeCommand = maybeConvertNamesToSlice(rangeCommand);
+            CFMetaData metadata = rangeCommand.metadata();
+
+            long size = TypeSizes.sizeof(metadata.ksName);
+            size += TypeSizes.sizeof(metadata.cfName);
+            size += TypeSizes.sizeof((long) rangeCommand.nowInSec());
+
+            size += 1;  // single byte flag: 0 for slices, 1 for names
+            if (rangeCommand.isNamesQuery())
+            {
+                PartitionColumns columns = rangeCommand.columnFilter().fetchedColumns();
+                ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter;
+                size += LegacyReadCommandSerializer.serializedNamesFilterSize(filter, metadata, columns);
+            }
+            else
+            {
+                ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
+                boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
+                size += LegacyReadCommandSerializer.serializedSlicesSize(filter.requestedSlices(), makeStaticSlice, metadata);
+                size += TypeSizes.sizeof(filter.isReversed());
+                size += TypeSizes.sizeof(rangeCommand.limits().perPartitionCount());
+                size += TypeSizes.sizeof(0); // compositesToGroup
+            }
+
+            if (rangeCommand.rowFilter().equals(RowFilter.NONE))
+            {
+                size += TypeSizes.sizeof(0);
+            }
+            else
+            {
+                ArrayList<RowFilter.Expression> indexExpressions = Lists.newArrayList(rangeCommand.rowFilter().iterator());
+                size += TypeSizes.sizeof(indexExpressions.size());
+                for (RowFilter.Expression expression : indexExpressions)
+                {
+                    size += ByteBufferUtil.serializedSizeWithShortLength(expression.column().name.bytes);
+                    size += TypeSizes.sizeof(expression.operator().ordinal());
+                    size += ByteBufferUtil.serializedSizeWithShortLength(expression.getIndexValue());
+                }
+            }
+
+            size += AbstractBounds.rowPositionSerializer.serializedSize(rangeCommand.dataRange().keyRange(), version);
+            size += TypeSizes.sizeof(rangeCommand.limits().count());
+            size += TypeSizes.sizeof(!rangeCommand.isForThrift());
+            return size + TypeSizes.sizeof(rangeCommand.dataRange().isPaging());
+        }
+
+        static PartitionRangeReadCommand maybeConvertNamesToSlice(PartitionRangeReadCommand command)
+        {
+            if (!command.dataRange().isNamesQuery())
+                return command;
+
+            CFMetaData metadata = command.metadata();
+            if (!LegacyReadCommandSerializer.shouldConvertNamesToSlice(metadata, command.columnFilter().fetchedColumns()))
+                return command;
+
+            ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter) command.dataRange().clusteringIndexFilter;
+            ClusteringIndexSliceFilter sliceFilter = LegacyReadCommandSerializer.convertNamesFilterToSliceFilter(filter, metadata);
+            DataRange newRange = new DataRange(command.dataRange().keyRange(), sliceFilter);
+
+            return command.withUpdatedDataRange(newRange);
+        }
+
+        static ColumnFilter getColumnSelectionForSlice(boolean selectsStatics, int compositesToGroup, CFMetaData metadata)
+        {
+            // A value of -2 indicates this is a DISTINCT query that doesn't select static columns, only partition keys.
+            // In that case, we'll basically be querying the first row of the partition, but we must make sure we include
+            // all columns so we get at least one cell if there is a live row as it would confuse pre-3.0 nodes otherwise.
+            if (compositesToGroup == -2)
+                return ColumnFilter.all(metadata);
+
+            // if a slice query from a pre-3.0 node doesn't cover statics, we shouldn't select them at all
+            PartitionColumns columns = selectsStatics
+                                     ? metadata.partitionColumns()
+                                     : metadata.partitionColumns().withoutStatics();
+            return ColumnFilter.selectionBuilder().addAll(columns).build();
         }
     }
 
-    public ReadCommand deserialize(DataInput in, int version) throws IOException
+    /**
+     * Serializer for pre-3.0 PagedRangeCommands.
+     */
+    private static class LegacyPagedRangeCommandSerializer implements IVersionedSerializer<ReadCommand>
     {
-        ReadCommand.Type msgType = ReadCommand.Type.fromSerializedValue(in.readByte());
-        switch (msgType)
+        public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
         {
-            case GET_BY_NAMES:
-                return SliceByNamesReadCommand.serializer.deserialize(in, version);
-            case GET_SLICES:
-                return SliceFromReadCommand.serializer.deserialize(in, version);
-            default:
-                throw new AssertionError();
+            assert version < MessagingService.VERSION_30;
+
+            PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
+            assert rangeCommand.dataRange().isPaging();
+
+            CFMetaData metadata = rangeCommand.metadata();
+
+            out.writeUTF(metadata.ksName);
+            out.writeUTF(metadata.cfName);
+            out.writeLong(rangeCommand.nowInSec() * 1000L);  // convert from seconds to millis
+
+            AbstractBounds.rowPositionSerializer.serialize(rangeCommand.dataRange().keyRange(), out, version);
+
+            // pre-3.0 nodes don't accept names filters for paged range commands
+            ClusteringIndexSliceFilter filter;
+            if (rangeCommand.dataRange().clusteringIndexFilter.kind() == ClusteringIndexFilter.Kind.NAMES)
+                filter = LegacyReadCommandSerializer.convertNamesFilterToSliceFilter((ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter, metadata);
+            else
+                filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
+
+            // slice filter
+            boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
+            LegacyReadCommandSerializer.serializeSlices(out, filter.requestedSlices(), filter.isReversed(), makeStaticSlice, metadata);
+            out.writeBoolean(filter.isReversed());
+
+            // slice filter's count
+            DataLimits.Kind kind = rangeCommand.limits().kind();
+            boolean isDistinct = (kind == DataLimits.Kind.CQL_LIMIT || kind == DataLimits.Kind.CQL_PAGING_LIMIT) && rangeCommand.limits().perPartitionCount() == 1;
+            if (isDistinct)
+                out.writeInt(1);
+            else
+                out.writeInt(LegacyReadCommandSerializer.updateLimitForQuery(rangeCommand.limits().perPartitionCount(), filter.requestedSlices()));
+
+            // compositesToGroup
+            boolean selectsStatics = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() || filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
+            int compositesToGroup;
+            if (kind == DataLimits.Kind.THRIFT_LIMIT)
+                compositesToGroup = -1;
+            else if (isDistinct && !selectsStatics)
+                compositesToGroup = -2;  // for DISTINCT queries (CASSANDRA-8490)
+            else
+                compositesToGroup = metadata.isDense() ? -1 : metadata.clusteringColumns().size();
+
+            out.writeInt(compositesToGroup);
+
+            // command-level "start" and "stop" composites.  The start is the last-returned cell name if there is one,
+            // otherwise it's the same as the slice filter's start.  The stop appears to always be the same as the
+            // slice filter's stop.
+            DataRange.Paging pagingRange = (DataRange.Paging) rangeCommand.dataRange();
+            Clustering lastReturned = pagingRange.getLastReturned();
+            Slice.Bound newStart = Slice.Bound.inclusiveStartOf(lastReturned);
+            Slice lastSlice = filter.requestedSlices().get(filter.requestedSlices().size() - 1);
+            ByteBufferUtil.writeWithShortLength(LegacyLayout.encodeBound(metadata, newStart, true), out);
+            ByteBufferUtil.writeWithShortLength(LegacyLayout.encodeClustering(metadata, lastSlice.end().clustering()), out);
+
+            LegacyRangeSliceCommandSerializer.serializeRowFilter(out, rangeCommand.rowFilter());
+
+            // command-level limit
+            // Pre-3.0 we would always request one more row than we actually needed and the command-level "start" would
+            // be the last-returned cell name, so the response would always include it.
+            int maxResults = rangeCommand.limits().count() + 1;
+            out.writeInt(maxResults);
+
+            // countCQL3Rows
+            if (rangeCommand.isForThrift() || rangeCommand.limits().perPartitionCount() == 1)  // for Thrift or DISTINCT
+                out.writeBoolean(false);
+            else
+                out.writeBoolean(true);
+        }
+
+        public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
+        {
+            assert version < MessagingService.VERSION_30;
+
+            String keyspace = in.readUTF();
+            String columnFamily = in.readUTF();
+
+            CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
+            if (metadata == null)
+            {
+                String message = String.format("Got legacy paged range command for nonexistent table %s.%s.", keyspace, columnFamily);
+                throw new UnknownColumnFamilyException(message, null);
+            }
+
+            int nowInSec = (int) (in.readLong() / 1000);  // convert from millis to seconds
+            AbstractBounds<PartitionPosition> keyRange = AbstractBounds.rowPositionSerializer.deserialize(in, metadata.partitioner, version);
+
+            Pair<ClusteringIndexSliceFilter, Boolean> p = LegacyReadCommandSerializer.deserializeSlicePartitionFilter(in, metadata);
+            ClusteringIndexSliceFilter filter = p.left;
+            boolean selectsStatics = p.right;
+
+            int perPartitionLimit = in.readInt();
+            int compositesToGroup = in.readInt();
+
+            // command-level Composite "start" and "stop"
+            LegacyLayout.LegacyBound startBound = LegacyLayout.decodeSliceBound(metadata, ByteBufferUtil.readWithShortLength(in), true);
+
+            ByteBufferUtil.readWithShortLength(in);  // the composite "stop", which isn't actually needed
+
+            ColumnFilter selection = LegacyRangeSliceCommandSerializer.getColumnSelectionForSlice(selectsStatics, compositesToGroup, metadata);
+
+            RowFilter rowFilter = LegacyRangeSliceCommandSerializer.deserializeRowFilter(in, metadata);
+            int maxResults = in.readInt();
+            boolean countCQL3Rows = in.readBoolean();
+
+            // We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
+            // we can easily detect the case because compositeToGroup is -2 and that's the only case it can be that. The latter one is slightly less
+            // direct, but we know that on 2.1/2.2 queries, DISTINCT queries are the only CQL queries that have countCQL3Rows to false so we use
+            // that fact.
+            boolean isDistinct = compositesToGroup == -2 || (compositesToGroup != -1 && !countCQL3Rows);
+            DataLimits limits;
+            if (isDistinct)
+                limits = DataLimits.distinctLimits(maxResults);
+            else
+                limits = DataLimits.cqlLimits(maxResults);
+
+            limits = limits.forPaging(maxResults);
+
+            // The pagedRangeCommand is used in pre-3.0 for both the first page and the following ones. On the first page, the startBound will be
+            // the start of the overall slice and will not be a proper Clustering. So detect that case and just return a non-paging DataRange, which
+            // is what 3.0 does.
+            DataRange dataRange = new DataRange(keyRange, filter);
+            Slices slices = filter.requestedSlices();
+            if (!isDistinct && startBound != LegacyLayout.LegacyBound.BOTTOM && !startBound.bound.equals(slices.get(0).start()))
+            {
+                // pre-3.0 nodes normally expect pages to include the last cell from the previous page, but they handle it
+                // missing without any problems, so we can safely always set "inclusive" to false in the data range
+                dataRange = dataRange.forPaging(keyRange, metadata.comparator, startBound.getAsClustering(metadata), false);
+            }
+            return PartitionRangeReadCommand.create(true, metadata, nowInSec, selection, rowFilter, limits, dataRange);
+        }
+
+        public long serializedSize(ReadCommand command, int version)
+        {
+            assert version < MessagingService.VERSION_30;
+            assert command.kind == Kind.PARTITION_RANGE;
+
+            PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
+            CFMetaData metadata = rangeCommand.metadata();
+            assert rangeCommand.dataRange().isPaging();
+
+            long size = TypeSizes.sizeof(metadata.ksName);
+            size += TypeSizes.sizeof(metadata.cfName);
+            size += TypeSizes.sizeof((long) rangeCommand.nowInSec());
+
+            size += AbstractBounds.rowPositionSerializer.serializedSize(rangeCommand.dataRange().keyRange(), version);
+
+            // pre-3.0 nodes only accept slice filters for paged range commands
+            ClusteringIndexSliceFilter filter;
+            if (rangeCommand.dataRange().clusteringIndexFilter.kind() == ClusteringIndexFilter.Kind.NAMES)
+                filter = LegacyReadCommandSerializer.convertNamesFilterToSliceFilter((ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter, metadata);
+            else
+                filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
+
+            // slice filter
+            boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
+            size += LegacyReadCommandSerializer.serializedSlicesSize(filter.requestedSlices(), makeStaticSlice, metadata);
+            size += TypeSizes.sizeof(filter.isReversed());
+
+            // slice filter's count
+            size += TypeSizes.sizeof(rangeCommand.limits().perPartitionCount());
+
+            // compositesToGroup
+            size += TypeSizes.sizeof(0);
+
+            // command-level Composite "start" and "stop"
+            DataRange.Paging pagingRange = (DataRange.Paging) rangeCommand.dataRange();
+            Clustering lastReturned = pagingRange.getLastReturned();
+            Slice lastSlice = filter.requestedSlices().get(filter.requestedSlices().size() - 1);
+            size += ByteBufferUtil.serializedSizeWithShortLength(LegacyLayout.encodeClustering(metadata, lastReturned));
+            size += ByteBufferUtil.serializedSizeWithShortLength(LegacyLayout.encodeClustering(metadata, lastSlice.end().clustering()));
+
+            size += LegacyRangeSliceCommandSerializer.serializedRowFilterSize(rangeCommand.rowFilter());
+
+            // command-level limit
+            size += TypeSizes.sizeof(rangeCommand.limits().count());
+
+            // countCQL3Rows
+            return size + TypeSizes.sizeof(true);
         }
     }
 
-    public long serializedSize(ReadCommand command, int version)
+    /**
+     * Serializer for pre-3.0 ReadCommands.
+     */
+    static class LegacyReadCommandSerializer implements IVersionedSerializer<ReadCommand>
     {
-        switch (command.commandType)
+        public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
         {
-            case GET_BY_NAMES:
-                return 1 + SliceByNamesReadCommand.serializer.serializedSize(command, version);
-            case GET_SLICES:
-                return 1 + SliceFromReadCommand.serializer.serializedSize(command, version);
-            default:
-                throw new AssertionError();
+            assert version < MessagingService.VERSION_30;
+            assert command.kind == Kind.SINGLE_PARTITION;
+
+            SinglePartitionReadCommand singleReadCommand = (SinglePartitionReadCommand) command;
+            singleReadCommand = maybeConvertNamesToSlice(singleReadCommand);
+
+            CFMetaData metadata = singleReadCommand.metadata();
+
+            out.writeByte(LegacyType.fromPartitionFilterKind(singleReadCommand.clusteringIndexFilter().kind()).serializedValue);
+
+            out.writeBoolean(singleReadCommand.isDigestQuery());
+            out.writeUTF(metadata.ksName);
+            ByteBufferUtil.writeWithShortLength(singleReadCommand.partitionKey().getKey(), out);
+            out.writeUTF(metadata.cfName);
+            out.writeLong(singleReadCommand.nowInSec() * 1000L);  // convert from seconds to millis
+
+            if (singleReadCommand.clusteringIndexFilter().kind() == ClusteringIndexFilter.Kind.SLICE)
+                serializeSliceCommand(singleReadCommand, out);
+            else
+                serializeNamesCommand(singleReadCommand, out);
+        }
+
+        public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
+        {
+            assert version < MessagingService.VERSION_30;
+            LegacyType msgType = LegacyType.fromSerializedValue(in.readByte());
+
+            boolean isDigest = in.readBoolean();
+            String keyspaceName = in.readUTF();
+            ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
+            String cfName = in.readUTF();
+            long nowInMillis = in.readLong();
+            int nowInSeconds = (int) (nowInMillis / 1000);  // convert from millis to seconds
+            CFMetaData metadata = Schema.instance.getCFMetaData(keyspaceName, cfName);
+            DecoratedKey dk = metadata.partitioner.decorateKey(key);
+
+            switch (msgType)
+            {
+                case GET_BY_NAMES:
+                    return deserializeNamesCommand(in, isDigest, metadata, dk, nowInSeconds, version);
+                case GET_SLICES:
+                    return deserializeSliceCommand(in, isDigest, metadata, dk, nowInSeconds, version);
+                default:
+                    throw new AssertionError();
+            }
+        }
+
+        public long serializedSize(ReadCommand command, int version)
+        {
+            assert version < MessagingService.VERSION_30;
+            assert command.kind == Kind.SINGLE_PARTITION;
+            SinglePartitionReadCommand singleReadCommand = (SinglePartitionReadCommand) command;
+            singleReadCommand = maybeConvertNamesToSlice(singleReadCommand);
+
+            int keySize = singleReadCommand.partitionKey().getKey().remaining();
+
+            CFMetaData metadata = singleReadCommand.metadata();
+
+            long size = 1;  // message type (single byte)
+            size += TypeSizes.sizeof(command.isDigestQuery());
+            size += TypeSizes.sizeof(metadata.ksName);
+            size += TypeSizes.sizeof((short) keySize) + keySize;
+            size += TypeSizes.sizeof((long) command.nowInSec());
+
+            if (singleReadCommand.clusteringIndexFilter().kind() == ClusteringIndexFilter.Kind.SLICE)
+                return size + serializedSliceCommandSize(singleReadCommand);
+            else
+                return size + serializedNamesCommandSize(singleReadCommand);
+        }
+
+        private void serializeNamesCommand(SinglePartitionReadCommand command, DataOutputPlus out) throws IOException
+        {
+            serializeNamesFilter(command, (ClusteringIndexNamesFilter)command.clusteringIndexFilter(), out);
+        }
+
+        private static void serializeNamesFilter(ReadCommand command, ClusteringIndexNamesFilter filter, DataOutputPlus out) throws IOException
+        {
+            PartitionColumns columns = command.columnFilter().fetchedColumns();
+            CFMetaData metadata = command.metadata();
+            SortedSet<Clustering> requestedRows = filter.requestedRows();
+
+            if (requestedRows.isEmpty())
+            {
+                // only static columns are requested
+                out.writeInt(columns.size());
+                for (ColumnDefinition column : columns)
+                    ByteBufferUtil.writeWithShortLength(column.name.bytes, out);
+            }
+            else
+            {
+                out.writeInt(requestedRows.size() * columns.size());
+                for (Clustering clustering : requestedRows)
+                {
+                    for (ColumnDefinition column : columns)
+                        ByteBufferUtil.writeWithShortLength(LegacyLayout.encodeCellName(metadata, clustering, column.name.bytes, null), out);
+                }
+            }
+
+            // countCql3Rows should be true if it's not for Thrift or a DISTINCT query
+            if (command.isForThrift() || (command.limits().kind() == DataLimits.Kind.CQL_LIMIT && command.limits().perPartitionCount() == 1))
+                out.writeBoolean(false);  // it's compact and not a DISTINCT query
+            else
+                out.writeBoolean(true);
+        }
+
+        static long serializedNamesFilterSize(ClusteringIndexNamesFilter filter, CFMetaData metadata, PartitionColumns fetchedColumns)
+        {
+            SortedSet<Clustering> requestedRows = filter.requestedRows();
+
+            long size = 0;
+            if (requestedRows.isEmpty())
+            {
+                // only static columns are requested
+                size += TypeSizes.sizeof(fetchedColumns.size());
+                for (ColumnDefinition column : fetchedColumns)
+                    size += ByteBufferUtil.serializedSizeWithShortLength(column.name.bytes);
+            }
+            else
+            {
+                size += TypeSizes.sizeof(requestedRows.size() * fetchedColumns.size());
+                for (Clustering clustering : requestedRows)
+                {
+                    for (ColumnDefinition column : fetchedColumns)
+                        size += ByteBufferUtil.serializedSizeWithShortLength(LegacyLayout.encodeCellName(metadata, clustering, column.name.bytes, null));
+                }
+            }
+
+            return size + TypeSizes.sizeof(true);  // countCql3Rows
+        }
+
+        private SinglePartitionReadCommand deserializeNamesCommand(DataInputPlus in, boolean isDigest, CFMetaData metadata, DecoratedKey key, int nowInSeconds, int version) throws IOException
+        {
+            Pair<ColumnFilter, ClusteringIndexNamesFilter> selectionAndFilter = deserializeNamesSelectionAndFilter(in, metadata);
+
+            return SinglePartitionReadCommand.legacyNamesCommand(isDigest, version, metadata, nowInSeconds, selectionAndFilter.left, key, selectionAndFilter.right);
+        }
+
+        static Pair<ColumnFilter, ClusteringIndexNamesFilter> deserializeNamesSelectionAndFilter(DataInputPlus in, CFMetaData metadata) throws IOException
+        {
+            int numCellNames = in.readInt();
+
+            // The names filter could include either a) static columns or b) normal columns with the clustering columns
+            // fully specified.  We need to handle those cases differently in 3.0.
+            NavigableSet<Clustering> clusterings = new TreeSet<>(metadata.comparator);
+
+            ColumnFilter.Builder selectionBuilder = ColumnFilter.selectionBuilder();
+            for (int i = 0; i < numCellNames; i++)
+            {
+                ByteBuffer buffer = ByteBufferUtil.readWithShortLength(in);
+                LegacyLayout.LegacyCellName cellName;
+                try
+                {
+                    cellName = LegacyLayout.decodeCellName(metadata, buffer);
+                }
+                catch (UnknownColumnException exc)
+                {
+                    // TODO this probably needs a new exception class that shares a parent with UnknownColumnFamilyException
+                    throw new UnknownColumnFamilyException(
+                            "Received legacy range read command with names filter for unrecognized column name. " +
+                                    "Fill name in filter (hex): " + ByteBufferUtil.bytesToHex(buffer), metadata.cfId);
+                }
+
+                // If we're querying for a static column, we may also need to read it
+                // as if it were a thrift dynamic column (because the column metadata,
+                // which makes it a static column in 3.0+, may have been added *after*
+                // some values were written). Note that all cql queries on non-compact
+                // tables used slice & not name filters prior to 3.0 so this path is
+                // not taken for non-compact tables. It is theoretically possible to
+                // get here via thrift, hence the check on metadata.isStaticCompactTable.
+                // See CASSANDRA-11087.
+                if (metadata.isStaticCompactTable() && cellName.clustering.equals(Clustering.STATIC_CLUSTERING))
+                {
+                    clusterings.add(new Clustering(cellName.column.name.bytes));
+                    selectionBuilder.add(metadata.compactValueColumn());
+                }
+                else
+                {
+                    clusterings.add(cellName.clustering);
+                }
+
+                selectionBuilder.add(cellName.column);
+            }
+
+            // for compact storage tables without clustering keys, the column holding the selected value is named
+            // 'value' internally we add it to the selection here to prevent errors due to unexpected column names
+            // when serializing the initial local data response
+            if (metadata.isStaticCompactTable() && clusterings.isEmpty())
+                selectionBuilder.addAll(metadata.partitionColumns());
+
+            in.readBoolean();  // countCql3Rows
+
+            // clusterings cannot include STATIC_CLUSTERING, so if the names filter is for static columns, clusterings
+            // will be empty.  However, by requesting the static columns in our ColumnFilter, this will still work.
+            ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings, false);
+            return Pair.create(selectionBuilder.build(), filter);
+        }
+
+        private long serializedNamesCommandSize(SinglePartitionReadCommand command)
+        {
+            ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter)command.clusteringIndexFilter();
+            PartitionColumns columns = command.columnFilter().fetchedColumns();
+            return serializedNamesFilterSize(filter, command.metadata(), columns);
+        }
+
+        private void serializeSliceCommand(SinglePartitionReadCommand command, DataOutputPlus out) throws IOException
+        {
+            CFMetaData metadata = command.metadata();
+            ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter)command.clusteringIndexFilter();
+
+            Slices slices = filter.requestedSlices();
+            boolean makeStaticSlice = !command.columnFilter().fetchedColumns().statics.isEmpty() && !slices.selects(Clustering.STATIC_CLUSTERING);
+            serializeSlices(out, slices, filter.isReversed(), makeStaticSlice, metadata);
+
+            out.writeBoolean(filter.isReversed());
+
+            boolean selectsStatics = !command.columnFilter().fetchedColumns().statics.isEmpty() || slices.selects(Clustering.STATIC_CLUSTERING);
+            DataLimits limits = command.limits();
+            if (limits.isDistinct())
+                out.writeInt(1);  // the limit is always 1 for DISTINCT queries
+            else
+                out.writeInt(updateLimitForQuery(command.limits().count(), filter.requestedSlices()));
+
+            int compositesToGroup;
+            if (limits.kind() == DataLimits.Kind.THRIFT_LIMIT || metadata.isDense())
+                compositesToGroup = -1;
+            else if (limits.isDistinct() && !selectsStatics)
+                compositesToGroup = -2;  // for DISTINCT queries (CASSANDRA-8490)
+            else
+                compositesToGroup = metadata.clusteringColumns().size();
+
+            out.writeInt(compositesToGroup);
+        }
+
+        private SinglePartitionReadCommand deserializeSliceCommand(DataInputPlus in, boolean isDigest, CFMetaData metadata, DecoratedKey key, int nowInSeconds, int version) throws IOException
+        {
+            Pair<ClusteringIndexSliceFilter, Boolean> p = deserializeSlicePartitionFilter(in, metadata);
+            ClusteringIndexSliceFilter filter = p.left;
+            boolean selectsStatics = p.right;
+            int count = in.readInt();
+            int compositesToGroup = in.readInt();
+
+            // if a slice query from a pre-3.0 node doesn't cover statics, we shouldn't select them at all
+            ColumnFilter columnFilter = LegacyRangeSliceCommandSerializer.getColumnSelectionForSlice(selectsStatics, compositesToGroup, metadata);
+
+            // We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
+            // we can easily detect the case because compositeToGroup is -2 and that's the only case it can be that. The latter is probablematic
+            // however as we have no way to distinguish it from a normal select with a limit of 1 (and this, contrarily to the range query case
+            // were the countCQL3Rows boolean allows us to decide).
+            // So we consider this case not distinct here. This is ok because even if it is a distinct (with static), the count will be 1 and
+            // we'll still just query one row (a distinct DataLimits currently behave exactly like a CQL limit with a count of 1). The only
+            // drawback is that we'll send back the first row entirely while a 2.1/2.2 node would return only the first cell in that same
+            // situation. This isn't a problem for 2.1/2.2 code however (it would be for a range query, as it would throw off the count for
+            // reasons similar to CASSANDRA-10762, but it's ok for single partition queries).
+            // We do _not_ want to do the reverse however and consider a 'SELECT * FROM foo LIMIT 1' as a DISTINCT query as that would make
+            // us only return the 1st cell rather then 1st row.
+            DataLimits limits;
+            if (compositesToGroup == -2)
+                limits = DataLimits.distinctLimits(count);  // See CASSANDRA-8490 for the explanation of this value
+            else if (compositesToGroup == -1)
+                limits = DataLimits.thriftLimits(1, count);
+            else
+                limits = DataLimits.cqlLimits(count);
+
+            return SinglePartitionReadCommand.legacySliceCommand(isDigest, version, metadata, nowInSeconds, columnFilter, limits, key, filter);
+        }
+
+        private long serializedSliceCommandSize(SinglePartitionReadCommand command)
+        {
+            CFMetaData metadata = command.metadata();
+            ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter)command.clusteringIndexFilter();
+
+            Slices slices = filter.requestedSlices();
+            boolean makeStaticSlice = !command.columnFilter().fetchedColumns().statics.isEmpty() && !slices.selects(Clustering.STATIC_CLUSTERING);
+
+            long size = serializedSlicesSize(slices, makeStaticSlice, metadata);
+            size += TypeSizes.sizeof(command.clusteringIndexFilter().isReversed());
+            size += TypeSizes.sizeof(command.limits().count());
+            return size + TypeSizes.sizeof(0);  // compositesToGroup
+        }
+
+        static void serializeSlices(DataOutputPlus out, Slices slices, boolean isReversed, boolean makeStaticSlice, CFMetaData metadata) throws IOException
+        {
+            out.writeInt(slices.size() + (makeStaticSlice ? 1 : 0));
+
+            // In 3.0 we always store the slices in normal comparator order.  Pre-3.0 nodes expect the slices to
+            // be in reversed order if the query is reversed, so we handle that here.
+            if (isReversed)
+            {
+                for (int i = slices.size() - 1; i >= 0; i--)
+                    serializeSlice(out, slices.get(i), true, metadata);
+                if (makeStaticSlice)
+                    serializeStaticSlice(out, true, metadata);
+            }
+            else
+            {
+                if (makeStaticSlice)
+                    serializeStaticSlice(out, false, metadata);
+                for (Slice slice : slices)
+                    serializeSlice(out, slice, false, metadata);
+            }
+        }
+
+        static long serializedSlicesSize(Slices slices, boolean makeStaticSlice, CFMetaData metadata)
+        {
+            long size = TypeSizes.sizeof(slices.size());
+
+            for (Slice slice : slices)
+            {
+                ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, slice.start(), true);
+                size += ByteBufferUtil.serializedSizeWithShortLength(sliceStart);
+                ByteBuffer sliceEnd = LegacyLayout.encodeBound(metadata, slice.end(), false);
+                size += ByteBufferUtil.serializedSizeWithShortLength(sliceEnd);
+            }
+
+            if (makeStaticSlice)
+                size += serializedStaticSliceSize(metadata);
+
+            return size;
+        }
+
+        static long serializedStaticSliceSize(CFMetaData metadata)
+        {
+            // unlike serializeStaticSlice(), but we don't care about reversal for size calculations
+            ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, Slice.Bound.BOTTOM, false);
+            long size = ByteBufferUtil.serializedSizeWithShortLength(sliceStart);
+
+            size += TypeSizes.sizeof((short) (metadata.comparator.size() * 3 + 2));
+            size += TypeSizes.sizeof((short) LegacyLayout.STATIC_PREFIX);
+            for (int i = 0; i < metadata.comparator.size(); i++)
+            {
+                size += ByteBufferUtil.serializedSizeWithShortLength(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+                size += 1;  // EOC
+            }
+            return size;
+        }
+
+        private static void serializeSlice(DataOutputPlus out, Slice slice, boolean isReversed, CFMetaData metadata) throws IOException
+        {
+            ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, isReversed ? slice.end() : slice.start(), !isReversed);
+            ByteBufferUtil.writeWithShortLength(sliceStart, out);
+
+            ByteBuffer sliceEnd = LegacyLayout.encodeBound(metadata, isReversed ? slice.start() : slice.end(), isReversed);
+            ByteBufferUtil.writeWithShortLength(sliceEnd, out);
+        }
+
+        private static void serializeStaticSlice(DataOutputPlus out, boolean isReversed, CFMetaData metadata) throws IOException
+        {
+            // if reversed, write an empty bound for the slice start; if reversed, write out an empty bound for the
+            // slice finish after we've written the static slice start
+            if (!isReversed)
+            {
+                ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, Slice.Bound.BOTTOM, false);
+                ByteBufferUtil.writeWithShortLength(sliceStart, out);
+            }
+
+            // write out the length of the composite
+            out.writeShort(2 + metadata.comparator.size() * 3);  // two bytes + EOC for each component, plus static prefix
+            out.writeShort(LegacyLayout.STATIC_PREFIX);
+            for (int i = 0; i < metadata.comparator.size(); i++)
+            {
+                ByteBufferUtil.writeWithShortLength(ByteBufferUtil.EMPTY_BYTE_BUFFER, out);
+                // write the EOC, using an inclusive end if we're on the final component
+                out.writeByte(i == metadata.comparator.size() - 1 ? 1 : 0);
+            }
+
+            if (isReversed)
+            {
+                ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, Slice.Bound.BOTTOM, false);
+                ByteBufferUtil.writeWithShortLength(sliceStart, out);
+            }
+        }
+
+        // Returns the deserialized filter, and whether static columns are queried (in pre-3.0, both info are determined by the slices,
+        // but in 3.0 they are separated: whether static columns are queried or not depends on the ColumnFilter).
+        static Pair<ClusteringIndexSliceFilter, Boolean> deserializeSlicePartitionFilter(DataInputPlus in, CFMetaData metadata) throws IOException
+        {
+            int numSlices = in.readInt();
+            ByteBuffer[] startBuffers = new ByteBuffer[numSlices];
+            ByteBuffer[] finishBuffers = new ByteBuffer[numSlices];
+            for (int i = 0; i < numSlices; i++)
+            {
+                startBuffers[i] = ByteBufferUtil.readWithShortLength(in);
+                finishBuffers[i] = ByteBufferUtil.readWithShortLength(in);
+            }
+
+            boolean reversed = in.readBoolean();
+
+            if (reversed)
+            {
+                // pre-3.0, reversed query slices put the greater element at the start of the slice
+                ByteBuffer[] tmp = finishBuffers;
+                finishBuffers = startBuffers;
+                startBuffers = tmp;
+            }
+
+            boolean selectsStatics = false;
+            Slices.Builder slicesBuilder = new Slices.Builder(metadata.comparator);
+            for (int i = 0; i < numSlices; i++)
+            {
+                LegacyLayout.LegacyBound start = LegacyLayout.decodeSliceBound(metadata, startBuffers[i], true);
+                LegacyLayout.LegacyBound finish = LegacyLayout.decodeSliceBound(metadata, finishBuffers[i], false);
+
+                if (start.isStatic)
+                {
+                    // If we start at the static block, this means we start at the beginning of the partition in 3.0
+                    // terms (since 3.0 handles static outside of the slice).
+                    start = LegacyLayout.LegacyBound.BOTTOM;
+
+                    // Then if we include the static, records it
+                    if (start.bound.isInclusive())
+                        selectsStatics = true;
+                }
+                else if (start == LegacyLayout.LegacyBound.BOTTOM)
+                {
+                    selectsStatics = true;
+                }
+
+                // If the end of the slice is the end of the statics, then that mean this slice was just selecting static
+                // columns. We have already recorded that in selectsStatics, so we can ignore the slice (which doesn't make
+                // sense for 3.0).
+                if (finish.isStatic)
+                {
+                    assert finish.bound.isInclusive(); // it would make no sense for a pre-3.0 node to have a slice that stops
+                                                     // before the static columns (since there is nothing before that)
+                    continue;
+                }
+
+                slicesBuilder.add(Slice.make(start.bound, finish.bound));
+            }
+
+            return Pair.create(new ClusteringIndexSliceFilter(slicesBuilder.build(), reversed), selectsStatics);
+        }
+
+        private static SinglePartitionReadCommand maybeConvertNamesToSlice(SinglePartitionReadCommand command)
+        {
+            if (command.clusteringIndexFilter().kind() != ClusteringIndexFilter.Kind.NAMES)
+                return command;
+
+            CFMetaData metadata = command.metadata();
+
+            if (!shouldConvertNamesToSlice(metadata, command.columnFilter().fetchedColumns()))
+                return command;
+
+            ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter)command.clusteringIndexFilter();
+            ClusteringIndexSliceFilter sliceFilter = convertNamesFilterToSliceFilter(filter, metadata);
+
+            return command.withUpdatedClusteringIndexFilter(sliceFilter);
+        }
+
+        /**
+         * Returns true if a names filter on the given table and column selection should be converted to a slice
+         * filter for compatibility with pre-3.0 nodes, false otherwise.
+         */
+        static boolean shouldConvertNamesToSlice(CFMetaData metadata, PartitionColumns columns)
+        {
+            // On pre-3.0 nodes, due to CASSANDRA-5762, we always do a slice for CQL3 tables (not dense, composite).
+            if (!metadata.isDense() && metadata.isCompound())
+                return true;
+
+            // pre-3.0 nodes don't support names filters for reading collections, so if we're requesting any of those,
+            // we need to convert this to a slice filter
+            for (ColumnDefinition column : columns)
+            {
+                if (column.type.isMultiCell())
+                    return true;
+            }
+            return false;
+        }
+
+        /**
+         * Converts a names filter that is incompatible with pre-3.0 nodes to a slice filter that is compatible.
+         */
+        private static ClusteringIndexSliceFilter convertNamesFilterToSliceFilter(ClusteringIndexNamesFilter filter, CFMetaData metadata)
+        {
+            SortedSet<Clustering> requestedRows = filter.requestedRows();
+            Slices slices;
+            if (requestedRows.isEmpty())
+            {
+                slices = Slices.NONE;
+            }
+            else if (requestedRows.size() == 1 && requestedRows.first().size() == 0)
+            {
+                slices = Slices.ALL;
+            }
+            else
+            {
+                Slices.Builder slicesBuilder = new Slices.Builder(metadata.comparator);
+                for (Clustering clustering : requestedRows)
+                    slicesBuilder.add(Slice.Bound.inclusiveStartOf(clustering), Slice.Bound.inclusiveEndOf(clustering));
+                slices = slicesBuilder.build();
+            }
+
+            return new ClusteringIndexSliceFilter(slices, filter.isReversed());
+        }
+
+        /**
+         * Potentially increases the existing query limit to account for the lack of exclusive bounds in pre-3.0 nodes.
+         * @param limit the existing query limit
+         * @param slices the requested slices
+         * @return the updated limit
+         */
+        static int updateLimitForQuery(int limit, Slices slices)
+        {
+            // Pre-3.0 nodes don't support exclusive bounds for slices. Instead, we query one more element if necessary
+            // and filter it later (in LegacyRemoteDataResponse)
+            if (!slices.hasLowerBound() && ! slices.hasUpperBound())
+                return limit;
+
+            for (Slice slice : slices)
+            {
+                if (limit == Integer.MAX_VALUE)
+                    return limit;
+
+                if (!slice.start().isInclusive())
+                    limit++;
+                if (!slice.end().isInclusive())
+                    limit++;
+            }
+            return limit;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java
new file mode 100644
index 0000000..9cde8dc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java

@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.net.IVerbHandler;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.tracing.Tracing;
+
+public class ReadCommandVerbHandler implements IVerbHandler<ReadCommand>
+{
+    protected IVersionedSerializer<ReadResponse> serializer()
+    {
+        return ReadResponse.serializer;
+    }
+
+    public void doVerb(MessageIn<ReadCommand> message, int id)
+    {
+        if (StorageService.instance.isBootstrapMode())
+        {
+            throw new RuntimeException("Cannot service reads while bootstrapping!");
+        }
+
+        ReadCommand command = message.payload;
+        ReadResponse response;
+        try (ReadOrderGroup opGroup = command.startOrderGroup(); UnfilteredPartitionIterator iterator = command.executeLocally(opGroup))
+        {
+            response = command.createResponse(iterator);
+        }
+
+        MessageOut<ReadResponse> reply = new MessageOut<>(MessagingService.Verb.REQUEST_RESPONSE, response, serializer());
+
+        Tracing.trace("Enqueuing response to {}", message.from);
+        MessagingService.instance().sendReply(reply, id, message.from);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ReadOrderGroup.java b/src/java/org/apache/cassandra/db/ReadOrderGroup.java
new file mode 100644
index 0000000..416a2b8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ReadOrderGroup.java

@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+public class ReadOrderGroup implements AutoCloseable
+{
+    // For every reads
+    private final OpOrder.Group baseOp;
+
+    // For index reads
+    private final OpOrder.Group indexOp;
+    private final OpOrder.Group writeOp;
+
+    private ReadOrderGroup(OpOrder.Group baseOp, OpOrder.Group indexOp, OpOrder.Group writeOp)
+    {
+        this.baseOp = baseOp;
+        this.indexOp = indexOp;
+        this.writeOp = writeOp;
+    }
+
+    public OpOrder.Group baseReadOpOrderGroup()
+    {
+        return baseOp;
+    }
+
+    public OpOrder.Group indexReadOpOrderGroup()
+    {
+        return indexOp;
+    }
+
+    public OpOrder.Group writeOpOrderGroup()
+    {
+        return writeOp;
+    }
+
+    public static ReadOrderGroup emptyGroup()
+    {
+        return new ReadOrderGroup(null, null, null);
+    }
+
+    @SuppressWarnings("resource") // ops closed during group close
+    public static ReadOrderGroup forCommand(ReadCommand command)
+    {
+        ColumnFamilyStore baseCfs = Keyspace.openAndGetStore(command.metadata());
+        ColumnFamilyStore indexCfs = maybeGetIndexCfs(baseCfs, command);
+
+        if (indexCfs == null)
+        {
+            return new ReadOrderGroup(baseCfs.readOrdering.start(), null, null);
+        }
+        else
+        {
+            OpOrder.Group baseOp = null, indexOp = null, writeOp = null;
+            // OpOrder.start() shouldn't fail, but better safe than sorry.
+            try
+            {
+                baseOp = baseCfs.readOrdering.start();
+                indexOp = indexCfs.readOrdering.start();
+                // TODO: this should perhaps not open and maintain a writeOp for the full duration, but instead only *try* to delete stale entries, without blocking if there's no room
+                // as it stands, we open a writeOp and keep it open for the duration to ensure that should this CF get flushed to make room we don't block the reclamation of any room being made
+                writeOp = Keyspace.writeOrder.start();
+                return new ReadOrderGroup(baseOp, indexOp, writeOp);
+            }
+            catch (RuntimeException e)
+            {
+                // Note that must have writeOp == null since ReadOrderGroup ctor can't fail
+                assert writeOp == null;
+                try
+                {
+                    if (baseOp != null)
+                        baseOp.close();
+                }
+                finally
+                {
+                    if (indexOp != null)
+                        indexOp.close();
+                }
+                throw e;
+            }
+        }
+    }
+
+    private static ColumnFamilyStore maybeGetIndexCfs(ColumnFamilyStore baseCfs, ReadCommand command)
+    {
+        Index index = command.getIndex(baseCfs);
+        return index == null ? null : index.getBackingTable().orElse(null);
+    }
+
+    public void close()
+    {
+        try
+        {
+            if (baseOp != null)
+                baseOp.close();
+        }
+        finally
+        {
+            if (indexOp != null)
+            {
+                try
+                {
+                    indexOp.close();
+                }
+                finally
+                {
+                    writeOp.close();
+                }
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/ReadQuery.java b/src/java/org/apache/cassandra/db/ReadQuery.java
new file mode 100644
index 0000000..75ba8f5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/ReadQuery.java

@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.service.pager.PagingState;
+
+/**
+ * Generic abstraction for read queries.
+ * <p>
+ * The main implementation of this is {@link ReadCommand} but we have this interface because
+ * {@link SinglePartitionReadCommand.Group} is also consider as a "read query" but is not a
+ * {@code ReadCommand}.
+ */
+public interface ReadQuery
+{
+    ReadQuery EMPTY = new ReadQuery()
+    {
+        public ReadOrderGroup startOrderGroup()
+        {
+            return ReadOrderGroup.emptyGroup();
+        }
+
+        public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState) throws RequestExecutionException
+        {
+            return EmptyIterators.partition();
+        }
+
+        public PartitionIterator executeInternal(ReadOrderGroup orderGroup)
+        {
+            return EmptyIterators.partition();
+        }
+
+        public DataLimits limits()
+        {
+            // What we return here doesn't matter much in practice. However, returning DataLimits.NONE means
+            // "no particular limit", which makes SelectStatement.execute() take the slightly more complex "paging"
+            // path. Not a big deal but it's easy enough to return a limit of 0 rows which avoids this.
+            return DataLimits.cqlLimits(0);
+        }
+
+        public QueryPager getPager(PagingState state, int protocolVersion)
+        {
+            return QueryPager.EMPTY;
+        }
+
+        public QueryPager getLocalPager()
+        {
+            return QueryPager.EMPTY;
+        }
+
+        public boolean selectsKey(DecoratedKey key)
+        {
+            return false;
+        }
+
+        public boolean selectsClustering(DecoratedKey key, Clustering clustering)
+        {
+            return false;
+        }
+
+        @Override
+        public boolean selectsFullPartition()
+        {
+            return false;
+        }
+    };
+
+    /**
+     * Starts a new read operation.
+     * <p>
+     * This must be called before {@link executeInternal} and passed to it to protect the read.
+     * The returned object <b>must</b> be closed on all path and it is thus strongly advised to
+     * use it in a try-with-ressource construction.
+     *
+     * @return a newly started order group for this {@code ReadQuery}.
+     */
+    public ReadOrderGroup startOrderGroup();
+
+    /**
+     * Executes the query at the provided consistency level.
+     *
+     * @param consistency the consistency level to achieve for the query.
+     * @param clientState the {@code ClientState} for the query. In practice, this can be null unless
+     * {@code consistency} is a serial consistency.
+     *
+     * @return the result of the query.
+     */
+    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState) throws RequestExecutionException;
+
+    /**
+     * Execute the query for internal queries (that is, it basically executes the query locally).
+     *
+     * @param orderGroup the {@code ReadOrderGroup} protecting the read.
+     * @return the result of the query.
+     */
+    public PartitionIterator executeInternal(ReadOrderGroup orderGroup);
+
+    /**
+     * Returns a pager for the query.
+     *
+     * @param pagingState the {@code PagingState} to start from if this is a paging continuation. This can be
+     * {@code null} if this is the start of paging.
+     * @param protocolVersion the protocol version to use for the paging state of that pager.
+     *
+     * @return a pager for the query.
+     */
+    public QueryPager getPager(PagingState pagingState, int protocolVersion);
+
+    /**
+     * The limits for the query.
+     *
+     * @return The limits for the query.
+     */
+    public DataLimits limits();
+
+    /**
+     * @return true if the read query would select the given key, including checks against the row filter, if
+     * checkRowFilter is true
+     */
+    public boolean selectsKey(DecoratedKey key);
+
+    /**
+     * @return true if the read query would select the given clustering, including checks against the row filter, if
+     * checkRowFilter is true
+     */
+    public boolean selectsClustering(DecoratedKey key, Clustering clustering);
+
+    /**
+     * Checks if this {@code ReadQuery} selects full partitions, that is it has no filtering on clustering or regular columns.
+     * @return {@code true} if this {@code ReadQuery} selects full partitions, {@code false} otherwise.
+     */
+    public boolean selectsFullPartition();
+}

diff --git a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java
index 849ac70..2e499e7 100644
--- a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java

@@ -26,7 +26,6 @@
     public void doVerb(MessageIn<Mutation> message, int id)
     {
         message.payload.apply();
-        WriteResponse response = new WriteResponse();
-        MessagingService.instance().sendReply(response.createMessage(), id, message.from);
+        MessagingService.instance().sendReply(WriteResponse.createMessage(), id, message.from);
     }
 }

diff --git a/src/java/org/apache/cassandra/db/ReadResponse.java b/src/java/org/apache/cassandra/db/ReadResponse.java
index d8744d3..bc44a1b 100644
--- a/src/java/org/apache/cassandra/db/ReadResponse.java
+++ b/src/java/org/apache/cassandra/db/ReadResponse.java

@@ -19,115 +19,527 @@
 
 import java.io.*;
 import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
+import java.security.MessageDigest;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.io.ForwardingVersionedSerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.thrift.ThriftResultsMerger;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
 
-/*
- * The read response message is sent by the server when reading data
- * this encapsulates the keyspacename and the row that has been read.
- * The keyspace name is needed so that we can use it to create repairs.
- */
-public class ReadResponse
+public abstract class ReadResponse
 {
-    public static final IVersionedSerializer<ReadResponse> serializer = new ReadResponseSerializer();
-    private static final AtomicReferenceFieldUpdater<ReadResponse, ByteBuffer> digestUpdater = AtomicReferenceFieldUpdater.newUpdater(ReadResponse.class, ByteBuffer.class, "digest");
-
-    private final Row row;
-    private volatile ByteBuffer digest;
-
-    public ReadResponse(ByteBuffer digest)
+    // Serializer for single partition read response
+    public static final IVersionedSerializer<ReadResponse> serializer = new Serializer();
+    // Serializer for the pre-3.0 rang slice responses.
+    public static final IVersionedSerializer<ReadResponse> legacyRangeSliceReplySerializer = new LegacyRangeSliceReplySerializer();
+    // Serializer for partition range read response (this actually delegate to 'serializer' in 3.0 and to
+    // 'legacyRangeSliceReplySerializer' in older version.
+    public static final IVersionedSerializer<ReadResponse> rangeSliceSerializer = new ForwardingVersionedSerializer<ReadResponse>()
     {
-        this(null, digest);
-        assert digest != null;
-    }
-
-    public ReadResponse(Row row)
-    {
-        this(row, null);
-        assert row != null;
-    }
-
-    public ReadResponse(Row row, ByteBuffer digest)
-    {
-        this.row = row;
-        this.digest = digest;
-    }
-
-    public Row row()
-    {
-        return row;
-    }
-
-    public ByteBuffer digest()
-    {
-        return digest;
-    }
-
-    public void setDigest(ByteBuffer digest)
-    {
-        ByteBuffer curr = this.digest;
-        if (!digestUpdater.compareAndSet(this, curr, digest))
+        @Override
+        protected IVersionedSerializer<ReadResponse> delegate(int version)
         {
-            assert digest.equals(this.digest) :
-                String.format("Digest mismatch : %s vs %s",
-                              Arrays.toString(digest.array()),
-                              Arrays.toString(this.digest.array()));
+            return version < MessagingService.VERSION_30
+                    ? legacyRangeSliceReplySerializer
+                    : serializer;
+        }
+    };
+
+    // This is used only when serializing data responses and we can't it easily in other cases. So this can be null, which is slighly
+    // hacky, but as this hack doesn't escape this class, and it's easy enough to validate that it's not null when we need, it's "good enough".
+    private final ReadCommand command;
+
+    protected ReadResponse(ReadCommand command)
+    {
+        this.command = command;
+    }
+
+    public static ReadResponse createDataResponse(UnfilteredPartitionIterator data, ReadCommand command)
+    {
+        return new LocalDataResponse(data, command);
+    }
+
+    @VisibleForTesting
+    public static ReadResponse createRemoteDataResponse(UnfilteredPartitionIterator data, ReadCommand command)
+    {
+        return new RemoteDataResponse(LocalDataResponse.build(data, command.columnFilter()));
+    }
+
+    public static ReadResponse createDigestResponse(UnfilteredPartitionIterator data, ReadCommand command)
+    {
+        return new DigestResponse(makeDigest(data, command));
+    }
+
+    public abstract UnfilteredPartitionIterator makeIterator(ReadCommand command);
+    public abstract ByteBuffer digest(ReadCommand command);
+
+    public abstract boolean isDigestResponse();
+
+    /**
+     * Creates a string of the requested partition in this read response suitable for debugging.
+     */
+    public String toDebugString(ReadCommand command, DecoratedKey key)
+    {
+        if (isDigestResponse())
+            return "Digest:0x" + ByteBufferUtil.bytesToHex(digest(command));
+
+        try (UnfilteredPartitionIterator iter = makeIterator(command))
+        {
+            while (iter.hasNext())
+            {
+                try (UnfilteredRowIterator partition = iter.next())
+                {
+                    if (partition.partitionKey().equals(key))
+                        return toDebugString(partition, command.metadata());
+                }
+            }
+        }
+        return "<key " + key + " not found>";
+    }
+
+    private String toDebugString(UnfilteredRowIterator partition, CFMetaData metadata)
+    {
+        StringBuilder sb = new StringBuilder();
+
+        sb.append(String.format("[%s.%s] key=%s partition_deletion=%s columns=%s",
+                                metadata.ksName,
+                                metadata.cfName,
+                                metadata.getKeyValidator().getString(partition.partitionKey().getKey()),
+                                partition.partitionLevelDeletion(),
+                                partition.columns()));
+
+        if (partition.staticRow() != Rows.EMPTY_STATIC_ROW)
+            sb.append("\n    ").append(partition.staticRow().toString(metadata, true));
+
+        while (partition.hasNext())
+            sb.append("\n    ").append(partition.next().toString(metadata, true));
+
+        return sb.toString();
+    }
+
+    protected static ByteBuffer makeDigest(UnfilteredPartitionIterator iterator, ReadCommand command)
+    {
+        MessageDigest digest = FBUtilities.threadLocalMD5Digest();
+        UnfilteredPartitionIterators.digest(command, iterator, digest, command.digestVersion());
+        return ByteBuffer.wrap(digest.digest());
+    }
+
+    private static class DigestResponse extends ReadResponse
+    {
+        private final ByteBuffer digest;
+
+        private DigestResponse(ByteBuffer digest)
+        {
+            super(null);
+            assert digest.hasRemaining();
+            this.digest = digest;
+        }
+
+        public UnfilteredPartitionIterator makeIterator(ReadCommand command)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public ByteBuffer digest(ReadCommand command)
+        {
+            // We assume that the digest is in the proper version, which bug excluded should be true since this is called with
+            // ReadCommand.digestVersion() as argument and that's also what we use to produce the digest in the first place.
+            // Validating it's the proper digest in this method would require sending back the digest version along with the
+            // digest which would waste bandwith for little gain.
+            return digest;
+        }
+
+        public boolean isDigestResponse()
+        {
+            return true;
         }
     }
 
-    public boolean isDigestQuery()
+    // built on the owning node responding to a query
+    private static class LocalDataResponse extends DataResponse
     {
-        return digest != null && row == null;
-    }
-}
-
-class ReadResponseSerializer implements IVersionedSerializer<ReadResponse>
-{
-    public void serialize(ReadResponse response, DataOutputPlus out, int version) throws IOException
-    {
-        out.writeInt(response.isDigestQuery() ? response.digest().remaining() : 0);
-        ByteBuffer buffer = response.isDigestQuery() ? response.digest() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        out.write(buffer);
-        out.writeBoolean(response.isDigestQuery());
-        if (!response.isDigestQuery())
-            Row.serializer.serialize(response.row(), out, version);
-    }
-
-    public ReadResponse deserialize(DataInput in, int version) throws IOException
-    {
-        byte[] digest = null;
-        int digestSize = in.readInt();
-        if (digestSize > 0)
+        private LocalDataResponse(UnfilteredPartitionIterator iter, ReadCommand command)
         {
-            digest = new byte[digestSize];
-            in.readFully(digest, 0, digestSize);
-        }
-        boolean isDigest = in.readBoolean();
-        assert isDigest == digestSize > 0;
-
-        Row row = null;
-        if (!isDigest)
-        {
-            // This is coming from a remote host
-            row = Row.serializer.deserialize(in, version, ColumnSerializer.Flag.FROM_REMOTE);
+            super(command, build(iter, command.columnFilter()), SerializationHelper.Flag.LOCAL);
         }
 
-        return isDigest ? new ReadResponse(ByteBuffer.wrap(digest)) : new ReadResponse(row);
+        private static ByteBuffer build(UnfilteredPartitionIterator iter, ColumnFilter selection)
+        {
+            try (DataOutputBuffer buffer = new DataOutputBuffer())
+            {
+                UnfilteredPartitionIterators.serializerForIntraNode().serialize(iter, selection, buffer, MessagingService.current_version);
+                return buffer.buffer();
+            }
+            catch (IOException e)
+            {
+                // We're serializing in memory so this shouldn't happen
+                throw new RuntimeException(e);
+            }
+        }
     }
 
-    public long serializedSize(ReadResponse response, int version)
+    // built on the coordinator node receiving a response
+    private static class RemoteDataResponse extends DataResponse
     {
-        TypeSizes typeSizes = TypeSizes.NATIVE;
-        ByteBuffer buffer = response.isDigestQuery() ? response.digest() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        int size = typeSizes.sizeof(buffer.remaining());
-        size += buffer.remaining();
-        size += typeSizes.sizeof(response.isDigestQuery());
-        if (!response.isDigestQuery())
-            size += Row.serializer.serializedSize(response.row(), version);
-        return size;
+        protected RemoteDataResponse(ByteBuffer data)
+        {
+            super(null, data, SerializationHelper.Flag.FROM_REMOTE);
+        }
+    }
+
+    static abstract class DataResponse extends ReadResponse
+    {
+        // TODO: can the digest be calculated over the raw bytes now?
+        // The response, serialized in the current messaging version
+        private final ByteBuffer data;
+        private final SerializationHelper.Flag flag;
+
+        protected DataResponse(ReadCommand command, ByteBuffer data, SerializationHelper.Flag flag)
+        {
+            super(command);
+            this.data = data;
+            this.flag = flag;
+        }
+
+        public UnfilteredPartitionIterator makeIterator(ReadCommand command)
+        {
+            try (DataInputBuffer in = new DataInputBuffer(data, true))
+            {
+                // Note that the command parameter shadows the 'command' field and this is intended because
+                // the later can be null (for RemoteDataResponse as those are created in the serializers and
+                // those don't have easy access to the command). This is also why we need the command as parameter here.
+                return UnfilteredPartitionIterators.serializerForIntraNode().deserialize(in,
+                                                                                         MessagingService.current_version,
+                                                                                         command.metadata(),
+                                                                                         command.columnFilter(),
+                                                                                         flag);
+            }
+            catch (IOException e)
+            {
+                // We're deserializing in memory so this shouldn't happen
+                throw new RuntimeException(e);
+            }
+        }
+
+        public ByteBuffer digest(ReadCommand command)
+        {
+            try (UnfilteredPartitionIterator iterator = makeIterator(command))
+            {
+                return makeDigest(iterator, command);
+            }
+        }
+
+        public boolean isDigestResponse()
+        {
+            return false;
+        }
+    }
+
+    /**
+     * A remote response from a pre-3.0 node.  This needs a separate class in order to cleanly handle trimming and
+     * reversal of results when the read command calls for it.  Pre-3.0 nodes always return results in the normal
+     * sorted order, even if the query asks for reversed results.  Additionally,  pre-3.0 nodes do not have a notion of
+     * exclusive slices on non-composite tables, so extra rows may need to be trimmed.
+     */
+    @VisibleForTesting
+    static class LegacyRemoteDataResponse extends ReadResponse
+    {
+        private final List<ImmutableBTreePartition> partitions;
+
+        @VisibleForTesting
+        LegacyRemoteDataResponse(List<ImmutableBTreePartition> partitions)
+        {
+            super(null); // we never serialize LegacyRemoteDataResponses, so we don't care about the command
+            this.partitions = partitions;
+        }
+
+        public UnfilteredPartitionIterator makeIterator(final ReadCommand command)
+        {
+            // Due to a bug in the serialization of AbstractBounds, anything that isn't a Range is understood by pre-3.0 nodes
+            // as a Bound, which means IncludingExcludingBounds and ExcludingBounds responses may include keys they shouldn't.
+            // So filter partitions that shouldn't be included here.
+            boolean skipFirst = false;
+            boolean skipLast = false;
+            if (!partitions.isEmpty() && command instanceof PartitionRangeReadCommand)
+            {
+                AbstractBounds<PartitionPosition> keyRange = ((PartitionRangeReadCommand)command).dataRange().keyRange();
+                boolean isExcludingBounds = keyRange instanceof ExcludingBounds;
+                skipFirst = isExcludingBounds && !keyRange.contains(partitions.get(0).partitionKey());
+                skipLast = (isExcludingBounds || keyRange instanceof IncludingExcludingBounds) && !keyRange.contains(partitions.get(partitions.size() - 1).partitionKey());
+            }
+
+            final List<ImmutableBTreePartition> toReturn;
+            if (skipFirst || skipLast)
+            {
+                toReturn = partitions.size() == 1
+                         ? Collections.emptyList()
+                         : partitions.subList(skipFirst ? 1 : 0, skipLast ? partitions.size() - 1 : partitions.size());
+            }
+            else
+            {
+                toReturn = partitions;
+            }
+
+            return new AbstractUnfilteredPartitionIterator()
+            {
+                private int idx;
+
+                public boolean isForThrift()
+                {
+                    return true;
+                }
+
+                public CFMetaData metadata()
+                {
+                    return command.metadata();
+                }
+
+                public boolean hasNext()
+                {
+                    return idx < toReturn.size();
+                }
+
+                public UnfilteredRowIterator next()
+                {
+                    ImmutableBTreePartition partition = toReturn.get(idx++);
+
+                    ClusteringIndexFilter filter = command.clusteringIndexFilter(partition.partitionKey());
+
+                    // Pre-3.0 we would always request one more row than we actually needed and the command-level "start" would
+                    // be the last-returned cell name, so the response would always include it. By consequence, we need to filter
+                    // the results here.
+                    UnfilteredRowIterator iterator = filter.filter(partition.sliceableUnfilteredIterator(command.columnFilter(), filter.isReversed()));
+
+                    // Wrap results with a ThriftResultMerger only if they're intended for the thrift command.
+                    if (command.isForThrift())
+                        return ThriftResultsMerger.maybeWrap(iterator, command.nowInSec());
+                    else
+                        return iterator;
+                }
+            };
+        }
+
+        public ByteBuffer digest(ReadCommand command)
+        {
+            try (UnfilteredPartitionIterator iterator = makeIterator(command))
+            {
+                return makeDigest(iterator, command);
+            }
+        }
+
+        public boolean isDigestResponse()
+        {
+            return false;
+        }
+    }
+
+    private static class Serializer implements IVersionedSerializer<ReadResponse>
+    {
+        public void serialize(ReadResponse response, DataOutputPlus out, int version) throws IOException
+        {
+            boolean isDigest = response instanceof DigestResponse;
+            ByteBuffer digest = isDigest ? ((DigestResponse)response).digest : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+            if (version < MessagingService.VERSION_30)
+            {
+                out.writeInt(digest.remaining());
+                out.write(digest);
+                out.writeBoolean(isDigest);
+                if (!isDigest)
+                {
+                    assert response.command != null; // we only serialize LocalDataResponse, which always has the command set
+                    try (UnfilteredPartitionIterator iter = response.makeIterator(response.command))
+                    {
+                        assert iter.hasNext();
+                        try (UnfilteredRowIterator partition = iter.next())
+                        {
+                            ByteBufferUtil.writeWithShortLength(partition.partitionKey().getKey(), out);
+                            LegacyLayout.serializeAsLegacyPartition(response.command, partition, out, version);
+                        }
+                        assert !iter.hasNext();
+                    }
+                }
+                return;
+            }
+
+            ByteBufferUtil.writeWithVIntLength(digest, out);
+            if (!isDigest)
+            {
+                ByteBuffer data = ((DataResponse)response).data;
+                ByteBufferUtil.writeWithVIntLength(data, out);
+            }
+        }
+
+        public ReadResponse deserialize(DataInputPlus in, int version) throws IOException
+        {
+            if (version < MessagingService.VERSION_30)
+            {
+                byte[] digest = null;
+                int digestSize = in.readInt();
+                if (digestSize > 0)
+                {
+                    digest = new byte[digestSize];
+                    in.readFully(digest, 0, digestSize);
+                }
+                boolean isDigest = in.readBoolean();
+                assert isDigest == digestSize > 0;
+                if (isDigest)
+                {
+                    assert digest != null;
+                    return new DigestResponse(ByteBuffer.wrap(digest));
+                }
+
+                // ReadResponses from older versions are always single-partition (ranges are handled by RangeSliceReply)
+                ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
+                try (UnfilteredRowIterator rowIterator = LegacyLayout.deserializeLegacyPartition(in, version, SerializationHelper.Flag.FROM_REMOTE, key))
+                {
+                    if (rowIterator == null)
+                        return new LegacyRemoteDataResponse(Collections.emptyList());
+
+                    return new LegacyRemoteDataResponse(Collections.singletonList(ImmutableBTreePartition.create(rowIterator)));
+                }
+            }
+
+            ByteBuffer digest = ByteBufferUtil.readWithVIntLength(in);
+            if (digest.hasRemaining())
+                return new DigestResponse(digest);
+
+            assert version >= MessagingService.VERSION_30;
+            ByteBuffer data = ByteBufferUtil.readWithVIntLength(in);
+            return new RemoteDataResponse(data);
+        }
+
+        public long serializedSize(ReadResponse response, int version)
+        {
+            boolean isDigest = response instanceof DigestResponse;
+            ByteBuffer digest = isDigest ? ((DigestResponse)response).digest : ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+            if (version < MessagingService.VERSION_30)
+            {
+                long size = TypeSizes.sizeof(digest.remaining())
+                        + digest.remaining()
+                        + TypeSizes.sizeof(isDigest);
+                if (!isDigest)
+                {
+                    assert response.command != null; // we only serialize LocalDataResponse, which always has the command set
+                    try (UnfilteredPartitionIterator iter = response.makeIterator(response.command))
+                    {
+                        assert iter.hasNext();
+                        try (UnfilteredRowIterator partition = iter.next())
+                        {
+                            size += ByteBufferUtil.serializedSizeWithShortLength(partition.partitionKey().getKey());
+                            size += LegacyLayout.serializedSizeAsLegacyPartition(response.command, partition, version);
+                        }
+                        assert !iter.hasNext();
+                    }
+                }
+                return size;
+            }
+
+            long size = ByteBufferUtil.serializedSizeWithVIntLength(digest);
+            if (!isDigest)
+            {
+                // In theory, we should deserialize/re-serialize if the version asked is different from the current
+                // version as the content could have a different serialization format. So far though, we haven't made
+                // change to partition iterators serialization since 3.0 so we skip this.
+                assert version >= MessagingService.VERSION_30;
+                ByteBuffer data = ((DataResponse)response).data;
+                size += ByteBufferUtil.serializedSizeWithVIntLength(data);
+            }
+            return size;
+        }
+    }
+
+    private static class LegacyRangeSliceReplySerializer implements IVersionedSerializer<ReadResponse>
+    {
+        public void serialize(ReadResponse response, DataOutputPlus out, int version) throws IOException
+        {
+            assert version < MessagingService.VERSION_30;
+
+            // determine the number of partitions upfront for serialization
+            int numPartitions = 0;
+            assert response.command != null; // we only serialize LocalDataResponse, which always has the command set
+            try (UnfilteredPartitionIterator iterator = response.makeIterator(response.command))
+            {
+                while (iterator.hasNext())
+                {
+                    try (UnfilteredRowIterator atomIterator = iterator.next())
+                    {
+                        numPartitions++;
+
+                        // we have to fully exhaust the subiterator
+                        while (atomIterator.hasNext())
+                            atomIterator.next();
+                    }
+                }
+            }
+
+            out.writeInt(numPartitions);
+
+            try (UnfilteredPartitionIterator iterator = response.makeIterator(response.command))
+            {
+                while (iterator.hasNext())
+                {
+                    try (UnfilteredRowIterator partition = iterator.next())
+                    {
+                        ByteBufferUtil.writeWithShortLength(partition.partitionKey().getKey(), out);
+                        LegacyLayout.serializeAsLegacyPartition(response.command, partition, out, version);
+                    }
+                }
+            }
+        }
+
+        public ReadResponse deserialize(DataInputPlus in, int version) throws IOException
+        {
+            assert version < MessagingService.VERSION_30;
+
+            int partitionCount = in.readInt();
+            ArrayList<ImmutableBTreePartition> partitions = new ArrayList<>(partitionCount);
+            for (int i = 0; i < partitionCount; i++)
+            {
+                ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
+                try (UnfilteredRowIterator partition = LegacyLayout.deserializeLegacyPartition(in, version, SerializationHelper.Flag.FROM_REMOTE, key))
+                {
+                    partitions.add(ImmutableBTreePartition.create(partition));
+                }
+            }
+            return new LegacyRemoteDataResponse(partitions);
+        }
+
+        public long serializedSize(ReadResponse response, int version)
+        {
+            assert version < MessagingService.VERSION_30;
+            long size = TypeSizes.sizeof(0);  // number of partitions
+
+            assert response.command != null; // we only serialize LocalDataResponse, which always has the command set
+            try (UnfilteredPartitionIterator iterator = response.makeIterator(response.command))
+            {
+                while (iterator.hasNext())
+                {
+                    try (UnfilteredRowIterator partition = iterator.next())
+                    {
+                        size += ByteBufferUtil.serializedSizeWithShortLength(partition.partitionKey().getKey());
+                        size += LegacyLayout.serializedSizeAsLegacyPartition(response.command, partition, version);
+                    }
+                }
+            }
+            return size;
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/ReadVerbHandler.java b/src/java/org/apache/cassandra/db/ReadVerbHandler.java
deleted file mode 100644
index 8c167ed..0000000
--- a/src/java/org/apache/cassandra/db/ReadVerbHandler.java
+++ /dev/null

@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import org.apache.cassandra.net.IVerbHandler;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.tracing.Tracing;
-
-public class ReadVerbHandler implements IVerbHandler<ReadCommand>
-{
-    public void doVerb(MessageIn<ReadCommand> message, int id)
-    {
-        if (StorageService.instance.isBootstrapMode())
-        {
-            throw new RuntimeException("Cannot service reads while bootstrapping!");
-        }
-
-        ReadCommand command = message.payload;
-        Keyspace keyspace = Keyspace.open(command.ksName);
-        Row row = command.getRow(keyspace);
-
-        MessageOut<ReadResponse> reply = new MessageOut<ReadResponse>(MessagingService.Verb.REQUEST_RESPONSE,
-                                                                      getResponse(command, row),
-                                                                      ReadResponse.serializer);
-        Tracing.trace("Enqueuing response to {}", message.from);
-        MessagingService.instance().sendReply(reply, id, message.from);
-    }
-
-    public static ReadResponse getResponse(ReadCommand command, Row row)
-    {
-        if (command.isDigestQuery())
-        {
-            return new ReadResponse(ColumnFamily.digest(row.cf));
-        }
-        else
-        {
-            return new ReadResponse(row);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/RetriedSliceFromReadCommand.java b/src/java/org/apache/cassandra/db/RetriedSliceFromReadCommand.java
deleted file mode 100644
index 41f5a50..0000000
--- a/src/java/org/apache/cassandra/db/RetriedSliceFromReadCommand.java
+++ /dev/null

@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-
-public class RetriedSliceFromReadCommand extends SliceFromReadCommand
-{
-    static final Logger logger = LoggerFactory.getLogger(RetriedSliceFromReadCommand.class);
-    public final int originalCount;
-
-    public RetriedSliceFromReadCommand(String keyspaceName, ByteBuffer key, String cfName, long timestamp, SliceQueryFilter filter, int originalCount)
-    {
-        super(keyspaceName, key, cfName, timestamp, filter);
-        this.originalCount = originalCount;
-    }
-
-    @Override
-    public ReadCommand copy()
-    {
-        return new RetriedSliceFromReadCommand(ksName, key, cfName, timestamp, filter, originalCount).setIsDigestQuery(isDigestQuery());
-    }
-
-    @Override
-    public int getOriginalRequestedCount()
-    {
-        return originalCount;
-    }
-
-    @Override
-    public String toString()
-    {
-        return "RetriedSliceFromReadCommand(" + "cmd=" + super.toString() + ", originalCount=" + originalCount + ")";
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/db/Row.java b/src/java/org/apache/cassandra/db/Row.java
deleted file mode 100644
index a826894..0000000
--- a/src/java/org/apache/cassandra/db/Row.java
+++ /dev/null

@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class Row
-{
-    public static final RowSerializer serializer = new RowSerializer();
-
-    public final DecoratedKey key;
-    public final ColumnFamily cf;
-
-    public Row(DecoratedKey key, ColumnFamily cf)
-    {
-        assert key != null;
-        // cf may be null, indicating no data
-        this.key = key;
-        this.cf = cf;
-    }
-
-    public Row(ByteBuffer key, ColumnFamily updates)
-    {
-        this(StorageService.getPartitioner().decorateKey(key), updates);
-    }
-
-    @Override
-    public String toString()
-    {
-        return "Row(" +
-               "key=" + key +
-               ", cf=" + cf +
-               ')';
-    }
-
-    public int getLiveCount(IDiskAtomFilter filter, long now)
-    {
-        return cf == null ? 0 : filter.getLiveCount(cf, now);
-    }
-
-    public static class RowSerializer implements IVersionedSerializer<Row>
-    {
-        public void serialize(Row row, DataOutputPlus out, int version) throws IOException
-        {
-            ByteBufferUtil.writeWithShortLength(row.key.getKey(), out);
-            ColumnFamily.serializer.serialize(row.cf, out, version);
-        }
-
-        public Row deserialize(DataInput in, int version, ColumnSerializer.Flag flag) throws IOException
-        {
-            return new Row(StorageService.getPartitioner().decorateKey(ByteBufferUtil.readWithShortLength(in)),
-                           ColumnFamily.serializer.deserialize(in, flag, version));
-        }
-
-        public Row deserialize(DataInput in, int version) throws IOException
-        {
-            return deserialize(in, version, ColumnSerializer.Flag.LOCAL);
-        }
-
-        public long serializedSize(Row row, int version)
-        {
-            int keySize = row.key.getKey().remaining();
-            return TypeSizes.NATIVE.sizeof((short) keySize) + keySize + ColumnFamily.serializer.serializedSize(row.cf, TypeSizes.NATIVE, version);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/RowIndexEntry.java b/src/java/org/apache/cassandra/db/RowIndexEntry.java
index f9d8c6d..4e2f063 100644
--- a/src/java/org/apache/cassandra/db/RowIndexEntry.java
+++ b/src/java/org/apache/cassandra/db/RowIndexEntry.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -26,10 +25,11 @@
 
 import com.google.common.primitives.Ints;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.cache.IMeasurableMemory;
-import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ObjectSizes;
@@ -45,7 +45,7 @@
         this.position = position;
     }
 
-    public int promotedSize(ISerializer<T> idxSerializer)
+    protected int promotedSize(IndexHelper.IndexInfo.Serializer idxSerializer)
     {
         return 0;
     }
@@ -59,7 +59,7 @@
         // since if there are insufficient columns to be worth indexing we're going to seek to
         // the beginning of the row anyway, so we might as well read the tombstone there as well.
         if (index.columnsIndex.size() > 1)
-            return new IndexedEntry(position, deletionTime, index.columnsIndex);
+            return new IndexedEntry(position, deletionTime, index.partitionHeaderLength, index.columnsIndex);
         else
             return new RowIndexEntry<>(position);
     }
@@ -87,6 +87,16 @@
         return 0;
     }
 
+    /**
+     * The length of the row header (partition key, partition deletion and static row).
+     * This value is only provided for indexed entries and this method will throw
+     * {@code UnsupportedOperationException} if {@code !isIndexed()}.
+     */
+    public long headerLength()
+    {
+        throw new UnsupportedOperationException();
+    }
+
     public List<T> columnsIndex()
     {
         return Collections.emptyList();
@@ -97,51 +107,119 @@
         return EMPTY_SIZE;
     }
 
-    public static interface IndexSerializer<T>
+    public interface IndexSerializer<T>
     {
         void serialize(RowIndexEntry<T> rie, DataOutputPlus out) throws IOException;
-        RowIndexEntry<T> deserialize(DataInput in, Version version) throws IOException;
-        public int serializedSize(RowIndexEntry<T> rie);
+        RowIndexEntry<T> deserialize(DataInputPlus in) throws IOException;
+        int serializedSize(RowIndexEntry<T> rie);
     }
 
     public static class Serializer implements IndexSerializer<IndexHelper.IndexInfo>
     {
-        private final ISerializer<IndexHelper.IndexInfo> idxSerializer;
+        private final IndexHelper.IndexInfo.Serializer idxSerializer;
+        private final Version version;
 
-        public Serializer(ISerializer<IndexHelper.IndexInfo> idxSerializer)
+        public Serializer(CFMetaData metadata, Version version, SerializationHeader header)
         {
-            this.idxSerializer = idxSerializer;
+            this.idxSerializer = new IndexHelper.IndexInfo.Serializer(metadata, version, header);
+            this.version = version;
         }
 
         public void serialize(RowIndexEntry<IndexHelper.IndexInfo> rie, DataOutputPlus out) throws IOException
         {
-            out.writeLong(rie.position);
-            out.writeInt(rie.promotedSize(idxSerializer));
+            assert version.storeRows() : "We read old index files but we should never write them";
+
+            out.writeUnsignedVInt(rie.position);
+            out.writeUnsignedVInt(rie.promotedSize(idxSerializer));
 
             if (rie.isIndexed())
             {
+                out.writeUnsignedVInt(rie.headerLength());
                 DeletionTime.serializer.serialize(rie.deletionTime(), out);
-                out.writeInt(rie.columnsIndex().size());
-                for (IndexHelper.IndexInfo info : rie.columnsIndex())
-                    idxSerializer.serialize(info, out);
+                out.writeUnsignedVInt(rie.columnsIndex().size());
+
+                // Calculate and write the offsets to the IndexInfo objects.
+
+                int[] offsets = new int[rie.columnsIndex().size()];
+
+                if (out.hasPosition())
+                {
+                    // Out is usually a SequentialWriter, so using the file-pointer is fine to generate the offsets.
+                    // A DataOutputBuffer also works.
+                    long start = out.position();
+                    int i = 0;
+                    for (IndexHelper.IndexInfo info : rie.columnsIndex())
+                    {
+                        offsets[i] = i == 0 ? 0 : (int)(out.position() - start);
+                        i++;
+                        idxSerializer.serialize(info, out);
+                    }
+                }
+                else
+                {
+                    // Not sure this branch will ever be needed, but if it is called, it has to calculate the
+                    // serialized sizes instead of simply using the file-pointer.
+                    int i = 0;
+                    int offset = 0;
+                    for (IndexHelper.IndexInfo info : rie.columnsIndex())
+                    {
+                        offsets[i++] = offset;
+                        idxSerializer.serialize(info, out);
+                        offset += idxSerializer.serializedSize(info);
+                    }
+                }
+
+                for (int off : offsets)
+                    out.writeInt(off);
             }
         }
 
-        public RowIndexEntry<IndexHelper.IndexInfo> deserialize(DataInput in, Version version) throws IOException
+        public RowIndexEntry<IndexHelper.IndexInfo> deserialize(DataInputPlus in) throws IOException
         {
-            long position = in.readLong();
+            if (!version.storeRows())
+            {
+                long position = in.readLong();
 
-            int size = in.readInt();
+                int size = in.readInt();
+                if (size > 0)
+                {
+                    DeletionTime deletionTime = DeletionTime.serializer.deserialize(in);
+
+                    int entries = in.readInt();
+                    List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<>(entries);
+
+                    long headerLength = 0L;
+                    for (int i = 0; i < entries; i++)
+                    {
+                        IndexHelper.IndexInfo info = idxSerializer.deserialize(in);
+                        columnsIndex.add(info);
+                        if (i == 0)
+                            headerLength = info.offset;
+                    }
+
+                    return new IndexedEntry(position, deletionTime, headerLength, columnsIndex);
+                }
+                else
+                {
+                    return new RowIndexEntry<>(position);
+                }
+            }
+
+            long position = in.readUnsignedVInt();
+
+            int size = (int)in.readUnsignedVInt();
             if (size > 0)
             {
+                long headerLength = in.readUnsignedVInt();
                 DeletionTime deletionTime = DeletionTime.serializer.deserialize(in);
-
-                int entries = in.readInt();
+                int entries = (int)in.readUnsignedVInt();
                 List<IndexHelper.IndexInfo> columnsIndex = new ArrayList<>(entries);
                 for (int i = 0; i < entries; i++)
                     columnsIndex.add(idxSerializer.deserialize(in));
 
-                return new IndexedEntry(position, deletionTime, columnsIndex);
+                in.skipBytesFully(entries * TypeSizes.sizeof(0));
+
+                return new IndexedEntry(position, deletionTime, headerLength, columnsIndex);
             }
             else
             {
@@ -149,38 +227,49 @@
             }
         }
 
-        public static void skip(DataInput in) throws IOException
+        // Reads only the data 'position' of the index entry and returns it. Note that this left 'in' in the middle
+        // of reading an entry, so this is only useful if you know what you are doing and in most case 'deserialize'
+        // should be used instead.
+        public static long readPosition(DataInputPlus in, Version version) throws IOException
         {
-            in.readLong();
-            skipPromotedIndex(in);
+            return version.storeRows() ? in.readUnsignedVInt() : in.readLong();
         }
 
-        private static void skipPromotedIndex(DataInput in) throws IOException
+        public static void skip(DataInputPlus in, Version version) throws IOException
         {
-            int size = in.readInt();
+            readPosition(in, version);
+            skipPromotedIndex(in, version);
+        }
+
+        private static void skipPromotedIndex(DataInputPlus in, Version version) throws IOException
+        {
+            int size = version.storeRows() ? (int)in.readUnsignedVInt() : in.readInt();
             if (size <= 0)
                 return;
 
-            FileUtils.skipBytesFully(in, size);
+            in.skipBytesFully(size);
         }
 
         public int serializedSize(RowIndexEntry<IndexHelper.IndexInfo> rie)
         {
-            int size = TypeSizes.NATIVE.sizeof(rie.position) + TypeSizes.NATIVE.sizeof(rie.promotedSize(idxSerializer));
+            assert version.storeRows() : "We read old index files but we should never write them";
 
+            int indexedSize = 0;
             if (rie.isIndexed())
             {
                 List<IndexHelper.IndexInfo> index = rie.columnsIndex();
 
-                size += DeletionTime.serializer.serializedSize(rie.deletionTime(), TypeSizes.NATIVE);
-                size += TypeSizes.NATIVE.sizeof(index.size());
+                indexedSize += TypeSizes.sizeofUnsignedVInt(rie.headerLength());
+                indexedSize += DeletionTime.serializer.serializedSize(rie.deletionTime());
+                indexedSize += TypeSizes.sizeofUnsignedVInt(index.size());
 
                 for (IndexHelper.IndexInfo info : index)
-                    size += idxSerializer.serializedSize(info, TypeSizes.NATIVE);
+                    indexedSize += idxSerializer.serializedSize(info);
+
+                indexedSize += index.size() * TypeSizes.sizeof(0);
             }
 
-
-            return size;
+            return TypeSizes.sizeofUnsignedVInt(rie.position) + TypeSizes.sizeofUnsignedVInt(indexedSize) + indexedSize;
         }
     }
 
@@ -190,17 +279,21 @@
     private static class IndexedEntry extends RowIndexEntry<IndexHelper.IndexInfo>
     {
         private final DeletionTime deletionTime;
+
+        // The offset in the file when the index entry end
+        private final long headerLength;
         private final List<IndexHelper.IndexInfo> columnsIndex;
         private static final long BASE_SIZE =
-                ObjectSizes.measure(new IndexedEntry(0, DeletionTime.LIVE, Arrays.<IndexHelper.IndexInfo>asList(null, null)))
+                ObjectSizes.measure(new IndexedEntry(0, DeletionTime.LIVE, 0, Arrays.<IndexHelper.IndexInfo>asList(null, null)))
               + ObjectSizes.measure(new ArrayList<>(1));
 
-        private IndexedEntry(long position, DeletionTime deletionTime, List<IndexHelper.IndexInfo> columnsIndex)
+        private IndexedEntry(long position, DeletionTime deletionTime, long headerLength, List<IndexHelper.IndexInfo> columnsIndex)
         {
             super(position);
             assert deletionTime != null;
             assert columnsIndex != null && columnsIndex.size() > 1;
             this.deletionTime = deletionTime;
+            this.headerLength = headerLength;
             this.columnsIndex = columnsIndex;
         }
 
@@ -211,19 +304,27 @@
         }
 
         @Override
+        public long headerLength()
+        {
+            return headerLength;
+        }
+
+        @Override
         public List<IndexHelper.IndexInfo> columnsIndex()
         {
             return columnsIndex;
         }
 
         @Override
-        public int promotedSize(ISerializer<IndexHelper.IndexInfo> idxSerializer)
+        protected int promotedSize(IndexHelper.IndexInfo.Serializer idxSerializer)
         {
-            TypeSizes typeSizes = TypeSizes.NATIVE;
-            long size = DeletionTime.serializer.serializedSize(deletionTime, typeSizes);
-            size += typeSizes.sizeof(columnsIndex.size()); // number of entries
+            long size = TypeSizes.sizeofUnsignedVInt(headerLength)
+                      + DeletionTime.serializer.serializedSize(deletionTime)
+                      + TypeSizes.sizeofUnsignedVInt(columnsIndex.size()); // number of entries
             for (IndexHelper.IndexInfo info : columnsIndex)
-                size += idxSerializer.serializedSize(info, typeSizes);
+                size += idxSerializer.serializedSize(info);
+
+            size += columnsIndex.size() * TypeSizes.sizeof(0);
 
             return Ints.checkedCast(size);
         }

diff --git a/src/java/org/apache/cassandra/db/RowIteratorFactory.java b/src/java/org/apache/cassandra/db/RowIteratorFactory.java
deleted file mode 100644
index f4619f2..0000000
--- a/src/java/org/apache/cassandra/db/RowIteratorFactory.java
+++ /dev/null

@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.util.*;
-
-import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.db.columniterator.IColumnIteratorFactory;
-import org.apache.cassandra.db.columniterator.LazyColumnIterator;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.TombstoneOverwhelmingException;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.utils.CloseableIterator;
-import org.apache.cassandra.utils.MergeIterator;
-
-public class RowIteratorFactory
-{
-
-    private static final Comparator<OnDiskAtomIterator> COMPARE_BY_KEY = new Comparator<OnDiskAtomIterator>()
-    {
-        public int compare(OnDiskAtomIterator o1, OnDiskAtomIterator o2)
-        {
-            return DecoratedKey.comparator.compare(o1.getKey(), o2.getKey());
-        }
-    };
-
-
-    /**
-     * Get a row iterator over the provided memtables and sstables, between the provided keys
-     * and filtered by the queryfilter.
-     * @param memtables Memtables pending flush.
-     * @param sstables SStables to scan through.
-     * @param range The data range to fetch
-     * @param cfs
-     * @return A row iterator following all the given restrictions
-     */
-    public static CloseableIterator<Row> getIterator(final Iterable<Memtable> memtables,
-                                                     final Collection<SSTableReader> sstables,
-                                                     final DataRange range,
-                                                     final ColumnFamilyStore cfs,
-                                                     final long now)
-    {
-        // fetch data from current memtable, historical memtables, and SSTables in the correct order.
-        final List<CloseableIterator<OnDiskAtomIterator>> iterators = new ArrayList<>(Iterables.size(memtables) + sstables.size());
-
-        for (Memtable memtable : memtables)
-            iterators.add(new ConvertToColumnIterator(range, memtable.getEntryIterator(range.startKey(), range.stopKey())));
-
-        for (SSTableReader sstable : sstables)
-            iterators.add(sstable.getScanner(range));
-
-        // reduce rows from all sources into a single row
-        return MergeIterator.get(iterators, COMPARE_BY_KEY, new MergeIterator.Reducer<OnDiskAtomIterator, Row>()
-        {
-            private final int gcBefore = cfs.gcBefore(now);
-            private final List<OnDiskAtomIterator> colIters = new ArrayList<>();
-            private DecoratedKey key;
-            private ColumnFamily returnCF;
-
-            @Override
-            protected void onKeyChange()
-            {
-                this.returnCF = ArrayBackedSortedColumns.factory.create(cfs.metadata, range.columnFilter.isReversed());
-            }
-
-            public void reduce(OnDiskAtomIterator current)
-            {
-                this.colIters.add(current);
-                this.key = current.getKey();
-                this.returnCF.delete(current.getColumnFamily());
-            }
-
-            protected Row getReduced()
-            {
-                // First check if this row is in the rowCache. If it is and it covers our filter, we can skip the rest
-                ColumnFamily cached = cfs.getRawCachedRow(key);
-                IDiskAtomFilter filter = range.columnFilter(key.getKey());
-
-                if (cached == null || !cfs.isFilterFullyCoveredBy(filter, cached, now))
-                {
-                    // not cached: collate
-                    QueryFilter.collateOnDiskAtom(returnCF, colIters, filter, key, gcBefore, now);
-                }
-                else
-                {
-                    QueryFilter keyFilter = new QueryFilter(key, cfs.name, filter, now);
-                    returnCF = cfs.filterColumnFamily(cached, keyFilter);
-                }
-
-                Row rv = new Row(key, returnCF);
-                colIters.clear();
-                key = null;
-                return rv;
-            }
-        });
-    }
-
-    /**
-     * Get a ColumnIterator for a specific key in the memtable.
-     */
-    private static class ConvertToColumnIterator implements CloseableIterator<OnDiskAtomIterator>
-    {
-        private final DataRange range;
-        private final Iterator<Map.Entry<DecoratedKey, ColumnFamily>> iter;
-
-        public ConvertToColumnIterator(DataRange range, Iterator<Map.Entry<DecoratedKey, ColumnFamily>> iter)
-        {
-            this.range = range;
-            this.iter = iter;
-        }
-
-        public boolean hasNext()
-        {
-            return iter.hasNext();
-        }
-
-        /*
-         * Note that when doing get_paged_slice, we reset the start of the queryFilter after we've fetched the
-         * first row. This means that this iterator should not use in any way the filter to fetch a row before
-         * we call next(). Which prevents us for using guava AbstractIterator.
-         * This is obviously rather fragile and we should consider refactoring that code, but such refactor will go
-         * deep into the storage engine code so this will have to do until then.
-         */
-        public OnDiskAtomIterator next()
-        {
-            final Map.Entry<DecoratedKey, ColumnFamily> entry = iter.next();
-            return new LazyColumnIterator(entry.getKey(), new IColumnIteratorFactory()
-            {
-                public OnDiskAtomIterator create()
-                {
-                    return range.columnFilter(entry.getKey().getKey()).getColumnIterator(entry.getKey(), entry.getValue());
-                }
-            });
-        }
-
-        public void remove()
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        public void close()
-        {
-            // pass
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/RowPosition.java b/src/java/org/apache/cassandra/db/RowPosition.java
deleted file mode 100644
index 3fa0465..0000000
--- a/src/java/org/apache/cassandra/db/RowPosition.java
+++ /dev/null

@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public interface RowPosition extends RingPosition<RowPosition>
-{
-    public static enum Kind
-    {
-        // Only add new values to the end of the enum, the ordinal is used
-        // during serialization
-        ROW_KEY, MIN_BOUND, MAX_BOUND;
-
-        private static final Kind[] allKinds = Kind.values();
-
-        static Kind fromOrdinal(int ordinal)
-        {
-            return allKinds[ordinal];
-        }
-    }
-
-    public static final class ForKey
-    {
-        public static RowPosition get(ByteBuffer key, IPartitioner p)
-        {
-            return key == null || key.remaining() == 0 ? p.getMinimumToken().minKeyBound() : p.decorateKey(key);
-        }
-    }
-
-    public static final RowPositionSerializer serializer = new RowPositionSerializer();
-
-    public Kind kind();
-    public boolean isMinimum();
-
-    public static class RowPositionSerializer implements IPartitionerDependentSerializer<RowPosition>
-    {
-        /*
-         * We need to be able to serialize both Token.KeyBound and
-         * DecoratedKey. To make this compact, we first write a byte whose
-         * meaning is:
-         *   - 0: DecoratedKey
-         *   - 1: a 'minimum' Token.KeyBound
-         *   - 2: a 'maximum' Token.KeyBound
-         * In the case of the DecoratedKey, we then serialize the key (the
-         * token is recreated on the other side). In the other cases, we then
-         * serialize the token.
-         */
-        public void serialize(RowPosition pos, DataOutputPlus out, int version) throws IOException
-        {
-            Kind kind = pos.kind();
-            out.writeByte(kind.ordinal());
-            if (kind == Kind.ROW_KEY)
-                ByteBufferUtil.writeWithShortLength(((DecoratedKey)pos).getKey(), out);
-            else
-                Token.serializer.serialize(pos.getToken(), out, version);
-        }
-
-        public RowPosition deserialize(DataInput in, IPartitioner p, int version) throws IOException
-        {
-            Kind kind = Kind.fromOrdinal(in.readByte());
-            if (kind == Kind.ROW_KEY)
-            {
-                ByteBuffer k = ByteBufferUtil.readWithShortLength(in);
-                return StorageService.getPartitioner().decorateKey(k);
-            }
-            else
-            {
-                Token t = Token.serializer.deserialize(in, p, version);
-                return kind == Kind.MIN_BOUND ? t.minKeyBound() : t.maxKeyBound();
-            }
-        }
-
-        public long serializedSize(RowPosition pos, int version)
-        {
-            Kind kind = pos.kind();
-            int size = 1; // 1 byte for enum
-            if (kind == Kind.ROW_KEY)
-            {
-                int keySize = ((DecoratedKey)pos).getKey().remaining();
-                size += TypeSizes.NATIVE.sizeof((short) keySize) + keySize;
-            }
-            else
-            {
-                size += Token.serializer.serializedSize(pos.getToken(), version);
-            }
-            return size;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/RowUpdateBuilder.java b/src/java/org/apache/cassandra/db/RowUpdateBuilder.java
new file mode 100644
index 0000000..c4b4c75
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/RowUpdateBuilder.java

@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.utils.*;
+
+/**
+ * Convenience object to create single row updates.
+ *
+ * This is meant for system table update, when performance is not of the utmost importance.
+ */
+public class RowUpdateBuilder
+{
+    private final PartitionUpdate update;
+
+    private final long timestamp;
+    private final int ttl;
+    private final int localDeletionTime;
+
+    private final DeletionTime deletionTime;
+
+    private final Mutation mutation;
+
+    private Row.Builder regularBuilder;
+    private Row.Builder staticBuilder;
+
+    private boolean useRowMarker = true;
+
+    private RowUpdateBuilder(PartitionUpdate update, long timestamp, int ttl, int localDeletionTime, Mutation mutation)
+    {
+        this.update = update;
+
+        this.timestamp = timestamp;
+        this.ttl = ttl;
+        this.localDeletionTime = localDeletionTime;
+        this.deletionTime = new DeletionTime(timestamp, localDeletionTime);
+
+        // note that the created mutation may get further update later on, so we don't use the ctor that create a singletonMap
+        // underneath (this class if for convenience, not performance)
+        this.mutation = mutation == null ? new Mutation(update.metadata().ksName, update.partitionKey()).add(update) : mutation;
+    }
+
+    private RowUpdateBuilder(PartitionUpdate update, long timestamp, int ttl, Mutation mutation)
+    {
+        this(update, timestamp, ttl, FBUtilities.nowInSeconds(), mutation);
+    }
+
+    private void startRow(Clustering clustering)
+    {
+        assert staticBuilder == null : "Cannot update both static and non-static columns with the same RowUpdateBuilder object";
+        assert regularBuilder == null : "Cannot add the clustering twice to the same row";
+
+        regularBuilder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+        regularBuilder.newRow(clustering);
+
+        // If a CQL table, add the "row marker"
+        if (update.metadata().isCQLTable() && useRowMarker)
+            regularBuilder.addPrimaryKeyLivenessInfo(LivenessInfo.create(update.metadata(), timestamp, ttl, localDeletionTime));
+    }
+
+    private Row.Builder builder()
+    {
+        assert staticBuilder == null : "Cannot update both static and non-static columns with the same RowUpdateBuilder object";
+        if (regularBuilder == null)
+        {
+            // we don't force people to call clustering() if the table has no clustering, so call it ourselves
+            assert update.metadata().comparator.size() == 0 : "Missing call to clustering()";
+            startRow(Clustering.EMPTY);
+        }
+        return regularBuilder;
+    }
+
+    private Row.Builder staticBuilder()
+    {
+        assert regularBuilder == null : "Cannot update both static and non-static columns with the same RowUpdateBuilder object";
+        if (staticBuilder == null)
+        {
+            staticBuilder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+            staticBuilder.newRow(Clustering.STATIC_CLUSTERING);
+        }
+        return staticBuilder;
+    }
+
+    private Row.Builder builder(ColumnDefinition c)
+    {
+        return c.isStatic() ? staticBuilder() : builder();
+    }
+
+    public RowUpdateBuilder(CFMetaData metadata, long timestamp, Object partitionKey)
+    {
+        this(metadata, FBUtilities.nowInSeconds(), timestamp, partitionKey);
+    }
+
+    public RowUpdateBuilder(CFMetaData metadata, int localDeletionTime, long timestamp, Object partitionKey)
+    {
+        this(metadata, localDeletionTime, timestamp, metadata.params.defaultTimeToLive, partitionKey);
+    }
+
+    public RowUpdateBuilder(CFMetaData metadata, long timestamp, int ttl, Object partitionKey)
+    {
+        this(metadata, FBUtilities.nowInSeconds(), timestamp, ttl, partitionKey);
+    }
+
+    public RowUpdateBuilder(CFMetaData metadata, int localDeletionTime, long timestamp, int ttl, Object partitionKey)
+    {
+        this(new PartitionUpdate(metadata, makeKey(metadata, partitionKey), metadata.partitionColumns(), 1), timestamp, ttl, localDeletionTime, null);
+    }
+
+    public RowUpdateBuilder(CFMetaData metadata, long timestamp, Mutation mutation)
+    {
+        this(metadata, timestamp, LivenessInfo.NO_TTL, mutation);
+    }
+
+    public RowUpdateBuilder(CFMetaData metadata, long timestamp, int ttl, Mutation mutation)
+    {
+        this(getOrAdd(metadata, mutation), timestamp, ttl, mutation);
+    }
+
+    public RowUpdateBuilder(PartitionUpdate update, long timestamp, int ttl)
+    {
+        this(update, timestamp, ttl, null);
+    }
+
+    // This must be called before any addition or deletion if used.
+    public RowUpdateBuilder noRowMarker()
+    {
+        this.useRowMarker = false;
+        return this;
+    }
+
+    public RowUpdateBuilder clustering(Object... clusteringValues)
+    {
+        assert clusteringValues.length == update.metadata().comparator.size()
+             : "Invalid clustering values length. Expected: " + update.metadata().comparator.size() + " got: " + clusteringValues.length;
+
+        startRow(clusteringValues.length == 0 ? Clustering.EMPTY : update.metadata().comparator.make(clusteringValues));
+        return this;
+    }
+
+    public Mutation build()
+    {
+        Row.Builder builder = regularBuilder == null ? staticBuilder : regularBuilder;
+        if (builder != null)
+            update.add(builder.build());
+        return mutation;
+    }
+
+    public PartitionUpdate buildUpdate()
+    {
+        build();
+        return update;
+    }
+
+    private static void deleteRow(PartitionUpdate update, long timestamp, int localDeletionTime, Object... clusteringValues)
+    {
+        assert clusteringValues.length == update.metadata().comparator.size() || (clusteringValues.length == 0 && !update.columns().statics.isEmpty());
+
+        boolean isStatic = clusteringValues.length != update.metadata().comparator.size();
+        Row.Builder builder = BTreeRow.sortedBuilder();
+
+        if (isStatic)
+            builder.newRow(Clustering.STATIC_CLUSTERING);
+        else
+            builder.newRow(clusteringValues.length == 0 ? Clustering.EMPTY : update.metadata().comparator.make(clusteringValues));
+        builder.addRowDeletion(Row.Deletion.regular(new DeletionTime(timestamp, localDeletionTime)));
+
+        update.add(builder.build());
+    }
+
+    public static Mutation deleteRow(CFMetaData metadata, long timestamp, Mutation mutation, Object... clusteringValues)
+    {
+        deleteRow(getOrAdd(metadata, mutation), timestamp, FBUtilities.nowInSeconds(), clusteringValues);
+        return mutation;
+    }
+
+    public static Mutation deleteRow(CFMetaData metadata, long timestamp, Object key, Object... clusteringValues)
+    {
+        return deleteRowAt(metadata, timestamp, FBUtilities.nowInSeconds(), key, clusteringValues);
+    }
+
+    public static Mutation deleteRowAt(CFMetaData metadata, long timestamp, int localDeletionTime, Object key, Object... clusteringValues)
+    {
+        PartitionUpdate update = new PartitionUpdate(metadata, makeKey(metadata, key), metadata.partitionColumns(), 0);
+        deleteRow(update, timestamp, localDeletionTime, clusteringValues);
+        // note that the created mutation may get further update later on, so we don't use the ctor that create a singletonMap
+        // underneath (this class if for convenience, not performance)
+        return new Mutation(update.metadata().ksName, update.partitionKey()).add(update);
+    }
+
+    private static DecoratedKey makeKey(CFMetaData metadata, Object... partitionKey)
+    {
+        if (partitionKey.length == 1 && partitionKey[0] instanceof DecoratedKey)
+            return (DecoratedKey)partitionKey[0];
+
+        ByteBuffer key = CFMetaData.serializePartitionKey(metadata.getKeyValidatorAsClusteringComparator().make(partitionKey));
+        return metadata.decorateKey(key);
+    }
+
+    private static PartitionUpdate getOrAdd(CFMetaData metadata, Mutation mutation)
+    {
+        PartitionUpdate upd = mutation.get(metadata);
+        if (upd == null)
+        {
+            upd = new PartitionUpdate(metadata, mutation.key(), metadata.partitionColumns(), 1);
+            mutation.add(upd);
+        }
+        return upd;
+    }
+
+    public RowUpdateBuilder resetCollection(String columnName)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c != null : "Cannot find column " + columnName;
+        assert c.isStatic() || update.metadata().comparator.size() == 0 || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type.isCollection() && c.type.isMultiCell();
+        builder(c).addComplexDeletion(c, new DeletionTime(timestamp - 1, localDeletionTime));
+        return this;
+    }
+
+    public RowUpdateBuilder addRangeTombstone(RangeTombstone rt)
+    {
+        update.add(rt);
+        return this;
+    }
+
+    public RowUpdateBuilder addRangeTombstone(Slice slice)
+    {
+        return addRangeTombstone(new RangeTombstone(slice, deletionTime));
+    }
+
+    public RowUpdateBuilder addRangeTombstone(Object start, Object end)
+    {
+        ClusteringComparator cmp = update.metadata().comparator;
+        Slice slice = Slice.make(cmp.make(start), cmp.make(end));
+        return addRangeTombstone(slice);
+    }
+
+    public RowUpdateBuilder add(String columnName, Object value)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c != null : "Cannot find column " + columnName;
+        return add(c, value);
+    }
+
+    private Cell makeCell(ColumnDefinition c, ByteBuffer value, CellPath path)
+    {
+        return value == null
+             ? BufferCell.tombstone(c, timestamp, localDeletionTime)
+             : (ttl == LivenessInfo.NO_TTL ? BufferCell.live(update.metadata(), c, timestamp, value, path) : BufferCell.expiring(c, timestamp, ttl, localDeletionTime, value, path));
+    }
+
+    public RowUpdateBuilder add(ColumnDefinition columnDefinition, Object value)
+    {
+        assert columnDefinition.isStatic() || update.metadata().comparator.size() == 0 || regularBuilder != null : "Cannot set non static column " + columnDefinition + " since no clustering hasn't been provided";
+        builder(columnDefinition).addCell(makeCell(columnDefinition, bb(value, columnDefinition.type), null));
+        return this;
+    }
+
+    public RowUpdateBuilder delete(String columnName)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c != null : "Cannot find column " + columnName;
+        return delete(c);
+    }
+
+    public RowUpdateBuilder delete(ColumnDefinition columnDefinition)
+    {
+        return add(columnDefinition, null);
+    }
+
+    private static ByteBuffer bb(Object value, AbstractType<?> type)
+    {
+        if (value == null)
+            return null;
+
+        if (value instanceof ByteBuffer)
+            return (ByteBuffer)value;
+
+        if (type.isCounter())
+        {
+            // See UpdateParameters.addCounter()
+            assert value instanceof Long : "Attempted to adjust Counter cell with non-long value.";
+            return CounterContext.instance().createGlobal(CounterId.getLocalId(), 1, (Long)value);
+        }
+        return ((AbstractType)type).decompose(value);
+    }
+
+    public RowUpdateBuilder map(String columnName, Map<?, ?> map)
+    {
+        resetCollection(columnName);
+        for (Map.Entry<?, ?> entry : map.entrySet())
+            addMapEntry(columnName, entry.getKey(), entry.getValue());
+        return this;
+    }
+
+    public RowUpdateBuilder set(String columnName, Set<?> set)
+    {
+        resetCollection(columnName);
+        for (Object element : set)
+            addSetEntry(columnName, element);
+        return this;
+    }
+
+    public RowUpdateBuilder frozenList(String columnName, List<?> list)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c.isStatic() || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type instanceof ListType && !c.type.isMultiCell() : "Column " + c + " is not a frozen list";
+        builder(c).addCell(makeCell(c, bb(((AbstractType)c.type).decompose(list), c.type), null));
+        return this;
+    }
+
+    public RowUpdateBuilder frozenSet(String columnName, Set<?> set)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c.isStatic() || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type instanceof SetType && !c.type.isMultiCell() : "Column " + c + " is not a frozen set";
+        builder(c).addCell(makeCell(c, bb(((AbstractType)c.type).decompose(set), c.type), null));
+        return this;
+    }
+
+    public RowUpdateBuilder frozenMap(String columnName, Map<?, ?> map)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c.isStatic() || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type instanceof MapType && !c.type.isMultiCell() : "Column " + c + " is not a frozen map";
+        builder(c).addCell(makeCell(c, bb(((AbstractType)c.type).decompose(map), c.type), null));
+        return this;
+    }
+
+    public RowUpdateBuilder addMapEntry(String columnName, Object key, Object value)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c.isStatic() || update.metadata().comparator.size() == 0 || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type instanceof MapType && c.type.isMultiCell() : "Column " + c + " is not a non-frozen map";
+        MapType mt = (MapType)c.type;
+        builder(c).addCell(makeCell(c, bb(value, mt.getValuesType()), CellPath.create(bb(key, mt.getKeysType()))));
+        return this;
+    }
+
+    public RowUpdateBuilder addListEntry(String columnName, Object value)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c.isStatic() || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type instanceof ListType && c.type.isMultiCell() : "Column " + c + " is not a non-frozen list";
+        ListType lt = (ListType)c.type;
+        builder(c).addCell(makeCell(c, bb(value, lt.getElementsType()), CellPath.create(ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes()))));
+        return this;
+    }
+
+    public RowUpdateBuilder addSetEntry(String columnName, Object value)
+    {
+        ColumnDefinition c = getDefinition(columnName);
+        assert c.isStatic() || regularBuilder != null : "Cannot set non static column " + c + " since no clustering has been provided";
+        assert c.type instanceof SetType && c.type.isMultiCell() : "Column " + c + " is not a non-frozen set";
+        SetType st = (SetType)c.type;
+        builder(c).addCell(makeCell(c, ByteBufferUtil.EMPTY_BYTE_BUFFER, CellPath.create(bb(value, st.getElementsType()))));
+        return this;
+    }
+
+    private ColumnDefinition getDefinition(String name)
+    {
+        return update.metadata().getColumnDefinitionForCQL(new ColumnIdentifier(name, true));
+    }
+
+    public UnfilteredRowIterator unfilteredIterator()
+    {
+        return update.unfilteredIterator();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/SerializationHeader.java b/src/java/org/apache/cassandra/db/SerializationHeader.java
new file mode 100644
index 0000000..5c4f518
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/SerializationHeader.java

@@ -0,0 +1,573 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.IMetadataComponentSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class SerializationHeader
+{
+    public static final Serializer serializer = new Serializer();
+
+    private final boolean isForSSTable;
+
+    private final AbstractType<?> keyType;
+    private final List<AbstractType<?>> clusteringTypes;
+
+    private final PartitionColumns columns;
+    private final EncodingStats stats;
+
+    private final Map<ByteBuffer, AbstractType<?>> typeMap;
+
+    private SerializationHeader(boolean isForSSTable,
+                                AbstractType<?> keyType,
+                                List<AbstractType<?>> clusteringTypes,
+                                PartitionColumns columns,
+                                EncodingStats stats,
+                                Map<ByteBuffer, AbstractType<?>> typeMap)
+    {
+        this.isForSSTable = isForSSTable;
+        this.keyType = keyType;
+        this.clusteringTypes = clusteringTypes;
+        this.columns = columns;
+        this.stats = stats;
+        this.typeMap = typeMap;
+    }
+
+    public static SerializationHeader makeWithoutStats(CFMetaData metadata)
+    {
+        return new SerializationHeader(true, metadata, metadata.partitionColumns(), EncodingStats.NO_STATS);
+    }
+
+    public static SerializationHeader forKeyCache(CFMetaData metadata)
+    {
+        // We don't save type information in the key cache (we could change
+        // that but it's easier right now), so instead we simply use BytesType
+        // for both serialization and deserialization. Note that we also only
+        // serializer clustering prefixes in the key cache, so only the clusteringTypes
+        // really matter.
+        int size = metadata.clusteringColumns().size();
+        List<AbstractType<?>> clusteringTypes = new ArrayList<>(size);
+        for (int i = 0; i < size; i++)
+            clusteringTypes.add(BytesType.instance);
+        return new SerializationHeader(false,
+                                       BytesType.instance,
+                                       clusteringTypes,
+                                       PartitionColumns.NONE,
+                                       EncodingStats.NO_STATS,
+                                       Collections.<ByteBuffer, AbstractType<?>>emptyMap());
+    }
+
+    public static SerializationHeader make(CFMetaData metadata, Collection<SSTableReader> sstables)
+    {
+        // The serialization header has to be computed before the start of compaction (since it's used to write)
+        // the result. This means that when compacting multiple sources, we won't have perfectly accurate stats
+        // (for EncodingStats) since compaction may delete, purge and generally merge rows in unknown ways. This is
+        // kind of ok because those stats are only used for optimizing the underlying storage format and so we
+        // just have to strive for as good as possible. Currently, we stick to a relatively naive merge of existing
+        // global stats because it's simple and probably good enough in most situation but we could probably
+        // improve our marging of inaccuracy through the use of more fine-grained stats in the future.
+        // Note however that to avoid seeing our accuracy degrade through successive compactions, we don't base
+        // our stats merging on the compacted files headers, which as we just said can be somewhat inaccurate,
+        // but rather on their stats stored in StatsMetadata that are fully accurate.
+        EncodingStats.Collector stats = new EncodingStats.Collector();
+        PartitionColumns.Builder columns = PartitionColumns.builder();
+        // We need to order the SSTables by descending generation to be sure that we use latest column definitions.
+        for (SSTableReader sstable : orderByDescendingGeneration(sstables))
+        {
+            stats.updateTimestamp(sstable.getMinTimestamp());
+            stats.updateLocalDeletionTime(sstable.getMinLocalDeletionTime());
+            stats.updateTTL(sstable.getMinTTL());
+            if (sstable.header == null)
+                columns.addAll(metadata.partitionColumns());
+            else
+                columns.addAll(sstable.header.columns());
+        }
+        return new SerializationHeader(true, metadata, columns.build(), stats.get());
+    }
+
+    private static Collection<SSTableReader> orderByDescendingGeneration(Collection<SSTableReader> sstables)
+    {
+        if (sstables.size() < 2)
+            return sstables;
+
+        List<SSTableReader> readers = new ArrayList<>(sstables);
+        readers.sort(SSTableReader.generationReverseComparator);
+        return readers;
+    }
+
+    public SerializationHeader(boolean isForSSTable,
+                               CFMetaData metadata,
+                               PartitionColumns columns,
+                               EncodingStats stats)
+    {
+        this(isForSSTable,
+             metadata.getKeyValidator(),
+             typesOf(metadata.clusteringColumns()),
+             columns,
+             stats,
+             null);
+    }
+
+    private static List<AbstractType<?>> typesOf(List<ColumnDefinition> columns)
+    {
+        return ImmutableList.copyOf(Lists.transform(columns, column -> column.type));
+    }
+
+    public PartitionColumns columns()
+    {
+        return columns;
+    }
+
+    public boolean hasStatic()
+    {
+        return !columns.statics.isEmpty();
+    }
+
+    public boolean isForSSTable()
+    {
+        return isForSSTable;
+    }
+
+    public EncodingStats stats()
+    {
+        return stats;
+    }
+
+    public AbstractType<?> keyType()
+    {
+        return keyType;
+    }
+
+    public List<AbstractType<?>> clusteringTypes()
+    {
+        return clusteringTypes;
+    }
+
+    public Columns columns(boolean isStatic)
+    {
+        return isStatic ? columns.statics : columns.regulars;
+    }
+
+    public AbstractType<?> getType(ColumnDefinition column)
+    {
+        return typeMap == null ? column.type : typeMap.get(column.name.bytes);
+    }
+
+    public void writeTimestamp(long timestamp, DataOutputPlus out) throws IOException
+    {
+        out.writeUnsignedVInt(timestamp - stats.minTimestamp);
+    }
+
+    public void writeLocalDeletionTime(int localDeletionTime, DataOutputPlus out) throws IOException
+    {
+        out.writeUnsignedVInt(localDeletionTime - stats.minLocalDeletionTime);
+    }
+
+    public void writeTTL(int ttl, DataOutputPlus out) throws IOException
+    {
+        out.writeUnsignedVInt(ttl - stats.minTTL);
+    }
+
+    public void writeDeletionTime(DeletionTime dt, DataOutputPlus out) throws IOException
+    {
+        writeTimestamp(dt.markedForDeleteAt(), out);
+        writeLocalDeletionTime(dt.localDeletionTime(), out);
+    }
+
+    public long readTimestamp(DataInputPlus in) throws IOException
+    {
+        return in.readUnsignedVInt() + stats.minTimestamp;
+    }
+
+    public int readLocalDeletionTime(DataInputPlus in) throws IOException
+    {
+        return (int)in.readUnsignedVInt() + stats.minLocalDeletionTime;
+    }
+
+    public int readTTL(DataInputPlus in) throws IOException
+    {
+        return (int)in.readUnsignedVInt() + stats.minTTL;
+    }
+
+    public DeletionTime readDeletionTime(DataInputPlus in) throws IOException
+    {
+        long markedAt = readTimestamp(in);
+        int localDeletionTime = readLocalDeletionTime(in);
+        return new DeletionTime(markedAt, localDeletionTime);
+    }
+
+    public long timestampSerializedSize(long timestamp)
+    {
+        return TypeSizes.sizeofUnsignedVInt(timestamp - stats.minTimestamp);
+    }
+
+    public long localDeletionTimeSerializedSize(int localDeletionTime)
+    {
+        return TypeSizes.sizeofUnsignedVInt(localDeletionTime - stats.minLocalDeletionTime);
+    }
+
+    public long ttlSerializedSize(int ttl)
+    {
+        return TypeSizes.sizeofUnsignedVInt(ttl - stats.minTTL);
+    }
+
+    public long deletionTimeSerializedSize(DeletionTime dt)
+    {
+        return timestampSerializedSize(dt.markedForDeleteAt())
+             + localDeletionTimeSerializedSize(dt.localDeletionTime());
+    }
+
+    public void skipTimestamp(DataInputPlus in) throws IOException
+    {
+        in.readUnsignedVInt();
+    }
+
+    public void skipLocalDeletionTime(DataInputPlus in) throws IOException
+    {
+        in.readUnsignedVInt();
+    }
+
+    public void skipTTL(DataInputPlus in) throws IOException
+    {
+        in.readUnsignedVInt();
+    }
+
+    public void skipDeletionTime(DataInputPlus in) throws IOException
+    {
+        skipTimestamp(in);
+        skipLocalDeletionTime(in);
+    }
+
+    public Component toComponent()
+    {
+        Map<ByteBuffer, AbstractType<?>> staticColumns = new LinkedHashMap<>();
+        Map<ByteBuffer, AbstractType<?>> regularColumns = new LinkedHashMap<>();
+        for (ColumnDefinition column : columns.statics)
+            staticColumns.put(column.name.bytes, column.type);
+        for (ColumnDefinition column : columns.regulars)
+            regularColumns.put(column.name.bytes, column.type);
+        return new Component(keyType, clusteringTypes, staticColumns, regularColumns, stats);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("SerializationHeader[key=%s, cks=%s, columns=%s, stats=%s, typeMap=%s]", keyType, clusteringTypes, columns, stats, typeMap);
+    }
+
+    /**
+     * We need the CFMetadata to properly deserialize a SerializationHeader but it's clunky to pass that to
+     * a SSTable component, so we use this temporary object to delay the actual need for the metadata.
+     */
+    public static class Component extends MetadataComponent
+    {
+        private final AbstractType<?> keyType;
+        private final List<AbstractType<?>> clusteringTypes;
+        private final Map<ByteBuffer, AbstractType<?>> staticColumns;
+        private final Map<ByteBuffer, AbstractType<?>> regularColumns;
+        private final EncodingStats stats;
+
+        private Component(AbstractType<?> keyType,
+                          List<AbstractType<?>> clusteringTypes,
+                          Map<ByteBuffer, AbstractType<?>> staticColumns,
+                          Map<ByteBuffer, AbstractType<?>> regularColumns,
+                          EncodingStats stats)
+        {
+            this.keyType = keyType;
+            this.clusteringTypes = clusteringTypes;
+            this.staticColumns = staticColumns;
+            this.regularColumns = regularColumns;
+            this.stats = stats;
+        }
+
+        public MetadataType getType()
+        {
+            return MetadataType.HEADER;
+        }
+
+        public SerializationHeader toHeader(CFMetaData metadata)
+        {
+            Map<ByteBuffer, AbstractType<?>> typeMap = new HashMap<>(staticColumns.size() + regularColumns.size());
+
+            PartitionColumns.Builder builder = PartitionColumns.builder();
+            for (Map<ByteBuffer, AbstractType<?>> map : ImmutableList.of(staticColumns, regularColumns))
+            {
+                boolean isStatic = map == staticColumns;
+                for (Map.Entry<ByteBuffer, AbstractType<?>> e : map.entrySet())
+                {
+                    ByteBuffer name = e.getKey();
+                    AbstractType<?> other = typeMap.put(name, e.getValue());
+                    if (other != null && !other.equals(e.getValue()))
+                        throw new IllegalStateException("Column " + name + " occurs as both regular and static with types " + other + "and " + e.getValue());
+
+                    ColumnDefinition column = metadata.getColumnDefinition(name);
+                    if (column == null || column.isStatic() != isStatic)
+                    {
+                        // TODO: this imply we don't read data for a column we don't yet know about, which imply this is theoretically
+                        // racy with column addition. Currently, it is up to the user to not write data before the schema has propagated
+                        // and this is far from being the only place that has such problem in practice. This doesn't mean we shouldn't
+                        // improve this.
+
+                        // If we don't find the definition, it could be we have data for a dropped column, and we shouldn't
+                        // fail deserialization because of that. So we grab a "fake" ColumnDefinition that ensure proper
+                        // deserialization. The column will be ignore later on anyway.
+                        column = metadata.getDroppedColumnDefinition(name, isStatic);
+                        if (column == null)
+                            throw new RuntimeException("Unknown column " + UTF8Type.instance.getString(name) + " during deserialization");
+                    }
+                    builder.add(column);
+                }
+            }
+
+            return new SerializationHeader(true, keyType, clusteringTypes, builder.build(), stats, typeMap);
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if(!(o instanceof Component))
+                return false;
+
+            Component that = (Component)o;
+            return Objects.equals(this.keyType, that.keyType)
+                && Objects.equals(this.clusteringTypes, that.clusteringTypes)
+                && Objects.equals(this.staticColumns, that.staticColumns)
+                && Objects.equals(this.regularColumns, that.regularColumns)
+                && Objects.equals(this.stats, that.stats);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hash(keyType, clusteringTypes, staticColumns, regularColumns, stats);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("SerializationHeader.Component[key=%s, cks=%s, statics=%s, regulars=%s, stats=%s]",
+                                 keyType, clusteringTypes, staticColumns, regularColumns, stats);
+        }
+
+        public AbstractType<?> getKeyType()
+        {
+            return keyType;
+        }
+
+        public List<AbstractType<?>> getClusteringTypes()
+        {
+            return clusteringTypes;
+        }
+
+        public Map<ByteBuffer, AbstractType<?>> getStaticColumns()
+        {
+            return staticColumns;
+        }
+
+        public Map<ByteBuffer, AbstractType<?>> getRegularColumns()
+        {
+            return regularColumns;
+        }
+
+        public EncodingStats getEncodingStats()
+        {
+            return stats;
+        }
+    }
+
+    public static class Serializer implements IMetadataComponentSerializer<Component>
+    {
+        public void serializeForMessaging(SerializationHeader header, ColumnFilter selection, DataOutputPlus out, boolean hasStatic) throws IOException
+        {
+            EncodingStats.serializer.serialize(header.stats, out);
+
+            if (selection == null)
+            {
+                if (hasStatic)
+                    Columns.serializer.serialize(header.columns.statics, out);
+                Columns.serializer.serialize(header.columns.regulars, out);
+            }
+            else
+            {
+                if (hasStatic)
+                    Columns.serializer.serializeSubset(header.columns.statics, selection.fetchedColumns().statics, out);
+                Columns.serializer.serializeSubset(header.columns.regulars, selection.fetchedColumns().regulars, out);
+            }
+        }
+
+        public SerializationHeader deserializeForMessaging(DataInputPlus in, CFMetaData metadata, ColumnFilter selection, boolean hasStatic) throws IOException
+        {
+            EncodingStats stats = EncodingStats.serializer.deserialize(in);
+
+            AbstractType<?> keyType = metadata.getKeyValidator();
+            List<AbstractType<?>> clusteringTypes = typesOf(metadata.clusteringColumns());
+
+            Columns statics, regulars;
+            if (selection == null)
+            {
+                statics = hasStatic ? Columns.serializer.deserialize(in, metadata) : Columns.NONE;
+                regulars = Columns.serializer.deserialize(in, metadata);
+            }
+            else
+            {
+                statics = hasStatic ? Columns.serializer.deserializeSubset(selection.fetchedColumns().statics, in) : Columns.NONE;
+                regulars = Columns.serializer.deserializeSubset(selection.fetchedColumns().regulars, in);
+            }
+
+            return new SerializationHeader(false, keyType, clusteringTypes, new PartitionColumns(statics, regulars), stats, null);
+        }
+
+        public long serializedSizeForMessaging(SerializationHeader header, ColumnFilter selection, boolean hasStatic)
+        {
+            long size = EncodingStats.serializer.serializedSize(header.stats);
+
+            if (selection == null)
+            {
+                if (hasStatic)
+                    size += Columns.serializer.serializedSize(header.columns.statics);
+                size += Columns.serializer.serializedSize(header.columns.regulars);
+            }
+            else
+            {
+                if (hasStatic)
+                    size += Columns.serializer.serializedSubsetSize(header.columns.statics, selection.fetchedColumns().statics);
+                size += Columns.serializer.serializedSubsetSize(header.columns.regulars, selection.fetchedColumns().regulars);
+            }
+            return size;
+        }
+
+        // For SSTables
+        public void serialize(Version version, Component header, DataOutputPlus out) throws IOException
+        {
+            EncodingStats.serializer.serialize(header.stats, out);
+
+            writeType(header.keyType, out);
+            out.writeUnsignedVInt(header.clusteringTypes.size());
+            for (AbstractType<?> type : header.clusteringTypes)
+                writeType(type, out);
+
+            writeColumnsWithTypes(header.staticColumns, out);
+            writeColumnsWithTypes(header.regularColumns, out);
+        }
+
+        // For SSTables
+        public Component deserialize(Version version, DataInputPlus in) throws IOException
+        {
+            EncodingStats stats = EncodingStats.serializer.deserialize(in);
+
+            AbstractType<?> keyType = readType(in);
+            int size = (int)in.readUnsignedVInt();
+            List<AbstractType<?>> clusteringTypes = new ArrayList<>(size);
+            for (int i = 0; i < size; i++)
+                clusteringTypes.add(readType(in));
+
+            Map<ByteBuffer, AbstractType<?>> staticColumns = new LinkedHashMap<>();
+            Map<ByteBuffer, AbstractType<?>> regularColumns = new LinkedHashMap<>();
+
+            readColumnsWithType(in, staticColumns);
+            readColumnsWithType(in, regularColumns);
+
+            return new Component(keyType, clusteringTypes, staticColumns, regularColumns, stats);
+        }
+
+        // For SSTables
+        public int serializedSize(Version version, Component header)
+        {
+            int size = EncodingStats.serializer.serializedSize(header.stats);
+
+            size += sizeofType(header.keyType);
+            size += TypeSizes.sizeofUnsignedVInt(header.clusteringTypes.size());
+            for (AbstractType<?> type : header.clusteringTypes)
+                size += sizeofType(type);
+
+            size += sizeofColumnsWithTypes(header.staticColumns);
+            size += sizeofColumnsWithTypes(header.regularColumns);
+            return size;
+        }
+
+        private void writeColumnsWithTypes(Map<ByteBuffer, AbstractType<?>> columns, DataOutputPlus out) throws IOException
+        {
+            out.writeUnsignedVInt(columns.size());
+            for (Map.Entry<ByteBuffer, AbstractType<?>> entry : columns.entrySet())
+            {
+                ByteBufferUtil.writeWithVIntLength(entry.getKey(), out);
+                writeType(entry.getValue(), out);
+            }
+        }
+
+        private long sizeofColumnsWithTypes(Map<ByteBuffer, AbstractType<?>> columns)
+        {
+            long size = TypeSizes.sizeofUnsignedVInt(columns.size());
+            for (Map.Entry<ByteBuffer, AbstractType<?>> entry : columns.entrySet())
+            {
+                size += ByteBufferUtil.serializedSizeWithVIntLength(entry.getKey());
+                size += sizeofType(entry.getValue());
+            }
+            return size;
+        }
+
+        private void readColumnsWithType(DataInputPlus in, Map<ByteBuffer, AbstractType<?>> typeMap) throws IOException
+        {
+            int length = (int)in.readUnsignedVInt();
+            for (int i = 0; i < length; i++)
+            {
+                ByteBuffer name = ByteBufferUtil.readWithVIntLength(in);
+                typeMap.put(name, readType(in));
+            }
+        }
+
+        private void writeType(AbstractType<?> type, DataOutputPlus out) throws IOException
+        {
+            // TODO: we should have a terser serializaion format. Not a big deal though
+            ByteBufferUtil.writeWithVIntLength(UTF8Type.instance.decompose(type.toString()), out);
+        }
+
+        private AbstractType<?> readType(DataInputPlus in) throws IOException
+        {
+            ByteBuffer raw = ByteBufferUtil.readWithVIntLength(in);
+            return TypeParser.parse(UTF8Type.instance.compose(raw));
+        }
+
+        private int sizeofType(AbstractType<?> type)
+        {
+            return ByteBufferUtil.serializedSizeWithVIntLength(UTF8Type.instance.decompose(type.toString()));
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/Serializers.java b/src/java/org/apache/cassandra/db/Serializers.java
new file mode 100644
index 0000000..bf340e7
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Serializers.java

@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Holds references on serializers that depend on the table definition.
+ */
+public class Serializers
+{
+    private final CFMetaData metadata;
+
+    public Serializers(CFMetaData metadata)
+    {
+        this.metadata = metadata;
+    }
+
+    // TODO: Once we drop support for old (pre-3.0) sstables, we can drop this method and inline the calls to
+    // ClusteringPrefix.serializer in IndexHelper directly. At which point this whole class probably becomes
+    // unecessary (since IndexInfo.Serializer won't depend on the metadata either).
+    public ISerializer<ClusteringPrefix> indexEntryClusteringPrefixSerializer(final Version version, final SerializationHeader header)
+    {
+        if (!version.storeRows() || header ==  null) //null header indicates streaming from pre-3.0 sstables
+        {
+            return oldFormatSerializer(version);
+        }
+
+        return newFormatSerializer(version, header);
+    }
+
+    private ISerializer<ClusteringPrefix> oldFormatSerializer(final Version version)
+    {
+        return new ISerializer<ClusteringPrefix>()
+        {
+            SerializationHeader newHeader = SerializationHeader.makeWithoutStats(metadata);
+
+            public void serialize(ClusteringPrefix clustering, DataOutputPlus out) throws IOException
+            {
+                //we deserialize in the old format and serialize in the new format
+                ClusteringPrefix.serializer.serialize(clustering, out,
+                                                      version.correspondingMessagingVersion(),
+                                                      newHeader.clusteringTypes());
+            }
+
+            public ClusteringPrefix deserialize(DataInputPlus in) throws IOException
+            {
+                // We're reading the old cellname/composite
+                ByteBuffer bb = ByteBufferUtil.readWithShortLength(in);
+                assert bb.hasRemaining(); // empty cellnames were invalid
+
+                int clusteringSize = metadata.clusteringColumns().size();
+                // If the table has no clustering column, then the cellname will just be the "column" name, which we ignore here.
+                if (clusteringSize == 0)
+                    return Clustering.EMPTY;
+
+                if (metadata.isCompound() && CompositeType.isStaticName(bb))
+                    return Clustering.STATIC_CLUSTERING;
+
+                if (!metadata.isCompound())
+                    return new Clustering(bb);
+
+                List<ByteBuffer> components = CompositeType.splitName(bb);
+                byte eoc = CompositeType.lastEOC(bb);
+
+                if (eoc == 0 || components.size() >= clusteringSize)
+                {
+                    // That's a clustering.
+                    if (components.size() > clusteringSize)
+                        components = components.subList(0, clusteringSize);
+
+                    return new Clustering(components.toArray(new ByteBuffer[clusteringSize]));
+                }
+                else
+                {
+                    // It's a range tombstone bound. It is a start since that's the only part we've ever included
+                    // in the index entries.
+                    Slice.Bound.Kind boundKind = eoc > 0
+                                                 ? Slice.Bound.Kind.EXCL_START_BOUND
+                                                 : Slice.Bound.Kind.INCL_START_BOUND;
+
+                    return Slice.Bound.create(boundKind, components.toArray(new ByteBuffer[components.size()]));
+                }
+            }
+
+            public long serializedSize(ClusteringPrefix clustering)
+            {
+                return ClusteringPrefix.serializer.serializedSize(clustering, version.correspondingMessagingVersion(),
+                                                                  newHeader.clusteringTypes());
+            }
+        };
+    }
+
+
+    private ISerializer<ClusteringPrefix> newFormatSerializer(final Version version, final SerializationHeader header)
+    {
+        return new ISerializer<ClusteringPrefix>() //Reading and writing from/to the new sstable format
+        {
+            public void serialize(ClusteringPrefix clustering, DataOutputPlus out) throws IOException
+            {
+                ClusteringPrefix.serializer.serialize(clustering, out, version.correspondingMessagingVersion(), header.clusteringTypes());
+            }
+
+            public ClusteringPrefix deserialize(DataInputPlus in) throws IOException
+            {
+                return ClusteringPrefix.serializer.deserialize(in, version.correspondingMessagingVersion(), header.clusteringTypes());
+            }
+
+            public long serializedSize(ClusteringPrefix clustering)
+            {
+                return ClusteringPrefix.serializer.serializedSize(clustering, version.correspondingMessagingVersion(), header.clusteringTypes());
+            }
+        };
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
new file mode 100644
index 0000000..841c3b9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java

@@ -0,0 +1,1293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+
+import org.apache.cassandra.cache.IRowCacheEntry;
+import org.apache.cassandra.cache.RowCacheKey;
+import org.apache.cassandra.cache.RowCacheSentinel;
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.lifecycle.*;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.RTBoundValidator;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.pager.*;
+import org.apache.cassandra.thrift.ThriftResultsMerger;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.SearchIterator;
+import org.apache.cassandra.utils.btree.BTreeSet;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.HeapAllocator;
+
+
+/**
+ * A read command that selects a (part of a) single partition.
+ */
+public class SinglePartitionReadCommand extends ReadCommand
+{
+    protected static final SelectionDeserializer selectionDeserializer = new Deserializer();
+
+    private final DecoratedKey partitionKey;
+    private final ClusteringIndexFilter clusteringIndexFilter;
+
+    private int oldestUnrepairedTombstone = Integer.MAX_VALUE;
+
+    private SinglePartitionReadCommand(boolean isDigest,
+                                       int digestVersion,
+                                       boolean isForThrift,
+                                       CFMetaData metadata,
+                                       int nowInSec,
+                                       ColumnFilter columnFilter,
+                                       RowFilter rowFilter,
+                                       DataLimits limits,
+                                       DecoratedKey partitionKey,
+                                       ClusteringIndexFilter clusteringIndexFilter,
+                                       IndexMetadata index)
+    {
+        super(Kind.SINGLE_PARTITION, isDigest, digestVersion, isForThrift, metadata, nowInSec, columnFilter, rowFilter, limits, index);
+        assert partitionKey.getPartitioner() == metadata.partitioner;
+        this.partitionKey = partitionKey;
+        this.clusteringIndexFilter = clusteringIndexFilter;
+    }
+
+    /**
+     * Creates a new read command on a single partition.
+     *
+     * @param isForThrift whether the query is for thrift or not.
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param columnFilter the column filter to use for the query.
+     * @param rowFilter the row filter to use for the query.
+     * @param limits the limits to use for the query.
+     * @param partitionKey the partition key for the partition to query.
+     * @param clusteringIndexFilter the clustering index filter to use for the query.
+     * @param indexMetadata explicitly specified index to use for the query
+     *
+     * @return a newly created read command.
+     */
+    public static SinglePartitionReadCommand create(boolean isForThrift,
+                                                    CFMetaData metadata,
+                                                    int nowInSec,
+                                                    ColumnFilter columnFilter,
+                                                    RowFilter rowFilter,
+                                                    DataLimits limits,
+                                                    DecoratedKey partitionKey,
+                                                    ClusteringIndexFilter clusteringIndexFilter,
+                                                    IndexMetadata indexMetadata)
+    {
+        return new SinglePartitionReadCommand(false,
+                                              0,
+                                              isForThrift,
+                                              metadata,
+                                              nowInSec,
+                                              columnFilter,
+                                              rowFilter,
+                                              limits,
+                                              partitionKey,
+                                              clusteringIndexFilter,
+                                              indexMetadata);
+    }
+
+    /**
+     * Creates a new read command on a single partition.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param columnFilter the column filter to use for the query.
+     * @param rowFilter the row filter to use for the query.
+     * @param limits the limits to use for the query.
+     * @param partitionKey the partition key for the partition to query.
+     * @param clusteringIndexFilter the clustering index filter to use for the query.
+     *
+     * @return a newly created read command.
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata,
+                                                    int nowInSec,
+                                                    ColumnFilter columnFilter,
+                                                    RowFilter rowFilter,
+                                                    DataLimits limits,
+                                                    DecoratedKey partitionKey,
+                                                    ClusteringIndexFilter clusteringIndexFilter)
+    {
+        return create(false, metadata, nowInSec, columnFilter, rowFilter, limits, partitionKey, clusteringIndexFilter);
+    }
+
+    /**
+     * Creates a new read command on a single partition.
+     *
+     * @param isForThrift whether the query is for thrift or not.
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param columnFilter the column filter to use for the query.
+     * @param rowFilter the row filter to use for the query.
+     * @param limits the limits to use for the query.
+     * @param partitionKey the partition key for the partition to query.
+     * @param clusteringIndexFilter the clustering index filter to use for the query.
+     *
+     * @return a newly created read command.
+     */
+    public static SinglePartitionReadCommand create(boolean isForThrift,
+                                                    CFMetaData metadata,
+                                                    int nowInSec,
+                                                    ColumnFilter columnFilter,
+                                                    RowFilter rowFilter,
+                                                    DataLimits limits,
+                                                    DecoratedKey partitionKey,
+                                                    ClusteringIndexFilter clusteringIndexFilter)
+    {
+        return create(isForThrift,
+                      metadata,
+                      nowInSec,
+                      columnFilter,
+                      rowFilter,
+                      limits,
+                      partitionKey,
+                      clusteringIndexFilter,
+                      findIndex(metadata, rowFilter));
+    }
+
+    /**
+     * Creates a new read command on a single partition.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     * @param columnFilter the column filter to use for the query.
+     * @param filter the clustering index filter to use for the query.
+     *
+     * @return a newly created read command. The returned command will use no row filter and have no limits.
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata,
+                                                    int nowInSec,
+                                                    DecoratedKey key,
+                                                    ColumnFilter columnFilter,
+                                                    ClusteringIndexFilter filter)
+    {
+        return create(metadata, nowInSec, columnFilter, RowFilter.NONE, DataLimits.NONE, key, filter);
+    }
+
+    /**
+     * Creates a new read command that queries a single partition in its entirety.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     *
+     * @return a newly created read command that queries all the rows of {@code key}.
+     */
+    public static SinglePartitionReadCommand fullPartitionRead(CFMetaData metadata, int nowInSec, DecoratedKey key)
+    {
+        return create(metadata, nowInSec, key, Slices.ALL);
+    }
+
+    /**
+     * Creates a new read command that queries a single partition in its entirety.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     *
+     * @return a newly created read command that queries all the rows of {@code key}.
+     */
+    public static SinglePartitionReadCommand fullPartitionRead(CFMetaData metadata, int nowInSec, ByteBuffer key)
+    {
+        return create(metadata, nowInSec, metadata.decorateKey(key), Slices.ALL);
+    }
+
+    /**
+     * Creates a new single partition slice command for the provided single slice.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     * @param slice the slice of rows to query.
+     *
+     * @return a newly created read command that queries {@code slice} in {@code key}. The returned query will
+     * query every columns for the table (without limit or row filtering) and be in forward order.
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata, int nowInSec, DecoratedKey key, Slice slice)
+    {
+        return create(metadata, nowInSec, key, Slices.with(metadata.comparator, slice));
+    }
+
+    /**
+     * Creates a new single partition slice command for the provided slices.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     * @param slices the slices of rows to query.
+     *
+     * @return a newly created read command that queries the {@code slices} in {@code key}. The returned query will
+     * query every columns for the table (without limit or row filtering) and be in forward order.
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata, int nowInSec, DecoratedKey key, Slices slices)
+    {
+        ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(slices, false);
+        return create(metadata, nowInSec, ColumnFilter.all(metadata), RowFilter.NONE, DataLimits.NONE, key, filter);
+    }
+
+    /**
+     * Creates a new single partition slice command for the provided slices.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     * @param slices the slices of rows to query.
+     *
+     * @return a newly created read command that queries the {@code slices} in {@code key}. The returned query will
+     * query every columns for the table (without limit or row filtering) and be in forward order.
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata, int nowInSec, ByteBuffer key, Slices slices)
+    {
+        return create(metadata, nowInSec, metadata.decorateKey(key), slices);
+    }
+
+    /**
+     * Creates a new single partition name command for the provided rows.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     * @param names the clustering for the rows to query.
+     *
+     * @return a newly created read command that queries the {@code names} in {@code key}. The returned query will
+     * query every columns (without limit or row filtering) and be in forward order.
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata, int nowInSec, DecoratedKey key, NavigableSet<Clustering> names)
+    {
+        ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(names, false);
+        return create(metadata, nowInSec, ColumnFilter.all(metadata), RowFilter.NONE, DataLimits.NONE, key, filter);
+    }
+
+    /**
+     * Creates a new single partition name command for the provided row.
+     *
+     * @param metadata the table to query.
+     * @param nowInSec the time in seconds to use are "now" for this query.
+     * @param key the partition key for the partition to query.
+     * @param name the clustering for the row to query.
+     *
+     * @return a newly created read command that queries {@code name} in {@code key}. The returned query will
+     * query every columns (without limit or row filtering).
+     */
+    public static SinglePartitionReadCommand create(CFMetaData metadata, int nowInSec, DecoratedKey key, Clustering name)
+    {
+        return create(metadata, nowInSec, key, FBUtilities.singleton(name, metadata.comparator));
+    }
+
+    public SinglePartitionReadCommand copy()
+    {
+        return new SinglePartitionReadCommand(isDigestQuery(),
+                                              digestVersion(),
+                                              isForThrift(),
+                                              metadata(),
+                                              nowInSec(),
+                                              columnFilter(),
+                                              rowFilter(),
+                                              limits(),
+                                              partitionKey(),
+                                              clusteringIndexFilter(),
+                                              indexMetadata());
+    }
+
+    public SinglePartitionReadCommand copyAsDigestQuery()
+    {
+        return new SinglePartitionReadCommand(true,
+                                              digestVersion(),
+                                              isForThrift(),
+                                              metadata(),
+                                              nowInSec(),
+                                              columnFilter(),
+                                              rowFilter(),
+                                              limits(),
+                                              partitionKey(),
+                                              clusteringIndexFilter(),
+                                              indexMetadata());
+    }
+
+    public SinglePartitionReadCommand withUpdatedClusteringIndexFilter(ClusteringIndexFilter filter)
+    {
+        return new SinglePartitionReadCommand(isDigestQuery(),
+                                              digestVersion(),
+                                              isForThrift(),
+                                              metadata(),
+                                              nowInSec(),
+                                              columnFilter(),
+                                              rowFilter(),
+                                              limits(),
+                                              partitionKey(),
+                                              filter,
+                                              indexMetadata());
+    }
+
+    static SinglePartitionReadCommand legacySliceCommand(boolean isDigest,
+                                                         int digestVersion,
+                                                         CFMetaData metadata,
+                                                         int nowInSec,
+                                                         ColumnFilter columnFilter,
+                                                         DataLimits limits,
+                                                         DecoratedKey partitionKey,
+                                                         ClusteringIndexSliceFilter filter)
+    {
+        // messages from old nodes will expect the thrift format, so always use 'true' for isForThrift
+        return new SinglePartitionReadCommand(isDigest,
+                                              digestVersion,
+                                              true,
+                                              metadata,
+                                              nowInSec,
+                                              columnFilter,
+                                              RowFilter.NONE,
+                                              limits,
+                                              partitionKey,
+                                              filter,
+                                              null);
+    }
+
+    static SinglePartitionReadCommand legacyNamesCommand(boolean isDigest,
+                                                         int digestVersion,
+                                                         CFMetaData metadata,
+                                                         int nowInSec,
+                                                         ColumnFilter columnFilter,
+                                                         DecoratedKey partitionKey,
+                                                         ClusteringIndexNamesFilter filter)
+    {
+        // messages from old nodes will expect the thrift format, so always use 'true' for isForThrift
+        return new SinglePartitionReadCommand(isDigest, digestVersion, true, metadata, nowInSec, columnFilter, RowFilter.NONE, DataLimits.NONE, partitionKey, filter,null);
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return partitionKey;
+    }
+
+    public ClusteringIndexFilter clusteringIndexFilter()
+    {
+        return clusteringIndexFilter;
+    }
+
+    public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key)
+    {
+        return clusteringIndexFilter;
+    }
+
+    public long getTimeout()
+    {
+        return DatabaseDescriptor.getReadRpcTimeout();
+    }
+
+    public boolean isReversed()
+    {
+        return clusteringIndexFilter.isReversed();
+    }
+
+    public boolean selectsKey(DecoratedKey key)
+    {
+        if (!this.partitionKey().equals(key))
+            return false;
+
+        return rowFilter().partitionKeyRestrictionsAreSatisfiedBy(key, metadata().getKeyValidator());
+    }
+
+    public boolean selectsClustering(DecoratedKey key, Clustering clustering)
+    {
+        if (clustering == Clustering.STATIC_CLUSTERING)
+            return !columnFilter().fetchedColumns().statics.isEmpty();
+
+        if (!clusteringIndexFilter().selects(clustering))
+            return false;
+
+        return rowFilter().clusteringKeyRestrictionsAreSatisfiedBy(clustering);
+    }
+
+    /**
+     * Returns a new command suitable to paging from the last returned row.
+     *
+     * @param lastReturned the last row returned by the previous page. The newly created command
+     * will only query row that comes after this (in query order). This can be {@code null} if this
+     * is the first page.
+     * @param pageSize the size to use for the page to query.
+     *
+     * @return the newly create command.
+     */
+    public SinglePartitionReadCommand forPaging(Clustering lastReturned, int pageSize)
+    {
+        // We shouldn't have set digest yet when reaching that point
+        assert !isDigestQuery();
+        return create(isForThrift(),
+                      metadata(),
+                      nowInSec(),
+                      columnFilter(),
+                      rowFilter(),
+                      limits().forPaging(pageSize),
+                      partitionKey(),
+                      lastReturned == null ? clusteringIndexFilter() : clusteringIndexFilter.forPaging(metadata().comparator, lastReturned, false));
+    }
+
+    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState) throws RequestExecutionException
+    {
+        return StorageProxy.read(Group.one(this), consistency, clientState);
+    }
+
+    public SinglePartitionPager getPager(PagingState pagingState, int protocolVersion)
+    {
+        return getPager(this, pagingState, protocolVersion);
+    }
+
+    private static SinglePartitionPager getPager(SinglePartitionReadCommand command, PagingState pagingState, int protocolVersion)
+    {
+        return new SinglePartitionPager(command, pagingState, protocolVersion);
+    }
+
+    protected void recordLatency(TableMetrics metric, long latencyNanos)
+    {
+        metric.readLatency.addNano(latencyNanos);
+    }
+
+    @SuppressWarnings("resource") // we close the created iterator through closing the result of this method (and SingletonUnfilteredPartitionIterator ctor cannot fail)
+    protected UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadOrderGroup orderGroup)
+    {
+        UnfilteredRowIterator partition = cfs.isRowCacheEnabled()
+                                        ? getThroughCache(cfs, orderGroup.baseReadOpOrderGroup())
+                                        : queryMemtableAndDisk(cfs, orderGroup.baseReadOpOrderGroup());
+        return new SingletonUnfilteredPartitionIterator(partition, isForThrift());
+    }
+
+    /**
+     * Fetch the rows requested if in cache; if not, read it from disk and cache it.
+     * <p>
+     * If the partition is cached, and the filter given is within its bounds, we return
+     * from cache, otherwise from disk.
+     * <p>
+     * If the partition is is not cached, we figure out what filter is "biggest", read
+     * that from disk, then filter the result and either cache that or return it.
+     */
+    private UnfilteredRowIterator getThroughCache(ColumnFamilyStore cfs, OpOrder.Group readOp)
+    {
+        assert !cfs.isIndex(); // CASSANDRA-5732
+        assert cfs.isRowCacheEnabled() : String.format("Row cache is not enabled on table [%s]", cfs.name);
+
+        RowCacheKey key = new RowCacheKey(metadata().ksAndCFName, partitionKey());
+
+        // Attempt a sentinel-read-cache sequence.  if a write invalidates our sentinel, we'll return our
+        // (now potentially obsolete) data, but won't cache it. see CASSANDRA-3862
+        // TODO: don't evict entire partitions on writes (#2864)
+        IRowCacheEntry cached = CacheService.instance.rowCache.get(key);
+        if (cached != null)
+        {
+            if (cached instanceof RowCacheSentinel)
+            {
+                // Some other read is trying to cache the value, just do a normal non-caching read
+                Tracing.trace("Row cache miss (race)");
+                cfs.metric.rowCacheMiss.inc();
+                return queryMemtableAndDisk(cfs, readOp);
+            }
+
+            CachedPartition cachedPartition = (CachedPartition)cached;
+            if (cfs.isFilterFullyCoveredBy(clusteringIndexFilter(), limits(), cachedPartition, nowInSec()))
+            {
+                cfs.metric.rowCacheHit.inc();
+                Tracing.trace("Row cache hit");
+                UnfilteredRowIterator unfilteredRowIterator = clusteringIndexFilter().getUnfilteredRowIterator(columnFilter(), cachedPartition);
+                cfs.metric.updateSSTableIterated(0);
+                return unfilteredRowIterator;
+            }
+
+            cfs.metric.rowCacheHitOutOfRange.inc();
+            Tracing.trace("Ignoring row cache as cached value could not satisfy query");
+            return queryMemtableAndDisk(cfs, readOp);
+        }
+
+        cfs.metric.rowCacheMiss.inc();
+        Tracing.trace("Row cache miss");
+
+        // Note that on tables with no clustering keys, any positive value of
+        // rowsToCache implies caching the full partition
+        boolean cacheFullPartitions = metadata().clusteringColumns().size() > 0 ?
+                                      metadata().params.caching.cacheAllRows() :
+                                      metadata().params.caching.cacheRows();
+
+        // To be able to cache what we read, what we read must at least covers what the cache holds, that
+        // is the 'rowsToCache' first rows of the partition. We could read those 'rowsToCache' first rows
+        // systematically, but we'd have to "extend" that to whatever is needed for the user query that the
+        // 'rowsToCache' first rows don't cover and it's not trivial with our existing filters. So currently
+        // we settle for caching what we read only if the user query does query the head of the partition since
+        // that's the common case of when we'll be able to use the cache anyway. One exception is if we cache
+        // full partitions, in which case we just always read it all and cache.
+        if (cacheFullPartitions || clusteringIndexFilter().isHeadFilter())
+        {
+            RowCacheSentinel sentinel = new RowCacheSentinel();
+            boolean sentinelSuccess = CacheService.instance.rowCache.putIfAbsent(key, sentinel);
+            boolean sentinelReplaced = false;
+
+            try
+            {
+                final int rowsToCache = metadata().params.caching.rowsPerPartitionToCache();
+                final boolean enforceStrictLiveness = metadata().enforceStrictLiveness();
+
+                @SuppressWarnings("resource") // we close on exception or upon closing the result of this method
+                UnfilteredRowIterator iter = fullPartitionRead(metadata(), nowInSec(), partitionKey()).queryMemtableAndDisk(cfs, readOp);
+                try
+                {
+                    // Use a custom iterator instead of DataLimits to avoid stopping the original iterator
+                    UnfilteredRowIterator toCacheIterator = new WrappingUnfilteredRowIterator(iter)
+                    {
+                        private int rowsCounted = 0;
+
+                        @Override
+                        public boolean hasNext()
+                        {
+                            return rowsCounted < rowsToCache && super.hasNext();
+                        }
+
+                        @Override
+                        public Unfiltered next()
+                        {
+                            Unfiltered unfiltered = super.next();
+                            if (unfiltered.isRow())
+                            {
+                                Row row = (Row) unfiltered;
+                                if (row.hasLiveData(nowInSec(), enforceStrictLiveness))
+                                    rowsCounted++;
+                            }
+                            return unfiltered;
+                        }
+                    };
+
+                    // We want to cache only rowsToCache rows
+                    CachedPartition toCache = CachedBTreePartition.create(toCacheIterator, nowInSec());
+                    if (sentinelSuccess && !toCache.isEmpty())
+                    {
+                        Tracing.trace("Caching {} rows", toCache.rowCount());
+                        CacheService.instance.rowCache.replace(key, sentinel, toCache);
+                        // Whether or not the previous replace has worked, our sentinel is not in the cache anymore
+                        sentinelReplaced = true;
+                    }
+
+                    // We then re-filter out what this query wants.
+                    // Note that in the case where we don't cache full partitions, it's possible that the current query is interested in more
+                    // than what we've cached, so we can't just use toCache.
+                    UnfilteredRowIterator cacheIterator = clusteringIndexFilter().getUnfilteredRowIterator(columnFilter(), toCache);
+                    if (cacheFullPartitions)
+                    {
+                        // Everything is guaranteed to be in 'toCache', we're done with 'iter'
+                        assert !iter.hasNext();
+                        iter.close();
+                        return cacheIterator;
+                    }
+                    return UnfilteredRowIterators.concat(cacheIterator, clusteringIndexFilter().filterNotIndexed(columnFilter(), iter));
+                }
+                catch (RuntimeException | Error e)
+                {
+                    iter.close();
+                    throw e;
+                }
+            }
+            finally
+            {
+                if (sentinelSuccess && !sentinelReplaced)
+                    cfs.invalidateCachedPartition(key);
+            }
+        }
+
+        Tracing.trace("Fetching data but not populating cache as query does not query from the start of the partition");
+        return queryMemtableAndDisk(cfs, readOp);
+    }
+
+    /**
+     * Queries both memtable and sstables to fetch the result of this query.
+     * <p>
+     * Please note that this method:
+     *   1) does not check the row cache.
+     *   2) does not apply the query limit, nor the row filter (and so ignore 2ndary indexes).
+     *      Those are applied in {@link ReadCommand#executeLocally}.
+     *   3) does not record some of the read metrics (latency, scanned cells histograms) nor
+     *      throws TombstoneOverwhelmingException.
+     * It is publicly exposed because there is a few places where that is exactly what we want,
+     * but it should be used only where you know you don't need thoses things.
+     * <p>
+     * Also note that one must have "started" a {@code OpOrder.Group} on the queried table, and that is
+     * to enforce that that it is required as parameter, even though it's not explicitlly used by the method.
+     */
+    public UnfilteredRowIterator queryMemtableAndDisk(ColumnFamilyStore cfs, OpOrder.Group readOp)
+    {
+        Tracing.trace("Executing single-partition query on {}", cfs.name);
+
+        boolean copyOnHeap = Memtable.MEMORY_POOL.needToCopyOnHeap();
+        return queryMemtableAndDiskInternal(cfs, copyOnHeap);
+    }
+
+    @Override
+    protected int oldestUnrepairedTombstone()
+    {
+        return oldestUnrepairedTombstone;
+    }
+
+    private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs, boolean copyOnHeap)
+    {
+        /*
+         * We have 2 main strategies:
+         *   1) We query memtables and sstables simulateneously. This is our most generic strategy and the one we use
+         *      unless we have a names filter that we know we can optimize futher.
+         *   2) If we have a name filter (so we query specific rows), we can make a bet: that all column for all queried row
+         *      will have data in the most recent sstable(s), thus saving us from reading older ones. This does imply we
+         *      have a way to guarantee we have all the data for what is queried, which is only possible for name queries
+         *      and if we have neither non-frozen collections/UDTs nor counters (indeed, for a non-frozen collection or UDT,
+         *      we can't guarantee an older sstable won't have some elements that weren't in the most recent sstables,
+         *      and counters are intrinsically a collection of shards and so have the same problem).
+         */
+        if (clusteringIndexFilter() instanceof ClusteringIndexNamesFilter && !queriesMulticellType())
+            return queryMemtableAndSSTablesInTimestampOrder(cfs, copyOnHeap, (ClusteringIndexNamesFilter)clusteringIndexFilter());
+
+        Tracing.trace("Acquiring sstable references");
+        ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, partitionKey()));
+
+        List<UnfilteredRowIterator> iterators = new ArrayList<>(Iterables.size(view.memtables) + view.sstables.size());
+        ClusteringIndexFilter filter = clusteringIndexFilter();
+
+        try
+        {
+            for (Memtable memtable : view.memtables)
+            {
+                Partition partition = memtable.getPartition(partitionKey());
+                if (partition == null)
+                    continue;
+
+                // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
+                @SuppressWarnings("resource")
+                UnfilteredRowIterator iter = filter.getUnfilteredRowIterator(columnFilter(), partition);
+
+                if (copyOnHeap)
+                    iter = UnfilteredRowIterators.cloningIterator(iter, HeapAllocator.instance);
+
+                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, partition.stats().minLocalDeletionTime);
+
+                if (isForThrift())
+                    iter = ThriftResultsMerger.maybeWrap(iter, nowInSec());
+
+                iterators.add(RTBoundValidator.validate(iter, RTBoundValidator.Stage.MEMTABLE, false));
+            }
+            /*
+             * We can't eliminate full sstables based on the timestamp of what we've already read like
+             * in collectTimeOrderedData, but we still want to eliminate sstable whose maxTimestamp < mostRecentTombstone
+             * we've read. We still rely on the sstable ordering by maxTimestamp since if
+             *   maxTimestamp_s1 < maxTimestamp_s0,
+             * we're guaranteed that s1 cannot have a row tombstone such that
+             *   timestamp(tombstone) > maxTimestamp_s0
+             * since we necessarily have
+             *   timestamp(tombstone) <= maxTimestamp_s1
+             * In other words, iterating in descending maxTimestamp order allow to do our mostRecentPartitionTombstone
+             * elimination in one pass, and minimize the number of sstables for which we read a partition tombstone.
+             */
+            Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
+            long mostRecentPartitionTombstone = Long.MIN_VALUE;
+            int nonIntersectingSSTables = 0;
+            int includedDueToTombstones = 0;
+            SSTableReadMetricsCollector metricsCollector = new SSTableReadMetricsCollector();
+
+            for (SSTableReader sstable : view.sstables)
+            {
+                // if we've already seen a partition tombstone with a timestamp greater
+                // than the most recent update to this sstable, we can skip it
+                if (sstable.getMaxTimestamp() < mostRecentPartitionTombstone)
+                    break;
+
+                if (shouldInclude(sstable))
+                {
+                    if (!sstable.isRepaired())
+                        oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
+
+                    // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
+                    @SuppressWarnings("resource")
+                    UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
+                                                                                columnFilter(),
+                                                                                filter.isReversed(),
+                                                                                isForThrift(),
+                                                                                metricsCollector));
+
+                    if (isForThrift())
+                        iter = ThriftResultsMerger.maybeWrap(iter, nowInSec());
+
+                    iterators.add(RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false));
+                    mostRecentPartitionTombstone = Math.max(mostRecentPartitionTombstone,
+                                                            iter.partitionLevelDeletion().markedForDeleteAt());
+                }
+                else
+                {
+
+                    nonIntersectingSSTables++;
+                    // sstable contains no tombstone if maxLocalDeletionTime == Integer.MAX_VALUE, so we can safely skip those entirely
+                    if (sstable.hasTombstones())
+                    {
+                        // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
+                        @SuppressWarnings("resource")
+                        UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
+                                                                                    columnFilter(),
+                                                                                    filter.isReversed(),
+                                                                                    isForThrift(),
+                                                                                    metricsCollector));
+                        // if the sstable contains a partition delete, then we must include it regardless of whether it
+                        // shadows any other data seen locally as we can't guarantee that other replicas have seen it
+                        if (!iter.partitionLevelDeletion().isLive())
+                        {
+                            includedDueToTombstones++;
+                            iterators.add(RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false));
+                            if (!sstable.isRepaired())
+                                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
+                            mostRecentPartitionTombstone = Math.max(mostRecentPartitionTombstone,
+                                                                    iter.partitionLevelDeletion().markedForDeleteAt());
+                        }
+                        else
+                        {
+                            iter.close();
+                        }
+
+                    }
+                }
+            }
+
+            if (Tracing.isTracing())
+                Tracing.trace("Skipped {}/{} non-slice-intersecting sstables, included {} due to tombstones",
+                              nonIntersectingSSTables, view.sstables.size(), includedDueToTombstones);
+
+            cfs.metric.updateSSTableIterated(metricsCollector.getMergedSSTables());
+
+            if (iterators.isEmpty())
+                return EmptyIterators.unfilteredRow(cfs.metadata, partitionKey(), filter.isReversed());
+
+            Tracing.trace("Merging data from memtables and {} sstables", metricsCollector.getMergedSSTables());
+
+            @SuppressWarnings("resource") //  Closed through the closing of the result of that method.
+            UnfilteredRowIterator merged = UnfilteredRowIterators.merge(iterators, nowInSec());
+            if (!merged.isEmpty())
+            {
+                DecoratedKey key = merged.partitionKey();
+                cfs.metric.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1);
+            }
+
+            return merged;
+        }
+        catch (RuntimeException | Error e)
+        {
+            try
+            {
+                FBUtilities.closeAll(iterators);
+            }
+            catch (Exception suppressed)
+            {
+                e.addSuppressed(suppressed);
+            }
+            throw e;
+        }
+    }
+
+    private boolean shouldInclude(SSTableReader sstable)
+    {
+        // If some static columns are queried, we should always include the sstable: the clustering values stats of the sstable
+        // don't tell us if the sstable contains static values in particular.
+        // TODO: we could record if a sstable contains any static value at all.
+        if (!columnFilter().fetchedColumns().statics.isEmpty())
+            return true;
+
+        return clusteringIndexFilter().shouldInclude(sstable);
+    }
+
+    private boolean queriesMulticellType()
+    {
+        for (ColumnDefinition column : columnFilter().fetchedColumns())
+        {
+            if (column.type.isMultiCell() || column.type.isCounter())
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Do a read by querying the memtable(s) first, and then each relevant sstables sequentially by order of the sstable
+     * max timestamp.
+     *
+     * This is used for names query in the hope of only having to query the 1 or 2 most recent query and then knowing nothing
+     * more recent could be in the older sstables (which we can only guarantee if we know exactly which row we queries, and if
+     * no collection or counters are included).
+     * This method assumes the filter is a {@code ClusteringIndexNamesFilter}.
+     */
+    private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, boolean copyOnHeap, ClusteringIndexNamesFilter filter)
+    {
+        Tracing.trace("Acquiring sstable references");
+        ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, partitionKey()));
+
+        ImmutableBTreePartition result = null;
+
+        Tracing.trace("Merging memtable contents");
+        for (Memtable memtable : view.memtables)
+        {
+            Partition partition = memtable.getPartition(partitionKey());
+            if (partition == null)
+                continue;
+
+            try (UnfilteredRowIterator iter = filter.getUnfilteredRowIterator(columnFilter(), partition))
+            {
+                if (iter.isEmpty())
+                    continue;
+
+                UnfilteredRowIterator clonedIter = copyOnHeap
+                                                   ? UnfilteredRowIterators.cloningIterator(iter, HeapAllocator.instance)
+                                                   : iter;
+                result = add(
+                    RTBoundValidator.validate(isForThrift() ? ThriftResultsMerger.maybeWrap(clonedIter, nowInSec()) : clonedIter, RTBoundValidator.Stage.MEMTABLE, false),
+                    result,
+                    filter,
+                    false
+                );
+            }
+        }
+
+        /* add the SSTables on disk */
+        Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
+        boolean onlyUnrepaired = true;
+        // read sorted sstables
+        SSTableReadMetricsCollector metricsCollector = new SSTableReadMetricsCollector();
+        for (SSTableReader sstable : view.sstables)
+        {
+            // if we've already seen a partition tombstone with a timestamp greater
+            // than the most recent update to this sstable, we're done, since the rest of the sstables
+            // will also be older
+            if (result != null && sstable.getMaxTimestamp() < result.partitionLevelDeletion().markedForDeleteAt())
+                break;
+
+            long currentMaxTs = sstable.getMaxTimestamp();
+            filter = reduceFilter(filter, result, currentMaxTs);
+            if (filter == null)
+                break;
+
+            if (!shouldInclude(sstable))
+            {
+                // This mean that nothing queried by the filter can be in the sstable. One exception is the top-level partition deletion
+                // however: if it is set, it impacts everything and must be included. Getting that top-level partition deletion costs us
+                // some seek in general however (unless the partition is indexed and is in the key cache), so we first check if the sstable
+                // has any tombstone at all as a shortcut.
+                if (!sstable.hasTombstones())
+                    continue; // Means no tombstone at all, we can skip that sstable
+
+                // We need to get the partition deletion and include it if it's live. In any case though, we're done with that sstable.
+                try (UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
+                                                                                 columnFilter(),
+                                                                                 filter.isReversed(),
+                                                                                 isForThrift(),
+                                                                                 metricsCollector)))
+                {
+                    if (!iter.partitionLevelDeletion().isLive())
+                    {
+                        result = add(
+                            UnfilteredRowIterators.noRowsIterator(iter.metadata(),
+                                                                  iter.partitionKey(),
+                                                                  Rows.EMPTY_STATIC_ROW,
+                                                                  iter.partitionLevelDeletion(),
+                                                                  filter.isReversed()),
+                            result,
+                            filter,
+                            sstable.isRepaired()
+                        );
+                    }
+                    else
+                    {
+                        result = add(
+                            RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false),
+                            result,
+                            filter,
+                            sstable.isRepaired()
+                        );
+                    }
+                }
+
+                continue;
+            }
+
+            Tracing.trace("Merging data from sstable {}", sstable.descriptor.generation);
+            try (UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
+                                                                             columnFilter(),
+                                                                             filter.isReversed(),
+                                                                             isForThrift(),
+                                                                             metricsCollector)))
+            {
+                if (iter.isEmpty())
+                    continue;
+
+                if (sstable.isRepaired())
+                    onlyUnrepaired = false;
+
+                result = add(
+                    RTBoundValidator.validate(isForThrift() ? ThriftResultsMerger.maybeWrap(iter, nowInSec()) : iter, RTBoundValidator.Stage.SSTABLE, false),
+                    result,
+                    filter,
+                    sstable.isRepaired()
+                );
+            }
+        }
+
+        cfs.metric.updateSSTableIterated(metricsCollector.getMergedSSTables());
+
+        if (result == null || result.isEmpty())
+            return EmptyIterators.unfilteredRow(metadata(), partitionKey(), false);
+
+        DecoratedKey key = result.partitionKey();
+        cfs.metric.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1);
+
+        // "hoist up" the requested data into a more recent sstable
+        if (metricsCollector.getMergedSSTables() > cfs.getMinimumCompactionThreshold()
+            && onlyUnrepaired
+            && !cfs.isAutoCompactionDisabled()
+            && cfs.getCompactionStrategyManager().shouldDefragment())
+        {
+            // !!WARNING!!   if we stop copying our data to a heap-managed object,
+            //               we will need to track the lifetime of this mutation as well
+            Tracing.trace("Defragmenting requested data");
+
+            try (UnfilteredRowIterator iter = result.unfilteredIterator(columnFilter(), Slices.ALL, false))
+            {
+                final Mutation mutation = new Mutation(PartitionUpdate.fromIterator(iter));
+                StageManager.getStage(Stage.MUTATION).execute(new Runnable()
+                {
+                    public void run()
+                    {
+                        // skipping commitlog and index updates is fine since we're just de-fragmenting existing data
+                        Keyspace.open(mutation.getKeyspaceName()).apply(mutation, false, false);
+                    }
+                });
+            }
+        }
+
+        return result.unfilteredIterator(columnFilter(), Slices.ALL, clusteringIndexFilter().isReversed());
+    }
+
+    private ImmutableBTreePartition add(UnfilteredRowIterator iter, ImmutableBTreePartition result, ClusteringIndexNamesFilter filter, boolean isRepaired)
+    {
+        if (!isRepaired)
+            oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, iter.stats().minLocalDeletionTime);
+
+        int maxRows = Math.max(filter.requestedRows().size(), 1);
+        if (result == null)
+            return ImmutableBTreePartition.create(iter, maxRows);
+
+        try (UnfilteredRowIterator merged = UnfilteredRowIterators.merge(Arrays.asList(iter, result.unfilteredIterator(columnFilter(), Slices.ALL, filter.isReversed())), nowInSec()))
+        {
+            return ImmutableBTreePartition.create(merged, maxRows);
+        }
+    }
+
+    private ClusteringIndexNamesFilter reduceFilter(ClusteringIndexNamesFilter filter, Partition result, long sstableTimestamp)
+    {
+        if (result == null)
+            return filter;
+
+        SearchIterator<Clustering, Row> searchIter = result.searchIterator(columnFilter(), false);
+
+        PartitionColumns columns = columnFilter().fetchedColumns();
+        NavigableSet<Clustering> clusterings = filter.requestedRows();
+
+        // We want to remove rows for which we have values for all requested columns. We have to deal with both static and regular rows.
+        // TODO: we could also remove a selected column if we've found values for every requested row but we'll leave
+        // that for later.
+
+        boolean removeStatic = false;
+        if (!columns.statics.isEmpty())
+        {
+            Row staticRow = searchIter.next(Clustering.STATIC_CLUSTERING);
+            removeStatic = staticRow != null && canRemoveRow(staticRow, columns.statics, sstableTimestamp);
+        }
+
+        NavigableSet<Clustering> toRemove = null;
+        for (Clustering clustering : clusterings)
+        {
+            Row row = searchIter.next(clustering);
+            if (row == null || !canRemoveRow(row, columns.regulars, sstableTimestamp))
+                continue;
+
+            if (toRemove == null)
+                toRemove = new TreeSet<>(result.metadata().comparator);
+            toRemove.add(clustering);
+        }
+
+        if (!removeStatic && toRemove == null)
+            return filter;
+
+        // Check if we have everything we need
+        boolean hasNoMoreStatic = columns.statics.isEmpty() || removeStatic;
+        boolean hasNoMoreClusterings = clusterings.isEmpty() || (toRemove != null && toRemove.size() == clusterings.size());
+        if (hasNoMoreStatic && hasNoMoreClusterings)
+            return null;
+
+        if (toRemove != null)
+        {
+            BTreeSet.Builder<Clustering> newClusterings = BTreeSet.builder(result.metadata().comparator);
+            newClusterings.addAll(Sets.difference(clusterings, toRemove));
+            clusterings = newClusterings.build();
+        }
+        return new ClusteringIndexNamesFilter(clusterings, filter.isReversed());
+    }
+
+    private boolean canRemoveRow(Row row, Columns requestedColumns, long sstableTimestamp)
+    {
+        // We can remove a row if it has data that is more recent that the next sstable to consider for the data that the query
+        // cares about. And the data we care about is 1) the row timestamp (since every query cares if the row exists or not)
+        // and 2) the requested columns.
+        if (row.primaryKeyLivenessInfo().isEmpty() || row.primaryKeyLivenessInfo().timestamp() <= sstableTimestamp)
+            return false;
+
+        for (ColumnDefinition column : requestedColumns)
+        {
+            Cell cell = row.getCell(column);
+            if (cell == null || cell.timestamp() <= sstableTimestamp)
+                return false;
+        }
+        return true;
+    }
+
+    @Override
+    public boolean selectsFullPartition()
+    {
+        return metadata().isStaticCompactTable() ||
+               (clusteringIndexFilter.selectsAllPartition() && !rowFilter().hasExpressionOnClusteringOrRegularColumns());
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Read(%s.%s columns=%s rowFilter=%s limits=%s key=%s filter=%s, nowInSec=%d)",
+                             metadata().ksName,
+                             metadata().cfName,
+                             columnFilter(),
+                             rowFilter(),
+                             limits(),
+                             metadata().getKeyValidator().getString(partitionKey().getKey()),
+                             clusteringIndexFilter.toString(metadata()),
+                             nowInSec());
+    }
+
+    public MessageOut<ReadCommand> createMessage(int version)
+    {
+        return new MessageOut<>(MessagingService.Verb.READ, this, readSerializer);
+    }
+
+    protected void appendCQLWhereClause(StringBuilder sb)
+    {
+        sb.append(" WHERE ");
+
+        sb.append(ColumnDefinition.toCQLString(metadata().partitionKeyColumns())).append(" = ");
+        DataRange.appendKeyString(sb, metadata().getKeyValidator(), partitionKey().getKey());
+
+        // We put the row filter first because the clustering index filter can end by "ORDER BY"
+        if (!rowFilter().isEmpty())
+            sb.append(" AND ").append(rowFilter());
+
+        String filterString = clusteringIndexFilter().toCQLString(metadata());
+        if (!filterString.isEmpty())
+            sb.append(" AND ").append(filterString);
+    }
+
+    protected void serializeSelection(DataOutputPlus out, int version) throws IOException
+    {
+        metadata().getKeyValidator().writeValue(partitionKey().getKey(), out);
+        ClusteringIndexFilter.serializer.serialize(clusteringIndexFilter(), out, version);
+    }
+
+    protected long selectionSerializedSize(int version)
+    {
+        return metadata().getKeyValidator().writtenLength(partitionKey().getKey())
+             + ClusteringIndexFilter.serializer.serializedSize(clusteringIndexFilter(), version);
+    }
+
+    public boolean isLimitedToOnePartition()
+    {
+        return true;
+    }
+
+    /**
+     * Groups multiple single partition read commands.
+     */
+    public static class Group implements ReadQuery
+    {
+        public final List<SinglePartitionReadCommand> commands;
+        private final DataLimits limits;
+        private final int nowInSec;
+        private final boolean selectsFullPartitions;
+
+        public Group(List<SinglePartitionReadCommand> commands, DataLimits limits)
+        {
+            assert !commands.isEmpty();
+            this.commands = commands;
+            this.limits = limits;
+            SinglePartitionReadCommand firstCommand = commands.get(0);
+            this.nowInSec = firstCommand.nowInSec();
+            this.selectsFullPartitions = firstCommand.selectsFullPartition();
+            for (int i = 1; i < commands.size(); i++)
+                assert commands.get(i).nowInSec() == nowInSec;
+        }
+
+        public static Group one(SinglePartitionReadCommand command)
+        {
+            return new Group(Collections.<SinglePartitionReadCommand>singletonList(command), command.limits());
+        }
+
+        public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState) throws RequestExecutionException
+        {
+            return StorageProxy.read(this, consistency, clientState);
+        }
+
+        public int nowInSec()
+        {
+            return nowInSec;
+        }
+
+        public DataLimits limits()
+        {
+            return limits;
+        }
+
+        public CFMetaData metadata()
+        {
+            return commands.get(0).metadata();
+        }
+
+        @Override
+        public boolean selectsFullPartition()
+        {
+            return selectsFullPartitions;
+        }
+
+        public ReadOrderGroup startOrderGroup()
+        {
+            // Note that the only difference between the command in a group must be the partition key on which
+            // they applied. So as far as ReadOrderGroup is concerned, we can use any of the commands to start one.
+            return commands.get(0).startOrderGroup();
+        }
+
+        public PartitionIterator executeInternal(ReadOrderGroup orderGroup)
+        {
+            List<PartitionIterator> partitions = new ArrayList<>(commands.size());
+            for (SinglePartitionReadCommand cmd : commands)
+                partitions.add(cmd.executeInternal(orderGroup));
+
+            // Note that the only difference between the command in a group must be the partition key on which
+            // they applied.
+            boolean enforceStrictLiveness = commands.get(0).metadata().enforceStrictLiveness();
+            // Because we only have enforce the limit per command, we need to enforce it globally.
+            return limits.filter(PartitionIterators.concat(partitions),
+                                 nowInSec,
+                                 selectsFullPartitions,
+                                 enforceStrictLiveness);
+        }
+
+        public QueryPager getPager(PagingState pagingState, int protocolVersion)
+        {
+            if (commands.size() == 1)
+                return SinglePartitionReadCommand.getPager(commands.get(0), pagingState, protocolVersion);
+
+            return new MultiPartitionPager(this, pagingState, protocolVersion);
+        }
+
+        public boolean selectsKey(DecoratedKey key)
+        {
+            return Iterables.any(commands, c -> c.selectsKey(key));
+        }
+
+        public boolean selectsClustering(DecoratedKey key, Clustering clustering)
+        {
+            return Iterables.any(commands, c -> c.selectsClustering(key, clustering));
+        }
+
+        @Override
+        public String toString()
+        {
+            return commands.toString();
+        }
+    }
+
+    private static class Deserializer extends SelectionDeserializer
+    {
+        public ReadCommand deserialize(DataInputPlus in,
+                                       int version,
+                                       boolean isDigest,
+                                       int digestVersion,
+                                       boolean isForThrift,
+                                       CFMetaData metadata,
+                                       int nowInSec,
+                                       ColumnFilter columnFilter,
+                                       RowFilter rowFilter,
+                                       DataLimits limits,
+                                       IndexMetadata index)
+        throws IOException
+        {
+            DecoratedKey key = metadata.decorateKey(metadata.getKeyValidator().readValue(in, DatabaseDescriptor.getMaxValueSize()));
+            ClusteringIndexFilter filter = ClusteringIndexFilter.serializer.deserialize(in, version, metadata);
+            return new SinglePartitionReadCommand(isDigest, digestVersion, isForThrift, metadata, nowInSec, columnFilter, rowFilter, limits, key, filter, index);
+        }
+    }
+
+    /**
+     * {@code SSTableReaderListener} used to collect metrics about SSTable read access.
+     */
+    private static final class SSTableReadMetricsCollector implements SSTableReadsListener
+    {
+        /**
+         * The number of SSTables that need to be merged. This counter is only updated for single partition queries
+         * since this has been the behavior so far.
+         */
+        private int mergedSSTables;
+
+        @Override
+        public void onSSTableSelected(SSTableReader sstable, RowIndexEntry<?> indexEntry, SelectionReason reason)
+        {
+            sstable.incrementReadCount();
+            mergedSSTables++;
+        }
+
+        /**
+         * Returns the number of SSTables that need to be merged.
+         * @return the number of SSTables that need to be merged.
+         */
+        public int getMergedSSTables()
+        {
+            return mergedSSTables;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/SizeEstimatesRecorder.java b/src/java/org/apache/cassandra/db/SizeEstimatesRecorder.java
index 2f14fb1..ebe3f9a 100644
--- a/src/java/org/apache/cassandra/db/SizeEstimatesRecorder.java
+++ b/src/java/org/apache/cassandra/db/SizeEstimatesRecorder.java

@@ -23,6 +23,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.db.lifecycle.SSTableIntervalTree;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -66,12 +69,10 @@
 
         logger.trace("Recording size estimates");
 
-        // find primary token ranges for the local node.
-        Collection<Token> localTokens = StorageService.instance.getLocalTokens();
-        Collection<Range<Token>> localRanges = metadata.getPrimaryRangesFor(localTokens);
-
-        for (Keyspace keyspace : Keyspace.nonSystem())
+        for (Keyspace keyspace : Keyspace.nonLocalStrategy())
         {
+            Collection<Range<Token>> localRanges = StorageService.instance.getPrimaryRangesForEndpoint(keyspace.getName(),
+                    FBUtilities.getBroadcastAddress());
             for (ColumnFamilyStore table : keyspace.getColumnFamilyStores())
             {
                 long start = System.nanoTime();
@@ -88,34 +89,39 @@
     @SuppressWarnings("resource")
     private void recordSizeEstimates(ColumnFamilyStore table, Collection<Range<Token>> localRanges)
     {
-        List<Range<Token>> unwrappedRanges = Range.normalize(localRanges);
         // for each local primary range, estimate (crudely) mean partition size and partitions count.
         Map<Range<Token>, Pair<Long, Long>> estimates = new HashMap<>(localRanges.size());
-        for (Range<Token> range : unwrappedRanges)
+        for (Range<Token> localRange : localRanges)
         {
-            // filter sstables that have partitions in this range.
-            Refs<SSTableReader> refs = null;
-            long partitionsCount, meanPartitionSize;
-
-            try
+            for (Range<Token> unwrappedRange : localRange.unwrap())
             {
-                while (refs == null)
+                // filter sstables that have partitions in this range.
+                Refs<SSTableReader> refs = null;
+                long partitionsCount, meanPartitionSize;
+
+                try
                 {
-                    ColumnFamilyStore.ViewFragment view = table.select(table.viewFilter(Range.makeRowRange(range)));
-                    refs = Refs.tryRef(view.sstables);
+                    while (refs == null)
+                    {
+                        Iterable<SSTableReader> sstables = table.getTracker().getView().select(SSTableSet.CANONICAL);
+                        SSTableIntervalTree tree = SSTableIntervalTree.build(sstables);
+                        Range<PartitionPosition> r = Range.makeRowRange(unwrappedRange);
+                        Iterable<SSTableReader> canonicalSSTables = View.sstablesInBounds(r.left, r.right, tree);
+                        refs = Refs.tryRef(canonicalSSTables);
+                    }
+
+                    // calculate the estimates.
+                    partitionsCount = estimatePartitionsCount(refs, unwrappedRange);
+                    meanPartitionSize = estimateMeanPartitionSize(refs);
+                }
+                finally
+                {
+                    if (refs != null)
+                        refs.release();
                 }
 
-                // calculate the estimates.
-                partitionsCount = estimatePartitionsCount(refs, range);
-                meanPartitionSize = estimateMeanPartitionSize(refs);
+                estimates.put(unwrappedRange, Pair.create(partitionsCount, meanPartitionSize));
             }
-            finally
-            {
-                if (refs != null)
-                    refs.release();
-            }
-
-            estimates.put(range, Pair.create(partitionsCount, meanPartitionSize));
         }
 
         // atomically update the estimates.
@@ -135,8 +141,8 @@
         long sum = 0, count = 0;
         for (SSTableReader sstable : sstables)
         {
-            long n = sstable.getEstimatedRowSize().count();
-            sum += sstable.getEstimatedRowSize().mean() * n;
+            long n = sstable.getEstimatedPartitionSize().count();
+            sum += sstable.getEstimatedPartitionSize().mean() * n;
             count += n;
         }
         return count > 0 ? sum / count : 0;

diff --git a/src/java/org/apache/cassandra/db/Slice.java b/src/java/org/apache/cassandra/db/Slice.java
new file mode 100644
index 0000000..fb75b8e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Slice.java

@@ -0,0 +1,544 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * A slice represents the selection of a range of rows.
+ * <p>
+ * A slice has a start and an end bound that are both (potentially full) clustering prefixes.
+ * A slice selects every rows whose clustering is bigger than the slice start prefix but smaller
+ * than the end prefix. Both start and end can be either inclusive or exclusive.
+ */
+public class Slice
+{
+    public static final Serializer serializer = new Serializer();
+
+    /** The slice selecting all rows (of a given partition) */
+    public static final Slice ALL = new Slice(Bound.BOTTOM, Bound.TOP)
+    {
+        @Override
+        public boolean selects(ClusteringComparator comparator, Clustering clustering)
+        {
+            return true;
+        }
+
+        @Override
+        public boolean intersects(ClusteringComparator comparator, List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        {
+            return true;
+        }
+
+        @Override
+        public String toString(ClusteringComparator comparator)
+        {
+            return "ALL";
+        }
+    };
+
+    private final Bound start;
+    private final Bound end;
+
+    private Slice(Bound start, Bound end)
+    {
+        assert start.isStart() && end.isEnd();
+        this.start = start;
+        this.end = end;
+    }
+
+    public static Slice make(Bound start, Bound end)
+    {
+        if (start == Bound.BOTTOM && end == Bound.TOP)
+            return ALL;
+
+        return new Slice(start, end);
+    }
+
+    public static Slice make(ClusteringComparator comparator, Object... values)
+    {
+        CBuilder builder = CBuilder.create(comparator);
+        for (Object val : values)
+        {
+            if (val instanceof ByteBuffer)
+                builder.add((ByteBuffer) val);
+            else
+                builder.add(val);
+        }
+        return new Slice(builder.buildBound(true, true), builder.buildBound(false, true));
+    }
+
+    public static Slice make(Clustering clustering)
+    {
+        // This doesn't give us what we want with the clustering prefix
+        assert clustering != Clustering.STATIC_CLUSTERING;
+        ByteBuffer[] values = extractValues(clustering);
+        return new Slice(Bound.inclusiveStartOf(values), Bound.inclusiveEndOf(values));
+    }
+
+    public static Slice make(Clustering start, Clustering end)
+    {
+        // This doesn't give us what we want with the clustering prefix
+        assert start != Clustering.STATIC_CLUSTERING && end != Clustering.STATIC_CLUSTERING;
+
+        ByteBuffer[] startValues = extractValues(start);
+        ByteBuffer[] endValues = extractValues(end);
+
+        return new Slice(Bound.inclusiveStartOf(startValues), Bound.inclusiveEndOf(endValues));
+    }
+
+    private static ByteBuffer[] extractValues(ClusteringPrefix clustering)
+    {
+        ByteBuffer[] values = new ByteBuffer[clustering.size()];
+        for (int i = 0; i < clustering.size(); i++)
+            values[i] = clustering.get(i);
+        return values;
+    }
+
+    public Bound start()
+    {
+        return start;
+    }
+
+    public Bound end()
+    {
+        return end;
+    }
+
+    public Bound open(boolean reversed)
+    {
+        return reversed ? end : start;
+    }
+
+    public Bound close(boolean reversed)
+    {
+        return reversed ? start : end;
+    }
+
+    /**
+     * Return whether the slice is empty.
+     *
+     * @param comparator the comparator to compare the bounds.
+     * @return whether the slice formed is empty or not.
+     */
+    public boolean isEmpty(ClusteringComparator comparator)
+    {
+        return isEmpty(comparator, start(), end());
+    }
+
+    /**
+     * Return whether the slice formed by the two provided bound is empty or not.
+     *
+     * @param comparator the comparator to compare the bounds.
+     * @param start the start for the slice to consider. This must be a start bound.
+     * @param end the end for the slice to consider. This must be an end bound.
+     * @return whether the slice formed by {@code start} and {@code end} is
+     * empty or not.
+     */
+    public static boolean isEmpty(ClusteringComparator comparator, Slice.Bound start, Slice.Bound end)
+    {
+        assert start.isStart() && end.isEnd();
+        return comparator.compare(end, start) <= 0;
+    }
+
+    /**
+     * Returns whether a given clustering is selected by this slice.
+     *
+     * @param comparator the comparator for the table this is a slice of.
+     * @param clustering the clustering to test inclusion of.
+     *
+     * @return whether {@code clustering} is selected by this slice.
+     */
+    public boolean selects(ClusteringComparator comparator, Clustering clustering)
+    {
+        return comparator.compare(start, clustering) <= 0 && comparator.compare(clustering, end) <= 0;
+    }
+
+    /**
+     * Returns whether a given bound is included in this slice.
+     *
+     * @param comparator the comparator for the table this is a slice of.
+     * @param bound the bound to test inclusion of.
+     *
+     * @return whether {@code bound} is within the bounds of this slice.
+     */
+    public boolean includes(ClusteringComparator comparator, Bound bound)
+    {
+        return comparator.compare(start, bound) <= 0 && comparator.compare(bound, end) <= 0;
+    }
+
+    /**
+     * Returns a slice for continuing paging from the last returned clustering prefix.
+     *
+     * @param comparator the comparator for the table this is a filter for.
+     * @param lastReturned the last clustering that was returned for the query we are paging for. The
+     * resulting slices will be such that only results coming stricly after {@code lastReturned} are returned
+     * (where coming after means "greater than" if {@code !reversed} and "lesser than" otherwise).
+     * @param inclusive whether or not we want to include the {@code lastReturned} in the newly returned page of results.
+     * @param reversed whether the query we're paging for is reversed or not.
+     *
+     * @return a new slice that selects results coming after {@code lastReturned}, or {@code null} if paging
+     * the resulting slice selects nothing (i.e. if it originally selects nothing coming after {@code lastReturned}).
+     */
+    public Slice forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive, boolean reversed)
+    {
+        if (lastReturned == null)
+            return this;
+
+        if (reversed)
+        {
+            int cmp = comparator.compare(lastReturned, start);
+            if (cmp < 0 || (!inclusive && cmp == 0))
+                return null;
+
+            cmp = comparator.compare(end, lastReturned);
+            if (cmp < 0 || (inclusive && cmp == 0))
+                return this;
+
+            ByteBuffer[] values = extractValues(lastReturned);
+            return new Slice(start, inclusive ? Bound.inclusiveEndOf(values) : Bound.exclusiveEndOf(values));
+        }
+        else
+        {
+            int cmp = comparator.compare(end, lastReturned);
+            if (cmp < 0 || (!inclusive && cmp == 0))
+                return null;
+
+            cmp = comparator.compare(lastReturned, start);
+            if (cmp < 0 || (inclusive && cmp == 0))
+                return this;
+
+            ByteBuffer[] values = extractValues(lastReturned);
+            return new Slice(inclusive ? Bound.inclusiveStartOf(values) : Bound.exclusiveStartOf(values), end);
+        }
+    }
+
+    /**
+     * Given the per-clustering column minimum and maximum value a sstable contains, whether or not this slice potentially
+     * intersects that sstable or not.
+     *
+     * @param comparator the comparator for the table this is a slice of.
+     * @param minClusteringValues the smallest values for each clustering column that a sstable contains.
+     * @param maxClusteringValues the biggest values for each clustering column that a sstable contains.
+     *
+     * @return whether the slice might intersects with the sstable having {@code minClusteringValues} and
+     * {@code maxClusteringValues}.
+     */
+    public boolean intersects(ClusteringComparator comparator, List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+    {
+        // If this slice starts after max clustering or ends before min clustering, it can't intersect
+        return start.compareTo(comparator, maxClusteringValues) <= 0 && end.compareTo(comparator, minClusteringValues) >= 0;
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        return toString(metadata.comparator);
+    }
+
+    public String toString(ClusteringComparator comparator)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append(start.isInclusive() ? "[" : "(");
+        for (int i = 0; i < start.size(); i++)
+        {
+            if (i > 0)
+                sb.append(':');
+            sb.append(comparator.subtype(i).getString(start.get(i)));
+        }
+        sb.append(", ");
+        for (int i = 0; i < end.size(); i++)
+        {
+            if (i > 0)
+                sb.append(':');
+            sb.append(comparator.subtype(i).getString(end.get(i)));
+        }
+        sb.append(end.isInclusive() ? "]" : ")");
+        return sb.toString();
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if(!(other instanceof Slice))
+            return false;
+
+        Slice that = (Slice)other;
+        return this.start().equals(that.start())
+            && this.end().equals(that.end());
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(start(), end());
+    }
+
+    public static class Serializer
+    {
+        public void serialize(Slice slice, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException
+        {
+            Bound.serializer.serialize(slice.start, out, version, types);
+            Bound.serializer.serialize(slice.end, out, version, types);
+        }
+
+        public long serializedSize(Slice slice, int version, List<AbstractType<?>> types)
+        {
+            return Bound.serializer.serializedSize(slice.start, version, types)
+                 + Bound.serializer.serializedSize(slice.end, version, types);
+        }
+
+        public Slice deserialize(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException
+        {
+            Bound start = Bound.serializer.deserialize(in, version, types);
+            Bound end = Bound.serializer.deserialize(in, version, types);
+            return new Slice(start, end);
+        }
+    }
+
+    /**
+     * The bound of a slice.
+     * <p>
+     * This can be either a start or an end bound, and this can be either inclusive or exclusive.
+     */
+    public static class Bound extends AbstractClusteringPrefix
+    {
+        public static final Serializer serializer = new Serializer();
+
+        /**
+         * The smallest and biggest bound. Note that as range tomstone bounds are (special case) of slice bounds,
+         * we want the BOTTOM and TOP to be the same object, but we alias them here because it's cleaner when dealing
+         * with slices to refer to Slice.Bound.BOTTOM and Slice.Bound.TOP.
+         */
+        public static final Bound BOTTOM = RangeTombstone.Bound.BOTTOM;
+        public static final Bound TOP = RangeTombstone.Bound.TOP;
+
+        protected Bound(Kind kind, ByteBuffer[] values)
+        {
+            super(kind, values);
+        }
+
+        public static Bound create(Kind kind, ByteBuffer[] values)
+        {
+            assert !kind.isBoundary();
+            return new Bound(kind, values);
+        }
+
+        public static Kind boundKind(boolean isStart, boolean isInclusive)
+        {
+            return isStart
+                 ? (isInclusive ? Kind.INCL_START_BOUND : Kind.EXCL_START_BOUND)
+                 : (isInclusive ? Kind.INCL_END_BOUND : Kind.EXCL_END_BOUND);
+        }
+
+        public static Bound inclusiveStartOf(ByteBuffer... values)
+        {
+            return create(Kind.INCL_START_BOUND, values);
+        }
+
+        public static Bound inclusiveEndOf(ByteBuffer... values)
+        {
+            return create(Kind.INCL_END_BOUND, values);
+        }
+
+        public static Bound exclusiveStartOf(ByteBuffer... values)
+        {
+            return create(Kind.EXCL_START_BOUND, values);
+        }
+
+        public static Bound exclusiveEndOf(ByteBuffer... values)
+        {
+            return create(Kind.EXCL_END_BOUND, values);
+        }
+
+        public static Bound inclusiveStartOf(ClusteringPrefix prefix)
+        {
+            ByteBuffer[] values = new ByteBuffer[prefix.size()];
+            for (int i = 0; i < prefix.size(); i++)
+                values[i] = prefix.get(i);
+            return inclusiveStartOf(values);
+        }
+
+        public static Bound exclusiveStartOf(ClusteringPrefix prefix)
+        {
+            ByteBuffer[] values = new ByteBuffer[prefix.size()];
+            for (int i = 0; i < prefix.size(); i++)
+                values[i] = prefix.get(i);
+            return exclusiveStartOf(values);
+        }
+
+        public static Bound inclusiveEndOf(ClusteringPrefix prefix)
+        {
+            ByteBuffer[] values = new ByteBuffer[prefix.size()];
+            for (int i = 0; i < prefix.size(); i++)
+                values[i] = prefix.get(i);
+            return inclusiveEndOf(values);
+        }
+
+        public static Bound create(ClusteringComparator comparator, boolean isStart, boolean isInclusive, Object... values)
+        {
+            CBuilder builder = CBuilder.create(comparator);
+            for (Object val : values)
+            {
+                if (val instanceof ByteBuffer)
+                    builder.add((ByteBuffer) val);
+                else
+                    builder.add(val);
+            }
+            return builder.buildBound(isStart, isInclusive);
+        }
+
+        public Bound withNewKind(Kind kind)
+        {
+            assert !kind.isBoundary();
+            return new Bound(kind, values);
+        }
+
+        public boolean isStart()
+        {
+            return kind().isStart();
+        }
+
+        public boolean isEnd()
+        {
+            return !isStart();
+        }
+
+        public boolean isInclusive()
+        {
+            return kind == Kind.INCL_START_BOUND || kind == Kind.INCL_END_BOUND;
+        }
+
+        public boolean isExclusive()
+        {
+            return kind == Kind.EXCL_START_BOUND || kind == Kind.EXCL_END_BOUND;
+        }
+
+        /**
+         * Returns the inverse of the current bound.
+         * <p>
+         * This invert both start into end (and vice-versa) and inclusive into exclusive (and vice-versa).
+         *
+         * @return the invert of this bound. For instance, if this bound is an exlusive start, this return
+         * an inclusive end with the same values.
+         */
+        public Slice.Bound invert()
+        {
+            return withNewKind(kind().invert());
+        }
+
+        // For use by intersects, it's called with the sstable bound opposite to the slice bound
+        // (so if the slice bound is a start, it's call with the max sstable bound)
+        private int compareTo(ClusteringComparator comparator, List<ByteBuffer> sstableBound)
+        {
+            for (int i = 0; i < sstableBound.size(); i++)
+            {
+                // Say the slice bound is a start. It means we're in the case where the max
+                // sstable bound is say (1:5) while the slice start is (1). So the start
+                // does start before the sstable end bound (and intersect it). It's the exact
+                // inverse with a end slice bound.
+                if (i >= size())
+                    return isStart() ? -1 : 1;
+
+                int cmp = comparator.compareComponent(i, get(i), sstableBound.get(i));
+                if (cmp != 0)
+                    return cmp;
+            }
+
+            // Say the slice bound is a start. I means we're in the case where the max
+            // sstable bound is say (1), while the slice start is (1:5). This again means
+            // that the slice start before the end bound.
+            if (size() > sstableBound.size())
+                return isStart() ? -1 : 1;
+
+            // The slice bound is equal to the sstable bound. Results depends on whether the slice is inclusive or not
+            return isInclusive() ? 0 : (isStart() ? 1 : -1);
+        }
+
+        public String toString(CFMetaData metadata)
+        {
+            return toString(metadata.comparator);
+        }
+
+        public String toString(ClusteringComparator comparator)
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append(kind()).append('(');
+            for (int i = 0; i < size(); i++)
+            {
+                if (i > 0)
+                    sb.append(", ");
+                sb.append(comparator.subtype(i).getString(get(i)));
+            }
+            return sb.append(')').toString();
+        }
+
+        public ClusteringPrefix minimize()
+        {
+            if (!ByteBufferUtil.canMinimize(values))
+                return this;
+            return new Bound(kind, ByteBufferUtil.minimizeBuffers(values));
+        }
+
+        /**
+         * Serializer for slice bounds.
+         * <p>
+         * Contrarily to {@code Clustering}, a slice bound can be a true prefix of the full clustering, so we actually record
+         * its size.
+         */
+        public static class Serializer
+        {
+            public void serialize(Slice.Bound bound, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException
+            {
+                out.writeByte(bound.kind().ordinal());
+                out.writeShort(bound.size());
+                ClusteringPrefix.serializer.serializeValuesWithoutSize(bound, out, version, types);
+            }
+
+            public long serializedSize(Slice.Bound bound, int version, List<AbstractType<?>> types)
+            {
+                return 1 // kind ordinal
+                     + TypeSizes.sizeof((short)bound.size())
+                     + ClusteringPrefix.serializer.valuesWithoutSizeSerializedSize(bound, version, types);
+            }
+
+            public Slice.Bound deserialize(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException
+            {
+                Kind kind = Kind.values()[in.readByte()];
+                return deserializeValues(in, kind, version, types);
+            }
+
+            public Slice.Bound deserializeValues(DataInputPlus in, Kind kind, int version, List<AbstractType<?>> types) throws IOException
+            {
+                int size = in.readUnsignedShort();
+                if (size == 0)
+                    return kind.isStart() ? BOTTOM : TOP;
+
+                ByteBuffer[] values = ClusteringPrefix.serializer.deserializeValuesWithoutSize(in, size, version, types);
+                return Slice.Bound.create(kind, values);
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java b/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java
deleted file mode 100644
index 65eefaa..0000000
--- a/src/java/org/apache/cassandra/db/SliceByNamesReadCommand.java
+++ /dev/null

@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-
-import com.google.common.base.Objects;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class SliceByNamesReadCommand extends ReadCommand
-{
-    static final SliceByNamesReadCommandSerializer serializer = new SliceByNamesReadCommandSerializer();
-
-    public final NamesQueryFilter filter;
-
-    public SliceByNamesReadCommand(String keyspaceName, ByteBuffer key, String cfName, long timestamp, NamesQueryFilter filter)
-    {
-        super(keyspaceName, key, cfName, timestamp, Type.GET_BY_NAMES);
-        this.filter = filter;
-    }
-
-    public ReadCommand copy()
-    {
-        return new SliceByNamesReadCommand(ksName, key, cfName, timestamp, filter).setIsDigestQuery(isDigestQuery());
-    }
-
-    public Row getRow(Keyspace keyspace)
-    {
-        DecoratedKey dk = StorageService.getPartitioner().decorateKey(key);
-        return keyspace.getRow(new QueryFilter(dk, cfName, filter, timestamp));
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this)
-                      .add("ksName", ksName)
-                      .add("cfName", cfName)
-                      .add("key", ByteBufferUtil.bytesToHex(key))
-                      .add("filter", filter)
-                      .add("timestamp", timestamp)
-                      .toString();
-    }
-
-    public IDiskAtomFilter filter()
-    {
-        return filter;
-    }
-}
-
-class SliceByNamesReadCommandSerializer implements IVersionedSerializer<ReadCommand>
-{
-    public void serialize(ReadCommand cmd, DataOutputPlus out, int version) throws IOException
-    {
-        SliceByNamesReadCommand command = (SliceByNamesReadCommand) cmd;
-        out.writeBoolean(command.isDigestQuery());
-        out.writeUTF(command.ksName);
-        ByteBufferUtil.writeWithShortLength(command.key, out);
-        out.writeUTF(command.cfName);
-        out.writeLong(cmd.timestamp);
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(cmd.ksName, cmd.cfName);
-        metadata.comparator.namesQueryFilterSerializer().serialize(command.filter, out, version);
-    }
-
-    public ReadCommand deserialize(DataInput in, int version) throws IOException
-    {
-        boolean isDigest = in.readBoolean();
-        String keyspaceName = in.readUTF();
-        ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
-        String cfName = in.readUTF();
-        long timestamp = in.readLong();
-        CFMetaData metadata = Schema.instance.getCFMetaData(keyspaceName, cfName);
-        if (metadata == null)
-        {
-            String message = String.format("Got slice command for nonexistent table %s.%s.  If the table was just " +
-                    "created, this is likely due to the schema not being fully propagated.  Please wait for schema " +
-                    "agreement on table creation.", keyspaceName, cfName);
-            throw new UnknownColumnFamilyException(message, null);
-        }
-        NamesQueryFilter filter = metadata.comparator.namesQueryFilterSerializer().deserialize(in, version);
-        return new SliceByNamesReadCommand(keyspaceName, key, cfName, timestamp, filter).setIsDigestQuery(isDigest);
-    }
-
-    public long serializedSize(ReadCommand cmd, int version)
-    {
-        TypeSizes sizes = TypeSizes.NATIVE;
-        SliceByNamesReadCommand command = (SliceByNamesReadCommand) cmd;
-        int size = sizes.sizeof(command.isDigestQuery());
-        int keySize = command.key.remaining();
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(cmd.ksName, cmd.cfName);
-
-        size += sizes.sizeof(command.ksName);
-        size += sizes.sizeof((short)keySize) + keySize;
-        size += sizes.sizeof(command.cfName);
-        size += sizes.sizeof(cmd.timestamp);
-        size += metadata.comparator.namesQueryFilterSerializer().serializedSize(command.filter, version);
-
-        return size;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/SliceFromReadCommand.java b/src/java/org/apache/cassandra/db/SliceFromReadCommand.java
deleted file mode 100644
index edace9d..0000000
--- a/src/java/org/apache/cassandra/db/SliceFromReadCommand.java
+++ /dev/null

@@ -1,207 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import com.google.common.base.Objects;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.service.RowDataResolver;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
-
-public class SliceFromReadCommand extends ReadCommand
-{
-    private static final Logger logger = LoggerFactory.getLogger(SliceFromReadCommand.class);
-
-    static final SliceFromReadCommandSerializer serializer = new SliceFromReadCommandSerializer();
-
-    public final SliceQueryFilter filter;
-
-    public SliceFromReadCommand(String keyspaceName, ByteBuffer key, String cfName, long timestamp, SliceQueryFilter filter)
-    {
-        super(keyspaceName, key, cfName, timestamp, Type.GET_SLICES);
-        this.filter = filter;
-    }
-
-    public ReadCommand copy()
-    {
-        return new SliceFromReadCommand(ksName, key, cfName, timestamp, filter).setIsDigestQuery(isDigestQuery());
-    }
-
-    public Row getRow(Keyspace keyspace)
-    {
-        CFMetaData cfm = Schema.instance.getCFMetaData(ksName, cfName);
-        DecoratedKey dk = StorageService.getPartitioner().decorateKey(key);
-
-        // If we're doing a reversed query and the filter includes static columns, we need to issue two separate
-        // reads in order to guarantee that the static columns are fetched.  See CASSANDRA-8502 for more details.
-        if (filter.reversed && filter.hasStaticSlice(cfm))
-        {
-            logger.trace("Splitting reversed slice with static columns into two reads");
-            Pair<SliceQueryFilter, SliceQueryFilter> newFilters = filter.splitOutStaticSlice(cfm);
-
-            Row normalResults =  keyspace.getRow(new QueryFilter(dk, cfName, newFilters.right, timestamp));
-            Row staticResults =  keyspace.getRow(new QueryFilter(dk, cfName, newFilters.left, timestamp));
-
-            // add the static results to the start of the normal results
-            if (normalResults.cf == null)
-                return staticResults;
-
-            if (staticResults.cf != null)
-                for (Cell cell : staticResults.cf.getReverseSortedColumns())
-                    normalResults.cf.addColumn(cell);
-
-            return normalResults;
-        }
-
-        return keyspace.getRow(new QueryFilter(dk, cfName, filter, timestamp));
-    }
-
-    @Override
-    public ReadCommand maybeGenerateRetryCommand(RowDataResolver resolver, Row row)
-    {
-        int maxLiveColumns = resolver.getMaxLiveCount();
-
-        int count = filter.count;
-        // We generate a retry if at least one node reply with count live columns but after merge we have less
-        // than the total number of column we are interested in (which may be < count on a retry).
-        // So in particular, if no host returned count live columns, we know it's not a short read.
-        if (maxLiveColumns < count)
-            return null;
-
-        int liveCountInRow = row == null || row.cf == null ? 0 : filter.getLiveCount(row.cf, timestamp);
-        if (liveCountInRow < getOriginalRequestedCount())
-        {
-            // We asked t (= count) live columns and got l (=liveCountInRow) ones.
-            // From that, we can estimate that on this row, for x requested
-            // columns, only l/t end up live after reconciliation. So for next
-            // round we want to ask x column so that x * (l/t) == t, i.e. x = t^2/l.
-            int retryCount = liveCountInRow == 0 ? count + 1 : ((count * count) / liveCountInRow) + 1;
-            SliceQueryFilter newFilter = filter.withUpdatedCount(retryCount);
-            return new RetriedSliceFromReadCommand(ksName, key, cfName, timestamp, newFilter, getOriginalRequestedCount());
-        }
-
-        return null;
-    }
-
-    @Override
-    public Row maybeTrim(Row row)
-    {
-        if ((row == null) || (row.cf == null))
-            return row;
-
-        return new Row(row.key, filter.trim(row.cf, getOriginalRequestedCount(), timestamp));
-    }
-
-    public IDiskAtomFilter filter()
-    {
-        return filter;
-    }
-
-    public SliceFromReadCommand withUpdatedFilter(SliceQueryFilter newFilter)
-    {
-        return new SliceFromReadCommand(ksName, key, cfName, timestamp, newFilter);
-    }
-
-    /**
-     * The original number of columns requested by the user.
-     * This can be different from count when the slice command is a retry (see
-     * RetriedSliceFromReadCommand)
-     */
-    protected int getOriginalRequestedCount()
-    {
-        return filter.count;
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this)
-                      .add("ksName", ksName)
-                      .add("cfName", cfName)
-                      .add("key", ByteBufferUtil.bytesToHex(key))
-                      .add("filter", filter)
-                      .add("timestamp", timestamp)
-                      .toString();
-    }
-}
-
-class SliceFromReadCommandSerializer implements IVersionedSerializer<ReadCommand>
-{
-    public void serialize(ReadCommand rm, DataOutputPlus out, int version) throws IOException
-    {
-        SliceFromReadCommand realRM = (SliceFromReadCommand)rm;
-        out.writeBoolean(realRM.isDigestQuery());
-        out.writeUTF(realRM.ksName);
-        ByteBufferUtil.writeWithShortLength(realRM.key, out);
-        out.writeUTF(realRM.cfName);
-        out.writeLong(realRM.timestamp);
-        CFMetaData metadata = Schema.instance.getCFMetaData(realRM.ksName, realRM.cfName);
-        metadata.comparator.sliceQueryFilterSerializer().serialize(realRM.filter, out, version);
-    }
-
-    public ReadCommand deserialize(DataInput in, int version) throws IOException
-    {
-        boolean isDigest = in.readBoolean();
-        String keyspaceName = in.readUTF();
-        ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
-        String cfName = in.readUTF();
-        long timestamp = in.readLong();
-        CFMetaData metadata = Schema.instance.getCFMetaData(keyspaceName, cfName);
-        if (metadata == null)
-        {
-            String message = String.format("Got slice command for nonexistent table %s.%s.  If the table was just " +
-                    "created, this is likely due to the schema not being fully propagated.  Please wait for schema " +
-                    "agreement on table creation.", keyspaceName, cfName);
-            throw new UnknownColumnFamilyException(message, null);
-        }
-        SliceQueryFilter filter = metadata.comparator.sliceQueryFilterSerializer().deserialize(in, version);
-        return new SliceFromReadCommand(keyspaceName, key, cfName, timestamp, filter).setIsDigestQuery(isDigest);
-    }
-
-    public long serializedSize(ReadCommand cmd, int version)
-    {
-        TypeSizes sizes = TypeSizes.NATIVE;
-        SliceFromReadCommand command = (SliceFromReadCommand) cmd;
-        int keySize = command.key.remaining();
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(cmd.ksName, cmd.cfName);
-
-        int size = sizes.sizeof(cmd.isDigestQuery()); // boolean
-        size += sizes.sizeof(command.ksName);
-        size += sizes.sizeof((short) keySize) + keySize;
-        size += sizes.sizeof(command.cfName);
-        size += sizes.sizeof(cmd.timestamp);
-        size += metadata.comparator.sliceQueryFilterSerializer().serializedSize(command.filter, version);
-
-        return size;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/Slices.java b/src/java/org/apache/cassandra/db/Slices.java
new file mode 100644
index 0000000..269386e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/Slices.java

@@ -0,0 +1,915 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * Represents the selection of multiple range of rows within a partition.
+ * <p>
+ * A {@code Slices} is basically a list of {@code Slice}, though those are guaranteed to be non-overlapping
+ * and always in clustering order.
+ */
+public abstract class Slices implements Iterable<Slice>
+{
+    public static final Serializer serializer = new Serializer();
+
+    /** Slices selecting all the rows of a partition. */
+    public static final Slices ALL = new SelectAllSlices();
+    /** Slices selecting no rows in a partition. */
+    public static final Slices NONE = new SelectNoSlices();
+
+    protected Slices()
+    {
+    }
+
+    /**
+     * Creates a {@code Slices} object that contains a single slice.
+     *
+     * @param comparator the comparator for the table {@code slice} is a slice of.
+     * @param slice the single slice that the return object should contains.
+     *
+     * @return the newly created {@code Slices} object.
+     */
+    public static Slices with(ClusteringComparator comparator, Slice slice)
+    {
+        if (slice.start() == Slice.Bound.BOTTOM && slice.end() == Slice.Bound.TOP)
+            return Slices.ALL;
+
+        assert comparator.compare(slice.start(), slice.end()) <= 0;
+        return new ArrayBackedSlices(comparator, new Slice[]{ slice });
+    }
+
+    /**
+     * Whether the slices has a lower bound, that is whether it's first slice start is {@code Slice.BOTTOM}.
+     *
+     * @return whether the slices has a lower bound.
+     */
+    public abstract boolean hasLowerBound();
+
+    /**
+     * Whether the slices has an upper bound, that is whether it's last slice end is {@code Slice.TOP}.
+     *
+     * @return whether the slices has an upper bound.
+     */
+    public abstract boolean hasUpperBound();
+
+    /**
+     * The number of slice this object contains.
+     *
+     * @return the number of slice this object contains.
+     */
+    public abstract int size();
+
+    /**
+     * Returns the ith slice of this {@code Slices} object.
+     *
+     * @return the ith slice of this object.
+     */
+    public abstract Slice get(int i);
+
+    /**
+     * Returns slices for continuing the paging of those slices given the last returned clustering prefix.
+     *
+     * @param comparator the comparator for the table this is a filter for.
+     * @param lastReturned the last clustering that was returned for the query we are paging for. The
+     * resulting slices will be such that only results coming stricly after {@code lastReturned} are returned
+     * (where coming after means "greater than" if {@code !reversed} and "lesser than" otherwise).
+     * @param inclusive whether or not we want to include the {@code lastReturned} in the newly returned page of results.
+     * @param reversed whether the query we're paging for is reversed or not.
+     *
+     * @return new slices that select results coming after {@code lastReturned}.
+     */
+    public abstract Slices forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive, boolean reversed);
+
+    /**
+     * An object that allows to test whether rows are selected by this {@code Slices} objects assuming those rows
+     * are tested in clustering order.
+     *
+     * @param reversed if true, the rows passed to the returned object will be assumed to be in reversed clustering
+     * order, otherwise they should be in clustering order.
+     *
+     * @return an object that tests for selection of rows by this {@code Slices} object.
+     */
+    public abstract InOrderTester inOrderTester(boolean reversed);
+
+    /**
+     * Whether a given clustering (row) is selected by this {@code Slices} object.
+     *
+     * @param clustering the clustering to test for selection.
+     *
+     * @return whether a given clustering (row) is selected by this {@code Slices} object.
+     */
+    public abstract boolean selects(Clustering clustering);
+
+
+    /**
+     * Given the per-clustering column minimum and maximum value a sstable contains, whether or not this slices potentially
+     * intersects that sstable or not.
+     *
+     * @param minClusteringValues the smallest values for each clustering column that a sstable contains.
+     * @param maxClusteringValues the biggest values for each clustering column that a sstable contains.
+     *
+     * @return whether the slices might intersects with the sstable having {@code minClusteringValues} and
+     * {@code maxClusteringValues}.
+     */
+    public abstract boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues);
+
+    /**
+     * Given a sliceable row iterator, returns a row iterator that only return rows selected by the slice of
+     * this {@code Slices} object.
+     *
+     * @param iter the sliceable iterator to filter.
+     *
+     * @return an iterator that only returns the rows (or rather Unfiltered) of {@code iter} that are selected by those slices.
+     */
+    public abstract UnfilteredRowIterator makeSliceIterator(SliceableUnfilteredRowIterator iter);
+
+    public abstract String toCQLString(CFMetaData metadata);
+
+    /**
+     * Checks if this <code>Slices</code> is empty.
+     * @return <code>true</code> if this <code>Slices</code> is empty, <code>false</code> otherwise.
+     */
+    public final boolean isEmpty()
+    {
+        return size() == 0;
+    }
+
+    /**
+     * In simple object that allows to test the inclusion of rows in those slices assuming those rows
+     * are passed (to {@link #includes}) in clustering order (or reverse clustering ordered, depending
+     * of the argument passed to {@link #inOrderTester}).
+     */
+    public interface InOrderTester
+    {
+        public boolean includes(Clustering value);
+        public boolean isDone();
+    }
+
+    /**
+     * Builder to create {@code Slices} objects.
+     */
+    public static class Builder
+    {
+        private final ClusteringComparator comparator;
+
+        private final List<Slice> slices;
+
+        private boolean needsNormalizing;
+
+        public Builder(ClusteringComparator comparator)
+        {
+            this.comparator = comparator;
+            this.slices = new ArrayList<>();
+        }
+
+        public Builder(ClusteringComparator comparator, int initialSize)
+        {
+            this.comparator = comparator;
+            this.slices = new ArrayList<>(initialSize);
+        }
+
+        public Builder add(Slice.Bound start, Slice.Bound end)
+        {
+            return add(Slice.make(start, end));
+        }
+
+        public Builder add(Slice slice)
+        {
+            assert comparator.compare(slice.start(), slice.end()) <= 0;
+            if (slices.size() > 0 && comparator.compare(slices.get(slices.size()-1).end(), slice.start()) > 0)
+                needsNormalizing = true;
+            slices.add(slice);
+            return this;
+        }
+
+        public Builder addAll(Slices slices)
+        {
+            for (Slice slice : slices)
+                add(slice);
+            return this;
+        }
+
+        public int size()
+        {
+            return slices.size();
+        }
+
+        public Slices build()
+        {
+            if (slices.isEmpty())
+                return NONE;
+
+            if (slices.size() == 1 && slices.get(0) == Slice.ALL)
+                return ALL;
+
+            List<Slice> normalized = needsNormalizing
+                                   ? normalize(slices)
+                                   : slices;
+
+            return new ArrayBackedSlices(comparator, normalized.toArray(new Slice[normalized.size()]));
+        }
+
+        /**
+         * Given an array of slices (potentially overlapping and in any order) and return an equivalent array
+         * of non-overlapping slices in clustering order.
+         *
+         * @param slices an array of slices. This may be modified by this method.
+         * @return the smallest possible array of non-overlapping slices in clustering order. If the original
+         * slices are already non-overlapping and in comparator order, this may or may not return the provided slices
+         * directly.
+         */
+        private List<Slice> normalize(List<Slice> slices)
+        {
+            if (slices.size() <= 1)
+                return slices;
+
+            Collections.sort(slices, new Comparator<Slice>()
+            {
+                @Override
+                public int compare(Slice s1, Slice s2)
+                {
+                    int c = comparator.compare(s1.start(), s2.start());
+                    if (c != 0)
+                        return c;
+
+                    return comparator.compare(s1.end(), s2.end());
+                }
+            });
+
+            List<Slice> slicesCopy = new ArrayList<>(slices.size());
+
+            Slice last = slices.get(0);
+
+            for (int i = 1; i < slices.size(); i++)
+            {
+                Slice s2 = slices.get(i);
+
+                boolean includesStart = last.includes(comparator, s2.start());
+                boolean includesFinish = last.includes(comparator, s2.end());
+
+                if (includesStart && includesFinish)
+                    continue;
+
+                if (!includesStart && !includesFinish)
+                {
+                    slicesCopy.add(last);
+                    last = s2;
+                    continue;
+                }
+
+                if (includesStart)
+                {
+                    last = Slice.make(last.start(), s2.end());
+                    continue;
+                }
+
+                assert !includesFinish;
+            }
+
+            slicesCopy.add(last);
+            return slicesCopy;
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(Slices slices, DataOutputPlus out, int version) throws IOException
+        {
+            int size = slices.size();
+            out.writeUnsignedVInt(size);
+
+            if (size == 0)
+                return;
+
+            List<AbstractType<?>> types = slices == ALL
+                                        ? Collections.<AbstractType<?>>emptyList()
+                                        : ((ArrayBackedSlices)slices).comparator.subtypes();
+
+            for (Slice slice : slices)
+                Slice.serializer.serialize(slice, out, version, types);
+        }
+
+        public long serializedSize(Slices slices, int version)
+        {
+            long size = TypeSizes.sizeofUnsignedVInt(slices.size());
+
+            if (slices.size() == 0)
+                return size;
+
+            List<AbstractType<?>> types = slices instanceof SelectAllSlices
+                                        ? Collections.<AbstractType<?>>emptyList()
+                                        : ((ArrayBackedSlices)slices).comparator.subtypes();
+
+            for (Slice slice : slices)
+                size += Slice.serializer.serializedSize(slice, version, types);
+
+            return size;
+        }
+
+        public Slices deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+        {
+            int size = (int)in.readUnsignedVInt();
+
+            if (size == 0)
+                return NONE;
+
+            Slice[] slices = new Slice[size];
+            for (int i = 0; i < size; i++)
+                slices[i] = Slice.serializer.deserialize(in, version, metadata.comparator.subtypes());
+
+            if (size == 1 && slices[0].start() == Slice.Bound.BOTTOM && slices[0].end() == Slice.Bound.TOP)
+                return ALL;
+
+            return new ArrayBackedSlices(metadata.comparator, slices);
+        }
+    }
+
+    /**
+     * Simple {@code Slices} implementation that stores its slices in an array.
+     */
+    private static class ArrayBackedSlices extends Slices
+    {
+        private final ClusteringComparator comparator;
+
+        private final Slice[] slices;
+
+        private ArrayBackedSlices(ClusteringComparator comparator, Slice[] slices)
+        {
+            this.comparator = comparator;
+            this.slices = slices;
+        }
+
+        public int size()
+        {
+            return slices.length;
+        }
+
+        public boolean hasLowerBound()
+        {
+            return slices[0].start().size() != 0;
+        }
+
+        public boolean hasUpperBound()
+        {
+            return slices[slices.length - 1].end().size() != 0;
+        }
+
+        public Slice get(int i)
+        {
+            return slices[i];
+        }
+
+        public boolean selects(Clustering clustering)
+        {
+            for (int i = 0; i < slices.length; i++)
+            {
+                Slice slice = slices[i];
+                if (comparator.compare(clustering, slice.start()) < 0)
+                    return false;
+
+                if (comparator.compare(clustering, slice.end()) <= 0)
+                    return true;
+            }
+            return false;
+        }
+
+        public InOrderTester inOrderTester(boolean reversed)
+        {
+            return reversed ? new InReverseOrderTester() : new InForwardOrderTester();
+        }
+
+        public Slices forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive, boolean reversed)
+        {
+            return reversed ? forReversePaging(comparator, lastReturned, inclusive) : forForwardPaging(comparator, lastReturned, inclusive);
+        }
+
+        private Slices forForwardPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive)
+        {
+            for (int i = 0; i < slices.length; i++)
+            {
+                Slice slice = slices[i];
+                Slice newSlice = slice.forPaging(comparator, lastReturned, inclusive, false);
+                if (newSlice == null)
+                    continue;
+
+                if (slice == newSlice && i == 0)
+                    return this;
+
+                ArrayBackedSlices newSlices = new ArrayBackedSlices(comparator, Arrays.copyOfRange(slices, i, slices.length));
+                newSlices.slices[0] = newSlice;
+                return newSlices;
+            }
+            return Slices.NONE;
+        }
+
+        private Slices forReversePaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive)
+        {
+            for (int i = slices.length - 1; i >= 0; i--)
+            {
+                Slice slice = slices[i];
+                Slice newSlice = slice.forPaging(comparator, lastReturned, inclusive, true);
+                if (newSlice == null)
+                    continue;
+
+                if (slice == newSlice && i == slices.length - 1)
+                    return this;
+
+                ArrayBackedSlices newSlices = new ArrayBackedSlices(comparator, Arrays.copyOfRange(slices, 0, i + 1));
+                newSlices.slices[i] = newSlice;
+                return newSlices;
+            }
+            return Slices.NONE;
+        }
+
+        public boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        {
+            for (Slice slice : this)
+            {
+                if (slice.intersects(comparator, minClusteringValues, maxClusteringValues))
+                    return true;
+            }
+            return false;
+        }
+
+        public UnfilteredRowIterator makeSliceIterator(final SliceableUnfilteredRowIterator iter)
+        {
+            return new WrappingUnfilteredRowIterator(iter)
+            {
+                private int nextSlice = iter.isReverseOrder() ? slices.length - 1 : 0;
+                private Iterator<Unfiltered> currentSliceIterator = Collections.emptyIterator();
+
+                private Unfiltered next;
+
+                @Override
+                public boolean hasNext()
+                {
+                    prepareNext();
+                    return next != null;
+                }
+
+                @Override
+                public Unfiltered next()
+                {
+                    prepareNext();
+                    Unfiltered toReturn = next;
+                    next = null;
+                    return toReturn;
+                }
+
+                private boolean hasMoreSlice()
+                {
+                    return isReverseOrder()
+                         ? nextSlice >= 0
+                         : nextSlice < slices.length;
+                }
+
+                private Slice popNextSlice()
+                {
+                    return slices[isReverseOrder() ? nextSlice-- : nextSlice++];
+                }
+
+                private void prepareNext()
+                {
+                    if (next != null)
+                        return;
+
+                    while (true)
+                    {
+                        if (currentSliceIterator.hasNext())
+                        {
+                            next = currentSliceIterator.next();
+                            return;
+                        }
+
+                        if (!hasMoreSlice())
+                            return;
+
+                        currentSliceIterator = iter.slice(popNextSlice());
+                    }
+                }
+            };
+        }
+
+        public Iterator<Slice> iterator()
+        {
+            return Iterators.forArray(slices);
+        }
+
+        private class InForwardOrderTester implements InOrderTester
+        {
+            private int idx;
+            private boolean inSlice;
+
+            public boolean includes(Clustering value)
+            {
+                while (idx < slices.length)
+                {
+                    if (!inSlice)
+                    {
+                        int cmp = comparator.compare(value, slices[idx].start());
+                        // value < start
+                        if (cmp < 0)
+                            return false;
+
+                        inSlice = true;
+
+                        if (cmp == 0)
+                            return true;
+                    }
+
+                    // Here, start < value and inSlice
+                    if (comparator.compare(value, slices[idx].end()) <= 0)
+                        return true;
+
+                    ++idx;
+                    inSlice = false;
+                }
+                return false;
+            }
+
+            public boolean isDone()
+            {
+                return idx >= slices.length;
+            }
+        }
+
+        private class InReverseOrderTester implements InOrderTester
+        {
+            private int idx;
+            private boolean inSlice;
+
+            public InReverseOrderTester()
+            {
+                this.idx = slices.length - 1;
+            }
+
+            public boolean includes(Clustering value)
+            {
+                while (idx >= 0)
+                {
+                    if (!inSlice)
+                    {
+                        int cmp = comparator.compare(slices[idx].end(), value);
+                        // value > end
+                        if (cmp > 0)
+                            return false;
+
+                        inSlice = true;
+
+                        if (cmp == 0)
+                            return true;
+                    }
+
+                    // Here, value <= end and inSlice
+                    if (comparator.compare(slices[idx].start(), value) <= 0)
+                        return true;
+
+                    --idx;
+                    inSlice = false;
+                }
+                return false;
+            }
+
+            public boolean isDone()
+            {
+                return idx < 0;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.append("{");
+            for (int i = 0; i < slices.length; i++)
+            {
+                if (i > 0)
+                    sb.append(", ");
+                sb.append(slices[i].toString(comparator));
+            }
+            return sb.append("}").toString();
+        }
+
+        public String toCQLString(CFMetaData metadata)
+        {
+            StringBuilder sb = new StringBuilder();
+
+            // In CQL, condition are expressed by column, so first group things that way,
+            // i.e. for each column, we create a list of what each slice contains on that column
+            int clusteringSize = metadata.clusteringColumns().size();
+            List<List<ComponentOfSlice>> columnComponents = new ArrayList<>(clusteringSize);
+            for (int i = 0; i < clusteringSize; i++)
+            {
+                List<ComponentOfSlice> perSlice = new ArrayList<>();
+                columnComponents.add(perSlice);
+
+                for (int j = 0; j < slices.length; j++)
+                {
+                    ComponentOfSlice c = ComponentOfSlice.fromSlice(i, slices[j]);
+                    if (c != null)
+                        perSlice.add(c);
+                }
+            }
+
+            boolean needAnd = false;
+            for (int i = 0; i < clusteringSize; i++)
+            {
+                ColumnDefinition column = metadata.clusteringColumns().get(i);
+                List<ComponentOfSlice> componentInfo = columnComponents.get(i);
+                if (componentInfo.isEmpty())
+                    break;
+
+                // For a given column, there is only 3 cases that CQL currently generates:
+                //   1) every slice are EQ with the same value, it's a simple '=' relation.
+                //   2) every slice are EQ but with different values, it's a IN relation.
+                //   3) every slice aren't EQ but have the same values, we have inequality relations.
+                // Note that this doesn't cover everything that ReadCommand can express, but
+                // as it's all that CQL support for now, we'll ignore other cases (which would then
+                // display a bogus query but that's not the end of the world).
+                // TODO: we should improve this at some point.
+                ComponentOfSlice first = componentInfo.get(0);
+                if (first.isEQ())
+                {
+                    if (needAnd)
+                        sb.append(" AND ");
+                    needAnd = true;
+
+                    sb.append(column.name);
+
+                    Set<ByteBuffer> values = new LinkedHashSet<>();
+                    for (int j = 0; j < componentInfo.size(); j++)
+                        values.add(componentInfo.get(j).startValue);
+
+                    if (values.size() == 1)
+                    {
+                        sb.append(" = ").append(column.type.getString(first.startValue));
+                    }
+                    else
+                    {
+                        sb.append(" IN (");
+                        int j = 0;
+                        for (ByteBuffer value : values)
+                            sb.append(j++ == 0 ? "" : ", ").append(column.type.getString(value));
+                        sb.append(")");
+                    }
+                }
+                else
+                {
+                    // As said above, we assume (without checking) that this means all ComponentOfSlice for this column
+                    // are the same, so we only bother about the first.
+                    if (first.startValue != null)
+                    {
+                        if (needAnd)
+                            sb.append(" AND ");
+                        needAnd = true;
+                        sb.append(column.name).append(first.startInclusive ? " >= " : " > ").append(column.type.getString(first.startValue));
+                    }
+                    if (first.endValue != null)
+                    {
+                        if (needAnd)
+                            sb.append(" AND ");
+                        needAnd = true;
+                        sb.append(column.name).append(first.endInclusive ? " <= " : " < ").append(column.type.getString(first.endValue));
+                    }
+                }
+            }
+            return sb.toString();
+        }
+
+        // An somewhat adhoc utility class only used by toCQLString
+        private static class ComponentOfSlice
+        {
+            public final boolean startInclusive;
+            public final ByteBuffer startValue;
+            public final boolean endInclusive;
+            public final ByteBuffer endValue;
+
+            private ComponentOfSlice(boolean startInclusive, ByteBuffer startValue, boolean endInclusive, ByteBuffer endValue)
+            {
+                this.startInclusive = startInclusive;
+                this.startValue = startValue;
+                this.endInclusive = endInclusive;
+                this.endValue = endValue;
+            }
+
+            public static ComponentOfSlice fromSlice(int component, Slice slice)
+            {
+                Slice.Bound start = slice.start();
+                Slice.Bound end = slice.end();
+
+                if (component >= start.size() && component >= end.size())
+                    return null;
+
+                boolean startInclusive = true, endInclusive = true;
+                ByteBuffer startValue = null, endValue = null;
+                if (component < start.size())
+                {
+                    startInclusive = start.isInclusive();
+                    startValue = start.get(component);
+                }
+                if (component < end.size())
+                {
+                    endInclusive = end.isInclusive();
+                    endValue = end.get(component);
+                }
+                return new ComponentOfSlice(startInclusive, startValue, endInclusive, endValue);
+            }
+
+            public boolean isEQ()
+            {
+                return Objects.equals(startValue, endValue);
+            }
+        }
+    }
+
+    /**
+     * Specialized implementation of {@code Slices} that selects all rows.
+     * <p>
+     * This is equivalent to having the single {@code Slice.ALL} slice, but is somewhat more effecient.
+     */
+    private static class SelectAllSlices extends Slices
+    {
+        private static final InOrderTester trivialTester = new InOrderTester()
+        {
+            public boolean includes(Clustering value)
+            {
+                return true;
+            }
+
+            public boolean isDone()
+            {
+                return false;
+            }
+        };
+
+        public int size()
+        {
+            return 1;
+        }
+
+        public Slice get(int i)
+        {
+            return Slice.ALL;
+        }
+
+        public boolean hasLowerBound()
+        {
+            return false;
+        }
+
+        public boolean hasUpperBound()
+        {
+            return false;
+        }
+
+        public boolean selects(Clustering clustering)
+        {
+            return true;
+        }
+
+        public Slices forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive, boolean reversed)
+        {
+            return new ArrayBackedSlices(comparator, new Slice[]{ Slice.ALL.forPaging(comparator, lastReturned, inclusive, reversed) });
+        }
+
+        public InOrderTester inOrderTester(boolean reversed)
+        {
+            return trivialTester;
+        }
+
+        public boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        {
+            return true;
+        }
+
+        public UnfilteredRowIterator makeSliceIterator(SliceableUnfilteredRowIterator iter)
+        {
+            return iter;
+        }
+
+        public Iterator<Slice> iterator()
+        {
+            return Iterators.singletonIterator(Slice.ALL);
+        }
+
+        @Override
+        public String toString()
+        {
+            return "ALL";
+        }
+
+        public String toCQLString(CFMetaData metadata)
+        {
+            return "";
+        }
+    }
+
+    /**
+     * Specialized implementation of {@code Slices} that selects no rows.
+     */
+    private static class SelectNoSlices extends Slices
+    {
+        private static final InOrderTester trivialTester = new InOrderTester()
+        {
+            public boolean includes(Clustering value)
+            {
+                return false;
+            }
+
+            public boolean isDone()
+            {
+                return true;
+            }
+        };
+
+        public int size()
+        {
+            return 0;
+        }
+
+        public Slice get(int i)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public boolean hasLowerBound()
+        {
+            return false;
+        }
+
+        public boolean hasUpperBound()
+        {
+            return false;
+        }
+
+        public Slices forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive, boolean reversed)
+        {
+            return this;
+        }
+
+        public boolean selects(Clustering clustering)
+        {
+            return false;
+        }
+
+        public InOrderTester inOrderTester(boolean reversed)
+        {
+            return trivialTester;
+        }
+
+        public boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        {
+            return false;
+        }
+
+        public UnfilteredRowIterator makeSliceIterator(SliceableUnfilteredRowIterator iter)
+        {
+            return UnfilteredRowIterators.noRowsIterator(iter.metadata(), iter.partitionKey(), iter.staticRow(),
+                                                         iter.partitionLevelDeletion(), iter.isReverseOrder());
+        }
+
+        public Iterator<Slice> iterator()
+        {
+            return Iterators.emptyIterator();
+        }
+
+        @Override
+        public String toString()
+        {
+            return "NONE";
+        }
+
+        public String toCQLString(CFMetaData metadata)
+        {
+            return "";
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/SnapshotCommand.java b/src/java/org/apache/cassandra/db/SnapshotCommand.java
index 427e9ec..eb6f67a 100644
--- a/src/java/org/apache/cassandra/db/SnapshotCommand.java
+++ b/src/java/org/apache/cassandra/db/SnapshotCommand.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -67,7 +67,7 @@
         out.writeBoolean(snapshot_command.clear_snapshot);
     }
 
-    public SnapshotCommand deserialize(DataInput in, int version) throws IOException
+    public SnapshotCommand deserialize(DataInputPlus in, int version) throws IOException
     {
         String keyspace = in.readUTF();
         String column_family = in.readUTF();
@@ -78,9 +78,9 @@
 
     public long serializedSize(SnapshotCommand sc, int version)
     {
-        return TypeSizes.NATIVE.sizeof(sc.keyspace)
-             + TypeSizes.NATIVE.sizeof(sc.column_family)
-             + TypeSizes.NATIVE.sizeof(sc.snapshot_name)
-             + TypeSizes.NATIVE.sizeof(sc.clear_snapshot);
+        return TypeSizes.sizeof(sc.keyspace)
+             + TypeSizes.sizeof(sc.column_family)
+             + TypeSizes.sizeof(sc.snapshot_name)
+             + TypeSizes.sizeof(sc.clear_snapshot);
     }
 }

diff --git a/src/java/org/apache/cassandra/db/SuperColumns.java b/src/java/org/apache/cassandra/db/SuperColumns.java
deleted file mode 100644
index 65e153f..0000000
--- a/src/java/org/apache/cassandra/db/SuperColumns.java
+++ /dev/null

@@ -1,230 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.DataInput;
-import java.io.IOError;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class SuperColumns
-{
-    public static Iterator<OnDiskAtom> onDiskIterator(DataInput in, int superColumnCount, ColumnSerializer.Flag flag, int expireBefore, CellNameType type)
-    {
-        return new SCIterator(in, superColumnCount, flag, expireBefore, type);
-    }
-
-    public static void deserializerSuperColumnFamily(DataInput in, ColumnFamily cf, ColumnSerializer.Flag flag, int version) throws IOException
-    {
-        // Note that there was no way to insert a range tombstone in a SCF in 1.2
-        cf.delete(cf.getComparator().deletionInfoSerializer().deserialize(in, version));
-        assert !cf.deletionInfo().rangeIterator().hasNext();
-
-        Iterator<OnDiskAtom> iter = onDiskIterator(in, in.readInt(), flag, Integer.MIN_VALUE, cf.getComparator());
-        while (iter.hasNext())
-            cf.addAtom(iter.next());
-    }
-
-    private static class SCIterator implements Iterator<OnDiskAtom>
-    {
-        private final DataInput in;
-        private final int scCount;
-
-        private final ColumnSerializer.Flag flag;
-        private final int expireBefore;
-
-        private final CellNameType type;
-
-        private int read;
-        private ByteBuffer scName;
-        private Iterator<Cell> subColumnsIterator;
-
-        private SCIterator(DataInput in, int superColumnCount, ColumnSerializer.Flag flag, int expireBefore, CellNameType type)
-        {
-            this.in = in;
-            this.scCount = superColumnCount;
-            this.flag = flag;
-            this.expireBefore = expireBefore;
-            this.type = type;
-        }
-
-        public boolean hasNext()
-        {
-            return (subColumnsIterator != null && subColumnsIterator.hasNext()) || read < scCount;
-        }
-
-        public OnDiskAtom next()
-        {
-            try
-            {
-                if (subColumnsIterator != null && subColumnsIterator.hasNext())
-                {
-                    Cell c = subColumnsIterator.next();
-                    return c.withUpdatedName(type.makeCellName(scName, c.name().toByteBuffer()));
-                }
-
-                // Read one more super column
-                ++read;
-
-                scName = ByteBufferUtil.readWithShortLength(in);
-                DeletionInfo delInfo = new DeletionInfo(DeletionTime.serializer.deserialize(in));
-
-                /* read the number of columns */
-                int size = in.readInt();
-                List<Cell> subCells = new ArrayList<>(size);
-
-                ColumnSerializer colSer = subType(type).columnSerializer();
-                for (int i = 0; i < size; ++i)
-                    subCells.add(colSer.deserialize(in, flag, expireBefore));
-
-                subColumnsIterator = subCells.iterator();
-
-                // If the SC was deleted, return that first, otherwise return the first subcolumn
-                DeletionTime dtime = delInfo.getTopLevelDeletion();
-                if (!dtime.equals(DeletionTime.LIVE))
-                    return new RangeTombstone(startOf(scName), endOf(scName), dtime);
-
-                return next();
-            }
-            catch (IOException e)
-            {
-                throw new IOError(e);
-            }
-        }
-
-        public void remove()
-        {
-            throw new UnsupportedOperationException();
-        }
-    }
-
-    private static CellNameType subType(CellNameType type)
-    {
-        return new SimpleDenseCellNameType(type.subtype(1));
-    }
-
-    public static CellNameType scNameType(CellNameType type)
-    {
-        return new SimpleDenseCellNameType(type.subtype(0));
-    }
-
-    public static AbstractType<?> getComparatorFor(CFMetaData metadata, ByteBuffer superColumn)
-    {
-        return getComparatorFor(metadata, superColumn != null);
-    }
-
-    public static AbstractType<?> getComparatorFor(CFMetaData metadata, boolean subColumn)
-    {
-        return metadata.isSuper()
-             ? metadata.comparator.subtype(subColumn ? 1 : 0)
-             : metadata.comparator.asAbstractType();
-    }
-
-    // Extract the first component of a columnName, i.e. the super column name
-    public static ByteBuffer scName(Composite columnName)
-    {
-        return columnName.get(0);
-    }
-
-    // Extract the 2nd component of a columnName, i.e. the sub-column name
-    public static ByteBuffer subName(Composite columnName)
-    {
-        return columnName.get(1);
-    }
-
-    public static Composite startOf(ByteBuffer scName)
-    {
-        return CellNames.compositeDense(scName).start();
-    }
-
-    public static Composite endOf(ByteBuffer scName)
-    {
-        return CellNames.compositeDense(scName).end();
-    }
-
-    public static IDiskAtomFilter fromSCFilter(CellNameType type, ByteBuffer scName, IDiskAtomFilter filter)
-    {
-        if (filter instanceof NamesQueryFilter)
-            return fromSCNamesFilter(type, scName, (NamesQueryFilter)filter);
-        else
-            return fromSCSliceFilter(type, scName, (SliceQueryFilter)filter);
-    }
-
-    public static IDiskAtomFilter fromSCNamesFilter(CellNameType type, ByteBuffer scName, NamesQueryFilter filter)
-    {
-        if (scName == null)
-        {
-            ColumnSlice[] slices = new ColumnSlice[filter.columns.size()];
-            int i = 0;
-            for (CellName name : filter.columns)
-            {
-                // Note that, because the filter in argument is the one from thrift, 'name' are SimpleDenseCellName.
-                // So calling name.slice() would be incorrect, as simple cell names don't handle the EOC properly.
-                // This is why we call buffer() and rebuild a  Composite of the right type before call slice().
-                slices[i++] = type.make(name.toByteBuffer()).slice();
-            }
-            return new SliceQueryFilter(slices, false, slices.length, 1);
-        }
-        else
-        {
-            SortedSet<CellName> newColumns = new TreeSet<>(type);
-            for (CellName c : filter.columns)
-                newColumns.add(type.makeCellName(scName, c.toByteBuffer()));
-            return filter.withUpdatedColumns(newColumns);
-        }
-    }
-
-    public static SliceQueryFilter fromSCSliceFilter(CellNameType type, ByteBuffer scName, SliceQueryFilter filter)
-    {
-        assert filter.slices.length == 1;
-        if (scName == null)
-        {
-            // The filter is on the super column name
-            CBuilder builder = type.builder();
-            Composite start = filter.start().isEmpty()
-                            ? Composites.EMPTY
-                            : builder.buildWith(filter.start().toByteBuffer()).withEOC(filter.reversed ? Composite.EOC.END : Composite.EOC.START);
-            Composite finish = filter.finish().isEmpty()
-                             ? Composites.EMPTY
-                             : builder.buildWith(filter.finish().toByteBuffer()).withEOC(filter.reversed ? Composite.EOC.START : Composite.EOC.END);
-            return new SliceQueryFilter(start, finish, filter.reversed, filter.count, 1);
-        }
-        else
-        {
-            CBuilder builder = type.builder().add(scName);
-            Composite start = filter.start().isEmpty()
-                            ? builder.build().withEOC(filter.reversed ? Composite.EOC.END : Composite.EOC.START)
-                            : builder.buildWith(filter.start().toByteBuffer());
-            Composite end = filter.finish().isEmpty()
-                          ? builder.build().withEOC(filter.reversed ? Composite.EOC.START : Composite.EOC.END)
-                          : builder.buildWith(filter.finish().toByteBuffer());
-            return new SliceQueryFilter(start, end, filter.reversed, filter.count);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java
index e0d5f66..7c222dd 100644
--- a/src/java/org/apache/cassandra/db/SystemKeyspace.java
+++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java

@@ -17,42 +17,46 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.*;
+import java.io.File;
+import java.io.IOError;
+import java.io.IOException;
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
 import javax.management.openmbean.OpenDataException;
 import javax.management.openmbean.TabularData;
+import java.util.concurrent.Future;
 
-import com.google.common.base.Function;
-import com.google.common.collect.*;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.SetMultimap;
 import com.google.common.io.ByteStreams;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.util.concurrent.Futures;
+
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.functions.*;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.compaction.CompactionHistoryTabularData;
-import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.locator.IEndpointSnitch;
-import org.apache.cassandra.locator.LocalStrategy;
 import org.apache.cassandra.metrics.RestorableMeter;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.paxos.Commit;
 import org.apache.cassandra.service.paxos.PaxosState;
@@ -60,11 +64,18 @@
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.*;
 
+import static java.util.Collections.emptyMap;
+import static java.util.Collections.singletonMap;
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
+import static org.apache.cassandra.io.util.FileUtils.visitDirectory;
 
 public final class SystemKeyspace
 {
+    private SystemKeyspace()
+    {
+    }
+
     private static final Logger logger = LoggerFactory.getLogger(SystemKeyspace.class);
 
     // Used to indicate that there was a previous version written to the legacy (pre 1.2)
@@ -78,43 +89,40 @@
 
     public static final String NAME = "system";
 
-    public static final String HINTS = "hints";
-    public static final String BATCHLOG = "batchlog";
+    public static final String BATCHES = "batches";
     public static final String PAXOS = "paxos";
     public static final String BUILT_INDEXES = "IndexInfo";
     public static final String LOCAL = "local";
     public static final String PEERS = "peers";
     public static final String PEER_EVENTS = "peer_events";
     public static final String RANGE_XFERS = "range_xfers";
-    public static final String COMPACTIONS_IN_PROGRESS = "compactions_in_progress";
     public static final String COMPACTION_HISTORY = "compaction_history";
     public static final String SSTABLE_ACTIVITY = "sstable_activity";
     public static final String SIZE_ESTIMATES = "size_estimates";
     public static final String AVAILABLE_RANGES = "available_ranges";
+    public static final String VIEWS_BUILDS_IN_PROGRESS = "views_builds_in_progress";
+    public static final String BUILT_VIEWS = "built_views";
 
-    public static final CFMetaData Hints =
-        compile(HINTS,
-                "hints awaiting delivery",
-                "CREATE TABLE %s ("
-                + "target_id uuid,"
-                + "hint_id timeuuid,"
-                + "message_version int,"
-                + "mutation blob,"
-                + "PRIMARY KEY ((target_id), hint_id, message_version)) "
-                + "WITH COMPACT STORAGE")
-                .compactionStrategyOptions(Collections.singletonMap("enabled", "false"))
-                .gcGraceSeconds(0);
+    @Deprecated public static final String LEGACY_HINTS = "hints";
+    @Deprecated public static final String LEGACY_BATCHLOG = "batchlog";
+    @Deprecated public static final String LEGACY_KEYSPACES = "schema_keyspaces";
+    @Deprecated public static final String LEGACY_COLUMNFAMILIES = "schema_columnfamilies";
+    @Deprecated public static final String LEGACY_COLUMNS = "schema_columns";
+    @Deprecated public static final String LEGACY_TRIGGERS = "schema_triggers";
+    @Deprecated public static final String LEGACY_USERTYPES = "schema_usertypes";
+    @Deprecated public static final String LEGACY_FUNCTIONS = "schema_functions";
+    @Deprecated public static final String LEGACY_AGGREGATES = "schema_aggregates";
 
-    public static final CFMetaData Batchlog =
-        compile(BATCHLOG,
+    public static final CFMetaData Batches =
+        compile(BATCHES,
                 "batches awaiting replay",
                 "CREATE TABLE %s ("
-                + "id uuid,"
-                + "data blob,"
+                + "id timeuuid,"
+                + "mutations list<blob>,"
                 + "version int,"
-                + "written_at timestamp,"
                 + "PRIMARY KEY ((id)))")
-                .compactionStrategyOptions(Collections.singletonMap("min_threshold", "2"))
+                .copy(new LocalPartitioner(TimeUUIDType.instance))
+                .compaction(CompactionParams.scts(singletonMap("min_threshold", "2")))
                 .gcGraceSeconds(0);
 
     private static final CFMetaData Paxos =
@@ -126,17 +134,18 @@
                 + "in_progress_ballot timeuuid,"
                 + "most_recent_commit blob,"
                 + "most_recent_commit_at timeuuid,"
+                + "most_recent_commit_version int,"
                 + "proposal blob,"
                 + "proposal_ballot timeuuid,"
+                + "proposal_version int,"
                 + "PRIMARY KEY ((row_key), cf_id))")
-                .compactionStrategyClass(LeveledCompactionStrategy.class);
+                .compaction(CompactionParams.lcs(emptyMap()));
 
-    // TODO: make private
-    public static final CFMetaData BuiltIndexes =
+    private static final CFMetaData BuiltIndexes =
         compile(BUILT_INDEXES,
                 "built column indexes",
                 "CREATE TABLE \"%s\" ("
-                + "table_name text,"
+                + "table_name text," // table_name here is the name of the keyspace - don't be fooled
                 + "index_name text,"
                 + "PRIMARY KEY ((table_name), index_name)) "
                 + "WITH COMPACT STORAGE");
@@ -196,16 +205,6 @@
                 + "requested_at timestamp,"
                 + "PRIMARY KEY ((token_bytes)))");
 
-    private static final CFMetaData CompactionsInProgress =
-        compile(COMPACTIONS_IN_PROGRESS,
-                "unfinished compactions",
-                "CREATE TABLE %s ("
-                + "id uuid,"
-                + "columnfamily_name text,"
-                + "inputs set<int>,"
-                + "keyspace_name text,"
-                + "PRIMARY KEY ((id)))");
-
     private static final CFMetaData CompactionHistory =
         compile(COMPACTION_HISTORY,
                 "week-long compaction history",
@@ -246,11 +245,171 @@
 
     private static final CFMetaData AvailableRanges =
         compile(AVAILABLE_RANGES,
-                "Available keyspace/ranges during bootstrap/replace that are ready to be served",
+                "available keyspace/ranges during bootstrap/replace that are ready to be served",
                 "CREATE TABLE %s ("
-                        + "keyspace_name text PRIMARY KEY,"
-                        + "ranges set<blob>"
-                        + ")");
+                + "keyspace_name text,"
+                + "ranges set<blob>,"
+                + "PRIMARY KEY ((keyspace_name)))");
+
+    private static final CFMetaData ViewsBuildsInProgress =
+        compile(VIEWS_BUILDS_IN_PROGRESS,
+                "views builds current progress",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "view_name text,"
+                + "last_token varchar,"
+                + "generation_number int,"
+                + "PRIMARY KEY ((keyspace_name), view_name))");
+
+    private static final CFMetaData BuiltViews =
+        compile(BUILT_VIEWS,
+                "built views",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "view_name text,"
+                + "PRIMARY KEY ((keyspace_name), view_name))");
+
+    @Deprecated
+    public static final CFMetaData LegacyHints =
+        compile(LEGACY_HINTS,
+                "*DEPRECATED* hints awaiting delivery",
+                "CREATE TABLE %s ("
+                + "target_id uuid,"
+                + "hint_id timeuuid,"
+                + "message_version int,"
+                + "mutation blob,"
+                + "PRIMARY KEY ((target_id), hint_id, message_version)) "
+                + "WITH COMPACT STORAGE")
+                .compaction(CompactionParams.scts(singletonMap("enabled", "false")))
+                .gcGraceSeconds(0);
+
+    @Deprecated
+    public static final CFMetaData LegacyBatchlog =
+        compile(LEGACY_BATCHLOG,
+                "*DEPRECATED* batchlog entries",
+                "CREATE TABLE %s ("
+                + "id uuid,"
+                + "data blob,"
+                + "version int,"
+                + "written_at timestamp,"
+                + "PRIMARY KEY ((id)))")
+                .compaction(CompactionParams.scts(singletonMap("min_threshold", "2")))
+                .gcGraceSeconds(0);
+
+    @Deprecated
+    public static final CFMetaData LegacyKeyspaces =
+        compile(LEGACY_KEYSPACES,
+                "*DEPRECATED* keyspace definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "durable_writes boolean,"
+                + "strategy_class text,"
+                + "strategy_options text,"
+                + "PRIMARY KEY ((keyspace_name))) "
+                + "WITH COMPACT STORAGE");
+
+    @Deprecated
+    public static final CFMetaData LegacyColumnfamilies =
+        compile(LEGACY_COLUMNFAMILIES,
+                "*DEPRECATED* table definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "columnfamily_name text,"
+                + "bloom_filter_fp_chance double,"
+                + "caching text,"
+                + "cf_id uuid," // post-2.1 UUID cfid
+                + "comment text,"
+                + "compaction_strategy_class text,"
+                + "compaction_strategy_options text,"
+                + "comparator text,"
+                + "compression_parameters text,"
+                + "default_time_to_live int,"
+                + "default_validator text,"
+                + "dropped_columns map<text, bigint>,"
+                + "gc_grace_seconds int,"
+                + "is_dense boolean,"
+                + "key_validator text,"
+                + "local_read_repair_chance double,"
+                + "max_compaction_threshold int,"
+                + "max_index_interval int,"
+                + "memtable_flush_period_in_ms int,"
+                + "min_compaction_threshold int,"
+                + "min_index_interval int,"
+                + "read_repair_chance double,"
+                + "speculative_retry text,"
+                + "subcomparator text,"
+                + "type text,"
+                + "PRIMARY KEY ((keyspace_name), columnfamily_name))");
+
+    @Deprecated
+    public static final CFMetaData LegacyColumns =
+        compile(LEGACY_COLUMNS,
+                "*DEPRECATED* column definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "columnfamily_name text,"
+                + "column_name text,"
+                + "component_index int,"
+                + "index_name text,"
+                + "index_options text,"
+                + "index_type text,"
+                + "type text,"
+                + "validator text,"
+                + "PRIMARY KEY ((keyspace_name), columnfamily_name, column_name))");
+
+    @Deprecated
+    public static final CFMetaData LegacyTriggers =
+        compile(LEGACY_TRIGGERS,
+                "*DEPRECATED* trigger definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "columnfamily_name text,"
+                + "trigger_name text,"
+                + "trigger_options map<text, text>,"
+                + "PRIMARY KEY ((keyspace_name), columnfamily_name, trigger_name))");
+
+    @Deprecated
+    public static final CFMetaData LegacyUsertypes =
+        compile(LEGACY_USERTYPES,
+                "*DEPRECATED* user defined type definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "type_name text,"
+                + "field_names list<text>,"
+                + "field_types list<text>,"
+                + "PRIMARY KEY ((keyspace_name), type_name))");
+
+    @Deprecated
+    public static final CFMetaData LegacyFunctions =
+        compile(LEGACY_FUNCTIONS,
+                "*DEPRECATED* user defined function definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "function_name text,"
+                + "signature frozen<list<text>>,"
+                + "argument_names list<text>,"
+                + "argument_types list<text>,"
+                + "body text,"
+                + "language text,"
+                + "return_type text,"
+                + "called_on_null_input boolean,"
+                + "PRIMARY KEY ((keyspace_name), function_name, signature))");
+
+    @Deprecated
+    public static final CFMetaData LegacyAggregates =
+        compile(LEGACY_AGGREGATES,
+                "*DEPRECATED* user defined aggregate definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "aggregate_name text,"
+                + "signature frozen<list<text>>,"
+                + "argument_types list<text>,"
+                + "final_func text,"
+                + "initcond blob,"
+                + "return_type text,"
+                + "state_func text,"
+                + "state_type text,"
+                + "PRIMARY KEY ((keyspace_name), aggregate_name, signature))");
 
     private static CFMetaData compile(String name, String description, String schema)
     {
@@ -258,24 +417,45 @@
                          .comment(description);
     }
 
-    public static KSMetaData definition()
+    public static KeyspaceMetadata metadata()
     {
-        Iterable<CFMetaData> tables =
-            Iterables.concat(LegacySchemaTables.All,
-                             Arrays.asList(BuiltIndexes,
-                                           Hints,
-                                           Batchlog,
-                                           Paxos,
-                                           Local,
-                                           Peers,
-                                           PeerEvents,
-                                           RangeXfers,
-                                           CompactionsInProgress,
-                                           CompactionHistory,
-                                           SSTableActivity,
-                                           SizeEstimates,
-                                           AvailableRanges));
-        return new KSMetaData(NAME, LocalStrategy.class, Collections.<String, String>emptyMap(), true, tables);
+        return KeyspaceMetadata.create(NAME, KeyspaceParams.local(), tables(), Views.none(), Types.none(), functions());
+    }
+
+    private static Tables tables()
+    {
+        return Tables.of(BuiltIndexes,
+                         Batches,
+                         Paxos,
+                         Local,
+                         Peers,
+                         PeerEvents,
+                         RangeXfers,
+                         CompactionHistory,
+                         SSTableActivity,
+                         SizeEstimates,
+                         AvailableRanges,
+                         ViewsBuildsInProgress,
+                         BuiltViews,
+                         LegacyHints,
+                         LegacyBatchlog,
+                         LegacyKeyspaces,
+                         LegacyColumnfamilies,
+                         LegacyColumns,
+                         LegacyTriggers,
+                         LegacyUsertypes,
+                         LegacyFunctions,
+                         LegacyAggregates);
+    }
+
+    private static Functions functions()
+    {
+        return Functions.builder()
+                        .add(UuidFcts.all())
+                        .add(TimeFcts.all())
+                        .add(BytesConversionFcts.all())
+                        .add(AggregateFcts.all())
+                        .build();
     }
 
     private static volatile Map<UUID, Pair<ReplayPosition, Long>> truncationRecords;
@@ -284,17 +464,13 @@
     {
         NEEDS_BOOTSTRAP,
         COMPLETED,
-        IN_PROGRESS
-    }
-
-    private static DecoratedKey decorate(ByteBuffer key)
-    {
-        return StorageService.getPartitioner().decorateKey(key);
+        IN_PROGRESS,
+        DECOMMISSIONED
     }
 
     public static void finishStartup()
     {
-        LegacySchemaTables.saveSystemKeyspaceSchema();
+        SchemaKeyspace.saveSystemKeyspacesSchema();
     }
 
     public static void persistLocalMetadata()
@@ -329,81 +505,6 @@
                             FBUtilities.getLocalAddress());
     }
 
-    /**
-     * Write compaction log, except columfamilies under system keyspace.
-     *
-     * @param cfs cfs to compact
-     * @param toCompact sstables to compact
-     * @return compaction task id or null if cfs is under system keyspace
-     */
-    public static UUID startCompaction(ColumnFamilyStore cfs, Iterable<SSTableReader> toCompact)
-    {
-        if (NAME.equals(cfs.keyspace.getName()))
-            return null;
-
-        UUID compactionId = UUIDGen.getTimeUUID();
-        Iterable<Integer> generations = Iterables.transform(toCompact, new Function<SSTableReader, Integer>()
-        {
-            public Integer apply(SSTableReader sstable)
-            {
-                return sstable.descriptor.generation;
-            }
-        });
-        String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, inputs) VALUES (?, ?, ?, ?)";
-        executeInternal(String.format(req, COMPACTIONS_IN_PROGRESS), compactionId, cfs.keyspace.getName(), cfs.name, Sets.newHashSet(generations));
-        forceBlockingFlush(COMPACTIONS_IN_PROGRESS);
-        return compactionId;
-    }
-
-    /**
-     * Deletes the entry for this compaction from the set of compactions in progress.  The compaction does not need
-     * to complete successfully for this to be called.
-     * @param taskId what was returned from {@code startCompaction}
-     */
-    public static void finishCompaction(UUID taskId)
-    {
-        assert taskId != null;
-
-        executeInternal(String.format("DELETE FROM system.%s WHERE id = ?", COMPACTIONS_IN_PROGRESS), taskId);
-        forceBlockingFlush(COMPACTIONS_IN_PROGRESS);
-    }
-
-    /**
-     * Returns a Map whose keys are KS.CF pairs and whose values are maps from sstable generation numbers to the
-     * task ID of the compaction they were participating in.
-     */
-    public static Map<Pair<String, String>, Map<Integer, UUID>> getUnfinishedCompactions()
-    {
-        String req = "SELECT * FROM system.%s";
-        UntypedResultSet resultSet = executeInternal(String.format(req, COMPACTIONS_IN_PROGRESS));
-
-        Map<Pair<String, String>, Map<Integer, UUID>> unfinishedCompactions = new HashMap<>();
-        for (UntypedResultSet.Row row : resultSet)
-        {
-            String keyspace = row.getString("keyspace_name");
-            String columnfamily = row.getString("columnfamily_name");
-            Set<Integer> inputs = row.getSet("inputs", Int32Type.instance);
-            UUID taskID = row.getUUID("id");
-
-            Pair<String, String> kscf = Pair.create(keyspace, columnfamily);
-            Map<Integer, UUID> generationToTaskID = unfinishedCompactions.get(kscf);
-            if (generationToTaskID == null)
-                generationToTaskID = new HashMap<>(inputs.size());
-
-            for (Integer generation : inputs)
-                generationToTaskID.put(generation, taskID);
-
-            unfinishedCompactions.put(kscf, generationToTaskID);
-        }
-        return unfinishedCompactions;
-    }
-
-    public static void discardCompactionsInProgress()
-    {
-        ColumnFamilyStore compactionLog = Keyspace.open(NAME).getColumnFamilyStore(COMPACTIONS_IN_PROGRESS);
-        compactionLog.truncateBlocking(false);
-    }
-
     public static void updateCompactionHistory(String ksname,
                                                String cfname,
                                                long compactedAt,
@@ -415,7 +516,14 @@
         if (ksname.equals("system") && cfname.equals(COMPACTION_HISTORY))
             return;
         String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)";
-        executeInternal(String.format(req, COMPACTION_HISTORY), UUIDGen.getTimeUUID(), ksname, cfname, ByteBufferUtil.bytes(compactedAt), bytesIn, bytesOut, rowsMerged);
+        executeInternal(String.format(req, COMPACTION_HISTORY),
+                        UUIDGen.getTimeUUID(),
+                        ksname,
+                        cfname,
+                        ByteBufferUtil.bytes(compactedAt),
+                        bytesIn,
+                        bytesOut,
+                        rowsMerged);
     }
 
     public static TabularData getCompactionHistory() throws OpenDataException
@@ -424,6 +532,82 @@
         return CompactionHistoryTabularData.from(queryResultSet);
     }
 
+    public static boolean isViewBuilt(String keyspaceName, String viewName)
+    {
+        String req = "SELECT view_name FROM %s.\"%s\" WHERE keyspace_name=? AND view_name=?";
+        UntypedResultSet result = executeInternal(String.format(req, NAME, BUILT_VIEWS), keyspaceName, viewName);
+        return !result.isEmpty();
+    }
+
+    public static void setViewBuilt(String keyspaceName, String viewName)
+    {
+        String req = "INSERT INTO %s.\"%s\" (keyspace_name, view_name) VALUES (?, ?)";
+        executeInternal(String.format(req, NAME, BUILT_VIEWS), keyspaceName, viewName);
+        forceBlockingFlush(BUILT_VIEWS);
+    }
+
+
+    public static void setViewRemoved(String keyspaceName, String viewName)
+    {
+        String buildReq = "DELETE FROM %S.%s WHERE keyspace_name = ? AND view_name = ?";
+        executeInternal(String.format(buildReq, NAME, VIEWS_BUILDS_IN_PROGRESS), keyspaceName, viewName);
+        forceBlockingFlush(VIEWS_BUILDS_IN_PROGRESS);
+
+        String builtReq = "DELETE FROM %s.\"%s\" WHERE keyspace_name = ? AND view_name = ?";
+        executeInternal(String.format(builtReq, NAME, BUILT_VIEWS), keyspaceName, viewName);
+        forceBlockingFlush(BUILT_VIEWS);
+    }
+
+    public static void beginViewBuild(String ksname, String viewName, int generationNumber)
+    {
+        executeInternal(String.format("INSERT INTO system.%s (keyspace_name, view_name, generation_number) VALUES (?, ?, ?)", VIEWS_BUILDS_IN_PROGRESS),
+                        ksname,
+                        viewName,
+                        generationNumber);
+    }
+
+    public static void finishViewBuildStatus(String ksname, String viewName)
+    {
+        // We flush the view built first, because if we fail now, we'll restart at the last place we checkpointed
+        // view build.
+        // If we flush the delete first, we'll have to restart from the beginning.
+        // Also, if the build succeeded, but the view build failed, we will be able to skip the view build check
+        // next boot.
+        setViewBuilt(ksname, viewName);
+        forceBlockingFlush(BUILT_VIEWS);
+        executeInternal(String.format("DELETE FROM system.%s WHERE keyspace_name = ? AND view_name = ?", VIEWS_BUILDS_IN_PROGRESS), ksname, viewName);
+        forceBlockingFlush(VIEWS_BUILDS_IN_PROGRESS);
+    }
+
+    public static void updateViewBuildStatus(String ksname, String viewName, Token token)
+    {
+        String req = "INSERT INTO system.%s (keyspace_name, view_name, last_token) VALUES (?, ?, ?)";
+        Token.TokenFactory factory = ViewsBuildsInProgress.partitioner.getTokenFactory();
+        executeInternal(String.format(req, VIEWS_BUILDS_IN_PROGRESS), ksname, viewName, factory.toString(token));
+    }
+
+    public static Pair<Integer, Token> getViewBuildStatus(String ksname, String viewName)
+    {
+        String req = "SELECT generation_number, last_token FROM system.%s WHERE keyspace_name = ? AND view_name = ?";
+        UntypedResultSet queryResultSet = executeInternal(String.format(req, VIEWS_BUILDS_IN_PROGRESS), ksname, viewName);
+        if (queryResultSet == null || queryResultSet.isEmpty())
+            return null;
+
+        UntypedResultSet.Row row = queryResultSet.one();
+
+        Integer generation = null;
+        Token lastKey = null;
+        if (row.has("generation_number"))
+            generation = row.getInt("generation_number");
+        if (row.has("last_key"))
+        {
+            Token.TokenFactory factory = ViewsBuildsInProgress.partitioner.getTokenFactory();
+            lastKey = factory.fromString(row.getString("last_key"));
+        }
+
+        return Pair.create(generation, lastKey);
+    }
+
     public static synchronized void saveTruncationRecord(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position)
     {
         String req = "UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'";
@@ -449,7 +633,7 @@
         {
             ReplayPosition.serializer.serialize(position, out);
             out.writeLong(truncatedAt);
-            return Collections.singletonMap(cfs.metadata.cfId, ByteBuffer.wrap(out.getData(), 0, out.getLength()));
+            return singletonMap(cfs.metadata.cfId, ByteBuffer.wrap(out.getData(), 0, out.getLength()));
         }
         catch (IOException e)
         {
@@ -494,9 +678,8 @@
 
     private static Pair<ReplayPosition, Long> truncationRecordFromBlob(ByteBuffer bytes)
     {
-        try
+        try (RebufferingInputStream in = new DataInputBuffer(bytes, true))
         {
-            DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(bytes));
             return Pair.create(ReplayPosition.serializer.deserialize(in), in.available() > 0 ? in.readLong() : Long.MIN_VALUE);
         }
         catch (IOException e)
@@ -508,32 +691,41 @@
     /**
      * Record tokens being used by another node
      */
-    public static synchronized void updateTokens(InetAddress ep, Collection<Token> tokens)
+    public static Future<?> updateTokens(final InetAddress ep, final Collection<Token> tokens, ExecutorService executorService)
     {
         if (ep.equals(FBUtilities.getBroadcastAddress()))
-        {
-            removeEndpoint(ep);
-            return;
-        }
+            return Futures.immediateFuture(null);
 
         String req = "INSERT INTO system.%s (peer, tokens) VALUES (?, ?)";
-        executeInternal(String.format(req, PEERS), ep, tokensAsSet(tokens));
+        return executorService.submit((Runnable) () -> executeInternal(String.format(req, PEERS), ep, tokensAsSet(tokens)));
     }
 
-    public static synchronized void updatePreferredIP(InetAddress ep, InetAddress preferred_ip)
+    public static void updatePreferredIP(InetAddress ep, InetAddress preferred_ip)
     {
         String req = "INSERT INTO system.%s (peer, preferred_ip) VALUES (?, ?)";
         executeInternal(String.format(req, PEERS), ep, preferred_ip);
         forceBlockingFlush(PEERS);
     }
 
-    public static synchronized void updatePeerInfo(InetAddress ep, String columnName, Object value)
+    public static Future<?> updatePeerInfo(final InetAddress ep, final String columnName, final Object value, ExecutorService executorService)
+    {
+        if (ep.equals(FBUtilities.getBroadcastAddress()))
+            return Futures.immediateFuture(null);
+
+        String req = "INSERT INTO system.%s (peer, %s) VALUES (?, ?)";
+        return executorService.submit((Runnable) () -> executeInternal(String.format(req, PEERS, columnName), ep, value));
+    }
+
+    public static void updatePeerReleaseVersion(final InetAddress ep, final Object value, Runnable postUpdateTask, ExecutorService executorService)
     {
         if (ep.equals(FBUtilities.getBroadcastAddress()))
             return;
 
         String req = "INSERT INTO system.%s (peer, %s) VALUES (?, ?)";
-        executeInternal(String.format(req, PEERS, columnName), ep, value);
+        executorService.execute(() -> {
+            executeInternal(String.format(req, PEERS, "release_version"), ep, value);
+            postUpdateTask.run();
+        });
     }
 
     public static synchronized void updateHintsDropped(InetAddress ep, UUID timePeriod, int value)
@@ -551,7 +743,9 @@
 
     private static Set<String> tokensAsSet(Collection<Token> tokens)
     {
-        Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
+        if (tokens.isEmpty())
+            return Collections.emptySet();
+        Token.TokenFactory factory = StorageService.instance.getTokenFactory();
         Set<String> s = new HashSet<>(tokens.size());
         for (Token tk : tokens)
             s.add(factory.toString(tk));
@@ -560,7 +754,7 @@
 
     private static Collection<Token> deserializeTokens(Collection<String> tokensStrings)
     {
-        Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
+        Token.TokenFactory factory = StorageService.instance.getTokenFactory();
         List<Token> tokens = new ArrayList<>(tokensStrings.size());
         for (String tk : tokensStrings)
             tokens.add(factory.fromString(tk));
@@ -570,10 +764,11 @@
     /**
      * Remove stored tokens being used by another node
      */
-    public static synchronized void removeEndpoint(InetAddress ep)
+    public static void removeEndpoint(InetAddress ep)
     {
         String req = "DELETE FROM system.%s WHERE peer = ?";
         executeInternal(String.format(req, PEERS), ep);
+        forceBlockingFlush(PEERS);
     }
 
     /**
@@ -629,6 +824,37 @@
     }
 
     /**
+     * Return a map of IP address to C* version. If an invalid version string, or no version
+     * at all is stored for a given peer IP, then NULL_VERSION will be reported for that peer
+     */
+    public static Map<InetAddress, CassandraVersion> loadPeerVersions()
+    {
+        Map<InetAddress, CassandraVersion> releaseVersionMap = new HashMap<>();
+        for (UntypedResultSet.Row row : executeInternal("SELECT peer, release_version FROM system." + PEERS))
+        {
+            InetAddress peer = row.getInetAddress("peer");
+            if (row.has("release_version"))
+            {
+                try
+                {
+                    releaseVersionMap.put(peer, new CassandraVersion(row.getString("release_version")));
+                }
+                catch (IllegalArgumentException e)
+                {
+                    logger.info("Invalid version string found for {}", peer);
+                    releaseVersionMap.put(peer, NULL_VERSION);
+                }
+            }
+            else
+            {
+                logger.info("No version string found for {}", peer);
+                releaseVersionMap.put(peer, NULL_VERSION);
+            }
+        }
+        return releaseVersionMap;
+    }
+
+    /**
      * Get preferred IP for given endpoint if it is known. Otherwise this returns given endpoint itself.
      *
      * @param ep endpoint address to check
@@ -723,7 +949,7 @@
         if (result.isEmpty() || !result.one().has("cluster_name"))
         {
             // this is a brand new node
-            if (!cfs.getSSTables().isEmpty())
+            if (!cfs.getLiveSSTables().isEmpty())
                 throw new ConfigurationException("Found system keyspace files, but they couldn't be loaded!");
 
             // no system files.  this is a new node.
@@ -802,6 +1028,11 @@
         return getBootstrapState() == BootstrapState.IN_PROGRESS;
     }
 
+    public static boolean wasDecommissioned()
+    {
+        return getBootstrapState() == BootstrapState.DECOMMISSIONED;
+    }
+
     public static void setBootstrapState(BootstrapState state)
     {
         String req = "INSERT INTO system.%s (key, bootstrapped) VALUES ('%s', ?)";
@@ -811,30 +1042,35 @@
 
     public static boolean isIndexBuilt(String keyspaceName, String indexName)
     {
-        ColumnFamilyStore cfs = Keyspace.open(NAME).getColumnFamilyStore(BUILT_INDEXES);
-        QueryFilter filter = QueryFilter.getNamesFilter(decorate(ByteBufferUtil.bytes(keyspaceName)),
-                                                        BUILT_INDEXES,
-                                                        FBUtilities.singleton(cfs.getComparator().makeCellName(indexName), cfs.getComparator()),
-                                                        System.currentTimeMillis());
-        return ColumnFamilyStore.removeDeleted(cfs.getColumnFamily(filter), Integer.MAX_VALUE) != null;
+        String req = "SELECT index_name FROM %s.\"%s\" WHERE table_name=? AND index_name=?";
+        UntypedResultSet result = executeInternal(String.format(req, NAME, BUILT_INDEXES), keyspaceName, indexName);
+        return !result.isEmpty();
     }
 
     public static void setIndexBuilt(String keyspaceName, String indexName)
     {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(NAME, BUILT_INDEXES);
-        cf.addColumn(new BufferCell(cf.getComparator().makeCellName(indexName), ByteBufferUtil.EMPTY_BYTE_BUFFER, FBUtilities.timestampMicros()));
-        new Mutation(NAME, ByteBufferUtil.bytes(keyspaceName), cf).apply();
+        String req = "INSERT INTO %s.\"%s\" (table_name, index_name) VALUES (?, ?)";
+        executeInternal(String.format(req, NAME, BUILT_INDEXES), keyspaceName, indexName);
         forceBlockingFlush(BUILT_INDEXES);
     }
 
     public static void setIndexRemoved(String keyspaceName, String indexName)
     {
-        Mutation mutation = new Mutation(NAME, ByteBufferUtil.bytes(keyspaceName));
-        mutation.delete(BUILT_INDEXES, BuiltIndexes.comparator.makeCellName(indexName), FBUtilities.timestampMicros());
-        mutation.apply();
+        String req = "DELETE FROM %s.\"%s\" WHERE table_name = ? AND index_name = ?";
+        executeInternal(String.format(req, NAME, BUILT_INDEXES), keyspaceName, indexName);
         forceBlockingFlush(BUILT_INDEXES);
     }
 
+    public static List<String> getBuiltIndexes(String keyspaceName, Set<String> indexNames)
+    {
+        List<String> names = new ArrayList<>(indexNames);
+        String req = "SELECT index_name from %s.\"%s\" WHERE table_name=? AND index_name IN ?";
+        UntypedResultSet results = executeInternal(String.format(req, NAME, BUILT_INDEXES), keyspaceName, names);
+        return StreamSupport.stream(results.spliterator(), false)
+                            .map(r -> r.getString("index_name"))
+                            .collect(Collectors.toList());
+    }
+
     /**
      * Read the host ID from the system keyspace, creating (and storing) one if
      * none exists.
@@ -894,23 +1130,25 @@
         return null;
     }
 
-    public static PaxosState loadPaxosState(ByteBuffer key, CFMetaData metadata, long now)
+    public static PaxosState loadPaxosState(DecoratedKey key, CFMetaData metadata, int nowInSec)
     {
         String req = "SELECT * FROM system.%s WHERE row_key = ? AND cf_id = ?";
-        UntypedResultSet results = QueryProcessor.executeInternalWithNow(now, String.format(req, PAXOS), key, metadata.cfId);
+        UntypedResultSet results = QueryProcessor.executeInternalWithNow(nowInSec, String.format(req, PAXOS), key.getKey(), metadata.cfId);
         if (results.isEmpty())
             return new PaxosState(key, metadata);
         UntypedResultSet.Row row = results.one();
         Commit promised = row.has("in_progress_ballot")
-                        ? new Commit(key, row.getUUID("in_progress_ballot"), ArrayBackedSortedColumns.factory.create(metadata))
+                        ? new Commit(row.getUUID("in_progress_ballot"), new PartitionUpdate(metadata, key, metadata.partitionColumns(), 1))
                         : Commit.emptyCommit(key, metadata);
         // either we have both a recently accepted ballot and update or we have neither
+        int proposalVersion = row.has("proposal_version") ? row.getInt("proposal_version") : MessagingService.VERSION_21;
         Commit accepted = row.has("proposal")
-                        ? new Commit(key, row.getUUID("proposal_ballot"), ColumnFamily.fromBytes(row.getBytes("proposal")))
+                        ? new Commit(row.getUUID("proposal_ballot"), PartitionUpdate.fromBytes(row.getBytes("proposal"), proposalVersion, key))
                         : Commit.emptyCommit(key, metadata);
         // either most_recent_commit and most_recent_commit_at will both be set, or neither
+        int mostRecentVersion = row.has("most_recent_commit_version") ? row.getInt("most_recent_commit_version") : MessagingService.VERSION_21;
         Commit mostRecent = row.has("most_recent_commit")
-                          ? new Commit(key, row.getUUID("most_recent_commit_at"), ColumnFamily.fromBytes(row.getBytes("most_recent_commit")))
+                          ? new Commit(row.getUUID("most_recent_commit_at"), PartitionUpdate.fromBytes(row.getBytes("most_recent_commit"), mostRecentVersion, key))
                           : Commit.emptyCommit(key, metadata);
         return new PaxosState(promised, accepted, mostRecent);
     }
@@ -920,41 +1158,43 @@
         String req = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET in_progress_ballot = ? WHERE row_key = ? AND cf_id = ?";
         executeInternal(String.format(req, PAXOS),
                         UUIDGen.microsTimestamp(promise.ballot),
-                        paxosTtl(promise.update.metadata),
+                        paxosTtlSec(promise.update.metadata()),
                         promise.ballot,
-                        promise.key,
-                        promise.update.id());
+                        promise.update.partitionKey().getKey(),
+                        promise.update.metadata().cfId);
     }
 
     public static void savePaxosProposal(Commit proposal)
     {
-        executeInternal(String.format("UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS),
+        executeInternal(String.format("UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ?, proposal_version = ? WHERE row_key = ? AND cf_id = ?", PAXOS),
                         UUIDGen.microsTimestamp(proposal.ballot),
-                        paxosTtl(proposal.update.metadata),
+                        paxosTtlSec(proposal.update.metadata()),
                         proposal.ballot,
-                        proposal.update.toBytes(),
-                        proposal.key,
-                        proposal.update.id());
+                        PartitionUpdate.toBytes(proposal.update, MessagingService.current_version),
+                        MessagingService.current_version,
+                        proposal.update.partitionKey().getKey(),
+                        proposal.update.metadata().cfId);
     }
 
-    public static int paxosTtl(CFMetaData metadata)
+    public static int paxosTtlSec(CFMetaData metadata)
     {
         // keep paxos state around for at least 3h
-        return Math.max(3 * 3600, metadata.getGcGraceSeconds());
+        return Math.max(3 * 3600, metadata.params.gcGraceSeconds);
     }
 
     public static void savePaxosCommit(Commit commit)
     {
         // We always erase the last proposal (with the commit timestamp to no erase more recent proposal in case the commit is old)
         // even though that's really just an optimization  since SP.beginAndRepairPaxos will exclude accepted proposal older than the mrc.
-        String cql = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null, most_recent_commit_at = ?, most_recent_commit = ? WHERE row_key = ? AND cf_id = ?";
+        String cql = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null, most_recent_commit_at = ?, most_recent_commit = ?, most_recent_commit_version = ? WHERE row_key = ? AND cf_id = ?";
         executeInternal(String.format(cql, PAXOS),
                         UUIDGen.microsTimestamp(commit.ballot),
-                        paxosTtl(commit.update.metadata),
+                        paxosTtlSec(commit.update.metadata()),
                         commit.ballot,
-                        commit.update.toBytes(),
-                        commit.key,
-                        commit.update.id());
+                        PartitionUpdate.toBytes(commit.update, MessagingService.current_version),
+                        MessagingService.current_version,
+                        commit.update.partitionKey().getKey(),
+                        commit.update.metadata().cfId);
     }
 
     /**
@@ -1008,24 +1248,23 @@
     public static void updateSizeEstimates(String keyspace, String table, Map<Range<Token>, Pair<Long, Long>> estimates)
     {
         long timestamp = FBUtilities.timestampMicros();
-        Mutation mutation = new Mutation(NAME, UTF8Type.instance.decompose(keyspace));
+        PartitionUpdate update = new PartitionUpdate(SizeEstimates, UTF8Type.instance.decompose(keyspace), SizeEstimates.partitionColumns(), estimates.size());
+        Mutation mutation = new Mutation(update);
 
         // delete all previous values with a single range tombstone.
-        mutation.deleteRange(SIZE_ESTIMATES,
-                             SizeEstimates.comparator.make(table).start(),
-                             SizeEstimates.comparator.make(table).end(),
-                             timestamp - 1);
+        int nowInSec = FBUtilities.nowInSeconds();
+        update.add(new RangeTombstone(Slice.make(SizeEstimates.comparator, table), new DeletionTime(timestamp - 1, nowInSec)));
 
         // add a CQL row for each primary token range.
-        ColumnFamily cells = mutation.addOrGet(SizeEstimates);
         for (Map.Entry<Range<Token>, Pair<Long, Long>> entry : estimates.entrySet())
         {
             Range<Token> range = entry.getKey();
             Pair<Long, Long> values = entry.getValue();
-            Composite prefix = SizeEstimates.comparator.make(table, range.left.toString(), range.right.toString());
-            CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-            adder.add("partitions_count", values.left)
-                 .add("mean_partition_size", values.right);
+            new RowUpdateBuilder(SizeEstimates, timestamp, mutation)
+                .clustering(table, range.left.toString(), range.right.toString())
+                .add("partitions_count", values.left)
+                .add("mean_partition_size", values.right)
+                .build();
         }
 
         mutation.apply();
@@ -1040,6 +1279,32 @@
         executeInternal(cql, keyspace, table);
     }
 
+    /**
+     * Clears size estimates for a keyspace (used to manually clean when we miss a keyspace drop)
+     */
+    public static void clearSizeEstimates(String keyspace)
+    {
+        String cql = String.format("DELETE FROM %s.%s WHERE keyspace_name = ?", NAME, SIZE_ESTIMATES);
+        executeInternal(cql, keyspace);
+    }
+
+    /**
+     * @return A multimap from keyspace to table for all tables with entries in size estimates
+     */
+
+    public static synchronized SetMultimap<String, String> getTablesWithSizeEstimates()
+    {
+        SetMultimap<String, String> keyspaceTableMap = HashMultimap.create();
+        String cql = String.format("SELECT keyspace_name, table_name FROM %s.%s", NAME, SIZE_ESTIMATES);
+        UntypedResultSet rs = executeInternal(cql);
+        for (UntypedResultSet.Row row : rs)
+        {
+            keyspaceTableMap.put(row.getString("keyspace_name"), row.getString("table_name"));
+        }
+
+        return keyspaceTableMap;
+    }
+
     public static synchronized void updateAvailableRanges(String keyspace, Collection<Range<Token>> completedRanges)
     {
         String cql = "UPDATE system.%s SET ranges = ranges + ? WHERE keyspace_name = ?";
@@ -1080,7 +1345,7 @@
      *
      * @throws IOException
      */
-    public static void snapshotOnVersionChange() throws IOException
+    public static boolean snapshotOnVersionChange() throws IOException
     {
         String previous = getPreviousVersionString();
         String next = FBUtilities.getReleaseVersionString();
@@ -1095,7 +1360,10 @@
                                                                                     next));
             Keyspace systemKs = Keyspace.open(SystemKeyspace.NAME);
             systemKs.snapshot(snapshotName, null);
+            return true;
         }
+
+        return false;
     }
 
     /**
@@ -1135,6 +1403,45 @@
         return result.one().getString("release_version");
     }
 
+    /**
+     * Check data directories for old files that can be removed when migrating from 2.1 or 2.2 to 3.0,
+     * these checks can be removed in 4.0, see CASSANDRA-7066
+     */
+    public static void migrateDataDirs()
+    {
+        Iterable<String> dirs = Arrays.asList(DatabaseDescriptor.getAllDataFileLocations());
+        for (String dataDir : dirs)
+        {
+            logger.debug("Checking {} for legacy files", dataDir);
+            File dir = new File(dataDir);
+            assert dir.exists() : dir + " should have been created by startup checks";
+
+            visitDirectory(dir.toPath(),
+                           File::isDirectory,
+                           ksdir ->
+                           {
+                               logger.trace("Checking {} for legacy files", ksdir);
+                               visitDirectory(ksdir.toPath(),
+                                              File::isDirectory,
+                                              cfdir ->
+                                              {
+                                                  logger.trace("Checking {} for legacy files", cfdir);
+
+                                                  if (Descriptor.isLegacyFile(cfdir))
+                                                  {
+                                                      FileUtils.deleteRecursive(cfdir);
+                                                  }
+                                                  else
+                                                  {
+                                                      visitDirectory(cfdir.toPath(),
+                                                                     Descriptor::isLegacyFile,
+                                                                     FileUtils::delete);
+                                                  }
+                                              });
+                           });
+        }
+    }
+
     private static ByteBuffer rangeToBytes(Range<Token> range)
     {
         try (DataOutputBuffer out = new DataOutputBuffer())

diff --git a/src/java/org/apache/cassandra/db/TruncateResponse.java b/src/java/org/apache/cassandra/db/TruncateResponse.java
index d8f5ad2..af4ed8f 100644
--- a/src/java/org/apache/cassandra/db/TruncateResponse.java
+++ b/src/java/org/apache/cassandra/db/TruncateResponse.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -58,7 +58,7 @@
             out.writeBoolean(tr.success);
         }
 
-        public TruncateResponse deserialize(DataInput in, int version) throws IOException
+        public TruncateResponse deserialize(DataInputPlus in, int version) throws IOException
         {
             String keyspace = in.readUTF();
             String columnFamily = in.readUTF();
@@ -68,9 +68,9 @@
 
         public long serializedSize(TruncateResponse tr, int version)
         {
-            return TypeSizes.NATIVE.sizeof(tr.keyspace)
-                 + TypeSizes.NATIVE.sizeof(tr.columnFamily)
-                 + TypeSizes.NATIVE.sizeof(tr.success);
+            return TypeSizes.sizeof(tr.keyspace)
+                 + TypeSizes.sizeof(tr.columnFamily)
+                 + TypeSizes.sizeof(tr.success);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/Truncation.java b/src/java/org/apache/cassandra/db/Truncation.java
index 88742cd..39a2ec6 100644
--- a/src/java/org/apache/cassandra/db/Truncation.java
+++ b/src/java/org/apache/cassandra/db/Truncation.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -60,7 +60,7 @@
         out.writeUTF(t.columnFamily);
     }
 
-    public Truncation deserialize(DataInput in, int version) throws IOException
+    public Truncation deserialize(DataInputPlus in, int version) throws IOException
     {
         String keyspace = in.readUTF();
         String columnFamily = in.readUTF();
@@ -69,6 +69,6 @@
 
     public long serializedSize(Truncation truncation, int version)
     {
-        return TypeSizes.NATIVE.sizeof(truncation.keyspace) + TypeSizes.NATIVE.sizeof(truncation.columnFamily);
+        return TypeSizes.sizeof(truncation.keyspace) + TypeSizes.sizeof(truncation.columnFamily);
     }
 }

diff --git a/src/java/org/apache/cassandra/db/TypeSizes.java b/src/java/org/apache/cassandra/db/TypeSizes.java
index efae762..b47e300 100644
--- a/src/java/org/apache/cassandra/db/TypeSizes.java
+++ b/src/java/org/apache/cassandra/db/TypeSizes.java

@@ -20,25 +20,22 @@
 import java.nio.ByteBuffer;
 import java.util.UUID;
 
-public abstract class TypeSizes
+import org.apache.cassandra.utils.vint.VIntCoding;
+
+public final class TypeSizes
 {
-    public static final TypeSizes NATIVE = new NativeDBTypeSizes();
-    public static final TypeSizes VINT = new VIntEncodedTypeSizes();
+
+    private TypeSizes(){}
 
     private static final int BOOL_SIZE = 1;
+    private static final int BYTE_SIZE = 1;
     private static final int SHORT_SIZE = 2;
     private static final int INT_SIZE = 4;
     private static final int LONG_SIZE = 8;
     private static final int UUID_SIZE = 16;
 
-    public abstract int sizeof(boolean value);
-    public abstract int sizeof(short value);
-    public abstract int sizeof(int value);
-    public abstract int sizeof(long value);
-    public abstract int sizeof(UUID value);
-
     /** assumes UTF8 */
-    public int sizeof(String value)
+    public static int sizeof(String value)
     {
         int length = encodedUTF8Length(value);
         assert length <= Short.MAX_VALUE;
@@ -62,95 +59,58 @@
         return utflen;
     }
 
-    public int sizeofWithShortLength(ByteBuffer value)
+    public static int sizeofWithShortLength(ByteBuffer value)
     {
         return sizeof((short) value.remaining()) + value.remaining();
     }
 
-    public int sizeofWithLength(ByteBuffer value)
+    public static int sizeofWithLength(ByteBuffer value)
     {
         return sizeof(value.remaining()) + value.remaining();
     }
 
-    public static class NativeDBTypeSizes extends TypeSizes
+    public static int sizeofWithVIntLength(ByteBuffer value)
     {
-        public int sizeof(boolean value)
-        {
-            return BOOL_SIZE;
-        }
-
-        public int sizeof(short value)
-        {
-            return SHORT_SIZE;
-        }
-
-        public int sizeof(int value)
-        {
-            return INT_SIZE;
-        }
-
-        public int sizeof(long value)
-        {
-            return LONG_SIZE;
-        }
-
-        public int sizeof(UUID value)
-        {
-            return UUID_SIZE;
-        }
+        return sizeofUnsignedVInt(value.remaining()) + value.remaining();
     }
 
-    public static class VIntEncodedTypeSizes extends TypeSizes
+    public static int sizeof(boolean value)
     {
-        private static final int BOOL_SIZE = 1;
+        return BOOL_SIZE;
+    }
 
-        public int sizeofVInt(long i)
-        {
-            if (i >= -112 && i <= 127)
-                return 1;
+    public static int sizeof(byte value)
+    {
+        return BYTE_SIZE;
+    }
 
-            int size = 0;
-            int len = -112;
-            if (i < 0)
-            {
-                i ^= -1L; // take one's complement'
-                len = -120;
-            }
-            long tmp = i;
-            while (tmp != 0)
-            {
-                tmp = tmp >> 8;
-                len--;
-            }
-            size++;
-            len = (len < -120) ? -(len + 120) : -(len + 112);
-            size += len;
-            return size;
-        }
+    public static int sizeof(short value)
+    {
+        return SHORT_SIZE;
+    }
 
-        public int sizeof(long i)
-        {
-            return sizeofVInt(i);
-        }
+    public static int sizeof(int value)
+    {
+        return INT_SIZE;
+    }
 
-        public int sizeof(boolean i)
-        {
-            return BOOL_SIZE;
-        }
+    public static int sizeof(long value)
+    {
+        return LONG_SIZE;
+    }
 
-        public int sizeof(short i)
-        {
-            return sizeofVInt(i);
-        }
+    public static int sizeof(UUID value)
+    {
+        return UUID_SIZE;
+    }
 
-        public int sizeof(int i)
-        {
-            return sizeofVInt(i);
-        }
+    public static int sizeofVInt(long value)
+    {
+        return VIntCoding.computeVIntSize(value);
+    }
 
-        public int sizeof(UUID value)
-        {
-            return sizeofVInt(value.getMostSignificantBits()) + sizeofVInt(value.getLeastSignificantBits());
-        }
+    public static int sizeofUnsignedVInt(long value)
+    {
+        return VIntCoding.computeUnsignedVIntSize(value);
     }
 }

diff --git a/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java b/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java
new file mode 100644
index 0000000..2d270bc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java

@@ -0,0 +1,899 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.io.IOError;
+import java.util.*;
+import java.util.function.Supplier;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.PeekingIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.net.MessagingService;
+
+/**
+ * Helper class to deserialize Unfiltered object from disk efficiently.
+ *
+ * More precisely, this class is used by the low-level reader to ensure
+ * we don't do more work than necessary (i.e. we don't allocate/deserialize
+ * objects for things we don't care about).
+ */
+public abstract class UnfilteredDeserializer
+{
+    protected final CFMetaData metadata;
+    protected final DataInputPlus in;
+    protected final SerializationHelper helper;
+
+    protected UnfilteredDeserializer(CFMetaData metadata,
+                                     DataInputPlus in,
+                                     SerializationHelper helper)
+    {
+        this.metadata = metadata;
+        this.in = in;
+        this.helper = helper;
+    }
+
+    public static UnfilteredDeserializer create(CFMetaData metadata,
+                                                DataInputPlus in,
+                                                SerializationHeader header,
+                                                SerializationHelper helper,
+                                                DeletionTime partitionDeletion,
+                                                boolean readAllAsDynamic)
+    {
+        if (helper.version >= MessagingService.VERSION_30)
+            return new CurrentDeserializer(metadata, in, header, helper);
+        else
+            return new OldFormatDeserializer(metadata, in, helper, partitionDeletion, readAllAsDynamic);
+    }
+
+    /**
+     * Whether or not there is more atom to read.
+     */
+    public abstract boolean hasNext() throws IOException;
+
+    /**
+     * Compare the provided bound to the next atom to read on disk.
+     *
+     * This will not read/deserialize the whole atom but only what is necessary for the
+     * comparison. Whenever we know what to do with this atom (read it or skip it),
+     * readNext or skipNext should be called.
+     */
+    public abstract int compareNextTo(Slice.Bound bound) throws IOException;
+
+    /**
+     * Returns whether the next atom is a row or not.
+     */
+    public abstract boolean nextIsRow() throws IOException;
+
+    /**
+     * Returns whether the next atom is the static row or not.
+     */
+    public abstract boolean nextIsStatic() throws IOException;
+
+    /**
+     * Returns the next atom.
+     */
+    public abstract Unfiltered readNext() throws IOException;
+
+    /**
+     * Clears any state in this deserializer.
+     */
+    public abstract void clearState() throws IOException;
+
+    /**
+     * Skips the next atom.
+     */
+    public abstract void skipNext() throws IOException;
+
+
+    /**
+     * For the legacy layout deserializer, we have to deal with the fact that a row can span multiple index blocks and that
+     * the call to hasNext() reads the next element upfront. We must take that into account when we check in AbstractSSTableIterator if
+     * we're past the end of an index block boundary as that check expect to account for only consumed data (that is, if hasNext has
+     * been called and made us cross an index boundary but neither readNext() or skipNext() as yet been called, we shouldn't consider
+     * the index block boundary crossed yet).
+     *
+     * TODO: we don't care about this for the current file format because a row can never span multiple index blocks (further, hasNext()
+     * only just basically read 2 bytes from disk in that case). So once we drop backward compatibility with pre-3.0 sstable, we should
+     * remove this.
+     */
+    public abstract long bytesReadForUnconsumedData();
+
+    private static class CurrentDeserializer extends UnfilteredDeserializer
+    {
+        private final ClusteringPrefix.Deserializer clusteringDeserializer;
+        private final SerializationHeader header;
+
+        private int nextFlags;
+        private int nextExtendedFlags;
+        private boolean isReady;
+        private boolean isDone;
+
+        private final Row.Builder builder;
+
+        private CurrentDeserializer(CFMetaData metadata,
+                                    DataInputPlus in,
+                                    SerializationHeader header,
+                                    SerializationHelper helper)
+        {
+            super(metadata, in, helper);
+            this.header = header;
+            this.clusteringDeserializer = new ClusteringPrefix.Deserializer(metadata.comparator, in, header);
+            this.builder = BTreeRow.sortedBuilder();
+        }
+
+        public boolean hasNext() throws IOException
+        {
+            if (isReady)
+                return true;
+
+            prepareNext();
+            return !isDone;
+        }
+
+        private void prepareNext() throws IOException
+        {
+            if (isDone)
+                return;
+
+            nextFlags = in.readUnsignedByte();
+            if (UnfilteredSerializer.isEndOfPartition(nextFlags))
+            {
+                isDone = true;
+                isReady = false;
+                return;
+            }
+
+            nextExtendedFlags = UnfilteredSerializer.readExtendedFlags(in, nextFlags);
+
+            clusteringDeserializer.prepare(nextFlags, nextExtendedFlags);
+            isReady = true;
+        }
+
+        public int compareNextTo(Slice.Bound bound) throws IOException
+        {
+            if (!isReady)
+                prepareNext();
+
+            assert !isDone;
+
+            return clusteringDeserializer.compareNextTo(bound);
+        }
+
+        public boolean nextIsRow() throws IOException
+        {
+            if (!isReady)
+                prepareNext();
+
+            return UnfilteredSerializer.kind(nextFlags) == Unfiltered.Kind.ROW;
+        }
+
+        public boolean nextIsStatic() throws IOException
+        {
+            // This exists only for the sake of the OldFormatDeserializer
+            throw new UnsupportedOperationException();
+        }
+
+        public Unfiltered readNext() throws IOException
+        {
+            isReady = false;
+            if (UnfilteredSerializer.kind(nextFlags) == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+            {
+                RangeTombstone.Bound bound = clusteringDeserializer.deserializeNextBound();
+                return UnfilteredSerializer.serializer.deserializeMarkerBody(in, header, bound);
+            }
+            else
+            {
+                builder.newRow(clusteringDeserializer.deserializeNextClustering());
+                return UnfilteredSerializer.serializer.deserializeRowBody(in, header, helper, nextFlags, nextExtendedFlags, builder);
+            }
+        }
+
+        public void skipNext() throws IOException
+        {
+            isReady = false;
+            clusteringDeserializer.skipNext();
+            if (UnfilteredSerializer.kind(nextFlags) == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+            {
+                UnfilteredSerializer.serializer.skipMarkerBody(in);
+            }
+            else
+            {
+                UnfilteredSerializer.serializer.skipRowBody(in);
+            }
+        }
+
+        public void clearState()
+        {
+            isReady = false;
+            isDone = false;
+        }
+
+        public long bytesReadForUnconsumedData()
+        {
+            // In theory, hasNext() does consume 2-3 bytes, but we don't care about this for the current file format so returning
+            // 0 to mean "do nothing".
+            return 0;
+        }
+    }
+
+    public static class OldFormatDeserializer extends UnfilteredDeserializer
+    {
+        private final boolean readAllAsDynamic;
+        private boolean skipStatic;
+
+        // The next Unfiltered to return, computed by hasNext()
+        private Unfiltered next;
+
+        // Saved position in the input after the next Unfiltered that will be consumed
+        private long nextConsumedPosition;
+
+        // A temporary storage for an Unfiltered that isn't returned next but should be looked at just afterwards
+        private Stash stash;
+
+        private boolean couldBeStartOfPartition = true;
+
+        // The Unfiltered as read from the old format input
+        private final UnfilteredIterator iterator;
+
+        // The position in the input after the last data consumption (readNext/skipNext).
+        private long lastConsumedPosition;
+
+        // Tracks the size of the last LegacyAtom read from disk, because this needs to be accounted
+        // for when marking lastConsumedPosition after readNext/skipNext
+        // Reading/skipping an Unfiltered consumes LegacyAtoms from the underlying legacy atom iterator
+        // e.g. hasNext() -> iterator.hasNext() -> iterator.readRow() -> atoms.next()
+        // The stop condition of the loop which groups legacy atoms into rows causes that AtomIterator
+        // to read in the first atom which doesn't belong in the row. So by that point, our position
+        // is actually past the end of the next Unfiltered. To compensate, we record the size of
+        // the last LegacyAtom read and subtract it from the current position when we calculate lastConsumedPosition.
+        // If we don't, then when reading an indexed block, we can over correct and may think that we've
+        // exhausted the block before we actually have.
+        private long bytesReadForNextAtom = 0L;
+
+        private OldFormatDeserializer(CFMetaData metadata,
+                                      DataInputPlus in,
+                                      SerializationHelper helper,
+                                      DeletionTime partitionDeletion,
+                                      boolean readAllAsDynamic)
+        {
+            super(metadata, in, helper);
+            this.iterator = new UnfilteredIterator(metadata, partitionDeletion, helper, this::readAtom);
+            this.readAllAsDynamic = readAllAsDynamic;
+            this.lastConsumedPosition = currentPosition();
+        }
+
+        private LegacyLayout.LegacyAtom readAtom()
+        {
+            while (true)
+            {
+                try
+                {
+                    long pos = currentPosition();
+                    LegacyLayout.LegacyAtom atom = LegacyLayout.readLegacyAtom(metadata, in, readAllAsDynamic);
+                    bytesReadForNextAtom = currentPosition() - pos;
+                    return atom;
+                }
+                catch (UnknownColumnException e)
+                {
+                    // This is ok, see LegacyLayout.readLegacyAtom() for why this only happens in case were we're ok
+                    // skipping the cell. We do want to catch this at this level however because when that happen,
+                    // we should *not* count the byte of that discarded cell as part of the bytes for the atom
+                    // we will eventually return, as doing so could throw the logic bytesReadForNextAtom participates in.
+                }
+                catch (IOException e)
+                {
+                    throw new IOError(e);
+                }
+            }
+        }
+
+        public void setSkipStatic()
+        {
+            this.skipStatic = true;
+        }
+
+        private boolean isStatic(Unfiltered unfiltered)
+        {
+            return unfiltered.isRow() && ((Row)unfiltered).isStatic();
+        }
+
+        public boolean hasNext() throws IOException
+        {
+            try
+            {
+                while (next == null)
+                {
+                    if (null != stash)
+                    {
+                        next = stash.unfiltered;
+                        nextConsumedPosition = stash.consumedPosition;
+                        stash = null;
+                    }
+                    else
+                    {
+                        if (!iterator.hasNext())
+                            return false;
+                        next = iterator.next();
+                        nextConsumedPosition = currentPosition() - bytesReadForNextAtom;
+                    }
+
+                    /*
+                     * The sstable iterators assume that if there is one, the static row is the first thing this deserializer will return.
+                     * However, in the old format, a range tombstone with an empty start would sort before any static cell. So we should
+                     * detect that case and return the static parts first if necessary.
+                     */
+                    if (couldBeStartOfPartition && next.isRangeTombstoneMarker() && next.clustering().size() == 0 && iterator.hasNext())
+                    {
+                        Unfiltered unfiltered = iterator.next();
+                        long consumedPosition = currentPosition() - bytesReadForNextAtom;
+
+                        stash = new Stash(unfiltered, consumedPosition);
+
+                        /*
+                         * reorder next and stash (see the comment above that explains why), but retain their positions
+                         * it's ok to do so since consumedPosition value is only used to determine if we have gone past
+                         * the end of the index ‘block’; since the edge case requires that the first value be the ‘bottom’
+                         * RT bound (i.e. with no byte buffers), this has a small and well-defined size, and it must be
+                         * the case that both unfiltered are in the same index ‘block’ if we began at the beginning of it.
+                         * if we don't do this, however, we risk aborting early and not returning the BOTTOM rt bound,
+                         * if the static row is large enough to cross block boundaries.
+                         */
+                        if (isStatic(unfiltered))
+                        {
+                            stash.unfiltered = next;
+                            next = unfiltered;
+                        }
+                    }
+                    couldBeStartOfPartition = false;
+
+                    // When reading old tables, we sometimes want to skip static data (due to how staticly defined column of compact
+                    // tables are handled).
+                    if (skipStatic && isStatic(next))
+                        next = null;
+                }
+
+                return true;
+            }
+            catch (IOError e)
+            {
+                if (e.getCause() != null && e.getCause() instanceof IOException)
+                    throw (IOException)e.getCause();
+                throw e;
+            }
+        }
+
+        public int compareNextTo(Slice.Bound bound) throws IOException
+        {
+            if (!hasNext())
+                throw new IllegalStateException();
+            return metadata.comparator.compare(next.clustering(), bound);
+        }
+
+        public boolean nextIsRow() throws IOException
+        {
+            if (!hasNext())
+                throw new IllegalStateException();
+            return next.isRow();
+        }
+
+        public boolean nextIsStatic() throws IOException
+        {
+            return nextIsRow() && ((Row)next).isStatic();
+        }
+
+        private long currentPosition()
+        {
+            // We return a bogus value if the input is not file based, but check we never rely
+            // on that value in that case in bytesReadForUnconsumedData
+            return in instanceof FileDataInput ? ((FileDataInput)in).getFilePointer() : 0;
+        }
+
+        public Unfiltered readNext() throws IOException
+        {
+            if (!hasNext())
+                throw new IllegalStateException();
+            Unfiltered toReturn = next;
+            next = null;
+            lastConsumedPosition = nextConsumedPosition;
+            return toReturn;
+        }
+
+        public void skipNext() throws IOException
+        {
+            readNext();
+        }
+
+        // in case we had to reorder an empty RT bound with a static row, this won't be returning the precise unconsumed size,
+        // that corresponds to the last returned Unfiltered, but use the natural order in the sstable instead
+        public long bytesReadForUnconsumedData()
+        {
+            if (!(in instanceof FileDataInput))
+                throw new AssertionError();
+
+            return currentPosition() - lastConsumedPosition;
+        }
+
+        public void clearState()
+        {
+            next = null;
+            stash = null;
+            couldBeStartOfPartition = true;
+            iterator.clearState();
+            lastConsumedPosition = currentPosition();
+            bytesReadForNextAtom = 0L;
+        }
+
+        private static final class Stash
+        {
+            private Unfiltered unfiltered;
+            long consumedPosition;
+
+            private Stash(Unfiltered unfiltered, long consumedPosition)
+            {
+                this.unfiltered = unfiltered;
+                this.consumedPosition = consumedPosition;
+            }
+        }
+
+        // Groups atoms from the input into proper Unfiltered.
+        // Note: this could use guava AbstractIterator except that we want to be able to clear
+        // the internal state of the iterator so it's cleaner to do it ourselves.
+        @VisibleForTesting
+        static class UnfilteredIterator implements PeekingIterator<Unfiltered>
+        {
+            private final AtomIterator atoms;
+            private final LegacyLayout.CellGrouper grouper;
+            private final TombstoneTracker tombstoneTracker;
+            private final CFMetaData metadata;
+            private final SerializationHelper helper;
+
+            private Unfiltered next;
+
+            UnfilteredIterator(CFMetaData metadata,
+                               DeletionTime partitionDeletion,
+                               SerializationHelper helper,
+                               Supplier<LegacyLayout.LegacyAtom> atomReader)
+            {
+                this.metadata = metadata;
+                this.helper = helper;
+                this.grouper = new LegacyLayout.CellGrouper(metadata, helper);
+                this.tombstoneTracker = new TombstoneTracker(partitionDeletion);
+                this.atoms = new AtomIterator(atomReader, metadata);
+            }
+
+            public boolean hasNext()
+            {
+                // Note that we loop on next == null because TombstoneTracker.openNew() could return null below or the atom might be shadowed.
+                while (next == null)
+                {
+                    if (atoms.hasNext())
+                    {
+                        // If there is a range tombstone to open strictly before the next row/RT, we need to return that open (or boundary) marker first.
+                        if (tombstoneTracker.hasOpeningMarkerBefore(atoms.peek()))
+                        {
+                            next = tombstoneTracker.popOpeningMarker();
+                        }
+                        // If a range tombstone closes strictly before the next row/RT, we need to return that close (or boundary) marker first.
+                        else if (tombstoneTracker.hasClosingMarkerBefore(atoms.peek()))
+                        {
+                            next = tombstoneTracker.popClosingMarker();
+                        }
+                        else
+                        {
+                            LegacyLayout.LegacyAtom atom = atoms.next();
+                            if (tombstoneTracker.isShadowed(atom))
+                                continue;
+
+                            if (atom.isRowAtom(metadata))
+                                next = readRow(atom);
+                            else
+                                tombstoneTracker.openNew(atom.asRangeTombstone());
+                        }
+                    }
+                    else if (tombstoneTracker.hasOpenTombstones())
+                    {
+                        next = tombstoneTracker.popMarker();
+                    }
+                    else
+                    {
+                        return false;
+                    }
+                }
+                return next != null;
+            }
+
+            private Unfiltered readRow(LegacyLayout.LegacyAtom first)
+            {
+                LegacyLayout.CellGrouper grouper = first.isStatic()
+                                                 ? LegacyLayout.CellGrouper.staticGrouper(metadata, helper)
+                                                 : this.grouper;
+                grouper.reset();
+                // We know the first atom is not shadowed and is a "row" atom, so can be added blindly.
+                grouper.addAtom(first);
+
+                // We're less sure about the next atoms. In particular, CellGrouper want to make sure we only pass it
+                // "row" atoms (it's the only type it knows how to handle) so we should handle anything else.
+                while (atoms.hasNext())
+                {
+                    // Peek, but don't consume the next atom just yet
+                    LegacyLayout.LegacyAtom atom = atoms.peek();
+                    // First, that atom may be shadowed in which case we can simply ignore it. Note that this handles
+                    // the case of repeated RT start marker after we've crossed an index boundary, which could well
+                    // appear in the middle of a row (CASSANDRA-14008).
+                    if (!tombstoneTracker.hasClosingMarkerBefore(atom) && tombstoneTracker.isShadowed(atom))
+                    {
+                        atoms.next(); // consume the atom since we only peeked it so far
+                        continue;
+                    }
+
+                    // Second, we should only pass "row" atoms to the cell grouper
+                    if (atom.isRowAtom(metadata))
+                    {
+                        if (!grouper.addAtom(atom))
+                            break; // done with the row; don't consume the atom
+                        atoms.next(); // the grouper "accepted" the atom, consume it since we only peeked above
+                    }
+                    else
+                    {
+                        LegacyLayout.LegacyRangeTombstone rt = (LegacyLayout.LegacyRangeTombstone) atom;
+                        // This means we have a non-row range tombstone. Unfortunately, that does not guarantee the
+                        // current row is finished (though it may), because due to the logic within LegacyRangeTombstone
+                        // constructor, we can get an out-of-order RT that includes on the current row (even if it is
+                        // already started) and extends past it.
+
+                        // So first, evacuate the easy case of the range tombstone simply starting after the current
+                        // row, in which case we're done with the current row (but don't consume the new RT yet so it
+                        // gets handled as any other non-row RT).
+                        if (grouper.startsAfterCurrentRow(rt))
+                            break;
+
+                        // Otherwise, we "split" the RT in 2: the part covering the current row, which is now an
+                        // inRowAtom and can be passed to the grouper, and the part after that, which we push back into
+                        // the iterator for later processing.
+                        Clustering currentRow = grouper.currentRowClustering();
+                        atoms.next(); // consume since we had only just peeked it so far and we're using it
+                        atoms.pushOutOfOrder(rt.withNewStart(Slice.Bound.exclusiveStartOf(currentRow)));
+                        // Note: in theory the withNewStart is a no-op here, but not taking any risk
+                        grouper.addAtom(rt.withNewStart(Slice.Bound.inclusiveStartOf(currentRow))
+                                          .withNewEnd(Slice.Bound.inclusiveEndOf(currentRow)));
+                    }
+                }
+
+                return grouper.getRow();
+            }
+
+            public Unfiltered next()
+            {
+                if (!hasNext())
+                    throw new UnsupportedOperationException();
+                Unfiltered toReturn = next;
+                next = null;
+                return toReturn;
+            }
+
+            public Unfiltered peek()
+            {
+                if (!hasNext())
+                    throw new UnsupportedOperationException();
+                return next;
+            }
+
+            public void clearState()
+            {
+                atoms.clearState();
+                tombstoneTracker.clearState();
+                next = null;
+            }
+
+            public void remove()
+            {
+                throw new UnsupportedOperationException();
+            }
+
+            // Wraps the input of the deserializer to provide an iterator (and skip shadowed atoms).
+            // Note: this could use guava AbstractIterator except that we want to be able to clear
+            // the internal state of the iterator so it's cleaner to do it ourselves.
+            private static class AtomIterator implements PeekingIterator<LegacyLayout.LegacyAtom>
+            {
+                private final Supplier<LegacyLayout.LegacyAtom> atomReader;
+                private boolean readerExhausted;
+                private LegacyLayout.LegacyAtom next;
+
+                private final Comparator<LegacyLayout.LegacyAtom> atomComparator;
+                // May temporarily store atoms that needs to be handler later than when they were deserialized.
+                // Lazily initialized since it is used infrequently.
+                private Queue<LegacyLayout.LegacyAtom> outOfOrderAtoms;
+
+                private AtomIterator(Supplier<LegacyLayout.LegacyAtom> atomReader, CFMetaData metadata)
+                {
+                    this.atomReader = atomReader;
+                    this.atomComparator = LegacyLayout.legacyAtomComparator(metadata);
+                }
+
+                public boolean hasNext()
+                {
+                    if (readerExhausted)
+                        return hasOutOfOrderAtoms(); // We have to return out of order atoms when reader exhausts
+
+                    // Note that next() and peek() assumes that next has been set by this method, so we do it even if
+                    // we have some outOfOrderAtoms stacked up.
+                    if (next == null)
+                        next = atomReader.get();
+
+                    readerExhausted = next == null;
+                    return !readerExhausted || hasOutOfOrderAtoms();
+                }
+
+                public LegacyLayout.LegacyAtom next()
+                {
+                    if (!hasNext())
+                        throw new UnsupportedOperationException();
+
+                    if (hasOutOrderAtomBeforeNext())
+                        return outOfOrderAtoms.poll();
+
+                    LegacyLayout.LegacyAtom toReturn = next;
+                    next = null;
+                    return toReturn;
+                }
+
+                private boolean hasOutOfOrderAtoms()
+                {
+                    return outOfOrderAtoms != null && !outOfOrderAtoms.isEmpty();
+                }
+
+                private boolean hasOutOrderAtomBeforeNext()
+                {
+                    // Note that if outOfOrderAtoms is null, the first condition will be false, so we can save a null
+                    // check on calling `outOfOrderAtoms.peek()` in the right branch.
+                    return hasOutOfOrderAtoms()
+                           && (next == null || atomComparator.compare(outOfOrderAtoms.peek(), next) <= 0);
+                }
+
+                public LegacyLayout.LegacyAtom peek()
+                {
+                    if (!hasNext())
+                        throw new UnsupportedOperationException();
+                    if (hasOutOrderAtomBeforeNext())
+                        return outOfOrderAtoms.peek();
+                    return next;
+                }
+
+                /**
+                 * Push back an atom in the iterator assuming said atom sorts strictly _after_ the atom returned by
+                 * the last next() call (meaning the pushed atom fall in the part of the iterator that has not been
+                 * returned yet, not before). The atom will then be returned by the iterator in proper order.
+                 */
+                public void pushOutOfOrder(LegacyLayout.LegacyAtom atom)
+                {
+                    if (outOfOrderAtoms == null)
+                        outOfOrderAtoms = new PriorityQueue<>(atomComparator);
+                    outOfOrderAtoms.offer(atom);
+                }
+
+                public void clearState()
+                {
+                    this.next = null;
+                    this.readerExhausted = false;
+                    if (outOfOrderAtoms != null)
+                        outOfOrderAtoms.clear();
+                }
+
+                public void remove()
+                {
+                    throw new UnsupportedOperationException();
+                }
+            }
+
+            /**
+             * Tracks which range tombstones are open when deserializing the old format.
+             * <p>
+             * This is a bit tricky because in the old of format we could have duplicated tombstones, overlapping ones,
+             * shadowed ones, etc.., but we should generate from that a "flat" output where at most one non-shadoowed
+             * range is open at any given time and without empty range.
+             * <p>
+             * One consequence of that is that we have to be careful to not generate markers too soon. For instance,
+             * we might get a range tombstone [1, 1]@3 followed by [1, 10]@5. So if we generate an opening marker on
+             * the first tombstone (so INCL_START(1)@3), we're screwed when we get to the 2nd range tombstone: we really
+             * should ignore the first tombstone in that that and generate INCL_START(1)@5 (assuming obviously we don't
+             * have one more range tombstone starting at 1 in the stream). This is why we have the
+             * {@link #hasOpeningMarkerBefore} method: in practice, we remember when a marker should be opened, but only
+             * generate that opening marker when we're sure that we won't get anything shadowing that marker.
+             * <p>
+             * For closing marker, we also have a {@link #hasClosingMarkerBefore} because in the old format the closing
+             * markers comes with the opening one, but we should generate them "in order" in the new format.
+             */
+            private class TombstoneTracker
+            {
+                private final DeletionTime partitionDeletion;
+
+                // As explained in the javadoc, we need to wait to generate an opening marker until we're sure we have
+                // seen anything that could shadow it. So this remember a marker that needs to be opened but hasn't
+                // been yet. This is truly returned when hasOpeningMarkerBefore tells us it's safe to.
+                private RangeTombstoneMarker openMarkerToReturn;
+
+                // Open tombstones sorted by their closing bound (i.e. first tombstone is the first to close).
+                // As we only track non-fully-shadowed ranges, the first range is necessarily the currently
+                // open tombstone (the one with the higher timestamp).
+                private final SortedSet<LegacyLayout.LegacyRangeTombstone> openTombstones;
+
+                public TombstoneTracker(DeletionTime partitionDeletion)
+                {
+                    this.partitionDeletion = partitionDeletion;
+                    this.openTombstones = new TreeSet<>((rt1, rt2) -> metadata.comparator.compare(rt1.stop.bound, rt2.stop.bound));
+                }
+
+                /**
+                 * Checks if the provided atom is fully shadowed by the open tombstones of this tracker (or the partition deletion).
+                 */
+                public boolean isShadowed(LegacyLayout.LegacyAtom atom)
+                {
+                    assert !hasClosingMarkerBefore(atom);
+                    long timestamp = atom.isCell() ? atom.asCell().timestamp : atom.asRangeTombstone().deletionTime.markedForDeleteAt();
+
+                    if (partitionDeletion.deletes(timestamp))
+                        return true;
+
+                    SortedSet<LegacyLayout.LegacyRangeTombstone> coveringTombstones = atom.isRowAtom(metadata) ? openTombstones : openTombstones.tailSet(atom.asRangeTombstone());
+                    return Iterables.any(coveringTombstones, tombstone -> tombstone.deletionTime.deletes(timestamp));
+                }
+
+                /**
+                 * Whether there is an outstanding opening marker that should be returned before we process the provided row/RT.
+                 */
+                public boolean hasOpeningMarkerBefore(LegacyLayout.LegacyAtom atom)
+                {
+                    return openMarkerToReturn != null
+                           && metadata.comparator.compare(openMarkerToReturn.openBound(false), atom.clustering()) < 0;
+                }
+
+                public Unfiltered popOpeningMarker()
+                {
+                    assert openMarkerToReturn != null;
+                    Unfiltered toReturn = openMarkerToReturn;
+                    openMarkerToReturn = null;
+                    return toReturn;
+                }
+
+                /**
+                 * Whether the currently open marker closes stricly before the provided row/RT.
+                 */
+                public boolean hasClosingMarkerBefore(LegacyLayout.LegacyAtom atom)
+                {
+                    return !openTombstones.isEmpty()
+                           && metadata.comparator.compare(openTombstones.first().stop.bound, atom.clustering()) < 0;
+                }
+
+                /**
+                 * Returns the unfiltered corresponding to closing the currently open marker (and update the tracker accordingly).
+                 */
+                public Unfiltered popClosingMarker()
+                {
+                    assert !openTombstones.isEmpty();
+
+                    Iterator<LegacyLayout.LegacyRangeTombstone> iter = openTombstones.iterator();
+                    LegacyLayout.LegacyRangeTombstone first = iter.next();
+                    iter.remove();
+
+                    // If that was the last open tombstone, we just want to close it. Otherwise, we have a boundary with the
+                    // next tombstone
+                    if (!iter.hasNext())
+                        return new RangeTombstoneBoundMarker(first.stop.bound, first.deletionTime);
+
+                    LegacyLayout.LegacyRangeTombstone next = iter.next();
+                    return RangeTombstoneBoundaryMarker.makeBoundary(false, first.stop.bound, first.stop.bound.invert(), first.deletionTime, next.deletionTime);
+                }
+
+                 /**
+                  * Pop whatever next marker needs to be popped. This should be called as many time as necessary (until
+                  * {@link #hasOpenTombstones} returns {@false}) when all atoms have been consumed to "empty" the tracker.
+                  */
+                 public Unfiltered popMarker()
+                 {
+                     assert hasOpenTombstones();
+                     return openMarkerToReturn == null ? popClosingMarker() : popOpeningMarker();
+                 }
+
+                /**
+                 * Update the tracker given the provided newly open tombstone. This potentially update openMarkerToReturn
+                 * to account for th new opening.
+                 *
+                 * Note that this method assumes that:
+                 +  1) the added tombstone is not fully shadowed: !isShadowed(tombstone).
+                 +  2) there is no marker to open that open strictly before this new tombstone: !hasOpeningMarkerBefore(tombstone).
+                 +  3) no opened tombstone closes before that tombstone: !hasClosingMarkerBefore(tombstone).
+                 + One can check that this is only called after the condition above have been checked in UnfilteredIterator.hasNext above.
+                 */
+                public void openNew(LegacyLayout.LegacyRangeTombstone tombstone)
+                {
+                    if (openTombstones.isEmpty())
+                    {
+                        // If we have an openMarkerToReturn, the corresponding RT must be in openTombstones (or we wouldn't know when to close it)
+                        assert openMarkerToReturn == null;
+                        openTombstones.add(tombstone);
+                        openMarkerToReturn = new RangeTombstoneBoundMarker(tombstone.start.bound, tombstone.deletionTime);
+                        return;
+                    }
+
+                    if (openMarkerToReturn != null)
+                    {
+                        // If the new opening supersedes the one we're about to return, we need to update the one to return.
+                        if (tombstone.deletionTime.supersedes(openMarkerToReturn.openDeletionTime(false)))
+                            openMarkerToReturn = openMarkerToReturn.withNewOpeningDeletionTime(false, tombstone.deletionTime);
+                    }
+                    else
+                    {
+                        // We have no openMarkerToReturn set yet so set it now if needs be.
+                        // Since openTombstones isn't empty, it means we have a currently ongoing deletion. And if the new tombstone
+                        // supersedes that ongoing deletion, we need to close the opening  deletion and open with the new one.
+                        DeletionTime currentOpenDeletion = openTombstones.first().deletionTime;
+                        if (tombstone.deletionTime.supersedes(currentOpenDeletion))
+                            openMarkerToReturn = RangeTombstoneBoundaryMarker.makeBoundary(false, tombstone.start.bound.invert(), tombstone.start.bound, currentOpenDeletion, tombstone.deletionTime);
+                    }
+
+                    // In all cases, we know !isShadowed(tombstone) so we need to add the tombstone (note however that we may not have set openMarkerToReturn if the
+                    // new tombstone doesn't supersedes the current deletion _but_ extend past the marker currently open)
+                    add(tombstone);
+                }
+
+                /**
+                 * Adds a new tombstone to openTombstones, removing anything that would be shadowed by this new tombstone.
+                 */
+                private void add(LegacyLayout.LegacyRangeTombstone tombstone)
+                {
+                    // First, remove existing tombstone that is shadowed by this tombstone.
+                    Iterator<LegacyLayout.LegacyRangeTombstone> iter = openTombstones.iterator();
+                    while (iter.hasNext())
+                    {
+
+                        LegacyLayout.LegacyRangeTombstone existing = iter.next();
+                        // openTombstones is ordered by stop bound and the new tombstone can't be shadowing anything that
+                        // stop after it.
+                        if (metadata.comparator.compare(tombstone.stop.bound, existing.stop.bound) < 0)
+                            break;
+
+                        // Note that we remove an existing tombstone even if it is equal to the new one because in that case,
+                        // either the existing strictly stops before the new one and we don't want it, or it stops exactly
+                        // like the new one but we're going to inconditionally add the new one anyway.
+                        if (!existing.deletionTime.supersedes(tombstone.deletionTime))
+                            iter.remove();
+                    }
+                    openTombstones.add(tombstone);
+                }
+
+                public boolean hasOpenTombstones()
+                {
+                    return openMarkerToReturn != null || !openTombstones.isEmpty();
+                }
+
+                public void clearState()
+                {
+                    openMarkerToReturn = null;
+                    openTombstones.clear();
+                }
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/UnknownColumnException.java b/src/java/org/apache/cassandra/db/UnknownColumnException.java
new file mode 100644
index 0000000..55dc453
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/UnknownColumnException.java

@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Exception thrown when we read a column internally that is unknown. Note that
+ * this is an internal exception and is not meant to be user facing.
+ */
+public class UnknownColumnException extends Exception
+{
+    public final ByteBuffer columnName;
+
+    public UnknownColumnException(CFMetaData metadata, ByteBuffer columnName)
+    {
+        super(String.format("Unknown column %s in table %s.%s", stringify(columnName), metadata.ksName, metadata.cfName));
+        this.columnName = columnName;
+    }
+
+    private static String stringify(ByteBuffer name)
+    {
+        try
+        {
+            return UTF8Type.instance.getString(name);
+        }
+        catch (Exception e)
+        {
+            return ByteBufferUtil.bytesToHex(name);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/WriteResponse.java b/src/java/org/apache/cassandra/db/WriteResponse.java
index a7b108b..0dddaab 100644
--- a/src/java/org/apache/cassandra/db/WriteResponse.java
+++ b/src/java/org/apache/cassandra/db/WriteResponse.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -28,24 +28,30 @@
 /*
  * This empty response is sent by a replica to inform the coordinator that the write succeeded
  */
-public class WriteResponse
+public final class WriteResponse
 {
-    public static final WriteResponseSerializer serializer = new WriteResponseSerializer();
+    public static final Serializer serializer = new Serializer();
 
-    public MessageOut<WriteResponse> createMessage()
+    private static final WriteResponse instance = new WriteResponse();
+
+    private WriteResponse()
     {
-        return new MessageOut<WriteResponse>(MessagingService.Verb.REQUEST_RESPONSE, this, serializer);
     }
 
-    public static class WriteResponseSerializer implements IVersionedSerializer<WriteResponse>
+    public static MessageOut<WriteResponse> createMessage()
+    {
+        return new MessageOut<>(MessagingService.Verb.REQUEST_RESPONSE, instance, serializer);
+    }
+
+    public static class Serializer implements IVersionedSerializer<WriteResponse>
     {
         public void serialize(WriteResponse wm, DataOutputPlus out, int version) throws IOException
         {
         }
 
-        public WriteResponse deserialize(DataInput in, int version) throws IOException
+        public WriteResponse deserialize(DataInputPlus in, int version) throws IOException
         {
-            return new WriteResponse();
+            return instance;
         }
 
         public long serializedSize(WriteResponse response, int version)

diff --git a/src/java/org/apache/cassandra/db/WriteType.java b/src/java/org/apache/cassandra/db/WriteType.java
index 4f4c88d..fdbe97d 100644
--- a/src/java/org/apache/cassandra/db/WriteType.java
+++ b/src/java/org/apache/cassandra/db/WriteType.java

@@ -24,5 +24,6 @@
     UNLOGGED_BATCH,
     COUNTER,
     BATCH_LOG,
-    CAS;
+    CAS,
+    VIEW;
 }

diff --git a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java b/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
new file mode 100644
index 0000000..386b2c8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java

@@ -0,0 +1,533 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.columniterator;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.IndexHelper;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+abstract class AbstractSSTableIterator implements SliceableUnfilteredRowIterator
+{
+    protected final SSTableReader sstable;
+    protected final DecoratedKey key;
+    protected final DeletionTime partitionLevelDeletion;
+    protected final ColumnFilter columns;
+    protected final SerializationHelper helper;
+
+    protected final Row staticRow;
+    protected final Reader reader;
+
+    private final boolean isForThrift;
+
+    private boolean isClosed;
+
+    @SuppressWarnings("resource") // We need this because the analysis is not able to determine that we do close
+                                  // file on every path where we created it.
+    protected AbstractSSTableIterator(SSTableReader sstable,
+                                      FileDataInput file,
+                                      DecoratedKey key,
+                                      RowIndexEntry indexEntry,
+                                      ColumnFilter columnFilter,
+                                      boolean isForThrift)
+    {
+        this.sstable = sstable;
+        this.key = key;
+        this.columns = columnFilter;
+        this.helper = new SerializationHelper(sstable.metadata, sstable.descriptor.version.correspondingMessagingVersion(), SerializationHelper.Flag.LOCAL, columnFilter);
+        this.isForThrift = isForThrift;
+
+        if (indexEntry == null)
+        {
+            this.partitionLevelDeletion = DeletionTime.LIVE;
+            this.reader = null;
+            this.staticRow = Rows.EMPTY_STATIC_ROW;
+        }
+        else
+        {
+            boolean shouldCloseFile = file == null;
+            try
+            {
+                // We seek to the beginning to the partition if either:
+                //   - the partition is not indexed; we then have a single block to read anyway
+                //     (and we need to read the partition deletion time).
+                //   - we're querying static columns.
+                boolean needSeekAtPartitionStart = !indexEntry.isIndexed() || !columns.fetchedColumns().statics.isEmpty();
+
+                // For CQL queries on static compact tables, we only want to consider static value (only those are exposed),
+                // but readStaticRow have already read them and might in fact have consumed the whole partition (when reading
+                // the legacy file format), so set the reader to null so we don't try to read anything more. We can remove this
+                // once we drop support for the legacy file format
+                boolean needsReader = sstable.descriptor.version.storeRows() || isForThrift || !sstable.metadata.isStaticCompactTable();
+
+                if (needSeekAtPartitionStart)
+                {
+                    // Not indexed (or is reading static), set to the beginning of the partition and read partition level deletion there
+                    if (file == null)
+                        file = sstable.getFileDataInput(indexEntry.position);
+                    else
+                        file.seek(indexEntry.position);
+
+                    ByteBufferUtil.skipShortLength(file); // Skip partition key
+                    this.partitionLevelDeletion = DeletionTime.serializer.deserialize(file);
+
+                    // Note that this needs to be called after file != null and after the partitionDeletion has been set, but before readStaticRow
+                    // (since it uses it) so we can't move that up (but we'll be able to simplify as soon as we drop support for the old file format).
+                    this.reader = needsReader ? createReader(indexEntry, file, shouldCloseFile) : null;
+                    this.staticRow = readStaticRow(sstable, file, helper, columns.fetchedColumns().statics, isForThrift, reader == null ? null : reader.deserializer);
+                }
+                else
+                {
+                    this.partitionLevelDeletion = indexEntry.deletionTime();
+                    this.staticRow = Rows.EMPTY_STATIC_ROW;
+                    this.reader = needsReader ? createReader(indexEntry, file, shouldCloseFile) : null;
+                }
+
+                if (reader == null && file != null && shouldCloseFile)
+                    file.close();
+            }
+            catch (IOException e)
+            {
+                sstable.markSuspect();
+                String filePath = file.getPath();
+                if (shouldCloseFile)
+                {
+                    try
+                    {
+                        file.close();
+                    }
+                    catch (IOException suppressed)
+                    {
+                        e.addSuppressed(suppressed);
+                    }
+                }
+                throw new CorruptSSTableException(e, filePath);
+            }
+        }
+    }
+
+    private static Row readStaticRow(SSTableReader sstable,
+                                     FileDataInput file,
+                                     SerializationHelper helper,
+                                     Columns statics,
+                                     boolean isForThrift,
+                                     UnfilteredDeserializer deserializer) throws IOException
+    {
+        if (!sstable.descriptor.version.storeRows())
+        {
+            if (!sstable.metadata.isCompactTable())
+            {
+                assert deserializer != null;
+                return deserializer.hasNext() && deserializer.nextIsStatic()
+                     ? (Row)deserializer.readNext()
+                     : Rows.EMPTY_STATIC_ROW;
+            }
+
+            // For compact tables, we use statics for the "column_metadata" definition. However, in the old format, those
+            // "column_metadata" are intermingled as any other "cell". In theory, this means that we'd have to do a first
+            // pass to extract the static values. However, for thrift, we'll use the ThriftResultsMerger right away which
+            // will re-merge static values with dynamic ones, so we can just ignore static and read every cell as a
+            // "dynamic" one. For CQL, if the table is a "static compact", then is has only static columns exposed and no
+            // dynamic ones. So we do a pass to extract static columns here, but will have no more work to do. Otherwise,
+            // the table won't have static columns.
+            if (statics.isEmpty() || isForThrift)
+                return Rows.EMPTY_STATIC_ROW;
+
+            assert sstable.metadata.isStaticCompactTable();
+
+            // As said above, if it's a CQL query and the table is a "static compact", the only exposed columns are the
+            // static ones. So we don't have to mark the position to seek back later.
+            return LegacyLayout.extractStaticColumns(sstable.metadata, file, statics);
+        }
+
+        if (!sstable.header.hasStatic())
+            return Rows.EMPTY_STATIC_ROW;
+
+        if (statics.isEmpty())
+        {
+            UnfilteredSerializer.serializer.skipStaticRow(file, sstable.header, helper);
+            return Rows.EMPTY_STATIC_ROW;
+        }
+        else
+        {
+            return UnfilteredSerializer.serializer.deserializeStaticRow(file, sstable.header, helper);
+        }
+    }
+
+    protected abstract Reader createReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile);
+
+    public CFMetaData metadata()
+    {
+        return sstable.metadata;
+    }
+
+    public PartitionColumns columns()
+    {
+        return columns.fetchedColumns();
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return key;
+    }
+
+    public DeletionTime partitionLevelDeletion()
+    {
+        return partitionLevelDeletion;
+    }
+
+    public Row staticRow()
+    {
+        return staticRow;
+    }
+
+    public EncodingStats stats()
+    {
+        // We could return sstable.header.stats(), but this may not be as accurate than the actual sstable stats (see
+        // SerializationHeader.make() for details) so we use the latter instead.
+        return new EncodingStats(sstable.getMinTimestamp(), sstable.getMinLocalDeletionTime(), sstable.getMinTTL());
+    }
+
+    public boolean hasNext()
+    {
+        return reader != null && reader.hasNext();
+    }
+
+    public Unfiltered next()
+    {
+        assert reader != null;
+        return reader.next();
+    }
+
+    public Iterator<Unfiltered> slice(Slice slice)
+    {
+        try
+        {
+            if (reader == null)
+                return Collections.emptyIterator();
+
+            reader.setForSlice(slice);
+            return reader;
+        }
+        catch (IOException e)
+        {
+            try
+            {
+                closeInternal();
+            }
+            catch (IOException suppressed)
+            {
+                e.addSuppressed(suppressed);
+            }
+            sstable.markSuspect();
+            throw new CorruptSSTableException(e, reader.file.getPath());
+        }
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    private void closeInternal() throws IOException
+    {
+        // It's important to make closing idempotent since it would bad to double-close 'file' as its a RandomAccessReader
+        // and its close is not idemptotent in the case where we recycle it.
+        if (isClosed)
+            return;
+
+        if (reader != null)
+            reader.close();
+
+        isClosed = true;
+    }
+
+    public void close()
+    {
+        try
+        {
+            closeInternal();
+        }
+        catch (IOException e)
+        {
+            sstable.markSuspect();
+            throw new CorruptSSTableException(e, reader.file.getPath());
+        }
+    }
+
+    protected abstract class Reader implements Iterator<Unfiltered>
+    {
+        private final boolean shouldCloseFile;
+        public FileDataInput file;
+        public final Version version;
+
+        protected UnfilteredDeserializer deserializer;
+
+        // Records the currently open range tombstone (if any)
+        protected DeletionTime openMarker = null;
+
+        protected Reader(FileDataInput file, boolean shouldCloseFile)
+        {
+            this.file = file;
+            this.shouldCloseFile = shouldCloseFile;
+            this.version = sstable.descriptor.version;
+
+            if (file != null)
+                createDeserializer();
+        }
+
+        private void createDeserializer()
+        {
+            assert file != null && deserializer == null;
+            deserializer = UnfilteredDeserializer.create(sstable.metadata, file, sstable.header, helper, partitionLevelDeletion, isForThrift);
+        }
+
+        protected void seekToPosition(long position) throws IOException
+        {
+            // This may be the first time we're actually looking into the file
+            if (file == null)
+            {
+                file = sstable.getFileDataInput(position);
+                createDeserializer();
+            }
+            else
+            {
+                file.seek(position);
+            }
+        }
+
+        protected void updateOpenMarker(RangeTombstoneMarker marker)
+        {
+            // Note that we always read index blocks in forward order so this method is always called in forward order
+            openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null;
+        }
+
+        public boolean hasNext() 
+        {
+            try
+            {
+                return hasNextInternal();
+            }
+            catch (IOException | IndexOutOfBoundsException e)
+            {
+                try
+                {
+                    closeInternal();
+                }
+                catch (IOException suppressed)
+                {
+                    e.addSuppressed(suppressed);
+                }
+                sstable.markSuspect();
+                throw new CorruptSSTableException(e, reader.file.getPath());
+            }
+        }
+
+        public Unfiltered next()
+        {
+            try
+            {
+                return nextInternal();
+            }
+            catch (IOException e)
+            {
+                try
+                {
+                    closeInternal();
+                }
+                catch (IOException suppressed)
+                {
+                    e.addSuppressed(suppressed);
+                }
+                sstable.markSuspect();
+                throw new CorruptSSTableException(e, reader.file.getPath());
+            }
+        }
+
+        // Set the reader so its hasNext/next methods return values within the provided slice
+        public abstract void setForSlice(Slice slice) throws IOException;
+
+        protected abstract boolean hasNextInternal() throws IOException;
+        protected abstract Unfiltered nextInternal() throws IOException;
+
+        public void close() throws IOException
+        {
+            if (shouldCloseFile && file != null)
+                file.close();
+        }
+    }
+
+    // Used by indexed readers to store where they are of the index.
+    protected static class IndexState
+    {
+        private final Reader reader;
+        private final ClusteringComparator comparator;
+
+        private final RowIndexEntry indexEntry;
+        private final List<IndexHelper.IndexInfo> indexes;
+        private final boolean reversed;
+
+        private int currentIndexIdx;
+
+        // Marks the beginning of the block corresponding to currentIndexIdx.
+        private DataPosition mark;
+
+        public IndexState(Reader reader, ClusteringComparator comparator, RowIndexEntry indexEntry, boolean reversed)
+        {
+            this.reader = reader;
+            this.comparator = comparator;
+            this.indexEntry = indexEntry;
+            this.indexes = indexEntry.columnsIndex();
+            this.reversed = reversed;
+            this.currentIndexIdx = reversed ? indexEntry.columnsIndex().size() : -1;
+        }
+
+        public boolean isDone()
+        {
+            return reversed ? currentIndexIdx < 0 : currentIndexIdx >= indexes.size();
+        }
+
+        // Sets the reader to the beginning of blockIdx.
+        public void setToBlock(int blockIdx) throws IOException
+        {
+            if (blockIdx >= 0 && blockIdx < indexes.size())
+            {
+                reader.seekToPosition(columnOffset(blockIdx));
+                mark = reader.file.mark();
+                reader.deserializer.clearState();
+            }
+
+            currentIndexIdx = blockIdx;
+            reader.openMarker = blockIdx > 0 ? indexes.get(blockIdx - 1).endOpenMarker : null;
+
+            // If we're reading an old format file and we move to the first block in the index (i.e. the
+            // head of the partition), we skip the static row as it's already been read when we first opened
+            // the iterator. If we don't do this and a static row is present, we'll re-read it but treat it
+            // as a regular row, causing deserialization to blow up later as that row's flags will be invalid
+            // see CASSANDRA-12088 & CASSANDRA-13236
+            if (!reader.version.storeRows()
+                && blockIdx == 0
+                && reader.deserializer.hasNext()
+                && reader.deserializer.nextIsStatic())
+            {
+                reader.deserializer.skipNext();
+            }
+        }
+
+        private long columnOffset(int i)
+        {
+            return indexEntry.position + indexes.get(i).offset;
+        }
+
+        public int blocksCount()
+        {
+            return indexes.size();
+        }
+
+        // Update the block idx based on the current reader position if we're past the current block.
+        // This only makes sense for forward iteration (for reverse ones, when we reach the end of a block we
+        // should seek to the previous one, not update the index state and continue).
+        public void updateBlock() throws IOException
+        {
+            assert !reversed;
+
+            // If we get here with currentBlockIdx < 0, it means setToBlock() has never been called, so it means
+            // we're about to read from the beginning of the partition, but haven't "prepared" the IndexState yet.
+            // Do so by setting us on the first block.
+            if (currentIndexIdx < 0)
+            {
+                setToBlock(0);
+                return;
+            }
+
+            while (currentIndexIdx + 1 < indexes.size() && isPastCurrentBlock())
+            {
+                reader.openMarker = currentIndex().endOpenMarker;
+                ++currentIndexIdx;
+
+                // We have to set the mark, and we have to set it at the beginning of the block. So if we're not at the beginning of the block, this forces us to a weird seek dance.
+                // This can only happen when reading old file however.
+                long startOfBlock = columnOffset(currentIndexIdx);
+                long currentFilePointer = reader.file.getFilePointer();
+                if (startOfBlock == currentFilePointer)
+                {
+                    mark = reader.file.mark();
+                }
+                else
+                {
+                    reader.seekToPosition(startOfBlock);
+                    mark = reader.file.mark();
+                    reader.seekToPosition(currentFilePointer);
+                }
+            }
+        }
+
+        // Check if we've crossed an index boundary (based on the mark on the beginning of the index block).
+        public boolean isPastCurrentBlock()
+        {
+            assert reader.deserializer != null;
+            long correction = reader.deserializer.bytesReadForUnconsumedData();
+            return reader.file.bytesPastMark(mark) - correction >= currentIndex().width;
+        }
+
+        public int currentBlockIdx()
+        {
+            return currentIndexIdx;
+        }
+
+        public IndexHelper.IndexInfo currentIndex()
+        {
+            return index(currentIndexIdx);
+        }
+
+        public IndexHelper.IndexInfo index(int i)
+        {
+            return indexes.get(i);
+        }
+
+        // Finds the index of the first block containing the provided bound, starting at the provided index.
+        // Will be -1 if the bound is before any block, and blocksCount() if it is after every block.
+        public int findBlockIndex(Slice.Bound bound, int fromIdx)
+        {
+            if (bound == Slice.Bound.BOTTOM)
+                return -1;
+            if (bound == Slice.Bound.TOP)
+                return blocksCount();
+
+            return IndexHelper.indexFor(bound, indexes, comparator, reversed, fromIdx);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("IndexState(indexSize=%d, currentBlock=%d, reversed=%b)", indexes.size(), currentIndexIdx, reversed);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/columniterator/IColumnIteratorFactory.java b/src/java/org/apache/cassandra/db/columniterator/IColumnIteratorFactory.java
deleted file mode 100644
index 46983e9..0000000
--- a/src/java/org/apache/cassandra/db/columniterator/IColumnIteratorFactory.java
+++ /dev/null

@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.columniterator;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-public interface IColumnIteratorFactory
-{
-    OnDiskAtomIterator create();
-}

diff --git a/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java b/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java
deleted file mode 100644
index 7185eef..0000000
--- a/src/java/org/apache/cassandra/db/columniterator/IdentityQueryFilter.java
+++ /dev/null

@@ -1,38 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.columniterator;
-
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-
-public class IdentityQueryFilter extends SliceQueryFilter
-{
-    /**
-     * Will read entire CF into memory.  Use with caution.
-     */
-    public IdentityQueryFilter()
-    {
-        super(Composites.EMPTY, Composites.EMPTY, false, Integer.MAX_VALUE);
-    }
-
-    @Override
-    protected boolean respectTombstoneThresholds()
-    {
-        return false;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/columniterator/LazyColumnIterator.java b/src/java/org/apache/cassandra/db/columniterator/LazyColumnIterator.java
deleted file mode 100644
index 9d1cecb..0000000
--- a/src/java/org/apache/cassandra/db/columniterator/LazyColumnIterator.java
+++ /dev/null

@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.columniterator;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.OnDiskAtom;
-
-import java.io.IOException;
-
-
-/*
- * The goal of this encapsulating OnDiskAtomIterator is to delay the use of
- * the filter until columns are actually queried.
- * The reason for that is get_paged_slice because it change the start of
- * the filter after having seen the first row, and so we must not use the
- * filter before the row data is actually queried. However, mergeIterator
- * needs to "fetch" a row in advance. But all it needs is the key and so
- * this IColumnIterator make sure getKey() can be called without triggering
- * the use of the filter itself.
- */
-public class LazyColumnIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
-{
-    private final DecoratedKey key;
-    private final IColumnIteratorFactory subIteratorFactory;
-
-    private OnDiskAtomIterator subIterator;
-
-    public LazyColumnIterator(DecoratedKey key, IColumnIteratorFactory subIteratorFactory)
-    {
-        this.key = key;
-        this.subIteratorFactory = subIteratorFactory;
-    }
-
-    private OnDiskAtomIterator getSubIterator()
-    {
-        if (subIterator == null)
-            subIterator = subIteratorFactory.create();
-        return subIterator;
-    }
-
-    protected OnDiskAtom computeNext()
-    {
-        getSubIterator();
-        return subIterator.hasNext() ? subIterator.next() : endOfData();
-    }
-
-    public ColumnFamily getColumnFamily()
-    {
-        return getSubIterator().getColumnFamily();
-    }
-
-    public DecoratedKey getKey()
-    {
-        return key;
-    }
-
-    public void close() throws IOException
-    {
-        if (subIterator != null)
-            subIterator.close();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/columniterator/OnDiskAtomIterator.java b/src/java/org/apache/cassandra/db/columniterator/OnDiskAtomIterator.java
deleted file mode 100644
index 21c38f7..0000000
--- a/src/java/org/apache/cassandra/db/columniterator/OnDiskAtomIterator.java
+++ /dev/null

@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.columniterator;
-
-import java.io.IOException;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.OnDiskAtom;
-import org.apache.cassandra.utils.CloseableIterator;
-
-public interface OnDiskAtomIterator extends CloseableIterator<OnDiskAtom>
-{
-    /**
-     * @return A ColumnFamily holding metadata for the row being iterated.
-     * Do not modify this CF. Whether it is empty or not is implementation-dependent.
-     */
-    public abstract ColumnFamily getColumnFamily();
-
-    /**
-     * @return the current row key
-     */
-    public DecoratedKey getKey();
-
-    /** clean up any open resources */
-    public void close() throws IOException;
-}
-

diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
new file mode 100644
index 0000000..47f85ac
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java

@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.columniterator;
+
+import java.io.IOException;
+import java.util.NoSuchElementException;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.util.FileDataInput;
+
+/**
+ *  A Cell Iterator over SSTable
+ */
+public class SSTableIterator extends AbstractSSTableIterator
+{
+    public SSTableIterator(SSTableReader sstable,
+                           DecoratedKey key,
+                           ColumnFilter columns,
+                           boolean isForThrift,
+                           SSTableReadsListener listener)
+    {
+        this(sstable, null, key, sstable.getPosition(key, SSTableReader.Operator.EQ, listener), columns, isForThrift);
+    }
+
+    public SSTableIterator(SSTableReader sstable,
+                           FileDataInput file,
+                           DecoratedKey key,
+                           RowIndexEntry indexEntry,
+                           ColumnFilter columns,
+                           boolean isForThrift)
+    {
+        super(sstable, file, key, indexEntry, columns, isForThrift);
+    }
+
+    protected Reader createReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    {
+        return indexEntry.isIndexed()
+             ? new ForwardIndexedReader(indexEntry, file, shouldCloseFile)
+             : new ForwardReader(file, shouldCloseFile);
+    }
+
+    public boolean isReverseOrder()
+    {
+        return false;
+    }
+
+    private class ForwardReader extends Reader
+    {
+        // The start of the current slice. This will be null as soon as we know we've passed that bound.
+        protected Slice.Bound start;
+        // The end of the current slice. Will never be null.
+        protected Slice.Bound end = Slice.Bound.TOP;
+
+        protected Unfiltered next; // the next element to return: this is computed by hasNextInternal().
+
+        protected boolean sliceDone; // set to true once we know we have no more result for the slice. This is in particular
+                                     // used by the indexed reader when we know we can't have results based on the index.
+
+        private ForwardReader(FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+        }
+
+        public void setForSlice(Slice slice) throws IOException
+        {
+            start = slice.start() == Slice.Bound.BOTTOM ? null : slice.start();
+            end = slice.end();
+
+            sliceDone = false;
+            next = null;
+        }
+
+        // Skip all data that comes before the currently set slice.
+        // Return what should be returned at the end of this, or null if nothing should.
+        private Unfiltered handlePreSliceData() throws IOException
+        {
+            assert deserializer != null;
+
+            // Note that the following comparison is not strict. The reason is that the only cases
+            // where it can be == is if the "next" is a RT start marker (either a '[' of a ')[' boundary),
+            // and if we had a strict inequality and an open RT marker before this, we would issue
+            // the open marker first, and then return then next later, which would send in the
+            // stream both '[' (or '(') and then ')[' for the same clustering value, which is wrong.
+            // By using a non-strict inequality, we avoid that problem (if we do get ')[' for the same
+            // clustering value than the slice, we'll simply record it in 'openMarker').
+            while (deserializer.hasNext() && deserializer.compareNextTo(start) <= 0)
+            {
+                if (deserializer.nextIsRow())
+                    deserializer.skipNext();
+                else
+                    updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
+            }
+
+            Slice.Bound sliceStart = start;
+            start = null;
+
+            // We've reached the beginning of our queried slice. If we have an open marker
+            // we should return that first.
+            if (openMarker != null)
+                return new RangeTombstoneBoundMarker(sliceStart, openMarker);
+
+            return null;
+        }
+
+        // Compute the next element to return, assuming we're in the middle to the slice
+        // and the next element is either in the slice, or just after it. Returns null
+        // if we're done with the slice.
+        protected Unfiltered computeNext() throws IOException
+        {
+            assert deserializer != null;
+
+            while (true)
+            {
+                // We use a same reasoning as in handlePreSliceData regarding the strictness of the inequality below.
+                // We want to exclude deserialized unfiltered equal to end, because 1) we won't miss any rows since those
+                // woudn't be equal to a slice bound and 2) a end bound can be equal to a start bound
+                // (EXCL_END(x) == INCL_START(x) for instance) and in that case we don't want to return start bound because
+                // it's fundamentally excluded. And if the bound is a  end (for a range tombstone), it means it's exactly
+                // our slice end, but in that  case we will properly close the range tombstone anyway as part of our "close
+                // an open marker" code in hasNextInterna
+                if (!deserializer.hasNext() || deserializer.compareNextTo(end) >= 0)
+                    return null;
+
+                Unfiltered next = deserializer.readNext();
+                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                if (next.isEmpty())
+                    continue;
+
+                if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+                    updateOpenMarker((RangeTombstoneMarker) next);
+                return next;
+            }
+        }
+
+        protected boolean hasNextInternal() throws IOException
+        {
+            if (next != null)
+                return true;
+
+            if (sliceDone)
+                return false;
+
+            if (start != null)
+            {
+                Unfiltered unfiltered = handlePreSliceData();
+                if (unfiltered != null)
+                {
+                    next = unfiltered;
+                    return true;
+                }
+            }
+
+            next = computeNext();
+            if (next != null)
+                return true;
+
+            // for current slice, no data read from deserialization
+            sliceDone = true;
+            // If we have an open marker, we should not close it, there could be more slices
+            if (openMarker != null)
+            {
+                next = new RangeTombstoneBoundMarker(end, openMarker);
+                return true;
+            }
+            return false;
+        }
+
+        protected Unfiltered nextInternal() throws IOException
+        {
+            if (!hasNextInternal())
+                throw new NoSuchElementException();
+
+            Unfiltered toReturn = next;
+            next = null;
+            return toReturn;
+        }
+    }
+
+    private class ForwardIndexedReader extends ForwardReader
+    {
+        private final IndexState indexState;
+
+        private int lastBlockIdx; // the last index block that has data for the current query
+
+        private ForwardIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+            this.indexState = new IndexState(this, sstable.metadata.comparator, indexEntry, false);
+            this.lastBlockIdx = indexState.blocksCount(); // if we never call setForSlice, that's where we want to stop
+        }
+
+        @Override
+        public void setForSlice(Slice slice) throws IOException
+        {
+            super.setForSlice(slice);
+
+            // if our previous slicing already got us the biggest row in the sstable, we're done
+            if (indexState.isDone())
+            {
+                sliceDone = true;
+                return;
+            }
+
+            // Find the first index block we'll need to read for the slice.
+            int startIdx = indexState.findBlockIndex(slice.start(), indexState.currentBlockIdx());
+            if (startIdx >= indexState.blocksCount())
+            {
+                sliceDone = true;
+                return;
+            }
+
+            // Find the last index block we'll need to read for the slice.
+            lastBlockIdx = indexState.findBlockIndex(slice.end(), startIdx);
+
+            // If the slice end is before the very first block, we have nothing for that slice
+            if (lastBlockIdx < 0)
+            {
+                assert startIdx < 0;
+                sliceDone = true;
+                return;
+            }
+
+            // If we start before the very first block, just read from the first one.
+            if (startIdx < 0)
+                startIdx = 0;
+
+            // If that's the last block we were reading, we're already where we want to be. Otherwise,
+            // seek to that first block
+            if (startIdx != indexState.currentBlockIdx())
+                indexState.setToBlock(startIdx);
+
+            // The index search is based on the last name of the index blocks, so at that point we have that:
+            //   1) indexes[currentIdx - 1].lastName < slice.start <= indexes[currentIdx].lastName
+            //   2) indexes[lastBlockIdx - 1].lastName < slice.end <= indexes[lastBlockIdx].lastName
+            // so if currentIdx == lastBlockIdx and slice.end < indexes[currentIdx].firstName, we're guaranteed that the
+            // whole slice is between the previous block end and this block start, and thus has no corresponding
+            // data. One exception is if the previous block ends with an openMarker as it will cover our slice
+            // and we need to return it (we also don't skip the slice for the old format because we didn't have the openMarker
+            // info in that case and can't rely on this optimization).
+            if (indexState.currentBlockIdx() == lastBlockIdx
+                && metadata().comparator.compare(slice.end(), indexState.currentIndex().firstName) < 0
+                && openMarker == null
+                && sstable.descriptor.version.storeRows())
+            {
+                sliceDone = true;
+            }
+        }
+
+        @Override
+        protected Unfiltered computeNext() throws IOException
+        {
+            while (true)
+            {
+                // Our previous read might have made us cross an index block boundary. If so, update our informations.
+                // If we read from the beginning of the partition, this is also what will initialize the index state.
+                indexState.updateBlock();
+
+                // Return the next unfiltered unless we've reached the end, or we're beyond our slice
+                // end (note that unless we're on the last block for the slice, there is no point
+                // in checking the slice end).
+                if (indexState.isDone()
+                    || indexState.currentBlockIdx() > lastBlockIdx
+                    || !deserializer.hasNext()
+                    || (indexState.currentBlockIdx() == lastBlockIdx && deserializer.compareNextTo(end) >= 0))
+                    return null;
+
+
+                Unfiltered next = deserializer.readNext();
+                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                if (next.isEmpty())
+                    continue;
+
+                if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+                    updateOpenMarker((RangeTombstoneMarker) next);
+                return next;
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
new file mode 100644
index 0000000..8d3f4f3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java

@@ -0,0 +1,531 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.columniterator;
+
+import java.io.IOException;
+import java.util.*;
+
+import com.google.common.base.Verify;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.btree.BTree;
+
+/**
+ *  A Cell Iterator in reversed clustering order over SSTable
+ */
+public class SSTableReversedIterator extends AbstractSSTableIterator
+{
+    public SSTableReversedIterator(SSTableReader sstable,
+                                   DecoratedKey key,
+                                   ColumnFilter columns,
+                                   boolean isForThrift,
+                                   SSTableReadsListener listener)
+    {
+        this(sstable, null, key, sstable.getPosition(key, SSTableReader.Operator.EQ, listener), columns, isForThrift);
+    }
+
+    public SSTableReversedIterator(SSTableReader sstable,
+                                   FileDataInput file,
+                                   DecoratedKey key,
+                                   RowIndexEntry indexEntry,
+                                   ColumnFilter columns,
+                                   boolean isForThrift)
+    {
+        super(sstable, file, key, indexEntry, columns, isForThrift);
+    }
+
+    protected Reader createReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    {
+        return indexEntry.isIndexed()
+             ? new ReverseIndexedReader(indexEntry, file, shouldCloseFile)
+             : new ReverseReader(file, shouldCloseFile);
+    }
+
+    public boolean isReverseOrder()
+    {
+        return true;
+    }
+
+    private class ReverseReader extends Reader
+    {
+        protected ReusablePartitionData buffer;
+        protected Iterator<Unfiltered> iterator;
+
+        // Set in loadFromDisk () and used in setIterator to handle range tombstone extending on multiple index block. See
+        // loadFromDisk for details. Note that those are always false for non-indexed readers.
+        protected boolean skipFirstIteratedItem;
+        protected boolean skipLastIteratedItem;
+
+        protected Unfiltered mostRecentlyEmitted = null;
+
+        private ReverseReader(FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+        }
+
+        protected ReusablePartitionData createBuffer(int blocksCount)
+        {
+            int estimatedRowCount = 16;
+            int columnCount = metadata().partitionColumns().regulars.size();
+            if (columnCount == 0 || metadata().clusteringColumns().isEmpty())
+            {
+                estimatedRowCount = 1;
+            }
+            else
+            {
+                try
+                {
+                    // To avoid wasted resizing we guess-estimate the number of rows we're likely to read. For that
+                    // we use the stats on the number of rows per partition for that sstable.
+                    // FIXME: so far we only keep stats on cells, so to get a rough estimate on the number of rows,
+                    // we divide by the number of regular columns the table has. We should fix once we collect the
+                    // stats on rows
+                    int estimatedRowsPerPartition = (int)(sstable.getEstimatedColumnCount().percentile(0.75) / columnCount);
+                    estimatedRowCount = Math.max(estimatedRowsPerPartition / blocksCount, 1);
+                }
+                catch (IllegalStateException e)
+                {
+                    // The EstimatedHistogram mean() method can throw this (if it overflows). While such overflow
+                    // shouldn't happen, it's not worth taking the risk of letting the exception bubble up.
+                }
+            }
+            return new ReusablePartitionData(metadata(), partitionKey(), columns(), estimatedRowCount);
+        }
+
+        public void setForSlice(Slice slice) throws IOException
+        {
+            // If we have read the data, just create the iterator for the slice. Otherwise, read the data.
+            if (buffer == null)
+            {
+                buffer = createBuffer(1);
+                // Note that we can reuse that buffer between slices (we could alternatively re-read from disk
+                // every time, but that feels more wasteful) so we want to include everything from the beginning.
+                // We can stop at the slice end however since any following slice will be before that.
+                loadFromDisk(null, slice.end(), false, false, null, null);
+            }
+            setIterator(slice);
+        }
+
+        protected void setIterator(Slice slice)
+        {
+            assert buffer != null;
+            iterator = buffer.built.unfilteredIterator(columns, Slices.with(metadata().comparator, slice), true);
+
+            if (!iterator.hasNext())
+                return;
+
+            if (skipFirstIteratedItem)
+                iterator.next();
+
+            if (skipLastIteratedItem)
+                iterator = new SkipLastIterator(iterator);
+        }
+
+        protected boolean hasNextInternal() throws IOException
+        {
+            // If we've never called setForSlice, we're reading everything
+            if (iterator == null)
+                setForSlice(Slice.ALL);
+
+            return iterator.hasNext();
+        }
+
+        protected Unfiltered nextInternal() throws IOException
+        {
+            if (!hasNext())
+                throw new NoSuchElementException();
+            Unfiltered next = iterator.next();
+            mostRecentlyEmitted = next;
+            return next;
+        }
+
+        protected boolean stopReadingDisk()
+        {
+            return false;
+        }
+
+        // checks if left prefix precedes right prefix
+        private boolean precedes(ClusteringPrefix left, ClusteringPrefix right)
+        {
+            return metadata().comparator.compare(left, right) < 0;
+        }
+
+        // Reads the unfiltered from disk and load them into the reader buffer. It stops reading when either the partition
+        // is fully read, or when stopReadingDisk() returns true.
+        protected void loadFromDisk(Slice.Bound start,
+                                    Slice.Bound end,
+                                    boolean hasPreviousBlock,
+                                    boolean hasNextBlock,
+                                    ClusteringPrefix currentFirstName,
+                                    ClusteringPrefix nextLastName) throws IOException
+        {
+            // start != null means it's the block covering the beginning of the slice, so it has to be the last block for this slice.
+            assert start == null || !hasNextBlock;
+
+            buffer.reset();
+            skipFirstIteratedItem = false;
+            skipLastIteratedItem = false;
+
+            boolean isFirst = true;
+
+            // If the start might be in this block, skip everything that comes before it.
+            if (start != null)
+            {
+                while (deserializer.hasNext() && deserializer.compareNextTo(start) <= 0 && !stopReadingDisk())
+                {
+                    isFirst = false;
+                    if (deserializer.nextIsRow())
+                        deserializer.skipNext();
+                    else
+                        updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
+                }
+            }
+
+            // If we have an open marker, it's either one from what we just skipped or it's one that open in the next (or
+            // one of the next) index block (if openMarker == openMarkerAtStartOfBlock).
+            if (openMarker != null)
+            {
+                // We have to feed a marker to the buffer, because that marker is likely to be close later and ImmtableBTreePartition
+                // doesn't take kindly to marker that comes without their counterpart. If that's the last block we're gonna read (for
+                // the current slice at least) it's easy because we'll want to return that open marker at the end of the data in this
+                // block anyway, so we have nothing more to do than adding it to the buffer.
+                // If it's not the last block however, in which case we know we'll have start == null, it means this marker is really
+                // open in a next block and so while we do need to add it the buffer for the reason mentioned above, we don't
+                // want to "return" it just yet, we'll wait until we reach it in the next blocks. That's why we trigger
+                // skipLastIteratedItem in that case (this is first item of the block, but we're iterating in reverse order
+                // so it will be last returned by the iterator).
+                RangeTombstone.Bound markerStart = start == null ? RangeTombstone.Bound.BOTTOM : RangeTombstone.Bound.fromSliceBound(start);
+                buffer.add(new RangeTombstoneBoundMarker(markerStart, openMarker));
+                if (hasNextBlock)
+                    skipLastIteratedItem = true;
+            }
+
+            // Now deserialize everything until we reach our requested end (if we have one)
+            // See SSTableIterator.ForwardRead.computeNext() for why this is a strict inequality below: this is the same
+            // reasoning here.
+            while (deserializer.hasNext()
+                   && (end == null || deserializer.compareNextTo(end) < 0)
+                   && !stopReadingDisk())
+            {
+                Unfiltered unfiltered = deserializer.readNext();
+
+                if (isFirst && openMarker == null
+                    && currentFirstName != null && nextLastName != null
+                    && (precedes(currentFirstName, nextLastName) || precedes(unfiltered.clustering(), currentFirstName)))
+                {
+                    // Range tombstones spanning multiple index blocks when reading legacy sstables need special handling.
+                    // Pre-3.0, the column index didn't encode open markers. Instead, open range tombstones were rewritten
+                    // at the start of index blocks they at least partially covered. These rewritten RTs found at the
+                    // beginning of index blocks need to be handled as though they were an open marker, otherwise iterator
+                    // validation will fail and/or some rows will be excluded from the result. These rewritten RTs can be
+                    // detected based on their relation to the current index block and the next one depending on what wrote
+                    // the sstable. For sstables coming from a memtable flush, a rewritten RT will have a clustering value
+                    // less than the first name of its index block. For sstables coming from compaction, the index block
+                    // first name will be the RT open bound, which will be less than the last name of the next block. So,
+                    // here we compare the first name of this block to the last name of the next block to detect the
+                    // compaction case, and clustering value of the unfiltered we just read to the index block's first name
+                    // to detect the flush case.
+                    Verify.verify(!sstable.descriptor.version.storeRows());
+                    Verify.verify(openMarker == null);
+                    Verify.verify(!skipLastIteratedItem);
+                    Verify.verify(unfiltered.isRangeTombstoneMarker());
+                    buffer.add(unfiltered);
+                    if (hasNextBlock)
+                        skipLastIteratedItem = true;
+                }
+                else if (isFirst && nextLastName != null && !precedes(nextLastName, unfiltered.clustering()))
+                {
+                    // When dealing with old format sstable, we have the problem that a row can span 2 index block, i.e. it can
+                    // start at the end of a block and end at the beginning of the next one. That's not a problem per se for
+                    // UnfilteredDeserializer.OldFormatSerializer, since it always read rows entirely, even if they span index
+                    // blocks, but as we reading index block in reverse we must be careful to not read the end of the row at
+                    // beginning of a block before we're reading the beginning of that row. So what we do is that if we detect
+                    // that the row starting this block is also the row ending the next one we're read (previous on disk), then
+                    // we'll skip that first result and  let it be read with the next block.
+                    Verify.verify(!sstable.descriptor.version.storeRows());
+                    isFirst = false;
+                }
+                else if (unfiltered.isEmpty())
+                {
+                    isFirst = false;
+                }
+                else
+                {
+                    buffer.add(unfiltered);
+                    isFirst = false;
+                }
+
+                if (unfiltered.isRangeTombstoneMarker())
+                    updateOpenMarker((RangeTombstoneMarker)unfiltered);
+            }
+
+            if (!sstable.descriptor.version.storeRows()
+                && deserializer.hasNext()
+                && (end == null || deserializer.compareNextTo(end) < 0))
+            {
+                // Range tombstone start and end bounds are stored together in legacy sstables. When we read one, we
+                // stash the closing bound until we reach the appropriate place to emit it, which is immediately before
+                // the next unfiltered with a greater clustering.
+                // If SSTRI considers the block exhausted before encountering such a clustering though, this end marker
+                // will never be emitted. So here we just check if there's a closing bound left in the deserializer.
+                // If there is, we compare it against the most recently emitted unfiltered (i.e.: the last unfiltered
+                // that this RT would enclose. And we have to do THAT comparison because the last name field on the
+                // current index block will be whatever was written at the end of the index block (i.e. the last name
+                // physically in the block), not the closing bound of the range tombstone (i.e. the last name logically
+                // in the block). If all this indicates that there is indeed a range tombstone we're missing, we add it
+                // to the buffer and update the open marker field.
+                Unfiltered unfiltered = deserializer.readNext();
+                RangeTombstoneMarker marker = unfiltered.isRangeTombstoneMarker() ? (RangeTombstoneMarker) unfiltered : null;
+                if (marker != null && marker.isClose(false)
+                    && (mostRecentlyEmitted == null || precedes(marker.clustering(), mostRecentlyEmitted.clustering())))
+                {
+                    buffer.add(marker);
+                    updateOpenMarker(marker);
+                }
+            }
+
+            // If we have an open marker, we should close it before finishing
+            if (openMarker != null)
+            {
+                // This is the reverse problem than the one at the start of the block. Namely, if it's the first block
+                // we deserialize for the slice (the one covering the slice end basically), then it's easy, we just want
+                // to add the close marker to the buffer and return it normally.
+                // If it's note our first block (for the slice) however, it means that marker closed in a previously read
+                // block and we have already returned it. So while we should still add it to the buffer for the sake of
+                // not breaking ImmutableBTreePartition, we should skip it when returning from the iterator, hence the
+                // skipFirstIteratedItem (this is the last item of the block, but we're iterating in reverse order so it will
+                // be the first returned by the iterator).
+                RangeTombstone.Bound markerEnd = end == null ? RangeTombstone.Bound.TOP : RangeTombstone.Bound.fromSliceBound(end);
+                buffer.add(new RangeTombstoneBoundMarker(markerEnd, openMarker));
+                if (hasPreviousBlock)
+                    skipFirstIteratedItem = true;
+            }
+
+            buffer.build();
+        }
+    }
+
+    private class ReverseIndexedReader extends ReverseReader
+    {
+        private final IndexState indexState;
+
+        // The slice we're currently iterating over
+        private Slice slice;
+        // The last index block to consider for the slice
+        private int lastBlockIdx;
+
+        private ReverseIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+            this.indexState = new IndexState(this, sstable.metadata.comparator, indexEntry, true);
+        }
+
+        @Override
+        public void setForSlice(Slice slice) throws IOException
+        {
+            this.slice = slice;
+
+            // if our previous slicing already got us past the beginning of the sstable, we're done
+            if (indexState.isDone())
+            {
+                iterator = Collections.emptyIterator();
+                return;
+            }
+
+            // Find the first index block we'll need to read for the slice.
+            int startIdx = indexState.findBlockIndex(slice.end(), indexState.currentBlockIdx());
+            if (startIdx < 0)
+            {
+                iterator = Collections.emptyIterator();
+                indexState.setToBlock(startIdx);
+                return;
+            }
+
+            lastBlockIdx = indexState.findBlockIndex(slice.start(), startIdx);
+
+            // If the last block to look (in reverse order) is after the very last block, we have nothing for that slice
+            if (lastBlockIdx >= indexState.blocksCount())
+            {
+                assert startIdx >= indexState.blocksCount();
+                iterator = Collections.emptyIterator();
+                return;
+            }
+
+            // If we start (in reverse order) after the very last block, just read from the last one.
+            if (startIdx >= indexState.blocksCount())
+                startIdx = indexState.blocksCount() - 1;
+
+            // Note that even if we were already set on the proper block (which would happen if the previous slice
+            // requested ended on the same block this one start), we can't reuse it because when reading the previous
+            // slice we've only read that block from the previous slice start. Re-reading also handles
+            // skipFirstIteratedItem/skipLastIteratedItem that we would need to handle otherwise.
+            indexState.setToBlock(startIdx);
+
+            readCurrentBlock(false, startIdx != lastBlockIdx);
+        }
+
+        @Override
+        protected boolean hasNextInternal() throws IOException
+        {
+            if (super.hasNextInternal())
+                return true;
+
+            while (true)
+            {
+                // We have nothing more for our current block, move the next one (so the one before on disk).
+                int nextBlockIdx = indexState.currentBlockIdx() - 1;
+                if (nextBlockIdx < 0 || nextBlockIdx < lastBlockIdx)
+                    return false;
+
+                // The slice start can be in
+                indexState.setToBlock(nextBlockIdx);
+                readCurrentBlock(true, nextBlockIdx != lastBlockIdx);
+
+                // If an indexed block only contains data for a dropped column, the iterator will be empty, even
+                // though we may still have data to read in subsequent blocks
+
+                // also, for pre-3.0 storage formats, index blocks that only contain a single row and that row crosses
+                // index boundaries, the iterator will be empty even though we haven't read everything we're intending
+                // to read. In that case, we want to read the next index block. This shouldn't be possible in 3.0+
+                // formats (see next comment)
+                if (!iterator.hasNext() && nextBlockIdx > lastBlockIdx)
+                {
+                    continue;
+                }
+
+                return iterator.hasNext();
+            }
+        }
+
+        /**
+         * Reads the current block, the last one we've set.
+         *
+         * @param hasPreviousBlock is whether we have already read a previous block for the current slice.
+         * @param hasNextBlock is whether we have more blocks to read for the current slice.
+         */
+        private void readCurrentBlock(boolean hasPreviousBlock, boolean hasNextBlock) throws IOException
+        {
+            if (buffer == null)
+                buffer = createBuffer(indexState.blocksCount());
+
+            int currentBlock = indexState.currentBlockIdx();
+
+            // The slice start (resp. slice end) is only meaningful on the last (resp. first) block read (since again,
+            // we read blocks in reverse order).
+            boolean canIncludeSliceStart = !hasNextBlock;
+            boolean canIncludeSliceEnd = !hasPreviousBlock;
+
+            ClusteringPrefix currentFirstName = null;
+            ClusteringPrefix nextLastName = null;
+            if (!sstable.descriptor.version.storeRows() && currentBlock > 0)
+            {
+                currentFirstName = indexState.index(currentBlock).firstName;
+                nextLastName = indexState.index(currentBlock - 1).lastName;
+            }
+
+            loadFromDisk(canIncludeSliceStart ? slice.start() : null,
+                         canIncludeSliceEnd ? slice.end() : null,
+                         hasPreviousBlock,
+                         hasNextBlock,
+                         currentFirstName,
+                         nextLastName
+            );
+            setIterator(slice);
+        }
+
+        @Override
+        protected boolean stopReadingDisk()
+        {
+            return indexState.isPastCurrentBlock();
+        }
+    }
+
+    private class ReusablePartitionData
+    {
+        private final CFMetaData metadata;
+        private final DecoratedKey partitionKey;
+        private final PartitionColumns columns;
+
+        private MutableDeletionInfo.Builder deletionBuilder;
+        private MutableDeletionInfo deletionInfo;
+        private BTree.Builder<Row> rowBuilder;
+        private ImmutableBTreePartition built;
+
+        private ReusablePartitionData(CFMetaData metadata,
+                                      DecoratedKey partitionKey,
+                                      PartitionColumns columns,
+                                      int initialRowCapacity)
+        {
+            this.metadata = metadata;
+            this.partitionKey = partitionKey;
+            this.columns = columns;
+            this.rowBuilder = BTree.builder(metadata.comparator, initialRowCapacity);
+        }
+
+
+        public void add(Unfiltered unfiltered)
+        {
+            if (unfiltered.isRow())
+                rowBuilder.add((Row)unfiltered);
+            else
+                deletionBuilder.add((RangeTombstoneMarker)unfiltered);
+        }
+
+        public void reset()
+        {
+            built = null;
+            rowBuilder.reuse();
+            deletionBuilder = MutableDeletionInfo.builder(partitionLevelDeletion, metadata().comparator, false);
+        }
+
+        public void build()
+        {
+            deletionInfo = deletionBuilder.build();
+            built = new ImmutableBTreePartition(metadata, partitionKey, columns, Rows.EMPTY_STATIC_ROW, rowBuilder.build(),
+                                                deletionInfo, EncodingStats.NO_STATS);
+            deletionBuilder = null;
+        }
+    }
+
+    private static class SkipLastIterator extends AbstractIterator<Unfiltered>
+    {
+        private final Iterator<Unfiltered> iterator;
+
+        private SkipLastIterator(Iterator<Unfiltered> iterator)
+        {
+            this.iterator = iterator;
+        }
+
+        protected Unfiltered computeNext()
+        {
+            if (!iterator.hasNext())
+                return endOfData();
+
+            Unfiltered next = iterator.next();
+            return iterator.hasNext() ? next : endOfData();
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
index 702ace5..0845bd5 100644
--- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java

@@ -17,6 +17,11 @@
  */
 package org.apache.cassandra.db.commitlog;
 
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
+import org.apache.cassandra.utils.Clock;
+import org.apache.cassandra.utils.NoSpamLogger;
 import org.apache.cassandra.utils.concurrent.WaitQueue;
 import org.slf4j.*;
 
@@ -24,14 +29,17 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
-import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
+import com.google.common.annotations.VisibleForTesting;
 
 public abstract class AbstractCommitLogService
 {
-    // how often should we log syngs that lag behind our desired period
-    private static final long LAG_REPORT_INTERVAL = TimeUnit.MINUTES.toMillis(5);
+    /**
+     * When in {@link Config.CommitLogSync#periodic} mode, the default number of milliseconds to wait between updating
+     * the commit log chained markers.
+     */
+    static final long DEFAULT_MARKER_INTERVAL_MILLIS = 100;
 
-    private Thread thread;
+    private volatile Thread thread;
     private volatile boolean shutdown = false;
 
     // all Allocations written before this time will be synced
@@ -47,7 +55,24 @@
 
     final CommitLog commitLog;
     private final String name;
-    private final long pollIntervalMillis;
+
+    /**
+     * The duration between syncs to disk.
+     */
+    final long syncIntervalMillis;
+
+    /**
+     * The duration between updating the chained markers in the the commit log file. This value should be
+     * 0 < {@link #markerIntervalMillis} <= {@link #syncIntervalMillis}.
+     */
+    final long markerIntervalMillis;
+
+    /**
+     * A flag that callers outside of the sync thread can use to signal they want the commitlog segments
+     * to be flushed to disk. Note: this flag is primarily to support commit log's batch mode, which requires
+     * an immediate flush to disk on every mutation; see {@link BatchCommitLogService#maybeWaitForSync(Allocation)}.
+     */
+    private volatile boolean syncRequested;
 
     private static final Logger logger = LoggerFactory.getLogger(AbstractCommitLogService.class);
 
@@ -57,105 +82,196 @@
      *
      * Subclasses may be notified when a sync finishes by using the syncComplete WaitQueue.
      */
-    AbstractCommitLogService(final CommitLog commitLog, final String name, final long pollIntervalMillis)
+    AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis)
+    {
+        this (commitLog, name, syncIntervalMillis, false);
+    }
+
+    /**
+     * CommitLogService provides a fsync service for Allocations, fulfilling either the
+     * Batch or Periodic contract.
+     *
+     * Subclasses may be notified when a sync finishes by using the syncComplete WaitQueue.
+     *
+     * @param markHeadersFaster true if the chained markers should be updated more frequently than on the disk sync bounds.
+     */
+    AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, boolean markHeadersFaster)
     {
         this.commitLog = commitLog;
         this.name = name;
-        this.pollIntervalMillis = pollIntervalMillis;
+
+        if (markHeadersFaster && syncIntervalMillis > DEFAULT_MARKER_INTERVAL_MILLIS)
+        {
+            markerIntervalMillis = DEFAULT_MARKER_INTERVAL_MILLIS;
+            long modulo = syncIntervalMillis % markerIntervalMillis;
+            if (modulo != 0)
+            {
+                // quantize syncIntervalMillis to a multiple of markerIntervalMillis
+                syncIntervalMillis -= modulo;
+
+                if (modulo >= markerIntervalMillis / 2)
+                    syncIntervalMillis += markerIntervalMillis;
+            }
+            logger.debug("Will update the commitlog markers every {}ms and flush every {}ms", markerIntervalMillis, syncIntervalMillis);
+        }
+        else
+        {
+            markerIntervalMillis = syncIntervalMillis;
+        }
+
+        assert syncIntervalMillis % markerIntervalMillis == 0;
+        this.syncIntervalMillis = syncIntervalMillis;
     }
 
     // Separated into individual method to ensure relevant objects are constructed before this is started.
     void start()
     {
-        if (pollIntervalMillis < 1)
-            throw new IllegalArgumentException(String.format("Commit log flush interval must be positive: %dms", pollIntervalMillis));
-
-        Runnable runnable = new Runnable()
-        {
-            public void run()
-            {
-                long firstLagAt = 0;
-                long totalSyncDuration = 0; // total time spent syncing since firstLagAt
-                long syncExceededIntervalBy = 0; // time that syncs exceeded pollInterval since firstLagAt
-                int lagCount = 0;
-                int syncCount = 0;
-
-                boolean run = true;
-                while (run)
-                {
-                    try
-                    {
-                        // always run once after shutdown signalled
-                        run = !shutdown;
-
-                        // sync and signal
-                        long syncStarted = System.currentTimeMillis();
-                        commitLog.sync(shutdown);
-                        lastSyncedAt = syncStarted;
-                        syncComplete.signalAll();
-
-
-                        // sleep any time we have left before the next one is due
-                        long now = System.currentTimeMillis();
-                        long sleep = syncStarted + pollIntervalMillis - now;
-                        if (sleep < 0)
-                        {
-                            // if we have lagged noticeably, update our lag counter
-                            if (firstLagAt == 0)
-                            {
-                                firstLagAt = now;
-                                totalSyncDuration = syncExceededIntervalBy = syncCount = lagCount = 0;
-                            }
-                            syncExceededIntervalBy -= sleep;
-                            lagCount++;
-                        }
-                        syncCount++;
-                        totalSyncDuration += now - syncStarted;
-
-                        if (firstLagAt > 0 && now - firstLagAt >= LAG_REPORT_INTERVAL)
-                        {
-                            logger.warn(String.format("Out of %d commit log syncs over the past %ds with average duration of %.2fms, %d have exceeded the configured commit interval by an average of %.2fms",
-                                                      syncCount, (now - firstLagAt) / 1000, (double) totalSyncDuration / syncCount, lagCount, (double) syncExceededIntervalBy / lagCount));
-                            firstLagAt = 0;
-                        }
-
-                        // if we have lagged this round, we probably have work to do already so we don't sleep
-                        if (sleep < 0 || !run)
-                            continue;
-
-                        try
-                        {
-                            haveWork.tryAcquire(sleep, TimeUnit.MILLISECONDS);
-                            haveWork.drainPermits();
-                        }
-                        catch (InterruptedException e)
-                        {
-                            throw new AssertionError();
-                        }
-                    }
-                    catch (Throwable t)
-                    {
-                        if (!CommitLog.handleCommitError("Failed to persist commits to disk", t))
-                            break;
-
-                        // sleep for full poll-interval after an error, so we don't spam the log file
-                        try
-                        {
-                            haveWork.tryAcquire(pollIntervalMillis, TimeUnit.MILLISECONDS);
-                        }
-                        catch (InterruptedException e)
-                        {
-                            throw new AssertionError();
-                        }
-                    }
-                }
-            }
-        };
-
-        thread = new Thread(runnable, name);
+        if (syncIntervalMillis < 1)
+            throw new IllegalArgumentException(String.format("Commit log flush interval must be positive: %dms",
+                                                             syncIntervalMillis));
+        shutdown = false;
+        Runnable runnable = new SyncRunnable(new Clock());
+        thread = new Thread(NamedThreadFactory.threadLocalDeallocator(runnable), name);
         thread.start();
     }
 
+    class SyncRunnable implements Runnable
+    {
+        final Clock clock;
+        long firstLagAt = 0;
+        long totalSyncDuration = 0; // total time spent syncing since firstLagAt
+        long syncExceededIntervalBy = 0; // time that syncs exceeded pollInterval since firstLagAt
+        int lagCount = 0;
+        int syncCount = 0;
+
+        SyncRunnable(Clock clock)
+        {
+            this.clock = clock;
+        }
+
+        public void run()
+        {
+            while (true)
+            {
+                if (!sync())
+                    break;
+            }
+        }
+
+        boolean sync()
+        {
+            try
+            {
+                // always run once after shutdown signalled
+                boolean run = !shutdown;
+
+                // sync and signal
+                long pollStarted = clock.currentTimeMillis();
+                boolean flushToDisk = lastSyncedAt + syncIntervalMillis <= pollStarted || shutdown || syncRequested;
+                if (flushToDisk)
+                {
+                    // in this branch, we want to flush the commit log to disk
+                    syncRequested = false;
+                    commitLog.sync(shutdown, true);
+                    lastSyncedAt = pollStarted;
+                    syncComplete.signalAll();
+                    syncCount++;
+                }
+                else
+                {
+                    // in this branch, just update the commit log sync headers
+                    commitLog.sync(false, false);
+                }
+
+                long now = clock.currentTimeMillis();
+                if (flushToDisk)
+                    maybeLogFlushLag(pollStarted, now);
+
+                if (!run)
+                    return false;
+
+                // if we have lagged this round, we probably have work to do already so we don't sleep
+                long sleep = pollStarted + markerIntervalMillis - now;
+                if (sleep < 0)
+                    return true;
+
+                try
+                {
+                    haveWork.tryAcquire(sleep, TimeUnit.MILLISECONDS);
+                    haveWork.drainPermits();
+                }
+                catch (InterruptedException e)
+                {
+                    throw new AssertionError();
+                }
+            }
+            catch (Throwable t)
+            {
+                if (!CommitLog.handleCommitError("Failed to persist commits to disk", t))
+                    return false;
+
+                // sleep for full poll-interval after an error, so we don't spam the log file
+                try
+                {
+                    haveWork.tryAcquire(markerIntervalMillis, TimeUnit.MILLISECONDS);
+                }
+                catch (InterruptedException e)
+                {
+                    throw new AssertionError();
+                }
+            }
+            return true;
+        }
+
+        /**
+         * Add a log entry whenever the time to flush the commit log to disk exceeds {@link #syncIntervalMillis}.
+         */
+        @VisibleForTesting
+        boolean maybeLogFlushLag(long pollStarted, long now)
+        {
+            long flushDuration = now - pollStarted;
+            totalSyncDuration += flushDuration;
+
+            // this is the timestamp by which we should have completed the flush
+            long maxFlushTimestamp = pollStarted + syncIntervalMillis;
+            if (maxFlushTimestamp > now)
+                return false;
+
+            // if we have lagged noticeably, update our lag counter
+            if (firstLagAt == 0)
+            {
+                firstLagAt = now;
+                syncExceededIntervalBy = lagCount = 0;
+                syncCount = 1;
+                totalSyncDuration = flushDuration;
+            }
+            syncExceededIntervalBy += now - maxFlushTimestamp;
+            lagCount++;
+
+            if (firstLagAt > 0)
+            {
+                //Only reset the lag tracking if it actually logged this time
+                boolean logged = NoSpamLogger.log(
+                logger,
+                NoSpamLogger.Level.WARN,
+                5,
+                TimeUnit.MINUTES,
+                "Out of {} commit log syncs over the past {}s with average duration of {}ms, {} have exceeded the configured commit interval by an average of {}ms",
+                syncCount, (now - firstLagAt) / 1000, String.format("%.2f", (double) totalSyncDuration / syncCount), lagCount, String.format("%.2f", (double) syncExceededIntervalBy / lagCount));
+                if (logged)
+                    firstLagAt = 0;
+            }
+            return true;
+        }
+
+        @VisibleForTesting
+        long getTotalSyncDuration()
+        {
+            return totalSyncDuration;
+        }
+    }
+
+
     /**
      * Block for @param alloc to be sync'd as necessary, and handle bookkeeping
      */
@@ -173,10 +289,16 @@
     public WaitQueue.Signal requestExtraSync()
     {
         WaitQueue.Signal signal = syncComplete.register();
-        haveWork.release(1);
+        requestSync();
         return signal;
     }
 
+    protected void requestSync()
+    {
+        syncRequested = true;
+        haveWork.release(1);
+    }
+
     public void shutdown()
     {
         shutdown = true;

diff --git a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
index ceb5d64..c0e6afc 100644
--- a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java

@@ -30,7 +30,7 @@
     {
         // wait until record has been safely persisted to disk
         pending.incrementAndGet();
-        haveWork.release();
+        requestSync();
         alloc.awaitDiskSync(commitLog.metrics.waitingOnCommit);
         pending.decrementAndGet();
     }

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
index 6dd519a..18511e4 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java

@@ -20,6 +20,7 @@
 import java.io.*;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.zip.CRC32;
 
 import com.google.common.annotations.VisibleForTesting;
 
@@ -28,25 +29,24 @@
 
 import org.apache.commons.lang3.StringUtils;
 
-import com.github.tjake.ICRC32;
-
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.ParameterizedClass;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.io.FSWriteError;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.io.compress.ICompressor;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputBufferFixed;
 import org.apache.cassandra.metrics.CommitLogMetrics;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.CRC32Factory;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.MBeanWrapper;
 
 import static org.apache.cassandra.db.commitlog.CommitLogSegment.*;
+import static org.apache.cassandra.utils.FBUtilities.updateChecksum;
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
 
 /*
  * Commit Log tracks every write operation into the system. The aim of the commit log is to be able to
@@ -60,7 +60,7 @@
 
     // we only permit records HALF the size of a commit log, to ensure we don't spin allocating many mostly
     // empty segments when writing large records
-    private final long MAX_MUTATION_SIZE = DatabaseDescriptor.getCommitLogSegmentSize() >> 1;
+    private final long MAX_MUTATION_SIZE = DatabaseDescriptor.getMaxMutationSize();
 
     public final CommitLogSegmentManager allocator;
     public final CommitLogArchiver archiver;
@@ -70,7 +70,7 @@
     volatile Configuration configuration;
     final public String location;
 
-    static private CommitLog construct()
+    private static CommitLog construct()
     {
         CommitLog log = new CommitLog(DatabaseDescriptor.getCommitLogLocation(), CommitLogArchiver.construct());
 
@@ -139,6 +139,7 @@
 
         File[] files = new File(DatabaseDescriptor.getCommitLogLocation()).listFiles(unmanagedFilesFilter);
         int replayed = 0;
+        allocator.enableReserveSegmentCreation();
         if (files.length == 0)
         {
             logger.info("No commitlog files found; skipping replay");
@@ -154,7 +155,6 @@
                 allocator.recycleSegment(f);
         }
 
-        allocator.enableReserveSegmentCreation();
         return replayed;
     }
 
@@ -209,14 +209,14 @@
     /**
      * Forces a disk flush on the commit log files that need it.  Blocking.
      */
-    public void sync(boolean syncAllSegments)
+    public void sync(boolean syncAllSegments, boolean flush)
     {
         CommitLogSegment current = allocator.allocatingFrom();
         for (CommitLogSegment segment : allocator.getActiveSegments())
         {
             if (!syncAllSegments && segment.id > current.id)
                 return;
-            segment.sync();
+            segment.sync(flush);
         }
     }
 
@@ -237,9 +237,9 @@
     {
         assert mutation != null;
 
-        long size = Mutation.serializer.serializedSize(mutation, MessagingService.current_version);
+        int size = (int) Mutation.serializer.serializedSize(mutation, MessagingService.current_version);
 
-        long totalSize = size + ENTRY_OVERHEAD_SIZE;
+        int totalSize = size + ENTRY_OVERHEAD_SIZE;
         if (totalSize > MAX_MUTATION_SIZE)
         {
             throw new IllegalArgumentException(String.format("Mutation of %s bytes is too large for the maximum size of %s",
@@ -247,20 +247,19 @@
         }
 
         Allocation alloc = allocator.allocate(mutation, (int) totalSize);
-        ICRC32 checksum = CRC32Factory.instance.create();
+        CRC32 checksum = new CRC32();
         final ByteBuffer buffer = alloc.getBuffer();
         try (BufferedDataOutputStreamPlus dos = new DataOutputBufferFixed(buffer))
         {
             // checksummed length
-            dos.writeInt((int) size);
-            checksum.update(buffer, buffer.position() - 4, 4);
-            buffer.putInt(checksum.getCrc());
+            dos.writeInt(size);
+            updateChecksumInt(checksum, size);
+            buffer.putInt((int) checksum.getValue());
 
-            int start = buffer.position();
             // checksummed mutation
             Mutation.serializer.serialize(mutation, dos, MessagingService.current_version);
-            checksum.update(buffer, start, (int) size);
-            buffer.putInt(checksum.getCrc());
+            updateChecksum(checksum, buffer, buffer.position() - size, size);
+            buffer.putInt((int) checksum.getValue());
         }
         catch (IOException e)
         {
@@ -280,11 +279,12 @@
      * given. Discards any commit log segments that are no longer used.
      *
      * @param cfId    the column family ID that was flushed
-     * @param context the replay position of the flush
+     * @param lowerBound the lowest covered replay position of the flush
+     * @param lowerBound the highest covered replay position of the flush
      */
-    public void discardCompletedSegments(final UUID cfId, final ReplayPosition context)
+    public void discardCompletedSegments(final UUID cfId, final ReplayPosition lowerBound, final ReplayPosition upperBound)
     {
-        logger.trace("discard completed log segments for {}, table {}", context, cfId);
+        logger.trace("discard completed log segments for {}-{}, table {}", lowerBound, upperBound, cfId);
 
         // Go thru the active segment files, which are ordered oldest to newest, marking the
         // flushed CF as clean, until we reach the segment file containing the ReplayPosition passed
@@ -293,7 +293,7 @@
         for (Iterator<CommitLogSegment> iter = allocator.getActiveSegments().iterator(); iter.hasNext();)
         {
             CommitLogSegment segment = iter.next();
-            segment.markClean(cfId, context);
+            segment.markClean(cfId, lowerBound, upperBound);
 
             if (segment.isUnused())
             {
@@ -302,13 +302,14 @@
             }
             else
             {
-                logger.trace("Not safe to delete{} commit log segment {}; dirty is {}",
-                        (iter.hasNext() ? "" : " active"), segment, segment.dirtyString());
+                if (logger.isTraceEnabled())
+                    logger.trace("Not safe to delete{} commit log segment {}; dirty is {}",
+                            (iter.hasNext() ? "" : " active"), segment, segment.dirtyString());
             }
 
             // Don't mark or try to delete any newer segments once we've reached the one containing the
             // position of the flush.
-            if (segment.contains(context))
+            if (segment.contains(upperBound))
                 break;
         }
     }
@@ -426,7 +427,7 @@
      */
     public void resetConfiguration()
     {
-        this.configuration = new Configuration(DatabaseDescriptor.getCommitLogCompression());
+        configuration = new Configuration(DatabaseDescriptor.getCommitLogCompression());
     }
 
     /**
@@ -500,7 +501,7 @@
         public Configuration(ParameterizedClass compressorClass)
         {
             this.compressorClass = compressorClass;
-            this.compressor = compressorClass != null ? CompressionParameters.createCompressor(compressorClass) : null;
+            this.compressor = compressorClass != null ? CompressionParams.createCompressor(compressorClass) : null;
         }
 
         /**

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java
index d9a511e..5547d0e 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogArchiver.java

@@ -33,8 +33,7 @@
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.WrappedRunnable;
 import org.slf4j.Logger;
@@ -223,13 +222,13 @@
                     descriptor = fromHeader;
                 else descriptor = fromName;
 
-                if (descriptor.version > CommitLogDescriptor.VERSION_22)
+                if (descriptor.version > CommitLogDescriptor.current_version)
                     throw new IllegalStateException("Unsupported commit log version: " + descriptor.version);
 
                 if (descriptor.compression != null) {
                     try
                     {
-                        CompressionParameters.createCompressor(descriptor.compression);
+                        CompressionParams.createCompressor(descriptor.compression);
                     }
                     catch (ConfigurationException e)
                     {

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
index c4728fd..0df20ce 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java

@@ -31,18 +31,19 @@
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.zip.CRC32;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Objects;
-import com.github.tjake.ICRC32;
 
 import org.apache.cassandra.config.ParameterizedClass;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.CRC32Factory;
 import org.json.simple.JSONValue;
 
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
+
 public class CommitLogDescriptor
 {
     private static final String SEPARATOR = "-";
@@ -57,12 +58,13 @@
     public static final int VERSION_20 = 3;
     public static final int VERSION_21 = 4;
     public static final int VERSION_22 = 5;
+    public static final int VERSION_30 = 6;
     /**
      * Increment this number if there is a changes in the commit log disc layout or MessagingVersion changes.
      * Note: make sure to handle {@link #getMessagingVersion()}
      */
     @VisibleForTesting
-    public static final int current_version = VERSION_22;
+    public static final int current_version = VERSION_30;
 
     final int version;
     public final long id;
@@ -82,12 +84,12 @@
 
     public static void writeHeader(ByteBuffer out, CommitLogDescriptor descriptor)
     {
-        ICRC32 crc = CRC32Factory.instance.create();
+        CRC32 crc = new CRC32();
         out.putInt(descriptor.version);
-        crc.updateInt(descriptor.version);
+        updateChecksumInt(crc, descriptor.version);
         out.putLong(descriptor.id);
-        crc.updateInt((int) (descriptor.id & 0xFFFFFFFFL));
-        crc.updateInt((int) (descriptor.id >>> 32));
+        updateChecksumInt(crc, (int) (descriptor.id & 0xFFFFFFFFL));
+        updateChecksumInt(crc, (int) (descriptor.id >>> 32));
         if (descriptor.version >= VERSION_22) {
             String parametersString = constructParametersString(descriptor);
             byte[] parametersBytes = parametersString.getBytes(StandardCharsets.UTF_8);
@@ -95,12 +97,12 @@
                 throw new ConfigurationException(String.format("Compression parameters too long, length %d cannot be above 65535.",
                                                                parametersBytes.length));
             out.putShort((short) parametersBytes.length);
-            crc.updateInt(parametersBytes.length);
+            updateChecksumInt(crc, parametersBytes.length);
             out.put(parametersBytes);
             crc.update(parametersBytes, 0, parametersBytes.length);
         } else
             assert descriptor.compression == null;
-        out.putInt(crc.getCrc());
+        out.putInt((int) crc.getValue());
     }
 
     private static String constructParametersString(CommitLogDescriptor descriptor)
@@ -134,16 +136,16 @@
 
     public static CommitLogDescriptor readHeader(DataInput input) throws IOException
     {
-        ICRC32 checkcrc = CRC32Factory.instance.create();
+        CRC32 checkcrc = new CRC32();
         int version = input.readInt();
-        checkcrc.updateInt(version);
+        updateChecksumInt(checkcrc, version);
         long id = input.readLong();
-        checkcrc.updateInt((int) (id & 0xFFFFFFFFL));
-        checkcrc.updateInt((int) (id >>> 32));
+        updateChecksumInt(checkcrc, (int) (id & 0xFFFFFFFFL));
+        updateChecksumInt(checkcrc, (int) (id >>> 32));
         int parametersLength = 0;
         if (version >= VERSION_22) {
             parametersLength = input.readShort() & 0xFFFF;
-            checkcrc.updateInt(parametersLength);
+            updateChecksumInt(checkcrc, parametersLength);
         }
         // This should always succeed as parametersLength cannot be too long even for a
         // corrupt segment file.
@@ -151,7 +153,7 @@
         input.readFully(parametersBytes);
         checkcrc.update(parametersBytes, 0, parametersBytes.length);
         int crc = input.readInt();
-        if (crc == checkcrc.getCrc())
+        if (crc == (int) checkcrc.getValue())
             return new CommitLogDescriptor(version, id,
                     parseCompression((Map<?, ?>) JSONValue.parse(new String(parametersBytes, StandardCharsets.UTF_8))));
         return null;
@@ -195,6 +197,8 @@
                 return MessagingService.VERSION_21;
             case VERSION_22:
                 return MessagingService.VERSION_22;
+            case VERSION_30:
+                return MessagingService.FORCE_3_0_PROTOCOL_VERSION ? MessagingService.VERSION_30 : MessagingService.VERSION_3014;
             default:
                 throw new IllegalStateException("Unknown commitlog version " + version);
         }

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
index 3cf4d0f..3ec4f15 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java

@@ -18,7 +18,6 @@
  */
 package org.apache.cassandra.db.commitlog;
 
-import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.EOFException;
 import java.io.File;
@@ -28,36 +27,41 @@
 import java.util.*;
 import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.CRC32;
 
 import com.google.common.base.Predicate;
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Multimap;
+import com.google.common.collect.Ordering;
+
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import com.github.tjake.ICRC32;
-
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.io.util.FileSegmentInputStream;
+import org.apache.cassandra.io.util.RebufferingInputStream;
+import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.io.compress.ICompressor;
-import org.apache.cassandra.io.util.ByteBufferDataInput;
-import org.apache.cassandra.io.util.FastByteArrayInputStream;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.CRC32Factory;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.WrappedRunnable;
 import org.cliffc.high_scale_lib.NonBlockingHashSet;
 
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
+
 public class CommitLogReplayer
 {
     static final String IGNORE_REPLAY_ERRORS_PROPERTY = "cassandra.commitlog.ignorereplayerrors";
@@ -69,16 +73,16 @@
     private final List<Future<?>> futures;
     private final Map<UUID, AtomicInteger> invalidMutations;
     private final AtomicInteger replayedCount;
-    private final Map<UUID, ReplayPosition.ReplayFilter> cfPersisted;
+    private final Map<UUID, IntervalSet<ReplayPosition>> cfPersisted;
     private final ReplayPosition globalPosition;
-    private final ICRC32 checksum;
+    private final CRC32 checksum;
     private byte[] buffer;
     private byte[] uncompressedBuffer;
 
     private final ReplayFilter replayFilter;
     private final CommitLogArchiver archiver;
 
-    CommitLogReplayer(CommitLog commitLog, ReplayPosition globalPosition, Map<UUID, ReplayPosition.ReplayFilter> cfPersisted, ReplayFilter replayFilter)
+    CommitLogReplayer(CommitLog commitLog, ReplayPosition globalPosition, Map<UUID, IntervalSet<ReplayPosition>> cfPersisted, ReplayFilter replayFilter)
     {
         this.keyspacesRecovered = new NonBlockingHashSet<Keyspace>();
         this.futures = new ArrayList<Future<?>>();
@@ -87,7 +91,7 @@
         this.invalidMutations = new HashMap<UUID, AtomicInteger>();
         // count the number of replayed mutation. We don't really care about atomicity, but we need it to be a reference.
         this.replayedCount = new AtomicInteger();
-        this.checksum = CRC32Factory.instance.create();
+        this.checksum = new CRC32();
         this.cfPersisted = cfPersisted;
         this.globalPosition = globalPosition;
         this.replayFilter = replayFilter;
@@ -96,13 +100,12 @@
 
     public static CommitLogReplayer construct(CommitLog commitLog)
     {
-        // compute per-CF and global replay positions
-        Map<UUID, ReplayPosition.ReplayFilter> cfPersisted = new HashMap<>();
+        // compute per-CF and global replay intervals
+        Map<UUID, IntervalSet<ReplayPosition>> cfPersisted = new HashMap<>();
         ReplayFilter replayFilter = ReplayFilter.create();
-        ReplayPosition globalPosition = null;
         for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
         {
-            // but, if we've truncted the cf in question, then we need to need to start replay after the truncation
+            // but, if we've truncated the cf in question, then we need to need to start replay after the truncation
             ReplayPosition truncatedAt = SystemKeyspace.getTruncatedPosition(cfs.metadata.cfId);
             if (truncatedAt != null)
             {
@@ -124,23 +127,102 @@
                 }
             }
 
-            ReplayPosition.ReplayFilter filter = new ReplayPosition.ReplayFilter(cfs.getSSTables(), truncatedAt);
-            if (!filter.isEmpty())
-                cfPersisted.put(cfs.metadata.cfId, filter);
-            else
-                globalPosition = ReplayPosition.NONE; // if we have no ranges for this CF, we must replay everything and filter
+            IntervalSet<ReplayPosition> filter = persistedIntervals(cfs.getLiveSSTables(), truncatedAt);
+            cfPersisted.put(cfs.metadata.cfId, filter);
         }
-        if (globalPosition == null)
-            globalPosition = ReplayPosition.firstNotCovered(cfPersisted.values());
+        ReplayPosition globalPosition = firstNotCovered(cfPersisted.values());
         logger.debug("Global replay position is {} from columnfamilies {}", globalPosition, FBUtilities.toString(cfPersisted));
         return new CommitLogReplayer(commitLog, globalPosition, cfPersisted, replayFilter);
     }
 
+    private static boolean shouldSkip(File file) throws IOException, ConfigurationException
+    {
+        CommitLogDescriptor desc = CommitLogDescriptor.fromFileName(file.getName());
+        if (desc.version < CommitLogDescriptor.VERSION_21)
+        {
+            return false;
+        }
+        try(ChannelProxy channel = new ChannelProxy(file);
+            RandomAccessReader reader = RandomAccessReader.open(channel))
+        {
+            CommitLogDescriptor.readHeader(reader);
+            int end = reader.readInt();
+            long filecrc = reader.readInt() & 0xffffffffL;
+            return end == 0 && filecrc == 0;
+        }
+    }
+
+    private static List<File> filterCommitLogFiles(File[] toFilter)
+    {
+        List<File> filtered = new ArrayList<>(toFilter.length);
+        for (File file: toFilter)
+        {
+            try
+            {
+                if (shouldSkip(file))
+                {
+                    logger.info("Skipping playback of empty log: {}", file.getName());
+                }
+                else
+                {
+                    filtered.add(file);
+                }
+            }
+            catch (Exception e)
+            {
+                // let recover deal with it
+                filtered.add(file);
+            }
+        }
+
+        return filtered;
+    }
+
     public void recover(File[] clogs) throws IOException
     {
-        int i;
-        for (i = 0; i < clogs.length; ++i)
-            recover(clogs[i], i + 1 == clogs.length);
+        List<File> filteredLogs = filterCommitLogFiles(clogs);
+
+        int i = 0;
+        for (File clog: filteredLogs)
+        {
+            i++;
+            recover(clog, i == filteredLogs.size());
+        }
+    }
+
+    /**
+     * A set of known safe-to-discard commit log replay positions, based on
+     * the range covered by on disk sstables and those prior to the most recent truncation record
+     */
+    public static IntervalSet<ReplayPosition> persistedIntervals(Iterable<SSTableReader> onDisk, ReplayPosition truncatedAt)
+    {
+        IntervalSet.Builder<ReplayPosition> builder = new IntervalSet.Builder<>();
+        for (SSTableReader reader : onDisk)
+            builder.addAll(reader.getSSTableMetadata().commitLogIntervals);
+
+        if (truncatedAt != null)
+            builder.add(ReplayPosition.NONE, truncatedAt);
+        return builder.build();
+    }
+
+    /**
+     * Find the earliest commit log position that is not covered by the known flushed ranges for some table.
+     *
+     * For efficiency this assumes that the first contiguously flushed interval we know of contains the moment that the
+     * given table was constructed* and hence we can start replay from the end of that interval.
+     *
+     * If such an interval is not known, we must replay from the beginning.
+     *
+     * * This is not true only until if the very first flush of a table stalled or failed, while the second or latter
+     *   succeeded. The chances of this happening are at most very low, and if the assumption does prove to be
+     *   incorrect during replay there is little chance that the affected deployment is in production.
+     */
+    public static ReplayPosition firstNotCovered(Collection<IntervalSet<ReplayPosition>> ranges)
+    {
+        return ranges.stream()
+                .map(intervals -> Iterables.getFirst(intervals.ends(), ReplayPosition.NONE)) 
+                .min(Ordering.natural())
+                .get(); // iteration is per known-CF, there must be at least one. 
     }
 
     public int blockForWrites()
@@ -154,8 +236,19 @@
 
         // flush replayed keyspaces
         futures.clear();
+        boolean flushingSystem = false;
         for (Keyspace keyspace : keyspacesRecovered)
+        {
+            if (keyspace.getName().equals(SystemKeyspace.NAME))
+                flushingSystem = true;
+
             futures.addAll(keyspace.flush());
+        }
+
+        // also flush batchlog incase of any MV updates
+        if (!flushingSystem)
+            futures.add(Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceFlush());
+
         FBUtilities.waitOnFutures(futures);
         return replayedCount.get();
     }
@@ -168,17 +261,17 @@
             return -1;
         }
         reader.seek(offset);
-        ICRC32 crc = CRC32Factory.instance.create();
-        crc.updateInt((int) (descriptor.id & 0xFFFFFFFFL));
-        crc.updateInt((int) (descriptor.id >>> 32));
-        crc.updateInt((int) reader.getPosition());
+        CRC32 crc = new CRC32();
+        updateChecksumInt(crc, (int) (descriptor.id & 0xFFFFFFFFL));
+        updateChecksumInt(crc, (int) (descriptor.id >>> 32));
+        updateChecksumInt(crc, (int) reader.getPosition());
         int end = reader.readInt();
         long filecrc = reader.readInt() & 0xffffffffL;
         if (crc.getValue() != filecrc)
         {
             if (end != 0 || filecrc != 0)
             {
-                handleReplayError(false,
+                handleReplayError(false, null,
                                   "Encountered bad header at position %d of commit log %s, with invalid CRC. " +
                                   "The end of segment marker should be zero.",
                                   offset, reader.getPath());
@@ -187,7 +280,8 @@
         }
         else if (end < offset || end > reader.length())
         {
-            handleReplayError(tolerateTruncation, "Encountered bad header at position %d of commit log %s, with bad position but valid CRC",
+            handleReplayError(tolerateTruncation, null,
+                            "Encountered bad header at position %d of commit log %s, with bad position but valid CRC",
                               offset, reader.getPath());
             return -1;
         }
@@ -196,7 +290,7 @@
 
     abstract static class ReplayFilter
     {
-        public abstract Iterable<ColumnFamily> filter(Mutation mutation);
+        public abstract Iterable<PartitionUpdate> filter(Mutation mutation);
 
         public abstract boolean includes(CFMetaData metadata);
 
@@ -228,9 +322,9 @@
 
     private static class AlwaysReplayFilter extends ReplayFilter
     {
-        public Iterable<ColumnFamily> filter(Mutation mutation)
+        public Iterable<PartitionUpdate> filter(Mutation mutation)
         {
-            return mutation.getColumnFamilies();
+            return mutation.getPartitionUpdates();
         }
 
         public boolean includes(CFMetaData metadata)
@@ -248,17 +342,17 @@
             this.toReplay = toReplay;
         }
 
-        public Iterable<ColumnFamily> filter(Mutation mutation)
+        public Iterable<PartitionUpdate> filter(Mutation mutation)
         {
             final Collection<String> cfNames = toReplay.get(mutation.getKeyspaceName());
             if (cfNames == null)
                 return Collections.emptySet();
 
-            return Iterables.filter(mutation.getColumnFamilies(), new Predicate<ColumnFamily>()
+            return Iterables.filter(mutation.getPartitionUpdates(), new Predicate<PartitionUpdate>()
             {
-                public boolean apply(ColumnFamily cf)
+                public boolean apply(PartitionUpdate upd)
                 {
-                    return cfNames.contains(cf.metadata().cfName);
+                    return cfNames.contains(upd.metadata().cfName);
                 }
             });
         }
@@ -277,16 +371,15 @@
      */
     private boolean shouldReplay(UUID cfId, ReplayPosition position)
     {
-        ReplayPosition.ReplayFilter filter = cfPersisted.get(cfId);
-        return filter == null || filter.shouldReplay(position);
+        return !cfPersisted.get(cfId).contains(position);
     }
 
     @SuppressWarnings("resource")
     public void recover(File file, boolean tolerateTruncation) throws IOException
     {
         CommitLogDescriptor desc = CommitLogDescriptor.fromFileName(file.getName());
-        RandomAccessReader reader = RandomAccessReader.open(new File(file.getAbsolutePath()));
-        try
+        try(ChannelProxy channel = new ChannelProxy(file);
+            RandomAccessReader reader = RandomAccessReader.open(channel))
         {
             if (desc.version < CommitLogDescriptor.VERSION_21)
             {
@@ -294,7 +387,7 @@
                     return;
                 if (globalPosition.segment == desc.id)
                     reader.seek(globalPosition.position);
-                replaySyncSection(reader, (int) reader.getPositionLimit(), desc, desc.fileName(), tolerateTruncation);
+                replaySyncSection(reader, (int) reader.length(), desc, desc.fileName(), tolerateTruncation);
                 return;
             }
 
@@ -308,12 +401,14 @@
                 desc = null;
             }
             if (desc == null) {
-                handleReplayError(false, "Could not read commit log descriptor in file %s", file);
+                // Presumably a failed CRC or other IO error occurred, which may be ok if it's the last segment
+                // where we tolerate (and expect) truncation
+                handleReplayError(tolerateTruncation, null, "Could not read commit log descriptor in file %s", file);
                 return;
             }
             if (segmentId != desc.id)
             {
-                handleReplayError(false, "Segment id mismatch (filename %d, descriptor %d) in file %s", segmentId, desc.id, file);
+                handleReplayError(false, null,"Segment id mismatch (filename %d, descriptor %d) in file %s", segmentId, desc.id, file);
                 // continue processing if ignored.
             }
 
@@ -325,11 +420,11 @@
             {
                 try
                 {
-                    compressor = CompressionParameters.createCompressor(desc.compression);
+                    compressor = CompressionParams.createCompressor(desc.compression);
                 }
                 catch (ConfigurationException e)
                 {
-                    handleReplayError(false, "Unknown compression: %s", e.getMessage());
+                    handleReplayError(false, null, "Unknown compression: %s", e.getMessage());
                     return;
                 }
             }
@@ -348,7 +443,8 @@
                 {
                     int uncompressedLength = reader.readInt();
                     replayEnd = replayPos + uncompressedLength;
-                } else
+                }
+                else
                 {
                     replayEnd = end;
                 }
@@ -382,14 +478,14 @@
                         if (uncompressedLength > uncompressedBuffer.length)
                             uncompressedBuffer = new byte[(int) (1.2 * uncompressedLength)];
                         compressedLength = compressor.uncompress(buffer, 0, compressedLength, uncompressedBuffer, 0);
-                        sectionReader = new ByteBufferDataInput(ByteBuffer.wrap(uncompressedBuffer), reader.getPath(), replayPos, 0);
+                        sectionReader = new FileSegmentInputStream(ByteBuffer.wrap(uncompressedBuffer), reader.getPath(), replayPos);
                         errorContext = "compressed section at " + start + " in " + errorContext;
                     }
                     catch (IOException | ArrayIndexOutOfBoundsException e)
                     {
-                        handleReplayError(tolerateErrorsInSection,
-                                          "Unexpected exception decompressing section at %d: %s",
-                                          start, e);
+                        handleReplayError(tolerateErrorsInSection, e,
+                                          "Unexpected exception decompressing section at %d",
+                                          start);
                         continue;
                     }
                 }
@@ -397,10 +493,6 @@
                 if (!replaySyncSection(sectionReader, replayEnd, desc, errorContext, tolerateErrorsInSection))
                     break;
             }
-        }
-        finally
-        {
-            FileUtils.closeQuietly(reader);
             logger.debug("Finished reading {}", file);
         }
     }
@@ -441,7 +533,7 @@
             {
                 // We rely on reading serialized size == 0 (LEGACY_END_OF_SEGMENT_MARKER) to identify the end
                 // of a segment, which happens naturally due to the 0 padding of the empty segment on creation.
-                // However, with 2.1 era commitlogs it's possible that the last mutation ended less than 4 bytes 
+                // However, it's possible with 2.1 era commitlogs that the last mutation ended less than 4 bytes 
                 // from the end of the file, which means that we'll be unable to read an a full int and instead 
                 // read an EOF here
                 if(end - reader.getFilePointer() < 4)
@@ -464,7 +556,7 @@
                 // This prevents CRC by being fooled by special-case garbage in the file; see CASSANDRA-2128
                 if (serializedSize < 10)
                 {
-                    handleReplayError(tolerateErrors,
+                    handleReplayError(tolerateErrors, null,
                                       "Invalid mutation size %d at %d in %s",
                                       serializedSize, mutationStart, errorContext);
                     return false;
@@ -479,11 +571,11 @@
                 if (desc.version < CommitLogDescriptor.VERSION_20)
                     checksum.update(serializedSize);
                 else
-                    checksum.updateInt(serializedSize);
+                    updateChecksumInt(checksum, serializedSize);
 
                 if (checksum.getValue() != claimedSizeChecksum)
                 {
-                    handleReplayError(tolerateErrors,
+                    handleReplayError(tolerateErrors, null,
                                       "Mutation size checksum failure at %d in %s",
                                       mutationStart, errorContext);
                     return false;
@@ -500,7 +592,7 @@
             }
             catch (EOFException eof)
             {
-                handleReplayError(tolerateErrors,
+                handleReplayError(tolerateErrors, eof,
                                   "Unexpected end of segment",
                                   mutationStart, errorContext);
                 return false; // last CL entry didn't get completely written. that's ok.
@@ -509,7 +601,7 @@
             checksum.update(buffer, 0, serializedSize);
             if (claimedCRC32 != checksum.getValue())
             {
-                handleReplayError(tolerateErrors,
+                handleReplayError(tolerateErrors, null,
                                   "Mutation checksum failure at %d in %s",
                                   mutationStart, errorContext);
                 continue;
@@ -527,15 +619,14 @@
     {
 
         final Mutation mutation;
-        try (FastByteArrayInputStream bufIn = new FastByteArrayInputStream(inputBuffer, 0, size))
+        try (RebufferingInputStream bufIn = new DataInputBuffer(inputBuffer, 0, size))
         {
-            mutation = Mutation.serializer.deserialize(new DataInputStream(bufIn),
+            mutation = Mutation.serializer.deserialize(bufIn,
                                                        desc.getMessagingVersion(),
-                                                       ColumnSerializer.Flag.LOCAL);
+                                                       SerializationHelper.Flag.LOCAL);
             // doublecheck that what we read is [still] valid for the current schema
-            for (ColumnFamily cf : mutation.getColumnFamilies())
-                for (Cell cell : cf)
-                    cf.getComparator().validate(cell.name());
+            for (PartitionUpdate upd : mutation.getPartitionUpdates())
+                upd.validate();
         }
         catch (UnknownColumnFamilyException ex)
         {
@@ -562,21 +653,20 @@
             }
 
             // Checksum passed so this error can't be permissible.
-            handleReplayError(false,
+            handleReplayError(false, t,
                               "Unexpected error deserializing mutation; saved to %s.  " +
-                              "This may be caused by replaying a mutation against a table with the same name but incompatible schema.  " +
-                              "Exception follows: %s",
+                              "This may be caused by replaying a mutation against a table with the same name but incompatible schema.",
                               f.getAbsolutePath(),
                               t);
             return;
         }
 
         if (logger.isTraceEnabled())
-            logger.trace("replaying mutation for {}.{}: {}", mutation.getKeyspaceName(), ByteBufferUtil.bytesToHex(mutation.key()), "{" + StringUtils.join(mutation.getColumnFamilies().iterator(), ", ") + "}");
+            logger.trace("replaying mutation for {}.{}: {}", mutation.getKeyspaceName(), mutation.key(), "{" + StringUtils.join(mutation.getPartitionUpdates().iterator(), ", ") + "}");
 
         Runnable runnable = new WrappedRunnable()
         {
-            public void runMayThrow() throws IOException
+            public void runMayThrow()
             {
                 if (Schema.instance.getKSMetaData(mutation.getKeyspaceName()) == null)
                     return;
@@ -591,23 +681,26 @@
                 // or c) are part of a cf that was dropped.
                 // Keep in mind that the cf.name() is suspect. do every thing based on the cfid instead.
                 Mutation newMutation = null;
-                for (ColumnFamily columnFamily : replayFilter.filter(mutation))
+                for (PartitionUpdate update : replayFilter.filter(mutation))
                 {
-                    if (Schema.instance.getCF(columnFamily.id()) == null)
+                    if (Schema.instance.getCF(update.metadata().cfId) == null)
                         continue; // dropped
 
-                    if (shouldReplay(columnFamily.id(), new ReplayPosition(desc.id, entryLocation)))
+                    // replay if current segment is newer than last flushed one or,
+                    // if it is the last known segment, if we are after the replay position
+                    if (shouldReplay(update.metadata().cfId, new ReplayPosition(desc.id, entryLocation)))
                     {
                         if (newMutation == null)
                             newMutation = new Mutation(mutation.getKeyspaceName(), mutation.key());
-                        newMutation.add(columnFamily);
+                        newMutation.add(update);
                         replayedCount.incrementAndGet();
                     }
                 }
                 if (newMutation != null)
                 {
                     assert !newMutation.isEmpty();
-                    Keyspace.open(newMutation.getKeyspaceName()).apply(newMutation, false);
+
+                    Keyspace.open(newMutation.getKeyspaceName()).apply(newMutation, false, true, false);
                     keyspacesRecovered.add(keyspace);
                 }
             }
@@ -624,18 +717,18 @@
     {
         long restoreTarget = archiver.restorePointInTime;
 
-        for (ColumnFamily families : fm.getColumnFamilies())
+        for (PartitionUpdate upd : fm.getPartitionUpdates())
         {
-            if (archiver.precision.toMillis(families.maxTimestamp()) > restoreTarget)
+            if (archiver.precision.toMillis(upd.maxTimestamp()) > restoreTarget)
                 return true;
         }
         return false;
     }
 
-    static void handleReplayError(boolean permissible, String message, Object... messageArgs) throws IOException
+    static void handleReplayError(boolean permissible, Throwable t, String message, Object... messageArgs) throws IOException
     {
         String msg = String.format(message, messageArgs);
-        IOException e = new CommitLogReplayException(msg);
+        IOException e = new CommitLogReplayException(msg, t);
         if (permissible)
             logger.error("Ignoring commit log replay error likely due to incomplete flush to disk", e);
         else if (Boolean.getBoolean(IGNORE_REPLAY_ERRORS_PROPERTY))

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java
index ba28f3e..b803d88 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegment.java

@@ -32,11 +32,10 @@
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.CRC32;
 
 import com.codahale.metrics.Timer;
-import com.github.tjake.ICRC32;
 
-import org.apache.cassandra.utils.CRC32Factory;
 import org.cliffc.high_scale_lib.NonBlockingHashMap;
 
 import org.slf4j.Logger;
@@ -45,14 +44,17 @@
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.CLibrary;
+import org.apache.cassandra.utils.NativeLibrary;
+import org.apache.cassandra.utils.IntegerInterval;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.concurrent.WaitQueue;
 
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
+
 /*
  * A single commit log file on disk. Manages creation of the file and writing mutations to disk,
  * as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
@@ -92,6 +94,12 @@
     // sync marker in a segment will be zeroed out, or point to a position too close to the EOF to fit a marker.
     private volatile int lastSyncedOffset;
 
+    /**
+     * Everything before this offset has it's markers written into the {@link #buffer}, but has not necessarily
+     * been flushed to disk. This value should be greater than or equal to {@link #lastSyncedOffset}.
+     */
+    private volatile int lastMarkerOffset;
+
     // The end position of the buffer. Initially set to its capacity and updated to point to the last written position
     // as the segment is being closed.
     // No need to be volatile as writes are protected by appendOrder barrier.
@@ -100,11 +108,11 @@
     // a signal for writers to wait on to confirm the log message they provided has been written to disk
     private final WaitQueue syncComplete = new WaitQueue();
 
-    // a map of Cf->dirty position; this is used to permit marking Cfs clean whilst the log is still in use
-    private final NonBlockingHashMap<UUID, AtomicInteger> cfDirty = new NonBlockingHashMap<>(1024);
+    // a map of Cf->dirty interval in this segment; if interval is not covered by the clean set, the log contains unflushed data
+    private final NonBlockingHashMap<UUID, IntegerInterval> cfDirty = new NonBlockingHashMap<>(1024);
 
-    // a map of Cf->clean position; this is used to permit marking Cfs clean whilst the log is still in use
-    private final ConcurrentHashMap<UUID, AtomicInteger> cfClean = new ConcurrentHashMap<>();
+    // a map of Cf->clean intervals; separate map from above to permit marking Cfs clean whilst the log is still in use
+    private final ConcurrentHashMap<UUID, IntegerInterval.Set> cfClean = new ConcurrentHashMap<>();
 
     public final long id;
 
@@ -117,12 +125,23 @@
     final CommitLog commitLog;
     public final CommitLogDescriptor descriptor;
 
-    static CommitLogSegment createSegment(CommitLog commitLog)
+    static CommitLogSegment createSegment(CommitLog commitLog, Runnable onClose)
     {
-        return commitLog.configuration.useCompression() ? new CompressedSegment(commitLog)
+        return commitLog.configuration.useCompression() ? new CompressedSegment(commitLog, onClose)
                                                         : new MemoryMappedSegment(commitLog);
     }
 
+    /**
+     * Checks if the segments use a buffer pool.
+     *
+     * @param commitLog the commit log
+     * @return <code>true</code> if the segments use a buffer pool, <code>false</code> otherwise.
+     */
+    static boolean usesBufferPool(CommitLog commitLog)
+    {
+        return commitLog.configuration.useCompression();
+    }
+
     static long getNextId()
     {
         return idBase + nextId.getAndIncrement();
@@ -143,18 +162,19 @@
         try
         {
             channel = FileChannel.open(logFile.toPath(), StandardOpenOption.WRITE, StandardOpenOption.READ, StandardOpenOption.CREATE);
-            fd = CLibrary.getfd(channel);
+            fd = NativeLibrary.getfd(channel);
         }
         catch (IOException e)
         {
             throw new FSWriteError(e, logFile);
         }
-        
+
         buffer = createBuffer(commitLog);
         // write the header
         CommitLogDescriptor.writeHeader(buffer, descriptor);
         endOfBuffer = buffer.capacity();
-        lastSyncedOffset = buffer.position();
+
+        lastSyncedOffset = lastMarkerOffset = buffer.position();
         allocatePosition.set(lastSyncedOffset + SYNC_MARKER_SIZE);
     }
 
@@ -219,7 +239,7 @@
     // ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
     void discardUnusedTail()
     {
-        // We guard this with the OpOrdering instead of synchronised due to potential dead-lock with CLSM.advanceAllocatingFrom()
+        // We guard this with the OpOrdering instead of synchronised due to potential dead-lock with ACLSM.advanceAllocatingFrom()
         // Ensures endOfBuffer update is reflected in the buffer end position picked up by sync().
         // This actually isn't strictly necessary, as currently all calls to discardUnusedTail are executed either by the thread
         // running sync or within a mutation already protected by this OpOrdering, but to prevent future potential mistakes,
@@ -258,62 +278,87 @@
     }
 
     /**
-     * Forces a disk flush for this segment file.
+     * Update the chained markers in the commit log buffer and possibly force a disk flush for this segment file.
+     *
+     * @param flush true if the segment should flush to disk; else, false for just updating the chained markers.
      */
-    synchronized void sync()
+    synchronized void sync(boolean flush)
     {
-        boolean close = false;
+        assert lastMarkerOffset >= lastSyncedOffset : String.format("commit log segment positions are incorrect: last marked = %d, last synced = %d",
+                                                                    lastMarkerOffset, lastSyncedOffset);
         // check we have more work to do
-        if (allocatePosition.get() <= lastSyncedOffset + SYNC_MARKER_SIZE)
+        final boolean needToMarkData = allocatePosition.get() > lastMarkerOffset + SYNC_MARKER_SIZE;
+        final boolean hasDataToFlush = lastSyncedOffset != lastMarkerOffset;
+        if (!(needToMarkData || hasDataToFlush))
             return;
         // Note: Even if the very first allocation of this sync section failed, we still want to enter this
         // to ensure the segment is closed. As allocatePosition is set to 1 beyond the capacity of the buffer,
         // this will always be entered when a mutation allocation has been attempted after the marker allocation
-        // succeeded in the previous sync. 
+        // succeeded in the previous sync.
         assert buffer != null;  // Only close once.
 
-        int startMarker = lastSyncedOffset;
-        // Allocate a new sync marker; this is both necessary in itself, but also serves to demarcate
-        // the point at which we can safely consider records to have been completely written to.
-        int nextMarker = allocate(SYNC_MARKER_SIZE);
-        if (nextMarker < 0)
+        boolean close = false;
+        int startMarker = lastMarkerOffset;
+        int nextMarker, sectionEnd;
+        if (needToMarkData)
         {
-            // Ensure no more of this CLS is writeable, and mark ourselves for closing.
-            discardUnusedTail();
-            close = true;
+            // Allocate a new sync marker; this is both necessary in itself, but also serves to demarcate
+            // the point at which we can safely consider records to have been completely written to.
+            nextMarker = allocate(SYNC_MARKER_SIZE);
+            if (nextMarker < 0)
+            {
+                // Ensure no more of this CLS is writeable, and mark ourselves for closing.
+                discardUnusedTail();
+                close = true;
 
-            // We use the buffer size as the synced position after a close instead of the end of the actual data
-            // to make sure we only close the buffer once.
-            // The endOfBuffer position may be incorrect at this point (to be written by another stalled thread).
-            nextMarker = buffer.capacity();
+                // We use the buffer size as the synced position after a close instead of the end of the actual data
+                // to make sure we only close the buffer once.
+                // The endOfBuffer position may be incorrect at this point (to be written by another stalled thread).
+                nextMarker = buffer.capacity();
+            }
+            // Wait for mutations to complete as well as endOfBuffer to have been written.
+            waitForModifications();
+            sectionEnd = close ? endOfBuffer : nextMarker;
+
+            // Possibly perform compression or encryption and update the chained markers
+            write(startMarker, sectionEnd);
+            lastMarkerOffset = sectionEnd;
+        }
+        else
+        {
+            // note: we don't need to waitForModifications() as, once we get to this block, we are only doing the flush
+            // and any mutations have already been fully written into the segment (as we wait for it in the previous block).
+            nextMarker = lastMarkerOffset;
+            sectionEnd = nextMarker;
         }
 
-        // Wait for mutations to complete as well as endOfBuffer to have been written.
-        waitForModifications();
-        int sectionEnd = close ? endOfBuffer : nextMarker;
 
-        // Perform compression, writing to file and flush.
-        write(startMarker, sectionEnd);
+        if (flush || close)
+        {
+            flush(startMarker, sectionEnd);
+            lastSyncedOffset = lastMarkerOffset = nextMarker;
 
-        // Signal the sync as complete.
-        lastSyncedOffset = nextMarker;
-        if (close)
-            internalClose();
-        syncComplete.signalAll();
+            if (close)
+                internalClose();
+
+            syncComplete.signalAll();
+        }
     }
 
-    protected void writeSyncMarker(ByteBuffer buffer, int offset, int filePos, int nextMarker)
+    protected static void writeSyncMarker(long id, ByteBuffer buffer, int offset, int filePos, int nextMarker)
     {
-        ICRC32 crc = CRC32Factory.instance.create();
-        crc.updateInt((int) (id & 0xFFFFFFFFL));
-        crc.updateInt((int) (id >>> 32));
-        crc.updateInt(filePos);
+        CRC32 crc = new CRC32();
+        updateChecksumInt(crc, (int) (id & 0xFFFFFFFFL));
+        updateChecksumInt(crc, (int) (id >>> 32));
+        updateChecksumInt(crc, filePos);
         buffer.putInt(offset, nextMarker);
-        buffer.putInt(offset + 4, crc.getCrc());
+        buffer.putInt(offset + 4, (int) crc.getValue());
     }
 
     abstract void write(int lastSyncedOffset, int nextMarker);
 
+    abstract void flush(int startMarker, int nextMarker);
+
     public boolean isStillAllocating()
     {
         return allocatePosition.get() < endOfBuffer;
@@ -391,7 +436,7 @@
     synchronized void close()
     {
         discardUnusedTail();
-        sync();
+        sync(true);
         assert buffer == null;
     }
 
@@ -411,17 +456,23 @@
         }
     }
 
+    public static<K> void coverInMap(ConcurrentMap<K, IntegerInterval> map, K key, int value)
+    {
+        IntegerInterval i = map.get(key);
+        if (i == null)
+        {
+            i = map.putIfAbsent(key, new IntegerInterval(value, value));
+            if (i == null)
+                // success
+                return;
+        }
+        i.expandToCover(value);
+    }
+
     void markDirty(Mutation mutation, int allocatedPosition)
     {
-        for (ColumnFamily columnFamily : mutation.getColumnFamilies())
-        {
-            // check for deleted CFS
-            CFMetaData cfm = columnFamily.metadata();
-            if (cfm.isPurged())
-                logger.error("Attempted to write commit log entry for unrecognized table: {}", columnFamily.id());
-            else
-                ensureAtleast(cfDirty, cfm.cfId, allocatedPosition);
-        }
+        for (PartitionUpdate update : mutation.getPartitionUpdates())
+            coverInMap(cfDirty, update.metadata().cfId, allocatedPosition);
     }
 
     /**
@@ -432,55 +483,32 @@
      * @param cfId    the column family ID that is now clean
      * @param context the optional clean offset
      */
-    public synchronized void markClean(UUID cfId, ReplayPosition context)
+    public synchronized void markClean(UUID cfId, ReplayPosition startPosition, ReplayPosition endPosition)
     {
+        if (startPosition.segment > id || endPosition.segment < id)
+            return;
         if (!cfDirty.containsKey(cfId))
             return;
-        if (context.segment == id)
-            markClean(cfId, context.position);
-        else if (context.segment > id)
-            markClean(cfId, Integer.MAX_VALUE);
-    }
-
-    private void markClean(UUID cfId, int position)
-    {
-        ensureAtleast(cfClean, cfId, position);
+        int start = startPosition.segment == id ? startPosition.position : 0;
+        int end = endPosition.segment == id ? endPosition.position : Integer.MAX_VALUE;
+        cfClean.computeIfAbsent(cfId, k -> new IntegerInterval.Set()).add(start, end);
         removeCleanFromDirty();
     }
 
-    private static void ensureAtleast(ConcurrentMap<UUID, AtomicInteger> map, UUID cfId, int value)
-    {
-        AtomicInteger i = map.get(cfId);
-        if (i == null)
-        {
-            AtomicInteger i2 = map.putIfAbsent(cfId, i = new AtomicInteger());
-            if (i2 != null)
-                i = i2;
-        }
-        while (true)
-        {
-            int cur = i.get();
-            if (cur > value)
-                break;
-            if (i.compareAndSet(cur, value))
-                break;
-        }
-    }
-
     private void removeCleanFromDirty()
     {
         // if we're still allocating from this segment, don't touch anything since it can't be done thread-safely
         if (isStillAllocating())
             return;
 
-        Iterator<Map.Entry<UUID, AtomicInteger>> iter = cfClean.entrySet().iterator();
+        Iterator<Map.Entry<UUID, IntegerInterval.Set>> iter = cfClean.entrySet().iterator();
         while (iter.hasNext())
         {
-            Map.Entry<UUID, AtomicInteger> clean = iter.next();
+            Map.Entry<UUID, IntegerInterval.Set> clean = iter.next();
             UUID cfId = clean.getKey();
-            AtomicInteger cleanPos = clean.getValue();
-            AtomicInteger dirtyPos = cfDirty.get(cfId);
-            if (dirtyPos != null && dirtyPos.intValue() <= cleanPos.intValue())
+            IntegerInterval.Set cleanSet = clean.getValue();
+            IntegerInterval dirtyInterval = cfDirty.get(cfId);
+            if (dirtyInterval != null && cleanSet.covers(dirtyInterval))
             {
                 cfDirty.remove(cfId);
                 iter.remove();
@@ -497,12 +525,12 @@
             return cfDirty.keySet();
 
         List<UUID> r = new ArrayList<>(cfDirty.size());
-        for (Map.Entry<UUID, AtomicInteger> dirty : cfDirty.entrySet())
+        for (Map.Entry<UUID, IntegerInterval> dirty : cfDirty.entrySet())
         {
             UUID cfId = dirty.getKey();
-            AtomicInteger dirtyPos = dirty.getValue();
-            AtomicInteger cleanPos = cfClean.get(cfId);
-            if (cleanPos == null || cleanPos.intValue() < dirtyPos.intValue())
+            IntegerInterval dirtyInterval = dirty.getValue();
+            IntegerInterval.Set cleanSet = cfClean.get(cfId);
+            if (cleanSet == null || !cleanSet.covers(dirtyInterval))
                 r.add(dirty.getKey());
         }
         return r;
@@ -540,7 +568,10 @@
         for (UUID cfId : getDirtyCFIDs())
         {
             CFMetaData m = Schema.instance.getCFMetaData(cfId);
-            sb.append(m == null ? "<deleted>" : m.cfName).append(" (").append(cfId).append("), ");
+            sb.append(m == null ? "<deleted>" : m.cfName).append(" (").append(cfId)
+              .append(", dirty: ").append(cfDirty.get(cfId))
+              .append(", clean: ").append(cfClean.get(cfId))
+              .append("), ");
         }
         return sb.toString();
     }

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java
index 8670fd7..7651d1c 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentManager.java

@@ -21,11 +21,9 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentLinkedQueue;
@@ -34,25 +32,27 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.Iterables;
-import com.google.common.util.concurrent.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.Pair;
-import org.apache.cassandra.utils.concurrent.WaitQueue;
 import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.WrappedRunnable;
+import org.apache.cassandra.utils.concurrent.WaitQueue;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterables;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.Runnables;
+import com.google.common.util.concurrent.Uninterruptibles;
 
 /**
  * Performs eager-creation of commit log segments in a background thread. All the
@@ -108,19 +108,23 @@
         {
             public void runMayThrow() throws Exception
             {
-                while (run)
+                while (true)
                 {
                     try
                     {
                         Runnable task = segmentManagementTasks.poll();
                         if (task == null)
                         {
-                            // if we have no more work to do, check if we should create a new segment
-                            if (availableSegments.isEmpty() && (activeSegments.isEmpty() || createReserveSegments))
+                            // if we have no more work to do, check if we were requested to exit before starting background tasks
+                            if (!run)
+                                return;
+
+                            // check if we should create a new segment
+                            if (!atSegmentLimit() && availableSegments.isEmpty() && (activeSegments.isEmpty() || createReserveSegments))
                             {
                                 logger.trace("No segments in reserve; creating a fresh one");
                                 // TODO : some error handling in case we fail to create a new segment
-                                availableSegments.add(CommitLogSegment.createSegment(commitLog));
+                                availableSegments.add(CommitLogSegment.createSegment(commitLog, () -> wakeManager()));
                                 hasAvailableSegments.signalAll();
                             }
 
@@ -165,11 +169,17 @@
                     }
                 }
             }
+
+            private boolean atSegmentLimit()
+            {
+                return CommitLogSegment.usesBufferPool(commitLog) && CompressedSegment.hasReachedPoolLimit();
+            }
+
         };
 
         run = true;
 
-        managerThread = new Thread(runnable, "COMMIT-LOG-ALLOCATOR");
+        managerThread = new Thread(NamedThreadFactory.threadLocalDeallocator(runnable), "COMMIT-LOG-ALLOCATOR");
         managerThread.start();
     }
 
@@ -295,15 +305,7 @@
 
         // make sure the writes have materialized inside of the memtables by waiting for all outstanding writes
         // on the relevant keyspaces to complete
-        Set<Keyspace> keyspaces = new HashSet<>();
-        for (UUID cfId : last.getDirtyCFIDs())
-        {
-            ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(cfId);
-            if (cfs != null)
-                keyspaces.add(cfs.keyspace);
-        }
-        for (Keyspace keyspace : keyspaces)
-            keyspace.writeOrder.awaitNewBarrier();
+        Keyspace.writeOrder.awaitNewBarrier();
 
         // flush and wait for all CFs that are dirty in segments up-to and including 'last'
         Future<?> future = flushDataFrom(segmentsToRecycle, true);
@@ -313,7 +315,7 @@
 
             for (CommitLogSegment segment : activeSegments)
                 for (UUID cfId : droppedCfs)
-                    segment.markClean(cfId, segment.getContext());
+                    segment.markClean(cfId, ReplayPosition.NONE, segment.getContext());
 
             // now recycle segments that are unused, as we may not have triggered a discardCompletedSegments()
             // if the previous active segment was the only one to recycle (since an active segment isn't
@@ -454,7 +456,7 @@
                     // even though we remove the schema entry before a final flush when dropping a CF,
                     // it's still possible for a writer to race and finish his append after the flush.
                     logger.trace("Marking clean CF {} that doesn't exist anymore", dirtyCFId);
-                    segment.markClean(dirtyCFId, segment.getContext());
+                    segment.markClean(dirtyCFId, ReplayPosition.NONE, segment.getContext());
                 }
                 else if (!flushes.containsKey(dirtyCFId))
                 {
@@ -567,5 +569,6 @@
     {
         return Collections.unmodifiableCollection(activeSegments);
     }
+
 }
 

diff --git a/src/java/org/apache/cassandra/db/commitlog/CompressedSegment.java b/src/java/org/apache/cassandra/db/commitlog/CompressedSegment.java
index 219709b..8e05112 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CompressedSegment.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CompressedSegment.java

@@ -21,6 +21,7 @@
 import java.nio.ByteBuffer;
 import java.util.Queue;
 import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSWriteError;
@@ -35,7 +36,7 @@
  */
 public class CompressedSegment extends CommitLogSegment
 {
-    static private final ThreadLocal<ByteBuffer> compressedBufferHolder = new ThreadLocal<ByteBuffer>() {
+    private static final ThreadLocal<ByteBuffer> compressedBufferHolder = new ThreadLocal<ByteBuffer>() {
         protected ByteBuffer initialValue()
         {
             return ByteBuffer.allocate(0);
@@ -44,6 +45,12 @@
     static Queue<ByteBuffer> bufferPool = new ConcurrentLinkedQueue<>();
 
     /**
+     * The number of buffers in use
+     */
+    private static AtomicInteger usedBuffers = new AtomicInteger(0);
+
+
+    /**
      * Maximum number of buffers in the compression pool. The default value is 3, it should not be set lower than that
      * (one segment in compression, one written to, one in reserve); delays in compression may cause the log to use
      * more, depending on how soon the sync policy stops all writing threads.
@@ -52,16 +59,18 @@
 
     static final int COMPRESSED_MARKER_SIZE = SYNC_MARKER_SIZE + 4;
     final ICompressor compressor;
+    final Runnable onClose;
 
     volatile long lastWrittenPos = 0;
 
     /**
      * Constructs a new segment file.
      */
-    CompressedSegment(CommitLog commitLog)
+    CompressedSegment(CommitLog commitLog, Runnable onClose)
     {
         super(commitLog);
         this.compressor = commitLog.configuration.getCompressor();
+        this.onClose = onClose;
         try
         {
             channel.write((ByteBuffer) buffer.duplicate().flip());
@@ -80,6 +89,7 @@
 
     ByteBuffer createBuffer(CommitLog commitLog)
     {
+        usedBuffers.incrementAndGet();
         ByteBuffer buf = bufferPool.poll();
         if (buf == null)
         {
@@ -124,11 +134,23 @@
 
             // Only one thread can be here at a given time.
             // Protected by synchronization on CommitLogSegment.sync().
-            writeSyncMarker(compressedBuffer, 0, (int) channel.position(), (int) channel.position() + compressedBuffer.remaining());
+            writeSyncMarker(id, compressedBuffer, 0, (int) channel.position(), (int) channel.position() + compressedBuffer.remaining());
             commitLog.allocator.addSize(compressedBuffer.limit());
             channel.write(compressedBuffer);
             assert channel.position() - lastWrittenPos == compressedBuffer.limit();
             lastWrittenPos = channel.position();
+        }
+        catch (Exception e)
+        {
+            throw new FSWriteError(e, getPath());
+        }
+    }
+
+    @Override
+    protected void flush(int startMarker, int nextMarker)
+    {
+        try
+        {
             SyncUtil.force(channel, true);
         }
         catch (Exception e)
@@ -140,12 +162,29 @@
     @Override
     protected void internalClose()
     {
-        if (bufferPool.size() < MAX_BUFFERPOOL_SIZE)
-            bufferPool.add(buffer);
-        else
-            FileUtils.clean(buffer);
+        usedBuffers.decrementAndGet();
+        try {
+            if (bufferPool.size() < MAX_BUFFERPOOL_SIZE)
+                bufferPool.add(buffer);
+            else
+                FileUtils.clean(buffer);
+            super.internalClose();
+        }
+        finally
+        {
+            onClose.run();
+        }
+    }
 
-        super.internalClose();
+    /**
+     * Checks if the number of buffers in use is greater or equals to the maximum number of buffers allowed in the pool.
+     *
+     * @return <code>true</code> if the number of buffers in use is greater or equals to the maximum number of buffers
+     * allowed in the pool, <code>false</code> otherwise.
+     */
+    static boolean hasReachedPoolLimit()
+    {
+        return usedBuffers.get() >= MAX_BUFFERPOOL_SIZE;
     }
 
     static void shutdown()

diff --git a/src/java/org/apache/cassandra/db/commitlog/IntervalSet.java b/src/java/org/apache/cassandra/db/commitlog/IntervalSet.java
new file mode 100644
index 0000000..bd0ea22
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/commitlog/IntervalSet.java

@@ -0,0 +1,192 @@
+package org.apache.cassandra.db.commitlog;
+
+import java.io.IOException;
+import java.util.*;
+
+import com.google.common.collect.ImmutableSortedMap;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * An immutable set of closed intervals, stored in normalized form (i.e. where overlapping intervals are converted
+ * to a single interval covering both).
+ *
+ * The set is stored as a sorted map from interval starts to the corresponding end. The map satisfies
+ *   curr().getKey() <= curr().getValue() < next().getKey()
+ */
+public class IntervalSet<T extends Comparable<T>>
+{
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    private static final IntervalSet EMPTY = new IntervalSet(ImmutableSortedMap.of());
+
+    final private NavigableMap<T, T> ranges;
+
+    private IntervalSet(ImmutableSortedMap<T, T> ranges)
+    {
+        this.ranges = ranges;
+    }
+
+    /**
+     * Construct new set containing the interval with the given start and end position.
+     */
+    public IntervalSet(T start, T end)
+    {
+        this(ImmutableSortedMap.of(start, end));
+    }
+
+    @SuppressWarnings("unchecked")
+    public static <T extends Comparable<T>> IntervalSet<T> empty()
+    {
+        return (IntervalSet<T>) EMPTY;
+    }
+
+    public boolean contains(T position)
+    {
+        // closed (i.e. inclusive) intervals
+        Map.Entry<T, T> range = ranges.floorEntry(position);
+        return range != null && position.compareTo(range.getValue()) <= 0;
+    }
+
+    public boolean isEmpty()
+    {
+        return ranges.isEmpty();
+    }
+
+    public Optional<T> lowerBound()
+    {
+        return isEmpty() ? Optional.empty() : Optional.of(ranges.firstKey());
+    }
+
+    public Optional<T> upperBound()
+    {
+        return isEmpty() ? Optional.empty() : Optional.of(ranges.lastEntry().getValue());
+    }
+
+    public Collection<T> starts()
+    {
+        return ranges.keySet();
+    }
+
+    public Collection<T> ends()
+    {
+        return ranges.values();
+    }
+
+    public String toString()
+    {
+        return ranges.toString();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return ranges.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj)
+    {
+        return obj instanceof IntervalSet && ranges.equals(((IntervalSet<?>) obj).ranges);
+    }
+
+    public static final <T extends Comparable<T>> ISerializer<IntervalSet<T>> serializer(ISerializer<T> pointSerializer)
+    {
+        return new ISerializer<IntervalSet<T>>()
+        {
+            public void serialize(IntervalSet<T> intervals, DataOutputPlus out) throws IOException
+            {
+                out.writeInt(intervals.ranges.size());
+                for (Map.Entry<T, T> en : intervals.ranges.entrySet())
+                {
+                    pointSerializer.serialize(en.getKey(), out);
+                    pointSerializer.serialize(en.getValue(), out);
+                }
+            }
+    
+            public IntervalSet<T> deserialize(DataInputPlus in) throws IOException
+            {
+                int count = in.readInt();
+                NavigableMap<T, T> ranges = new TreeMap<>();
+                for (int i = 0; i < count; ++i)
+                    ranges.put(pointSerializer.deserialize(in), pointSerializer.deserialize(in));
+                return new IntervalSet<T>(ImmutableSortedMap.copyOfSorted(ranges));
+            }
+    
+            public long serializedSize(IntervalSet<T> intervals)
+            {
+                long size = TypeSizes.sizeof(intervals.ranges.size());
+                for (Map.Entry<T, T> en : intervals.ranges.entrySet())
+                {
+                    size += pointSerializer.serializedSize(en.getKey());
+                    size += pointSerializer.serializedSize(en.getValue());
+                }
+                return size;
+            }
+        };
+    };
+
+    /**
+     * Builder of interval sets, applying the necessary normalization while adding ranges.
+     *
+     * Data is stored as above, as a sorted map from interval starts to the corresponding end, which satisfies
+     *   curr().getKey() <= curr().getValue() < next().getKey()
+     */
+    static public class Builder<T extends Comparable<T>>
+    {
+        final NavigableMap<T, T> ranges;
+
+        public Builder()
+        {
+            this.ranges = new TreeMap<>();
+        }
+
+        public Builder(T start, T end)
+        {
+            this();
+            assert start.compareTo(end) <= 0;
+            ranges.put(start, end);
+        }
+
+        /**
+         * Add an interval to the set and perform normalization.
+         */
+        public void add(T start, T end)
+        {
+            assert start.compareTo(end) <= 0;
+            // extend ourselves to cover any ranges we overlap
+            // record directly preceding our end may extend past us, so take the max of our end and its
+            Map.Entry<T, T> extend = ranges.floorEntry(end);
+            if (extend != null && extend.getValue().compareTo(end) > 0)
+                end = extend.getValue();
+
+            // record directly preceding our start may extend into us; if it does, we take it as our start
+            extend = ranges.lowerEntry(start);
+            if (extend != null && extend.getValue().compareTo(start) >= 0)
+                start = extend.getKey();
+
+            // remove all covered intervals
+            // since we have adjusted start and end to cover the ones that would be only partially covered, we
+            // are certain that anything whose start falls within the span is completely covered
+            ranges.subMap(start, end).clear();
+            // add the new interval
+            ranges.put(start, end);
+        }
+
+        public void addAll(IntervalSet<T> otherSet)
+        {
+            for (Map.Entry<T, T> en : otherSet.ranges.entrySet())
+            {
+                add(en.getKey(), en.getValue());
+            }
+        }
+
+        public IntervalSet<T> build()
+        {
+            return new IntervalSet<T>(ImmutableSortedMap.copyOfSorted(ranges));
+        }
+    }
+
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/db/commitlog/MemoryMappedSegment.java b/src/java/org/apache/cassandra/db/commitlog/MemoryMappedSegment.java
index fa9ef37..8259f04 100644
--- a/src/java/org/apache/cassandra/db/commitlog/MemoryMappedSegment.java
+++ b/src/java/org/apache/cassandra/db/commitlog/MemoryMappedSegment.java

@@ -25,7 +25,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.CLibrary;
+import org.apache.cassandra.utils.NativeLibrary;
 import org.apache.cassandra.utils.SyncUtil;
 
 /*
@@ -77,16 +77,21 @@
 
         // write previous sync marker to point to next sync marker
         // we don't chain the crcs here to ensure this method is idempotent if it fails
-        writeSyncMarker(buffer, startMarker, startMarker, nextMarker);
+        writeSyncMarker(id, buffer, startMarker, startMarker, nextMarker);
+    }
 
-        try {
+    @Override
+    protected void flush(int startMarker, int nextMarker)
+    {
+        try
+        {
             SyncUtil.force((MappedByteBuffer) buffer);
         }
         catch (Exception e) // MappedByteBuffer.force() does not declare IOException but can actually throw it
         {
             throw new FSWriteError(e, getPath());
         }
-        CLibrary.trySkipCache(fd, startMarker, nextMarker);
+        NativeLibrary.trySkipCache(fd, startMarker, nextMarker, logFile.getAbsolutePath());
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
index 86a248b..7a09de0 100644
--- a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java

@@ -26,7 +26,8 @@
 
     public PeriodicCommitLogService(final CommitLog commitLog)
     {
-        super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod());
+        super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod(),
+              !commitLog.configuration.useCompression());
     }
 
     protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)

diff --git a/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java b/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java
index 17802ad..b0214b8 100644
--- a/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java
+++ b/src/java/org/apache/cassandra/db/commitlog/ReplayPosition.java

@@ -17,17 +17,11 @@
  */
 package org.apache.cassandra.db.commitlog;
 
-import java.io.DataInput;
 import java.io.IOException;
-import java.util.Map;
-import java.util.NavigableMap;
-import java.util.TreeMap;
-
-import com.google.common.collect.Ordering;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class ReplayPosition implements Comparable<ReplayPosition>
@@ -43,71 +37,6 @@
     public final long segment;
     public final int position;
 
-    /**
-     * A filter of known safe-to-discard commit log replay positions, based on
-     * the range covered by on disk sstables and those prior to the most recent truncation record
-     */
-    public static class ReplayFilter
-    {
-        final NavigableMap<ReplayPosition, ReplayPosition> persisted = new TreeMap<>();
-        public ReplayFilter(Iterable<SSTableReader> onDisk, ReplayPosition truncatedAt)
-        {
-            for (SSTableReader reader : onDisk)
-            {
-                ReplayPosition start = reader.getSSTableMetadata().commitLogLowerBound;
-                ReplayPosition end = reader.getSSTableMetadata().commitLogUpperBound;
-                add(persisted, start, end);
-            }
-            if (truncatedAt != null)
-                add(persisted, ReplayPosition.NONE, truncatedAt);
-        }
-
-        private static void add(NavigableMap<ReplayPosition, ReplayPosition> ranges, ReplayPosition start, ReplayPosition end)
-        {
-            // extend ourselves to cover any ranges we overlap
-            // record directly preceding our end may extend past us, so take the max of our end and its
-            Map.Entry<ReplayPosition, ReplayPosition> extend = ranges.floorEntry(end);
-            if (extend != null && extend.getValue().compareTo(end) > 0)
-                end = extend.getValue();
-
-            // record directly preceding our start may extend into us; if it does, we take it as our start
-            extend = ranges.lowerEntry(start);
-            if (extend != null && extend.getValue().compareTo(start) >= 0)
-                start = extend.getKey();
-
-            ranges.subMap(start, end).clear();
-            ranges.put(start, end);
-        }
-
-        public boolean shouldReplay(ReplayPosition position)
-        {
-            // replay ranges are start exclusive, end inclusive
-            Map.Entry<ReplayPosition, ReplayPosition> range = persisted.lowerEntry(position);
-            return range == null || position.compareTo(range.getValue()) > 0;
-        }
-
-        public boolean isEmpty()
-        {
-            return persisted.isEmpty();
-        }
-    }
-
-    public static ReplayPosition firstNotCovered(Iterable<ReplayFilter> ranges)
-    {
-        ReplayPosition min = null;
-        for (ReplayFilter map : ranges)
-        {
-            ReplayPosition first = map.persisted.firstEntry().getValue();
-            if (min == null)
-                min = first;
-            else
-                min = Ordering.natural().min(min, first);
-        }
-        if (min == null)
-            return NONE;
-        return min;
-    }
-
     public ReplayPosition(long segment, int position)
     {
         this.segment = segment;
@@ -165,14 +94,14 @@
             out.writeInt(rp.position);
         }
 
-        public ReplayPosition deserialize(DataInput in) throws IOException
+        public ReplayPosition deserialize(DataInputPlus in) throws IOException
         {
             return new ReplayPosition(in.readLong(), in.readInt());
         }
 
-        public long serializedSize(ReplayPosition rp, TypeSizes typeSizes)
+        public long serializedSize(ReplayPosition rp)
         {
-            return typeSizes.sizeof(rp.segment) + typeSizes.sizeof(rp.position);
+            return TypeSizes.sizeof(rp.segment) + TypeSizes.sizeof(rp.position);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java
deleted file mode 100644
index 16b5fac..0000000
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactedRow.java
+++ /dev/null

@@ -1,65 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.security.MessageDigest;
-
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.io.util.SequentialWriter;
-
-/**
- * a CompactedRow is an object that takes a bunch of rows (keys + columnfamilies)
- * and can write a compacted version of those rows to an output stream.  It does
- * NOT necessarily require creating a merged CF object in memory.
- */
-public abstract class AbstractCompactedRow implements Closeable
-{
-    public final DecoratedKey key;
-
-    public AbstractCompactedRow(DecoratedKey key)
-    {
-        this.key = key;
-    }
-
-    /**
-     * write the row (size + column index + filter + column data, but NOT row key) to @param out.
-     *
-     * write() may change internal state; it is NOT valid to call write() or update() a second time.
-     *
-     * @return index information for the written row, or null if the compaction resulted in only expired tombstones.
-     */
-    public abstract RowIndexEntry write(long currentPosition, SequentialWriter out) throws IOException;
-
-    /**
-     * update @param digest with the data bytes of the row (not including row key or row size).
-     * May be called even if empty.
-     *
-     * update() may change internal state; it is NOT valid to call write() or update() a second time.
-     */
-    public abstract void update(MessageDigest digest);
-
-    /**
-     * @return aggregate information about the columns in this row.  Some fields may
-     * contain default values if computing them value would require extra effort we're not willing to make.
-     */
-    public abstract ColumnStats columnStats();
-}

diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionIterable.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionIterable.java
deleted file mode 100644
index 9fe8fd9..0000000
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionIterable.java
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.util.List;
-import java.util.UUID;
-import java.util.concurrent.atomic.AtomicLong;
-
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.utils.CloseableIterator;
-
-public abstract class AbstractCompactionIterable extends CompactionInfo.Holder implements Iterable<AbstractCompactedRow>
-{
-    protected final OperationType type;
-    protected final CompactionController controller;
-    protected final long totalBytes;
-    protected volatile long bytesRead;
-    protected final List<ISSTableScanner> scanners;
-    protected final UUID compactionId;
-    /*
-     * counters for merged rows.
-     * array index represents (number of merged rows - 1), so index 0 is counter for no merge (1 row),
-     * index 1 is counter for 2 rows merged, and so on.
-     */
-    protected final AtomicLong[] mergeCounters;
-
-    public AbstractCompactionIterable(CompactionController controller, OperationType type, List<ISSTableScanner> scanners, UUID compactionId)
-    {
-        this.controller = controller;
-        this.type = type;
-        this.scanners = scanners;
-        this.bytesRead = 0;
-        this.compactionId = compactionId;
-
-        long bytes = 0;
-        for (ISSTableScanner scanner : scanners)
-            bytes += scanner.getLengthInBytes();
-        this.totalBytes = bytes;
-        mergeCounters = new AtomicLong[scanners.size()];
-        for (int i = 0; i < mergeCounters.length; i++)
-            mergeCounters[i] = new AtomicLong();
-    }
-
-    public CompactionInfo getCompactionInfo()
-    {
-        return new CompactionInfo(controller.cfs.metadata,
-                                  type,
-                                  bytesRead,
-                                  totalBytes,
-                                  compactionId);
-    }
-
-    protected void updateCounterFor(int rows)
-    {
-        assert rows > 0 && rows - 1 < mergeCounters.length;
-        mergeCounters[rows - 1].incrementAndGet();
-    }
-
-    public long[] getMergedRowCounts()
-    {
-        long[] counters = new long[mergeCounters.length];
-        for (int i = 0; i < counters.length; i++)
-            counters[i] = mergeCounters[i].get();
-        return counters;
-    }
-
-    public abstract CloseableIterator<AbstractCompactedRow> iterator();
-}

diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
index f9ed780..7219504 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java

@@ -24,6 +24,13 @@
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
 import com.google.common.util.concurrent.RateLimiter;
+
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -36,6 +43,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
@@ -60,8 +68,9 @@
     // disable range overlap check when deciding if an SSTable is candidate for tombstone compaction (CASSANDRA-6563)
     protected static final String UNCHECKED_TOMBSTONE_COMPACTION_OPTION = "unchecked_tombstone_compaction";
     protected static final String COMPACTION_ENABLED = "enabled";
+    public static final String ONLY_PURGE_REPAIRED_TOMBSTONES = "only_purge_repaired_tombstones";
 
-    public Map<String, String> options;
+    protected Map<String, String> options;
 
     protected final ColumnFamilyStore cfs;
     protected float tombstoneThreshold;
@@ -69,6 +78,8 @@
     protected boolean uncheckedTombstoneCompaction;
     protected boolean disableTombstoneCompactions = false;
 
+    private final Directories directories;
+
     /**
      * pause/resume/getNextBackgroundTask must synchronize.  This guarantees that after pause completes,
      * no new tasks will be generated; or put another way, pause can't run until in-progress tasks are
@@ -81,8 +92,6 @@
      */
     protected boolean isActive = false;
 
-    protected volatile boolean enabled = true;
-
     protected AbstractCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
     {
         assert cfs != null;
@@ -110,6 +119,13 @@
             tombstoneCompactionInterval = DEFAULT_TOMBSTONE_COMPACTION_INTERVAL;
             uncheckedTombstoneCompaction = DEFAULT_UNCHECKED_TOMBSTONE_COMPACTION_OPTION;
         }
+
+        directories = new Directories(cfs.metadata, Directories.dataDirectories);
+    }
+
+    public Directories getDirectories()
+    {
+        return directories;
     }
 
     /**
@@ -178,7 +194,7 @@
 
     public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, final int gcBefore, long maxSSTableBytes)
     {
-        return new CompactionTask(cfs, txn, gcBefore, false);
+        return new CompactionTask(cfs, txn, gcBefore);
     }
 
     /**
@@ -191,19 +207,12 @@
      */
     public abstract long getMaxSSTableBytes();
 
-    public boolean isEnabled()
-    {
-        return this.enabled && this.isActive;
-    }
-
     public void enable()
     {
-        this.enabled = true;
     }
 
     public void disable()
     {
-        this.enabled = false;
     }
 
     /**
@@ -228,18 +237,13 @@
      * Handle a flushed memtable.
      *
      * @param memtable the flushed memtable
-     * @param sstable the written sstable. can be null if the memtable was clean.
+     * @param sstables the written sstables. can be null or empty if the memtable was clean.
      */
-    public void replaceFlushed(Memtable memtable, SSTableReader sstable)
+    public void replaceFlushed(Memtable memtable, Collection<SSTableReader> sstables)
     {
-    }
-
-    /**
-     * @return a subset of the suggested sstables that are relevant for read requests.
-     */
-    public List<SSTableReader> filterSSTablesForReads(List<SSTableReader> sstables)
-    {
-        return sstables;
+        cfs.getTracker().replaceFlushed(memtable, sstables);
+        if (sstables != null && !sstables.isEmpty())
+            CompactionManager.instance.submitBackground(cfs);
     }
 
     /**
@@ -248,17 +252,22 @@
      * @param originalCandidates The collection to check for excluded SSTables
      * @return list of the SSTables with excluded ones filtered out
      */
-    public static Iterable<SSTableReader> filterSuspectSSTables(Iterable<SSTableReader> originalCandidates)
+    public static List<SSTableReader> filterSuspectSSTables(Iterable<SSTableReader> originalCandidates)
     {
-        return Iterables.filter(originalCandidates, new Predicate<SSTableReader>()
+        List<SSTableReader> filtered = new ArrayList<>();
+        for (SSTableReader sstable : originalCandidates)
         {
-            public boolean apply(SSTableReader sstable)
-            {
-                return !sstable.isMarkedSuspect();
-            }
-        });
+            if (!sstable.isMarkedSuspect())
+                filtered.add(sstable);
+        }
+        return filtered;
     }
 
+
+    public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
+    {
+        return range == null ? getScanners(sstables, (Collection<Range<Token>>)null) : getScanners(sstables, Collections.singleton(range));
+    }
     /**
      * Returns a list of KeyScanners given sstables and a range on which to scan.
      * The default implementation simply grab one SSTableScanner per-sstable, but overriding this method
@@ -266,14 +275,14 @@
      * LeveledCompactionStrategy for instance).
      */
     @SuppressWarnings("resource")
-    public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
+    public ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
     {
         RateLimiter limiter = CompactionManager.instance.getRateLimiter();
         ArrayList<ISSTableScanner> scanners = new ArrayList<ISSTableScanner>();
         try
         {
             for (SSTableReader sstable : sstables)
-                scanners.add(sstable.getScanner(range, limiter));
+                scanners.add(sstable.getScanner(ranges, limiter));
         }
         catch (Throwable t)
         {
@@ -345,7 +354,7 @@
 
     public ScannerList getScanners(Collection<SSTableReader> toCompact)
     {
-        return getScanners(toCompact, null);
+        return getScanners(toCompact, (Collection<Range<Token>>)null);
     }
 
     /**
@@ -374,7 +383,7 @@
         if (uncheckedTombstoneCompaction)
             return true;
 
-        Collection<SSTableReader> overlaps = cfs.getOverlappingSSTables(Collections.singleton(sstable));
+        Collection<SSTableReader> overlaps = cfs.getOverlappingLiveSSTables(Collections.singleton(sstable));
         if (overlaps.isEmpty())
         {
             // there is no overlap, tombstones are safely droppable
@@ -463,6 +472,7 @@
         uncheckedOptions.remove(TOMBSTONE_COMPACTION_INTERVAL_OPTION);
         uncheckedOptions.remove(UNCHECKED_TOMBSTONE_COMPACTION_OPTION);
         uncheckedOptions.remove(COMPACTION_ENABLED);
+        uncheckedOptions.remove(ONLY_PURGE_REPAIRED_TOMBSTONES);
         return uncheckedOptions;
     }
 
@@ -503,4 +513,14 @@
             groupedSSTables.add(currGroup);
         return groupedSSTables;
     }
+
+    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, MetadataCollector meta, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
+    {
+        return SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, cfs.metadata, meta, header, lifecycleNewTracker);
+    }
+
+    public boolean supportsEarlyOpen()
+    {
+        return true;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
index 3bf224e..430c916 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java

@@ -20,8 +20,10 @@
 import java.util.Set;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.compaction.CompactionManager.CompactionExecutorStatsCollector;
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
+import org.apache.cassandra.io.FSDiskFullWriteError;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.WrappedRunnable;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -58,12 +60,18 @@
         {
             return executeInternal(collector);
         }
+        catch(FSDiskFullWriteError e)
+        {
+            RuntimeException cause = new RuntimeException("Converted from FSDiskFullWriteError: " + e.getMessage());
+            cause.setStackTrace(e.getStackTrace());
+            throw new RuntimeException("Throwing new Runtime to bypass exception handler when disk is full", cause);
+        }
         finally
         {
             transaction.close();
         }
     }
-    public abstract CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables);
+    public abstract CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables);
 
     protected abstract int executeInternal(CompactionExecutorStatsCollector collector);
 

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
index e895573..34d093e 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java

@@ -18,19 +18,19 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.*;
+import java.util.function.Predicate;
 
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
+import org.apache.cassandra.db.Memtable;
+import com.google.common.collect.Iterables;
 
+import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.utils.AlwaysPresentFilter;
 
 import org.apache.cassandra.utils.OverlapIterator;
@@ -47,11 +47,12 @@
     static final boolean NEVER_PURGE_TOMBSTONES = Boolean.getBoolean("cassandra.never_purge_tombstones");
 
     public final ColumnFamilyStore cfs;
+    private final boolean compactingRepaired;
     // note that overlapIterator and overlappingSSTables will be null if NEVER_PURGE_TOMBSTONES is set - this is a
     // good thing so that noone starts using them and thinks that if overlappingSSTables is empty, there
     // is no overlap.
     private Refs<SSTableReader> overlappingSSTables;
-    private OverlapIterator<RowPosition, SSTableReader> overlapIterator;
+    private OverlapIterator<PartitionPosition, SSTableReader> overlapIterator;
     private final Iterable<SSTableReader> compacting;
 
     public final int gcBefore;
@@ -67,12 +68,13 @@
         this.cfs = cfs;
         this.gcBefore = gcBefore;
         this.compacting = compacting;
+        compactingRepaired = compacting != null && compacting.stream().allMatch(SSTableReader::isRepaired);
         refreshOverlaps();
         if (NEVER_PURGE_TOMBSTONES)
             logger.warn("You are running with -Dcassandra.never_purge_tombstones=true, this is dangerous!");
     }
 
-    void maybeRefreshOverlaps()
+    public void maybeRefreshOverlaps()
     {
         if (NEVER_PURGE_TOMBSTONES)
         {
@@ -101,7 +103,7 @@
         if (compacting == null)
             overlappingSSTables = Refs.tryRef(Collections.<SSTableReader>emptyList());
         else
-            overlappingSSTables = cfs.getAndReferenceOverlappingSSTables(compacting);
+            overlappingSSTables = cfs.getAndReferenceOverlappingLiveSSTables(compacting);
         this.overlapIterator = new OverlapIterator<>(buildIntervals(overlappingSSTables));
     }
 
@@ -133,6 +135,9 @@
         if (compacting == null || NEVER_PURGE_TOMBSTONES)
             return Collections.<SSTableReader>emptySet();
 
+        if (cfStore.getCompactionStrategyManager().onlyPurgeRepairedTombstones() && !Iterables.all(compacting, SSTableReader::isRepaired))
+            return Collections.emptySet();
+
         List<SSTableReader> candidates = new ArrayList<>();
 
         long minTimestamp = Long.MAX_VALUE;
@@ -197,8 +202,8 @@
      */
     public Predicate<Long> getPurgeEvaluator(DecoratedKey key)
     {
-        if (NEVER_PURGE_TOMBSTONES)
-            return Predicates.alwaysFalse();
+        if (!compactingRepaired() || NEVER_PURGE_TOMBSTONES)
+            return time -> false;
 
         overlapIterator.update(key);
         Set<SSTableReader> filteredSSTables = overlapIterator.overlaps();
@@ -206,7 +211,7 @@
         long minTimestampSeen = Long.MAX_VALUE;
         boolean hasTimestamp = false;
 
-        for (SSTableReader sstable: filteredSSTables)
+        for (SSTableReader sstable : filteredSSTables)
         {
             // if we don't have bloom filter(bf_fp_chance=1.0 or filter file is missing),
             // we check index file instead.
@@ -216,31 +221,24 @@
                 minTimestampSeen = Math.min(minTimestampSeen, sstable.getMinTimestamp());
                 hasTimestamp = true;
             }
-
         }
 
         for (Memtable memtable : memtables)
         {
-            ColumnFamily cf = memtable.getColumnFamily(key);
-            if (cf != null)
+            Partition partition = memtable.getPartition(key);
+            if (partition != null)
             {
-                minTimestampSeen = Math.min(minTimestampSeen, memtable.getMinTimestamp());
+                minTimestampSeen = Math.min(minTimestampSeen, partition.stats().minTimestamp);
                 hasTimestamp = true;
             }
         }
 
         if (!hasTimestamp)
-            return Predicates.alwaysTrue();
+            return time -> true;
         else
         {
             final long finalTimestamp = minTimestampSeen;
-            return new Predicate<Long>()
-            {
-                public boolean apply(Long time)
-                {
-                    return time < finalTimestamp;
-                }
-            };
+            return time -> time < finalTimestamp;
         }
     }
 
@@ -250,4 +248,9 @@
             overlappingSSTables.release();
     }
 
+    public boolean compactingRepaired()
+    {
+        return !cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones() || compactingRepaired;
+    }
+
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
index 3cd8737..2bae5f8 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java

@@ -34,20 +34,43 @@
     private final OperationType tasktype;
     private final long completed;
     private final long total;
-    private final String unit;
+    private final Unit unit;
     private final UUID compactionId;
 
+    public static enum Unit
+    {
+        BYTES("bytes"), RANGES("ranges"), KEYS("keys");
+
+        private final String name;
+
+        private Unit(String name)
+        {
+            this.name = name;
+        }
+
+        @Override
+        public String toString()
+        {
+            return name;
+        }
+
+        public static boolean isFileSize(String unit)
+        {
+            return BYTES.toString().equals(unit);
+        }
+    }
+
     public CompactionInfo(CFMetaData cfm, OperationType tasktype, long bytesComplete, long totalBytes, UUID compactionId)
     {
-        this(cfm, tasktype, bytesComplete, totalBytes, "bytes", compactionId);
+        this(cfm, tasktype, bytesComplete, totalBytes, Unit.BYTES, compactionId);
     }
 
-    public CompactionInfo(OperationType tasktype, long completed, long total, String unit, UUID compactionId)
+    public CompactionInfo(OperationType tasktype, long completed, long total, Unit unit, UUID compactionId)
     {
         this(null, tasktype, completed, total, unit, compactionId);
     }
 
-    public CompactionInfo(CFMetaData cfm, OperationType tasktype, long completed, long total, String unit, UUID compactionId)
+    public CompactionInfo(CFMetaData cfm, OperationType tasktype, long completed, long total, Unit unit, UUID compactionId)
     {
         this.tasktype = tasktype;
         this.completed = completed;
@@ -129,7 +152,7 @@
         ret.put("completed", Long.toString(completed));
         ret.put("total", Long.toString(total));
         ret.put("taskType", tasktype.toString());
-        ret.put("unit", unit);
+        ret.put("unit", unit.toString());
         ret.put("compactionId", compactionId == null ? "" : compactionId.toString());
         return ret;
     }
@@ -146,10 +169,17 @@
             stopRequested = true;
         }
 
+        /**
+         * if this compaction involves several/all tables we can safely check globalCompactionsPaused
+         * in isStopRequested() below
+         */
+        public abstract boolean isGlobal();
+
         public boolean isStopRequested()
         {
-            return stopRequested;
+            return stopRequested || (isGlobal() && CompactionManager.instance.isGlobalCompactionPaused());
         }
+
         /**
          * report event on the size of the compaction.
          */

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java
deleted file mode 100644
index 23d8a4a..0000000
--- a/src/java/org/apache/cassandra/db/compaction/CompactionIterable.java
+++ /dev/null

@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import java.util.UUID;
-
-import com.google.common.collect.ImmutableList;
-
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.utils.CloseableIterator;
-import org.apache.cassandra.utils.MergeIterator;
-
-public class CompactionIterable extends AbstractCompactionIterable
-{
-    final SSTableFormat format;
-
-    private static final Comparator<OnDiskAtomIterator> comparator = new Comparator<OnDiskAtomIterator>()
-    {
-        public int compare(OnDiskAtomIterator i1, OnDiskAtomIterator i2)
-        {
-            return i1.getKey().compareTo(i2.getKey());
-        }
-    };
-
-    public CompactionIterable(OperationType type,
-                              List<ISSTableScanner> scanners,
-                              CompactionController controller,
-                              SSTableFormat.Type formatType,
-                              UUID compactionId)
-    {
-        super(controller, type, scanners, compactionId);
-        this.format = formatType.info;
-    }
-
-    public CloseableIterator<AbstractCompactedRow> iterator()
-    {
-        return MergeIterator.get(scanners, comparator, new Reducer());
-    }
-
-    public String toString()
-    {
-        return this.getCompactionInfo().toString();
-    }
-
-    protected class Reducer extends MergeIterator.Reducer<OnDiskAtomIterator, AbstractCompactedRow>
-    {
-        protected final List<OnDiskAtomIterator> rows = new ArrayList<>();
-
-        public void reduce(OnDiskAtomIterator current)
-        {
-            rows.add(current);
-        }
-
-        protected AbstractCompactedRow getReduced()
-        {
-            assert !rows.isEmpty();
-
-            CompactionIterable.this.updateCounterFor(rows.size());
-            try
-            {
-                // create a new container for rows, since we're going to clear ours for the next one,
-                // and the AbstractCompactionRow code should be able to assume that the collection it receives
-                // won't be pulled out from under it.
-                return format.getCompactedRowWriter(controller, ImmutableList.copyOf(rows));
-            }
-            finally
-            {
-                rows.clear();
-                long n = 0;
-                for (ISSTableScanner scanner : scanners)
-                    n += scanner.getCurrentPosition();
-                bytesRead = n;
-            }
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
new file mode 100644
index 0000000..8c4732b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java

@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.compaction;
+
+import java.util.List;
+import java.util.UUID;
+import java.util.function.Predicate;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+
+import org.apache.cassandra.db.transform.DuplicateRowChecker;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.PurgeFunction;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.index.transactions.CompactionTransaction;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.metrics.CompactionMetrics;
+
+/**
+ * Merge multiple iterators over the content of sstable into a "compacted" iterator.
+ * <p>
+ * On top of the actual merging the source iterators, this class:
+ * <ul>
+ *   <li>purge gc-able tombstones if possible (see PurgeIterator below).</li>
+ *   <li>update 2ndary indexes if necessary (as we don't read-before-write on index updates, index entries are
+ *       not deleted on deletion of the base table data, which is ok because we'll fix index inconsistency
+ *       on reads. This however mean that potentially obsolete index entries could be kept a long time for
+ *       data that is not read often, so compaction "pro-actively" fix such index entries. This is mainly
+ *       an optimization).</li>
+ *   <li>invalidate cached partitions that are empty post-compaction. This avoids keeping partitions with
+ *       only purgable tombstones in the row cache.</li>
+ *   <li>keep tracks of the compaction progress.</li>
+ * </ul>
+ */
+public class CompactionIterator extends CompactionInfo.Holder implements UnfilteredPartitionIterator
+{
+    private static final Logger logger = LoggerFactory.getLogger(CompactionIterator.class);
+    private static final long UNFILTERED_TO_UPDATE_PROGRESS = 100;
+
+    private final OperationType type;
+    private final CompactionController controller;
+    private final List<ISSTableScanner> scanners;
+    private final int nowInSec;
+    private final UUID compactionId;
+
+    private final long totalBytes;
+    private long bytesRead;
+
+    /*
+     * counters for merged rows.
+     * array index represents (number of merged rows - 1), so index 0 is counter for no merge (1 row),
+     * index 1 is counter for 2 rows merged, and so on.
+     */
+    private final long[] mergeCounters;
+
+    private final UnfilteredPartitionIterator compacted;
+    private final CompactionMetrics metrics;
+
+    public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, CompactionController controller, int nowInSec, UUID compactionId)
+    {
+        this(type, scanners, controller, nowInSec, compactionId, null);
+    }
+
+    @SuppressWarnings("resource") // We make sure to close mergedIterator in close() and CompactionIterator is itself an AutoCloseable
+    public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, CompactionController controller, int nowInSec, UUID compactionId, CompactionMetrics metrics)
+    {
+        this.controller = controller;
+        this.type = type;
+        this.scanners = scanners;
+        this.nowInSec = nowInSec;
+        this.compactionId = compactionId;
+        this.bytesRead = 0;
+
+        long bytes = 0;
+        for (ISSTableScanner scanner : scanners)
+            bytes += scanner.getLengthInBytes();
+        this.totalBytes = bytes;
+        this.mergeCounters = new long[scanners.size()];
+        this.metrics = metrics;
+
+        if (metrics != null)
+            metrics.beginCompaction(this);
+
+        UnfilteredPartitionIterator merged = scanners.isEmpty()
+                                             ? EmptyIterators.unfilteredPartition(controller.cfs.metadata, false)
+                                             : UnfilteredPartitionIterators.merge(scanners, nowInSec, listener());
+        boolean isForThrift = merged.isForThrift(); // to stop capture of iterator in Purger, which is confusing for debug
+        merged = Transformation.apply(merged, new Purger(isForThrift, controller, nowInSec));
+        this.compacted = DuplicateRowChecker.duringCompaction(merged, type);
+    }
+
+    public boolean isForThrift()
+    {
+        return false;
+    }
+
+    public CFMetaData metadata()
+    {
+        return controller.cfs.metadata;
+    }
+
+    public CompactionInfo getCompactionInfo()
+    {
+        return new CompactionInfo(controller.cfs.metadata,
+                                  type,
+                                  bytesRead,
+                                  totalBytes,
+                                  compactionId);
+    }
+
+    public boolean isGlobal()
+    {
+        return false;
+    }
+
+    private void updateCounterFor(int rows)
+    {
+        assert rows > 0 && rows - 1 < mergeCounters.length;
+        mergeCounters[rows - 1] += 1;
+    }
+
+    public long[] getMergedRowCounts()
+    {
+        return mergeCounters;
+    }
+
+    private UnfilteredPartitionIterators.MergeListener listener()
+    {
+        return new UnfilteredPartitionIterators.MergeListener()
+        {
+            public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions)
+            {
+                int merged = 0;
+                for (UnfilteredRowIterator iter : versions)
+                {
+                    if (iter != null)
+                        merged++;
+                }
+
+                assert merged > 0;
+
+                CompactionIterator.this.updateCounterFor(merged);
+
+                if (type != OperationType.COMPACTION || !controller.cfs.indexManager.hasIndexes())
+                    return null;
+
+                Columns statics = Columns.NONE;
+                Columns regulars = Columns.NONE;
+                for (UnfilteredRowIterator iter : versions)
+                {
+                    if (iter != null)
+                    {
+                        statics = statics.mergeTo(iter.columns().statics);
+                        regulars = regulars.mergeTo(iter.columns().regulars);
+                    }
+                }
+                final PartitionColumns partitionColumns = new PartitionColumns(statics, regulars);
+
+                // If we have a 2ndary index, we must update it with deleted/shadowed cells.
+                // we can reuse a single CleanupTransaction for the duration of a partition.
+                // Currently, it doesn't do any batching of row updates, so every merge event
+                // for a single partition results in a fresh cycle of:
+                // * Get new Indexer instances
+                // * Indexer::start
+                // * Indexer::onRowMerge (for every row being merged by the compaction)
+                // * Indexer::commit
+                // A new OpOrder.Group is opened in an ARM block wrapping the commits
+                // TODO: this should probably be done asynchronously and batched.
+                final CompactionTransaction indexTransaction =
+                    controller.cfs.indexManager.newCompactionTransaction(partitionKey,
+                                                                         partitionColumns,
+                                                                         versions.size(),
+                                                                         nowInSec);
+
+                return new UnfilteredRowIterators.MergeListener()
+                {
+                    public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, DeletionTime[] versions)
+                    {
+                    }
+
+                    public Row onMergedRows(Row merged, Row[] versions)
+                    {
+                        indexTransaction.start();
+                        indexTransaction.onRowMerge(merged, versions);
+                        indexTransaction.commit();
+                        return merged;
+                    }
+
+                    public void onMergedRangeTombstoneMarkers(RangeTombstoneMarker mergedMarker, RangeTombstoneMarker[] versions)
+                    {
+                    }
+
+                    public void close()
+                    {
+                    }
+                };
+            }
+
+            public void close()
+            {
+            }
+        };
+    }
+
+    private void updateBytesRead()
+    {
+        long n = 0;
+        for (ISSTableScanner scanner : scanners)
+            n += scanner.getCurrentPosition();
+        bytesRead = n;
+    }
+
+    public boolean hasNext()
+    {
+        return compacted.hasNext();
+    }
+
+    public UnfilteredRowIterator next()
+    {
+        return compacted.next();
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void close()
+    {
+        try
+        {
+            compacted.close();
+        }
+        finally
+        {
+            if (metrics != null)
+                metrics.finishCompaction(this);
+        }
+    }
+
+    public String toString()
+    {
+        return this.getCompactionInfo().toString();
+    }
+
+    private class Purger extends PurgeFunction
+    {
+        private final CompactionController controller;
+
+        private DecoratedKey currentKey;
+        private Predicate<Long> purgeEvaluator;
+
+        private long compactedUnfiltered;
+
+        private Purger(boolean isForThrift, CompactionController controller, int nowInSec)
+        {
+            super(isForThrift,
+                  nowInSec,
+                  controller.gcBefore,
+                  controller.compactingRepaired() ? Integer.MAX_VALUE : Integer.MIN_VALUE,
+                  controller.cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones(),
+                  controller.cfs.metadata.enforceStrictLiveness());
+            this.controller = controller;
+        }
+
+        @Override
+        protected void onEmptyPartitionPostPurge(DecoratedKey key)
+        {
+            if (type == OperationType.COMPACTION)
+                controller.cfs.invalidateCachedPartition(key);
+        }
+
+        @Override
+        protected void onNewPartition(DecoratedKey key)
+        {
+            currentKey = key;
+            purgeEvaluator = null;
+        }
+
+        @Override
+        protected void updateProgress()
+        {
+            if ((++compactedUnfiltered) % UNFILTERED_TO_UPDATE_PROGRESS == 0)
+                updateBytesRead();
+        }
+
+        /*
+         * Evaluates whether a tombstone with the given deletion timestamp can be purged. This is the minimum
+         * timestamp for any sstable containing `currentKey` outside of the set of sstables involved in this compaction.
+         * This is computed lazily on demand as we only need this if there is tombstones and this a bit expensive
+         * (see #8914).
+         */
+        protected Predicate<Long> getPurgeEvaluator()
+        {
+            if (purgeEvaluator == null)
+            {
+                purgeEvaluator = controller.getPurgeEvaluator(currentKey);
+            }
+            return purgeEvaluator;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index 46f0acf..2b9ee50 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java

@@ -21,13 +21,12 @@
 import java.io.IOException;
 import java.util.*;
 import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Predicate;
 import javax.management.openmbean.OpenDataException;
 import javax.management.openmbean.TabularData;
 
-import org.apache.commons.lang3.StringUtils;
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
 import com.google.common.collect.*;
 import com.google.common.util.concurrent.*;
 
@@ -43,12 +42,22 @@
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.compaction.CompactionInfo.Holder;
-import org.apache.cassandra.db.index.SecondaryIndexBuilder;
+import org.apache.cassandra.db.lifecycle.ILifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.lifecycle.WrappedLifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.view.ViewBuilder;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.index.SecondaryIndexBuilder;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.IndexSummaryRedistribution;
+import org.apache.cassandra.io.sstable.SSTableRewriter;
+import org.apache.cassandra.io.sstable.SnapshotDeletingTask;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
@@ -58,7 +67,6 @@
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.*;
-import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.concurrent.Refs;
 
 import static java.util.Collections.singleton;
@@ -100,12 +108,15 @@
 
     private final CompactionExecutor executor = new CompactionExecutor();
     private final CompactionExecutor validationExecutor = new ValidationExecutor();
-    private final static CompactionExecutor cacheCleanupExecutor = new CacheCleanupExecutor();
+    private final CompactionExecutor cacheCleanupExecutor = new CacheCleanupExecutor();
 
     private final CompactionMetrics metrics = new CompactionMetrics(executor, validationExecutor);
     @VisibleForTesting
     final Multiset<ColumnFamilyStore> compactingCF = ConcurrentHashMultiset.create();
 
+    // used to temporarily pause non-strategy managed compactions (like index summary redistribution)
+    private final AtomicInteger globalCompactionPauseCount = new AtomicInteger(0);
+
     private final RateLimiter compactionRateLimiter = RateLimiter.create(Double.MAX_VALUE);
 
     /**
@@ -165,7 +176,7 @@
         logger.trace("Scheduling a background task check for {}.{} with {}",
                      cfs.keyspace.getName(),
                      cfs.name,
-                     cfs.getCompactionStrategy().getName());
+                     cfs.getCompactionStrategyManager().getName());
 
         List<Future<?>> futures = new ArrayList<>(1);
         Future<?> fut = executor.submitIfRunning(new BackgroundCompactionCandidate(cfs), "background task");
@@ -247,8 +258,8 @@
                     return;
                 }
 
-                AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
-                AbstractCompactionTask task = strategy.getNextBackgroundTask(getDefaultGcBefore(cfs));
+                CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
+                AbstractCompactionTask task = strategy.getNextBackgroundTask(getDefaultGcBefore(cfs, FBUtilities.nowInSeconds()));
                 if (task == null)
                 {
                     logger.trace("No tasks available");
@@ -277,18 +288,21 @@
     @SuppressWarnings("resource")
     private AllSSTableOpStatus parallelAllSSTableOperation(final ColumnFamilyStore cfs, final OneSSTableOperation operation, int jobs, OperationType operationType) throws ExecutionException, InterruptedException
     {
+        logger.info("Starting {} for {}.{}", operationType, cfs.keyspace.getName(), cfs.getTableName());
         List<LifecycleTransaction> transactions = new ArrayList<>();
+        List<Future<?>> futures = new ArrayList<>();
         try (LifecycleTransaction compacting = cfs.markAllCompacting(operationType))
         {
-            Iterable<SSTableReader> sstables = compacting != null ? Lists.newArrayList(operation.filterSSTables(compacting)) : Collections.<SSTableReader>emptyList();
+            if (compacting == null)
+                return AllSSTableOpStatus.UNABLE_TO_CANCEL;
+
+            Iterable<SSTableReader> sstables = Lists.newArrayList(operation.filterSSTables(compacting));
             if (Iterables.isEmpty(sstables))
             {
                 logger.info("No sstables for {}.{}", cfs.keyspace.getName(), cfs.name);
                 return AllSSTableOpStatus.SUCCESSFUL;
             }
 
-            List<Future<?>> futures = new ArrayList<>();
-
             for (final SSTableReader sstable : sstables)
             {
                 final LifecycleTransaction txn = compacting.split(singleton(sstable));
@@ -310,19 +324,29 @@
 
                 if (jobs > 0 && futures.size() == jobs)
                 {
-                    FBUtilities.waitOnFutures(futures);
-                    futures.clear();
+                    Future<?> f = FBUtilities.waitOnFirstFuture(futures);
+                    futures.remove(f);
                 }
             }
             FBUtilities.waitOnFutures(futures);
             assert compacting.originals().isEmpty();
+            logger.info("Finished {} for {}.{} successfully", operationType, cfs.keyspace.getName(), cfs.getTableName());
             return AllSSTableOpStatus.SUCCESSFUL;
         }
         finally
         {
+            // wait on any unfinished futures to make sure we don't close an ongoing transaction
+            try
+            {
+                FBUtilities.waitOnFutures(futures);
+            }
+            catch (Throwable t)
+            {
+               // these are handled/logged in CompactionExecutor#afterExecute
+            }
             Throwable fail = Throwables.close(null, transactions);
             if (fail != null)
-                logger.error("Failed to cleanup lifecycle transactions {}", fail);
+                logger.error("Failed to cleanup lifecycle transactions ({} for {}.{})", operationType, cfs.keyspace.getName(), cfs.getTableName(), fail);
         }
     }
 
@@ -332,7 +356,12 @@
         void execute(LifecycleTransaction input) throws IOException;
     }
 
-    public enum AllSSTableOpStatus { ABORTED(1), SUCCESSFUL(0);
+    public enum AllSSTableOpStatus
+    {
+        SUCCESSFUL(0),
+        ABORTED(1),
+        UNABLE_TO_CANCEL(2);
+
         public final int statusCode;
 
         AllSSTableOpStatus(int statusCode)
@@ -341,13 +370,15 @@
         }
     }
 
-    public AllSSTableOpStatus performScrub(final ColumnFamilyStore cfs, final boolean skipCorrupted, final boolean checkData, int jobs)
+    public AllSSTableOpStatus performScrub(final ColumnFamilyStore cfs, final boolean skipCorrupted, final boolean checkData,
+                                           int jobs)
     throws InterruptedException, ExecutionException
     {
         return performScrub(cfs, skipCorrupted, checkData, false, jobs);
     }
 
-    public AllSSTableOpStatus performScrub(final ColumnFamilyStore cfs, final boolean skipCorrupted, final boolean checkData, final boolean reinsertOverflowedTTLRows, int jobs)
+    public AllSSTableOpStatus performScrub(final ColumnFamilyStore cfs, final boolean skipCorrupted, final boolean checkData,
+                                           final boolean reinsertOverflowedTTL, int jobs)
     throws InterruptedException, ExecutionException
     {
         return parallelAllSSTableOperation(cfs, new OneSSTableOperation()
@@ -361,7 +392,7 @@
             @Override
             public void execute(LifecycleTransaction input) throws IOException
             {
-                scrubOne(cfs, input, skipCorrupted, checkData, reinsertOverflowedTTLRows);
+                scrubOne(cfs, input, skipCorrupted, checkData, reinsertOverflowedTTL);
             }
         }, jobs, OperationType.SCRUB);
     }
@@ -392,8 +423,9 @@
             @Override
             public Iterable<SSTableReader> filterSSTables(LifecycleTransaction transaction)
             {
-                Iterable<SSTableReader> sstables = new ArrayList<>(transaction.originals());
-                Iterator<SSTableReader> iter = sstables.iterator();
+                List<SSTableReader> sortedSSTables = Lists.newArrayList(transaction.originals());
+                Collections.sort(sortedSSTables, SSTableReader.sizeComparator.reversed());
+                Iterator<SSTableReader> iter = sortedSSTables.iterator();
                 while (iter.hasNext())
                 {
                     SSTableReader sstable = iter.next();
@@ -403,13 +435,13 @@
                         iter.remove();
                     }
                 }
-                return sstables;
+                return sortedSSTables;
             }
 
             @Override
             public void execute(LifecycleTransaction txn) throws IOException
             {
-                AbstractCompactionTask task = cfs.getCompactionStrategy().getCompactionTask(txn, NO_GC, Long.MAX_VALUE);
+                AbstractCompactionTask task = cfs.getCompactionStrategyManager().getCompactionTask(txn, NO_GC, Long.MAX_VALUE);
                 task.setUserDefined(true);
                 task.setCompactionType(OperationType.UPGRADE_SSTABLES);
                 task.execute(metrics);
@@ -426,12 +458,8 @@
             logger.info("Cleanup cannot run before a node has joined the ring");
             return AllSSTableOpStatus.ABORTED;
         }
+        // if local ranges is empty, it means no data should remain
         final Collection<Range<Token>> ranges = StorageService.instance.getLocalRanges(keyspace.getName());
-        if (ranges.isEmpty())
-        {
-            logger.info("Node owns no data for keyspace {}", keyspace.getName());
-            return AllSSTableOpStatus.SUCCESSFUL;
-        }
         final boolean hasIndexes = cfStore.indexManager.hasIndexes();
 
         return parallelAllSSTableOperation(cfStore, new OneSSTableOperation()
@@ -440,14 +468,32 @@
             public Iterable<SSTableReader> filterSSTables(LifecycleTransaction transaction)
             {
                 List<SSTableReader> sortedSSTables = Lists.newArrayList(transaction.originals());
-                Collections.sort(sortedSSTables, new SSTableReader.SizeComparator());
+                Iterator<SSTableReader> sstableIter = sortedSSTables.iterator();
+                int totalSSTables = 0;
+                int skippedSStables = 0;
+                while (sstableIter.hasNext())
+                {
+                    SSTableReader sstable = sstableIter.next();
+                    totalSSTables++;
+                    if (!needsCleanup(sstable, ranges))
+                    {
+                        logger.debug("Not cleaning up {} ([{}, {}]) - no tokens outside owned ranges {}",
+                                     sstable, sstable.first.getToken(), sstable.last.getToken(), ranges);
+                        sstableIter.remove();
+                        transaction.cancel(sstable);
+                        skippedSStables++;
+                    }
+                }
+                logger.info("Skipping cleanup for {}/{} sstables for {}.{} since they are fully contained in owned ranges ({})",
+                            skippedSStables, totalSSTables, cfStore.keyspace.getName(), cfStore.getTableName(), ranges);
+                sortedSSTables.sort(new SSTableReader.SizeComparator());
                 return sortedSSTables;
             }
 
             @Override
             public void execute(LifecycleTransaction txn) throws IOException
             {
-                CleanupStrategy cleanupStrategy = CleanupStrategy.get(cfStore, ranges);
+                CleanupStrategy cleanupStrategy = CleanupStrategy.get(cfStore, ranges, FBUtilities.nowInSeconds());
                 doCleanupOne(cfStore, txn, cleanupStrategy, ranges, hasIndexes);
             }
         }, jobs, OperationType.CLEANUP);
@@ -461,12 +507,14 @@
      * @param ranges Repaired ranges to be anti-compacted into separate SSTables.
      * @param sstables {@link Refs} of SSTables within CF to anti-compact.
      * @param repairedAt Unix timestamp of when repair was completed.
+     * @param parentRepairSession Corresponding repair session
      * @return Futures executing anti-compaction.
      */
     public ListenableFuture<?> submitAntiCompaction(final ColumnFamilyStore cfs,
                                           final Collection<Range<Token>> ranges,
                                           final Refs<SSTableReader> sstables,
-                                          final long repairedAt)
+                                          final long repairedAt,
+                                          final UUID parentRepairSession)
     {
         Runnable runnable = new WrappedRunnable() {
             @Override
@@ -487,7 +535,7 @@
                     sstables.release(compactedSSTables);
                     modifier = cfs.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
                 }
-                performAnticompaction(cfs, ranges, sstables, modifier, repairedAt);
+                performAnticompaction(cfs, ranges, sstables, modifier, repairedAt, parentRepairSession);
             }
         };
 
@@ -520,6 +568,7 @@
      * @param ranges Ranges that the repair was carried out on
      * @param validatedForRepair SSTables containing the repaired ranges. Should be referenced before passing them.
      * @param txn Transaction across all SSTables that were repaired.
+     * @param parentRepairSession parent repair session ID
      * @throws InterruptedException
      * @throws IOException
      */
@@ -527,10 +576,11 @@
                                       Collection<Range<Token>> ranges,
                                       Refs<SSTableReader> validatedForRepair,
                                       LifecycleTransaction txn,
-                                      long repairedAt) throws InterruptedException, IOException
+                                      long repairedAt,
+                                      UUID parentRepairSession) throws InterruptedException, IOException
     {
-        logger.info("Starting anticompaction for {}.{} on {}/{} sstables", cfs.keyspace.getName(), cfs.getColumnFamilyName(), validatedForRepair.size(), cfs.getSSTables().size());
-        logger.trace("Starting anticompaction for ranges {}", ranges);
+        logger.info("[repair #{}] Starting anticompaction for {}.{} on {}/{} sstables", parentRepairSession, cfs.keyspace.getName(), cfs.getTableName(), validatedForRepair.size(), cfs.getLiveSSTables());
+        logger.trace("[repair #{}] Starting anticompaction for ranges {}", parentRepairSession, ranges);
         Set<SSTableReader> sstables = new HashSet<>(validatedForRepair);
         Set<SSTableReader> mutatedRepairStatuses = new HashSet<>(); // SSTables that were completely repaired only
         Set<SSTableReader> nonAnticompacting = new HashSet<>();
@@ -557,7 +607,7 @@
                 {
                     if (r.contains(sstableBounds.left) && r.contains(sstableBounds.right))
                     {
-                        logger.info("SSTable {} fully contained in range {}, mutating repairedAt instead of anticompacting", sstable, r);
+                        logger.info("[repair #{}] SSTable {} fully contained in range {}, mutating repairedAt instead of anticompacting", parentRepairSession, sstable, r);
                         sstable.descriptor.getMetadataSerializer().mutateRepairedAt(sstable.descriptor, repairedAt);
                         sstable.reloadSSTableMetadata();
                         if (!nonAnticompacting.contains(sstable)) // don't notify if the SSTable was already repaired
@@ -574,11 +624,11 @@
                 }
 
                 if (!anticompactRanges.isEmpty())
-                    logger.info("SSTable {} ({}) will be anticompacted on ranges: {}", sstable, sstableBounds, StringUtils.join(anticompactRanges, ", "));
+                    logger.info("[repair #{}] SSTable {} ({}) will be anticompacted on range {}", parentRepairSession, sstable, sstableBounds, String.join(", ", anticompactRanges));
 
                 if (!shouldAnticompact)
                 {
-                    logger.info("SSTable {} ({}) not subject to anticompaction of repaired ranges {}, not touching repairedAt.", sstable, sstableBounds, normalizedRanges);
+                    logger.info("[repair #{}] SSTable {} ({}) not subject to anticompaction of repaired ranges {}, not touching repairedAt.", parentRepairSession, sstable, sstableBounds, normalizedRanges);
                     nonAnticompacting.add(sstable);
                     sstableIterator.remove();
                 }
@@ -597,12 +647,12 @@
             txn.close();
         }
 
-        logger.info("Completed anticompaction successfully");
+        logger.info("[repair #{}] Completed anticompaction successfully", parentRepairSession);
     }
 
     public void performMaximal(final ColumnFamilyStore cfStore, boolean splitOutput)
     {
-        FBUtilities.waitOnFutures(submitMaximal(cfStore, getDefaultGcBefore(cfStore), splitOutput));
+        FBUtilities.waitOnFutures(submitMaximal(cfStore, getDefaultGcBefore(cfStore, FBUtilities.nowInSeconds()), splitOutput));
     }
 
     public List<Future<?>> submitMaximal(final ColumnFamilyStore cfStore, final int gcBefore, boolean splitOutput)
@@ -610,7 +660,7 @@
         // here we compute the task off the compaction executor, so having that present doesn't
         // confuse runWithCompactionsDisabled -- i.e., we don't want to deadlock ourselves, waiting
         // for ourselves to finish/acknowledge cancellation before continuing.
-        final Collection<AbstractCompactionTask> tasks = cfStore.getCompactionStrategy().getMaximalTask(gcBefore, splitOutput);
+        final Collection<AbstractCompactionTask> tasks = cfStore.getCompactionStrategyManager().getMaximalTasks(gcBefore, splitOutput);
 
         if (tasks == null)
             return Collections.emptyList();
@@ -654,12 +704,13 @@
             }
             // group by keyspace/columnfamily
             ColumnFamilyStore cfs = Keyspace.open(desc.ksname).getColumnFamilyStore(desc.cfname);
-            descriptors.put(cfs, cfs.directories.find(new File(filename.trim()).getName()));
+            descriptors.put(cfs, cfs.getDirectories().find(new File(filename.trim()).getName()));
         }
 
         List<Future<?>> futures = new ArrayList<>();
+        int nowInSec = FBUtilities.nowInSeconds();
         for (ColumnFamilyStore cfs : descriptors.keySet())
-            futures.add(submitUserDefined(cfs, descriptors.get(cfs), getDefaultGcBefore(cfs)));
+            futures.add(submitUserDefined(cfs, descriptors.get(cfs), getDefaultGcBefore(cfs, nowInSec)));
         FBUtilities.waitOnFutures(futures);
     }
 
@@ -692,7 +743,7 @@
                 }
                 else
                 {
-                    AbstractCompactionTask task = cfs.getCompactionStrategy().getUserDefinedTask(sstables, gcBefore);
+                    AbstractCompactionTask task = cfs.getCompactionStrategyManager().getUserDefinedTask(sstables, gcBefore);
                     if (task != null)
                         task.execute(metrics);
                 }
@@ -706,7 +757,7 @@
     // This is not efficient, do not use in any critical path
     private SSTableReader lookupSSTable(final ColumnFamilyStore cfs, Descriptor descriptor)
     {
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
         {
             if (sstable.descriptor.equals(descriptor))
                 return sstable;
@@ -750,11 +801,11 @@
         }
     }
 
-    private void scrubOne(ColumnFamilyStore cfs, LifecycleTransaction modifier, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows) throws IOException
+    private void scrubOne(ColumnFamilyStore cfs, LifecycleTransaction modifier, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL) throws IOException
     {
         CompactionInfo.Holder scrubInfo = null;
 
-        try (Scrubber scrubber = new Scrubber(cfs, modifier, skipCorrupted, checkData, reinsertOverflowedTTLRows))
+        try (Scrubber scrubber = new Scrubber(cfs, modifier, skipCorrupted, checkData, reinsertOverflowedTTL))
         {
             scrubInfo = scrubber.getScrubInfo();
             metrics.beginCompaction(scrubInfo);
@@ -791,7 +842,10 @@
     @VisibleForTesting
     public static boolean needsCleanup(SSTableReader sstable, Collection<Range<Token>> ownedRanges)
     {
-        assert !ownedRanges.isEmpty(); // cleanup checks for this
+        if (ownedRanges.isEmpty())
+        {
+            return true; // all data will be cleaned
+        }
 
         // unwrap and sort the ranges by LHS token
         List<Range<Token>> sortedRanges = Range.normalize(ownedRanges);
@@ -844,22 +898,18 @@
      *
      * @throws IOException
      */
-    @SuppressWarnings("resource")
     private void doCleanupOne(final ColumnFamilyStore cfs, LifecycleTransaction txn, CleanupStrategy cleanupStrategy, Collection<Range<Token>> ranges, boolean hasIndexes) throws IOException
     {
         assert !cfs.isIndex();
 
         SSTableReader sstable = txn.onlyOne();
 
+        // if ranges is empty and no index, entire sstable is discarded
         if (!hasIndexes && !new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(ranges))
         {
             txn.obsoleteOriginals();
             txn.finish();
-            return;
-        }
-        if (!needsCleanup(sstable, ranges))
-        {
-            logger.trace("Skipping {} for cleanup; all rows should be kept", sstable);
+            logger.info("SSTable {} ([{}, {}]) does not intersect the owned ranges ({}), dropping it", sstable, sstable.first.getToken(), sstable.last.getToken(), ranges);
             return;
         }
 
@@ -867,53 +917,48 @@
 
         long totalkeysWritten = 0;
 
-        long expectedBloomFilterSize = Math.max(cfs.metadata.getMinIndexInterval(),
+        long expectedBloomFilterSize = Math.max(cfs.metadata.params.minIndexInterval,
                                                SSTableReader.getApproximateKeyCount(txn.originals()));
         if (logger.isTraceEnabled())
             logger.trace("Expected bloom filter size : {}", expectedBloomFilterSize);
 
         logger.info("Cleaning up {}", sstable);
 
-        File compactionFileLocation = cfs.directories.getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(txn.originals(), OperationType.CLEANUP));
+        File compactionFileLocation = cfs.getDirectories().getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(txn.originals(), OperationType.CLEANUP));
         if (compactionFileLocation == null)
             throw new IOException("disk full");
 
-        ISSTableScanner scanner = cleanupStrategy.getScanner(sstable, getRateLimiter());
-        CleanupInfo ci = new CleanupInfo(sstable, scanner);
-
-        metrics.beginCompaction(ci);
         List<SSTableReader> finished;
-        try (SSTableRewriter writer = new SSTableRewriter(cfs, txn, sstable.maxDataAge, false);
-             CompactionController controller = new CompactionController(cfs, txn.originals(), getDefaultGcBefore(cfs));
-             Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable)))
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (SSTableRewriter writer = SSTableRewriter.construct(cfs, txn, false, sstable.maxDataAge, false);
+             ISSTableScanner scanner = cleanupStrategy.getScanner(sstable, getRateLimiter());
+             CompactionController controller = new CompactionController(cfs, txn.originals(), getDefaultGcBefore(cfs, nowInSec));
+             Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable));
+             CompactionIterator ci = new CompactionIterator(OperationType.CLEANUP, Collections.singletonList(scanner), controller, nowInSec, UUIDGen.getTimeUUID(), metrics))
         {
-            writer.switchWriter(createWriter(cfs, compactionFileLocation, expectedBloomFilterSize, sstable.getSSTableMetadata().repairedAt, sstable));
+            writer.switchWriter(createWriter(cfs, compactionFileLocation, expectedBloomFilterSize, sstable.getSSTableMetadata().repairedAt, sstable, txn));
 
-            while (scanner.hasNext())
+            while (ci.hasNext())
             {
                 if (ci.isStopRequested())
                     throw new CompactionInterruptedException(ci.getCompactionInfo());
 
-                @SuppressWarnings("resource")
-                SSTableIdentityIterator row = cleanupStrategy.cleanup((SSTableIdentityIterator) scanner.next());
-                if (row == null)
-                    continue;
-                @SuppressWarnings("resource")
-                AbstractCompactedRow compactedRow = new LazilyCompactedRow(controller, Collections.singletonList(row));
-                if (writer.append(compactedRow) != null)
-                    totalkeysWritten++;
+                try (UnfilteredRowIterator partition = ci.next();
+                     UnfilteredRowIterator notCleaned = cleanupStrategy.cleanup(partition))
+                {
+                    if (notCleaned == null)
+                        continue;
+
+                    if (writer.append(notCleaned) != null)
+                        totalkeysWritten++;
+                }
             }
 
             // flush to ensure we don't lose the tombstones on a restart, since they are not commitlog'd
-            cfs.indexManager.flushIndexesBlocking();
+            cfs.indexManager.flushAllIndexesBlocking();
 
             finished = writer.finish();
         }
-        finally
-        {
-            scanner.close();
-            metrics.finishCompaction(ci);
-        }
 
         if (!finished.isEmpty())
         {
@@ -931,23 +976,30 @@
 
     private static abstract class CleanupStrategy
     {
-        public static CleanupStrategy get(ColumnFamilyStore cfs, Collection<Range<Token>> ranges)
+        protected final Collection<Range<Token>> ranges;
+        protected final int nowInSec;
+
+        protected CleanupStrategy(Collection<Range<Token>> ranges, int nowInSec)
+        {
+            this.ranges = ranges;
+            this.nowInSec = nowInSec;
+        }
+
+        public static CleanupStrategy get(ColumnFamilyStore cfs, Collection<Range<Token>> ranges, int nowInSec)
         {
             return cfs.indexManager.hasIndexes()
-                 ? new Full(cfs, ranges)
-                 : new Bounded(cfs, ranges);
+                 ? new Full(cfs, ranges, nowInSec)
+                 : new Bounded(cfs, ranges, nowInSec);
         }
 
         public abstract ISSTableScanner getScanner(SSTableReader sstable, RateLimiter limiter);
-        public abstract SSTableIdentityIterator cleanup(SSTableIdentityIterator row);
+        public abstract UnfilteredRowIterator cleanup(UnfilteredRowIterator partition);
 
         private static final class Bounded extends CleanupStrategy
         {
-            private final Collection<Range<Token>> ranges;
-
-            public Bounded(final ColumnFamilyStore cfs, Collection<Range<Token>> ranges)
+            public Bounded(final ColumnFamilyStore cfs, Collection<Range<Token>> ranges, int nowInSec)
             {
-                this.ranges = ranges;
+                super(ranges, nowInSec);
                 instance.cacheCleanupExecutor.submit(new Runnable()
                 {
                     @Override
@@ -956,8 +1008,8 @@
                         cfs.cleanupCache();
                     }
                 });
-
             }
+
             @Override
             public ISSTableScanner getScanner(SSTableReader sstable, RateLimiter limiter)
             {
@@ -965,23 +1017,20 @@
             }
 
             @Override
-            public SSTableIdentityIterator cleanup(SSTableIdentityIterator row)
+            public UnfilteredRowIterator cleanup(UnfilteredRowIterator partition)
             {
-                return row;
+                return partition;
             }
         }
 
         private static final class Full extends CleanupStrategy
         {
-            private final Collection<Range<Token>> ranges;
             private final ColumnFamilyStore cfs;
-            private List<Cell> indexedColumnsInRow;
 
-            public Full(ColumnFamilyStore cfs, Collection<Range<Token>> ranges)
+            public Full(ColumnFamilyStore cfs, Collection<Range<Token>> ranges, int nowInSec)
             {
+                super(ranges, nowInSec);
                 this.cfs = cfs;
-                this.ranges = ranges;
-                this.indexedColumnsInRow = null;
             }
 
             @Override
@@ -991,37 +1040,14 @@
             }
 
             @Override
-            public SSTableIdentityIterator cleanup(SSTableIdentityIterator row)
+            public UnfilteredRowIterator cleanup(UnfilteredRowIterator partition)
             {
-                if (Range.isInRanges(row.getKey().getToken(), ranges))
-                    return row;
+                if (Range.isInRanges(partition.partitionKey().getToken(), ranges))
+                    return partition;
 
-                cfs.invalidateCachedRow(row.getKey());
+                cfs.invalidateCachedPartition(partition.partitionKey());
 
-                if (indexedColumnsInRow != null)
-                    indexedColumnsInRow.clear();
-
-                while (row.hasNext())
-                {
-                    OnDiskAtom column = row.next();
-
-                    if (column instanceof Cell && cfs.indexManager.indexes((Cell) column))
-                    {
-                        if (indexedColumnsInRow == null)
-                            indexedColumnsInRow = new ArrayList<>();
-
-                        indexedColumnsInRow.add((Cell) column);
-                    }
-                }
-
-                if (indexedColumnsInRow != null && !indexedColumnsInRow.isEmpty())
-                {
-                    // acquire memtable lock here because secondary index deletion may cause a race. See CASSANDRA-3712
-                    try (OpOrder.Group opGroup = cfs.keyspace.writeOrder.start())
-                    {
-                        cfs.indexManager.deleteFromIndexes(row.getKey(), indexedColumnsInRow, opGroup);
-                    }
-                }
+                cfs.indexManager.deletePartition(partition, nowInSec);
                 return null;
             }
         }
@@ -1031,23 +1057,29 @@
                                              File compactionFileLocation,
                                              long expectedBloomFilterSize,
                                              long repairedAt,
-                                             SSTableReader sstable)
+                                             SSTableReader sstable,
+                                             LifecycleTransaction txn)
     {
         FileUtils.createDirectory(compactionFileLocation);
+        SerializationHeader header = sstable.header;
+        if (header == null)
+            header = SerializationHeader.make(sstable.metadata, Collections.singleton(sstable));
 
         return SSTableWriter.create(cfs.metadata,
-                                    Descriptor.fromFilename(cfs.getTempSSTablePath(compactionFileLocation)),
+                                    Descriptor.fromFilename(cfs.getSSTablePath(compactionFileLocation)),
                                     expectedBloomFilterSize,
                                     repairedAt,
                                     sstable.getSSTableLevel(),
-                                    cfs.partitioner);
+                                    header,
+                                    txn);
     }
 
     public static SSTableWriter createWriterForAntiCompaction(ColumnFamilyStore cfs,
-                                             File compactionFileLocation,
-                                             int expectedBloomFilterSize,
-                                             long repairedAt,
-                                             Collection<SSTableReader> sstables)
+                                                              File compactionFileLocation,
+                                                              int expectedBloomFilterSize,
+                                                              long repairedAt,
+                                                              Collection<SSTableReader> sstables,
+                                                              ILifecycleTransaction txn)
     {
         FileUtils.createDirectory(compactionFileLocation);
         int minLevel = Integer.MAX_VALUE;
@@ -1065,12 +1097,13 @@
                 break;
             }
         }
-        return SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(compactionFileLocation)),
+        return SSTableWriter.create(Descriptor.fromFilename(cfs.getSSTablePath(compactionFileLocation)),
                                     (long) expectedBloomFilterSize,
                                     repairedAt,
                                     cfs.metadata,
-                                    cfs.partitioner,
-                                    new MetadataCollector(sstables, cfs.metadata.comparator, minLevel));
+                                    new MetadataCollector(sstables, cfs.metadata.comparator, minLevel),
+                                    SerializationHeader.make(cfs.metadata, sstables),
+                                    txn);
     }
 
 
@@ -1094,6 +1127,7 @@
         {
 
             int gcBefore;
+            int nowInSec = FBUtilities.nowInSeconds();
             UUID parentRepairSessionId = validator.desc.parentSessionId;
             String snapshotName;
             boolean isGlobalSnapshotValidation = cfs.snapshotExists(parentRepairSessionId.toString());
@@ -1115,7 +1149,7 @@
                 // this at a different time (that's the whole purpose of repair with snaphsot). So instead we take the creation
                 // time of the snapshot, which should give us roughtly the same time on each replica (roughtly being in that case
                 // 'as good as in the non-snapshot' case)
-                gcBefore = cfs.gcBefore(cfs.getSnapshotCreationTime(snapshotName));
+                gcBefore = cfs.gcBefore((int)(cfs.getSnapshotCreationTime(snapshotName) / 1000));
             }
             else
             {
@@ -1123,65 +1157,49 @@
                 StorageService.instance.forceKeyspaceFlush(cfs.keyspace.getName(), cfs.name);
                 sstables = getSSTablesToValidate(cfs, validator);
                 if (sstables == null)
-                    return; // this means the parent repair session was removed - the repair session failed on another node and we removed it
+                    return; // this means the parent repair session was removed - the repair session failed on another node and we removed i
                 if (validator.gcBefore > 0)
                     gcBefore = validator.gcBefore;
                 else
-                    gcBefore = getDefaultGcBefore(cfs);
+                    gcBefore = getDefaultGcBefore(cfs, nowInSec);
             }
 
-            // Create Merkle tree suitable to hold estimated partitions for given range.
-            // We blindly assume that partition is evenly distributed on all sstables for now.
-            long numPartitions = 0;
-            for (SSTableReader sstable : sstables)
-            {
-                numPartitions += sstable.estimatedKeysForRanges(singleton(validator.desc.range));
-            }
-            // determine tree depth from number of partitions, but cap at 20 to prevent large tree (CASSANDRA-5263)
-            int depth = numPartitions > 0 ? (int) Math.min(Math.ceil(Math.log(numPartitions) / Math.log(2)), 20) : 0;
-            MerkleTree tree = new MerkleTree(cfs.partitioner, validator.desc.range, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, depth));
-
+            // Create Merkle trees suitable to hold estimated partitions for the given ranges.
+            // We blindly assume that a partition is evenly distributed on all sstables for now.
+            MerkleTrees tree = createMerkleTrees(sstables, validator.desc.ranges, cfs);
             long start = System.nanoTime();
-            try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables, validator.desc.range))
+            try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables, validator.desc.ranges);
+                 ValidationCompactionController controller = new ValidationCompactionController(cfs, gcBefore);
+                 CompactionIterator ci = new ValidationCompactionIterator(scanners.scanners, controller, nowInSec, metrics))
             {
-                CompactionIterable ci = new ValidationCompactionIterable(cfs, scanners.scanners, gcBefore);
-                Iterator<AbstractCompactedRow> iter = ci.iterator();
-                metrics.beginCompaction(ci);
-                try
+                // validate the CF as we iterate over it
+                validator.prepare(cfs, tree);
+                while (ci.hasNext())
                 {
-                    // validate the CF as we iterate over it
-                    validator.prepare(cfs, tree);
-                    while (iter.hasNext())
+                    if (ci.isStopRequested())
+                        throw new CompactionInterruptedException(ci.getCompactionInfo());
+                    try (UnfilteredRowIterator partition = ci.next())
                     {
-                        if (ci.isStopRequested())
-                            throw new CompactionInterruptedException(ci.getCompactionInfo());
-                        AbstractCompactedRow row = iter.next();
-                        validator.add(row);
+                        validator.add(partition);
                     }
-                    validator.complete();
                 }
-                finally
+                validator.complete();
+            }
+            finally
+            {
+                if (isSnapshotValidation && !isGlobalSnapshotValidation)
                 {
                     // we can only clear the snapshot if we are not doing a global snapshot validation (we then clear it once anticompaction
                     // is done).
-                    if (isSnapshotValidation && !isGlobalSnapshotValidation)
-                    {
-                        cfs.clearSnapshot(snapshotName);
-                    }
-
-                    metrics.finishCompaction(ci);
+                    cfs.clearSnapshot(snapshotName);
                 }
             }
 
-            if (logger.isTraceEnabled())
+            if (logger.isDebugEnabled())
             {
-                // MT serialize may take time
                 long duration = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
-                logger.trace("Validation finished in {} msec, depth {} for {} keys, serialized size {} bytes for {}",
+                logger.debug("Validation finished in {} msec, for {}",
                              duration,
-                             depth,
-                             numPartitions,
-                             MerkleTree.serializer.serializedSize(tree, 0),
                              validator.desc);
             }
         }
@@ -1192,6 +1210,44 @@
         }
     }
 
+    private static MerkleTrees createMerkleTrees(Iterable<SSTableReader> sstables, Collection<Range<Token>> ranges, ColumnFamilyStore cfs)
+    {
+        MerkleTrees tree = new MerkleTrees(cfs.getPartitioner());
+        long allPartitions = 0;
+        Map<Range<Token>, Long> rangePartitionCounts = new HashMap<>();
+        for (Range<Token> range : ranges)
+        {
+            long numPartitions = 0;
+            for (SSTableReader sstable : sstables)
+                numPartitions += sstable.estimatedKeysForRanges(Collections.singleton(range));
+            rangePartitionCounts.put(range, numPartitions);
+            allPartitions += numPartitions;
+        }
+
+        for (Range<Token> range : ranges)
+        {
+            long numPartitions = rangePartitionCounts.get(range);
+            double rangeOwningRatio = allPartitions > 0 ? (double)numPartitions / allPartitions : 0;
+            // determine max tree depth proportional to range size to avoid blowing up memory with multiple tress,
+            // capping at a configurable depth (default 18) to prevent large tree (CASSANDRA-11390, CASSANDRA-14096)
+            int maxDepth = rangeOwningRatio > 0
+                           ? (int) Math.floor(Math.max(0.0, DatabaseDescriptor.getRepairSessionMaxTreeDepth() -
+                                                            Math.log(1 / rangeOwningRatio) / Math.log(2)))
+                           : 0;
+
+            // determine tree depth from number of partitions, capping at max tree depth (CASSANDRA-5263)
+            int depth = numPartitions > 0 ? (int) Math.min(Math.ceil(Math.log(numPartitions) / Math.log(2)), maxDepth) : 0;
+            tree.addMerkleTree((int) Math.pow(2, depth), range);
+        }
+        if (logger.isDebugEnabled())
+        {
+            // MT serialize may take time
+            logger.debug("Created {} merkle trees with merkle trees size {}, {} partitions, {} bytes", tree.ranges().size(), tree.size(), allPartitions, MerkleTrees.serializer.serializedSize(tree, 0));
+        }
+
+        return tree;
+    }
+
     private synchronized Refs<SSTableReader> getSSTablesToValidate(ColumnFamilyStore cfs, Validator validator)
     {
         Refs<SSTableReader> sstables;
@@ -1200,17 +1256,15 @@
         if (prs == null)
             return null;
         Set<SSTableReader> sstablesToValidate = new HashSet<>();
-
         if (prs.isGlobal)
             prs.markSSTablesRepairing(cfs.metadata.cfId, validator.desc.parentSessionId);
-
         // note that we always grab all existing sstables for this - if we were to just grab the ones that
         // were marked as repairing, we would miss any ranges that were compacted away and this would cause us to overstream
-        try (ColumnFamilyStore.RefViewFragment sstableCandidates = cfs.selectAndReference(prs.isIncremental ? ColumnFamilyStore.UNREPAIRED_SSTABLES : ColumnFamilyStore.CANONICAL_SSTABLES))
+        try (ColumnFamilyStore.RefViewFragment sstableCandidates = cfs.selectAndReference(View.select(SSTableSet.CANONICAL, (s) -> !prs.isIncremental || !s.isRepaired())))
         {
             for (SSTableReader sstable : sstableCandidates.sstables)
             {
-                if (new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(Collections.singletonList(validator.desc.range)))
+                if (new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(validator.desc.ranges))
                 {
                     sstablesToValidate.add(sstable);
                 }
@@ -1242,7 +1296,7 @@
         logger.info("Performing anticompaction on {} sstables", numAnticompact);
 
         //Group SSTables
-        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategy().groupSSTablesForAntiCompaction(repaired.originals());
+        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(repaired.originals());
         // iterate over sstables to check if the repaired / unrepaired ranges intersect them.
         int antiCompactedSSTableCount = 0;
         for (Collection<SSTableReader> sstableGroup : groupedSSTables)
@@ -1258,8 +1312,10 @@
         logger.info(format, numAnticompact, antiCompactedSSTableCount);
     }
 
-    private int antiCompactGroup(ColumnFamilyStore cfs, Collection<Range<Token>> ranges,
-                             LifecycleTransaction anticompactionGroup, long repairedAt)
+
+    @VisibleForTesting
+    int antiCompactGroup(ColumnFamilyStore cfs, Collection<Range<Token>> ranges,
+                         LifecycleTransaction anticompactionGroup, long repairedAt)
     {
         long groupMaxDataAge = -1;
 
@@ -1279,61 +1335,89 @@
         logger.info("Anticompacting {}", anticompactionGroup);
         Set<SSTableReader> sstableAsSet = anticompactionGroup.originals();
 
-        File destination = cfs.directories.getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(sstableAsSet, OperationType.ANTICOMPACTION));
+        File destination = cfs.getDirectories().getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(sstableAsSet, OperationType.ANTICOMPACTION));
         long repairedKeyCount = 0;
         long unrepairedKeyCount = 0;
-        AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
-        try (SSTableRewriter repairedSSTableWriter = new SSTableRewriter(cfs, anticompactionGroup, groupMaxDataAge, false, false);
-             SSTableRewriter unRepairedSSTableWriter = new SSTableRewriter(cfs, anticompactionGroup, groupMaxDataAge, false, false);
-             AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(anticompactionGroup.originals());
-             CompactionController controller = new CompactionController(cfs, sstableAsSet, getDefaultGcBefore(cfs)))
+        int nowInSec = FBUtilities.nowInSeconds();
+
+        /**
+         * HACK WARNING
+         *
+         * We have multiple writers operating over the same Transaction, producing different sets of sstables that all
+         * logically replace the transaction's originals.  The SSTableRewriter assumes it has exclusive control over
+         * the transaction state, and this will lead to temporarily inconsistent sstable/tracker state if we do not
+         * take special measures to avoid it.
+         *
+         * Specifically, if a number of rewriter have prepareToCommit() invoked in sequence, then two problematic things happen:
+         *   1. The obsoleteOriginals() call of the first rewriter immediately remove the originals from the tracker, despite
+         *      their having been only partially replaced.  To avoid this, we must either avoid obsoleteOriginals() or checkpoint()
+         *   2. The LifecycleTransaction may only have prepareToCommit() invoked once, and this will checkpoint() also.
+         *
+         * Similarly commit() would finalise partially complete on-disk state.
+         *
+         * To avoid these problems, we introduce a SharedTxn that proxies all calls onto the underlying transaction
+         * except prepareToCommit(), checkpoint(), obsoleteOriginals(), and commit().
+         * We then invoke these methods directly once each of the rewriter has updated the transaction
+         * with their share of replacements.
+         *
+         * Note that for the same essential reason we also explicitly disable early open.
+         * By noop-ing checkpoint we avoid any of the problems with early open, but by continuing to explicitly
+         * disable it we also prevent any of the extra associated work from being performed.
+         */
+        class SharedTxn extends WrappedLifecycleTransaction
         {
-            int expectedBloomFilterSize = Math.max(cfs.metadata.getMinIndexInterval(), (int)(SSTableReader.getApproximateKeyCount(sstableAsSet)));
+            public SharedTxn(ILifecycleTransaction delegate) { super(delegate); }
+            public Throwable commit(Throwable accumulate) { return accumulate; }
+            public void prepareToCommit() {}
+            public void checkpoint() {}
+            public void obsoleteOriginals() {}
+            public void close() {}
+        }
 
-            repairedSSTableWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, repairedAt, sstableAsSet));
-            unRepairedSSTableWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, ActiveRepairService.UNREPAIRED_SSTABLE, sstableAsSet));
+        CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
+        try (SharedTxn sharedTxn = new SharedTxn(anticompactionGroup);
+             SSTableRewriter repairedSSTableWriter = new SSTableRewriter(sharedTxn, groupMaxDataAge, false, false);
+             SSTableRewriter unRepairedSSTableWriter = new SSTableRewriter(sharedTxn, groupMaxDataAge, false, false);
+             AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(anticompactionGroup.originals());
+             CompactionController controller = new CompactionController(cfs, sstableAsSet, getDefaultGcBefore(cfs, nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.ANTICOMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID(), metrics))
+        {
+            int expectedBloomFilterSize = Math.max(cfs.metadata.params.minIndexInterval, (int)(SSTableReader.getApproximateKeyCount(sstableAsSet)));
 
-            CompactionIterable ci = new CompactionIterable(OperationType.ANTICOMPACTION, scanners.scanners, controller, DatabaseDescriptor.getSSTableFormat(), UUIDGen.getTimeUUID());
-            metrics.beginCompaction(ci);
-            try
+            repairedSSTableWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, repairedAt, sstableAsSet, sharedTxn));
+            unRepairedSSTableWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, ActiveRepairService.UNREPAIRED_SSTABLE, sstableAsSet, sharedTxn));
+            Range.OrderedRangeContainmentChecker containmentChecker = new Range.OrderedRangeContainmentChecker(ranges);
+            while (ci.hasNext())
             {
-                @SuppressWarnings("resource")
-                CloseableIterator<AbstractCompactedRow> iter = ci.iterator();
-                Range.OrderedRangeContainmentChecker containmentChecker = new Range.OrderedRangeContainmentChecker(ranges);
-                while (iter.hasNext())
+                try (UnfilteredRowIterator partition = ci.next())
                 {
-                    @SuppressWarnings("resource")
-                    AbstractCompactedRow row = iter.next();
                     // if current range from sstable is repaired, save it into the new repaired sstable
-                    if (containmentChecker.contains(row.key.getToken()))
+                    if (containmentChecker.contains(partition.partitionKey().getToken()))
                     {
-                        repairedSSTableWriter.append(row);
+                        repairedSSTableWriter.append(partition);
                         repairedKeyCount++;
                     }
                     // otherwise save into the new 'non-repaired' table
                     else
                     {
-                        unRepairedSSTableWriter.append(row);
+                        unRepairedSSTableWriter.append(partition);
                         unrepairedKeyCount++;
                     }
                 }
             }
-            finally
-            {
-                metrics.finishCompaction(ci);
-            }
 
             List<SSTableReader> anticompactedSSTables = new ArrayList<>();
-            // since both writers are operating over the same Transaction, we cannot use the convenience Transactional.finish() method,
-            // as on the second finish() we would prepareToCommit() on a Transaction that has already been committed, which is forbidden by the API
-            // (since it indicates misuse). We call permitRedundantTransitions so that calls that transition to a state already occupied are permitted.
-            anticompactionGroup.permitRedundantTransitions();
+
             repairedSSTableWriter.setRepairedAt(repairedAt).prepareToCommit();
             unRepairedSSTableWriter.prepareToCommit();
+            anticompactionGroup.checkpoint();
+            anticompactionGroup.obsoleteOriginals();
+            anticompactionGroup.prepareToCommit();
             anticompactedSSTables.addAll(repairedSSTableWriter.finished());
             anticompactedSSTables.addAll(unRepairedSSTableWriter.finished());
             repairedSSTableWriter.commit();
             unRepairedSSTableWriter.commit();
+            Throwables.maybeFail(anticompactionGroup.commit(null));
 
             logger.trace("Repaired {} keys out of {} for {}/{} in {}", repairedKeyCount,
                                                                        repairedKeyCount + unrepairedKeyCount,
@@ -1421,19 +1505,18 @@
         }
     }
 
-    public static int getDefaultGcBefore(ColumnFamilyStore cfs)
+    public static int getDefaultGcBefore(ColumnFamilyStore cfs, int nowInSec)
     {
         // 2ndary indexes have ExpiringColumns too, so we need to purge tombstones deleted before now. We do not need to
         // add any GcGrace however since 2ndary indexes are local to a node.
-        return cfs.isIndex() ? (int) (System.currentTimeMillis() / 1000) : cfs.gcBefore(System.currentTimeMillis());
+        return cfs.isIndex() ? nowInSec : cfs.gcBefore(nowInSec);
     }
 
-    private static class ValidationCompactionIterable extends CompactionIterable
+    private static class ValidationCompactionIterator extends CompactionIterator
     {
-        @SuppressWarnings("resource")
-        public ValidationCompactionIterable(ColumnFamilyStore cfs, List<ISSTableScanner> scanners, int gcBefore)
+        public ValidationCompactionIterator(List<ISSTableScanner> scanners, ValidationCompactionController controller, int nowInSec, CompactionMetrics metrics)
         {
-            super(OperationType.VALIDATION, scanners, new ValidationCompactionController(cfs, gcBefore), DatabaseDescriptor.getSSTableFormat(), UUIDGen.getTimeUUID());
+            super(OperationType.VALIDATION, scanners, controller, nowInSec, UUIDGen.getTimeUUID(), metrics);
         }
     }
 
@@ -1464,10 +1547,35 @@
              * a tombstone that could shadow a column in another sstable, but this is doubly not a concern
              * since validation compaction is read-only.
              */
-            return Predicates.alwaysTrue();
+            return time -> true;
         }
     }
 
+    public Future<?> submitViewBuilder(final ViewBuilder builder)
+    {
+        Runnable runnable = new Runnable()
+        {
+            public void run()
+            {
+                metrics.beginCompaction(builder);
+                try
+                {
+                    builder.run();
+                }
+                finally
+                {
+                    metrics.finishCompaction(builder);
+                }
+            }
+        };
+        if (executor.isShutdown())
+        {
+            logger.info("Compaction executor has shut down, not submitting index build");
+            return null;
+        }
+
+        return executor.submit(runnable);
+    }
     public int getActiveCompactions()
     {
         return CompactionMetrics.getCompactions().size();
@@ -1502,7 +1610,7 @@
         public void afterExecute(Runnable r, Throwable t)
         {
             DebuggableThreadPoolExecutor.maybeResetTraceSessionWrapper(r);
-
+    
             if (t == null)
                 t = DebuggableThreadPoolExecutor.extractThrowable(r);
 
@@ -1641,36 +1749,6 @@
         return metrics.completedTasks.getValue();
     }
 
-    private static class CleanupInfo extends CompactionInfo.Holder
-    {
-        private final SSTableReader sstable;
-        private final ISSTableScanner scanner;
-        private final UUID cleanupCompactionId;
-
-        public CleanupInfo(SSTableReader sstable, ISSTableScanner scanner)
-        {
-            this.sstable = sstable;
-            this.scanner = scanner;
-            cleanupCompactionId = UUIDGen.getTimeUUID();
-        }
-
-        public CompactionInfo getCompactionInfo()
-        {
-            try
-            {
-                return new CompactionInfo(sstable.metadata,
-                                          OperationType.CLEANUP,
-                                          scanner.getCurrentPosition(),
-                                          scanner.getLengthInBytes(),
-                                          cleanupCompactionId);
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException();
-            }
-        }
-    }
-
     public void stopCompaction(String type)
     {
         OperationType operation = OperationType.valueOf(type);
@@ -1779,4 +1857,26 @@
                 break;
         }
     }
+
+    /**
+     * Return whether "global" compactions should be paused, used by ColumnFamilyStore#runWithCompactionsDisabled
+     *
+     * a global compaction is one that includes several/all tables, currently only IndexSummaryBuilder
+     */
+    public boolean isGlobalCompactionPaused()
+    {
+        return globalCompactionPauseCount.get() > 0;
+    }
+
+    public CompactionPauser pauseGlobalCompaction()
+    {
+        CompactionPauser pauser = globalCompactionPauseCount::decrementAndGet;
+        globalCompactionPauseCount.incrementAndGet();
+        return pauser;
+    }
+
+    public interface CompactionPauser extends AutoCloseable
+    {
+        public void close();
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManagerMBean.java b/src/java/org/apache/cassandra/db/compaction/CompactionManagerMBean.java
index 8e200a1..d5da0fe 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManagerMBean.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManagerMBean.java

@@ -58,7 +58,8 @@
     /**
      * Stop an individual running compaction using the compactionId.
      * @param compactionId Compaction ID of compaction to stop. Such IDs can be found in
-     *                     the compactions_in_progress table of the system keyspace.
+     *                     the transaction log files whose name starts with compaction_,
+     *                     located in the table transactions folder.
      */
     public void stopCompactionById(String compactionId);
 

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
new file mode 100644
index 0000000..1d3d18c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java

@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.compaction;
+
+
+import java.util.*;
+import java.util.concurrent.Callable;
+
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.notifications.*;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.service.ActiveRepairService;
+
+/**
+ * Manages the compaction strategies.
+ *
+ * Currently has two instances of actual compaction strategies - one for repaired data and one for
+ * unrepaired data. This is done to be able to totally separate the different sets of sstables.
+ */
+public class CompactionStrategyManager implements INotificationConsumer
+{
+    private static final Logger logger = LoggerFactory.getLogger(CompactionStrategyManager.class);
+    private final ColumnFamilyStore cfs;
+    private volatile AbstractCompactionStrategy repaired;
+    private volatile AbstractCompactionStrategy unrepaired;
+    private volatile boolean enabled = true;
+    public boolean isActive = true;
+    private volatile CompactionParams params;
+    /*
+        We keep a copy of the schema compaction parameters here to be able to decide if we
+        should update the compaction strategy in maybeReloadCompactionStrategy() due to an ALTER.
+
+        If a user changes the local compaction strategy and then later ALTERs a compaction parameter,
+        we will use the new compaction parameters.
+     */
+    private CompactionParams schemaCompactionParams;
+
+    public CompactionStrategyManager(ColumnFamilyStore cfs)
+    {
+        cfs.getTracker().subscribe(this);
+        logger.trace("{} subscribed to the data tracker.", this);
+        this.cfs = cfs;
+        reload(cfs.metadata);
+        params = cfs.metadata.params.compaction;
+        enabled = params.isEnabled();
+    }
+
+    /**
+     * Return the next background task
+     *
+     * Returns a task for the compaction strategy that needs it the most (most estimated remaining tasks)
+     *
+     */
+    public synchronized AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+    {
+        if (!isEnabled())
+            return null;
+
+        maybeReload(cfs.metadata);
+
+        if (repaired.getEstimatedRemainingTasks() > unrepaired.getEstimatedRemainingTasks())
+        {
+            AbstractCompactionTask repairedTask = repaired.getNextBackgroundTask(gcBefore);
+            if (repairedTask != null)
+                return repairedTask;
+            return unrepaired.getNextBackgroundTask(gcBefore);
+        }
+        else
+        {
+            AbstractCompactionTask unrepairedTask = unrepaired.getNextBackgroundTask(gcBefore);
+            if (unrepairedTask != null)
+                return unrepairedTask;
+            return repaired.getNextBackgroundTask(gcBefore);
+        }
+    }
+
+    public boolean isEnabled()
+    {
+        return enabled && isActive;
+    }
+
+    public synchronized void resume()
+    {
+        isActive = true;
+    }
+
+    /**
+     * pause compaction while we cancel all ongoing compactions
+     *
+     * Separate call from enable/disable to not have to save the enabled-state externally
+      */
+    public synchronized void pause()
+    {
+        isActive = false;
+    }
+
+
+    private void startup()
+    {
+        for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
+        {
+            if (sstable.openReason != SSTableReader.OpenReason.EARLY)
+                getCompactionStrategyFor(sstable).addSSTable(sstable);
+        }
+        repaired.startup();
+        unrepaired.startup();
+    }
+
+    /**
+     * return the compaction strategy for the given sstable
+     *
+     * returns differently based on the repaired status
+     * @param sstable
+     * @return
+     */
+    private AbstractCompactionStrategy getCompactionStrategyFor(SSTableReader sstable)
+    {
+        if (sstable.isRepaired())
+            return repaired;
+        else
+            return unrepaired;
+    }
+
+    public void shutdown()
+    {
+        isActive = false;
+        repaired.shutdown();
+        unrepaired.shutdown();
+    }
+
+    public synchronized void maybeReload(CFMetaData metadata)
+    {
+        // compare the old schema configuration to the new one, ignore any locally set changes.
+        if (metadata.params.compaction.equals(schemaCompactionParams))
+            return;
+        reload(metadata);
+    }
+
+    /**
+     * Reload the compaction strategies
+     *
+     * Called after changing configuration and at startup.
+     * @param metadata
+     */
+    public synchronized void reload(CFMetaData metadata)
+    {
+        boolean disabledWithJMX = !enabled && shouldBeEnabled();
+        setStrategy(metadata.params.compaction);
+        schemaCompactionParams = metadata.params.compaction;
+
+        if (disabledWithJMX || !shouldBeEnabled())
+            disable();
+        else
+            enable();
+        startup();
+    }
+
+    public void replaceFlushed(Memtable memtable, Collection<SSTableReader> sstables)
+    {
+        cfs.getTracker().replaceFlushed(memtable, sstables);
+        if (sstables != null && !sstables.isEmpty())
+            CompactionManager.instance.submitBackground(cfs);
+    }
+
+    public int getUnleveledSSTables()
+    {
+        if (repaired instanceof LeveledCompactionStrategy && unrepaired instanceof LeveledCompactionStrategy)
+        {
+            int count = 0;
+            count += ((LeveledCompactionStrategy)repaired).getLevelSize(0);
+            count += ((LeveledCompactionStrategy)unrepaired).getLevelSize(0);
+            return count;
+        }
+        return 0;
+    }
+
+    public synchronized int[] getSSTableCountPerLevel()
+    {
+        if (repaired instanceof LeveledCompactionStrategy && unrepaired instanceof LeveledCompactionStrategy)
+        {
+            int [] res = new int[LeveledManifest.MAX_LEVEL_COUNT];
+            int[] repairedCountPerLevel = ((LeveledCompactionStrategy) repaired).getAllLevelSize();
+            res = sumArrays(res, repairedCountPerLevel);
+            int[] unrepairedCountPerLevel = ((LeveledCompactionStrategy) unrepaired).getAllLevelSize();
+            res = sumArrays(res, unrepairedCountPerLevel);
+            return res;
+        }
+        return null;
+    }
+
+    private static int[] sumArrays(int[] a, int[] b)
+    {
+        int[] res = new int[Math.max(a.length, b.length)];
+        for (int i = 0; i < res.length; i++)
+        {
+            if (i < a.length && i < b.length)
+                res[i] = a[i] + b[i];
+            else if (i < a.length)
+                res[i] = a[i];
+            else
+                res[i] = b[i];
+        }
+        return res;
+    }
+
+    public boolean shouldDefragment()
+    {
+        assert repaired.getClass().equals(unrepaired.getClass());
+        return repaired.shouldDefragment();
+    }
+
+    public Directories getDirectories()
+    {
+        assert repaired.getClass().equals(unrepaired.getClass());
+        return repaired.getDirectories();
+    }
+
+    public synchronized void handleNotification(INotification notification, Object sender)
+    {
+        if (notification instanceof SSTableAddedNotification)
+        {
+            SSTableAddedNotification flushedNotification = (SSTableAddedNotification) notification;
+            for (SSTableReader sstable : flushedNotification.added)
+            {
+                if (sstable.isRepaired())
+                    repaired.addSSTable(sstable);
+                else
+                    unrepaired.addSSTable(sstable);
+            }
+        }
+        else if (notification instanceof SSTableListChangedNotification)
+        {
+            SSTableListChangedNotification listChangedNotification = (SSTableListChangedNotification) notification;
+            Set<SSTableReader> repairedRemoved = new HashSet<>();
+            Set<SSTableReader> repairedAdded = new HashSet<>();
+            Set<SSTableReader> unrepairedRemoved = new HashSet<>();
+            Set<SSTableReader> unrepairedAdded = new HashSet<>();
+
+            for (SSTableReader sstable : listChangedNotification.removed)
+            {
+                if (sstable.isRepaired())
+                    repairedRemoved.add(sstable);
+                else
+                    unrepairedRemoved.add(sstable);
+            }
+            for (SSTableReader sstable : listChangedNotification.added)
+            {
+                if (sstable.isRepaired())
+                    repairedAdded.add(sstable);
+                else
+                    unrepairedAdded.add(sstable);
+            }
+            if (!repairedRemoved.isEmpty())
+            {
+                repaired.replaceSSTables(repairedRemoved, repairedAdded);
+            }
+            else
+            {
+                for (SSTableReader sstable : repairedAdded)
+                    repaired.addSSTable(sstable);
+            }
+
+            if (!unrepairedRemoved.isEmpty())
+            {
+                unrepaired.replaceSSTables(unrepairedRemoved, unrepairedAdded);
+            }
+            else
+            {
+                for (SSTableReader sstable : unrepairedAdded)
+                    unrepaired.addSSTable(sstable);
+            }
+        }
+        else if (notification instanceof SSTableRepairStatusChanged)
+        {
+            for (SSTableReader sstable : ((SSTableRepairStatusChanged) notification).sstable)
+            {
+                if (sstable.isRepaired())
+                {
+                    unrepaired.removeSSTable(sstable);
+                    repaired.addSSTable(sstable);
+                }
+                else
+                {
+                    repaired.removeSSTable(sstable);
+                    unrepaired.addSSTable(sstable);
+                }
+            }
+        }
+        else if (notification instanceof SSTableDeletingNotification)
+        {
+            SSTableReader sstable = ((SSTableDeletingNotification)notification).deleting;
+            if (sstable.isRepaired())
+                repaired.removeSSTable(sstable);
+            else
+                unrepaired.removeSSTable(sstable);
+        }
+    }
+
+    public void enable()
+    {
+        if (repaired != null)
+            repaired.enable();
+        if (unrepaired != null)
+            unrepaired.enable();
+        // enable this last to make sure the strategies are ready to get calls.
+        enabled = true;
+    }
+
+    public void disable()
+    {
+        // disable this first avoid asking disabled strategies for compaction tasks
+        enabled = false;
+        if (repaired != null)
+            repaired.disable();
+        if (unrepaired != null)
+            unrepaired.disable();
+    }
+
+    /**
+     * Create ISSTableScanner from the given sstables
+     *
+     * Delegates the call to the compaction strategies to allow LCS to create a scanner
+     * @param sstables
+     * @param ranges
+     * @return
+     */
+    @SuppressWarnings("resource")
+    public synchronized AbstractCompactionStrategy.ScannerList getScanners(Collection<SSTableReader> sstables,  Collection<Range<Token>> ranges)
+    {
+        List<SSTableReader> repairedSSTables = new ArrayList<>();
+        List<SSTableReader> unrepairedSSTables = new ArrayList<>();
+        for (SSTableReader sstable : sstables)
+        {
+            if (sstable.isRepaired())
+                repairedSSTables.add(sstable);
+            else
+                unrepairedSSTables.add(sstable);
+        }
+
+        Set<ISSTableScanner> scanners = new HashSet<>(sstables.size());
+        AbstractCompactionStrategy.ScannerList repairedScanners = repaired.getScanners(repairedSSTables, ranges);
+        AbstractCompactionStrategy.ScannerList unrepairedScanners = unrepaired.getScanners(unrepairedSSTables, ranges);
+        scanners.addAll(repairedScanners.scanners);
+        scanners.addAll(unrepairedScanners.scanners);
+        return new AbstractCompactionStrategy.ScannerList(new ArrayList<>(scanners));
+    }
+
+    public synchronized AbstractCompactionStrategy.ScannerList getScanners(Collection<SSTableReader> sstables)
+    {
+        return getScanners(sstables, null);
+    }
+
+    public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup)
+    {
+        return unrepaired.groupSSTablesForAntiCompaction(sstablesToGroup);
+    }
+
+    public long getMaxSSTableBytes()
+    {
+        return unrepaired.getMaxSSTableBytes();
+    }
+
+    public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes)
+    {
+        return getCompactionStrategyFor(txn.originals().iterator().next()).getCompactionTask(txn, gcBefore, maxSSTableBytes);
+    }
+
+    public Collection<AbstractCompactionTask> getMaximalTasks(final int gcBefore, final boolean splitOutput)
+    {
+        // runWithCompactionsDisabled cancels active compactions and disables them, then we are able
+        // to make the repaired/unrepaired strategies mark their own sstables as compacting. Once the
+        // sstables are marked the compactions are re-enabled
+        return cfs.runWithCompactionsDisabled(new Callable<Collection<AbstractCompactionTask>>()
+        {
+            @Override
+            public Collection<AbstractCompactionTask> call() throws Exception
+            {
+                synchronized (CompactionStrategyManager.this)
+                {
+                    Collection<AbstractCompactionTask> repairedTasks = repaired.getMaximalTask(gcBefore, splitOutput);
+                    Collection<AbstractCompactionTask> unrepairedTasks = unrepaired.getMaximalTask(gcBefore, splitOutput);
+
+                    if (repairedTasks == null && unrepairedTasks == null)
+                        return null;
+
+                    if (repairedTasks == null)
+                        return unrepairedTasks;
+                    if (unrepairedTasks == null)
+                        return repairedTasks;
+
+                    List<AbstractCompactionTask> tasks = new ArrayList<>();
+                    tasks.addAll(repairedTasks);
+                    tasks.addAll(unrepairedTasks);
+                    return tasks;
+                }
+            }
+        }, false, false);
+    }
+
+    public AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
+    {
+        return getCompactionStrategyFor(sstables.iterator().next()).getUserDefinedTask(sstables, gcBefore);
+    }
+
+    public int getEstimatedRemainingTasks()
+    {
+        int tasks = 0;
+        tasks += repaired.getEstimatedRemainingTasks();
+        tasks += unrepaired.getEstimatedRemainingTasks();
+
+        return tasks;
+    }
+
+    public boolean shouldBeEnabled()
+    {
+        return params.isEnabled();
+    }
+
+    public String getName()
+    {
+        return unrepaired.getName();
+    }
+
+    public List<AbstractCompactionStrategy> getStrategies()
+    {
+        return Arrays.asList(repaired, unrepaired);
+    }
+
+    public synchronized void setNewLocalCompactionStrategy(CompactionParams params)
+    {
+        logger.info("Switching local compaction strategy from {} to {}}", this.params, params);
+        setStrategy(params);
+        if (shouldBeEnabled())
+            enable();
+        else
+            disable();
+        startup();
+    }
+
+    private void setStrategy(CompactionParams params)
+    {
+        if (repaired != null)
+            repaired.shutdown();
+        if (unrepaired != null)
+            unrepaired.shutdown();
+        repaired = CFMetaData.createCompactionStrategyInstance(cfs, params);
+        unrepaired = CFMetaData.createCompactionStrategyInstance(cfs, params);
+        this.params = params;
+    }
+
+    public CompactionParams getCompactionParams()
+    {
+        return params;
+    }
+
+    public boolean onlyPurgeRepairedTombstones()
+    {
+        return Boolean.parseBoolean(params.options().get(AbstractCompactionStrategy.ONLY_PURGE_REPAIRED_TOMBSTONES));
+    }
+
+    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, MetadataCollector collector, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
+    {
+        if (repairedAt == ActiveRepairService.UNREPAIRED_SSTABLE)
+        {
+            return unrepaired.createSSTableMultiWriter(descriptor, keyCount, repairedAt, collector, header, lifecycleNewTracker);
+        }
+        else
+        {
+            return repaired.createSSTableMultiWriter(descriptor, keyCount, repairedAt, collector, header, lifecycleNewTracker);
+        }
+    }
+
+    public boolean supportsEarlyOpen()
+    {
+        return repaired.supportsEarlyOpen();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index 006c8ff..3437de7 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java

@@ -17,10 +17,8 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
@@ -30,9 +28,9 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
 import org.apache.cassandra.db.compaction.writers.DefaultCompactionWriter;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
@@ -44,23 +42,40 @@
 import org.apache.cassandra.db.compaction.CompactionManager.CompactionExecutorStatsCollector;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.CloseableIterator;
-import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.Refs;
 
 public class CompactionTask extends AbstractCompactionTask
 {
+    private static class Summary
+    {
+        final String partitionMerge;
+        final long totalSourceRows;
+
+        public Summary(String partitionMerge, long totalSourceRows)
+        {
+            this.partitionMerge = partitionMerge;
+            this.totalSourceRows = totalSourceRows;
+        }
+    }
     protected static final Logger logger = LoggerFactory.getLogger(CompactionTask.class);
     protected final int gcBefore;
-    private final boolean offline;
+    protected final boolean offline;
+    protected final boolean keepOriginals;
     protected static long totalBytesCompacted = 0;
     private CompactionExecutorStatsCollector collector;
 
-    public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean offline)
+    public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
+    {
+        this(cfs, txn, gcBefore, false, false);
+    }
+
+    public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean offline, boolean keepOriginals)
     {
         super(cfs, txn);
         this.gcBefore = gcBefore;
         this.offline = offline;
+        this.keepOriginals = keepOriginals;
     }
 
     public static synchronized long addToTotalBytesCompacted(long bytesCompacted)
@@ -108,7 +123,7 @@
 
         // Note that the current compaction strategy, is not necessarily the one this task was created under.
         // This should be harmless; see comments to CFS.maybeReloadCompactionStrategy.
-        AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
+        CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
 
         if (DatabaseDescriptor.isSnapshotBeforeCompaction())
             cfs.snapshotWithoutFlush(System.currentTimeMillis() + "-compact-" + cfs.name);
@@ -128,7 +143,7 @@
             }
         });
 
-        UUID taskId = offline ? null : SystemKeyspace.startCompaction(cfs, transaction.originals());
+        UUID taskId = transaction.opId();
 
         // new sstables from flush can be added during a compaction, but only the compaction can remove them,
         // so in our single-threaded compaction world this is a valid way of determining if we're compacting
@@ -139,74 +154,66 @@
             ssTableLoggerMsg.append(String.format("%s:level=%d, ", sstr.getFilename(), sstr.getSSTableLevel()));
         }
         ssTableLoggerMsg.append("]");
-        String taskIdLoggerMsg = taskId == null ? UUIDGen.getTimeUUID().toString() : taskId.toString();
-        logger.debug("Compacting ({}) {}", taskIdLoggerMsg, ssTableLoggerMsg);
+
+        logger.debug("Compacting ({}) {}", taskId, ssTableLoggerMsg);
 
         long start = System.nanoTime();
-
         long totalKeysWritten = 0;
-
         long estimatedKeys = 0;
         try (CompactionController controller = getCompactionController(transaction.originals()))
         {
             Set<SSTableReader> actuallyCompact = Sets.difference(transaction.originals(), controller.getFullyExpiredSSTables());
 
-            SSTableFormat.Type sstableFormat = getFormatType(transaction.originals());
+            Collection<SSTableReader> newSStables;
 
-            List<SSTableReader> newSStables;
-            AbstractCompactionIterable ci;
+            long[] mergedRowCounts;
 
             // SSTableScanners need to be closed before markCompactedSSTablesReplaced call as scanners contain references
             // to both ifile and dfile and SSTR will throw deletion errors on Windows if it tries to delete before scanner is closed.
             // See CASSANDRA-8019 and CASSANDRA-8399
+            int nowInSec = FBUtilities.nowInSeconds();
             try (Refs<SSTableReader> refs = Refs.ref(actuallyCompact);
-                 AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(actuallyCompact))
+                 AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(actuallyCompact);
+                 CompactionIterator ci = new CompactionIterator(compactionType, scanners.scanners, controller, nowInSec, taskId))
             {
-                ci = new CompactionIterable(compactionType, scanners.scanners, controller, sstableFormat, taskId);
-                try (CloseableIterator<AbstractCompactedRow> iter = ci.iterator())
+                long lastCheckObsoletion = start;
+
+                if (collector != null)
+                    collector.beginCompaction(ci);
+
+                try (CompactionAwareWriter writer = getCompactionAwareWriter(cfs, getDirectories(), transaction, actuallyCompact))
                 {
-                    long lastCheckObsoletion = start;
-
-                    if (!controller.cfs.getCompactionStrategy().isActive)
+                    // Note that we need to re-check this flag after calling beginCompaction above to avoid a window
+                    // where the compaction does not exist in activeCompactions but the CSM gets paused.
+                    // We already have the sstables marked compacting here so CompactionManager#waitForCessation will
+                    // block until the below exception is thrown and the transaction is cancelled.
+                    if (!controller.cfs.getCompactionStrategyManager().isActive)
                         throw new CompactionInterruptedException(ci.getCompactionInfo());
-
-                    if (collector != null)
-                        collector.beginCompaction(ci);
-
-                    try (CompactionAwareWriter writer = getCompactionAwareWriter(cfs, transaction, actuallyCompact))
+                    estimatedKeys = writer.estimatedKeys();
+                    while (ci.hasNext())
                     {
-                        estimatedKeys = writer.estimatedKeys();
-                        while (iter.hasNext())
+                        if (ci.isStopRequested())
+                            throw new CompactionInterruptedException(ci.getCompactionInfo());
+
+                        if (writer.append(ci.next()))
+                            totalKeysWritten++;
+
+                        if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
                         {
-                            if (ci.isStopRequested())
-                                throw new CompactionInterruptedException(ci.getCompactionInfo());
-
-                            try (AbstractCompactedRow row = iter.next())
-                            {
-                                if (writer.append(row))
-                                    totalKeysWritten++;
-
-                                if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
-                                {
-                                    controller.maybeRefreshOverlaps();
-                                    lastCheckObsoletion = System.nanoTime();
-                                }
-                            }
+                            controller.maybeRefreshOverlaps();
+                            lastCheckObsoletion = System.nanoTime();
                         }
-
-                        // don't replace old sstables yet, as we need to mark the compaction finished in the system table
-                        newSStables = writer.finish();
                     }
-                    finally
-                    {
-                        // point of no return -- the new sstables are live on disk; next we'll start deleting the old ones
-                        // (in replaceCompactedSSTables)
-                        if (taskId != null)
-                            SystemKeyspace.finishCompaction(taskId);
 
-                        if (collector != null)
-                            collector.finishCompaction(ci);
-                    }
+                    // point of no return
+                    newSStables = writer.finish();
+                }
+                finally
+                {
+                    if (collector != null)
+                        collector.finishCompaction(ci);
+
+                    mergedRowCounts = ci.getMergedRowCounts();
                 }
             }
 
@@ -227,40 +234,47 @@
             else
             {
                 double mbps = dTime > 0 ? (double) endsize / (1024 * 1024) / ((double) dTime / 1000) : 0;
-                long totalSourceRows = 0;
-                String mergeSummary = updateCompactionHistory(cfs.keyspace.getName(), cfs.getColumnFamilyName(), ci, startsize, endsize);
+                Summary mergeSummary = updateCompactionHistory(cfs.keyspace.getName(), cfs.getColumnFamilyName(), mergedRowCounts, startsize, endsize);
                 logger.debug(String.format("Compacted (%s) %d sstables to [%s] to level=%d.  %,d bytes to %,d (~%d%% of original) in %,dms = %fMB/s.  %,d total partitions merged to %,d.  Partition merge counts were {%s}",
-                                           taskIdLoggerMsg, transaction.originals().size(), newSSTableNames.toString(), getLevel(), startsize, endsize, (int) (ratio * 100), dTime, mbps, totalSourceRows, totalKeysWritten, mergeSummary));
+                                           taskId, transaction.originals().size(), newSSTableNames.toString(), getLevel(), startsize, endsize, (int) (ratio * 100), dTime, mbps, mergeSummary.totalSourceRows, totalKeysWritten, mergeSummary.partitionMerge));
                 logger.trace(String.format("CF Total Bytes Compacted: %,d", CompactionTask.addToTotalBytesCompacted(endsize)));
-                logger.trace("Actual #keys: {}, Estimated #keys:{}, Err%: {}", totalKeysWritten, estimatedKeys, ((double) (totalKeysWritten - estimatedKeys) / totalKeysWritten));
+                logger.trace("Actual #keys: {}, Estimated #keys:{}, Err%: {}", totalKeysWritten, estimatedKeys, ((double)(totalKeysWritten - estimatedKeys)/totalKeysWritten));
             }
         }
     }
 
     @Override
-    public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, LifecycleTransaction transaction, Set<SSTableReader> nonExpiredSSTables)
+    public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
+                                                          Directories directories,
+                                                          LifecycleTransaction transaction,
+                                                          Set<SSTableReader> nonExpiredSSTables)
     {
-        return new DefaultCompactionWriter(cfs, transaction, nonExpiredSSTables, offline, compactionType);
-
+        return new DefaultCompactionWriter(cfs, directories, transaction, nonExpiredSSTables, offline, keepOriginals);
     }
 
-    public static String updateCompactionHistory(String keyspaceName, String columnFamilyName, AbstractCompactionIterable ci, long startSize, long endSize)
+    public static Summary updateCompactionHistory(String keyspaceName, String columnFamilyName, long[] mergedRowCounts, long startSize, long endSize)
     {
-        long[] counts = ci.getMergedRowCounts();
-        StringBuilder mergeSummary = new StringBuilder(counts.length * 10);
+        StringBuilder mergeSummary = new StringBuilder(mergedRowCounts.length * 10);
         Map<Integer, Long> mergedRows = new HashMap<>();
-        for (int i = 0; i < counts.length; i++)
+        long totalSourceRows = 0;
+        for (int i = 0; i < mergedRowCounts.length; i++)
         {
-            long count = counts[i];
+            long count = mergedRowCounts[i];
             if (count == 0)
                 continue;
 
             int rows = i + 1;
+            totalSourceRows += rows * count;
             mergeSummary.append(String.format("%d:%d, ", rows, count));
             mergedRows.put(rows, count);
         }
         SystemKeyspace.updateCompactionHistory(keyspaceName, columnFamilyName, System.currentTimeMillis(), startSize, endSize, mergedRows);
-        return mergeSummary.toString();
+        return new Summary(mergeSummary.toString(), totalSourceRows);
+    }
+
+    protected Directories getDirectories()
+    {
+        return cfs.getDirectories();
     }
 
     public static long getMinRepairedAt(Set<SSTableReader> actuallyCompact)
@@ -280,14 +294,20 @@
      */
     protected void checkAvailableDiskSpace()
     {
-        AbstractCompactionStrategy strategy = cfs.getCompactionStrategy();
+        if(!cfs.isCompactionDiskSpaceCheckEnabled() && compactionType == OperationType.COMPACTION)
+        {
+            logger.info("Compaction space check is disabled");
+            return;
+        }
+
+        CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
 
         while(true)
         {
             long expectedWriteSize = cfs.getExpectedCompactedFileSize(transaction.originals(), compactionType);
             long estimatedSSTables = Math.max(1, expectedWriteSize / strategy.getMaxSSTableBytes());
 
-            if(cfs.directories.hasAvailableDiskSpace(estimatedSSTables, expectedWriteSize))
+            if(cfs.getDirectories().hasAvailableDiskSpace(estimatedSSTables, expectedWriteSize))
                 break;
 
             if (!reduceScopeForLimitedSpace(expectedWriteSize))
@@ -328,13 +348,4 @@
         }
         return max;
     }
-
-    public static SSTableFormat.Type getFormatType(Collection<SSTableReader> sstables)
-    {
-        if (sstables.isEmpty() || !SSTableFormat.enableSSTableDevelopmentTestMode)
-            return DatabaseDescriptor.getSSTableFormat();
-
-        //Allows us to test compaction of non-default formats
-        return sstables.iterator().next().descriptor.formatType;
-    }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
index 8c59e1a..7c38fa8 100644
--- a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java

@@ -22,16 +22,23 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cql3.statements.CFPropDefs;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.utils.Pair;
 
+import static com.google.common.collect.Iterables.filter;
+
+/**
+ * @deprecated in favour of {@link TimeWindowCompactionStrategy}
+ */
+@Deprecated
 public class DateTieredCompactionStrategy extends AbstractCompactionStrategy
 {
     private static final Logger logger = LoggerFactory.getLogger(DateTieredCompactionStrategy.class);
@@ -71,7 +78,7 @@
 
             LifecycleTransaction modifier = cfs.getTracker().tryModify(latestBucket, OperationType.COMPACTION);
             if (modifier != null)
-                return new CompactionTask(cfs, modifier, gcBefore, false);
+                return new CompactionTask(cfs, modifier, gcBefore);
         }
     }
 
@@ -80,19 +87,19 @@
      * @param gcBefore
      * @return
      */
-    private List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
+    private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
     {
-        if (cfs.getSSTables().isEmpty())
+        if (Iterables.isEmpty(cfs.getSSTables(SSTableSet.LIVE)))
             return Collections.emptyList();
 
-        Set<SSTableReader> uncompacting = Sets.intersection(sstables, cfs.getUncompactingSSTables());
+        Set<SSTableReader> uncompacting = ImmutableSet.copyOf(filter(cfs.getUncompactingSSTables(), sstables::contains));
 
         Set<SSTableReader> expired = Collections.emptySet();
         // we only check for expired sstables every 10 minutes (by default) due to it being an expensive operation
         if (System.currentTimeMillis() - lastExpiredCheck > options.expiredSSTableCheckFrequency)
         {
             // Find fully expired SSTables. Those will be included no matter what.
-            expired = CompactionController.getFullyExpiredSSTables(cfs, uncompacting, cfs.getOverlappingSSTables(uncompacting), gcBefore);
+            expired = CompactionController.getFullyExpiredSSTables(cfs, uncompacting, cfs.getOverlappingLiveSSTables(uncompacting), gcBefore);
             lastExpiredCheck = System.currentTimeMillis();
         }
         Set<SSTableReader> candidates = Sets.newHashSet(filterSuspectSSTables(uncompacting));
@@ -156,13 +163,11 @@
      */
     private long getNow()
     {
-        return Collections.max(cfs.getSSTables(), new Comparator<SSTableReader>()
-        {
-            public int compare(SSTableReader o1, SSTableReader o2)
-            {
-                return Long.compare(o1.getMaxTimestamp(), o2.getMaxTimestamp());
-            }
-        }).getMaxTimestamp();
+        // no need to convert to collection if had an Iterables.max(), but not present in standard toolkit, and not worth adding
+        List<SSTableReader> list = new ArrayList<>();
+        Iterables.addAll(list, cfs.getSSTables(SSTableSet.LIVE));
+        return Collections.max(list, (o1, o2) -> Long.compare(o1.getMaxTimestamp(), o2.getMaxTimestamp()))
+                          .getMaxTimestamp();
     }
 
     /**
@@ -178,7 +183,7 @@
         if (maxSSTableAge == 0)
             return sstables;
         final long cutoff = now - maxSSTableAge;
-        return Iterables.filter(sstables, new Predicate<SSTableReader>()
+        return filter(sstables, new Predicate<SSTableReader>()
         {
             @Override
             public boolean apply(SSTableReader sstable)
@@ -188,11 +193,6 @@
         });
     }
 
-    /**
-     *
-     * @param sstables
-     * @return
-     */
     public static List<Pair<SSTableReader, Long>> createSSTableAndMinTimestampPairs(Iterable<SSTableReader> sstables)
     {
         List<Pair<SSTableReader, Long>> sstableMinTimestampPairs = Lists.newArrayListWithCapacity(Iterables.size(sstables));
@@ -200,14 +200,15 @@
             sstableMinTimestampPairs.add(Pair.create(sstable, sstable.getMinTimestamp()));
         return sstableMinTimestampPairs;
     }
+
     @Override
-    public void addSSTable(SSTableReader sstable)
+    public synchronized void addSSTable(SSTableReader sstable)
     {
         sstables.add(sstable);
     }
 
     @Override
-    public void removeSSTable(SSTableReader sstable)
+    public synchronized void removeSSTable(SSTableReader sstable)
     {
         sstables.remove(sstable);
     }
@@ -395,7 +396,7 @@
         LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
         if (txn == null)
             return null;
-        return Collections.<AbstractCompactionTask>singleton(new CompactionTask(cfs, txn, gcBefore, false));
+        return Collections.<AbstractCompactionTask>singleton(new CompactionTask(cfs, txn, gcBefore));
     }
 
     @Override
@@ -411,7 +412,7 @@
             return null;
         }
 
-        return new CompactionTask(cfs, modifier, gcBefore, false).setUserDefined(true);
+        return new CompactionTask(cfs, modifier, gcBefore).setUserDefined(true);
     }
 
     public int getEstimatedRemainingTasks()
@@ -443,8 +444,8 @@
         Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
         uncheckedOptions = DateTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
 
-        uncheckedOptions.remove(CFPropDefs.KW_MINCOMPACTIONTHRESHOLD);
-        uncheckedOptions.remove(CFPropDefs.KW_MAXCOMPACTIONTHRESHOLD);
+        uncheckedOptions.remove(CompactionParams.Option.MIN_THRESHOLD.toString());
+        uncheckedOptions.remove(CompactionParams.Option.MAX_THRESHOLD.toString());
 
         uncheckedOptions = SizeTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
 

diff --git a/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java b/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java
deleted file mode 100644
index eaceead..0000000
--- a/src/java/org/apache/cassandra/db/compaction/LazilyCompactedRow.java
+++ /dev/null

@@ -1,359 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
-import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
-import org.apache.cassandra.io.sstable.ColumnNameHelper;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.SequentialWriter;
-import org.apache.cassandra.utils.MergeIterator;
-import org.apache.cassandra.utils.StreamingHistogram;
-import org.apache.cassandra.utils.Throwables;
-
-/**
- * LazilyCompactedRow only computes the row bloom filter and column index in memory
- * (at construction time); it does this by reading one column at a time from each
- * of the rows being compacted, and merging them as it does so.  So the most we have
- * in memory at a time is the bloom filter, the index, and one column from each
- * pre-compaction row.
- */
-public class LazilyCompactedRow extends AbstractCompactedRow
-{
-    protected final List<? extends OnDiskAtomIterator> rows;
-    protected final CompactionController controller;
-    protected Predicate<Long> purgeEvaluator;
-    protected final ColumnFamily emptyColumnFamily;
-    protected ColumnStats columnStats;
-    protected boolean closed;
-    protected ColumnIndex.Builder indexBuilder;
-    protected final SecondaryIndexManager.Updater indexer;
-    protected final Reducer reducer;
-    protected final Iterator<OnDiskAtom> merger;
-    protected DeletionTime maxRowTombstone;
-
-    public LazilyCompactedRow(CompactionController controller, List<? extends OnDiskAtomIterator> rows)
-    {
-        super(rows.get(0).getKey());
-        this.rows = rows;
-        this.controller = controller;
-        indexer = controller.cfs.indexManager.gcUpdaterFor(key);
-
-        // Combine top-level tombstones, keeping the one with the highest markedForDeleteAt timestamp.  This may be
-        // purged (depending on gcBefore), but we need to remember it to properly delete columns during the merge
-        maxRowTombstone = DeletionTime.LIVE;
-        for (OnDiskAtomIterator row : rows)
-        {
-            DeletionTime rowTombstone = row.getColumnFamily().deletionInfo().getTopLevelDeletion();
-            if (maxRowTombstone.compareTo(rowTombstone) < 0)
-                maxRowTombstone = rowTombstone;
-        }
-
-        emptyColumnFamily = ArrayBackedSortedColumns.factory.create(controller.cfs.metadata);
-        emptyColumnFamily.delete(maxRowTombstone);
-        if (!maxRowTombstone.isLive() && getPurgeEvaluator().apply(maxRowTombstone.markedForDeleteAt))
-            emptyColumnFamily.purgeTombstones(controller.gcBefore);
-
-        reducer = new Reducer();
-        merger = Iterators.filter(MergeIterator.get(rows, emptyColumnFamily.getComparator().onDiskAtomComparator(), reducer), Predicates.notNull());
-    }
-
-    private Predicate<Long> getPurgeEvaluator()
-    {
-        if (purgeEvaluator == null)
-        {
-            purgeEvaluator = controller.getPurgeEvaluator(key);
-        }
-
-        return purgeEvaluator;
-    }
-
-    private static void removeDeleted(ColumnFamily cf, boolean shouldPurge, DecoratedKey key, CompactionController controller)
-    {
-        // We should only purge cell tombstones if shouldPurge is true, but regardless, it's still ok to remove cells that
-        // are shadowed by a row or range tombstone; removeDeletedColumnsOnly(cf, Integer.MIN_VALUE) will accomplish this
-        // without purging tombstones.
-        int overriddenGCBefore = shouldPurge ? controller.gcBefore : Integer.MIN_VALUE;
-        ColumnFamilyStore.removeDeletedColumnsOnly(cf, overriddenGCBefore, controller.cfs.indexManager.gcUpdaterFor(key));
-    }
-
-    public RowIndexEntry write(long currentPosition, SequentialWriter dataFile) throws IOException
-    {
-        assert !closed;
-
-        DataOutputPlus out = dataFile.stream;
-
-        ColumnIndex columnsIndex;
-        try
-        {
-            indexBuilder = new ColumnIndex.Builder(emptyColumnFamily, key.getKey(), out);
-            columnsIndex = indexBuilder.buildForCompaction(merger);
-
-            // if there aren't any columns or tombstones, return null
-            if (columnsIndex.columnsIndex.isEmpty() && !emptyColumnFamily.isMarkedForDelete())
-                return null;
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-        // reach into the reducer (created during iteration) to get column count, size, max column timestamp
-        columnStats = new ColumnStats(reducer.columns,
-                                      reducer.minTimestampTracker.get(),
-                                      Math.max(emptyColumnFamily.deletionInfo().maxTimestamp(), reducer.maxTimestampTracker.get()),
-                                      reducer.maxDeletionTimeTracker.get(),
-                                      reducer.tombstones,
-                                      reducer.minColumnNameSeen,
-                                      reducer.maxColumnNameSeen,
-                                      reducer.hasLegacyCounterShards);
-
-        // in case no columns were ever written, we may still need to write an empty header with a top-level tombstone
-        indexBuilder.maybeWriteEmptyRowHeader();
-
-        out.writeShort(BigTableWriter.END_OF_ROW);
-
-        close();
-
-        return RowIndexEntry.create(currentPosition, emptyColumnFamily.deletionInfo().getTopLevelDeletion(), columnsIndex);
-    }
-
-    public void update(final MessageDigest digest)
-    {
-        assert !closed;
-
-        // no special-case for rows.size == 1, we're actually skipping some bytes here so just
-        // blindly updating everything wouldn't be correct
-        try (DataOutputBuffer out = new DataOutputBuffer())
-        {
-            OnDiskAtom.SerializerForWriting serializer = new OnDiskAtom.SerializerForWriting()
-            {
-                @Override
-                public void serializeForSSTable(OnDiskAtom atom, DataOutputPlus out) throws IOException
-                {
-                    atom.updateDigest(digest);
-                }
-
-                @Override
-                public long serializedSizeForSSTable(OnDiskAtom atom)
-                {
-                    return 0;
-                }
-            };
-
-            // initialize indexBuilder for the benefit of its tombstoneTracker, used by our reducing iterator
-            indexBuilder = new ColumnIndex.Builder(emptyColumnFamily, key.getKey(), out, serializer);
-
-            DeletionTime.serializer.serialize(emptyColumnFamily.deletionInfo().getTopLevelDeletion(), out);
-
-            // do not update digest in case of missing or purged row level tombstones, see CASSANDRA-8979
-            // - digest for non-empty rows needs to be updated with deletion in any case to match digest with versions before patch
-            // - empty rows must not update digest in case of LIVE delete status to avoid mismatches with non-existing rows
-            //   this will however introduce in return a digest mismatch for versions before patch (which would update digest in any case)
-            if (merger.hasNext() || emptyColumnFamily.deletionInfo().getTopLevelDeletion() != DeletionTime.LIVE)
-            {
-                digest.update(out.getData(), 0, out.getLength());
-            }
-            indexBuilder.buildForCompaction(merger);
-        }
-        catch (IOException e)
-        {
-            throw new AssertionError(e);
-        }
-
-        close();
-    }
-
-    public ColumnStats columnStats()
-    {
-        return columnStats;
-    }
-
-    public void close()
-    {
-        Throwable accumulate = null;
-        for (OnDiskAtomIterator row : rows)
-        {
-            try
-            {
-                row.close();
-            }
-            catch (IOException e)
-            {
-                accumulate = Throwables.merge(accumulate, e);
-            }
-        }
-        closed = true;
-        Throwables.maybeFail(accumulate);
-    }
-
-    protected class Reducer extends MergeIterator.Reducer<OnDiskAtom, OnDiskAtom>
-    {
-        // all columns reduced together will have the same name, so there will only be one column
-        // in the container; we just want to leverage the conflict resolution code from CF.
-        // (Note that we add the row tombstone in getReduced.)
-        ColumnFamily container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());
-
-        // tombstone reference; will be reconciled w/ column during getReduced.  Note that the top-level (row) tombstone
-        // is held by LCR.deletionInfo.
-        public RangeTombstone tombstone;
-
-        public int columns = 0;
-        // if the row tombstone is 'live' we need to set timestamp to MAX_VALUE to be able to overwrite it later
-        // markedForDeleteAt is MIN_VALUE for 'live' row tombstones (which we use to default maxTimestampSeen)
-
-        ColumnStats.MinLongTracker minTimestampTracker = new ColumnStats.MinLongTracker(Long.MIN_VALUE);
-        ColumnStats.MaxLongTracker maxTimestampTracker = new ColumnStats.MaxLongTracker(Long.MAX_VALUE);
-        // we need to set MIN_VALUE if we are 'live' since we want to overwrite it later
-        // we are bound to have either a RangeTombstone or standard cells will set this properly:
-        ColumnStats.MaxIntTracker maxDeletionTimeTracker = new ColumnStats.MaxIntTracker(Integer.MAX_VALUE);
-
-        public StreamingHistogram tombstones = new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE);
-        public List<ByteBuffer> minColumnNameSeen = Collections.emptyList();
-        public List<ByteBuffer> maxColumnNameSeen = Collections.emptyList();
-        public boolean hasLegacyCounterShards = false;
-
-        public Reducer()
-        {
-            minTimestampTracker.update(maxRowTombstone.isLive() ? Long.MAX_VALUE : maxRowTombstone.markedForDeleteAt);
-            maxTimestampTracker.update(maxRowTombstone.markedForDeleteAt);
-            maxDeletionTimeTracker.update(maxRowTombstone.isLive() ? Integer.MIN_VALUE : maxRowTombstone.localDeletionTime);
-            if (!maxRowTombstone.isLive())
-                tombstones.update(maxRowTombstone.localDeletionTime);
-        }
-
-        /**
-         * Called once per version of a cell that we need to merge, after which getReduced() is called.  In other words,
-         * this will be called one or more times with cells that share the same column name.
-         */
-        public void reduce(OnDiskAtom current)
-        {
-            if (current instanceof RangeTombstone)
-            {
-                if (tombstone == null || current.timestamp() >= tombstone.timestamp())
-                    tombstone = (RangeTombstone)current;
-            }
-            else
-            {
-                Cell cell = (Cell) current;
-                container.addColumn(cell);
-
-                // skip the index-update checks if there is no indexing needed since they are a bit expensive
-                if (indexer == SecondaryIndexManager.nullUpdater)
-                    return;
-
-                if (cell.isLive() && !container.getColumn(cell.name()).equals(cell))
-                    indexer.remove(cell);
-            }
-        }
-
-        /**
-         * Called after reduce() has been called for each cell sharing the same name.
-         */
-        protected OnDiskAtom getReduced()
-        {
-            if (tombstone != null)
-            {
-                RangeTombstone t = tombstone;
-                tombstone = null;
-
-                if (t.data.isGcAble(controller.gcBefore) && getPurgeEvaluator().apply(t.timestamp()) ||
-                    maxRowTombstone.markedForDeleteAt >= t.timestamp())
-                {
-                    indexBuilder.tombstoneTracker().update(t, true);
-                    return null;
-                }
-                else
-                {
-                    tombstones.update(t.getLocalDeletionTime());
-                    minTimestampTracker.update(t.timestamp());
-                    maxTimestampTracker.update(t.timestamp());
-                    maxDeletionTimeTracker.update(t.getLocalDeletionTime());
-                    minColumnNameSeen = ColumnNameHelper.minComponents(minColumnNameSeen, t.min, controller.cfs.metadata.comparator);
-                    maxColumnNameSeen = ColumnNameHelper.maxComponents(maxColumnNameSeen, t.max, controller.cfs.metadata.comparator);
-                    return t;
-                }
-            }
-            else
-            {
-                // when we clear() the container, it removes the deletion info, so this needs to be reset each time
-                container.delete(maxRowTombstone);
-                Iterator<Cell> iter = container.iterator();
-                Cell c = iter.next();
-                boolean shouldPurge = c.getLocalDeletionTime() < Integer.MAX_VALUE && getPurgeEvaluator().apply(c.timestamp());
-                removeDeleted(container, shouldPurge, key, controller);
-                iter = container.iterator();
-                if (!iter.hasNext())
-                {
-                    // don't call clear() because that resets the deletion time. See CASSANDRA-7808.
-                    container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());
-                    return null;
-                }
-
-                int localDeletionTime = container.deletionInfo().getTopLevelDeletion().localDeletionTime;
-                if (localDeletionTime < Integer.MAX_VALUE)
-                    tombstones.update(localDeletionTime);
-
-                Cell reduced = iter.next();
-                container = ArrayBackedSortedColumns.factory.create(emptyColumnFamily.metadata());
-
-                // removeDeleted have only checked the top-level CF deletion times,
-                // not the range tombstone. For that we use the columnIndexer tombstone tracker.
-                if (indexBuilder.tombstoneTracker().isDeleted(reduced))
-                {
-                    // We skip that column so it won't be passed to the tracker by the index builded. So pass it now to
-                    // make sure we still discard potentially un-needed RT as soon as possible.
-                    indexBuilder.tombstoneTracker().update(reduced, false);
-                    indexer.remove(reduced);
-                    return null;
-                }
-
-                columns++;
-                minTimestampTracker.update(reduced.timestamp());
-                maxTimestampTracker.update(reduced.timestamp());
-                maxDeletionTimeTracker.update(reduced.getLocalDeletionTime());
-                minColumnNameSeen = ColumnNameHelper.minComponents(minColumnNameSeen, reduced.name(), controller.cfs.metadata.comparator);
-                maxColumnNameSeen = ColumnNameHelper.maxComponents(maxColumnNameSeen, reduced.name(), controller.cfs.metadata.comparator);
-
-                int deletionTime = reduced.getLocalDeletionTime();
-                if (deletionTime < Integer.MAX_VALUE)
-                    tombstones.update(deletionTime);
-
-                if (reduced instanceof CounterCell)
-                    hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) reduced).hasLegacyShards();
-
-                return reduced;
-            }
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
index 6daea36..65fca27 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.io.IOException;
 import java.util.*;
 
 
@@ -26,17 +25,19 @@
 import com.google.common.collect.*;
 import com.google.common.primitives.Doubles;
 
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.FBUtilities;
 
 public class LeveledCompactionStrategy extends AbstractCompactionStrategy
 {
@@ -84,6 +85,13 @@
         return manifest.getAllLevelSize();
     }
 
+    @Override
+    public void startup()
+    {
+        manifest.calculateLastCompactedKeys();
+        super.startup();
+    }
+
     /**
      * the only difference between background and maximal in LCS is that maximal is still allowed
      * (by explicit user request) even when compaction is disabled.
@@ -215,8 +223,7 @@
         return maxSSTableSizeInMB * 1024L * 1024L;
     }
 
-    @SuppressWarnings("resource")
-    public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
+    public ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
     {
         Set<SSTableReader>[] sstablesPerLevel = manifest.getSStablesPerLevelSnapshot();
 
@@ -248,14 +255,18 @@
                 {
                     // L0 makes no guarantees about overlapping-ness.  Just create a direct scanner for each
                     for (SSTableReader sstable : byLevel.get(level))
-                        scanners.add(sstable.getScanner(range, CompactionManager.instance.getRateLimiter()));
+                        scanners.add(sstable.getScanner(ranges, CompactionManager.instance.getRateLimiter()));
                 }
                 else
                 {
                     // Create a LeveledScanner that only opens one sstable at a time, in sorted order
-                    List<SSTableReader> intersecting = LeveledScanner.intersecting(byLevel.get(level), range);
+                    Collection<SSTableReader> intersecting = LeveledScanner.intersecting(byLevel.get(level), ranges);
                     if (!intersecting.isEmpty())
-                        scanners.add(new LeveledScanner(intersecting, range));
+                    {
+                        @SuppressWarnings("resource") // The ScannerList will be in charge of closing (and we close properly on errors)
+                        ISSTableScanner scanner = new LeveledScanner(intersecting, ranges);
+                        scanners.add(scanner);
+                    }
                 }
             }
         }
@@ -295,9 +306,9 @@
 
     // Lazily creates SSTableBoundedScanner for sstable that are assumed to be from the
     // same level (e.g. non overlapping) - see #4142
-    private static class LeveledScanner extends AbstractIterator<OnDiskAtomIterator> implements ISSTableScanner
+    private static class LeveledScanner extends AbstractIterator<UnfilteredRowIterator> implements ISSTableScanner
     {
-        private final Range<Token> range;
+        private final Collection<Range<Token>> ranges;
         private final List<SSTableReader> sstables;
         private final Iterator<SSTableReader> sstableIterator;
         private final long totalLength;
@@ -305,9 +316,9 @@
         private ISSTableScanner currentScanner;
         private long positionOffset;
 
-        public LeveledScanner(Collection<SSTableReader> sstables, Range<Token> range)
+        public LeveledScanner(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
         {
-            this.range = range;
+            this.ranges = ranges;
 
             // add only sstables that intersect our range, and estimate how much data that involves
             this.sstables = new ArrayList<>(sstables.size());
@@ -318,8 +329,8 @@
                 long estimatedKeys = sstable.estimatedKeys();
                 double estKeysInRangeRatio = 1.0;
 
-                if (estimatedKeys > 0 && range != null)
-                    estKeysInRangeRatio = ((double) sstable.estimatedKeysForRanges(Collections.singleton(range))) / estimatedKeys;
+                if (estimatedKeys > 0 && ranges != null)
+                    estKeysInRangeRatio = ((double) sstable.estimatedKeysForRanges(ranges)) / estimatedKeys;
 
                 length += sstable.uncompressedLength() * estKeysInRangeRatio;
             }
@@ -328,51 +339,61 @@
             Collections.sort(this.sstables, SSTableReader.sstableComparator);
             sstableIterator = this.sstables.iterator();
             assert sstableIterator.hasNext(); // caller should check intersecting first
-            currentScanner = sstableIterator.next().getScanner(range, CompactionManager.instance.getRateLimiter());
+            currentScanner = sstableIterator.next().getScanner(ranges, CompactionManager.instance.getRateLimiter());
         }
 
-        public static List<SSTableReader> intersecting(Collection<SSTableReader> sstables, Range<Token> range)
+        public static Collection<SSTableReader> intersecting(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
         {
-            ArrayList<SSTableReader> filtered = new ArrayList<>();
-            for (SSTableReader sstable : sstables)
+            if (ranges == null)
+                return Lists.newArrayList(sstables);
+
+            Set<SSTableReader> filtered = new HashSet<>();
+            for (Range<Token> range : ranges)
             {
-                Range<Token> sstableRange = new Range<>(sstable.first.getToken(), sstable.last.getToken());
-                if (range == null || sstableRange.intersects(range))
-                    filtered.add(sstable);
+                for (SSTableReader sstable : sstables)
+                {
+                    Range<Token> sstableRange = new Range<>(sstable.first.getToken(), sstable.last.getToken());
+                    if (range == null || sstableRange.intersects(range))
+                        filtered.add(sstable);
+                }
             }
             return filtered;
         }
 
-        protected OnDiskAtomIterator computeNext()
+
+        public boolean isForThrift()
+        {
+            return false;
+        }
+
+        public CFMetaData metadata()
+        {
+            return sstables.get(0).metadata; // The ctor checks we have at least one sstable
+        }
+
+        protected UnfilteredRowIterator computeNext()
         {
             if (currentScanner == null)
                 return endOfData();
 
-            try
+            while (true)
             {
-                while (true)
-                {
-                    if (currentScanner.hasNext())
-                        return currentScanner.next();
+                if (currentScanner.hasNext())
+                    return currentScanner.next();
 
-                    positionOffset += currentScanner.getLengthInBytes();
-                    currentScanner.close();
-                    if (!sstableIterator.hasNext())
-                    {
-                        // reset to null so getCurrentPosition does not return wrong value
-                        currentScanner = null;
-                        return endOfData();
-                    }
-                    currentScanner = sstableIterator.next().getScanner(range, CompactionManager.instance.getRateLimiter());
+                positionOffset += currentScanner.getLengthInBytes();
+                currentScanner.close();
+                if (!sstableIterator.hasNext())
+                {
+                    // reset to null so getCurrentPosition does not return wrong value
+                    currentScanner = null;
+                    return endOfData();
                 }
-            }
-            catch (IOException e)
-            {
-                throw new RuntimeException(e);
+                currentScanner = sstableIterator.next().getScanner(ranges, CompactionManager.instance.getRateLimiter());
             }
         }
 
-        public void close() throws IOException
+        public void close()
         {
             if (currentScanner != null)
                 currentScanner.close();

diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
index 4980ec3..f8c3521 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java

@@ -20,6 +20,7 @@
 import java.util.Set;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
 import org.apache.cassandra.db.compaction.writers.MajorLeveledCompactionWriter;
 import org.apache.cassandra.db.compaction.writers.MaxSSTableSizeWriter;
@@ -34,18 +35,21 @@
 
     public LeveledCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int level, int gcBefore, long maxSSTableBytes, boolean majorCompaction)
     {
-        super(cfs, txn, gcBefore, false);
+        super(cfs, txn, gcBefore);
         this.level = level;
         this.maxSSTableBytes = maxSSTableBytes;
         this.majorCompaction = majorCompaction;
     }
 
     @Override
-    public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
+    public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
+                                                          Directories directories,
+                                                          LifecycleTransaction txn,
+                                                          Set<SSTableReader> nonExpiredSSTables)
     {
         if (majorCompaction)
-            return new MajorLeveledCompactionWriter(cfs, txn, nonExpiredSSTables, maxSSTableBytes, false, compactionType);
-        return new MaxSSTableSizeWriter(cfs, txn, nonExpiredSSTables, maxSSTableBytes, getLevel(), false, compactionType);
+            return new MajorLeveledCompactionWriter(cfs, directories, txn, nonExpiredSSTables, maxSSTableBytes, false, false);
+        return new MaxSSTableSizeWriter(cfs, directories, txn, nonExpiredSSTables, maxSSTableBytes, getLevel(), false, false);
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
index fba6798..bf543e5 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java

@@ -28,13 +28,15 @@
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
 import com.google.common.primitives.Ints;
+
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.RowPosition;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
@@ -57,11 +59,14 @@
      * that level into lower level compactions
      */
     private static final int NO_COMPACTION_LIMIT = 25;
-
+    // allocate enough generations for a PB of data, with a 1-MB sstable size.  (Note that if maxSSTableSize is
+    // updated, we will still have sstables of the older, potentially smaller size.  So don't make this
+    // dependent on maxSSTableSize.)
+    public static final int MAX_LEVEL_COUNT = (int) Math.log10(1000 * 1000 * 1000);
     private final ColumnFamilyStore cfs;
     @VisibleForTesting
     protected final List<SSTableReader>[] generations;
-    private final RowPosition[] lastCompactedKeys;
+    private final PartitionPosition[] lastCompactedKeys;
     private final long maxSSTableSizeInBytes;
     private final SizeTieredCompactionStrategyOptions options;
     private final int [] compactionCounter;
@@ -72,18 +77,14 @@
         this.maxSSTableSizeInBytes = maxSSTableSizeInMB * 1024L * 1024L;
         this.options = options;
 
-        // allocate enough generations for a PB of data, with a 1-MB sstable size.  (Note that if maxSSTableSize is
-        // updated, we will still have sstables of the older, potentially smaller size.  So don't make this
-        // dependent on maxSSTableSize.)
-        int n = (int) Math.log10(1000 * 1000 * 1000);
-        generations = new List[n];
-        lastCompactedKeys = new RowPosition[n];
+        generations = new List[MAX_LEVEL_COUNT];
+        lastCompactedKeys = new PartitionPosition[MAX_LEVEL_COUNT];
         for (int i = 0; i < generations.length; i++)
         {
             generations[i] = new ArrayList<>();
-            lastCompactedKeys[i] = cfs.partitioner.getMinimumToken().minKeyBound();
+            lastCompactedKeys[i] = cfs.getPartitioner().getMinimumToken().minKeyBound();
         }
-        compactionCounter = new int[n];
+        compactionCounter = new int[MAX_LEVEL_COUNT];
     }
 
     public static LeveledManifest create(ColumnFamilyStore cfs, int maxSSTableSize, List<SSTableReader> sstables)
@@ -104,9 +105,38 @@
         {
             manifest.repairOverlappingSSTables(i);
         }
+        manifest.calculateLastCompactedKeys();
         return manifest;
     }
 
+    /**
+     * If we want to start compaction in level n, find the newest (by modification time) file in level n+1
+     * and use its last token for last compacted key in level n;
+     */
+    public void calculateLastCompactedKeys()
+    {
+        for (int i = 0; i < generations.length - 1; i++)
+        {
+            // this level is empty
+            if (generations[i + 1].isEmpty())
+                continue;
+
+            SSTableReader sstableWithMaxModificationTime = null;
+            long maxModificationTime = Long.MIN_VALUE;
+            for (SSTableReader ssTableReader : generations[i + 1])
+            {
+                long modificationTime = ssTableReader.getCreationTimeFor(Component.DATA);
+                if (modificationTime >= maxModificationTime)
+                {
+                    sstableWithMaxModificationTime = ssTableReader;
+                    maxModificationTime = modificationTime;
+                }
+            }
+
+            lastCompactedKeys[i] = sstableWithMaxModificationTime.last;
+        }
+    }
+
     public synchronized void add(SSTableReader reader)
     {
         int level = reader.getSSTableLevel();
@@ -333,7 +363,7 @@
                     candidates = getOverlappingStarvedSSTables(nextLevel, candidates);
                     if (logger.isTraceEnabled())
                         logger.trace("Compaction candidates for L{} are {}", i, toString(candidates));
-                    return new CompactionCandidate(candidates, nextLevel, cfs.getCompactionStrategy().getMaxSSTableBytes());
+                    return new CompactionCandidate(candidates, nextLevel, cfs.getCompactionStrategyManager().getMaxSSTableBytes());
                 }
                 else
                 {
@@ -353,7 +383,7 @@
             // small in L0.
             return getSTCSInL0CompactionCandidate();
         }
-        return new CompactionCandidate(candidates, getNextLevel(candidates), cfs.getCompactionStrategy().getMaxSSTableBytes());
+        return new CompactionCandidate(candidates, getNextLevel(candidates), cfs.getCompactionStrategyManager().getMaxSSTableBytes());
     }
 
     private CompactionCandidate getSTCSInL0CompactionCandidate()
@@ -416,8 +446,8 @@
                     // say we are compacting 3 sstables: 0->30 in L1 and 0->12, 12->33 in L2
                     // this means that we will not create overlap in L2 if we add an sstable
                     // contained within 0 -> 33 to the compaction
-                    RowPosition max = null;
-                    RowPosition min = null;
+                    PartitionPosition max = null;
+                    PartitionPosition min = null;
                     for (SSTableReader candidate : candidates)
                     {
                         if (min == null || candidate.first.compareTo(min) < 0)
@@ -428,10 +458,10 @@
                     if (min == null || max == null || min.equals(max)) // single partition sstables - we cannot include a high level sstable.
                         return candidates;
                     Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
-                    Range<RowPosition> boundaries = new Range<>(min, max);
+                    Range<PartitionPosition> boundaries = new Range<>(min, max);
                     for (SSTableReader sstable : getLevel(i))
                     {
-                        Range<RowPosition> r = new Range<RowPosition>(sstable.first, sstable.last);
+                        Range<PartitionPosition> r = new Range<PartitionPosition>(sstable.first, sstable.last);
                         if (boundaries.contains(r) && !compacting.contains(sstable))
                         {
                             logger.info("Adding high-level (L{}) {} to candidates", sstable.getSSTableLevel(), sstable);
@@ -560,8 +590,8 @@
         {
             Set<SSTableReader> compactingL0 = getCompacting(0);
 
-            RowPosition lastCompactingKey = null;
-            RowPosition firstCompactingKey = null;
+            PartitionPosition lastCompactingKey = null;
+            PartitionPosition firstCompactingKey = null;
             for (SSTableReader candidate : compactingL0)
             {
                 if (firstCompactingKey == null || candidate.first.compareTo(firstCompactingKey) < 0)

diff --git a/src/java/org/apache/cassandra/db/compaction/OperationType.java b/src/java/org/apache/cassandra/db/compaction/OperationType.java
index 6b66ded..20e6df2 100644
--- a/src/java/org/apache/cassandra/db/compaction/OperationType.java
+++ b/src/java/org/apache/cassandra/db/compaction/OperationType.java

@@ -33,13 +33,28 @@
     UNKNOWN("Unknown compaction type"),
     ANTICOMPACTION("Anticompaction after repair"),
     VERIFY("Verify"),
+    FLUSH("Flush"),
+    STREAM("Stream"),
+    WRITE("Write"),
+    VIEW_BUILD("View build"),
     INDEX_SUMMARY("Index summary redistribution");
 
-    private final String type;
+    public final String type;
+    public final String fileName;
 
     OperationType(String type)
     {
         this.type = type;
+        this.fileName = type.toLowerCase().replace(" ", "");
+    }
+
+    public static OperationType fromFileName(String fileName)
+    {
+        for (OperationType opType : OperationType.values())
+            if (opType.fileName.equals(fileName))
+                return opType;
+
+        throw new IllegalArgumentException("Invalid fileName for operation type: " + fileName);
     }
 
     public String toString()

diff --git a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
index 6b302d2..fce8c2e 100644
--- a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
+++ b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java

@@ -18,9 +18,7 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.*;
-
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
+import java.util.function.Predicate;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
@@ -63,7 +61,7 @@
 
         public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
         {
-            super(cfs, transaction, CompactionManager.NO_GC, true);
+            super(cfs, transaction, CompactionManager.NO_GC, true, false);
             this.sstableSizeInMB = sstableSizeInMB;
 
             if (sstableSizeInMB <= 0)
@@ -77,9 +75,12 @@
         }
 
         @Override
-        public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
+        public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
+                                                              Directories directories,
+                                                              LifecycleTransaction txn,
+                                                              Set<SSTableReader> nonExpiredSSTables)
         {
-            return new MaxSSTableSizeWriter(cfs, txn, nonExpiredSSTables, sstableSizeInMB * 1024L * 1024L, 0, true, compactionType);
+            return new MaxSSTableSizeWriter(cfs, directories, txn, nonExpiredSSTables, sstableSizeInMB * 1024L * 1024L, 0, true, false);
         }
 
         @Override
@@ -99,7 +100,7 @@
         @Override
         public Predicate<Long> getPurgeEvaluator(DecoratedKey key)
         {
-            return Predicates.alwaysFalse();
+            return time -> false;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index affee11..463cc9c 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java

@@ -22,27 +22,22 @@
 import java.util.*;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
 import com.google.common.base.Throwables;
-import com.google.common.collect.AbstractIterator;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.JVMStabilityInspector;
-import org.apache.cassandra.utils.OutputHandler;
-import org.apache.cassandra.utils.memory.HeapAllocator;
-import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.utils.*;
 import org.apache.cassandra.utils.concurrent.Refs;
+import org.apache.cassandra.utils.memory.HeapAllocator;
 
 public class Scrubber implements Closeable
 {
@@ -53,7 +48,6 @@
     private final boolean skipCorrupted;
     private final boolean reinsertOverflowedTTLRows;
 
-    private final CompactionController controller;
     private final boolean isCommutative;
     private final boolean isIndex;
     private final boolean checkData;
@@ -64,9 +58,6 @@
     private final ScrubInfo scrubInfo;
     private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
 
-    private SSTableReader newSstable;
-    private SSTableReader newInOrderSstable;
-
     private int goodRows;
     private int badRows;
     private int emptyRows;
@@ -76,21 +67,22 @@
     long currentRowPositionFromIndex;
     long nextRowPositionFromIndex;
 
-    private final OutputHandler outputHandler;
     private NegativeLocalDeletionInfoMetrics negativeLocalDeletionInfoMetrics = new NegativeLocalDeletionInfoMetrics();
 
-    private static final Comparator<Row> rowComparator = new Comparator<Row>()
+    private final OutputHandler outputHandler;
+
+    private static final Comparator<Partition> partitionComparator = new Comparator<Partition>()
     {
-         public int compare(Row r1, Row r2)
+         public int compare(Partition r1, Partition r2)
          {
-             return r1.key.compareTo(r2.key);
+             return r1.partitionKey().compareTo(r2.partitionKey());
          }
     };
-    private final SortedSet<Row> outOfOrderRows = new TreeSet<>(rowComparator);
+    private final SortedSet<Partition> outOfOrder = new TreeSet<>(partitionComparator);
 
     public Scrubber(ColumnFamilyStore cfs, LifecycleTransaction transaction, boolean skipCorrupted, boolean checkData) throws IOException
     {
-        this(cfs, transaction, skipCorrupted, new OutputHandler.LogOutput(), checkData, false);
+        this(cfs, transaction, skipCorrupted, checkData, false);
     }
 
     public Scrubber(ColumnFamilyStore cfs, LifecycleTransaction transaction, boolean skipCorrupted, boolean checkData,
@@ -100,7 +92,11 @@
     }
 
     @SuppressWarnings("resource")
-    public Scrubber(ColumnFamilyStore cfs, LifecycleTransaction transaction, boolean skipCorrupted, OutputHandler outputHandler, boolean checkData,
+    public Scrubber(ColumnFamilyStore cfs,
+                    LifecycleTransaction transaction,
+                    boolean skipCorrupted,
+                    OutputHandler outputHandler,
+                    boolean checkData,
                     boolean reinsertOverflowedTTLRows) throws IOException
     {
         this.cfs = cfs;
@@ -108,18 +104,17 @@
         this.sstable = transaction.onlyOne();
         this.outputHandler = outputHandler;
         this.skipCorrupted = skipCorrupted;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata);
         this.reinsertOverflowedTTLRows = reinsertOverflowedTTLRows;
-
+        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata,
+                                                                                                        sstable.descriptor.version,
+                                                                                                        sstable.header);
         List<SSTableReader> toScrub = Collections.singletonList(sstable);
 
         // Calculate the expected compacted filesize
-        this.destination = cfs.directories.getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(toScrub, OperationType.SCRUB));
+        this.destination = cfs.getDirectories().getWriteableLocationAsFile(cfs.getExpectedCompactedFileSize(toScrub, OperationType.SCRUB));
+        if (destination == null)
+            throw new IOException("disk full");
 
-        // If we run scrub offline, we should never purge tombstone, as we cannot know if other sstable have data that the tombstone deletes.
-        this.controller = transaction.isOffline()
-                        ? new ScrubController(cfs)
-                        : new CompactionController(cfs, Collections.singleton(sstable), CompactionManager.getDefaultGcBefore(cfs));
         this.isCommutative = cfs.metadata.isCounter();
 
         boolean hasIndexFile = (new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX))).exists();
@@ -131,7 +126,7 @@
         }
         this.checkData = checkData && !this.isIndex; //LocalByPartitionerType does not support validation
         this.expectedBloomFilterSize = Math.max(
-            cfs.metadata.getMinIndexInterval(),
+            cfs.metadata.params.minIndexInterval,
             hasIndexFile ? SSTableReader.getApproximateKeyCount(toScrub) : 0);
 
         // loop through each row, deserializing to check for damage.
@@ -155,21 +150,28 @@
             outputHandler.output("Starting scrub with reinsert overflowed TTL option");
     }
 
+    private UnfilteredRowIterator withValidation(UnfilteredRowIterator iter, String filename)
+    {
+        return checkData ? UnfilteredRowIterators.withValidation(iter, filename) : iter;
+    }
+
     public void scrub()
     {
+        List<SSTableReader> finished = new ArrayList<>();
+        boolean completed = false;
         outputHandler.output(String.format("Scrubbing %s (%s bytes)", sstable, dataFile.length()));
-        try (SSTableRewriter writer = new SSTableRewriter(cfs, transaction, sstable.maxDataAge, transaction.isOffline());
+        try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, false, sstable.maxDataAge, transaction.isOffline());
              Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable)))
         {
             nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null;
             if (indexAvailable())
             {
                 // throw away variable so we don't have a side effect in the assert
-                long firstRowPositionFromIndex = rowIndexEntrySerializer.deserialize(indexFile, sstable.descriptor.version).position;
+                long firstRowPositionFromIndex = rowIndexEntrySerializer.deserialize(indexFile).position;
                 assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
             }
 
-            writer.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable.getSSTableMetadata().repairedAt, sstable));
+            writer.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable.getSSTableMetadata().repairedAt, sstable, transaction));
 
             DecoratedKey prevKey = null;
 
@@ -178,29 +180,13 @@
                 if (scrubInfo.isStopRequested())
                     throw new CompactionInterruptedException(scrubInfo.getCompactionInfo());
 
-                updateIndexKey();
-
-                if (prevKey != null && indexFile != null)
-                {
-                    long nextRowStart = currentRowPositionFromIndex == -1 ? dataFile.length() : currentRowPositionFromIndex;
-                    if (dataFile.getFilePointer() < nextRowStart)
-                    {
-                        // Encountered CASSANDRA-10791. Place post-END_OF_ROW data in the out-of-order table.
-                        saveOutOfOrderRow(prevKey,
-                                          SSTableIdentityIterator.createFragmentIterator(sstable, dataFile, prevKey, nextRowStart - dataFile.getFilePointer(), checkData),
-                                          String.format("Row fragment detected after END_OF_ROW at key %s", prevKey));
-                        if (dataFile.isEOF())
-                            break;
-                    }
-                }
-
                 long rowStart = dataFile.getFilePointer();
                 outputHandler.debug("Reading row at " + rowStart);
 
                 DecoratedKey key = null;
                 try
                 {
-                    key = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
+                    key = sstable.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
                 }
                 catch (Throwable th)
                 {
@@ -208,6 +194,8 @@
                     // check for null key below
                 }
 
+                updateIndexKey();
+
                 long dataStart = dataFile.getFilePointer();
 
                 long dataStartFromIndex = -1;
@@ -232,7 +220,8 @@
                     if (currentIndexKey != null && !key.getKey().equals(currentIndexKey))
                     {
                         throw new IOError(new IOException(String.format("Key from data file (%s) does not match key from index file (%s)",
-                                ByteBufferUtil.bytesToHex(key.getKey()), ByteBufferUtil.bytesToHex(currentIndexKey))));
+                                //ByteBufferUtil.bytesToHex(key.getKey()), ByteBufferUtil.bytesToHex(currentIndexKey))));
+                                "_too big_", ByteBufferUtil.bytesToHex(currentIndexKey))));
                     }
 
                     if (indexFile != null && dataSizeFromIndex > dataFile.length())
@@ -254,7 +243,7 @@
                     {
                         outputHandler.output(String.format("Retrying from row index; data is %s bytes starting at %s",
                                                   dataSizeFromIndex, dataStartFromIndex));
-                        key = sstable.partitioner.decorateKey(currentIndexKey);
+                        key = sstable.decorateKey(currentIndexKey);
                         try
                         {
                             dataFile.seek(dataStartFromIndex);
@@ -284,26 +273,25 @@
                 }
             }
 
-            if (!outOfOrderRows.isEmpty())
+            if (!outOfOrder.isEmpty())
             {
                 // out of order rows, but no bad rows found - we can keep our repairedAt time
                 long repairedAt = badRows > 0 ? ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt;
-                try (SSTableWriter inOrderWriter = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, repairedAt, sstable);)
+                SSTableReader newInOrderSstable;
+                try (SSTableWriter inOrderWriter = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, repairedAt, sstable, transaction))
                 {
-                    for (Row row : outOfOrderRows)
-                        inOrderWriter.append(row.key, row.cf);
+                    for (Partition partition : outOfOrder)
+                        inOrderWriter.append(partition.unfilteredIterator());
                     newInOrderSstable = inOrderWriter.finish(-1, sstable.maxDataAge, true);
                 }
                 transaction.update(newInOrderSstable, false);
-                if (transaction.isOffline() && newInOrderSstable != null)
-                    newInOrderSstable.selfRef().release();
-                outputHandler.warn(String.format("%d out of order rows found while scrubbing %s; Those have been written (in order) to a new sstable (%s)", outOfOrderRows.size(), sstable, newInOrderSstable));
+                finished.add(newInOrderSstable);
+                outputHandler.warn(String.format("%d out of order rows found while scrubbing %s; Those have been written (in order) to a new sstable (%s)", outOfOrder.size(), sstable, newInOrderSstable));
             }
 
             // finish obsoletes the old sstable
-            List<SSTableReader> finished = writer.setRepairedAt(badRows > 0 ? ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt).finish();
-            if (!finished.isEmpty())
-                newSstable = finished.get(0);
+            finished.addAll(writer.setRepairedAt(badRows > 0 ? ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt).finish());
+            completed = true;
         }
         catch (IOException e)
         {
@@ -311,19 +299,11 @@
         }
         finally
         {
-            controller.close();
-            if (transaction.isOffline() && newSstable != null)
-                newSstable.selfRef().release();
+            if (transaction.isOffline())
+                finished.forEach(sstable -> sstable.selfRef().release());
         }
 
-        if (newSstable == null)
-        {
-            if (badRows > 0)
-                outputHandler.warn("No valid rows found while scrubbing " + sstable + "; it is marked for deletion now. If you want to attempt manual recovery, you can find a copy in the pre-scrub snapshot");
-            else
-                outputHandler.output("Scrub of " + sstable + " complete; looks like all " + emptyRows + " rows were tombstoned");
-        }
-        else
+        if (completed)
         {
             outputHandler.output("Scrub of " + sstable + " complete: " + goodRows + " rows in new sstable and " + emptyRows + " empty (tombstoned) rows dropped");
             if (negativeLocalDeletionInfoMetrics.fixedRows > 0)
@@ -331,30 +311,43 @@
             if (badRows > 0)
                 outputHandler.warn("Unable to recover " + badRows + " rows that were skipped.  You can attempt manual recovery from the pre-scrub snapshot.  You can also run nodetool repair to transfer the data from a healthy replica, if any");
         }
+        else
+        {
+            if (badRows > 0)
+                outputHandler.warn("No valid rows found while scrubbing " + sstable + "; it is marked for deletion now. If you want to attempt manual recovery, you can find a copy in the pre-scrub snapshot");
+            else
+                outputHandler.output("Scrub of " + sstable + " complete; looks like all " + emptyRows + " rows were tombstoned");
+        }
     }
 
     @SuppressWarnings("resource")
     private boolean tryAppend(DecoratedKey prevKey, DecoratedKey key, SSTableRewriter writer)
     {
-        // OrderCheckerIterator will check, at iteration time, that the cells are in the proper order. If it detects
-        // that one cell is out of order, it will stop returning them. The remaining cells will be sorted and added
-        // to the outOfOrderRows that will be later written to a new SSTable.
-        OrderCheckerIterator atoms = new OrderCheckerIterator(getIterator(key),
-                                                              cfs.metadata.comparator.onDiskAtomComparator());
-        if (prevKey != null && prevKey.compareTo(key) > 0)
+        // OrderCheckerIterator will check, at iteration time, that the rows are in the proper order. If it detects
+        // that one row is out of order, it will stop returning them. The remaining rows will be sorted and added
+        // to the outOfOrder set that will be later written to a new SSTable.
+        OrderCheckerIterator sstableIterator = new OrderCheckerIterator(getIterator(key),
+                                                                        cfs.metadata.comparator);
+
+        try (UnfilteredRowIterator iterator = withValidation(sstableIterator, dataFile.getPath()))
         {
-            saveOutOfOrderRow(prevKey, key, atoms);
-            return false;
+            if (prevKey != null && prevKey.compareTo(key) > 0)
+            {
+                saveOutOfOrderRow(prevKey, key, iterator);
+                return false;
+            }
+
+            if (writer.tryAppend(iterator) == null)
+                emptyRows++;
+            else
+                goodRows++;
         }
 
-        AbstractCompactedRow compactedRow = new LazilyCompactedRow(controller, Collections.singletonList(atoms));
-        if (writer.tryAppend(compactedRow) == null)
-            emptyRows++;
-        else
-            goodRows++;
-
-        if (atoms.hasOutOfOrderCells())
-            saveOutOfOrderRow(key, atoms);
+        if (sstableIterator.hasRowsOutOfOrder())
+        {
+            outputHandler.warn(String.format("Out of order rows found in partition: %s", key));
+            outOfOrder.add(sstableIterator.getRowsOutOfOrder());
+        }
 
         return true;
     }
@@ -363,12 +356,12 @@
      * Only wrap with {@link FixNegativeLocalDeletionTimeIterator} if {@link #reinsertOverflowedTTLRows} option
      * is specified
      */
-    private OnDiskAtomIterator getIterator(DecoratedKey key)
+    private UnfilteredRowIterator getIterator(DecoratedKey key)
     {
-        SSTableIdentityIterator sstableIdentityIterator = new SSTableIdentityIterator(sstable, dataFile, key, checkData);
-        return reinsertOverflowedTTLRows ? new FixNegativeLocalDeletionTimeIterator(sstableIdentityIterator,
+        RowMergingSSTableIterator rowMergingIterator = new RowMergingSSTableIterator(sstable, dataFile, key);
+        return reinsertOverflowedTTLRows ? new FixNegativeLocalDeletionTimeIterator(rowMergingIterator,
                                                                                     outputHandler,
-                                                                                    negativeLocalDeletionInfoMetrics) : sstableIdentityIterator;
+                                                                                    negativeLocalDeletionInfoMetrics) : rowMergingIterator;
     }
 
     private void updateIndexKey()
@@ -381,7 +374,7 @@
 
             nextRowPositionFromIndex = !indexAvailable()
                     ? dataFile.length()
-                    : rowIndexEntrySerializer.deserialize(indexFile, sstable.descriptor.version).position;
+                    : rowIndexEntrySerializer.deserialize(indexFile).position;
         }
         catch (Throwable th)
         {
@@ -417,40 +410,11 @@
         }
     }
 
-    private void saveOutOfOrderRow(DecoratedKey prevKey, DecoratedKey key, OnDiskAtomIterator atoms)
-    {
-        saveOutOfOrderRow(key, atoms, String.format("Out of order row detected (%s found after %s)", key, prevKey));
-    }
-
-    void saveOutOfOrderRow(DecoratedKey key, OnDiskAtomIterator atoms, String message)
+    private void saveOutOfOrderRow(DecoratedKey prevKey, DecoratedKey key, UnfilteredRowIterator iterator)
     {
         // TODO bitch if the row is too large?  if it is there's not much we can do ...
-        outputHandler.warn(message);
-        // adding atoms in sorted order is worst-case for TMBSC, but we shouldn't need to do this very often
-        // and there's no sense in failing on mis-sorted cells when a TreeMap could safe us
-        ColumnFamily cf = atoms.getColumnFamily().cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-        while (atoms.hasNext())
-        {
-            OnDiskAtom atom = atoms.next();
-            cf.addAtom(atom);
-        }
-        outOfOrderRows.add(new Row(key, cf));
-    }
-
-    void saveOutOfOrderRow(DecoratedKey key, OrderCheckerIterator atoms)
-    {
-        outputHandler.warn(String.format("Out of order cells found at key %s", key));
-        outOfOrderRows.add(new Row(key, atoms.getOutOfOrderCells()));
-    }
-
-    public SSTableReader getNewSSTable()
-    {
-        return newSstable;
-    }
-
-    public SSTableReader getNewInOrderSSTable()
-    {
-        return newInOrderSstable;
+        outputHandler.warn(String.format("Out of order row detected (%s found after %s)", key, prevKey));
+        outOfOrder.add(ImmutableBTreePartition.create(iterator));
     }
 
     private void throwIfFatal(Throwable th)
@@ -518,19 +482,10 @@
                 throw new RuntimeException();
             }
         }
-    }
 
-    private static class ScrubController extends CompactionController
-    {
-        public ScrubController(ColumnFamilyStore cfs)
+        public boolean isGlobal()
         {
-            super(cfs, Integer.MAX_VALUE);
-        }
-
-        @Override
-        public Predicate<Long> getPurgeEvaluator(DecoratedKey key)
-        {
-            return Predicates.alwaysFalse();
+            return false;
         }
     }
 
@@ -561,107 +516,165 @@
     }
 
     /**
+     * During 2.x migration, under some circumstances rows might have gotten duplicated.
+     * Merging iterator merges rows with same clustering.
+     *
+     * For more details, refer to CASSANDRA-12144.
+     */
+    private static class RowMergingSSTableIterator extends SSTableIdentityIterator
+    {
+        RowMergingSSTableIterator(SSTableReader sstable, RandomAccessReader file, DecoratedKey key)
+        {
+            super(sstable, file, key);
+        }
+
+        @Override
+        protected Unfiltered doCompute()
+        {
+            if (!iterator.hasNext())
+                return endOfData();
+
+            Unfiltered next = iterator.next();
+            if (!next.isRow())
+                return next;
+
+            while (iterator.hasNext())
+            {
+                Unfiltered peek = iterator.peek();
+                // If there was a duplicate row, merge it.
+                if (next.clustering().equals(peek.clustering()) && peek.isRow())
+                {
+                    iterator.next(); // Make sure that the peeked item was consumed.
+                    next = Rows.merge((Row) next, (Row) peek, FBUtilities.nowInSeconds());
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            return next;
+        }
+
+    }
+
+    /**
      * In some case like CASSANDRA-12127 the cells might have been stored in the wrong order. This decorator check the
      * cells order and collect the out of order cells to correct the problem.
      */
-    private static final class OrderCheckerIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
+    private static final class OrderCheckerIterator extends AbstractIterator<Unfiltered> implements UnfilteredRowIterator
     {
         /**
          * The decorated iterator.
          */
-        private final OnDiskAtomIterator iterator;
+        private final UnfilteredRowIterator iterator;
+
+        private final ClusteringComparator comparator;
+
+        private Unfiltered previous;
 
         /**
-         * The atom comparator.
+         * The partition containing the rows which are out of order.
          */
-        private final Comparator<OnDiskAtom> comparator;
+        private Partition rowsOutOfOrder;
 
-        /**
-         * The Column family containing the cells which are out of order.
-         */
-        private ColumnFamily outOfOrderCells;
-
-        /**
-         * The previous atom returned
-         */
-        private OnDiskAtom previous;
-
-        public OrderCheckerIterator(OnDiskAtomIterator iterator, Comparator<OnDiskAtom> comparator)
+        public OrderCheckerIterator(UnfilteredRowIterator iterator, ClusteringComparator comparator)
         {
             this.iterator = iterator;
             this.comparator = comparator;
         }
 
-        public ColumnFamily getColumnFamily()
+        public CFMetaData metadata()
         {
-            return iterator.getColumnFamily();
+            return iterator.metadata();
         }
 
-        public DecoratedKey getKey()
+        public boolean isReverseOrder()
         {
-            return iterator.getKey();
+            return iterator.isReverseOrder();
         }
 
-        public void close() throws IOException
+        public PartitionColumns columns()
+        {
+            return iterator.columns();
+        }
+
+        public DecoratedKey partitionKey()
+        {
+            return iterator.partitionKey();
+        }
+
+        public Row staticRow()
+        {
+            return iterator.staticRow();
+        }
+
+        @Override
+        public boolean isEmpty()
+        {
+            return iterator.isEmpty();
+        }
+
+        public void close()
         {
             iterator.close();
         }
 
-        @Override
-        protected OnDiskAtom computeNext()
+        public DeletionTime partitionLevelDeletion()
+        {
+            return iterator.partitionLevelDeletion();
+        }
+
+        public EncodingStats stats()
+        {
+            return iterator.stats();
+        }
+
+        public boolean hasRowsOutOfOrder()
+        {
+            return rowsOutOfOrder != null;
+        }
+
+        public Partition getRowsOutOfOrder()
+        {
+            return rowsOutOfOrder;
+        }
+
+        protected Unfiltered computeNext()
         {
             if (!iterator.hasNext())
                 return endOfData();
 
-            OnDiskAtom next = iterator.next();
+            Unfiltered next = iterator.next();
 
-            // If we detect that some cells are out of order we will store and sort the remaining once to insert them
+            // If we detect that some rows are out of order we will store and sort the remaining ones to insert them
             // in a separate SSTable.
             if (previous != null && comparator.compare(next, previous) < 0)
             {
-                outOfOrderCells = collectOutOfOrderCells(next, iterator);
+                rowsOutOfOrder = ImmutableBTreePartition.create(UnfilteredRowIterators.concat(next, iterator), false);
                 return endOfData();
             }
             previous = next;
             return next;
         }
-
-        public boolean hasOutOfOrderCells()
-        {
-            return outOfOrderCells != null;
-        }
-
-        public ColumnFamily getOutOfOrderCells()
-        {
-            return outOfOrderCells;
-        }
-
-        private static ColumnFamily collectOutOfOrderCells(OnDiskAtom atom, OnDiskAtomIterator iterator)
-        {
-            ColumnFamily cf = iterator.getColumnFamily().cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-            cf.addAtom(atom);
-            while (iterator.hasNext())
-                cf.addAtom(iterator.next());
-            return cf;
-        }
     }
 
     /**
-     * This iterator converts negative {@link BufferExpiringCell#getLocalDeletionTime()} into {@link BufferExpiringCell#MAX_DELETION_TIME}
+     * This iterator converts negative {@link AbstractCell#localDeletionTime()} into {@link AbstractCell#MAX_DELETION_TIME}
      *
      * This is to recover entries with overflowed localExpirationTime due to CASSANDRA-14092
      */
-    private static final class FixNegativeLocalDeletionTimeIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
+    private static final class FixNegativeLocalDeletionTimeIterator extends AbstractIterator<Unfiltered> implements UnfilteredRowIterator
     {
         /**
          * The decorated iterator.
          */
-        private final OnDiskAtomIterator iterator;
+        private final UnfilteredRowIterator iterator;
 
         private final OutputHandler outputHandler;
         private final NegativeLocalDeletionInfoMetrics negativeLocalExpirationTimeMetrics;
 
-        public FixNegativeLocalDeletionTimeIterator(OnDiskAtomIterator iterator, OutputHandler outputHandler,
+        public FixNegativeLocalDeletionTimeIterator(UnfilteredRowIterator iterator, OutputHandler outputHandler,
                                                     NegativeLocalDeletionInfoMetrics negativeLocalDeletionInfoMetrics)
         {
             this.iterator = iterator;
@@ -669,37 +682,127 @@
             this.negativeLocalExpirationTimeMetrics = negativeLocalDeletionInfoMetrics;
         }
 
-        public ColumnFamily getColumnFamily()
+        public CFMetaData metadata()
         {
-            return iterator.getColumnFamily();
+            return iterator.metadata();
         }
 
-        public DecoratedKey getKey()
+        public boolean isReverseOrder()
         {
-            return iterator.getKey();
+            return iterator.isReverseOrder();
         }
 
-        public void close() throws IOException
+        public PartitionColumns columns()
+        {
+            return iterator.columns();
+        }
+
+        public DecoratedKey partitionKey()
+        {
+            return iterator.partitionKey();
+        }
+
+        public Row staticRow()
+        {
+            return iterator.staticRow();
+        }
+
+        @Override
+        public boolean isEmpty()
+        {
+            return iterator.isEmpty();
+        }
+
+        public void close()
         {
             iterator.close();
         }
 
-        @Override
-        protected OnDiskAtom computeNext()
+        public DeletionTime partitionLevelDeletion()
+        {
+            return iterator.partitionLevelDeletion();
+        }
+
+        public EncodingStats stats()
+        {
+            return iterator.stats();
+        }
+
+        protected Unfiltered computeNext()
         {
             if (!iterator.hasNext())
                 return endOfData();
 
-            OnDiskAtom next = iterator.next();
+            Unfiltered next = iterator.next();
+            if (!next.isRow())
+                return next;
 
-            if (next instanceof ExpiringCell && next.getLocalDeletionTime() < 0)
+            if (hasNegativeLocalExpirationTime((Row) next))
             {
-                outputHandler.debug(String.format("Found cell with negative local expiration time: %s", ((ExpiringCell) next).getString(getColumnFamily().getComparator()), getColumnFamily()));
+                outputHandler.debug(String.format("Found row with negative local expiration time: %s", next.toString(metadata(), false)));
                 negativeLocalExpirationTimeMetrics.fixedRows++;
-                next = ((Cell) next).localCopy(getColumnFamily().metadata(), HeapAllocator.instance).withUpdatedTimestampAndLocalDeletionTime(next.timestamp() + 1, BufferExpiringCell.MAX_DELETION_TIME);
+                return fixNegativeLocalExpirationTime((Row) next);
             }
 
             return next;
         }
+
+        private boolean hasNegativeLocalExpirationTime(Row next)
+        {
+            Row row = next;
+            if (row.primaryKeyLivenessInfo().isExpiring() && row.primaryKeyLivenessInfo().localExpirationTime() < 0)
+            {
+                return true;
+            }
+
+            for (ColumnData cd : row)
+            {
+                if (cd.column().isSimple())
+                {
+                    Cell cell = (Cell)cd;
+                    if (cell.isExpiring() && cell.localDeletionTime() < 0)
+                        return true;
+                }
+                else
+                {
+                    ComplexColumnData complexData = (ComplexColumnData)cd;
+                    for (Cell cell : complexData)
+                    {
+                        if (cell.isExpiring() && cell.localDeletionTime() < 0)
+                            return true;
+                    }
+                }
+            }
+
+            return false;
+        }
+
+        private Unfiltered fixNegativeLocalExpirationTime(Row row)
+        {
+            Row.Builder builder = HeapAllocator.instance.cloningBTreeRowBuilder();
+            builder.newRow(row.clustering());
+            builder.addPrimaryKeyLivenessInfo(row.primaryKeyLivenessInfo().isExpiring() && row.primaryKeyLivenessInfo().localExpirationTime() < 0 ?
+                                              row.primaryKeyLivenessInfo().withUpdatedTimestampAndLocalDeletionTime(row.primaryKeyLivenessInfo().timestamp() + 1, AbstractCell.MAX_DELETION_TIME)
+                                              :row.primaryKeyLivenessInfo());
+            builder.addRowDeletion(row.deletion());
+            for (ColumnData cd : row)
+            {
+                if (cd.column().isSimple())
+                {
+                    Cell cell = (Cell)cd;
+                    builder.addCell(cell.isExpiring() && cell.localDeletionTime() < 0 ? cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp() + 1, AbstractCell.MAX_DELETION_TIME) : cell);
+                }
+                else
+                {
+                    ComplexColumnData complexData = (ComplexColumnData)cd;
+                    builder.addComplexDeletion(complexData.column(), complexData.complexDeletion());
+                    for (Cell cell : complexData)
+                    {
+                        builder.addCell(cell.isExpiring() && cell.localDeletionTime() < 0 ? cell.withUpdatedTimestampAndLocalDeletionTime(cell.timestamp() + 1, AbstractCell.MAX_DELETION_TIME) : cell);
+                    }
+                }
+            }
+            return builder.build();
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
index b4125bb..80f5e8c 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java

@@ -22,20 +22,21 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Sets;
-
-import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
-import org.apache.cassandra.db.compaction.writers.SplittingSizeTieredCompactionWriter;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cql3.statements.CFPropDefs;
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
+import org.apache.cassandra.db.compaction.writers.SplittingSizeTieredCompactionWriter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.utils.Pair;
 
+import static com.google.common.collect.Iterables.filter;
+
 public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy
 {
     private static final Logger logger = LoggerFactory.getLogger(SizeTieredCompactionStrategy.class);
@@ -73,13 +74,13 @@
         this.sizeTieredOptions = new SizeTieredCompactionStrategyOptions(options);
     }
 
-    private List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
+    private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
     {
         // make local copies so they can't be changed out from under us mid-method
         int minThreshold = cfs.getMinimumCompactionThreshold();
         int maxThreshold = cfs.getMaximumCompactionThreshold();
 
-        Iterable<SSTableReader> candidates = filterSuspectSSTables(Sets.intersection(cfs.getUncompactingSSTables(), sstables));
+        Iterable<SSTableReader> candidates = filterSuspectSSTables(filter(cfs.getUncompactingSSTables(), sstables::contains));
 
         List<List<SSTableReader>> buckets = getBuckets(createSSTableAndLengthPairs(candidates), sizeTieredOptions.bucketHigh, sizeTieredOptions.bucketLow, sizeTieredOptions.minSSTableSize);
         logger.trace("Compaction buckets are {}", buckets);
@@ -184,12 +185,12 @@
 
             LifecycleTransaction transaction = cfs.getTracker().tryModify(hottestBucket, OperationType.COMPACTION);
             if (transaction != null)
-                return new CompactionTask(cfs, transaction, gcBefore, false);
+                return new CompactionTask(cfs, transaction, gcBefore);
         }
     }
 
     @SuppressWarnings("resource")
-    public Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore, boolean splitOutput)
+    public synchronized Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore, boolean splitOutput)
     {
         Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
         if (Iterables.isEmpty(filteredSSTables))
@@ -198,8 +199,8 @@
         if (txn == null)
             return null;
         if (splitOutput)
-            return Arrays.<AbstractCompactionTask>asList(new SplittingCompactionTask(cfs, txn, gcBefore, false));
-        return Arrays.<AbstractCompactionTask>asList(new CompactionTask(cfs, txn, gcBefore, false));
+            return Arrays.<AbstractCompactionTask>asList(new SplittingCompactionTask(cfs, txn, gcBefore));
+        return Arrays.<AbstractCompactionTask>asList(new CompactionTask(cfs, txn, gcBefore));
     }
 
     @SuppressWarnings("resource")
@@ -214,7 +215,7 @@
             return null;
         }
 
-        return new CompactionTask(cfs, transaction, gcBefore, false).setUserDefined(true);
+        return new CompactionTask(cfs, transaction, gcBefore).setUserDefined(true);
     }
 
     public int getEstimatedRemainingTasks()
@@ -302,8 +303,8 @@
         Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
         uncheckedOptions = SizeTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
 
-        uncheckedOptions.remove(CFPropDefs.KW_MINCOMPACTIONTHRESHOLD);
-        uncheckedOptions.remove(CFPropDefs.KW_MAXCOMPACTIONTHRESHOLD);
+        uncheckedOptions.remove(CompactionParams.Option.MIN_THRESHOLD.toString());
+        uncheckedOptions.remove(CompactionParams.Option.MAX_THRESHOLD.toString());
 
         return uncheckedOptions;
     }
@@ -315,13 +316,13 @@
     }
 
     @Override
-    public void addSSTable(SSTableReader added)
+    public synchronized void addSSTable(SSTableReader added)
     {
         sstables.add(added);
     }
 
     @Override
-    public void removeSSTable(SSTableReader sstable)
+    public synchronized void removeSSTable(SSTableReader sstable)
     {
         sstables.remove(sstable);
     }
@@ -335,15 +336,18 @@
 
     private static class SplittingCompactionTask extends CompactionTask
     {
-        public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean offline)
+        public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
         {
-            super(cfs, txn, gcBefore, offline);
+            super(cfs, txn, gcBefore);
         }
 
         @Override
-        public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
+        public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
+                                                              Directories directories,
+                                                              LifecycleTransaction txn,
+                                                              Set<SSTableReader> nonExpiredSSTables)
         {
-            return new SplittingSizeTieredCompactionWriter(cfs, txn, nonExpiredSSTables, compactionType);
+            return new SplittingSizeTieredCompactionWriter(cfs, directories, txn, nonExpiredSSTables);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
new file mode 100644
index 0000000..c44d3aa
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java

@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.TreeSet;
+import java.util.concurrent.TimeUnit;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.utils.Pair;
+
+import static com.google.common.collect.Iterables.filter;
+
+public class TimeWindowCompactionStrategy extends AbstractCompactionStrategy
+{
+    private static final Logger logger = LoggerFactory.getLogger(TimeWindowCompactionStrategy.class);
+
+    private final TimeWindowCompactionStrategyOptions options;
+    protected volatile int estimatedRemainingTasks;
+    private final Set<SSTableReader> sstables = new HashSet<>();
+    private long lastExpiredCheck;
+    private long highestWindowSeen;
+
+    public TimeWindowCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    {
+        super(cfs, options);
+        this.estimatedRemainingTasks = 0;
+        this.options = new TimeWindowCompactionStrategyOptions(options);
+        if (!options.containsKey(AbstractCompactionStrategy.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(AbstractCompactionStrategy.TOMBSTONE_THRESHOLD_OPTION))
+        {
+            disableTombstoneCompactions = true;
+            logger.debug("Disabling tombstone compactions for TWCS");
+        }
+        else
+            logger.debug("Enabling tombstone compactions for TWCS");
+
+    }
+
+    @Override
+    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
+    public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+    {
+        while (true)
+        {
+            List<SSTableReader> latestBucket = getNextBackgroundSSTables(gcBefore);
+
+            if (latestBucket.isEmpty())
+                return null;
+
+            LifecycleTransaction modifier = cfs.getTracker().tryModify(latestBucket, OperationType.COMPACTION);
+            if (modifier != null)
+                return new CompactionTask(cfs, modifier, gcBefore);
+        }
+    }
+
+    /**
+     *
+     * @param gcBefore
+     * @return
+     */
+    private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
+    {
+        if (Iterables.isEmpty(cfs.getSSTables(SSTableSet.LIVE)))
+            return Collections.emptyList();
+
+        Set<SSTableReader> uncompacting = ImmutableSet.copyOf(filter(cfs.getUncompactingSSTables(), sstables::contains));
+
+        // Find fully expired SSTables. Those will be included no matter what.
+        Set<SSTableReader> expired = Collections.emptySet();
+
+        if (System.currentTimeMillis() - lastExpiredCheck > options.expiredSSTableCheckFrequency)
+        {
+            logger.debug("TWCS expired check sufficiently far in the past, checking for fully expired SSTables");
+            expired = CompactionController.getFullyExpiredSSTables(cfs, uncompacting, cfs.getOverlappingLiveSSTables(uncompacting), gcBefore);
+            lastExpiredCheck = System.currentTimeMillis();
+        }
+        else
+        {
+            logger.debug("TWCS skipping check for fully expired SSTables");
+        }
+
+        Set<SSTableReader> candidates = Sets.newHashSet(filterSuspectSSTables(uncompacting));
+
+        List<SSTableReader> compactionCandidates = new ArrayList<>(getNextNonExpiredSSTables(Sets.difference(candidates, expired), gcBefore));
+        if (!expired.isEmpty())
+        {
+            logger.debug("Including expired sstables: {}", expired);
+            compactionCandidates.addAll(expired);
+        }
+
+        return compactionCandidates;
+    }
+
+    private List<SSTableReader> getNextNonExpiredSSTables(Iterable<SSTableReader> nonExpiringSSTables, final int gcBefore)
+    {
+        List<SSTableReader> mostInteresting = getCompactionCandidates(nonExpiringSSTables);
+
+        if (mostInteresting != null)
+        {
+            return mostInteresting;
+        }
+
+        // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
+        // ratio is greater than threshold.
+        List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
+        for (SSTableReader sstable : nonExpiringSSTables)
+        {
+            if (worthDroppingTombstones(sstable, gcBefore))
+                sstablesWithTombstones.add(sstable);
+        }
+        if (sstablesWithTombstones.isEmpty())
+            return Collections.emptyList();
+
+        return Collections.singletonList(Collections.min(sstablesWithTombstones, new SSTableReader.SizeComparator()));
+    }
+
+    private List<SSTableReader> getCompactionCandidates(Iterable<SSTableReader> candidateSSTables)
+    {
+        Pair<HashMultimap<Long, SSTableReader>, Long> buckets = getBuckets(candidateSSTables, options.sstableWindowUnit, options.sstableWindowSize, options.timestampResolution);
+        // Update the highest window seen, if necessary
+        if(buckets.right > this.highestWindowSeen)
+            this.highestWindowSeen = buckets.right;
+
+        updateEstimatedCompactionsByTasks(buckets.left);
+        List<SSTableReader> mostInteresting = newestBucket(buckets.left,
+                                                           cfs.getMinimumCompactionThreshold(),
+                                                           cfs.getMaximumCompactionThreshold(),
+                                                           options.sstableWindowUnit,
+                                                           options.sstableWindowSize,
+                                                           options.stcsOptions,
+                                                           this.highestWindowSeen);
+        if (!mostInteresting.isEmpty())
+            return mostInteresting;
+        return null;
+    }
+
+    @Override
+    public synchronized void addSSTable(SSTableReader sstable)
+    {
+        sstables.add(sstable);
+    }
+
+    @Override
+    public synchronized void removeSSTable(SSTableReader sstable)
+    {
+        sstables.remove(sstable);
+    }
+
+    /**
+     * Find the lowest and highest timestamps in a given timestamp/unit pair
+     * Returns milliseconds, caller should adjust accordingly
+     */
+    public static Pair<Long,Long> getWindowBoundsInMillis(TimeUnit windowTimeUnit, int windowTimeSize, long timestampInMillis)
+    {
+        long lowerTimestamp;
+        long upperTimestamp;
+        long timestampInSeconds = TimeUnit.SECONDS.convert(timestampInMillis, TimeUnit.MILLISECONDS);
+
+        switch(windowTimeUnit)
+        {
+            case MINUTES:
+                lowerTimestamp = timestampInSeconds - ((timestampInSeconds) % (60 * windowTimeSize));
+                upperTimestamp = (lowerTimestamp + (60L * (windowTimeSize - 1L))) + 59L;
+                break;
+            case HOURS:
+                lowerTimestamp = timestampInSeconds - ((timestampInSeconds) % (3600 * windowTimeSize));
+                upperTimestamp = (lowerTimestamp + (3600L * (windowTimeSize - 1L))) + 3599L;
+                break;
+            case DAYS:
+            default:
+                lowerTimestamp = timestampInSeconds - ((timestampInSeconds) % (86400 * windowTimeSize));
+                upperTimestamp = (lowerTimestamp + (86400L * (windowTimeSize - 1L))) + 86399L;
+                break;
+        }
+
+        return Pair.create(TimeUnit.MILLISECONDS.convert(lowerTimestamp, TimeUnit.SECONDS),
+                           TimeUnit.MILLISECONDS.convert(upperTimestamp, TimeUnit.SECONDS));
+
+    }
+
+    /**
+     * Group files with similar max timestamp into buckets.
+     *
+     * @param files pairs consisting of a file and its min timestamp
+     * @param sstableWindowUnit
+     * @param sstableWindowSize
+     * @param timestampResolution
+     * @return A pair, where the left element is the bucket representation (map of timestamp to sstablereader), and the right is the highest timestamp seen
+     */
+    @VisibleForTesting
+    static Pair<HashMultimap<Long, SSTableReader>, Long> getBuckets(Iterable<SSTableReader> files, TimeUnit sstableWindowUnit, int sstableWindowSize, TimeUnit timestampResolution)
+    {
+        HashMultimap<Long, SSTableReader> buckets = HashMultimap.create();
+
+        long maxTimestamp = 0;
+        // Create hash map to represent buckets
+        // For each sstable, add sstable to the time bucket
+        // Where the bucket is the file's max timestamp rounded to the nearest window bucket
+        for (SSTableReader f : files)
+        {
+            assert TimeWindowCompactionStrategyOptions.validTimestampTimeUnits.contains(timestampResolution);
+            long tStamp = TimeUnit.MILLISECONDS.convert(f.getMaxTimestamp(), timestampResolution);
+            Pair<Long,Long> bounds = getWindowBoundsInMillis(sstableWindowUnit, sstableWindowSize, tStamp);
+            buckets.put(bounds.left, f);
+            if (bounds.left > maxTimestamp)
+                maxTimestamp = bounds.left;
+        }
+
+        logger.trace("buckets {}, max timestamp", buckets, maxTimestamp);
+        return Pair.create(buckets, maxTimestamp);
+    }
+
+    private void updateEstimatedCompactionsByTasks(HashMultimap<Long, SSTableReader> tasks)
+    {
+        int n = 0;
+        long now = this.highestWindowSeen;
+
+        for(Long key : tasks.keySet())
+        {
+            // For current window, make sure it's compactable
+            if (key.compareTo(now) >= 0 && tasks.get(key).size() >= cfs.getMinimumCompactionThreshold())
+                n++;
+            else if (key.compareTo(now) < 0 && tasks.get(key).size() >= 2)
+                n++;
+        }
+        this.estimatedRemainingTasks = n;
+    }
+
+
+    /**
+     * @param buckets list of buckets, sorted from newest to oldest, from which to return the newest bucket within thresholds.
+     * @param minThreshold minimum number of sstables in a bucket to qualify.
+     * @param maxThreshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this).
+     * @return a bucket (list) of sstables to compact.
+     */
+    @VisibleForTesting
+    static List<SSTableReader> newestBucket(HashMultimap<Long, SSTableReader> buckets, int minThreshold, int maxThreshold, TimeUnit sstableWindowUnit, int sstableWindowSize, SizeTieredCompactionStrategyOptions stcsOptions, long now)
+    {
+        // If the current bucket has at least minThreshold SSTables, choose that one.
+        // For any other bucket, at least 2 SSTables is enough.
+        // In any case, limit to maxThreshold SSTables.
+
+        TreeSet<Long> allKeys = new TreeSet<>(buckets.keySet());
+
+        Iterator<Long> it = allKeys.descendingIterator();
+        while(it.hasNext())
+        {
+            Long key = it.next();
+            Set<SSTableReader> bucket = buckets.get(key);
+            logger.trace("Key {}, now {}", key, now);
+            if (bucket.size() >= minThreshold && key >= now)
+            {
+                // If we're in the newest bucket, we'll use STCS to prioritize sstables
+                List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(bucket);
+                List<List<SSTableReader>> stcsBuckets = SizeTieredCompactionStrategy.getBuckets(pairs, stcsOptions.bucketHigh, stcsOptions.bucketLow, stcsOptions.minSSTableSize);
+                logger.debug("Using STCS compaction for first window of bucket: data files {} , options {}", pairs, stcsOptions);
+                List<SSTableReader> stcsInterestingBucket = SizeTieredCompactionStrategy.mostInterestingBucket(stcsBuckets, minThreshold, maxThreshold);
+
+                // If the tables in the current bucket aren't eligible in the STCS strategy, we'll skip it and look for other buckets
+                if (!stcsInterestingBucket.isEmpty())
+                    return stcsInterestingBucket;
+            }
+            else if (bucket.size() >= 2 && key < now)
+            {
+                logger.debug("bucket size {} >= 2 and not in current bucket, compacting what's here: {}", bucket.size(), bucket);
+                return trimToThreshold(bucket, maxThreshold);
+            }
+            else
+            {
+                logger.trace("No compaction necessary for bucket size {} , key {}, now {}", bucket.size(), key, now);
+            }
+        }
+        return Collections.<SSTableReader>emptyList();
+    }
+
+    /**
+     * @param bucket set of sstables
+     * @param maxThreshold maximum number of sstables in a single compaction task.
+     * @return A bucket trimmed to the maxThreshold newest sstables.
+     */
+    @VisibleForTesting
+    static List<SSTableReader> trimToThreshold(Set<SSTableReader> bucket, int maxThreshold)
+    {
+        List<SSTableReader> ssTableReaders = new ArrayList<>(bucket);
+
+        // Trim the largest sstables off the end to meet the maxThreshold
+        Collections.sort(ssTableReaders, new SSTableReader.SizeComparator());
+
+        return ImmutableList.copyOf(Iterables.limit(ssTableReaders, maxThreshold));
+    }
+
+    @Override
+    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
+    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore, boolean splitOutput)
+    {
+        Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
+        if (Iterables.isEmpty(filteredSSTables))
+            return null;
+        LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
+        if (txn == null)
+            return null;
+        return Collections.singleton(new CompactionTask(cfs, txn, gcBefore));
+    }
+
+    @Override
+    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
+    public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
+    {
+        assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
+
+        LifecycleTransaction modifier = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
+        if (modifier == null)
+        {
+            logger.debug("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
+            return null;
+        }
+
+        return new CompactionTask(cfs, modifier, gcBefore).setUserDefined(true);
+    }
+
+    public int getEstimatedRemainingTasks()
+    {
+        return this.estimatedRemainingTasks;
+    }
+
+    public long getMaxSSTableBytes()
+    {
+        return Long.MAX_VALUE;
+    }
+
+
+    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
+    {
+        Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
+        uncheckedOptions = TimeWindowCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
+
+        uncheckedOptions.remove(CompactionParams.Option.MIN_THRESHOLD.toString());
+        uncheckedOptions.remove(CompactionParams.Option.MAX_THRESHOLD.toString());
+
+        return uncheckedOptions;
+    }
+
+    public String toString()
+    {
+        return String.format("TimeWindowCompactionStrategy[%s/%s]",
+                cfs.getMinimumCompactionThreshold(),
+                cfs.getMaximumCompactionThreshold());
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyOptions.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyOptions.java
new file mode 100644
index 0000000..bcbdab6
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyOptions.java

@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import com.google.common.collect.ImmutableList;
+
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+public final class TimeWindowCompactionStrategyOptions
+{
+    private static final Logger logger = LoggerFactory.getLogger(TimeWindowCompactionStrategyOptions.class);
+
+    protected static final TimeUnit DEFAULT_TIMESTAMP_RESOLUTION = TimeUnit.MICROSECONDS;
+    protected static final TimeUnit DEFAULT_COMPACTION_WINDOW_UNIT = TimeUnit.DAYS;
+    protected static final int DEFAULT_COMPACTION_WINDOW_SIZE = 1;
+    protected static final int DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS = 60 * 10;
+
+    protected static final String TIMESTAMP_RESOLUTION_KEY = "timestamp_resolution";
+    protected static final String COMPACTION_WINDOW_UNIT_KEY = "compaction_window_unit";
+    protected static final String COMPACTION_WINDOW_SIZE_KEY = "compaction_window_size";
+    protected static final String EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY = "expired_sstable_check_frequency_seconds";
+
+    protected final int sstableWindowSize;
+    protected final TimeUnit sstableWindowUnit;
+    protected final TimeUnit timestampResolution;
+    protected final long expiredSSTableCheckFrequency;
+
+    SizeTieredCompactionStrategyOptions stcsOptions;
+
+    protected final static ImmutableList<TimeUnit> validTimestampTimeUnits = ImmutableList.of(TimeUnit.SECONDS, TimeUnit.MILLISECONDS, TimeUnit.MICROSECONDS, TimeUnit.NANOSECONDS);
+    protected final static ImmutableList<TimeUnit> validWindowTimeUnits = ImmutableList.of(TimeUnit.MINUTES, TimeUnit.HOURS, TimeUnit.DAYS);
+
+    public TimeWindowCompactionStrategyOptions(Map<String, String> options)
+    {
+        String optionValue = options.get(TIMESTAMP_RESOLUTION_KEY);
+        timestampResolution = optionValue == null ? DEFAULT_TIMESTAMP_RESOLUTION : TimeUnit.valueOf(optionValue);
+        if (timestampResolution != DEFAULT_TIMESTAMP_RESOLUTION)
+            logger.warn("Using a non-default timestamp_resolution {} - are you really doing inserts with USING TIMESTAMP <non_microsecond_timestamp> (or driver equivalent)?", timestampResolution.toString());
+
+        optionValue = options.get(COMPACTION_WINDOW_UNIT_KEY);
+        sstableWindowUnit = optionValue == null ? DEFAULT_COMPACTION_WINDOW_UNIT : TimeUnit.valueOf(optionValue);
+
+        optionValue = options.get(COMPACTION_WINDOW_SIZE_KEY);
+        sstableWindowSize = optionValue == null ? DEFAULT_COMPACTION_WINDOW_SIZE : Integer.parseInt(optionValue);
+
+        optionValue = options.get(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
+        expiredSSTableCheckFrequency = TimeUnit.MILLISECONDS.convert(optionValue == null ? DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS : Long.parseLong(optionValue), TimeUnit.SECONDS);
+
+        stcsOptions = new SizeTieredCompactionStrategyOptions(options);
+    }
+
+    public TimeWindowCompactionStrategyOptions()
+    {
+        sstableWindowUnit = DEFAULT_COMPACTION_WINDOW_UNIT;
+        timestampResolution = DEFAULT_TIMESTAMP_RESOLUTION;
+        sstableWindowSize = DEFAULT_COMPACTION_WINDOW_SIZE;
+        expiredSSTableCheckFrequency = TimeUnit.MILLISECONDS.convert(DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS, TimeUnit.SECONDS);
+        stcsOptions = new SizeTieredCompactionStrategyOptions();
+    }
+
+    public static Map<String, String> validateOptions(Map<String, String> options, Map<String, String> uncheckedOptions) throws  ConfigurationException
+    {
+        String optionValue = options.get(TIMESTAMP_RESOLUTION_KEY);
+        try
+        {
+            if (optionValue != null)
+                if (!validTimestampTimeUnits.contains(TimeUnit.valueOf(optionValue)))
+                    throw new ConfigurationException(String.format("%s is not valid for %s", optionValue, TIMESTAMP_RESOLUTION_KEY));
+        }
+        catch (IllegalArgumentException e)
+        {
+            throw new ConfigurationException(String.format("%s is not valid for %s", optionValue, TIMESTAMP_RESOLUTION_KEY));
+        }
+
+
+        optionValue = options.get(COMPACTION_WINDOW_UNIT_KEY);
+        try
+        {
+            if (optionValue != null)
+                if (!validWindowTimeUnits.contains(TimeUnit.valueOf(optionValue)))
+                    throw new ConfigurationException(String.format("%s is not valid for %s", optionValue, COMPACTION_WINDOW_UNIT_KEY));
+
+        }
+        catch (IllegalArgumentException e)
+        {
+            throw new ConfigurationException(String.format("%s is not valid for %s", optionValue, COMPACTION_WINDOW_UNIT_KEY), e);
+        }
+
+        optionValue = options.get(COMPACTION_WINDOW_SIZE_KEY);
+        try
+        {
+            int sstableWindowSize = optionValue == null ? DEFAULT_COMPACTION_WINDOW_SIZE : Integer.parseInt(optionValue);
+            if (sstableWindowSize < 1)
+            {
+                throw new ConfigurationException(String.format("%s must be greater than 1", DEFAULT_COMPACTION_WINDOW_SIZE, sstableWindowSize));
+            }
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", optionValue, DEFAULT_COMPACTION_WINDOW_SIZE), e);
+        }
+
+        optionValue = options.get(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
+        try
+        {
+            long expiredCheckFrequency = optionValue == null ? DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS : Long.parseLong(optionValue);
+            if (expiredCheckFrequency < 0)
+            {
+                throw new ConfigurationException(String.format("%s must not be negative, but was %d", EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, expiredCheckFrequency));
+             }
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", optionValue, EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY), e);
+        }
+
+        uncheckedOptions.remove(COMPACTION_WINDOW_SIZE_KEY);
+        uncheckedOptions.remove(COMPACTION_WINDOW_UNIT_KEY);
+        uncheckedOptions.remove(TIMESTAMP_RESOLUTION_KEY);
+        uncheckedOptions.remove(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY);
+
+        uncheckedOptions = SizeTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
+
+        return uncheckedOptions;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/Upgrader.java b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
index d6ef60e..77831a7 100644
--- a/src/java/org/apache/cassandra/db/compaction/Upgrader.java
+++ b/src/java/org/apache/cassandra/db/compaction/Upgrader.java

@@ -19,20 +19,20 @@
 
 import java.io.File;
 import java.util.*;
+import java.util.function.Predicate;
 
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
 import com.google.common.base.Throwables;
+import com.google.common.collect.Sets;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.utils.CloseableIterator;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.OutputHandler;
 import org.apache.cassandra.utils.UUIDGen;
 
@@ -43,9 +43,8 @@
     private final LifecycleTransaction transaction;
     private final File directory;
 
-    private final OperationType compactionType = OperationType.UPGRADE_SSTABLES;
     private final CompactionController controller;
-    private final AbstractCompactionStrategy strategy;
+    private final CompactionStrategyManager strategyManager;
     private final long estimatedRows;
 
     private final OutputHandler outputHandler;
@@ -61,43 +60,36 @@
 
         this.controller = new UpgradeController(cfs);
 
-        this.strategy = cfs.getCompactionStrategy();
-        long estimatedTotalKeys = Math.max(cfs.metadata.getMinIndexInterval(), SSTableReader.getApproximateKeyCount(Arrays.asList(this.sstable)));
-        long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(Arrays.asList(this.sstable)) / strategy.getMaxSSTableBytes());
+        this.strategyManager = cfs.getCompactionStrategyManager();
+        long estimatedTotalKeys = Math.max(cfs.metadata.params.minIndexInterval, SSTableReader.getApproximateKeyCount(Arrays.asList(this.sstable)));
+        long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(Arrays.asList(this.sstable)) / strategyManager.getMaxSSTableBytes());
         this.estimatedRows = (long) Math.ceil((double) estimatedTotalKeys / estimatedSSTables);
     }
 
     private SSTableWriter createCompactionWriter(long repairedAt)
     {
         MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.getComparator());
-
-        // Get the max timestamp of the precompacted sstables
-        // and adds generation of live ancestors
-        sstableMetadataCollector.addAncestor(sstable.descriptor.generation);
-        for (Integer i : sstable.getAncestors())
-        {
-            if (new File(sstable.descriptor.withGeneration(i).filenameFor(Component.DATA)).exists())
-                sstableMetadataCollector.addAncestor(i);
-        }
         sstableMetadataCollector.sstableLevel(sstable.getSSTableLevel());
-        return SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(directory)), estimatedRows, repairedAt, cfs.metadata, cfs.partitioner, sstableMetadataCollector);
+        return SSTableWriter.create(Descriptor.fromFilename(cfs.getSSTablePath(directory)),
+                                    estimatedRows,
+                                    repairedAt,
+                                    cfs.metadata,
+                                    sstableMetadataCollector,
+                                    SerializationHeader.make(cfs.metadata, Sets.newHashSet(sstable)),
+                                    transaction);
     }
 
-    public void upgrade()
+    public void upgrade(boolean keepOriginals)
     {
         outputHandler.output("Upgrading " + sstable);
-
-        try (SSTableRewriter writer = new SSTableRewriter(cfs, transaction, CompactionTask.getMaxDataAge(transaction.originals()), true);
-             AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(transaction.originals());
-             CloseableIterator<AbstractCompactedRow> iter = new CompactionIterable(compactionType, scanners.scanners, controller, DatabaseDescriptor.getSSTableFormat(), UUIDGen.getTimeUUID()).iterator())
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, keepOriginals, CompactionTask.getMaxDataAge(transaction.originals()), true);
+             AbstractCompactionStrategy.ScannerList scanners = strategyManager.getScanners(transaction.originals());
+             CompactionIterator iter = new CompactionIterator(transaction.opType(), scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
             writer.switchWriter(createCompactionWriter(sstable.getSSTableMetadata().repairedAt));
             while (iter.hasNext())
-            {
-                @SuppressWarnings("resource")
-                AbstractCompactedRow row = iter.next();
-                writer.append(row);
-            }
+                writer.append(iter.next());
 
             writer.finish();
             outputHandler.output("Upgrade of " + sstable + " complete.");
@@ -122,7 +114,7 @@
         @Override
         public Predicate<Long> getPurgeEvaluator(DecoratedKey key)
         {
-            return Predicates.alwaysFalse();
+            return time -> false;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java
index 42302fe..82eabd0 100644
--- a/src/java/org/apache/cassandra/db/compaction/Verifier.java
+++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java

@@ -17,23 +17,24 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
 import com.google.common.base.Throwables;
-import com.google.common.collect.Sets;
+
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
 import org.apache.cassandra.io.util.DataIntegrityMetadata;
 import org.apache.cassandra.io.util.DataIntegrityMetadata.FileDigestValidator;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.OutputHandler;
 import org.apache.cassandra.utils.UUIDGen;
 
@@ -43,6 +44,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.function.Predicate;
 
 public class Verifier implements Closeable
 {
@@ -58,7 +60,6 @@
     private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
 
     private int goodRows;
-    private int badRows;
 
     private final OutputHandler outputHandler;
     private FileDigestValidator validator;
@@ -73,7 +74,7 @@
         this.cfs = cfs;
         this.sstable = sstable;
         this.outputHandler = outputHandler;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata);
+        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata, sstable.descriptor.version, sstable.header);
 
         this.controller = new VerifyController(cfs);
 
@@ -89,22 +90,37 @@
         long rowStart = 0;
 
         outputHandler.output(String.format("Verifying %s (%s bytes)", sstable, dataFile.length()));
+        outputHandler.output(String.format("Deserializing sstable metadata for %s ", sstable));
+        try
+        {
+            EnumSet<MetadataType> types = EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS, MetadataType.HEADER);
+            Map<MetadataType, MetadataComponent> sstableMetadata = sstable.descriptor.getMetadataSerializer().deserialize(sstable.descriptor, types);
+            if (sstableMetadata.containsKey(MetadataType.VALIDATION) &&
+                !((ValidationMetadata)sstableMetadata.get(MetadataType.VALIDATION)).partitioner.equals(sstable.getPartitioner().getClass().getCanonicalName()))
+                throw new IOException("Partitioner does not match validation metadata");
+        }
+        catch (Throwable t)
+        {
+            outputHandler.debug(t.getMessage());
+            markAndThrow(false);
+        }
         outputHandler.output(String.format("Checking computed hash of %s ", sstable));
 
 
-        // Verify will use the adler32 Digest files, which works for both compressed and uncompressed sstables
+        // Verify will use the Digest files, which works for both compressed and uncompressed sstables
         try
         {
             validator = null;
 
-            if (new File(sstable.descriptor.filenameFor(Component.DIGEST)).exists())
+            if (sstable.descriptor.digestComponent != null &&
+                new File(sstable.descriptor.filenameFor(sstable.descriptor.digestComponent)).exists())
             {
                 validator = DataIntegrityMetadata.fileDigestValidator(sstable.descriptor);
                 validator.validate();
             }
             else
             {
-                outputHandler.output("Data digest missing, assuming extended verification of disk atoms");
+                outputHandler.output("Data digest missing, assuming extended verification of disk values");
                 extended = true;
             }
         }
@@ -121,14 +137,14 @@
         if ( !extended )
             return;
 
-        outputHandler.output("Extended Verify requested, proceeding to inspect atoms");
+        outputHandler.output("Extended Verify requested, proceeding to inspect values");
 
 
         try
         {
             ByteBuffer nextIndexKey = ByteBufferUtil.readWithShortLength(indexFile);
             {
-                long firstRowPositionFromIndex = rowIndexEntrySerializer.deserialize(indexFile, sstable.descriptor.version).position;
+                long firstRowPositionFromIndex = rowIndexEntrySerializer.deserialize(indexFile).position;
                 if (firstRowPositionFromIndex != 0)
                     markAndThrow();
             }
@@ -147,7 +163,7 @@
                 DecoratedKey key = null;
                 try
                 {
-                    key = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
+                    key = sstable.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
                 }
                 catch (Throwable th)
                 {
@@ -162,7 +178,7 @@
                     nextIndexKey = indexFile.isEOF() ? null : ByteBufferUtil.readWithShortLength(indexFile);
                     nextRowPositionFromIndex = indexFile.isEOF()
                                              ? dataFile.length()
-                                             : rowIndexEntrySerializer.deserialize(indexFile, sstable.descriptor.version).position;
+                                             : rowIndexEntrySerializer.deserialize(indexFile).position;
                 }
                 catch (Throwable th)
                 {
@@ -186,8 +202,11 @@
                     if (key == null || dataSize > dataFile.length())
                         markAndThrow();
 
-                    //mimic the scrub read path
-                    new SSTableIdentityIterator(sstable, dataFile, key, true);
+                    //mimic the scrub read path, intentionally unused
+                    try (UnfilteredRowIterator iterator = new SSTableIdentityIterator(sstable, dataFile, key))
+                    {
+                    }
+
                     if ( (prevKey != null && prevKey.compareTo(key) > 0) || !key.getKey().equals(currentIndexKey) || dataStart != dataStartFromIndex )
                         markAndThrow();
                     
@@ -200,7 +219,6 @@
                 }
                 catch (Throwable th)
                 {
-                    badRows++;
                     markAndThrow();
                 }
             }
@@ -231,8 +249,25 @@
 
     private void markAndThrow() throws IOException
     {
-        sstable.descriptor.getMetadataSerializer().mutateRepairedAt(sstable.descriptor, ActiveRepairService.UNREPAIRED_SSTABLE);
-        throw new CorruptSSTableException(new Exception(String.format("Invalid SSTable %s, please force repair", sstable.getFilename())), sstable.getFilename());
+        markAndThrow(true);
+    }
+
+    private void markAndThrow(boolean mutateRepaired) throws IOException
+    {
+        if (mutateRepaired) // if we are able to mutate repaired flag, an incremental repair should be enough
+        {
+            try
+            {
+                sstable.descriptor.getMetadataSerializer().mutateRepairedAt(sstable.descriptor, ActiveRepairService.UNREPAIRED_SSTABLE);
+                sstable.reloadSSTableMetadata();
+                cfs.getTracker().notifySSTableRepairedStatusChanged(Collections.singleton(sstable));
+            }
+            catch(IOException ioe)
+            {
+                outputHandler.output("Error mutating repairedAt for SSTable " +  sstable.getFilename() + ", as part of markAndThrow");
+            }
+        }
+        throw new CorruptSSTableException(new Exception(String.format("Invalid SSTable %s, please force %srepair", sstable.getFilename(), mutateRepaired ? "" : "a full ")), sstable.getFilename());
     }
 
     public CompactionInfo.Holder getVerifyInfo()
@@ -268,6 +303,11 @@
                 throw new RuntimeException();
             }
         }
+
+        public boolean isGlobal()
+        {
+            return false;
+        }
     }
 
     private static class VerifyController extends CompactionController
@@ -280,7 +320,7 @@
         @Override
         public Predicate<Long> getPurgeEvaluator(DecoratedKey key)
         {
-            return Predicates.alwaysFalse();
+            return time -> false;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/WrappingCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/WrappingCompactionStrategy.java
deleted file mode 100644
index 8555432..0000000
--- a/src/java/org/apache/cassandra/db/compaction/WrappingCompactionStrategy.java
+++ /dev/null

@@ -1,447 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.Callable;
-
-import com.google.common.collect.ImmutableMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.notifications.INotification;
-import org.apache.cassandra.notifications.INotificationConsumer;
-import org.apache.cassandra.notifications.SSTableAddedNotification;
-import org.apache.cassandra.notifications.SSTableDeletingNotification;
-import org.apache.cassandra.notifications.SSTableListChangedNotification;
-import org.apache.cassandra.notifications.SSTableRepairStatusChanged;
-
-public final class WrappingCompactionStrategy extends AbstractCompactionStrategy implements INotificationConsumer
-{
-    private static final Logger logger = LoggerFactory.getLogger(WrappingCompactionStrategy.class);
-    private volatile AbstractCompactionStrategy repaired;
-    private volatile AbstractCompactionStrategy unrepaired;
-    /*
-        We keep a copy of the schema compaction options and class here to be able to decide if we
-        should update the compaction strategy in maybeReloadCompactionStrategy() due to an ALTER.
-
-        If a user changes the local compaction strategy and then later ALTERs a compaction option,
-        we will use the new compaction options.
-     */
-    private Map<String, String> schemaCompactionOptions;
-    private Class<?> schemaCompactionStrategyClass;
-
-    public WrappingCompactionStrategy(ColumnFamilyStore cfs)
-    {
-        super(cfs, cfs.metadata.compactionStrategyOptions);
-        reloadCompactionStrategy(cfs.metadata);
-        cfs.getTracker().subscribe(this);
-        logger.trace("{} subscribed to the data tracker.", this);
-    }
-
-    @Override
-    public synchronized AbstractCompactionTask getNextBackgroundTask(int gcBefore)
-    {
-        if (!isEnabled())
-            return null;
-
-        if (repaired.getEstimatedRemainingTasks() > unrepaired.getEstimatedRemainingTasks())
-        {
-            AbstractCompactionTask repairedTask = repaired.getNextBackgroundTask(gcBefore);
-            if (repairedTask != null)
-                return repairedTask;
-            return unrepaired.getNextBackgroundTask(gcBefore);
-        }
-        else
-        {
-            AbstractCompactionTask unrepairedTask = unrepaired.getNextBackgroundTask(gcBefore);
-            if (unrepairedTask != null)
-                return unrepairedTask;
-            return repaired.getNextBackgroundTask(gcBefore);
-        }
-
-    }
-
-    @Override
-    public Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore, final boolean splitOutput)
-    {
-        // runWithCompactionsDisabled cancels active compactions and disables them, then we are able
-        // to make the repaired/unrepaired strategies mark their own sstables as compacting. Once the
-        // sstables are marked the compactions are re-enabled
-        return cfs.runWithCompactionsDisabled(new Callable<Collection<AbstractCompactionTask>>()
-        {
-            @Override
-            public Collection<AbstractCompactionTask> call() throws Exception
-            {
-                synchronized (WrappingCompactionStrategy.this)
-                {
-                    Collection<AbstractCompactionTask> repairedTasks = repaired.getMaximalTask(gcBefore, splitOutput);
-                    Collection<AbstractCompactionTask> unrepairedTasks = unrepaired.getMaximalTask(gcBefore, splitOutput);
-
-                    if (repairedTasks == null && unrepairedTasks == null)
-                        return null;
-
-                    if (repairedTasks == null)
-                        return unrepairedTasks;
-                    if (unrepairedTasks == null)
-                        return repairedTasks;
-
-                    List<AbstractCompactionTask> tasks = new ArrayList<>();
-                    tasks.addAll(repairedTasks);
-                    tasks.addAll(unrepairedTasks);
-                    return tasks;
-                }
-            }
-        }, false);
-    }
-
-    @Override
-    public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, final int gcBefore, long maxSSTableBytes)
-    {
-        assert txn.originals().size() > 0;
-        boolean repairedSSTables = txn.originals().iterator().next().isRepaired();
-        for (SSTableReader sstable : txn.originals())
-            if (repairedSSTables != sstable.isRepaired())
-                throw new RuntimeException("Can't mix repaired and unrepaired sstables in a compaction");
-
-        if (repairedSSTables)
-            return repaired.getCompactionTask(txn, gcBefore, maxSSTableBytes);
-        else
-            return unrepaired.getCompactionTask(txn, gcBefore, maxSSTableBytes);
-    }
-
-    @Override
-    public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
-    {
-        assert !sstables.isEmpty();
-        boolean userDefinedInRepaired = sstables.iterator().next().isRepaired();
-        for (SSTableReader sstable : sstables)
-        {
-            if (userDefinedInRepaired != sstable.isRepaired())
-            {
-                logger.error("You can't mix repaired and unrepaired sstables in a user defined compaction");
-                return null;
-            }
-        }
-        if (userDefinedInRepaired)
-            return repaired.getUserDefinedTask(sstables, gcBefore);
-        else
-            return unrepaired.getUserDefinedTask(sstables, gcBefore);
-    }
-
-    @Override
-    public synchronized int getEstimatedRemainingTasks()
-    {
-        assert repaired.getClass().equals(unrepaired.getClass());
-        return repaired.getEstimatedRemainingTasks() + unrepaired.getEstimatedRemainingTasks();
-    }
-
-    @Override
-    public synchronized long getMaxSSTableBytes()
-    {
-        assert repaired.getClass().equals(unrepaired.getClass());
-        return unrepaired.getMaxSSTableBytes();
-    }
-
-    public synchronized void maybeReloadCompactionStrategy(CFMetaData metadata)
-    {
-        // compare the old schema configuration to the new one, ignore any locally set changes.
-        if (metadata.compactionStrategyClass.equals(schemaCompactionStrategyClass) &&
-            metadata.compactionStrategyOptions.equals(schemaCompactionOptions))
-            return;
-        reloadCompactionStrategy(metadata);
-    }
-
-    public synchronized void reloadCompactionStrategy(CFMetaData metadata)
-    {
-        boolean disabledWithJMX = !enabled && shouldBeEnabled();
-        setStrategy(metadata.compactionStrategyClass, metadata.compactionStrategyOptions);
-        schemaCompactionOptions = ImmutableMap.copyOf(metadata.compactionStrategyOptions);
-        schemaCompactionStrategyClass = repaired.getClass();
-
-        if (disabledWithJMX || !shouldBeEnabled())
-            disable();
-        else
-            enable();
-        startup();
-    }
-
-    public synchronized int getUnleveledSSTables()
-    {
-        if (this.repaired instanceof LeveledCompactionStrategy && this.unrepaired instanceof LeveledCompactionStrategy)
-        {
-            return ((LeveledCompactionStrategy)repaired).getLevelSize(0) + ((LeveledCompactionStrategy)unrepaired).getLevelSize(0);
-        }
-        return 0;
-    }
-
-    public synchronized int[] getSSTableCountPerLevel()
-    {
-        if (this.repaired instanceof LeveledCompactionStrategy && this.unrepaired instanceof LeveledCompactionStrategy)
-        {
-            int [] repairedCountPerLevel = ((LeveledCompactionStrategy) repaired).getAllLevelSize();
-            int [] unrepairedCountPerLevel = ((LeveledCompactionStrategy) unrepaired).getAllLevelSize();
-            return sumArrays(repairedCountPerLevel, unrepairedCountPerLevel);
-        }
-        return null;
-    }
-
-    public static int [] sumArrays(int[] a, int [] b)
-    {
-        int [] res = new int[Math.max(a.length, b.length)];
-        for (int i = 0; i < res.length; i++)
-        {
-            if (i < a.length && i < b.length)
-                res[i] = a[i] + b[i];
-            else if (i < a.length)
-                res[i] = a[i];
-            else
-                res[i] = b[i];
-        }
-        return res;
-    }
-
-    @Override
-    public boolean shouldDefragment()
-    {
-        assert repaired.getClass().equals(unrepaired.getClass());
-        return repaired.shouldDefragment();
-    }
-
-    @Override
-    public String getName()
-    {
-        assert repaired.getClass().equals(unrepaired.getClass());
-        return repaired.getName();
-    }
-
-    @Override
-    public void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added)
-    {
-        throw new UnsupportedOperationException("Can't replace sstables in the wrapping compaction strategy");
-    }
-
-    @Override
-    public void addSSTable(SSTableReader added)
-    {
-        throw new UnsupportedOperationException("Can't add sstables to the wrapping compaction strategy");
-    }
-
-    @Override
-    public void removeSSTable(SSTableReader sstable)
-    {
-        throw new UnsupportedOperationException("Can't remove sstables from the wrapping compaction strategy");
-    }
-
-    public synchronized void handleNotification(INotification notification, Object sender)
-    {
-        if (notification instanceof SSTableAddedNotification)
-        {
-            SSTableAddedNotification flushedNotification = (SSTableAddedNotification) notification;
-            if (flushedNotification.added.isRepaired())
-                repaired.addSSTable(flushedNotification.added);
-            else
-                unrepaired.addSSTable(flushedNotification.added);
-        }
-        else if (notification instanceof SSTableListChangedNotification)
-        {
-            SSTableListChangedNotification listChangedNotification = (SSTableListChangedNotification) notification;
-            Set<SSTableReader> repairedRemoved = new HashSet<>();
-            Set<SSTableReader> repairedAdded = new HashSet<>();
-            Set<SSTableReader> unrepairedRemoved = new HashSet<>();
-            Set<SSTableReader> unrepairedAdded = new HashSet<>();
-
-            for (SSTableReader sstable : listChangedNotification.removed)
-            {
-                if (sstable.isRepaired())
-                    repairedRemoved.add(sstable);
-                else
-                    unrepairedRemoved.add(sstable);
-            }
-            for (SSTableReader sstable : listChangedNotification.added)
-            {
-                if (sstable.isRepaired())
-                    repairedAdded.add(sstable);
-                else
-                    unrepairedAdded.add(sstable);
-            }
-            if (!repairedRemoved.isEmpty())
-            {
-                repaired.replaceSSTables(repairedRemoved, repairedAdded);
-            }
-            else
-            {
-                for (SSTableReader sstable : repairedAdded)
-                    repaired.addSSTable(sstable);
-            }
-
-            if (!unrepairedRemoved.isEmpty())
-            {
-                unrepaired.replaceSSTables(unrepairedRemoved, unrepairedAdded);
-            }
-            else
-            {
-                for (SSTableReader sstable : unrepairedAdded)
-                    unrepaired.addSSTable(sstable);
-            }
-        }
-        else if (notification instanceof SSTableRepairStatusChanged)
-        {
-            for (SSTableReader sstable : ((SSTableRepairStatusChanged) notification).sstable)
-            {
-                if (sstable.isRepaired())
-                {
-                    unrepaired.removeSSTable(sstable);
-                    repaired.addSSTable(sstable);
-                }
-                else
-                {
-                    repaired.removeSSTable(sstable);
-                    unrepaired.addSSTable(sstable);
-                }
-            }
-        }
-        else if (notification instanceof SSTableDeletingNotification)
-        {
-            SSTableReader sstable = ((SSTableDeletingNotification)notification).deleting;
-            if (sstable.isRepaired())
-                repaired.removeSSTable(sstable);
-            else
-                unrepaired.removeSSTable(sstable);
-        }
-    }
-
-    @Override
-    public List<SSTableReader> filterSSTablesForReads(List<SSTableReader> sstables)
-    {
-        // todo: union of filtered sstables or intersection?
-        return unrepaired.filterSSTablesForReads(repaired.filterSSTablesForReads(sstables));
-    }
-
-    @Override
-    public synchronized void startup()
-    {
-        super.startup();
-        for (SSTableReader sstable : cfs.getSSTables())
-        {
-            if (sstable.openReason != SSTableReader.OpenReason.EARLY)
-            {
-                if (sstable.isRepaired())
-                    repaired.addSSTable(sstable);
-                else
-                    unrepaired.addSSTable(sstable);
-            }
-        }
-        repaired.startup();
-        unrepaired.startup();
-    }
-
-    @Override
-    public synchronized void shutdown()
-    {
-        super.shutdown();
-        repaired.shutdown();
-        unrepaired.shutdown();
-    }
-
-    @Override
-    public void enable()
-    {
-        if (repaired != null)
-            repaired.enable();
-        if (unrepaired != null)
-            unrepaired.enable();
-        // enable this last to make sure the strategies are ready to get calls.
-        super.enable();
-    }
-
-    @Override
-    public void disable()
-    {
-        // disable this first avoid asking disabled strategies for compaction tasks
-        super.disable();
-        if (repaired != null)
-            repaired.disable();
-        if (unrepaired != null)
-            unrepaired.disable();
-    }
-
-    @Override
-    @SuppressWarnings("resource")
-    public synchronized ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
-    {
-        List<SSTableReader> repairedSSTables = new ArrayList<>();
-        List<SSTableReader> unrepairedSSTables = new ArrayList<>();
-        for (SSTableReader sstable : sstables)
-            if (sstable.isRepaired())
-                repairedSSTables.add(sstable);
-            else
-                unrepairedSSTables.add(sstable);
-        ScannerList repairedScanners = repaired.getScanners(repairedSSTables, range);
-        ScannerList unrepairedScanners = unrepaired.getScanners(unrepairedSSTables, range);
-        List<ISSTableScanner> scanners = new ArrayList<>(repairedScanners.scanners.size() + unrepairedScanners.scanners.size());
-        scanners.addAll(repairedScanners.scanners);
-        scanners.addAll(unrepairedScanners.scanners);
-        return new ScannerList(scanners);
-    }
-
-    public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup)
-    {
-        return unrepaired.groupSSTablesForAntiCompaction(sstablesToGroup);
-    }
-
-    public List<AbstractCompactionStrategy> getWrappedStrategies()
-    {
-        return Arrays.asList(repaired, unrepaired);
-    }
-
-    public synchronized void setNewLocalCompactionStrategy(Class<? extends AbstractCompactionStrategy> compactionStrategyClass, Map<String, String> options)
-    {
-        logger.info("Switching local compaction strategy from {} to {} with options={}", repaired == null ? "null" : repaired.getClass(), compactionStrategyClass, options);
-        setStrategy(compactionStrategyClass, options);
-        if (shouldBeEnabled())
-            enable();
-        else
-            disable();
-        startup();
-    }
-
-    private void setStrategy(Class<? extends AbstractCompactionStrategy> compactionStrategyClass, Map<String, String> options)
-    {
-        if (repaired != null)
-            repaired.shutdown();
-        if (unrepaired != null)
-            unrepaired.shutdown();
-        repaired = CFMetaData.createCompactionStrategyInstance(compactionStrategyClass, cfs, options);
-        unrepaired = CFMetaData.createCompactionStrategyInstance(compactionStrategyClass, cfs, options);
-        this.options = ImmutableMap.copyOf(options);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
index 20c96d6..d33d72c 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java

@@ -18,12 +18,13 @@
 
 package org.apache.cassandra.db.compaction.writers;
 
-import java.util.List;
+import java.util.Collection;
 import java.util.Set;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.CompactionTask;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.SSTableRewriter;
@@ -38,29 +39,33 @@
 public abstract class CompactionAwareWriter extends Transactional.AbstractTransactional implements Transactional
 {
     protected final ColumnFamilyStore cfs;
+    protected final Directories directories;
     protected final Set<SSTableReader> nonExpiredSSTables;
     protected final long estimatedTotalKeys;
     protected final long maxAge;
     protected final long minRepairedAt;
-    protected final SSTableRewriter sstableWriter;
 
-    public CompactionAwareWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, boolean offline)
+    protected final LifecycleTransaction txn;
+    protected final SSTableRewriter sstableWriter;
+    private boolean isInitialized = false;
+
+    public CompactionAwareWriter(ColumnFamilyStore cfs,
+                                 Directories directories,
+                                 LifecycleTransaction txn,
+                                 Set<SSTableReader> nonExpiredSSTables,
+                                 boolean offline,
+                                 boolean keepOriginals)
     {
         this.cfs = cfs;
+        this.directories = directories;
         this.nonExpiredSSTables = nonExpiredSSTables;
         this.estimatedTotalKeys = SSTableReader.getApproximateKeyCount(nonExpiredSSTables);
         this.maxAge = CompactionTask.getMaxDataAge(nonExpiredSSTables);
         this.minRepairedAt = CompactionTask.getMinRepairedAt(nonExpiredSSTables);
-        this.sstableWriter = new SSTableRewriter(cfs, txn, maxAge, offline);
+        this.txn = txn;
+        this.sstableWriter = SSTableRewriter.construct(cfs, txn, keepOriginals, maxAge, offline);
     }
 
-    /**
-     * Writes a row in an implementation specific way
-     * @param row the row to append
-     * @return true if the row was written, false otherwise
-     */
-    public abstract boolean append(AbstractCompactedRow row);
-
     @Override
     protected Throwable doAbort(Throwable accumulate)
     {
@@ -84,7 +89,7 @@
      * @return all the written sstables sstables
      */
     @Override
-    public List<SSTableReader> finish()
+    public Collection<SSTableReader> finish()
     {
         super.finish();
         return sstableWriter.finished();
@@ -98,12 +103,46 @@
         return estimatedTotalKeys;
     }
 
+    public final boolean append(UnfilteredRowIterator partition)
+    {
+        maybeSwitchWriter(partition.partitionKey());
+        return realAppend(partition);
+    }
+
+    @Override
+    protected Throwable doPostCleanup(Throwable accumulate)
+    {
+        sstableWriter.close();
+        return super.doPostCleanup(accumulate);
+    }
+
+    protected abstract boolean realAppend(UnfilteredRowIterator partition);
+
+    /**
+     * Guaranteed to be called before the first call to realAppend.
+     * @param key
+     */
+    protected void maybeSwitchWriter(DecoratedKey key)
+    {
+        if (!isInitialized)
+            switchCompactionLocation(getDirectories().getWriteableLocation(cfs.getExpectedCompactedFileSize(nonExpiredSSTables, txn.opType())));
+        isInitialized = true;
+    }
+
+    /**
+     * Implementations of this method should finish the current sstable writer and start writing to this directory.
+     *
+     * Called once before starting to append and then whenever we see a need to start writing to another directory.
+     * @param directory
+     */
+    protected abstract void switchCompactionLocation(Directories.DataDirectory directory);
+
     /**
      * The directories we can write to
      */
     public Directories getDirectories()
     {
-        return cfs.directories;
+        return directories;
     }
 
     /**
@@ -117,4 +156,4 @@
 
         return directory;
     }
-}
\ No newline at end of file
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
index 6611b3f..8b90224 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java

@@ -18,22 +18,21 @@
 package org.apache.cassandra.db.compaction.writers;
 
 
-import java.io.File;
 import java.util.Set;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
-import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 
-
 /**
  * The default compaction writer - creates one output file in L0
  */
@@ -41,27 +40,35 @@
 {
     protected static final Logger logger = LoggerFactory.getLogger(DefaultCompactionWriter.class);
 
-    @SuppressWarnings("resource")
-    public DefaultCompactionWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, boolean offline, OperationType compactionType)
+    public DefaultCompactionWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
     {
-        super(cfs, txn, nonExpiredSSTables, offline);
-        logger.trace("Expected bloom filter size : {}", estimatedTotalKeys);
-        long expectedWriteSize = cfs.getExpectedCompactedFileSize(nonExpiredSSTables, compactionType);
-        File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(expectedWriteSize));
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                    estimatedTotalKeys,
-                                                    minRepairedAt,
-                                                    cfs.metadata,
-                                                    cfs.partitioner,
-                                                    new MetadataCollector(txn.originals(), cfs.metadata.comparator, 0));
-        sstableWriter.switchWriter(writer);
+        this(cfs, directories, txn, nonExpiredSSTables, false, false);
+    }
+
+    @SuppressWarnings("resource")
+    public DefaultCompactionWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, boolean offline, boolean keepOriginals)
+    {
+        super(cfs, directories, txn, nonExpiredSSTables, offline, keepOriginals);
     }
 
     @Override
-    public boolean append(AbstractCompactedRow row)
+    public boolean realAppend(UnfilteredRowIterator partition)
     {
-        return sstableWriter.append(row) != null;
+        return sstableWriter.append(partition) != null;
+    }
+
+    @Override
+    protected void switchCompactionLocation(Directories.DataDirectory directory)
+    {
+        @SuppressWarnings("resource")
+        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getSSTablePath(getDirectories().getLocationForDisk(directory))),
+                                                    estimatedTotalKeys,
+                                                    minRepairedAt,
+                                                    cfs.metadata,
+                                                    new MetadataCollector(txn.originals(), cfs.metadata.comparator, 0),
+                                                    SerializationHeader.make(cfs.metadata, nonExpiredSSTables),
+                                                    txn);
+        sstableWriter.switchWriter(writer);
     }
 
     @Override
@@ -69,4 +76,4 @@
     {
         return estimatedTotalKeys;
     }
-}
\ No newline at end of file
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
index 95d7a0c..6d191f8 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java

@@ -24,10 +24,11 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.LeveledManifest;
-import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -45,39 +46,37 @@
     private long partitionsWritten = 0;
     private long totalWrittenInLevel = 0;
     private int sstablesWritten = 0;
-    private final boolean skipAncestors;
+
+    public MajorLeveledCompactionWriter(ColumnFamilyStore cfs,
+                                        Directories directories,
+                                        LifecycleTransaction txn,
+                                        Set<SSTableReader> nonExpiredSSTables,
+                                        long maxSSTableSize)
+    {
+        this(cfs, directories, txn, nonExpiredSSTables, maxSSTableSize, false, false);
+    }
 
     @SuppressWarnings("resource")
-    public MajorLeveledCompactionWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, long maxSSTableSize, boolean offline, OperationType compactionType)
+    public MajorLeveledCompactionWriter(ColumnFamilyStore cfs,
+                                        Directories directories,
+                                        LifecycleTransaction txn,
+                                        Set<SSTableReader> nonExpiredSSTables,
+                                        long maxSSTableSize,
+                                        boolean offline,
+                                        boolean keepOriginals)
     {
-        super(cfs, txn, nonExpiredSSTables, offline);
+        super(cfs, directories, txn, nonExpiredSSTables, offline, keepOriginals);
         this.maxSSTableSize = maxSSTableSize;
         this.allSSTables = txn.originals();
-        expectedWriteSize = Math.min(maxSSTableSize, cfs.getExpectedCompactedFileSize(nonExpiredSSTables, compactionType));
-        long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(nonExpiredSSTables) / maxSSTableSize);
-        long keysPerSSTable = estimatedTotalKeys / estimatedSSTables;
-        File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(expectedWriteSize));
-        skipAncestors = estimatedSSTables * allSSTables.size() > 200000; // magic number, avoid storing too much ancestor information since allSSTables are ancestors to *all* resulting sstables
-
-        if (skipAncestors)
-            logger.warn("Many sstables involved in compaction, skipping storing ancestor information to avoid running out of memory");
-
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                    keysPerSSTable,
-                                                    minRepairedAt,
-                                                    cfs.metadata,
-                                                    cfs.partitioner,
-                                                    new MetadataCollector(allSSTables, cfs.metadata.comparator, currentLevel, skipAncestors));
-        sstableWriter.switchWriter(writer);
+        expectedWriteSize = Math.min(maxSSTableSize, cfs.getExpectedCompactedFileSize(nonExpiredSSTables, txn.opType()));
     }
 
     @Override
     @SuppressWarnings("resource")
-    public boolean append(AbstractCompactedRow row)
+    public boolean realAppend(UnfilteredRowIterator partition)
     {
         long posBefore = sstableWriter.currentWriter().getOnDiskFilePointer();
-        RowIndexEntry rie = sstableWriter.append(row);
+        RowIndexEntry rie = sstableWriter.append(partition);
         totalWrittenInLevel += sstableWriter.currentWriter().getOnDiskFilePointer() - posBefore;
         partitionsWritten++;
         if (sstableWriter.currentWriter().getOnDiskFilePointer() > maxSSTableSize)
@@ -89,18 +88,25 @@
             }
 
             averageEstimatedKeysPerSSTable = Math.round(((double) averageEstimatedKeysPerSSTable * sstablesWritten + partitionsWritten) / (sstablesWritten + 1));
-            File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(expectedWriteSize));
-            SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                        averageEstimatedKeysPerSSTable,
-                                                        minRepairedAt,
-                                                        cfs.metadata,
-                                                        cfs.partitioner,
-                                                        new MetadataCollector(allSSTables, cfs.metadata.comparator, currentLevel, skipAncestors));
-            sstableWriter.switchWriter(writer);
+            switchCompactionLocation(getWriteDirectory(expectedWriteSize));
             partitionsWritten = 0;
             sstablesWritten++;
         }
         return rie != null;
 
     }
-}
\ No newline at end of file
+
+    public void switchCompactionLocation(Directories.DataDirectory directory)
+    {
+        File sstableDirectory = getDirectories().getLocationForDisk(directory);
+        @SuppressWarnings("resource")
+        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getSSTablePath(sstableDirectory)),
+                                                    averageEstimatedKeysPerSSTable,
+                                                    minRepairedAt,
+                                                    cfs.metadata,
+                                                    new MetadataCollector(allSSTables, cfs.metadata.comparator, currentLevel),
+                                                    SerializationHeader.make(cfs.metadata, nonExpiredSSTables),
+                                                    txn);
+        sstableWriter.switchWriter(writer);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
index 2bae3b8..b206498 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java

@@ -17,13 +17,14 @@
  */
 package org.apache.cassandra.db.compaction.writers;
 
-import java.io.File;
 import java.util.Set;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -38,25 +39,34 @@
     private final long estimatedSSTables;
     private final Set<SSTableReader> allSSTables;
 
-    @SuppressWarnings("resource")
-    public MaxSSTableSizeWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, long maxSSTableSize, int level, boolean offline, OperationType compactionType)
+    public MaxSSTableSizeWriter(ColumnFamilyStore cfs,
+                                Directories directories,
+                                LifecycleTransaction txn,
+                                Set<SSTableReader> nonExpiredSSTables,
+                                long maxSSTableSize,
+                                int level)
     {
-        super(cfs, txn, nonExpiredSSTables, offline);
+        this(cfs, directories, txn, nonExpiredSSTables, maxSSTableSize, level, false, false);
+    }
+
+    @SuppressWarnings("resource")
+    public MaxSSTableSizeWriter(ColumnFamilyStore cfs,
+                                Directories directories,
+                                LifecycleTransaction txn,
+                                Set<SSTableReader> nonExpiredSSTables,
+                                long maxSSTableSize,
+                                int level,
+                                boolean offline,
+                                boolean keepOriginals)
+    {
+        super(cfs, directories, txn, nonExpiredSSTables, offline, keepOriginals);
         this.allSSTables = txn.originals();
         this.level = level;
         this.maxSSTableSize = maxSSTableSize;
-        long totalSize = getTotalWriteSize(nonExpiredSSTables, estimatedTotalKeys, cfs, compactionType);
+
+        long totalSize = getTotalWriteSize(nonExpiredSSTables, estimatedTotalKeys, cfs, txn.opType());
         expectedWriteSize = Math.min(maxSSTableSize, totalSize);
         estimatedSSTables = Math.max(1, totalSize / maxSSTableSize);
-        File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(expectedWriteSize));
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                    estimatedTotalKeys / estimatedSSTables,
-                                                    minRepairedAt,
-                                                    cfs.metadata,
-                                                    cfs.partitioner,
-                                                    new MetadataCollector(allSSTables, cfs.metadata.comparator, level));
-        sstableWriter.switchWriter(writer);
     }
 
     /**
@@ -73,22 +83,26 @@
     }
 
     @Override
-    public boolean append(AbstractCompactedRow row)
+    public boolean realAppend(UnfilteredRowIterator partition)
     {
-        RowIndexEntry rie = sstableWriter.append(row);
+        RowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getOnDiskFilePointer() > maxSSTableSize)
-        {
-            File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(expectedWriteSize));
-            @SuppressWarnings("resource")
-            SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                                estimatedTotalKeys / estimatedSSTables,
-                                                                minRepairedAt,
-                                                                cfs.metadata,
-                                                                cfs.partitioner,
-                                                                new MetadataCollector(allSSTables, cfs.metadata.comparator, level));
-
-            sstableWriter.switchWriter(writer);
-        }
+            switchCompactionLocation(getWriteDirectory(expectedWriteSize));
         return rie != null;
     }
+
+    public void switchCompactionLocation(Directories.DataDirectory location)
+    {
+        @SuppressWarnings("resource")
+        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getSSTablePath(getDirectories().getLocationForDisk(location))),
+                                                    estimatedTotalKeys / estimatedSSTables,
+                                                    minRepairedAt,
+                                                    cfs.metadata,
+                                                    new MetadataCollector(allSSTables, cfs.metadata.comparator, level),
+                                                    SerializationHeader.make(cfs.metadata, nonExpiredSSTables),
+                                                    txn);
+
+        sstableWriter.switchWriter(writer);
+
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
index ed07df9..796391c 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java

@@ -25,9 +25,10 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
-import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -51,17 +52,17 @@
     private long currentBytesToWrite;
     private int currentRatioIndex = 0;
 
-    public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, OperationType compactionType)
+    public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
     {
-        this(cfs, txn, nonExpiredSSTables, compactionType, DEFAULT_SMALLEST_SSTABLE_BYTES);
+        this(cfs, directories, txn, nonExpiredSSTables, DEFAULT_SMALLEST_SSTABLE_BYTES);
     }
 
     @SuppressWarnings("resource")
-    public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, OperationType compactionType, long smallestSSTable)
+    public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables, long smallestSSTable)
     {
-        super(cfs, txn, nonExpiredSSTables, false);
+        super(cfs, directories, txn, nonExpiredSSTables, false, false);
         this.allSSTables = txn.originals();
-        totalSize = cfs.getExpectedCompactedFileSize(nonExpiredSSTables, compactionType);
+        totalSize = cfs.getExpectedCompactedFileSize(nonExpiredSSTables, txn.opType());
         double[] potentialRatios = new double[20];
         double currentRatio = 1;
         for (int i = 0; i < potentialRatios.length; i++)
@@ -81,41 +82,38 @@
             }
         }
         ratios = Arrays.copyOfRange(potentialRatios, 0, noPointIndex);
-        File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(Math.round(totalSize * ratios[currentRatioIndex])));
         long currentPartitionsToWrite = Math.round(estimatedTotalKeys * ratios[currentRatioIndex]);
         currentBytesToWrite = Math.round(totalSize * ratios[currentRatioIndex]);
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                                            currentPartitionsToWrite,
-                                                                            minRepairedAt,
-                                                                            cfs.metadata,
-                                                                            cfs.partitioner,
-                                                                            new MetadataCollector(allSSTables, cfs.metadata.comparator, 0));
-
-        sstableWriter.switchWriter(writer);
+        switchCompactionLocation(getWriteDirectory(currentBytesToWrite));
         logger.trace("Ratios={}, expectedKeys = {}, totalSize = {}, currentPartitionsToWrite = {}, currentBytesToWrite = {}", ratios, estimatedTotalKeys, totalSize, currentPartitionsToWrite, currentBytesToWrite);
     }
 
     @Override
-    public boolean append(AbstractCompactedRow row)
+    public boolean realAppend(UnfilteredRowIterator partition)
     {
-        RowIndexEntry rie = sstableWriter.append(row);
+        RowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getOnDiskFilePointer() > currentBytesToWrite && currentRatioIndex < ratios.length - 1) // if we underestimate how many keys we have, the last sstable might get more than we expect
         {
             currentRatioIndex++;
             currentBytesToWrite = Math.round(totalSize * ratios[currentRatioIndex]);
-            long currentPartitionsToWrite = Math.round(ratios[currentRatioIndex] * estimatedTotalKeys);
-            File sstableDirectory = cfs.directories.getLocationForDisk(getWriteDirectory(Math.round(totalSize * ratios[currentRatioIndex])));
-            @SuppressWarnings("resource")
-            SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(sstableDirectory)),
-                                                                                currentPartitionsToWrite,
-                                                                                minRepairedAt,
-                                                                                cfs.metadata,
-                                                                                cfs.partitioner,
-                                                                                new MetadataCollector(allSSTables, cfs.metadata.comparator, 0));
-            sstableWriter.switchWriter(writer);
-            logger.trace("Switching writer, currentPartitionsToWrite = {}", currentPartitionsToWrite);
+            switchCompactionLocation(getWriteDirectory(Math.round(totalSize * ratios[currentRatioIndex])));
         }
         return rie != null;
     }
-}
\ No newline at end of file
+
+    public void switchCompactionLocation(Directories.DataDirectory location)
+    {
+        long currentPartitionsToWrite = Math.round(ratios[currentRatioIndex] * estimatedTotalKeys);
+        @SuppressWarnings("resource")
+        SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getSSTablePath(getDirectories().getLocationForDisk(location))),
+                                                    currentPartitionsToWrite,
+                                                    minRepairedAt,
+                                                    cfs.metadata,
+                                                    new MetadataCollector(allSSTables, cfs.metadata.comparator, 0),
+                                                    SerializationHeader.make(cfs.metadata, nonExpiredSSTables),
+                                                    txn);
+        logger.trace("Switching writer, currentPartitionsToWrite = {}", currentPartitionsToWrite);
+        sstableWriter.switchWriter(writer);
+
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/composites/AbstractCType.java b/src/java/org/apache/cassandra/db/composites/AbstractCType.java
deleted file mode 100644
index 2190c69..0000000
--- a/src/java/org/apache/cassandra/db/composites/AbstractCType.java
+++ /dev/null

@@ -1,395 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Comparator;
-
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.NativeCell;
-import org.apache.cassandra.db.RangeTombstone;
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.marshal.AbstractCompositeType;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
-
-public abstract class AbstractCType implements CType
-{
-    static final Comparator<Cell> rightNativeCell = new Comparator<Cell>()
-    {
-        public int compare(Cell o1, Cell o2)
-        {
-            return -((NativeCell) o2).compareTo(o1.name());
-        }
-    };
-
-    static final Comparator<Cell> neitherNativeCell = new Comparator<Cell>()
-    {
-        public int compare(Cell o1, Cell o2)
-        {
-            return compareUnsigned(o1.name(), o2.name());
-        }
-    };
-
-    // only one or the other of these will ever be used
-    static final Comparator<Object> asymmetricRightNativeCell = new Comparator<Object>()
-    {
-        public int compare(Object o1, Object o2)
-        {
-            return -((NativeCell) o2).compareTo((Composite) o1);
-        }
-    };
-
-    static final Comparator<Object> asymmetricNeitherNativeCell = new Comparator<Object>()
-    {
-        public int compare(Object o1, Object o2)
-        {
-            return compareUnsigned((Composite) o1, ((Cell) o2).name());
-        }
-    };
-
-    private final Comparator<Composite> reverseComparator;
-    private final Comparator<IndexInfo> indexComparator;
-    private final Comparator<IndexInfo> indexReverseComparator;
-
-    private final Serializer serializer;
-
-    private final IVersionedSerializer<ColumnSlice> sliceSerializer;
-    private final IVersionedSerializer<SliceQueryFilter> sliceQueryFilterSerializer;
-    private final DeletionInfo.Serializer deletionInfoSerializer;
-    private final RangeTombstone.Serializer rangeTombstoneSerializer;
-
-    protected final boolean isByteOrderComparable;
-
-    protected AbstractCType(boolean isByteOrderComparable)
-    {
-        reverseComparator = new Comparator<Composite>()
-        {
-            public int compare(Composite c1, Composite c2)
-            {
-                return AbstractCType.this.compare(c2, c1);
-            }
-        };
-        indexComparator = new Comparator<IndexInfo>()
-        {
-            public int compare(IndexInfo o1, IndexInfo o2)
-            {
-                return AbstractCType.this.compare(o1.lastName, o2.lastName);
-            }
-        };
-        indexReverseComparator = new Comparator<IndexInfo>()
-        {
-            public int compare(IndexInfo o1, IndexInfo o2)
-            {
-                return AbstractCType.this.compare(o1.firstName, o2.firstName);
-            }
-        };
-
-        serializer = new Serializer(this);
-
-        sliceSerializer = new ColumnSlice.Serializer(this);
-        sliceQueryFilterSerializer = new SliceQueryFilter.Serializer(this);
-        deletionInfoSerializer = new DeletionInfo.Serializer(this);
-        rangeTombstoneSerializer = new RangeTombstone.Serializer(this);
-        this.isByteOrderComparable = isByteOrderComparable;
-    }
-
-    protected static boolean isByteOrderComparable(Iterable<AbstractType<?>> types)
-    {
-        boolean isByteOrderComparable = true;
-        for (AbstractType<?> type : types)
-            isByteOrderComparable &= type.isByteOrderComparable();
-        return isByteOrderComparable;
-    }
-
-    static int compareUnsigned(Composite c1, Composite c2)
-    {
-        if (c1.isStatic() != c2.isStatic())
-        {
-            // Static sorts before non-static no matter what, except for empty which
-            // always sort first
-            if (c1.isEmpty())
-                return c2.isEmpty() ? 0 : -1;
-            if (c2.isEmpty())
-                return 1;
-            return c1.isStatic() ? -1 : 1;
-        }
-
-        int s1 = c1.size();
-        int s2 = c2.size();
-        int minSize = Math.min(s1, s2);
-
-        for (int i = 0; i < minSize; i++)
-        {
-            int cmp = ByteBufferUtil.compareUnsigned(c1.get(i), c2.get(i));
-            if (cmp != 0)
-                return cmp;
-        }
-
-        if (s1 == s2)
-            return c1.eoc().compareTo(c2.eoc());
-        return s1 < s2 ? c1.eoc().prefixComparisonResult : -c2.eoc().prefixComparisonResult;
-    }
-
-    public int compare(Composite c1, Composite c2)
-    {
-        if (c1.isStatic() != c2.isStatic())
-        {
-            // Static sorts before non-static no matter what, except for empty which
-            // always sort first
-            if (c1.isEmpty())
-                return c2.isEmpty() ? 0 : -1;
-            if (c2.isEmpty())
-                return 1;
-            return c1.isStatic() ? -1 : 1;
-        }
-
-        int s1 = c1.size();
-        int s2 = c2.size();
-        int minSize = Math.min(s1, s2);
-
-        for (int i = 0; i < minSize; i++)
-        {
-            int cmp = isByteOrderComparable
-                      ? ByteBufferUtil.compareUnsigned(c1.get(i), c2.get(i))
-                      : subtype(i).compare(c1.get(i), c2.get(i));
-            if (cmp != 0)
-                return cmp;
-        }
-
-        if (s1 == s2)
-            return c1.eoc().compareTo(c2.eoc());
-        return s1 < s2 ? c1.eoc().prefixComparisonResult : -c2.eoc().prefixComparisonResult;
-    }
-
-    protected Comparator<Cell> getByteOrderColumnComparator(boolean isRightNative)
-    {
-        if (isRightNative)
-            return rightNativeCell;
-        return neitherNativeCell;
-    }
-
-    protected Comparator<Object> getByteOrderAsymmetricColumnComparator(boolean isRightNative)
-    {
-        if (isRightNative)
-            return asymmetricRightNativeCell;
-        return asymmetricNeitherNativeCell;
-    }
-
-    public void validate(Composite name)
-    {
-        ByteBuffer previous = null;
-        for (int i = 0; i < name.size(); i++)
-        {
-            AbstractType<?> comparator = subtype(i);
-            ByteBuffer value = name.get(i);
-            comparator.validateCollectionMember(value, previous);
-            previous = value;
-        }
-    }
-
-    public boolean isCompatibleWith(CType previous)
-    {
-        if (this == previous)
-            return true;
-
-        // Extending with new components is fine, shrinking is not
-        if (size() < previous.size())
-            return false;
-
-        for (int i = 0; i < previous.size(); i++)
-        {
-            AbstractType<?> tprev = previous.subtype(i);
-            AbstractType<?> tnew = subtype(i);
-            if (!tnew.isCompatibleWith(tprev))
-                return false;
-        }
-        return true;
-    }
-
-    public String getString(Composite c)
-    {
-        StringBuilder sb = new StringBuilder();
-        for (int i = 0; i < c.size(); i++)
-        {
-            if (i > 0)
-                sb.append(":");
-            sb.append(AbstractCompositeType.escape(subtype(i).getString(c.get(i))));
-        }
-        switch (c.eoc())
-        {
-            case START:
-                sb.append(":_");
-                break;
-            case END:
-                sb.append(":!");
-                break;
-        }
-        return sb.toString();
-    }
-
-    public Composite make(Object... components)
-    {
-        if (components.length > size())
-            throw new IllegalArgumentException("Too many components, max is " + size());
-
-        CBuilder builder = builder();
-        for (int i = 0; i < components.length; i++)
-        {
-            Object obj = components[i];
-            if (obj instanceof ByteBuffer)
-                builder.add((ByteBuffer)obj);
-            else
-                builder.add(obj);
-        }
-        return builder.build();
-    }
-
-    public CType.Serializer serializer()
-    {
-        return serializer;
-    }
-
-    public Comparator<Composite> reverseComparator()
-    {
-        return reverseComparator;
-    }
-
-    public Comparator<IndexInfo> indexComparator()
-    {
-        return indexComparator;
-    }
-
-    public Comparator<IndexInfo> indexReverseComparator()
-    {
-        return indexReverseComparator;
-    }
-
-    public IVersionedSerializer<ColumnSlice> sliceSerializer()
-    {
-        return sliceSerializer;
-    }
-
-    public IVersionedSerializer<SliceQueryFilter> sliceQueryFilterSerializer()
-    {
-        return sliceQueryFilterSerializer;
-    }
-
-    public DeletionInfo.Serializer deletionInfoSerializer()
-    {
-        return deletionInfoSerializer;
-    }
-
-    public RangeTombstone.Serializer rangeTombstoneSerializer()
-    {
-        return rangeTombstoneSerializer;
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-
-        if (o == null)
-            return false;
-
-        if (!getClass().equals(o.getClass()))
-            return false;
-
-        CType c = (CType)o;
-        if (size() != c.size())
-            return false;
-
-        for (int i = 0; i < size(); i++)
-        {
-            if (!subtype(i).equals(c.subtype(i)))
-                return false;
-        }
-        return true;
-    }
-
-    @Override
-    public int hashCode()
-    {
-        int h = 31;
-        for (int i = 0; i < size(); i++)
-            h += subtype(i).hashCode();
-        return h + getClass().hashCode();
-    }
-
-    @Override
-    public String toString()
-    {
-        return asAbstractType().toString();
-    }
-
-    protected static ByteBuffer sliceBytes(ByteBuffer bb, int offs, int length)
-    {
-        ByteBuffer copy = bb.duplicate();
-        copy.position(offs);
-        copy.limit(offs + length);
-        return copy;
-    }
-
-    protected static void checkRemaining(ByteBuffer bb, int offs, int length)
-    {
-        if (offs + length > bb.limit())
-            throw new IllegalArgumentException(String.format("Not enough bytes. Offset: %d. Length: %d. Buffer size: %d",
-                                                             offs, length, bb.limit()));
-    }
-
-    private static class Serializer implements CType.Serializer
-    {
-        private final CType type;
-
-        public Serializer(CType type)
-        {
-            this.type = type;
-        }
-
-        public void serialize(Composite c, DataOutputPlus out) throws IOException
-        {
-            ByteBufferUtil.writeWithShortLength(c.toByteBuffer(), out);
-        }
-
-        public Composite deserialize(DataInput in) throws IOException
-        {
-            return type.fromByteBuffer(ByteBufferUtil.readWithShortLength(in));
-        }
-
-        public long serializedSize(Composite c, TypeSizes type)
-        {
-            return type.sizeofWithShortLength(c.toByteBuffer());
-        }
-
-        public void skip(DataInput in) throws IOException
-        {
-            ByteBufferUtil.skipShortLength(in);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/AbstractCellNameType.java b/src/java/org/apache/cassandra/db/composites/AbstractCellNameType.java
deleted file mode 100644
index c62f890..0000000
--- a/src/java/org/apache/cassandra/db/composites/AbstractCellNameType.java
+++ /dev/null

@@ -1,454 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.CQL3Row;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public abstract class AbstractCellNameType extends AbstractCType implements CellNameType
-{
-    final Comparator<Cell> columnComparator;
-    private final Comparator<Cell> columnReverseComparator;
-    final Comparator<Object> asymmetricComparator;
-    private final Comparator<OnDiskAtom> onDiskAtomComparator;
-
-    private final ISerializer<CellName> cellSerializer;
-    private final ColumnSerializer columnSerializer;
-    private final OnDiskAtom.Serializer onDiskAtomSerializer;
-    private final IVersionedSerializer<NamesQueryFilter> namesQueryFilterSerializer;
-    private final IVersionedSerializer<IDiskAtomFilter> diskAtomFilterSerializer;
-
-    protected AbstractCellNameType(boolean isByteOrderComparable)
-    {
-        super(isByteOrderComparable);
-        columnComparator = new Comparator<Cell>()
-        {
-            public int compare(Cell c1, Cell c2)
-            {
-                return AbstractCellNameType.this.compare(c1.name(), c2.name());
-            }
-        };
-        asymmetricComparator = new Comparator<Object>()
-        {
-            public int compare(Object c1, Object c2)
-            {
-                return AbstractCellNameType.this.compare((Composite) c1, ((Cell) c2).name());
-            }
-        };
-        columnReverseComparator = new Comparator<Cell>()
-        {
-            public int compare(Cell c1, Cell c2)
-            {
-                return AbstractCellNameType.this.compare(c2.name(), c1.name());
-            }
-        };
-        onDiskAtomComparator = new Comparator<OnDiskAtom>()
-        {
-            public int compare(OnDiskAtom c1, OnDiskAtom c2)
-            {
-                int comp = AbstractCellNameType.this.compare(c1.name(), c2.name());
-                if (comp != 0)
-                    return comp;
-
-                if (c1 instanceof RangeTombstone)
-                {
-                    if (c2 instanceof RangeTombstone)
-                    {
-                        RangeTombstone t1 = (RangeTombstone)c1;
-                        RangeTombstone t2 = (RangeTombstone)c2;
-                        int comp2 = AbstractCellNameType.this.compare(t1.max, t2.max);
-                        return comp2 == 0 ? t1.data.compareTo(t2.data) : comp2;
-                    }
-                    else
-                    {
-                        return -1;
-                    }
-                }
-                else
-                {
-                    return c2 instanceof RangeTombstone ? 1 : 0;
-                }
-            }
-        };
-
-        // A trivial wrapped over the composite serializer
-        cellSerializer = new ISerializer<CellName>()
-        {
-            public void serialize(CellName c, DataOutputPlus out) throws IOException
-            {
-                serializer().serialize(c, out);
-            }
-
-            public CellName deserialize(DataInput in) throws IOException
-            {
-                Composite ct = serializer().deserialize(in);
-                if (ct.isEmpty())
-                    throw ColumnSerializer.CorruptColumnException.create(in, ByteBufferUtil.EMPTY_BYTE_BUFFER);
-
-                assert ct instanceof CellName : ct;
-                return (CellName)ct;
-            }
-
-            public long serializedSize(CellName c, TypeSizes type)
-            {
-                return serializer().serializedSize(c, type);
-            }
-        };
-        columnSerializer = new ColumnSerializer(this);
-        onDiskAtomSerializer = new OnDiskAtom.Serializer(this);
-        namesQueryFilterSerializer = new NamesQueryFilter.Serializer(this);
-        diskAtomFilterSerializer = new IDiskAtomFilter.Serializer(this);
-    }
-
-    public final Comparator<Cell> columnComparator(boolean isRightNative)
-    {
-        if (!isByteOrderComparable)
-            return columnComparator;
-        return getByteOrderColumnComparator(isRightNative);
-    }
-
-    public final Comparator<Object> asymmetricColumnComparator(boolean isRightNative)
-    {
-        if (!isByteOrderComparable)
-            return asymmetricComparator;
-        return getByteOrderAsymmetricColumnComparator(isRightNative);
-    }
-
-    public Comparator<Cell> columnReverseComparator()
-    {
-        return columnReverseComparator;
-    }
-
-    public Comparator<OnDiskAtom> onDiskAtomComparator()
-    {
-        return onDiskAtomComparator;
-    }
-
-    public ISerializer<CellName> cellSerializer()
-    {
-        return cellSerializer;
-    }
-
-    public ColumnSerializer columnSerializer()
-    {
-        return columnSerializer;
-    }
-
-    public OnDiskAtom.Serializer onDiskAtomSerializer()
-    {
-        return onDiskAtomSerializer;
-    }
-
-    public IVersionedSerializer<NamesQueryFilter> namesQueryFilterSerializer()
-    {
-        return namesQueryFilterSerializer;
-    }
-
-    public IVersionedSerializer<IDiskAtomFilter> diskAtomFilterSerializer()
-    {
-        return diskAtomFilterSerializer;
-    }
-
-    public CellName cellFromByteBuffer(ByteBuffer bytes)
-    {
-        // we're not guaranteed to get a CellName back from fromByteBuffer(), so it's on the caller to guarantee this
-        return (CellName)fromByteBuffer(bytes);
-    }
-
-    public CellName create(Composite prefix, ColumnDefinition column, ByteBuffer collectionElement)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public CellName rowMarker(Composite prefix)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public Composite staticPrefix()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public boolean hasCollections()
-    {
-        return false;
-    }
-
-    public boolean supportCollections()
-    {
-        return false;
-    }
-
-    public ColumnToCollectionType collectionType()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public Composite make(Object... components)
-    {
-        return components.length == size() ? makeCellName(components) : super.make(components);
-    }
-
-    public CellName makeCellName(Object... components)
-    {
-        ByteBuffer[] rawComponents = new ByteBuffer[components.length];
-        for (int i = 0; i < components.length; i++)
-        {
-            Object c = components[i];
-            if (c instanceof ByteBuffer)
-            {
-                rawComponents[i] = (ByteBuffer)c;
-            }
-            else
-            {
-                AbstractType<?> type = subtype(i);
-                // If it's a collection type, we need to find the right collection and use the key comparator (since we're building a cell name)
-                if (type instanceof ColumnToCollectionType)
-                {
-                    assert i > 0;
-                    type = ((ColumnToCollectionType)type).defined.get(rawComponents[i-1]).nameComparator();
-                }
-                rawComponents[i] = ((AbstractType)type).decompose(c);
-            }
-        }
-        return makeCellName(rawComponents);
-    }
-
-    protected abstract CellName makeCellName(ByteBuffer[] components);
-
-    protected static CQL3Row.Builder makeDenseCQL3RowBuilder(final long now)
-    {
-        return new CQL3Row.Builder()
-        {
-            public CQL3Row.RowIterator group(Iterator<Cell> cells)
-            {
-                return new DenseRowIterator(cells, now);
-            }
-        };
-    }
-
-    private static class DenseRowIterator extends AbstractIterator<CQL3Row> implements CQL3Row.RowIterator
-    {
-        private final Iterator<Cell> cells;
-        private final long now;
-
-        public DenseRowIterator(Iterator<Cell> cells, long now)
-        {
-            this.cells = cells;
-            this.now = now;
-        }
-
-        public CQL3Row getStaticRow()
-        {
-            // There can't be static columns in dense tables
-            return null;
-        }
-
-        protected CQL3Row computeNext()
-        {
-            while (cells.hasNext())
-            {
-                final Cell cell = cells.next();
-                if (!cell.isLive(now))
-                    continue;
-
-                return new CQL3Row()
-                {
-                    public ByteBuffer getClusteringColumn(int i)
-                    {
-                        return cell.name().get(i);
-                    }
-
-                    public Cell getColumn(ColumnIdentifier name)
-                    {
-                        return cell;
-                    }
-
-                    public List<Cell> getMultiCellColumn(ColumnIdentifier name)
-                    {
-                        return null;
-                    }
-                };
-            }
-            return endOfData();
-        }
-    }
-
-    protected static CQL3Row.Builder makeSparseCQL3RowBuilder(final CFMetaData cfMetaData, final CellNameType type, final long now)
-    {
-        return new CQL3Row.Builder()
-        {
-            public CQL3Row.RowIterator group(Iterator<Cell> cells)
-            {
-                return new SparseRowIterator(cfMetaData, type, cells, now);
-            }
-        };
-    }
-
-    private static class SparseRowIterator extends AbstractIterator<CQL3Row> implements CQL3Row.RowIterator
-    {
-        private final CFMetaData cfMetaData;
-        private final CellNameType type;
-        private final Iterator<Cell> cells;
-        private final long now;
-        private final CQL3Row staticRow;
-
-        private Cell nextCell;
-        private CellName previous;
-        private CQL3RowOfSparse currentRow;
-
-        public SparseRowIterator(CFMetaData cfMetaData, CellNameType type, Iterator<Cell> cells, long now)
-        {
-            this.cfMetaData = cfMetaData;
-            this.type = type;
-            this.cells = cells;
-            this.now = now;
-            this.staticRow = hasNextCell() && nextCell.name().isStatic()
-                           ? computeNext()
-                           : null;
-        }
-
-        public CQL3Row getStaticRow()
-        {
-            return staticRow;
-        }
-
-        private boolean hasNextCell()
-        {
-            if (nextCell != null)
-                return true;
-
-            while (cells.hasNext())
-            {
-                Cell cell = cells.next();
-                if (!cell.isLive(now))
-                    continue;
-
-                nextCell = cell;
-                return true;
-            }
-            return false;
-        }
-
-        protected CQL3Row computeNext()
-        {
-            while (hasNextCell())
-            {
-                CQL3Row toReturn = null;
-                CellName current = nextCell.name();
-                if (currentRow == null || !current.isSameCQL3RowAs(type, previous))
-                {
-                    toReturn = currentRow;
-                    currentRow = new CQL3RowOfSparse(cfMetaData, current);
-                }
-                currentRow.add(nextCell);
-                nextCell = null;
-                previous = current;
-
-                if (toReturn != null)
-                    return toReturn;
-            }
-            if (currentRow != null)
-            {
-                CQL3Row toReturn = currentRow;
-                currentRow = null;
-                return toReturn;
-            }
-            return endOfData();
-        }
-    }
-
-    private static class CQL3RowOfSparse implements CQL3Row
-    {
-        private final CFMetaData cfMetaData;
-        private final CellName cell;
-        private Map<ColumnIdentifier, Cell> columns;
-        private Map<ColumnIdentifier, List<Cell>> collections;
-
-        CQL3RowOfSparse(CFMetaData metadata, CellName cell)
-        {
-            this.cfMetaData = metadata;
-            this.cell = cell;
-        }
-
-        public ByteBuffer getClusteringColumn(int i)
-        {
-            return cell.get(i);
-        }
-
-        void add(Cell cell)
-        {
-            CellName cellName = cell.name();
-            ColumnIdentifier columnName =  cellName.cql3ColumnName(cfMetaData);
-            if (cellName.isCollectionCell())
-            {
-                if (collections == null)
-                    collections = new HashMap<>();
-
-                List<Cell> values = collections.get(columnName);
-                if (values == null)
-                {
-                    values = new ArrayList<Cell>();
-                    collections.put(columnName, values);
-                }
-                values.add(cell);
-            }
-            else
-            {
-                if (columns == null)
-                    columns = new HashMap<>();
-                columns.put(columnName, cell);
-            }
-        }
-
-        public Cell getColumn(ColumnIdentifier name)
-        {
-            return columns == null ? null : columns.get(name);
-        }
-
-        public List<Cell> getMultiCellColumn(ColumnIdentifier name)
-        {
-            return collections == null ? null : collections.get(name);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/AbstractComposite.java b/src/java/org/apache/cassandra/db/composites/AbstractComposite.java
deleted file mode 100644
index 14fa16c..0000000
--- a/src/java/org/apache/cassandra/db/composites/AbstractComposite.java
+++ /dev/null

@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public abstract class AbstractComposite implements Composite
-{
-    public boolean isEmpty()
-    {
-        return size() == 0;
-    }
-
-    public boolean isStatic()
-    {
-        return false;
-    }
-
-    public EOC eoc()
-    {
-        return EOC.NONE;
-    }
-
-    public Composite start()
-    {
-        return withEOC(EOC.START);
-    }
-
-    public Composite end()
-    {
-        return withEOC(EOC.END);
-    }
-
-    public Composite withEOC(EOC newEoc)
-    {
-        // Note: CompositeBound overwrite this so we assume the EOC of this is NONE
-        switch (newEoc)
-        {
-            case START:
-                return BoundedComposite.startOf(this);
-            case END:
-                return BoundedComposite.endOf(this);
-            default:
-                return this;
-        }
-    }
-
-    public ColumnSlice slice()
-    {
-        return new ColumnSlice(start(), end());
-    }
-
-    public ByteBuffer toByteBuffer()
-    {
-        // This is the legacy format of composites.
-        // See org.apache.cassandra.db.marshal.CompositeType for details.
-        ByteBuffer result = ByteBuffer.allocate(dataSize() + 3 * size() + (isStatic() ? 2 : 0));
-        if (isStatic())
-            ByteBufferUtil.writeShortLength(result, CompositeType.STATIC_MARKER);
-
-        for (int i = 0; i < size(); i++)
-        {
-            ByteBuffer bb = get(i);
-            ByteBufferUtil.writeShortLength(result, bb.remaining());
-            result.put(bb.duplicate());
-            result.put((byte)0);
-        }
-        result.flip();
-        return result;
-    }
-
-    public int dataSize()
-    {
-        int size = 0;
-        for (int i = 0; i < size(); i++)
-            size += get(i).remaining();
-        return size;
-    }
-
-    public boolean isPrefixOf(CType type, Composite c)
-    {
-        if (size() > c.size() || isStatic() != c.isStatic())
-            return false;
-
-        for (int i = 0; i < size(); i++)
-        {
-            if (type.subtype(i).compare(get(i), c.get(i)) != 0)
-                return false;
-        }
-        return true;
-    }
-
-    @Override
-    public boolean equals(Object o)
-    {
-        if (this == o)
-            return true;
-
-        if(!(o instanceof Composite))
-            return false;
-
-        Composite c = (Composite)o;
-        if (size() != c.size() || isStatic() != c.isStatic())
-            return false;
-
-        for (int i = 0; i < size(); i++)
-        {
-            if (!get(i).equals(c.get(i)))
-                return false;
-        }
-        return eoc() == c.eoc();
-    }
-
-    @Override
-    public int hashCode()
-    {
-        int h = 31;
-        for (int i = 0; i < size(); i++)
-            h += get(i).hashCode();
-        return h + eoc().hashCode() + (isStatic() ? 1 : 0);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/AbstractCompoundCellNameType.java b/src/java/org/apache/cassandra/db/composites/AbstractCompoundCellNameType.java
deleted file mode 100644
index bf303a7..0000000
--- a/src/java/org/apache/cassandra/db/composites/AbstractCompoundCellNameType.java
+++ /dev/null

@@ -1,295 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
-
-public abstract class AbstractCompoundCellNameType extends AbstractCellNameType
-{
-    protected final CompoundCType clusteringType;
-    protected final CompoundCType fullType;
-
-    protected final int clusteringSize;
-    protected final int fullSize;
-
-    protected AbstractCompoundCellNameType(CompoundCType clusteringType, CompoundCType fullType)
-    {
-        super(isByteOrderComparable(fullType.types));
-        this.clusteringType = clusteringType;
-        this.fullType = fullType;
-
-        this.clusteringSize = clusteringType.size();
-        this.fullSize = fullType.size();
-    }
-
-    public int clusteringPrefixSize()
-    {
-        return clusteringSize;
-    }
-
-    public boolean isCompound()
-    {
-        return true;
-    }
-
-    public int size()
-    {
-        return fullSize;
-    }
-
-    public AbstractType<?> subtype(int i)
-    {
-        return fullType.subtype(i);
-    }
-
-    public CBuilder prefixBuilder()
-    {
-        return clusteringType.builder();
-    }
-
-    public CBuilder builder()
-    {
-        return new CompoundCType.CompoundCBuilder(this);
-    }
-
-    @Override
-    public Composite fromByteBuffer(ByteBuffer bytes)
-    {
-        if (!bytes.hasRemaining())
-            return Composites.EMPTY;
-
-        ByteBuffer[] elements = new ByteBuffer[fullSize];
-        int idx = bytes.position(), i = 0;
-        byte eoc = 0;
-
-        boolean isStatic = false;
-        if (CompositeType.isStaticName(bytes))
-        {
-            isStatic = true;
-            idx += 2;
-        }
-
-        while (idx < bytes.limit())
-        {
-            checkRemaining(bytes, idx, 2);
-            int length = bytes.getShort(idx) & 0xFFFF;
-            idx += 2;
-
-            checkRemaining(bytes, idx, length + 1);
-            elements[i++] = sliceBytes(bytes, idx, length);
-            idx += length;
-            eoc = bytes.get(idx++);
-        }
-
-        return makeWith(elements, i, Composite.EOC.from(eoc), isStatic);
-    }
-
-    public AbstractType<?> asAbstractType()
-    {
-        return CompositeType.getInstance(fullType.types);
-    }
-
-    public Deserializer newDeserializer(DataInput in)
-    {
-        return new CompositeDeserializer(this, in);
-    }
-
-    protected CellName makeCellName(ByteBuffer[] components)
-    {
-        return (CellName)makeWith(components, components.length, Composite.EOC.NONE, false);
-    }
-
-    protected abstract Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic);
-    protected abstract Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic);
-
-    private static class CompositeDeserializer implements CellNameType.Deserializer
-    {
-        private static byte[] EMPTY = new byte[0];
-
-        private final AbstractCompoundCellNameType type;
-        private final DataInput in;
-
-        private byte[] nextFull;
-        private int nextIdx;
-
-        private final ByteBuffer[] nextComponents;
-        private int nextSize;
-        private Composite.EOC nextEOC;
-        private boolean nextIsStatic;
-
-        public CompositeDeserializer(AbstractCompoundCellNameType type, DataInput in)
-        {
-            this.type = type;
-            this.in = in;
-            this.nextComponents = new ByteBuffer[type.size()];
-        }
-
-        public boolean hasNext() throws IOException
-        {
-            if (nextFull == null)
-                maybeReadNext();
-            return nextFull != EMPTY;
-        }
-
-        public boolean hasUnprocessed() throws IOException
-        {
-            return nextFull != null;
-        }
-
-        public int compareNextTo(Composite composite) throws IOException
-        {
-            maybeReadNext();
-
-            if (composite.isEmpty())
-                return nextFull == EMPTY ? 0 : 1;
-
-            if (nextFull == EMPTY)
-                return -1;
-
-            if (nextIsStatic != composite.isStatic())
-                return nextIsStatic ? -1 : 1;
-
-            ByteBuffer previous = null;
-            for (int i = 0; i < composite.size(); i++)
-            {
-                if (!hasComponent(i))
-                    return nextEOC == Composite.EOC.END ? 1 : -1;
-
-                AbstractType<?> comparator = type.subtype(i);
-                ByteBuffer value1 = nextComponents[i];
-                ByteBuffer value2 = composite.get(i);
-
-                int cmp = comparator.compareCollectionMembers(value1, value2, previous);
-                if (cmp != 0)
-                    return cmp;
-
-                previous = value1;
-            }
-
-            // If we have more component than composite
-            if (!allComponentsDeserialized() || composite.size() < nextSize)
-                return composite.eoc() == Composite.EOC.END ? -1 : 1;
-
-            // same size, check eoc
-            if (nextEOC != composite.eoc())
-            {
-                switch (nextEOC)
-                {
-                    case START: return -1;
-                    case END:   return 1;
-                    case NONE:  return composite.eoc() == Composite.EOC.START ? 1 : -1;
-                }
-            }
-
-            return 0;
-        }
-
-        private boolean hasComponent(int i)
-        {
-            while (i >= nextSize && deserializeOne())
-                continue;
-
-            return i < nextSize;
-        }
-
-        private int readShort()
-        {
-            return ((nextFull[nextIdx++] & 0xFF) << 8) | (nextFull[nextIdx++] & 0xFF);
-        }
-
-        private int peekShort()
-        {
-            return ((nextFull[nextIdx] & 0xFF) << 8) | (nextFull[nextIdx+1] & 0xFF);
-        }
-
-        private boolean deserializeOne()
-        {
-            if (allComponentsDeserialized())
-                return false;
-
-            int length = readShort();
-            ByteBuffer component = ByteBuffer.wrap(nextFull, nextIdx, length);
-            nextIdx += length;
-            nextComponents[nextSize++] = component;
-            nextEOC = Composite.EOC.from(nextFull[nextIdx++]);
-            return true;
-        }
-
-        private void deserializeAll()
-        {
-            while (deserializeOne())
-                continue;
-        }
-
-        private boolean allComponentsDeserialized()
-        {
-            return nextIdx >= nextFull.length;
-        }
-
-        private void maybeReadNext() throws IOException
-        {
-            if (nextFull != null)
-                return;
-
-            nextIdx = 0;
-            nextSize = 0;
-
-            int length = in.readShort() & 0xFFFF;
-            // Note that empty is ok because it marks the end of row
-            if (length == 0)
-            {
-                nextFull = EMPTY;
-                return;
-            }
-
-            nextFull = new byte[length];
-            in.readFully(nextFull);
-
-            // Is is a static?
-            nextIsStatic = false;
-            if (peekShort() == CompositeType.STATIC_MARKER)
-            {
-                nextIsStatic = true;
-                readShort(); // Skip the static marker
-            }
-        }
-
-        public Composite readNext() throws IOException
-        {
-            maybeReadNext();
-            if (nextFull == EMPTY)
-                return Composites.EMPTY;
-
-            deserializeAll();
-            Composite c = type.copyAndMakeWith(nextComponents, nextSize, nextEOC, nextIsStatic);
-            nextFull = null;
-            return c;
-        }
-
-        public void skipNext() throws IOException
-        {
-            maybeReadNext();
-            nextFull = null;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/AbstractSimpleCellNameType.java b/src/java/org/apache/cassandra/db/composites/AbstractSimpleCellNameType.java
deleted file mode 100644
index b3f4778..0000000
--- a/src/java/org/apache/cassandra/db/composites/AbstractSimpleCellNameType.java
+++ /dev/null

@@ -1,210 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Comparator;
-
-import net.nicoulaj.compilecommand.annotations.Inline;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.NativeCell;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public abstract class AbstractSimpleCellNameType extends AbstractCellNameType
-{
-    protected final AbstractType<?> type;
-
-    static final Comparator<Cell> rightNativeCell = new Comparator<Cell>()
-    {
-        public int compare(Cell o1, Cell o2)
-        {
-            return -((NativeCell) o2).compareToSimple(o1.name());
-        }
-    };
-
-    static final Comparator<Cell> neitherNativeCell = new Comparator<Cell>()
-    {
-        public int compare(Cell o1, Cell o2)
-        {
-            return compareUnsigned(o1.name(), o2.name());
-        }
-    };
-
-    // only one or the other of these will ever be used
-    static final Comparator<Object> asymmetricRightNativeCell = new Comparator<Object>()
-    {
-        public int compare(Object o1, Object o2)
-        {
-            return -((NativeCell) o2).compareToSimple((Composite) o1);
-        }
-    };
-
-    static final Comparator<Object> asymmetricNeitherNativeCell = new Comparator<Object>()
-    {
-        public int compare(Object o1, Object o2)
-        {
-            return compareUnsigned((Composite) o1, ((Cell) o2).name());
-        }
-    };
-
-    protected AbstractSimpleCellNameType(AbstractType<?> type)
-    {
-        super(type.isByteOrderComparable());
-        this.type = type;
-    }
-
-    public boolean isCompound()
-    {
-        return false;
-    }
-
-    public int size()
-    {
-        return 1;
-    }
-
-    @Inline
-    static int compareUnsigned(Composite c1, Composite c2)
-    {
-        ByteBuffer b1 = c1.toByteBuffer();
-        ByteBuffer b2 = c2.toByteBuffer();
-        return ByteBufferUtil.compareUnsigned(b1, b2);
-    }
-
-    public int compare(Composite c1, Composite c2)
-    {
-        if (isByteOrderComparable)
-            return compareUnsigned(c1, c2);
-
-        assert !(c1.isEmpty() | c2.isEmpty());
-        return type.compare(c1.get(0), c2.get(0));
-    }
-
-    protected Comparator<Cell> getByteOrderColumnComparator(boolean isRightNative)
-    {
-        if (isRightNative)
-            return rightNativeCell;
-        return neitherNativeCell;
-    }
-
-    protected Comparator<Object> getByteOrderAsymmetricColumnComparator(boolean isRightNative)
-    {
-        if (isRightNative)
-            return asymmetricRightNativeCell;
-        return asymmetricNeitherNativeCell;
-    }
-
-    public AbstractType<?> subtype(int i)
-    {
-        if (i != 0)
-            throw new IllegalArgumentException();
-        return type;
-    }
-
-    protected CellName makeCellName(ByteBuffer[] components)
-    {
-        assert components.length == 1;
-        return cellFromByteBuffer(components[0]);
-    }
-
-    public CBuilder builder()
-    {
-        return new SimpleCType.SimpleCBuilder(this);
-    }
-
-    public AbstractType<?> asAbstractType()
-    {
-        return type;
-    }
-
-    public Deserializer newDeserializer(DataInput in)
-    {
-        return new SimpleDeserializer(this, in);
-    }
-
-    private static class SimpleDeserializer implements CellNameType.Deserializer
-    {
-        private final AbstractSimpleCellNameType type;
-        private ByteBuffer next;
-        private final DataInput in;
-
-        public SimpleDeserializer(AbstractSimpleCellNameType type, DataInput in)
-        {
-            this.type = type;
-            this.in = in;
-        }
-
-        public boolean hasNext() throws IOException
-        {
-            if (next == null)
-                maybeReadNext();
-
-            return next.hasRemaining();
-        }
-
-        public boolean hasUnprocessed() throws IOException
-        {
-            return next != null;
-        }
-
-        public int compareNextTo(Composite composite) throws IOException
-        {
-            maybeReadNext();
-
-            if (composite.isEmpty())
-                return next.hasRemaining() ? 1 : 0;
-
-            return type.subtype(0).compare(next, composite.get(0));
-        }
-
-        private void maybeReadNext() throws IOException
-        {
-            if (next != null)
-                return;
-
-            int length = in.readShort() & 0xFFFF;
-            // Note that empty is ok because it marks the end of row
-            if (length == 0)
-            {
-                next = ByteBufferUtil.EMPTY_BYTE_BUFFER;
-                return;
-            }
-
-            byte[] b = new byte[length];
-            in.readFully(b);
-            next = ByteBuffer.wrap(b);
-        }
-
-        public Composite readNext() throws IOException
-        {
-            maybeReadNext();
-            Composite c = type.fromByteBuffer(next);
-            next = null;
-            return c;
-        }
-
-        public void skipNext() throws IOException
-        {
-            maybeReadNext();
-            next = null;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/BoundedComposite.java b/src/java/org/apache/cassandra/db/composites/BoundedComposite.java
deleted file mode 100644
index 7f596fe..0000000
--- a/src/java/org/apache/cassandra/db/composites/BoundedComposite.java
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ObjectSizes;
-
-/**
- * Wraps another Composite and adds an EOC byte to track whether this is a slice start or end.
- */
-public class BoundedComposite extends AbstractComposite
-{
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new BoundedComposite(null, false));
-
-    private final Composite wrapped;
-    private final boolean isStart;
-
-    private BoundedComposite(Composite wrapped, boolean isStart)
-    {
-        this.wrapped = wrapped;
-        this.isStart = isStart;
-    }
-
-    static Composite startOf(Composite c)
-    {
-        return new BoundedComposite(c, true);
-    }
-
-    static Composite endOf(Composite c)
-    {
-        return new BoundedComposite(c, false);
-    }
-
-    public int size()
-    {
-        return wrapped.size();
-    }
-
-    public boolean isStatic()
-    {
-        return wrapped.isStatic();
-    }
-
-    public ByteBuffer get(int i)
-    {
-        return wrapped.get(i);
-    }
-
-    @Override
-    public EOC eoc()
-    {
-        return isStart ? EOC.START : EOC.END;
-    }
-
-    @Override
-    public Composite withEOC(EOC eoc)
-    {
-        switch (eoc)
-        {
-            case START:
-                return isStart ? this : startOf(wrapped);
-            case END:
-                return isStart ? endOf(wrapped) : this;
-            default:
-                return wrapped;
-        }
-    }
-
-    @Override
-    public ByteBuffer toByteBuffer()
-    {
-        ByteBuffer bb = wrapped.toByteBuffer();
-        bb.put(bb.remaining() - 1, (byte)(isStart ? -1 : 1));
-        return bb;
-    }
-
-    public long unsharedHeapSize()
-    {
-        return EMPTY_SIZE + wrapped.unsharedHeapSize();
-    }
-
-    public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        return new BoundedComposite(wrapped.copy(cfm, allocator), isStart);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CBuilder.java b/src/java/org/apache/cassandra/db/composites/CBuilder.java
deleted file mode 100644
index 39035cb..0000000
--- a/src/java/org/apache/cassandra/db/composites/CBuilder.java
+++ /dev/null

@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-/**
- * A builder of Composite.
- */
-public interface CBuilder
-{
-    public int remainingCount();
-
-    public CBuilder add(ByteBuffer value);
-    public CBuilder add(Object value);
-
-    public Composite build();
-    public Composite buildWith(ByteBuffer value);
-    public Composite buildWith(List<ByteBuffer> values);
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CType.java b/src/java/org/apache/cassandra/db/composites/CType.java
deleted file mode 100644
index 7f70313..0000000
--- a/src/java/org/apache/cassandra/db/composites/CType.java
+++ /dev/null

@@ -1,141 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Comparator;
-
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.RangeTombstone;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.IVersionedSerializer;
-
-import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
-
-/**
- * A type for a Composite.
- *
- * There is essentially 2 types of Composite and such of CType:
- *   1. the "simple" ones, see SimpleCType.
- *   2. the "truly-composite" ones, see CompositeCType.
- *
- * API-wise, a CType is simply a collection of AbstractType with a few utility
- * methods.
- */
-public interface CType extends Comparator<Composite>
-{
-    /**
-     * Returns whether this is a "truly-composite" underneath.
-     */
-    public boolean isCompound();
-
-    /**
-     * The number of subtypes for this CType.
-     */
-    public int size();
-
-    int compare(Composite o1, Composite o2);
-
-    /**
-     * Gets a subtype of this CType.
-     */
-    public AbstractType<?> subtype(int i);
-
-    /**
-     * A builder of Composite.
-     */
-    public CBuilder builder();
-
-    /**
-     * Convenience method to build composites from their component.
-     *
-     * The arguments can be either ByteBuffer or actual objects of the type
-     * corresponding to their position.
-     */
-    public Composite make(Object... components);
-
-    /**
-     * Validates a composite.
-     */
-    public void validate(Composite name);
-
-    /**
-     * Converts a composite to a user-readable string.
-     */
-    public String getString(Composite c);
-
-    /**
-     * See AbstractType#isCompatibleWith.
-     */
-    public boolean isCompatibleWith(CType previous);
-
-    /**
-     * Returns a new CType that is equivalent to this CType but with
-     * one of the subtype replaced by the provided new type.
-     */
-    public CType setSubtype(int position, AbstractType<?> newType);
-
-    /**
-     * Deserialize a Composite from a ByteBuffer.
-     *
-     * This is meant for thrift to convert the fully serialized buffer we
-     * get from the clients to composites.
-     */
-    public Composite fromByteBuffer(ByteBuffer bb);
-
-    /**
-     * Returns a AbstractType corresponding to this CType for thrift sake.
-     *
-     * If the CType is a "simple" one, this just return the wrapped type, otherwise
-     * it returns the corresponding org.apache.cassandra.db.marshal.CompositeType.
-     *
-     * This is only meant to be use for backward compatibility (particularly for
-     * thrift) but it's not meant to be used internally.
-     */
-    public AbstractType<?> asAbstractType();
-
-
-    /**********************************************************/
-
-    /*
-     * Follows a number of per-CType instances for the Comparator and Serializer used throughout
-     * the code. The reason we need this is that we want the per-CType/per-CellNameType Composite/CellName
-     * serializers, which means the following instances have to depend on the type too.
-     */
-
-    public Comparator<Composite> reverseComparator();
-    public Comparator<IndexInfo> indexComparator();
-    public Comparator<IndexInfo> indexReverseComparator();
-
-    public Serializer serializer();
-
-    public IVersionedSerializer<ColumnSlice> sliceSerializer();
-    public IVersionedSerializer<SliceQueryFilter> sliceQueryFilterSerializer();
-    public DeletionInfo.Serializer deletionInfoSerializer();
-    public RangeTombstone.Serializer rangeTombstoneSerializer();
-
-    public interface Serializer extends ISerializer<Composite>
-    {
-        public void skip(DataInput in) throws IOException;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CellName.java b/src/java/org/apache/cassandra/db/composites/CellName.java
deleted file mode 100644
index 4d778d3..0000000
--- a/src/java/org/apache/cassandra/db/composites/CellName.java
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-
-/**
- * A CellName is a Composite, but for which, for the sake of CQL3, we
- * distinguish different parts: a CellName has first a number of clustering
- * components, followed by the CQL3 column name, and then possibly followed by
- * a collection element part.
- *
- * The clustering prefix can itself be composed of multiple component. It can
- * also be empty if the table has no clustering keys. In general, the CQL3
- * column name follows. However, some type of COMPACT STORAGE layout do not
- * store the CQL3 column name in the cell name and so this part can be null (we
- * call "dense" the cells whose name don't store the CQL3 column name).
- *
- * Lastly, if the cell is part of a CQL3 collection, we'll have a last
- * component (a UUID for lists, an element for sets and a key for maps).
- */
-public interface CellName extends Composite
-{
-    /**
-     * The number of clustering components.
-     *
-     * It can be 0 if the table has no clustering columns, and it can be
-     * equal to size() if the table is dense() (in which case cql3ColumnName()
-     * will be null).
-     */
-    public int clusteringSize();
-
-    /**
-     * The name of the CQL3 column this cell represents.
-     *
-     * Will be null for cells of "dense" tables.
-     * @param metadata
-     */
-    public ColumnIdentifier cql3ColumnName(CFMetaData metadata);
-
-    /**
-     * The value of the collection element, or null if the cell is not part
-     * of a collection (i.e. if !isCollectionCell()).
-     */
-    public ByteBuffer collectionElement();
-    public boolean isCollectionCell();
-
-    /**
-     * Whether this cell is part of the same CQL3 row as the other cell.
-     */
-    public boolean isSameCQL3RowAs(CellNameType type, CellName other);
-
-    // If cellnames were sharing some prefix components, this will break it, so
-    // we might want to try to do better.
-    @Override
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator);
-
-    public long unsharedHeapSizeExcludingData();
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CellNameType.java b/src/java/org/apache/cassandra/db/composites/CellNameType.java
deleted file mode 100644
index 6c89660..0000000
--- a/src/java/org/apache/cassandra/db/composites/CellNameType.java
+++ /dev/null

@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Comparator;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.CQL3Row;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnSerializer;
-import org.apache.cassandra.db.OnDiskAtom;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.IVersionedSerializer;
-
-/**
- * The type of CellNames.
- *
- * In the same way that a CellName is a Composite, a CellNameType is a CType, but
- * with a number of method specific to cell names.
- *
- * On top of the dichotomy simple/truly-composite of composites, cell names comes
- * in 2 variants: "dense" and "sparse". The sparse ones are CellName where one of
- * the component (the last or second-to-last for collections) is used to store the
- * CQL3 column name. Dense are those for which it's not the case.
- *
- * In other words, we have 4 types of CellName/CellNameType which correspond to the
- * 4 type of table layout that we need to distinguish:
- *   1. Simple (non-truly-composite) dense: this is the dynamic thrift CFs whose
- *      comparator is not composite.
- *   2. Composite dense: this is the dynamic thrift CFs with a CompositeType comparator.
- *   3. Simple (non-truly-composite) sparse: this is the thrift static CFs (that
- *      don't have a composite comparator).
- *   4. Composite sparse: this is the CQL3 layout (note that this is the only one that
- *      support collections).
- */
-public interface CellNameType extends CType
-{
-    /**
-     * Whether or not the cell names for this type are dense.
-     */
-    public boolean isDense();
-
-    /**
-     * The number of clustering columns for the table this is the type of.
-     */
-    public int clusteringPrefixSize();
-
-    /**
-     * A builder for the clustering prefix.
-     */
-    public CBuilder prefixBuilder();
-
-    /**
-     * The prefix to use for static columns.
-     *
-     * Note that create() methods below for creating CellName automatically handle static columns already
-     * for convenience, and so there is not need to pass this prefix for them. There is few other cases
-     * where we need the prefix directly however.
-     */
-    public Composite staticPrefix();
-
-    /**
-     * Whether or not there is some collections defined in this type.
-     */
-    public boolean hasCollections();
-
-    /**
-     * Whether or not this type layout support collections.
-     */
-    public boolean supportCollections();
-
-    /**
-     * The type of the collections (or null if the type does not have any non-frozen collections).
-     */
-    public ColumnToCollectionType collectionType();
-
-    /**
-     * Return the new type obtained by adding/updating to the new collection type for the provided column name
-     * to this type.
-     */
-    public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection);
-
-    /**
-     * Returns a new CellNameType that is equivalent to this one but with one
-     * of the subtype replaced by the provided new type.
-     */
-    @Override
-    public CellNameType setSubtype(int position, AbstractType<?> newType);
-
-    /**
-     * Creates a row marker for the CQL3 having the provided clustering prefix.
-     *
-     * Note that this is only valid for CQL3 tables (isCompound() and !isDense()) and should
-     * only be called for them.
-     */
-    public CellName rowMarker(Composite prefix);
-
-    /**
-     * Creates a new CellName given a clustering prefix and a CQL3 column.
-     *
-     * Note that for dense types, the column can be null as a shortcut for designing the only
-     * COMPACT_VALUE column of the table.
-     */
-    public CellName create(Composite prefix, ColumnDefinition column);
-
-    /**
-     * Creates a new collection CellName given a clustering prefix, a CQL3 column and the collection element.
-     */
-    public CellName create(Composite prefix, ColumnDefinition column, ByteBuffer collectionElement);
-
-    /**
-     * Convenience method to create cell names given its components.
-     *
-     * This is equivalent to CType#make() but return a full cell name (and thus
-     * require all the components of the name).
-     */
-    public CellName makeCellName(Object... components);
-
-    /**
-     * Deserialize a Composite from a ByteBuffer.
-     *
-     * This is equilvalent to CType#fromByteBuffer but assumes the buffer is a full cell
-     * name. This is meant for thrift to convert the fully serialized buffer we
-     * get from the clients.
-     */
-    public CellName cellFromByteBuffer(ByteBuffer bb);
-
-    /**
-     * Creates a new CQL3Row builder for this type. See CQL3Row for details.
-     */
-    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now);
-
-    // The two following methods are used to pass the declared regular column names (in CFMetaData)
-    // to the CellNameType. This is only used for optimization sake, see SparseCellNameType.
-    public void addCQL3Column(ColumnIdentifier id);
-    public void removeCQL3Column(ColumnIdentifier id);
-
-    /**
-     * Creates a new Deserializer. This is used by AtomDeserializer to do incremental and on-demand
-     * deserialization of the on disk atoms. See AtomDeserializer for details.
-     */
-    public Deserializer newDeserializer(DataInput in);
-
-    /*
-     * Same as in CType, follows a number of per-CellNameType instances for the Comparator and Serializer used
-     * throughout the code (those that require full CellName versus just Composite).
-     */
-
-    // Ultimately, those might be split into an IVersionedSerializer and an ISSTableSerializer
-    public ISerializer<CellName> cellSerializer();
-
-    public Comparator<Cell> columnComparator(boolean isRightNative);
-    public Comparator<Object> asymmetricColumnComparator(boolean isRightNative);
-    public Comparator<Cell> columnReverseComparator();
-    public Comparator<OnDiskAtom> onDiskAtomComparator();
-
-    public ColumnSerializer columnSerializer();
-    public OnDiskAtom.Serializer onDiskAtomSerializer();
-    public IVersionedSerializer<NamesQueryFilter> namesQueryFilterSerializer();
-    public IVersionedSerializer<IDiskAtomFilter> diskAtomFilterSerializer();
-
-    public interface Deserializer
-    {
-        /**
-         * Whether this deserializer is done or not, i.e. whether we're reached the end of row marker.
-         */
-        public boolean hasNext() throws IOException;
-
-        /**
-         * Whether or not some name has been read but not consumed by readNext.
-         */
-        public boolean hasUnprocessed() throws IOException;
-
-        /**
-         * Compare the next name to read to the provided Composite.
-         * This does not consume the next name.
-         */
-        public int compareNextTo(Composite composite) throws IOException;
-
-        /**
-         * Actually consume the next name and return it.
-         */
-        public Composite readNext() throws IOException;
-
-        /**
-         * Skip the next name (consuming it).
-         */
-        public void skipNext() throws IOException;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CellNames.java b/src/java/org/apache/cassandra/db/composites/CellNames.java
deleted file mode 100644
index f73f7a7..0000000
--- a/src/java/org/apache/cassandra/db/composites/CellNames.java
+++ /dev/null

@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-
-public abstract class CellNames
-{
-    private CellNames() {}
-
-    public static CellNameType fromAbstractType(AbstractType<?> type, boolean isDense)
-    {
-        if (isDense)
-        {
-            if (type instanceof CompositeType)
-            {
-                return new CompoundDenseCellNameType(((CompositeType)type).types);
-            }
-            else
-            {
-                return new SimpleDenseCellNameType(type);
-            }
-        }
-        else
-        {
-            if (type instanceof CompositeType)
-            {
-                List<AbstractType<?>> types = ((CompositeType)type).types;
-                if (types.get(types.size() - 1) instanceof ColumnToCollectionType)
-                {
-                    // We don't allow collection for super columns, so the "name" type *must* be UTF8
-                    assert types.get(types.size() - 2) instanceof UTF8Type;
-                    return new CompoundSparseCellNameType.WithCollection(types.subList(0, types.size() - 2), (ColumnToCollectionType)types.get(types.size() - 1));
-                }
-                else
-                {
-                    AbstractType<?> nameType = types.get(types.size() - 1);
-                    return new CompoundSparseCellNameType(types.subList(0, types.size() - 1), nameType);
-                }
-            }
-            else
-            {
-                assert type != null;
-                return new SimpleSparseCellNameType(type);
-            }
-        }
-    }
-
-    // Mainly for tests and a few cases where we know what we need and didn't wanted to pass the type around.
-    // Avoid in general, prefer the CellNameType methods.
-    public static CellName simpleDense(ByteBuffer bb)
-    {
-        assert bb.hasRemaining();
-        return new SimpleDenseCellName(bb);
-    }
-
-    public static CellName simpleSparse(ColumnIdentifier identifier)
-    {
-        return new SimpleSparseCellName(identifier);
-    }
-
-    // Mainly for tests and a few cases where we know what we need and didn't wanted to pass the type around
-    // Avoid in general, prefer the CellNameType methods.
-    public static CellName compositeDense(ByteBuffer... bbs)
-    {
-        return new CompoundDenseCellName(bbs);
-    }
-
-    public static CellName compositeSparse(ByteBuffer[] bbs, ColumnIdentifier identifier, boolean isStatic)
-    {
-        return new CompoundSparseCellName(bbs, identifier, isStatic);
-    }
-
-    public static CellName compositeSparseWithCollection(ByteBuffer[] bbs, ByteBuffer collectionElement, ColumnIdentifier identifier, boolean isStatic)
-    {
-        return new CompoundSparseCellName.WithCollection(bbs, identifier, collectionElement, isStatic);
-    }
-
-    public static String getColumnsString(CellNameType type, Iterable<Cell> columns)
-    {
-        StringBuilder builder = new StringBuilder();
-        for (Cell cell : columns)
-            builder.append(cell.getString(type)).append(",");
-        return builder.toString();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/Composite.java b/src/java/org/apache/cassandra/db/composites/Composite.java
deleted file mode 100644
index b15daef..0000000
--- a/src/java/org/apache/cassandra/db/composites/Composite.java
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.cache.IMeasurableMemory;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-
-/**
- * A composite value.
- *
- * This can be though as a list of ByteBuffer, except that this also include an
- * 'end-of-component' flag, that allow precise selection of composite ranges.
- *
- * We also make a difference between "true" composites and the "simple" ones. The
- * non-truly composite will have a size() == 1 but differs from true composites with
- * size() == 1 in the way they are stored. Most code shouldn't have to care about the
- * difference.
- */
-public interface Composite extends IMeasurableMemory
-{
-    public enum EOC
-    {
-        START(-1), NONE(-1), END(1);
-
-        // If composite p has this EOC and is a strict prefix of composite c, then this
-        // the result of the comparison of p and c. Basically, p sorts before c unless
-        // it's EOC is END.
-        public final int prefixComparisonResult;
-
-        private EOC(int prefixComparisonResult)
-        {
-            this.prefixComparisonResult = prefixComparisonResult;
-        }
-
-        public static EOC from(int eoc)
-        {
-            return eoc == 0 ? NONE : (eoc < 0 ? START : END);
-        }
-    }
-
-    public int size();
-    public boolean isEmpty();
-    public ByteBuffer get(int i);
-
-    public EOC eoc();
-    public Composite withEOC(EOC eoc);
-    public Composite start();
-    public Composite end();
-    public ColumnSlice slice();
-
-    public boolean isStatic();
-
-    public boolean isPrefixOf(CType type, Composite other);
-
-    public ByteBuffer toByteBuffer();
-
-    public int dataSize();
-    public Composite copy(CFMetaData cfm, AbstractAllocator allocator);
-}

diff --git a/src/java/org/apache/cassandra/db/composites/Composites.java b/src/java/org/apache/cassandra/db/composites/Composites.java
deleted file mode 100644
index fa0df48..0000000
--- a/src/java/org/apache/cassandra/db/composites/Composites.java
+++ /dev/null

@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Lists;
-
-public abstract class Composites
-{
-    private Composites() {}
-
-    public static final Composite EMPTY = new EmptyComposite();
-
-    /**
-     * Converts the specified <code>Composites</code> into <code>ByteBuffer</code>s.
-     *
-     * @param composites the composites to convert.
-     * @return the <code>ByteBuffer</code>s corresponding to the specified <code>Composites</code>.
-     */
-    public static List<ByteBuffer> toByteBuffers(List<Composite> composites)
-    {
-        return Lists.transform(composites, new Function<Composite, ByteBuffer>()
-        {
-            public ByteBuffer apply(Composite composite)
-            {
-                return composite.toByteBuffer();
-            }
-        });
-    }
-
-    static final CBuilder EMPTY_BUILDER = new CBuilder()
-    {
-        public int remainingCount() { return 0; }
-
-        public CBuilder add(ByteBuffer value) { throw new IllegalStateException(); }
-        public CBuilder add(Object value) { throw new IllegalStateException(); }
-
-        public Composite build() { return EMPTY; }
-        public Composite buildWith(ByteBuffer value) { throw new IllegalStateException(); }
-        public Composite buildWith(List<ByteBuffer> values) { throw new IllegalStateException(); }
-    };
-
-    private static class EmptyComposite implements Composite
-    {
-        public boolean isEmpty()
-        {
-            return true;
-        }
-
-        public int size()
-        {
-            return 0;
-        }
-
-        public ByteBuffer get(int i)
-        {
-            if (i > 0)
-                throw new IndexOutOfBoundsException();
-
-            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        }
-
-        public EOC eoc()
-        {
-            return EOC.NONE;
-        }
-
-        public Composite start()
-        {
-            // Note that SimpleCType/AbstractSimpleCellNameType compare method
-            // indirectly rely on the fact that EMPTY == EMPTY.start() == EMPTY.end()
-            // (or more precisely on the fact that the EOC is NONE for all of those).
-            return this;
-        }
-
-        public Composite end()
-        {
-            // Note that SimpleCType/AbstractSimpleCellNameType compare method
-            // indirectly rely on the fact that EMPTY == EMPTY.start() == EMPTY.end()
-            // (or more precisely on the fact that the EOC is NONE for all of those).
-            return this;
-        }
-
-        public Composite withEOC(EOC newEoc)
-        {
-            // Note that SimpleCType/AbstractSimpleCellNameType compare method
-            // indirectly rely on the fact that EMPTY == EMPTY.start() == EMPTY.end()
-            // (or more precisely on the fact that the EOC is NONE for all of those).
-            return this;
-        }
-
-        public ColumnSlice slice()
-        {
-            return ColumnSlice.ALL_COLUMNS;
-        }
-
-        public ByteBuffer toByteBuffer()
-        {
-            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        }
-
-        public boolean isStatic()
-        {
-            return false;
-        }
-
-        public int dataSize()
-        {
-            return 0;
-        }
-
-        public long unsharedHeapSize()
-        {
-            return 0;
-        }
-
-        public boolean isPrefixOf(CType type, Composite c)
-        {
-            return true;
-        }
-
-        public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
-        {
-            return this;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CompositesBuilder.java b/src/java/org/apache/cassandra/db/composites/CompositesBuilder.java
deleted file mode 100644
index 9a4da9e..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompositesBuilder.java
+++ /dev/null

@@ -1,299 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.apache.cassandra.db.composites.Composite.EOC;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import static java.util.Collections.singletonList;
-
-/**
- * Builder that allow to build multiple composites at the same time.
- */
-public final class CompositesBuilder
-{
-    /**
-     * The composite type.
-     */
-    private final CType ctype;
-
-    /**
-     * The elements of the composites
-     */
-    private final List<List<ByteBuffer>> elementsList = new ArrayList<>();
-
-    /**
-     * The number of elements that have been added.
-     */
-    private int size;
-
-    /**
-     * <code>true</code> if the composites have been build, <code>false</code> otherwise.
-     */
-    private boolean built;
-
-    /**
-     * <code>true</code> if the composites contains some <code>null</code> elements.
-     */
-    private boolean containsNull;
-
-    /**
-     * <code>true</code> if some empty collection have been added.
-     */
-    private boolean hasMissingElements;
-
-    /**
-     * <code>true</code> if the composites contains some <code>unset</code> elements.
-     */
-    private boolean containsUnset;
-
-    public CompositesBuilder(CType ctype)
-    {
-        this.ctype = ctype;
-    }
-
-    /**
-     * Adds the specified element to all the composites.
-     * <p>
-     * If this builder contains 2 composites: A-B and A-C a call to this method to add D will result in the composites:
-     * A-B-D and A-C-D.
-     * </p>
-     *
-     * @param value the value of the next element
-     * @return this <code>CompositeBuilder</code>
-     */
-    public CompositesBuilder addElementToAll(ByteBuffer value)
-    {
-        checkUpdateable();
-
-        if (isEmpty())
-            elementsList.add(new ArrayList<ByteBuffer>());
-
-        for (int i = 0, m = elementsList.size(); i < m; i++)
-        {
-            if (value == null)
-                containsNull = true;
-            if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
-                containsUnset = true;
-            elementsList.get(i).add(value);
-        }
-        size++;
-        return this;
-    }
-
-    /**
-     * Adds individually each of the specified elements to the end of all of the existing composites.
-     * <p>
-     * If this builder contains 2 composites: A-B and A-C a call to this method to add D and E will result in the 4
-     * composites: A-B-D, A-B-E, A-C-D and A-C-E.
-     * </p>
-     *
-     * @param values the elements to add
-     * @return this <code>CompositeBuilder</code>
-     */
-    public CompositesBuilder addEachElementToAll(List<ByteBuffer> values)
-    {
-        checkUpdateable();
-
-        if (isEmpty())
-            elementsList.add(new ArrayList<ByteBuffer>());
-
-        if (values.isEmpty())
-        {
-            hasMissingElements = true;
-        }
-        else
-        {
-            for (int i = 0, m = elementsList.size(); i < m; i++)
-            {
-                List<ByteBuffer> oldComposite = elementsList.remove(0);
-
-                for (int j = 0, n = values.size(); j < n; j++)
-                {
-                    List<ByteBuffer> newComposite = new ArrayList<>(oldComposite);
-                    elementsList.add(newComposite);
-
-                    ByteBuffer value = values.get(j);
-
-                    if (value == null)
-                        containsNull = true;
-                    if (value == ByteBufferUtil.UNSET_BYTE_BUFFER)
-                        containsUnset = true;
-
-                    newComposite.add(values.get(j));
-                }
-            }
-        }
-        size++;
-        return this;
-    }
-
-
-    /**
-     * Adds individually each of the specified list of elements to the end of all of the existing composites.
-     * <p>
-     * If this builder contains 2 composites: A-B and A-C a call to this method to add [[D, E], [F, G]] will result in the 4
-     * composites: A-B-D-E, A-B-F-G, A-C-D-E and A-C-F-G.
-     * </p>
-     *
-     * @param values the elements to add
-     * @return this <code>CompositeBuilder</code>
-     */
-    public CompositesBuilder addAllElementsToAll(List<List<ByteBuffer>> values)
-    {
-        checkUpdateable();
-
-        if (isEmpty())
-            elementsList.add(new ArrayList<ByteBuffer>());
-
-        if (values.isEmpty())
-        {
-            hasMissingElements = true;
-        }
-        else
-        {
-            for (int i = 0, m = elementsList.size(); i < m; i++)
-            {
-                List<ByteBuffer> oldComposite = elementsList.remove(0);
-
-                for (int j = 0, n = values.size(); j < n; j++)
-                {
-                    List<ByteBuffer> newComposite = new ArrayList<>(oldComposite);
-                    elementsList.add(newComposite);
-
-                    List<ByteBuffer> value = values.get(j);
-
-                    if (value.contains(null))
-                        containsNull = true;
-                    if (value.contains(ByteBufferUtil.UNSET_BYTE_BUFFER))
-                        containsUnset = true;
-
-                    newComposite.addAll(value);
-                }
-            }
-            size += values.get(0).size();
-        }
-        return this;
-    }
-
-    /**
-     * Returns the number of elements that can be added to the composites.
-     *
-     * @return the number of elements that can be added to the composites.
-     */
-    public int remainingCount()
-    {
-        return ctype.size() - size;
-    }
-
-    /**
-     * Checks if some elements can still be added to the composites.
-     *
-     * @return <code>true</code> if it is possible to add more elements to the composites, <code>false</code> otherwise.
-     */
-    public boolean hasRemaining()
-    {
-        return remainingCount() > 0;
-    }
-
-    /**
-     * Checks if this builder is empty.
-     *
-     * @return <code>true</code> if this builder is empty, <code>false</code> otherwise.
-     */
-    public boolean isEmpty()
-    {
-        return elementsList.isEmpty();
-    }
-
-    /**
-     * Checks if the composites contains null elements.
-     *
-     * @return <code>true</code> if the composites contains <code>null</code> elements, <code>false</code> otherwise.
-     */
-    public boolean containsNull()
-    {
-        return containsNull;
-    }
-
-    /**
-     * Checks if some empty list of values have been added
-     * @return <code>true</code> if the composites have some missing elements, <code>false</code> otherwise.
-     */
-    public boolean hasMissingElements()
-    {
-        return hasMissingElements;
-    }
-
-    /**
-     * Checks if the composites contains unset elements.
-     *
-     * @return <code>true</code> if the composites contains <code>unset</code> elements, <code>false</code> otherwise.
-     */
-    public boolean containsUnset()
-    {
-        return containsUnset;
-    }
-
-    /**
-     * Builds the <code>Composites</code>.
-     *
-     * @return the composites
-     */
-    public List<Composite> build()
-    {
-        return buildWithEOC(EOC.NONE);
-    }
-
-    /**
-     * Builds the <code>Composites</code> with the specified EOC.
-     *
-     * @return the composites
-     */
-    public List<Composite> buildWithEOC(EOC eoc)
-    {
-        built = true;
-
-        if (hasMissingElements)
-            return Collections.emptyList();
-
-        CBuilder builder = ctype.builder();
-
-        if (elementsList.isEmpty())
-            return singletonList(builder.build().withEOC(eoc));
-
-        List<Composite> list = new ArrayList<>();
-
-        for (int i = 0, m = elementsList.size(); i < m; i++)
-        {
-            List<ByteBuffer> elements = elementsList.get(i);
-            list.add(builder.buildWith(elements).withEOC(eoc));
-        }
-
-        return list;
-    }
-
-    private void checkUpdateable()
-    {
-        if (!hasRemaining() || built)
-            throw new IllegalStateException("this CompositesBuilder cannot be updated anymore");
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CompoundCType.java b/src/java/org/apache/cassandra/db/composites/CompoundCType.java
deleted file mode 100644
index 0458748..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompoundCType.java
+++ /dev/null

@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
-
-/**
- * A truly-composite CType.
- */
-public class CompoundCType extends AbstractCType
-{
-    final List<AbstractType<?>> types;
-
-    // It's up to the caller to pass a list that is effectively immutable
-    public CompoundCType(List<AbstractType<?>> types)
-    {
-        super(isByteOrderComparable(types));
-        this.types = types;
-    }
-
-    public boolean isCompound()
-    {
-        return true;
-    }
-
-    public int size()
-    {
-        return types.size();
-    }
-
-    public AbstractType<?> subtype(int i)
-    {
-        return types.get(i);
-    }
-
-    public Composite fromByteBuffer(ByteBuffer bytes)
-    {
-        if (!bytes.hasRemaining())
-            return Composites.EMPTY;
-
-        ByteBuffer[] elements = new ByteBuffer[size()];
-        int idx = bytes.position(), i = 0;
-        byte eoc = 0;
-
-        boolean isStatic = false;
-        if (CompositeType.isStaticName(bytes))
-        {
-            isStatic = true;
-            idx += 2;
-        }
-
-        while (idx < bytes.limit())
-        {
-            checkRemaining(bytes, idx, 2);
-            int length = bytes.getShort(idx) & 0xFFFF;
-            idx += 2;
-
-            checkRemaining(bytes, idx, length + 1);
-            elements[i++] = sliceBytes(bytes, idx, length);
-            idx += length;
-            eoc = bytes.get(idx++);
-        }
-        return new CompoundComposite(elements, i, isStatic).withEOC(Composite.EOC.from(eoc));
-    }
-
-    public CBuilder builder()
-    {
-        return new CompoundCBuilder(this);
-    }
-
-    public CompoundCType setSubtype(int position, AbstractType<?> newType)
-    {
-        List<AbstractType<?>> newTypes = new ArrayList<AbstractType<?>>(types);
-        newTypes.set(position, newType);
-        return new CompoundCType(newTypes);
-    }
-
-    public AbstractType<?> asAbstractType()
-    {
-        return CompositeType.getInstance(types);
-    }
-
-    public static class CompoundCBuilder implements CBuilder
-    {
-        private final CType type;
-        private final ByteBuffer[] values;
-        private int size;
-        private boolean built;
-
-        public CompoundCBuilder(CType type)
-        {
-            this.type = type;
-            this.values = new ByteBuffer[type.size()];
-        }
-
-        public int remainingCount()
-        {
-            return values.length - size;
-        }
-
-        public CBuilder add(ByteBuffer value)
-        {
-            if (isDone())
-                throw new IllegalStateException();
-            values[size++] = value;
-            return this;
-        }
-
-        public CBuilder add(Object value)
-        {
-            return add(((AbstractType)type.subtype(size)).decompose(value));
-        }
-
-        private boolean isDone()
-        {
-            return remainingCount() == 0 || built;
-        }
-
-        public Composite build()
-        {
-            if (size == 0)
-                return Composites.EMPTY;
-
-            // We don't allow to add more element to a builder that has been built so
-            // that we don't have to copy values.
-            built = true;
-
-            // If the builder is full and we're building a dense cell name, then we can
-            // directly allocate the CellName object as it's complete.
-            if (size == values.length && type instanceof CellNameType && ((CellNameType)type).isDense())
-                return new CompoundDenseCellName(values);
-            return new CompoundComposite(values, size, false);
-        }
-
-        public Composite buildWith(ByteBuffer value)
-        {
-            ByteBuffer[] newValues = Arrays.copyOf(values, values.length);
-            newValues[size] = value;
-            // Same as above
-            if (size+1 == newValues.length && type instanceof CellNameType && ((CellNameType)type).isDense())
-                return new CompoundDenseCellName(newValues);
-
-            return new CompoundComposite(newValues, size+1, false);
-        }
-
-        public Composite buildWith(List<ByteBuffer> newValues)
-        {
-            ByteBuffer[] buffers = Arrays.copyOf(values, values.length);
-            int newSize = size;
-            for (ByteBuffer value : newValues)
-                buffers[newSize++] = value;
-
-            if (newSize == buffers.length && type instanceof CellNameType && ((CellNameType)type).isDense())
-                return new CompoundDenseCellName(buffers);
-
-            return new CompoundComposite(buffers, newSize, false);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CompoundComposite.java b/src/java/org/apache/cassandra/db/composites/CompoundComposite.java
deleted file mode 100644
index 7a21b01..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompoundComposite.java
+++ /dev/null

@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-
-/**
- * A "truly-composite" Composite.
- */
-public class CompoundComposite extends AbstractComposite
-{
-    private static final long HEAP_SIZE = ObjectSizes.measure(new CompoundComposite(null, 0, false));
-
-    // We could use a List, but we'll create such object *a lot* and using a array+size is not
-    // all that harder, so we save the List object allocation.
-    final ByteBuffer[] elements;
-    final int size;
-    final boolean isStatic;
-
-    CompoundComposite(ByteBuffer[] elements, int size, boolean isStatic)
-    {
-        this.elements = elements;
-        this.size = size;
-        this.isStatic = isStatic;
-    }
-
-    public int size()
-    {
-        return size;
-    }
-
-    public ByteBuffer get(int i)
-    {
-        // Note: most consumer should validate that i is within bounds. However, for backward compatibility
-        // reasons, composite dense tables can have names that don't have all their component of the clustering
-        // columns, which may end up here with i > size(). For those calls, it's actually simpler to return null
-        // than to force the caller to special case.
-        return i >= size() ? null : elements[i];
-    }
-
-    @Override
-    public boolean isStatic()
-    {
-        return isStatic;
-    }
-
-    protected ByteBuffer[] elementsCopy(AbstractAllocator allocator)
-    {
-        ByteBuffer[] elementsCopy = new ByteBuffer[size];
-        for (int i = 0; i < size; i++)
-            elementsCopy[i] = allocator.clone(elements[i]);
-        return elementsCopy;
-    }
-
-    public long unsharedHeapSize()
-    {
-        return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements);
-    }
-
-    public long unsharedHeapSizeExcludingData()
-    {
-        return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements);
-    }
-
-    public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        return new CompoundComposite(elementsCopy(allocator), size, isStatic);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CompoundDenseCellName.java b/src/java/org/apache/cassandra/db/composites/CompoundDenseCellName.java
deleted file mode 100644
index 1f471a8..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompoundDenseCellName.java
+++ /dev/null

@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ObjectSizes;
-
-public class CompoundDenseCellName extends CompoundComposite implements CellName
-{
-
-    private static final long HEAP_SIZE = ObjectSizes.measure(new CompoundDenseCellName(new ByteBuffer[0]));
-
-    // Not meant to be used directly, you should use the CellNameType method instead
-    CompoundDenseCellName(ByteBuffer[] elements)
-    {
-        super(elements, elements.length, false);
-    }
-
-    CompoundDenseCellName(ByteBuffer[] elements, int size)
-    {
-        super(elements, size, false);
-    }
-
-    public int clusteringSize()
-    {
-        return size;
-    }
-
-    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
-    {
-        return null;
-    }
-
-    public ByteBuffer collectionElement()
-    {
-        return null;
-    }
-
-    public boolean isCollectionCell()
-    {
-        return false;
-    }
-
-    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
-    {
-        // Dense cell imply one cell by CQL row so no other cell will be the same row.
-        return type.compare(this, other) == 0;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements);
-    }
-
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        return new CompoundDenseCellName(elementsCopy(allocator));
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CompoundDenseCellNameType.java b/src/java/org/apache/cassandra/db/composites/CompoundDenseCellNameType.java
deleted file mode 100644
index 2e409fb..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompoundDenseCellNameType.java
+++ /dev/null

@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.CQL3Row;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.marshal.AbstractType;
-
-public class CompoundDenseCellNameType extends AbstractCompoundCellNameType
-{
-    public CompoundDenseCellNameType(List<AbstractType<?>> types)
-    {
-        this(new CompoundCType(types));
-    }
-
-    private CompoundDenseCellNameType(CompoundCType type)
-    {
-        super(type, type);
-    }
-
-    public CellNameType setSubtype(int position, AbstractType<?> newType)
-    {
-        if (position != 0)
-            throw new IllegalArgumentException();
-        return new SimpleDenseCellNameType(newType);
-    }
-
-    public boolean isDense()
-    {
-        return true;
-    }
-
-    public CellName create(Composite prefix, ColumnDefinition column)
-    {
-        // We ignore the column because it's just the COMPACT_VALUE name which is not store in the cell name (and it can be null anyway)
-        if (prefix instanceof CellName)
-            return (CellName)prefix;
-
-        // as noted below in makeWith(), compound dense cell names don't have to include all components
-        assert prefix instanceof CompoundComposite;
-        CompoundComposite lc = (CompoundComposite)prefix;
-        return new CompoundDenseCellName(lc.elements, lc.size);
-    }
-
-    protected Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
-    {
-        assert !isStatic;
-        // A composite dense table cell name don't have to have all the component set to qualify as a
-        // proper CellName (for backward compatibility reasons mostly), so always return a cellName
-        CompoundDenseCellName c = new CompoundDenseCellName(components, size);
-        return eoc != Composite.EOC.NONE ? c.withEOC(eoc) : c;
-    }
-
-    protected Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
-    {
-        return makeWith(Arrays.copyOfRange(components, 0, size), size, eoc, isStatic);
-    }
-
-    public void addCQL3Column(ColumnIdentifier id) {}
-    public void removeCQL3Column(ColumnIdentifier id) {}
-
-    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
-    {
-        return makeDenseCQL3RowBuilder(now);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/CompoundSparseCellName.java b/src/java/org/apache/cassandra/db/composites/CompoundSparseCellName.java
deleted file mode 100644
index 03af6d0..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompoundSparseCellName.java
+++ /dev/null

@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ObjectSizes;
-
-public class CompoundSparseCellName extends CompoundComposite implements CellName
-{
-    private static final ByteBuffer[] EMPTY_PREFIX = new ByteBuffer[0];
-
-    private static final long HEAP_SIZE = ObjectSizes.measure(new CompoundSparseCellName(null, false));
-
-    protected final ColumnIdentifier columnName;
-
-    // Not meant to be used directly, you should use the CellNameType method instead
-    CompoundSparseCellName(ColumnIdentifier columnName, boolean isStatic)
-    {
-        this(EMPTY_PREFIX, columnName, isStatic);
-    }
-
-    CompoundSparseCellName(ByteBuffer[] elements, ColumnIdentifier columnName, boolean isStatic)
-    {
-        this(elements, elements.length, columnName, isStatic);
-    }
-
-    CompoundSparseCellName(ByteBuffer[] elements, int size, ColumnIdentifier columnName, boolean isStatic)
-    {
-        super(elements, size, isStatic);
-        this.columnName = columnName;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements);
-    }
-
-    public int size()
-    {
-        return size + 1;
-    }
-
-    public ByteBuffer get(int i)
-    {
-        return i == size ? columnName.bytes : elements[i];
-    }
-
-    public int clusteringSize()
-    {
-        return size;
-    }
-
-    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
-    {
-        return columnName;
-    }
-
-    public ByteBuffer collectionElement()
-    {
-        return null;
-    }
-
-    public boolean isCollectionCell()
-    {
-        return false;
-    }
-
-    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
-    {
-        if (clusteringSize() != other.clusteringSize() || other.isStatic() != isStatic())
-            return false;
-
-        for (int i = 0; i < clusteringSize(); i++)
-        {
-            if (type.subtype(i).compare(elements[i], other.get(i)) != 0)
-                return false;
-        }
-        return true;
-    }
-
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        if (elements.length == 0)
-            return this;
-
-        // We don't copy columnName because it's interned in SparseCellNameType
-        return new CompoundSparseCellName(elementsCopy(allocator), columnName, isStatic());
-    }
-
-    public static class WithCollection extends CompoundSparseCellName
-    {
-        private static final long HEAP_SIZE = ObjectSizes.measure(new WithCollection(null, ByteBufferUtil.EMPTY_BYTE_BUFFER, false));
-
-        private final ByteBuffer collectionElement;
-
-        WithCollection(ColumnIdentifier columnName, ByteBuffer collectionElement, boolean isStatic)
-        {
-            this(EMPTY_PREFIX, columnName, collectionElement, isStatic);
-        }
-
-        WithCollection(ByteBuffer[] elements, ColumnIdentifier columnName, ByteBuffer collectionElement, boolean isStatic)
-        {
-            this(elements, elements.length, columnName, collectionElement, isStatic);
-        }
-
-        WithCollection(ByteBuffer[] elements, int size, ColumnIdentifier columnName, ByteBuffer collectionElement, boolean isStatic)
-        {
-            super(elements, size, columnName, isStatic);
-            this.collectionElement = collectionElement;
-        }
-
-        public int size()
-        {
-            return size + 2;
-        }
-
-        public ByteBuffer get(int i)
-        {
-            return i == size + 1 ? collectionElement : super.get(i);
-        }
-
-        @Override
-        public ByteBuffer collectionElement()
-        {
-            return collectionElement;
-        }
-
-        @Override
-        public boolean isCollectionCell()
-        {
-            return true;
-        }
-
-        @Override
-        public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-        {
-            // We don't copy columnName because it's interned in SparseCellNameType
-            return new CompoundSparseCellName.WithCollection(elements.length == 0 ? elements : elementsCopy(allocator), size, columnName, allocator.clone(collectionElement), isStatic());
-        }
-
-        @Override
-        public long unsharedHeapSize()
-        {
-            return HEAP_SIZE + ObjectSizes.sizeOnHeapOf(elements)
-                   + ObjectSizes.sizeOnHeapExcludingData(collectionElement);
-        }
-
-        @Override
-        public long unsharedHeapSizeExcludingData()
-        {
-            return HEAP_SIZE + ObjectSizes.sizeOnHeapExcludingData(elements)
-                   + ObjectSizes.sizeOnHeapExcludingData(collectionElement);
-        }
-    }
-}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/db/composites/CompoundSparseCellNameType.java b/src/java/org/apache/cassandra/db/composites/CompoundSparseCellNameType.java
deleted file mode 100644
index c88c6f4..0000000
--- a/src/java/org/apache/cassandra/db/composites/CompoundSparseCellNameType.java
+++ /dev/null

@@ -1,330 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.CQL3Row;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-
-public class CompoundSparseCellNameType extends AbstractCompoundCellNameType
-{
-    public static final ColumnIdentifier rowMarkerId = new ColumnIdentifier(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance);
-    private static final CellName rowMarkerNoPrefix = new CompoundSparseCellName(rowMarkerId, false);
-
-    // For CQL3 columns, this is always UTF8Type. However, for compatibility with super columns, we need to allow it to be non-UTF8.
-    private final AbstractType<?> columnNameType;
-    protected final Map<ByteBuffer, ColumnIdentifier> internedIds;
-
-    private final Composite staticPrefix;
-
-    public CompoundSparseCellNameType(List<AbstractType<?>> types)
-    {
-        this(types, UTF8Type.instance);
-    }
-
-    public CompoundSparseCellNameType(List<AbstractType<?>> types, AbstractType<?> columnNameType)
-    {
-        this(new CompoundCType(types), columnNameType);
-    }
-
-    private CompoundSparseCellNameType(CompoundCType clusteringType, AbstractType<?> columnNameType)
-    {
-        this(clusteringType, columnNameType, makeCType(clusteringType, columnNameType, null), new HashMap<ByteBuffer, ColumnIdentifier>());
-    }
-
-    private CompoundSparseCellNameType(CompoundCType clusteringType, AbstractType<?> columnNameType, CompoundCType fullType, Map<ByteBuffer, ColumnIdentifier> internedIds)
-    {
-        super(clusteringType, fullType);
-        this.columnNameType = columnNameType;
-        this.internedIds = internedIds;
-        this.staticPrefix = makeStaticPrefix(clusteringType.size());
-    }
-
-    private static Composite makeStaticPrefix(int size)
-    {
-        ByteBuffer[] elements = new ByteBuffer[size];
-        for (int i = 0; i < size; i++)
-            elements[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
-
-        return new CompoundComposite(elements, size, true)
-        {
-            @Override
-            public boolean isStatic()
-            {
-                return true;
-            }
-
-            @Override
-            public long unsharedHeapSize()
-            {
-                // We'll share this for a given type.
-                return 0;
-            }
-
-            @Override
-            public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
-            {
-                return this;
-            }
-        };
-    }
-
-    protected static CompoundCType makeCType(CompoundCType clusteringType, AbstractType<?> columnNameType, ColumnToCollectionType collectionType)
-    {
-        List<AbstractType<?>> allSubtypes = new ArrayList<AbstractType<?>>(clusteringType.size() + (collectionType == null ? 1 : 2));
-        for (int i = 0; i < clusteringType.size(); i++)
-            allSubtypes.add(clusteringType.subtype(i));
-        allSubtypes.add(columnNameType);
-        if (collectionType != null)
-            allSubtypes.add(collectionType);
-        return new CompoundCType(allSubtypes);
-    }
-
-    public CellNameType setSubtype(int position, AbstractType<?> newType)
-    {
-        if (position < clusteringSize)
-            return new CompoundSparseCellNameType(clusteringType.setSubtype(position, newType), columnNameType, fullType.setSubtype(position, newType), internedIds);
-
-        if (position == clusteringSize)
-            throw new IllegalArgumentException();
-
-        throw new IndexOutOfBoundsException();
-    }
-
-    @Override
-    public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection)
-    {
-        return new WithCollection(clusteringType, ColumnToCollectionType.getInstance(Collections.singletonMap(columnName.bytes, newCollection)), internedIds);
-    }
-
-    public boolean isDense()
-    {
-        return false;
-    }
-
-    public boolean supportCollections()
-    {
-        return true;
-    }
-
-    public Composite staticPrefix()
-    {
-        return staticPrefix;
-    }
-
-    public CellName create(Composite prefix, ColumnDefinition column)
-    {
-        return create(prefix, column.name, column.isStatic());
-    }
-
-    private CellName create(Composite prefix, ColumnIdentifier columnName, boolean isStatic)
-    {
-        if (isStatic)
-            prefix = staticPrefix();
-
-        assert prefix.size() == clusteringSize;
-
-        if (prefix.isEmpty())
-            return new CompoundSparseCellName(columnName, isStatic);
-
-        assert prefix instanceof CompoundComposite;
-        CompoundComposite lc = (CompoundComposite)prefix;
-        return new CompoundSparseCellName(lc.elements, clusteringSize, columnName, isStatic);
-    }
-
-    public CellName rowMarker(Composite prefix)
-    {
-        assert !prefix.isStatic(); // static columns don't really create rows, they shouldn't have a row marker
-        if (prefix.isEmpty())
-            return rowMarkerNoPrefix;
-
-        return create(prefix, rowMarkerId, false);
-    }
-
-    protected ColumnIdentifier idFor(ByteBuffer bb)
-    {
-        ColumnIdentifier id = internedIds.get(bb);
-        return id == null ? new ColumnIdentifier(bb, columnNameType) : id;
-    }
-
-    protected Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
-    {
-        if (size < clusteringSize + 1 || eoc != Composite.EOC.NONE)
-            return new CompoundComposite(components, size, isStatic).withEOC(eoc);
-
-        return new CompoundSparseCellName(components, clusteringSize, idFor(components[clusteringSize]), isStatic);
-    }
-
-    protected Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
-    {
-        if (size < clusteringSize + 1 || eoc != Composite.EOC.NONE)
-            return new CompoundComposite(Arrays.copyOfRange(components, 0, size), size, isStatic).withEOC(eoc);
-
-        ByteBuffer[] clusteringColumns = Arrays.copyOfRange(components, 0, clusteringSize);
-        return new CompoundSparseCellName(clusteringColumns, idFor(components[clusteringSize]), isStatic);
-    }
-
-    public void addCQL3Column(ColumnIdentifier id)
-    {
-        internedIds.put(id.bytes, id);
-    }
-
-    public void removeCQL3Column(ColumnIdentifier id)
-    {
-        internedIds.remove(id.bytes);
-    }
-
-    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
-    {
-        return makeSparseCQL3RowBuilder(metadata, this, now);
-    }
-
-    public static class WithCollection extends CompoundSparseCellNameType
-    {
-        private final ColumnToCollectionType collectionType;
-
-        public WithCollection(List<AbstractType<?>> types, ColumnToCollectionType collectionType)
-        {
-            this(new CompoundCType(types), collectionType);
-        }
-
-        WithCollection(CompoundCType clusteringType, ColumnToCollectionType collectionType)
-        {
-            this(clusteringType, collectionType, new HashMap<ByteBuffer, ColumnIdentifier>());
-        }
-
-        private WithCollection(CompoundCType clusteringType, ColumnToCollectionType collectionType, Map<ByteBuffer, ColumnIdentifier> internedIds)
-        {
-            this(clusteringType, makeCType(clusteringType, UTF8Type.instance, collectionType), collectionType, internedIds);
-        }
-
-        private WithCollection(CompoundCType clusteringType, CompoundCType fullCType, ColumnToCollectionType collectionType, Map<ByteBuffer, ColumnIdentifier> internedIds)
-        {
-            super(clusteringType, UTF8Type.instance, fullCType, internedIds);
-            this.collectionType = collectionType;
-        }
-
-        @Override
-        public CellNameType setSubtype(int position, AbstractType<?> newType)
-        {
-            if (position < clusteringSize)
-                return new WithCollection(clusteringType.setSubtype(position, newType), collectionType, internedIds);
-
-            throw position >= fullType.size() ? new IndexOutOfBoundsException() : new IllegalArgumentException();
-        }
-
-        @Override
-        public CellNameType addOrUpdateCollection(ColumnIdentifier columnName, CollectionType newCollection)
-        {
-            Map<ByteBuffer, CollectionType> newMap = new HashMap<>(collectionType.defined);
-            newMap.put(columnName.bytes, newCollection);
-            return new WithCollection(clusteringType, ColumnToCollectionType.getInstance(newMap), internedIds);
-        }
-
-        @Override
-        public CellName create(Composite prefix, ColumnDefinition column, ByteBuffer collectionElement)
-        {
-            if (column.isStatic())
-                prefix = staticPrefix();
-
-            assert prefix.size() == clusteringSize;
-
-            if (prefix.isEmpty())
-                return new CompoundSparseCellName.WithCollection(column.name, collectionElement, column.isStatic());
-
-            assert prefix instanceof CompoundComposite;
-            CompoundComposite lc = (CompoundComposite)prefix;
-            return new CompoundSparseCellName.WithCollection(lc.elements, clusteringSize, column.name, collectionElement, column.isStatic());
-        }
-
-        @Override
-        public int compare(Composite c1, Composite c2)
-        {
-            if (c1.isStatic() != c2.isStatic())
-            {
-                // Static sorts before non-static no matter what, except for empty which
-                // always sort first
-                if (c1.isEmpty())
-                    return c2.isEmpty() ? 0 : -1;
-                if (c2.isEmpty())
-                    return 1;
-                return c1.isStatic() ? -1 : 1;
-            }
-
-            int s1 = c1.size();
-            int s2 = c2.size();
-            int minSize = Math.min(s1, s2);
-
-            ByteBuffer previous = null;
-            for (int i = 0; i < minSize; i++)
-            {
-                AbstractType<?> comparator = subtype(i);
-                ByteBuffer value1 = c1.get(i);
-                ByteBuffer value2 = c2.get(i);
-
-                int cmp = comparator.compareCollectionMembers(value1, value2, previous);
-                if (cmp != 0)
-                    return cmp;
-
-                previous = value1;
-            }
-
-            if (s1 == s2)
-                return c1.eoc().compareTo(c2.eoc());
-            return s1 < s2 ? c1.eoc().prefixComparisonResult : -c2.eoc().prefixComparisonResult;
-        }
-
-        @Override
-        public boolean hasCollections()
-        {
-            return true;
-        }
-
-        @Override
-        public ColumnToCollectionType collectionType()
-        {
-            return collectionType;
-        }
-
-        @Override
-        protected Composite makeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
-        {
-            if (size < fullSize)
-                return super.makeWith(components, size, eoc, isStatic);
-
-            return new CompoundSparseCellName.WithCollection(components, clusteringSize, idFor(components[clusteringSize]), components[fullSize - 1], isStatic);
-        }
-
-        protected Composite copyAndMakeWith(ByteBuffer[] components, int size, Composite.EOC eoc, boolean isStatic)
-        {
-            if (size < fullSize)
-                return super.copyAndMakeWith(components, size, eoc, isStatic);
-
-            ByteBuffer[] clusteringColumns = Arrays.copyOfRange(components, 0, clusteringSize);
-            return new CompoundSparseCellName.WithCollection(clusteringColumns, idFor(components[clusteringSize]), components[clusteringSize + 1], isStatic);
-        }
-    }
-}
-

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleCType.java b/src/java/org/apache/cassandra/db/composites/SimpleCType.java
deleted file mode 100644
index 7ee45ac..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleCType.java
+++ /dev/null

@@ -1,156 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.cassandra.db.marshal.AbstractType;
-
-/**
- * A not truly-composite CType.
- */
-public class SimpleCType extends AbstractCType
-{
-    protected final AbstractType<?> type;
-
-    public SimpleCType(AbstractType<?> type)
-    {
-        super(type.isByteOrderComparable());
-        this.type = type;
-    }
-
-    public boolean isCompound()
-    {
-        return false;
-    }
-
-    public int size()
-    {
-        return 1;
-    }
-
-    public int compare(Composite c1, Composite c2)
-    {
-        if (isByteOrderComparable)
-            return AbstractSimpleCellNameType.compareUnsigned(c1, c2);
-
-        assert !(c1.isEmpty() | c2.isEmpty());
-        // This method assumes that simple composites never have an EOC != NONE. This assumption
-        // stands in particular on the fact that a Composites.EMPTY never has a non-NONE EOC. If
-        // this ever change, we'll need to update this.
-        return type.compare(c1.get(0), c2.get(0));
-    }
-
-    public AbstractType<?> subtype(int i)
-    {
-        if (i != 0)
-            throw new IndexOutOfBoundsException();
-        return type;
-    }
-
-    public Composite fromByteBuffer(ByteBuffer bytes)
-    {
-        return !bytes.hasRemaining() ? Composites.EMPTY : new SimpleComposite(bytes);
-    }
-
-    public CBuilder builder()
-    {
-        return new SimpleCBuilder(this);
-    }
-
-    public CType setSubtype(int position, AbstractType<?> newType)
-    {
-        if (position != 0)
-            throw new IndexOutOfBoundsException();
-        return new SimpleCType(newType);
-    }
-
-    // Use sparingly, it defeats the purpose
-    public AbstractType<?> asAbstractType()
-    {
-        return type;
-    }
-
-    public static class SimpleCBuilder implements CBuilder
-    {
-        private final CType type;
-        private ByteBuffer value;
-
-        public SimpleCBuilder(CType type)
-        {
-            this.type = type;
-        }
-
-        public int remainingCount()
-        {
-            return value == null ? 1 : 0;
-        }
-
-        public CBuilder add(ByteBuffer value)
-        {
-            if (this.value != null)
-                throw new IllegalStateException();
-            this.value = value;
-            return this;
-        }
-
-        public CBuilder add(Object value)
-        {
-            return add(((AbstractType)type.subtype(0)).decompose(value));
-        }
-
-        public Composite build()
-        {
-            if (value == null || !value.hasRemaining())
-                return Composites.EMPTY;
-
-            // If we're building a dense cell name, then we can directly allocate the
-            // CellName object as it's complete.
-            if (type instanceof CellNameType && ((CellNameType)type).isDense())
-                return new SimpleDenseCellName(value);
-
-            return new SimpleComposite(value);
-        }
-
-        public Composite buildWith(ByteBuffer value)
-        {
-            if (this.value != null)
-                throw new IllegalStateException();
-
-            if (value == null || !value.hasRemaining())
-                return Composites.EMPTY;
-
-            // If we're building a dense cell name, then we can directly allocate the
-            // CellName object as it's complete.
-            if (type instanceof CellNameType && ((CellNameType)type).isDense())
-                return new SimpleDenseCellName(value);
-
-            return new SimpleComposite(value);
-        }
-
-        public Composite buildWith(List<ByteBuffer> values)
-        {
-            if (values.size() > 1)
-                throw new IllegalStateException();
-            if (values.isEmpty())
-                return Composites.EMPTY;
-            return buildWith(values.get(0));
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleComposite.java b/src/java/org/apache/cassandra/db/composites/SimpleComposite.java
deleted file mode 100644
index 3c80d9f..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleComposite.java
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ObjectSizes;
-
-/**
- * A "simple" (not-truly-composite) Composite.
- */
-public class SimpleComposite extends AbstractComposite
-{
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new SimpleComposite(ByteBuffer.allocate(1)));
-
-    protected final ByteBuffer element;
-
-    SimpleComposite(ByteBuffer element)
-    {
-        // We have to be careful with empty ByteBuffers as we shouldn't store them.
-        // To avoid errors (and so isEmpty() works as we intend), we don't allow simpleComposite with
-        // an empty element (but it's ok for CompoundComposite, it's a row marker in that case).
-        assert element.hasRemaining();
-        this.element = element;
-    }
-
-    public int size()
-    {
-        return 1;
-    }
-
-    public ByteBuffer get(int i)
-    {
-        if (i != 0)
-            throw new IndexOutOfBoundsException();
-
-        return element;
-    }
-
-    @Override
-    public Composite withEOC(EOC newEoc)
-    {
-        // EOC makes no sense for not truly composites.
-        return this;
-    }
-
-    @Override
-    public ByteBuffer toByteBuffer()
-    {
-        return element;
-    }
-
-    public long unsharedHeapSize()
-    {
-        return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(element);
-    }
-
-    public Composite copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        return new SimpleComposite(allocator.clone(element));
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleDenseCellName.java b/src/java/org/apache/cassandra/db/composites/SimpleDenseCellName.java
deleted file mode 100644
index 2ca7d23..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleDenseCellName.java
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ObjectSizes;
-
-public class SimpleDenseCellName extends SimpleComposite implements CellName
-{
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new SimpleDenseCellName(ByteBuffer.allocate(1)));
-
-    // Not meant to be used directly, you should use the CellNameType method instead
-    SimpleDenseCellName(ByteBuffer element)
-    {
-        super(element);
-    }
-
-    public int clusteringSize()
-    {
-        return 1;
-    }
-
-    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
-    {
-        return null;
-    }
-
-    public ByteBuffer collectionElement()
-    {
-        return null;
-    }
-
-    public boolean isCollectionCell()
-    {
-        return false;
-    }
-
-    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
-    {
-        // Dense cell imply one cell by CQL row so no other cell will be the same row.
-        return type.compare(this, other) == 0;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(element);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return EMPTY_SIZE + ObjectSizes.sizeOnHeapExcludingData(element);
-    }
-
-    // If cellnames were sharing some prefix components, this will break it, so
-    // we might want to try to do better.
-    @Override
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        return new SimpleDenseCellName(allocator.clone(element));
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleDenseCellNameType.java b/src/java/org/apache/cassandra/db/composites/SimpleDenseCellNameType.java
deleted file mode 100644
index 3db4bc4..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleDenseCellNameType.java
+++ /dev/null

@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.CQL3Row;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.marshal.AbstractType;
-
-public class SimpleDenseCellNameType extends AbstractSimpleCellNameType
-{
-    public SimpleDenseCellNameType(AbstractType<?> type)
-    {
-        super(type);
-    }
-
-    public int clusteringPrefixSize()
-    {
-        return 1;
-    }
-
-    public CBuilder prefixBuilder()
-    {
-        // Simple dense is "all" prefix
-        return builder();
-    }
-
-    public CellNameType setSubtype(int position, AbstractType<?> newType)
-    {
-        if (position != 0)
-            throw new IllegalArgumentException();
-        return new SimpleDenseCellNameType(newType);
-    }
-
-    public boolean isDense()
-    {
-        return true;
-    }
-
-    public CellName create(Composite prefix, ColumnDefinition column)
-    {
-        assert prefix.size() == 1;
-        // We ignore the column because it's just the COMPACT_VALUE name which is not store in the cell name
-        return new SimpleDenseCellName(prefix.get(0));
-    }
-
-    @Override
-    public Composite fromByteBuffer(ByteBuffer bb)
-    {
-        return !bb.hasRemaining()
-             ? Composites.EMPTY
-             : new SimpleDenseCellName(bb);
-    }
-
-    public void addCQL3Column(ColumnIdentifier id) {}
-    public void removeCQL3Column(ColumnIdentifier id) {}
-
-    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
-    {
-        return makeDenseCQL3RowBuilder(now);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleSparseCellName.java b/src/java/org/apache/cassandra/db/composites/SimpleSparseCellName.java
deleted file mode 100644
index c6351f1..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleSparseCellName.java
+++ /dev/null

@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-import org.apache.cassandra.utils.ObjectSizes;
-
-public class SimpleSparseCellName extends AbstractComposite implements CellName
-{
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new SimpleSparseCellName(null));
-
-    private final ColumnIdentifier columnName;
-
-    // Not meant to be used directly, you should use the CellNameType method instead
-    SimpleSparseCellName(ColumnIdentifier columnName)
-    {
-        this.columnName = columnName;
-    }
-
-    public int size()
-    {
-        return 1;
-    }
-
-    public ByteBuffer get(int i)
-    {
-        if (i != 0)
-            throw new IndexOutOfBoundsException();
-
-        return columnName.bytes;
-    }
-
-    @Override
-    public Composite withEOC(EOC newEoc)
-    {
-        // EOC makes no sense for not truly composites.
-        return this;
-    }
-
-    @Override
-    public ByteBuffer toByteBuffer()
-    {
-        return columnName.bytes;
-    }
-
-    public int clusteringSize()
-    {
-        return 0;
-    }
-
-    public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
-    {
-        return columnName;
-    }
-
-    public ByteBuffer collectionElement()
-    {
-        return null;
-    }
-
-    public boolean isCollectionCell()
-    {
-        return false;
-    }
-
-    public boolean isSameCQL3RowAs(CellNameType type, CellName other)
-    {
-        return true;
-    }
-
-    public long unsharedHeapSizeExcludingData()
-    {
-        return EMPTY_SIZE + columnName.unsharedHeapSizeExcludingData();
-    }
-
-    public long unsharedHeapSize()
-    {
-        return EMPTY_SIZE + columnName.unsharedHeapSize();
-    }
-
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        return new SimpleSparseCellName(columnName.clone(allocator));
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleSparseCellNameType.java b/src/java/org/apache/cassandra/db/composites/SimpleSparseCellNameType.java
deleted file mode 100644
index 5ce0deb..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleSparseCellNameType.java
+++ /dev/null

@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.CQL3Row;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.marshal.AbstractType;
-
-public class SimpleSparseCellNameType extends AbstractSimpleCellNameType
-{
-    // Simple sparse means static thrift CF or non-clustered CQL3. This means that cell names will mainly
-    // be those that have been declared and we can intern the whole CellName instances.
-    private final Map<ByteBuffer, CellName> internedNames;
-
-    public SimpleSparseCellNameType(AbstractType<?> type)
-    {
-        this(type, new HashMap<ByteBuffer, CellName>());
-    }
-
-    private SimpleSparseCellNameType(AbstractType<?> type, Map<ByteBuffer, CellName> internedNames)
-    {
-        super(type);
-        this.internedNames = internedNames;
-    }
-
-    public int clusteringPrefixSize()
-    {
-        return 0;
-    }
-
-    public CellNameType setSubtype(int position, AbstractType<?> newType)
-    {
-        if (position != 0)
-            throw new IllegalArgumentException();
-        return new SimpleSparseCellNameType(newType, internedNames);
-    }
-
-    public CBuilder prefixBuilder()
-    {
-        return Composites.EMPTY_BUILDER;
-    }
-
-    public boolean isDense()
-    {
-        return false;
-    }
-
-    public CellName create(Composite prefix, ColumnDefinition column)
-    {
-        assert prefix.isEmpty();
-        CellName cn = internedNames.get(column.name.bytes);
-        return cn == null ? new SimpleSparseCellName(column.name) : cn;
-    }
-
-    @Override
-    public Composite fromByteBuffer(ByteBuffer bb)
-    {
-        if (!bb.hasRemaining())
-            return Composites.EMPTY;
-
-        CellName cn = internedNames.get(bb);
-        return cn == null ? new SimpleSparseCellName(new ColumnIdentifier(bb, type)) : cn;
-    }
-
-    public void addCQL3Column(ColumnIdentifier id)
-    {
-        internedNames.put(id.bytes, new SimpleSparseInternedCellName(id));
-    }
-
-    public void removeCQL3Column(ColumnIdentifier id)
-    {
-        internedNames.remove(id.bytes);
-    }
-
-    public CQL3Row.Builder CQL3RowBuilder(CFMetaData metadata, long now)
-    {
-        return makeSparseCQL3RowBuilder(metadata, this, now);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/composites/SimpleSparseInternedCellName.java b/src/java/org/apache/cassandra/db/composites/SimpleSparseInternedCellName.java
deleted file mode 100644
index c613720..0000000
--- a/src/java/org/apache/cassandra/db/composites/SimpleSparseInternedCellName.java
+++ /dev/null

@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.composites;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.utils.memory.AbstractAllocator;
-
-public class SimpleSparseInternedCellName extends SimpleSparseCellName
-{
-
-    // Not meant to be used directly, you should use the CellNameType method instead
-    SimpleSparseInternedCellName(ColumnIdentifier columnName)
-    {
-        super(columnName);
-    }
-
-    @Override
-    public long unsharedHeapSizeExcludingData()
-    {
-        return 0;
-    }
-
-    @Override
-    public long unsharedHeapSize()
-    {
-        return 0;
-    }
-
-    @Override
-    public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
-    {
-        // We're interning those instance in SparceCellNameType so don't need to copy.
-        return this;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/context/CounterContext.java b/src/java/org/apache/cassandra/db/context/CounterContext.java
index ffffbb1..d0952d0 100644
--- a/src/java/org/apache/cassandra/db/context/CounterContext.java
+++ b/src/java/org/apache/cassandra/db/context/CounterContext.java

@@ -75,12 +75,20 @@
  */
 public class CounterContext
 {
-    private static final int HEADER_SIZE_LENGTH = TypeSizes.NATIVE.sizeof(Short.MAX_VALUE);
-    private static final int HEADER_ELT_LENGTH = TypeSizes.NATIVE.sizeof(Short.MAX_VALUE);
-    private static final int CLOCK_LENGTH = TypeSizes.NATIVE.sizeof(Long.MAX_VALUE);
-    private static final int COUNT_LENGTH = TypeSizes.NATIVE.sizeof(Long.MAX_VALUE);
+    private static final int HEADER_SIZE_LENGTH = TypeSizes.sizeof(Short.MAX_VALUE);
+    private static final int HEADER_ELT_LENGTH = TypeSizes.sizeof(Short.MAX_VALUE);
+    private static final int CLOCK_LENGTH = TypeSizes.sizeof(Long.MAX_VALUE);
+    private static final int COUNT_LENGTH = TypeSizes.sizeof(Long.MAX_VALUE);
     private static final int STEP_LENGTH = CounterId.LENGTH + CLOCK_LENGTH + COUNT_LENGTH;
 
+    /*
+     * A special hard-coded value we use for clock ids to differentiate between regular local shards
+     * and 'fake' local shards used to emulate pre-3.0 CounterUpdateCell-s in UpdateParameters.
+     *
+     * Important for handling counter writes and reads during rolling 2.1/2.2 -> 3.0 upgrades.
+     */
+    static final CounterId UPDATE_CLOCK_ID = CounterId.fromInt(0);
+
     private static final Logger logger = LoggerFactory.getLogger(CounterContext.class);
 
     public static enum Relationship
@@ -100,6 +108,43 @@
     }
 
     /**
+     * Creates a counter context with a single local shard with clock id of UPDATE_CLOCK_ID.
+     *
+     * This is only used in a PartitionUpdate until the update has gone through
+     * CounterMutation.apply(), at which point this special local shard will be replaced by a regular global one.
+     * It should never hit commitlog / memtable / disk, but can hit network.
+     *
+     * We use this so that if an update statement has multiple increments of the same counter we properly
+     * add them rather than keeping only one of them.
+     *
+     * NOTE: Before CASSANDRA-13691 we used a regular local shard without a hard-coded clock id value here.
+     * It was problematic, because it was possible to return a false positive, and on read path encode an old counter
+     * cell from 2.0 era with a regular local shard as a counter update, and to break the 2.1 coordinator.
+     */
+    public ByteBuffer createUpdate(long count)
+    {
+        ContextState state = ContextState.allocate(0, 1, 0);
+        state.writeLocal(UPDATE_CLOCK_ID, 1L, count);
+        return state.context;
+    }
+
+    /**
+     * Checks if a context is an update (see createUpdate() for justification).
+     */
+    public boolean isUpdate(ByteBuffer context)
+    {
+        return ContextState.wrap(context).getCounterId().equals(UPDATE_CLOCK_ID);
+    }
+
+    /**
+     * Returns the count associated with the update counter id, or 0 if no such shard is present.
+     */
+    public long getUpdateCount(ByteBuffer context)
+    {
+        return getClockAndCountOf(context, UPDATE_CLOCK_ID).count;
+    }
+
+    /**
      * Creates a counter context with a single global, 2.1+ shard (a result of increment).
      */
     public ByteBuffer createGlobal(CounterId id, long clock, long count)
@@ -655,6 +700,9 @@
      */
     public void updateDigest(MessageDigest message, ByteBuffer context)
     {
+        // context can be empty due to the optimization from CASSANDRA-10657
+        if (!context.hasRemaining())
+            return;
         ByteBuffer dup = context.duplicate();
         dup.position(context.position() + headerLength(context));
         message.update(dup);

diff --git a/src/java/org/apache/cassandra/db/filter/AbstractClusteringIndexFilter.java b/src/java/org/apache/cassandra/db/filter/AbstractClusteringIndexFilter.java
new file mode 100644
index 0000000..51e9d8e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/AbstractClusteringIndexFilter.java

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+public abstract class AbstractClusteringIndexFilter implements ClusteringIndexFilter
+{
+    static final Serializer serializer = new FilterSerializer();
+
+    protected final boolean reversed;
+
+    protected AbstractClusteringIndexFilter(boolean reversed)
+    {
+        this.reversed = reversed;
+    }
+
+    public boolean isReversed()
+    {
+        return reversed;
+    }
+
+    protected abstract void serializeInternal(DataOutputPlus out, int version) throws IOException;
+    protected abstract long serializedSizeInternal(int version);
+
+    protected void appendOrderByToCQLString(CFMetaData metadata, StringBuilder sb)
+    {
+        if (reversed)
+        {
+            sb.append(" ORDER BY (");
+            int i = 0;
+            for (ColumnDefinition column : metadata.clusteringColumns())
+                sb.append(i++ == 0 ? "" : ", ").append(column.name).append(column.type instanceof ReversedType ? " ASC" : " DESC");
+            sb.append(')');
+        }
+    }
+
+    private static class FilterSerializer implements Serializer
+    {
+        public void serialize(ClusteringIndexFilter pfilter, DataOutputPlus out, int version) throws IOException
+        {
+            AbstractClusteringIndexFilter filter = (AbstractClusteringIndexFilter)pfilter;
+
+            out.writeByte(filter.kind().ordinal());
+            out.writeBoolean(filter.isReversed());
+
+            filter.serializeInternal(out, version);
+        }
+
+        public ClusteringIndexFilter deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+        {
+            Kind kind = Kind.values()[in.readUnsignedByte()];
+            boolean reversed = in.readBoolean();
+
+            return kind.deserializer.deserialize(in, version, metadata, reversed);
+        }
+
+        public long serializedSize(ClusteringIndexFilter pfilter, int version)
+        {
+            AbstractClusteringIndexFilter filter = (AbstractClusteringIndexFilter)pfilter;
+
+            return 1
+                 + TypeSizes.sizeof(filter.isReversed())
+                 + filter.serializedSizeInternal(version);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java
new file mode 100644
index 0000000..e3f824f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java

@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.CachedPartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * A filter that selects a subset of the rows of a given partition by using the "clustering index".
+ * <p>
+ * In CQL terms, this correspond to the clustering columns selection and correspond to what
+ * the storage engine can do without filtering (and without 2ndary indexes). This does not include
+ * the restrictions on non-PK columns which can be found in {@link RowFilter}.
+ */
+public interface ClusteringIndexFilter
+{
+    public static Serializer serializer = AbstractClusteringIndexFilter.serializer;
+
+    public enum Kind
+    {
+        SLICE (ClusteringIndexSliceFilter.deserializer),
+        NAMES (ClusteringIndexNamesFilter.deserializer);
+
+        protected final InternalDeserializer deserializer;
+
+        private Kind(InternalDeserializer deserializer)
+        {
+            this.deserializer = deserializer;
+        }
+    }
+
+    static interface InternalDeserializer
+    {
+        public ClusteringIndexFilter deserialize(DataInputPlus in, int version, CFMetaData metadata, boolean reversed) throws IOException;
+    }
+
+    /**
+     * Whether the filter query rows in reversed clustering order or not.
+     *
+     * @return whether the filter query rows in reversed clustering order or not.
+     */
+    public boolean isReversed();
+
+    /**
+     * Returns a filter for continuing the paging of this filter given the last returned clustering prefix.
+     *
+     * @param comparator the comparator for the table this is a filter for.
+     * @param lastReturned the last clustering that was returned for the query we are paging for. The
+     * resulting filter will be such that results coming after {@code lastReturned} are returned
+     * (where coming after means "greater than" if the filter is not reversed, "lesser than" otherwise;
+     * futher, whether the comparison is strict or not depends on {@code inclusive}).
+     * @param inclusive whether or not we want to include the {@code lastReturned} in the newly returned
+     * page of results.
+     *
+     * @return a new filter that selects results coming after {@code lastReturned}.
+     */
+    public ClusteringIndexFilter forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive);
+
+    /**
+     * Returns whether we can guarantee that a given cached partition contains all the data selected by this filter.
+     *
+     * @param partition the cached partition. This method assumed that the rows of this partition contains all the table columns.
+     *
+     * @return whether we can guarantee that all data selected by this filter are in {@code partition}.
+     */
+    public boolean isFullyCoveredBy(CachedPartition partition);
+
+    /**
+     * Whether this filter selects the head of a partition (i.e. it isn't reversed and selects all rows up to a certain point).
+     *
+     * @return whether this filter selects the head of a partition.
+     */
+    public boolean isHeadFilter();
+
+    /**
+     * Whether this filter selects all the row of a partition (it's an "identity" filter).
+     *
+     * @return whether this filter selects all the row of a partition (it's an "identity" filter).
+     */
+    public boolean selectsAllPartition();
+
+    /**
+     * Whether a given row is selected by this filter.
+     *
+     * @param clustering the clustering of the row to test the selection of.
+     *
+     * @return whether the row with clustering {@code clustering} is selected by this filter.
+     */
+    public boolean selects(Clustering clustering);
+
+    /**
+     * Returns an iterator that only returns the rows of the provided iterator that this filter selects.
+     * <p>
+     * This method is the "dumb" counterpart to {@link #filter(SliceableUnfilteredRowIterator)} in that it has no way to quickly get
+     * to what is actually selected, so it simply iterate over it all and filters out what shouldn't be returned. This should
+     * be avoided in general, we should make sure to have {@code SliceableUnfilteredRowIterator} when we have filtering to do, but this
+     * currently only used in {@link SinglePartitionReadCommand#getThroughCache} when we know this won't be a performance problem.
+     * Another difference with {@link #filter(SliceableUnfilteredRowIterator)} is that this method also filter the queried
+     * columns in the returned result, while the former assumes that the provided iterator has already done it.
+     *
+     * @param columnFilter the columns to include in the rows of the result iterator.
+     * @param iterator the iterator for which we should filter rows.
+     *
+     * @return an iterator that only returns the rows (or rather Unfilted) from {@code iterator} that are selected by this filter.
+     */
+    public UnfilteredRowIterator filterNotIndexed(ColumnFilter columnFilter, UnfilteredRowIterator iterator);
+
+    /**
+     * Returns an iterator that only returns the rows of the provided sliceable iterator that this filter selects.
+     *
+     * @param iterator the sliceable iterator for which we should filter rows.
+     *
+     * @return an iterator that only returns the rows (or rather unfiltered) from {@code iterator} that are selected by this filter.
+     */
+    public UnfilteredRowIterator filter(SliceableUnfilteredRowIterator iterator);
+
+    /**
+     * Given a partition, returns a row iterator for the rows of this partition that are selected by this filter.
+     *
+     * @param columnFilter the columns to include in the rows of the result iterator.
+     * @param partition the partition containing the rows to filter.
+     *
+     * @return a unfiltered row iterator returning those rows (or rather Unfiltered) from {@code partition} that are selected by this filter.
+     */
+    // TODO: we could get rid of that if Partition was exposing a SliceableUnfilteredRowIterator (instead of the two searchIterator() and
+    // unfilteredIterator() methods). However, for AtomicBtreePartition this would require changes to Btree so we'll leave that for later.
+    public UnfilteredRowIterator getUnfilteredRowIterator(ColumnFilter columnFilter, Partition partition);
+
+    /**
+     * Whether the provided sstable may contain data that is selected by this filter (based on the sstable metadata).
+     *
+     * @param sstable the sstable for which we want to test the need for inclusion.
+     *
+     * @return whether {@code sstable} should be included to answer this filter.
+     */
+    public boolean shouldInclude(SSTableReader sstable);
+
+    public Kind kind();
+
+    public String toString(CFMetaData metadata);
+    public String toCQLString(CFMetaData metadata);
+
+    public interface Serializer
+    {
+        public void serialize(ClusteringIndexFilter filter, DataOutputPlus out, int version) throws IOException;
+        public ClusteringIndexFilter deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException;
+        public long serializedSize(ClusteringIndexFilter filter, int version);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java
new file mode 100644
index 0000000..f4859cd
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java

@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.SearchIterator;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+/**
+ * A filter selecting rows given their clustering value.
+ */
+public class ClusteringIndexNamesFilter extends AbstractClusteringIndexFilter
+{
+    static final InternalDeserializer deserializer = new NamesDeserializer();
+
+    // This could be empty if selectedColumns only has static columns (in which case the filter still
+    // selects the static row)
+    private final NavigableSet<Clustering> clusterings;
+
+    // clusterings is always in clustering order (because we need it that way in some methods), but we also
+    // sometimes need those clustering in "query" order (i.e. in reverse clustering order if the query is
+    // reversed), so we keep that too for simplicity.
+    private final NavigableSet<Clustering> clusteringsInQueryOrder;
+
+    public ClusteringIndexNamesFilter(NavigableSet<Clustering> clusterings, boolean reversed)
+    {
+        super(reversed);
+        assert !clusterings.contains(Clustering.STATIC_CLUSTERING);
+        this.clusterings = clusterings;
+        this.clusteringsInQueryOrder = reversed ? clusterings.descendingSet() : clusterings;
+    }
+
+    /**
+     * The set of requested rows.
+     *
+     * Please note that this can be empty if only the static row is requested.
+     *
+     * @return the set of requested clustering in clustering order (note that
+     * this is always in clustering order even if the query is reversed).
+     */
+    public NavigableSet<Clustering> requestedRows()
+    {
+        return clusterings;
+    }
+
+    public boolean selectsAllPartition()
+    {
+        // if the clusterings set is empty we are selecting a static row and in this case we want to count
+        // static rows so we return true
+        return clusterings.isEmpty();
+    }
+
+    public boolean selects(Clustering clustering)
+    {
+        return clusterings.contains(clustering);
+    }
+
+    public ClusteringIndexNamesFilter forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive)
+    {
+        NavigableSet<Clustering> newClusterings = reversed ?
+                                                  clusterings.headSet(lastReturned, inclusive) :
+                                                  clusterings.tailSet(lastReturned, inclusive);
+
+        return new ClusteringIndexNamesFilter(newClusterings, reversed);
+    }
+
+    public boolean isFullyCoveredBy(CachedPartition partition)
+    {
+        if (partition.isEmpty())
+            return false;
+
+        // 'partition' contains all columns, so it covers our filter if our last clusterings
+        // is smaller than the last in the cache
+        return clusterings.comparator().compare(clusterings.last(), partition.lastRow().clustering()) <= 0;
+    }
+
+    public boolean isHeadFilter()
+    {
+        return false;
+    }
+
+    // Given another iterator, only return the rows that match this filter
+    public UnfilteredRowIterator filterNotIndexed(ColumnFilter columnFilter, UnfilteredRowIterator iterator)
+    {
+        // Note that we don't filter markers because that's a bit trickier (we don't know in advance until when
+        // the range extend) and it's harmless to left them.
+        class FilterNotIndexed extends Transformation
+        {
+            @Override
+            public Row applyToStatic(Row row)
+            {
+                return columnFilter.fetchedColumns().statics.isEmpty() ? null : row.filter(columnFilter, iterator.metadata());
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                return clusterings.contains(row.clustering()) ? row.filter(columnFilter, iterator.metadata()) : null;
+            }
+        }
+        return Transformation.apply(iterator, new FilterNotIndexed());
+    }
+
+    public UnfilteredRowIterator filter(final SliceableUnfilteredRowIterator iter)
+    {
+        // Please note that this method assumes that rows from 'iter' already have their columns filtered, i.e. that
+        // they only include columns that we select.
+        return new WrappingUnfilteredRowIterator(iter)
+        {
+            private final Iterator<Clustering> clusteringIter = clusteringsInQueryOrder.iterator();
+            private Iterator<Unfiltered> currentClustering;
+            private Unfiltered next;
+
+            @Override
+            public boolean hasNext()
+            {
+                if (next != null)
+                    return true;
+
+                if (currentClustering != null && currentClustering.hasNext())
+                {
+                    next = currentClustering.next();
+                    return true;
+                }
+
+                while (clusteringIter.hasNext())
+                {
+                    Clustering nextClustering = clusteringIter.next();
+                    currentClustering = iter.slice(Slice.make(nextClustering));
+                    if (currentClustering.hasNext())
+                    {
+                        next = currentClustering.next();
+                        return true;
+                    }
+                }
+                return false;
+            }
+
+            @Override
+            public Unfiltered next()
+            {
+                if (next == null && !hasNext())
+                    throw new NoSuchElementException();
+
+                Unfiltered toReturn = next;
+                next = null;
+                return toReturn;
+            }
+        };
+    }
+
+    public UnfilteredRowIterator getUnfilteredRowIterator(final ColumnFilter columnFilter, final Partition partition)
+    {
+        final Iterator<Clustering> clusteringIter = clusteringsInQueryOrder.iterator();
+        final SearchIterator<Clustering, Row> searcher = partition.searchIterator(columnFilter, reversed);
+
+        return new AbstractUnfilteredRowIterator(partition.metadata(),
+                                        partition.partitionKey(),
+                                        partition.partitionLevelDeletion(),
+                                        columnFilter.fetchedColumns(),
+                                        searcher.next(Clustering.STATIC_CLUSTERING),
+                                        reversed,
+                                        partition.stats())
+        {
+            protected Unfiltered computeNext()
+            {
+                while (clusteringIter.hasNext())
+                {
+                    Row row = searcher.next(clusteringIter.next());
+                    if (row != null)
+                        return row;
+                }
+                return endOfData();
+            }
+        };
+    }
+
+    public boolean shouldInclude(SSTableReader sstable)
+    {
+        ClusteringComparator comparator = sstable.metadata.comparator;
+        List<ByteBuffer> minClusteringValues = sstable.getSSTableMetadata().minClusteringValues;
+        List<ByteBuffer> maxClusteringValues = sstable.getSSTableMetadata().maxClusteringValues;
+
+        // If any of the requested clustering is within the bounds covered by the sstable, we need to include the sstable
+        for (Clustering clustering : clusterings)
+        {
+            if (Slice.make(clustering).intersects(comparator, minClusteringValues, maxClusteringValues))
+                return true;
+        }
+        return false;
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("names(");
+        int i = 0;
+        for (Clustering clustering : clusterings)
+            sb.append(i++ == 0 ? "" : ", ").append(clustering.toString(metadata));
+        if (reversed)
+            sb.append(", reversed");
+        return sb.append(')').toString();
+    }
+
+    public String toCQLString(CFMetaData metadata)
+    {
+        if (clusterings.isEmpty())
+            return "";
+
+        StringBuilder sb = new StringBuilder();
+        sb.append('(').append(ColumnDefinition.toCQLString(metadata.clusteringColumns())).append(')');
+        sb.append(clusterings.size() == 1 ? " = " : " IN (");
+        int i = 0;
+        for (Clustering clustering : clusterings)
+            sb.append(i++ == 0 ? "" : ", ").append("(").append(clustering.toCQLString(metadata)).append(")");
+        sb.append(clusterings.size() == 1 ? "" : ")");
+
+        appendOrderByToCQLString(metadata, sb);
+        return sb.toString();
+    }
+
+    public Kind kind()
+    {
+        return Kind.NAMES;
+    }
+
+    protected void serializeInternal(DataOutputPlus out, int version) throws IOException
+    {
+        ClusteringComparator comparator = (ClusteringComparator)clusterings.comparator();
+        out.writeUnsignedVInt(clusterings.size());
+        for (Clustering clustering : clusterings)
+            Clustering.serializer.serialize(clustering, out, version, comparator.subtypes());
+    }
+
+    protected long serializedSizeInternal(int version)
+    {
+        ClusteringComparator comparator = (ClusteringComparator)clusterings.comparator();
+        long size = TypeSizes.sizeofUnsignedVInt(clusterings.size());
+        for (Clustering clustering : clusterings)
+            size += Clustering.serializer.serializedSize(clustering, version, comparator.subtypes());
+        return size;
+    }
+
+    private static class NamesDeserializer implements InternalDeserializer
+    {
+        public ClusteringIndexFilter deserialize(DataInputPlus in, int version, CFMetaData metadata, boolean reversed) throws IOException
+        {
+            ClusteringComparator comparator = metadata.comparator;
+            BTreeSet.Builder<Clustering> clusterings = BTreeSet.builder(comparator);
+            int size = (int)in.readUnsignedVInt();
+            for (int i = 0; i < size; i++)
+                clusterings.add(Clustering.serializer.deserialize(in, version, comparator.subtypes()));
+
+            return new ClusteringIndexNamesFilter(clusterings.build(), reversed);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java
new file mode 100644
index 0000000..7a174ee
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java

@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.util.List;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.CachedPartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * A filter over a single partition.
+ */
+public class ClusteringIndexSliceFilter extends AbstractClusteringIndexFilter
+{
+    static final InternalDeserializer deserializer = new SliceDeserializer();
+
+    private final Slices slices;
+
+    public ClusteringIndexSliceFilter(Slices slices, boolean reversed)
+    {
+        super(reversed);
+        this.slices = slices;
+    }
+
+    public Slices requestedSlices()
+    {
+        return slices;
+    }
+
+    public boolean selectsAllPartition()
+    {
+        return slices.size() == 1 && !slices.hasLowerBound() && !slices.hasUpperBound();
+    }
+
+    public boolean selects(Clustering clustering)
+    {
+        return slices.selects(clustering);
+    }
+
+    public ClusteringIndexSliceFilter forPaging(ClusteringComparator comparator, Clustering lastReturned, boolean inclusive)
+    {
+        Slices newSlices = slices.forPaging(comparator, lastReturned, inclusive, reversed);
+        return slices == newSlices
+             ? this
+             : new ClusteringIndexSliceFilter(newSlices, reversed);
+    }
+
+    public boolean isFullyCoveredBy(CachedPartition partition)
+    {
+        // Partition is guaranteed to cover the whole filter if it includes the filter start and finish bounds.
+
+        // (note that since partition is the head of a partition, to have no lower bound is ok)
+        if (!slices.hasUpperBound() || partition.isEmpty())
+            return false;
+
+        return partition.metadata().comparator.compare(slices.get(slices.size() - 1).end(), partition.lastRow().clustering()) <= 0;
+    }
+
+    public boolean isHeadFilter()
+    {
+        return !reversed && slices.size() == 1 && !slices.hasLowerBound();
+    }
+
+    // Given another iterator, only return the rows that match this filter
+    public UnfilteredRowIterator filterNotIndexed(final ColumnFilter columnFilter, UnfilteredRowIterator iterator)
+    {
+        final Slices.InOrderTester tester = slices.inOrderTester(reversed);
+
+        // Note that we don't filter markers because that's a bit trickier (we don't know in advance until when
+        // the range extend) and it's harmless to leave them.
+        class FilterNotIndexed extends Transformation
+        {
+            public boolean isDoneForPartition()
+            {
+                return tester.isDone();
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                return tester.includes(row.clustering()) ? row.filter(columnFilter, iterator.metadata()) : null;
+            }
+
+            @Override
+            public Row applyToStatic(Row row)
+            {
+                return columnFilter.fetchedColumns().statics.isEmpty() ? Rows.EMPTY_STATIC_ROW : row.filter(columnFilter, iterator.metadata());
+            }
+        }
+        return Transformation.apply(iterator, new FilterNotIndexed());
+    }
+
+    public UnfilteredRowIterator filter(SliceableUnfilteredRowIterator iterator)
+    {
+        // Please note that this method assumes that rows from 'iter' already have their columns filtered, i.e. that
+        // they only include columns that we select.
+        return slices.makeSliceIterator(iterator);
+    }
+
+    public UnfilteredRowIterator getUnfilteredRowIterator(ColumnFilter columnFilter, Partition partition)
+    {
+        return partition.unfilteredIterator(columnFilter, slices, reversed);
+    }
+
+    public boolean shouldInclude(SSTableReader sstable)
+    {
+        List<ByteBuffer> minClusteringValues = sstable.getSSTableMetadata().minClusteringValues;
+        List<ByteBuffer> maxClusteringValues = sstable.getSSTableMetadata().maxClusteringValues;
+
+        if (minClusteringValues.isEmpty() || maxClusteringValues.isEmpty())
+            return true;
+
+        return slices.intersects(minClusteringValues, maxClusteringValues);
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        return String.format("slice(slices=%s, reversed=%b)", slices, reversed);
+    }
+
+    public String toCQLString(CFMetaData metadata)
+    {
+        StringBuilder sb = new StringBuilder();
+
+        if (!selectsAllPartition())
+            sb.append(slices.toCQLString(metadata));
+
+        appendOrderByToCQLString(metadata, sb);
+
+        return sb.toString();
+    }
+
+    public Kind kind()
+    {
+        return Kind.SLICE;
+    }
+
+    protected void serializeInternal(DataOutputPlus out, int version) throws IOException
+    {
+        Slices.serializer.serialize(slices, out, version);
+    }
+
+    protected long serializedSizeInternal(int version)
+    {
+        return Slices.serializer.serializedSize(slices, version);
+    }
+
+    private static class SliceDeserializer implements InternalDeserializer
+    {
+        public ClusteringIndexFilter deserialize(DataInputPlus in, int version, CFMetaData metadata, boolean reversed) throws IOException
+        {
+            Slices slices = Slices.serializer.deserialize(in, version, metadata);
+            return new ClusteringIndexSliceFilter(slices, reversed);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/ColumnCounter.java b/src/java/org/apache/cassandra/db/filter/ColumnCounter.java
deleted file mode 100644
index a00d588..0000000
--- a/src/java/org/apache/cassandra/db/filter/ColumnCounter.java
+++ /dev/null

@@ -1,243 +0,0 @@
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-package org.apache.cassandra.db.filter;
-
-import java.util.Iterator;
-
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DeletionInfo;
-
-public class ColumnCounter
-{
-    protected int live;
-    protected int tombstones;
-    protected final long timestamp;
-
-    public ColumnCounter(long timestamp)
-    {
-        this.timestamp = timestamp;
-    }
-
-    /**
-     * @return true if the cell counted as a live cell or a valid tombstone; false if it got immediately discarded for
-     *         being shadowed by a range- or a partition tombstone
-     */
-    public boolean count(Cell cell, DeletionInfo.InOrderTester tester)
-    {
-        // The cell is shadowed by a higher-level deletion, and won't be retained.
-        // For the purposes of this counter, we don't care if it's a tombstone or not.
-        if (tester.isDeleted(cell))
-            return false;
-
-        if (cell.isLive(timestamp))
-            live++;
-        else
-            tombstones++;
-
-        return true;
-    }
-
-    public int live()
-    {
-        return live;
-    }
-
-    public int tombstones()
-    {
-        return tombstones;
-    }
-
-    public ColumnCounter countAll(ColumnFamily container)
-    {
-        if (container == null)
-            return this;
-
-        DeletionInfo.InOrderTester tester = container.inOrderDeletionTester();
-        Iterator<Cell> cells = getCellIterator(container);
-        while (cells.hasNext())
-            count(cells.next(), tester);
-        return this;
-    }
-
-    protected Iterator<Cell> getCellIterator(ColumnFamily container)
-    {
-        // overridden by GroupByPrefixReversed to return a reverse iterator
-        return container.iterator();
-    }
-
-    public static class GroupByPrefix extends ColumnCounter
-    {
-        protected final CellNameType type;
-        protected final int toGroup;
-        protected final boolean countPartitionsWithOnlyStaticData;
-        protected CellName previous;
-
-        /**
-         * A column counter that count only 1 for all the columns sharing a
-         * given prefix of the key.
-         *
-         * @param type the type of the column name. This can be null if {@code
-         *             toGroup} is 0, otherwise it should be a composite.
-         * @param toGroup the number of composite components on which to group
-         *                column. If 0, all columns are grouped, otherwise we group
-         *                those for which the {@code toGroup} first component are equals.
-         * @param countPartitionsWithOnlyStaticData if {@code true} the partitions with only static data should be
-         * counted as 1 valid row.
-         */
-        public GroupByPrefix(long timestamp, CellNameType type, int toGroup, boolean countPartitionsWithOnlyStaticData)
-        {
-            super(timestamp);
-            this.type = type;
-            this.toGroup = toGroup;
-            this.countPartitionsWithOnlyStaticData = countPartitionsWithOnlyStaticData;
-
-            assert toGroup == 0 || type != null;
-        }
-
-        @Override
-        public boolean count(Cell cell, DeletionInfo.InOrderTester tester)
-        {
-            if (tester.isDeleted(cell))
-                return false;
-
-            if (!cell.isLive(timestamp))
-            {
-                tombstones++;
-                return true;
-            }
-
-            if (toGroup == 0)
-            {
-                live = 1;
-                return true;
-            }
-
-            CellName current = cell.name();
-            assert current.size() >= toGroup;
-
-            if (previous != null)
-            {
-                boolean isSameGroup = previous.isStatic() == current.isStatic();
-                if (isSameGroup)
-                {
-                    for (int i = 0; i < toGroup; i++)
-                    {
-                        if (type.subtype(i).compare(previous.get(i), current.get(i)) != 0)
-                        {
-                            isSameGroup = false;
-                            break;
-                        }
-                    }
-                }
-
-                if (isSameGroup)
-                    return true;
-
-                // We want to count the static group as 1 (CQL) row only if it's the only
-                // group in the partition. So, since we have already counted it at this point,
-                // just don't count the 2nd group if there is one and the first one was static
-                if (previous.isStatic() && countPartitionsWithOnlyStaticData)
-                {
-                    previous = current;
-                    return true;
-                }
-            }
-
-            if (!current.isStatic() || countPartitionsWithOnlyStaticData)
-                live++;
-
-            previous = current;
-
-            return true;
-        }
-    }
-
-    /**
-     * Similar to GroupByPrefix, but designed to handle counting cells in reverse order.
-     */
-    public static class GroupByPrefixReversed extends GroupByPrefix
-    {
-        public GroupByPrefixReversed(long timestamp, CellNameType type, int toGroup, boolean countPartitionsWithOnlyStaticData)
-        {
-            // GroupByPrefixReversed ignores countPartitionsWithOnlyStaticData because the original problem (CASSANDRA-11223)
-            // only affect range queries and multi-partition queries. Range queries do not accept an ORDER BY clause.
-            // Multi-partition queries only accept an ORDER BY clause when paging is off. The limit in this case is used
-            // only when the rows with only static data have already been discarded. So, in practice
-            // changing GroupByPrefixReversed.count() has no effect.
-            super(timestamp, type, toGroup, countPartitionsWithOnlyStaticData);
-        }
-
-        @Override
-        public Iterator<Cell> getCellIterator(ColumnFamily container)
-        {
-            return container.reverseIterator();
-        }
-
-        @Override
-        public boolean count(Cell cell, DeletionInfo.InOrderTester tester)
-        {
-            if (tester.isDeleted(cell))
-                return false;
-
-            if (!cell.isLive(timestamp))
-            {
-                tombstones++;
-                return true;
-            }
-
-            if (toGroup == 0)
-            {
-                live = 1;
-                return true;
-            }
-
-            CellName current = cell.name();
-            assert current.size() >= toGroup;
-
-            if (previous == null)
-            {
-                // This is the first group we've seen.  If it happens to be static, we still want to increment the
-                // count because a) there are no-static rows (statics are always last in reversed order), and b) any
-                // static cells we see after this will not increment the count
-                previous = current;
-                live++;
-            }
-            else if (!current.isStatic())  // ignore statics if we've seen any other statics or any other groups
-            {
-                for (int i = 0; i < toGroup; i++)
-                {
-                    if (type.subtype(i).compare(previous.get(i), current.get(i)) != 0)
-                    {
-                        // it's a new group
-                        live++;
-                        previous = current;
-                        return true;
-                    }
-                }
-            }
-
-            return true;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/filter/ColumnFilter.java b/src/java/org/apache/cassandra/db/filter/ColumnFilter.java
new file mode 100644
index 0000000..c28c0ae
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/ColumnFilter.java

@@ -0,0 +1,493 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.util.*;
+
+import com.google.common.collect.SortedSetMultimap;
+import com.google.common.collect.TreeMultimap;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+
+/**
+ * Represents which (non-PK) columns (and optionally which sub-part of a column for complex columns) are selected
+ * by a query.
+ *
+ * In practice, this class cover 2 main cases:
+ *   1) most user queries have to internally query all columns, because the CQL semantic requires us to know if
+ *      a row is live or not even if it has no values for the columns requested by the user (see #6588for more
+ *      details). However, while we need to know for columns if it has live values, we can actually save from
+ *      sending the values for those columns that will not be returned to the user.
+ *   2) for some internal queries (and for queries using #6588 if we introduce it), we're actually fine only
+ *      actually querying some of the columns.
+ *
+ * For complex columns, this class allows to be more fine grained than the column by only selection some of the
+ * cells of the complex column (either individual cell by path name, or some slice).
+ */
+public class ColumnFilter
+{
+    public static final Serializer serializer = new Serializer();
+
+    // Distinguish between the 2 cases described above: if 'isFetchAll' is true, then all columns will be retrieved
+    // by the query, but the values for column/cells not selected by 'queried' and 'subSelections' will be skipped.
+    // Otherwise, only the column/cells returned by 'queried' and 'subSelections' will be returned at all.
+    private final boolean isFetchAll;
+
+    private final PartitionColumns queried; // can be null if isFetchAll and we don't want to skip any value
+    private final PartitionColumns fetched;
+    private final SortedSetMultimap<ColumnIdentifier, ColumnSubselection> subSelections; // can be null
+
+    /**
+     * Used on replica for deserialisation
+     */
+    private ColumnFilter(boolean isFetchAll,
+                         PartitionColumns fetched,
+                         PartitionColumns queried,
+                         SortedSetMultimap<ColumnIdentifier, ColumnSubselection> subSelections)
+    {
+        assert !isFetchAll || fetched != null;
+        assert isFetchAll || queried != null;
+        this.isFetchAll = isFetchAll;
+        this.fetched = isFetchAll ? fetched : queried;
+        this.queried = queried;
+        this.subSelections = subSelections;
+    }
+
+    /**
+     * A selection that includes all columns (and their values).
+     */
+    public static ColumnFilter all(CFMetaData metadata)
+    {
+        return new ColumnFilter(true, metadata.partitionColumns(), null, null);
+    }
+
+    /**
+     * A selection that only fetch the provided columns.
+     * <p>
+     * Note that this shouldn't be used for CQL queries in general as all columns should be queried to
+     * preserve CQL semantic (see class javadoc). This is ok for some internal queries however (and
+     * for #6588 if/when we implement it).
+     */
+    public static ColumnFilter selection(PartitionColumns columns)
+    {
+        return new ColumnFilter(false, null, columns, null);
+    }
+
+	/**
+     * A filter that fetches all columns for the provided table, but returns
+     * only the queried ones.
+     */
+    public static ColumnFilter selection(CFMetaData metadata, PartitionColumns queried)
+    {
+        return new ColumnFilter(true, metadata.partitionColumns(), queried, null);
+    }
+
+    /**
+     * The columns that needs to be fetched internally for this selection.
+     * <p>
+     * This is the columns that must be present in the internal rows returned by queries using this selection,
+     * not the columns that are actually queried by the user (see the class javadoc for details).
+     *
+     * @return the column to fetch for this selection.
+     */
+    public PartitionColumns fetchedColumns()
+    {
+        return fetched;
+    }
+
+    public boolean includesAllColumns()
+    {
+        return isFetchAll;
+    }
+
+    /**
+     * Whether the provided column is selected by this selection.
+     */
+    public boolean includes(ColumnDefinition column)
+    {
+        return isFetchAll || queried.contains(column);
+    }
+
+    /**
+     * Whether we can skip the value for the provided selected column.
+     */
+    public boolean canSkipValue(ColumnDefinition column)
+    {
+        // We don't use that currently, see #10655 for more details.
+        return false;
+    }
+
+    /**
+     * Whether the provided cell of a complex column is selected by this selection.
+     */
+    public boolean includes(Cell cell)
+    {
+        if (isFetchAll || subSelections == null || !cell.column().isComplex())
+            return true;
+
+        SortedSet<ColumnSubselection> s = subSelections.get(cell.column().name);
+        if (s.isEmpty())
+            return true;
+
+        for (ColumnSubselection subSel : s)
+            if (subSel.compareInclusionOf(cell.path()) == 0)
+                return true;
+
+        return false;
+    }
+
+    /**
+     * Whether we can skip the value of the cell of a complex column.
+     */
+    public boolean canSkipValue(ColumnDefinition column, CellPath path)
+    {
+        if (!isFetchAll || subSelections == null || !column.isComplex())
+            return false;
+
+        SortedSet<ColumnSubselection> s = subSelections.get(column.name);
+        if (s.isEmpty())
+            return false;
+
+        for (ColumnSubselection subSel : s)
+            if (subSel.compareInclusionOf(path) == 0)
+                return false;
+
+        return true;
+    }
+
+    /**
+     * Creates a new {@code Tester} to efficiently test the inclusion of cells of complex column
+     * {@code column}.
+     */
+    public Tester newTester(ColumnDefinition column)
+    {
+        if (subSelections == null || !column.isComplex())
+            return null;
+
+        SortedSet<ColumnSubselection> s = subSelections.get(column.name);
+        if (s.isEmpty())
+            return null;
+
+        return new Tester(isFetchAll, s.iterator());
+    }
+
+    /**
+     * Returns a {@code ColumnFilter}} builder that includes all columns (so the selections
+     * added to the builder are the columns/cells for which we shouldn't skip the values).
+     */
+    public static Builder allColumnsBuilder(CFMetaData metadata)
+    {
+        return new Builder(metadata);
+    }
+
+    /**
+     * Returns a {@code ColumnFilter}} builder that includes only the columns/cells
+     * added to the builder.
+     */
+    public static Builder selectionBuilder()
+    {
+        return new Builder(null);
+    }
+
+    public static class Tester
+    {
+        private final boolean isFetchAll;
+        private ColumnSubselection current;
+        private final Iterator<ColumnSubselection> iterator;
+
+        private Tester(boolean isFetchAll, Iterator<ColumnSubselection> iterator)
+        {
+            this.isFetchAll = isFetchAll;
+            this.iterator = iterator;
+        }
+
+        public boolean includes(CellPath path)
+        {
+            return isFetchAll || includedBySubselection(path);
+        }
+
+        public boolean canSkipValue(CellPath path)
+        {
+            return isFetchAll && !includedBySubselection(path);
+        }
+
+        private boolean includedBySubselection(CellPath path)
+        {
+            while (current != null || iterator.hasNext())
+            {
+                if (current == null)
+                    current = iterator.next();
+
+                int cmp = current.compareInclusionOf(path);
+                if (cmp == 0) // The path is included
+                    return true;
+                else if (cmp < 0) // The path is before this sub-selection, it's not included by any
+                    return false;
+
+                // the path is after this sub-selection, we need to check the next one.
+                current = null;
+            }
+            return false;
+        }
+    }
+
+    public static class Builder
+    {
+        private final CFMetaData metadata;
+        private PartitionColumns.Builder selection;
+        private List<ColumnSubselection> subSelections;
+
+        private Builder(CFMetaData metadata)
+        {
+            this.metadata = metadata;
+        }
+
+        public Builder add(ColumnDefinition c)
+        {
+            if (selection == null)
+                selection = PartitionColumns.builder();
+            selection.add(c);
+            return this;
+        }
+
+        public Builder addAll(Iterable<ColumnDefinition> columns)
+        {
+            if (selection == null)
+                selection = PartitionColumns.builder();
+            selection.addAll(columns);
+            return this;
+        }
+
+        private Builder addSubSelection(ColumnSubselection subSelection)
+        {
+            add(subSelection.column());
+            if (subSelections == null)
+                subSelections = new ArrayList<>();
+            subSelections.add(subSelection);
+            return this;
+        }
+
+        public Builder slice(ColumnDefinition c, CellPath from, CellPath to)
+        {
+            return addSubSelection(ColumnSubselection.slice(c, from, to));
+        }
+
+        public Builder select(ColumnDefinition c, CellPath elt)
+        {
+            return addSubSelection(ColumnSubselection.element(c, elt));
+        }
+
+        public ColumnFilter build()
+        {
+            boolean isFetchAll = metadata != null;
+
+            PartitionColumns selectedColumns = selection == null ? null : selection.build();
+            // It's only ok to have queried == null in ColumnFilter if isFetchAll. So deal with the case of a "selection" builder
+            // with nothing selected (we can at least happen on some backward compatible queries - CASSANDRA-10471).
+            if (!isFetchAll && selectedColumns == null)
+                selectedColumns = PartitionColumns.NONE;
+
+            SortedSetMultimap<ColumnIdentifier, ColumnSubselection> s = null;
+            if (subSelections != null)
+            {
+                s = TreeMultimap.create(Comparator.<ColumnIdentifier>naturalOrder(), Comparator.<ColumnSubselection>naturalOrder());
+                for (ColumnSubselection subSelection : subSelections)
+                    s.put(subSelection.column().name, subSelection);
+            }
+
+            return new ColumnFilter(isFetchAll, isFetchAll ? metadata.partitionColumns() : selectedColumns, selectedColumns, s);
+        }
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (other == this)
+            return true;
+
+        if (!(other instanceof ColumnFilter))
+            return false;
+
+        ColumnFilter otherCf = (ColumnFilter) other;
+
+        return otherCf.isFetchAll == this.isFetchAll &&
+               Objects.equals(otherCf.fetched, this.fetched) &&
+               Objects.equals(otherCf.queried, this.queried) &&
+               Objects.equals(otherCf.subSelections, this.subSelections);
+
+    }
+    @Override
+    public String toString()
+    {
+        if (isFetchAll)
+            return "*";
+
+        if (queried.isEmpty())
+            return "";
+
+        Iterator<ColumnDefinition> defs = queried.selectOrderIterator();
+        if (!defs.hasNext())
+            return "<none>";
+
+        StringBuilder sb = new StringBuilder();
+        appendColumnDef(sb, defs.next());
+        while (defs.hasNext())
+            appendColumnDef(sb.append(", "), defs.next());
+        return sb.toString();
+    }
+
+    private void appendColumnDef(StringBuilder sb, ColumnDefinition column)
+    {
+        if (subSelections == null)
+        {
+            sb.append(column.name);
+            return;
+        }
+
+        SortedSet<ColumnSubselection> s = subSelections.get(column.name);
+        if (s.isEmpty())
+        {
+            sb.append(column.name);
+            return;
+        }
+
+        int i = 0;
+        for (ColumnSubselection subSel : s)
+            sb.append(i++ == 0 ? "" : ", ").append(column.name).append(subSel);
+    }
+
+    public static class Serializer
+    {
+        private static final int IS_FETCH_ALL_MASK       = 0x01;
+        private static final int HAS_SELECTION_MASK      = 0x02;
+        private static final int HAS_SUB_SELECTIONS_MASK = 0x04;
+
+        private static int makeHeaderByte(ColumnFilter selection)
+        {
+            return (selection.isFetchAll ? IS_FETCH_ALL_MASK : 0)
+                 | (selection.queried != null ? HAS_SELECTION_MASK : 0)
+                 | (selection.subSelections != null ? HAS_SUB_SELECTIONS_MASK : 0);
+        }
+
+        public void serialize(ColumnFilter selection, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeByte(makeHeaderByte(selection));
+
+            if (version >= MessagingService.VERSION_3014 && selection.isFetchAll)
+            {
+                Columns.serializer.serialize(selection.fetched.statics, out);
+                Columns.serializer.serialize(selection.fetched.regulars, out);
+            }
+
+            if (selection.queried != null)
+            {
+                Columns.serializer.serialize(selection.queried.statics, out);
+                Columns.serializer.serialize(selection.queried.regulars, out);
+            }
+
+            if (selection.subSelections != null)
+            {
+                out.writeUnsignedVInt(selection.subSelections.size());
+                for (ColumnSubselection subSel : selection.subSelections.values())
+                    ColumnSubselection.serializer.serialize(subSel, out, version);
+            }
+        }
+
+        public ColumnFilter deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+        {
+            int header = in.readUnsignedByte();
+            boolean isFetchAll = (header & IS_FETCH_ALL_MASK) != 0;
+            boolean hasSelection = (header & HAS_SELECTION_MASK) != 0;
+            boolean hasSubSelections = (header & HAS_SUB_SELECTIONS_MASK) != 0;
+
+            PartitionColumns fetched = null;
+            PartitionColumns selection = null;
+
+            if (isFetchAll)
+            {
+                if (version >= MessagingService.VERSION_3014)
+                {
+                    Columns statics = Columns.serializer.deserialize(in, metadata);
+                    Columns regulars = Columns.serializer.deserialize(in, metadata);
+                    fetched = new PartitionColumns(statics, regulars);
+                }
+                else
+                {
+                    fetched = metadata.partitionColumns();
+                }
+            }
+
+            if (hasSelection)
+            {
+                Columns statics = Columns.serializer.deserialize(in, metadata);
+                Columns regulars = Columns.serializer.deserialize(in, metadata);
+                selection = new PartitionColumns(statics, regulars);
+            }
+
+            SortedSetMultimap<ColumnIdentifier, ColumnSubselection> subSelections = null;
+            if (hasSubSelections)
+            {
+                subSelections = TreeMultimap.create(Comparator.<ColumnIdentifier>naturalOrder(), Comparator.<ColumnSubselection>naturalOrder());
+                int size = (int)in.readUnsignedVInt();
+                for (int i = 0; i < size; i++)
+                {
+                    ColumnSubselection subSel = ColumnSubselection.serializer.deserialize(in, version, metadata);
+                    subSelections.put(subSel.column().name, subSel);
+                }
+            }
+
+            return new ColumnFilter(isFetchAll, fetched, selection, subSelections);
+        }
+
+        public long serializedSize(ColumnFilter selection, int version)
+        {
+            long size = 1; // header byte
+
+            if (version >= MessagingService.VERSION_3014 && selection.isFetchAll)
+            {
+                size += Columns.serializer.serializedSize(selection.fetched.statics);
+                size += Columns.serializer.serializedSize(selection.fetched.regulars);
+            }
+
+            if (selection.queried != null)
+            {
+                size += Columns.serializer.serializedSize(selection.queried.statics);
+                size += Columns.serializer.serializedSize(selection.queried.regulars);
+            }
+
+            if (selection.subSelections != null)
+            {
+
+                size += TypeSizes.sizeofUnsignedVInt(selection.subSelections.size());
+                for (ColumnSubselection subSel : selection.subSelections.values())
+                    size += ColumnSubselection.serializer.serializedSize(subSel, version);
+            }
+
+            return size;
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/db/filter/ColumnSlice.java b/src/java/org/apache/cassandra/db/filter/ColumnSlice.java
deleted file mode 100644
index 316226d..0000000
--- a/src/java/org/apache/cassandra/db/filter/ColumnSlice.java
+++ /dev/null

@@ -1,292 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class ColumnSlice
-{
-    public static final ColumnSlice ALL_COLUMNS = new ColumnSlice(Composites.EMPTY, Composites.EMPTY);
-    public static final ColumnSlice[] ALL_COLUMNS_ARRAY = new ColumnSlice[]{ ALL_COLUMNS };
-
-    public final Composite start;
-    public final Composite finish;
-
-    public ColumnSlice(Composite start, Composite finish)
-    {
-        assert start != null && finish != null;
-        this.start = start;
-        this.finish = finish;
-    }
-
-    public boolean isAlwaysEmpty(CellNameType comparator, boolean reversed)
-    {
-        Comparator<Composite> orderedComparator = reversed ? comparator.reverseComparator() : comparator;
-        return !start.isEmpty() && !finish.isEmpty() && orderedComparator.compare(start, finish) > 0;
-    }
-
-    public boolean includes(Comparator<Composite> cmp, Composite name)
-    {
-        return (start.isEmpty() || cmp.compare(start, name) <= 0) && (finish.isEmpty() || cmp.compare(finish, name) >= 0);
-    }
-
-    public boolean isBefore(Comparator<Composite> cmp, Composite name)
-    {
-        return !finish.isEmpty() && cmp.compare(finish, name) < 0;
-    }
-
-    public boolean intersects(List<ByteBuffer> minCellNames, List<ByteBuffer> maxCellNames, CellNameType comparator, boolean reversed)
-    {
-        Composite sStart = reversed ? finish : start;
-        Composite sEnd = reversed ? start : finish;
-
-        // don't compare static slice bounds with min/max cell names to determine intersection - that can yield unexpected
-        // results, in particular with ReverseType comparators; see CASSANDRA-14910 for more context.
-        if ((!sStart.isStatic() && compare(sStart, maxCellNames, comparator, true) > 0)
-         || (!sEnd.isStatic() && compare(sEnd, minCellNames, comparator, false) < 0))
-            return false;
-
-        // We could safely return true here, but there's a minor optimization: if the first component is restricted
-        // to a single value, we can check that the second component falls within the min/max for that component
-        // (and repeat for all components).
-        for (int i = 0; i < minCellNames.size() && i < maxCellNames.size(); i++)
-        {
-            AbstractType<?> t = comparator.subtype(i);
-            ByteBuffer s = i < sStart.size() ? sStart.get(i) : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-            ByteBuffer f = i < sEnd.size() ? sEnd.get(i) : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-
-            // we already know the first component falls within its min/max range (otherwise we wouldn't get here)
-            if (i > 0 && (i < sEnd.size() && t.compare(f, minCellNames.get(i)) < 0 ||
-                          i < sStart.size() && t.compare(s, maxCellNames.get(i)) > 0))
-                return false;
-
-            // if this component isn't equal in the start and finish, we don't need to check any more
-            if (i >= sStart.size() || i >= sEnd.size() || t.compare(s, f) != 0)
-                break;
-        }
-
-        return true;
-    }
-
-    /** Helper method for intersects() */
-    private int compare(Composite sliceBounds, List<ByteBuffer> sstableBounds, CellNameType comparator, boolean isSliceStart)
-    {
-        for (int i = 0; i < sstableBounds.size(); i++)
-        {
-            if (i >= sliceBounds.size())
-            {
-                // When isSliceStart is true, we're comparing the end of the slice against the min cell name for the sstable,
-                // so the slice is something like [(1, 0), (1, 0)], and the sstable max is something like (1, 0, 1).
-                // We want to return -1 (slice start is smaller than max column name) so that we say the slice intersects.
-                // The opposite is true when dealing with the end slice.  For example, with the same slice and a min
-                // cell name of (1, 0, 1), we want to return 1 (slice end is bigger than min column name).
-                return isSliceStart ? -1 : 1;
-            }
-
-            int comparison = comparator.subtype(i).compare(sliceBounds.get(i), sstableBounds.get(i));
-            if (comparison != 0)
-                return comparison;
-        }
-
-        // the slice bound and sstable bound have been equal in all components so far
-        if (sliceBounds.size() > sstableBounds.size())
-        {
-            // We have the opposite situation from the one described above.  With a slice of [(1, 0), (1, 0)],
-            // and a min/max cell name of (1), we want to say the slice start is smaller than the max and the slice
-            // end is larger than the min.
-            return isSliceStart ? -1 : 1;
-        }
-
-        return 0;
-    }
-
-    /**
-     * Validates that the provided slice array contains only non-overlapped slices valid for a query {@code reversed}
-     * or not on a table using {@code comparator}.
-     */
-    public static boolean validateSlices(ColumnSlice[] slices, CellNameType type, boolean reversed)
-    {
-        Comparator<Composite> comparator = reversed ? type.reverseComparator() : type;
-
-        for (int i = 0; i < slices.length; i++)
-        {
-            Composite start = slices[i].start;
-            Composite finish = slices[i].finish;
-
-            if (start.isEmpty() || finish.isEmpty())
-            {
-                if (start.isEmpty() && i > 0)
-                    return false;
-
-                if (finish.isEmpty())
-                    return i == slices.length - 1;
-            }
-            else
-            {
-                // !finish.isEmpty() is imposed by prior loop
-                if (i > 0 && comparator.compare(slices[i - 1].finish, start) >= 0)
-                    return false;
-
-                if (comparator.compare(start, finish) > 0)
-                    return false;
-            }
-        }
-        return true;
-    }
-
-    /**
-     * Takes an array of slices (potentially overlapping and in any order, though each individual slice must have
-     * its start before or equal its end in {@code comparator} orde) and return an equivalent array of non-overlapping
-     * slices in {@code comparator order}.
-     *
-     * @param slices an array of slices. This may be modified by this method.
-     * @param comparator the order in which to sort the slices.
-     * @return the smallest possible array of non-overlapping slices in {@code compator} order. If the original
-     * slices are already non-overlapping and in comparator order, this may or may not return the provided slices
-     * directly.
-     */
-    public static ColumnSlice[] deoverlapSlices(ColumnSlice[] slices, final Comparator<Composite> comparator)
-    {
-        if (slices.length <= 1)
-            return slices;
-
-        Arrays.sort(slices, new Comparator<ColumnSlice>()
-        {
-            @Override
-            public int compare(ColumnSlice s1, ColumnSlice s2)
-            {
-                if (s1.start.isEmpty() || s2.start.isEmpty())
-                {
-                    if (s1.start.isEmpty() != s2.start.isEmpty())
-                        return s1.start.isEmpty() ? -1 : 1;
-                }
-                else
-                {
-                    int c = comparator.compare(s1.start, s2.start);
-                    if (c != 0)
-                        return c;
-                }
-
-                // For the finish, empty always means greater
-                return s1.finish.isEmpty() || s2.finish.isEmpty()
-                     ? (s1.finish.isEmpty() ? 1 : -1)
-                     : comparator.compare(s1.finish, s2.finish);
-            }
-        });
-
-        List<ColumnSlice> slicesCopy = new ArrayList<>(slices.length);
-
-        ColumnSlice last = slices[0];
-
-        for (int i = 1; i < slices.length; i++)
-        {
-            ColumnSlice s2 = slices[i];
-
-            boolean includesStart = last.includes(comparator, s2.start);
-            boolean includesFinish = s2.finish.isEmpty() ? last.finish.isEmpty() : last.includes(comparator, s2.finish);
-
-            if (includesStart && includesFinish)
-                continue;
-
-            if (!includesStart && !includesFinish)
-            {
-                slicesCopy.add(last);
-                last = s2;
-                continue;
-            }
-
-            if (includesStart)
-            {
-                last = new ColumnSlice(last.start, s2.finish);
-                continue;
-            }
-
-            assert !includesFinish;
-        }
-
-        slicesCopy.add(last);
-
-        return slicesCopy.toArray(new ColumnSlice[slicesCopy.size()]);
-    }
-
-    @Override
-    public final int hashCode()
-    {
-        int hashCode = 31 + start.hashCode();
-        return 31*hashCode + finish.hashCode();
-    }
-
-    @Override
-    public final boolean equals(Object o)
-    {
-        if(!(o instanceof ColumnSlice))
-            return false;
-        ColumnSlice that = (ColumnSlice)o;
-        return start.equals(that.start) && finish.equals(that.finish);
-    }
-
-    @Override
-    public String toString()
-    {
-        return "[" + ByteBufferUtil.bytesToHex(start.toByteBuffer()) + ", " + ByteBufferUtil.bytesToHex(finish.toByteBuffer()) + "]";
-    }
-
-    public static class Serializer implements IVersionedSerializer<ColumnSlice>
-    {
-        private final CType type;
-
-        public Serializer(CType type)
-        {
-            this.type = type;
-        }
-
-        public void serialize(ColumnSlice cs, DataOutputPlus out, int version) throws IOException
-        {
-            ISerializer<Composite> serializer = type.serializer();
-            serializer.serialize(cs.start, out);
-            serializer.serialize(cs.finish, out);
-        }
-
-        public ColumnSlice deserialize(DataInput in, int version) throws IOException
-        {
-            ISerializer<Composite> serializer = type.serializer();
-            Composite start = serializer.deserialize(in);
-            Composite finish = serializer.deserialize(in);
-            return new ColumnSlice(start, finish);
-        }
-
-        public long serializedSize(ColumnSlice cs, int version)
-        {
-            ISerializer<Composite> serializer = type.serializer();
-            return serializer.serializedSize(cs.start, TypeSizes.NATIVE) + serializer.serializedSize(cs.finish, TypeSizes.NATIVE);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/filter/ColumnSubselection.java b/src/java/org/apache/cassandra/db/filter/ColumnSubselection.java
new file mode 100644
index 0000000..b762fa5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/ColumnSubselection.java

@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Handles the selection of a subpart of a column.
+ * <p>
+ * This only make sense for complex column. For those, this allow for instance
+ * to select only a slice of a map.
+ */
+public abstract class ColumnSubselection implements Comparable<ColumnSubselection>
+{
+    public static final Serializer serializer = new Serializer();
+
+    private enum Kind { SLICE, ELEMENT }
+
+    protected final ColumnDefinition column;
+
+    protected ColumnSubselection(ColumnDefinition column)
+    {
+        this.column = column;
+    }
+
+    public static ColumnSubselection slice(ColumnDefinition column, CellPath from, CellPath to)
+    {
+        assert column.isComplex() && column.type instanceof CollectionType;
+        assert from.size() <= 1 && to.size() <= 1;
+        return new Slice(column, from, to);
+    }
+
+    public static ColumnSubselection element(ColumnDefinition column, CellPath elt)
+    {
+        assert column.isComplex() && column.type instanceof CollectionType;
+        assert elt.size() == 1;
+        return new Element(column, elt);
+    }
+
+    public ColumnDefinition column()
+    {
+        return column;
+    }
+
+    protected abstract Kind kind();
+
+    protected abstract CellPath comparisonPath();
+
+    public int compareTo(ColumnSubselection other)
+    {
+        assert other.column().name.equals(column().name);
+        return column().cellPathComparator().compare(comparisonPath(), other.comparisonPath());
+    }
+
+    /**
+     * Given a path, return -1 if the path is before anything selected by this subselection, 0 if it is selected by this
+     * subselection and 1 if the path is after anything selected by this subselection.
+     */
+    public abstract int compareInclusionOf(CellPath path);
+
+    private static class Slice extends ColumnSubselection
+    {
+        private final CellPath from;
+        private final CellPath to;
+
+        private Slice(ColumnDefinition column, CellPath from, CellPath to)
+        {
+            super(column);
+            this.from = from;
+            this.to = to;
+        }
+
+        protected Kind kind()
+        {
+            return Kind.SLICE;
+        }
+
+        public CellPath comparisonPath()
+        {
+            return from;
+        }
+
+        public int compareInclusionOf(CellPath path)
+        {
+            Comparator<CellPath> cmp = column.cellPathComparator();
+            if (cmp.compare(path, from) < 0)
+                return -1;
+            else if (cmp.compare(to, path) < 0)
+                return 1;
+            else
+                return 0;
+        }
+
+        @Override
+        public String toString()
+        {
+            // This assert we're dealing with a collection since that's the only thing it's used for so far.
+            AbstractType<?> type = ((CollectionType<?>)column().type).nameComparator();
+            return String.format("[%s:%s]", from == CellPath.BOTTOM ? "" : type.getString(from.get(0)), to == CellPath.TOP ? "" : type.getString(to.get(0)));
+        }
+    }
+
+    private static class Element extends ColumnSubselection
+    {
+        private final CellPath element;
+
+        private Element(ColumnDefinition column, CellPath elt)
+        {
+            super(column);
+            this.element = elt;
+        }
+
+        protected Kind kind()
+        {
+            return Kind.ELEMENT;
+        }
+
+        public CellPath comparisonPath()
+        {
+            return element;
+        }
+
+        public int compareInclusionOf(CellPath path)
+        {
+            return column.cellPathComparator().compare(path, element);
+        }
+
+        @Override
+        public String toString()
+        {
+            // This assert we're dealing with a collection since that's the only thing it's used for so far.
+            AbstractType<?> type = ((CollectionType<?>)column().type).nameComparator();
+            return String.format("[%s]", type.getString(element.get(0)));
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(ColumnSubselection subSel, DataOutputPlus out, int version) throws IOException
+        {
+            ColumnDefinition column = subSel.column();
+            ByteBufferUtil.writeWithShortLength(column.name.bytes, out);
+            out.writeByte(subSel.kind().ordinal());
+            switch (subSel.kind())
+            {
+                case SLICE:
+                    Slice slice = (Slice)subSel;
+                    column.cellPathSerializer().serialize(slice.from, out);
+                    column.cellPathSerializer().serialize(slice.to, out);
+                    break;
+                case ELEMENT:
+                    Element eltSelection = (Element)subSel;
+                    column.cellPathSerializer().serialize(eltSelection.element, out);
+                    break;
+                default:
+                    throw new AssertionError();
+            }
+        }
+
+        public ColumnSubselection deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+        {
+            ByteBuffer name = ByteBufferUtil.readWithShortLength(in);
+            ColumnDefinition column = metadata.getColumnDefinition(name);
+            if (column == null)
+            {
+                // If we don't find the definition, it could be we have data for a dropped column, and we shouldn't
+                // fail deserialization because of that. So we grab a "fake" ColumnDefinition that ensure proper
+                // deserialization. The column will be ignore later on anyway.
+                column = metadata.getDroppedColumnDefinition(name);
+                if (column == null)
+                    throw new RuntimeException("Unknown column " + UTF8Type.instance.getString(name) + " during deserialization");
+            }
+
+            Kind kind = Kind.values()[in.readUnsignedByte()];
+            switch (kind)
+            {
+                case SLICE:
+                    CellPath from = column.cellPathSerializer().deserialize(in);
+                    CellPath to = column.cellPathSerializer().deserialize(in);
+                    return new Slice(column, from, to);
+                case ELEMENT:
+                    CellPath elt = column.cellPathSerializer().deserialize(in);
+                    return new Element(column, elt);
+            }
+            throw new AssertionError();
+        }
+
+        public long serializedSize(ColumnSubselection subSel, int version)
+        {
+            long size = 0;
+
+            ColumnDefinition column = subSel.column();
+            size += TypeSizes.sizeofWithShortLength(column.name.bytes);
+            size += 1; // kind
+            switch (subSel.kind())
+            {
+                case SLICE:
+                    Slice slice = (Slice)subSel;
+                    size += column.cellPathSerializer().serializedSize(slice.from);
+                    size += column.cellPathSerializer().serializedSize(slice.to);
+                    break;
+                case ELEMENT:
+                    Element element = (Element)subSel;
+                    size += column.cellPathSerializer().serializedSize(element.element);
+                    break;
+            }
+            return size;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/DataLimits.java b/src/java/org/apache/cassandra/db/filter/DataLimits.java
new file mode 100644
index 0000000..fa9d47a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/DataLimits.java

@@ -0,0 +1,880 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.transform.BasePartitions;
+import org.apache.cassandra.db.transform.BaseRows;
+import org.apache.cassandra.db.transform.StoppingTransformation;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Object in charge of tracking if we have fetch enough data for a given query.
+ *
+ * The reason this is not just a simple integer is that Thrift and CQL3 count
+ * stuffs in different ways. This is what abstract those differences.
+ */
+public abstract class DataLimits
+{
+    public static final Serializer serializer = new Serializer();
+
+    public static final int NO_LIMIT = Integer.MAX_VALUE;
+
+    public static final DataLimits NONE = new CQLLimits(NO_LIMIT)
+    {
+        @Override
+        public boolean hasEnoughLiveData(CachedPartition cached, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            return false;
+        }
+
+        @Override
+        public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator iter,
+                                                  int nowInSec,
+                                                  boolean countPartitionsWithOnlyStaticData)
+        {
+            return iter;
+        }
+
+        @Override
+        public UnfilteredRowIterator filter(UnfilteredRowIterator iter,
+                                            int nowInSec,
+                                            boolean countPartitionsWithOnlyStaticData)
+        {
+            return iter;
+        }
+
+        @Override
+        public PartitionIterator filter(PartitionIterator iter, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            return iter;
+        }
+    };
+
+    // We currently deal with distinct queries by querying full partitions but limiting the result at 1 row per
+    // partition (see SelectStatement.makeFilter). So an "unbounded" distinct is still actually doing some filtering.
+    public static final DataLimits DISTINCT_NONE = new CQLLimits(NO_LIMIT, 1, true);
+
+    public enum Kind { CQL_LIMIT, CQL_PAGING_LIMIT, THRIFT_LIMIT, SUPER_COLUMN_COUNTING_LIMIT }
+
+    public static DataLimits cqlLimits(int cqlRowLimit)
+    {
+        return new CQLLimits(cqlRowLimit);
+    }
+
+    // mixed mode partition range scans on compact storage tables without clustering columns coordinated by 2.x are
+    // returned as one (cql) row per cell, but we need to count each partition as a single row. So we just return a
+    // CQLLimits instance that doesn't count rows towards it's limit. See CASSANDRA-15072
+    public static DataLimits legacyCompactStaticCqlLimits(int cqlRowLimits)
+    {
+        return new CQLLimits(cqlRowLimits) {
+            public Counter newCounter(int nowInSec, boolean assumeLiveData, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+            {
+                return new CQLCounter(nowInSec, assumeLiveData, countPartitionsWithOnlyStaticData, enforceStrictLiveness) {
+                    public Row applyToRow(Row row)
+                    {
+                        // noop: only count full partitions
+                        return row;
+                    }
+                };
+            }
+        };
+    }
+
+    public static DataLimits cqlLimits(int cqlRowLimit, int perPartitionLimit)
+    {
+        return new CQLLimits(cqlRowLimit, perPartitionLimit);
+    }
+
+    public static DataLimits distinctLimits(int cqlRowLimit)
+    {
+        return CQLLimits.distinct(cqlRowLimit);
+    }
+
+    public static DataLimits thriftLimits(int partitionLimit, int cellPerPartitionLimit)
+    {
+        return new ThriftLimits(partitionLimit, cellPerPartitionLimit);
+    }
+
+    public static DataLimits superColumnCountingLimits(int partitionLimit, int cellPerPartitionLimit)
+    {
+        return new SuperColumnCountingLimits(partitionLimit, cellPerPartitionLimit);
+    }
+
+    public abstract Kind kind();
+
+    public abstract boolean isUnlimited();
+    public abstract boolean isDistinct();
+
+    public abstract DataLimits forPaging(int pageSize);
+    public abstract DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining);
+
+    public abstract DataLimits forShortReadRetry(int toFetch);
+
+    public abstract boolean hasEnoughLiveData(CachedPartition cached,
+                                              int nowInSec,
+                                              boolean countPartitionsWithOnlyStaticData,
+                                              boolean enforceStrictLiveness);
+
+    /**
+     * Returns a new {@code Counter} for this limits.
+     *
+     * @param nowInSec the current time in second (to decide what is expired or not).
+     * @param assumeLiveData if true, the counter will assume that every row passed is live and won't
+     * thus check for liveness, otherwise it will. This should be {@code true} when used on a
+     * {@code RowIterator} (since it only returns live rows), false otherwise.
+     * @param countPartitionsWithOnlyStaticData if {@code true} the partitions with only static data should be counted
+     * as 1 valid row.
+     * @param enforceStrictLiveness whether the row should be purged if there is no PK liveness info,
+     *                              normally retrieved from {@link CFMetaData#enforceStrictLiveness()}
+     * @return a new {@code Counter} for this limits.
+     */
+    public abstract Counter newCounter(int nowInSec,
+                                       boolean assumeLiveData,
+                                       boolean countPartitionsWithOnlyStaticData,
+                                       boolean enforceStrictLiveness);
+
+    /**
+     * The max number of results this limits enforces.
+     * <p>
+     * Note that the actual definition of "results" depends a bit: for CQL, it's always rows, but for
+     * thrift, it means cells.
+     *
+     * @return the maximum number of results this limits enforces.
+     */
+    public abstract int count();
+
+    public abstract int perPartitionCount();
+
+    public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator iter,
+                                              int nowInSec,
+                                              boolean countPartitionsWithOnlyStaticData)
+    {
+        return this.newCounter(nowInSec,
+                               false,
+                               countPartitionsWithOnlyStaticData,
+                               iter.metadata().enforceStrictLiveness())
+                   .applyTo(iter);
+    }
+
+    public UnfilteredRowIterator filter(UnfilteredRowIterator iter,
+                                        int nowInSec,
+                                        boolean countPartitionsWithOnlyStaticData)
+    {
+        return this.newCounter(nowInSec,
+                               false,
+                               countPartitionsWithOnlyStaticData,
+                               iter.metadata().enforceStrictLiveness())
+                   .applyTo(iter);
+    }
+
+    public PartitionIterator filter(PartitionIterator iter, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+    {
+        return this.newCounter(nowInSec, true, countPartitionsWithOnlyStaticData, enforceStrictLiveness).applyTo(iter);
+    }
+
+    /**
+     * Estimate the number of results (the definition of "results" will be rows for CQL queries
+     * and partitions for thrift ones) that a full scan of the provided cfs would yield.
+     */
+    public abstract float estimateTotalResults(ColumnFamilyStore cfs);
+
+    public static abstract class Counter extends StoppingTransformation<BaseRowIterator<?>>
+    {
+        // false means we do not propagate our stop signals onto the iterator, we only count
+        private boolean enforceLimits = true;
+
+        public Counter onlyCount()
+        {
+            this.enforceLimits = false;
+            return this;
+        }
+
+        public PartitionIterator applyTo(PartitionIterator partitions)
+        {
+            return Transformation.apply(partitions, this);
+        }
+
+        public UnfilteredPartitionIterator applyTo(UnfilteredPartitionIterator partitions)
+        {
+            return Transformation.apply(partitions, this);
+        }
+
+        public UnfilteredRowIterator applyTo(UnfilteredRowIterator partition)
+        {
+            return (UnfilteredRowIterator) applyToPartition(partition);
+        }
+
+        public RowIterator applyTo(RowIterator partition)
+        {
+            return (RowIterator) applyToPartition(partition);
+        }
+
+        /**
+         * The number of results counted.
+         * <p>
+         * Note that the definition of "results" should be the same that for {@link #count}.
+         *
+         * @return the number of results counted.
+         */
+        public abstract int counted();
+        public abstract int countedInCurrentPartition();
+
+        public abstract boolean isDone();
+        public abstract boolean isDoneForPartition();
+
+        @Override
+        protected BaseRowIterator<?> applyToPartition(BaseRowIterator<?> partition)
+        {
+            return partition instanceof UnfilteredRowIterator ? Transformation.apply((UnfilteredRowIterator) partition, this)
+                                                              : Transformation.apply((RowIterator) partition, this);
+        }
+
+        // called before we process a given partition
+        protected abstract void applyToPartition(DecoratedKey partitionKey, Row staticRow);
+
+        @Override
+        protected void attachTo(BasePartitions partitions)
+        {
+            if (enforceLimits)
+                super.attachTo(partitions);
+            if (isDone())
+                stop();
+        }
+
+        @Override
+        protected void attachTo(BaseRows rows)
+        {
+            if (enforceLimits)
+                super.attachTo(rows);
+            applyToPartition(rows.partitionKey(), rows.staticRow());
+            if (isDoneForPartition())
+                stopInPartition();
+        }
+    }
+
+    /**
+     * Limits used by CQL; this counts rows.
+     */
+    private static class CQLLimits extends DataLimits
+    {
+        protected final int rowLimit;
+        protected final int perPartitionLimit;
+
+        // Whether the query is a distinct query or not.
+        protected final boolean isDistinct;
+
+        private CQLLimits(int rowLimit)
+        {
+            this(rowLimit, NO_LIMIT);
+        }
+
+        private CQLLimits(int rowLimit, int perPartitionLimit)
+        {
+            this(rowLimit, perPartitionLimit, false);
+        }
+
+        private CQLLimits(int rowLimit, int perPartitionLimit, boolean isDistinct)
+        {
+            this.rowLimit = rowLimit;
+            this.perPartitionLimit = perPartitionLimit;
+            this.isDistinct = isDistinct;
+        }
+
+        private static CQLLimits distinct(int rowLimit)
+        {
+            return new CQLLimits(rowLimit, 1, true);
+        }
+
+        public Kind kind()
+        {
+            return Kind.CQL_LIMIT;
+        }
+
+        public boolean isUnlimited()
+        {
+            return rowLimit == NO_LIMIT && perPartitionLimit == NO_LIMIT;
+        }
+
+        public boolean isDistinct()
+        {
+            return isDistinct;
+        }
+
+        public DataLimits forPaging(int pageSize)
+        {
+            return new CQLLimits(pageSize, perPartitionLimit, isDistinct);
+        }
+
+        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        {
+            return new CQLPagingLimits(pageSize, perPartitionLimit, isDistinct, lastReturnedKey, lastReturnedKeyRemaining);
+        }
+
+        public DataLimits forShortReadRetry(int toFetch)
+        {
+            return new CQLLimits(toFetch, perPartitionLimit, isDistinct);
+        }
+
+        public boolean hasEnoughLiveData(CachedPartition cached, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            // We want the number of row that are currently live. Getting that precise number forces
+            // us to iterate the cached partition in general, but we can avoid that if:
+            //   - The number of rows with at least one non-expiring cell is greater than what we ask,
+            //     in which case we know we have enough live.
+            //   - The number of rows is less than requested, in which case we  know we won't have enough.
+            if (cached.rowsWithNonExpiringCells() >= rowLimit)
+                return true;
+
+            if (cached.rowCount() < rowLimit)
+                return false;
+
+            // Otherwise, we need to re-count
+
+            DataLimits.Counter counter = newCounter(nowInSec, false, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
+            try (UnfilteredRowIterator cacheIter = cached.unfilteredIterator(ColumnFilter.selection(cached.columns()), Slices.ALL, false);
+                 UnfilteredRowIterator iter = counter.applyTo(cacheIter))
+            {
+                // Consume the iterator until we've counted enough
+                while (iter.hasNext())
+                    iter.next();
+                return counter.isDone();
+            }
+        }
+
+        public Counter newCounter(int nowInSec,
+                                  boolean assumeLiveData,
+                                  boolean countPartitionsWithOnlyStaticData,
+                                  boolean enforceStrictLiveness)
+        {
+            return new CQLCounter(nowInSec, assumeLiveData, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
+        }
+
+        public int count()
+        {
+            return rowLimit;
+        }
+
+        public int perPartitionCount()
+        {
+            return perPartitionLimit;
+        }
+
+        public float estimateTotalResults(ColumnFamilyStore cfs)
+        {
+            // TODO: we should start storing stats on the number of rows (instead of the number of cells, which
+            // is what getMeanColumns returns)
+            float rowsPerPartition = ((float) cfs.getMeanColumns()) / cfs.metadata.partitionColumns().regulars.size();
+            return rowsPerPartition * (cfs.estimateKeys());
+        }
+
+        protected class CQLCounter extends Counter
+        {
+            protected final int nowInSec;
+            protected final boolean assumeLiveData;
+            protected final boolean countPartitionsWithOnlyStaticData;
+
+            protected int rowCounted;
+            protected int rowInCurrentPartition;
+
+            protected boolean hasLiveStaticRow;
+            private final boolean enforceStrictLiveness;
+
+            public CQLCounter(int nowInSec,
+                              boolean assumeLiveData,
+                              boolean countPartitionsWithOnlyStaticData,
+                              boolean enforceStrictLiveness)
+            {
+                this.nowInSec = nowInSec;
+                this.assumeLiveData = assumeLiveData;
+                this.countPartitionsWithOnlyStaticData = countPartitionsWithOnlyStaticData;
+                this.enforceStrictLiveness = enforceStrictLiveness;
+            }
+
+            @Override
+            public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
+            {
+                rowInCurrentPartition = 0;
+                hasLiveStaticRow = !staticRow.isEmpty() && (assumeLiveData || staticRow.hasLiveData(nowInSec, enforceStrictLiveness));
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                if (assumeLiveData || row.hasLiveData(nowInSec, enforceStrictLiveness))
+                    incrementRowCount();
+                return row;
+            }
+
+            @Override
+            public void onPartitionClose()
+            {
+                // Normally, we don't count static rows as from a CQL point of view, it will be merge with other
+                // rows in the partition. However, if we only have the static row, it will be returned as one row
+                // so count it.
+                if (countPartitionsWithOnlyStaticData && hasLiveStaticRow && rowInCurrentPartition == 0)
+                    incrementRowCount();
+                super.onPartitionClose();
+            }
+
+            private void incrementRowCount()
+            {
+                if (++rowCounted >= rowLimit)
+                    stop();
+                if (++rowInCurrentPartition >= perPartitionLimit)
+                    stopInPartition();
+            }
+
+            public int counted()
+            {
+                return rowCounted;
+            }
+
+            public int countedInCurrentPartition()
+            {
+                return rowInCurrentPartition;
+            }
+
+            public boolean isDone()
+            {
+                return rowCounted >= rowLimit;
+            }
+
+            public boolean isDoneForPartition()
+            {
+                return isDone() || rowInCurrentPartition >= perPartitionLimit;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+
+            if (rowLimit != NO_LIMIT)
+            {
+                sb.append("LIMIT ").append(rowLimit);
+                if (perPartitionLimit != NO_LIMIT)
+                    sb.append(' ');
+            }
+
+            if (perPartitionLimit != NO_LIMIT)
+                sb.append("PER PARTITION LIMIT ").append(perPartitionLimit);
+
+            return sb.toString();
+        }
+    }
+
+    private static class CQLPagingLimits extends CQLLimits
+    {
+        private final ByteBuffer lastReturnedKey;
+        private final int lastReturnedKeyRemaining;
+
+        public CQLPagingLimits(int rowLimit, int perPartitionLimit, boolean isDistinct, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        {
+            super(rowLimit, perPartitionLimit, isDistinct);
+            this.lastReturnedKey = lastReturnedKey;
+            this.lastReturnedKeyRemaining = lastReturnedKeyRemaining;
+        }
+
+        @Override
+        public Kind kind()
+        {
+            return Kind.CQL_PAGING_LIMIT;
+        }
+
+        @Override
+        public DataLimits forPaging(int pageSize)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public Counter newCounter(int nowInSec, boolean assumeLiveData, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            return new PagingAwareCounter(nowInSec, assumeLiveData, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
+        }
+
+        private class PagingAwareCounter extends CQLCounter
+        {
+            private PagingAwareCounter(int nowInSec,
+                                       boolean assumeLiveData,
+                                       boolean countPartitionsWithOnlyStaticData,
+                                       boolean enforceStrictLiveness)
+            {
+                super(nowInSec, assumeLiveData, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
+            }
+
+            @Override
+            public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
+            {
+                if (partitionKey.getKey().equals(lastReturnedKey))
+                {
+                    rowInCurrentPartition = perPartitionLimit - lastReturnedKeyRemaining;
+                    // lastReturnedKey is the last key for which we're returned rows in the first page.
+                    // So, since we know we have returned rows, we know we have accounted for the static row
+                    // if any already, so force hasLiveStaticRow to false so we make sure to not count it
+                    // once more.
+                    hasLiveStaticRow = false;
+                }
+                else
+                {
+                    super.applyToPartition(partitionKey, staticRow);
+                }
+            }
+        }
+    }
+
+    /**
+     * Limits used by thrift; this count partition and cells.
+     */
+    private static class ThriftLimits extends DataLimits
+    {
+        protected final int partitionLimit;
+        protected final int cellPerPartitionLimit;
+
+        private ThriftLimits(int partitionLimit, int cellPerPartitionLimit)
+        {
+            this.partitionLimit = partitionLimit;
+            this.cellPerPartitionLimit = cellPerPartitionLimit;
+        }
+
+        public Kind kind()
+        {
+            return Kind.THRIFT_LIMIT;
+        }
+
+        public boolean isUnlimited()
+        {
+            return partitionLimit == NO_LIMIT && cellPerPartitionLimit == NO_LIMIT;
+        }
+
+        public boolean isDistinct()
+        {
+            return false;
+        }
+
+        public DataLimits forPaging(int pageSize)
+        {
+            // We don't support paging on thrift in general but do use paging under the hood for get_count. For
+            // that case, we only care about limiting cellPerPartitionLimit (since it's paging over a single
+            // partition). We do check that the partition limit is 1 however to make sure this is not misused
+            // (as this wouldn't work properly for range queries).
+            assert partitionLimit == 1;
+            return new ThriftLimits(partitionLimit, pageSize);
+        }
+
+        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public DataLimits forShortReadRetry(int toFetch)
+        {
+            // Short read retries are always done for a single partition at a time, so it's ok to ignore the
+            // partition limit for those
+            return new ThriftLimits(1, toFetch);
+        }
+
+        public boolean hasEnoughLiveData(CachedPartition cached, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            // We want the number of cells that are currently live. Getting that precise number forces
+            // us to iterate the cached partition in general, but we can avoid that if:
+            //   - The number of non-expiring live cells is greater than the number of cells asked (we then
+            //     know we have enough live cells).
+            //   - The number of cells cached is less than requested, in which case we know we won't have enough.
+            if (cached.nonExpiringLiveCells() >= cellPerPartitionLimit)
+                return true;
+
+            if (cached.nonTombstoneCellCount() < cellPerPartitionLimit)
+                return false;
+
+            // Otherwise, we need to re-count
+            DataLimits.Counter counter = newCounter(nowInSec, false, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
+            try (UnfilteredRowIterator cacheIter = cached.unfilteredIterator(ColumnFilter.selection(cached.columns()), Slices.ALL, false);
+                 UnfilteredRowIterator iter = counter.applyTo(cacheIter))
+            {
+                // Consume the iterator until we've counted enough
+                while (iter.hasNext())
+                    iter.next();
+                return counter.isDone();
+            }
+        }
+
+        public Counter newCounter(int nowInSec, boolean assumeLiveData, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            return new ThriftCounter(nowInSec, assumeLiveData);
+        }
+
+        public int count()
+        {
+            return partitionLimit * cellPerPartitionLimit;
+        }
+
+        public int perPartitionCount()
+        {
+            return cellPerPartitionLimit;
+        }
+
+        public float estimateTotalResults(ColumnFamilyStore cfs)
+        {
+            // remember that getMeansColumns returns a number of cells: we should clean nomenclature
+            float cellsPerPartition = ((float) cfs.getMeanColumns()) / cfs.metadata.partitionColumns().regulars.size();
+            return cellsPerPartition * cfs.estimateKeys();
+        }
+
+        protected class ThriftCounter extends Counter
+        {
+            protected final int nowInSec;
+            protected final boolean assumeLiveData;
+
+            protected int partitionsCounted;
+            protected int cellsCounted;
+            protected int cellsInCurrentPartition;
+
+            public ThriftCounter(int nowInSec, boolean assumeLiveData)
+            {
+                this.nowInSec = nowInSec;
+                this.assumeLiveData = assumeLiveData;
+            }
+
+            @Override
+            public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
+            {
+                cellsInCurrentPartition = 0;
+                if (!staticRow.isEmpty())
+                    applyToRow(staticRow);
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                for (Cell cell : row.cells())
+                {
+                    if (assumeLiveData || cell.isLive(nowInSec))
+                    {
+                        ++cellsCounted;
+                        if (++cellsInCurrentPartition >= cellPerPartitionLimit)
+                            stopInPartition();
+                    }
+                }
+                return row;
+            }
+
+            @Override
+            public void onPartitionClose()
+            {
+                if (++partitionsCounted >= partitionLimit)
+                    stop();
+                super.onPartitionClose();
+            }
+
+            public int counted()
+            {
+                return cellsCounted;
+            }
+
+            public int countedInCurrentPartition()
+            {
+                return cellsInCurrentPartition;
+            }
+
+            public boolean isDone()
+            {
+                return partitionsCounted >= partitionLimit;
+            }
+
+            public boolean isDoneForPartition()
+            {
+                return isDone() || cellsInCurrentPartition >= cellPerPartitionLimit;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            // This is not valid CQL, but that's ok since it's not used for CQL queries.
+            return String.format("THRIFT LIMIT (partitions=%d, cells_per_partition=%d)", partitionLimit, cellPerPartitionLimit);
+        }
+    }
+
+    /**
+     * Limits used for thrift get_count when we only want to count super columns.
+     */
+    private static class SuperColumnCountingLimits extends ThriftLimits
+    {
+        private SuperColumnCountingLimits(int partitionLimit, int cellPerPartitionLimit)
+        {
+            super(partitionLimit, cellPerPartitionLimit);
+        }
+
+        public Kind kind()
+        {
+            return Kind.SUPER_COLUMN_COUNTING_LIMIT;
+        }
+
+        public DataLimits forPaging(int pageSize)
+        {
+            // We don't support paging on thrift in general but do use paging under the hood for get_count. For
+            // that case, we only care about limiting cellPerPartitionLimit (since it's paging over a single
+            // partition). We do check that the partition limit is 1 however to make sure this is not misused
+            // (as this wouldn't work properly for range queries).
+            assert partitionLimit == 1;
+            return new SuperColumnCountingLimits(partitionLimit, pageSize);
+        }
+
+        public DataLimits forShortReadRetry(int toFetch)
+        {
+            // Short read retries are always done for a single partition at a time, so it's ok to ignore the
+            // partition limit for those
+            return new SuperColumnCountingLimits(1, toFetch);
+        }
+
+        @Override
+        public Counter newCounter(int nowInSec, boolean assumeLiveData, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
+        {
+            return new SuperColumnCountingCounter(nowInSec, assumeLiveData, enforceStrictLiveness);
+        }
+
+        protected class SuperColumnCountingCounter extends ThriftCounter
+        {
+            private final boolean enforceStrictLiveness;
+
+            public SuperColumnCountingCounter(int nowInSec, boolean assumeLiveData, boolean enforceStrictLiveness)
+            {
+                super(nowInSec, assumeLiveData);
+                this.enforceStrictLiveness = enforceStrictLiveness;
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                // In the internal format, a row == a super column, so that's what we want to count.
+                if (assumeLiveData || row.hasLiveData(nowInSec, enforceStrictLiveness))
+                {
+                    ++cellsCounted;
+                    if (++cellsInCurrentPartition >= cellPerPartitionLimit)
+                        stopInPartition();
+                }
+                return row;
+            }
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(DataLimits limits, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeByte(limits.kind().ordinal());
+            switch (limits.kind())
+            {
+                case CQL_LIMIT:
+                case CQL_PAGING_LIMIT:
+                    CQLLimits cqlLimits = (CQLLimits)limits;
+                    out.writeUnsignedVInt(cqlLimits.rowLimit);
+                    out.writeUnsignedVInt(cqlLimits.perPartitionLimit);
+                    out.writeBoolean(cqlLimits.isDistinct);
+                    if (limits.kind() == Kind.CQL_PAGING_LIMIT)
+                    {
+                        CQLPagingLimits pagingLimits = (CQLPagingLimits)cqlLimits;
+                        ByteBufferUtil.writeWithVIntLength(pagingLimits.lastReturnedKey, out);
+                        out.writeUnsignedVInt(pagingLimits.lastReturnedKeyRemaining);
+                    }
+                    break;
+                case THRIFT_LIMIT:
+                case SUPER_COLUMN_COUNTING_LIMIT:
+                    ThriftLimits thriftLimits = (ThriftLimits)limits;
+                    out.writeUnsignedVInt(thriftLimits.partitionLimit);
+                    out.writeUnsignedVInt(thriftLimits.cellPerPartitionLimit);
+                    break;
+            }
+        }
+
+        public DataLimits deserialize(DataInputPlus in, int version) throws IOException
+        {
+            Kind kind = Kind.values()[in.readUnsignedByte()];
+            switch (kind)
+            {
+                case CQL_LIMIT:
+                case CQL_PAGING_LIMIT:
+                    int rowLimit = (int)in.readUnsignedVInt();
+                    int perPartitionLimit = (int)in.readUnsignedVInt();
+                    boolean isDistinct = in.readBoolean();
+                    if (kind == Kind.CQL_LIMIT)
+                        return new CQLLimits(rowLimit, perPartitionLimit, isDistinct);
+
+                    ByteBuffer lastKey = ByteBufferUtil.readWithVIntLength(in);
+                    int lastRemaining = (int)in.readUnsignedVInt();
+                    return new CQLPagingLimits(rowLimit, perPartitionLimit, isDistinct, lastKey, lastRemaining);
+                case THRIFT_LIMIT:
+                case SUPER_COLUMN_COUNTING_LIMIT:
+                    int partitionLimit = (int)in.readUnsignedVInt();
+                    int cellPerPartitionLimit = (int)in.readUnsignedVInt();
+                    return kind == Kind.THRIFT_LIMIT
+                         ? new ThriftLimits(partitionLimit, cellPerPartitionLimit)
+                         : new SuperColumnCountingLimits(partitionLimit, cellPerPartitionLimit);
+            }
+            throw new AssertionError();
+        }
+
+        public long serializedSize(DataLimits limits, int version)
+        {
+            long size = TypeSizes.sizeof((byte)limits.kind().ordinal());
+            switch (limits.kind())
+            {
+                case CQL_LIMIT:
+                case CQL_PAGING_LIMIT:
+                    CQLLimits cqlLimits = (CQLLimits)limits;
+                    size += TypeSizes.sizeofUnsignedVInt(cqlLimits.rowLimit);
+                    size += TypeSizes.sizeofUnsignedVInt(cqlLimits.perPartitionLimit);
+                    size += TypeSizes.sizeof(cqlLimits.isDistinct);
+                    if (limits.kind() == Kind.CQL_PAGING_LIMIT)
+                    {
+                        CQLPagingLimits pagingLimits = (CQLPagingLimits)cqlLimits;
+                        size += ByteBufferUtil.serializedSizeWithVIntLength(pagingLimits.lastReturnedKey);
+                        size += TypeSizes.sizeofUnsignedVInt(pagingLimits.lastReturnedKeyRemaining);
+                    }
+                    break;
+                case THRIFT_LIMIT:
+                case SUPER_COLUMN_COUNTING_LIMIT:
+                    ThriftLimits thriftLimits = (ThriftLimits)limits;
+                    size += TypeSizes.sizeofUnsignedVInt(thriftLimits.partitionLimit);
+                    size += TypeSizes.sizeofUnsignedVInt(thriftLimits.cellPerPartitionLimit);
+                    break;
+                default:
+                    throw new AssertionError();
+            }
+            return size;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java b/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
deleted file mode 100644
index bd4718a..0000000
--- a/src/java/org/apache/cassandra/db/filter/ExtendedFilter.java
+++ /dev/null

@@ -1,491 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import com.google.common.base.Objects;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DataRange;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Extends a column filter (IFilter) to include a number of IndexExpression.
- */
-public abstract class ExtendedFilter
-{
-    private static final Logger logger = LoggerFactory.getLogger(ExtendedFilter.class);
-
-    public final ColumnFamilyStore cfs;
-    public final long timestamp;
-    public final DataRange dataRange;
-    private final int maxResults;
-    private final boolean countCQL3Rows;
-    private volatile int currentLimit;
-
-    public static ExtendedFilter create(ColumnFamilyStore cfs,
-                                        DataRange dataRange,
-                                        List<IndexExpression> clause,
-                                        int maxResults,
-                                        boolean countCQL3Rows,
-                                        long timestamp)
-    {
-        if (clause == null || clause.isEmpty())
-            return new EmptyClauseFilter(cfs, dataRange, maxResults, countCQL3Rows, timestamp);
-
-        return new WithClauses(cfs, dataRange, clause, maxResults, countCQL3Rows, timestamp);
-    }
-
-    protected ExtendedFilter(ColumnFamilyStore cfs, DataRange dataRange, int maxResults, boolean countCQL3Rows, long timestamp)
-    {
-        assert cfs != null;
-        assert dataRange != null;
-        this.cfs = cfs;
-        this.dataRange = dataRange;
-        this.maxResults = maxResults;
-        this.timestamp = timestamp;
-        this.countCQL3Rows = countCQL3Rows;
-        this.currentLimit = maxResults;
-        if (countCQL3Rows)
-            dataRange.updateColumnsLimit(maxResults);
-    }
-
-    public int maxRows()
-    {
-        return countCQL3Rows ? Integer.MAX_VALUE : maxResults;
-    }
-
-    public int maxColumns()
-    {
-        return countCQL3Rows ? maxResults : Integer.MAX_VALUE;
-    }
-
-    public int currentLimit()
-    {
-        return currentLimit;
-    }
-
-    public IDiskAtomFilter columnFilter(ByteBuffer key)
-    {
-        return dataRange.columnFilter(key);
-    }
-
-    public int lastCounted(ColumnFamily data)
-    {
-        return dataRange.getLiveCount(data, timestamp);
-    }
-
-    public void updateFilter(int currentColumnsCount)
-    {
-        if (!countCQL3Rows)
-            return;
-
-        currentLimit = maxResults - currentColumnsCount;
-        // We propagate that limit to the underlying filter so each internal query don't
-        // fetch more than we needs it to.
-        dataRange.updateColumnsLimit(currentLimit);
-    }
-
-    public abstract List<IndexExpression> getClause();
-
-    /**
-     * Returns a filter to query the columns from the clause that the initial slice filter may not have caught.
-     * @param data the data retrieve by the initial filter
-     * @return a filter or null if there can't be any columns we missed with our initial filter (typically if it was a names query, or a slice of the entire row)
-     */
-    public abstract IDiskAtomFilter getExtraFilter(DecoratedKey key, ColumnFamily data);
-
-    /**
-     * @return data pruned down to the columns originally asked for
-     */
-    public abstract ColumnFamily prune(DecoratedKey key, ColumnFamily data);
-
-    /** Returns true if tombstoned partitions should not be included in results or count towards the limit, false otherwise. */
-    public boolean ignoreTombstonedPartitions()
-    {
-        return dataRange.ignoredTombstonedPartitions();
-    }
-
-    /**
-     * @return true if the provided data satisfies all the expressions from
-     * the clause of this filter.
-     */
-    public abstract boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, Composite prefix, ByteBuffer collectionElement);
-
-    public static boolean satisfies(int comparison, Operator op)
-    {
-        switch (op)
-        {
-            case EQ:
-                return comparison == 0;
-            case GTE:
-                return comparison >= 0;
-            case GT:
-                return comparison > 0;
-            case LTE:
-                return comparison <= 0;
-            case LT:
-                return comparison < 0;
-            default:
-                throw new IllegalStateException();
-        }
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this)
-                      .add("dataRange", dataRange)
-                      .add("maxResults", maxResults)
-                      .add("currentLimit", currentLimit)
-                      .add("timestamp", timestamp)
-                      .add("countCQL3Rows", countCQL3Rows)
-                      .toString();
-    }
-
-    public static class WithClauses extends ExtendedFilter
-    {
-        private final List<IndexExpression> clause;
-        private final IDiskAtomFilter optimizedFilter;
-
-        public WithClauses(ColumnFamilyStore cfs,
-                           DataRange range,
-                           List<IndexExpression> clause,
-                           int maxResults,
-                           boolean countCQL3Rows,
-                           long timestamp)
-        {
-            super(cfs, range, maxResults, countCQL3Rows, timestamp);
-            assert clause != null;
-            this.clause = clause;
-            this.optimizedFilter = computeOptimizedFilter();
-        }
-
-        /*
-         * Potentially optimize the column filter if we have a change to make it catch all clauses
-         * right away.
-         */
-        private IDiskAtomFilter computeOptimizedFilter()
-        {
-            /*
-             * We shouldn't do the "optimization" for composites as the index names are not valid column names 
-             * (which the rest of the method assumes). Said optimization is not useful for composites anyway.
-             * We also don't want to do for paging ranges as the actual filter depends on the row key (it would
-             * probably be possible to make it work but we won't really use it so we don't bother).
-             */
-            if (cfs.getComparator().isCompound() || dataRange instanceof DataRange.Paging)
-                return null;
-
-            IDiskAtomFilter filter = dataRange.columnFilter(null); // ok since not a paging range
-            if (filter instanceof SliceQueryFilter)
-            {
-                // if we have a high chance of getting all the columns in a single index slice (and it's not too costly), do that.
-                // otherwise, the extraFilter (lazily created) will fetch by name the columns referenced by the additional expressions.
-                if (cfs.metric.maxRowSize.getValue() < DatabaseDescriptor.getColumnIndexSize())
-                {
-                    logger.trace("Expanding slice filter to entire row to cover additional expressions");
-                    return new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, ((SliceQueryFilter)filter).reversed, Integer.MAX_VALUE);
-                }
-            }
-            else
-            {
-                logger.trace("adding columns to original Filter to cover additional expressions");
-                assert filter instanceof NamesQueryFilter;
-                if (!clause.isEmpty())
-                {
-                    SortedSet<CellName> columns = new TreeSet<CellName>(cfs.getComparator());
-                    for (IndexExpression expr : clause)
-                        columns.add(cfs.getComparator().cellFromByteBuffer(expr.column));
-                    columns.addAll(((NamesQueryFilter) filter).columns);
-                    return ((NamesQueryFilter) filter).withUpdatedColumns(columns);
-                }
-            }
-            return null;
-        }
-
-        @Override
-        public IDiskAtomFilter columnFilter(ByteBuffer key)
-        {
-            return optimizedFilter == null ? dataRange.columnFilter(key) : optimizedFilter;
-        }
-
-        public List<IndexExpression> getClause()
-        {
-            return clause;
-        }
-
-        /*
-         * We may need an extra query only if the original query wasn't selecting the row entirely.
-         * Furthermore, we only need the extra query if we haven't yet got all the expressions from the clause.
-         */
-        private boolean needsExtraQuery(ByteBuffer rowKey, ColumnFamily data)
-        {
-            IDiskAtomFilter filter = columnFilter(rowKey);
-            if (filter instanceof SliceQueryFilter && DataRange.isFullRowSlice((SliceQueryFilter)filter))
-                return false;
-
-            for (IndexExpression expr : clause)
-            {
-                if (data.getColumn(data.getComparator().cellFromByteBuffer(expr.column)) == null)
-                {
-                    logger.trace("adding extraFilter to cover additional expressions");
-                    return true;
-                }
-            }
-            return false;
-        }
-
-        public IDiskAtomFilter getExtraFilter(DecoratedKey rowKey, ColumnFamily data)
-        {
-            /*
-             * This method assumes the IndexExpression names are valid column names, which is not the
-             * case with composites. This is ok for now however since:
-             * 1) CompositeSearcher doesn't use it.
-             * 2) We don't yet allow non-indexed range slice with filters in CQL3 (i.e. this will never be
-             * called by CFS.filter() for composites).
-             */
-            assert !(cfs.getComparator().isCompound()) : "Sequential scan with filters is not supported (if you just created an index, you "
-                                                         + "need to wait for the creation to be propagated to all nodes before querying it)";
-
-            if (!needsExtraQuery(rowKey.getKey(), data))
-                return null;
-
-            // Note: for counters we must be careful to not add a column that was already there (to avoid overcount). That is
-            // why we do the dance of avoiding to query any column we already have (it's also more efficient anyway)
-            SortedSet<CellName> columns = new TreeSet<CellName>(cfs.getComparator());
-            for (IndexExpression expr : clause)
-            {
-                CellName name = data.getComparator().cellFromByteBuffer(expr.column);
-                if (data.getColumn(name) == null)
-                    columns.add(name);
-            }
-            assert !columns.isEmpty();
-            return new NamesQueryFilter(columns);
-        }
-
-        public ColumnFamily prune(DecoratedKey rowKey, ColumnFamily data)
-        {
-            if (optimizedFilter == null)
-                return data;
-
-            ColumnFamily pruned = data.cloneMeShallow();
-            IDiskAtomFilter filter = dataRange.columnFilter(rowKey.getKey());
-            Iterator<Cell> iter = filter.getColumnIterator(data);
-            filter.collectReducedColumns(pruned, QueryFilter.gatherTombstones(pruned, iter), rowKey, cfs.gcBefore(timestamp), timestamp);
-            return pruned;
-        }
-
-        public boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, Composite prefix, ByteBuffer collectionElement)
-        {
-            for (IndexExpression expression : clause)
-            {
-                ColumnDefinition def = data.metadata().getColumnDefinition(expression.column);
-                ByteBuffer dataValue = null;
-                AbstractType<?> validator = null;
-                if (def == null)
-                {
-                    // This can't happen with CQL3 as this should be rejected upfront. For thrift however,
-                    // cell name are not predefined. But that means the cell name correspond to an internal one.
-                    Cell cell = data.getColumn(data.getComparator().cellFromByteBuffer(expression.column));
-                    if (cell != null)
-                    {
-                        dataValue = cell.value();
-                        validator = data.metadata().getDefaultValidator();
-                    }
-                }
-                else
-                {
-                    if (def.type.isCollection() && def.type.isMultiCell())
-                    {
-                        if (!collectionSatisfies(def, data, prefix, expression))
-                            return false;
-                        continue;
-                    }
-
-                    dataValue = extractDataValue(def, rowKey.getKey(), data, prefix);
-                    validator = def.type;
-                }
-
-                if (dataValue == null)
-                    return false;
-
-                if (expression.operator == Operator.CONTAINS)
-                {
-                    assert def != null && def.type.isCollection() && !def.type.isMultiCell();
-                    CollectionType type = (CollectionType)def.type;
-                    switch (type.kind)
-                    {
-                        case LIST:
-                            ListType<?> listType = (ListType)def.type;
-                            if (!listType.getSerializer().deserialize(dataValue).contains(listType.getElementsType().getSerializer().deserialize(expression.value)))
-                                return false;
-                            break;
-                        case SET:
-                            SetType<?> setType = (SetType)def.type;
-                            if (!setType.getSerializer().deserialize(dataValue).contains(setType.getElementsType().getSerializer().deserialize(expression.value)))
-                                return false;
-                            break;
-                        case MAP:
-                            MapType<?,?> mapType = (MapType)def.type;
-                            if (!mapType.getSerializer().deserialize(dataValue).containsValue(mapType.getValuesType().getSerializer().deserialize(expression.value)))
-                                return false;
-                            break;
-                    }
-                }
-                else if (expression.operator == Operator.CONTAINS_KEY)
-                {
-                    assert def != null && def.type.isCollection() && !def.type.isMultiCell() && def.type instanceof MapType;
-                    MapType<?,?> mapType = (MapType)def.type;
-                    if (mapType.getSerializer().getSerializedValue(dataValue, expression.value, mapType.getKeysType()) == null)
-                        return false;
-                }
-                else
-                {
-                    int v = validator.compare(dataValue, expression.value);
-                    if (!satisfies(v, expression.operator))
-                        return false;
-                }
-            }
-            return true;
-        }
-
-        private static boolean collectionSatisfies(ColumnDefinition def, ColumnFamily data, Composite prefix, IndexExpression expr)
-        {
-            assert def.type.isCollection() && def.type.isMultiCell();
-            CollectionType type = (CollectionType)def.type;
-
-            if (expr.isContains())
-            {
-                // get a slice of the collection cells
-                Iterator<Cell> iter = data.iterator(new ColumnSlice[]{ data.getComparator().create(prefix, def).slice() });
-                while (iter.hasNext())
-                {
-                    Cell cell = iter.next();
-                    if (type.kind == CollectionType.Kind.SET)
-                    {
-                        if (type.nameComparator().compare(cell.name().collectionElement(), expr.value) == 0)
-                            return true;
-                    }
-                    else
-                    {
-                        if (type.valueComparator().compare(cell.value(), expr.value) == 0)
-                            return true;
-                    }
-                }
-
-                return false;
-            }
-
-            assert type.kind == CollectionType.Kind.MAP;
-            if (expr.isContainsKey())
-                return data.getColumn(data.getComparator().create(prefix, def, expr.value)) != null;
-
-            Iterator<Cell> iter = data.iterator(new ColumnSlice[]{ data.getComparator().create(prefix, def).slice() });
-            ByteBuffer key = CompositeType.extractComponent(expr.value, 0);
-            ByteBuffer value = CompositeType.extractComponent(expr.value, 1);
-            while (iter.hasNext())
-            {
-                Cell next = iter.next();
-                if (type.nameComparator().compare(next.name().collectionElement(), key) == 0 &&
-                    type.valueComparator().compare(next.value(), value) == 0)
-                    return true;
-            }
-            return false;
-        }
-
-        private ByteBuffer extractDataValue(ColumnDefinition def, ByteBuffer rowKey, ColumnFamily data, Composite prefix)
-        {
-            switch (def.kind)
-            {
-                case PARTITION_KEY:
-                    return def.isOnAllComponents()
-                         ? rowKey
-                         : ((CompositeType)data.metadata().getKeyValidator()).split(rowKey)[def.position()];
-                case CLUSTERING_COLUMN:
-                    return prefix.get(def.position());
-                case REGULAR:
-                    CellName cname = prefix == null
-                                   ? data.getComparator().cellFromByteBuffer(def.name.bytes)
-                                   : data.getComparator().create(prefix, def);
-
-                    Cell cell = data.getColumn(cname);
-                    return cell == null ? null : cell.value();
-                case COMPACT_VALUE:
-                    assert data.getColumnCount() == 1;
-                    return data.getSortedColumns().iterator().next().value();
-            }
-            throw new AssertionError();
-        }
-
-        @Override
-        public String toString()
-        {
-            return Objects.toStringHelper(this)
-                          .add("dataRange", dataRange)
-                          .add("timestamp", timestamp)
-                          .add("clause", clause)
-                          .toString();
-        }
-    }
-
-    private static class EmptyClauseFilter extends ExtendedFilter
-    {
-        public EmptyClauseFilter(ColumnFamilyStore cfs, DataRange range, int maxResults, boolean countCQL3Rows, long timestamp)
-        {
-            super(cfs, range, maxResults, countCQL3Rows, timestamp);
-        }
-
-        public List<IndexExpression> getClause()
-        {
-            return Collections.<IndexExpression>emptyList();
-        }
-
-        public IDiskAtomFilter getExtraFilter(DecoratedKey key, ColumnFamily data)
-        {
-            return null;
-        }
-
-        public ColumnFamily prune(DecoratedKey rowKey, ColumnFamily data)
-        {
-            return data;
-        }
-
-        public boolean isSatisfiedBy(DecoratedKey rowKey, ColumnFamily data, Composite prefix, ByteBuffer collectionElement)
-        {
-            return true;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java b/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java
deleted file mode 100644
index a541d5e..0000000
--- a/src/java/org/apache/cassandra/db/filter/IDiskAtomFilter.java
+++ /dev/null

@@ -1,146 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.Comparator;
-import java.util.Iterator;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileDataInput;
-
-/**
- * Given an implementation-specific description of what columns to look for, provides methods
- * to extract the desired columns from a Memtable, SSTable, or SuperColumn.  Either the get*ColumnIterator
- * methods will be called, or filterSuperColumn, but not both on the same object.  QueryFilter
- * takes care of putting the two together if subcolumn filtering needs to be done, based on the
- * querypath that it knows (but that IFilter implementations are oblivious to).
- */
-public interface IDiskAtomFilter
-{
-    /**
-     * returns an iterator that returns columns from the given columnFamily
-     * matching the Filter criteria in sorted order.
-     */
-    public Iterator<Cell> getColumnIterator(ColumnFamily cf);
-
-    public OnDiskAtomIterator getColumnIterator(DecoratedKey key, ColumnFamily cf);
-
-    /**
-     * Get an iterator that returns columns from the given SSTable using the opened file
-     * matching the Filter criteria in sorted order.
-     * @param sstable
-     * @param file Already opened file data input, saves us opening another one
-     * @param key The key of the row we are about to iterate over
-     */
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry);
-
-    /**
-     * returns an iterator that returns columns from the given SSTable
-     * matching the Filter criteria in sorted order.
-     */
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, DecoratedKey key);
-
-    /**
-     * collects columns from reducedColumns into returnCF.  Termination is determined
-     * by the filter code, which should have some limit on the number of columns
-     * to avoid running out of memory on large rows.
-     */
-    public void collectReducedColumns(ColumnFamily container, Iterator<Cell> reducedColumns, DecoratedKey key, int gcBefore, long now);
-
-    public Comparator<Cell> getColumnComparator(CellNameType comparator);
-
-    public boolean isReversed();
-    public void updateColumnsLimit(int newLimit);
-
-    public int getLiveCount(ColumnFamily cf, long now);
-    public ColumnCounter columnCounter(CellNameType comparator, long now);
-
-    public IDiskAtomFilter cloneShallow();
-    public boolean maySelectPrefix(CType type, Composite prefix);
-
-    public boolean shouldInclude(SSTableReader sstable);
-
-    public boolean countCQL3Rows(CellNameType comparator);
-
-    public boolean isHeadFilter();
-
-    /**
-     * Whether the provided cf, that is assumed to contain the head of the
-     * partition, contains enough data to cover this filter.
-     */
-    public boolean isFullyCoveredBy(ColumnFamily cf, long now);
-
-    public static class Serializer implements IVersionedSerializer<IDiskAtomFilter>
-    {
-        private final CellNameType type;
-
-        public Serializer(CellNameType type)
-        {
-            this.type = type;
-        }
-
-        public void serialize(IDiskAtomFilter filter, DataOutputPlus out, int version) throws IOException
-        {
-            if (filter instanceof SliceQueryFilter)
-            {
-                out.writeByte(0);
-                type.sliceQueryFilterSerializer().serialize((SliceQueryFilter)filter, out, version);
-            }
-            else
-            {
-                out.writeByte(1);
-                type.namesQueryFilterSerializer().serialize((NamesQueryFilter)filter, out, version);
-            }
-        }
-
-        public IDiskAtomFilter deserialize(DataInput in, int version) throws IOException
-        {
-            int b = in.readByte();
-            if (b == 0)
-            {
-                return type.sliceQueryFilterSerializer().deserialize(in, version);
-            }
-            else
-            {
-                assert b == 1;
-                return type.namesQueryFilterSerializer().deserialize(in, version);
-            }
-        }
-
-        public long serializedSize(IDiskAtomFilter filter, int version)
-        {
-            int size = 1;
-            if (filter instanceof SliceQueryFilter)
-                size += type.sliceQueryFilterSerializer().serializedSize((SliceQueryFilter)filter, version);
-            else
-                size += type.namesQueryFilterSerializer().serializedSize((NamesQueryFilter)filter, version);
-            return size;
-        }
-    }
-
-    public Iterator<RangeTombstone> getRangeTombstoneIterator(ColumnFamily source);
-}

diff --git a/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java b/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
deleted file mode 100644
index 74ca1fd..0000000
--- a/src/java/org/apache/cassandra/db/filter/NamesQueryFilter.java
+++ /dev/null

@@ -1,301 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.commons.lang3.StringUtils;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.utils.SearchIterator;
-
-public class NamesQueryFilter implements IDiskAtomFilter
-{
-    public final SortedSet<CellName> columns;
-
-    // If true, getLiveCount will always return either 0 or 1. This uses the fact that we know 
-    // CQL3 will never use a name filter with cell names spanning multiple CQL3 rows.
-    private final boolean countCQL3Rows;
-
-    public NamesQueryFilter(SortedSet<CellName> columns)
-    {
-        this(columns, false);
-    }
-
-    public NamesQueryFilter(SortedSet<CellName> columns, boolean countCQL3Rows)
-    {
-        this.columns = columns;
-        this.countCQL3Rows = countCQL3Rows;
-    }
-
-    public NamesQueryFilter cloneShallow()
-    {
-        // NQF is immutable as far as shallow cloning is concerned, so save the allocation.
-        return this;
-    }
-
-    public NamesQueryFilter withUpdatedColumns(SortedSet<CellName> newColumns)
-    {
-       return new NamesQueryFilter(newColumns, countCQL3Rows);
-    }
-
-    @SuppressWarnings("unchecked")
-    public Iterator<Cell> getColumnIterator(ColumnFamily cf)
-    {
-        assert cf != null;
-        return (Iterator<Cell>) (Iterator<?>) new ByNameColumnIterator(columns.iterator(), null, cf);
-    }
-
-    public OnDiskAtomIterator getColumnIterator(DecoratedKey key, ColumnFamily cf)
-    {
-        assert cf != null;
-        return new ByNameColumnIterator(columns.iterator(), key, cf);
-    }
-
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, DecoratedKey key)
-    {
-        return sstable.iterator(key, columns);
-    }
-
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry)
-    {
-        return sstable.iterator(file, key, columns, indexEntry);
-    }
-
-    public void collectReducedColumns(ColumnFamily container, Iterator<Cell> reducedColumns, DecoratedKey key, int gcBefore, long now)
-    {
-        DeletionInfo.InOrderTester tester = container.inOrderDeletionTester();
-        while (reducedColumns.hasNext())
-            container.maybeAppendColumn(reducedColumns.next(), tester, gcBefore);
-    }
-
-    public Comparator<Cell> getColumnComparator(CellNameType comparator)
-    {
-        return comparator.columnComparator(false);
-    }
-
-    @Override
-    public String toString()
-    {
-        return "NamesQueryFilter(" +
-               "columns=" + StringUtils.join(columns, ",") +
-               ')';
-    }
-
-    public boolean isReversed()
-    {
-        return false;
-    }
-
-    public void updateColumnsLimit(int newLimit)
-    {
-    }
-
-    public int getLiveCount(ColumnFamily cf, long now)
-    {
-        // Note: we could use columnCounter() but we save the object allocation as it's simple enough
-
-        if (countCQL3Rows)
-            return cf.hasOnlyTombstones(now) ? 0 : 1;
-
-        int count = 0;
-        for (Cell cell : cf)
-        {
-            if (cell.isLive(now))
-                count++;
-        }
-        return count;
-    }
-
-    public boolean maySelectPrefix(CType type, Composite prefix)
-    {
-        for (CellName column : columns)
-        {
-            if (prefix.isPrefixOf(type, column))
-                return true;
-        }
-        return false;
-    }
-
-    public boolean shouldInclude(SSTableReader sstable)
-    {
-        return true;
-    }
-
-    public boolean isFullyCoveredBy(ColumnFamily cf, long now)
-    {
-        // cf will cover all the requested columns if the range it covers include
-        // all said columns
-        CellName first = cf.iterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
-        CellName last = cf.reverseIterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
-
-        return cf.getComparator().compare(first, columns.first()) <= 0
-            && cf.getComparator().compare(columns.last(), last) <= 0;
-    }
-
-    public boolean isHeadFilter()
-    {
-        return false;
-    }
-
-    public boolean countCQL3Rows(CellNameType comparator)
-    {
-        return countCQL3Rows;
-    }
-
-    public boolean countCQL3Rows()
-    {
-        return countCQL3Rows(null);
-    }
-
-    public ColumnCounter columnCounter(CellNameType comparator, long now)
-    {
-        return countCQL3Rows
-             ? new ColumnCounter.GroupByPrefix(now, null, 0, false)
-             : new ColumnCounter(now);
-    }
-
-    private static class ByNameColumnIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
-    {
-        private final ColumnFamily cf;
-        private final DecoratedKey key;
-        private final Iterator<CellName> names;
-        private final SearchIterator<CellName, Cell> cells;
-
-        public ByNameColumnIterator(Iterator<CellName> names, DecoratedKey key, ColumnFamily cf)
-        {
-            this.names = names;
-            this.cf = cf;
-            this.key = key;
-            this.cells = cf.searchIterator();
-        }
-
-        protected OnDiskAtom computeNext()
-        {
-            while (names.hasNext() && cells.hasNext())
-            {
-                CellName current = names.next();
-                Cell cell = cells.next(current);
-                if (cell != null)
-                    return cell;
-            }
-            return endOfData();
-        }
-
-        public ColumnFamily getColumnFamily()
-        {
-            return cf;
-        }
-
-        public DecoratedKey getKey()
-        {
-            return key;
-        }
-
-        public void close() throws IOException { }
-    }
-
-    public static class Serializer implements IVersionedSerializer<NamesQueryFilter>
-    {
-        private CellNameType type;
-
-        public Serializer(CellNameType type)
-        {
-            this.type = type;
-        }
-
-        public void serialize(NamesQueryFilter f, DataOutputPlus out, int version) throws IOException
-        {
-            out.writeInt(f.columns.size());
-            ISerializer<CellName> serializer = type.cellSerializer();
-            for (CellName cName : f.columns)
-            {
-                serializer.serialize(cName, out);
-            }
-            out.writeBoolean(f.countCQL3Rows);
-        }
-
-        public NamesQueryFilter deserialize(DataInput in, int version) throws IOException
-        {
-            int size = in.readInt();
-            SortedSet<CellName> columns = new TreeSet<>(type);
-            ISerializer<CellName> serializer = type.cellSerializer();
-            for (int i = 0; i < size; ++i)
-                columns.add(serializer.deserialize(in));
-            boolean countCQL3Rows = in.readBoolean();
-            return new NamesQueryFilter(columns, countCQL3Rows);
-        }
-
-        public long serializedSize(NamesQueryFilter f, int version)
-        {
-            TypeSizes sizes = TypeSizes.NATIVE;
-            int size = sizes.sizeof(f.columns.size());
-            ISerializer<CellName> serializer = type.cellSerializer();
-            for (CellName cName : f.columns)
-                size += serializer.serializedSize(cName, sizes);
-            size += sizes.sizeof(f.countCQL3Rows);
-            return size;
-        }
-    }
-
-    public Iterator<RangeTombstone> getRangeTombstoneIterator(final ColumnFamily source)
-    {
-        if (!source.deletionInfo().hasRanges())
-            return Iterators.emptyIterator();
-
-        return new AbstractIterator<RangeTombstone>()
-        {
-            private final Iterator<CellName> names = columns.iterator();
-            private RangeTombstone lastFindRange;
-
-            protected RangeTombstone computeNext()
-            {
-                while (names.hasNext())
-                {
-                    CellName next = names.next();
-                    if (lastFindRange != null && lastFindRange.includes(source.getComparator(), next))
-                        return lastFindRange;
-
-                    // We keep the last range around as since names are in sort order, it's
-                    // possible it will match the next name too.
-                    lastFindRange = source.deletionInfo().rangeCovering(next);
-                    if (lastFindRange != null)
-                        return lastFindRange;
-                }
-                return endOfData();
-            }
-        };
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/filter/QueryFilter.java b/src/java/org/apache/cassandra/db/filter/QueryFilter.java
deleted file mode 100644
index 15ee33d..0000000
--- a/src/java/org/apache/cassandra/db/filter/QueryFilter.java
+++ /dev/null

@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.SortedSet;
-
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.OnDiskAtom;
-import org.apache.cassandra.db.RangeTombstone;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.utils.MergeIterator;
-
-public class QueryFilter
-{
-    public final DecoratedKey key;
-    public final String cfName;
-    public final IDiskAtomFilter filter;
-    public final long timestamp;
-
-    public QueryFilter(DecoratedKey key, String cfName, IDiskAtomFilter filter, long timestamp)
-    {
-        this.key = key;
-        this.cfName = cfName;
-        this.filter = filter;
-        this.timestamp = timestamp;
-    }
-
-    public Iterator<Cell> getIterator(ColumnFamily cf)
-    {
-        assert cf != null;
-        return filter.getColumnIterator(cf);
-    }
-
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable)
-    {
-        return filter.getSSTableColumnIterator(sstable, key);
-    }
-
-    public void collateOnDiskAtom(ColumnFamily returnCF,
-                                  List<? extends Iterator<? extends OnDiskAtom>> toCollate,
-                                  int gcBefore)
-    {
-        collateOnDiskAtom(returnCF, toCollate, filter, this.key, gcBefore, timestamp);
-    }
-
-    public static void collateOnDiskAtom(ColumnFamily returnCF,
-                                         List<? extends Iterator<? extends OnDiskAtom>> toCollate,
-                                         IDiskAtomFilter filter,
-                                         DecoratedKey key,
-                                         int gcBefore,
-                                         long timestamp)
-    {
-        List<Iterator<Cell>> filteredIterators = new ArrayList<>(toCollate.size());
-        for (Iterator<? extends OnDiskAtom> iter : toCollate)
-            filteredIterators.add(gatherTombstones(returnCF, iter));
-        collateColumns(returnCF, filteredIterators, filter, key, gcBefore, timestamp);
-    }
-
-    // When there is only a single source of atoms, we can skip the collate step
-    public void collateOnDiskAtom(ColumnFamily returnCF, Iterator<? extends OnDiskAtom> toCollate, int gcBefore)
-    {
-        filter.collectReducedColumns(returnCF, gatherTombstones(returnCF, toCollate), this.key, gcBefore, timestamp);
-    }
-
-    public void collateColumns(ColumnFamily returnCF, List<? extends Iterator<Cell>> toCollate, int gcBefore)
-    {
-        collateColumns(returnCF, toCollate, filter, this.key, gcBefore, timestamp);
-    }
-
-    public static void collateColumns(ColumnFamily returnCF,
-                                      List<? extends Iterator<Cell>> toCollate,
-                                      IDiskAtomFilter filter,
-                                      DecoratedKey key,
-                                      int gcBefore,
-                                      long timestamp)
-    {
-        Comparator<Cell> comparator = filter.getColumnComparator(returnCF.getComparator());
-
-        Iterator<Cell> reduced = toCollate.size() == 1
-                               ? toCollate.get(0)
-                               : MergeIterator.get(toCollate, comparator, getReducer(comparator));
-
-        filter.collectReducedColumns(returnCF, reduced, key, gcBefore, timestamp);
-    }
-
-    private static MergeIterator.Reducer<Cell, Cell> getReducer(final Comparator<Cell> comparator)
-    {
-        // define a 'reduced' iterator that merges columns w/ the same name, which
-        // greatly simplifies computing liveColumns in the presence of tombstones.
-        return new MergeIterator.Reducer<Cell, Cell>()
-        {
-            Cell current;
-
-            public void reduce(Cell next)
-            {
-                assert current == null || comparator.compare(current, next) == 0;
-                current = current == null ? next : current.reconcile(next);
-            }
-
-            protected Cell getReduced()
-            {
-                assert current != null;
-                Cell toReturn = current;
-                current = null;
-                return toReturn;
-            }
-
-            @Override
-            public boolean trivialReduceIsTrivial()
-            {
-                return true;
-            }
-        };
-    }
-
-    /**
-     * Given an iterator of on disk atom, returns an iterator that filters the tombstone range
-     * markers adding them to {@code returnCF} and returns the normal column.
-     */
-    public static Iterator<Cell> gatherTombstones(final ColumnFamily returnCF, final Iterator<? extends OnDiskAtom> iter)
-    {
-        return new Iterator<Cell>()
-        {
-            private Cell next;
-
-            public boolean hasNext()
-            {
-                if (next != null)
-                    return true;
-
-                getNext();
-                return next != null;
-            }
-
-            public Cell next()
-            {
-                if (next == null)
-                    getNext();
-
-                assert next != null;
-                Cell toReturn = next;
-                next = null;
-                return toReturn;
-            }
-
-            private void getNext()
-            {
-                while (iter.hasNext())
-                {
-                    OnDiskAtom atom = iter.next();
-
-                    if (atom instanceof Cell)
-                    {
-                        next = (Cell)atom;
-                        break;
-                    }
-                    else
-                    {
-                        returnCF.addAtom(atom);
-                    }
-                }
-            }
-
-            public void remove()
-            {
-                throw new UnsupportedOperationException();
-            }
-        };
-    }
-
-    public String getColumnFamilyName()
-    {
-        return cfName;
-    }
-
-    /**
-     * @return a QueryFilter object to satisfy the given slice criteria:
-     * @param key the row to slice
-     * @param cfName column family to query
-     * @param start column to start slice at, inclusive; empty for "the first column"
-     * @param finish column to stop slice at, inclusive; empty for "the last column"
-     * @param reversed true to start with the largest column (as determined by configured sort order) instead of smallest
-     * @param limit maximum number of non-deleted columns to return
-     * @param timestamp time to use for determining expiring columns' state
-     */
-    public static QueryFilter getSliceFilter(DecoratedKey key,
-                                             String cfName,
-                                             Composite start,
-                                             Composite finish,
-                                             boolean reversed,
-                                             int limit,
-                                             long timestamp)
-    {
-        return new QueryFilter(key, cfName, new SliceQueryFilter(start, finish, reversed, limit), timestamp);
-    }
-
-    /**
-     * return a QueryFilter object that includes every column in the row.
-     * This is dangerous on large rows; avoid except for test code.
-     */
-    public static QueryFilter getIdentityFilter(DecoratedKey key, String cfName, long timestamp)
-    {
-        return new QueryFilter(key, cfName, new IdentityQueryFilter(), timestamp);
-    }
-
-    /**
-     * @return a QueryFilter object that will return columns matching the given names
-     * @param key the row to slice
-     * @param cfName column family to query
-     * @param columns the column names to restrict the results to, sorted in comparator order
-     */
-    public static QueryFilter getNamesFilter(DecoratedKey key, String cfName, SortedSet<CellName> columns, long timestamp)
-    {
-        return new QueryFilter(key, cfName, new NamesQueryFilter(columns), timestamp);
-    }
-
-    @Override
-    public String toString()
-    {
-        return getClass().getSimpleName() + "(key=" + key + ", cfName=" + cfName + (filter == null ? "" : ", filter=" + filter) + ")";
-    }
-
-    public boolean shouldInclude(SSTableReader sstable)
-    {
-        return filter.shouldInclude(sstable);
-    }
-
-    public void delete(DeletionInfo target, ColumnFamily source)
-    {
-        target.add(source.deletionInfo().getTopLevelDeletion());
-        // source is the CF currently in the memtable, and it can be large compared to what the filter selects,
-        // so only consider those range tombstones that the filter do select.
-        for (Iterator<RangeTombstone> iter = filter.getRangeTombstoneIterator(source); iter.hasNext(); )
-            target.add(iter.next(), source.getComparator());
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java
new file mode 100644
index 0000000..774e4d3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/filter/RowFilter.java

@@ -0,0 +1,1043 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.context.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkBindValueSet;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
+
+/**
+ * A filter on which rows a given query should include or exclude.
+ * <p>
+ * This corresponds to the restrictions on rows that are not handled by the query
+ * {@link ClusteringIndexFilter}. Some of the expressions of this filter may
+ * be handled by a 2ndary index, and the rest is simply filtered out from the
+ * result set (the later can only happen if the query was using ALLOW FILTERING).
+ */
+public abstract class RowFilter implements Iterable<RowFilter.Expression>
+{
+    public static final Serializer serializer = new Serializer();
+    public static final RowFilter NONE = new CQLFilter(Collections.emptyList());
+
+    protected final List<Expression> expressions;
+
+    protected RowFilter(List<Expression> expressions)
+    {
+        this.expressions = expressions;
+    }
+
+    public static RowFilter create()
+    {
+        return new CQLFilter(new ArrayList<>());
+    }
+
+    public static RowFilter create(int capacity)
+    {
+        return new CQLFilter(new ArrayList<>(capacity));
+    }
+
+    public static RowFilter forThrift(int capacity)
+    {
+        return new ThriftFilter(new ArrayList<>(capacity));
+    }
+
+    public void add(ColumnDefinition def, Operator op, ByteBuffer value)
+    {
+        add(new SimpleExpression(def, op, value));
+    }
+
+    public void addMapEquality(ColumnDefinition def, ByteBuffer key, Operator op, ByteBuffer value)
+    {
+        add(new MapEqualityExpression(def, key, op, value));
+    }
+
+    public void addThriftExpression(CFMetaData metadata, ByteBuffer name, Operator op, ByteBuffer value)
+    {
+        assert (this instanceof ThriftFilter);
+        add(new ThriftExpression(metadata, name, op, value));
+    }
+
+    public void addCustomIndexExpression(CFMetaData cfm, IndexMetadata targetIndex, ByteBuffer value)
+    {
+        add(new CustomExpression(cfm, targetIndex, value));
+    }
+
+    private void add(Expression expression)
+    {
+        expression.validate();
+        expressions.add(expression);
+    }
+
+    public List<Expression> getExpressions()
+    {
+        return expressions;
+    }
+
+    /**
+     * Checks if some of the expressions apply to clustering or regular columns.
+     * @return {@code true} if some of the expressions apply to clustering or regular columns, {@code false} otherwise.
+     */
+    public boolean hasExpressionOnClusteringOrRegularColumns()
+    {
+        for (Expression expression : expressions)
+        {
+            ColumnDefinition column = expression.column();
+            if (column.isClusteringColumn() || column.isRegular())
+                return true;
+        }
+        return false;
+    }
+
+    protected abstract Transformation<BaseRowIterator<?>> filter(CFMetaData metadata, int nowInSec);
+
+    /**
+     * Filters the provided iterator so that only the row satisfying the expression of this filter
+     * are included in the resulting iterator.
+     *
+     * @param iter the iterator to filter
+     * @param nowInSec the time of query in seconds.
+     * @return the filtered iterator.
+     */
+    public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator iter, int nowInSec)
+    {
+        return expressions.isEmpty() ? iter : Transformation.apply(iter, filter(iter.metadata(), nowInSec));
+    }
+
+    /**
+     * Filters the provided iterator so that only the row satisfying the expression of this filter
+     * are included in the resulting iterator.
+     *
+     * @param iter the iterator to filter
+     * @param nowInSec the time of query in seconds.
+     * @return the filtered iterator.
+     */
+    public PartitionIterator filter(PartitionIterator iter, CFMetaData metadata, int nowInSec)
+    {
+        return expressions.isEmpty() ? iter : Transformation.apply(iter, filter(metadata, nowInSec));
+    }
+
+    /**
+     * Whether the provided row in the provided partition satisfies this filter.
+     *
+     * @param metadata the table metadata.
+     * @param partitionKey the partition key for partition to test.
+     * @param row the row to test.
+     * @param nowInSec the current time in seconds (to know what is live and what isn't).
+     * @return {@code true} if {@code row} in partition {@code partitionKey} satisfies this row filter.
+     */
+    public boolean isSatisfiedBy(CFMetaData metadata, DecoratedKey partitionKey, Row row, int nowInSec)
+    {
+        // We purge all tombstones as the expressions isSatisfiedBy methods expects it
+        Row purged = row.purge(DeletionPurger.PURGE_ALL, nowInSec, metadata.enforceStrictLiveness());
+        if (purged == null)
+            return expressions.isEmpty();
+
+        for (Expression e : expressions)
+        {
+            if (!e.isSatisfiedBy(metadata, partitionKey, purged))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Returns true if all of the expressions within this filter that apply to the partition key are satisfied by
+     * the given key, false otherwise.
+     */
+    public boolean partitionKeyRestrictionsAreSatisfiedBy(DecoratedKey key, AbstractType<?> keyValidator)
+    {
+        for (Expression e : expressions)
+        {
+            if (!e.column.isPartitionKey())
+                continue;
+
+            ByteBuffer value = keyValidator instanceof CompositeType
+                             ? ((CompositeType) keyValidator).split(key.getKey())[e.column.position()]
+                             : key.getKey();
+            if (!e.operator().isSatisfiedBy(e.column.type, value, e.value))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Returns true if all of the expressions within this filter that apply to the clustering key are satisfied by
+     * the given Clustering, false otherwise.
+     */
+    public boolean clusteringKeyRestrictionsAreSatisfiedBy(Clustering clustering)
+    {
+        for (Expression e : expressions)
+        {
+            if (!e.column.isClusteringColumn())
+                continue;
+
+            if (!e.operator().isSatisfiedBy(e.column.type, clustering.get(e.column.position()), e.value))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Returns this filter but without the provided expression. This method
+     * *assumes* that the filter contains the provided expression.
+     */
+    public RowFilter without(Expression expression)
+    {
+        assert expressions.contains(expression);
+        if (expressions.size() == 1)
+            return RowFilter.NONE;
+
+        List<Expression> newExpressions = new ArrayList<>(expressions.size() - 1);
+        for (Expression e : expressions)
+            if (!e.equals(expression))
+                newExpressions.add(e);
+
+        return withNewExpressions(newExpressions);
+    }
+
+    protected abstract RowFilter withNewExpressions(List<Expression> expressions);
+
+    public boolean isEmpty()
+    {
+        return expressions.isEmpty();
+    }
+
+    public Iterator<Expression> iterator()
+    {
+        return expressions.iterator();
+    }
+
+    private static Clustering makeCompactClustering(CFMetaData metadata, ByteBuffer name)
+    {
+        assert metadata.isCompactTable();
+        if (metadata.isCompound())
+        {
+            List<ByteBuffer> values = CompositeType.splitName(name);
+            return new Clustering(values.toArray(new ByteBuffer[metadata.comparator.size()]));
+        }
+        else
+        {
+            return new Clustering(name);
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < expressions.size(); i++)
+        {
+            if (i > 0)
+                sb.append(" AND ");
+            sb.append(expressions.get(i));
+        }
+        return sb.toString();
+    }
+
+    private static class CQLFilter extends RowFilter
+    {
+        private CQLFilter(List<Expression> expressions)
+        {
+            super(expressions);
+        }
+
+        protected Transformation<BaseRowIterator<?>> filter(CFMetaData metadata, int nowInSec)
+        {
+            long numberOfStaticColumnExpressions = expressions.stream().filter(e -> e.column.isStatic()).count();
+            final boolean filterStaticColumns = numberOfStaticColumnExpressions != 0;
+            final boolean filterNonStaticColumns = (expressions.size() - numberOfStaticColumnExpressions) > 0;
+
+            return new Transformation<BaseRowIterator<?>>()
+            {
+                DecoratedKey pk;
+                protected BaseRowIterator<?> applyToPartition(BaseRowIterator<?> partition)
+                {
+                    // The filter might be on static columns, so need to check static row first.
+                    if (filterStaticColumns && applyToRow(partition.staticRow()) == null)
+                    {
+                        partition.close();
+                        return null;
+                    }
+
+                    pk = partition.partitionKey();
+                    BaseRowIterator<?> iterator = partition instanceof UnfilteredRowIterator
+                                                  ? Transformation.apply((UnfilteredRowIterator) partition, this)
+                                                  : Transformation.apply((RowIterator) partition, this);
+
+                    if (filterNonStaticColumns && !iterator.hasNext())
+                    {
+                        iterator.close();
+                        return null;
+                    }
+
+                    return iterator;
+                }
+
+                public Row applyToRow(Row row)
+                {
+                    Row purged = row.purge(DeletionPurger.PURGE_ALL, nowInSec, metadata.enforceStrictLiveness());
+                    if (purged == null)
+                        return null;
+
+                    for (Expression e : expressions)
+                        if (!e.isSatisfiedBy(metadata, pk, purged))
+                            return null;
+                    return row;
+                }
+            };
+        }
+
+        protected RowFilter withNewExpressions(List<Expression> expressions)
+        {
+            return new CQLFilter(expressions);
+        }
+    }
+
+    private static class ThriftFilter extends RowFilter
+    {
+        private ThriftFilter(List<Expression> expressions)
+        {
+            super(expressions);
+        }
+
+        protected Transformation<BaseRowIterator<?>> filter(CFMetaData metadata, int nowInSec)
+        {
+            // Thrift does not filter rows, it filters entire partition if any of the expression is not
+            // satisfied, which forces us to materialize the result (in theory we could materialize only
+            // what we need which might or might not be everything, but we keep it simple since in practice
+            // it's not worth that it has ever been).
+            return new Transformation<BaseRowIterator<?>>()
+            {
+                protected BaseRowIterator<?> applyToPartition(BaseRowIterator<?> partition)
+                {
+                    return partition instanceof UnfilteredRowIterator ? applyTo((UnfilteredRowIterator) partition)
+                                                                      : applyTo((RowIterator) partition);
+                }
+
+                private UnfilteredRowIterator applyTo(UnfilteredRowIterator partition)
+                {
+                    ImmutableBTreePartition result = ImmutableBTreePartition.create(partition);
+                    partition.close();
+                    return accepts(result) ? result.unfilteredIterator() : null;
+                }
+
+                private RowIterator applyTo(RowIterator partition)
+                {
+                    FilteredPartition result = FilteredPartition.create(partition);
+                    return accepts(result) ? result.rowIterator() : null;
+                }
+
+                private boolean accepts(ImmutableBTreePartition result)
+                {
+                    // The partition needs to have a row for every expression, and the expression needs to be valid.
+                    for (Expression expr : expressions)
+                    {
+                        assert expr instanceof ThriftExpression;
+                        Row row = result.getRow(makeCompactClustering(metadata, expr.column().name.bytes));
+                        if (row == null || !expr.isSatisfiedBy(metadata, result.partitionKey(), row))
+                            return false;
+                    }
+                    // If we get there, it means all expressions where satisfied, so return the original result
+                    return true;
+                }
+            };
+        }
+
+        protected RowFilter withNewExpressions(List<Expression> expressions)
+        {
+            return new ThriftFilter(expressions);
+        }
+    }
+
+    public static abstract class Expression
+    {
+        private static final Serializer serializer = new Serializer();
+
+        // Note: the order of this enum matter, it's used for serialization
+        protected enum Kind { SIMPLE, MAP_EQUALITY, THRIFT_DYN_EXPR, CUSTOM }
+
+        abstract Kind kind();
+        protected final ColumnDefinition column;
+        protected final Operator operator;
+        protected final ByteBuffer value;
+
+        protected Expression(ColumnDefinition column, Operator operator, ByteBuffer value)
+        {
+            this.column = column;
+            this.operator = operator;
+            this.value = value;
+        }
+
+        public boolean isCustom()
+        {
+            return kind() == Kind.CUSTOM;
+        }
+
+        public ColumnDefinition column()
+        {
+            return column;
+        }
+
+        public Operator operator()
+        {
+            return operator;
+        }
+
+        /**
+         * Checks if the operator of this <code>IndexExpression</code> is a <code>CONTAINS</code> operator.
+         *
+         * @return <code>true</code> if the operator of this <code>IndexExpression</code> is a <code>CONTAINS</code>
+         * operator, <code>false</code> otherwise.
+         */
+        public boolean isContains()
+        {
+            return Operator.CONTAINS == operator;
+        }
+
+        /**
+         * Checks if the operator of this <code>IndexExpression</code> is a <code>CONTAINS_KEY</code> operator.
+         *
+         * @return <code>true</code> if the operator of this <code>IndexExpression</code> is a <code>CONTAINS_KEY</code>
+         * operator, <code>false</code> otherwise.
+         */
+        public boolean isContainsKey()
+        {
+            return Operator.CONTAINS_KEY == operator;
+        }
+
+        /**
+         * If this expression is used to query an index, the value to use as
+         * partition key for that index query.
+         */
+        public ByteBuffer getIndexValue()
+        {
+            return value;
+        }
+
+        public void validate()
+        {
+            checkNotNull(value, "Unsupported null value for column %s", column.name);
+            checkBindValueSet(value, "Unsupported unset value for column %s", column.name);
+        }
+
+        @Deprecated
+        public void validateForIndexing()
+        {
+            checkFalse(value.remaining() > FBUtilities.MAX_UNSIGNED_SHORT,
+                       "Index expression values may not be larger than 64K");
+        }
+
+        /**
+         * Returns whether the provided row satisfied this expression or not.
+         *
+         * @param partitionKey the partition key for row to check.
+         * @param row the row to check. It should *not* contain deleted cells
+         * (i.e. it should come from a RowIterator).
+         * @return whether the row is satisfied by this expression.
+         */
+        public abstract boolean isSatisfiedBy(CFMetaData metadata, DecoratedKey partitionKey, Row row);
+
+        protected ByteBuffer getValue(CFMetaData metadata, DecoratedKey partitionKey, Row row)
+        {
+            switch (column.kind)
+            {
+                case PARTITION_KEY:
+                    return metadata.getKeyValidator() instanceof CompositeType
+                         ? CompositeType.extractComponent(partitionKey.getKey(), column.position())
+                         : partitionKey.getKey();
+                case CLUSTERING:
+                    return row.clustering().get(column.position());
+                default:
+                    Cell cell = row.getCell(column);
+                    return cell == null ? null : cell.value();
+            }
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o)
+                return true;
+
+            if (!(o instanceof Expression))
+                return false;
+
+            Expression that = (Expression)o;
+
+            return Objects.equal(this.kind(), that.kind())
+                && Objects.equal(this.column.name, that.column.name)
+                && Objects.equal(this.operator, that.operator)
+                && Objects.equal(this.value, that.value);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(column.name, operator, value);
+        }
+
+        private static class Serializer
+        {
+            public void serialize(Expression expression, DataOutputPlus out, int version) throws IOException
+            {
+                if (version >= MessagingService.VERSION_30)
+                    out.writeByte(expression.kind().ordinal());
+
+                // Custom expressions include neither a column or operator, but all
+                // other expressions do. Also, custom expressions are 3.0+ only, so
+                // the column & operator will always be the first things written for
+                // any pre-3.0 version
+                if (expression.kind() == Kind.CUSTOM)
+                {
+                    assert version >= MessagingService.VERSION_30;
+                    IndexMetadata.serializer.serialize(((CustomExpression)expression).targetIndex, out, version);
+                    ByteBufferUtil.writeWithShortLength(expression.value, out);
+                    return;
+                }
+
+                ByteBufferUtil.writeWithShortLength(expression.column.name.bytes, out);
+                expression.operator.writeTo(out);
+
+                switch (expression.kind())
+                {
+                    case SIMPLE:
+                        ByteBufferUtil.writeWithShortLength(((SimpleExpression)expression).value, out);
+                        break;
+                    case MAP_EQUALITY:
+                        MapEqualityExpression mexpr = (MapEqualityExpression)expression;
+                        if (version < MessagingService.VERSION_30)
+                        {
+                            ByteBufferUtil.writeWithShortLength(mexpr.getIndexValue(), out);
+                        }
+                        else
+                        {
+                            ByteBufferUtil.writeWithShortLength(mexpr.key, out);
+                            ByteBufferUtil.writeWithShortLength(mexpr.value, out);
+                        }
+                        break;
+                    case THRIFT_DYN_EXPR:
+                        ByteBufferUtil.writeWithShortLength(((ThriftExpression)expression).value, out);
+                        break;
+                }
+            }
+
+            public Expression deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+            {
+                Kind kind = null;
+                ByteBuffer name;
+                Operator operator;
+                ColumnDefinition column;
+
+                if (version >= MessagingService.VERSION_30)
+                {
+                    kind = Kind.values()[in.readByte()];
+                    // custom expressions (3.0+ only) do not contain a column or operator, only a value
+                    if (kind == Kind.CUSTOM)
+                    {
+                        return new CustomExpression(metadata,
+                                                    IndexMetadata.serializer.deserialize(in, version, metadata),
+                                                    ByteBufferUtil.readWithShortLength(in));
+                    }
+                }
+
+                name = ByteBufferUtil.readWithShortLength(in);
+                operator = Operator.readFrom(in);
+                column = metadata.getColumnDefinition(name);
+                if (!metadata.isCompactTable() && column == null)
+                    throw new RuntimeException("Unknown (or dropped) column " + UTF8Type.instance.getString(name) + " during deserialization");
+
+                if (version < MessagingService.VERSION_30)
+                {
+                    if (column == null)
+                        kind = Kind.THRIFT_DYN_EXPR;
+                    else if (column.type instanceof MapType && operator == Operator.EQ)
+                        kind = Kind.MAP_EQUALITY;
+                    else
+                        kind = Kind.SIMPLE;
+                }
+
+                assert kind != null;
+                switch (kind)
+                {
+                    case SIMPLE:
+                        return new SimpleExpression(column, operator, ByteBufferUtil.readWithShortLength(in));
+                    case MAP_EQUALITY:
+                        ByteBuffer key, value;
+                        if (version < MessagingService.VERSION_30)
+                        {
+                            ByteBuffer composite = ByteBufferUtil.readWithShortLength(in);
+                            key = CompositeType.extractComponent(composite, 0);
+                            value = CompositeType.extractComponent(composite, 0);
+                        }
+                        else
+                        {
+                            key = ByteBufferUtil.readWithShortLength(in);
+                            value = ByteBufferUtil.readWithShortLength(in);
+                        }
+                        return new MapEqualityExpression(column, key, operator, value);
+                    case THRIFT_DYN_EXPR:
+                        return new ThriftExpression(metadata, name, operator, ByteBufferUtil.readWithShortLength(in));
+                }
+                throw new AssertionError();
+            }
+
+
+            public long serializedSize(Expression expression, int version)
+            {
+                // version 3.0+ includes a byte for Kind
+                long size = version >= MessagingService.VERSION_30 ? 1 : 0;
+
+                // custom expressions don't include a column or operator, all other expressions do
+                if (expression.kind() != Kind.CUSTOM)
+                    size += ByteBufferUtil.serializedSizeWithShortLength(expression.column().name.bytes)
+                            + expression.operator.serializedSize();
+
+                switch (expression.kind())
+                {
+                    case SIMPLE:
+                        size += ByteBufferUtil.serializedSizeWithShortLength(((SimpleExpression)expression).value);
+                        break;
+                    case MAP_EQUALITY:
+                        MapEqualityExpression mexpr = (MapEqualityExpression)expression;
+                        if (version < MessagingService.VERSION_30)
+                            size += ByteBufferUtil.serializedSizeWithShortLength(mexpr.getIndexValue());
+                        else
+                            size += ByteBufferUtil.serializedSizeWithShortLength(mexpr.key)
+                                  + ByteBufferUtil.serializedSizeWithShortLength(mexpr.value);
+                        break;
+                    case THRIFT_DYN_EXPR:
+                        size += ByteBufferUtil.serializedSizeWithShortLength(((ThriftExpression)expression).value);
+                        break;
+                    case CUSTOM:
+                        if (version >= MessagingService.VERSION_30)
+                            size += IndexMetadata.serializer.serializedSize(((CustomExpression)expression).targetIndex, version)
+                                  + ByteBufferUtil.serializedSizeWithShortLength(expression.value);
+                        break;
+                }
+                return size;
+            }
+        }
+    }
+
+    /**
+     * An expression of the form 'column' 'op' 'value'.
+     */
+    private static class SimpleExpression extends Expression
+    {
+        public SimpleExpression(ColumnDefinition column, Operator operator, ByteBuffer value)
+        {
+            super(column, operator, value);
+        }
+
+        public boolean isSatisfiedBy(CFMetaData metadata, DecoratedKey partitionKey, Row row)
+        {
+            // We support null conditions for LWT (in ColumnCondition) but not for RowFilter.
+            // TODO: we should try to merge both code someday.
+            assert value != null;
+
+            if (row.isStatic() != column.isStatic())
+                return true;
+
+            switch (operator)
+            {
+                case EQ:
+                case LT:
+                case LTE:
+                case GTE:
+                case GT:
+                    {
+                        assert !column.isComplex() : "Only CONTAINS and CONTAINS_KEY are supported for 'complex' types";
+
+                        // In order to support operators on Counter types, their value has to be extracted from internal
+                        // representation. See CASSANDRA-11629
+                        if (column.type.isCounter())
+                        {
+                            ByteBuffer foundValue = getValue(metadata, partitionKey, row);
+                            if (foundValue == null)
+                                return false;
+
+                            ByteBuffer counterValue = LongType.instance.decompose(CounterContext.instance().total(foundValue));
+                            return operator.isSatisfiedBy(LongType.instance, counterValue, value);
+                        }
+                        else
+                        {
+                            // Note that CQL expression are always of the form 'x < 4', i.e. the tested value is on the left.
+                            ByteBuffer foundValue = getValue(metadata, partitionKey, row);
+                            return foundValue != null && operator.isSatisfiedBy(column.type, foundValue, value);
+                        }
+                    }
+                case NEQ:
+                    {
+                        assert !column.isComplex() : "Only CONTAINS and CONTAINS_KEY are supported for 'complex' types";
+                        ByteBuffer foundValue = getValue(metadata, partitionKey, row);
+                        // Note that CQL expression are always of the form 'x < 4', i.e. the tested value is on the left.
+                        return foundValue != null && operator.isSatisfiedBy(column.type, foundValue, value);
+                    }
+                case CONTAINS:
+                    assert column.type.isCollection();
+                    CollectionType<?> type = (CollectionType<?>)column.type;
+                    if (column.isComplex())
+                    {
+                        ComplexColumnData complexData = row.getComplexColumnData(column);
+                        if (complexData != null)
+                        {
+                            for (Cell cell : complexData)
+                            {
+                                if (type.kind == CollectionType.Kind.SET)
+                                {
+                                    if (type.nameComparator().compare(cell.path().get(0), value) == 0)
+                                        return true;
+                                }
+                                else
+                                {
+                                    if (type.valueComparator().compare(cell.value(), value) == 0)
+                                        return true;
+                                }
+                            }
+                        }
+                        return false;
+                    }
+                    else
+                    {
+                        ByteBuffer foundValue = getValue(metadata, partitionKey, row);
+                        if (foundValue == null)
+                            return false;
+
+                        switch (type.kind)
+                        {
+                            case LIST:
+                                ListType<?> listType = (ListType<?>)type;
+                                return listType.compose(foundValue).contains(listType.getElementsType().compose(value));
+                            case SET:
+                                SetType<?> setType = (SetType<?>)type;
+                                return setType.compose(foundValue).contains(setType.getElementsType().compose(value));
+                            case MAP:
+                                MapType<?,?> mapType = (MapType<?, ?>)type;
+                                return mapType.compose(foundValue).containsValue(mapType.getValuesType().compose(value));
+                        }
+                        throw new AssertionError();
+                    }
+                case CONTAINS_KEY:
+                    assert column.type.isCollection() && column.type instanceof MapType;
+                    MapType<?, ?> mapType = (MapType<?, ?>)column.type;
+                    if (column.isComplex())
+                    {
+                         return row.getCell(column, CellPath.create(value)) != null;
+                    }
+                    else
+                    {
+                        ByteBuffer foundValue = getValue(metadata, partitionKey, row);
+                        return foundValue != null && mapType.getSerializer().getSerializedValue(foundValue, value, mapType.getKeysType()) != null;
+                    }
+
+                case IN:
+                    // It wouldn't be terribly hard to support this (though doing so would imply supporting
+                    // IN for 2ndary index) but currently we don't.
+                    throw new AssertionError();
+            }
+            throw new AssertionError();
+        }
+
+        @Override
+        public String toString()
+        {
+            AbstractType<?> type = column.type;
+            switch (operator)
+            {
+                case CONTAINS:
+                    assert type instanceof CollectionType;
+                    CollectionType<?> ct = (CollectionType<?>)type;
+                    type = ct.kind == CollectionType.Kind.SET ? ct.nameComparator() : ct.valueComparator();
+                    break;
+                case CONTAINS_KEY:
+                    assert type instanceof MapType;
+                    type = ((MapType<?, ?>)type).nameComparator();
+                    break;
+                case IN:
+                    type = ListType.getInstance(type, false);
+                    break;
+                default:
+                    break;
+            }
+            return String.format("%s %s %s", column.name, operator, type.getString(value));
+        }
+
+        @Override
+        Kind kind()
+        {
+            return Kind.SIMPLE;
+        }
+    }
+
+    /**
+     * An expression of the form 'column' ['key'] = 'value' (which is only
+     * supported when 'column' is a map).
+     */
+    private static class MapEqualityExpression extends Expression
+    {
+        private final ByteBuffer key;
+
+        public MapEqualityExpression(ColumnDefinition column, ByteBuffer key, Operator operator, ByteBuffer value)
+        {
+            super(column, operator, value);
+            assert column.type instanceof MapType && operator == Operator.EQ;
+            this.key = key;
+        }
+
+        @Override
+        public void validate() throws InvalidRequestException
+        {
+            checkNotNull(key, "Unsupported null map key for column %s", column.name);
+            checkBindValueSet(key, "Unsupported unset map key for column %s", column.name);
+            checkNotNull(value, "Unsupported null map value for column %s", column.name);
+            checkBindValueSet(value, "Unsupported unset map value for column %s", column.name);
+        }
+
+        @Override
+        public ByteBuffer getIndexValue()
+        {
+            return CompositeType.build(key, value);
+        }
+
+        public boolean isSatisfiedBy(CFMetaData metadata, DecoratedKey partitionKey, Row row)
+        {
+            assert key != null;
+            // We support null conditions for LWT (in ColumnCondition) but not for RowFilter.
+            // TODO: we should try to merge both code someday.
+            assert value != null;
+
+            if (row.isStatic() != column.isStatic())
+                return true;
+
+            MapType<?, ?> mt = (MapType<?, ?>)column.type;
+            if (column.isComplex())
+            {
+                Cell cell = row.getCell(column, CellPath.create(key));
+                return cell != null && mt.valueComparator().compare(cell.value(), value) == 0;
+            }
+            else
+            {
+                ByteBuffer serializedMap = getValue(metadata, partitionKey, row);
+                if (serializedMap == null)
+                    return false;
+
+                ByteBuffer foundValue = mt.getSerializer().getSerializedValue(serializedMap, key, mt.getKeysType());
+                return foundValue != null && mt.valueComparator().compare(foundValue, value) == 0;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            MapType<?, ?> mt = (MapType<?, ?>)column.type;
+            return String.format("%s[%s] = %s", column.name, mt.nameComparator().getString(key), mt.valueComparator().getString(value));
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o)
+                return true;
+
+            if (!(o instanceof MapEqualityExpression))
+                return false;
+
+            MapEqualityExpression that = (MapEqualityExpression)o;
+
+            return Objects.equal(this.column.name, that.column.name)
+                && Objects.equal(this.operator, that.operator)
+                && Objects.equal(this.key, that.key)
+                && Objects.equal(this.value, that.value);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(column.name, operator, key, value);
+        }
+
+        @Override
+        Kind kind()
+        {
+            return Kind.MAP_EQUALITY;
+        }
+    }
+
+    /**
+     * An expression of the form 'name' = 'value', but where 'name' is actually the
+     * clustering value for a compact table. This is only for thrift.
+     */
+    private static class ThriftExpression extends Expression
+    {
+        public ThriftExpression(CFMetaData metadata, ByteBuffer name, Operator operator, ByteBuffer value)
+        {
+            super(makeDefinition(metadata, name), operator, value);
+            assert metadata.isCompactTable();
+        }
+
+        private static ColumnDefinition makeDefinition(CFMetaData metadata, ByteBuffer name)
+        {
+            ColumnDefinition def = metadata.getColumnDefinition(name);
+            if (def != null)
+                return def;
+
+            // In thrift, we actually allow expression on non-defined columns for the sake of filtering. To accomodate
+            // this we create a "fake" definition. This is messy but it works so is probably good enough.
+            return ColumnDefinition.regularDef(metadata, name, metadata.compactValueColumn().type);
+        }
+
+        public boolean isSatisfiedBy(CFMetaData metadata, DecoratedKey partitionKey, Row row)
+        {
+            assert value != null;
+
+            // On thrift queries, even if the column expression is a "static" one, we'll have convert it as a "dynamic"
+            // one in ThriftResultsMerger, so we always expect it to be a dynamic one. Further, we expect this is only
+            // called when the row clustering does match the column (see ThriftFilter above).
+            assert row.clustering().equals(makeCompactClustering(metadata, column.name.bytes));
+            Cell cell = row.getCell(metadata.compactValueColumn());
+            return cell != null && operator.isSatisfiedBy(column.type, cell.value(), value);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s %s %s", column.name, operator, column.type.getString(value));
+        }
+
+        @Override
+        Kind kind()
+        {
+            return Kind.THRIFT_DYN_EXPR;
+        }
+    }
+
+    /**
+     * A custom index expression for use with 2i implementations which support custom syntax and which are not
+     * necessarily linked to a single column in the base table.
+     */
+    public static final class CustomExpression extends Expression
+    {
+        private final IndexMetadata targetIndex;
+        private final CFMetaData cfm;
+
+        public CustomExpression(CFMetaData cfm, IndexMetadata targetIndex, ByteBuffer value)
+        {
+            // The operator is not relevant, but Expression requires it so for now we just hardcode EQ
+            super(makeDefinition(cfm, targetIndex), Operator.EQ, value);
+            this.targetIndex = targetIndex;
+            this.cfm = cfm;
+        }
+
+        private static ColumnDefinition makeDefinition(CFMetaData cfm, IndexMetadata index)
+        {
+            // Similarly to how we handle non-defined columns in thift, we create a fake column definition to
+            // represent the target index. This is definitely something that can be improved though.
+            return ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(index.name.getBytes()), BytesType.instance);
+        }
+
+        public IndexMetadata getTargetIndex()
+        {
+            return targetIndex;
+        }
+
+        public ByteBuffer getValue()
+        {
+            return value;
+        }
+
+        public String toString()
+        {
+            return String.format("expr(%s, %s)",
+                                 targetIndex.name,
+                                 Keyspace.openAndGetStore(cfm)
+                                         .indexManager
+                                         .getIndex(targetIndex)
+                                         .customExpressionValueType());
+        }
+
+        Kind kind()
+        {
+            return Kind.CUSTOM;
+        }
+
+        // Filtering by custom expressions isn't supported yet, so just accept any row
+        public boolean isSatisfiedBy(CFMetaData metadata, DecoratedKey partitionKey, Row row)
+        {
+            return true;
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(RowFilter filter, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeBoolean(filter instanceof ThriftFilter);
+            out.writeUnsignedVInt(filter.expressions.size());
+            for (Expression expr : filter.expressions)
+                Expression.serializer.serialize(expr, out, version);
+
+        }
+
+        public RowFilter deserialize(DataInputPlus in, int version, CFMetaData metadata) throws IOException
+        {
+            boolean forThrift = in.readBoolean();
+            int size = (int)in.readUnsignedVInt();
+            List<Expression> expressions = new ArrayList<>(size);
+            for (int i = 0; i < size; i++)
+                expressions.add(Expression.serializer.deserialize(in, version, metadata));
+
+            return forThrift
+                 ? new ThriftFilter(expressions)
+                 : new CQLFilter(expressions);
+        }
+
+        public long serializedSize(RowFilter filter, int version)
+        {
+            long size = 1 // forThrift
+                      + TypeSizes.sizeofUnsignedVInt(filter.expressions.size());
+            for (Expression expr : filter.expressions)
+                size += Expression.serializer.serializedSize(expr, version);
+            return size;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java b/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
deleted file mode 100644
index 95f67e6..0000000
--- a/src/java/org/apache/cassandra/db/filter/SliceQueryFilter.java
+++ /dev/null

@@ -1,591 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.filter;
-
-import java.nio.ByteBuffer;
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.*;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Iterators;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.utils.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.service.ClientWarn;
-import org.apache.cassandra.tracing.Tracing;
-
-public class SliceQueryFilter implements IDiskAtomFilter
-{
-    private static final Logger logger = LoggerFactory.getLogger(SliceQueryFilter.class);
-
-    /**
-     * A special value for compositesToGroup that indicates that partitioned tombstones should not be included in results
-     * or count towards the limit.  See CASSANDRA-8490 for more details on why this is needed (and done this way).
-     **/
-    public static final int IGNORE_TOMBSTONED_PARTITIONS = -2;
-
-    public final ColumnSlice[] slices;
-    public final boolean reversed;
-    public volatile int count;
-    public final int compositesToGroup;
-
-    // Not serialized, just a ack for range slices to find the number of live column counted, even when we group
-    private ColumnCounter columnCounter;
-
-    public SliceQueryFilter(Composite start, Composite finish, boolean reversed, int count)
-    {
-        this(new ColumnSlice(start, finish), reversed, count);
-    }
-
-    public SliceQueryFilter(Composite start, Composite finish, boolean reversed, int count, int compositesToGroup)
-    {
-        this(new ColumnSlice(start, finish), reversed, count, compositesToGroup);
-    }
-
-    public SliceQueryFilter(ColumnSlice slice, boolean reversed, int count)
-    {
-        this(new ColumnSlice[]{ slice }, reversed, count);
-    }
-
-    public SliceQueryFilter(ColumnSlice slice, boolean reversed, int count, int compositesToGroup)
-    {
-        this(new ColumnSlice[]{ slice }, reversed, count, compositesToGroup);
-    }
-
-    /**
-     * Constructor that accepts multiple slices. All slices are assumed to be in the same direction (forward or
-     * reversed).
-     */
-    public SliceQueryFilter(ColumnSlice[] slices, boolean reversed, int count)
-    {
-        this(slices, reversed, count, -1);
-    }
-
-    public SliceQueryFilter(ColumnSlice[] slices, boolean reversed, int count, int compositesToGroup)
-    {
-        this.slices = slices;
-        this.reversed = reversed;
-        this.count = count;
-        this.compositesToGroup = compositesToGroup;
-    }
-
-    public SliceQueryFilter cloneShallow()
-    {
-        return new SliceQueryFilter(slices, reversed, count, compositesToGroup);
-    }
-
-    public SliceQueryFilter withUpdatedCount(int newCount)
-    {
-        return new SliceQueryFilter(slices, reversed, newCount, compositesToGroup);
-    }
-
-    public SliceQueryFilter withUpdatedSlices(ColumnSlice[] newSlices)
-    {
-        return new SliceQueryFilter(newSlices, reversed, count, compositesToGroup);
-    }
-
-    /** Returns true if the slice includes static columns, false otherwise. */
-    private boolean sliceIncludesStatics(ColumnSlice slice, CFMetaData cfm)
-    {
-        return cfm.hasStaticColumns() &&
-                slice.includes(reversed ? cfm.comparator.reverseComparator() : cfm.comparator, cfm.comparator.staticPrefix().end());
-    }
-
-    public boolean hasStaticSlice(CFMetaData cfm)
-    {
-        for (ColumnSlice slice : slices)
-            if (sliceIncludesStatics(slice, cfm))
-                return true;
-
-        return false;
-    }
-
-    /**
-     * Splits this filter into two SliceQueryFilters: one that slices only the static columns, and one that slices the
-     * remainder of the normal data.
-     *
-     * This should only be called when the filter is reversed and the filter is known to cover static columns (through
-     * hasStaticSlice()).
-     *
-     * @return a pair of (static, normal) SliceQueryFilters
-     */
-    public Pair<SliceQueryFilter, SliceQueryFilter> splitOutStaticSlice(CFMetaData cfm)
-    {
-        assert reversed;
-
-        Composite staticSliceEnd = cfm.comparator.staticPrefix().end();
-        List<ColumnSlice> nonStaticSlices = new ArrayList<>(slices.length);
-        for (ColumnSlice slice : slices)
-        {
-            if (sliceIncludesStatics(slice, cfm))
-                nonStaticSlices.add(new ColumnSlice(slice.start, staticSliceEnd));
-            else
-                nonStaticSlices.add(slice);
-        }
-
-        return Pair.create(
-            new SliceQueryFilter(staticSliceEnd, Composites.EMPTY, true, count, compositesToGroup),
-            new SliceQueryFilter(nonStaticSlices.toArray(new ColumnSlice[nonStaticSlices.size()]), true, count, compositesToGroup));
-    }
-
-    public SliceQueryFilter withUpdatedStart(Composite newStart, CFMetaData cfm)
-    {
-        Comparator<Composite> cmp = reversed ? cfm.comparator.reverseComparator() : cfm.comparator;
-
-        // Check our slices to see if any fall before the new start (in which case they can be removed) or
-        // if they contain the new start (in which case they should start from the page start).  However, if the
-        // slices would include static columns, we need to ensure they are also fetched, and so a separate
-        // slice for the static columns may be required.
-        // Note that if the query is reversed, we can't handle statics by simply adding a separate slice here, so
-        // the reversed case is handled by SliceFromReadCommand instead. See CASSANDRA-8502 for more details.
-        List<ColumnSlice> newSlices = new ArrayList<>();
-        boolean pastNewStart = false;
-        for (ColumnSlice slice : slices)
-        {
-            if (pastNewStart)
-            {
-                newSlices.add(slice);
-                continue;
-            }
-
-            if (slice.isBefore(cmp, newStart))
-            {
-                if (!reversed && sliceIncludesStatics(slice, cfm))
-                    newSlices.add(new ColumnSlice(Composites.EMPTY, cfm.comparator.staticPrefix().end()));
-
-                continue;
-            }
-            else if (slice.includes(cmp, newStart))
-            {
-                if (!reversed && sliceIncludesStatics(slice, cfm) && !newStart.isEmpty())
-                    newSlices.add(new ColumnSlice(Composites.EMPTY, cfm.comparator.staticPrefix().end()));
-
-                newSlices.add(new ColumnSlice(newStart, slice.finish));
-            }
-            else
-            {
-                newSlices.add(slice);
-            }
-
-            pastNewStart = true;
-        }
-        return withUpdatedSlices(newSlices.toArray(new ColumnSlice[newSlices.size()]));
-    }
-
-    public Iterator<Cell> getColumnIterator(ColumnFamily cf)
-    {
-        assert cf != null;
-        return reversed ? cf.reverseIterator(slices) : cf.iterator(slices);
-    }
-
-    public OnDiskAtomIterator getColumnIterator(final DecoratedKey key, final ColumnFamily cf)
-    {
-        assert cf != null;
-        final Iterator<Cell> iter = getColumnIterator(cf);
-
-        return new OnDiskAtomIterator()
-        {
-            public ColumnFamily getColumnFamily()
-            {
-                return cf;
-            }
-
-            public DecoratedKey getKey()
-            {
-                return key;
-            }
-
-            public boolean hasNext()
-            {
-                return iter.hasNext();
-            }
-
-            public OnDiskAtom next()
-            {
-                return iter.next();
-            }
-
-            public void close() throws IOException { }
-
-            public void remove()
-            {
-                throw new UnsupportedOperationException();
-            }
-        };
-    }
-
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, DecoratedKey key)
-    {
-        return sstable.iterator(key, slices, reversed);
-    }
-
-    public OnDiskAtomIterator getSSTableColumnIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry)
-    {
-        return sstable.iterator(file, key, slices, reversed, indexEntry);
-    }
-
-    public Comparator<Cell> getColumnComparator(CellNameType comparator)
-    {
-        return reversed ? comparator.columnReverseComparator() : comparator.columnComparator(false);
-    }
-
-    public void collectReducedColumns(ColumnFamily container, Iterator<Cell> reducedColumns, DecoratedKey key, int gcBefore, long now)
-    {
-        columnCounter = columnCounter(container.getComparator(), now);
-        DeletionInfo.InOrderTester tester = container.deletionInfo().inOrderTester(reversed);
-
-        while (reducedColumns.hasNext())
-        {
-            Cell cell = reducedColumns.next();
-
-            if (logger.isTraceEnabled())
-                logger.trace("collecting {} of {}: {}", columnCounter.live(), count, cell.getString(container.getComparator()));
-
-            // An expired tombstone will be immediately discarded in memory, and needn't be counted.
-            // Neither should be any cell shadowed by a range- or a partition tombstone.
-            if (cell.getLocalDeletionTime() < gcBefore || !columnCounter.count(cell, tester))
-                continue;
-
-            if (columnCounter.live() > count)
-                break;
-
-            if (respectTombstoneThresholds() && columnCounter.tombstones() > DatabaseDescriptor.getTombstoneFailureThreshold())
-            {
-                Tracing.trace("Scanned over {} tombstones; query aborted (see tombstone_failure_threshold); slices={}",
-                              DatabaseDescriptor.getTombstoneFailureThreshold(), getSlicesInfo(container));
-
-                throw new TombstoneOverwhelmingException(columnCounter.tombstones(),
-                                                         count,
-                                                         container.metadata().ksName,
-                                                         container.metadata().cfName,
-                                                         container.getComparator().getString(cell.name()),
-                                                         getSlicesInfo(container),
-                                                         container.metadata().getKeyValidator().getString(key.getKey()));
-            }
-
-            container.appendColumn(cell);
-        }
-
-        boolean warnTombstones = logger.isWarnEnabled() && respectTombstoneThresholds() && columnCounter.tombstones() > DatabaseDescriptor.getTombstoneWarnThreshold();
-        if (warnTombstones)
-        {
-            String msg = String.format("Read %d live and %d tombstone cells in %s.%s for key: %1.512s (see tombstone_warn_threshold). %d columns were requested, slices=%1.512s",
-                                       columnCounter.live(),
-                                       columnCounter.tombstones(),
-                                       container.metadata().ksName,
-                                       container.metadata().cfName,
-                                       container.metadata().getKeyValidator().getString(key.getKey()),
-                                       count,
-                                       getSlicesInfo(container));
-            ClientWarn.instance.warn(msg);
-            logger.warn(msg);
-        }
-        Tracing.trace("Read {} live and {} tombstone cells{}",
-                      columnCounter.live(),
-                      columnCounter.tombstones(),
-                      warnTombstones ? " (see tombstone_warn_threshold)" : "");
-    }
-
-    private String getSlicesInfo(ColumnFamily container)
-    {
-        StringBuilder sb = new StringBuilder();
-        CellNameType type = container.metadata().comparator;
-        for (ColumnSlice sl : slices)
-        {
-            assert sl != null;
-
-            sb.append('[');
-            sb.append(type.getString(sl.start));
-            sb.append('-');
-            sb.append(type.getString(sl.finish));
-            sb.append(']');
-        }
-        return sb.toString();
-    }
-
-    protected boolean respectTombstoneThresholds()
-    {
-        return true;
-    }
-
-    public int getLiveCount(ColumnFamily cf, long now)
-    {
-        return columnCounter(cf.getComparator(), now).countAll(cf).live();
-    }
-
-    public ColumnCounter columnCounter(CellNameType comparator, long now)
-    {
-        if (compositesToGroup < 0)
-            return new ColumnCounter(now);
-
-        boolean countPartitionsWithOnlyStaticData = Arrays.equals(slices, ColumnSlice.ALL_COLUMNS_ARRAY);
-
-        if (compositesToGroup == 0)
-            return new ColumnCounter.GroupByPrefix(now, null, 0, countPartitionsWithOnlyStaticData);
-
-        if (reversed)
-            return new ColumnCounter.GroupByPrefixReversed(now, comparator, compositesToGroup, countPartitionsWithOnlyStaticData);
-
-        return new ColumnCounter.GroupByPrefix(now, comparator, compositesToGroup, countPartitionsWithOnlyStaticData);
-    }
-
-    public ColumnFamily trim(ColumnFamily cf, int trimTo, long now)
-    {
-        // each cell can increment the count by at most one, so if we have fewer cells than trimTo, we can skip trimming
-        if (cf.getColumnCount() < trimTo)
-            return cf;
-
-        ColumnCounter counter = columnCounter(cf.getComparator(), now);
-
-        ColumnFamily trimmedCf = cf.getFactory().create(cf.metadata(), reversed, trimTo);
-        trimmedCf.delete(cf);
-
-        Collection<Cell> cells = reversed
-                                   ? cf.getReverseSortedColumns()
-                                   : cf.getSortedColumns();
-
-        DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester(reversed);
-
-        for (Iterator<Cell> iter = cells.iterator(); iter.hasNext(); )
-        {
-            Cell cell = iter.next();
-            counter.count(cell, tester);
-
-            if (counter.live() > trimTo)
-            {
-                break;
-            }
-            else
-            {
-                trimmedCf.addColumn(cell);
-            }
-        }
-
-        return trimmedCf;
-    }
-
-    public Composite start()
-    {
-        return this.slices[0].start;
-    }
-
-    public Composite finish()
-    {
-        return this.slices[slices.length - 1].finish;
-    }
-
-    public void setStart(Composite start)
-    {
-        assert slices.length == 1;
-        this.slices[0] = new ColumnSlice(start, this.slices[0].finish);
-    }
-
-    public int lastCounted()
-    {
-        // If we have a slice limit set, columnCounter.live() can overcount by one because we have to call
-        // columnCounter.count() before we can tell if we've exceeded the slice limit (and accordingly, should not
-        // add the cells to returned container).  To deal with this overcounting, we take the min of the slice
-        // limit and the counter's count.
-        return columnCounter == null ? 0 : Math.min(columnCounter.live(), count);
-    }
-
-    public int lastTombstones()
-    {
-        return columnCounter == null ? 0 : columnCounter.tombstones();
-    }
-
-    public int lastLive()
-    {
-        return columnCounter == null ? 0 : columnCounter.live();
-    }
-
-    @Override
-    public String toString()
-    {
-        return "SliceQueryFilter [reversed=" + reversed + ", slices=" + Arrays.toString(slices) + ", count=" + count + ", toGroup = " + compositesToGroup + "]";
-    }
-
-    public boolean isReversed()
-    {
-        return reversed;
-    }
-
-    public void updateColumnsLimit(int newLimit)
-    {
-        count = newLimit;
-    }
-
-    public boolean maySelectPrefix(CType type, Composite prefix)
-    {
-        for (ColumnSlice slice : slices)
-            if (slice.includes(type, prefix))
-                return true;
-        return false;
-    }
-
-    public boolean shouldInclude(SSTableReader sstable)
-    {
-        List<ByteBuffer> minColumnNames = sstable.getSSTableMetadata().minColumnNames;
-        List<ByteBuffer> maxColumnNames = sstable.getSSTableMetadata().maxColumnNames;
-        CellNameType comparator = sstable.metadata.comparator;
-
-        if (minColumnNames.isEmpty() || maxColumnNames.isEmpty())
-            return true;
-
-        for (ColumnSlice slice : slices)
-            if (slice.intersects(minColumnNames, maxColumnNames, comparator, reversed))
-                return true;
-
-        return false;
-    }
-
-    public boolean isHeadFilter()
-    {
-        return slices.length == 1 && slices[0].start.isEmpty() && !reversed;
-    }
-
-    public boolean countCQL3Rows(CellNameType comparator)
-    {
-        // If comparator is dense a cell == a CQL3 rows so we're always counting CQL3 rows
-        // in particular. Otherwise, we do so only if we group the cells into CQL rows.
-        return comparator.isDense() || compositesToGroup >= 0;
-    }
-
-    public boolean isFullyCoveredBy(ColumnFamily cf, long now)
-    {
-        // cf is the beginning of a partition. It covers this filter if:
-        //   1) either this filter requests the head of the partition and request less
-        //      than what cf has to offer (note: we do need to use getLiveCount() for that
-        //      as it knows if the filter count cells or CQL3 rows).
-        //   2) the start and finish bound of this filter are included in cf.
-        if (isHeadFilter() && count <= getLiveCount(cf, now))
-            return true;
-
-        if (start().isEmpty() || finish().isEmpty() || !cf.hasColumns())
-            return false;
-
-        Composite low = isReversed() ? finish() : start();
-        Composite high = isReversed() ? start() : finish();
-
-        CellName first = cf.iterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
-        CellName last = cf.reverseIterator(ColumnSlice.ALL_COLUMNS_ARRAY).next().name();
-
-        return cf.getComparator().compare(first, low) <= 0
-            && cf.getComparator().compare(high, last) <= 0;
-    }
-
-    public static class Serializer implements IVersionedSerializer<SliceQueryFilter>
-    {
-        private CType type;
-
-        public Serializer(CType type)
-        {
-            this.type = type;
-        }
-
-        public void serialize(SliceQueryFilter f, DataOutputPlus out, int version) throws IOException
-        {
-            out.writeInt(f.slices.length);
-            for (ColumnSlice slice : f.slices)
-                type.sliceSerializer().serialize(slice, out, version);
-            out.writeBoolean(f.reversed);
-            int count = f.count;
-            out.writeInt(count);
-
-            out.writeInt(f.compositesToGroup);
-        }
-
-        public SliceQueryFilter deserialize(DataInput in, int version) throws IOException
-        {
-            ColumnSlice[] slices;
-            slices = new ColumnSlice[in.readInt()];
-            for (int i = 0; i < slices.length; i++)
-                slices[i] = type.sliceSerializer().deserialize(in, version);
-            boolean reversed = in.readBoolean();
-            int count = in.readInt();
-            int compositesToGroup = in.readInt();
-
-            return new SliceQueryFilter(slices, reversed, count, compositesToGroup);
-        }
-
-        public long serializedSize(SliceQueryFilter f, int version)
-        {
-            TypeSizes sizes = TypeSizes.NATIVE;
-
-            int size = 0;
-            size += sizes.sizeof(f.slices.length);
-            for (ColumnSlice slice : f.slices)
-                size += type.sliceSerializer().serializedSize(slice, version);
-            size += sizes.sizeof(f.reversed);
-            size += sizes.sizeof(f.count);
-
-            size += sizes.sizeof(f.compositesToGroup);
-            return size;
-        }
-    }
-
-    public Iterator<RangeTombstone> getRangeTombstoneIterator(final ColumnFamily source)
-    {
-        final DeletionInfo delInfo = source.deletionInfo();
-        if (!delInfo.hasRanges() || slices.length == 0)
-            return Iterators.emptyIterator();
-
-        return new AbstractIterator<RangeTombstone>()
-        {
-            private int sliceIdx = 0;
-            private Iterator<RangeTombstone> sliceIter = currentRangeIter();
-
-            protected RangeTombstone computeNext()
-            {
-                while (true)
-                {
-                    if (sliceIter.hasNext())
-                        return sliceIter.next();
-
-                    if (!nextSlice())
-                        return endOfData();
-
-                    sliceIter = currentRangeIter();
-                }
-            }
-
-            private Iterator<RangeTombstone> currentRangeIter()
-            {
-                ColumnSlice slice = slices[reversed ? (slices.length - 1 - sliceIdx) : sliceIdx];
-                return reversed ? delInfo.rangeIterator(slice.finish, slice.start)
-                                : delInfo.rangeIterator(slice.start, slice.finish);
-            }
-
-            private boolean nextSlice()
-            {
-                return ++sliceIdx < slices.length;
-            }
-        };
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java b/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java
index da4e30f..622edb4 100644
--- a/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java
+++ b/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java

@@ -18,43 +18,51 @@
  */
 package org.apache.cassandra.db.filter;
 
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.*;
+
 public class TombstoneOverwhelmingException extends RuntimeException
 {
-    private final int numTombstones;
-    private final int numRequested;
-    private final String ksName;
-    private final String cfName;
-    private final String lastCellName;
-    private final String slicesInfo;
-    private final String partitionKey;
-
-    public TombstoneOverwhelmingException(int numTombstones,
-                                          int numRequested,
-                                          String ksName,
-                                          String cfName,
-                                          String lastCellName,
-                                          String slicesInfo,
-                                          String partitionKey)
+    public TombstoneOverwhelmingException(int numTombstones, String query, CFMetaData metadata, DecoratedKey lastPartitionKey, ClusteringPrefix lastClustering)
     {
-        this.numTombstones = numTombstones;
-        this.numRequested = numRequested;
-        this.ksName = ksName;
-        this.cfName = cfName;
-        this.lastCellName = lastCellName;
-        this.slicesInfo = slicesInfo;
-        this.partitionKey = partitionKey;
+        super(String.format("Scanned over %d tombstones during query '%s' (last scanned row token was %s and partion key was (%s)); query aborted",
+                            numTombstones, query, lastPartitionKey.getToken(), makePKString(metadata, lastPartitionKey.getKey(), lastClustering)));
     }
 
-    public String getLocalizedMessage()
+    private static String makePKString(CFMetaData metadata, ByteBuffer partitionKey, ClusteringPrefix clustering)
     {
-        return getMessage();
-    }
+        StringBuilder sb = new StringBuilder();
 
-    public String getMessage()
-    {
-        return String.format(
-                "Scanned over %d tombstones in %s.%s; %d columns were requested; query aborted " +
-                "(see tombstone_failure_threshold); partitionKey=%s; lastCell=%s; slices=%s",
-                numTombstones, ksName, cfName, numRequested, partitionKey, lastCellName, slicesInfo);
+        if (clustering.size() > 0)
+            sb.append("(");
+
+        // TODO: We should probably make that a lot easier/transparent for partition keys
+        AbstractType<?> pkType = metadata.getKeyValidator();
+        if (pkType instanceof CompositeType)
+        {
+            CompositeType ct = (CompositeType)pkType;
+            ByteBuffer[] values = ct.split(partitionKey);
+            for (int i = 0; i < values.length; i++)
+            {
+                if (i > 0)
+                    sb.append(", ");
+                sb.append(ct.types.get(i).getString(values[i]));
+            }
+        }
+        else
+        {
+            sb.append(pkType.getString(partitionKey));
+        }
+
+        if (clustering.size() > 0)
+            sb.append(")");
+
+        for (int i = 0; i < clustering.size(); i++)
+            sb.append(", ").append(metadata.comparator.subtype(i).getString(clustering.get(i)));
+
+        return sb.toString();
     }
 }

diff --git a/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java b/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java
deleted file mode 100644
index 4410acc..0000000
--- a/src/java/org/apache/cassandra/db/index/AbstractSimplePerColumnSecondaryIndex.java
+++ /dev/null

@@ -1,189 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.nio.ByteBuffer;
-import java.util.concurrent.Future;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.dht.LocalPartitioner;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-/**
- * Implements a secondary index for a column family using a second column family
- * in which the row keys are indexed values, and column names are base row keys.
- */
-public abstract class AbstractSimplePerColumnSecondaryIndex extends PerColumnSecondaryIndex
-{
-    protected ColumnFamilyStore indexCfs;
-
-    // SecondaryIndex "forces" a set of ColumnDefinition. However this class (and thus it's subclass)
-    // only support one def per index. So inline it in a field for 1) convenience and 2) avoid creating
-    // an iterator each time we need to access it.
-    // TODO: we should fix SecondaryIndex API
-    protected ColumnDefinition columnDef;
-
-    public void init()
-    {
-        assert baseCfs != null && columnDefs != null && columnDefs.size() == 1;
-
-        columnDef = columnDefs.iterator().next();
-
-        CellNameType indexComparator = SecondaryIndex.getIndexComparator(baseCfs.metadata, columnDef);
-        CFMetaData indexedCfMetadata = CFMetaData.newIndexMetadata(baseCfs.metadata, columnDef, indexComparator);
-        indexCfs = ColumnFamilyStore.createColumnFamilyStore(baseCfs.keyspace,
-                                                             indexedCfMetadata.cfName,
-                                                             new LocalPartitioner(getIndexKeyComparator()),
-                                                             indexedCfMetadata,
-                                                             baseCfs.getTracker().loadsstables);
-    }
-
-    protected AbstractType<?> getIndexKeyComparator()
-    {
-        return columnDef.type;
-    }
-
-    @Override
-    String indexTypeForGrouping()
-    {
-        return "_internal_";
-    }
-
-    protected abstract CellName makeIndexColumnName(ByteBuffer rowKey, Cell cell);
-
-    protected abstract ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell);
-
-    protected abstract AbstractType getExpressionComparator();
-
-    public String expressionString(IndexExpression expr)
-    {
-        return String.format("'%s.%s %s %s'",
-                             baseCfs.name,
-                             getExpressionComparator().getString(expr.column),
-                             expr.operator,
-                             baseCfs.metadata.getColumnDefinition(expr.column).type.getString(expr.value));
-    }
-
-    public void delete(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
-    {
-        deleteForCleanup(rowKey, cell, opGroup);
-    }
-
-    public void deleteForCleanup(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
-    {
-        if (!cell.isLive())
-            return;
-
-        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey, cell));
-        int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
-        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata, false, 1);
-        cfi.addTombstone(makeIndexColumnName(rowKey, cell), localDeletionTime, cell.timestamp());
-        indexCfs.apply(valueKey, cfi, SecondaryIndexManager.nullUpdater, opGroup, null);
-        if (logger.isTraceEnabled())
-            logger.trace("removed index entry for cleaned-up value {}:{}", valueKey, cfi);
-    }
-
-    public void insert(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
-    {
-        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey, cell));
-        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata, false, 1);
-        CellName name = makeIndexColumnName(rowKey, cell);
-        if (!columnDef.isPrimaryKeyColumn() && cell instanceof ExpiringCell)
-        {
-            ExpiringCell ec = (ExpiringCell) cell;
-            cfi.addColumn(new BufferExpiringCell(name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ec.timestamp(), ec.getTimeToLive(), ec.getLocalDeletionTime()));
-        }
-        else
-        {
-            cfi.addColumn(new BufferCell(name, ByteBufferUtil.EMPTY_BYTE_BUFFER, cell.timestamp()));
-        }
-        if (logger.isTraceEnabled())
-            logger.trace("applying index row {} in {}", indexCfs.metadata.getKeyValidator().getString(valueKey.getKey()), cfi);
-
-        indexCfs.apply(valueKey, cfi, SecondaryIndexManager.nullUpdater, opGroup, null);
-    }
-
-    public void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup)
-    {
-        // insert the new value before removing the old one, so we never have a period
-        // where the row is invisible to both queries (the opposite seems preferable); see CASSANDRA-5540                    
-        insert(rowKey, col, opGroup);
-        if (SecondaryIndexManager.shouldCleanupOldValue(oldCol, col))
-            delete(rowKey, oldCol, opGroup);
-    }
-
-    public void removeIndex(ByteBuffer columnName)
-    {
-        indexCfs.invalidate();
-    }
-
-    public void forceBlockingFlush()
-    {
-        Future<?> wait;
-        // we synchronise on the baseCfs to make sure we are ordered correctly with other flushes to the base CFS
-        synchronized (baseCfs.getTracker())
-        {
-            wait = indexCfs.forceFlush();
-        }
-        FBUtilities.waitOnFuture(wait);
-    }
-
-    public void invalidate()
-    {
-        indexCfs.invalidate();
-    }
-
-    public void truncateBlocking(long truncatedAt)
-    {
-        indexCfs.discardSSTables(truncatedAt);
-    }
-
-    public ColumnFamilyStore getIndexCfs()
-    {
-       return indexCfs;
-    }
-
-    public String getIndexName()
-    {
-        return indexCfs.name;
-    }
-
-    public void reload()
-    {
-        indexCfs.metadata.reloadSecondaryIndexMetadata(baseCfs.metadata);
-        indexCfs.reload();
-    }
-    
-    public long estimateResultRows()
-    {
-        return getIndexCfs().getMeanColumns();
-    }
-
-    public boolean validate(ByteBuffer rowKey, Cell cell)
-    {
-        return getIndexedValue(rowKey, cell).remaining() < FBUtilities.MAX_UNSIGNED_SHORT
-            && makeIndexColumnName(rowKey, cell).toByteBuffer().remaining() < FBUtilities.MAX_UNSIGNED_SHORT;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/IndexNotAvailableException.java b/src/java/org/apache/cassandra/db/index/IndexNotAvailableException.java
deleted file mode 100644
index 750e899..0000000
--- a/src/java/org/apache/cassandra/db/index/IndexNotAvailableException.java
+++ /dev/null

@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-/**
- * Thrown if a secondary index is not currently available.
- */
-public final class IndexNotAvailableException extends RuntimeException
-{
-    /**
-     * Creates a new <code>IndexNotAvailableException</code> for the specified index.
-     * @param name the index name
-     */
-    public IndexNotAvailableException(String name)
-    {
-        super(String.format("The secondary index '%s' is not yet available",
-                            removeTableNameIfNeeded(name)));
-    }
-
-    /**
-     * Extract the name of the index if necessary.
-     *
-     * @param name the index name prefixed by the tablename or not
-     * @return the index name
-     */
-    private static String removeTableNameIfNeeded(String name)
-    {
-        int index = name.indexOf('.');
-        if (index < 0)
-            return name;
-
-        return name.substring(index + 1);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java b/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java
deleted file mode 100644
index ba902ec..0000000
--- a/src/java/org/apache/cassandra/db/index/PerColumnSecondaryIndex.java
+++ /dev/null

@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.utils.FBUtilities;
-
-/**
- * Base class for Secondary indexes that implement a unique index per column
- *
- */
-public abstract class PerColumnSecondaryIndex extends SecondaryIndex
-{
-    /**
-     * Called when a column has been tombstoned or replaced.
-     *
-     * @param rowKey the underlying row key which is indexed
-     * @param col all the column info
-     */
-    public abstract void delete(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup);
-
-    /**
-     * Called when a column has been removed due to a cleanup operation.
-     */
-    public abstract void deleteForCleanup(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup);
-
-    /**
-     * insert a column to the index
-     *
-     * @param rowKey the underlying row key which is indexed
-     * @param col all the column info
-     */
-    public abstract void insert(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup);
-
-    /**
-     * update a column from the index
-     *
-     * @param rowKey the underlying row key which is indexed
-     * @param oldCol the previous column info
-     * @param col all the column info
-     */
-    public abstract void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup);
-
-    public String getNameForSystemKeyspace(ByteBuffer column)
-    {
-        return getIndexName();
-    }
-
-    public boolean validate(ByteBuffer rowKey, Cell cell)
-    {
-        return validate(cell);
-    }
-
-    public boolean validate(Cell cell)
-    {
-        return cell.value().remaining() < FBUtilities.MAX_UNSIGNED_SHORT;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java b/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java
deleted file mode 100644
index 5a3d457..0000000
--- a/src/java/org/apache/cassandra/db/index/PerRowSecondaryIndex.java
+++ /dev/null

@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-/**
- *  Base class for Secondary indexes that implement a unique index per row
- */
-public abstract class PerRowSecondaryIndex extends SecondaryIndex
-{
-    /**
-     * Index the given row.
-     *
-     * @param rowKey the row key
-     * @param cf the cf data to be indexed
-     */
-    public abstract void index(ByteBuffer rowKey, ColumnFamily cf);
-
-    /**
-     * cleans up deleted columns from cassandra cleanup compaction
-     *
-     * @param key
-     */
-    public abstract void delete(DecoratedKey key, OpOrder.Group opGroup);
-
-    public String getNameForSystemKeyspace(ByteBuffer columnName)
-    {
-        try
-        {
-            return getIndexName()+ByteBufferUtil.string(columnName);
-        }
-        catch (CharacterCodingException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-
-    public boolean validate(ByteBuffer rowKey, Cell cell)
-    {
-        return validate(cell);
-    }
-
-    public boolean validate(Cell cell)
-    {
-        return true;
-    }
-
-    public void validate(ByteBuffer key, ColumnFamily cf) throws InvalidRequestException
-    {
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndex.java b/src/java/org/apache/cassandra/db/index/SecondaryIndex.java
deleted file mode 100644
index cf2deeb..0000000
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndex.java
+++ /dev/null

@@ -1,429 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.Future;
-import java.util.concurrent.FutureTask;
-
-import com.google.common.base.Objects;
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
-import org.apache.cassandra.db.index.composites.CompositesIndex;
-import org.apache.cassandra.db.index.keys.KeysIndex;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.LocalByPartionerType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.ReducingKeyIterator;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.FBUtilities;
-
-import org.apache.cassandra.utils.concurrent.Refs;
-
-/**
- * Abstract base class for different types of secondary indexes.
- *
- * Do not extend this directly, please pick from PerColumnSecondaryIndex or PerRowSecondaryIndex
- */
-public abstract class SecondaryIndex
-{
-    protected static final Logger logger = LoggerFactory.getLogger(SecondaryIndex.class);
-
-    public static final String CUSTOM_INDEX_OPTION_NAME = "class_name";
-
-    /**
-     * The name of the option used to specify that the index is on the collection keys.
-     */
-    public static final String INDEX_KEYS_OPTION_NAME = "index_keys";
-
-    /**
-     * The name of the option used to specify that the index is on the collection values.
-     */
-    public static final String INDEX_VALUES_OPTION_NAME = "index_values";
-
-    /**
-     * The name of the option used to specify that the index is on the collection (map) entries.
-     */
-    public static final String INDEX_ENTRIES_OPTION_NAME = "index_keys_and_values";
-
-    public static final AbstractType<?> keyComparator = StorageService.getPartitioner().preservesOrder()
-                                                      ? BytesType.instance
-                                                      : new LocalByPartionerType(StorageService.getPartitioner());
-
-    /**
-     * Base CF that has many indexes
-     */
-    protected ColumnFamilyStore baseCfs;
-
-    // We need to keep track if the index is queryable or not to be sure that we can safely use it. If the index
-    // is still being build, using it will return incomplete results.
-    /**
-     * Specify if the index is queryable or not.
-     */
-    private volatile boolean queryable;
-
-    /**
-     * The column definitions which this index is responsible for
-     */
-    protected final Set<ColumnDefinition> columnDefs = Collections.newSetFromMap(new ConcurrentHashMap<ColumnDefinition,Boolean>());
-
-    /**
-     * Perform any initialization work
-     */
-    public abstract void init();
-
-    /**
-     * Reload an existing index following a change to its configuration,
-     * or that of the indexed column(s). Differs from init() in that we expect
-     * expect new resources (such as CFS for a KEYS index) to be created by
-     * init() but not here
-     */
-    public abstract void reload();
-
-    /**
-     * Validates the index_options passed in the ColumnDef
-     * @throws ConfigurationException
-     */
-    public abstract void validateOptions() throws ConfigurationException;
-
-    /**
-     * @return The name of the index
-     */
-    abstract public String getIndexName();
-
-    /**
-     * All internal 2ndary indexes will return "_internal_" for this. Custom
-     * 2ndary indexes will return their class name. This only matter for
-     * SecondaryIndexManager.groupByIndexType.
-     */
-    String indexTypeForGrouping()
-    {
-        // Our internal indexes overwrite this
-        return getClass().getCanonicalName();
-    }
-
-    /**
-     * Return the unique name for this index and column
-     * to be stored in the SystemKeyspace that tracks if each column is built
-     *
-     * @param columnName the name of the column
-     * @return the unique name
-     */
-    abstract public String getNameForSystemKeyspace(ByteBuffer columnName);
-
-    /**
-     * Checks if the index for specified column is fully built
-     *
-     * @param columnName the column
-     * @return true if the index is fully built
-     */
-    public boolean isIndexBuilt(ByteBuffer columnName)
-    {
-        return SystemKeyspace.isIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnName));
-    }
-
-    /**
-     * Checks if the index is ready.
-     * @return <code>true</code> if the index is ready, <code>false</code> otherwise
-     */
-    public boolean isQueryable()
-    {
-        return queryable;
-    }
-
-    public void setIndexBuilt()
-    {
-        queryable = true;
-        for (ColumnDefinition columnDef : columnDefs)
-            SystemKeyspace.setIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnDef.name.bytes));
-    }
-
-    public void setIndexRemoved()
-    {
-        for (ColumnDefinition columnDef : columnDefs)
-            SystemKeyspace.setIndexRemoved(baseCfs.keyspace.getName(), getNameForSystemKeyspace(columnDef.name.bytes));
-    }
-
-    /**
-     * Called at query time
-     * Creates a implementation specific searcher instance for this index type
-     * @param columns the list of columns which belong to this index type
-     * @return the secondary index search impl
-     */
-    protected abstract SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns);
-
-    /**
-     * Forces this indexes' in memory data to disk
-     */
-    public abstract void forceBlockingFlush();
-
-    /**
-     * Allow access to the underlying column family store if there is one
-     * @return the underlying column family store or null
-     */
-    public abstract ColumnFamilyStore getIndexCfs();
-
-
-    /**
-     * Delete all files and references to this index
-     * @param columnName the indexed column to remove
-     */
-    public abstract void removeIndex(ByteBuffer columnName);
-
-    /**
-     * Remove the index and unregisters this index's mbean if one exists
-     */
-    public abstract void invalidate();
-
-    /**
-     * Truncate all the data from the current index
-     *
-     * @param truncatedAt The truncation timestamp, all data before that timestamp should be rejected.
-     */
-    public abstract void truncateBlocking(long truncatedAt);
-
-    /**
-     * Builds the index using the data in the underlying CFS
-     * Blocks till it's complete
-     */
-    protected void buildIndexBlocking()
-    {
-        logger.info(String.format("Submitting index build of %s for data in %s",
-                getIndexName(), StringUtils.join(baseCfs.getSSTables(), ", ")));
-
-        try (Refs<SSTableReader> sstables = baseCfs.selectAndReference(ColumnFamilyStore.CANONICAL_SSTABLES).refs)
-        {
-            SecondaryIndexBuilder builder = new SecondaryIndexBuilder(baseCfs,
-                                                                      Collections.singleton(getIndexName()),
-                                                                      new ReducingKeyIterator(sstables));
-            Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
-            FBUtilities.waitOnFuture(future);
-            forceBlockingFlush();
-            setIndexBuilt();
-        }
-        logger.info("Index build of {} complete", getIndexName());
-    }
-
-
-    /**
-     * Builds the index using the data in the underlying CF, non blocking
-     *
-     *
-     * @return A future object which the caller can block on (optional)
-     */
-    public final Future<?> buildIndexAsync()
-    {
-        // if we're just linking in the index to indexedColumns on an already-built index post-restart, we're done
-        boolean allAreBuilt = true;
-        for (ColumnDefinition cdef : columnDefs)
-        {
-            if (!SystemKeyspace.isIndexBuilt(baseCfs.keyspace.getName(), getNameForSystemKeyspace(cdef.name.bytes)))
-            {
-                allAreBuilt = false;
-                break;
-            }
-        }
-
-        if (allAreBuilt)
-        {
-            queryable = true;
-            return null;
-        }
-
-        // If the base table is empty we can directly mark the index as built.
-        if (baseCfs.isEmpty())
-        {
-            setIndexBuilt();
-            return null;
-        }
-
-        // build it asynchronously; addIndex gets called by CFS open and schema update, neither of which
-        // we want to block for a long period.  (actual build is serialized on CompactionManager.)
-        Runnable runnable = new Runnable()
-        {
-            public void run()
-            {
-                baseCfs.forceBlockingFlush();
-                buildIndexBlocking();
-            }
-        };
-        FutureTask<?> f = new FutureTask<Object>(runnable, null);
-
-        new Thread(f, "Creating index: " + getIndexName()).start();
-        return f;
-    }
-
-    public ColumnFamilyStore getBaseCfs()
-    {
-        return baseCfs;
-    }
-
-    private void setBaseCfs(ColumnFamilyStore baseCfs)
-    {
-        this.baseCfs = baseCfs;
-    }
-
-    public Set<ColumnDefinition> getColumnDefs()
-    {
-        return columnDefs;
-    }
-
-    void addColumnDef(ColumnDefinition columnDef)
-    {
-       columnDefs.add(columnDef);
-    }
-
-    void removeColumnDef(ByteBuffer name)
-    {
-        Iterator<ColumnDefinition> it = columnDefs.iterator();
-        while (it.hasNext())
-        {
-            if (it.next().name.bytes.equals(name))
-                it.remove();
-        }
-    }
-
-    /** Returns true if the index supports lookups for the given operator, false otherwise. */
-    public boolean supportsOperator(Operator operator)
-    {
-        return operator == Operator.EQ;
-    }
-
-    /**
-     * Returns the decoratedKey for a column value. Assumes an index CFS is present.
-     * @param value column value
-     * @return decorated key
-     */
-    public DecoratedKey getIndexKeyFor(ByteBuffer value)
-    {
-        return getIndexCfs().partitioner.decorateKey(value);
-    }
-
-    /**
-     * Returns true if the provided cell name is indexed by this secondary index.
-     *
-     * The default implementation checks whether the name is one the columnDef name,
-     * but this should be overriden but subclass if needed.
-     */
-    public abstract boolean indexes(CellName name);
-
-    /**
-     * Returns true if the provided column definition is indexed by this secondary index.
-     *
-     * The default implementation checks whether it is contained in this index column definitions set.
-     */
-    public boolean indexes(ColumnDefinition cdef)
-    {
-        return columnDefs.contains(cdef);
-    }
-
-    /**
-     * This is the primary way to create a secondary index instance for a CF column.
-     * It will validate the index_options before initializing.
-     *
-     * @param baseCfs the source of data for the Index
-     * @param cdef the meta information about this column (index_type, index_options, name, etc...)
-     *
-     * @return The secondary index instance for this column
-     * @throws ConfigurationException
-     */
-    public static SecondaryIndex createInstance(ColumnFamilyStore baseCfs, ColumnDefinition cdef) throws ConfigurationException
-    {
-        SecondaryIndex index;
-
-        switch (cdef.getIndexType())
-        {
-        case KEYS:
-            index = new KeysIndex();
-            break;
-        case COMPOSITES:
-            index = CompositesIndex.create(cdef);
-            break;
-        case CUSTOM:
-            assert cdef.getIndexOptions() != null;
-            String class_name = cdef.getIndexOptions().get(CUSTOM_INDEX_OPTION_NAME);
-            assert class_name != null;
-            try
-            {
-                index = (SecondaryIndex) Class.forName(class_name).newInstance();
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException(e);
-            }
-            break;
-            default:
-                throw new RuntimeException("Unknown index type: " + cdef.getIndexName());
-        }
-
-        index.addColumnDef(cdef);
-        index.validateOptions();
-        index.setBaseCfs(baseCfs);
-
-        return index;
-    }
-
-    public abstract boolean validate(ByteBuffer rowKey, Cell cell);
-
-    public abstract long estimateResultRows();
-
-    /**
-     * Returns the index comparator for index backed by CFS, or null.
-     *
-     * Note: it would be cleaner to have this be a member method. However we need this when opening indexes
-     * sstables, but by then the CFS won't be fully initiated, so the SecondaryIndex object won't be accessible.
-     */
-    public static CellNameType getIndexComparator(CFMetaData baseMetadata, ColumnDefinition cdef)
-    {
-        switch (cdef.getIndexType())
-        {
-            case KEYS:
-                return new SimpleDenseCellNameType(keyComparator);
-            case COMPOSITES:
-                return CompositesIndex.getIndexComparator(baseMetadata, cdef);
-            case CUSTOM:
-                return null;
-        }
-        throw new AssertionError();
-    }
-
-    @Override
-    public String toString()
-    {
-        return Objects.toStringHelper(this).add("columnDefs", columnDefs).toString();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndexBuilder.java b/src/java/org/apache/cassandra/db/index/SecondaryIndexBuilder.java
deleted file mode 100644
index 916c286..0000000
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndexBuilder.java
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.io.IOException;
-import java.util.Set;
-import java.util.UUID;
-
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.compaction.CompactionInfo;
-import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.compaction.CompactionInterruptedException;
-import org.apache.cassandra.io.sstable.ReducingKeyIterator;
-import org.apache.cassandra.utils.UUIDGen;
-
-/**
- * Manages building an entire index from column family data. Runs on to compaction manager.
- */
-public class SecondaryIndexBuilder extends CompactionInfo.Holder
-{
-    private final ColumnFamilyStore cfs;
-    private final Set<String> idxNames;
-    private final ReducingKeyIterator iter;
-    private final UUID compactionId;
-
-    public SecondaryIndexBuilder(ColumnFamilyStore cfs, Set<String> idxNames, ReducingKeyIterator iter)
-    {
-        this.cfs = cfs;
-        this.idxNames = idxNames;
-        this.iter = iter;
-        compactionId = UUIDGen.getTimeUUID();
-    }
-
-    public CompactionInfo getCompactionInfo()
-    {
-        return new CompactionInfo(cfs.metadata,
-                                  OperationType.INDEX_BUILD,
-                                  iter.getBytesRead(),
-                                  iter.getTotalBytes(),
-                                  compactionId);
-    }
-
-    public void build()
-    {
-        while (iter.hasNext())
-        {
-            if (isStopRequested())
-                throw new CompactionInterruptedException(getCompactionInfo());
-            DecoratedKey key = iter.next();
-            Keyspace.indexRow(key, cfs, idxNames);
-        }
-
-        try
-        {
-            iter.close();
-        }
-        catch (IOException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java
deleted file mode 100644
index 26327d4..0000000
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndexManager.java
+++ /dev/null

@@ -1,894 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.IdentityHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ConcurrentNavigableMap;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.Future;
-
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.base.Joiner;
-
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.IndexType;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.io.sstable.ReducingKeyIterator;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-/**
- * Manages all the indexes associated with a given CFS
- * Different types of indexes can be created across the same CF
- */
-public class SecondaryIndexManager
-{
-    private static final Logger logger = LoggerFactory.getLogger(SecondaryIndexManager.class);
-
-    public static final Updater nullUpdater = new Updater()
-    {
-        public void insert(Cell cell) { }
-
-        public void update(Cell oldCell, Cell cell) { }
-
-        public void remove(Cell current) { }
-
-        public void updateRowLevelIndexes() {}
-    };
-
-    /**
-     * Organizes the indexes by column name
-     */
-    private final ConcurrentNavigableMap<ByteBuffer, SecondaryIndex> indexesByColumn;
-
-
-    /**
-     * Keeps a single instance of a SecondaryIndex for many columns when the index type
-     * has isRowLevelIndex() == true
-     *
-     * This allows updates to happen to an entire row at once
-     */
-    private final ConcurrentMap<Class<? extends SecondaryIndex>, SecondaryIndex> rowLevelIndexMap;
-
-
-    /**
-     * Keeps all secondary index instances, either per-column or per-row
-     */
-    private final Collection<SecondaryIndex> allIndexes;
-    private final Map<String, SecondaryIndex> indexesByName;
-
-
-    /**
-     * The underlying column family containing the source data for these indexes
-     */
-    public final ColumnFamilyStore baseCfs;
-
-    public SecondaryIndexManager(ColumnFamilyStore baseCfs)
-    {
-        indexesByColumn = new ConcurrentSkipListMap<>();
-        rowLevelIndexMap = new ConcurrentHashMap<>();
-        indexesByName = new ConcurrentHashMap<String, SecondaryIndex>();
-        allIndexes = indexesByName.values();
-
-        this.baseCfs = baseCfs;
-    }
-
-    /**
-     * Drops and adds new indexes associated with the underlying CF
-     */
-    public void reload()
-    {
-        // figure out what needs to be added and dropped.
-        // future: if/when we have modifiable settings for secondary indexes,
-        // they'll need to be handled here.
-        Collection<ByteBuffer> indexedColumnNames = indexesByColumn.keySet();
-        for (ByteBuffer indexedColumn : indexedColumnNames)
-        {
-            ColumnDefinition def = baseCfs.metadata.getColumnDefinition(indexedColumn);
-            if (def == null || def.getIndexType() == null)
-                removeIndexedColumn(indexedColumn);
-        }
-
-        // TODO: allow all ColumnDefinition type
-        for (ColumnDefinition cdef : baseCfs.metadata.allColumns())
-            if (cdef.getIndexType() != null && !indexedColumnNames.contains(cdef.name.bytes))
-                addIndexedColumn(cdef);
-
-        for (SecondaryIndex index : allIndexes)
-            index.reload();
-    }
-
-    public Set<String> allIndexesNames()
-    {
-        Set<String> names = new HashSet<>(allIndexes.size());
-        for (SecondaryIndex index : allIndexes)
-            names.add(index.getIndexName());
-        return names;
-    }
-
-    /**
-     * Does a full, blocking rebuild of the indexes specified by columns from the sstables.
-     * Does nothing if columns is empty.
-     *
-     * Caller must acquire and release references to the sstables used here.
-     *
-     * @param sstables the data to build from
-     * @param idxNames the list of columns to index, ordered by comparator
-     */
-    public void maybeBuildSecondaryIndexes(Collection<SSTableReader> sstables, Set<String> idxNames)
-    {
-        idxNames = filterByColumn(idxNames);
-        if (idxNames.isEmpty())
-            return;
-
-        logger.info(String.format("Submitting index build of %s for data in %s",
-                                  idxNames, StringUtils.join(sstables, ", ")));
-
-        SecondaryIndexBuilder builder = new SecondaryIndexBuilder(baseCfs, idxNames, new ReducingKeyIterator(sstables));
-        Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
-        FBUtilities.waitOnFuture(future);
-
-        flushIndexesBlocking();
-
-        logger.info("Index build of {} complete", idxNames);
-    }
-
-    public boolean indexes(CellName name, Collection<SecondaryIndex> indexes)
-    {
-        boolean matching = false;
-        for (SecondaryIndex index : indexes)
-        {
-            if (index.indexes(name))
-            {
-                matching = true;
-                break;
-            }
-        }
-        return matching;
-    }
-
-    public Set<SecondaryIndex> indexFor(CellName name, Collection<SecondaryIndex> indexes)
-    {
-        Set<SecondaryIndex> matching = null;
-        for (SecondaryIndex index : indexes)
-        {
-            if (index.indexes(name))
-            {
-                if (matching == null)
-                    matching = new HashSet<>();
-                matching.add(index);
-            }
-        }
-        return matching == null ? Collections.<SecondaryIndex>emptySet() : matching;
-    }
-
-    public boolean indexes(Cell cell)
-    {
-        return indexes(cell.name());
-    }
-
-    public boolean indexes(CellName name)
-    {
-        return indexes(name, allIndexes);
-    }
-
-    public Set<SecondaryIndex> indexFor(CellName name)
-    {
-        return indexFor(name, allIndexes);
-    }
-
-    /**
-     * @return true if at least one of the indexes can handle the clause.
-     */
-    public boolean hasIndexFor(List<IndexExpression> clause)
-    {
-        if (clause == null || clause.isEmpty())
-            return false;
-
-        for (SecondaryIndexSearcher searcher : getIndexSearchersForQuery(clause))
-            if (searcher.canHandleIndexClause(clause))
-                return true;
-
-        return false;
-    }
-
-    /**
-     * Removes a existing index
-     * @param column the indexed column to remove
-     */
-    public void removeIndexedColumn(ByteBuffer column)
-    {
-        SecondaryIndex index = indexesByColumn.remove(column);
-
-        if (index == null)
-            return;
-
-        // Remove this column from from row level index map as well as all indexes set
-        if (index instanceof PerRowSecondaryIndex)
-        {
-            index.removeColumnDef(column);
-
-            // If no columns left remove from row level lookup as well as all indexes set
-            if (index.getColumnDefs().isEmpty())
-            {
-                allIndexes.remove(index);
-                rowLevelIndexMap.remove(index.getClass());
-            }
-        }
-        else
-        {
-            allIndexes.remove(index);
-        }
-
-        index.removeIndex(column);
-        SystemKeyspace.setIndexRemoved(baseCfs.metadata.ksName, index.getNameForSystemKeyspace(column));
-    }
-
-    /**
-     * Adds and builds a index for a column
-     * @param cdef the column definition holding the index data
-     * @return a future which the caller can optionally block on signaling the index is built
-     */
-    public synchronized Future<?> addIndexedColumn(ColumnDefinition cdef)
-    {
-        if (indexesByColumn.containsKey(cdef.name.bytes))
-            return null;
-
-        assert cdef.getIndexType() != null;
-
-        SecondaryIndex index = SecondaryIndex.createInstance(baseCfs, cdef);
-
-        // Keep a single instance of the index per-cf for row level indexes
-        // since we want all columns to be under the index
-        if (index instanceof PerRowSecondaryIndex)
-        {
-            SecondaryIndex currentIndex = rowLevelIndexMap.get(index.getClass());
-
-            if (currentIndex == null)
-            {
-                rowLevelIndexMap.put(index.getClass(), index);
-                index.init();
-            }
-            else
-            {
-                index = currentIndex;
-                index.addColumnDef(cdef);
-                logger.info("Creating new index : {}",cdef);
-            }
-        }
-        else
-        {
-            // TODO: We sould do better than throw a RuntimeException
-            if (cdef.getIndexType() == IndexType.CUSTOM && index instanceof AbstractSimplePerColumnSecondaryIndex)
-                throw new RuntimeException("Cannot use a subclass of AbstractSimplePerColumnSecondaryIndex as a CUSTOM index, as they assume they are CFS backed");
-            index.init();
-        }
-
-        // link in indexedColumns. this means that writes will add new data to
-        // the index immediately,
-        // so we don't have to lock everything while we do the build. it's up to
-        // the operator to wait
-        // until the index is actually built before using in queries.
-        indexesByColumn.put(cdef.name.bytes, index);
-
-        // Add to all indexes set:
-        indexesByName.put(index.getIndexName(), index);
-
-        // We do not need to check if the index is already build as buildIndexAsync will do it for us
-        return index.buildIndexAsync();
-    }
-
-    /**
-     *
-     * @param column the name of indexes column
-     * @return the index
-     */
-    public SecondaryIndex getIndexForColumn(ByteBuffer column)
-    {
-        return indexesByColumn.get(column);
-    }
-
-    /**
-     * Remove the index
-     */
-    public void invalidate()
-    {
-        for (SecondaryIndex index : allIndexes)
-            index.invalidate();
-    }
-
-    /**
-     * Flush all indexes to disk
-     */
-    public void flushIndexesBlocking()
-    {
-        // despatch flushes for all CFS backed indexes
-        List<Future<?>> wait = new ArrayList<>();
-        synchronized (baseCfs.getTracker())
-        {
-            for (SecondaryIndex index : allIndexes)
-                if (index.getIndexCfs() != null)
-                    wait.add(index.getIndexCfs().forceFlush());
-        }
-
-        // blockingFlush any non-CFS-backed indexes
-        for (SecondaryIndex index : allIndexes)
-            if (index.getIndexCfs() == null)
-                index.forceBlockingFlush();
-
-        // wait for the CFS-backed index flushes to complete
-        FBUtilities.waitOnFutures(wait);
-    }
-
-    /**
-     * @return all built indexes (ready to use)
-     */
-    public List<String> getBuiltIndexes()
-    {
-        List<String> indexList = new ArrayList<>();
-
-        for (Map.Entry<ByteBuffer, SecondaryIndex> entry : indexesByColumn.entrySet())
-        {
-            SecondaryIndex index = entry.getValue();
-
-            if (index.isIndexBuilt(entry.getKey()))
-                indexList.add(entry.getValue().getIndexName());
-        }
-
-        return indexList;
-    }
-
-    /**
-     * @return all CFS from indexes which use a backing CFS internally (KEYS)
-     */
-    public Set<ColumnFamilyStore> getIndexesBackedByCfs()
-    {
-        Set<ColumnFamilyStore> cfsList = new HashSet<>();
-
-        for (SecondaryIndex index: allIndexes)
-        {
-            ColumnFamilyStore cfs = index.getIndexCfs();
-            if (cfs != null)
-                cfsList.add(cfs);
-        }
-
-        return cfsList;
-    }
-
-    /**
-     * @return all indexes which do *not* use a backing CFS internally
-     */
-    public Set<SecondaryIndex> getIndexesNotBackedByCfs()
-    {
-        // we use identity map because per row indexes use same instance across many columns
-        Set<SecondaryIndex> indexes = Collections.newSetFromMap(new IdentityHashMap<SecondaryIndex, Boolean>());
-        for (SecondaryIndex index: allIndexes)
-            if (index.getIndexCfs() == null)
-                indexes.add(index);
-        return indexes;
-    }
-
-    /**
-     * @return all of the secondary indexes without distinction to the (non-)backed by secondary ColumnFamilyStore.
-     */
-    public Collection<SecondaryIndex> getIndexes()
-    {
-        return allIndexes;
-    }
-
-    public SecondaryIndex getIndexByName(String name)
-    {
-        return indexesByName.get(name);
-    }
-
-    /**
-     * @return if there are ANY indexes for this table..
-     */
-    public boolean hasIndexes()
-    {
-        return !indexesByColumn.isEmpty();
-    }
-
-    /**
-     * When building an index against existing data, add the given row to the index
-     *
-     * @param key the row key
-     * @param cf the current rows data
-     */
-    public void indexRow(ByteBuffer key, ColumnFamily cf, OpOrder.Group opGroup)
-    {
-        // Update entire row only once per row level index
-        Set<Class<? extends SecondaryIndex>> appliedRowLevelIndexes = null;
-
-        for (SecondaryIndex index : allIndexes)
-        {
-            if (index instanceof PerRowSecondaryIndex)
-            {
-                if (appliedRowLevelIndexes == null)
-                    appliedRowLevelIndexes = new HashSet<>();
-
-                if (appliedRowLevelIndexes.add(index.getClass()))
-                    ((PerRowSecondaryIndex)index).index(key, cf);
-            }
-            else
-            {
-                for (Cell cell : cf)
-                    if (cell.isLive() && index.indexes(cell.name()))
-                        ((PerColumnSecondaryIndex) index).insert(key, cell, opGroup);
-            }
-        }
-    }
-
-    /**
-     * Delete all columns from all indexes for this row.  For when cleanup rips a row out entirely.
-     *
-     * @param key the row key
-     * @param indexedColumnsInRow all column names in row
-     */
-    public void deleteFromIndexes(DecoratedKey key, List<Cell> indexedColumnsInRow, OpOrder.Group opGroup)
-    {
-        // Update entire row only once per row level index
-        Set<Class<? extends SecondaryIndex>> cleanedRowLevelIndexes = null;
-
-        for (Cell cell : indexedColumnsInRow)
-        {
-            for (SecondaryIndex index : indexFor(cell.name()))
-            {
-                if (index instanceof PerRowSecondaryIndex)
-                {
-                    if (cleanedRowLevelIndexes == null)
-                        cleanedRowLevelIndexes = new HashSet<>();
-                    if (cleanedRowLevelIndexes.add(index.getClass()))
-                        ((PerRowSecondaryIndex) index).delete(key, opGroup);
-                }
-                else
-                {
-                    ((PerColumnSecondaryIndex) index).deleteForCleanup(key.getKey(), cell, opGroup);
-                }
-            }
-        }
-    }
-
-    /**
-     * This helper acts as a closure around the indexManager
-     * and updated cf data to ensure that down in
-     * Memtable's ColumnFamily implementation, the index
-     * can get updated. Note: only a CF backed by AtomicSortedColumns implements
-     * this behaviour fully, other types simply ignore the index updater.
-     */
-    public Updater updaterFor(DecoratedKey key, ColumnFamily cf, OpOrder.Group opGroup)
-    {
-        return (indexesByColumn.isEmpty() && rowLevelIndexMap.isEmpty())
-                ? nullUpdater
-                : new StandardUpdater(key, cf, opGroup);
-    }
-
-    /**
-     * Updated closure with only the modified row key.
-     */
-    public Updater gcUpdaterFor(DecoratedKey key)
-    {
-        return (indexesByColumn.isEmpty() && rowLevelIndexMap.isEmpty())
-               ? nullUpdater
-               : new GCUpdater(key);
-    }
-
-    /**
-     * Get a list of IndexSearchers from the union of expression index types
-     * @param clause the query clause
-     * @return the searchers needed to query the index
-     */
-    public List<SecondaryIndexSearcher> getIndexSearchersForQuery(List<IndexExpression> clause)
-    {
-        Map<String, Set<ByteBuffer>> groupByIndexType = new HashMap<>();
-
-        //Group columns by type
-        for (IndexExpression ix : clause)
-        {
-            SecondaryIndex index = getIndexForColumn(ix.column);
-
-            if (index == null || !index.supportsOperator(ix.operator))
-                continue;
-
-            Set<ByteBuffer> columns = groupByIndexType.get(index.indexTypeForGrouping());
-
-            if (columns == null)
-            {
-                columns = new HashSet<>();
-                groupByIndexType.put(index.indexTypeForGrouping(), columns);
-            }
-
-            columns.add(ix.column);
-        }
-
-        List<SecondaryIndexSearcher> indexSearchers = new ArrayList<>(groupByIndexType.size());
-
-        //create searcher per type
-        for (Set<ByteBuffer> column : groupByIndexType.values())
-            indexSearchers.add(getIndexForColumn(column.iterator().next()).createSecondaryIndexSearcher(column));
-
-        return indexSearchers;
-    }
-
-    /**
-     * Validates an union of expression index types. It will throw a {@link RuntimeException} if
-     * any of the expressions in the provided clause is not valid for its index implementation.
-     * @param clause the query clause
-     * @throws org.apache.cassandra.exceptions.InvalidRequestException in case of validation errors
-     */
-    public void validateIndexSearchersForQuery(List<IndexExpression> clause) throws InvalidRequestException
-    {
-        // Group by index type
-        Map<String, Set<IndexExpression>> expressionsByIndexType = new HashMap<>();
-        Map<String, Set<ByteBuffer>> columnsByIndexType = new HashMap<>();
-        for (IndexExpression indexExpression : clause)
-        {
-            SecondaryIndex index = getIndexForColumn(indexExpression.column);
-
-            if (index == null)
-                continue;
-
-            String canonicalIndexName = index.getClass().getCanonicalName();
-            Set<IndexExpression> expressions = expressionsByIndexType.get(canonicalIndexName);
-            Set<ByteBuffer> columns = columnsByIndexType.get(canonicalIndexName);
-            if (expressions == null)
-            {
-                expressions = new HashSet<>();
-                columns = new HashSet<>();
-                expressionsByIndexType.put(canonicalIndexName, expressions);
-                columnsByIndexType.put(canonicalIndexName, columns);
-            }
-
-            expressions.add(indexExpression);
-            columns.add(indexExpression.column);
-        }
-
-        // Validate
-        boolean haveSupportedIndexLookup = false;
-        for (Map.Entry<String, Set<IndexExpression>> expressions : expressionsByIndexType.entrySet())
-        {
-            Set<ByteBuffer> columns = columnsByIndexType.get(expressions.getKey());
-            SecondaryIndex secondaryIndex = getIndexForColumn(columns.iterator().next());
-            SecondaryIndexSearcher searcher = secondaryIndex.createSecondaryIndexSearcher(columns);
-            for (IndexExpression expression : expressions.getValue())
-            {
-                searcher.validate(expression);
-                haveSupportedIndexLookup |= secondaryIndex.supportsOperator(expression.operator);
-            }
-        }
-
-        CellNameType comparator = baseCfs.metadata.comparator;
-        // For thrift static CFs we can use filtering if no indexes can be used
-        if (!haveSupportedIndexLookup && (comparator.isDense() ||  comparator.isCompound()))
-        {
-            if (expressionsByIndexType.isEmpty())
-                throw new InvalidRequestException(
-                    String.format("Predicates on non-primary-key columns (%s) are not yet supported for non secondary index queries",
-                                  Joiner.on(", ").join(getColumnNames(clause))));
-
-            // build the error message
-            int i = 0;
-            StringBuilder sb = new StringBuilder("No secondary indexes on the restricted columns support the provided operators: ");
-            for (Map.Entry<String, Set<IndexExpression>> expressions : expressionsByIndexType.entrySet())
-            {
-                for (IndexExpression expression : expressions.getValue())
-                {
-                    if (i++ > 0)
-                        sb.append(", ");
-                    sb.append("'");
-                    String columnName = getColumnName(expression);
-                    sb.append(columnName).append(" ").append(expression.operator).append(" <value>").append("'");
-                }
-            }
-
-            throw new InvalidRequestException(sb.toString());
-        }
-    }
-
-    private static String getColumnName(IndexExpression expression)
-    {
-        try
-        {
-            return ByteBufferUtil.string(expression.column);
-        }
-        catch (CharacterCodingException ex)
-        {
-            return "<unprintable>";
-        }
-    }
-
-    private static Set<String> getColumnNames(List<IndexExpression> expressions)
-    {
-        Set<String> columnNames = new HashSet<>();
-        for (IndexExpression expression : expressions)
-            columnNames.add(getColumnName(expression));
-
-        return columnNames;
-    }
-
-    /**
-     * Performs a search across a number of column indexes
-     *
-     * @param filter the column range to restrict to
-     * @return found indexed rows
-     */
-    public List<Row> search(ExtendedFilter filter)
-    {
-        SecondaryIndexSearcher mostSelective = getHighestSelectivityIndexSearcher(filter.getClause());
-        if (mostSelective == null)
-            return Collections.emptyList();
-        else
-            return mostSelective.search(filter);
-    }
-
-    public Set<SecondaryIndex> getIndexesByNames(Set<String> idxNames)
-    {
-        Set<SecondaryIndex> result = new HashSet<>();
-        for (SecondaryIndex index : allIndexes)
-            if (idxNames.contains(index.getIndexName()))
-                result.add(index);
-        return result;
-    }
-
-    public void setIndexBuilt(Set<String> idxNames)
-    {
-        for (SecondaryIndex index : getIndexesByNames(idxNames))
-            index.setIndexBuilt();
-    }
-
-    public void setIndexRemoved(Set<String> idxNames)
-    {
-        for (SecondaryIndex index : getIndexesByNames(idxNames))
-            index.setIndexRemoved();
-    }
-
-    public SecondaryIndex validate(ByteBuffer rowKey, Cell cell)
-    {
-        for (SecondaryIndex index : indexFor(cell.name()))
-        {
-            if (!index.validate(rowKey, cell))
-                return index;
-        }
-        return null;
-    }
-
-    public void validateRowLevelIndexes(ByteBuffer key, ColumnFamily cf) throws InvalidRequestException
-    {
-        for (SecondaryIndex index : rowLevelIndexMap.values())
-        {
-            ((PerRowSecondaryIndex) index).validate(key, cf);
-        }
-    }
-
-    static boolean shouldCleanupOldValue(Cell oldCell, Cell newCell)
-    {
-        // If any one of name/value/timestamp are different, then we
-        // should delete from the index. If not, then we can infer that
-        // at least one of the cells is an ExpiringColumn and that the
-        // difference is in the expiry time. In this case, we don't want to
-        // delete the old value from the index as the tombstone we insert
-        // will just hide the inserted value.
-        // Completely identical cells (including expiring columns with
-        // identical ttl & localExpirationTime) will not get this far due
-        // to the oldCell.equals(newColumn) in StandardUpdater.update
-        return !oldCell.name().equals(newCell.name())
-            || !oldCell.value().equals(newCell.value())
-            || oldCell.timestamp() != newCell.timestamp();
-    }
-
-    private Set<String> filterByColumn(Set<String> idxNames)
-    {
-        Set<SecondaryIndex> indexes = getIndexesByNames(idxNames);
-        Set<String> filtered = new HashSet<>(idxNames.size());
-        for (SecondaryIndex candidate : indexes)
-        {
-            for (ColumnDefinition column : baseCfs.metadata.allColumns())
-            {
-                if (candidate.indexes(column))
-                {
-                    filtered.add(candidate.getIndexName());
-                    break;
-                }
-            }
-        }
-        return filtered;
-    }
-
-    public static interface Updater
-    {
-        /** called when constructing the index against pre-existing data */
-        public void insert(Cell cell);
-
-        /** called when updating the index from a memtable */
-        public void update(Cell oldCell, Cell cell);
-
-        /** called when lazy-updating the index during compaction (CASSANDRA-2897) */
-        public void remove(Cell current);
-
-        /** called after memtable updates are complete (CASSANDRA-5397) */
-        public void updateRowLevelIndexes();
-    }
-
-    private final class GCUpdater implements Updater
-    {
-        private final DecoratedKey key;
-
-        public GCUpdater(DecoratedKey key)
-        {
-            this.key = key;
-        }
-
-        public void insert(Cell cell)
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        public void update(Cell oldCell, Cell newCell)
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        public void remove(Cell cell)
-        {
-            if (!cell.isLive())
-                return;
-
-            for (SecondaryIndex index : indexFor(cell.name()))
-            {
-                if (index instanceof PerColumnSecondaryIndex)
-                {
-                    try (OpOrder.Group opGroup = baseCfs.keyspace.writeOrder.start())
-                    {
-                        ((PerColumnSecondaryIndex) index).delete(key.getKey(), cell, opGroup);
-                    }
-                }
-            }
-        }
-
-        public void updateRowLevelIndexes()
-        {
-            for (SecondaryIndex index : rowLevelIndexMap.values())
-                ((PerRowSecondaryIndex) index).index(key.getKey(), null);
-        }
-    }
-
-    private final class StandardUpdater implements Updater
-    {
-        private final DecoratedKey key;
-        private final ColumnFamily cf;
-        private final OpOrder.Group opGroup;
-
-        public StandardUpdater(DecoratedKey key, ColumnFamily cf, OpOrder.Group opGroup)
-        {
-            this.key = key;
-            this.cf = cf;
-            this.opGroup = opGroup;
-        }
-
-        public void insert(Cell cell)
-        {
-            if (!cell.isLive())
-                return;
-
-            for (SecondaryIndex index : indexFor(cell.name()))
-                if (index instanceof PerColumnSecondaryIndex)
-                    ((PerColumnSecondaryIndex) index).insert(key.getKey(), cell, opGroup);
-        }
-
-        public void update(Cell oldCell, Cell cell)
-        {
-            if (oldCell.equals(cell))
-                return;
-
-            for (SecondaryIndex index : indexFor(cell.name()))
-            {
-                if (index instanceof PerColumnSecondaryIndex)
-                {
-                    if (cell.isLive())
-                    {
-                        ((PerColumnSecondaryIndex) index).update(key.getKey(), oldCell, cell, opGroup);
-                    }
-                    else
-                    {
-                        // Usually we want to delete the old value from the index, except when
-                        // name/value/timestamp are all equal, but the columns themselves
-                        // are not (as is the case when overwriting expiring columns with
-                        // identical values and ttl) Then, we don't want to delete as the
-                        // tombstone will hide the new value we just inserted; see CASSANDRA-7268
-                        if (shouldCleanupOldValue(oldCell, cell))
-                            ((PerColumnSecondaryIndex) index).delete(key.getKey(), oldCell, opGroup);
-                    }
-                }
-            }
-        }
-
-        public void remove(Cell cell)
-        {
-            if (!cell.isLive())
-                return;
-
-            for (SecondaryIndex index : indexFor(cell.name()))
-                if (index instanceof PerColumnSecondaryIndex)
-                   ((PerColumnSecondaryIndex) index).delete(key.getKey(), cell, opGroup);
-        }
-
-        public void updateRowLevelIndexes()
-        {
-            for (SecondaryIndex index : rowLevelIndexMap.values())
-                ((PerRowSecondaryIndex) index).index(key.getKey(), cf);
-        }
-
-    }
-
-    public SecondaryIndexSearcher getHighestSelectivityIndexSearcher(List<IndexExpression> clause)
-    {
-        if (clause == null)
-            return null;
-
-        List<SecondaryIndexSearcher> indexSearchers = getIndexSearchersForQuery(clause);
-
-        if (indexSearchers.isEmpty())
-            return null;
-
-        SecondaryIndexSearcher mostSelective = null;
-        long bestEstimate = Long.MAX_VALUE;
-        for (SecondaryIndexSearcher searcher : indexSearchers)
-        {
-            SecondaryIndex highestSelectivityIndex = searcher.highestSelectivityIndex(clause);
-            if (highestSelectivityIndex != null)
-            {
-                long estimate = highestSelectivityIndex.estimateResultRows();
-                if (estimate <= bestEstimate)
-                {
-                    bestEstimate = estimate;
-                    mostSelective = searcher;
-                }
-            }
-        }
-
-        return mostSelective;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java b/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java
deleted file mode 100644
index 5812e9d..0000000
--- a/src/java/org/apache/cassandra/db/index/SecondaryIndexSearcher.java
+++ /dev/null

@@ -1,139 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.FBUtilities;
-
-public abstract class SecondaryIndexSearcher
-{
-    protected final SecondaryIndexManager indexManager;
-    protected final Set<ByteBuffer> columns;
-    protected final ColumnFamilyStore baseCfs;
-
-    public SecondaryIndexSearcher(SecondaryIndexManager indexManager, Set<ByteBuffer> columns)
-    {
-        this.indexManager = indexManager;
-        this.columns = columns;
-        this.baseCfs = indexManager.baseCfs;
-    }
-
-    public SecondaryIndex highestSelectivityIndex(List<IndexExpression> clause)
-    {
-        IndexExpression expr = highestSelectivityPredicate(clause, false);
-        return expr == null ? null : indexManager.getIndexForColumn(expr.column);
-    }
-
-    public abstract List<Row> search(ExtendedFilter filter);
-
-    /**
-     * @return true this index is able to handle the given index expressions.
-     */
-    public boolean canHandleIndexClause(List<IndexExpression> clause)
-    {
-        for (IndexExpression expression : clause)
-        {
-            if (!columns.contains(expression.column))
-                continue;
-
-            SecondaryIndex index = indexManager.getIndexForColumn(expression.column);
-            if (index != null && index.getIndexCfs() != null && index.supportsOperator(expression.operator))
-                return true;
-        }
-        return false;
-    }
-    
-    /**
-     * Validates the specified {@link IndexExpression}. It will throw an {@link org.apache.cassandra.exceptions.InvalidRequestException}
-     * if the provided clause is not valid for the index implementation.
-     *
-     * @param indexExpression An {@link IndexExpression} to be validated
-     * @throws org.apache.cassandra.exceptions.InvalidRequestException in case of validation errors
-     */
-    public void validate(IndexExpression indexExpression) throws InvalidRequestException
-    {
-    }
-
-    protected IndexExpression highestSelectivityPredicate(List<IndexExpression> clause, boolean includeInTrace)
-    {
-        IndexExpression best = null;
-        int bestMeanCount = Integer.MAX_VALUE;
-        Map<SecondaryIndex, Integer> candidates = new HashMap<>();
-
-        for (IndexExpression expression : clause)
-        {
-            // skip columns belonging to a different index type
-            if (!columns.contains(expression.column))
-                continue;
-
-            SecondaryIndex index = indexManager.getIndexForColumn(expression.column);
-            if (index == null || index.getIndexCfs() == null || !index.supportsOperator(expression.operator))
-                continue;
-
-            int columns = index.getIndexCfs().getMeanColumns();
-            candidates.put(index, columns);
-            if (columns < bestMeanCount)
-            {
-                best = expression;
-                bestMeanCount = columns;
-            }
-        }
-
-        if (includeInTrace)
-        {
-            if (best == null)
-                Tracing.trace("No applicable indexes found");
-            else if (Tracing.isTracing())
-                // pay for an additional threadlocal get() rather than build the strings unnecessarily
-                Tracing.trace("Candidate index mean cardinalities are {}. Scanning with {}.",
-                              FBUtilities.toString(candidates),
-                              indexManager.getIndexForColumn(best.column).getIndexName());
-        }
-        return best;
-    }
-
-    /**
-     * Returns {@code true} if the specified list of {@link IndexExpression}s require a full scan of all the nodes.
-     *
-     * @param clause A list of {@link IndexExpression}s
-     * @return {@code true} if the {@code IndexExpression}s require a full scan, {@code false} otherwise
-     */
-    public boolean requiresScanningAllRanges(List<IndexExpression> clause)
-    {
-        return false;
-    }
-
-    /**
-     * Combines index query results from multiple nodes. This is done by the coordinator node after it has reconciled
-     * the replica responses.
-     *
-     * @param clause A list of {@link IndexExpression}s
-     * @param rows The index query results to be combined
-     * @return The combination of the index query results
-     */
-    public List<Row> postReconciliationProcessing(List<IndexExpression> clause, List<Row> rows)
-    {
-        return rows;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java
deleted file mode 100644
index 0be78cc..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndex.java
+++ /dev/null

@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.index.AbstractSimplePerColumnSecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-
-/**
- * Base class for secondary indexes where composites are involved.
- */
-public abstract class CompositesIndex extends AbstractSimplePerColumnSecondaryIndex
-{
-    private volatile CellNameType indexComparator;
-
-    protected CellNameType getIndexComparator()
-    {
-        // Yes, this is racy, but doing this more than once is not a big deal, we just want to avoid doing it every time
-        // More seriously, we should fix that whole SecondaryIndex API so this can be a final and avoid all that non-sense.
-        if (indexComparator == null)
-        {
-            assert columnDef != null;
-            indexComparator = getIndexComparator(baseCfs.metadata, columnDef);
-        }
-        return indexComparator;
-    }
-
-    public static CompositesIndex create(ColumnDefinition cfDef)
-    {
-        if (cfDef.type.isCollection() && cfDef.type.isMultiCell())
-        {
-            switch (((CollectionType)cfDef.type).kind)
-            {
-                case LIST:
-                    return new CompositesIndexOnCollectionValue();
-                case SET:
-                    return new CompositesIndexOnCollectionKey();
-                case MAP:
-                    if (cfDef.hasIndexOption(SecondaryIndex.INDEX_KEYS_OPTION_NAME))
-                        return new CompositesIndexOnCollectionKey();
-                    else if (cfDef.hasIndexOption(SecondaryIndex.INDEX_ENTRIES_OPTION_NAME))
-                        return new CompositesIndexOnCollectionKeyAndValue();
-                    else
-                        return new CompositesIndexOnCollectionValue();
-            }
-        }
-
-        switch (cfDef.kind)
-        {
-            case CLUSTERING_COLUMN:
-                return new CompositesIndexOnClusteringKey();
-            case REGULAR:
-                return new CompositesIndexOnRegular();
-            case PARTITION_KEY:
-                return new CompositesIndexOnPartitionKey();
-            //case COMPACT_VALUE:
-            //    return new CompositesIndexOnCompactValue();
-        }
-        throw new AssertionError();
-    }
-
-    // Check SecondaryIndex.getIndexComparator if you want to know why this is static
-    public static CellNameType getIndexComparator(CFMetaData baseMetadata, ColumnDefinition cfDef)
-    {
-        if (cfDef.type.isCollection() && cfDef.type.isMultiCell())
-        {
-            switch (((CollectionType)cfDef.type).kind)
-            {
-                case LIST:
-                    return CompositesIndexOnCollectionValue.buildIndexComparator(baseMetadata, cfDef);
-                case SET:
-                    return CompositesIndexOnCollectionKey.buildIndexComparator(baseMetadata, cfDef);
-                case MAP:
-                    if (cfDef.hasIndexOption(SecondaryIndex.INDEX_KEYS_OPTION_NAME))
-                        return CompositesIndexOnCollectionKey.buildIndexComparator(baseMetadata, cfDef);
-                    else if (cfDef.hasIndexOption(SecondaryIndex.INDEX_ENTRIES_OPTION_NAME))
-                        return CompositesIndexOnCollectionKeyAndValue.buildIndexComparator(baseMetadata, cfDef);
-                    else
-                        return CompositesIndexOnCollectionValue.buildIndexComparator(baseMetadata, cfDef);
-            }
-        }
-
-        switch (cfDef.kind)
-        {
-            case CLUSTERING_COLUMN:
-                return CompositesIndexOnClusteringKey.buildIndexComparator(baseMetadata, cfDef);
-            case REGULAR:
-                return CompositesIndexOnRegular.buildIndexComparator(baseMetadata, cfDef);
-            case PARTITION_KEY:
-                return CompositesIndexOnPartitionKey.buildIndexComparator(baseMetadata, cfDef);
-            //case COMPACT_VALUE:
-            //    return CompositesIndexOnCompactValue.buildIndexComparator(baseMetadata, cfDef);
-        }
-        throw new AssertionError();
-    }
-
-    protected CellName makeIndexColumnName(ByteBuffer rowKey, Cell cell)
-    {
-        return getIndexComparator().create(makeIndexColumnPrefix(rowKey, cell.name()), null);
-    }
-
-    protected abstract Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite columnName);
-
-    public abstract IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry);
-
-    public abstract boolean isStale(IndexedEntry entry, ColumnFamily data, long now);
-
-    public void delete(IndexedEntry entry, OpOrder.Group opGroup)
-    {
-        int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
-        ColumnFamily cfi = ArrayBackedSortedColumns.factory.create(indexCfs.metadata);
-        cfi.addTombstone(entry.indexEntry, localDeletionTime, entry.timestamp);
-        indexCfs.apply(entry.indexValue, cfi, SecondaryIndexManager.nullUpdater, opGroup, null);
-        if (logger.isTraceEnabled())
-            logger.trace("removed index entry for cleaned-up value {}:{}", entry.indexValue, cfi);
-    }
-
-    protected AbstractType<?> getExpressionComparator()
-    {
-        return baseCfs.metadata.getColumnDefinitionComparator(columnDef);
-    }
-
-    public SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
-    {
-        return new CompositesSearcher(baseCfs.indexManager, columns);
-    }
-
-    public void validateOptions() throws ConfigurationException
-    {
-        ColumnDefinition columnDef = columnDefs.iterator().next();
-        Map<String, String> options = new HashMap<String, String>(columnDef.getIndexOptions());
-
-        // We used to have an option called "prefix_size" so skip it silently for backward compatibility sake.
-        options.remove("prefix_size");
-
-        if (columnDef.type.isCollection())
-        {
-            options.remove(SecondaryIndex.INDEX_VALUES_OPTION_NAME);
-            options.remove(SecondaryIndex.INDEX_KEYS_OPTION_NAME);
-            options.remove(SecondaryIndex.INDEX_ENTRIES_OPTION_NAME);
-        }
-
-        if (!options.isEmpty())
-            throw new ConfigurationException("Unknown options provided for COMPOSITES index: " + options.keySet());
-    }
-
-    public static class IndexedEntry
-    {
-        public final DecoratedKey indexValue;
-        public final CellName indexEntry;
-        public final long timestamp;
-
-        public final ByteBuffer indexedKey;
-        public final Composite indexedEntryPrefix;
-        public final ByteBuffer indexedEntryCollectionKey; // may be null
-
-        public IndexedEntry(DecoratedKey indexValue, CellName indexEntry, long timestamp, ByteBuffer indexedKey, Composite indexedEntryPrefix)
-        {
-            this(indexValue, indexEntry, timestamp, indexedKey, indexedEntryPrefix, null);
-        }
-
-        public IndexedEntry(DecoratedKey indexValue,
-                            CellName indexEntry,
-                            long timestamp,
-                            ByteBuffer indexedKey,
-                            Composite indexedEntryPrefix,
-                            ByteBuffer indexedEntryCollectionKey)
-        {
-            this.indexValue = indexValue;
-            this.indexEntry = indexEntry;
-            this.timestamp = timestamp;
-            this.indexedKey = indexedKey;
-            this.indexedEntryPrefix = indexedEntryPrefix;
-            this.indexedEntryCollectionKey = indexedEntryCollectionKey;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexIncludingCollectionKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexIncludingCollectionKey.java
deleted file mode 100644
index 402ea05..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexIncludingCollectionKey.java
+++ /dev/null

@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CBuilder;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CompoundDenseCellNameType;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Common superclass for indexes that capture collection keys, including
- * indexes on such keys themselves.
- *
- * A cell indexed by this index will have the general form:
- *   ck_0 ... ck_n c_name [col_elt] : v
- * where ck_i are the cluster keys, c_name the CQL3 column name, col_elt the
- * collection element that we want to index (which may or may not be there depending
- * on whether c_name is the collection we're indexing), and v the cell value.
- *
- * Such a cell is indexed if c_name is the indexed collection (in which case we are guaranteed to have
- * col_elt). The index entry can be viewed in the following way:
- *   - the row key is determined by subclasses of this type.
- *   - the cell name will be 'rk ck_0 ... ck_n' where rk is the row key of the initial cell.
- */
-public abstract class CompositesIndexIncludingCollectionKey extends CompositesIndex
-{
-    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
-    {
-        int count = 1 + baseMetadata.clusteringColumns().size(); // row key + clustering prefix
-        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(count);
-        types.add(SecondaryIndex.keyComparator);
-        for (int i = 0; i < count - 1; i++)
-            types.add(baseMetadata.comparator.subtype(i));
-        return new CompoundDenseCellNameType(types);
-    }
-
-    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite cellName)
-    {
-        int count = 1 + baseCfs.metadata.clusteringColumns().size();
-        CBuilder builder = getIndexComparator().builder();
-        builder.add(rowKey);
-        for (int i = 0; i < Math.min(cellName.size(), count - 1); i++)
-            builder.add(cellName.get(i));
-        return builder.build();
-    }
-
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
-    {
-        int count = 1 + baseCfs.metadata.clusteringColumns().size();
-        CBuilder builder = baseCfs.getComparator().builder();
-        for (int i = 0; i < count - 1; i++)
-            builder.add(indexEntry.name().get(i + 1));
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
-    }
-
-    @Override
-    public boolean indexes(CellName name)
-    {
-        // We index if the CQL3 column name is the one of the collection we index
-        AbstractType<?> comp = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
-        return name.size() > columnDef.position()
-            && comp.compare(name.get(columnDef.position()), columnDef.name.bytes) == 0;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java
deleted file mode 100644
index 0243b0d..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnClusteringKey.java
+++ /dev/null

@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-/**
- * Index on a CLUSTERING_COLUMN column definition.
- *
- * A cell indexed by this index will have the general form:
- *   ck_0 ... ck_n c_name : v
- * where ck_i are the cluster keys, c_name the last component of the cell
- * composite name (or second to last if collections are in use, but this
- * has no impact) and v the cell value.
- *
- * Such a cell is always indexed by this index (or rather, it is indexed if
- * n >= columnDef.componentIndex, which will always be the case in practice)
- * and it will generate (makeIndexColumnName()) an index entry whose:
- *   - row key will be ck_i (getIndexedValue()) where i == columnDef.componentIndex.
- *   - cell name will
- *       rk ck_0 ... ck_{i-1} ck_{i+1} ck_n
- *     where rk is the row key of the initial cell and i == columnDef.componentIndex.
- */
-public class CompositesIndexOnClusteringKey extends CompositesIndex
-{
-    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
-    {
-        // Index cell names are rk ck_0 ... ck_{i-1} ck_{i+1} ck_n, so n
-        // components total (where n is the number of clustering keys)
-        int ckCount = baseMetadata.clusteringColumns().size();
-        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(ckCount);
-        types.add(SecondaryIndex.keyComparator);
-        for (int i = 0; i < columnDef.position(); i++)
-            types.add(baseMetadata.clusteringColumns().get(i).type);
-        for (int i = columnDef.position() + 1; i < ckCount; i++)
-            types.add(baseMetadata.clusteringColumns().get(i).type);
-        return new CompoundDenseCellNameType(types);
-    }
-
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        return cell.name().get(columnDef.position());
-    }
-
-    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite columnName)
-    {
-        int count = Math.min(baseCfs.metadata.clusteringColumns().size(), columnName.size());
-        CBuilder builder = getIndexComparator().prefixBuilder();
-        builder.add(rowKey);
-        for (int i = 0; i < Math.min(columnDef.position(), count); i++)
-            builder.add(columnName.get(i));
-        for (int i = columnDef.position() + 1; i < count; i++)
-            builder.add(columnName.get(i));
-        return builder.build();
-    }
-
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
-    {
-        int ckCount = baseCfs.metadata.clusteringColumns().size();
-
-        CBuilder builder = baseCfs.getComparator().builder();
-        for (int i = 0; i < columnDef.position(); i++)
-            builder.add(indexEntry.name().get(i + 1));
-
-        builder.add(indexedValue.getKey());
-
-        for (int i = columnDef.position() + 1; i < ckCount; i++)
-            builder.add(indexEntry.name().get(i));
-
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
-    }
-
-    @Override
-    public boolean indexes(CellName name)
-    {
-        // For now, assume this is only used in CQL3 when we know name has enough component.
-        return true;
-    }
-
-    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
-    {
-        return data.hasOnlyTombstones(now);
-    }
-
-    @Override
-    public void delete(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
-    {
-        // We only know that one column of the CQL row has been updated/deleted, but we don't know if the
-        // full row has been deleted so we should not do anything. If it ends up that the whole row has
-        // been deleted, it will be eventually cleaned up on read because the entry will be detected stale.
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKey.java
deleted file mode 100644
index 1e40710..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKey.java
+++ /dev/null

@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Index on the collection element of the cell name of a collection.
- *
- * The row keys for this index are given by the collection element for
- * indexed columns.
- */
-public class CompositesIndexOnCollectionKey extends CompositesIndexIncludingCollectionKey
-{
-    @Override
-    protected AbstractType<?> getIndexKeyComparator()
-    {
-        return ((CollectionType)columnDef.type).nameComparator();
-    }
-
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        return cell.name().get(columnDef.position() + 1);
-    }
-
-    @Override
-    public boolean supportsOperator(Operator operator)
-    {
-        return operator == Operator.CONTAINS_KEY ||
-                operator == Operator.CONTAINS && columnDef.type instanceof SetType;
-    }
-
-    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
-    {
-        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef, entry.indexValue.getKey());
-        Cell cell = data.getColumn(name);
-        return cell == null || !cell.isLive(now);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKeyAndValue.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKeyAndValue.java
deleted file mode 100644
index 0b7f579..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionKeyAndValue.java
+++ /dev/null

@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Index on the element and value of cells participating in a collection.
- *
- * The row keys for this index are a composite of the collection element
- * and value of indexed columns.
- */
-public class CompositesIndexOnCollectionKeyAndValue extends CompositesIndexIncludingCollectionKey
-{
-    @Override
-    protected AbstractType<?> getIndexKeyComparator()
-    {
-        CollectionType colType = (CollectionType)columnDef.type;
-        return CompositeType.getInstance(colType.nameComparator(), colType.valueComparator());
-    }
-
-    @Override
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        final ByteBuffer key = cell.name().get(columnDef.position() + 1);
-        final ByteBuffer value = cell.value();
-        return CompositeType.build(key, value);
-    }
-
-    @Override
-    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
-    {
-        Cell cell = extractTargetCell(entry, data);
-        if (cellIsDead(cell, now))
-            return true;
-        ByteBuffer indexCollectionValue = extractCollectionValue(entry);
-        ByteBuffer targetCollectionValue = cell.value();
-        AbstractType<?> valueComparator = ((CollectionType)columnDef.type).valueComparator();
-        return valueComparator.compare(indexCollectionValue, targetCollectionValue) != 0;
-    }
-
-    private Cell extractTargetCell(IndexedEntry entry, ColumnFamily data)
-    {
-        ByteBuffer collectionKey = extractCollectionKey(entry);
-        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef, collectionKey);
-        return data.getColumn(name);
-    }
-
-    private ByteBuffer extractCollectionKey(IndexedEntry entry)
-    {
-        return extractIndexKeyComponent(entry, 0);
-    }
-
-    private ByteBuffer extractIndexKeyComponent(IndexedEntry entry, int component)
-    {
-        return CompositeType.extractComponent(entry.indexValue.getKey(), component);
-    }
-
-    private ByteBuffer extractCollectionValue(IndexedEntry entry)
-    {
-        return extractIndexKeyComponent(entry, 1);
-    }
-
-    private boolean cellIsDead(Cell cell, long now)
-    {
-        return cell == null || !cell.isLive(now);
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionValue.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionValue.java
deleted file mode 100644
index a11a0d9..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnCollectionValue.java
+++ /dev/null

@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CBuilder;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.CompoundDenseCellNameType;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Index the value of a collection cell.
- *
- * This is a lot like an index on REGULAR, except that we also need to make
- * the collection key part of the index entry so that:
- *   1) we don't have to scan the whole collection at query time to know the
- *   entry is stale and if it still satisfies the query.
- *   2) if a collection has multiple time the same value, we need one entry
- *   for each so that if we delete one of the value only we only delete the
- *   entry corresponding to that value.
- */
-public class CompositesIndexOnCollectionValue extends CompositesIndex
-{
-    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
-    {
-        int prefixSize = columnDef.position();
-        List<AbstractType<?>> types = new ArrayList<>(prefixSize + 2);
-        types.add(SecondaryIndex.keyComparator);
-        for (int i = 0; i < prefixSize; i++)
-            types.add(baseMetadata.comparator.subtype(i));
-        types.add(((CollectionType)columnDef.type).nameComparator()); // collection key
-        return new CompoundDenseCellNameType(types);
-    }
-
-    @Override
-    protected AbstractType<?> getIndexKeyComparator()
-    {
-        return ((CollectionType)columnDef.type).valueComparator();
-    }
-
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        return cell.value();
-    }
-
-    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite cellName)
-    {
-        CBuilder builder = getIndexComparator().prefixBuilder();
-        builder.add(rowKey);
-        for (int i = 0; i < Math.min(columnDef.position(), cellName.size()); i++)
-            builder.add(cellName.get(i));
-
-        // When indexing, cellName is a full name including the collection
-        // key. When searching, restricted clustering columns are included
-        // but the collection key is not. In this case, don't try to add an
-        // element to the builder for it, as it will just end up null and
-        // error out when retrieving cells from the index cf (CASSANDRA-7525)
-        if (cellName.size() >= columnDef.position() + 1)
-            builder.add(cellName.get(columnDef.position() + 1));
-        return builder.build();
-    }
-
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
-    {
-        int prefixSize = columnDef.position();
-        CellName name = indexEntry.name();
-        CBuilder builder = baseCfs.getComparator().builder();
-        for (int i = 0; i < prefixSize; i++)
-            builder.add(name.get(i + 1));
-        return new IndexedEntry(indexedValue, name, indexEntry.timestamp(), name.get(0), builder.build(), name.get(prefixSize + 1));
-    }
-
-    @Override
-    public boolean supportsOperator(Operator operator)
-    {
-        return operator == Operator.CONTAINS && !(columnDef.type instanceof SetType);
-    }
-
-    @Override
-    public boolean indexes(CellName name)
-    {
-        AbstractType<?> comp = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
-        return name.size() > columnDef.position()
-            && comp.compare(name.get(columnDef.position()), columnDef.name.bytes) == 0;
-    }
-
-    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
-    {
-        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef, entry.indexedEntryCollectionKey);
-        Cell cell = data.getColumn(name);
-        return cell == null || !cell.isLive(now) || ((CollectionType) columnDef.type).valueComparator().compare(entry.indexValue.getKey(), cell.value()) != 0;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java
deleted file mode 100644
index df43057..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnPartitionKey.java
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-/**
- * Index on a PARTITION_KEY column definition.
- *
- * This suppose a composite row key:
- *   rk = rk_0 ... rk_n
- *
- * The corresponding index entry will be:
- *   - index row key will be rk_i (where i == columnDef.componentIndex)
- *   - cell name will be: rk ck
- *     where rk is the fully partition key and ck the clustering keys of the
- *     original cell names (thus excluding the last column name as we want to refer to
- *     the whole CQL3 row, not just the cell itself)
- *
- * Note that contrarily to other type of index, we repeat the indexed value in
- * the index cell name (we use the whole partition key). The reason is that we
- * want to order the index cell name by partitioner first, and skipping a part
- * of the row key would change the order.
- */
-public class CompositesIndexOnPartitionKey extends CompositesIndex
-{
-    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
-    {
-        int ckCount = baseMetadata.clusteringColumns().size();
-        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(ckCount + 1);
-        types.add(SecondaryIndex.keyComparator);
-        for (int i = 0; i < ckCount; i++)
-            types.add(baseMetadata.comparator.subtype(i));
-        return new CompoundDenseCellNameType(types);
-    }
-
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        CompositeType keyComparator = (CompositeType)baseCfs.metadata.getKeyValidator();
-        ByteBuffer[] components = keyComparator.split(rowKey);
-        return components[columnDef.position()];
-    }
-
-    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite columnName)
-    {
-        int count = Math.min(baseCfs.metadata.clusteringColumns().size(), columnName.size());
-        CBuilder builder = getIndexComparator().prefixBuilder();
-        builder.add(rowKey);
-        for (int i = 0; i < count; i++)
-            builder.add(columnName.get(i));
-        return builder.build();
-    }
-
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
-    {
-        int ckCount = baseCfs.metadata.clusteringColumns().size();
-        CBuilder builder = baseCfs.getComparator().builder();
-        for (int i = 0; i < ckCount; i++)
-            builder.add(indexEntry.name().get(i + 1));
-
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
-    }
-
-    @Override
-    public boolean indexes(CellName name)
-    {
-        // Since a partition key is always full, we always index it
-        return true;
-    }
-
-    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
-    {
-        return data.hasOnlyTombstones(now);
-    }
-
-    @Override
-    public void delete(ByteBuffer rowKey, Cell cell, OpOrder.Group opGroup)
-    {
-        // We only know that one column of the CQL row has been updated/deleted, but we don't know if the
-        // full row has been deleted so we should not do anything. If it ends up that the whole row has
-        // been deleted, it will be eventually cleaned up on read because the entry will be detected stale.
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java b/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java
deleted file mode 100644
index b9dc07f..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesIndexOnRegular.java
+++ /dev/null

@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
-
-/**
- * Index on a REGULAR column definition on a composite type.
- *
- * A cell indexed by this index will have the general form:
- *   ck_0 ... ck_n c_name : v
- * where ck_i are the cluster keys, c_name the last component of the cell
- * composite name (or second to last if collections are in use, but this
- * has no impact) and v the cell value.
- *
- * Such a cell is indexed if c_name == columnDef.name, and it will generate
- * (makeIndexColumnName()) an index entry whose:
- *   - row key will be the value v (getIndexedValue()).
- *   - cell name will
- *       rk ck_0 ... ck_n
- *     where rk is the row key of the initial cell. I.e. the index entry store
- *     all the information require to locate back the indexed cell.
- */
-public class CompositesIndexOnRegular extends CompositesIndex
-{
-    public static CellNameType buildIndexComparator(CFMetaData baseMetadata, ColumnDefinition columnDef)
-    {
-        int prefixSize = columnDef.position();
-        List<AbstractType<?>> types = new ArrayList<AbstractType<?>>(prefixSize + 1);
-        types.add(SecondaryIndex.keyComparator);
-        for (int i = 0; i < prefixSize; i++)
-            types.add(baseMetadata.comparator.subtype(i));
-        return new CompoundDenseCellNameType(types);
-    }
-
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        return cell.value();
-    }
-
-    protected Composite makeIndexColumnPrefix(ByteBuffer rowKey, Composite cellName)
-    {
-        CBuilder builder = getIndexComparator().prefixBuilder();
-        builder.add(rowKey);
-        for (int i = 0; i < Math.min(columnDef.position(), cellName.size()); i++)
-            builder.add(cellName.get(i));
-        return builder.build();
-    }
-
-    public IndexedEntry decodeEntry(DecoratedKey indexedValue, Cell indexEntry)
-    {
-        CBuilder builder = baseCfs.getComparator().builder();
-        for (int i = 0; i < columnDef.position(); i++)
-            builder.add(indexEntry.name().get(i + 1));
-        return new IndexedEntry(indexedValue, indexEntry.name(), indexEntry.timestamp(), indexEntry.name().get(0), builder.build());
-    }
-
-    @Override
-    public boolean indexes(CellName name)
-    {
-        AbstractType<?> comp = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
-        return name.size() > columnDef.position()
-            && comp.compare(name.get(columnDef.position()), columnDef.name.bytes) == 0;
-    }
-
-    public boolean isStale(IndexedEntry entry, ColumnFamily data, long now)
-    {
-        CellName name = data.getComparator().create(entry.indexedEntryPrefix, columnDef);
-        Cell cell = data.getColumn(name);
-        return cell == null || !cell.isLive(now) || columnDef.type.compare(entry.indexValue.getKey(), cell.value()) != 0;
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java b/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java
deleted file mode 100644
index a67aa2b..0000000
--- a/src/java/org/apache/cassandra/db/index/composites/CompositesSearcher.java
+++ /dev/null

@@ -1,317 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.composites;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.index.IndexNotAvailableException;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-public class CompositesSearcher extends SecondaryIndexSearcher
-{
-    private static final Logger logger = LoggerFactory.getLogger(CompositesSearcher.class);
-
-    public CompositesSearcher(SecondaryIndexManager indexManager, Set<ByteBuffer> columns)
-    {
-        super(indexManager, columns);
-    }
-
-    @Override
-    public List<Row> search(ExtendedFilter filter)
-    {
-        assert filter.getClause() != null && !filter.getClause().isEmpty();
-        final IndexExpression primary = highestSelectivityPredicate(filter.getClause(), true);
-        final SecondaryIndex index = indexManager.getIndexForColumn(primary.column);
-        if (!index.isQueryable())
-            throw new IndexNotAvailableException(index.getIndexName());
-
-        // TODO: this should perhaps not open and maintain a writeOp for the full duration, but instead only *try* to delete stale entries, without blocking if there's no room
-        // as it stands, we open a writeOp and keep it open for the duration to ensure that should this CF get flushed to make room we don't block the reclamation of any room being made
-        try (OpOrder.Group writeOp = baseCfs.keyspace.writeOrder.start(); OpOrder.Group baseOp = baseCfs.readOrdering.start(); OpOrder.Group indexOp = index.getIndexCfs().readOrdering.start())
-        {
-            return baseCfs.filter(getIndexedIterator(writeOp, filter, primary, (CompositesIndex) index), filter);
-        }
-    }
-
-    private Composite makePrefix(CompositesIndex index, ByteBuffer key, ExtendedFilter filter, boolean isStart)
-    {
-        if (key.remaining() == 0)
-            return Composites.EMPTY;
-
-        Composite prefix;
-        IDiskAtomFilter columnFilter = filter.columnFilter(key);
-        if (columnFilter instanceof SliceQueryFilter)
-        {
-            SliceQueryFilter sqf = (SliceQueryFilter)columnFilter;
-            Composite columnName = isStart ? sqf.start() : sqf.finish();
-            prefix = columnName.isEmpty() ? index.getIndexComparator().make(key) : index.makeIndexColumnPrefix(key, columnName);
-        }
-        else
-        {
-            prefix = index.getIndexComparator().make(key);
-        }
-        return isStart ? prefix.start() : prefix.end();
-    }
-
-    private ColumnFamilyStore.AbstractScanIterator getIndexedIterator(final OpOrder.Group writeOp, final ExtendedFilter filter, final IndexExpression primary, final CompositesIndex index)
-    {
-        // Start with the most-restrictive indexed clause, then apply remaining clauses
-        // to each row matching that clause.
-        // TODO: allow merge join instead of just one index + loop
-        assert index != null;
-        assert index.getIndexCfs() != null;
-        final DecoratedKey indexKey = index.getIndexKeyFor(primary.value);
-
-        if (logger.isTraceEnabled())
-            logger.trace("Most-selective indexed predicate is {}", index.expressionString(primary));
-
-        /*
-         * XXX: If the range requested is a token range, we'll have to start at the beginning (and stop at the end) of
-         * the indexed row unfortunately (which will be inefficient), because we have not way to intuit the smallest
-         * possible key having a given token. A fix would be to actually store the token along the key in the
-         * indexed row.
-         */
-        final AbstractBounds<RowPosition> range = filter.dataRange.keyRange();
-        ByteBuffer startKey = range.left instanceof DecoratedKey ? ((DecoratedKey)range.left).getKey() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        ByteBuffer endKey = range.right instanceof DecoratedKey ? ((DecoratedKey)range.right).getKey() : ByteBufferUtil.EMPTY_BYTE_BUFFER;
-
-        final CellNameType baseComparator = baseCfs.getComparator();
-        final CellNameType indexComparator = index.getIndexCfs().getComparator();
-
-        final Composite startPrefix = makePrefix(index, startKey, filter, true);
-        final Composite endPrefix = makePrefix(index, endKey, filter, false);
-
-        return new ColumnFamilyStore.AbstractScanIterator()
-        {
-            private Composite lastSeenPrefix = startPrefix;
-            private Deque<Cell> indexCells;
-            private int columnsRead = Integer.MAX_VALUE;
-            private int limit = filter.currentLimit();
-            private int columnsCount = 0;
-
-            // We have to fetch at least two rows to avoid breaking paging if the first row doesn't satisfy all clauses
-            private int indexCellsPerQuery = Math.max(2, Math.min(filter.maxColumns(), filter.maxRows()));
-
-            public boolean needsFiltering()
-            {
-                return false;
-            }
-
-            private Row makeReturn(DecoratedKey key, ColumnFamily data)
-            {
-                if (data == null)
-                    return endOfData();
-
-                assert key != null;
-                return new Row(key, data);
-            }
-
-            protected Row computeNext()
-            {
-                /*
-                 * Our internal index code is wired toward internal rows. So we need to accumulate all results for a given
-                 * row before returning from this method. Which unfortunately means that this method has to do what
-                 * CFS.filter does for KeysIndex.
-                 */
-                DecoratedKey currentKey = null;
-                ColumnFamily data = null;
-                Composite previousPrefix = null;
-
-                while (true)
-                {
-                    // Did we get more columns that needed to respect the user limit?
-                    // (but we still need to return what has been fetched already)
-                    if (columnsCount >= limit)
-                        return makeReturn(currentKey, data);
-
-                    if (indexCells == null || indexCells.isEmpty())
-                    {
-                        if (columnsRead < indexCellsPerQuery)
-                        {
-                            logger.trace("Read only {} (< {}) last page through, must be done", columnsRead, indexCellsPerQuery);
-                            return makeReturn(currentKey, data);
-                        }
-
-                        if (logger.isTraceEnabled())
-                            logger.trace("Scanning index {} starting with {}",
-                                         index.expressionString(primary), indexComparator.getString(startPrefix));
-
-                        QueryFilter indexFilter = QueryFilter.getSliceFilter(indexKey,
-                                                                             index.getIndexCfs().name,
-                                                                             lastSeenPrefix,
-                                                                             endPrefix,
-                                                                             false,
-                                                                             indexCellsPerQuery,
-                                                                             filter.timestamp);
-                        ColumnFamily indexRow = index.getIndexCfs().getColumnFamily(indexFilter);
-                        if (indexRow == null || !indexRow.hasColumns())
-                            return makeReturn(currentKey, data);
-
-                        Collection<Cell> sortedCells = indexRow.getSortedColumns();
-                        columnsRead = sortedCells.size();
-                        indexCells = new ArrayDeque<>(sortedCells);
-                        Cell firstCell = sortedCells.iterator().next();
-
-                        // Paging is racy, so it is possible the first column of a page is not the last seen one.
-                        if (lastSeenPrefix != startPrefix && lastSeenPrefix.equals(firstCell.name()))
-                        {
-                            // skip the row we already saw w/ the last page of results
-                            indexCells.poll();
-                            logger.trace("Skipping {}", indexComparator.getString(firstCell.name()));
-                        }
-                    }
-
-                    while (!indexCells.isEmpty() && columnsCount <= limit)
-                    {
-                        Cell cell = indexCells.poll();
-                        lastSeenPrefix = cell.name();
-                        if (!cell.isLive(filter.timestamp))
-                        {
-                            logger.trace("skipping {}", cell.name());
-                            continue;
-                        }
-
-                        CompositesIndex.IndexedEntry entry = index.decodeEntry(indexKey, cell);
-                        DecoratedKey dk = baseCfs.partitioner.decorateKey(entry.indexedKey);
-
-                        // Are we done for this row?
-                        if (currentKey == null)
-                        {
-                            currentKey = dk;
-                        }
-                        else if (!currentKey.equals(dk))
-                        {
-                            DecoratedKey previousKey = currentKey;
-                            currentKey = dk;
-                            previousPrefix = null;
-
-                            // We're done with the previous row, return it if it had data, continue otherwise
-                            indexCells.addFirst(cell);
-                            if (data == null)
-                                continue;
-                            else
-                                return makeReturn(previousKey, data);
-                        }
-
-                        if (!range.contains(dk))
-                        {
-                            // Either we're not yet in the range cause the range is start excluding, or we're
-                            // past it.
-                            if (!range.right.isMinimum() && range.right.compareTo(dk) < 0)
-                            {
-                                logger.trace("Reached end of assigned scan range");
-                                return endOfData();
-                            }
-                            else
-                            {
-                                logger.trace("Skipping entry {} before assigned scan range", dk.getToken());
-                                continue;
-                            }
-                        }
-
-                        // Check if this entry cannot be a hit due to the original cell filter
-                        Composite start = entry.indexedEntryPrefix;
-                        if (!filter.columnFilter(dk.getKey()).maySelectPrefix(baseComparator, start))
-                            continue;
-
-                        // If we've record the previous prefix, it means we're dealing with an index on the collection value. In
-                        // that case, we can have multiple index prefix for the same CQL3 row. In that case, we want to only add
-                        // the CQL3 row once (because requesting the data multiple time would be inefficient but more importantly
-                        // because we shouldn't count the columns multiple times with the lastCounted() call at the end of this
-                        // method).
-                        if (previousPrefix != null && previousPrefix.equals(start))
-                            continue;
-                        else
-                            previousPrefix = null;
-
-                        if (logger.isTraceEnabled())
-                            logger.trace("Adding index hit to current row for {}", indexComparator.getString(cell.name()));
-
-                        // We always query the whole CQL3 row. In the case where the original filter was a name filter this might be
-                        // slightly wasteful, but this probably doesn't matter in practice and it simplify things.
-                        ColumnSlice dataSlice = new ColumnSlice(start, entry.indexedEntryPrefix.end());
-                        // If the table has static columns, we must fetch them too as they may need to be returned too.
-                        // Note that this is potentially wasteful for 2 reasons:
-                        //  1) we will retrieve the static parts for each indexed row, even if we have more than one row in
-                        //     the same partition. If we were to group data queries to rows on the same slice, which would
-                        //     speed up things in general, we would also optimize here since we would fetch static columns only
-                        //     once for each group.
-                        //  2) at this point we don't know if the user asked for static columns or not, so we might be fetching
-                        //     them for nothing. We would however need to ship the list of "CQL3 columns selected" with getRangeSlice
-                        //     to be able to know that.
-                        // TODO: we should improve both point above
-                        ColumnSlice[] slices = baseCfs.metadata.hasStaticColumns()
-                                             ? new ColumnSlice[]{ baseCfs.metadata.comparator.staticPrefix().slice(), dataSlice }
-                                             : new ColumnSlice[]{ dataSlice };
-                        SliceQueryFilter dataFilter = new SliceQueryFilter(slices, false, Integer.MAX_VALUE, baseCfs.metadata.clusteringColumns().size());
-                        ColumnFamily newData = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, dataFilter, filter.timestamp));
-                        if (newData == null || index.isStale(entry, newData, filter.timestamp))
-                        {
-                            index.delete(entry, writeOp);
-                            continue;
-                        }
-
-                        assert newData != null : "An entry with no data should have been considered stale";
-
-                        // We know the entry is not stale and so the entry satisfy the primary clause. So whether
-                        // or not the data satisfies the other clauses, there will be no point to re-check the
-                        // same CQL3 row if we run into another collection value entry for this row.
-                        if (entry.indexedEntryCollectionKey != null)
-                            previousPrefix = start;
-
-                        if (!filter.isSatisfiedBy(dk, newData, entry.indexedEntryPrefix, entry.indexedEntryCollectionKey))
-                            continue;
-
-                        if (data == null)
-                            data = ArrayBackedSortedColumns.factory.create(baseCfs.metadata);
-                        data.addAll(newData);
-                        columnsCount += dataFilter.lastCounted();
-                    }
-                 }
-             }
-
-            public void close() throws IOException {}
-        };
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java b/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java
deleted file mode 100644
index e771d99..0000000
--- a/src/java/org/apache/cassandra/db/index/keys/KeysIndex.java
+++ /dev/null

@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.keys;
-
-import java.nio.ByteBuffer;
-import java.util.Set;
-
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.index.AbstractSimplePerColumnSecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-
-/**
- * Implements a secondary index for a column family using a second column family.
- * The design uses inverted index http://en.wikipedia.org/wiki/Inverted_index.
- * The row key is the indexed value. For example, if we're indexing a column named
- * city, the index value of city is the row key.
- * The column names are the keys of the records. To see a detailed example, please
- * refer to wikipedia.
- */
-public class KeysIndex extends AbstractSimplePerColumnSecondaryIndex
-{
-    protected ByteBuffer getIndexedValue(ByteBuffer rowKey, Cell cell)
-    {
-        return cell.value();
-    }
-
-    protected CellName makeIndexColumnName(ByteBuffer rowKey, Cell cell)
-    {
-        return CellNames.simpleDense(rowKey);
-    }
-
-    public SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
-    {
-        return new KeysSearcher(baseCfs.indexManager, columns);
-    }
-
-    public boolean isIndexEntryStale(ByteBuffer indexedValue, ColumnFamily data, long now)
-    {
-        Cell cell = data.getColumn(data.getComparator().makeCellName(columnDef.name.bytes));
-        return cell == null || !cell.isLive(now) || columnDef.type.compare(indexedValue, cell.value()) != 0;
-    }
-
-    public void validateOptions() throws ConfigurationException
-    {
-        // no options used
-    }
-
-    public boolean indexes(CellName name)
-    {
-        // This consider the full cellName directly
-        AbstractType<?> comparator = baseCfs.metadata.getColumnDefinitionComparator(columnDef);
-        return comparator.compare(columnDef.name.bytes, name.toByteBuffer()) == 0;
-    }
-
-    protected AbstractType getExpressionComparator()
-    {
-        return baseCfs.getComparator().asAbstractType();
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java b/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java
deleted file mode 100644
index 2b07c41..0000000
--- a/src/java/org/apache/cassandra/db/index/keys/KeysSearcher.java
+++ /dev/null

@@ -1,207 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index.keys;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.index.*;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-public class KeysSearcher extends SecondaryIndexSearcher
-{
-    private static final Logger logger = LoggerFactory.getLogger(KeysSearcher.class);
-
-    public KeysSearcher(SecondaryIndexManager indexManager, Set<ByteBuffer> columns)
-    {
-        super(indexManager, columns);
-    }
-
-    @Override
-    public List<Row> search(ExtendedFilter filter)
-    {
-        assert filter.getClause() != null && !filter.getClause().isEmpty();
-        final IndexExpression primary = highestSelectivityPredicate(filter.getClause(), true);
-        final SecondaryIndex index = indexManager.getIndexForColumn(primary.column);
-        if (!index.isQueryable())
-            throw new IndexNotAvailableException(index.getIndexName());
-
-        // TODO: this should perhaps not open and maintain a writeOp for the full duration, but instead only *try* to delete stale entries, without blocking if there's no room
-        // as it stands, we open a writeOp and keep it open for the duration to ensure that should this CF get flushed to make room we don't block the reclamation of any room  being made
-        try (OpOrder.Group writeOp = baseCfs.keyspace.writeOrder.start(); OpOrder.Group baseOp = baseCfs.readOrdering.start(); OpOrder.Group indexOp = index.getIndexCfs().readOrdering.start())
-        {
-            return baseCfs.filter(getIndexedIterator(writeOp, filter, primary, index), filter);
-        }
-    }
-
-    private ColumnFamilyStore.AbstractScanIterator getIndexedIterator(final OpOrder.Group writeOp, final ExtendedFilter filter, final IndexExpression primary, final SecondaryIndex index)
-    {
-
-        // Start with the most-restrictive indexed clause, then apply remaining clauses
-        // to each row matching that clause.
-        // TODO: allow merge join instead of just one index + loop
-        assert index != null;
-        assert index.getIndexCfs() != null;
-        final DecoratedKey indexKey = index.getIndexKeyFor(primary.value);
-
-        if (logger.isTraceEnabled())
-            logger.trace("Most-selective indexed predicate is {}",
-                         ((AbstractSimplePerColumnSecondaryIndex) index).expressionString(primary));
-
-        /*
-         * XXX: If the range requested is a token range, we'll have to start at the beginning (and stop at the end) of
-         * the indexed row unfortunately (which will be inefficient), because we have not way to intuit the small
-         * possible key having a given token. A fix would be to actually store the token along the key in the
-         * indexed row.
-         */
-        final AbstractBounds<RowPosition> range = filter.dataRange.keyRange();
-        CellNameType type = index.getIndexCfs().getComparator();
-        final Composite startKey = range.left instanceof DecoratedKey ? type.make(((DecoratedKey)range.left).getKey()) : Composites.EMPTY;
-        final Composite endKey = range.right instanceof DecoratedKey ? type.make(((DecoratedKey)range.right).getKey()) : Composites.EMPTY;
-
-        final CellName primaryColumn = baseCfs.getComparator().cellFromByteBuffer(primary.column);
-
-        return new ColumnFamilyStore.AbstractScanIterator()
-        {
-            private Composite lastSeenKey = startKey;
-            private Iterator<Cell> indexColumns;
-            private int columnsRead = Integer.MAX_VALUE;
-
-            protected Row computeNext()
-            {
-                // We shouldn't fetch only 1 row as this provides buggy paging in case the first row doesn't satisfy all clauses
-                int rowsPerQuery = Math.max(Math.min(filter.maxRows(), filter.maxColumns()), 2);
-                while (true)
-                {
-                    if (indexColumns == null || !indexColumns.hasNext())
-                    {
-                        if (columnsRead < rowsPerQuery)
-                        {
-                            logger.trace("Read only {} (< {}) last page through, must be done", columnsRead, rowsPerQuery);
-                            return endOfData();
-                        }
-
-                        if (logger.isTraceEnabled() && (index instanceof AbstractSimplePerColumnSecondaryIndex))
-                            logger.trace("Scanning index {} starting with {}",
-                                         ((AbstractSimplePerColumnSecondaryIndex)index).expressionString(primary), index.getBaseCfs().metadata.getKeyValidator().getString(startKey.toByteBuffer()));
-
-                        QueryFilter indexFilter = QueryFilter.getSliceFilter(indexKey,
-                                                                             index.getIndexCfs().name,
-                                                                             lastSeenKey,
-                                                                             endKey,
-                                                                             false,
-                                                                             rowsPerQuery,
-                                                                             filter.timestamp);
-                        ColumnFamily indexRow = index.getIndexCfs().getColumnFamily(indexFilter);
-                        logger.trace("fetched {}", indexRow);
-                        if (indexRow == null)
-                        {
-                            logger.trace("no data, all done");
-                            return endOfData();
-                        }
-
-                        Collection<Cell> sortedCells = indexRow.getSortedColumns();
-                        columnsRead = sortedCells.size();
-                        indexColumns = sortedCells.iterator();
-                        Cell firstCell = sortedCells.iterator().next();
-
-                        // Paging is racy, so it is possible the first column of a page is not the last seen one.
-                        if (lastSeenKey != startKey && lastSeenKey.equals(firstCell.name()))
-                        {
-                            // skip the row we already saw w/ the last page of results
-                            indexColumns.next();
-                            logger.trace("Skipping {}", baseCfs.metadata.getKeyValidator().getString(firstCell.name().toByteBuffer()));
-                        }
-                        else if (range instanceof Range && indexColumns.hasNext() && firstCell.name().equals(startKey))
-                        {
-                            // skip key excluded by range
-                            indexColumns.next();
-                            logger.trace("Skipping first key as range excludes it");
-                        }
-                    }
-
-                    while (indexColumns.hasNext())
-                    {
-                        Cell cell = indexColumns.next();
-                        lastSeenKey = cell.name();
-                        if (!cell.isLive(filter.timestamp))
-                        {
-                            logger.trace("skipping {}", cell.name());
-                            continue;
-                        }
-
-                        DecoratedKey dk = baseCfs.partitioner.decorateKey(lastSeenKey.toByteBuffer());
-                        if (!range.right.isMinimum() && range.right.compareTo(dk) < 0)
-                        {
-                            logger.trace("Reached end of assigned scan range");
-                            return endOfData();
-                        }
-                        if (!range.contains(dk))
-                        {
-                            logger.trace("Skipping entry {} outside of assigned scan range", dk.getToken());
-                            continue;
-                        }
-
-                        logger.trace("Returning index hit for {}", dk);
-                        ColumnFamily data = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, filter.columnFilter(lastSeenKey.toByteBuffer()), filter.timestamp));
-                        // While the column family we'll get in the end should contains the primary clause cell, the initialFilter may not have found it and can thus be null
-                        if (data == null)
-                            data = ArrayBackedSortedColumns.factory.create(baseCfs.metadata);
-
-                        // as in CFS.filter - extend the filter to ensure we include the columns
-                        // from the index expressions, just in case they weren't included in the initialFilter
-                        IDiskAtomFilter extraFilter = filter.getExtraFilter(dk, data);
-                        if (extraFilter != null)
-                        {
-                            ColumnFamily cf = baseCfs.getColumnFamily(new QueryFilter(dk, baseCfs.name, extraFilter, filter.timestamp));
-                            if (cf != null)
-                                data.addAll(cf);
-                        }
-
-                        if (((KeysIndex)index).isIndexEntryStale(indexKey.getKey(), data, filter.timestamp))
-                        {
-                            // delete the index entry w/ its own timestamp
-                            Cell dummyCell = new BufferCell(primaryColumn, indexKey.getKey(), cell.timestamp());
-                            ((PerColumnSecondaryIndex)index).delete(dk.getKey(), dummyCell, writeOp);
-                            continue;
-                        }
-                        return new Row(dk, data);
-                    }
-                 }
-             }
-
-            public void close() throws IOException {}
-        };
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
index 536e13c..8e0d514 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java

@@ -22,7 +22,9 @@
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
 
+import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.Throwables;
 
 import static com.google.common.base.Predicates.*;
 import static com.google.common.collect.Iterables.any;
@@ -70,6 +72,16 @@
      * A convenience method for encapsulating this action over multiple SSTableReader with exception-safety
      * @return accumulate if not null (with any thrown exception attached), or any thrown exception otherwise
      */
+    static void setupOnline(Iterable<SSTableReader> readers)
+    {
+        for (SSTableReader reader : readers)
+            reader.setupOnline();
+    }
+
+    /**
+     * A convenience method for encapsulating this action over multiple SSTableReader with exception-safety
+     * @return accumulate if not null (with any thrown exception attached), or any thrown exception otherwise
+     */
     static Throwable setReplaced(Iterable<SSTableReader> readers, Throwable accumulate)
     {
         for (SSTableReader reader : readers)
@@ -87,16 +99,6 @@
     }
 
     /**
-     * A convenience method for encapsulating this action over multiple SSTableReader with exception-safety
-     * @return accumulate if not null (with any thrown exception attached), or any thrown exception otherwise
-     */
-    static void setupKeycache(Iterable<SSTableReader> readers)
-    {
-        for (SSTableReader reader : readers)
-            reader.setupKeyCache();
-    }
-
-    /**
      * assert that none of these readers have been replaced
      */
     static void checkNotReplaced(Iterable<SSTableReader> readers)
@@ -105,18 +107,52 @@
             assert !reader.isReplaced();
     }
 
-    /**
-     * A convenience method for encapsulating this action over multiple SSTableReader with exception-safety
-     * @return accumulate if not null (with any thrown exception attached), or any thrown exception otherwise
-     */
-    static Throwable markObsolete(Tracker tracker, Iterable<SSTableReader> readers, Throwable accumulate)
+    static Throwable markObsolete(List<LogTransaction.Obsoletion> obsoletions, Throwable accumulate)
     {
+        if (obsoletions == null || obsoletions.isEmpty())
+            return accumulate;
+
+        for (LogTransaction.Obsoletion obsoletion : obsoletions)
+        {
+            try
+            {
+                obsoletion.reader.markObsolete(obsoletion.tidier);
+            }
+            catch (Throwable t)
+            {
+                accumulate = merge(accumulate, t);
+            }
+        }
+        return accumulate;
+    }
+
+    static Throwable prepareForObsoletion(Iterable<SSTableReader> readers, LogTransaction txnLogs, List<LogTransaction.Obsoletion> obsoletions, Throwable accumulate)
+    {
+        Map<SSTable, LogRecord> logRecords = txnLogs.makeRemoveRecords(readers);
         for (SSTableReader reader : readers)
         {
             try
             {
-                boolean firstToCompact = reader.markObsolete(tracker);
-                assert firstToCompact : reader + " was already marked compacted";
+                obsoletions.add(new LogTransaction.Obsoletion(reader, txnLogs.obsoleted(reader, logRecords.get(reader))));
+            }
+            catch (Throwable t)
+            {
+                accumulate = Throwables.merge(accumulate, t);
+            }
+        }
+        return accumulate;
+    }
+
+    static Throwable abortObsoletion(List<LogTransaction.Obsoletion> obsoletions, Throwable accumulate)
+    {
+        if (obsoletions == null || obsoletions.isEmpty())
+            return accumulate;
+
+        for (LogTransaction.Obsoletion obsoletion : obsoletions)
+        {
+            try
+            {
+                obsoletion.tidier.abort();
             }
             catch (Throwable t)
             {

diff --git a/src/java/org/apache/cassandra/db/lifecycle/ILifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/ILifecycleTransaction.java
new file mode 100644
index 0000000..d694a86
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/ILifecycleTransaction.java

@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.lifecycle;
+
+import java.util.Collection;
+import java.util.Set;
+
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.concurrent.Transactional;
+
+public interface ILifecycleTransaction extends Transactional, LifecycleNewTracker
+{
+    void checkpoint();
+    void update(SSTableReader reader, boolean original);
+    void update(Collection<SSTableReader> readers, boolean original);
+    public SSTableReader current(SSTableReader reader);
+    void obsolete(SSTableReader reader);
+    void obsoleteOriginals();
+    Set<SSTableReader> originals();
+    boolean isObsolete(SSTableReader reader);
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleNewTracker.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleNewTracker.java
new file mode 100644
index 0000000..9a0785c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleNewTracker.java

@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.cassandra.db.lifecycle;
+
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.io.sstable.SSTable;
+
+/**
+ * An interface for tracking new sstables added to a LifecycleTransaction, possibly through some proxy.
+ */
+public interface LifecycleNewTracker
+{
+    /**
+     * Called when a new table is about to be created, so that this table can be tracked by a transaction.
+     * @param table - the new table to be tracked
+     */
+    void trackNew(SSTable table);
+
+
+    /**
+     * Called when a new table is no longer required, so that this table can be untracked by a transaction.
+     * @param table - the table to be untracked
+     */
+    void untrackNew(SSTable table);
+
+    /**
+     * @return the type of operation tracking these sstables
+     */
+    OperationType opType();
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
index a95c4a8..4abce33 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java

@@ -17,19 +17,25 @@
  */
 package org.apache.cassandra.db.lifecycle;
 
+import java.io.File;
+import java.nio.file.Path;
 import java.util.*;
+import java.util.function.BiFunction;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Function;
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReader.UniqueIdentifier;
+import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
 import static com.google.common.base.Functions.compose;
@@ -44,12 +50,18 @@
 import static org.apache.cassandra.utils.concurrent.Refs.release;
 import static org.apache.cassandra.utils.concurrent.Refs.selfRefs;
 
-public class LifecycleTransaction extends Transactional.AbstractTransactional
+/**
+ * IMPORTANT: When this object is involved in a transactional graph, for correct behaviour its commit MUST occur before
+ * any others, since it may legitimately fail. This is consistent with the Transactional API, which permits one failing
+ * action to occur at the beginning of the commit phase, but also *requires* that the prepareToCommit() phase only take
+ * actions that can be rolled back.
+ */
+public class LifecycleTransaction extends Transactional.AbstractTransactional implements ILifecycleTransaction
 {
     private static final Logger logger = LoggerFactory.getLogger(LifecycleTransaction.class);
 
     /**
-     * a class that represents accumulated modifications to the Tracker.
+     * A class that represents accumulated modifications to the Tracker.
      * has two instances, one containing modifications that are "staged" (i.e. invisible)
      * and one containing those "logged" that have been made visible through a call to checkpoint()
      */
@@ -92,7 +104,8 @@
     }
 
     public final Tracker tracker;
-    private final OperationType operationType;
+    // The transaction logs keep track of new and old sstable files
+    private final LogTransaction log;
     // the original readers this transaction was opened over, and that it guards
     // (no other transactions may operate over these readers concurrently)
     private final Set<SSTableReader> originals = new HashSet<>();
@@ -101,13 +114,20 @@
     // the identity set of readers we've ever encountered; used to ensure we don't accidentally revisit the
     // same version of a reader. potentially a dangerous property if there are reference counting bugs
     // as they won't be caught until the transaction's lifespan is over.
-    private final Set<UniqueIdentifier> identities = Collections.newSetFromMap(new IdentityHashMap<UniqueIdentifier, Boolean>());
+    private final Set<UniqueIdentifier> identities = Collections.newSetFromMap(new IdentityHashMap<>());
 
     // changes that have been made visible
     private final State logged = new State();
     // changes that are pending
     private final State staged = new State();
 
+    // the tidier and their readers, to be used for marking readers obsoleted during a commit
+    private List<LogTransaction.Obsoletion> obsoletions;
+
+    // commit/rollback hooks
+    private List<Runnable> commitHooks = new ArrayList<>();
+    private List<Runnable> abortHooks = new ArrayList<>();
+
     /**
      * construct a Transaction for use in an offline operation
      */
@@ -128,10 +148,26 @@
         return new LifecycleTransaction(dummy, operationType, readers);
     }
 
+    /**
+     * construct an empty Transaction with no existing readers
+     */
+    @SuppressWarnings("resource") // log closed during postCleanup
+    public static LifecycleTransaction offline(OperationType operationType)
+    {
+        Tracker dummy = new Tracker(null, false);
+        return new LifecycleTransaction(dummy, new LogTransaction(operationType, dummy), Collections.emptyList());
+    }
+
+    @SuppressWarnings("resource") // log closed during postCleanup
     LifecycleTransaction(Tracker tracker, OperationType operationType, Iterable<SSTableReader> readers)
     {
+        this(tracker, new LogTransaction(operationType, tracker), readers);
+    }
+
+    LifecycleTransaction(Tracker tracker, LogTransaction log, Iterable<SSTableReader> readers)
+    {
         this.tracker = tracker;
-        this.operationType = operationType;
+        this.log = log;
         for (SSTableReader reader : readers)
         {
             originals.add(reader);
@@ -140,6 +176,22 @@
         }
     }
 
+    public LogTransaction log()
+    {
+        return log;
+    }
+
+    @Override //LifecycleNewTracker
+    public OperationType opType()
+    {
+        return log.type();
+    }
+
+    public UUID opId()
+    {
+        return log.id();
+    }
+
     public void doPrepare()
     {
         // note for future: in anticompaction two different operations use the same Transaction, and both prepareToCommit()
@@ -147,6 +199,11 @@
         // (and these happen anyway) this is fine but if more logic gets inserted here than is performed in a checkpoint,
         // it may break this use case, and care is needed
         checkpoint();
+
+        // prepare for compaction obsolete readers as long as they were part of the original set
+        // since those that are not original are early readers that share the same desc with the finals
+        maybeFail(prepareForObsoletion(filterIn(logged.obsolete, originals), log, obsoletions = new ArrayList<>(), null));
+        log.prepareToCommit();
     }
 
     /**
@@ -159,16 +216,26 @@
         if (logger.isTraceEnabled())
             logger.trace("Committing transaction over {} staged: {}, logged: {}", originals, staged, logged);
 
+        // accumulate must be null if we have been used correctly, so fail immediately if it is not
+        maybeFail(accumulate);
+
+        // transaction log commit failure means we must abort; safe commit is not possible
+        maybeFail(log.commit(null));
+
         // this is now the point of no return; we cannot safely rollback, so we ignore exceptions until we're done
         // we restore state by obsoleting our obsolete files, releasing our references to them, and updating our size
         // and notification status for the obsolete and new files
-        accumulate = markObsolete(tracker, logged.obsolete, accumulate);
+
+        accumulate = markObsolete(obsoletions, accumulate);
         accumulate = tracker.updateSizeTracking(logged.obsolete, logged.update, accumulate);
+        accumulate = runOnCommitHooks(accumulate);
         accumulate = release(selfRefs(logged.obsolete), accumulate);
-        accumulate = tracker.notifySSTablesChanged(originals, logged.update, operationType, accumulate);
+        accumulate = tracker.notifySSTablesChanged(originals, logged.update, log.type(), accumulate);
+
         return accumulate;
     }
 
+
     /**
      * undo all of the changes made by this transaction, resetting the state to its original form
      */
@@ -177,15 +244,20 @@
         if (logger.isTraceEnabled())
             logger.trace("Aborting transaction over {} staged: {}, logged: {}", originals, staged, logged);
 
+        accumulate = abortObsoletion(obsoletions, accumulate);
+
         if (logged.isEmpty() && staged.isEmpty())
-            return accumulate;
+            return log.abort(accumulate);
 
         // mark obsolete all readers that are not versions of those present in the original set
         Iterable<SSTableReader> obsolete = filterOut(concatUniq(staged.update, logged.update), originals);
         logger.trace("Obsoleting {}", obsolete);
-        // we don't pass the tracker in for the obsoletion, since these readers have never been notified externally
-        // nor had their size accounting affected
-        accumulate = markObsolete(null, obsolete, accumulate);
+
+        accumulate = prepareForObsoletion(obsolete, log, obsoletions = new ArrayList<>(), accumulate);
+        // it's safe to abort even if committed, see maybeFail in doCommit() above, in this case it will just report
+        // a failure to abort, which is useful information to have for debug
+        accumulate = log.abort(accumulate);
+        accumulate = markObsolete(obsoletions, accumulate);
 
         // replace all updated readers with a version restored to its original state
         List<SSTableReader> restored = restoreUpdatedOriginals();
@@ -194,19 +266,48 @@
         accumulate = tracker.notifySSTablesChanged(invalid, restored, OperationType.COMPACTION, accumulate);
         // setReplaced immediately preceding versions that have not been obsoleted
         accumulate = setReplaced(logged.update, accumulate);
+        accumulate = runOnAbortooks(accumulate);
         // we have replaced all of logged.update and never made visible staged.update,
         // and the files we have logged as obsolete we clone fresh versions of, so they are no longer needed either
         // any _staged_ obsoletes should either be in staged.update already, and dealt with there,
         // or is still in its original form (so left as is); in either case no extra action is needed
         accumulate = release(selfRefs(concat(staged.update, logged.update, logged.obsolete)), accumulate);
+
         logged.clear();
         staged.clear();
         return accumulate;
     }
 
+    private Throwable runOnCommitHooks(Throwable accumulate)
+    {
+        return runHooks(commitHooks, accumulate);
+    }
+
+    private Throwable runOnAbortooks(Throwable accumulate)
+    {
+        return runHooks(abortHooks, accumulate);
+    }
+
+    private static Throwable runHooks(Iterable<Runnable> hooks, Throwable accumulate)
+    {
+        for (Runnable hook : hooks)
+        {
+            try
+            {
+                hook.run();
+            }
+            catch (Exception e)
+            {
+                accumulate = Throwables.merge(accumulate, e);
+            }
+        }
+        return accumulate;
+    }
+
     @Override
     protected Throwable doPostCleanup(Throwable accumulate)
     {
+        log.close();
         return unmarkCompacting(marked, accumulate);
     }
 
@@ -215,11 +316,6 @@
         return tracker.isDummy();
     }
 
-    public void permitRedundantTransitions()
-    {
-        super.permitRedundantTransitions();
-    }
-
     /**
      * call when a consistent batch of changes is ready to be made atomically visible
      * these will be exposed in the Tracker atomically, or an exception will be thrown; in this case
@@ -262,6 +358,7 @@
         return accumulate;
     }
 
+
     /**
      * update a reader: if !original, this is a reader that is being introduced by this transaction;
      * otherwise it must be in the originals() set, i.e. a reader guarded by this transaction
@@ -276,12 +373,19 @@
         staged.update.add(reader);
         identities.add(reader.instanceId);
         if (!isOffline())
-            reader.setupKeyCache();
+            reader.setupOnline();
+    }
+
+    public void update(Collection<SSTableReader> readers, boolean original)
+    {
+        for(SSTableReader reader: readers)
+        {
+            update(reader, original);
+        }
     }
 
     /**
-     * mark this reader as for obsoletion. this does not actually obsolete the reader until commit() is called,
-     * but on checkpoint() the reader will be removed from the live set
+     * mark this reader as for obsoletion : on checkpoint() the reader will be removed from the live set
      */
     public void obsolete(SSTableReader reader)
     {
@@ -296,6 +400,16 @@
         staged.obsolete.add(reader);
     }
 
+    public void runOnCommit(Runnable fn)
+    {
+        commitHooks.add(fn);
+    }
+
+    public void runOnAbort(Runnable fn)
+    {
+        abortHooks.add(fn);
+    }
+
     /**
      * obsolete every file in the original transaction
      */
@@ -322,8 +436,7 @@
      */
     private Iterable<SSTableReader> fresh()
     {
-        return filterOut(staged.update,
-                         originals, logged.update);
+        return filterOut(staged.update, originals, logged.update);
     }
 
     /**
@@ -342,14 +455,7 @@
     private List<SSTableReader> restoreUpdatedOriginals()
     {
         Iterable<SSTableReader> torestore = filterIn(originals, logged.update, logged.obsolete);
-        return ImmutableList.copyOf(transform(torestore,
-                                              new Function<SSTableReader, SSTableReader>()
-                                              {
-                                                  public SSTableReader apply(SSTableReader reader)
-                                                  {
-                                                      return current(reader).cloneWithNewStart(reader.first, null);
-                                                  }
-                                              }));
+        return ImmutableList.copyOf(transform(torestore, (reader) -> current(reader).cloneWithRestoredStart(reader.first)));
     }
 
     /**
@@ -426,7 +532,7 @@
             originals.remove(reader);
             marked.remove(reader);
         }
-        return new LifecycleTransaction(tracker, operationType, readers);
+        return new LifecycleTransaction(tracker, log.type(), readers);
     }
 
     /**
@@ -457,6 +563,60 @@
         return getFirst(originals, null);
     }
 
+    // LifecycleNewTracker
+
+    @Override
+    public void trackNew(SSTable table)
+    {
+        log.trackNew(table);
+    }
+
+    @Override
+    public void untrackNew(SSTable table)
+    {
+        log.untrackNew(table);
+    }
+
+    public static void removeUnfinishedLeftovers(CFMetaData metadata)
+    {
+        LogTransaction.removeUnfinishedLeftovers(metadata);
+    }
+
+    /**
+     * Get the files in the folder specified, provided that the filter returns true.
+     * A filter is given each file and its type, and decides which files should be returned
+     * and which should be discarded. To classify files into their type, we read transaction
+     * log files. Should we fail to read these log files after a few times, we look at onTxnErr
+     * to determine what to do.
+     *
+     * @param folder - the folder to scan
+     * @param onTxnErr - how to handle a failure to read a txn log file
+     * @param filter - A function that receives each file and its type, it should return true to have the file returned
+     * @return - the list of files that were scanned and for which the filter returned true
+     */
+    public static List<File> getFiles(Path folder, BiFunction<File, Directories.FileType, Boolean> filter, Directories.OnTxnErr onTxnErr)
+    {
+        return new LogAwareFileLister(folder, filter, onTxnErr).list();
+    }
+
+    /**
+     * Retry all deletions that failed the first time around (presumably b/c the sstable was still mmap'd.)
+     * Useful because there are times when we know GC has been invoked; also exposed as an mbean.
+     */
+    public static void rescheduleFailedDeletions()
+    {
+        LogTransaction.rescheduleFailedDeletions();
+    }
+
+    /**
+     * Deletions run on the nonPeriodicTasks executor, (both failedDeletions or global tidiers in SSTableReader)
+     * so by scheduling a new empty task and waiting for it we ensure any prior deletion has completed.
+     */
+    public static void waitForDeletions()
+    {
+        LogTransaction.waitForDeletions();
+    }
+
     // a class representing the current state of the reader within this transaction, encoding the actions both logged
     // and pending, and the reader instances that are visible now, and will be after the next checkpoint (with null
     // indicating either obsolescence, or that the reader does not occur in the transaction; which is defined
@@ -464,7 +624,7 @@
     @VisibleForTesting
     public static class ReaderState
     {
-        public static enum Action
+        public enum Action
         {
             UPDATED, OBSOLETED, NONE;
             public static Action get(boolean updated, boolean obsoleted)

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogAwareFileLister.java b/src/java/org/apache/cassandra/db/lifecycle/LogAwareFileLister.java
new file mode 100644
index 0000000..7728f9c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogAwareFileLister.java

@@ -0,0 +1,205 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.Directories;
+
+import static org.apache.cassandra.db.Directories.*;
+
+/**
+ * A class for listing files in a folder.
+ */
+final class LogAwareFileLister
+{
+    private static final Logger logger = LoggerFactory.getLogger(LogAwareFileLister.class);
+
+    // The folder to scan
+    private final Path folder;
+
+    // The filter determines which files the client wants returned
+    private final BiFunction<File, FileType, Boolean> filter; //file, file type
+
+    // The behavior when we fail to list files
+    private final OnTxnErr onTxnErr;
+
+    // The unfiltered result
+    NavigableMap<File, Directories.FileType> files = new TreeMap<>();
+
+    @VisibleForTesting
+    LogAwareFileLister(Path folder, BiFunction<File, FileType, Boolean> filter, OnTxnErr onTxnErr)
+    {
+        this.folder = folder;
+        this.filter = filter;
+        this.onTxnErr = onTxnErr;
+    }
+
+    public List<File> list()
+    {
+        try
+        {
+            return innerList();
+        }
+        catch (Throwable t)
+        {
+            throw new RuntimeException(String.format("Failed to list files in %s", folder), t);
+        }
+    }
+
+    List<File> innerList() throws Throwable
+    {
+        list(Files.newDirectoryStream(folder))
+        .stream()
+        .filter((f) -> !LogFile.isLogFile(f))
+        .forEach((f) -> files.put(f, FileType.FINAL));
+
+        // Since many file systems are not atomic, we cannot be sure we have listed a consistent disk state
+        // (Linux would permit this, but for simplicity we keep our behaviour the same across platforms)
+        // so we must be careful to list txn log files AFTER every other file since these files are deleted last,
+        // after all other files are removed
+        list(Files.newDirectoryStream(folder, '*' + LogFile.EXT))
+        .stream()
+        .filter(LogFile::isLogFile)
+        .forEach(this::classifyFiles);
+
+        // Finally we apply the user filter before returning our result
+        return files.entrySet().stream()
+                    .filter((e) -> filter.apply(e.getKey(), e.getValue()))
+                    .map(Map.Entry::getKey)
+                    .collect(Collectors.toList());
+    }
+
+    static List<File> list(DirectoryStream<Path> stream) throws IOException
+    {
+        try
+        {
+            return StreamSupport.stream(stream.spliterator(), false)
+                                .map(Path::toFile)
+                                .filter((f) -> !f.isDirectory())
+                                .collect(Collectors.toList());
+        }
+        finally
+        {
+            stream.close();
+        }
+    }
+
+    /**
+     * We read txn log files, if we fail we throw only if the user has specified
+     * OnTxnErr.THROW, else we log an error and apply the txn log anyway
+     */
+    void classifyFiles(File txnFile)
+    {
+        try (LogFile txn = LogFile.make(txnFile))
+        {
+            readTxnLog(txn);
+            classifyFiles(txn);
+            files.put(txnFile, FileType.TXN_LOG);
+        }
+    }
+
+    void readTxnLog(LogFile txn)
+    {
+        if (!txn.verify() && onTxnErr == OnTxnErr.THROW)
+            throw new LogTransaction.CorruptTransactionLogException("Some records failed verification. See earlier in log for details.", txn);
+    }
+
+    void classifyFiles(LogFile txnFile)
+    {
+        Map<LogRecord, Set<File>> oldFiles = txnFile.getFilesOfType(folder, files.navigableKeySet(), LogRecord.Type.REMOVE);
+        Map<LogRecord, Set<File>> newFiles = txnFile.getFilesOfType(folder, files.navigableKeySet(), LogRecord.Type.ADD);
+
+        if (txnFile.completed())
+        { // last record present, filter regardless of disk status
+            setTemporary(txnFile, oldFiles.values(), newFiles.values());
+            return;
+        }
+
+        if (allFilesPresent(oldFiles))
+        {  // all old files present, transaction is in progress, this will filter as aborted
+            setTemporary(txnFile, oldFiles.values(), newFiles.values());
+            return;
+        }
+
+        // some old files are missing, we expect the txn file to either also be missing or completed, so check
+        // disk state again to resolve any previous races on non-atomic directory listing platforms
+
+        // if txn file also gone, then do nothing (all temporary should be gone, we could remove them if any)
+        if (!txnFile.exists())
+            return;
+
+        // otherwise read the file again to see if it is completed now
+        readTxnLog(txnFile);
+
+        if (txnFile.completed())
+        { // if after re-reading the txn is completed then filter accordingly
+            setTemporary(txnFile, oldFiles.values(), newFiles.values());
+            return;
+        }
+
+        logger.error("Failed to classify files in {}\n" +
+                     "Some old files are missing but the txn log is still there and not completed\n" +
+                     "Files in folder:\n{}\nTxn: {}\n{}",
+                     folder,
+                     files.isEmpty()
+                        ? "\t-"
+                        : String.join("\n", files.keySet().stream().map(f -> String.format("\t%s", f)).collect(Collectors.toList())),
+                     txnFile.toString(),
+                     String.join("\n", txnFile.getRecords().stream().map(r -> String.format("\t%s", r)).collect(Collectors.toList())));
+
+        // some old files are missing and yet the txn is still there and not completed
+        // something must be wrong (see comment at the top of LogTransaction requiring txn to be
+        // completed before obsoleting or aborting sstables)
+        throw new RuntimeException(String.format("Failed to list directory files in %s, inconsistent disk state for transaction %s",
+                                                 folder,
+                                                 txnFile));
+    }
+
+    /** See if all files are present */
+    private static boolean allFilesPresent(Map<LogRecord, Set<File>> oldFiles)
+    {
+        return !oldFiles.entrySet().stream()
+                        .filter((e) -> e.getKey().numFiles > e.getValue().size())
+                        .findFirst().isPresent();
+    }
+
+    private void setTemporary(LogFile txnFile, Collection<Set<File>> oldFiles, Collection<Set<File>> newFiles)
+    {
+        Collection<Set<File>> temporary = txnFile.committed() ? oldFiles : newFiles;
+        temporary.stream()
+                 .flatMap(Set::stream)
+                 .forEach((f) -> this.files.put(f, FileType.TEMPORARY));
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogFile.java b/src/java/org/apache/cassandra/db/lifecycle/LogFile.java
new file mode 100644
index 0000000..6e820df
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogFile.java

@@ -0,0 +1,482 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterables;
+
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LogRecord.Type;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.utils.Throwables.merge;
+
+/**
+ * A transaction log file. We store transaction records into a log file, which is
+ * copied into multiple identical replicas on different disks, @see LogFileReplica.
+ *
+ * This class supports the transactional logic of LogTransaction and the removing
+ * of unfinished leftovers when a transaction is completed, or aborted, or when
+ * we clean up on start-up.
+ *
+ * @see LogTransaction
+ */
+final class LogFile implements AutoCloseable
+{
+    private static final Logger logger = LoggerFactory.getLogger(LogFile.class);
+
+    static String EXT = ".log";
+    static char SEP = '_';
+    // cc_txn_opname_id.log (where cc is one of the sstable versions defined in BigVersion)
+    static Pattern FILE_REGEX = Pattern.compile(String.format("^(.{2})_txn_(.*)_(.*)%s$", EXT));
+
+    // A set of physical files on disk, each file is an identical replica
+    private final LogReplicaSet replicas = new LogReplicaSet();
+
+    // The transaction records, this set must be ORDER PRESERVING
+    private final LinkedHashSet<LogRecord> records = new LinkedHashSet<>();
+
+    // The type of the transaction
+    private final OperationType type;
+
+    // The unique id of the transaction
+    private final UUID id;
+
+    static LogFile make(File logReplica)
+    {
+        return make(logReplica.getName(), Collections.singletonList(logReplica));
+    }
+
+    static LogFile make(String fileName, List<File> logReplicas)
+    {
+        Matcher matcher = LogFile.FILE_REGEX.matcher(fileName);
+        boolean matched = matcher.matches();
+        assert matched && matcher.groupCount() == 3;
+
+        // For now we don't need this but it is there in case we need to change
+        // file format later on, the version is the sstable version as defined in BigFormat
+        //String version = matcher.group(1);
+
+        OperationType operationType = OperationType.fromFileName(matcher.group(2));
+        UUID id = UUID.fromString(matcher.group(3));
+
+        return new LogFile(operationType, id, logReplicas);
+    }
+
+    Throwable syncFolder(Throwable accumulate)
+    {
+        return replicas.syncFolder(accumulate);
+    }
+
+    OperationType type()
+    {
+        return type;
+    }
+
+    UUID id()
+    {
+        return id;
+    }
+
+    Throwable removeUnfinishedLeftovers(Throwable accumulate)
+    {
+        try
+        {
+            deleteFilesForRecordsOfType(committed() ? Type.REMOVE : Type.ADD);
+
+            // we sync the parent folders between contents and log deletion
+            // to ensure there is a happens before edge between them
+            Throwables.maybeFail(syncFolder(accumulate));
+
+            accumulate = replicas.delete(accumulate);
+        }
+        catch (Throwable t)
+        {
+            accumulate = merge(accumulate, t);
+        }
+
+        return accumulate;
+    }
+
+    static boolean isLogFile(File file)
+    {
+        return LogFile.FILE_REGEX.matcher(file.getName()).matches();
+    }
+
+    LogFile(OperationType type, UUID id, List<File> replicas)
+    {
+        this(type, id);
+        this.replicas.addReplicas(replicas);
+    }
+
+    LogFile(OperationType type, UUID id)
+    {
+        this.type = type;
+        this.id = id;
+    }
+
+    boolean verify()
+    {
+        records.clear();
+        if (!replicas.readRecords(records))
+        {
+            logger.error("Failed to read records from {}", replicas);
+            return false;
+        }
+
+        Set<String> absolutePaths = new HashSet<>();
+        for (LogRecord record : records)
+            record.absolutePath.ifPresent(absolutePaths::add);
+
+        Map<String, List<File>> recordFiles = LogRecord.getExistingFiles(absolutePaths);
+        for (LogRecord record : records)
+        {
+            List<File> existingFiles = Collections.emptyList();
+            if (record.absolutePath.isPresent())
+            {
+                String key = record.absolutePath.get();
+                existingFiles = recordFiles.getOrDefault(key, Collections.emptyList());
+            }
+            LogFile.verifyRecord(record, existingFiles);
+        }
+
+        Optional<LogRecord> firstInvalid = records.stream().filter(LogRecord::isInvalidOrPartial).findFirst();
+        if (!firstInvalid.isPresent())
+            return true;
+
+        LogRecord failedOn = firstInvalid.get();
+        if (getLastRecord() != failedOn)
+        {
+            logError(failedOn);
+            return false;
+        }
+
+        records.stream().filter((r) -> r != failedOn).forEach(LogFile::verifyRecordWithCorruptedLastRecord);
+        if (records.stream()
+                   .filter((r) -> r != failedOn)
+                   .filter(LogRecord::isInvalid)
+                   .map(LogFile::logError)
+                   .findFirst().isPresent())
+        {
+            logError(failedOn);
+            return false;
+        }
+
+        // if only the last record is corrupt and all other records have matching files on disk, @see verifyRecord,
+        // then we simply exited whilst serializing the last record and we carry on
+        logger.warn(String.format("Last record of transaction %s is corrupt or incomplete [%s], " +
+                                  "but all previous records match state on disk; continuing",
+                                  id,
+                                  failedOn.error()));
+        return true;
+    }
+
+    static LogRecord logError(LogRecord record)
+    {
+        logger.error("{}", record.error());
+        return record;
+    }
+
+    static void verifyRecord(LogRecord record, List<File> existingFiles)
+    {
+        if (record.checksum != record.computeChecksum())
+        {
+            record.setError(String.format("Invalid checksum for sstable [%s], record [%s]: [%d] should have been [%d]",
+                                          record.fileName(),
+                                          record,
+                                          record.checksum,
+                                          record.computeChecksum()));
+            return;
+        }
+
+        if (record.type != Type.REMOVE)
+            return;
+
+        // Paranoid sanity checks: we create another record by looking at the files as they are
+        // on disk right now and make sure the information still matches. We don't want to delete
+        // files by mistake if the user has copied them from backup and forgot to remove a txn log
+        // file that obsoleted the very same files. So we check the latest update time and make sure
+        // it matches. Because we delete files from oldest to newest, the latest update time should
+        // always match.
+        record.status.onDiskRecord = record.withExistingFiles(existingFiles);
+        if (record.updateTime != record.status.onDiskRecord.updateTime && record.status.onDiskRecord.updateTime > 0)
+        {
+            record.setError(String.format("Unexpected files detected for sstable [%s], " +
+                                          "record [%s]: last update time [%tT] should have been [%tT]",
+                                          record.fileName(),
+                                          record,
+                                          record.status.onDiskRecord.updateTime,
+                                          record.updateTime));
+
+        }
+    }
+
+    static void verifyRecordWithCorruptedLastRecord(LogRecord record)
+    {
+        if (record.type == Type.REMOVE && record.status.onDiskRecord.numFiles < record.numFiles)
+        { // if we found a corruption in the last record, then we continue only
+          // if the number of files matches exactly for all previous records.
+            record.setError(String.format("Incomplete fileset detected for sstable [%s], record [%s]: " +
+                                          "number of files [%d] should have been [%d]. Treating as unrecoverable " +
+                                          "due to corruption of the final record.",
+                                          record.fileName(),
+                                          record.raw,
+                                          record.status.onDiskRecord.numFiles,
+                                          record.numFiles));
+        }
+    }
+
+    void commit()
+    {
+        addRecord(LogRecord.makeCommit(System.currentTimeMillis()));
+    }
+
+    void abort()
+    {
+        addRecord(LogRecord.makeAbort(System.currentTimeMillis()));
+    }
+
+    private boolean isLastRecordValidWithType(Type type)
+    {
+        LogRecord lastRecord = getLastRecord();
+        return lastRecord != null &&
+               lastRecord.type == type &&
+               lastRecord.isValid();
+    }
+
+    boolean committed()
+    {
+        return isLastRecordValidWithType(Type.COMMIT);
+    }
+
+    boolean aborted()
+    {
+        return isLastRecordValidWithType(Type.ABORT);
+    }
+
+    boolean completed()
+    {
+        return committed() || aborted();
+    }
+
+    void add(SSTable table)
+    {
+        addRecord(makeAddRecord(table));
+    }
+
+    public void addAll(Type type, Iterable<SSTableReader> toBulkAdd)
+    {
+        for (LogRecord record : makeRecords(type, toBulkAdd).values())
+            addRecord(record);
+    }
+
+    Map<SSTable, LogRecord> makeRecords(Type type, Iterable<SSTableReader> tables)
+    {
+        assert type == Type.ADD || type == Type.REMOVE;
+
+        for (SSTableReader sstable : tables)
+        {
+            File folder = sstable.descriptor.directory;
+            replicas.maybeCreateReplica(folder, getFileName(folder), records);
+        }
+        return LogRecord.make(type, tables);
+    }
+
+    private LogRecord makeAddRecord(SSTable table)
+    {
+        File folder = table.descriptor.directory;
+        replicas.maybeCreateReplica(folder, getFileName(folder), records);
+        return LogRecord.make(Type.ADD, table);
+    }
+
+    /**
+     * this version of makeRecord takes an existing LogRecord and converts it to a
+     * record with the given type. This avoids listing the directory and if the
+     * LogRecord already exists, we have all components for the sstable
+     */
+    private LogRecord makeRecord(Type type, SSTable table, LogRecord record)
+    {
+        assert type == Type.ADD || type == Type.REMOVE;
+
+        File folder = table.descriptor.directory;
+        replicas.maybeCreateReplica(folder, getFileName(folder), records);
+        return record.asType(type);
+    }
+
+    void addRecord(LogRecord record)
+    {
+        if (completed())
+            throw new IllegalStateException("Transaction already completed");
+
+        if (records.contains(record))
+            throw new IllegalStateException("Record already exists");
+
+        replicas.append(record);
+        if (!records.add(record))
+            throw new IllegalStateException("Failed to add record");
+    }
+
+    void remove(SSTable table)
+    {
+        LogRecord record = makeAddRecord(table);
+        assert records.contains(record) : String.format("[%s] is not tracked by %s", record, id);
+        assert record.absolutePath.isPresent();
+        deleteRecordFiles(LogRecord.getExistingFiles(record.absolutePath.get()));
+        records.remove(record);
+    }
+
+    boolean contains(Type type, SSTable sstable, LogRecord record)
+    {
+        return contains(makeRecord(type, sstable, record));
+    }
+
+    private boolean contains(LogRecord record)
+    {
+        return records.contains(record);
+    }
+
+    void deleteFilesForRecordsOfType(Type type)
+    {
+        assert type == Type.REMOVE || type == Type.ADD;
+        Set<String> absolutePaths = new HashSet<>();
+        for (LogRecord record : records)
+        {
+            if (type.matches(record))
+            {
+                assert record.absolutePath.isPresent() : "type is either REMOVE or ADD, record should always have an absolutePath: " + record;
+                absolutePaths.add(record.absolutePath.get());
+            }
+        }
+
+        Map<String, List<File>> existingFiles = LogRecord.getExistingFiles(absolutePaths);
+
+        for (List<File> toDelete : existingFiles.values())
+            LogFile.deleteRecordFiles(toDelete);
+
+        records.clear();
+    }
+
+    private static void deleteRecordFiles(List<File> existingFiles)
+    {
+        // we sort the files in ascending update time order so that the last update time
+        // stays the same even if we only partially delete files, see comment in isInvalid()
+        existingFiles.sort(Comparator.comparingLong(File::lastModified));
+        existingFiles.forEach(LogTransaction::delete);
+    }
+
+    /**
+     * Extract from the files passed in all those that are of the given type.
+     *
+     * Scan all records and select those that are of the given type, valid, and
+     * located in the same folder. For each such record extract from the files passed in
+     * those that belong to this record.
+     *
+     * @return a map linking each mapped record to its files, where the files where passed in as parameters.
+     */
+    Map<LogRecord, Set<File>> getFilesOfType(Path folder, NavigableSet<File> files, Type type)
+    {
+        Map<LogRecord, Set<File>> ret = new HashMap<>();
+
+        records.stream()
+               .filter(type::matches)
+               .filter(LogRecord::isValid)
+               .filter(r -> r.isInFolder(folder))
+               .forEach((r) -> ret.put(r, getRecordFiles(files, r)));
+
+        return ret;
+    }
+
+    LogRecord getLastRecord()
+    {
+        return Iterables.getLast(records, null);
+    }
+
+    private static Set<File> getRecordFiles(NavigableSet<File> files, LogRecord record)
+    {
+        String fileName = record.fileName();
+        return files.stream().filter(f -> f.getName().startsWith(fileName)).collect(Collectors.toSet());
+    }
+
+    boolean exists()
+    {
+        return replicas.exists();
+    }
+
+    public void close()
+    {
+        replicas.close();
+    }
+
+    @Override
+    public String toString()
+    {
+        return replicas.toString();
+    }
+
+    @VisibleForTesting
+    List<File> getFiles()
+    {
+        return replicas.getFiles();
+    }
+
+    @VisibleForTesting
+    List<String> getFilePaths()
+    {
+        return replicas.getFilePaths();
+    }
+
+    private String getFileName(File folder)
+    {
+        String fileName = StringUtils.join(BigFormat.latestVersion,
+                                           LogFile.SEP,
+                                           "txn",
+                                           LogFile.SEP,
+                                           type.fileName,
+                                           LogFile.SEP,
+                                           id.toString(),
+                                           LogFile.EXT);
+        return StringUtils.join(folder, File.separator, fileName);
+    }
+
+    Collection<LogRecord> getRecords()
+    {
+        return records;
+    }
+
+    public boolean isEmpty()
+    {
+        return records.isEmpty();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java b/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java
new file mode 100644
index 0000000..69b4920
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java

@@ -0,0 +1,423 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.zip.CRC32;
+
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A decoded line in a transaction log file replica.
+ *
+ * @see LogReplica and LogFile.
+ */
+final class LogRecord
+{
+    public enum Type
+    {
+        UNKNOWN, // a record that cannot be parsed
+        ADD,    // new files to be retained on commit
+        REMOVE, // old files to be retained on abort
+        COMMIT, // commit flag
+        ABORT;  // abort flag
+
+        public static Type fromPrefix(String prefix)
+        {
+            return valueOf(prefix.toUpperCase());
+        }
+
+        public boolean hasFile()
+        {
+            return this == Type.ADD || this == Type.REMOVE;
+        }
+
+        public boolean matches(LogRecord record)
+        {
+            return this == record.type;
+        }
+
+        public boolean isFinal() { return this == Type.COMMIT || this == Type.ABORT; }
+    }
+
+    /**
+     * The status of a record after it has been verified, any parsing errors
+     * are also store here.
+     */
+    public final static class Status
+    {
+        // if there are any errors, they end up here
+        Optional<String> error = Optional.empty();
+
+        // if the record was only partially matched across files this is true
+        boolean partial = false;
+
+        // if the status of this record on disk is required (e.g. existing files), it is
+        // stored here for caching
+        LogRecord onDiskRecord;
+
+        void setError(String error)
+        {
+            if (!this.error.isPresent())
+                this.error = Optional.of(error);
+        }
+
+        boolean hasError()
+        {
+            return error.isPresent();
+        }
+    }
+
+    // the type of record, see Type
+    public final Type type;
+    // for sstable records, the absolute path of the table desc
+    public final Optional<String> absolutePath;
+    // for sstable records, the last update time of all files (may not be available for NEW records)
+    public final long updateTime;
+    // for sstable records, the total number of files (may not be accurate for NEW records)
+    public final int numFiles;
+    // the raw string as written or read from a file
+    public final String raw;
+    // the checksum of this record, written at the end of the record string
+    public final long checksum;
+    // the status of this record, @see Status class
+    public final Status status;
+
+    // (add|remove|commit|abort):[*,*,*][checksum]
+    static Pattern REGEX = Pattern.compile("^(add|remove|commit|abort):\\[([^,]*),?([^,]*),?([^,]*)\\]\\[(\\d*)\\]$", Pattern.CASE_INSENSITIVE);
+
+    public static LogRecord make(String line)
+    {
+        try
+        {
+            Matcher matcher = REGEX.matcher(line);
+            if (!matcher.matches())
+                return new LogRecord(Type.UNKNOWN, null, 0, 0, 0, line)
+                       .setError(String.format("Failed to parse [%s]", line));
+
+            Type type = Type.fromPrefix(matcher.group(1));
+            return new LogRecord(type,
+                                 matcher.group(2) + Component.separator, // see comment on CASSANDRA-13294 below
+                                 Long.valueOf(matcher.group(3)),
+                                 Integer.valueOf(matcher.group(4)),
+                                 Long.valueOf(matcher.group(5)), line);
+        }
+        catch (Throwable t)
+        {
+            return new LogRecord(Type.UNKNOWN, null, 0, 0, 0, line).setError(t);
+        }
+    }
+
+    public static LogRecord makeCommit(long updateTime)
+    {
+        return new LogRecord(Type.COMMIT, updateTime);
+    }
+
+    public static LogRecord makeAbort(long updateTime)
+    {
+        return new LogRecord(Type.ABORT, updateTime);
+    }
+
+    public static LogRecord make(Type type, SSTable table)
+    {
+        // CASSANDRA-13294: add the sstable component separator because for legacy (2.1) files
+        // there is no separator after the generation number, and this would cause files of sstables with
+        // a higher generation number that starts with the same number, to be incorrectly classified as files
+        // of this record sstable
+        String absoluteTablePath = absolutePath(table.descriptor.baseFilename());
+        return make(type, getExistingFiles(absoluteTablePath), table.getAllFilePaths().size(), absoluteTablePath);
+    }
+
+    public static Map<SSTable, LogRecord> make(Type type, Iterable<SSTableReader> tables)
+    {
+        // contains a mapping from sstable absolute path (everything up until the 'Data'/'Index'/etc part of the filename) to the sstable
+        Map<String, SSTable> absolutePaths = new HashMap<>();
+        for (SSTableReader table : tables)
+            absolutePaths.put(absolutePath(table.descriptor.baseFilename()), table);
+
+        // maps sstable base file name to the actual files on disk
+        Map<String, List<File>> existingFiles = getExistingFiles(absolutePaths.keySet());
+        Map<SSTable, LogRecord> records = new HashMap<>(existingFiles.size());
+        for (Map.Entry<String, List<File>> entry : existingFiles.entrySet())
+        {
+            List<File> filesOnDisk = entry.getValue();
+            String baseFileName = entry.getKey();
+            SSTable sstable = absolutePaths.get(baseFileName);
+            records.put(sstable, make(type, filesOnDisk, sstable.getAllFilePaths().size(), baseFileName));
+        }
+        return records;
+    }
+
+    private static String absolutePath(String baseFilename)
+    {
+        return FileUtils.getCanonicalPath(baseFilename + Component.separator);
+    }
+
+    public LogRecord withExistingFiles(List<File> existingFiles)
+    {
+        return make(type, existingFiles, 0, absolutePath.get());
+    }
+
+    public static LogRecord make(Type type, List<File> files, int minFiles, String absolutePath)
+    {
+        // CASSANDRA-11889: File.lastModified() returns a positive value only if the file exists, therefore
+        // we filter by positive values to only consider the files that still exists right now, in case things
+        // changed on disk since getExistingFiles() was called
+        List<Long> positiveModifiedTimes = files.stream().map(File::lastModified).filter(lm -> lm > 0).collect(Collectors.toList());
+        long lastModified = positiveModifiedTimes.stream().reduce(0L, Long::max);
+        return new LogRecord(type, absolutePath, lastModified, Math.max(minFiles, positiveModifiedTimes.size()));
+    }
+
+    private LogRecord(Type type, long updateTime)
+    {
+        this(type, null, updateTime, 0, 0, null);
+    }
+
+    private LogRecord(Type type,
+                      String absolutePath,
+                      long updateTime,
+                      int numFiles)
+    {
+        this(type, absolutePath, updateTime, numFiles, 0, null);
+    }
+
+    private LogRecord(Type type,
+                      String absolutePath,
+                      long updateTime,
+                      int numFiles,
+                      long checksum,
+                      String raw)
+    {
+        assert !type.hasFile() || absolutePath != null : "Expected file path for file records";
+
+        this.type = type;
+        this.absolutePath = type.hasFile() ? Optional.of(absolutePath) : Optional.empty();
+        this.updateTime = type == Type.REMOVE ? updateTime : 0;
+        this.numFiles = type.hasFile() ? numFiles : 0;
+        this.status = new Status();
+        if (raw == null)
+        {
+            assert checksum == 0;
+            this.checksum = computeChecksum();
+            this.raw = format();
+        }
+        else
+        {
+            this.checksum = checksum;
+            this.raw = raw;
+        }
+    }
+
+    LogRecord setError(Throwable t)
+    {
+        return setError(t.getMessage());
+    }
+
+    LogRecord setError(String error)
+    {
+        status.setError(error);
+        return this;
+    }
+
+    String error()
+    {
+        return status.error.orElse("");
+    }
+
+    void setPartial()
+    {
+        status.partial = true;
+    }
+
+    boolean partial()
+    {
+        return status.partial;
+    }
+
+    boolean isValid()
+    {
+        return !status.hasError() && type != Type.UNKNOWN;
+    }
+
+    boolean isInvalid()
+    {
+        return !isValid();
+    }
+
+    boolean isInvalidOrPartial()
+    {
+        return isInvalid() || partial();
+    }
+
+    private String format()
+    {
+        return String.format("%s:[%s,%d,%d][%d]",
+                             type.toString(),
+                             absolutePath(),
+                             updateTime,
+                             numFiles,
+                             checksum);
+    }
+
+    public static List<File> getExistingFiles(String absoluteFilePath)
+    {
+        Path path = Paths.get(absoluteFilePath);
+        File[] files = path.getParent().toFile().listFiles((dir, name) -> name.startsWith(path.getFileName().toString()));
+        // files may be null if the directory does not exist yet, e.g. when tracking new files
+        return files == null ? Collections.emptyList() : Arrays.asList(files);
+    }
+
+    /**
+     * absoluteFilePaths contains full file parts up to (but excluding) the component name
+     *
+     * This method finds all files on disk beginning with any of the paths in absoluteFilePaths
+     *
+     * @return a map from absoluteFilePath to actual file on disk.
+     */
+    public static Map<String, List<File>> getExistingFiles(Set<String> absoluteFilePaths)
+    {
+        Map<String, List<File>> fileMap = new HashMap<>();
+        Map<File, TreeSet<String>> dirToFileNamePrefix = new HashMap<>();
+        for (String absolutePath : absoluteFilePaths)
+        {
+            Path fullPath = Paths.get(absolutePath);
+            Path path = fullPath.getParent();
+            if (path != null)
+                dirToFileNamePrefix.computeIfAbsent(path.toFile(), (k) -> new TreeSet<>()).add(fullPath.getFileName().toString());
+        }
+
+        FilenameFilter ff = (dir, name) -> {
+            TreeSet<String> dirSet = dirToFileNamePrefix.get(dir);
+            // if the set contains a prefix of the current file name, the file name we have here should sort directly
+            // after the prefix in the tree set, which means we can use 'floor' to get the prefix (returns the largest
+            // of the smaller strings in the set). Also note that the prefixes always end with '-' which means we won't
+            // have "xy-1111-Data.db".startsWith("xy-11") below (we'd get "xy-1111-Data.db".startsWith("xy-11-"))
+            String baseName = dirSet.floor(name);
+            if (baseName != null && name.startsWith(baseName))
+            {
+                String absolutePath = new File(dir, baseName).getPath();
+                fileMap.computeIfAbsent(absolutePath, k -> new ArrayList<>()).add(new File(dir, name));
+            }
+            return false;
+        };
+
+        // populate the file map:
+        for (File f : dirToFileNamePrefix.keySet())
+            f.listFiles(ff);
+
+        return fileMap;
+    }
+
+
+    public boolean isFinal()
+    {
+        return type.isFinal();
+    }
+
+    String fileName()
+    {
+        return absolutePath.isPresent() ? Paths.get(absolutePath.get()).getFileName().toString() : "";
+    }
+
+    boolean isInFolder(Path folder)
+    {
+        return absolutePath.isPresent()
+               ? FileUtils.isContained(folder.toFile(), Paths.get(absolutePath.get()).toFile())
+               : false;
+    }
+
+    /**
+     * Return the absolute path, if present, except for the last character (the descriptor separator), or
+     * the empty string if the record has no path. This method is only to be used internally for writing
+     * the record to file or computing the checksum.
+     *
+     * CASSANDRA-13294: the last character of the absolute path is the descriptor separator, it is removed
+     * from the absolute path for backward compatibility, to make sure that on upgrade from 3.0.x to 3.0.y
+     * or to 3.y or to 4.0, the checksum of existing txn files still matches (in case of non clean shutdown
+     * some txn files may be present). By removing the last character here, it means that
+     * it will never be written to txn files, but it is added after reading a txn file in LogFile.make().
+     */
+    private String absolutePath()
+    {
+        if (!absolutePath.isPresent())
+            return "";
+
+        String ret = absolutePath.get();
+        assert ret.charAt(ret.length() -1) == Component.separator : "Invalid absolute path, should end with '-'";
+        return ret.substring(0, ret.length() - 1);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        // see comment in equals
+        return Objects.hash(type, absolutePath, numFiles, updateTime);
+    }
+
+    @Override
+    public boolean equals(Object obj)
+    {
+        if (!(obj instanceof LogRecord))
+            return false;
+
+        final LogRecord other = (LogRecord)obj;
+
+        // we exclude on purpose checksum, error and full file path
+        // since records must match across log file replicas on different disks
+        return type == other.type &&
+               absolutePath.equals(other.absolutePath) &&
+               numFiles == other.numFiles &&
+               updateTime == other.updateTime;
+    }
+
+    @Override
+    public String toString()
+    {
+        return raw;
+    }
+
+    long computeChecksum()
+    {
+        CRC32 crc32 = new CRC32();
+        crc32.update((absolutePath()).getBytes(FileUtils.CHARSET));
+        crc32.update(type.toString().getBytes(FileUtils.CHARSET));
+        FBUtilities.updateChecksumInt(crc32, (int) updateTime);
+        FBUtilities.updateChecksumInt(crc32, (int) (updateTime >>> 32));
+        FBUtilities.updateChecksumInt(crc32, numFiles);
+        return crc32.getValue() & (Long.MAX_VALUE);
+    }
+
+    LogRecord asType(Type type)
+    {
+        return new LogRecord(type, absolutePath.orElse(null), updateTime, numFiles);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java b/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java
new file mode 100644
index 0000000..0378046
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java

@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NativeLibrary;
+
+/**
+ * Because a column family may have sstables on different disks and disks can
+ * be removed, we duplicate log files into many replicas so as to have a file
+ * in each folder where sstables exist.
+ *
+ * Each replica contains the exact same content but we do allow for final
+ * partial records in case we crashed after writing to one replica but
+ * before compliting the write to another replica.
+ *
+ * @see LogFile
+ */
+final class LogReplica implements AutoCloseable
+{
+    private static final Logger logger = LoggerFactory.getLogger(LogReplica.class);
+
+    private final File file;
+    private int folderDescriptor;
+
+    static LogReplica create(File folder, String fileName)
+    {
+        int folderFD = NativeLibrary.tryOpenDirectory(folder.getPath());
+        if (folderFD == -1 && !FBUtilities.isWindows())
+            throw new FSReadError(new IOException(String.format("Invalid folder descriptor trying to create log replica %s", folder.getPath())), folder.getPath());
+
+        return new LogReplica(new File(fileName), folderFD);
+    }
+
+    static LogReplica open(File file)
+    {
+        int folderFD = NativeLibrary.tryOpenDirectory(file.getParentFile().getPath());
+        if (folderFD == -1 && !FBUtilities.isWindows())
+            throw new FSReadError(new IOException(String.format("Invalid folder descriptor trying to create log replica %s", file.getParentFile().getPath())), file.getParentFile().getPath());
+
+        return new LogReplica(file, folderFD);
+    }
+
+    LogReplica(File file, int folderDescriptor)
+    {
+        this.file = file;
+        this.folderDescriptor = folderDescriptor;
+    }
+
+    File file()
+    {
+        return file;
+    }
+
+    void append(LogRecord record)
+    {
+        boolean existed = exists();
+        try
+        {
+            FileUtils.appendAndSync(file, record.toString());
+        }
+        catch (FSError e)
+        {
+            logger.error("Failed to sync file {}", file, e);
+            FileUtils.handleFSErrorAndPropagate(e);
+        }
+
+        // If the file did not exist before appending the first
+        // line, then sync the folder as well since now it must exist
+        if (!existed)
+            syncFolder();
+    }
+
+    void syncFolder()
+    {
+        try
+        {
+            if (folderDescriptor >= 0)
+                NativeLibrary.trySync(folderDescriptor);
+        }
+        catch (FSError e)
+        {
+            logger.error("Failed to sync directory descriptor {}", folderDescriptor, e);
+            FileUtils.handleFSErrorAndPropagate(e);
+        }
+    }
+
+    void delete()
+    {
+        LogTransaction.delete(file);
+        syncFolder();
+    }
+
+    boolean exists()
+    {
+        return file.exists();
+    }
+
+    public void close()
+    {
+        if (folderDescriptor >= 0)
+        {
+            NativeLibrary.tryCloseFD(folderDescriptor);
+            folderDescriptor = -1;
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("[%s] ", file);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java b/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java
new file mode 100644
index 0000000..0bf20e5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java

@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Throwables;
+
+/**
+ * A set of log replicas. This class mostly iterates over replicas when writing or reading,
+ * ensuring consistency among them and hiding replication details from LogFile.
+ *
+ * @see LogReplica, LogFile
+ */
+public class LogReplicaSet implements AutoCloseable
+{
+    private static final Logger logger = LoggerFactory.getLogger(LogReplicaSet.class);
+
+    private final Map<File, LogReplica> replicasByFile = new LinkedHashMap<>();
+
+    private Collection<LogReplica> replicas()
+    {
+        return replicasByFile.values();
+    }
+
+    void addReplicas(List<File> replicas)
+    {
+        replicas.forEach(this::addReplica);
+    }
+
+    void addReplica(File file)
+    {
+        File folder = file.getParentFile();
+        assert !replicasByFile.containsKey(folder);
+        try
+        {
+            replicasByFile.put(folder, LogReplica.open(file));
+        }
+        catch(FSError e)
+        {
+            logger.error("Failed to open log replica {}", file, e);
+            FileUtils.handleFSErrorAndPropagate(e);
+        }
+
+        if (logger.isTraceEnabled())
+            logger.trace("Added log file replica {} ", file);
+    }
+
+    void maybeCreateReplica(File folder, String fileName, Set<LogRecord> records)
+    {
+        if (replicasByFile.containsKey(folder))
+            return;
+
+        try
+        {
+            @SuppressWarnings("resource")  // LogReplicas are closed in LogReplicaSet::close
+            final LogReplica replica = LogReplica.create(folder, fileName);
+            records.forEach(replica::append);
+            replicasByFile.put(folder, replica);
+
+            if (logger.isTraceEnabled())
+                logger.trace("Created new file replica {}", replica);
+        }
+        catch(FSError e)
+        {
+            logger.error("Failed to create log replica {}/{}", folder,  fileName, e);
+            FileUtils.handleFSErrorAndPropagate(e);
+        }
+    }
+
+    Throwable syncFolder(Throwable accumulate)
+    {
+        return Throwables.perform(accumulate, replicas().stream().map(s -> s::syncFolder));
+    }
+
+    Throwable delete(Throwable accumulate)
+    {
+        return Throwables.perform(accumulate, replicas().stream().map(s -> s::delete));
+    }
+
+    private static boolean isPrefixMatch(String first, String second)
+    {
+        return first.length() >= second.length() ?
+               first.startsWith(second) :
+               second.startsWith(first);
+    }
+
+    boolean readRecords(Set<LogRecord> records)
+    {
+        Map<File, List<String>> linesByReplica = replicas().stream()
+                                                           .map(LogReplica::file)
+                                                           .collect(Collectors.toMap(Function.<File>identity(), FileUtils::readLines));
+        int maxNumLines = linesByReplica.values().stream().map(List::size).reduce(0, Integer::max);
+        for (int i = 0; i < maxNumLines; i++)
+        {
+            String firstLine = null;
+            boolean partial = false;
+            for (Map.Entry<File, List<String>> entry : linesByReplica.entrySet())
+            {
+                List<String> currentLines = entry.getValue();
+                if (i >= currentLines.size())
+                    continue;
+
+                String currentLine = currentLines.get(i);
+                if (firstLine == null)
+                {
+                    firstLine = currentLine;
+                    continue;
+                }
+
+                if (!isPrefixMatch(firstLine, currentLine))
+                { // not a prefix match
+                    logger.error("Mismatched line in file {}: got '{}' expected '{}', giving up",
+                                 entry.getKey().getName(),
+                                 currentLine,
+                                 firstLine);
+                    return false;
+                }
+
+                if (!firstLine.equals(currentLine))
+                {
+                    if (i == currentLines.size() - 1)
+                    { // last record, just set record as invalid and move on
+                        logger.warn("Mismatched last line in file {}: '{}' not the same as '{}'",
+                                    entry.getKey().getName(),
+                                    currentLine,
+                                    firstLine);
+
+                        if (currentLine.length() > firstLine.length())
+                            firstLine = currentLine;
+
+                        partial = true;
+                    }
+                    else
+                    {   // mismatched entry file has more lines, giving up
+                        logger.error("Mismatched line in file {}: got '{}' expected '{}', giving up",
+                                     entry.getKey().getName(),
+                                     currentLine,
+                                     firstLine);
+                        return false;
+                    }
+                }
+            }
+
+            LogRecord record = LogRecord.make(firstLine);
+            if (records.contains(record))
+            { // duplicate records
+                logger.error("Found duplicate record {} for {}, giving up", record, record.fileName());
+                return false;
+            }
+
+            if (partial)
+                record.setPartial();
+
+            records.add(record);
+
+            if (record.isFinal() && i != (maxNumLines - 1))
+            { // too many final records
+                logger.error("Found too many lines for {}, giving up", record.fileName());
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     *  Add the record to all the replicas: if it is a final record then we throw only if we fail to write it
+     *  to all, otherwise we throw if we fail to write it to any file, see CASSANDRA-10421 for details
+     */
+    void append(LogRecord record)
+    {
+        Throwable err = Throwables.perform(null, replicas().stream().map(r -> () -> r.append(record)));
+        if (err != null)
+        {
+            if (!record.isFinal() || err.getSuppressed().length == replicas().size() -1)
+                Throwables.maybeFail(err);
+
+            logger.error("Failed to add record '{}' to some replicas '{}'", record, this);
+        }
+    }
+
+    boolean exists()
+    {
+        Optional<Boolean> ret = replicas().stream().map(LogReplica::exists).reduce(Boolean::logicalAnd);
+        return ret.isPresent() ?
+               ret.get()
+               : false;
+    }
+
+    public void close()
+    {
+        Throwables.maybeFail(Throwables.perform(null, replicas().stream().map(r -> r::close)));
+    }
+
+    @Override
+    public String toString()
+    {
+        Optional<String> ret = replicas().stream().map(LogReplica::toString).reduce(String::concat);
+        return ret.isPresent() ?
+               ret.get()
+               : "[-]";
+    }
+
+    @VisibleForTesting
+    List<File> getFiles()
+    {
+        return replicas().stream().map(LogReplica::file).collect(Collectors.toList());
+    }
+
+    @VisibleForTesting
+    List<String> getFilePaths()
+    {
+        return replicas().stream().map(LogReplica::file).map(File::getPath).collect(Collectors.toList());
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
new file mode 100644
index 0000000..d8fc633
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java

@@ -0,0 +1,464 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.NoSuchFileException;
+import java.util.*;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.util.concurrent.Runnables;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LogRecord.Type;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.SnapshotDeletingTask;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.concurrent.Ref;
+import org.apache.cassandra.utils.concurrent.RefCounted;
+import org.apache.cassandra.utils.concurrent.Transactional;
+
+/**
+ * IMPORTANT: When this object is involved in a transactional graph, and is not encapsulated in a LifecycleTransaction,
+ * for correct behaviour its commit MUST occur before any others, since it may legitimately fail. This is consistent
+ * with the Transactional API, which permits one failing action to occur at the beginning of the commit phase, but also
+ * *requires* that the prepareToCommit() phase only take actions that can be rolled back.
+ *
+ * IMPORTANT: The transaction must complete (commit or abort) before any temporary files are deleted, even though the
+ * txn log file itself will not be deleted until all tracked files are deleted. This is required by FileLister to ensure
+ * a consistent disk state. LifecycleTransaction ensures this requirement, so this class should really never be used
+ * outside of LT. @see FileLister.classifyFiles(TransactionData txn)
+ *
+ * A class that tracks sstable files involved in a transaction across sstables:
+ * if the transaction succeeds the old files should be deleted and the new ones kept; vice-versa if it fails.
+ *
+ * The transaction log file contains new and old sstables as follows:
+ *
+ * add:[sstable-2][CRC]
+ * remove:[sstable-1,max_update_time,num files][CRC]
+ *
+ * where sstable-2 is a new sstable to be retained if the transaction succeeds and sstable-1 is an old sstable to be
+ * removed. CRC is an incremental CRC of the file content up to this point. For old sstable files we also log the
+ * last update time of all files for the sstable descriptor and a checksum of vital properties such as update times
+ * and file sizes.
+ *
+ * Upon commit we add a final line to the log file:
+ *
+ * commit:[commit_time][CRC]
+ *
+ * When the transaction log is cleaned-up by the TransactionTidier, which happens only after any old sstables have been
+ * osoleted, then any sstable files for old sstables are removed before deleting the transaction log if the transaction
+ * was committed, vice-versa if the transaction was aborted.
+ *
+ * On start-up we look for any transaction log files and repeat the cleanup process described above.
+ *
+ * See CASSANDRA-7066 for full details.
+ */
+class LogTransaction extends Transactional.AbstractTransactional implements Transactional
+{
+    private static final Logger logger = LoggerFactory.getLogger(LogTransaction.class);
+
+    /**
+     * If the format of the lines in the transaction log is wrong or the checksum
+     * does not match, then we throw this exception.
+     */
+    public static final class CorruptTransactionLogException extends RuntimeException
+    {
+        public final LogFile txnFile;
+
+        public CorruptTransactionLogException(String message, LogFile txnFile)
+        {
+            super(message);
+            this.txnFile = txnFile;
+        }
+    }
+
+    private final Tracker tracker;
+    private final LogFile txnFile;
+    private final Ref<LogTransaction> selfRef;
+    // Deleting sstables is tricky because the mmapping might not have been finalized yet,
+    // and delete will fail (on Windows) until it is (we only force the unmapping on SUN VMs).
+    // Additionally, we need to make sure to delete the data file first, so on restart the others
+    // will be recognized as GCable.
+    private static final Queue<Runnable> failedDeletions = new ConcurrentLinkedQueue<>();
+
+    LogTransaction(OperationType opType)
+    {
+        this(opType, null);
+    }
+
+    LogTransaction(OperationType opType, Tracker tracker)
+    {
+        this.tracker = tracker;
+        this.txnFile = new LogFile(opType, UUIDGen.getTimeUUID());
+        this.selfRef = new Ref<>(this, new TransactionTidier(txnFile));
+
+        if (logger.isTraceEnabled())
+            logger.trace("Created transaction logs with id {}", txnFile.id());
+    }
+
+    /**
+     * Track a reader as new.
+     **/
+    void trackNew(SSTable table)
+    {
+        txnFile.add(table);
+    }
+
+    /**
+     * Stop tracking a reader as new.
+     */
+    void untrackNew(SSTable table)
+    {
+        txnFile.remove(table);
+    }
+
+    /**
+     * helper method for tests, creates the remove records per sstable
+     */
+    @VisibleForTesting
+    SSTableTidier obsoleted(SSTableReader sstable)
+    {
+        return obsoleted(sstable, LogRecord.make(Type.REMOVE, sstable));
+    }
+
+    /**
+     * Schedule a reader for deletion as soon as it is fully unreferenced.
+     */
+    SSTableTidier obsoleted(SSTableReader reader, LogRecord logRecord)
+    {
+        if (txnFile.contains(Type.ADD, reader, logRecord))
+        {
+            if (txnFile.contains(Type.REMOVE, reader, logRecord))
+                throw new IllegalArgumentException();
+
+            return new SSTableTidier(reader, true, this);
+        }
+
+        txnFile.addRecord(logRecord);
+
+        if (tracker != null)
+            tracker.notifyDeleting(reader);
+
+        return new SSTableTidier(reader, false, this);
+    }
+
+    Map<SSTable, LogRecord> makeRemoveRecords(Iterable<SSTableReader> sstables)
+    {
+        return txnFile.makeRecords(Type.REMOVE, sstables);
+    }
+
+
+    OperationType type()
+    {
+        return txnFile.type();
+    }
+
+    UUID id()
+    {
+        return txnFile.id();
+    }
+
+    @VisibleForTesting
+    LogFile txnFile()
+    {
+        return txnFile;
+    }
+
+    @VisibleForTesting
+    List<File> logFiles()
+    {
+        return txnFile.getFiles();
+    }
+
+    @VisibleForTesting
+    List<String> logFilePaths()
+    {
+        return txnFile.getFilePaths();
+    }
+
+    static void delete(File file)
+    {
+        try
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Deleting {}", file);
+
+            Files.delete(file.toPath());
+        }
+        catch (NoSuchFileException e)
+        {
+            logger.error("Unable to delete {} as it does not exist", file);
+        }
+        catch (IOException e)
+        {
+            logger.error("Unable to delete {}", file, e);
+            FileUtils.handleFSErrorAndPropagate(new FSWriteError(e, file));
+        }
+    }
+
+    /**
+     * The transaction tidier.
+     *
+     * When the transaction reference is fully released we try to delete all the obsolete files
+     * depending on the transaction result, as well as the transaction log file.
+     */
+    private static class TransactionTidier implements RefCounted.Tidy, Runnable
+    {
+        private final LogFile data;
+
+        TransactionTidier(LogFile data)
+        {
+            this.data = data;
+        }
+
+        public void tidy() throws Exception
+        {
+            run();
+        }
+
+        public String name()
+        {
+            return data.toString();
+        }
+
+        public void run()
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Removing files for transaction {}", name());
+
+            // this happens if we forget to close a txn and the garbage collector closes it for us
+            // or if the transaction journal was never properly created in the first place
+            if (!data.completed())
+            {
+                logger.error("{} was not completed, trying to abort it now", data);
+                Throwable err = Throwables.perform((Throwable)null, data::abort);
+                if (err != null)
+                    logger.error("Failed to abort {}", data, err);
+            }
+
+            Throwable err = data.removeUnfinishedLeftovers(null);
+
+            if (err != null)
+            {
+                logger.info("Failed deleting files for transaction {}, we'll retry after GC and on on server restart", name(), err);
+                failedDeletions.add(this);
+            }
+            else
+            {
+                if (logger.isTraceEnabled())
+                    logger.trace("Closing file transaction {}", name());
+
+                data.close();
+            }
+        }
+    }
+
+    static class Obsoletion
+    {
+        final SSTableReader reader;
+        final SSTableTidier tidier;
+
+        Obsoletion(SSTableReader reader, SSTableTidier tidier)
+        {
+            this.reader = reader;
+            this.tidier = tidier;
+        }
+    }
+
+    /**
+     * The SSTableReader tidier. When a reader is fully released and no longer referenced
+     * by any one, we run this. It keeps a reference to the parent transaction and releases
+     * it when done, so that the final transaction cleanup can run when all obsolete readers
+     * are released.
+     */
+    public static class SSTableTidier implements Runnable
+    {
+        // must not retain a reference to the SSTableReader, else leak detection cannot kick in
+        private final Descriptor desc;
+        private final long sizeOnDisk;
+        private final Tracker tracker;
+        private final boolean wasNew;
+        private final Ref<LogTransaction> parentRef;
+
+        public SSTableTidier(SSTableReader referent, boolean wasNew, LogTransaction parent)
+        {
+            this.desc = referent.descriptor;
+            this.sizeOnDisk = referent.bytesOnDisk();
+            this.tracker = parent.tracker;
+            this.wasNew = wasNew;
+            this.parentRef = parent.selfRef.tryRef();
+        }
+
+        public void run()
+        {
+            if (tracker != null && !tracker.isDummy())
+                SystemKeyspace.clearSSTableReadMeter(desc.ksname, desc.cfname, desc.generation);
+
+            try
+            {
+                // If we can't successfully delete the DATA component, set the task to be retried later: see TransactionTidier
+                File datafile = new File(desc.filenameFor(Component.DATA));
+
+                delete(datafile);
+                // let the remainder be cleaned up by delete
+                SSTable.delete(desc, SSTable.discoverComponentsFor(desc));
+            }
+            catch (Throwable t)
+            {
+                logger.error("Failed deletion for {}, we'll retry after GC and on server restart", desc);
+                failedDeletions.add(this);
+                return;
+            }
+
+            if (tracker != null && tracker.cfstore != null && !wasNew)
+                tracker.cfstore.metric.totalDiskSpaceUsed.dec(sizeOnDisk);
+
+            // release the referent to the parent so that the all transaction files can be released
+            parentRef.release();
+        }
+
+        public void abort()
+        {
+            parentRef.release();
+        }
+    }
+
+
+    static void rescheduleFailedDeletions()
+    {
+        Runnable task;
+        while ( null != (task = failedDeletions.poll()))
+            ScheduledExecutors.nonPeriodicTasks.submit(task);
+
+        // On Windows, snapshots cannot be deleted so long as a segment of the root element is memory-mapped in NTFS.
+        SnapshotDeletingTask.rescheduleFailedTasks();
+    }
+
+    static void waitForDeletions()
+    {
+        FBUtilities.waitOnFuture(ScheduledExecutors.nonPeriodicTasks.schedule(Runnables.doNothing(), 0, TimeUnit.MILLISECONDS));
+    }
+
+    @VisibleForTesting
+    Throwable complete(Throwable accumulate)
+    {
+        try
+        {
+            accumulate = selfRef.ensureReleased(accumulate);
+            return accumulate;
+        }
+        catch (Throwable t)
+        {
+            logger.error("Failed to complete file transaction {}", id(), t);
+            return Throwables.merge(accumulate, t);
+        }
+    }
+
+    protected Throwable doCommit(Throwable accumulate)
+    {
+        return complete(Throwables.perform(accumulate, txnFile::commit));
+    }
+
+    protected Throwable doAbort(Throwable accumulate)
+    {
+        return complete(Throwables.perform(accumulate, txnFile::abort));
+    }
+
+    protected void doPrepare() { }
+
+    /**
+     * Called on startup to scan existing folders for any unfinished leftovers of
+     * operations that were ongoing when the process exited. Also called by the standalone
+     * sstableutil tool when the cleanup option is specified, @see StandaloneSSTableUtil.
+     *
+     */
+    static void removeUnfinishedLeftovers(CFMetaData metadata)
+    {
+        removeUnfinishedLeftovers(new Directories(metadata, ColumnFamilyStore.getInitialDirectories()).getCFDirectories());
+    }
+
+    @VisibleForTesting
+    static void removeUnfinishedLeftovers(List<File> folders)
+    {
+        LogFilesByName logFiles = new LogFilesByName();
+        folders.forEach(logFiles::list);
+        logFiles.removeUnfinishedLeftovers();
+    }
+
+    private static final class LogFilesByName
+    {
+        Map<String, List<File>> files = new HashMap<>();
+
+        void list(File folder)
+        {
+            Arrays.stream(folder.listFiles(LogFile::isLogFile)).forEach(this::add);
+        }
+
+        void add(File file)
+        {
+            List<File> filesByName = files.get(file.getName());
+            if (filesByName == null)
+            {
+                filesByName = new ArrayList<>();
+                files.put(file.getName(), filesByName);
+            }
+
+            filesByName.add(file);
+        }
+
+        void removeUnfinishedLeftovers()
+        {
+            files.forEach(LogFilesByName::removeUnfinishedLeftovers);
+        }
+
+        static void removeUnfinishedLeftovers(String name, List<File> logFiles)
+        {
+
+            try(LogFile txn = LogFile.make(name, logFiles))
+            {
+                if (txn.verify())
+                {
+                    Throwable failure = txn.removeUnfinishedLeftovers(null);
+                    if (failure != null)
+                        logger.error("Failed to remove unfinished transaction leftovers for txn {}", txn, failure);
+                }
+                else
+                {
+                    logger.error("Unexpected disk state: failed to read transaction txn {}", txn);
+                }
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java b/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java
index 841fa92..61fab98 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java

@@ -26,16 +26,16 @@
 
 import com.google.common.collect.Iterables;
 
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.Interval;
 import org.apache.cassandra.utils.IntervalTree;
 
-public class SSTableIntervalTree extends IntervalTree<RowPosition, SSTableReader, Interval<RowPosition, SSTableReader>>
+public class SSTableIntervalTree extends IntervalTree<PartitionPosition, SSTableReader, Interval<PartitionPosition, SSTableReader>>
 {
     private static final SSTableIntervalTree EMPTY = new SSTableIntervalTree(null);
 
-    SSTableIntervalTree(Collection<Interval<RowPosition, SSTableReader>> intervals)
+    SSTableIntervalTree(Collection<Interval<PartitionPosition, SSTableReader>> intervals)
     {
         super(intervals);
     }
@@ -50,11 +50,11 @@
         return new SSTableIntervalTree(buildIntervals(sstables));
     }
 
-    public static List<Interval<RowPosition, SSTableReader>> buildIntervals(Iterable<SSTableReader> sstables)
+    public static List<Interval<PartitionPosition, SSTableReader>> buildIntervals(Iterable<SSTableReader> sstables)
     {
-        List<Interval<RowPosition, SSTableReader>> intervals = new ArrayList<>(Iterables.size(sstables));
+        List<Interval<PartitionPosition, SSTableReader>> intervals = new ArrayList<>(Iterables.size(sstables));
         for (SSTableReader sstable : sstables)
-            intervals.add(Interval.<RowPosition, SSTableReader>create(sstable.first, sstable.last, sstable));
+            intervals.add(Interval.<PartitionPosition, SSTableReader>create(sstable.first, sstable.last, sstable));
         return intervals;
     }
 }

diff --git a/src/java/org/apache/cassandra/db/lifecycle/SSTableSet.java b/src/java/org/apache/cassandra/db/lifecycle/SSTableSet.java
new file mode 100644
index 0000000..07a3b2b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/SSTableSet.java

@@ -0,0 +1,32 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.lifecycle;
+
+public enum SSTableSet
+{
+    // returns the "canonical" version of any current sstable, i.e. if an sstable is being replaced and is only partially
+    // visible to reads, this sstable will be returned as its original entirety, and its replacement will not be returned
+    // (even if it completely replaces it)
+    CANONICAL,
+    // returns the live versions of all sstables, i.e. including partially written sstables
+    LIVE,
+    NONCOMPACTING
+}

diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
index e77ef78..9feaa3e 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java

@@ -36,18 +36,16 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.metrics.StorageMetrics;
 import org.apache.cassandra.notifications.*;
 import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 
 import static com.google.common.base.Predicates.and;
-import static com.google.common.base.Predicates.in;
-import static com.google.common.base.Predicates.not;
 import static com.google.common.collect.ImmutableSet.copyOf;
 import static com.google.common.collect.Iterables.filter;
 import static java.util.Collections.singleton;
@@ -174,9 +172,11 @@
                 accumulate = merge(accumulate, t);
             }
         }
+
         StorageMetrics.load.inc(add - subtract);
         cfstore.metric.liveDiskSpaceUsed.inc(add - subtract);
-        // we don't subtract from total until the sstable is deleted
+
+        // we don't subtract from total until the sstable is deleted, see TransactionLogs.SSTableTidier
         cfstore.metric.totalDiskSpaceUsed.inc(add);
         return accumulate;
     }
@@ -186,7 +186,7 @@
     public void addInitialSSTables(Iterable<SSTableReader> sstables)
     {
         if (!isDummy())
-            setupKeycache(sstables);
+            setupOnline(sstables);
         apply(updateLiveSet(emptySet(), sstables));
         maybeFail(updateSizeTracking(emptySet(), sstables, null));
         // no notifications or backup necessary
@@ -195,22 +195,18 @@
     public void addSSTables(Iterable<SSTableReader> sstables)
     {
         addInitialSSTables(sstables);
-        for (SSTableReader sstable : sstables)
-        {
-            maybeIncrementallyBackup(sstable);
-            notifyAdded(sstable);
-        }
+        maybeIncrementallyBackup(sstables);
+        notifyAdded(sstables);
     }
 
     /** (Re)initializes the tracker, purging all references. */
     @VisibleForTesting
     public void reset(Memtable memtable)
     {
-        view.set(new View(memtable != null ? singletonList(memtable) : Collections.<Memtable>emptyList(),
-                          Collections.<Memtable>emptyList(),
-                          Collections.<SSTableReader, SSTableReader>emptyMap(),
-                          Collections.<SSTableReader>emptySet(),
-                          Collections.<SSTableReader>emptySet(),
+        view.set(new View(memtable != null ? singletonList(memtable) : Collections.emptyList(),
+                          Collections.emptyList(),
+                          Collections.emptyMap(),
+                          Collections.emptyMap(),
                           SSTableIntervalTree.empty()));
     }
 
@@ -236,29 +232,47 @@
      */
     public Throwable dropSSTables(final Predicate<SSTableReader> remove, OperationType operationType, Throwable accumulate)
     {
-        Pair<View, View> result = apply(new Function<View, View>()
+        try (LogTransaction txnLogs = new LogTransaction(operationType, this))
         {
-            public View apply(View view)
-            {
+            Pair<View, View> result = apply(view -> {
                 Set<SSTableReader> toremove = copyOf(filter(view.sstables, and(remove, notIn(view.compacting))));
                 return updateLiveSet(toremove, emptySet()).apply(view);
+            });
+
+            Set<SSTableReader> removed = Sets.difference(result.left.sstables, result.right.sstables);
+            assert Iterables.all(removed, remove);
+
+            // It is important that any method accepting/returning a Throwable never throws an exception, and does its best
+            // to complete the instructions given to it
+            List<LogTransaction.Obsoletion> obsoletions = new ArrayList<>();
+            accumulate = prepareForObsoletion(removed, txnLogs, obsoletions, accumulate);
+            try
+            {
+                txnLogs.finish();
+                if (!removed.isEmpty())
+                {
+                    accumulate = markObsolete(obsoletions, accumulate);
+                    accumulate = updateSizeTracking(removed, emptySet(), accumulate);
+                    accumulate = release(selfRefs(removed), accumulate);
+                    // notifySSTablesChanged -> LeveledManifest.promote doesn't like a no-op "promotion"
+                    accumulate = notifySSTablesChanged(removed, Collections.<SSTableReader>emptySet(), txnLogs.type(), accumulate);
+                }
             }
-        });
-
-        Set<SSTableReader> removed = Sets.difference(result.left.sstables, result.right.sstables);
-        assert Iterables.all(removed, remove);
-
-        if (!removed.isEmpty())
-        {
-            // notifySSTablesChanged -> LeveledManifest.promote doesn't like a no-op "promotion"
-            accumulate = notifySSTablesChanged(removed, Collections.<SSTableReader>emptySet(), operationType, accumulate);
-            accumulate = updateSizeTracking(removed, emptySet(), accumulate);
-            accumulate = markObsolete(this, removed, accumulate);
-            accumulate = release(selfRefs(removed), accumulate);
+            catch (Throwable t)
+            {
+                accumulate = abortObsoletion(obsoletions, accumulate);
+                accumulate = Throwables.merge(accumulate, t);
+            }
         }
+        catch (Throwable t)
+        {
+            accumulate = Throwables.merge(accumulate, t);
+        }
+
         return accumulate;
     }
 
+
     /**
      * Removes every SSTable in the directory from the Tracker's view.
      * @param directory the unreadable directory, possibly with SSTables in it, but not necessarily.
@@ -320,10 +334,10 @@
         apply(View.markFlushing(memtable));
     }
 
-    public void replaceFlushed(Memtable memtable, SSTableReader sstable)
+    public void replaceFlushed(Memtable memtable, Collection<SSTableReader> sstables)
     {
         assert !isDummy();
-        if (sstable == null)
+        if (sstables.isEmpty())
         {
             // sstable may be null if we flushed batchlog and nothing needed to be retained
             // if it's null, we don't care what state the cfstore is in, we just replace it and continue
@@ -331,66 +345,35 @@
             return;
         }
 
-        sstable.setupKeyCache();
+        sstables.forEach(SSTableReader::setupOnline);
         // back up before creating a new Snapshot (which makes the new one eligible for compaction)
-        maybeIncrementallyBackup(sstable);
+        maybeIncrementallyBackup(sstables);
 
-        apply(View.replaceFlushed(memtable, sstable));
+        apply(View.replaceFlushed(memtable, sstables));
 
         Throwable fail;
-        fail = updateSizeTracking(emptySet(), singleton(sstable), null);
+        fail = updateSizeTracking(emptySet(), sstables, null);
+        // TODO: if we're invalidated, should we notifyadded AND removed, or just skip both?
+        fail = notifyAdded(sstables, fail);
+
+        if (!isDummy() && !cfstore.isValid())
+            dropSSTables();
 
         maybeFail(fail);
     }
 
-    /**
-     * permit compaction of the provided sstable; this translates to notifying compaction
-     * strategies of its existence, and potentially submitting a background task
-     */
-    public void permitCompactionOfFlushed(SSTableReader sstable)
-    {
-        if (sstable == null)
-            return;
-
-        apply(View.permitCompactionOfFlushed(sstable));
-
-        if (isDummy())
-            return;
-
-        if (cfstore.isValid())
-        {
-            notifyAdded(sstable);
-            CompactionManager.instance.submitBackground(cfstore);
-        }
-        else
-        {
-            dropSSTables();
-        }
-    }
-
 
 
     // MISCELLANEOUS public utility calls
 
-    public Set<SSTableReader> getSSTables()
-    {
-        return view.get().sstables;
-    }
-
-    public Iterable<SSTableReader> getPermittedToCompact()
-    {
-        View view = this.view.get();
-        return filter(view.sstables, not(in(view.premature)));
-    }
-
     public Set<SSTableReader> getCompacting()
     {
         return view.get().compacting;
     }
 
-    public Set<SSTableReader> getUncompacting()
+    public Iterable<SSTableReader> getUncompacting()
     {
-        return view.get().nonCompactingSStables();
+        return view.get().select(SSTableSet.NONCOMPACTING);
     }
 
     public Iterable<SSTableReader> getUncompacting(Iterable<SSTableReader> candidates)
@@ -398,16 +381,18 @@
         return view.get().getUncompacting(candidates);
     }
 
-    public void maybeIncrementallyBackup(final SSTableReader sstable)
+    public void maybeIncrementallyBackup(final Iterable<SSTableReader> sstables)
     {
         if (!DatabaseDescriptor.isIncrementalBackupsEnabled())
             return;
 
-        File backupsDir = Directories.getBackupsDirectory(sstable.descriptor);
-        sstable.createLinks(FileUtils.getCanonicalPath(backupsDir));
+        for (SSTableReader sstable : sstables)
+        {
+            File backupsDir = Directories.getBackupsDirectory(sstable.descriptor);
+            sstable.createLinks(FileUtils.getCanonicalPath(backupsDir));
+        }
     }
 
-
     // NOTIFICATION
 
     Throwable notifySSTablesChanged(Collection<SSTableReader> removed, Collection<SSTableReader> added, OperationType compactionType, Throwable accumulate)
@@ -427,7 +412,7 @@
         return accumulate;
     }
 
-    Throwable notifyAdded(SSTableReader added, Throwable accumulate)
+    Throwable notifyAdded(Iterable<SSTableReader> added, Throwable accumulate)
     {
         INotification notification = new SSTableAddedNotification(added);
         for (INotificationConsumer subscriber : subscribers)
@@ -444,7 +429,7 @@
         return accumulate;
     }
 
-    public void notifyAdded(SSTableReader added)
+    public void notifyAdded(Iterable<SSTableReader> added)
     {
         maybeFail(notifyAdded(added, null));
     }

diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java
index e303801..4b3aae0 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/View.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/View.java

@@ -19,15 +19,15 @@
 
 import java.util.*;
 
-import javax.annotation.Nullable;
-
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Function;
 import com.google.common.base.Functions;
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
 
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Memtable;
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.Interval;
@@ -39,8 +39,9 @@
 import static com.google.common.collect.Iterables.all;
 import static com.google.common.collect.Iterables.concat;
 import static com.google.common.collect.Iterables.filter;
-import static java.util.Collections.singleton;
+import static com.google.common.collect.Iterables.transform;
 import static org.apache.cassandra.db.lifecycle.Helpers.emptySet;
+import static org.apache.cassandra.db.lifecycle.Helpers.filterOut;
 import static org.apache.cassandra.db.lifecycle.Helpers.replace;
 
 /**
@@ -66,18 +67,18 @@
      * flushed. In chronologically ascending order.
      */
     public final List<Memtable> flushingMemtables;
-    public final Set<SSTableReader> compacting;
-    public final Set<SSTableReader> sstables;
-    public final Set<SSTableReader> premature;
+    final Set<SSTableReader> compacting;
+    final Set<SSTableReader> sstables;
     // we use a Map here so that we can easily perform identity checks as well as equality checks.
     // When marking compacting, we now  indicate if we expect the sstables to be present (by default we do),
     // and we then check that not only are they all present in the live set, but that the exact instance present is
     // the one we made our decision to compact against.
-    public final Map<SSTableReader, SSTableReader> sstablesMap;
+    final Map<SSTableReader, SSTableReader> sstablesMap;
+    final Map<SSTableReader, SSTableReader> compactingMap;
 
-    public final SSTableIntervalTree intervalTree;
+    final SSTableIntervalTree intervalTree;
 
-    View(List<Memtable> liveMemtables, List<Memtable> flushingMemtables, Map<SSTableReader, SSTableReader> sstables, Set<SSTableReader> compacting, Set<SSTableReader> premature, SSTableIntervalTree intervalTree)
+    View(List<Memtable> liveMemtables, List<Memtable> flushingMemtables, Map<SSTableReader, SSTableReader> sstables, Map<SSTableReader, SSTableReader> compacting, SSTableIntervalTree intervalTree)
     {
         assert liveMemtables != null;
         assert flushingMemtables != null;
@@ -90,8 +91,8 @@
 
         this.sstablesMap = sstables;
         this.sstables = sstablesMap.keySet();
-        this.compacting = compacting;
-        this.premature = premature;
+        this.compactingMap = compacting;
+        this.compacting = compactingMap.keySet();
         this.intervalTree = intervalTree;
     }
 
@@ -108,9 +109,49 @@
         return concat(flushingMemtables, liveMemtables);
     }
 
-    public Sets.SetView<SSTableReader> nonCompactingSStables()
+    // shortcut for all live sstables, so can efficiently use it for size, etc
+    public Set<SSTableReader> liveSSTables()
     {
-        return Sets.difference(sstables, compacting);
+        return sstables;
+    }
+
+    public Iterable<SSTableReader> sstables(SSTableSet sstableSet, Predicate<SSTableReader> filter)
+    {
+        return filter(select(sstableSet), filter);
+    }
+
+    // any sstable known by this tracker in any form; we have a special method here since it's only used for testing/debug
+    // (strong leak detection), and it does not follow the normal pattern
+    @VisibleForTesting
+    public Iterable<SSTableReader> allKnownSSTables()
+    {
+        return Iterables.concat(sstables, filterOut(compacting, sstables));
+    }
+
+    public Iterable<SSTableReader> select(SSTableSet sstableSet)
+    {
+        switch (sstableSet)
+        {
+            case LIVE:
+                return sstables;
+            case NONCOMPACTING:
+                return filter(sstables, (s) -> !compacting.contains(s));
+            case CANONICAL:
+                Set<SSTableReader> canonicalSSTables = new HashSet<>();
+                for (SSTableReader sstable : compacting)
+                    if (sstable.openReason != SSTableReader.OpenReason.EARLY)
+                        canonicalSSTables.add(sstable);
+                // reason for checking if compacting contains the sstable is that if compacting has an EARLY version
+                // of a NORMAL sstable, we still have the canonical version of that sstable in sstables.
+                // note that the EARLY version is equal, but not == since it is a different instance of the same sstable.
+                for (SSTableReader sstable : sstables)
+                    if (!compacting.contains(sstable) && sstable.openReason != SSTableReader.OpenReason.EARLY)
+                        canonicalSSTables.add(sstable);
+
+                return canonicalSSTables;
+            default:
+                throw new IllegalStateException();
+        }
     }
 
     public Iterable<SSTableReader> getUncompacting(Iterable<SSTableReader> candidates)
@@ -124,6 +165,14 @@
         });
     }
 
+    public boolean isEmpty()
+    {
+        return sstables.isEmpty()
+               && liveMemtables.size() <= 1
+               && flushingMemtables.size() == 0
+               && (liveMemtables.size() == 0 || liveMemtables.get(0).getOperations() == 0);
+    }
+
     @Override
     public String toString()
     {
@@ -131,23 +180,63 @@
     }
 
     /**
-      * Returns the sstables that have any partition between {@code left} and {@code right}, when both bounds are taken inclusively.
-      * The interval formed by {@code left} and {@code right} shouldn't wrap.
-      */
-    public List<SSTableReader> sstablesInBounds(RowPosition left, RowPosition right)
-    {
-        return sstablesInBounds(left, right, intervalTree);
-    }
-
-    public static List<SSTableReader> sstablesInBounds(RowPosition left, RowPosition right, SSTableIntervalTree intervalTree)
+     * Returns the sstables that have any partition between {@code left} and {@code right}, when both bounds are taken inclusively.
+     * The interval formed by {@code left} and {@code right} shouldn't wrap.
+     */
+    public Iterable<SSTableReader> liveSSTablesInBounds(PartitionPosition left, PartitionPosition right)
     {
         assert !AbstractBounds.strictlyWrapsAround(left, right);
 
         if (intervalTree.isEmpty())
             return Collections.emptyList();
 
-        RowPosition stopInTree = right.isMinimum() ? intervalTree.max() : right;
-        return intervalTree.search(Interval.<RowPosition, SSTableReader>create(left, stopInTree));
+        PartitionPosition stopInTree = right.isMinimum() ? intervalTree.max() : right;
+        return intervalTree.search(Interval.create(left, stopInTree));
+    }
+
+    public static List<SSTableReader> sstablesInBounds(PartitionPosition left, PartitionPosition right, SSTableIntervalTree intervalTree)
+    {
+        assert !AbstractBounds.strictlyWrapsAround(left, right);
+
+        if (intervalTree.isEmpty())
+            return Collections.emptyList();
+
+        PartitionPosition stopInTree = right.isMinimum() ? intervalTree.max() : right;
+        return intervalTree.search(Interval.create(left, stopInTree));
+    }
+
+    public static Function<View, Iterable<SSTableReader>> selectFunction(SSTableSet sstableSet)
+    {
+        return (view) -> view.select(sstableSet);
+    }
+
+    public static Function<View, Iterable<SSTableReader>> select(SSTableSet sstableSet, Predicate<SSTableReader> filter)
+    {
+        return (view) -> view.sstables(sstableSet, filter);
+    }
+
+    /**
+     * @return a ViewFragment containing the sstables and memtables that may need to be merged
+     * for the given @param key, according to the interval tree
+     */
+    public static Function<View, Iterable<SSTableReader>> select(SSTableSet sstableSet, DecoratedKey key)
+    {
+        assert sstableSet == SSTableSet.LIVE;
+        return (view) -> view.intervalTree.search(key);
+    }
+
+    /**
+     * @return a ViewFragment containing the sstables and memtables that may need to be merged
+     * for rows within @param rowBounds, inclusive, according to the interval tree.
+     */
+    public static Function<View, Iterable<SSTableReader>> selectLive(AbstractBounds<PartitionPosition> rowBounds)
+    {
+        // Note that View.sstablesInBounds always includes it's bound while rowBounds may not. This is ok however
+        // because the fact we restrict the sstables returned by this function is an optimization in the first
+        // place and the returned sstables will (almost) never cover *exactly* rowBounds anyway. It's also
+        // *very* unlikely that a sstable is included *just* because we consider one of the bound inclusively
+        // instead of exclusively, so the performance impact is negligible in practice.
+        return (view) -> view.liveSSTablesInBounds(rowBounds.left, rowBounds.right);
     }
 
     // METHODS TO CONSTRUCT FUNCTIONS FOR MODIFYING A VIEW:
@@ -163,8 +252,8 @@
             {
                 assert all(mark, Helpers.idIn(view.sstablesMap));
                 return new View(view.liveMemtables, view.flushingMemtables, view.sstablesMap,
-                                replace(view.compacting, unmark, mark),
-                                view.premature, view.intervalTree);
+                                replace(view.compactingMap, unmark, mark),
+                                view.intervalTree);
             }
         };
     }
@@ -178,7 +267,7 @@
             public boolean apply(View view)
             {
                 for (SSTableReader reader : readers)
-                    if (view.compacting.contains(reader) || view.sstablesMap.get(reader) != reader || reader.isMarkedCompacted() || view.premature.contains(reader))
+                    if (view.compacting.contains(reader) || view.sstablesMap.get(reader) != reader || reader.isMarkedCompacted())
                         return false;
                 return true;
             }
@@ -195,7 +284,7 @@
             public View apply(View view)
             {
                 Map<SSTableReader, SSTableReader> sstableMap = replace(view.sstablesMap, remove, add);
-                return new View(view.liveMemtables, view.flushingMemtables, sstableMap, view.compacting, view.premature,
+                return new View(view.liveMemtables, view.flushingMemtables, sstableMap, view.compactingMap,
                                 SSTableIntervalTree.build(sstableMap.keySet()));
             }
         };
@@ -210,7 +299,7 @@
             {
                 List<Memtable> newLive = ImmutableList.<Memtable>builder().addAll(view.liveMemtables).add(newMemtable).build();
                 assert newLive.size() == view.liveMemtables.size() + 1;
-                return new View(newLive, view.flushingMemtables, view.sstablesMap, view.compacting, view.premature, view.intervalTree);
+                return new View(newLive, view.flushingMemtables, view.sstablesMap, view.compactingMap, view.intervalTree);
             }
         };
     }
@@ -229,13 +318,13 @@
                                                            filter(flushing, not(lessThan(toFlush)))));
                 assert newLive.size() == live.size() - 1;
                 assert newFlushing.size() == flushing.size() + 1;
-                return new View(newLive, newFlushing, view.sstablesMap, view.compacting, view.premature, view.intervalTree);
+                return new View(newLive, newFlushing, view.sstablesMap, view.compactingMap, view.intervalTree);
             }
         };
     }
 
     // called after flush: removes memtable from flushingMemtables, and inserts flushed into the live sstable set
-    static Function<View, View> replaceFlushed(final Memtable memtable, final SSTableReader flushed)
+    static Function<View, View> replaceFlushed(final Memtable memtable, final Collection<SSTableReader> flushed)
     {
         return new Function<View, View>()
         {
@@ -244,35 +333,17 @@
                 List<Memtable> flushingMemtables = copyOf(filter(view.flushingMemtables, not(equalTo(memtable))));
                 assert flushingMemtables.size() == view.flushingMemtables.size() - 1;
 
-                if (flushed == null)
+                if (flushed == null || flushed.isEmpty())
                     return new View(view.liveMemtables, flushingMemtables, view.sstablesMap,
-                                    view.compacting, view.premature, view.intervalTree);
+                                    view.compactingMap, view.intervalTree);
 
-                Map<SSTableReader, SSTableReader> sstableMap = replace(view.sstablesMap, emptySet(), singleton(flushed));
-                Set<SSTableReader> compacting = replace(view.compacting, emptySet(), singleton(flushed));
-                Set<SSTableReader> premature = replace(view.premature, emptySet(), singleton(flushed));
-                return new View(view.liveMemtables, flushingMemtables, sstableMap, compacting, premature,
+                Map<SSTableReader, SSTableReader> sstableMap = replace(view.sstablesMap, emptySet(), flushed);
+                return new View(view.liveMemtables, flushingMemtables, sstableMap, view.compactingMap,
                                 SSTableIntervalTree.build(sstableMap.keySet()));
             }
         };
     }
 
-    static Function<View, View> permitCompactionOfFlushed(final SSTableReader reader)
-    {
-        return new Function<View, View>()
-        {
-
-            @Nullable
-            public View apply(View view)
-            {
-                Set<SSTableReader> premature = ImmutableSet.copyOf(filter(view.premature, not(equalTo(reader))));
-                Set<SSTableReader> compacting = ImmutableSet.copyOf(filter(view.compacting, not(equalTo(reader))));
-                return new View(view.liveMemtables, view.flushingMemtables, view.sstablesMap, compacting, premature, view.intervalTree);
-            }
-        };
-    }
-
-
     private static <T extends Comparable<T>> Predicate<T> lessThan(final T lessThan)
     {
         return new Predicate<T>()

diff --git a/src/java/org/apache/cassandra/db/lifecycle/WrappedLifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/WrappedLifecycleTransaction.java
new file mode 100644
index 0000000..ff84208
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/lifecycle/WrappedLifecycleTransaction.java

@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.lifecycle;
+
+import java.util.Collection;
+import java.util.Set;
+
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+public class WrappedLifecycleTransaction implements ILifecycleTransaction
+{
+
+    final ILifecycleTransaction delegate;
+    public WrappedLifecycleTransaction(ILifecycleTransaction delegate)
+    {
+        this.delegate = delegate;
+    }
+
+    public void checkpoint()
+    {
+        delegate.checkpoint();
+    }
+
+    public void update(SSTableReader reader, boolean original)
+    {
+        delegate.update(reader, original);
+    }
+
+    public void update(Collection<SSTableReader> readers, boolean original)
+    {
+        delegate.update(readers, original);
+    }
+
+    public SSTableReader current(SSTableReader reader)
+    {
+        return delegate.current(reader);
+    }
+
+    public void obsolete(SSTableReader reader)
+    {
+        delegate.obsolete(reader);
+    }
+
+    public void obsoleteOriginals()
+    {
+        delegate.obsoleteOriginals();
+    }
+
+    public Set<SSTableReader> originals()
+    {
+        return delegate.originals();
+    }
+
+    public boolean isObsolete(SSTableReader reader)
+    {
+        return delegate.isObsolete(reader);
+    }
+
+    public Throwable commit(Throwable accumulate)
+    {
+        return delegate.commit(accumulate);
+    }
+
+    public Throwable abort(Throwable accumulate)
+    {
+        return delegate.abort(accumulate);
+    }
+
+    public void prepareToCommit()
+    {
+        delegate.prepareToCommit();
+    }
+
+    public void close()
+    {
+        delegate.close();
+    }
+
+    public void trackNew(SSTable table)
+    {
+        delegate.trackNew(table);
+    }
+
+    public void untrackNew(SSTable table)
+    {
+        delegate.untrackNew(table);
+    }
+
+    public OperationType opType()
+    {
+        return delegate.opType();
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
index 4baf6a3..b0d6a5d 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java

@@ -36,7 +36,12 @@
  */
 public abstract class AbstractCompositeType extends AbstractType<ByteBuffer>
 {
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    protected AbstractCompositeType()
+    {
+        super(ComparisonType.CUSTOM);
+    }
+
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -103,38 +108,6 @@
         return l.toArray(new ByteBuffer[l.size()]);
     }
 
-    public static class CompositeComponent
-    {
-        public AbstractType<?> comparator;
-        public ByteBuffer   value;
-
-        public CompositeComponent( AbstractType<?> comparator, ByteBuffer value )
-        {
-            this.comparator = comparator;
-            this.value      = value;
-        }
-    }
-
-    public List<CompositeComponent> deconstruct( ByteBuffer bytes )
-    {
-        List<CompositeComponent> list = new ArrayList<CompositeComponent>();
-
-        ByteBuffer bb = bytes.duplicate();
-        readIsStatic(bb);
-        int i = 0;
-
-        while (bb.remaining() > 0)
-        {
-            AbstractType comparator = getComparator(i, bb);
-            ByteBuffer value = ByteBufferUtil.readBytesWithShortLength(bb);
-
-            list.add( new CompositeComponent(comparator,value) );
-
-            byte b = bb.get(); // Ignore; not relevant here
-            ++i;
-        }
-        return list;
-    }
 
     /*
      * Escapes all occurences of the ':' character from the input, replacing them by "\:".
@@ -318,6 +291,12 @@
         return BytesSerializer.instance;
     }
 
+    @Override
+    public boolean referencesUserType(String name)
+    {
+        return getComponents().stream().anyMatch(f -> f.referencesUserType(name));
+    }
+
     /**
      * @return the comparator for the given component. static CompositeType will consult
      * @param i DynamicCompositeType will read the type information from @param bb

diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
index 799d636..a15dd48 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java

@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.db.marshal;
 
+import java.io.IOException;
+import java.lang.reflect.Method;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -25,13 +27,23 @@
 import java.util.List;
 import java.util.Map;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 
+import org.apache.cassandra.utils.FastByteOperations;
 import org.github.jamm.Unmetered;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.CUSTOM;
 
 /**
  * Specifies a Comparator for a specific type of ByteBuffer.
@@ -44,26 +56,48 @@
 @Unmetered
 public abstract class AbstractType<T> implements Comparator<ByteBuffer>
 {
+    private static final Logger logger = LoggerFactory.getLogger(AbstractType.class);
+
     public final Comparator<ByteBuffer> reverseComparator;
 
-    protected AbstractType()
+    public enum ComparisonType
     {
-        reverseComparator = new Comparator<ByteBuffer>()
-        {
-            public int compare(ByteBuffer o1, ByteBuffer o2)
-            {
-                if (o1.remaining() == 0)
-                {
-                    return o2.remaining() == 0 ? 0 : -1;
-                }
-                if (o2.remaining() == 0)
-                {
-                    return 1;
-                }
+        /**
+         * This type should never be compared
+         */
+        NOT_COMPARABLE,
+        /**
+         * This type is always compared by its sequence of unsigned bytes
+         */
+        BYTE_ORDER,
+        /**
+         * This type can only be compared by calling the type's compareCustom() method, which may be expensive.
+         * Support for this may be removed in a major release of Cassandra, however upgrade facilities will be
+         * provided if and when this happens.
+         */
+        CUSTOM
+    }
 
-                return AbstractType.this.compare(o2, o1);
-            }
-        };
+    public final ComparisonType comparisonType;
+    public final boolean isByteOrderComparable;
+
+    protected AbstractType(ComparisonType comparisonType)
+    {
+        this.comparisonType = comparisonType;
+        this.isByteOrderComparable = comparisonType == ComparisonType.BYTE_ORDER;
+        reverseComparator = (o1, o2) -> AbstractType.this.compare(o2, o1);
+        try
+        {
+            Method custom = getClass().getMethod("compareCustom", ByteBuffer.class, ByteBuffer.class);
+            if ((custom.getDeclaringClass() == AbstractType.class) == (comparisonType == CUSTOM))
+                throw new IllegalStateException((comparisonType == CUSTOM ? "compareCustom must be overridden if ComparisonType is CUSTOM"
+                                                                         : "compareCustom should not be overridden if ComparisonType is not CUSTOM")
+                                                + " (" + getClass().getSimpleName() + ")");
+        }
+        catch (NoSuchMethodException e)
+        {
+            throw new IllegalStateException();
+        }
     }
 
     public static List<String> asCQLTypeStringList(List<AbstractType<?>> abstractTypes)
@@ -84,9 +118,12 @@
         return getSerializer().serialize(value);
     }
 
-    /** get a string representation of the bytes suitable for log messages */
+    /** get a string representation of the bytes used for various identifier (NOT just for log messages) */
     public String getString(ByteBuffer bytes)
     {
+        if (bytes == null)
+            return "null";
+
         TypeSerializer<T> serializer = getSerializer();
         serializer.validate(bytes);
 
@@ -122,6 +159,27 @@
         getSerializer().validate(bytes);
     }
 
+    public final int compare(ByteBuffer left, ByteBuffer right)
+    {
+        return isByteOrderComparable
+               ? FastByteOperations.compareUnsigned(left, right)
+               : compareCustom(left, right);
+    }
+
+    /**
+     * Implement IFF ComparisonType is CUSTOM
+     *
+     * Compares the ByteBuffer representation of two instances of this class,
+     * for types where this cannot be done by simple in-order comparison of the
+     * unsigned bytes
+     *
+     * Standard Java compare semantics
+     */
+    public int compareCustom(ByteBuffer left, ByteBuffer right)
+    {
+        throw new UnsupportedOperationException();
+    }
+
     /**
      * Validate cell value. Unlike {@linkplain #validate(java.nio.ByteBuffer)},
      * cell value is passed to validate its content.
@@ -141,6 +199,17 @@
         return new CQL3Type.Custom(this);
     }
 
+    /**
+     * Same as compare except that this ignore ReversedType. This is to be use when
+     * comparing 2 values to decide for a CQL condition (see Operator.isSatisfiedBy) as
+     * for CQL, ReversedType is simply an "hint" to the storage engine but it does not
+     * change the meaning of queries per-se.
+     */
+    public int compareForCQL(ByteBuffer v1, ByteBuffer v2)
+    {
+        return compare(v1, v2);
+    }
+
     public abstract TypeSerializer<T> getSerializer();
 
     /* convenience method */
@@ -224,15 +293,6 @@
     }
 
     /**
-     * @return true IFF the byte representation of this type can be compared unsigned
-     * and always return the same result as calling this object's compare or compareCollectionMembers methods
-     */
-    public boolean isByteOrderComparable()
-    {
-        return false;
-    }
-
-    /**
      * An alternative comparison function used by CollectionsType in conjunction with CompositeType.
      *
      * This comparator is only called to compare components of a CompositeType. It gets the value of the
@@ -265,6 +325,16 @@
         return false;
     }
 
+    public boolean isTuple()
+    {
+        return false;
+    }
+
+    public boolean isUDT()
+    {
+        return false;
+    }
+
     public AbstractType<?> freeze()
     {
         return this;
@@ -305,11 +375,85 @@
     }
 
     /**
-     * Checks whether this type or any of the types this type contains references the given type.
+     * The length of values for this type if all values are of fixed length, -1 otherwise.
      */
-    public boolean references(AbstractType<?> check)
+    protected int valueLengthIfFixed()
     {
-        return this.equals(check);
+        return -1;
+    }
+
+    public void validateIfFixedSize(ByteBuffer value)
+    {
+        if (valueLengthIfFixed() < 0)
+            return;
+
+        validate(value);
+    }
+
+    // This assumes that no empty values are passed
+    public void writeValue(ByteBuffer value, DataOutputPlus out) throws IOException
+    {
+        assert value.hasRemaining();
+        int valueLengthIfFixed = valueLengthIfFixed();
+        assert valueLengthIfFixed < 0 || value.remaining() == valueLengthIfFixed : String.format("Expected exactly %d bytes, but was %d",
+                                                                                                 valueLengthIfFixed, value.remaining());
+
+        if (valueLengthIfFixed >= 0)
+            out.write(value);
+        else
+            ByteBufferUtil.writeWithVIntLength(value, out);
+    }
+
+    public long writtenLength(ByteBuffer value)
+    {
+        assert value.hasRemaining();
+        int valueLengthIfFixed = valueLengthIfFixed();
+        assert valueLengthIfFixed < 0 || value.remaining() == valueLengthIfFixed : String.format("Expected exactly %d bytes, but was %d",
+                                                                                                 valueLengthIfFixed, value.remaining());
+
+        return valueLengthIfFixed >= 0
+             ? value.remaining()
+             : TypeSizes.sizeofWithVIntLength(value);
+    }
+
+    public ByteBuffer readValue(DataInputPlus in) throws IOException
+    {
+        return readValue(in, Integer.MAX_VALUE);
+    }
+
+    public ByteBuffer readValue(DataInputPlus in, int maxValueSize) throws IOException
+    {
+        int length = valueLengthIfFixed();
+
+        if (length >= 0)
+            return ByteBufferUtil.read(in, length);
+        else
+        {
+            int l = (int)in.readUnsignedVInt();
+            if (l < 0)
+                throw new IOException("Corrupt (negative) value length encountered");
+
+            if (l > maxValueSize)
+                throw new IOException(String.format("Corrupt value length %d encountered, as it exceeds the maximum of %d, " +
+                                                    "which is set via max_value_size_in_mb in cassandra.yaml",
+                                                    l, maxValueSize));
+
+            return ByteBufferUtil.read(in, l);
+        }
+    }
+
+    public void skipValue(DataInputPlus in) throws IOException
+    {
+        int length = valueLengthIfFixed();
+        if (length >= 0)
+            in.skipBytesFully(length);
+        else
+            ByteBufferUtil.skipWithVIntLength(in);
+    }
+
+    public boolean referencesUserType(String userTypeName)
+    {
+        return false;
     }
 
     /**
@@ -324,4 +468,13 @@
     {
         return getClass().getName();
     }
+
+    public void checkComparable()
+    {
+        switch (comparisonType)
+        {
+            case NOT_COMPARABLE:
+                throw new IllegalArgumentException(this + " cannot be used in comparisons, so cannot be used as a clustering column");
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/AsciiType.java b/src/java/org/apache/cassandra/db/marshal/AsciiType.java
index 2356c1c..69b2b01 100644
--- a/src/java/org/apache/cassandra/db/marshal/AsciiType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AsciiType.java

@@ -37,7 +37,7 @@
 {
     public static final AsciiType instance = new AsciiType();
 
-    AsciiType() {} // singleton
+    AsciiType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     private final ThreadLocal<CharsetEncoder> encoder = new ThreadLocal<CharsetEncoder>()
     {
@@ -48,11 +48,6 @@
         }
     };
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
-
     public ByteBuffer fromString(String source)
     {
         // the encoder must be reset each time it's used, hence the thread-local storage
@@ -105,9 +100,4 @@
     {
         return AsciiSerializer.instance;
     }
-
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/BooleanType.java b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
index bfe8c34..24d0632 100644
--- a/src/java/org/apache/cassandra/db/marshal/BooleanType.java
+++ b/src/java/org/apache/cassandra/db/marshal/BooleanType.java

@@ -35,14 +35,14 @@
 
     public static final BooleanType instance = new BooleanType();
 
-    BooleanType() {} // singleton
+    BooleanType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -94,4 +94,10 @@
     {
         return BooleanSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 1;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/ByteType.java b/src/java/org/apache/cassandra/db/marshal/ByteType.java
index e1cacc3..6bcf7cb 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteType.java

@@ -33,9 +33,10 @@
 
     ByteType()
     {
+        super(ComparisonType.CUSTOM);
     } // singleton
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return o1.get(o1.position()) - o2.get(o2.position());
     }

diff --git a/src/java/org/apache/cassandra/db/marshal/BytesType.java b/src/java/org/apache/cassandra/db/marshal/BytesType.java
index eed3872..cec20f4 100644
--- a/src/java/org/apache/cassandra/db/marshal/BytesType.java
+++ b/src/java/org/apache/cassandra/db/marshal/BytesType.java

@@ -32,12 +32,7 @@
 {
     public static final BytesType instance = new BytesType();
 
-    BytesType() {} // singleton
-
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
+    BytesType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     public ByteBuffer fromString(String source)
     {
@@ -89,11 +84,6 @@
         return true;
     }
 
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
-
     public CQL3Type asCQL3Type()
     {
         return CQL3Type.Native.BLOB;

diff --git a/src/java/org/apache/cassandra/db/marshal/CollectionType.java b/src/java/org/apache/cassandra/db/marshal/CollectionType.java
index 1660b2e..d65e3a6 100644
--- a/src/java/org/apache/cassandra/db/marshal/CollectionType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CollectionType.java

@@ -18,20 +18,20 @@
 package org.apache.cassandra.db.marshal;
 
 import java.nio.ByteBuffer;
+import java.io.IOException;
 import java.util.List;
+import java.util.Iterator;
 
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.transport.Server;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.ColumnSpecification;
 import org.apache.cassandra.cql3.Lists;
 import org.apache.cassandra.cql3.Maps;
 import org.apache.cassandra.cql3.Sets;
-
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -43,9 +43,7 @@
  */
 public abstract class CollectionType<T> extends AbstractType<T>
 {
-    private static final Logger logger = LoggerFactory.getLogger(CollectionType.class);
-
-    public static final int MAX_ELEMENTS = 65535;
+    public static CellPath.Serializer cellPathSerializer = new CollectionPathSerializer();
 
     public enum Kind
     {
@@ -76,14 +74,17 @@
 
     public final Kind kind;
 
-    protected CollectionType(Kind kind)
+    protected CollectionType(ComparisonType comparisonType, Kind kind)
     {
+        super(comparisonType);
         this.kind = kind;
     }
 
     public abstract AbstractType<?> nameComparator();
     public abstract AbstractType<?> valueComparator();
 
+    protected abstract List<ByteBuffer> serializedValues(Iterator<Cell> cells);
+
     @Override
     public abstract CollectionSerializer<T> getSerializer();
 
@@ -118,7 +119,7 @@
     public void validateCellValue(ByteBuffer cellValue) throws MarshalException
     {
         if (isMultiCell())
-            valueComparator().validate(cellValue);
+            valueComparator().validateCellValue(cellValue);
         else
             super.validateCellValue(cellValue);
     }
@@ -132,28 +133,18 @@
         return kind == Kind.MAP;
     }
 
-    public List<Cell> enforceLimit(ColumnDefinition def, List<Cell> cells, int version)
+    // Overrided by maps
+    protected int collectionSize(List<ByteBuffer> values)
     {
-        assert isMultiCell();
-
-        if (version >= Server.VERSION_3 || cells.size() <= MAX_ELEMENTS)
-            return cells;
-
-        logger.error("Detected collection for table {}.{} with {} elements, more than the {} limit. Only the first {}" +
-                     " elements will be returned to the client. Please see " +
-                     "http://cassandra.apache.org/doc/cql3/CQL.html#collections for more details.",
-                     def.ksName, def.cfName, cells.size(), MAX_ELEMENTS, MAX_ELEMENTS);
-        return cells.subList(0, MAX_ELEMENTS);
+        return values.size();
     }
 
-    public abstract List<ByteBuffer> serializedValues(List<Cell> cells);
-
-    public ByteBuffer serializeForNativeProtocol(ColumnDefinition def, List<Cell> cells, int version)
+    public ByteBuffer serializeForNativeProtocol(ColumnDefinition def, Iterator<Cell> cells, int version)
     {
         assert isMultiCell();
-        cells = enforceLimit(def, cells, version);
         List<ByteBuffer> values = serializedValues(cells);
-        return CollectionSerializer.pack(values, cells.size(), version);
+        int size = collectionSize(values);
+        return CollectionSerializer.pack(values, size, version);
     }
 
     @Override
@@ -217,4 +208,27 @@
     {
         return this.toString(false);
     }
+
+    private static class CollectionPathSerializer implements CellPath.Serializer
+    {
+        public void serialize(CellPath path, DataOutputPlus out) throws IOException
+        {
+            ByteBufferUtil.writeWithVIntLength(path.get(0), out);
+        }
+
+        public CellPath deserialize(DataInputPlus in) throws IOException
+        {
+            return CellPath.create(ByteBufferUtil.readWithVIntLength(in));
+        }
+
+        public long serializedSize(CellPath path)
+        {
+            return ByteBufferUtil.serializedSizeWithVIntLength(path.get(0));
+        }
+
+        public void skip(DataInputPlus in) throws IOException
+        {
+            ByteBufferUtil.skipWithVIntLength(in);
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java b/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java
index 1d2c88c..96efa24 100644
--- a/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ColumnToCollectionType.java

@@ -31,6 +31,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+/*
+ * This class is deprecated and only kept for backward compatibility.
+ */
 public class ColumnToCollectionType extends AbstractType<ByteBuffer>
 {
     // interning instances
@@ -58,10 +61,11 @@
 
     private ColumnToCollectionType(Map<ByteBuffer, CollectionType> defined)
     {
+        super(ComparisonType.CUSTOM);
         this.defined = ImmutableMap.copyOf(defined);
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         throw new UnsupportedOperationException("ColumnToCollectionType should only be used in composite types, never alone");
     }

diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
index 9892118..d4ddfc0 100644
--- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java

@@ -120,17 +120,6 @@
         this.types = ImmutableList.copyOf(types);
     }
 
-    @Override
-    public boolean references(AbstractType<?> check)
-    {
-        if (super.references(check))
-            return true;
-        for (AbstractType<?> type : types)
-            if (type.references(check))
-                return true;
-        return false;
-    }
-
     protected AbstractType<?> getComparator(int i, ByteBuffer bb)
     {
         try
@@ -192,6 +181,7 @@
         // most names will be complete.
         ByteBuffer[] l = new ByteBuffer[types.size()];
         ByteBuffer bb = name.duplicate();
+        readStatic(bb);
         int i = 0;
         while (bb.remaining() > 0)
         {
@@ -201,6 +191,24 @@
         return i == l.length ? l : Arrays.copyOfRange(l, 0, i);
     }
 
+    public static List<ByteBuffer> splitName(ByteBuffer name)
+    {
+        List<ByteBuffer> l = new ArrayList<>();
+        ByteBuffer bb = name.duplicate();
+        readStatic(bb);
+        while (bb.remaining() > 0)
+        {
+            l.add(ByteBufferUtil.readBytesWithShortLength(bb));
+            bb.get(); // skip end-of-component
+        }
+        return l;
+    }
+
+    public static byte lastEOC(ByteBuffer name)
+    {
+        return name.get(name.limit() - 1);
+    }
+
     // Extract component idx from bb. Return null if there is not enough component.
     public static ByteBuffer extractComponent(ByteBuffer bb, int idx)
     {
@@ -331,13 +339,27 @@
         return new Builder(this);
     }
 
+    public Builder builder(boolean isStatic)
+    {
+        return new Builder(this, isStatic);
+    }
+
     public static ByteBuffer build(ByteBuffer... buffers)
     {
-        int totalLength = 0;
+        return build(false, buffers);
+    }
+
+    public static ByteBuffer build(boolean isStatic, ByteBuffer... buffers)
+    {
+        int totalLength = isStatic ? 2 : 0;
         for (ByteBuffer bb : buffers)
             totalLength += 2 + bb.remaining() + 1;
 
         ByteBuffer out = ByteBuffer.allocate(totalLength);
+
+        if (isStatic)
+            out.putShort((short)STATIC_MARKER);
+
         for (ByteBuffer bb : buffers)
         {
             ByteBufferUtil.writeShortLength(out, bb.remaining());
@@ -359,12 +381,12 @@
 
         public Builder(CompositeType composite)
         {
-            this(composite, new ArrayList<ByteBuffer>(composite.types.size()), new byte[composite.types.size()], false);
+            this(composite, false);
         }
 
-        public static Builder staticBuilder(CompositeType composite)
+        public Builder(CompositeType composite, boolean isStatic)
         {
-            return new Builder(composite, new ArrayList<ByteBuffer>(composite.types.size()), new byte[composite.types.size()], true);
+            this(composite, new ArrayList<>(composite.types.size()), new byte[composite.types.size()], isStatic);
         }
 
         private Builder(CompositeType composite, List<ByteBuffer> components, byte[] endOfComponents, boolean isStatic)
@@ -381,7 +403,7 @@
 
         private Builder(Builder b)
         {
-            this(b.composite, new ArrayList<ByteBuffer>(b.components), Arrays.copyOf(b.endOfComponents, b.endOfComponents.length), b.isStatic);
+            this(b.composite, new ArrayList<>(b.components), Arrays.copyOf(b.endOfComponents, b.endOfComponents.length), b.isStatic);
             this.serializedSize = b.serializedSize;
         }
 

diff --git a/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java b/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java
index 4b3ce82..18ff256 100644
--- a/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CounterColumnType.java

@@ -24,13 +24,14 @@
 import org.apache.cassandra.db.context.CounterContext;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.CounterSerializer;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class CounterColumnType extends AbstractType<Long>
 {
     public static final CounterColumnType instance = new CounterColumnType();
 
-    CounterColumnType() {} // singleton
+    CounterColumnType() {super(ComparisonType.NOT_COMPARABLE);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
@@ -42,11 +43,6 @@
         return true;
     }
 
-    public boolean isByteOrderComparable()
-    {
-        throw new AssertionError();
-    }
-
     @Override
     public Long compose(ByteBuffer bytes)
     {
@@ -59,9 +55,10 @@
         return ByteBufferUtil.bytes(value);
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    @Override
+    public void validateCellValue(ByteBuffer cellValue) throws MarshalException
     {
-        return ByteBufferUtil.compareUnsigned(o1, o2);
+        CounterContext.instance().validateContext(cellValue);
     }
 
     public String getString(ByteBuffer bytes)

diff --git a/src/java/org/apache/cassandra/db/marshal/DateType.java b/src/java/org/apache/cassandra/db/marshal/DateType.java
index 5186ca8..dee800e 100644
--- a/src/java/org/apache/cassandra/db/marshal/DateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DateType.java

@@ -42,21 +42,13 @@
 
     public static final DateType instance = new DateType();
 
-    DateType() {} // singleton
+    DateType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        if (!o1.hasRemaining() || !o2.hasRemaining())
-            return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
-
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
-
     public ByteBuffer fromString(String source) throws MarshalException
     {
       // Return an empty ByteBuffer for an empty string.
@@ -108,11 +100,6 @@
         return false;
     }
 
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
-
     @Override
     public boolean isValueCompatibleWithInternal(AbstractType<?> otherType)
     {
@@ -129,4 +116,10 @@
     {
         return TimestampSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 8;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/DecimalType.java b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
index 4052d70..17d91d3 100644
--- a/src/java/org/apache/cassandra/db/marshal/DecimalType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DecimalType.java

@@ -32,14 +32,14 @@
 {
     public static final DecimalType instance = new DecimalType();
 
-    DecimalType() {} // singleton
+    DecimalType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;

diff --git a/src/java/org/apache/cassandra/db/marshal/DoubleType.java b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
index d0f8485..d39059b 100644
--- a/src/java/org/apache/cassandra/db/marshal/DoubleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DoubleType.java

@@ -31,14 +31,14 @@
 {
     public static final DoubleType instance = new DoubleType();
 
-    DoubleType() {} // singleton
+    DoubleType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -101,4 +101,10 @@
     {
         return DoubleSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 8;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
index 97d145d..657f126 100644
--- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java

@@ -343,10 +343,11 @@
 
         public FixedValueComparator(int cmp)
         {
+            super(ComparisonType.CUSTOM);
             this.cmp = cmp;
         }
 
-        public int compare(ByteBuffer v1, ByteBuffer v2)
+        public int compareCustom(ByteBuffer v1, ByteBuffer v2)
         {
             return cmp;
         }
@@ -395,10 +396,5 @@
         {
             throw new UnsupportedOperationException();
         }
-
-        public boolean isByteOrderComparable()
-        {
-            return false;
-        }
     }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/EmptyType.java b/src/java/org/apache/cassandra/db/marshal/EmptyType.java
index e5abe5b..de087f5 100644
--- a/src/java/org/apache/cassandra/db/marshal/EmptyType.java
+++ b/src/java/org/apache/cassandra/db/marshal/EmptyType.java

@@ -18,13 +18,21 @@
 package org.apache.cassandra.db.marshal;
 
 import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.Constants;
 import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.EmptySerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.NoSpamLogger;
 
 /**
  * A type that only accept empty data.
@@ -32,11 +40,34 @@
  */
 public class EmptyType extends AbstractType<Void>
 {
+    private enum NonEmptyWriteBehavior { FAIL, LOG_DATA_LOSS, SILENT_DATA_LOSS }
+
+    private static final Logger logger = LoggerFactory.getLogger(EmptyType.class);
+    private static final String KEY_EMPTYTYPE_NONEMPTY_BEHAVIOR = "cassandra.serialization.emptytype.nonempty_behavior";
+    private static final NoSpamLogger NON_EMPTY_WRITE_LOGGER = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
+    private static final NonEmptyWriteBehavior NON_EMPTY_WRITE_BEHAVIOR = parseNonEmptyWriteBehavior();
+
+    private static NonEmptyWriteBehavior parseNonEmptyWriteBehavior()
+    {
+        String value = System.getProperty(KEY_EMPTYTYPE_NONEMPTY_BEHAVIOR);
+        if (value == null)
+            return NonEmptyWriteBehavior.FAIL;
+        try
+        {
+            return NonEmptyWriteBehavior.valueOf(value.toUpperCase().trim());
+        }
+        catch (Exception e)
+        {
+            logger.warn("Unable to parse property " + KEY_EMPTYTYPE_NONEMPTY_BEHAVIOR + ", falling back to FAIL", e);
+            return NonEmptyWriteBehavior.FAIL;
+        }
+    }
+
     public static final EmptyType instance = new EmptyType();
 
-    private EmptyType() {} // singleton
+    private EmptyType() {super(ComparisonType.CUSTOM);} // singleton
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return 0;
     }
@@ -66,6 +97,11 @@
     }
 
     @Override
+    public CQL3Type asCQL3Type()
+    {
+        return CQL3Type.Native.EMPTY;
+    }
+
     public String toJSONString(ByteBuffer buffer, int protocolVersion)
     {
         return "\"\"";
@@ -75,4 +111,52 @@
     {
         return EmptySerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 0;
+    }
+
+    @Override
+    public ByteBuffer readValue(DataInputPlus in)
+    {
+        return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+    }
+
+    @Override
+    public ByteBuffer readValue(DataInputPlus in, int maxValueSize)
+    {
+        return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+    }
+
+    @Override
+    public void writeValue(ByteBuffer value, DataOutputPlus out)
+    {
+        if (!value.hasRemaining())
+            return;
+        // In 3.0 writeValue was added which required EmptyType to write data, and relied on caller to never do that;
+        // that behavior was unsafe so guard against it.  There are configurable behaviors, but the only allowed cases
+        // should be *_DATA_LOSS (last resort... really should avoid this) and fail; fail should be preferred in nearly
+        // all cases.
+        // see CASSANDRA-15790
+        switch (NON_EMPTY_WRITE_BEHAVIOR)
+        {
+            case LOG_DATA_LOSS:
+                NON_EMPTY_WRITE_LOGGER.warn("Dropping data...", new NonEmptyWriteException("Attempted to write a non-empty value using EmptyType"));
+            case SILENT_DATA_LOSS:
+                return;
+            case FAIL:
+            default:
+                throw new AssertionError("Attempted to write a non-empty value using EmptyType");
+        }
+    }
+
+    private static final class NonEmptyWriteException extends RuntimeException
+    {
+        NonEmptyWriteException(String message)
+        {
+            super(message);
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/FloatType.java b/src/java/org/apache/cassandra/db/marshal/FloatType.java
index fc12b87..58f5702 100644
--- a/src/java/org/apache/cassandra/db/marshal/FloatType.java
+++ b/src/java/org/apache/cassandra/db/marshal/FloatType.java

@@ -32,14 +32,14 @@
 {
     public static final FloatType instance = new FloatType();
 
-    FloatType() {} // singleton
+    FloatType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -100,4 +100,10 @@
     {
         return FloatSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 4;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/FrozenType.java b/src/java/org/apache/cassandra/db/marshal/FrozenType.java
index 7713028..261e789 100644
--- a/src/java/org/apache/cassandra/db/marshal/FrozenType.java
+++ b/src/java/org/apache/cassandra/db/marshal/FrozenType.java

@@ -31,6 +31,11 @@
  */
 public class FrozenType extends AbstractType<Void>
 {
+    protected FrozenType()
+    {
+        super(ComparisonType.NOT_COMPARABLE);
+    }
+
     public static AbstractType<?> getInstance(TypeParser parser) throws ConfigurationException, SyntaxException
     {
         List<AbstractType<?>> innerTypes = parser.getTypeParameters();
@@ -41,11 +46,6 @@
         return innerType.freeze();
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        throw new UnsupportedOperationException();
-    }
-
     public String getString(ByteBuffer bytes)
     {
         throw new UnsupportedOperationException();

diff --git a/src/java/org/apache/cassandra/db/marshal/InetAddressType.java b/src/java/org/apache/cassandra/db/marshal/InetAddressType.java
index 4901c74..7ffb9c7 100644
--- a/src/java/org/apache/cassandra/db/marshal/InetAddressType.java
+++ b/src/java/org/apache/cassandra/db/marshal/InetAddressType.java

@@ -32,18 +32,13 @@
 {
     public static final InetAddressType instance = new InetAddressType();
 
-    InetAddressType() {} // singleton
+    InetAddressType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
-
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
@@ -93,9 +88,4 @@
     {
         return InetAddressSerializer.instance;
     }
-
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/Int32Type.java b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
index 67d8142..770a76d 100644
--- a/src/java/org/apache/cassandra/db/marshal/Int32Type.java
+++ b/src/java/org/apache/cassandra/db/marshal/Int32Type.java

@@ -33,6 +33,7 @@
 
     Int32Type()
     {
+        super(ComparisonType.CUSTOM);
     } // singleton
 
     public boolean isEmptyValueMeaningless()
@@ -40,7 +41,7 @@
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -109,4 +110,9 @@
         return Int32Serializer.instance;
     }
 
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 4;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/IntegerType.java b/src/java/org/apache/cassandra/db/marshal/IntegerType.java
index a3741d4..8f4ba44 100644
--- a/src/java/org/apache/cassandra/db/marshal/IntegerType.java
+++ b/src/java/org/apache/cassandra/db/marshal/IntegerType.java

@@ -58,14 +58,14 @@
         return i;
     }
 
-    IntegerType() {/* singleton */}
+    IntegerType() {super(ComparisonType.CUSTOM);}/* singleton */
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer lhs, ByteBuffer rhs)
+    public int compareCustom(ByteBuffer lhs, ByteBuffer rhs)
     {
         return IntegerType.compareIntegers(lhs, rhs);
     }

diff --git a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
index 3e00d71..70767d4 100644
--- a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java

@@ -34,6 +34,7 @@
 
     LexicalUUIDType()
     {
+        super(ComparisonType.CUSTOM);
     } // singleton
 
     public boolean isEmptyValueMeaningless()
@@ -41,7 +42,7 @@
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -83,4 +84,10 @@
     {
         return UUIDSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 16;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java
index ed44616..29e75bd 100644
--- a/src/java/org/apache/cassandra/db/marshal/ListType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ListType.java

@@ -23,14 +23,13 @@
 import org.apache.cassandra.cql3.Json;
 import org.apache.cassandra.cql3.Lists;
 import org.apache.cassandra.cql3.Term;
-import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.ListSerializer;
 
-import org.apache.cassandra.transport.Server;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -69,16 +68,16 @@
 
     private ListType(AbstractType<T> elements, boolean isMultiCell)
     {
-        super(Kind.LIST);
+        super(ComparisonType.CUSTOM, Kind.LIST);
         this.elements = elements;
         this.serializer = ListSerializer.getInstance(elements.getSerializer());
         this.isMultiCell = isMultiCell;
     }
 
     @Override
-    public boolean references(AbstractType<?> check)
+    public boolean referencesUserType(String userTypeName)
     {
-        return super.references(check) || elements.references(check);
+        return getElementsType().referencesUserType(userTypeName);
     }
 
     public AbstractType<T> getElementsType()
@@ -131,7 +130,7 @@
     }
 
     @Override
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return compareListOrSet(elements, o1, o2);
     }
@@ -175,12 +174,12 @@
         return sb.toString();
     }
 
-    public List<ByteBuffer> serializedValues(List<Cell> cells)
+    public List<ByteBuffer> serializedValues(Iterator<Cell> cells)
     {
         assert isMultiCell;
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(cells.size());
-        for (Cell c : cells)
-            bbs.add(c.value());
+        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>();
+        while (cells.hasNext())
+            bbs.add(cells.next().value());
         return bbs;
     }
 

diff --git a/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java b/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java
deleted file mode 100644
index 427598d..0000000
--- a/src/java/org/apache/cassandra/db/marshal/LocalByPartionerType.java
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.marshal;
-
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.cql3.Term;
-import org.apache.cassandra.serializers.TypeSerializer;
-import org.apache.cassandra.serializers.MarshalException;
-
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-/** for sorting columns representing row keys in the row ordering as determined by a partitioner.
- * Not intended for user-defined CFs, and will in fact error out if used with such. */
-public class LocalByPartionerType extends AbstractType<ByteBuffer>
-{
-    private final IPartitioner partitioner;
-
-    public LocalByPartionerType(IPartitioner partitioner)
-    {
-        this.partitioner = partitioner;
-    }
-
-    @Override
-    public ByteBuffer compose(ByteBuffer bytes)
-    {
-        throw new UnsupportedOperationException("You can't do this with a local partitioner.");
-    }
-
-    @Override
-    public ByteBuffer decompose(ByteBuffer bytes)
-    {
-        throw new UnsupportedOperationException("You can't do this with a local partitioner.");
-    }
-
-    public String getString(ByteBuffer bytes)
-    {
-        return ByteBufferUtil.bytesToHex(bytes);
-    }
-
-    public ByteBuffer fromString(String source)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public Term fromJSONObject(Object parsed)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public String toJSONString(ByteBuffer buffer, int protocolVersion)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        // o1 and o2 can be empty so we need to use RowPosition, not DecoratedKey
-        return RowPosition.ForKey.get(o1, partitioner).compareTo(RowPosition.ForKey.get(o2, partitioner));
-    }
-
-    @Override
-    public void validate(ByteBuffer bytes) throws MarshalException
-    {
-        throw new IllegalStateException("You shouldn't be validating this.");
-    }
-
-    public TypeSerializer<ByteBuffer> getSerializer()
-    {
-        throw new UnsupportedOperationException("You can't do this with a local partitioner.");
-    }
-}

diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java
index d77d7d0..8a1528a 100644
--- a/src/java/org/apache/cassandra/db/marshal/LongType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LongType.java

@@ -31,14 +31,14 @@
 {
     public static final LongType instance = new LongType();
 
-    LongType() {} // singleton
+    LongType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return compareLongs(o1, o2);
     }
@@ -118,4 +118,9 @@
         return LongSerializer.instance;
     }
 
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 8;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java
index c7267cf..9c9cda9 100644
--- a/src/java/org/apache/cassandra/db/marshal/MapType.java
+++ b/src/java/org/apache/cassandra/db/marshal/MapType.java

@@ -23,7 +23,7 @@
 import org.apache.cassandra.cql3.Json;
 import org.apache.cassandra.cql3.Maps;
 import org.apache.cassandra.cql3.Term;
-import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.CollectionSerializer;
@@ -67,17 +67,18 @@
 
     private MapType(AbstractType<K> keys, AbstractType<V> values, boolean isMultiCell)
     {
-        super(Kind.MAP);
+        super(ComparisonType.CUSTOM, Kind.MAP);
         this.keys = keys;
         this.values = values;
-        this.serializer = MapSerializer.getInstance(keys.getSerializer(), values.getSerializer());
+        this.serializer = MapSerializer.getInstance(keys.getSerializer(), values.getSerializer(), keys);
         this.isMultiCell = isMultiCell;
     }
 
     @Override
-    public boolean references(AbstractType<?> check)
+    public boolean referencesUserType(String userTypeName)
     {
-        return super.references(check) || keys.references(check) || values.references(check);
+        return getKeysType().referencesUserType(userTypeName) ||
+               getValuesType().referencesUserType(userTypeName);
     }
 
     public AbstractType<K> getKeysType()
@@ -132,7 +133,7 @@
     }
 
     @Override
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return compareMaps(keys, values, o1, o2);
     }
@@ -173,12 +174,12 @@
         return serializer;
     }
 
-    public boolean isByteOrderComparable()
+    @Override
+    protected int collectionSize(List<ByteBuffer> values)
     {
-        return keys.isByteOrderComparable();
+        return values.size() / 2;
     }
 
-    @Override
     public String toString(boolean ignoreFreezing)
     {
         boolean includeFrozenType = !ignoreFreezing && !isMultiCell();
@@ -192,13 +193,14 @@
         return sb.toString();
     }
 
-    public List<ByteBuffer> serializedValues(List<Cell> cells)
+    public List<ByteBuffer> serializedValues(Iterator<Cell> cells)
     {
         assert isMultiCell;
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(cells.size() * 2);
-        for (Cell c : cells)
+        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>();
+        while (cells.hasNext())
         {
-            bbs.add(c.name().collectionElement());
+            Cell c = cells.next();
+            bbs.add(c.path().get(0));
             bbs.add(c.value());
         }
         return bbs;

diff --git a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
new file mode 100644
index 0000000..02f01ae
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java

@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.serializers.TypeSerializer;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+/** for sorting columns representing row keys in the row ordering as determined by a partitioner.
+ * Not intended for user-defined CFs, and will in fact error out if used with such. */
+public class PartitionerDefinedOrder extends AbstractType<ByteBuffer>
+{
+    private final IPartitioner partitioner;
+
+    public PartitionerDefinedOrder(IPartitioner partitioner)
+    {
+        super(ComparisonType.CUSTOM);
+        this.partitioner = partitioner;
+    }
+
+    public static AbstractType<?> getInstance(TypeParser parser)
+    {
+        IPartitioner partitioner = DatabaseDescriptor.getPartitioner();
+        Iterator<String> argIterator = parser.getKeyValueParameters().keySet().iterator();
+        if (argIterator.hasNext())
+        {
+            partitioner = FBUtilities.newPartitioner(argIterator.next());
+            assert !argIterator.hasNext();
+        }
+        return partitioner.partitionOrdering();
+    }
+
+    @Override
+    public ByteBuffer compose(ByteBuffer bytes)
+    {
+        throw new UnsupportedOperationException("You can't do this with a local partitioner.");
+    }
+
+    @Override
+    public ByteBuffer decompose(ByteBuffer bytes)
+    {
+        throw new UnsupportedOperationException("You can't do this with a local partitioner.");
+    }
+
+    public String getString(ByteBuffer bytes)
+    {
+        return ByteBufferUtil.bytesToHex(bytes);
+    }
+
+    public ByteBuffer fromString(String source)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Term fromJSONObject(Object parsed)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public String toJSONString(ByteBuffer buffer, int protocolVersion)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
+    {
+        // o1 and o2 can be empty so we need to use PartitionPosition, not DecoratedKey
+        return PartitionPosition.ForKey.get(o1, partitioner).compareTo(PartitionPosition.ForKey.get(o2, partitioner));
+    }
+
+    @Override
+    public void validate(ByteBuffer bytes) throws MarshalException
+    {
+        throw new IllegalStateException("You shouldn't be validating this.");
+    }
+
+    public TypeSerializer<ByteBuffer> getSerializer()
+    {
+        throw new UnsupportedOperationException("You can't do this with a local partitioner.");
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("%s(%s)", getClass().getName(), partitioner.getClass().getName());
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/marshal/ReversedType.java b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
index 19bee5f..82a1895 100644
--- a/src/java/org/apache/cassandra/db/marshal/ReversedType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ReversedType.java

@@ -57,6 +57,7 @@
 
     private ReversedType(AbstractType<T> baseType)
     {
+        super(ComparisonType.CUSTOM);
         this.baseType = baseType;
     }
 
@@ -65,11 +66,17 @@
         return baseType.isEmptyValueMeaningless();
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return baseType.compare(o2, o1);
     }
 
+    @Override
+    public int compareForCQL(ByteBuffer v1, ByteBuffer v2)
+    {
+        return baseType.compare(v1, v2);
+    }
+
     public String getString(ByteBuffer bytes)
     {
         return baseType.getString(bytes);
@@ -118,9 +125,15 @@
         return baseType.getSerializer();
     }
 
-    public boolean references(AbstractType<?> check)
+    public boolean referencesUserType(String userTypeName)
     {
-        return super.references(check) || baseType.references(check);
+        return baseType.referencesUserType(userTypeName);
+    }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return baseType.valueLengthIfFixed();
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java
index 52e3f479..22577b3 100644
--- a/src/java/org/apache/cassandra/db/marshal/SetType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SetType.java

@@ -23,12 +23,11 @@
 import org.apache.cassandra.cql3.Json;
 import org.apache.cassandra.cql3.Sets;
 import org.apache.cassandra.cql3.Term;
-import org.apache.cassandra.db.Cell;
+import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.SetSerializer;
-import org.apache.cassandra.transport.Server;
 
 public class SetType<T> extends CollectionType<Set<T>>
 {
@@ -63,16 +62,16 @@
 
     public SetType(AbstractType<T> elements, boolean isMultiCell)
     {
-        super(Kind.SET);
+        super(ComparisonType.CUSTOM, Kind.SET);
         this.elements = elements;
-        this.serializer = SetSerializer.getInstance(elements.getSerializer());
+        this.serializer = SetSerializer.getInstance(elements.getSerializer(), elements);
         this.isMultiCell = isMultiCell;
     }
 
     @Override
-    public boolean references(AbstractType<?> check)
+    public boolean referencesUserType(String userTypeName)
     {
-        return super.references(check) || elements.references(check);
+        return getElementsType().referencesUserType(userTypeName);
     }
 
     public AbstractType<T> getElementsType()
@@ -120,7 +119,7 @@
     }
 
     @Override
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return ListType.compareListOrSet(elements, o1, o2);
     }
@@ -130,11 +129,6 @@
         return serializer;
     }
 
-    public boolean isByteOrderComparable()
-    {
-        return elements.isByteOrderComparable();
-    }
-
     @Override
     public String toString(boolean ignoreFreezing)
     {
@@ -150,11 +144,11 @@
         return sb.toString();
     }
 
-    public List<ByteBuffer> serializedValues(List<Cell> cells)
+    public List<ByteBuffer> serializedValues(Iterator<Cell> cells)
     {
-        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>(cells.size());
-        for (Cell c : cells)
-            bbs.add(c.name().collectionElement());
+        List<ByteBuffer> bbs = new ArrayList<ByteBuffer>();
+        while (cells.hasNext())
+            bbs.add(cells.next().path().get(0));
         return bbs;
     }
 

diff --git a/src/java/org/apache/cassandra/db/marshal/ShortType.java b/src/java/org/apache/cassandra/db/marshal/ShortType.java
index 2f9ec57..482fd81 100644
--- a/src/java/org/apache/cassandra/db/marshal/ShortType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ShortType.java

@@ -33,9 +33,10 @@
 
     ShortType()
     {
+        super(ComparisonType.CUSTOM);
     } // singleton
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         int diff = o1.get(o1.position()) - o2.get(o2.position());
         if (diff != 0)

diff --git a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
index cae9707..92b2dbd 100644
--- a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java

@@ -31,19 +31,7 @@
 {
     public static final SimpleDateType instance = new SimpleDateType();
 
-    SimpleDateType() {} // singleton
-
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        // We add Integer.MIN_VALUE to overflow to allow unsigned comparison
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
-
-    @Override
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
+    SimpleDateType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     public ByteBuffer fromString(String source) throws MarshalException
     {

diff --git a/src/java/org/apache/cassandra/db/marshal/TimeType.java b/src/java/org/apache/cassandra/db/marshal/TimeType.java
index 86de574..8cd221e 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeType.java

@@ -33,12 +33,7 @@
 public class TimeType extends AbstractType<Long>
 {
     public static final TimeType instance = new TimeType();
-    private TimeType() {} // singleton
-
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
+    private TimeType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     public ByteBuffer fromString(String source) throws MarshalException
     {
@@ -46,12 +41,6 @@
     }
 
     @Override
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
-
-    @Override
     public boolean isValueCompatibleWithInternal(AbstractType<?> otherType)
     {
         return this == otherType || otherType == LongType.instance;

diff --git a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
index a1d8d82..36305a3 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java

@@ -33,6 +33,7 @@
 
     TimeUUIDType()
     {
+        super(ComparisonType.CUSTOM);
     } // singleton
 
     public boolean isEmptyValueMeaningless()
@@ -40,7 +41,7 @@
         return true;
     }
 
-    public int compare(ByteBuffer b1, ByteBuffer b2)
+    public int compareCustom(ByteBuffer b1, ByteBuffer b2)
     {
         // Compare for length
         int s1 = b1.position(), s2 = b2.position();
@@ -127,4 +128,10 @@
     {
         return TimeUUIDSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 16;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/TimestampType.java b/src/java/org/apache/cassandra/db/marshal/TimestampType.java
index 1704362..45b08d9 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimestampType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimestampType.java

@@ -43,14 +43,14 @@
 
     public static final TimestampType instance = new TimestampType();
 
-    private TimestampType() {} // singleton
+    private TimestampType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
     {
         return true;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         return LongType.compareLongs(o1, o2);
     }
@@ -125,4 +125,10 @@
     {
         return TimestampSerializer.instance;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 8;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java
index f3600ef..5c74332 100644
--- a/src/java/org/apache/cassandra/db/marshal/TupleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java

@@ -43,6 +43,7 @@
 
     public TupleType(List<AbstractType<?>> types)
     {
+        super(ComparisonType.CUSTOM);
         for (int i = 0; i < types.size(); i++)
             types.set(i, types.get(i).freeze());
         this.types = types;
@@ -67,14 +68,9 @@
     }
 
     @Override
-    public boolean references(AbstractType<?> check)
+    public boolean referencesUserType(String name)
     {
-        if (super.references(check))
-            return true;
-        for (AbstractType<?> type : types)
-            if (type.references(check))
-                return true;
-        return false;
+        return allTypes().stream().anyMatch(f -> f.referencesUserType(name));
     }
 
     public AbstractType<?> type(int i)
@@ -92,7 +88,7 @@
         return types;
     }
 
-    public int compare(ByteBuffer o1, ByteBuffer o2)
+    public int compareCustom(ByteBuffer o1, ByteBuffer o2)
     {
         if (!o1.hasRemaining() || !o2.hasRemaining())
             return o1.hasRemaining() ? 1 : o2.hasRemaining() ? -1 : 0;
@@ -351,4 +347,9 @@
     {
         return getClass().getName() + TypeParser.stringifyTypeParameters(types, true);
     }
+
+    public boolean isTuple()
+    {
+        return true;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/TypeParser.java b/src/java/org/apache/cassandra/db/marshal/TypeParser.java
index faa678e..590eea3 100644
--- a/src/java/org/apache/cassandra/db/marshal/TypeParser.java
+++ b/src/java/org/apache/cassandra/db/marshal/TypeParser.java

@@ -23,7 +23,9 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.cql3.CQL3Type;
+import com.google.common.base.Verify;
+import com.google.common.collect.ImmutableMap;
+
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -38,7 +40,7 @@
     private int idx;
 
     // A cache of parsed string, specially useful for DynamicCompositeType
-    private static final Map<String, AbstractType<?>> cache = new HashMap<>();
+    private static volatile ImmutableMap<String, AbstractType<?>> cache = ImmutableMap.of();
 
     public static final TypeParser EMPTY_PARSER = new TypeParser("", 0);
 
@@ -61,6 +63,7 @@
         if (str == null)
             return BytesType.instance;
 
+        // A single volatile read of 'cache' should not hurt.
         AbstractType<?> type = cache.get(str);
 
         if (type != null)
@@ -84,9 +87,27 @@
         else
             type = getAbstractType(name);
 
-        // We don't really care about concurrency here. Worst case scenario, we do some parsing unnecessarily
-        cache.put(str, type);
-        return type;
+        Verify.verify(type != null, "Parsing %s yielded null, which is a bug", str);
+
+        // Prevent concurrent modification to the map acting as the cache for TypeParser at the expense of
+        // more allocation when the cache needs to be updated, since updates to the cache are rare compared
+        // to the amount of reads.
+        //
+        // Copy the existing cache into a new map and add the parsed AbstractType instance and replace
+        // the cache, if the type is not already in the cache.
+        //
+        // The cache-update is done in a short synchronized block to prevent duplicate instances of AbstractType
+        // for the same string representation.
+        synchronized (TypeParser.class)
+        {
+            if (!cache.containsKey(str))
+            {
+                ImmutableMap.Builder<String, AbstractType<?>> builder = ImmutableMap.builder();
+                builder.putAll(cache).put(str, type);
+                cache = builder.build();
+            }
+            return type;
+        }
     }
 
     public static AbstractType<?> parse(CharSequence compareWith) throws SyntaxException, ConfigurationException
@@ -94,50 +115,6 @@
         return parse(compareWith == null ? null : compareWith.toString());
     }
 
-    public static String parseCqlNativeType(String str)
-    {
-        return CQL3Type.Native.valueOf(str.trim().toUpperCase(Locale.ENGLISH)).getType().toString();
-    }
-
-    public static String parseCqlCollectionOrFrozenType(String str) throws SyntaxException
-    {
-        str = str.trim().toLowerCase();
-        switch (str)
-        {
-            case "map": return "MapType";
-            case "set": return "SetType";
-            case "list": return "ListType";
-            case "frozen": return "FrozenType";
-            default: throw new SyntaxException("Invalid type name" + str);
-        }
-    }
-
-    /**
-     * Turns user facing type names into Abstract Types, 'text' -> UTF8Type
-     */
-    public static AbstractType<?> parseCqlName(String str) throws SyntaxException, ConfigurationException
-    {
-        return parse(parseCqlNameRecurse(str));
-    }
-
-    private static String parseCqlNameRecurse(String str) throws SyntaxException
-    {
-        if (str.indexOf(',') >= 0 && (!str.contains("<") || (str.indexOf(',') < str.indexOf('<'))))
-        {
-            String[] parseString = str.split(",", 2);
-            return parseCqlNameRecurse(parseString[0]) + "," + parseCqlNameRecurse(parseString[1]);
-        }
-        else if (str.contains("<"))
-        {
-            String[] parseString = str.trim().split("<", 2);
-            return parseCqlCollectionOrFrozenType(parseString[0]) + "(" + parseCqlNameRecurse(parseString[1].substring(0, parseString[1].length()-1)) + ")";
-        }
-        else
-        {
-            return parseCqlNativeType(str);
-        }
-    }
-
     /**
      * Parse an AbstractType from current position of this parser.
      */

diff --git a/src/java/org/apache/cassandra/db/marshal/UTF8Type.java b/src/java/org/apache/cassandra/db/marshal/UTF8Type.java
index 3b93d9a..7c18ce5 100644
--- a/src/java/org/apache/cassandra/db/marshal/UTF8Type.java
+++ b/src/java/org/apache/cassandra/db/marshal/UTF8Type.java

@@ -35,12 +35,7 @@
 {
     public static final UTF8Type instance = new UTF8Type();
 
-    UTF8Type() {} // singleton
-
-    public int compare(ByteBuffer o1, ByteBuffer o2)
-    {
-        return ByteBufferUtil.compareUnsigned(o1, o2);
-    }
+    UTF8Type() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     public ByteBuffer fromString(String source)
     {
@@ -83,11 +78,6 @@
         return this == previous || previous == AsciiType.instance;
     }
 
-    public boolean isByteOrderComparable()
-    {
-        return true;
-    }
-
     public CQL3Type asCQL3Type()
     {
         return CQL3Type.Native.TEXT;

diff --git a/src/java/org/apache/cassandra/db/marshal/UUIDType.java b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
index 0250eb20..acaf27c 100644
--- a/src/java/org/apache/cassandra/db/marshal/UUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/UUIDType.java

@@ -48,6 +48,7 @@
 
     UUIDType()
     {
+        super(ComparisonType.CUSTOM);
     }
 
     public boolean isEmptyValueMeaningless()
@@ -55,7 +56,7 @@
         return true;
     }
 
-    public int compare(ByteBuffer b1, ByteBuffer b2)
+    public int compareCustom(ByteBuffer b1, ByteBuffer b2)
     {
         // Compare for length
         int s1 = b1.position(), s2 = b2.position();
@@ -168,4 +169,10 @@
     {
         return (uuid.get(6) & 0xf0) >> 4;
     }
+
+    @Override
+    protected int valueLengthIfFixed()
+    {
+        return 16;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/marshal/UserType.java b/src/java/org/apache/cassandra/db/marshal/UserType.java
index 93059cd..03545ca 100644
--- a/src/java/org/apache/cassandra/db/marshal/UserType.java
+++ b/src/java/org/apache/cassandra/db/marshal/UserType.java

@@ -215,6 +215,13 @@
     }
 
     @Override
+    public boolean referencesUserType(String userTypeName)
+    {
+        return getNameAsString().equals(userTypeName) ||
+               fieldTypes().stream().anyMatch(f -> f.referencesUserType(userTypeName));
+    }
+
+    @Override
     public String toString()
     {
         return getClass().getName() + TypeParser.stringifyUserTypeParameters(keyspace, name, fieldNames, types);
@@ -225,4 +232,14 @@
     {
         return serializer;
     }
+
+    public boolean isTuple()
+    {
+        return false;
+    }
+
+    public boolean isUDT()
+    {
+        return true;
+    }
 }

diff --git a/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java
new file mode 100644
index 0000000..2cd9e97
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java

@@ -0,0 +1,420 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db.partitions;
+
+import java.util.Iterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.utils.SearchIterator;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSearchIterator;
+
+import static org.apache.cassandra.utils.btree.BTree.Dir.desc;
+
+public abstract class AbstractBTreePartition implements Partition, Iterable<Row>
+{
+    protected static final Holder EMPTY = new Holder(PartitionColumns.NONE, BTree.empty(), DeletionInfo.LIVE, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+
+    protected final CFMetaData metadata;
+    protected final DecoratedKey partitionKey;
+
+    protected abstract Holder holder();
+    protected abstract boolean canHaveShadowedData();
+
+    protected AbstractBTreePartition(CFMetaData metadata, DecoratedKey partitionKey)
+    {
+        this.metadata = metadata;
+        this.partitionKey = partitionKey;
+    }
+
+    protected static final class Holder
+    {
+        final PartitionColumns columns;
+        final DeletionInfo deletionInfo;
+        // the btree of rows
+        final Object[] tree;
+        final Row staticRow;
+        final EncodingStats stats;
+
+        Holder(PartitionColumns columns, Object[] tree, DeletionInfo deletionInfo, Row staticRow, EncodingStats stats)
+        {
+            this.columns = columns;
+            this.tree = tree;
+            this.deletionInfo = deletionInfo;
+            this.staticRow = staticRow == null ? Rows.EMPTY_STATIC_ROW : staticRow;
+            this.stats = stats;
+        }
+    }
+
+    public DeletionInfo deletionInfo()
+    {
+        return holder().deletionInfo;
+    }
+
+    public Row staticRow()
+    {
+        return holder().staticRow;
+    }
+
+    public boolean isEmpty()
+    {
+        Holder holder = holder();
+        return holder.deletionInfo.isLive() && BTree.isEmpty(holder.tree) && holder.staticRow.isEmpty();
+    }
+
+    public boolean hasRows()
+    {
+        Holder holder = holder();
+        return !BTree.isEmpty(holder.tree);
+    }
+
+    public CFMetaData metadata()
+    {
+        return metadata;
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return partitionKey;
+    }
+
+    public DeletionTime partitionLevelDeletion()
+    {
+        return deletionInfo().getPartitionDeletion();
+    }
+
+    public PartitionColumns columns()
+    {
+        return holder().columns;
+    }
+
+    public EncodingStats stats()
+    {
+        return holder().stats;
+    }
+
+    public Row getRow(Clustering clustering)
+    {
+        Row row = searchIterator(ColumnFilter.selection(columns()), false).next(clustering);
+        // Note that for statics, this will never return null, this will return an empty row. However,
+        // it's more consistent for this method to return null if we don't really have a static row.
+        return row == null || (clustering == Clustering.STATIC_CLUSTERING && row.isEmpty()) ? null : row;
+    }
+
+    private Row staticRow(Holder current, ColumnFilter columns, boolean setActiveDeletionToRow)
+    {
+        DeletionTime partitionDeletion = current.deletionInfo.getPartitionDeletion();
+        if (columns.fetchedColumns().statics.isEmpty() || (current.staticRow.isEmpty() && partitionDeletion.isLive()))
+            return Rows.EMPTY_STATIC_ROW;
+
+        Row row = current.staticRow.filter(columns, partitionDeletion, setActiveDeletionToRow, metadata);
+        return row == null ? Rows.EMPTY_STATIC_ROW : row;
+    }
+
+    public SearchIterator<Clustering, Row> searchIterator(final ColumnFilter columns, final boolean reversed)
+    {
+        // TODO: we could optimize comparison for "NativeRow" à la #6755
+        final Holder current = holder();
+        return new SearchIterator<Clustering, Row>()
+        {
+            private final SearchIterator<Clustering, Row> rawIter = new BTreeSearchIterator<>(current.tree, metadata.comparator, desc(reversed));
+            private final DeletionTime partitionDeletion = current.deletionInfo.getPartitionDeletion();
+
+            public Row next(Clustering clustering)
+            {
+                if (clustering == Clustering.STATIC_CLUSTERING)
+                    return staticRow(current, columns, true);
+
+                Row row = rawIter.next(clustering);
+                RangeTombstone rt = current.deletionInfo.rangeCovering(clustering);
+
+                // A search iterator only return a row, so it doesn't allow to directly account for deletion that should apply to to row
+                // (the partition deletion or the deletion of a range tombstone that covers it). So if needs be, reuse the row deletion
+                // to carry the proper deletion on the row.
+                DeletionTime activeDeletion = partitionDeletion;
+                if (rt != null && rt.deletionTime().supersedes(activeDeletion))
+                    activeDeletion = rt.deletionTime();
+
+                if (row == null)
+                {
+                    // this means our partition level deletion superseedes all other deletions and we don't have to keep the row deletions
+                    if (activeDeletion == partitionDeletion)
+                        return null;
+                    // no need to check activeDeletion.isLive here - if anything superseedes the partitionDeletion
+                    // it must be non-live
+                    return BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(activeDeletion));
+                }
+
+                return row.filter(columns, activeDeletion, true, metadata);
+            }
+        };
+    }
+
+    public UnfilteredRowIterator unfilteredIterator()
+    {
+        return unfilteredIterator(ColumnFilter.all(metadata()), Slices.ALL, false);
+    }
+
+    public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed)
+    {
+        return unfilteredIterator(holder(), selection, slices, reversed);
+    }
+
+    public UnfilteredRowIterator unfilteredIterator(Holder current, ColumnFilter selection, Slices slices, boolean reversed)
+    {
+        Row staticRow = staticRow(current, selection, false);
+        if (slices.size() == 0)
+        {
+            DeletionTime partitionDeletion = current.deletionInfo.getPartitionDeletion();
+            return UnfilteredRowIterators.noRowsIterator(metadata, partitionKey, staticRow, partitionDeletion, reversed);
+        }
+
+        return slices.size() == 1
+               ? sliceIterator(selection, slices.get(0), reversed, current, staticRow)
+               : new SlicesIterator(selection, slices, reversed, current, staticRow);
+    }
+
+    private UnfilteredRowIterator sliceIterator(ColumnFilter selection, Slice slice, boolean reversed, Holder current, Row staticRow)
+    {
+        Slice.Bound start = slice.start() == Slice.Bound.BOTTOM ? null : slice.start();
+        Slice.Bound end = slice.end() == Slice.Bound.TOP ? null : slice.end();
+        Iterator<Row> rowIter = BTree.slice(current.tree, metadata.comparator, start, true, end, true, desc(reversed));
+        Iterator<RangeTombstone> deleteIter = current.deletionInfo.rangeIterator(slice, reversed);
+
+        return merge(rowIter, deleteIter, selection, reversed, current, staticRow);
+    }
+
+    private RowAndDeletionMergeIterator merge(Iterator<Row> rowIter, Iterator<RangeTombstone> deleteIter,
+                                                     ColumnFilter selection, boolean reversed, Holder current, Row staticRow)
+    {
+        return new RowAndDeletionMergeIterator(metadata, partitionKey, current.deletionInfo.getPartitionDeletion(),
+                                               selection, staticRow, reversed, current.stats,
+                                               rowIter, deleteIter,
+                                               canHaveShadowedData());
+    }
+
+    private abstract class AbstractIterator extends AbstractUnfilteredRowIterator
+    {
+        final Holder current;
+        final ColumnFilter selection;
+
+        private AbstractIterator(ColumnFilter selection, boolean isReversed)
+        {
+            this(AbstractBTreePartition.this.holder(), selection, isReversed);
+        }
+
+        private AbstractIterator(Holder current, ColumnFilter selection, boolean isReversed)
+        {
+            this(current,
+                 AbstractBTreePartition.this.staticRow(current, selection, false),
+                 selection, isReversed);
+        }
+
+        private AbstractIterator(Holder current, Row staticRow, ColumnFilter selection, boolean isReversed)
+        {
+            super(AbstractBTreePartition.this.metadata,
+                  AbstractBTreePartition.this.partitionKey,
+                  current.deletionInfo.getPartitionDeletion(),
+                  selection.fetchedColumns(), // non-selected columns will be filtered in subclasses by RowAndDeletionMergeIterator
+                                              // it would also be more precise to return the intersection of the selection and current.columns,
+                                              // but its probably not worth spending time on computing that.
+                  staticRow,
+                  isReversed,
+                  current.stats);
+            this.current = current;
+            this.selection = selection;
+        }
+    }
+
+    public class SlicesIterator extends AbstractIterator
+    {
+        private final Slices slices;
+
+        private int idx;
+        private Iterator<Unfiltered> currentSlice;
+
+        private SlicesIterator(ColumnFilter selection,
+                               Slices slices,
+                               boolean isReversed,
+                               Holder current,
+                               Row staticRow)
+        {
+            super(current, staticRow, selection, isReversed);
+            this.slices = slices;
+        }
+
+        protected Unfiltered computeNext()
+        {
+            while (true)
+            {
+                if (currentSlice == null)
+                {
+                    if (idx >= slices.size())
+                        return endOfData();
+
+                    int sliceIdx = isReverseOrder ? slices.size() - idx - 1 : idx;
+                    currentSlice = sliceIterator(selection, slices.get(sliceIdx), isReverseOrder, current, Rows.EMPTY_STATIC_ROW);
+                    idx++;
+                }
+
+                if (currentSlice.hasNext())
+                    return currentSlice.next();
+
+                currentSlice = null;
+            }
+        }
+    }
+
+    public class SliceableIterator extends AbstractIterator implements SliceableUnfilteredRowIterator
+    {
+        private Iterator<Unfiltered> iterator;
+
+        protected SliceableIterator(ColumnFilter selection, boolean isReversed)
+        {
+            super(selection, isReversed);
+        }
+
+        protected Unfiltered computeNext()
+        {
+            if (iterator == null)
+                iterator = unfilteredIterator(selection, Slices.ALL, isReverseOrder);
+            if (!iterator.hasNext())
+                return endOfData();
+            return iterator.next();
+        }
+
+        public Iterator<Unfiltered> slice(Slice slice)
+        {
+            return sliceIterator(selection, slice, isReverseOrder, current, staticRow);
+        }
+    }
+
+    public SliceableUnfilteredRowIterator sliceableUnfilteredIterator(ColumnFilter columns, boolean reversed)
+    {
+        return new SliceableIterator(columns, reversed);
+    }
+
+    protected SliceableUnfilteredRowIterator sliceableUnfilteredIterator()
+    {
+        return sliceableUnfilteredIterator(ColumnFilter.all(metadata), false);
+    }
+
+    protected static Holder build(UnfilteredRowIterator iterator, int initialRowCapacity)
+    {
+        return build(iterator, initialRowCapacity, true, null);
+    }
+
+    protected static Holder build(UnfilteredRowIterator iterator, int initialRowCapacity, boolean ordered, BTree.Builder.QuickResolver<Row> quickResolver)
+    {
+        CFMetaData metadata = iterator.metadata();
+        PartitionColumns columns = iterator.columns();
+        boolean reversed = iterator.isReverseOrder();
+
+        BTree.Builder<Row> builder = BTree.builder(metadata.comparator, initialRowCapacity);
+        builder.auto(!ordered);
+        builder.setQuickResolver(quickResolver);
+        MutableDeletionInfo.Builder deletionBuilder = MutableDeletionInfo.builder(iterator.partitionLevelDeletion(), metadata.comparator, reversed);
+
+        while (iterator.hasNext())
+        {
+            Unfiltered unfiltered = iterator.next();
+            if (unfiltered.kind() == Unfiltered.Kind.ROW)
+                builder.add((Row)unfiltered);
+            else
+                deletionBuilder.add((RangeTombstoneMarker)unfiltered);
+        }
+
+        if (reversed)
+            builder.reverse();
+
+        return new Holder(columns, builder.build(), deletionBuilder.build(), iterator.staticRow(), iterator.stats());
+    }
+
+    // Note that when building with a RowIterator, deletion will generally be LIVE, but we allow to pass it nonetheless because PartitionUpdate
+    // passes a MutableDeletionInfo that it mutates later.
+    protected static Holder build(RowIterator rows, DeletionInfo deletion, boolean buildEncodingStats, int initialRowCapacity)
+    {
+        CFMetaData metadata = rows.metadata();
+        PartitionColumns columns = rows.columns();
+        boolean reversed = rows.isReverseOrder();
+
+        BTree.Builder<Row> builder = BTree.builder(metadata.comparator, initialRowCapacity);
+        builder.auto(false);
+        while (rows.hasNext())
+        {
+            Row row = rows.next();
+            builder.add(row);
+        }
+
+        if (reversed)
+            builder.reverse();
+
+        Row staticRow = rows.staticRow();
+        Object[] tree = builder.build();
+        EncodingStats stats = buildEncodingStats ? EncodingStats.Collector.collect(staticRow, BTree.iterator(tree), deletion)
+                                                 : EncodingStats.NO_STATS;
+        return new Holder(columns, tree, deletion, staticRow, stats);
+    }
+
+    @Override
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder();
+
+        sb.append(String.format("[%s.%s] key=%s partition_deletion=%s columns=%s",
+                                metadata.ksName,
+                                metadata.cfName,
+                                metadata.getKeyValidator().getString(partitionKey().getKey()),
+                                partitionLevelDeletion(),
+                                columns()));
+
+        if (staticRow() != Rows.EMPTY_STATIC_ROW)
+            sb.append("\n    ").append(staticRow().toString(metadata, true));
+
+        try (UnfilteredRowIterator iter = unfilteredIterator())
+        {
+            while (iter.hasNext())
+                sb.append("\n    ").append(iter.next().toString(metadata, true));
+        }
+
+        return sb.toString();
+    }
+
+    public int rowCount()
+    {
+        return BTree.size(holder().tree);
+    }
+
+    public Iterator<Row> iterator()
+    {
+        return BTree.<Row>iterator(holder().tree);
+    }
+
+    public Row lastRow()
+    {
+        Object[] tree = holder().tree;
+        if (BTree.isEmpty(tree))
+            return null;
+
+        return BTree.findByIndex(tree, BTree.size(tree) - 1);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/AbstractUnfilteredPartitionIterator.java b/src/java/org/apache/cassandra/db/partitions/AbstractUnfilteredPartitionIterator.java
new file mode 100644
index 0000000..d615ea9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/AbstractUnfilteredPartitionIterator.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+public abstract class AbstractUnfilteredPartitionIterator implements UnfilteredPartitionIterator
+{
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void close()
+    {
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java
new file mode 100644
index 0000000..1543fd3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java

@@ -0,0 +1,338 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
+import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+import org.apache.cassandra.utils.concurrent.Locks;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.HeapAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+/**
+ * A thread-safe and atomic Partition implementation.
+ *
+ * Operations (in particular addAll) on this implementation are atomic and
+ * isolated (in the sense of ACID). Typically a addAll is guaranteed that no
+ * other thread can see the state where only parts but not all rows have
+ * been added.
+ */
+public class AtomicBTreePartition extends AbstractBTreePartition
+{
+    public static final long EMPTY_SIZE = ObjectSizes.measure(new AtomicBTreePartition(CFMetaData.createFake("keyspace", "table"),
+                                                                                       DatabaseDescriptor.getPartitioner().decorateKey(ByteBuffer.allocate(1)),
+                                                                                       null));
+
+    // Reserved values for wasteTracker field. These values must not be consecutive (see avoidReservedValues)
+    private static final int TRACKER_NEVER_WASTED = 0;
+    private static final int TRACKER_PESSIMISTIC_LOCKING = Integer.MAX_VALUE;
+
+    // The granularity with which we track wasted allocation/work; we round up
+    private static final int ALLOCATION_GRANULARITY_BYTES = 1024;
+    // The number of bytes we have to waste in excess of our acceptable realtime rate of waste (defined below)
+    private static final long EXCESS_WASTE_BYTES = 10 * 1024 * 1024L;
+    private static final int EXCESS_WASTE_OFFSET = (int) (EXCESS_WASTE_BYTES / ALLOCATION_GRANULARITY_BYTES);
+    // Note this is a shift, because dividing a long time and then picking the low 32 bits doesn't give correct rollover behavior
+    private static final int CLOCK_SHIFT = 17;
+    // CLOCK_GRANULARITY = 1^9ns >> CLOCK_SHIFT == 132us == (1/7.63)ms
+
+    private static final AtomicIntegerFieldUpdater<AtomicBTreePartition> wasteTrackerUpdater = AtomicIntegerFieldUpdater.newUpdater(AtomicBTreePartition.class, "wasteTracker");
+    private static final AtomicReferenceFieldUpdater<AtomicBTreePartition, Holder> refUpdater = AtomicReferenceFieldUpdater.newUpdater(AtomicBTreePartition.class, Holder.class, "ref");
+
+    /**
+     * (clock + allocation) granularity are combined to give us an acceptable (waste) allocation rate that is defined by
+     * the passage of real time of ALLOCATION_GRANULARITY_BYTES/CLOCK_GRANULARITY, or in this case 7.63Kb/ms, or 7.45Mb/s
+     *
+     * in wasteTracker we maintain within EXCESS_WASTE_OFFSET before the current time; whenever we waste bytes
+     * we increment the current value if it is within this window, and set it to the min of the window plus our waste
+     * otherwise.
+     */
+    private volatile int wasteTracker = TRACKER_NEVER_WASTED;
+
+    private final MemtableAllocator allocator;
+    private volatile Holder ref;
+
+    public AtomicBTreePartition(CFMetaData metadata, DecoratedKey partitionKey, MemtableAllocator allocator)
+    {
+        // involved in potential bug? partition columns may be a subset if we alter columns while it's in memtable
+        super(metadata, partitionKey);
+        this.allocator = allocator;
+        this.ref = EMPTY;
+    }
+
+    protected Holder holder()
+    {
+        return ref;
+    }
+
+    protected boolean canHaveShadowedData()
+    {
+        return true;
+    }
+
+    /**
+     * Adds a given update to this in-memtable partition.
+     *
+     * @return an array containing first the difference in size seen after merging the updates, and second the minimum
+     * time detla between updates.
+     */
+    public long[] addAllWithSizeDelta(final PartitionUpdate update, OpOrder.Group writeOp, UpdateTransaction indexer)
+    {
+        RowUpdater updater = new RowUpdater(this, allocator, writeOp, indexer);
+        DeletionInfo inputDeletionInfoCopy = null;
+        boolean monitorOwned = false;
+        try
+        {
+            monitorOwned = maybeLock(writeOp);
+            indexer.start();
+
+            while (true)
+            {
+                Holder current = ref;
+                updater.ref = current;
+                updater.reset();
+
+                if (!update.deletionInfo().getPartitionDeletion().isLive())
+                    indexer.onPartitionDeletion(update.deletionInfo().getPartitionDeletion());
+
+                if (update.deletionInfo().hasRanges())
+                    update.deletionInfo().rangeIterator(false).forEachRemaining(indexer::onRangeTombstone);
+
+                DeletionInfo deletionInfo;
+                if (update.deletionInfo().mayModify(current.deletionInfo))
+                {
+                    if (inputDeletionInfoCopy == null)
+                        inputDeletionInfoCopy = update.deletionInfo().copy(HeapAllocator.instance);
+
+                    deletionInfo = current.deletionInfo.mutableCopy().add(inputDeletionInfoCopy);
+                    updater.allocated(deletionInfo.unsharedHeapSize() - current.deletionInfo.unsharedHeapSize());
+                }
+                else
+                {
+                    deletionInfo = current.deletionInfo;
+                }
+
+                PartitionColumns columns = update.columns().mergeTo(current.columns);
+                Row newStatic = update.staticRow();
+                Row staticRow = newStatic.isEmpty()
+                              ? current.staticRow
+                              : (current.staticRow.isEmpty() ? updater.apply(newStatic) : updater.apply(current.staticRow, newStatic));
+                Object[] tree = BTree.update(current.tree, update.metadata().comparator, update, update.rowCount(), updater);
+                EncodingStats newStats = current.stats.mergeWith(update.stats());
+
+                if (tree != null && refUpdater.compareAndSet(this, current, new Holder(columns, tree, deletionInfo, staticRow, newStats)))
+                {
+                    updater.finish();
+                    return new long[]{ updater.dataSize, updater.colUpdateTimeDelta };
+                }
+                else if (!monitorOwned)
+                {
+                    monitorOwned = maybeLock(updater.heapSize, writeOp);
+                }
+            }
+        }
+        finally
+        {
+            indexer.commit();
+            if (monitorOwned)
+                Locks.monitorExitUnsafe(this);
+        }
+    }
+
+    private boolean maybeLock(OpOrder.Group writeOp)
+    {
+        if (!useLock())
+            return false;
+
+        return lockIfOldest(writeOp);
+    }
+
+    private boolean maybeLock(long addWaste, OpOrder.Group writeOp)
+    {
+        if (!updateWastedAllocationTracker(addWaste))
+            return false;
+
+        return lockIfOldest(writeOp);
+    }
+
+    private boolean lockIfOldest(OpOrder.Group writeOp)
+    {
+        if (!writeOp.isOldestLiveGroup())
+        {
+            Thread.yield();
+            if (!writeOp.isOldestLiveGroup())
+                return false;
+        }
+
+        Locks.monitorEnterUnsafe(this);
+        return true;
+    }
+
+    public boolean useLock()
+    {
+        return wasteTracker == TRACKER_PESSIMISTIC_LOCKING;
+    }
+
+    /**
+     * Update the wasted allocation tracker state based on newly wasted allocation information
+     *
+     * @param wastedBytes the number of bytes wasted by this thread
+     * @return true if the caller should now proceed with pessimistic locking because the waste limit has been reached
+     */
+    private boolean updateWastedAllocationTracker(long wastedBytes)
+    {
+        // Early check for huge allocation that exceeds the limit
+        if (wastedBytes < EXCESS_WASTE_BYTES)
+        {
+            // We round up to ensure work < granularity are still accounted for
+            int wastedAllocation = ((int) (wastedBytes + ALLOCATION_GRANULARITY_BYTES - 1)) / ALLOCATION_GRANULARITY_BYTES;
+
+            int oldTrackerValue;
+            while (TRACKER_PESSIMISTIC_LOCKING != (oldTrackerValue = wasteTracker))
+            {
+                // Note this time value has an arbitrary offset, but is a constant rate 32 bit counter (that may wrap)
+                int time = (int) (System.nanoTime() >>> CLOCK_SHIFT);
+                int delta = oldTrackerValue - time;
+                if (oldTrackerValue == TRACKER_NEVER_WASTED || delta >= 0 || delta < -EXCESS_WASTE_OFFSET)
+                    delta = -EXCESS_WASTE_OFFSET;
+                delta += wastedAllocation;
+                if (delta >= 0)
+                    break;
+                if (wasteTrackerUpdater.compareAndSet(this, oldTrackerValue, avoidReservedValues(time + delta)))
+                    return false;
+            }
+        }
+        // We have definitely reached our waste limit so set the state if it isn't already
+        wasteTrackerUpdater.set(this, TRACKER_PESSIMISTIC_LOCKING);
+        // And tell the caller to proceed with pessimistic locking
+        return true;
+    }
+
+    private static int avoidReservedValues(int wasteTracker)
+    {
+        if (wasteTracker == TRACKER_NEVER_WASTED || wasteTracker == TRACKER_PESSIMISTIC_LOCKING)
+            return wasteTracker + 1;
+        return wasteTracker;
+    }
+
+    // the function we provide to the btree utilities to perform any column replacements
+    private static final class RowUpdater implements UpdateFunction<Row, Row>
+    {
+        final AtomicBTreePartition updating;
+        final MemtableAllocator allocator;
+        final OpOrder.Group writeOp;
+        final UpdateTransaction indexer;
+        final int nowInSec;
+        Holder ref;
+        Row.Builder regularBuilder;
+        long dataSize;
+        long heapSize;
+        long colUpdateTimeDelta = Long.MAX_VALUE;
+        List<Row> inserted; // TODO: replace with walk of aborted BTree
+
+        private RowUpdater(AtomicBTreePartition updating, MemtableAllocator allocator, OpOrder.Group writeOp, UpdateTransaction indexer)
+        {
+            this.updating = updating;
+            this.allocator = allocator;
+            this.writeOp = writeOp;
+            this.indexer = indexer;
+            this.nowInSec = FBUtilities.nowInSeconds();
+        }
+
+        private Row.Builder builder(Clustering clustering)
+        {
+            boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;
+            // We know we only insert/update one static per PartitionUpdate, so no point in saving the builder
+            if (isStatic)
+                return allocator.rowBuilder(writeOp);
+
+            if (regularBuilder == null)
+                regularBuilder = allocator.rowBuilder(writeOp);
+            return regularBuilder;
+        }
+
+        public Row apply(Row insert)
+        {
+            Row data = Rows.copy(insert, builder(insert.clustering())).build();
+            indexer.onInserted(insert);
+
+            this.dataSize += data.dataSize();
+            this.heapSize += data.unsharedHeapSizeExcludingData();
+            if (inserted == null)
+                inserted = new ArrayList<>();
+            inserted.add(data);
+            return data;
+        }
+
+        public Row apply(Row existing, Row update)
+        {
+            Row.Builder builder = builder(existing.clustering());
+            colUpdateTimeDelta = Math.min(colUpdateTimeDelta, Rows.merge(existing, update, builder, nowInSec));
+
+            Row reconciled = builder.build();
+
+            indexer.onUpdated(existing, reconciled);
+
+            dataSize += reconciled.dataSize() - existing.dataSize();
+            heapSize += reconciled.unsharedHeapSizeExcludingData() - existing.unsharedHeapSizeExcludingData();
+            if (inserted == null)
+                inserted = new ArrayList<>();
+            inserted.add(reconciled);
+
+            return reconciled;
+        }
+
+        protected void reset()
+        {
+            this.dataSize = 0;
+            this.heapSize = 0;
+            if (inserted != null)
+                inserted.clear();
+        }
+
+        public boolean abortEarly()
+        {
+            return updating.ref != ref;
+        }
+
+        public void allocated(long heapSize)
+        {
+            this.heapSize += heapSize;
+        }
+
+        protected void finish()
+        {
+            allocator.onHeap().adjust(heapSize, writeOp);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/BasePartitionIterator.java b/src/java/org/apache/cassandra/db/partitions/BasePartitionIterator.java
new file mode 100644
index 0000000..214f416
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/BasePartitionIterator.java

@@ -0,0 +1,27 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.db.rows.BaseRowIterator;
+import org.apache.cassandra.utils.CloseableIterator;
+
+public interface BasePartitionIterator<I extends BaseRowIterator<?>> extends CloseableIterator<I>
+{
+    public void close();
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java
new file mode 100644
index 0000000..9c6ab59
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java

@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.io.IOException;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.btree.BTree;
+
+public class CachedBTreePartition extends ImmutableBTreePartition implements CachedPartition
+{
+    private final int createdAtInSec;
+
+    private final int cachedLiveRows;
+    private final int rowsWithNonExpiringCells;
+
+    private final int nonTombstoneCellCount;
+    private final int nonExpiringLiveCells;
+
+    private CachedBTreePartition(CFMetaData metadata,
+                                 DecoratedKey partitionKey,
+                                 Holder holder,
+                                 int createdAtInSec,
+                                 int cachedLiveRows,
+                                 int rowsWithNonExpiringCells,
+                                 int nonTombstoneCellCount,
+                                 int nonExpiringLiveCells)
+    {
+        super(metadata, partitionKey, holder);
+        this.createdAtInSec = createdAtInSec;
+        this.cachedLiveRows = cachedLiveRows;
+        this.rowsWithNonExpiringCells = rowsWithNonExpiringCells;
+        this.nonTombstoneCellCount = nonTombstoneCellCount;
+        this.nonExpiringLiveCells = nonExpiringLiveCells;
+    }
+
+    /**
+     * Creates an {@code ArrayBackedCachedPartition} holding all the data of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     *
+     * @param iterator the iterator got gather in memory.
+     * @param nowInSec the time of the creation in seconds. This is the time at which {@link #cachedLiveRows} applies.
+     * @return the created partition.
+     */
+    public static CachedBTreePartition create(UnfilteredRowIterator iterator, int nowInSec)
+    {
+        return create(iterator, 16, nowInSec);
+    }
+
+    /**
+     * Creates an {@code ArrayBackedCachedPartition} holding all the data of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     *
+     * @param iterator the iterator got gather in memory.
+     * @param initialRowCapacity sizing hint (in rows) to use for the created partition. It should ideally
+     * correspond or be a good estimation of the number or rows in {@code iterator}.
+     * @param nowInSec the time of the creation in seconds. This is the time at which {@link #cachedLiveRows} applies.
+     * @return the created partition.
+     */
+    public static CachedBTreePartition create(UnfilteredRowIterator iterator, int initialRowCapacity, int nowInSec)
+    {
+        Holder holder = ImmutableBTreePartition.build(iterator, initialRowCapacity);
+
+        int cachedLiveRows = 0;
+        int rowsWithNonExpiringCells = 0;
+        int nonTombstoneCellCount = 0;
+        int nonExpiringLiveCells = 0;
+        boolean enforceStrictLiveness = iterator.metadata().enforceStrictLiveness();
+
+        for (Row row : BTree.<Row>iterable(holder.tree))
+        {
+            if (row.hasLiveData(nowInSec, enforceStrictLiveness))
+                ++cachedLiveRows;
+
+            int nonExpiringLiveCellsThisRow = 0;
+            for (Cell cell : row.cells())
+            {
+                if (!cell.isTombstone())
+                {
+                    ++nonTombstoneCellCount;
+                    if (!cell.isExpiring())
+                        ++nonExpiringLiveCellsThisRow;
+                }
+            }
+
+            if (nonExpiringLiveCellsThisRow > 0)
+            {
+                ++rowsWithNonExpiringCells;
+                nonExpiringLiveCells += nonExpiringLiveCellsThisRow;
+            }
+        }
+
+        return new CachedBTreePartition(iterator.metadata(),
+                                        iterator.partitionKey(),
+                                        holder,
+                                        nowInSec,
+                                        cachedLiveRows,
+                                        rowsWithNonExpiringCells,
+                                        nonTombstoneCellCount,
+                                        nonExpiringLiveCells);
+    }
+
+    /**
+     * The number of rows that were live at the time the partition was cached.
+     *
+     * See {@link ColumnFamilyStore#isFilterFullyCoveredBy} to see why we need this.
+     *
+     * @return the number of rows in this partition that were live at the time the
+     * partition was cached (this can be different from the number of live rows now
+     * due to expiring cells).
+     */
+    public int cachedLiveRows()
+    {
+        return cachedLiveRows;
+    }
+
+    /**
+     * The number of rows in this cached partition that have at least one non-expiring
+     * non-deleted cell.
+     *
+     * Note that this is generally not a very meaningful number, but this is used by
+     * {@link DataLimits#hasEnoughLiveData} as an optimization.
+     *
+     * @return the number of row that have at least one non-expiring non-deleted cell.
+     */
+    public int rowsWithNonExpiringCells()
+    {
+        return rowsWithNonExpiringCells;
+    }
+
+    public int nonTombstoneCellCount()
+    {
+        return nonTombstoneCellCount;
+    }
+
+    public int nonExpiringLiveCells()
+    {
+        return nonExpiringLiveCells;
+    }
+
+    static class Serializer implements ISerializer<CachedPartition>
+    {
+        public void serialize(CachedPartition partition, DataOutputPlus out) throws IOException
+        {
+            int version = MessagingService.current_version;
+
+            assert partition instanceof CachedBTreePartition;
+            CachedBTreePartition p = (CachedBTreePartition)partition;
+
+            out.writeInt(p.createdAtInSec);
+            out.writeInt(p.cachedLiveRows);
+            out.writeInt(p.rowsWithNonExpiringCells);
+            out.writeInt(p.nonTombstoneCellCount);
+            out.writeInt(p.nonExpiringLiveCells);
+            CFMetaData.serializer.serialize(partition.metadata(), out, version);
+            try (UnfilteredRowIterator iter = p.unfilteredIterator())
+            {
+                UnfilteredRowIteratorSerializer.serializer.serialize(iter, null, out, version, p.rowCount());
+            }
+        }
+
+        public CachedPartition deserialize(DataInputPlus in) throws IOException
+        {
+            int version = MessagingService.current_version;
+
+            // Note that it would be slightly simpler to just do
+            //   ArrayBackedCachedPiartition.create(UnfilteredRowIteratorSerializer.serializer.deserialize(...));
+            // However deserializing the header separatly is not a lot harder and allows us to:
+            //   1) get the capacity of the partition so we can size it properly directly
+            //   2) saves the creation of a temporary iterator: rows are directly written to the partition, which
+            //      is slightly faster.
+
+            int createdAtInSec = in.readInt();
+            int cachedLiveRows = in.readInt();
+            int rowsWithNonExpiringCells = in.readInt();
+            int nonTombstoneCellCount = in.readInt();
+            int nonExpiringLiveCells = in.readInt();
+
+
+            CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
+            UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(metadata, null, in, version, SerializationHelper.Flag.LOCAL);
+            assert !header.isReversed && header.rowEstimate >= 0;
+
+            Holder holder;
+            try (UnfilteredRowIterator partition = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version, metadata, SerializationHelper.Flag.LOCAL, header))
+            {
+                holder = ImmutableBTreePartition.build(partition, header.rowEstimate);
+            }
+
+            return new CachedBTreePartition(metadata,
+                                                  header.key,
+                                                  holder,
+                                                  createdAtInSec,
+                                                  cachedLiveRows,
+                                                  rowsWithNonExpiringCells,
+                                                  nonTombstoneCellCount,
+                                                  nonExpiringLiveCells);
+
+        }
+
+        public long serializedSize(CachedPartition partition)
+        {
+            int version = MessagingService.current_version;
+
+            assert partition instanceof CachedBTreePartition;
+            CachedBTreePartition p = (CachedBTreePartition)partition;
+
+            try (UnfilteredRowIterator iter = p.unfilteredIterator())
+            {
+                return TypeSizes.sizeof(p.createdAtInSec)
+                     + TypeSizes.sizeof(p.cachedLiveRows)
+                     + TypeSizes.sizeof(p.rowsWithNonExpiringCells)
+                     + TypeSizes.sizeof(p.nonTombstoneCellCount)
+                     + TypeSizes.sizeof(p.nonExpiringLiveCells)
+                     + CFMetaData.serializer.serializedSize(partition.metadata(), version)
+                     + UnfilteredRowIteratorSerializer.serializer.serializedSize(iter, null, MessagingService.current_version, p.rowCount());
+            }
+        }
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/db/partitions/CachedPartition.java b/src/java/org/apache/cassandra/db/partitions/CachedPartition.java
new file mode 100644
index 0000000..33e6ecc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/CachedPartition.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.cache.IRowCacheEntry;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.io.ISerializer;
+
+/**
+ * A partition stored in the partition cache.
+ *
+ * Note that in practice, the only implementation of this is {@link CachedBTreePartition},
+ * we keep this interface mainly to make it clear what we need from partition in the cache
+ * (that we don't otherwise)
+ */
+public interface CachedPartition extends Partition, IRowCacheEntry
+{
+    public static final ISerializer<CachedPartition> cacheSerializer = new CachedBTreePartition.Serializer();
+
+    /**
+     * The number of {@code Row} objects in this cached partition.
+     *
+     * Please note that this is <b>not</b> the number of <em>live</em> rows since
+     * some of the row may only contains deleted (or expired) information.
+     *
+     * @return the number of row in the partition.
+     */
+    public int rowCount();
+
+    /**
+     * The number of rows that were live at the time the partition was cached.
+     *
+     * See {@link ColumnFamilyStore#isFilterFullyCoveredBy} to see why we need this.
+     *
+     * @return the number of rows in this partition that were live at the time the
+     * partition was cached (this can be different from the number of live rows now
+     * due to expiring cells).
+     */
+    public int cachedLiveRows();
+
+    /**
+     * The number of rows in this cached partition that have at least one non-expiring
+     * non-deleted cell.
+     *
+     * Note that this is generally not a very meaningful number, but this is used by
+     * {@link DataLimits#hasEnoughLiveData} as an optimization.
+     *
+     * @return the number of row that have at least one non-expiring non-deleted cell.
+     */
+    public int rowsWithNonExpiringCells();
+
+    /**
+     * The last row in this cached partition (in order words, the row with the
+     * biggest clustering that the partition contains).
+     *
+     * @return the last row of the partition, or {@code null} if the partition is empty.
+     */
+    public Row lastRow();
+
+    /**
+     * The number of {@code cell} objects that are not tombstone in this cached partition.
+     *
+     * Please note that this is <b>not</b> the number of <em>live</em> cells since
+     * some of the cells might be expired.
+     *
+     * @return the number of non tombstone cells in the partition.
+     */
+    public int nonTombstoneCellCount();
+
+    /**
+     * The number of cells in this cached partition that are neither tombstone nor expiring.
+     *
+     * Note that this is generally not a very meaningful number, but this is used by
+     * {@link DataLimits#hasEnoughLiveData} as an optimization.
+     *
+     * @return the number of cells that are neither tombstones nor expiring.
+     */
+    public int nonExpiringLiveCells();
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java b/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java
new file mode 100644
index 0000000..26a947b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.util.Iterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.rows.*;
+
+public class FilteredPartition extends ImmutableBTreePartition
+{
+    public FilteredPartition(RowIterator rows)
+    {
+        super(rows.metadata(), rows.partitionKey(), build(rows, DeletionInfo.LIVE, false, 16));
+    }
+
+    /**
+     * Create a FilteredPartition holding all the rows of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     */
+    public static FilteredPartition create(RowIterator iterator)
+    {
+        return new FilteredPartition(iterator);
+    }
+
+    public RowIterator rowIterator()
+    {
+        final Iterator<Row> iter = iterator();
+        return new RowIterator()
+        {
+            public CFMetaData metadata()
+            {
+                return metadata;
+            }
+
+            public boolean isReverseOrder()
+            {
+                return false;
+            }
+
+            public PartitionColumns columns()
+            {
+                return FilteredPartition.this.columns();
+            }
+
+            public DecoratedKey partitionKey()
+            {
+                return partitionKey;
+            }
+
+            public Row staticRow()
+            {
+                return FilteredPartition.this.staticRow();
+            }
+
+            public void close() {}
+
+            public boolean hasNext()
+            {
+                return iter.hasNext();
+            }
+
+            public Row next()
+            {
+                return iter.next();
+            }
+
+            public boolean isEmpty()
+            {
+                return staticRow().isEmpty() && !hasRows();
+            }
+        };
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java
new file mode 100644
index 0000000..8db5ee4
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java

@@ -0,0 +1,123 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.rows.*;
+
+public class ImmutableBTreePartition extends AbstractBTreePartition
+{
+
+    protected final Holder holder;
+
+    public ImmutableBTreePartition(CFMetaData metadata,
+                                      DecoratedKey partitionKey,
+                                      PartitionColumns columns,
+                                      Row staticRow,
+                                      Object[] tree,
+                                      DeletionInfo deletionInfo,
+                                      EncodingStats stats)
+    {
+        super(metadata, partitionKey);
+        this.holder = new Holder(columns, tree, deletionInfo, staticRow, stats);
+    }
+
+    protected ImmutableBTreePartition(CFMetaData metadata,
+                                      DecoratedKey partitionKey,
+                                      Holder holder)
+    {
+        super(metadata, partitionKey);
+        this.holder = holder;
+    }
+
+    /**
+     * Creates an {@code ImmutableBTreePartition} holding all the data of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     *
+     * @param iterator the iterator to gather in memory.
+     * @return the created partition.
+     */
+    public static ImmutableBTreePartition create(UnfilteredRowIterator iterator)
+    {
+        return create(iterator, 16);
+    }
+
+    /**
+     * Creates an {@code ImmutableBTreePartition} holding all the data of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     *
+     * @param iterator the iterator to gather in memory.
+     * @param ordered {@code true} if the iterator will return the rows in order, {@code false} otherwise.
+     * @return the created partition.
+     */
+    public static ImmutableBTreePartition create(UnfilteredRowIterator iterator, boolean ordered)
+    {
+        return create(iterator, 16, ordered);
+    }
+
+    /**
+     * Creates an {@code ImmutableBTreePartition} holding all the data of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     *
+     * @param iterator the iterator to gather in memory.
+     * @param initialRowCapacity sizing hint (in rows) to use for the created partition. It should ideally
+     * correspond or be a good estimation of the number or rows in {@code iterator}.
+     * @return the created partition.
+     */
+    public static ImmutableBTreePartition create(UnfilteredRowIterator iterator, int initialRowCapacity)
+    {
+        return create(iterator, initialRowCapacity, true);
+    }
+
+    /**
+     * Creates an {@code ImmutableBTreePartition} holding all the data of the provided iterator.
+     *
+     * Warning: Note that this method does not close the provided iterator and it is
+     * up to the caller to do so.
+     *
+     * @param iterator the iterator to gather in memory.
+     * @param initialRowCapacity sizing hint (in rows) to use for the created partition. It should ideally
+     * correspond or be a good estimation of the number or rows in {@code iterator}.
+     * @param ordered {@code true} if the iterator will return the rows in order, {@code false} otherwise.
+     * @return the created partition.
+     */
+    public static ImmutableBTreePartition create(UnfilteredRowIterator iterator, int initialRowCapacity, boolean ordered)
+    {
+        return new ImmutableBTreePartition(iterator.metadata(), iterator.partitionKey(), build(iterator, initialRowCapacity, ordered, null));
+    }
+
+    protected Holder holder()
+    {
+        return holder;
+    }
+
+    protected boolean canHaveShadowedData()
+    {
+        return false;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/Partition.java b/src/java/org/apache/cassandra/db/partitions/Partition.java
new file mode 100644
index 0000000..04568e9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/Partition.java

@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.utils.SearchIterator;
+
+/**
+ * In-memory representation of a Partition.
+ *
+ * Note that most of the storage engine works through iterators (UnfilteredPartitionIterator) to
+ * avoid "materializing" a full partition/query response in memory as much as possible,
+ * and so Partition objects should be use as sparingly as possible. There is a couple
+ * of cases where we do need to represent partition in-memory (memtables and row cache).
+ */
+public interface Partition
+{
+    public CFMetaData metadata();
+    public DecoratedKey partitionKey();
+    public DeletionTime partitionLevelDeletion();
+
+    public PartitionColumns columns();
+
+    public EncodingStats stats();
+
+    /**
+     * Whether the partition object has no informations at all, including any deletion informations.
+     */
+    public boolean isEmpty();
+
+    /**
+     * Returns the row corresponding to the provided clustering, or null if there is not such row.
+     */
+    public Row getRow(Clustering clustering);
+
+    /**
+     * Returns an iterator that allows to search specific rows efficiently.
+     */
+    public SearchIterator<Clustering, Row> searchIterator(ColumnFilter columns, boolean reversed);
+
+    /**
+     * Returns an UnfilteredRowIterator over all the rows/RT contained by this partition.
+     */
+    public UnfilteredRowIterator unfilteredIterator();
+
+    /**
+     * Returns an UnfilteredRowIterator over the rows/RT contained by this partition
+     * selected by the provided slices.
+     */
+    public UnfilteredRowIterator unfilteredIterator(ColumnFilter columns, Slices slices, boolean reversed);
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionIterator.java b/src/java/org/apache/cassandra/db/partitions/PartitionIterator.java
new file mode 100644
index 0000000..529a9e2
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/PartitionIterator.java

@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.util.Iterator;
+
+import org.apache.cassandra.db.rows.*;
+
+/**
+ * An iterator over a number of (filtered) partition.
+ *
+ * PartitionIterator is to RowIterator what UnfilteredPartitionIterator is to UnfilteredRowIterator
+ * though unlike UnfilteredPartitionIterator, it is not guaranteed that the RowIterator
+ * returned are in partitioner order.
+ *
+ * The object returned by a call to next() is only guaranteed to be
+ * valid until the next call to hasNext() or next(). If a consumer wants to keep a
+ * reference on the returned objects for longer than the iteration, it must
+ * make a copy of it explicitely.
+ */
+public interface PartitionIterator extends BasePartitionIterator<RowIterator>
+{
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java b/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java
new file mode 100644
index 0000000..a3cf746
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java

@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.util.*;
+import java.security.MessageDigest;
+
+import org.apache.cassandra.db.EmptyIterators;
+import org.apache.cassandra.db.transform.FilteredPartitions;
+import org.apache.cassandra.db.transform.MorePartitions;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.utils.AbstractIterator;
+
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.rows.*;
+
+public abstract class PartitionIterators
+{
+    private PartitionIterators() {}
+
+    @SuppressWarnings("resource") // The created resources are returned right away
+    public static RowIterator getOnlyElement(final PartitionIterator iter, SinglePartitionReadCommand command)
+    {
+        // If the query has no results, we'll get an empty iterator, but we still
+        // want a RowIterator out of this method, so we return an empty one.
+        RowIterator toReturn = iter.hasNext()
+                             ? iter.next()
+                             : EmptyIterators.row(command.metadata(),
+                                                  command.partitionKey(),
+                                                  command.clusteringIndexFilter().isReversed());
+
+        // Note that in general, we should wrap the result so that it's close method actually
+        // close the whole PartitionIterator.
+        class Close extends Transformation
+        {
+            public void onPartitionClose()
+            {
+                // asserting this only now because it bothers UnfilteredPartitionIterators.Serializer (which might be used
+                // under the provided DataIter) if hasNext() is called before the previously returned iterator hasn't been fully consumed.
+                boolean hadNext = iter.hasNext();
+                iter.close();
+                assert !hadNext;
+            }
+        }
+        return Transformation.apply(toReturn, new Close());
+    }
+
+    @SuppressWarnings("resource") // The created resources are returned right away
+    public static PartitionIterator concat(final List<PartitionIterator> iterators)
+    {
+        if (iterators.size() == 1)
+            return iterators.get(0);
+
+        class Extend implements MorePartitions<PartitionIterator>
+        {
+            int i = 0;
+            public PartitionIterator moreContents()
+            {
+                if (i >= iterators.size())
+                    return null;
+                return iterators.get(i++);
+            }
+        }
+
+        return MorePartitions.extend(EmptyIterators.partition(), new Extend());
+    }
+
+    public static PartitionIterator singletonIterator(RowIterator iterator)
+    {
+        return new SingletonPartitionIterator(iterator);
+    }
+
+    public static void consume(PartitionIterator iterator)
+    {
+        while (iterator.hasNext())
+        {
+            try (RowIterator partition = iterator.next())
+            {
+                while (partition.hasNext())
+                    partition.next();
+            }
+        }
+    }
+
+    /**
+     * Wraps the provided iterator so it logs the returned rows for debugging purposes.
+     * <p>
+     * Note that this is only meant for debugging as this can log a very large amount of
+     * logging at INFO.
+     */
+    @SuppressWarnings("resource") // The created resources are returned right away
+    public static PartitionIterator loggingIterator(PartitionIterator iterator, final String id)
+    {
+        class Logger extends Transformation<RowIterator>
+        {
+            public RowIterator applyToPartition(RowIterator partition)
+            {
+                return RowIterators.loggingIterator(partition, id);
+            }
+        }
+        return Transformation.apply(iterator, new Logger());
+    }
+
+    private static class SingletonPartitionIterator extends AbstractIterator<RowIterator> implements PartitionIterator
+    {
+        private final RowIterator iterator;
+        private boolean returned;
+
+        private SingletonPartitionIterator(RowIterator iterator)
+        {
+            this.iterator = iterator;
+        }
+
+        protected RowIterator computeNext()
+        {
+            if (returned)
+                return endOfData();
+
+            returned = true;
+            return iterator;
+        }
+
+        public void close()
+        {
+            iterator.close();
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java b/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java
new file mode 100644
index 0000000..510b9c8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.Cell;
+
+public interface PartitionStatisticsCollector
+{
+    public void update(LivenessInfo info);
+    public void update(DeletionTime deletionTime);
+    public void update(Cell cell);
+    public void updateColumnSetPerRow(long columnSetInRow);
+    public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards);
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java
new file mode 100644
index 0000000..3560e90
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java

@@ -0,0 +1,835 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NoSpamLogger;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+/**
+ * Stores updates made on a partition.
+ * <p>
+ * A PartitionUpdate object requires that all writes/additions are performed before we
+ * try to read the updates (attempts to write to the PartitionUpdate after a read method
+ * has been called will result in an exception being thrown). In other words, a Partition
+ * is mutable while it's written but becomes immutable as soon as it is read.
+ * <p>
+ * A typical usage is to create a new update ({@code new PartitionUpdate(metadata, key, columns, capacity)})
+ * and then add rows and range tombstones through the {@code add()} methods (the partition
+ * level deletion time can also be set with {@code addPartitionDeletion()}). However, there
+ * is also a few static helper constructor methods for special cases ({@code emptyUpdate()},
+ * {@code fullPartitionDelete} and {@code singleRowUpdate}).
+ */
+public class PartitionUpdate extends AbstractBTreePartition
+{
+    protected static final Logger logger = LoggerFactory.getLogger(PartitionUpdate.class);
+
+    public static final PartitionUpdateSerializer serializer = new PartitionUpdateSerializer();
+
+    private final int createdAtInSec = FBUtilities.nowInSeconds();
+
+    // Records whether this update is "built", i.e. if the build() method has been called, which
+    // happens when the update is read. Further writing is then rejected though a manual call
+    // to allowNewUpdates() allow new writes. We could make that more implicit but only triggers
+    // really requires that so we keep it simple for now).
+    private volatile boolean isBuilt;
+    private boolean canReOpen = true;
+
+    private Holder holder;
+    private BTree.Builder<Row> rowBuilder;
+    private MutableDeletionInfo deletionInfo;
+
+    private final boolean canHaveShadowedData;
+
+    private PartitionUpdate(CFMetaData metadata,
+                            DecoratedKey key,
+                            PartitionColumns columns,
+                            MutableDeletionInfo deletionInfo,
+                            int initialRowCapacity,
+                            boolean canHaveShadowedData)
+    {
+        super(metadata, key);
+        this.deletionInfo = deletionInfo;
+        this.holder = new Holder(columns, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+        this.canHaveShadowedData = canHaveShadowedData;
+        rowBuilder = builder(initialRowCapacity);
+    }
+
+    private PartitionUpdate(CFMetaData metadata,
+                            DecoratedKey key,
+                            Holder holder,
+                            MutableDeletionInfo deletionInfo,
+                            boolean canHaveShadowedData)
+    {
+        super(metadata, key);
+        this.holder = holder;
+        this.deletionInfo = deletionInfo;
+        this.isBuilt = true;
+        this.canHaveShadowedData = canHaveShadowedData;
+    }
+
+    public PartitionUpdate(CFMetaData metadata,
+                           DecoratedKey key,
+                           PartitionColumns columns,
+                           int initialRowCapacity)
+    {
+        this(metadata, key, columns, MutableDeletionInfo.live(), initialRowCapacity, true);
+    }
+
+    public PartitionUpdate(CFMetaData metadata,
+                           ByteBuffer key,
+                           PartitionColumns columns,
+                           int initialRowCapacity)
+    {
+        this(metadata,
+             metadata.decorateKey(key),
+             columns,
+             initialRowCapacity);
+    }
+
+    /**
+     * Creates a empty immutable partition update.
+     *
+     * @param metadata the metadata for the created update.
+     * @param key the partition key for the created update.
+     *
+     * @return the newly created empty (and immutable) update.
+     */
+    public static PartitionUpdate emptyUpdate(CFMetaData metadata, DecoratedKey key)
+    {
+        MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
+        Holder holder = new Holder(PartitionColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+        return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
+    }
+
+    /**
+     * Creates an immutable partition update that entirely deletes a given partition.
+     *
+     * @param metadata the metadata for the created update.
+     * @param key the partition key for the partition that the created update should delete.
+     * @param timestamp the timestamp for the deletion.
+     * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion.
+     *
+     * @return the newly created partition deletion update.
+     */
+    public static PartitionUpdate fullPartitionDelete(CFMetaData metadata, DecoratedKey key, long timestamp, int nowInSec)
+    {
+        MutableDeletionInfo deletionInfo = new MutableDeletionInfo(timestamp, nowInSec);
+        Holder holder = new Holder(PartitionColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+        return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
+    }
+
+    /**
+     * Creates an immutable partition update that contains a single row update.
+     *
+     * @param metadata the metadata for the created update.
+     * @param key the partition key for the partition to update.
+     * @param row the row for the update (may be null).
+     * @param row the static row for the update (may be null).
+     *
+     * @return the newly created partition update containing only {@code row}.
+     */
+    public static PartitionUpdate singleRowUpdate(CFMetaData metadata, DecoratedKey key, Row row, Row staticRow)
+    {
+        MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
+        Holder holder = new Holder(
+            new PartitionColumns(
+                staticRow == null ? Columns.NONE : Columns.from(staticRow.columns()),
+                row == null ? Columns.NONE : Columns.from(row.columns())
+            ),
+            row == null ? BTree.empty() : BTree.singleton(row),
+            deletionInfo,
+            staticRow == null ? Rows.EMPTY_STATIC_ROW : staticRow,
+            EncodingStats.NO_STATS
+        );
+        return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
+    }
+
+    /**
+     * Creates an immutable partition update that contains a single row update.
+     *
+     * @param metadata the metadata for the created update.
+     * @param key the partition key for the partition to update.
+     * @param row the row for the update (may be static).
+     *
+     * @return the newly created partition update containing only {@code row}.
+     */
+    public static PartitionUpdate singleRowUpdate(CFMetaData metadata, DecoratedKey key, Row row)
+    {
+        return singleRowUpdate(metadata, key, row.isStatic() ? null : row, row.isStatic() ? row : null);
+    }
+
+    /**
+     * Creates an immutable partition update that contains a single row update.
+     *
+     * @param metadata the metadata for the created update.
+     * @param key the partition key for the partition to update.
+     * @param row the row for the update.
+     *
+     * @return the newly created partition update containing only {@code row}.
+     */
+    public static PartitionUpdate singleRowUpdate(CFMetaData metadata, ByteBuffer key, Row row)
+    {
+        return singleRowUpdate(metadata, metadata.decorateKey(key), row);
+    }
+
+    /**
+     * Turns the given iterator into an update.
+     *
+     * Warning: this method does not close the provided iterator, it is up to
+     * the caller to close it.
+     */
+    public static PartitionUpdate fromIterator(UnfilteredRowIterator iterator)
+    {
+        return fromIterator(iterator, true,  null);
+    }
+
+    private static final NoSpamLogger rowMergingLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
+    /**
+     * Removes duplicate rows from incoming iterator, to be used when we can't trust the underlying iterator (like when reading legacy sstables)
+     */
+    public static PartitionUpdate fromPre30Iterator(UnfilteredRowIterator iterator)
+    {
+        return fromIterator(iterator, false, (a, b) -> {
+            CFMetaData cfm = iterator.metadata();
+            rowMergingLogger.warn(String.format("Merging rows from pre 3.0 iterator for partition key: %s",
+                                                cfm.getKeyValidator().getString(iterator.partitionKey().getKey())));
+            return Rows.merge(a, b, FBUtilities.nowInSeconds());
+        });
+    }
+
+    private static PartitionUpdate fromIterator(UnfilteredRowIterator iterator, boolean ordered, BTree.Builder.QuickResolver<Row> quickResolver)
+    {
+        Holder holder = build(iterator, 16, ordered, quickResolver);
+        MutableDeletionInfo deletionInfo = (MutableDeletionInfo) holder.deletionInfo;
+        return new PartitionUpdate(iterator.metadata(), iterator.partitionKey(), holder, deletionInfo, false);
+    }
+
+    public static PartitionUpdate fromIterator(RowIterator iterator)
+    {
+        MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
+        Holder holder = build(iterator, deletionInfo, true, 16);
+        return new PartitionUpdate(iterator.metadata(), iterator.partitionKey(), holder, deletionInfo, false);
+    }
+
+    protected boolean canHaveShadowedData()
+    {
+        return canHaveShadowedData;
+    }
+
+    /**
+     * Deserialize a partition update from a provided byte buffer.
+     *
+     * @param bytes the byte buffer that contains the serialized update.
+     * @param version the version with which the update is serialized.
+     * @param key the partition key for the update. This is only used if {@code version &lt 3.0}
+     * and can be {@code null} otherwise.
+     *
+     * @return the deserialized update or {@code null} if {@code bytes == null}.
+     */
+    public static PartitionUpdate fromBytes(ByteBuffer bytes, int version, DecoratedKey key)
+    {
+        if (bytes == null)
+            return null;
+
+        try
+        {
+            return serializer.deserialize(new DataInputBuffer(bytes, true),
+                                          version,
+                                          SerializationHelper.Flag.LOCAL,
+                                          version < MessagingService.VERSION_30 ? key : null);
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Serialize a partition update as a byte buffer.
+     *
+     * @param update the partition update to serialize.
+     * @param version the version to serialize the update into.
+     *
+     * @return a newly allocated byte buffer containing the serialized update.
+     */
+    public static ByteBuffer toBytes(PartitionUpdate update, int version)
+    {
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            serializer.serialize(update, out, version);
+            return ByteBuffer.wrap(out.getData(), 0, out.getLength());
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Creates a partition update that entirely deletes a given partition.
+     *
+     * @param metadata the metadata for the created update.
+     * @param key the partition key for the partition that the created update should delete.
+     * @param timestamp the timestamp for the deletion.
+     * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion.
+     *
+     * @return the newly created partition deletion update.
+     */
+    public static PartitionUpdate fullPartitionDelete(CFMetaData metadata, ByteBuffer key, long timestamp, int nowInSec)
+    {
+        return fullPartitionDelete(metadata, metadata.decorateKey(key), timestamp, nowInSec);
+    }
+
+    /**
+     * Merges the provided updates, yielding a new update that incorporates all those updates.
+     *
+     * @param updates the collection of updates to merge. This shouldn't be empty.
+     *
+     * @return a partition update that include (merge) all the updates from {@code updates}.
+     */
+    public static PartitionUpdate merge(List<PartitionUpdate> updates)
+    {
+        assert !updates.isEmpty();
+        final int size = updates.size();
+
+        if (size == 1)
+            return Iterables.getOnlyElement(updates);
+
+        int nowInSecs = FBUtilities.nowInSeconds();
+        List<UnfilteredRowIterator> asIterators = Lists.transform(updates, AbstractBTreePartition::unfilteredIterator);
+        return fromIterator(UnfilteredRowIterators.merge(asIterators, nowInSecs));
+    }
+
+    // We override this, because the version in the super-class calls holder(), which build the update preventing
+    // further updates, but that's not necessary here and being able to check at least the partition deletion without
+    // "locking" the update is nice (and used in DataResolver.RepairMergeListener.MergeListener).
+    @Override
+    public DeletionInfo deletionInfo()
+    {
+        return deletionInfo;
+    }
+
+    /**
+     * Modify this update to set every timestamp for live data to {@code newTimestamp} and
+     * every deletion timestamp to {@code newTimestamp - 1}.
+     *
+     * There is no reason to use that expect on the Paxos code path, where we need ensure that
+     * anything inserted use the ballot timestamp (to respect the order of update decided by
+     * the Paxos algorithm). We use {@code newTimestamp - 1} for deletions because tombstones
+     * always win on timestamp equality and we don't want to delete our own insertions
+     * (typically, when we overwrite a collection, we first set a complex deletion to delete the
+     * previous collection before adding new elements. If we were to set that complex deletion
+     * to the same timestamp that the new elements, it would delete those elements). And since
+     * tombstones always wins on timestamp equality, using -1 guarantees our deletion will still
+     * delete anything from a previous update.
+     */
+    public void updateAllTimestamp(long newTimestamp)
+    {
+        Holder holder = holder();
+        deletionInfo.updateAllTimestamp(newTimestamp - 1);
+        Object[] tree = BTree.<Row>transformAndFilter(holder.tree, (x) -> x.updateAllTimestamp(newTimestamp));
+        Row staticRow = holder.staticRow.updateAllTimestamp(newTimestamp);
+        EncodingStats newStats = EncodingStats.Collector.collect(staticRow, BTree.<Row>iterator(tree), deletionInfo);
+        this.holder = new Holder(holder.columns, tree, deletionInfo, staticRow, newStats);
+    }
+
+    /**
+     * The number of "operations" contained in the update.
+     * <p>
+     * This is used by {@code Memtable} to approximate how much work this update does. In practice, this
+     * count how many rows are updated and how many ranges are deleted by the partition update.
+     *
+     * @return the number of "operations" performed by the update.
+     */
+    public int operationCount()
+    {
+        return rowCount()
+             + (staticRow().isEmpty() ? 0 : 1)
+             + deletionInfo.rangeCount()
+             + (deletionInfo.getPartitionDeletion().isLive() ? 0 : 1);
+    }
+
+    /**
+     * The size of the data contained in this update.
+     *
+     * @return the size of the data contained in this update.
+     */
+    public int dataSize()
+    {
+        int size = 0;
+
+        if (holder.staticRow != null)
+        {
+            for (ColumnData cd : holder.staticRow.columnData())
+            {
+                size += cd.dataSize();
+            }
+        }
+
+        for (Row row : this)
+        {
+            size += row.clustering().dataSize();
+            for (ColumnData cd : row)
+                size += cd.dataSize();
+        }
+        return size;
+    }
+
+    @Override
+    public PartitionColumns columns()
+    {
+        // The superclass implementation calls holder(), but that triggers a build of the PartitionUpdate. But since
+        // the columns are passed to the ctor, we know the holder always has the proper columns even if it doesn't have
+        // the built rows yet, so just bypass the holder() method.
+        return holder.columns;
+    }
+
+    protected Holder holder()
+    {
+        maybeBuild();
+        return holder;
+    }
+
+    public EncodingStats stats()
+    {
+        return holder().stats;
+    }
+
+    /**
+     * If a partition update has been read (and is thus unmodifiable), a call to this method
+     * makes the update modifiable again.
+     * <p>
+     * Please note that calling this method won't result in optimal behavior in the sense that
+     * even if very little is added to the update after this call, the whole update will be sorted
+     * again on read. This should thus be used sparingly (and if it turns that we end up using
+     * this often, we should consider optimizing the behavior).
+     */
+    public synchronized void allowNewUpdates()
+    {
+        if (!canReOpen)
+            throw new IllegalStateException("You cannot do more updates on collectCounterMarks has been called");
+
+        // This is synchronized to make extra sure things work properly even if this is
+        // called concurrently with sort() (which should be avoided in the first place, but
+        // better safe than sorry).
+        isBuilt = false;
+        if (rowBuilder == null)
+            rowBuilder = builder(16);
+    }
+
+    private BTree.Builder<Row> builder(int initialCapacity)
+    {
+        return BTree.<Row>builder(metadata.comparator, initialCapacity)
+                    .setQuickResolver((a, b) ->
+                                      Rows.merge(a, b, createdAtInSec));
+    }
+
+    /**
+     * Returns an iterator that iterates over the rows of this update in clustering order.
+     * <p>
+     * Note that this might trigger a sorting of the update, and as such the update will not
+     * be modifiable anymore after this call.
+     *
+     * @return an iterator over the rows of this update.
+     */
+    @Override
+    public Iterator<Row> iterator()
+    {
+        maybeBuild();
+        return super.iterator();
+    }
+
+    @Override
+    public SliceableUnfilteredRowIterator sliceableUnfilteredIterator(ColumnFilter columns, boolean reversed)
+    {
+        maybeBuild();
+        return super.sliceableUnfilteredIterator(columns, reversed);
+    }
+
+    /**
+     * Validates the data contained in this update.
+     *
+     * @throws org.apache.cassandra.serializers.MarshalException if some of the data contained in this update is corrupted.
+     */
+    public void validate()
+    {
+        for (Row row : this)
+        {
+            metadata().comparator.validate(row.clustering());
+            for (ColumnData cd : row)
+                cd.validate();
+        }
+    }
+
+    /**
+     * The maximum timestamp used in this update.
+     *
+     * @return the maximum timestamp used in this update.
+     */
+    public long maxTimestamp()
+    {
+        maybeBuild();
+
+        long maxTimestamp = deletionInfo.maxTimestamp();
+        for (Row row : this)
+        {
+            maxTimestamp = Math.max(maxTimestamp, row.primaryKeyLivenessInfo().timestamp());
+            for (ColumnData cd : row)
+            {
+                if (cd.column().isSimple())
+                {
+                    maxTimestamp = Math.max(maxTimestamp, ((Cell)cd).timestamp());
+                }
+                else
+                {
+                    ComplexColumnData complexData = (ComplexColumnData)cd;
+                    maxTimestamp = Math.max(maxTimestamp, complexData.complexDeletion().markedForDeleteAt());
+                    for (Cell cell : complexData)
+                        maxTimestamp = Math.max(maxTimestamp, cell.timestamp());
+                }
+            }
+        }
+
+        if (holder.staticRow != null)
+        {
+            for (ColumnData cd : holder.staticRow.columnData())
+            {
+                if (cd.column().isSimple())
+                {
+                    maxTimestamp = Math.max(maxTimestamp, ((Cell) cd).timestamp());
+                }
+                else
+                {
+                    ComplexColumnData complexData = (ComplexColumnData) cd;
+                    maxTimestamp = Math.max(maxTimestamp, complexData.complexDeletion().markedForDeleteAt());
+                    for (Cell cell : complexData)
+                        maxTimestamp = Math.max(maxTimestamp, cell.timestamp());
+                }
+            }
+        }
+        return maxTimestamp;
+    }
+
+    /**
+     * For an update on a counter table, returns a list containing a {@code CounterMark} for
+     * every counter contained in the update.
+     *
+     * @return a list with counter marks for every counter in this update.
+     */
+    public List<CounterMark> collectCounterMarks()
+    {
+        assert metadata().isCounter();
+        maybeBuild();
+        // We will take aliases on the rows of this update, and update them in-place. So we should be sure the
+        // update is now immutable for all intent and purposes.
+        canReOpen = false;
+
+        List<CounterMark> marks = new ArrayList<>();
+        addMarksForRow(staticRow(), marks);
+        for (Row row : this)
+            addMarksForRow(row, marks);
+        return marks;
+    }
+
+    private void addMarksForRow(Row row, List<CounterMark> marks)
+    {
+        for (Cell cell : row.cells())
+        {
+            if (cell.isCounterCell())
+                marks.add(new CounterMark(row, cell.column(), cell.path()));
+        }
+    }
+
+    private void assertNotBuilt()
+    {
+        if (isBuilt)
+            throw new IllegalStateException("An update should not be written again once it has been read");
+    }
+
+    public void addPartitionDeletion(DeletionTime deletionTime)
+    {
+        assertNotBuilt();
+        deletionInfo.add(deletionTime);
+    }
+
+    public void add(RangeTombstone range)
+    {
+        assertNotBuilt();
+        deletionInfo.add(range, metadata.comparator);
+    }
+
+    /**
+     * Adds a row to this update.
+     *
+     * There is no particular assumption made on the order of row added to a partition update. It is further
+     * allowed to add the same row (more precisely, multiple row objects for the same clustering).
+     *
+     * Note however that the columns contained in the added row must be a subset of the columns used when
+     * creating this update.
+     *
+     * @param row the row to add.
+     */
+    public void add(Row row)
+    {
+        if (row.isEmpty())
+            return;
+
+        assertNotBuilt();
+
+        if (row.isStatic())
+        {
+            // this assert is expensive, and possibly of limited value; we should consider removing it
+            // or introducing a new class of assertions for test purposes
+            assert columns().statics.containsAll(row.columns()) : columns().statics + " is not superset of " + row.columns();
+            Row staticRow = holder.staticRow.isEmpty()
+                      ? row
+                      : Rows.merge(holder.staticRow, row, createdAtInSec);
+            holder = new Holder(holder.columns, holder.tree, holder.deletionInfo, staticRow, holder.stats);
+        }
+        else
+        {
+            // this assert is expensive, and possibly of limited value; we should consider removing it
+            // or introducing a new class of assertions for test purposes
+            assert columns().regulars.containsAll(row.columns()) : columns().regulars + " is not superset of " + row.columns();
+            rowBuilder.add(row);
+        }
+    }
+
+    private void maybeBuild()
+    {
+        if (isBuilt)
+            return;
+
+        build();
+    }
+
+    private synchronized void build()
+    {
+        if (isBuilt)
+            return;
+
+        Holder holder = this.holder;
+        Object[] cur = holder.tree;
+        Object[] add = rowBuilder.build();
+        Object[] merged = BTree.<Row>merge(cur, add, metadata.comparator,
+                                           UpdateFunction.Simple.of((a, b) -> Rows.merge(a, b, createdAtInSec)));
+
+        assert deletionInfo == holder.deletionInfo;
+        EncodingStats newStats = EncodingStats.Collector.collect(holder.staticRow, BTree.<Row>iterator(merged), deletionInfo);
+
+        this.holder = new Holder(holder.columns, merged, holder.deletionInfo, holder.staticRow, newStats);
+        rowBuilder = null;
+        isBuilt = true;
+    }
+
+    @Override
+    public String toString()
+    {
+        if (isBuilt)
+            return super.toString();
+
+        // We intentionally override AbstractBTreePartition#toString() to avoid iterating over the rows in the
+        // partition, which can result in build() being triggered and lead to errors if the PartitionUpdate is later
+        // modified.
+
+        StringBuilder sb = new StringBuilder();
+        sb.append(String.format("[%s.%s] key=%s columns=%s",
+                                metadata.ksName,
+                                metadata.cfName,
+                                metadata.getKeyValidator().getString(partitionKey().getKey()),
+                                columns()));
+
+        sb.append("\n    deletionInfo=").append(deletionInfo);
+        sb.append(" (not built)");
+        return sb.toString();
+    }
+
+    public static class PartitionUpdateSerializer
+    {
+        public void serialize(PartitionUpdate update, DataOutputPlus out, int version) throws IOException
+        {
+            try (UnfilteredRowIterator iter = update.sliceableUnfilteredIterator())
+            {
+                assert !iter.isReverseOrder();
+
+                if (version < MessagingService.VERSION_30)
+                {
+                    LegacyLayout.serializeAsLegacyPartition(null, iter, out, version);
+                }
+                else
+                {
+                    CFMetaData.serializer.serialize(update.metadata(), out, version);
+                    UnfilteredRowIteratorSerializer.serializer.serialize(iter, null, out, version, update.rowCount());
+                }
+            }
+        }
+
+        public PartitionUpdate deserialize(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException
+        {
+            if (version >= MessagingService.VERSION_30)
+            {
+                assert key == null; // key is only there for the old format
+                return deserialize30(in, version, flag);
+            }
+            else
+            {
+                assert key != null;
+                return deserializePre30(in, version, flag, key);
+            }
+        }
+
+        // Used to share same decorated key between updates.
+        public PartitionUpdate deserialize(DataInputPlus in, int version, SerializationHelper.Flag flag, DecoratedKey key) throws IOException
+        {
+            if (version >= MessagingService.VERSION_30)
+            {
+                return deserialize30(in, version, flag);
+            }
+            else
+            {
+                assert key != null;
+                return deserializePre30(in, version, flag, key.getKey());
+            }
+        }
+
+        private static PartitionUpdate deserialize30(DataInputPlus in, int version, SerializationHelper.Flag flag) throws IOException
+        {
+            CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
+            UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(metadata, null, in, version, flag);
+            if (header.isEmpty)
+                return emptyUpdate(metadata, header.key);
+
+            assert !header.isReversed;
+            assert header.rowEstimate >= 0;
+
+            MutableDeletionInfo.Builder deletionBuilder = MutableDeletionInfo.builder(header.partitionDeletion, metadata.comparator, false);
+            BTree.Builder<Row> rows = BTree.builder(metadata.comparator, header.rowEstimate);
+            rows.auto(false);
+
+            try (UnfilteredRowIterator partition = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version, metadata, flag, header))
+            {
+                while (partition.hasNext())
+                {
+                    Unfiltered unfiltered = partition.next();
+                    if (unfiltered.kind() == Unfiltered.Kind.ROW)
+                        rows.add((Row)unfiltered);
+                    else
+                        deletionBuilder.add((RangeTombstoneMarker)unfiltered);
+                }
+            }
+
+            MutableDeletionInfo deletionInfo = deletionBuilder.build();
+            return new PartitionUpdate(metadata,
+                                       header.key,
+                                       new Holder(header.sHeader.columns(), rows.build(), deletionInfo, header.staticRow, header.sHeader.stats()),
+                                       deletionInfo,
+                                       false);
+        }
+
+        private static PartitionUpdate deserializePre30(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException
+        {
+            try (UnfilteredRowIterator iterator = LegacyLayout.deserializeLegacyPartition(in, version, flag, key))
+            {
+                assert iterator != null; // This is only used in mutation, and mutation have never allowed "null" column families
+                return PartitionUpdate.fromPre30Iterator(iterator);
+            }
+        }
+
+        public long serializedSize(PartitionUpdate update, int version)
+        {
+            try (UnfilteredRowIterator iter = update.sliceableUnfilteredIterator())
+            {
+                if (version < MessagingService.VERSION_30)
+                    return LegacyLayout.serializedSizeAsLegacyPartition(null, iter, version);
+
+                return CFMetaData.serializer.serializedSize(update.metadata(), version)
+                     + UnfilteredRowIteratorSerializer.serializer.serializedSize(iter, null, version, update.rowCount());
+            }
+        }
+    }
+
+    /**
+     * A counter mark is basically a pointer to a counter update inside this partition update. That pointer allows
+     * us to update the counter value based on the pre-existing value read during the read-before-write that counters
+     * do. See {@link CounterMutation} to understand how this is used.
+     */
+    public static class CounterMark
+    {
+        private final Row row;
+        private final ColumnDefinition column;
+        private final CellPath path;
+
+        private CounterMark(Row row, ColumnDefinition column, CellPath path)
+        {
+            this.row = row;
+            this.column = column;
+            this.path = path;
+        }
+
+        public Clustering clustering()
+        {
+            return row.clustering();
+        }
+
+        public ColumnDefinition column()
+        {
+            return column;
+        }
+
+        public CellPath path()
+        {
+            return path;
+        }
+
+        public ByteBuffer value()
+        {
+            return path == null
+                 ? row.getCell(column).value()
+                 : row.getCell(column, path).value();
+        }
+
+        public void setValue(ByteBuffer value)
+        {
+            // This is a bit of a giant hack as this is the only place where we mutate a Row object. This makes it more efficient
+            // for counters however and this won't be needed post-#6506 so that's probably fine.
+            assert row instanceof BTreeRow;
+            ((BTreeRow)row).setValue(column, path, value);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java b/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java
new file mode 100644
index 0000000..5cc9145
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java

@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.util.function.Predicate;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.Transformation;
+
+public abstract class PurgeFunction extends Transformation<UnfilteredRowIterator>
+{
+    private final boolean isForThrift;
+    private final DeletionPurger purger;
+    private final int nowInSec;
+
+    private final boolean enforceStrictLiveness;
+    private boolean isReverseOrder;
+
+    public PurgeFunction(boolean isForThrift,
+                         int nowInSec,
+                         int gcBefore,
+                         int oldestUnrepairedTombstone,
+                         boolean onlyPurgeRepairedTombstones,
+                         boolean enforceStrictLiveness)
+    {
+        this.isForThrift = isForThrift;
+        this.nowInSec = nowInSec;
+        this.purger = (timestamp, localDeletionTime) ->
+                      !(onlyPurgeRepairedTombstones && localDeletionTime >= oldestUnrepairedTombstone)
+                      && localDeletionTime < gcBefore
+                      && getPurgeEvaluator().test(timestamp);
+        this.enforceStrictLiveness = enforceStrictLiveness;
+    }
+
+    protected abstract Predicate<Long> getPurgeEvaluator();
+
+    // Called at the beginning of each new partition
+    protected void onNewPartition(DecoratedKey partitionKey)
+    {
+    }
+
+    // Called for each partition that had only purged infos and are empty post-purge.
+    protected void onEmptyPartitionPostPurge(DecoratedKey partitionKey)
+    {
+    }
+
+    // Called for every unfiltered. Meant for CompactionIterator to update progress
+    protected void updateProgress()
+    {
+    }
+
+    @Override
+    protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
+    {
+        onNewPartition(partition.partitionKey());
+
+        isReverseOrder = partition.isReverseOrder();
+        UnfilteredRowIterator purged = Transformation.apply(partition, this);
+        if (!isForThrift && purged.isEmpty())
+        {
+            onEmptyPartitionPostPurge(purged.partitionKey());
+            purged.close();
+            return null;
+        }
+
+        return purged;
+    }
+
+    @Override
+    protected DeletionTime applyToDeletion(DeletionTime deletionTime)
+    {
+        return purger.shouldPurge(deletionTime) ? DeletionTime.LIVE : deletionTime;
+    }
+
+    @Override
+    protected Row applyToStatic(Row row)
+    {
+        updateProgress();
+        return row.purge(purger, nowInSec, enforceStrictLiveness);
+    }
+
+    @Override
+    protected Row applyToRow(Row row)
+    {
+        updateProgress();
+        return row.purge(purger, nowInSec, enforceStrictLiveness);
+    }
+
+    @Override
+    protected RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+    {
+        updateProgress();
+        boolean reversed = isReverseOrder;
+        if (marker.isBoundary())
+        {
+            // We can only skip the whole marker if both deletion time are purgeable.
+            // If only one of them is, filterTombstoneMarker will deal with it.
+            RangeTombstoneBoundaryMarker boundary = (RangeTombstoneBoundaryMarker)marker;
+            boolean shouldPurgeClose = purger.shouldPurge(boundary.closeDeletionTime(reversed));
+            boolean shouldPurgeOpen = purger.shouldPurge(boundary.openDeletionTime(reversed));
+
+            if (shouldPurgeClose)
+            {
+                if (shouldPurgeOpen)
+                    return null;
+
+                return boundary.createCorrespondingOpenMarker(reversed);
+            }
+
+            return shouldPurgeOpen
+                   ? boundary.createCorrespondingCloseMarker(reversed)
+                   : marker;
+        }
+        else
+        {
+            return purger.shouldPurge(((RangeTombstoneBoundMarker)marker).deletionTime()) ? null : marker;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/SingletonUnfilteredPartitionIterator.java b/src/java/org/apache/cassandra/db/partitions/SingletonUnfilteredPartitionIterator.java
new file mode 100644
index 0000000..1f966db
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/SingletonUnfilteredPartitionIterator.java

@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.util.NoSuchElementException;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+public class SingletonUnfilteredPartitionIterator implements UnfilteredPartitionIterator
+{
+    private final UnfilteredRowIterator iter;
+    private final boolean isForThrift;
+    private boolean returned;
+
+    public SingletonUnfilteredPartitionIterator(UnfilteredRowIterator iter, boolean isForThrift)
+    {
+        this.iter = iter;
+        this.isForThrift = isForThrift;
+    }
+
+    public boolean isForThrift()
+    {
+        return isForThrift;
+    }
+
+    public CFMetaData metadata()
+    {
+        return iter.metadata();
+    }
+
+    public boolean hasNext()
+    {
+        return !returned;
+    }
+
+    public UnfilteredRowIterator next()
+    {
+        if (returned)
+            throw new NoSuchElementException();
+
+        returned = true;
+        return iter;
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void close()
+    {
+        if (!returned)
+            iter.close();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterator.java b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterator.java
new file mode 100644
index 0000000..201c934
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterator.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+/**
+ * An iterator over a number of unfiltered partitions (i.e. partitions containing deletion informations).
+ *
+ * The object returned by a call to next() is only guaranteed to be
+ * valid until the next call to hasNext() or next(). If a consumer wants to keep a
+ * reference on the returned objects for longer than the iteration, it must
+ * make a copy of it explicitely.
+ */
+public interface UnfilteredPartitionIterator extends BasePartitionIterator<UnfilteredRowIterator>
+{
+    /**
+     * Whether that partition iterator is for a thrift queries.
+     * <p>
+     * If this is true, the partition iterator may return some empty UnfilteredRowIterator and those
+     * should be preserved as thrift include partitions that "exists" (have some cells even
+     * if this are actually deleted) but have nothing matching the query.
+     *
+     * @return whether the iterator is for a thrift query.
+     */
+    public boolean isForThrift();
+
+    public CFMetaData metadata();
+}

diff --git a/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java
new file mode 100644
index 0000000..bff910e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java

@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.io.IOError;
+import java.io.IOException;
+import java.security.MessageDigest;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.FilteredPartitions;
+import org.apache.cassandra.db.transform.MorePartitions;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.MergeIterator;
+
+/**
+ * Static methods to work with partition iterators.
+ */
+public abstract class UnfilteredPartitionIterators
+{
+    private static final Serializer serializer = new Serializer();
+
+    private static final Comparator<UnfilteredRowIterator> partitionComparator = (p1, p2) -> p1.partitionKey().compareTo(p2.partitionKey());
+
+    private UnfilteredPartitionIterators() {}
+
+    public interface MergeListener
+    {
+        public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions);
+        public default void close() {}
+    }
+
+    @SuppressWarnings("resource") // The created resources are returned right away
+    public static UnfilteredRowIterator getOnlyElement(final UnfilteredPartitionIterator iter, SinglePartitionReadCommand command)
+    {
+        // If the query has no results, we'll get an empty iterator, but we still
+        // want a RowIterator out of this method, so we return an empty one.
+        UnfilteredRowIterator toReturn = iter.hasNext()
+                              ? iter.next()
+                              : EmptyIterators.unfilteredRow(command.metadata(),
+                                                             command.partitionKey(),
+                                                             command.clusteringIndexFilter().isReversed());
+
+        // Note that in general, we should wrap the result so that it's close method actually
+        // close the whole UnfilteredPartitionIterator.
+        class Close extends Transformation
+        {
+            public void onPartitionClose()
+            {
+                // asserting this only now because it bothers Serializer if hasNext() is called before
+                // the previously returned iterator hasn't been fully consumed.
+                boolean hadNext = iter.hasNext();
+                iter.close();
+                assert !hadNext;
+            }
+        }
+        return Transformation.apply(toReturn, new Close());
+    }
+
+    public static UnfilteredPartitionIterator concat(final List<UnfilteredPartitionIterator> iterators)
+    {
+        if (iterators.size() == 1)
+            return iterators.get(0);
+
+        class Extend implements MorePartitions<UnfilteredPartitionIterator>
+        {
+            int i = 1;
+            public UnfilteredPartitionIterator moreContents()
+            {
+                if (i >= iterators.size())
+                    return null;
+                return iterators.get(i++);
+            }
+        }
+        return MorePartitions.extend(iterators.get(0), new Extend());
+    }
+
+    public static PartitionIterator filter(final UnfilteredPartitionIterator iterator, final int nowInSec)
+    {
+        return FilteredPartitions.filter(iterator, nowInSec);
+    }
+
+    public static UnfilteredPartitionIterator merge(final List<? extends UnfilteredPartitionIterator> iterators, final int nowInSec, final MergeListener listener)
+    {
+        assert !iterators.isEmpty();
+
+        final boolean isForThrift = iterators.get(0).isForThrift();
+        final CFMetaData metadata = iterators.get(0).metadata();
+
+        final MergeIterator<UnfilteredRowIterator, UnfilteredRowIterator> merged = MergeIterator.get(iterators, partitionComparator, new MergeIterator.Reducer<UnfilteredRowIterator, UnfilteredRowIterator>()
+        {
+            private final List<UnfilteredRowIterator> toMerge = new ArrayList<>(iterators.size());
+
+            private DecoratedKey partitionKey;
+            private boolean isReverseOrder;
+
+            public void reduce(int idx, UnfilteredRowIterator current)
+            {
+                partitionKey = current.partitionKey();
+                isReverseOrder = current.isReverseOrder();
+
+                // Note that because the MergeListener cares about it, we want to preserve the index of the iterator.
+                // Non-present iterator will thus be set to empty in getReduced.
+                toMerge.set(idx, current);
+            }
+
+            protected UnfilteredRowIterator getReduced()
+            {
+                UnfilteredRowIterators.MergeListener rowListener = listener == null
+                                                                 ? null
+                                                                 : listener.getRowMergeListener(partitionKey, toMerge);
+
+                // Replace nulls by empty iterators
+                for (int i = 0; i < toMerge.size(); i++)
+                    if (toMerge.get(i) == null)
+                        toMerge.set(i, EmptyIterators.unfilteredRow(metadata, partitionKey, isReverseOrder));
+
+                return UnfilteredRowIterators.merge(toMerge, nowInSec, rowListener);
+            }
+
+            protected void onKeyChange()
+            {
+                toMerge.clear();
+                for (int i = 0; i < iterators.size(); i++)
+                    toMerge.add(null);
+            }
+        });
+
+        return new AbstractUnfilteredPartitionIterator()
+        {
+            public boolean isForThrift()
+            {
+                return isForThrift;
+            }
+
+            public CFMetaData metadata()
+            {
+                return metadata;
+            }
+
+            public boolean hasNext()
+            {
+                return merged.hasNext();
+            }
+
+            public UnfilteredRowIterator next()
+            {
+                return merged.next();
+            }
+
+            @Override
+            public void close()
+            {
+                merged.close();
+
+                if (listener != null)
+                    listener.close();
+            }
+        };
+    }
+
+    public static UnfilteredPartitionIterator mergeLazily(final List<? extends UnfilteredPartitionIterator> iterators, final int nowInSec)
+    {
+        assert !iterators.isEmpty();
+
+        if (iterators.size() == 1)
+            return iterators.get(0);
+
+        final boolean isForThrift = iterators.get(0).isForThrift();
+        final CFMetaData metadata = iterators.get(0).metadata();
+
+        final MergeIterator<UnfilteredRowIterator, UnfilteredRowIterator> merged = MergeIterator.get(iterators, partitionComparator, new MergeIterator.Reducer<UnfilteredRowIterator, UnfilteredRowIterator>()
+        {
+            private final List<UnfilteredRowIterator> toMerge = new ArrayList<>(iterators.size());
+
+            @Override
+            public boolean trivialReduceIsTrivial()
+            {
+                return false;
+            }
+
+            public void reduce(int idx, UnfilteredRowIterator current)
+            {
+                toMerge.add(current);
+            }
+
+            protected UnfilteredRowIterator getReduced()
+            {
+                return new LazilyInitializedUnfilteredRowIterator(toMerge.get(0).partitionKey())
+                {
+                    protected UnfilteredRowIterator initializeIterator()
+                    {
+                        return UnfilteredRowIterators.merge(toMerge, nowInSec);
+                    }
+                };
+            }
+
+            protected void onKeyChange()
+            {
+                toMerge.clear();
+            }
+        });
+
+        return new AbstractUnfilteredPartitionIterator()
+        {
+            public boolean isForThrift()
+            {
+                return isForThrift;
+            }
+
+            public CFMetaData metadata()
+            {
+                return metadata;
+            }
+
+            public boolean hasNext()
+            {
+                return merged.hasNext();
+            }
+
+            public UnfilteredRowIterator next()
+            {
+                return merged.next();
+            }
+
+            @Override
+            public void close()
+            {
+                merged.close();
+            }
+        };
+    }
+
+    /**
+     * Digests the the provided iterator.
+     *
+     * Caller must close the provided iterator.
+     *
+     * @param command the command that has yield {@code iterator}. This can be null if {@code version >= MessagingService.VERSION_30}
+     * as this is only used when producing digest to be sent to legacy nodes.
+     * @param iterator the iterator to digest.
+     * @param digest the {@code MessageDigest} to use for the digest.
+     * @param version the messaging protocol to use when producing the digest.
+     */
+    public static void digest(ReadCommand command, UnfilteredPartitionIterator iterator, MessageDigest digest, int version)
+    {
+        while (iterator.hasNext())
+        {
+            try (UnfilteredRowIterator partition = iterator.next())
+            {
+                UnfilteredRowIterators.digest(command, partition, digest, version);
+            }
+        }
+    }
+
+    public static Serializer serializerForIntraNode()
+    {
+        return serializer;
+    }
+
+    /**
+     * Wraps the provided iterator so it logs the returned rows/RT for debugging purposes.
+     * <p>
+     * Note that this is only meant for debugging as this can log a very large amount of
+     * logging at INFO.
+     */
+    public static UnfilteredPartitionIterator loggingIterator(UnfilteredPartitionIterator iterator, final String id, final boolean fullDetails)
+    {
+        class Logging extends Transformation<UnfilteredRowIterator>
+        {
+            public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
+            {
+                return UnfilteredRowIterators.loggingIterator(partition, id, fullDetails);
+            }
+        }
+        return Transformation.apply(iterator, new Logging());
+    }
+
+    /**
+     * Serialize each UnfilteredSerializer one after the other, with an initial byte that indicates whether
+     * we're done or not.
+     */
+    public static class Serializer
+    {
+        public void serialize(UnfilteredPartitionIterator iter, ColumnFilter selection, DataOutputPlus out, int version) throws IOException
+        {
+            assert version >= MessagingService.VERSION_30; // We handle backward compatibility directy in ReadResponse.LegacyRangeSliceReplySerializer
+
+            out.writeBoolean(iter.isForThrift());
+            while (iter.hasNext())
+            {
+                out.writeBoolean(true);
+                try (UnfilteredRowIterator partition = iter.next())
+                {
+                    UnfilteredRowIteratorSerializer.serializer.serialize(partition, selection, out, version);
+                }
+            }
+            out.writeBoolean(false);
+        }
+
+        public UnfilteredPartitionIterator deserialize(final DataInputPlus in, final int version, final CFMetaData metadata, final ColumnFilter selection, final SerializationHelper.Flag flag) throws IOException
+        {
+            assert version >= MessagingService.VERSION_30; // We handle backward compatibility directy in ReadResponse.LegacyRangeSliceReplySerializer
+            final boolean isForThrift = in.readBoolean();
+
+            return new AbstractUnfilteredPartitionIterator()
+            {
+                private UnfilteredRowIterator next;
+                private boolean hasNext;
+                private boolean nextReturned = true;
+
+                public boolean isForThrift()
+                {
+                    return isForThrift;
+                }
+
+                public CFMetaData metadata()
+                {
+                    return metadata;
+                }
+
+                public boolean hasNext()
+                {
+                    if (!nextReturned)
+                        return hasNext;
+
+                    /*
+                     * We must consume the previous iterator before we start deserializing the next partition, so
+                     * that we start from the right position in the byte stream.
+                     *
+                     * It's possible however that it hasn't been fully consumed by upstream consumers - for example,
+                     * if a per partition limit caused merge iterator to stop early (see CASSANDRA-13911).
+                     *
+                     * In that case we must drain the unconsumed iterator fully ourselves, here.
+                     *
+                     * NOTE: transformations of the upstream BaseRows won't be applied for these consumed elements,
+                     * so, for exmaple, they won't be counted.
+                     */
+                    if (null != next)
+                        while (next.hasNext())
+                            next.next();
+
+                    try
+                    {
+                        hasNext = in.readBoolean();
+                        nextReturned = false;
+                        return hasNext;
+                    }
+                    catch (IOException e)
+                    {
+                        throw new IOError(e);
+                    }
+                }
+
+                public UnfilteredRowIterator next()
+                {
+                    if (nextReturned && !hasNext())
+                        throw new NoSuchElementException();
+
+                    try
+                    {
+                        nextReturned = true;
+                        next = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version, metadata, selection, flag);
+                        return next;
+                    }
+                    catch (IOException e)
+                    {
+                        throw new IOError(e);
+                    }
+                }
+
+                @Override
+                public void close()
+                {
+                    if (next != null)
+                        next.close();
+                }
+            };
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/AbstractCell.java b/src/java/org/apache/cassandra/db/rows/AbstractCell.java
new file mode 100644
index 0000000..576351e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/AbstractCell.java

@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.security.MessageDigest;
+import java.util.Objects;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Base abstract class for {@code Cell} implementations.
+ *
+ * Unless you have a very good reason not to, every cell implementation
+ * should probably extend this class.
+ */
+public abstract class AbstractCell extends Cell
+{
+    protected AbstractCell(ColumnDefinition column)
+    {
+        super(column);
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        if (isCounterCell())
+        {
+            CounterContext.instance().updateDigest(digest, value());
+        }
+        else
+        {
+            digest.update(value().duplicate());
+        }
+
+        FBUtilities.updateWithLong(digest, timestamp());
+        FBUtilities.updateWithInt(digest, ttl());
+        FBUtilities.updateWithBoolean(digest, isCounterCell());
+        if (path() != null)
+            path().digest(digest);
+    }
+
+    public void validate()
+    {
+        if (ttl() < 0)
+            throw new MarshalException("A TTL should not be negative");
+        if (localDeletionTime() < 0)
+            throw new MarshalException("A local deletion time should not be negative");
+        if (isExpiring() && localDeletionTime() == NO_DELETION_TIME)
+            throw new MarshalException("Shoud not have a TTL without an associated local deletion time");
+
+        if (isTombstone())
+        {
+            // If cell is a tombstone, it shouldn't have a value.
+            if (value().hasRemaining())
+                throw new MarshalException("A tombstone should not have a value");
+        }
+        else
+        {
+            column().validateCellValue(value());
+        }
+
+        if (path() != null)
+            column().validateCellPath(path());
+    }
+
+    public long maxTimestamp()
+    {
+        return timestamp();
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (this == other)
+            return true;
+
+        if(!(other instanceof Cell))
+            return false;
+
+        Cell that = (Cell)other;
+        return this.column().equals(that.column())
+            && this.isCounterCell() == that.isCounterCell()
+            && this.timestamp() == that.timestamp()
+            && this.ttl() == that.ttl()
+            && this.localDeletionTime() == that.localDeletionTime()
+            && Objects.equals(this.value(), that.value())
+            && Objects.equals(this.path(), that.path());
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(column(), isCounterCell(), timestamp(), ttl(), localDeletionTime(), value(), path());
+    }
+
+    @Override
+    public String toString()
+    {
+        if (isCounterCell())
+            return String.format("[%s=%d ts=%d]", column().name, CounterContext.instance().total(value()), timestamp());
+
+        AbstractType<?> type = column().type;
+        if (type instanceof CollectionType && type.isMultiCell())
+        {
+            CollectionType ct = (CollectionType)type;
+            return String.format("[%s[%s]=%s %s]",
+                                 column().name,
+                                 ct.nameComparator().getString(path().get(0)),
+                                 ct.valueComparator().getString(value()),
+                                 livenessInfoString());
+        }
+        if (isTombstone())
+            return String.format("[%s=<tombstone> %s]", column().name, livenessInfoString());
+        else
+            return String.format("[%s=%s %s]", column().name, type.getString(value()), livenessInfoString());
+    }
+
+    private String livenessInfoString()
+    {
+        if (isExpiring())
+            return String.format("ts=%d ttl=%d ldt=%d", timestamp(), ttl(), localDeletionTime());
+        else if (isTombstone())
+            return String.format("ts=%d ldt=%d", timestamp(), localDeletionTime());
+        else
+            return String.format("ts=%d", timestamp());
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/db/rows/AbstractRangeTombstoneMarker.java b/src/java/org/apache/cassandra/db/rows/AbstractRangeTombstoneMarker.java
new file mode 100644
index 0000000..b1ee7ec
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/AbstractRangeTombstoneMarker.java

@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+
+public abstract class AbstractRangeTombstoneMarker implements RangeTombstoneMarker
+{
+    protected final RangeTombstone.Bound bound;
+
+    protected AbstractRangeTombstoneMarker(RangeTombstone.Bound bound)
+    {
+        this.bound = bound;
+    }
+
+    public RangeTombstone.Bound clustering()
+    {
+        return bound;
+    }
+
+    public Unfiltered.Kind kind()
+    {
+        return Unfiltered.Kind.RANGE_TOMBSTONE_MARKER;
+    }
+
+    public boolean isBoundary()
+    {
+        return bound.isBoundary();
+    }
+
+    public boolean isOpen(boolean reversed)
+    {
+        return bound.isOpen(reversed);
+    }
+
+    public boolean isClose(boolean reversed)
+    {
+        return bound.isClose(reversed);
+    }
+
+    public void validateData(CFMetaData metadata)
+    {
+        Slice.Bound bound = clustering();
+        for (int i = 0; i < bound.size(); i++)
+        {
+            ByteBuffer value = bound.get(i);
+            if (value != null)
+                metadata.comparator.subtype(i).validate(value);
+        }
+    }
+
+    public String toString(CFMetaData metadata, boolean fullDetails)
+    {
+        return toString(metadata);
+    }
+    public String toString(CFMetaData metadata, boolean includeClusteringKeys, boolean fullDetails)
+    {
+        return toString(metadata);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/AbstractRow.java b/src/java/org/apache/cassandra/db/rows/AbstractRow.java
new file mode 100644
index 0000000..f91126b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/AbstractRow.java

@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.AbstractCollection;
+import java.util.Collection;
+import java.util.Objects;
+
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Base abstract class for {@code Row} implementations.
+ *
+ * Unless you have a very good reason not to, every row implementation
+ * should probably extend this class.
+ */
+public abstract class AbstractRow implements Row
+{
+    public Unfiltered.Kind kind()
+    {
+        return Unfiltered.Kind.ROW;
+    }
+
+    @Override
+    public boolean hasLiveData(int nowInSec, boolean enforceStrictLiveness)
+    {
+        if (primaryKeyLivenessInfo().isLive(nowInSec))
+            return true;
+        else if (enforceStrictLiveness)
+            return false;
+        return Iterables.any(cells(), cell -> cell.isLive(nowInSec));
+    }
+
+    public boolean isStatic()
+    {
+        return clustering() == Clustering.STATIC_CLUSTERING;
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        FBUtilities.updateWithByte(digest, kind().ordinal());
+        clustering().digest(digest);
+
+        deletion().digest(digest);
+        primaryKeyLivenessInfo().digest(digest);
+
+        for (ColumnData cd : this)
+            cd.digest(digest);
+    }
+
+    public void validateData(CFMetaData metadata)
+    {
+        Clustering clustering = clustering();
+        for (int i = 0; i < clustering.size(); i++)
+        {
+            ByteBuffer value = clustering.get(i);
+            if (value != null)
+                metadata.comparator.subtype(i).validate(value);
+        }
+
+        primaryKeyLivenessInfo().validate();
+        if (deletion().time().localDeletionTime() < 0)
+            throw new MarshalException("A local deletion time should not be negative");
+
+        for (ColumnData cd : this)
+            cd.validate();
+    }
+
+    public String toString()
+    {
+        return columnData().toString();
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        return toString(metadata, false);
+    }
+
+    public String toString(CFMetaData metadata, boolean fullDetails)
+    {
+        return toString(metadata, true, fullDetails);
+    }
+
+    public String toString(CFMetaData metadata, boolean includeClusterKeys, boolean fullDetails)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("Row");
+        if (fullDetails)
+        {
+            sb.append("[info=").append(primaryKeyLivenessInfo());
+            if (!deletion().isLive())
+                sb.append(" del=").append(deletion());
+            sb.append(" ]");
+        }
+        sb.append(": ");
+        if(includeClusterKeys)
+            sb.append(clustering().toString(metadata));
+        else
+            sb.append(clustering().toCQLString(metadata));
+        sb.append(" | ");
+        boolean isFirst = true;
+        for (ColumnData cd : this)
+        {
+            if (isFirst) isFirst = false; else sb.append(", ");
+            if (fullDetails)
+            {
+                if (cd.column().isSimple())
+                {
+                    sb.append(cd);
+                }
+                else
+                {
+                    ComplexColumnData complexData = (ComplexColumnData)cd;
+                    if (!complexData.complexDeletion().isLive())
+                        sb.append("del(").append(cd.column().name).append(")=").append(complexData.complexDeletion());
+                    for (Cell cell : complexData)
+                        sb.append(", ").append(cell);
+                }
+            }
+            else
+            {
+                if (cd.column().isSimple())
+                {
+                    Cell cell = (Cell)cd;
+                    sb.append(cell.column().name).append('=');
+                    if (cell.isTombstone())
+                        sb.append("<tombstone>");
+                    else
+                        sb.append(cell.column().type.getString(cell.value()));
+                }
+                else
+                {
+                    ComplexColumnData complexData = (ComplexColumnData)cd;
+                    CollectionType ct = (CollectionType)cd.column().type;
+                    sb.append(cd.column().name).append("={");
+                    int i = 0;
+                    for (Cell cell : complexData)
+                    {
+                        sb.append(i++ == 0 ? "" : ", ");
+                        sb.append(ct.nameComparator().getString(cell.path().get(0))).append("->").append(ct.valueComparator().getString(cell.value()));
+                    }
+                    sb.append('}');
+                }
+            }
+        }
+        return sb.toString();
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if(!(other instanceof Row))
+            return false;
+
+        Row that = (Row)other;
+        if (!this.clustering().equals(that.clustering())
+             || !this.primaryKeyLivenessInfo().equals(that.primaryKeyLivenessInfo())
+             || !this.deletion().equals(that.deletion()))
+            return false;
+
+        return Iterables.elementsEqual(this, that);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        int hash = Objects.hash(clustering(), primaryKeyLivenessInfo(), deletion());
+        for (ColumnData cd : this)
+            hash += 31 * cd.hashCode();
+        return hash;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/AbstractTypeVersionComparator.java b/src/java/org/apache/cassandra/db/rows/AbstractTypeVersionComparator.java
new file mode 100644
index 0000000..e47f681
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/AbstractTypeVersionComparator.java

@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.cassandra.db.marshal.*;
+
+/**
+ * A {@code Comparator} use to determine which version of a type should be used.
+ * <p>In the case of UDTs it is possible to have 2 versions or more of the same type, if some fields has been added to
+ * the type. To avoid problems the latest type need to be used.</p>
+ */
+final class AbstractTypeVersionComparator implements Comparator<AbstractType<?>>
+{
+    public static final Comparator<AbstractType<?>> INSTANCE = new AbstractTypeVersionComparator();
+
+    private AbstractTypeVersionComparator()
+    {
+    }
+
+    @Override
+    public int compare(AbstractType<?> type, AbstractType<?> otherType)
+    {
+        if (!type.getClass().equals(otherType.getClass()))
+            throw new IllegalArgumentException(String.format("Trying to compare 2 different types: %s and %s",
+                                                             type,
+                                                             otherType));
+
+        if (type.equals(otherType))
+            return 0;
+
+        // The only case where 2 types can differ is if they contains some UDTs and one of them has more
+        // fields (due to an ALTER type ADD) than in the other type. In this case we need to pick the type with
+        // the bigger amount of fields.
+        if (type.isUDT())
+            return compareUserType((UserType) type, (UserType) otherType);
+
+        if (type.isTuple())
+            return compareTuple((TupleType) type, (TupleType) otherType);
+
+        if (type.isCollection())
+            return compareCollectionTypes(type, otherType);
+
+        if (type instanceof CompositeType)
+            return compareCompositeTypes((CompositeType) type, (CompositeType) otherType);
+
+        // In theory we should never reach that point but to be on the safe side we allow it.
+        return 0;
+    }
+
+    private int compareCompositeTypes(CompositeType type, CompositeType otherType)
+    {
+        List<AbstractType<?>> types = type.getComponents();
+        List<AbstractType<?>> otherTypes = otherType.getComponents();
+
+        if (types.size() != otherTypes.size())
+            return Integer.compare(types.size(), otherTypes.size());
+
+        for (int i = 0, m = type.componentsCount(); i < m ; i++)
+        {
+            int test = compare(types.get(i), otherTypes.get(i));
+            if (test != 0);
+                return test;
+        }
+        return 0;
+    }
+
+    private int compareCollectionTypes(AbstractType<?> type, AbstractType<?> otherType)
+    {
+        if (type instanceof MapType)
+            return compareMapType((MapType<?, ?>) type, (MapType<?, ?>) otherType);
+
+        if (type instanceof SetType)
+            return compare(((SetType<?>) type).getElementsType(), ((SetType<?>) otherType).getElementsType());
+
+        return compare(((ListType<?>) type).getElementsType(), ((ListType<?>) otherType).getElementsType());
+    }
+
+    private int compareMapType(MapType<?, ?> type, MapType<?, ?> otherType)
+    {
+        int test = compare(type.getKeysType(), otherType.getKeysType());
+        return test != 0 ? test : compare(type.getValuesType(), otherType.getValuesType());
+    }
+
+    private int compareUserType(UserType type, UserType otherType)
+    {
+        return compareTuple(type, otherType);
+    }
+
+    private int compareTuple(TupleType type, TupleType otherType)
+    {
+        if (type.size() != otherType.size())
+            return Integer.compare(type.size(), otherType.size());
+
+        int test = 0;
+        int i = 0;
+        while (test == 0 && i < type.size())
+        {
+            test = compare(type.type(i), otherType.type(i));
+            i++;
+        }
+        return test;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/AbstractUnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/AbstractUnfilteredRowIterator.java
new file mode 100644
index 0000000..f2389a7
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/AbstractUnfilteredRowIterator.java

@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import org.apache.cassandra.utils.AbstractIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+
+public abstract class AbstractUnfilteredRowIterator extends AbstractIterator<Unfiltered> implements UnfilteredRowIterator
+{
+    protected final CFMetaData metadata;
+    protected final DecoratedKey partitionKey;
+    protected final DeletionTime partitionLevelDeletion;
+    protected final PartitionColumns columns;
+    protected final Row staticRow;
+    protected final boolean isReverseOrder;
+    protected final EncodingStats stats;
+
+    protected AbstractUnfilteredRowIterator(CFMetaData metadata,
+                                            DecoratedKey partitionKey,
+                                            DeletionTime partitionLevelDeletion,
+                                            PartitionColumns columns,
+                                            Row staticRow,
+                                            boolean isReverseOrder,
+                                            EncodingStats stats)
+    {
+        this.metadata = metadata;
+        this.partitionKey = partitionKey;
+        this.partitionLevelDeletion = partitionLevelDeletion;
+        this.columns = columns;
+        this.staticRow = staticRow;
+        this.isReverseOrder = isReverseOrder;
+        this.stats = stats;
+    }
+
+    public CFMetaData metadata()
+    {
+        return metadata;
+    }
+
+    public PartitionColumns columns()
+    {
+        return columns;
+    }
+
+    public boolean isReverseOrder()
+    {
+        return isReverseOrder;
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return partitionKey;
+    }
+
+    public DeletionTime partitionLevelDeletion()
+    {
+        return partitionLevelDeletion;
+    }
+
+    public Row staticRow()
+    {
+        return staticRow;
+    }
+
+    public EncodingStats stats()
+    {
+        return stats;
+    }
+
+    public void close()
+    {
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/BTreeRow.java b/src/java/org/apache/cassandra/db/rows/BTreeRow.java
new file mode 100644
index 0000000..e46d0cc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/BTreeRow.java

@@ -0,0 +1,738 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.Predicate;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.BTreeSearchIterator;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+/**
+ * Immutable implementation of a Row object.
+ */
+public class BTreeRow extends AbstractRow
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(emptyRow(Clustering.EMPTY));
+
+    private final Clustering clustering;
+    private final LivenessInfo primaryKeyLivenessInfo;
+    private final Deletion deletion;
+
+    // The data for each columns present in this row in column sorted order.
+    private final Object[] btree;
+
+    // We need to filter the tombstones of a row on every read (twice in fact: first to remove purgeable tombstone, and then after reconciliation to remove
+    // all tombstone since we don't return them to the client) as well as on compaction. But it's likely that many rows won't have any tombstone at all, so
+    // we want to speed up that case by not having to iterate/copy the row in this case. We could keep a single boolean telling us if we have tombstones,
+    // but that doesn't work for expiring columns. So instead we keep the deletion time for the first thing in the row to be deleted. This allow at any given
+    // time to know if we have any deleted information or not. If we any "true" tombstone (i.e. not an expiring cell), this value will be forced to
+    // Integer.MIN_VALUE, but if we don't and have expiring cells, this will the time at which the first expiring cell expires. If we have no tombstones and
+    // no expiring cells, this will be Integer.MAX_VALUE;
+    private final int minLocalDeletionTime;
+
+    private BTreeRow(Clustering clustering, LivenessInfo primaryKeyLivenessInfo, Deletion deletion, Object[] btree, int minLocalDeletionTime)
+    {
+        assert !deletion.isShadowedBy(primaryKeyLivenessInfo);
+        this.clustering = clustering;
+        this.primaryKeyLivenessInfo = primaryKeyLivenessInfo;
+        this.deletion = deletion;
+        this.btree = btree;
+        this.minLocalDeletionTime = minLocalDeletionTime;
+    }
+
+    private BTreeRow(Clustering clustering, Object[] btree, int minLocalDeletionTime)
+    {
+        this(clustering, LivenessInfo.EMPTY, Deletion.LIVE, btree, minLocalDeletionTime);
+    }
+
+    // Note that it's often easier/safer to use the sortedBuilder/unsortedBuilder or one of the static creation method below. Only directly useful in a small amount of cases.
+    public static BTreeRow create(Clustering clustering, LivenessInfo primaryKeyLivenessInfo, Deletion deletion, Object[] btree)
+    {
+        int minDeletionTime = Math.min(minDeletionTime(primaryKeyLivenessInfo), minDeletionTime(deletion.time()));
+        if (minDeletionTime != Integer.MIN_VALUE)
+        {
+            for (ColumnData cd : BTree.<ColumnData>iterable(btree))
+                minDeletionTime = Math.min(minDeletionTime, minDeletionTime(cd));
+        }
+
+        return new BTreeRow(clustering, primaryKeyLivenessInfo, deletion, btree, minDeletionTime);
+    }
+
+    public static BTreeRow emptyRow(Clustering clustering)
+    {
+        return new BTreeRow(clustering, BTree.empty(), Integer.MAX_VALUE);
+    }
+
+    public static BTreeRow singleCellRow(Clustering clustering, Cell cell)
+    {
+        if (cell.column().isSimple())
+            return new BTreeRow(clustering, BTree.singleton(cell), minDeletionTime(cell));
+
+        ComplexColumnData complexData = new ComplexColumnData(cell.column(), new Cell[]{ cell }, DeletionTime.LIVE);
+        return new BTreeRow(clustering, BTree.singleton(complexData), minDeletionTime(cell));
+    }
+
+    public static BTreeRow emptyDeletedRow(Clustering clustering, Deletion deletion)
+    {
+        assert !deletion.isLive();
+        return new BTreeRow(clustering, LivenessInfo.EMPTY, deletion, BTree.empty(), Integer.MIN_VALUE);
+    }
+
+    public static BTreeRow noCellLiveRow(Clustering clustering, LivenessInfo primaryKeyLivenessInfo)
+    {
+        assert !primaryKeyLivenessInfo.isEmpty();
+        return new BTreeRow(clustering, primaryKeyLivenessInfo, Deletion.LIVE, BTree.empty(), minDeletionTime(primaryKeyLivenessInfo));
+    }
+
+    private static int minDeletionTime(Cell cell)
+    {
+        return cell.isTombstone() ? Integer.MIN_VALUE : cell.localDeletionTime();
+    }
+
+    private static int minDeletionTime(LivenessInfo info)
+    {
+        return info.isExpiring() ? info.localExpirationTime() : Integer.MAX_VALUE;
+    }
+
+    private static int minDeletionTime(DeletionTime dt)
+    {
+        return dt.isLive() ? Integer.MAX_VALUE : Integer.MIN_VALUE;
+    }
+
+    private static int minDeletionTime(ComplexColumnData cd)
+    {
+        int min = minDeletionTime(cd.complexDeletion());
+        for (Cell cell : cd)
+        {
+            min = Math.min(min, minDeletionTime(cell));
+            if (min == Integer.MIN_VALUE)
+                break;
+        }
+        return min;
+    }
+
+    private static int minDeletionTime(ColumnData cd)
+    {
+        return cd.column().isSimple() ? minDeletionTime((Cell) cd) : minDeletionTime((ComplexColumnData)cd);
+    }
+
+    private static int minDeletionTime(Object[] btree, LivenessInfo info, DeletionTime rowDeletion)
+    {
+        int min = Math.min(minDeletionTime(info), minDeletionTime(rowDeletion));
+        for (ColumnData cd : BTree.<ColumnData>iterable(btree))
+        {
+            min = Math.min(min, minDeletionTime(cd));
+            if (min == Integer.MIN_VALUE)
+                break;
+        }
+        return min;
+    }
+
+    public Clustering clustering()
+    {
+        return clustering;
+    }
+
+    public Collection<ColumnDefinition> columns()
+    {
+        return Collections2.transform(columnData(), ColumnData::column);
+    }
+
+    public int columnCount()
+    {
+        return BTree.size(btree);
+    }
+
+    public LivenessInfo primaryKeyLivenessInfo()
+    {
+        return primaryKeyLivenessInfo;
+    }
+
+    public boolean isEmpty()
+    {
+        return primaryKeyLivenessInfo().isEmpty()
+               && deletion().isLive()
+               && BTree.isEmpty(btree);
+    }
+
+    public Deletion deletion()
+    {
+        return deletion;
+    }
+
+    public Cell getCell(ColumnDefinition c)
+    {
+        assert !c.isComplex();
+        return (Cell) BTree.<Object>find(btree, ColumnDefinition.asymmetricColumnDataComparator, c);
+    }
+
+    public Cell getCell(ColumnDefinition c, CellPath path)
+    {
+        assert c.isComplex();
+        ComplexColumnData cd = getComplexColumnData(c);
+        if (cd == null)
+            return null;
+        return cd.getCell(path);
+    }
+
+    public ComplexColumnData getComplexColumnData(ColumnDefinition c)
+    {
+        assert c.isComplex();
+        return (ComplexColumnData) BTree.<Object>find(btree, ColumnDefinition.asymmetricColumnDataComparator, c);
+    }
+
+    @Override
+    public Collection<ColumnData> columnData()
+    {
+        return new AbstractCollection<ColumnData>()
+        {
+            @Override public Iterator<ColumnData> iterator() { return BTreeRow.this.iterator(); }
+            @Override public int size() { return BTree.size(btree); }
+        };
+    }
+
+    public Iterator<ColumnData> iterator()
+    {
+        return searchIterator();
+    }
+
+    public Iterable<Cell> cells()
+    {
+        return CellIterator::new;
+    }
+
+    public BTreeSearchIterator<ColumnDefinition, ColumnData> searchIterator()
+    {
+        return BTree.slice(btree, ColumnDefinition.asymmetricColumnDataComparator, BTree.Dir.ASC);
+    }
+
+    public Row filter(ColumnFilter filter, CFMetaData metadata)
+    {
+        return filter(filter, DeletionTime.LIVE, false, metadata);
+    }
+
+    public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setActiveDeletionToRow, CFMetaData metadata)
+    {
+        Map<ByteBuffer, CFMetaData.DroppedColumn> droppedColumns = metadata.getDroppedColumns();
+
+        if (filter.includesAllColumns() && (activeDeletion.isLive() || deletion.supersedes(activeDeletion)) && droppedColumns.isEmpty())
+            return this;
+
+        boolean mayHaveShadowed = activeDeletion.supersedes(deletion.time());
+
+        LivenessInfo newInfo = primaryKeyLivenessInfo;
+        Deletion newDeletion = deletion;
+        if (mayHaveShadowed)
+        {
+            if (activeDeletion.deletes(newInfo.timestamp()))
+                newInfo = LivenessInfo.EMPTY;
+            // note that mayHaveShadowed means the activeDeletion shadows the row deletion. So if don't have setActiveDeletionToRow,
+            // the row deletion is shadowed and we shouldn't return it.
+            newDeletion = setActiveDeletionToRow ? Deletion.regular(activeDeletion) : Deletion.LIVE;
+        }
+
+        Columns columns = filter.fetchedColumns().columns(isStatic());
+        Predicate<ColumnDefinition> inclusionTester = columns.inOrderInclusionTester();
+        return transformAndFilter(newInfo, newDeletion, (cd) -> {
+
+            ColumnDefinition column = cd.column();
+            if (!inclusionTester.test(column))
+                return null;
+
+            CFMetaData.DroppedColumn dropped = droppedColumns.get(column.name.bytes);
+            if (column.isComplex())
+                return ((ComplexColumnData) cd).filter(filter, mayHaveShadowed ? activeDeletion : DeletionTime.LIVE, dropped);
+
+            Cell cell = (Cell) cd;
+            return (dropped == null || cell.timestamp() > dropped.droppedTime) && !(mayHaveShadowed && activeDeletion.deletes(cell))
+                   ? cell : null;
+        });
+    }
+
+    public boolean hasComplex()
+    {
+        // We start by the end cause we know complex columns sort after the simple ones
+        ColumnData cd = Iterables.getFirst(BTree.<ColumnData>iterable(btree, BTree.Dir.DESC), null);
+        return cd != null && cd.column.isComplex();
+    }
+
+    public boolean hasComplexDeletion()
+    {
+        // We start by the end cause we know complex columns sort before simple ones
+        for (ColumnData cd : BTree.<ColumnData>iterable(btree, BTree.Dir.DESC))
+        {
+            if (cd.column().isSimple())
+                return false;
+
+            if (!((ComplexColumnData)cd).complexDeletion().isLive())
+                return true;
+        }
+        return false;
+    }
+
+    public Row markCounterLocalToBeCleared()
+    {
+        return transformAndFilter(primaryKeyLivenessInfo, deletion, (cd) -> cd.column().isCounterColumn()
+                                                                            ? cd.markCounterLocalToBeCleared()
+                                                                            : cd);
+    }
+
+    public boolean hasDeletion(int nowInSec)
+    {
+        return nowInSec >= minLocalDeletionTime;
+    }
+
+    /**
+     * Returns a copy of the row where all timestamps for live data have replaced by {@code newTimestamp} and
+     * all deletion timestamp by {@code newTimestamp - 1}.
+     *
+     * This exists for the Paxos path, see {@link PartitionUpdate#updateAllTimestamp} for additional details.
+     */
+    public Row updateAllTimestamp(long newTimestamp)
+    {
+        LivenessInfo newInfo = primaryKeyLivenessInfo.isEmpty() ? primaryKeyLivenessInfo : primaryKeyLivenessInfo.withUpdatedTimestamp(newTimestamp);
+        // If the deletion is shadowable and the row has a timestamp, we'll forced the deletion timestamp to be less than the row one, so we
+        // should get rid of said deletion.
+        Deletion newDeletion = deletion.isLive() || (deletion.isShadowable() && !primaryKeyLivenessInfo.isEmpty())
+                             ? Deletion.LIVE
+                             : new Deletion(new DeletionTime(newTimestamp - 1, deletion.time().localDeletionTime()), deletion.isShadowable());
+
+        return transformAndFilter(newInfo, newDeletion, (cd) -> cd.updateAllTimestamp(newTimestamp));
+    }
+
+    public Row withRowDeletion(DeletionTime newDeletion)
+    {
+        // Note that:
+        //  - it is a contract with the caller that the new deletion shouldn't shadow anything in
+        //    the row, and so in particular it can't shadow the row deletion. So if there is a
+        //    already a row deletion we have nothing to do.
+        //  - we set the minLocalDeletionTime to MIN_VALUE because we know the deletion is live
+        return newDeletion.isLive() || !deletion.isLive()
+             ? this
+             : new BTreeRow(clustering, primaryKeyLivenessInfo, Deletion.regular(newDeletion), btree, Integer.MIN_VALUE);
+    }
+
+    public Row purge(DeletionPurger purger, int nowInSec, boolean enforceStrictLiveness)
+    {
+        if (!hasDeletion(nowInSec))
+            return this;
+
+        LivenessInfo newInfo = purger.shouldPurge(primaryKeyLivenessInfo, nowInSec) ? LivenessInfo.EMPTY : primaryKeyLivenessInfo;
+        Deletion newDeletion = purger.shouldPurge(deletion.time()) ? Deletion.LIVE : deletion;
+
+        // when enforceStrictLiveness is set, a row is considered dead when it's PK liveness info is not present
+        if (enforceStrictLiveness && newDeletion.isLive() && newInfo.isEmpty())
+            return null;
+
+        return transformAndFilter(newInfo, newDeletion, (cd) -> cd.purge(purger, nowInSec));
+    }
+
+    private Row transformAndFilter(LivenessInfo info, Deletion deletion, Function<ColumnData, ColumnData> function)
+    {
+        Object[] transformed = BTree.transformAndFilter(btree, function);
+
+        if (btree == transformed && info == this.primaryKeyLivenessInfo && deletion == this.deletion)
+            return this;
+
+        if (info.isEmpty() && deletion.isLive() && BTree.isEmpty(transformed))
+            return null;
+
+        int minDeletionTime = minDeletionTime(transformed, info, deletion.time());
+        return new BTreeRow(clustering, info, deletion, transformed, minDeletionTime);
+    }
+
+    public int dataSize()
+    {
+        int dataSize = clustering.dataSize()
+                     + primaryKeyLivenessInfo.dataSize()
+                     + deletion.dataSize();
+
+        for (ColumnData cd : this)
+            dataSize += cd.dataSize();
+        return dataSize;
+    }
+
+    public long unsharedHeapSizeExcludingData()
+    {
+        long heapSize = EMPTY_SIZE
+                      + clustering.unsharedHeapSizeExcludingData()
+                      + BTree.sizeOfStructureOnHeap(btree);
+
+        for (ColumnData cd : this)
+            heapSize += cd.unsharedHeapSizeExcludingData();
+        return heapSize;
+    }
+
+    public static Row.Builder sortedBuilder()
+    {
+        return new Builder(true);
+    }
+
+    public static Row.Builder unsortedBuilder(int nowInSec)
+    {
+        return new Builder(false, nowInSec);
+    }
+
+    // This is only used by PartitionUpdate.CounterMark but other uses should be avoided as much as possible as it breaks our general
+    // assumption that Row objects are immutable. This method should go away post-#6506 in particular.
+    // This method is in particular not exposed by the Row API on purpose.
+    // This method also *assumes* that the cell we're setting already exists.
+    public void setValue(ColumnDefinition column, CellPath path, ByteBuffer value)
+    {
+        ColumnData current = (ColumnData) BTree.<Object>find(btree, ColumnDefinition.asymmetricColumnDataComparator, column);
+        if (column.isSimple())
+            BTree.replaceInSitu(btree, ColumnData.comparator, current, ((Cell) current).withUpdatedValue(value));
+        else
+            ((ComplexColumnData) current).setValue(path, value);
+    }
+
+    public Iterable<Cell> cellsInLegacyOrder(CFMetaData metadata, boolean reversed)
+    {
+        return () -> new CellInLegacyOrderIterator(metadata, reversed);
+    }
+
+    private class CellIterator extends AbstractIterator<Cell>
+    {
+        private Iterator<ColumnData> columnData = iterator();
+        private Iterator<Cell> complexCells;
+
+        protected Cell computeNext()
+        {
+            while (true)
+            {
+                if (complexCells != null)
+                {
+                    if (complexCells.hasNext())
+                        return complexCells.next();
+
+                    complexCells = null;
+                }
+
+                if (!columnData.hasNext())
+                    return endOfData();
+
+                ColumnData cd = columnData.next();
+                if (cd.column().isComplex())
+                    complexCells = ((ComplexColumnData)cd).iterator();
+                else
+                    return (Cell)cd;
+            }
+        }
+    }
+
+    private class CellInLegacyOrderIterator extends AbstractIterator<Cell>
+    {
+        private final Comparator<ByteBuffer> comparator;
+        private final boolean reversed;
+        private final int firstComplexIdx;
+        private int simpleIdx;
+        private int complexIdx;
+        private Iterator<Cell> complexCells;
+        private final Object[] data;
+
+        private CellInLegacyOrderIterator(CFMetaData metadata, boolean reversed)
+        {
+            AbstractType<?> nameComparator = metadata.getColumnDefinitionNameComparator(isStatic() ? ColumnDefinition.Kind.STATIC : ColumnDefinition.Kind.REGULAR);
+            this.comparator = reversed ? Collections.reverseOrder(nameComparator) : nameComparator;
+            this.reversed = reversed;
+
+            // copy btree into array for simple separate iteration of simple and complex columns
+            this.data = new Object[BTree.size(btree)];
+            BTree.toArray(btree, data, 0);
+
+            int idx = Iterators.indexOf(Iterators.forArray(data), cd -> cd instanceof ComplexColumnData);
+            this.firstComplexIdx = idx < 0 ? data.length : idx;
+            this.complexIdx = firstComplexIdx;
+        }
+
+        private int getSimpleIdx()
+        {
+            return reversed ? firstComplexIdx - simpleIdx - 1 : simpleIdx;
+        }
+
+        private int getSimpleIdxAndIncrement()
+        {
+            int idx = getSimpleIdx();
+            ++simpleIdx;
+            return idx;
+        }
+
+        private int getComplexIdx()
+        {
+            return reversed ? data.length + firstComplexIdx - complexIdx - 1 : complexIdx;
+        }
+
+        private int getComplexIdxAndIncrement()
+        {
+            int idx = getComplexIdx();
+            ++complexIdx;
+            return idx;
+        }
+
+        private Iterator<Cell> makeComplexIterator(Object complexData)
+        {
+            ComplexColumnData ccd = (ComplexColumnData)complexData;
+            return reversed ? ccd.reverseIterator() : ccd.iterator();
+        }
+
+        protected Cell computeNext()
+        {
+            while (true)
+            {
+                if (complexCells != null)
+                {
+                    if (complexCells.hasNext())
+                        return complexCells.next();
+
+                    complexCells = null;
+                }
+
+                if (simpleIdx >= firstComplexIdx)
+                {
+                    if (complexIdx >= data.length)
+                        return endOfData();
+
+                    complexCells = makeComplexIterator(data[getComplexIdxAndIncrement()]);
+                }
+                else
+                {
+                    if (complexIdx >= data.length)
+                        return (Cell)data[getSimpleIdxAndIncrement()];
+
+                    if (comparator.compare(((ColumnData) data[getSimpleIdx()]).column().name.bytes, ((ColumnData) data[getComplexIdx()]).column().name.bytes) < 0)
+                        return (Cell)data[getSimpleIdxAndIncrement()];
+                    else
+                        complexCells = makeComplexIterator(data[getComplexIdxAndIncrement()]);
+                }
+            }
+        }
+    }
+
+    public static class Builder implements Row.Builder
+    {
+        // a simple marker class that will sort to the beginning of a run of complex cells to store the deletion time
+        private static class ComplexColumnDeletion extends BufferCell
+        {
+            public ComplexColumnDeletion(ColumnDefinition column, DeletionTime deletionTime)
+            {
+                super(column, deletionTime.markedForDeleteAt(), 0, deletionTime.localDeletionTime(), ByteBufferUtil.EMPTY_BYTE_BUFFER, CellPath.BOTTOM);
+            }
+        }
+
+        // converts a run of Cell with equal column into a ColumnData
+        private static class CellResolver implements BTree.Builder.Resolver
+        {
+            final int nowInSec;
+            private CellResolver(int nowInSec)
+            {
+                this.nowInSec = nowInSec;
+            }
+
+            public ColumnData resolve(Object[] cells, int lb, int ub)
+            {
+                Cell cell = (Cell) cells[lb];
+                ColumnDefinition column = cell.column;
+                if (cell.column.isSimple())
+                {
+                    assert lb + 1 == ub || nowInSec != Integer.MIN_VALUE;
+                    while (++lb < ub)
+                        cell = Cells.reconcile(cell, (Cell) cells[lb], nowInSec);
+                    return cell;
+                }
+
+                // TODO: relax this in the case our outer provider is sorted (want to delay until remaining changes are
+                // bedded in, as less important; galloping makes it pretty cheap anyway)
+                Arrays.sort(cells, lb, ub, (Comparator<Object>) column.cellComparator());
+                DeletionTime deletion = DeletionTime.LIVE;
+                // Deal with complex deletion (for which we've use "fake" ComplexColumnDeletion cells that we need to remove).
+                // Note that in almost all cases we'll at most one of those fake cell, but the contract of {{Row.Builder.addComplexDeletion}}
+                // does not forbid it being called twice (especially in the unsorted case) and this can actually happen when reading
+                // legacy sstables (see #10743).
+                while (lb < ub)
+                {
+                    cell = (Cell) cells[lb];
+                    if (!(cell instanceof ComplexColumnDeletion))
+                        break;
+
+                    if (cell.timestamp() > deletion.markedForDeleteAt())
+                        deletion = new DeletionTime(cell.timestamp(), cell.localDeletionTime());
+                    lb++;
+                }
+
+                List<Object> buildFrom = new ArrayList<>(ub - lb);
+                Cell previous = null;
+                for (int i = lb; i < ub; i++)
+                {
+                    Cell c = (Cell) cells[i];
+
+                    if (deletion == DeletionTime.LIVE || c.timestamp() >= deletion.markedForDeleteAt())
+                    {
+                        if (previous != null && column.cellComparator().compare(previous, c) == 0)
+                        {
+                            c = Cells.reconcile(previous, c, nowInSec);
+                            buildFrom.set(buildFrom.size() - 1, c);
+                        }
+                        else
+                        {
+                            buildFrom.add(c);
+                        }
+                        previous = c;
+                    }
+                }
+
+                Object[] btree = BTree.build(buildFrom, UpdateFunction.noOp());
+                return new ComplexColumnData(column, btree, deletion);
+            }
+
+        };
+        protected Clustering clustering;
+        protected LivenessInfo primaryKeyLivenessInfo = LivenessInfo.EMPTY;
+        protected Deletion deletion = Deletion.LIVE;
+
+        private final boolean isSorted;
+        private final BTree.Builder<Cell> cells;
+        private final CellResolver resolver;
+        private boolean hasComplex = false;
+
+        // For complex column at index i of 'columns', we store at complexDeletions[i] its complex deletion.
+
+        protected Builder(boolean isSorted)
+        {
+            this(isSorted, Integer.MIN_VALUE);
+        }
+
+        protected Builder(boolean isSorted, int nowInSecs)
+        {
+            this.cells = BTree.builder(ColumnData.comparator);
+            resolver = new CellResolver(nowInSecs);
+            this.isSorted = isSorted;
+            this.cells.auto(false);
+        }
+
+        protected Builder(Builder builder)
+        {
+            clustering = builder.clustering;
+            primaryKeyLivenessInfo = builder.primaryKeyLivenessInfo;
+            deletion = builder.deletion;
+            cells = builder.cells.copy();
+            resolver = builder.resolver;
+            isSorted = builder.isSorted;
+            hasComplex = builder.hasComplex;
+        }
+
+        @Override
+        public Builder copy()
+        {
+            return new Builder(this);
+        }
+
+        public boolean isSorted()
+        {
+            return isSorted;
+        }
+
+        public void newRow(Clustering clustering)
+        {
+            assert this.clustering == null; // Ensures we've properly called build() if we've use this builder before
+            this.clustering = clustering;
+        }
+
+        public Clustering clustering()
+        {
+            return clustering;
+        }
+
+        protected void reset()
+        {
+            this.clustering = null;
+            this.primaryKeyLivenessInfo = LivenessInfo.EMPTY;
+            this.deletion = Deletion.LIVE;
+            this.cells.reuse();
+            this.hasComplex = false;
+        }
+
+        public void addPrimaryKeyLivenessInfo(LivenessInfo info)
+        {
+            // The check is only required for unsorted builders, but it's worth the extra safety to have it unconditional
+            if (!deletion.deletes(info))
+                this.primaryKeyLivenessInfo = info;
+        }
+
+        public void addRowDeletion(Deletion deletion)
+        {
+            this.deletion = deletion;
+            // The check is only required for unsorted builders, but it's worth the extra safety to have it unconditional
+            if (deletion.deletes(primaryKeyLivenessInfo))
+                this.primaryKeyLivenessInfo = LivenessInfo.EMPTY;
+        }
+
+        public void addCell(Cell cell)
+        {
+            assert cell.column().isStatic() == (clustering == Clustering.STATIC_CLUSTERING) : "Column is " + cell.column() + ", clustering = " + clustering;
+            // In practice, only unsorted builder have to deal with shadowed cells, but it doesn't cost us much to deal with it unconditionally in this case
+            if (deletion.deletes(cell))
+                return;
+
+            cells.add(cell);
+            hasComplex |= cell.column.isComplex();
+        }
+
+        public void addComplexDeletion(ColumnDefinition column, DeletionTime complexDeletion)
+        {
+            cells.add(new ComplexColumnDeletion(column, complexDeletion));
+            hasComplex = true;
+        }
+
+        public Row build()
+        {
+            if (!isSorted)
+                cells.sort();
+            // we can avoid resolving if we're sorted and have no complex values
+            // (because we'll only have unique simple cells, which are already in their final condition)
+            if (!isSorted | hasComplex)
+                cells.resolve(resolver);
+            Object[] btree = cells.build();
+
+            if (deletion.isShadowedBy(primaryKeyLivenessInfo))
+                deletion = Deletion.LIVE;
+
+            int minDeletionTime = minDeletionTime(btree, primaryKeyLivenessInfo, deletion.time());
+            Row row = new BTreeRow(clustering, primaryKeyLivenessInfo, deletion, btree, minDeletionTime);
+            reset();
+            return row;
+        }
+
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/BaseRowIterator.java b/src/java/org/apache/cassandra/db/rows/BaseRowIterator.java
new file mode 100644
index 0000000..ce37297
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/BaseRowIterator.java

@@ -0,0 +1,64 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db.rows;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.utils.CloseableIterator;
+
+/**
+ * A common interface for Row and Unfiltered, that permits sharing of the (majority) common
+ * methods and functionality
+ */
+public interface BaseRowIterator<U extends Unfiltered> extends CloseableIterator<U>
+{
+    /**
+     * The metadata for the table this iterator on.
+     */
+    public CFMetaData metadata();
+
+    /**
+     * Whether or not the rows returned by this iterator are in reversed
+     * clustering order.
+     */
+    public boolean isReverseOrder();
+
+    /**
+     * A subset of the columns for the (static and regular) rows returned by this iterator.
+     * Every row returned by this iterator must guarantee that it has only those columns.
+     */
+    public PartitionColumns columns();
+
+    /**
+     * The partition key of the partition this in an iterator over.
+     */
+    public DecoratedKey partitionKey();
+
+    /**
+     * The static part corresponding to this partition (this can be an empty
+     * row but cannot be {@code null}).
+     */
+    public Row staticRow();
+
+    /**
+     * Returns whether the provided iterator has no data.
+     */
+    public boolean isEmpty();
+}

diff --git a/src/java/org/apache/cassandra/db/rows/BufferCell.java b/src/java/org/apache/cassandra/db/rows/BufferCell.java
new file mode 100644
index 0000000..df2619c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/BufferCell.java

@@ -0,0 +1,370 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.marshal.ByteType;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+public class BufferCell extends AbstractCell
+{
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCell(ColumnDefinition.regularDef("", "", "", ByteType.instance), 0L, 0, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, null));
+
+    private final long timestamp;
+    private final int ttl;
+    private final int localDeletionTime;
+
+    private final ByteBuffer value;
+    private final CellPath path;
+
+    public BufferCell(ColumnDefinition column, long timestamp, int ttl, int localDeletionTime, ByteBuffer value, CellPath path)
+    {
+        super(column);
+        assert column.isComplex() == (path != null);
+        this.timestamp = timestamp;
+        this.ttl = ttl;
+        this.localDeletionTime = localDeletionTime;
+        this.value = value;
+        this.path = path;
+    }
+
+    public static BufferCell live(CFMetaData metadata, ColumnDefinition column, long timestamp, ByteBuffer value)
+    {
+        return live(metadata, column, timestamp, value, null);
+    }
+
+    public static BufferCell live(CFMetaData metadata, ColumnDefinition column, long timestamp, ByteBuffer value, CellPath path)
+    {
+        if (metadata.params.defaultTimeToLive != NO_TTL)
+            return expiring(column, timestamp, metadata.params.defaultTimeToLive, FBUtilities.nowInSeconds(), value, path);
+
+        return new BufferCell(column, timestamp, NO_TTL, NO_DELETION_TIME, value, path);
+    }
+
+    public static BufferCell expiring(ColumnDefinition column, long timestamp, int ttl, int nowInSec, ByteBuffer value)
+    {
+        return expiring(column, timestamp, ttl, nowInSec, value, null);
+    }
+
+    public static BufferCell expiring(ColumnDefinition column, long timestamp, int ttl, int nowInSec, ByteBuffer value, CellPath path)
+    {
+        assert ttl != NO_TTL;
+        return new BufferCell(column, timestamp, ttl, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl), value, path);
+    }
+
+    public static BufferCell tombstone(ColumnDefinition column, long timestamp, int nowInSec)
+    {
+        return tombstone(column, timestamp, nowInSec, null);
+    }
+
+    public static BufferCell tombstone(ColumnDefinition column, long timestamp, int nowInSec, CellPath path)
+    {
+        return new BufferCell(column, timestamp, NO_TTL, nowInSec, ByteBufferUtil.EMPTY_BYTE_BUFFER, path);
+    }
+
+    public boolean isCounterCell()
+    {
+        return !isTombstone() && column.isCounterColumn();
+    }
+
+    public boolean isLive(int nowInSec)
+    {
+        return localDeletionTime == NO_DELETION_TIME || (ttl != NO_TTL && nowInSec < localDeletionTime);
+    }
+
+    public boolean isTombstone()
+    {
+        return localDeletionTime != NO_DELETION_TIME && ttl == NO_TTL;
+    }
+
+    public boolean isExpiring()
+    {
+        return ttl != NO_TTL;
+    }
+
+    public long timestamp()
+    {
+        return timestamp;
+    }
+
+    public int ttl()
+    {
+        return ttl;
+    }
+
+    public int localDeletionTime()
+    {
+        return localDeletionTime;
+    }
+
+    public ByteBuffer value()
+    {
+        return value;
+    }
+
+    public CellPath path()
+    {
+        return path;
+    }
+
+    public Cell withUpdatedColumn(ColumnDefinition newColumn)
+    {
+        return new BufferCell(newColumn, timestamp, ttl, localDeletionTime, value, path);
+    }
+
+    public Cell withUpdatedValue(ByteBuffer newValue)
+    {
+        return new BufferCell(column, timestamp, ttl, localDeletionTime, newValue, path);
+    }
+
+    public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime)
+    {
+        return new BufferCell(column, newTimestamp, ttl, newLocalDeletionTime, value, path);
+    }
+
+    public Cell copy(AbstractAllocator allocator)
+    {
+        if (!value.hasRemaining())
+            return this;
+
+        return new BufferCell(column, timestamp, ttl, localDeletionTime, allocator.clone(value), path == null ? null : path.copy(allocator));
+    }
+
+    public Cell markCounterLocalToBeCleared()
+    {
+        if (!isCounterCell())
+            return this;
+
+        ByteBuffer marked = CounterContext.instance().markLocalToBeCleared(value());
+        return marked == value() ? this : new BufferCell(column, timestamp, ttl, localDeletionTime, marked, path);
+    }
+
+    public Cell purge(DeletionPurger purger, int nowInSec)
+    {
+        if (!isLive(nowInSec))
+        {
+            if (purger.shouldPurge(timestamp, localDeletionTime))
+                return null;
+
+            // We slightly hijack purging to convert expired but not purgeable columns to tombstones. The reason we do that is
+            // that once a column has expired it is equivalent to a tombstone but actually using a tombstone is more compact since
+            // we don't keep the column value. The reason we do it here is that 1) it's somewhat related to dealing with tombstones
+            // so hopefully not too surprising and 2) we want to this and purging at the same places, so it's simpler/more efficient
+            // to do both here.
+            if (isExpiring())
+            {
+                // Note that as long as the expiring column and the tombstone put together live longer than GC grace seconds,
+                // we'll fulfil our responsibility to repair. See discussion at
+                // http://cassandra-user-incubator-apache-org.3065146.n2.nabble.com/repair-compaction-and-tombstone-rows-td7583481.html
+                return BufferCell.tombstone(column, timestamp, localDeletionTime - ttl, path).purge(purger, nowInSec);
+            }
+        }
+        return this;
+    }
+
+    public Cell updateAllTimestamp(long newTimestamp)
+    {
+        return new BufferCell(column, isTombstone() ? newTimestamp - 1 : newTimestamp, ttl, localDeletionTime, value, path);
+    }
+
+    public int dataSize()
+    {
+        return TypeSizes.sizeof(timestamp)
+             + TypeSizes.sizeof(ttl)
+             + TypeSizes.sizeof(localDeletionTime)
+             + value.remaining()
+             + (path == null ? 0 : path.dataSize());
+    }
+
+    public long unsharedHeapSizeExcludingData()
+    {
+        return EMPTY_SIZE + ObjectSizes.sizeOnHeapExcludingData(value) + (path == null ? 0 : path.unsharedHeapSizeExcludingData());
+    }
+
+    /**
+     * The serialization format for cell is:
+     *     [ flags ][ timestamp ][ deletion time ][    ttl    ][ path size ][ path ][ value size ][ value ]
+     *     [   1b  ][ 8b (vint) ][   4b (vint)   ][ 4b (vint) ][ 4b (vint) ][  arb ][  4b (vint) ][  arb  ]
+     *
+     * where not all field are always present (in fact, only the [ flags ] are guaranteed to be present). The fields have the following
+     * meaning:
+     *   - [ flags ] is the cell flags. It is a byte for which each bit represents a flag whose meaning is explained below (*_MASK constants)
+     *   - [ timestamp ] is the cell timestamp. Present unless the cell has the USE_TIMESTAMP_MASK.
+     *   - [ deletion time]: the local deletion time for the cell. Present if either the cell is deleted (IS_DELETED_MASK)
+     *       or it is expiring (IS_EXPIRING_MASK) but doesn't have the USE_ROW_TTL_MASK.
+     *   - [ ttl ]: the ttl for the cell. Present if the row is expiring (IS_EXPIRING_MASK) but doesn't have the
+     *       USE_ROW_TTL_MASK.
+     *   - [ value size ] is the size of the [ value ] field. It's present unless either the cell has the HAS_EMPTY_VALUE_MASK, or the value
+     *       for columns of this type have a fixed length.
+     *   - [ path size ] is the size of the [ path ] field. Present iff this is the cell of a complex column.
+     *   - [ value ]: the cell value, unless it has the HAS_EMPTY_VALUE_MASK.
+     *   - [ path ]: the cell path if the column this is a cell of is complex.
+     */
+    static class Serializer implements Cell.Serializer
+    {
+        private final static int IS_DELETED_MASK             = 0x01; // Whether the cell is a tombstone or not.
+        private final static int IS_EXPIRING_MASK            = 0x02; // Whether the cell is expiring.
+        private final static int HAS_EMPTY_VALUE_MASK        = 0x04; // Wether the cell has an empty value. This will be the case for tombstone in particular.
+        private final static int USE_ROW_TIMESTAMP_MASK      = 0x08; // Wether the cell has the same timestamp than the row this is a cell of.
+        private final static int USE_ROW_TTL_MASK            = 0x10; // Wether the cell has the same ttl than the row this is a cell of.
+
+        public void serialize(Cell cell, ColumnDefinition column, DataOutputPlus out, LivenessInfo rowLiveness, SerializationHeader header) throws IOException
+        {
+            assert cell != null;
+            boolean hasValue = cell.value().hasRemaining();
+            boolean isDeleted = cell.isTombstone();
+            boolean isExpiring = cell.isExpiring();
+            boolean useRowTimestamp = !rowLiveness.isEmpty() && cell.timestamp() == rowLiveness.timestamp();
+            boolean useRowTTL = isExpiring && rowLiveness.isExpiring() && cell.ttl() == rowLiveness.ttl() && cell.localDeletionTime() == rowLiveness.localExpirationTime();
+            int flags = 0;
+            if (!hasValue)
+                flags |= HAS_EMPTY_VALUE_MASK;
+
+            if (isDeleted)
+                flags |= IS_DELETED_MASK;
+            else if (isExpiring)
+                flags |= IS_EXPIRING_MASK;
+
+            if (useRowTimestamp)
+                flags |= USE_ROW_TIMESTAMP_MASK;
+            if (useRowTTL)
+                flags |= USE_ROW_TTL_MASK;
+
+            out.writeByte((byte)flags);
+
+            if (!useRowTimestamp)
+                header.writeTimestamp(cell.timestamp(), out);
+
+            if ((isDeleted || isExpiring) && !useRowTTL)
+                header.writeLocalDeletionTime(cell.localDeletionTime(), out);
+            if (isExpiring && !useRowTTL)
+                header.writeTTL(cell.ttl(), out);
+
+            if (column.isComplex())
+                column.cellPathSerializer().serialize(cell.path(), out);
+
+            if (hasValue)
+                header.getType(column).writeValue(cell.value(), out);
+        }
+
+        public Cell deserialize(DataInputPlus in, LivenessInfo rowLiveness, ColumnDefinition column, SerializationHeader header, SerializationHelper helper) throws IOException
+        {
+            int flags = in.readUnsignedByte();
+            boolean hasValue = (flags & HAS_EMPTY_VALUE_MASK) == 0;
+            boolean isDeleted = (flags & IS_DELETED_MASK) != 0;
+            boolean isExpiring = (flags & IS_EXPIRING_MASK) != 0;
+            boolean useRowTimestamp = (flags & USE_ROW_TIMESTAMP_MASK) != 0;
+            boolean useRowTTL = (flags & USE_ROW_TTL_MASK) != 0;
+
+            long timestamp = useRowTimestamp ? rowLiveness.timestamp() : header.readTimestamp(in);
+
+            int localDeletionTime = useRowTTL
+                                  ? rowLiveness.localExpirationTime()
+                                  : (isDeleted || isExpiring ? header.readLocalDeletionTime(in) : NO_DELETION_TIME);
+
+            int ttl = useRowTTL ? rowLiveness.ttl() : (isExpiring ? header.readTTL(in) : NO_TTL);
+
+            CellPath path = column.isComplex()
+                          ? column.cellPathSerializer().deserialize(in)
+                          : null;
+
+            boolean isCounter = localDeletionTime == NO_DELETION_TIME && column.type.isCounter();
+
+            ByteBuffer value = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+            if (hasValue)
+            {
+                if (helper.canSkipValue(column) || (path != null && helper.canSkipValue(path)))
+                {
+                    header.getType(column).skipValue(in);
+                }
+                else
+                {
+                    value = header.getType(column).readValue(in, DatabaseDescriptor.getMaxValueSize());
+                    if (isCounter)
+                        value = helper.maybeClearCounterValue(value);
+                }
+            }
+
+            return new BufferCell(column, timestamp, ttl, localDeletionTime, value, path);
+        }
+
+        public long serializedSize(Cell cell, ColumnDefinition column, LivenessInfo rowLiveness, SerializationHeader header)
+        {
+            long size = 1; // flags
+            boolean hasValue = cell.value().hasRemaining();
+            boolean isDeleted = cell.isTombstone();
+            boolean isExpiring = cell.isExpiring();
+            boolean useRowTimestamp = !rowLiveness.isEmpty() && cell.timestamp() == rowLiveness.timestamp();
+            boolean useRowTTL = isExpiring && rowLiveness.isExpiring() && cell.ttl() == rowLiveness.ttl() && cell.localDeletionTime() == rowLiveness.localExpirationTime();
+
+            if (!useRowTimestamp)
+                size += header.timestampSerializedSize(cell.timestamp());
+
+            if ((isDeleted || isExpiring) && !useRowTTL)
+                size += header.localDeletionTimeSerializedSize(cell.localDeletionTime());
+            if (isExpiring && !useRowTTL)
+                size += header.ttlSerializedSize(cell.ttl());
+
+            if (column.isComplex())
+                size += column.cellPathSerializer().serializedSize(cell.path());
+
+            if (hasValue)
+                size += header.getType(column).writtenLength(cell.value());
+
+            return size;
+        }
+
+        // Returns if the skipped cell was an actual cell (i.e. it had its presence flag).
+        public boolean skip(DataInputPlus in, ColumnDefinition column, SerializationHeader header) throws IOException
+        {
+            int flags = in.readUnsignedByte();
+            boolean hasValue = (flags & HAS_EMPTY_VALUE_MASK) == 0;
+            boolean isDeleted = (flags & IS_DELETED_MASK) != 0;
+            boolean isExpiring = (flags & IS_EXPIRING_MASK) != 0;
+            boolean useRowTimestamp = (flags & USE_ROW_TIMESTAMP_MASK) != 0;
+            boolean useRowTTL = (flags & USE_ROW_TTL_MASK) != 0;
+
+            if (!useRowTimestamp)
+                header.skipTimestamp(in);
+
+            if (!useRowTTL && (isDeleted || isExpiring))
+                header.skipLocalDeletionTime(in);
+
+            if (!useRowTTL && isExpiring)
+                header.skipTTL(in);
+
+            if (column.isComplex())
+                column.cellPathSerializer().skip(in);
+
+            if (hasValue)
+                header.getType(column).skipValue(in);
+
+            return true;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/Cell.java b/src/java/org/apache/cassandra/db/rows/Cell.java
new file mode 100644
index 0000000..c69e11f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/Cell.java

@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A cell is our atomic unit for a single value of a single column.
+ * <p>
+ * A cell always holds at least a timestamp that gives us how the cell reconcile. We then
+ * have 3 main types of cells:
+ *   1) live regular cells: those will also have a value and, if for a complex column, a path.
+ *   2) expiring cells: on top of regular cells, those have a ttl and a local deletion time (when they are expired).
+ *   3) tombstone cells: those won't have value, but they have a local deletion time (when the tombstone was created).
+ */
+public abstract class Cell extends ColumnData
+{
+    public static final int NO_TTL = 0;
+    public static final int NO_DELETION_TIME = Integer.MAX_VALUE;
+    public static final int MAX_DELETION_TIME = Integer.MAX_VALUE - 1;
+
+    public final static Comparator<Cell> comparator = (c1, c2) ->
+    {
+        int cmp = c1.column().compareTo(c2.column());
+        if (cmp != 0)
+            return cmp;
+
+        Comparator<CellPath> pathComparator = c1.column().cellPathComparator();
+        return pathComparator == null ? 0 : pathComparator.compare(c1.path(), c2.path());
+    };
+
+    public static final Serializer serializer = new BufferCell.Serializer();
+
+    protected Cell(ColumnDefinition column)
+    {
+        super(column);
+    }
+
+    /**
+     * Whether the cell is a counter cell or not.
+     *
+     * @return whether the cell is a counter cell or not.
+     */
+    public abstract boolean isCounterCell();
+
+    /**
+     * The cell value.
+     *
+     * @return the cell value.
+     */
+    public abstract ByteBuffer value();
+
+    /**
+     * The cell timestamp.
+     * <p>
+     * @return the cell timestamp.
+     */
+    public abstract long timestamp();
+
+    /**
+     * The cell ttl.
+     *
+     * @return the cell ttl, or {@code NO_TTL} if the cell isn't an expiring one.
+     */
+    public abstract int ttl();
+
+    /**
+     * The cell local deletion time.
+     *
+     * @return the cell local deletion time, or {@code NO_DELETION_TIME} if the cell is neither
+     * a tombstone nor an expiring one.
+     */
+    public abstract int localDeletionTime();
+
+    /**
+     * Whether the cell is a tombstone or not.
+     *
+     * @return whether the cell is a tombstone or not.
+     */
+    public abstract boolean isTombstone();
+
+    /**
+     * Whether the cell is an expiring one or not.
+     * <p>
+     * Note that this only correspond to whether the cell liveness info
+     * have a TTL or not, but doesn't tells whether the cell is already expired
+     * or not. You should use {@link #isLive} for that latter information.
+     *
+     * @return whether the cell is an expiring one or not.
+     */
+    public abstract boolean isExpiring();
+
+    /**
+     * Whether the cell is live or not given the current time.
+     *
+     * @param nowInSec the current time in seconds. This is used to
+     * decide if an expiring cell is expired or live.
+     * @return whether the cell is live or not at {@code nowInSec}.
+     */
+    public abstract boolean isLive(int nowInSec);
+
+    /**
+     * For cells belonging to complex types (non-frozen collection and UDT), the
+     * path to the cell.
+     *
+     * @return the cell path for cells of complex column, and {@code null} for other cells.
+     */
+    public abstract CellPath path();
+
+    public abstract Cell withUpdatedColumn(ColumnDefinition newColumn);
+
+    public abstract Cell withUpdatedValue(ByteBuffer newValue);
+
+    public abstract Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, int newLocalDeletionTime);
+
+    public abstract Cell copy(AbstractAllocator allocator);
+
+    @Override
+    // Overrides super type to provide a more precise return type.
+    public abstract Cell markCounterLocalToBeCleared();
+
+    @Override
+    // Overrides super type to provide a more precise return type.
+    public abstract Cell purge(DeletionPurger purger, int nowInSec);
+
+    public interface Serializer
+    {
+        public void serialize(Cell cell, ColumnDefinition column, DataOutputPlus out, LivenessInfo rowLiveness, SerializationHeader header) throws IOException;
+
+        public Cell deserialize(DataInputPlus in, LivenessInfo rowLiveness, ColumnDefinition column, SerializationHeader header, SerializationHelper helper) throws IOException;
+
+        public long serializedSize(Cell cell, ColumnDefinition column, LivenessInfo rowLiveness, SerializationHeader header);
+
+        // Returns if the skipped cell was an actual cell (i.e. it had its presence flag).
+        public boolean skip(DataInputPlus in, ColumnDefinition column, SerializationHeader header) throws IOException;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/CellPath.java b/src/java/org/apache/cassandra/db/rows/CellPath.java
new file mode 100644
index 0000000..68e3c2b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/CellPath.java

@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.Objects;
+
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A path for a cell belonging to a complex column type (non-frozen collection or UDT).
+ */
+public abstract class CellPath
+{
+    public static final CellPath BOTTOM = new EmptyCellPath();
+    public static final CellPath TOP = new EmptyCellPath();
+
+    public abstract int size();
+    public abstract ByteBuffer get(int i);
+
+    // The only complex we currently have are collections that have only one value.
+    public static CellPath create(ByteBuffer value)
+    {
+        assert value != null;
+        return new CollectionCellPath(value);
+    }
+
+    public int dataSize()
+    {
+        int size = 0;
+        for (int i = 0; i < size(); i++)
+            size += get(i).remaining();
+        return size;
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        for (int i = 0; i < size(); i++)
+            digest.update(get(i).duplicate());
+    }
+
+    public abstract CellPath copy(AbstractAllocator allocator);
+
+    public abstract long unsharedHeapSizeExcludingData();
+
+    @Override
+    public final int hashCode()
+    {
+        int result = 31;
+        for (int i = 0; i < size(); i++)
+            result += 31 * Objects.hash(get(i));
+        return result;
+    }
+
+    @Override
+    public final boolean equals(Object o)
+    {
+        if(!(o instanceof CellPath))
+            return false;
+
+        CellPath that = (CellPath)o;
+        if (this.size() != that.size())
+            return false;
+
+        for (int i = 0; i < size(); i++)
+            if (!Objects.equals(this.get(i), that.get(i)))
+                return false;
+
+        return true;
+    }
+
+    public interface Serializer
+    {
+        public void serialize(CellPath path, DataOutputPlus out) throws IOException;
+        public CellPath deserialize(DataInputPlus in) throws IOException;
+        public long serializedSize(CellPath path);
+        public void skip(DataInputPlus in) throws IOException;
+    }
+
+    private static class CollectionCellPath extends CellPath
+    {
+        private static final long EMPTY_SIZE = ObjectSizes.measure(new CollectionCellPath(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+
+        protected final ByteBuffer value;
+
+        private CollectionCellPath(ByteBuffer value)
+        {
+            this.value = value;
+        }
+
+        public int size()
+        {
+            return 1;
+        }
+
+        public ByteBuffer get(int i)
+        {
+            assert i == 0;
+            return value;
+        }
+
+        public CellPath copy(AbstractAllocator allocator)
+        {
+            return new CollectionCellPath(allocator.clone(value));
+        }
+
+        public long unsharedHeapSizeExcludingData()
+        {
+            return EMPTY_SIZE + ObjectSizes.sizeOnHeapExcludingData(value);
+        }
+    }
+
+    private static class EmptyCellPath extends CellPath
+    {
+        public int size()
+        {
+            return 0;
+        }
+
+        public ByteBuffer get(int i)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public CellPath copy(AbstractAllocator allocator)
+        {
+            return this;
+        }
+
+        public long unsharedHeapSizeExcludingData()
+        {
+            return 0;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/Cells.java b/src/java/org/apache/cassandra/db/rows/Cells.java
new file mode 100644
index 0000000..54df26e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/Cells.java

@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+import java.util.Iterator;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.Conflicts;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.partitions.PartitionStatisticsCollector;
+
+/**
+ * Static methods to work on cells.
+ */
+public abstract class Cells
+{
+    private Cells() {}
+
+    /**
+     * Collect statistics ont a given cell.
+     *
+     * @param cell the cell for which to collect stats.
+     * @param collector the stats collector.
+     */
+    public static void collectStats(Cell cell, PartitionStatisticsCollector collector)
+    {
+        collector.update(cell);
+
+        if (cell.isCounterCell())
+            collector.updateHasLegacyCounterShards(CounterCells.hasLegacyShards(cell));
+    }
+
+    /**
+     * Reconciles/merges two cells, one being an update to an existing cell,
+     * yielding index updates if appropriate.
+     * <p>
+     * Note that this method assumes that the provided cells can meaningfully
+     * be reconciled together, that is that those cells are for the same row and same
+     * column (and same cell path if the column is complex).
+     * <p>
+     * Also note that which cell is provided as {@code existing} and which is
+     * provided as {@code update} matters for index updates.
+     *
+     * @param existing the pre-existing cell, the one that is updated. This can be
+     * {@code null} if this reconciliation correspond to an insertion.
+     * @param update the newly added cell, the update. This can be {@code null} out
+     * of convenience, in which case this function simply copy {@code existing} to
+     * {@code writer}.
+     * @param deletion the deletion time that applies to the cells being considered.
+     * This deletion time may delete both {@code existing} or {@code update}.
+     * @param builder the row builder to which the result of the reconciliation is written.
+     * @param nowInSec the current time in seconds (which plays a role during reconciliation
+     * because deleted cells always have precedence on timestamp equality and deciding if a
+     * cell is a live or not depends on the current time due to expiring cells).
+     *
+     * @return the timestamp delta between existing and update, or {@code Long.MAX_VALUE} if one
+     * of them is {@code null} or deleted by {@code deletion}).
+     */
+    public static long reconcile(Cell existing,
+                                 Cell update,
+                                 DeletionTime deletion,
+                                 Row.Builder builder,
+                                 int nowInSec)
+    {
+        existing = existing == null || deletion.deletes(existing) ? null : existing;
+        update = update == null || deletion.deletes(update) ? null : update;
+        if (existing == null || update == null)
+        {
+            if (update != null)
+            {
+                builder.addCell(update);
+            }
+            else if (existing != null)
+            {
+                builder.addCell(existing);
+            }
+            return Long.MAX_VALUE;
+        }
+
+        Cell reconciled = reconcile(existing, update, nowInSec);
+        builder.addCell(reconciled);
+
+        return Math.abs(existing.timestamp() - update.timestamp());
+    }
+
+    /**
+     * Reconciles/merge two cells.
+     * <p>
+     * Note that this method assumes that the provided cells can meaningfully
+     * be reconciled together, that is that cell are for the same row and same
+     * column (and same cell path if the column is complex).
+     * <p>
+     * This method is commutative over it's cells arguments: {@code reconcile(a, b, n) == reconcile(b, a, n)}.
+     *
+     * @param c1 the first cell participating in the reconciliation.
+     * @param c2 the second cell participating in the reconciliation.
+     * @param nowInSec the current time in seconds (which plays a role during reconciliation
+     * because deleted cells always have precedence on timestamp equality and deciding if a
+     * cell is a live or not depends on the current time due to expiring cells).
+     *
+     * @return a cell corresponding to the reconciliation of {@code c1} and {@code c2}.
+     * For non-counter cells, this will always be either {@code c1} or {@code c2}, but for
+     * counter cells this can be a newly allocated cell.
+     */
+    public static Cell reconcile(Cell c1, Cell c2, int nowInSec)
+    {
+        if (c1 == null)
+            return c2 == null ? null : c2;
+        if (c2 == null)
+            return c1;
+
+        if (c1.isCounterCell() || c2.isCounterCell())
+        {
+            Conflicts.Resolution res = Conflicts.resolveCounter(c1.timestamp(),
+                                                                c1.isLive(nowInSec),
+                                                                c1.value(),
+                                                                c2.timestamp(),
+                                                                c2.isLive(nowInSec),
+                                                                c2.value());
+
+            switch (res)
+            {
+                case LEFT_WINS: return c1;
+                case RIGHT_WINS: return c2;
+                default:
+                    ByteBuffer merged = Conflicts.mergeCounterValues(c1.value(), c2.value());
+                    long timestamp = Math.max(c1.timestamp(), c2.timestamp());
+
+                    // We save allocating a new cell object if it turns out that one cell was
+                    // a complete superset of the other
+                    if (merged == c1.value() && timestamp == c1.timestamp())
+                        return c1;
+                    else if (merged == c2.value() && timestamp == c2.timestamp())
+                        return c2;
+                    else // merge clocks and timestamps.
+                        return new BufferCell(c1.column(), timestamp, Cell.NO_TTL, Cell.NO_DELETION_TIME, merged, c1.path());
+            }
+        }
+
+        Conflicts.Resolution res = Conflicts.resolveRegular(c1.timestamp(),
+                                                            c1.isLive(nowInSec),
+                                                            c1.localDeletionTime(),
+                                                            c1.value(),
+                                                            c2.timestamp(),
+                                                            c2.isLive(nowInSec),
+                                                            c2.localDeletionTime(),
+                                                            c2.value());
+        assert res != Conflicts.Resolution.MERGE;
+        return res == Conflicts.Resolution.LEFT_WINS ? c1 : c2;
+    }
+
+    /**
+     * Computes the reconciliation of a complex column given its pre-existing
+     * cells and the ones it is updated with, and generating index update if
+     * appropriate.
+     * <p>
+     * Note that this method assumes that the provided cells can meaningfully
+     * be reconciled together, that is that the cells are for the same row and same
+     * complex column.
+     * <p>
+     * Also note that which cells is provided as {@code existing} and which are
+     * provided as {@code update} matters for index updates.
+     *
+     * @param column the complex column the cells are for.
+     * @param existing the pre-existing cells, the ones that are updated. This can be
+     * {@code null} if this reconciliation correspond to an insertion.
+     * @param update the newly added cells, the update. This can be {@code null} out
+     * of convenience, in which case this function simply copy the cells from
+     * {@code existing} to {@code writer}.
+     * @param deletion the deletion time that applies to the cells being considered.
+     * This deletion time may delete cells in both {@code existing} and {@code update}.
+     * @param builder the row build to which the result of the reconciliation is written.
+     * @param nowInSec the current time in seconds (which plays a role during reconciliation
+     * because deleted cells always have precedence on timestamp equality and deciding if a
+     * cell is a live or not depends on the current time due to expiring cells).
+     *
+     * @return the smallest timestamp delta between corresponding cells from existing and update. A
+     * timestamp delta being computed as the difference between a cell from {@code update} and the
+     * cell in {@code existing} having the same cell path (if such cell exists). If the intersection
+     * of cells from {@code existing} and {@code update} having the same cell path is empty, this
+     * returns {@code Long.MAX_VALUE}.
+     */
+    public static long reconcileComplex(ColumnDefinition column,
+                                        Iterator<Cell> existing,
+                                        Iterator<Cell> update,
+                                        DeletionTime deletion,
+                                        Row.Builder builder,
+                                        int nowInSec)
+    {
+        Comparator<CellPath> comparator = column.cellPathComparator();
+        Cell nextExisting = getNext(existing);
+        Cell nextUpdate = getNext(update);
+        long timeDelta = Long.MAX_VALUE;
+        while (nextExisting != null || nextUpdate != null)
+        {
+            int cmp = nextExisting == null ? 1
+                     : (nextUpdate == null ? -1
+                     : comparator.compare(nextExisting.path(), nextUpdate.path()));
+            if (cmp < 0)
+            {
+                reconcile(nextExisting, null, deletion, builder, nowInSec);
+                nextExisting = getNext(existing);
+            }
+            else if (cmp > 0)
+            {
+                reconcile(null, nextUpdate, deletion, builder, nowInSec);
+                nextUpdate = getNext(update);
+            }
+            else
+            {
+                timeDelta = Math.min(timeDelta, reconcile(nextExisting, nextUpdate, deletion, builder, nowInSec));
+                nextExisting = getNext(existing);
+                nextUpdate = getNext(update);
+            }
+        }
+        return timeDelta;
+    }
+
+    private static Cell getNext(Iterator<Cell> iterator)
+    {
+        return iterator == null || !iterator.hasNext() ? null : iterator.next();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/ColumnData.java b/src/java/org/apache/cassandra/db/rows/ColumnData.java
new file mode 100644
index 0000000..933da6a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/ColumnData.java

@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.security.MessageDigest;
+import java.util.Comparator;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.DeletionPurger;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.serializers.MarshalException;
+
+/**
+ * Generic interface for the data of a given column (inside a row).
+ *
+ * In practice, there is only 2 implementations of this: either {@link Cell} for simple columns
+ * or {@code ComplexColumnData} for complex columns.
+ */
+public abstract class ColumnData
+{
+    public static final Comparator<ColumnData> comparator = (cd1, cd2) -> cd1.column().compareTo(cd2.column());
+
+    protected final ColumnDefinition column;
+    protected ColumnData(ColumnDefinition column)
+    {
+        this.column = column;
+    }
+
+    /**
+     * The column this is data for.
+     *
+     * @return the column this is a data for.
+     */
+    public final ColumnDefinition column() { return column; }
+
+    /**
+     * The size of the data hold by this {@code ColumnData}.
+     *
+     * @return the size used by the data of this {@code ColumnData}.
+     */
+    public abstract int dataSize();
+
+    public abstract long unsharedHeapSizeExcludingData();
+
+    /**
+     * Validate the column data.
+     *
+     * @throws MarshalException if the data is not valid.
+     */
+    public abstract void validate();
+
+    /**
+     * Adds the data to the provided digest.
+     *
+     * @param digest the {@code MessageDigest} to add the data to.
+     */
+    public abstract void digest(MessageDigest digest);
+
+    /**
+     * Returns a copy of the data where all timestamps for live data have replaced by {@code newTimestamp} and
+     * all deletion timestamp by {@code newTimestamp - 1}.
+     *
+     * This exists for the Paxos path, see {@link PartitionUpdate#updateAllTimestamp} for additional details.
+     */
+    public abstract ColumnData updateAllTimestamp(long newTimestamp);
+
+    public abstract ColumnData markCounterLocalToBeCleared();
+
+    public abstract ColumnData purge(DeletionPurger purger, int nowInSec);
+
+    public abstract long maxTimestamp();
+}

diff --git a/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java b/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java
new file mode 100644
index 0000000..e768769
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java

@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.*;
+import java.util.function.BiFunction;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.DeletionPurger;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.marshal.ByteType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+/**
+ * The data for a complex column, that is it's cells and potential complex
+ * deletion time.
+ */
+public class ComplexColumnData extends ColumnData implements Iterable<Cell>
+{
+    static final Cell[] NO_CELLS = new Cell[0];
+
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new ComplexColumnData(ColumnDefinition.regularDef("", "", "", SetType.getInstance(ByteType.instance, true)), NO_CELLS, new DeletionTime(0, 0)));
+
+    // The cells for 'column' sorted by cell path.
+    private final Object[] cells;
+
+    private final DeletionTime complexDeletion;
+
+    // Only ArrayBackedRow should call this.
+    ComplexColumnData(ColumnDefinition column, Object[] cells, DeletionTime complexDeletion)
+    {
+        super(column);
+        assert column.isComplex();
+        assert cells.length > 0 || !complexDeletion.isLive();
+        this.cells = cells;
+        this.complexDeletion = complexDeletion;
+    }
+
+    public boolean hasCells()
+    {
+        return !BTree.isEmpty(cells);
+    }
+
+    public int cellsCount()
+    {
+        return BTree.size(cells);
+    }
+
+    public Cell getCell(CellPath path)
+    {
+        return (Cell) BTree.<Object>find(cells, column.asymmetricCellPathComparator(), path);
+    }
+
+    public Cell getCellByIndex(int idx)
+    {
+        return BTree.findByIndex(cells, idx);
+    }
+
+    /**
+     * The complex deletion time of the complex column.
+     * <p>
+     * The returned "complex deletion" is a deletion of all the cells of the column. For instance,
+     * for a collection, this correspond to a full collection deletion.
+     * Please note that this deletion says nothing about the individual cells of the complex column:
+     * there can be no complex deletion but some of the individual cells can be deleted.
+     *
+     * @return the complex deletion time for the column this is the data of or {@code DeletionTime.LIVE}
+     * if the column is not deleted.
+     */
+    public DeletionTime complexDeletion()
+    {
+        return complexDeletion;
+    }
+
+    public Iterator<Cell> iterator()
+    {
+        return BTree.iterator(cells);
+    }
+
+    public Iterator<Cell> reverseIterator()
+    {
+        return BTree.iterator(cells, BTree.Dir.DESC);
+    }
+
+    public int dataSize()
+    {
+        int size = complexDeletion.dataSize();
+        for (Cell cell : this)
+            size += cell.dataSize();
+        return size;
+    }
+
+    public long unsharedHeapSizeExcludingData()
+    {
+        long heapSize = EMPTY_SIZE + ObjectSizes.sizeOfArray(cells);
+        // TODO: this can be turned into a simple multiplication, at least while we have only one Cell implementation
+        for (Cell cell : this)
+            heapSize += cell.unsharedHeapSizeExcludingData();
+        return heapSize;
+    }
+
+    public void validate()
+    {
+        for (Cell cell : this)
+            cell.validate();
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        if (!complexDeletion.isLive())
+            complexDeletion.digest(digest);
+
+        for (Cell cell : this)
+            cell.digest(digest);
+    }
+
+    public ComplexColumnData markCounterLocalToBeCleared()
+    {
+        return transformAndFilter(complexDeletion, Cell::markCounterLocalToBeCleared);
+    }
+
+    public ComplexColumnData filter(ColumnFilter filter, DeletionTime activeDeletion, CFMetaData.DroppedColumn dropped)
+    {
+        ColumnFilter.Tester cellTester = filter.newTester(column);
+        if (cellTester == null && activeDeletion.isLive() && dropped == null)
+            return this;
+
+        DeletionTime newDeletion = activeDeletion.supersedes(complexDeletion) ? DeletionTime.LIVE : complexDeletion;
+        return transformAndFilter(newDeletion,
+                                  (cell) ->
+                                           (cellTester == null || cellTester.includes(cell.path()))
+                                        && !activeDeletion.deletes(cell)
+                                        && (dropped == null || cell.timestamp() > dropped.droppedTime)
+                                           ? cell : null);
+    }
+
+    public ComplexColumnData purge(DeletionPurger purger, int nowInSec)
+    {
+        DeletionTime newDeletion = complexDeletion.isLive() || purger.shouldPurge(complexDeletion) ? DeletionTime.LIVE : complexDeletion;
+        return transformAndFilter(newDeletion, (cell) -> cell.purge(purger, nowInSec));
+    }
+
+    private ComplexColumnData transformAndFilter(DeletionTime newDeletion, Function<? super Cell, ? extends Cell> function)
+    {
+        Object[] transformed = BTree.transformAndFilter(cells, function);
+
+        if (cells == transformed && newDeletion == complexDeletion)
+            return this;
+
+        if (newDeletion == DeletionTime.LIVE && BTree.isEmpty(transformed))
+            return null;
+
+        return new ComplexColumnData(column, transformed, newDeletion);
+    }
+
+    public ComplexColumnData updateAllTimestamp(long newTimestamp)
+    {
+        DeletionTime newDeletion = complexDeletion.isLive() ? complexDeletion : new DeletionTime(newTimestamp - 1, complexDeletion.localDeletionTime());
+        return transformAndFilter(newDeletion, (cell) -> (Cell) cell.updateAllTimestamp(newTimestamp));
+    }
+
+    public long maxTimestamp()
+    {
+        long timestamp = complexDeletion.markedForDeleteAt();
+        for (Cell cell : this)
+            timestamp = Math.max(timestamp, cell.timestamp());
+        return timestamp;
+    }
+
+    // This is the partner in crime of ArrayBackedRow.setValue. The exact warning apply. The short
+    // version is: "don't use that method".
+    void setValue(CellPath path, ByteBuffer value)
+    {
+        Cell current = (Cell) BTree.<Object>find(cells, column.asymmetricCellPathComparator(), path);
+        BTree.replaceInSitu(cells, column.cellComparator(), current, current.withUpdatedValue(value));
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (this == other)
+            return true;
+
+        if(!(other instanceof ComplexColumnData))
+            return false;
+
+        ComplexColumnData that = (ComplexColumnData)other;
+        return this.column().equals(that.column())
+            && this.complexDeletion().equals(that.complexDeletion)
+            && BTree.equals(this.cells, that.cells);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(column(), complexDeletion(), BTree.hashCode(cells));
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static class Builder
+    {
+        private static BiFunction<Cell, Cell, Cell> noResolve = (a, b) -> {
+            throw new IllegalStateException();
+        };
+
+        private DeletionTime complexDeletion;
+        private ColumnDefinition column;
+        private BTree.Builder<Cell> builder;
+
+        public void newColumn(ColumnDefinition column)
+        {
+            this.column = column;
+            this.complexDeletion = DeletionTime.LIVE; // default if writeComplexDeletion is not called
+            if (builder == null) builder = BTree.builder(column.cellComparator());
+            else builder.reuse(column.cellComparator());
+        }
+
+        public void addComplexDeletion(DeletionTime complexDeletion)
+        {
+            this.complexDeletion = complexDeletion;
+        }
+
+        public void addCell(Cell cell)
+        {
+            builder.add(cell);
+        }
+
+        public ComplexColumnData build()
+        {
+            if (complexDeletion.isLive() && builder.isEmpty())
+                return null;
+
+            return new ComplexColumnData(column, builder.build(), complexDeletion);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/CounterCells.java b/src/java/org/apache/cassandra/db/rows/CounterCells.java
new file mode 100644
index 0000000..732f195
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/CounterCells.java

@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import org.apache.cassandra.db.context.CounterContext;
+
+public abstract class CounterCells
+{
+    private CounterCells() {}
+
+    private static final CounterContext contextManager = CounterContext.instance();
+
+    public static boolean hasLegacyShards(Cell cell)
+    {
+        return contextManager.hasLegacyShards(cell.value());
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/EncodingStats.java b/src/java/org/apache/cassandra/db/rows/EncodingStats.java
new file mode 100644
index 0000000..955ffc7
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/EncodingStats.java

@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.PartitionStatisticsCollector;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * Stats used for the encoding of the rows and tombstones of a given source.
+ * <p>
+ * Those stats are used to optimize the on-wire and on-disk storage of rows. More precisely,
+ * the {@code minTimestamp}, {@code minLocalDeletionTime} and {@code minTTL} stats are used to
+ * delta-encode those information for the sake of vint encoding.
+ * <p>
+ * Note that due to their use, those stats can suffer to be somewhat inaccurate (the more incurrate
+ * they are, the less effective the storage will be, but provided the stats are not completly wacky,
+ * this shouldn't have too huge an impact on performance) and in fact they will not always be
+ * accurate for reasons explained in {@link SerializationHeader#make}.
+ */
+public class EncodingStats
+{
+    // Default values for the timestamp, deletion time and ttl. We use this both for NO_STATS, but also to serialize
+    // an EncodingStats. Basically, we encode the diff of each value of to these epoch, which give values with better vint encoding.
+    private static final long TIMESTAMP_EPOCH;
+    private static final int DELETION_TIME_EPOCH;
+    private static final int TTL_EPOCH = 0;
+    static
+    {
+        // We want a fixed epoch, but that provide small values when substracted from our timestamp and deletion time.
+        // So we somewhat arbitrary use the date of the summit 2015, which should hopefully roughly correspond to 3.0 release.
+        Calendar c = Calendar.getInstance(TimeZone.getTimeZone("GMT-0"), Locale.US);
+        c.set(Calendar.YEAR, 2015);
+        c.set(Calendar.MONTH, Calendar.SEPTEMBER);
+        c.set(Calendar.DAY_OF_MONTH, 22);
+        c.set(Calendar.HOUR_OF_DAY, 0);
+        c.set(Calendar.MINUTE, 0);
+        c.set(Calendar.SECOND, 0);
+        c.set(Calendar.MILLISECOND, 0);
+
+        TIMESTAMP_EPOCH = c.getTimeInMillis() * 1000; // timestamps should be in microseconds by convention
+        DELETION_TIME_EPOCH = (int)(c.getTimeInMillis() / 1000); // local deletion times are in seconds
+    }
+
+    // We should use this sparingly obviously
+    public static final EncodingStats NO_STATS = new EncodingStats(TIMESTAMP_EPOCH, DELETION_TIME_EPOCH, TTL_EPOCH);
+
+    public static final Serializer serializer = new Serializer();
+
+    public final long minTimestamp;
+    public final int minLocalDeletionTime;
+    public final int minTTL;
+
+    public EncodingStats(long minTimestamp,
+                         int minLocalDeletionTime,
+                         int minTTL)
+    {
+        // Note that the exact value of those don't impact correctness, just the efficiency of the encoding. So when we
+        // get a value for timestamp (resp. minLocalDeletionTime) that means 'no object had a timestamp' (resp. 'a local
+        // deletion time'), then what value we store for minTimestamp (resp. minLocalDeletionTime) doesn't matter, and
+        // it's thus more efficient to use our EPOCH numbers, since it will result in a guaranteed 1 byte encoding.
+
+        this.minTimestamp = minTimestamp == LivenessInfo.NO_TIMESTAMP ? TIMESTAMP_EPOCH : minTimestamp;
+        this.minLocalDeletionTime = minLocalDeletionTime == LivenessInfo.NO_EXPIRATION_TIME ? DELETION_TIME_EPOCH : minLocalDeletionTime;
+        this.minTTL = minTTL;
+    }
+
+    /**
+     * Merge this stats with another one.
+     * <p>
+     * The comments of {@link SerializationHeader#make} applies here too, i.e. the result of
+     * merging will be not totally accurate but we can live with that.
+     */
+    public EncodingStats mergeWith(EncodingStats that)
+    {
+        long minTimestamp = this.minTimestamp == TIMESTAMP_EPOCH
+                          ? that.minTimestamp
+                          : (that.minTimestamp == TIMESTAMP_EPOCH ? this.minTimestamp : Math.min(this.minTimestamp, that.minTimestamp));
+
+        int minDelTime = this.minLocalDeletionTime == DELETION_TIME_EPOCH
+                       ? that.minLocalDeletionTime
+                       : (that.minLocalDeletionTime == DELETION_TIME_EPOCH ? this.minLocalDeletionTime : Math.min(this.minLocalDeletionTime, that.minLocalDeletionTime));
+
+        int minTTL = this.minTTL == TTL_EPOCH
+                   ? that.minTTL
+                   : (that.minTTL == TTL_EPOCH ? this.minTTL : Math.min(this.minTTL, that.minTTL));
+
+        return new EncodingStats(minTimestamp, minDelTime, minTTL);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        EncodingStats that = (EncodingStats) o;
+
+        return this.minLocalDeletionTime == that.minLocalDeletionTime
+            && this.minTTL == that.minTTL
+            && this.minTimestamp == that.minTimestamp;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(minTimestamp, minLocalDeletionTime, minTTL);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("EncodingStats(ts=%d, ldt=%d, ttl=%d)", minTimestamp, minLocalDeletionTime, minTTL);
+    }
+
+    public static class Collector implements PartitionStatisticsCollector
+    {
+        private boolean isTimestampSet;
+        private long minTimestamp = Long.MAX_VALUE;
+
+        private boolean isDelTimeSet;
+        private int minDeletionTime = Integer.MAX_VALUE;
+
+        private boolean isTTLSet;
+        private int minTTL = Integer.MAX_VALUE;
+
+        public void update(LivenessInfo info)
+        {
+            if (info.isEmpty())
+                return;
+
+            updateTimestamp(info.timestamp());
+
+            if (info.isExpiring())
+            {
+                updateTTL(info.ttl());
+                updateLocalDeletionTime(info.localExpirationTime());
+            }
+        }
+
+        public void update(Cell cell)
+        {
+            updateTimestamp(cell.timestamp());
+            if (cell.isExpiring())
+            {
+                updateTTL(cell.ttl());
+                updateLocalDeletionTime(cell.localDeletionTime());
+            }
+            else if (cell.isTombstone())
+            {
+                updateLocalDeletionTime(cell.localDeletionTime());
+            }
+        }
+
+        public void update(DeletionTime deletionTime)
+        {
+            if (deletionTime.isLive())
+                return;
+
+            updateTimestamp(deletionTime.markedForDeleteAt());
+            updateLocalDeletionTime(deletionTime.localDeletionTime());
+        }
+
+        public void updateTimestamp(long timestamp)
+        {
+            isTimestampSet = true;
+            minTimestamp = Math.min(minTimestamp, timestamp);
+        }
+
+        public void updateLocalDeletionTime(int deletionTime)
+        {
+            isDelTimeSet = true;
+            minDeletionTime = Math.min(minDeletionTime, deletionTime);
+        }
+
+        public void updateTTL(int ttl)
+        {
+            isTTLSet = true;
+            minTTL = Math.min(minTTL, ttl);
+        }
+
+        public void updateColumnSetPerRow(long columnSetInRow)
+        {
+        }
+
+        public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
+        {
+            // We don't care about this but this come with PartitionStatisticsCollector
+        }
+
+        public EncodingStats get()
+        {
+            return new EncodingStats(isTimestampSet ? minTimestamp : TIMESTAMP_EPOCH,
+                                     isDelTimeSet ? minDeletionTime : DELETION_TIME_EPOCH,
+                                     isTTLSet ? minTTL : TTL_EPOCH);
+        }
+
+        public static EncodingStats collect(Row staticRow, Iterator<Row> rows, DeletionInfo deletionInfo)
+        {
+            Collector collector = new Collector();
+            deletionInfo.collectStats(collector);
+            if (!staticRow.isEmpty())
+                Rows.collectStats(staticRow, collector);
+            while (rows.hasNext())
+                Rows.collectStats(rows.next(), collector);
+            return collector.get();
+        }
+    }
+
+    public static class Serializer
+    {
+        public void serialize(EncodingStats stats, DataOutputPlus out) throws IOException
+        {
+            out.writeUnsignedVInt(stats.minTimestamp - TIMESTAMP_EPOCH);
+            out.writeUnsignedVInt(stats.minLocalDeletionTime - DELETION_TIME_EPOCH);
+            out.writeUnsignedVInt(stats.minTTL - TTL_EPOCH);
+        }
+
+        public int serializedSize(EncodingStats stats)
+        {
+            return TypeSizes.sizeofUnsignedVInt(stats.minTimestamp - TIMESTAMP_EPOCH)
+                   + TypeSizes.sizeofUnsignedVInt(stats.minLocalDeletionTime - DELETION_TIME_EPOCH)
+                   + TypeSizes.sizeofUnsignedVInt(stats.minTTL - TTL_EPOCH);
+        }
+
+        public EncodingStats deserialize(DataInputPlus in) throws IOException
+        {
+            long minTimestamp = in.readUnsignedVInt() + TIMESTAMP_EPOCH;
+            int minLocalDeletionTime = (int)in.readUnsignedVInt() + DELETION_TIME_EPOCH;
+            int minTTL = (int)in.readUnsignedVInt() + TTL_EPOCH;
+            return new EncodingStats(minTimestamp, minLocalDeletionTime, minTTL);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java
new file mode 100644
index 0000000..8ba4394
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import org.apache.cassandra.utils.AbstractIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+
+/**
+ * Abstract class to create UnfilteredRowIterator that lazily initialize themselves.
+ *
+ * This is used during partition range queries when we know the partition key but want
+ * to defer the initialization of the rest of the UnfilteredRowIterator until we need those informations.
+ * See {@link BigTableScanner#KeyScanningIterator} for instance.
+ */
+public abstract class LazilyInitializedUnfilteredRowIterator extends AbstractIterator<Unfiltered> implements UnfilteredRowIterator
+{
+    private final DecoratedKey partitionKey;
+
+    private UnfilteredRowIterator iterator;
+
+    public LazilyInitializedUnfilteredRowIterator(DecoratedKey partitionKey)
+    {
+        this.partitionKey = partitionKey;
+    }
+
+    protected abstract UnfilteredRowIterator initializeIterator();
+
+    private void maybeInit()
+    {
+        if (iterator == null)
+            iterator = initializeIterator();
+    }
+
+    public CFMetaData metadata()
+    {
+        maybeInit();
+        return iterator.metadata();
+    }
+
+    public PartitionColumns columns()
+    {
+        maybeInit();
+        return iterator.columns();
+    }
+
+    public boolean isReverseOrder()
+    {
+        maybeInit();
+        return iterator.isReverseOrder();
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return partitionKey;
+    }
+
+    public DeletionTime partitionLevelDeletion()
+    {
+        maybeInit();
+        return iterator.partitionLevelDeletion();
+    }
+
+    public Row staticRow()
+    {
+        maybeInit();
+        return iterator.staticRow();
+    }
+
+    public EncodingStats stats()
+    {
+        maybeInit();
+        return iterator.stats();
+    }
+
+    protected Unfiltered computeNext()
+    {
+        maybeInit();
+        return iterator.hasNext() ? iterator.next() : endOfData();
+    }
+
+    public void close()
+    {
+        if (iterator != null)
+            iterator.close();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/RangeTombstoneBoundMarker.java b/src/java/org/apache/cassandra/db/rows/RangeTombstoneBoundMarker.java
new file mode 100644
index 0000000..0079114
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RangeTombstoneBoundMarker.java

@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.Objects;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A range tombstone marker that indicates the bound of a range tombstone (start or end).
+ */
+public class RangeTombstoneBoundMarker extends AbstractRangeTombstoneMarker
+{
+    private final DeletionTime deletion;
+
+    public RangeTombstoneBoundMarker(RangeTombstone.Bound bound, DeletionTime deletion)
+    {
+        super(bound);
+        assert !bound.isBoundary();
+        this.deletion = deletion;
+    }
+
+    public RangeTombstoneBoundMarker(Slice.Bound bound, DeletionTime deletion)
+    {
+        this(new RangeTombstone.Bound(bound.kind(), bound.getRawValues()), deletion);
+    }
+
+    public static RangeTombstoneBoundMarker inclusiveOpen(boolean reversed, ByteBuffer[] boundValues, DeletionTime deletion)
+    {
+        RangeTombstone.Bound bound = RangeTombstone.Bound.inclusiveOpen(reversed, boundValues);
+        return new RangeTombstoneBoundMarker(bound, deletion);
+    }
+
+    public static RangeTombstoneBoundMarker exclusiveOpen(boolean reversed, ByteBuffer[] boundValues, DeletionTime deletion)
+    {
+        RangeTombstone.Bound bound = RangeTombstone.Bound.exclusiveOpen(reversed, boundValues);
+        return new RangeTombstoneBoundMarker(bound, deletion);
+    }
+
+    public static RangeTombstoneBoundMarker inclusiveClose(boolean reversed, ByteBuffer[] boundValues, DeletionTime deletion)
+    {
+        RangeTombstone.Bound bound = RangeTombstone.Bound.inclusiveClose(reversed, boundValues);
+        return new RangeTombstoneBoundMarker(bound, deletion);
+    }
+
+    public static RangeTombstoneBoundMarker exclusiveClose(boolean reversed, ByteBuffer[] boundValues, DeletionTime deletion)
+    {
+        RangeTombstone.Bound bound = RangeTombstone.Bound.exclusiveClose(reversed, boundValues);
+        return new RangeTombstoneBoundMarker(bound, deletion);
+    }
+
+    public boolean isBoundary()
+    {
+        return false;
+    }
+
+    /**
+     * The deletion time for the range tombstone this is a bound of.
+     */
+    public DeletionTime deletionTime()
+    {
+        return deletion;
+    }
+
+    public DeletionTime openDeletionTime(boolean reversed)
+    {
+        if (!isOpen(reversed))
+            throw new IllegalStateException();
+        return deletion;
+    }
+
+    public DeletionTime closeDeletionTime(boolean reversed)
+    {
+        if (isOpen(reversed))
+            throw new IllegalStateException();
+        return deletion;
+    }
+
+    public boolean openIsInclusive(boolean reversed)
+    {
+        if (!isOpen(reversed))
+            throw new IllegalStateException();
+        return bound.isInclusive();
+    }
+
+    public boolean closeIsInclusive(boolean reversed)
+    {
+        if (isOpen(reversed))
+            throw new IllegalStateException();
+        return bound.isInclusive();
+    }
+
+    public RangeTombstone.Bound openBound(boolean reversed)
+    {
+        return isOpen(reversed) ? clustering() : null;
+    }
+
+    public RangeTombstone.Bound closeBound(boolean reversed)
+    {
+        return isClose(reversed) ? clustering() : null;
+    }
+
+    public RangeTombstoneBoundMarker copy(AbstractAllocator allocator)
+    {
+        return new RangeTombstoneBoundMarker(clustering().copy(allocator), deletion);
+    }
+
+    public RangeTombstoneBoundMarker withNewOpeningDeletionTime(boolean reversed, DeletionTime newDeletionTime)
+    {
+        if (!isOpen(reversed))
+            throw new IllegalStateException();
+
+        return new RangeTombstoneBoundMarker(clustering(), newDeletionTime);
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        bound.digest(digest);
+        deletion.digest(digest);
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        return String.format("Marker %s@%d/%d", bound.toString(metadata), deletion.markedForDeleteAt(), deletion.localDeletionTime());
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if(!(other instanceof RangeTombstoneBoundMarker))
+            return false;
+
+        RangeTombstoneBoundMarker that = (RangeTombstoneBoundMarker)other;
+        return this.bound.equals(that.bound)
+            && this.deletion.equals(that.deletion);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(bound, deletion);
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/db/rows/RangeTombstoneBoundaryMarker.java b/src/java/org/apache/cassandra/db/rows/RangeTombstoneBoundaryMarker.java
new file mode 100644
index 0000000..f0f5421
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RangeTombstoneBoundaryMarker.java

@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.util.Objects;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A range tombstone marker that represents a boundary between 2 range tombstones (i.e. it closes one range and open another).
+ */
+public class RangeTombstoneBoundaryMarker extends AbstractRangeTombstoneMarker
+{
+    private final DeletionTime endDeletion;
+    private final DeletionTime startDeletion;
+
+    public RangeTombstoneBoundaryMarker(RangeTombstone.Bound bound, DeletionTime endDeletion, DeletionTime startDeletion)
+    {
+        super(bound);
+        assert bound.isBoundary();
+        this.endDeletion = endDeletion;
+        this.startDeletion = startDeletion;
+    }
+
+    public static RangeTombstoneBoundaryMarker exclusiveCloseInclusiveOpen(boolean reversed, ByteBuffer[] boundValues, DeletionTime closeDeletion, DeletionTime openDeletion)
+    {
+        RangeTombstone.Bound bound = RangeTombstone.Bound.exclusiveCloseInclusiveOpen(reversed, boundValues);
+        DeletionTime endDeletion = reversed ? openDeletion : closeDeletion;
+        DeletionTime startDeletion = reversed ? closeDeletion : openDeletion;
+        return new RangeTombstoneBoundaryMarker(bound, endDeletion, startDeletion);
+    }
+
+    public static RangeTombstoneBoundaryMarker inclusiveCloseExclusiveOpen(boolean reversed, ByteBuffer[] boundValues, DeletionTime closeDeletion, DeletionTime openDeletion)
+    {
+        RangeTombstone.Bound bound = RangeTombstone.Bound.inclusiveCloseExclusiveOpen(reversed, boundValues);
+        DeletionTime endDeletion = reversed ? openDeletion : closeDeletion;
+        DeletionTime startDeletion = reversed ? closeDeletion : openDeletion;
+        return new RangeTombstoneBoundaryMarker(bound, endDeletion, startDeletion);
+    }
+
+    /**
+     * The deletion time for the range tombstone this boundary ends (in clustering order).
+     */
+    public DeletionTime endDeletionTime()
+    {
+        return endDeletion;
+    }
+
+    /**
+     * The deletion time for the range tombstone this boundary starts (in clustering order).
+     */
+    public DeletionTime startDeletionTime()
+    {
+        return startDeletion;
+    }
+
+    public DeletionTime closeDeletionTime(boolean reversed)
+    {
+        return reversed ? startDeletion : endDeletion;
+    }
+
+    public DeletionTime openDeletionTime(boolean reversed)
+    {
+        return reversed ? endDeletion : startDeletion;
+    }
+
+    public boolean openIsInclusive(boolean reversed)
+    {
+        return (bound.kind() == ClusteringPrefix.Kind.EXCL_END_INCL_START_BOUNDARY) ^ reversed;
+    }
+
+    public RangeTombstone.Bound openBound(boolean reversed)
+    {
+        return bound.withNewKind(bound.kind().openBoundOfBoundary(reversed));
+    }
+
+    public RangeTombstone.Bound closeBound(boolean reversed)
+    {
+        return bound.withNewKind(bound.kind().closeBoundOfBoundary(reversed));
+    }
+
+    public boolean closeIsInclusive(boolean reversed)
+    {
+        return (bound.kind() == ClusteringPrefix.Kind.INCL_END_EXCL_START_BOUNDARY) ^ reversed;
+    }
+
+    public boolean isOpen(boolean reversed)
+    {
+        // A boundary always open one side
+        return true;
+    }
+
+    public boolean isClose(boolean reversed)
+    {
+        // A boundary always close one side
+        return true;
+    }
+
+    public RangeTombstoneBoundaryMarker copy(AbstractAllocator allocator)
+    {
+        return new RangeTombstoneBoundaryMarker(clustering().copy(allocator), endDeletion, startDeletion);
+    }
+
+    public RangeTombstoneBoundaryMarker withNewOpeningDeletionTime(boolean reversed, DeletionTime newDeletionTime)
+    {
+        return new RangeTombstoneBoundaryMarker(clustering(), reversed ? newDeletionTime : endDeletion, reversed ? startDeletion : newDeletionTime);
+    }
+
+    public static RangeTombstoneBoundaryMarker makeBoundary(boolean reversed, Slice.Bound close, Slice.Bound open, DeletionTime closeDeletion, DeletionTime openDeletion)
+    {
+        assert RangeTombstone.Bound.Kind.compare(close.kind(), open.kind()) == 0 : "Both bound don't form a boundary";
+        boolean isExclusiveClose = close.isExclusive() || (close.isInclusive() && open.isInclusive() && openDeletion.supersedes(closeDeletion));
+        return isExclusiveClose
+             ? exclusiveCloseInclusiveOpen(reversed, close.getRawValues(), closeDeletion, openDeletion)
+             : inclusiveCloseExclusiveOpen(reversed, close.getRawValues(), closeDeletion, openDeletion);
+    }
+
+    public RangeTombstoneBoundMarker createCorrespondingCloseMarker(boolean reversed)
+    {
+        return new RangeTombstoneBoundMarker(closeBound(reversed), closeDeletionTime(reversed));
+    }
+
+    public RangeTombstoneBoundMarker createCorrespondingOpenMarker(boolean reversed)
+    {
+        return new RangeTombstoneBoundMarker(openBound(reversed), openDeletionTime(reversed));
+    }
+
+    public void digest(MessageDigest digest)
+    {
+        bound.digest(digest);
+        endDeletion.digest(digest);
+        startDeletion.digest(digest);
+    }
+
+    public String toString(CFMetaData metadata)
+    {
+        return String.format("Marker %s@%d/%d-%d/%d",
+                             bound.toString(metadata),
+                             endDeletion.markedForDeleteAt(), endDeletion.localDeletionTime(),
+                             startDeletion.markedForDeleteAt(), startDeletion.localDeletionTime());
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if(!(other instanceof RangeTombstoneBoundaryMarker))
+            return false;
+
+        RangeTombstoneBoundaryMarker that = (RangeTombstoneBoundaryMarker)other;
+        return this.bound.equals(that.bound)
+            && this.endDeletion.equals(that.endDeletion)
+            && this.startDeletion.equals(that.startDeletion);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(bound, endDeletion, startDeletion);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java b/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java
new file mode 100644
index 0000000..dee7231
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RangeTombstoneMarker.java

@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * A marker for a range tombstone bound.
+ * <p>
+ * There is 2 types of markers: bounds (see {@link RangeTombstoneBoundMarker}) and boundaries (see {@link RangeTombstoneBoundaryMarker}).
+ */
+public interface RangeTombstoneMarker extends Unfiltered
+{
+    @Override
+    public RangeTombstone.Bound clustering();
+
+    public boolean isBoundary();
+
+    public boolean isOpen(boolean reversed);
+    public boolean isClose(boolean reversed);
+
+    public DeletionTime openDeletionTime(boolean reversed);
+    public DeletionTime closeDeletionTime(boolean reversed);
+    public boolean openIsInclusive(boolean reversed);
+    public boolean closeIsInclusive(boolean reversed);
+
+    public RangeTombstone.Bound openBound(boolean reversed);
+    public RangeTombstone.Bound closeBound(boolean reversed);
+
+    public RangeTombstoneMarker copy(AbstractAllocator allocator);
+
+    default public boolean isEmpty()
+    {
+        // There is no such thing as an empty marker
+        return false;
+    }
+
+    public RangeTombstoneMarker withNewOpeningDeletionTime(boolean reversed, DeletionTime newDeletionTime);
+
+    /**
+     * Utility class to help merging range tombstone markers coming from multiple inputs (UnfilteredRowIterators).
+     * <p>
+     * The assumption that each individual input must validate and that we must preserve in the output is that every
+     * open marker has a corresponding close marker with the exact same deletion info, and that there is no other range
+     * tombstone marker between those open and close marker (of course, they could be rows in between). In other word,
+     * for any {@code UnfilteredRowIterator}, you only ever have to remenber the last open marker (if any) to have the
+     * full picture of what is deleted by range tombstones at any given point of iterating that iterator.
+     * <p>
+     * Note that this class can merge both forward and reverse iterators. To deal with reverse, we just reverse how we
+     * deal with open and close markers (in forward order, we'll get open-close, open-close, ..., while in reverse we'll
+     * get close-open, close-open, ...).
+     */
+    public static class Merger
+    {
+        private final DeletionTime partitionDeletion;
+        private final boolean reversed;
+
+        private RangeTombstone.Bound bound;
+        private final RangeTombstoneMarker[] markers;
+
+        // For each iterator, what is the currently open marker deletion time (or null if there is no open marker on that iterator)
+        private final DeletionTime[] openMarkers;
+        // The index in openMarkers of the "biggest" marker, the one with the biggest deletion time. Is < 0 iff there is no open
+        // marker on any iterator.
+        private int biggestOpenMarker = -1;
+
+        public Merger(int size, DeletionTime partitionDeletion, boolean reversed)
+        {
+            this.partitionDeletion = partitionDeletion;
+            this.reversed = reversed;
+
+            this.markers = new RangeTombstoneMarker[size];
+            this.openMarkers = new DeletionTime[size];
+        }
+
+        public void clear()
+        {
+            Arrays.fill(markers, null);
+        }
+
+        public void add(int i, RangeTombstoneMarker marker)
+        {
+            bound = marker.clustering();
+            markers[i] = marker;
+        }
+
+        public RangeTombstoneMarker merge()
+        {
+            /*
+             * Merging of range tombstones works this way:
+             *   1) We remember what is the currently open marker in the merged stream
+             *   2) We update our internal states of what range is opened on the input streams based on the new markers to merge
+             *   3) We compute what should be the state in the merge stream after 2)
+             *   4) We return what marker should be issued on the merged stream based on the difference between the state from 1) and 3)
+             */
+
+            DeletionTime previousDeletionTimeInMerged = currentOpenDeletionTimeInMerged();
+
+            updateOpenMarkers();
+
+            DeletionTime newDeletionTimeInMerged = currentOpenDeletionTimeInMerged();
+            if (previousDeletionTimeInMerged.equals(newDeletionTimeInMerged))
+                return null;
+
+            boolean isBeforeClustering = bound.kind().comparedToClustering < 0;
+            if (reversed)
+                isBeforeClustering = !isBeforeClustering;
+
+            ByteBuffer[] values = bound.getRawValues();
+            RangeTombstoneMarker merged;
+            if (previousDeletionTimeInMerged.isLive())
+            {
+                merged = isBeforeClustering
+                       ? RangeTombstoneBoundMarker.inclusiveOpen(reversed, values, newDeletionTimeInMerged)
+                       : RangeTombstoneBoundMarker.exclusiveOpen(reversed, values, newDeletionTimeInMerged);
+            }
+            else if (newDeletionTimeInMerged.isLive())
+            {
+                merged = isBeforeClustering
+                       ? RangeTombstoneBoundMarker.exclusiveClose(reversed, values, previousDeletionTimeInMerged)
+                       : RangeTombstoneBoundMarker.inclusiveClose(reversed, values, previousDeletionTimeInMerged);
+            }
+            else
+            {
+                merged = isBeforeClustering
+                       ? RangeTombstoneBoundaryMarker.exclusiveCloseInclusiveOpen(reversed, values, previousDeletionTimeInMerged, newDeletionTimeInMerged)
+                       : RangeTombstoneBoundaryMarker.inclusiveCloseExclusiveOpen(reversed, values, previousDeletionTimeInMerged, newDeletionTimeInMerged);
+            }
+
+            return merged;
+        }
+
+        public RangeTombstoneMarker[] mergedMarkers()
+        {
+            return markers;
+        }
+
+        private DeletionTime currentOpenDeletionTimeInMerged()
+        {
+            if (biggestOpenMarker < 0)
+                return DeletionTime.LIVE;
+
+            DeletionTime biggestDeletionTime = openMarkers[biggestOpenMarker];
+            // it's only open in the merged iterator if it's not shadowed by the partition level deletion
+            return partitionDeletion.supersedes(biggestDeletionTime) ? DeletionTime.LIVE : biggestDeletionTime;
+        }
+
+        private void updateOpenMarkers()
+        {
+            for (int i = 0; i < markers.length; i++)
+            {
+                RangeTombstoneMarker marker = markers[i];
+                if (marker == null)
+                    continue;
+
+                // Note that we can have boundaries that are both open and close, but in that case all we care about
+                // is what it the open deletion after the marker, so we favor the opening part in this case.
+                if (marker.isOpen(reversed))
+                    openMarkers[i] = marker.openDeletionTime(reversed);
+                else
+                    openMarkers[i] = null;
+            }
+
+            // Recompute what is now the biggest open marker
+            biggestOpenMarker = -1;
+            for (int i = 0; i < openMarkers.length; i++)
+            {
+                if (openMarkers[i] != null && (biggestOpenMarker < 0 || openMarkers[i].supersedes(openMarkers[biggestOpenMarker])))
+                    biggestOpenMarker = i;
+            }
+        }
+
+        public DeletionTime activeDeletion()
+        {
+            DeletionTime openMarker = currentOpenDeletionTimeInMerged();
+            // We only have an open marker in the merged stream if it's not shadowed by the partition deletion (which can be LIVE itself), so
+            // if have an open marker, we know it's the "active" deletion for the merged stream.
+            return openMarker.isLive() ? partitionDeletion : openMarker;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/Row.java b/src/java/org/apache/cassandra/db/rows/Row.java
new file mode 100644
index 0000000..dcb78f3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/Row.java

@@ -0,0 +1,743 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.*;
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.service.paxos.Commit;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MergeIterator;
+import org.apache.cassandra.utils.SearchIterator;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+
+/**
+ * Storage engine representation of a row.
+ *
+ * A row mainly contains the following informations:
+ *   1) Its {@code Clustering}, which holds the values for the clustering columns identifying the row.
+ *   2) Its row level informations: the primary key liveness infos and the row deletion (see
+ *      {@link #primaryKeyLivenessInfo()} and {@link #deletion()} for more details).
+ *   3) Data for the columns it contains, or in other words, it's a (sorted) collection of
+ *      {@code ColumnData}.
+ *
+ * Also note that as for every other storage engine object, a {@code Row} object cannot shadow
+ * it's own data. For instance, a {@code Row} cannot contains a cell that is deleted by its own
+ * row deletion.
+ */
+public interface Row extends Unfiltered, Iterable<ColumnData>
+{
+    /**
+     * The clustering values for this row.
+     */
+    @Override
+    public Clustering clustering();
+
+    /**
+     * An in-natural-order collection of the columns for which data (incl. simple tombstones)
+     * is present in this row.
+     */
+    public Collection<ColumnDefinition> columns();
+
+
+    /**
+     * The number of columns for which data (incl. simple tombstones) is present in this row.
+     */
+    public int columnCount();
+
+    /**
+     * The row deletion.
+     *
+     * This correspond to the last row deletion done on this row.
+     *
+     * @return the row deletion.
+     */
+    public Deletion deletion();
+
+    /**
+     * Liveness information for the primary key columns of this row.
+     * <p>
+     * As a row is uniquely identified by its primary key, all its primary key columns
+     * share the same {@code LivenessInfo}. This liveness information is what allows us
+     * to distinguish between a dead row (it has no live cells and its primary key liveness
+     * info is empty) and a live row but where all non PK columns are null (it has no
+     * live cells, but its primary key liveness is not empty). Please note that the liveness
+     * info (including it's eventually ttl/local deletion time) only apply to the primary key
+     * columns and has no impact on the row content.
+     * <p>
+     * Note in particular that a row may have live cells but no PK liveness info, because the
+     * primary key liveness informations are only set on {@code INSERT} (which makes sense
+     * in itself, see #6782) but live cells can be added through {@code UPDATE} even if the row
+     * wasn't pre-existing (which users are encouraged not to do, but we can't validate).
+     */
+    public LivenessInfo primaryKeyLivenessInfo();
+
+    /**
+     * Whether the row correspond to a static row or not.
+     *
+     * @return whether the row correspond to a static row or not.
+     */
+    public boolean isStatic();
+
+    /**
+     * Whether the row has no information whatsoever. This means no PK liveness info, no row
+     * deletion, no cells and no complex deletion info.
+     *
+     * @return {@code true} if the row has no data, {@code false} otherwise.
+     */
+    public boolean isEmpty();
+
+    /**
+     * Whether the row has some live information (i.e. it's not just deletion informations).
+     * 
+     * @param nowInSec the current time to decide what is deleted and what isn't
+     * @param enforceStrictLiveness whether the row should be purged if there is no PK liveness info,
+     *                              normally retrieved from {@link CFMetaData#enforceStrictLiveness()}
+     * @return true if there is some live information
+     */
+    public boolean hasLiveData(int nowInSec, boolean enforceStrictLiveness);
+
+    /**
+     * Returns a cell for a simple column.
+     *
+     * @param c the simple column for which to fetch the cell.
+     * @return the corresponding cell or {@code null} if the row has no such cell.
+     */
+    public Cell getCell(ColumnDefinition c);
+
+    /**
+     * Return a cell for a given complex column and cell path.
+     *
+     * @param c the complex column for which to fetch the cell.
+     * @param path the cell path for which to fetch the cell.
+     * @return the corresponding cell or {@code null} if the row has no such cell.
+     */
+    public Cell getCell(ColumnDefinition c, CellPath path);
+
+    /**
+     * The data for a complex column.
+     * <p>
+     * The returned object groups all the cells for the column, as well as it's complex deletion (if relevant).
+     *
+     * @param c the complex column for which to return the complex data.
+     * @return the data for {@code c} or {@code null} if the row has no data for this column.
+     */
+    public ComplexColumnData getComplexColumnData(ColumnDefinition c);
+
+    /**
+     * An iterable over the cells of this row.
+     * <p>
+     * The iterable guarantees that cells are returned in order of {@link Cell#comparator}.
+     *
+     * @return an iterable over the cells of this row.
+     */
+    public Iterable<Cell> cells();
+
+    /**
+     * A collection of the ColumnData representation of this row, for columns with some data (possibly not live) present
+     * <p>
+     * The data is returned in column order.
+     *
+     * @return a Collection of the non-empty ColumnData for this row.
+     */
+    public Collection<ColumnData> columnData();
+
+    /**
+     * An iterable over the cells of this row that return cells in "legacy order".
+     * <p>
+     * In 3.0+, columns are sorted so that all simple columns are before all complex columns. Previously
+     * however, the cells where just sorted by the column name. This iterator return cells in that
+     * legacy order. It's only ever meaningful for backward/thrift compatibility code.
+     *
+     * @param metadata the table this is a row of.
+     * @param reversed if cells should returned in reverse order.
+     * @return an iterable over the cells of this row in "legacy order".
+     */
+    public Iterable<Cell> cellsInLegacyOrder(CFMetaData metadata, boolean reversed);
+
+    /**
+     * Whether the row stores any (non-live) complex deletion for any complex column.
+     */
+    public boolean hasComplexDeletion();
+
+    /**
+     * Whether the row stores any (non-RT) data for any complex column.
+     */
+    boolean hasComplex();
+
+    /**
+     * Whether the row has any deletion info (row deletion, cell tombstone, expired cell or complex deletion).
+     *
+     * @param nowInSec the current time in seconds to decid if a cell is expired.
+     */
+    public boolean hasDeletion(int nowInSec);
+
+    /**
+     * An iterator to efficiently search data for a given column.
+     *
+     * @return a search iterator for the cells of this row.
+     */
+    public SearchIterator<ColumnDefinition, ColumnData> searchIterator();
+
+    /**
+     * Returns a copy of this row that:
+     *   1) only includes the data for the column included by {@code filter}.
+     *   2) doesn't include any data that belongs to a dropped column (recorded in {@code metadata}).
+     */
+    public Row filter(ColumnFilter filter, CFMetaData metadata);
+
+    /**
+     * Returns a copy of this row that:
+     *   1) only includes the data for the column included by {@code filter}.
+     *   2) doesn't include any data that belongs to a dropped column (recorded in {@code metadata}).
+     *   3) doesn't include any data that is shadowed/deleted by {@code activeDeletion}.
+     *   4) uses {@code activeDeletion} as row deletion iff {@code setActiveDeletionToRow} and {@code activeDeletion} supersedes the row deletion.
+     */
+    public Row filter(ColumnFilter filter, DeletionTime activeDeletion, boolean setActiveDeletionToRow, CFMetaData metadata);
+
+    /**
+     * Returns a copy of this row without any deletion info that should be purged according to {@code purger}.
+     *
+     * @param purger the {@code DeletionPurger} to use to decide what can be purged.
+     * @param nowInSec the current time to decide what is deleted and what isn't (in the case of expired cells).
+     * @param enforceStrictLiveness whether the row should be purged if there is no PK liveness info,
+     *                              normally retrieved from {@link CFMetaData#enforceStrictLiveness()}
+     *
+     *        When enforceStrictLiveness is set, rows with empty PK liveness info
+     *        and no row deletion are purged.
+     *
+     *        Currently this is only used by views with normal base column as PK column
+     *        so updates to other base columns do not make the row live when the PK column
+     *        is not live. See CASSANDRA-11500.
+     *
+     * @return this row but without any deletion info purged by {@code purger}. If the purged row is empty, returns
+     * {@code null}.
+     */
+    public Row purge(DeletionPurger purger, int nowInSec, boolean enforceStrictLiveness);
+
+    /**
+     * Returns a copy of this row where all counter cells have they "local" shard marked for clearing.
+     */
+    public Row markCounterLocalToBeCleared();
+
+    /**
+     * Returns a copy of this row where all live timestamp have been replaced by {@code newTimestamp} and every deletion
+     * timestamp by {@code newTimestamp - 1}.
+     *
+     * @param newTimestamp the timestamp to use for all live data in the returned row.
+     * @return a copy of this row with timestamp updated using {@code newTimestamp}. This can return {@code null} in the
+     * rare where the row only as a shadowable row deletion and the new timestamp supersedes it.
+     *
+     * @see Commit for why we need this.
+     */
+    public Row updateAllTimestamp(long newTimestamp);
+
+    /**
+     * Returns a copy of this row with the new deletion as row deletion if it is more recent
+     * than the current row deletion.
+     * <p>
+     * WARNING: this method <b>does not</b> check that nothing in the row is shadowed by the provided
+     * deletion and if that is the case, the created row will be <b>invalid</b>. It is thus up to the
+     * caller to verify that this is not the case and the only reasonable use case of this is probably
+     * when the row and the deletion comes from the same {@code UnfilteredRowIterator} since that gives
+     * use this guarantee.
+     */
+    public Row withRowDeletion(DeletionTime deletion);
+
+    public int dataSize();
+
+    public long unsharedHeapSizeExcludingData();
+
+    public String toString(CFMetaData metadata, boolean fullDetails);
+
+    /**
+     * A row deletion/tombstone.
+     * <p>
+     * A row deletion mostly consists of the time of said deletion, but there is 2 variants: shadowable
+     * and regular row deletion.
+     * <p>
+     * A shadowable row deletion only exists if the row has no timestamp. In other words, the deletion is only
+     * valid as long as no newer insert is done (thus setting a row timestap; note that if the row timestamp set
+     * is lower than the deletion, it is shadowed (and thus ignored) as usual).
+     * <p>
+     * That is, if a row has a shadowable deletion with timestamp A and an update is madeto that row with a
+     * timestamp B such that B > A (and that update sets the row timestamp), then the shadowable deletion is 'shadowed'
+     * by that update. A concrete consequence is that if said update has cells with timestamp lower than A, then those
+     * cells are preserved(since the deletion is removed), and this contrarily to a normal (regular) deletion where the
+     * deletion is preserved and such cells are removed.
+     * <p>
+     * Currently, the only use of shadowable row deletions is Materialized Views, see CASSANDRA-10261.
+     */
+    public static class Deletion
+    {
+        public static final Deletion LIVE = new Deletion(DeletionTime.LIVE, false);
+
+        private final DeletionTime time;
+        private final boolean isShadowable;
+
+        public Deletion(DeletionTime time, boolean isShadowable)
+        {
+            assert !time.isLive() || !isShadowable;
+            this.time = time;
+            this.isShadowable = isShadowable;
+        }
+
+        public static Deletion regular(DeletionTime time)
+        {
+            return time.isLive() ? LIVE : new Deletion(time, false);
+        }
+
+        @Deprecated
+        public static Deletion shadowable(DeletionTime time)
+        {
+            return new Deletion(time, true);
+        }
+
+        /**
+         * The time of the row deletion.
+         *
+         * @return the time of the row deletion.
+         */
+        public DeletionTime time()
+        {
+            return time;
+        }
+
+        /**
+         * Whether the deletion is a shadowable one or not.
+         *
+         * @return whether the deletion is a shadowable one. Note that if {@code isLive()}, then this is
+         * guarantee to return {@code false}.
+         */
+        public boolean isShadowable()
+        {
+            return isShadowable;
+        }
+
+        /**
+         * Wether the deletion is live or not, that is if its an actual deletion or not.
+         *
+         * @return {@code true} if this represents no deletion of the row, {@code false} if that's an actual
+         * deletion.
+         */
+        public boolean isLive()
+        {
+            return time().isLive();
+        }
+
+        public boolean supersedes(DeletionTime that)
+        {
+            return time.supersedes(that);
+        }
+
+        public boolean supersedes(Deletion that)
+        {
+            return time.supersedes(that.time);
+        }
+
+        public boolean isShadowedBy(LivenessInfo primaryKeyLivenessInfo)
+        {
+            return isShadowable && primaryKeyLivenessInfo.timestamp() > time.markedForDeleteAt();
+        }
+
+        public boolean deletes(LivenessInfo info)
+        {
+            return time.deletes(info);
+        }
+
+        public boolean deletes(Cell cell)
+        {
+            return time.deletes(cell);
+        }
+
+        public void digest(MessageDigest digest)
+        {
+            time.digest(digest);
+            FBUtilities.updateWithBoolean(digest, isShadowable);
+        }
+
+        public int dataSize()
+        {
+            return time.dataSize() + 1;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if(!(o instanceof Deletion))
+                return false;
+            Deletion that = (Deletion)o;
+            return this.time.equals(that.time) && this.isShadowable == that.isShadowable;
+        }
+
+        @Override
+        public final int hashCode()
+        {
+            return Objects.hash(time, isShadowable);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s%s", time, isShadowable ? "(shadowable)" : "");
+        }
+    }
+
+    /**
+     * Interface for building rows.
+     * <p>
+     * The builder of a row should always abid to the following rules:
+     *   1) {@link #newRow} is always called as the first thing for the row.
+     *   2) {@link #addPrimaryKeyLivenessInfo} and {@link #addRowDeletion}, if called, are called before
+     *      any {@link #addCell}/{@link #addComplexDeletion} call.
+     *   3) {@link #build} is called to construct the new row. The builder can then be reused.
+     *
+     * There is 2 variants of a builder: sorted and unsorted ones. A sorted builder expects user to abid to the
+     * following additional rules:
+     *   4) Calls to {@link #addCell}/{@link #addComplexDeletion} are done in strictly increasing column order.
+     *      In other words, all calls to these methods for a give column {@code c} are done after any call for
+     *      any column before {@code c} and before any call for any column after {@code c}.
+     *   5) Calls to {@link #addCell} are further done in strictly increasing cell order (the one defined by
+     *      {@link Cell#comparator}. That is, for a give column, cells are passed in {@code CellPath} order.
+     *   6) No shadowed data should be added. Concretely, this means that if a a row deletion is added, it doesn't
+     *      deletes the row timestamp or any cell added later, and similarly no cell added is deleted by the complex
+     *      deletion of the column this is a cell of.
+     *
+     * An unsorted builder will not expect those last rules however: {@link #addCell} and {@link #addComplexDeletion}
+     * can be done in any order. And in particular unsorted builder allows multiple calls for the same column/cell. In
+     * that latter case, the result will follow the usual reconciliation rules (so equal cells are reconciled with
+     * {@link Cells#reconcile} and the "biggest" of multiple complex deletion for the same column wins).
+     */
+    public interface Builder
+    {
+        /**
+         * Creates a copy of this {@code Builder}.
+         * @return a copy of this {@code Builder}
+         */
+        public Builder copy();
+
+        /**
+         * Whether the builder is a sorted one or not.
+         *
+         * @return if the builder requires calls to be done in sorted order or not (see above).
+         */
+        public boolean isSorted();
+
+        /**
+         * Prepares the builder to build a new row of clustering {@code clustering}.
+         * <p>
+         * This should always be the first call for a given row.
+         *
+         * @param clustering the clustering for the new row.
+         */
+        public void newRow(Clustering clustering);
+
+        /**
+         * The clustering for the row that is currently being built.
+         *
+         * @return the clustering for the row that is currently being built, or {@code null} if {@link #newRow} hasn't
+         * yet been called.
+         */
+        public Clustering clustering();
+
+        /**
+         * Adds the liveness information for the primary key columns of this row.
+         *
+         * This call is optional (skipping it is equivalent to calling {@code addPartitionKeyLivenessInfo(LivenessInfo.NONE)}).
+         *
+         * @param info the liveness information for the primary key columns of the built row.
+         */
+        public void addPrimaryKeyLivenessInfo(LivenessInfo info);
+
+        /**
+         * Adds the deletion information for this row.
+         *
+         * This call is optional and can be skipped if the row is not deleted.
+         *
+         * @param deletion the row deletion time, or {@code Deletion.LIVE} if the row isn't deleted.
+         */
+        public void addRowDeletion(Deletion deletion);
+
+        /**
+         * Adds a cell to this builder.
+         *
+         * @param cell the cell to add.
+         */
+        public void addCell(Cell cell);
+
+        /**
+         * Adds a complex deletion.
+         *
+         * @param column the column for which to add the {@code complexDeletion}.
+         * @param complexDeletion the complex deletion time to add.
+         */
+        public void addComplexDeletion(ColumnDefinition column, DeletionTime complexDeletion);
+
+        /**
+         * Builds and return built row.
+         *
+         * @return the last row built by this builder.
+         */
+        public Row build();
+    }
+
+    /**
+     * Utility class to help merging rows from multiple inputs (UnfilteredRowIterators).
+     */
+    public static class Merger
+    {
+        private final Row[] rows;
+        private final List<Iterator<ColumnData>> columnDataIterators;
+
+        private Clustering clustering;
+        private int rowsToMerge;
+        private int lastRowSet = -1;
+
+        private final List<ColumnData> dataBuffer = new ArrayList<>();
+        private final ColumnDataReducer columnDataReducer;
+
+        public Merger(int size, int nowInSec, boolean hasComplex)
+        {
+            this.rows = new Row[size];
+            this.columnDataIterators = new ArrayList<>(size);
+            this.columnDataReducer = new ColumnDataReducer(size, nowInSec, hasComplex);
+        }
+
+        public void clear()
+        {
+            dataBuffer.clear();
+            Arrays.fill(rows, null);
+            columnDataIterators.clear();
+            rowsToMerge = 0;
+            lastRowSet = -1;
+        }
+
+        public void add(int i, Row row)
+        {
+            clustering = row.clustering();
+            rows[i] = row;
+            ++rowsToMerge;
+            lastRowSet = i;
+        }
+
+        public Row merge(DeletionTime activeDeletion)
+        {
+            // If for this clustering we have only one row version and have no activeDeletion (i.e. nothing to filter out),
+            // then we can just return that single row
+            if (rowsToMerge == 1 && activeDeletion.isLive())
+            {
+                Row row = rows[lastRowSet];
+                assert row != null;
+                return row;
+            }
+
+            LivenessInfo rowInfo = LivenessInfo.EMPTY;
+            Deletion rowDeletion = Deletion.LIVE;
+            for (Row row : rows)
+            {
+                if (row == null)
+                    continue;
+
+                if (row.primaryKeyLivenessInfo().supersedes(rowInfo))
+                    rowInfo = row.primaryKeyLivenessInfo();
+                if (row.deletion().supersedes(rowDeletion))
+                    rowDeletion = row.deletion();
+            }
+
+            if (rowDeletion.isShadowedBy(rowInfo))
+                rowDeletion = Deletion.LIVE;
+
+            if (rowDeletion.supersedes(activeDeletion))
+                activeDeletion = rowDeletion.time();
+            else
+                rowDeletion = Deletion.LIVE;
+
+            if (activeDeletion.deletes(rowInfo))
+                rowInfo = LivenessInfo.EMPTY;
+
+            for (Row row : rows)
+                columnDataIterators.add(row == null ? Collections.emptyIterator() : row.iterator());
+
+            columnDataReducer.setActiveDeletion(activeDeletion);
+            Iterator<ColumnData> merged = MergeIterator.get(columnDataIterators, ColumnData.comparator, columnDataReducer);
+            while (merged.hasNext())
+            {
+                ColumnData data = merged.next();
+                if (data != null)
+                    dataBuffer.add(data);
+            }
+
+            // Because some data might have been shadowed by the 'activeDeletion', we could have an empty row
+            return rowInfo.isEmpty() && rowDeletion.isLive() && dataBuffer.isEmpty()
+                 ? null
+                 : BTreeRow.create(clustering, rowInfo, rowDeletion, BTree.build(dataBuffer, UpdateFunction.<ColumnData>noOp()));
+        }
+
+        public Clustering mergedClustering()
+        {
+            return clustering;
+        }
+
+        public Row[] mergedRows()
+        {
+            return rows;
+        }
+
+        private static class ColumnDataReducer extends MergeIterator.Reducer<ColumnData, ColumnData>
+        {
+            private final int nowInSec;
+
+            private ColumnDefinition column;
+            private final List<ColumnData> versions;
+
+            private DeletionTime activeDeletion;
+
+            private final ComplexColumnData.Builder complexBuilder;
+            private final List<Iterator<Cell>> complexCells;
+            private final CellReducer cellReducer;
+
+            public ColumnDataReducer(int size, int nowInSec, boolean hasComplex)
+            {
+                this.nowInSec = nowInSec;
+                this.versions = new ArrayList<>(size);
+                this.complexBuilder = hasComplex ? ComplexColumnData.builder() : null;
+                this.complexCells = hasComplex ? new ArrayList<>(size) : null;
+                this.cellReducer = new CellReducer(nowInSec);
+            }
+
+            public void setActiveDeletion(DeletionTime activeDeletion)
+            {
+                this.activeDeletion = activeDeletion;
+            }
+
+            public void reduce(int idx, ColumnData data)
+            {
+                if (useColumnDefinition(data.column()))
+                    column = data.column();
+
+                versions.add(data);
+            }
+
+            /**
+             * Determines it the {@code ColumnDefinition} is the one that should be used.
+             * @param dataColumn the {@code ColumnDefinition} to use.
+             * @return {@code true} if the {@code ColumnDefinition} is the one that should be used, {@code false} otherwise.
+             */
+            private boolean useColumnDefinition(ColumnDefinition dataColumn)
+            {
+                if (column == null)
+                    return true;
+
+                return AbstractTypeVersionComparator.INSTANCE.compare(column.type, dataColumn.type) < 0;
+            }
+
+            protected ColumnData getReduced()
+            {
+                if (column.isSimple())
+                {
+                    Cell merged = null;
+                    for (ColumnData data : versions)
+                    {
+                        Cell cell = (Cell)data;
+                        if (!activeDeletion.deletes(cell))
+                            merged = merged == null ? cell : Cells.reconcile(merged, cell, nowInSec);
+                    }
+                    return merged;
+                }
+                else
+                {
+                    complexBuilder.newColumn(column);
+                    complexCells.clear();
+                    DeletionTime complexDeletion = DeletionTime.LIVE;
+                    for (ColumnData data : versions)
+                    {
+                        ComplexColumnData cd = (ComplexColumnData)data;
+                        if (cd.complexDeletion().supersedes(complexDeletion))
+                            complexDeletion = cd.complexDeletion();
+                        complexCells.add(cd.iterator());
+                    }
+
+                    if (complexDeletion.supersedes(activeDeletion))
+                    {
+                        cellReducer.setActiveDeletion(complexDeletion);
+                        complexBuilder.addComplexDeletion(complexDeletion);
+                    }
+                    else
+                    {
+                        cellReducer.setActiveDeletion(activeDeletion);
+                    }
+
+                    Iterator<Cell> cells = MergeIterator.get(complexCells, Cell.comparator, cellReducer);
+                    while (cells.hasNext())
+                    {
+                        Cell merged = cells.next();
+                        if (merged != null)
+                            complexBuilder.addCell(merged);
+                    }
+                    return complexBuilder.build();
+                }
+            }
+
+            protected void onKeyChange()
+            {
+                column = null;
+                versions.clear();
+            }
+        }
+
+        private static class CellReducer extends MergeIterator.Reducer<Cell, Cell>
+        {
+            private final int nowInSec;
+
+            private DeletionTime activeDeletion;
+            private Cell merged;
+
+            public CellReducer(int nowInSec)
+            {
+                this.nowInSec = nowInSec;
+            }
+
+            public void setActiveDeletion(DeletionTime activeDeletion)
+            {
+                this.activeDeletion = activeDeletion;
+                onKeyChange();
+            }
+
+            public void reduce(int idx, Cell cell)
+            {
+                if (!activeDeletion.deletes(cell))
+                    merged = merged == null ? cell : Cells.reconcile(merged, cell, nowInSec);
+            }
+
+            protected Cell getReduced()
+            {
+                return merged;
+            }
+
+            protected void onKeyChange()
+            {
+                merged = null;
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/RowAndDeletionMergeIterator.java b/src/java/org/apache/cassandra/db/rows/RowAndDeletionMergeIterator.java
new file mode 100644
index 0000000..d47bd8c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RowAndDeletionMergeIterator.java

@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.Comparator;
+import java.util.Iterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+
+/**
+ * An iterator that merges a source of rows with the range tombstone and partition level deletion of a give partition.
+ * <p>
+ * This is used by our {@code Partition} implementations to produce a {@code UnfilteredRowIterator} by merging the rows
+ * and deletion infos that are kept separate. This has also 2 additional role:
+ *   1) this make sure the row returned only includes the columns selected for the resulting iterator.
+ *   2) this (optionally) remove any data that can be shadowed (see commet on 'removeShadowedData' below for more details)
+ */
+public class RowAndDeletionMergeIterator extends AbstractUnfilteredRowIterator
+{
+    // For some of our Partition implementation, we can't guarantee that the deletion information (partition level
+    // deletion and range tombstones) don't shadow data in the rows. If that is the case, this class also take
+    // cares of skipping such shadowed data (since it is the contract of an UnfilteredRowIterator that it doesn't
+    // shadow its own data). Sometimes however, we know this can't happen, in which case we can skip that step.
+    private final boolean removeShadowedData;
+    private final Comparator<Clusterable> comparator;
+    private final ColumnFilter selection;
+
+    private final Iterator<Row> rows;
+    private Row nextRow;
+
+    private final Iterator<RangeTombstone> ranges;
+    private RangeTombstone nextRange;
+
+    // The currently open tombstone. Note that unless this is null, there is no point in checking nextRange.
+    private RangeTombstone openRange;
+
+    public RowAndDeletionMergeIterator(CFMetaData metadata,
+                                       DecoratedKey partitionKey,
+                                       DeletionTime partitionLevelDeletion,
+                                       ColumnFilter selection,
+                                       Row staticRow,
+                                       boolean isReversed,
+                                       EncodingStats stats,
+                                       Iterator<Row> rows,
+                                       Iterator<RangeTombstone> ranges,
+                                       boolean removeShadowedData)
+    {
+        super(metadata, partitionKey, partitionLevelDeletion, selection.fetchedColumns(), staticRow, isReversed, stats);
+        this.comparator = isReversed ? metadata.comparator.reversed() : metadata.comparator;
+        this.selection = selection;
+        this.removeShadowedData = removeShadowedData;
+        this.rows = rows;
+        this.ranges = ranges;
+    }
+
+    private Unfiltered computeNextInternal()
+    {
+        while (true)
+        {
+            updateNextRow();
+            if (nextRow == null)
+            {
+                if (openRange != null)
+                    return closeOpenedRange();
+
+                updateNextRange();
+                return nextRange == null ? endOfData() : openRange();
+            }
+
+            // We have a next row
+
+            if (openRange == null)
+            {
+                // We have no currently open tombstone range. So check if we have a next range and if it sorts before this row.
+                // If it does, the opening of that range should go first. Otherwise, the row goes first.
+                updateNextRange();
+                if (nextRange != null && comparator.compare(openBound(nextRange), nextRow.clustering()) < 0)
+                    return openRange();
+
+                Row row = consumeNextRow();
+                // it's possible for the row to be fully shadowed by the current range tombstone
+                if (row != null)
+                    return row;
+            }
+            else
+            {
+                // We have both a next row and a currently opened tombstone. Check which goes first between the range closing and the row.
+                if (comparator.compare(closeBound(openRange), nextRow.clustering()) < 0)
+                    return closeOpenedRange();
+
+                Row row = consumeNextRow();
+                if (row != null)
+                    return row;
+            }
+        }
+    }
+
+    /**
+     * RangeTombstoneList doesn't correctly merge multiple superseded rts, or overlapping rts with the
+     * same ts. This causes it to emit noop boundary markers which can cause unneeded read repairs and
+     * repair over streaming. This should technically be fixed in RangeTombstoneList. However, fixing
+     * it isn't trivial and that class is already so complicated that the fix would have a good chance
+     * of adding a worse bug. So we just swallow the noop boundary markers here. See CASSANDRA-14894
+     */
+    private static boolean shouldSkip(Unfiltered unfiltered)
+    {
+        if (unfiltered == null || !unfiltered.isRangeTombstoneMarker())
+            return false;
+
+        RangeTombstoneMarker marker = (RangeTombstoneMarker) unfiltered;
+
+        if (!marker.isBoundary())
+            return false;
+
+        DeletionTime open = marker.openDeletionTime(false);
+        DeletionTime close = marker.closeDeletionTime(false);
+
+        return open.equals(close);
+
+    }
+
+    @Override
+    protected Unfiltered computeNext()
+    {
+        while (true)
+        {
+            Unfiltered next = computeNextInternal();
+
+            if (shouldSkip(next))
+                continue;
+
+            return next;
+        }
+    }
+
+    private void updateNextRow()
+    {
+        if (nextRow == null && rows.hasNext())
+            nextRow = rows.next();
+    }
+
+    private void updateNextRange()
+    {
+        while (nextRange == null && ranges.hasNext())
+        {
+            nextRange = ranges.next();
+            if ((removeShadowedData && partitionLevelDeletion().supersedes(nextRange.deletionTime()))
+                || nextRange.deletedSlice().isEmpty(metadata.comparator))
+                nextRange = null;
+        }
+    }
+
+    private Row consumeNextRow()
+    {
+        Row row = nextRow;
+        nextRow = null;
+        if (!removeShadowedData)
+            return row.filter(selection, metadata());
+
+        DeletionTime activeDeletion = openRange == null ? partitionLevelDeletion() : openRange.deletionTime();
+        return row.filter(selection, activeDeletion, false, metadata());
+    }
+
+    private RangeTombstone consumeNextRange()
+    {
+        RangeTombstone range = nextRange;
+        nextRange = null;
+        return range;
+    }
+
+    private RangeTombstone consumeOpenRange()
+    {
+        RangeTombstone range = openRange;
+        openRange = null;
+        return range;
+    }
+
+    private Slice.Bound openBound(RangeTombstone range)
+    {
+        return range.deletedSlice().open(isReverseOrder());
+    }
+
+    private Slice.Bound closeBound(RangeTombstone range)
+    {
+        return range.deletedSlice().close(isReverseOrder());
+    }
+
+    private RangeTombstoneMarker closeOpenedRange()
+    {
+        // Check if that close if actually a boundary between markers
+        updateNextRange();
+        RangeTombstoneMarker marker;
+        if (nextRange != null && comparator.compare(closeBound(openRange), openBound(nextRange)) == 0)
+        {
+            marker = RangeTombstoneBoundaryMarker.makeBoundary(isReverseOrder(), closeBound(openRange), openBound(nextRange), openRange.deletionTime(), nextRange.deletionTime());
+            openRange = consumeNextRange();
+        }
+        else
+        {
+            RangeTombstone toClose = consumeOpenRange();
+            marker = new RangeTombstoneBoundMarker(closeBound(toClose), toClose.deletionTime());
+        }
+        return marker;
+    }
+
+    private RangeTombstoneMarker openRange()
+    {
+        assert openRange == null && nextRange != null;
+        openRange = consumeNextRange();
+        return new RangeTombstoneBoundMarker(openBound(openRange), openRange.deletionTime());
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/db/rows/RowDiffListener.java b/src/java/org/apache/cassandra/db/rows/RowDiffListener.java
new file mode 100644
index 0000000..ec848a0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RowDiffListener.java

@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+
+/**
+ * Interface that allows to act on the result of merging multiple rows.
+ *
+ * More precisely, given N rows and the result of merging them, one can call {@link Rows#diff()}
+ * with a {@code RowDiffListener} and that listener will be informed for each input row of the diff between
+ * that input and merge row.
+ */
+public interface RowDiffListener
+{
+    /**
+     * Called for the row primary key liveness info of input {@code i}.
+     *
+     * @param i the input row from which {@code original} is from.
+     * @param clustering the clustering for the row that is merged.
+     * @param merged the primary key liveness info of the merged row. Will be {@code null} if input {@code i} had
+     * a {@code LivenessInfo}, but the merged result don't (i.e. the original info has been shadowed/deleted).
+     * @param original the primary key liveness info of input {@code i}. May be {@code null} if input {@code i}
+     * has not primary key liveness info (i.e. it has {@code LivenessInfo.NONE}) but the merged result has.
+     */
+    public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original);
+
+    /**
+     * Called for the row deletion of input {@code i}.
+     *
+     * @param i the input row from which {@code original} is from.
+     * @param clustering the clustering for the row that is merged.
+     * @param merged the deletion of the merged row. Will be {@code null} if input {@code i} had deletion
+     * but the merged result doesn't (i.e. the deletion has been shadowed).
+     * @param original the deletion of input {@code i}. May be {@code null} if input {@code i} had no deletion but the merged row has.
+     */
+    public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original);
+
+    /**
+     * Called for every (non-live) complex deletion of any complex column present in either the merged row of input {@code i}.
+     *
+     * @param i the input row from which {@code original} is from.
+     * @param clustering the clustering for the row that is merged.
+     * @param column the column for which this is a complex deletion of.
+     * @param merged the complex deletion of the merged row. Will be {@code null} if input {@code i} had a complex deletion
+     * for {@code column} but the merged result doesn't (i.e. the deletion has been shadowed).
+     * @param original the complex deletion of input {@code i} for column {@code column}. May be {@code null} if input {@code i}
+     * had no complex deletion but the merged row has.
+     */
+    public void onComplexDeletion(int i, Clustering clustering, ColumnDefinition column, DeletionTime merged, DeletionTime original);
+
+    /**
+     * Called for any cell that is either in the merged row or in input {@code i}.
+     *
+     * @param i the input row from which {@code original} is from.
+     * @param clustering the clustering for the row that is merged.
+     * @param merged the cell of the merged row. Will be {@code null} if input {@code i} had a cell but that cell is no present
+     * in the merged result (it has been deleted/shadowed).
+     * @param original the cell of input {@code i}. May be {@code null} if input {@code i} had cell corresponding to {@code merged}.
+     */
+    public void onCell(int i, Clustering clustering, Cell merged, Cell original);
+}

diff --git a/src/java/org/apache/cassandra/db/rows/RowIterator.java b/src/java/org/apache/cassandra/db/rows/RowIterator.java
new file mode 100644
index 0000000..f0b4499
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RowIterator.java

@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.Iterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+
+/**
+ * An iterator over rows belonging to a partition.
+ *
+ * A RowIterator is an UnfilteredRowIterator to which any deletion information has been
+ * filtered out. As such, all cell of all rows returned by this iterator are,
+ * by definition, live, and hence code using a RowIterator don't have to worry
+ * about tombstones and other deletion information.
+ *
+ * Note that as for UnfilteredRowIterator, the rows returned must be in clustering order (or
+ * reverse clustering order if isReverseOrder is true), and the Row objects returned
+ * by next() are only valid until the next call to hasNext() or next().
+ */
+public interface RowIterator extends BaseRowIterator<Row>
+{
+    /**
+     * Returns whether the provided iterator has no data.
+     */
+    public default boolean isEmpty()
+    {
+        return staticRow().isEmpty() && !hasNext();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/RowIterators.java b/src/java/org/apache/cassandra/db/rows/RowIterators.java
new file mode 100644
index 0000000..ae051c0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/RowIterators.java

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.security.MessageDigest;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * Static methods to work with row iterators.
+ */
+public abstract class RowIterators
+{
+    private static final Logger logger = LoggerFactory.getLogger(RowIterators.class);
+
+    private RowIterators() {}
+
+    public static void digest(RowIterator iterator, MessageDigest digest)
+    {
+        // TODO: we're not computing digest the same way that old nodes. This is
+        // currently ok as this is only used for schema digest and the is no exchange
+        // of schema digest between different versions. If this changes however,
+        // we'll need to agree on a version.
+        digest.update(iterator.partitionKey().getKey().duplicate());
+        iterator.columns().regulars.digest(digest);
+        iterator.columns().statics.digest(digest);
+        FBUtilities.updateWithBoolean(digest, iterator.isReverseOrder());
+        iterator.staticRow().digest(digest);
+
+        while (iterator.hasNext())
+            iterator.next().digest(digest);
+    }
+
+    /**
+     * Wraps the provided iterator so it logs the returned rows for debugging purposes.
+     * <p>
+     * Note that this is only meant for debugging as this can log a very large amount of
+     * logging at INFO.
+     */
+    public static RowIterator loggingIterator(RowIterator iterator, final String id)
+    {
+        CFMetaData metadata = iterator.metadata();
+        logger.info("[{}] Logging iterator on {}.{}, partition key={}, reversed={}",
+                    id,
+                    metadata.ksName,
+                    metadata.cfName,
+                    metadata.getKeyValidator().getString(iterator.partitionKey().getKey()),
+                    iterator.isReverseOrder());
+
+        class Log extends Transformation
+        {
+            @Override
+            public Row applyToStatic(Row row)
+            {
+                if (!row.isEmpty())
+                    logger.info("[{}] {}", id, row.toString(metadata));
+                return row;
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                logger.info("[{}] {}", id, row.toString(metadata));
+                return row;
+            }
+        }
+        return Transformation.apply(iterator, new Log());
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/Rows.java b/src/java/org/apache/cassandra/db/rows/Rows.java
new file mode 100644
index 0000000..09213a4
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/Rows.java

@@ -0,0 +1,331 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.*;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.PeekingIterator;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.PartitionStatisticsCollector;
+import org.apache.cassandra.utils.MergeIterator;
+
+/**
+ * Static utilities to work on Row objects.
+ */
+public abstract class Rows
+{
+    private Rows() {}
+
+    public static final Row EMPTY_STATIC_ROW = BTreeRow.emptyRow(Clustering.STATIC_CLUSTERING);
+
+    public static Row.Builder copy(Row row, Row.Builder builder)
+    {
+        builder.newRow(row.clustering());
+        builder.addPrimaryKeyLivenessInfo(row.primaryKeyLivenessInfo());
+        builder.addRowDeletion(row.deletion());
+        for (ColumnData cd : row)
+        {
+            if (cd.column().isSimple())
+            {
+                builder.addCell((Cell)cd);
+            }
+            else
+            {
+                ComplexColumnData complexData = (ComplexColumnData)cd;
+                builder.addComplexDeletion(complexData.column(), complexData.complexDeletion());
+                for (Cell cell : complexData)
+                    builder.addCell(cell);
+            }
+        }
+        return builder;
+    }
+
+    /**
+     * Collect statistics on a given row.
+     *
+     * @param row the row for which to collect stats.
+     * @param collector the stats collector.
+     * @return the total number of cells in {@code row}.
+     */
+    public static int collectStats(Row row, PartitionStatisticsCollector collector)
+    {
+        assert !row.isEmpty();
+
+        collector.update(row.primaryKeyLivenessInfo());
+        collector.update(row.deletion().time());
+
+        int columnCount = 0;
+        int cellCount = 0;
+        for (ColumnData cd : row)
+        {
+            if (cd.column().isSimple())
+            {
+                ++columnCount;
+                ++cellCount;
+                Cells.collectStats((Cell) cd, collector);
+            }
+            else
+            {
+                ComplexColumnData complexData = (ComplexColumnData)cd;
+                collector.update(complexData.complexDeletion());
+                if (complexData.hasCells())
+                {
+                    ++columnCount;
+                    for (Cell cell : complexData)
+                    {
+                        ++cellCount;
+                        Cells.collectStats(cell, collector);
+                    }
+                }
+            }
+
+        }
+        collector.updateColumnSetPerRow(columnCount);
+        return cellCount;
+    }
+
+    /**
+     * Given the result ({@code merged}) of merging multiple {@code inputs}, signals the difference between
+     * each input and {@code merged} to {@code diffListener}.
+     * <p>
+     * Note that this method doesn't only emit cells etc where there's a difference. The listener is informed
+     * of every corresponding entity between the merged and input rows, including those that are equal.
+     *
+     * @param diffListener the listener to which to signal the differences between the inputs and the merged result.
+     * @param merged the result of merging {@code inputs}.
+     * @param inputs the inputs whose merge yielded {@code merged}.
+     */
+    public static void diff(RowDiffListener diffListener, Row merged, Row...inputs)
+    {
+        Clustering clustering = merged.clustering();
+        LivenessInfo mergedInfo = merged.primaryKeyLivenessInfo().isEmpty() ? null : merged.primaryKeyLivenessInfo();
+        Row.Deletion mergedDeletion = merged.deletion().isLive() ? null : merged.deletion();
+        for (int i = 0; i < inputs.length; i++)
+        {
+            Row input = inputs[i];
+            LivenessInfo inputInfo = input == null || input.primaryKeyLivenessInfo().isEmpty() ? null : input.primaryKeyLivenessInfo();
+            Row.Deletion inputDeletion = input == null || input.deletion().isLive() ? null : input.deletion();
+
+            if (mergedInfo != null || inputInfo != null)
+                diffListener.onPrimaryKeyLivenessInfo(i, clustering, mergedInfo, inputInfo);
+            if (mergedDeletion != null || inputDeletion != null)
+                diffListener.onDeletion(i, clustering, mergedDeletion, inputDeletion);
+        }
+
+        List<Iterator<ColumnData>> inputIterators = new ArrayList<>(1 + inputs.length);
+        inputIterators.add(merged.iterator());
+        for (Row row : inputs)
+            inputIterators.add(row == null ? Collections.emptyIterator() : row.iterator());
+
+        Iterator<?> iter = MergeIterator.get(inputIterators, ColumnData.comparator, new MergeIterator.Reducer<ColumnData, Object>()
+        {
+            ColumnData mergedData;
+            ColumnData[] inputDatas = new ColumnData[inputs.length];
+            public void reduce(int idx, ColumnData current)
+            {
+                if (idx == 0)
+                    mergedData = current;
+                else
+                    inputDatas[idx - 1] = current;
+            }
+
+            protected Object getReduced()
+            {
+                for (int i = 0 ; i != inputDatas.length ; i++)
+                {
+                    ColumnData input = inputDatas[i];
+                    if (mergedData != null || input != null)
+                    {
+                        ColumnDefinition column = (mergedData != null ? mergedData : input).column;
+                        if (column.isSimple())
+                        {
+                            diffListener.onCell(i, clustering, (Cell) mergedData, (Cell) input);
+                        }
+                        else
+                        {
+                            ComplexColumnData mergedData = (ComplexColumnData) this.mergedData;
+                            ComplexColumnData inputData = (ComplexColumnData) input;
+                            if (mergedData == null)
+                            {
+                                // Everything in inputData has been shadowed
+                                if (!inputData.complexDeletion().isLive())
+                                    diffListener.onComplexDeletion(i, clustering, column, null, inputData.complexDeletion());
+                                for (Cell inputCell : inputData)
+                                    diffListener.onCell(i, clustering, null, inputCell);
+                            }
+                            else if (inputData == null)
+                            {
+                                // Everything in inputData is new
+                                if (!mergedData.complexDeletion().isLive())
+                                    diffListener.onComplexDeletion(i, clustering, column, mergedData.complexDeletion(), null);
+                                for (Cell mergedCell : mergedData)
+                                    diffListener.onCell(i, clustering, mergedCell, null);
+                            }
+                            else
+                            {
+
+                                if (!mergedData.complexDeletion().isLive() || !inputData.complexDeletion().isLive())
+                                    diffListener.onComplexDeletion(i, clustering, column, mergedData.complexDeletion(), inputData.complexDeletion());
+
+                                PeekingIterator<Cell> mergedCells = Iterators.peekingIterator(mergedData.iterator());
+                                PeekingIterator<Cell> inputCells = Iterators.peekingIterator(inputData.iterator());
+                                while (mergedCells.hasNext() && inputCells.hasNext())
+                                {
+                                    int cmp = column.cellPathComparator().compare(mergedCells.peek().path(), inputCells.peek().path());
+                                    if (cmp == 0)
+                                        diffListener.onCell(i, clustering, mergedCells.next(), inputCells.next());
+                                    else if (cmp < 0)
+                                        diffListener.onCell(i, clustering, mergedCells.next(), null);
+                                    else // cmp > 0
+                                        diffListener.onCell(i, clustering, null, inputCells.next());
+                                }
+                                while (mergedCells.hasNext())
+                                    diffListener.onCell(i, clustering, mergedCells.next(), null);
+                                while (inputCells.hasNext())
+                                    diffListener.onCell(i, clustering, null, inputCells.next());
+                            }
+                        }
+                    }
+
+                }
+                return null;
+            }
+
+            protected void onKeyChange()
+            {
+                mergedData = null;
+                Arrays.fill(inputDatas, null);
+            }
+        });
+
+        while (iter.hasNext())
+            iter.next();
+    }
+
+    public static Row merge(Row row1, Row row2, int nowInSec)
+    {
+        Row.Builder builder = BTreeRow.sortedBuilder();
+        merge(row1, row2, builder, nowInSec);
+        return builder.build();
+    }
+
+    /**
+     * Merges two rows into the given builder, mainly for merging memtable rows. In addition to reconciling the cells
+     * in each row, the liveness info, and deletion times for the row and complex columns are also merged.
+     * <p>
+     * Note that this method assumes that the provided rows can meaningfully be reconciled together. That is,
+     * that the rows share the same clustering value, and belong to the same partition.
+     *
+     * @param existing
+     * @param update
+     * @param builder the row build to which the result of the reconciliation is written.
+     * @param nowInSec the current time in seconds (which plays a role during reconciliation
+     * because deleted cells always have precedence on timestamp equality and deciding if a
+     * cell is a live or not depends on the current time due to expiring cells).
+     *
+     * @return the smallest timestamp delta between corresponding rows from existing and update. A
+     * timestamp delta being computed as the difference between the cells and DeletionTimes from {@code existing}
+     * and those in {@code existing}.
+     */
+    public static long merge(Row existing,
+                             Row update,
+                             Row.Builder builder,
+                             int nowInSec)
+    {
+        Clustering clustering = existing.clustering();
+        builder.newRow(clustering);
+
+        LivenessInfo existingInfo = existing.primaryKeyLivenessInfo();
+        LivenessInfo updateInfo = update.primaryKeyLivenessInfo();
+        LivenessInfo mergedInfo = existingInfo.supersedes(updateInfo) ? existingInfo : updateInfo;
+
+        long timeDelta = Math.abs(existingInfo.timestamp() - mergedInfo.timestamp());
+
+        Row.Deletion rowDeletion = existing.deletion().supersedes(update.deletion()) ? existing.deletion() : update.deletion();
+
+        if (rowDeletion.deletes(mergedInfo))
+            mergedInfo = LivenessInfo.EMPTY;
+        else if (rowDeletion.isShadowedBy(mergedInfo))
+            rowDeletion = Row.Deletion.LIVE;
+
+        builder.addPrimaryKeyLivenessInfo(mergedInfo);
+        builder.addRowDeletion(rowDeletion);
+
+        DeletionTime deletion = rowDeletion.time();
+
+        Iterator<ColumnData> a = existing.iterator();
+        Iterator<ColumnData> b = update.iterator();
+        ColumnData nexta = a.hasNext() ? a.next() : null, nextb = b.hasNext() ? b.next() : null;
+        while (nexta != null | nextb != null)
+        {
+            int comparison = nexta == null ? 1 : nextb == null ? -1 : nexta.column.compareTo(nextb.column);
+            ColumnData cura = comparison <= 0 ? nexta : null;
+            ColumnData curb = comparison >= 0 ? nextb : null;
+            ColumnDefinition column = getColumnDefinition(cura, curb);
+
+            if (column.isSimple())
+            {
+                timeDelta = Math.min(timeDelta, Cells.reconcile((Cell) cura, (Cell) curb, deletion, builder, nowInSec));
+            }
+            else
+            {
+                ComplexColumnData existingData = (ComplexColumnData) cura;
+                ComplexColumnData updateData = (ComplexColumnData) curb;
+
+                DeletionTime existingDt = existingData == null ? DeletionTime.LIVE : existingData.complexDeletion();
+                DeletionTime updateDt = updateData == null ? DeletionTime.LIVE : updateData.complexDeletion();
+                DeletionTime maxDt = existingDt.supersedes(updateDt) ? existingDt : updateDt;
+                if (maxDt.supersedes(deletion))
+                    builder.addComplexDeletion(column, maxDt);
+                else
+                    maxDt = deletion;
+
+                Iterator<Cell> existingCells = existingData == null ? null : existingData.iterator();
+                Iterator<Cell> updateCells = updateData == null ? null : updateData.iterator();
+                timeDelta = Math.min(timeDelta, Cells.reconcileComplex(column, existingCells, updateCells, maxDt, builder, nowInSec));
+            }
+
+            if (cura != null)
+                nexta = a.hasNext() ? a.next() : null;
+            if (curb != null)
+                nextb = b.hasNext() ? b.next() : null;
+        }
+        return timeDelta;
+    }
+
+    /**
+     * Returns the {@code ColumnDefinition} to use for merging the columns.
+     * If the 2 column definitions are different the latest one will be returned.
+     */
+    private static ColumnDefinition getColumnDefinition(ColumnData cura, ColumnData curb)
+    {
+        if (cura == null)
+            return curb.column;
+
+        if (curb == null)
+            return cura.column;
+
+        if (AbstractTypeVersionComparator.INSTANCE.compare(cura.column.type, curb.column.type) >= 0)
+            return cura.column;
+
+        return curb.column;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/SerializationHelper.java b/src/java/org/apache/cassandra/db/rows/SerializationHelper.java
new file mode 100644
index 0000000..6b4bc2e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/SerializationHelper.java

@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.filter.ColumnFilter;
+
+public class SerializationHelper
+{
+    /**
+     * Flag affecting deserialization behavior (this only affect counters in practice).
+     *  - LOCAL: for deserialization of local data (Expired columns are
+     *      converted to tombstones (to gain disk space)).
+     *  - FROM_REMOTE: for deserialization of data received from remote hosts
+     *      (Expired columns are converted to tombstone and counters have
+     *      their delta cleared)
+     *  - PRESERVE_SIZE: used when no transformation must be performed, i.e,
+     *      when we must ensure that deserializing and reserializing the
+     *      result yield the exact same bytes. Streaming uses this.
+     */
+    public enum Flag
+    {
+        LOCAL, FROM_REMOTE, PRESERVE_SIZE
+    }
+
+    private final Flag flag;
+    public final int version;
+
+    private final ColumnFilter columnsToFetch;
+    private ColumnFilter.Tester tester;
+
+    private final Map<ByteBuffer, CFMetaData.DroppedColumn> droppedColumns;
+    private CFMetaData.DroppedColumn currentDroppedComplex;
+
+
+    public SerializationHelper(CFMetaData metadata, int version, Flag flag, ColumnFilter columnsToFetch)
+    {
+        this.flag = flag;
+        this.version = version;
+        this.columnsToFetch = columnsToFetch;
+        this.droppedColumns = metadata.getDroppedColumns();
+    }
+
+    public SerializationHelper(CFMetaData metadata, int version, Flag flag)
+    {
+        this(metadata, version, flag, null);
+    }
+
+    public Columns fetchedStaticColumns(SerializationHeader header)
+    {
+        return columnsToFetch == null ? header.columns().statics : columnsToFetch.fetchedColumns().statics;
+    }
+
+    public Columns fetchedRegularColumns(SerializationHeader header)
+    {
+        return columnsToFetch == null ? header.columns().regulars : columnsToFetch.fetchedColumns().regulars;
+    }
+
+    public boolean includes(ColumnDefinition column)
+    {
+        return columnsToFetch == null || columnsToFetch.includes(column);
+    }
+
+    public boolean includes(CellPath path)
+    {
+        return path == null || tester == null || tester.includes(path);
+    }
+
+    public boolean canSkipValue(ColumnDefinition column)
+    {
+        return columnsToFetch != null && columnsToFetch.canSkipValue(column);
+    }
+
+    public boolean canSkipValue(CellPath path)
+    {
+        return path != null && tester != null && tester.canSkipValue(path);
+    }
+
+    public void startOfComplexColumn(ColumnDefinition column)
+    {
+        this.tester = columnsToFetch == null ? null : columnsToFetch.newTester(column);
+        this.currentDroppedComplex = droppedColumns.get(column.name.bytes);
+    }
+
+    public void endOfComplexColumn()
+    {
+        this.tester = null;
+    }
+
+    public boolean isDropped(Cell cell, boolean isComplex)
+    {
+        CFMetaData.DroppedColumn dropped = isComplex ? currentDroppedComplex : droppedColumns.get(cell.column().name.bytes);
+        return dropped != null && cell.timestamp() <= dropped.droppedTime;
+    }
+
+    public boolean isDroppedComplexDeletion(DeletionTime complexDeletion)
+    {
+        return currentDroppedComplex != null && complexDeletion.markedForDeleteAt() <= currentDroppedComplex.droppedTime;
+    }
+
+    public ByteBuffer maybeClearCounterValue(ByteBuffer value)
+    {
+        return flag == Flag.FROM_REMOTE || (flag == Flag.LOCAL && CounterContext.instance().shouldClearLocal(value))
+             ? CounterContext.instance().clearAllLocal(value)
+             : value;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/SliceableUnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/SliceableUnfilteredRowIterator.java
new file mode 100644
index 0000000..2250ee9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/SliceableUnfilteredRowIterator.java

@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.Iterator;
+
+import org.apache.cassandra.db.Slice;
+
+public interface SliceableUnfilteredRowIterator extends UnfilteredRowIterator
+{
+    /**
+     * Move forward (resp. backward if isReverseOrder() is true for the iterator) in
+     * the iterator and return an iterator over the Unfiltered selected by the provided
+     * {@code slice}.
+     * <p>
+     * Please note that successive calls to {@code slice} are allowed provided the
+     * slice are non overlapping and are passed in clustering (resp. reverse clustering) order.
+     * However, {@code slice} is allowed to leave the iterator in an unknown state and there
+     * is no guarantee over what a call to {@code hasNext} or {@code next} will yield after
+     * a call to {@code slice}. In other words, for a given iterator, you should either use
+     * {@code slice} or {@code hasNext/next} but not both.
+     */
+    public Iterator<Unfiltered> slice(Slice slice);
+}

diff --git a/src/java/org/apache/cassandra/db/rows/Unfiltered.java b/src/java/org/apache/cassandra/db/rows/Unfiltered.java
new file mode 100644
index 0000000..9511eeb
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/Unfiltered.java

@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.security.MessageDigest;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Clusterable;
+
+/**
+ * Unfiltered is the common class for the main constituent of an unfiltered partition.
+ * <p>
+ * In practice, an Unfiltered is either a row or a range tombstone marker. Unfiltereds
+ * are uniquely identified by their clustering information and can be sorted according
+ * to those.
+ */
+public interface Unfiltered extends Clusterable
+{
+    public enum Kind { ROW, RANGE_TOMBSTONE_MARKER };
+
+    /**
+     * The kind of the atom: either row or range tombstone marker.
+     */
+    public Kind kind();
+
+    /**
+     * Digest the atom using the provided {@code MessageDigest}.
+     *
+     * @param digest the {@code MessageDigest} to use.
+     */
+    public void digest(MessageDigest digest);
+
+    /**
+     * Validate the data of this atom.
+     *
+     * @param metadata the metadata for the table this atom is part of.
+     * @throws MarshalException if some of the data in this atom is
+     * invalid (some value is invalid for its column type, or some field
+     * is nonsensical).
+     */
+    public void validateData(CFMetaData metadata);
+
+    public boolean isEmpty();
+
+    public String toString(CFMetaData metadata);
+    public String toString(CFMetaData metadata, boolean fullDetails);
+    public String toString(CFMetaData metadata, boolean includeClusterKeys, boolean fullDetails);
+
+    default boolean isRow()
+    {
+        return kind() == Kind.ROW;
+    }
+
+    default boolean isRangeTombstoneMarker()
+    {
+        return kind() == Kind.RANGE_TOMBSTONE_MARKER;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java
new file mode 100644
index 0000000..a969858
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterator.java

@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.Iterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+
+/**
+ * An iterator over the rows of a given partition that also includes deletion informations.
+ * <p>
+ * An {@code UnfilteredRowIterator} contains a few partition top-level informations and is an
+ * iterator of {@code Unfiltered}, that is of either {@code Row} or {@code RangeTombstoneMarker}.
+ * An implementation of {@code UnfilteredRowIterator} <b>must</b> provide the following
+ * guarantees:
+ *   1. the returned {@code Unfiltered} must be in clustering order, or in reverse clustering
+ *      order iff {@link #isReverseOrder} returns true.
+ *   2. the iterator should not shadow its own data. That is, no deletion
+ *      (partition level deletion, row deletion, range tombstone, complex
+ *      deletion) should delete anything else returned by the iterator (cell, row, ...).
+ *   3. every "start" range tombstone marker should have a corresponding "end" marker, and no other
+ *      marker should be in-between this start-end pair of marker. Note that due to the
+ *      previous rule this means that between a "start" and a corresponding "end" marker there
+ *      can only be rows that are not deleted by the markers. Also note that when iterating
+ *      in reverse order, "end" markers are returned before their "start" counterpart (i.e.
+ *      "start" and "end" are always in the sense of the clustering order).
+ *
+ * Note further that the objects returned by next() are only valid until the
+ * next call to hasNext() or next(). If a consumer wants to keep a reference on
+ * the returned objects for longer than the iteration, it must make a copy of
+ * it explicitly.
+ */
+public interface UnfilteredRowIterator extends BaseRowIterator<Unfiltered>
+{
+    /**
+     * The partition level deletion for the partition this iterate over.
+     */
+    public DeletionTime partitionLevelDeletion();
+
+    /**
+     * Return "statistics" about what is returned by this iterator. Those are used for
+     * performance reasons (for delta-encoding for instance) and code should not
+     * expect those to be exact.
+     */
+    public EncodingStats stats();
+
+    /**
+     * Returns whether this iterator has no data (including no deletion data).
+     */
+    public default boolean isEmpty()
+    {
+        return partitionLevelDeletion().isLive()
+            && staticRow().isEmpty()
+            && !hasNext();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java
new file mode 100644
index 0000000..932ca4c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java

@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.io.IOException;
+import java.io.IOError;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Serialize/Deserialize an unfiltered row iterator.
+ *
+ * The serialization is composed of a header, follows by the rows and range tombstones of the iterator serialized
+ * until we read the end of the partition (see UnfilteredSerializer for details). The header itself
+ * is:
+ *     <cfid><key><flags><s_header>[<partition_deletion>][<static_row>][<row_estimate>]
+ * where:
+ *     <cfid> is the table cfid.
+ *     <key> is the partition key.
+ *     <flags> contains bit flags. Each flag is set if it's corresponding bit is set. From rightmost
+ *         bit to leftmost one, the flags are:
+ *         - is empty: whether the iterator is empty. If so, nothing follows the <flags>
+ *         - is reversed: whether the iterator is in reversed clustering order
+ *         - has partition deletion: whether or not there is a <partition_deletion> following
+ *         - has static row: whether or not there is a <static_row> following
+ *         - has row estimate: whether or not there is a <row_estimate> following
+ *     <s_header> is the {@code SerializationHeader}. It contains in particular the columns contains in the serialized
+ *         iterator as well as other information necessary to decoding the serialized rows
+ *         (see {@code SerializationHeader.Serializer for details}).
+ *     <partition_deletion> is the deletion time for the partition (delta-encoded)
+ *     <static_row> is the static row for this partition as serialized by UnfilteredSerializer.
+ *     <row_estimate> is the (potentially estimated) number of rows serialized. This is only used for
+ *         the purpose of sizing on the receiving end and should not be relied upon too strongly.
+ *
+ * Please note that the format described above is the on-wire format. On-disk, the format is basically the
+ * same, but the header is written once per sstable, not once per-partition. Further, the actual row and
+ * range tombstones are not written using this class, but rather by {@link ColumnIndex}.
+ */
+public class UnfilteredRowIteratorSerializer
+{
+    protected static final Logger logger = LoggerFactory.getLogger(UnfilteredRowIteratorSerializer.class);
+
+    private static final int IS_EMPTY               = 0x01;
+    private static final int IS_REVERSED            = 0x02;
+    private static final int HAS_PARTITION_DELETION = 0x04;
+    private static final int HAS_STATIC_ROW         = 0x08;
+    private static final int HAS_ROW_ESTIMATE       = 0x10;
+
+    public static final UnfilteredRowIteratorSerializer serializer = new UnfilteredRowIteratorSerializer();
+
+    // Should only be used for the on-wire format.
+    public void serialize(UnfilteredRowIterator iterator, ColumnFilter selection, DataOutputPlus out, int version) throws IOException
+    {
+        serialize(iterator, selection, out, version, -1);
+    }
+
+    // Should only be used for the on-wire format.
+    public void serialize(UnfilteredRowIterator iterator, ColumnFilter selection, DataOutputPlus out, int version, int rowEstimate) throws IOException
+    {
+        SerializationHeader header = new SerializationHeader(false,
+                                                             iterator.metadata(),
+                                                             iterator.columns(),
+                                                             iterator.stats());
+        serialize(iterator, header, selection, out, version, rowEstimate);
+    }
+
+    // Should only be used for the on-wire format.
+    public void serialize(UnfilteredRowIterator iterator, SerializationHeader header, ColumnFilter selection, DataOutputPlus out, int version, int rowEstimate) throws IOException
+    {
+        assert !header.isForSSTable();
+
+        ByteBufferUtil.writeWithVIntLength(iterator.partitionKey().getKey(), out);
+
+        int flags = 0;
+        if (iterator.isReverseOrder())
+            flags |= IS_REVERSED;
+
+        if (iterator.isEmpty())
+        {
+            out.writeByte((byte)(flags | IS_EMPTY));
+            return;
+        }
+
+        DeletionTime partitionDeletion = iterator.partitionLevelDeletion();
+        if (!partitionDeletion.isLive())
+            flags |= HAS_PARTITION_DELETION;
+        Row staticRow = iterator.staticRow();
+        boolean hasStatic = staticRow != Rows.EMPTY_STATIC_ROW;
+        if (hasStatic)
+            flags |= HAS_STATIC_ROW;
+
+        if (rowEstimate >= 0)
+            flags |= HAS_ROW_ESTIMATE;
+
+        out.writeByte((byte)flags);
+
+        SerializationHeader.serializer.serializeForMessaging(header, selection, out, hasStatic);
+
+        if (!partitionDeletion.isLive())
+            header.writeDeletionTime(partitionDeletion, out);
+
+        if (hasStatic)
+            UnfilteredSerializer.serializer.serialize(staticRow, header, out, version);
+
+        if (rowEstimate >= 0)
+            out.writeUnsignedVInt(rowEstimate);
+
+        while (iterator.hasNext())
+            UnfilteredSerializer.serializer.serialize(iterator.next(), header, out, version);
+        UnfilteredSerializer.serializer.writeEndOfPartition(out);
+    }
+
+    // Please note that this consume the iterator, and as such should not be called unless we have a simple way to
+    // recreate an iterator for both serialize and serializedSize, which is mostly only PartitionUpdate/ArrayBackedCachedPartition.
+    public long serializedSize(UnfilteredRowIterator iterator, ColumnFilter selection, int version, int rowEstimate)
+    {
+        SerializationHeader header = new SerializationHeader(false,
+                                                             iterator.metadata(),
+                                                             iterator.columns(),
+                                                             iterator.stats());
+
+        assert rowEstimate >= 0;
+
+        long size = ByteBufferUtil.serializedSizeWithVIntLength(iterator.partitionKey().getKey())
+                  + 1; // flags
+
+        if (iterator.isEmpty())
+            return size;
+
+        DeletionTime partitionDeletion = iterator.partitionLevelDeletion();
+        Row staticRow = iterator.staticRow();
+        boolean hasStatic = staticRow != Rows.EMPTY_STATIC_ROW;
+
+        size += SerializationHeader.serializer.serializedSizeForMessaging(header, selection, hasStatic);
+
+        if (!partitionDeletion.isLive())
+            size += header.deletionTimeSerializedSize(partitionDeletion);
+
+        if (hasStatic)
+            size += UnfilteredSerializer.serializer.serializedSize(staticRow, header, version);
+
+        if (rowEstimate >= 0)
+            size += TypeSizes.sizeofUnsignedVInt(rowEstimate);
+
+        while (iterator.hasNext())
+            size += UnfilteredSerializer.serializer.serializedSize(iterator.next(), header, version);
+        size += UnfilteredSerializer.serializer.serializedSizeEndOfPartition();
+
+        return size;
+    }
+
+    public Header deserializeHeader(CFMetaData metadata, ColumnFilter selection, DataInputPlus in, int version, SerializationHelper.Flag flag) throws IOException
+    {
+        DecoratedKey key = metadata.decorateKey(ByteBufferUtil.readWithVIntLength(in));
+        int flags = in.readUnsignedByte();
+        boolean isReversed = (flags & IS_REVERSED) != 0;
+        if ((flags & IS_EMPTY) != 0)
+        {
+            SerializationHeader sh = new SerializationHeader(false, metadata, PartitionColumns.NONE, EncodingStats.NO_STATS);
+            return new Header(sh, key, isReversed, true, null, null, 0);
+        }
+
+        boolean hasPartitionDeletion = (flags & HAS_PARTITION_DELETION) != 0;
+        boolean hasStatic = (flags & HAS_STATIC_ROW) != 0;
+        boolean hasRowEstimate = (flags & HAS_ROW_ESTIMATE) != 0;
+
+        SerializationHeader header = SerializationHeader.serializer.deserializeForMessaging(in, metadata, selection, hasStatic);
+
+        DeletionTime partitionDeletion = hasPartitionDeletion ? header.readDeletionTime(in) : DeletionTime.LIVE;
+
+        Row staticRow = Rows.EMPTY_STATIC_ROW;
+        if (hasStatic)
+            staticRow = UnfilteredSerializer.serializer.deserializeStaticRow(in, header, new SerializationHelper(metadata, version, flag));
+
+        int rowEstimate = hasRowEstimate ? (int)in.readUnsignedVInt() : -1;
+        return new Header(header, key, isReversed, false, partitionDeletion, staticRow, rowEstimate);
+    }
+
+    public UnfilteredRowIterator deserialize(DataInputPlus in, int version, CFMetaData metadata, SerializationHelper.Flag flag, Header header) throws IOException
+    {
+        if (header.isEmpty)
+            return EmptyIterators.unfilteredRow(metadata, header.key, header.isReversed);
+
+        final SerializationHelper helper = new SerializationHelper(metadata, version, flag);
+        final SerializationHeader sHeader = header.sHeader;
+        return new AbstractUnfilteredRowIterator(metadata, header.key, header.partitionDeletion, sHeader.columns(), header.staticRow, header.isReversed, sHeader.stats())
+        {
+            private final Row.Builder builder = BTreeRow.sortedBuilder();
+
+            protected Unfiltered computeNext()
+            {
+                try
+                {
+                    Unfiltered unfiltered = UnfilteredSerializer.serializer.deserialize(in, sHeader, helper, builder);
+                    return unfiltered == null ? endOfData() : unfiltered;
+                }
+                catch (IOException e)
+                {
+                    throw new IOError(e);
+                }
+            }
+        };
+    }
+
+    public UnfilteredRowIterator deserialize(DataInputPlus in, int version, CFMetaData metadata, ColumnFilter selection, SerializationHelper.Flag flag) throws IOException
+    {
+        return deserialize(in, version, metadata, flag, deserializeHeader(metadata, selection, in, version, flag));
+    }
+
+    public static class Header
+    {
+        public final SerializationHeader sHeader;
+        public final DecoratedKey key;
+        public final boolean isReversed;
+        public final boolean isEmpty;
+        public final DeletionTime partitionDeletion;
+        public final Row staticRow;
+        public final int rowEstimate; // -1 if no estimate
+
+        private Header(SerializationHeader sHeader,
+                       DecoratedKey key,
+                       boolean isReversed,
+                       boolean isEmpty,
+                       DeletionTime partitionDeletion,
+                       Row staticRow,
+                       int rowEstimate)
+        {
+            this.sHeader = sHeader;
+            this.key = key;
+            this.isReversed = isReversed;
+            this.isEmpty = isEmpty;
+            this.partitionDeletion = partitionDeletion;
+            this.staticRow = staticRow;
+            this.rowEstimate = rowEstimate;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("{header=%s, key=%s, isReversed=%b, isEmpty=%b, del=%s, staticRow=%s, rowEstimate=%d}",
+                                 sHeader, key, isReversed, isEmpty, partitionDeletion, staticRow, rowEstimate);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java
new file mode 100644
index 0000000..b6dbf82
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java

@@ -0,0 +1,625 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.util.*;
+import java.security.MessageDigest;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.transform.FilteredRows;
+import org.apache.cassandra.db.transform.MoreRows;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.IMergeIterator;
+import org.apache.cassandra.utils.MergeIterator;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+
+/**
+ * Static methods to work with atom iterators.
+ */
+public abstract class UnfilteredRowIterators
+{
+    private static final Logger logger = LoggerFactory.getLogger(UnfilteredRowIterators.class);
+
+    private UnfilteredRowIterators() {}
+
+    /**
+     * Interface for a listener interested in the result of merging multiple versions of a given row.
+     * <p>
+     * Implementors of this interface are given enough information that they can easily reconstruct the difference
+     * between the merged result and each individual input. This is used when reconciling results on replias for
+     * instance to figure out what to send as read-repair to each source.
+     */
+    public interface MergeListener
+    {
+        /**
+         * Called once for the merged partition.
+         *
+         * @param mergedDeletion the partition level deletion for the merged partition. Implementors can test if the
+         * merged partition actually has a partition level deletion or not by calling {@code mergedDeletion.isLive()}.
+         * @param versions the partition level deletion for the sources of the merge. Elements of the array will never
+         * be null, but be "live".
+         **/
+        public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, DeletionTime[] versions);
+
+        /**
+         * Called once for every row participating in the merge.
+         * <p>
+         * Note that this is called for every clustering where at least one of the source merged has a row. In
+         * particular, this may be called in cases where there is no row in the merged output (if a source has a row
+         * that is shadowed by another source range tombstone or partition level deletion).
+         *
+         * @param merged the result of the merge. This cannot be {@code null} (so that listener can always access the
+         * clustering from this safely)but can be empty, in which case this is a placeholder for when at least one
+         * source has a row, but that row is shadowed in the merged output.
+         * @param versions for each source, the row in that source corresponding to {@code merged}. This can be
+         * {@code null} for some sources if the source has not such row.
+         * @return the row to use as result of the merge (can be {@code null}). Most implementations should simply
+         * return {@code merged}, but this allows some implementations to impact the merge result if necessary. If this
+         * returns either {@code null} or an empty row, then the row is skipped from the merge result. If this returns a
+         * non {@code null} result, then the returned row <b>must</b> have the same clustering than {@code merged}.
+         */
+        public Row onMergedRows(Row merged, Row[] versions);
+
+        /**
+         * Called once for every range tombstone marker participating in the merge.
+         * <p>
+         * Note that this is called for every "clustering position" where at least one of the source merged has a range
+         * tombstone marker.
+         *
+         * @param merged the marker in the merged output. This can be {@code null} if there is no such marker, which
+         * means that at least one source has a marker in {@code versions} but the merged out has nothing corresponding
+         * (this basically mean the merged output has a currently open deletion that shadows whatever marker the source
+         * had).
+         * @param versions the marker for each source merged. This can be {@code null} for some source if that source
+         * has not such marker.
+         */
+        public void onMergedRangeTombstoneMarkers(RangeTombstoneMarker merged, RangeTombstoneMarker[] versions);
+
+        public void close();
+    }
+
+    /**
+     * Returns a iterator that only returns rows with only live content.
+     *
+     * This is mainly used in the CQL layer when we know we don't care about deletion
+     * infos (and since an UnfilteredRowIterator cannot shadow it's own data, we know everyting
+     * returned isn't shadowed by a tombstone).
+     */
+    public static RowIterator filter(UnfilteredRowIterator iter, int nowInSec)
+    {
+        return FilteredRows.filter(iter, nowInSec);
+    }
+
+    /**
+     * Returns an iterator that is the result of merging other iterators.
+     */
+    public static UnfilteredRowIterator merge(List<UnfilteredRowIterator> iterators, int nowInSec)
+    {
+        assert !iterators.isEmpty();
+        if (iterators.size() == 1)
+            return iterators.get(0);
+
+        return UnfilteredRowMergeIterator.create(iterators, nowInSec, null);
+    }
+
+    /**
+     * Returns an iterator that is the result of merging other iterators, and (optionally) using
+     * specific MergeListener.
+     *
+     * Note that this method assumes that there is at least 2 iterators to merge.
+     */
+    public static UnfilteredRowIterator merge(List<UnfilteredRowIterator> iterators, int nowInSec, MergeListener mergeListener)
+    {
+        return UnfilteredRowMergeIterator.create(iterators, nowInSec, mergeListener);
+    }
+
+    /**
+     * Returns an empty unfiltered iterator for a given partition.
+     */
+    public static UnfilteredRowIterator noRowsIterator(final CFMetaData cfm, final DecoratedKey partitionKey, final Row staticRow, final DeletionTime partitionDeletion, final boolean isReverseOrder)
+    {
+        return EmptyIterators.unfilteredRow(cfm, partitionKey, isReverseOrder, staticRow, partitionDeletion);
+    }
+
+    public static UnfilteredRowIterator singleton(Unfiltered unfiltered,
+                                                  CFMetaData metadata,
+                                                  DecoratedKey partitionKey,
+                                                  DeletionTime partitionLevelDeletion,
+                                                  PartitionColumns columns,
+                                                  Row staticRow,
+                                                  boolean isReverseOrder,
+                                                  EncodingStats encodingStats)
+    {
+        return new AbstractUnfilteredRowIterator(metadata, partitionKey, partitionLevelDeletion, columns, staticRow, isReverseOrder, encodingStats)
+        {
+            boolean isDone = false;
+
+            protected Unfiltered computeNext()
+            {
+                if (!isDone)
+                {
+                    isDone = true;
+                    return unfiltered;
+                }
+
+                return endOfData();
+            }
+        };
+    }
+
+    /**
+     * Digests the partition represented by the provided iterator.
+     *
+     * @param command the command that has yield {@code iterator}. This can be null if {@code version >= MessagingService.VERSION_30}
+     * as this is only used when producing digest to be sent to legacy nodes.
+     * @param iterator the iterator to digest.
+     * @param digest the {@code MessageDigest} to use for the digest.
+     * @param version the messaging protocol to use when producing the digest.
+     */
+    public static void digest(ReadCommand command, UnfilteredRowIterator iterator, MessageDigest digest, int version)
+    {
+        if (version < MessagingService.VERSION_30)
+        {
+            LegacyLayout.fromUnfilteredRowIterator(command, iterator).digest(iterator.metadata(), digest);
+            return;
+        }
+
+        digest.update(iterator.partitionKey().getKey().duplicate());
+        iterator.partitionLevelDeletion().digest(digest);
+        iterator.columns().regulars.digest(digest);
+        // When serializing an iterator, we skip the static columns if the iterator has not static row, even if the
+        // columns() object itself has some (the columns() is a superset of what the iterator actually contains, and
+        // will correspond to the queried columns pre-serialization). So we must avoid taking the satic column names
+        // into account if there is no static row or we'd have a digest mismatch between depending on whether the digest
+        // is computed on an iterator that has been serialized or not (see CASSANDRA-12090)
+        // TODO: in practice we could completely skip digesting the columns since they are more informative of what the
+        // iterator may contain, and digesting the actual content is enough. And in fact, that would be more correct
+        // (since again, the columns could be different without the information represented by the iterator being
+        // different), but removing them entirely is stricly speaking a breaking change (it would create mismatches on
+        // upgrade) so we can only do on the next protocol version bump.
+        if (iterator.staticRow() != Rows.EMPTY_STATIC_ROW)
+            iterator.columns().statics.digest(digest);
+        FBUtilities.updateWithBoolean(digest, iterator.isReverseOrder());
+        iterator.staticRow().digest(digest);
+
+        while (iterator.hasNext())
+        {
+            Unfiltered unfiltered = iterator.next();
+            unfiltered.digest(digest);
+        }
+    }
+
+    /**
+     * Returns an iterator that concatenate two atom iterators.
+     * This method assumes that both iterator are from the same partition and that the atom from
+     * {@code iter2} come after the ones of {@code iter1} (that is, that concatenating the iterator
+     * make sense).
+     */
+    public static UnfilteredRowIterator concat(final UnfilteredRowIterator iter1, final UnfilteredRowIterator iter2)
+    {
+        assert iter1.metadata().cfId.equals(iter2.metadata().cfId)
+            && iter1.partitionKey().equals(iter2.partitionKey())
+            && iter1.partitionLevelDeletion().equals(iter2.partitionLevelDeletion())
+            && iter1.isReverseOrder() == iter2.isReverseOrder()
+            && iter1.staticRow().equals(iter2.staticRow());
+
+        class Extend implements MoreRows<UnfilteredRowIterator>
+        {
+            boolean returned = false;
+            public UnfilteredRowIterator moreContents()
+            {
+                if (returned)
+                    return null;
+                returned = true;
+                return iter2;
+            }
+        }
+
+        return MoreRows.extend(iter1, new Extend(), iter1.columns().mergeTo(iter2.columns()));
+    }
+
+    /**
+     * Returns an iterator that concatenate the specified atom with the iterator.
+     */
+    public static UnfilteredRowIterator concat(final Unfiltered first, final UnfilteredRowIterator rest)
+    {
+        return new WrappingUnfilteredRowIterator(rest)
+        {
+            private boolean hasReturnedFirst;
+
+            @Override
+            public boolean hasNext()
+            {
+                return hasReturnedFirst ? super.hasNext() : true;
+            }
+
+            @Override
+            public Unfiltered next()
+            {
+                if (!hasReturnedFirst)
+                {
+                    hasReturnedFirst = true;
+                    return first;
+                }
+                return super.next();
+            }
+        };
+    }
+
+    public static UnfilteredRowIterator cloningIterator(UnfilteredRowIterator iterator, final AbstractAllocator allocator)
+    {
+        class Cloner extends Transformation
+        {
+            private final Row.Builder builder = allocator.cloningBTreeRowBuilder();
+
+            public Row applyToStatic(Row row)
+            {
+                return Rows.copy(row, builder).build();
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                return Rows.copy(row, builder).build();
+            }
+
+            @Override
+            public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+            {
+                return marker.copy(allocator);
+            }
+        }
+        return Transformation.apply(iterator, new Cloner());
+    }
+
+    /**
+     * Validate that the data of the provided iterator is valid, that is that the values
+     * it contains are valid for the type they represent, and more generally that the
+     * infos stored are sensible.
+     *
+     * This is mainly used by scrubber to detect problems in sstables.
+     *
+     * @param iterator the partition to check.
+     * @param filename the name of the file the data is comming from.
+     * @return an iterator that returns the same data than {@code iterator} but that
+     * checks said data and throws a {@code CorruptedSSTableException} if it detects
+     * invalid data.
+     */
+    public static UnfilteredRowIterator withValidation(UnfilteredRowIterator iterator, final String filename)
+    {
+        class Validator extends Transformation
+        {
+            @Override
+            public Row applyToStatic(Row row)
+            {
+                validate(row);
+                return row;
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                validate(row);
+                return row;
+            }
+
+            @Override
+            public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+            {
+                validate(marker);
+                return marker;
+            }
+
+            private void validate(Unfiltered unfiltered)
+            {
+                try
+                {
+                    unfiltered.validateData(iterator.metadata());
+                }
+                catch (MarshalException me)
+                {
+                    throw new CorruptSSTableException(me, filename);
+                }
+            }
+        }
+        return Transformation.apply(iterator, new Validator());
+    }
+
+    /**
+     * Wraps the provided iterator so it logs the returned atoms for debugging purposes.
+     * <p>
+     * Note that this is only meant for debugging as this can log a very large amount of
+     * logging at INFO.
+     */
+    public static UnfilteredRowIterator loggingIterator(UnfilteredRowIterator iterator, final String id, final boolean fullDetails)
+    {
+        CFMetaData metadata = iterator.metadata();
+        logger.info("[{}] Logging iterator on {}.{}, partition key={}, reversed={}, deletion={}",
+                    id,
+                    metadata.ksName,
+                    metadata.cfName,
+                    metadata.getKeyValidator().getString(iterator.partitionKey().getKey()),
+                    iterator.isReverseOrder(),
+                    iterator.partitionLevelDeletion().markedForDeleteAt());
+
+        class Logger extends Transformation
+        {
+            @Override
+            public Row applyToStatic(Row row)
+            {
+                if (!row.isEmpty())
+                    logger.info("[{}] {}", id, row.toString(metadata, fullDetails));
+                return row;
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                logger.info("[{}] {}", id, row.toString(metadata, fullDetails));
+                return row;
+            }
+
+            @Override
+            public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+            {
+                logger.info("[{}] {}", id, marker.toString(metadata));
+                return marker;
+            }
+        }
+        return Transformation.apply(iterator, new Logger());
+    }
+
+    /**
+     * A wrapper over MergeIterator to implement the UnfilteredRowIterator interface.
+     */
+    private static class UnfilteredRowMergeIterator extends AbstractUnfilteredRowIterator
+    {
+        private final IMergeIterator<Unfiltered, Unfiltered> mergeIterator;
+        private final MergeListener listener;
+
+        private UnfilteredRowMergeIterator(CFMetaData metadata,
+                                           List<UnfilteredRowIterator> iterators,
+                                           PartitionColumns columns,
+                                           DeletionTime partitionDeletion,
+                                           int nowInSec,
+                                           boolean reversed,
+                                           MergeListener listener)
+        {
+            super(metadata,
+                  iterators.get(0).partitionKey(),
+                  partitionDeletion,
+                  columns,
+                  mergeStaticRows(iterators, columns.statics, nowInSec, listener, partitionDeletion),
+                  reversed,
+                  mergeStats(iterators));
+
+            this.mergeIterator = MergeIterator.get(iterators,
+                                                   reversed ? metadata.comparator.reversed() : metadata.comparator,
+                                                   new MergeReducer(iterators.size(), reversed, nowInSec, listener));
+            this.listener = listener;
+        }
+
+        private static UnfilteredRowMergeIterator create(List<UnfilteredRowIterator> iterators, int nowInSec, MergeListener listener)
+        {
+            try
+            {
+                checkForInvalidInput(iterators);
+                return new UnfilteredRowMergeIterator(iterators.get(0).metadata(),
+                                                      iterators,
+                                                      collectColumns(iterators),
+                                                      collectPartitionLevelDeletion(iterators, listener),
+                                                      nowInSec,
+                                                      iterators.get(0).isReverseOrder(),
+                                                      listener);
+            }
+            catch (RuntimeException | Error e)
+            {
+                try
+                {
+                    FBUtilities.closeAll(iterators);
+                }
+                catch (Exception suppressed)
+                {
+                    e.addSuppressed(suppressed);
+                }
+                throw e;
+            }
+        }
+
+        @SuppressWarnings("resource") // We're not really creating any resource here
+        private static void checkForInvalidInput(List<UnfilteredRowIterator> iterators)
+        {
+            if (iterators.isEmpty())
+                return;
+
+            UnfilteredRowIterator first = iterators.get(0);
+            for (int i = 1; i < iterators.size(); i++)
+            {
+                UnfilteredRowIterator iter = iterators.get(i);
+                assert first.metadata().cfId.equals(iter.metadata().cfId);
+                assert first.partitionKey().equals(iter.partitionKey());
+                assert first.isReverseOrder() == iter.isReverseOrder();
+            }
+        }
+
+        @SuppressWarnings("resource") // We're not really creating any resource here
+        private static DeletionTime collectPartitionLevelDeletion(List<UnfilteredRowIterator> iterators, MergeListener listener)
+        {
+            DeletionTime[] versions = listener == null ? null : new DeletionTime[iterators.size()];
+
+            DeletionTime delTime = DeletionTime.LIVE;
+            for (int i = 0; i < iterators.size(); i++)
+            {
+                UnfilteredRowIterator iter = iterators.get(i);
+                DeletionTime iterDeletion = iter.partitionLevelDeletion();
+                if (listener != null)
+                    versions[i] = iterDeletion;
+                if (!delTime.supersedes(iterDeletion))
+                    delTime = iterDeletion;
+            }
+            if (listener != null)
+                listener.onMergedPartitionLevelDeletion(delTime, versions);
+            return delTime;
+        }
+
+        private static Row mergeStaticRows(List<UnfilteredRowIterator> iterators,
+                                           Columns columns,
+                                           int nowInSec,
+                                           MergeListener listener,
+                                           DeletionTime partitionDeletion)
+        {
+            if (columns.isEmpty())
+                return Rows.EMPTY_STATIC_ROW;
+
+            if (iterators.stream().allMatch(iter -> iter.staticRow().isEmpty()))
+                return Rows.EMPTY_STATIC_ROW;
+
+            Row.Merger merger = new Row.Merger(iterators.size(), nowInSec, columns.hasComplex());
+            for (int i = 0; i < iterators.size(); i++)
+                merger.add(i, iterators.get(i).staticRow());
+
+            Row merged = merger.merge(partitionDeletion);
+            if (merged == null)
+                merged = Rows.EMPTY_STATIC_ROW;
+            if (listener == null)
+                return merged;
+
+            merged = listener.onMergedRows(merged, merger.mergedRows());
+            // Note that onMergedRows can have returned null even though his input wasn't null
+            return merged == null ? Rows.EMPTY_STATIC_ROW : merged;
+        }
+
+        private static PartitionColumns collectColumns(List<UnfilteredRowIterator> iterators)
+        {
+            PartitionColumns first = iterators.get(0).columns();
+            Columns statics = first.statics;
+            Columns regulars = first.regulars;
+            for (int i = 1; i < iterators.size(); i++)
+            {
+                PartitionColumns cols = iterators.get(i).columns();
+                statics = statics.mergeTo(cols.statics);
+                regulars = regulars.mergeTo(cols.regulars);
+            }
+            return statics == first.statics && regulars == first.regulars
+                 ? first
+                 : new PartitionColumns(statics, regulars);
+        }
+
+        private static EncodingStats mergeStats(List<UnfilteredRowIterator> iterators)
+        {
+            EncodingStats stats = EncodingStats.NO_STATS;
+            for (UnfilteredRowIterator iter : iterators)
+                stats = stats.mergeWith(iter.stats());
+            return stats;
+        }
+
+        protected Unfiltered computeNext()
+        {
+            while (mergeIterator.hasNext())
+            {
+                Unfiltered merged = mergeIterator.next();
+                if (merged != null)
+                    return merged;
+            }
+            return endOfData();
+        }
+
+        public void close()
+        {
+            // This will close the input iterators
+            FileUtils.closeQuietly(mergeIterator);
+
+            if (listener != null)
+                listener.close();
+        }
+
+        private class MergeReducer extends MergeIterator.Reducer<Unfiltered, Unfiltered>
+        {
+            private final MergeListener listener;
+
+            private Unfiltered.Kind nextKind;
+
+            private final Row.Merger rowMerger;
+            private final RangeTombstoneMarker.Merger markerMerger;
+
+            private MergeReducer(int size, boolean reversed, int nowInSec, MergeListener listener)
+            {
+                this.rowMerger = new Row.Merger(size, nowInSec, columns().regulars.hasComplex());
+                this.markerMerger = new RangeTombstoneMarker.Merger(size, partitionLevelDeletion(), reversed);
+                this.listener = listener;
+            }
+
+            @Override
+            public boolean trivialReduceIsTrivial()
+            {
+                // If we have a listener, we must signal it even when we have a single version
+                return listener == null;
+            }
+
+            public void reduce(int idx, Unfiltered current)
+            {
+                nextKind = current.kind();
+                if (nextKind == Unfiltered.Kind.ROW)
+                    rowMerger.add(idx, (Row)current);
+                else
+                    markerMerger.add(idx, (RangeTombstoneMarker)current);
+            }
+
+            protected Unfiltered getReduced()
+            {
+                if (nextKind == Unfiltered.Kind.ROW)
+                {
+                    Row merged = rowMerger.merge(markerMerger.activeDeletion());
+                    if (listener == null)
+                        return merged;
+
+                    merged = listener.onMergedRows(merged == null
+                                                   ? BTreeRow.emptyRow(rowMerger.mergedClustering())
+                                                   : merged,
+                                                   rowMerger.mergedRows());
+
+                    return merged == null || merged.isEmpty() ? null : merged;
+                }
+                else
+                {
+                    RangeTombstoneMarker merged = markerMerger.merge();
+                    if (listener != null)
+                        listener.onMergedRangeTombstoneMarkers(merged, markerMerger.mergedMarkers());
+                    return merged;
+                }
+            }
+
+            protected void onKeyChange()
+            {
+                if (nextKind == Unfiltered.Kind.ROW)
+                    rowMerger.clear();
+                else
+                    markerMerger.clear();
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java
new file mode 100644
index 0000000..0342e39
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java

@@ -0,0 +1,610 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.io.IOException;
+
+import com.google.common.collect.Collections2;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.SearchIterator;
+
+/**
+ * Serialize/deserialize a single Unfiltered (both on-wire and on-disk).
+ *
+ * The encoded format for an unfiltered is <flags>(<row>|<marker>) where:
+ *
+ *   <flags> is a byte (or two) whose bits are flags used by the rest of the serialization. Each
+ *       flag is defined/explained below as the "Unfiltered flags" constants. One of those flags
+ *       is an extension flag, and if present, trigger the rid of another byte that contains more
+ *       flags. If the extension is not set, defaults are assumed for the flags of that 2nd byte.
+ *   <row> is <clustering><size>[<timestamp>][<ttl>][<deletion>]<sc1>...<sci><cc1>...<ccj> where
+ *       <clustering> is the row clustering as serialized by {@code Clustering.serializer} (note
+ *       that static row are an exception and don't have this).
+ *       <size> is the size of the whole unfiltered on disk (it's only used for sstables and is
+ *       used to efficiently skip rows).
+ *       <timestamp>, <ttl> and <deletion> are the row timestamp, ttl and deletion
+ *       whose presence is determined by the flags. <sci> is the simple columns of the row and <ccj> the
+ *       complex ones.
+ *       The columns for the row are then serialized if they differ from those in the header,
+ *       and each cell then follows:
+ *         * Each simple column <sci> will simply be a <cell>
+ *           (which might have no value, see below),
+ *         * Each <ccj> will be [<delTime>]<n><cell1>...<celln> where <delTime>
+ *           is the deletion for this complex column (if flags indicates it present), <n>
+ *           is the vint encoded value of n, i.e. <celln>'s 1-based index, <celli>
+ *           are the <cell> for this complex column
+ *   <marker> is <bound><deletion> where <bound> is the marker bound as serialized
+ *       by {@code Slice.Bound.serializer} and <deletion> is the marker deletion
+ *       time.
+ *
+ *   <cell> A cell start with a 1 byte <flag>. The 2nd and third flag bits indicate if
+ *       it's a deleted or expiring cell. The 4th flag indicates if the value
+ *       is empty or not. The 5th and 6th indicates if the timestamp and ttl/
+ *       localDeletionTime for the cell are the same than the row one (if that
+ *       is the case, those are not repeated for the cell).Follows the <value>
+ *       (unless it's marked empty in the flag) and a delta-encoded long <timestamp>
+ *       (unless the flag tells to use the row level one).
+ *       Then if it's a deleted or expiring cell a delta-encoded int <localDelTime>
+ *       and if it's expiring a delta-encoded int <ttl> (unless it's an expiring cell
+ *       and the ttl and localDeletionTime are indicated by the flags to be the same
+ *       than the row ones, in which case none of those appears).
+ */
+public class UnfilteredSerializer
+{
+    public static final UnfilteredSerializer serializer = new UnfilteredSerializer();
+
+    /*
+     * Unfiltered flags constants.
+     */
+    private final static int END_OF_PARTITION     = 0x01; // Signal the end of the partition. Nothing follows a <flags> field with that flag.
+    private final static int IS_MARKER            = 0x02; // Whether the encoded unfiltered is a marker or a row. All following markers applies only to rows.
+    private final static int HAS_TIMESTAMP        = 0x04; // Whether the encoded row has a timestamp (i.e. if row.partitionKeyLivenessInfo().hasTimestamp() == true).
+    private final static int HAS_TTL              = 0x08; // Whether the encoded row has some expiration info (i.e. if row.partitionKeyLivenessInfo().hasTTL() == true).
+    private final static int HAS_DELETION         = 0x10; // Whether the encoded row has some deletion info.
+    private final static int HAS_ALL_COLUMNS      = 0x20; // Whether the encoded row has all of the columns from the header present.
+    private final static int HAS_COMPLEX_DELETION = 0x40; // Whether the encoded row has some complex deletion for at least one of its columns.
+    private final static int EXTENSION_FLAG       = 0x80; // If present, another byte is read containing the "extended flags" above.
+
+    /*
+     * Extended flags
+     */
+    private final static int IS_STATIC               = 0x01; // Whether the encoded row is a static. If there is no extended flag, the row is assumed not static.
+    /**
+     * A shadowable tombstone cannot replace a previous row deletion otherwise it could resurrect a
+     * previously deleted cell not updated by a subsequent update, SEE CASSANDRA-11500
+     */
+    @Deprecated
+    private final static int HAS_SHADOWABLE_DELETION = 0x02; // Whether the row deletion is shadowable. If there is no extended flag (or no row deletion), the deletion is assumed not shadowable.
+
+    public void serialize(Unfiltered unfiltered, SerializationHeader header, DataOutputPlus out, int version)
+    throws IOException
+    {
+        assert !header.isForSSTable();
+        serialize(unfiltered, header, out, 0, version);
+    }
+
+    public void serialize(Unfiltered unfiltered, SerializationHeader header, DataOutputPlus out, long previousUnfilteredSize, int version)
+    throws IOException
+    {
+        if (unfiltered.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+        {
+            serialize((RangeTombstoneMarker) unfiltered, header, out, previousUnfilteredSize, version);
+        }
+        else
+        {
+            serialize((Row) unfiltered, header, out, previousUnfilteredSize, version);
+        }
+    }
+
+    public void serializeStaticRow(Row row, SerializationHeader header, DataOutputPlus out, int version)
+    throws IOException
+    {
+        assert row.isStatic();
+        serialize(row, header, out, 0, version);
+    }
+
+    private void serialize(Row row, SerializationHeader header, DataOutputPlus out, long previousUnfilteredSize, int version)
+    throws IOException
+    {
+        int flags = 0;
+        int extendedFlags = 0;
+
+        boolean isStatic = row.isStatic();
+        Columns headerColumns = header.columns(isStatic);
+        LivenessInfo pkLiveness = row.primaryKeyLivenessInfo();
+        Row.Deletion deletion = row.deletion();
+        boolean hasComplexDeletion = row.hasComplexDeletion();
+        boolean hasAllColumns = (row.columnCount() == headerColumns.size());
+        boolean hasExtendedFlags = hasExtendedFlags(row);
+
+        if (isStatic)
+            extendedFlags |= IS_STATIC;
+
+        if (!pkLiveness.isEmpty())
+            flags |= HAS_TIMESTAMP;
+        if (pkLiveness.isExpiring())
+            flags |= HAS_TTL;
+        if (!deletion.isLive())
+        {
+            flags |= HAS_DELETION;
+            if (deletion.isShadowable())
+                extendedFlags |= HAS_SHADOWABLE_DELETION;
+        }
+        if (hasComplexDeletion)
+            flags |= HAS_COMPLEX_DELETION;
+        if (hasAllColumns)
+            flags |= HAS_ALL_COLUMNS;
+
+        if (hasExtendedFlags)
+            flags |= EXTENSION_FLAG;
+
+        out.writeByte((byte)flags);
+        if (hasExtendedFlags)
+            out.writeByte((byte)extendedFlags);
+
+        if (!isStatic)
+            Clustering.serializer.serialize(row.clustering(), out, version, header.clusteringTypes());
+
+        if (header.isForSSTable())
+        {
+            out.writeUnsignedVInt(serializedRowBodySize(row, header, previousUnfilteredSize, version));
+            out.writeUnsignedVInt(previousUnfilteredSize);
+        }
+
+        if ((flags & HAS_TIMESTAMP) != 0)
+            header.writeTimestamp(pkLiveness.timestamp(), out);
+        if ((flags & HAS_TTL) != 0)
+        {
+            header.writeTTL(pkLiveness.ttl(), out);
+            header.writeLocalDeletionTime(pkLiveness.localExpirationTime(), out);
+        }
+        if ((flags & HAS_DELETION) != 0)
+            header.writeDeletionTime(deletion.time(), out);
+
+        if (!hasAllColumns)
+            Columns.serializer.serializeSubset(row.columns(), headerColumns, out);
+
+        SearchIterator<ColumnDefinition, ColumnDefinition> si = headerColumns.iterator();
+        for (ColumnData data : row)
+        {
+            // We can obtain the column for data directly from data.column(). However, if the cell/complex data
+            // originates from a sstable, the column we'll get will have the type used when the sstable was serialized,
+            // and if that type have been recently altered, that may not be the type we want to serialize the column
+            // with. So we use the ColumnDefinition from the "header" which is "current". Also see #11810 for what
+            // happens if we don't do that.
+            ColumnDefinition column = si.next(data.column());
+            assert column != null;
+
+            if (data.column.isSimple())
+                Cell.serializer.serialize((Cell) data, column, out, pkLiveness, header);
+            else
+                writeComplexColumn((ComplexColumnData) data, column, hasComplexDeletion, pkLiveness, header, out);
+        }
+    }
+
+    private void writeComplexColumn(ComplexColumnData data, ColumnDefinition column, boolean hasComplexDeletion, LivenessInfo rowLiveness, SerializationHeader header, DataOutputPlus out)
+    throws IOException
+    {
+        if (hasComplexDeletion)
+            header.writeDeletionTime(data.complexDeletion(), out);
+
+        out.writeUnsignedVInt(data.cellsCount());
+        for (Cell cell : data)
+            Cell.serializer.serialize(cell, column, out, rowLiveness, header);
+    }
+
+    private void serialize(RangeTombstoneMarker marker, SerializationHeader header, DataOutputPlus out, long previousUnfilteredSize, int version)
+    throws IOException
+    {
+        out.writeByte((byte)IS_MARKER);
+        RangeTombstone.Bound.serializer.serialize(marker.clustering(), out, version, header.clusteringTypes());
+
+        if (header.isForSSTable())
+        {
+            out.writeUnsignedVInt(serializedMarkerBodySize(marker, header, previousUnfilteredSize, version));
+            out.writeUnsignedVInt(previousUnfilteredSize);
+        }
+
+        if (marker.isBoundary())
+        {
+            RangeTombstoneBoundaryMarker bm = (RangeTombstoneBoundaryMarker)marker;
+            header.writeDeletionTime(bm.endDeletionTime(), out);
+            header.writeDeletionTime(bm.startDeletionTime(), out);
+        }
+        else
+        {
+            header.writeDeletionTime(((RangeTombstoneBoundMarker)marker).deletionTime(), out);
+        }
+    }
+
+    public long serializedSize(Unfiltered unfiltered, SerializationHeader header, int version)
+    {
+        assert !header.isForSSTable();
+        return serializedSize(unfiltered, header, 0, version);
+    }
+
+    public long serializedSize(Unfiltered unfiltered, SerializationHeader header, long previousUnfilteredSize,int version)
+    {
+        return unfiltered.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER
+             ? serializedSize((RangeTombstoneMarker) unfiltered, header, previousUnfilteredSize, version)
+             : serializedSize((Row) unfiltered, header, previousUnfilteredSize, version);
+    }
+
+    private long serializedSize(Row row, SerializationHeader header, long previousUnfilteredSize, int version)
+    {
+        long size = 1; // flags
+
+        if (hasExtendedFlags(row))
+            size += 1; // extended flags
+
+        if (!row.isStatic())
+            size += Clustering.serializer.serializedSize(row.clustering(), version, header.clusteringTypes());
+
+        return size + serializedRowBodySize(row, header, previousUnfilteredSize, version);
+    }
+
+    private long serializedRowBodySize(Row row, SerializationHeader header, long previousUnfilteredSize, int version)
+    {
+        long size = 0;
+
+        if (header.isForSSTable())
+            size += TypeSizes.sizeofUnsignedVInt(previousUnfilteredSize);
+
+        boolean isStatic = row.isStatic();
+        Columns headerColumns = header.columns(isStatic);
+        LivenessInfo pkLiveness = row.primaryKeyLivenessInfo();
+        Row.Deletion deletion = row.deletion();
+        boolean hasComplexDeletion = row.hasComplexDeletion();
+        boolean hasAllColumns = (row.columnCount() == headerColumns.size());
+
+        if (!pkLiveness.isEmpty())
+            size += header.timestampSerializedSize(pkLiveness.timestamp());
+        if (pkLiveness.isExpiring())
+        {
+            size += header.ttlSerializedSize(pkLiveness.ttl());
+            size += header.localDeletionTimeSerializedSize(pkLiveness.localExpirationTime());
+        }
+        if (!deletion.isLive())
+            size += header.deletionTimeSerializedSize(deletion.time());
+
+        if (!hasAllColumns)
+            size += Columns.serializer.serializedSubsetSize(row.columns(), header.columns(isStatic));
+
+        SearchIterator<ColumnDefinition, ColumnDefinition> si = headerColumns.iterator();
+        for (ColumnData data : row)
+        {
+            ColumnDefinition column = si.next(data.column());
+            assert column != null;
+
+            if (data.column.isSimple())
+                size += Cell.serializer.serializedSize((Cell) data, column, pkLiveness, header);
+            else
+                size += sizeOfComplexColumn((ComplexColumnData) data, column, hasComplexDeletion, pkLiveness, header);
+        }
+
+        return size;
+    }
+
+    private long sizeOfComplexColumn(ComplexColumnData data, ColumnDefinition column, boolean hasComplexDeletion, LivenessInfo rowLiveness, SerializationHeader header)
+    {
+        long size = 0;
+
+        if (hasComplexDeletion)
+            size += header.deletionTimeSerializedSize(data.complexDeletion());
+
+        size += TypeSizes.sizeofUnsignedVInt(data.cellsCount());
+        for (Cell cell : data)
+            size += Cell.serializer.serializedSize(cell, column, rowLiveness, header);
+
+        return size;
+    }
+
+    private long serializedSize(RangeTombstoneMarker marker, SerializationHeader header, long previousUnfilteredSize, int version)
+    {
+        assert !header.isForSSTable();
+        return 1 // flags
+             + RangeTombstone.Bound.serializer.serializedSize(marker.clustering(), version, header.clusteringTypes())
+             + serializedMarkerBodySize(marker, header, previousUnfilteredSize, version);
+    }
+
+    private long serializedMarkerBodySize(RangeTombstoneMarker marker, SerializationHeader header, long previousUnfilteredSize, int version)
+    {
+        long size = 0;
+        if (header.isForSSTable())
+            size += TypeSizes.sizeofUnsignedVInt(previousUnfilteredSize);
+
+        if (marker.isBoundary())
+        {
+            RangeTombstoneBoundaryMarker bm = (RangeTombstoneBoundaryMarker)marker;
+            size += header.deletionTimeSerializedSize(bm.endDeletionTime());
+            size += header.deletionTimeSerializedSize(bm.startDeletionTime());
+        }
+        else
+        {
+           size += header.deletionTimeSerializedSize(((RangeTombstoneBoundMarker)marker).deletionTime());
+        }
+        return size;
+    }
+
+    public void writeEndOfPartition(DataOutputPlus out) throws IOException
+    {
+        out.writeByte((byte)1);
+    }
+
+    public long serializedSizeEndOfPartition()
+    {
+        return 1;
+    }
+
+    /**
+     * Deserialize an {@link Unfiltered} from the provided input.
+     *
+     * @param in the input from which to deserialize.
+     * @param header serialization header corresponding to the serialized data.
+     * @param helper the helper to use for deserialization.
+     * @param builder a row builder, passed here so we don't allocate a new one for every new row.
+     * @return the deserialized {@link Unfiltered} or {@code null} if we've read the end of a partition. This method is
+     * guaranteed to never return empty rows.
+     */
+    public Unfiltered deserialize(DataInputPlus in, SerializationHeader header, SerializationHelper helper, Row.Builder builder)
+    throws IOException
+    {
+        while (true)
+        {
+            Unfiltered unfiltered = deserializeOne(in, header, helper, builder);
+            if (unfiltered == null)
+                return null;
+
+            // Skip empty rows, see deserializeOne javadoc
+            if (!unfiltered.isEmpty())
+                return unfiltered;
+        }
+    }
+
+    /**
+     * Deserialize a single {@link Unfiltered} from the provided input.
+     * <p>
+     * <b>WARNING:</b> this can return an empty row because it's possible there is a row serialized, but that row only
+     * contains data for dropped columns, see CASSANDRA-13337. But as most code expect rows to not be empty, this isn't
+     * meant to be exposed publicly.
+     *
+     * But as {@link UnfilteredRowIterator} should not return empty
+     * rows, this mean consumer of this method should make sure to skip said empty rows.
+     */
+    private Unfiltered deserializeOne(DataInputPlus in, SerializationHeader header, SerializationHelper helper, Row.Builder builder)
+    throws IOException
+    {
+        // It wouldn't be wrong per-se to use an unsorted builder, but it would be inefficient so make sure we don't do it by mistake
+        assert builder.isSorted();
+
+        int flags = in.readUnsignedByte();
+        if (isEndOfPartition(flags))
+            return null;
+
+        int extendedFlags = readExtendedFlags(in, flags);
+
+        if (kind(flags) == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+        {
+            RangeTombstone.Bound bound = RangeTombstone.Bound.serializer.deserialize(in, helper.version, header.clusteringTypes());
+            return deserializeMarkerBody(in, header, bound);
+        }
+        else
+        {
+            // deserializeStaticRow should be used for that.
+            if (isStatic(extendedFlags))
+                throw new IOException("Corrupt flags value for unfiltered partition (isStatic flag set): " + flags);
+
+            builder.newRow(Clustering.serializer.deserialize(in, helper.version, header.clusteringTypes()));
+            return deserializeRowBody(in, header, helper, flags, extendedFlags, builder);
+        }
+    }
+
+    public Row deserializeStaticRow(DataInputPlus in, SerializationHeader header, SerializationHelper helper)
+    throws IOException
+    {
+        int flags = in.readUnsignedByte();
+        assert !isEndOfPartition(flags) && kind(flags) == Unfiltered.Kind.ROW && isExtended(flags) : flags;
+        int extendedFlags = in.readUnsignedByte();
+        Row.Builder builder = BTreeRow.sortedBuilder();
+        builder.newRow(Clustering.STATIC_CLUSTERING);
+        return deserializeRowBody(in, header, helper, flags, extendedFlags, builder);
+    }
+
+    public RangeTombstoneMarker deserializeMarkerBody(DataInputPlus in, SerializationHeader header, RangeTombstone.Bound bound)
+    throws IOException
+    {
+        if (header.isForSSTable())
+        {
+            in.readUnsignedVInt(); // marker size
+            in.readUnsignedVInt(); // previous unfiltered size
+        }
+
+        if (bound.isBoundary())
+            return new RangeTombstoneBoundaryMarker(bound, header.readDeletionTime(in), header.readDeletionTime(in));
+        else
+            return new RangeTombstoneBoundMarker(bound, header.readDeletionTime(in));
+    }
+
+    public Row deserializeRowBody(DataInputPlus in,
+                                  SerializationHeader header,
+                                  SerializationHelper helper,
+                                  int flags,
+                                  int extendedFlags,
+                                  Row.Builder builder)
+    throws IOException
+    {
+        try
+        {
+            boolean isStatic = isStatic(extendedFlags);
+            boolean hasTimestamp = (flags & HAS_TIMESTAMP) != 0;
+            boolean hasTTL = (flags & HAS_TTL) != 0;
+            boolean hasDeletion = (flags & HAS_DELETION) != 0;
+            boolean deletionIsShadowable = (extendedFlags & HAS_SHADOWABLE_DELETION) != 0;
+            boolean hasComplexDeletion = (flags & HAS_COMPLEX_DELETION) != 0;
+            boolean hasAllColumns = (flags & HAS_ALL_COLUMNS) != 0;
+            Columns headerColumns = header.columns(isStatic);
+
+            if (header.isForSSTable())
+            {
+                in.readUnsignedVInt(); // Skip row size
+                in.readUnsignedVInt(); // previous unfiltered size
+            }
+
+            LivenessInfo rowLiveness = LivenessInfo.EMPTY;
+            if (hasTimestamp)
+            {
+                long timestamp = header.readTimestamp(in);
+                int ttl = hasTTL ? header.readTTL(in) : LivenessInfo.NO_TTL;
+                int localDeletionTime = hasTTL ? header.readLocalDeletionTime(in) : LivenessInfo.NO_EXPIRATION_TIME;
+                rowLiveness = LivenessInfo.create(timestamp, ttl, localDeletionTime);
+            }
+
+            builder.addPrimaryKeyLivenessInfo(rowLiveness);
+            builder.addRowDeletion(hasDeletion ? new Row.Deletion(header.readDeletionTime(in), deletionIsShadowable) : Row.Deletion.LIVE);
+
+            Columns columns = hasAllColumns ? headerColumns : Columns.serializer.deserializeSubset(headerColumns, in);
+            for (ColumnDefinition column : columns)
+            {
+                if (column.isSimple())
+                    readSimpleColumn(column, in, header, helper, builder, rowLiveness);
+                else
+                    readComplexColumn(column, in, header, helper, hasComplexDeletion, builder, rowLiveness);
+            }
+
+            return builder.build();
+        }
+        catch (RuntimeException | AssertionError e)
+        {
+            // Corrupted data could be such that it triggers an assertion in the row Builder, or break one of its assumption.
+            // Of course, a bug in said builder could also trigger this, but it's impossible a priori to always make the distinction
+            // between a real bug and data corrupted in just the bad way. Besides, re-throwing as an IOException doesn't hide the
+            // exception, it just make we catch it properly and mark the sstable as corrupted.
+            throw new IOException("Error building row with data deserialized from " + in, e);
+        }
+    }
+
+    private void readSimpleColumn(ColumnDefinition column, DataInputPlus in, SerializationHeader header, SerializationHelper helper, Row.Builder builder, LivenessInfo rowLiveness)
+    throws IOException
+    {
+        if (helper.includes(column))
+        {
+            Cell cell = Cell.serializer.deserialize(in, rowLiveness, column, header, helper);
+            if (!helper.isDropped(cell, false))
+                builder.addCell(cell);
+        }
+        else
+        {
+            Cell.serializer.skip(in, column, header);
+        }
+    }
+
+    private void readComplexColumn(ColumnDefinition column, DataInputPlus in, SerializationHeader header, SerializationHelper helper, boolean hasComplexDeletion, Row.Builder builder, LivenessInfo rowLiveness)
+    throws IOException
+    {
+        if (helper.includes(column))
+        {
+            helper.startOfComplexColumn(column);
+            if (hasComplexDeletion)
+            {
+                DeletionTime complexDeletion = header.readDeletionTime(in);
+                if (!helper.isDroppedComplexDeletion(complexDeletion))
+                    builder.addComplexDeletion(column, complexDeletion);
+            }
+
+            int count = (int) in.readUnsignedVInt();
+            while (--count >= 0)
+            {
+                Cell cell = Cell.serializer.deserialize(in, rowLiveness, column, header, helper);
+                if (helper.includes(cell.path()) && !helper.isDropped(cell, true))
+                    builder.addCell(cell);
+            }
+
+            helper.endOfComplexColumn();
+        }
+        else
+        {
+            skipComplexColumn(in, column, header, hasComplexDeletion);
+        }
+    }
+
+    public void skipRowBody(DataInputPlus in) throws IOException
+    {
+        int rowSize = (int)in.readUnsignedVInt();
+        in.skipBytesFully(rowSize);
+    }
+
+    public void skipStaticRow(DataInputPlus in, SerializationHeader header, SerializationHelper helper) throws IOException
+    {
+        int flags = in.readUnsignedByte();
+        assert !isEndOfPartition(flags) && kind(flags) == Unfiltered.Kind.ROW && isExtended(flags) : "Flags is " + flags;
+        int extendedFlags = in.readUnsignedByte();
+        assert isStatic(extendedFlags);
+        skipRowBody(in);
+    }
+
+    public void skipMarkerBody(DataInputPlus in) throws IOException
+    {
+        int markerSize = (int)in.readUnsignedVInt();
+        in.skipBytesFully(markerSize);
+    }
+
+    private void skipComplexColumn(DataInputPlus in, ColumnDefinition column, SerializationHeader header, boolean hasComplexDeletion)
+    throws IOException
+    {
+        if (hasComplexDeletion)
+            header.skipDeletionTime(in);
+
+        int count = (int) in.readUnsignedVInt();
+        while (--count >= 0)
+            Cell.serializer.skip(in, column, header);
+    }
+
+    public static boolean isEndOfPartition(int flags)
+    {
+        return (flags & END_OF_PARTITION) != 0;
+    }
+
+    public static Unfiltered.Kind kind(int flags)
+    {
+        return (flags & IS_MARKER) != 0 ? Unfiltered.Kind.RANGE_TOMBSTONE_MARKER : Unfiltered.Kind.ROW;
+    }
+
+    public static boolean isStatic(int extendedFlags)
+    {
+        return (extendedFlags & IS_STATIC) != 0;
+    }
+
+    private static boolean isExtended(int flags)
+    {
+        return (flags & EXTENSION_FLAG) != 0;
+    }
+
+    public static int readExtendedFlags(DataInputPlus in, int flags) throws IOException
+    {
+        return isExtended(flags) ? in.readUnsignedByte() : 0;
+    }
+
+    public static boolean hasExtendedFlags(Row row)
+    {
+        return row.isStatic() || row.deletion().isShadowable();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/rows/WrappingUnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/WrappingUnfilteredRowIterator.java
new file mode 100644
index 0000000..8b18554
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/WrappingUnfilteredRowIterator.java

@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import com.google.common.collect.UnmodifiableIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+
+/**
+ * Abstract class to make writing unfiltered iterators that wrap another iterator
+ * easier. By default, the wrapping iterator simply delegate every call to
+ * the wrapped iterator so concrete implementations will have to override
+ * some of the methods.
+ * <p>
+ * Note that if most of what you want to do is modifying/filtering the returned
+ * {@code Unfiltered}, {@link org.apache.cassandra.db.transform.Transformation.apply} can be a simpler option.
+ */
+public abstract class WrappingUnfilteredRowIterator extends UnmodifiableIterator<Unfiltered>  implements UnfilteredRowIterator
+{
+    protected final UnfilteredRowIterator wrapped;
+
+    protected WrappingUnfilteredRowIterator(UnfilteredRowIterator wrapped)
+    {
+        this.wrapped = wrapped;
+    }
+
+    public CFMetaData metadata()
+    {
+        return wrapped.metadata();
+    }
+
+    public PartitionColumns columns()
+    {
+        return wrapped.columns();
+    }
+
+    public boolean isReverseOrder()
+    {
+        return wrapped.isReverseOrder();
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return wrapped.partitionKey();
+    }
+
+    public DeletionTime partitionLevelDeletion()
+    {
+        return wrapped.partitionLevelDeletion();
+    }
+
+    public Row staticRow()
+    {
+        return wrapped.staticRow();
+    }
+
+    public EncodingStats stats()
+    {
+        return wrapped.stats();
+    }
+
+    public boolean hasNext()
+    {
+        return wrapped.hasNext();
+    }
+
+    public Unfiltered next()
+    {
+        return wrapped.next();
+    }
+
+    public void close()
+    {
+        wrapped.close();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/BaseIterator.java b/src/java/org/apache/cassandra/db/transform/BaseIterator.java
new file mode 100644
index 0000000..d00e406
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/BaseIterator.java

@@ -0,0 +1,160 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import net.nicoulaj.compilecommand.annotations.DontInline;
+import org.apache.cassandra.utils.CloseableIterator;
+
+import static org.apache.cassandra.utils.Throwables.maybeFail;
+import static org.apache.cassandra.utils.Throwables.merge;
+
+abstract class BaseIterator<V, I extends CloseableIterator<? extends V>, O extends V> extends Stack implements AutoCloseable, Iterator<O>
+{
+    I input;
+    V next;
+
+    // We require two stop signals for correctness, since the `stop` reference of the base iterator can "leak"
+    // into the transformations stack. Using a single `stop` signal may result into the inconsistent state,
+    // since stopping transformation would stop only the child iterator.
+
+    // Signals that the base iterator has been signalled to stop. Applies at the end of the current next().
+    Stop stop;
+    // Signals that the current child iterator has been signalled to stop.
+    Stop stopChild;
+
+    static class Stop
+    {
+        // TODO: consider moving "next" into here, so that a stop() when signalled outside of a function call (e.g. in attach)
+        // can take effect immediately; this doesn't seem to be necessary at the moment, but it might cause least surprise in future
+        boolean isSignalled;
+    }
+
+    // responsibility for initialising next lies with the subclass
+    BaseIterator(BaseIterator<? extends V, ? extends I, ?> copyFrom)
+    {
+        super(copyFrom);
+        this.input = copyFrom.input;
+        this.next = copyFrom.next;
+        this.stop = copyFrom.stop;
+        this.stopChild = copyFrom.stopChild;
+    }
+
+    BaseIterator(I input)
+    {
+        this.input = input;
+        this.stop = new Stop();
+        this.stopChild = this.stop;
+    }
+
+    /**
+     * run the corresponding runOnClose method for the first length transformations.
+     *
+     * used in hasMoreContents to close the methods preceding the MoreContents
+     */
+    protected abstract Throwable runOnClose(int length);
+
+    /**
+     * apply the relevant method from the transformation to the value.
+     *
+     * used in hasMoreContents to apply the functions that follow the MoreContents
+     */
+    protected abstract V applyOne(V value, Transformation transformation);
+
+    public final void close()
+    {
+        Throwable fail = runOnClose(length);
+        if (next instanceof AutoCloseable)
+        {
+            try { ((AutoCloseable) next).close(); }
+            catch (Throwable t) { fail = merge(fail, t); }
+        }
+        try { input.close(); }
+        catch (Throwable t) { fail = merge(fail, t); }
+        maybeFail(fail);
+    }
+
+    public final O next()
+    {
+        if (next == null && !hasNext())
+            throw new NoSuchElementException();
+
+        O next = (O) this.next;
+        this.next = null;
+        return next;
+    }
+
+    // may set next != null if the next contents are a transforming iterator that already has data to return,
+    // in which case we immediately have more contents to yield
+    protected final boolean hasMoreContents()
+    {
+        return moreContents.length > 0 && tryGetMoreContents();
+    }
+
+    @DontInline
+    private boolean tryGetMoreContents()
+    {
+        for (int i = 0 ; i < moreContents.length ; i++)
+        {
+            MoreContentsHolder holder = moreContents[i];
+            MoreContents provider = holder.moreContents;
+            I newContents = (I) provider.moreContents();
+            if (newContents == null)
+                continue;
+
+            input.close();
+            input = newContents;
+            Stack prefix = EMPTY;
+            if (newContents instanceof BaseIterator)
+            {
+                // we're refilling with transformed contents, so swap in its internals directly
+                // TODO: ensure that top-level data is consistent. i.e. staticRow, partitionlevelDeletion etc are same?
+                BaseIterator abstr = (BaseIterator) newContents;
+                prefix = abstr;
+                input = (I) abstr.input;
+                stopChild = abstr.stop;
+                next = apply((V) abstr.next, holder.length); // must apply all remaining functions to the next, if any
+            }
+
+            // since we're truncating our transformation stack to only those occurring after the extend transformation
+            // we have to run any prior runOnClose methods
+            maybeFail(runOnClose(holder.length));
+            refill(prefix, holder, i);
+
+            if (next != null || input.hasNext())
+                return true;
+
+            i = -1;
+        }
+        return false;
+    }
+
+    // apply the functions [from..length)
+    private V apply(V next, int from)
+    {
+        while (next != null & from < length)
+            next = applyOne(next, stack[from++]);
+        return next;
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/db/transform/BasePartitions.java b/src/java/org/apache/cassandra/db/transform/BasePartitions.java
new file mode 100644
index 0000000..f6c486d
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/BasePartitions.java

@@ -0,0 +1,120 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import java.util.Collections;
+
+import org.apache.cassandra.db.partitions.BasePartitionIterator;
+import org.apache.cassandra.db.rows.BaseRowIterator;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.utils.Throwables.merge;
+
+public abstract class BasePartitions<R extends BaseRowIterator<?>, I extends BasePartitionIterator<? extends BaseRowIterator<?>>>
+extends BaseIterator<BaseRowIterator<?>, I, R>
+implements BasePartitionIterator<R>
+{
+
+    public BasePartitions(I input)
+    {
+        super(input);
+    }
+
+    BasePartitions(BasePartitions<?, ? extends I> copyFrom)
+    {
+        super(copyFrom);
+    }
+
+
+    // *********************************
+
+
+    protected BaseRowIterator<?> applyOne(BaseRowIterator<?> value, Transformation transformation)
+    {
+        return value == null ? null : transformation.applyToPartition(value);
+    }
+
+    void add(Transformation transformation)
+    {
+        transformation.attachTo(this);
+        super.add(transformation);
+        next = applyOne(next, transformation);
+    }
+
+    protected Throwable runOnClose(int length)
+    {
+        Throwable fail = null;
+        Transformation[] fs = stack;
+        for (int i = 0 ; i < length ; i++)
+        {
+            try
+            {
+                fs[i].onClose();
+            }
+            catch (Throwable t)
+            {
+                fail = merge(fail, t);
+            }
+        }
+        return fail;
+    }
+
+    public final boolean hasNext()
+    {
+        BaseRowIterator<?> next = null;
+        try
+        {
+
+            Stop stop = this.stop;
+            while (this.next == null)
+            {
+                Transformation[] fs = stack;
+                int len = length;
+
+                while (!stop.isSignalled && !stopChild.isSignalled && input.hasNext())
+                {
+                    next = input.next();
+                    for (int i = 0 ; next != null & i < len ; i++)
+                        next = fs[i].applyToPartition(next);
+
+                    if (next != null)
+                    {
+                        this.next = next;
+                        return true;
+                    }
+                }
+
+                if (stop.isSignalled || !hasMoreContents())
+                    return false;
+            }
+            return true;
+
+        }
+        catch (Throwable t)
+        {
+            if (next != null)
+                Throwables.close(t, Collections.singleton(next));
+            throw t;
+        }
+    }
+
+}
+

diff --git a/src/java/org/apache/cassandra/db/transform/BaseRows.java b/src/java/org/apache/cassandra/db/transform/BaseRows.java
new file mode 100644
index 0000000..e6ce1da
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/BaseRows.java

@@ -0,0 +1,160 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.rows.*;
+
+import static org.apache.cassandra.utils.Throwables.merge;
+
+public abstract class BaseRows<R extends Unfiltered, I extends BaseRowIterator<? extends Unfiltered>>
+extends BaseIterator<Unfiltered, I, R>
+implements BaseRowIterator<R>
+{
+
+    private Row staticRow;
+
+    public BaseRows(I input)
+    {
+        super(input);
+        staticRow = input.staticRow();
+    }
+
+    // swap parameter order to avoid casting errors
+    BaseRows(BaseRows<?, ? extends I> copyFrom)
+    {
+        super(copyFrom);
+        staticRow = copyFrom.staticRow;
+    }
+
+    public CFMetaData metadata()
+    {
+        return input.metadata();
+    }
+
+    public boolean isReverseOrder()
+    {
+        return input.isReverseOrder();
+    }
+
+    public PartitionColumns columns()
+    {
+        return input.columns();
+    }
+
+    public DecoratedKey partitionKey()
+    {
+        return input.partitionKey();
+    }
+
+    public Row staticRow()
+    {
+        return staticRow == null ? Rows.EMPTY_STATIC_ROW : staticRow;
+    }
+
+
+    // **************************
+
+
+    @Override
+    protected Throwable runOnClose(int length)
+    {
+        Throwable fail = null;
+        Transformation[] fs = stack;
+        for (int i = 0 ; i < length ; i++)
+        {
+            try
+            {
+                fs[i].onPartitionClose();
+            }
+            catch (Throwable t)
+            {
+                fail = merge(fail, t);
+            }
+        }
+        return fail;
+    }
+
+    @Override
+    void add(Transformation transformation)
+    {
+        transformation.attachTo(this);
+        super.add(transformation);
+
+        // transform any existing data
+        if (staticRow != null)
+            staticRow = transformation.applyToStatic(staticRow);
+        next = applyOne(next, transformation);
+    }
+
+    @Override
+    protected Unfiltered applyOne(Unfiltered value, Transformation transformation)
+    {
+        return value == null
+               ? null
+               : value instanceof Row
+                 ? transformation.applyToRow((Row) value)
+                 : transformation.applyToMarker((RangeTombstoneMarker) value);
+    }
+
+    @Override
+    public final boolean hasNext()
+    {
+        Stop stop = this.stop;
+        while (this.next == null)
+        {
+            Transformation[] fs = stack;
+            int len = length;
+
+            while (!stop.isSignalled && !stopChild.isSignalled && input.hasNext())
+            {
+                Unfiltered next = input.next();
+
+                if (next.isRow())
+                {
+                    Row row = (Row) next;
+                    for (int i = 0 ; row != null && i < len ; i++)
+                        row = fs[i].applyToRow(row);
+                    next = row;
+                }
+                else
+                {
+                    RangeTombstoneMarker rtm = (RangeTombstoneMarker) next;
+                    for (int i = 0 ; rtm != null && i < len ; i++)
+                        rtm = fs[i].applyToMarker(rtm);
+                    next = rtm;
+                }
+
+                if (next != null)
+                {
+                    this.next = next;
+                    return true;
+                }
+            }
+
+            if (stop.isSignalled || stopChild.isSignalled || !hasMoreContents())
+                return false;
+        }
+        return true;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/DuplicateRowChecker.java b/src/java/org/apache/cassandra/db/transform/DuplicateRowChecker.java
new file mode 100644
index 0000000..7a6f7f9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/DuplicateRowChecker.java

@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.transform;
+
+import java.net.InetAddress;
+import java.util.Collections;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.utils.DiagnosticSnapshotService;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class DuplicateRowChecker extends Transformation<BaseRowIterator<?>>
+{
+    private static final Logger logger = LoggerFactory.getLogger(DuplicateRowChecker.class);
+
+    Clustering previous = null;
+    int duplicatesDetected = 0;
+
+    final String stage;
+    final List<InetAddress> replicas;
+    final CFMetaData metadata;
+    final DecoratedKey key;
+    final boolean snapshotOnDuplicate;
+
+    DuplicateRowChecker(final DecoratedKey key,
+                        final CFMetaData metadata,
+                        final String stage,
+                        final boolean snapshotOnDuplicate,
+                        final List<InetAddress> replicas)
+    {
+        this.key = key;
+        this.metadata = metadata;
+        this.stage = stage;
+        this.snapshotOnDuplicate = snapshotOnDuplicate;
+        this.replicas = replicas;
+    }
+
+    protected DeletionTime applyToDeletion(DeletionTime deletionTime)
+    {
+        return deletionTime;
+    }
+
+    protected RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+    {
+        return marker;
+    }
+
+    protected Row applyToStatic(Row row)
+    {
+        return row;
+    }
+
+    protected Row applyToRow(Row row)
+    {
+        if (null != previous && row.clustering().equals(previous))
+            duplicatesDetected++;
+        previous = row.clustering();
+        return row;
+    }
+
+    protected void onPartitionClose()
+    {
+        if (duplicatesDetected > 0)
+        {
+            logger.warn("Detected {} duplicate rows for {} during {}",
+                        duplicatesDetected,
+                        metadata.getKeyValidator().getString(key.getKey()),
+                        stage);
+            if (snapshotOnDuplicate)
+                DiagnosticSnapshotService.duplicateRows(metadata, replicas);
+        }
+        duplicatesDetected = 0;
+        previous = null;
+        super.onPartitionClose();
+    }
+
+    public static UnfilteredPartitionIterator duringCompaction(final UnfilteredPartitionIterator iterator, OperationType type)
+    {
+        if (!DatabaseDescriptor.checkForDuplicateRowsDuringCompaction())
+            return iterator;
+        final List<InetAddress> address = Collections.singletonList(FBUtilities.getBroadcastAddress());
+        final boolean snapshot = DatabaseDescriptor.snapshotOnDuplicateRowDetection();
+        return Transformation.apply(iterator, new Transformation<UnfilteredRowIterator>()
+        {
+            protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
+            {
+                return Transformation.apply(partition, new DuplicateRowChecker(partition.partitionKey(),
+                                                                               partition.metadata(),
+                                                                               type.toString(),
+                                                                               snapshot,
+                                                                               address));
+            }
+        });
+    }
+
+    public static PartitionIterator duringRead(final PartitionIterator iterator, final List<InetAddress> replicas)
+    {
+        if (!DatabaseDescriptor.checkForDuplicateRowsDuringReads())
+            return iterator;
+        final boolean snapshot = DatabaseDescriptor.snapshotOnDuplicateRowDetection();
+        return Transformation.apply(iterator, new Transformation<RowIterator>()
+        {
+            protected RowIterator applyToPartition(RowIterator partition)
+            {
+                return Transformation.apply(partition, new DuplicateRowChecker(partition.partitionKey(),
+                                                                               partition.metadata(),
+                                                                               "Read",
+                                                                               snapshot,
+                                                                               replicas));
+            }
+        });
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/EmptyPartitionsDiscarder.java b/src/java/org/apache/cassandra/db/transform/EmptyPartitionsDiscarder.java
new file mode 100644
index 0000000..5e41cec
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/EmptyPartitionsDiscarder.java

@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.rows.BaseRowIterator;
+
+public final class EmptyPartitionsDiscarder extends Transformation<BaseRowIterator<?>>
+{
+    @Override
+    protected BaseRowIterator applyToPartition(BaseRowIterator iterator)
+    {
+        if (iterator.isEmpty())
+        {
+            iterator.close();
+            return null;
+        }
+
+        return iterator;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/Filter.java b/src/java/org/apache/cassandra/db/transform/Filter.java
new file mode 100644
index 0000000..48a1634
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/Filter.java

@@ -0,0 +1,66 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.DeletionPurger;
+import org.apache.cassandra.db.rows.*;
+
+public final class Filter extends Transformation
+{
+    private final int nowInSec;
+    private final boolean enforceStrictLiveness;
+
+    public Filter(int nowInSec, boolean enforceStrictLiveness)
+    {
+        this.nowInSec = nowInSec;
+        this.enforceStrictLiveness = enforceStrictLiveness;
+    }
+
+    @Override
+    protected RowIterator applyToPartition(BaseRowIterator iterator)
+    {
+        return iterator instanceof UnfilteredRows
+             ? new FilteredRows(this, (UnfilteredRows) iterator)
+             : new FilteredRows((UnfilteredRowIterator) iterator, this);
+    }
+
+    @Override
+    protected Row applyToStatic(Row row)
+    {
+        if (row.isEmpty())
+            return Rows.EMPTY_STATIC_ROW;
+
+        row = row.purge(DeletionPurger.PURGE_ALL, nowInSec, enforceStrictLiveness);
+        return row == null ? Rows.EMPTY_STATIC_ROW : row;
+    }
+
+    @Override
+    protected Row applyToRow(Row row)
+    {
+        return row.purge(DeletionPurger.PURGE_ALL, nowInSec, enforceStrictLiveness);
+    }
+
+    @Override
+    protected RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+    {
+        return null;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java b/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java
new file mode 100644
index 0000000..b835a6b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java

@@ -0,0 +1,70 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.partitions.BasePartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.RowIterator;
+
+public final class FilteredPartitions extends BasePartitions<RowIterator, BasePartitionIterator<?>> implements PartitionIterator
+{
+    // wrap basic iterator for transformation
+    FilteredPartitions(PartitionIterator input)
+    {
+        super(input);
+    }
+
+    // wrap basic unfiltered iterator for transformation, applying filter as first transformation
+    FilteredPartitions(UnfilteredPartitionIterator input, Filter filter)
+    {
+        super(input);
+        add(filter);
+    }
+
+    // copy from an UnfilteredPartitions, applying a filter to convert it
+    FilteredPartitions(Filter filter, UnfilteredPartitions copyFrom)
+    {
+        super(copyFrom);
+        add(filter);
+    }
+
+    /**
+     * Filter any RangeTombstoneMarker from the iterator's iterators, transforming it into a PartitionIterator.
+     */
+    public static FilteredPartitions filter(UnfilteredPartitionIterator iterator, int nowInSecs)
+    {
+        FilteredPartitions filtered = filter(iterator,
+                                             new Filter(nowInSecs,
+                                                        iterator.metadata().enforceStrictLiveness()));
+
+        return iterator.isForThrift()
+             ? filtered
+             : (FilteredPartitions) Transformation.apply(filtered, new EmptyPartitionsDiscarder());
+    }
+
+    public static FilteredPartitions filter(UnfilteredPartitionIterator iterator, Filter filter)
+    {
+        return iterator instanceof UnfilteredPartitions
+             ? new FilteredPartitions(filter, (UnfilteredPartitions) iterator)
+             : new FilteredPartitions(iterator, filter);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/FilteredRows.java b/src/java/org/apache/cassandra/db/transform/FilteredRows.java
new file mode 100644
index 0000000..349183c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/FilteredRows.java

@@ -0,0 +1,60 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.rows.BaseRowIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+public final class FilteredRows extends BaseRows<Row, BaseRowIterator<?>> implements RowIterator
+{
+    FilteredRows(RowIterator input)
+    {
+        super(input);
+    }
+
+    FilteredRows(UnfilteredRowIterator input, Filter filter)
+    {
+        super(input);
+        add(filter);
+    }
+
+    FilteredRows(Filter filter, UnfilteredRows input)
+    {
+        super(input);
+        add(filter);
+    }
+
+    @Override
+    public boolean isEmpty()
+    {
+        return staticRow().isEmpty() && !hasNext();
+    }
+
+    /**
+     * Filter any RangeTombstoneMarker from the iterator, transforming it into a RowIterator.
+     */
+    public static RowIterator filter(UnfilteredRowIterator iterator, int nowInSecs)
+    {
+        return new Filter(nowInSecs, iterator.metadata().enforceStrictLiveness()).applyToPartition(iterator);
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/MoreContents.java b/src/java/org/apache/cassandra/db/transform/MoreContents.java
new file mode 100644
index 0000000..5277b07
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/MoreContents.java

@@ -0,0 +1,28 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+// a shared internal interface, that is hidden to provide type-safety to the user
+interface MoreContents<I>
+{
+    public abstract I moreContents();
+}
+

diff --git a/src/java/org/apache/cassandra/db/transform/MorePartitions.java b/src/java/org/apache/cassandra/db/transform/MorePartitions.java
new file mode 100644
index 0000000..898eb7d
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/MorePartitions.java

@@ -0,0 +1,55 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.partitions.BasePartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+
+import static org.apache.cassandra.db.transform.Transformation.add;
+import static org.apache.cassandra.db.transform.Transformation.mutable;
+
+/**
+ * An interface for providing new partitions for a partitions iterator.
+ *
+ * The new contents are produced as a normal arbitrary PartitionIterator or UnfilteredPartitionIterator (as appropriate)
+ *
+ * The transforming iterator invokes this method when any current source is exhausted, then then inserts the
+ * new contents as the new source.
+ *
+ * If the new source is itself a product of any transformations, the two transforming iterators are merged
+ * so that control flow always occurs at the outermost point
+ */
+public interface MorePartitions<I extends BasePartitionIterator<?>> extends MoreContents<I>
+{
+
+    public static UnfilteredPartitionIterator extend(UnfilteredPartitionIterator iterator, MorePartitions<? super UnfilteredPartitionIterator> more)
+    {
+        return add(mutable(iterator), more);
+    }
+
+    public static PartitionIterator extend(PartitionIterator iterator, MorePartitions<? super PartitionIterator> more)
+    {
+        return add(mutable(iterator), more);
+    }
+
+}
+

diff --git a/src/java/org/apache/cassandra/db/transform/MoreRows.java b/src/java/org/apache/cassandra/db/transform/MoreRows.java
new file mode 100644
index 0000000..118739b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/MoreRows.java

@@ -0,0 +1,62 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.rows.BaseRowIterator;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+import static org.apache.cassandra.db.transform.Transformation.add;
+import static org.apache.cassandra.db.transform.Transformation.mutable;
+
+/**
+ * An interface for providing new row contents for a partition.
+ *
+ * The new contents are produced as a normal arbitrary RowIterator or UnfilteredRowIterator (as appropriate),
+ * with matching staticRow, partitionKey and partitionLevelDeletion.
+ *
+ * The transforming iterator invokes this method when any current source is exhausted, then then inserts the
+ * new contents as the new source.
+ *
+ * If the new source is itself a product of any transformations, the two transforming iterators are merged
+ * so that control flow always occurs at the outermost point
+ */
+public interface MoreRows<I extends BaseRowIterator<?>> extends MoreContents<I>
+{
+
+    public static UnfilteredRowIterator extend(UnfilteredRowIterator iterator, MoreRows<? super UnfilteredRowIterator> more)
+    {
+        return add(mutable(iterator), more);
+    }
+
+    public static UnfilteredRowIterator extend(UnfilteredRowIterator iterator, MoreRows<? super UnfilteredRowIterator> more, PartitionColumns columns)
+    {
+        return add(Transformation.wrapIterator(iterator, columns), more);
+    }
+
+    public static RowIterator extend(RowIterator iterator, MoreRows<? super RowIterator> more)
+    {
+        return add(mutable(iterator), more);
+    }
+
+}
+

diff --git a/src/java/org/apache/cassandra/db/transform/RTBoundCloser.java b/src/java/org/apache/cassandra/db/transform/RTBoundCloser.java
new file mode 100644
index 0000000..ee5401d
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/RTBoundCloser.java

@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.ReadOrderGroup;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+
+/**
+ * A transformation that appends an RT bound marker to row iterators in case they don't have one.
+ *
+ * This used to happen, for example, in {@link org.apache.cassandra.db.ReadCommand#executeLocally(ReadOrderGroup)},
+ * if {@link org.apache.cassandra.db.filter.DataLimits} stopped the iterator on a live row that was enclosed in an
+ * older RT.
+ *
+ * If we don't do this, and send a response without the closing bound, we can break read/short read protection read
+ * isolation, and potentially cause data loss.
+ *
+ * See CASSANDRA-14515 for context.
+ */
+public final class RTBoundCloser extends Transformation<UnfilteredRowIterator>
+{
+    private RTBoundCloser()
+    {
+    }
+
+    public static UnfilteredPartitionIterator close(UnfilteredPartitionIterator partitions)
+    {
+        return Transformation.apply(partitions, new RTBoundCloser());
+    }
+
+    public static UnfilteredRowIterator close(UnfilteredRowIterator partition)
+    {
+        RowsTransformation transformation = new RowsTransformation(partition);
+        return Transformation.apply(MoreRows.extend(partition, transformation, partition.columns()), transformation);
+    }
+
+    @Override
+    public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
+    {
+        RowsTransformation transformation = new RowsTransformation(partition);
+        return Transformation.apply(MoreRows.extend(partition, transformation, partition.columns()), transformation);
+    }
+
+    private final static class RowsTransformation extends Transformation implements MoreRows<UnfilteredRowIterator>
+    {
+        private final UnfilteredRowIterator partition;
+
+        private Clustering lastRowClustering;
+        private DeletionTime openMarkerDeletionTime;
+
+        private RowsTransformation(UnfilteredRowIterator partition)
+        {
+            this.partition = partition;
+        }
+
+        @Override
+        public Row applyToRow(Row row)
+        {
+            lastRowClustering = row.clustering();
+            return row;
+        }
+
+        @Override
+        public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+        {
+            openMarkerDeletionTime =
+                marker.isOpen(partition.isReverseOrder()) ? marker.openDeletionTime(partition.isReverseOrder()) : null;
+            lastRowClustering = null;
+            return marker;
+        }
+
+        @Override
+        public UnfilteredRowIterator moreContents()
+        {
+            // there is no open RT in the stream - nothing for us to do
+            if (null == openMarkerDeletionTime)
+                return null;
+
+            /*
+             * there *is* an open RT in the stream, but there have been no rows after the opening bound - this must
+             * never happen in scenarios where RTBoundCloser is meant to be used; the last encountered clustering
+             * should be either a closing bound marker - if the iterator was exhausted fully - or a live row - if
+             * DataLimits stopped it short in the middle of an RT.
+             */
+            if (null == lastRowClustering)
+            {
+                CFMetaData metadata = partition.metadata();
+                String message =
+                    String.format("UnfilteredRowIterator for %s.%s has an open RT bound as its last item", metadata.ksName, metadata.cfName);
+                throw new IllegalStateException(message);
+            }
+
+            // create an artificial inclusive closing RT bound with bound matching last seen row's clustering
+            RangeTombstoneBoundMarker closingBound =
+                RangeTombstoneBoundMarker.inclusiveClose(partition.isReverseOrder(), lastRowClustering.getRawValues(), openMarkerDeletionTime);
+
+            return UnfilteredRowIterators.singleton(closingBound,
+                                                    partition.metadata(),
+                                                    partition.partitionKey(),
+                                                    partition.partitionLevelDeletion(),
+                                                    partition.columns(),
+                                                    partition.staticRow(),
+                                                    partition.isReverseOrder(),
+                                                    partition.stats());
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/RTBoundValidator.java b/src/java/org/apache/cassandra/db/transform/RTBoundValidator.java
new file mode 100644
index 0000000..1f675cf
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/RTBoundValidator.java

@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+/**
+ * A validating transformation that sanity-checks the sequence of RT bounds and boundaries in every partition.
+ *
+ * What we validate, specifically:
+ * - that open markers are only followed by close markers
+ * - that open markers and close markers have equal deletion times
+ * - optionally, that the iterator closes its last RT marker
+ */
+public final class RTBoundValidator extends Transformation<UnfilteredRowIterator>
+{
+    public enum Stage { MEMTABLE, SSTABLE, MERGED, PURGED, PROCESSED }
+
+    private final Stage stage;
+    private final boolean enforceIsClosed;
+
+    private RTBoundValidator(Stage stage, boolean enforceIsClosed)
+    {
+        this.stage = stage;
+        this.enforceIsClosed = enforceIsClosed;
+    }
+
+    public static UnfilteredPartitionIterator validate(UnfilteredPartitionIterator partitions, Stage stage, boolean enforceIsClosed)
+    {
+        return Transformation.apply(partitions, new RTBoundValidator(stage, enforceIsClosed));
+    }
+
+    public static UnfilteredRowIterator validate(UnfilteredRowIterator partition, Stage stage, boolean enforceIsClosed)
+    {
+        return Transformation.apply(partition, new RowsTransformation(stage, partition.metadata(), partition.isReverseOrder(), enforceIsClosed));
+    }
+
+    @Override
+    public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
+    {
+        return Transformation.apply(partition, new RowsTransformation(stage, partition.metadata(), partition.isReverseOrder(), enforceIsClosed));
+    }
+
+    private final static class RowsTransformation extends Transformation
+    {
+        private final Stage stage;
+        private final CFMetaData metadata;
+        private final boolean isReverseOrder;
+        private final boolean enforceIsClosed;
+
+        private DeletionTime openMarkerDeletionTime;
+
+        private RowsTransformation(Stage stage, CFMetaData metadata, boolean isReverseOrder, boolean enforceIsClosed)
+        {
+            this.stage = stage;
+            this.metadata = metadata;
+            this.isReverseOrder = isReverseOrder;
+            this.enforceIsClosed = enforceIsClosed;
+        }
+
+        @Override
+        public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+        {
+            if (null == openMarkerDeletionTime)
+            {
+                 // there is no open RT in the stream - we are expecting a *_START_BOUND
+                if (marker.isClose(isReverseOrder))
+                    throw ise("unexpected end bound or boundary " + marker.toString(metadata));
+            }
+            else
+            {
+                // there is an open RT in the stream - we are expecting a *_BOUNDARY or an *_END_BOUND
+                if (!marker.isClose(isReverseOrder))
+                    throw ise("start bound followed by another start bound " + marker.toString(metadata));
+
+                // deletion times of open/close markers must match
+                DeletionTime deletionTime = marker.closeDeletionTime(isReverseOrder);
+                if (!deletionTime.equals(openMarkerDeletionTime))
+                    throw ise("open marker and close marker have different deletion times");
+
+                openMarkerDeletionTime = null;
+            }
+
+            if (marker.isOpen(isReverseOrder))
+                openMarkerDeletionTime = marker.openDeletionTime(isReverseOrder);
+
+            return marker;
+        }
+
+        @Override
+        public void onPartitionClose()
+        {
+            if (enforceIsClosed && null != openMarkerDeletionTime)
+                throw ise("expected all RTs to be closed, but the last one is open");
+        }
+
+        private IllegalStateException ise(String why)
+        {
+            String message = String.format("%s UnfilteredRowIterator for %s.%s has an illegal RT bounds sequence: %s",
+                                           stage, metadata.ksName, metadata.cfName, why);
+            throw new IllegalStateException(message);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/Stack.java b/src/java/org/apache/cassandra/db/transform/Stack.java
new file mode 100644
index 0000000..f680ec9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/Stack.java

@@ -0,0 +1,101 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import java.util.Arrays;
+
+class Stack
+{
+    static final Stack EMPTY = new Stack();
+
+    Transformation[] stack;
+    int length; // number of used stack entries
+    MoreContentsHolder[] moreContents; // stack of more contents providers (if any; usually zero or one)
+
+    // an internal placeholder for a MoreContents, storing the associated stack length at time it was applied
+    static class MoreContentsHolder
+    {
+        final MoreContents moreContents;
+        int length;
+        private MoreContentsHolder(MoreContents moreContents, int length)
+        {
+            this.moreContents = moreContents;
+            this.length = length;
+        }
+    }
+
+    Stack()
+    {
+        stack = new Transformation[0];
+        moreContents = new MoreContentsHolder[0];
+    }
+
+    Stack(Stack copy)
+    {
+        stack = copy.stack;
+        length = copy.length;
+        moreContents = copy.moreContents;
+    }
+
+    void add(Transformation add)
+    {
+        if (length == stack.length)
+            stack = resize(stack);
+        stack[length++] = add;
+    }
+
+    void add(MoreContents more)
+    {
+        this.moreContents = Arrays.copyOf(moreContents, moreContents.length + 1);
+        this.moreContents[moreContents.length - 1] = new MoreContentsHolder(more, length);
+    }
+
+    private static <E> E[] resize(E[] array)
+    {
+        int newLen = array.length == 0 ? 5 : array.length * 2;
+        return Arrays.copyOf(array, newLen);
+    }
+
+    // reinitialise the transformations after a moreContents applies
+    void refill(Stack prefix, MoreContentsHolder holder, int index)
+    {
+        // drop the transformations that were present when the MoreContents was attached,
+        // and prefix any transformations in the new contents (if it's a transformer)
+        moreContents = splice(prefix.moreContents, prefix.moreContents.length, moreContents, index, moreContents.length);
+        stack = splice(prefix.stack, prefix.length, stack, holder.length, length);
+        length += prefix.length - holder.length;
+        holder.length = prefix.length;
+    }
+
+    private static <E> E[] splice(E[] prefix, int prefixCount, E[] keep, int keepFrom, int keepTo)
+    {
+        int keepCount = keepTo - keepFrom;
+        int newCount = prefixCount + keepCount;
+        if (newCount > keep.length)
+            keep = Arrays.copyOf(keep, newCount);
+        if (keepFrom != prefixCount)
+            System.arraycopy(keep, keepFrom, keep, prefixCount, keepCount);
+        if (prefixCount != 0)
+            System.arraycopy(prefix, 0, keep, 0, prefixCount);
+        return keep;
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/db/transform/StoppingTransformation.java b/src/java/org/apache/cassandra/db/transform/StoppingTransformation.java
new file mode 100644
index 0000000..79563e9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/StoppingTransformation.java

@@ -0,0 +1,87 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import net.nicoulaj.compilecommand.annotations.DontInline;
+import org.apache.cassandra.db.rows.BaseRowIterator;
+
+// A Transformation that can stop an iterator earlier than its natural exhaustion
+public abstract class StoppingTransformation<I extends BaseRowIterator<?>> extends Transformation<I>
+{
+    private BaseIterator rows;
+    private BaseIterator partitions;
+
+    /**
+     * If invoked by a subclass, any partitions iterator this transformation has been applied to will terminate
+     * after any currently-processing item is returned, as will any row/unfiltered iterator
+     */
+    @DontInline
+    protected void stop()
+    {
+        if (partitions != null)
+        {
+            partitions.stop.isSignalled = true;
+            partitions.stopChild.isSignalled = true;
+        }
+
+        stopInPartition();
+    }
+
+    /**
+     * If invoked by a subclass, any rows/unfiltered iterator this transformation has been applied to will terminate
+     * after any currently-processing item is returned
+     */
+    @DontInline
+    protected void stopInPartition()
+    {
+        if (rows != null)
+        {
+            rows.stop.isSignalled = true;
+            rows.stopChild.isSignalled = true;
+        }
+    }
+
+    @Override
+    protected void attachTo(BasePartitions partitions)
+    {
+        assert this.partitions == null;
+        this.partitions = partitions;
+    }
+
+    @Override
+    protected void attachTo(BaseRows rows)
+    {
+        assert this.rows == null;
+        this.rows = rows;
+    }
+
+    @Override
+    protected void onClose()
+    {
+        partitions = null;
+    }
+
+    @Override
+    protected void onPartitionClose()
+    {
+        rows = null;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/Transformation.java b/src/java/org/apache/cassandra/db/transform/Transformation.java
new file mode 100644
index 0000000..06dd057
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/Transformation.java

@@ -0,0 +1,189 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+
+/**
+ * We have a single common superclass for all Transformations to make implementation efficient.
+ * we have a shared stack for all transformations, and can share the same transformation across partition and row
+ * iterators, reducing garbage. Internal code is also simplified by always having a basic no-op implementation to invoke.
+ *
+ * Only the necessary methods need be overridden. Early termination is provided by invoking the method's stop or stopInPartition
+ * methods, rather than having their own abstract method to invoke, as this is both more efficient and simpler to reason about.
+ */
+public abstract class Transformation<I extends BaseRowIterator<?>>
+{
+    // internal methods for StoppableTransformation only
+    void attachTo(BasePartitions partitions) { }
+    void attachTo(BaseRows rows) { }
+
+    /**
+     * Run on the close of any (logical) partitions iterator this function was applied to
+     *
+     * We stipulate logical, because if applied to a transformed iterator the lifetime of the iterator
+     * object may be longer than the lifetime of the "logical" iterator it was applied to; if the iterator
+     * is refilled with MoreContents, for instance, the iterator may outlive this function
+     */
+    protected void onClose() { }
+
+    /**
+     * Run on the close of any (logical) rows iterator this function was applied to
+     *
+     * We stipulate logical, because if applied to a transformed iterator the lifetime of the iterator
+     * object may be longer than the lifetime of the "logical" iterator it was applied to; if the iterator
+     * is refilled with MoreContents, for instance, the iterator may outlive this function
+     */
+    protected void onPartitionClose() { }
+
+    /**
+     * Applied to any rows iterator (partition) we encounter in a partitions iterator
+     */
+    protected I applyToPartition(I partition)
+    {
+        return partition;
+    }
+
+    /**
+     * Applied to any row we encounter in a rows iterator
+     */
+    protected Row applyToRow(Row row)
+    {
+        return row;
+    }
+
+    /**
+     * Applied to any RTM we encounter in a rows/unfiltered iterator
+     */
+    protected RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+    {
+        return marker;
+    }
+
+    /**
+     * Applied to the static row of any rows iterator.
+     *
+     * NOTE that this is only applied to the first iterator in any sequence of iterators filled by a MoreContents;
+     * the static data for such iterators is all expected to be equal
+     */
+    protected Row applyToStatic(Row row)
+    {
+        return row;
+    }
+
+    /**
+     * Applied to the partition-level deletion of any rows iterator.
+     *
+     * NOTE that this is only applied to the first iterator in any sequence of iterators filled by a MoreContents;
+     * the static data for such iterators is all expected to be equal
+     */
+    protected DeletionTime applyToDeletion(DeletionTime deletionTime)
+    {
+        return deletionTime;
+    }
+
+
+    //******************************************************
+    //          Static Application Methods
+    //******************************************************
+
+
+    public static UnfilteredPartitionIterator apply(UnfilteredPartitionIterator iterator, Transformation<? super UnfilteredRowIterator> transformation)
+    {
+        return add(mutable(iterator), transformation);
+    }
+    public static PartitionIterator apply(PartitionIterator iterator, Transformation<? super RowIterator> transformation)
+    {
+        return add(mutable(iterator), transformation);
+    }
+    public static UnfilteredRowIterator apply(UnfilteredRowIterator iterator, Transformation<?> transformation)
+    {
+        return add(mutable(iterator), transformation);
+    }
+    public static RowIterator apply(RowIterator iterator, Transformation<?> transformation)
+    {
+        return add(mutable(iterator), transformation);
+    }
+
+    static UnfilteredPartitions mutable(UnfilteredPartitionIterator iterator)
+    {
+        return iterator instanceof UnfilteredPartitions
+               ? (UnfilteredPartitions) iterator
+               : new UnfilteredPartitions(iterator);
+    }
+    static FilteredPartitions mutable(PartitionIterator iterator)
+    {
+        return iterator instanceof FilteredPartitions
+               ? (FilteredPartitions) iterator
+               : new FilteredPartitions(iterator);
+    }
+    static UnfilteredRows mutable(UnfilteredRowIterator iterator)
+    {
+        return iterator instanceof UnfilteredRows
+               ? (UnfilteredRows) iterator
+               : new UnfilteredRows(iterator);
+    }
+    static FilteredRows mutable(RowIterator iterator)
+    {
+        return iterator instanceof FilteredRows
+               ? (FilteredRows) iterator
+               : new FilteredRows(iterator);
+    }
+
+    /**
+     * Even though this method is sumilar to `mutable`, it supresses the optimisation of avoiding creating an additional
+     * wrapping interator object (which both creates an extra object and grows the call stack during the iteration), it
+     * should be used with caution.
+     *
+     * It is useful in cases when the input has to be checked for more contents rather than directly checking if it
+     * is stopped. For example, when concatenating two iterators (pseudocode):
+     *
+     *    iter1 = [row(1), row(2), row(3)]
+     *    iter2 = [row(4), row(5), row(6)]
+     *
+     *    UnfilteredRowIterators.concat(DataLimits.cqlLimits(1).filter(iter1), DataLimits.cqlLimits(1).filter(iter1))
+     *
+     * Which should yield two rows: [row(1), row(4)].
+     *
+     * Using stacked transformations instead of wrapping would result into returning a single row, since the first
+     * iterator will signal the iterator is stopped.
+     */
+    static UnfilteredRows wrapIterator(UnfilteredRowIterator iterator, PartitionColumns columns)
+    {
+        return new UnfilteredRows(iterator, columns);
+    }
+
+    static <E extends BaseIterator> E add(E to, Transformation add)
+    {
+        to.add(add);
+        return to;
+    }
+    static <E extends BaseIterator> E add(E to, MoreContents add)
+    {
+        to.add(add);
+        return to;
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/db/transform/UnfilteredPartitions.java b/src/java/org/apache/cassandra/db/transform/UnfilteredPartitions.java
new file mode 100644
index 0000000..bad14ad
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/UnfilteredPartitions.java

@@ -0,0 +1,47 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+final class UnfilteredPartitions extends BasePartitions<UnfilteredRowIterator, UnfilteredPartitionIterator> implements UnfilteredPartitionIterator
+{
+    final boolean isForThrift;
+
+    // wrap an iterator for transformation
+    public UnfilteredPartitions(UnfilteredPartitionIterator input)
+    {
+        super(input);
+        this.isForThrift = input.isForThrift();
+    }
+
+    public boolean isForThrift()
+    {
+        return isForThrift;
+    }
+
+    public CFMetaData metadata()
+    {
+        return input.metadata();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java b/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java
new file mode 100644
index 0000000..c631f2e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/transform/UnfilteredRows.java

@@ -0,0 +1,73 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.transform;
+
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+
+final class UnfilteredRows extends BaseRows<Unfiltered, UnfilteredRowIterator> implements UnfilteredRowIterator
+{
+    private PartitionColumns columns;
+    private DeletionTime partitionLevelDeletion;
+
+    public UnfilteredRows(UnfilteredRowIterator input)
+    {
+        this(input, input.columns());
+    }
+
+    public UnfilteredRows(UnfilteredRowIterator input, PartitionColumns columns)
+    {
+        super(input);
+        this.columns = columns;
+        partitionLevelDeletion = input.partitionLevelDeletion();
+    }
+
+    public PartitionColumns columns()
+    {
+        return columns;
+    }
+
+    @Override
+    void add(Transformation add)
+    {
+        super.add(add);
+        partitionLevelDeletion = add.applyToDeletion(partitionLevelDeletion);
+    }
+
+    public DeletionTime partitionLevelDeletion()
+    {
+        return partitionLevelDeletion;
+    }
+
+    public EncodingStats stats()
+    {
+        return input.stats();
+    }
+
+    @Override
+    public boolean isEmpty()
+    {
+        return staticRow().isEmpty() && partitionLevelDeletion().isLive() && !hasNext();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/view/TableViews.java b/src/java/org/apache/cassandra/db/view/TableViews.java
new file mode 100644
index 0000000..d2d4a45
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/view/TableViews.java

@@ -0,0 +1,556 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.view;
+
+import java.util.*;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.PeekingIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+
+/**
+ * Groups all the views for a given table.
+ */
+public class TableViews extends AbstractCollection<View>
+{
+    private final CFMetaData baseTableMetadata;
+
+    // We need this to be thread-safe, but the number of times this is changed (when a view is created in the keyspace)
+    // is massively exceeded by the number of times it's read (for every mutation on the keyspace), so a copy-on-write
+    // list is the best option.
+    private final List<View> views = new CopyOnWriteArrayList();
+
+    public TableViews(CFMetaData baseTableMetadata)
+    {
+        this.baseTableMetadata = baseTableMetadata;
+    }
+
+    public int size()
+    {
+        return views.size();
+    }
+
+    public Iterator<View> iterator()
+    {
+        return views.iterator();
+    }
+
+    public boolean contains(String viewName)
+    {
+        return Iterables.any(views, view -> view.name.equals(viewName));
+    }
+
+    public boolean add(View view)
+    {
+        // We should have validated that there is no existing view with this name at this point
+        assert !contains(view.name);
+        return views.add(view);
+    }
+
+    public Iterable<ColumnFamilyStore> allViewsCfs()
+    {
+        Keyspace keyspace = Keyspace.open(baseTableMetadata.ksName);
+        return Iterables.transform(views, view -> keyspace.getColumnFamilyStore(view.getDefinition().viewName));
+    }
+
+    public void forceBlockingFlush()
+    {
+        for (ColumnFamilyStore viewCfs : allViewsCfs())
+            viewCfs.forceBlockingFlush();
+    }
+
+    public void dumpMemtables()
+    {
+        for (ColumnFamilyStore viewCfs : allViewsCfs())
+            viewCfs.dumpMemtable();
+    }
+
+    public void truncateBlocking(ReplayPosition replayAfter, long truncatedAt)
+    {
+        for (ColumnFamilyStore viewCfs : allViewsCfs())
+        {
+            viewCfs.discardSSTables(truncatedAt);
+            SystemKeyspace.saveTruncationRecord(viewCfs, truncatedAt, replayAfter);
+        }
+    }
+
+    public void removeByName(String viewName)
+    {
+        views.removeIf(v -> v.name.equals(viewName));
+    }
+
+    /**
+     * Calculates and pushes updates to the views replicas. The replicas are determined by
+     * {@link ViewUtils#getViewNaturalEndpoint(String, Token, Token)}.
+     *
+     * @param update an update on the base table represented by this object.
+     * @param writeCommitLog whether we should write the commit log for the view updates.
+     * @param baseComplete time from epoch in ms that the local base mutation was (or will be) completed
+     */
+    public void pushViewReplicaUpdates(PartitionUpdate update, boolean writeCommitLog, AtomicLong baseComplete)
+    {
+        assert update.metadata().cfId.equals(baseTableMetadata.cfId);
+
+        Collection<View> views = updatedViews(update);
+        if (views.isEmpty())
+            return;
+
+        // Read modified rows
+        int nowInSec = FBUtilities.nowInSeconds();
+        SinglePartitionReadCommand command = readExistingRowsCommand(update, views, nowInSec);
+        if (command == null)
+            return;
+
+        ColumnFamilyStore cfs = Keyspace.openAndGetStore(update.metadata());
+        long start = System.nanoTime();
+        Collection<Mutation> mutations;
+        try (ReadOrderGroup orderGroup = command.startOrderGroup();
+             UnfilteredRowIterator existings = UnfilteredPartitionIterators.getOnlyElement(command.executeLocally(orderGroup), command);
+             UnfilteredRowIterator updates = update.unfilteredIterator())
+        {
+            mutations = Iterators.getOnlyElement(generateViewUpdates(views, updates, existings, nowInSec, false));
+        }
+        Keyspace.openAndGetStore(update.metadata()).metric.viewReadTime.update(System.nanoTime() - start, TimeUnit.NANOSECONDS);
+
+        if (!mutations.isEmpty())
+            StorageProxy.mutateMV(update.partitionKey().getKey(), mutations, writeCommitLog, baseComplete);
+    }
+
+
+    /**
+     * Given some updates on the base table of this object and the existing values for the rows affected by that update, generates the
+     * mutation to be applied to the provided views.
+     *
+     * @param views the views potentially affected by {@code updates}.
+     * @param updates the base table updates being applied.
+     * @param existings the existing values for the rows affected by {@code updates}. This is used to decide if a view is
+     * obsoleted by the update and should be removed, gather the values for columns that may not be part of the update if
+     * a new view entry needs to be created, and compute the minimal updates to be applied if the view entry isn't changed
+     * but has simply some updated values. This will be empty for view building as we want to assume anything we'll pass
+     * to {@code updates} is new.
+     * @param nowInSec the current time in seconds.
+     * @param separateUpdates, if false, mutation is per partition.
+     * @return the mutations to apply to the {@code views}. This can be empty.
+     */
+    public Iterator<Collection<Mutation>> generateViewUpdates(Collection<View> views,
+                                                              UnfilteredRowIterator updates,
+                                                              UnfilteredRowIterator existings,
+                                                              int nowInSec,
+                                                              boolean separateUpdates)
+    {
+        assert updates.metadata().cfId.equals(baseTableMetadata.cfId);
+
+        List<ViewUpdateGenerator> generators = new ArrayList<>(views.size());
+        for (View view : views)
+            generators.add(new ViewUpdateGenerator(view, updates.partitionKey(), nowInSec));
+
+        DeletionTracker existingsDeletion = new DeletionTracker(existings.partitionLevelDeletion());
+        DeletionTracker updatesDeletion = new DeletionTracker(updates.partitionLevelDeletion());
+
+        /*
+         * We iterate through the updates and the existing rows in parallel. This allows us to know the consequence
+         * on the view of each update.
+         */
+        PeekingIterator<Unfiltered> existingsIter = Iterators.peekingIterator(existings);
+        PeekingIterator<Unfiltered> updatesIter = Iterators.peekingIterator(updates);
+
+        while (existingsIter.hasNext() && updatesIter.hasNext())
+        {
+            Unfiltered existing = existingsIter.peek();
+            Unfiltered update = updatesIter.peek();
+
+            Row existingRow;
+            Row updateRow;
+            int cmp = baseTableMetadata.comparator.compare(update, existing);
+            if (cmp < 0)
+            {
+                // We have an update where there was nothing before
+                if (update.isRangeTombstoneMarker())
+                {
+                    updatesDeletion.update(updatesIter.next());
+                    continue;
+                }
+
+                updateRow = ((Row)updatesIter.next()).withRowDeletion(updatesDeletion.currentDeletion());
+                existingRow = emptyRow(updateRow.clustering(), existingsDeletion.currentDeletion());
+            }
+            else if (cmp > 0)
+            {
+                // We have something existing but no update (which will happen either because it's a range tombstone marker in
+                // existing, or because we've fetched the existing row due to some partition/range deletion in the updates)
+                if (existing.isRangeTombstoneMarker())
+                {
+                    existingsDeletion.update(existingsIter.next());
+                    continue;
+                }
+
+                existingRow = ((Row)existingsIter.next()).withRowDeletion(existingsDeletion.currentDeletion());
+                updateRow = emptyRow(existingRow.clustering(), updatesDeletion.currentDeletion());
+
+                // The way we build the read command used for existing rows, we should always have updatesDeletion.currentDeletion()
+                // that is not live, since we wouldn't have read the existing row otherwise. And we could assert that, but if we ever
+                // change the read method so that it can slightly over-read in some case, that would be an easily avoiding bug lurking,
+                // so we just handle the case.
+                if (updateRow == null)
+                    continue;
+            }
+            else
+            {
+                // We're updating a row that had pre-existing data
+                if (update.isRangeTombstoneMarker())
+                {
+                    assert existing.isRangeTombstoneMarker();
+                    updatesDeletion.update(updatesIter.next());
+                    existingsDeletion.update(existingsIter.next());
+                    continue;
+                }
+
+                assert !existing.isRangeTombstoneMarker();
+                existingRow = ((Row)existingsIter.next()).withRowDeletion(existingsDeletion.currentDeletion());
+                updateRow = ((Row)updatesIter.next()).withRowDeletion(updatesDeletion.currentDeletion());
+            }
+
+            addToViewUpdateGenerators(existingRow, updateRow, generators, nowInSec);
+        }
+
+        // We only care about more existing rows if the update deletion isn't live, i.e. if we had a partition deletion
+        if (!updatesDeletion.currentDeletion().isLive())
+        {
+            while (existingsIter.hasNext())
+            {
+                Unfiltered existing = existingsIter.next();
+                // If it's a range tombstone, we don't care, we're only looking for existing entry that gets deleted by
+                // the new partition deletion
+                if (existing.isRangeTombstoneMarker())
+                    continue;
+
+                Row existingRow = (Row)existing;
+                addToViewUpdateGenerators(existingRow, emptyRow(existingRow.clustering(), updatesDeletion.currentDeletion()), generators, nowInSec);
+            }
+        }
+
+        if (separateUpdates)
+        {
+            final Collection<Mutation> firstBuild = buildMutations(baseTableMetadata, generators);
+
+            return new Iterator<Collection<Mutation>>()
+            {
+                // If the previous values are already empty, this update must be either empty or exclusively appending.
+                // In the case we are exclusively appending, we need to drop the build that was passed in and try to build a
+                // new first update instead.
+                // If there are no other updates, next will be null and the iterator will be empty.
+                Collection<Mutation> next = firstBuild.isEmpty()
+                                            ? buildNext()
+                                            : firstBuild;
+
+                private Collection<Mutation> buildNext()
+                {
+                    while (updatesIter.hasNext())
+                    {
+                        Unfiltered update = updatesIter.next();
+                        // If it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it for view updates
+                        if (update.isRangeTombstoneMarker())
+                            continue;
+
+                        Row updateRow = (Row) update;
+                        addToViewUpdateGenerators(emptyRow(updateRow.clustering(), existingsDeletion.currentDeletion()),
+                                                  updateRow,
+                                                  generators,
+                                                  nowInSec);
+
+                        // If the updates have been filtered, then we won't have any mutations; we need to make sure that we
+                        // only return if the mutations are empty. Otherwise, we continue to search for an update which is
+                        // not filtered
+                        Collection<Mutation> mutations = buildMutations(baseTableMetadata, generators);
+                        if (!mutations.isEmpty())
+                            return mutations;
+                    }
+
+                    return null;
+                }
+
+                public boolean hasNext()
+                {
+                    return next != null;
+                }
+
+                public Collection<Mutation> next()
+                {
+                    Collection<Mutation> mutations = next;
+
+                    next = buildNext();
+
+                    assert !mutations.isEmpty() : "Expected mutations to be non-empty";
+                    return mutations;
+                }
+            };
+        }
+        else
+        {
+            while (updatesIter.hasNext())
+            {
+                Unfiltered update = updatesIter.next();
+                // If it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it for view updates
+                if (update.isRangeTombstoneMarker())
+                    continue;
+
+                Row updateRow = (Row) update;
+                addToViewUpdateGenerators(emptyRow(updateRow.clustering(), existingsDeletion.currentDeletion()),
+                                          updateRow,
+                                          generators,
+                                          nowInSec);
+            }
+
+            return Iterators.singletonIterator(buildMutations(baseTableMetadata, generators));
+        }
+    }
+
+    /**
+     * Return the views that are potentially updated by the provided updates.
+     *
+     * @param updates the updates applied to the base table.
+     * @return the views affected by {@code updates}.
+     */
+    public Collection<View> updatedViews(PartitionUpdate updates)
+    {
+        List<View> matchingViews = new ArrayList<>(views.size());
+
+        for (View view : views)
+        {
+            ReadQuery selectQuery = view.getReadQuery();
+            if (!selectQuery.selectsKey(updates.partitionKey()))
+                continue;
+
+            matchingViews.add(view);
+        }
+        return matchingViews;
+    }
+
+    /**
+     * Returns the command to use to read the existing rows required to generate view updates for the provided base
+     * base updates.
+     *
+     * @param updates the base table updates being applied.
+     * @param views the views potentially affected by {@code updates}.
+     * @param nowInSec the current time in seconds.
+     * @return the command to use to read the base table rows required to generate view updates for {@code updates}.
+     */
+    private SinglePartitionReadCommand readExistingRowsCommand(PartitionUpdate updates, Collection<View> views, int nowInSec)
+    {
+        Slices.Builder sliceBuilder = null;
+        DeletionInfo deletionInfo = updates.deletionInfo();
+        CFMetaData metadata = updates.metadata();
+        DecoratedKey key = updates.partitionKey();
+        // TODO: This is subtle: we need to gather all the slices that we have to fetch between partition del, range tombstones and rows.
+        if (!deletionInfo.isLive())
+        {
+            sliceBuilder = new Slices.Builder(metadata.comparator);
+            // Everything covered by a deletion might invalidate an existing view entry, which means we must read it to know. In practice
+            // though, the views involved might filter some base table clustering columns, in which case we can restrict what we read
+            // using those restrictions.
+            // If there is a partition deletion, then we can simply take each slices from each view select filter. They may overlap but
+            // the Slices.Builder handles that for us. Note that in many case this will just involve reading everything (as soon as any
+            // view involved has no clustering restrictions for instance).
+            // For range tombstone, we should theoretically take the difference between the range tombstoned and the slices selected
+            // by every views, but as we don't an easy way to compute that right now, we keep it simple and just use the tombstoned
+            // range.
+            // TODO: we should improve that latter part.
+            if (!deletionInfo.getPartitionDeletion().isLive())
+            {
+                for (View view : views)
+                    sliceBuilder.addAll(view.getSelectStatement().clusteringIndexFilterAsSlices());
+            }
+            else
+            {
+                assert deletionInfo.hasRanges();
+                Iterator<RangeTombstone> iter = deletionInfo.rangeIterator(false);
+                while (iter.hasNext())
+                    sliceBuilder.add(iter.next().deletedSlice());
+            }
+        }
+
+        // We need to read every row that is updated, unless we can prove that it has no impact on any view entries.
+
+        // If we had some slices from the deletions above, we'll continue using that. Otherwise, it's more efficient to build
+        // a names query.
+        BTreeSet.Builder<Clustering> namesBuilder = sliceBuilder == null ? BTreeSet.builder(metadata.comparator) : null;
+        for (Row row : updates)
+        {
+            // Don't read the existing state if we can prove the update won't affect any views
+            if (!affectsAnyViews(key, row, views))
+                continue;
+
+            if (namesBuilder == null)
+                sliceBuilder.add(Slice.make(row.clustering()));
+            else
+                namesBuilder.add(row.clustering());
+        }
+
+        NavigableSet<Clustering> names = namesBuilder == null ? null : namesBuilder.build();
+        // If we have a slice builder, it means we had some deletions and we have to read. But if we had
+        // only row updates, it's possible none of them affected the views, in which case we have nothing
+        // to do.
+        if (names != null && names.isEmpty())
+            return null;
+
+        ClusteringIndexFilter clusteringFilter = names == null
+                                               ? new ClusteringIndexSliceFilter(sliceBuilder.build(), false)
+                                               : new ClusteringIndexNamesFilter(names, false);
+        // since unselected columns also affect view liveness, we need to query all base columns if base and view have same key columns.
+        // If we have more than one view, we should merge the queried columns by each views but to keep it simple we just
+        // include everything. We could change that in the future.
+        ColumnFilter queriedColumns = views.size() == 1 && metadata.enforceStrictLiveness()
+                                   ? Iterables.getOnlyElement(views).getSelectStatement().queriedColumns()
+                                   : ColumnFilter.all(metadata);
+        // Note that the views could have restrictions on regular columns, but even if that's the case we shouldn't apply those
+        // when we read, because even if an existing row doesn't match the view filter, the update can change that in which
+        // case we'll need to know the existing content. There is also no easy way to merge those RowFilter when we have multiple views.
+        // TODO: we could still make sense to special case for when there is a single view and a small number of updates (and
+        // no deletions). Indeed, in that case we could check whether any of the update modify any of the restricted regular
+        // column, and if that's not the case we could use view filter. We keep it simple for now though.
+        RowFilter rowFilter = RowFilter.NONE;
+        return SinglePartitionReadCommand.create(metadata, nowInSec, queriedColumns, rowFilter, DataLimits.NONE, key, clusteringFilter);
+    }
+
+    private boolean affectsAnyViews(DecoratedKey partitionKey, Row update, Collection<View> views)
+    {
+        for (View view : views)
+        {
+            if (view.mayBeAffectedBy(partitionKey, update))
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Given an existing base row and the update that we're going to apply to this row, generate the modifications
+     * to apply to MVs using the provided {@code ViewUpdateGenerator}s.
+     *
+     * @param existingBaseRow the base table row as it is before an update.
+     * @param updateBaseRow the newly updates made to {@code existingBaseRow}.
+     * @param generators the view update generators to add the new changes to.
+     * @param nowInSec the current time in seconds. Used to decide if data is live or not.
+     */
+    private static void addToViewUpdateGenerators(Row existingBaseRow, Row updateBaseRow, Collection<ViewUpdateGenerator> generators, int nowInSec)
+    {
+        // Having existing empty is useful, it just means we'll insert a brand new entry for updateBaseRow,
+        // but if we have no update at all, we shouldn't get there.
+        assert !updateBaseRow.isEmpty();
+
+        // We allow existingBaseRow to be null, which we treat the same as being empty as an small optimization
+        // to avoid allocating empty row objects when we know there was nothing existing.
+        Row mergedBaseRow = existingBaseRow == null ? updateBaseRow : Rows.merge(existingBaseRow, updateBaseRow, nowInSec);
+        for (ViewUpdateGenerator generator : generators)
+            generator.addBaseTableUpdate(existingBaseRow, mergedBaseRow);
+    }
+
+    private static Row emptyRow(Clustering clustering, DeletionTime deletion)
+    {
+        // Returning null for an empty row is slightly ugly, but the case where there is no pre-existing row is fairly common
+        // (especially when building the view), so we want to avoid a dummy allocation of an empty row every time.
+        // And MultiViewUpdateBuilder knows how to deal with that.
+        return deletion.isLive() ? null : BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(deletion));
+    }
+
+    /**
+     * Extracts (and potentially groups) the mutations generated by the provided view update generator.
+     * Returns the mutation that needs to be done to the views given the base table updates
+     * passed to {@link #addBaseTableUpdate}.
+     *
+     * @param baseTableMetadata the metadata for the base table being updated.
+     * @param generators the generators from which to extract the view mutations from.
+     * @return the mutations created by all the generators in {@code generators}.
+     */
+    private Collection<Mutation> buildMutations(CFMetaData baseTableMetadata, List<ViewUpdateGenerator> generators)
+    {
+        // One view is probably common enough and we can optimize a bit easily
+        if (generators.size() == 1)
+        {
+            ViewUpdateGenerator generator = generators.get(0);
+            Collection<PartitionUpdate> updates = generator.generateViewUpdates();
+            List<Mutation> mutations = new ArrayList<>(updates.size());
+            for (PartitionUpdate update : updates)
+                mutations.add(new Mutation(update));
+
+            generator.clear();
+            return mutations;
+        }
+
+        Map<DecoratedKey, Mutation> mutations = new HashMap<>();
+        for (ViewUpdateGenerator generator : generators)
+        {
+            for (PartitionUpdate update : generator.generateViewUpdates())
+            {
+                DecoratedKey key = update.partitionKey();
+                Mutation mutation = mutations.get(key);
+                if (mutation == null)
+                {
+                    mutation = new Mutation(baseTableMetadata.ksName, key);
+                    mutations.put(key, mutation);
+                }
+                mutation.add(update);
+            }
+            generator.clear();
+        }
+        return mutations.values();
+    }
+
+    /**
+     * A simple helper that tracks for a given {@code UnfilteredRowIterator} what is the current deletion at any time of the
+     * iteration. It will be the currently open range tombstone deletion if there is one and the partition deletion otherwise.
+     */
+    private static class DeletionTracker
+    {
+        private final DeletionTime partitionDeletion;
+        private DeletionTime deletion;
+
+        public DeletionTracker(DeletionTime partitionDeletion)
+        {
+            this.partitionDeletion = partitionDeletion;
+        }
+
+        public void update(Unfiltered marker)
+        {
+            assert marker instanceof RangeTombstoneMarker;
+            RangeTombstoneMarker rtm = (RangeTombstoneMarker)marker;
+            this.deletion = rtm.isOpen(false)
+                          ? rtm.openDeletionTime(false)
+                          : null;
+        }
+
+        public DeletionTime currentDeletion()
+        {
+            return deletion == null ? partitionDeletion : deletion;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/view/View.java b/src/java/org/apache/cassandra/db/view/View.java
new file mode 100644
index 0000000..9716dc4
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/view/View.java

@@ -0,0 +1,307 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.view;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+
+import com.google.common.collect.Iterables;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.MultiColumnRelation;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.Relation;
+import org.apache.cassandra.cql3.SingleColumnRelation;
+import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.cql3.statements.ParsedStatement;
+import org.apache.cassandra.cql3.statements.SelectStatement;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.ReadQuery;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A View copies data from a base table into a view table which can be queried independently from the
+ * base. Every update which targets the base table must be fed through the {@link ViewManager} to ensure
+ * that if a view needs to be updated, the updates are properly created and fed into the view.
+ */
+public class View
+{
+    private static final Logger logger = LoggerFactory.getLogger(View.class);
+
+    public final String name;
+    private volatile ViewDefinition definition;
+
+    private final ColumnFamilyStore baseCfs;
+
+    public volatile List<ColumnDefinition> baseNonPKColumnsInViewPK;
+
+    private ViewBuilder builder;
+
+    // Only the raw statement can be final, because the statement cannot always be prepared when the MV is initialized.
+    // For example, during startup, this view will be initialized as part of the Keyspace.open() work; preparing a statement
+    // also requires the keyspace to be open, so this results in double-initialization problems.
+    private final SelectStatement.RawStatement rawSelect;
+    private SelectStatement select;
+    private ReadQuery query;
+
+    public View(ViewDefinition definition,
+                ColumnFamilyStore baseCfs)
+    {
+        this.baseCfs = baseCfs;
+        this.name = definition.viewName;
+        this.rawSelect = definition.select;
+
+        updateDefinition(definition);
+    }
+
+    public ViewDefinition getDefinition()
+    {
+        return definition;
+    }
+
+    /**
+     * This updates the columns stored which are dependent on the base CFMetaData.
+     *
+     * @return true if the view contains only columns which are part of the base's primary key; false if there is at
+     *         least one column which is not.
+     */
+    public void updateDefinition(ViewDefinition definition)
+    {
+        this.definition = definition;
+        List<ColumnDefinition> nonPKDefPartOfViewPK = new ArrayList<>();
+        for (ColumnDefinition baseColumn : baseCfs.metadata.allColumns())
+        {
+            ColumnDefinition viewColumn = getViewColumn(baseColumn);
+            if (viewColumn != null && !baseColumn.isPrimaryKeyColumn() && viewColumn.isPrimaryKeyColumn())
+                nonPKDefPartOfViewPK.add(baseColumn);
+        }
+        this.baseNonPKColumnsInViewPK = nonPKDefPartOfViewPK;
+    }
+
+    /**
+     * The view column corresponding to the provided base column. This <b>can</b>
+     * return {@code null} if the column is denormalized in the view.
+     */
+    public ColumnDefinition getViewColumn(ColumnDefinition baseColumn)
+    {
+        return definition.metadata.getColumnDefinition(baseColumn.name);
+    }
+
+    /**
+     * The base column corresponding to the provided view column. This should
+     * never return {@code null} since a view can't have its "own" columns.
+     */
+    public ColumnDefinition getBaseColumn(ColumnDefinition viewColumn)
+    {
+        ColumnDefinition baseColumn = baseCfs.metadata.getColumnDefinition(viewColumn.name);
+        assert baseColumn != null;
+        return baseColumn;
+    }
+
+    /**
+     * Whether the view might be affected by the provided update.
+     * <p>
+     * Note that having this method return {@code true} is not an absolute guarantee that the view will be
+     * updated, just that it most likely will, but a {@code false} return guarantees it won't be affected).
+     *
+     * @param partitionKey the partition key that is updated.
+     * @param update the update being applied.
+     * @return {@code false} if we can guarantee that inserting {@code update} for key {@code partitionKey}
+     * won't affect the view in any way, {@code true} otherwise.
+     */
+    public boolean mayBeAffectedBy(DecoratedKey partitionKey, Row update)
+    {
+        // We can guarantee that the view won't be affected if:
+        //  - the clustering is excluded by the view filter (note that this isn't true of the filter on regular columns:
+        //    even if an update don't match a view condition on a regular column, that update can still invalidate an pre-existing
+        //    entry).
+        //  - or the update don't modify any of the columns impacting the view (where "impacting" the view means that column is
+        //    neither included in the view, nor used by the view filter).
+        if (!getReadQuery().selectsClustering(partitionKey, update.clustering()))
+            return false;
+        return true;
+    }
+
+    /**
+     * Whether a given base row matches the view filter (and thus if is should have a corresponding entry).
+     * <p>
+     * Note that this differs from {@link #mayBeAffectedBy} in that the provide row <b>must</b> be the current
+     * state of the base row, not just some updates to it. This method also has no false positive: a base
+     * row either do or don't match the view filter.
+     *
+     * @param partitionKey the partition key that is updated.
+     * @param baseRow the current state of a particular base row.
+     * @param nowInSec the current time in seconds (to decide what is live and what isn't).
+     * @return {@code true} if {@code baseRow} matches the view filters, {@code false} otherwise.
+     */
+    public boolean matchesViewFilter(DecoratedKey partitionKey, Row baseRow, int nowInSec)
+    {
+        return getReadQuery().selectsClustering(partitionKey, baseRow.clustering())
+            && getSelectStatement().rowFilterForInternalCalls().isSatisfiedBy(baseCfs.metadata, partitionKey, baseRow, nowInSec);
+    }
+
+    /**
+     * Returns the SelectStatement used to populate and filter this view.  Internal users should access the select
+     * statement this way to ensure it has been prepared.
+     */
+    public SelectStatement getSelectStatement()
+    {
+        if (select == null)
+        {
+            ClientState state = ClientState.forInternalCalls();
+            state.setKeyspace(baseCfs.keyspace.getName());
+            rawSelect.prepareKeyspace(state);
+            ParsedStatement.Prepared prepared = rawSelect.prepare(true, ClientState.forInternalCalls());
+            select = (SelectStatement) prepared.statement;
+        }
+
+        return select;
+    }
+
+    /**
+     * Returns the ReadQuery used to filter this view.  Internal users should access the query this way to ensure it
+     * has been prepared.
+     */
+    public ReadQuery getReadQuery()
+    {
+        if (query == null)
+        {
+            query = getSelectStatement().getQuery(QueryOptions.forInternalCalls(Collections.emptyList()), FBUtilities.nowInSeconds());
+            logger.trace("View query: {}", rawSelect);
+        }
+
+        return query;
+    }
+
+    public synchronized void build()
+    {
+        if (this.builder != null)
+        {
+            logger.debug("Stopping current view builder due to schema change");
+            this.builder.stop();
+            this.builder = null;
+        }
+
+        this.builder = new ViewBuilder(baseCfs, this);
+        CompactionManager.instance.submitViewBuilder(builder);
+    }
+
+    @Nullable
+    public static CFMetaData findBaseTable(String keyspace, String viewName)
+    {
+        ViewDefinition view = Schema.instance.getView(keyspace, viewName);
+        return (view == null) ? null : Schema.instance.getCFMetaData(view.baseTableId);
+    }
+
+    public static Iterable<ViewDefinition> findAll(String keyspace, String baseTable)
+    {
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspace);
+        final UUID baseId = Schema.instance.getId(keyspace, baseTable);
+        return Iterables.filter(ksm.views, view -> view.baseTableId.equals(baseId));
+    }
+
+    /**
+     * Builds the string text for a materialized view's SELECT statement.
+     */
+    public static String buildSelectStatement(String cfName, Collection<ColumnDefinition> includedColumns, String whereClause)
+    {
+         StringBuilder rawSelect = new StringBuilder("SELECT ");
+        if (includedColumns == null || includedColumns.isEmpty())
+            rawSelect.append("*");
+        else
+            rawSelect.append(includedColumns.stream().map(id -> id.name.toCQLString()).collect(Collectors.joining(", ")));
+        rawSelect.append(" FROM \"").append(cfName).append("\" WHERE ") .append(whereClause).append(" ALLOW FILTERING");
+        return rawSelect.toString();
+    }
+
+    public static String relationsToWhereClause(List<Relation> whereClause)
+    {
+        List<String> expressions = new ArrayList<>(whereClause.size());
+        for (Relation rel : whereClause)
+        {
+            StringBuilder sb = new StringBuilder();
+
+            if (rel.isMultiColumn())
+            {
+                sb.append(((MultiColumnRelation) rel).getEntities().stream()
+                        .map(ColumnIdentifier.Raw::toCQLString)
+                        .collect(Collectors.joining(", ", "(", ")")));
+            }
+            else
+            {
+                sb.append(((SingleColumnRelation) rel).getEntity().toCQLString());
+            }
+
+            sb.append(" ").append(rel.operator()).append(" ");
+
+            if (rel.isIN())
+            {
+                sb.append(rel.getInValues().stream()
+                        .map(Term.Raw::getText)
+                        .collect(Collectors.joining(", ", "(", ")")));
+            }
+            else
+            {
+                sb.append(rel.getValue().getText());
+            }
+
+            expressions.add(sb.toString());
+        }
+
+        return expressions.stream().collect(Collectors.joining(" AND "));
+    }
+
+    public boolean hasSamePrimaryKeyColumnsAsBaseTable()
+    {
+        return baseNonPKColumnsInViewPK.isEmpty();
+    }
+
+    /**
+     * When views contains a primary key column that is not part
+     * of the base table primary key, we use that column liveness
+     * info as the view PK, to ensure that whenever that column
+     * is not live in the base, the row is not live in the view.
+     *
+     * This is done to prevent cells other than the view PK from
+     * making the view row alive when the view PK column is not
+     * live in the base. So in this case we tie the row liveness,
+     * to the primary key liveness.
+     *
+     * See CASSANDRA-11500 for context.
+     */
+    public boolean enforceStrictLiveness()
+    {
+        return !baseNonPKColumnsInViewPK.isEmpty();
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/view/ViewBuilder.java b/src/java/org/apache/cassandra/db/view/ViewBuilder.java
new file mode 100644
index 0000000..57bba29
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/view/ViewBuilder.java

@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.view;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+import javax.annotation.Nullable;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.ReducingKeyIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.utils.concurrent.Refs;
+
+public class ViewBuilder extends CompactionInfo.Holder
+{
+    private final ColumnFamilyStore baseCfs;
+    private final View view;
+    private final UUID compactionId;
+    private volatile Token prevToken = null;
+
+    private static final Logger logger = LoggerFactory.getLogger(ViewBuilder.class);
+
+    public ViewBuilder(ColumnFamilyStore baseCfs, View view)
+    {
+        this.baseCfs = baseCfs;
+        this.view = view;
+        compactionId = UUIDGen.getTimeUUID();
+    }
+
+    private void buildKey(DecoratedKey key)
+    {
+        AtomicLong noBase = new AtomicLong(Long.MAX_VALUE);
+        ReadQuery selectQuery = view.getReadQuery();
+
+        if (!selectQuery.selectsKey(key))
+        {
+            logger.trace("Skipping {}, view query filters", key);
+            return;
+        }
+
+        int nowInSec = FBUtilities.nowInSeconds();
+        SinglePartitionReadCommand command = view.getSelectStatement().internalReadForView(key, nowInSec);
+
+        // We're rebuilding everything from what's on disk, so we read everything, consider that as new updates
+        // and pretend that there is nothing pre-existing.
+        UnfilteredRowIterator empty = UnfilteredRowIterators.noRowsIterator(baseCfs.metadata, key, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE, false);
+
+        try (ReadOrderGroup orderGroup = command.startOrderGroup();
+             UnfilteredRowIterator data = UnfilteredPartitionIterators.getOnlyElement(command.executeLocally(orderGroup), command))
+        {
+            Iterator<Collection<Mutation>> mutations = baseCfs.keyspace.viewManager
+                                                      .forTable(baseCfs.metadata)
+                                                      .generateViewUpdates(Collections.singleton(view), data, empty, nowInSec, true);
+
+            mutations.forEachRemaining(m -> StorageProxy.mutateMV(key.getKey(), m, true, noBase));
+        }
+    }
+
+    public void run()
+    {
+        logger.debug("Starting view builder for {}.{}", baseCfs.metadata.ksName, view.name);
+        String ksname = baseCfs.metadata.ksName, viewName = view.name;
+
+        if (SystemKeyspace.isViewBuilt(ksname, viewName))
+        {
+            logger.debug("View already marked built for {}.{}", baseCfs.metadata.ksName, view.name);
+            return;
+        }
+        Iterable<Range<Token>> ranges = StorageService.instance.getLocalRanges(baseCfs.metadata.ksName);
+
+        final Pair<Integer, Token> buildStatus = SystemKeyspace.getViewBuildStatus(ksname, viewName);
+        Token lastToken;
+        Function<org.apache.cassandra.db.lifecycle.View, Iterable<SSTableReader>> function;
+        if (buildStatus == null)
+        {
+            logger.debug("Starting new view build. flushing base table {}.{}", baseCfs.metadata.ksName, baseCfs.name);
+            lastToken = null;
+
+            //We don't track the generation number anymore since if a rebuild is stopped and
+            //restarted the max generation filter may yield no sstables due to compactions.
+            //We only care about max generation *during* a build, not across builds.
+            //see CASSANDRA-13405
+            SystemKeyspace.beginViewBuild(ksname, viewName, 0);
+        }
+        else
+        {
+            lastToken = buildStatus.right;
+            logger.debug("Resuming view build from token {}. flushing base table {}.{}", lastToken, baseCfs.metadata.ksName, baseCfs.name);
+        }
+
+        baseCfs.forceBlockingFlush();
+        function = org.apache.cassandra.db.lifecycle.View.selectFunction(SSTableSet.CANONICAL);
+
+        prevToken = lastToken;
+        long keysBuilt = 0;
+        try (Refs<SSTableReader> sstables = baseCfs.selectAndReference(function).refs;
+             ReducingKeyIterator iter = new ReducingKeyIterator(sstables))
+        {
+            while (!isStopRequested() && iter.hasNext())
+            {
+                DecoratedKey key = iter.next();
+                Token token = key.getToken();
+                if (lastToken == null || lastToken.compareTo(token) < 0)
+                {
+                    for (Range<Token> range : ranges)
+                    {
+                        if (range.contains(token))
+                        {
+                            buildKey(key);
+                            ++keysBuilt;
+
+                            if (prevToken == null || prevToken.compareTo(token) != 0)
+                            {
+                                SystemKeyspace.updateViewBuildStatus(ksname, viewName, key.getToken());
+                                prevToken = token;
+                            }
+                        }
+                    }
+
+                    lastToken = null;
+                }
+            }
+
+            if (!isStopRequested())
+            {
+                logger.debug("Marking view({}.{}) as built covered {} keys ", ksname, viewName, keysBuilt);
+                SystemKeyspace.finishViewBuildStatus(ksname, viewName);
+            }
+            else
+            {
+                logger.debug("Stopped build for view({}.{}) after covering {} keys", ksname, viewName, keysBuilt);
+            }
+        }
+        catch (Exception e)
+        {
+            final ViewBuilder builder = new ViewBuilder(baseCfs, view);
+            ScheduledExecutors.nonPeriodicTasks.schedule(() -> CompactionManager.instance.submitViewBuilder(builder),
+                                                         5,
+                                                         TimeUnit.MINUTES);
+            logger.warn("Materialized View failed to complete, sleeping 5 minutes before restarting", e);
+        }
+    }
+
+    public CompactionInfo getCompactionInfo()
+    {
+        long rangesCompleted = 0, rangesTotal = 0;
+        Token lastToken = prevToken;
+
+        // This approximation is not very accurate, but since we do not have a method which allows us to calculate the
+        // percentage of a range covered by a second range, this is the best approximation that we can calculate.
+        // Instead, we just count the total number of ranges that haven't been seen by the node (we use the order of
+        // the tokens to determine whether they have been seen yet or not), and the total number of ranges that a node
+        // has.
+        for (Range<Token> range : StorageService.instance.getLocalRanges(baseCfs.keyspace.getName()))
+        {
+            rangesTotal++;
+             if ((lastToken != null) && lastToken.compareTo(range.right) > 0)
+                 rangesCompleted++;
+          }
+         return new CompactionInfo(baseCfs.metadata, OperationType.VIEW_BUILD, rangesCompleted, rangesTotal, Unit.RANGES, compactionId);
+    }
+
+    public boolean isGlobal()
+    {
+        return false;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/view/ViewManager.java b/src/java/org/apache/cassandra/db/view/ViewManager.java
new file mode 100644
index 0000000..d1cfd9e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/view/ViewManager.java

@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.view;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.locks.Lock;
+
+import com.google.common.util.concurrent.Striped;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+
+
+/**
+ * Manages {@link View}'s for a single {@link ColumnFamilyStore}. All of the views for that table are created when this
+ * manager is initialized.
+ *
+ * The main purposes of the manager are to provide a single location for updates to be vetted to see whether they update
+ * any views {@link ViewManager#updatesAffectView(Collection, boolean)}, provide locks to prevent multiple
+ * updates from creating incoherent updates in the view {@link ViewManager#acquireLockFor(ByteBuffer)}, and
+ * to affect change on the view.
+ *
+ * TODO: I think we can get rid of that class. For addition/removal of view by names, we could move it Keyspace. And we
+ * not sure it's even worth keeping viewsByName as none of the related operation are performance sensitive so we could
+ * find the view by iterating over the CFStore.viewManager directly.
+ * For the lock, it could move to Keyspace too, but I don't remmenber why it has to be at the keyspace level and if it
+ * can be at the table level, maybe that's where it should be.
+ */
+public class ViewManager
+{
+    private static final Logger logger = LoggerFactory.getLogger(ViewManager.class);
+
+    private static final Striped<Lock> LOCKS = Striped.lazyWeakLock(DatabaseDescriptor.getConcurrentViewWriters() * 1024);
+
+    private static final boolean enableCoordinatorBatchlog = Boolean.getBoolean("cassandra.mv_enable_coordinator_batchlog");
+
+    private final ConcurrentMap<String, View> viewsByName = new ConcurrentHashMap<>();
+    private final ConcurrentMap<UUID, TableViews> viewsByBaseTable = new ConcurrentHashMap<>();
+    private final Keyspace keyspace;
+
+    public ViewManager(Keyspace keyspace)
+    {
+        this.keyspace = keyspace;
+    }
+
+    public boolean updatesAffectView(Collection<? extends IMutation> mutations, boolean coordinatorBatchlog)
+    {
+        if (coordinatorBatchlog && !enableCoordinatorBatchlog)
+            return false;
+
+        for (IMutation mutation : mutations)
+        {
+            for (PartitionUpdate update : mutation.getPartitionUpdates())
+            {
+                assert keyspace.getName().equals(update.metadata().ksName);
+
+                if (coordinatorBatchlog && keyspace.getReplicationStrategy().getReplicationFactor() == 1)
+                    continue;
+
+                if (!forTable(update.metadata()).updatedViews(update).isEmpty())
+                    return true;
+            }
+        }
+
+        return false;
+    }
+
+    private Iterable<View> allViews()
+    {
+        return viewsByName.values();
+    }
+
+    public void update(String viewName)
+    {
+        View view = viewsByName.get(viewName);
+        assert view != null : "When updating a view, it should already be in the ViewManager";
+        view.build();
+
+        // We provide the new definition from the base metadata
+        Optional<ViewDefinition> viewDefinition = keyspace.getMetadata().views.get(viewName);
+        assert viewDefinition.isPresent() : "When updating a view, it should still be in the Keyspaces views";
+        view.updateDefinition(viewDefinition.get());
+    }
+
+    public void reload()
+    {
+        Map<String, ViewDefinition> newViewsByName = new HashMap<>();
+        for (ViewDefinition definition : keyspace.getMetadata().views)
+        {
+            newViewsByName.put(definition.viewName, definition);
+        }
+
+        for (String viewName : viewsByName.keySet())
+        {
+            if (!newViewsByName.containsKey(viewName))
+                removeView(viewName);
+        }
+
+        for (Map.Entry<String, ViewDefinition> entry : newViewsByName.entrySet())
+        {
+            if (!viewsByName.containsKey(entry.getKey()))
+                addView(entry.getValue());
+        }
+
+        for (View view : allViews())
+        {
+            view.build();
+            // We provide the new definition from the base metadata
+            view.updateDefinition(newViewsByName.get(view.name));
+        }
+    }
+
+    public void addView(ViewDefinition definition)
+    {
+        // Skip if the base table doesn't exist due to schema propagation issues, see CASSANDRA-13737
+        if (!keyspace.hasColumnFamilyStore(definition.baseTableId))
+        {
+            logger.warn("Not adding view {} because the base table {} is unknown",
+                        definition.viewName,
+                        definition.baseTableId);
+            return;
+        }
+
+        View view = new View(definition, keyspace.getColumnFamilyStore(definition.baseTableId));
+        forTable(view.getDefinition().baseTableMetadata()).add(view);
+        viewsByName.put(definition.viewName, view);
+    }
+
+    public void removeView(String name)
+    {
+        View view = viewsByName.remove(name);
+
+        if (view == null)
+            return;
+
+        forTable(view.getDefinition().baseTableMetadata()).removeByName(name);
+        SystemKeyspace.setViewRemoved(keyspace.getName(), view.name);
+    }
+
+    public View getByName(String name)
+    {
+        return viewsByName.get(name);
+    }
+
+    public void buildAllViews()
+    {
+        for (View view : allViews())
+            view.build();
+    }
+
+    public TableViews forTable(CFMetaData metadata)
+    {
+        UUID baseId = metadata.cfId;
+        TableViews views = viewsByBaseTable.get(baseId);
+        if (views == null)
+        {
+            views = new TableViews(metadata);
+            TableViews previous = viewsByBaseTable.putIfAbsent(baseId, views);
+            if (previous != null)
+                views = previous;
+        }
+        return views;
+    }
+
+    public static Lock acquireLockFor(ByteBuffer key)
+    {
+        Lock lock = LOCKS.get(key);
+
+        if (lock.tryLock())
+            return lock;
+
+        return null;
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/view/ViewUpdateGenerator.java b/src/java/org/apache/cassandra/db/view/ViewUpdateGenerator.java
new file mode 100644
index 0000000..74d3e52
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/view/ViewUpdateGenerator.java

@@ -0,0 +1,601 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.view;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.PeekingIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
+
+/**
+ * Creates the updates to apply to a view given the existing rows in the base
+ * table and the updates that we're applying to them (this handles updates
+ * on a single partition only).
+ *
+ * This class is used by passing the updates made to the base table to
+ * {@link #addBaseTableUpdate} and calling {@link #generateViewUpdates} once all updates have
+ * been handled to get the resulting view mutations.
+ */
+public class ViewUpdateGenerator
+{
+    private final View view;
+    private final int nowInSec;
+
+    private final CFMetaData baseMetadata;
+    private final DecoratedKey baseDecoratedKey;
+    private final ByteBuffer[] basePartitionKey;
+
+    private final CFMetaData viewMetadata;
+    private final boolean baseEnforceStrictLiveness;
+
+    private final Map<DecoratedKey, PartitionUpdate> updates = new HashMap<>();
+
+    // Reused internally to build a new entry
+    private final ByteBuffer[] currentViewEntryPartitionKey;
+    private final Row.Builder currentViewEntryBuilder;
+
+    /**
+     * The type of type update action to perform to the view for a given base table
+     * update.
+     */
+    private enum UpdateAction
+    {
+        NONE,            // There was no view entry and none should be added
+        NEW_ENTRY,       // There was no entry but there is one post-update
+        DELETE_OLD,      // There was an entry but there is nothing after update
+        UPDATE_EXISTING, // There was an entry and the update modifies it
+        SWITCH_ENTRY     // There was an entry and there is still one after update,
+                         // but they are not the same one.
+    }
+
+    /**
+     * Creates a new {@code ViewUpdateBuilder}.
+     *
+     * @param view the view for which this will be building updates for.
+     * @param basePartitionKey the partition key for the base table partition for which
+     * we'll handle updates for.
+     * @param nowInSec the current time in seconds. Used to decide if data are live or not
+     * and as base reference for new deletions.
+     */
+    public ViewUpdateGenerator(View view, DecoratedKey basePartitionKey, int nowInSec)
+    {
+        this.view = view;
+        this.nowInSec = nowInSec;
+
+        this.baseMetadata = view.getDefinition().baseTableMetadata();
+        this.baseEnforceStrictLiveness = baseMetadata.enforceStrictLiveness();
+        this.baseDecoratedKey = basePartitionKey;
+        this.basePartitionKey = extractKeyComponents(basePartitionKey, baseMetadata.getKeyValidator());
+
+        this.viewMetadata = view.getDefinition().metadata;
+
+        this.currentViewEntryPartitionKey = new ByteBuffer[viewMetadata.partitionKeyColumns().size()];
+        this.currentViewEntryBuilder = BTreeRow.sortedBuilder();
+    }
+
+    private static ByteBuffer[] extractKeyComponents(DecoratedKey partitionKey, AbstractType<?> type)
+    {
+        return type instanceof CompositeType
+             ? ((CompositeType)type).split(partitionKey.getKey())
+             : new ByteBuffer[]{ partitionKey.getKey() };
+    }
+
+    /**
+     * Adds to this generator the updates to be made to the view given a base table row
+     * before and after an update.
+     *
+     * @param existingBaseRow the base table row as it is before an update.
+     * @param mergedBaseRow the base table row after the update is applied (note that
+     * this is not just the new update, but rather the resulting row).
+     */
+    public void addBaseTableUpdate(Row existingBaseRow, Row mergedBaseRow)
+    {
+        switch (updateAction(existingBaseRow, mergedBaseRow))
+        {
+            case NONE:
+                return;
+            case NEW_ENTRY:
+                createEntry(mergedBaseRow);
+                return;
+            case DELETE_OLD:
+                deleteOldEntry(existingBaseRow, mergedBaseRow);
+                return;
+            case UPDATE_EXISTING:
+                updateEntry(existingBaseRow, mergedBaseRow);
+                return;
+            case SWITCH_ENTRY:
+                createEntry(mergedBaseRow);
+                deleteOldEntry(existingBaseRow, mergedBaseRow);
+                return;
+        }
+    }
+
+    /**
+     * Returns the updates that needs to be done to the view given the base table updates
+     * passed to {@link #generateViewMutations}.
+     *
+     * @return the updates to do to the view.
+     */
+    public Collection<PartitionUpdate> generateViewUpdates()
+    {
+        return updates.values();
+    }
+
+    /**
+     * Clears the current state so that the generator may be reused.
+     */
+    public void clear()
+    {
+        updates.clear();
+    }
+
+    /**
+     * Compute which type of action needs to be performed to the view for a base table row
+     * before and after an update.
+     */
+    private UpdateAction updateAction(Row existingBaseRow, Row mergedBaseRow)
+    {
+        // Having existing empty is useful, it just means we'll insert a brand new entry for mergedBaseRow,
+        // but if we have no update at all, we shouldn't get there.
+        assert !mergedBaseRow.isEmpty();
+
+        // Note that none of the base PK columns will differ since we're intrinsically dealing
+        // with the same base row. So we have to check 3 things:
+        //   1) that the clustering doesn't have a null, which can happen for compact tables. If that's the case,
+        //      there is no corresponding entries.
+        //   2) if there is a column not part of the base PK in the view PK, whether it is changed by the update.
+        //   3) whether mergedBaseRow actually match the view SELECT filter
+
+        if (baseMetadata.isCompactTable())
+        {
+            Clustering clustering = mergedBaseRow.clustering();
+            for (int i = 0; i < clustering.size(); i++)
+            {
+                if (clustering.get(i) == null)
+                    return UpdateAction.NONE;
+            }
+        }
+
+        assert view.baseNonPKColumnsInViewPK.size() <= 1 : "We currently only support one base non-PK column in the view PK";
+
+        if (view.baseNonPKColumnsInViewPK.isEmpty())
+        {
+            // The view entry is necessarily the same pre and post update.
+
+            // Note that we allow existingBaseRow to be null and treat it as empty (see MultiViewUpdateBuilder.generateViewsMutations).
+            boolean existingHasLiveData = existingBaseRow != null && existingBaseRow.hasLiveData(nowInSec, baseEnforceStrictLiveness);
+            boolean mergedHasLiveData = mergedBaseRow.hasLiveData(nowInSec, baseEnforceStrictLiveness);
+            return existingHasLiveData
+                 ? (mergedHasLiveData ? UpdateAction.UPDATE_EXISTING : UpdateAction.DELETE_OLD)
+                 : (mergedHasLiveData ? UpdateAction.NEW_ENTRY : UpdateAction.NONE);
+        }
+
+        ColumnDefinition baseColumn = view.baseNonPKColumnsInViewPK.get(0);
+        assert !baseColumn.isComplex() : "A complex column couldn't be part of the view PK";
+        Cell before = existingBaseRow == null ? null : existingBaseRow.getCell(baseColumn);
+        Cell after = mergedBaseRow.getCell(baseColumn);
+
+        // If the update didn't modified this column, the cells will be the same object so it's worth checking
+        if (before == after)
+            return isLive(before) ? UpdateAction.UPDATE_EXISTING : UpdateAction.NONE;
+
+        if (!isLive(before))
+            return isLive(after) ? UpdateAction.NEW_ENTRY : UpdateAction.NONE;
+        if (!isLive(after))
+        {
+            return UpdateAction.DELETE_OLD;
+        }
+
+        return baseColumn.cellValueType().compare(before.value(), after.value()) == 0
+             ? UpdateAction.UPDATE_EXISTING
+             : UpdateAction.SWITCH_ENTRY;
+    }
+
+    private boolean matchesViewFilter(Row baseRow)
+    {
+        return view.matchesViewFilter(baseDecoratedKey, baseRow, nowInSec);
+    }
+
+    private boolean isLive(Cell cell)
+    {
+        return cell != null && cell.isLive(nowInSec);
+    }
+
+    /**
+     * Creates a view entry corresponding to the provided base row.
+     * <p>
+     * This method checks that the base row does match the view filter before applying it.
+     */
+    private void createEntry(Row baseRow)
+    {
+        // Before create a new entry, make sure it matches the view filter
+        if (!matchesViewFilter(baseRow))
+            return;
+
+        startNewUpdate(baseRow);
+        currentViewEntryBuilder.addPrimaryKeyLivenessInfo(computeLivenessInfoForEntry(baseRow));
+        currentViewEntryBuilder.addRowDeletion(baseRow.deletion());
+
+        for (ColumnData data : baseRow)
+        {
+            ColumnDefinition viewColumn = view.getViewColumn(data.column());
+            // If that base table column is not denormalized in the view, we had nothing to do.
+            // Alose, if it's part of the view PK it's already been taken into account in the clustering.
+            if (viewColumn == null || viewColumn.isPrimaryKeyColumn())
+                continue;
+
+            addColumnData(viewColumn, data);
+        }
+
+        submitUpdate();
+    }
+
+    /**
+     * Creates the updates to apply to the existing view entry given the base table row before
+     * and after the update, assuming that the update hasn't changed to which view entry the
+     * row correspond (that is, we know the columns composing the view PK haven't changed).
+     * <p>
+     * This method checks that the base row (before and after) does match the view filter before
+     * applying anything.
+     */
+    private void updateEntry(Row existingBaseRow, Row mergedBaseRow)
+    {
+        // While we know existingBaseRow and mergedBaseRow are corresponding to the same view entry,
+        // they may not match the view filter.
+        if (!matchesViewFilter(existingBaseRow))
+        {
+            createEntry(mergedBaseRow);
+            return;
+        }
+        if (!matchesViewFilter(mergedBaseRow))
+        {
+            deleteOldEntryInternal(existingBaseRow, mergedBaseRow);
+            return;
+        }
+
+        startNewUpdate(mergedBaseRow);
+
+        // In theory, it may be the PK liveness and row deletion hasn't been change by the update
+        // and we could condition the 2 additions below. In practice though, it's as fast (if not
+        // faster) to compute those info than to check if they have changed so we keep it simple.
+        currentViewEntryBuilder.addPrimaryKeyLivenessInfo(computeLivenessInfoForEntry(mergedBaseRow));
+        currentViewEntryBuilder.addRowDeletion(mergedBaseRow.deletion());
+
+        addDifferentCells(existingBaseRow, mergedBaseRow);
+        submitUpdate();
+    }
+
+    private void addDifferentCells(Row existingBaseRow, Row mergedBaseRow)
+    {
+        // We only add to the view update the cells from mergedBaseRow that differs from
+        // existingBaseRow. For that and for speed we can just cell pointer equality: if the update
+        // hasn't touched a cell, we know it will be the same object in existingBaseRow and
+        // mergedBaseRow (note that including more cells than we strictly should isn't a problem
+        // for correction, so even if the code change and pointer equality don't work anymore, it'll
+        // only a slightly inefficiency which we can fix then).
+        // Note: we could alternatively use Rows.diff() for this, but because it is a bit more generic
+        // than what we need here, it's also a bit less efficient (it allocates more in particular),
+        // and this might be called a lot of time for view updates. So, given that this is not a whole
+        // lot of code anyway, it's probably doing the diff manually.
+        PeekingIterator<ColumnData> existingIter = Iterators.peekingIterator(existingBaseRow.iterator());
+        for (ColumnData mergedData : mergedBaseRow)
+        {
+            ColumnDefinition baseColumn = mergedData.column();
+            ColumnDefinition viewColumn = view.getViewColumn(baseColumn);
+            // If that base table column is not denormalized in the view, we had nothing to do.
+            // Alose, if it's part of the view PK it's already been taken into account in the clustering.
+            if (viewColumn == null || viewColumn.isPrimaryKeyColumn())
+                continue;
+
+            ColumnData existingData = null;
+            // Find if there is data for that column in the existing row
+            while (existingIter.hasNext())
+            {
+                int cmp = baseColumn.compareTo(existingIter.peek().column());
+                if (cmp < 0)
+                    break;
+
+                ColumnData next = existingIter.next();
+                if (cmp == 0)
+                {
+                    existingData = next;
+                    break;
+                }
+            }
+
+            if (existingData == null)
+            {
+                addColumnData(viewColumn, mergedData);
+                continue;
+            }
+
+            if (mergedData == existingData)
+                continue;
+
+            if (baseColumn.isComplex())
+            {
+                ComplexColumnData mergedComplexData = (ComplexColumnData)mergedData;
+                ComplexColumnData existingComplexData = (ComplexColumnData)existingData;
+                if (mergedComplexData.complexDeletion().supersedes(existingComplexData.complexDeletion()))
+                    currentViewEntryBuilder.addComplexDeletion(viewColumn, mergedComplexData.complexDeletion());
+
+                PeekingIterator<Cell> existingCells = Iterators.peekingIterator(existingComplexData.iterator());
+                for (Cell mergedCell : mergedComplexData)
+                {
+                    Cell existingCell = null;
+                    // Find if there is corresponding cell in the existing row
+                    while (existingCells.hasNext())
+                    {
+                        int cmp = baseColumn.cellPathComparator().compare(mergedCell.path(), existingCells.peek().path());
+                        if (cmp > 0)
+                            break;
+
+                        Cell next = existingCells.next();
+                        if (cmp == 0)
+                        {
+                            existingCell = next;
+                            break;
+                        }
+                    }
+
+                    if (mergedCell != existingCell)
+                        addCell(viewColumn, mergedCell);
+                }
+            }
+            else
+            {
+                // Note that we've already eliminated the case where merged == existing
+                addCell(viewColumn, (Cell)mergedData);
+            }
+        }
+    }
+
+    /**
+     * Deletes the view entry corresponding to the provided base row.
+     * <p>
+     * This method checks that the base row does match the view filter before bothering.
+     */
+    private void deleteOldEntry(Row existingBaseRow, Row mergedBaseRow)
+    {
+        // Before deleting an old entry, make sure it was matching the view filter (otherwise there is nothing to delete)
+        if (!matchesViewFilter(existingBaseRow))
+            return;
+
+        deleteOldEntryInternal(existingBaseRow, mergedBaseRow);
+    }
+
+    private void deleteOldEntryInternal(Row existingBaseRow, Row mergedBaseRow)
+    {
+        startNewUpdate(existingBaseRow);
+        long timestamp = computeTimestampForEntryDeletion(existingBaseRow, mergedBaseRow);
+        long rowDeletion = mergedBaseRow.deletion().time().markedForDeleteAt();
+        assert timestamp >= rowDeletion;
+
+        // If computed deletion timestamp greater than row deletion, it must be coming from
+        //  1. non-pk base column used in view pk, or
+        //  2. unselected base column
+        //  any case, we need to use it as expired livenessInfo
+        // If computed deletion timestamp is from row deletion, we only need row deletion itself
+        if (timestamp > rowDeletion)
+        {
+            /**
+              * We use an expired liveness instead of a row tombstone to allow a shadowed MV
+              * entry to co-exist with a row tombstone, see ViewComplexTest#testCommutativeRowDeletion.
+              *
+              * TODO This is a dirty overload of LivenessInfo and we should modify
+              * the storage engine to properly support this on CASSANDRA-13826.
+              */
+            LivenessInfo info = LivenessInfo.create(timestamp, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSec);
+            currentViewEntryBuilder.addPrimaryKeyLivenessInfo(info);
+        }
+        currentViewEntryBuilder.addRowDeletion(mergedBaseRow.deletion());
+
+        addDifferentCells(existingBaseRow, mergedBaseRow);
+        submitUpdate();
+    }
+
+    /**
+     * Computes the partition key and clustering for a new view entry, and setup the internal
+     * row builder for the new row.
+     *
+     * This assumes that there is corresponding entry, i.e. no values for the partition key and
+     * clustering are null (since we have eliminated that case through updateAction).
+     */
+    private void startNewUpdate(Row baseRow)
+    {
+        ByteBuffer[] clusteringValues = new ByteBuffer[viewMetadata.clusteringColumns().size()];
+        for (ColumnDefinition viewColumn : viewMetadata.primaryKeyColumns())
+        {
+            ColumnDefinition baseColumn = view.getBaseColumn(viewColumn);
+            ByteBuffer value = getValueForPK(baseColumn, baseRow);
+            if (viewColumn.isPartitionKey())
+                currentViewEntryPartitionKey[viewColumn.position()] = value;
+            else
+                clusteringValues[viewColumn.position()] = value;
+        }
+
+        currentViewEntryBuilder.newRow(new Clustering(clusteringValues));
+    }
+
+    private LivenessInfo computeLivenessInfoForEntry(Row baseRow)
+    {
+        /**
+         * There 3 cases:
+         *  1. No extra primary key in view and all base columns are selected in MV. all base row's components(livenessInfo,
+         *     deletion, cells) are same as view row. Simply map base components to view row.
+         *  2. There is a base non-key column used in view pk. This base non-key column determines the liveness of view row. view's row level
+         *     info should based on this column.
+         *  3. Most tricky case is no extra primary key in view and some base columns are not selected in MV. We cannot use 1 livenessInfo or
+         *     row deletion to represent the liveness of unselected column properly, see CASSANDRA-11500.
+         *     We could make some simplification: the unselected columns will be used only when it affects view row liveness. eg. if view row
+         *     already exists and not expiring, there is no need to use unselected columns.
+         *     Note: if the view row is removed due to unselected column removal(ttl or cell tombstone), we will have problem keeping view
+         *     row alive with a smaller or equal timestamp than the max unselected column timestamp.
+         *
+         */
+        assert view.baseNonPKColumnsInViewPK.size() <= 1; // This may change, but is currently an enforced limitation
+
+        LivenessInfo baseLiveness = baseRow.primaryKeyLivenessInfo();
+
+        if (view.hasSamePrimaryKeyColumnsAsBaseTable())
+        {
+            if (view.getDefinition().includeAllColumns)
+                return baseLiveness;
+
+            long timestamp = baseLiveness.timestamp();
+            boolean hasNonExpiringLiveCell = false;
+            Cell biggestExpirationCell = null;
+            for (Cell cell : baseRow.cells())
+            {
+                if (view.getViewColumn(cell.column()) != null)
+                    continue;
+                if (!isLive(cell))
+                    continue;
+                timestamp = Math.max(timestamp, cell.maxTimestamp());
+                if (!cell.isExpiring())
+                    hasNonExpiringLiveCell = true;
+                else
+                {
+                    if (biggestExpirationCell == null)
+                        biggestExpirationCell = cell;
+                    else if (cell.localDeletionTime() > biggestExpirationCell.localDeletionTime())
+                        biggestExpirationCell = cell;
+                }
+            }
+            if (baseLiveness.isLive(nowInSec) && !baseLiveness.isExpiring())
+                return LivenessInfo.create(viewMetadata, timestamp, nowInSec);
+            if (hasNonExpiringLiveCell)
+                return LivenessInfo.create(viewMetadata, timestamp, nowInSec);
+            if (biggestExpirationCell == null)
+                return baseLiveness;
+            if (biggestExpirationCell.localDeletionTime() > baseLiveness.localExpirationTime()
+                    || !baseLiveness.isLive(nowInSec))
+                return LivenessInfo.create(timestamp,
+                                                       biggestExpirationCell.ttl(),
+                                                       biggestExpirationCell.localDeletionTime());
+            return baseLiveness;
+        }
+
+        Cell cell = baseRow.getCell(view.baseNonPKColumnsInViewPK.get(0));
+        assert isLive(cell) : "We shouldn't have got there if the base row had no associated entry";
+
+        return LivenessInfo.create(cell.timestamp(), cell.ttl(), cell.localDeletionTime());
+    }
+
+    private long computeTimestampForEntryDeletion(Row existingBaseRow, Row mergedBaseRow)
+    {
+        DeletionTime deletion = mergedBaseRow.deletion().time();
+        if (view.hasSamePrimaryKeyColumnsAsBaseTable())
+        {
+            long timestamp = Math.max(deletion.markedForDeleteAt(), existingBaseRow.primaryKeyLivenessInfo().timestamp());
+            if (view.getDefinition().includeAllColumns)
+                return timestamp;
+
+            for (Cell cell : existingBaseRow.cells())
+            {
+                // selected column should not contribute to view deletion, itself is already included in view row
+                if (view.getViewColumn(cell.column()) != null)
+                    continue;
+                // unselected column is used regardless live or dead, because we don't know if it was used for liveness.
+                timestamp = Math.max(timestamp, cell.maxTimestamp());
+            }
+            return timestamp;
+        }
+        // has base non-pk column in view pk
+        Cell before = existingBaseRow.getCell(view.baseNonPKColumnsInViewPK.get(0));
+        assert isLive(before) : "We shouldn't have got there if the base row had no associated entry";
+        return deletion.deletes(before) ? deletion.markedForDeleteAt() : before.timestamp();
+    }
+
+    private void addColumnData(ColumnDefinition viewColumn, ColumnData baseTableData)
+    {
+        assert viewColumn.isComplex() == baseTableData.column().isComplex();
+        if (!viewColumn.isComplex())
+        {
+            addCell(viewColumn, (Cell)baseTableData);
+            return;
+        }
+
+        ComplexColumnData complexData = (ComplexColumnData)baseTableData;
+        currentViewEntryBuilder.addComplexDeletion(viewColumn, complexData.complexDeletion());
+        for (Cell cell : complexData)
+            addCell(viewColumn, cell);
+    }
+
+    private void addCell(ColumnDefinition viewColumn, Cell baseTableCell)
+    {
+        assert !viewColumn.isPrimaryKeyColumn();
+        currentViewEntryBuilder.addCell(baseTableCell.withUpdatedColumn(viewColumn));
+    }
+
+    /**
+     * Finish building the currently updated view entry and add it to the other built
+     * updates.
+     */
+    private void submitUpdate()
+    {
+        Row row = currentViewEntryBuilder.build();
+        // I'm not sure we can reach there is there is nothing is updated, but adding an empty row breaks things
+        // and it costs us nothing to be prudent here.
+        if (row.isEmpty())
+            return;
+
+        DecoratedKey partitionKey = makeCurrentPartitionKey();
+        PartitionUpdate update = updates.get(partitionKey);
+        if (update == null)
+        {
+            // We can't really know which columns of the view will be updated nor how many row will be updated for this key
+            // so we rely on hopefully sane defaults.
+            update = new PartitionUpdate(viewMetadata, partitionKey, viewMetadata.partitionColumns(), 4);
+            updates.put(partitionKey, update);
+        }
+        update.add(row);
+    }
+
+    private DecoratedKey makeCurrentPartitionKey()
+    {
+        ByteBuffer rawKey = viewMetadata.partitionKeyColumns().size() == 1
+                          ? currentViewEntryPartitionKey[0]
+                          : CompositeType.build(currentViewEntryPartitionKey);
+
+        return viewMetadata.decorateKey(rawKey);
+    }
+
+    private ByteBuffer getValueForPK(ColumnDefinition column, Row row)
+    {
+        switch (column.kind)
+        {
+            case PARTITION_KEY:
+                return basePartitionKey[column.position()];
+            case CLUSTERING:
+                return row.clustering().get(column.position());
+            default:
+                // This shouldn't NPE as we shouldn't get there if the value can be null (or there is a bug in updateAction())
+                return row.getCell(column).value();
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/db/view/ViewUtils.java b/src/java/org/apache/cassandra/db/view/ViewUtils.java
new file mode 100644
index 0000000..4d9517f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/view/ViewUtils.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.view;
+
+import java.net.InetAddress;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.locator.AbstractReplicationStrategy;
+import org.apache.cassandra.locator.NetworkTopologyStrategy;
+import org.apache.cassandra.utils.FBUtilities;
+
+public final class ViewUtils
+{
+    private ViewUtils()
+    {
+    }
+
+    /**
+     * Calculate the natural endpoint for the view.
+     *
+     * The view natural endpoint is the endpoint which has the same cardinality as this node in the replication factor.
+     * The cardinality is the number at which this node would store a piece of data, given the change in replication
+     * factor. If the keyspace's replication strategy is a NetworkTopologyStrategy, we filter the ring to contain only
+     * nodes in the local datacenter when calculating cardinality.
+     *
+     * For example, if we have the following ring:
+     *   A, T1 -> B, T2 -> C, T3 -> A
+     *
+     * For the token T1, at RF=1, A would be included, so A's cardinality for T1 is 1. For the token T1, at RF=2, B would
+     * be included, so B's cardinality for token T1 is 2. For token T3, at RF = 2, A would be included, so A's cardinality
+     * for T3 is 2.
+     *
+     * For a view whose base token is T1 and whose view token is T3, the pairings between the nodes would be:
+     *  A writes to C (A's cardinality is 1 for T1, and C's cardinality is 1 for T3)
+     *  B writes to A (B's cardinality is 2 for T1, and A's cardinality is 2 for T3)
+     *  C writes to B (C's cardinality is 3 for T1, and B's cardinality is 3 for T3)
+     *
+     * @return Optional.empty() if this method is called using a base token which does not belong to this replica
+     */
+    public static Optional<InetAddress> getViewNaturalEndpoint(String keyspaceName, Token baseToken, Token viewToken)
+    {
+        AbstractReplicationStrategy replicationStrategy = Keyspace.open(keyspaceName).getReplicationStrategy();
+
+        String localDataCenter = DatabaseDescriptor.getEndpointSnitch().getDatacenter(FBUtilities.getBroadcastAddress());
+        List<InetAddress> baseEndpoints = new ArrayList<>();
+        List<InetAddress> viewEndpoints = new ArrayList<>();
+        for (InetAddress baseEndpoint : replicationStrategy.getNaturalEndpoints(baseToken))
+        {
+            // An endpoint is local if we're not using Net
+            if (!(replicationStrategy instanceof NetworkTopologyStrategy) ||
+                DatabaseDescriptor.getEndpointSnitch().getDatacenter(baseEndpoint).equals(localDataCenter))
+                baseEndpoints.add(baseEndpoint);
+        }
+
+        for (InetAddress viewEndpoint : replicationStrategy.getNaturalEndpoints(viewToken))
+        {
+            // If we are a base endpoint which is also a view replica, we use ourselves as our view replica
+            if (viewEndpoint.equals(FBUtilities.getBroadcastAddress()))
+                return Optional.of(viewEndpoint);
+
+            // We have to remove any endpoint which is shared between the base and the view, as it will select itself
+            // and throw off the counts otherwise.
+            if (baseEndpoints.contains(viewEndpoint))
+                baseEndpoints.remove(viewEndpoint);
+            else if (!(replicationStrategy instanceof NetworkTopologyStrategy) ||
+                     DatabaseDescriptor.getEndpointSnitch().getDatacenter(viewEndpoint).equals(localDataCenter))
+                viewEndpoints.add(viewEndpoint);
+        }
+
+        // The replication strategy will be the same for the base and the view, as they must belong to the same keyspace.
+        // Since the same replication strategy is used, the same placement should be used and we should get the same
+        // number of replicas for all of the tokens in the ring.
+        assert baseEndpoints.size() == viewEndpoints.size() : "Replication strategy should have the same number of endpoints for the base and the view";
+        int baseIdx = baseEndpoints.indexOf(FBUtilities.getBroadcastAddress());
+
+        if (baseIdx < 0)
+            //This node is not a base replica of this key, so we return empty
+            return Optional.empty();
+
+        return Optional.of(viewEndpoints.get(baseIdx));
+    }
+}

diff --git a/src/java/org/apache/cassandra/dht/AbstractBounds.java b/src/java/org/apache/cassandra/dht/AbstractBounds.java
index c33ffc0..298c316 100644
--- a/src/java/org/apache/cassandra/dht/AbstractBounds.java
+++ b/src/java/org/apache/cassandra/dht/AbstractBounds.java

@@ -24,10 +24,11 @@
 import java.util.List;
 
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.Pair;
 
 public abstract class AbstractBounds<T extends RingPosition<T>> implements Serializable
@@ -35,8 +36,8 @@
     private static final long serialVersionUID = 1L;
     public static final IPartitionerDependentSerializer<AbstractBounds<Token>> tokenSerializer =
             new AbstractBoundsSerializer<Token>(Token.serializer);
-    public static final IPartitionerDependentSerializer<AbstractBounds<RowPosition>> rowPositionSerializer =
-            new AbstractBoundsSerializer<RowPosition>(RowPosition.serializer);
+    public static final IPartitionerDependentSerializer<AbstractBounds<PartitionPosition>> rowPositionSerializer =
+            new AbstractBoundsSerializer<PartitionPosition>(PartitionPosition.serializer);
 
     private enum Type
     {
@@ -137,12 +138,20 @@
     protected abstract String getOpeningString();
     protected abstract String getClosingString();
 
+    public abstract boolean isStartInclusive();
+    public abstract boolean isEndInclusive();
+
     public abstract AbstractBounds<T> withNewRight(T newRight);
 
     public static class AbstractBoundsSerializer<T extends RingPosition<T>> implements IPartitionerDependentSerializer<AbstractBounds<T>>
     {
+        private static final int IS_TOKEN_FLAG        = 0x01;
+        private static final int START_INCLUSIVE_FLAG = 0x02;
+        private static final int END_INCLUSIVE_FLAG   = 0x04;
+
         IPartitionerDependentSerializer<T> serializer;
 
+        // Use for pre-3.0 protocol
         private static int kindInt(AbstractBounds<?> ab)
         {
             int kind = ab instanceof Range ? Type.RANGE.ordinal() : Type.BOUNDS.ordinal();
@@ -151,6 +160,19 @@
             return kind;
         }
 
+        // For from 3.0 onwards
+        private static int kindFlags(AbstractBounds<?> ab)
+        {
+            int flags = 0;
+            if (ab.left instanceof Token)
+                flags |= IS_TOKEN_FLAG;
+            if (ab.isStartInclusive())
+                flags |= START_INCLUSIVE_FLAG;
+            if (ab.isEndInclusive())
+                flags |= END_INCLUSIVE_FLAG;
+            return flags;
+        }
+
         public AbstractBoundsSerializer(IPartitionerDependentSerializer<T> serializer)
         {
             this.serializer = serializer;
@@ -162,30 +184,51 @@
              * The first int tells us if it's a range or bounds (depending on the value) _and_ if it's tokens or keys (depending on the
              * sign). We use negative kind for keys so as to preserve the serialization of token from older version.
              */
-            out.writeInt(kindInt(range));
+            if (version < MessagingService.VERSION_30)
+                out.writeInt(kindInt(range));
+            else
+                out.writeByte(kindFlags(range));
             serializer.serialize(range.left, out, version);
             serializer.serialize(range.right, out, version);
         }
 
         public AbstractBounds<T> deserialize(DataInput in, IPartitioner p, int version) throws IOException
         {
-            int kind = in.readInt();
-            boolean isToken = kind >= 0;
-            if (!isToken)
-                kind = -(kind+1);
+            boolean isToken, startInclusive, endInclusive;
+            if (version < MessagingService.VERSION_30)
+            {
+                int kind = in.readInt();
+                isToken = kind >= 0;
+                if (!isToken)
+                    kind = -(kind+1);
+
+                // Pre-3.0, everything that wasa not a Range was (wrongly) serialized as a Bound;
+                startInclusive = kind != Type.RANGE.ordinal();
+                endInclusive = true;
+            }
+            else
+            {
+                int flags = in.readUnsignedByte();
+                isToken = (flags & IS_TOKEN_FLAG) != 0;
+                startInclusive = (flags & START_INCLUSIVE_FLAG) != 0;
+                endInclusive = (flags & END_INCLUSIVE_FLAG) != 0;
+            }
 
             T left = serializer.deserialize(in, p, version);
             T right = serializer.deserialize(in, p, version);
             assert isToken == left instanceof Token;
 
-            if (kind == Type.RANGE.ordinal())
-                return new Range<T>(left, right);
-            return new Bounds<T>(left, right);
+            if (startInclusive)
+                return endInclusive ? new Bounds<T>(left, right) : new IncludingExcludingBounds<T>(left, right);
+            else
+                return endInclusive ? new Range<T>(left, right) : new ExcludingBounds<T>(left, right);
         }
 
         public long serializedSize(AbstractBounds<T> ab, int version)
         {
-            int size = TypeSizes.NATIVE.sizeof(kindInt(ab));
+            int size = version < MessagingService.VERSION_30
+                     ? TypeSizes.sizeof(kindInt(ab))
+                     : 1;
             size += serializer.serializedSize(ab.left, version);
             size += serializer.serializedSize(ab.right, version);
             return size;

diff --git a/src/java/org/apache/cassandra/dht/BootStrapper.java b/src/java/org/apache/cassandra/dht/BootStrapper.java
index 26fa6b3..1c40482 100644
--- a/src/java/org/apache/cassandra/dht/BootStrapper.java
+++ b/src/java/org/apache/cassandra/dht/BootStrapper.java

@@ -17,27 +17,27 @@
  */
 package org.apache.cassandra.dht;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.*;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import com.google.common.util.concurrent.ListenableFuture;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.dht.tokenallocator.TokenAllocation;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.*;
 import org.apache.cassandra.utils.progress.ProgressEvent;
 import org.apache.cassandra.utils.progress.ProgressEventNotifierSupport;
@@ -77,7 +77,7 @@
         streamer.addSourceFilter(new RangeStreamer.FailureDetectorSourceFilter(FailureDetector.instance));
         streamer.addSourceFilter(new RangeStreamer.ExcludeLocalNodeFilter());
 
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             AbstractReplicationStrategy strategy = Keyspace.open(keyspaceName).getReplicationStrategy();
             streamer.addRanges(keyspaceName, strategy.getPendingAddressRanges(tokenMetadata, tokens, address));
@@ -151,43 +151,67 @@
 
     /**
      * if initialtoken was specified, use that (split on comma).
-     * otherwise, if num_tokens == 1, pick a token to assume half the load of the most-loaded node.
+     * otherwise, if allocationKeyspace is specified use the token allocation algorithm to generate suitable tokens
      * else choose num_tokens tokens at random
      */
-    public static Collection<Token> getBootstrapTokens(final TokenMetadata metadata) throws ConfigurationException
+    public static Collection<Token> getBootstrapTokens(final TokenMetadata metadata, InetAddress address) throws ConfigurationException
     {
+        String allocationKeyspace = DatabaseDescriptor.getAllocateTokensForKeyspace();
         Collection<String> initialTokens = DatabaseDescriptor.getInitialTokens();
+        if (initialTokens.size() > 0 && allocationKeyspace != null)
+            logger.warn("manually specified tokens override automatic allocation");
+
         // if user specified tokens, use those
         if (initialTokens.size() > 0)
-        {
-            logger.trace("tokens manually specified as {}",  initialTokens);
-            List<Token> tokens = new ArrayList<>(initialTokens.size());
-            for (String tokenString : initialTokens)
-            {
-                Token token = StorageService.getPartitioner().getTokenFactory().fromString(tokenString);
-                if (metadata.getEndpoint(token) != null)
-                    throw new ConfigurationException("Bootstrapping to existing token " + tokenString + " is not allowed (decommission/removenode the old node first).");
-                tokens.add(token);
-            }
-            return tokens;
-        }
+            return getSpecifiedTokens(metadata, initialTokens);
 
         int numTokens = DatabaseDescriptor.getNumTokens();
         if (numTokens < 1)
             throw new ConfigurationException("num_tokens must be >= 1");
 
+        if (allocationKeyspace != null)
+            return allocateTokens(metadata, address, allocationKeyspace, numTokens);
+
         if (numTokens == 1)
-            logger.warn("Picking random token for a single vnode.  You should probably add more vnodes; failing that, you should probably specify the token manually");
+            logger.warn("Picking random token for a single vnode.  You should probably add more vnodes and/or use the automatic token allocation mechanism.");
 
         return getRandomTokens(metadata, numTokens);
     }
 
+    private static Collection<Token> getSpecifiedTokens(final TokenMetadata metadata,
+                                                        Collection<String> initialTokens)
+    {
+        logger.trace("tokens manually specified as {}",  initialTokens);
+        List<Token> tokens = new ArrayList<>(initialTokens.size());
+        for (String tokenString : initialTokens)
+        {
+            Token token = metadata.partitioner.getTokenFactory().fromString(tokenString);
+            if (metadata.getEndpoint(token) != null)
+                throw new ConfigurationException("Bootstrapping to existing token " + tokenString + " is not allowed (decommission/removenode the old node first).");
+            tokens.add(token);
+        }
+        return tokens;
+    }
+
+    static Collection<Token> allocateTokens(final TokenMetadata metadata,
+                                            InetAddress address,
+                                            String allocationKeyspace,
+                                            int numTokens)
+    {
+        Keyspace ks = Keyspace.open(allocationKeyspace);
+        if (ks == null)
+            throw new ConfigurationException("Problem opening token allocation keyspace " + allocationKeyspace);
+        AbstractReplicationStrategy rs = ks.getReplicationStrategy();
+
+        return TokenAllocation.allocateTokens(metadata, rs, address, numTokens);
+    }
+
     public static Collection<Token> getRandomTokens(TokenMetadata metadata, int numTokens)
     {
         Set<Token> tokens = new HashSet<>(numTokens);
         while (tokens.size() < numTokens)
         {
-            Token token = StorageService.getPartitioner().getRandomToken();
+            Token token = metadata.partitioner.getRandomToken();
             if (metadata.getEndpoint(token) == null)
                 tokens.add(token);
         }
@@ -203,14 +227,14 @@
             out.writeUTF(s);
         }
 
-        public String deserialize(DataInput in, int version) throws IOException
+        public String deserialize(DataInputPlus in, int version) throws IOException
         {
             return in.readUTF();
         }
 
         public long serializedSize(String s, int version)
         {
-            return TypeSizes.NATIVE.sizeof(s);
+            return TypeSizes.sizeof(s);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/dht/Bounds.java b/src/java/org/apache/cassandra/dht/Bounds.java
index 73414cd..a125168 100644
--- a/src/java/org/apache/cassandra/dht/Bounds.java
+++ b/src/java/org/apache/cassandra/dht/Bounds.java

@@ -29,7 +29,7 @@
 import com.google.common.collect.PeekingIterator;
 import com.google.common.collect.Sets;
 
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -125,12 +125,22 @@
         return false;
     }
 
+    public boolean isStartInclusive()
+    {
+        return true;
+    }
+
+    public boolean isEndInclusive()
+    {
+        return true;
+    }
+
     /**
      * Compute a bounds of keys corresponding to a given bounds of token.
      */
-    public static Bounds<RowPosition> makeRowBounds(Token left, Token right)
+    public static Bounds<PartitionPosition> makeRowBounds(Token left, Token right)
     {
-        return new Bounds<RowPosition>(left.minKeyBound(), right.maxKeyBound());
+        return new Bounds<PartitionPosition>(left.minKeyBound(), right.maxKeyBound());
     }
 
     public AbstractBounds<T> withNewRight(T newRight)

diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
index 297e5a6..bbf6fd6 100644
--- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java

@@ -116,6 +116,20 @@
         {
             return token;
         }
+
+        @Override
+        public double size(Token next)
+        {
+            throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.",
+                                                                  getClass().getSimpleName()));
+        }
+
+        @Override
+        public Token increaseSlightly()
+        {
+            throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.",
+                                                                  getClass().getSimpleName()));
+        }
     }
 
     public BytesToken getToken(ByteBuffer key)
@@ -262,7 +276,7 @@
 
         for (String ks : Schema.instance.getKeyspaces())
         {
-            for (CFMetaData cfmd : Schema.instance.getKSMetaData(ks).cfMetaData().values())
+            for (CFMetaData cfmd : Schema.instance.getTablesAndViews(ks))
             {
                 for (Range<Token> r : sortedRanges)
                 {
@@ -286,4 +300,9 @@
     {
         return BytesType.instance;
     }
+
+    public AbstractType<?> partitionOrdering()
+    {
+        return BytesType.instance;
+    }
 }

diff --git a/src/java/org/apache/cassandra/dht/ComparableObjectToken.java b/src/java/org/apache/cassandra/dht/ComparableObjectToken.java
index 137f27c..97c0c52 100644
--- a/src/java/org/apache/cassandra/dht/ComparableObjectToken.java
+++ b/src/java/org/apache/cassandra/dht/ComparableObjectToken.java

@@ -66,4 +66,18 @@
 
         return token.compareTo(((ComparableObjectToken<C>) o).token);
     }
+
+    @Override
+    public double size(Token next)
+    {
+        throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.",
+                                                              getClass().getSimpleName()));
+    }
+
+    @Override
+    public Token increaseSlightly()
+    {
+        throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.",
+                                                              getClass().getSimpleName()));
+    }
 }

diff --git a/src/java/org/apache/cassandra/dht/ExcludingBounds.java b/src/java/org/apache/cassandra/dht/ExcludingBounds.java
index 7319356..8fbde28 100644
--- a/src/java/org/apache/cassandra/dht/ExcludingBounds.java
+++ b/src/java/org/apache/cassandra/dht/ExcludingBounds.java

@@ -90,6 +90,16 @@
         return ")";
     }
 
+    public boolean isStartInclusive()
+    {
+        return false;
+    }
+
+    public boolean isEndInclusive()
+    {
+        return false;
+    }
+
     public AbstractBounds<T> withNewRight(T newRight)
     {
         return new ExcludingBounds<T>(left, newRight);

diff --git a/src/java/org/apache/cassandra/dht/IPartitioner.java b/src/java/org/apache/cassandra/dht/IPartitioner.java
index b22da66..e0a08dc 100644
--- a/src/java/org/apache/cassandra/dht/IPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/IPartitioner.java

@@ -78,4 +78,10 @@
     public Map<Token, Float> describeOwnership(List<Token> sortedTokens);
 
     public AbstractType<?> getTokenValidator();
+
+    /**
+     * Abstract type that orders the same way as DecoratedKeys provided by this partitioner.
+     * Used by secondary indices.
+     */
+    public AbstractType<?> partitionOrdering();
 }

diff --git a/src/java/org/apache/cassandra/dht/IncludingExcludingBounds.java b/src/java/org/apache/cassandra/dht/IncludingExcludingBounds.java
index abcf87b..19c098e 100644
--- a/src/java/org/apache/cassandra/dht/IncludingExcludingBounds.java
+++ b/src/java/org/apache/cassandra/dht/IncludingExcludingBounds.java

@@ -89,6 +89,16 @@
         return ")";
     }
 
+    public boolean isStartInclusive()
+    {
+        return true;
+    }
+
+    public boolean isEndInclusive()
+    {
+        return false;
+    }
+
     public AbstractBounds<T> withNewRight(T newRight)
     {
         return new IncludingExcludingBounds<T>(left, newRight);

diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
index 01dc75e..aaf2569 100644
--- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java

@@ -27,10 +27,11 @@
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.memory.HeapAllocator;
 
 public class LocalPartitioner implements IPartitioner
 {
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new LocalPartitioner(null).new LocalToken(null));
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new LocalPartitioner(null).new LocalToken());
 
     final AbstractType<?> comparator;   // package-private to avoid access workarounds in embedded LocalToken.
 
@@ -66,9 +67,37 @@
 
     public Token.TokenFactory getTokenFactory()
     {
-        throw new UnsupportedOperationException();
+        return tokenFactory;
     }
 
+    private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
+    {
+        public ByteBuffer toByteArray(Token token)
+        {
+            return ((LocalToken)token).token;
+        }
+
+        public Token fromByteArray(ByteBuffer bytes)
+        {
+            return new LocalToken(bytes);
+        }
+
+        public String toString(Token token)
+        {
+            return comparator.getString(((LocalToken)token).token);
+        }
+
+        public void validate(String token)
+        {
+            comparator.validate(comparator.fromString(token));
+        }
+
+        public Token fromString(String string)
+        {
+            return new LocalToken(comparator.fromString(string));
+        }
+    };
+
     public boolean preservesOrder()
     {
         return true;
@@ -84,13 +113,23 @@
         return comparator;
     }
 
+    public AbstractType<?> partitionOrdering()
+    {
+        return comparator;
+    }
+
     public class LocalToken extends ComparableObjectToken<ByteBuffer>
     {
         static final long serialVersionUID = 8437543776403014875L;
 
+        private LocalToken()
+        {
+            super(null);
+        }
+
         public LocalToken(ByteBuffer token)
         {
-            super(token);
+            super(HeapAllocator.instance.clone(token));
         }
 
         @Override

diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
index 96c603e..d68be3f 100644
--- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
+++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java

@@ -20,15 +20,13 @@
 import java.math.BigDecimal;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
 
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.PreHashedDecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -48,6 +46,7 @@
     private static final int HEAP_SIZE = (int) ObjectSizes.measureDeep(MINIMUM);
 
     public static final Murmur3Partitioner instance = new Murmur3Partitioner();
+    public static final AbstractType<?> partitionOrdering = new PartitionerDefinedOrder(instance);
 
     public DecoratedKey decorateKey(ByteBuffer key)
     {
@@ -140,6 +139,21 @@
         {
             return token;
         }
+
+        @Override
+        public double size(Token next)
+        {
+            LongToken n = (LongToken) next;
+            long v = n.token - token;  // Overflow acceptable and desired.
+            double d = Math.scalb((double) v, -Long.SIZE); // Scale so that the full range is 1.
+            return d > 0.0 ? d : (d + 1.0); // Adjust for signed long, also making sure t.size(t) == 1.
+        }
+
+        @Override
+        public Token increaseSlightly()
+        {
+            return new LongToken(token + 1);
+        }
     }
 
     /**
@@ -170,7 +184,12 @@
 
     public LongToken getRandomToken()
     {
-        return new LongToken(normalize(ThreadLocalRandom.current().nextLong()));
+        return getRandomToken(ThreadLocalRandom.current());
+    }
+
+    public LongToken getRandomToken(Random r)
+    {
+        return new LongToken(normalize(r.nextLong()));
     }
 
     private long normalize(long v)
@@ -271,4 +290,9 @@
     {
         return LongType.instance;
     }
+
+    public AbstractType<?> partitionOrdering()
+    {
+        return partitionOrdering;
+    }
 }

diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
index cffa4fc..96b4ca0 100644
--- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java

@@ -216,12 +216,12 @@
 
         for (String ks : Schema.instance.getKeyspaces())
         {
-            for (CFMetaData cfmd : Schema.instance.getKSMetaData(ks).cfMetaData().values())
+            for (CFMetaData cfmd : Schema.instance.getTablesAndViews(ks))
             {
                 for (Range<Token> r : sortedRanges)
                 {
                     // Looping over every KS:CF:Range, get the splits size and add it to the count
-                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, cfmd.getMinIndexInterval()).size());
+                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, cfmd.params.minIndexInterval).size());
                 }
             }
         }
@@ -240,4 +240,9 @@
     {
         return UTF8Type.instance;
     }
+
+    public AbstractType<?> partitionOrdering()
+    {
+        return UTF8Type.instance;
+    }
 }

diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
index 71a0a99..c7837c9 100644
--- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java

@@ -20,6 +20,7 @@
 import java.math.BigDecimal;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
+import java.security.MessageDigest;
 import java.util.*;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -29,6 +30,7 @@
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.GuidGenerator;
@@ -44,9 +46,34 @@
     public static final BigIntegerToken MINIMUM = new BigIntegerToken("-1");
     public static final BigInteger MAXIMUM = new BigInteger("2").pow(127);
 
-    private static final int HEAP_SIZE = (int) ObjectSizes.measureDeep(new BigIntegerToken(FBUtilities.hashToBigInteger(ByteBuffer.allocate(1))));
+    /**
+     * Maintain a separate threadlocal message digest, exclusively for token hashing. This is necessary because
+     * when Tracing is enabled and using the default tracing implementation, creating the mutations for the trace
+     * events involves tokenizing the partition keys. This happens multiple times whilst servicing a ReadCommand,
+     * and so can interfere with the stateful digest calculation if the node is a replica producing a digest response.
+     */
+    private static final ThreadLocal<MessageDigest> localMD5Digest = new ThreadLocal<MessageDigest>()
+    {
+        @Override
+        protected MessageDigest initialValue()
+        {
+            return FBUtilities.newMessageDigest("MD5");
+        }
+
+        @Override
+        public MessageDigest get()
+        {
+            MessageDigest digest = super.get();
+            digest.reset();
+            return digest;
+        }
+    };
+
+    private static final int HEAP_SIZE = (int) ObjectSizes.measureDeep(new BigIntegerToken(hashToBigInteger(ByteBuffer.allocate(1))));
 
     public static final RandomPartitioner instance = new RandomPartitioner();
+    public static final AbstractType<?> partitionOrdering = new PartitionerDefinedOrder(instance);
+
 
     public DecoratedKey decorateKey(ByteBuffer key)
     {
@@ -70,7 +97,7 @@
 
     public BigIntegerToken getRandomToken()
     {
-        BigInteger token = FBUtilities.hashToBigInteger(GuidGenerator.guidAsBytes());
+        BigInteger token = hashToBigInteger(GuidGenerator.guidAsBytes());
         if ( token.signum() == -1 )
             token = token.multiply(BigInteger.valueOf(-1L));
         return new BigIntegerToken(token);
@@ -158,7 +185,8 @@
     {
         if (key.remaining() == 0)
             return MINIMUM;
-        return new BigIntegerToken(FBUtilities.hashToBigInteger(key));
+
+        return new BigIntegerToken(hashToBigInteger(key));
     }
 
     public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
@@ -196,4 +224,20 @@
     {
         return IntegerType.instance;
     }
+
+    public AbstractType<?> partitionOrdering()
+    {
+        return partitionOrdering;
+    }
+
+    private static BigInteger hashToBigInteger(ByteBuffer data)
+    {
+        MessageDigest messageDigest = localMD5Digest.get();
+        if (data.hasArray())
+            messageDigest.update(data.array(), data.arrayOffset() + data.position(), data.remaining());
+        else
+            messageDigest.update(data.duplicate());
+
+        return new BigInteger(messageDigest.digest()).abs();
+    }
 }

diff --git a/src/java/org/apache/cassandra/dht/Range.java b/src/java/org/apache/cassandra/dht/Range.java
index 34e91ea..3cf292a 100644
--- a/src/java/org/apache/cassandra/dht/Range.java
+++ b/src/java/org/apache/cassandra/dht/Range.java

@@ -22,8 +22,7 @@
 
 import org.apache.commons.lang3.ObjectUtils;
 
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -257,6 +256,14 @@
     }
 
     /**
+     * Tells if the given range covers the entire ring
+     */
+    private static <T extends RingPosition<T>> boolean isFull(T left, T right)
+    {
+        return left.equals(right);
+    }
+
+    /**
      * Note: this class has a natural ordering that is inconsistent with equals
      */
     public int compareTo(Range<T> rhs)
@@ -275,13 +282,24 @@
      * Subtracts a portion of this range.
      * @param contained The range to subtract from this. It must be totally
      * contained by this range.
-     * @return An ArrayList of the Ranges left after subtracting contained
+     * @return A List of the Ranges left after subtracting contained
      * from this.
      */
-    private ArrayList<Range<T>> subtractContained(Range<T> contained)
+    private List<Range<T>> subtractContained(Range<T> contained)
     {
-        ArrayList<Range<T>> difference = new ArrayList<Range<T>>(2);
+        // both ranges cover the entire ring, their difference is an empty set
+        if(isFull(left, right) && isFull(contained.left, contained.right))
+        {
+            return Collections.emptyList();
+        }
 
+        // a range is subtracted from another range that covers the entire ring
+        if(isFull(left, right))
+        {
+            return Collections.singletonList(new Range<>(contained.right, contained.left));
+        }
+
+        List<Range<T>> difference = new ArrayList<>(2);
         if (!left.equals(contained.left))
             difference.add(new Range<T>(left, contained.left));
         if (!right.equals(contained.right))
@@ -347,7 +365,7 @@
                 // intersections.length must be 2
                 Range<T> first = intersections[0];
                 Range<T> second = intersections[1];
-                ArrayList<Range<T>> temp = rhs.subtractContained(first);
+                List<Range<T>> temp = rhs.subtractContained(first);
 
                 // Because there are two intersections, subtracting only one of them
                 // will yield a single Range.
@@ -397,6 +415,16 @@
         return "]";
     }
 
+    public boolean isStartInclusive()
+    {
+        return false;
+    }
+
+    public boolean isEndInclusive()
+    {
+        return true;
+    }
+
     public List<String> asList()
     {
         ArrayList<String> ret = new ArrayList<String>(2);
@@ -490,12 +518,12 @@
     /**
      * Compute a range of keys corresponding to a given range of token.
      */
-    public static Range<RowPosition> makeRowRange(Token left, Token right)
+    public static Range<PartitionPosition> makeRowRange(Token left, Token right)
     {
-        return new Range<RowPosition>(left.maxKeyBound(), right.maxKeyBound());
+        return new Range<PartitionPosition>(left.maxKeyBound(), right.maxKeyBound());
     }
 
-    public static Range<RowPosition> makeRowRange(Range<Token> tokenBounds)
+    public static Range<PartitionPosition> makeRowRange(Range<Token> tokenBounds)
     {
         return makeRowRange(tokenBounds.left, tokenBounds.right);
     }

diff --git a/src/java/org/apache/cassandra/dht/RangeStreamer.java b/src/java/org/apache/cassandra/dht/RangeStreamer.java
index aef588e..32e084f 100644
--- a/src/java/org/apache/cassandra/dht/RangeStreamer.java
+++ b/src/java/org/apache/cassandra/dht/RangeStreamer.java

@@ -322,8 +322,8 @@
                 if (strat != null && strat.getReplicationFactor() == 1)
                 {
                     if (useStrictConsistency)
-                        throw new IllegalStateException("Unable to find sufficient sources for streaming range " + range + " in keyspace " + keyspace + " with RF=1." +
-                                                        "If you want to ignore this, consider using system property -Dcassandra.consistent.rangemovement=false.");
+                        throw new IllegalStateException("Unable to find sufficient sources for streaming range " + range + " in keyspace " + keyspace + " with RF=1. " +
+                                                        "Ensure this keyspace contains replicas in the source datacenter.");
                     else
                         logger.warn("Unable to find sufficient sources for streaming range " + range + " in keyspace " + keyspace + " with RF=1. " +
                                     "Keyspace might be missing data.");
@@ -359,7 +359,7 @@
             Collection<Range<Token>> ranges = entry.getValue().getValue();
 
             // filter out already streamed ranges
-            Set<Range<Token>> availableRanges = stateStore.getAvailableRanges(keyspace, StorageService.getPartitioner());
+            Set<Range<Token>> availableRanges = stateStore.getAvailableRanges(keyspace, StorageService.instance.getTokenMetadata().partitioner);
             if (ranges.removeAll(availableRanges))
             {
                 logger.info("Some ranges of {} are already available. Skipping streaming those ranges.", availableRanges);

diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java
index 76918a7..20b45ef 100644
--- a/src/java/org/apache/cassandra/dht/Token.java
+++ b/src/java/org/apache/cassandra/dht/Token.java

@@ -22,7 +22,7 @@
 import java.io.Serializable;
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.util.DataOutputPlus;
@@ -65,7 +65,7 @@
         {
             IPartitioner p = object.getPartitioner();
             ByteBuffer b = p.getTokenFactory().toByteArray(object);
-            return TypeSizes.NATIVE.sizeof(b.remaining()) + b.remaining();
+            return TypeSizes.sizeof(b.remaining()) + b.remaining();
         }
     }
 
@@ -73,6 +73,18 @@
     abstract public long getHeapSize();
     abstract public Object getTokenValue();
 
+    /**
+     * Returns a measure for the token space covered between this token and next.
+     * Used by the token allocation algorithm (see CASSANDRA-7032).
+     */
+    abstract public double size(Token next);
+    /**
+     * Returns a token that is slightly greater than this. Used to avoid clashes
+     * between nodes in separate datacentres trying to use the same token via
+     * the token allocation algorithm.
+     */
+    abstract public Token increaseSlightly();
+
     public Token getToken()
     {
         return this;
@@ -130,7 +142,7 @@
             return (R)maxKeyBound();
     }
 
-    public static class KeyBound implements RowPosition
+    public static class KeyBound implements PartitionPosition
     {
         private final Token token;
         public final boolean isMinimumBound;
@@ -146,7 +158,7 @@
             return token;
         }
 
-        public int compareTo(RowPosition pos)
+        public int compareTo(PartitionPosition pos)
         {
             if (this == pos)
                 return 0;
@@ -176,9 +188,9 @@
             return getToken().isMinimum();
         }
 
-        public RowPosition.Kind kind()
+        public PartitionPosition.Kind kind()
         {
-            return isMinimumBound ? RowPosition.Kind.MIN_BOUND : RowPosition.Kind.MAX_BOUND;
+            return isMinimumBound ? PartitionPosition.Kind.MIN_BOUND : PartitionPosition.Kind.MAX_BOUND;
         }
 
         @Override

diff --git a/src/java/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocator.java b/src/java/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocator.java
new file mode 100644
index 0000000..054a90e
--- /dev/null
+++ b/src/java/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocator.java

@@ -0,0 +1,805 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.dht.tokenallocator;
+
+import java.util.*;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multimap;
+
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Token;
+
+/**
+ * A Replication Aware allocator for tokens, that attempts to ensure an even distribution of ownership across
+ * the known cluster for the provided replication strategy.
+ *
+ * A unit is shorthand for a "unit of ownership" which translates roughly to a node, or a disk on the node,
+ * a CPU on the node, or some other relevant unit of ownership. These units should be the lowest rung over which
+ * ownership needs to be evenly distributed. At the moment only nodes as a whole are treated as units, but that
+ * will change with the introduction of token ranges per disk.
+ */
+class ReplicationAwareTokenAllocator<Unit> implements TokenAllocator<Unit>
+{
+    final NavigableMap<Token, Unit> sortedTokens;
+    final Multimap<Unit, Token> unitToTokens;
+    final ReplicationStrategy<Unit> strategy;
+    final IPartitioner partitioner;
+    final int replicas;
+
+    ReplicationAwareTokenAllocator(NavigableMap<Token, Unit> sortedTokens, ReplicationStrategy<Unit> strategy, IPartitioner partitioner)
+    {
+        this.sortedTokens = sortedTokens;
+        unitToTokens = HashMultimap.create();
+        for (Map.Entry<Token, Unit> en : sortedTokens.entrySet())
+            unitToTokens.put(en.getValue(), en.getKey());
+        this.strategy = strategy;
+        this.replicas = strategy.replicas();
+        this.partitioner = partitioner;
+    }
+
+    public Collection<Token> addUnit(Unit newUnit, int numTokens)
+    {
+        assert !unitToTokens.containsKey(newUnit);
+
+        if (unitCount() < replicas)
+            // Allocation does not matter; everything replicates everywhere.
+            return generateRandomTokens(newUnit, numTokens);
+        if (numTokens > sortedTokens.size())
+            // Some of the heuristics below can't deal with this case. Use random for now, later allocations can fix any problems this may cause.
+            return generateRandomTokens(newUnit, numTokens);
+
+        // ============= construct our initial token ring state =============
+
+        double optTokenOwnership = optimalTokenOwnership(numTokens);
+        Map<Object, GroupInfo> groups = Maps.newHashMap();
+        Map<Unit, UnitInfo<Unit>> unitInfos = createUnitInfos(groups);
+        if (groups.size() < replicas)
+        {
+            // We need at least replicas groups to do allocation correctly. If there aren't enough, 
+            // use random allocation.
+            // This part of the code should only be reached via the RATATest. StrategyAdapter should disallow
+            // token allocation in this case as the algorithm is not able to cover the behavior of NetworkTopologyStrategy.
+            return generateRandomTokens(newUnit, numTokens);
+        }
+
+        // initialise our new unit's state (with an idealised ownership)
+        // strategy must already know about this unit
+        UnitInfo<Unit> newUnitInfo = new UnitInfo<>(newUnit, numTokens * optTokenOwnership, groups, strategy);
+
+        // build the current token ring state
+        TokenInfo<Unit> tokens = createTokenInfos(unitInfos, newUnitInfo.group);
+        newUnitInfo.tokenCount = numTokens;
+
+        // ============= construct and rank our candidate token allocations =============
+
+        // walk the token ring, constructing the set of candidates in ring order
+        // as the midpoints between all existing tokens
+        CandidateInfo<Unit> candidates = createCandidates(tokens, newUnitInfo, optTokenOwnership);
+
+        // Evaluate the expected improvements from all candidates and form a priority queue.
+        PriorityQueue<Weighted<CandidateInfo<Unit>>> improvements = new PriorityQueue<>(sortedTokens.size());
+        CandidateInfo<Unit> candidate = candidates;
+        do
+        {
+            double impr = evaluateImprovement(candidate, optTokenOwnership, 1.0 / numTokens);
+            improvements.add(new Weighted<>(impr, candidate));
+            candidate = candidate.next;
+        } while (candidate != candidates);
+
+        // ============= iteratively take the best candidate, and re-rank =============
+
+        CandidateInfo<Unit> bestToken = improvements.remove().value;
+        for (int vn = 1; ; ++vn)
+        {
+            candidates = bestToken.removeFrom(candidates);
+            confirmCandidate(bestToken);
+
+            if (vn == numTokens)
+                break;
+
+            while (true)
+            {
+                // Get the next candidate in the queue. Its improvement may have changed (esp. if multiple tokens
+                // were good suggestions because they could improve the same problem)-- evaluate it again to check
+                // if it is still a good candidate.
+                bestToken = improvements.remove().value;
+                double impr = evaluateImprovement(bestToken, optTokenOwnership, (vn + 1.0) / numTokens);
+                Weighted<CandidateInfo<Unit>> next = improvements.peek();
+
+                // If it is better than the next in the queue, it is good enough. This is a heuristic that doesn't
+                // get the best results, but works well enough and on average cuts search time by a factor of O(vnodes).
+                if (next == null || impr >= next.weight)
+                    break;
+                improvements.add(new Weighted<>(impr, bestToken));
+            }
+        }
+
+        return ImmutableList.copyOf(unitToTokens.get(newUnit));
+    }
+
+    private Collection<Token> generateRandomTokens(Unit newUnit, int numTokens)
+    {
+        Set<Token> tokens = new HashSet<>(numTokens);
+        while (tokens.size() < numTokens)
+        {
+            Token token = partitioner.getRandomToken();
+            if (!sortedTokens.containsKey(token))
+            {
+                tokens.add(token);
+                sortedTokens.put(token, newUnit);
+                unitToTokens.put(newUnit, token);
+            }
+        }
+        return tokens;
+    }
+
+    private Map<Unit, UnitInfo<Unit>> createUnitInfos(Map<Object, GroupInfo> groups)
+    {
+        Map<Unit, UnitInfo<Unit>> map = Maps.newHashMap();
+        for (Unit n : sortedTokens.values())
+        {
+            UnitInfo<Unit> ni = map.get(n);
+            if (ni == null)
+                map.put(n, ni = new UnitInfo<>(n, 0, groups, strategy));
+            ni.tokenCount++;
+        }
+        return map;
+    }
+
+    /**
+     * Construct the token ring as a CircularList of TokenInfo,
+     * and populate the ownership of the UnitInfo's provided
+     */
+    private TokenInfo<Unit> createTokenInfos(Map<Unit, UnitInfo<Unit>> units, GroupInfo newUnitGroup)
+    {
+        // build the circular list
+        TokenInfo<Unit> prev = null;
+        TokenInfo<Unit> first = null;
+        for (Map.Entry<Token, Unit> en : sortedTokens.entrySet())
+        {
+            Token t = en.getKey();
+            UnitInfo<Unit> ni = units.get(en.getValue());
+            TokenInfo<Unit> ti = new TokenInfo<>(t, ni);
+            first = ti.insertAfter(first, prev);
+            prev = ti;
+        }
+
+        TokenInfo<Unit> curr = first;
+        do
+        {
+            populateTokenInfoAndAdjustUnit(curr, newUnitGroup);
+            curr = curr.next;
+        } while (curr != first);
+
+        return first;
+    }
+
+    private CandidateInfo<Unit> createCandidates(TokenInfo<Unit> tokens, UnitInfo<Unit> newUnitInfo, double initialTokenOwnership)
+    {
+        TokenInfo<Unit> curr = tokens;
+        CandidateInfo<Unit> first = null;
+        CandidateInfo<Unit> prev = null;
+        do
+        {
+            CandidateInfo<Unit> candidate = new CandidateInfo<Unit>(partitioner.midpoint(curr.prev.token, curr.token), curr, newUnitInfo);
+            first = candidate.insertAfter(first, prev);
+
+            candidate.replicatedOwnership = initialTokenOwnership;
+            populateCandidate(candidate);
+
+            prev = candidate;
+            curr = curr.next;
+        } while (curr != tokens);
+        prev.next = first;
+        return first;
+    }
+
+    private void populateCandidate(CandidateInfo<Unit> candidate)
+    {
+        // Only finding replication start would do.
+        populateTokenInfo(candidate, candidate.owningUnit.group);
+    }
+
+    /**
+     * Incorporates the selected candidate into the ring, adjusting ownership information and calculated token
+     * information.
+     */
+    private void confirmCandidate(CandidateInfo<Unit> candidate)
+    {
+        // This process is less efficient than it could be (loops through each vnode's replication span instead
+        // of recalculating replicationStart, replicationThreshold from existing data + new token data in an O(1)
+        // case analysis similar to evaluateImprovement). This is fine as the method does not dominate processing
+        // time.
+
+        // Put the accepted candidate in the token list.
+        UnitInfo<Unit> newUnit = candidate.owningUnit;
+        Token newToken = candidate.token;
+        sortedTokens.put(newToken, newUnit.unit);
+        unitToTokens.put(newUnit.unit, newToken);
+
+        TokenInfo<Unit> prev = candidate.prevInRing();
+        TokenInfo<Unit> newTokenInfo = new TokenInfo<>(newToken, newUnit);
+        newTokenInfo.replicatedOwnership = candidate.replicatedOwnership;
+        newTokenInfo.insertAfter(prev, prev);   // List is not empty so this won't need to change head of list.
+
+        // Update data for candidate.
+        populateTokenInfoAndAdjustUnit(newTokenInfo, newUnit.group);
+
+        ReplicationVisitor replicationVisitor = new ReplicationVisitor();
+        assert newTokenInfo.next == candidate.split;
+        for (TokenInfo<Unit> curr = newTokenInfo.next; !replicationVisitor.visitedAll(); curr = curr.next)
+        {
+            // update the candidate between curr and next
+            candidate = candidate.next;
+            populateCandidate(candidate);
+
+            if (!replicationVisitor.add(curr.owningUnit.group))
+                continue;    // If we've already seen this group, the token cannot be affected.
+
+            populateTokenInfoAndAdjustUnit(curr, newUnit.group);
+        }
+
+        replicationVisitor.clean();
+    }
+
+    /**
+     * Calculates the {@code replicationStart} of a token, as well as {@code replicationThreshold} which is chosen in a way
+     * that permits {@code findUpdatedReplicationStart} to quickly identify changes in ownership.
+     */
+    private Token populateTokenInfo(BaseTokenInfo<Unit, ?> token, GroupInfo newUnitGroup)
+    {
+        GroupInfo tokenGroup = token.owningUnit.group;
+        PopulateVisitor visitor = new PopulateVisitor();
+
+        // Replication start = the end of a token from the RF'th different group seen before the token.
+        Token replicationStart;
+        // The end of a token from the RF-1'th different group seen before the token.
+        Token replicationThreshold = token.token;
+        GroupInfo currGroup;
+        for (TokenInfo<Unit> curr = token.prevInRing(); ; curr = curr.prev)
+        {
+            replicationStart = curr.token;
+            currGroup = curr.owningUnit.group;
+            if (!visitor.add(currGroup))
+                continue; // Group is already seen.
+            if (visitor.visitedAll())
+                break;
+
+            replicationThreshold = replicationStart;
+            // Another instance of the same group precedes us in the replication range of the ring,
+            // so this is where our replication range begins
+            if (currGroup == tokenGroup)
+                break;
+        }
+        if (newUnitGroup == tokenGroup)
+            // new token is always a boundary (as long as it's closer than replicationStart)
+            replicationThreshold = token.token;
+        else if (newUnitGroup != currGroup && visitor.seen(newUnitGroup))
+            // already has new group in replication span before last seen. cannot be affected
+            replicationThreshold = replicationStart;
+        visitor.clean();
+
+        token.replicationThreshold = replicationThreshold;
+        token.replicationStart = replicationStart;
+        return replicationStart;
+    }
+
+    private void populateTokenInfoAndAdjustUnit(TokenInfo<Unit> populate, GroupInfo newUnitGroup)
+    {
+        Token replicationStart = populateTokenInfo(populate, newUnitGroup);
+        double newOwnership = replicationStart.size(populate.token);
+        double oldOwnership = populate.replicatedOwnership;
+        populate.replicatedOwnership = newOwnership;
+        populate.owningUnit.ownership += newOwnership - oldOwnership;
+    }
+
+    /**
+     * Evaluates the improvement in variance for both units and individual tokens when candidate is inserted into the
+     * ring.
+     */
+    private double evaluateImprovement(CandidateInfo<Unit> candidate, double optTokenOwnership, double newUnitMult)
+    {
+        double tokenChange = 0;
+
+        UnitInfo<Unit> candidateUnit = candidate.owningUnit;
+        Token candidateEnd = candidate.token;
+
+        // Form a chain of units affected by the insertion to be able to qualify change of unit ownership.
+        // A unit may be affected more than once.
+        UnitAdjustmentTracker<Unit> unitTracker = new UnitAdjustmentTracker<>(candidateUnit);
+
+        // Reflect change in ownership of the splitting token (candidate).
+        tokenChange += applyOwnershipAdjustment(candidate, candidateUnit, candidate.replicationStart, candidateEnd, optTokenOwnership, unitTracker);
+
+        // Loop through all vnodes that replicate candidate or split and update their ownership.
+        ReplicationVisitor replicationVisitor = new ReplicationVisitor();
+        for (TokenInfo<Unit> curr = candidate.split; !replicationVisitor.visitedAll(); curr = curr.next)
+        {
+            UnitInfo<Unit> currUnit = curr.owningUnit;
+
+            if (!replicationVisitor.add(currUnit.group))
+                continue;    // If this group is already seen, the token cannot be affected.
+
+            Token replicationEnd = curr.token;
+            Token replicationStart = findUpdatedReplicationStart(curr, candidate);
+            tokenChange += applyOwnershipAdjustment(curr, currUnit, replicationStart, replicationEnd, optTokenOwnership, unitTracker);
+        }
+        replicationVisitor.clean();
+
+        double nodeChange = unitTracker.calculateUnitChange(newUnitMult, optTokenOwnership);
+        return -(tokenChange + nodeChange);
+    }
+
+    /**
+     * Returns the start of the replication span for the token {@code curr} when {@code candidate} is inserted into the
+     * ring.
+     */
+    private Token findUpdatedReplicationStart(TokenInfo<Unit> curr, CandidateInfo<Unit> candidate)
+    {
+        return furtherStartToken(curr.replicationThreshold, candidate.token, curr.token);
+    }
+
+    /**
+     * Applies the ownership adjustment for the given element, updating tracked unit ownership and returning the change
+     * of variance.
+     */
+    private double applyOwnershipAdjustment(BaseTokenInfo<Unit, ?> curr, UnitInfo<Unit> currUnit,
+            Token replicationStart, Token replicationEnd,
+            double optTokenOwnership, UnitAdjustmentTracker<Unit> unitTracker)
+    {
+        double oldOwnership = curr.replicatedOwnership;
+        double newOwnership = replicationStart.size(replicationEnd);
+        double tokenCount = currUnit.tokenCount;
+        assert tokenCount > 0;
+        unitTracker.add(currUnit, newOwnership - oldOwnership);
+        return (sq(newOwnership - optTokenOwnership) - sq(oldOwnership - optTokenOwnership)) / sq(tokenCount);
+    }
+
+    /**
+     * Tracker for unit ownership changes. The changes are tracked by a chain of UnitInfos where the adjustedOwnership
+     * field is being updated as we see changes in token ownership.
+     *
+     * The chain ends with an element that points to itself; this element must be specified as argument to the
+     * constructor as well as be the first unit with which 'add' is called; when calculating the variance change
+     * a separate multiplier is applied to it (used to permit more freedom in choosing the first tokens of a unit).
+     */
+    private static class UnitAdjustmentTracker<Unit>
+    {
+        UnitInfo<Unit> unitsChain;
+
+        UnitAdjustmentTracker(UnitInfo<Unit> newUnit)
+        {
+            unitsChain = newUnit;
+        }
+
+        void add(UnitInfo<Unit> currUnit, double diff)
+        {
+            if (currUnit.prevUsed == null)
+            {
+                assert unitsChain.prevUsed != null || currUnit == unitsChain;
+
+                currUnit.adjustedOwnership = currUnit.ownership + diff;
+                currUnit.prevUsed = unitsChain;
+                unitsChain = currUnit;
+            }
+            else
+            {
+                currUnit.adjustedOwnership += diff;
+            }
+        }
+
+        double calculateUnitChange(double newUnitMult, double optTokenOwnership)
+        {
+            double unitChange = 0;
+            UnitInfo<Unit> unitsChain = this.unitsChain;
+            // Now loop through the units chain and add the unit-level changes. Also clear the groups' seen marks.
+            while (true)
+            {
+                double newOwnership = unitsChain.adjustedOwnership;
+                double oldOwnership = unitsChain.ownership;
+                double tokenCount = unitsChain.tokenCount;
+                double diff = (sq(newOwnership / tokenCount - optTokenOwnership) - sq(oldOwnership / tokenCount - optTokenOwnership));
+                UnitInfo<Unit> prev = unitsChain.prevUsed;
+                unitsChain.prevUsed = null;
+                if (unitsChain != prev)
+                    unitChange += diff;
+                else
+                {
+                    unitChange += diff * newUnitMult;
+                    break;
+                }
+                unitsChain = prev;
+            }
+            this.unitsChain = unitsChain;
+            return unitChange;
+        }
+    }
+
+
+    /**
+     * Helper class for marking/unmarking visited a chain of groups
+     */
+    private abstract class GroupVisitor
+    {
+        GroupInfo groupChain = GroupInfo.TERMINATOR;
+        int seen = 0;
+
+        abstract GroupInfo prevSeen(GroupInfo group);
+        abstract void setPrevSeen(GroupInfo group, GroupInfo prevSeen);
+
+        // true iff this is the first time we've visited this group
+        boolean add(GroupInfo group)
+        {
+            if (prevSeen(group) != null)
+                return false;
+            ++seen;
+            setPrevSeen(group, groupChain);
+            groupChain = group;
+            return true;
+        }
+
+        boolean visitedAll()
+        {
+            return seen >= replicas;
+        }
+
+        boolean seen(GroupInfo group)
+        {
+            return prevSeen(group) != null;
+        }
+
+        // Clean group seen markers.
+        void clean()
+        {
+            GroupInfo groupChain = this.groupChain;
+            while (groupChain != GroupInfo.TERMINATOR)
+            {
+                GroupInfo prev = prevSeen(groupChain);
+                setPrevSeen(groupChain, null);
+                groupChain = prev;
+            }
+            this.groupChain = GroupInfo.TERMINATOR;
+        }
+    }
+
+    private class ReplicationVisitor extends GroupVisitor
+    {
+        GroupInfo prevSeen(GroupInfo group)
+        {
+            return group.prevSeen;
+        }
+
+        void setPrevSeen(GroupInfo group, GroupInfo prevSeen)
+        {
+            group.prevSeen = prevSeen;
+        }
+    }
+
+    private class PopulateVisitor extends GroupVisitor
+    {
+        GroupInfo prevSeen(GroupInfo group)
+        {
+            return group.prevPopulate;
+        }
+
+        void setPrevSeen(GroupInfo group, GroupInfo prevSeen)
+        {
+            group.prevPopulate = prevSeen;
+        }
+    }
+
+    private Map.Entry<Token, Unit> mapEntryFor(Token t)
+    {
+        Map.Entry<Token, Unit> en = sortedTokens.floorEntry(t);
+        if (en == null)
+            en = sortedTokens.lastEntry();
+        return en;
+    }
+
+    Unit unitFor(Token t)
+    {
+        return mapEntryFor(t).getValue();
+    }
+
+    private double optimalTokenOwnership(int tokensToAdd)
+    {
+        return 1.0 * replicas / (sortedTokens.size() + tokensToAdd);
+    }
+
+    /**
+     * Selects from {@code t1}, {@code t2} the token that forms a bigger range with {@code towards} as the upper bound,
+     * taking into account wrapping.
+     * Unlike Token.size(), equality is taken to mean "same as" rather than covering the whole range.
+     */
+    private static Token furtherStartToken(Token t1, Token t2, Token towards)
+    {
+        if (t1.equals(towards))
+            return t2;
+        if (t2.equals(towards))
+            return t1;
+
+        return t1.size(towards) > t2.size(towards) ? t1 : t2;
+    }
+
+    private static double sq(double d)
+    {
+        return d * d;
+    }
+
+
+    /**
+     * For testing, remove the given unit preserving correct state of the allocator.
+     */
+    void removeUnit(Unit n)
+    {
+        Collection<Token> tokens = unitToTokens.removeAll(n);
+        sortedTokens.keySet().removeAll(tokens);
+    }
+
+    int unitCount()
+    {
+        return unitToTokens.asMap().size();
+    }
+
+    public String toString()
+    {
+        return getClass().getSimpleName();
+    }
+
+    // get or initialise the shared GroupInfo associated with the unit
+    private static <Unit> GroupInfo getGroup(Unit unit, Map<Object, GroupInfo> groupMap, ReplicationStrategy<Unit> strategy)
+    {
+        Object groupClass = strategy.getGroup(unit);
+        GroupInfo group = groupMap.get(groupClass);
+        if (group == null)
+            groupMap.put(groupClass, group = new GroupInfo(groupClass));
+        return group;
+    }
+
+    /**
+     * Unique group object that one or more UnitInfo objects link to.
+     */
+    private static class GroupInfo
+    {
+        /**
+         * Group identifier given by ReplicationStrategy.getGroup(Unit).
+         */
+        final Object group;
+
+        /**
+         * Seen marker. When non-null, the group is already seen in replication walks.
+         * Also points to previous seen group to enable walking the seen groups and clearing the seen markers.
+         */
+        GroupInfo prevSeen = null;
+        /**
+         * Same marker/chain used by populateTokenInfo.
+         */
+        GroupInfo prevPopulate = null;
+
+        /**
+         * Value used as terminator for seen chains.
+         */
+        static GroupInfo TERMINATOR = new GroupInfo(null);
+
+        public GroupInfo(Object group)
+        {
+            this.group = group;
+        }
+
+        public String toString()
+        {
+            return group.toString() + (prevSeen != null ? "*" : "");
+        }
+    }
+
+    /**
+     * Unit information created and used by ReplicationAwareTokenDistributor. Contained vnodes all point to the same
+     * instance.
+     */
+    static class UnitInfo<Unit>
+    {
+        final Unit unit;
+        final GroupInfo group;
+        double ownership;
+        int tokenCount;
+
+        /**
+         * During evaluateImprovement this is used to form a chain of units affected by the candidate insertion.
+         */
+        UnitInfo<Unit> prevUsed;
+        /**
+         * During evaluateImprovement this holds the ownership after the candidate insertion.
+         */
+        double adjustedOwnership;
+
+        private UnitInfo(Unit unit, GroupInfo group)
+        {
+            this.unit = unit;
+            this.group = group;
+            this.tokenCount = 0;
+        }
+
+        public UnitInfo(Unit unit, double ownership, Map<Object, GroupInfo> groupMap, ReplicationStrategy<Unit> strategy)
+        {
+            this(unit, getGroup(unit, groupMap, strategy));
+            this.ownership = ownership;
+        }
+
+        public String toString()
+        {
+            return String.format("%s%s(%.2e)%s",
+                    unit, unit == group.group ? (group.prevSeen != null ? "*" : "") : ":" + group.toString(),
+                    ownership, prevUsed != null ? (prevUsed == this ? "#" : "->" + prevUsed.toString()) : "");
+        }
+    }
+
+    private static class CircularList<T extends CircularList<T>>
+    {
+        T prev;
+        T next;
+
+        /**
+         * Inserts this after unit in the circular list which starts at head. Returns the new head of the list, which
+         * only changes if head was null.
+         */
+        @SuppressWarnings("unchecked")
+        T insertAfter(T head, T unit)
+        {
+            if (head == null)
+            {
+                return prev = next = (T) this;
+            }
+            assert unit != null;
+            assert unit.next != null;
+            prev = unit;
+            next = unit.next;
+            prev.next = (T) this;
+            next.prev = (T) this;
+            return head;
+        }
+
+        /**
+         * Removes this from the list that starts at head. Returns the new head of the list, which only changes if the
+         * head was removed.
+         */
+        T removeFrom(T head)
+        {
+            next.prev = prev;
+            prev.next = next;
+            return this == head ? (this == next ? null : next) : head;
+        }
+    }
+
+    private static class BaseTokenInfo<Unit, T extends BaseTokenInfo<Unit, T>> extends CircularList<T>
+    {
+        final Token token;
+        final UnitInfo<Unit> owningUnit;
+
+        /**
+         * Start of the replication span for the vnode, i.e. the first token of the RF'th group seen before the token.
+         * The replicated ownership of the unit is the range between {@code replicationStart} and {@code token}.
+         */
+        Token replicationStart;
+        /**
+         * The closest position that the new candidate can take to become the new replication start. If candidate is
+         * closer, the start moves to this position. Used to determine replicationStart after insertion of new token.
+         *
+         * Usually the RF minus one boundary, i.e. the first token of the RF-1'th group seen before the token.
+         */
+        Token replicationThreshold;
+        /**
+         * Current replicated ownership. This number is reflected in the owning unit's ownership.
+         */
+        double replicatedOwnership = 0;
+
+        public BaseTokenInfo(Token token, UnitInfo<Unit> owningUnit)
+        {
+            this.token = token;
+            this.owningUnit = owningUnit;
+        }
+
+        public String toString()
+        {
+            return String.format("%s(%s)", token, owningUnit);
+        }
+
+        /**
+         * Previous unit in the token ring. For existing tokens this is prev,
+         * for candidates it's "split".
+         */
+        TokenInfo<Unit> prevInRing()
+        {
+            return null;
+        }
+    }
+
+    /**
+     * TokenInfo about existing tokens/vnodes.
+     */
+    private static class TokenInfo<Unit> extends BaseTokenInfo<Unit, TokenInfo<Unit>>
+    {
+        public TokenInfo(Token token, UnitInfo<Unit> owningUnit)
+        {
+            super(token, owningUnit);
+        }
+
+        TokenInfo<Unit> prevInRing()
+        {
+            return prev;
+        }
+    }
+
+    /**
+     * TokenInfo about candidate new tokens/vnodes.
+     */
+    private static class CandidateInfo<Unit> extends BaseTokenInfo<Unit, CandidateInfo<Unit>>
+    {
+        // directly preceding token in the current token ring
+        final TokenInfo<Unit> split;
+
+        public CandidateInfo(Token token, TokenInfo<Unit> split, UnitInfo<Unit> owningUnit)
+        {
+            super(token, owningUnit);
+            this.split = split;
+        }
+
+        TokenInfo<Unit> prevInRing()
+        {
+            return split.prev;
+        }
+    }
+
+    static void dumpTokens(String lead, BaseTokenInfo<?, ?> tokens)
+    {
+        BaseTokenInfo<?, ?> token = tokens;
+        do
+        {
+            System.out.format("%s%s: rs %s rt %s size %.2e\n", lead, token, token.replicationStart, token.replicationThreshold, token.replicatedOwnership);
+            token = token.next;
+        } while (token != null && token != tokens);
+    }
+
+    static class Weighted<T> implements Comparable<Weighted<T>>
+    {
+        final double weight;
+        final T value;
+
+        public Weighted(double weight, T value)
+        {
+            this.weight = weight;
+            this.value = value;
+        }
+
+        @Override
+        public int compareTo(Weighted<T> o)
+        {
+            int cmp = Double.compare(o.weight, this.weight);
+            return cmp;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s<%s>", value, weight);
+        }
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/dht/tokenallocator/ReplicationStrategy.java b/src/java/org/apache/cassandra/dht/tokenallocator/ReplicationStrategy.java
new file mode 100644
index 0000000..6dbd37c
--- /dev/null
+++ b/src/java/org/apache/cassandra/dht/tokenallocator/ReplicationStrategy.java

@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.dht.tokenallocator;
+
+interface ReplicationStrategy<Unit>
+{
+    int replicas();
+
+    /**
+     * Returns a group identifier. getGroup(a) == getGroup(b) iff a and b are on the same group.
+     * @return Some hashable object.
+     */
+    Object getGroup(Unit unit);
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/dht/tokenallocator/TokenAllocation.java b/src/java/org/apache/cassandra/dht/tokenallocator/TokenAllocation.java
new file mode 100644
index 0000000..5501378
--- /dev/null
+++ b/src/java/org/apache/cassandra/dht/tokenallocator/TokenAllocation.java

@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.dht.tokenallocator;
+
+import java.net.InetAddress;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.locator.AbstractReplicationStrategy;
+import org.apache.cassandra.locator.IEndpointSnitch;
+import org.apache.cassandra.locator.NetworkTopologyStrategy;
+import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.locator.TokenMetadata.Topology;
+
+public class TokenAllocation
+{
+    private static final Logger logger = LoggerFactory.getLogger(TokenAllocation.class);
+
+    public static Collection<Token> allocateTokens(final TokenMetadata tokenMetadata,
+                                                   final AbstractReplicationStrategy rs,
+                                                   final InetAddress endpoint,
+                                                   int numTokens)
+    {
+        TokenMetadata tokenMetadataCopy = tokenMetadata.cloneOnlyTokenMap();
+        StrategyAdapter strategy = getStrategy(tokenMetadataCopy, rs, endpoint);
+        Collection<Token> tokens = create(tokenMetadata, strategy).addUnit(endpoint, numTokens);
+        tokens = adjustForCrossDatacenterClashes(tokenMetadata, strategy, tokens);
+
+        if (logger.isWarnEnabled())
+        {
+            logger.warn("Selected tokens {}", tokens);
+            SummaryStatistics os = replicatedOwnershipStats(tokenMetadataCopy, rs, endpoint);
+            tokenMetadataCopy.updateNormalTokens(tokens, endpoint);
+            SummaryStatistics ns = replicatedOwnershipStats(tokenMetadataCopy, rs, endpoint);
+            logger.warn("Replicated node load in datacentre before allocation " + statToString(os));
+            logger.warn("Replicated node load in datacentre after allocation " + statToString(ns));
+
+            // TODO: Is it worth doing the replicated ownership calculation always to be able to raise this alarm?
+            if (ns.getStandardDeviation() > os.getStandardDeviation())
+                logger.warn("Unexpected growth in standard deviation after allocation.");
+        }
+        return tokens;
+    }
+
+    private static Collection<Token> adjustForCrossDatacenterClashes(final TokenMetadata tokenMetadata,
+                                                                     StrategyAdapter strategy, Collection<Token> tokens)
+    {
+        List<Token> filtered = Lists.newArrayListWithCapacity(tokens.size());
+
+        for (Token t : tokens)
+        {
+            while (tokenMetadata.getEndpoint(t) != null)
+            {
+                InetAddress other = tokenMetadata.getEndpoint(t);
+                if (strategy.inAllocationRing(other))
+                    throw new ConfigurationException(String.format("Allocated token %s already assigned to node %s. Is another node also allocating tokens?", t, other));
+                t = t.increaseSlightly();
+            }
+            filtered.add(t);
+        }
+        return filtered;
+    }
+
+    // return the ratio of ownership for each endpoint
+    public static Map<InetAddress, Double> evaluateReplicatedOwnership(TokenMetadata tokenMetadata, AbstractReplicationStrategy rs)
+    {
+        Map<InetAddress, Double> ownership = Maps.newHashMap();
+        List<Token> sortedTokens = tokenMetadata.sortedTokens();
+        Iterator<Token> it = sortedTokens.iterator();
+        Token current = it.next();
+        while (it.hasNext())
+        {
+            Token next = it.next();
+            addOwnership(tokenMetadata, rs, current, next, ownership);
+            current = next;
+        }
+        addOwnership(tokenMetadata, rs, current, sortedTokens.get(0), ownership);
+
+        return ownership;
+    }
+
+    static void addOwnership(final TokenMetadata tokenMetadata, final AbstractReplicationStrategy rs, Token current, Token next, Map<InetAddress, Double> ownership)
+    {
+        double size = current.size(next);
+        Token representative = current.getPartitioner().midpoint(current, next);
+        for (InetAddress n : rs.calculateNaturalEndpoints(representative, tokenMetadata))
+        {
+            Double v = ownership.get(n);
+            ownership.put(n, v != null ? v + size : size);
+        }
+    }
+
+    public static String statToString(SummaryStatistics stat)
+    {
+        return String.format("max %.2f min %.2f stddev %.4f", stat.getMax() / stat.getMean(), stat.getMin() / stat.getMean(), stat.getStandardDeviation());
+    }
+
+    public static SummaryStatistics replicatedOwnershipStats(TokenMetadata tokenMetadata,
+                                                             AbstractReplicationStrategy rs, InetAddress endpoint)
+    {
+        SummaryStatistics stat = new SummaryStatistics();
+        StrategyAdapter strategy = getStrategy(tokenMetadata, rs, endpoint);
+        for (Map.Entry<InetAddress, Double> en : evaluateReplicatedOwnership(tokenMetadata, rs).entrySet())
+        {
+            // Filter only in the same datacentre.
+            if (strategy.inAllocationRing(en.getKey()))
+                stat.addValue(en.getValue() / tokenMetadata.getTokens(en.getKey()).size());
+        }
+        return stat;
+    }
+
+    static TokenAllocator<InetAddress> create(TokenMetadata tokenMetadata, StrategyAdapter strategy)
+    {
+        NavigableMap<Token, InetAddress> sortedTokens = new TreeMap<>();
+        for (Map.Entry<Token, InetAddress> en : tokenMetadata.getNormalAndBootstrappingTokenToEndpointMap().entrySet())
+        {
+            if (strategy.inAllocationRing(en.getValue()))
+                sortedTokens.put(en.getKey(), en.getValue());
+        }
+        return new ReplicationAwareTokenAllocator<>(sortedTokens, strategy, tokenMetadata.partitioner);
+    }
+
+    interface StrategyAdapter extends ReplicationStrategy<InetAddress>
+    {
+        // return true iff the provided endpoint occurs in the same virtual token-ring we are allocating for
+        // i.e. the set of the nodes that share ownership with the node we are allocating
+        // alternatively: return false if the endpoint's ownership is independent of the node we are allocating tokens for
+        boolean inAllocationRing(InetAddress other);
+    }
+
+    static StrategyAdapter getStrategy(final TokenMetadata tokenMetadata, final AbstractReplicationStrategy rs, final InetAddress endpoint)
+    {
+        if (rs instanceof NetworkTopologyStrategy)
+            return getStrategy(tokenMetadata, (NetworkTopologyStrategy) rs, rs.snitch, endpoint);
+        if (rs instanceof SimpleStrategy)
+            return getStrategy(tokenMetadata, (SimpleStrategy) rs, endpoint);
+        throw new ConfigurationException("Token allocation does not support replication strategy " + rs.getClass().getSimpleName());
+    }
+
+    static StrategyAdapter getStrategy(final TokenMetadata tokenMetadata, final SimpleStrategy rs, final InetAddress endpoint)
+    {
+        final int replicas = rs.getReplicationFactor();
+
+        return new StrategyAdapter()
+        {
+            @Override
+            public int replicas()
+            {
+                return replicas;
+            }
+
+            @Override
+            public Object getGroup(InetAddress unit)
+            {
+                return unit;
+            }
+
+            @Override
+            public boolean inAllocationRing(InetAddress other)
+            {
+                return true;
+            }
+        };
+    }
+
+    static StrategyAdapter getStrategy(final TokenMetadata tokenMetadata, final NetworkTopologyStrategy rs, final IEndpointSnitch snitch, final InetAddress endpoint)
+    {
+        final String dc = snitch.getDatacenter(endpoint);
+        final int replicas = rs.getReplicationFactor(dc);
+
+        Topology topology = tokenMetadata.getTopology();
+
+        // if topology hasn't been setup yet for this endpoint+rack then treat it as a separate unit
+        int racks = topology.getDatacenterRacks().get(dc) != null && topology.getDatacenterRacks().get(dc).containsKey(snitch.getRack(endpoint))
+                ? topology.getDatacenterRacks().get(dc).asMap().size()
+                : 1;
+
+        if (racks >= replicas)
+        {
+            return new StrategyAdapter()
+            {
+                @Override
+                public int replicas()
+                {
+                    return replicas;
+                }
+
+                @Override
+                public Object getGroup(InetAddress unit)
+                {
+                    return snitch.getRack(unit);
+                }
+
+                @Override
+                public boolean inAllocationRing(InetAddress other)
+                {
+                    return dc.equals(snitch.getDatacenter(other));
+                }
+            };
+        }
+        else if (racks == 1)
+        {
+            // One rack, each node treated as separate.
+            return new StrategyAdapter()
+            {
+                @Override
+                public int replicas()
+                {
+                    return replicas;
+                }
+
+                @Override
+                public Object getGroup(InetAddress unit)
+                {
+                    return unit;
+                }
+
+                @Override
+                public boolean inAllocationRing(InetAddress other)
+                {
+                    return dc.equals(snitch.getDatacenter(other));
+                }
+            };
+        }
+        else
+            throw new ConfigurationException(
+                    String.format("Token allocation failed: the number of racks %d in datacenter %s is lower than its replication factor %d.",
+                                  racks, dc, replicas));
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/dht/tokenallocator/TokenAllocator.java b/src/java/org/apache/cassandra/dht/tokenallocator/TokenAllocator.java
new file mode 100644
index 0000000..580f2ec
--- /dev/null
+++ b/src/java/org/apache/cassandra/dht/tokenallocator/TokenAllocator.java

@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.dht.tokenallocator;
+
+import java.util.Collection;
+
+import org.apache.cassandra.dht.Token;
+
+public interface TokenAllocator<Unit>
+{
+    public Collection<Token> addUnit(Unit newUnit, int numTokens);
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/exceptions/UnavailableException.java b/src/java/org/apache/cassandra/exceptions/UnavailableException.java
index baee0b2..7b4edd8 100644
--- a/src/java/org/apache/cassandra/exceptions/UnavailableException.java
+++ b/src/java/org/apache/cassandra/exceptions/UnavailableException.java

@@ -30,6 +30,11 @@
         this("Cannot achieve consistency level " + consistency, consistency, required, alive);
     }
 
+    public UnavailableException(ConsistencyLevel consistency, String dc, int required, int alive)
+    {
+        this("Cannot achieve consistency level " + consistency + " in DC " + dc, consistency, required, alive);
+    }
+
     public UnavailableException(String msg, ConsistencyLevel consistency, int required, int alive)
     {
         super(ExceptionCode.UNAVAILABLE, msg);

diff --git a/src/java/org/apache/cassandra/gms/EchoMessage.java b/src/java/org/apache/cassandra/gms/EchoMessage.java
index 2d4c095..339750d 100644
--- a/src/java/org/apache/cassandra/gms/EchoMessage.java
+++ b/src/java/org/apache/cassandra/gms/EchoMessage.java

@@ -21,10 +21,10 @@
  */
 
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 public final class EchoMessage
@@ -43,7 +43,7 @@
         {
         }
 
-        public EchoMessage deserialize(DataInput in, int version) throws IOException
+        public EchoMessage deserialize(DataInputPlus in, int version) throws IOException
         {
             return EchoMessage.instance;
         }

diff --git a/src/java/org/apache/cassandra/gms/EndpointState.java b/src/java/org/apache/cassandra/gms/EndpointState.java
index 931da8d..70f2a68 100644
--- a/src/java/org/apache/cassandra/gms/EndpointState.java
+++ b/src/java/org/apache/cassandra/gms/EndpointState.java

@@ -26,11 +26,10 @@
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-
 /**
  * This abstraction represents both the HeartBeatState and the ApplicationState in an EndpointState
  * instance. Any state for a given endpoint can be retrieved from this instance.
@@ -180,7 +179,7 @@
         }
     }
 
-    public EndpointState deserialize(DataInput in, int version) throws IOException
+    public EndpointState deserialize(DataInputPlus in, int version) throws IOException
     {
         HeartBeatState hbState = HeartBeatState.serializer.deserialize(in, version);
 
@@ -200,11 +199,11 @@
     {
         long size = HeartBeatState.serializer.serializedSize(epState.getHeartBeatState(), version);
         Set<Map.Entry<ApplicationState, VersionedValue>> states = epState.states();
-        size += TypeSizes.NATIVE.sizeof(states.size());
+        size += TypeSizes.sizeof(states.size());
         for (Map.Entry<ApplicationState, VersionedValue> state : states)
         {
             VersionedValue value = state.getValue();
-            size += TypeSizes.NATIVE.sizeof(state.getKey().ordinal());
+            size += TypeSizes.sizeof(state.getKey().ordinal());
             size += VersionedValue.serializer.serializedSize(value, version);
         }
         return size;

diff --git a/src/java/org/apache/cassandra/gms/FailureDetector.java b/src/java/org/apache/cassandra/gms/FailureDetector.java
index 679d0b8..69888a6 100644
--- a/src/java/org/apache/cassandra/gms/FailureDetector.java
+++ b/src/java/org/apache/cassandra/gms/FailureDetector.java

@@ -33,6 +33,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Clock;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MBeanWrapper;
 
@@ -50,7 +51,7 @@
     private static final int DEBUG_PERCENTAGE = 80; // if the phi is larger than this percentage of the max, log a debug message
     private static final long DEFAULT_MAX_PAUSE = 5000L * 1000000L; // 5 seconds
     private static final long MAX_LOCAL_PAUSE_IN_NANOS = getMaxLocalPause();
-    private long lastInterpret = System.nanoTime();
+    private long lastInterpret = Clock.instance.nanoTime();
     private long lastPause = 0L;
 
     private static long getMaxLocalPause()
@@ -242,7 +243,7 @@
 
     public void report(InetAddress ep)
     {
-        long now = System.nanoTime();
+        long now = Clock.instance.nanoTime();
         ArrivalWindow heartbeatWindow = arrivalSamples.get(ep);
         if (heartbeatWindow == null)
         {
@@ -269,7 +270,7 @@
         {
             return;
         }
-        long now = System.nanoTime();
+        long now = Clock.instance.nanoTime();
         long diff = now - lastInterpret;
         lastInterpret = now;
         if (diff > MAX_LOCAL_PAUSE_IN_NANOS)
@@ -278,7 +279,7 @@
             lastPause = now;
             return;
         }
-        if (System.nanoTime() - lastPause < MAX_LOCAL_PAUSE_IN_NANOS)
+        if (Clock.instance.nanoTime() - lastPause < MAX_LOCAL_PAUSE_IN_NANOS)
         {
             logger.debug("Still not marking nodes down due to local pause");
             return;
@@ -443,7 +444,7 @@
             }
             else
             {
-                logger.debug("Ignoring interval time of {} for {}", interArrivalTime, ep);
+                logger.trace("Ignoring interval time of {} for {}", interArrivalTime, ep);
             }
         }
         else

diff --git a/src/java/org/apache/cassandra/gms/GossipDigest.java b/src/java/org/apache/cassandra/gms/GossipDigest.java
index 471602e..9dfd486 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigest.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigest.java

@@ -22,6 +22,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
@@ -87,7 +88,7 @@
         out.writeInt(gDigest.maxVersion);
     }
 
-    public GossipDigest deserialize(DataInput in, int version) throws IOException
+    public GossipDigest deserialize(DataInputPlus in, int version) throws IOException
     {
         InetAddress endpoint = CompactEndpointSerializationHelper.deserialize(in);
         int generation = in.readInt();
@@ -98,8 +99,8 @@
     public long serializedSize(GossipDigest gDigest, int version)
     {
         long size = CompactEndpointSerializationHelper.serializedSize(gDigest.endpoint);
-        size += TypeSizes.NATIVE.sizeof(gDigest.generation);
-        size += TypeSizes.NATIVE.sizeof(gDigest.maxVersion);
+        size += TypeSizes.sizeof(gDigest.generation);
+        size += TypeSizes.sizeof(gDigest.maxVersion);
         return size;
     }
 }

diff --git a/src/java/org/apache/cassandra/gms/GossipDigestAck.java b/src/java/org/apache/cassandra/gms/GossipDigestAck.java
index e3be9aa..cf71ae6 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestAck.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestAck.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.gms;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.HashMap;
@@ -26,6 +25,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
@@ -71,7 +71,7 @@
         }
     }
 
-    public GossipDigestAck deserialize(DataInput in, int version) throws IOException
+    public GossipDigestAck deserialize(DataInputPlus in, int version) throws IOException
     {
         List<GossipDigest> gDigestList = GossipDigestSerializationHelper.deserialize(in, version);
         int size = in.readInt();
@@ -89,7 +89,7 @@
     public long serializedSize(GossipDigestAck ack, int version)
     {
         int size = GossipDigestSerializationHelper.serializedSize(ack.gDigestList, version);
-        size += TypeSizes.NATIVE.sizeof(ack.epStateMap.size());
+        size += TypeSizes.sizeof(ack.epStateMap.size());
         for (Map.Entry<InetAddress, EndpointState> entry : ack.epStateMap.entrySet())
             size += CompactEndpointSerializationHelper.serializedSize(entry.getKey())
                     + EndpointState.serializer.serializedSize(entry.getValue(), version);

diff --git a/src/java/org/apache/cassandra/gms/GossipDigestAck2.java b/src/java/org/apache/cassandra/gms/GossipDigestAck2.java
index 4a6a06e..9d779fe 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestAck2.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestAck2.java

@@ -24,6 +24,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
@@ -61,7 +62,7 @@
         }
     }
 
-    public GossipDigestAck2 deserialize(DataInput in, int version) throws IOException
+    public GossipDigestAck2 deserialize(DataInputPlus in, int version) throws IOException
     {
         int size = in.readInt();
         Map<InetAddress, EndpointState> epStateMap = new HashMap<InetAddress, EndpointState>(size);
@@ -77,7 +78,7 @@
 
     public long serializedSize(GossipDigestAck2 ack2, int version)
     {
-        long size = TypeSizes.NATIVE.sizeof(ack2.epStateMap.size());
+        long size = TypeSizes.sizeof(ack2.epStateMap.size());
         for (Map.Entry<InetAddress, EndpointState> entry : ack2.epStateMap.entrySet())
             size += CompactEndpointSerializationHelper.serializedSize(entry.getKey())
                     + EndpointState.serializer.serializedSize(entry.getValue(), version);

diff --git a/src/java/org/apache/cassandra/gms/GossipDigestSyn.java b/src/java/org/apache/cassandra/gms/GossipDigestSyn.java
index 0ad67bd..17c8da3 100644
--- a/src/java/org/apache/cassandra/gms/GossipDigestSyn.java
+++ b/src/java/org/apache/cassandra/gms/GossipDigestSyn.java

@@ -23,6 +23,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
@@ -59,7 +60,7 @@
             GossipDigest.serializer.serialize(gDigest, out, version);
     }
 
-    static List<GossipDigest> deserialize(DataInput in, int version) throws IOException
+    static List<GossipDigest> deserialize(DataInputPlus in, int version) throws IOException
     {
         int size = in.readInt();
         List<GossipDigest> gDigests = new ArrayList<GossipDigest>(size);
@@ -70,7 +71,7 @@
 
     static int serializedSize(List<GossipDigest> digests, int version)
     {
-        int size = TypeSizes.NATIVE.sizeof(digests.size());
+        int size = TypeSizes.sizeof(digests.size());
         for (GossipDigest digest : digests)
             size += GossipDigest.serializer.serializedSize(digest, version);
         return size;
@@ -86,7 +87,7 @@
         GossipDigestSerializationHelper.serialize(gDigestSynMessage.gDigests, out, version);
     }
 
-    public GossipDigestSyn deserialize(DataInput in, int version) throws IOException
+    public GossipDigestSyn deserialize(DataInputPlus in, int version) throws IOException
     {
         String clusterId = in.readUTF();
         String partioner = null;
@@ -97,8 +98,8 @@
 
     public long serializedSize(GossipDigestSyn syn, int version)
     {
-        long size = TypeSizes.NATIVE.sizeof(syn.clusterId);
-        size += TypeSizes.NATIVE.sizeof(syn.partioner);
+        long size = TypeSizes.sizeof(syn.clusterId);
+        size += TypeSizes.sizeof(syn.partioner);
         size += GossipDigestSerializationHelper.serializedSize(syn.gDigests, version);
         return size;
     }

diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java
index bd4fe13..7984dd4 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java

@@ -23,14 +23,19 @@
 import java.util.Map.Entry;
 import java.util.concurrent.*;
 import java.util.concurrent.locks.ReentrantLock;
+import java.util.stream.Collectors;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
+import com.google.common.util.concurrent.ListenableFutureTask;
 import com.google.common.util.concurrent.Uninterruptibles;
 
+import io.netty.util.concurrent.FastThreadLocal;
 import org.apache.cassandra.utils.ExecutorUtils;
 import org.apache.cassandra.utils.MBeanWrapper;
+import org.apache.cassandra.utils.NoSpamLogger;
 import org.apache.cassandra.utils.Pair;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -62,11 +67,17 @@
  * of the three above mentioned messages updates the Failure Detector with the liveness information.
  * Upon hearing a GossipShutdownMessage, this module will instantly mark the remote node as down in
  * the Failure Detector.
+ *
+ * This class is not threadsafe and any state changes should happen in the gossip stage.
  */
 
 public class Gossiper implements IFailureDetectionEventListener, GossiperMBean
 {
     public static final String MBEAN_NAME = "org.apache.cassandra.net:type=Gossiper";
+    public static class Props
+    {
+        public static final String DISABLE_THREAD_VALIDATION = "cassandra.gossip.disable_thread_validation";
+    }
 
     private static final DebuggableScheduledThreadPoolExecutor executor = new DebuggableScheduledThreadPoolExecutor("GossipTasks");
 
@@ -85,6 +96,7 @@
     public final static int intervalInMillis = 1000;
     public final static int QUARANTINE_DELAY = StorageService.RING_DELAY * 2;
     private static final Logger logger = LoggerFactory.getLogger(Gossiper.class);
+    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 15L, TimeUnit.MINUTES);
     public static final Gossiper instance = new Gossiper();
 
     // Timestamp to prevent processing any in-flight messages for we've not send any SYN yet, see CASSANDRA-12653.
@@ -128,12 +140,42 @@
     private final Map<InetAddress, Long> expireTimeEndpointMap = new ConcurrentHashMap<InetAddress, Long>();
 
     private volatile boolean inShadowRound = false;
-
     // endpoint states as gathered during shadow round
     private final Map<InetAddress, EndpointState> endpointShadowStateMap = new ConcurrentHashMap<>();
 
     private volatile long lastProcessedMessageAt = System.currentTimeMillis();
 
+    private static FastThreadLocal<Boolean> isGossipStage = new FastThreadLocal<>();
+
+    private static final boolean disableThreadValidation = Boolean.getBoolean(Props.DISABLE_THREAD_VALIDATION);
+
+    private static boolean isInGossipStage()
+    {
+        Boolean isGossip = isGossipStage.get();
+        if (isGossip == null)
+        {
+            isGossip = Thread.currentThread().getName().contains(Stage.GOSSIP.getJmxName());
+            isGossipStage.set(isGossip);
+        }
+        return isGossip;
+    }
+
+    private static void checkProperThreadForStateMutation()
+    {
+        if (disableThreadValidation || isInGossipStage())
+            return;
+
+        IllegalStateException e = new IllegalStateException("Attempting gossip state mutation from illegal thread: " + Thread.currentThread().getName());
+        if (DatabaseDescriptor.strictRuntimeChecks())
+        {
+            throw e;
+        }
+        else
+        {
+            noSpamLogger.getStatement(Throwables.getStackTraceAsString(e)).error(e.getMessage(), e);
+        }
+    }
+
     private class GossipTask implements Runnable
     {
         public void run()
@@ -321,6 +363,27 @@
         return state.equals(VersionedValue.SHUTDOWN);
     }
 
+    public static void runInGossipStageBlocking(Runnable runnable)
+    {
+        // run immediately if we're already in the gossip stage
+        if (isInGossipStage())
+        {
+            runnable.run();
+            return;
+        }
+
+        ListenableFutureTask task = ListenableFutureTask.create(runnable, null);
+        StageManager.getStage(Stage.GOSSIP).execute(task);
+        try
+        {
+            task.get();
+        }
+        catch (InterruptedException | ExecutionException e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
     /**
      * This method is part of IFailureDetectionEventListener interface. This is invoked
      * by the Failure Detector when it convicts an end point.
@@ -329,24 +392,26 @@
      */
     public void convict(InetAddress endpoint, double phi)
     {
-        EndpointState epState = endpointStateMap.get(endpoint);
-        if (epState == null)
-            return;
+        runInGossipStageBlocking(() -> {
+            EndpointState epState = endpointStateMap.get(endpoint);
+            if (epState == null)
+                return;
 
-        if (!epState.isAlive())
-            return;
+            if (!epState.isAlive())
+                return;
 
-        logger.debug("Convicting {} with status {} - alive {}", endpoint, getGossipStatus(epState), epState.isAlive());
+            logger.debug("Convicting {} with status {} - alive {}", endpoint, getGossipStatus(epState), epState.isAlive());
 
 
-        if (isShutdown(endpoint))
-        {
-            markAsShutdown(endpoint);
-        }
-        else
-        {
-            markDead(endpoint, epState);
-        }
+            if (isShutdown(endpoint))
+            {
+                markAsShutdown(endpoint);
+            }
+            else
+            {
+                markDead(endpoint, epState);
+            }
+        });
     }
 
     /**
@@ -355,10 +420,12 @@
      */
     protected void markAsShutdown(InetAddress endpoint)
     {
+        checkProperThreadForStateMutation();
         EndpointState epState = endpointStateMap.get(endpoint);
         if (epState == null)
             return;
         epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.shutdown(true));
+        epState.addApplicationState(ApplicationState.RPC_READY, StorageService.instance.valueFactory.rpcReady(false));
         epState.getHeartBeatState().forceHighestPossibleVersionUnsafe();
         markDead(endpoint, epState);
         FailureDetector.instance.forceConviction(endpoint);
@@ -385,6 +452,7 @@
      */
     private void evictFromMembership(InetAddress endpoint)
     {
+        checkProperThreadForStateMutation();
         unreachableEndpoints.remove(endpoint);
         endpointStateMap.remove(endpoint);
         expireTimeEndpointMap.remove(endpoint);
@@ -399,6 +467,7 @@
      */
     public void removeEndpoint(InetAddress endpoint)
     {
+        checkProperThreadForStateMutation();
         // do subscribers first so anything in the subscriber that depends on gossiper state won't get confused
         for (IEndpointStateChangeSubscriber subscriber : subscribers)
             subscriber.onRemove(endpoint);
@@ -459,6 +528,7 @@
      */
     public void replacedEndpoint(InetAddress endpoint)
     {
+        checkProperThreadForStateMutation();
         removeEndpoint(endpoint);
         evictFromMembership(endpoint);
         replacementQuarantine(endpoint);
@@ -571,16 +641,33 @@
     public void assassinateEndpoint(String address) throws UnknownHostException
     {
         InetAddress endpoint = InetAddress.getByName(address);
-        EndpointState epState = endpointStateMap.get(endpoint);
-        Collection<Token> tokens = null;
-        logger.warn("Assassinating {} via gossip", endpoint);
+        runInGossipStageBlocking(() -> {
+            EndpointState epState = endpointStateMap.get(endpoint);
+            Collection<Token> tokens = null;
+            logger.warn("Assassinating {} via gossip", endpoint);
 
-        if (epState == null)
-        {
-            epState = new EndpointState(new HeartBeatState((int) ((System.currentTimeMillis() + 60000) / 1000), 9999));
-        }
-        else
-        {
+            if (epState == null)
+            {
+                epState = new EndpointState(new HeartBeatState((int) ((System.currentTimeMillis() + 60000) / 1000), 9999));
+            }
+            else
+            {
+                int generation = epState.getHeartBeatState().getGeneration();
+                int heartbeat = epState.getHeartBeatState().getHeartBeatVersion();
+                logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY, endpoint);
+                Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
+                // make sure it did not change
+                EndpointState newState = endpointStateMap.get(endpoint);
+                if (newState == null)
+                    logger.warn("Endpoint {} disappeared while trying to assassinate, continuing anyway", endpoint);
+                else if (newState.getHeartBeatState().getGeneration() != generation)
+                    throw new RuntimeException("Endpoint still alive: " + endpoint + " generation changed while trying to assassinate it");
+                else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat)
+                    throw new RuntimeException("Endpoint still alive: " + endpoint + " heartbeat changed while trying to assassinate it");
+                epState.updateTimestamp(); // make sure we don't evict it too soon
+                epState.getHeartBeatState().forceNewerGenerationUnsafe();
+            }
+
             try
             {
                 tokens = StorageService.instance.getTokenMetadata().getTokens(endpoint);
@@ -590,29 +677,15 @@
                 JVMStabilityInspector.inspectThrowable(th);
                 // TODO this is broken
                 logger.warn("Unable to calculate tokens for {}.  Will use a random one", address);
-                tokens = Collections.singletonList(StorageService.getPartitioner().getRandomToken());
+                tokens = Collections.singletonList(StorageService.instance.getTokenMetadata().partitioner.getRandomToken());
             }
-            int generation = epState.getHeartBeatState().getGeneration();
-            int heartbeat = epState.getHeartBeatState().getHeartBeatVersion();
-            logger.info("Sleeping for {}ms to ensure {} does not change", StorageService.RING_DELAY, endpoint);
-            Uninterruptibles.sleepUninterruptibly(StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
-            // make sure it did not change
-            EndpointState newState = endpointStateMap.get(endpoint);
-            if (newState == null)
-                logger.warn("Endpoint {} disappeared while trying to assassinate, continuing anyway", endpoint);
-            else if (newState.getHeartBeatState().getGeneration() != generation)
-                throw new RuntimeException("Endpoint still alive: " + endpoint + " generation changed while trying to assassinate it");
-            else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat)
-                throw new RuntimeException("Endpoint still alive: " + endpoint + " heartbeat changed while trying to assassinate it");
-            epState.updateTimestamp(); // make sure we don't evict it too soon
-            epState.getHeartBeatState().forceNewerGenerationUnsafe();
-        }
 
-        // do not pass go, do not collect 200 dollars, just gtfo
-        epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.left(tokens, computeExpireTime()));
-        handleMajorStateChange(endpoint, epState);
-        Uninterruptibles.sleepUninterruptibly(intervalInMillis * 4, TimeUnit.MILLISECONDS);
-        logger.warn("Finished assassinating {}", endpoint);
+            // do not pass go, do not collect 200 dollars, just gtfo
+            epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.left(tokens, computeExpireTime()));
+            handleMajorStateChange(endpoint, epState);
+            Uninterruptibles.sleepUninterruptibly(intervalInMillis * 4, TimeUnit.MILLISECONDS);
+            logger.warn("Finished assassinating {}", endpoint);
+        });
     }
 
     public boolean isKnownEndpoint(InetAddress endpoint)
@@ -735,7 +808,8 @@
         return !unsafeStatuses.contains(status);
     }
 
-    private void doStatusCheck()
+    @VisibleForTesting
+    void doStatusCheck()
     {
         if (logger.isTraceEnabled())
             logger.trace("Performing status check ...");
@@ -774,8 +848,10 @@
                     && TimeUnit.NANOSECONDS.toMillis(nowNano - epState.getUpdateTimestamp()) > fatClientTimeout)
                 {
                     logger.info("FatClient {} has been silent for {}ms, removing from gossip", endpoint, fatClientTimeout);
-                    removeEndpoint(endpoint); // will put it in justRemovedEndpoints to respect quarantine delay
-                    evictFromMembership(endpoint); // can get rid of the state immediately
+                    runInGossipStageBlocking(() -> {
+                        removeEndpoint(endpoint); // will put it in justRemovedEndpoints to respect quarantine delay
+                        evictFromMembership(endpoint); // can get rid of the state immediately
+                    });
                 }
 
                 // check for dead state removal
@@ -787,7 +863,7 @@
                     {
                         logger.debug("time is expiring for endpoint : {} ({})", endpoint, expireTime);
                     }
-                    evictFromMembership(endpoint);
+                    runInGossipStageBlocking(() -> evictFromMembership(endpoint));
                 }
             }
         }
@@ -818,6 +894,20 @@
         return endpointStateMap.get(ep);
     }
 
+    public boolean valuesEqual(InetAddress ep1, InetAddress ep2, ApplicationState as)
+    {
+        EndpointState state1 = getEndpointStateForEndpoint(ep1);
+        EndpointState state2 = getEndpointStateForEndpoint(ep2);
+
+        if (state1 == null || state2 == null)
+            return false;
+
+        VersionedValue value1 = state1.getApplicationState(as);
+        VersionedValue value2 = state2.getApplicationState(as);
+
+        return !(value1 == null || value2 == null) && value1.value.equals(value2.value);
+    }
+
     public Set<Entry<InetAddress, EndpointState>> getEndpointStates()
     {
         return endpointStateMap.entrySet();
@@ -962,7 +1052,7 @@
 
             public void response(MessageIn msg)
             {
-                realMarkAlive(addr, localState);
+                runInGossipStageBlocking(() -> realMarkAlive(addr, localState));
             }
         };
 
@@ -972,6 +1062,7 @@
     @VisibleForTesting
     public void realMarkAlive(final InetAddress addr, final EndpointState localState)
     {
+        checkProperThreadForStateMutation();
         if (logger.isTraceEnabled())
             logger.trace("marking as alive {}", addr);
         localState.markAlive();
@@ -990,6 +1081,7 @@
     @VisibleForTesting
     public void markDead(InetAddress addr, EndpointState localState)
     {
+        checkProperThreadForStateMutation();
         if (logger.isTraceEnabled())
             logger.trace("marking as down {}", addr);
         localState.markDead();
@@ -1010,6 +1102,7 @@
      */
     private void handleMajorStateChange(InetAddress ep, EndpointState epState)
     {
+        checkProperThreadForStateMutation();
         EndpointState localEpState = endpointStateMap.get(ep);
         if (!isDeadState(epState))
         {
@@ -1081,6 +1174,7 @@
 
     void applyStateLocally(Map<InetAddress, EndpointState> epStateMap)
     {
+        checkProperThreadForStateMutation();
         for (Entry<InetAddress, EndpointState> entry : epStateMap.entrySet())
         {
             InetAddress ep = entry.getKey();
@@ -1163,10 +1257,24 @@
 
         Set<Entry<ApplicationState, VersionedValue>> remoteStates = remoteState.states();
         assert remoteState.getHeartBeatState().getGeneration() == localState.getHeartBeatState().getGeneration();
-        localState.addApplicationStates(remoteStates);
 
-        for (Entry<ApplicationState, VersionedValue> remoteEntry : remoteStates)
-            doOnChangeNotifications(addr, remoteEntry.getKey(), remoteEntry.getValue());
+        // filter out the states that are already up to date (has the same or higher version)
+        Set<Entry<ApplicationState, VersionedValue>> updatedStates = remoteStates.stream().filter(entry -> {
+            VersionedValue local = localState.getApplicationState(entry.getKey());
+            return (local == null || local.version < entry.getValue().version);
+            }).collect(Collectors.toSet());
+
+        if (logger.isTraceEnabled() && updatedStates.size() > 0)
+        {
+            for (Entry<ApplicationState, VersionedValue> entry : updatedStates)
+            {
+                logger.trace("Updating {} state version to {} for {}", entry.getKey().toString(), entry.getValue().version, addr);
+            }
+        }
+        localState.addApplicationStates(updatedStates);
+
+        for (Entry<ApplicationState, VersionedValue> updatedEntry : updatedStates)
+            doOnChangeNotifications(addr, updatedEntry.getKey(), updatedEntry.getValue());
     }
     
     // notify that a local application state is going to change (doesn't get triggered for remote changes)
@@ -1326,11 +1434,6 @@
     public synchronized Map<InetAddress, EndpointState> doShadowRound()
     {
         buildSeedsList();
-        // it may be that the local address is the only entry in the seed
-        // list in which case, attempting a shadow round is pointless
-        if (seeds.isEmpty())
-            return endpointShadowStateMap;
-
         endpointShadowStateMap.clear();
         // send a completely empty syn
         List<GossipDigest> gDigests = new ArrayList<GossipDigest>();
@@ -1402,6 +1505,7 @@
      */
     public void addSavedEndpoint(InetAddress ep)
     {
+        checkProperThreadForStateMutation();
         if (ep.equals(FBUtilities.getBroadcastAddress()))
         {
             logger.debug("Attempt to add self as saved endpoint");

diff --git a/src/java/org/apache/cassandra/gms/HeartBeatState.java b/src/java/org/apache/cassandra/gms/HeartBeatState.java
index 901f1c7..13e1ace 100644
--- a/src/java/org/apache/cassandra/gms/HeartBeatState.java
+++ b/src/java/org/apache/cassandra/gms/HeartBeatState.java

@@ -21,6 +21,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
@@ -83,13 +84,13 @@
         out.writeInt(hbState.getHeartBeatVersion());
     }
 
-    public HeartBeatState deserialize(DataInput in, int version) throws IOException
+    public HeartBeatState deserialize(DataInputPlus in, int version) throws IOException
     {
         return new HeartBeatState(in.readInt(), in.readInt());
     }
 
     public long serializedSize(HeartBeatState state, int version)
     {
-        return TypeSizes.NATIVE.sizeof(state.getGeneration()) + TypeSizes.NATIVE.sizeof(state.getHeartBeatVersion());
+        return TypeSizes.sizeof(state.getGeneration()) + TypeSizes.sizeof(state.getHeartBeatVersion());
     }
 }

diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 661d3ba..d9c8d0b 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java

@@ -18,7 +18,6 @@
 package org.apache.cassandra.gms;
 
 import java.io.*;
-
 import java.net.InetAddress;
 import java.util.Collection;
 import java.util.UUID;
@@ -31,10 +30,10 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.FBUtilities;
-
 import org.apache.commons.lang3.StringUtils;
 
 
@@ -283,7 +282,7 @@
             return value.value;
         }
 
-        public VersionedValue deserialize(DataInput in, int version) throws IOException
+        public VersionedValue deserialize(DataInputPlus in, int version) throws IOException
         {
             String value = in.readUTF();
             int valVersion = in.readInt();
@@ -292,7 +291,7 @@
 
         public long serializedSize(VersionedValue value, int version)
         {
-            return TypeSizes.NATIVE.sizeof(outValue(value, version)) + TypeSizes.NATIVE.sizeof(value.version);
+            return TypeSizes.sizeof(outValue(value, version)) + TypeSizes.sizeof(value.version);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java b/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java
deleted file mode 100644
index d55f205..0000000
--- a/src/java/org/apache/cassandra/hadoop/AbstractColumnFamilyInputFormat.java
+++ /dev/null

@@ -1,314 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Future;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.datastax.driver.core.Cluster;
-import com.datastax.driver.core.Host;
-import com.datastax.driver.core.Metadata;
-import com.datastax.driver.core.ResultSet;
-import com.datastax.driver.core.Row;
-import com.datastax.driver.core.Session;
-import com.datastax.driver.core.TokenRange;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
-import org.apache.cassandra.thrift.KeyRange;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-
-public abstract class AbstractColumnFamilyInputFormat<K, Y> extends InputFormat<K, Y> implements org.apache.hadoop.mapred.InputFormat<K, Y>
-{
-    private static final Logger logger = LoggerFactory.getLogger(AbstractColumnFamilyInputFormat.class);
-
-    public static final String MAPRED_TASK_ID = "mapred.task.id";
-    // The simple fact that we need this is because the old Hadoop API wants us to "write"
-    // to the key and value whereas the new asks for it.
-    // I choose 8kb as the default max key size (instantiated only once), but you can
-    // override it in your jobConf with this setting.
-    public static final String CASSANDRA_HADOOP_MAX_KEY_SIZE = "cassandra.hadoop.max_key_size";
-    public static final int    CASSANDRA_HADOOP_MAX_KEY_SIZE_DEFAULT = 8192;
-
-    private String keyspace;
-    private String cfName;
-    private IPartitioner partitioner;
-
-    protected void validateConfiguration(Configuration conf)
-    {
-        if (ConfigHelper.getInputKeyspace(conf) == null || ConfigHelper.getInputColumnFamily(conf) == null)
-        {
-            throw new UnsupportedOperationException("you must set the keyspace and table with setInputColumnFamily()");
-        }
-        if (ConfigHelper.getInputInitialAddress(conf) == null)
-            throw new UnsupportedOperationException("You must set the initial output address to a Cassandra node with setInputInitialAddress");
-        if (ConfigHelper.getInputPartitioner(conf) == null)
-            throw new UnsupportedOperationException("You must set the Cassandra partitioner class with setInputPartitioner");
-    }
-
-    public List<InputSplit> getSplits(JobContext context) throws IOException
-    {
-        Configuration conf = HadoopCompat.getConfiguration(context);
-
-        validateConfiguration(conf);
-
-        keyspace = ConfigHelper.getInputKeyspace(conf);
-        cfName = ConfigHelper.getInputColumnFamily(conf);
-        partitioner = ConfigHelper.getInputPartitioner(conf);
-        logger.trace("partitioner is {}", partitioner);
-
-        // canonical ranges and nodes holding replicas
-        Map<TokenRange, Set<Host>> masterRangeNodes = getRangeMap(conf, keyspace);
-
-        // canonical ranges, split into pieces, fetching the splits in parallel
-        ExecutorService executor = new ThreadPoolExecutor(0, 128, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
-        List<InputSplit> splits = new ArrayList<>();
-
-        List<Future<List<InputSplit>>> splitfutures = new ArrayList<>();
-        KeyRange jobKeyRange = ConfigHelper.getInputKeyRange(conf);
-        Range<Token> jobRange = null;
-        if (jobKeyRange != null)
-        {
-            if (jobKeyRange.start_key != null)
-            {
-                if (!partitioner.preservesOrder())
-                    throw new UnsupportedOperationException("KeyRange based on keys can only be used with a order preserving partitioner");
-                if (jobKeyRange.start_token != null)
-                    throw new IllegalArgumentException("only start_key supported");
-                if (jobKeyRange.end_token != null)
-                    throw new IllegalArgumentException("only start_key supported");
-                jobRange = new Range<>(partitioner.getToken(jobKeyRange.start_key),
-                                       partitioner.getToken(jobKeyRange.end_key));
-            }
-            else if (jobKeyRange.start_token != null)
-            {
-                jobRange = new Range<>(partitioner.getTokenFactory().fromString(jobKeyRange.start_token),
-                                       partitioner.getTokenFactory().fromString(jobKeyRange.end_token));
-            }
-            else
-            {
-                logger.warn("ignoring jobKeyRange specified without start_key or start_token");
-            }
-        }
-
-        try (Cluster cluster = CqlConfigHelper.getInputCluster(ConfigHelper.getInputInitialAddress(conf).split(","), conf);
-             Session session = cluster.connect())
-        {
-            Metadata metadata = session.getCluster().getMetadata();
-
-            for (TokenRange range : masterRangeNodes.keySet())
-            {
-                if (jobRange == null)
-                {
-                    // for each tokenRange, pick a live owner and ask it to compute bite-sized splits
-                    splitfutures.add(executor.submit(new SplitCallable(range, masterRangeNodes.get(range), conf, session)));
-                }
-                else
-                {
-                    TokenRange jobTokenRange = rangeToTokenRange(metadata, jobRange);
-                    if (range.intersects(jobTokenRange))
-                    {
-                        for (TokenRange intersection: range.intersectWith(jobTokenRange))
-                        {
-                            // for each tokenRange, pick a live owner and ask it to compute bite-sized splits
-                            splitfutures.add(executor.submit(new SplitCallable(intersection,  masterRangeNodes.get(range), conf, session)));
-                        }
-                    }
-                }
-            }
-
-            // wait until we have all the results back
-            for (Future<List<InputSplit>> futureInputSplits : splitfutures)
-            {
-                try
-                {
-                    splits.addAll(futureInputSplits.get());
-                }
-                catch (Exception e)
-                {
-                    throw new IOException("Could not get input splits", e);
-                }
-            }
-        }
-        finally
-        {
-            executor.shutdownNow();
-        }
-
-        assert splits.size() > 0;
-        Collections.shuffle(splits, new Random(System.nanoTime()));
-        return splits;
-    }
-
-    private TokenRange rangeToTokenRange(Metadata metadata, Range<Token> range)
-    {
-        return metadata.newTokenRange(metadata.newToken(partitioner.getTokenFactory().toString(range.left)),
-                metadata.newToken(partitioner.getTokenFactory().toString(range.right)));
-    }
-
-    /**
-     * Gets a token tokenRange and splits it up according to the suggested
-     * size into input splits that Hadoop can use.
-     */
-    class SplitCallable implements Callable<List<InputSplit>>
-    {
-
-        private final TokenRange tokenRange;
-        private final Set<Host> hosts;
-        private final Configuration conf;
-        private final Session session;
-
-        public SplitCallable(TokenRange tr, Set<Host> hosts, Configuration conf, Session session)
-        {
-            this.tokenRange = tr;
-            this.hosts = hosts;
-            this.conf = conf;
-            this.session = session;
-        }
-
-        public List<InputSplit> call() throws Exception
-        {
-            ArrayList<InputSplit> splits = new ArrayList<>();
-            Map<TokenRange, Long> subSplits;
-            subSplits = getSubSplits(keyspace, cfName, tokenRange, conf, session);
-            // turn the sub-ranges into InputSplits
-            String[] endpoints = new String[hosts.size()];
-
-            // hadoop needs hostname, not ip
-            int endpointIndex = 0;
-            for (Host endpoint : hosts)
-                endpoints[endpointIndex++] = endpoint.getAddress().getHostName();
-
-            for (TokenRange subSplit : subSplits.keySet())
-            {
-                List<TokenRange> ranges = subSplit.unwrap();
-                for (TokenRange subrange : ranges)
-                {
-                    ColumnFamilySplit split =
-                            new ColumnFamilySplit(
-                                    partitioner.preservesOrder() ?
-                                            subrange.getStart().toString().substring(2) : subrange.getStart().toString(),
-                                    partitioner.preservesOrder() ?
-                                            subrange.getEnd().toString().substring(2) : subrange.getEnd().toString(),
-                                    subSplits.get(subSplit),
-                                    endpoints);
-
-                    logger.trace("adding {}", split);
-                    splits.add(split);
-                }
-            }
-            return splits;
-        }
-    }
-
-    private Map<TokenRange, Long> getSubSplits(String keyspace, String cfName, TokenRange range, Configuration conf, Session session) throws IOException
-    {
-        int splitSize = ConfigHelper.getInputSplitSize(conf);
-        int splitSizeMb = ConfigHelper.getInputSplitSizeInMb(conf);
-        try
-        {
-            return describeSplits(keyspace, cfName, range, splitSize, splitSizeMb, session);
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    private Map<TokenRange, Set<Host>> getRangeMap(Configuration conf, String keyspace)
-    {
-        try (Cluster cluster = CqlConfigHelper.getInputCluster(ConfigHelper.getInputInitialAddress(conf).split(","), conf))
-        {
-            Map<TokenRange, Set<Host>> map = new HashMap<>();
-            Metadata metadata = cluster.connect().getCluster().getMetadata();
-            for (TokenRange tokenRange : metadata.getTokenRanges())
-                map.put(tokenRange, metadata.getReplicas('"' + keyspace + '"', tokenRange));
-            return map;
-        }
-    }
-
-    private Map<TokenRange, Long> describeSplits(String keyspace, String table, TokenRange tokenRange, int splitSize, int splitSizeMb, Session session)
-    {
-        String query = String.format("SELECT mean_partition_size, partitions_count " +
-                                     "FROM %s.%s " +
-                                     "WHERE keyspace_name = ? AND table_name = ? AND range_start = ? AND range_end = ?",
-                                     SystemKeyspace.NAME,
-                                     SystemKeyspace.SIZE_ESTIMATES);
-
-        ResultSet resultSet = session.execute(query, keyspace, table, tokenRange.getStart().toString(), tokenRange.getEnd().toString());
-
-        Row row = resultSet.one();
-        // If we have no data on this split, return the full split i.e., do not sub-split
-        // Assume smallest granularity of partition count available from CASSANDRA-7688
-        if (row == null)
-        {
-            Map<TokenRange, Long> wrappedTokenRange = new HashMap<>();
-            wrappedTokenRange.put(tokenRange, (long) 128);
-            return wrappedTokenRange;
-        }
-
-        long meanPartitionSize = row.getLong("mean_partition_size");
-        long partitionCount = row.getLong("partitions_count");
-
-        int splitCount = splitSizeMb > 0
-            ? (int)(meanPartitionSize * partitionCount / splitSizeMb / 1024 / 1024)
-            : (int)(partitionCount / splitSize);
-
-        if (splitCount <= 0) splitCount = 1;
-        List<TokenRange> splitRanges = tokenRange.splitEvenly(splitCount);
-        Map<TokenRange, Long> rangesWithLength = new HashMap<>();
-        for (TokenRange range : splitRanges)
-            rangesWithLength.put(range, partitionCount/splitCount);
-
-        return rangesWithLength;
-    }
-
-    // Old Hadoop API
-    public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException
-    {
-        TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
-        List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
-        org.apache.hadoop.mapred.InputSplit[] oldInputSplits = new org.apache.hadoop.mapred.InputSplit[newInputSplits.size()];
-        for (int i = 0; i < newInputSplits.size(); i++)
-            oldInputSplits[i] = (ColumnFamilySplit)newInputSplits.get(i);
-        return oldInputSplits;
-    }
-}

diff --git a/src/java/org/apache/cassandra/hadoop/BulkOutputFormat.java b/src/java/org/apache/cassandra/hadoop/BulkOutputFormat.java
deleted file mode 100644
index 5282279..0000000
--- a/src/java/org/apache/cassandra/hadoop/BulkOutputFormat.java
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.cassandra.thrift.Mutation;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.*;
-
-@Deprecated
-public class BulkOutputFormat extends OutputFormat<ByteBuffer,List<Mutation>>
-        implements org.apache.hadoop.mapred.OutputFormat<ByteBuffer,List<Mutation>>
-{
-    /** Fills the deprecated OutputFormat interface for streaming. */
-    @Deprecated
-    public BulkRecordWriter getRecordWriter(org.apache.hadoop.fs.FileSystem filesystem, org.apache.hadoop.mapred.JobConf job, String name, org.apache.hadoop.util.Progressable progress) throws IOException
-    {
-        return new BulkRecordWriter(job, progress);
-    }
-
-    @Override
-    public BulkRecordWriter getRecordWriter(final TaskAttemptContext context) throws IOException, InterruptedException
-    {
-        return new BulkRecordWriter(context);
-    }
-
-
-    @Override
-    public void checkOutputSpecs(JobContext context)
-    {
-        checkOutputSpecs(HadoopCompat.getConfiguration(context));
-    }
-
-    private void checkOutputSpecs(Configuration conf)
-    {
-        if (ConfigHelper.getOutputKeyspace(conf) == null)
-        {
-            throw new UnsupportedOperationException("you must set the keyspace with setColumnFamily()");
-        }
-    }
-
-    @Override
-    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException
-    {
-        return new NullOutputCommitter();
-    }
-
-    /** Fills the deprecated OutputFormat interface for streaming. */
-    @Deprecated
-    public void checkOutputSpecs(org.apache.hadoop.fs.FileSystem filesystem, org.apache.hadoop.mapred.JobConf job) throws IOException
-    {
-        checkOutputSpecs(job);
-    }
-
-    public static class NullOutputCommitter extends OutputCommitter
-    {
-        public void abortTask(TaskAttemptContext taskContext) { }
-
-        public void cleanupJob(JobContext jobContext) { }
-
-        public void commitTask(TaskAttemptContext taskContext) { }
-
-        public boolean needsTaskCommit(TaskAttemptContext taskContext)
-        {
-            return false;
-        }
-
-        public void setupJob(JobContext jobContext) { }
-
-        public void setupTask(TaskAttemptContext taskContext) { }
-    }
-}

diff --git a/src/java/org/apache/cassandra/hadoop/BulkRecordWriter.java b/src/java/org/apache/cassandra/hadoop/BulkRecordWriter.java
deleted file mode 100644
index 6b9ecb5..0000000
--- a/src/java/org/apache/cassandra/hadoop/BulkRecordWriter.java
+++ /dev/null

@@ -1,296 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-import java.io.Closeable;
-import java.io.File;
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.Config;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
-import org.apache.cassandra.io.sstable.SSTableLoader;
-import org.apache.cassandra.io.sstable.SSTableSimpleUnsortedWriter;
-import org.apache.cassandra.streaming.StreamState;
-import org.apache.cassandra.thrift.Column;
-import org.apache.cassandra.thrift.CounterColumn;
-import org.apache.cassandra.thrift.Mutation;
-import org.apache.cassandra.utils.NativeSSTableLoaderClient;
-import org.apache.cassandra.utils.OutputHandler;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.util.Progressable;
-
-@Deprecated
-public final class BulkRecordWriter extends RecordWriter<ByteBuffer, List<Mutation>>
-        implements org.apache.hadoop.mapred.RecordWriter<ByteBuffer, List<Mutation>>
-{
-    public final static String OUTPUT_LOCATION = "mapreduce.output.bulkoutputformat.localdir";
-    public final static String BUFFER_SIZE_IN_MB = "mapreduce.output.bulkoutputformat.buffersize";
-    public final static String STREAM_THROTTLE_MBITS = "mapreduce.output.bulkoutputformat.streamthrottlembits";
-    public final static String MAX_FAILED_HOSTS = "mapreduce.output.bulkoutputformat.maxfailedhosts";
-
-    private final Logger logger = LoggerFactory.getLogger(BulkRecordWriter.class);
-
-    protected final Configuration conf;
-    protected final int maxFailures;
-    protected final int bufferSize;
-    protected Closeable writer;
-    protected SSTableLoader loader;
-    protected Progressable progress;
-    protected TaskAttemptContext context;
-    private File outputDir;
-    
-    
-    private enum CFType
-    {
-        NORMAL,
-        SUPER,
-    }
-
-    private enum ColType
-    {
-        NORMAL,
-        COUNTER
-    }
-
-    private CFType cfType;
-    private ColType colType;
-
-    BulkRecordWriter(TaskAttemptContext context)
-    {
-
-        this(HadoopCompat.getConfiguration(context));
-        this.context = context;
-    }
-
-    BulkRecordWriter(Configuration conf, Progressable progress)
-    {
-        this(conf);
-        this.progress = progress;
-    }
-
-    BulkRecordWriter(Configuration conf)
-    {
-        Config.setOutboundBindAny(true);
-        this.conf = conf;
-        DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(Integer.parseInt(conf.get(STREAM_THROTTLE_MBITS, "0")));
-        maxFailures = Integer.parseInt(conf.get(MAX_FAILED_HOSTS, "0"));
-        bufferSize = Integer.parseInt(conf.get(BUFFER_SIZE_IN_MB, "64"));
-    }
-
-    protected String getOutputLocation() throws IOException
-    {
-        String dir = conf.get(OUTPUT_LOCATION, System.getProperty("java.io.tmpdir"));
-        if (dir == null)
-            throw new IOException("Output directory not defined, if hadoop is not setting java.io.tmpdir then define " + OUTPUT_LOCATION);
-        return dir;
-    }
-
-    private void setTypes(Mutation mutation)
-    {
-       if (cfType == null)
-       {
-           if (mutation.getColumn_or_supercolumn().isSetSuper_column() || mutation.getColumn_or_supercolumn().isSetCounter_super_column())
-               cfType = CFType.SUPER;
-           else
-               cfType = CFType.NORMAL;
-           if (mutation.getColumn_or_supercolumn().isSetCounter_column() || mutation.getColumn_or_supercolumn().isSetCounter_super_column())
-               colType = ColType.COUNTER;
-           else
-               colType = ColType.NORMAL;
-       }
-    }
-
-    private void prepareWriter() throws IOException
-    {
-        if (outputDir == null)
-        {
-            String keyspace = ConfigHelper.getOutputKeyspace(conf);
-            //dir must be named by ks/cf for the loader
-            outputDir = new File(getOutputLocation() + File.separator + keyspace + File.separator + ConfigHelper.getOutputColumnFamily(conf));
-            outputDir.mkdirs();
-        }
-        
-        if (writer == null)
-        {
-            AbstractType<?> subcomparator = null;
-
-            if (cfType == CFType.SUPER)
-                subcomparator = BytesType.instance;
-
-            writer = new SSTableSimpleUnsortedWriter(
-                    outputDir,
-                    ConfigHelper.getOutputPartitioner(conf),
-                    ConfigHelper.getOutputKeyspace(conf),
-                    ConfigHelper.getOutputColumnFamily(conf),
-                    BytesType.instance,
-                    subcomparator,
-                    Integer.parseInt(conf.get(BUFFER_SIZE_IN_MB, "64")),
-                    ConfigHelper.getOutputCompressionParamaters(conf));
-
-            this.loader = new SSTableLoader(outputDir, new ExternalClient(conf), new NullOutputHandler());
-        }
-    }
-
-    @Override
-    public void close(TaskAttemptContext context) throws IOException, InterruptedException
-    {
-        close();
-    }
-
-    /** Fills the deprecated RecordWriter interface for streaming. */
-    @Deprecated
-    public void close(org.apache.hadoop.mapred.Reporter reporter) throws IOException
-    {
-        close();
-    }
-
-    private void close() throws IOException
-    {
-        if (writer != null)
-        {
-            writer.close();
-            Future<StreamState> future = loader.stream();
-            while (true)
-            {
-                try
-                {
-                    future.get(1000, TimeUnit.MILLISECONDS);
-                    break;
-                }
-                catch (ExecutionException | TimeoutException te)
-                {
-                    if (null != progress)
-                        progress.progress();
-                    if (null != context)
-                        HadoopCompat.progress(context);
-                }
-                catch (InterruptedException e)
-                {
-                    throw new IOException(e);
-                }
-            }
-            if (loader.getFailedHosts().size() > 0)
-            {
-                if (loader.getFailedHosts().size() > maxFailures)
-                    throw new IOException("Too many hosts failed: " + loader.getFailedHosts());
-                else
-                    logger.warn("Some hosts failed: {}", loader.getFailedHosts());
-            }
-        }
-    }
-
-    @Override
-    public void write(ByteBuffer keybuff, List<Mutation> value) throws IOException
-    {
-        setTypes(value.get(0));
-        prepareWriter();
-        SSTableSimpleUnsortedWriter ssWriter = (SSTableSimpleUnsortedWriter) writer;
-        ssWriter.newRow(keybuff);
-        for (Mutation mut : value)
-        {
-            if (cfType == CFType.SUPER)
-            {
-                ssWriter.newSuperColumn(mut.getColumn_or_supercolumn().getSuper_column().name);
-                if (colType == ColType.COUNTER)
-                    for (CounterColumn column : mut.getColumn_or_supercolumn().getCounter_super_column().columns)
-                        ssWriter.addCounterColumn(column.name, column.value);
-                else
-                {
-                    for (Column column : mut.getColumn_or_supercolumn().getSuper_column().columns)
-                    {
-                        if(column.ttl == 0)
-                            ssWriter.addColumn(column.name, column.value, column.timestamp);
-                        else
-                            ssWriter.addExpiringColumn(column.name, column.value, column.timestamp, column.ttl, System.currentTimeMillis() + ((long)column.ttl * 1000));
-                    }
-                }
-            }
-            else
-            {
-                if (colType == ColType.COUNTER)
-                    ssWriter.addCounterColumn(mut.getColumn_or_supercolumn().counter_column.name, mut.getColumn_or_supercolumn().counter_column.value);
-                else
-                {
-                    if(mut.getColumn_or_supercolumn().column.ttl == 0)
-                        ssWriter.addColumn(mut.getColumn_or_supercolumn().column.name, mut.getColumn_or_supercolumn().column.value, mut.getColumn_or_supercolumn().column.timestamp);
-                    else
-                        ssWriter.addExpiringColumn(mut.getColumn_or_supercolumn().column.name, mut.getColumn_or_supercolumn().column.value, mut.getColumn_or_supercolumn().column.timestamp, mut.getColumn_or_supercolumn().column.ttl, System.currentTimeMillis() + ((long)(mut.getColumn_or_supercolumn().column.ttl) * 1000));
-                }
-            }
-            if (null != progress)
-                progress.progress();
-            if (null != context)
-                HadoopCompat.progress(context);
-        }
-    }
-
-    public static class ExternalClient extends NativeSSTableLoaderClient
-    {
-        public ExternalClient(Configuration conf)
-        {
-            super(resolveHostAddresses(conf),
-                  CqlConfigHelper.getOutputNativePort(conf),
-                  ConfigHelper.getOutputKeyspaceUserName(conf),
-                  ConfigHelper.getOutputKeyspacePassword(conf),
-                  CqlConfigHelper.getSSLOptions(conf).orNull());
-        }
-
-        private static Collection<InetAddress> resolveHostAddresses(Configuration conf)
-        {
-            Set<InetAddress> addresses = new HashSet<>();
-
-            for (String host : ConfigHelper.getOutputInitialAddress(conf).split(","))
-            {
-                try
-                {
-                    addresses.add(InetAddress.getByName(host));
-                }
-                catch (UnknownHostException e)
-                {
-                    throw new RuntimeException(e);
-                }
-            }
-
-            return addresses;
-        }
-    }
-
-    public static class NullOutputHandler implements OutputHandler
-    {
-        public void output(String msg) {}
-        public void debug(String msg) {}
-        public void warn(String msg) {}
-        public void warn(String msg, Throwable th) {}
-    }
-}

diff --git a/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java b/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
deleted file mode 100644
index 87cb791..0000000
--- a/src/java/org/apache/cassandra/hadoop/ColumnFamilyInputFormat.java
+++ /dev/null

@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.auth.PasswordAuthenticator;
-import org.apache.cassandra.thrift.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.thrift.protocol.TBinaryProtocol;
-import org.apache.thrift.protocol.TProtocol;
-import org.apache.thrift.transport.TTransport;
-import org.apache.thrift.transport.TTransportException;
-
-/**
- * Hadoop InputFormat allowing map/reduce against Cassandra rows within one ColumnFamily.
- *
- * At minimum, you need to set the CF and predicate (description of columns to extract from each row)
- * in your Hadoop job Configuration.  The ConfigHelper class is provided to make this
- * simple:
- *   ConfigHelper.setInputColumnFamily
- *   ConfigHelper.setInputSlicePredicate
- *
- * You can also configure the number of rows per InputSplit with
- *   ConfigHelper.setInputSplitSize
- * This should be "as big as possible, but no bigger."  Each InputSplit is read from Cassandra
- * with multiple get_slice_range queries, and the per-call overhead of get_slice_range is high,
- * so larger split sizes are better -- but if it is too large, you will run out of memory.
- *
- * The default split size is 64k rows.
- */
-@Deprecated
-public class ColumnFamilyInputFormat extends AbstractColumnFamilyInputFormat<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>>
-{
-    private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyInputFormat.class);
-
-    @SuppressWarnings("resource")
-    public static Cassandra.Client createAuthenticatedClient(String location, int port, Configuration conf) throws Exception
-    {
-        logger.trace("Creating authenticated client for CF input format");
-        TTransport transport;
-        try
-        {
-            transport = ConfigHelper.getClientTransportFactory(conf).openTransport(location, port);
-        }
-        catch (Exception e)
-        {
-            throw new TTransportException("Failed to open a transport to " + location + ":" + port + ".", e);
-        }
-        TProtocol binaryProtocol = new TBinaryProtocol(transport, true, true);
-        Cassandra.Client client = new Cassandra.Client(binaryProtocol);
-
-        // log in
-        client.set_keyspace(ConfigHelper.getInputKeyspace(conf));
-        if ((ConfigHelper.getInputKeyspaceUserName(conf) != null) && (ConfigHelper.getInputKeyspacePassword(conf) != null))
-        {
-            Map<String, String> creds = new HashMap<String, String>();
-            creds.put(PasswordAuthenticator.USERNAME_KEY, ConfigHelper.getInputKeyspaceUserName(conf));
-            creds.put(PasswordAuthenticator.PASSWORD_KEY, ConfigHelper.getInputKeyspacePassword(conf));
-            AuthenticationRequest authRequest = new AuthenticationRequest(creds);
-            client.login(authRequest);
-        }
-        logger.trace("Authenticated client for CF input format created successfully");
-        return client;
-    }
-
-    public RecordReader<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException
-    {
-        return new ColumnFamilyRecordReader();
-    }
-
-    public org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>> getRecordReader(org.apache.hadoop.mapred.InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException
-    {
-        TaskAttemptContext tac = HadoopCompat.newMapContext(
-                jobConf,
-                TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)),
-                null,
-                null,
-                null,
-                new ReporterWrapper(reporter),
-                null);
-
-        ColumnFamilyRecordReader recordReader = new ColumnFamilyRecordReader(jobConf.getInt(CASSANDRA_HADOOP_MAX_KEY_SIZE, CASSANDRA_HADOOP_MAX_KEY_SIZE_DEFAULT));
-        recordReader.initialize((org.apache.hadoop.mapreduce.InputSplit)split, tac);
-        return recordReader;
-    }
-    
-    @Override
-    protected void validateConfiguration(Configuration conf)
-    {
-        super.validateConfiguration(conf);
-        
-        if (ConfigHelper.getInputSlicePredicate(conf) == null)
-        {
-            throw new UnsupportedOperationException("you must set the predicate with setInputSlicePredicate");
-        }
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/hadoop/ColumnFamilyOutputFormat.java b/src/java/org/apache/cassandra/hadoop/ColumnFamilyOutputFormat.java
deleted file mode 100644
index edc988b..0000000
--- a/src/java/org/apache/cassandra/hadoop/ColumnFamilyOutputFormat.java
+++ /dev/null

@@ -1,183 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.slf4j.*;
-
-import org.apache.cassandra.auth.*;
-import org.apache.cassandra.thrift.*;
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.mapreduce.*;
-import org.apache.thrift.protocol.*;
-import org.apache.thrift.transport.*;
-
-/**
- * The <code>ColumnFamilyOutputFormat</code> acts as a Hadoop-specific
- * OutputFormat that allows reduce tasks to store keys (and corresponding
- * values) as Cassandra rows (and respective columns) in a given
- * ColumnFamily.
- *
- * <p>
- * As is the case with the {@link ColumnFamilyInputFormat}, you need to set the
- * Keyspace and ColumnFamily in your
- * Hadoop job Configuration. The {@link ConfigHelper} class, through its
- * {@link ConfigHelper#setOutputColumnFamily} method, is provided to make this
- * simple.
- * </p>
- *
- * <p>
- * For the sake of performance, this class employs a lazy write-back caching
- * mechanism, where its record writer batches mutations created based on the
- * reduce's inputs (in a task-specific map), and periodically makes the changes
- * official by sending a batch mutate request to Cassandra.
- * </p>
- */
-@Deprecated
-public class ColumnFamilyOutputFormat extends OutputFormat<ByteBuffer,List<Mutation>>
-        implements org.apache.hadoop.mapred.OutputFormat<ByteBuffer,List<Mutation>>
-{
-    public static final String BATCH_THRESHOLD = "mapreduce.output.columnfamilyoutputformat.batch.threshold";
-    public static final String QUEUE_SIZE = "mapreduce.output.columnfamilyoutputformat.queue.size";
-
-    private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyOutputFormat.class);
-
-    /**
-     * The OutputCommitter for this format does not write any data to the DFS.
-     *
-     * @param context
-     *            the task context
-     * @return an output committer
-     * @throws IOException
-     * @throws InterruptedException
-     */
-    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException
-    {
-        return new NullOutputCommitter();
-    }
-
-    /**
-     * Check for validity of the output-specification for the job.
-     *
-     * @param context
-     *            information about the job
-     */
-    public void checkOutputSpecs(JobContext context)
-    {
-        checkOutputSpecs(HadoopCompat.getConfiguration(context));
-    }
-
-    protected void checkOutputSpecs(Configuration conf)
-    {
-        if (ConfigHelper.getOutputKeyspace(conf) == null)
-            throw new UnsupportedOperationException("You must set the keyspace with setOutputKeyspace()");
-        if (ConfigHelper.getOutputPartitioner(conf) == null)
-            throw new UnsupportedOperationException("You must set the output partitioner to the one used by your Cassandra cluster");
-        if (ConfigHelper.getOutputInitialAddress(conf) == null)
-            throw new UnsupportedOperationException("You must set the initial output address to a Cassandra node");
-    }
-
-    /** Fills the deprecated OutputFormat interface for streaming. */
-    @Deprecated
-    public void checkOutputSpecs(org.apache.hadoop.fs.FileSystem filesystem, org.apache.hadoop.mapred.JobConf job) throws IOException
-    {
-        checkOutputSpecs(job);
-    }
-
-    /**
-     * Connects to the given server:port and returns a client based on the given socket that points to the configured
-     * keyspace, and is logged in with the configured credentials.
-     *
-     * @param host fully qualified host name to connect to
-     * @param port RPC port of the server
-     * @param conf a job configuration
-     * @return a cassandra client
-     * @throws Exception set of thrown exceptions may be implementation defined,
-     *                   depending on the used transport factory
-     */
-    @SuppressWarnings("resource")
-    public static Cassandra.Client createAuthenticatedClient(String host, int port, Configuration conf) throws Exception
-    {
-        logger.trace("Creating authenticated client for CF output format");
-        TTransport transport = ConfigHelper.getClientTransportFactory(conf).openTransport(host, port);
-        TProtocol binaryProtocol = new TBinaryProtocol(transport, true, true);
-        Cassandra.Client client = new Cassandra.Client(binaryProtocol);
-        client.set_keyspace(ConfigHelper.getOutputKeyspace(conf));
-        String user = ConfigHelper.getOutputKeyspaceUserName(conf);
-        String password = ConfigHelper.getOutputKeyspacePassword(conf);
-        if ((user != null) && (password != null))
-            login(user, password, client);
-
-        logger.trace("Authenticated client for CF output format created successfully");
-        return client;
-    }
-
-    public static void login(String user, String password, Cassandra.Client client) throws Exception
-    {
-        Map<String, String> creds = new HashMap<String, String>();
-        creds.put(PasswordAuthenticator.USERNAME_KEY, user);
-        creds.put(PasswordAuthenticator.PASSWORD_KEY, password);
-        AuthenticationRequest authRequest = new AuthenticationRequest(creds);
-        client.login(authRequest);
-    }
-
-    /** Fills the deprecated OutputFormat interface for streaming. */
-    @Deprecated
-    public ColumnFamilyRecordWriter getRecordWriter(org.apache.hadoop.fs.FileSystem filesystem, org.apache.hadoop.mapred.JobConf job, String name, org.apache.hadoop.util.Progressable progress)
-    {
-        return new ColumnFamilyRecordWriter(job, progress);
-    }
-
-    /**
-     * Get the {@link RecordWriter} for the given task.
-     *
-     * @param context
-     *            the information about the current task.
-     * @return a {@link RecordWriter} to write the output for the job.
-     */
-    public ColumnFamilyRecordWriter getRecordWriter(final TaskAttemptContext context) throws InterruptedException
-    {
-        return new ColumnFamilyRecordWriter(context);
-    }
-
-    /**
-     * An {@link OutputCommitter} that does nothing.
-     */
-    private static class NullOutputCommitter extends OutputCommitter
-    {
-        public void abortTask(TaskAttemptContext taskContext) { }
-
-        public void cleanupJob(JobContext jobContext) { }
-
-        public void commitTask(TaskAttemptContext taskContext) { }
-
-        public boolean needsTaskCommit(TaskAttemptContext taskContext)
-        {
-            return false;
-        }
-
-        public void setupJob(JobContext jobContext) { }
-
-        public void setupTask(TaskAttemptContext taskContext) { }
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java b/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
deleted file mode 100644
index 9d1d10c..0000000
--- a/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
+++ /dev/null

@@ -1,618 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import com.google.common.collect.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.schema.LegacySchemaTables;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.TypeParser;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.thrift.TException;
-import org.apache.thrift.transport.TTransport;
-
-@Deprecated
-public class ColumnFamilyRecordReader extends RecordReader<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>>
-    implements org.apache.hadoop.mapred.RecordReader<ByteBuffer, SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>>
-{
-    private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyRecordReader.class);
-
-    public static final int CASSANDRA_HADOOP_MAX_KEY_SIZE_DEFAULT = 8192;
-
-    private ColumnFamilySplit split;
-    private RowIterator iter;
-    private Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> currentRow;
-    private SlicePredicate predicate;
-    private boolean isEmptyPredicate;
-    private int totalRowCount; // total number of rows to fetch
-    private int batchSize; // fetch this many per batch
-    private String keyspace;
-    private String cfName;
-    private Cassandra.Client client;
-    private ConsistencyLevel consistencyLevel;
-    private int keyBufferSize = 8192;
-    private List<IndexExpression> filter;
-
-
-    public ColumnFamilyRecordReader()
-    {
-        this(ColumnFamilyRecordReader.CASSANDRA_HADOOP_MAX_KEY_SIZE_DEFAULT);
-    }
-
-    public ColumnFamilyRecordReader(int keyBufferSize)
-    {
-        super();
-        this.keyBufferSize = keyBufferSize;
-    }
-
-    @SuppressWarnings("resource")
-    public void close()
-    {
-        if (client != null)
-        {
-            TTransport transport = client.getOutputProtocol().getTransport();
-            if (transport.isOpen())
-                transport.close();
-        }
-    }
-
-    public ByteBuffer getCurrentKey()
-    {
-        return currentRow.left;
-    }
-
-    public SortedMap<ByteBuffer, Column> getCurrentValue()
-    {
-        return currentRow.right;
-    }
-
-    public float getProgress()
-    {
-        if (!iter.hasNext())
-            return 1.0F;
-
-        // the progress is likely to be reported slightly off the actual but close enough
-        float progress = ((float) iter.rowsRead() / totalRowCount);
-        return progress > 1.0F ? 1.0F : progress;
-    }
-
-    static boolean isEmptyPredicate(SlicePredicate predicate)
-    {
-        if (predicate == null)
-            return true;
-
-        if (predicate.isSetColumn_names() && predicate.getSlice_range() == null)
-            return false;
-
-        if (predicate.getSlice_range() == null)
-            return true;
-
-        byte[] start = predicate.getSlice_range().getStart();
-        if ((start != null) && (start.length > 0))
-            return false;
-
-        byte[] finish = predicate.getSlice_range().getFinish();
-        if ((finish != null) && (finish.length > 0))
-            return false;
-
-        return true;
-    }
-
-    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException
-    {
-        this.split = (ColumnFamilySplit) split;
-        Configuration conf = HadoopCompat.getConfiguration(context);
-        KeyRange jobRange = ConfigHelper.getInputKeyRange(conf);
-        filter = jobRange == null ? null : jobRange.row_filter;
-        predicate = ConfigHelper.getInputSlicePredicate(conf);
-        boolean widerows = ConfigHelper.getInputIsWide(conf);
-        isEmptyPredicate = isEmptyPredicate(predicate);
-        totalRowCount = (this.split.getLength() < Long.MAX_VALUE)
-                ? (int) this.split.getLength()
-                : ConfigHelper.getInputSplitSize(conf);
-        batchSize = ConfigHelper.getRangeBatchSize(conf);
-        cfName = ConfigHelper.getInputColumnFamily(conf);
-        consistencyLevel = ConsistencyLevel.valueOf(ConfigHelper.getReadConsistencyLevel(conf));
-        keyspace = ConfigHelper.getInputKeyspace(conf);
-        
-        if (batchSize < 2)
-            throw new IllegalArgumentException("Minimum batchSize is 2.  Suggested batchSize is 100 or more");
-
-        String[] locations = getLocations();
-        int port = ConfigHelper.getInputRpcPort(conf);
-
-        Exception lastException = null;
-        for (String location : locations)
-        {
-            try
-            {
-                client = ColumnFamilyInputFormat.createAuthenticatedClient(location, port, conf);
-                break;
-            }
-            catch (Exception e)
-            {
-                lastException = e;
-                logger.warn("Failed to create authenticated client to {}:{}", location , port);
-            }
-        }
-        if (client == null && lastException != null)
-            throw new RuntimeException(lastException);
-
-        iter = widerows ? new WideRowIterator() : new StaticRowIterator();
-        logger.trace("created {}", iter);
-    }
-
-    public boolean nextKeyValue() throws IOException
-    {
-        if (!iter.hasNext())
-        {
-            logger.trace("Finished scanning {} rows (estimate was: {})", iter.rowsRead(), totalRowCount);
-            return false;
-        }
-
-        currentRow = iter.next();
-        return true;
-    }
-
-    // we don't use endpointsnitch since we are trying to support hadoop nodes that are
-    // not necessarily on Cassandra machines, too.  This should be adequate for single-DC clusters, at least.
-    private String[] getLocations()
-    {
-        Collection<InetAddress> localAddresses = FBUtilities.getAllLocalAddresses();
-
-        for (InetAddress address : localAddresses)
-        {
-            for (String location : split.getLocations())
-            {
-                InetAddress locationAddress = null;
-                try
-                {
-                    locationAddress = InetAddress.getByName(location);
-                }
-                catch (UnknownHostException e)
-                {
-                    throw new AssertionError(e);
-                }
-                if (address.equals(locationAddress))
-                {
-                    return new String[]{location};
-                }
-            }
-        }
-        return split.getLocations();
-    }
-
-    private abstract class RowIterator extends AbstractIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Column>>>
-    {
-        protected List<KeySlice> rows;
-        protected int totalRead = 0;
-        protected final boolean isSuper;
-        protected final AbstractType<?> comparator;
-        protected final AbstractType<?> subComparator;
-        protected final IPartitioner partitioner;
-
-        private RowIterator()
-        {
-            CfDef cfDef = new CfDef();
-            try
-            {
-                partitioner = FBUtilities.newPartitioner(client.describe_partitioner());           
-                // get CF meta data
-                String query = String.format("SELECT comparator, subcomparator, type " +
-                                             "FROM %s.%s " +
-                                             "WHERE keyspace_name = '%s' AND columnfamily_name = '%s'",
-                                             SystemKeyspace.NAME,
-                                             LegacySchemaTables.COLUMNFAMILIES,
-                                             keyspace,
-                                             cfName);
-
-                CqlResult result = client.execute_cql3_query(ByteBufferUtil.bytes(query), Compression.NONE, ConsistencyLevel.ONE);
-
-                Iterator<CqlRow> iteraRow = result.rows.iterator();
-
-                if (iteraRow.hasNext())
-                {
-                    CqlRow cqlRow = iteraRow.next();
-                    cfDef.comparator_type = ByteBufferUtil.string(cqlRow.columns.get(0).value);
-                    ByteBuffer subComparator = cqlRow.columns.get(1).value;
-                    if (subComparator != null)
-                        cfDef.subcomparator_type = ByteBufferUtil.string(subComparator);
-                    
-                    ByteBuffer type = cqlRow.columns.get(2).value;
-                    if (type != null)
-                        cfDef.column_type = ByteBufferUtil.string(type);
-                }
-
-                comparator = TypeParser.parse(cfDef.comparator_type);
-                subComparator = cfDef.subcomparator_type == null ? null : TypeParser.parse(cfDef.subcomparator_type);
-            }
-            catch (ConfigurationException e)
-            {
-                throw new RuntimeException("unable to load sub/comparator", e);
-            }
-            catch (TException e)
-            {
-                throw new RuntimeException("error communicating via Thrift", e);
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException("unable to load keyspace " + keyspace, e);
-            }
-            isSuper = "Super".equalsIgnoreCase(cfDef.column_type);
-        }
-
-        /**
-         * @return total number of rows read by this record reader
-         */
-        public int rowsRead()
-        {
-            return totalRead;
-        }
-
-        protected List<Pair<ByteBuffer, Column>> unthriftify(ColumnOrSuperColumn cosc)
-        {
-            if (cosc.counter_column != null)
-                return Collections.singletonList(unthriftifyCounter(cosc.counter_column));
-            if (cosc.counter_super_column != null)
-                return unthriftifySuperCounter(cosc.counter_super_column);
-            if (cosc.super_column != null)
-                return unthriftifySuper(cosc.super_column);
-            assert cosc.column != null;
-            return Collections.singletonList(unthriftifySimple(cosc.column));
-        }
-
-        private List<Pair<ByteBuffer, Column>> unthriftifySuper(SuperColumn super_column)
-        {
-            List<Pair<ByteBuffer, Column>> columns = new ArrayList<>(super_column.columns.size());
-            for (org.apache.cassandra.thrift.Column column : super_column.columns)
-            {
-                Pair<ByteBuffer, Column> c = unthriftifySimple(column);
-                columns.add(Pair.create(CompositeType.build(super_column.name, c.left), c.right));
-            }
-            return columns;
-        }
-
-        protected Pair<ByteBuffer, Column> unthriftifySimple(org.apache.cassandra.thrift.Column column)
-        {
-            return Pair.create(column.name, Column.fromRegularColumn(column));
-        }
-
-        private Pair<ByteBuffer, Column> unthriftifyCounter(CounterColumn column)
-        {
-            return Pair.create(column.name, Column.fromCounterColumn(column));
-        }
-
-        private List<Pair<ByteBuffer, Column>> unthriftifySuperCounter(CounterSuperColumn super_column)
-        {
-            List<Pair<ByteBuffer, Column>> columns = new ArrayList<>(super_column.columns.size());
-            for (CounterColumn column : super_column.columns)
-            {
-                Pair<ByteBuffer, Column> c = unthriftifyCounter(column);
-                columns.add(Pair.create(CompositeType.build(super_column.name, c.left), c.right));
-            }
-            return columns;
-        }
-    }
-
-    private class StaticRowIterator extends RowIterator
-    {
-        protected int i = 0;
-
-        private void maybeInit()
-        {
-            // check if we need another batch
-            if (rows != null && i < rows.size())
-                return;
-
-            String startToken;
-            if (totalRead == 0)
-            {
-                // first request
-                startToken = split.getStartToken();
-            }
-            else
-            {
-                startToken = partitioner.getTokenFactory().toString(partitioner.getToken(Iterables.getLast(rows).key));
-                if (startToken.equals(split.getEndToken()))
-                {
-                    // reached end of the split
-                    rows = null;
-                    return;
-                }
-            }
-
-            KeyRange keyRange = new KeyRange(batchSize)
-                                .setStart_token(startToken)
-                                .setEnd_token(split.getEndToken())
-                                .setRow_filter(filter);
-            try
-            {
-                rows = client.get_range_slices(new ColumnParent(cfName), predicate, keyRange, consistencyLevel);
-
-                // nothing new? reached the end
-                if (rows.isEmpty())
-                {
-                    rows = null;
-                    return;
-                }
-
-                // remove ghosts when fetching all columns
-                if (isEmptyPredicate)
-                {
-                    Iterator<KeySlice> it = rows.iterator();
-                    KeySlice ks;
-                    do
-                    {
-                        ks = it.next();
-                        if (ks.getColumnsSize() == 0)
-                        {
-                            it.remove();
-                        }
-                    } while (it.hasNext());
-
-                    // all ghosts, spooky
-                    if (rows.isEmpty())
-                    {
-                        // maybeInit assumes it can get the start-with key from the rows collection, so add back the last
-                        rows.add(ks);
-                        maybeInit();
-                        return;
-                    }
-                }
-
-                // reset to iterate through this new batch
-                i = 0;
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
-
-        protected Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> computeNext()
-        {
-            maybeInit();
-            if (rows == null)
-                return endOfData();
-
-            totalRead++;
-            KeySlice ks = rows.get(i++);
-            AbstractType<?> comp = isSuper ? CompositeType.getInstance(comparator, subComparator) : comparator;
-            SortedMap<ByteBuffer, Column> map = new TreeMap<>(comp);
-            for (ColumnOrSuperColumn cosc : ks.columns)
-            {
-                List<Pair<ByteBuffer, Column>> columns = unthriftify(cosc);
-                for (Pair<ByteBuffer, Column> column : columns)
-                    map.put(column.left, column.right);
-            }
-            return Pair.create(ks.key, map);
-        }
-    }
-
-    private class WideRowIterator extends RowIterator
-    {
-        private PeekingIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Column>>> wideColumns;
-        private ByteBuffer lastColumn = ByteBufferUtil.EMPTY_BYTE_BUFFER;
-        private ByteBuffer lastCountedKey = ByteBufferUtil.EMPTY_BYTE_BUFFER;
-
-        private void maybeInit()
-        {
-            if (wideColumns != null && wideColumns.hasNext())
-                return;
-
-            KeyRange keyRange;
-            if (totalRead == 0)
-            {
-                String startToken = split.getStartToken();
-                keyRange = new KeyRange(batchSize)
-                          .setStart_token(startToken)
-                          .setEnd_token(split.getEndToken())
-                          .setRow_filter(filter);
-            }
-            else
-            {
-                KeySlice lastRow = Iterables.getLast(rows);
-                logger.trace("Starting with last-seen row {}", lastRow.key);
-                keyRange = new KeyRange(batchSize)
-                          .setStart_key(lastRow.key)
-                          .setEnd_token(split.getEndToken())
-                          .setRow_filter(filter);
-            }
-
-            try
-            {
-                rows = client.get_paged_slice(cfName, keyRange, lastColumn, consistencyLevel);
-                int n = 0;
-                for (KeySlice row : rows)
-                    n += row.columns.size();
-                logger.trace("read {} columns in {} rows for {} starting with {}",
-                             new Object[]{ n, rows.size(), keyRange, lastColumn });
-
-                wideColumns = Iterators.peekingIterator(new WideColumnIterator(rows));
-                if (wideColumns.hasNext() && wideColumns.peek().right.keySet().iterator().next().equals(lastColumn))
-                    wideColumns.next();
-                if (!wideColumns.hasNext())
-                    rows = null;
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
-
-        protected Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> computeNext()
-        {
-            maybeInit();
-            if (rows == null)
-                return endOfData();
-
-            Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> next = wideColumns.next();
-            lastColumn = next.right.keySet().iterator().next().duplicate();
-
-            maybeIncreaseRowCounter(next);
-            return next;
-        }
-
-
-        /**
-         * Increases the row counter only if we really moved to the next row.
-         * @param next just fetched row slice
-         */
-        private void maybeIncreaseRowCounter(Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> next)
-        {
-            ByteBuffer currentKey = next.left;
-            if (!currentKey.equals(lastCountedKey))
-            {
-                totalRead++;
-                lastCountedKey = currentKey;
-            }
-        }
-
-        private class WideColumnIterator extends AbstractIterator<Pair<ByteBuffer, SortedMap<ByteBuffer, Column>>>
-        {
-            private final Iterator<KeySlice> rows;
-            private Iterator<ColumnOrSuperColumn> columns;
-            public KeySlice currentRow;
-
-            public WideColumnIterator(List<KeySlice> rows)
-            {
-                this.rows = rows.iterator();
-                if (this.rows.hasNext())
-                    nextRow();
-                else
-                    columns = Iterators.emptyIterator();
-            }
-
-            private void nextRow()
-            {
-                currentRow = rows.next();
-                columns = currentRow.columns.iterator();
-            }
-
-            protected Pair<ByteBuffer, SortedMap<ByteBuffer, Column>> computeNext()
-            {
-                AbstractType<?> comp = isSuper ? CompositeType.getInstance(comparator, subComparator) : comparator;
-                while (true)
-                {
-                    if (columns.hasNext())
-                    {
-                        ColumnOrSuperColumn cosc = columns.next();
-                        SortedMap<ByteBuffer, Column> map;
-                        List<Pair<ByteBuffer, Column>> columns = unthriftify(cosc);
-                        if (columns.size() == 1)
-                        {
-                            map = ImmutableSortedMap.of(columns.get(0).left, columns.get(0).right);
-                        }
-                        else
-                        {
-                            assert isSuper;
-                            map = new TreeMap<>(comp);
-                            for (Pair<ByteBuffer, Column> column : columns)
-                                map.put(column.left, column.right);
-                        }
-                        return Pair.create(currentRow.key, map);
-                    }
-
-                    if (!rows.hasNext())
-                        return endOfData();
-
-                    nextRow();
-                }
-            }
-        }
-    }
-
-    // Because the old Hadoop API wants us to write to the key and value
-    // and the new asks for them, we need to copy the output of the new API
-    // to the old. Thus, expect a small performance hit.
-    // And obviously this wouldn't work for wide rows. But since ColumnFamilyInputFormat
-    // and ColumnFamilyRecordReader don't support them, it should be fine for now.
-    public boolean next(ByteBuffer key, SortedMap<ByteBuffer, Column> value) throws IOException
-    {
-        if (this.nextKeyValue())
-        {
-            key.clear();
-            key.put(this.getCurrentKey().duplicate());
-            key.flip();
-
-            value.clear();
-            value.putAll(this.getCurrentValue());
-
-            return true;
-        }
-        return false;
-    }
-
-    public ByteBuffer createKey()
-    {
-        return ByteBuffer.wrap(new byte[this.keyBufferSize]);
-    }
-
-    public SortedMap<ByteBuffer, Column> createValue()
-    {
-        return new TreeMap<>();
-    }
-
-    public long getPos() throws IOException
-    {
-        return iter.rowsRead();
-    }
-
-    public static final class Column
-    {
-        public final ByteBuffer name;
-        public final ByteBuffer value;
-        public final long timestamp;
-
-        private Column(ByteBuffer name, ByteBuffer value, long timestamp)
-        {
-            this.name = name;
-            this.value = value;
-            this.timestamp = timestamp;
-        }
-
-        static Column fromRegularColumn(org.apache.cassandra.thrift.Column input)
-        {
-            return new Column(input.name, input.value, input.timestamp);
-        }
-
-        static Column fromCounterColumn(org.apache.cassandra.thrift.CounterColumn input)
-        {
-            return new Column(input.name, ByteBufferUtil.bytes(input.value), 0);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordWriter.java b/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordWriter.java
deleted file mode 100644
index f06f03d..0000000
--- a/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordWriter.java
+++ /dev/null

@@ -1,341 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop;
-
-
-import java.io.IOException;
-import java.net.InetAddress;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.*;
-
-import org.apache.cassandra.client.*;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.thrift.ConsistencyLevel;
-import org.apache.cassandra.utils.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.*;
-import org.apache.thrift.TException;
-import org.apache.hadoop.util.Progressable;
-import org.apache.thrift.transport.*;
-
-
-/**
- * The <code>ColumnFamilyRecordWriter</code> maps the output &lt;key, value&gt;
- * pairs to a Cassandra column family. In particular, it applies all mutations
- * in the value, which it associates with the key, and in turn the responsible
- * endpoint.
- *
- * <p>
- * Furthermore, this writer groups the mutations by the endpoint responsible for
- * the rows being affected. This allows the mutations to be executed in parallel,
- * directly to a responsible endpoint.
- * </p>
- *
- * @see ColumnFamilyOutputFormat
- */
-@Deprecated
-final class ColumnFamilyRecordWriter extends RecordWriter<ByteBuffer, List<Mutation>> implements
-        org.apache.hadoop.mapred.RecordWriter<ByteBuffer, List<Mutation>>
-{
-    // The configuration this writer is associated with.
-    protected final Configuration conf;
-
-    // The number of mutations to buffer per endpoint
-    protected final int queueSize;
-
-    protected final long batchThreshold;
-
-    protected final ConsistencyLevel consistencyLevel;
-    protected Progressable progressable;
-    protected TaskAttemptContext context;
-    // handles for clients for each range running in the threadpool
-    private final Map<Range, RangeClient> clients;
-
-    // The ring cache that describes the token ranges each node in the ring is
-    // responsible for. This is what allows us to group the mutations by
-    // the endpoints they should be targeted at. The targeted endpoint
-    // essentially
-    // acts as the primary replica for the rows being affected by the mutations.
-    private final RingCache ringCache;
-    
-    /**
-     * Upon construction, obtain the map that this writer will use to collect
-     * mutations, and the ring cache for the given keyspace.
-     *
-     * @param context the task attempt context
-     * @throws IOException
-     */
-    ColumnFamilyRecordWriter(TaskAttemptContext context)
-    {
-        this(HadoopCompat.getConfiguration(context));
-        this.context = context;
-
-    }
-    ColumnFamilyRecordWriter(Configuration conf, Progressable progressable)
-    {
-        this(conf);
-        this.progressable = progressable;
-    }
-
-    ColumnFamilyRecordWriter(Configuration conf)
-    {
-        this.conf = conf;
-        this.queueSize = conf.getInt(ColumnFamilyOutputFormat.QUEUE_SIZE, 32 * FBUtilities.getAvailableProcessors());
-        batchThreshold = conf.getLong(ColumnFamilyOutputFormat.BATCH_THRESHOLD, 32);
-        consistencyLevel = ConsistencyLevel.valueOf(ConfigHelper.getWriteConsistencyLevel(conf));
-        this.ringCache = new RingCache(conf);
-        this.clients = new HashMap<Range, RangeClient>();
-    }
-
-    /**
-     * Close this <code>RecordWriter</code> to future operations, but not before
-     * flushing out the batched mutations.
-     *
-     * @param context the context of the task
-     * @throws IOException
-     */
-    public void close(TaskAttemptContext context) throws IOException, InterruptedException
-    {
-        close();
-    }
-
-    /** Fills the deprecated RecordWriter interface for streaming. */
-    @Deprecated
-    public void close(org.apache.hadoop.mapred.Reporter reporter) throws IOException
-    {
-        close();
-    }
-
-    public void close() throws IOException
-    {
-        // close all the clients before throwing anything
-        IOException clientException = null;
-        for (RangeClient client : clients.values())
-        {
-            try
-            {
-                client.close();
-            }
-            catch (IOException e)
-            {
-                clientException = e;
-            }
-        }
-        if (clientException != null)
-            throw clientException;
-    }
-    
-    /**
-     * If the key is to be associated with a valid value, a mutation is created
-     * for it with the given column family and columns. In the event the value
-     * in the column is missing (i.e., null), then it is marked for
-     * {@link Deletion}. Similarly, if the entire value for a key is missing
-     * (i.e., null), then the entire key is marked for {@link Deletion}.
-     * </p>
-     *
-     * @param keybuff
-     *            the key to write.
-     * @param value
-     *            the value to write.
-     * @throws IOException
-     */
-    @Override
-    public void write(ByteBuffer keybuff, List<Mutation> value) throws IOException
-    {
-        Range<Token> range = ringCache.getRange(keybuff);
-
-        // get the client for the given range, or create a new one
-        RangeClient client = clients.get(range);
-        if (client == null)
-        {
-            // haven't seen keys for this range: create new client
-            client = new RangeClient(ringCache.getEndpoint(range));
-            client.start();
-            clients.put(range, client);
-        }
-
-        for (Mutation amut : value)
-            client.put(Pair.create(keybuff, amut));
-        if (progressable != null)
-            progressable.progress();
-        if (context != null)
-            HadoopCompat.progress(context);
-    }
-
-    /**
-     * A client that runs in a threadpool and connects to the list of endpoints for a particular
-     * range. Mutations for keys in that range are sent to this client via a queue.
-     */
-    public class RangeClient extends Thread
-    {
-        // The list of endpoints for this range
-        protected final List<InetAddress> endpoints;
-        // A bounded queue of incoming mutations for this range
-        protected final BlockingQueue<Pair<ByteBuffer, Mutation>> queue = new ArrayBlockingQueue<>(queueSize);
-
-        protected volatile boolean run = true;
-        // we want the caller to know if something went wrong, so we record any unrecoverable exception while writing
-        // so we can throw it on the caller's stack when he calls put() again, or if there are no more put calls,
-        // when the client is closed.
-        protected volatile IOException lastException;
-
-        protected Cassandra.Client client;
-        public final String columnFamily = ConfigHelper.getOutputColumnFamily(conf);
-        
-        /**
-        * Constructs an {@link RangeClient} for the given endpoints.
-        * @param endpoints the possible endpoints to execute the mutations on
-        */
-        public RangeClient(List<InetAddress> endpoints)
-        {
-            super("client-" + endpoints);
-            this.endpoints = endpoints;
-         }
-
-        /**
-         * enqueues the given value to Cassandra
-         */
-        public void put(Pair<ByteBuffer, Mutation> value) throws IOException
-        {
-            while (true)
-            {
-                if (lastException != null)
-                    throw lastException;
-                try
-                {
-                    if (queue.offer(value, 100, TimeUnit.MILLISECONDS))
-                        break;
-                }
-                catch (InterruptedException e)
-                {
-                    throw new AssertionError(e);
-                }
-            }
-        }
-
-        public void close() throws IOException
-        {
-            // stop the run loop.  this will result in closeInternal being called by the time join() finishes.
-            run = false;
-            interrupt();
-            try
-            {
-                this.join();
-            }
-            catch (InterruptedException e)
-            {
-                throw new AssertionError(e);
-            }
-
-            if (lastException != null)
-                throw lastException;
-        }
-
-        @SuppressWarnings("resource")
-        protected void closeInternal()
-        {
-            if (client != null)
-            {
-                TTransport transport = client.getOutputProtocol().getTransport();
-                if (transport.isOpen())
-                    transport.close();
-            }
-        }
-        
-        /**
-         * Loops collecting mutations from the queue and sending to Cassandra
-         */
-        public void run()
-        {
-            outer:
-            while (run || !queue.isEmpty())
-            {
-                Pair<ByteBuffer, Mutation> mutation;
-                try
-                {
-                    mutation = queue.take();
-                }
-                catch (InterruptedException e)
-                {
-                    // re-check loop condition after interrupt
-                    continue;
-                }
-
-                Map<ByteBuffer, Map<String, List<Mutation>>> batch = new HashMap<ByteBuffer, Map<String, List<Mutation>>>();
-                while (mutation != null)
-                {
-                    Map<String, List<Mutation>> subBatch = batch.get(mutation.left);
-                    if (subBatch == null)
-                    {
-                        subBatch = Collections.singletonMap(columnFamily, (List<Mutation>) new ArrayList<Mutation>());
-                        batch.put(mutation.left, subBatch);
-                    }
-
-                    subBatch.get(columnFamily).add(mutation.right);
-                    if (batch.size() >= batchThreshold)
-                        break;
-
-                    mutation = queue.poll();
-                }
-
-                Iterator<InetAddress> iter = endpoints.iterator();
-                while (true)
-                {
-                    // send the mutation to the last-used endpoint.  first time through, this will NPE harmlessly.
-                    try
-                    {
-                        client.batch_mutate(batch, consistencyLevel);
-                        break;
-                    }
-                    catch (Exception e)
-                    {
-                        closeInternal();
-                        if (!iter.hasNext())
-                        {
-                            lastException = new IOException(e);
-                            break outer;
-                        }
-                    }
-
-                    // attempt to connect to a different endpoint
-                    try
-                    {
-                        InetAddress address = iter.next();
-                        String host = address.getHostName();
-                        int port = ConfigHelper.getOutputRpcPort(conf);
-                        client = ColumnFamilyOutputFormat.createAuthenticatedClient(host, port, conf);
-                    }
-                    catch (Exception e)
-                    {
-                        closeInternal();
-                        // TException means something unexpected went wrong to that endpoint, so
-                        // we should try again to another.  Other exceptions (auth or invalid request) are fatal.
-                        if ((!(e instanceof TException)) || !iter.hasNext())
-                        {
-                            lastException = new IOException(e);
-                            break outer;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/hadoop/ConfigHelper.java b/src/java/org/apache/cassandra/hadoop/ConfigHelper.java
index 376c250..a4deb4a 100644
--- a/src/java/org/apache/cassandra/hadoop/ConfigHelper.java
+++ b/src/java/org/apache/cassandra/hadoop/ConfigHelper.java

@@ -1,4 +1,3 @@
-package org.apache.cassandra.hadoop;
 /*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -19,6 +18,7 @@
  * under the License.
  *
  */
+package org.apache.cassandra.hadoop;
 
 import java.io.IOException;
 import java.util.*;
@@ -27,7 +27,7 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.thrift.*;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Hex;
@@ -190,12 +190,12 @@
     }
 
     /**
-     * Set the size of the input split. getInputSplitSize value is used if this is not set.
+     * Set the size of the input split. setInputSplitSize value is used if this is not set.
      * This affects the number of maps created, if the number is too small
      * the overhead of each map will take up the bulk of the job time.
      *
-     * @param conf        Job configuration you are about to run
-     * @param splitSizeMb Input split size in MB
+     * @param conf          Job configuration you are about to run
+     * @param splitSizeMb   Input split size in MB
      */
     public static void setInputSplitSizeInMb(Configuration conf, int splitSizeMb)
     {
@@ -480,7 +480,7 @@
 
     public static String getOutputCompressionChunkLength(Configuration conf)
     {
-        return conf.get(OUTPUT_COMPRESSION_CHUNK_LENGTH, String.valueOf(CompressionParameters.DEFAULT_CHUNK_LENGTH));
+        return conf.get(OUTPUT_COMPRESSION_CHUNK_LENGTH, String.valueOf(CompressionParams.DEFAULT_CHUNK_LENGTH));
     }
 
     public static void setOutputCompressionClass(Configuration conf, String classname)
@@ -507,18 +507,6 @@
         return conf.getInt(THRIFT_FRAMED_TRANSPORT_SIZE_IN_MB, 15) * 1024 * 1024; // 15MB is default in Cassandra
     }
 
-    public static CompressionParameters getOutputCompressionParamaters(Configuration conf)
-    {
-        if (getOutputCompressionClass(conf) == null)
-            return new CompressionParameters(null);
-
-        Map<String, String> options = new HashMap<String, String>(2);
-        options.put(CompressionParameters.SSTABLE_COMPRESSION, getOutputCompressionClass(conf));
-        options.put(CompressionParameters.CHUNK_LENGTH_KB, getOutputCompressionChunkLength(conf));
-
-        return CompressionParameters.create(options);
-    }
-
     public static boolean getOutputLocalDCOnly(Configuration conf)
     {
         return Boolean.parseBoolean(conf.get(OUTPUT_LOCAL_DC_ONLY, "false"));

diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java
index d064e27..2ed37ee 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlBulkRecordWriter.java

@@ -34,7 +34,6 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.hadoop.BulkRecordWriter;
 import org.apache.cassandra.hadoop.ConfigHelper;
 import org.apache.cassandra.hadoop.HadoopCompat;
 import org.apache.cassandra.io.sstable.CQLSSTableWriter;
@@ -42,6 +41,7 @@
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.streaming.StreamState;
 import org.apache.cassandra.utils.NativeSSTableLoaderClient;
+import org.apache.cassandra.utils.OutputHandler;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
@@ -174,7 +174,7 @@
             ExternalClient externalClient = new ExternalClient(conf);
             externalClient.setTableMetadata(CFMetaData.compile(schema, keyspace));
 
-            loader = new SSTableLoader(outputDir, externalClient, new BulkRecordWriter.NullOutputHandler())
+            loader = new SSTableLoader(outputDir, externalClient, new NullOutputHandler())
             {
                 @Override
                 public void onSuccess(StreamState finalState)
@@ -309,4 +309,12 @@
             return addresses;
         }
     }
+
+    public static class NullOutputHandler implements OutputHandler
+    {
+        public void output(String msg) {}
+        public void debug(String msg) {}
+        public void warn(String msg) {}
+        public void warn(String msg, Throwable th) {}
+    }
 }

diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlConfigHelper.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlConfigHelper.java
index 35cdca8..757be65 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlConfigHelper.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlConfigHelper.java

@@ -40,6 +40,7 @@
 import com.datastax.driver.core.AuthProvider;
 import com.datastax.driver.core.Cluster;
 import com.datastax.driver.core.HostDistance;
+import com.datastax.driver.core.JdkSSLOptions;
 import com.datastax.driver.core.PlainTextAuthProvider;
 import com.datastax.driver.core.ProtocolVersion;
 import com.datastax.driver.core.policies.LoadBalancingPolicy;
@@ -445,7 +446,7 @@
             if (maxConnections.isPresent())
                 poolingOptions.setMaxConnectionsPerHost(hostDistance, maxConnections.get());
             if (maxSimultaneousRequests.isPresent())
-                poolingOptions.setMaxSimultaneousRequestsPerConnectionThreshold(hostDistance, maxSimultaneousRequests.get());
+                poolingOptions.setNewConnectionThreshold(hostDistance, maxSimultaneousRequests.get());
         }
 
         return poolingOptions;
@@ -501,11 +502,26 @@
         return new LimitedLocalNodeFirstLocalBalancingPolicy(stickHosts);
     }
 
+    private static Optional<AuthProvider> getDefaultAuthProvider(Configuration conf)
+    {
+        Optional<String> username = getStringSetting(USERNAME, conf);
+        Optional<String> password = getStringSetting(PASSWORD, conf);
+
+        if (username.isPresent() && password.isPresent())
+        {
+            return Optional.of(new PlainTextAuthProvider(username.get(), password.get()));
+        }
+        else
+        {
+            return Optional.absent();
+        }
+    }
+
     private static Optional<AuthProvider> getAuthProvider(Configuration conf)
     {
         Optional<String> authProvider = getInputNativeAuthProvider(conf);
         if (!authProvider.isPresent())
-            return Optional.absent();
+            return getDefaultAuthProvider(conf);
 
         return Optional.of(getClientAuthProvider(authProvider.get(), conf));
     }
@@ -530,10 +546,13 @@
             {
                 throw new RuntimeException(e);
             }
-            String[] css = SSLOptions.DEFAULT_SSL_CIPHER_SUITES;
+            String[] css = null;
             if (cipherSuites.isPresent())
                 css = cipherSuites.get().split(",");
-            return Optional.of(new SSLOptions(context,css));
+            return Optional.of(JdkSSLOptions.builder()
+                                            .withSSLContext(context)
+                                            .withCipherSuites(css)
+                                            .build());
         }
         return Optional.absent();
     }

diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java
index c46ceb8..a426532 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlInputFormat.java

@@ -18,18 +18,33 @@
 package org.apache.cassandra.hadoop.cql3;
 
 import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.*;
 
-import org.apache.cassandra.hadoop.HadoopCompat;
-import org.apache.cassandra.hadoop.AbstractColumnFamilyInputFormat;
-import org.apache.cassandra.hadoop.ReporterWrapper;
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.Host;
+import com.datastax.driver.core.Metadata;
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import com.datastax.driver.core.Session;
+import com.datastax.driver.core.TokenRange;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.thrift.KeyRange;
+import org.apache.cassandra.hadoop.*;
 
-import com.datastax.driver.core.Row;
+import static java.util.stream.Collectors.toMap;
 
 /**
  * Hadoop InputFormat allowing map/reduce against Cassandra rows within one ColumnFamily.
@@ -43,7 +58,7 @@
  *   1: ConfigHelper.setInputSplitSize. The default split size is 64k rows.
  *   or
  *   2: ConfigHelper.setInputSplitSizeInMb. InputSplit size in MB with new, more precise method
- *   If no value is provided for InputSplitSizeInMb, InputSplitSize will be used.
+ *   If no value is provided for InputSplitSizeInMb, we default to using InputSplitSize.
  *
  *   CQLConfigHelper.setInputCQLPageRowSize. The default page row size is 1000. You
  *   should set it to "as big as possible, but no bigger." It set the LIMIT for the CQL 
@@ -52,8 +67,14 @@
  *   
  *   other native protocol connection parameters in CqlConfigHelper
  */
-public class CqlInputFormat extends AbstractColumnFamilyInputFormat<Long, Row>
+public class CqlInputFormat extends org.apache.hadoop.mapreduce.InputFormat<Long, Row> implements org.apache.hadoop.mapred.InputFormat<Long, Row>
 {
+    public static final String MAPRED_TASK_ID = "mapred.task.id";
+    private static final Logger logger = LoggerFactory.getLogger(CqlInputFormat.class);
+    private String keyspace;
+    private String cfName;
+    private IPartitioner partitioner;
+
     public RecordReader<Long, Row> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter)
             throws IOException
     {
@@ -80,4 +101,248 @@
         return new CqlRecordReader();
     }
 
+    protected void validateConfiguration(Configuration conf)
+    {
+        if (ConfigHelper.getInputKeyspace(conf) == null || ConfigHelper.getInputColumnFamily(conf) == null)
+        {
+            throw new UnsupportedOperationException("you must set the keyspace and table with setInputColumnFamily()");
+        }
+        if (ConfigHelper.getInputInitialAddress(conf) == null)
+            throw new UnsupportedOperationException("You must set the initial output address to a Cassandra node with setInputInitialAddress");
+        if (ConfigHelper.getInputPartitioner(conf) == null)
+            throw new UnsupportedOperationException("You must set the Cassandra partitioner class with setInputPartitioner");
+    }
+
+    public List<org.apache.hadoop.mapreduce.InputSplit> getSplits(JobContext context) throws IOException
+    {
+        Configuration conf = HadoopCompat.getConfiguration(context);
+
+        validateConfiguration(conf);
+
+        keyspace = ConfigHelper.getInputKeyspace(conf);
+        cfName = ConfigHelper.getInputColumnFamily(conf);
+        partitioner = ConfigHelper.getInputPartitioner(conf);
+        logger.trace("partitioner is {}", partitioner);
+
+        // canonical ranges, split into pieces, fetching the splits in parallel
+        ExecutorService executor = new ThreadPoolExecutor(0, 128, 60L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
+        List<org.apache.hadoop.mapreduce.InputSplit> splits = new ArrayList<>();
+
+        try (Cluster cluster = CqlConfigHelper.getInputCluster(ConfigHelper.getInputInitialAddress(conf).split(","), conf);
+             Session session = cluster.connect())
+        {
+            List<Future<List<org.apache.hadoop.mapreduce.InputSplit>>> splitfutures = new ArrayList<>();
+            KeyRange jobKeyRange = ConfigHelper.getInputKeyRange(conf);
+            Range<Token> jobRange = null;
+            if (jobKeyRange != null)
+            {
+                if (jobKeyRange.start_key != null)
+                {
+                    if (!partitioner.preservesOrder())
+                        throw new UnsupportedOperationException("KeyRange based on keys can only be used with a order preserving partitioner");
+                    if (jobKeyRange.start_token != null)
+                        throw new IllegalArgumentException("only start_key supported");
+                    if (jobKeyRange.end_token != null)
+                        throw new IllegalArgumentException("only start_key supported");
+                    jobRange = new Range<>(partitioner.getToken(jobKeyRange.start_key),
+                                           partitioner.getToken(jobKeyRange.end_key));
+                }
+                else if (jobKeyRange.start_token != null)
+                {
+                    jobRange = new Range<>(partitioner.getTokenFactory().fromString(jobKeyRange.start_token),
+                                           partitioner.getTokenFactory().fromString(jobKeyRange.end_token));
+                }
+                else
+                {
+                    logger.warn("ignoring jobKeyRange specified without start_key or start_token");
+                }
+            }
+
+            Metadata metadata = cluster.getMetadata();
+
+            // canonical ranges and nodes holding replicas
+            Map<TokenRange, Set<Host>> masterRangeNodes = getRangeMap(keyspace, metadata);
+
+            for (TokenRange range : masterRangeNodes.keySet())
+            {
+                if (jobRange == null)
+                {
+                    // for each tokenRange, pick a live owner and ask it to compute bite-sized splits
+                    splitfutures.add(executor.submit(new SplitCallable(range, masterRangeNodes.get(range), conf, session)));
+                }
+                else
+                {
+                    TokenRange jobTokenRange = rangeToTokenRange(metadata, jobRange);
+                    if (range.intersects(jobTokenRange))
+                    {
+                        for (TokenRange intersection: range.intersectWith(jobTokenRange))
+                        {
+                            // for each tokenRange, pick a live owner and ask it to compute bite-sized splits
+                            splitfutures.add(executor.submit(new SplitCallable(intersection,  masterRangeNodes.get(range), conf, session)));
+                        }
+                    }
+                }
+            }
+
+            // wait until we have all the results back
+            for (Future<List<org.apache.hadoop.mapreduce.InputSplit>> futureInputSplits : splitfutures)
+            {
+                try
+                {
+                    splits.addAll(futureInputSplits.get());
+                }
+                catch (Exception e)
+                {
+                    throw new IOException("Could not get input splits", e);
+                }
+            }
+        }
+        finally
+        {
+            executor.shutdownNow();
+        }
+
+        assert splits.size() > 0;
+        Collections.shuffle(splits, new Random(System.nanoTime()));
+        return splits;
+    }
+
+    private TokenRange rangeToTokenRange(Metadata metadata, Range<Token> range)
+    {
+        return metadata.newTokenRange(metadata.newToken(partitioner.getTokenFactory().toString(range.left)),
+                metadata.newToken(partitioner.getTokenFactory().toString(range.right)));
+    }
+
+    private Map<TokenRange, Long> getSubSplits(String keyspace, String cfName, TokenRange range, Configuration conf, Session session) throws IOException
+    {
+        int splitSize = ConfigHelper.getInputSplitSize(conf);
+        int splitSizeMb = ConfigHelper.getInputSplitSizeInMb(conf);
+        try
+        {
+            return describeSplits(keyspace, cfName, range, splitSize, splitSizeMb, session);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private Map<TokenRange, Set<Host>> getRangeMap(String keyspace, Metadata metadata)
+    {
+        return metadata.getTokenRanges()
+                       .stream()
+                       .collect(toMap(p -> p, p -> metadata.getReplicas('"' + keyspace + '"', p)));
+    }
+
+    private Map<TokenRange, Long> describeSplits(String keyspace, String table, TokenRange tokenRange, int splitSize, int splitSizeMb, Session session)
+    {
+        String query = String.format("SELECT mean_partition_size, partitions_count " +
+                                     "FROM %s.%s " +
+                                     "WHERE keyspace_name = ? AND table_name = ? AND range_start = ? AND range_end = ?",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.SIZE_ESTIMATES);
+
+        ResultSet resultSet = session.execute(query, keyspace, table, tokenRange.getStart().toString(), tokenRange.getEnd().toString());
+
+        Row row = resultSet.one();
+
+        long meanPartitionSize = 0;
+        long partitionCount = 0;
+        int splitCount = 0;
+
+        if (row != null)
+        {
+            meanPartitionSize = row.getLong("mean_partition_size");
+            partitionCount = row.getLong("partitions_count");
+
+            splitCount = splitSizeMb > 0
+                ? (int)(meanPartitionSize * partitionCount / splitSizeMb / 1024 / 1024)
+                : (int)(partitionCount / splitSize);
+        }
+
+        // If we have no data on this split or the size estimate is 0,
+        // return the full split i.e., do not sub-split
+        // Assume smallest granularity of partition count available from CASSANDRA-7688
+        if (splitCount == 0)
+        {
+            Map<TokenRange, Long> wrappedTokenRange = new HashMap<>();
+            wrappedTokenRange.put(tokenRange, (long) 128);
+            return wrappedTokenRange;
+        }
+
+        List<TokenRange> splitRanges = tokenRange.splitEvenly(splitCount);
+        Map<TokenRange, Long> rangesWithLength = new HashMap<>();
+        for (TokenRange range : splitRanges)
+            rangesWithLength.put(range, partitionCount/splitCount);
+
+        return rangesWithLength;
+    }
+
+    // Old Hadoop API
+    public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException
+    {
+        TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
+        List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
+        InputSplit[] oldInputSplits = new InputSplit[newInputSplits.size()];
+        for (int i = 0; i < newInputSplits.size(); i++)
+            oldInputSplits[i] = (ColumnFamilySplit)newInputSplits.get(i);
+        return oldInputSplits;
+    }
+
+    /**
+     * Gets a token tokenRange and splits it up according to the suggested
+     * size into input splits that Hadoop can use.
+     */
+    class SplitCallable implements Callable<List<org.apache.hadoop.mapreduce.InputSplit>>
+    {
+
+        private final TokenRange tokenRange;
+        private final Set<Host> hosts;
+        private final Configuration conf;
+        private final Session session;
+
+        public SplitCallable(TokenRange tr, Set<Host> hosts, Configuration conf, Session session)
+        {
+            this.tokenRange = tr;
+            this.hosts = hosts;
+            this.conf = conf;
+            this.session = session;
+        }
+
+        public List<org.apache.hadoop.mapreduce.InputSplit> call() throws Exception
+        {
+            ArrayList<org.apache.hadoop.mapreduce.InputSplit> splits = new ArrayList<>();
+            Map<TokenRange, Long> subSplits;
+            subSplits = getSubSplits(keyspace, cfName, tokenRange, conf, session);
+            // turn the sub-ranges into InputSplits
+            String[] endpoints = new String[hosts.size()];
+
+            // hadoop needs hostname, not ip
+            int endpointIndex = 0;
+            for (Host endpoint : hosts)
+                endpoints[endpointIndex++] = endpoint.getAddress().getHostName();
+
+            boolean partitionerIsOpp = partitioner instanceof OrderPreservingPartitioner || partitioner instanceof ByteOrderedPartitioner;
+
+            for (TokenRange subSplit : subSplits.keySet())
+            {
+                List<TokenRange> ranges = subSplit.unwrap();
+                for (TokenRange subrange : ranges)
+                {
+                    ColumnFamilySplit split =
+                            new ColumnFamilySplit(
+                                    partitionerIsOpp ?
+                                            subrange.getStart().toString().substring(2) : subrange.getStart().toString(),
+                                    partitionerIsOpp ?
+                                            subrange.getEnd().toString().substring(2) : subrange.getEnd().toString(),
+                                    subSplits.get(subSplit),
+                                    endpoints);
+
+                    logger.trace("adding {}", split);
+                    splits.add(split);
+                }
+            }
+            return splits;
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java
index 9a1cda6..cc0a6b1 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlOutputFormat.java

@@ -55,6 +55,9 @@
 public class CqlOutputFormat extends OutputFormat<Map<String, ByteBuffer>, List<ByteBuffer>>
         implements org.apache.hadoop.mapred.OutputFormat<Map<String, ByteBuffer>, List<ByteBuffer>>
 {
+    public static final String BATCH_THRESHOLD = "mapreduce.output.columnfamilyoutputformat.batch.threshold";
+    public static final String QUEUE_SIZE = "mapreduce.output.columnfamilyoutputformat.queue.size";
+
     /**
      * Check for validity of the output-specification for the job.
      *

diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java
index b3e440d..8b04df3 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordReader.java

@@ -27,7 +27,9 @@
 import com.google.common.base.Function;
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
-import com.google.common.collect.AbstractIterator;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.utils.AbstractIterator;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Maps;
 import org.apache.commons.lang3.StringUtils;
@@ -340,12 +342,48 @@
         }
 
         @Override
+        public <T> T get(int i, Class<T> aClass)
+        {
+            return row.get(i, aClass);
+        }
+
+        @Override
+        public <T> T get(int i, TypeToken<T> typeToken)
+        {
+            return row.get(i, typeToken);
+        }
+
+        @Override
+        public <T> T get(int i, TypeCodec<T> typeCodec)
+        {
+            return row.get(i, typeCodec);
+        }
+
+        @Override
         public Object getObject(String s)
         {
             return row.getObject(s);
         }
 
         @Override
+        public <T> T get(String s, Class<T> aClass)
+        {
+            return row.get(s, aClass);
+        }
+
+        @Override
+        public <T> T get(String s, TypeToken<T> typeToken)
+        {
+            return row.get(s, typeToken);
+        }
+
+        @Override
+        public <T> T get(String s, TypeCodec<T> typeCodec)
+        {
+            return row.get(s, typeCodec);
+        }
+
+        @Override
         public boolean getBool(int i)
         {
             return row.getBool(i);

diff --git a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java
index e3d1772..d2a0d86 100644
--- a/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java
+++ b/src/java/org/apache/cassandra/hadoop/cql3/CqlRecordWriter.java

@@ -28,10 +28,10 @@
 
 import com.datastax.driver.core.*;
 import com.datastax.driver.core.exceptions.*;
+
 import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat;
 import org.apache.cassandra.hadoop.ConfigHelper;
 import org.apache.cassandra.hadoop.HadoopCompat;
 import org.apache.cassandra.utils.FBUtilities;
@@ -40,6 +40,8 @@
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.util.Progressable;
 
+import static java.util.stream.Collectors.toMap;
+
 /**
  * The <code>CqlRecordWriter</code> maps the output &lt;key, value&gt;
  * pairs to a Cassandra table. In particular, it applies the binded variables
@@ -109,30 +111,23 @@
     CqlRecordWriter(Configuration conf)
     {
         this.conf = conf;
-        this.queueSize = conf.getInt(ColumnFamilyOutputFormat.QUEUE_SIZE, 32 * FBUtilities.getAvailableProcessors());
-        batchThreshold = conf.getLong(ColumnFamilyOutputFormat.BATCH_THRESHOLD, 32);
+        this.queueSize = conf.getInt(CqlOutputFormat.QUEUE_SIZE, 32 * FBUtilities.getAvailableProcessors());
+        batchThreshold = conf.getLong(CqlOutputFormat.BATCH_THRESHOLD, 32);
         this.clients = new HashMap<>();
         String keyspace = ConfigHelper.getOutputKeyspace(conf);
 
-        try (Cluster cluster = CqlConfigHelper.getOutputCluster(ConfigHelper.getOutputInitialAddress(conf), conf);
-             Session client = cluster.connect(keyspace))
+        try (Cluster cluster = CqlConfigHelper.getOutputCluster(ConfigHelper.getOutputInitialAddress(conf), conf))
         {
-            ringCache = new NativeRingCache(conf);
-            if (client != null)
-            {
-                TableMetadata tableMetadata = client.getCluster().getMetadata().getKeyspace(client.getLoggedKeyspace()).getTable(ConfigHelper.getOutputColumnFamily(conf));
-                clusterColumns = tableMetadata.getClusteringColumns();
-                partitionKeyColumns = tableMetadata.getPartitionKey();
+            Metadata metadata = cluster.getMetadata();
+            ringCache = new NativeRingCache(conf, metadata);
+            TableMetadata tableMetadata = metadata.getKeyspace(Metadata.quote(keyspace)).getTable(ConfigHelper.getOutputColumnFamily(conf));
+            clusterColumns = tableMetadata.getClusteringColumns();
+            partitionKeyColumns = tableMetadata.getPartitionKey();
 
-                String cqlQuery = CqlConfigHelper.getOutputCql(conf).trim();
-                if (cqlQuery.toLowerCase().startsWith("insert"))
-                    throw new UnsupportedOperationException("INSERT with CqlRecordWriter is not supported, please use UPDATE/DELETE statement");
-                cql = appendKeyWhereClauses(cqlQuery);
-            }
-            else
-            {
-                throw new IllegalArgumentException("Invalid configuration specified " + conf);
-            }
+            String cqlQuery = CqlConfigHelper.getOutputCql(conf).trim();
+            if (cqlQuery.toLowerCase(Locale.ENGLISH).startsWith("insert"))
+                throw new UnsupportedOperationException("INSERT with CqlRecordWriter is not supported, please use UPDATE/DELETE statement");
+            cql = appendKeyWhereClauses(cqlQuery);
         }
         catch (Exception e)
         {
@@ -295,6 +290,7 @@
         public void run()
         {
             Session session = null;
+
             try
             {
                 outer:
@@ -383,10 +379,9 @@
             finally
             {
                 closeSession(session);
+                // close all our connections once we are done.
+                closeInternal();
             }
-
-            // close all our connections once we are done.
-            closeInternal();
         }
 
         /** get prepared statement id from cache, otherwise prepare it from Cassandra server*/
@@ -495,33 +490,20 @@
         return "\"" + identifier.replaceAll("\"", "\"\"") + "\"";
     }
 
-    class NativeRingCache
+    static class NativeRingCache
     {
-        private Map<TokenRange, Set<Host>> rangeMap;
-        private Metadata metadata;
+        private final Map<TokenRange, Set<Host>> rangeMap;
+        private final Metadata metadata;
         private final IPartitioner partitioner;
-        private final Configuration conf;
 
-        public NativeRingCache(Configuration conf)
+        public NativeRingCache(Configuration conf, Metadata metadata)
         {
-            this.conf = conf;
             this.partitioner = ConfigHelper.getOutputPartitioner(conf);
-            refreshEndpointMap();
-        }
-
-
-        private void refreshEndpointMap()
-        {
+            this.metadata = metadata;
             String keyspace = ConfigHelper.getOutputKeyspace(conf);
-            try (Cluster cluster = CqlConfigHelper.getOutputCluster(ConfigHelper.getOutputInitialAddress(conf), conf);
-                 Session session = cluster.connect(keyspace))
-            {
-                rangeMap = new HashMap<>();
-                metadata = session.getCluster().getMetadata();
-                Set<TokenRange> ranges = metadata.getTokenRanges();
-                for (TokenRange range : ranges)
-                    rangeMap.put(range, metadata.getReplicas(keyspace, range));
-            }
+            this.rangeMap = metadata.getTokenRanges()
+                                    .stream()
+                                    .collect(toMap(p -> p, p -> metadata.getReplicas('"' + keyspace + '"', p)));
         }
 
         public TokenRange getRange(ByteBuffer key)

diff --git a/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java b/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java
deleted file mode 100644
index 71fe037..0000000
--- a/src/java/org/apache/cassandra/hadoop/pig/CassandraStorage.java
+++ /dev/null

@@ -1,1413 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop.pig;
-
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.util.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.auth.PasswordAuthenticator;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.hadoop.ColumnFamilyRecordReader;
-import org.apache.cassandra.hadoop.ConfigHelper;
-import org.apache.cassandra.hadoop.HadoopCompat;
-import org.apache.cassandra.schema.LegacySchemaTables;
-import org.apache.cassandra.serializers.CollectionSerializer;
-import org.apache.cassandra.thrift.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Hex;
-import org.apache.cassandra.utils.UUIDGen;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.*;
-import org.apache.pig.Expression;
-import org.apache.pig.LoadFunc;
-import org.apache.pig.LoadMetadata;
-import org.apache.pig.ResourceSchema;
-import org.apache.pig.ResourceStatistics;
-import org.apache.pig.StoreFuncInterface;
-import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
-import org.apache.pig.data.*;
-import org.apache.pig.impl.util.UDFContext;
-import org.apache.pig.ResourceSchema.ResourceFieldSchema;
-import org.apache.thrift.TDeserializer;
-import org.apache.thrift.TException;
-import org.apache.thrift.TSerializer;
-import org.apache.thrift.protocol.TBinaryProtocol;
-
-/**
- * A LoadStoreFunc for retrieving data from and storing data to Cassandra
- *
- * A row from a standard CF will be returned as nested tuples: (key, ((name1, val1), (name2, val2))).
- */
-@Deprecated
-public class CassandraStorage extends LoadFunc implements StoreFuncInterface, LoadMetadata
-{
-    public final static String PIG_ALLOW_DELETES = "PIG_ALLOW_DELETES";
-    public final static String PIG_WIDEROW_INPUT = "PIG_WIDEROW_INPUT";
-    public final static String PIG_USE_SECONDARY = "PIG_USE_SECONDARY";
-
-    private final static ByteBuffer BOUND = ByteBufferUtil.EMPTY_BYTE_BUFFER;
-    private static final Logger logger = LoggerFactory.getLogger(CassandraStorage.class);
-
-    private ByteBuffer slice_start = BOUND;
-    private ByteBuffer slice_end = BOUND;
-    private boolean slice_reverse = false;
-    private boolean allow_deletes = false;
-
-    private RecordReader<ByteBuffer, Map<ByteBuffer, ColumnFamilyRecordReader.Column>> reader;
-    private RecordWriter<ByteBuffer, List<Mutation>> writer;
-
-    private boolean widerows = false;
-    private int limit;
-
-    protected String DEFAULT_INPUT_FORMAT;
-    protected String DEFAULT_OUTPUT_FORMAT;
-
-    protected enum MarshallerType { COMPARATOR, DEFAULT_VALIDATOR, KEY_VALIDATOR, SUBCOMPARATOR };
-
-    protected String username;
-    protected String password;
-    protected String keyspace;
-    protected String column_family;
-    protected String loadSignature;
-    protected String storeSignature;
-
-    protected Configuration conf;
-    protected String inputFormatClass;
-    protected String outputFormatClass;
-    protected int splitSize = 64 * 1024;
-    protected String partitionerClass;
-    protected boolean usePartitionFilter = false;
-    protected String initHostAddress;
-    protected String rpcPort;
-    protected int nativeProtocolVersion = 1;
-    
-    // wide row hacks
-    private ByteBuffer lastKey;
-    private Map<ByteBuffer, ColumnFamilyRecordReader.Column> lastRow;
-    private boolean hasNext = true;
-
-    public CassandraStorage()
-    {
-        this(1024);
-    }
-
-    /**@param limit number of columns to fetch in a slice */
-    public CassandraStorage(int limit)
-    {
-        super();
-        this.limit = limit;
-        DEFAULT_INPUT_FORMAT = "org.apache.cassandra.hadoop.ColumnFamilyInputFormat";
-        DEFAULT_OUTPUT_FORMAT = "org.apache.cassandra.hadoop.ColumnFamilyOutputFormat";
-    }
-
-    public int getLimit()
-    {
-        return limit;
-    }
-
-    public void prepareToRead(RecordReader reader, PigSplit split)
-    {
-        this.reader = reader;
-    }
-
-    /** read wide row*/
-    public Tuple getNextWide() throws IOException
-    {
-        CfDef cfDef = getCfDef(loadSignature);
-        ByteBuffer key = null;
-        Tuple tuple = null; 
-        DefaultDataBag bag = new DefaultDataBag();
-        try
-        {
-            while(true)
-            {
-                hasNext = reader.nextKeyValue();
-                if (!hasNext)
-                {
-                    if (tuple == null)
-                        tuple = TupleFactory.getInstance().newTuple();
-
-                    if (lastRow != null)
-                    {
-                        if (tuple.size() == 0) // lastRow is a new one
-                        {
-                            key = reader.getCurrentKey();
-                            tuple = keyToTuple(key, cfDef, parseType(cfDef.getKey_validation_class()));
-                        }
-                        for (Map.Entry<ByteBuffer, ColumnFamilyRecordReader.Column> entry : lastRow.entrySet())
-                        {
-                            bag.add(columnToTuple(entry.getValue(), cfDef, parseType(cfDef.getComparator_type())));
-                        }
-                        lastKey = null;
-                        lastRow = null;
-                        tuple.append(bag);
-                        return tuple;
-                    }
-                    else
-                    {
-                        if (tuple.size() == 1) // rare case of just one wide row, key already set
-                        {
-                            tuple.append(bag);
-                            return tuple;
-                        }
-                        else
-                            return null;
-                    }
-                }
-                if (key != null && !(reader.getCurrentKey()).equals(key)) // key changed
-                {
-                    // read too much, hold on to it for next time
-                    lastKey = reader.getCurrentKey();
-                    lastRow = reader.getCurrentValue();
-                    // but return what we have so far
-                    tuple.append(bag);
-                    return tuple;
-                }
-                if (key == null) // only set the key on the first iteration
-                {
-                    key = reader.getCurrentKey();
-                    if (lastKey != null && !(key.equals(lastKey))) // last key only had one value
-                    {
-                        if (tuple == null)
-                            tuple = keyToTuple(lastKey, cfDef, parseType(cfDef.getKey_validation_class()));
-                        else
-                            addKeyToTuple(tuple, lastKey, cfDef, parseType(cfDef.getKey_validation_class()));
-                        for (Map.Entry<ByteBuffer, ColumnFamilyRecordReader.Column> entry : lastRow.entrySet())
-                        {
-                            bag.add(columnToTuple(entry.getValue(), cfDef, parseType(cfDef.getComparator_type())));
-                        }
-                        tuple.append(bag);
-                        lastKey = key;
-                        lastRow = reader.getCurrentValue();
-                        return tuple;
-                    }
-                    if (tuple == null)
-                        tuple = keyToTuple(key, cfDef, parseType(cfDef.getKey_validation_class()));
-                    else
-                        addKeyToTuple(tuple, lastKey, cfDef, parseType(cfDef.getKey_validation_class()));
-                }
-                SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column> row =
-                    (SortedMap<ByteBuffer, ColumnFamilyRecordReader.Column>)reader.getCurrentValue();
-                if (lastRow != null) // prepend what was read last time
-                {
-                    for (Map.Entry<ByteBuffer, ColumnFamilyRecordReader.Column> entry : lastRow.entrySet())
-                    {
-                        bag.add(columnToTuple(entry.getValue(), cfDef, parseType(cfDef.getComparator_type())));
-                    }
-                    lastKey = null;
-                    lastRow = null;
-                }
-                for (Map.Entry<ByteBuffer, ColumnFamilyRecordReader.Column> entry : row.entrySet())
-                {
-                    bag.add(columnToTuple(entry.getValue(), cfDef, parseType(cfDef.getComparator_type())));
-                }
-            }
-        }
-        catch (InterruptedException e)
-        {
-            throw new IOException(e.getMessage());
-        }
-    }
-
-    /** read next row */
-    public Tuple getNext() throws IOException
-    {
-        if (widerows)
-            return getNextWide();
-        try
-        {
-            // load the next pair
-            if (!reader.nextKeyValue())
-                return null;
-
-            CfDef cfDef = getCfDef(loadSignature);
-            ByteBuffer key = reader.getCurrentKey();
-            Map<ByteBuffer, ColumnFamilyRecordReader.Column> cf = reader.getCurrentValue();
-            assert key != null && cf != null;
-
-            // output tuple, will hold the key, each indexed column in a tuple, then a bag of the rest
-            // NOTE: we're setting the tuple size here only for the key so we can use setTupleValue on it
-
-            Tuple tuple = keyToTuple(key, cfDef, parseType(cfDef.getKey_validation_class()));
-            DefaultDataBag bag = new DefaultDataBag();
-            // we must add all the indexed columns first to match the schema
-            Map<ByteBuffer, Boolean> added = new HashMap<ByteBuffer, Boolean>(cfDef.column_metadata.size());
-            // take care to iterate these in the same order as the schema does
-            for (ColumnDef cdef : cfDef.column_metadata)
-            {
-                boolean hasColumn = false;
-                boolean cql3Table = false;
-                try
-                {
-                    hasColumn = cf.containsKey(cdef.name);
-                }
-                catch (Exception e)
-                {
-                    cql3Table = true;
-                }
-                if (hasColumn)
-                {
-                    tuple.append(columnToTuple(cf.get(cdef.name), cfDef, parseType(cfDef.getComparator_type())));
-                }
-                else if (!cql3Table)
-                {   // otherwise, we need to add an empty tuple to take its place
-                    tuple.append(TupleFactory.getInstance().newTuple());
-                }
-                added.put(cdef.name, true);
-            }
-            // now add all the other columns
-            for (Map.Entry<ByteBuffer, ColumnFamilyRecordReader.Column> entry : cf.entrySet())
-            {
-                if (!added.containsKey(entry.getKey()))
-                    bag.add(columnToTuple(entry.getValue(), cfDef, parseType(cfDef.getComparator_type())));
-            }
-            tuple.append(bag);
-            // finally, special top-level indexes if needed
-            if (usePartitionFilter)
-            {
-                for (ColumnDef cdef : getIndexes())
-                {
-                    Tuple throwaway = columnToTuple(cf.get(cdef.name), cfDef, parseType(cfDef.getComparator_type()));
-                    tuple.append(throwaway.get(1));
-                }
-            }
-            return tuple;
-        }
-        catch (InterruptedException e)
-        {
-            throw new IOException(e.getMessage());
-        }
-    }
-
-    /** write next row */
-    public void putNext(Tuple t) throws IOException
-    {
-        /*
-        We support two cases for output:
-        First, the original output:
-            (key, (name, value), (name,value), {(name,value)}) (tuples or bag is optional)
-        For supers, we only accept the original output.
-        */
-
-        if (t.size() < 1)
-        {
-            // simply nothing here, we can't even delete without a key
-            logger.warn("Empty output skipped, filter empty tuples to suppress this warning");
-            return;
-        }
-        ByteBuffer key = objToBB(t.get(0));
-        if (t.getType(1) == DataType.TUPLE)
-            writeColumnsFromTuple(key, t, 1);
-        else if (t.getType(1) == DataType.BAG)
-        {
-            if (t.size() > 2)
-                throw new IOException("No arguments allowed after bag");
-            writeColumnsFromBag(key, (DataBag) t.get(1));
-        }
-        else
-            throw new IOException("Second argument in output must be a tuple or bag");
-    }
-
-    /** set hadoop cassandra connection settings */
-    protected void setConnectionInformation() throws IOException
-    {
-        StorageHelper.setConnectionInformation(conf);
-        if (System.getenv(StorageHelper.PIG_INPUT_FORMAT) != null)
-            inputFormatClass = getFullyQualifiedClassName(System.getenv(StorageHelper.PIG_INPUT_FORMAT));
-        else
-            inputFormatClass = DEFAULT_INPUT_FORMAT;
-        if (System.getenv(StorageHelper.PIG_OUTPUT_FORMAT) != null)
-            outputFormatClass = getFullyQualifiedClassName(System.getenv(StorageHelper.PIG_OUTPUT_FORMAT));
-        else
-            outputFormatClass = DEFAULT_OUTPUT_FORMAT;
-        if (System.getenv(PIG_ALLOW_DELETES) != null)
-            allow_deletes = Boolean.parseBoolean(System.getenv(PIG_ALLOW_DELETES));
-    }
-
-    /** get the full class name */
-    protected String getFullyQualifiedClassName(String classname)
-    {
-        return classname.contains(".") ? classname : "org.apache.cassandra.hadoop." + classname;
-    }
-
-    /** set read configuration settings */
-    public void setLocation(String location, Job job) throws IOException
-    {
-        conf = HadoopCompat.getConfiguration(job);
-        setLocationFromUri(location);
-
-        if (ConfigHelper.getInputSlicePredicate(conf) == null)
-        {
-            SliceRange range = new SliceRange(slice_start, slice_end, slice_reverse, limit);
-            SlicePredicate predicate = new SlicePredicate().setSlice_range(range);
-            ConfigHelper.setInputSlicePredicate(conf, predicate);
-        }
-        if (System.getenv(PIG_WIDEROW_INPUT) != null)
-            widerows = Boolean.parseBoolean(System.getenv(PIG_WIDEROW_INPUT));
-        if (System.getenv(PIG_USE_SECONDARY) != null)
-            usePartitionFilter = Boolean.parseBoolean(System.getenv(PIG_USE_SECONDARY));
-        if (System.getenv(StorageHelper.PIG_INPUT_SPLIT_SIZE) != null)
-        {
-            try
-            {
-                ConfigHelper.setInputSplitSize(conf, Integer.parseInt(System.getenv(StorageHelper.PIG_INPUT_SPLIT_SIZE)));
-            }
-            catch (NumberFormatException e)
-            {
-                throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
-            }           
-        } 
-
-        if (usePartitionFilter && getIndexExpressions() != null)
-            ConfigHelper.setInputRange(conf, getIndexExpressions());
-
-        if (username != null && password != null)
-            ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, username, password);
-
-        if (splitSize > 0)
-            ConfigHelper.setInputSplitSize(conf, splitSize);
-        if (partitionerClass!= null)
-            ConfigHelper.setInputPartitioner(conf, partitionerClass);
-        if (rpcPort != null)
-            ConfigHelper.setInputRpcPort(conf, rpcPort);
-        if (initHostAddress != null)
-            ConfigHelper.setInputInitialAddress(conf, initHostAddress);
-
-        ConfigHelper.setInputColumnFamily(conf, keyspace, column_family, widerows);
-        setConnectionInformation();
-
-        if (ConfigHelper.getInputRpcPort(conf) == 0)
-            throw new IOException("PIG_INPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
-        if (ConfigHelper.getInputInitialAddress(conf) == null)
-            throw new IOException("PIG_INPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
-        if (ConfigHelper.getInputPartitioner(conf) == null)
-            throw new IOException("PIG_INPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");
-        if (loadSignature == null)
-            loadSignature = location;
-        initSchema(loadSignature);
-    }
-
-    /** set store configuration settings */
-    public void setStoreLocation(String location, Job job) throws IOException
-    {
-        conf = HadoopCompat.getConfiguration(job);
-        
-        // don't combine mappers to a single mapper per node
-        conf.setBoolean("pig.noSplitCombination", true);
-        setLocationFromUri(location);
-
-        if (username != null && password != null)
-            ConfigHelper.setOutputKeyspaceUserNameAndPassword(conf, username, password);
-        if (splitSize > 0)
-            ConfigHelper.setInputSplitSize(conf, splitSize);
-        if (partitionerClass!= null)
-            ConfigHelper.setOutputPartitioner(conf, partitionerClass);
-        if (rpcPort != null)
-        {
-            ConfigHelper.setOutputRpcPort(conf, rpcPort);
-            ConfigHelper.setInputRpcPort(conf, rpcPort);
-        }
-        if (initHostAddress != null)
-        {
-            ConfigHelper.setOutputInitialAddress(conf, initHostAddress);
-            ConfigHelper.setInputInitialAddress(conf, initHostAddress);
-        }
-
-        ConfigHelper.setOutputColumnFamily(conf, keyspace, column_family);
-        setConnectionInformation();
-
-        if (ConfigHelper.getOutputRpcPort(conf) == 0)
-            throw new IOException("PIG_OUTPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
-        if (ConfigHelper.getOutputInitialAddress(conf) == null)
-            throw new IOException("PIG_OUTPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
-        if (ConfigHelper.getOutputPartitioner(conf) == null)
-            throw new IOException("PIG_OUTPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");
-
-        // we have to do this again here for the check in writeColumnsFromTuple
-        if (System.getenv(PIG_USE_SECONDARY) != null)
-            usePartitionFilter = Boolean.parseBoolean(System.getenv(PIG_USE_SECONDARY));
-
-        initSchema(storeSignature);
-    }
-
-    /** Methods to get the column family schema from Cassandra */
-    protected void initSchema(String signature) throws IOException
-    {
-        Properties properties = UDFContext.getUDFContext().getUDFProperties(CassandraStorage.class);
-
-        // Only get the schema if we haven't already gotten it
-        if (!properties.containsKey(signature))
-        {
-            try
-            {
-                Cassandra.Client client = ConfigHelper.getClientFromInputAddressList(conf);
-                client.set_keyspace(keyspace);
-
-                if (username != null && password != null)
-                {
-                    Map<String, String> credentials = new HashMap<String, String>(2);
-                    credentials.put(PasswordAuthenticator.USERNAME_KEY, username);
-                    credentials.put(PasswordAuthenticator.PASSWORD_KEY, password);
-
-                    try
-                    {
-                        client.login(new AuthenticationRequest(credentials));
-                    }
-                    catch (AuthenticationException e)
-                    {
-                        logger.error("Authentication exception: invalid username and/or password");
-                        throw new IOException(e);
-                    }
-                }
-
-                // compose the CfDef for the columfamily
-                CfDef cfDef = getCfDef(client);
-
-                if (cfDef != null)
-                {
-                    StringBuilder sb = new StringBuilder();
-                    sb.append(cfdefToString(cfDef));
-                    properties.setProperty(signature, sb.toString());
-                }
-                else
-                    throw new IOException(String.format("Table '%s' not found in keyspace '%s'",
-                            column_family,
-                            keyspace));
-            }
-            catch (Exception e)
-            {
-                throw new IOException(e);
-            }
-        }
-    }
-
-    public void checkSchema(ResourceSchema schema) throws IOException
-    {
-        // we don't care about types, they all get casted to ByteBuffers
-    }
-
-    /** define the schema */
-    public ResourceSchema getSchema(String location, Job job) throws IOException
-    {
-        setLocation(location, job);
-        CfDef cfDef = getCfDef(loadSignature);
-        if (cfDef.column_type.equals("Super"))
-            return null;
-        /*
-        Our returned schema should look like this:
-        (key, index1:(name, value), index2:(name, value), columns:{(name, value)})
-        Which is to say, columns that have metadata will be returned as named tuples, but unknown columns will go into a bag.
-        This way, wide rows can still be handled by the bag, but known columns can easily be referenced.
-         */
-
-        // top-level schema, no type
-        ResourceSchema schema = new ResourceSchema();
-
-        // get default marshallers and validators
-        Map<MarshallerType, AbstractType> marshallers = getDefaultMarshallers(cfDef);
-        Map<ByteBuffer,AbstractType> validators = getValidatorMap(cfDef);
-
-        // add key
-        ResourceFieldSchema keyFieldSchema = new ResourceFieldSchema();
-        keyFieldSchema.setName("key");
-        keyFieldSchema.setType(StorageHelper.getPigType(marshallers.get(MarshallerType.KEY_VALIDATOR)));
-
-        ResourceSchema bagSchema = new ResourceSchema();
-        ResourceFieldSchema bagField = new ResourceFieldSchema();
-        bagField.setType(DataType.BAG);
-        bagField.setName("columns");
-        // inside the bag, place one tuple with the default comparator/validator schema
-        ResourceSchema bagTupleSchema = new ResourceSchema();
-        ResourceFieldSchema bagTupleField = new ResourceFieldSchema();
-        bagTupleField.setType(DataType.TUPLE);
-        ResourceFieldSchema bagcolSchema = new ResourceFieldSchema();
-        ResourceFieldSchema bagvalSchema = new ResourceFieldSchema();
-        bagcolSchema.setName("name");
-        bagvalSchema.setName("value");
-        bagcolSchema.setType(StorageHelper.getPigType(marshallers.get(MarshallerType.COMPARATOR)));
-        bagvalSchema.setType(StorageHelper.getPigType(marshallers.get(MarshallerType.DEFAULT_VALIDATOR)));
-        bagTupleSchema.setFields(new ResourceFieldSchema[] { bagcolSchema, bagvalSchema });
-        bagTupleField.setSchema(bagTupleSchema);
-        bagSchema.setFields(new ResourceFieldSchema[] { bagTupleField });
-        bagField.setSchema(bagSchema);
-
-        // will contain all fields for this schema
-        List<ResourceFieldSchema> allSchemaFields = new ArrayList<ResourceFieldSchema>();
-        // add the key first, then the indexed columns, and finally the bag
-        allSchemaFields.add(keyFieldSchema);
-
-        if (!widerows)
-        {
-            // defined validators/indexes
-            for (ColumnDef cdef : cfDef.column_metadata)
-            {
-                // make a new tuple for each col/val pair
-                ResourceSchema innerTupleSchema = new ResourceSchema();
-                ResourceFieldSchema innerTupleField = new ResourceFieldSchema();
-                innerTupleField.setType(DataType.TUPLE);
-                innerTupleField.setSchema(innerTupleSchema);
-                innerTupleField.setName(new String(cdef.getName()));
-
-                ResourceFieldSchema idxColSchema = new ResourceFieldSchema();
-                idxColSchema.setName("name");
-                idxColSchema.setType(StorageHelper.getPigType(marshallers.get(MarshallerType.COMPARATOR)));
-
-                ResourceFieldSchema valSchema = new ResourceFieldSchema();
-                AbstractType validator = validators.get(cdef.name);
-                if (validator == null)
-                    validator = marshallers.get(MarshallerType.DEFAULT_VALIDATOR);
-                valSchema.setName("value");
-                valSchema.setType(StorageHelper.getPigType(validator));
-
-                innerTupleSchema.setFields(new ResourceFieldSchema[] { idxColSchema, valSchema });
-                allSchemaFields.add(innerTupleField);
-            }   
-        }
-
-        // bag at the end for unknown columns
-        allSchemaFields.add(bagField);
-
-        // add top-level index elements if needed
-        if (usePartitionFilter)
-        {
-            for (ColumnDef cdef : getIndexes())
-            {
-                ResourceFieldSchema idxSchema = new ResourceFieldSchema();
-                idxSchema.setName("index_" + new String(cdef.getName()));
-                AbstractType validator = validators.get(cdef.name);
-                if (validator == null)
-                    validator = marshallers.get(MarshallerType.DEFAULT_VALIDATOR);
-                idxSchema.setType(StorageHelper.getPigType(validator));
-                allSchemaFields.add(idxSchema);
-            }
-        }
-        // top level schema contains everything
-        schema.setFields(allSchemaFields.toArray(new ResourceFieldSchema[allSchemaFields.size()]));
-        return schema;
-    }
-
-    /** set partition filter */
-    public void setPartitionFilter(Expression partitionFilter) throws IOException
-    {
-        UDFContext context = UDFContext.getUDFContext();
-        Properties property = context.getUDFProperties(CassandraStorage.class);
-        property.setProperty(StorageHelper.PARTITION_FILTER_SIGNATURE, indexExpressionsToString(filterToIndexExpressions(partitionFilter)));
-    }
-
-    /** prepare writer */
-    public void prepareToWrite(RecordWriter writer)
-    {
-        this.writer = writer;
-    }
-
-    /** convert object to ByteBuffer */
-    protected ByteBuffer objToBB(Object o)
-    {
-        if (o == null)
-            return nullToBB();
-        if (o instanceof java.lang.String)
-            return ByteBuffer.wrap(new DataByteArray((String)o).get());
-        if (o instanceof Integer)
-            return Int32Type.instance.decompose((Integer)o);
-        if (o instanceof Long)
-            return LongType.instance.decompose((Long)o);
-        if (o instanceof Float)
-            return FloatType.instance.decompose((Float)o);
-        if (o instanceof Double)
-            return DoubleType.instance.decompose((Double)o);
-        if (o instanceof UUID)
-            return ByteBuffer.wrap(UUIDGen.decompose((UUID) o));
-        if(o instanceof Tuple) {
-            List<Object> objects = ((Tuple)o).getAll();
-            //collections
-            if (objects.size() > 0 && objects.get(0) instanceof String)
-            {
-                String collectionType = (String) objects.get(0);
-                if ("set".equalsIgnoreCase(collectionType) ||
-                        "list".equalsIgnoreCase(collectionType))
-                    return objToListOrSetBB(objects.subList(1, objects.size()));
-                else if ("map".equalsIgnoreCase(collectionType))
-                    return objToMapBB(objects.subList(1, objects.size()));
-
-            }
-            return objToCompositeBB(objects);
-        }
-
-        return ByteBuffer.wrap(((DataByteArray) o).get());
-    }
-
-    private ByteBuffer objToListOrSetBB(List<Object> objects)
-    {
-        List<ByteBuffer> serialized = new ArrayList<ByteBuffer>(objects.size());
-        for(Object sub : objects)
-        {
-            ByteBuffer buffer = objToBB(sub);
-            serialized.add(buffer);
-        }
-        // NOTE: using protocol v1 serialization format for collections so as to not break
-        // compatibility. Not sure if that's the right thing.
-        return CollectionSerializer.pack(serialized, objects.size(), 1);
-    }
-
-    private ByteBuffer objToMapBB(List<Object> objects)
-    {
-        List<ByteBuffer> serialized = new ArrayList<ByteBuffer>(objects.size() * 2);
-        for(Object sub : objects)
-        {
-            List<Object> keyValue = ((Tuple)sub).getAll();
-            for (Object entry: keyValue)
-            {
-                ByteBuffer buffer = objToBB(entry);
-                serialized.add(buffer);
-            }
-        }
-        // NOTE: using protocol v1 serialization format for collections so as to not break
-        // compatibility. Not sure if that's the right thing.
-        return CollectionSerializer.pack(serialized, objects.size(), 1);
-    }
-
-    private ByteBuffer objToCompositeBB(List<Object> objects)
-    {
-        List<ByteBuffer> serialized = new ArrayList<ByteBuffer>(objects.size());
-        int totalLength = 0;
-        for(Object sub : objects)
-        {
-            ByteBuffer buffer = objToBB(sub);
-            serialized.add(buffer);
-            totalLength += 2 + buffer.remaining() + 1;
-        }
-        ByteBuffer out = ByteBuffer.allocate(totalLength);
-        for (ByteBuffer bb : serialized)
-        {
-            int length = bb.remaining();
-            out.put((byte) ((length >> 8) & 0xFF));
-            out.put((byte) (length & 0xFF));
-            out.put(bb);
-            out.put((byte) 0);
-        }
-        out.flip();
-        return out;
-    }
-
-    /** write tuple data to cassandra */
-    private void writeColumnsFromTuple(ByteBuffer key, Tuple t, int offset) throws IOException
-    {
-        ArrayList<Mutation> mutationList = new ArrayList<Mutation>();
-        for (int i = offset; i < t.size(); i++)
-        {
-            if (t.getType(i) == DataType.BAG)
-                writeColumnsFromBag(key, (DataBag) t.get(i));
-            else if (t.getType(i) == DataType.TUPLE)
-            {
-                Tuple inner = (Tuple) t.get(i);
-                if (inner.size() > 0) // may be empty, for an indexed column that wasn't present
-                    mutationList.add(mutationFromTuple(inner));
-            }
-            else if (!usePartitionFilter)
-            {
-                throw new IOException("Output type was not a bag or a tuple");
-            }
-        }
-        if (mutationList.size() > 0)
-            writeMutations(key, mutationList);
-    }
-
-    /** compose Cassandra mutation from tuple */
-    private Mutation mutationFromTuple(Tuple t) throws IOException
-    {
-        Mutation mutation = new Mutation();
-        if (t.get(1) == null)
-        {
-            if (allow_deletes)
-            {
-                mutation.deletion = new Deletion();
-                mutation.deletion.predicate = new org.apache.cassandra.thrift.SlicePredicate();
-                mutation.deletion.predicate.column_names = Arrays.asList(objToBB(t.get(0)));
-                mutation.deletion.setTimestamp(FBUtilities.timestampMicros());
-            }
-            else
-                throw new IOException("null found but deletes are disabled, set " + PIG_ALLOW_DELETES +
-                    "=true in environment or allow_deletes=true in URL to enable");
-        }
-        else
-        {
-            org.apache.cassandra.thrift.Column column = new org.apache.cassandra.thrift.Column();
-            column.setName(objToBB(t.get(0)));
-            column.setValue(objToBB(t.get(1)));
-            column.setTimestamp(FBUtilities.timestampMicros());
-            mutation.column_or_supercolumn = new ColumnOrSuperColumn();
-            mutation.column_or_supercolumn.column = column;
-        }
-        return mutation;
-    }
-
-    /** write bag data to Cassandra */
-    private void writeColumnsFromBag(ByteBuffer key, DataBag bag) throws IOException
-    {
-        List<Mutation> mutationList = new ArrayList<Mutation>();
-        for (Tuple pair : bag)
-        {
-            Mutation mutation = new Mutation();
-            if (DataType.findType(pair.get(1)) == DataType.BAG) // supercolumn
-            {
-                SuperColumn sc = new SuperColumn();
-                sc.setName(objToBB(pair.get(0)));
-                List<org.apache.cassandra.thrift.Column> columns = new ArrayList<org.apache.cassandra.thrift.Column>();
-                for (Tuple subcol : (DataBag) pair.get(1))
-                {
-                    org.apache.cassandra.thrift.Column column = new org.apache.cassandra.thrift.Column();
-                    column.setName(objToBB(subcol.get(0)));
-                    column.setValue(objToBB(subcol.get(1)));
-                    column.setTimestamp(FBUtilities.timestampMicros());
-                    columns.add(column);
-                }
-                if (columns.isEmpty())
-                {
-                    if (allow_deletes)
-                    {
-                        mutation.deletion = new Deletion();
-                        mutation.deletion.super_column = objToBB(pair.get(0));
-                        mutation.deletion.setTimestamp(FBUtilities.timestampMicros());
-                    }
-                    else
-                        throw new IOException("SuperColumn deletion attempted with empty bag, but deletes are disabled, set " +
-                            PIG_ALLOW_DELETES + "=true in environment or allow_deletes=true in URL to enable");
-                }
-                else
-                {
-                    sc.columns = columns;
-                    mutation.column_or_supercolumn = new ColumnOrSuperColumn();
-                    mutation.column_or_supercolumn.super_column = sc;
-                }
-            }
-            else
-                mutation = mutationFromTuple(pair);
-            mutationList.add(mutation);
-            // for wide rows, we need to limit the amount of mutations we write at once
-            if (mutationList.size() >= 10) // arbitrary, CFOF will re-batch this up, and BOF won't care
-            {
-                writeMutations(key, mutationList);
-                mutationList.clear();
-            }
-        }
-        // write the last batch
-        if (mutationList.size() > 0)
-            writeMutations(key, mutationList);
-    }
-
-    /** write mutation to Cassandra */
-    private void writeMutations(ByteBuffer key, List<Mutation> mutations) throws IOException
-    {
-        try
-        {
-            writer.write(key, mutations);
-        }
-        catch (InterruptedException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    /** get a list of columns with defined index*/
-    protected List<ColumnDef> getIndexes() throws IOException
-    {
-        CfDef cfdef = getCfDef(loadSignature);
-        List<ColumnDef> indexes = new ArrayList<ColumnDef>();
-        for (ColumnDef cdef : cfdef.column_metadata)
-        {
-            if (cdef.index_type != null)
-                indexes.add(cdef);
-        }
-        return indexes;
-    }
-
-    /** get a list of Cassandra IndexExpression from Pig expression */
-    private List<IndexExpression> filterToIndexExpressions(Expression expression) throws IOException
-    {
-        List<IndexExpression> indexExpressions = new ArrayList<IndexExpression>();
-        Expression.BinaryExpression be = (Expression.BinaryExpression)expression;
-        ByteBuffer name = ByteBuffer.wrap(be.getLhs().toString().getBytes());
-        ByteBuffer value = ByteBuffer.wrap(be.getRhs().toString().getBytes());
-        switch (expression.getOpType())
-        {
-            case OP_EQ:
-                indexExpressions.add(new IndexExpression(name, IndexOperator.EQ, value));
-                break;
-            case OP_GE:
-                indexExpressions.add(new IndexExpression(name, IndexOperator.GTE, value));
-                break;
-            case OP_GT:
-                indexExpressions.add(new IndexExpression(name, IndexOperator.GT, value));
-                break;
-            case OP_LE:
-                indexExpressions.add(new IndexExpression(name, IndexOperator.LTE, value));
-                break;
-            case OP_LT:
-                indexExpressions.add(new IndexExpression(name, IndexOperator.LT, value));
-                break;
-            case OP_AND:
-                indexExpressions.addAll(filterToIndexExpressions(be.getLhs()));
-                indexExpressions.addAll(filterToIndexExpressions(be.getRhs()));
-                break;
-            default:
-                throw new IOException("Unsupported expression type: " + expression.getOpType().name());
-        }
-        return indexExpressions;
-    }
-
-    /** convert a list of index expression to string */
-    private static String indexExpressionsToString(List<IndexExpression> indexExpressions) throws IOException
-    {
-        assert indexExpressions != null;
-        // oh, you thought cfdefToString was awful?
-        IndexClause indexClause = new IndexClause();
-        indexClause.setExpressions(indexExpressions);
-        indexClause.setStart_key("".getBytes());
-        TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
-        try
-        {
-            return Hex.bytesToHex(serializer.serialize(indexClause));
-        }
-        catch (TException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    /** convert string to a list of index expression */
-    private static List<IndexExpression> indexExpressionsFromString(String ie) throws IOException
-    {
-        assert ie != null;
-        TDeserializer deserializer = new TDeserializer(new TBinaryProtocol.Factory());
-        IndexClause indexClause = new IndexClause();
-        try
-        {
-            deserializer.deserialize(indexClause, Hex.hexToBytes(ie));
-        }
-        catch (TException e)
-        {
-            throw new IOException(e);
-        }
-        return indexClause.getExpressions();
-    }
-
-    public ResourceStatistics getStatistics(String location, Job job)
-    {
-        return null;
-    }
-
-    public void cleanupOnFailure(String failure, Job job)
-    {
-    }
-
-    public void cleanupOnSuccess(String location, Job job) throws IOException {
-    }
-
-
-    /** StoreFunc methods */
-    public void setStoreFuncUDFContextSignature(String signature)
-    {
-        this.storeSignature = signature;
-    }
-
-    public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException
-    {
-        return relativeToAbsolutePath(location, curDir);
-    }
-
-    /** output format */
-    public OutputFormat getOutputFormat() throws IOException
-    {
-        try
-        {
-            return FBUtilities.construct(outputFormatClass, "outputformat");
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-
-    @Override
-    public InputFormat getInputFormat() throws IOException
-    {
-        try
-        {
-            return FBUtilities.construct(inputFormatClass, "inputformat");
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    /** get a list of index expression */
-    private List<IndexExpression> getIndexExpressions() throws IOException
-    {
-        UDFContext context = UDFContext.getUDFContext();
-        Properties property = context.getUDFProperties(CassandraStorage.class);
-        if (property.getProperty(StorageHelper.PARTITION_FILTER_SIGNATURE) != null)
-            return indexExpressionsFromString(property.getProperty(StorageHelper.PARTITION_FILTER_SIGNATURE));
-        else
-            return null;
-    }
-
-    /** get a list of column for the column family */
-    protected List<ColumnDef> getColumnMetadata(Cassandra.Client client)
-    throws TException, CharacterCodingException, InvalidRequestException, ConfigurationException
-    {   
-        return getColumnMeta(client, true, true);
-    }
-
-
-    /** get column meta data */
-    protected List<ColumnDef> getColumnMeta(Cassandra.Client client, boolean cassandraStorage, boolean includeCompactValueColumn)
-            throws org.apache.cassandra.thrift.InvalidRequestException,
-            UnavailableException,
-            TimedOutException,
-            SchemaDisagreementException,
-            TException,
-            CharacterCodingException,
-            org.apache.cassandra.exceptions.InvalidRequestException,
-            ConfigurationException,
-            NotFoundException
-    {
-        String query = String.format("SELECT column_name, validator, index_type, type " +
-                        "FROM %s.%s " +
-                        "WHERE keyspace_name = '%s' AND columnfamily_name = '%s'",
-                SystemKeyspace.NAME,
-                LegacySchemaTables.COLUMNS,
-                keyspace,
-                column_family);
-
-        CqlResult result = client.execute_cql3_query(ByteBufferUtil.bytes(query), Compression.NONE, ConsistencyLevel.ONE);
-
-        List<CqlRow> rows = result.rows;
-        List<ColumnDef> columnDefs = new ArrayList<ColumnDef>();
-        if (rows == null || rows.isEmpty())
-        {
-            // if CassandraStorage, just return the empty list
-            if (cassandraStorage)
-                return columnDefs;
-
-            // otherwise for CqlNativeStorage, check metadata for classic thrift tables
-            CFMetaData cfm = getCFMetaData(keyspace, column_family, client);
-            for (ColumnDefinition def : cfm.regularAndStaticColumns())
-            {
-                ColumnDef cDef = new ColumnDef();
-                String columnName = def.name.toString();
-                String type = def.type.toString();
-                logger.trace("name: {}, type: {} ", columnName, type);
-                cDef.name = ByteBufferUtil.bytes(columnName);
-                cDef.validation_class = type;
-                columnDefs.add(cDef);
-            }
-            // we may not need to include the value column for compact tables as we
-            // could have already processed it as schema_columnfamilies.value_alias
-            if (columnDefs.size() == 0 && includeCompactValueColumn && cfm.compactValueColumn() != null)
-            {
-                ColumnDefinition def = cfm.compactValueColumn();
-                if ("value".equals(def.name.toString()))
-                {
-                    ColumnDef cDef = new ColumnDef();
-                    cDef.name = def.name.bytes;
-                    cDef.validation_class = def.type.toString();
-                    columnDefs.add(cDef);
-                }
-            }
-            return columnDefs;
-        }
-
-        Iterator<CqlRow> iterator = rows.iterator();
-        while (iterator.hasNext())
-        {
-            CqlRow row = iterator.next();
-            ColumnDef cDef = new ColumnDef();
-            String type = ByteBufferUtil.string(row.getColumns().get(3).value);
-            if (!type.equals("regular"))
-                continue;
-            cDef.setName(ByteBufferUtil.clone(row.getColumns().get(0).value));
-            cDef.validation_class = ByteBufferUtil.string(row.getColumns().get(1).value);
-            ByteBuffer indexType = row.getColumns().get(2).value;
-            if (indexType != null)
-                cDef.index_type = getIndexType(ByteBufferUtil.string(indexType));
-            columnDefs.add(cDef);
-        }
-        return columnDefs;
-    }
-
-
-    /** get CFMetaData of a column family */
-    protected CFMetaData getCFMetaData(String ks, String cf, Cassandra.Client client)
-            throws NotFoundException,
-            org.apache.cassandra.thrift.InvalidRequestException,
-            TException,
-            org.apache.cassandra.exceptions.InvalidRequestException,
-            ConfigurationException
-    {
-        KsDef ksDef = client.describe_keyspace(ks);
-        for (CfDef cfDef : ksDef.cf_defs)
-        {
-            if (cfDef.name.equalsIgnoreCase(cf))
-                return ThriftConversion.fromThrift(cfDef);
-        }
-        return null;
-    }
-
-    /** get index type from string */
-    protected IndexType getIndexType(String type)
-    {
-        type = type.toLowerCase();
-        if ("keys".equals(type))
-            return IndexType.KEYS;
-        else if("custom".equals(type))
-            return IndexType.CUSTOM;
-        else if("composites".equals(type))
-            return IndexType.COMPOSITES;
-        else
-            return null;
-    }
-
-    /** return partition keys */
-    public String[] getPartitionKeys(String location, Job job) throws IOException
-    {
-        if (!usePartitionFilter)
-            return null;
-        List<ColumnDef> indexes = getIndexes();
-        String[] partitionKeys = new String[indexes.size()];
-        for (int i = 0; i < indexes.size(); i++)
-        {
-            partitionKeys[i] = new String(indexes.get(i).getName());
-        }
-        return partitionKeys;
-    }
-
-    /** convert key to a tuple */
-    private Tuple keyToTuple(ByteBuffer key, CfDef cfDef, AbstractType comparator) throws IOException
-    {
-        Tuple tuple = TupleFactory.getInstance().newTuple(1);
-        addKeyToTuple(tuple, key, cfDef, comparator);
-        return tuple;
-    }
-
-    /** add key to a tuple */
-    private void addKeyToTuple(Tuple tuple, ByteBuffer key, CfDef cfDef, AbstractType comparator) throws IOException
-    {
-        if( comparator instanceof AbstractCompositeType )
-        {
-            StorageHelper.setTupleValue(tuple, 0, composeComposite((AbstractCompositeType) comparator, key));
-        }
-        else
-        {
-            StorageHelper.setTupleValue(tuple, 0, StorageHelper.cassandraToObj(getDefaultMarshallers(cfDef).get(MarshallerType.KEY_VALIDATOR), key, nativeProtocolVersion));
-        }
-
-    }
-
-    /** Deconstructs a composite type to a Tuple. */
-    protected Tuple composeComposite(AbstractCompositeType comparator, ByteBuffer name) throws IOException
-    {
-        List<AbstractCompositeType.CompositeComponent> result = comparator.deconstruct(name);
-        Tuple t = TupleFactory.getInstance().newTuple(result.size());
-        for (int i=0; i<result.size(); i++)
-            StorageHelper.setTupleValue(t, i, StorageHelper.cassandraToObj(result.get(i).comparator, result.get(i).value, nativeProtocolVersion));
-
-        return t;
-    }
-
-    /** cassandra://[username:password@]<keyspace>/<columnfamily>[?slice_start=<start>&slice_end=<end>
-     * [&reversed=true][&limit=1][&allow_deletes=true][&widerows=true]
-     * [&use_secondary=true][&comparator=<comparator>][&partitioner=<partitioner>]]*/
-    private void setLocationFromUri(String location) throws IOException
-    {
-        try
-        {
-            if (!location.startsWith("cassandra://"))
-                throw new Exception("Bad scheme." + location);
-            
-            String[] urlParts = location.split("\\?");
-            if (urlParts.length > 1)
-            {
-                Map<String, String> urlQuery = getQueryMap(urlParts[1]);
-                AbstractType comparator = BytesType.instance;
-                if (urlQuery.containsKey("comparator"))
-                    comparator = TypeParser.parse(urlQuery.get("comparator"));
-                if (urlQuery.containsKey("slice_start"))
-                    slice_start = comparator.fromString(urlQuery.get("slice_start"));
-                if (urlQuery.containsKey("slice_end"))
-                    slice_end = comparator.fromString(urlQuery.get("slice_end"));
-                if (urlQuery.containsKey("reversed"))
-                    slice_reverse = Boolean.parseBoolean(urlQuery.get("reversed"));
-                if (urlQuery.containsKey("limit"))
-                    limit = Integer.parseInt(urlQuery.get("limit"));
-                if (urlQuery.containsKey("allow_deletes"))
-                    allow_deletes = Boolean.parseBoolean(urlQuery.get("allow_deletes"));
-                if (urlQuery.containsKey("widerows"))
-                    widerows = Boolean.parseBoolean(urlQuery.get("widerows"));
-                if (urlQuery.containsKey("use_secondary"))
-                    usePartitionFilter = Boolean.parseBoolean(urlQuery.get("use_secondary"));
-                if (urlQuery.containsKey("split_size"))
-                    splitSize = Integer.parseInt(urlQuery.get("split_size"));
-                if (urlQuery.containsKey("partitioner"))
-                    partitionerClass = urlQuery.get("partitioner");
-                if (urlQuery.containsKey("init_address"))
-                    initHostAddress = urlQuery.get("init_address");
-                if (urlQuery.containsKey("rpc_port"))
-                    rpcPort = urlQuery.get("rpc_port");
-            }
-            String[] parts = urlParts[0].split("/+");
-            String[] credentialsAndKeyspace = parts[1].split("@");
-            if (credentialsAndKeyspace.length > 1)
-            {
-                String[] credentials = credentialsAndKeyspace[0].split(":");
-                username = credentials[0];
-                password = credentials[1];
-                keyspace = credentialsAndKeyspace[1];
-            }
-            else
-            {
-                keyspace = parts[1];
-            }
-            column_family = parts[2];
-        }
-        catch (Exception e)
-        {
-            throw new IOException("Expected 'cassandra://[username:password@]<keyspace>/<table>" +
-                    "[?slice_start=<start>&slice_end=<end>[&reversed=true][&limit=1]" +
-                    "[&allow_deletes=true][&widerows=true][&use_secondary=true]" +
-                    "[&comparator=<comparator>][&split_size=<size>][&partitioner=<partitioner>]" +
-                    "[&init_address=<host>][&rpc_port=<port>]]': " + e.getMessage());
-        }
-    }
-
-
-    /** decompose the query to store the parameters in a map */
-    public static Map<String, String> getQueryMap(String query) throws UnsupportedEncodingException
-    {
-        String[] params = query.split("&");
-        Map<String, String> map = new HashMap<String, String>(params.length);
-        for (String param : params)
-        {
-            String[] keyValue = param.split("=");
-            map.put(keyValue[0], URLDecoder.decode(keyValue[1], "UTF-8"));
-        }
-        return map;
-    }
-
-    public ByteBuffer nullToBB()
-    {
-        return null;
-    }
-
-    /** return the CfInfo for the column family */
-    protected CfDef getCfDef(Cassandra.Client client)
-            throws org.apache.cassandra.thrift.InvalidRequestException,
-            UnavailableException,
-            TimedOutException,
-            SchemaDisagreementException,
-            TException,
-            NotFoundException,
-            org.apache.cassandra.exceptions.InvalidRequestException,
-            ConfigurationException,
-            IOException
-    {
-        // get CF meta data
-        String query = String.format("SELECT type, comparator, subcomparator, default_validator, key_validator " +
-                        "FROM %s.%s " +
-                        "WHERE keyspace_name = '%s' AND columnfamily_name = '%s'",
-                SystemKeyspace.NAME,
-                LegacySchemaTables.COLUMNFAMILIES,
-                keyspace,
-                column_family);
-
-        CqlResult result = client.execute_cql3_query(ByteBufferUtil.bytes(query), Compression.NONE, ConsistencyLevel.ONE);
-
-        if (result == null || result.rows == null || result.rows.isEmpty())
-            return null;
-
-        Iterator<CqlRow> iteraRow = result.rows.iterator();
-        CfDef cfDef = new CfDef();
-        cfDef.keyspace = keyspace;
-        cfDef.name = column_family;
-        if (iteraRow.hasNext())
-        {
-            CqlRow cqlRow = iteraRow.next();
-
-            cfDef.column_type = ByteBufferUtil.string(cqlRow.columns.get(0).value);
-            cfDef.comparator_type = ByteBufferUtil.string(cqlRow.columns.get(1).value);
-            ByteBuffer subComparator = cqlRow.columns.get(2).value;
-            if (subComparator != null)
-                cfDef.subcomparator_type = ByteBufferUtil.string(subComparator);
-            cfDef.default_validation_class = ByteBufferUtil.string(cqlRow.columns.get(3).value);
-            cfDef.key_validation_class = ByteBufferUtil.string(cqlRow.columns.get(4).value);
-        }
-        cfDef.column_metadata = getColumnMetadata(client);
-        return cfDef;
-    }
-
-    /** get the columnfamily definition for the signature */
-    protected CfDef getCfDef(String signature) throws IOException
-    {
-        UDFContext context = UDFContext.getUDFContext();
-        Properties property = context.getUDFProperties(CassandraStorage.class);
-        String prop = property.getProperty(signature);
-        return cfdefFromString(prop);
-    }
-
-    /** convert string back to CfDef */
-    protected static CfDef cfdefFromString(String st) throws IOException
-    {
-        assert st != null;
-        TDeserializer deserializer = new TDeserializer(new TBinaryProtocol.Factory());
-        CfDef cfDef = new CfDef();
-        try
-        {
-            deserializer.deserialize(cfDef, Hex.hexToBytes(st));
-        }
-        catch (TException e)
-        {
-            throw new IOException(e);
-        }
-        return cfDef;
-    }
-
-    /** convert CfDef to string */
-    protected static String cfdefToString(CfDef cfDef) throws IOException
-    {
-        assert cfDef != null;
-        // this is so awful it's kind of cool!
-        TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
-        try
-        {
-            return Hex.bytesToHex(serializer.serialize(cfDef));
-        }
-        catch (TException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    /** parse the string to a cassandra data type */
-    protected AbstractType parseType(String type) throws IOException
-    {
-        try
-        {
-            // always treat counters like longs, specifically CCT.compose is not what we need
-            if (type != null && type.equals("org.apache.cassandra.db.marshal.CounterColumnType"))
-                return LongType.instance;
-            return TypeParser.parse(type);
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IOException(e);
-        }
-        catch (SyntaxException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    /** convert a column to a tuple */
-    protected Tuple columnToTuple(ColumnFamilyRecordReader.Column column, CfDef cfDef, AbstractType comparator) throws IOException
-    {
-        Tuple pair = TupleFactory.getInstance().newTuple(2);
-
-        // name
-        if(comparator instanceof AbstractCompositeType)
-            StorageHelper.setTupleValue(pair, 0, composeComposite((AbstractCompositeType) comparator, column.name));
-        else
-            StorageHelper.setTupleValue(pair, 0, StorageHelper.cassandraToObj(comparator, column.name, nativeProtocolVersion));
-
-        // value
-        Map<ByteBuffer,AbstractType> validators = getValidatorMap(cfDef);
-        if (validators.get(column.name) == null)
-        {
-            Map<MarshallerType, AbstractType> marshallers = getDefaultMarshallers(cfDef);
-            StorageHelper.setTupleValue(pair, 1, StorageHelper.cassandraToObj(marshallers.get(MarshallerType.DEFAULT_VALIDATOR), column.value, nativeProtocolVersion));
-        }
-        else
-            StorageHelper.setTupleValue(pair, 1, StorageHelper.cassandraToObj(validators.get(column.name), column.value, nativeProtocolVersion));
-        return pair;
-    }
-
-    /** construct a map to store the mashaller type to cassandra data type mapping */
-    protected Map<MarshallerType, AbstractType> getDefaultMarshallers(CfDef cfDef) throws IOException
-    {
-        Map<MarshallerType, AbstractType> marshallers = new EnumMap<MarshallerType, AbstractType>(MarshallerType.class);
-        AbstractType comparator;
-        AbstractType subcomparator;
-        AbstractType default_validator;
-        AbstractType key_validator;
-
-        comparator = parseType(cfDef.getComparator_type());
-        subcomparator = parseType(cfDef.getSubcomparator_type());
-        default_validator = parseType(cfDef.getDefault_validation_class());
-        key_validator = parseType(cfDef.getKey_validation_class());
-
-        marshallers.put(MarshallerType.COMPARATOR, comparator);
-        marshallers.put(MarshallerType.DEFAULT_VALIDATOR, default_validator);
-        marshallers.put(MarshallerType.KEY_VALIDATOR, key_validator);
-        marshallers.put(MarshallerType.SUBCOMPARATOR, subcomparator);
-        return marshallers;
-    }
-
-    /** get the validators */
-    protected Map<ByteBuffer, AbstractType> getValidatorMap(CfDef cfDef) throws IOException
-    {
-        Map<ByteBuffer, AbstractType> validators = new HashMap<ByteBuffer, AbstractType>();
-        for (ColumnDef cd : cfDef.getColumn_metadata())
-        {
-            if (cd.getValidation_class() != null && !cd.getValidation_class().isEmpty())
-            {
-                AbstractType validator = null;
-                try
-                {
-                    validator = TypeParser.parse(cd.getValidation_class());
-                    if (validator instanceof CounterColumnType)
-                        validator = LongType.instance;
-                    validators.put(cd.name, validator);
-                }
-                catch (ConfigurationException e)
-                {
-                    throw new IOException(e);
-                }
-                catch (SyntaxException e)
-                {
-                    throw new IOException(e);
-                }
-            }
-        }
-        return validators;
-    }
-}

diff --git a/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java b/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java
deleted file mode 100644
index 8831cf2..0000000
--- a/src/java/org/apache/cassandra/hadoop/pig/CqlNativeStorage.java
+++ /dev/null

@@ -1,1084 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.hadoop.pig;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import com.datastax.driver.core.Cluster;
-import com.datastax.driver.core.ColumnMetadata;
-import com.datastax.driver.core.Metadata;
-import com.datastax.driver.core.Row;
-import com.datastax.driver.core.Session;
-import com.datastax.driver.core.TableMetadata;
-import com.datastax.driver.core.exceptions.NoHostAvailableException;
-
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.AuthenticationException;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.hadoop.ConfigHelper;
-import org.apache.cassandra.hadoop.HadoopCompat;
-import org.apache.cassandra.hadoop.cql3.CqlBulkOutputFormat;
-import org.apache.cassandra.hadoop.cql3.CqlBulkRecordWriter;
-import org.apache.cassandra.hadoop.cql3.CqlConfigHelper;
-import org.apache.cassandra.hadoop.cql3.CqlRecordReader;
-import org.apache.cassandra.serializers.CollectionSerializer;
-import org.apache.cassandra.utils.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.*;
-import org.apache.pig.*;
-import org.apache.pig.Expression.OpType;
-import org.apache.pig.ResourceSchema.ResourceFieldSchema;
-import org.apache.pig.backend.executionengine.ExecException;
-import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
-import org.apache.pig.data.*;
-import org.apache.pig.impl.util.UDFContext;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.yaml.snakeyaml.external.biz.base64Coder.Base64Coder;
-
-@Deprecated
-public class CqlNativeStorage extends LoadFunc implements StoreFuncInterface, LoadMetadata
-{
-    protected String DEFAULT_INPUT_FORMAT;
-    protected String DEFAULT_OUTPUT_FORMAT;
-
-    protected String username;
-    protected String password;
-    protected String keyspace;
-    protected String column_family;
-    protected String loadSignature;
-    protected String storeSignature;
-
-    protected Configuration conf;
-    protected String inputFormatClass;
-    protected String outputFormatClass;
-    protected int splitSize = 64 * 1024;
-    protected String partitionerClass;
-    protected boolean usePartitionFilter = false;
-    protected String initHostAddress;
-    protected String rpcPort;
-    protected int nativeProtocolVersion = 1;
-
-    private static final Logger logger = LoggerFactory.getLogger(CqlNativeStorage.class);
-    private static String BULK_OUTPUT_FORMAT = "org.apache.cassandra.hadoop.cql3.CqlBulkOutputFormat";
-    private int pageSize = 1000;
-    private String columns;
-    private String outputQuery;
-    private String whereClause;
-
-    private RecordReader<Long, Row> reader;
-    private RecordWriter<Map<String, ByteBuffer>, List<ByteBuffer>> writer;
-    private String nativePort;
-    private String nativeCoreConnections;
-    private String nativeMaxConnections;
-    private String nativeMaxSimultReqs;
-    private String nativeConnectionTimeout;
-    private String nativeReadConnectionTimeout;
-    private String nativeReceiveBufferSize;
-    private String nativeSendBufferSize;
-    private String nativeSolinger;
-    private String nativeTcpNodelay;
-    private String nativeReuseAddress;
-    private String nativeKeepAlive;
-    private String nativeAuthProvider;
-    private String nativeSSLTruststorePath;
-    private String nativeSSLKeystorePath;
-    private String nativeSSLTruststorePassword;
-    private String nativeSSLKeystorePassword;
-    private String nativeSSLCipherSuites;
-    private String inputCql;
-
-    private boolean bulkOutputFormat = false;
-    private String bulkCfSchema;
-    private String bulkInsertStatement;
-    private String bulkOutputLocation;
-    private int bulkBuffSize = -1;
-    private int bulkStreamThrottle = -1;
-    private int bulkMaxFailedHosts = -1;
-    private boolean bulkDeleteSourceOnSuccess = true;
-    private String bulkTableAlias;
-
-    public CqlNativeStorage()
-    {
-        this(1000);
-    }
-
-    /** @param pageSize limit number of CQL rows to fetch in a thrift request */
-    public CqlNativeStorage(int pageSize)
-    {
-        super();
-        this.pageSize = pageSize;
-        DEFAULT_INPUT_FORMAT = "org.apache.cassandra.hadoop.cql3.CqlInputFormat";
-        DEFAULT_OUTPUT_FORMAT = "org.apache.cassandra.hadoop.cql3.CqlOutputFormat";
-    }
-
-    public void prepareToRead(RecordReader reader, PigSplit split)
-    {
-        this.reader = reader;
-        if (reader instanceof CqlRecordReader) {
-            nativeProtocolVersion = ((CqlRecordReader) reader).getNativeProtocolVersion();
-        }
-    }
-
-    public void prepareToWrite(RecordWriter writer)
-    {
-        this.writer = writer;
-    }
-
-    /** get next row */
-    public Tuple getNext() throws IOException
-    {
-        try
-        {
-            // load the next pair
-            if (!reader.nextKeyValue())
-                return null;
-
-            TableInfo tableMetadata = getCfInfo(loadSignature);
-            Row row = reader.getCurrentValue();
-            Tuple tuple = TupleFactory.getInstance().newTuple(tableMetadata.getColumns().size());
-            Iterator<ColumnInfo> itera = tableMetadata.getColumns().iterator();
-            int i = 0;
-            while (itera.hasNext())
-            {
-                ColumnInfo cdef = itera.next();
-                ByteBuffer columnValue = row.getBytesUnsafe(cdef.getName());
-                if (columnValue != null)
-                {
-                    AbstractType<?> validator = getValidatorMap(tableMetadata).get(ByteBufferUtil.bytes(cdef.getName()));
-                    setTupleValue(tuple, i, cqlColumnToObj(ByteBufferUtil.bytes(cdef.getName()), columnValue,
-                                                           tableMetadata), validator);
-                }
-                else
-                    tuple.set(i, null);
-                i++;
-            }
-            return tuple;
-        }
-        catch (InterruptedException e)
-        {
-            throw new IOException(e.getMessage());
-        }
-    }
-
-    /** convert a cql column to an object */
-    private Object cqlColumnToObj(ByteBuffer name, ByteBuffer columnValue, TableInfo cfDef) throws IOException
-    {
-        // standard
-        Map<ByteBuffer,AbstractType> validators = getValidatorMap(cfDef);
-        return StorageHelper.cassandraToObj(validators.get(name), columnValue, nativeProtocolVersion);
-    }
-
-    /** set the value to the position of the tuple */
-    private void setTupleValue(Tuple tuple, int position, Object value, AbstractType<?> validator) throws ExecException
-    {
-        if (validator instanceof CollectionType)
-            setCollectionTupleValues(tuple, position, value, validator);
-        else
-           StorageHelper.setTupleValue(tuple, position, value);
-    }
-
-    /** set the values of set/list at and after the position of the tuple */
-    private void setCollectionTupleValues(Tuple tuple, int position, Object value, AbstractType<?> validator) throws ExecException
-    {
-        if (validator instanceof MapType)
-        {
-            setMapTupleValues(tuple, position, value, validator);
-            return;
-        }
-        AbstractType elementValidator;
-        if (validator instanceof SetType)
-            elementValidator = ((SetType<?>) validator).getElementsType();
-        else if (validator instanceof ListType)
-            elementValidator = ((ListType<?>) validator).getElementsType();
-        else
-            return;
-
-        int i = 0;
-        Tuple innerTuple = TupleFactory.getInstance().newTuple(((Collection<?>) value).size());
-        for (Object entry : (Collection<?>) value)
-        {
-            setTupleValue(innerTuple, i, cassandraToPigData(entry, elementValidator), elementValidator);
-            i++;
-        }
-        tuple.set(position, innerTuple);
-    }
-
-    /** set the values of set/list at and after the position of the tuple */
-    private void setMapTupleValues(Tuple tuple, int position, Object value, AbstractType<?> validator) throws ExecException
-    {
-        AbstractType<?> keyValidator = ((MapType<?, ?>) validator).getKeysType();
-        AbstractType<?> valueValidator = ((MapType<?, ?>) validator).getValuesType();
-
-        int i = 0;
-        Tuple innerTuple = TupleFactory.getInstance().newTuple(((Map<?,?>) value).size());
-        for(Map.Entry<?,?> entry :  ((Map<Object, Object>)value).entrySet())
-        {
-            Tuple mapEntryTuple = TupleFactory.getInstance().newTuple(2);
-            setTupleValue(mapEntryTuple, 0, cassandraToPigData(entry.getKey(), keyValidator), keyValidator);
-            setTupleValue(mapEntryTuple, 1, cassandraToPigData(entry.getValue(), valueValidator), valueValidator);
-            innerTuple.set(i, mapEntryTuple);
-            i++;
-        }
-        tuple.set(position, innerTuple);
-    }
-
-    private Object cassandraToPigData(Object obj, AbstractType validator)
-    {
-        if (validator instanceof DecimalType || validator instanceof InetAddressType)
-            return validator.getString(validator.decompose(obj));
-        return obj;
-    }
-
-    /** get the columnfamily definition for the signature */
-    protected TableInfo getCfInfo(String signature) throws IOException
-    {
-        UDFContext context = UDFContext.getUDFContext();
-        Properties property = context.getUDFProperties(CqlNativeStorage.class);
-        TableInfo cfInfo;
-        try
-        {
-            cfInfo = cfdefFromString(property.getProperty(signature));
-        }
-        catch (ClassNotFoundException e)
-        {
-            throw new IOException(e);
-        }
-        return cfInfo;
-    }
-
-    /** return the CfInfo for the column family */
-    protected TableMetadata getCfInfo(Session client)
-            throws NoHostAvailableException,
-            AuthenticationException,
-            IllegalStateException
-    {
-        // get CF meta data
-        return client.getCluster().getMetadata().getKeyspace(Metadata.quote(keyspace)).getTable(Metadata.quote(column_family));
-    }
-
-    /** convert key tuple to key map */
-    private Map<String, ByteBuffer> tupleToKeyMap(Tuple t) throws IOException
-    {
-        Map<String, ByteBuffer> keys = new HashMap<String, ByteBuffer>();
-        for (int i = 0; i < t.size(); i++)
-        {
-            if (t.getType(i) != DataType.TUPLE)
-                throw new IOException("keys was not a tuple");
-            Tuple inner = (Tuple) t.get(i);
-            if (inner.size() != 2)
-                throw new IOException("Keys were not in name and value pairs");
-            Object name = inner.get(0);
-            if (name == null)
-                throw new IOException("Key name was empty");
-            keys.put(name.toString(), objToBB(inner.get(1)));
-        }
-        return keys;
-    }
-
-    /** convert object to ByteBuffer */
-    protected ByteBuffer objToBB(Object o)
-    {
-        if (o == null)
-            return nullToBB();
-        if (o instanceof java.lang.String)
-            return ByteBuffer.wrap(new DataByteArray((String)o).get());
-        if (o instanceof Integer)
-            return Int32Type.instance.decompose((Integer)o);
-        if (o instanceof Long)
-            return LongType.instance.decompose((Long)o);
-        if (o instanceof Float)
-            return FloatType.instance.decompose((Float)o);
-        if (o instanceof Double)
-            return DoubleType.instance.decompose((Double)o);
-        if (o instanceof UUID)
-            return ByteBuffer.wrap(UUIDGen.decompose((UUID) o));
-        if(o instanceof Tuple) {
-            List<Object> objects = ((Tuple)o).getAll();
-            //collections
-            if (objects.size() > 0 && objects.get(0) instanceof String)
-            {
-                String collectionType = (String) objects.get(0);
-                if ("set".equalsIgnoreCase(collectionType) ||
-                        "list".equalsIgnoreCase(collectionType))
-                    return objToListOrSetBB(objects.subList(1, objects.size()));
-                else if ("map".equalsIgnoreCase(collectionType))
-                    return objToMapBB(objects.subList(1, objects.size()));
-
-            }
-            return objToCompositeBB(objects);
-        }
-
-        return ByteBuffer.wrap(((DataByteArray) o).get());
-    }
-
-    private ByteBuffer objToListOrSetBB(List<Object> objects)
-    {
-        List<ByteBuffer> serialized = new ArrayList<ByteBuffer>(objects.size());
-        for(Object sub : objects)
-        {
-            ByteBuffer buffer = objToBB(sub);
-            serialized.add(buffer);
-        }
-        return CollectionSerializer.pack(serialized, objects.size(), 3);
-    }
-
-    private ByteBuffer objToMapBB(List<Object> objects)
-    {
-        List<ByteBuffer> serialized = new ArrayList<ByteBuffer>(objects.size() * 2);
-        for(Object sub : objects)
-        {
-            List<Object> keyValue = ((Tuple)sub).getAll();
-            for (Object entry: keyValue)
-            {
-                ByteBuffer buffer = objToBB(entry);
-                serialized.add(buffer);
-            }
-        }
-        return CollectionSerializer.pack(serialized, objects.size(), 3);
-    }
-
-    private ByteBuffer objToCompositeBB(List<Object> objects)
-    {
-        List<ByteBuffer> serialized = new ArrayList<ByteBuffer>(objects.size());
-        int totalLength = 0;
-        for(Object sub : objects)
-        {
-            ByteBuffer buffer = objToBB(sub);
-            serialized.add(buffer);
-            totalLength += 2 + buffer.remaining() + 1;
-        }
-        ByteBuffer out = ByteBuffer.allocate(totalLength);
-        for (ByteBuffer bb : serialized)
-        {
-            int length = bb.remaining();
-            out.put((byte) ((length >> 8) & 0xFF));
-            out.put((byte) (length & 0xFF));
-            out.put(bb);
-            out.put((byte) 0);
-        }
-        out.flip();
-        return out;
-    }
-
-    /** send CQL query request using data from tuple */
-    private void cqlQueryFromTuple(Map<String, ByteBuffer> key, Tuple t, int offset) throws IOException
-    {
-        for (int i = offset; i < t.size(); i++)
-        {
-            if (t.getType(i) != DataType.TUPLE)
-                throw new IOException("Output type was not a tuple");
-
-            Tuple inner = (Tuple) t.get(i);
-            if (inner.size() > 0)
-            {
-                List<ByteBuffer> bindedVariables = bindedVariablesFromTuple(inner);
-                if (bindedVariables.size() <= 0)
-                    throw new IOException("Missing binded variables");
-                sendCqlQuery(key, bindedVariables);
-            }
-        }
-    }
-
-    /** compose a list of binded variables */
-    private List<ByteBuffer> bindedVariablesFromTuple(Tuple t) throws IOException
-    {
-        List<ByteBuffer> variables = new ArrayList<ByteBuffer>();
-        for (int i = 0; i < t.size(); i++)
-            variables.add(objToBB(t.get(i)));
-        return variables;
-    }
-
-    /** writer write the data by executing CQL query */
-    private void sendCqlQuery(Map<String, ByteBuffer> key, List<ByteBuffer> bindedVariables) throws IOException
-    {
-        try
-        {
-            writer.write(key, bindedVariables);
-        }
-        catch (InterruptedException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    /** get the validators */
-    protected Map<ByteBuffer, AbstractType> getValidatorMap(TableInfo cfDef) throws IOException
-    {
-        Map<ByteBuffer, AbstractType> validators = new HashMap<>();
-        for (ColumnInfo cd : cfDef.getColumns())
-        {
-            if (cd.getTypeName() != null)
-            {
-                try
-                {
-                    AbstractType validator = TypeParser.parseCqlName(cd.getTypeName());
-                    if (validator instanceof CounterColumnType)
-                        validator = LongType.instance;
-                    validators.put(ByteBufferUtil.bytes(cd.getName()), validator);
-                }
-                catch (ConfigurationException | SyntaxException e)
-                {
-                    throw new IOException(e);
-                }
-            }
-        }
-        return validators;
-    }
-
-    /** schema: (value, value, value) where keys are in the front. */
-    public ResourceSchema getSchema(String location, Job job) throws IOException
-    {
-        setLocation(location, job);
-        TableInfo cfInfo = getCfInfo(loadSignature);
-        // top-level schema, no type
-        ResourceSchema schema = new ResourceSchema();
-
-        // get default validators
-        Map<ByteBuffer, AbstractType> validators = getValidatorMap(cfInfo);
-
-        // will contain all fields for this schema
-        List<ResourceFieldSchema> allSchemaFields = new ArrayList<ResourceFieldSchema>();
-
-        for (ColumnInfo cdef : cfInfo.getColumns())
-        {
-            ResourceFieldSchema valSchema = new ResourceFieldSchema();
-            AbstractType<?> validator = validators.get(ByteBufferUtil.bytes(cdef.getName()));
-            valSchema.setName(cdef.getName());
-            valSchema.setType(StorageHelper.getPigType(validator));
-            allSchemaFields.add(valSchema);
-        }
-
-        // top level schema contains everything
-        schema.setFields(allSchemaFields.toArray(new ResourceFieldSchema[allSchemaFields.size()]));
-        return schema;
-    }
-
-    public void setPartitionFilter(Expression partitionFilter) throws IOException
-    {
-        UDFContext context = UDFContext.getUDFContext();
-        Properties property = context.getUDFProperties(CqlNativeStorage.class);
-        property.setProperty(StorageHelper.PARTITION_FILTER_SIGNATURE, partitionFilterToWhereClauseString(partitionFilter));
-    }
-
-    /**
-     * Return cql where clauses for the corresponding partition filter. Make sure the data format matches
-     * Only support the following Pig data types: int, long, float, double, boolean and chararray
-     * */
-    private String partitionFilterToWhereClauseString(Expression expression) throws IOException
-    {
-        Expression.BinaryExpression be = (Expression.BinaryExpression) expression;
-        OpType op = expression.getOpType();
-        String opString = op.toString();
-        switch (op)
-        {
-            case OP_EQ:
-                opString = " = ";
-            case OP_GE:
-            case OP_GT:
-            case OP_LE:
-            case OP_LT:
-                String name = be.getLhs().toString();
-                String value = be.getRhs().toString();
-                return String.format("%s %s %s", name, opString, value);
-            case OP_AND:
-                return String.format("%s AND %s", partitionFilterToWhereClauseString(be.getLhs()), partitionFilterToWhereClauseString(be.getRhs()));
-            default:
-                throw new IOException("Unsupported expression type: " + opString);
-        }
-    }
-
-    /** retrieve where clause for partition filter */
-    private String getWhereClauseForPartitionFilter()
-    {
-        UDFContext context = UDFContext.getUDFContext();
-        Properties property = context.getUDFProperties(CqlNativeStorage.class);
-        return property.getProperty(StorageHelper.PARTITION_FILTER_SIGNATURE);
-    }
-
-    /**
-     *  output: (((name, value), (name, value)), (value ... value), (value...value))
-     *  bulk output: ((value ... value), (value...value))
-     *
-     * */
-    public void putNext(Tuple t) throws IOException
-    {
-        if (t.size() < 1)
-        {
-            // simply nothing here, we can't even delete without a key
-            logger.warn("Empty output skipped, filter empty tuples to suppress this warning");
-            return;
-        }
-
-        if (t.getType(0) != DataType.TUPLE)
-            throw new IOException("First argument in output must be a tuple");
-
-        if (!bulkOutputFormat && t.getType(1) != DataType.TUPLE)
-            throw new IOException("Second argument in output must be a tuple");
-
-        if (bulkOutputFormat)
-        {
-            cqlQueryFromTuple(null, t, 0);
-        }
-        else
-        {
-            Map<String, ByteBuffer> key = tupleToKeyMap((Tuple)t.get(0));
-            cqlQueryFromTuple(key, t, 1);
-        }
-    }
-
-    /** set read configuration settings */
-    public void setLocation(String location, Job job) throws IOException
-    {
-        conf = job.getConfiguration();
-        setLocationFromUri(location);
-
-        if (username != null && password != null)
-        {
-            ConfigHelper.setInputKeyspaceUserNameAndPassword(conf, username, password);
-            CqlConfigHelper.setUserNameAndPassword(conf, username, password);
-        }
-        if (splitSize > 0)
-            ConfigHelper.setInputSplitSize(conf, splitSize);
-        if (partitionerClass!= null)
-            ConfigHelper.setInputPartitioner(conf, partitionerClass);
-        if (initHostAddress != null)
-            ConfigHelper.setInputInitialAddress(conf, initHostAddress);
-        if (rpcPort != null)
-            ConfigHelper.setInputRpcPort(conf, rpcPort);
-        if (nativePort != null)
-            CqlConfigHelper.setInputNativePort(conf, nativePort);
-        if (nativeCoreConnections != null)
-            CqlConfigHelper.setInputCoreConnections(conf, nativeCoreConnections);
-        if (nativeMaxConnections != null)
-            CqlConfigHelper.setInputMaxConnections(conf, nativeMaxConnections);
-        if (nativeMaxSimultReqs != null)
-            CqlConfigHelper.setInputMaxSimultReqPerConnections(conf, nativeMaxSimultReqs);
-        if (nativeConnectionTimeout != null)
-            CqlConfigHelper.setInputNativeConnectionTimeout(conf, nativeConnectionTimeout);
-        if (nativeReadConnectionTimeout != null)
-            CqlConfigHelper.setInputNativeReadConnectionTimeout(conf, nativeReadConnectionTimeout);
-        if (nativeReceiveBufferSize != null)
-            CqlConfigHelper.setInputNativeReceiveBufferSize(conf, nativeReceiveBufferSize);
-        if (nativeSendBufferSize != null)
-            CqlConfigHelper.setInputNativeSendBufferSize(conf, nativeSendBufferSize);
-        if (nativeSolinger != null)
-            CqlConfigHelper.setInputNativeSolinger(conf, nativeSolinger);
-        if (nativeTcpNodelay != null)
-            CqlConfigHelper.setInputNativeTcpNodelay(conf, nativeTcpNodelay);
-        if (nativeReuseAddress != null)
-            CqlConfigHelper.setInputNativeReuseAddress(conf, nativeReuseAddress);
-        if (nativeKeepAlive != null)
-            CqlConfigHelper.setInputNativeKeepAlive(conf, nativeKeepAlive);
-        if (nativeAuthProvider != null)
-            CqlConfigHelper.setInputNativeAuthProvider(conf, nativeAuthProvider);
-        if (nativeSSLTruststorePath != null)
-            CqlConfigHelper.setInputNativeSSLTruststorePath(conf, nativeSSLTruststorePath);
-        if (nativeSSLKeystorePath != null)
-            CqlConfigHelper.setInputNativeSSLKeystorePath(conf, nativeSSLKeystorePath);
-        if (nativeSSLTruststorePassword != null)
-            CqlConfigHelper.setInputNativeSSLTruststorePassword(conf, nativeSSLTruststorePassword);
-        if (nativeSSLKeystorePassword != null)
-            CqlConfigHelper.setInputNativeSSLKeystorePassword(conf, nativeSSLKeystorePassword);
-        if (nativeSSLCipherSuites != null)
-            CqlConfigHelper.setInputNativeSSLCipherSuites(conf, nativeSSLCipherSuites);
-
-        ConfigHelper.setInputColumnFamily(conf, keyspace, column_family);
-        setConnectionInformation();
-
-        CqlConfigHelper.setInputCQLPageRowSize(conf, String.valueOf(pageSize));
-        if (inputCql != null)
-            CqlConfigHelper.setInputCql(conf, inputCql);
-        if (columns != null)
-            CqlConfigHelper.setInputColumns(conf, columns);
-        if (whereClause != null)
-            CqlConfigHelper.setInputWhereClauses(conf, whereClause);
-
-        String whereClauseForPartitionFilter = getWhereClauseForPartitionFilter();
-        String wc = whereClause != null && !whereClause.trim().isEmpty()
-                               ? whereClauseForPartitionFilter == null ? whereClause: String.format("%s AND %s", whereClause.trim(), whereClauseForPartitionFilter)
-                               : whereClauseForPartitionFilter;
-
-        if (wc != null)
-        {
-            logger.trace("where clause: {}", wc);
-            CqlConfigHelper.setInputWhereClauses(conf, wc);
-        }
-        if (System.getenv(StorageHelper.PIG_INPUT_SPLIT_SIZE) != null)
-        {
-            try
-            {
-                ConfigHelper.setInputSplitSize(conf, Integer.parseInt(System.getenv(StorageHelper.PIG_INPUT_SPLIT_SIZE)));
-            }
-            catch (NumberFormatException e)
-            {
-                throw new IOException("PIG_INPUT_SPLIT_SIZE is not a number", e);
-            }
-        }
-
-        if (ConfigHelper.getInputInitialAddress(conf) == null)
-            throw new IOException("PIG_INPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
-        if (ConfigHelper.getInputPartitioner(conf) == null)
-            throw new IOException("PIG_INPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");
-        if (loadSignature == null)
-            loadSignature = location;
-
-        initSchema(loadSignature);
-    }
-
-    /** set store configuration settings */
-    public void setStoreLocation(String location, Job job) throws IOException
-    {
-        conf = HadoopCompat.getConfiguration(job);
-        setLocationFromUri(location);
-
-        if (username != null && password != null)
-            ConfigHelper.setOutputKeyspaceUserNameAndPassword(conf, username, password);
-        if (splitSize > 0)
-            ConfigHelper.setInputSplitSize(conf, splitSize);
-        if (partitionerClass!= null)
-            ConfigHelper.setOutputPartitioner(conf, partitionerClass);
-        if (rpcPort != null)
-        {
-            ConfigHelper.setOutputRpcPort(conf, rpcPort);
-            ConfigHelper.setInputRpcPort(conf, rpcPort);
-        }
-        if (initHostAddress != null)
-        {
-            ConfigHelper.setOutputInitialAddress(conf, initHostAddress);
-            ConfigHelper.setInputInitialAddress(conf, initHostAddress);
-        }
-
-        ConfigHelper.setOutputColumnFamily(conf, keyspace, column_family);
-        CqlConfigHelper.setOutputCql(conf, outputQuery);
-
-        if (bulkOutputFormat)
-        {
-            DEFAULT_OUTPUT_FORMAT = BULK_OUTPUT_FORMAT;
-            if (bulkCfSchema != null)
-                CqlBulkOutputFormat.setTableSchema(conf, column_family, bulkCfSchema);
-            else
-                throw new IOException("bulk_cf_schema is missing in input url parameter");
-            if (bulkInsertStatement != null)
-                CqlBulkOutputFormat.setTableInsertStatement(conf, column_family, bulkInsertStatement);
-            else
-                throw new IOException("bulk_insert_statement is missing in input url parameter");
-            if (bulkTableAlias != null)
-                CqlBulkOutputFormat.setTableAlias(conf, bulkTableAlias, column_family);
-            CqlBulkOutputFormat.setDeleteSourceOnSuccess(conf, bulkDeleteSourceOnSuccess);
-            if (bulkOutputLocation != null)
-                conf.set(CqlBulkRecordWriter.OUTPUT_LOCATION, bulkOutputLocation);
-            if (bulkBuffSize > 0)
-                conf.set(CqlBulkRecordWriter.BUFFER_SIZE_IN_MB, String.valueOf(bulkBuffSize));
-            if (bulkStreamThrottle > 0)
-                conf.set(CqlBulkRecordWriter.STREAM_THROTTLE_MBITS, String.valueOf(bulkStreamThrottle));
-            if (bulkMaxFailedHosts > 0)
-                conf.set(CqlBulkRecordWriter.MAX_FAILED_HOSTS, String.valueOf(bulkMaxFailedHosts));
-            if (partitionerClass!= null)
-                ConfigHelper.setInputPartitioner(conf, partitionerClass);
-        }
-
-        setConnectionInformation();
-
-        if (ConfigHelper.getOutputRpcPort(conf) == 0)
-            throw new IOException("PIG_OUTPUT_RPC_PORT or PIG_RPC_PORT environment variable not set");
-        if (ConfigHelper.getOutputInitialAddress(conf) == null)
-            throw new IOException("PIG_OUTPUT_INITIAL_ADDRESS or PIG_INITIAL_ADDRESS environment variable not set");
-        if (ConfigHelper.getOutputPartitioner(conf) == null)
-            throw new IOException("PIG_OUTPUT_PARTITIONER or PIG_PARTITIONER environment variable not set");
-
-        initSchema(storeSignature);
-    }
-
-    /** Methods to get the column family schema from Cassandra */
-    protected void initSchema(String signature) throws IOException
-    {
-        Properties properties = UDFContext.getUDFContext().getUDFProperties(CqlNativeStorage.class);
-
-        // Only get the schema if we haven't already gotten it
-        if (!properties.containsKey(signature))
-        {
-            try (Cluster cluster = CqlConfigHelper.getInputCluster(ConfigHelper.getInputInitialAddress(conf), conf);
-                 Session client = cluster.connect())
-            {
-                client.execute("USE " + keyspace);
-
-                // compose the CfDef for the columfamily
-                TableMetadata cfInfo = getCfInfo(client);
-
-                if (cfInfo != null)
-                {
-                    properties.setProperty(signature, cfdefToString(cfInfo));
-                }
-                else
-                    throw new IOException(String.format("Table '%s' not found in keyspace '%s'",
-                            column_family,
-                            keyspace));
-            }
-            catch (Exception e)
-            {
-                throw new IOException(e);
-            }
-        }
-    }
-
-
-    /** convert CfDef to string */
-    protected static String cfdefToString(TableMetadata cfDef) throws IOException
-    {
-        TableInfo tableInfo = new TableInfo(cfDef);
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        try (ObjectOutputStream oos = new ObjectOutputStream( baos ))
-        {
-            oos.writeObject(tableInfo);
-        }
-
-        return new String( Base64Coder.encode(baos.toByteArray()) );
-    }
-
-    /** convert string back to CfDef */
-    protected static TableInfo cfdefFromString(String st) throws IOException, ClassNotFoundException
-    {
-        byte [] data = Base64Coder.decode( st );
-        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(data)))
-        {
-            Object o = ois.readObject();
-            return (TableInfo)o;
-        }
-    }
-
-    /** decompose the query to store the parameters in a map */
-    public static Map<String, String> getQueryMap(String query) throws UnsupportedEncodingException
-    {
-        String[] params = query.split("&");
-        Map<String, String> map = new HashMap<String, String>(params.length);
-        for (String param : params)
-        {
-            String[] keyValue = param.split("=");
-            map.put(keyValue[0], URLDecoder.decode(keyValue[1], "UTF-8"));
-        }
-        return map;
-    }
-
-    private void setLocationFromUri(String location) throws IOException
-    {
-        try
-        {
-            if (!location.startsWith("cql://"))
-                throw new Exception("Bad scheme: " + location);
-
-            String[] urlParts = location.split("\\?");
-            if (urlParts.length > 1)
-            {
-                Map<String, String> urlQuery = getQueryMap(urlParts[1]);
-
-                // each page row size
-                if (urlQuery.containsKey("page_size"))
-                    pageSize = Integer.parseInt(urlQuery.get("page_size"));
-
-                // output prepared statement
-                if (urlQuery.containsKey("output_query"))
-                    outputQuery = urlQuery.get("output_query");
-
-                if (urlQuery.containsKey("bulk_output_format"))
-                    bulkOutputFormat = Boolean.valueOf(urlQuery.get("bulk_output_format"));
-                if (urlQuery.containsKey("bulk_cf_schema"))
-                    bulkCfSchema = urlQuery.get("bulk_cf_schema");
-                if (urlQuery.containsKey("bulk_insert_statement"))
-                    bulkInsertStatement = urlQuery.get("bulk_insert_statement");
-                if (urlQuery.containsKey("bulk_output_location"))
-                    bulkOutputLocation = urlQuery.get("bulk_output_location");
-                if (urlQuery.containsKey("bulk_buff_size"))
-                    bulkBuffSize = Integer.valueOf(urlQuery.get("bulk_buff_size"));
-                if (urlQuery.containsKey("bulk_stream_throttle"))
-                    bulkStreamThrottle = Integer.valueOf(urlQuery.get("bulk_stream_throttle"));
-                if (urlQuery.containsKey("bulk_max_failed_hosts"))
-                    bulkMaxFailedHosts = Integer.valueOf(urlQuery.get("bulk_max_failed_hosts"));
-                if (urlQuery.containsKey("bulk_delete_source"))
-                    bulkDeleteSourceOnSuccess = Boolean.parseBoolean(urlQuery.get("bulk_delete_source"));
-                if (urlQuery.containsKey("bulk_table_alias"))
-                    bulkTableAlias = urlQuery.get("bulk_table_alias");
-
-                //split size
-                if (urlQuery.containsKey("split_size"))
-                    splitSize = Integer.parseInt(urlQuery.get("split_size"));
-                if (urlQuery.containsKey("partitioner"))
-                    partitionerClass = urlQuery.get("partitioner");
-                if (urlQuery.containsKey("use_secondary"))
-                    usePartitionFilter = Boolean.parseBoolean(urlQuery.get("use_secondary"));
-                if (urlQuery.containsKey("init_address"))
-                    initHostAddress = urlQuery.get("init_address");
-
-                if (urlQuery.containsKey("native_port"))
-                    nativePort = urlQuery.get("native_port");
-                if (urlQuery.containsKey("core_conns"))
-                    nativeCoreConnections = urlQuery.get("core_conns");
-                if (urlQuery.containsKey("max_conns"))
-                    nativeMaxConnections = urlQuery.get("max_conns");
-                if (urlQuery.containsKey("max_simult_reqs"))
-                    nativeMaxSimultReqs = urlQuery.get("max_simult_reqs");
-                if (urlQuery.containsKey("native_timeout"))
-                    nativeConnectionTimeout = urlQuery.get("native_timeout");
-                if (urlQuery.containsKey("native_read_timeout"))
-                    nativeReadConnectionTimeout = urlQuery.get("native_read_timeout");
-                if (urlQuery.containsKey("rec_buff_size"))
-                    nativeReceiveBufferSize = urlQuery.get("rec_buff_size");
-                if (urlQuery.containsKey("send_buff_size"))
-                    nativeSendBufferSize = urlQuery.get("send_buff_size");
-                if (urlQuery.containsKey("solinger"))
-                    nativeSolinger = urlQuery.get("solinger");
-                if (urlQuery.containsKey("tcp_nodelay"))
-                    nativeTcpNodelay = urlQuery.get("tcp_nodelay");
-                if (urlQuery.containsKey("reuse_address"))
-                    nativeReuseAddress = urlQuery.get("reuse_address");
-                if (urlQuery.containsKey("keep_alive"))
-                    nativeKeepAlive = urlQuery.get("keep_alive");
-                if (urlQuery.containsKey("auth_provider"))
-                    nativeAuthProvider = urlQuery.get("auth_provider");
-                if (urlQuery.containsKey("trust_store_path"))
-                    nativeSSLTruststorePath = urlQuery.get("trust_store_path");
-                if (urlQuery.containsKey("key_store_path"))
-                    nativeSSLKeystorePath = urlQuery.get("key_store_path");
-                if (urlQuery.containsKey("trust_store_password"))
-                    nativeSSLTruststorePassword = urlQuery.get("trust_store_password");
-                if (urlQuery.containsKey("key_store_password"))
-                    nativeSSLKeystorePassword = urlQuery.get("key_store_password");
-                if (urlQuery.containsKey("cipher_suites"))
-                    nativeSSLCipherSuites = urlQuery.get("cipher_suites");
-                if (urlQuery.containsKey("input_cql"))
-                    inputCql = urlQuery.get("input_cql");
-                if (urlQuery.containsKey("columns"))
-                    columns = urlQuery.get("columns");
-                if (urlQuery.containsKey("where_clause"))
-                    whereClause = urlQuery.get("where_clause");
-                if (urlQuery.containsKey("rpc_port"))
-                    rpcPort = urlQuery.get("rpc_port");
-            }
-            String[] parts = urlParts[0].split("/+");
-            String[] credentialsAndKeyspace = parts[1].split("@");
-            if (credentialsAndKeyspace.length > 1)
-            {
-                String[] credentials = credentialsAndKeyspace[0].split(":");
-                username = credentials[0];
-                password = credentials[1];
-                keyspace = credentialsAndKeyspace[1];
-            }
-            else
-            {
-                keyspace = parts[1];
-            }
-            column_family = parts[2];
-        }
-        catch (Exception e)
-        {
-            throw new IOException("Expected 'cql://[username:password@]<keyspace>/<columnfamily>" +
-                    "[?[page_size=<size>][&columns=<col1,col2>][&output_query=<prepared_statement>]" +
-                    "[&where_clause=<clause>][&split_size=<size>][&partitioner=<partitioner>][&use_secondary=true|false]" +
-                    "[&init_address=<host>][&native_port=<native_port>][&core_conns=<core_conns>]" +
-                    "[&max_conns=<max_conns>][&min_simult_reqs=<min_simult_reqs>][&max_simult_reqs=<max_simult_reqs>]" +
-                    "[&native_timeout=<native_timeout>][&native_read_timeout=<native_read_timeout>][&rec_buff_size=<rec_buff_size>]" +
-                    "[&send_buff_size=<send_buff_size>][&solinger=<solinger>][&tcp_nodelay=<tcp_nodelay>][&reuse_address=<reuse_address>]" +
-                    "[&keep_alive=<keep_alive>][&auth_provider=<auth_provider>][&trust_store_path=<trust_store_path>]" +
-                    "[&key_store_path=<key_store_path>][&trust_store_password=<trust_store_password>]" +
-                    "[&key_store_password=<key_store_password>][&cipher_suites=<cipher_suites>][&input_cql=<input_cql>]" +
-                    "[columns=<columns>][where_clause=<where_clause>]" +
-                    "[&bulk_cf_schema=bulk_cf_schema][&bulk_insert_statement=bulk_insert_statement][&bulk_table_alias=<bulk_table_alias>]" +
-                    "[&bulk_output_location=<bulk_output_location>][&bulk_buff_size=<bulk_buff_size>][&bulk_delete_source=<bulk_delete_source>]" +
-                    "[&bulk_stream_throttle=<bulk_stream_throttle>][&bulk_max_failed_hosts=<bulk_max_failed_hosts>]]': " +  e.getMessage());
-         }
-    }
-
-    public ByteBuffer nullToBB()
-    {
-        return ByteBuffer.wrap(new byte[0]);
-    }
-
-    /** output format */
-    public OutputFormat getOutputFormat() throws IOException
-    {
-        try
-        {
-            return FBUtilities.construct(outputFormatClass, "outputformat");
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    public void cleanupOnFailure(String failure, Job job)
-    {
-    }
-
-    public void cleanupOnSuccess(String location, Job job) throws IOException {
-    }
-
-    /** return partition keys */
-    public String[] getPartitionKeys(String location, Job job) throws IOException
-    {
-        if (!usePartitionFilter)
-            return null;
-        TableInfo tableMetadata = getCfInfo(loadSignature);
-        String[] partitionKeys = new String[tableMetadata.getPartitionKey().size()];
-        for (int i = 0; i < tableMetadata.getPartitionKey().size(); i++)
-        {
-            partitionKeys[i] = tableMetadata.getPartitionKey().get(i).getName();
-        }
-        return partitionKeys;
-    }
-
-    public void checkSchema(ResourceSchema schema) throws IOException
-    {
-        // we don't care about types, they all get casted to ByteBuffers
-    }
-
-    public ResourceStatistics getStatistics(String location, Job job)
-    {
-        return null;
-    }
-
-    @Override
-    public InputFormat getInputFormat() throws IOException
-    {
-        try
-        {
-            return FBUtilities.construct(inputFormatClass, "inputformat");
-        }
-        catch (ConfigurationException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException
-    {
-        return relativeToAbsolutePath(location, curDir);
-    }
-
-    @Override
-    public String relativeToAbsolutePath(String location, Path curDir) throws IOException
-    {
-        return location;
-    }
-
-    @Override
-    public void setUDFContextSignature(String signature)
-    {
-        this.loadSignature = signature;
-    }
-
-    /** StoreFunc methods */
-    public void setStoreFuncUDFContextSignature(String signature)
-    {
-        this.storeSignature = signature;
-    }
-
-    /** set hadoop cassandra connection settings */
-    protected void setConnectionInformation() throws IOException
-    {
-        StorageHelper.setConnectionInformation(conf);
-        if (System.getenv(StorageHelper.PIG_INPUT_FORMAT) != null)
-            inputFormatClass = getFullyQualifiedClassName(System.getenv(StorageHelper.PIG_INPUT_FORMAT));
-        else
-            inputFormatClass = DEFAULT_INPUT_FORMAT;
-        if (System.getenv(StorageHelper.PIG_OUTPUT_FORMAT) != null)
-            outputFormatClass = getFullyQualifiedClassName(System.getenv(StorageHelper.PIG_OUTPUT_FORMAT));
-        else
-            outputFormatClass = DEFAULT_OUTPUT_FORMAT;
-    }
-
-    /** get the full class name */
-    protected String getFullyQualifiedClassName(String classname)
-    {
-        return classname.contains(".") ? classname : "org.apache.cassandra.hadoop." + classname;
-    }
-}
-
-class TableInfo implements Serializable
-{
-    private final List<ColumnInfo> columns;
-    private final List<ColumnInfo> partitionKey;
-    private final String name;
-
-    public TableInfo(TableMetadata tableMetadata)
-    {
-        List<ColumnMetadata> cmColumns = tableMetadata.getColumns();
-        columns = new ArrayList<>(cmColumns.size());
-        for (ColumnMetadata cm : cmColumns)
-        {
-            columns.add(new ColumnInfo(this, cm));
-        }
-        List<ColumnMetadata> cmPartitionKey = tableMetadata.getPartitionKey();
-        partitionKey = new ArrayList<>(cmPartitionKey.size());
-        for (ColumnMetadata cm : cmPartitionKey)
-        {
-            partitionKey.add(new ColumnInfo(this, cm));
-        }
-        name = tableMetadata.getName();
-    }
-
-    public List<ColumnInfo> getPartitionKey()
-    {
-        return partitionKey;
-    }
-
-    public List<ColumnInfo> getColumns()
-    {
-        return columns;
-    }
-
-    public String getName()
-    {
-        return name;
-    }
-}
-
-class ColumnInfo implements Serializable
-{
-    private final TableInfo table;
-    private final String name;
-    private final String typeName;
-
-    public ColumnInfo(TableInfo tableInfo, ColumnMetadata columnMetadata)
-    {
-        table = tableInfo;
-        name = columnMetadata.getName();
-        typeName = columnMetadata.getType().toString();
-    }
-
-    public String getName()
-    {
-        return name;
-    }
-
-    public String getTypeName()
-    {
-        return typeName;
-    }
-}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/hadoop/pig/StorageHelper.java b/src/java/org/apache/cassandra/hadoop/pig/StorageHelper.java
deleted file mode 100644
index 74f734e..0000000
--- a/src/java/org/apache/cassandra/hadoop/pig/StorageHelper.java
+++ /dev/null

@@ -1,142 +0,0 @@
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-package org.apache.cassandra.hadoop.pig;
-
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.util.Date;
-import java.util.UUID;
-
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.hadoop.ConfigHelper;
-import org.apache.cassandra.serializers.CollectionSerializer;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.UUIDGen;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.pig.backend.executionengine.ExecException;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.Tuple;
-
-@Deprecated
-public class StorageHelper
-{
-    // system environment variables that can be set to configure connection info:
-    // alternatively, Hadoop JobConf variables can be set using keys from ConfigHelper
-    public final static String PIG_INPUT_RPC_PORT = "PIG_INPUT_RPC_PORT";
-    public final static String PIG_INPUT_INITIAL_ADDRESS = "PIG_INPUT_INITIAL_ADDRESS";
-    public final static String PIG_INPUT_PARTITIONER = "PIG_INPUT_PARTITIONER";
-    public final static String PIG_OUTPUT_RPC_PORT = "PIG_OUTPUT_RPC_PORT";
-    public final static String PIG_OUTPUT_INITIAL_ADDRESS = "PIG_OUTPUT_INITIAL_ADDRESS";
-    public final static String PIG_OUTPUT_PARTITIONER = "PIG_OUTPUT_PARTITIONER";
-    public final static String PIG_RPC_PORT = "PIG_RPC_PORT";
-    public final static String PIG_INITIAL_ADDRESS = "PIG_INITIAL_ADDRESS";
-    public final static String PIG_PARTITIONER = "PIG_PARTITIONER";
-    public final static String PIG_INPUT_FORMAT = "PIG_INPUT_FORMAT";
-    public final static String PIG_OUTPUT_FORMAT = "PIG_OUTPUT_FORMAT";
-    public final static String PIG_INPUT_SPLIT_SIZE = "PIG_INPUT_SPLIT_SIZE";
-
-
-    public final static String PARTITION_FILTER_SIGNATURE = "cassandra.partition.filter";
-
-    protected static void setConnectionInformation(Configuration conf)
-    {
-        if (System.getenv(PIG_RPC_PORT) != null)
-        {
-            ConfigHelper.setInputRpcPort(conf, System.getenv(PIG_RPC_PORT));
-            ConfigHelper.setOutputRpcPort(conf, System.getenv(PIG_RPC_PORT));
-        }
-
-        if (System.getenv(PIG_INPUT_RPC_PORT) != null)
-            ConfigHelper.setInputRpcPort(conf, System.getenv(PIG_INPUT_RPC_PORT));
-        if (System.getenv(PIG_OUTPUT_RPC_PORT) != null)
-            ConfigHelper.setOutputRpcPort(conf, System.getenv(PIG_OUTPUT_RPC_PORT));
-
-        if (System.getenv(PIG_INITIAL_ADDRESS) != null)
-        {
-            ConfigHelper.setInputInitialAddress(conf, System.getenv(PIG_INITIAL_ADDRESS));
-            ConfigHelper.setOutputInitialAddress(conf, System.getenv(PIG_INITIAL_ADDRESS));
-        }
-        if (System.getenv(PIG_INPUT_INITIAL_ADDRESS) != null)
-            ConfigHelper.setInputInitialAddress(conf, System.getenv(PIG_INPUT_INITIAL_ADDRESS));
-        if (System.getenv(PIG_OUTPUT_INITIAL_ADDRESS) != null)
-            ConfigHelper.setOutputInitialAddress(conf, System.getenv(PIG_OUTPUT_INITIAL_ADDRESS));
-
-        if (System.getenv(PIG_PARTITIONER) != null)
-        {
-            ConfigHelper.setInputPartitioner(conf, System.getenv(PIG_PARTITIONER));
-            ConfigHelper.setOutputPartitioner(conf, System.getenv(PIG_PARTITIONER));
-        }
-        if(System.getenv(PIG_INPUT_PARTITIONER) != null)
-            ConfigHelper.setInputPartitioner(conf, System.getenv(PIG_INPUT_PARTITIONER));
-        if(System.getenv(PIG_OUTPUT_PARTITIONER) != null)
-            ConfigHelper.setOutputPartitioner(conf, System.getenv(PIG_OUTPUT_PARTITIONER));
-    }
-
-    protected static Object cassandraToObj(AbstractType validator, ByteBuffer value, int nativeProtocolVersion)
-    {
-        if (validator instanceof DecimalType || validator instanceof InetAddressType)
-            return validator.getString(value);
-
-        if (validator instanceof CollectionType)
-        {
-            // For CollectionType, the compose() method assumes the v3 protocol format of collection, which
-            // is not correct here since we query using the CQL-over-thrift interface which use the pre-v3 format
-            return ((CollectionSerializer)validator.getSerializer()).deserializeForNativeProtocol(value, nativeProtocolVersion);
-        }
-
-        return validator.compose(value);
-    }
-
-    /** set the value to the position of the tuple */
-    protected static void setTupleValue(Tuple pair, int position, Object value) throws ExecException
-    {
-        if (value instanceof BigInteger)
-            pair.set(position, ((BigInteger) value).intValue());
-        else if (value instanceof ByteBuffer)
-            pair.set(position, new DataByteArray(ByteBufferUtil.getArray((ByteBuffer) value)));
-        else if (value instanceof UUID)
-            pair.set(position, new DataByteArray(UUIDGen.decompose((java.util.UUID) value)));
-        else if (value instanceof Date)
-            pair.set(position, TimestampType.instance.decompose((Date) value).getLong());
-        else
-            pair.set(position, value);
-    }
-
-    /** get pig type for the cassandra data type*/
-    protected static byte getPigType(AbstractType type)
-    {
-        if (type instanceof LongType || type instanceof DateType || type instanceof TimestampType) // DateType is bad and it should feel bad
-            return DataType.LONG;
-        else if (type instanceof IntegerType || type instanceof Int32Type) // IntegerType will overflow at 2**31, but is kept for compatibility until pig has a BigInteger
-            return DataType.INTEGER;
-        else if (type instanceof AsciiType || type instanceof UTF8Type || type instanceof DecimalType || type instanceof InetAddressType)
-            return DataType.CHARARRAY;
-        else if (type instanceof FloatType)
-            return DataType.FLOAT;
-        else if (type instanceof DoubleType)
-            return DataType.DOUBLE;
-        else if (type instanceof AbstractCompositeType || type instanceof CollectionType)
-            return DataType.TUPLE;
-
-        return DataType.BYTEARRAY;
-    }
-}

diff --git a/src/java/org/apache/cassandra/hints/ChecksumMismatchException.java b/src/java/org/apache/cassandra/hints/ChecksumMismatchException.java
new file mode 100644
index 0000000..84dbbb2
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/ChecksumMismatchException.java

@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+
+final class ChecksumMismatchException extends IOException
+{
+    ChecksumMismatchException(String message)
+    {
+        super(message);
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/ChecksummedDataInput.java b/src/java/org/apache/cassandra/hints/ChecksummedDataInput.java
new file mode 100644
index 0000000..a78256b
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/ChecksummedDataInput.java

@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.zip.CRC32;
+
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.NativeLibrary;
+
+/**
+ * A {@link RandomAccessReader} wrapper that calculates the CRC in place.
+ *
+ * Useful for {@link org.apache.cassandra.hints.HintsReader}, for example, where we must verify the CRC, yet don't want
+ * to allocate an extra byte array just that purpose. The CRC can be embedded in the input stream and checked via checkCrc().
+ *
+ * In addition to calculating the CRC, it allows to enforce a maximum known size. This is needed
+ * so that {@link org.apache.cassandra.db.Mutation.MutationSerializer} doesn't blow up the heap when deserializing a
+ * corrupted sequence by reading a huge corrupted length of bytes via
+ * {@link org.apache.cassandra.utils.ByteBufferUtil#readWithLength(java.io.DataInput)}.
+ */
+public class ChecksummedDataInput extends RandomAccessReader.RandomAccessReaderWithOwnChannel
+{
+    private final CRC32 crc;
+    private int crcPosition;
+    private boolean crcUpdateDisabled;
+
+    private long limit;
+    private DataPosition limitMark;
+
+    protected ChecksummedDataInput(Builder builder)
+    {
+        super(builder);
+
+        crc = new CRC32();
+        crcPosition = 0;
+        crcUpdateDisabled = false;
+
+        resetLimit();
+    }
+
+    @SuppressWarnings("resource")   // channel owned by RandomAccessReaderWithOwnChannel
+    public static ChecksummedDataInput open(File file)
+    {
+        return new Builder(new ChannelProxy(file)).build();
+    }
+
+    static class Position implements InputPosition
+    {
+        final long sourcePosition;
+
+        public Position(long sourcePosition)
+        {
+            super();
+            this.sourcePosition = sourcePosition;
+        }
+
+        @Override
+        public long subtract(InputPosition other)
+        {
+            return sourcePosition - ((Position)other).sourcePosition;
+        }
+    }
+
+    /**
+     * Return a seekable representation of the current position. For compressed files this is chunk position
+     * in file and offset within chunk.
+     */
+    public InputPosition getSeekPosition()
+    {
+        return new Position(getPosition());
+    }
+
+    public void seek(InputPosition pos)
+    {
+        updateCrc();
+        bufferOffset = ((Position) pos).sourcePosition;
+        buffer.position(0).limit(0);
+    }
+
+    public void resetCrc()
+    {
+        crc.reset();
+        crcPosition = buffer.position();
+    }
+
+    public void limit(long newLimit)
+    {
+        limit = newLimit;
+        limitMark = mark();
+    }
+
+    /**
+     * Returns the position in the source file, which is different for getPosition() for compressed/encrypted files
+     * and may be imprecise.
+     */
+    protected long getSourcePosition()
+    {
+        return bufferOffset;
+    }
+
+    public void resetLimit()
+    {
+        limit = Long.MAX_VALUE;
+        limitMark = null;
+    }
+
+    public void checkLimit(int length) throws IOException
+    {
+        if (limitMark == null)
+            return;
+
+        if ((bytesPastLimit() + length) > limit)
+            throw new IOException("Digest mismatch exception");
+    }
+
+    public long bytesPastLimit()
+    {
+        assert limitMark != null;
+        return bytesPastMark(limitMark);
+    }
+
+    public boolean checkCrc() throws IOException
+    {
+        try
+        {
+            updateCrc();
+
+            // we must disable crc updates in case we rebuffer
+            // when called source.readInt()
+            crcUpdateDisabled = true;
+            return ((int) crc.getValue()) == readInt();
+        }
+        finally
+        {
+            crcPosition = buffer.position();
+            crcUpdateDisabled = false;
+        }
+    }
+
+    @Override
+    public void readFully(byte[] b) throws IOException
+    {
+        checkLimit(b.length);
+        super.readFully(b);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        checkLimit(len);
+        return super.read(b, off, len);
+    }
+
+    @Override
+    public void reBuffer()
+    {
+        updateCrc();
+        super.reBuffer();
+        crcPosition = buffer.position();
+    }
+
+    public void tryUncacheRead()
+    {
+        NativeLibrary.trySkipCache(getChannel().getFileDescriptor(), 0, getSourcePosition(), getPath());
+    }
+
+    private void updateCrc()
+    {
+        if (crcPosition == buffer.position() || crcUpdateDisabled)
+            return;
+
+        assert crcPosition >= 0 && crcPosition < buffer.position();
+
+        ByteBuffer unprocessed = buffer.duplicate();
+        unprocessed.position(crcPosition)
+                   .limit(buffer.position());
+
+        crc.update(unprocessed);
+    }
+
+    public static class Builder extends RandomAccessReader.Builder
+    {
+        public Builder(ChannelProxy channel)
+        {
+            super(channel);
+        }
+
+        public ChecksummedDataInput build()
+        {
+            return new ChecksummedDataInput(this);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/CompressedChecksummedDataInput.java b/src/java/org/apache/cassandra/hints/CompressedChecksummedDataInput.java
new file mode 100644
index 0000000..c0de1cf
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/CompressedChecksummedDataInput.java

@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.utils.memory.BufferPool;
+
+public final class CompressedChecksummedDataInput extends ChecksummedDataInput
+{
+    private final ICompressor compressor;
+    private volatile long filePosition = 0;     // Current position in file, advanced when reading chunk.
+    private volatile long sourcePosition = 0;   // Current position in file to report, advanced after consuming chunk.
+    private volatile ByteBuffer compressedBuffer = null;
+    private final ByteBuffer metadataBuffer = ByteBuffer.allocate(CompressedHintsWriter.METADATA_SIZE);
+
+    public CompressedChecksummedDataInput(Builder builder)
+    {
+        super(builder);
+        assert regions == null;  //mmapped regions are not supported
+
+        compressor = builder.compressor;
+        sourcePosition =  filePosition = builder.position;
+    }
+
+    /**
+     * Since an entire block of compressed data is read off of disk, not just a hint at a time,
+     * we don't report EOF until the decompressed data has also been read completely
+     */
+    public boolean isEOF()
+    {
+        return filePosition == channel.size() && buffer.remaining() == 0;
+    }
+
+    public long getSourcePosition()
+    {
+        return sourcePosition;
+    }
+
+    static class Position extends ChecksummedDataInput.Position
+    {
+        final long bufferStart;
+        final int bufferPosition;
+
+        public Position(long sourcePosition, long bufferStart, int bufferPosition)
+        {
+            super(sourcePosition);
+            this.bufferStart = bufferStart;
+            this.bufferPosition = bufferPosition;
+        }
+
+        @Override
+        public long subtract(InputPosition o)
+        {
+            Position other = (Position) o;
+            return bufferStart - other.bufferStart + bufferPosition - other.bufferPosition;
+        }
+    }
+
+    public InputPosition getSeekPosition()
+    {
+        return new Position(sourcePosition, bufferOffset, buffer.position());
+    }
+
+    public void seek(InputPosition p)
+    {
+        Position pos = (Position) p;
+        bufferOffset = pos.bufferStart;
+        filePosition = pos.sourcePosition;
+        buffer.position(0).limit(0);
+        resetCrc();
+        reBuffer();
+        buffer.position(pos.bufferPosition);
+        assert sourcePosition == pos.sourcePosition;
+        assert bufferOffset == pos.bufferStart;
+        assert buffer.position() == pos.bufferPosition;
+    }
+
+    protected void reBufferStandard()
+    {
+        sourcePosition = filePosition;
+        if (isEOF())
+            return;
+
+        metadataBuffer.clear();
+        channel.read(metadataBuffer, filePosition);
+        filePosition += CompressedHintsWriter.METADATA_SIZE;
+        metadataBuffer.rewind();
+
+        int uncompressedSize = metadataBuffer.getInt();
+        int compressedSize = metadataBuffer.getInt();
+
+        if (compressedBuffer == null || compressedSize > compressedBuffer.capacity())
+        {
+            int bufferSize = compressedSize + (compressedSize / 20);  // allocate +5% to cover variability in compressed size
+            if (compressedBuffer != null)
+            {
+                BufferPool.put(compressedBuffer);
+            }
+            compressedBuffer = allocateBuffer(bufferSize, compressor.preferredBufferType());
+        }
+
+        compressedBuffer.clear();
+        compressedBuffer.limit(compressedSize);
+        channel.read(compressedBuffer, filePosition);
+        compressedBuffer.rewind();
+        filePosition += compressedSize;
+
+        bufferOffset += buffer.position();
+        if (buffer.capacity() < uncompressedSize)
+        {
+            int bufferSize = uncompressedSize + (uncompressedSize / 20);
+            BufferPool.put(buffer);
+            buffer = allocateBuffer(bufferSize, compressor.preferredBufferType());
+        }
+
+        buffer.clear();
+        buffer.limit(uncompressedSize);
+        try
+        {
+            compressor.uncompress(compressedBuffer, buffer);
+            buffer.flip();
+        }
+        catch (IOException e)
+        {
+            throw new FSReadError(e, getPath());
+        }
+    }
+
+    protected void releaseBuffer()
+    {
+        super.releaseBuffer();
+        if (compressedBuffer != null)
+        {
+            BufferPool.put(compressedBuffer);
+            compressedBuffer = null;
+        }
+    }
+
+    protected void reBufferMmap()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public static final class Builder extends ChecksummedDataInput.Builder
+    {
+        private long position;
+        private ICompressor compressor;
+
+        public Builder(ChannelProxy channel)
+        {
+            super(channel);
+            bufferType = null;
+        }
+
+        public CompressedChecksummedDataInput build()
+        {
+            assert position >= 0;
+            assert compressor != null;
+            return new CompressedChecksummedDataInput(this);
+        }
+
+        public Builder withCompressor(ICompressor compressor)
+        {
+            this.compressor = compressor;
+            bufferType = compressor.preferredBufferType();
+            return this;
+        }
+
+        public Builder withPosition(long position)
+        {
+            this.position = position;
+            return this;
+        }
+    }
+
+    // Closing the CompressedChecksummedDataInput will close the underlying channel.
+    @SuppressWarnings("resource")
+    public static final CompressedChecksummedDataInput upgradeInput(ChecksummedDataInput input, ICompressor compressor)
+    {
+        long position = input.getPosition();
+        input.close();
+
+        Builder builder = new Builder(new ChannelProxy(input.getPath()));
+        builder.withPosition(position);
+        builder.withCompressor(compressor);
+        return builder.build();
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/CompressedHintsWriter.java b/src/java/org/apache/cassandra/hints/CompressedHintsWriter.java
new file mode 100644
index 0000000..491dceb
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/CompressedHintsWriter.java

@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.zip.CRC32;
+
+import org.apache.cassandra.io.compress.ICompressor;
+
+public class CompressedHintsWriter extends HintsWriter
+{
+    // compressed and uncompressed size is stored at the beginning of each compressed block
+    static final int METADATA_SIZE = 8;
+
+    private final ICompressor compressor;
+
+    private volatile ByteBuffer compressionBuffer = null;
+
+    public CompressedHintsWriter(File directory, HintsDescriptor descriptor, File file, FileChannel channel, int fd, CRC32 globalCRC)
+    {
+        super(directory, descriptor, file, channel, fd, globalCRC);
+        compressor = descriptor.createCompressor();
+        assert compressor != null;
+    }
+
+    protected void writeBuffer(ByteBuffer bb) throws IOException
+    {
+        int originalSize = bb.remaining();
+        int estimatedSize = compressor.initialCompressedBufferLength(originalSize) + METADATA_SIZE;
+
+        if (compressionBuffer == null || compressionBuffer.capacity() < estimatedSize)
+        {
+            compressionBuffer = compressor.preferredBufferType().allocate(estimatedSize);
+        }
+        compressionBuffer.clear();
+
+        compressionBuffer.position(METADATA_SIZE);
+        compressor.compress(bb, compressionBuffer);
+        int compressedSize = compressionBuffer.position() - METADATA_SIZE;
+
+        compressionBuffer.rewind();
+        compressionBuffer.putInt(originalSize);
+        compressionBuffer.putInt(compressedSize);
+        compressionBuffer.rewind();
+        compressionBuffer.limit(compressedSize + METADATA_SIZE);
+        super.writeBuffer(compressionBuffer);
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/EncodedHintMessage.java b/src/java/org/apache/cassandra/hints/EncodedHintMessage.java
new file mode 100644
index 0000000..4fe05ac
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/EncodedHintMessage.java

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.UUIDSerializer;
+
+/**
+ * A specialized version of {@link HintMessage} that takes an already encoded in a bytebuffer hint and sends it verbatim.
+ *
+ * An optimization for when dispatching a hint file of the current messaging version to a node of the same messaging version,
+ * which is the most common case. Saves on extra ByteBuffer allocations one redundant hint deserialization-serialization cycle.
+ *
+ * Never deserialized as an EncodedHintMessage - the receiving side will always deserialize the message as vanilla
+ * {@link HintMessage}.
+ */
+final class EncodedHintMessage
+{
+    private static final IVersionedSerializer<EncodedHintMessage> serializer = new Serializer();
+
+    private final UUID hostId;
+    private final ByteBuffer hint;
+    private final int version;
+
+    EncodedHintMessage(UUID hostId, ByteBuffer hint, int version)
+    {
+        this.hostId = hostId;
+        this.hint = hint;
+        this.version = version;
+    }
+
+    MessageOut<EncodedHintMessage> createMessageOut()
+    {
+        return new MessageOut<>(MessagingService.Verb.HINT, this, serializer);
+    }
+
+    private static class Serializer implements IVersionedSerializer<EncodedHintMessage>
+    {
+        public long serializedSize(EncodedHintMessage message, int version)
+        {
+            if (version != message.version)
+                throw new IllegalArgumentException("serializedSize() called with non-matching version " + version);
+
+            long size = UUIDSerializer.serializer.serializedSize(message.hostId, version);
+            size += TypeSizes.sizeofUnsignedVInt(message.hint.remaining());
+            size += message.hint.remaining();
+            return size;
+        }
+
+        public void serialize(EncodedHintMessage message, DataOutputPlus out, int version) throws IOException
+        {
+            if (version != message.version)
+                throw new IllegalArgumentException("serialize() called with non-matching version " + version);
+
+            UUIDSerializer.serializer.serialize(message.hostId, out, version);
+            out.writeUnsignedVInt(message.hint.remaining());
+            out.write(message.hint);
+        }
+
+        public EncodedHintMessage deserialize(DataInputPlus in, int version) throws IOException
+        {
+            throw new UnsupportedOperationException();
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/Hint.java b/src/java/org/apache/cassandra/hints/Hint.java
new file mode 100644
index 0000000..17fbf5d
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/Hint.java

@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.base.Throwables;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+import static org.apache.cassandra.db.TypeSizes.sizeof;
+import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
+
+/**
+ * Encapsulates the hinted mutation, its creation time, and the gc grace seconds param for each table involved.
+ *
+ * - Why do we need to track hint creation time?
+ * - We must exclude updates for tables that have been truncated after hint's creation, otherwise the result is data corruption.
+ *
+ * - Why do we need to track gc grace seconds?
+ * - Hints can stay in storage for a while before being applied, and without recording gc grace seconds (+ creation time),
+ *   if we apply the mutation blindly, we risk resurrecting a deleted value, a tombstone for which had been already
+ *   compacted away while the hint was in storage.
+ *
+ *   We also look at the smallest current value of the gcgs param for each affected table when applying the hint, and use
+ *   creation time + min(recorded gc gs, current gcgs + current gc grace) as the overall hint expiration time.
+ *   This allows now to safely reduce gc gs on tables without worrying that an applied old hint might resurrect any data.
+ */
+public final class Hint
+{
+    public static final Serializer serializer = new Serializer();
+
+    final Mutation mutation;
+    final long creationTime;  // time of hint creation (in milliseconds)
+    final int gcgs; // the smallest gc gs of all involved tables
+
+    private Hint(Mutation mutation, long creationTime, int gcgs)
+    {
+        this.mutation = mutation;
+        this.creationTime = creationTime;
+        this.gcgs = gcgs;
+    }
+
+    /**
+     * @param mutation the hinted mutation
+     * @param creationTime time of this hint's creation (in milliseconds since epoch)
+     */
+    public static Hint create(Mutation mutation, long creationTime)
+    {
+        return new Hint(mutation, creationTime, mutation.smallestGCGS());
+    }
+
+    /*
+     * @param mutation the hinted mutation
+     * @param creationTime time of this hint's creation (in milliseconds since epoch)
+     * @param gcgs the smallest gcgs of all tables involved at the time of hint creation (in seconds)
+     */
+    public static Hint create(Mutation mutation, long creationTime, int gcgs)
+    {
+        return new Hint(mutation, creationTime, gcgs);
+    }
+
+    /**
+     * Applies the contained mutation unless it's expired, filtering out any updates for truncated tables
+     */
+    CompletableFuture<?> applyFuture()
+    {
+        if (isLive())
+        {
+            // filter out partition update for table that have been truncated since hint's creation
+            Mutation filtered = mutation;
+            for (UUID id : mutation.getColumnFamilyIds())
+                if (creationTime <= SystemKeyspace.getTruncatedAt(id))
+                    filtered = filtered.without(id);
+
+            if (!filtered.isEmpty())
+                return filtered.applyFuture();
+        }
+
+        return CompletableFuture.completedFuture(null);
+    }
+
+    void apply()
+    {
+        try
+        {
+            applyFuture().get();
+        }
+        catch (Exception e)
+        {
+            throw Throwables.propagate(e.getCause());
+        }
+    }
+
+    /**
+     * @return calculates whether or not it is safe to apply the hint without risking to resurrect any deleted data
+     */
+    boolean isLive()
+    {
+        int smallestGCGS = Math.min(gcgs, mutation.smallestGCGS());
+        long expirationTime = creationTime + TimeUnit.SECONDS.toMillis(smallestGCGS);
+        return expirationTime > System.currentTimeMillis();
+    }
+
+    static final class Serializer implements IVersionedSerializer<Hint>
+    {
+        public long serializedSize(Hint hint, int version)
+        {
+            long size = sizeof(hint.creationTime);
+            size += sizeofUnsignedVInt(hint.gcgs);
+            size += Mutation.serializer.serializedSize(hint.mutation, version);
+            return size;
+        }
+
+        public void serialize(Hint hint, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeLong(hint.creationTime);
+            out.writeUnsignedVInt(hint.gcgs);
+            Mutation.serializer.serialize(hint.mutation, out, version);
+        }
+
+        public Hint deserialize(DataInputPlus in, int version) throws IOException
+        {
+            long creationTime = in.readLong();
+            int gcgs = (int) in.readUnsignedVInt();
+            return new Hint(Mutation.serializer.deserialize(in, version), creationTime, gcgs);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintMessage.java b/src/java/org/apache/cassandra/hints/HintMessage.java
new file mode 100644
index 0000000..723ab6d
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintMessage.java

@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.UUID;
+
+import javax.annotation.Nullable;
+
+import com.google.common.primitives.Ints;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.UnknownColumnFamilyException;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.io.util.TrackedDataInputPlus;
+import org.apache.cassandra.utils.UUIDSerializer;
+
+/**
+ * The message we use to dispatch and forward hints.
+ *
+ * Encodes the host id the hint is meant for and the hint itself.
+ * We use the host id to determine whether we should store or apply the hint:
+ * 1. If host id equals to the receiving node host id, then we apply the hint
+ * 2. If host id is different from the receiving node's host id, then we store the hint
+ *
+ * Scenario (1) means that we are dealing with regular hint dispatch.
+ * Scenario (2) means that we got a hint from a node that's going through decommissioning and is streaming its hints
+ * elsewhere first.
+ */
+public final class HintMessage
+{
+    public static final IVersionedSerializer<HintMessage> serializer = new Serializer();
+
+    final UUID hostId;
+
+    @Nullable // can be null if we fail do decode the hint because of an unknown table id in it
+    final Hint hint;
+
+    @Nullable // will usually be null, unless a hint deserialization fails due to an unknown table id
+    final UUID unknownTableID;
+
+    HintMessage(UUID hostId, Hint hint)
+    {
+        this.hostId = hostId;
+        this.hint = hint;
+        this.unknownTableID = null;
+    }
+
+    HintMessage(UUID hostId, UUID unknownTableID)
+    {
+        this.hostId = hostId;
+        this.hint = null;
+        this.unknownTableID = unknownTableID;
+    }
+
+    public MessageOut<HintMessage> createMessageOut()
+    {
+        return new MessageOut<>(MessagingService.Verb.HINT, this, serializer);
+    }
+
+    public static class Serializer implements IVersionedSerializer<HintMessage>
+    {
+        public long serializedSize(HintMessage message, int version)
+        {
+            long size = UUIDSerializer.serializer.serializedSize(message.hostId, version);
+
+            long hintSize = Hint.serializer.serializedSize(message.hint, version);
+            size += TypeSizes.sizeofUnsignedVInt(hintSize);
+            size += hintSize;
+
+            return size;
+        }
+
+        public void serialize(HintMessage message, DataOutputPlus out, int version) throws IOException
+        {
+            Objects.requireNonNull(message.hint); // we should never *send* a HintMessage with null hint
+
+            UUIDSerializer.serializer.serialize(message.hostId, out, version);
+
+            /*
+             * We are serializing the hint size so that the receiver of the message could gracefully handle
+             * deserialize failure when a table had been dropped, by simply skipping the unread bytes.
+             */
+            out.writeUnsignedVInt(Hint.serializer.serializedSize(message.hint, version));
+
+            Hint.serializer.serialize(message.hint, out, version);
+        }
+
+        /*
+         * It's not an exceptional scenario to have a hints file streamed that have partition updates for tables
+         * that don't exist anymore. We want to handle that case gracefully instead of dropping the connection for every
+         * one of them.
+         */
+        public HintMessage deserialize(DataInputPlus in, int version) throws IOException
+        {
+            UUID hostId = UUIDSerializer.serializer.deserialize(in, version);
+
+            long hintSize = in.readUnsignedVInt();
+            TrackedDataInputPlus countingIn = new TrackedDataInputPlus(in);
+            try
+            {
+                return new HintMessage(hostId, Hint.serializer.deserialize(countingIn, version));
+            }
+            catch (UnknownColumnFamilyException e)
+            {
+                in.skipBytes(Ints.checkedCast(hintSize - countingIn.getBytesRead()));
+                return new HintMessage(hostId, e.cfId);
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintResponse.java b/src/java/org/apache/cassandra/hints/HintResponse.java
new file mode 100644
index 0000000..8aa888f
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintResponse.java

@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.hints;
+
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+
+/**
+ * An empty successful response to a HintMessage.
+ */
+public final class HintResponse
+{
+    public static final IVersionedSerializer<HintResponse> serializer = new Serializer();
+
+    static final HintResponse instance = new HintResponse();
+    static final MessageOut<HintResponse> message =
+        new MessageOut<>(MessagingService.Verb.REQUEST_RESPONSE, instance, serializer);
+
+    private HintResponse()
+    {
+    }
+
+    private static final class Serializer implements IVersionedSerializer<HintResponse>
+    {
+        public long serializedSize(HintResponse response, int version)
+        {
+            return 0;
+        }
+
+        public void serialize(HintResponse response, DataOutputPlus out, int version)
+        {
+        }
+
+        public HintResponse deserialize(DataInputPlus in, int version)
+        {
+            return instance;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintVerbHandler.java b/src/java/org/apache/cassandra/hints/HintVerbHandler.java
new file mode 100644
index 0000000..2b92a42
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintVerbHandler.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.net.InetAddress;
+import java.util.UUID;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.net.IVerbHandler;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+
+/**
+ * Verb handler used both for hint dispatch and streaming.
+ *
+ * With the non-sstable format, we cannot just stream hint sstables on node decommission. So sometimes, at decommission
+ * time, we might have to stream hints to a non-owning host (say, if the owning host B is down during decommission of host A).
+ * In that case the handler just stores the received hint in its local hint store.
+ */
+public final class HintVerbHandler implements IVerbHandler<HintMessage>
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintVerbHandler.class);
+
+    public void doVerb(MessageIn<HintMessage> message, int id)
+    {
+        UUID hostId = message.payload.hostId;
+        Hint hint = message.payload.hint;
+        InetAddress address = StorageService.instance.getEndpointForHostId(hostId);
+
+        // If we see an unknown table id, it means the table, or one of the tables in the mutation, had been dropped.
+        // In that case there is nothing we can really do, or should do, other than log it go on.
+        // This will *not* happen due to a not-yet-seen table, because we don't transfer hints unless there
+        // is schema agreement between the sender and the receiver.
+        if (hint == null)
+        {
+            logger.trace("Failed to decode and apply a hint for {}: {} - table with id {} is unknown",
+                         address,
+                         hostId,
+                         message.payload.unknownTableID);
+            reply(id, message.from);
+            return;
+        }
+
+        // We must perform validation before applying the hint, and there is no other place to do it other than here.
+        try
+        {
+            hint.mutation.getPartitionUpdates().forEach(PartitionUpdate::validate);
+        }
+        catch (MarshalException e)
+        {
+            logger.warn("Failed to validate a hint for {}: {} - skipped", address, hostId);
+            reply(id, message.from);
+            return;
+        }
+
+        if (!hostId.equals(StorageService.instance.getLocalHostUUID()))
+        {
+            // the node is not the final destination of the hint (must have gotten it from a decommissioning node),
+            // so just store it locally, to be delivered later.
+            HintsService.instance.write(hostId, hint);
+            reply(id, message.from);
+        }
+        else if (!StorageProxy.instance.appliesLocally(hint.mutation))
+        {
+            // the topology has changed, and we are no longer a replica of the mutation - since we don't know which node(s)
+            // it has been handed over to, re-address the hint to all replicas; see CASSANDRA-5902.
+            HintsService.instance.writeForAllReplicas(hint);
+            reply(id, message.from);
+        }
+        else
+        {
+            // the common path - the node is both the destination and a valid replica for the hint.
+            hint.applyFuture().thenAccept(o -> reply(id, message.from)).exceptionally(e -> {logger.debug("Failed to apply hint", e); return null;});
+        }
+    }
+
+    private static void reply(int id, InetAddress to)
+    {
+        MessagingService.instance().sendReply(HintResponse.message, id, to);
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsBuffer.java b/src/java/org/apache/cassandra/hints/HintsBuffer.java
new file mode 100644
index 0000000..e86dede
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsBuffer.java

@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.zip.CRC32;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputBufferFixed;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+import static org.apache.cassandra.utils.FBUtilities.updateChecksum;
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
+
+/**
+ * A shared buffer that temporarily holds the serialized hints before they are flushed to disk.
+ *
+ * Consists of :
+ * - a ByteBuffer holding the serialized hints (length, length checksum and total checksum included)
+ * - a pointer to the current allocation offset
+ * - an {@link OpOrder} appendOrder for {@link HintsWriteExecutor} to wait on for all writes completion
+ * - a map of (host id -> offset queue) for the hints written
+ *
+ * It's possible to write a single hint for two or more hosts at the same time, in which case the same offset will be put
+ * into two or more offset queues.
+ */
+final class HintsBuffer
+{
+    // hint entry overhead in bytes (int length, int length checksum, int body checksum)
+    static final int ENTRY_OVERHEAD_SIZE = 12;
+    static final int CLOSED = -1;
+
+    private final ByteBuffer slab; // the underlying backing ByteBuffer for all the serialized hints
+    private final AtomicInteger position; // the position in the slab that we currently allocate from
+
+    private final ConcurrentMap<UUID, Queue<Integer>> offsets;
+    private final OpOrder appendOrder;
+
+    private HintsBuffer(ByteBuffer slab)
+    {
+        this.slab = slab;
+
+        position = new AtomicInteger();
+        offsets = new ConcurrentHashMap<>();
+        appendOrder = new OpOrder();
+    }
+
+    static HintsBuffer create(int slabSize)
+    {
+        return new HintsBuffer(ByteBuffer.allocateDirect(slabSize));
+    }
+
+    boolean isClosed()
+    {
+        return position.get() == CLOSED;
+    }
+
+    int capacity()
+    {
+        return slab.capacity();
+    }
+
+    int remaining()
+    {
+        int pos = position.get();
+        return pos == CLOSED ? 0 : capacity() - pos;
+    }
+
+    HintsBuffer recycle()
+    {
+        slab.clear();
+        return new HintsBuffer(slab);
+    }
+
+    void free()
+    {
+        FileUtils.clean(slab);
+    }
+
+    /**
+     * Wait for any appends started before this method was called.
+     */
+    void waitForModifications()
+    {
+        appendOrder.awaitNewBarrier(); // issue a barrier and wait for it
+    }
+
+    Set<UUID> hostIds()
+    {
+        return offsets.keySet();
+    }
+
+    /**
+     * Coverts the queue of offsets for the selected host id into an iterator of hints encoded as ByteBuffers.
+     */
+    Iterator<ByteBuffer> consumingHintsIterator(UUID hostId)
+    {
+        final Queue<Integer> bufferOffsets = offsets.get(hostId);
+
+        if (bufferOffsets == null)
+            return Collections.emptyIterator();
+
+        return new AbstractIterator<ByteBuffer>()
+        {
+            private final ByteBuffer flyweight = slab.duplicate();
+
+            protected ByteBuffer computeNext()
+            {
+                Integer offset = bufferOffsets.poll();
+
+                if (offset == null)
+                    return endOfData();
+
+                int totalSize = slab.getInt(offset) + ENTRY_OVERHEAD_SIZE;
+
+                return (ByteBuffer) flyweight.clear().position(offset).limit(offset + totalSize);
+            }
+        };
+    }
+
+    @SuppressWarnings("resource")
+    Allocation allocate(int hintSize)
+    {
+        int totalSize = hintSize + ENTRY_OVERHEAD_SIZE;
+
+        if (totalSize > slab.capacity() / 2)
+        {
+            throw new IllegalArgumentException(String.format("Hint of %s bytes is too large - the maximum size is %s",
+                                                             hintSize,
+                                                             slab.capacity() / 2));
+        }
+
+        OpOrder.Group opGroup = appendOrder.start(); // will eventually be closed by the receiver of the allocation
+        try
+        {
+            return allocate(totalSize, opGroup);
+        }
+        catch (Throwable t)
+        {
+            opGroup.close();
+            throw t;
+        }
+    }
+
+    private Allocation allocate(int totalSize, OpOrder.Group opGroup)
+    {
+        int offset = allocateBytes(totalSize);
+        if (offset < 0)
+        {
+            opGroup.close();
+            return null;
+        }
+        return new Allocation(offset, totalSize, opGroup);
+    }
+
+    private int allocateBytes(int totalSize)
+    {
+        while (true)
+        {
+            int prev = position.get();
+            int next = prev + totalSize;
+
+            if (prev == CLOSED) // the slab has been 'closed'
+                return CLOSED;
+
+            if (next > slab.capacity())
+            {
+                position.set(CLOSED); // mark the slab as no longer allocating if we've exceeded its capacity
+                return CLOSED;
+            }
+
+            if (position.compareAndSet(prev, next))
+                return prev;
+        }
+    }
+
+    private void put(UUID hostId, int offset)
+    {
+        // we intentionally don't just return offsets.computeIfAbsent() because it's expensive compared to simple get(),
+        // and the method is on a really hot path
+        Queue<Integer> queue = offsets.get(hostId);
+        if (queue == null)
+            queue = offsets.computeIfAbsent(hostId, (id) -> new ConcurrentLinkedQueue<>());
+        queue.offer(offset);
+    }
+
+    /**
+     * A placeholder for hint serialization. Should always be used in a try-with-resources block.
+     */
+    final class Allocation implements AutoCloseable
+    {
+        private final Integer offset;
+        private final int totalSize;
+        private final OpOrder.Group opGroup;
+
+        Allocation(int offset, int totalSize, OpOrder.Group opGroup)
+        {
+            this.offset = offset;
+            this.totalSize = totalSize;
+            this.opGroup = opGroup;
+        }
+
+        void write(Iterable<UUID> hostIds, Hint hint)
+        {
+            write(hint);
+            for (UUID hostId : hostIds)
+                put(hostId, offset);
+        }
+
+        public void close()
+        {
+            opGroup.close();
+        }
+
+        private void write(Hint hint)
+        {
+            ByteBuffer buffer = (ByteBuffer) slab.duplicate().position(offset).limit(offset + totalSize);
+            CRC32 crc = new CRC32();
+            int hintSize = totalSize - ENTRY_OVERHEAD_SIZE;
+            try (DataOutputBuffer dop = new DataOutputBufferFixed(buffer))
+            {
+                dop.writeInt(hintSize);
+                updateChecksumInt(crc, hintSize);
+                dop.writeInt((int) crc.getValue());
+
+                Hint.serializer.serialize(hint, dop, MessagingService.current_version);
+                updateChecksum(crc, buffer, buffer.position() - hintSize, hintSize);
+                dop.writeInt((int) crc.getValue());
+            }
+            catch (IOException e)
+            {
+                throw new AssertionError(); // cannot happen
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsBufferPool.java b/src/java/org/apache/cassandra/hints/HintsBufferPool.java
new file mode 100644
index 0000000..f705de1
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsBufferPool.java

@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.UUID;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.net.MessagingService;
+import sun.nio.ch.DirectBuffer;
+
+/**
+ * A primitive pool of {@link HintsBuffer} buffers. Under normal conditions should only hold two buffers - the currently
+ * written to one, and a reserve buffer to switch to when the first one is beyond capacity.
+ */
+final class HintsBufferPool implements Closeable
+{
+    interface FlushCallback
+    {
+        void flush(HintsBuffer buffer, HintsBufferPool pool);
+    }
+
+    static final int MAX_ALLOCATED_BUFFERS = Integer.getInteger(Config.PROPERTY_PREFIX + "MAX_HINT_BUFFERS", 3);
+    private volatile HintsBuffer currentBuffer;
+    private final BlockingQueue<HintsBuffer> reserveBuffers;
+    private final int bufferSize;
+    private final FlushCallback flushCallback;
+    private int allocatedBuffers = 0;
+
+    HintsBufferPool(int bufferSize, FlushCallback flushCallback)
+    {
+        reserveBuffers = new LinkedBlockingQueue<>();
+        this.bufferSize = bufferSize;
+        this.flushCallback = flushCallback;
+    }
+
+    /**
+     * @param hostIds host ids of the hint's target nodes
+     * @param hint the hint to store
+     */
+    void write(Iterable<UUID> hostIds, Hint hint)
+    {
+        int hintSize = (int) Hint.serializer.serializedSize(hint, MessagingService.current_version);
+        try (HintsBuffer.Allocation allocation = allocate(hintSize))
+        {
+            allocation.write(hostIds, hint);
+        }
+    }
+
+    private HintsBuffer.Allocation allocate(int hintSize)
+    {
+        HintsBuffer current = currentBuffer();
+
+        while (true)
+        {
+            HintsBuffer.Allocation allocation = current.allocate(hintSize);
+            if (allocation != null)
+                return allocation;
+
+            // allocation failed due to insufficient size remaining in the buffer
+            if (switchCurrentBuffer(current))
+                flushCallback.flush(current, this);
+
+            current = currentBuffer;
+        }
+    }
+
+    void offer(HintsBuffer buffer)
+    {
+        if (!reserveBuffers.offer(buffer))
+            throw new RuntimeException("Failed to store buffer");
+    }
+
+    // A wrapper to ensure a non-null currentBuffer value on the first call.
+    HintsBuffer currentBuffer()
+    {
+        if (currentBuffer == null)
+            initializeCurrentBuffer();
+
+        return currentBuffer;
+    }
+
+    private synchronized void initializeCurrentBuffer()
+    {
+        if (currentBuffer == null)
+            currentBuffer = createBuffer();
+    }
+
+    private synchronized boolean switchCurrentBuffer(HintsBuffer previous)
+    {
+        if (currentBuffer != previous)
+            return false;
+
+        HintsBuffer buffer = reserveBuffers.poll();
+        if (buffer == null && allocatedBuffers >= MAX_ALLOCATED_BUFFERS)
+        {
+            try
+            {
+                //This BlockingQueue.take is a target for byteman in HintsBufferPoolTest
+                buffer = reserveBuffers.take();
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        currentBuffer = buffer == null ? createBuffer() : buffer;
+
+        return true;
+    }
+
+    private HintsBuffer createBuffer()
+    {
+        allocatedBuffers++;
+        return HintsBuffer.create(bufferSize);
+    }
+
+    public void close()
+    {
+        currentBuffer.free();
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsCatalog.java b/src/java/org/apache/cassandra/hints/HintsCatalog.java
new file mode 100644
index 0000000..48bbc08
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsCatalog.java

@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.stream.Stream;
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NativeLibrary;
+import org.apache.cassandra.utils.SyncUtil;
+
+import static java.util.stream.Collectors.groupingBy;
+
+/**
+ * A simple catalog for easy host id -> {@link HintsStore} lookup and manipulation.
+ */
+final class HintsCatalog
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsCatalog.class);
+
+    private final File hintsDirectory;
+    private final Map<UUID, HintsStore> stores;
+    private final ImmutableMap<String, Object> writerParams;
+
+    private HintsCatalog(File hintsDirectory, ImmutableMap<String, Object> writerParams, Map<UUID, List<HintsDescriptor>> descriptors)
+    {
+        this.hintsDirectory = hintsDirectory;
+        this.writerParams = writerParams;
+        this.stores = new ConcurrentHashMap<>();
+
+        for (Map.Entry<UUID, List<HintsDescriptor>> entry : descriptors.entrySet())
+            stores.put(entry.getKey(), HintsStore.create(entry.getKey(), hintsDirectory, writerParams, entry.getValue()));
+    }
+
+    /**
+     * Loads hints stores from a given directory.
+     */
+    static HintsCatalog load(File hintsDirectory, ImmutableMap<String, Object> writerParams)
+    {
+        try(Stream<Path> list = Files.list(hintsDirectory.toPath()))
+        {
+            Map<UUID, List<HintsDescriptor>> stores =
+                     list
+                     .filter(HintsDescriptor::isHintFileName)
+                     .map(HintsDescriptor::readFromFileQuietly)
+                     .filter(Optional::isPresent)
+                     .map(Optional::get)
+                     .collect(groupingBy(h -> h.hostId));
+            return new HintsCatalog(hintsDirectory, writerParams, stores);
+        }
+        catch (IOException e)
+        {
+            throw new FSReadError(e, hintsDirectory);
+        }
+    }
+
+    Stream<HintsStore> stores()
+    {
+        return stores.values().stream();
+    }
+
+    void maybeLoadStores(Iterable<UUID> hostIds)
+    {
+        for (UUID hostId : hostIds)
+            get(hostId);
+    }
+
+    HintsStore get(UUID hostId)
+    {
+        // we intentionally don't just return stores.computeIfAbsent() because it's expensive compared to simple get(),
+        // and in this case would also allocate for the capturing lambda; the method is on a really hot path
+        HintsStore store = stores.get(hostId);
+        return store == null
+             ? stores.computeIfAbsent(hostId, (id) -> HintsStore.create(id, hintsDirectory, writerParams, Collections.emptyList()))
+             : store;
+    }
+
+    @Nullable
+    HintsStore getNullable(UUID hostId)
+    {
+        return stores.get(hostId);
+    }
+
+    /**
+     * Delete all hints for all host ids.
+     *
+     * Will not delete the files that are currently being dispatched, or written to.
+     */
+    void deleteAllHints()
+    {
+        stores.keySet().forEach(this::deleteAllHints);
+    }
+
+    /**
+     * Delete all hints for the specified host id.
+     *
+     * Will not delete the files that are currently being dispatched, or written to.
+     */
+    void deleteAllHints(UUID hostId)
+    {
+        HintsStore store = stores.get(hostId);
+        if (store != null)
+            store.deleteAllHints();
+    }
+
+    /**
+     * @return true if at least one of the stores has a file pending dispatch
+     */
+    boolean hasFiles()
+    {
+        return stores().anyMatch(HintsStore::hasFiles);
+    }
+
+    void exciseStore(UUID hostId)
+    {
+        deleteAllHints(hostId);
+        stores.remove(hostId);
+    }
+
+    void fsyncDirectory()
+    {
+        int fd = NativeLibrary.tryOpenDirectory(hintsDirectory.getAbsolutePath());
+        if (fd != -1)
+        {
+            try
+            {
+                SyncUtil.trySync(fd);
+                NativeLibrary.tryCloseFD(fd);
+            }
+            catch (FSError e) // trySync failed
+            {
+                logger.error("Unable to sync directory {}", hintsDirectory.getAbsolutePath(), e);
+                FileUtils.handleFSErrorAndPropagate(e);
+            }
+        }
+        else if (!FBUtilities.isWindows())
+        {
+            logger.error("Unable to open directory {}", hintsDirectory.getAbsolutePath());
+            FileUtils.handleFSErrorAndPropagate(new FSWriteError(new IOException(String.format("Unable to open hint directory %s", hintsDirectory.getAbsolutePath())), hintsDirectory.getAbsolutePath()));
+        }
+    }
+
+    ImmutableMap<String, Object> getWriterParams()
+    {
+        return writerParams;
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsDescriptor.java b/src/java/org/apache/cassandra/hints/HintsDescriptor.java
new file mode 100644
index 0000000..e9e1c30
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsDescriptor.java

@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.Map;
+import java.util.Optional;
+import java.util.UUID;
+import java.util.regex.Pattern;
+import java.util.zip.CRC32;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.CompressionParams;
+import org.json.simple.JSONValue;
+
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
+
+/**
+ * Describes the host id, the version, the timestamp of creation, and an arbitrary map of JSON-encoded parameters of a
+ * hints file.
+ *
+ * Written in the beginning of each hints file.
+ */
+final class HintsDescriptor
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsDescriptor.class);
+
+    static final int VERSION_30 = 1;
+    static final int CURRENT_VERSION = VERSION_30;
+
+    static final String COMPRESSION = "compression";
+
+    static final Pattern pattern =
+        Pattern.compile("^[a-fA-F0-9]{8}\\-[a-fA-F0-9]{4}\\-[a-fA-F0-9]{4}\\-[a-fA-F0-9]{4}\\-[a-fA-F0-9]{12}\\-(\\d+)\\-(\\d+)\\.hints$");
+
+    final UUID hostId;
+    final int version;
+    final long timestamp;
+
+    // implemented for future compression support - see CASSANDRA-9428
+    final ImmutableMap<String, Object> parameters;
+    final ParameterizedClass compressionConfig;
+
+    HintsDescriptor(UUID hostId, int version, long timestamp, ImmutableMap<String, Object> parameters)
+    {
+        this.hostId = hostId;
+        this.version = version;
+        this.timestamp = timestamp;
+        this.parameters = parameters;
+        compressionConfig = createCompressionConfig(parameters);
+    }
+
+    HintsDescriptor(UUID hostId, long timestamp, ImmutableMap<String, Object> parameters)
+    {
+        this(hostId, CURRENT_VERSION, timestamp, parameters);
+    }
+
+    HintsDescriptor(UUID hostId, long timestamp)
+    {
+        this(hostId, CURRENT_VERSION, timestamp, ImmutableMap.<String, Object>of());
+    }
+
+    @SuppressWarnings("unchecked")
+    static ParameterizedClass createCompressionConfig(Map<String, Object> params)
+    {
+        if (params.containsKey(COMPRESSION))
+        {
+            Map<String, Object> compressorConfig = (Map<String, Object>) params.get(COMPRESSION);
+            return new ParameterizedClass((String) compressorConfig.get(ParameterizedClass.CLASS_NAME),
+                                          (Map<String, String>) compressorConfig.get(ParameterizedClass.PARAMETERS));
+        }
+        else
+        {
+            return null;
+        }
+    }
+
+    String fileName()
+    {
+        return String.format("%s-%s-%s.hints", hostId, timestamp, version);
+    }
+
+    String checksumFileName()
+    {
+        return String.format("%s-%s-%s.crc32", hostId, timestamp, version);
+    }
+
+    int messagingVersion()
+    {
+        return messagingVersion(version);
+    }
+
+    static int messagingVersion(int hintsVersion)
+    {
+        switch (hintsVersion)
+        {
+            case VERSION_30:
+                return MessagingService.FORCE_3_0_PROTOCOL_VERSION ? MessagingService.VERSION_30 : MessagingService.VERSION_3014;
+            default:
+                throw new AssertionError();
+        }
+    }
+
+    static boolean isHintFileName(Path path)
+    {
+        return pattern.matcher(path.getFileName().toString()).matches();
+    }
+
+    static Optional<HintsDescriptor> readFromFileQuietly(Path path)
+    {
+        try (RandomAccessFile raf = new RandomAccessFile(path.toFile(), "r"))
+        {
+            return Optional.of(deserialize(raf));
+        }
+        catch (ChecksumMismatchException e)
+        {
+            throw new FSReadError(e, path.toFile());
+        }
+        catch (IOException e)
+        {
+            logger.error("Failed to deserialize hints descriptor {}", path.toString(), e);
+            return Optional.empty();
+        }
+    }
+
+    static HintsDescriptor readFromFile(Path path)
+    {
+        try (RandomAccessFile raf = new RandomAccessFile(path.toFile(), "r"))
+        {
+            return deserialize(raf);
+        }
+        catch (IOException e)
+        {
+            throw new FSReadError(e, path.toFile());
+        }
+    }
+
+    public boolean isCompressed()
+    {
+        return compressionConfig != null;
+    }
+
+    public ICompressor createCompressor()
+    {
+        return isCompressed() ? CompressionParams.createCompressor(compressionConfig) : null;
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("hostId", hostId)
+                          .add("version", version)
+                          .add("timestamp", timestamp)
+                          .add("parameters", parameters)
+                          .toString();
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof HintsDescriptor))
+            return false;
+
+        HintsDescriptor hd = (HintsDescriptor) o;
+
+        return Objects.equal(hostId, hd.hostId)
+            && Objects.equal(version, hd.version)
+            && Objects.equal(timestamp, hd.timestamp)
+            && Objects.equal(parameters, hd.parameters);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(hostId, version, timestamp, parameters);
+    }
+
+    void serialize(DataOutputPlus out) throws IOException
+    {
+        CRC32 crc = new CRC32();
+
+        out.writeInt(version);
+        updateChecksumInt(crc, version);
+
+        out.writeLong(timestamp);
+        updateChecksumLong(crc, timestamp);
+
+        out.writeLong(hostId.getMostSignificantBits());
+        updateChecksumLong(crc, hostId.getMostSignificantBits());
+        out.writeLong(hostId.getLeastSignificantBits());
+        updateChecksumLong(crc, hostId.getLeastSignificantBits());
+
+        byte[] paramsBytes = JSONValue.toJSONString(parameters).getBytes(StandardCharsets.UTF_8);
+        out.writeInt(paramsBytes.length);
+        updateChecksumInt(crc, paramsBytes.length);
+        out.writeInt((int) crc.getValue());
+
+        out.write(paramsBytes);
+        crc.update(paramsBytes, 0, paramsBytes.length);
+
+        out.writeInt((int) crc.getValue());
+    }
+
+    int serializedSize()
+    {
+        int size = TypeSizes.sizeof(version);
+        size += TypeSizes.sizeof(timestamp);
+
+        size += TypeSizes.sizeof(hostId.getMostSignificantBits());
+        size += TypeSizes.sizeof(hostId.getLeastSignificantBits());
+
+        byte[] paramsBytes = JSONValue.toJSONString(parameters).getBytes(StandardCharsets.UTF_8);
+        size += TypeSizes.sizeof(paramsBytes.length);
+        size += 4; // size checksum
+        size += paramsBytes.length;
+        size += 4; // total checksum
+
+        return size;
+    }
+
+    static HintsDescriptor deserialize(DataInput in) throws IOException
+    {
+        CRC32 crc = new CRC32();
+
+        int version = in.readInt();
+        updateChecksumInt(crc, version);
+
+        long timestamp = in.readLong();
+        updateChecksumLong(crc, timestamp);
+
+        long msb = in.readLong();
+        updateChecksumLong(crc, msb);
+        long lsb = in.readLong();
+        updateChecksumLong(crc, lsb);
+
+        UUID hostId = new UUID(msb, lsb);
+
+        int paramsLength = in.readInt();
+        updateChecksumInt(crc, paramsLength);
+        validateCRC(in.readInt(), (int) crc.getValue());
+
+        byte[] paramsBytes = new byte[paramsLength];
+        in.readFully(paramsBytes, 0, paramsLength);
+        crc.update(paramsBytes, 0, paramsLength);
+        validateCRC(in.readInt(), (int) crc.getValue());
+
+        return new HintsDescriptor(hostId, version, timestamp, decodeJSONBytes(paramsBytes));
+    }
+
+    @SuppressWarnings("unchecked")
+    private static ImmutableMap<String, Object> decodeJSONBytes(byte[] bytes)
+    {
+        return ImmutableMap.copyOf((Map<String, Object>) JSONValue.parse(new String(bytes, StandardCharsets.UTF_8)));
+    }
+
+    private static void updateChecksumLong(CRC32 crc, long value)
+    {
+        updateChecksumInt(crc, (int) (value & 0xFFFFFFFFL));
+        updateChecksumInt(crc, (int) (value >>> 32));
+    }
+
+    private static void validateCRC(int expected, int actual) throws IOException
+    {
+        if (expected != actual)
+            throw new ChecksumMismatchException("Hints Descriptor CRC Mismatch");
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java b/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java
new file mode 100644
index 0000000..eda4179
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java

@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.net.InetAddress;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.BooleanSupplier;
+import java.util.function.Function;
+import java.util.function.Supplier;
+
+import com.google.common.util.concurrent.RateLimiter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.service.StorageService;
+
+/**
+ * A multi-threaded (by default) executor for dispatching hints.
+ *
+ * Most of dispatch is triggered by {@link HintsDispatchTrigger} running every ~10 seconds.
+ */
+final class HintsDispatchExecutor
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsDispatchExecutor.class);
+
+    private final File hintsDirectory;
+    private final ExecutorService executor;
+    private final AtomicBoolean isPaused;
+    private final Function<InetAddress, Boolean> isAlive;
+    private final Map<UUID, Future> scheduledDispatches;
+
+    HintsDispatchExecutor(File hintsDirectory, int maxThreads, AtomicBoolean isPaused, Function<InetAddress, Boolean> isAlive)
+    {
+        this.hintsDirectory = hintsDirectory;
+        this.isPaused = isPaused;
+        this.isAlive = isAlive;
+
+        scheduledDispatches = new ConcurrentHashMap<>();
+        executor = new JMXEnabledThreadPoolExecutor(maxThreads, 1, TimeUnit.MINUTES,
+                                                    new LinkedBlockingQueue<>(),
+                                                    new NamedThreadFactory("HintsDispatcher", Thread.MIN_PRIORITY),
+                                                    "internal");
+    }
+
+    /*
+     * It's safe to terminate dispatch in process and to deschedule dispatch.
+     */
+    void shutdownBlocking()
+    {
+        scheduledDispatches.clear();
+        executor.shutdownNow();
+        try
+        {
+            executor.awaitTermination(1, TimeUnit.MINUTES);
+        }
+        catch (InterruptedException e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    boolean isScheduled(HintsStore store)
+    {
+        return scheduledDispatches.containsKey(store.hostId);
+    }
+
+    Future dispatch(HintsStore store)
+    {
+        return dispatch(store, store.hostId);
+    }
+
+    Future dispatch(HintsStore store, UUID hostId)
+    {
+        /*
+         * It is safe to perform dispatch for the same host id concurrently in two or more threads,
+         * however there is nothing to win from it - so we don't.
+         *
+         * Additionally, having just one dispatch task per host id ensures that we'll never violate our per-destination
+         * rate limit, without having to share a ratelimiter between threads.
+         *
+         * It also simplifies reasoning about dispatch sessions.
+         */
+        return scheduledDispatches.computeIfAbsent(hostId, uuid -> executor.submit(new DispatchHintsTask(store, hostId)));
+    }
+
+    Future transfer(HintsCatalog catalog, Supplier<UUID> hostIdSupplier)
+    {
+        return executor.submit(new TransferHintsTask(catalog, hostIdSupplier));
+    }
+
+    void completeDispatchBlockingly(HintsStore store)
+    {
+        Future future = scheduledDispatches.get(store.hostId);
+        try
+        {
+            if (future != null)
+                future.get();
+        }
+        catch (ExecutionException | InterruptedException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    void interruptDispatch(UUID hostId)
+    {
+        Future future = scheduledDispatches.remove(hostId);
+
+        if (null != future)
+            future.cancel(true);
+    }
+
+    private final class TransferHintsTask implements Runnable
+    {
+        private final HintsCatalog catalog;
+
+        /*
+         * Supplies target hosts to stream to. Generally returns the one the DynamicSnitch thinks is closest.
+         * We use a supplier here to be able to get a new host if the current one dies during streaming.
+         */
+        private final Supplier<UUID> hostIdSupplier;
+
+        private TransferHintsTask(HintsCatalog catalog, Supplier<UUID> hostIdSupplier)
+        {
+            this.catalog = catalog;
+            this.hostIdSupplier = hostIdSupplier;
+        }
+
+        @Override
+        public void run()
+        {
+            UUID hostId = hostIdSupplier.get();
+            InetAddress address = StorageService.instance.getEndpointForHostId(hostId);
+            logger.info("Transferring all hints to {}: {}", address, hostId);
+            if (transfer(hostId))
+                return;
+
+            logger.warn("Failed to transfer all hints to {}: {}; will retry in {} seconds", address, hostId, 10);
+
+            try
+            {
+                TimeUnit.SECONDS.sleep(10);
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+
+            hostId = hostIdSupplier.get();
+            logger.info("Transferring all hints to {}: {}", address, hostId);
+            if (!transfer(hostId))
+            {
+                logger.error("Failed to transfer all hints to {}: {}", address, hostId);
+                throw new RuntimeException("Failed to transfer all hints to " + hostId);
+            }
+        }
+
+        private boolean transfer(UUID hostId)
+        {
+            catalog.stores()
+                   .map(store -> new DispatchHintsTask(store, hostId))
+                   .forEach(Runnable::run);
+
+            return !catalog.hasFiles();
+        }
+    }
+
+    private final class DispatchHintsTask implements Runnable
+    {
+        private final HintsStore store;
+        private final UUID hostId;
+        private final RateLimiter rateLimiter;
+
+        DispatchHintsTask(HintsStore store, UUID hostId)
+        {
+            this.store = store;
+            this.hostId = hostId;
+
+            // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
+            // max rate is scaled by the number of nodes in the cluster (CASSANDRA-5272).
+            // the goal is to bound maximum hints traffic going towards a particular node from the rest of the cluster,
+            // not total outgoing hints traffic from this node - this is why the rate limiter is not shared between
+            // all the dispatch tasks (as there will be at most one dispatch task for a particular host id at a time).
+            int nodesCount = Math.max(1, StorageService.instance.getTokenMetadata().getAllEndpoints().size() - 1);
+            double throttleInBytes = DatabaseDescriptor.getHintedHandoffThrottleInKB() * 1024.0 / nodesCount;
+            this.rateLimiter = RateLimiter.create(throttleInBytes == 0 ? Double.MAX_VALUE : throttleInBytes);
+        }
+
+        public void run()
+        {
+            try
+            {
+                dispatch();
+            }
+            finally
+            {
+                scheduledDispatches.remove(hostId);
+            }
+        }
+
+        private void dispatch()
+        {
+            while (true)
+            {
+                if (isPaused.get())
+                    break;
+
+                HintsDescriptor descriptor = store.poll();
+                if (descriptor == null)
+                    break;
+
+                try
+                {
+                    if (!dispatch(descriptor))
+                        break;
+                }
+                catch (FSReadError e)
+                {
+                    logger.error("Failed to dispatch hints file {}: file is corrupted ({})", descriptor.fileName(), e);
+                    store.cleanUp(descriptor);
+                    store.markCorrupted(descriptor);
+                    throw e;
+                }
+            }
+        }
+
+        /*
+         * Will return true if dispatch was successful, false if we hit a failure (destination node went down, for example).
+         */
+        private boolean dispatch(HintsDescriptor descriptor)
+        {
+            logger.trace("Dispatching hints file {}", descriptor.fileName());
+
+            InetAddress address = StorageService.instance.getEndpointForHostId(hostId);
+            if (address != null)
+                return deliver(descriptor, address);
+
+            // address == null means the target no longer exist; find new home for each hint entry.
+            convert(descriptor);
+            return true;
+        }
+
+        private boolean deliver(HintsDescriptor descriptor, InetAddress address)
+        {
+            File file = new File(hintsDirectory, descriptor.fileName());
+            InputPosition offset = store.getDispatchOffset(descriptor);
+
+            BooleanSupplier shouldAbort = () -> !isAlive.apply(address) || isPaused.get();
+            try (HintsDispatcher dispatcher = HintsDispatcher.create(file, rateLimiter, address, descriptor.hostId, shouldAbort))
+            {
+                if (offset != null)
+                    dispatcher.seek(offset);
+
+                if (dispatcher.dispatch())
+                {
+                    store.delete(descriptor);
+                    store.cleanUp(descriptor);
+                    logger.info("Finished hinted handoff of file {} to endpoint {}: {}", descriptor.fileName(), address, hostId);
+                    return true;
+                }
+                else
+                {
+                    store.markDispatchOffset(descriptor, dispatcher.dispatchPosition());
+                    store.offerFirst(descriptor);
+                    logger.info("Finished hinted handoff of file {} to endpoint {}: {}, partially", descriptor.fileName(), address, hostId);
+                    return false;
+                }
+            }
+        }
+
+        // for each hint in the hints file for a node that isn't part of the ring anymore, write RF hints for each replica
+        private void convert(HintsDescriptor descriptor)
+        {
+            File file = new File(hintsDirectory, descriptor.fileName());
+
+            try (HintsReader reader = HintsReader.open(file, rateLimiter))
+            {
+                reader.forEach(page -> page.hintsIterator().forEachRemaining(HintsService.instance::writeForAllReplicas));
+                store.delete(descriptor);
+                store.cleanUp(descriptor);
+                logger.info("Finished converting hints file {}", descriptor.fileName());
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsDispatchTrigger.java b/src/java/org/apache/cassandra/hints/HintsDispatchTrigger.java
new file mode 100644
index 0000000..5fe0e27
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsDispatchTrigger.java

@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.apache.cassandra.gms.ApplicationState;
+import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.gms.Gossiper;
+
+import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddress;
+
+/**
+ * A simple dispatch trigger that's being run every 10 seconds.
+ *
+ * Goes through all hint stores and schedules for dispatch all the hints for hosts that are:
+ * 1. Not currently scheduled for dispatch, and
+ * 2. Either have some hint files, or an active hint writer, and
+ * 3. Are live, and
+ * 4. Have matching schema versions
+ *
+ * What does triggering a hints store for dispatch mean?
+ * - If there are existing hint files, it means submitting them for dispatch;
+ * - If there is an active writer, closing it, for the next run to pick it up.
+ */
+final class HintsDispatchTrigger implements Runnable
+{
+    private final HintsCatalog catalog;
+    private final HintsWriteExecutor writeExecutor;
+    private final HintsDispatchExecutor dispatchExecutor;
+    private final AtomicBoolean isPaused;
+
+    HintsDispatchTrigger(HintsCatalog catalog,
+                         HintsWriteExecutor writeExecutor,
+                         HintsDispatchExecutor dispatchExecutor,
+                         AtomicBoolean isPaused)
+    {
+        this.catalog = catalog;
+        this.writeExecutor = writeExecutor;
+        this.dispatchExecutor = dispatchExecutor;
+        this.isPaused = isPaused;
+    }
+
+    public void run()
+    {
+        if (isPaused.get())
+            return;
+
+        catalog.stores()
+               .filter(store -> !isScheduled(store))
+               .filter(HintsStore::isLive)
+               .filter(store -> store.isWriting() || store.hasFiles())
+               .filter(store -> Gossiper.instance.valuesEqual(getBroadcastAddress(), store.address(), ApplicationState.SCHEMA))
+               .forEach(this::schedule);
+    }
+
+    private void schedule(HintsStore store)
+    {
+        if (store.hasFiles())
+            dispatchExecutor.dispatch(store);
+
+        if (store.isWriting())
+            writeExecutor.closeWriter(store);
+    }
+
+    private boolean isScheduled(HintsStore store)
+    {
+        return dispatchExecutor.isScheduled(store);
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsDispatcher.java b/src/java/org/apache/cassandra/hints/HintsDispatcher.java
new file mode 100644
index 0000000..db5f42f
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsDispatcher.java

@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.function.BooleanSupplier;
+import java.util.function.Function;
+
+import com.google.common.util.concurrent.RateLimiter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.net.IAsyncCallbackWithFailure;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.concurrent.SimpleCondition;
+
+/**
+ * Dispatches a single hints file to a specified node in a batched manner.
+ *
+ * Uses either {@link EncodedHintMessage} - when dispatching hints into a node with the same messaging version as the hints file,
+ * or {@link HintMessage}, when conversion is required.
+ */
+final class HintsDispatcher implements AutoCloseable
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsDispatcher.class);
+
+    private enum Action { CONTINUE, ABORT }
+
+    private final HintsReader reader;
+    private final UUID hostId;
+    private final InetAddress address;
+    private final int messagingVersion;
+    private final BooleanSupplier abortRequested;
+
+    private InputPosition currentPagePosition;
+
+    private HintsDispatcher(HintsReader reader, UUID hostId, InetAddress address, int messagingVersion, BooleanSupplier abortRequested)
+    {
+        currentPagePosition = null;
+
+        this.reader = reader;
+        this.hostId = hostId;
+        this.address = address;
+        this.messagingVersion = messagingVersion;
+        this.abortRequested = abortRequested;
+    }
+
+    static HintsDispatcher create(File file, RateLimiter rateLimiter, InetAddress address, UUID hostId, BooleanSupplier abortRequested)
+    {
+        int messagingVersion = MessagingService.instance().getVersion(address);
+        return new HintsDispatcher(HintsReader.open(file, rateLimiter), hostId, address, messagingVersion, abortRequested);
+    }
+
+    public void close()
+    {
+        reader.close();
+    }
+
+    void seek(InputPosition position)
+    {
+        reader.seek(position);
+    }
+
+    /**
+     * @return whether or not dispatch completed entirely and successfully
+     */
+    boolean dispatch()
+    {
+        for (HintsReader.Page page : reader)
+        {
+            currentPagePosition = page.position;
+            if (dispatch(page) != Action.CONTINUE)
+                return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * @return offset of the first non-delivered page
+     */
+    InputPosition dispatchPosition()
+    {
+        return currentPagePosition;
+    }
+
+
+    // retry in case of a timeout; stop in case of a failure, host going down, or delivery paused
+    private Action dispatch(HintsReader.Page page)
+    {
+        return sendHintsAndAwait(page);
+    }
+
+    private Action sendHintsAndAwait(HintsReader.Page page)
+    {
+        Collection<Callback> callbacks = new ArrayList<>();
+
+        /*
+         * If hints file messaging version matches the version of the target host, we'll use the optimised path -
+         * skipping the redundant decoding/encoding cycle of the already encoded hint.
+         *
+         * If that is not the case, we'll need to perform conversion to a newer (or an older) format, and decoding the hint
+         * is an unavoidable intermediate step.
+         */
+        Action action = reader.descriptor().messagingVersion() == messagingVersion
+                      ? sendHints(page.buffersIterator(), callbacks, this::sendEncodedHint)
+                      : sendHints(page.hintsIterator(), callbacks, this::sendHint);
+
+        if (action == Action.ABORT)
+            return action;
+
+        for (Callback cb : callbacks)
+            if (cb.await() != Callback.Outcome.SUCCESS)
+                return Action.ABORT;
+
+        return Action.CONTINUE;
+    }
+
+    /*
+     * Sending hints in compatibility mode.
+     */
+
+    private <T> Action sendHints(Iterator<T> hints, Collection<Callback> callbacks, Function<T, Callback> sendFunction)
+    {
+        while (hints.hasNext())
+        {
+            if (abortRequested.getAsBoolean())
+                return Action.ABORT;
+            callbacks.add(sendFunction.apply(hints.next()));
+        }
+        return Action.CONTINUE;
+    }
+
+    private Callback sendHint(Hint hint)
+    {
+        Callback callback = new Callback();
+        HintMessage message = new HintMessage(hostId, hint);
+        MessagingService.instance().sendRRWithFailure(message.createMessageOut(), address, callback);
+        return callback;
+    }
+
+    /*
+     * Sending hints in raw mode.
+     */
+
+    private Callback sendEncodedHint(ByteBuffer hint)
+    {
+        Callback callback = new Callback();
+        EncodedHintMessage message = new EncodedHintMessage(hostId, hint, messagingVersion);
+        MessagingService.instance().sendRRWithFailure(message.createMessageOut(), address, callback);
+        return callback;
+    }
+
+    private static final class Callback implements IAsyncCallbackWithFailure
+    {
+        enum Outcome { SUCCESS, TIMEOUT, FAILURE, INTERRUPTED }
+
+        private final long start = System.nanoTime();
+        private final SimpleCondition condition = new SimpleCondition();
+        private volatile Outcome outcome;
+
+        Outcome await()
+        {
+            long timeout = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getTimeout(MessagingService.Verb.HINT)) - (System.nanoTime() - start);
+            boolean timedOut;
+
+            try
+            {
+                timedOut = !condition.await(timeout, TimeUnit.NANOSECONDS);
+            }
+            catch (InterruptedException e)
+            {
+                logger.warn("Hint dispatch was interrupted", e);
+                return Outcome.INTERRUPTED;
+            }
+
+            return timedOut ? Outcome.TIMEOUT : outcome;
+        }
+
+        public void onFailure(InetAddress from)
+        {
+            outcome = Outcome.FAILURE;
+            condition.signalAll();
+        }
+
+        public void response(MessageIn msg)
+        {
+            outcome = Outcome.SUCCESS;
+            condition.signalAll();
+        }
+
+        public boolean isLatencyForSnitch()
+        {
+            return false;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsReader.java b/src/java/org/apache/cassandra/hints/HintsReader.java
new file mode 100644
index 0000000..7003e04
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsReader.java

@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+
+import javax.annotation.Nullable;
+
+import com.google.common.primitives.Ints;
+import com.google.common.util.concurrent.RateLimiter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.UnknownColumnFamilyException;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.NativeLibrary;
+
+/**
+ * A paged non-compressed hints reader that provides two iterators:
+ * - a 'raw' ByteBuffer iterator that doesn't deserialize the hints, but returns the pre-encoded hints verbatim
+ * - a decoded iterator, that deserializes the underlying bytes into {@link Hint} instances.
+ *
+ * The former is an optimisation for when the messaging version of the file matches the messaging version of the destination
+ * node. Extra decoding and reencoding is a waste of effort in this scenario, so we avoid it.
+ *
+ * The latter is required for dispatch of hints to nodes that have a different messaging version, and in general is just an
+ * easy way to enable backward and future compatibilty.
+ */
+class HintsReader implements AutoCloseable, Iterable<HintsReader.Page>
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsReader.class);
+
+    // don't read more than 512 KB of hints at a time.
+    private static final int PAGE_SIZE = 512 << 10;
+
+    private final HintsDescriptor descriptor;
+    private final File file;
+    private final ChecksummedDataInput input;
+
+    // we pass the RateLimiter into HintsReader itself because it's cheaper to calculate the size before the hint is deserialized
+    @Nullable
+    private final RateLimiter rateLimiter;
+
+    protected HintsReader(HintsDescriptor descriptor, File file, ChecksummedDataInput reader, RateLimiter rateLimiter)
+    {
+        this.descriptor = descriptor;
+        this.file = file;
+        this.input = reader;
+        this.rateLimiter = rateLimiter;
+    }
+
+    @SuppressWarnings("resource") // HintsReader owns input
+    static HintsReader open(File file, RateLimiter rateLimiter)
+    {
+        ChecksummedDataInput reader = ChecksummedDataInput.open(file);
+        try
+        {
+            HintsDescriptor descriptor = HintsDescriptor.deserialize(reader);
+            if (descriptor.isCompressed())
+            {
+                // since the hints descriptor is always uncompressed, it needs to be read with the normal ChecksummedDataInput.
+                // The compressed input is instantiated with the uncompressed input's position
+                reader = CompressedChecksummedDataInput.upgradeInput(reader, descriptor.createCompressor());
+            }
+            return new HintsReader(descriptor, file, reader, rateLimiter);
+        }
+        catch (IOException e)
+        {
+            reader.close();
+            throw new FSReadError(e, file);
+        }
+    }
+
+    static HintsReader open(File file)
+    {
+        return open(file, null);
+    }
+
+    public void close()
+    {
+        input.close();
+    }
+
+    public HintsDescriptor descriptor()
+    {
+        return descriptor;
+    }
+
+    void seek(InputPosition newPosition)
+    {
+        input.seek(newPosition);
+    }
+
+    public Iterator<Page> iterator()
+    {
+        return new PagesIterator();
+    }
+
+    public ChecksummedDataInput getInput()
+    {
+        return input;
+    }
+
+    final class Page
+    {
+        public final InputPosition position;
+
+        private Page(InputPosition inputPosition)
+        {
+            this.position = inputPosition;
+        }
+
+        Iterator<Hint> hintsIterator()
+        {
+            return new HintsIterator(position);
+        }
+
+        Iterator<ByteBuffer> buffersIterator()
+        {
+            return new BuffersIterator(position);
+        }
+    }
+
+    final class PagesIterator extends AbstractIterator<Page>
+    {
+        @SuppressWarnings("resource")
+        protected Page computeNext()
+        {
+            input.tryUncacheRead();
+
+            if (input.isEOF())
+                return endOfData();
+
+            return new Page(input.getSeekPosition());
+        }
+    }
+
+    /**
+     * A decoding iterator that deserializes the hints as it goes.
+     */
+    final class HintsIterator extends AbstractIterator<Hint>
+    {
+        private final InputPosition offset;
+
+        HintsIterator(InputPosition offset)
+        {
+            super();
+            this.offset = offset;
+        }
+
+        protected Hint computeNext()
+        {
+            Hint hint;
+
+            do
+            {
+                InputPosition position = input.getSeekPosition();
+
+                if (input.isEOF())
+                    return endOfData(); // reached EOF
+
+                if (position.subtract(offset) >= PAGE_SIZE)
+                    return endOfData(); // read page size or more bytes
+
+                try
+                {
+                    hint = computeNextInternal();
+                }
+                catch (EOFException e)
+                {
+                    logger.warn("Unexpected EOF replaying hints ({}), likely due to unflushed hint file on shutdown; continuing", descriptor.fileName(), e);
+                    return endOfData();
+                }
+                catch (IOException e)
+                {
+                    throw new FSReadError(e, file);
+                }
+            }
+            while (hint == null);
+
+            return hint;
+        }
+
+        private Hint computeNextInternal() throws IOException
+        {
+            input.resetCrc();
+            input.resetLimit();
+
+            int size = input.readInt();
+
+            // if we cannot corroborate the size via crc, then we cannot safely skip this hint
+            if (!input.checkCrc())
+                throw new IOException("Digest mismatch exception");
+
+            return readHint(size);
+        }
+
+        private Hint readHint(int size) throws IOException
+        {
+            if (rateLimiter != null)
+                rateLimiter.acquire(size);
+            input.limit(size);
+
+            Hint hint;
+            try
+            {
+                hint = Hint.serializer.deserialize(input, descriptor.messagingVersion());
+                input.checkLimit(0);
+            }
+            catch (UnknownColumnFamilyException e)
+            {
+                logger.warn("Failed to read a hint for {}: {} - table with id {} is unknown in file {}",
+                            StorageService.instance.getEndpointForHostId(descriptor.hostId),
+                            descriptor.hostId,
+                            e.cfId,
+                            descriptor.fileName());
+                input.skipBytes(Ints.checkedCast(size - input.bytesPastLimit()));
+
+                hint = null; // set the return value to null and let following code to update/check the CRC
+            }
+
+            if (input.checkCrc())
+                return hint;
+
+            // log a warning and skip the corrupted entry
+            logger.warn("Failed to read a hint for {}: {} - digest mismatch for hint at position {} in file {}",
+                        StorageService.instance.getEndpointForHostId(descriptor.hostId),
+                        descriptor.hostId,
+                        input.getPosition() - size - 4,
+                        descriptor.fileName());
+            return null;
+        }
+    }
+
+    /**
+     * A verbatim iterator that simply returns the underlying ByteBuffers.
+     */
+    final class BuffersIterator extends AbstractIterator<ByteBuffer>
+    {
+        private final InputPosition offset;
+
+        BuffersIterator(InputPosition offset)
+        {
+            super();
+            this.offset = offset;
+        }
+
+        protected ByteBuffer computeNext()
+        {
+            ByteBuffer buffer;
+
+            do
+            {
+                InputPosition position = input.getSeekPosition();
+
+                if (input.isEOF())
+                    return endOfData(); // reached EOF
+
+                if (position.subtract(offset) >= PAGE_SIZE)
+                    return endOfData(); // read page size or more bytes
+
+                try
+                {
+                    buffer = computeNextInternal();
+                }
+                catch (EOFException e)
+                {
+                    logger.warn("Unexpected EOF replaying hints ({}), likely due to unflushed hint file on shutdown; continuing", descriptor.fileName(), e);
+                    return endOfData();
+                }
+                catch (IOException e)
+                {
+                    throw new FSReadError(e, file);
+                }
+            }
+            while (buffer == null);
+
+            return buffer;
+        }
+
+        private ByteBuffer computeNextInternal() throws IOException
+        {
+            input.resetCrc();
+            input.resetLimit();
+
+            int size = input.readInt();
+
+            // if we cannot corroborate the size via crc, then we cannot safely skip this hint
+            if (!input.checkCrc())
+                throw new IOException("Digest mismatch exception");
+
+            return readBuffer(size);
+        }
+
+        private ByteBuffer readBuffer(int size) throws IOException
+        {
+            if (rateLimiter != null)
+                rateLimiter.acquire(size);
+            input.limit(size);
+
+            ByteBuffer buffer = ByteBufferUtil.read(input, size);
+            if (input.checkCrc())
+                return buffer;
+
+            // log a warning and skip the corrupted entry
+            logger.warn("Failed to read a hint for {} - digest mismatch for hint at position {} in file {}",
+                        descriptor.hostId,
+                        input.getPosition() - size - 4,
+                        descriptor.fileName());
+            return null;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsService.java b/src/java/org/apache/cassandra/hints/HintsService.java
new file mode 100644
index 0000000..00d1954
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsService.java

@@ -0,0 +1,380 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.Collections;
+import java.util.UUID;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Supplier;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.gms.IFailureDetector;
+import org.apache.cassandra.metrics.HintedHandoffMetrics;
+import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.MBeanWrapper;
+
+import static com.google.common.collect.Iterables.filter;
+import static com.google.common.collect.Iterables.transform;
+import static com.google.common.collect.Iterables.size;
+
+/**
+ * A singleton-ish wrapper over various hints components:
+ * - a catalog of all hints stores
+ * - a single-threaded write executor
+ * - a multi-threaded dispatch executor
+ * - the buffer pool for writing hints into
+ *
+ * The front-end for everything hints related.
+ */
+public final class HintsService implements HintsServiceMBean
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsService.class);
+
+    public static HintsService instance = new HintsService();
+
+    private static final String MBEAN_NAME = "org.apache.cassandra.hints:type=HintsService";
+
+    private static final int MIN_BUFFER_SIZE = 32 << 20;
+    static final ImmutableMap<String, Object> EMPTY_PARAMS = ImmutableMap.of();
+
+    private final HintsCatalog catalog;
+    private final HintsWriteExecutor writeExecutor;
+    private final HintsBufferPool bufferPool;
+    private final HintsDispatchExecutor dispatchExecutor;
+    private final AtomicBoolean isDispatchPaused;
+
+    private volatile boolean isShutDown = false;
+
+    private final ScheduledFuture triggerFlushingFuture;
+    private volatile ScheduledFuture triggerDispatchFuture;
+
+    public final HintedHandoffMetrics metrics;
+
+    private HintsService()
+    {
+        this(FailureDetector.instance);
+    }
+
+    @VisibleForTesting
+    HintsService(IFailureDetector failureDetector)
+    {
+        File hintsDirectory = DatabaseDescriptor.getHintsDirectory();
+        int maxDeliveryThreads = DatabaseDescriptor.getMaxHintsDeliveryThreads();
+
+        catalog = HintsCatalog.load(hintsDirectory, createDescriptorParams());
+        writeExecutor = new HintsWriteExecutor(catalog);
+
+        int bufferSize = Math.max(DatabaseDescriptor.getMaxMutationSize() * 2, MIN_BUFFER_SIZE);
+        bufferPool = new HintsBufferPool(bufferSize, writeExecutor::flushBuffer);
+
+        isDispatchPaused = new AtomicBoolean(true);
+        dispatchExecutor = new HintsDispatchExecutor(hintsDirectory, maxDeliveryThreads, isDispatchPaused, failureDetector::isAlive);
+
+        // periodically empty the current content of the buffers
+        int flushPeriod = DatabaseDescriptor.getHintsFlushPeriodInMS();
+        triggerFlushingFuture = ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(() -> writeExecutor.flushBufferPool(bufferPool),
+                                                                                        flushPeriod,
+                                                                                        flushPeriod,
+                                                                                        TimeUnit.MILLISECONDS);
+        metrics = new HintedHandoffMetrics();
+    }
+
+    private static ImmutableMap<String, Object> createDescriptorParams()
+    {
+        ImmutableMap.Builder<String, Object> builder = ImmutableMap.builder();
+
+        ParameterizedClass compressionConfig = DatabaseDescriptor.getHintsCompression();
+        if (compressionConfig != null)
+        {
+            ImmutableMap.Builder<String, Object> compressorParams = ImmutableMap.builder();
+
+            compressorParams.put(ParameterizedClass.CLASS_NAME, compressionConfig.class_name);
+            if (compressionConfig.parameters != null)
+            {
+                compressorParams.put(ParameterizedClass.PARAMETERS, compressionConfig.parameters);
+            }
+            builder.put(HintsDescriptor.COMPRESSION, compressorParams.build());
+        }
+
+        return builder.build();
+    }
+
+    public void registerMBean()
+    {
+        MBeanWrapper.instance.registerMBean(this, MBEAN_NAME);
+    }
+
+    /**
+     * Write a hint for a iterable of nodes.
+     *
+     * @param hostIds host ids of the hint's target nodes
+     * @param hint the hint to store
+     */
+    public void write(Iterable<UUID> hostIds, Hint hint)
+    {
+        if (isShutDown)
+            throw new IllegalStateException("HintsService is shut down and can't accept new hints");
+
+        // we have to make sure that the HintsStore instances get properly initialized - otherwise dispatch will not trigger
+        catalog.maybeLoadStores(hostIds);
+
+        if (hint.isLive())
+            bufferPool.write(hostIds, hint);
+
+        StorageMetrics.totalHints.inc(size(hostIds));
+    }
+
+    /**
+     * Write a hint for a single node.
+     *
+     * @param hostId host id of the hint's target node
+     * @param hint the hint to store
+     */
+    public void write(UUID hostId, Hint hint)
+    {
+        write(Collections.singleton(hostId), hint);
+    }
+
+    /**
+     * Write a hint for all replicas. Used to re-dispatch hints whose destination is either missing or no longer correct.
+     */
+    void writeForAllReplicas(Hint hint)
+    {
+        String keyspaceName = hint.mutation.getKeyspaceName();
+        Token token = hint.mutation.key().getToken();
+
+        Iterable<UUID> hostIds =
+        transform(filter(StorageService.instance.getNaturalAndPendingEndpoints(keyspaceName, token), StorageProxy::shouldHint),
+                  StorageService.instance::getHostIdForEndpoint);
+
+        write(hostIds, hint);
+    }
+
+    /**
+     * Flush the buffer pool for the selected target nodes, then fsync their writers.
+     *
+     * @param hostIds host ids of the nodes to flush and fsync hints for
+     */
+    public void flushAndFsyncBlockingly(Iterable<UUID> hostIds)
+    {
+        Iterable<HintsStore> stores = transform(hostIds, catalog::get);
+        writeExecutor.flushBufferPool(bufferPool, stores);
+        writeExecutor.fsyncWritersBlockingly(stores);
+    }
+
+    public synchronized void startDispatch()
+    {
+        if (isShutDown)
+            throw new IllegalStateException("HintsService is shut down and cannot be restarted");
+
+        isDispatchPaused.set(false);
+
+        HintsDispatchTrigger trigger = new HintsDispatchTrigger(catalog, writeExecutor, dispatchExecutor, isDispatchPaused);
+        // triggering hint dispatch is now very cheap, so we can do it more often - every 10 seconds vs. every 10 minutes,
+        // previously; this reduces mean time to delivery, and positively affects batchlog delivery latencies, too
+        triggerDispatchFuture = ScheduledExecutors.scheduledTasks.scheduleWithFixedDelay(trigger, 10, 10, TimeUnit.SECONDS);
+    }
+
+    public void pauseDispatch()
+    {
+        logger.info("Paused hints dispatch");
+        isDispatchPaused.set(true);
+    }
+
+    public void resumeDispatch()
+    {
+        logger.info("Resumed hints dispatch");
+        isDispatchPaused.set(false);
+    }
+
+    /**
+     * Gracefully and blockingly shut down the service.
+     *
+     * Will abort dispatch sessions that are currently in progress (which is okay, it's idempotent),
+     * and make sure the buffers are flushed, hints files written and fsynced.
+     */
+    public synchronized void shutdownBlocking() throws ExecutionException, InterruptedException
+    {
+        if (isShutDown)
+            throw new IllegalStateException("HintsService has already been shut down");
+        isShutDown = true;
+
+        if (triggerDispatchFuture != null)
+            triggerDispatchFuture.cancel(false);
+        pauseDispatch();
+
+        triggerFlushingFuture.cancel(false);
+
+        writeExecutor.flushBufferPool(bufferPool).get();
+        writeExecutor.closeAllWriters().get();
+
+        dispatchExecutor.shutdownBlocking();
+        writeExecutor.shutdownBlocking();
+        bufferPool.close();
+    }
+
+    /**
+     * Deletes all hints for all destinations. Doesn't make snapshots - should be used with care.
+     */
+    public void deleteAllHints()
+    {
+        catalog.deleteAllHints();
+    }
+
+    /**
+     * Deletes all hints for the provided destination. Doesn't make snapshots - should be used with care.
+     *
+     * @param address inet address of the target node - encoded as a string for easier JMX consumption
+     */
+    public void deleteAllHintsForEndpoint(String address)
+    {
+        InetAddress target;
+        try
+        {
+            target = InetAddress.getByName(address);
+        }
+        catch (UnknownHostException e)
+        {
+            throw new IllegalArgumentException(e);
+        }
+        deleteAllHintsForEndpoint(target);
+    }
+
+    /**
+     * Deletes all hints for the provided destination. Doesn't make snapshots - should be used with care.
+     *
+     * @param target inet address of the target node
+     */
+    public void deleteAllHintsForEndpoint(InetAddress target)
+    {
+        UUID hostId = StorageService.instance.getHostIdForEndpoint(target);
+        if (hostId == null)
+            throw new IllegalArgumentException("Can't delete hints for unknown address " + target);
+        catalog.deleteAllHints(hostId);
+    }
+
+    /**
+     * Cleans up hints-related state after a node with id = hostId left.
+     *
+     * Dispatcher can not stop itself (isHostAlive() can not start returning false for the leaving host because this
+     * method is called by the same thread as gossip, which blocks gossip), so we can't simply wait for
+     * completion.
+     *
+     * We should also flush the buffer if there are any hints for the node there, and close the writer (if any),
+     * so that we don't leave any hint files lying around.
+     *
+     * Once that is done, we can simply delete all hint files and remove the host id from the catalog.
+     *
+     * The worst that can happen if we don't get everything right is a hints file (or two) remaining undeleted.
+     *
+     * @param hostId id of the node being excised
+     */
+    public void excise(UUID hostId)
+    {
+        HintsStore store = catalog.getNullable(hostId);
+        if (store == null)
+            return;
+
+        // flush the buffer and then close the writer for the excised host id, to make sure that no new files will appear
+        // for this host id after we are done
+        Future flushFuture = writeExecutor.flushBufferPool(bufferPool, Collections.singleton(store));
+        Future closeFuture = writeExecutor.closeWriter(store);
+        try
+        {
+            flushFuture.get();
+            closeFuture.get();
+        }
+        catch (InterruptedException | ExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        // interrupt the current dispatch session to end (if any), so that the currently dispatched file gets removed
+        dispatchExecutor.interruptDispatch(store.hostId);
+
+        // delete all the hints files and remove the HintsStore instance from the map in the catalog
+        catalog.exciseStore(hostId);
+    }
+
+    /**
+     * Transfer all local hints to the hostId supplied by hostIdSupplier
+     *
+     * Flushes the buffer to make sure all hints are on disk and closes the hint writers
+     * so we don't leave any hint files around.
+     *
+     * After that, we serially dispatch all the hints in the HintsCatalog.
+     *
+     * If we fail delivering all hints, we will ask the hostIdSupplier for a new target host
+     * and retry delivering any remaining hints there, once, with a delay of 10 seconds before retrying.
+     *
+     * @param hostIdSupplier supplier of stream target host ids. This is generally
+     *                       the closest one according to the DynamicSnitch
+     * @return When this future is done, it either has streamed all hints to remote nodes or has failed with a proper
+     *         log message
+     */
+    public Future transferHints(Supplier<UUID> hostIdSupplier)
+    {
+        Future flushFuture = writeExecutor.flushBufferPool(bufferPool);
+        Future closeFuture = writeExecutor.closeAllWriters();
+        try
+        {
+            flushFuture.get();
+            closeFuture.get();
+        }
+        catch (InterruptedException | ExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        // unpause dispatch, or else transfer() will return immediately
+        resumeDispatch();
+
+        // wait for the current dispatch session to end
+        catalog.stores().forEach(dispatchExecutor::completeDispatchBlockingly);
+
+        return dispatchExecutor.transfer(catalog, hostIdSupplier);
+    }
+
+    HintsCatalog getCatalog()
+    {
+        return catalog;
+    }
+
+    /**
+     * Returns true in case service is shut down.
+     */
+    public boolean isShutDown()
+    {
+        return isShutDown;
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsServiceMBean.java b/src/java/org/apache/cassandra/hints/HintsServiceMBean.java
new file mode 100644
index 0000000..fe0abcc
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsServiceMBean.java

@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+public interface HintsServiceMBean
+{
+    /**
+     * Pause dispatch of all hints. Does not affect the creation of hints.
+     */
+    void pauseDispatch();
+
+    /**
+     * Resume dispatch of all hints. Does not affect the creation of hints.
+     */
+    void resumeDispatch();
+
+    /**
+     * Irrevocably deletes all the stored hints files (with the exception of those that are being dispatched right now,
+     * or being written to).
+     */
+    void deleteAllHints();
+
+    /**
+     * Irrevocably deletes all the stored hints files for the target address (with the exception of those that are
+     * being dispatched right now, or being written to).
+     */
+    void deleteAllHintsForEndpoint(String address);
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsStore.java b/src/java/org/apache/cassandra/hints/HintsStore.java
new file mode 100644
index 0000000..b08fc72
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsStore.java

@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.SyncUtil;
+
+/**
+ * Encapsulates the state of a peer's hints: the queue of hints files for dispatch, and the current writer (if any).
+ *
+ * The queue for dispatch is multi-threading safe.
+ *
+ * The writer MUST only be accessed by {@link HintsWriteExecutor}.
+ */
+final class HintsStore
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsStore.class);
+
+    public final UUID hostId;
+    private final File hintsDirectory;
+    private final ImmutableMap<String, Object> writerParams;
+
+    private final Map<HintsDescriptor, InputPosition> dispatchPositions;
+    private final Deque<HintsDescriptor> dispatchDequeue;
+    private final Queue<HintsDescriptor> corruptedFiles;
+
+    // last timestamp used in a descriptor; make sure to not reuse the same timestamp for new descriptors.
+    private volatile long lastUsedTimestamp;
+    private volatile HintsWriter hintsWriter;
+
+    private HintsStore(UUID hostId, File hintsDirectory, ImmutableMap<String, Object> writerParams, List<HintsDescriptor> descriptors)
+    {
+        this.hostId = hostId;
+        this.hintsDirectory = hintsDirectory;
+        this.writerParams = writerParams;
+
+        dispatchPositions = new ConcurrentHashMap<>();
+        dispatchDequeue = new ConcurrentLinkedDeque<>(descriptors);
+        corruptedFiles = new ConcurrentLinkedQueue<>();
+
+        //noinspection resource
+        lastUsedTimestamp = descriptors.stream().mapToLong(d -> d.timestamp).max().orElse(0L);
+    }
+
+    static HintsStore create(UUID hostId, File hintsDirectory, ImmutableMap<String, Object> writerParams, List<HintsDescriptor> descriptors)
+    {
+        descriptors.sort((d1, d2) -> Long.compare(d1.timestamp, d2.timestamp));
+        return new HintsStore(hostId, hintsDirectory, writerParams, descriptors);
+    }
+
+    @VisibleForTesting
+    int getDispatchQueueSize()
+    {
+        return dispatchDequeue.size();
+    }
+
+    InetAddress address()
+    {
+        return StorageService.instance.getEndpointForHostId(hostId);
+    }
+
+    boolean isLive()
+    {
+        InetAddress address = address();
+        return address != null && FailureDetector.instance.isAlive(address);
+    }
+
+    HintsDescriptor poll()
+    {
+        return dispatchDequeue.poll();
+    }
+
+    void offerFirst(HintsDescriptor descriptor)
+    {
+        dispatchDequeue.offerFirst(descriptor);
+    }
+
+    void offerLast(HintsDescriptor descriptor)
+    {
+        dispatchDequeue.offerLast(descriptor);
+    }
+
+    void deleteAllHints()
+    {
+        HintsDescriptor descriptor;
+        while ((descriptor = poll()) != null)
+        {
+            cleanUp(descriptor);
+            delete(descriptor);
+        }
+
+        while ((descriptor = corruptedFiles.poll()) != null)
+        {
+            cleanUp(descriptor);
+            delete(descriptor);
+        }
+    }
+
+    void delete(HintsDescriptor descriptor)
+    {
+        File hintsFile = new File(hintsDirectory, descriptor.fileName());
+        if (hintsFile.delete())
+            logger.info("Deleted hint file {}", descriptor.fileName());
+        else
+            logger.error("Failed to delete hint file {}", descriptor.fileName());
+
+        //noinspection ResultOfMethodCallIgnored
+        new File(hintsDirectory, descriptor.checksumFileName()).delete();
+    }
+
+    boolean hasFiles()
+    {
+        return !dispatchDequeue.isEmpty();
+    }
+
+    InputPosition getDispatchOffset(HintsDescriptor descriptor)
+    {
+        return dispatchPositions.get(descriptor);
+    }
+
+    void markDispatchOffset(HintsDescriptor descriptor, InputPosition inputPosition)
+    {
+        dispatchPositions.put(descriptor, inputPosition);
+    }
+
+    void cleanUp(HintsDescriptor descriptor)
+    {
+        dispatchPositions.remove(descriptor);
+    }
+
+    void markCorrupted(HintsDescriptor descriptor)
+    {
+        corruptedFiles.add(descriptor);
+    }
+
+    /*
+     * Methods dealing with HintsWriter.
+     *
+     * All of these, with the exception of isWriting(), are for exclusively single-threaded use by HintsWriteExecutor.
+     */
+
+    boolean isWriting()
+    {
+        return hintsWriter != null;
+    }
+
+    HintsWriter getOrOpenWriter()
+    {
+        if (hintsWriter == null)
+            hintsWriter = openWriter();
+        return hintsWriter;
+    }
+
+    HintsWriter getWriter()
+    {
+        return hintsWriter;
+    }
+
+    private HintsWriter openWriter()
+    {
+        lastUsedTimestamp = Math.max(System.currentTimeMillis(), lastUsedTimestamp + 1);
+        HintsDescriptor descriptor = new HintsDescriptor(hostId, lastUsedTimestamp, writerParams);
+
+        try
+        {
+            return HintsWriter.create(hintsDirectory, descriptor);
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, descriptor.fileName());
+        }
+    }
+
+    void closeWriter()
+    {
+        if (hintsWriter != null)
+        {
+            hintsWriter.close();
+            offerLast(hintsWriter.descriptor());
+            hintsWriter = null;
+            SyncUtil.trySyncDir(hintsDirectory);
+        }
+    }
+
+    void fsyncWriter()
+    {
+        if (hintsWriter != null)
+            hintsWriter.fsync();
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsWriteExecutor.java b/src/java/org/apache/cassandra/hints/HintsWriteExecutor.java
new file mode 100644
index 0000000..51a5362
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsWriteExecutor.java

@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.concurrent.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.util.FileUtils;
+
+/**
+ * A single threaded executor that exclusively writes all the hints and otherwise manipulate the writers.
+ *
+ * Flushing demultiplexes the provided {@link HintsBuffer} and sequentially writes to each {@link HintsWriter},
+ * using the same shared write buffer. In the near future, when CASSANDRA-9428 (compression) is implemented,
+ * will also share a compression buffer.
+ */
+final class HintsWriteExecutor
+{
+    private static final Logger logger = LoggerFactory.getLogger(HintsWriteExecutor.class);
+
+    static final int WRITE_BUFFER_SIZE = 256 << 10;
+
+    private final HintsCatalog catalog;
+    private final ByteBuffer writeBuffer;
+    private final ExecutorService executor;
+
+    HintsWriteExecutor(HintsCatalog catalog)
+    {
+        this.catalog = catalog;
+
+        writeBuffer = ByteBuffer.allocateDirect(WRITE_BUFFER_SIZE);
+        executor = DebuggableThreadPoolExecutor.createWithFixedPoolSize("HintsWriteExecutor", 1);
+    }
+
+    /*
+     * Should be very fast (worst case scenario - write a few 10s of megabytes to disk).
+     */
+    void shutdownBlocking()
+    {
+        executor.shutdown();
+        try
+        {
+            executor.awaitTermination(1, TimeUnit.MINUTES);
+        }
+        catch (InterruptedException e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    /**
+     * Flush the provided buffer, recycle it and offer it back to the pool.
+     */
+    Future<?> flushBuffer(HintsBuffer buffer, HintsBufferPool bufferPool)
+    {
+        return executor.submit(new FlushBufferTask(buffer, bufferPool));
+    }
+
+    /**
+     * Flush the current buffer, but without clearing/recycling it.
+     */
+    Future<?> flushBufferPool(HintsBufferPool bufferPool)
+    {
+        return executor.submit(new FlushBufferPoolTask(bufferPool));
+    }
+
+    /**
+     * Flush the current buffer just for the specified hints stores. Without clearing/recycling it.
+     */
+    Future<?> flushBufferPool(HintsBufferPool bufferPool, Iterable<HintsStore> stores)
+    {
+        return executor.submit(new PartiallyFlushBufferPoolTask(bufferPool, stores));
+    }
+
+    void fsyncWritersBlockingly(Iterable<HintsStore> stores)
+    {
+        try
+        {
+            executor.submit(new FsyncWritersTask(stores)).get();
+        }
+        catch (InterruptedException | ExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    Future<?> closeWriter(HintsStore store)
+    {
+        return executor.submit(store::closeWriter);
+    }
+
+    Future<?> closeAllWriters()
+    {
+        return executor.submit(() -> catalog.stores().forEach(HintsStore::closeWriter));
+    }
+
+    private final class FlushBufferTask implements Runnable
+    {
+        private final HintsBuffer buffer;
+        private final HintsBufferPool bufferPool;
+
+        FlushBufferTask(HintsBuffer buffer, HintsBufferPool bufferPool)
+        {
+            this.buffer = buffer;
+            this.bufferPool = bufferPool;
+        }
+
+        public void run()
+        {
+            buffer.waitForModifications();
+
+            try
+            {
+                flush(buffer);
+            }
+            finally
+            {
+                HintsBuffer recycledBuffer = buffer.recycle();
+                bufferPool.offer(recycledBuffer);
+            }
+        }
+    }
+
+    private final class FlushBufferPoolTask implements Runnable
+    {
+        private final HintsBufferPool bufferPool;
+
+        FlushBufferPoolTask(HintsBufferPool bufferPool)
+        {
+            this.bufferPool = bufferPool;
+        }
+
+        public void run()
+        {
+            HintsBuffer buffer = bufferPool.currentBuffer();
+            buffer.waitForModifications();
+            try
+            {
+                flush(buffer);
+            }
+            catch(FSError e)
+            {
+                logger.error("Unable to flush hint buffer: {}", e.getLocalizedMessage(), e);
+                FileUtils.handleFSErrorAndPropagate(e);
+            }
+        }
+    }
+
+    private final class PartiallyFlushBufferPoolTask implements Runnable
+    {
+        private final HintsBufferPool bufferPool;
+        private final Iterable<HintsStore> stores;
+
+        PartiallyFlushBufferPoolTask(HintsBufferPool bufferPool, Iterable<HintsStore> stores)
+        {
+            this.bufferPool = bufferPool;
+            this.stores = stores;
+        }
+
+        public void run()
+        {
+            HintsBuffer buffer = bufferPool.currentBuffer();
+            buffer.waitForModifications();
+            stores.forEach(store -> flush(buffer.consumingHintsIterator(store.hostId), store));
+        }
+    }
+
+    private final class FsyncWritersTask implements Runnable
+    {
+        private final Iterable<HintsStore> stores;
+
+        FsyncWritersTask(Iterable<HintsStore> stores)
+        {
+            this.stores = stores;
+        }
+
+        public void run()
+        {
+            stores.forEach(HintsStore::fsyncWriter);
+            catalog.fsyncDirectory();
+        }
+    }
+
+    private void flush(HintsBuffer buffer)
+    {
+        buffer.hostIds().forEach(hostId -> flush(buffer.consumingHintsIterator(hostId), catalog.get(hostId)));
+    }
+
+    private void flush(Iterator<ByteBuffer> iterator, HintsStore store)
+    {
+        while (true)
+        {
+            if (iterator.hasNext())
+                flushInternal(iterator, store);
+
+            if (!iterator.hasNext())
+                break;
+
+            // exceeded the size limit for an individual file, but still have more to write
+            // close the current writer and continue flushing to a new one in the next iteration
+            store.closeWriter();
+        }
+    }
+
+    @SuppressWarnings("resource")   // writer not closed here
+    private void flushInternal(Iterator<ByteBuffer> iterator, HintsStore store)
+    {
+        long maxHintsFileSize = DatabaseDescriptor.getMaxHintsFileSize();
+
+        HintsWriter writer = store.getOrOpenWriter();
+
+        try (HintsWriter.Session session = writer.newSession(writeBuffer))
+        {
+            while (iterator.hasNext())
+            {
+                session.append(iterator.next());
+                if (session.position() >= maxHintsFileSize)
+                    break;
+            }
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, writer.descriptor().fileName());
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/HintsWriter.java b/src/java/org/apache/cassandra/hints/HintsWriter.java
new file mode 100644
index 0000000..31a440d
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/HintsWriter.java

@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
+import java.util.zip.CRC32;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputBufferFixed;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.NativeLibrary;
+import org.apache.cassandra.utils.SyncUtil;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.utils.FBUtilities.updateChecksum;
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
+import static org.apache.cassandra.utils.Throwables.perform;
+
+class HintsWriter implements AutoCloseable
+{
+    static final int PAGE_SIZE = 4096;
+
+    private final File directory;
+    private final HintsDescriptor descriptor;
+    private final File file;
+    private final FileChannel channel;
+    private final int fd;
+    private final CRC32 globalCRC;
+
+    private volatile long lastSyncPosition = 0L;
+
+    protected HintsWriter(File directory, HintsDescriptor descriptor, File file, FileChannel channel, int fd, CRC32 globalCRC)
+    {
+        this.directory = directory;
+        this.descriptor = descriptor;
+        this.file = file;
+        this.channel = channel;
+        this.fd = fd;
+        this.globalCRC = globalCRC;
+    }
+
+    @SuppressWarnings("resource") // HintsWriter owns channel
+    static HintsWriter create(File directory, HintsDescriptor descriptor) throws IOException
+    {
+        File file = new File(directory, descriptor.fileName());
+
+        FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
+        int fd = NativeLibrary.getfd(channel);
+
+        CRC32 crc = new CRC32();
+
+        try (DataOutputBuffer dob = new DataOutputBuffer())
+        {
+            // write the descriptor
+            descriptor.serialize(dob);
+            ByteBuffer descriptorBytes = dob.buffer();
+            updateChecksum(crc, descriptorBytes);
+            channel.write(descriptorBytes);
+        }
+        catch (Throwable e)
+        {
+            channel.close();
+            throw e;
+        }
+
+        if (descriptor.isCompressed())
+        {
+            return new CompressedHintsWriter(directory, descriptor, file, channel, fd, crc);
+        }
+        else
+        {
+            return new HintsWriter(directory, descriptor, file, channel, fd, crc);
+        }
+    }
+
+    HintsDescriptor descriptor()
+    {
+        return descriptor;
+    }
+
+    private void writeChecksum()
+    {
+        File checksumFile = new File(directory, descriptor.checksumFileName());
+        try (OutputStream out = Files.newOutputStream(checksumFile.toPath()))
+        {
+            out.write(Integer.toHexString((int) globalCRC.getValue()).getBytes(StandardCharsets.UTF_8));
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, checksumFile);
+        }
+    }
+
+    public void close()
+    {
+        perform(file, Throwables.FileOpType.WRITE, this::doFsync, channel::close);
+
+        writeChecksum();
+    }
+
+    public void fsync()
+    {
+        perform(file, Throwables.FileOpType.WRITE, this::doFsync);
+    }
+
+    private void doFsync() throws IOException
+    {
+        SyncUtil.force(channel, true);
+        lastSyncPosition = channel.position();
+    }
+
+    Session newSession(ByteBuffer buffer)
+    {
+        try
+        {
+            return new Session(buffer, channel.size());
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, file);
+        }
+    }
+
+    /**
+     * Writes byte buffer into the file channel. Buffer should be flipped before calling this
+     */
+    protected void writeBuffer(ByteBuffer bb) throws IOException
+    {
+        updateChecksum(globalCRC, bb);
+        channel.write(bb);
+    }
+
+    /**
+     * The primary goal of the Session class is to be able to share the same buffers among potentially dozens or hundreds
+     * of hints writers, and ensure that their contents are always written to the underlying channels in the end.
+     */
+    final class Session implements AutoCloseable
+    {
+        private final ByteBuffer buffer;
+
+        private final long initialSize;
+        private long bytesWritten;
+
+        Session(ByteBuffer buffer, long initialSize)
+        {
+            buffer.clear();
+            bytesWritten = 0L;
+
+            this.buffer = buffer;
+            this.initialSize = initialSize;
+        }
+
+        @VisibleForTesting
+        long getBytesWritten()
+        {
+            return bytesWritten;
+        }
+
+        long position()
+        {
+            return initialSize + bytesWritten;
+        }
+
+        /**
+         * Appends the serialized hint (with CRC included) to this session's aggregation buffer,
+         * writes to the underlying channel when the buffer is overflown.
+         *
+         * @param hint the serialized hint (with CRC included)
+         * @throws IOException
+         */
+        void append(ByteBuffer hint) throws IOException
+        {
+            bytesWritten += hint.remaining();
+
+            // if the hint to write won't fit in the aggregation buffer, flush it
+            if (hint.remaining() > buffer.remaining())
+            {
+                buffer.flip();
+                writeBuffer(buffer);
+                buffer.clear();
+            }
+
+            // if the hint fits in the aggregation buffer, then update the aggregation buffer,
+            // otherwise write the hint buffer to the channel
+            if (hint.remaining() <= buffer.remaining())
+            {
+                buffer.put(hint);
+            }
+            else
+            {
+                writeBuffer(hint);
+            }
+        }
+
+        /**
+         * Serializes and appends the hint (with CRC included) to this session's aggregation buffer,
+         * writes to the underlying channel when the buffer is overflown.
+         *
+         * Used mainly by tests and {@link LegacyHintsMigrator}
+         *
+         * @param hint the unserialized hint
+         * @throws IOException
+         */
+        void append(Hint hint) throws IOException
+        {
+            int hintSize = (int) Hint.serializer.serializedSize(hint, descriptor.messagingVersion());
+            int totalSize = hintSize + HintsBuffer.ENTRY_OVERHEAD_SIZE;
+
+            if (totalSize > buffer.remaining())
+                flushBuffer();
+
+            ByteBuffer hintBuffer = totalSize <= buffer.remaining()
+                                  ? buffer
+                                  : ByteBuffer.allocate(totalSize);
+
+            CRC32 crc = new CRC32();
+            try (DataOutputBufferFixed out = new DataOutputBufferFixed(hintBuffer))
+            {
+                out.writeInt(hintSize);
+                updateChecksumInt(crc, hintSize);
+                out.writeInt((int) crc.getValue());
+
+                Hint.serializer.serialize(hint, out, descriptor.messagingVersion());
+                updateChecksum(crc, hintBuffer, hintBuffer.position() - hintSize, hintSize);
+                out.writeInt((int) crc.getValue());
+            }
+
+            if (hintBuffer == buffer)
+                bytesWritten += totalSize;
+            else
+                append((ByteBuffer) hintBuffer.flip());
+        }
+
+        /**
+         * Closes the session - flushes the aggregation buffer (if not empty), does page aligning, and potentially fsyncs.
+         * @throws IOException
+         */
+        public void close() throws IOException
+        {
+            flushBuffer();
+            maybeFsync();
+            maybeSkipCache();
+        }
+
+        private void flushBuffer() throws IOException
+        {
+            buffer.flip();
+
+            if (buffer.remaining() > 0)
+            {
+                writeBuffer(buffer);
+            }
+
+            buffer.clear();
+        }
+
+        private void maybeFsync()
+        {
+            if (position() >= lastSyncPosition + DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024L)
+                fsync();
+        }
+
+        private void maybeSkipCache()
+        {
+            long position = position();
+
+            // don't skip page cache for tiny files, on the assumption that if they are tiny, the target node is probably
+            // alive, and if so, the file will be closed and dispatched shortly (within a minute), and the file will be dropped.
+            if (position >= DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024L)
+                NativeLibrary.trySkipCache(fd, 0, position - (position % PAGE_SIZE), file.getPath());
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/InputPosition.java b/src/java/org/apache/cassandra/hints/InputPosition.java
new file mode 100644
index 0000000..0b8953c
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/InputPosition.java

@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+/**
+ * Marker interface for file positions as provided by the various ChecksummedDataReader implementations.
+ */
+public interface InputPosition
+{
+    long subtract(InputPosition other);
+}

diff --git a/src/java/org/apache/cassandra/hints/LegacyHintsMigrator.java b/src/java/org/apache/cassandra/hints/LegacyHintsMigrator.java
new file mode 100644
index 0000000..30e5fe0
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/LegacyHintsMigrator.java

@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A migrator that goes through the legacy system.hints table and writes all the hints to the new hints storage format.
+ */
+@SuppressWarnings("deprecation")
+public final class LegacyHintsMigrator
+{
+    private static final Logger logger = LoggerFactory.getLogger(LegacyHintsMigrator.class);
+
+    private final File hintsDirectory;
+    private final long maxHintsFileSize;
+
+    private final ColumnFamilyStore legacyHintsTable;
+    private final int pageSize;
+
+    public LegacyHintsMigrator(File hintsDirectory, long maxHintsFileSize)
+    {
+        this.hintsDirectory = hintsDirectory;
+        this.maxHintsFileSize = maxHintsFileSize;
+
+        legacyHintsTable = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.LEGACY_HINTS);
+        pageSize = calculatePageSize(legacyHintsTable);
+    }
+
+    // read fewer columns (mutations) per page if they are very large
+    private static int calculatePageSize(ColumnFamilyStore legacyHintsTable)
+    {
+        int size = 128;
+
+        int meanCellCount = legacyHintsTable.getMeanColumns();
+        double meanPartitionSize = legacyHintsTable.getMeanPartitionSize();
+
+        if (meanCellCount != 0 && meanPartitionSize != 0)
+        {
+            int avgHintSize = (int) meanPartitionSize / meanCellCount;
+            size = Math.max(2, Math.min(size, (512 << 10) / avgHintSize));
+        }
+
+        return size;
+    }
+
+    public void migrate()
+    {
+        // nothing to migrate
+        if (legacyHintsTable.isEmpty())
+            return;
+        logger.info("Migrating legacy hints to new storage");
+
+        // major-compact all of the existing sstables to get rid of the tombstones + expired hints
+        logger.info("Forcing a major compaction of {}.{} table", SystemKeyspace.NAME, SystemKeyspace.LEGACY_HINTS);
+        compactLegacyHints();
+
+        // paginate over legacy hints and write them to the new storage
+        logger.info("Writing legacy hints to the new storage");
+        migrateLegacyHints();
+
+        // truncate the legacy hints table
+        logger.info("Truncating {}.{} table", SystemKeyspace.NAME, SystemKeyspace.LEGACY_HINTS);
+        legacyHintsTable.truncateBlocking();
+    }
+
+    private void compactLegacyHints()
+    {
+        Collection<Descriptor> descriptors = new ArrayList<>();
+        legacyHintsTable.getTracker().getUncompacting().forEach(sstable -> descriptors.add(sstable.descriptor));
+        if (!descriptors.isEmpty())
+            forceCompaction(descriptors);
+    }
+
+    private void forceCompaction(Collection<Descriptor> descriptors)
+    {
+        try
+        {
+            CompactionManager.instance.submitUserDefined(legacyHintsTable, descriptors, FBUtilities.nowInSeconds()).get();
+        }
+        catch (InterruptedException | ExecutionException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void migrateLegacyHints()
+    {
+        ByteBuffer buffer = ByteBuffer.allocateDirect(256 * 1024);
+        String query = String.format("SELECT DISTINCT target_id FROM %s.%s", SystemKeyspace.NAME, SystemKeyspace.LEGACY_HINTS);
+        //noinspection ConstantConditions
+        QueryProcessor.executeInternal(query).forEach(row -> migrateLegacyHints(row.getUUID("target_id"), buffer));
+        FileUtils.clean(buffer);
+    }
+
+    private void migrateLegacyHints(UUID hostId, ByteBuffer buffer)
+    {
+        String query = String.format("SELECT target_id, hint_id, message_version, mutation, ttl(mutation) AS ttl, writeTime(mutation) AS write_time " +
+                                     "FROM %s.%s " +
+                                     "WHERE target_id = ?",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.LEGACY_HINTS);
+
+        // read all the old hints (paged iterator), write them in the new format
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, pageSize, hostId);
+        migrateLegacyHints(hostId, rows, buffer);
+
+        // delete the whole partition in the legacy table; we would truncate the whole table afterwards, but this allows
+        // to not lose progress in case of a terminated conversion
+        deleteLegacyHintsPartition(hostId);
+    }
+
+    private void migrateLegacyHints(UUID hostId, UntypedResultSet rows, ByteBuffer buffer)
+    {
+        migrateLegacyHints(hostId, rows.iterator(), buffer);
+    }
+
+    private void migrateLegacyHints(UUID hostId, Iterator<UntypedResultSet.Row> iterator, ByteBuffer buffer)
+    {
+        do
+        {
+            migrateLegacyHintsInternal(hostId, iterator, buffer);
+            // if there are hints that didn't fit in the previous file, keep calling the method to write to a new
+            // file until we get everything written.
+        }
+        while (iterator.hasNext());
+    }
+
+    private void migrateLegacyHintsInternal(UUID hostId, Iterator<UntypedResultSet.Row> iterator, ByteBuffer buffer)
+    {
+        HintsDescriptor descriptor = new HintsDescriptor(hostId, System.currentTimeMillis());
+
+        try (HintsWriter writer = HintsWriter.create(hintsDirectory, descriptor))
+        {
+            try (HintsWriter.Session session = writer.newSession(buffer))
+            {
+                while (iterator.hasNext())
+                {
+                    Hint hint = convertLegacyHint(iterator.next());
+                    if (hint != null)
+                        session.append(hint);
+
+                    if (session.position() >= maxHintsFileSize)
+                        break;
+                }
+            }
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, descriptor.fileName());
+        }
+    }
+
+    private static Hint convertLegacyHint(UntypedResultSet.Row row)
+    {
+        Mutation mutation = deserializeLegacyMutation(row);
+        if (mutation == null)
+            return null;
+
+        long creationTime = row.getLong("write_time"); // milliseconds, not micros, for the hints table
+        int expirationTime = FBUtilities.nowInSeconds() + row.getInt("ttl");
+        int originalGCGS = expirationTime - (int) TimeUnit.MILLISECONDS.toSeconds(creationTime);
+
+        int gcgs = Math.min(originalGCGS, mutation.smallestGCGS());
+
+        return Hint.create(mutation, creationTime, gcgs);
+    }
+
+    private static Mutation deserializeLegacyMutation(UntypedResultSet.Row row)
+    {
+        try (DataInputBuffer dib = new DataInputBuffer(row.getBlob("mutation"), true))
+        {
+            Mutation mutation = Mutation.serializer.deserialize(dib,
+                                                                row.getInt("message_version"));
+            mutation.getPartitionUpdates().forEach(PartitionUpdate::validate);
+            return mutation;
+        }
+        catch (IOException e)
+        {
+            logger.error("Failed to migrate a hint for {} from legacy {}.{} table: {}",
+                         row.getUUID("target_id"),
+                         SystemKeyspace.NAME,
+                         SystemKeyspace.LEGACY_HINTS,
+                         e);
+            return null;
+        }
+        catch (MarshalException e)
+        {
+            logger.warn("Failed to validate a hint for {} (table id {}) from legacy {}.{} table - skipping: {})",
+                        row.getUUID("target_id"),
+                        SystemKeyspace.NAME,
+                        SystemKeyspace.LEGACY_HINTS,
+                        e);
+            return null;
+        }
+    }
+
+    private static void deleteLegacyHintsPartition(UUID hostId)
+    {
+        // intentionally use millis, like the rest of the legacy implementation did, just in case
+        Mutation mutation = new Mutation(PartitionUpdate.fullPartitionDelete(SystemKeyspace.LegacyHints,
+                                                                             UUIDType.instance.decompose(hostId),
+                                                                             System.currentTimeMillis(),
+                                                                             FBUtilities.nowInSeconds()));
+        mutation.applyUnsafe();
+    }
+}

diff --git a/src/java/org/apache/cassandra/hints/package-info.java b/src/java/org/apache/cassandra/hints/package-info.java
new file mode 100644
index 0000000..faa7b9f
--- /dev/null
+++ b/src/java/org/apache/cassandra/hints/package-info.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Hints subsystem consists of several components.
+ *
+ * {@link org.apache.cassandra.hints.Hint} encodes all the required metadata and the mutation being hinted.
+ *
+ * {@link org.apache.cassandra.hints.HintsBuffer} provides a temporary buffer for writing the hints to in a concurrent manner,
+ * before we flush them to disk.
+ *
+ * {@link org.apache.cassandra.hints.HintsBufferPool} is responsible for submitting {@link org.apache.cassandra.hints.HintsBuffer}
+ * instances for flushing when they exceed their capacity, and for maitaining a reserve {@link org.apache.cassandra.hints.HintsBuffer}
+ * instance, and creating extra ones if flushing cannot keep up with arrival rate.
+ *
+ * {@link org.apache.cassandra.hints.HintsWriteExecutor} is a single-threaded executor that performs all the writing to disk.
+ *
+ * {@link org.apache.cassandra.hints.HintsDispatchExecutor} is a multi-threaded executor responsible for dispatch of
+ * the hints to their destinations.
+ *
+ * {@link org.apache.cassandra.hints.HintsStore} tracks the state of all hints files (written and being written to)
+ * for a given host id destination.
+ *
+ * {@link org.apache.cassandra.hints.HintsCatalog} maintains the mapping of host ids to {@link org.apache.cassandra.hints.HintsStore}
+ * instances, and provides some aggregate APIs.
+ *
+ * {@link org.apache.cassandra.hints.HintsService} wraps the catalog, the pool, and the two executors, acting as a front-end
+ * for hints.
+ */
+package org.apache.cassandra.hints;
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
new file mode 100644
index 0000000..469ef07
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/Index.java

@@ -0,0 +1,472 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index;
+
+import java.util.Optional;
+import java.util.concurrent.Callable;
+import java.util.function.BiFunction;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+/**
+ * Consisting of a top level Index interface and two sub-interfaces which handle read and write operations,
+ * Searcher and Indexer respectively, this defines a secondary index implementation.
+ * Instantiation is done via reflection and implementations must provide a constructor which takes the base
+ * table's ColumnFamilyStore and the IndexMetadata which defines the Index as arguments. e.g:
+ *  {@code MyCustomIndex( ColumnFamilyStore baseCfs, IndexMetadata indexDef )}
+ *
+ * The main interface defines methods for index management, index selection at both write and query time,
+ * as well as validation of values that will ultimately be indexed.
+ * Two sub-interfaces are also defined, which represent single use helpers for short lived tasks at read and write time.
+ * Indexer: an event listener which receives notifications at particular points during an update of a single partition
+ *          in the base table.
+ * Searcher: performs queries against the index based on a predicate defined in a RowFilter. An instance
+ *          is expected to be single use, being involved in the execution of a single ReadCommand.
+ *
+ * The main interface includes factory methods for obtaining instances of both of the sub-interfaces;
+ *
+ * The methods defined in the top level interface can be grouped into 3 categories:
+ *
+ * Management Tasks:
+ * This group of methods is primarily concerned with maintenance of secondary indexes are are mainly called from
+ * SecondaryIndexManager. It includes methods for registering and un-registering an index, performing maintenance
+ * tasks such as (re)building an index from SSTable data, flushing, invalidating and so forth, as well as some to
+ * retrieve general metadata about the index (index name, any internal tables used for persistence etc).
+ * Several of these maintenance functions have a return type of Callable<?>; the expectation for these methods is
+ * that any work required to be performed by the method be done inside the Callable so that the responsibility for
+ * scheduling its execution can rest with SecondaryIndexManager. For instance, a task like reloading index metadata
+ * following potential updates caused by modifications to the base table may be performed in a blocking way. In
+ * contrast, adding a new index may require it to be built from existing SSTable data, a potentially expensive task
+ * which should be performed asyncronously.
+ *
+ * Index Selection:
+ * There are two facets to index selection, write time and read time selection. The former is concerned with
+ * identifying whether an index should be informed about a particular write operation. The latter is about providing
+ * means to use the index for search during query execution.
+ *
+ * Validation:
+ * Values that may be written to an index are checked as part of input validation, prior to an update or insert
+ * operation being accepted.
+ *
+ *
+ * Sub-interfaces:
+ *
+ * Update processing:
+ * Indexes are subscribed to the stream of events generated by modifications to the base table. Subscription is
+ * done via first registering the Index with the base table's SecondaryIndexManager. For each partition update, the set
+ * of registered indexes are then filtered based on the properties of the update using the selection methods on the main
+ * interface described above. Each of the indexes in the filtered set then provides an event listener to receive
+ * notifications about the update as it is processed. As such then, a event handler instance is scoped to a single
+ * partition update; SecondaryIndexManager obtains a new handler for every update it processes (via a call to the
+ * factory method, indexerFor. That handler will then receive all events for the update, before being
+ * discarded by the SecondaryIndexManager. Indexer instances are never re-used by SecondaryIndexManager and the
+ * expectation is that each call to indexerFor should return a unique instance, or at least if instances can
+ * be recycled, that a given instance is only used to process a single partition update at a time.
+ *
+ * Search:
+ * Each query (i.e. a single ReadCommand) that uses indexes will use a single instance of Index.Searcher. As with
+ * processing of updates, an Index must be registered with the primary table's SecondaryIndexManager to be able to
+ * support queries. During the processing of a ReadCommand, the Expressions in its RowFilter are examined to determine
+ * whether any of them are supported by a registered Index. supportsExpression is used to filter out Indexes which
+ * cannot support a given Expression. After filtering, the set of candidate indexes are ranked according to the result
+ * of getEstimatedResultRows and the most selective (i.e. the one expected to return the smallest number of results) is
+ * chosen. A Searcher instance is then obtained from the searcherFor method & used to perform the actual Index lookup.
+ * Finally, Indexes can define a post processing step to be performed on the coordinator, after results (partitions from
+ * the primary table) have been received from replicas and reconciled. This post processing is defined as a
+ * java.util.functions.BiFunction<PartitionIterator, RowFilter, PartitionIterator>, that is a function which takes as
+ * arguments a PartitionIterator (containing the reconciled result rows) and a RowFilter (from the ReadCommand being
+ * executed) and returns another iterator of partitions, possibly having transformed the initial results in some way.
+ * The post processing function is obtained from the Index's postProcessorFor method; the built-in indexes which ship
+ * with Cassandra return a no-op function here.
+ *
+ * An optional static method may be provided to validate custom index options (two variants are supported):
+ *
+ * <pre>{@code public static Map<String, String> validateOptions(Map<String, String> options);</pre>
+ *
+ * The input is the map of index options supplied in the WITH clause of a CREATE INDEX statement.
+ *
+ * <pre>{@code public static Map<String, String> validateOptions(Map<String, String> options, CFMetaData cfm);}</pre>
+ *
+ * In this version, the base table's metadata is also supplied as an argument.
+ * If both overloaded methods are provided, only the one including the base table's metadata will be invoked.
+ *
+ * The validation method should return a map containing any of the supplied options which are not valid for the
+ * implementation. If the returned map is not empty, validation is considered failed and an error is raised.
+ * Alternatively, the implementation may choose to throw an org.apache.cassandra.exceptions.ConfigurationException
+ * if invalid options are encountered.
+ *
+ */
+public interface Index
+{
+
+    /*
+     * Management functions
+     */
+
+    /**
+     * Return a task to perform any initialization work when a new index instance is created.
+     * This may involve costly operations such as (re)building the index, and is performed asynchronously
+     * by SecondaryIndexManager
+     * @return a task to perform any necessary initialization work
+     */
+    public Callable<?> getInitializationTask();
+
+    /**
+     * Returns the IndexMetadata which configures and defines the index instance. This should be the same
+     * object passed as the argument to setIndexMetadata.
+     * @return the index's metadata
+     */
+    public IndexMetadata getIndexMetadata();
+
+    /**
+     * Return a task to reload the internal metadata of an index.
+     * Called when the base table metadata is modified or when the configuration of the Index is updated
+     * Implementations should return a task which performs any necessary work to be done due to
+     * updating the configuration(s) such as (re)building etc. This task is performed asynchronously
+     * by SecondaryIndexManager
+     * @return task to be executed by the index manager during a reload
+     */
+    public Callable<?> getMetadataReloadTask(IndexMetadata indexMetadata);
+
+    /**
+     * An index must be registered in order to be able to either subscribe to update events on the base
+     * table and/or to provide Searcher functionality for reads. The double dispatch involved here, where
+     * the Index actually performs its own registration by calling back to the supplied IndexRegistry's
+     * own registerIndex method, is to make the decision as to whether or not to register an index belong
+     * to the implementation, not the manager.
+     * @param registry the index registry to register the instance with
+     */
+    public void register(IndexRegistry registry);
+
+    /**
+     * If the index implementation uses a local table to store its index data this method should return a
+     * handle to it. If not, an empty Optional should be returned. Typically, this is useful for the built-in
+     * Index implementations.
+     * @return an Optional referencing the Index's backing storage table if it has one, or Optional.empty() if not.
+     */
+    public Optional<ColumnFamilyStore> getBackingTable();
+
+    /**
+     * Return a task which performs a blocking flush of the index's data to persistent storage.
+     * @return task to be executed by the index manager to perform the flush.
+     */
+    public Callable<?> getBlockingFlushTask();
+
+    /**
+     * Return a task which invalidates the index, indicating it should no longer be considered usable.
+     * This should include an clean up and releasing of resources required when dropping an index.
+     * @return task to be executed by the index manager to invalidate the index.
+     */
+    public Callable<?> getInvalidateTask();
+
+    /**
+     * Return a task to truncate the index with the specified truncation timestamp.
+     * Called when the base table is truncated.
+     * @param truncatedAt timestamp of the truncation operation. This will be the same timestamp used
+     *                    in the truncation of the base table.
+     * @return task to be executed by the index manager when the base table is truncated.
+     */
+    public Callable<?> getTruncateTask(long truncatedAt);
+
+    /**
+     * Return true if this index can be built or rebuilt when the index manager determines it is necessary. Returning
+     * false enables the index implementation (or some other component) to control if and when SSTable data is
+     * incorporated into the index.
+     *
+     * This is called by SecondaryIndexManager in buildIndexBlocking, buildAllIndexesBlocking & rebuildIndexesBlocking
+     * where a return value of false causes the index to be exluded from the set of those which will process the
+     * SSTable data.
+     * @return if the index should be included in the set which processes SSTable data, false otherwise.
+     */
+    public boolean shouldBuildBlocking();
+
+
+    /*
+     * Index selection
+     */
+
+    /**
+     * Called to determine whether this index targets a specific column.
+     * Used during schema operations such as when dropping or renaming a column, to check if
+     * the index will be affected by the change. Typically, if an index answers that it does
+     * depend upon a column, then schema operations on that column are not permitted until the index
+     * is dropped or altered.
+     *
+     * @param column the column definition to check
+     * @return true if the index depends on the supplied column being present; false if the column may be
+     *              safely dropped or modified without adversely affecting the index
+     */
+    public boolean dependsOn(ColumnDefinition column);
+
+    /**
+     * Called to determine whether this index can provide a searcher to execute a query on the
+     * supplied column using the specified operator. This forms part of the query validation done
+     * before a CQL select statement is executed.
+     * @param column the target column of a search query predicate
+     * @param operator the operator of a search query predicate
+     * @return true if this index is capable of supporting such expressions, false otherwise
+     */
+    public boolean supportsExpression(ColumnDefinition column, Operator operator);
+
+    /**
+     * If the index supports custom search expressions using the
+     * {@code}SELECT * FROM table WHERE expr(index_name, expression){@code} syntax, this
+     * method should return the expected type of the expression argument.
+     * For example, if the index supports custom expressions as Strings, calls to this
+     * method should return {@code}UTF8Type.instance{@code}.
+     * If the index implementation does not support custom expressions, then it should
+     * return null.
+     * @return an the type of custom index expressions supported by this index, or an
+     *         null if custom expressions are not supported.
+     */
+    public AbstractType<?> customExpressionValueType();
+
+    /**
+     * Transform an initial RowFilter into the filter that will still need to applied
+     * to a set of Rows after the index has performed it's initial scan.
+     * Used in ReadCommand#executeLocal to reduce the amount of filtering performed on the
+     * results of the index query.
+     *
+     * @param filter the intial filter belonging to a ReadCommand
+     * @return the (hopefully) reduced filter that would still need to be applied after
+     *         the index was used to narrow the initial result set
+     */
+    public RowFilter getPostIndexQueryFilter(RowFilter filter);
+
+    /**
+     * Return an estimate of the number of results this index is expected to return for any given
+     * query that it can be used to answer. Used in conjunction with indexes() and supportsExpression()
+     * to determine the most selective index for a given ReadCommand. Additionally, this is also used
+     * by StorageProxy.estimateResultsPerRange to calculate the initial concurrency factor for range requests
+     *
+     * @return the estimated average number of results a Searcher may return for any given query
+     */
+    public long getEstimatedResultRows();
+
+    /*
+     * Input validation
+     */
+
+    /**
+     * Called at write time to ensure that values present in the update
+     * are valid according to the rules of all registered indexes which
+     * will process it. The partition key as well as the clustering and
+     * cell values for each row in the update may be checked by index
+     * implementations
+     * @param update PartitionUpdate containing the values to be validated by registered Index implementations
+     * @throws InvalidRequestException
+     */
+    public void validate(PartitionUpdate update) throws InvalidRequestException;
+
+    /*
+     * Update processing
+     */
+
+    /**
+     * Creates an new {@code Indexer} object for updates to a given partition.
+     *
+     * @param key key of the partition being modified
+     * @param columns the regular and static columns the created indexer will have to deal with.
+     * This can be empty as an update might only contain partition, range and row deletions, but
+     * the indexer is guaranteed to not get any cells for a column that is not part of {@code columns}.
+     * @param nowInSec current time of the update operation
+     * @param opGroup operation group spanning the update operation
+     * @param transactionType indicates what kind of update is being performed on the base data
+     *                        i.e. a write time insert/update/delete or the result of compaction
+     * @return the newly created indexer or {@code null} if the index is not interested by the update
+     * (this could be because the index doesn't care about that particular partition, doesn't care about
+     * that type of transaction, ...).
+     */
+    public Indexer indexerFor(DecoratedKey key,
+                              PartitionColumns columns,
+                              int nowInSec,
+                              OpOrder.Group opGroup,
+                              IndexTransaction.Type transactionType);
+
+    /**
+     * Listener for processing events emitted during a single partition update.
+     * Instances of this are responsible for applying modifications to the index in response to a single update
+     * operation on a particular partition of the base table.
+     *
+     * That update may be generated by the normal write path, by iterating SSTables during streaming operations or when
+     * building or rebuilding an index from source. Updates also occur during compaction when multiple versions of a
+     * source partition from different SSTables are merged.
+     *
+     * Implementations should not make assumptions about resolution or filtering of the partition update being
+     * processed. That is to say that it is possible for an Indexer instance to receive notification of a
+     * PartitionDelete or RangeTombstones which shadow a Row it then receives via insertRow/updateRow.
+     *
+     * It is important to note that the only ordering guarantee made for the methods here is that the first call will
+     * be to begin() and the last call to finish(). The other methods may be called to process update events in any
+     * order. This can also include duplicate calls, in cases where a memtable partition is under contention from
+     * several updates. In that scenario, the same set of events may be delivered to the Indexer as memtable update
+     * which failed due to contention is re-applied.
+     */
+    public interface Indexer
+    {
+        /**
+         * Notification of the start of a partition update.
+         * This event always occurs before any other during the update.
+         */
+        public void begin();
+
+        /**
+         * Notification of a top level partition delete.
+         * @param deletionTime
+         */
+        public void partitionDelete(DeletionTime deletionTime);
+
+        /**
+         * Notification of a RangeTombstone.
+         * An update of a single partition may contain multiple RangeTombstones,
+         * and a notification will be passed for each of them.
+         * @param tombstone
+         */
+        public void rangeTombstone(RangeTombstone tombstone);
+
+        /**
+         * Notification that a new row was inserted into the Memtable holding the partition.
+         * This only implies that the inserted row was not already present in the Memtable,
+         * it *does not* guarantee that the row does not exist in an SSTable, potentially with
+         * additional column data.
+         *
+         * @param row the Row being inserted into the base table's Memtable.
+         */
+        public void insertRow(Row row);
+
+        /**
+         * Notification of a modification to a row in the base table's Memtable.
+         * This is allow an Index implementation to clean up entries for base data which is
+         * never flushed to disk (and so will not be purged during compaction).
+         * It's important to note that the old & new rows supplied here may not represent
+         * the totality of the data for the Row with this particular Clustering. There may be
+         * additional column data in SSTables which is not present in either the old or new row,
+         * so implementations should be aware of that.
+         * The supplied rows contain only column data which has actually been updated.
+         * oldRowData contains only the columns which have been removed from the Row's
+         * representation in the Memtable, while newRowData includes only new columns
+         * which were not previously present. Any column data which is unchanged by
+         * the update is not included.
+         *
+         * @param oldRowData data that was present in existing row and which has been removed from
+         *                   the base table's Memtable
+         * @param newRowData data that was not present in the existing row and is being inserted
+         *                   into the base table's Memtable
+         */
+        public void updateRow(Row oldRowData, Row newRowData);
+
+        /**
+         * Notification that a row was removed from the partition.
+         * Note that this is only called as part of either a compaction or a cleanup.
+         * This context is indicated by the TransactionType supplied to the indexerFor method.
+         *
+         * As with updateRow, it cannot be guaranteed that all data belonging to the Clustering
+         * of the supplied Row has been removed (although in the case of a cleanup, that is the
+         * ultimate intention).
+         * There may be data for the same row in other SSTables, so in this case Indexer implementations
+         * should *not* assume that all traces of the row have been removed. In particular,
+         * it is not safe to assert that all values associated with the Row's Clustering
+         * have been deleted, so implementations which index primary key columns should not
+         * purge those entries from their indexes.
+         *
+         * @param row data being removed from the base table
+         */
+        public void removeRow(Row row);
+
+        /**
+         * Notification of the end of the partition update.
+         * This event always occurs after all others for the particular update.
+         */
+        public void finish();
+    }
+
+    /*
+     * Querying
+     */
+
+    /**
+     * Used to validate the various parameters of a supplied {@code}ReadCommand{@code},
+     * this is called prior to execution. In theory, any command instance may be checked
+     * by any {@code}Index{@code} instance, but in practice the index will be the one
+     * returned by a call to the {@code}getIndex(ColumnFamilyStore cfs){@code} method on
+     * the supplied command.
+     *
+     * Custom index implementations should perform any validation of query expressions here and throw a meaningful
+     * InvalidRequestException when any expression or other parameter is invalid.
+     *
+     * @param command a ReadCommand whose parameters are to be verified
+     * @throws InvalidRequestException if the details of the command fail to meet the
+     *         index's validation rules
+     */
+    default void validate(ReadCommand command) throws InvalidRequestException
+    {
+    }
+
+    /**
+     * Return a function which performs post processing on the results of a partition range read command.
+     * In future, this may be used as a generalized mechanism for transforming results on the coordinator prior
+     * to returning them to the caller.
+     *
+     * This is used on the coordinator during execution of a range command to perform post
+     * processing of merged results obtained from the necessary replicas. This is the only way in which results are
+     * transformed in this way but this may change over time as usage is generalized.
+     * See CASSANDRA-8717 for further discussion.
+     *
+     * The function takes a PartitionIterator of the results from the replicas which has already been collated
+     * & reconciled, along with the command being executed. It returns another PartitionIterator containing the results
+     * of the transformation (which may be the same as the input if the transformation is a no-op).
+     */
+    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command);
+
+    /**
+     * Factory method for query time search helper.
+     *
+     * @param command the read command being executed
+     * @return an Searcher with which to perform the supplied command
+     */
+    public Searcher searcherFor(ReadCommand command);
+
+    /**
+     * Performs the actual index lookup during execution of a ReadCommand.
+     * An instance performs its query according to the RowFilter.Expression it was created for (see searcherFor)
+     * An Expression is a predicate of the form [column] [operator] [value].
+     */
+    public interface Searcher
+    {
+        /**
+         * @param orderGroup the collection of OpOrder.Groups which the ReadCommand is being performed under.
+         * @return partitions from the base table matching the criteria of the search.
+         */
+        public UnfilteredPartitionIterator search(ReadOrderGroup orderGroup);
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/IndexNotAvailableException.java b/src/java/org/apache/cassandra/index/IndexNotAvailableException.java
new file mode 100644
index 0000000..5440e2a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/IndexNotAvailableException.java

@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index;
+
+/**
+ * Thrown if a secondary index is not currently available.
+ */
+public final class IndexNotAvailableException extends RuntimeException
+{
+    /**
+     * Creates a new <code>IndexNotAvailableException</code> for the specified index.
+     * @param name the index name
+     */
+    public IndexNotAvailableException(Index index)
+    {
+        super(String.format("The secondary index '%s' is not yet available", index.getIndexMetadata().name));
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/IndexRegistry.java b/src/java/org/apache/cassandra/index/IndexRegistry.java
new file mode 100644
index 0000000..9f5ed02
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/IndexRegistry.java

@@ -0,0 +1,42 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index;
+
+import java.util.Collection;
+
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * The collection of all Index instances for a base table.
+ * The SecondaryIndexManager for a ColumnFamilyStore contains an IndexRegistry
+ * (actually it implements this interface at present) and Index implementations
+ * register in order to:
+ * i) subscribe to the stream of updates being applied to partitions in the base table
+ * ii) provide searchers to support queries with the relevant search predicates
+ */
+public interface IndexRegistry
+{
+    void registerIndex(Index index);
+    void unregisterIndex(Index index);
+
+    Index getIndex(IndexMetadata indexMetadata);
+    Collection<Index> listIndexes();
+}

diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java b/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java
new file mode 100644
index 0000000..907f65f
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index;
+
+import java.util.Set;
+import java.util.UUID;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.CompactionInterruptedException;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.io.sstable.ReducingKeyIterator;
+import org.apache.cassandra.utils.UUIDGen;
+
+/**
+ * Manages building an entire index from column family data. Runs on to compaction manager.
+ */
+public class SecondaryIndexBuilder extends CompactionInfo.Holder
+{
+    private final ColumnFamilyStore cfs;
+    private final Set<Index> indexers;
+    private final ReducingKeyIterator iter;
+    private final UUID compactionId;
+
+    public SecondaryIndexBuilder(ColumnFamilyStore cfs, Set<Index> indexers, ReducingKeyIterator iter)
+    {
+        this.cfs = cfs;
+        this.indexers = indexers;
+        this.iter = iter;
+        this.compactionId = UUIDGen.getTimeUUID();
+    }
+
+    public CompactionInfo getCompactionInfo()
+    {
+        return new CompactionInfo(cfs.metadata,
+                                  OperationType.INDEX_BUILD,
+                                  iter.getBytesRead(),
+                                  iter.getTotalBytes(),
+                                  compactionId);
+    }
+
+    public void build()
+    {
+        try
+        {
+            int pageSize = cfs.indexManager.calculateIndexingPageSize();
+            while (iter.hasNext())
+            {
+                if (isStopRequested())
+                    throw new CompactionInterruptedException(getCompactionInfo());
+                DecoratedKey key = iter.next();
+                cfs.indexManager.indexPartition(key, indexers, pageSize);
+            }
+        }
+        finally
+        {
+            iter.close();
+        }
+    }
+
+    public boolean isGlobal()
+    {
+        return false;
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
new file mode 100644
index 0000000..d66a18b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java

@@ -0,0 +1,1158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index;
+
+import java.lang.reflect.Constructor;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Joiner;
+import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import com.google.common.primitives.Longs;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.MoreExecutors;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.transactions.*;
+import org.apache.cassandra.io.sstable.ReducingKeyIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Indexes;
+import org.apache.cassandra.service.pager.SinglePartitionPager;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.concurrent.Refs;
+
+import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
+import static org.apache.cassandra.utils.ExecutorUtils.shutdown;
+import static org.apache.cassandra.utils.ExecutorUtils.shutdownNow;
+
+/**
+ * Handles the core maintenance functionality associated with indexes: adding/removing them to or from
+ * a table, (re)building during bootstrap or other streaming operations, flushing, reloading metadata
+ * and so on.
+ *
+ * The Index interface defines a number of methods which return Callable<?>. These are primarily the
+ * management tasks for an index implementation. Most of them are currently executed in a blocking
+ * fashion via submission to SIM's blockingExecutor. This provides the desired behaviour in pretty
+ * much all cases, as tasks like flushing an index needs to be executed synchronously to avoid potentially
+ * deadlocking on the FlushWriter or PostFlusher. Several of these Callable<?> returning methods on Index could
+ * then be defined with as void and called directly from SIM (rather than being run via the executor service).
+ * Separating the task defintion from execution gives us greater flexibility though, so that in future, for example,
+ * if the flush process allows it we leave open the possibility of executing more of these tasks asynchronously.
+ *
+ * The primary exception to the above is the Callable returned from Index#addIndexedColumn. This may
+ * involve a significant effort, building a new index over any existing data. We perform this task asynchronously;
+ * as it is called as part of a schema update, which we do not want to block for a long period. Building non-custom
+ * indexes is performed on the CompactionManager.
+ *
+ * This class also provides instances of processors which listen to updates to the base table and forward to
+ * registered Indexes the info required to keep those indexes up to date.
+ * There are two variants of these processors, each with a factory method provided by SIM:
+ *      IndexTransaction: deals with updates generated on the regular write path.
+ *      CleanupTransaction: used when partitions are modified during compaction or cleanup operations.
+ * Further details on their usage and lifecycles can be found in the interface definitions below.
+ *
+ * Finally, the bestIndexFor method is used at query time to identify the most selective index of those able
+ * to satisfy any search predicates defined by a ReadCommand's RowFilter. It returns a thin IndexAccessor object
+ * which enables the ReadCommand to access the appropriate functions of the Index at various stages in its lifecycle.
+ * e.g. the getEstimatedResultRows is required when StorageProxy calculates the initial concurrency factor for
+ * distributing requests to replicas, whereas a Searcher instance is needed when the ReadCommand is executed locally on
+ * a target replica.
+ */
+public class SecondaryIndexManager implements IndexRegistry
+{
+    private static final Logger logger = LoggerFactory.getLogger(SecondaryIndexManager.class);
+
+    // default page size (in rows) when rebuilding the index for a whole partition
+    public static final int DEFAULT_PAGE_SIZE = 10000;
+
+    private Map<String, Index> indexes = Maps.newConcurrentMap();
+
+    /**
+     * The indexes that are ready to server requests.
+     */
+    private Set<String> builtIndexes = Sets.newConcurrentHashSet();
+
+    // executes tasks returned by Indexer#addIndexColumn which may require index(es) to be (re)built
+    private static final ExecutorService asyncExecutor =
+        new JMXEnabledThreadPoolExecutor(1,
+                                         StageManager.KEEPALIVE,
+                                         TimeUnit.SECONDS,
+                                         new LinkedBlockingQueue<>(),
+                                         new NamedThreadFactory("SecondaryIndexManagement"),
+                                         "internal");
+
+    // executes all blocking tasks produced by Indexers e.g. getFlushTask, getMetadataReloadTask etc
+    private static final ExecutorService blockingExecutor = MoreExecutors.newDirectExecutorService();
+
+    /**
+     * The underlying column family containing the source data for these indexes
+     */
+    public final ColumnFamilyStore baseCfs;
+
+    public SecondaryIndexManager(ColumnFamilyStore baseCfs)
+    {
+        this.baseCfs = baseCfs;
+    }
+
+
+    /**
+     * Drops and adds new indexes associated with the underlying CF
+     */
+    public void reload()
+    {
+        // figure out what needs to be added and dropped.
+        Indexes tableIndexes = baseCfs.metadata.getIndexes();
+        indexes.keySet()
+               .stream()
+               .filter(indexName -> !tableIndexes.has(indexName))
+               .forEach(this::removeIndex);
+
+        // we call add for every index definition in the collection as
+        // some may not have been created here yet, only added to schema
+        for (IndexMetadata tableIndex : tableIndexes)
+            addIndex(tableIndex);
+    }
+
+    private Future<?> reloadIndex(IndexMetadata indexDef)
+    {
+        Index index = indexes.get(indexDef.name);
+        Callable<?> reloadTask = index.getMetadataReloadTask(indexDef);
+        return reloadTask == null
+               ? Futures.immediateFuture(null)
+               : blockingExecutor.submit(reloadTask);
+    }
+
+    private Future<?> createIndex(IndexMetadata indexDef)
+    {
+        Index index = createInstance(indexDef);
+        index.register(this);
+
+        // if the index didn't register itself, we can probably assume that no initialization needs to happen
+        final Callable<?> initialBuildTask = indexes.containsKey(indexDef.name)
+                                           ? index.getInitializationTask()
+                                           : null;
+        if (initialBuildTask == null)
+        {
+            // We need to make sure that the index is marked as built in the case where the initialBuildTask
+            // does not need to be run (if the index didn't register itself or if the base table was empty).
+            markIndexBuilt(indexDef.name);
+            return Futures.immediateFuture(null);
+        }
+        return asyncExecutor.submit(index.getInitializationTask());
+    }
+
+    /**
+     * Adds and builds a index
+     * @param indexDef the IndexMetadata describing the index
+     */
+    public synchronized Future<?> addIndex(IndexMetadata indexDef)
+    {
+        if (indexes.containsKey(indexDef.name))
+            return reloadIndex(indexDef);
+        else
+            return createIndex(indexDef);
+    }
+
+    /**
+     * Checks if the specified index is queryable.
+     *
+     * @param index the index
+     * @return <code>true</code> if the specified index is queryable, <code>false</code> otherwise
+     */
+    public boolean isIndexQueryable(Index index)
+    {
+        return builtIndexes.contains(index.getIndexMetadata().name);
+    }
+
+    public synchronized void removeIndex(String indexName)
+    {
+        Index index = unregisterIndex(indexName);
+        if (null != index)
+        {
+            markIndexRemoved(indexName);
+            executeBlocking(index.getInvalidateTask());
+        }
+    }
+
+
+    public Set<IndexMetadata> getDependentIndexes(ColumnDefinition column)
+    {
+        if (indexes.isEmpty())
+            return Collections.emptySet();
+
+        Set<IndexMetadata> dependentIndexes = new HashSet<>();
+        for (Index index : indexes.values())
+            if (index.dependsOn(column))
+                dependentIndexes.add(index.getIndexMetadata());
+
+        return dependentIndexes;
+    }
+
+    /**
+     * Called when dropping a Table
+     */
+    public void markAllIndexesRemoved()
+    {
+       getBuiltIndexNames().forEach(this::markIndexRemoved);
+    }
+
+    /**
+    * Does a full, blocking rebuild of the indexes specified by columns from the sstables.
+    * Caller must acquire and release references to the sstables used here.
+    * Note also that only this method of (re)building indexes:
+    *   a) takes a set of index *names* rather than Indexers
+    *   b) marks exsiting indexes removed prior to rebuilding
+    *
+    * @param sstables the data to build from
+    * @param indexNames the list of indexes to be rebuilt
+    */
+    public void rebuildIndexesBlocking(Collection<SSTableReader> sstables, Set<String> indexNames)
+    {
+        Set<Index> toRebuild = indexes.values().stream()
+                                               .filter(index -> indexNames.contains(index.getIndexMetadata().name))
+                                               .filter(Index::shouldBuildBlocking)
+                                               .collect(Collectors.toSet());
+        if (toRebuild.isEmpty())
+        {
+            logger.info("No defined indexes with the supplied names: {}", Joiner.on(',').join(indexNames));
+            return;
+        }
+
+        toRebuild.forEach(indexer -> markIndexRemoved(indexer.getIndexMetadata().name));
+
+        buildIndexesBlocking(sstables, toRebuild);
+
+        toRebuild.forEach(indexer -> markIndexBuilt(indexer.getIndexMetadata().name));
+    }
+
+    public void buildAllIndexesBlocking(Collection<SSTableReader> sstables)
+    {
+        buildIndexesBlocking(sstables, indexes.values()
+                                              .stream()
+                                              .filter(Index::shouldBuildBlocking)
+                                              .collect(Collectors.toSet()));
+    }
+
+    // For convenience, may be called directly from Index impls
+    public void buildIndexBlocking(Index index)
+    {
+        if (index.shouldBuildBlocking())
+        {
+            try (ColumnFamilyStore.RefViewFragment viewFragment = baseCfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
+                 Refs<SSTableReader> sstables = viewFragment.refs)
+            {
+                buildIndexesBlocking(sstables, Collections.singleton(index));
+                markIndexBuilt(index.getIndexMetadata().name);
+            }
+        }
+    }
+
+    /**
+     * Checks if the specified {@link ColumnFamilyStore} is a secondary index.
+     *
+     * @param cfs the <code>ColumnFamilyStore</code> to check.
+     * @return <code>true</code> if the specified <code>ColumnFamilyStore</code> is a secondary index,
+     * <code>false</code> otherwise.
+     */
+    public static boolean isIndexColumnFamilyStore(ColumnFamilyStore cfs)
+    {
+        return isIndexColumnFamily(cfs.name);
+    }
+
+    /**
+     * Checks if the specified {@link ColumnFamilyStore} is the one secondary index.
+     *
+     * @param cfName the name of the <code>ColumnFamilyStore</code> to check.
+     * @return <code>true</code> if the specified <code>ColumnFamilyStore</code> is a secondary index,
+     * <code>false</code> otherwise.
+     */
+    public static boolean isIndexColumnFamily(String cfName)
+    {
+        return cfName.contains(Directories.SECONDARY_INDEX_NAME_SEPARATOR);
+    }
+
+    /**
+     * Returns the parent of the specified {@link ColumnFamilyStore}.
+     *
+     * @param cfs the <code>ColumnFamilyStore</code>
+     * @return the parent of the specified <code>ColumnFamilyStore</code>
+     */
+    public static ColumnFamilyStore getParentCfs(ColumnFamilyStore cfs)
+    {
+        String parentCfs = getParentCfsName(cfs.name);
+        return cfs.keyspace.getColumnFamilyStore(parentCfs);
+    }
+
+    /**
+     * Returns the parent name of the specified {@link ColumnFamilyStore}.
+     *
+     * @param cfName the <code>ColumnFamilyStore</code> name
+     * @return the parent name of the specified <code>ColumnFamilyStore</code>
+     */
+    public static String getParentCfsName(String cfName)
+    {
+        assert isIndexColumnFamily(cfName);
+        return StringUtils.substringBefore(cfName, Directories.SECONDARY_INDEX_NAME_SEPARATOR);
+    }
+
+    /**
+     * Returns the index name
+     *
+     * @param cfs the <code>ColumnFamilyStore</code>
+     * @return the index name
+     */
+    public static String getIndexName(ColumnFamilyStore cfs)
+    {
+        return getIndexName(cfs.name);
+    }
+
+    /**
+     * Returns the index name
+     *
+     * @param cfName the <code>ColumnFamilyStore</code> name
+     * @return the index name
+     */
+    public static String getIndexName(String cfName)
+    {
+        assert isIndexColumnFamily(cfName);
+        return StringUtils.substringAfter(cfName, Directories.SECONDARY_INDEX_NAME_SEPARATOR);
+    }
+
+    private void buildIndexesBlocking(Collection<SSTableReader> sstables, Set<Index> indexes)
+    {
+        if (indexes.isEmpty())
+            return;
+
+        logger.info("Submitting index build of {} for data in {}",
+                    indexes.stream().map(i -> i.getIndexMetadata().name).collect(Collectors.joining(",")),
+                    sstables.stream().map(SSTableReader::toString).collect(Collectors.joining(",")));
+
+        SecondaryIndexBuilder builder = new SecondaryIndexBuilder(baseCfs,
+                                                                  indexes,
+                                                                  new ReducingKeyIterator(sstables));
+        Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
+        FBUtilities.waitOnFuture(future);
+
+        flushIndexesBlocking(indexes);
+        logger.info("Index build of {} complete",
+                    indexes.stream().map(i -> i.getIndexMetadata().name).collect(Collectors.joining(",")));
+    }
+
+    /**
+     * Marks the specified index as build.
+     * <p>This method is public as it need to be accessible from the {@link Index} implementations</p>
+     * @param indexName the index name
+     */
+    public void markIndexBuilt(String indexName)
+    {
+        builtIndexes.add(indexName);
+        if (DatabaseDescriptor.isDaemonInitialized())
+            SystemKeyspace.setIndexBuilt(baseCfs.keyspace.getName(), indexName);
+    }
+
+    /**
+     * Marks the specified index as removed.
+     * <p>This method is public as it need to be accessible from the {@link Index} implementations</p>
+     * @param indexName the index name
+     */
+    public void markIndexRemoved(String indexName)
+    {
+        SystemKeyspace.setIndexRemoved(baseCfs.keyspace.getName(), indexName);
+    }
+
+    public Index getIndexByName(String indexName)
+    {
+        return indexes.get(indexName);
+    }
+
+    private Index createInstance(IndexMetadata indexDef)
+    {
+        Index newIndex;
+        if (indexDef.isCustom())
+        {
+            assert indexDef.options != null;
+            String className = indexDef.options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME);
+            assert ! Strings.isNullOrEmpty(className);
+            try
+            {
+                Class<? extends Index> indexClass = FBUtilities.classForName(className, "Index");
+                Constructor<? extends Index> ctor = indexClass.getConstructor(ColumnFamilyStore.class, IndexMetadata.class);
+                newIndex = (Index)ctor.newInstance(baseCfs, indexDef);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        else
+        {
+            newIndex = CassandraIndex.newIndex(baseCfs, indexDef);
+        }
+        return newIndex;
+    }
+
+    /**
+     * Truncate all indexes
+     */
+    public void truncateAllIndexesBlocking(final long truncatedAt)
+    {
+        executeAllBlocking(indexes.values().stream(), (index) -> index.getTruncateTask(truncatedAt));
+    }
+
+    /**
+     * Remove all indexes
+     */
+    public void invalidateAllIndexesBlocking()
+    {
+        markAllIndexesRemoved();
+        executeAllBlocking(indexes.values().stream(), Index::getInvalidateTask);
+    }
+
+    /**
+     * Perform a blocking flush all indexes
+     */
+    public void flushAllIndexesBlocking()
+    {
+       flushIndexesBlocking(ImmutableSet.copyOf(indexes.values()));
+    }
+
+    /**
+     * Perform a blocking flush of selected indexes
+     */
+    public void flushIndexesBlocking(Set<Index> indexes)
+    {
+        if (indexes.isEmpty())
+            return;
+
+        List<Future<?>> wait = new ArrayList<>();
+        List<Index> nonCfsIndexes = new ArrayList<>();
+
+        // for each CFS backed index, submit a flush task which we'll wait on for completion
+        // for the non-CFS backed indexes, we'll flush those while we wait.
+        synchronized (baseCfs.getTracker())
+        {
+            indexes.forEach(index ->
+                index.getBackingTable()
+                     .map(cfs -> wait.add(cfs.forceFlush()))
+                     .orElseGet(() -> nonCfsIndexes.add(index)));
+        }
+
+        executeAllBlocking(nonCfsIndexes.stream(), Index::getBlockingFlushTask);
+        FBUtilities.waitOnFutures(wait);
+    }
+
+    /**
+     * Performs a blocking flush of all custom indexes
+     */
+    public void flushAllNonCFSBackedIndexesBlocking()
+    {
+        executeAllBlocking(indexes.values()
+                                  .stream()
+                                  .filter(index -> !index.getBackingTable().isPresent()),
+                           Index::getBlockingFlushTask);
+    }
+
+    /**
+     * @return all indexes which are marked as built and ready to use
+     */
+    public List<String> getBuiltIndexNames()
+    {
+        Set<String> allIndexNames = new HashSet<>();
+        indexes.values().stream()
+                .map(i -> i.getIndexMetadata().name)
+                .forEach(allIndexNames::add);
+        return SystemKeyspace.getBuiltIndexes(baseCfs.keyspace.getName(), allIndexNames);
+    }
+
+    /**
+     * @return all backing Tables used by registered indexes
+     */
+    public Set<ColumnFamilyStore> getAllIndexColumnFamilyStores()
+    {
+        Set<ColumnFamilyStore> backingTables = new HashSet<>();
+        indexes.values().forEach(index -> index.getBackingTable().ifPresent(backingTables::add));
+        return backingTables;
+    }
+
+    /**
+     * @return if there are ANY indexes registered for this table
+     */
+    public boolean hasIndexes()
+    {
+        return !indexes.isEmpty();
+    }
+
+    /**
+     * When building an index against existing data in sstables, add the given partition to the index
+     */
+    public void indexPartition(DecoratedKey key, Set<Index> indexes, int pageSize)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace("Indexing partition {}", baseCfs.metadata.getKeyValidator().getString(key.getKey()));
+
+        if (!indexes.isEmpty())
+        {
+            SinglePartitionReadCommand cmd = SinglePartitionReadCommand.fullPartitionRead(baseCfs.metadata,
+                                                                                          FBUtilities.nowInSeconds(),
+                                                                                          key);
+            int nowInSec = cmd.nowInSec();
+            boolean readStatic = false;
+
+            SinglePartitionPager pager = new SinglePartitionPager(cmd, null, Server.CURRENT_VERSION);
+            while (!pager.isExhausted())
+            {
+                try (ReadOrderGroup readGroup = cmd.startOrderGroup();
+                     OpOrder.Group writeGroup = Keyspace.writeOrder.start();
+                     UnfilteredPartitionIterator page = pager.fetchPageUnfiltered(baseCfs.metadata, pageSize, readGroup))
+                {
+                    if (!page.hasNext())
+                        break;
+
+                    try (UnfilteredRowIterator partition = page.next()) {
+                        Set<Index.Indexer> indexers = indexes.stream()
+                                                             .map(index -> index.indexerFor(key,
+                                                                                            partition.columns(),
+                                                                                            nowInSec,
+                                                                                            writeGroup,
+                                                                                            IndexTransaction.Type.UPDATE))
+                                                             .filter(Objects::nonNull)
+                                                             .collect(Collectors.toSet());
+
+                        // Short-circuit empty partitions if static row is processed or isn't read
+                        if (!readStatic && partition.isEmpty() && partition.staticRow().isEmpty())
+                            break;
+
+                        indexers.forEach(Index.Indexer::begin);
+
+                        if (!readStatic)
+                        {
+                            if (!partition.staticRow().isEmpty())
+                                indexers.forEach(indexer -> indexer.insertRow(partition.staticRow()));
+                            indexers.forEach((Index.Indexer i) -> i.partitionDelete(partition.partitionLevelDeletion()));
+                            readStatic = true;
+                        }
+
+                        MutableDeletionInfo.Builder deletionBuilder = MutableDeletionInfo.builder(partition.partitionLevelDeletion(), baseCfs.getComparator(), false);
+
+                        while (partition.hasNext())
+                        {
+                            Unfiltered unfilteredRow = partition.next();
+
+                            if (unfilteredRow.isRow())
+                            {
+                                Row row = (Row) unfilteredRow;
+                                indexers.forEach(indexer -> indexer.insertRow(row));
+                            }
+                            else
+                            {
+                                assert unfilteredRow.isRangeTombstoneMarker();
+                                RangeTombstoneMarker marker = (RangeTombstoneMarker) unfilteredRow;
+                                deletionBuilder.add(marker);
+                            }
+                        }
+
+                        MutableDeletionInfo deletionInfo = deletionBuilder.build();
+                        if (deletionInfo.hasRanges())
+                        {
+                            Iterator<RangeTombstone> iter = deletionInfo.rangeIterator(false);
+                            while (iter.hasNext())
+                            {
+                                RangeTombstone rt = iter.next();
+                                indexers.forEach(indexer -> indexer.rangeTombstone(rt));
+                            }
+                        }
+
+                        indexers.forEach(Index.Indexer::finish);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Return the page size used when indexing an entire partition
+     */
+    public int calculateIndexingPageSize()
+    {
+        if (Boolean.getBoolean("cassandra.force_default_indexing_page_size"))
+            return DEFAULT_PAGE_SIZE;
+
+        double targetPageSizeInBytes = 32 * 1024 * 1024;
+        double meanPartitionSize = baseCfs.getMeanPartitionSize();
+        if (meanPartitionSize <= 0)
+            return DEFAULT_PAGE_SIZE;
+
+        int meanCellsPerPartition = baseCfs.getMeanColumns();
+        if (meanCellsPerPartition <= 0)
+            return DEFAULT_PAGE_SIZE;
+
+        int columnsPerRow = baseCfs.metadata.partitionColumns().regulars.size();
+        if (columnsPerRow <= 0)
+            return DEFAULT_PAGE_SIZE;
+
+        int meanRowsPerPartition = meanCellsPerPartition / columnsPerRow;
+        double meanRowSize = meanPartitionSize / meanRowsPerPartition;
+
+        int pageSize = (int) Math.max(1, Math.min(DEFAULT_PAGE_SIZE, targetPageSizeInBytes / meanRowSize));
+
+        logger.trace("Calculated page size {} for indexing {}.{} ({}/{}/{}/{})",
+                     pageSize,
+                     baseCfs.metadata.ksName,
+                     baseCfs.metadata.cfName,
+                     meanPartitionSize,
+                     meanCellsPerPartition,
+                     meanRowsPerPartition,
+                     meanRowSize);
+
+        return pageSize;
+    }
+
+    /**
+     * Delete all data from all indexes for this partition.
+     * For when cleanup rips a partition out entirely.
+     *
+     * TODO : improve cleanup transaction to batch updates & perform them async
+     */
+    public void deletePartition(UnfilteredRowIterator partition, int nowInSec)
+    {
+        // we need to acquire memtable lock because secondary index deletion may
+        // cause a race (see CASSANDRA-3712). This is done internally by the
+        // index transaction when it commits
+        CleanupTransaction indexTransaction = newCleanupTransaction(partition.partitionKey(),
+                                                                    partition.columns(),
+                                                                    nowInSec);
+        indexTransaction.start();
+        indexTransaction.onPartitionDeletion(new DeletionTime(FBUtilities.timestampMicros(), nowInSec));
+        indexTransaction.commit();
+
+        while (partition.hasNext())
+        {
+            Unfiltered unfiltered = partition.next();
+            if (unfiltered.kind() != Unfiltered.Kind.ROW)
+                continue;
+
+            indexTransaction = newCleanupTransaction(partition.partitionKey(),
+                                                     partition.columns(),
+                                                     nowInSec);
+            indexTransaction.start();
+            indexTransaction.onRowDelete((Row)unfiltered);
+            indexTransaction.commit();
+        }
+    }
+
+    /**
+     * Called at query time to choose which (if any) of the registered index implementations to use for a given query.
+     *
+     * This is a two step processes, firstly compiling the set of searchable indexes then choosing the one which reduces
+     * the search space the most.
+     *
+     * In the first phase, if the command's RowFilter contains any custom index expressions, the indexes that they
+     * specify are automatically included. Following that, the registered indexes are filtered to include only those
+     * which support the standard expressions in the RowFilter.
+     *
+     * The filtered set then sorted by selectivity, as reported by the Index implementations' getEstimatedResultRows
+     * method.
+     *
+     * Implementation specific validation of the target expression, either custom or standard, by the selected
+     * index should be performed in the searcherFor method to ensure that we pick the right index regardless of
+     * the validity of the expression.
+     *
+     * This method is only called once during the lifecycle of a ReadCommand and the result is
+     * cached for future use when obtaining a Searcher, getting the index's underlying CFS for
+     * ReadOrderGroup, or an estimate of the result size from an average index query.
+     *
+     * @param rowFilter RowFilter of the command to be executed
+     * @return an Index instance, ready to use during execution of the command, or null if none
+     * of the registered indexes can support the command.
+     */
+    public Index getBestIndexFor(RowFilter rowFilter)
+    {
+        if (indexes.isEmpty() || rowFilter.isEmpty())
+            return null;
+
+        Set<Index> searchableIndexes = new HashSet<>();
+        for (RowFilter.Expression expression : rowFilter)
+        {
+            if (expression.isCustom())
+            {
+                // Only a single custom expression is allowed per query and, if present,
+                // we want to always favour the index specified in such an expression
+                RowFilter.CustomExpression customExpression = (RowFilter.CustomExpression)expression;
+                logger.trace("Command contains a custom index expression, using target index {}", customExpression.getTargetIndex().name);
+                Tracing.trace("Command contains a custom index expression, using target index {}", customExpression.getTargetIndex().name);
+                return indexes.get(customExpression.getTargetIndex().name);
+            }
+            else
+            {
+                indexes.values().stream()
+                       .filter(index -> index.supportsExpression(expression.column(), expression.operator()))
+                       .forEach(searchableIndexes::add);
+            }
+        }
+
+        if (searchableIndexes.isEmpty())
+        {
+            logger.trace("No applicable indexes found");
+            Tracing.trace("No applicable indexes found");
+            return null;
+        }
+
+        Index selected = searchableIndexes.size() == 1
+                         ? Iterables.getOnlyElement(searchableIndexes)
+                         : searchableIndexes.stream()
+                                            .min((a, b) -> Longs.compare(a.getEstimatedResultRows(),
+                                                                         b.getEstimatedResultRows()))
+                                            .orElseThrow(() -> new AssertionError("Could not select most selective index"));
+
+        // pay for an additional threadlocal get() rather than build the strings unnecessarily
+        if (Tracing.isTracing())
+        {
+            Tracing.trace("Index mean cardinalities are {}. Scanning with {}.",
+                          searchableIndexes.stream().map(i -> i.getIndexMetadata().name + ':' + i.getEstimatedResultRows())
+                                           .collect(Collectors.joining(",")),
+                          selected.getIndexMetadata().name);
+        }
+        return selected;
+    }
+
+    /**
+     * Called at write time to ensure that values present in the update
+     * are valid according to the rules of all registered indexes which
+     * will process it. The partition key as well as the clustering and
+     * cell values for each row in the update may be checked by index
+     * implementations
+     * @param update PartitionUpdate containing the values to be validated by registered Index implementations
+     * @throws InvalidRequestException
+     */
+    public void validate(PartitionUpdate update) throws InvalidRequestException
+    {
+        for (Index index : indexes.values())
+            index.validate(update);
+    }
+
+    /**
+     * IndexRegistry methods
+     */
+    public void registerIndex(Index index)
+    {
+        String name = index.getIndexMetadata().name;
+        indexes.put(name, index);
+        logger.trace("Registered index {}", name);
+    }
+
+    public void unregisterIndex(Index index)
+    {
+        unregisterIndex(index.getIndexMetadata().name);
+    }
+
+    private Index unregisterIndex(String name)
+    {
+        Index removed = indexes.remove(name);
+        builtIndexes.remove(name);
+        logger.trace(removed == null ? "Index {} was not registered" : "Removed index {} from registry",
+                     name);
+        return removed;
+    }
+
+    public Index getIndex(IndexMetadata metadata)
+    {
+        return indexes.get(metadata.name);
+    }
+
+    public Collection<Index> listIndexes()
+    {
+        return ImmutableSet.copyOf(indexes.values());
+    }
+
+    /**
+     * Handling of index updates.
+     * Implementations of the various IndexTransaction interfaces, for keeping indexes in sync with base data
+     * during updates, compaction and cleanup. Plus factory methods for obtaining transaction instances.
+     */
+
+    /**
+     * Transaction for updates on the write path.
+     */
+    public UpdateTransaction newUpdateTransaction(PartitionUpdate update, OpOrder.Group opGroup, int nowInSec)
+    {
+        if (!hasIndexes())
+            return UpdateTransaction.NO_OP;
+
+        Index.Indexer[] indexers = indexes.values().stream()
+                                          .map(i -> i.indexerFor(update.partitionKey(),
+                                                                 update.columns(),
+                                                                 nowInSec,
+                                                                 opGroup,
+                                                                 IndexTransaction.Type.UPDATE))
+                                          .filter(Objects::nonNull)
+                                          .toArray(Index.Indexer[]::new);
+
+        return indexers.length == 0 ? UpdateTransaction.NO_OP : new WriteTimeTransaction(indexers);
+    }
+
+    /**
+     * Transaction for use when merging rows during compaction
+     */
+    public CompactionTransaction newCompactionTransaction(DecoratedKey key,
+                                                          PartitionColumns partitionColumns,
+                                                          int versions,
+                                                          int nowInSec)
+    {
+        // the check for whether there are any registered indexes is already done in CompactionIterator
+        return new IndexGCTransaction(key, partitionColumns, versions, nowInSec, listIndexes());
+    }
+
+    /**
+     * Transaction for use when removing partitions during cleanup
+     */
+    public CleanupTransaction newCleanupTransaction(DecoratedKey key,
+                                                    PartitionColumns partitionColumns,
+                                                    int nowInSec)
+    {
+        if (!hasIndexes())
+            return CleanupTransaction.NO_OP;
+
+        return new CleanupGCTransaction(key, partitionColumns, nowInSec, listIndexes());
+    }
+
+    /**
+     * A single use transaction for processing a partition update on the regular write path
+     */
+    private static final class WriteTimeTransaction implements UpdateTransaction
+    {
+        private final Index.Indexer[] indexers;
+
+        private WriteTimeTransaction(Index.Indexer...indexers)
+        {
+            // don't allow null indexers, if we don't need any use a NullUpdater object
+            for (Index.Indexer indexer : indexers) assert indexer != null;
+            this.indexers = indexers;
+        }
+
+        public void start()
+        {
+            for (Index.Indexer indexer : indexers)
+                indexer.begin();
+        }
+
+        public void onPartitionDeletion(DeletionTime deletionTime)
+        {
+            for (Index.Indexer indexer : indexers)
+                indexer.partitionDelete(deletionTime);
+        }
+
+        public void onRangeTombstone(RangeTombstone tombstone)
+        {
+            for (Index.Indexer indexer : indexers)
+                indexer.rangeTombstone(tombstone);
+        }
+
+        public void onInserted(Row row)
+        {
+            for (Index.Indexer indexer : indexers)
+                indexer.insertRow(row);
+        }
+
+        public void onUpdated(Row existing, Row updated)
+        {
+            final Row.Builder toRemove = BTreeRow.sortedBuilder();
+            toRemove.newRow(existing.clustering());
+            toRemove.addPrimaryKeyLivenessInfo(existing.primaryKeyLivenessInfo());
+            toRemove.addRowDeletion(existing.deletion());
+            final Row.Builder toInsert = BTreeRow.sortedBuilder();
+            toInsert.newRow(updated.clustering());
+            toInsert.addPrimaryKeyLivenessInfo(updated.primaryKeyLivenessInfo());
+            toInsert.addRowDeletion(updated.deletion());
+            // diff listener collates the columns to be added & removed from the indexes
+            RowDiffListener diffListener = new RowDiffListener()
+            {
+                public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original)
+                {
+                }
+
+                public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original)
+                {
+                }
+
+                public void onComplexDeletion(int i, Clustering clustering, ColumnDefinition column, DeletionTime merged, DeletionTime original)
+                {
+                }
+
+                public void onCell(int i, Clustering clustering, Cell merged, Cell original)
+                {
+                    if (merged != null && !merged.equals(original))
+                        toInsert.addCell(merged);
+
+                    if (merged == null || (original != null && shouldCleanupOldValue(original, merged)))
+                        toRemove.addCell(original);
+
+                }
+            };
+            Rows.diff(diffListener, updated, existing);
+            Row oldRow = toRemove.build();
+            Row newRow = toInsert.build();
+            for (Index.Indexer indexer : indexers)
+                indexer.updateRow(oldRow, newRow);
+        }
+
+        public void commit()
+        {
+            for (Index.Indexer indexer : indexers)
+                indexer.finish();
+        }
+
+        private boolean shouldCleanupOldValue(Cell oldCell, Cell newCell)
+        {
+            // If either the value or timestamp is different, then we
+            // should delete from the index. If not, then we can infer that
+            // at least one of the cells is an ExpiringColumn and that the
+            // difference is in the expiry time. In this case, we don't want to
+            // delete the old value from the index as the tombstone we insert
+            // will just hide the inserted value.
+            // Completely identical cells (including expiring columns with
+            // identical ttl & localExpirationTime) will not get this far due
+            // to the oldCell.equals(newCell) in StandardUpdater.update
+            return !oldCell.value().equals(newCell.value()) || oldCell.timestamp() != newCell.timestamp();
+        }
+    }
+
+    /**
+     * A single-use transaction for updating indexes for a single partition during compaction where the only
+     * operation is to merge rows
+     * TODO : make this smarter at batching updates so we can use a single transaction to process multiple rows in
+     * a single partition
+     */
+    private static final class IndexGCTransaction implements CompactionTransaction
+    {
+        private final DecoratedKey key;
+        private final PartitionColumns columns;
+        private final int versions;
+        private final int nowInSec;
+        private final Collection<Index> indexes;
+
+        private Row[] rows;
+
+        private IndexGCTransaction(DecoratedKey key,
+                                   PartitionColumns columns,
+                                   int versions,
+                                   int nowInSec,
+                                   Collection<Index> indexes)
+        {
+            this.key = key;
+            this.columns = columns;
+            this.versions = versions;
+            this.indexes = indexes;
+            this.nowInSec = nowInSec;
+        }
+
+        public void start()
+        {
+            if (versions > 0)
+                rows = new Row[versions];
+        }
+
+        public void onRowMerge(Row merged, Row...versions)
+        {
+            // Diff listener constructs rows representing deltas between the merged and original versions
+            // These delta rows are then passed to registered indexes for removal processing
+            final Row.Builder[] builders = new Row.Builder[versions.length];
+            RowDiffListener diffListener = new RowDiffListener()
+            {
+                public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original)
+                {
+                    if (original != null && (merged == null || !merged.isLive(nowInSec)))
+                        getBuilder(i, clustering).addPrimaryKeyLivenessInfo(original);
+                }
+
+                public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original)
+                {
+                }
+
+                public void onComplexDeletion(int i, Clustering clustering, ColumnDefinition column, DeletionTime merged, DeletionTime original)
+                {
+                }
+
+                public void onCell(int i, Clustering clustering, Cell merged, Cell original)
+                {
+                    if (original != null && (merged == null || !merged.isLive(nowInSec)))
+                        getBuilder(i, clustering).addCell(original);
+                }
+
+                private Row.Builder getBuilder(int index, Clustering clustering)
+                {
+                    if (builders[index] == null)
+                    {
+                        builders[index] = BTreeRow.sortedBuilder();
+                        builders[index].newRow(clustering);
+                    }
+                    return builders[index];
+                }
+            };
+
+            Rows.diff(diffListener, merged, versions);
+
+            for(int i = 0; i < builders.length; i++)
+                if (builders[i] != null)
+                    rows[i] = builders[i].build();
+        }
+
+        public void commit()
+        {
+            if (rows == null)
+                return;
+
+            try (OpOrder.Group opGroup = Keyspace.writeOrder.start())
+            {
+                for (Index index : indexes)
+                {
+                    Index.Indexer indexer = index.indexerFor(key, columns, nowInSec, opGroup, Type.COMPACTION);
+                    if (indexer == null)
+                        continue;
+
+                    indexer.begin();
+                    for (Row row : rows)
+                        if (row != null)
+                            indexer.removeRow(row);
+                    indexer.finish();
+                }
+            }
+        }
+    }
+
+    /**
+     * A single-use transaction for updating indexes for a single partition during cleanup, where
+     * partitions and rows are only removed
+     * TODO : make this smarter at batching updates so we can use a single transaction to process multiple rows in
+     * a single partition
+     */
+    private static final class CleanupGCTransaction implements CleanupTransaction
+    {
+        private final DecoratedKey key;
+        private final PartitionColumns columns;
+        private final int nowInSec;
+        private final Collection<Index> indexes;
+
+        private Row row;
+        private DeletionTime partitionDelete;
+
+        private CleanupGCTransaction(DecoratedKey key,
+                                     PartitionColumns columns,
+                                     int nowInSec,
+                                     Collection<Index> indexes)
+        {
+            this.key = key;
+            this.columns = columns;
+            this.indexes = indexes;
+            this.nowInSec = nowInSec;
+        }
+
+        public void start()
+        {
+        }
+
+        public void onPartitionDeletion(DeletionTime deletionTime)
+        {
+            partitionDelete = deletionTime;
+        }
+
+        public void onRowDelete(Row row)
+        {
+            this.row = row;
+        }
+
+        public void commit()
+        {
+            if (row == null && partitionDelete == null)
+                return;
+
+            try (OpOrder.Group opGroup = Keyspace.writeOrder.start())
+            {
+                for (Index index : indexes)
+                {
+                    Index.Indexer indexer = index.indexerFor(key, columns, nowInSec, opGroup, Type.CLEANUP);
+                    if (indexer == null)
+                        continue;
+
+                    indexer.begin();
+
+                    if (partitionDelete != null)
+                        indexer.partitionDelete(partitionDelete);
+
+                    if (row != null)
+                        indexer.removeRow(row);
+
+                    indexer.finish();
+                }
+            }
+        }
+    }
+
+    private static void executeBlocking(Callable<?> task)
+    {
+        if (null != task)
+            FBUtilities.waitOnFuture(blockingExecutor.submit(task));
+    }
+
+    private static void executeAllBlocking(Stream<Index> indexers, Function<Index, Callable<?>> function)
+    {
+        List<Future<?>> waitFor = new ArrayList<>();
+        indexers.forEach(indexer -> {
+            Callable<?> task = function.apply(indexer);
+            if (null != task)
+                waitFor.add(blockingExecutor.submit(task));
+        });
+        FBUtilities.waitOnFutures(waitFor);
+    }
+
+    @VisibleForTesting
+    public static void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
+    {
+        ExecutorService[] executors = new ExecutorService[]{ asyncExecutor, blockingExecutor };
+        shutdown(executors);
+        awaitTermination(timeout, unit, executors);
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
new file mode 100644
index 0000000..ad5dd4b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java

@@ -0,0 +1,912 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.internal;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Future;
+import java.util.function.BiFunction;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.EmptyType;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.index.SecondaryIndexBuilder;
+import org.apache.cassandra.index.internal.composites.CompositesSearcher;
+import org.apache.cassandra.index.internal.keys.KeysSearcher;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.io.sstable.ReducingKeyIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.concurrent.Refs;
+
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+
+/**
+ * Index implementation which indexes the values for a single column in the base
+ * table and which stores its index data in a local, hidden table.
+ */
+public abstract class CassandraIndex implements Index
+{
+    private static final Logger logger = LoggerFactory.getLogger(CassandraIndex.class);
+
+    public static final Pattern TARGET_REGEX = Pattern.compile("^(keys|entries|values|full)\\((.+)\\)$");
+
+    public final ColumnFamilyStore baseCfs;
+    protected IndexMetadata metadata;
+    protected ColumnFamilyStore indexCfs;
+    protected ColumnDefinition indexedColumn;
+    protected CassandraIndexFunctions functions;
+
+    protected CassandraIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        this.baseCfs = baseCfs;
+        setMetadata(indexDef);
+    }
+
+    /**
+     * Returns true if an index of this type can support search predicates of the form [column] OPERATOR [value]
+     * @param indexedColumn
+     * @param operator
+     * @return
+     */
+    protected boolean supportsOperator(ColumnDefinition indexedColumn, Operator operator)
+    {
+        return operator == Operator.EQ;
+    }
+
+    /**
+     * Used to construct an the clustering for an entry in the index table based on values from the base data.
+     * The clustering columns in the index table encode the values required to retrieve the correct data from the base
+     * table and varies depending on the kind of the indexed column. See indexCfsMetadata for more details
+     * Used whenever a row in the index table is written or deleted.
+     * @param partitionKey from the base data being indexed
+     * @param prefix from the base data being indexed
+     * @param path from the base data being indexed
+     * @return a clustering prefix to be used to insert into the index table
+     */
+    protected abstract CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                                           ClusteringPrefix prefix,
+                                                           CellPath path);
+
+    /**
+     * Used at search time to convert a row in the index table into a simple struct containing the values required
+     * to retrieve the corresponding row from the base table.
+     * @param indexedValue the partition key of the indexed table (i.e. the value that was indexed)
+     * @param indexEntry a row from the index table
+     * @return
+     */
+    public abstract IndexEntry decodeEntry(DecoratedKey indexedValue,
+                                           Row indexEntry);
+
+    /**
+     * Check whether a value retrieved from an index is still valid by comparing it to current row from the base table.
+     * Used at read time to identify out of date index entries so that they can be excluded from search results and
+     * repaired
+     * @param row the current row from the primary data table
+     * @param indexValue the value we retrieved from the index
+     * @param nowInSec
+     * @return true if the index is out of date and the entry should be dropped
+     */
+    public abstract boolean isStale(Row row, ByteBuffer indexValue, int nowInSec);
+
+    /**
+     * Extract the value to be inserted into the index from the components of the base data
+     * @param partitionKey from the primary data
+     * @param clustering from the primary data
+     * @param path from the primary data
+     * @param cellValue from the primary data
+     * @return a ByteBuffer containing the value to be inserted in the index. This will be used to make the partition
+     * key in the index table
+     */
+    protected abstract ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                                  Clustering clustering,
+                                                  CellPath path,
+                                                  ByteBuffer cellValue);
+
+    public ColumnDefinition getIndexedColumn()
+    {
+        return indexedColumn;
+    }
+
+    public ClusteringComparator getIndexComparator()
+    {
+        return indexCfs.metadata.comparator;
+    }
+
+    public ColumnFamilyStore getIndexCfs()
+    {
+        return indexCfs;
+    }
+
+    public void register(IndexRegistry registry)
+    {
+        registry.registerIndex(this);
+    }
+
+    public Callable<?> getInitializationTask()
+    {
+        // if we're just linking in the index on an already-built index post-restart or if the base
+        // table is empty we've nothing to do. Otherwise, submit for building via SecondaryIndexBuilder
+        return isBuilt() || baseCfs.isEmpty() ? null : getBuildIndexTask();
+    }
+
+    public IndexMetadata getIndexMetadata()
+    {
+        return metadata;
+    }
+
+    public Optional<ColumnFamilyStore> getBackingTable()
+    {
+        return indexCfs == null ? Optional.empty() : Optional.of(indexCfs);
+    }
+
+    public Callable<Void> getBlockingFlushTask()
+    {
+        return () -> {
+            indexCfs.forceBlockingFlush();
+            return null;
+        };
+    }
+
+    public Callable<?> getInvalidateTask()
+    {
+        return () -> {
+            invalidate();
+            return null;
+        };
+    }
+
+    public Callable<?> getMetadataReloadTask(IndexMetadata indexDef)
+    {
+        return () -> {
+            indexCfs.metadata.reloadIndexMetadataProperties(baseCfs.metadata);
+            indexCfs.reload();
+            return null;
+        };
+    }
+
+    @Override
+    public void validate(ReadCommand command) throws InvalidRequestException
+    {
+        Optional<RowFilter.Expression> target = getTargetExpression(command.rowFilter().getExpressions());
+
+        if (target.isPresent())
+        {
+            ByteBuffer indexValue = target.get().getIndexValue();
+            checkFalse(indexValue.remaining() > FBUtilities.MAX_UNSIGNED_SHORT,
+                       "Index expression values may not be larger than 64K");
+        }
+    }
+
+    private void setMetadata(IndexMetadata indexDef)
+    {
+        metadata = indexDef;
+        Pair<ColumnDefinition, IndexTarget.Type> target = parseTarget(baseCfs.metadata, indexDef);
+        functions = getFunctions(indexDef, target);
+        CFMetaData cfm = indexCfsMetadata(baseCfs.metadata, indexDef);
+        indexCfs = ColumnFamilyStore.createColumnFamilyStore(baseCfs.keyspace,
+                                                             cfm.cfName,
+                                                             cfm,
+                                                             baseCfs.getTracker().loadsstables);
+        indexedColumn = target.left;
+    }
+
+    public Callable<?> getTruncateTask(final long truncatedAt)
+    {
+        return () -> {
+            indexCfs.discardSSTables(truncatedAt);
+            return null;
+        };
+    }
+
+    public boolean shouldBuildBlocking()
+    {
+        // built-in indexes are always included in builds initiated from SecondaryIndexManager
+        return true;
+    }
+
+    public boolean dependsOn(ColumnDefinition column)
+    {
+        return indexedColumn.name.equals(column.name);
+    }
+
+    public boolean supportsExpression(ColumnDefinition column, Operator operator)
+    {
+        return indexedColumn.name.equals(column.name)
+               && supportsOperator(indexedColumn, operator);
+    }
+
+    private boolean supportsExpression(RowFilter.Expression expression)
+    {
+        return supportsExpression(expression.column(), expression.operator());
+    }
+
+    public AbstractType<?> customExpressionValueType()
+    {
+        return null;
+    }
+
+    public long getEstimatedResultRows()
+    {
+        long totalRows = 0;
+        long totalPartitions = 0;
+        for (SSTableReader sstable : indexCfs.getSSTables(SSTableSet.CANONICAL))
+        {
+            if (sstable.descriptor.version.storeRows())
+            {
+                totalPartitions += sstable.getEstimatedPartitionSize().count();
+                totalRows += sstable.getTotalRows();
+            } else
+            {
+                // for legacy sstables we don't have a total row count so we approximate it
+                // using estimated column count (which is the same logic as pre-3.0
+                // see CASSANDRA-15259
+                long colCount = sstable.getEstimatedColumnCount().count();
+                totalPartitions += colCount;
+                totalRows += sstable.getEstimatedColumnCount().mean() * colCount;
+            }
+        }
+
+        return totalPartitions > 0 ? (int) (totalRows / totalPartitions) : 0;
+    }
+
+    /**
+     * No post processing of query results, just return them unchanged
+     */
+    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command)
+    {
+        return (partitionIterator, readCommand) -> partitionIterator;
+    }
+
+    public RowFilter getPostIndexQueryFilter(RowFilter filter)
+    {
+        return getTargetExpression(filter.getExpressions()).map(filter::without)
+                                                           .orElse(filter);
+    }
+
+    private Optional<RowFilter.Expression> getTargetExpression(List<RowFilter.Expression> expressions)
+    {
+        return expressions.stream().filter(this::supportsExpression).findFirst();
+    }
+
+    public Index.Searcher searcherFor(ReadCommand command)
+    {
+        Optional<RowFilter.Expression> target = getTargetExpression(command.rowFilter().getExpressions());
+
+        if (target.isPresent())
+        {
+            target.get().validateForIndexing();
+            switch (getIndexMetadata().kind)
+            {
+                case COMPOSITES:
+                    return new CompositesSearcher(command, target.get(), this);
+                case KEYS:
+                    return new KeysSearcher(command, target.get(), this);
+                default:
+                    throw new IllegalStateException(String.format("Unsupported index type %s for index %s on %s",
+                                                                  metadata.kind,
+                                                                  metadata.name,
+                                                                  indexedColumn.name.toString()));
+            }
+        }
+
+        return null;
+
+    }
+
+    public void validate(PartitionUpdate update) throws InvalidRequestException
+    {
+        switch (indexedColumn.kind)
+        {
+            case PARTITION_KEY:
+                validatePartitionKey(update.partitionKey());
+                break;
+            case CLUSTERING:
+                validateClusterings(update);
+                break;
+            case REGULAR:
+                if (update.columns().regulars.contains(indexedColumn))
+                    validateRows(update);
+                break;
+            case STATIC:
+                if (update.columns().statics.contains(indexedColumn))
+                    validateRows(Collections.singleton(update.staticRow()));
+                break;
+        }
+    }
+
+    public Indexer indexerFor(final DecoratedKey key,
+                              final PartitionColumns columns,
+                              final int nowInSec,
+                              final OpOrder.Group opGroup,
+                              final IndexTransaction.Type transactionType)
+    {
+        /**
+         * Indexes on regular and static columns (the non primary-key ones) only care about updates with live
+         * data for the column they index. In particular, they don't care about having just row or range deletions
+         * as they don't know how to update the index table unless they know exactly the value that is deleted.
+         *
+         * Note that in practice this means that those indexes are only purged of stale entries on compaction,
+         * when we resolve both the deletion and the prior data it deletes. Of course, such stale entries are also
+         * filtered on read.
+         */
+        if (!isPrimaryKeyIndex() && !columns.contains(indexedColumn))
+            return null;
+
+        return new Indexer()
+        {
+            public void begin()
+            {
+            }
+
+            public void partitionDelete(DeletionTime deletionTime)
+            {
+            }
+
+            public void rangeTombstone(RangeTombstone tombstone)
+            {
+            }
+
+            public void insertRow(Row row)
+            {
+                if (row.isStatic() && !indexedColumn.isStatic() && !indexedColumn.isPartitionKey())
+                    return;
+
+                if (isPrimaryKeyIndex())
+                {
+                    indexPrimaryKey(row.clustering(),
+                                    getPrimaryKeyIndexLiveness(row),
+                                    row.deletion());
+                }
+                else
+                {
+                    if (indexedColumn.isComplex())
+                        indexCells(row.clustering(), row.getComplexColumnData(indexedColumn));
+                    else
+                        indexCell(row.clustering(), row.getCell(indexedColumn));
+                }
+            }
+
+            public void removeRow(Row row)
+            {
+                if (isPrimaryKeyIndex())
+                    return;
+
+                if (indexedColumn.isComplex())
+                    removeCells(row.clustering(), row.getComplexColumnData(indexedColumn));
+                else
+                    removeCell(row.clustering(), row.getCell(indexedColumn));
+            }
+
+            public void updateRow(Row oldRow, Row newRow)
+            {
+                assert oldRow.isStatic() == newRow.isStatic();
+                if (newRow.isStatic() != indexedColumn.isStatic())
+                    return;
+
+                if (isPrimaryKeyIndex())
+                    indexPrimaryKey(newRow.clustering(),
+                                    newRow.primaryKeyLivenessInfo(),
+                                    newRow.deletion());
+
+                if (indexedColumn.isComplex())
+                {
+                    indexCells(newRow.clustering(), newRow.getComplexColumnData(indexedColumn));
+                    removeCells(oldRow.clustering(), oldRow.getComplexColumnData(indexedColumn));
+                }
+                else
+                {
+                    indexCell(newRow.clustering(), newRow.getCell(indexedColumn));
+                    removeCell(oldRow.clustering(), oldRow.getCell(indexedColumn));
+                }
+            }
+
+            public void finish()
+            {
+            }
+
+            private void indexCells(Clustering clustering, Iterable<Cell> cells)
+            {
+                if (cells == null)
+                    return;
+
+                for (Cell cell : cells)
+                    indexCell(clustering, cell);
+            }
+
+            private void indexCell(Clustering clustering, Cell cell)
+            {
+                if (cell == null || !cell.isLive(nowInSec))
+                    return;
+
+                insert(key.getKey(),
+                       clustering,
+                       cell,
+                       LivenessInfo.create(cell.timestamp(), cell.ttl(), cell.localDeletionTime()),
+                       opGroup);
+            }
+
+            private void removeCells(Clustering clustering, Iterable<Cell> cells)
+            {
+                if (cells == null)
+                    return;
+
+                for (Cell cell : cells)
+                    removeCell(clustering, cell);
+            }
+
+            private void removeCell(Clustering clustering, Cell cell)
+            {
+                if (cell == null || !cell.isLive(nowInSec))
+                    return;
+
+                delete(key.getKey(), clustering, cell, opGroup, nowInSec);
+            }
+
+            private void indexPrimaryKey(final Clustering clustering,
+                                         final LivenessInfo liveness,
+                                         final Row.Deletion deletion)
+            {
+                if (liveness.timestamp() != LivenessInfo.NO_TIMESTAMP)
+                    insert(key.getKey(), clustering, null, liveness, opGroup);
+
+                if (!deletion.isLive())
+                    delete(key.getKey(), clustering, deletion.time(), opGroup);
+            }
+
+            private LivenessInfo getPrimaryKeyIndexLiveness(Row row)
+            {
+                long timestamp = row.primaryKeyLivenessInfo().timestamp();
+                int ttl = row.primaryKeyLivenessInfo().ttl();
+                for (Cell cell : row.cells())
+                {
+                    long cellTimestamp = cell.timestamp();
+                    if (cell.isLive(nowInSec))
+                    {
+                        if (cellTimestamp > timestamp)
+                        {
+                            timestamp = cellTimestamp;
+                            ttl = cell.ttl();
+                        }
+                    }
+                }
+                return LivenessInfo.create(baseCfs.metadata, timestamp, ttl, nowInSec);
+            }
+        };
+    }
+
+    /**
+     * Specific to internal indexes, this is called by a
+     * searcher when it encounters a stale entry in the index
+     * @param indexKey the partition key in the index table
+     * @param indexClustering the clustering in the index table
+     * @param deletion deletion timestamp etc
+     * @param opGroup the operation under which to perform the deletion
+     */
+    public void deleteStaleEntry(DecoratedKey indexKey,
+                                 Clustering indexClustering,
+                                 DeletionTime deletion,
+                                 OpOrder.Group opGroup)
+    {
+        doDelete(indexKey, indexClustering, deletion, opGroup);
+        logger.trace("Removed index entry for stale value {}", indexKey);
+    }
+
+    /**
+     * Called when adding a new entry to the index
+     */
+    private void insert(ByteBuffer rowKey,
+                        Clustering clustering,
+                        Cell cell,
+                        LivenessInfo info,
+                        OpOrder.Group opGroup)
+    {
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey,
+                                                               clustering,
+                                                               cell));
+        Row row = BTreeRow.noCellLiveRow(buildIndexClustering(rowKey, clustering, cell), info);
+        PartitionUpdate upd = partitionUpdate(valueKey, row);
+        indexCfs.apply(upd, UpdateTransaction.NO_OP, opGroup, null);
+        logger.trace("Inserted entry into index for value {}", valueKey);
+    }
+
+    /**
+     * Called when deleting entries on non-primary key columns
+     */
+    private void delete(ByteBuffer rowKey,
+                        Clustering clustering,
+                        Cell cell,
+                        OpOrder.Group opGroup,
+                        int nowInSec)
+    {
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey,
+                                                               clustering,
+                                                               cell));
+        doDelete(valueKey,
+                 buildIndexClustering(rowKey, clustering, cell),
+                 new DeletionTime(cell.timestamp(), nowInSec),
+                 opGroup);
+    }
+
+    /**
+     * Called when deleting entries from indexes on primary key columns
+     */
+    private void delete(ByteBuffer rowKey,
+                        Clustering clustering,
+                        DeletionTime deletion,
+                        OpOrder.Group opGroup)
+    {
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey,
+                                                               clustering,
+                                                               null));
+        doDelete(valueKey,
+                 buildIndexClustering(rowKey, clustering, null),
+                 deletion,
+                 opGroup);
+    }
+
+    private void doDelete(DecoratedKey indexKey,
+                          Clustering indexClustering,
+                          DeletionTime deletion,
+                          OpOrder.Group opGroup)
+    {
+        Row row = BTreeRow.emptyDeletedRow(indexClustering, Row.Deletion.regular(deletion));
+        PartitionUpdate upd = partitionUpdate(indexKey, row);
+        indexCfs.apply(upd, UpdateTransaction.NO_OP, opGroup, null);
+        logger.trace("Removed index entry for value {}", indexKey);
+    }
+
+    private void validatePartitionKey(DecoratedKey partitionKey) throws InvalidRequestException
+    {
+        assert indexedColumn.isPartitionKey();
+        validateIndexedValue(getIndexedValue(partitionKey.getKey(), null, null));
+    }
+
+    private void validateClusterings(PartitionUpdate update) throws InvalidRequestException
+    {
+        assert indexedColumn.isClusteringColumn();
+        for (Row row : update)
+            validateIndexedValue(getIndexedValue(null, row.clustering(), null));
+    }
+
+    private void validateRows(Iterable<Row> rows)
+    {
+        assert !indexedColumn.isPrimaryKeyColumn();
+        for (Row row : rows)
+        {
+            if (indexedColumn.isComplex())
+            {
+                ComplexColumnData data = row.getComplexColumnData(indexedColumn);
+                if (data != null)
+                {
+                    for (Cell cell : data)
+                    {
+                        validateIndexedValue(getIndexedValue(null, null, cell.path(), cell.value()));
+                    }
+                }
+            }
+            else
+            {
+                validateIndexedValue(getIndexedValue(null, null, row.getCell(indexedColumn)));
+            }
+        }
+    }
+
+    private void validateIndexedValue(ByteBuffer value)
+    {
+        if (value != null && value.remaining() >= FBUtilities.MAX_UNSIGNED_SHORT)
+            throw new InvalidRequestException(String.format(
+                                                           "Cannot index value of size %d for index %s on %s.%s(%s) (maximum allowed size=%d)",
+                                                           value.remaining(),
+                                                           metadata.name,
+                                                           baseCfs.metadata.ksName,
+                                                           baseCfs.metadata.cfName,
+                                                           indexedColumn.name.toString(),
+                                                           FBUtilities.MAX_UNSIGNED_SHORT));
+    }
+
+    private ByteBuffer getIndexedValue(ByteBuffer rowKey,
+                                       Clustering clustering,
+                                       Cell cell)
+    {
+        return getIndexedValue(rowKey,
+                               clustering,
+                               cell == null ? null : cell.path(),
+                               cell == null ? null : cell.value()
+        );
+    }
+
+    private Clustering buildIndexClustering(ByteBuffer rowKey,
+                                            Clustering clustering,
+                                            Cell cell)
+    {
+        return buildIndexClusteringPrefix(rowKey,
+                                          clustering,
+                                          cell == null ? null : cell.path()).build();
+    }
+
+    private DecoratedKey getIndexKeyFor(ByteBuffer value)
+    {
+        return indexCfs.decorateKey(value);
+    }
+
+    private PartitionUpdate partitionUpdate(DecoratedKey valueKey, Row row)
+    {
+        return PartitionUpdate.singleRowUpdate(indexCfs.metadata, valueKey, row);
+    }
+
+    private void invalidate()
+    {
+        // interrupt in-progress compactions
+        Collection<ColumnFamilyStore> cfss = Collections.singleton(indexCfs);
+        CompactionManager.instance.interruptCompactionForCFs(cfss, true);
+        CompactionManager.instance.waitForCessation(cfss);
+        Keyspace.writeOrder.awaitNewBarrier();
+        indexCfs.forceBlockingFlush();
+        indexCfs.readOrdering.awaitNewBarrier();
+        indexCfs.invalidate();
+    }
+
+    private boolean isBuilt()
+    {
+        return SystemKeyspace.isIndexBuilt(baseCfs.keyspace.getName(), metadata.name);
+    }
+
+    private boolean isPrimaryKeyIndex()
+    {
+        return indexedColumn.isPrimaryKeyColumn();
+    }
+
+    private Callable<?> getBuildIndexTask()
+    {
+        return () -> {
+            buildBlocking();
+            return null;
+        };
+    }
+
+    private void buildBlocking()
+    {
+        baseCfs.forceBlockingFlush();
+
+        try (ColumnFamilyStore.RefViewFragment viewFragment = baseCfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
+             Refs<SSTableReader> sstables = viewFragment.refs)
+        {
+            if (sstables.isEmpty())
+            {
+                logger.info("No SSTable data for {}.{} to build index {} from, marking empty index as built",
+                            baseCfs.metadata.ksName,
+                            baseCfs.metadata.cfName,
+                            metadata.name);
+                baseCfs.indexManager.markIndexBuilt(metadata.name);
+                return;
+            }
+
+            logger.info("Submitting index build of {} for data in {}",
+                        metadata.name,
+                        getSSTableNames(sstables));
+
+            SecondaryIndexBuilder builder = new SecondaryIndexBuilder(baseCfs,
+                                                                      Collections.singleton(this),
+                                                                      new ReducingKeyIterator(sstables));
+            Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
+            FBUtilities.waitOnFuture(future);
+            indexCfs.forceBlockingFlush();
+            baseCfs.indexManager.markIndexBuilt(metadata.name);
+        }
+        logger.info("Index build of {} complete", metadata.name);
+    }
+
+    private static String getSSTableNames(Collection<SSTableReader> sstables)
+    {
+        return StreamSupport.stream(sstables.spliterator(), false)
+                            .map(SSTableReader::toString)
+                            .collect(Collectors.joining(", "));
+    }
+
+    /**
+     * Construct the CFMetadata for an index table, the clustering columns in the index table
+     * vary dependent on the kind of the indexed value.
+     * @param baseCfsMetadata
+     * @param indexMetadata
+     * @return
+     */
+    public static final CFMetaData indexCfsMetadata(CFMetaData baseCfsMetadata, IndexMetadata indexMetadata)
+    {
+        Pair<ColumnDefinition, IndexTarget.Type> target = parseTarget(baseCfsMetadata, indexMetadata);
+        CassandraIndexFunctions utils = getFunctions(indexMetadata, target);
+        ColumnDefinition indexedColumn = target.left;
+        AbstractType<?> indexedValueType = utils.getIndexedValueType(indexedColumn);
+
+        // Tables for legacy KEYS indexes are non-compound and dense
+        CFMetaData.Builder builder = indexMetadata.isKeys()
+                                     ? CFMetaData.Builder.create(baseCfsMetadata.ksName,
+                                                                 baseCfsMetadata.indexColumnFamilyName(indexMetadata),
+                                                                 true, false, false)
+                                     : CFMetaData.Builder.create(baseCfsMetadata.ksName,
+                                                                 baseCfsMetadata.indexColumnFamilyName(indexMetadata));
+
+        builder =  builder.withId(baseCfsMetadata.cfId)
+                          .withPartitioner(new LocalPartitioner(indexedValueType))
+                          .addPartitionKey(indexedColumn.name, indexedColumn.type)
+                          .addClusteringColumn("partition_key", baseCfsMetadata.partitioner.partitionOrdering());
+
+        if (indexMetadata.isKeys())
+        {
+            // A dense, compact table for KEYS indexes must have a compact
+            // value column defined, even though it is never used
+            CompactTables.DefaultNames names =
+                CompactTables.defaultNameGenerator(ImmutableSet.of(indexedColumn.name.toString(), "partition_key"));
+            builder = builder.addRegularColumn(names.defaultCompactValueName(), EmptyType.instance);
+        }
+        else
+        {
+            // The clustering columns for a table backing a COMPOSITES index are dependent
+            // on the specific type of index (there are specializations for indexes on collections)
+            builder = utils.addIndexClusteringColumns(builder, baseCfsMetadata, indexedColumn);
+        }
+
+        return builder.build().reloadIndexMetadataProperties(baseCfsMetadata);
+    }
+
+    /**
+     * Factory method for new CassandraIndex instances
+     * @param baseCfs
+     * @param indexMetadata
+     * @return
+     */
+    public static CassandraIndex newIndex(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+    {
+        return getFunctions(indexMetadata, parseTarget(baseCfs.metadata, indexMetadata)).newIndexInstance(baseCfs, indexMetadata);
+    }
+
+    public static Pair<ColumnDefinition, IndexTarget.Type> parseTarget(CFMetaData cfm, IndexMetadata indexDef)
+    {
+        String target = indexDef.options.get("target");
+        assert target != null : String.format("No target definition found for index %s", indexDef.name);
+        Pair<ColumnDefinition, IndexTarget.Type> result = parseTarget(cfm, target);
+        if (result == null)
+            throw new ConfigurationException(String.format("Unable to parse targets for index %s (%s)", indexDef.name, target));
+        return result;
+    }
+
+    // Public because it's also used to convert index metadata into a thrift-compatible format
+    public static Pair<ColumnDefinition, IndexTarget.Type> parseTarget(CFMetaData cfm,
+                                                                       String target)
+    {
+        // if the regex matches then the target is in the form "keys(foo)", "entries(bar)" etc
+        // if not, then it must be a simple column name and implictly its type is VALUES
+        Matcher matcher = TARGET_REGEX.matcher(target);
+        String columnName;
+        IndexTarget.Type targetType;
+        if (matcher.matches())
+        {
+            targetType = IndexTarget.Type.fromString(matcher.group(1));
+            columnName = matcher.group(2);
+        }
+        else
+        {
+            columnName = target;
+            targetType = IndexTarget.Type.VALUES;
+        }
+
+        // in the case of a quoted column name the name in the target string
+        // will be enclosed in quotes, which we need to unwrap. It may also
+        // include quote characters internally, escaped like so:
+        //      abc"def -> abc""def.
+        // Because the target string is stored in a CQL compatible form, we
+        // need to un-escape any such quotes to get the actual column name
+        if (columnName.startsWith("\""))
+        {
+            columnName = StringUtils.substring(StringUtils.substring(columnName, 1), 0, -1);
+            columnName = columnName.replaceAll("\"\"", "\"");
+        }
+
+        // if it's not a CQL table, we can't assume that the column name is utf8, so
+        // in that case we have to do a linear scan of the cfm's columns to get the matching one.
+        // After dropping compact storage (see CASSANDRA-10857), we can't distinguish between the
+        // former compact/thrift table, so we have to fall back to linear scan in both cases.
+        ColumnDefinition cd = cfm.getColumnDefinition(new ColumnIdentifier(columnName, true));
+        if (cd != null)
+            return Pair.create(cd, targetType);
+
+        for (ColumnDefinition column : cfm.allColumns())
+            if (column.name.toString().equals(columnName))
+                return Pair.create(column, targetType);
+
+        return null;
+    }
+
+    static CassandraIndexFunctions getFunctions(IndexMetadata indexDef,
+                                                Pair<ColumnDefinition, IndexTarget.Type> target)
+    {
+        if (indexDef.isKeys())
+            return CassandraIndexFunctions.KEYS_INDEX_FUNCTIONS;
+
+        ColumnDefinition indexedColumn = target.left;
+        if (indexedColumn.type.isCollection() && indexedColumn.type.isMultiCell())
+        {
+            switch (((CollectionType)indexedColumn.type).kind)
+            {
+                case LIST:
+                    return CassandraIndexFunctions.COLLECTION_VALUE_INDEX_FUNCTIONS;
+                case SET:
+                    return CassandraIndexFunctions.COLLECTION_KEY_INDEX_FUNCTIONS;
+                case MAP:
+                    switch (target.right)
+                    {
+                        case KEYS:
+                            return CassandraIndexFunctions.COLLECTION_KEY_INDEX_FUNCTIONS;
+                        case KEYS_AND_VALUES:
+                            return CassandraIndexFunctions.COLLECTION_ENTRY_INDEX_FUNCTIONS;
+                        case VALUES:
+                            return CassandraIndexFunctions.COLLECTION_VALUE_INDEX_FUNCTIONS;
+                    }
+                    throw new AssertionError();
+            }
+        }
+
+        switch (indexedColumn.kind)
+        {
+            case CLUSTERING:
+                return CassandraIndexFunctions.CLUSTERING_COLUMN_INDEX_FUNCTIONS;
+            case REGULAR:
+                return CassandraIndexFunctions.REGULAR_COLUMN_INDEX_FUNCTIONS;
+            case PARTITION_KEY:
+                return CassandraIndexFunctions.PARTITION_KEY_INDEX_FUNCTIONS;
+            //case COMPACT_VALUE:
+            //    return new CompositesIndexOnCompactValue();
+        }
+        throw new AssertionError();
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndexFunctions.java b/src/java/org/apache/cassandra/index/internal/CassandraIndexFunctions.java
new file mode 100644
index 0000000..b7cb3a2
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndexFunctions.java

@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.internal;
+
+import java.util.List;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.index.internal.composites.*;
+import org.apache.cassandra.index.internal.keys.KeysIndex;
+import org.apache.cassandra.schema.IndexMetadata;
+
+public interface CassandraIndexFunctions
+{
+    /**
+     *
+     * @param baseCfs
+     * @param indexMetadata
+     * @return
+     */
+    public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata);
+
+    /**
+     * Returns the type of the the values in the index. For most columns this is simply its type, but for collections
+     * it depends on whether the index is on the collection name/value element or on a frozen collection
+     * @param indexedColumn
+     * @return
+     */
+    default AbstractType<?> getIndexedValueType(ColumnDefinition indexedColumn)
+    {
+        return indexedColumn.type;
+    }
+
+    /**
+     * Add the clustering columns for a specific type of index table to the a CFMetaData.Builder (which is being
+     * used to construct the index table's CFMetadata. In the default implementation, the clustering columns of the
+     * index table hold the partition key & clustering columns of the base table. This is overridden in several cases:
+     * * When the indexed value is itself a clustering column, in which case, we only need store the base table's
+     *   *other* clustering values in the index - the indexed value being the index table's partition key
+     * * When the indexed value is a collection value, in which case we also need to capture the cell path from the base
+     *   table
+     * * In a KEYS index (for thrift/compact storage/static column indexes), where only the base partition key is
+     *   held in the index table.
+     *
+     * Called from indexCfsMetadata
+     * @param builder
+     * @param baseMetadata
+     * @param cfDef
+     * @return
+     */
+    default CFMetaData.Builder addIndexClusteringColumns(CFMetaData.Builder builder,
+                                                         CFMetaData baseMetadata,
+                                                         ColumnDefinition cfDef)
+    {
+        for (ColumnDefinition def : baseMetadata.clusteringColumns())
+            builder.addClusteringColumn(def.name, def.type);
+        return builder;
+    }
+
+    /*
+     * implementations providing specializations for the built in index types
+     */
+
+    static final CassandraIndexFunctions KEYS_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new KeysIndex(baseCfs, indexMetadata);
+        }
+    };
+
+    static final CassandraIndexFunctions REGULAR_COLUMN_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new RegularColumnIndex(baseCfs, indexMetadata);
+        }
+    };
+
+    static final CassandraIndexFunctions CLUSTERING_COLUMN_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new ClusteringColumnIndex(baseCfs, indexMetadata);
+        }
+
+        public CFMetaData.Builder addIndexClusteringColumns(CFMetaData.Builder builder,
+                                                            CFMetaData baseMetadata,
+                                                            ColumnDefinition columnDef)
+        {
+            List<ColumnDefinition> cks = baseMetadata.clusteringColumns();
+            for (int i = 0; i < columnDef.position(); i++)
+            {
+                ColumnDefinition def = cks.get(i);
+                builder.addClusteringColumn(def.name, def.type);
+            }
+            for (int i = columnDef.position() + 1; i < cks.size(); i++)
+            {
+                ColumnDefinition def = cks.get(i);
+                builder.addClusteringColumn(def.name, def.type);
+            }
+            return builder;
+        }
+    };
+
+    static final CassandraIndexFunctions PARTITION_KEY_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new PartitionKeyIndex(baseCfs, indexMetadata);
+        }
+    };
+
+    static final CassandraIndexFunctions COLLECTION_KEY_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new CollectionKeyIndex(baseCfs, indexMetadata);
+        }
+
+        public AbstractType<?> getIndexedValueType(ColumnDefinition indexedColumn)
+        {
+            return ((CollectionType) indexedColumn.type).nameComparator();
+        }
+    };
+
+    static final CassandraIndexFunctions COLLECTION_VALUE_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new CollectionValueIndex(baseCfs, indexMetadata);
+        }
+
+        public AbstractType<?> getIndexedValueType(ColumnDefinition indexedColumn)
+        {
+            return ((CollectionType)indexedColumn.type).valueComparator();
+        }
+
+        public CFMetaData.Builder addIndexClusteringColumns(CFMetaData.Builder builder,
+                                                            CFMetaData baseMetadata,
+                                                            ColumnDefinition columnDef)
+        {
+            for (ColumnDefinition def : baseMetadata.clusteringColumns())
+                builder.addClusteringColumn(def.name, def.type);
+
+            // collection key
+            builder.addClusteringColumn("cell_path", ((CollectionType)columnDef.type).nameComparator());
+            return builder;
+        }
+    };
+
+    static final CassandraIndexFunctions COLLECTION_ENTRY_INDEX_FUNCTIONS = new CassandraIndexFunctions()
+    {
+        public CassandraIndex newIndexInstance(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata)
+        {
+            return new CollectionEntryIndex(baseCfs, indexMetadata);
+        }
+
+        public AbstractType<?> getIndexedValueType(ColumnDefinition indexedColumn)
+        {
+            CollectionType colType = (CollectionType)indexedColumn.type;
+            return CompositeType.getInstance(colType.nameComparator(), colType.valueComparator());
+        }
+    };
+}

diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java b/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java
new file mode 100644
index 0000000..d6b39e6
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java

@@ -0,0 +1,192 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.internal;
+
+import java.nio.ByteBuffer;
+import java.util.NavigableSet;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+public abstract class CassandraIndexSearcher implements Index.Searcher
+{
+    private static final Logger logger = LoggerFactory.getLogger(CassandraIndexSearcher.class);
+
+    private final RowFilter.Expression expression;
+    protected final CassandraIndex index;
+    protected final ReadCommand command;
+
+    public CassandraIndexSearcher(ReadCommand command,
+                                  RowFilter.Expression expression,
+                                  CassandraIndex index)
+    {
+        this.command = command;
+        this.expression = expression;
+        this.index = index;
+    }
+
+    @SuppressWarnings("resource") // Both the OpOrder and 'indexIter' are closed on exception, or through the closing of the result
+    // of this method.
+    public UnfilteredPartitionIterator search(ReadOrderGroup orderGroup)
+    {
+        // the value of the index expression is the partition key in the index table
+        DecoratedKey indexKey = index.getBackingTable().get().decorateKey(expression.getIndexValue());
+        UnfilteredRowIterator indexIter = queryIndex(indexKey, command, orderGroup);
+        try
+        {
+            return queryDataFromIndex(indexKey, UnfilteredRowIterators.filter(indexIter, command.nowInSec()), command, orderGroup);
+        }
+        catch (RuntimeException | Error e)
+        {
+            indexIter.close();
+            throw e;
+        }
+    }
+
+    private UnfilteredRowIterator queryIndex(DecoratedKey indexKey, ReadCommand command, ReadOrderGroup orderGroup)
+    {
+        ClusteringIndexFilter filter = makeIndexFilter(command);
+        ColumnFamilyStore indexCfs = index.getBackingTable().get();
+        CFMetaData indexCfm = indexCfs.metadata;
+        return SinglePartitionReadCommand.create(indexCfm, command.nowInSec(), indexKey, ColumnFilter.all(indexCfm), filter)
+                                         .queryMemtableAndDisk(indexCfs, orderGroup.indexReadOpOrderGroup());
+    }
+
+    private ClusteringIndexFilter makeIndexFilter(ReadCommand command)
+    {
+        if (command instanceof SinglePartitionReadCommand)
+        {
+            // Note: as yet there's no route to get here - a 2i query *always* uses a
+            // PartitionRangeReadCommand. This is here in preparation for coming changes
+            // in SelectStatement.
+            SinglePartitionReadCommand sprc = (SinglePartitionReadCommand)command;
+            ByteBuffer pk = sprc.partitionKey().getKey();
+            ClusteringIndexFilter filter = sprc.clusteringIndexFilter();
+
+            if (filter instanceof ClusteringIndexNamesFilter)
+            {
+                NavigableSet<Clustering> requested = ((ClusteringIndexNamesFilter)filter).requestedRows();
+                BTreeSet.Builder<Clustering> clusterings = BTreeSet.builder(index.getIndexComparator());
+                for (Clustering c : requested)
+                    clusterings.add(makeIndexClustering(pk, c));
+                return new ClusteringIndexNamesFilter(clusterings.build(), filter.isReversed());
+            }
+            else
+            {
+                Slices requested = ((ClusteringIndexSliceFilter)filter).requestedSlices();
+                Slices.Builder builder = new Slices.Builder(index.getIndexComparator());
+                for (Slice slice : requested)
+                    builder.add(makeIndexBound(pk, slice.start()), makeIndexBound(pk, slice.end()));
+                return new ClusteringIndexSliceFilter(builder.build(), filter.isReversed());
+            }
+        }
+        else
+        {
+
+            DataRange dataRange = ((PartitionRangeReadCommand)command).dataRange();
+            AbstractBounds<PartitionPosition> range = dataRange.keyRange();
+
+            Slice slice = Slice.ALL;
+
+            /*
+             * XXX: If the range requested is a token range, we'll have to start at the beginning (and stop at the end) of
+             * the indexed row unfortunately (which will be inefficient), because we have no way to intuit the smallest possible
+             * key having a given token. A potential fix would be to actually store the token along the key in the indexed row.
+             */
+            if (range.left instanceof DecoratedKey)
+            {
+                // the right hand side of the range may not be a DecoratedKey (for instance if we're paging),
+                // but if it is, we can optimise slightly by restricting the slice
+                if (range.right instanceof DecoratedKey)
+                {
+
+                    DecoratedKey startKey = (DecoratedKey) range.left;
+                    DecoratedKey endKey = (DecoratedKey) range.right;
+
+                    Slice.Bound start = Slice.Bound.BOTTOM;
+                    Slice.Bound end = Slice.Bound.TOP;
+
+                    /*
+                     * For index queries over a range, we can't do a whole lot better than querying everything for the key range, though for
+                     * slice queries where we can slightly restrict the beginning and end.
+                     */
+                    if (!dataRange.isNamesQuery())
+                    {
+                        ClusteringIndexSliceFilter startSliceFilter = ((ClusteringIndexSliceFilter) dataRange.clusteringIndexFilter(
+                                                                                                                                   startKey));
+                        ClusteringIndexSliceFilter endSliceFilter = ((ClusteringIndexSliceFilter) dataRange.clusteringIndexFilter(
+                                                                                                                                 endKey));
+
+                        // We can't effectively support reversed queries when we have a range, so we don't support it
+                        // (or through post-query reordering) and shouldn't get there.
+                        assert !startSliceFilter.isReversed() && !endSliceFilter.isReversed();
+
+                        Slices startSlices = startSliceFilter.requestedSlices();
+                        Slices endSlices = endSliceFilter.requestedSlices();
+
+                        if (startSlices.size() > 0)
+                            start = startSlices.get(0).start();
+
+                        if (endSlices.size() > 0)
+                            end = endSlices.get(endSlices.size() - 1).end();
+                    }
+
+                    slice = Slice.make(makeIndexBound(startKey.getKey(), start),
+                                       makeIndexBound(endKey.getKey(), end));
+                }
+                else
+                {
+                    // otherwise, just start the index slice from the key we do have
+                    slice = Slice.make(makeIndexBound(((DecoratedKey)range.left).getKey(), Slice.Bound.BOTTOM),
+                                       Slice.Bound.TOP);
+                }
+            }
+            return new ClusteringIndexSliceFilter(Slices.with(index.getIndexComparator(), slice), false);
+        }
+    }
+
+    private Slice.Bound makeIndexBound(ByteBuffer rowKey, Slice.Bound bound)
+    {
+        return index.buildIndexClusteringPrefix(rowKey, bound, null)
+                                 .buildBound(bound.isStart(), bound.isInclusive());
+    }
+
+    protected Clustering makeIndexClustering(ByteBuffer rowKey, Clustering clustering)
+    {
+        return index.buildIndexClusteringPrefix(rowKey, clustering, null).build();
+    }
+
+    protected abstract UnfilteredPartitionIterator queryDataFromIndex(DecoratedKey indexKey,
+                                                                      RowIterator indexHits,
+                                                                      ReadCommand command,
+                                                                      ReadOrderGroup orderGroup);
+}

diff --git a/src/java/org/apache/cassandra/index/internal/IndexEntry.java b/src/java/org/apache/cassandra/index/internal/IndexEntry.java
new file mode 100644
index 0000000..97525d6
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/IndexEntry.java

@@ -0,0 +1,54 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.internal;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+
+/**
+ * Entries in indexes on non-compact tables (tables with composite comparators)
+ * can be encapsulated as IndexedEntry instances. These are not used when dealing
+ * with indexes on static/compact/thrift tables (i.e. KEYS indexes).
+ */
+public final class IndexEntry
+{
+    public final DecoratedKey indexValue;
+    public final Clustering indexClustering;
+    public final long timestamp;
+
+    public final ByteBuffer indexedKey;
+    public final Clustering indexedEntryClustering;
+
+    public IndexEntry(DecoratedKey indexValue,
+                      Clustering indexClustering,
+                      long timestamp,
+                      ByteBuffer indexedKey,
+                      Clustering indexedEntryClustering)
+    {
+        this.indexValue = indexValue;
+        this.indexClustering = indexClustering;
+        this.timestamp = timestamp;
+        this.indexedKey = indexedKey;
+        this.indexedEntryClustering = indexedEntryClustering;
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/ClusteringColumnIndex.java b/src/java/org/apache/cassandra/index/internal/composites/ClusteringColumnIndex.java
new file mode 100644
index 0000000..cace6de
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/ClusteringColumnIndex.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Index on a CLUSTERING_COLUMN column definition.
+ *
+ * A cell indexed by this index will have the general form:
+ *   ck_0 ... ck_n c_name : v
+ * where ck_i are the cluster keys, c_name the last component of the cell
+ * composite name (or second to last if collections are in use, but this
+ * has no impact) and v the cell value.
+ *
+ * Such a cell is always indexed by this index (or rather, it is indexed if
+ * n >= columnDef.componentIndex, which will always be the case in practice)
+ * and it will generate (makeIndexColumnName()) an index entry whose:
+ *   - row key will be ck_i (getIndexedValue()) where i == columnDef.componentIndex.
+ *   - cell name will
+ *       rk ck_0 ... ck_{i-1} ck_{i+1} ck_n
+ *     where rk is the row key of the initial cell and i == columnDef.componentIndex.
+ */
+public class ClusteringColumnIndex extends CassandraIndex
+{
+    private final boolean enforceStrictLiveness;
+
+    public ClusteringColumnIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+        this.enforceStrictLiveness = baseCfs.metadata.enforceStrictLiveness();
+    }
+
+
+    public ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path, ByteBuffer cellValue)
+    {
+        return clustering.get(indexedColumn.position());
+    }
+
+    public CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        for (int i = 0; i < Math.min(indexedColumn.position(), prefix.size()); i++)
+            builder.add(prefix.get(i));
+        for (int i = indexedColumn.position() + 1; i < prefix.size(); i++)
+            builder.add(prefix.get(i));
+        return builder;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue,
+                                  Row indexEntry)
+    {
+        int ckCount = baseCfs.metadata.clusteringColumns().size();
+
+        Clustering clustering = indexEntry.clustering();
+        CBuilder builder = CBuilder.create(baseCfs.getComparator());
+        for (int i = 0; i < indexedColumn.position(); i++)
+            builder.add(clustering.get(i + 1));
+
+        builder.add(indexedValue.getKey());
+
+        for (int i = indexedColumn.position() + 1; i < ckCount; i++)
+            builder.add(clustering.get(i));
+
+        return new IndexEntry(indexedValue,
+                              clustering,
+                              indexEntry.primaryKeyLivenessInfo().timestamp(),
+                              clustering.get(0),
+                              builder.build());
+    }
+
+    public boolean isStale(Row data, ByteBuffer indexValue, int nowInSec)
+    {
+        return !data.hasLiveData(nowInSec, enforceStrictLiveness);
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/CollectionEntryIndex.java b/src/java/org/apache/cassandra/index/internal/composites/CollectionEntryIndex.java
new file mode 100644
index 0000000..1113600
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/CollectionEntryIndex.java

@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Index on the element and value of cells participating in a collection.
+ *
+ * The row keys for this index are a composite of the collection element
+ * and value of indexed columns.
+ */
+public class CollectionEntryIndex extends CollectionKeyIndexBase
+{
+    public CollectionEntryIndex(ColumnFamilyStore baseCfs,
+                                IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+    }
+
+    public ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path, ByteBuffer cellValue)
+    {
+        return CompositeType.build(path.get(0), cellValue);
+    }
+
+    public boolean isStale(Row data, ByteBuffer indexValue, int nowInSec)
+    {
+        ByteBuffer[] components = ((CompositeType)functions.getIndexedValueType(indexedColumn)).split(indexValue);
+        ByteBuffer mapKey = components[0];
+        ByteBuffer mapValue = components[1];
+
+        ColumnDefinition columnDef = indexedColumn;
+        Cell cell = data.getCell(columnDef, CellPath.create(mapKey));
+        if (cell == null || !cell.isLive(nowInSec))
+            return true;
+
+        AbstractType<?> valueComparator = ((CollectionType)columnDef.type).valueComparator();
+        return valueComparator.compare(mapValue, cell.value()) != 0;
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/CollectionKeyIndex.java b/src/java/org/apache/cassandra/index/internal/composites/CollectionKeyIndex.java
new file mode 100644
index 0000000..42c45e5
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/CollectionKeyIndex.java

@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Index on the collection element of the cell name of a collection.
+ *
+ * The row keys for this index are given by the collection element for
+ * indexed columns.
+ */
+public class CollectionKeyIndex extends CollectionKeyIndexBase
+{
+    public CollectionKeyIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+    }
+
+    public ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path,
+                                      ByteBuffer cellValue)
+    {
+        return path.get(0);
+    }
+
+    public boolean isStale(Row data, ByteBuffer indexValue, int nowInSec)
+    {
+        Cell cell = data.getCell(indexedColumn, CellPath.create(indexValue));
+        return cell == null || !cell.isLive(nowInSec);
+    }
+
+    public boolean supportsOperator(ColumnDefinition indexedColumn, Operator operator)
+    {
+        return operator == Operator.CONTAINS_KEY ||
+               operator == Operator.CONTAINS && indexedColumn.type instanceof SetType;
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/CollectionKeyIndexBase.java b/src/java/org/apache/cassandra/index/internal/composites/CollectionKeyIndexBase.java
new file mode 100644
index 0000000..fe77c96
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/CollectionKeyIndexBase.java

@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Common superclass for indexes that capture collection keys, including
+ * indexes on such keys themselves.
+ *
+ * A cell indexed by this index will have the general form:
+ *   ck_0 ... ck_n c_name [col_elt] : v
+ * where ck_i are the cluster keys, c_name the CQL3 column name, col_elt the
+ * collection element that we want to index (which may or may not be there depending
+ * on whether c_name is the collection we're indexing), and v the cell value.
+ *
+ * Such a cell is indexed if c_name is the indexed collection (in which case we are guaranteed to have
+ * col_elt). The index entry can be viewed in the following way:
+ *   - the row key is determined by subclasses of this type.
+ *   - the cell name will be 'rk ck_0 ... ck_n' where rk is the row key of the initial cell.
+ */
+public abstract class CollectionKeyIndexBase extends CassandraIndex
+{
+    public CollectionKeyIndexBase(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+    }
+
+    public CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        for (int i = 0; i < prefix.size(); i++)
+            builder.add(prefix.get(i));
+
+        return builder;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue,
+                                  Row indexEntry)
+    {
+        int count = 1 + baseCfs.metadata.clusteringColumns().size();
+        Clustering clustering = indexEntry.clustering();
+        CBuilder builder = CBuilder.create(baseCfs.getComparator());
+        for (int i = 0; i < count - 1; i++)
+            builder.add(clustering.get(i + 1));
+
+        return new IndexEntry(indexedValue,
+                              clustering,
+                              indexEntry.primaryKeyLivenessInfo().timestamp(),
+                              clustering.get(0),
+                              builder.build());
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/CollectionValueIndex.java b/src/java/org/apache/cassandra/index/internal/composites/CollectionValueIndex.java
new file mode 100644
index 0000000..95bd7e1
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/CollectionValueIndex.java

@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Index the value of a collection cell.
+ *
+ * This is a lot like an index on REGULAR, except that we also need to make
+ * the collection key part of the index entry so that:
+ *   1) we don't have to scan the whole collection at query time to know the
+ *   entry is stale and if it still satisfies the query.
+ *   2) if a collection has multiple time the same value, we need one entry
+ *   for each so that if we delete one of the value only we only delete the
+ *   entry corresponding to that value.
+ */
+public class CollectionValueIndex extends CassandraIndex
+{
+    public CollectionValueIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+    }
+
+    public ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path, ByteBuffer cellValue)
+    {
+        return cellValue;
+    }
+
+    public CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        for (int i = 0; i < prefix.size(); i++)
+            builder.add(prefix.get(i));
+
+        // When indexing, cell will be present, but when searching, it won't  (CASSANDRA-7525)
+        if (prefix.size() == baseCfs.metadata.clusteringColumns().size() && path != null)
+            builder.add(path.get(0));
+
+        return builder;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue, Row indexEntry)
+    {
+        Clustering clustering = indexEntry.clustering();
+        CBuilder builder = CBuilder.create(baseCfs.getComparator());
+        for (int i = 0; i < baseCfs.getComparator().size(); i++)
+            builder.add(clustering.get(i + 1));
+
+        return new IndexEntry(indexedValue,
+                                clustering,
+                                indexEntry.primaryKeyLivenessInfo().timestamp(),
+                                clustering.get(0),
+                                builder.build());
+    }
+
+    public boolean supportsOperator(ColumnDefinition indexedColumn, Operator operator)
+    {
+        return operator == Operator.CONTAINS && !(indexedColumn.type instanceof SetType);
+    }
+
+    public boolean isStale(Row data, ByteBuffer indexValue, int nowInSec)
+    {
+        ColumnDefinition columnDef = indexedColumn;
+        ComplexColumnData complexData = data.getComplexColumnData(columnDef);
+        if (complexData == null)
+            return true;
+
+        for (Cell cell : complexData)
+        {
+            if (cell.isLive(nowInSec) && ((CollectionType) columnDef.type).valueComparator()
+                                                                          .compare(indexValue, cell.value()) == 0)
+                return false;
+        }
+        return true;
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java b/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java
new file mode 100644
index 0000000..6bb9869
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java

@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.CassandraIndexSearcher;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.utils.btree.BTreeSet;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+
+public class CompositesSearcher extends CassandraIndexSearcher
+{
+    private static final Logger logger = LoggerFactory.getLogger(CompositesSearcher.class);
+
+    public CompositesSearcher(ReadCommand command,
+                              RowFilter.Expression expression,
+                              CassandraIndex index)
+    {
+        super(command, expression, index);
+    }
+
+    private boolean isMatchingEntry(DecoratedKey partitionKey, IndexEntry entry, ReadCommand command)
+    {
+        return command.selectsKey(partitionKey) && command.selectsClustering(partitionKey, entry.indexedEntryClustering);
+    }
+
+    protected UnfilteredPartitionIterator queryDataFromIndex(final DecoratedKey indexKey,
+                                                             final RowIterator indexHits,
+                                                             final ReadCommand command,
+                                                             final ReadOrderGroup orderGroup)
+    {
+        assert indexHits.staticRow() == Rows.EMPTY_STATIC_ROW;
+
+        return new UnfilteredPartitionIterator()
+        {
+            private IndexEntry nextEntry;
+
+            private UnfilteredRowIterator next;
+
+            public boolean isForThrift()
+            {
+                return command.isForThrift();
+            }
+
+            public CFMetaData metadata()
+            {
+                return command.metadata();
+            }
+
+            public boolean hasNext()
+            {
+                return prepareNext();
+            }
+
+            public UnfilteredRowIterator next()
+            {
+                if (next == null)
+                    prepareNext();
+
+                UnfilteredRowIterator toReturn = next;
+                next = null;
+                return toReturn;
+            }
+
+            private boolean prepareNext()
+            {
+                while (true)
+                {
+                    if (next != null)
+                        return true;
+
+                    if (nextEntry == null)
+                    {
+                        if (!indexHits.hasNext())
+                            return false;
+
+                        nextEntry = index.decodeEntry(indexKey, indexHits.next());
+                    }
+
+                    // Gather all index hits belonging to the same partition and query the data for those hits.
+                    // TODO: it's much more efficient to do 1 read for all hits to the same partition than doing
+                    // 1 read per index hit. However, this basically mean materializing all hits for a partition
+                    // in memory so we should consider adding some paging mechanism. However, index hits should
+                    // be relatively small so it's much better than the previous code that was materializing all
+                    // *data* for a given partition.
+                    BTreeSet.Builder<Clustering> clusterings = BTreeSet.builder(index.baseCfs.getComparator());
+                    List<IndexEntry> entries = new ArrayList<>();
+                    DecoratedKey partitionKey = index.baseCfs.decorateKey(nextEntry.indexedKey);
+
+                    while (nextEntry != null && partitionKey.getKey().equals(nextEntry.indexedKey))
+                    {
+                        // We're queried a slice of the index, but some hits may not match some of the clustering column constraints
+                        if (isMatchingEntry(partitionKey, nextEntry, command))
+                        {
+                            clusterings.add(nextEntry.indexedEntryClustering);
+                            entries.add(nextEntry);
+                        }
+
+                        nextEntry = indexHits.hasNext() ? index.decodeEntry(indexKey, indexHits.next()) : null;
+                    }
+
+                    // Because we've eliminated entries that don't match the clustering columns, it's possible we added nothing
+                    if (clusterings.isEmpty())
+                        continue;
+
+                    // Query the gathered index hits. We still need to filter stale hits from the resulting query.
+                    ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings.build(), false);
+                    SinglePartitionReadCommand dataCmd = SinglePartitionReadCommand.create(isForThrift(),
+                                                                                           index.baseCfs.metadata,
+                                                                                           command.nowInSec(),
+                                                                                           command.columnFilter(),
+                                                                                           command.rowFilter(),
+                                                                                           DataLimits.NONE,
+                                                                                           partitionKey,
+                                                                                           filter,
+                                                                                           null);
+                    @SuppressWarnings("resource") // We close right away if empty, and if it's assign to next it will be called either
+                    // by the next caller of next, or through closing this iterator is this come before.
+                    UnfilteredRowIterator dataIter =
+                        filterStaleEntries(dataCmd.queryMemtableAndDisk(index.baseCfs,
+                                                                        orderGroup.baseReadOpOrderGroup()),
+                                           indexKey.getKey(),
+                                           entries,
+                                           orderGroup.writeOpOrderGroup(),
+                                           command.nowInSec());
+
+                    if (dataIter.isEmpty())
+                    {
+                        dataIter.close();
+                        continue;
+                    }
+
+                    next = dataIter;
+                    return true;
+                }
+            }
+
+            public void remove()
+            {
+                throw new UnsupportedOperationException();
+            }
+
+            public void close()
+            {
+                indexHits.close();
+                if (next != null)
+                    next.close();
+            }
+        };
+    }
+
+    private void deleteAllEntries(final List<IndexEntry> entries, final OpOrder.Group writeOp, final int nowInSec)
+    {
+        entries.forEach(entry ->
+            index.deleteStaleEntry(entry.indexValue,
+                                     entry.indexClustering,
+                                     new DeletionTime(entry.timestamp, nowInSec),
+                                     writeOp));
+    }
+
+    private UnfilteredRowIterator filterStaleEntries(UnfilteredRowIterator dataIter,
+                                                     final ByteBuffer indexValue,
+                                                     final List<IndexEntry> entries,
+                                                     final OpOrder.Group writeOp,
+                                                     final int nowInSec)
+    {
+        // collect stale index entries and delete them when we close this iterator
+        final List<IndexEntry> staleEntries = new ArrayList<>();
+
+        // if there is a partition level delete in the base table, we need to filter
+        // any index entries which would be shadowed by it
+        if (!dataIter.partitionLevelDeletion().isLive())
+        {
+            DeletionTime deletion = dataIter.partitionLevelDeletion();
+            entries.forEach(e -> {
+                if (deletion.deletes(e.timestamp))
+                    staleEntries.add(e);
+            });
+        }
+
+        ClusteringComparator comparator = dataIter.metadata().comparator;
+        class Transform extends Transformation
+        {
+            private int entriesIdx;
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                IndexEntry entry = findEntry(row.clustering());
+                if (!index.isStale(row, indexValue, nowInSec))
+                    return row;
+
+                staleEntries.add(entry);
+                return null;
+            }
+
+            private IndexEntry findEntry(Clustering clustering)
+            {
+                assert entriesIdx < entries.size();
+                while (entriesIdx < entries.size())
+                {
+                    IndexEntry entry = entries.get(entriesIdx++);
+                    Clustering indexedEntryClustering = entry.indexedEntryClustering;
+                    // The entries are in clustering order. So that the requested entry should be the
+                    // next entry, the one at 'entriesIdx'. However, we can have stale entries, entries
+                    // that have no corresponding row in the base table typically because of a range
+                    // tombstone or partition level deletion. Delete such stale entries.
+                    int cmp = comparator.compare(indexedEntryClustering, clustering);
+                    assert cmp <= 0; // this would means entries are not in clustering order, which shouldn't happen
+                    if (cmp == 0)
+                        return entry;
+                    else
+                    {
+                        // COMPACT COMPOSITE tables support null values in there clustering key but
+                        // those tables do not support static columns. By consequence if a table
+                        // has some static columns and all its clustering key elements are null
+                        // it means that the partition exists and contains only static data
+                       if (!dataIter.metadata().hasStaticColumns() || !containsOnlyNullValues(indexedEntryClustering))
+                           staleEntries.add(entry);
+                    }
+                }
+                // entries correspond to the rows we've queried, so we shouldn't have a row that has no corresponding entry.
+                throw new AssertionError();
+            }
+
+            private boolean containsOnlyNullValues(Clustering indexedEntryClustering)
+            {
+                int i = 0;
+                for (; i < indexedEntryClustering.size() && indexedEntryClustering.get(i) == null; i++);
+                return i == indexedEntryClustering.size();
+            }
+
+            @Override
+            public void onPartitionClose()
+            {
+                deleteAllEntries(staleEntries, writeOp, nowInSec);
+            }
+        }
+
+        return Transformation.apply(dataIter, new Transform());
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/PartitionKeyIndex.java b/src/java/org/apache/cassandra/index/internal/composites/PartitionKeyIndex.java
new file mode 100644
index 0000000..d854102
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/PartitionKeyIndex.java

@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Index on a PARTITION_KEY column definition.
+ *
+ * This suppose a composite row key:
+ *   rk = rk_0 ... rk_n
+ *
+ * The corresponding index entry will be:
+ *   - index row key will be rk_i (where i == columnDef.componentIndex)
+ *   - cell name will be: rk ck
+ *     where rk is the fully partition key and ck the clustering keys of the
+ *     original cell names (thus excluding the last column name as we want to refer to
+ *     the whole CQL3 row, not just the cell itself)
+ *
+ * Note that contrarily to other type of index, we repeat the indexed value in
+ * the index cell name (we use the whole partition key). The reason is that we
+ * want to order the index cell name by partitioner first, and skipping a part
+ * of the row key would change the order.
+ */
+public class PartitionKeyIndex extends CassandraIndex
+{
+    private final boolean enforceStrictLiveness;
+    public PartitionKeyIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+        this.enforceStrictLiveness = baseCfs.metadata.enforceStrictLiveness();
+    }
+
+    public ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path,
+                                      ByteBuffer cellValue)
+    {
+        CompositeType keyComparator = (CompositeType)baseCfs.metadata.getKeyValidator();
+        ByteBuffer[] components = keyComparator.split(partitionKey);
+        return components[indexedColumn.position()];
+    }
+
+    public CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        for (int i = 0; i < prefix.size(); i++)
+            builder.add(prefix.get(i));
+        return builder;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue, Row indexEntry)
+    {
+        int ckCount = baseCfs.metadata.clusteringColumns().size();
+        Clustering clustering = indexEntry.clustering();
+        CBuilder builder = CBuilder.create(baseCfs.getComparator());
+        for (int i = 0; i < ckCount; i++)
+            builder.add(clustering.get(i + 1));
+
+        return new IndexEntry(indexedValue,
+                              clustering,
+                              indexEntry.primaryKeyLivenessInfo().timestamp(),
+                              clustering.get(0),
+                              builder.build());
+    }
+
+    public boolean isStale(Row data, ByteBuffer indexValue, int nowInSec)
+    {
+        return !data.hasLiveData(nowInSec, enforceStrictLiveness);
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/composites/RegularColumnIndex.java b/src/java/org/apache/cassandra/index/internal/composites/RegularColumnIndex.java
new file mode 100644
index 0000000..f1dc3af
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/composites/RegularColumnIndex.java

@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.composites;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.schema.IndexMetadata;
+
+/**
+ * Index on a REGULAR column definition on a composite type.
+ *
+ * A cell indexed by this index will have the general form:
+ *   ck_0 ... ck_n c_name : v
+ * where ck_i are the cluster keys, c_name the last component of the cell
+ * composite name (or second to last if collections are in use, but this
+ * has no impact) and v the cell value.
+ *
+ * Such a cell is indexed if c_name == columnDef.name, and it will generate
+ * (makeIndexColumnName()) an index entry whose:
+ *   - row key will be the value v (getIndexedValue()).
+ *   - cell name will
+ *       rk ck_0 ... ck_n
+ *     where rk is the row key of the initial cell. I.e. the index entry store
+ *     all the information require to locate back the indexed cell.
+ */
+public class RegularColumnIndex extends CassandraIndex
+{
+    public RegularColumnIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+    }
+
+    public ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path,
+                                      ByteBuffer cellValue)
+    {
+        return cellValue;
+    }
+
+    public CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        for (int i = 0; i < prefix.size(); i++)
+            builder.add(prefix.get(i));
+
+        return builder;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue, Row indexEntry)
+    {
+        Clustering clustering = indexEntry.clustering();
+        ClusteringComparator baseComparator = baseCfs.getComparator();
+        CBuilder builder = CBuilder.create(baseComparator);
+        for (int i = 0; i < baseComparator.size(); i++)
+            builder.add(clustering.get(i + 1));
+
+        return new IndexEntry(indexedValue,
+                                clustering,
+                                indexEntry.primaryKeyLivenessInfo().timestamp(),
+                                clustering.get(0),
+                                builder.build());
+    }
+
+    public boolean isStale(Row data, ByteBuffer indexValue, int nowInSec)
+    {
+        Cell cell = data.getCell(indexedColumn);
+        return cell == null
+            || !cell.isLive(nowInSec)
+            || indexedColumn.type.compare(indexValue, cell.value()) != 0;
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/keys/KeysIndex.java b/src/java/org/apache/cassandra/index/internal/keys/KeysIndex.java
new file mode 100644
index 0000000..d680253
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/keys/KeysIndex.java

@@ -0,0 +1,82 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.internal.keys;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.IndexEntry;
+import org.apache.cassandra.schema.IndexMetadata;
+
+public class KeysIndex extends CassandraIndex
+{
+    public KeysIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        super(baseCfs, indexDef);
+    }
+
+    public CFMetaData.Builder addIndexClusteringColumns(CFMetaData.Builder builder,
+                                                        CFMetaData baseMetadata,
+                                                        ColumnDefinition cfDef)
+    {
+        // no additional clustering columns required
+        return builder;
+    }
+
+    protected CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        return builder;
+    }
+
+    protected ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path, ByteBuffer cellValue)
+    {
+        return cellValue;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue, Row indexEntry)
+    {
+        throw new UnsupportedOperationException("KEYS indexes do not use a specialized index entry format");
+    }
+
+    public boolean isStale(Row row, ByteBuffer indexValue, int nowInSec)
+    {
+        if (row == null)
+            return true;
+
+        Cell cell = row.getCell(indexedColumn);
+
+        return (cell == null
+             || !cell.isLive(nowInSec)
+             || indexedColumn.type.compare(indexValue, cell.value()) != 0);
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java b/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java
new file mode 100644
index 0000000..7cf4c51
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java

@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.internal.keys;
+
+import java.nio.ByteBuffer;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.internal.CassandraIndexSearcher;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+public class KeysSearcher extends CassandraIndexSearcher
+{
+    private static final Logger logger = LoggerFactory.getLogger(KeysSearcher.class);
+
+    public KeysSearcher(ReadCommand command,
+                        RowFilter.Expression expression,
+                        CassandraIndex indexer)
+    {
+        super(command, expression, indexer);
+    }
+
+    protected UnfilteredPartitionIterator queryDataFromIndex(final DecoratedKey indexKey,
+                                                             final RowIterator indexHits,
+                                                             final ReadCommand command,
+                                                             final ReadOrderGroup orderGroup)
+    {
+        assert indexHits.staticRow() == Rows.EMPTY_STATIC_ROW;
+
+        return new UnfilteredPartitionIterator()
+        {
+            private UnfilteredRowIterator next;
+
+            public boolean isForThrift()
+            {
+                return command.isForThrift();
+            }
+
+            public CFMetaData metadata()
+            {
+                return command.metadata();
+            }
+
+            public boolean hasNext()
+            {
+                return prepareNext();
+            }
+
+            public UnfilteredRowIterator next()
+            {
+                if (next == null)
+                    prepareNext();
+
+                UnfilteredRowIterator toReturn = next;
+                next = null;
+                return toReturn;
+            }
+
+            private boolean prepareNext()
+            {
+                while (next == null && indexHits.hasNext())
+                {
+                    Row hit = indexHits.next();
+                    DecoratedKey key = index.baseCfs.decorateKey(hit.clustering().get(0));
+                    if (!command.selectsKey(key))
+                        continue;
+
+                    ColumnFilter extendedFilter = getExtendedFilter(command.columnFilter());
+                    SinglePartitionReadCommand dataCmd = SinglePartitionReadCommand.create(isForThrift(),
+                                                                                           index.baseCfs.metadata,
+                                                                                           command.nowInSec(),
+                                                                                           extendedFilter,
+                                                                                           command.rowFilter(),
+                                                                                           DataLimits.NONE,
+                                                                                           key,
+                                                                                           command.clusteringIndexFilter(key),
+                                                                                           null);
+
+                    @SuppressWarnings("resource") // filterIfStale closes it's iterator if either it materialize it or if it returns null.
+                                                  // Otherwise, we close right away if empty, and if it's assigned to next it will be called either
+                                                  // by the next caller of next, or through closing this iterator is this come before.
+                    UnfilteredRowIterator dataIter = filterIfStale(dataCmd.queryMemtableAndDisk(index.baseCfs,
+                                                                                                orderGroup.baseReadOpOrderGroup()),
+                                                                   hit,
+                                                                   indexKey.getKey(),
+                                                                   orderGroup.writeOpOrderGroup(),
+                                                                   isForThrift(),
+                                                                   command.nowInSec());
+
+                    if (dataIter != null)
+                    {
+                        if (dataIter.isEmpty())
+                            dataIter.close();
+                        else
+                            next = dataIter;
+                    }
+                }
+                return next != null;
+            }
+
+            public void remove()
+            {
+                throw new UnsupportedOperationException();
+            }
+
+            public void close()
+            {
+                indexHits.close();
+                if (next != null)
+                    next.close();
+            }
+        };
+    }
+
+    private ColumnFilter getExtendedFilter(ColumnFilter initialFilter)
+    {
+        if (command.columnFilter().includes(index.getIndexedColumn()))
+            return initialFilter;
+
+        ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+        builder.addAll(initialFilter.fetchedColumns());
+        builder.add(index.getIndexedColumn());
+        return builder.build();
+    }
+
+    private UnfilteredRowIterator filterIfStale(UnfilteredRowIterator iterator,
+                                                Row indexHit,
+                                                ByteBuffer indexedValue,
+                                                OpOrder.Group writeOp,
+                                                boolean isForThrift,
+                                                int nowInSec)
+    {
+        if (isForThrift)
+        {
+            // The data we got has gone though ThrifResultsMerger, so we're looking for the row whose clustering
+            // is the indexed name and so we need to materialize the partition.
+            ImmutableBTreePartition result = ImmutableBTreePartition.create(iterator);
+            iterator.close();
+            Row data = result.getRow(new Clustering(index.getIndexedColumn().name.bytes));
+            if (data == null)
+                return null;
+
+            // for thrift tables, we need to compare the index entry against the compact value column,
+            // not the column actually designated as the indexed column so we don't use the index function
+            // lib for the staleness check like we do in every other case
+            Cell baseData = data.getCell(index.baseCfs.metadata.compactValueColumn());
+            if (baseData == null || !baseData.isLive(nowInSec) || index.getIndexedColumn().type.compare(indexedValue, baseData.value()) != 0)
+            {
+                // Index is stale, remove the index entry and ignore
+                index.deleteStaleEntry(index.getIndexCfs().decorateKey(indexedValue),
+                                         new Clustering(index.getIndexedColumn().name.bytes),
+                                         new DeletionTime(indexHit.primaryKeyLivenessInfo().timestamp(), nowInSec),
+                                         writeOp);
+                return null;
+            }
+            else
+            {
+                if (command.columnFilter().includes(index.getIndexedColumn()))
+                    return result.unfilteredIterator();
+
+                // The query on the base table used an extended column filter to ensure that the
+                // indexed column was actually read for use in the staleness check, before
+                // returning the results we must filter the base table partition so that it
+                // contains only the originally requested columns. See CASSANDRA-11523
+                ClusteringComparator comparator = result.metadata().comparator;
+                Slices.Builder slices = new Slices.Builder(comparator);
+                for (ColumnDefinition selected : command.columnFilter().fetchedColumns())
+                    slices.add(Slice.make(comparator, selected.name.bytes));
+                return result.unfilteredIterator(ColumnFilter.all(command.metadata()), slices.build(), false);
+            }
+        }
+        else
+        {
+            if (!iterator.metadata().isCompactTable())
+            {
+                logger.warn("Non-composite index was used on the table '{}' during the query. Starting from Cassandra 4.0, only " +
+                            "composite indexes will be supported. If compact flags were dropped for this table, drop and re-create " +
+                            "the index.", iterator.metadata().cfName);
+            }
+
+            Row data = iterator.staticRow();
+            if (index.isStale(data, indexedValue, nowInSec))
+            {
+                // Index is stale, remove the index entry and ignore
+                index.deleteStaleEntry(index.getIndexCfs().decorateKey(indexedValue),
+                                         makeIndexClustering(iterator.partitionKey().getKey(), Clustering.EMPTY),
+                                         new DeletionTime(indexHit.primaryKeyLivenessInfo().timestamp(), nowInSec),
+                                         writeOp);
+                iterator.close();
+                return null;
+            }
+            else
+            {
+                return iterator;
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/index/transactions/CleanupTransaction.java b/src/java/org/apache/cassandra/index/transactions/CleanupTransaction.java
new file mode 100644
index 0000000..1d6ba56
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/transactions/CleanupTransaction.java

@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.transactions;
+
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.rows.Row;
+
+/**
+ * Performs garbage collection of index entries during a cleanup.
+ *
+ * Notifies registered indexers of each partition being removed and
+ *
+ * Compaction & Cleanup are somewhat simpler than dealing with incoming writes,
+ * being only concerned with cleaning up stale index entries.
+ *
+ * When multiple versions of a row are compacted, the CleanupTransaction is
+ * notified of the versions being merged, which it diffs against the merge result
+ * and forwards to the registered Index.Indexer instances when on commit.
+ *
+ * Instances are currently scoped to a single row within a partition, but this could be improved to batch process
+ * multiple rows within a single partition.
+ */
+public interface CleanupTransaction extends IndexTransaction
+{
+
+    void onPartitionDeletion(DeletionTime deletionTime);
+    void onRowDelete(Row row);
+
+    CleanupTransaction NO_OP = new CleanupTransaction()
+    {
+        public void start(){}
+        public void onPartitionDeletion(DeletionTime deletionTime){}
+        public void onRowDelete(Row row){}
+        public void commit(){}
+    };
+}

diff --git a/src/java/org/apache/cassandra/index/transactions/CompactionTransaction.java b/src/java/org/apache/cassandra/index/transactions/CompactionTransaction.java
new file mode 100644
index 0000000..f2436af
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/transactions/CompactionTransaction.java

@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.transactions;
+
+import org.apache.cassandra.db.rows.Row;
+
+/**
+ * Performs garbage collection of stale index entries during a regular compaction.
+ *
+ * A CompactionTransaction is concerned with cleaning up stale index entries.
+ * When multiple versions of a row are compacted, the CompactionTransaction is
+ * notified of the versions being merged, which it diffs against the merge result.
+ *
+ * Instances are currently scoped to a single row within a partition, but this could be improved to batch process
+ * multiple rows within a single partition.
+ */
+public interface CompactionTransaction extends IndexTransaction
+{
+    void onRowMerge(Row merged, Row...versions);
+
+    CompactionTransaction NO_OP = new CompactionTransaction()
+    {
+        public void start(){}
+        public void onRowMerge(Row merged, Row...versions){}
+        public void commit(){}
+    };
+}

diff --git a/src/java/org/apache/cassandra/index/transactions/IndexTransaction.java b/src/java/org/apache/cassandra/index/transactions/IndexTransaction.java
new file mode 100644
index 0000000..3fb8235
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/transactions/IndexTransaction.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.transactions;
+
+/**
+ * Base interface for the handling of index updates.
+ * There are 3 types of transaction where indexes are updated to stay in sync with the base table, each represented by
+ * a subinterface:
+ * * {@code UpdateTransaction}
+ *   Used on the regular write path and when indexing newly acquired SSTables from streaming or sideloading. This type
+ *   of transaction may include both row inserts and updates to rows previously existing in the base Memtable. Instances
+ *   are scoped to a single partition update and are obtained from the factory method
+ *   @{code SecondaryIndexManager#newUpdateTransaction}
+ *
+ * * {@code CompactionTransaction}
+ *   Used during compaction when stale entries which have been superceded are cleaned up from the index. As rows in a
+ *   partition are merged during the compaction, index entries for any purged rows are cleaned from the index to
+ *   compensate for the fact that they may not have been removed at write time if the data in the base table had been
+ *   already flushed to disk (and so was processed as an insert, not an update by the UpdateTransaction). These
+ *   transactions are currently scoped to a single row within a partition, but this could be improved to batch process
+ *   multiple rows within a single partition.
+ *
+ * * @{code CleanupTransaction}
+ *   During cleanup no merging is required, the only thing to do is to notify indexes of the partitions being removed,
+ *   along with the rows within those partitions. Like with compaction, these transactions are currently scoped to a
+ *   single row within a partition, but this could be improved with batching.
+ */
+public interface IndexTransaction
+{
+    /**
+     * Used to differentiate between type of index transaction when obtaining
+     * a handler from Index implementations.
+     */
+    public enum Type
+    {
+        UPDATE, COMPACTION, CLEANUP
+    }
+
+    void start();
+    void commit();
+}

diff --git a/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java b/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java
new file mode 100644
index 0000000..c78304a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/transactions/UpdateTransaction.java

@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.transactions;
+
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.rows.Row;
+
+/**
+ * Handling of index updates on the write path.
+ *
+ * Instances of an UpdateTransaction are scoped to a single partition update
+ * A new instance is used for every write, obtained from the
+ * newUpdateTransaction(PartitionUpdate) method. Likewise, a single
+ * CleanupTransaction instance is used for each partition processed during a
+ * compaction or cleanup.
+ *
+ * We make certain guarantees about the lifecycle of each UpdateTransaction
+ * instance. Namely that start() will be called before any other method, and
+ * commit() will be called at the end of the update.
+ * Each instance is initialized with 1..many Index.Indexer instances, one per
+ * registered Index. As with the transaction itself, these are scoped to a
+ * specific partition update, so implementations can be assured that all indexing
+ * events they receive relate to the same logical operation.
+ *
+ * onPartitionDelete(), onRangeTombstone(), onInserted() and onUpdated()
+ * calls may arrive in any order, but this should have no impact for the
+ * Indexers being notified as any events delivered to a single instance
+ * necessarily relate to a single partition.
+ *
+ * The typical sequence of events during a Memtable update would be:
+ * start()                       -- no-op, used to notify Indexers of the start of the transaction
+ * onPartitionDeletion(dt)       -- if the PartitionUpdate implies one
+ * onRangeTombstone(rt)*         -- for each in the PartitionUpdate, if any
+ *
+ * then:
+ * onInserted(row)*              -- called for each Row not already present in the Memtable
+ * onUpdated(existing, updated)* -- called for any Row in the update for where a version was already present
+ *                                  in the Memtable. It's important to note here that existing is the previous
+ *                                  row from the Memtable & updated is the final version replacing it. It is
+ *                                  *not* the incoming row, but the result of merging the incoming and existing
+ *                                  rows.
+ * commit()                      -- finally, finish is called when the new Partition is swapped into the Memtable
+ */
+public interface UpdateTransaction extends IndexTransaction
+{
+    void onPartitionDeletion(DeletionTime deletionTime);
+    void onRangeTombstone(RangeTombstone rangeTombstone);
+    void onInserted(Row row);
+    void onUpdated(Row existing, Row updated);
+
+    UpdateTransaction NO_OP = new UpdateTransaction()
+    {
+        public void start(){}
+        public void onPartitionDeletion(DeletionTime deletionTime){}
+        public void onRangeTombstone(RangeTombstone rangeTombstone){}
+        public void onInserted(Row row){}
+        public void onUpdated(Row existing, Row updated){}
+        public void commit(){}
+    };
+}

diff --git a/src/java/org/apache/cassandra/io/FSDiskFullWriteError.java b/src/java/org/apache/cassandra/io/FSDiskFullWriteError.java
new file mode 100644
index 0000000..ca5d8da
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/FSDiskFullWriteError.java

@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io;
+
+public class FSDiskFullWriteError extends FSWriteError
+{
+    public FSDiskFullWriteError(Throwable cause, String path)
+    {
+        super(cause, path);
+    }
+
+    @Override
+    public String toString()
+    {
+        return "FSDiskFullWriteError in " + path;
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/ForwardingVersionedSerializer.java b/src/java/org/apache/cassandra/io/ForwardingVersionedSerializer.java
new file mode 100644
index 0000000..64f91d7
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/ForwardingVersionedSerializer.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io;
+
+import java.io.IOException;
+
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * A serializer which forwards all its method calls to another serializer. Subclasses should override one or more
+ * methods to modify the behavior of the backing serializer as desired per the decorator pattern.
+ */
+public abstract class ForwardingVersionedSerializer<T> implements IVersionedSerializer<T>
+{
+    protected ForwardingVersionedSerializer()
+    {
+    }
+
+    /**
+     * Returns the backing delegate instance that methods are forwarded to.
+     *
+     * @param version the server version
+     * @return the backing delegate instance that methods are forwarded to.
+     */
+    protected abstract IVersionedSerializer<T> delegate(int version);
+
+    public void serialize(T t, DataOutputPlus out, int version) throws IOException
+    {
+        delegate(version).serialize(t, out, version);
+    }
+
+    public T deserialize(DataInputPlus in, int version) throws IOException
+    {
+        return delegate(version).deserialize(in, version);
+    }
+
+    public long serializedSize(T t, int version)
+    {
+        return delegate(version).serializedSize(t, version);
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/ISerializer.java b/src/java/org/apache/cassandra/io/ISerializer.java
index 7e1759c..562d226 100644
--- a/src/java/org/apache/cassandra/io/ISerializer.java
+++ b/src/java/org/apache/cassandra/io/ISerializer.java

@@ -17,10 +17,9 @@
  */
 package org.apache.cassandra.io;
 
-import java.io.DataInput;
 import java.io.IOException;
 
-import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 public interface ISerializer<T>
@@ -41,7 +40,7 @@
      * @throws IOException
      * @return the type that was deserialized
      */
-    public T deserialize(DataInput in) throws IOException;
+    public T deserialize(DataInputPlus in) throws IOException;
 
-    public long serializedSize(T t, TypeSizes type);
+    public long serializedSize(T t);
 }

diff --git a/src/java/org/apache/cassandra/io/IVersionedSerializer.java b/src/java/org/apache/cassandra/io/IVersionedSerializer.java
index 2572840..e555573 100644
--- a/src/java/org/apache/cassandra/io/IVersionedSerializer.java
+++ b/src/java/org/apache/cassandra/io/IVersionedSerializer.java

@@ -17,9 +17,9 @@
  */
 package org.apache.cassandra.io;
 
-import java.io.DataInput;
 import java.io.IOException;
 
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 public interface IVersionedSerializer<T>
@@ -41,7 +41,7 @@
      * @return the type that was deserialized
      * @throws IOException if deserialization fails
      */
-    public T deserialize(DataInput in, int version) throws IOException;
+    public T deserialize(DataInputPlus in, int version) throws IOException;
 
     /**
      * Calculate serialized size of object without actually serializing.

diff --git a/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java b/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java
index 0fc96ed..2dbb013 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressedRandomAccessReader.java

@@ -19,19 +19,16 @@
 
 import java.io.*;
 import java.nio.ByteBuffer;
-import java.nio.MappedByteBuffer;
-import java.util.Map;
-import java.util.TreeMap;
 import java.util.concurrent.ThreadLocalRandom;
-import java.util.zip.Adler32;
+import java.util.zip.Checksum;
 
-
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.primitives.Ints;
 
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.util.*;
-import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.memory.BufferPool;
 
 /**
  * CRAR extends RAR to transparently uncompress blocks from the file into RAR.buffer.  Most of the RAR
@@ -39,63 +36,60 @@
  */
 public class CompressedRandomAccessReader extends RandomAccessReader
 {
-    public static CompressedRandomAccessReader open(ChannelProxy channel, CompressionMetadata metadata)
-    {
-        return new CompressedRandomAccessReader(channel, metadata, null, null);
-    }
-
-    public static CompressedRandomAccessReader open(ICompressedFile file)
-    {
-        return new CompressedRandomAccessReader(file.channel(),
-                                                file.getMetadata(),
-                                                file,
-                                                file instanceof PoolingSegmentedFile ? (PoolingSegmentedFile) file : null);
-    }
-
-    private final TreeMap<Long, MappedByteBuffer> chunkSegments;
-
     private final CompressionMetadata metadata;
 
     // we read the raw compressed bytes into this buffer, then move the uncompressed ones into super.buffer.
     private ByteBuffer compressed;
 
     // re-use single crc object
-    private final Adler32 checksum;
+    private final Checksum checksum;
 
     // raw checksum bytes
     private ByteBuffer checksumBytes;
 
-    protected CompressedRandomAccessReader(ChannelProxy channel, CompressionMetadata metadata, ICompressedFile file, PoolingSegmentedFile owner)
+    @VisibleForTesting
+    public double getCrcCheckChance()
     {
-        super(channel, metadata.chunkLength(), metadata.compressedFileLength, metadata.compressor().preferredBufferType(), owner);
-        this.metadata = metadata;
-        checksum = new Adler32();
+        return metadata.parameters.getCrcCheckChance();
+    }
 
-        chunkSegments = file == null ? null : file.chunkSegments();
-        if (chunkSegments == null)
+    protected CompressedRandomAccessReader(Builder builder)
+    {
+        super(builder);
+        this.metadata = builder.metadata;
+        this.checksum = metadata.checksumType.newInstance();
+
+        if (regions == null)
         {
-            compressed = super.allocateBuffer(metadata.compressor().initialCompressedBufferLength(metadata.chunkLength()), metadata.compressor().preferredBufferType());
+            compressed = allocateBuffer(metadata.compressor().initialCompressedBufferLength(metadata.chunkLength()), bufferType);
             checksumBytes = ByteBuffer.wrap(new byte[4]);
         }
     }
 
     @Override
-    protected ByteBuffer allocateBuffer(int bufferSize, BufferType bufferType)
+    protected void releaseBuffer()
     {
-        assert Integer.bitCount(bufferSize) == 1;
-        return bufferType.allocate(bufferSize);
+        try
+        {
+            if (buffer != null)
+            {
+                BufferPool.put(buffer);
+                buffer = null;
+            }
+        }
+        finally
+        {
+            // this will always be null if using mmap access mode (unlike in parent, where buffer is set to a region)
+            if (compressed != null)
+            {
+                BufferPool.put(compressed);
+                compressed = null;
+            }
+        }
     }
 
     @Override
-    public void deallocate()
-    {
-        super.deallocate();
-        if (compressed != null)
-            FileUtils.clean(compressed);
-        compressed = null;
-    }
-
-    private void reBufferStandard()
+    protected void reBufferStandard()
     {
         try
         {
@@ -105,25 +99,33 @@
             CompressionMetadata.Chunk chunk = metadata.chunkFor(position);
 
             if (compressed.capacity() < chunk.length)
-                compressed = allocateBuffer(chunk.length, metadata.compressor().preferredBufferType());
+            {
+                BufferPool.put(compressed);
+                compressed = allocateBuffer(chunk.length, bufferType);
+            }
             else
+            {
                 compressed.clear();
-            compressed.limit(chunk.length);
+            }
 
+            compressed.limit(chunk.length);
             if (channel.read(compressed, chunk.offset) != chunk.length)
                 throw new CorruptBlockException(getPath(), chunk);
+
             compressed.flip();
             buffer.clear();
 
-            if (metadata.parameters.getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
+            if (getCrcCheckChance() >= 1d ||
+                    getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
             {
-                FBUtilities.directCheckSum(checksum, compressed);
+                metadata.checksumType.update(checksum, (compressed));
 
                 if (checksum(chunk) != (int) checksum.getValue())
                     throw new CorruptBlockException(getPath(), chunk);
 
                 // reset checksum object back to the original (blank) state
                 checksum.reset();
+
                 compressed.rewind();
             }
 
@@ -133,7 +135,7 @@
             }
             catch (IOException e)
             {
-                throw new CorruptBlockException(getPath(), chunk);
+                throw new CorruptBlockException(getPath(), chunk, e);
             }
             finally
             {
@@ -158,7 +160,8 @@
         }
     }
 
-    private void reBufferMmap()
+    @Override
+    protected void reBufferMmap()
     {
         try
         {
@@ -167,18 +170,19 @@
 
             CompressionMetadata.Chunk chunk = metadata.chunkFor(position);
 
-            Map.Entry<Long, MappedByteBuffer> entry = chunkSegments.floorEntry(chunk.offset);
-            long segmentOffset = entry.getKey();
+            MmappedRegions.Region region = regions.floor(chunk.offset);
+            long segmentOffset = region.bottom();
             int chunkOffset = Ints.checkedCast(chunk.offset - segmentOffset);
-            ByteBuffer compressedChunk = entry.getValue().duplicate(); // TODO: change to slice(chunkOffset) when we upgrade LZ4-java
+            ByteBuffer compressedChunk = region.buffer.duplicate(); // TODO: change to slice(chunkOffset) when we upgrade LZ4-java
 
             compressedChunk.position(chunkOffset).limit(chunkOffset + chunk.length);
 
             buffer.clear();
 
-            if (metadata.parameters.getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
+            if (getCrcCheckChance() >= 1d ||
+                getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
             {
-                FBUtilities.directCheckSum(checksum, compressedChunk);
+                metadata.checksumType.update( checksum, compressedChunk);
 
                 compressedChunk.limit(compressedChunk.capacity());
                 if (compressedChunk.getInt() != (int) checksum.getValue())
@@ -196,7 +200,7 @@
             }
             catch (IOException e)
             {
-                throw new CorruptBlockException(getPath(), chunk);
+                throw new CorruptBlockException(getPath(), chunk, e);
             }
             finally
             {
@@ -218,19 +222,6 @@
 
     }
 
-    @Override
-    protected void reBuffer()
-    {
-        if (chunkSegments != null)
-        {
-            reBufferMmap();
-        }
-        else
-        {
-            reBufferStandard();
-        }
-    }
-
     private int checksum(CompressionMetadata.Chunk chunk) throws IOException
     {
         long position = chunk.offset + chunk.length;
@@ -240,11 +231,6 @@
         return checksumBytes.getInt(0);
     }
 
-    public int getTotalBufferSize()
-    {
-        return super.getTotalBufferSize() + (chunkSegments != null ? 0 : compressed.capacity());
-    }
-
     @Override
     public long length()
     {
@@ -256,4 +242,47 @@
     {
         return String.format("%s - chunk length %d, data length %d.", getPath(), metadata.chunkLength(), metadata.dataLength);
     }
+
+    public final static class Builder extends RandomAccessReader.Builder
+    {
+        private final CompressionMetadata metadata;
+
+        public Builder(ICompressedFile file)
+        {
+            super(file.channel());
+            this.metadata = applyMetadata(file.getMetadata());
+            this.regions = file.regions();
+        }
+
+        public Builder(ChannelProxy channel, CompressionMetadata metadata)
+        {
+            super(channel);
+            this.metadata = applyMetadata(metadata);
+        }
+
+        private CompressionMetadata applyMetadata(CompressionMetadata metadata)
+        {
+            this.overrideLength = metadata.compressedFileLength;
+            this.bufferSize = metadata.chunkLength();
+            this.bufferType = metadata.compressor().preferredBufferType();
+
+            assert Integer.bitCount(this.bufferSize) == 1; //must be a power of two
+
+            return metadata;
+        }
+
+        @Override
+        protected ByteBuffer createBuffer()
+        {
+            buffer = allocateBuffer(bufferSize, bufferType);
+            buffer.limit(0);
+            return buffer;
+        }
+
+        @Override
+        public RandomAccessReader build()
+        {
+            return new CompressedRandomAccessReader(this);
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
index a7f9bb4..9c47513 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java

@@ -17,22 +17,25 @@
  */
 package org.apache.cassandra.io.compress;
 
+import static org.apache.cassandra.utils.Throwables.merge;
+
 import java.io.DataOutputStream;
 import java.io.EOFException;
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
-import java.util.zip.Adler32;
+import java.util.zip.CRC32;
 
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.DataIntegrityMetadata;
-import org.apache.cassandra.io.util.FileMark;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.SequentialWriter;
-import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.schema.CompressionParams;
 
 public class CompressedSequentialWriter extends SequentialWriter
 {
@@ -60,11 +63,11 @@
 
     public CompressedSequentialWriter(File file,
                                       String offsetsPath,
-                                      CompressionParameters parameters,
+                                      CompressionParams parameters,
                                       MetadataCollector sstableMetadataCollector)
     {
-        super(file, parameters.chunkLength(), parameters.sstableCompressor.preferredBufferType());
-        this.compressor = parameters.sstableCompressor;
+        super(file, parameters.chunkLength(), parameters.getSstableCompressor().preferredBufferType());
+        this.compressor = parameters.getSstableCompressor();
 
         // buffer for compression should be the same size as buffer itself
         compressed = compressor.preferredBufferType().allocate(compressor.initialCompressedBufferLength(buffer.capacity()));
@@ -81,7 +84,7 @@
     {
         try
         {
-            return channel.position();
+            return fchannel.position();
         }
         catch (IOException e)
         {
@@ -130,9 +133,6 @@
             compressed.rewind();
             crcMetadata.appendDirect(compressed, true);
             lastFlushOffset = uncompressedSize;
-
-            // adjust our bufferOffset to account for the new uncompressed data we've now written out
-            resetBuffer();
         }
         catch (IOException e)
         {
@@ -153,13 +153,15 @@
     }
 
     @Override
-    public FileMark mark()
+    public DataPosition mark()
     {
+        if (!buffer.hasRemaining())
+            doFlush(0);
         return new CompressedFileWriterMark(chunkOffset, current(), buffer.position(), chunkCount + 1);
     }
 
     @Override
-    public synchronized void resetAndTruncate(FileMark mark)
+    public synchronized void resetAndTruncate(DataPosition mark)
     {
         assert mark instanceof CompressedFileWriterMark;
 
@@ -183,14 +185,17 @@
         // compressed chunk size (- 4 bytes reserved for checksum)
         int chunkSize = (int) (metadataWriter.chunkOffsetBy(realMark.nextChunkIndex) - chunkOffset - 4);
         if (compressed.capacity() < chunkSize)
+        {
+            FileUtils.clean(compressed);
             compressed = compressor.preferredBufferType().allocate(chunkSize);
+        }
 
         try
         {
             compressed.clear();
             compressed.limit(chunkSize);
-            channel.position(chunkOffset);
-            channel.read(compressed);
+            fchannel.position(chunkOffset);
+            fchannel.read(compressed);
 
             try
             {
@@ -201,15 +206,15 @@
             }
             catch (IOException e)
             {
-                throw new CorruptBlockException(getPath(), chunkOffset, chunkSize);
+                throw new CorruptBlockException(getPath(), chunkOffset, chunkSize, e);
             }
 
-            Adler32 checksum = new Adler32();
+            CRC32 checksum = new CRC32();
             compressed.rewind();
-            FBUtilities.directCheckSum(checksum, compressed);
+            checksum.update(compressed);
 
             crcCheckBuffer.clear();
-            channel.read(crcCheckBuffer);
+            fchannel.read(crcCheckBuffer);
             crcCheckBuffer.flip();
             if (crcCheckBuffer.getInt() != (int) checksum.getValue())
                 throw new CorruptBlockException(getPath(), chunkOffset, chunkSize);
@@ -229,7 +234,6 @@
 
         // Mark as dirty so we can guarantee the newly buffered bytes won't be lost on a rebuffer
         buffer.position(realMark.validBufferBytes);
-        isDirty = true;
 
         bufferOffset = truncateTarget - buffer.position();
         chunkCount = realMark.nextChunkIndex - 1;
@@ -243,7 +247,7 @@
     {
         try
         {
-            channel.truncate(toFileSize);
+            fchannel.truncate(toFileSize);
             lastFlushOffset = toBufferOffset;
         }
         catch (IOException e)
@@ -261,7 +265,7 @@
         {
             try
             {
-                channel.position(chunkOffset);
+                fchannel.position(chunkOffset);
             }
             catch (IOException e)
             {
@@ -275,7 +279,7 @@
         @Override
         protected Throwable doCommit(Throwable accumulate)
         {
-            return metadataWriter.commit(accumulate);
+            return super.doCommit(metadataWriter.commit(accumulate));
         }
 
         @Override
@@ -290,10 +294,23 @@
             syncInternal();
             if (descriptor != null)
                 crcMetadata.writeFullChecksum(descriptor);
-            releaseFileHandle();
             sstableMetadataCollector.addCompressionRatio(compressedSize, uncompressedSize);
             metadataWriter.finalizeLength(current(), chunkCount).prepareToCommit();
         }
+
+        @Override
+        protected Throwable doPreCleanup(Throwable accumulate)
+        {
+            accumulate = super.doPreCleanup(accumulate);
+            if (compressed != null)
+            {
+                try { FileUtils.clean(compressed); }
+                catch (Throwable t) { accumulate = merge(accumulate, t); }
+                compressed = null;
+            }
+
+            return accumulate;
+        }
     }
 
     @Override
@@ -305,7 +322,7 @@
     /**
      * Class to hold a mark to the position of the file
      */
-    protected static class CompressedFileWriterMark implements FileMark
+    protected static class CompressedFileWriterMark implements DataPosition
     {
         // chunk offset in the compressed file
         final long chunkOffset;

diff --git a/src/java/org/apache/cassandra/io/compress/CompressedThrottledReader.java b/src/java/org/apache/cassandra/io/compress/CompressedThrottledReader.java
deleted file mode 100644
index 2b07c50..0000000
--- a/src/java/org/apache/cassandra/io/compress/CompressedThrottledReader.java
+++ /dev/null

@@ -1,49 +0,0 @@
-package org.apache.cassandra.io.compress;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import com.google.common.util.concurrent.RateLimiter;
-
-import org.apache.cassandra.io.util.ChannelProxy;
-import org.apache.cassandra.io.util.ICompressedFile;
-
-public class CompressedThrottledReader extends CompressedRandomAccessReader
-{
-    private final RateLimiter limiter;
-
-    public CompressedThrottledReader(ChannelProxy channel, CompressionMetadata metadata, ICompressedFile file, RateLimiter limiter)
-    {
-        super(channel, metadata, file, null);
-        this.limiter = limiter;
-    }
-
-    protected void reBuffer()
-    {
-        limiter.acquire(buffer.capacity());
-        super.reBuffer();
-    }
-
-    public static CompressedThrottledReader open(ICompressedFile file, RateLimiter limiter)
-    {
-        return new CompressedThrottledReader(file.channel(), file.getMetadata(), file, limiter);
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java b/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java
index 45fb0e0..10d1ae9 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressionMetadata.java

@@ -47,12 +47,16 @@
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.Memory;
 import org.apache.cassandra.io.util.SafeMemory;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.ChecksumType;
 import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.concurrent.Transactional;
+import org.apache.cassandra.utils.concurrent.Ref;
 
 /**
  * Holds metadata about compressed file
@@ -67,7 +71,8 @@
     private final Memory chunkOffsets;
     private final long chunkOffsetsSize;
     public final String indexFilePath;
-    public final CompressionParameters parameters;
+    public final CompressionParams parameters;
+    public final ChecksumType checksumType;
 
     /**
      * Create metadata about given compressed file including uncompressed data length, chunk size
@@ -83,13 +88,14 @@
     public static CompressionMetadata create(String dataFilePath)
     {
         Descriptor desc = Descriptor.fromFilename(dataFilePath);
-        return new CompressionMetadata(desc.filenameFor(Component.COMPRESSION_INFO), new File(dataFilePath).length());
+        return new CompressionMetadata(desc.filenameFor(Component.COMPRESSION_INFO), new File(dataFilePath).length(), desc.version.compressedChecksumType());
     }
 
     @VisibleForTesting
-    CompressionMetadata(String indexFilePath, long compressedLength)
+    public CompressionMetadata(String indexFilePath, long compressedLength, ChecksumType checksumType)
     {
         this.indexFilePath = indexFilePath;
+        this.checksumType = checksumType;
 
         try (DataInputStream stream = new DataInputStream(new FileInputStream(indexFilePath)))
         {
@@ -105,11 +111,11 @@
             int chunkLength = stream.readInt();
             try
             {
-                parameters = new CompressionParameters(compressorName, chunkLength, options);
+                parameters = new CompressionParams(compressorName, chunkLength, options);
             }
             catch (ConfigurationException e)
             {
-                throw new RuntimeException("Cannot create CompressionParameters for stored parameters", e);
+                throw new RuntimeException("Cannot create CompressionParams for stored parameters", e);
             }
 
             dataLength = stream.readLong();
@@ -128,7 +134,7 @@
         this.chunkOffsetsSize = chunkOffsets.size();
     }
 
-    private CompressionMetadata(String filePath, CompressionParameters parameters, SafeMemory offsets, long offsetsSize, long dataLength, long compressedLength)
+    private CompressionMetadata(String filePath, CompressionParams parameters, SafeMemory offsets, long offsetsSize, long dataLength, long compressedLength, ChecksumType checksumType)
     {
         this.indexFilePath = filePath;
         this.parameters = parameters;
@@ -136,11 +142,12 @@
         this.compressedFileLength = compressedLength;
         this.chunkOffsets = offsets;
         this.chunkOffsetsSize = offsetsSize;
+        this.checksumType = checksumType;
     }
 
     public ICompressor compressor()
     {
-        return parameters.sstableCompressor;
+        return parameters.getSstableCompressor();
     }
 
     public int chunkLength()
@@ -157,6 +164,11 @@
         return chunkOffsets.size();
     }
 
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        identities.add(chunkOffsets);
+    }
+
     /**
      * Read offsets of the individual chunks from the given input.
      *
@@ -215,11 +227,15 @@
     public Chunk chunkFor(long position)
     {
         // position of the chunk
-        int idx = 8 * (int) (position / parameters.chunkLength());
+        long idx = 8 * (position / parameters.chunkLength());
 
         if (idx >= chunkOffsetsSize)
             throw new CorruptSSTableException(new EOFException(), indexFilePath);
 
+        if (idx < 0)
+            throw new CorruptSSTableException(new IllegalArgumentException(String.format("Invalid negative chunk index %d with position %d", idx, position)),
+                                              indexFilePath);
+
         long chunkOffset = chunkOffsets.getLong(idx);
         long nextChunkOffset = (idx + 8 == chunkOffsetsSize)
                                 ? compressedFileLength
@@ -298,7 +314,7 @@
     public static class Writer extends Transactional.AbstractTransactional implements Transactional
     {
         // path to the file
-        private final CompressionParameters parameters;
+        private final CompressionParams parameters;
         private final String filePath;
         private int maxCount = 100;
         private SafeMemory offsets = new SafeMemory(maxCount * 8L);
@@ -307,13 +323,13 @@
         // provided by user when setDescriptor
         private long dataLength, chunkCount;
 
-        private Writer(CompressionParameters parameters, String path)
+        private Writer(CompressionParams parameters, String path)
         {
             this.parameters = parameters;
             filePath = path;
         }
 
-        public static Writer open(CompressionParameters parameters, String path)
+        public static Writer open(CompressionParams parameters, String path)
         {
             return new Writer(parameters, path);
         }
@@ -333,9 +349,9 @@
         {
             try
             {
-                out.writeUTF(parameters.sstableCompressor.getClass().getSimpleName());
-                out.writeInt(parameters.otherOptions.size());
-                for (Map.Entry<String, String> entry : parameters.otherOptions.entrySet())
+                out.writeUTF(parameters.getSstableCompressor().getClass().getSimpleName());
+                out.writeInt(parameters.getOtherOptions().size());
+                for (Map.Entry<String, String> entry : parameters.getOtherOptions().entrySet())
                 {
                     out.writeUTF(entry.getKey());
                     out.writeUTF(entry.getValue());
@@ -406,7 +422,7 @@
             if (count < this.count)
                 compressedLength = offsets.getLong(count * 8L);
 
-            return new CompressionMetadata(filePath, parameters, offsets, count * 8L, dataLength, compressedLength);
+            return new CompressionMetadata(filePath, parameters, offsets, count * 8L, dataLength, compressedLength, ChecksumType.CRC32);
         }
 
         /**
@@ -424,7 +440,7 @@
         /**
          * Reset the writer so that the next chunk offset written will be the
          * one of {@code chunkIndex}.
-         * 
+         *
          * @param chunkIndex the next index to write
          */
         public void resetAndTruncate(int chunkIndex)
@@ -432,7 +448,7 @@
             count = chunkIndex;
         }
 
-        protected Throwable doPreCleanup(Throwable failed)
+        protected Throwable doPostCleanup(Throwable failed)
         {
             return offsets.close(failed);
         }
@@ -444,7 +460,7 @@
 
         protected Throwable doAbort(Throwable accumulate)
         {
-            return FileUtils.deleteWithConfirm(filePath, false, accumulate);
+            return accumulate;
         }
     }
 
@@ -496,15 +512,15 @@
             out.writeInt(chunk.length);
         }
 
-        public Chunk deserialize(DataInput in, int version) throws IOException
+        public Chunk deserialize(DataInputPlus in, int version) throws IOException
         {
             return new Chunk(in.readLong(), in.readInt());
         }
 
         public long serializedSize(Chunk chunk, int version)
         {
-            long size = TypeSizes.NATIVE.sizeof(chunk.offset);
-            size += TypeSizes.NATIVE.sizeof(chunk.length);
+            long size = TypeSizes.sizeof(chunk.offset);
+            size += TypeSizes.sizeof(chunk.length);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/io/compress/CompressionParameters.java b/src/java/org/apache/cassandra/io/compress/CompressionParameters.java
deleted file mode 100644
index b114826..0000000
--- a/src/java/org/apache/cassandra/io/compress/CompressionParameters.java
+++ /dev/null

@@ -1,378 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.compress;
-
-import java.io.DataInput;
-import java.io.IOException;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
-import java.util.AbstractSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Sets;
-
-import org.apache.commons.lang3.builder.EqualsBuilder;
-import org.apache.commons.lang3.builder.HashCodeBuilder;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ParameterizedClass;
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.util.DataOutputPlus;
-
-public class CompressionParameters
-{
-    public final static int DEFAULT_CHUNK_LENGTH = 65536;
-    public final static double DEFAULT_CRC_CHECK_CHANCE = 1.0;
-    public final static IVersionedSerializer<CompressionParameters> serializer = new Serializer();
-
-    public static final String SSTABLE_COMPRESSION = "sstable_compression";
-    public static final String CHUNK_LENGTH_KB = "chunk_length_kb";
-    public static final String CRC_CHECK_CHANCE = "crc_check_chance";
-
-    public static final Set<String> GLOBAL_OPTIONS = ImmutableSet.of(CRC_CHECK_CHANCE);
-
-    public final ICompressor sstableCompressor;
-    private final Integer chunkLength;
-    private volatile double crcCheckChance;
-    public final Map<String, String> otherOptions; // Unrecognized options, can be use by the compressor
-    private CFMetaData liveMetadata;
-
-    public static CompressionParameters create(Map<? extends CharSequence, ? extends CharSequence> opts) throws ConfigurationException
-    {
-        Map<String, String> options = copyOptions(opts);
-        String sstableCompressionClass = options.get(SSTABLE_COMPRESSION);
-        String chunkLength = options.get(CHUNK_LENGTH_KB);
-        options.remove(SSTABLE_COMPRESSION);
-        options.remove(CHUNK_LENGTH_KB);
-        CompressionParameters cp = new CompressionParameters(sstableCompressionClass, parseChunkLength(chunkLength), options);
-        cp.validate();
-        return cp;
-    }
-
-    public CompressionParameters(String sstableCompressorClass, Integer chunkLength, Map<String, String> otherOptions) throws ConfigurationException
-    {
-        this(createCompressor(parseCompressorClass(sstableCompressorClass), otherOptions), chunkLength, otherOptions);
-    }
-
-    public CompressionParameters(ICompressor sstableCompressor)
-    {
-        // can't try/catch as first statement in the constructor, thus repeating constructor code here.
-        this.sstableCompressor = sstableCompressor;
-        chunkLength = null;
-        otherOptions = Collections.emptyMap();
-        crcCheckChance = DEFAULT_CRC_CHECK_CHANCE;
-    }
-
-    public CompressionParameters(ICompressor sstableCompressor, Integer chunkLength, Map<String, String> otherOptions) throws ConfigurationException
-    {
-        this.sstableCompressor = sstableCompressor;
-        this.chunkLength = chunkLength;
-        this.otherOptions = otherOptions;
-        String chance = otherOptions.get(CRC_CHECK_CHANCE);
-        this.crcCheckChance = (chance == null) ? DEFAULT_CRC_CHECK_CHANCE : parseCrcCheckChance(chance);
-    }
-
-    public CompressionParameters copy()
-    {
-        return new CompressionParameters(sstableCompressor, chunkLength, new HashMap<>(otherOptions));
-    }
-
-    public void setLiveMetadata(final CFMetaData liveMetadata)
-    {
-        if (liveMetadata == null)
-            return;
-
-        this.liveMetadata = liveMetadata;
-    }
-
-    public void setCrcCheckChance(double crcCheckChance) throws ConfigurationException
-    {
-        validateCrcCheckChance(crcCheckChance);
-        this.crcCheckChance = crcCheckChance;
-
-        if (liveMetadata != null && this != liveMetadata.compressionParameters)
-            liveMetadata.compressionParameters.setCrcCheckChance(crcCheckChance);
-    }
-
-    public double getCrcCheckChance()
-    {
-        return liveMetadata == null ? this.crcCheckChance : liveMetadata.compressionParameters.crcCheckChance;
-    }
-
-    private static double parseCrcCheckChance(String crcCheckChance) throws ConfigurationException
-    {
-        try
-        {
-            double chance = Double.parseDouble(crcCheckChance);
-            validateCrcCheckChance(chance);
-            return chance;
-        }
-        catch (NumberFormatException e)
-        {
-            throw new ConfigurationException("crc_check_chance should be a double");
-        }
-    }
-
-    private static void validateCrcCheckChance(double crcCheckChance) throws ConfigurationException
-    {
-        if (crcCheckChance < 0.0d || crcCheckChance > 1.0d)
-            throw new ConfigurationException("crc_check_chance should be between 0.0 and 1.0");
-    }
-
-    public int chunkLength()
-    {
-        return chunkLength == null ? DEFAULT_CHUNK_LENGTH : chunkLength;
-    }
-
-    private static Class<?> parseCompressorClass(String className) throws ConfigurationException
-    {
-        if (className == null || className.isEmpty())
-            return null;
-
-        className = className.contains(".") ? className : "org.apache.cassandra.io.compress." + className;
-        try
-        {
-            return Class.forName(className);
-        }
-        catch (Exception e)
-        {
-            throw new ConfigurationException("Could not create Compression for type " + className, e);
-        }
-    }
-
-    private static ICompressor createCompressor(Class<?> compressorClass, Map<String, String> compressionOptions) throws ConfigurationException
-    {
-        if (compressorClass == null)
-        {
-            if (!compressionOptions.isEmpty())
-                throw new ConfigurationException("Unknown compression options (" + compressionOptions.keySet() + ") since no compression class found");
-            return null;
-        }
-
-        try
-        {
-            Method method = compressorClass.getMethod("create", Map.class);
-            ICompressor compressor = (ICompressor)method.invoke(null, compressionOptions);
-            // Check for unknown options
-            AbstractSet<String> supportedOpts = Sets.union(compressor.supportedOptions(), GLOBAL_OPTIONS);
-            for (String provided : compressionOptions.keySet())
-                if (!supportedOpts.contains(provided))
-                    throw new ConfigurationException("Unknown compression options " + provided);
-            return compressor;
-        }
-        catch (NoSuchMethodException e)
-        {
-            throw new ConfigurationException("create method not found", e);
-        }
-        catch (SecurityException e)
-        {
-            throw new ConfigurationException("Access forbiden", e);
-        }
-        catch (IllegalAccessException e)
-        {
-            throw new ConfigurationException("Cannot access method create in " + compressorClass.getName(), e);
-        }
-        catch (InvocationTargetException e)
-        {
-            Throwable cause = e.getCause();
-            throw new ConfigurationException(String.format("%s.create() threw an error: %s",
-                                             compressorClass.getSimpleName(),
-                                             cause == null ? e.getClass().getName() + " " + e.getMessage() : cause.getClass().getName() + " " + cause.getMessage()),
-                                             e);
-        }
-        catch (ExceptionInInitializerError e)
-        {
-            throw new ConfigurationException("Cannot initialize class " + compressorClass.getName());
-        }
-    }
-
-    public static ICompressor createCompressor(ParameterizedClass compression) throws ConfigurationException {
-        return createCompressor(parseCompressorClass(compression.class_name), copyOptions(compression.parameters));
-    }
-
-    private static Map<String, String> copyOptions(Map<? extends CharSequence, ? extends CharSequence> co)
-    {
-        if (co == null || co.isEmpty())
-            return Collections.<String, String>emptyMap();
-
-        Map<String, String> compressionOptions = new HashMap<String, String>();
-        for (Map.Entry<? extends CharSequence, ? extends CharSequence> entry : co.entrySet())
-        {
-            compressionOptions.put(entry.getKey().toString(), entry.getValue().toString());
-        }
-        return compressionOptions;
-    }
-
-    /**
-     * Parse the chunk length (in KB) and returns it as bytes.
-     * 
-     * @param chLengthKB the length of the chunk to parse
-     * @return the chunk length in bytes
-     * @throws ConfigurationException if the chunk size is too large
-     */
-    public static Integer parseChunkLength(String chLengthKB) throws ConfigurationException
-    {
-        if (chLengthKB == null)
-            return null;
-
-        try
-        {
-            int parsed = Integer.parseInt(chLengthKB);
-            if (parsed > Integer.MAX_VALUE / 1024)
-                throw new ConfigurationException("Value of " + CHUNK_LENGTH_KB + " is too large (" + parsed + ")");
-            return 1024 * parsed;
-        }
-        catch (NumberFormatException e)
-        {
-            throw new ConfigurationException("Invalid value for " + CHUNK_LENGTH_KB, e);
-        }
-    }
-
-    // chunkLength must be a power of 2 because we assume so when
-    // computing the chunk number from an uncompressed file offset (see
-    // CompressedRandomAccessReader.decompresseChunk())
-    public void validate() throws ConfigurationException
-    {
-        // if chunk length was not set (chunkLength == null), this is fine, default will be used
-        if (chunkLength != null)
-        {
-            if (chunkLength <= 0)
-                throw new ConfigurationException("Invalid negative or null " + CHUNK_LENGTH_KB);
-
-            int c = chunkLength;
-            boolean found = false;
-            while (c != 0)
-            {
-                if ((c & 0x01) != 0)
-                {
-                    if (found)
-                        throw new ConfigurationException(CHUNK_LENGTH_KB + " must be a power of 2");
-                    else
-                        found = true;
-                }
-                c >>= 1;
-            }
-        }
-
-        validateCrcCheckChance(crcCheckChance);
-    }
-
-    public Map<String, String> asThriftOptions()
-    {
-        Map<String, String> options = new HashMap<String, String>(otherOptions);
-        if (sstableCompressor == null)
-            return options;
-
-        options.put(SSTABLE_COMPRESSION, sstableCompressor.getClass().getName());
-        if (chunkLength != null)
-            options.put(CHUNK_LENGTH_KB, chunkLengthInKB());
-        return options;
-    }
-
-    private String chunkLengthInKB()
-    {
-        return String.valueOf(chunkLength() / 1024);
-    }
-
-    @Override
-    public boolean equals(Object obj)
-    {
-        if (obj == this)
-        {
-            return true;
-        }
-        else if (obj == null || obj.getClass() != getClass())
-        {
-            return false;
-        }
-
-        CompressionParameters cp = (CompressionParameters) obj;
-        return new EqualsBuilder()
-            .append(sstableCompressor, cp.sstableCompressor)
-            .append(chunkLength, cp.chunkLength)
-            .append(otherOptions, cp.otherOptions)
-            .isEquals();
-    }
-
-    @Override
-    public int hashCode()
-    {
-        return new HashCodeBuilder(29, 1597)
-            .append(sstableCompressor)
-            .append(chunkLength)
-            .append(otherOptions)
-            .toHashCode();
-    }
-
-    static class Serializer implements IVersionedSerializer<CompressionParameters>
-    {
-        public void serialize(CompressionParameters parameters, DataOutputPlus out, int version) throws IOException
-        {
-            out.writeUTF(parameters.sstableCompressor.getClass().getSimpleName());
-            out.writeInt(parameters.otherOptions.size());
-            for (Map.Entry<String, String> entry : parameters.otherOptions.entrySet())
-            {
-                out.writeUTF(entry.getKey());
-                out.writeUTF(entry.getValue());
-            }
-            out.writeInt(parameters.chunkLength());
-        }
-
-        public CompressionParameters deserialize(DataInput in, int version) throws IOException
-        {
-            String compressorName = in.readUTF();
-            int optionCount = in.readInt();
-            Map<String, String> options = new HashMap<String, String>();
-            for (int i = 0; i < optionCount; ++i)
-            {
-                String key = in.readUTF();
-                String value = in.readUTF();
-                options.put(key, value);
-            }
-            int chunkLength = in.readInt();
-            CompressionParameters parameters;
-            try
-            {
-                parameters = new CompressionParameters(compressorName, chunkLength, options);
-            }
-            catch (ConfigurationException e)
-            {
-                throw new RuntimeException("Cannot create CompressionParameters for parameters", e);
-            }
-            return parameters;
-        }
-
-        public long serializedSize(CompressionParameters parameters, int version)
-        {
-            long size = TypeSizes.NATIVE.sizeof(parameters.sstableCompressor.getClass().getSimpleName());
-            size += TypeSizes.NATIVE.sizeof(parameters.otherOptions.size());
-            for (Map.Entry<String, String> entry : parameters.otherOptions.entrySet())
-            {
-                size += TypeSizes.NATIVE.sizeof(entry.getKey());
-                size += TypeSizes.NATIVE.sizeof(entry.getValue());
-            }
-            size += TypeSizes.NATIVE.sizeof(parameters.chunkLength());
-            return size;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/compress/LZ4Compressor.java b/src/java/org/apache/cassandra/io/compress/LZ4Compressor.java
index 5fd4309..3a3b024 100644
--- a/src/java/org/apache/cassandra/io/compress/LZ4Compressor.java
+++ b/src/java/org/apache/cassandra/io/compress/LZ4Compressor.java

@@ -27,6 +27,7 @@
 import com.google.common.annotations.VisibleForTesting;
 import net.jpountz.lz4.LZ4Exception;
 import net.jpountz.lz4.LZ4Factory;
+import org.apache.cassandra.schema.CompressionParams;
 
 public class LZ4Compressor implements ICompressor
 {
@@ -126,7 +127,7 @@
 
     public Set<String> supportedOptions()
     {
-        return new HashSet<>(Arrays.asList(CompressionParameters.CRC_CHECK_CHANCE));
+        return new HashSet<>();
     }
 
     public BufferType preferredBufferType()

diff --git a/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java b/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
index e416c7b..62348ec 100644
--- a/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.io.sstable;
 
-import java.io.Closeable;
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
+import java.io.Closeable;
 import java.nio.ByteBuffer;
 import java.util.HashSet;
 import java.util.Set;
@@ -28,34 +28,29 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.Attributes;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.CounterId;
 import org.apache.cassandra.utils.Pair;
 
-public abstract class AbstractSSTableSimpleWriter implements Closeable
+/**
+ * Base class for the sstable writers used by CQLSSTableWriter.
+ */
+abstract class AbstractSSTableSimpleWriter implements Closeable
 {
     protected final File directory;
     protected final CFMetaData metadata;
-    protected DecoratedKey currentKey;
-    protected ColumnFamily columnFamily;
-    protected ByteBuffer currentSuperColumn;
-    protected final CounterId counterid = CounterId.generate();
-    private SSTableFormat.Type formatType = DatabaseDescriptor.getSSTableFormat();
+    protected final PartitionColumns columns;
+    protected SSTableFormat.Type formatType = DatabaseDescriptor.getSSTableFormat();
     protected static AtomicInteger generation = new AtomicInteger(0);
 
-
-    public AbstractSSTableSimpleWriter(File directory, CFMetaData metadata, IPartitioner partitioner)
+    protected AbstractSSTableSimpleWriter(File directory, CFMetaData metadata, PartitionColumns columns)
     {
         this.metadata = metadata;
         this.directory = directory;
-        DatabaseDescriptor.setPartitioner(partitioner);
+        this.columns = columns;
     }
 
     protected void setSSTableFormatType(SSTableFormat.Type type)
@@ -63,15 +58,20 @@
         this.formatType = type;
     }
 
-    protected SSTableWriter getWriter()
+    protected SSTableTxnWriter createWriter()
     {
-        return SSTableWriter.create(createDescriptor(directory, metadata.ksName, metadata.cfName, formatType), 0, ActiveRepairService.UNREPAIRED_SSTABLE);
+        return SSTableTxnWriter.create(metadata,
+                                       createDescriptor(directory, metadata.ksName, metadata.cfName, formatType),
+                                       0,
+                                       ActiveRepairService.UNREPAIRED_SSTABLE,
+                                       0,
+                                       new SerializationHeader(true, metadata, columns, EncodingStats.NO_STATS));
     }
 
-    protected static Descriptor createDescriptor(File directory, final String keyspace, final String columnFamily, final SSTableFormat.Type fmt)
+    private static Descriptor createDescriptor(File directory, final String keyspace, final String columnFamily, final SSTableFormat.Type fmt)
     {
         int maxGen = getNextGeneration(directory, columnFamily);
-        return new Descriptor(directory, keyspace, columnFamily, maxGen + 1, Descriptor.Type.TEMP, fmt);
+        return new Descriptor(directory, keyspace, columnFamily, maxGen + 1, fmt);
     }
 
     private static int getNextGeneration(File directory, final String columnFamily)
@@ -103,124 +103,17 @@
         return maxGen;
     }
 
-    /**
-     * Start a new row whose key is {@code key}.
-     * @param key the row key
-     */
-    public void newRow(ByteBuffer key) throws IOException
+    PartitionUpdate getUpdateFor(ByteBuffer key) throws IOException
     {
-        if (currentKey != null && !columnFamily.isEmpty())
-            writeRow(currentKey, columnFamily);
-
-        currentKey = DatabaseDescriptor.getPartitioner().decorateKey(key);
-        columnFamily = getColumnFamily();
+        return getUpdateFor(metadata.decorateKey(key));
     }
 
     /**
-     * Start a new super column with name {@code name}.
-     * @param name the name for the super column
+     * Returns a PartitionUpdate suitable to write on this writer for the provided key.
+     *
+     * @param key they partition key for which the returned update will be.
+     * @return an update on partition {@code key} that is tied to this writer.
      */
-    public void newSuperColumn(ByteBuffer name)
-    {
-        if (!columnFamily.metadata().isSuper())
-            throw new IllegalStateException("Cannot add a super column to a standard table");
-
-        currentSuperColumn = name;
-    }
-
-    protected void addColumn(Cell cell) throws IOException
-    {
-        if (columnFamily.metadata().isSuper())
-        {
-            if (currentSuperColumn == null)
-                throw new IllegalStateException("Trying to add a cell to a super column family, but no super cell has been started.");
-
-            cell = cell.withUpdatedName(columnFamily.getComparator().makeCellName(currentSuperColumn, cell.name().toByteBuffer()));
-        }
-        columnFamily.addColumn(cell);
-    }
-
-    /**
-     * Insert a new "regular" column to the current row (and super column if applicable).
-     * @param name the column name
-     * @param value the column value
-     * @param timestamp the column timestamp
-     */
-    public void addColumn(ByteBuffer name, ByteBuffer value, long timestamp) throws IOException
-    {
-        addColumn(new BufferCell(metadata.comparator.cellFromByteBuffer(name), value, timestamp));
-    }
-
-    /**
-     * Insert a new expiring column to the current row (and super column if applicable).
-     * @param name the column name
-     * @param value the column value
-     * @param timestamp the column timestamp
-     * @param ttl the column time to live in seconds
-     * @param expirationTimestampMS the local expiration timestamp in milliseconds. This is the server time timestamp used for actually
-     * expiring the column, and as a consequence should be synchronized with the cassandra servers time. If {@code timestamp} represents
-     * the insertion time in microseconds (which is not required), this should be {@code (timestamp / 1000) + (ttl * 1000)}.
-     */
-    public void addExpiringColumn(ByteBuffer name, ByteBuffer value, long timestamp, int ttl, long expirationTimestampMS) throws IOException
-    {
-        int localExpirationTime = (int) (expirationTimestampMS / 1000);
-        try
-        {
-            // This will throw exception if policy is REJECT and now() + ttl is higher than MAX_DELETION_TIME
-            Attributes.maybeApplyExpirationDateOverflowPolicy(metadata, ttl, false);
-            // If exception was not thrown, this means the policy was CAP, so we check for overflow and cap if that's the case
-            if (localExpirationTime < 0)
-                localExpirationTime = BufferExpiringCell.MAX_DELETION_TIME;
-        }
-        catch (InvalidRequestException e)
-        {
-            throw new RuntimeException(e);
-        }
-        addColumn(new BufferExpiringCell(metadata.comparator.cellFromByteBuffer(name), value, timestamp, ttl, localExpirationTime));
-    }
-
-    /**
-     * Insert a new counter column to the current row (and super column if applicable).
-     * @param name the column name
-     * @param value the value of the counter
-     */
-    public void addCounterColumn(ByteBuffer name, long value) throws IOException
-    {
-        addColumn(new BufferCounterCell(metadata.comparator.cellFromByteBuffer(name),
-                                        CounterContext.instance().createGlobal(counterid, 1L, value),
-                                        System.currentTimeMillis()));
-    }
-
-    /**
-     * Package protected for use by AbstractCQLSSTableWriter.
-     * Not meant to be exposed publicly.
-     */
-    ColumnFamily currentColumnFamily()
-    {
-        return columnFamily;
-    }
-
-    /**
-     * Package protected for use by AbstractCQLSSTableWriter.
-     * Not meant to be exposed publicly.
-     */
-    DecoratedKey currentKey()
-    {
-        return currentKey;
-    }
-
-    /**
-     * Package protected for use by AbstractCQLSSTableWriter.
-     * Not meant to be exposed publicly.
-     */
-    boolean shouldStartNewRow() throws IOException
-    {
-        return currentKey == null;
-    }
-
-    protected abstract void writeRow(DecoratedKey key, ColumnFamily columnFamily) throws IOException;
-
-    protected abstract ColumnFamily getColumnFamily() throws IOException;
-
-    public abstract Descriptor getCurrentDescriptor();
+    abstract PartitionUpdate getUpdateFor(DecoratedKey key) throws IOException;
 }
+

diff --git a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
index 33ddbe0..39f7339 100644
--- a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java

@@ -23,27 +23,27 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import com.google.common.collect.ImmutableMap;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Config;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.*;
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.statements.CFStatement;
 import org.apache.cassandra.cql3.statements.CreateTableStatement;
 import org.apache.cassandra.cql3.statements.ParsedStatement;
 import org.apache.cassandra.cql3.statements.UpdateStatement;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.composites.Composite;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.locator.AbstractReplicationStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.schema.Tables;
+import org.apache.cassandra.schema.Types;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.utils.Pair;
 
@@ -86,6 +86,9 @@
     static
     {
         Config.setClientMode(true);
+        // Partitioner is not set in client mode.
+        if (DatabaseDescriptor.getPartitioner() == null)
+            DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
     }
 
     private final AbstractSSTableSimpleWriter writer;
@@ -212,26 +215,28 @@
 
         QueryOptions options = QueryOptions.forInternalCalls(null, values);
         List<ByteBuffer> keys = insert.buildPartitionKeyNames(options);
-        Composite clusteringPrefix = insert.createClusteringPrefix(options);
+        SortedSet<Clustering> clusterings = insert.createClustering(options);
 
         long now = System.currentTimeMillis() * 1000;
+        // Note that we asks indexes to not validate values (the last 'false' arg below) because that triggers a 'Keyspace.open'
+        // and that forces a lot of initialization that we don't want.
         UpdateParameters params = new UpdateParameters(insert.cfm,
+                                                       insert.updatedColumns(),
                                                        options,
                                                        insert.getTimestamp(now, options),
                                                        insert.getTimeToLive(options),
-                                                       Collections.<ByteBuffer, CQL3Row>emptyMap());
+                                                       Collections.<DecoratedKey, Partition>emptyMap());
 
         try
         {
             for (ByteBuffer key : keys)
             {
-                if (writer.shouldStartNewRow() || !key.equals(writer.currentKey().getKey()))
-                    writer.newRow(key);
-                insert.addUpdateForKey(writer.currentColumnFamily(), key, clusteringPrefix, params, false);
+                for (Clustering clustering : clusterings)
+                    insert.addUpdateForKey(writer.getUpdateFor(key), clustering, params);
             }
             return this;
         }
-        catch (BufferedWriter.SyncException e)
+        catch (SSTableSimpleUnsortedWriter.SyncException e)
         {
             // If we use a BufferedWriter and had a problem writing to disk, the IOException has been
             // wrapped in a SyncException (see BufferedWriter below). We want to extract that IOE.
@@ -277,23 +282,12 @@
         writer.close();
     }
 
-    public Descriptor getCurrentDescriptor()
-    {
-        return writer.getCurrentDescriptor();
-    }
-
-    public CFMetaData getCFMetaData()
-    {
-        return writer.metadata;
-    }
-
     /**
      * A Builder for a CQLSSTableWriter object.
      */
     public static class Builder
     {
         private File directory;
-        private IPartitioner partitioner = Murmur3Partitioner.instance;
 
         protected SSTableFormat.Type formatType = null;
 
@@ -362,11 +356,16 @@
             {
                 synchronized (CQLSSTableWriter.class)
                 {
-                    this.schema = getStatement(schema, CreateTableStatement.class, "CREATE TABLE").left.getCFMetaData().rebuild();
+                    if (Schema.instance.getKSMetaData(SchemaKeyspace.NAME) == null)
+                        Schema.instance.load(SchemaKeyspace.metadata());
+                    if (Schema.instance.getKSMetaData(SystemKeyspace.NAME) == null)
+                        Schema.instance.load(SystemKeyspace.metadata());
+
+                    this.schema = getTableMetadata(schema);
 
                     // We need to register the keyspace/table metadata through Schema, otherwise we won't be able to properly
                     // build the insert statement in using().
-                    KSMetaData ksm = Schema.instance.getKSMetaData(this.schema.ksName);
+                    KeyspaceMetadata ksm = Schema.instance.getKSMetaData(this.schema.ksName);
                     if (ksm == null)
                     {
                         createKeyspaceWithTable(this.schema);
@@ -384,25 +383,14 @@
             }
         }
 
-        CFMetaData metadata()
-        {
-            return schema;
-        }
-
         /**
          * Creates the keyspace with the specified table.
          *
-         * @param the table the table that must be created.
+         * @param table the table that must be created.
          */
         private static void createKeyspaceWithTable(CFMetaData table)
         {
-            KSMetaData ksm;
-            ksm = KSMetaData.newKeyspace(table.ksName,
-                                         AbstractReplicationStrategy.getClass("org.apache.cassandra.locator.SimpleStrategy"),
-                                         ImmutableMap.of("replication_factor", "1"),
-                                         true,
-                                         Collections.singleton(table));
-            Schema.instance.load(ksm);
+            Schema.instance.load(KeyspaceMetadata.create(table.ksName, KeyspaceParams.simple(1), Tables.of(table)));
         }
 
         /**
@@ -411,11 +399,10 @@
          * @param keyspace the keyspace to add to
          * @param table the table to add
          */
-        private static void addTableToKeyspace(KSMetaData keyspace, CFMetaData table)
+        private static void addTableToKeyspace(KeyspaceMetadata keyspace, CFMetaData table)
         {
-            KSMetaData clone = keyspace.cloneWithTableAdded(table);
             Schema.instance.load(table);
-            Schema.instance.setKeyspaceDefinition(clone);
+            Schema.instance.setKeyspaceMetadata(keyspace.withSwapped(keyspace.tables.with(table)));
         }
 
         /**
@@ -430,7 +417,7 @@
          */
         public Builder withPartitioner(IPartitioner partitioner)
         {
-            this.partitioner = partitioner;
+            this.schema = schema.copy(partitioner);
             return this;
         }
 
@@ -510,6 +497,16 @@
             return this;
         }
 
+        private static CFMetaData getTableMetadata(String schema)
+        {
+            CFStatement parsed = (CFStatement)QueryProcessor.parseStatement(schema);
+            // tables with UDTs are currently not supported by CQLSSTableWrite, so we just use Types.none(), for now
+            // see CASSANDRA-10624 for more details
+            CreateTableStatement statement = (CreateTableStatement) ((CreateTableStatement.RawStatement) parsed).prepare(Types.none()).statement;
+            statement.validate(ClientState.forInternalCalls());
+            return statement.getCFMetaData();
+        }
+
         private static <T extends CQLStatement> Pair<T, List<ColumnSpecification>> getStatement(String query, Class<T> klass, String type)
         {
             try
@@ -541,8 +538,8 @@
                 throw new IllegalStateException("No insert statement specified, you should provide an insert statement through using()");
 
             AbstractSSTableSimpleWriter writer = sorted
-                                               ? new SSTableSimpleWriter(directory, schema, partitioner)
-                                               : new BufferedWriter(directory, schema, partitioner, bufferSizeInMB);
+                                               ? new SSTableSimpleWriter(directory, schema, insert.updatedColumns())
+                                               : new SSTableSimpleUnsortedWriter(directory, schema, insert.updatedColumns(), bufferSizeInMB);
 
             if (formatType != null)
                 writer.setSSTableFormatType(formatType);
@@ -550,81 +547,4 @@
             return new CQLSSTableWriter(writer, insert, boundNames);
         }
     }
-
-    /**
-     * CQLSSTableWriter doesn't use the method addColumn() from AbstractSSTableSimpleWriter.
-     * Instead, it adds cells directly to the ColumnFamily the latter exposes. But this means
-     * that the sync() method of SSTableSimpleUnsortedWriter is not called (at least not for
-     * each CQL row, so adding many rows to the same partition can buffer too much data in
-     * memory - #7360). So we create a slightly modified SSTableSimpleUnsortedWriter that uses
-     * a tweaked ColumnFamily object that calls back the proper method after each added cell
-     * so we sync when we should.
-     */
-    private static class BufferedWriter extends SSTableSimpleUnsortedWriter
-    {
-        private boolean needsSync = false;
-
-        public BufferedWriter(File directory, CFMetaData metadata, IPartitioner partitioner, long bufferSizeInMB)
-        {
-            super(directory, metadata, partitioner, bufferSizeInMB);
-        }
-
-        @Override
-        protected ColumnFamily createColumnFamily()
-        {
-            return new ArrayBackedSortedColumns(metadata, false)
-            {
-                @Override
-                public void addColumn(Cell cell)
-                {
-                    super.addColumn(cell);
-                    try
-                    {
-                        countColumn(cell);
-                    }
-                    catch (IOException e)
-                    {
-                        // addColumn does not throw IOException but we want to report this to the user,
-                        // so wrap it in a temporary RuntimeException that we'll catch in rawAddRow above.
-                        throw new SyncException(e);
-                    }
-                }
-            };
-        }
-
-        @Override
-        protected void replaceColumnFamily() throws IOException
-        {
-            needsSync = true;
-        }
-
-        /**
-         * If we have marked that the column family is being replaced, when we start the next row,
-         * we should sync out the previous partition and create a new row based on the current value.
-         */
-        @Override
-        boolean shouldStartNewRow() throws IOException
-        {
-            if (needsSync)
-            {
-                needsSync = false;
-                super.sync();
-                return true;
-            }
-            return super.shouldStartNewRow();
-        }
-
-        protected void addColumn(Cell cell) throws IOException
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        static class SyncException extends RuntimeException
-        {
-            SyncException(IOException ioe)
-            {
-                super(ioe);
-            }
-        }
-    }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java b/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java
deleted file mode 100644
index 846634a..0000000
--- a/src/java/org/apache/cassandra/io/sstable/ColumnNameHelper.java
+++ /dev/null

@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable;
-
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.marshal.AbstractType;
-
-import static org.apache.cassandra.utils.ByteBufferUtil.minimalBufferFor;
-
-public class ColumnNameHelper
-{
-    private static List<ByteBuffer> maybeGrow(List<ByteBuffer> l, int size)
-    {
-        if (l.size() >= size)
-            return l;
-
-        List<ByteBuffer> nl = new ArrayList<>(size);
-        nl.addAll(l);
-        for (int i = l.size(); i < size; i++)
-            nl.add(null);
-        return nl;
-    }
-
-    private static List<ByteBuffer> getComponents(Composite prefix, int size)
-    {
-        List<ByteBuffer> l = new ArrayList<>(size);
-        for (int i = 0; i < size; i++)
-            l.add(prefix.get(i));
-        return l;
-    }
-
-    /**
-     * finds the max cell name component(s)
-     *
-     * Note that this method *can modify maxSeen*.
-     *
-     * @param maxSeen the max columns seen so far
-     * @param candidate the candidate column(s)
-     * @param comparator the comparator to use
-     * @return a list with the max column(s)
-     */
-    public static List<ByteBuffer> maxComponents(List<ByteBuffer> maxSeen, Composite candidate, CellNameType comparator)
-    {
-        // For a cell name, no reason to look more than the clustering prefix
-        // (and comparing the collection element would actually crash)
-        int size = Math.min(candidate.size(), comparator.clusteringPrefixSize());
-
-        if (maxSeen.isEmpty())
-            return getComponents(candidate, size);
-
-        // In most case maxSeen is big enough to hold the result so update it in place in those cases
-        maxSeen = maybeGrow(maxSeen, size);
-
-        for (int i = 0; i < size; i++)
-            maxSeen.set(i, max(maxSeen.get(i), candidate.get(i), comparator.subtype(i)));
-
-        return maxSeen;
-    }
-
-    /**
-     * finds the min cell name component(s)
-     *
-     * Note that this method *can modify maxSeen*.
-     *
-     * @param minSeen the max columns seen so far
-     * @param candidate the candidate column(s)
-     * @param comparator the comparator to use
-     * @return a list with the min column(s)
-     */
-    public static List<ByteBuffer> minComponents(List<ByteBuffer> minSeen, Composite candidate, CellNameType comparator)
-    {
-        // For a cell name, no reason to look more than the clustering prefix
-        // (and comparing the collection element would actually crash)
-        int size = Math.min(candidate.size(), comparator.clusteringPrefixSize());
-
-        if (minSeen.isEmpty())
-            return getComponents(candidate, size);
-
-        // In most case maxSeen is big enough to hold the result so update it in place in those cases
-        minSeen = maybeGrow(minSeen, size);
-
-        for (int i = 0; i < size; i++)
-            minSeen.set(i, min(minSeen.get(i), candidate.get(i), comparator.subtype(i)));
-
-        return minSeen;
-    }
-
-    /**
-     * return the min column
-     *
-     * note that comparator should not be of CompositeType!
-     *
-     * @param b1 lhs
-     * @param b2 rhs
-     * @param comparator the comparator to use
-     * @return the smallest column according to comparator
-     */
-    private static ByteBuffer min(ByteBuffer b1, ByteBuffer b2, AbstractType<?> comparator)
-    {
-        if (b1 == null)
-            return b2;
-        if (b2 == null)
-            return b1;
-
-        if (comparator.compare(b1, b2) >= 0)
-            return b2;
-        return b1;
-    }
-
-    /**
-     * return the max column
-     *
-     * note that comparator should not be of CompositeType!
-     *
-     * @param b1 lhs
-     * @param b2 rhs
-     * @param comparator the comparator to use
-     * @return the biggest column according to comparator
-     */
-    private static ByteBuffer max(ByteBuffer b1, ByteBuffer b2, AbstractType<?> comparator)
-    {
-        if (b1 == null)
-            return b2;
-        if (b2 == null)
-            return b1;
-
-        if (comparator.compare(b1, b2) >= 0)
-            return b1;
-        return b2;
-    }
-
-    /**
-     * Merge 2 lists of min cell name components.
-     *
-     * @param minColumnNames lhs
-     * @param candidates rhs
-     * @param comparator comparator to use
-     * @return a list with smallest column names according to (sub)comparator
-     */
-    public static List<ByteBuffer> mergeMin(List<ByteBuffer> minColumnNames, List<ByteBuffer> candidates, CellNameType comparator)
-    {
-        if (minColumnNames.isEmpty())
-            return minimalBuffersFor(candidates);
-
-        if (candidates.isEmpty())
-            return minColumnNames;
-
-        List<ByteBuffer> biggest = minColumnNames.size() > candidates.size() ? minColumnNames : candidates;
-        List<ByteBuffer> smallest = minColumnNames.size() > candidates.size() ? candidates : minColumnNames;
-
-        // We want to always copy the smallest list, and maybeGrow does it only if it's actually smaller
-        List<ByteBuffer> retList = smallest.size() == biggest.size()
-                                 ? new ArrayList<>(smallest)
-                                 : maybeGrow(smallest, biggest.size());
-
-        for (int i = 0; i < biggest.size(); i++)
-            retList.set(i, minimalBufferFor(min(retList.get(i), biggest.get(i), comparator.subtype(i))));
-
-        return retList;
-    }
-
-    private static List<ByteBuffer> minimalBuffersFor(List<ByteBuffer> candidates)
-    {
-        List<ByteBuffer> minimalBuffers = new ArrayList<ByteBuffer>(candidates.size());
-        for (ByteBuffer byteBuffer : candidates)
-            minimalBuffers.add(minimalBufferFor(byteBuffer));
-        return minimalBuffers;
-    }
-
-    /**
-     * Merge 2 lists of max cell name components.
-     *
-     * @param maxColumnNames lhs
-     * @param candidates rhs
-     * @param comparator comparator to use
-     * @return a list with biggest column names according to (sub)comparator
-     */
-    public static List<ByteBuffer> mergeMax(List<ByteBuffer> maxColumnNames, List<ByteBuffer> candidates, CellNameType comparator)
-    {
-        if (maxColumnNames.isEmpty())
-            return minimalBuffersFor(candidates);
-
-        if (candidates.isEmpty())
-            return maxColumnNames;
-
-        List<ByteBuffer> biggest = maxColumnNames.size() > candidates.size() ? maxColumnNames : candidates;
-        List<ByteBuffer> smallest = maxColumnNames.size() > candidates.size() ? candidates : maxColumnNames;
-
-        // We want to always copy the smallest list, and maybeGrow does it only if it's actually smaller
-        List<ByteBuffer> retList = smallest.size() == biggest.size()
-                                 ? new ArrayList<>(smallest)
-                                 : maybeGrow(smallest, biggest.size());
-
-        for (int i = 0; i < biggest.size(); i++)
-            retList.set(i, minimalBufferFor(max(retList.get(i), biggest.get(i), comparator.subtype(i))));
-
-        return retList;
-    }
-
-    /**
-     * Checks if the given min/max column names could overlap (i.e they could share some column names based on the max/min column names in the sstables)
-     */
-    public static boolean overlaps(List<ByteBuffer> minColumnNames1, List<ByteBuffer> maxColumnNames1, List<ByteBuffer> minColumnNames2, List<ByteBuffer> maxColumnNames2, CellNameType comparator)
-    {
-        if (minColumnNames1.isEmpty() || maxColumnNames1.isEmpty() || minColumnNames2.isEmpty() || maxColumnNames2.isEmpty())
-            return true;
-
-        return !(compare(maxColumnNames1, minColumnNames2, comparator) < 0 || compare(minColumnNames1, maxColumnNames2, comparator) > 0);
-    }
-
-    private static int compare(List<ByteBuffer> columnNames1, List<ByteBuffer> columnNames2, CellNameType comparator)
-    {
-        for (int i = 0; i < Math.min(columnNames1.size(), columnNames2.size()); i++)
-        {
-            int cmp = comparator.subtype(i).compare(columnNames1.get(i), columnNames2.get(i));
-            if (cmp != 0)
-                return cmp;
-        }
-        return 0;
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/sstable/ColumnStats.java b/src/java/org/apache/cassandra/io/sstable/ColumnStats.java
deleted file mode 100644
index a1cb199..0000000
--- a/src/java/org/apache/cassandra/io/sstable/ColumnStats.java
+++ /dev/null

@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import org.apache.cassandra.utils.StreamingHistogram;
-
-/**
- * ColumnStats holds information about the columns for one row inside sstable
- */
-public class ColumnStats
-{
-    /** how many columns are there in the row */
-    public final int columnCount;
-
-    /** the largest (client-supplied) timestamp in the row */
-    public final long minTimestamp;
-    public final long maxTimestamp;
-    public final int maxLocalDeletionTime;
-    /** histogram of tombstone drop time */
-    public final StreamingHistogram tombstoneHistogram;
-
-    /** max and min column names according to comparator */
-    public final List<ByteBuffer> minColumnNames;
-    public final List<ByteBuffer> maxColumnNames;
-
-    public final boolean hasLegacyCounterShards;
-
-    public ColumnStats(int columnCount,
-                       long minTimestamp,
-                       long maxTimestamp,
-                       int maxLocalDeletionTime,
-                       StreamingHistogram tombstoneHistogram,
-                       List<ByteBuffer> minColumnNames,
-                       List<ByteBuffer> maxColumnNames,
-                       boolean hasLegacyCounterShards)
-    {
-        this.minTimestamp = minTimestamp;
-        this.maxTimestamp = maxTimestamp;
-        this.maxLocalDeletionTime = maxLocalDeletionTime;
-        this.columnCount = columnCount;
-        this.tombstoneHistogram = tombstoneHistogram;
-        this.minColumnNames = minColumnNames;
-        this.maxColumnNames = maxColumnNames;
-        this.hasLegacyCounterShards = hasLegacyCounterShards;
-    }
-
-    // We use explicit classes for ints and longs instead of generics to avoid boxing and unboxing (See CASSANDRA-8109)
-    public static class MinLongTracker
-    {
-        private final long defaultValue;
-        private boolean isSet = false;
-        private long value;
-
-        public MinLongTracker(long defaultValue)
-        {
-            this.defaultValue = defaultValue;
-        }
-
-        public void update(long value)
-        {
-            if (!isSet)
-            {
-                this.value = value;
-                isSet = true;
-            }
-            else
-            {
-                if (value < this.value)
-                    this.value = value;
-            }
-        }
-
-        public long get()
-        {
-            if (isSet)
-                return value;
-            return defaultValue;
-        }
-    }
-
-    public static class MaxLongTracker
-    {
-        private final long defaultValue;
-        private boolean isSet = false;
-        private long value;
-
-        public MaxLongTracker(long defaultValue)
-        {
-            this.defaultValue = defaultValue;
-        }
-
-        public void update(long value)
-        {
-            if (!isSet)
-            {
-                this.value = value;
-                isSet = true;
-            }
-            else
-            {
-                if (value >this.value)
-                    this.value = value;
-            }
-        }
-
-        public long get()
-        {
-            if (isSet)
-                return value;
-            return defaultValue;
-        }
-    }
-
-    public static class MaxIntTracker
-    {
-        private final int defaultValue;
-        private boolean isSet = false;
-        private int value;
-
-        public MaxIntTracker(int defaultValue)
-        {
-            this.defaultValue = defaultValue;
-        }
-
-        public void update(int value)
-        {
-            if (!isSet)
-            {
-                this.value = value;
-                isSet = true;
-            }
-            else
-            {
-                if (value > this.value)
-                    this.value = value;
-            }
-        }
-
-        public int get()
-        {
-            if (isSet)
-                return value;
-            return defaultValue;
-        }
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/io/sstable/Component.java b/src/java/org/apache/cassandra/io/sstable/Component.java
index a431f29..9454882 100644
--- a/src/java/org/apache/cassandra/io/sstable/Component.java
+++ b/src/java/org/apache/cassandra/io/sstable/Component.java

@@ -22,6 +22,7 @@
 
 import com.google.common.base.Objects;
 
+import org.apache.cassandra.utils.ChecksumType;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -34,6 +35,7 @@
     public static final char separator = '-';
 
     final static EnumSet<Type> TYPES = EnumSet.allOf(Type.class);
+
     public enum Type
     {
         // the base data for an sstable: the remaining components can be regenerated
@@ -48,7 +50,7 @@
         // statistical metadata about the content of the sstable
         STATS("Statistics.db"),
         // holds adler32 checksum of the data file
-        DIGEST("Digest.adler32"),
+        DIGEST("Digest.crc32", "Digest.adler32", "Digest.sha1"),
         // holds the CRC32 for chunks in an a uncompressed file.
         CRC("CRC.db"),
         // holds SSTable Index Summary (sampling of Index component)
@@ -56,19 +58,25 @@
         // table of contents, stores the list of all components for the sstable
         TOC("TOC.txt"),
         // custom component, used by e.g. custom compaction strategy
-        CUSTOM(null);
-
-        final String repr;
+        CUSTOM(new String[] { null });
+        
+        final String[] repr;
         Type(String repr)
         {
+            this(new String[] { repr });
+        }
+
+        Type(String... repr)
+        {
             this.repr = repr;
         }
 
         static Type fromRepresentation(String repr)
         {
             for (Type type : TYPES)
-                if (repr.equals(type.repr))
-                    return type;
+                for (String representation : type.repr)
+                    if (repr.equals(representation))
+                        return type;
             return CUSTOM;
         }
     }
@@ -79,18 +87,36 @@
     public final static Component FILTER = new Component(Type.FILTER);
     public final static Component COMPRESSION_INFO = new Component(Type.COMPRESSION_INFO);
     public final static Component STATS = new Component(Type.STATS);
-    public final static Component DIGEST = new Component(Type.DIGEST);
+    private static final String digestCrc32 = "Digest.crc32";
+    private static final String digestAdler32 = "Digest.adler32";
+    private static final String digestSha1 = "Digest.sha1";
+    public final static Component DIGEST_CRC32 = new Component(Type.DIGEST, digestCrc32);
+    public final static Component DIGEST_ADLER32 = new Component(Type.DIGEST, digestAdler32);
+    public final static Component DIGEST_SHA1 = new Component(Type.DIGEST, digestSha1);
     public final static Component CRC = new Component(Type.CRC);
     public final static Component SUMMARY = new Component(Type.SUMMARY);
     public final static Component TOC = new Component(Type.TOC);
 
+    public static Component digestFor(ChecksumType checksumType)
+    {
+        switch (checksumType)
+        {
+            case Adler32:
+                return DIGEST_ADLER32;
+            case CRC32:
+                return DIGEST_CRC32;
+        }
+        throw new AssertionError();
+    }
+
     public final Type type;
     public final String name;
     public final int hashCode;
 
     public Component(Type type)
     {
-        this(type, type.repr);
+        this(type, type.repr[0]);
+        assert type.repr.length == 1;
         assert type != Type.CUSTOM;
     }
 
@@ -132,7 +158,14 @@
             case FILTER:            component = Component.FILTER;                       break;
             case COMPRESSION_INFO:  component = Component.COMPRESSION_INFO;             break;
             case STATS:             component = Component.STATS;                        break;
-            case DIGEST:            component = Component.DIGEST;                       break;
+            case DIGEST:            switch (path.right)
+                                    {
+                                        case digestCrc32:   component = Component.DIGEST_CRC32;     break;
+                                        case digestAdler32: component = Component.DIGEST_ADLER32;   break;
+                                        case digestSha1:    component = Component.DIGEST_SHA1;      break;
+                                        default:            throw new IllegalArgumentException("Invalid digest component " + path.right);
+                                    }
+                                    break;
             case CRC:               component = Component.CRC;                          break;
             case SUMMARY:           component = Component.SUMMARY;                      break;
             case TOC:               component = Component.TOC;                          break;

diff --git a/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java b/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java
index 0fe316d..93be2ee 100644
--- a/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java
+++ b/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java

@@ -23,13 +23,13 @@
 {
     public final File path;
 
-    public CorruptSSTableException(Exception cause, File path)
+    public CorruptSSTableException(Throwable cause, File path)
     {
         super("Corrupted: " + path, cause);
         this.path = path;
     }
 
-    public CorruptSSTableException(Exception cause, String path)
+    public CorruptSSTableException(Throwable cause, String path)
     {
         this(cause, new File(path));
     }

diff --git a/src/java/org/apache/cassandra/io/sstable/Descriptor.java b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
index d9c7550..811e1a1 100644
--- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java
+++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java

@@ -20,10 +20,10 @@
 import java.io.File;
 import java.io.IOError;
 import java.io.IOException;
-import java.util.ArrayDeque;
-import java.util.Deque;
-import java.util.StringTokenizer;
+import java.util.*;
+import java.util.regex.Pattern;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.CharMatcher;
 import com.google.common.base.Objects;
 
@@ -31,6 +31,7 @@
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
 import org.apache.cassandra.io.sstable.metadata.IMetadataSerializer;
 import org.apache.cassandra.io.sstable.metadata.LegacyMetadataSerializer;
 import org.apache.cassandra.io.sstable.metadata.MetadataSerializer;
@@ -47,18 +48,7 @@
  */
 public class Descriptor
 {
-
-    public static enum Type
-    {
-        TEMP("tmp", true), TEMPLINK("tmplink", true), FINAL(null, false);
-        public final boolean isTemporary;
-        public final String marker;
-        Type(String marker, boolean isTemporary)
-        {
-            this.isTemporary = isTemporary;
-            this.marker = marker;
-        }
-    }
+    public static String TMP_EXT = ".tmp";
 
     /** canonicalized path to the directory where SSTable resides */
     public final File directory;
@@ -67,29 +57,35 @@
     public final String ksname;
     public final String cfname;
     public final int generation;
-    public final Type type;
     public final SSTableFormat.Type formatType;
+    /** digest component - might be {@code null} for old, legacy sstables */
+    public final Component digestComponent;
     private final int hashCode;
 
     /**
      * A descriptor that assumes CURRENT_VERSION.
      */
-    public Descriptor(File directory, String ksname, String cfname, int generation, Type temp)
+    @VisibleForTesting
+    public Descriptor(File directory, String ksname, String cfname, int generation)
     {
-        this(DatabaseDescriptor.getSSTableFormat().info.getLatestVersion(), directory, ksname, cfname, generation, temp, DatabaseDescriptor.getSSTableFormat());
+        this(DatabaseDescriptor.getSSTableFormat().info.getLatestVersion(), directory, ksname, cfname, generation, DatabaseDescriptor.getSSTableFormat(), null);
     }
 
-    public Descriptor(File directory, String ksname, String cfname, int generation, Type temp, SSTableFormat.Type formatType)
+    /**
+     * Constructor for sstable writers only.
+     */
+    public Descriptor(File directory, String ksname, String cfname, int generation, SSTableFormat.Type formatType)
     {
-        this(formatType.info.getLatestVersion(), directory, ksname, cfname, generation, temp, formatType);
+        this(formatType.info.getLatestVersion(), directory, ksname, cfname, generation, formatType, Component.digestFor(BigFormat.latestVersion.uncompressedChecksumType()));
     }
 
-    public Descriptor(String version, File directory, String ksname, String cfname, int generation, Type temp, SSTableFormat.Type formatType)
+    @VisibleForTesting
+    public Descriptor(String version, File directory, String ksname, String cfname, int generation, SSTableFormat.Type formatType)
     {
-        this(formatType.info.getVersion(version), directory, ksname, cfname, generation, temp, formatType);
+        this(formatType.info.getVersion(version), directory, ksname, cfname, generation, formatType, Component.digestFor(BigFormat.latestVersion.uncompressedChecksumType()));
     }
 
-    public Descriptor(Version version, File directory, String ksname, String cfname, int generation, Type temp, SSTableFormat.Type formatType)
+    public Descriptor(Version version, File directory, String ksname, String cfname, int generation, SSTableFormat.Type formatType, Component digestComponent)
     {
         assert version != null && directory != null && ksname != null && cfname != null && formatType.info.getLatestVersion().getClass().equals(version.getClass());
         this.version = version;
@@ -104,25 +100,35 @@
         this.ksname = ksname;
         this.cfname = cfname;
         this.generation = generation;
-        this.type = temp;
         this.formatType = formatType;
+        this.digestComponent = digestComponent;
 
-        hashCode = Objects.hashCode(version, this.directory, generation, ksname, cfname, temp, formatType);
+        hashCode = Objects.hashCode(version, this.directory, generation, ksname, cfname, formatType);
     }
 
     public Descriptor withGeneration(int newGeneration)
     {
-        return new Descriptor(version, directory, ksname, cfname, newGeneration, type, formatType);
+        return new Descriptor(version, directory, ksname, cfname, newGeneration, formatType, digestComponent);
     }
 
     public Descriptor withFormatType(SSTableFormat.Type newType)
     {
-        return new Descriptor(newType.info.getLatestVersion(), directory, ksname, cfname, generation, type, newType);
+        return new Descriptor(newType.info.getLatestVersion(), directory, ksname, cfname, generation, newType, digestComponent);
+    }
+
+    public Descriptor withDigestComponent(Component newDigestComponent)
+    {
+        return new Descriptor(version, directory, ksname, cfname, generation, formatType, newDigestComponent);
+    }
+
+    public String tmpFilenameFor(Component component)
+    {
+        return filenameFor(component) + TMP_EXT;
     }
 
     public String filenameFor(Component component)
     {
-        return filenameFor(component.name());
+        return baseFilename() + separator + component.name();
     }
 
     public String baseFilename()
@@ -140,8 +146,6 @@
             buff.append(ksname).append(separator);
             buff.append(cfname).append(separator);
         }
-        if (type.isTemporary)
-            buff.append(type.marker).append(separator);
         buff.append(version).append(separator);
         buff.append(generation);
         if (formatType != SSTableFormat.Type.LEGACY)
@@ -166,13 +170,43 @@
         return formatType.info;
     }
 
-    /**
-     * @param suffix A component suffix, such as 'Data.db'/'Index.db'/etc
-     * @return A filename for this descriptor with the given suffix.
-     */
-    public String filenameFor(String suffix)
+    /** Return any temporary files found in the directory */
+    public List<File> getTemporaryFiles()
     {
-        return baseFilename() + separator + suffix;
+        List<File> ret = new ArrayList<>();
+        File[] tmpFiles = directory.listFiles((dir, name) ->
+                                              name.endsWith(Descriptor.TMP_EXT));
+
+        for (File tmpFile : tmpFiles)
+            ret.add(tmpFile);
+
+        return ret;
+    }
+
+    /**
+     *  Files obsoleted by CASSANDRA-7066 : temporary files and compactions_in_progress. We support
+     *  versions 2.1 (ka) and 2.2 (la).
+     *  Temporary files have tmp- or tmplink- at the beginning for 2.2 sstables or after ks-cf- for 2.1 sstables
+     */
+
+    private final static String LEGACY_COMP_IN_PROG_REGEX_STR = "^compactions_in_progress(\\-[\\d,a-f]{32})?$";
+    private final static Pattern LEGACY_COMP_IN_PROG_REGEX = Pattern.compile(LEGACY_COMP_IN_PROG_REGEX_STR);
+    private final static String LEGACY_TMP_REGEX_STR = "^((.*)\\-(.*)\\-)?tmp(link)?\\-((?:l|k).)\\-(\\d)*\\-(.*)$";
+    private final static Pattern LEGACY_TMP_REGEX = Pattern.compile(LEGACY_TMP_REGEX_STR);
+
+    public static boolean isLegacyFile(File file)
+    {
+        if (file.isDirectory())
+            return file.getParentFile() != null &&
+                   file.getParentFile().getName().equalsIgnoreCase("system") &&
+                   LEGACY_COMP_IN_PROG_REGEX.matcher(file.getName()).matches();
+        else
+            return LEGACY_TMP_REGEX.matcher(file.getName()).matches();
+    }
+
+    public static boolean isValidFile(String fileName)
+    {
+        return fileName.endsWith(".db") && !LEGACY_TMP_REGEX.matcher(fileName).matches();
     }
 
     /**
@@ -236,7 +270,7 @@
         String component = skipComponent ? null : tokenStack.pop();
 
         nexttok = tokenStack.pop();
-        // generation OR Type
+        // generation OR format type
         SSTableFormat.Type fmt = SSTableFormat.Type.LEGACY;
         if (!CharMatcher.DIGIT.matchesAllOf(nexttok))
         {
@@ -254,20 +288,6 @@
         if (!version.validate(nexttok))
             throw new UnsupportedOperationException("SSTable " + name + " is too old to open.  Upgrade to 2.0 first, and run upgradesstables");
 
-        // optional temporary marker
-        Type type = Descriptor.Type.FINAL;
-        nexttok = tokenStack.peek();
-        if (Descriptor.Type.TEMP.marker.equals(nexttok))
-        {
-            type = Descriptor.Type.TEMP;
-            tokenStack.pop();
-        }
-        else if (Descriptor.Type.TEMPLINK.marker.equals(nexttok))
-        {
-            type = Descriptor.Type.TEMPLINK;
-            tokenStack.pop();
-        }
-
         // ks/cf names
         String ksname, cfname;
         if (version.hasNewFileName())
@@ -299,16 +319,10 @@
         }
         assert tokenStack.isEmpty() : "Invalid file name " + name + " in " + directory;
 
-        return Pair.create(new Descriptor(version, parentDirectory, ksname, cfname, generation, type, fmt), component);
-    }
-
-    /**
-     * @param type temporary flag
-     * @return A clone of this descriptor with the given 'temporary' status.
-     */
-    public Descriptor asType(Type type)
-    {
-        return new Descriptor(version, directory, ksname, cfname, generation, type, formatType);
+        return Pair.create(new Descriptor(version, parentDirectory, ksname, cfname, generation, fmt,
+                                          // _assume_ version from version
+                                          Component.digestFor(version.uncompressedChecksumType())),
+                           component);
     }
 
     public IMetadataSerializer getMetadataSerializer()
@@ -345,8 +359,7 @@
                        && that.generation == this.generation
                        && that.ksname.equals(this.ksname)
                        && that.cfname.equals(this.cfname)
-                       && that.formatType == this.formatType
-                       && that.type == this.type;
+                       && that.formatType == this.formatType;
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java b/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java
index b80bd87..7063057 100644
--- a/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java

@@ -19,14 +19,13 @@
 
 package org.apache.cassandra.io.sstable;
 
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.utils.CloseableIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 
 /**
  * An ISSTableScanner is an abstraction allowing multiple SSTableScanners to be
  * chained together under the hood.  See LeveledCompactionStrategy.getScanners.
  */
-public interface ISSTableScanner extends CloseableIterator<OnDiskAtomIterator>
+public interface ISSTableScanner extends UnfilteredPartitionIterator
 {
     public long getLengthInBytes();
     public long getCurrentPosition();

diff --git a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
index 3d304c5..74a0fc5 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexHelper.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexHelper.java

@@ -19,83 +19,45 @@
 
 import java.io.*;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.List;
 
-import org.apache.cassandra.db.composites.CType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.*;
 
 /**
  * Provides helper to serialize, deserialize and use column indexes.
  */
-public class IndexHelper
+public final class IndexHelper
 {
-    public static void skipBloomFilter(DataInput in) throws IOException
+    private IndexHelper()
     {
-        int size = in.readInt();
-        FileUtils.skipBytesFully(in, size);
-    }
-
-    /**
-     * Skip the index
-     * @param in the data input from which the index should be skipped
-     * @throws IOException if an I/O error occurs.
-     */
-    public static void skipIndex(DataInput in) throws IOException
-    {
-        /* read only the column index list */
-        int columnIndexSize = in.readInt();
-        /* skip the column index data */
-        if (in instanceof FileDataInput)
-        {
-            FileUtils.skipBytesFully(in, columnIndexSize);
-        }
-        else
-        {
-            // skip bytes
-            byte[] skip = new byte[columnIndexSize];
-            in.readFully(skip);
-        }
     }
 
     /**
      * The index of the IndexInfo in which a scan starting with @name should begin.
      *
-     * @param name
-     *         name of the index
-     *
-     * @param indexList
-     *          list of the indexInfo objects
-     *
-     * @param comparator
-     *          comparator type
-     *
-     * @param reversed
-     *          is name reversed
+     * @param name name to search for
+     * @param indexList list of the indexInfo objects
+     * @param comparator the comparator to use
+     * @param reversed whether or not the search is reversed, i.e. we scan forward or backward from name
+     * @param lastIndex where to start the search from in indexList
      *
      * @return int index
      */
-    public static int indexFor(Composite name, List<IndexInfo> indexList, CType comparator, boolean reversed, int lastIndex)
+    public static int indexFor(ClusteringPrefix name, List<IndexInfo> indexList, ClusteringComparator comparator, boolean reversed, int lastIndex)
     {
-        if (name.isEmpty())
-            return lastIndex >= 0 ? lastIndex : reversed ? indexList.size() - 1 : 0;
-
-        if (lastIndex >= indexList.size())
-            return -1;
-
-        IndexInfo target = new IndexInfo(name, name, 0, 0);
+        IndexInfo target = new IndexInfo(name, name, 0, 0, null);
         /*
         Take the example from the unit test, and say your index looks like this:
         [0..5][10..15][20..25]
         and you look for the slice [13..17].
 
-        When doing forward slice, we we doing a binary search comparing 13 (the start of the query)
+        When doing forward slice, we are doing a binary search comparing 13 (the start of the query)
         to the lastName part of the index slot. You'll end up with the "first" slot, going from left to right,
         that may contain the start.
 
@@ -105,81 +67,126 @@
         */
         int startIdx = 0;
         List<IndexInfo> toSearch = indexList;
-        if (lastIndex >= 0)
+        if (reversed)
         {
-            if (reversed)
+            if (lastIndex < indexList.size() - 1)
             {
                 toSearch = indexList.subList(0, lastIndex + 1);
             }
-            else
+        }
+        else
+        {
+            if (lastIndex > 0)
             {
                 startIdx = lastIndex;
                 toSearch = indexList.subList(lastIndex, indexList.size());
             }
         }
-        int index = Collections.binarySearch(toSearch, target, getComparator(comparator, reversed));
+        int index = Collections.binarySearch(toSearch, target, comparator.indexComparator(reversed));
         return startIdx + (index < 0 ? -index - (reversed ? 2 : 1) : index);
     }
 
-    public static Comparator<IndexInfo> getComparator(final CType nameComparator, boolean reversed)
-    {
-        return reversed ? nameComparator.indexReverseComparator() : nameComparator.indexComparator();
-    }
-
     public static class IndexInfo
     {
-        private static final long EMPTY_SIZE = ObjectSizes.measure(new IndexInfo(null, null, 0, 0));
+        private static final long EMPTY_SIZE = ObjectSizes.measure(new IndexInfo(null, null, 0, 0, null));
 
-        public final long width;
-        public final Composite lastName;
-        public final Composite firstName;
         public final long offset;
+        public final long width;
+        public final ClusteringPrefix firstName;
+        public final ClusteringPrefix lastName;
 
-        public IndexInfo(Composite firstName, Composite lastName, long offset, long width)
+        // If at the end of the index block there is an open range tombstone marker, this marker
+        // deletion infos. null otherwise.
+        public final DeletionTime endOpenMarker;
+
+        public IndexInfo(ClusteringPrefix firstName,
+                         ClusteringPrefix lastName,
+                         long offset,
+                         long width,
+                         DeletionTime endOpenMarker)
         {
             this.firstName = firstName;
             this.lastName = lastName;
             this.offset = offset;
             this.width = width;
+            this.endOpenMarker = endOpenMarker;
         }
 
-        public static class Serializer implements ISerializer<IndexInfo>
+        public static class Serializer
         {
-            private final CType type;
+            // This is the default index size that we use to delta-encode width when serializing so we get better vint-encoding.
+            // This is imperfect as user can change the index size and ideally we would save the index size used with each index file
+            // to use as base. However, that's a bit more involved a change that we want for now and very seldom do use change the index
+            // size so using the default is almost surely better than using no base at all.
+            public static final long WIDTH_BASE = 64 * 1024;
 
-            public Serializer(CType type)
+            private final ISerializer<ClusteringPrefix> clusteringSerializer;
+            private final Version version;
+
+            public Serializer(CFMetaData metadata, Version version, SerializationHeader header)
             {
-                this.type = type;
+                this.clusteringSerializer = metadata.serializers().indexEntryClusteringPrefixSerializer(version, header);
+                this.version = version;
             }
 
             public void serialize(IndexInfo info, DataOutputPlus out) throws IOException
             {
-                type.serializer().serialize(info.firstName, out);
-                type.serializer().serialize(info.lastName, out);
-                out.writeLong(info.offset);
-                out.writeLong(info.width);
+                assert version.storeRows() : "We read old index files but we should never write them";
+
+                clusteringSerializer.serialize(info.firstName, out);
+                clusteringSerializer.serialize(info.lastName, out);
+                out.writeUnsignedVInt(info.offset);
+                out.writeVInt(info.width - WIDTH_BASE);
+
+                out.writeBoolean(info.endOpenMarker != null);
+                if (info.endOpenMarker != null)
+                    DeletionTime.serializer.serialize(info.endOpenMarker, out);
             }
 
-            public IndexInfo deserialize(DataInput in) throws IOException
+            public IndexInfo deserialize(DataInputPlus in) throws IOException
             {
-                return new IndexInfo(type.serializer().deserialize(in),
-                                     type.serializer().deserialize(in),
-                                     in.readLong(),
-                                     in.readLong());
+                ClusteringPrefix firstName = clusteringSerializer.deserialize(in);
+                ClusteringPrefix lastName = clusteringSerializer.deserialize(in);
+                long offset;
+                long width;
+                DeletionTime endOpenMarker = null;
+                if (version.storeRows())
+                {
+                    offset = in.readUnsignedVInt();
+                    width = in.readVInt() + WIDTH_BASE;
+                    if (in.readBoolean())
+                        endOpenMarker = DeletionTime.serializer.deserialize(in);
+                }
+                else
+                {
+                    offset = in.readLong();
+                    width = in.readLong();
+                }
+                return new IndexInfo(firstName, lastName, offset, width, endOpenMarker);
             }
 
-            public long serializedSize(IndexInfo info, TypeSizes typeSizes)
+            public long serializedSize(IndexInfo info)
             {
-                return type.serializer().serializedSize(info.firstName, typeSizes)
-                     + type.serializer().serializedSize(info.lastName, typeSizes)
-                     + typeSizes.sizeof(info.offset)
-                     + typeSizes.sizeof(info.width);
+                assert version.storeRows() : "We read old index files but we should never write them";
+
+                long size = clusteringSerializer.serializedSize(info.firstName)
+                          + clusteringSerializer.serializedSize(info.lastName)
+                          + TypeSizes.sizeofUnsignedVInt(info.offset)
+                          + TypeSizes.sizeofVInt(info.width - WIDTH_BASE)
+                          + TypeSizes.sizeof(info.endOpenMarker != null);
+
+                if (info.endOpenMarker != null)
+                    size += DeletionTime.serializer.serializedSize(info.endOpenMarker);
+                return size;
             }
         }
 
         public long unsharedHeapSize()
         {
-            return EMPTY_SIZE + firstName.unsharedHeapSize() + lastName.unsharedHeapSize();
+            return EMPTY_SIZE
+                 + firstName.unsharedHeapSize()
+                 + lastName.unsharedHeapSize()
+                 + (endOpenMarker == null ? 0 : endOpenMarker.unsharedHeapSize());
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummary.java b/src/java/org/apache/cassandra/io/sstable/IndexSummary.java
index 7df7349..371a243 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummary.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummary.java

@@ -18,8 +18,6 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.DataInputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
@@ -28,11 +26,11 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.io.util.*;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.Ref;
 import org.apache.cassandra.utils.concurrent.WrappedSharedCloseable;
 import org.apache.cassandra.utils.memory.MemoryUtil;
 
@@ -110,7 +108,7 @@
 
     // binary search is notoriously more difficult to get right than it looks; this is lifted from
     // Harmony's Collections implementation
-    public int binarySearch(RowPosition key)
+    public int binarySearch(PartitionPosition key)
     {
         // We will be comparing non-native Keys, so use a buffer with appropriate byte order
         ByteBuffer hollow = MemoryUtil.getHollowDirectByteBuffer().order(ByteOrder.BIG_ENDIAN);
@@ -164,6 +162,13 @@
         entries.setByteBuffer(buffer, start, keySize);
     }
 
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        super.addTo(identities);
+        identities.add(offsets);
+        identities.add(entries);
+    }
+
     public long getPosition(int index)
     {
         return entries.getLong(calculateEnd(index) - 8);

diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java
index 0f604e0..cb6fcc0 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryBuilder.java

@@ -25,7 +25,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.io.util.Memory;
 import org.apache.cassandra.io.util.SafeMemoryWriter;
@@ -36,6 +38,9 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(IndexSummaryBuilder.class);
 
+    static final String defaultExpectedKeySizeName = Config.PROPERTY_PREFIX + "index_summary_expected_key_size";
+    static long defaultExpectedKeySize = Long.valueOf(System.getProperty(defaultExpectedKeySizeName, "64"));
+
     // the offset in the keys memory region to look for a given summary boundary
     private final SafeMemoryWriter offsets;
     private final SafeMemoryWriter entries;
@@ -85,20 +90,30 @@
         }
     }
 
+    /**
+     * Build an index summary builder.
+     *
+     * @param expectedKeys - the number of keys we expect in the sstable
+     * @param minIndexInterval - the minimum interval between entries selected for sampling
+     * @param samplingLevel - the level at which entries are sampled
+     */
     public IndexSummaryBuilder(long expectedKeys, int minIndexInterval, int samplingLevel)
     {
         this.samplingLevel = samplingLevel;
         this.startPoints = Downsampling.getStartPoints(BASE_SAMPLING_LEVEL, samplingLevel);
 
+        long expectedEntrySize = getEntrySize(defaultExpectedKeySize);
         long maxExpectedEntries = expectedKeys / minIndexInterval;
-        if (maxExpectedEntries > Integer.MAX_VALUE)
+        long maxExpectedEntriesSize = maxExpectedEntries * expectedEntrySize;
+        if (maxExpectedEntriesSize > Integer.MAX_VALUE)
         {
             // that's a _lot_ of keys, and a very low min index interval
-            int effectiveMinInterval = (int) Math.ceil((double) Integer.MAX_VALUE / expectedKeys);
+            int effectiveMinInterval = (int) Math.ceil((double)(expectedKeys * expectedEntrySize) / Integer.MAX_VALUE);
             maxExpectedEntries = expectedKeys / effectiveMinInterval;
-            assert maxExpectedEntries <= Integer.MAX_VALUE : maxExpectedEntries;
-            logger.warn("min_index_interval of {} is too low for {} expected keys; using interval of {} instead",
-                        minIndexInterval, expectedKeys, effectiveMinInterval);
+            maxExpectedEntriesSize = maxExpectedEntries * expectedEntrySize;
+            assert maxExpectedEntriesSize <= Integer.MAX_VALUE : maxExpectedEntriesSize;
+            logger.warn("min_index_interval of {} is too low for {} expected keys of avg size {}; using interval of {} instead",
+                        minIndexInterval, expectedKeys, defaultExpectedKeySize, effectiveMinInterval);
             this.minIndexInterval = effectiveMinInterval;
         }
         else
@@ -109,13 +124,30 @@
         // for initializing data structures, adjust our estimates based on the sampling level
         maxExpectedEntries = Math.max(1, (maxExpectedEntries * samplingLevel) / BASE_SAMPLING_LEVEL);
         offsets = new SafeMemoryWriter(4 * maxExpectedEntries).order(ByteOrder.nativeOrder());
-        entries = new SafeMemoryWriter(40 * maxExpectedEntries).order(ByteOrder.nativeOrder());
+        entries = new SafeMemoryWriter(expectedEntrySize * maxExpectedEntries).order(ByteOrder.nativeOrder());
 
         // the summary will always contain the first index entry (downsampling will never remove it)
         nextSamplePosition = 0;
         indexIntervalMatches++;
     }
 
+    /**
+     * Given a key, return how long the serialized index summary entry will be.
+     */
+    private static long getEntrySize(DecoratedKey key)
+    {
+        return getEntrySize(key.getKey().remaining());
+    }
+
+    /**
+     * Given a key size, return how long the serialized index summary entry will be, that is add 8 bytes to
+     * accomodate for the size of the position.
+     */
+    private static long getEntrySize(long keySize)
+    {
+        return keySize + TypeSizes.sizeof(0L);
+    }
+
     // the index file has been flushed to the provided position; stash it and use that to recalculate our max readable boundary
     public void markIndexSynced(long upToPosition)
     {
@@ -169,21 +201,29 @@
     {
         if (keysWritten == nextSamplePosition)
         {
-            assert entries.length() <= Integer.MAX_VALUE;
-            offsets.writeInt((int) entries.length());
-            entries.write(decoratedKey.getKey());
-            entries.writeLong(indexStart);
-            setNextSamplePosition(keysWritten);
+            if ((entries.length() + getEntrySize(decoratedKey)) <= Integer.MAX_VALUE)
+            {
+                offsets.writeInt((int) entries.length());
+                entries.write(decoratedKey.getKey());
+                entries.writeLong(indexStart);
+                setNextSamplePosition(keysWritten);
+            }
+            else
+            {
+                // we cannot fully sample this sstable due to too much memory in the index summary, so let's tell the user
+                logger.error("Memory capacity of index summary exceeded (2GB), index summary will not cover full sstable, " +
+                             "you should increase min_sampling_level");
+            }
         }
         else if (dataEnd != 0 && keysWritten + 1 == nextSamplePosition)
         {
             // this is the last key in this summary interval, so stash it
-            ReadableBoundary boundary = new ReadableBoundary(decoratedKey, indexEnd, dataEnd, (int)(offsets.length() / 4), entries.length());
+            ReadableBoundary boundary = new ReadableBoundary(decoratedKey, indexEnd, dataEnd, (int) (offsets.length() / 4), entries.length());
             lastReadableByData.put(dataEnd, boundary);
             lastReadableByIndex.put(indexEnd, boundary);
         }
-        keysWritten++;
 
+        keysWritten++;
         return this;
     }
 
@@ -251,12 +291,12 @@
         return accumulate;
     }
 
-    public static int entriesAtSamplingLevel(int samplingLevel, int maxSummarySize)
+    static int entriesAtSamplingLevel(int samplingLevel, int maxSummarySize)
     {
         return (int) Math.ceil((samplingLevel * maxSummarySize) / (double) BASE_SAMPLING_LEVEL);
     }
 
-    public static int calculateSamplingLevel(int currentSamplingLevel, int currentNumEntries, long targetNumEntries, int minIndexInterval, int maxIndexInterval)
+    static int calculateSamplingLevel(int currentSamplingLevel, int currentNumEntries, long targetNumEntries, int minIndexInterval, int maxIndexInterval)
     {
         // effective index interval == (BASE_SAMPLING_LEVEL / samplingLevel) * minIndexInterval
         // so we can just solve for minSamplingLevel here:

diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
index 9317132..dea1cd6 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java

@@ -42,6 +42,7 @@
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.ExecutorUtils;
@@ -185,7 +186,7 @@
         for (Keyspace ks : Keyspace.all())
         {
             for (ColumnFamilyStore cfStore: ks.getColumnFamilyStores())
-                result.addAll(cfStore.getSSTables());
+                result.addAll(cfStore.getLiveSSTables());
         }
 
         return result;
@@ -209,7 +210,7 @@
                 do
                 {
                     View view = cfStore.getTracker().getView();
-                    allSSTables = view.sstables;
+                    allSSTables = ImmutableSet.copyOf(view.select(SSTableSet.CANONICAL));
                     nonCompacting = ImmutableSet.copyOf(view.getUncompacting(allSSTables));
                 }
                 while (null == (txn = cfStore.getTracker().tryModify(nonCompacting, OperationType.UNKNOWN)));
@@ -223,6 +224,8 @@
 
     public void redistributeSummaries() throws IOException
     {
+        if (CompactionManager.instance.isGlobalCompactionPaused())
+            return;
         Pair<List<SSTableReader>, Map<UUID, LifecycleTransaction>> compactingAndNonCompacting = getCompactingAndNonCompactingSSTables();
         try
         {

diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
index 12586e5..45bd7eb 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java

@@ -30,16 +30,18 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.metrics.StorageMetrics;
 import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.concurrent.Refs;
 
@@ -159,8 +161,8 @@
             if (isStopRequested())
                 throw new CompactionInterruptedException(getCompactionInfo());
 
-            int minIndexInterval = sstable.metadata.getMinIndexInterval();
-            int maxIndexInterval = sstable.metadata.getMaxIndexInterval();
+            int minIndexInterval = sstable.metadata.params.minIndexInterval;
+            int maxIndexInterval = sstable.metadata.params.maxIndexInterval;
 
             double readsPerSec = sstable.getReadMeter() == null ? 0.0 : sstable.getReadMeter().fifteenMinuteRate();
             long idealSpace = Math.round(remainingSpace * (readsPerSec / totalReadsPerSec));
@@ -259,14 +261,39 @@
                          sstable, sstable.getIndexSummarySamplingLevel(), Downsampling.BASE_SAMPLING_LEVEL,
                          entry.newSamplingLevel, Downsampling.BASE_SAMPLING_LEVEL);
             ColumnFamilyStore cfs = Keyspace.open(sstable.metadata.ksName).getColumnFamilyStore(sstable.metadata.cfId);
+            long oldSize = sstable.bytesOnDisk();
             SSTableReader replacement = sstable.cloneWithNewSummarySamplingLevel(cfs, entry.newSamplingLevel);
+            long newSize = replacement.bytesOnDisk();
             newSSTables.add(replacement);
             transactions.get(sstable.metadata.cfId).update(replacement, true);
+            addHooks(cfs, transactions, oldSize, newSize);
         }
 
         return newSSTables;
     }
 
+    /**
+     * Add hooks to correctly update the storage load metrics once the transaction is closed/aborted
+     */
+    @SuppressWarnings("resource") // Transactions are closed in finally outside of this method
+    private void addHooks(ColumnFamilyStore cfs, Map<UUID, LifecycleTransaction> transactions, long oldSize, long newSize)
+    {
+        LifecycleTransaction txn = transactions.get(cfs.metadata.cfId);
+        txn.runOnCommit(() -> {
+            // The new size will be added in Transactional.commit() as an updated SSTable, more details: CASSANDRA-13738
+            StorageMetrics.load.dec(oldSize);
+            cfs.metric.liveDiskSpaceUsed.dec(oldSize);
+            cfs.metric.totalDiskSpaceUsed.dec(oldSize);
+        });
+        txn.runOnAbort(() -> {
+            // the local disk was modified but book keeping couldn't be commited, apply the delta
+            long delta = oldSize - newSize; // if new is larger this will be negative, so dec will become a inc
+            StorageMetrics.load.dec(delta);
+            cfs.metric.liveDiskSpaceUsed.dec(delta);
+            cfs.metric.totalDiskSpaceUsed.dec(delta);
+        });
+    }
+
     @VisibleForTesting
     static Pair<List<SSTableReader>, List<ResampleEntry>> distributeRemainingSpace(List<ResampleEntry> toDownsample, long remainingSpace)
     {
@@ -308,7 +335,12 @@
 
     public CompactionInfo getCompactionInfo()
     {
-        return new CompactionInfo(OperationType.INDEX_SUMMARY, (memoryPoolBytes - remainingSpace), memoryPoolBytes, "bytes", compactionId);
+        return new CompactionInfo(OperationType.INDEX_SUMMARY, (memoryPoolBytes - remainingSpace), memoryPoolBytes, Unit.BYTES, compactionId);
+    }
+
+    public boolean isGlobal()
+    {
+        return true;
     }
 
     /** Utility class for sorting sstables by their read rates. */

diff --git a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
index 4d1c663..f02b9d1 100644
--- a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java

@@ -20,23 +20,76 @@
 import java.io.File;
 import java.io.IOException;
 
-import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.utils.AbstractIterator;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CloseableIterator;
 
 public class KeyIterator extends AbstractIterator<DecoratedKey> implements CloseableIterator<DecoratedKey>
 {
-    private final RandomAccessReader in;
-
-    public KeyIterator(Descriptor desc)
+    private final static class In
     {
-        File path = new File(desc.filenameFor(Component.PRIMARY_INDEX));
-        in = RandomAccessReader.open(path);
+        private final File path;
+        private RandomAccessReader in;
+
+        public In(File path)
+        {
+            this.path = path;
+        }
+
+        private void maybeInit()
+        {
+            if (in == null)
+                in = RandomAccessReader.open(path);
+        }
+
+        public DataInputPlus get()
+        {
+            maybeInit();
+            return in;
+        }
+
+        public boolean isEOF()
+        {
+            maybeInit();
+            return in.isEOF();
+        }
+
+        public void close()
+        {
+            if (in != null)
+                in.close();
+        }
+
+        public long getFilePointer()
+        {
+            maybeInit();
+            return in.getFilePointer();
+        }
+
+        public long length()
+        {
+            maybeInit();
+            return in.length();
+        }
+    }
+
+    private final Descriptor desc;
+    private final In in;
+    private final IPartitioner partitioner;
+
+
+    public KeyIterator(Descriptor desc, CFMetaData metadata)
+    {
+        this.desc = desc;
+        in = new In(new File(desc.filenameFor(Component.PRIMARY_INDEX)));
+        partitioner = metadata.partitioner;
     }
 
     protected DecoratedKey computeNext()
@@ -45,8 +98,9 @@
         {
             if (in.isEOF())
                 return endOfData();
-            DecoratedKey key = StorageService.getPartitioner().decorateKey(ByteBufferUtil.readWithShortLength(in));
-            RowIndexEntry.Serializer.skip(in); // skip remainder of the entry
+
+            DecoratedKey key = partitioner.decorateKey(ByteBufferUtil.readWithShortLength(in.get()));
+            RowIndexEntry.Serializer.skip(in.get(), desc.version); // skip remainder of the entry
             return key;
         }
         catch (IOException e)

diff --git a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
index a1fda57..6f395f8 100644
--- a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.io.sstable;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
@@ -33,42 +32,53 @@
  */
 public class ReducingKeyIterator implements CloseableIterator<DecoratedKey>
 {
-    private final IMergeIterator<DecoratedKey,DecoratedKey> mi;
+    private final ArrayList<KeyIterator> iters;
+    private IMergeIterator<DecoratedKey,DecoratedKey> mi;
 
     public ReducingKeyIterator(Collection<SSTableReader> sstables)
     {
-        ArrayList<KeyIterator> iters = new ArrayList<KeyIterator>(sstables.size());
+        iters = new ArrayList<>(sstables.size());
         for (SSTableReader sstable : sstables)
-            iters.add(new KeyIterator(sstable.descriptor));
-        mi = MergeIterator.get(iters, DecoratedKey.comparator, new MergeIterator.Reducer<DecoratedKey,DecoratedKey>()
-        {
-            DecoratedKey reduced = null;
-
-            @Override
-            public boolean trivialReduceIsTrivial()
-            {
-                return true;
-            }
-
-            public void reduce(DecoratedKey current)
-            {
-                reduced = current;
-            }
-
-            protected DecoratedKey getReduced()
-            {
-                return reduced;
-            }
-        });
+            iters.add(new KeyIterator(sstable.descriptor, sstable.metadata));
     }
 
-    public void close() throws IOException
+    private void maybeInit()
     {
-        mi.close();
+        if (mi == null)
+        {
+            mi = MergeIterator.get(iters, DecoratedKey.comparator, new MergeIterator.Reducer<DecoratedKey,DecoratedKey>()
+            {
+                DecoratedKey reduced = null;
+
+                @Override
+                public boolean trivialReduceIsTrivial()
+                {
+                    return true;
+                }
+
+                public void reduce(int idx, DecoratedKey current)
+                {
+                    reduced = current;
+                }
+
+                protected DecoratedKey getReduced()
+                {
+                    return reduced;
+                }
+            });
+        }
+    }
+
+    public void close()
+    {
+        if (mi != null)
+            mi.close();
     }
 
     public long getTotalBytes()
     {
+        maybeInit();
+
         long m = 0;
         for (Iterator<DecoratedKey> iter : mi.iterators())
         {
@@ -79,6 +89,8 @@
 
     public long getBytesRead()
     {
+        maybeInit();
+
         long m = 0;
         for (Iterator<DecoratedKey> iter : mi.iterators())
         {
@@ -87,18 +99,15 @@
         return m;
     }
 
-    public String getTaskType()
-    {
-        return "Secondary index build";
-    }
-
     public boolean hasNext()
     {
+        maybeInit();
         return mi.hasNext();
     }
 
     public DecoratedKey next()
     {
+        maybeInit();
         return mi.next();
     }
 

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index b0aa89e..b5703fc 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java

@@ -18,6 +18,7 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.*;
+import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.*;
 import java.util.concurrent.CopyOnWriteArraySet;
@@ -38,7 +39,6 @@
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.concurrent.RefCounted;
 import org.apache.cassandra.utils.memory.HeapAllocator;
 import org.apache.cassandra.utils.Pair;
 
@@ -58,36 +58,37 @@
 {
     static final Logger logger = LoggerFactory.getLogger(SSTable.class);
 
+
     public static final int TOMBSTONE_HISTOGRAM_BIN_SIZE = 100;
+    public static final int TOMBSTONE_HISTOGRAM_SPOOL_SIZE = 100000;
+    public static final int TOMBSTONE_HISTOGRAM_TTL_ROUND_SECONDS = Integer.valueOf(System.getProperty("cassandra.streaminghistogram.roundseconds", "60"));
 
     public final Descriptor descriptor;
     protected final Set<Component> components;
     public final CFMetaData metadata;
-    public final IPartitioner partitioner;
     public final boolean compression;
 
     public DecoratedKey first;
     public DecoratedKey last;
 
-    protected SSTable(Descriptor descriptor, CFMetaData metadata, IPartitioner partitioner)
+    protected SSTable(Descriptor descriptor, CFMetaData metadata)
     {
-        this(descriptor, new HashSet<Component>(), metadata, partitioner);
+        this(descriptor, new HashSet<>(), metadata);
     }
 
-    protected SSTable(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner)
+    protected SSTable(Descriptor descriptor, Set<Component> components, CFMetaData metadata)
     {
         // In almost all cases, metadata shouldn't be null, but allowing null allows to create a mostly functional SSTable without
         // full schema definition. SSTableLoader use that ability
         assert descriptor != null;
         assert components != null;
-        assert partitioner != null;
+        assert metadata != null;
 
         this.descriptor = descriptor;
         Set<Component> dataComponents = new HashSet<>(components);
         this.compression = dataComponents.contains(Component.COMPRESSION_INFO);
         this.components = new CopyOnWriteArraySet<>(dataComponents);
         this.metadata = metadata;
-        this.partitioner = partitioner;
     }
 
     /**
@@ -103,6 +104,7 @@
      */
     public static boolean delete(Descriptor desc, Set<Component> components)
     {
+        logger.debug("Deleting sstable: {}", desc);
         // remove the DATA component first if it exists
         if (components.contains(Component.DATA))
             FileUtils.deleteWithConfirm(desc.filenameFor(Component.DATA));
@@ -113,12 +115,23 @@
 
             FileUtils.deleteWithConfirm(desc.filenameFor(component));
         }
-        FileUtils.delete(desc.filenameFor(Component.SUMMARY));
 
-        logger.trace("Deleted {}", desc);
+        if (components.contains(Component.SUMMARY))
+            FileUtils.delete(desc.filenameFor(Component.SUMMARY));
+
         return true;
     }
 
+    public IPartitioner getPartitioner()
+    {
+        return metadata.partitioner;
+    }
+
+    public DecoratedKey decorateKey(ByteBuffer key)
+    {
+        return getPartitioner().decorateKey(key);
+    }
+
     /**
      * If the given @param key occupies only part of a larger buffer, allocate a new buffer that is only
      * as large as necessary.
@@ -150,6 +163,14 @@
         return descriptor.ksname;
     }
 
+    public List<String> getAllFilePaths()
+    {
+        List<String> ret = new ArrayList<>();
+        for (Component component : components)
+            ret.add(descriptor.filenameFor(component));
+        return ret;
+    }
+
     /**
      * @return Descriptor and Component pair. null if given file is not acceptable as SSTable component.
      *         If component is of unknown type, returns CUSTOM component.
@@ -201,9 +222,17 @@
         Set<Component> components = Sets.newHashSetWithExpectedSize(knownTypes.size());
         for (Component.Type componentType : knownTypes)
         {
-            Component component = new Component(componentType);
-            if (new File(desc.filenameFor(component)).exists())
-                components.add(component);
+            if (componentType == Component.Type.DIGEST)
+            {
+                if (desc.digestComponent != null && new File(desc.filenameFor(desc.digestComponent)).exists())
+                    components.add(desc.digestComponent);
+            }
+            else
+            {
+                Component component = new Component(componentType);
+                if (new File(desc.filenameFor(component)).exists())
+                    components.add(component);
+            }
         }
         return components;
     }
@@ -217,7 +246,7 @@
         while (ifile.getFilePointer() < BYTES_CAP && keys < SAMPLES_CAP)
         {
             ByteBufferUtil.skipShortLength(ifile);
-            RowIndexEntry.Serializer.skip(ifile);
+            RowIndexEntry.Serializer.skip(ifile, descriptor.version);
             keys++;
         }
         assert keys > 0 && ifile.getFilePointer() > 0 && ifile.length() > 0 : "Unexpected empty index file: " + ifile;

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java b/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java
deleted file mode 100644
index ef16b5c..0000000
--- a/src/java/org/apache/cassandra/io/sstable/SSTableDeletingTask.java
+++ /dev/null

@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable;
-
-import java.io.File;
-import java.util.Collections;
-import java.util.Queue;
-import java.util.Set;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.TimeUnit;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.Sets;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.codahale.metrics.Counter;
-import org.apache.cassandra.concurrent.ScheduledExecutors;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.Blocker;
-
-public class SSTableDeletingTask implements Runnable
-{
-    private static final Logger logger = LoggerFactory.getLogger(SSTableDeletingTask.class);
-
-    // Deleting sstables is tricky because the mmapping might not have been finalized yet,
-    // and delete will fail (on Windows) until it is (we only force the unmapping on SUN VMs).
-    // Additionally, we need to make sure to delete the data file first, so on restart the others
-    // will be recognized as GCable.
-    private static final Queue<Runnable> failedTasks = new ConcurrentLinkedQueue<>();
-    private static final Blocker blocker = new Blocker();
-
-    private final Descriptor desc;
-    private final Set<Component> components;
-    private final long bytesOnDisk;
-    private final Counter totalDiskSpaceUsed;
-
-    /**
-     * realDescriptor is the actual descriptor for the sstable, the descriptor inside
-     * referent can be 'faked' as FINAL for early opened files. We need the real one
-     * to be able to remove the files.
-     */
-    public SSTableDeletingTask(Descriptor realDescriptor, Set<Component> components, Counter totalDiskSpaceUsed, long bytesOnDisk)
-    {
-        this.desc = realDescriptor;
-        this.bytesOnDisk = bytesOnDisk;
-        this.totalDiskSpaceUsed = totalDiskSpaceUsed;
-        switch (desc.type)
-        {
-            case FINAL:
-                this.components = components;
-                break;
-            case TEMPLINK:
-                this.components = Sets.newHashSet(Component.DATA, Component.PRIMARY_INDEX);
-                break;
-            default:
-                throw new IllegalStateException();
-        }
-    }
-
-    public void run()
-    {
-        blocker.ask();
-        // If we can't successfully delete the DATA component, set the task to be retried later: see above
-        File datafile = new File(desc.filenameFor(Component.DATA));
-        if (!datafile.delete())
-        {
-            logger.error("Unable to delete {} (it will be removed on server restart; we'll also retry after GC)", datafile);
-            failedTasks.add(this);
-            return;
-        }
-        // let the remainder be cleaned up by delete
-        SSTable.delete(desc, Sets.difference(components, Collections.singleton(Component.DATA)));
-        if (totalDiskSpaceUsed != null)
-            totalDiskSpaceUsed.dec(bytesOnDisk);
-    }
-
-    /**
-     * Retry all deletions that failed the first time around (presumably b/c the sstable was still mmap'd.)
-     * Useful because there are times when we know GC has been invoked; also exposed as an mbean.
-     */
-    public static void rescheduleFailedTasks()
-    {
-        Runnable task;
-        while ( null != (task = failedTasks.poll()))
-            ScheduledExecutors.nonPeriodicTasks.submit(task);
-
-        // On Windows, snapshots cannot be deleted so long as a segment of the root element is memory-mapped in NTFS.
-        SnapshotDeletingTask.rescheduleFailedTasks();
-    }
-
-    /** for tests */
-    @VisibleForTesting
-    public static void waitForDeletions()
-    {
-        Runnable runnable = new Runnable()
-        {
-            public void run()
-            {
-            }
-        };
-
-        FBUtilities.waitOnFuture(ScheduledExecutors.nonPeriodicTasks.schedule(runnable, 0, TimeUnit.MILLISECONDS));
-    }
-
-    @VisibleForTesting
-    public static void pauseDeletions(boolean stop)
-    {
-        blocker.block(stop);
-    }
-}
-

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
index 8c02ee7..a5af334 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java

@@ -18,34 +18,24 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.*;
-import java.util.Iterator;
 
-import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.utils.AbstractIterator;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.serializers.MarshalException;
 
-    public class SSTableIdentityIterator implements Comparable<SSTableIdentityIterator>, OnDiskAtomIterator
+public class SSTableIdentityIterator extends AbstractIterator<Unfiltered> implements Comparable<SSTableIdentityIterator>, UnfilteredRowIterator
 {
+    private final SSTableReader sstable;
     private final DecoratedKey key;
-    private final DataInput in;
-    public final ColumnSerializer.Flag flag;
-
-    private final ColumnFamily columnFamily;
-    private final Iterator<OnDiskAtom> atomIterator;
-    private final boolean validateColumns;
+    private final DeletionTime partitionLevelDeletion;
     private final String filename;
 
-    // Not every SSTableIdentifyIterator is attached to a sstable, so this can be null.
-    private final SSTableReader sstable;
+    protected final SSTableSimpleIterator iterator;
+    private final Row staticRow;
 
     /**
      * Used to iterate through the columns of a row.
@@ -55,127 +45,71 @@
      */
     public SSTableIdentityIterator(SSTableReader sstable, RandomAccessReader file, DecoratedKey key)
     {
-        this(sstable, file, key, false);
-    }
+        this.sstable = sstable;
+        this.filename = file.getPath();
+        this.key = key;
 
-    /**
-     * Used to iterate through the columns of a row.
-     * @param sstable SSTable we are reading ffrom.
-     * @param file Reading using this file.
-     * @param key Key of this row.
-     * @param checkData if true, do its best to deserialize and check the coherence of row data
-     */
-    public SSTableIdentityIterator(SSTableReader sstable, RandomAccessReader file, DecoratedKey key, boolean checkData)
-    {
-        this(sstable.metadata, file, file.getPath(), key, checkData, sstable, ColumnSerializer.Flag.LOCAL);
-    }
-
-    /**
-     * Used only by scrubber to solve problems with data written after the END_OF_ROW marker. Iterates atoms for the given dataSize only and does not accept an END_OF_ROW marker.
-     */
-    public static SSTableIdentityIterator createFragmentIterator(SSTableReader sstable, final RandomAccessReader file, DecoratedKey key, long dataSize, boolean checkData)
-    {
-        final ColumnSerializer.Flag flag = ColumnSerializer.Flag.LOCAL;
-        final CellNameType type = sstable.metadata.comparator;
-        final int expireBefore = (int) (System.currentTimeMillis() / 1000);
-        final Version version = sstable.descriptor.version;
-        final long dataEnd = file.getFilePointer() + dataSize;
-        return new SSTableIdentityIterator(sstable.metadata, file, file.getPath(), key, checkData, sstable, flag, DeletionTime.LIVE,
-                                           new AbstractIterator<OnDiskAtom>()
-                                                   {
-                                                       protected OnDiskAtom computeNext()
-                                                       {
-                                                           if (file.getFilePointer() >= dataEnd)
-                                                               return endOfData();
-                                                           try
-                                                           {
-                                                               return type.onDiskAtomSerializer().deserializeFromSSTable(file, flag, expireBefore, version);
-                                                           }
-                                                           catch (IOException e)
-                                                           {
-                                                               throw new IOError(e);
-                                                           }
-                                                       }
-                                                   });
-    }
-
-    // sstable may be null *if* checkData is false
-    // If it is null, we assume the data is in the current file format
-    private SSTableIdentityIterator(CFMetaData metadata,
-                                    FileDataInput in,
-                                    String filename,
-                                    DecoratedKey key,
-                                    boolean checkData,
-                                    SSTableReader sstable,
-                                    ColumnSerializer.Flag flag)
-    {
-        this(metadata, in, filename, key, checkData, sstable, flag, readDeletionTime(in, sstable, filename),
-             metadata.getOnDiskIterator(in, flag, (int) (System.currentTimeMillis() / 1000),
-                                        sstable == null ? DatabaseDescriptor.getSSTableFormat().info.getLatestVersion() : sstable.descriptor.version));
-    }
-
-    private static DeletionTime readDeletionTime(DataInput in, SSTableReader sstable, String filename)
-    {
         try
         {
-            return DeletionTime.serializer.deserialize(in);
+            this.partitionLevelDeletion = DeletionTime.serializer.deserialize(file);
+            SerializationHelper helper = new SerializationHelper(sstable.metadata, sstable.descriptor.version.correspondingMessagingVersion(), SerializationHelper.Flag.LOCAL);
+            this.iterator = SSTableSimpleIterator.create(sstable.metadata, file, sstable.header, helper, partitionLevelDeletion);
+            this.staticRow = iterator.readStaticRow();
         }
         catch (IOException e)
         {
-            if (sstable != null)
-                sstable.markSuspect();
+            sstable.markSuspect();
             throw new CorruptSSTableException(e, filename);
         }
     }
 
-    // sstable may be null *if* checkData is false
-    // If it is null, we assume the data is in the current file format
-    private SSTableIdentityIterator(CFMetaData metadata,
-                                    DataInput in,
-                                    String filename,
-                                    DecoratedKey key,
-                                    boolean checkData,
-                                    SSTableReader sstable,
-                                    ColumnSerializer.Flag flag,
-                                    DeletionTime deletion,
-                                    Iterator<OnDiskAtom> atomIterator)
+    public CFMetaData metadata()
     {
-        assert !checkData || (sstable != null);
-        this.in = in;
-        this.filename = filename;
-        this.key = key;
-        this.flag = flag;
-        this.validateColumns = checkData;
-        this.sstable = sstable;
-        columnFamily = ArrayBackedSortedColumns.factory.create(metadata);
-        columnFamily.delete(deletion);
-        this.atomIterator = atomIterator;
+        return sstable.metadata;
     }
 
-    public DecoratedKey getKey()
+    public PartitionColumns columns()
+    {
+        return metadata().partitionColumns();
+    }
+
+    public boolean isReverseOrder()
+    {
+        return false;
+    }
+
+    public DecoratedKey partitionKey()
     {
         return key;
     }
 
-    public ColumnFamily getColumnFamily()
+    public DeletionTime partitionLevelDeletion()
     {
-        return columnFamily;
+        return partitionLevelDeletion;
     }
 
-    public boolean hasNext()
+    public Row staticRow()
+    {
+        return staticRow;
+    }
+
+    protected Unfiltered computeNext()
     {
         try
         {
-            return atomIterator.hasNext();
+            return doCompute();
+        }
+        catch (IndexOutOfBoundsException e)
+        {
+            sstable.markSuspect();
+            throw new CorruptSSTableException(e, filename);
         }
         catch (IOError e)
         {
-            // catch here b/c atomIterator is an AbstractIterator; hasNext reads the value
             if (e.getCause() instanceof IOException)
             {
-                if (sstable != null)
-                    sstable.markSuspect();
-                throw new CorruptSSTableException((IOException)e.getCause(), filename);
+                sstable.markSuspect();
+                throw new CorruptSSTableException((Exception)e.getCause(), filename);
             }
             else
             {
@@ -184,24 +118,9 @@
         }
     }
 
-    public OnDiskAtom next()
+    protected Unfiltered doCompute()
     {
-        try
-        {
-            OnDiskAtom atom = atomIterator.next();
-            if (validateColumns)
-                atom.validateFields(columnFamily.metadata());
-            return atom;
-        }
-        catch (MarshalException me)
-        {
-            throw new CorruptSSTableException(me, filename);
-        }
-    }
-
-    public void remove()
-    {
-        throw new UnsupportedOperationException();
+        return iterator.hasNext() ? iterator.next() : endOfData();
     }
 
     public void close()
@@ -211,16 +130,14 @@
 
     public String getPath()
     {
-        // if input is from file, then return that path, otherwise it's from streaming
-        if (in instanceof RandomAccessReader)
-        {
-            RandomAccessReader file = (RandomAccessReader) in;
-            return file.getPath();
-        }
-        else
-        {
-            throw new UnsupportedOperationException();
-        }
+        return filename;
+    }
+
+    public EncodingStats stats()
+    {
+        // We could return sstable.header.stats(), but this may not be as accurate than the actual sstable stats (see
+        // SerializationHeader.make() for details) so we use the latter instead.
+        return new EncodingStats(sstable.getMinTimestamp(), sstable.getMinLocalDeletionTime(), sstable.getMinTTL());
     }
 
     public int compareTo(SSTableIdentityIterator o)

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
index b99003b..e597e54 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java

@@ -18,7 +18,6 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.File;
-import java.io.FilenameFilter;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.*;
@@ -27,15 +26,13 @@
 import com.google.common.collect.Multimap;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.streaming.*;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.OutputHandler;
 import org.apache.cassandra.utils.Pair;
 
@@ -76,72 +73,79 @@
     {
         outputHandler.output("Opening sstables and calculating sections to stream");
 
-        directory.list(new FilenameFilter()
-        {
-            public boolean accept(File dir, String name)
-            {
-                if (new File(dir, name).isDirectory())
-                    return false;
-                Pair<Descriptor, Component> p = SSTable.tryComponentFromFilename(dir, name);
-                Descriptor desc = p == null ? null : p.left;
-                if (p == null || !p.right.equals(Component.DATA) || desc.type.isTemporary)
-                    return false;
+        LifecycleTransaction.getFiles(directory.toPath(),
+                                      (file, type) ->
+                                      {
+                                          File dir = file.getParentFile();
+                                          String name = file.getName();
 
-                if (!new File(desc.filenameFor(Component.PRIMARY_INDEX)).exists())
-                {
-                    outputHandler.output(String.format("Skipping file %s because index is missing", name));
-                    return false;
-                }
+                                          if (type != Directories.FileType.FINAL)
+                                          {
+                                              outputHandler.output(String.format("Skipping temporary file %s", name));
+                                              return false;
+                                          }
 
-                CFMetaData metadata = client.getTableMetadata(desc.cfname);
-                if (metadata == null)
-                {
-                    outputHandler.output(String.format("Skipping file %s: table %s.%s doesn't exist", name, keyspace, desc.cfname));
-                    return false;
-                }
+                                          Pair<Descriptor, Component> p = SSTable.tryComponentFromFilename(dir, name);
+                                          Descriptor desc = p == null ? null : p.left;
+                                          if (p == null || !p.right.equals(Component.DATA))
+                                              return false;
 
-                Set<Component> components = new HashSet<>();
-                components.add(Component.DATA);
-                components.add(Component.PRIMARY_INDEX);
-                if (new File(desc.filenameFor(Component.SUMMARY)).exists())
-                    components.add(Component.SUMMARY);
-                if (new File(desc.filenameFor(Component.COMPRESSION_INFO)).exists())
-                    components.add(Component.COMPRESSION_INFO);
-                if (new File(desc.filenameFor(Component.STATS)).exists())
-                    components.add(Component.STATS);
+                                          if (!new File(desc.filenameFor(Component.PRIMARY_INDEX)).exists())
+                                          {
+                                              outputHandler.output(String.format("Skipping file %s because index is missing", name));
+                                              return false;
+                                          }
 
-                try
-                {
-                    // To conserve memory, open SSTableReaders without bloom filters and discard
-                    // the index summary after calculating the file sections to stream and the estimated
-                    // number of keys for each endpoint. See CASSANDRA-5555 for details.
-                    SSTableReader sstable = SSTableReader.openForBatch(desc, components, metadata, client.getPartitioner());
-                    sstables.add(sstable);
+                                          CFMetaData metadata = client.getTableMetadata(desc.cfname);
+                                          if (metadata == null)
+                                          {
+                                              outputHandler.output(String.format("Skipping file %s: table %s.%s doesn't exist", name, keyspace, desc.cfname));
+                                              return false;
+                                          }
 
-                    // calculate the sstable sections to stream as well as the estimated number of
-                    // keys per host
-                    for (Map.Entry<InetAddress, Collection<Range<Token>>> entry : ranges.entrySet())
-                    {
-                        InetAddress endpoint = entry.getKey();
-                        Collection<Range<Token>> tokenRanges = entry.getValue();
+                                          Set<Component> components = new HashSet<>();
+                                          components.add(Component.DATA);
+                                          components.add(Component.PRIMARY_INDEX);
+                                          if (new File(desc.filenameFor(Component.SUMMARY)).exists())
+                                              components.add(Component.SUMMARY);
+                                          if (new File(desc.filenameFor(Component.COMPRESSION_INFO)).exists())
+                                              components.add(Component.COMPRESSION_INFO);
+                                          if (new File(desc.filenameFor(Component.STATS)).exists())
+                                              components.add(Component.STATS);
 
-                        List<Pair<Long, Long>> sstableSections = sstable.getPositionsForRanges(tokenRanges);
-                        long estimatedKeys = sstable.estimatedKeysForRanges(tokenRanges);
-                        Ref<SSTableReader> ref = sstable.ref();
-                        StreamSession.SSTableStreamingSections details = new StreamSession.SSTableStreamingSections(ref, sstableSections, estimatedKeys, ActiveRepairService.UNREPAIRED_SSTABLE);
-                        streamingDetails.put(endpoint, details);
-                    }
+                                          try
+                                          {
+                                              // To conserve memory, open SSTableReaders without bloom filters and discard
+                                              // the index summary after calculating the file sections to stream and the estimated
+                                              // number of keys for each endpoint. See CASSANDRA-5555 for details.
+                                              SSTableReader sstable = SSTableReader.openForBatch(desc, components, metadata);
+                                              sstables.add(sstable);
 
-                    // to conserve heap space when bulk loading
-                    sstable.releaseSummary();
-                }
-                catch (IOException e)
-                {
-                    outputHandler.output(String.format("Skipping file %s, error opening it: %s", name, e.getMessage()));
-                }
-                return false;
-            }
-        });
+                                              // calculate the sstable sections to stream as well as the estimated number of
+                                              // keys per host
+                                              for (Map.Entry<InetAddress, Collection<Range<Token>>> entry : ranges.entrySet())
+                                              {
+                                                  InetAddress endpoint = entry.getKey();
+                                                  Collection<Range<Token>> tokenRanges = entry.getValue();
+
+                                                  List<Pair<Long, Long>> sstableSections = sstable.getPositionsForRanges(tokenRanges);
+                                                  long estimatedKeys = sstable.estimatedKeysForRanges(tokenRanges);
+                                                  Ref<SSTableReader> ref = sstable.ref();
+                                                  StreamSession.SSTableStreamingSections details = new StreamSession.SSTableStreamingSections(ref, sstableSections, estimatedKeys, ActiveRepairService.UNREPAIRED_SSTABLE);
+                                                  streamingDetails.put(endpoint, details);
+                                              }
+
+                                              // to conserve heap space when bulk loading
+                                              sstable.releaseSummary();
+                                          }
+                                          catch (IOException e)
+                                          {
+                                              outputHandler.output(String.format("Skipping file %s, error opening it: %s", name, e.getMessage()));
+                                          }
+                                          return false;
+                                      },
+                                      Directories.OnTxnErr.IGNORE);
+
         return sstables;
     }
 
@@ -165,7 +169,7 @@
             return plan.execute();
         }
 
-        outputHandler.output(String.format("Streaming relevant part of %sto %s", names(sstables), endpointToRanges.keySet()));
+        outputHandler.output(String.format("Streaming relevant part of %s to %s", names(sstables), endpointToRanges.keySet()));
 
         for (Map.Entry<InetAddress, Collection<Range<Token>>> entry : endpointToRanges.entrySet())
         {
@@ -234,7 +238,6 @@
     public static abstract class Client
     {
         private final Map<InetAddress, Collection<Range<Token>>> endpointToRanges = new HashMap<>();
-        private IPartitioner partitioner;
 
         /**
          * Initialize the client.
@@ -281,23 +284,6 @@
             return endpointToRanges;
         }
 
-        protected void setPartitioner(String partclass) throws ConfigurationException
-        {
-            setPartitioner(FBUtilities.newPartitioner(partclass));
-        }
-
-        protected void setPartitioner(IPartitioner partitioner)
-        {
-            this.partitioner = partitioner;
-            // the following is still necessary since Range/Token reference partitioner through StorageService.getPartitioner
-            DatabaseDescriptor.setPartitioner(partitioner);
-        }
-
-        public IPartitioner getPartitioner()
-        {
-            return partitioner;
-        }
-
         protected void addRangeForEndpoint(Range<Token> range, InetAddress endpoint)
         {
             Collection<Range<Token>> ranges = endpointToRanges.get(endpoint);

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java
new file mode 100644
index 0000000..0bb3721
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.concurrent.Transactional;
+
+public interface SSTableMultiWriter extends Transactional
+{
+
+    /**
+     * Writes a partition in an implementation specific way
+     * @param partition the partition to append
+     * @return true if the partition was written, false otherwise
+     */
+    boolean append(UnfilteredRowIterator partition);
+
+    Collection<SSTableReader> finish(long repairedAt, long maxDataAge, boolean openResult);
+    Collection<SSTableReader> finish(boolean openResult);
+    Collection<SSTableReader> finished();
+
+    SSTableMultiWriter setOpenResult(boolean openResult);
+
+    String getFilename();
+    long getFilePointer();
+    UUID getCfId();
+
+    static void abortOrDie(SSTableMultiWriter writer)
+    {
+        Throwables.maybeFail(writer.abort(null));
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
index c243904..b2fbcb1 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java

@@ -28,11 +28,12 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.lifecycle.ILifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.utils.CLibrary;
+import org.apache.cassandra.utils.NativeLibrary;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
 /**
@@ -55,19 +56,18 @@
     @VisibleForTesting
     public static boolean disableEarlyOpeningForTests = false;
 
-    private final ColumnFamilyStore cfs;
     private final long preemptiveOpenInterval;
     private final long maxAge;
     private long repairedAt = -1;
     // the set of final readers we will expose on commit
-    private final LifecycleTransaction transaction; // the readers we are rewriting (updated as they are replaced)
+    private final ILifecycleTransaction transaction; // the readers we are rewriting (updated as they are replaced)
     private final List<SSTableReader> preparedForCommit = new ArrayList<>();
-    private final Map<Descriptor, Integer> fileDescriptors = new HashMap<>(); // the file descriptors for each reader descriptor we are rewriting
 
     private long currentlyOpenedEarlyAt; // the position (in MB) in the target file we last (re)opened at
 
     private final List<SSTableWriter> writers = new ArrayList<>();
     private final boolean isOffline; // true for operations that are performed without Cassandra running (prevents updates of Tracker)
+    private final boolean keepOriginals; // true if we do not want to obsolete the originals
 
     private SSTableWriter writer;
     private Map<DecoratedKey, RowIndexEntry> cachedKeys = new HashMap<>();
@@ -75,28 +75,36 @@
     // for testing (TODO: remove when have byteman setup)
     private boolean throwEarly, throwLate;
 
-    public SSTableRewriter(ColumnFamilyStore cfs, LifecycleTransaction transaction, long maxAge, boolean isOffline)
+    public SSTableRewriter(ILifecycleTransaction transaction, long maxAge, boolean isOffline)
     {
-        this(cfs, transaction, maxAge, isOffline, true);
+        this(transaction, maxAge, isOffline, true);
     }
 
-    public SSTableRewriter(ColumnFamilyStore cfs, LifecycleTransaction transaction, long maxAge, boolean isOffline, boolean shouldOpenEarly)
+    public SSTableRewriter(ILifecycleTransaction transaction, long maxAge, boolean isOffline, boolean shouldOpenEarly)
     {
-        this(cfs, transaction, maxAge, isOffline, calculateOpenInterval(shouldOpenEarly));
+        this(transaction, maxAge, isOffline, calculateOpenInterval(shouldOpenEarly), false);
     }
 
     @VisibleForTesting
-    public SSTableRewriter(ColumnFamilyStore cfs, LifecycleTransaction transaction, long maxAge, boolean isOffline, long preemptiveOpenInterval)
+    public SSTableRewriter(ILifecycleTransaction transaction, long maxAge, boolean isOffline, long preemptiveOpenInterval, boolean keepOriginals)
     {
         this.transaction = transaction;
-        for (SSTableReader sstable : this.transaction.originals())
-            fileDescriptors.put(sstable.descriptor, CLibrary.getfd(sstable.getFilename()));
-        this.cfs = cfs;
         this.maxAge = maxAge;
         this.isOffline = isOffline;
+        this.keepOriginals = keepOriginals;
         this.preemptiveOpenInterval = preemptiveOpenInterval;
     }
 
+    public static SSTableRewriter constructKeepingOriginals(ILifecycleTransaction transaction, boolean keepOriginals, long maxAge, boolean isOffline)
+    {
+        return new SSTableRewriter(transaction, maxAge, isOffline, calculateOpenInterval(true), keepOriginals);
+    }
+
+    public static SSTableRewriter construct(ColumnFamilyStore cfs, ILifecycleTransaction transaction, boolean keepOriginals, long maxAge, boolean isOffline)
+    {
+        return new SSTableRewriter(transaction, maxAge, isOffline, calculateOpenInterval(cfs.supportsEarlyOpen()), keepOriginals);
+    }
+
     private static long calculateOpenInterval(boolean shouldOpenEarly)
     {
         long interval = DatabaseDescriptor.getSSTablePreempiveOpenIntervalInMB() * (1L << 20);
@@ -110,42 +118,36 @@
         return writer;
     }
 
-    public RowIndexEntry append(AbstractCompactedRow row)
+    public RowIndexEntry append(UnfilteredRowIterator partition)
     {
         // we do this before appending to ensure we can resetAndTruncate() safely if the append fails
-        maybeReopenEarly(row.key);
-        RowIndexEntry index = writer.append(row);
-        if (!isOffline)
+        DecoratedKey key = partition.partitionKey();
+        maybeReopenEarly(key);
+        RowIndexEntry index = writer.append(partition);
+        if (!isOffline && index != null)
         {
-            if (index == null)
+            boolean save = false;
+            for (SSTableReader reader : transaction.originals())
             {
-                cfs.invalidateCachedRow(row.key);
-            }
-            else
-            {
-                boolean save = false;
-                for (SSTableReader reader : transaction.originals())
+                if (reader.getCachedPosition(key, false) != null)
                 {
-                    if (reader.getCachedPosition(row.key, false) != null)
-                    {
-                        save = true;
-                        break;
-                    }
+                    save = true;
+                    break;
                 }
-                if (save)
-                    cachedKeys.put(row.key, index);
             }
+            if (save)
+                cachedKeys.put(key, index);
         }
         return index;
     }
 
     // attempts to append the row, if fails resets the writer position
-    public RowIndexEntry tryAppend(AbstractCompactedRow row)
+    public RowIndexEntry tryAppend(UnfilteredRowIterator partition)
     {
         writer.mark();
         try
         {
-            return append(row);
+            return append(partition);
         }
         catch (Throwable t)
         {
@@ -163,7 +165,7 @@
                 for (SSTableReader reader : transaction.originals())
                 {
                     RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE);
-                    CLibrary.trySkipCache(fileDescriptors.get(reader.descriptor), 0, index == null ? 0 : index.position);
+                    NativeLibrary.trySkipCache(reader.getFilename(), 0, index == null ? 0 : index.position);
                 }
             }
             else
@@ -194,6 +196,7 @@
     {
         for (SSTableWriter writer : writers)
             accumulate = writer.commit(accumulate);
+
         accumulate = transaction.commit(accumulate);
         return accumulate;
     }
@@ -220,7 +223,7 @@
 
         final List<DecoratedKey> invalidateKeys = new ArrayList<>();
         invalidateKeys.addAll(cachedKeys.keySet());
-        newReader.setupKeyCache();
+        newReader.setupOnline();
         for (Map.Entry<DecoratedKey, RowIndexEntry> cacheKey : cachedKeys.entrySet())
             newReader.cacheKey(cacheKey.getKey(), cacheKey.getValue());
 
@@ -260,7 +263,7 @@
 
         private InvalidateKeys(SSTableReader reader, Collection<DecoratedKey> invalidate)
         {
-            this.cacheRef = new WeakReference<InstrumentingCache<KeyCacheKey, ?>>(reader.getKeyCache());
+            this.cacheRef = new WeakReference<>(reader.getKeyCache());
             if (cacheRef.get() != null)
             {
                 for (DecoratedKey key : invalidate)
@@ -289,17 +292,19 @@
             if (writer != null)
             {
                 writer.abort();
+
+                transaction.untrackNew(writer);
                 writers.remove(writer);
             }
             writer = newWriter;
+
             return;
         }
 
-        SSTableReader reader = null;
         if (preemptiveOpenInterval != Long.MAX_VALUE)
         {
             // we leave it as a tmp file, but we open it and add it to the Tracker
-            reader = writer.setMaxDataAge(maxAge).openFinalEarly();
+            SSTableReader reader = writer.setMaxDataAge(maxAge).openFinalEarly();
             transaction.update(reader, false);
             moveStarts(reader, reader.last);
             transaction.checkpoint();
@@ -366,8 +371,7 @@
         if (throwLate)
             throw new RuntimeException("exception thrown after all sstables finished, for testing");
 
-        // TODO: do we always want to avoid obsoleting if offline?
-        if (!isOffline)
+        if (!keepOriginals)
             transaction.obsoleteOriginals();
 
         transaction.prepareToCommit();

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleIterator.java b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleIterator.java
new file mode 100644
index 0000000..f82db4e
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleIterator.java

@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.io.IOException;
+import java.io.IOError;
+import java.util.Iterator;
+
+import org.apache.cassandra.io.util.RewindableDataInput;
+import org.apache.cassandra.utils.AbstractIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.net.MessagingService;
+
+/**
+ * Utility class to handle deserializing atom from sstables.
+ *
+ * Note that this is not a full fledged UnfilteredRowIterator. It's also not closeable, it is always
+ * the job of the user to close the underlying ressources.
+ */
+public abstract class SSTableSimpleIterator extends AbstractIterator<Unfiltered> implements Iterator<Unfiltered>
+{
+    protected final CFMetaData metadata;
+    protected final DataInputPlus in;
+    protected final SerializationHelper helper;
+
+    private SSTableSimpleIterator(CFMetaData metadata, DataInputPlus in, SerializationHelper helper)
+    {
+        this.metadata = metadata;
+        this.in = in;
+        this.helper = helper;
+    }
+
+    public static SSTableSimpleIterator create(CFMetaData metadata, DataInputPlus in, SerializationHeader header, SerializationHelper helper, DeletionTime partitionDeletion)
+    {
+        if (helper.version < MessagingService.VERSION_30)
+            return new OldFormatIterator(metadata, in, helper, partitionDeletion);
+        else
+            return new CurrentFormatIterator(metadata, in, header, helper);
+    }
+
+    public abstract Row readStaticRow() throws IOException;
+
+    private static class CurrentFormatIterator extends SSTableSimpleIterator
+    {
+        private final SerializationHeader header;
+
+        private final Row.Builder builder;
+
+        private CurrentFormatIterator(CFMetaData metadata, DataInputPlus in, SerializationHeader header, SerializationHelper helper)
+        {
+            super(metadata, in, helper);
+            this.header = header;
+            this.builder = BTreeRow.sortedBuilder();
+        }
+
+        public Row readStaticRow() throws IOException
+        {
+            return header.hasStatic() ? UnfilteredSerializer.serializer.deserializeStaticRow(in, header, helper) : Rows.EMPTY_STATIC_ROW;
+        }
+
+        protected Unfiltered computeNext()
+        {
+            try
+            {
+                Unfiltered unfiltered = UnfilteredSerializer.serializer.deserialize(in, header, helper, builder);
+                return unfiltered == null ? endOfData() : unfiltered;
+            }
+            catch (IOException e)
+            {
+                throw new IOError(e);
+            }
+        }
+    }
+
+    private static class OldFormatIterator extends SSTableSimpleIterator
+    {
+        private final UnfilteredDeserializer deserializer;
+
+        private OldFormatIterator(CFMetaData metadata, DataInputPlus in, SerializationHelper helper, DeletionTime partitionDeletion)
+        {
+            super(metadata, in, helper);
+            // We use an UnfilteredDeserializer because even though we don't need all it's fanciness, it happens to handle all
+            // the details we need for reading the old format.
+            this.deserializer = UnfilteredDeserializer.create(metadata, in, null, helper, partitionDeletion, false);
+        }
+
+        public Row readStaticRow() throws IOException
+        {
+            if (metadata.isCompactTable())
+            {
+                // For static compact tables, in the old format, static columns are intermingled with the other columns, so we
+                // need to extract them. Which imply 2 passes (one to extract the static, then one for other value).
+                if (metadata.isStaticCompactTable())
+                {
+                    assert in instanceof RewindableDataInput;
+                    RewindableDataInput file = (RewindableDataInput)in;
+                    DataPosition mark = file.mark();
+                    Row staticRow = LegacyLayout.extractStaticColumns(metadata, file, metadata.partitionColumns().statics);
+                    file.reset(mark);
+
+                    // We've extracted the static columns, so we must ignore them on the 2nd pass
+                    ((UnfilteredDeserializer.OldFormatDeserializer)deserializer).setSkipStatic();
+                    return staticRow;
+                }
+                else
+                {
+                    return Rows.EMPTY_STATIC_ROW;
+                }
+            }
+
+            return deserializer.hasNext() && deserializer.nextIsStatic()
+                 ? (Row)deserializer.readNext()
+                 : Rows.EMPTY_STATIC_ROW;
+
+        }
+
+        protected Unfiltered computeNext()
+        {
+            try
+            {
+                if (!deserializer.hasNext())
+                    return endOfData();
+
+                Unfiltered unfiltered = deserializer.readNext();
+                if (metadata.isStaticCompactTable() && unfiltered.kind() == Unfiltered.Kind.ROW)
+                {
+                    Row row = (Row) unfiltered;
+                    ColumnDefinition def = metadata.getColumnDefinition(LegacyLayout.encodeClustering(metadata, row.clustering()));
+                    if (def != null && def.isStatic())
+                        return computeNext();
+                }
+                return unfiltered;
+            }
+            catch (IOException e)
+            {
+                throw new IOError(e);
+            }
+        }
+
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java
index 534e77b..6d3a714 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleUnsortedWriter.java

@@ -28,30 +28,24 @@
 import com.google.common.base.Throwables;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.UnfilteredSerializer;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 
 /**
  * A SSTable writer that doesn't assume rows are in sorted order.
+ * <p>
  * This writer buffers rows in memory and then write them all in sorted order.
  * To avoid loading the entire data set in memory, the amount of rows buffered
  * is configurable. Each time the threshold is met, one SSTable will be
  * created (and the buffer be reseted).
  *
- * @see AbstractSSTableSimpleWriter
- *
- * @deprecated this class is depracted in favor of {@link CQLSSTableWriter}.
+ * @see SSTableSimpleWriter
  */
-@Deprecated
-public class SSTableSimpleUnsortedWriter extends AbstractSSTableSimpleWriter
+class SSTableSimpleUnsortedWriter extends AbstractSSTableSimpleWriter
 {
     private static final Buffer SENTINEL = new Buffer();
 
@@ -59,101 +53,75 @@
     private final long bufferSize;
     private long currentSize;
 
+    // Used to compute the row serialized size
+    private final SerializationHeader header;
+
     private final BlockingQueue<Buffer> writeQueue = new SynchronousQueue<Buffer>();
     private final DiskWriter diskWriter = new DiskWriter();
 
-    /**
-     * Create a new buffering writer.
-     * @param directory the directory where to write the sstables
-     * @param partitioner  the partitioner
-     * @param keyspace the keyspace name
-     * @param columnFamily the column family name
-     * @param comparator the column family comparator
-     * @param subComparator the column family subComparator or null if not a Super column family.
-     * @param bufferSizeInMB the data size in MB before which a sstable is written and the buffer reseted. This correspond roughly to the written
-     * data size (i.e. the size of the create sstable). The actual size used in memory will be higher (by how much depends on the size of the
-     * columns you add). For 1GB of heap, a 128 bufferSizeInMB is probably a reasonable choice. If you experience OOM, this value should be lowered.
-     */
-    public SSTableSimpleUnsortedWriter(File directory,
-                                       IPartitioner partitioner,
-                                       String keyspace,
-                                       String columnFamily,
-                                       AbstractType<?> comparator,
-                                       AbstractType<?> subComparator,
-                                       int bufferSizeInMB,
-                                       CompressionParameters compressParameters)
+    SSTableSimpleUnsortedWriter(File directory, CFMetaData metadata, PartitionColumns columns, long bufferSizeInMB)
     {
-        this(directory, CFMetaData.denseCFMetaData(keyspace, columnFamily, comparator, subComparator).compressionParameters(compressParameters), partitioner, bufferSizeInMB);
-    }
-
-    public SSTableSimpleUnsortedWriter(File directory,
-                                       IPartitioner partitioner,
-                                       String keyspace,
-                                       String columnFamily,
-                                       AbstractType<?> comparator,
-                                       AbstractType<?> subComparator,
-                                       int bufferSizeInMB)
-    {
-        this(directory, partitioner, keyspace, columnFamily, comparator, subComparator, bufferSizeInMB, new CompressionParameters(null));
-    }
-
-    public SSTableSimpleUnsortedWriter(File directory, CFMetaData metadata, IPartitioner partitioner, long bufferSizeInMB)
-    {
-        super(directory, metadata, partitioner);
-        bufferSize = bufferSizeInMB * 1024L * 1024L;
+        super(directory, metadata, columns);
+        this.bufferSize = bufferSizeInMB * 1024L * 1024L;
+        this.header = new SerializationHeader(true, metadata, columns, EncodingStats.NO_STATS);
         diskWriter.start();
     }
 
-    protected void writeRow(DecoratedKey key, ColumnFamily columnFamily) throws IOException
+    PartitionUpdate getUpdateFor(DecoratedKey key)
     {
-        // Nothing to do since we'll sync if needed in addColumn.
-    }
+        assert key != null;
 
-    @Override
-    protected void addColumn(Cell cell) throws IOException
-    {
-        super.addColumn(cell);
-        countColumn(cell);
-    }
-
-    protected void countColumn(Cell cell) throws IOException
-    {
-        currentSize += cell.serializedSize(metadata.comparator, TypeSizes.NATIVE);
-
-        // We don't want to sync in writeRow() only as this might blow up the bufferSize for wide rows.
-        if (currentSize > bufferSize)
-            replaceColumnFamily();
-    }
-
-    protected ColumnFamily getColumnFamily()
-    {
-        ColumnFamily previous = buffer.get(currentKey);
-        // If the CF already exist in memory, we'll just continue adding to it
+        PartitionUpdate previous = buffer.get(key);
         if (previous == null)
         {
-            previous = createColumnFamily();
-            buffer.put(currentKey, previous);
-
-            // Since this new CF will be written by the next sync(), count its header. And a CF header
-            // on disk is:
-            //   - the row key: 2 bytes size + key size bytes
-            //   - the row level deletion infos: 4 + 8 bytes
-            currentSize += 14 + currentKey.getKey().remaining();
+            previous = createPartitionUpdate(key);
+            currentSize += PartitionUpdate.serializer.serializedSize(previous, formatType.info.getLatestVersion().correspondingMessagingVersion());
+            previous.allowNewUpdates();
+            buffer.put(key, previous);
         }
         return previous;
     }
 
-    public Descriptor getCurrentDescriptor()
+    private void countRow(Row row)
     {
-        // can be implemented, but isn't necessary
-        throw new UnsupportedOperationException();
+        // Note that the accounting of a row is a bit inaccurate (it doesn't take some of the file format optimization into account)
+        // and the maintaining of the bufferSize is in general not perfect. This has always been the case for this class but we should
+        // improve that. In particular, what we count is closer to the serialized value, but it's debatable that it's the right thing
+        // to count since it will take a lot more space in memory and the bufferSize if first and foremost used to avoid OOM when
+        // using this writer.
+        currentSize += UnfilteredSerializer.serializer.serializedSize(row, header, 0, formatType.info.getLatestVersion().correspondingMessagingVersion());
     }
 
-    protected ColumnFamily createColumnFamily()
+    private void maybeSync() throws SyncException
     {
-        return ArrayBackedSortedColumns.factory.create(metadata);
+        try
+        {
+            if (currentSize > bufferSize)
+                sync();
+        }
+        catch (IOException e)
+        {
+            // addColumn does not throw IOException but we want to report this to the user,
+            // so wrap it in a temporary RuntimeException that we'll catch in rawAddRow above.
+            throw new SyncException(e);
+        }
     }
 
+    private PartitionUpdate createPartitionUpdate(DecoratedKey key)
+    {
+        return new PartitionUpdate(metadata, key, columns, 4)
+        {
+            @Override
+            public void add(Row row)
+            {
+                super.add(row);
+                countRow(row);
+                maybeSync();
+            }
+        };
+    }
+
+    @Override
     public void close() throws IOException
     {
         sync();
@@ -161,18 +129,14 @@
         try
         {
             diskWriter.join();
+            checkForWriterException();
         }
-        catch (InterruptedException e)
+        catch (Throwable e)
         {
             throw new RuntimeException(e);
         }
-        checkForWriterException();
-    }
 
-    // This is overridden by CQLSSTableWriter to hold off replacing column family until the next iteration through
-    protected void replaceColumnFamily() throws IOException
-    {
-        sync();
+        checkForWriterException();
     }
 
     protected void sync() throws IOException
@@ -180,12 +144,9 @@
         if (buffer.isEmpty())
             return;
 
-        columnFamily = null;
         put(buffer);
         buffer = new Buffer();
         currentSize = 0;
-        columnFamily = getColumnFamily();
-        buffer.setFirstInsertedKey(currentKey);
     }
 
     private void put(Buffer buffer) throws IOException
@@ -217,56 +178,45 @@
         }
     }
 
-    // typedef
-    private static class Buffer extends TreeMap<DecoratedKey, ColumnFamily> {
-        private DecoratedKey firstInsertedKey;
-
-        public void setFirstInsertedKey(DecoratedKey firstInsertedKey) {
-            this.firstInsertedKey = firstInsertedKey;
-        }
-
-        public DecoratedKey getFirstInsertedKey() {
-            return firstInsertedKey;
+    static class SyncException extends RuntimeException
+    {
+        SyncException(IOException ioe)
+        {
+            super(ioe);
         }
     }
 
+    //// typedef
+    static class Buffer extends TreeMap<DecoratedKey, PartitionUpdate> {}
+
     private class DiskWriter extends Thread
     {
         volatile Throwable exception = null;
 
         public void run()
         {
+            while (true)
             {
-                while (true)
+                try
                 {
-                    try
-                    {
-                        Buffer b = writeQueue.take();
-                        if (b == SENTINEL)
-                            return;
+                    Buffer b = writeQueue.take();
+                    if (b == SENTINEL)
+                        return;
 
-                        try (SSTableWriter writer = getWriter();)
-                        {
-                            for (Map.Entry<DecoratedKey, ColumnFamily> entry : b.entrySet())
-                            {
-                                if (entry.getValue().getColumnCount() > 0)
-                                    writer.append(entry.getKey(), entry.getValue());
-                                else if (!entry.getKey().equals(b.getFirstInsertedKey()))
-                                    throw new AssertionError("Empty partition");
-                            }
-                            
-                            writer.finish(false);
-                        }
-                    }
-                    catch (Throwable e)
+                        try (SSTableTxnWriter writer = createWriter())
                     {
-                        JVMStabilityInspector.inspectThrowable(e);
-                        // Keep only the first exception
-                        if (exception == null)
-                            exception = e;
+                        for (Map.Entry<DecoratedKey, PartitionUpdate> entry : b.entrySet())
+                            writer.append(entry.getValue().unfilteredIterator());
+                        writer.finish(false);
                     }
                 }
-
+                catch (Throwable e)
+                {
+                    JVMStabilityInspector.inspectThrowable(e);
+                    // Keep only the first exception
+                    if (exception == null)
+                        exception = e;
+                }
             }
         }
     }

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java
index f81e57d..45722cd 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableSimpleWriter.java

@@ -18,93 +18,80 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.File;
+import java.io.IOException;
 
 import com.google.common.base.Throwables;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.FSError;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
 
 /**
  * A SSTable writer that assumes rows are in (partitioner) sorted order.
+ * <p>
  * Contrarily to SSTableSimpleUnsortedWriter, this writer does not buffer
  * anything into memory, however it assumes that row are added in sorted order
  * (an exception will be thrown otherwise), which for the RandomPartitioner
  * means that rows should be added by increasing md5 of the row key. This is
  * rarely possible and SSTableSimpleUnsortedWriter should most of the time be
  * prefered.
- *
- * @see AbstractSSTableSimpleWriter
- *
- * @deprecated this class is depracted in favor of {@link CQLSSTableWriter}.
  */
-@Deprecated
-public class SSTableSimpleWriter extends AbstractSSTableSimpleWriter
+class SSTableSimpleWriter extends AbstractSSTableSimpleWriter
 {
-    private final SSTableWriter writer;
+    protected DecoratedKey currentKey;
+    protected PartitionUpdate update;
 
-    /**
-     * Create a new writer.
-     * @param directory the directory where to write the sstable
-     * @param partitioner the partitioner
-     * @param keyspace the keyspace name
-     * @param columnFamily the column family name
-     * @param comparator the column family comparator
-     * @param subComparator the column family subComparator or null if not a Super column family.
-     */
-    public SSTableSimpleWriter(File directory,
-                               IPartitioner partitioner,
-                               String keyspace,
-                               String columnFamily,
-                               AbstractType<?> comparator,
-                               AbstractType<?> subComparator)
+    private SSTableTxnWriter writer;
+
+    protected SSTableSimpleWriter(File directory, CFMetaData metadata, PartitionColumns columns)
     {
-        this(directory, CFMetaData.denseCFMetaData(keyspace, columnFamily, comparator, subComparator), partitioner);
+        super(directory, metadata, columns);
     }
 
-    public SSTableSimpleWriter(File directory, CFMetaData metadata, IPartitioner partitioner)
+    private SSTableTxnWriter getOrCreateWriter()
     {
-        super(directory, metadata, partitioner);
-        writer = getWriter();
+        if (writer == null)
+            writer = createWriter();
+
+        return writer;
     }
 
-    SSTableReader closeAndOpenReader()
+    PartitionUpdate getUpdateFor(DecoratedKey key) throws IOException
     {
-        if (currentKey != null)
-            writeRow(currentKey, columnFamily);
-        return writer.finish(true);
+        assert key != null;
+
+        // If that's not the current key, write the current one if necessary and create a new
+        // update for the new key.
+        if (!key.equals(currentKey))
+        {
+            if (update != null)
+                writePartition(update);
+            currentKey = key;
+            update = new PartitionUpdate(metadata, currentKey, columns, 4);
+        }
+
+        assert update != null;
+        return update;
     }
 
     public void close()
     {
         try
         {
-            if (currentKey != null)
-                writeRow(currentKey, columnFamily);
-            writer.finish(false);
+            if (update != null)
+                writePartition(update);
+            if (writer != null)
+                writer.finish(false);
         }
         catch (Throwable t)
         {
-            throw Throwables.propagate(writer.abort(t));
+            throw Throwables.propagate(writer == null ? t : writer.abort(t));
         }
     }
 
-    protected void writeRow(DecoratedKey key, ColumnFamily columnFamily)
+    private void writePartition(PartitionUpdate update) throws IOException
     {
-        writer.append(key, columnFamily);
-    }
-
-    protected ColumnFamily getColumnFamily()
-    {
-        return ArrayBackedSortedColumns.factory.create(metadata);
-    }
-
-    public Descriptor getCurrentDescriptor()
-    {
-        return writer.descriptor;
+        getOrCreateWriter().append(update.unfilteredIterator());
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
new file mode 100644
index 0000000..e889d85
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java

@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.util.Collection;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.utils.concurrent.Transactional;
+
+/**
+ * A wrapper for SSTableWriter and LifecycleTransaction to be used when
+ * the writer is the only participant in the transaction and therefore
+ * it can safely own the transaction.
+ */
+public class SSTableTxnWriter extends Transactional.AbstractTransactional implements Transactional
+{
+    private final LifecycleTransaction txn;
+    private final SSTableMultiWriter writer;
+
+    public SSTableTxnWriter(LifecycleTransaction txn, SSTableMultiWriter writer)
+    {
+        this.txn = txn;
+        this.writer = writer;
+    }
+
+    public boolean append(UnfilteredRowIterator iterator)
+    {
+        return writer.append(iterator);
+    }
+
+    public String getFilename()
+    {
+        return writer.getFilename();
+    }
+
+    public long getFilePointer()
+    {
+        return writer.getFilePointer();
+    }
+
+    protected Throwable doCommit(Throwable accumulate)
+    {
+        return writer.commit(txn.commit(accumulate));
+    }
+
+    protected Throwable doAbort(Throwable accumulate)
+    {
+        return txn.abort(writer.abort(accumulate));
+    }
+
+    protected void doPrepare()
+    {
+        writer.prepareToCommit();
+        txn.prepareToCommit();
+    }
+
+    @Override
+    protected Throwable doPostCleanup(Throwable accumulate)
+    {
+        txn.close();
+        writer.close();
+        return super.doPostCleanup(accumulate);
+    }
+
+    public Collection<SSTableReader> finish(boolean openResult)
+    {
+        writer.setOpenResult(openResult);
+        finish();
+        return writer.finished();
+    }
+
+    @SuppressWarnings("resource") // log and writer closed during postCleanup
+    public static SSTableTxnWriter create(ColumnFamilyStore cfs, Descriptor descriptor, long keyCount, long repairedAt, int sstableLevel, SerializationHeader header)
+    {
+        LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.WRITE);
+        SSTableMultiWriter writer = cfs.createSSTableMultiWriter(descriptor, keyCount, repairedAt, sstableLevel, header, txn);
+        return new SSTableTxnWriter(txn, writer);
+    }
+
+    @SuppressWarnings("resource") // log and writer closed during postCleanup
+    public static SSTableTxnWriter create(CFMetaData cfm, Descriptor descriptor, long keyCount, long repairedAt, int sstableLevel, SerializationHeader header)
+    {
+        // if the column family store does not exist, we create a new default SSTableMultiWriter to use:
+        LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.WRITE);
+        MetadataCollector collector = new MetadataCollector(cfm.comparator).sstableLevel(sstableLevel);
+        SSTableMultiWriter writer = SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, cfm, collector, header, txn);
+        return new SSTableTxnWriter(txn, writer);
+    }
+
+    public static SSTableTxnWriter create(ColumnFamilyStore cfs, String filename, long keyCount, long repairedAt, int sstableLevel, SerializationHeader header)
+    {
+        Descriptor desc = Descriptor.fromFilename(filename);
+        return create(cfs, desc, keyCount, repairedAt, sstableLevel, header);
+    }
+
+    public static SSTableTxnWriter create(ColumnFamilyStore cfs, String filename, long keyCount, long repairedAt, SerializationHeader header)
+    {
+        return create(cfs, filename, keyCount, repairedAt, 0, header);
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
new file mode 100644
index 0000000..ded070e
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java

@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.UUID;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+
+public class SimpleSSTableMultiWriter implements SSTableMultiWriter
+{
+    private final SSTableWriter writer;
+
+    protected SimpleSSTableMultiWriter(SSTableWriter writer)
+    {
+        this.writer = writer;
+    }
+
+    public boolean append(UnfilteredRowIterator partition)
+    {
+        RowIndexEntry<?> indexEntry = writer.append(partition);
+        return indexEntry != null;
+    }
+
+    public Collection<SSTableReader> finish(long repairedAt, long maxDataAge, boolean openResult)
+    {
+        return Collections.singleton(writer.finish(repairedAt, maxDataAge, openResult));
+    }
+
+    public Collection<SSTableReader> finish(boolean openResult)
+    {
+        return Collections.singleton(writer.finish(openResult));
+    }
+
+    public Collection<SSTableReader> finished()
+    {
+        return Collections.singleton(writer.finished());
+    }
+
+    public SSTableMultiWriter setOpenResult(boolean openResult)
+    {
+        writer.setOpenResult(openResult);
+        return this;
+    }
+
+    public String getFilename()
+    {
+        return writer.getFilename();
+    }
+
+    public long getFilePointer()
+    {
+        return writer.getFilePointer();
+    }
+
+    public UUID getCfId()
+    {
+        return writer.metadata.cfId;
+    }
+
+    public Throwable commit(Throwable accumulate)
+    {
+        return writer.commit(accumulate);
+    }
+
+    public Throwable abort(Throwable accumulate)
+    {
+        return writer.abort(accumulate);
+    }
+
+    public void prepareToCommit()
+    {
+        writer.prepareToCommit();
+    }
+
+    public void close()
+    {
+        writer.close();
+    }
+
+    @SuppressWarnings("resource") // SimpleSSTableMultiWriter closes writer
+    public static SSTableMultiWriter create(Descriptor descriptor,
+                                            long keyCount,
+                                            long repairedAt,
+                                            CFMetaData cfm,
+                                            MetadataCollector metadataCollector,
+                                            SerializationHeader header,
+                                            LifecycleNewTracker lifecycleNewTracker)
+    {
+        SSTableWriter writer = SSTableWriter.create(descriptor, keyCount, repairedAt, cfm, metadataCollector, header, lifecycleNewTracker);
+        return new SimpleSSTableMultiWriter(writer);
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
index ca003b6..1286f16 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java

@@ -20,11 +20,9 @@
 import com.google.common.base.CharMatcher;
 import com.google.common.collect.ImmutableList;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.ColumnSerializer;
-import org.apache.cassandra.db.OnDiskAtom;
+import org.apache.cassandra.db.LegacyLayout;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.CompactionController;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
 import org.apache.cassandra.io.util.FileDataInput;
@@ -45,11 +43,7 @@
     SSTableWriter.Factory getWriterFactory();
     SSTableReader.Factory getReaderFactory();
 
-    Iterator<OnDiskAtom> getOnDiskIterator(FileDataInput in, ColumnSerializer.Flag flag, int expireBefore, CFMetaData cfm, Version version);
-
-    AbstractCompactedRow getCompactedRowWriter(CompactionController controller, ImmutableList<OnDiskAtomIterator> onDiskAtomIterators);
-
-    RowIndexEntry.IndexSerializer<?> getIndexSerializer(CFMetaData cfm);
+    RowIndexEntry.IndexSerializer<?> getIndexSerializer(CFMetaData cfm, Version version, SerializationHeader header);
 
     public static enum Type
     {
@@ -62,6 +56,7 @@
 
         public final SSTableFormat info;
         public final String name;
+
         private Type(String name, SSTableFormat info)
         {
             //Since format comes right after generation

diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index 96bf01d..0ca1f3a 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java

@@ -18,6 +18,7 @@
 package org.apache.cassandra.io.sstable.format;
 
 import java.io.*;
+import java.lang.ref.WeakReference;
 import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.*;
@@ -27,44 +28,42 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
 import com.google.common.collect.Ordering;
 import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.RateLimiter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
 import com.clearspring.analytics.stream.cardinality.ICardinality;
-import com.codahale.metrics.Counter;
-import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.cache.InstrumentingCache;
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
-import org.apache.cassandra.config.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.lifecycle.Tracker;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.SliceableUnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.internal.CassandraIndex;
 import org.apache.cassandra.io.FSError;
-import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.compress.CompressionMetadata;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.metadata.*;
 import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.metrics.RestorableMeter;
 import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.*;
 import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.cassandra.utils.concurrent.Ref;
 import org.apache.cassandra.utils.concurrent.SelfRefCounted;
 
@@ -132,7 +131,6 @@
 public abstract class SSTableReader extends SSTable implements SelfRefCounted<SSTableReader>
 {
     private static final Logger logger = LoggerFactory.getLogger(SSTableReader.class);
-    private static final int ACCURATE_BOUNDARIES_MAGIC_NUMBER = 248923458;
 
     private static final ScheduledThreadPoolExecutor syncExecutor = new ScheduledThreadPoolExecutor(1);
     static
@@ -142,29 +140,20 @@
     }
     private static final RateLimiter meterSyncThrottle = RateLimiter.create(100.0);
 
-    public static final Comparator<SSTableReader> maxTimestampComparator = new Comparator<SSTableReader>()
-    {
-        public int compare(SSTableReader o1, SSTableReader o2)
-        {
-            long ts1 = o1.getMaxTimestamp();
-            long ts2 = o2.getMaxTimestamp();
-            return (ts1 > ts2 ? -1 : (ts1 == ts2 ? 0 : 1));
-        }
-    };
+    // Descending order
+    public static final Comparator<SSTableReader> maxTimestampComparator = (o1, o2) -> Long.compare(o2.getMaxTimestamp(), o1.getMaxTimestamp());
 
     // it's just an object, which we use regular Object equality on; we introduce a special class just for easy recognition
     public static final class UniqueIdentifier {}
 
-    public static final Comparator<SSTableReader> sstableComparator = new Comparator<SSTableReader>()
-    {
-        public int compare(SSTableReader o1, SSTableReader o2)
-        {
-            return o1.first.compareTo(o2.first);
-        }
-    };
+    public static final Comparator<SSTableReader> sstableComparator = (o1, o2) -> o1.first.compareTo(o2.first);
+
+    public static final Comparator<SSTableReader> generationReverseComparator = (o1, o2) -> -Integer.compare(o1.descriptor.generation, o2.descriptor.generation);
 
     public static final Ordering<SSTableReader> sstableOrdering = Ordering.from(sstableComparator);
 
+    public static final Comparator<SSTableReader> sizeComparator = (o1, o2) -> Longs.compare(o1.onDiskLength(), o2.onDiskLength());
+
     /**
      * maxDataAge is a timestamp in local server time (e.g. System.currentTimeMilli) which represents an upper bound
      * to the newest piece of data stored in the sstable. In other words, this sstable does not contain items created
@@ -209,6 +198,8 @@
     // not final since we need to be able to change level on a file.
     protected volatile StatsMetadata sstableMetadata;
 
+    public final SerializationHeader header;
+
     protected final AtomicLong keyCacheHit = new AtomicLong(0);
     protected final AtomicLong keyCacheRequest = new AtomicLong(0);
 
@@ -217,6 +208,8 @@
 
     private RestorableMeter readMeter;
 
+    private volatile double crcCheckChance;
+
     /**
      * Calculate approximate key count.
      * If cardinality estimator is available on all given sstables, then this method use them to estimate
@@ -226,12 +219,12 @@
      * @param sstables SSTables to calculate key count
      * @return estimated key count
      */
-    public static long getApproximateKeyCount(Collection<SSTableReader> sstables)
+    public static long getApproximateKeyCount(Iterable<SSTableReader> sstables)
     {
         long count = -1;
 
         // check if cardinality estimator is available for all SSTables
-        boolean cardinalityAvailable = !sstables.isEmpty() && Iterators.all(sstables.iterator(), new Predicate<SSTableReader>()
+        boolean cardinalityAvailable = !Iterables.isEmpty(sstables) && Iterables.all(sstables, new Predicate<SSTableReader>()
         {
             public boolean apply(SSTableReader sstable)
             {
@@ -347,9 +340,13 @@
         {
             int i = descriptor.cfname.indexOf(SECONDARY_INDEX_NAME_SEPARATOR);
             String parentName = descriptor.cfname.substring(0, i);
+            String indexName = descriptor.cfname.substring(i + 1);
             CFMetaData parent = Schema.instance.getCFMetaData(descriptor.ksname, parentName);
-            ColumnDefinition def = parent.getColumnDefinitionForIndex(descriptor.cfname.substring(i + 1));
-            metadata = CFMetaData.newIndexMetadata(parent, def, SecondaryIndex.getIndexComparator(parent, def));
+            IndexMetadata def = parent.getIndexes()
+                                      .get(indexName)
+                                      .orElseThrow(() -> new AssertionError(
+                                                                           "Could not find index metadata for index cf " + i));
+            metadata = CassandraIndex.indexCfsMetadata(parent, def);
         }
         else
         {
@@ -360,21 +357,24 @@
 
     public static SSTableReader open(Descriptor desc, CFMetaData metadata) throws IOException
     {
-        IPartitioner p = desc.cfname.contains(SECONDARY_INDEX_NAME_SEPARATOR)
-                ? new LocalPartitioner(metadata.getKeyValidator())
-                : StorageService.getPartitioner();
-        return open(desc, componentsFor(desc), metadata, p);
+        return open(desc, componentsFor(desc), metadata);
     }
 
-    public static SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner) throws IOException
+    public static SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata) throws IOException
     {
-        return open(descriptor, components, metadata, partitioner, true, true);
+        return open(descriptor, components, metadata, true, false);
     }
 
     // use only for offline or "Standalone" operations
     public static SSTableReader openNoValidation(Descriptor descriptor, Set<Component> components, ColumnFamilyStore cfs) throws IOException
     {
-        return open(descriptor, components, cfs.metadata, cfs.partitioner, false, false); // do not track hotness
+        return open(descriptor, components, cfs.metadata, false, true);
+    }
+
+    // use only for offline or "Standalone" operations
+    public static SSTableReader openNoValidation(Descriptor descriptor, CFMetaData metadata) throws IOException
+    {
+        return open(descriptor, componentsFor(descriptor), metadata, false, true);
     }
 
     /**
@@ -383,25 +383,26 @@
      * @param descriptor
      * @param components
      * @param metadata
-     * @param partitioner
      * @return opened SSTableReader
      * @throws IOException
      */
-    public static SSTableReader openForBatch(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner) throws IOException
+    public static SSTableReader openForBatch(Descriptor descriptor, Set<Component> components, CFMetaData metadata) throws IOException
     {
         // Minimum components without which we can't do anything
         assert components.contains(Component.DATA) : "Data component is missing for sstable " + descriptor;
         assert components.contains(Component.PRIMARY_INDEX) : "Primary index component is missing for sstable " + descriptor;
 
-        Map<MetadataType, MetadataComponent> sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor,
-                EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS));
+        EnumSet<MetadataType> types = EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS, MetadataType.HEADER);
+        Map<MetadataType, MetadataComponent> sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor, types);
+
         ValidationMetadata validationMetadata = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
         StatsMetadata statsMetadata = (StatsMetadata) sstableMetadata.get(MetadataType.STATS);
+        SerializationHeader.Component header = (SerializationHeader.Component) sstableMetadata.get(MetadataType.HEADER);
 
         // Check if sstable is created using same partitioner.
         // Partitioner can be null, which indicates older version of sstable or no stats available.
         // In that case, we skip the check.
-        String partitionerName = partitioner.getClass().getCanonicalName();
+        String partitionerName = metadata.partitioner.getClass().getCanonicalName();
         if (validationMetadata != null && !partitionerName.equals(validationMetadata.partitioner))
         {
             logger.error(String.format("Cannot open %s; partitioner %s does not match system partitioner %s.  Note that the default partitioner starting with Cassandra 1.2 is Murmur3Partitioner, so you will need to edit that to match your old partitioner if upgrading.",
@@ -410,8 +411,13 @@
         }
 
         logger.debug("Opening {} ({} bytes)", descriptor, new File(descriptor.filenameFor(Component.DATA)).length());
-        SSTableReader sstable = internalOpen(descriptor, components, metadata, partitioner, System.currentTimeMillis(),
-                statsMetadata, OpenReason.NORMAL);
+        SSTableReader sstable = internalOpen(descriptor,
+                                             components,
+                                             metadata,
+                                             System.currentTimeMillis(),
+                                             statsMetadata,
+                                             OpenReason.NORMAL,
+                                             header == null? null : header.toHeader(metadata));
 
         // special implementation of load to use non-pooled SegmentedFile builders
         try(SegmentedFile.Builder ibuilder = new BufferedSegmentedFile.Builder();
@@ -421,34 +427,58 @@
         {
             if (!sstable.loadSummary(ibuilder, dbuilder))
                 sstable.buildSummary(false, ibuilder, dbuilder, false, Downsampling.BASE_SAMPLING_LEVEL);
-            sstable.ifile = ibuilder.complete(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX));
-            sstable.dfile = dbuilder.complete(sstable.descriptor.filenameFor(Component.DATA));
+            sstable.ifile = ibuilder.buildIndex(sstable.descriptor, sstable.indexSummary);
+            sstable.dfile = dbuilder.buildData(sstable.descriptor, statsMetadata);
             sstable.bf = FilterFactory.AlwaysPresent;
             sstable.setup(false);
             return sstable;
         }
     }
 
+    /**
+     * Open an SSTable for reading
+     * @param descriptor SSTable to open
+     * @param components Components included with this SSTable
+     * @param metadata for this SSTables CF
+     * @param validate Check SSTable for corruption (limited)
+     * @param isOffline Whether we are opening this SSTable "offline", for example from an external tool or not for inclusion in queries (validations)
+     *                  This stops regenerating BF + Summaries and also disables tracking of hotness for the SSTable.
+     * @return {@link SSTableReader}
+     * @throws IOException
+     */
     public static SSTableReader open(Descriptor descriptor,
-                                      Set<Component> components,
-                                      CFMetaData metadata,
-                                      IPartitioner partitioner,
-                                      boolean validate,
-                                      boolean trackHotness) throws IOException
+                                     Set<Component> components,
+                                     CFMetaData metadata,
+                                     boolean validate,
+                                     boolean isOffline) throws IOException
     {
         // Minimum components without which we can't do anything
         assert components.contains(Component.DATA) : "Data component is missing for sstable " + descriptor;
         assert !validate || components.contains(Component.PRIMARY_INDEX) : "Primary index component is missing for sstable " + descriptor;
 
-        Map<MetadataType, MetadataComponent> sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor,
-                                                                                                               EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS));
+        // For the 3.0+ sstable format, the (misnomed) stats component hold the serialization header which we need to deserialize the sstable content
+        assert !descriptor.version.storeRows() || components.contains(Component.STATS) : "Stats component is missing for sstable " + descriptor;
+
+        EnumSet<MetadataType> types = EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS, MetadataType.HEADER);
+
+        Map<MetadataType, MetadataComponent> sstableMetadata;
+        try
+        {
+            sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor, types);
+        }
+        catch (Throwable t)
+        {
+            throw new CorruptSSTableException(t, descriptor.filenameFor(Component.STATS));
+        }
         ValidationMetadata validationMetadata = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
         StatsMetadata statsMetadata = (StatsMetadata) sstableMetadata.get(MetadataType.STATS);
+        SerializationHeader.Component header = (SerializationHeader.Component) sstableMetadata.get(MetadataType.HEADER);
+        assert !descriptor.version.storeRows() || header != null;
 
         // Check if sstable is created using same partitioner.
         // Partitioner can be null, which indicates older version of sstable or no stats available.
         // In that case, we skip the check.
-        String partitionerName = partitioner.getClass().getCanonicalName();
+        String partitionerName = metadata.partitioner.getClass().getCanonicalName();
         if (validationMetadata != null && !partitionerName.equals(validationMetadata.partitioner))
         {
             logger.error(String.format("Cannot open %s; partitioner %s does not match system partitioner %s.  Note that the default partitioner starting with Cassandra 1.2 is Murmur3Partitioner, so you will need to edit that to match your old partitioner if upgrading.",
@@ -457,16 +487,22 @@
         }
 
         logger.debug("Opening {} ({} bytes)", descriptor, new File(descriptor.filenameFor(Component.DATA)).length());
-        SSTableReader sstable = internalOpen(descriptor, components, metadata, partitioner, System.currentTimeMillis(),
-                                             statsMetadata, OpenReason.NORMAL);
+        SSTableReader sstable = internalOpen(descriptor,
+                                             components,
+                                             metadata,
+                                             System.currentTimeMillis(),
+                                             statsMetadata,
+                                             OpenReason.NORMAL,
+                                             header == null ? null : header.toHeader(metadata));
+
         try
         {
             // load index and filter
             long start = System.nanoTime();
-            sstable.load(validationMetadata);
+            sstable.load(validationMetadata, isOffline);
             logger.trace("INDEX LOAD TIME for {}: {} ms.", descriptor, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
 
-            sstable.setup(trackHotness);
+            sstable.setup(!isOffline); // Don't track hotness if we're offline.
             if (validate)
                 sstable.validate();
 
@@ -478,7 +514,7 @@
         catch (Throwable t)
         {
             sstable.selfRef().release();
-            throw t;
+            throw new CorruptSSTableException(t, sstable.getFilename());
         }
     }
 
@@ -491,8 +527,7 @@
     }
 
     public static Collection<SSTableReader> openAll(Set<Map.Entry<Descriptor, Set<Component>>> entries,
-                                                    final CFMetaData metadata,
-                                                    final IPartitioner partitioner)
+                                                    final CFMetaData metadata)
     {
         final Collection<SSTableReader> sstables = new LinkedBlockingQueue<>();
 
@@ -506,7 +541,7 @@
                     SSTableReader sstable;
                     try
                     {
-                        sstable = open(entry.getKey(), entry.getValue(), metadata, partitioner);
+                        sstable = open(entry.getKey(), entry.getValue(), metadata);
                     }
                     catch (CorruptSSTableException ex)
                     {
@@ -522,6 +557,7 @@
                     }
                     catch (IOException ex)
                     {
+                        FileUtils.handleCorruptSSTable(new CorruptSSTableException(ex, entry.getKey().filenameFor(Component.DATA)));
                         logger.error("Cannot read sstable {}; other IO error, skipping table", entry, ex);
                         return;
                     }
@@ -551,18 +587,18 @@
     public static SSTableReader internalOpen(Descriptor desc,
                                       Set<Component> components,
                                       CFMetaData metadata,
-                                      IPartitioner partitioner,
                                       SegmentedFile ifile,
                                       SegmentedFile dfile,
                                       IndexSummary isummary,
                                       IFilter bf,
                                       long maxDataAge,
                                       StatsMetadata sstableMetadata,
-                                      OpenReason openReason)
+                                      OpenReason openReason,
+                                      SerializationHeader header)
     {
-        assert desc != null && partitioner != null && ifile != null && dfile != null && isummary != null && bf != null && sstableMetadata != null;
+        assert desc != null && ifile != null && dfile != null && isummary != null && bf != null && sstableMetadata != null;
 
-        SSTableReader reader = internalOpen(desc, components, metadata, partitioner, maxDataAge, sstableMetadata, openReason);
+        SSTableReader reader = internalOpen(desc, components, metadata, maxDataAge, sstableMetadata, openReason, header);
 
         reader.bf = bf;
         reader.ifile = ifile;
@@ -577,29 +613,30 @@
     private static SSTableReader internalOpen(final Descriptor descriptor,
                                             Set<Component> components,
                                             CFMetaData metadata,
-                                            IPartitioner partitioner,
                                             Long maxDataAge,
                                             StatsMetadata sstableMetadata,
-                                            OpenReason openReason)
+                                            OpenReason openReason,
+                                            SerializationHeader header)
     {
         Factory readerFactory = descriptor.getFormat().getReaderFactory();
 
-        return readerFactory.open(descriptor, components, metadata, partitioner, maxDataAge, sstableMetadata, openReason);
+        return readerFactory.open(descriptor, components, metadata, maxDataAge, sstableMetadata, openReason, header);
     }
 
     protected SSTableReader(final Descriptor desc,
                             Set<Component> components,
                             CFMetaData metadata,
-                            IPartitioner partitioner,
                             long maxDataAge,
                             StatsMetadata sstableMetadata,
-                            OpenReason openReason)
+                            OpenReason openReason,
+                            SerializationHeader header)
     {
-        super(desc, components, metadata, partitioner);
+        super(desc, components, metadata);
         this.sstableMetadata = sstableMetadata;
+        this.header = header;
         this.maxDataAge = maxDataAge;
         this.openReason = openReason;
-        this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata);
+        this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata, desc.version, header);
     }
 
     public static long getTotalBytes(Iterable<SSTableReader> sstables)
@@ -634,43 +671,54 @@
         return dfile.path();
     }
 
-    public void setupKeyCache()
+    public void setupOnline()
     {
         // under normal operation we can do this at any time, but SSTR is also used outside C* proper,
         // e.g. by BulkLoader, which does not initialize the cache.  As a kludge, we set up the cache
         // here when we know we're being wired into the rest of the server infrastructure.
         keyCache = CacheService.instance.keyCache;
+        final ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata.cfId);
+        if (cfs != null)
+            setCrcCheckChance(cfs.getCrcCheckChance());
     }
 
-    private void load(ValidationMetadata validation) throws IOException
+    public boolean isKeyCacheSetup()
     {
-        if (metadata.getBloomFilterFpChance() == 1.0)
+        return keyCache != null;
+    }
+
+    /**
+     * See {@link #load(boolean, boolean)}
+     * @param validation Metadata for SSTable being loaded
+     * @param isOffline Whether the SSTable is being loaded by an offline tool (sstabledump, scrub, etc)
+     * @throws IOException
+     */
+    private void load(ValidationMetadata validation, boolean isOffline) throws IOException
+    {
+        if (metadata.params.bloomFilterFpChance == 1.0)
         {
             // bf is disabled.
-            load(false, true);
+            load(false, !isOffline);
             bf = FilterFactory.AlwaysPresent;
         }
-        else if (!components.contains(Component.PRIMARY_INDEX))
+        else if (!components.contains(Component.PRIMARY_INDEX)) // What happens if filter component and primary index is missing?
         {
             // avoid any reading of the missing primary index component.
             // this should only happen during StandaloneScrubber
-            load(false, false);
+            load(false, !isOffline);
         }
         else if (!components.contains(Component.FILTER) || validation == null)
         {
             // bf is enabled, but filter component is missing.
-            load(true, true);
-        }
-        else if (validation.bloomFilterFPChance != metadata.getBloomFilterFpChance())
-        {
-            // bf fp chance in sstable metadata and it has changed since compaction.
-            load(true, true);
+            load(!isOffline, !isOffline);
+            if (isOffline)
+                bf = FilterFactory.AlwaysPresent;
         }
         else
         {
             // bf is enabled and fp chance matches the currently configured value.
-            load(false, true);
-            loadBloomFilter();
+            load(false, !isOffline);
+            loadBloomFilter(descriptor.version.hasOldBfHashOrder());
         }
     }
 
@@ -679,16 +727,17 @@
      *
      * @throws IOException
      */
-    private void loadBloomFilter() throws IOException
+    private void loadBloomFilter(boolean oldBfHashOrder) throws IOException
     {
         try (DataInputStream stream = new DataInputStream(new BufferedInputStream(new FileInputStream(descriptor.filenameFor(Component.FILTER)))))
         {
-            bf = FilterFactory.deserialize(stream, true);
+            bf = FilterFactory.deserialize(stream, true, oldBfHashOrder);
         }
     }
 
     /**
-     * Loads ifile, dfile and indexSummary, and optionally recreates the bloom filter.
+     * Loads ifile, dfile and indexSummary, and optionally recreates and persists the bloom filter.
+     * @param recreateBloomFilter Recreate the bloomfilter.
      * @param saveSummaryIfCreated for bulk loading purposes, if the summary was absent and needed to be built, you can
      *                             avoid persisting it to disk by setting this to false
      */
@@ -698,41 +747,21 @@
             SegmentedFile.Builder dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(), compression))
         {
             boolean summaryLoaded = loadSummary(ibuilder, dbuilder);
-            boolean builtSummary = false;
-            if (recreateBloomFilter || !summaryLoaded)
-            {
+            boolean buildSummary = !summaryLoaded || recreateBloomFilter;
+            if (buildSummary)
                 buildSummary(recreateBloomFilter, ibuilder, dbuilder, summaryLoaded, Downsampling.BASE_SAMPLING_LEVEL);
-                builtSummary = true;
-            }
 
             if (components.contains(Component.PRIMARY_INDEX))
-                ifile = ibuilder.complete(descriptor.filenameFor(Component.PRIMARY_INDEX));
+                ifile = ibuilder.buildIndex(descriptor, indexSummary);
 
-            dfile = dbuilder.complete(descriptor.filenameFor(Component.DATA));
+            dfile = dbuilder.buildData(descriptor, sstableMetadata);
 
-            // Check for an index summary that was downsampled even though the serialization format doesn't support
-            // that.  If it was downsampled, rebuild it.  See CASSANDRA-8993 for details.
-        if (!descriptor.version.hasSamplingLevel() && !builtSummary && !validateSummarySamplingLevel() && ifile != null)
+            if (buildSummary)
             {
-                indexSummary.close();
-                ifile.close();
-                dfile.close();
-
-                logger.info("Detected erroneously downsampled index summary; will rebuild summary at full sampling");
-                FileUtils.deleteWithConfirm(new File(descriptor.filenameFor(Component.SUMMARY)));
-
-                try(SegmentedFile.Builder ibuilderRebuild = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode(), false);
-                    SegmentedFile.Builder dbuilderRebuild = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(), compression))
-                {
-                    buildSummary(false, ibuilderRebuild, dbuilderRebuild, false, Downsampling.BASE_SAMPLING_LEVEL);
-                    ifile = ibuilderRebuild.complete(descriptor.filenameFor(Component.PRIMARY_INDEX));
-                    dfile = dbuilderRebuild.complete(descriptor.filenameFor(Component.DATA));
-                    saveSummary(ibuilderRebuild, dbuilderRebuild);
-                }
-            }
-            else if (saveSummaryIfCreated && builtSummary)
-            {
-                saveSummary(ibuilder, dbuilder);
+                if (saveSummaryIfCreated)
+                    saveSummary(ibuilder, dbuilder);
+                if (recreateBloomFilter)
+                    saveBloomFilter();
             }
         }
         catch (Throwable t)
@@ -777,24 +806,24 @@
         try (RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX))))
         {
             long indexSize = primaryIndex.length();
-            long histogramCount = sstableMetadata.estimatedRowSize.count();
-            long estimatedKeys = histogramCount > 0 && !sstableMetadata.estimatedRowSize.isOverflowed()
+            long histogramCount = sstableMetadata.estimatedPartitionSize.count();
+            long estimatedKeys = histogramCount > 0 && !sstableMetadata.estimatedPartitionSize.isOverflowed()
                     ? histogramCount
                     : estimateRowsFromIndex(primaryIndex); // statistics is supposed to be optional
 
             if (recreateBloomFilter)
-                bf = FilterFactory.getFilter(estimatedKeys, metadata.getBloomFilterFpChance(), true);
+                bf = FilterFactory.getFilter(estimatedKeys, metadata.params.bloomFilterFpChance, true, descriptor.version.hasOldBfHashOrder());
 
-            try (IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.getMinIndexInterval(), samplingLevel))
+            try (IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, samplingLevel))
             {
                 long indexPosition;
-                RowIndexEntry.IndexSerializer rowIndexSerializer = descriptor.getFormat().getIndexSerializer(metadata);
+                RowIndexEntry.IndexSerializer rowIndexSerializer = descriptor.getFormat().getIndexSerializer(metadata, descriptor.version, header);
 
                 while ((indexPosition = primaryIndex.getFilePointer()) != indexSize)
                 {
                     ByteBuffer key = ByteBufferUtil.readWithShortLength(primaryIndex);
-                    RowIndexEntry indexEntry = rowIndexSerializer.deserialize(primaryIndex, descriptor.version);
-                    DecoratedKey decoratedKey = partitioner.decorateKey(key);
+                    RowIndexEntry indexEntry = rowIndexSerializer.deserialize(primaryIndex);
+                    DecoratedKey decoratedKey = decorateKey(key);
                     if (first == null)
                         first = decoratedKey;
                     last = decoratedKey;
@@ -806,13 +835,11 @@
                     if (!summaryLoaded)
                     {
                         summaryBuilder.maybeAddEntry(decoratedKey, indexPosition);
-                        ibuilder.addPotentialBoundary(indexPosition);
-                        dbuilder.addPotentialBoundary(indexEntry.position);
                     }
                 }
 
                 if (!summaryLoaded)
-                    indexSummary = summaryBuilder.build(partitioner);
+                    indexSummary = summaryBuilder.build(getPartitioner());
             }
         }
 
@@ -842,25 +869,12 @@
         {
             iStream = new DataInputStream(new FileInputStream(summariesFile));
             indexSummary = IndexSummary.serializer.deserialize(
-                    iStream, partitioner, descriptor.version.hasSamplingLevel(),
-                    metadata.getMinIndexInterval(), metadata.getMaxIndexInterval());
-            first = partitioner.decorateKey(ByteBufferUtil.readWithLength(iStream));
-            last = partitioner.decorateKey(ByteBufferUtil.readWithLength(iStream));
-            ibuilder.deserializeBounds(iStream);
-            dbuilder.deserializeBounds(iStream);
-
-            boolean checkForRepair = true;
-            try
-            {
-                int v = iStream.readInt();
-                // check for our magic number, indicating this summary has been sampled correctly
-                checkForRepair = v != ACCURATE_BOUNDARIES_MAGIC_NUMBER;
-            }
-            catch (Throwable t) {}
-
-            // fix CASSANDRA-10357 on-the-fly
-            if (checkForRepair && MmappedSegmentedFile.maybeRepair(metadata, descriptor, indexSummary, ibuilder, dbuilder))
-                saveSummary(ibuilder, dbuilder);
+                    iStream, getPartitioner(), descriptor.version.hasSamplingLevel(),
+                    metadata.params.minIndexInterval, metadata.params.maxIndexInterval);
+            first = decorateKey(ByteBufferUtil.readWithLength(iStream));
+            last = decorateKey(ByteBufferUtil.readWithLength(iStream));
+            ibuilder.deserializeBounds(iStream, descriptor.version);
+            dbuilder.deserializeBounds(iStream, descriptor.version);
         }
         catch (IOException e)
         {
@@ -882,57 +896,6 @@
     }
 
     /**
-     * Validates that an index summary has full sampling, as expected when the serialization format does not support
-     * persisting the sampling level.
-     * @return true if the summary has full sampling, false otherwise
-     */
-    private boolean validateSummarySamplingLevel()
-    {
-        // We need to check index summary entries against the index to verify that none of them were dropped due to
-        // downsampling.  Downsampling can drop any of the first BASE_SAMPLING_LEVEL entries (repeating that drop pattern
-        // for the remainder of the summary).  Unfortunately, the first entry to be dropped is the entry at
-        // index (BASE_SAMPLING_LEVEL - 1), so we need to check a full set of BASE_SAMPLING_LEVEL entries.
-        if (ifile == null)
-            return false;
-
-        Iterator<FileDataInput> segments = ifile.iterator(0);
-        int i = 0;
-        int summaryEntriesChecked = 0;
-        int expectedIndexInterval = getMinIndexInterval();
-        while (segments.hasNext())
-        {
-            String path = null;
-            try (FileDataInput in = segments.next())
-            {
-                path = in.getPath();
-                while (!in.isEOF())
-                {
-                    ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
-                    if (i % expectedIndexInterval == 0)
-                    {
-                        ByteBuffer summaryKey = ByteBuffer.wrap(indexSummary.getKey(i / expectedIndexInterval));
-                        if (!summaryKey.equals(indexKey))
-                            return false;
-                        summaryEntriesChecked++;
-
-                        if (summaryEntriesChecked == Downsampling.BASE_SAMPLING_LEVEL)
-                            return true;
-                    }
-                    RowIndexEntry.Serializer.skip(in);
-                    i++;
-                }
-            }
-            catch (IOException e)
-            {
-                markSuspect();
-                throw new CorruptSSTableException(e, path);
-            }
-        }
-
-        return true;
-    }
-
-    /**
      * Save index summary to Summary.db file.
      *
      * @param ibuilder
@@ -963,10 +926,8 @@
             IndexSummary.serializer.serialize(summary, oStream, descriptor.version.hasSamplingLevel());
             ByteBufferUtil.writeWithLength(first.getKey(), oStream);
             ByteBufferUtil.writeWithLength(last.getKey(), oStream);
-            ibuilder.serializeBounds(oStream);
-            dbuilder.serializeBounds(oStream);
-            // write a magic number, to indicate this summary has been sampled correctly
-            oStream.writeInt(ACCURATE_BOUNDARIES_MAGIC_NUMBER);
+            ibuilder.serializeBounds(oStream, descriptor.version);
+            dbuilder.serializeBounds(oStream, descriptor.version);
         }
         catch (IOException e)
         {
@@ -978,6 +939,30 @@
         }
     }
 
+    public void saveBloomFilter()
+    {
+        saveBloomFilter(this.descriptor, bf);
+    }
+
+    public static void saveBloomFilter(Descriptor descriptor, IFilter filter)
+    {
+        File filterFile = new File(descriptor.filenameFor(Component.FILTER));
+        try (DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(new FileOutputStream(filterFile)))
+        {
+            FilterFactory.serialize(filter, stream);
+            stream.flush();
+        }
+        catch (IOException e)
+        {
+            logger.trace("Cannot save SSTable bloomfilter: ", e);
+
+            // corrupted hence delete it and let it load it now.
+            if (filterFile.exists())
+                FileUtils.deleteWithConfirm(filterFile);
+        }
+
+    }
+
     public void setReplaced()
     {
         synchronized (tidy.global)
@@ -995,7 +980,7 @@
         }
     }
 
-    // runOnClose must NOT be an anonymous or non-static inner class, nor must it retain a reference chain to this reader
+    // These runnables must NOT be an anonymous or non-static inner class, nor must it retain a reference chain to this reader
     public void runOnClose(final Runnable runOnClose)
     {
         synchronized (tidy.global)
@@ -1059,20 +1044,28 @@
         SSTableReader replacement = internalOpen(descriptor,
                                                  components,
                                                  metadata,
-                                                 partitioner,
                                                  ifile != null ? ifile.sharedCopy() : null,
                                                  dfile.sharedCopy(),
                                                  newSummary,
                                                  bf.sharedCopy(),
                                                  maxDataAge,
                                                  sstableMetadata,
-                                                 reason);
+                                                 reason,
+                                                 header);
         replacement.first = newFirst;
         replacement.last = last;
         replacement.isSuspect.set(isSuspect.get());
         return replacement;
     }
 
+    public SSTableReader cloneWithRestoredStart(DecoratedKey restoredStart)
+    {
+        synchronized (tidy.global)
+        {
+            return cloneAndReplace(restoredStart, OpenReason.NORMAL);
+        }
+    }
+
     // runOnClose must NOT be an anonymous or non-static inner class, nor must it retain a reference chain to this reader
     public SSTableReader cloneWithNewStart(DecoratedKey newStart, final Runnable runOnClose)
     {
@@ -1135,12 +1128,11 @@
         {
             assert openReason != OpenReason.EARLY;
 
-            int minIndexInterval = metadata.getMinIndexInterval();
-            int maxIndexInterval = metadata.getMaxIndexInterval();
+            int minIndexInterval = metadata.params.minIndexInterval;
+            int maxIndexInterval = metadata.params.maxIndexInterval;
             double effectiveInterval = indexSummary.getEffectiveIndexInterval();
 
             IndexSummary newSummary;
-            long oldSize = bytesOnDisk();
 
             // We have to rebuild the summary from the on-disk primary index in three cases:
             // 1. The sampling level went up, so we need to read more entries off disk
@@ -1154,7 +1146,7 @@
             else if (samplingLevel < indexSummary.getSamplingLevel())
             {
                 // we can use the existing index summary to make a smaller one
-                newSummary = IndexSummaryBuilder.downsample(indexSummary, samplingLevel, minIndexInterval, partitioner);
+                newSummary = IndexSummaryBuilder.downsample(indexSummary, samplingLevel, minIndexInterval, getPartitioner());
             }
             else
             {
@@ -1166,18 +1158,9 @@
             try(SegmentedFile.Builder ibuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode(), false);
                 SegmentedFile.Builder dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(), compression))
             {
-                for (long boundry : dfile.copyReadableBounds())
-                    dbuilder.addPotentialBoundary(boundry);
-                for (long boundry : ifile.copyReadableBounds())
-                    ibuilder.addPotentialBoundary(boundry);
-
                 saveSummary(ibuilder, dbuilder, newSummary);
             }
 
-            // The new size will be added in Transactional.commit() as an updated SSTable, more details: CASSANDRA-13738
-            StorageMetrics.load.dec(oldSize);
-            parent.metric.liveDiskSpaceUsed.dec(oldSize);
-
             return cloneAndReplace(first, OpenReason.METADATA_CHANGE, newSummary);
         }
     }
@@ -1189,16 +1172,16 @@
         try
         {
             long indexSize = primaryIndex.length();
-            try (IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata.getMinIndexInterval(), newSamplingLevel))
+            try (IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata.params.minIndexInterval, newSamplingLevel))
             {
                 long indexPosition;
                 while ((indexPosition = primaryIndex.getFilePointer()) != indexSize)
                 {
-                    summaryBuilder.maybeAddEntry(partitioner.decorateKey(ByteBufferUtil.readWithShortLength(primaryIndex)), indexPosition);
-                    RowIndexEntry.Serializer.skip(primaryIndex);
+                    summaryBuilder.maybeAddEntry(decorateKey(ByteBufferUtil.readWithShortLength(primaryIndex)), indexPosition);
+                    RowIndexEntry.Serializer.skip(primaryIndex, descriptor.version);
                 }
 
-                return summaryBuilder.build(partitioner);
+                return summaryBuilder.build(getPartitioner());
             }
         }
         finally
@@ -1242,8 +1225,7 @@
     {
         if (this.first.compareTo(this.last) > 0)
         {
-            selfRef().release();
-            throw new IllegalStateException(String.format("SSTable first key %s > last key %s", this.first, this.last));
+            throw new CorruptSSTableException(new IllegalStateException(String.format("SSTable first key %s > last key %s", this.first, this.last)), getFilename());
         }
     }
 
@@ -1251,7 +1233,7 @@
      * Gets the position in the index file to start scanning to find the given key (at most indexInterval keys away,
      * modulo downsampling of the index summary). Always returns a value >= 0
      */
-    public long getIndexScanPosition(RowPosition key)
+    public long getIndexScanPosition(PartitionPosition key)
     {
         if (openReason == OpenReason.MOVED_START && key.compareTo(first) < 0)
             key = first;
@@ -1294,13 +1276,7 @@
         if (!compression)
             throw new IllegalStateException(this + " is not compressed");
 
-        CompressionMetadata cmd = ((ICompressedFile) dfile).getMetadata();
-
-        //We need the parent cf metadata
-        String cfName = metadata.isSecondaryIndex() ? metadata.getParentColumnFamilyName() : metadata.cfName;
-        cmd.parameters.setLiveMetadata(Schema.instance.getCFMetaData(metadata.ksName, cfName));
-
-        return cmd;
+        return ((ICompressedFile) dfile).getMetadata();
     }
 
     /**
@@ -1398,8 +1374,8 @@
 
         for (Range<Token> range : Range.normalize(ranges))
         {
-            RowPosition leftPosition = range.left.maxKeyBound();
-            RowPosition rightPosition = range.right.maxKeyBound();
+            PartitionPosition leftPosition = range.left.maxKeyBound();
+            PartitionPosition rightPosition = range.right.maxKeyBound();
 
             int left = summary.binarySearch(leftPosition);
             if (left < 0)
@@ -1469,7 +1445,7 @@
                     public DecoratedKey next()
                     {
                         byte[] bytes = indexSummary.getKey(idx++);
-                        return partitioner.decorateKey(ByteBuffer.wrap(bytes));
+                        return decorateKey(ByteBuffer.wrap(bytes));
                     }
 
                     public void remove()
@@ -1493,9 +1469,9 @@
         {
             assert !range.isWrapAround() || range.right.isMinimum();
             // truncate the range so it at most covers the sstable
-            AbstractBounds<RowPosition> bounds = Range.makeRowRange(range);
-            RowPosition leftBound = bounds.left.compareTo(first) > 0 ? bounds.left : first.getToken().minKeyBound();
-            RowPosition rightBound = bounds.right.isMinimum() ? last.getToken().maxKeyBound() : bounds.right;
+            AbstractBounds<PartitionPosition> bounds = Range.makeRowRange(range);
+            PartitionPosition leftBound = bounds.left.compareTo(first) > 0 ? bounds.left : first.getToken().minKeyBound();
+            PartitionPosition rightBound = bounds.right.isMinimum() ? last.getToken().maxKeyBound() : bounds.right;
 
             if (leftBound.compareTo(last) > 0 || rightBound.compareTo(first) < 0)
                 continue;
@@ -1522,14 +1498,10 @@
 
     public void cacheKey(DecoratedKey key, RowIndexEntry info)
     {
-        CachingOptions caching = metadata.getCaching();
+        CachingParams caching = metadata.params.caching;
 
-        if (!caching.keyCache.isEnabled()
-                || keyCache == null
-                || keyCache.getCapacity() == 0)
-        {
+        if (!caching.cacheKeys() || keyCache == null || keyCache.getCapacity() == 0)
             return;
-        }
 
         KeyCacheKey cacheKey = new KeyCacheKey(metadata.ksAndCFName, descriptor, key.getKey());
         logger.trace("Adding cache entry for {} -> {}", cacheKey, info);
@@ -1543,7 +1515,7 @@
 
     protected RowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
     {
-        if (keyCache != null && keyCache.getCapacity() > 0 && metadata.getCaching().keyCache.isEnabled()) {
+        if (keyCache != null && keyCache.getCapacity() > 0 && metadata.params.caching.cacheKeys()) {
             if (updateStats)
             {
                 RowIndexEntry cachedEntry = keyCache.get(unifiedKey);
@@ -1565,16 +1537,21 @@
 
     /**
      * Get position updating key cache and stats.
-     * @see #getPosition(org.apache.cassandra.db.RowPosition, SSTableReader.Operator, boolean)
+     * @see #getPosition(PartitionPosition, SSTableReader.Operator, boolean)
      */
-    public RowIndexEntry getPosition(RowPosition key, Operator op)
+    public final RowIndexEntry getPosition(PartitionPosition key, Operator op)
     {
-        return getPosition(key, op, true, false);
+        return getPosition(key, op, SSTableReadsListener.NOOP_LISTENER);
     }
 
-    public RowIndexEntry getPosition(RowPosition key, Operator op, boolean updateCacheAndStats)
+    public final RowIndexEntry getPosition(PartitionPosition key, Operator op, SSTableReadsListener listener)
     {
-        return getPosition(key, op, updateCacheAndStats, false);
+        return getPosition(key, op, true, false, listener);
+    }
+
+    public final RowIndexEntry getPosition(PartitionPosition key, Operator op, boolean updateCacheAndStats)
+    {
+        return getPosition(key, op, updateCacheAndStats, false, SSTableReadsListener.NOOP_LISTENER);
     }
     /**
      * @param key The key to apply as the rhs to the given Operator. A 'fake' key is allowed to
@@ -1583,20 +1560,29 @@
      * @param updateCacheAndStats true if updating stats and cache
      * @return The index entry corresponding to the key, or null if the key is not present
      */
-    protected abstract RowIndexEntry getPosition(RowPosition key, Operator op, boolean updateCacheAndStats, boolean permitMatchPastLast);
+    protected abstract RowIndexEntry getPosition(PartitionPosition key,
+                                                 Operator op,
+                                                 boolean updateCacheAndStats,
+                                                 boolean permitMatchPastLast,
+                                                 SSTableReadsListener listener);
 
-    //Corresponds to a name column
-    public abstract OnDiskAtomIterator iterator(DecoratedKey key, SortedSet<CellName> columns);
-    public abstract OnDiskAtomIterator iterator(FileDataInput file, DecoratedKey key, SortedSet<CellName> columns, RowIndexEntry indexEntry);
+    public abstract SliceableUnfilteredRowIterator iterator(DecoratedKey key,
+                                                            ColumnFilter selectedColumns,
+                                                            boolean reversed,
+                                                            boolean isForThrift,
+                                                            SSTableReadsListener listener);
 
-    //Corresponds to a slice query
-    public abstract OnDiskAtomIterator iterator(DecoratedKey key, ColumnSlice[] slices, boolean reverse);
-    public abstract OnDiskAtomIterator iterator(FileDataInput file, DecoratedKey key, ColumnSlice[] slices, boolean reversed, RowIndexEntry indexEntry);
+    public abstract SliceableUnfilteredRowIterator iterator(FileDataInput file,
+                                                            DecoratedKey key,
+                                                            RowIndexEntry indexEntry,
+                                                            ColumnFilter selectedColumns,
+                                                            boolean reversed,
+                                                            boolean isForThrift);
 
     /**
      * Finds and returns the first key beyond a given token in this SSTable or null if no such key exists.
      */
-    public DecoratedKey firstKeyBeyond(RowPosition token)
+    public DecoratedKey firstKeyBeyond(PartitionPosition token)
     {
         if (token.compareTo(first) < 0)
             return first;
@@ -1606,28 +1592,24 @@
         if (ifile == null)
             return null;
 
-        Iterator<FileDataInput> segments = ifile.iterator(sampledPosition);
-        while (segments.hasNext())
+        String path = null;
+        try (FileDataInput in = ifile.createReader(sampledPosition))
         {
-            String path = null;
-            try (FileDataInput in = segments.next();)
+            path = in.getPath();
+            while (!in.isEOF())
             {
-                path = in.getPath();
-                while (!in.isEOF())
-                {
-                    ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
-                    DecoratedKey indexDecoratedKey = partitioner.decorateKey(indexKey);
-                    if (indexDecoratedKey.compareTo(token) > 0)
-                        return indexDecoratedKey;
+                ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
+                DecoratedKey indexDecoratedKey = decorateKey(indexKey);
+                if (indexDecoratedKey.compareTo(token) > 0)
+                    return indexDecoratedKey;
 
-                    RowIndexEntry.Serializer.skip(in);
-                }
+                RowIndexEntry.Serializer.skip(in, descriptor.version);
             }
-            catch (IOException e)
-            {
-                markSuspect();
-                throw new CorruptSSTableException(e, path);
-            }
+        }
+        catch (IOException e)
+        {
+            markSuspect();
+            throw new CorruptSSTableException(e, path);
         }
 
         return null;
@@ -1653,6 +1635,26 @@
         return dfile.onDiskLength;
     }
 
+    @VisibleForTesting
+    public double getCrcCheckChance()
+    {
+        return crcCheckChance;
+    }
+
+    /**
+     * Set the value of CRC check chance. The argument supplied is obtained
+     * from the the property of the owning CFS. Called when either the SSTR
+     * is initialized, or the CFS's property is updated via JMX
+     * @param crcCheckChance
+     */
+    public void setCrcCheckChance(double crcCheckChance)
+    {
+        this.crcCheckChance = crcCheckChance;
+        if (compression)
+            ((CompressedSegmentedFile)dfile).metadata.parameters.setCrcCheckChance(crcCheckChance);
+
+    }
+
     /**
      * Mark the sstable as obsolete, i.e., compacted into newer sstables.
      *
@@ -1662,7 +1664,7 @@
      * @return true if the this is the first time the file was marked obsolete.  Calling this
      * multiple times is usually buggy (see exceptions in Tracker.unmarkCompacting and removeOldSSTablesSize).
      */
-    public boolean markObsolete(Tracker tracker)
+    public void markObsolete(Runnable tidier)
     {
         if (logger.isTraceEnabled())
             logger.trace("Marking {} compacted", getFilename());
@@ -1670,18 +1672,16 @@
         synchronized (tidy.global)
         {
             assert !tidy.isReplaced;
+            assert tidy.global.obsoletion == null: this + " was already marked compacted";
+
+            tidy.global.obsoletion = tidier;
+            tidy.global.stopReadMeterPersistence();
         }
-        if (!tidy.global.isCompacted.getAndSet(true))
-        {
-            tidy.type.markObsolete(this, tracker);
-            return true;
-        }
-        return false;
     }
 
     public boolean isMarkedCompacted()
     {
-        return tidy.global.isCompacted.get();
+        return tidy.global.obsoletion != null;
     }
 
     public void markSuspect()
@@ -1707,19 +1707,18 @@
         return getScanner((RateLimiter) null);
     }
 
-    public ISSTableScanner getScanner(RateLimiter limiter)
-    {
-        return getScanner(DataRange.allData(partitioner), limiter);
-    }
-
     /**
-     *
+     * @param columns the columns to return.
      * @param dataRange filter to use when reading the columns
+     * @param listener a listener used to handle internal read events
      * @return A Scanner for seeking over the rows of the SSTable.
      */
-    public ISSTableScanner getScanner(DataRange dataRange)
+    public ISSTableScanner getScanner(ColumnFilter columns,
+                                      DataRange dataRange,
+                                      boolean isForThrift,
+                                      SSTableReadsListener listener)
     {
-        return getScanner(dataRange, null);
+        return getScanner(columns, dataRange, null, isForThrift, listener);
     }
 
     /**
@@ -1736,6 +1735,13 @@
     }
 
     /**
+     * Direct I/O SSTableScanner over the entirety of the sstable..
+     *
+     * @return A Scanner over the full content of the SSTable.
+     */
+    public abstract ISSTableScanner getScanner(RateLimiter limiter);
+
+    /**
      * Direct I/O SSTableScanner over a defined collection of ranges of tokens.
      *
      * @param ranges the range of keys to cover
@@ -1744,17 +1750,28 @@
     public abstract ISSTableScanner getScanner(Collection<Range<Token>> ranges, RateLimiter limiter);
 
     /**
+     * Direct I/O SSTableScanner over an iterator of bounds.
      *
-     * @param dataRange filter to use when reading the columns
+     * @param rangeIterator the keys to cover
      * @return A Scanner for seeking over the rows of the SSTable.
      */
-    public abstract ISSTableScanner getScanner(DataRange dataRange, RateLimiter limiter);
+    public abstract ISSTableScanner getScanner(Iterator<AbstractBounds<PartitionPosition>> rangeIterator);
 
-
+    /**
+     * @param columns the columns to return.
+     * @param dataRange filter to use when reading the columns
+     * @param listener a listener used to handle internal read events
+     * @return A Scanner for seeking over the rows of the SSTable.
+     */
+    public abstract ISSTableScanner getScanner(ColumnFilter columns,
+                                               DataRange dataRange,
+                                               RateLimiter limiter,
+                                               boolean isForThrift,
+                                               SSTableReadsListener listener);
 
     public FileDataInput getFileDataInput(long position)
     {
-        return dfile.getSegment(position);
+        return dfile.createReader(position);
     }
 
     /**
@@ -1773,6 +1790,8 @@
         for (Component component : components)
         {
             File sourceFile = new File(descriptor.filenameFor(component));
+            if (!sourceFile.exists())
+                continue;
             File targetLink = new File(snapshotDirectoryPath, sourceFile.getName());
             FileUtils.createHardLink(sourceFile, targetLink);
         }
@@ -1839,9 +1858,9 @@
         return keyCache;
     }
 
-    public EstimatedHistogram getEstimatedRowSize()
+    public EstimatedHistogram getEstimatedPartitionSize()
     {
-        return sstableMetadata.estimatedRowSize;
+        return sstableMetadata.estimatedPartitionSize;
     }
 
     public EstimatedHistogram getEstimatedColumnCount()
@@ -1874,20 +1893,49 @@
         return sstableMetadata.maxTimestamp;
     }
 
-    public Set<Integer> getAncestors()
+    public int getMinLocalDeletionTime()
     {
-        try
-        {
-            CompactionMetadata compactionMetadata = (CompactionMetadata) descriptor.getMetadataSerializer().deserialize(descriptor, MetadataType.COMPACTION);
-            if (compactionMetadata != null)
-                return compactionMetadata.ancestors;
-            return Collections.emptySet();
-        }
-        catch (IOException e)
-        {
-            SSTableReader.logOpenException(descriptor, e);
-            return Collections.emptySet();
-        }
+        return sstableMetadata.minLocalDeletionTime;
+    }
+
+    public int getMaxLocalDeletionTime()
+    {
+        return sstableMetadata.maxLocalDeletionTime;
+    }
+
+    /** sstable contains no tombstones if minLocalDeletionTime == Integer.MAX_VALUE */
+    public boolean hasTombstones()
+    {
+        // sstable contains no tombstone if minLocalDeletionTime is still set to  the default value Integer.MAX_VALUE
+        // which is bigger than any valid deletion times
+        return getMinLocalDeletionTime() != Integer.MAX_VALUE;
+    }
+
+    public int getMinTTL()
+    {
+        return sstableMetadata.minTTL;
+    }
+
+    public int getMaxTTL()
+    {
+        return sstableMetadata.maxTTL;
+    }
+
+    public long getTotalColumnsSet()
+    {
+        return sstableMetadata.totalColumnsSet;
+    }
+
+    public long getTotalRows()
+    {
+        return sstableMetadata.totalRows;
+    }
+
+    public int getAvgColumnSetPerRow()
+    {
+        return sstableMetadata.totalRows < 0
+             ? -1
+             : (sstableMetadata.totalRows == 0 ? 0 : (int)(sstableMetadata.totalColumnsSet / sstableMetadata.totalRows));
     }
 
     public int getSSTableLevel()
@@ -1917,7 +1965,7 @@
     public RandomAccessReader openDataReader(RateLimiter limiter)
     {
         assert limiter != null;
-        return dfile.createThrottledReader(limiter);
+        return dfile.createReader(limiter);
     }
 
     public RandomAccessReader openDataReader()
@@ -1932,6 +1980,16 @@
         return null;
     }
 
+    public ChannelProxy getDataChannel()
+    {
+        return dfile.channel;
+    }
+
+    public ChannelProxy getIndexChannel()
+    {
+        return ifile.channel;
+    }
+
     /**
      * @param component component to get timestamp.
      * @return last modified time for given component. 0 if given component does not exist or IO error occurs.
@@ -1958,8 +2016,8 @@
     }
 
     /**
-     * Increment the total row read count and read rate for this SSTable.  This should not be incremented for range
-     * slice queries, row cache hits, or non-query reads, like compaction.
+     * Increment the total read count and read rate for this SSTable.  This should not be incremented for non-query reads,
+     * like compaction.
      */
     public void incrementReadCount()
     {
@@ -1967,6 +2025,34 @@
             readMeter.mark();
     }
 
+    /**
+     * Checks if this sstable can overlap with another one based on the min/man clustering values.
+     * If this methods return false, we're guarantee that {@code this} and {@code other} have no overlapping
+     * data, i.e. no cells to reconcile.
+     */
+    public boolean mayOverlapsWith(SSTableReader other)
+    {
+        StatsMetadata m1 = getSSTableMetadata();
+        StatsMetadata m2 = other.getSSTableMetadata();
+
+        if (m1.minClusteringValues.isEmpty() || m1.maxClusteringValues.isEmpty() || m2.minClusteringValues.isEmpty() || m2.maxClusteringValues.isEmpty())
+            return true;
+
+        return !(compare(m1.maxClusteringValues, m2.minClusteringValues) < 0 || compare(m1.minClusteringValues, m2.maxClusteringValues) > 0);
+    }
+
+    private int compare(List<ByteBuffer> values1, List<ByteBuffer> values2)
+    {
+        ClusteringComparator comparator = metadata.comparator;
+        for (int i = 0; i < Math.min(values1.size(), values2.size()); i++)
+        {
+            int cmp = comparator.subtype(i).compare(values1.get(i), values2.get(i));
+            if (cmp != 0)
+                return cmp;
+        }
+        return 0;
+    }
+
     public static class SizeComparator implements Comparator<SSTableReader>
     {
         public int compare(SSTableReader o1, SSTableReader o2)
@@ -2002,19 +2088,25 @@
         this.readMeter = tidy.global.readMeter = readMeter;
     }
 
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        identities.add(this);
+        identities.add(tidy.globalRef);
+        dfile.addTo(identities);
+        ifile.addTo(identities);
+        bf.addTo(identities);
+        indexSummary.addTo(identities);
+
+    }
+
     /**
-     * One instance per SSTableReader we create. This references the type-shared tidy, which in turn references
-     * the globally shared tidy, i.e.
+     * One instance per SSTableReader we create.
      *
-     * InstanceTidier => DescriptorTypeTitdy => GlobalTidy
+     * We can create many InstanceTidiers (one for every time we reopen an sstable with MOVED_START for example),
+     * but there can only be one GlobalTidy for one single logical sstable.
      *
-     * We can create many InstanceTidiers (one for every time we reopen an sstable with MOVED_START for example), but there can only be
-     * two DescriptorTypeTidy (FINAL and TEMPLINK) and only one GlobalTidy for one single logical sstable.
-     *
-     * When the InstanceTidier cleansup, it releases its reference to its DescriptorTypeTidy; when all InstanceTidiers
-     * for that type have run, the DescriptorTypeTidy cleansup. DescriptorTypeTidy behaves in the same way towards GlobalTidy.
-     *
-     * For ease, we stash a direct reference to both our type-shared and global tidier
+     * When the InstanceTidier cleansup, it releases its reference to its GlobalTidy; when all InstanceTidiers
+     * for that type have run, the GlobalTidy cleans up.
      */
     private static final class InstanceTidier implements Tidy
     {
@@ -2028,13 +2120,9 @@
         private Runnable runOnClose;
         private boolean isReplaced = false;
 
-        // a reference to our shared per-Descriptor.Type tidy instance, that
+        // a reference to our shared tidy instance, that
         // we will release when we are ourselves released
-        private Ref<DescriptorTypeTidy> typeRef;
-
-        // a convenience stashing of the shared per-descriptor-type tidy instance itself
-        // and the per-logical-sstable globally shared state that it is linked to
-        private DescriptorTypeTidy type;
+        private Ref<GlobalTidy> globalRef;
         private GlobalTidy global;
 
         private volatile boolean setup;
@@ -2047,9 +2135,8 @@
             this.dfile = reader.dfile;
             this.ifile = reader.ifile;
             // get a new reference to the shared descriptor-type tidy
-            this.typeRef = DescriptorTypeTidy.get(reader);
-            this.type = typeRef.get();
-            this.global = type.globalRef.get();
+            this.globalRef = GlobalTidy.get(reader);
+            this.global = globalRef.get();
             if (trackHotness)
                 global.ensureReadMeter();
         }
@@ -2102,7 +2189,7 @@
                         dfile.close();
                     if (ifile != null)
                         ifile.close();
-                    typeRef.release();
+                    globalRef.release();
 
                     if (logger.isTraceEnabled())
                         logger.trace("Async instance tidier for {}, completed", descriptor);
@@ -2124,102 +2211,17 @@
     }
 
     /**
-     * One shared between all instances of a given Descriptor.Type.
-     * Performs only two things: the deletion of the sstables for the type,
-     * if necessary; and the shared reference to the globally shared state.
+     * One instance per logical sstable. This both tracks shared cleanup and some shared state related
+     * to the sstable's lifecycle.
      *
      * All InstanceTidiers, on setup(), ask the static get() method for their shared state,
      * and stash a reference to it to be released when they are. Once all such references are
-     * released, the shared tidy will be performed.
-     */
-    static final class DescriptorTypeTidy implements Tidy
-    {
-        // keyed by REAL descriptor (TMPLINK/FINAL), mapping to the shared DescriptorTypeTidy for that descriptor
-        static final ConcurrentMap<Descriptor, Ref<DescriptorTypeTidy>> lookup = new ConcurrentHashMap<>();
-
-        private final Descriptor desc;
-        private final Ref<GlobalTidy> globalRef;
-        private final Set<Component> components;
-        private long sizeOnDelete;
-        private Counter totalDiskSpaceUsed;
-
-        DescriptorTypeTidy(Descriptor desc, SSTableReader sstable)
-        {
-            this.desc = desc;
-            // get a new reference to the shared global tidy
-            this.globalRef = GlobalTidy.get(sstable);
-            this.components = sstable.components;
-        }
-
-        void markObsolete(SSTableReader instance, Tracker tracker)
-        {
-            // the tracker is used only to notify listeners of deletion of the sstable;
-            // since deletion of a non-final file is not really deletion of the sstable,
-            // we don't want to notify the listeners in this event
-            if (tracker != null && tracker.cfstore != null && desc.type == Descriptor.Type.FINAL)
-            {
-                sizeOnDelete = instance.bytesOnDisk();
-                totalDiskSpaceUsed = tracker.cfstore.metric.totalDiskSpaceUsed;
-                tracker.notifyDeleting(instance);
-            }
-        }
-
-        public void tidy()
-        {
-            lookup.remove(desc);
-            boolean isCompacted = globalRef.get().isCompacted.get();
-            globalRef.release();
-            switch (desc.type)
-            {
-                case FINAL:
-                    if (isCompacted)
-                        new SSTableDeletingTask(desc, components, totalDiskSpaceUsed, sizeOnDelete).run();
-                    break;
-                case TEMPLINK:
-                    new SSTableDeletingTask(desc, components, null, 0).run();
-                    break;
-                default:
-                    throw new IllegalStateException();
-            }
-        }
-
-        public String name()
-        {
-            return desc.toString();
-        }
-
-        // get a new reference to the shared DescriptorTypeTidy for this sstable
-        @SuppressWarnings("resource")
-        public static Ref<DescriptorTypeTidy> get(SSTableReader sstable)
-        {
-            Descriptor desc = sstable.descriptor;
-            if (sstable.openReason == OpenReason.EARLY)
-                desc = desc.asType(Descriptor.Type.TEMPLINK);
-            Ref<DescriptorTypeTidy> refc = lookup.get(desc);
-            if (refc != null)
-                return refc.ref();
-            final DescriptorTypeTidy tidy = new DescriptorTypeTidy(desc, sstable);
-            refc = new Ref<>(tidy, tidy);
-            Ref<?> ex = lookup.putIfAbsent(desc, refc);
-            if (ex != null)
-            {
-                refc.close();
-                throw new AssertionError();
-            }
-            return refc;
-        }
-    }
-
-    /**
-     * One instance per logical sstable. This both tracks shared cleanup and some shared state related
-     * to the sstable's lifecycle. All DescriptorTypeTidy instances, on construction, obtain a reference to us
-     * via our static get(). There should only ever be at most two such references extant at any one time,
-     * since only TMPLINK and FINAL type descriptors should be open as readers. When all files of both
-     * kinds have been released, this shared tidy will be performed.
+     * released, this shared tidy will be performed.
      */
     static final class GlobalTidy implements Tidy
     {
-        // keyed by FINAL descriptor, mapping to the shared GlobalTidy for that descriptor
+        static WeakReference<ScheduledFuture<?>> NULL = new WeakReference<>(null);
+        // keyed by descriptor, mapping to the shared GlobalTidy for that descriptor
         static final ConcurrentMap<Descriptor, Ref<GlobalTidy>> lookup = new ConcurrentHashMap<>();
 
         private final Descriptor desc;
@@ -2228,15 +2230,13 @@
         private RestorableMeter readMeter;
         // the scheduled persistence of the readMeter, that we will cancel once all instances of this logical
         // sstable have been released
-        private ScheduledFuture readMeterSyncFuture;
-        // shared state managing if the logical sstable has been compacted; this is used in cleanup both here
-        // and in the FINAL type tidier
-        private final AtomicBoolean isCompacted;
+        private WeakReference<ScheduledFuture<?>> readMeterSyncFuture = NULL;
+        // shared state managing if the logical sstable has been compacted; this is used in cleanup
+        private volatile Runnable obsoletion;
 
         GlobalTidy(final SSTableReader reader)
         {
             this.desc = reader.descriptor;
-            this.isCompacted = new AtomicBoolean();
         }
 
         void ensureReadMeter()
@@ -2246,40 +2246,48 @@
 
             // Don't track read rates for tables in the system keyspace and don't bother trying to load or persist
             // the read meter when in client mode.
-            if (SystemKeyspace.NAME.equals(desc.ksname) || !DatabaseDescriptor.isDaemonInitialized())
+            if (Schema.isLocalSystemKeyspace(desc.ksname) || !DatabaseDescriptor.isDaemonInitialized())
             {
                 readMeter = null;
-                readMeterSyncFuture = null;
+                readMeterSyncFuture = NULL;
                 return;
             }
 
             readMeter = SystemKeyspace.getSSTableReadMeter(desc.ksname, desc.cfname, desc.generation);
             // sync the average read rate to system.sstable_activity every five minutes, starting one minute from now
-            readMeterSyncFuture = syncExecutor.scheduleAtFixedRate(new Runnable()
+            readMeterSyncFuture = new WeakReference<>(syncExecutor.scheduleAtFixedRate(new Runnable()
             {
                 public void run()
                 {
-                    if (!isCompacted.get())
+                    if (obsoletion == null)
                     {
                         meterSyncThrottle.acquire();
                         SystemKeyspace.persistSSTableReadMeter(desc.ksname, desc.cfname, desc.generation, readMeter);
                     }
                 }
-            }, 1, 5, TimeUnit.MINUTES);
+            }, 1, 5, TimeUnit.MINUTES));
+        }
+
+        private void stopReadMeterPersistence()
+        {
+            ScheduledFuture<?> readMeterSyncFutureLocal = readMeterSyncFuture.get();
+            if (readMeterSyncFutureLocal != null)
+            {
+                readMeterSyncFutureLocal.cancel(true);
+                readMeterSyncFuture = NULL;
+            }
         }
 
         public void tidy()
         {
             lookup.remove(desc);
-            if (readMeterSyncFuture != null)
-            {
-                readMeterSyncFuture.cancel(true);
-                if (isCompacted.get())
-                    SystemKeyspace.clearSSTableReadMeter(desc.ksname, desc.cfname, desc.generation);
-            }
+
+            if (obsoletion != null)
+                obsoletion.run();
+
             // don't ideally want to dropPageCache for the file until all instances have been released
-            CLibrary.trySkipCache(desc.filenameFor(Component.DATA), 0, 0);
-            CLibrary.trySkipCache(desc.filenameFor(Component.PRIMARY_INDEX), 0, 0);
+            NativeLibrary.trySkipCache(desc.filenameFor(Component.DATA), 0, 0);
+            NativeLibrary.trySkipCache(desc.filenameFor(Component.PRIMARY_INDEX), 0, 0);
         }
 
         public String name()
@@ -2311,7 +2319,6 @@
     public static void resetTidying()
     {
         GlobalTidy.lookup.clear();
-        DescriptorTypeTidy.lookup.clear();
     }
 
     public static abstract class Factory
@@ -2319,10 +2326,10 @@
         public abstract SSTableReader open(final Descriptor descriptor,
                                            Set<Component> components,
                                            CFMetaData metadata,
-                                           IPartitioner partitioner,
                                            Long maxDataAge,
                                            StatsMetadata sstableMetadata,
-                                           OpenReason openReason);
+                                           OpenReason openReason,
+                                           SerializationHeader header);
 
     }
 

diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
new file mode 100644
index 0000000..6d384bf
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format;
+
+import org.apache.cassandra.db.RowIndexEntry;
+
+/**
+ * Listener for receiving notifications associated with reading SSTables.
+ */
+public interface SSTableReadsListener
+{
+    /**
+     * The reasons for skipping an SSTable
+     */
+    enum SkippingReason
+    {
+        BLOOM_FILTER,
+        MIN_MAX_KEYS,
+        PARTITION_INDEX_LOOKUP,
+        INDEX_ENTRY_NOT_FOUND;
+    }
+
+    /**
+     * The reasons for selecting an SSTable
+     */
+    enum SelectionReason
+    {
+        KEY_CACHE_HIT,
+        INDEX_ENTRY_FOUND;
+    }
+
+    /**
+     * Listener that does nothing.
+     */
+    static final SSTableReadsListener NOOP_LISTENER = new SSTableReadsListener() {};
+
+    /**
+     * Handles notification that the specified SSTable has been skipped during a single partition query.
+     *
+     * @param sstable the SSTable reader
+     * @param reason the reason for which the SSTable has been skipped
+     */
+    default void onSSTableSkipped(SSTableReader sstable, SkippingReason reason)
+    {
+    }
+
+    /**
+     * Handles notification that the specified SSTable has been selected during a single partition query.
+     *
+     * @param sstable the SSTable reader
+     * @param indexEntry the index entry
+     * @param reason the reason for which the SSTable has been selected
+     */
+    default void onSSTableSelected(SSTableReader sstable, RowIndexEntry<?> indexEntry, SelectionReason reason)
+    {
+    }
+
+    /**
+     * Handles notification that the specified SSTable is being scanned during a partition range query.
+     *
+     * @param sstable the SSTable reader of the SSTable being scanned.
+     */
+    default void onScanningStarted(SSTableReader sstable)
+    {
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index f99292e..fcc23a2 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java

@@ -18,18 +18,24 @@
 
 package org.apache.cassandra.io.sstable.format;
 
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Sets;
+
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -37,13 +43,6 @@
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
-import java.io.DataInput;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
 /**
  * This is the API all table writers must implement.
  *
@@ -57,6 +56,7 @@
     protected final long keyCount;
     protected final MetadataCollector metadataCollector;
     protected final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
+    protected final SerializationHeader header;
     protected final TransactionalProxy txnProxy = txnProxy();
 
     protected abstract TransactionalProxy txnProxy();
@@ -69,30 +69,37 @@
         protected boolean openResult;
     }
 
-    protected SSTableWriter(Descriptor descriptor, long keyCount, long repairedAt, CFMetaData metadata, IPartitioner partitioner, MetadataCollector metadataCollector)
+    protected SSTableWriter(Descriptor descriptor, 
+                            long keyCount, 
+                            long repairedAt, 
+                            CFMetaData metadata, 
+                            MetadataCollector metadataCollector, 
+                            SerializationHeader header)
     {
-        super(descriptor, components(metadata), metadata, partitioner);
+        super(descriptor, components(metadata), metadata);
         this.keyCount = keyCount;
         this.repairedAt = repairedAt;
         this.metadataCollector = metadataCollector;
-        this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata);
+        this.header = header != null ? header : SerializationHeader.makeWithoutStats(metadata); //null header indicates streaming from pre-3.0 sstable
+        this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata, descriptor.version, header);
     }
 
-    public static SSTableWriter create(Descriptor descriptor, Long keyCount, Long repairedAt, CFMetaData metadata,  IPartitioner partitioner, MetadataCollector metadataCollector)
+    public static SSTableWriter create(Descriptor descriptor,
+                                       Long keyCount,
+                                       Long repairedAt,
+                                       CFMetaData metadata,
+                                       MetadataCollector metadataCollector,
+                                       SerializationHeader header,
+                                       LifecycleNewTracker lifecycleNewTracker)
     {
         Factory writerFactory = descriptor.getFormat().getWriterFactory();
-        return writerFactory.open(descriptor, keyCount, repairedAt, metadata, partitioner, metadataCollector);
+        return writerFactory.open(descriptor, keyCount, repairedAt, metadata, metadataCollector, header, lifecycleNewTracker);
     }
 
-    public static SSTableWriter create(Descriptor descriptor, long keyCount, long repairedAt)
-    {
-        return create(descriptor, keyCount, repairedAt, 0);
-    }
-
-    public static SSTableWriter create(Descriptor descriptor, long keyCount, long repairedAt, int sstableLevel)
+    public static SSTableWriter create(Descriptor descriptor, long keyCount, long repairedAt, int sstableLevel, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
     {
         CFMetaData metadata = Schema.instance.getCFMetaData(descriptor);
-        return create(metadata, descriptor, keyCount, repairedAt, sstableLevel, DatabaseDescriptor.getPartitioner());
+        return create(metadata, descriptor, keyCount, repairedAt, sstableLevel, header, lifecycleNewTracker);
     }
 
     public static SSTableWriter create(CFMetaData metadata,
@@ -100,20 +107,22 @@
                                        long keyCount,
                                        long repairedAt,
                                        int sstableLevel,
-                                       IPartitioner partitioner)
+                                       SerializationHeader header,
+                                       LifecycleNewTracker lifecycleNewTracker)
     {
         MetadataCollector collector = new MetadataCollector(metadata.comparator).sstableLevel(sstableLevel);
-        return create(descriptor, keyCount, repairedAt, metadata, partitioner, collector);
+        return create(descriptor, keyCount, repairedAt, metadata, collector, header, lifecycleNewTracker);
     }
 
-    public static SSTableWriter create(String filename, long keyCount, long repairedAt, int sstableLevel)
+    public static SSTableWriter create(String filename, long keyCount, long repairedAt, int sstableLevel, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
     {
-        return create(Descriptor.fromFilename(filename), keyCount, repairedAt, sstableLevel);
+        return create(Descriptor.fromFilename(filename), keyCount, repairedAt, sstableLevel, header, lifecycleNewTracker);
     }
 
-    public static SSTableWriter create(String filename, long keyCount, long repairedAt)
+    @VisibleForTesting
+    public static SSTableWriter create(String filename, long keyCount, long repairedAt, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
     {
-        return create(Descriptor.fromFilename(filename), keyCount, repairedAt, 0);
+        return create(Descriptor.fromFilename(filename), keyCount, repairedAt, 0, header, lifecycleNewTracker);
     }
 
     private static Set<Component> components(CFMetaData metadata)
@@ -123,12 +132,12 @@
                 Component.STATS,
                 Component.SUMMARY,
                 Component.TOC,
-                Component.DIGEST));
+                Component.digestFor(BigFormat.latestVersion.uncompressedChecksumType())));
 
-        if (metadata.getBloomFilterFpChance() < 1.0)
+        if (metadata.params.bloomFilterFpChance < 1.0)
             components.add(Component.FILTER);
 
-        if (metadata.compressionParameters().sstableCompressor != null)
+        if (metadata.params.compression.isEnabled())
         {
             components.add(Component.COMPRESSION_INFO);
         }
@@ -141,19 +150,18 @@
         return components;
     }
 
-
     public abstract void mark();
 
-
     /**
-     * @param row
-     * @return null if the row was compacted away entirely; otherwise, the PK index entry for this row
+     * Appends partition data to this writer.
+     *
+     * @param iterator the partition to write
+     * @return the created index entry if something was written, that is if {@code iterator}
+     * wasn't empty, {@code null} otherwise.
+     *
+     * @throws FSWriteError if a write to the dataFile fails
      */
-    public abstract RowIndexEntry append(AbstractCompactedRow row);
-
-    public abstract void append(DecoratedKey decoratedKey, ColumnFamily cf);
-
-    public abstract long appendFromStream(DecoratedKey key, CFMetaData metadata, DataInput in, Version version) throws IOException;
+    public abstract RowIndexEntry append(UnfilteredRowIterator iterator);
 
     public abstract long getFilePointer();
 
@@ -201,7 +209,7 @@
 
     public SSTableReader finish(boolean openResult)
     {
-        txnProxy.openResult = openResult;
+        setOpenResult(openResult);
         txnProxy.finish();
         return finished();
     }
@@ -243,8 +251,10 @@
 
     protected Map<MetadataType, MetadataComponent> finalizeMetadata()
     {
-        return metadataCollector.finalizeMetadata(partitioner.getClass().getCanonicalName(),
-                                                  metadata.getBloomFilterFpChance(), repairedAt);
+        return metadataCollector.finalizeMetadata(getPartitioner().getClass().getCanonicalName(),
+                                                  metadata.params.bloomFilterFpChance,
+                                                  repairedAt,
+                                                  header);
     }
 
     protected StatsMetadata statsMetadata()
@@ -252,13 +262,6 @@
         return (StatsMetadata) finalizeMetadata().get(MetadataType.STATS);
     }
 
-    public static Descriptor rename(Descriptor tmpdesc, Set<Component> components)
-    {
-        Descriptor newdesc = tmpdesc.asType(Descriptor.Type.FINAL);
-        rename(tmpdesc, newdesc, components);
-        return newdesc;
-    }
-
     public static void rename(Descriptor tmpdesc, Descriptor newdesc, Set<Component> components)
     {
         for (Component component : Sets.difference(components, Sets.newHashSet(Component.DATA, Component.SUMMARY)))
@@ -276,6 +279,12 @@
 
     public static abstract class Factory
     {
-        public abstract SSTableWriter open(Descriptor descriptor, long keyCount, long repairedAt, CFMetaData metadata, IPartitioner partitioner, MetadataCollector metadataCollector);
+        public abstract SSTableWriter open(Descriptor descriptor,
+                                           long keyCount,
+                                           long repairedAt,
+                                           CFMetaData metadata,
+                                           MetadataCollector metadataCollector,
+                                           SerializationHeader header,
+                                           LifecycleNewTracker lifecycleNewTracker);
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/format/Version.java b/src/java/org/apache/cassandra/io/sstable/format/Version.java
index 41a83e1..2b9dcbd 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/Version.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/Version.java

@@ -17,6 +17,10 @@
  */
 package org.apache.cassandra.io.sstable.format;
 
+import java.util.regex.Pattern;
+
+import org.apache.cassandra.utils.ChecksumType;
+
 /**
  * A set of feature flags associated with a SSTable format
  *
@@ -30,6 +34,8 @@
  */
 public abstract class Version
 {
+    private static final Pattern VALIDATION = Pattern.compile("[a-z]+");
+
     protected final String version;
     protected final SSTableFormat format;
     protected Version(SSTableFormat format, String version)
@@ -44,7 +50,9 @@
 
     public abstract boolean hasNewStatsFile();
 
-    public abstract boolean hasAllAdlerChecksums();
+    public abstract ChecksumType compressedChecksumType();
+
+    public abstract ChecksumType uncompressedChecksumType();
 
     public abstract boolean hasRepairedAt();
 
@@ -52,8 +60,22 @@
 
     public abstract boolean hasNewFileName();
 
+    public abstract boolean storeRows();
+
+    public abstract int correspondingMessagingVersion(); // Only use by storage that 'storeRows' so far
+
+    public abstract boolean hasOldBfHashOrder();
+
+    public abstract boolean hasCompactionAncestors();
+
+    public abstract boolean hasBoundaries();
+
     public abstract boolean hasCommitLogLowerBound();
 
+    public abstract boolean hasCommitLogIntervals();
+
+    public abstract boolean hasAccurateMinMax();
+
     public String getVersion()
     {
         return version;
@@ -71,10 +93,11 @@
      */
     public static boolean validate(String ver)
     {
-        return ver != null && ver.matches("[a-z]+");
+        return ver != null && VALIDATION.matcher(ver).matches();
     }
 
     abstract public boolean isCompatible();
+    abstract public boolean isCompatibleForStreaming();
 
     @Override
     public String toString()

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index 9244bbb..360ef8a 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java

@@ -17,30 +17,22 @@
  */
 package org.apache.cassandra.io.sstable.format.big;
 
-import com.google.common.collect.ImmutableList;
+import java.util.Set;
+
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.AbstractCell;
-import org.apache.cassandra.db.ColumnSerializer;
-import org.apache.cassandra.db.OnDiskAtom;
 import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
-import org.apache.cassandra.db.compaction.CompactionController;
-import org.apache.cassandra.db.compaction.LazilyCompactedRow;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.IndexHelper;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
-import org.apache.cassandra.io.util.FileDataInput;
-
-import java.util.Iterator;
-import java.util.Set;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ChecksumType;
 
 /**
  * Legacy bigtable format
@@ -48,7 +40,7 @@
 public class BigFormat implements SSTableFormat
 {
     public static final BigFormat instance = new BigFormat();
-    public static final BigVersion latestVersion = new BigVersion(BigVersion.current_version);
+    public static final Version latestVersion = new BigVersion(BigVersion.current_version);
     private static final SSTableReader.Factory readerFactory = new ReaderFactory();
     private static final SSTableWriter.Factory writerFactory = new WriterFactory();
 
@@ -82,38 +74,32 @@
     }
 
     @Override
-    public Iterator<OnDiskAtom> getOnDiskIterator(FileDataInput in, ColumnSerializer.Flag flag, int expireBefore, CFMetaData cfm, Version version)
+    public RowIndexEntry.IndexSerializer getIndexSerializer(CFMetaData metadata, Version version, SerializationHeader header)
     {
-        return AbstractCell.onDiskIterator(in, flag, expireBefore, version, cfm.comparator);
-    }
-
-    @Override
-    public AbstractCompactedRow getCompactedRowWriter(CompactionController controller, ImmutableList<OnDiskAtomIterator> onDiskAtomIterators)
-    {
-        return new LazilyCompactedRow(controller, onDiskAtomIterators);
-    }
-
-    @Override
-    public RowIndexEntry.IndexSerializer getIndexSerializer(CFMetaData cfMetaData)
-    {
-        return new RowIndexEntry.Serializer(new IndexHelper.IndexInfo.Serializer(cfMetaData.comparator));
+        return new RowIndexEntry.Serializer(metadata, version, header);
     }
 
     static class WriterFactory extends SSTableWriter.Factory
     {
         @Override
-        public SSTableWriter open(Descriptor descriptor, long keyCount, long repairedAt, CFMetaData metadata, IPartitioner partitioner, MetadataCollector metadataCollector)
+        public SSTableWriter open(Descriptor descriptor,
+                                  long keyCount,
+                                  long repairedAt,
+                                  CFMetaData metadata,
+                                  MetadataCollector metadataCollector,
+                                  SerializationHeader header,
+                                  LifecycleNewTracker lifecycleNewTracker)
         {
-            return new BigTableWriter(descriptor, keyCount, repairedAt, metadata, partitioner, metadataCollector);
+            return new BigTableWriter(descriptor, keyCount, repairedAt, metadata, metadataCollector, header, lifecycleNewTracker);
         }
     }
 
     static class ReaderFactory extends SSTableReader.Factory
     {
         @Override
-        public SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner, Long maxDataAge, StatsMetadata sstableMetadata, SSTableReader.OpenReason openReason)
+        public SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata, Long maxDataAge, StatsMetadata sstableMetadata, SSTableReader.OpenReason openReason, SerializationHeader header)
         {
-            return new BigTableReader(descriptor, components, metadata, partitioner, maxDataAge, sstableMetadata, openReason);
+            return new BigTableReader(descriptor, components, metadata, maxDataAge, sstableMetadata, openReason, header);
         }
     }
 
@@ -125,7 +111,7 @@
     // we always incremented the major version.
     static class BigVersion extends Version
     {
-        public static final String current_version = "lb";
+        public static final String current_version = "md";
         public static final String earliest_supported_version = "jb";
 
         // jb (2.0.1): switch from crc32 to adler32 for compression checksums
@@ -136,28 +122,76 @@
         //             tracks presense of legacy (local and remote) counter shards
         // la (2.2.0): new file name format
         // lb (2.2.7): commit log lower bound included
+        // ma (3.0.0): swap bf hash order
+        //             store rows natively
+        // mb (3.0.7, 3.7): commit log lower bound included
+        // mc (3.0.8, 3.9): commit log intervals included
+        // md (3.0.18, 3.11.4): corrected sstable min/max clustering
+        //
+        // NOTE: when adding a new version, please add that to LegacySSTableTest, too.
 
         private final boolean isLatestVersion;
         private final boolean hasSamplingLevel;
         private final boolean newStatsFile;
-        private final boolean hasAllAdlerChecksums;
+        private final ChecksumType compressedChecksumType;
+        private final ChecksumType uncompressedChecksumType;
         private final boolean hasRepairedAt;
         private final boolean tracksLegacyCounterShards;
         private final boolean newFileName;
+        public final boolean storeRows;
+        public final int correspondingMessagingVersion; // Only use by storage that 'storeRows' so far
+        public final boolean hasBoundaries;
+        /**
+         * CASSANDRA-8413: 3.0 bloom filter representation changed (two longs just swapped)
+         * have no 'static' bits caused by using the same upper bits for both bloom filter and token distribution.
+         */
+        private final boolean hasOldBfHashOrder;
         private final boolean hasCommitLogLowerBound;
+        private final boolean hasCommitLogIntervals;
+        private final boolean hasAccurateMinMax;
 
-        public BigVersion(String version)
+        /**
+         * CASSANDRA-7066: compaction ancerstors are no longer used and have been removed.
+         */
+        private final boolean hasCompactionAncestors;
+
+        BigVersion(String version)
         {
-            super(instance,version);
+            super(instance, version);
 
             isLatestVersion = version.compareTo(current_version) == 0;
             hasSamplingLevel = version.compareTo("ka") >= 0;
             newStatsFile = version.compareTo("ka") >= 0;
-            hasAllAdlerChecksums = version.compareTo("ka") >= 0;
+
+            //For a while Adler32 was in use, now the CRC32 instrinsic is very good especially after Haswell
+            //PureJavaCRC32 was always faster than Adler32. See CASSANDRA-8684
+            ChecksumType checksumType = ChecksumType.CRC32;
+            if (version.compareTo("ka") >= 0 && version.compareTo("ma") < 0)
+                checksumType = ChecksumType.Adler32;
+            this.uncompressedChecksumType = checksumType;
+
+            checksumType = ChecksumType.CRC32;
+            if (version.compareTo("jb") >= 0 && version.compareTo("ma") < 0)
+                checksumType = ChecksumType.Adler32;
+            this.compressedChecksumType = checksumType;
+
             hasRepairedAt = version.compareTo("ka") >= 0;
             tracksLegacyCounterShards = version.compareTo("ka") >= 0;
+
             newFileName = version.compareTo("la") >= 0;
-            hasCommitLogLowerBound = version.compareTo("lb") >= 0;
+
+            hasOldBfHashOrder = version.compareTo("ma") < 0;
+            hasCompactionAncestors = version.compareTo("ma") < 0;
+            storeRows = version.compareTo("ma") >= 0;
+            correspondingMessagingVersion = storeRows
+                                          ? MessagingService.VERSION_30
+                                          : MessagingService.VERSION_21;
+
+            hasBoundaries = version.compareTo("ma") < 0;
+            hasCommitLogLowerBound = (version.compareTo("lb") >= 0 && version.compareTo("ma") < 0)
+                                     || version.compareTo("mb") >= 0;
+            hasCommitLogIntervals = version.compareTo("mc") >= 0;
+            hasAccurateMinMax = version.compareTo("md") >= 0;
         }
 
         @Override
@@ -179,9 +213,15 @@
         }
 
         @Override
-        public boolean hasAllAdlerChecksums()
+        public ChecksumType compressedChecksumType()
         {
-            return hasAllAdlerChecksums;
+            return compressedChecksumType;
+        }
+
+        @Override
+        public ChecksumType uncompressedChecksumType()
+        {
+            return uncompressedChecksumType;
         }
 
         @Override
@@ -197,20 +237,69 @@
         }
 
         @Override
+        public boolean hasOldBfHashOrder()
+        {
+            return hasOldBfHashOrder;
+        }
+
+        @Override
+        public boolean hasCompactionAncestors()
+        {
+            return hasCompactionAncestors;
+        }
+
+        @Override
         public boolean hasNewFileName()
         {
             return newFileName;
         }
 
+        @Override
         public boolean hasCommitLogLowerBound()
         {
             return hasCommitLogLowerBound;
         }
 
         @Override
+        public boolean hasCommitLogIntervals()
+        {
+            return hasCommitLogIntervals;
+        }
+
+        @Override
+        public boolean hasAccurateMinMax()
+        {
+            return hasAccurateMinMax;
+        }
+
+        @Override
+        public boolean storeRows()
+        {
+            return storeRows;
+        }
+
+        @Override
+        public int correspondingMessagingVersion()
+        {
+            return correspondingMessagingVersion;
+        }
+
+        @Override
+        public boolean hasBoundaries()
+        {
+            return hasBoundaries;
+        }
+
+        @Override
         public boolean isCompatible()
         {
             return version.compareTo(earliest_supported_version) >= 0 && version.charAt(0) <= current_version.charAt(0);
         }
+
+        @Override
+        public boolean isCompatibleForStreaming()
+        {
+            return isCompatible() && version.charAt(0) == current_version.charAt(0);
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
index 5c51fbb..eeea18f 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java

@@ -20,14 +20,12 @@
 import com.google.common.util.concurrent.RateLimiter;
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.DataRange;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.SliceableUnfilteredRowIterator;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.columniterator.SSTableIterator;
+import org.apache.cassandra.db.columniterator.SSTableReversedIterator;
+import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.Component;
@@ -35,9 +33,11 @@
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SkippingReason;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SelectionReason;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.slf4j.Logger;
@@ -55,40 +55,59 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(BigTableReader.class);
 
-    BigTableReader(Descriptor desc, Set<Component> components, CFMetaData metadata, IPartitioner partitioner, Long maxDataAge, StatsMetadata sstableMetadata, OpenReason openReason)
+    BigTableReader(Descriptor desc, Set<Component> components, CFMetaData metadata, Long maxDataAge, StatsMetadata sstableMetadata, OpenReason openReason, SerializationHeader header)
     {
-        super(desc, components, metadata, partitioner, maxDataAge, sstableMetadata, openReason);
+        super(desc, components, metadata, maxDataAge, sstableMetadata, openReason, header);
     }
 
-    public OnDiskAtomIterator iterator(DecoratedKey key, SortedSet<CellName> columns)
+    public SliceableUnfilteredRowIterator iterator(DecoratedKey key,
+                                                   ColumnFilter selectedColumns,
+                                                   boolean reversed,
+                                                   boolean isForThrift,
+                                                   SSTableReadsListener listener)
     {
-        return new SSTableNamesIterator(this, key, columns);
+        return reversed
+             ? new SSTableReversedIterator(this, key, selectedColumns, isForThrift, listener)
+             : new SSTableIterator(this, key, selectedColumns, isForThrift, listener);
     }
 
-    public OnDiskAtomIterator iterator(FileDataInput input, DecoratedKey key, SortedSet<CellName> columns, RowIndexEntry indexEntry )
+    public SliceableUnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, ColumnFilter selectedColumns, boolean reversed, boolean isForThrift)
     {
-        return new SSTableNamesIterator(this, input, key, columns, indexEntry);
+        return reversed
+             ? new SSTableReversedIterator(this, file, key, indexEntry, selectedColumns, isForThrift)
+             : new SSTableIterator(this, file, key, indexEntry, selectedColumns, isForThrift);
     }
 
-    public OnDiskAtomIterator iterator(DecoratedKey key, ColumnSlice[] slices, boolean reverse)
+    @Override
+    public ISSTableScanner getScanner(ColumnFilter columns,
+                                      DataRange dataRange,
+                                      RateLimiter limiter,
+                                      boolean isForThrift,
+                                      SSTableReadsListener listener)
     {
-        return new SSTableSliceIterator(this, key, slices, reverse);
+        return BigTableScanner.getScanner(this, columns, dataRange, limiter, isForThrift, listener);
     }
 
-    public OnDiskAtomIterator iterator(FileDataInput input, DecoratedKey key, ColumnSlice[] slices, boolean reverse, RowIndexEntry indexEntry)
-    {
-        return new SSTableSliceIterator(this, input, key, slices, reverse, indexEntry);
-    }
     /**
+     * Direct I/O SSTableScanner over an iterator of bounds.
      *
-     * @param dataRange filter to use when reading the columns
+     * @param boundsIterator the keys to cover
      * @return A Scanner for seeking over the rows of the SSTable.
      */
-    public ISSTableScanner getScanner(DataRange dataRange, RateLimiter limiter)
+    public ISSTableScanner getScanner(Iterator<AbstractBounds<PartitionPosition>> boundsIterator)
     {
-        return BigTableScanner.getScanner(this, dataRange, limiter);
+        return BigTableScanner.getScanner(this, boundsIterator);
     }
 
+    /**
+     * Direct I/O SSTableScanner over the full sstable.
+     *
+     * @return A Scanner for reading the full SSTable.
+     */
+    public ISSTableScanner getScanner(RateLimiter limiter)
+    {
+        return BigTableScanner.getScanner(this, limiter);
+    }
 
     /**
      * Direct I/O SSTableScanner over a defined collection of ranges of tokens.
@@ -98,7 +117,10 @@
      */
     public ISSTableScanner getScanner(Collection<Range<Token>> ranges, RateLimiter limiter)
     {
-        return BigTableScanner.getScanner(this, ranges, limiter);
+        if (ranges != null)
+            return BigTableScanner.getScanner(this, ranges, limiter);
+        else
+            return getScanner(limiter);
     }
 
 
@@ -107,15 +129,21 @@
      * allow key selection by token bounds but only if op != * EQ
      * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins.
      * @param updateCacheAndStats true if updating stats and cache
+     * @param listener a listener used to handle internal events
      * @return The index entry corresponding to the key, or null if the key is not present
      */
-    protected RowIndexEntry getPosition(RowPosition key, Operator op, boolean updateCacheAndStats, boolean permitMatchPastLast)
+    protected RowIndexEntry getPosition(PartitionPosition key,
+                                        Operator op,
+                                        boolean updateCacheAndStats,
+                                        boolean permitMatchPastLast,
+                                        SSTableReadsListener listener)
     {
         if (op == Operator.EQ)
         {
             assert key instanceof DecoratedKey; // EQ only make sense if the key is a valid row key
             if (!bf.isPresent((DecoratedKey)key))
             {
+                listener.onSSTableSkipped(this, SkippingReason.BLOOM_FILTER);
                 Tracing.trace("Bloom filter allows skipping sstable {}", descriptor.generation);
                 return null;
             }
@@ -129,6 +157,7 @@
             RowIndexEntry cachedPosition = getCachedPosition(cacheKey, updateCacheAndStats);
             if (cachedPosition != null)
             {
+                listener.onSSTableSelected(this, cachedPosition, SelectionReason.KEY_CACHE_HIT);
                 Tracing.trace("Key cache hit for sstable {}", descriptor.generation);
                 return cachedPosition;
             }
@@ -157,6 +186,7 @@
         {
             if (op == Operator.EQ && updateCacheAndStats)
                 bloomFilterTracker.addFalsePositive();
+            listener.onSSTableSkipped(this, SkippingReason.MIN_MAX_KEYS);
             Tracing.trace("Check against min and max keys allows skipping sstable {}", descriptor.generation);
             return null;
         }
@@ -177,82 +207,81 @@
         // is lesser than the first key of next interval (and in that case we must return the position of the first key
         // of the next interval).
         int i = 0;
-        Iterator<FileDataInput> segments = ifile.iterator(sampledPosition);
-        while (segments.hasNext())
+        String path = null;
+        try (FileDataInput in = ifile.createReader(sampledPosition))
         {
-            String path = null;
-            try (FileDataInput in = segments.next())
+            path = in.getPath();
+            while (!in.isEOF())
             {
-                path = in.getPath();
-                while (!in.isEOF())
+                i++;
+
+                ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
+
+                boolean opSatisfied; // did we find an appropriate position for the op requested
+                boolean exactMatch; // is the current position an exact match for the key, suitable for caching
+
+                // Compare raw keys if possible for performance, otherwise compare decorated keys.
+                if (op == Operator.EQ && i <= effectiveInterval)
                 {
-                    i++;
-
-                    ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
-
-                    boolean opSatisfied; // did we find an appropriate position for the op requested
-                    boolean exactMatch; // is the current position an exact match for the key, suitable for caching
-
-                    // Compare raw keys if possible for performance, otherwise compare decorated keys.
-                    if (op == Operator.EQ && i <= effectiveInterval)
-                    {
-                        opSatisfied = exactMatch = indexKey.equals(((DecoratedKey) key).getKey());
-                    }
-                    else
-                    {
-                        DecoratedKey indexDecoratedKey = partitioner.decorateKey(indexKey);
-                        int comparison = indexDecoratedKey.compareTo(key);
-                        int v = op.apply(comparison);
-                        opSatisfied = (v == 0);
-                        exactMatch = (comparison == 0);
-                        if (v < 0)
-                        {
-                            Tracing.trace("Partition index lookup allows skipping sstable {}", descriptor.generation);
-                            return null;
-                        }
-                    }
-
-                    if (opSatisfied)
-                    {
-                        // read data position from index entry
-                        RowIndexEntry indexEntry = rowIndexEntrySerializer.deserialize(in, descriptor.version);
-                        if (exactMatch && updateCacheAndStats)
-                        {
-                            assert key instanceof DecoratedKey; // key can be == to the index key only if it's a true row key
-                            DecoratedKey decoratedKey = (DecoratedKey)key;
-
-                            if (logger.isTraceEnabled())
-                            {
-                                // expensive sanity check!  see CASSANDRA-4687
-                                try (FileDataInput fdi = dfile.getSegment(indexEntry.position))
-                                {
-                                    DecoratedKey keyInDisk = partitioner.decorateKey(ByteBufferUtil.readWithShortLength(fdi));
-                                    if (!keyInDisk.equals(key))
-                                        throw new AssertionError(String.format("%s != %s in %s", keyInDisk, key, fdi.getPath()));
-                                }
-                            }
-
-                            // store exact match for the key
-                            cacheKey(decoratedKey, indexEntry);
-                        }
-                        if (op == Operator.EQ && updateCacheAndStats)
-                            bloomFilterTracker.addTruePositive();
-                        Tracing.trace("Partition index with {} entries found for sstable {}", indexEntry.columnsIndex().size(), descriptor.generation);
-                        return indexEntry;
-                    }
-
-                    RowIndexEntry.Serializer.skip(in);
+                    opSatisfied = exactMatch = indexKey.equals(((DecoratedKey) key).getKey());
                 }
+                else
+                {
+                    DecoratedKey indexDecoratedKey = decorateKey(indexKey);
+                    int comparison = indexDecoratedKey.compareTo(key);
+                    int v = op.apply(comparison);
+                    opSatisfied = (v == 0);
+                    exactMatch = (comparison == 0);
+                    if (v < 0)
+                    {
+                        listener.onSSTableSkipped(this, SkippingReason.PARTITION_INDEX_LOOKUP);
+                        Tracing.trace("Partition index lookup allows skipping sstable {}", descriptor.generation);
+                        return null;
+                    }
+                }
+
+                if (opSatisfied)
+                {
+                    // read data position from index entry
+                    RowIndexEntry indexEntry = rowIndexEntrySerializer.deserialize(in);
+                    if (exactMatch && updateCacheAndStats)
+                    {
+                        assert key instanceof DecoratedKey; // key can be == to the index key only if it's a true row key
+                        DecoratedKey decoratedKey = (DecoratedKey)key;
+
+                        if (logger.isTraceEnabled())
+                        {
+                            // expensive sanity check!  see CASSANDRA-4687
+                            try (FileDataInput fdi = dfile.createReader(indexEntry.position))
+                            {
+                                DecoratedKey keyInDisk = decorateKey(ByteBufferUtil.readWithShortLength(fdi));
+                                if (!keyInDisk.equals(key))
+                                    throw new AssertionError(String.format("%s != %s in %s", keyInDisk, key, fdi.getPath()));
+                            }
+                        }
+
+                        // store exact match for the key
+                        cacheKey(decoratedKey, indexEntry);
+                    }
+                    if (op == Operator.EQ && updateCacheAndStats)
+                        bloomFilterTracker.addTruePositive();
+                    listener.onSSTableSelected(this, indexEntry, SelectionReason.INDEX_ENTRY_FOUND);
+                    Tracing.trace("Partition index with {} entries found for sstable {}", indexEntry.columnsIndex().size(), descriptor.generation);
+                    return indexEntry;
+                }
+
+                RowIndexEntry.Serializer.skip(in, descriptor.version);
             }
-            catch (IOException e)
-            {
-                markSuspect();
-                throw new CorruptSSTableException(e, path);
-            }
+        }
+        catch (IOException e)
+        {
+            markSuspect();
+            throw new CorruptSSTableException(e, path);
         }
 
         if (op == SSTableReader.Operator.EQ && updateCacheAndStats)
             bloomFilterTracker.addFalsePositive();
+        listener.onSSTableSkipped(this, SkippingReason.INDEX_ENTRY_NOT_FOUND);
         Tracing.trace("Partition index lookup complete (bloom filter false positive) for sstable {}", descriptor.generation);
         return null;
     }

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
index d477152..82d8211 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java

@@ -21,17 +21,15 @@
 import java.util.*;
 import java.util.concurrent.atomic.AtomicBoolean;
 
-import com.google.common.collect.AbstractIterator;
-import com.google.common.collect.Ordering;
+import org.apache.cassandra.utils.AbstractIterator;
+import com.google.common.collect.Iterators;
 import com.google.common.util.concurrent.RateLimiter;
 
-import org.apache.cassandra.db.DataRange;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.db.columniterator.IColumnIteratorFactory;
-import org.apache.cassandra.db.columniterator.LazyColumnIterator;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.AbstractBounds.Boundary;
 import org.apache.cassandra.dht.Bounds;
@@ -41,6 +39,7 @@
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -52,84 +51,118 @@
 
 public class BigTableScanner implements ISSTableScanner
 {
-    private AtomicBoolean isClosed = new AtomicBoolean(false);
+    private final AtomicBoolean isClosed = new AtomicBoolean(false);
     protected final RandomAccessReader dfile;
     protected final RandomAccessReader ifile;
     public final SSTableReader sstable;
 
-    private final Iterator<AbstractBounds<RowPosition>> rangeIterator;
-    private AbstractBounds<RowPosition> currentRange;
+    private final Iterator<AbstractBounds<PartitionPosition>> rangeIterator;
+    private AbstractBounds<PartitionPosition> currentRange;
 
+    private final ColumnFilter columns;
     private final DataRange dataRange;
     private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
+    private final boolean isForThrift;
+    private final SSTableReadsListener listener;
 
-    protected Iterator<OnDiskAtomIterator> iterator;
+    protected Iterator<UnfilteredRowIterator> iterator;
 
-    public static ISSTableScanner getScanner(SSTableReader sstable, DataRange dataRange, RateLimiter limiter)
+    // Full scan of the sstables
+    public static ISSTableScanner getScanner(SSTableReader sstable, RateLimiter limiter)
     {
-        return new BigTableScanner(sstable, dataRange, limiter);
+        return new BigTableScanner(sstable, limiter, Iterators.singletonIterator(fullRange(sstable)));
     }
+
+    public static ISSTableScanner getScanner(SSTableReader sstable,
+                                             ColumnFilter columns,
+                                             DataRange dataRange,
+                                             RateLimiter limiter,
+                                             boolean isForThrift,
+                                             SSTableReadsListener listener)
+    {
+        return new BigTableScanner(sstable,
+                                   columns,
+                                   dataRange,
+                                   limiter,
+                                   isForThrift,
+                                   makeBounds(sstable, dataRange).iterator(),
+                                   listener);
+    }
+
     public static ISSTableScanner getScanner(SSTableReader sstable, Collection<Range<Token>> tokenRanges, RateLimiter limiter)
     {
         // We want to avoid allocating a SSTableScanner if the range don't overlap the sstable (#5249)
         List<Pair<Long, Long>> positions = sstable.getPositionsForRanges(tokenRanges);
         if (positions.isEmpty())
-            return new EmptySSTableScanner(sstable.getFilename());
+            return new EmptySSTableScanner(sstable);
 
-        return new BigTableScanner(sstable, tokenRanges, limiter);
+        return new BigTableScanner(sstable, limiter, makeBounds(sstable, tokenRanges).iterator());
     }
 
-    /**
-     * @param sstable SSTable to scan; must not be null
-     * @param dataRange a single range to scan; must not be null
-     * @param limiter background i/o RateLimiter; may be null
-     */
-    private BigTableScanner(SSTableReader sstable, DataRange dataRange, RateLimiter limiter)
+    public static ISSTableScanner getScanner(SSTableReader sstable, Iterator<AbstractBounds<PartitionPosition>> rangeIterator)
+    {
+        return new BigTableScanner(sstable, null, rangeIterator);
+    }
+
+    private BigTableScanner(SSTableReader sstable,
+                            RateLimiter limiter,
+                            Iterator<AbstractBounds<PartitionPosition>> rangeIterator)
+    {
+        this(sstable, ColumnFilter.all(sstable.metadata), null, limiter, false, rangeIterator, SSTableReadsListener.NOOP_LISTENER);
+    }
+
+    private BigTableScanner(SSTableReader sstable,
+                            ColumnFilter columns,
+                            DataRange dataRange,
+                            RateLimiter limiter,
+                            boolean isForThrift,
+                            Iterator<AbstractBounds<PartitionPosition>> rangeIterator,
+                            SSTableReadsListener listener)
     {
         assert sstable != null;
 
         this.dfile = limiter == null ? sstable.openDataReader() : sstable.openDataReader(limiter);
         this.ifile = sstable.openIndexReader();
         this.sstable = sstable;
+        this.columns = columns;
         this.dataRange = dataRange;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata);
-
-        List<AbstractBounds<RowPosition>> boundsList = new ArrayList<>(2);
-        addRange(dataRange.keyRange(), boundsList);
-        this.rangeIterator = boundsList.iterator();
+        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata,
+                                                                                                        sstable.descriptor.version,
+                                                                                                        sstable.header);
+        this.isForThrift = isForThrift;
+        this.rangeIterator = rangeIterator;
+        this.listener = listener;
     }
 
-    /**
-     * @param sstable SSTable to scan; must not be null
-     * @param tokenRanges A set of token ranges to scan
-     * @param limiter background i/o RateLimiter; may be null
-     */
-    private BigTableScanner(SSTableReader sstable, Collection<Range<Token>> tokenRanges, RateLimiter limiter)
+    private static List<AbstractBounds<PartitionPosition>> makeBounds(SSTableReader sstable, Collection<Range<Token>> tokenRanges)
     {
-        assert sstable != null;
-
-        this.dfile = limiter == null ? sstable.openDataReader() : sstable.openDataReader(limiter);
-        this.ifile = sstable.openIndexReader();
-        this.sstable = sstable;
-        this.dataRange = null;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata);
-
-        List<AbstractBounds<RowPosition>> boundsList = new ArrayList<>(tokenRanges.size());
+        List<AbstractBounds<PartitionPosition>> boundsList = new ArrayList<>(tokenRanges.size());
         for (Range<Token> range : Range.normalize(tokenRanges))
-            addRange(Range.makeRowRange(range), boundsList);
-
-        this.rangeIterator = boundsList.iterator();
+            addRange(sstable, Range.makeRowRange(range), boundsList);
+        return boundsList;
     }
 
-    private void addRange(AbstractBounds<RowPosition> requested, List<AbstractBounds<RowPosition>> boundsList)
+    private static List<AbstractBounds<PartitionPosition>> makeBounds(SSTableReader sstable, DataRange dataRange)
+    {
+        List<AbstractBounds<PartitionPosition>> boundsList = new ArrayList<>(2);
+        addRange(sstable, dataRange.keyRange(), boundsList);
+        return boundsList;
+    }
+
+    private static AbstractBounds<PartitionPosition> fullRange(SSTableReader sstable)
+    {
+        return new Bounds<PartitionPosition>(sstable.first, sstable.last);
+    }
+
+    private static void addRange(SSTableReader sstable, AbstractBounds<PartitionPosition> requested, List<AbstractBounds<PartitionPosition>> boundsList)
     {
         if (requested instanceof Range && ((Range)requested).isWrapAround())
         {
             if (requested.right.compareTo(sstable.first) >= 0)
             {
                 // since we wrap, we must contain the whole sstable prior to stopKey()
-                Boundary<RowPosition> left = new Boundary<RowPosition>(sstable.first, true);
-                Boundary<RowPosition> right;
+                Boundary<PartitionPosition> left = new Boundary<PartitionPosition>(sstable.first, true);
+                Boundary<PartitionPosition> right;
                 right = requested.rightBoundary();
                 right = minRight(right, sstable.last, true);
                 if (!isEmpty(left, right))
@@ -138,8 +171,8 @@
             if (requested.left.compareTo(sstable.last) <= 0)
             {
                 // since we wrap, we must contain the whole sstable after dataRange.startKey()
-                Boundary<RowPosition> right = new Boundary<RowPosition>(sstable.last, true);
-                Boundary<RowPosition> left;
+                Boundary<PartitionPosition> right = new Boundary<PartitionPosition>(sstable.last, true);
+                Boundary<PartitionPosition> left;
                 left = requested.leftBoundary();
                 left = maxLeft(left, sstable.first, true);
                 if (!isEmpty(left, right))
@@ -149,12 +182,12 @@
         else
         {
             assert requested.left.compareTo(requested.right) <= 0 || requested.right.isMinimum();
-            Boundary<RowPosition> left, right;
+            Boundary<PartitionPosition> left, right;
             left = requested.leftBoundary();
             right = requested.rightBoundary();
             left = maxLeft(left, sstable.first, true);
             // apparently isWrapAround() doesn't count Bounds that extend to the limit (min) as wrapping
-            right = requested.right.isMinimum() ? new Boundary<RowPosition>(sstable.last, true)
+            right = requested.right.isMinimum() ? new Boundary<PartitionPosition>(sstable.last, true)
                                                     : minRight(right, sstable.last, true);
             if (!isEmpty(left, right))
                 boundsList.add(AbstractBounds.bounds(left, right));
@@ -171,18 +204,18 @@
             while (!ifile.isEOF())
             {
                 indexPosition = ifile.getFilePointer();
-                DecoratedKey indexDecoratedKey = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
+                DecoratedKey indexDecoratedKey = sstable.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
                 if (indexDecoratedKey.compareTo(currentRange.left) > 0 || currentRange.contains(indexDecoratedKey))
                 {
                     // Found, just read the dataPosition and seek into index and data files
-                    long dataPosition = ifile.readLong();
+                    long dataPosition = RowIndexEntry.Serializer.readPosition(ifile, sstable.descriptor.version);
                     ifile.seek(indexPosition);
                     dfile.seek(dataPosition);
                     break;
                 }
                 else
                 {
-                    RowIndexEntry.Serializer.skip(ifile);
+                    RowIndexEntry.Serializer.skip(ifile, sstable.descriptor.version);
                 }
             }
         }
@@ -193,10 +226,18 @@
         }
     }
 
-    public void close() throws IOException
+    public void close()
     {
-        if (isClosed.compareAndSet(false, true))
-            FileUtils.close(dfile, ifile);
+        try
+        {
+            if (isClosed.compareAndSet(false, true))
+                FileUtils.close(dfile, ifile);
+        }
+        catch (IOException e)
+        {
+            sstable.markSuspect();
+            throw new CorruptSSTableException(e, sstable.getFilename());
+        }
     }
 
     public long getLengthInBytes()
@@ -214,6 +255,16 @@
         return sstable.toString();
     }
 
+    public boolean isForThrift()
+    {
+        return isForThrift;
+    }
+
+    public CFMetaData metadata()
+    {
+        return sstable.metadata;
+    }
+
     public boolean hasNext()
     {
         if (iterator == null)
@@ -221,7 +272,7 @@
         return iterator.hasNext();
     }
 
-    public OnDiskAtomIterator next()
+    public UnfilteredRowIterator next()
     {
         if (iterator == null)
             iterator = createIterator();
@@ -233,19 +284,20 @@
         throw new UnsupportedOperationException();
     }
 
-    private Iterator<OnDiskAtomIterator> createIterator()
+    private Iterator<UnfilteredRowIterator> createIterator()
     {
+        listener.onScanningStarted(sstable);
         return new KeyScanningIterator();
     }
 
-    protected class KeyScanningIterator extends AbstractIterator<OnDiskAtomIterator>
+    protected class KeyScanningIterator extends AbstractIterator<UnfilteredRowIterator>
     {
         private DecoratedKey nextKey;
         private RowIndexEntry nextEntry;
         private DecoratedKey currentKey;
         private RowIndexEntry currentEntry;
 
-        protected OnDiskAtomIterator computeNext()
+        protected UnfilteredRowIterator computeNext()
         {
             try
             {
@@ -263,8 +315,8 @@
                         if (ifile.isEOF())
                             return endOfData();
 
-                        currentKey = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
-                        currentEntry = rowIndexEntrySerializer.deserialize(ifile, sstable.descriptor.version);
+                        currentKey = sstable.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
+                        currentEntry = rowIndexEntrySerializer.deserialize(ifile);
                     } while (!currentRange.contains(currentKey));
                 }
                 else
@@ -282,8 +334,8 @@
                 else
                 {
                     // we need the position of the start of the next key, regardless of whether it falls in the current range
-                    nextKey = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
-                    nextEntry = rowIndexEntrySerializer.deserialize(ifile, sstable.descriptor.version);
+                    nextKey = sstable.decorateKey(ByteBufferUtil.readWithShortLength(ifile));
+                    nextEntry = rowIndexEntrySerializer.deserialize(ifile);
 
                     if (!currentRange.contains(nextKey))
                     {
@@ -292,21 +344,34 @@
                     }
                 }
 
-                if (dataRange == null || dataRange.selectsFullRowFor(currentKey.getKey()))
+                /*
+                 * For a given partition key, we want to avoid hitting the data
+                 * file unless we're explicitely asked to. This is important
+                 * for PartitionRangeReadCommand#checkCacheFilter.
+                 */
+                return new LazilyInitializedUnfilteredRowIterator(currentKey)
                 {
-                    dfile.seek(currentEntry.position + currentEntry.headerOffset());
-                    ByteBufferUtil.readWithShortLength(dfile); // key
-                    return new SSTableIdentityIterator(sstable, dfile, currentKey);
-                }
-
-                return new LazyColumnIterator(currentKey, new IColumnIteratorFactory()
-                {
-                    public OnDiskAtomIterator create()
+                    protected UnfilteredRowIterator initializeIterator()
                     {
-                        return dataRange.columnFilter(currentKey.getKey()).getSSTableColumnIterator(sstable, dfile, currentKey, currentEntry);
-                    }
-                });
+                        try
+                        {
+                            if (dataRange == null)
+                            {
+                                dfile.seek(currentEntry.position);
+                                ByteBufferUtil.skipShortLength(dfile); // key
+                                return new SSTableIdentityIterator(sstable, dfile, partitionKey());
+                            }
 
+                            ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(partitionKey());
+                            return filter.filter(sstable.iterator(dfile, partitionKey(), currentEntry, columns, filter.isReversed(), isForThrift));
+                        }
+                        catch (CorruptSSTableException | IOException e)
+                        {
+                            sstable.markSuspect();
+                            throw new CorruptSSTableException(e, sstable.getFilename());
+                        }
+                    }
+                };
             }
             catch (CorruptSSTableException | IOException e)
             {
@@ -326,13 +391,13 @@
                ")";
     }
 
-    public static class EmptySSTableScanner implements ISSTableScanner
+    public static class EmptySSTableScanner extends AbstractUnfilteredPartitionIterator implements ISSTableScanner
     {
-        private final String filename;
+        private final SSTableReader sstable;
 
-        public EmptySSTableScanner(String filename)
+        public EmptySSTableScanner(SSTableReader sstable)
         {
-            this.filename = filename;
+            this.sstable = sstable;
         }
 
         public long getLengthInBytes()
@@ -347,7 +412,17 @@
 
         public String getBackingFiles()
         {
-            return filename;
+            return sstable.getFilename();
+        }
+
+        public boolean isForThrift()
+        {
+            return false;
+        }
+
+        public CFMetaData metadata()
+        {
+            return sstable.metadata;
         }
 
         public boolean hasNext()
@@ -355,15 +430,9 @@
             return false;
         }
 
-        public OnDiskAtomIterator next()
+        public UnfilteredRowIterator next()
         {
             return null;
         }
-
-        public void close() throws IOException { }
-
-        public void remove() { }
     }
-
-
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
index 3a01f87..f733619 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java

@@ -18,24 +18,20 @@
 package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
-import java.util.Set;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.io.sstable.format.Version;
+
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
@@ -47,34 +43,36 @@
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.FilterFactory;
 import org.apache.cassandra.utils.IFilter;
-import org.apache.cassandra.utils.StreamingHistogram;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
-import static org.apache.cassandra.utils.Throwables.merge;
 import org.apache.cassandra.utils.SyncUtil;
 
 public class BigTableWriter extends SSTableWriter
 {
     private static final Logger logger = LoggerFactory.getLogger(BigTableWriter.class);
 
-    // not very random, but the only value that can't be mistaken for a legal column-name length
-    public static final int END_OF_ROW = 0x0000;
-
     private final IndexWriter iwriter;
-    private SegmentedFile.Builder dbuilder;
-    private final SequentialWriter dataFile;
+    private final SegmentedFile.Builder dbuilder;
+    protected final SequentialWriter dataFile;
     private DecoratedKey lastWrittenKey;
-    private FileMark dataMark;
+    private DataPosition dataMark;
 
-    BigTableWriter(Descriptor descriptor, Long keyCount, Long repairedAt, CFMetaData metadata, IPartitioner partitioner, MetadataCollector metadataCollector)
+    public BigTableWriter(Descriptor descriptor, 
+                          Long keyCount, 
+                          Long repairedAt, 
+                          CFMetaData metadata, 
+                          MetadataCollector metadataCollector, 
+                          SerializationHeader header,
+                          LifecycleNewTracker lifecycleNewTracker)
     {
-        super(descriptor, keyCount, repairedAt, metadata, partitioner, metadataCollector);
+        super(descriptor, keyCount, repairedAt, metadata, metadataCollector, header);
+        lifecycleNewTracker.trackNew(this); // must track before any files are created
 
         if (compression)
         {
             dataFile = SequentialWriter.open(getFilename(),
                                              descriptor.filenameFor(Component.COMPRESSION_INFO),
-                                             metadata.compressionParameters(),
+                                             metadata.params.compression,
                                              metadataCollector);
             dbuilder = SegmentedFile.getCompressedBuilder((CompressedSequentialWriter) dataFile);
         }
@@ -101,12 +99,12 @@
     /**
      * Perform sanity checks on @param decoratedKey and @return the position in the data file before any data is written
      */
-    private long beforeAppend(DecoratedKey decoratedKey)
+    protected long beforeAppend(DecoratedKey decoratedKey)
     {
         assert decoratedKey != null : "Keys must not be null"; // empty keys ARE allowed b/c of indexed column values
         if (lastWrittenKey != null && lastWrittenKey.compareTo(decoratedKey) >= 0)
             throw new RuntimeException("Last written key " + lastWrittenKey + " >= current key " + decoratedKey + " writing into " + getFilename());
-        return (lastWrittenKey == null) ? 0 : dataFile.getFilePointer();
+        return (lastWrittenKey == null) ? 0 : dataFile.position();
     }
 
     private void afterAppend(DecoratedKey decoratedKey, long dataEnd, RowIndexEntry index) throws IOException
@@ -120,27 +118,43 @@
         if (logger.isTraceEnabled())
             logger.trace("wrote {} at {}", decoratedKey, dataEnd);
         iwriter.append(decoratedKey, index, dataEnd);
-        dbuilder.addPotentialBoundary(dataEnd);
     }
 
     /**
-     * @param row
-     * @return null if the row was compacted away entirely; otherwise, the PK index entry for this row
+     * Appends partition data to this writer.
+     *
+     * @param iterator the partition to write
+     * @return the created index entry if something was written, that is if {@code iterator}
+     * wasn't empty, {@code null} otherwise.
+     *
+     * @throws FSWriteError if a write to the dataFile fails
      */
-    public RowIndexEntry append(AbstractCompactedRow row)
+    public RowIndexEntry append(UnfilteredRowIterator iterator)
     {
-        long startPosition = beforeAppend(row.key);
-        RowIndexEntry entry;
-        try
+        DecoratedKey key = iterator.partitionKey();
+
+        if (key.getKey().remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
         {
-            entry = row.write(startPosition, dataFile);
-            if (entry == null)
-                return null;
-            long endPosition = dataFile.getFilePointer();
+            logger.error("Key size {} exceeds maximum of {}, skipping row", key.getKey().remaining(), FBUtilities.MAX_UNSIGNED_SHORT);
+            return null;
+        }
+
+        if (iterator.isEmpty())
+            return null;
+
+        long startPosition = beforeAppend(key);
+
+        try (UnfilteredRowIterator collecting = Transformation.apply(iterator, new StatsCollector(metadataCollector)))
+        {
+            ColumnIndex index = ColumnIndex.writeAndBuildIndex(collecting, dataFile, header, descriptor.version);
+
+            RowIndexEntry entry = RowIndexEntry.create(startPosition, collecting.partitionLevelDeletion(), index);
+
+            long endPosition = dataFile.position();
             long rowSize = endPosition - startPosition;
-            maybeLogLargePartitionWarning(row.key, rowSize);
-            metadataCollector.update(rowSize, row.columnStats());
-            afterAppend(row.key, endPosition, entry);
+            maybeLogLargePartitionWarning(key, rowSize);
+            metadataCollector.addPartitionSizeInBytes(rowSize);
+            afterAppend(key, endPosition, entry);
             return entry;
         }
         catch (IOException e)
@@ -149,154 +163,70 @@
         }
     }
 
-    public void append(DecoratedKey decoratedKey, ColumnFamily cf)
-    {
-        if (decoratedKey.getKey().remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
-        {
-            logger.error("Key size {} exceeds maximum of {}, skipping row",
-                         decoratedKey.getKey().remaining(),
-                         FBUtilities.MAX_UNSIGNED_SHORT);
-            return;
-        }
-
-        long startPosition = beforeAppend(decoratedKey);
-        long endPosition;
-        try
-        {
-            RowIndexEntry entry = rawAppend(cf, startPosition, decoratedKey, dataFile.stream);
-            endPosition = dataFile.getFilePointer();
-            afterAppend(decoratedKey, endPosition, entry);
-        }
-        catch (IOException e)
-        {
-            throw new FSWriteError(e, dataFile.getPath());
-        }
-        long rowSize = endPosition - startPosition;
-        maybeLogLargePartitionWarning(decoratedKey, rowSize);
-        metadataCollector.update(endPosition - startPosition, cf.getColumnStats());
-    }
-
     private void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
     {
         if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
         {
             String keyString = metadata.getKeyValidator().getString(key.getKey());
-            logger.warn("Writing large partition {}/{}:{} ({} bytes)", metadata.ksName, metadata.cfName, keyString, rowSize);
+            logger.warn("Writing large partition {}/{}:{} ({} bytes to sstable {}) ", metadata.ksName, metadata.cfName, keyString, rowSize, getFilename());
         }
     }
 
-    private static RowIndexEntry rawAppend(ColumnFamily cf, long startPosition, DecoratedKey key, DataOutputPlus out) throws IOException
+    private static class StatsCollector extends Transformation
     {
-        assert cf.hasColumns() || cf.isMarkedForDelete();
+        private final MetadataCollector collector;
+        private int cellCount;
 
-        ColumnIndex.Builder builder = new ColumnIndex.Builder(cf, key.getKey(), out);
-        ColumnIndex index = builder.build(cf);
-
-        out.writeShort(END_OF_ROW);
-        return RowIndexEntry.create(startPosition, cf.deletionInfo().getTopLevelDeletion(), index);
-    }
-
-    /**
-     * @throws IOException if a read from the DataInput fails
-     * @throws FSWriteError if a write to the dataFile fails
-     */
-    public long appendFromStream(DecoratedKey key, CFMetaData metadata, DataInput in, Version version) throws IOException
-    {
-        long currentPosition = beforeAppend(key);
-
-        ColumnStats.MaxLongTracker maxTimestampTracker = new ColumnStats.MaxLongTracker(Long.MAX_VALUE);
-        ColumnStats.MinLongTracker minTimestampTracker = new ColumnStats.MinLongTracker(Long.MIN_VALUE);
-        ColumnStats.MaxIntTracker maxDeletionTimeTracker = new ColumnStats.MaxIntTracker(Integer.MAX_VALUE);
-        List<ByteBuffer> minColumnNames = Collections.emptyList();
-        List<ByteBuffer> maxColumnNames = Collections.emptyList();
-        StreamingHistogram tombstones = new StreamingHistogram(TOMBSTONE_HISTOGRAM_BIN_SIZE);
-        boolean hasLegacyCounterShards = false;
-
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(metadata);
-        cf.delete(DeletionTime.serializer.deserialize(in));
-
-        ColumnIndex.Builder columnIndexer = new ColumnIndex.Builder(cf, key.getKey(), dataFile.stream);
-
-        if (cf.deletionInfo().getTopLevelDeletion().localDeletionTime < Integer.MAX_VALUE)
+        StatsCollector(MetadataCollector collector)
         {
-            tombstones.update(cf.deletionInfo().getTopLevelDeletion().localDeletionTime);
-            maxDeletionTimeTracker.update(cf.deletionInfo().getTopLevelDeletion().localDeletionTime);
-            minTimestampTracker.update(cf.deletionInfo().getTopLevelDeletion().markedForDeleteAt);
-            maxTimestampTracker.update(cf.deletionInfo().getTopLevelDeletion().markedForDeleteAt);
+            this.collector = collector;
         }
 
-        Iterator<RangeTombstone> rangeTombstoneIterator = cf.deletionInfo().rangeIterator();
-        while (rangeTombstoneIterator.hasNext())
+        @Override
+        public Row applyToStatic(Row row)
         {
-            RangeTombstone rangeTombstone = rangeTombstoneIterator.next();
-            tombstones.update(rangeTombstone.getLocalDeletionTime());
-            minTimestampTracker.update(rangeTombstone.timestamp());
-            maxTimestampTracker.update(rangeTombstone.timestamp());
-            maxDeletionTimeTracker.update(rangeTombstone.getLocalDeletionTime());
-            minColumnNames = ColumnNameHelper.minComponents(minColumnNames, rangeTombstone.min, metadata.comparator);
-            maxColumnNames = ColumnNameHelper.maxComponents(maxColumnNames, rangeTombstone.max, metadata.comparator);
+            if (!row.isEmpty())
+                cellCount += Rows.collectStats(row, collector);
+            return row;
         }
 
-        Iterator<OnDiskAtom> iter = AbstractCell.onDiskIterator(in, ColumnSerializer.Flag.PRESERVE_SIZE, Integer.MIN_VALUE, version, metadata.comparator);
-        try
+        @Override
+        public Row applyToRow(Row row)
         {
-            while (iter.hasNext())
+            collector.updateClusteringValues(row.clustering());
+            cellCount += Rows.collectStats(row, collector);
+            return row;
+        }
+
+        @Override
+        public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
+        {
+            collector.updateClusteringValues(marker.clustering());
+            if (marker.isBoundary())
             {
-                OnDiskAtom atom = iter.next();
-                if (atom == null)
-                    break;
-
-                if (atom instanceof CounterCell)
-                {
-                    atom = ((CounterCell) atom).markLocalToBeCleared();
-                    hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) atom).hasLegacyShards();
-                }
-
-                int deletionTime = atom.getLocalDeletionTime();
-                if (deletionTime < Integer.MAX_VALUE)
-                    tombstones.update(deletionTime);
-                minTimestampTracker.update(atom.timestamp());
-                maxTimestampTracker.update(atom.timestamp());
-                minColumnNames = ColumnNameHelper.minComponents(minColumnNames, atom.name(), metadata.comparator);
-                maxColumnNames = ColumnNameHelper.maxComponents(maxColumnNames, atom.name(), metadata.comparator);
-                maxDeletionTimeTracker.update(atom.getLocalDeletionTime());
-
-                columnIndexer.add(atom); // This write the atom on disk too
+                RangeTombstoneBoundaryMarker bm = (RangeTombstoneBoundaryMarker)marker;
+                collector.update(bm.endDeletionTime());
+                collector.update(bm.startDeletionTime());
             }
-            columnIndexer.finishAddingAtoms();
-
-            columnIndexer.maybeWriteEmptyRowHeader();
-            dataFile.stream.writeShort(END_OF_ROW);
+            else
+            {
+                collector.update(((RangeTombstoneBoundMarker)marker).deletionTime());
+            }
+            return marker;
         }
-        catch (IOException e)
+
+        @Override
+        public void onPartitionClose()
         {
-            throw new FSWriteError(e, dataFile.getPath());
+            collector.addCellPerPartitionCount(cellCount);
         }
 
-        metadataCollector.updateMinTimestamp(minTimestampTracker.get())
-                         .updateMaxTimestamp(maxTimestampTracker.get())
-                         .updateMaxLocalDeletionTime(maxDeletionTimeTracker.get())
-                         .addRowSize(dataFile.getFilePointer() - currentPosition)
-                         .addColumnCount(columnIndexer.writtenAtomCount())
-                         .mergeTombstoneHistogram(tombstones)
-                         .updateMinColumnNames(minColumnNames)
-                         .updateMaxColumnNames(maxColumnNames)
-                         .updateHasLegacyCounterShards(hasLegacyCounterShards);
-
-        afterAppend(key, currentPosition, RowIndexEntry.create(currentPosition, cf.deletionInfo().getTopLevelDeletion(), columnIndexer.build()));
-        return currentPosition;
-    }
-
-    private Descriptor makeTmpLinks()
-    {
-        // create temp links if they don't already exist
-        Descriptor link = descriptor.asType(Descriptor.Type.TEMPLINK);
-        if (!new File(link.filenameFor(Component.PRIMARY_INDEX)).exists())
+        @Override
+        public DeletionTime applyToDeletion(DeletionTime deletionTime)
         {
-            FileUtils.createHardLink(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)), new File(link.filenameFor(Component.PRIMARY_INDEX)));
-            FileUtils.createHardLink(new File(descriptor.filenameFor(Component.DATA)), new File(link.filenameFor(Component.DATA)));
+            collector.update(deletionTime);
+            return deletionTime;
         }
-        return link;
     }
 
     @SuppressWarnings("resource")
@@ -309,15 +239,14 @@
 
         StatsMetadata stats = statsMetadata();
         assert boundary.indexLength > 0 && boundary.dataLength > 0;
-        Descriptor link = makeTmpLinks();
-        // open the reader early, giving it a FINAL descriptor type so that it is indistinguishable for other consumers
-        SegmentedFile ifile = iwriter.builder.complete(link.filenameFor(Component.PRIMARY_INDEX), boundary.indexLength);
-        SegmentedFile dfile = dbuilder.complete(link.filenameFor(Component.DATA), boundary.dataLength);
-        SSTableReader sstable = SSTableReader.internalOpen(descriptor.asType(Descriptor.Type.FINAL),
+        // open the reader early
+        IndexSummary indexSummary = iwriter.summary.build(metadata.partitioner, boundary);
+        SegmentedFile ifile = iwriter.builder.buildIndex(descriptor, indexSummary, boundary);
+        SegmentedFile dfile = dbuilder.buildData(descriptor, stats, boundary);
+        SSTableReader sstable = SSTableReader.internalOpen(descriptor,
                                                            components, metadata,
-                                                           partitioner, ifile,
-                                                           dfile, iwriter.summary.build(partitioner, boundary),
-                                                           iwriter.bf.sharedCopy(), maxDataAge, stats, SSTableReader.OpenReason.EARLY);
+                                                           ifile, dfile, indexSummary,
+                                                           iwriter.bf.sharedCopy(), maxDataAge, stats, SSTableReader.OpenReason.EARLY, header);
 
         // now it's open, find the ACTUAL last readable key (i.e. for which the data file has also been flushed)
         sstable.first = getMinimalKey(first);
@@ -330,7 +259,8 @@
         // we must ensure the data is completely flushed to disk
         dataFile.sync();
         iwriter.indexFile.sync();
-        return openFinal(makeTmpLinks(), SSTableReader.OpenReason.EARLY);
+
+        return openFinal(descriptor, SSTableReader.OpenReason.EARLY);
     }
 
     @SuppressWarnings("resource")
@@ -341,19 +271,20 @@
 
         StatsMetadata stats = statsMetadata();
         // finalize in-memory state for the reader
-        SegmentedFile ifile = iwriter.builder.complete(desc.filenameFor(Component.PRIMARY_INDEX));
-        SegmentedFile dfile = dbuilder.complete(desc.filenameFor(Component.DATA));
-        SSTableReader sstable = SSTableReader.internalOpen(desc.asType(Descriptor.Type.FINAL),
+        IndexSummary indexSummary = iwriter.summary.build(this.metadata.partitioner);
+        SegmentedFile ifile = iwriter.builder.buildIndex(desc, indexSummary);
+        SegmentedFile dfile = dbuilder.buildData(desc, stats);
+        SSTableReader sstable = SSTableReader.internalOpen(desc,
                                                            components,
                                                            this.metadata,
-                                                           partitioner,
                                                            ifile,
                                                            dfile,
-                                                           iwriter.summary.build(partitioner),
+                                                           indexSummary,
                                                            iwriter.bf.sharedCopy(),
                                                            maxDataAge,
                                                            stats,
-                                                           openReason);
+                                                           openReason,
+                                                           header);
         sstable.first = getMinimalKey(first);
         sstable.last = getMinimalKey(last);
         return sstable;
@@ -378,11 +309,8 @@
             // save the table of components
             SSTable.appendTOC(descriptor, components);
 
-            // rename to final
-            rename(descriptor, components);
-
             if (openResult)
-                finalReader = openFinal(descriptor.asType(Descriptor.Type.FINAL), SSTableReader.OpenReason.NORMAL);
+                finalReader = openFinal(descriptor, SSTableReader.OpenReason.NORMAL);
         }
 
         protected Throwable doCommit(Throwable accumulate)
@@ -393,7 +321,7 @@
         }
 
         @Override
-        protected Throwable doPreCleanup(Throwable accumulate)
+        protected Throwable doPostCleanup(Throwable accumulate)
         {
             accumulate = dbuilder.close(accumulate);
             return accumulate;
@@ -403,26 +331,6 @@
         {
             accumulate = iwriter.abort(accumulate);
             accumulate = dataFile.abort(accumulate);
-
-            accumulate = delete(descriptor, accumulate);
-            if (!openResult)
-                accumulate = delete(descriptor.asType(Descriptor.Type.FINAL), accumulate);
-            return accumulate;
-        }
-
-        private Throwable delete(Descriptor desc, Throwable accumulate)
-        {
-            try
-            {
-                Set<Component> components = SSTable.discoverComponentsFor(desc);
-                if (!components.isEmpty())
-                    SSTable.delete(desc, components);
-            }
-            catch (Throwable t)
-            {
-                logger.error(String.format("Failed deleting temp components for %s", descriptor), t);
-                accumulate = merge(accumulate, t);
-            }
             return accumulate;
         }
     }
@@ -430,9 +338,9 @@
     private static void writeMetadata(Descriptor desc, Map<MetadataType, MetadataComponent> components)
     {
         File file = new File(desc.filenameFor(Component.STATS));
-        try (SequentialWriter out = SequentialWriter.open(file);)
+        try (SequentialWriter out = SequentialWriter.open(file))
         {
-            desc.getMetadataSerializer().serialize(components, desc.version, out.stream);
+            desc.getMetadataSerializer().serialize(components, out, desc.version);
             out.setDescriptor(desc).finish();
         }
         catch (IOException e)
@@ -443,7 +351,7 @@
 
     public long getFilePointer()
     {
-        return dataFile.getFilePointer();
+        return dataFile.position();
     }
 
     public long getOnDiskFilePointer()
@@ -460,14 +368,14 @@
         public final SegmentedFile.Builder builder;
         public final IndexSummaryBuilder summary;
         public final IFilter bf;
-        private FileMark mark;
+        private DataPosition mark;
 
         IndexWriter(long keyCount, final SequentialWriter dataFile)
         {
             indexFile = SequentialWriter.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
             builder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode(), false);
-            summary = new IndexSummaryBuilder(keyCount, metadata.getMinIndexInterval(), Downsampling.BASE_SAMPLING_LEVEL);
-            bf = FilterFactory.getFilter(keyCount, metadata.getBloomFilterFpChance(), true);
+            summary = new IndexSummaryBuilder(keyCount, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL);
+            bf = FilterFactory.getFilter(keyCount, metadata.params.bloomFilterFpChance, true, descriptor.version.hasOldBfHashOrder());
             // register listeners to be alerted when the data files are flushed
             indexFile.setPostFlushListener(new Runnable()
             {
@@ -494,23 +402,22 @@
         public void append(DecoratedKey key, RowIndexEntry indexEntry, long dataEnd) throws IOException
         {
             bf.add(key);
-            long indexStart = indexFile.getFilePointer();
+            long indexStart = indexFile.position();
             try
             {
-                ByteBufferUtil.writeWithShortLength(key.getKey(), indexFile.stream);
-                rowIndexEntrySerializer.serialize(indexEntry, indexFile.stream);
+                ByteBufferUtil.writeWithShortLength(key.getKey(), indexFile);
+                rowIndexEntrySerializer.serialize(indexEntry, indexFile);
             }
             catch (IOException e)
             {
                 throw new FSWriteError(e, indexFile.getPath());
             }
-            long indexEnd = indexFile.getFilePointer();
+            long indexEnd = indexFile.position();
 
             if (logger.isTraceEnabled())
                 logger.trace("wrote index entry: {} at {}", indexEntry, indexStart);
 
             summary.maybeAddEntry(key, indexStart, indexEnd, dataEnd);
-            builder.addPotentialBoundary(indexStart);
         }
 
         /**
@@ -554,13 +461,13 @@
             flushBf();
 
             // truncate index file
-            long position = iwriter.indexFile.getFilePointer();
+            long position = iwriter.indexFile.position();
             iwriter.indexFile.setDescriptor(descriptor).prepareToCommit();
             FileUtils.truncate(iwriter.indexFile.getPath(), position);
 
             // save summary
             summary.prepareToCommit();
-            try (IndexSummary summary = iwriter.summary.build(partitioner))
+            try (IndexSummary summary = iwriter.summary.build(getPartitioner()))
             {
                 SSTableReader.saveSummary(descriptor, first, last, iwriter.builder, dbuilder, summary);
             }
@@ -577,7 +484,7 @@
         }
 
         @Override
-        protected Throwable doPreCleanup(Throwable accumulate)
+        protected Throwable doPostCleanup(Throwable accumulate)
         {
             accumulate = summary.close(accumulate);
             accumulate = bf.close(accumulate);

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/IndexedSliceReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/IndexedSliceReader.java
deleted file mode 100644
index 6db9c3d..0000000
--- a/src/java/org/apache/cassandra/io/sstable/format/big/IndexedSliceReader.java
+++ /dev/null

@@ -1,542 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable.format.big;
-
-import java.io.IOException;
-import java.util.ArrayDeque;
-import java.util.Deque;
-import java.util.List;
-
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.io.sstable.CorruptSSTableException;
-import org.apache.cassandra.io.sstable.IndexHelper;
-import org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileMark;
-import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-/**
- * This is a reader that finds the block for a starting column and returns blocks before/after it for each next call.
- * This function assumes that the CF is sorted by name and exploits the name index.
- */
-class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
-{
-    private final ColumnFamily emptyColumnFamily;
-
-    private final SSTableReader sstable;
-    private final List<IndexHelper.IndexInfo> indexes;
-    private final FileDataInput originalInput;
-    private FileDataInput file;
-    private final boolean reversed;
-    private final ColumnSlice[] slices;
-    private final BlockFetcher fetcher;
-    private final Deque<OnDiskAtom> blockColumns = new ArrayDeque<OnDiskAtom>();
-    private final CellNameType comparator;
-
-    // Holds range tombstone in reverse queries. See addColumn()
-    private final Deque<OnDiskAtom> rangeTombstonesReversed;
-
-    /**
-     * This slice reader assumes that slices are sorted correctly, e.g. that for forward lookup slices are in
-     * lexicographic order of start elements and that for reverse lookup they are in reverse lexicographic order of
-     * finish (reverse start) elements. i.e. forward: [a,b],[d,e],[g,h] reverse: [h,g],[e,d],[b,a]. This reader also
-     * assumes that validation has been performed in terms of intervals (no overlapping intervals).
-     */
-    IndexedSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, ColumnSlice[] slices, boolean reversed)
-    {
-        Tracing.trace("Seeking to partition indexed section in data file");
-        this.sstable = sstable;
-        this.originalInput = input;
-        this.reversed = reversed;
-        this.slices = slices;
-        this.comparator = sstable.metadata.comparator;
-        this.rangeTombstonesReversed = reversed ? new ArrayDeque<OnDiskAtom>() : null;
-
-        try
-        {
-            this.indexes = indexEntry.columnsIndex();
-            emptyColumnFamily = ArrayBackedSortedColumns.factory.create(sstable.metadata);
-            if (indexes.isEmpty())
-            {
-                setToRowStart(indexEntry, input);
-                emptyColumnFamily.delete(DeletionTime.serializer.deserialize(file));
-                fetcher = new SimpleBlockFetcher();
-            }
-            else
-            {
-                emptyColumnFamily.delete(indexEntry.deletionTime());
-                fetcher = new IndexedBlockFetcher(indexEntry.position);
-            }
-        }
-        catch (IOException e)
-        {
-            sstable.markSuspect();
-            throw new CorruptSSTableException(e, file.getPath());
-        }
-    }
-
-    /**
-     * Sets the seek position to the start of the row for column scanning.
-     */
-    private void setToRowStart(RowIndexEntry rowEntry, FileDataInput in) throws IOException
-    {
-        if (in == null)
-        {
-            this.file = sstable.getFileDataInput(rowEntry.position);
-        }
-        else
-        {
-            this.file = in;
-            in.seek(rowEntry.position);
-        }
-        sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(file));
-    }
-
-    public ColumnFamily getColumnFamily()
-    {
-        return emptyColumnFamily;
-    }
-
-    public DecoratedKey getKey()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    protected OnDiskAtom computeNext()
-    {
-        while (true)
-        {
-            if (reversed)
-            {
-                // Return all tombstone for the block first (see addColumn() below)
-                OnDiskAtom column = rangeTombstonesReversed.poll();
-                if (column != null)
-                    return column;
-            }
-
-            OnDiskAtom column = blockColumns.poll();
-            if (column == null)
-            {
-                if (!fetcher.fetchMoreData())
-                    return endOfData();
-            }
-            else
-            {
-                return column;
-            }
-        }
-    }
-
-    public void close() throws IOException
-    {
-        if (originalInput == null && file != null)
-            file.close();
-    }
-
-    protected void addColumn(OnDiskAtom col)
-    {
-        if (reversed)
-        {
-            /*
-             * We put range tomstone markers at the beginning of the range they delete. But for reversed queries,
-             * the caller still need to know about a RangeTombstone before it sees any column that it covers.
-             * To make that simple, we keep said tombstones separate and return them all before any column for
-             * a given block.
-             */
-            if (col instanceof RangeTombstone)
-                rangeTombstonesReversed.addFirst(col);
-            else
-                blockColumns.addFirst(col);
-        }
-        else
-        {
-            blockColumns.addLast(col);
-        }
-    }
-
-    private abstract class BlockFetcher
-    {
-        protected int currentSliceIdx;
-
-        protected BlockFetcher(int sliceIdx)
-        {
-            this.currentSliceIdx = sliceIdx;
-        }
-
-        /*
-         * Return the smallest key selected by the current ColumnSlice.
-         */
-        protected Composite currentStart()
-        {
-            return reversed ? slices[currentSliceIdx].finish : slices[currentSliceIdx].start;
-        }
-
-        /*
-         * Return the biggest key selected by the current ColumnSlice.
-         */
-        protected Composite currentFinish()
-        {
-            return reversed ? slices[currentSliceIdx].start : slices[currentSliceIdx].finish;
-        }
-
-        protected abstract boolean setNextSlice();
-
-        protected abstract boolean fetchMoreData();
-
-        protected boolean isColumnBeforeSliceStart(OnDiskAtom column)
-        {
-            return isBeforeSliceStart(column.name());
-        }
-
-        protected boolean isBeforeSliceStart(Composite name)
-        {
-            Composite start = currentStart();
-            return !start.isEmpty() && comparator.compare(name, start) < 0;
-        }
-
-        protected boolean isColumnBeforeSliceFinish(OnDiskAtom column)
-        {
-            Composite finish = currentFinish();
-            return finish.isEmpty() || comparator.compare(column.name(), finish) <= 0;
-        }
-
-        protected boolean isAfterSliceFinish(Composite name)
-        {
-            Composite finish = currentFinish();
-            return !finish.isEmpty() && comparator.compare(name, finish) > 0;
-        }
-    }
-
-    private class IndexedBlockFetcher extends BlockFetcher
-    {
-        // where this row starts
-        private final long columnsStart;
-
-        // the index entry for the next block to deserialize
-        private int nextIndexIdx = -1;
-
-        // index of the last block we've read from disk;
-        private int lastDeserializedBlock = -1;
-
-        // For reversed, keep columns at the beginning of the last deserialized block that
-        // may still match a slice
-        private final Deque<OnDiskAtom> prefetched;
-
-        public IndexedBlockFetcher(long columnsStart)
-        {
-            super(-1);
-            this.columnsStart = columnsStart;
-            this.prefetched = reversed ? new ArrayDeque<OnDiskAtom>() : null;
-            setNextSlice();
-        }
-
-        protected boolean setNextSlice()
-        {
-            while (++currentSliceIdx < slices.length)
-            {
-                nextIndexIdx = IndexHelper.indexFor(slices[currentSliceIdx].start, indexes, comparator, reversed, nextIndexIdx);
-                if (nextIndexIdx < 0 || nextIndexIdx >= indexes.size())
-                    // no index block for that slice
-                    continue;
-
-                // Check if we can exclude this slice entirely from the index
-                IndexInfo info = indexes.get(nextIndexIdx);
-                if (reversed)
-                {
-                    if (!isBeforeSliceStart(info.lastName))
-                        return true;
-                }
-                else
-                {
-                    if (!isAfterSliceFinish(info.firstName))
-                        return true;
-                }
-            }
-            nextIndexIdx = -1;
-            return false;
-        }
-
-        protected boolean hasMoreSlice()
-        {
-            return currentSliceIdx < slices.length;
-        }
-
-        protected boolean fetchMoreData()
-        {
-            if (!hasMoreSlice())
-                return false;
-
-            // If we read blocks in reversed disk order, we may have columns from the previous block to handle.
-            // Note that prefetched keeps columns in reversed disk order.
-            // Also note that Range Tombstone handling is a bit tricky, because we may run into range tombstones
-            // that cover a slice *after* we've move to the previous slice. To keep it simple, we simply include
-            // every RT in prefetched: it's only slightly inefficient to do so and there is only so much RT that
-            // can be mistakenly added this way.
-            if (reversed && !prefetched.isEmpty())
-            {
-                // Avoids some comparison when we know it's not useful
-                boolean inSlice = false;
-
-                OnDiskAtom prefetchedCol;
-                while ((prefetchedCol = prefetched.peek()) != null)
-                {
-                    // col is before slice, we update the slice
-                    if (isColumnBeforeSliceStart(prefetchedCol))
-                    {
-                        inSlice = false;
-
-                        // As explained above, we add RT unconditionally
-                        if (prefetchedCol instanceof RangeTombstone)
-                        {
-                            blockColumns.addLast(prefetched.poll());
-                            continue;
-                        }
-
-                        // Otherwise, we either move to the next slice. If we have no more slice, then
-                        // simply unwind prefetched entirely and add all RT.
-                        if (!setNextSlice())
-                        {
-                            while ((prefetchedCol = prefetched.poll()) != null)
-                                if (prefetchedCol instanceof RangeTombstone)
-                                    blockColumns.addLast(prefetchedCol);
-                            break;
-                        }
-
-                    }
-                    // col is within slice, all columns
-                    // (we go in reverse, so as soon as we are in a slice, no need to check
-                    // we're after the slice until we change slice)
-                    else if (inSlice || isColumnBeforeSliceFinish(prefetchedCol))
-                    {
-                        blockColumns.addLast(prefetched.poll());
-                        inSlice = true;
-                    }
-                    // if col is after slice, ignore
-                    else
-                    {
-                        prefetched.poll();
-                    }
-                }
-
-                if (!blockColumns.isEmpty())
-                    return true;
-                else if (!hasMoreSlice())
-                    return false;
-            }
-            try
-            {
-                return getNextBlock();
-            }
-            catch (IOException e)
-            {
-                throw new CorruptSSTableException(e, file.getPath());
-            }
-        }
-
-        private boolean getNextBlock() throws IOException
-        {
-            if (lastDeserializedBlock == nextIndexIdx)
-            {
-                if (reversed)
-                    nextIndexIdx--;
-                else
-                    nextIndexIdx++;
-            }
-            lastDeserializedBlock = nextIndexIdx;
-
-            // Are we done?
-            if (lastDeserializedBlock < 0 || lastDeserializedBlock >= indexes.size())
-                return false;
-
-            IndexInfo currentIndex = indexes.get(lastDeserializedBlock);
-
-            /* seek to the correct offset to the data, and calculate the data size */
-            long positionToSeek = columnsStart + currentIndex.offset;
-
-            // With new promoted indexes, our first seek in the data file will happen at that point.
-            if (file == null)
-                file = originalInput == null ? sstable.getFileDataInput(positionToSeek) : originalInput;
-
-            AtomDeserializer deserializer = emptyColumnFamily.metadata().getOnDiskDeserializer(file, sstable.descriptor.version);
-
-            file.seek(positionToSeek);
-            FileMark mark = file.mark();
-
-            // We remenber when we are whithin a slice to avoid some comparison
-            boolean inSlice = false;
-
-            // scan from index start
-            while (file.bytesPastMark(mark) < currentIndex.width || deserializer.hasUnprocessed())
-            {
-                // col is before slice
-                // (If in slice, don't bother checking that until we change slice)
-                Composite start = currentStart();
-                if (!inSlice && !start.isEmpty() && deserializer.compareNextTo(start) < 0)
-                {
-                    // If it's a rangeTombstone, then we need to read it and include it unless it's end
-                    // stops before our slice start.
-                    if (deserializer.nextIsRangeTombstone())
-                    {
-                        RangeTombstone rt = (RangeTombstone)deserializer.readNext();
-                        if (comparator.compare(rt.max, start) >= 0)
-                            addColumn(rt);
-                        continue;
-                    }
-
-                    if (reversed)
-                    {
-                        // the next slice select columns that are before the current one, so it may
-                        // match this column, so keep it around.
-                        prefetched.addFirst(deserializer.readNext());
-                    }
-                    else
-                    {
-                        deserializer.skipNext();
-                    }
-                }
-                // col is within slice
-                else
-                {
-                    Composite finish = currentFinish();
-                    if (finish.isEmpty() || deserializer.compareNextTo(finish) <= 0)
-                    {
-                        inSlice = true;
-                        addColumn(deserializer.readNext());
-                    }
-                    // col is after slice.
-                    else
-                    {
-                        // When reading forward, if we hit a column that sorts after the current slice, it means we're done with this slice.
-                        // For reversed, this may either mean that we're done with the current slice, or that we need to read the previous
-                        // index block. However, we can be sure that we are in the first case though (the current slice is done) if the first
-                        // columns of the block were not part of the current slice, i.e. if we have columns in prefetched.
-                        if (reversed && prefetched.isEmpty())
-                            break;
-
-                        if (!setNextSlice())
-                            break;
-
-                        inSlice = false;
-
-                        // The next index block now corresponds to the first block that may have columns for the newly set slice.
-                        // So if it's different from the current block, we're done with this block. And in that case, we know
-                        // that our prefetched columns won't match.
-                        if (nextIndexIdx != lastDeserializedBlock)
-                        {
-                            if (reversed)
-                                prefetched.clear();
-                            break;
-                        }
-
-                        // Even if the next slice may have column in this blocks, if we're reversed, those columns have been
-                        // prefetched and we're done with that block
-                        if (reversed)
-                            break;
-
-                        // otherwise, we will deal with that column at the next iteration
-                    }
-                }
-            }
-            return true;
-        }
-    }
-
-    private class SimpleBlockFetcher extends BlockFetcher
-    {
-        public SimpleBlockFetcher() throws IOException
-        {
-            // Since we have to deserialize in order and will read all slices might as well reverse the slices and
-            // behave as if it was not reversed
-            super(reversed ? slices.length - 1 : 0);
-
-            // We remenber when we are whithin a slice to avoid some comparison
-            boolean inSlice = false;
-
-            AtomDeserializer deserializer = emptyColumnFamily.metadata().getOnDiskDeserializer(file, sstable.descriptor.version);
-            while (deserializer.hasNext())
-            {
-                // col is before slice
-                // (If in slice, don't bother checking that until we change slice)
-                Composite start = currentStart();
-                if (!inSlice && !start.isEmpty() && deserializer.compareNextTo(start) < 0)
-                {
-                    // If it's a rangeTombstone, then we need to read it and include it unless it's end
-                    // stops before our slice start. Otherwise, we can skip it.
-                    if (deserializer.nextIsRangeTombstone())
-                    {
-                        RangeTombstone rt = (RangeTombstone)deserializer.readNext();
-                        if (comparator.compare(rt.max, start) >= 0)
-                            addColumn(rt);
-                    }
-                    else
-                    {
-                        deserializer.skipNext();
-                    }
-                    continue;
-                }
-
-                // col is within slice
-                Composite finish = currentFinish();
-                if (finish.isEmpty() || deserializer.compareNextTo(finish) <= 0)
-                {
-                    inSlice = true;
-                    addColumn(deserializer.readNext());
-                }
-                // col is after slice. more slices?
-                else
-                {
-                    inSlice = false;
-                    if (!setNextSlice())
-                        break;
-                }
-            }
-        }
-
-        protected boolean setNextSlice()
-        {
-            if (reversed)
-            {
-                if (currentSliceIdx <= 0)
-                    return false;
-
-                currentSliceIdx--;
-            }
-            else
-            {
-                if (currentSliceIdx >= slices.length - 1)
-                    return false;
-
-                currentSliceIdx++;
-            }
-            return true;
-        }
-
-        protected boolean fetchMoreData()
-        {
-            return false;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableNamesIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableNamesIterator.java
deleted file mode 100644
index b8910c7..0000000
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableNamesIterator.java
+++ /dev/null

@@ -1,264 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable.format.big;
-
-import java.io.IOException;
-import java.util.*;
-
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.io.sstable.CorruptSSTableException;
-import org.apache.cassandra.io.sstable.IndexHelper;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileMark;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-class SSTableNamesIterator extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
-{
-    private ColumnFamily cf;
-    private final SSTableReader sstable;
-    private FileDataInput fileToClose;
-    private Iterator<OnDiskAtom> iter;
-    public final SortedSet<CellName> columns;
-    public final DecoratedKey key;
-
-    public SSTableNamesIterator(SSTableReader sstable, DecoratedKey key, SortedSet<CellName> columns)
-    {
-        assert columns != null;
-        this.sstable = sstable;
-        this.columns = columns;
-        this.key = key;
-
-        RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
-        if (indexEntry == null)
-            return;
-
-        try
-        {
-            read(sstable, null, indexEntry);
-        }
-        catch (IOException e)
-        {
-            sstable.markSuspect();
-            throw new CorruptSSTableException(e, sstable.getFilename());
-        }
-        finally
-        {
-            if (fileToClose != null)
-                FileUtils.closeQuietly(fileToClose);
-        }
-    }
-
-    public SSTableNamesIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, SortedSet<CellName> columns, RowIndexEntry indexEntry)
-    {
-        assert columns != null;
-        this.sstable = sstable;
-        this.columns = columns;
-        this.key = key;
-
-        try
-        {
-            read(sstable, file, indexEntry);
-        }
-        catch (IOException e)
-        {
-            sstable.markSuspect();
-            throw new CorruptSSTableException(e, sstable.getFilename());
-        }
-    }
-
-    private FileDataInput createFileDataInput(long position)
-    {
-        fileToClose = sstable.getFileDataInput(position);
-        return fileToClose;
-    }
-
-    @SuppressWarnings("resource")
-    private void read(SSTableReader sstable, FileDataInput file, RowIndexEntry indexEntry)
-    throws IOException
-    {
-        List<IndexHelper.IndexInfo> indexList;
-
-        // If the entry is not indexed or the index is not promoted, read from the row start
-        if (!indexEntry.isIndexed())
-        {
-            if (file == null)
-                file = createFileDataInput(indexEntry.position);
-            else
-                file.seek(indexEntry.position);
-
-            DecoratedKey keyInDisk = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(file));
-            assert keyInDisk.equals(key) : String.format("%s != %s in %s", keyInDisk, key, file.getPath());
-        }
-
-        indexList = indexEntry.columnsIndex();
-
-        if (!indexEntry.isIndexed())
-        {
-            ColumnFamilySerializer serializer = ColumnFamily.serializer;
-            try
-            {
-                cf = ArrayBackedSortedColumns.factory.create(sstable.metadata);
-                cf.delete(DeletionTime.serializer.deserialize(file));
-            }
-            catch (Exception e)
-            {
-                throw new IOException(serializer + " failed to deserialize " + sstable.getColumnFamilyName() + " with " + sstable.metadata + " from " + file, e);
-            }
-        }
-        else
-        {
-            cf = ArrayBackedSortedColumns.factory.create(sstable.metadata);
-            cf.delete(indexEntry.deletionTime());
-        }
-
-        List<OnDiskAtom> result = new ArrayList<OnDiskAtom>();
-        if (indexList.isEmpty())
-        {
-            readSimpleColumns(file, columns, result);
-        }
-        else
-        {
-            readIndexedColumns(sstable.metadata, file, columns, indexList, indexEntry.position, result);
-        }
-
-        // create an iterator view of the columns we read
-        iter = result.iterator();
-    }
-
-    private void readSimpleColumns(FileDataInput file, SortedSet<CellName> columnNames, List<OnDiskAtom> result)
-    {
-        Iterator<OnDiskAtom> atomIterator = cf.metadata().getOnDiskIterator(file, sstable.descriptor.version);
-        int n = 0;
-        while (atomIterator.hasNext())
-        {
-            OnDiskAtom column = atomIterator.next();
-            if (column instanceof Cell)
-            {
-                if (columnNames.contains(column.name()))
-                {
-                    result.add(column);
-                    if (++n >= columns.size())
-                        break;
-                }
-            }
-            else
-            {
-                result.add(column);
-            }
-        }
-    }
-
-    @SuppressWarnings("resource")
-    private void readIndexedColumns(CFMetaData metadata,
-                                    FileDataInput file,
-                                    SortedSet<CellName> columnNames,
-                                    List<IndexHelper.IndexInfo> indexList,
-                                    long basePosition,
-                                    List<OnDiskAtom> result)
-    throws IOException
-    {
-        /* get the various column ranges we have to read */
-        CellNameType comparator = metadata.comparator;
-        List<IndexHelper.IndexInfo> ranges = new ArrayList<IndexHelper.IndexInfo>();
-        int lastIndexIdx = -1;
-        for (CellName name : columnNames)
-        {
-            int index = IndexHelper.indexFor(name, indexList, comparator, false, lastIndexIdx);
-            if (index < 0 || index == indexList.size())
-                continue;
-            IndexHelper.IndexInfo indexInfo = indexList.get(index);
-            // Check the index block does contain the column names and that we haven't inserted this block yet.
-            if (comparator.compare(name, indexInfo.firstName) < 0 || index == lastIndexIdx)
-                continue;
-
-            ranges.add(indexInfo);
-            lastIndexIdx = index;
-        }
-
-        if (ranges.isEmpty())
-            return;
-
-        Iterator<CellName> toFetch = columnNames.iterator();
-        CellName nextToFetch = toFetch.next();
-        for (IndexHelper.IndexInfo indexInfo : ranges)
-        {
-            long positionToSeek = basePosition + indexInfo.offset;
-
-            // With new promoted indexes, our first seek in the data file will happen at that point.
-            if (file == null)
-                file = createFileDataInput(positionToSeek);
-
-            AtomDeserializer deserializer = cf.metadata().getOnDiskDeserializer(file, sstable.descriptor.version);
-            file.seek(positionToSeek);
-            FileMark mark = file.mark();
-            while (file.bytesPastMark(mark) < indexInfo.width && nextToFetch != null)
-            {
-                int cmp = deserializer.compareNextTo(nextToFetch);
-                if (cmp < 0)
-                {
-                    // If it's a rangeTombstone, then we need to read it and include
-                    // it if it includes our target. Otherwise, we can skip it.
-                    if (deserializer.nextIsRangeTombstone())
-                    {
-                        RangeTombstone rt = (RangeTombstone)deserializer.readNext();
-                        if (comparator.compare(rt.max, nextToFetch) >= 0)
-                            result.add(rt);
-                    }
-                    else
-                    {
-                        deserializer.skipNext();
-                    }
-                }
-                else if (cmp == 0)
-                {
-                    nextToFetch = toFetch.hasNext() ? toFetch.next() : null;
-                    result.add(deserializer.readNext());
-                }
-                else
-                    nextToFetch = toFetch.hasNext() ? toFetch.next() : null;
-            }
-        }
-    }
-
-    public DecoratedKey getKey()
-    {
-        return key;
-    }
-
-    public ColumnFamily getColumnFamily()
-    {
-        return cf;
-    }
-
-    protected OnDiskAtom computeNext()
-    {
-        if (iter == null || !iter.hasNext())
-            return endOfData();
-        return iter.next();
-    }
-
-    public void close() throws IOException { }
-}

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableSliceIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableSliceIterator.java
deleted file mode 100644
index 07d867d..0000000
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableSliceIterator.java
+++ /dev/null

@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable.format.big;
-
-import java.io.IOException;
-
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.OnDiskAtom;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.FileDataInput;
-
-/**
- *  A Cell Iterator over SSTable
- */
-class SSTableSliceIterator implements OnDiskAtomIterator
-{
-    private final OnDiskAtomIterator reader;
-    private final DecoratedKey key;
-
-    public SSTableSliceIterator(SSTableReader sstable, DecoratedKey key, ColumnSlice[] slices, boolean reversed)
-    {
-        this.key = key;
-        RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
-        this.reader = indexEntry == null ? null : createReader(sstable, indexEntry, null, slices, reversed);
-    }
-
-    /**
-     * An iterator for a slice within an SSTable
-     * @param sstable Keyspace for the CFS we are reading from
-     * @param file Optional parameter that input is read from.  If null is passed, this class creates an appropriate one automatically.
-     * If this class creates, it will close the underlying file when #close() is called.
-     * If a caller passes a non-null argument, this class will NOT close the underlying file when the iterator is closed (i.e. the caller is responsible for closing the file)
-     * In all cases the caller should explicitly #close() this iterator.
-     * @param key The key the requested slice resides under
-     * @param slices the column slices
-     * @param reversed Results are returned in reverse order iff reversed is true.
-     * @param indexEntry position of the row
-     */
-    public SSTableSliceIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, ColumnSlice[] slices, boolean reversed, RowIndexEntry indexEntry)
-    {
-        this.key = key;
-        reader = createReader(sstable, indexEntry, file, slices, reversed);
-    }
-
-    private static OnDiskAtomIterator createReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput file, ColumnSlice[] slices, boolean reversed)
-    {
-        return slices.length == 1 && slices[0].start.isEmpty() && !reversed
-             ? new SimpleSliceReader(sstable, indexEntry, file, slices[0].finish)
-             : new IndexedSliceReader(sstable, indexEntry, file, slices, reversed);
-    }
-
-    public DecoratedKey getKey()
-    {
-        return key;
-    }
-
-    public ColumnFamily getColumnFamily()
-    {
-        return reader == null ? null : reader.getColumnFamily();
-    }
-
-    public boolean hasNext()
-    {
-        return reader != null && reader.hasNext();
-    }
-
-    public OnDiskAtom next()
-    {
-        return reader.next();
-    }
-
-    public void remove()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public void close() throws IOException
-    {
-        if (reader != null)
-            reader.close();
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SimpleSliceReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/SimpleSliceReader.java
deleted file mode 100644
index 9fec303..0000000
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SimpleSliceReader.java
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.sstable.format.big;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import com.google.common.collect.AbstractIterator;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.io.sstable.CorruptSSTableException;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-class SimpleSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
-{
-    private static final Logger logger = LoggerFactory.getLogger(SimpleSliceReader.class);
-
-    private final FileDataInput file;
-    private final boolean needsClosing;
-    private final Composite finishColumn;
-    private final CellNameType comparator;
-    private final ColumnFamily emptyColumnFamily;
-    private final Iterator<OnDiskAtom> atomIterator;
-
-    SimpleSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, Composite finishColumn)
-    {
-        Tracing.trace("Seeking to partition beginning in data file");
-        this.finishColumn = finishColumn;
-        this.comparator = sstable.metadata.comparator;
-        try
-        {
-            if (input == null)
-            {
-                this.file = sstable.getFileDataInput(indexEntry.position);
-                this.needsClosing = true;
-            }
-            else
-            {
-                this.file = input;
-                input.seek(indexEntry.position);
-                this.needsClosing = false;
-            }
-
-            // Skip key and data size
-            ByteBufferUtil.skipShortLength(file);
-
-            emptyColumnFamily = ArrayBackedSortedColumns.factory.create(sstable.metadata);
-            emptyColumnFamily.delete(DeletionTime.serializer.deserialize(file));
-            atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, sstable.descriptor.version);
-        }
-        catch (IOException e)
-        {
-            sstable.markSuspect();
-            throw new CorruptSSTableException(e, sstable.getFilename());
-        }
-    }
-
-    protected OnDiskAtom computeNext()
-    {
-        if (!atomIterator.hasNext())
-            return endOfData();
-
-        OnDiskAtom column = atomIterator.next();
-        if (!finishColumn.isEmpty() && comparator.compare(column.name(), finishColumn) > 0)
-            return endOfData();
-
-        return column;
-    }
-
-    public ColumnFamily getColumnFamily()
-    {
-        return emptyColumnFamily;
-    }
-
-    public void close() throws IOException
-    {
-        if (needsClosing)
-            file.close();
-    }
-
-    public DecoratedKey getKey()
-    {
-        throw new UnsupportedOperationException();
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java
index c8e6ee8..ef3453a 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/CompactionMetadata.java

@@ -17,16 +17,14 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
-import java.io.DataInput;
 import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
 import com.clearspring.analytics.stream.cardinality.ICardinality;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -39,13 +37,10 @@
 {
     public static final IMetadataComponentSerializer serializer = new CompactionMetadataSerializer();
 
-    public final Set<Integer> ancestors;
-
     public final ICardinality cardinalityEstimator;
 
-    public CompactionMetadata(Set<Integer> ancestors, ICardinality cardinalityEstimator)
+    public CompactionMetadata(ICardinality cardinalityEstimator)
     {
-        this.ancestors = ancestors;
         this.cardinalityEstimator = cardinalityEstimator;
     }
 
@@ -57,48 +52,55 @@
     @Override
     public boolean equals(Object o)
     {
-        if (this == o) return true;
-        if (o == null || getClass() != o.getClass()) return false;
+        if (this == o)
+            return true;
 
-        CompactionMetadata that = (CompactionMetadata) o;
-        return ancestors == null ? that.ancestors == null : ancestors.equals(that.ancestors);
+        if (o == null || getClass() != o.getClass())
+            return false;
+
+        // keeping equals and hashCode as all classes inheriting from MetadataComponent
+        // implement them but we have really nothing to compare
+        return true;
     }
 
     @Override
     public int hashCode()
     {
-        return ancestors != null ? ancestors.hashCode() : 0;
+        // see comment in equals
+        return 31;
     }
 
     public static class CompactionMetadataSerializer implements IMetadataComponentSerializer<CompactionMetadata>
     {
-        public int serializedSize(CompactionMetadata component, Version version) throws IOException
+        public int serializedSize(Version version, CompactionMetadata component) throws IOException
         {
-            int size = 0;
-            size += TypeSizes.NATIVE.sizeof(component.ancestors.size());
-            for (int g : component.ancestors)
-                size += TypeSizes.NATIVE.sizeof(g);
+            int sz = 0;
+            if (version.hasCompactionAncestors())
+            {   // write empty ancestor marker
+                sz = 4;
+            }
             byte[] serializedCardinality = component.cardinalityEstimator.getBytes();
-            size += TypeSizes.NATIVE.sizeof(serializedCardinality.length) + serializedCardinality.length;
-            return size;
+            return TypeSizes.sizeof(serializedCardinality.length) + serializedCardinality.length + sz;
         }
 
-        public void serialize(CompactionMetadata component, Version version, DataOutputPlus out) throws IOException
+        public void serialize(Version version, CompactionMetadata component, DataOutputPlus out) throws IOException
         {
-            out.writeInt(component.ancestors.size());
-            for (int g : component.ancestors)
-                out.writeInt(g);
+            if (version.hasCompactionAncestors())
+            {   // write empty ancestor marker
+                out.writeInt(0);
+            }
             ByteBufferUtil.writeWithLength(component.cardinalityEstimator.getBytes(), out);
         }
 
-        public CompactionMetadata deserialize(Version version, DataInput in) throws IOException
+        public CompactionMetadata deserialize(Version version, DataInputPlus in) throws IOException
         {
-            int nbAncestors = in.readInt();
-            Set<Integer> ancestors = new HashSet<>(nbAncestors);
-            for (int i = 0; i < nbAncestors; i++)
-                ancestors.add(in.readInt());
+            if (version.hasCompactionAncestors())
+            { // skip ancestors
+                int nbAncestors = in.readInt();
+                in.skipBytes(nbAncestors * TypeSizes.sizeof(nbAncestors));
+            }
             ICardinality cardinality = HyperLogLogPlus.Builder.build(ByteBufferUtil.readBytes(in, in.readInt()));
-            return new CompactionMetadata(ancestors, cardinality);
+            return new CompactionMetadata(cardinality);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java
index e3d867f..7c03f54 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataComponentSerializer.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
@@ -31,21 +31,26 @@
     /**
      * Calculate and return serialized size.
      *
+     *
+     *
+     * @param version
      * @param component MetadataComponent to calculate serialized size
      * @return serialized size of this component
      * @throws IOException
      */
-    int serializedSize(T component, Version version) throws IOException;
+    int serializedSize(Version version, T component) throws IOException;
 
     /**
      * Serialize metadata component to given output.
      *
      *
+     *
+     * @param version
      * @param component MetadataComponent to serialize
      * @param out  serialize destination
      * @throws IOException
      */
-    void serialize(T component, Version version, DataOutputPlus out) throws IOException;
+    void serialize(Version version, T component, DataOutputPlus out) throws IOException;
 
     /**
      * Deserialize metadata component from given input.
@@ -55,5 +60,5 @@
      * @return Deserialized component
      * @throws IOException
      */
-    T deserialize(Version version, DataInput in) throws IOException;
+    T deserialize(Version version, DataInputPlus in) throws IOException;
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
index a7d23f4..100cfdb 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java

@@ -36,9 +36,10 @@
      *
      * @param components Metadata components to serialize
      * @param out
+     * @param version
      * @throws IOException
      */
-    void serialize(Map<MetadataType, MetadataComponent> components, Version version, DataOutputPlus out) throws IOException;
+    void serialize(Map<MetadataType, MetadataComponent> components, DataOutputPlus out, Version version) throws IOException;
 
     /**
      * Deserialize specified metadata components from given descriptor.

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java
index bfeb930..a683513 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/LegacyMetadataSerializer.java

@@ -23,16 +23,21 @@
 
 import com.google.common.collect.Maps;
 
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.commitlog.IntervalSet;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.EstimatedHistogram;
 import org.apache.cassandra.utils.StreamingHistogram;
 
+import static org.apache.cassandra.io.sstable.metadata.StatsMetadata.replayPositionSetSerializer;
+
 /**
  * Serializer for SSTable from legacy versions
  */
@@ -43,7 +48,7 @@
      * Legacy serialization is only used for SSTable level reset.
      */
     @Override
-    public void serialize(Map<MetadataType, MetadataComponent> components, Version version, DataOutputPlus out) throws IOException
+    public void serialize(Map<MetadataType, MetadataComponent> components, DataOutputPlus out, Version version) throws IOException
     {
         ValidationMetadata validation = (ValidationMetadata) components.get(MetadataType.VALIDATION);
         StatsMetadata stats = (StatsMetadata) components.get(MetadataType.STATS);
@@ -51,28 +56,28 @@
 
         assert validation != null && stats != null && compaction != null && validation.partitioner != null;
 
-        EstimatedHistogram.serializer.serialize(stats.estimatedRowSize, out);
+        EstimatedHistogram.serializer.serialize(stats.estimatedPartitionSize, out);
         EstimatedHistogram.serializer.serialize(stats.estimatedColumnCount, out);
-        ReplayPosition.serializer.serialize(stats.commitLogUpperBound, out);
+        ReplayPosition.serializer.serialize(stats.commitLogIntervals.upperBound().orElse(ReplayPosition.NONE), out);
         out.writeLong(stats.minTimestamp);
         out.writeLong(stats.maxTimestamp);
         out.writeInt(stats.maxLocalDeletionTime);
         out.writeDouble(validation.bloomFilterFPChance);
         out.writeDouble(stats.compressionRatio);
         out.writeUTF(validation.partitioner);
-        out.writeInt(compaction.ancestors.size());
-        for (Integer g : compaction.ancestors)
-            out.writeInt(g);
+        out.writeInt(0); // compaction ancestors
         StreamingHistogram.serializer.serialize(stats.estimatedTombstoneDropTime, out);
         out.writeInt(stats.sstableLevel);
-        out.writeInt(stats.minColumnNames.size());
-        for (ByteBuffer columnName : stats.minColumnNames)
-            ByteBufferUtil.writeWithShortLength(columnName, out);
-        out.writeInt(stats.maxColumnNames.size());
-        for (ByteBuffer columnName : stats.maxColumnNames)
-            ByteBufferUtil.writeWithShortLength(columnName, out);
+        out.writeInt(stats.minClusteringValues.size());
+        for (ByteBuffer value : stats.minClusteringValues)
+            ByteBufferUtil.writeWithShortLength(value, out);
+        out.writeInt(stats.maxClusteringValues.size());
+        for (ByteBuffer value : stats.maxClusteringValues)
+            ByteBufferUtil.writeWithShortLength(value, out);
         if (version.hasCommitLogLowerBound())
-            ReplayPosition.serializer.serialize(stats.commitLogLowerBound, out);
+            ReplayPosition.serializer.serialize(stats.commitLogIntervals.lowerBound().orElse(ReplayPosition.NONE), out);
+        if (version.hasCommitLogIntervals())
+            replayPositionSetSerializer.serialize(stats.commitLogIntervals, out);
     }
 
     /**
@@ -90,9 +95,9 @@
         }
         else
         {
-            try (DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(statsFile))))
+            try (DataInputStreamPlus in = new DataInputStreamPlus(new BufferedInputStream(new FileInputStream(statsFile))))
             {
-                EstimatedHistogram rowSizes = EstimatedHistogram.serializer.deserialize(in);
+                EstimatedHistogram partitionSizes = EstimatedHistogram.serializer.deserialize(in);
                 EstimatedHistogram columnCounts = EstimatedHistogram.serializer.deserialize(in);
                 ReplayPosition commitLogLowerBound = ReplayPosition.NONE;
                 ReplayPosition commitLogUpperBound = ReplayPosition.serializer.deserialize(in);
@@ -102,10 +107,8 @@
                 double bloomFilterFPChance = in.readDouble();
                 double compressionRatio = in.readDouble();
                 String partitioner = in.readUTF();
-                int nbAncestors = in.readInt();
-                Set<Integer> ancestors = new HashSet<>(nbAncestors);
-                for (int i = 0; i < nbAncestors; i++)
-                    ancestors.add(in.readInt());
+                int nbAncestors = in.readInt(); //skip compaction ancestors
+                in.skipBytes(nbAncestors * TypeSizes.sizeof(nbAncestors));
                 StreamingHistogram tombstoneHistogram = StreamingHistogram.serializer.deserialize(in);
                 int sstableLevel = 0;
                 if (in.available() > 0)
@@ -120,31 +123,41 @@
                 List<ByteBuffer> maxColumnNames = new ArrayList<>(colCount);
                 for (int i = 0; i < colCount; i++)
                     maxColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+
                 if (descriptor.version.hasCommitLogLowerBound())
                     commitLogLowerBound = ReplayPosition.serializer.deserialize(in);
+                IntervalSet<ReplayPosition> commitLogIntervals;
+                if (descriptor.version.hasCommitLogIntervals())
+                    commitLogIntervals = replayPositionSetSerializer.deserialize(in);
+                else
+                    commitLogIntervals = new IntervalSet<>(commitLogLowerBound, commitLogUpperBound);
 
                 if (types.contains(MetadataType.VALIDATION))
                     components.put(MetadataType.VALIDATION,
                                    new ValidationMetadata(partitioner, bloomFilterFPChance));
                 if (types.contains(MetadataType.STATS))
                     components.put(MetadataType.STATS,
-                                   new StatsMetadata(rowSizes,
+                                   new StatsMetadata(partitionSizes,
                                                      columnCounts,
-                                                     commitLogLowerBound,
-                                                     commitLogUpperBound,
+                                                     commitLogIntervals,
                                                      minTimestamp,
                                                      maxTimestamp,
+                                                     Integer.MAX_VALUE,
                                                      maxLocalDeletionTime,
+                                                     0,
+                                                     Integer.MAX_VALUE,
                                                      compressionRatio,
                                                      tombstoneHistogram,
                                                      sstableLevel,
                                                      minColumnNames,
                                                      maxColumnNames,
                                                      true,
-                                                     ActiveRepairService.UNREPAIRED_SSTABLE));
+                                                     ActiveRepairService.UNREPAIRED_SSTABLE,
+                                                     -1,
+                                                     -1));
                 if (types.contains(MetadataType.COMPACTION))
                     components.put(MetadataType.COMPACTION,
-                                   new CompactionMetadata(ancestors, null));
+                                   new CompactionMetadata(null));
             }
         }
         return components;

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
index 579ff7a..867e9a1 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java

@@ -17,86 +17,90 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
-import java.io.File;
 import java.nio.ByteBuffer;
-import java.util.Collection;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 
-import com.google.common.collect.ImmutableList;
+import com.google.common.base.Preconditions;
 import com.google.common.collect.Maps;
-import com.google.common.collect.Ordering;
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
 import com.clearspring.analytics.stream.cardinality.ICardinality;
-import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.io.sstable.ColumnNameHelper;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.commitlog.IntervalSet;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionStatisticsCollector;
+import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.EstimatedHistogram;
 import org.apache.cassandra.utils.MurmurHash;
 import org.apache.cassandra.utils.StreamingHistogram;
 
-public class MetadataCollector
+public class MetadataCollector implements PartitionStatisticsCollector
 {
     public static final double NO_COMPRESSION_RATIO = -1.0;
+    private static final ByteBuffer[] EMPTY_CLUSTERING = new ByteBuffer[0];
 
-    static EstimatedHistogram defaultColumnCountHistogram()
+    static EstimatedHistogram defaultCellPerPartitionCountHistogram()
     {
         // EH of 114 can track a max value of 2395318855, i.e., > 2B columns
         return new EstimatedHistogram(114);
     }
 
-    static EstimatedHistogram defaultRowSizeHistogram()
+    static EstimatedHistogram defaultPartitionSizeHistogram()
     {
         // EH of 150 can track a max value of 1697806495183, i.e., > 1.5PB
         return new EstimatedHistogram(150);
     }
 
-    static StreamingHistogram defaultTombstoneDropTimeHistogram()
+    static StreamingHistogram.StreamingHistogramBuilder defaultTombstoneDropTimeHistogramBuilder()
     {
-        return new StreamingHistogram(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE);
+        return new StreamingHistogram.StreamingHistogramBuilder(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE, SSTable.TOMBSTONE_HISTOGRAM_SPOOL_SIZE, SSTable.TOMBSTONE_HISTOGRAM_TTL_ROUND_SECONDS);
     }
 
     public static StatsMetadata defaultStatsMetadata()
     {
-        return new StatsMetadata(defaultRowSizeHistogram(),
-                                 defaultColumnCountHistogram(),
-                                 ReplayPosition.NONE,
-                                 ReplayPosition.NONE,
+        return new StatsMetadata(defaultPartitionSizeHistogram(),
+                                 defaultCellPerPartitionCountHistogram(),
+                                 IntervalSet.empty(),
                                  Long.MIN_VALUE,
                                  Long.MAX_VALUE,
                                  Integer.MAX_VALUE,
+                                 Integer.MAX_VALUE,
+                                 0,
+                                 Integer.MAX_VALUE,
                                  NO_COMPRESSION_RATIO,
-                                 defaultTombstoneDropTimeHistogram(),
+                                 defaultTombstoneDropTimeHistogramBuilder().build(),
                                  0,
                                  Collections.<ByteBuffer>emptyList(),
                                  Collections.<ByteBuffer>emptyList(),
                                  true,
-                                 ActiveRepairService.UNREPAIRED_SSTABLE);
+                                 ActiveRepairService.UNREPAIRED_SSTABLE,
+                                 -1,
+                                 -1);
     }
 
-    protected EstimatedHistogram estimatedRowSize = defaultRowSizeHistogram();
-    protected EstimatedHistogram estimatedColumnCount = defaultColumnCountHistogram();
-    protected ReplayPosition commitLogLowerBound = ReplayPosition.NONE;
-    protected ReplayPosition commitLogUpperBound = ReplayPosition.NONE;
-    protected long minTimestamp = Long.MAX_VALUE;
-    protected long maxTimestamp = Long.MIN_VALUE;
-    protected int maxLocalDeletionTime = Integer.MIN_VALUE;
+    protected EstimatedHistogram estimatedPartitionSize = defaultPartitionSizeHistogram();
+    // TODO: cound the number of row per partition (either with the number of cells, or instead)
+    protected EstimatedHistogram estimatedCellPerPartitionCount = defaultCellPerPartitionCountHistogram();
+    protected IntervalSet commitLogIntervals = IntervalSet.empty();
+    protected final MinMaxLongTracker timestampTracker = new MinMaxLongTracker();
+    protected final MinMaxIntTracker localDeletionTimeTracker = new MinMaxIntTracker(Cell.NO_DELETION_TIME, Cell.NO_DELETION_TIME);
+    protected final MinMaxIntTracker ttlTracker = new MinMaxIntTracker(Cell.NO_TTL, Cell.NO_TTL);
     protected double compressionRatio = NO_COMPRESSION_RATIO;
-    protected Set<Integer> ancestors = new HashSet<>();
-    protected StreamingHistogram estimatedTombstoneDropTime = defaultTombstoneDropTimeHistogram();
+    protected StreamingHistogram.StreamingHistogramBuilder estimatedTombstoneDropTime = defaultTombstoneDropTimeHistogramBuilder();
     protected int sstableLevel;
-    protected List<ByteBuffer> minColumnNames = Collections.emptyList();
-    protected List<ByteBuffer> maxColumnNames = Collections.emptyList();
+    private ClusteringPrefix minClustering = null;
+    private ClusteringPrefix maxClustering = null;
     protected boolean hasLegacyCounterShards = false;
+    protected long totalColumnsSet;
+    protected long totalRows;
 
     /**
      * Default cardinality estimation method is to use HyperLogLog++.
@@ -105,52 +109,26 @@
      * See CASSANDRA-5906 for detail.
      */
     protected ICardinality cardinality = new HyperLogLogPlus(13, 25);
-    private final CellNameType columnNameComparator;
+    private final ClusteringComparator comparator;
 
-    public MetadataCollector(CellNameType columnNameComparator)
+    public MetadataCollector(ClusteringComparator comparator)
     {
-        this.columnNameComparator = columnNameComparator;
+        this.comparator = comparator;
+
     }
 
-    public MetadataCollector(Iterable<SSTableReader> sstables, CellNameType columnNameComparator, int level, boolean skipAncestors)
+    public MetadataCollector(Iterable<SSTableReader> sstables, ClusteringComparator comparator, int level)
     {
-        this(columnNameComparator);
+        this(comparator);
 
-        ReplayPosition min = null, max = null;
+        IntervalSet.Builder intervals = new IntervalSet.Builder();
         for (SSTableReader sstable : sstables)
         {
-            if (min == null)
-            {
-                min = sstable.getSSTableMetadata().commitLogLowerBound;
-                max = sstable.getSSTableMetadata().commitLogUpperBound;
-            }
-            else
-            {
-                min = Ordering.natural().min(min, sstable.getSSTableMetadata().commitLogLowerBound);
-                max = Ordering.natural().max(max, sstable.getSSTableMetadata().commitLogUpperBound);
-            }
+            intervals.addAll(sstable.getSSTableMetadata().commitLogIntervals);
         }
 
-        commitLogLowerBound(min);
-        commitLogUpperBound(max);
+        commitLogIntervals(intervals.build());
         sstableLevel(level);
-        // Get the max timestamp of the precompacted sstables
-        // and adds generation of live ancestors
-        if (!skipAncestors)
-        {
-            for (SSTableReader sstable : sstables)
-            {
-                addAncestor(sstable.descriptor.generation);
-                for (Integer i : sstable.getAncestors())
-                    if (new File(sstable.descriptor.withGeneration(i).filenameFor(Component.DATA)).exists())
-                        addAncestor(i);
-            }
-        }
-    }
-
-    public MetadataCollector(Iterable<SSTableReader> sstables, CellNameType columnNameComparator, int level)
-    {
-        this(sstables, columnNameComparator, level, false);
     }
 
     public MetadataCollector addKey(ByteBuffer key)
@@ -160,15 +138,15 @@
         return this;
     }
 
-    public MetadataCollector addRowSize(long rowSize)
+    public MetadataCollector addPartitionSizeInBytes(long partitionSize)
     {
-        estimatedRowSize.add(rowSize);
+        estimatedPartitionSize.add(partitionSize);
         return this;
     }
 
-    public MetadataCollector addColumnCount(long columnCount)
+    public MetadataCollector addCellPerPartitionCount(long cellCount)
     {
-        estimatedColumnCount.add(columnCount);
+        estimatedCellPerPartitionCount.add(cellCount);
         return this;
     }
 
@@ -188,51 +166,58 @@
         return this;
     }
 
-    public MetadataCollector updateMinTimestamp(long potentialMin)
+    public void update(LivenessInfo newInfo)
     {
-        minTimestamp = Math.min(minTimestamp, potentialMin);
-        return this;
+        if (newInfo.isEmpty())
+            return;
+
+        updateTimestamp(newInfo.timestamp());
+        updateTTL(newInfo.ttl());
+        updateLocalDeletionTime(newInfo.localExpirationTime());
     }
 
-    public MetadataCollector updateMaxTimestamp(long potentialMax)
+    public void update(Cell cell)
     {
-        maxTimestamp = Math.max(maxTimestamp, potentialMax);
-        return this;
+        updateTimestamp(cell.timestamp());
+        updateTTL(cell.ttl());
+        updateLocalDeletionTime(cell.localDeletionTime());
     }
 
-    public MetadataCollector updateMaxLocalDeletionTime(int maxLocalDeletionTime)
+    public void update(DeletionTime dt)
     {
-        this.maxLocalDeletionTime = Math.max(this.maxLocalDeletionTime, maxLocalDeletionTime);
-        return this;
+        if (!dt.isLive())
+        {
+            updateTimestamp(dt.markedForDeleteAt());
+            updateLocalDeletionTime(dt.localDeletionTime());
+        }
     }
 
-    public MetadataCollector estimatedRowSize(EstimatedHistogram estimatedRowSize)
+    public void updateColumnSetPerRow(long columnSetInRow)
     {
-        this.estimatedRowSize = estimatedRowSize;
-        return this;
+        totalColumnsSet += columnSetInRow;
+        ++totalRows;
     }
 
-    public MetadataCollector estimatedColumnCount(EstimatedHistogram estimatedColumnCount)
+    private void updateTimestamp(long newTimestamp)
     {
-        this.estimatedColumnCount = estimatedColumnCount;
-        return this;
+        timestampTracker.update(newTimestamp);
     }
 
-    public MetadataCollector commitLogLowerBound(ReplayPosition commitLogLowerBound)
+    private void updateLocalDeletionTime(int newLocalDeletionTime)
     {
-        this.commitLogLowerBound = commitLogLowerBound;
-        return this;
+        localDeletionTimeTracker.update(newLocalDeletionTime);
+        if (newLocalDeletionTime != Cell.NO_DELETION_TIME)
+            estimatedTombstoneDropTime.update(newLocalDeletionTime);
     }
 
-    public MetadataCollector commitLogUpperBound(ReplayPosition commitLogUpperBound)
+    private void updateTTL(int newTTL)
     {
-        this.commitLogUpperBound = commitLogUpperBound;
-        return this;
+        ttlTracker.update(newTTL);
     }
 
-    public MetadataCollector addAncestor(int generation)
+    public MetadataCollector commitLogIntervals(IntervalSet commitLogIntervals)
     {
-        this.ancestors.add(generation);
+        this.commitLogIntervals = commitLogIntervals;
         return this;
     }
 
@@ -242,59 +227,152 @@
         return this;
     }
 
-    public MetadataCollector updateMinColumnNames(List<ByteBuffer> minColumnNames)
+    public MetadataCollector updateClusteringValues(ClusteringPrefix clustering)
     {
-        if (minColumnNames.size() > 0)
-            this.minColumnNames = ColumnNameHelper.mergeMin(this.minColumnNames, minColumnNames, columnNameComparator);
+        minClustering = minClustering == null || comparator.compare(clustering, minClustering) < 0 ? clustering.minimize() : minClustering;
+        maxClustering = maxClustering == null || comparator.compare(clustering, maxClustering) > 0 ? clustering.minimize() : maxClustering;
         return this;
     }
 
-    public MetadataCollector updateMaxColumnNames(List<ByteBuffer> maxColumnNames)
-    {
-        if (maxColumnNames.size() > 0)
-            this.maxColumnNames = ColumnNameHelper.mergeMax(this.maxColumnNames, maxColumnNames, columnNameComparator);
-        return this;
-    }
-
-    public MetadataCollector updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
+    public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
     {
         this.hasLegacyCounterShards = this.hasLegacyCounterShards || hasLegacyCounterShards;
-        return this;
     }
 
-    public MetadataCollector update(long rowSize, ColumnStats stats)
+    public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner, double bloomFilterFPChance, long repairedAt, SerializationHeader header)
     {
-        updateMinTimestamp(stats.minTimestamp);
-        updateMaxTimestamp(stats.maxTimestamp);
-        updateMaxLocalDeletionTime(stats.maxLocalDeletionTime);
-        addRowSize(rowSize);
-        addColumnCount(stats.columnCount);
-        mergeTombstoneHistogram(stats.tombstoneHistogram);
-        updateMinColumnNames(stats.minColumnNames);
-        updateMaxColumnNames(stats.maxColumnNames);
-        updateHasLegacyCounterShards(stats.hasLegacyCounterShards);
-        return this;
-    }
-
-    public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner, double bloomFilterFPChance, long repairedAt)
-    {
+        Preconditions.checkState((minClustering == null && maxClustering == null)
+                                 || comparator.compare(maxClustering, minClustering) >= 0);
+        ByteBuffer[] minValues = minClustering != null ? minClustering.getRawValues() : EMPTY_CLUSTERING;
+        ByteBuffer[] maxValues = maxClustering != null ? maxClustering.getRawValues() : EMPTY_CLUSTERING;
         Map<MetadataType, MetadataComponent> components = Maps.newHashMap();
         components.put(MetadataType.VALIDATION, new ValidationMetadata(partitioner, bloomFilterFPChance));
-        components.put(MetadataType.STATS, new StatsMetadata(estimatedRowSize,
-                                                             estimatedColumnCount,
-                                                             commitLogLowerBound,
-                                                             commitLogUpperBound,
-                                                             minTimestamp,
-                                                             maxTimestamp,
-                                                             maxLocalDeletionTime,
+        components.put(MetadataType.STATS, new StatsMetadata(estimatedPartitionSize,
+                                                             estimatedCellPerPartitionCount,
+                                                             commitLogIntervals,
+                                                             timestampTracker.min(),
+                                                             timestampTracker.max(),
+                                                             localDeletionTimeTracker.min(),
+                                                             localDeletionTimeTracker.max(),
+                                                             ttlTracker.min(),
+                                                             ttlTracker.max(),
                                                              compressionRatio,
-                                                             estimatedTombstoneDropTime,
+                                                             estimatedTombstoneDropTime.build(),
                                                              sstableLevel,
-                                                             ImmutableList.copyOf(minColumnNames),
-                                                             ImmutableList.copyOf(maxColumnNames),
+                                                             makeList(minValues),
+                                                             makeList(maxValues),
                                                              hasLegacyCounterShards,
-                                                             repairedAt));
-        components.put(MetadataType.COMPACTION, new CompactionMetadata(ancestors, cardinality));
+                                                             repairedAt,
+                                                             totalColumnsSet,
+                                                             totalRows));
+        components.put(MetadataType.COMPACTION, new CompactionMetadata(cardinality));
+        components.put(MetadataType.HEADER, header.toComponent());
         return components;
     }
+
+    private static List<ByteBuffer> makeList(ByteBuffer[] values)
+    {
+        // In most case, l will be the same size than values, but it's possible for it to be smaller
+        List<ByteBuffer> l = new ArrayList<ByteBuffer>(values.length);
+        for (int i = 0; i < values.length; i++)
+            if (values[i] == null)
+                break;
+            else
+                l.add(values[i]);
+        return l;
+    }
+
+    public static class MinMaxLongTracker
+    {
+        private final long defaultMin;
+        private final long defaultMax;
+
+        private boolean isSet = false;
+        private long min;
+        private long max;
+
+        public MinMaxLongTracker()
+        {
+            this(Long.MIN_VALUE, Long.MAX_VALUE);
+        }
+
+        public MinMaxLongTracker(long defaultMin, long defaultMax)
+        {
+            this.defaultMin = defaultMin;
+            this.defaultMax = defaultMax;
+        }
+
+        public void update(long value)
+        {
+            if (!isSet)
+            {
+                min = max = value;
+                isSet = true;
+            }
+            else
+            {
+                if (value < min)
+                    min = value;
+                if (value > max)
+                    max = value;
+            }
+        }
+
+        public long min()
+        {
+            return isSet ? min : defaultMin;
+        }
+
+        public long max()
+        {
+            return isSet ? max : defaultMax;
+        }
+    }
+
+    public static class MinMaxIntTracker
+    {
+        private final int defaultMin;
+        private final int defaultMax;
+
+        private boolean isSet = false;
+        private int min;
+        private int max;
+
+        public MinMaxIntTracker()
+        {
+            this(Integer.MIN_VALUE, Integer.MAX_VALUE);
+        }
+
+        public MinMaxIntTracker(int defaultMin, int defaultMax)
+        {
+            this.defaultMin = defaultMin;
+            this.defaultMax = defaultMax;
+        }
+
+        public void update(int value)
+        {
+            if (!isSet)
+            {
+                min = max = value;
+                isSet = true;
+            }
+            else
+            {
+                if (value < min)
+                    min = value;
+                if (value > max)
+                    max = value;
+            }
+        }
+
+        public int min()
+        {
+            return isSet ? min : defaultMin;
+        }
+
+        public int max()
+        {
+            return isSet ? max : defaultMax;
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
index ca7fe82..635adcd 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java

@@ -50,7 +50,7 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(MetadataSerializer.class);
 
-    public void serialize(Map<MetadataType, MetadataComponent> components, Version version, DataOutputPlus out) throws IOException
+    public void serialize(Map<MetadataType, MetadataComponent> components, DataOutputPlus out, Version version) throws IOException
     {
         // sort components by type
         List<MetadataComponent> sortedComponents = Lists.newArrayList(components.values());
@@ -67,16 +67,16 @@
             out.writeInt(type.ordinal());
             // serialize position
             out.writeInt(lastPosition);
-            lastPosition += type.serializer.serializedSize(component, version);
+            lastPosition += type.serializer.serializedSize(version, component);
         }
         // serialize components
         for (MetadataComponent component : sortedComponents)
         {
-            component.getType().serializer.serialize(component, version, out);
+            component.getType().serializer.serialize(version, component, out);
         }
     }
 
-    public Map<MetadataType, MetadataComponent> deserialize(Descriptor descriptor, EnumSet<MetadataType> types) throws IOException
+    public Map<MetadataType, MetadataComponent> deserialize( Descriptor descriptor, EnumSet<MetadataType> types) throws IOException
     {
         Map<MetadataType, MetadataComponent> components;
         logger.trace("Load metadata for {}", descriptor);
@@ -116,14 +116,13 @@
         }
         for (MetadataType type : types)
         {
-            MetadataComponent component = null;
             Integer offset = toc.get(type);
             if (offset != null)
             {
                 in.seek(offset);
-                component = type.serializer.deserialize(descriptor.version, in);
+                MetadataComponent component = type.serializer.deserialize(descriptor.version, in);
+                components.put(type, component);
             }
-            components.put(type, component);
         }
         return components;
     }
@@ -150,17 +149,16 @@
 
     private void rewriteSSTableMetadata(Descriptor descriptor, Map<MetadataType, MetadataComponent> currentComponents) throws IOException
     {
-        Descriptor tmpDescriptor = descriptor.asType(Descriptor.Type.TEMP);
-
-        try (DataOutputStreamPlus out = new BufferedDataOutputStreamPlus(new FileOutputStream(tmpDescriptor.filenameFor(Component.STATS))))
+        String filePath = descriptor.tmpFilenameFor(Component.STATS);
+        try (DataOutputStreamPlus out = new BufferedDataOutputStreamPlus(new FileOutputStream(filePath)))
         {
-            serialize(currentComponents, descriptor.version, out);
+            serialize(currentComponents, out, descriptor.version);
             out.flush();
         }
         // we cant move a file on top of another file in windows:
         if (FBUtilities.isWindows())
             FileUtils.delete(descriptor.filenameFor(Component.STATS));
-        FileUtils.renameWithConfirm(tmpDescriptor.filenameFor(Component.STATS), descriptor.filenameFor(Component.STATS));
+        FileUtils.renameWithConfirm(filePath, descriptor.filenameFor(Component.STATS));
 
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java
index 9717da1..875cec4 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataType.java

@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
+import org.apache.cassandra.db.SerializationHeader;
+
 /**
  * Defines Metadata component type.
  */
@@ -27,7 +29,9 @@
     /** Metadata only used at compaction */
     COMPACTION(CompactionMetadata.serializer),
     /** Metadata always keep in memory */
-    STATS(StatsMetadata.serializer);
+    STATS(StatsMetadata.serializer),
+    /** Serialization header */
+    HEADER((IMetadataComponentSerializer)SerializationHeader.serializer);
 
     public final IMetadataComponentSerializer<MetadataComponent> serializer;
 

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
index 3d48e34..1994bca 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java

@@ -17,18 +17,19 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.commons.lang3.builder.EqualsBuilder;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
-
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.commitlog.IntervalSet;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.EstimatedHistogram;
@@ -40,51 +41,64 @@
 public class StatsMetadata extends MetadataComponent
 {
     public static final IMetadataComponentSerializer serializer = new StatsMetadataSerializer();
+    public static final ISerializer<IntervalSet<ReplayPosition>> replayPositionSetSerializer = IntervalSet.serializer(ReplayPosition.serializer);
 
-    public final EstimatedHistogram estimatedRowSize;
+    public final EstimatedHistogram estimatedPartitionSize;
     public final EstimatedHistogram estimatedColumnCount;
-    public final ReplayPosition commitLogLowerBound;
-    public final ReplayPosition commitLogUpperBound;
+    public final IntervalSet<ReplayPosition> commitLogIntervals;
     public final long minTimestamp;
     public final long maxTimestamp;
+    public final int minLocalDeletionTime;
     public final int maxLocalDeletionTime;
+    public final int minTTL;
+    public final int maxTTL;
     public final double compressionRatio;
     public final StreamingHistogram estimatedTombstoneDropTime;
     public final int sstableLevel;
-    public final List<ByteBuffer> maxColumnNames;
-    public final List<ByteBuffer> minColumnNames;
+    public final List<ByteBuffer> minClusteringValues;
+    public final List<ByteBuffer> maxClusteringValues;
     public final boolean hasLegacyCounterShards;
     public final long repairedAt;
+    public final long totalColumnsSet;
+    public final long totalRows;
 
-    public StatsMetadata(EstimatedHistogram estimatedRowSize,
+    public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
                          EstimatedHistogram estimatedColumnCount,
-                         ReplayPosition commitLogLowerBound,
-                         ReplayPosition commitLogUpperBound,
+                         IntervalSet<ReplayPosition> commitLogIntervals,
                          long minTimestamp,
                          long maxTimestamp,
+                         int minLocalDeletionTime,
                          int maxLocalDeletionTime,
+                         int minTTL,
+                         int maxTTL,
                          double compressionRatio,
                          StreamingHistogram estimatedTombstoneDropTime,
                          int sstableLevel,
-                         List<ByteBuffer> minColumnNames,
-                         List<ByteBuffer> maxColumnNames,
+                         List<ByteBuffer> minClusteringValues,
+                         List<ByteBuffer> maxClusteringValues,
                          boolean hasLegacyCounterShards,
-                         long repairedAt)
+                         long repairedAt,
+                         long totalColumnsSet,
+                         long totalRows)
     {
-        this.estimatedRowSize = estimatedRowSize;
+        this.estimatedPartitionSize = estimatedPartitionSize;
         this.estimatedColumnCount = estimatedColumnCount;
-        this.commitLogLowerBound = commitLogLowerBound;
-        this.commitLogUpperBound = commitLogUpperBound;
+        this.commitLogIntervals = commitLogIntervals;
         this.minTimestamp = minTimestamp;
         this.maxTimestamp = maxTimestamp;
+        this.minLocalDeletionTime = minLocalDeletionTime;
         this.maxLocalDeletionTime = maxLocalDeletionTime;
+        this.minTTL = minTTL;
+        this.maxTTL = maxTTL;
         this.compressionRatio = compressionRatio;
         this.estimatedTombstoneDropTime = estimatedTombstoneDropTime;
         this.sstableLevel = sstableLevel;
-        this.minColumnNames = minColumnNames;
-        this.maxColumnNames = maxColumnNames;
+        this.minClusteringValues = minClusteringValues;
+        this.maxClusteringValues = maxClusteringValues;
         this.hasLegacyCounterShards = hasLegacyCounterShards;
         this.repairedAt = repairedAt;
+        this.totalColumnsSet = totalColumnsSet;
+        this.totalRows = totalRows;
     }
 
     public MetadataType getType()
@@ -118,38 +132,46 @@
 
     public StatsMetadata mutateLevel(int newLevel)
     {
-        return new StatsMetadata(estimatedRowSize,
+        return new StatsMetadata(estimatedPartitionSize,
                                  estimatedColumnCount,
-                                 commitLogLowerBound,
-                                 commitLogUpperBound,
+                                 commitLogIntervals,
                                  minTimestamp,
                                  maxTimestamp,
+                                 minLocalDeletionTime,
                                  maxLocalDeletionTime,
+                                 minTTL,
+                                 maxTTL,
                                  compressionRatio,
                                  estimatedTombstoneDropTime,
                                  newLevel,
-                                 minColumnNames,
-                                 maxColumnNames,
+                                 minClusteringValues,
+                                 maxClusteringValues,
                                  hasLegacyCounterShards,
-                                 repairedAt);
+                                 repairedAt,
+                                 totalColumnsSet,
+                                 totalRows);
     }
 
     public StatsMetadata mutateRepairedAt(long newRepairedAt)
     {
-        return new StatsMetadata(estimatedRowSize,
+        return new StatsMetadata(estimatedPartitionSize,
                                  estimatedColumnCount,
-                                 commitLogLowerBound,
-                                 commitLogUpperBound,
+                                 commitLogIntervals,
                                  minTimestamp,
                                  maxTimestamp,
+                                 minLocalDeletionTime,
                                  maxLocalDeletionTime,
+                                 minTTL,
+                                 maxTTL,
                                  compressionRatio,
                                  estimatedTombstoneDropTime,
                                  sstableLevel,
-                                 minColumnNames,
-                                 maxColumnNames,
+                                 minClusteringValues,
+                                 maxClusteringValues,
                                  hasLegacyCounterShards,
-                                 newRepairedAt);
+                                 newRepairedAt,
+                                 totalColumnsSet,
+                                 totalRows);
     }
 
     @Override
@@ -160,20 +182,24 @@
 
         StatsMetadata that = (StatsMetadata) o;
         return new EqualsBuilder()
-                       .append(estimatedRowSize, that.estimatedRowSize)
+                       .append(estimatedPartitionSize, that.estimatedPartitionSize)
                        .append(estimatedColumnCount, that.estimatedColumnCount)
-                       .append(commitLogLowerBound, that.commitLogLowerBound)
-                       .append(commitLogUpperBound, that.commitLogUpperBound)
+                       .append(commitLogIntervals, that.commitLogIntervals)
                        .append(minTimestamp, that.minTimestamp)
                        .append(maxTimestamp, that.maxTimestamp)
+                       .append(minLocalDeletionTime, that.minLocalDeletionTime)
                        .append(maxLocalDeletionTime, that.maxLocalDeletionTime)
+                       .append(minTTL, that.minTTL)
+                       .append(maxTTL, that.maxTTL)
                        .append(compressionRatio, that.compressionRatio)
                        .append(estimatedTombstoneDropTime, that.estimatedTombstoneDropTime)
                        .append(sstableLevel, that.sstableLevel)
                        .append(repairedAt, that.repairedAt)
-                       .append(maxColumnNames, that.maxColumnNames)
-                       .append(minColumnNames, that.minColumnNames)
+                       .append(maxClusteringValues, that.maxClusteringValues)
+                       .append(minClusteringValues, that.minClusteringValues)
                        .append(hasLegacyCounterShards, that.hasLegacyCounterShards)
+                       .append(totalColumnsSet, that.totalColumnsSet)
+                       .append(totalRows, that.totalRows)
                        .build();
     }
 
@@ -181,80 +207,111 @@
     public int hashCode()
     {
         return new HashCodeBuilder()
-                       .append(estimatedRowSize)
+                       .append(estimatedPartitionSize)
                        .append(estimatedColumnCount)
-                       .append(commitLogLowerBound)
-                       .append(commitLogUpperBound)
+                       .append(commitLogIntervals)
                        .append(minTimestamp)
                        .append(maxTimestamp)
+                       .append(minLocalDeletionTime)
                        .append(maxLocalDeletionTime)
+                       .append(minTTL)
+                       .append(maxTTL)
                        .append(compressionRatio)
                        .append(estimatedTombstoneDropTime)
                        .append(sstableLevel)
                        .append(repairedAt)
-                       .append(maxColumnNames)
-                       .append(minColumnNames)
+                       .append(maxClusteringValues)
+                       .append(minClusteringValues)
                        .append(hasLegacyCounterShards)
+                       .append(totalColumnsSet)
+                       .append(totalRows)
                        .build();
     }
 
     public static class StatsMetadataSerializer implements IMetadataComponentSerializer<StatsMetadata>
     {
-        public int serializedSize(StatsMetadata component, Version version) throws IOException
+        public int serializedSize(Version version, StatsMetadata component) throws IOException
         {
             int size = 0;
-            size += EstimatedHistogram.serializer.serializedSize(component.estimatedRowSize, TypeSizes.NATIVE);
-            size += EstimatedHistogram.serializer.serializedSize(component.estimatedColumnCount, TypeSizes.NATIVE);
-            size += ReplayPosition.serializer.serializedSize(component.commitLogUpperBound, TypeSizes.NATIVE);
-            size += 8 + 8 + 4 + 8 + 8; // mix/max timestamp(long), maxLocalDeletionTime(int), compressionRatio(double), repairedAt (long)
-            size += StreamingHistogram.serializer.serializedSize(component.estimatedTombstoneDropTime, TypeSizes.NATIVE);
-            size += TypeSizes.NATIVE.sizeof(component.sstableLevel);
+            size += EstimatedHistogram.serializer.serializedSize(component.estimatedPartitionSize);
+            size += EstimatedHistogram.serializer.serializedSize(component.estimatedColumnCount);
+            size += ReplayPosition.serializer.serializedSize(component.commitLogIntervals.upperBound().orElse(ReplayPosition.NONE));
+            if (version.storeRows())
+                size += 8 + 8 + 4 + 4 + 4 + 4 + 8 + 8; // mix/max timestamp(long), min/maxLocalDeletionTime(int), min/max TTL, compressionRatio(double), repairedAt (long)
+            else
+                size += 8 + 8 + 4 + 8 + 8; // mix/max timestamp(long), maxLocalDeletionTime(int), compressionRatio(double), repairedAt (long)
+            size += StreamingHistogram.serializer.serializedSize(component.estimatedTombstoneDropTime);
+            size += TypeSizes.sizeof(component.sstableLevel);
             // min column names
             size += 4;
-            for (ByteBuffer columnName : component.minColumnNames)
-                size += 2 + columnName.remaining(); // with short length
+            for (ByteBuffer value : component.minClusteringValues)
+                size += 2 + value.remaining(); // with short length
             // max column names
             size += 4;
-            for (ByteBuffer columnName : component.maxColumnNames)
-                size += 2 + columnName.remaining(); // with short length
-            size += TypeSizes.NATIVE.sizeof(component.hasLegacyCounterShards);
+            for (ByteBuffer value : component.maxClusteringValues)
+                size += 2 + value.remaining(); // with short length
+            size += TypeSizes.sizeof(component.hasLegacyCounterShards);
+            if (version.storeRows())
+                size += 8 + 8; // totalColumnsSet, totalRows
             if (version.hasCommitLogLowerBound())
-                size += ReplayPosition.serializer.serializedSize(component.commitLogLowerBound, TypeSizes.NATIVE);
+                size += ReplayPosition.serializer.serializedSize(component.commitLogIntervals.lowerBound().orElse(ReplayPosition.NONE));
+            if (version.hasCommitLogIntervals())
+                size += replayPositionSetSerializer.serializedSize(component.commitLogIntervals);
             return size;
         }
 
-        public void serialize(StatsMetadata component, Version version, DataOutputPlus out) throws IOException
+        public void serialize(Version version, StatsMetadata component, DataOutputPlus out) throws IOException
         {
-            EstimatedHistogram.serializer.serialize(component.estimatedRowSize, out);
+            EstimatedHistogram.serializer.serialize(component.estimatedPartitionSize, out);
             EstimatedHistogram.serializer.serialize(component.estimatedColumnCount, out);
-            ReplayPosition.serializer.serialize(component.commitLogUpperBound, out);
+            ReplayPosition.serializer.serialize(component.commitLogIntervals.upperBound().orElse(ReplayPosition.NONE), out);
             out.writeLong(component.minTimestamp);
             out.writeLong(component.maxTimestamp);
+            if (version.storeRows())
+                out.writeInt(component.minLocalDeletionTime);
             out.writeInt(component.maxLocalDeletionTime);
+            if (version.storeRows())
+            {
+                out.writeInt(component.minTTL);
+                out.writeInt(component.maxTTL);
+            }
             out.writeDouble(component.compressionRatio);
             StreamingHistogram.serializer.serialize(component.estimatedTombstoneDropTime, out);
             out.writeInt(component.sstableLevel);
             out.writeLong(component.repairedAt);
-            out.writeInt(component.minColumnNames.size());
-            for (ByteBuffer columnName : component.minColumnNames)
-                ByteBufferUtil.writeWithShortLength(columnName, out);
-            out.writeInt(component.maxColumnNames.size());
-            for (ByteBuffer columnName : component.maxColumnNames)
-                ByteBufferUtil.writeWithShortLength(columnName, out);
+            out.writeInt(component.minClusteringValues.size());
+            for (ByteBuffer value : component.minClusteringValues)
+                ByteBufferUtil.writeWithShortLength(value, out);
+            out.writeInt(component.maxClusteringValues.size());
+            for (ByteBuffer value : component.maxClusteringValues)
+                ByteBufferUtil.writeWithShortLength(value, out);
             out.writeBoolean(component.hasLegacyCounterShards);
+
+            if (version.storeRows())
+            {
+                out.writeLong(component.totalColumnsSet);
+                out.writeLong(component.totalRows);
+            }
+
             if (version.hasCommitLogLowerBound())
-                ReplayPosition.serializer.serialize(component.commitLogLowerBound, out);
+                ReplayPosition.serializer.serialize(component.commitLogIntervals.lowerBound().orElse(ReplayPosition.NONE), out);
+            if (version.hasCommitLogIntervals())
+                replayPositionSetSerializer.serialize(component.commitLogIntervals, out);
         }
 
-        public StatsMetadata deserialize(Version version, DataInput in) throws IOException
+        public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOException
         {
-            EstimatedHistogram rowSizes = EstimatedHistogram.serializer.deserialize(in);
+            EstimatedHistogram partitionSizes = EstimatedHistogram.serializer.deserialize(in);
             EstimatedHistogram columnCounts = EstimatedHistogram.serializer.deserialize(in);
             ReplayPosition commitLogLowerBound = ReplayPosition.NONE, commitLogUpperBound;
             commitLogUpperBound = ReplayPosition.serializer.deserialize(in);
             long minTimestamp = in.readLong();
             long maxTimestamp = in.readLong();
+            // We use MAX_VALUE as that's the default value for "no deletion time"
+            int minLocalDeletionTime = version.storeRows() ? in.readInt() : Integer.MAX_VALUE;
             int maxLocalDeletionTime = in.readInt();
+            int minTTL = version.storeRows() ? in.readInt() : 0;
+            int maxTTL = version.storeRows() ? in.readInt() : Integer.MAX_VALUE;
             double compressionRatio = in.readDouble();
             StreamingHistogram tombstoneHistogram = StreamingHistogram.serializer.deserialize(in);
             int sstableLevel = in.readInt();
@@ -262,36 +319,59 @@
             if (version.hasRepairedAt())
                 repairedAt = in.readLong();
 
+            // for legacy sstables, we skip deserializing the min and max clustering value
+            // to prevent erroneously excluding sstables from reads (see CASSANDRA-14861)
             int colCount = in.readInt();
-            List<ByteBuffer> minColumnNames = new ArrayList<>(colCount);
+            List<ByteBuffer> minClusteringValues = new ArrayList<>(colCount);
             for (int i = 0; i < colCount; i++)
-                minColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+            {
+                ByteBuffer val = ByteBufferUtil.readWithShortLength(in);
+                if (version.hasAccurateMinMax())
+                    minClusteringValues.add(val);
+            }
 
             colCount = in.readInt();
-            List<ByteBuffer> maxColumnNames = new ArrayList<>(colCount);
+            List<ByteBuffer> maxClusteringValues = new ArrayList<>(colCount);
             for (int i = 0; i < colCount; i++)
-                maxColumnNames.add(ByteBufferUtil.readWithShortLength(in));
+            {
+                ByteBuffer val = ByteBufferUtil.readWithShortLength(in);
+                if (version.hasAccurateMinMax())
+                    maxClusteringValues.add(val);
+            }
 
             boolean hasLegacyCounterShards = true;
             if (version.tracksLegacyCounterShards())
                 hasLegacyCounterShards = in.readBoolean();
 
+            long totalColumnsSet = version.storeRows() ? in.readLong() : -1L;
+            long totalRows = version.storeRows() ? in.readLong() : -1L;
+
             if (version.hasCommitLogLowerBound())
                 commitLogLowerBound = ReplayPosition.serializer.deserialize(in);
-            return new StatsMetadata(rowSizes,
+            IntervalSet<ReplayPosition> commitLogIntervals;
+            if (version.hasCommitLogIntervals())
+                commitLogIntervals = replayPositionSetSerializer.deserialize(in);
+            else
+                commitLogIntervals = new IntervalSet<ReplayPosition>(commitLogLowerBound, commitLogUpperBound);
+
+            return new StatsMetadata(partitionSizes,
                                      columnCounts,
-                                     commitLogLowerBound,
-                                     commitLogUpperBound,
+                                     commitLogIntervals,
                                      minTimestamp,
                                      maxTimestamp,
+                                     minLocalDeletionTime,
                                      maxLocalDeletionTime,
+                                     minTTL,
+                                     maxTTL,
                                      compressionRatio,
                                      tombstoneHistogram,
                                      sstableLevel,
-                                     minColumnNames,
-                                     maxColumnNames,
+                                     minClusteringValues,
+                                     maxClusteringValues,
                                      hasLegacyCounterShards,
-                                     repairedAt);
+                                     repairedAt,
+                                     totalColumnsSet,
+                                     totalRows);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java
index 4ca078b..0eda8eb 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/ValidationMetadata.java

@@ -17,11 +17,11 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
@@ -71,18 +71,18 @@
 
     public static class ValidationMetadataSerializer implements IMetadataComponentSerializer<ValidationMetadata>
     {
-        public int serializedSize(ValidationMetadata component, Version version) throws IOException
+        public int serializedSize(Version version, ValidationMetadata component) throws IOException
         {
-            return TypeSizes.NATIVE.sizeof(component.partitioner) + 8;
+            return TypeSizes.sizeof(component.partitioner) + 8;
         }
 
-        public void serialize(ValidationMetadata component, Version version, DataOutputPlus out) throws IOException
+        public void serialize(Version version, ValidationMetadata component, DataOutputPlus out) throws IOException
         {
             out.writeUTF(component.partitioner);
             out.writeDouble(component.bloomFilterFPChance);
         }
 
-        public ValidationMetadata deserialize(Version version, DataInput in) throws IOException
+        public ValidationMetadata deserialize(Version version, DataInputPlus in) throws IOException
         {
 
             return new ValidationMetadata(in.readUTF(), in.readDouble());

diff --git a/src/java/org/apache/cassandra/io/util/AbstractDataInput.java b/src/java/org/apache/cassandra/io/util/AbstractDataInput.java
deleted file mode 100644
index 588540d..0000000
--- a/src/java/org/apache/cassandra/io/util/AbstractDataInput.java
+++ /dev/null

@@ -1,343 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.util;
-
-import java.io.*;
-
-public abstract class AbstractDataInput extends InputStream implements DataInput
-{
-    public abstract void seek(long position) throws IOException;
-    public abstract long getPosition();
-    public abstract long getPositionLimit();
-
-    public int skipBytes(int n) throws IOException
-    {
-        if (n <= 0)
-            return 0;
-        long oldPosition = getPosition();
-        seek(Math.min(getPositionLimit(), oldPosition + n));
-        long skipped = getPosition() - oldPosition;
-        assert skipped >= 0 && skipped <= n;
-        return (int) skipped;
-    }
-
-    /**
-     * Reads a boolean from the current position in this file. Blocks until one
-     * byte has been read, the end of the file is reached or an exception is
-     * thrown.
-     *
-     * @return the next boolean value from this file.
-     * @throws java.io.EOFException
-     *             if the end of this file is detected.
-     * @throws java.io.IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final boolean readBoolean() throws IOException {
-        int temp = this.read();
-        if (temp < 0) {
-            throw new EOFException();
-        }
-        return temp != 0;
-    }
-
-    /**
-     * Reads an 8-bit byte from the current position in this file. Blocks until
-     * one byte has been read, the end of the file is reached or an exception is
-     * thrown.
-     *
-     * @return the next signed 8-bit byte value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final byte readByte() throws IOException {
-        int temp = this.read();
-        if (temp < 0) {
-            throw new EOFException();
-        }
-        return (byte) temp;
-    }
-
-    /**
-     * Reads a 16-bit character from the current position in this file. Blocks until
-     * two bytes have been read, the end of the file is reached or an exception is
-     * thrown.
-     *
-     * @return the next char value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final char readChar() throws IOException {
-        int ch1 = this.read();
-        int ch2 = this.read();
-        if ((ch1 | ch2) < 0)
-            throw new EOFException();
-        return (char)((ch1 << 8) + (ch2 << 0));
-    }
-
-    /**
-     * Reads a 64-bit double from the current position in this file. Blocks
-     * until eight bytes have been read, the end of the file is reached or an
-     * exception is thrown.
-     *
-     * @return the next double value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final double readDouble() throws IOException {
-        return Double.longBitsToDouble(readLong());
-    }
-
-    /**
-     * Reads a 32-bit float from the current position in this file. Blocks
-     * until four bytes have been read, the end of the file is reached or an
-     * exception is thrown.
-     *
-     * @return the next float value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final float readFloat() throws IOException {
-        return Float.intBitsToFloat(readInt());
-    }
-
-    /**
-     * Reads bytes from this file into {@code buffer}. Blocks until {@code
-     * buffer.length} number of bytes have been read, the end of the file is
-     * reached or an exception is thrown.
-     *
-     * @param buffer
-     *            the buffer to read bytes into.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     * @throws NullPointerException
-     *             if {@code buffer} is {@code null}.
-     */
-    public void readFully(byte[] buffer) throws IOException
-    {
-        readFully(buffer, 0, buffer.length);
-    }
-
-    /**
-     * Read bytes from this file into {@code buffer} starting at offset {@code
-     * offset}. This method blocks until {@code count} number of bytes have been
-     * read.
-     *
-     * @param buffer
-     *            the buffer to read bytes into.
-     * @param offset
-     *            the initial position in {@code buffer} to store the bytes read
-     *            from this file.
-     * @param count
-     *            the maximum number of bytes to store in {@code buffer}.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IndexOutOfBoundsException
-     *             if {@code offset < 0} or {@code count < 0}, or if {@code
-     *             offset + count} is greater than the length of {@code buffer}.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     * @throws NullPointerException
-     *             if {@code buffer} is {@code null}.
-     */
-    public void readFully(byte[] buffer, int offset, int count) throws IOException
-    {
-        if (buffer == null) {
-            throw new NullPointerException();
-        }
-        // avoid int overflow
-        if (offset < 0 || offset > buffer.length || count < 0
-                || count > buffer.length - offset) {
-            throw new IndexOutOfBoundsException();
-        }
-        while (count > 0) {
-            int result = read(buffer, offset, count);
-            if (result < 0) {
-                throw new EOFException();
-            }
-            offset += result;
-            count -= result;
-        }
-    }
-
-    /**
-     * Reads a 32-bit integer from the current position in this file. Blocks
-     * until four bytes have been read, the end of the file is reached or an
-     * exception is thrown.
-     *
-     * @return the next int value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public int readInt() throws IOException {
-        int ch1 = this.read();
-        int ch2 = this.read();
-        int ch3 = this.read();
-        int ch4 = this.read();
-        if ((ch1 | ch2 | ch3 | ch4) < 0)
-            throw new EOFException();
-        return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
-    }
-
-    /**
-     * Reads a line of text form the current position in this file. A line is
-     * represented by zero or more characters followed by {@code '\n'}, {@code
-     * '\r'}, {@code "\r\n"} or the end of file marker. The string does not
-     * include the line terminating sequence.
-     * <p>
-     * Blocks until a line terminating sequence has been read, the end of the
-     * file is reached or an exception is thrown.
-     *
-     * @return the contents of the line or {@code null} if no characters have
-     *         been read before the end of the file has been reached.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final String readLine() throws IOException {
-        StringBuilder line = new StringBuilder(80); // Typical line length
-        boolean foundTerminator = false;
-        long unreadPosition = -1;
-        while (true) {
-            int nextByte = read();
-            switch (nextByte) {
-                case -1:
-                    return line.length() != 0 ? line.toString() : null;
-                case (byte) '\r':
-                    if (foundTerminator) {
-                        seek(unreadPosition);
-                        return line.toString();
-                    }
-                    foundTerminator = true;
-                    /* Have to be able to peek ahead one byte */
-                    unreadPosition = getPosition();
-                    break;
-                case (byte) '\n':
-                    return line.toString();
-                default:
-                    if (foundTerminator) {
-                        seek(unreadPosition);
-                        return line.toString();
-                    }
-                    line.append((char) nextByte);
-            }
-        }
-    }
-
-    /**
-     * Reads a 64-bit long from the current position in this file. Blocks until
-     * eight bytes have been read, the end of the file is reached or an
-     * exception is thrown.
-     *
-     * @return the next long value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public long readLong() throws IOException {
-        return ((long)(readInt()) << 32) + (readInt() & 0xFFFFFFFFL);
-    }
-
-    /**
-     * Reads a 16-bit short from the current position in this file. Blocks until
-     * two bytes have been read, the end of the file is reached or an exception
-     * is thrown.
-     *
-     * @return the next short value from this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public short readShort() throws IOException {
-        int ch1 = this.read();
-        int ch2 = this.read();
-        if ((ch1 | ch2) < 0)
-            throw new EOFException();
-        return (short)((ch1 << 8) + (ch2 << 0));
-    }
-
-    /**
-     * Reads an unsigned 8-bit byte from the current position in this file and
-     * returns it as an integer. Blocks until one byte has been read, the end of
-     * the file is reached or an exception is thrown.
-     *
-     * @return the next unsigned byte value from this file as an int.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public final int readUnsignedByte() throws IOException {
-        int temp = this.read();
-        if (temp < 0) {
-            throw new EOFException();
-        }
-        return temp;
-    }
-
-    /**
-     * Reads an unsigned 16-bit short from the current position in this file and
-     * returns it as an integer. Blocks until two bytes have been read, the end of
-     * the file is reached or an exception is thrown.
-     *
-     * @return the next unsigned short value from this file as an int.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     */
-    public int readUnsignedShort() throws IOException {
-        int ch1 = this.read();
-        int ch2 = this.read();
-        if ((ch1 | ch2) < 0)
-            throw new EOFException();
-        return (ch1 << 8) + (ch2 << 0);
-    }
-
-    /**
-     * Reads a string that is encoded in {@link java.io.DataInput modified UTF-8} from
-     * this file. The number of bytes that must be read for the complete string
-     * is determined by the first two bytes read from the file. Blocks until all
-     * required bytes have been read, the end of the file is reached or an
-     * exception is thrown.
-     *
-     * @return the next string encoded in {@link java.io.DataInput modified UTF-8} from
-     *         this file.
-     * @throws EOFException
-     *             if the end of this file is detected.
-     * @throws IOException
-     *             if this file is closed or another I/O error occurs.
-     * @throws java.io.UTFDataFormatException
-     *             if the bytes read cannot be decoded into a character string.
-     */
-    public final String readUTF() throws IOException {
-        return DataInputStream.readUTF(this);
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java b/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java
index d55db47..54122ee 100644
--- a/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java
+++ b/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java

@@ -27,9 +27,11 @@
 import com.google.common.base.Function;
 import com.google.common.base.Preconditions;
 
+import net.nicoulaj.compilecommand.annotations.DontInline;
+
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.utils.memory.MemoryUtil;
-
+import org.apache.cassandra.utils.vint.VIntCoding;
 
 /**
  * An implementation of the DataOutputStreamPlus interface using a ByteBuffer to stage writes
@@ -41,7 +43,16 @@
 {
     private static final int DEFAULT_BUFFER_SIZE = Integer.getInteger(Config.PROPERTY_PREFIX + "nio_data_output_stream_plus_buffer_size", 1024 * 32);
 
-    ByteBuffer buffer;
+    protected ByteBuffer buffer;
+
+    //Allow derived classes to specify writing to the channel
+    //directly shouldn't happen because they intercept via doFlush for things
+    //like compression or checksumming
+    //Another hack for this value is that it also indicates that flushing early
+    //should not occur, flushes aligned with buffer size are desired
+    //Unless... it's the last flush. Compression and checksum formats
+    //expect block (same as buffer size) alignment for everything except the last block
+    protected boolean strictFlushing = false;
 
     public BufferedDataOutputStreamPlus(RandomAccessFile ras)
     {
@@ -142,41 +153,54 @@
         else
         {
             assert toWrite.isDirect();
+            MemoryUtil.duplicateDirectByteBuffer(toWrite, hollowBuffer);
             int toWriteRemaining = toWrite.remaining();
+
             if (toWriteRemaining > buffer.remaining())
             {
-                doFlush(toWriteRemaining);
-                MemoryUtil.duplicateDirectByteBuffer(toWrite, hollowBuffer);
-                if (toWriteRemaining > buffer.remaining())
+                if (strictFlushing)
                 {
-                    while (hollowBuffer.hasRemaining())
-                        channel.write(hollowBuffer);
+                    writeExcessSlow();
                 }
                 else
                 {
-                    buffer.put(hollowBuffer);
+                    doFlush(toWriteRemaining - buffer.remaining());
+                    while (hollowBuffer.remaining() > buffer.capacity())
+                        channel.write(hollowBuffer);
                 }
             }
-            else
-            {
-                MemoryUtil.duplicateDirectByteBuffer(toWrite, hollowBuffer);
-                buffer.put(hollowBuffer);
-            }
+
+            buffer.put(hollowBuffer);
         }
     }
 
+    // writes anything we can't fit into the buffer
+    @DontInline
+    private void writeExcessSlow() throws IOException
+    {
+        int originalLimit = hollowBuffer.limit();
+        while (originalLimit - hollowBuffer.position() > buffer.remaining())
+        {
+            hollowBuffer.limit(hollowBuffer.position() + buffer.remaining());
+            buffer.put(hollowBuffer);
+            doFlush(originalLimit - hollowBuffer.position());
+        }
+        hollowBuffer.limit(originalLimit);
+    }
 
     @Override
     public void write(int b) throws IOException
     {
-        ensureRemaining(1);
+        if (!buffer.hasRemaining())
+            doFlush(1);
         buffer.put((byte) (b & 0xFF));
     }
 
     @Override
     public void writeBoolean(boolean v) throws IOException
     {
-        ensureRemaining(1);
+        if (!buffer.hasRemaining())
+            doFlush(1);
         buffer.put(v ? (byte)1 : (byte)0);
     }
 
@@ -189,43 +213,75 @@
     @Override
     public void writeShort(int v) throws IOException
     {
-        ensureRemaining(2);
-        buffer.putShort((short) v);
+        writeChar(v);
     }
 
     @Override
     public void writeChar(int v) throws IOException
     {
-        ensureRemaining(2);
-        buffer.putChar((char) v);
+        if (buffer.remaining() < 2)
+            writeSlow(v, 2);
+        else
+            buffer.putChar((char) v);
     }
 
     @Override
     public void writeInt(int v) throws IOException
     {
-        ensureRemaining(4);
-        buffer.putInt(v);
+        if (buffer.remaining() < 4)
+            writeSlow(v, 4);
+        else
+            buffer.putInt(v);
     }
 
     @Override
     public void writeLong(long v) throws IOException
     {
-        ensureRemaining(8);
-        buffer.putLong(v);
+        if (buffer.remaining() < 8)
+            writeSlow(v, 8);
+        else
+            buffer.putLong(v);
+    }
+
+    @Override
+    public void writeVInt(long value) throws IOException
+    {
+        writeUnsignedVInt(VIntCoding.encodeZigZag64(value));
+    }
+
+    @Override
+    public void writeUnsignedVInt(long value) throws IOException
+    {
+        int size = VIntCoding.computeUnsignedVIntSize(value);
+        if (size == 1)
+        {
+            write((int) value);
+            return;
+        }
+
+        write(VIntCoding.encodeVInt(value, size), 0, size);
     }
 
     @Override
     public void writeFloat(float v) throws IOException
     {
-        ensureRemaining(4);
-        buffer.putFloat(v);
+        writeInt(Float.floatToRawIntBits(v));
     }
 
     @Override
     public void writeDouble(double v) throws IOException
     {
-        ensureRemaining(8);
-        buffer.putDouble(v);
+        writeLong(Double.doubleToRawLongBits(v));
+    }
+
+    @DontInline
+    private void writeSlow(long bytes, int count) throws IOException
+    {
+        int origCount = count;
+        if (ByteOrder.BIG_ENDIAN == buffer.order())
+            while (count > 0) writeByte((int) (bytes >>> (8 * --count)));
+        else
+            while (count > 0) writeByte((int) (bytes >>> (8 * (origCount - count--))));
     }
 
     @Override
@@ -258,6 +314,7 @@
     /*
      * Count is the number of bytes remaining to write ignoring already remaining capacity
      */
+    @DontInline
     protected void doFlush(int count) throws IOException
     {
         buffer.flip();
@@ -283,15 +340,11 @@
         buffer = null;
     }
 
-    protected void ensureRemaining(int minimum) throws IOException
-    {
-        if (buffer.remaining() < minimum)
-            doFlush(minimum);
-    }
-
     @Override
     public <R> R applyToChannel(Function<WritableByteChannel, R> f) throws IOException
     {
+        if (strictFlushing)
+            throw new UnsupportedOperationException();
         //Don't allow writes to the underlying channel while data is buffered
         flush();
         return f.apply(channel);

diff --git a/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java b/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java
deleted file mode 100644
index b623e54..0000000
--- a/src/java/org/apache/cassandra/io/util/BufferedPoolingSegmentedFile.java
+++ /dev/null

@@ -1,50 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-package org.apache.cassandra.io.util;
-
-public class BufferedPoolingSegmentedFile extends PoolingSegmentedFile
-{
-    public BufferedPoolingSegmentedFile(ChannelProxy channel, long length)
-    {
-        super(new Cleanup(channel), channel, length);
-    }
-
-    private BufferedPoolingSegmentedFile(BufferedPoolingSegmentedFile copy)
-    {
-        super(copy);
-    }
-
-    public BufferedPoolingSegmentedFile sharedCopy()
-    {
-        return new BufferedPoolingSegmentedFile(this);
-    }
-
-    public static class Builder extends SegmentedFile.Builder
-    {
-        public void addPotentialBoundary(long boundary)
-        {
-            // only one segment in a standard-io file
-        }
-
-        public SegmentedFile complete(ChannelProxy channel, long overrideLength)
-        {
-            long length = overrideLength > 0 ? overrideLength : channel.size();
-            return new BufferedPoolingSegmentedFile(channel, length);
-        }
-    }
-}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java b/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java
index 2c59def..090c5bd 100644
--- a/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/BufferedSegmentedFile.java

@@ -19,9 +19,9 @@
 
 public class BufferedSegmentedFile extends SegmentedFile
 {
-    public BufferedSegmentedFile(ChannelProxy channel, long length)
+    public BufferedSegmentedFile(ChannelProxy channel, int bufferSize, long length)
     {
-        super(new Cleanup(channel), channel, length);
+        super(new Cleanup(channel), channel, bufferSize, length);
     }
 
     private BufferedSegmentedFile(BufferedSegmentedFile copy)
@@ -29,39 +29,15 @@
         super(copy);
     }
 
-    private static class Cleanup extends SegmentedFile.Cleanup
-    {
-        protected Cleanup(ChannelProxy channel)
-        {
-            super(channel);
-        }
-        public void tidy()
-        {
-            super.tidy();
-        }
-    }
-
     public static class Builder extends SegmentedFile.Builder
     {
-        public void addPotentialBoundary(long boundary)
-        {
-            // only one segment in a standard-io file
-        }
-
-        public SegmentedFile complete(ChannelProxy channel, long overrideLength)
+        public SegmentedFile complete(ChannelProxy channel, int bufferSize, long overrideLength)
         {
             long length = overrideLength > 0 ? overrideLength : channel.size();
-            return new BufferedSegmentedFile(channel, length);
+            return new BufferedSegmentedFile(channel, bufferSize, length);
         }
     }
 
-    public FileDataInput getSegment(long position)
-    {
-        RandomAccessReader reader = RandomAccessReader.open(channel);
-        reader.seek(position);
-        return reader;
-    }
-
     public BufferedSegmentedFile sharedCopy()
     {
         return new BufferedSegmentedFile(this);

diff --git a/src/java/org/apache/cassandra/io/util/ByteBufferDataInput.java b/src/java/org/apache/cassandra/io/util/ByteBufferDataInput.java
deleted file mode 100644
index bf926e9..0000000
--- a/src/java/org/apache/cassandra/io/util/ByteBufferDataInput.java
+++ /dev/null

@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.util;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class ByteBufferDataInput extends AbstractDataInput implements FileDataInput, DataInput
-{
-    private final ByteBuffer buffer;
-    private final String filename;
-    private final long segmentOffset;
-    private int position;
-
-    public ByteBufferDataInput(ByteBuffer buffer, String filename, long segmentOffset, int position)
-    {
-        assert buffer != null;
-        this.buffer = buffer;
-        this.filename = filename;
-        this.segmentOffset = segmentOffset;
-        this.position = position;
-    }
-
-    // Only use when we know the seek in within the mapped segment. Throws an
-    // IOException otherwise.
-    public void seek(long pos) throws IOException
-    {
-        long inSegmentPos = pos - segmentOffset;
-        if (inSegmentPos < 0 || inSegmentPos > buffer.capacity())
-            throw new IOException(String.format("Seek position %d is not within mmap segment (seg offs: %d, length: %d)", pos, segmentOffset, buffer.capacity()));
-
-        position = (int) inSegmentPos;
-    }
-
-    public long getFilePointer()
-    {
-        return segmentOffset + position;
-    }
-
-    public long getPosition()
-    {
-        return segmentOffset + position;
-    }
-
-    public long getPositionLimit()
-    {
-        return segmentOffset + buffer.capacity();
-    }
-
-    @Override
-    public boolean markSupported()
-    {
-        return false;
-    }
-
-    public void reset(FileMark mark) throws IOException
-    {
-        assert mark instanceof MappedFileDataInputMark;
-        position = ((MappedFileDataInputMark) mark).position;
-    }
-
-    public FileMark mark()
-    {
-        return new MappedFileDataInputMark(position);
-    }
-
-    public long bytesPastMark(FileMark mark)
-    {
-        assert mark instanceof MappedFileDataInputMark;
-        assert position >= ((MappedFileDataInputMark) mark).position;
-        return position - ((MappedFileDataInputMark) mark).position;
-    }
-
-    public boolean isEOF() throws IOException
-    {
-        return position == buffer.capacity();
-    }
-
-    public long bytesRemaining() throws IOException
-    {
-        return buffer.capacity() - position;
-    }
-
-    public String getPath()
-    {
-        return filename;
-    }
-
-    public int read() throws IOException
-    {
-        if (isEOF())
-            return -1;
-        return buffer.get(position++) & 0xFF;
-    }
-
-    /**
-     * Does the same thing as <code>readFully</code> do but without copying data (thread safe)
-     * @param length length of the bytes to read
-     * @return buffer with portion of file content
-     * @throws IOException on any fail of I/O operation
-     */
-    public ByteBuffer readBytes(int length) throws IOException
-    {
-        int remaining = buffer.remaining() - position;
-        if (length > remaining)
-            throw new IOException(String.format("mmap segment underflow; remaining is %d but %d requested",
-                                                remaining, length));
-
-        if (length == 0)
-            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
-
-        ByteBuffer bytes = buffer.duplicate();
-        bytes.position(buffer.position() + position).limit(buffer.position() + position + length);
-        position += length;
-
-        // we have to copy the data in case we unreference the underlying sstable.  See CASSANDRA-3179
-        ByteBuffer clone = ByteBuffer.allocate(bytes.remaining());
-        clone.put(bytes);
-        clone.flip();
-        return clone;
-    }
-
-    @Override
-    public final void readFully(byte[] bytes) throws IOException
-    {
-        ByteBufferUtil.arrayCopy(buffer, buffer.position() + position, bytes, 0, bytes.length);
-        position += bytes.length;
-    }
-
-    @Override
-    public final void readFully(byte[] bytes, int offset, int count) throws IOException
-    {
-        ByteBufferUtil.arrayCopy(buffer, buffer.position() + position, bytes, offset, count);
-        position += count;
-    }
-
-    private static class MappedFileDataInputMark implements FileMark
-    {
-        int position;
-
-        MappedFileDataInputMark(int position)
-        {
-            this.position = position;
-        }
-    }
-
-    @Override
-    public String toString() {
-        return getClass().getSimpleName() + "(" +
-               "filename='" + filename + "'" +
-               ", position=" + position +
-               ")";
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/util/BytesReadTracker.java b/src/java/org/apache/cassandra/io/util/BytesReadTracker.java
new file mode 100644
index 0000000..fc83856
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/BytesReadTracker.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+public interface BytesReadTracker
+{
+    public long getBytesRead();
+
+    /**
+     * reset counter to @param count
+     */
+    public void reset(long count);
+
+}

diff --git a/src/java/org/apache/cassandra/io/util/ChannelProxy.java b/src/java/org/apache/cassandra/io/util/ChannelProxy.java
index 79954a5..1463fdd 100644
--- a/src/java/org/apache/cassandra/io/util/ChannelProxy.java
+++ b/src/java/org/apache/cassandra/io/util/ChannelProxy.java

@@ -26,7 +26,7 @@
 import java.nio.file.StandardOpenOption;
 
 import org.apache.cassandra.io.FSReadError;
-import org.apache.cassandra.utils.CLibrary;
+import org.apache.cassandra.utils.NativeLibrary;
 import org.apache.cassandra.utils.concurrent.RefCounted;
 import org.apache.cassandra.utils.concurrent.SharedCloseableImpl;
 
@@ -63,7 +63,7 @@
 
     public ChannelProxy(File file)
     {
-        this(file.getAbsolutePath(), openChannel(file));
+        this(file.getPath(), openChannel(file));
     }
 
     public ChannelProxy(String filePath, FileChannel channel)
@@ -87,7 +87,7 @@
         final String filePath;
         final FileChannel channel;
 
-        protected Cleanup(String filePath, FileChannel channel)
+        Cleanup(String filePath, FileChannel channel)
         {
             this.filePath = filePath;
             this.channel = channel;
@@ -171,7 +171,7 @@
 
     public int getFileDescriptor()
     {
-        return CLibrary.getfd(channel);
+        return NativeLibrary.getfd(channel);
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/io/util/ChecksummedRandomAccessReader.java b/src/java/org/apache/cassandra/io/util/ChecksummedRandomAccessReader.java
index 9015b61..30f1e0c 100644
--- a/src/java/org/apache/cassandra/io/util/ChecksummedRandomAccessReader.java
+++ b/src/java/org/apache/cassandra/io/util/ChecksummedRandomAccessReader.java

@@ -19,70 +19,37 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.zip.Adler32;
+import java.util.zip.CRC32;
 
 import org.apache.cassandra.io.compress.BufferType;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Throwables;
 
 public class ChecksummedRandomAccessReader extends RandomAccessReader
 {
     @SuppressWarnings("serial")
     public static class CorruptFileException extends RuntimeException
     {
-        public final File file;
+        public final String filePath;
 
-        public CorruptFileException(Exception cause, File file)
+        public CorruptFileException(Exception cause, String filePath)
         {
             super(cause);
-            this.file = file;
+            this.filePath = filePath;
         }
     }
 
     private final DataIntegrityMetadata.ChecksumValidator validator;
-    private final File file;
 
-    protected ChecksummedRandomAccessReader(File file, ChannelProxy channel, DataIntegrityMetadata.ChecksumValidator validator)
+    private ChecksummedRandomAccessReader(Builder builder)
     {
-        super(channel, validator.chunkSize, -1, BufferType.ON_HEAP, null);
-        this.validator = validator;
-        this.file = file;
+        super(builder);
+        this.validator = builder.validator;
     }
 
     @SuppressWarnings("resource")
-    public static ChecksummedRandomAccessReader open(File file, File crcFile) throws IOException
-    {
-        try (ChannelProxy channel = new ChannelProxy(file))
-        {
-            RandomAccessReader crcReader = RandomAccessReader.open(crcFile);
-            boolean closeCrcReader = true;
-            try
-            {
-                DataIntegrityMetadata.ChecksumValidator validator =
-                        new DataIntegrityMetadata.ChecksumValidator(new Adler32(), crcReader, file.getPath());
-                closeCrcReader = false;
-                boolean closeValidator = true;
-                try
-                {
-                    ChecksummedRandomAccessReader retval = new ChecksummedRandomAccessReader(file, channel, validator);
-                    closeValidator = false;
-                    return retval;
-                }
-                finally
-                {
-                    if (closeValidator)
-                        validator.close();
-                }
-            }
-            finally
-            {
-                if (closeCrcReader)
-                    crcReader.close();
-            }
-        }
-    }
-
     @Override
-    protected void reBuffer()
+    protected void reBufferStandard()
     {
         long desiredPosition = current();
         // align with buffer size, as checksums were computed in chunks of buffer size each.
@@ -107,13 +74,19 @@
         }
         catch (IOException e)
         {
-            throw new CorruptFileException(e, file);
+            throw new CorruptFileException(e, channel.filePath());
         }
 
         buffer.position((int) (desiredPosition - bufferOffset));
     }
 
     @Override
+    protected void reBufferMmap()
+    {
+        throw new AssertionError("Unsupported operation");
+    }
+
+    @Override
     public void seek(long newPosition)
     {
         validator.seek(newPosition);
@@ -123,7 +96,32 @@
     @Override
     public void close()
     {
-        super.close();
-        validator.close();
+        Throwables.perform(channel.filePath(), Throwables.FileOpType.READ,
+                           super::close,
+                           validator::close,
+                           channel::close);
+    }
+
+    public static final class Builder extends RandomAccessReader.Builder
+    {
+        private final DataIntegrityMetadata.ChecksumValidator validator;
+
+        @SuppressWarnings("resource")
+        public Builder(File file, File crcFile) throws IOException
+        {
+            super(new ChannelProxy(file));
+            this.validator = new DataIntegrityMetadata.ChecksumValidator(new CRC32(),
+                                                                         RandomAccessReader.open(crcFile),
+                                                                         file.getPath());
+
+            super.bufferSize(validator.chunkSize)
+                 .bufferType(BufferType.ON_HEAP);
+        }
+
+        @Override
+        public RandomAccessReader build()
+        {
+            return new ChecksummedRandomAccessReader(this);
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java b/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java
index d5e6be9..fd88151 100644
--- a/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/ChecksummedSequentialWriter.java

@@ -31,10 +31,11 @@
     {
         super(file, bufferSize, BufferType.ON_HEAP);
         crcWriter = new SequentialWriter(crcPath, 8 * 1024, BufferType.ON_HEAP);
-        crcMetadata = new DataIntegrityMetadata.ChecksumWriter(crcWriter.stream);
+        crcMetadata = new DataIntegrityMetadata.ChecksumWriter(crcWriter);
         crcMetadata.writeChunkSize(buffer.capacity());
     }
 
+    @Override
     protected void flushData()
     {
         super.flushData();
@@ -49,7 +50,7 @@
         @Override
         protected Throwable doCommit(Throwable accumulate)
         {
-            return crcWriter.commit(accumulate);
+            return super.doCommit(crcWriter.commit(accumulate));
         }
 
         @Override
@@ -65,9 +66,6 @@
             if (descriptor != null)
                 crcMetadata.writeFullChecksum(descriptor);
             crcWriter.setDescriptor(descriptor).prepareToCommit();
-            // we must cleanup our file handles during prepareCommit for Windows compatibility as we cannot rename an open file;
-            // TODO: once we stop file renaming, remove this for clarity
-            releaseFileHandle();
         }
     }
 

diff --git a/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java b/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java
deleted file mode 100644
index fdc4f61..0000000
--- a/src/java/org/apache/cassandra/io/util/CompressedPoolingSegmentedFile.java
+++ /dev/null

@@ -1,136 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-package org.apache.cassandra.io.util;
-
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.TreeMap;
-
-import com.google.common.util.concurrent.RateLimiter;
-
-import org.apache.cassandra.io.compress.CompressedRandomAccessReader;
-import org.apache.cassandra.io.compress.CompressedSequentialWriter;
-import org.apache.cassandra.io.compress.CompressedThrottledReader;
-import org.apache.cassandra.io.compress.CompressionMetadata;
-
-public class CompressedPoolingSegmentedFile extends PoolingSegmentedFile implements ICompressedFile
-{
-    public final CompressionMetadata metadata;
-    private final TreeMap<Long, MappedByteBuffer> chunkSegments;
-
-    public CompressedPoolingSegmentedFile(ChannelProxy channel, CompressionMetadata metadata)
-    {
-        this(channel, metadata, CompressedSegmentedFile.createMappedSegments(channel, metadata));
-    }
-
-    private CompressedPoolingSegmentedFile(ChannelProxy channel, CompressionMetadata metadata, TreeMap<Long, MappedByteBuffer> chunkSegments)
-    {
-        super(new Cleanup(channel, metadata, chunkSegments), channel, metadata.dataLength, metadata.compressedFileLength);
-        this.metadata = metadata;
-        this.chunkSegments = chunkSegments;
-    }
-
-    private CompressedPoolingSegmentedFile(CompressedPoolingSegmentedFile copy)
-    {
-        super(copy);
-        this.metadata = copy.metadata;
-        this.chunkSegments = copy.chunkSegments;
-    }
-
-    public ChannelProxy channel()
-    {
-        return channel;
-    }
-
-    public TreeMap<Long, MappedByteBuffer> chunkSegments()
-    {
-        return chunkSegments;
-    }
-
-    protected static final class Cleanup extends PoolingSegmentedFile.Cleanup
-    {
-        final CompressionMetadata metadata;
-        final TreeMap<Long, MappedByteBuffer> chunkSegments;
-        protected Cleanup(ChannelProxy channel, CompressionMetadata metadata, TreeMap<Long, MappedByteBuffer> chunkSegments)
-        {
-            super(channel);
-            this.metadata = metadata;
-            this.chunkSegments = chunkSegments;
-        }
-        public void tidy()
-        {
-            super.tidy();
-            metadata.close();
-            if (chunkSegments != null)
-            {
-                for (MappedByteBuffer segment : chunkSegments.values())
-                    FileUtils.clean(segment);
-            }
-        }
-    }
-
-    public static class Builder extends CompressedSegmentedFile.Builder
-    {
-        public Builder(CompressedSequentialWriter writer)
-        {
-            super(writer);
-        }
-
-        public void addPotentialBoundary(long boundary)
-        {
-            // only one segment in a standard-io file
-        }
-
-        public SegmentedFile complete(ChannelProxy channel, long overrideLength)
-        {
-            return new CompressedPoolingSegmentedFile(channel, metadata(channel.filePath(), overrideLength));
-        }
-    }
-
-    public void dropPageCache(long before)
-    {
-        if (before >= metadata.dataLength)
-            super.dropPageCache(0);
-        super.dropPageCache(metadata.chunkFor(before).offset);
-    }
-
-    public RandomAccessReader createReader()
-    {
-        return CompressedRandomAccessReader.open(this);
-    }
-
-    public RandomAccessReader createThrottledReader(RateLimiter limiter)
-    {
-        return CompressedThrottledReader.open(this, limiter);
-    }
-
-    protected RandomAccessReader createPooledReader()
-    {
-        return CompressedRandomAccessReader.open(this);
-    }
-
-    public CompressionMetadata getMetadata()
-    {
-        return metadata;
-    }
-
-    public CompressedPoolingSegmentedFile sharedCopy()
-    {
-        return new CompressedPoolingSegmentedFile(this);
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java b/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java
index ceff7ba..16f791a 100644
--- a/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/CompressedSegmentedFile.java

@@ -17,43 +17,49 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.TreeMap;
-
 import com.google.common.util.concurrent.RateLimiter;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.compress.CompressedRandomAccessReader;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
-import org.apache.cassandra.io.compress.CompressedThrottledReader;
 import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.concurrent.Ref;
 
 public class CompressedSegmentedFile extends SegmentedFile implements ICompressedFile
 {
-    public final CompressionMetadata metadata;
+    private static final Logger logger = LoggerFactory.getLogger(CompressedSegmentedFile.class);
     private static final boolean useMmap = DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap;
-    private static int MAX_SEGMENT_SIZE = Integer.MAX_VALUE;
-    private final TreeMap<Long, MappedByteBuffer> chunkSegments;
 
-    public CompressedSegmentedFile(ChannelProxy channel, CompressionMetadata metadata)
+    public final CompressionMetadata metadata;
+    private final MmappedRegions regions;
+
+    public CompressedSegmentedFile(ChannelProxy channel, int bufferSize, CompressionMetadata metadata)
     {
-        this(channel, metadata, createMappedSegments(channel, metadata));
+        this(channel,
+             bufferSize,
+             metadata,
+             useMmap
+             ? MmappedRegions.map(channel, metadata)
+             : null);
     }
 
-    public CompressedSegmentedFile(ChannelProxy channel, CompressionMetadata metadata, TreeMap<Long, MappedByteBuffer> chunkSegments)
+    public CompressedSegmentedFile(ChannelProxy channel, int bufferSize, CompressionMetadata metadata, MmappedRegions regions)
     {
-        super(new Cleanup(channel, metadata, chunkSegments), channel, metadata.dataLength, metadata.compressedFileLength);
+        super(new Cleanup(channel, metadata, regions), channel, bufferSize, metadata.dataLength, metadata.compressedFileLength);
         this.metadata = metadata;
-        this.chunkSegments = chunkSegments;
+        this.regions = regions;
     }
 
     private CompressedSegmentedFile(CompressedSegmentedFile copy)
     {
         super(copy);
         this.metadata = copy.metadata;
-        this.chunkSegments = copy.chunkSegments;
+        this.regions = copy.regions;
     }
 
     public ChannelProxy channel()
@@ -61,60 +67,36 @@
         return channel;
     }
 
-    public TreeMap<Long, MappedByteBuffer> chunkSegments()
+    public MmappedRegions regions()
     {
-        return chunkSegments;
-    }
-
-    static TreeMap<Long, MappedByteBuffer> createMappedSegments(ChannelProxy channel, CompressionMetadata metadata)
-    {
-        if (!useMmap)
-            return null;
-        TreeMap<Long, MappedByteBuffer> chunkSegments = new TreeMap<>();
-        long offset = 0;
-        long lastSegmentOffset = 0;
-        long segmentSize = 0;
-
-        while (offset < metadata.dataLength)
-        {
-            CompressionMetadata.Chunk chunk = metadata.chunkFor(offset);
-
-            //Reached a new mmap boundary
-            if (segmentSize + chunk.length + 4 > MAX_SEGMENT_SIZE)
-            {
-                chunkSegments.put(lastSegmentOffset, channel.map(FileChannel.MapMode.READ_ONLY, lastSegmentOffset, segmentSize));
-                lastSegmentOffset += segmentSize;
-                segmentSize = 0;
-            }
-
-            segmentSize += chunk.length + 4; //checksum
-            offset += metadata.chunkLength();
-        }
-
-        if (segmentSize > 0)
-            chunkSegments.put(lastSegmentOffset, channel.map(FileChannel.MapMode.READ_ONLY, lastSegmentOffset, segmentSize));
-        return chunkSegments;
+        return regions;
     }
 
     private static final class Cleanup extends SegmentedFile.Cleanup
     {
         final CompressionMetadata metadata;
-        final TreeMap<Long, MappedByteBuffer> chunkSegments;
-        protected Cleanup(ChannelProxy channel, CompressionMetadata metadata, TreeMap<Long, MappedByteBuffer> chunkSegments)
+        private final MmappedRegions regions;
+
+        protected Cleanup(ChannelProxy channel, CompressionMetadata metadata, MmappedRegions regions)
         {
             super(channel);
             this.metadata = metadata;
-            this.chunkSegments = chunkSegments;
+            this.regions = regions;
         }
         public void tidy()
         {
-            super.tidy();
-            metadata.close();
-            if (chunkSegments != null)
+            Throwable err = regions == null ? null : regions.close(null);
+            if (err != null)
             {
-                for (MappedByteBuffer segment : chunkSegments.values())
-                    FileUtils.clean(segment);
+                JVMStabilityInspector.inspectThrowable(err);
+
+                // This is not supposed to happen
+                logger.error("Error while closing mmapped regions", err);
             }
+
+            metadata.close();
+
+            super.tidy();
         }
     }
 
@@ -123,19 +105,20 @@
         return new CompressedSegmentedFile(this);
     }
 
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        super.addTo(identities);
+        metadata.addTo(identities);
+    }
+
     public static class Builder extends SegmentedFile.Builder
     {
-        protected final CompressedSequentialWriter writer;
+        final CompressedSequentialWriter writer;
         public Builder(CompressedSequentialWriter writer)
         {
             this.writer = writer;
         }
 
-        public void addPotentialBoundary(long boundary)
-        {
-            // only one segment in a standard-io file
-        }
-
         protected CompressionMetadata metadata(String path, long overrideLength)
         {
             if (writer == null)
@@ -144,9 +127,9 @@
             return writer.open(overrideLength);
         }
 
-        public SegmentedFile complete(ChannelProxy channel, long overrideLength)
+        public SegmentedFile complete(ChannelProxy channel, int bufferSize, long overrideLength)
         {
-            return new CompressedSegmentedFile(channel, metadata(channel.filePath(), overrideLength));
+            return new CompressedSegmentedFile(channel, bufferSize, metadata(channel.filePath(), overrideLength));
         }
     }
 
@@ -159,12 +142,12 @@
 
     public RandomAccessReader createReader()
     {
-        return CompressedRandomAccessReader.open(this);
+        return new CompressedRandomAccessReader.Builder(this).build();
     }
 
-    public RandomAccessReader createThrottledReader(RateLimiter limiter)
+    public RandomAccessReader createReader(RateLimiter limiter)
     {
-        return CompressedThrottledReader.open(this, limiter);
+        return new CompressedRandomAccessReader.Builder(this).limiter(limiter).build();
     }
 
     public CompressionMetadata getMetadata()

diff --git a/src/java/org/apache/cassandra/io/util/DataInputBuffer.java b/src/java/org/apache/cassandra/io/util/DataInputBuffer.java
new file mode 100644
index 0000000..9df9861
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DataInputBuffer.java

@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Input stream around a single ByteBuffer.
+ */
+public class DataInputBuffer extends RebufferingInputStream
+{
+    private static ByteBuffer slice(byte[] buffer, int offset, int length)
+    {
+        ByteBuffer buf = ByteBuffer.wrap(buffer);
+        if (offset > 0 || length < buf.capacity())
+        {
+            buf.position(offset);
+            buf.limit(offset + length);
+            buf = buf.slice();
+        }
+        return buf;
+    }
+
+    /**
+     * @param buffer
+     * @param duplicate Whether or not to duplicate the buffer to ensure thread safety
+     */
+    public DataInputBuffer(ByteBuffer buffer, boolean duplicate)
+    {
+        super(duplicate ? buffer.duplicate() : buffer);
+    }
+
+    public DataInputBuffer(byte[] buffer, int offset, int length)
+    {
+        super(slice(buffer, offset, length));
+    }
+
+    public DataInputBuffer(byte[] buffer)
+    {
+        super(ByteBuffer.wrap(buffer));
+    }
+
+    @Override
+    protected void reBuffer()
+    {
+        //nope, we don't rebuffer, we are done!
+    }
+
+    @Override
+    public int available()
+    {
+        return buffer.remaining();
+    }
+
+    @Override
+    public void close() {}
+}

diff --git a/src/java/org/apache/cassandra/io/util/DataInputPlus.java b/src/java/org/apache/cassandra/io/util/DataInputPlus.java
new file mode 100644
index 0000000..7c29ee1
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DataInputPlus.java

@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.*;
+
+import org.apache.cassandra.utils.vint.VIntCoding;
+
+/**
+ * Extension to DataInput that provides support for reading varints
+ */
+public interface DataInputPlus extends DataInput
+{
+    default long readVInt() throws IOException
+    {
+        return VIntCoding.readVInt(this);
+    }
+
+    /**
+     * Think hard before opting for an unsigned encoding. Is this going to bite someone because some day
+     * they might need to pass in a sentinel value using negative numbers? Is the risk worth it
+     * to save a few bytes?
+     *
+     * Signed, not a fan of unsigned values in protocols and formats
+     */
+    default long readUnsignedVInt() throws IOException
+    {
+        return VIntCoding.readUnsignedVInt(this);
+    }
+
+    /**
+     * Always skips the requested number of bytes, unless EOF is reached
+     *
+     * @param n number of bytes to skip
+     * @return number of bytes skipped
+     */
+    public int skipBytes(int n) throws IOException;
+
+    public default void skipBytesFully(int n) throws IOException
+    {
+        int skipped = skipBytes(n);
+        if (skipped != n)
+            throw new EOFException("EOF after " + skipped + " bytes out of " + n);
+    }
+
+    /**
+     * Wrapper around an InputStream that provides no buffering but can decode varints
+     */
+    public class DataInputStreamPlus extends DataInputStream implements DataInputPlus
+    {
+        public DataInputStreamPlus(InputStream is)
+        {
+            super(is);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java b/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java
index d44bd1c..cbf5753 100644
--- a/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java
+++ b/src/java/org/apache/cassandra/io/util/DataIntegrityMetadata.java

@@ -25,7 +25,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
-import java.util.zip.Adler32;
+import java.util.zip.CRC32;
 import java.util.zip.CheckedInputStream;
 import java.util.zip.Checksum;
 
@@ -34,8 +34,7 @@
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.utils.CRC32Factory;
-import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Throwables;
 
 public class DataIntegrityMetadata
 {
@@ -53,12 +52,13 @@
 
         public ChecksumValidator(Descriptor descriptor) throws IOException
         {
-            this(descriptor.version.hasAllAdlerChecksums() ? new Adler32() : CRC32Factory.instance.create(),
+            this(descriptor.version.uncompressedChecksumType().newInstance(),
                  RandomAccessReader.open(new File(descriptor.filenameFor(Component.CRC))),
                  descriptor.filenameFor(Component.DATA));
         }
 
-        public ChecksumValidator(Checksum checksum, RandomAccessReader reader, String dataFilename) throws IOException {
+        public ChecksumValidator(Checksum checksum, RandomAccessReader reader, String dataFilename) throws IOException
+        {
             this.checksum = checksum;
             this.reader = reader;
             this.dataFilename = dataFilename;
@@ -109,8 +109,8 @@
         public FileDigestValidator(Descriptor descriptor) throws IOException
         {
             this.descriptor = descriptor;
-            checksum = descriptor.version.hasAllAdlerChecksums() ? new Adler32() : CRC32Factory.instance.create();
-            digestReader = RandomAccessReader.open(new File(descriptor.filenameFor(Component.DIGEST)));
+            checksum = descriptor.version.uncompressedChecksumType().newInstance();
+            digestReader = RandomAccessReader.open(new File(descriptor.filenameFor(descriptor.digestComponent)));
             dataReader = RandomAccessReader.open(new File(descriptor.filenameFor(Component.DATA)));
             try
             {
@@ -118,10 +118,10 @@
             }
             catch (Exception e)
             {
+                close();
                 // Attempting to create a FileDigestValidator without a DIGEST file will fail
                 throw new IOException("Corrupted SSTable : " + descriptor.filenameFor(Component.DATA));
             }
-
         }
 
         // Validate the entire file
@@ -139,16 +139,17 @@
 
         public void close()
         {
-            this.digestReader.close();
+            Throwables.perform(digestReader::close,
+                               dataReader::close);
         }
     }
 
 
     public static class ChecksumWriter
     {
-        private final Adler32 incrementalChecksum = new Adler32();
+        private final CRC32 incrementalChecksum = new CRC32();
         private final DataOutput incrementalOut;
-        private final Adler32 fullChecksum = new Adler32();
+        private final CRC32 fullChecksum = new CRC32();
 
         public ChecksumWriter(DataOutput incrementalOut)
         {
@@ -181,13 +182,13 @@
 
                 ByteBuffer toAppend = bb.duplicate();
                 toAppend.mark();
-                FBUtilities.directCheckSum(incrementalChecksum, toAppend);
+                incrementalChecksum.update(toAppend);
                 toAppend.reset();
 
                 int incrementalChecksumValue = (int) incrementalChecksum.getValue();
                 incrementalOut.writeInt(incrementalChecksumValue);
 
-                FBUtilities.directCheckSum(fullChecksum, toAppend);
+                fullChecksum.update(toAppend);
                 if (checksumIncrementalResult)
                 {
                     ByteBuffer byteBuffer = ByteBuffer.allocate(4);
@@ -205,7 +206,9 @@
 
         public void writeFullChecksum(Descriptor descriptor)
         {
-            File outFile = new File(descriptor.filenameFor(Component.DIGEST));
+            if (descriptor.digestComponent == null)
+                throw new NullPointerException("Null digest component for " + descriptor.ksname + '.' + descriptor.cfname + " file " + descriptor.baseFilename());
+            File outFile = new File(descriptor.filenameFor(descriptor.digestComponent));
             try (BufferedWriter out =Files.newBufferedWriter(outFile.toPath(), Charsets.UTF_8))
             {
                 out.write(String.valueOf(fullChecksum.getValue()));

diff --git a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
index 3f1e081..195fdb4 100644
--- a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java

@@ -151,7 +151,7 @@
             return true;
         }
 
-        public void close() throws IOException
+        public void close()
         {
         }
     }
@@ -163,6 +163,19 @@
 
     public ByteBuffer buffer()
     {
+        return buffer(true);
+    }
+
+    public ByteBuffer buffer(boolean duplicate)
+    {
+        if (!duplicate)
+        {
+            ByteBuffer buf = buffer;
+            buf.flip();
+            buffer = null;
+            return buf;
+        }
+
         ByteBuffer result = buffer.duplicate();
         result.flip();
         return result;
@@ -178,6 +191,16 @@
         return buffer.position();
     }
 
+    public boolean hasPosition()
+    {
+        return true;
+    }
+
+    public long position()
+    {
+        return getLength();
+    }
+
     public byte[] toByteArray()
     {
         ByteBuffer buffer = buffer();

diff --git a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
index f63c1e5..a9dbb68 100644
--- a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java

@@ -24,6 +24,8 @@
 
 import com.google.common.base.Function;
 
+import org.apache.cassandra.utils.vint.VIntCoding;
+
 /**
  * Extension to DataOutput that provides for writing ByteBuffer and Memory, potentially with an efficient
  * implementation that is zero copy or at least has reduced bounds checking overhead.
@@ -40,4 +42,44 @@
      * and forget to flush
      */
     <R> R applyToChannel(Function<WritableByteChannel, R> c) throws IOException;
+
+    default void writeVInt(long i) throws IOException
+    {
+        VIntCoding.writeVInt(i, this);
+    }
+
+    /**
+     * This is more efficient for storing unsigned values, both in storage and CPU burden.
+     *
+     * Note that it is still possible to store negative values, they just take up more space.
+     * So this method doesn't forbid e.g. negative sentinel values in future, if they need to be snuck in.
+     * A protocol version bump can then be introduced to improve efficiency.
+     */
+    default void writeUnsignedVInt(long i) throws IOException
+    {
+        VIntCoding.writeUnsignedVInt(i, this);
+    }
+
+    /**
+     * Returns the current position of the underlying target like a file-pointer
+     * or the position withing a buffer. Not every implementation may support this
+     * functionality. Whether or not this functionality is supported can be checked
+     * via the {@link #hasPosition()}.
+     *
+     * @throws UnsupportedOperationException if the implementation does not support
+     *                                       position
+     */
+    default long position()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * If the implementation supports providing a position, this method returns
+     * {@code true}, otherwise {@code false}.
+     */
+    default boolean hasPosition()
+    {
+        return false;
+    }
 }

diff --git a/src/java/org/apache/cassandra/io/util/DataPosition.java b/src/java/org/apache/cassandra/io/util/DataPosition.java
new file mode 100644
index 0000000..e106dae
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DataPosition.java

@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+public interface DataPosition
+{}

diff --git a/src/java/org/apache/cassandra/io/util/DiskAwareRunnable.java b/src/java/org/apache/cassandra/io/util/DiskAwareRunnable.java
new file mode 100644
index 0000000..1a15d6f
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/DiskAwareRunnable.java

@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.utils.WrappedRunnable;
+
+public abstract class DiskAwareRunnable extends WrappedRunnable
+{
+    protected Directories.DataDirectory getWriteDirectory(long writeSize)
+    {
+        Directories.DataDirectory directory = getDirectories().getWriteableLocation(writeSize);
+        if (directory == null)
+            throw new FSWriteError(new IOException("Insufficient disk space to write " + writeSize + " bytes"), "");
+
+        return directory;
+    }
+
+    /**
+     * Get sstable directories for the CF.
+     * @return Directories instance for the CF.
+     */
+    protected abstract Directories getDirectories();
+}

diff --git a/src/java/org/apache/cassandra/io/util/FileDataInput.java b/src/java/org/apache/cassandra/io/util/FileDataInput.java
index d94075c..1059b01 100644
--- a/src/java/org/apache/cassandra/io/util/FileDataInput.java
+++ b/src/java/org/apache/cassandra/io/util/FileDataInput.java

@@ -18,33 +18,17 @@
 package org.apache.cassandra.io.util;
 
 import java.io.Closeable;
-import java.io.DataInput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 
-public interface FileDataInput extends DataInput, Closeable
+public interface FileDataInput extends RewindableDataInput, Closeable
 {
-    public String getPath();
+    String getPath();
 
-    public boolean isEOF() throws IOException;
+    boolean isEOF() throws IOException;
 
-    public long bytesRemaining() throws IOException;
+    long bytesRemaining() throws IOException;
 
-    public void seek(long pos) throws IOException;
+    void seek(long pos) throws IOException;
 
-    public FileMark mark();
-
-    public void reset(FileMark mark) throws IOException;
-
-    public long bytesPastMark(FileMark mark);
-
-    public long getFilePointer();
-
-    /**
-     * Read length bytes from current file position
-     * @param length length of the bytes to read
-     * @return buffer with bytes read
-     * @throws IOException if any I/O operation failed
-     */
-    public ByteBuffer readBytes(int length) throws IOException;
+    long getFilePointer();
 }

diff --git a/src/java/org/apache/cassandra/io/util/FileMark.java b/src/java/org/apache/cassandra/io/util/FileMark.java
deleted file mode 100644
index 781bc1e..0000000
--- a/src/java/org/apache/cassandra/io/util/FileMark.java
+++ /dev/null

@@ -1,20 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.util;
-
-public interface FileMark {}

diff --git a/src/java/org/apache/cassandra/io/util/FileSegmentInputStream.java b/src/java/org/apache/cassandra/io/util/FileSegmentInputStream.java
new file mode 100644
index 0000000..a585215
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/FileSegmentInputStream.java

@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.nio.ByteBuffer;
+
+/**
+ * This is the same as DataInputBuffer, i.e. a stream for a fixed byte buffer,
+ * except that we also implement FileDataInput by using an offset and a file path.
+ */
+public class FileSegmentInputStream extends DataInputBuffer implements FileDataInput
+{
+    private final String filePath;
+    private final long offset;
+
+    public FileSegmentInputStream(ByteBuffer buffer, String filePath, long offset)
+    {
+        super(buffer, false);
+        this.filePath = filePath;
+        this.offset = offset;
+    }
+
+    public String getPath()
+    {
+        return filePath;
+    }
+
+    private long size()
+    {
+        return offset + buffer.capacity();
+    }
+
+    public boolean isEOF()
+    {
+        return !buffer.hasRemaining();
+    }
+
+    public long bytesRemaining()
+    {
+        return buffer.remaining();
+    }
+
+    public void seek(long pos)
+    {
+        if (pos < 0 || pos > size())
+            throw new IllegalArgumentException(String.format("Unable to seek to position %d in %s (%d bytes) in partial mode",
+                                                             pos,
+                                                             getPath(),
+                                                             size()));
+
+
+        buffer.position((int) (pos - offset));
+    }
+
+    @Override
+    public boolean markSupported()
+    {
+        return false;
+    }
+
+    public DataPosition mark()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void reset(DataPosition mark)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public long bytesPastMark(DataPosition mark)
+    {
+        return 0;
+    }
+
+    public long getFilePointer()
+    {
+        return offset + buffer.position();
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/util/FileUtils.java b/src/java/org/apache/cassandra/io/util/FileUtils.java
index 3252ec8..f9406e5 100644
--- a/src/java/org/apache/cassandra/io/util/FileUtils.java
+++ b/src/java/org/apache/cassandra/io/util/FileUtils.java

@@ -20,17 +20,23 @@
 import java.io.*;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.*;
 import java.nio.file.attribute.FileAttributeView;
 import java.nio.file.attribute.FileStoreAttributeView;
 import java.text.DecimalFormat;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
 import java.util.concurrent.atomic.AtomicReference;
-
-import sun.nio.ch.DirectBuffer;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+import java.util.stream.StreamSupport;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import sun.nio.ch.DirectBuffer;
 
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.io.FSError;
@@ -40,11 +46,14 @@
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 
+import static com.google.common.base.Throwables.propagate;
 import static org.apache.cassandra.utils.Throwables.maybeFail;
 import static org.apache.cassandra.utils.Throwables.merge;
 
 public final class FileUtils
 {
+    public static final Charset CHARSET = StandardCharsets.UTF_8;
+
     private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);
     public static final long ONE_KB = 1024;
     public static final long ONE_MB = 1024 * ONE_KB;
@@ -169,7 +178,7 @@
 
     public static void renameWithConfirm(File from, File to)
     {
-        assert from.exists() : from + " should exist";
+        assert from.exists();
         if (logger.isTraceEnabled())
             logger.trace((String.format("Renaming %s to %s", from.getPath(), to.getPath())));
         // this is not FSWE because usually when we see it it's because we didn't close the file before renaming it,
@@ -228,6 +237,19 @@
         }
     }
 
+    public static void closeQuietly(AutoCloseable c)
+    {
+        try
+        {
+            if (c != null)
+                c.close();
+        }
+        catch (Exception e)
+        {
+            logger.warn("Failed closing {}", c, e);
+        }
+    }
+
     public static void close(Closeable... cs) throws IOException
     {
         close(Arrays.asList(cs));
@@ -253,6 +275,22 @@
         maybeFail(e, IOException.class);
     }
 
+    public static void closeQuietly(Iterable<? extends AutoCloseable> cs)
+    {
+        for (AutoCloseable c : cs)
+        {
+            try
+            {
+                if (c != null)
+                    c.close();
+            }
+            catch (Exception ex)
+            {
+                logger.warn("Failed closing {}", c, ex);
+            }
+        }
+    }
+
     public static String getCanonicalPath(String filename)
     {
         try
@@ -277,6 +315,29 @@
         }
     }
 
+    /** Return true if file is contained in folder */
+    public static boolean isContained(File folder, File file)
+    {
+        Path folderPath = Paths.get(getCanonicalPath(folder));
+        Path filePath = Paths.get(getCanonicalPath(file));
+
+        return filePath.startsWith(folderPath);
+    }
+
+    /** Convert absolute path into a path relative to the base path */
+    public static String getRelativePath(String basePath, String path)
+    {
+        try
+        {
+            return Paths.get(basePath).relativize(Paths.get(path)).toString();
+        }
+        catch(Exception ex)
+        {
+            String absDataPath = FileUtils.getCanonicalPath(basePath);
+            return Paths.get(absDataPath).relativize(Paths.get(path)).toString();
+        }
+    }
+
     public static boolean isCleanerAvailable()
     {
         return canCleanDirectBuffers;
@@ -285,7 +346,11 @@
     public static void clean(ByteBuffer buffer)
     {
         if (isCleanerAvailable() && buffer.isDirect())
-            ((DirectBuffer)buffer).cleaner().clean();
+        {
+            DirectBuffer db = (DirectBuffer) buffer;
+            if (db.cleaner() != null)
+                db.cleaner().clean();
+        }
     }
 
     public static void createDirectory(String directory)
@@ -310,6 +375,13 @@
 
     public static void delete(File... files)
     {
+        if (files == null)
+        {
+            // CASSANDRA-13389: some callers use Files.listFiles() which, on error, silently returns null
+            logger.debug("Received null list of files to delete");
+            return;
+        }
+
         for ( File file : files )
         {
             file.delete();
@@ -328,6 +400,22 @@
         ScheduledExecutors.nonPeriodicTasks.execute(runnable);
     }
 
+    public static void visitDirectory(Path dir, Predicate<? super File> filter, Consumer<? super File> consumer)
+    {
+        try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir))
+        {
+            StreamSupport.stream(stream.spliterator(), false)
+                         .map(Path::toFile)
+                         // stream directories are weakly consistent so we always check if the file still exists
+                         .filter(f -> f.exists() && (filter == null || filter.test(f)))
+                         .forEach(consumer);
+        }
+        catch (IOException|DirectoryIteratorException ex)
+        {
+            logger.error("Failed to list files in {} with exception: {}", dir, ex.getMessage(), ex);
+        }
+    }
+
     public static String stringifyFileSize(double value)
     {
         double d;
@@ -397,18 +485,6 @@
         dir.deleteOnExit();
     }
 
-    public static void skipBytesFully(DataInput in, int bytes) throws IOException
-    {
-        int n = 0;
-        while (n < bytes)
-        {
-            int skipped = in.skipBytes(bytes - n);
-            if (skipped == 0)
-                throw new EOFException("EOF after " + n + " bytes out of " + bytes);
-            n += skipped;
-        }
-    }
-
     public static void handleCorruptSSTable(CorruptSSTableException e)
     {
         FSErrorHandler handler = fsErrorHandler.get();
@@ -422,6 +498,21 @@
         if (handler != null)
             handler.handleFSError(e);
     }
+
+    /**
+     * handleFSErrorAndPropagate will invoke the disk failure policy error handler,
+     * which may or may not stop the daemon or transports. However, if we don't exit,
+     * we still want to propagate the exception to the caller in case they have custom
+     * exception handling
+     *
+     * @param e A filesystem error
+     */
+    public static void handleFSErrorAndPropagate(FSError e)
+    {
+        handleFSError(e);
+        throw propagate(e);
+    }
+
     /**
      * Get the size of a directory in bytes
      * @param directory The directory for which we need size.
@@ -475,6 +566,57 @@
         return false;
     }
 
+    public static void append(File file, String ... lines)
+    {
+        if (file.exists())
+            write(file, Arrays.asList(lines), StandardOpenOption.APPEND);
+        else
+            write(file, Arrays.asList(lines), StandardOpenOption.CREATE);
+    }
+
+    public static void appendAndSync(File file, String ... lines)
+    {
+        if (file.exists())
+            write(file, Arrays.asList(lines), StandardOpenOption.APPEND, StandardOpenOption.SYNC);
+        else
+            write(file, Arrays.asList(lines), StandardOpenOption.CREATE, StandardOpenOption.SYNC);
+    }
+
+    public static void replace(File file, String ... lines)
+    {
+        write(file, Arrays.asList(lines), StandardOpenOption.TRUNCATE_EXISTING);
+    }
+
+    public static void write(File file, List<String> lines, StandardOpenOption ... options)
+    {
+        try
+        {
+            Files.write(file.toPath(),
+                        lines,
+                        CHARSET,
+                        options);
+        }
+        catch (IOException ex)
+        {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    public static List<String> readLines(File file)
+    {
+        try
+        {
+            return Files.readAllLines(file.toPath(), CHARSET);
+        }
+        catch (IOException ex)
+        {
+            if (ex instanceof NoSuchFileException)
+                return Collections.emptyList();
+
+            throw new RuntimeException(ex);
+        }
+    }
+
     public static void setFSErrorHandler(FSErrorHandler handler)
     {
         fsErrorHandler.getAndSet(handler);

diff --git a/src/java/org/apache/cassandra/io/util/ICompressedFile.java b/src/java/org/apache/cassandra/io/util/ICompressedFile.java
index ce7b22c..43cef8c 100644
--- a/src/java/org/apache/cassandra/io/util/ICompressedFile.java
+++ b/src/java/org/apache/cassandra/io/util/ICompressedFile.java

@@ -17,14 +17,12 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.nio.MappedByteBuffer;
-import java.util.TreeMap;
-
 import org.apache.cassandra.io.compress.CompressionMetadata;
 
 public interface ICompressedFile
 {
-    public ChannelProxy channel();
-    public CompressionMetadata getMetadata();
-    public TreeMap<Long, MappedByteBuffer> chunkSegments();
+    ChannelProxy channel();
+    CompressionMetadata getMetadata();
+    MmappedRegions regions();
+
 }

diff --git a/src/java/org/apache/cassandra/io/util/Memory.java b/src/java/org/apache/cassandra/io/util/Memory.java
index 07d3ca3..78950ce 100644
--- a/src/java/org/apache/cassandra/io/util/Memory.java
+++ b/src/java/org/apache/cassandra/io/util/Memory.java

@@ -59,7 +59,7 @@
     {
         String arch = System.getProperty("os.arch");
         unaligned = arch.equals("i386") || arch.equals("x86")
-                    || arch.equals("amd64") || arch.equals("x86_64");
+                    || arch.equals("amd64") || arch.equals("x86_64") || arch.equals("s390x");
     }
 
     protected long peer;

diff --git a/src/java/org/apache/cassandra/io/util/MemoryInputStream.java b/src/java/org/apache/cassandra/io/util/MemoryInputStream.java
index 45261e0..e009528 100644
--- a/src/java/org/apache/cassandra/io/util/MemoryInputStream.java
+++ b/src/java/org/apache/cassandra/io/util/MemoryInputStream.java

@@ -19,50 +19,58 @@
 
 import java.io.DataInput;
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 
-public class MemoryInputStream extends AbstractDataInput implements DataInput
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.primitives.Ints;
+
+import org.apache.cassandra.utils.memory.MemoryUtil;
+
+public class MemoryInputStream extends RebufferingInputStream implements DataInput
 {
     private final Memory mem;
-    private int position = 0;
+    private final int bufferSize;
+    private long offset;
+
 
     public MemoryInputStream(Memory mem)
     {
+        this(mem, Ints.saturatedCast(mem.size));
+    }
+
+    @VisibleForTesting
+    public MemoryInputStream(Memory mem, int bufferSize)
+    {
+        super(getByteBuffer(mem.peer, bufferSize));
         this.mem = mem;
+        this.bufferSize = bufferSize;
+        this.offset = mem.peer + bufferSize;
     }
 
-    public int read() throws IOException
+    @Override
+    protected void reBuffer() throws IOException
     {
-        return mem.getByte(position++) & 0xFF;
+        if (offset - mem.peer >= mem.size())
+            return;
+
+        buffer = getByteBuffer(offset, Math.min(bufferSize, Ints.saturatedCast(memRemaining())));
+        offset += buffer.capacity();
     }
 
-    public void readFully(byte[] buffer, int offset, int count) throws IOException
+    @Override
+    public int available()
     {
-        mem.getBytes(position, buffer, offset, count);
-        position += count;
+        return Ints.saturatedCast(buffer.remaining() + memRemaining());
     }
 
-    public void seek(long pos)
+    private long memRemaining()
     {
-        position = (int) pos;
+        return mem.size + mem.peer - offset;
     }
 
-    public long getPosition()
+    private static ByteBuffer getByteBuffer(long offset, int length)
     {
-        return position;
-    }
-
-    public long getPositionLimit()
-    {
-        return mem.size();
-    }
-
-    protected long length()
-    {
-        return mem.size();
-    }
-
-    public void close()
-    {
-        // do nothing.
+        return MemoryUtil.getByteBuffer(offset, length).order(ByteOrder.BIG_ENDIAN);
     }
 }

diff --git a/src/java/org/apache/cassandra/io/util/MmappedRegions.java b/src/java/org/apache/cassandra/io/util/MmappedRegions.java
new file mode 100644
index 0000000..8f6cd92
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/MmappedRegions.java

@@ -0,0 +1,344 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Arrays;
+
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.concurrent.RefCounted;
+import org.apache.cassandra.utils.concurrent.SharedCloseableImpl;
+
+import static java.util.stream.Stream.of;
+import static org.apache.cassandra.utils.Throwables.perform;
+
+public class MmappedRegions extends SharedCloseableImpl
+{
+    /** In a perfect world, MAX_SEGMENT_SIZE would be final, but we need to test with a smaller size */
+    public static int MAX_SEGMENT_SIZE = Integer.MAX_VALUE;
+
+    /** When we need to grow the arrays, we add this number of region slots */
+    static final int REGION_ALLOC_SIZE = 15;
+
+    /** The original state, which is shared with the tidier and
+     * contains all the regions mapped so far. It also
+     * does the actual mapping. */
+    private final State state;
+
+    /** A copy of the latest state. We update this each time the original state is
+     * updated and we share this with copies. If we are a copy, then this
+     * is null. Copies can only access existing regions, they cannot create
+     * new ones. This is for thread safety and because MmappedRegions is
+     * reference counted, only the original state will be cleaned-up,
+     * therefore only the original state should create new mapped regions.
+     */
+    private volatile State copy;
+
+    private MmappedRegions(ChannelProxy channel, CompressionMetadata metadata, long length)
+    {
+        this(new State(channel), metadata, length);
+    }
+
+    private MmappedRegions(State state, CompressionMetadata metadata, long length)
+    {
+        super(new Tidier(state));
+
+        this.state = state;
+
+        if (metadata != null)
+        {
+            assert length == 0 : "expected no length with metadata";
+            updateState(metadata);
+        }
+        else if (length > 0)
+        {
+            updateState(length);
+        }
+
+        this.copy = new State(state);
+    }
+
+    private MmappedRegions(MmappedRegions original)
+    {
+        super(original);
+        this.state = original.copy;
+    }
+
+    public static MmappedRegions empty(ChannelProxy channel)
+    {
+        return new MmappedRegions(channel, null, 0);
+    }
+
+    public static MmappedRegions map(ChannelProxy channel, CompressionMetadata metadata)
+    {
+        if (metadata == null)
+            throw new IllegalArgumentException("metadata cannot be null");
+
+        return new MmappedRegions(channel, metadata, 0);
+    }
+
+    public static MmappedRegions map(ChannelProxy channel, long length)
+    {
+        if (length <= 0)
+            throw new IllegalArgumentException("Length must be positive");
+
+        return new MmappedRegions(channel, null, length);
+    }
+
+    /**
+     * @return a snapshot of the memory mapped regions. The snapshot can
+     * only use existing regions, it cannot create new ones.
+     */
+    public MmappedRegions sharedCopy()
+    {
+        return new MmappedRegions(this);
+    }
+
+    private boolean isCopy()
+    {
+        return copy == null;
+    }
+
+    public void extend(long length)
+    {
+        if (length < 0)
+            throw new IllegalArgumentException("Length must not be negative");
+
+        assert !isCopy() : "Copies cannot be extended";
+
+        if (length <= state.length)
+            return;
+
+        updateState(length);
+        copy = new State(state);
+    }
+
+    private void updateState(long length)
+    {
+        state.length = length;
+        long pos = state.getPosition();
+        while (pos < length)
+        {
+            long size = Math.min(MAX_SEGMENT_SIZE, length - pos);
+            state.add(pos, size);
+            pos += size;
+        }
+    }
+
+    private void updateState(CompressionMetadata metadata)
+    {
+        long offset = 0;
+        long lastSegmentOffset = 0;
+        long segmentSize = 0;
+
+        while (offset < metadata.dataLength)
+        {
+            CompressionMetadata.Chunk chunk = metadata.chunkFor(offset);
+
+            //Reached a new mmap boundary
+            if (segmentSize + chunk.length + 4 > MAX_SEGMENT_SIZE)
+            {
+                if (segmentSize > 0)
+                {
+                    state.add(lastSegmentOffset, segmentSize);
+                    lastSegmentOffset += segmentSize;
+                    segmentSize = 0;
+                }
+            }
+
+            segmentSize += chunk.length + 4; //checksum
+            offset += metadata.chunkLength();
+        }
+
+        if (segmentSize > 0)
+            state.add(lastSegmentOffset, segmentSize);
+
+        state.length = lastSegmentOffset + segmentSize;
+    }
+
+    public boolean isValid(ChannelProxy channel)
+    {
+        return state.isValid(channel);
+    }
+
+    public boolean isEmpty()
+    {
+        return state.isEmpty();
+    }
+
+    public Region floor(long position)
+    {
+        assert !isCleanedUp() : "Attempted to use closed region";
+        return state.floor(position);
+    }
+
+    public static final class Region
+    {
+        public final long offset;
+        public final ByteBuffer buffer;
+
+        public Region(long offset, ByteBuffer buffer)
+        {
+            this.offset = offset;
+            this.buffer = buffer;
+        }
+
+        public long bottom()
+        {
+            return offset;
+        }
+
+        public long top()
+        {
+            return offset + buffer.capacity();
+        }
+    }
+
+    private static final class State
+    {
+        /** The file channel */
+        private final ChannelProxy channel;
+
+        /** An array of region buffers, synchronized with offsets */
+        private ByteBuffer[] buffers;
+
+        /** An array of region offsets, synchronized with buffers */
+        private long[] offsets;
+
+        /** The maximum file length we have mapped */
+        private long length;
+
+        /** The index to the last region added */
+        private int last;
+
+        private State(ChannelProxy channel)
+        {
+            this.channel = channel.sharedCopy();
+            this.buffers = new ByteBuffer[REGION_ALLOC_SIZE];
+            this.offsets = new long[REGION_ALLOC_SIZE];
+            this.length = 0;
+            this.last = -1;
+        }
+
+        private State(State original)
+        {
+            this.channel = original.channel;
+            this.buffers = original.buffers;
+            this.offsets = original.offsets;
+            this.length = original.length;
+            this.last = original.last;
+        }
+
+        private boolean isEmpty()
+        {
+            return last < 0;
+        }
+
+        private boolean isValid(ChannelProxy channel)
+        {
+            return this.channel.filePath().equals(channel.filePath());
+        }
+
+        private Region floor(long position)
+        {
+            assert 0 <= position && position < length : String.format("%d >= %d", position, length);
+
+            int idx = Arrays.binarySearch(offsets, 0, last +1, position);
+            assert idx != -1 : String.format("Bad position %d for regions %s, last %d in %s", position, Arrays.toString(offsets), last, channel);
+            if (idx < 0)
+                idx = -(idx + 2); // round down to entry at insertion point
+
+            return new Region(offsets[idx], buffers[idx]);
+        }
+
+        private long getPosition()
+        {
+            return last < 0 ? 0 : offsets[last] + buffers[last].capacity();
+        }
+
+        private void add(long pos, long size)
+        {
+            ByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, pos, size);
+
+            ++last;
+
+            if (last == offsets.length)
+            {
+                offsets = Arrays.copyOf(offsets, offsets.length + REGION_ALLOC_SIZE);
+                buffers = Arrays.copyOf(buffers, buffers.length + REGION_ALLOC_SIZE);
+            }
+
+            offsets[last] = pos;
+            buffers[last] = buffer;
+        }
+
+        private Throwable close(Throwable accumulate)
+        {
+            accumulate = channel.close(accumulate);
+
+            /*
+             * Try forcing the unmapping of segments using undocumented unsafe sun APIs.
+             * If this fails (non Sun JVM), we'll have to wait for the GC to finalize the mapping.
+             * If this works and a thread tries to access any segment, hell will unleash on earth.
+             */
+            if (!FileUtils.isCleanerAvailable())
+                return accumulate;
+
+            return perform(accumulate, channel.filePath(), Throwables.FileOpType.READ,
+                           of(buffers)
+                           .map((buffer) ->
+                                () ->
+                                {
+                                    if (buffer != null)
+                                        FileUtils.clean(buffer);
+                                }));
+        }
+    }
+
+    public static final class Tidier implements RefCounted.Tidy
+    {
+        final State state;
+
+        Tidier(State state)
+        {
+            this.state = state;
+        }
+
+        public String name()
+        {
+            return state.channel.filePath();
+        }
+
+        public void tidy()
+        {
+            try
+            {
+                Throwables.maybeFail(state.close(null));
+            }
+            catch (Exception e)
+            {
+                throw new FSReadError(e, state.channel.filePath());
+            }
+        }
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java b/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java
index 70ac77a..5f56ff6 100644
--- a/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/MmappedSegmentedFile.java

@@ -18,48 +18,31 @@
 package org.apache.cassandra.io.util;
 
 import java.io.*;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
 
-import com.google.common.annotations.VisibleForTesting;
+import com.google.common.util.concurrent.RateLimiter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.IndexSummary;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 
 public class MmappedSegmentedFile extends SegmentedFile
 {
     private static final Logger logger = LoggerFactory.getLogger(MmappedSegmentedFile.class);
 
-    // in a perfect world, MAX_SEGMENT_SIZE would be final, but we need to test with a smaller size to stay sane.
-    public static long MAX_SEGMENT_SIZE = Integer.MAX_VALUE;
+    private final MmappedRegions regions;
 
-    /**
-     * Sorted array of segment offsets and MappedByteBuffers for segments. If mmap is completely disabled, or if the
-     * segment would be too long to mmap, the value for an offset will be null, indicating that we need to fall back
-     * to a RandomAccessFile.
-     */
-    private final Segment[] segments;
-
-    public MmappedSegmentedFile(ChannelProxy channel, long length, Segment[] segments)
+    public MmappedSegmentedFile(ChannelProxy channel, int bufferSize, long length, MmappedRegions regions)
     {
-        super(new Cleanup(channel, segments), channel, length);
-        this.segments = segments;
+        super(new Cleanup(channel, regions), channel, bufferSize, length);
+        this.regions = regions;
     }
 
     private MmappedSegmentedFile(MmappedSegmentedFile copy)
     {
         super(copy);
-        this.segments = copy.segments;
+        this.regions = copy.regions;
     }
 
     public MmappedSegmentedFile sharedCopy()
@@ -67,352 +50,112 @@
         return new MmappedSegmentedFile(this);
     }
 
-    /**
-     * @return The segment entry for the given position.
-     */
-    private Segment floor(long position)
+    public RandomAccessReader createReader()
     {
-        assert 0 <= position && position < length: String.format("%d >= %d in %s", position, length, path());
-        Segment seg = new Segment(position, null);
-        int idx = Arrays.binarySearch(segments, seg);
-        assert idx != -1 : String.format("Bad position %d for segments %s in %s", position, Arrays.toString(segments), path());
-        if (idx < 0)
-            // round down to entry at insertion point
-            idx = -(idx + 2);
-        return segments[idx];
+        return new RandomAccessReader.Builder(channel)
+               .overrideLength(length)
+               .regions(regions)
+               .build();
     }
 
-    /**
-     * @return The segment containing the given position: must be closed after use.
-     */
-    public FileDataInput getSegment(long position)
+    public RandomAccessReader createReader(RateLimiter limiter)
     {
-        Segment segment = floor(position);
-        if (segment.right != null)
-        {
-            // segment is mmap'd
-            return new ByteBufferDataInput(segment.right, path(), segment.left, (int) (position - segment.left));
-        }
-
-        // we can have single cells or partitions larger than 2Gb, which is our maximum addressable range in a single segment;
-        // in this case we open as a normal random access reader
-        // FIXME: brafs are unbounded, so this segment will cover the rest of the file, rather than just the row
-        RandomAccessReader file = RandomAccessReader.open(channel);
-        file.seek(position);
-        return file;
-    }
-
-    @Override
-    public long[] copyReadableBounds()
-    {
-        long[] bounds  = new long[segments.length + 1];
-        for (int i = 0; i < segments.length; i++)
-            bounds[i] = segments[i].left;
-        bounds[segments.length] = length;
-        return bounds;
+        return new RandomAccessReader.Builder(channel)
+               .overrideLength(length)
+               .bufferSize(bufferSize)
+               .regions(regions)
+               .limiter(limiter)
+               .build();
     }
 
     private static final class Cleanup extends SegmentedFile.Cleanup
     {
-        final Segment[] segments;
-        protected Cleanup(ChannelProxy channel, Segment[] segments)
+        private final MmappedRegions regions;
+
+        Cleanup(ChannelProxy channel, MmappedRegions regions)
         {
             super(channel);
-            this.segments = segments;
+            this.regions = regions;
         }
 
         public void tidy()
         {
-            super.tidy();
-
-            if (!FileUtils.isCleanerAvailable())
-                return;
-
-        /*
-         * Try forcing the unmapping of segments using undocumented unsafe sun APIs.
-         * If this fails (non Sun JVM), we'll have to wait for the GC to finalize the mapping.
-         * If this works and a thread tries to access any segment, hell will unleash on earth.
-         */
-            try
+            Throwable err = regions.close(null);
+            if (err != null)
             {
-                for (Segment segment : segments)
-                {
-                    if (segment.right == null)
-                        continue;
-                    FileUtils.clean(segment.right);
-                }
-                logger.trace("All segments have been unmapped successfully");
-            }
-            catch (Exception e)
-            {
-                JVMStabilityInspector.inspectThrowable(e);
+                JVMStabilityInspector.inspectThrowable(err);
+
                 // This is not supposed to happen
-                logger.error("Error while unmapping segments", e);
+                logger.error("Error while closing mmapped regions", err);
             }
-        }
-    }
 
-    // see CASSANDRA-10357
-    public static boolean maybeRepair(CFMetaData metadata, Descriptor descriptor, IndexSummary indexSummary, SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder)
-    {
-        boolean mayNeedRepair = false;
-        if (ibuilder instanceof Builder)
-            mayNeedRepair = ((Builder) ibuilder).mayNeedRepair(descriptor.filenameFor(Component.PRIMARY_INDEX));
-        if (dbuilder instanceof Builder)
-            mayNeedRepair |= ((Builder) dbuilder).mayNeedRepair(descriptor.filenameFor(Component.DATA));
-
-        if (mayNeedRepair)
-            forceRepair(metadata, descriptor, indexSummary, ibuilder, dbuilder);
-        return mayNeedRepair;
-    }
-
-    // if one of the index/data files have boundaries larger than we can mmap, and they were written by a version that did not guarantee correct boundaries were saved,
-    // rebuild the boundaries and save them again
-    private static void forceRepair(CFMetaData metadata, Descriptor descriptor, IndexSummary indexSummary, SegmentedFile.Builder ibuilder, SegmentedFile.Builder dbuilder)
-    {
-        if (ibuilder instanceof Builder)
-            ((Builder) ibuilder).boundaries.clear();
-        if (dbuilder instanceof Builder)
-            ((Builder) dbuilder).boundaries.clear();
-
-        RowIndexEntry.IndexSerializer rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata);
-        try (RandomAccessFile raf = new RandomAccessFile(descriptor.filenameFor(Component.PRIMARY_INDEX), "r");)
-        {
-            long iprev = 0, dprev = 0;
-            for (int i = 0; i < indexSummary.size(); i++)
-            {
-                // first read the position in the summary, and read the corresponding position in the data file
-                long icur = indexSummary.getPosition(i);
-                raf.seek(icur);
-                ByteBufferUtil.readWithShortLength(raf);
-                RowIndexEntry rie = rowIndexEntrySerializer.deserialize(raf, descriptor.version);
-                long dcur = rie.position;
-
-                // if these positions are small enough to map out a segment from the prior version (i.e. less than 2Gb),
-                // just add these as a boundary and proceed to the next index summary record; most scenarios will be
-                // served by this, keeping the cost of rebuild to a minimum.
-
-                if (Math.max(icur - iprev , dcur - dprev) > MAX_SEGMENT_SIZE)
-                {
-                    // otherwise, loop over its index block, providing each RIE as a potential boundary for both files
-                    raf.seek(iprev);
-                    while (raf.getFilePointer() < icur)
-                    {
-                        // add the position of this record in the index file as an index file boundary
-                        ibuilder.addPotentialBoundary(raf.getFilePointer());
-                        // then read the RIE, and add its data file position as a boundary for the data file
-                        ByteBufferUtil.readWithShortLength(raf);
-                        rie = rowIndexEntrySerializer.deserialize(raf, descriptor.version);
-                        dbuilder.addPotentialBoundary(rie.position);
-                    }
-                }
-
-                ibuilder.addPotentialBoundary(icur);
-                dbuilder.addPotentialBoundary(dcur);
-
-                iprev = icur;
-                dprev = dcur;
-            }
-        }
-        catch (IOException e)
-        {
-            logger.error("Failed to recalculate boundaries for {}; mmap access may degrade to buffered for this file", descriptor);
+            super.tidy();
         }
     }
 
     /**
      * Overrides the default behaviour to create segments of a maximum size.
      */
-    public static class Builder extends SegmentedFile.Builder
+    static class Builder extends SegmentedFile.Builder
     {
-        @VisibleForTesting
-        public static class Boundaries
-        {
-            private long[] boundaries;
+        private MmappedRegions regions;
 
-            // number of boundaries we have "fixed" (i.e. have determined the final value of)
-            private int fixedCount;
-
-            public Boundaries()
-            {
-                // we always have a boundary of zero, so we start with a fixedCount of 1
-                this(new long[8], 1);
-            }
-
-            public Boundaries(long[] boundaries, int fixedCount)
-            {
-                init(boundaries, fixedCount);
-            }
-
-            void init(long[] boundaries, int fixedCount)
-            {
-                this.boundaries = boundaries;
-                this.fixedCount = fixedCount;
-            }
-
-            public void addCandidate(long candidate)
-            {
-                // we make sure we have room before adding another element, so that we can share the addCandidate logic statically
-                boundaries = ensureCapacity(boundaries, fixedCount);
-                fixedCount = addCandidate(boundaries, fixedCount, candidate);
-            }
-
-            private static int addCandidate(long[] boundaries, int fixedCount, long candidate)
-            {
-                // check how far we are from the last fixed boundary
-                long delta = candidate - boundaries[fixedCount - 1];
-                assert delta >= 0;
-                if (delta != 0)
-                {
-                    if (delta <= MAX_SEGMENT_SIZE)
-                        // overwrite the unfixed (potential) boundary if the resultant segment would still be mmappable
-                        boundaries[fixedCount] = candidate;
-                    else if (boundaries[fixedCount] == 0)
-                        // or, if it is not initialised, we cannot make an mmapped segment here, so this is the fixed boundary
-                        boundaries[fixedCount++] = candidate;
-                    else
-                        // otherwise, fix the prior boundary and initialise our unfixed boundary
-                        boundaries[++fixedCount] = candidate;
-                }
-                return fixedCount;
-            }
-
-            // ensures there is room for another fixed boundary AND an unfixed candidate boundary, i.e. fixedCount + 2 items
-            private static long[] ensureCapacity(long[] boundaries, int fixedCount)
-            {
-                if (fixedCount + 1 >= boundaries.length)
-                    return Arrays.copyOf(boundaries, Math.max(fixedCount + 2, boundaries.length * 2));
-                return boundaries;
-            }
-
-            void clear()
-            {
-                fixedCount = 1;
-                Arrays.fill(boundaries, 0);
-            }
-
-            // returns the fixed boundaries, truncated to a correctly sized long[]
-            public long[] truncate()
-            {
-                return Arrays.copyOf(boundaries, fixedCount);
-            }
-
-            // returns the finished boundaries for the provided length, truncated to a correctly sized long[]
-            public long[] finish(long length, boolean isFinal)
-            {
-                assert length > 0;
-                // ensure there's room for the length to be added
-                boundaries = ensureCapacity(boundaries, fixedCount);
-
-                // clone our current contents, so we don't corrupt them
-                int fixedCount = this.fixedCount;
-                long[] boundaries = this.boundaries.clone();
-
-                // if we're finishing early, our length may be before some of our boundaries,
-                // so walk backwards until our boundaries are <= length
-                while (boundaries[fixedCount - 1] >= length)
-                    boundaries[fixedCount--] = 0;
-                if (boundaries[fixedCount] >= length)
-                    boundaries[fixedCount] = 0;
-
-                // add our length as a boundary
-                fixedCount = addCandidate(boundaries, fixedCount, length);
-
-                // if we have any unfixed boundary at the end, it's now fixed, since we're done
-                if (boundaries[fixedCount] != 0)
-                    fixedCount++;
-
-                boundaries = Arrays.copyOf(boundaries, fixedCount);
-                if (isFinal)
-                {
-                    // if this is the final one, save it
-                    this.boundaries = boundaries;
-                    this.fixedCount = fixedCount;
-                }
-                return boundaries;
-            }
-        }
-
-        private final Boundaries boundaries = new Boundaries();
-
-        public Builder()
+        Builder()
         {
             super();
         }
 
-        public long[] boundaries()
-        {
-            return boundaries.truncate();
-        }
-
-        // indicates if we may need to repair the mmapped file boundaries. this is a cheap check to see if there
-        // are any spans larger than an mmap segment size, which should be rare to occur in practice.
-        boolean mayNeedRepair(String path)
-        {
-            // old boundaries were created without the length, so add it as a candidate
-            long length = new File(path).length();
-            boundaries.addCandidate(length);
-            long[] boundaries = this.boundaries.truncate();
-
-            long prev = 0;
-            for (long boundary : boundaries)
-            {
-                if (boundary - prev > MAX_SEGMENT_SIZE)
-                    return true;
-                prev = boundary;
-            }
-            return false;
-        }
-
-        public void addPotentialBoundary(long boundary)
-        {
-            boundaries.addCandidate(boundary);
-        }
-
-        public SegmentedFile complete(ChannelProxy channel, long overrideLength)
+        public SegmentedFile complete(ChannelProxy channel, int bufferSize, long overrideLength)
         {
             long length = overrideLength > 0 ? overrideLength : channel.size();
-            // create the segments
+            updateRegions(channel, length);
 
-            long[] boundaries = this.boundaries.finish(length, overrideLength <= 0);
+            return new MmappedSegmentedFile(channel, bufferSize, length, regions.sharedCopy());
+        }
 
-            int segcount = boundaries.length - 1;
-            Segment[] segments = new Segment[segcount];
-
-            for (int i = 0; i < segcount; i++)
+        private void updateRegions(ChannelProxy channel, long length)
+        {
+            if (regions != null && !regions.isValid(channel))
             {
-                long start = boundaries[i];
-                long size = boundaries[i + 1] - start;
-                MappedByteBuffer segment = size <= MAX_SEGMENT_SIZE
-                                           ? channel.map(FileChannel.MapMode.READ_ONLY, start, size)
-                                           : null;
-                segments[i] = new Segment(start, segment);
+                Throwable err = regions.close(null);
+                if (err != null)
+                    logger.error("Failed to close mapped regions", err);
+
+                regions = null;
             }
 
-            return new MmappedSegmentedFile(channel, length, segments);
+            if (regions == null)
+                regions = MmappedRegions.map(channel, length);
+            else
+                regions.extend(length);
         }
 
         @Override
-        public void serializeBounds(DataOutput out) throws IOException
+        public void serializeBounds(DataOutput out, Version version) throws IOException
         {
-            super.serializeBounds(out);
-            long[] boundaries = this.boundaries.truncate();
-            out.writeInt(boundaries.length);
-            for (long boundary : boundaries)
-                out.writeLong(boundary);
+            if (!version.hasBoundaries())
+                return;
+
+            super.serializeBounds(out, version);
+            out.writeInt(0);
         }
 
         @Override
-        public void deserializeBounds(DataInput in) throws IOException
+        public void deserializeBounds(DataInput in, Version version) throws IOException
         {
-            super.deserializeBounds(in);
+            if (!version.hasBoundaries())
+                return;
 
-            int size = in.readInt();
-            long[] boundaries = new long[size];
-            for (int i = 0; i < size; i++)
-                boundaries[i] = in.readLong();
+            super.deserializeBounds(in, version);
+            in.skipBytes(in.readInt() * TypeSizes.sizeof(0L));
+        }
 
-            this.boundaries.init(boundaries, size);
+        @Override
+        public Throwable close(Throwable accumulate)
+        {
+            return super.close(regions == null
+                               ? accumulate
+                               : regions.close(accumulate));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/io/util/NIODataInputStream.java b/src/java/org/apache/cassandra/io/util/NIODataInputStream.java
index ebeb8ba..c75d44f 100644
--- a/src/java/org/apache/cassandra/io/util/NIODataInputStream.java
+++ b/src/java/org/apache/cassandra/io/util/NIODataInputStream.java

@@ -17,12 +17,7 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.io.Closeable;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.EOFException;
 import java.io.IOException;
-import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.ReadableByteChannel;
 import java.nio.channels.SeekableByteChannel;
@@ -41,274 +36,56 @@
  *
  * NIODataInputStream is not thread safe.
  */
-public class NIODataInputStream extends InputStream implements DataInput, Closeable
+public class NIODataInputStream extends RebufferingInputStream
 {
-    private final ReadableByteChannel rbc;
-    private ByteBuffer buf;
+    protected final ReadableByteChannel channel;
 
-
-    public NIODataInputStream(ReadableByteChannel rbc, int bufferSize)
+    private static ByteBuffer makeBuffer(int bufferSize)
     {
-        Preconditions.checkNotNull(rbc);
-        Preconditions.checkArgument(bufferSize >= 8, "Buffer size must be large enough to accomadate a long/double");
-        this.rbc = rbc;
-        buf = ByteBuffer.allocateDirect(bufferSize);
-        buf.position(0);
-        buf.limit(0);
+        ByteBuffer buffer = ByteBuffer.allocateDirect(bufferSize);
+        buffer.position(0);
+        buffer.limit(0);
+
+        return buffer;
+    }
+
+    public NIODataInputStream(ReadableByteChannel channel, int bufferSize)
+    {
+        super(makeBuffer(bufferSize));
+
+        Preconditions.checkNotNull(channel);
+        this.channel = channel;
     }
 
     @Override
-    public void readFully(byte[] b) throws IOException
+    protected void reBuffer() throws IOException
     {
-        readFully(b, 0, b.length);
-    }
+        Preconditions.checkState(buffer.remaining() == 0);
+        buffer.clear();
 
+        while ((channel.read(buffer)) == 0) {}
 
-    @Override
-    public void readFully(byte[] b, int off, int len) throws IOException
-    {
-        int copied = 0;
-        while (copied < len)
-        {
-            int read = read(b, off + copied, len - copied);
-            if (read < 0)
-                throw new EOFException();
-            copied += read;
-        }
-    }
-
-    @Override
-    public int read(byte b[], int off, int len) throws IOException {
-        if (b == null)
-            throw new NullPointerException();
-
-        // avoid int overflow
-        if (off < 0 || off > b.length || len < 0
-                || len > b.length - off)
-            throw new IndexOutOfBoundsException();
-
-        if (len == 0)
-            return 0;
-
-        int copied = 0;
-        while (copied < len)
-        {
-            if (buf.hasRemaining())
-            {
-                int toCopy = Math.min(len - copied, buf.remaining());
-                buf.get(b, off + copied, toCopy);
-                copied += toCopy;
-            }
-            else
-            {
-                int read = readNext();
-                if (read < 0 && copied == 0) return -1;
-                if (read <= 0) return copied;
-            }
-        }
-
-        return copied;
-    }
-
-    /*
-     * Refill the buffer, preserving any unread bytes remaining in the buffer
-     */
-    private int readNext() throws IOException
-    {
-        Preconditions.checkState(buf.remaining() != buf.capacity());
-        assert(buf.remaining() < 8);
-
-        /*
-         * If there is data already at the start of the buffer, move the position to the end
-         * If there is data but not at the start, move it to the start
-         * Otherwise move the position to 0 so writes start at the beginning of the buffer
-         *
-         * We go to the trouble of shuffling the bytes remaining for cases where the buffer isn't fully drained
-         * while retrieving a multi-byte value while the position is in the middle.
-         */
-        if (buf.position() == 0 && buf.hasRemaining())
-        {
-            buf.position(buf.limit());
-        }
-        else if (buf.hasRemaining())
-        {
-            ByteBuffer dup = buf.duplicate();
-            buf.clear();
-            buf.put(dup);
-        }
-        else
-        {
-            buf.position(0);
-        }
-
-        buf.limit(buf.capacity());
-
-        int read = 0;
-        while ((read = rbc.read(buf)) == 0) {}
-
-        buf.flip();
-
-        return read;
-    }
-
-    /*
-     * Read at least minimum bytes and throw EOF if that fails
-     */
-    private void readMinimum(int minimum) throws IOException
-    {
-        assert(buf.remaining() < 8);
-        while (buf.remaining() < minimum)
-        {
-            int read = readNext();
-            if (read == -1)
-            {
-                //DataInputStream consumes the bytes even if it doesn't get the entire value, match the behavior here
-                buf.position(0);
-                buf.limit(0);
-                throw new EOFException();
-            }
-        }
-    }
-
-    /*
-     * Ensure the buffer contains the minimum number of readable bytes
-     */
-    private void prepareReadPrimitive(int minimum) throws IOException
-    {
-        if (buf.remaining() < minimum) readMinimum(minimum);
-    }
-
-    @Override
-    public int skipBytes(int n) throws IOException
-    {
-        int skipped = 0;
-
-        while (skipped < n)
-        {
-            int skippedThisTime = (int)skip(n - skipped);
-            if (skippedThisTime <= 0) break;
-            skipped += skippedThisTime;
-        }
-
-        return skipped;
-    }
-
-    @Override
-    public boolean readBoolean() throws IOException
-    {
-        prepareReadPrimitive(1);
-        return buf.get() != 0;
-    }
-
-    @Override
-    public byte readByte() throws IOException
-    {
-        prepareReadPrimitive(1);
-        return buf.get();
-    }
-
-    @Override
-    public int readUnsignedByte() throws IOException
-    {
-        prepareReadPrimitive(1);
-        return buf.get() & 0xff;
-    }
-
-    @Override
-    public short readShort() throws IOException
-    {
-        prepareReadPrimitive(2);
-        return buf.getShort();
-    }
-
-    @Override
-    public int readUnsignedShort() throws IOException
-    {
-        return readShort() & 0xFFFF;
-    }
-
-    @Override
-    public char readChar() throws IOException
-    {
-        prepareReadPrimitive(2);
-        return buf.getChar();
-    }
-
-    @Override
-    public int readInt() throws IOException
-    {
-        prepareReadPrimitive(4);
-        return buf.getInt();
-    }
-
-    @Override
-    public long readLong() throws IOException
-    {
-        prepareReadPrimitive(8);
-        return buf.getLong();
-    }
-
-    @Override
-    public float readFloat() throws IOException
-    {
-        prepareReadPrimitive(4);
-        return buf.getFloat();
-    }
-
-    @Override
-    public double readDouble() throws IOException
-    {
-        prepareReadPrimitive(8);
-        return buf.getDouble();
-    }
-
-    @Override
-    public String readLine() throws IOException
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public String readUTF() throws IOException
-    {
-        return DataInputStream.readUTF(this);
+        buffer.flip();
     }
 
     @Override
     public void close() throws IOException
     {
-        rbc.close();
-        FileUtils.clean(buf);
-        buf = null;
-    }
-
-    @Override
-    public int read() throws IOException
-    {
-        return readUnsignedByte();
+        channel.close();
+        super.close();
+        FileUtils.clean(buffer);
+        buffer = null;
     }
 
     @Override
     public int available() throws IOException
     {
-        if (rbc instanceof SeekableByteChannel)
+        if (channel instanceof SeekableByteChannel)
         {
-            SeekableByteChannel sbc = (SeekableByteChannel)rbc;
+            SeekableByteChannel sbc = (SeekableByteChannel) channel;
             long remainder = Math.max(0, sbc.size() - sbc.position());
-            return (remainder > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)(remainder + buf.remaining());
+            return (remainder > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)(remainder + buffer.remaining());
         }
-        return buf.remaining();
-    }
-
-    @Override
-    public void reset() throws IOException
-    {
-        throw new IOException("mark/reset not supported");
-    }
-
-    @Override
-    public boolean markSupported()
-    {
-        return false;
+        return buffer.remaining();
     }
 }

diff --git a/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java b/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java
deleted file mode 100644
index a5fa20b..0000000
--- a/src/java/org/apache/cassandra/io/util/PoolingSegmentedFile.java
+++ /dev/null

@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.io.util;
-
-import org.apache.cassandra.service.FileCacheService;
-
-public abstract class PoolingSegmentedFile extends SegmentedFile
-{
-    final FileCacheService.CacheKey cacheKey;
-    protected PoolingSegmentedFile(Cleanup cleanup, ChannelProxy channel, long length)
-    {
-        this(cleanup, channel, length, length);
-    }
-
-    protected PoolingSegmentedFile(Cleanup cleanup, ChannelProxy channel, long length, long onDiskLength)
-    {
-        super(cleanup, channel, length, onDiskLength);
-        cacheKey = cleanup.cacheKey;
-    }
-
-    public PoolingSegmentedFile(PoolingSegmentedFile copy)
-    {
-        super(copy);
-        cacheKey = copy.cacheKey;
-    }
-
-    protected static class Cleanup extends SegmentedFile.Cleanup
-    {
-        final FileCacheService.CacheKey cacheKey = new FileCacheService.CacheKey();
-        protected Cleanup(ChannelProxy channel)
-        {
-            super(channel);
-        }
-        public void tidy()
-        {
-            super.tidy();
-
-            FileCacheService.instance.invalidate(cacheKey, channel.filePath());
-        }
-    }
-
-    @SuppressWarnings("resource")
-    public FileDataInput getSegment(long position)
-    {
-        RandomAccessReader reader = FileCacheService.instance.get(cacheKey);
-
-        if (reader == null)
-            reader = createPooledReader();
-
-        reader.seek(position);
-        return reader;
-    }
-
-    protected RandomAccessReader createPooledReader()
-    {
-        return RandomAccessReader.open(channel, length, this);
-    }
-
-    public void recycle(RandomAccessReader reader)
-    {
-        FileCacheService.instance.put(cacheKey, reader);
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/util/RandomAccessReader.java b/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
index 0265be5..1943773 100644
--- a/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
+++ b/src/java/org/apache/cassandra/io/util/RandomAccessReader.java

@@ -19,128 +19,147 @@
 
 import java.io.*;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 
-import com.google.common.annotations.VisibleForTesting;
+import com.google.common.primitives.Ints;
+import com.google.common.util.concurrent.RateLimiter;
 
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.compress.BufferType;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.BufferPool;
 
-public class RandomAccessReader extends AbstractDataInput implements FileDataInput
+public class RandomAccessReader extends RebufferingInputStream implements FileDataInput
 {
-    // default buffer size, 64Kb
-    public static final int DEFAULT_BUFFER_SIZE = 65536;
-    public static final int BUFFER_SIZE = Integer.getInteger("cassandra.rar_buffer_size", DEFAULT_BUFFER_SIZE);
+    // The default buffer size when the client doesn't specify it
+    public static final int DEFAULT_BUFFER_SIZE = 4096;
 
-    // buffer which will cache file blocks
-    protected ByteBuffer buffer;
+    // The maximum buffer size, we will never buffer more than this size. Further,
+    // when the limiter is not null, i.e. when throttling is enabled, we read exactly
+    // this size, since when throttling the intention is to eventually read everything,
+    // see CASSANDRA-8630
+    // NOTE: this size is chosen both for historical consistency, as a reasonable upper bound,
+    //       and because our BufferPool currently has a maximum allocation size of this.
+    public static final int MAX_BUFFER_SIZE = 1 << 16; // 64k
 
-    // `bufferOffset` is the offset of the beginning of the buffer
-    // `markedPointer` folds the offset of the last file mark
-    protected long bufferOffset, markedPointer;
-
+    // the IO channel to the file, we do not own a reference to this due to
+    // performance reasons (CASSANDRA-9379) so it's up to the owner of the RAR to
+    // ensure that the channel stays open and that it is closed afterwards
     protected final ChannelProxy channel;
 
-    // this can be overridden at construction to a value shorter than the true length of the file;
-    // if so, it acts as an imposed limit on reads, rather than a convenience property
+    // optional memory mapped regions for the channel
+    protected final MmappedRegions regions;
+
+    // An optional limiter that will throttle the amount of data we read
+    protected final RateLimiter limiter;
+
+    // the file length, this can be overridden at construction to a value shorter
+    // than the true length of the file; if so, it acts as an imposed limit on reads,
+    // required when opening sstables early not to read past the mark
     private final long fileLength;
 
-    protected final PoolingSegmentedFile owner;
+    // the buffer size for buffered readers
+    protected final int bufferSize;
 
-    protected RandomAccessReader(ChannelProxy channel, int bufferSize, long overrideLength, BufferType bufferType, PoolingSegmentedFile owner)
+    // the buffer type for buffered readers
+    protected final BufferType bufferType;
+
+    // offset from the beginning of the file
+    protected long bufferOffset;
+
+    // offset of the last file mark
+    protected long markedPointer;
+
+    protected RandomAccessReader(Builder builder)
     {
-        this.channel = channel.sharedCopy();
-        this.owner = owner;
+        super(builder.createBuffer());
 
-        // allocating required size of the buffer
-        if (bufferSize <= 0)
-            throw new IllegalArgumentException("bufferSize must be positive");
-
-        // we can cache file length in read-only mode
-        fileLength = overrideLength <= 0 ? channel.size() : overrideLength;
-
-        buffer = allocateBuffer(bufferSize, bufferType);
-        buffer.limit(0);
+        this.channel = builder.channel;
+        this.regions = builder.regions;
+        this.limiter = builder.limiter;
+        this.fileLength = builder.overrideLength <= 0 ? builder.channel.size() : builder.overrideLength;
+        this.bufferSize = builder.bufferSize;
+        this.bufferType = builder.bufferType;
+        this.buffer = builder.buffer;
     }
 
-    protected ByteBuffer allocateBuffer(int bufferSize, BufferType bufferType)
+    protected static ByteBuffer allocateBuffer(int size, BufferType bufferType)
     {
-        int size = (int) Math.min(fileLength, bufferSize);
-        return bufferType.allocate(size);
+        return BufferPool.get(size, bufferType).order(ByteOrder.BIG_ENDIAN);
     }
 
-    public static RandomAccessReader open(ChannelProxy channel, long overrideSize, PoolingSegmentedFile owner)
+    protected void releaseBuffer()
     {
-        return open(channel, BUFFER_SIZE, overrideSize, owner);
-    }
-
-    public static RandomAccessReader open(File file)
-    {
-        try (ChannelProxy channel = new ChannelProxy(file))
+        if (buffer != null)
         {
-            return open(channel);
+            if (regions == null)
+                BufferPool.put(buffer);
+            buffer = null;
         }
     }
 
-    public static RandomAccessReader open(ChannelProxy channel)
-    {
-        return open(channel, -1L);
-    }
-
-    public static RandomAccessReader open(ChannelProxy channel, long overrideSize)
-    {
-        return open(channel, BUFFER_SIZE, overrideSize, null);
-    }
-
-    @VisibleForTesting
-    static RandomAccessReader open(ChannelProxy channel, int bufferSize, PoolingSegmentedFile owner)
-    {
-        return open(channel, bufferSize, -1L, owner);
-    }
-
-    private static RandomAccessReader open(ChannelProxy channel, int bufferSize, long overrideSize, PoolingSegmentedFile owner)
-    {
-        return new RandomAccessReader(channel, bufferSize, overrideSize, BufferType.ON_HEAP, owner);
-    }
-
-    @VisibleForTesting
-    static RandomAccessReader open(SequentialWriter writer)
-    {
-        try (ChannelProxy channel = new ChannelProxy(writer.getPath()))
-        {
-            return open(channel, BUFFER_SIZE, null);
-        }
-    }
-
-    public ChannelProxy getChannel()
-    {
-        return channel;
-    }
-
     /**
      * Read data from file starting from current currentOffset to populate buffer.
      */
-    protected void reBuffer()
+    public void reBuffer()
+    {
+        if (isEOF())
+            return;
+
+        if (regions == null)
+            reBufferStandard();
+        else
+            reBufferMmap();
+
+        if (limiter != null)
+            limiter.acquire(buffer.remaining());
+
+        assert buffer.order() == ByteOrder.BIG_ENDIAN : "Buffer must have BIG ENDIAN byte ordering";
+    }
+
+    protected void reBufferStandard()
     {
         bufferOffset += buffer.position();
-        buffer.clear();
         assert bufferOffset < fileLength;
 
+        buffer.clear();
         long position = bufferOffset;
         long limit = bufferOffset;
-        while (buffer.hasRemaining() && limit < fileLength)
+
+        long pageAligedPos = position & ~4095;
+        // Because the buffer capacity is a multiple of the page size, we read less
+        // the first time and then we should read at page boundaries only,
+        // unless the user seeks elsewhere
+        long upperLimit = Math.min(fileLength, pageAligedPos + buffer.capacity());
+        buffer.limit((int)(upperLimit - position));
+        while (buffer.hasRemaining() && limit < upperLimit)
         {
             int n = channel.read(buffer, position);
             if (n < 0)
-                break;
+                throw new FSReadError(new IOException("Unexpected end of file"), channel.filePath());
+
             position += n;
             limit = bufferOffset + buffer.position();
         }
-        if (limit > fileLength)
-            buffer.position((int)(fileLength - bufferOffset));
+
         buffer.flip();
     }
 
+    protected void reBufferMmap()
+    {
+        long position = bufferOffset + buffer.position();
+        assert position < fileLength;
+
+        MmappedRegions.Region region = regions.floor(position);
+        bufferOffset = region.bottom();
+        buffer = region.buffer.duplicate();
+        buffer.position(Ints.checkedCast(position - bufferOffset));
+
+        if (limiter != null && bufferSize < buffer.remaining())
+        { // ensure accurate throttling
+            buffer.limit(buffer.position() + bufferSize);
+        }
+    }
+
     @Override
     public long getFilePointer()
     {
@@ -157,19 +176,23 @@
         return channel.filePath();
     }
 
-    public int getTotalBufferSize()
+    public ChannelProxy getChannel()
     {
-        //This may NPE so we make a ref
-        //https://issues.apache.org/jira/browse/CASSANDRA-7756
-        ByteBuffer ref = buffer;
-        return ref != null ? ref.capacity() : 0;
+        return channel;
     }
 
-    public void reset()
+    @Override
+    public void reset() throws IOException
     {
         seek(markedPointer);
     }
 
+    @Override
+    public boolean markSupported()
+    {
+        return true;
+    }
+
     public long bytesPastMark()
     {
         long bytes = current() - markedPointer;
@@ -177,19 +200,19 @@
         return bytes;
     }
 
-    public FileMark mark()
+    public DataPosition mark()
     {
         markedPointer = current();
         return new BufferedRandomAccessFileMark(markedPointer);
     }
 
-    public void reset(FileMark mark)
+    public void reset(DataPosition mark)
     {
         assert mark instanceof BufferedRandomAccessFileMark;
         seek(((BufferedRandomAccessFileMark) mark).pointer);
     }
 
-    public long bytesPastMark(FileMark mark)
+    public long bytesPastMark(DataPosition mark)
     {
         assert mark instanceof BufferedRandomAccessFileMark;
         long bytes = current() - ((BufferedRandomAccessFileMark) mark).pointer;
@@ -202,7 +225,7 @@
      */
     public boolean isEOF()
     {
-        return getFilePointer() == length();
+        return current() == length();
     }
 
     public long bytesRemaining()
@@ -211,46 +234,35 @@
     }
 
     @Override
-    public void close()
+    public int available() throws IOException
     {
-        if (owner == null || buffer == null)
-        {
-            // The buffer == null check is so that if the pool owner has deallocated us, calling close()
-            // will re-call deallocate rather than recycling a deallocated object.
-            // I'd be more comfortable if deallocate didn't have to handle being idempotent like that,
-            // but RandomAccessFile.close will call AbstractInterruptibleChannel.close which will
-            // re-call RAF.close -- in this case, [C]RAR.close since we are overriding that.
-            deallocate();
-        }
-        else
-        {
-            owner.recycle(this);
-        }
+        return Ints.saturatedCast(bytesRemaining());
     }
 
-    public void deallocate()
+    @Override
+    public void close()
     {
-        //make idempotent
+	    //make idempotent
         if (buffer == null)
             return;
 
         bufferOffset += buffer.position();
-        FileUtils.clean(buffer);
+        releaseBuffer();
 
-        buffer = null; // makes sure we don't use this after it's ostensibly closed
-        channel.close();
+        //For performance reasons we don't keep a reference to the file
+        //channel so we don't close it
     }
 
     @Override
     public String toString()
     {
-        return getClass().getSimpleName() + "(" + "filePath='" + channel + "')";
+        return getClass().getSimpleName() + "(filePath='" + channel + "')";
     }
 
     /**
      * Class to hold a mark to the position of the file
      */
-    protected static class BufferedRandomAccessFileMark implements FileMark
+    protected static class BufferedRandomAccessFileMark implements DataPosition
     {
         final long pointer;
 
@@ -266,6 +278,9 @@
         if (newPosition < 0)
             throw new IllegalArgumentException("new position should not be negative");
 
+        if (buffer == null)
+            throw new IllegalStateException("Attempted to seek in a closed RAR");
+
         if (newPosition >= length()) // it is save to call length() in read-only mode
         {
             if (newPosition > length())
@@ -288,81 +303,51 @@
         assert current() == newPosition;
     }
 
-    // -1 will be returned if there is nothing to read; higher-level methods like readInt
-    // or readFully (from RandomAccessFile) will throw EOFException but this should not
-    public int read()
+    /**
+     * Reads a line of text form the current position in this file. A line is
+     * represented by zero or more characters followed by {@code '\n'}, {@code
+     * '\r'}, {@code "\r\n"} or the end of file marker. The string does not
+     * include the line terminating sequence.
+     * <p/>
+     * Blocks until a line terminating sequence has been read, the end of the
+     * file is reached or an exception is thrown.
+     *
+     * @return the contents of the line or {@code null} if no characters have
+     * been read before the end of the file has been reached.
+     * @throws IOException if this file is closed or another I/O error occurs.
+     */
+    public final String readLine() throws IOException
     {
-        if (buffer == null)
-            throw new AssertionError("Attempted to read from closed RAR");
-
-        if (!buffer.hasRemaining())
+        StringBuilder line = new StringBuilder(80); // Typical line length
+        boolean foundTerminator = false;
+        long unreadPosition = -1;
+        while (true)
         {
-            if (isEOF())
-                return -1; // required by RandomAccessFile
-
-            reBuffer();
-        }
-
-        return (int)buffer.get() & 0xff;
-    }
-
-    @Override
-    public int read(byte[] buffer)
-    {
-        return read(buffer, 0, buffer.length);
-    }
-
-    @Override
-    // -1 will be returned if there is nothing to read; higher-level methods like readInt
-    // or readFully (from RandomAccessFile) will throw EOFException but this should not
-    public int read(byte[] buff, int offset, int length)
-    {
-        if (buffer == null)
-            throw new AssertionError("Attempted to read from closed RAR");
-
-        if (length == 0)
-            return 0;
-
-        if (!buffer.hasRemaining())
-        {
-            if (isEOF())
-                return -1;
-
-            reBuffer();
-        }
-
-        int toCopy = Math.min(length, buffer.remaining());
-        buffer.get(buff, offset, toCopy);
-        return toCopy;
-    }
-
-    public ByteBuffer readBytes(int length) throws EOFException
-    {
-        assert length >= 0 : "buffer length should not be negative: " + length;
-        try
-        {
-            ByteBuffer result = ByteBuffer.allocate(length);
-            while (result.hasRemaining())
+            int nextByte = read();
+            switch (nextByte)
             {
-                if (!buffer.hasRemaining())
-                {
-                    if (isEOF())
-                        throw new EOFException();
-
-                    reBuffer();
-                }
-                ByteBufferUtil.put(buffer, result);
+                case -1:
+                    return line.length() != 0 ? line.toString() : null;
+                case (byte) '\r':
+                    if (foundTerminator)
+                    {
+                        seek(unreadPosition);
+                        return line.toString();
+                    }
+                    foundTerminator = true;
+                    /* Have to be able to peek ahead one byte */
+                    unreadPosition = getPosition();
+                    break;
+                case (byte) '\n':
+                    return line.toString();
+                default:
+                    if (foundTerminator)
+                    {
+                        seek(unreadPosition);
+                        return line.toString();
+                    }
+                    line.append((char) nextByte);
             }
-            result.flip();
-            return result;
-        }
-        catch (EOFException e)
-        {
-            throw e;
-        }
-        catch (Exception e)
-        {
-            throw new FSReadError(e, channel.toString());
         }
     }
 
@@ -373,11 +358,154 @@
 
     public long getPosition()
     {
-        return bufferOffset + buffer.position();
+        return current();
     }
 
-    public long getPositionLimit()
+    public static class Builder
     {
-        return length();
+        // The NIO file channel or an empty channel
+        public final ChannelProxy channel;
+
+        // We override the file length when we open sstables early, so that we do not
+        // read past the early mark
+        public long overrideLength;
+
+        // The size of the buffer for buffered readers
+        public int bufferSize;
+
+        // The type of the buffer for buffered readers
+        public BufferType bufferType;
+
+        // The buffer
+        public ByteBuffer buffer;
+
+        // The mmap segments for mmap readers
+        public MmappedRegions regions;
+
+        // An optional limiter that will throttle the amount of data we read
+        public RateLimiter limiter;
+
+        public Builder(ChannelProxy channel)
+        {
+            this.channel = channel;
+            this.overrideLength = -1L;
+            this.bufferSize = DEFAULT_BUFFER_SIZE;
+            this.bufferType = BufferType.OFF_HEAP;
+            this.regions = null;
+            this.limiter = null;
+        }
+
+        /** The buffer size is typically already page aligned but if that is not the case
+         * make sure that it is a multiple of the page size, 4096. Also limit it to the maximum
+         * buffer size unless we are throttling, in which case we may as well read the maximum
+         * directly since the intention is to read the full file, see CASSANDRA-8630.
+         * */
+        private void setBufferSize()
+        {
+            if (limiter != null)
+            {
+                bufferSize = MAX_BUFFER_SIZE;
+                return;
+            }
+
+            if ((bufferSize & ~4095) != bufferSize)
+            { // should already be a page size multiple but if that's not case round it up
+                bufferSize = (bufferSize + 4095) & ~4095;
+            }
+
+            bufferSize = Math.min(MAX_BUFFER_SIZE, bufferSize);
+        }
+
+        protected ByteBuffer createBuffer()
+        {
+            setBufferSize();
+
+            buffer = regions == null
+                     ? allocateBuffer(bufferSize, bufferType)
+                     : regions.floor(0).buffer.duplicate();
+
+            buffer.limit(0);
+            return buffer;
+        }
+
+        public Builder overrideLength(long overrideLength)
+        {
+            this.overrideLength = overrideLength;
+            return this;
+        }
+
+        public Builder bufferSize(int bufferSize)
+        {
+            if (bufferSize <= 0)
+                throw new IllegalArgumentException("bufferSize must be positive");
+
+            this.bufferSize = bufferSize;
+            return this;
+        }
+
+        public Builder bufferType(BufferType bufferType)
+        {
+            this.bufferType = bufferType;
+            return this;
+        }
+
+        public Builder regions(MmappedRegions regions)
+        {
+            this.regions = regions;
+            return this;
+        }
+
+        public Builder limiter(RateLimiter limiter)
+        {
+            this.limiter = limiter;
+            return this;
+        }
+
+        public RandomAccessReader build()
+        {
+            return new RandomAccessReader(this);
+        }
+
+        public RandomAccessReader buildWithChannel()
+        {
+            return new RandomAccessReaderWithOwnChannel(this);
+        }
+    }
+
+    // A wrapper of the RandomAccessReader that closes the channel when done.
+    // For performance reasons RAR does not increase the reference count of
+    // a channel but assumes the owner will keep it open and close it,
+    // see CASSANDRA-9379, this thin class is just for those cases where we do
+    // not have a shared channel.
+    public static class RandomAccessReaderWithOwnChannel extends RandomAccessReader
+    {
+        protected RandomAccessReaderWithOwnChannel(Builder builder)
+        {
+            super(builder);
+        }
+
+        @Override
+        public void close()
+        {
+            try
+            {
+                super.close();
+            }
+            finally
+            {
+                channel.close();
+            }
+        }
+    }
+
+    @SuppressWarnings("resource")
+    public static RandomAccessReader open(File file)
+    {
+        return new Builder(new ChannelProxy(file)).buildWithChannel();
+    }
+
+    public static RandomAccessReader open(ChannelProxy channel)
+    {
+        return new Builder(channel).build();
     }
 }

diff --git a/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java b/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java
new file mode 100644
index 0000000..15d0975
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java

@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import net.nicoulaj.compilecommand.annotations.DontInline;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.vint.VIntCoding;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Rough equivalent of BufferedInputStream and DataInputStream wrapping a ByteBuffer that can be refilled
+ * via rebuffer. Implementations provide this buffer from various channels (socket, file, memory, etc).
+ *
+ * RebufferingInputStream is not thread safe.
+ */
+public abstract class RebufferingInputStream extends InputStream implements DataInputPlus, Closeable
+{
+    protected ByteBuffer buffer;
+
+    protected RebufferingInputStream(ByteBuffer buffer)
+    {
+        Preconditions.checkArgument(buffer == null || buffer.order() == ByteOrder.BIG_ENDIAN, "Buffer must have BIG ENDIAN byte ordering");
+        this.buffer = buffer;
+    }
+
+    /**
+     * Implementations must implement this method to refill the buffer.
+     * They can expect the buffer to be empty when this method is invoked.
+     * @throws IOException
+     */
+    protected abstract void reBuffer() throws IOException;
+
+    @Override
+    public void readFully(byte[] b) throws IOException
+    {
+        readFully(b, 0, b.length);
+    }
+
+    @Override
+    public void readFully(byte[] b, int off, int len) throws IOException
+    {
+        int read = read(b, off, len);
+        if (read < len)
+            throw new EOFException("EOF after " + read + " bytes out of " + len);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+
+        // avoid int overflow
+        if (off < 0 || off > b.length || len < 0 || len > b.length - off)
+            throw new IndexOutOfBoundsException();
+
+        if (len == 0)
+            return 0;
+
+        int copied = 0;
+        while (copied < len)
+        {
+            int position = buffer.position();
+            int remaining = buffer.limit() - position;
+            if (remaining == 0)
+            {
+                reBuffer();
+                position = buffer.position();
+                remaining = buffer.limit() - position;
+                if (remaining == 0)
+                    return copied == 0 ? -1 : copied;
+            }
+            int toCopy = Math.min(len - copied, remaining);
+            FastByteOperations.copy(buffer, position, b, off + copied, toCopy);
+            buffer.position(position + toCopy);
+            copied += toCopy;
+        }
+
+        return copied;
+    }
+
+    @DontInline
+    protected long readPrimitiveSlowly(int bytes) throws IOException
+    {
+        long result = 0;
+        for (int i = 0; i < bytes; i++)
+            result = (result << 8) | (readByte() & 0xFFL);
+        return result;
+    }
+
+    @Override
+    public int skipBytes(int n) throws IOException
+    {
+        if (n < 0)
+            return 0;
+        int requested = n;
+        int position = buffer.position(), limit = buffer.limit(), remaining;
+        while ((remaining = limit - position) < n)
+        {
+            n -= remaining;
+            buffer.position(limit);
+            reBuffer();
+            position = buffer.position();
+            limit = buffer.limit();
+            if (position == limit)
+                return requested - n;
+        }
+        buffer.position(position + n);
+        return requested;
+    }
+
+    @Override
+    public boolean readBoolean() throws IOException
+    {
+        return readByte() != 0;
+    }
+
+    @Override
+    public byte readByte() throws IOException
+    {
+        if (!buffer.hasRemaining())
+        {
+            reBuffer();
+            if (!buffer.hasRemaining())
+                throw new EOFException();
+        }
+
+        return buffer.get();
+    }
+
+    @Override
+    public int readUnsignedByte() throws IOException
+    {
+        return readByte() & 0xff;
+    }
+
+    @Override
+    public short readShort() throws IOException
+    {
+        if (buffer.remaining() >= 2)
+            return buffer.getShort();
+        else
+            return (short) readPrimitiveSlowly(2);
+    }
+
+    @Override
+    public int readUnsignedShort() throws IOException
+    {
+        return readShort() & 0xFFFF;
+    }
+
+    @Override
+    public char readChar() throws IOException
+    {
+        if (buffer.remaining() >= 2)
+            return buffer.getChar();
+        else
+            return (char) readPrimitiveSlowly(2);
+    }
+
+    @Override
+    public int readInt() throws IOException
+    {
+        if (buffer.remaining() >= 4)
+            return buffer.getInt();
+        else
+            return (int) readPrimitiveSlowly(4);
+    }
+
+    @Override
+    public long readLong() throws IOException
+    {
+        if (buffer.remaining() >= 8)
+            return buffer.getLong();
+        else
+            return readPrimitiveSlowly(8);
+    }
+
+    public long readVInt() throws IOException
+    {
+        return VIntCoding.decodeZigZag64(readUnsignedVInt());
+    }
+
+    public long readUnsignedVInt() throws IOException
+    {
+        //If 9 bytes aren't available use the slow path in VIntCoding
+        if (buffer.remaining() < 9)
+            return VIntCoding.readUnsignedVInt(this);
+
+        byte firstByte = buffer.get();
+
+        //Bail out early if this is one byte, necessary or it fails later
+        if (firstByte >= 0)
+            return firstByte;
+
+        int extraBytes = VIntCoding.numberOfExtraBytesToRead(firstByte);
+
+        int position = buffer.position();
+        int extraBits = extraBytes * 8;
+
+        long retval = buffer.getLong(position);
+        if (buffer.order() == ByteOrder.LITTLE_ENDIAN)
+            retval = Long.reverseBytes(retval);
+        buffer.position(position + extraBytes);
+
+        // truncate the bytes we read in excess of those we needed
+        retval >>>= 64 - extraBits;
+        // remove the non-value bits from the first byte
+        firstByte &= VIntCoding.firstByteValueMask(extraBytes);
+        // shift the first byte up to its correct position
+        retval |= (long) firstByte << extraBits;
+        return retval;
+    }
+
+    @Override
+    public float readFloat() throws IOException
+    {
+        if (buffer.remaining() >= 4)
+            return buffer.getFloat();
+        else
+            return Float.intBitsToFloat((int)readPrimitiveSlowly(4));
+    }
+
+    @Override
+    public double readDouble() throws IOException
+    {
+        if (buffer.remaining() >= 8)
+            return buffer.getDouble();
+        else
+            return Double.longBitsToDouble(readPrimitiveSlowly(8));
+    }
+
+    @Override
+    public String readLine() throws IOException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public String readUTF() throws IOException
+    {
+        return DataInputStream.readUTF(this);
+    }
+
+    @Override
+    public int read() throws IOException
+    {
+        try
+        {
+            return readUnsignedByte();
+        }
+        catch (EOFException ex)
+        {
+            return -1;
+        }
+    }
+
+    @Override
+    public void reset() throws IOException
+    {
+        throw new IOException("mark/reset not supported");
+    }
+
+    @Override
+    public boolean markSupported()
+    {
+        return false;
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/util/RewindableDataInput.java b/src/java/org/apache/cassandra/io/util/RewindableDataInput.java
new file mode 100644
index 0000000..c202f60
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/RewindableDataInput.java

@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.IOException;
+
+public interface RewindableDataInput extends DataInputPlus
+{
+    DataPosition mark();
+
+    void reset(DataPosition mark) throws IOException;
+
+    long bytesPastMark(DataPosition mark);
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/io/util/RewindableDataInputStreamPlus.java b/src/java/org/apache/cassandra/io/util/RewindableDataInputStreamPlus.java
new file mode 100644
index 0000000..3a680f4
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/RewindableDataInputStreamPlus.java

@@ -0,0 +1,569 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import static org.apache.cassandra.utils.Throwables.maybeFail;
+import static org.apache.cassandra.utils.Throwables.merge;
+
+/**
+ * Adds mark/reset functionality to another input stream by caching read bytes to a memory buffer and
+ * spilling to disk if necessary.
+ *
+ * When the stream is marked via {@link this#mark()} or {@link this#mark(int)}, up to
+ * <code>maxMemBufferSize</code> will be cached in memory (heap). If more than
+ * <code>maxMemBufferSize</code> bytes are read while the stream is marked, the
+ * following bytes are cached on the <code>spillFile</code> for up to <code>maxDiskBufferSize</code>.
+ *
+ * Please note that successive calls to {@link this#mark()} and {@link this#reset()} will write
+ * sequentially to the same <code>spillFile</code> until <code>maxDiskBufferSize</code> is reached.
+ * At this point, if less than <code>maxDiskBufferSize</code> bytes are currently cached on the
+ * <code>spillFile</code>, the remaining bytes are written to the beginning of the file,
+ * treating the <code>spillFile</code> as a circular buffer.
+ *
+ * If more than <code>maxMemBufferSize + maxDiskBufferSize</code> are cached while the stream is marked,
+ * the following {@link this#reset()} invocation will throw a {@link IllegalStateException}.
+ *
+ */
+public class RewindableDataInputStreamPlus extends FilterInputStream implements RewindableDataInput, Closeable
+{
+    private boolean marked = false;
+    private boolean exhausted = false;
+    private AtomicBoolean closed = new AtomicBoolean(false);
+
+    protected int memAvailable = 0;
+    protected int diskTailAvailable = 0;
+    protected int diskHeadAvailable = 0;
+
+    private final File spillFile;
+    private final int initialMemBufferSize;
+    private final int maxMemBufferSize;
+    private final int maxDiskBufferSize;
+
+    private volatile byte memBuffer[];
+    private int memBufferSize;
+    private RandomAccessFile spillBuffer;
+
+    private final DataInputPlus dataReader;
+
+    public RewindableDataInputStreamPlus(InputStream in, int initialMemBufferSize, int maxMemBufferSize,
+                                         File spillFile, int maxDiskBufferSize)
+    {
+        super(in);
+        dataReader = new DataInputStreamPlus(this);
+        this.initialMemBufferSize = initialMemBufferSize;
+        this.maxMemBufferSize = maxMemBufferSize;
+        this.spillFile = spillFile;
+        this.maxDiskBufferSize = maxDiskBufferSize;
+    }
+
+    /* RewindableDataInput methods */
+
+    /**
+     * Marks the current position of a stream to return to this position later via the {@link this#reset(DataPosition)} method.
+     * @return An empty @link{DataPosition} object
+     */
+    public DataPosition mark()
+    {
+        mark(0);
+        return new RewindableDataInputPlusMark();
+    }
+
+    /**
+     * Rewinds to the previously marked position via the {@link this#mark()} method.
+     * @param mark it's not possible to return to a custom position, so this parameter is ignored.
+     * @throws IOException if an error ocurs while resetting
+     */
+    public void reset(DataPosition mark) throws IOException
+    {
+        reset();
+    }
+
+    public long bytesPastMark(DataPosition mark)
+    {
+        return maxMemBufferSize - memAvailable + (diskTailAvailable == -1? 0 : maxDiskBufferSize - diskHeadAvailable - diskTailAvailable);
+    }
+
+
+    protected static class RewindableDataInputPlusMark implements DataPosition
+    {
+    }
+
+    /* InputStream methods */
+
+    public boolean markSupported()
+    {
+        return true;
+    }
+
+    /**
+     * Marks the current position of a stream to return to this position
+     * later via the {@link this#reset()} method.
+     * @param readlimit the maximum amount of bytes to cache
+     */
+    public synchronized void mark(int readlimit)
+    {
+        if (marked)
+            throw new IllegalStateException("Cannot mark already marked stream.");
+
+        if (memAvailable > 0 || diskHeadAvailable > 0 || diskTailAvailable > 0)
+            throw new IllegalStateException("Can only mark stream after reading previously marked data.");
+
+        marked = true;
+        memAvailable = maxMemBufferSize;
+        diskHeadAvailable = -1;
+        diskTailAvailable = -1;
+    }
+
+    public synchronized void reset() throws IOException
+    {
+        if (!marked)
+            throw new IOException("Must call mark() before calling reset().");
+
+        if (exhausted)
+            throw new IOException(String.format("Read more than capacity: %d bytes.", maxMemBufferSize + maxDiskBufferSize));
+
+        memAvailable = maxMemBufferSize - memAvailable;
+        memBufferSize = memAvailable;
+
+        if (diskTailAvailable == -1)
+        {
+            diskHeadAvailable = 0;
+            diskTailAvailable = 0;
+        }
+        else
+        {
+            int initialPos = diskTailAvailable > 0 ? 0 : (int)getIfNotClosed(spillBuffer).getFilePointer();
+            int diskMarkpos = initialPos + diskHeadAvailable;
+            getIfNotClosed(spillBuffer).seek(diskMarkpos);
+
+            diskHeadAvailable = diskMarkpos - diskHeadAvailable;
+            diskTailAvailable = (maxDiskBufferSize - diskTailAvailable) - diskMarkpos;
+        }
+
+        marked = false;
+    }
+
+    public int available() throws IOException
+    {
+
+        return super.available() + (marked? 0 : memAvailable + diskHeadAvailable + diskTailAvailable);
+    }
+
+    public int read() throws IOException
+    {
+        int read = readOne();
+        if (read == -1)
+            return read;
+
+        if (marked)
+        {
+            //mark exhausted
+            if (isExhausted(1))
+            {
+                exhausted = true;
+                return read;
+            }
+
+            writeOne(read);
+        }
+
+        return read;
+    }
+
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        int readBytes = readMulti(b, off, len);
+        if (readBytes == -1)
+            return readBytes;
+
+        if (marked)
+        {
+            //check we have space on buffer
+            if (isExhausted(readBytes))
+            {
+                exhausted = true;
+                return readBytes;
+            }
+
+            writeMulti(b, off, readBytes);
+        }
+
+        return readBytes;
+    }
+
+    private void maybeCreateDiskBuffer() throws IOException
+    {
+        if (spillBuffer == null)
+        {
+            if (!spillFile.getParentFile().exists())
+                spillFile.getParentFile().mkdirs();
+            spillFile.createNewFile();
+
+            this.spillBuffer = new RandomAccessFile(spillFile, "rw");
+        }
+    }
+
+
+    private int readOne() throws IOException
+    {
+        if (!marked)
+        {
+            if (memAvailable > 0)
+            {
+                int pos = memBufferSize - memAvailable;
+                memAvailable--;
+                return getIfNotClosed(memBuffer)[pos] & 0xff;
+            }
+
+            if (diskTailAvailable > 0 || diskHeadAvailable > 0)
+            {
+                int read = getIfNotClosed(spillBuffer).read();
+                if (diskTailAvailable > 0)
+                    diskTailAvailable--;
+                else if (diskHeadAvailable > 0)
+                    diskHeadAvailable++;
+                if (diskTailAvailable == 0)
+                    spillBuffer.seek(0);
+                return read;
+            }
+        }
+
+        return getIfNotClosed(in).read();
+    }
+
+    private boolean isExhausted(int readBytes)
+    {
+        return exhausted || readBytes > memAvailable + (long)(diskTailAvailable == -1? maxDiskBufferSize : diskTailAvailable + diskHeadAvailable);
+    }
+
+    private int readMulti(byte[] b, int off, int len) throws IOException
+    {
+        int readBytes = 0;
+        if (!marked)
+        {
+            if (memAvailable > 0)
+            {
+                readBytes += memAvailable < len ? memAvailable : len;
+                int pos = memBufferSize - memAvailable;
+                System.arraycopy(memBuffer, pos, b, off, readBytes);
+                memAvailable -= readBytes;
+                off += readBytes;
+                len -= readBytes;
+            }
+            if (len > 0 && diskTailAvailable > 0)
+            {
+                int readFromTail = diskTailAvailable < len? diskTailAvailable : len;
+                getIfNotClosed(spillBuffer).read(b, off, readFromTail);
+                readBytes += readFromTail;
+                diskTailAvailable -= readFromTail;
+                off += readFromTail;
+                len -= readFromTail;
+                if (diskTailAvailable == 0)
+                    spillBuffer.seek(0);
+            }
+            if (len > 0 && diskHeadAvailable > 0)
+            {
+                int readFromHead = diskHeadAvailable < len? diskHeadAvailable : len;
+                getIfNotClosed(spillBuffer).read(b, off, readFromHead);
+                readBytes += readFromHead;
+                diskHeadAvailable -= readFromHead;
+                off += readFromHead;
+                len -= readFromHead;
+            }
+        }
+
+        if (len > 0)
+            readBytes += getIfNotClosed(in).read(b, off, len);
+
+        return readBytes;
+    }
+
+    private void writeMulti(byte[] b, int off, int len) throws IOException
+    {
+        if (memAvailable > 0)
+        {
+            if (memBuffer == null)
+                memBuffer = new byte[initialMemBufferSize];
+            int pos = maxMemBufferSize - memAvailable;
+            int memWritten = memAvailable < len? memAvailable : len;
+            if (pos + memWritten >= getIfNotClosed(memBuffer).length)
+                growMemBuffer(pos, memWritten);
+            System.arraycopy(b, off, memBuffer, pos, memWritten);
+            off += memWritten;
+            len -= memWritten;
+            memAvailable -= memWritten;
+        }
+
+        if (len > 0)
+        {
+            if (diskTailAvailable == -1)
+            {
+                maybeCreateDiskBuffer();
+                diskHeadAvailable = (int)spillBuffer.getFilePointer();
+                diskTailAvailable = maxDiskBufferSize - diskHeadAvailable;
+            }
+
+            if (len > 0 && diskTailAvailable > 0)
+            {
+                int diskTailWritten = diskTailAvailable < len? diskTailAvailable : len;
+                getIfNotClosed(spillBuffer).write(b, off, diskTailWritten);
+                off += diskTailWritten;
+                len -= diskTailWritten;
+                diskTailAvailable -= diskTailWritten;
+                if (diskTailAvailable == 0)
+                    spillBuffer.seek(0);
+            }
+
+            if (len > 0 && diskTailAvailable > 0)
+            {
+                int diskHeadWritten = diskHeadAvailable < len? diskHeadAvailable : len;
+                getIfNotClosed(spillBuffer).write(b, off, diskHeadWritten);
+            }
+        }
+    }
+
+    private void writeOne(int value) throws IOException
+    {
+        if (memAvailable > 0)
+        {
+            if (memBuffer == null)
+                memBuffer = new byte[initialMemBufferSize];
+            int pos = maxMemBufferSize - memAvailable;
+            if (pos == getIfNotClosed(memBuffer).length)
+                growMemBuffer(pos, 1);
+            getIfNotClosed(memBuffer)[pos] = (byte)value;
+            memAvailable--;
+            return;
+        }
+
+        if (diskTailAvailable == -1)
+        {
+            maybeCreateDiskBuffer();
+            diskHeadAvailable = (int)spillBuffer.getFilePointer();
+            diskTailAvailable = maxDiskBufferSize - diskHeadAvailable;
+        }
+
+        if (diskTailAvailable > 0 || diskHeadAvailable > 0)
+        {
+            getIfNotClosed(spillBuffer).write(value);
+            if (diskTailAvailable > 0)
+                diskTailAvailable--;
+            else if (diskHeadAvailable > 0)
+                diskHeadAvailable--;
+            if (diskTailAvailable == 0)
+                spillBuffer.seek(0);
+            return;
+        }
+    }
+
+    public int read(byte[] b) throws IOException
+    {
+        return read(b, 0, b.length);
+    }
+
+    private void growMemBuffer(int pos, int writeSize)
+    {
+        int newSize = Math.min(2 * (pos + writeSize), maxMemBufferSize);
+        byte newBuffer[] = new byte[newSize];
+        System.arraycopy(memBuffer, 0, newBuffer, 0, (int)pos);
+        memBuffer = newBuffer;
+    }
+
+    public long skip(long n) throws IOException
+    {
+        long skipped = 0;
+
+        if (marked)
+        {
+            //if marked, we need to cache skipped bytes
+            while (n-- > 0 && read() != -1)
+            {
+                skipped++;
+            }
+            return skipped;
+        }
+
+        if (memAvailable > 0)
+        {
+            skipped += memAvailable < n ? memAvailable : n;
+            memAvailable -= skipped;
+            n -= skipped;
+        }
+        if (n > 0 && diskTailAvailable > 0)
+        {
+            int skipFromTail = diskTailAvailable < n? diskTailAvailable : (int)n;
+            getIfNotClosed(spillBuffer).skipBytes(skipFromTail);
+            diskTailAvailable -= skipFromTail;
+            skipped += skipFromTail;
+            n -= skipFromTail;
+            if (diskTailAvailable == 0)
+                spillBuffer.seek(0);
+        }
+        if (n > 0 && diskHeadAvailable > 0)
+        {
+            int skipFromHead = diskHeadAvailable < n? diskHeadAvailable : (int)n;
+            getIfNotClosed(spillBuffer).skipBytes(skipFromHead);
+            diskHeadAvailable -= skipFromHead;
+            skipped += skipFromHead;
+            n -= skipFromHead;
+        }
+
+        if (n > 0)
+            skipped += getIfNotClosed(in).skip(n);
+
+        return skipped;
+    }
+
+    private <T> T getIfNotClosed(T in) throws IOException {
+        if (closed.get())
+            throw new IOException("Stream closed");
+        return in;
+    }
+
+    public void close() throws IOException
+    {
+        close(true);
+    }
+
+    public void close(boolean closeUnderlying) throws IOException
+    {
+        if (closed.compareAndSet(false, true))
+        {
+            Throwable fail = null;
+            if (closeUnderlying)
+            {
+                try
+                {
+                    super.close();
+                }
+                catch (IOException e)
+                {
+                    fail = merge(fail, e);
+                }
+            }
+            try
+            {
+                if (spillBuffer != null)
+                {
+                    this.spillBuffer.close();
+                    this.spillBuffer = null;
+                }
+            } catch (IOException e)
+            {
+                fail = merge(fail, e);
+            }
+            try {
+                if (spillFile.exists())
+                {
+                    spillFile.delete();
+                }
+            }
+            catch (Throwable e)
+            {
+                fail = merge(fail, e);
+            }
+            maybeFail(fail, IOException.class);
+        }
+    }
+
+    /* DataInputPlus methods */
+
+    public void readFully(byte[] b) throws IOException
+    {
+        dataReader.readFully(b);
+    }
+
+    public void readFully(byte[] b, int off, int len) throws IOException
+    {
+        dataReader.readFully(b, off, len);
+    }
+
+    public int skipBytes(int n) throws IOException
+    {
+        return dataReader.skipBytes(n);
+    }
+
+    public boolean readBoolean() throws IOException
+    {
+        return dataReader.readBoolean();
+    }
+
+    public byte readByte() throws IOException
+    {
+        return dataReader.readByte();
+    }
+
+    public int readUnsignedByte() throws IOException
+    {
+        return dataReader.readUnsignedByte();
+    }
+
+    public short readShort() throws IOException
+    {
+        return dataReader.readShort();
+    }
+
+    public int readUnsignedShort() throws IOException
+    {
+        return dataReader.readUnsignedShort();
+    }
+
+    public char readChar() throws IOException
+    {
+        return dataReader.readChar();
+    }
+
+    public int readInt() throws IOException
+    {
+        return dataReader.readInt();
+    }
+
+    public long readLong() throws IOException
+    {
+        return dataReader.readLong();
+    }
+
+    public float readFloat() throws IOException
+    {
+        return dataReader.readFloat();
+    }
+
+    public double readDouble() throws IOException
+    {
+        return dataReader.readDouble();
+    }
+
+    public String readLine() throws IOException
+    {
+        return dataReader.readLine();
+    }
+
+    public String readUTF() throws IOException
+    {
+        return dataReader.readUTF();
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/util/SafeMemory.java b/src/java/org/apache/cassandra/io/util/SafeMemory.java
index ad11472..e8cd54f 100644
--- a/src/java/org/apache/cassandra/io/util/SafeMemory.java
+++ b/src/java/org/apache/cassandra/io/util/SafeMemory.java

@@ -103,4 +103,9 @@
         assert peer != 0 || size == 0 : ref.printDebugInfo();
         super.checkBounds(start, end);
     }
+
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        identities.add(ref);
+    }
 }

diff --git a/src/java/org/apache/cassandra/io/util/SegmentedFile.java b/src/java/org/apache/cassandra/io/util/SegmentedFile.java
index cb331de..9df4c81 100644
--- a/src/java/org/apache/cassandra/io/util/SegmentedFile.java
+++ b/src/java/org/apache/cassandra/io/util/SegmentedFile.java

@@ -21,20 +21,20 @@
 import java.io.DataOutput;
 import java.io.File;
 import java.io.IOException;
-import java.nio.MappedByteBuffer;
-import java.util.Iterator;
-import java.util.List;
-import java.util.NoSuchElementException;
+import java.util.function.Supplier;
 
-import com.google.common.base.Throwables;
 import com.google.common.util.concurrent.RateLimiter;
 
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
-import org.apache.cassandra.utils.CLibrary;
-import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.IndexSummary;
+import org.apache.cassandra.io.sstable.IndexSummaryBuilder;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.utils.NativeLibrary;
 import org.apache.cassandra.utils.concurrent.RefCounted;
 import org.apache.cassandra.utils.concurrent.SharedCloseableImpl;
 
@@ -52,6 +52,7 @@
 public abstract class SegmentedFile extends SharedCloseableImpl
 {
     public final ChannelProxy channel;
+    public final int bufferSize;
     public final long length;
 
     // This differs from length for compressed files (but we still need length for
@@ -61,23 +62,25 @@
     /**
      * Use getBuilder to get a Builder to construct a SegmentedFile.
      */
-    SegmentedFile(Cleanup cleanup, ChannelProxy channel, long length)
+    SegmentedFile(Cleanup cleanup, ChannelProxy channel, int bufferSize, long length)
     {
-        this(cleanup, channel, length, length);
+        this(cleanup, channel, bufferSize, length, length);
     }
 
-    protected SegmentedFile(Cleanup cleanup, ChannelProxy channel, long length, long onDiskLength)
+    protected SegmentedFile(Cleanup cleanup, ChannelProxy channel, int bufferSize, long length, long onDiskLength)
     {
         super(cleanup);
         this.channel = channel;
+        this.bufferSize = bufferSize;
         this.length = length;
         this.onDiskLength = onDiskLength;
     }
 
-    public SegmentedFile(SegmentedFile copy)
+    protected SegmentedFile(SegmentedFile copy)
     {
         super(copy);
         channel = copy.channel;
+        bufferSize = copy.bufferSize;
         length = copy.length;
         onDiskLength = copy.onDiskLength;
     }
@@ -87,7 +90,7 @@
         return channel.filePath();
     }
 
-    protected static abstract class Cleanup implements RefCounted.Tidy
+    protected static class Cleanup implements RefCounted.Tidy
     {
         final ChannelProxy channel;
         protected Cleanup(ChannelProxy channel)
@@ -110,16 +113,22 @@
 
     public RandomAccessReader createReader()
     {
-        return RandomAccessReader.open(channel, length);
+        return new RandomAccessReader.Builder(channel)
+               .overrideLength(length)
+               .bufferSize(bufferSize)
+               .build();
     }
 
-    public RandomAccessReader createThrottledReader(RateLimiter limiter)
+    public RandomAccessReader createReader(RateLimiter limiter)
     {
-        assert limiter != null;
-        return ThrottledReader.open(channel, length, limiter);
+        return new RandomAccessReader.Builder(channel)
+               .overrideLength(length)
+               .bufferSize(bufferSize)
+               .limiter(limiter)
+               .build();
     }
 
-    public FileDataInput getSegment(long position)
+    public FileDataInput createReader(long position)
     {
         RandomAccessReader reader = createReader();
         reader.seek(position);
@@ -128,7 +137,7 @@
 
     public void dropPageCache(long before)
     {
-        CLibrary.trySkipCache(channel.getFileDescriptor(), 0, before);
+        NativeLibrary.trySkipCache(channel.getFileDescriptor(), 0, before, path());
     }
 
     /**
@@ -136,32 +145,14 @@
      */
     public static Builder getBuilder(Config.DiskAccessMode mode, boolean compressed)
     {
-        return compressed ? new CompressedPoolingSegmentedFile.Builder(null)
+        return compressed ? new CompressedSegmentedFile.Builder(null)
                           : mode == Config.DiskAccessMode.mmap ? new MmappedSegmentedFile.Builder()
-                                                               : new BufferedPoolingSegmentedFile.Builder();
+                                                               : new BufferedSegmentedFile.Builder();
     }
 
     public static Builder getCompressedBuilder(CompressedSequentialWriter writer)
     {
-        return new CompressedPoolingSegmentedFile.Builder(writer);
-    }
-
-    /**
-     * @return An Iterator over segments, beginning with the segment containing the given position: each segment must be closed after use.
-     */
-    public Iterator<FileDataInput> iterator(long position)
-    {
-        return new SegmentIterator(position);
-    }
-
-    /**
-     * Retrieve the readable bounds if any so they can be cloned into other files such
-     * as when downsampling an index summary. Readable bounds are in between record locations in a file
-     * that are good positions for mapping the file such that records don't cross mappings.
-     */
-    public long[] copyReadableBounds()
-    {
-        return new long[0];
+        return new CompressedSegmentedFile.Builder(writer);
     }
 
     /**
@@ -172,30 +163,18 @@
         private ChannelProxy channel;
 
         /**
-         * Adds a position that would be a safe place for a segment boundary in the file. For a block/row based file
-         * format, safe boundaries are block/row edges.
-         * @param boundary The absolute position of the potential boundary in the file.
-         */
-        public abstract void addPotentialBoundary(long boundary);
-
-        /**
          * Called after all potential boundaries have been added to apply this Builder to a concrete file on disk.
          * @param channel The channel to the file on disk.
          */
-        protected abstract SegmentedFile complete(ChannelProxy channel, long overrideLength);
+        protected abstract SegmentedFile complete(ChannelProxy channel, int bufferSize, long overrideLength);
 
-        public SegmentedFile complete(String path)
-        {
-            return complete(path, -1L);
-        }
-
-        @SuppressWarnings("resource")
-        public SegmentedFile complete(String path, long overrideLength)
+        @SuppressWarnings("resource") // SegmentedFile owns channel
+        private SegmentedFile complete(String path, int bufferSize, long overrideLength)
         {
             ChannelProxy channelCopy = getChannel(path);
             try
             {
-                return complete(channelCopy, overrideLength);
+                return complete(channelCopy, bufferSize, overrideLength);
             }
             catch (Throwable t)
             {
@@ -204,13 +183,92 @@
             }
         }
 
-        public void serializeBounds(DataOutput out) throws IOException
+        public SegmentedFile buildData(Descriptor desc, StatsMetadata stats, IndexSummaryBuilder.ReadableBoundary boundary)
         {
+            return complete(desc.filenameFor(Component.DATA), bufferSize(stats), boundary.dataLength);
+        }
+
+        public SegmentedFile buildData(Descriptor desc, StatsMetadata stats)
+        {
+            return complete(desc.filenameFor(Component.DATA), bufferSize(stats), -1L);
+        }
+
+        public SegmentedFile buildIndex(Descriptor desc, IndexSummary indexSummary, IndexSummaryBuilder.ReadableBoundary boundary)
+        {
+            return complete(desc.filenameFor(Component.PRIMARY_INDEX), bufferSize(desc, indexSummary), boundary.indexLength);
+        }
+
+        public SegmentedFile buildIndex(Descriptor desc, IndexSummary indexSummary)
+        {
+            return complete(desc.filenameFor(Component.PRIMARY_INDEX), bufferSize(desc, indexSummary), -1L);
+        }
+
+        private static int bufferSize(StatsMetadata stats)
+        {
+            return bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
+        }
+
+        private static int bufferSize(Descriptor desc, IndexSummary indexSummary)
+        {
+            File file = new File(desc.filenameFor(Component.PRIMARY_INDEX));
+            return bufferSize(file.length() / indexSummary.size());
+        }
+
+        /**
+            Return the buffer size for a given record size. For spinning disks always add one page.
+            For solid state disks only add one page if the chance of crossing to the next page is more
+            than a predifined value, @see Config.disk_optimization_page_cross_chance.
+         */
+        static int bufferSize(long recordSize)
+        {
+            Config.DiskOptimizationStrategy strategy = DatabaseDescriptor.getDiskOptimizationStrategy();
+            if (strategy == Config.DiskOptimizationStrategy.ssd)
+            {
+                // The crossing probability is calculated assuming a uniform distribution of record
+                // start position in a page, so it's the record size modulo the page size divided by
+                // the total page size.
+                double pageCrossProbability = (recordSize % 4096) / 4096.;
+                // if the page cross probability is equal or bigger than disk_optimization_page_cross_chance we add one page
+                if ((pageCrossProbability - DatabaseDescriptor.getDiskOptimizationPageCrossChance()) > -1e-16)
+                    recordSize += 4096;
+
+                return roundBufferSize(recordSize);
+            }
+            else if (strategy == Config.DiskOptimizationStrategy.spinning)
+            {
+                return roundBufferSize(recordSize + 4096);
+            }
+            else
+            {
+                throw new IllegalStateException("Unsupported disk optimization strategy: " + strategy);
+            }
+        }
+
+        /**
+           Round up to the next multiple of 4k but no more than 64k
+         */
+        static int roundBufferSize(long size)
+        {
+            if (size <= 0)
+                return 4096;
+
+            size = (size + 4095) & ~4095;
+            return (int)Math.min(size, 1 << 16);
+        }
+
+        public void serializeBounds(DataOutput out, Version version) throws IOException
+        {
+            if (!version.hasBoundaries())
+                return;
+
             out.writeUTF(DatabaseDescriptor.getDiskAccessMode().name());
         }
 
-        public void deserializeBounds(DataInput in) throws IOException
+        public void deserializeBounds(DataInput in, Version version) throws IOException
         {
+            if (!version.hasBoundaries())
+                return;
+
             if (!in.readUTF().equals(DatabaseDescriptor.getDiskAccessMode().name()))
                 throw new IOException("Cannot deserialize SSTable Summary component because the DiskAccessMode was changed!");
         }
@@ -219,6 +277,7 @@
         {
             if (channel != null)
                 return channel.close(accumulate);
+
             return accumulate;
         }
 
@@ -231,6 +290,10 @@
         {
             if (channel != null)
             {
+                // This is really fragile, both path and channel.filePath()
+                // must agree, i.e. they both must be absolute or both relative
+                // eventually we should really pass the filePath to the builder
+                // constructor and remove this
                 if (channel.filePath().equals(path))
                     return channel.sharedCopy();
                 else
@@ -242,61 +305,10 @@
         }
     }
 
-    static final class Segment extends Pair<Long, MappedByteBuffer> implements Comparable<Segment>
-    {
-        public Segment(long offset, MappedByteBuffer segment)
-        {
-            super(offset, segment);
-        }
-
-        public final int compareTo(Segment that)
-        {
-            return (int)Math.signum(this.left - that.left);
-        }
-    }
-
-    /**
-     * A lazy Iterator over segments in forward order from the given position.  It is caller's responsibility
-     * to close the FileDataIntputs when finished.
-     */
-    final class SegmentIterator implements Iterator<FileDataInput>
-    {
-        private long nextpos;
-        public SegmentIterator(long position)
-        {
-            this.nextpos = position;
-        }
-
-        public boolean hasNext()
-        {
-            return nextpos < length;
-        }
-
-        public FileDataInput next()
-        {
-            long position = nextpos;
-            if (position >= length)
-                throw new NoSuchElementException();
-
-            FileDataInput segment = getSegment(nextpos);
-            try
-            {
-                nextpos = nextpos + segment.bytesRemaining();
-            }
-            catch (IOException e)
-            {
-                throw new FSReadError(e, path());
-            }
-            return segment;
-        }
-
-        public void remove() { throw new UnsupportedOperationException(); }
-    }
-
     @Override
     public String toString() {
-        return getClass().getSimpleName() + "(path='" + path() + "'" +
+        return getClass().getSimpleName() + "(path='" + path() + '\'' +
                ", length=" + length +
-               ")";
+               ')';
 }
 }

diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
index 452318e..d17ac34 100644
--- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java

@@ -18,10 +18,7 @@
 package org.apache.cassandra.io.util;
 
 import java.io.*;
-import java.nio.ByteBuffer;
-import java.nio.channels.ClosedChannelException;
 import java.nio.channels.FileChannel;
-import java.nio.channels.WritableByteChannel;
 import java.nio.file.StandardOpenOption;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -29,36 +26,30 @@
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.BufferType;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.utils.CLibrary;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
 import static org.apache.cassandra.utils.Throwables.merge;
+
 import org.apache.cassandra.utils.SyncUtil;
 
 /**
  * Adds buffering, mark, and fsyncing to OutputStream.  We always fsync on close; we may also
  * fsync incrementally if Config.trickle_fsync is enabled.
  */
-public class SequentialWriter extends OutputStream implements WritableByteChannel, Transactional
+public class SequentialWriter extends BufferedDataOutputStreamPlus implements Transactional
 {
-    // isDirty - true if this.buffer contains any un-synced bytes
-    protected boolean isDirty = false, syncNeeded = false;
+    private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
 
     // absolute path to the given file
     private final String filePath;
 
-    protected ByteBuffer buffer;
-    private int directoryFD;
-    // directory should be synced only after first file sync, in other words, only once per file
-    private boolean directorySynced = false;
-
     // Offset for start of buffer relative to underlying file
     protected long bufferOffset;
 
-    protected final FileChannel channel;
+    protected final FileChannel fchannel;
 
     // whether to do trickling fsync() to avoid sudden bursts of dirty buffer flushing by kernel causing read
     // latency spikes
@@ -66,7 +57,6 @@
     private int trickleFsyncByteInterval;
     private int bytesSinceTrickleFsync = 0;
 
-    public final DataOutputPlus stream;
     protected long lastFlushOffset;
 
     protected Runnable runPostFlush;
@@ -81,13 +71,6 @@
         @Override
         protected Throwable doPreCleanup(Throwable accumulate)
         {
-            if (directoryFD >= 0)
-            {
-                try { CLibrary.tryCloseFD(directoryFD); }
-                catch (Throwable t) { accumulate = merge(accumulate, t); }
-                directoryFD = -1;
-            }
-
             // close is idempotent
             try { channel.close(); }
             catch (Throwable t) { accumulate = merge(accumulate, t); }
@@ -105,9 +88,6 @@
         protected void doPrepare()
         {
             syncInternal();
-            // we must cleanup our file handles during prepareCommit for Windows compatibility as we cannot rename an open file;
-            // TODO: once we stop file renaming, remove this for clarity
-            releaseFileHandle();
         }
 
         protected Throwable doCommit(Throwable accumulate)
@@ -117,34 +97,49 @@
 
         protected Throwable doAbort(Throwable accumulate)
         {
-            return FileUtils.deleteWithConfirm(filePath, false, accumulate);
+            return accumulate;
         }
     }
 
-    public SequentialWriter(File file, int bufferSize, BufferType bufferType)
-    {
+    // TODO: we should specify as a parameter if we permit an existing file or not
+    private static FileChannel openChannel(File file) {
         try
         {
             if (file.exists())
-                channel = FileChannel.open(file.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE);
+            {
+                return FileChannel.open(file.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE);
+            }
             else
-                channel = FileChannel.open(file.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
+            {
+                FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
+                try
+                {
+                    SyncUtil.trySyncDir(file.getParentFile());
+                }
+                catch (Throwable t)
+                {
+                    try { channel.close(); }
+                    catch (Throwable t2) { t.addSuppressed(t2); }
+                }
+                return channel;
+            }
         }
         catch (IOException e)
         {
             throw new RuntimeException(e);
         }
+    }
+
+    public SequentialWriter(File file, int bufferSize, BufferType bufferType)
+    {
+        super(openChannel(file), bufferType.allocate(bufferSize));
+        strictFlushing = true;
+        fchannel = (FileChannel)channel;
 
         filePath = file.getAbsolutePath();
 
-        // Allow children to allocate buffer as direct (snappy compression) if necessary
-        buffer = bufferType.allocate(bufferSize);
-
         this.trickleFsync = DatabaseDescriptor.getTrickleFsync();
         this.trickleFsyncByteInterval = DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024;
-
-        directoryFD = CLibrary.tryOpenDirectory(file.getParent());
-        stream = new WrappedDataOutputStreamPlus(this, this);
     }
 
     /**
@@ -152,17 +147,17 @@
      */
     public static SequentialWriter open(File file)
     {
-        return new SequentialWriter(file, RandomAccessReader.DEFAULT_BUFFER_SIZE, BufferType.ON_HEAP);
+        return new SequentialWriter(file, DEFAULT_BUFFER_SIZE, BufferType.ON_HEAP);
     }
 
     public static ChecksummedSequentialWriter open(File file, File crcPath)
     {
-        return new ChecksummedSequentialWriter(file, RandomAccessReader.DEFAULT_BUFFER_SIZE, crcPath);
+        return new ChecksummedSequentialWriter(file, DEFAULT_BUFFER_SIZE, crcPath);
     }
 
     public static CompressedSequentialWriter open(String dataFilePath,
                                                   String offsetsPath,
-                                                  CompressionParameters parameters,
+                                                  CompressionParams parameters,
                                                   MetadataCollector sstableMetadataCollector)
     {
         return new CompressedSequentialWriter(new File(dataFilePath), offsetsPath, parameters, sstableMetadataCollector);
@@ -174,73 +169,6 @@
         return this;
     }
 
-    public void write(int value) throws ClosedChannelException
-    {
-        if (buffer == null)
-            throw new ClosedChannelException();
-
-        if (!buffer.hasRemaining())
-        {
-            reBuffer();
-        }
-
-        buffer.put((byte) value);
-
-        isDirty = true;
-        syncNeeded = true;
-    }
-
-    public void write(byte[] buffer) throws IOException
-    {
-        write(buffer, 0, buffer.length);
-    }
-
-    public void write(byte[] data, int offset, int length) throws IOException
-    {
-        if (buffer == null)
-            throw new ClosedChannelException();
-
-        int position = offset;
-        int remaining = length;
-        while (remaining > 0)
-        {
-            if (!buffer.hasRemaining())
-                reBuffer();
-
-            int toCopy = Math.min(remaining, buffer.remaining());
-            buffer.put(data, position, toCopy);
-
-            remaining -= toCopy;
-            position += toCopy;
-
-            isDirty = true;
-            syncNeeded = true;
-        }
-    }
-
-    public int write(ByteBuffer src) throws IOException
-    {
-        if (buffer == null)
-            throw new ClosedChannelException();
-
-        int length = src.remaining();
-        int finalLimit = src.limit();
-        while (src.hasRemaining())
-        {
-            if (!buffer.hasRemaining())
-                reBuffer();
-
-            if (buffer.remaining() < src.remaining())
-                src.limit(src.position() + buffer.remaining());
-            buffer.put(src);
-            src.limit(finalLimit);
-
-            isDirty = true;
-            syncNeeded = true;
-        }
-        return length;
-    }
-
     /**
      * Synchronize file contents with disk.
      */
@@ -253,7 +181,7 @@
     {
         try
         {
-            SyncUtil.force(channel, false);
+            SyncUtil.force(fchannel, false);
         }
         catch (IOException e)
         {
@@ -261,55 +189,34 @@
         }
     }
 
+    /*
+     * This is only safe to call before truncation or close for CompressedSequentialWriter
+     * Otherwise it will leave a non-uniform size compressed block in the middle of the file
+     * and the compressed format can't handle that.
+     */
     protected void syncInternal()
     {
-        if (syncNeeded)
-        {
-            flushInternal();
-            syncDataOnlyInternal();
-
-            if (!directorySynced)
-            {
-                SyncUtil.trySync(directoryFD);
-                directorySynced = true;
-            }
-
-            syncNeeded = false;
-        }
+        doFlush(0);
+        syncDataOnlyInternal();
     }
 
-    /**
-     * If buffer is dirty, flush it's contents to the operating system. Does not imply fsync().
-     *
-     * Currently, for implementation reasons, this also invalidates the buffer.
-     */
     @Override
-    public void flush()
+    protected void doFlush(int count)
     {
-        flushInternal();
-    }
+        flushData();
 
-    protected void flushInternal()
-    {
-        if (isDirty)
+        if (trickleFsync)
         {
-            flushData();
-
-            if (trickleFsync)
+            bytesSinceTrickleFsync += buffer.position();
+            if (bytesSinceTrickleFsync >= trickleFsyncByteInterval)
             {
-                bytesSinceTrickleFsync += buffer.position();
-                if (bytesSinceTrickleFsync >= trickleFsyncByteInterval)
-                {
-                    syncDataOnlyInternal();
-                    bytesSinceTrickleFsync = 0;
-                }
+                syncDataOnlyInternal();
+                bytesSinceTrickleFsync = 0;
             }
-
-            // Remember that we wrote, so we don't write it again on next flush().
-            resetBuffer();
-
-            isDirty = false;
         }
+
+        // Remember that we wrote, so we don't write it again on next flush().
+        resetBuffer();
     }
 
     public void setPostFlushListener(Runnable runPostFlush)
@@ -338,7 +245,12 @@
             runPostFlush.run();
     }
 
-    public long getFilePointer()
+    public boolean hasPosition()
+    {
+        return true;
+    }
+
+    public long position()
     {
         return current();
     }
@@ -354,14 +266,14 @@
      */
     public long getOnDiskFilePointer()
     {
-        return getFilePointer();
+        return position();
     }
 
     public long length()
     {
         try
         {
-            return Math.max(current(), channel.size());
+            return Math.max(current(), fchannel.size());
         }
         catch (IOException e)
         {
@@ -374,12 +286,6 @@
         return filePath;
     }
 
-    protected void reBuffer()
-    {
-        flushInternal();
-        resetBuffer();
-    }
-
     protected void resetBuffer()
     {
         bufferOffset = current();
@@ -391,7 +297,7 @@
         return bufferOffset + (buffer == null ? 0 : buffer.position());
     }
 
-    public FileMark mark()
+    public DataPosition mark()
     {
         return new BufferedFileWriterMark(current());
     }
@@ -400,7 +306,7 @@
      * Drops all buffered data that's past the limits of our new file mark + buffer capacity, or syncs and truncates
      * the underlying file to the marked position
      */
-    public void resetAndTruncate(FileMark mark)
+    public void resetAndTruncate(DataPosition mark)
     {
         assert mark instanceof BufferedFileWriterMark;
 
@@ -423,7 +329,7 @@
 
         try
         {
-            channel.position(truncateTarget);
+            fchannel.position(truncateTarget);
         }
         catch (IOException e)
         {
@@ -443,7 +349,7 @@
     {
         try
         {
-            channel.truncate(toSize);
+            fchannel.truncate(toSize);
             lastFlushOffset = toSize;
         }
         catch (IOException e)
@@ -497,22 +403,10 @@
         return new TransactionalProxy();
     }
 
-    public void releaseFileHandle()
-    {
-        try
-        {
-            channel.close();
-        }
-        catch (IOException e)
-        {
-            throw new FSWriteError(e, filePath);
-        }
-    }
-
     /**
      * Class to hold a mark to the position of the file
      */
-    protected static class BufferedFileWriterMark implements FileMark
+    protected static class BufferedFileWriterMark implements DataPosition
     {
         final long pointer;
 

diff --git a/src/java/org/apache/cassandra/io/util/ThrottledReader.java b/src/java/org/apache/cassandra/io/util/ThrottledReader.java
deleted file mode 100644
index f725984..0000000
--- a/src/java/org/apache/cassandra/io/util/ThrottledReader.java
+++ /dev/null

@@ -1,57 +0,0 @@
-package org.apache.cassandra.io.util;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.io.FileNotFoundException;
-
-import com.google.common.util.concurrent.RateLimiter;
-
-import org.apache.cassandra.io.compress.BufferType;
-
-public class ThrottledReader extends RandomAccessReader
-{
-    private final RateLimiter limiter;
-
-    protected ThrottledReader(ChannelProxy channel, long overrideLength, RateLimiter limiter) throws FileNotFoundException
-    {
-        super(channel, RandomAccessReader.DEFAULT_BUFFER_SIZE, overrideLength, BufferType.ON_HEAP, null);
-        this.limiter = limiter;
-    }
-
-    protected void reBuffer()
-    {
-        limiter.acquire(buffer.capacity());
-        super.reBuffer();
-    }
-
-    public static ThrottledReader open(ChannelProxy channel, long overrideLength, RateLimiter limiter)
-    {
-        try
-        {
-            return new ThrottledReader(channel, overrideLength, limiter);
-        }
-        catch (FileNotFoundException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/io/util/TrackedDataInputPlus.java b/src/java/org/apache/cassandra/io/util/TrackedDataInputPlus.java
new file mode 100644
index 0000000..dc5bbb6
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/TrackedDataInputPlus.java

@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.IOException;
+
+/**
+ * This class is to track bytes read from given DataInput
+ */
+public class TrackedDataInputPlus implements DataInputPlus, BytesReadTracker
+{
+    private long bytesRead;
+    final DataInput source;
+
+    public TrackedDataInputPlus(DataInput source)
+    {
+        this.source = source;
+    }
+
+    public long getBytesRead()
+    {
+        return bytesRead;
+    }
+
+    /**
+     * reset counter to @param count
+     */
+    public void reset(long count)
+    {
+        bytesRead = count;
+    }
+
+    public boolean readBoolean() throws IOException
+    {
+        boolean bool = source.readBoolean();
+        bytesRead += 1;
+        return bool;
+    }
+
+    public byte readByte() throws IOException
+    {
+        byte b = source.readByte();
+        bytesRead += 1;
+        return b;
+    }
+
+    public char readChar() throws IOException
+    {
+        char c = source.readChar();
+        bytesRead += 2;
+        return c;
+    }
+
+    public double readDouble() throws IOException
+    {
+        double d = source.readDouble();
+        bytesRead += 8;
+        return d;
+    }
+
+    public float readFloat() throws IOException
+    {
+        float f = source.readFloat();
+        bytesRead += 4;
+        return f;
+    }
+
+    public void readFully(byte[] b, int off, int len) throws IOException
+    {
+        source.readFully(b, off, len);
+        bytesRead += len;
+    }
+
+    public void readFully(byte[] b) throws IOException
+    {
+        source.readFully(b);
+        bytesRead += b.length;
+    }
+
+    public int readInt() throws IOException
+    {
+        int i = source.readInt();
+        bytesRead += 4;
+        return i;
+    }
+
+    public String readLine() throws IOException
+    {
+        // since this method is deprecated and cannot track bytes read
+        // just throw exception
+        throw new UnsupportedOperationException();
+    }
+
+    public long readLong() throws IOException
+    {
+        long l = source.readLong();
+        bytesRead += 8;
+        return l;
+    }
+
+    public short readShort() throws IOException
+    {
+        short s = source.readShort();
+        bytesRead += 2;
+        return s;
+    }
+
+    public String readUTF() throws IOException
+    {
+        return DataInputStream.readUTF(this);
+    }
+
+    public int readUnsignedByte() throws IOException
+    {
+        int i = source.readUnsignedByte();
+        bytesRead += 1;
+        return i;
+    }
+
+    public int readUnsignedShort() throws IOException
+    {
+        int i = source.readUnsignedShort();
+        bytesRead += 2;
+        return i;
+    }
+
+    public int skipBytes(int n) throws IOException
+    {
+        int skipped = source.skipBytes(n);
+        bytesRead += skipped;
+        return skipped;
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/util/TrackedInputStream.java b/src/java/org/apache/cassandra/io/util/TrackedInputStream.java
new file mode 100644
index 0000000..f398d30
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/TrackedInputStream.java

@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * This class is to track bytes read from given DataInputStream
+ */
+public class TrackedInputStream extends FilterInputStream implements BytesReadTracker
+{
+    private long bytesRead;
+
+    public TrackedInputStream(InputStream source)
+    {
+        super(source);
+    }
+
+    public long getBytesRead()
+    {
+        return bytesRead;
+    }
+
+    /**
+     * reset counter to @param count
+     */
+    public void reset(long count)
+    {
+        bytesRead = count;
+    }
+
+    public int read() throws IOException
+    {
+        int read = super.read();
+        bytesRead += 1;
+        return read;
+    }
+
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        int read = super.read(b, off, len);
+        bytesRead += read;
+        return read;
+    }
+
+    public int read(byte[] b) throws IOException
+    {
+        int read = super.read(b);
+        bytesRead += read;
+        return read;
+    }
+
+    public long skip(long n) throws IOException
+    {
+        long skip = super.skip(n);
+        bytesRead += skip;
+        return skip;
+    }
+}

diff --git a/src/java/org/apache/cassandra/io/util/UnbufferedDataOutputStreamPlus.java b/src/java/org/apache/cassandra/io/util/UnbufferedDataOutputStreamPlus.java
index 10aefa6..54b4cb1 100644
--- a/src/java/org/apache/cassandra/io/util/UnbufferedDataOutputStreamPlus.java
+++ b/src/java/org/apache/cassandra/io/util/UnbufferedDataOutputStreamPlus.java

@@ -315,7 +315,7 @@
                 for (int i = 0 ; i < charRunLength ; i++)
                 {
                     char ch = str.charAt(offset + i);
-                    if ((ch > 0) & (ch <= 127))
+                    if ((ch > 0) && (ch <= 127))
                     {
                         utfBytes[utfIndex++] = (byte) ch;
                     }

diff --git a/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java b/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java
index c90c6a1..b326e1c 100644
--- a/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java
+++ b/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java

@@ -60,7 +60,7 @@
 
     public IEndpointSnitch snitch;
 
-    AbstractReplicationStrategy(String keyspaceName, TokenMetadata tokenMetadata, IEndpointSnitch snitch, Map<String, String> configOptions)
+    protected AbstractReplicationStrategy(String keyspaceName, TokenMetadata tokenMetadata, IEndpointSnitch snitch, Map<String, String> configOptions)
     {
         assert keyspaceName != null;
         assert snitch != null;

diff --git a/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java b/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java
index d6a601c..542677b 100644
--- a/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java
+++ b/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java

@@ -26,6 +26,7 @@
 
 import com.codahale.metrics.ExponentiallyDecayingReservoir;
 
+import com.codahale.metrics.Snapshot;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.net.MessagingService;
@@ -247,19 +248,26 @@
 
         }
         double maxLatency = 1;
+
+        Map<InetAddress, Snapshot> snapshots = new HashMap<>(samples.size());
+        for (Map.Entry<InetAddress, ExponentiallyDecayingReservoir> entry : samples.entrySet())
+        {
+            snapshots.put(entry.getKey(), entry.getValue().getSnapshot());
+        }
+
         // We're going to weight the latency for each host against the worst one we see, to
         // arrive at sort of a 'badness percentage' for them. First, find the worst for each:
         HashMap<InetAddress, Double> newScores = new HashMap<>();
-        for (Map.Entry<InetAddress, ExponentiallyDecayingReservoir> entry : samples.entrySet())
+        for (Map.Entry<InetAddress, Snapshot> entry : snapshots.entrySet())
         {
-            double mean = entry.getValue().getSnapshot().getMedian();
+            double mean = entry.getValue().getMedian();
             if (mean > maxLatency)
                 maxLatency = mean;
         }
         // now make another pass to do the weighting based on the maximums we found before
-        for (Map.Entry<InetAddress, ExponentiallyDecayingReservoir> entry: samples.entrySet())
+        for (Map.Entry<InetAddress, Snapshot> entry : snapshots.entrySet())
         {
-            double score = entry.getValue().getSnapshot().getMedian() / maxLatency;
+            double score = entry.getValue().getMedian() / maxLatency;
             // finally, add the severity without any weighting, since hosts scale this relative to their own load and the size of the task causing the severity.
             // "Severity" is basically a measure of compaction activity (CASSANDRA-3722).
             if (USE_SEVERITY)

diff --git a/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java b/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java
index 7c8d95e..82183bb 100644
--- a/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java
+++ b/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java

@@ -29,6 +29,7 @@
 import org.apache.cassandra.locator.TokenMetadata.Topology;
 import org.apache.cassandra.utils.FBUtilities;
 
+import com.google.common.collect.ImmutableMultimap;
 import com.google.common.collect.Multimap;
 
 /**
@@ -91,7 +92,7 @@
         // all endpoints in each DC, so we can check when we have exhausted all the members of a DC
         Multimap<String, InetAddress> allEndpoints = topology.getDatacenterEndpoints();
         // all racks in a DC so we can check when we have exhausted all racks in a DC
-        Map<String, Multimap<String, InetAddress>> racks = topology.getDatacenterRacks();
+        Map<String, ImmutableMultimap<String, InetAddress>> racks = topology.getDatacenterRacks();
         assert !allEndpoints.isEmpty() && !racks.isEmpty() : "not aware of any cluster members";
 
         // tracks the racks we have already placed replicas in

diff --git a/src/java/org/apache/cassandra/locator/TokenMetadata.java b/src/java/org/apache/cassandra/locator/TokenMetadata.java
index aafd7f9..3978eeb 100644
--- a/src/java/org/apache/cassandra/locator/TokenMetadata.java
+++ b/src/java/org/apache/cassandra/locator/TokenMetadata.java

@@ -26,13 +26,15 @@
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
-import com.google.common.base.Optional;
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.*;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.gms.FailureDetector;
@@ -93,9 +95,11 @@
 
     /* Use this lock for manipulating the token map */
     private final ReadWriteLock lock = new ReentrantReadWriteLock(true);
-    private volatile ArrayList<Token> sortedTokens;
+    private volatile ArrayList<Token> sortedTokens; // safe to be read without a lock, as it's never mutated
 
-    private final Topology topology;
+    private volatile Topology topology;
+
+    public final IPartitioner partitioner;
 
     private static final Comparator<InetAddress> inetaddressCmp = new Comparator<InetAddress>()
     {
@@ -112,17 +116,28 @@
     {
         this(SortedBiMultiValMap.<Token, InetAddress>create(null, inetaddressCmp),
              HashBiMap.<InetAddress, UUID>create(),
-             new Topology());
+             Topology.empty(),
+             DatabaseDescriptor.getPartitioner());
     }
 
-    private TokenMetadata(BiMultiValMap<Token, InetAddress> tokenToEndpointMap, BiMap<InetAddress, UUID> endpointsMap, Topology topology)
+    private TokenMetadata(BiMultiValMap<Token, InetAddress> tokenToEndpointMap, BiMap<InetAddress, UUID> endpointsMap, Topology topology, IPartitioner partitioner)
     {
         this.tokenToEndpointMap = tokenToEndpointMap;
         this.topology = topology;
+        this.partitioner = partitioner;
         endpointToHostIdMap = endpointsMap;
         sortedTokens = sortTokens();
     }
 
+    /**
+     * To be used by tests only (via {@link StorageService#setPartitionerUnsafe}).
+     */
+    @VisibleForTesting
+    public TokenMetadata cloneWithNewPartitioner(IPartitioner newPartitioner)
+    {
+        return new TokenMetadata(tokenToEndpointMap, endpointToHostIdMap, topology, newPartitioner);
+    }
+
     private ArrayList<Token> sortTokens()
     {
         return new ArrayList<>(tokenToEndpointMap.keySet());
@@ -179,6 +194,7 @@
         try
         {
             boolean shouldSortTokens = false;
+            Topology.Builder topologyBuilder = topology.unbuild();
             for (InetAddress endpoint : endpointTokens.keySet())
             {
                 Collection<Token> tokens = endpointTokens.get(endpoint);
@@ -187,7 +203,7 @@
 
                 bootstrapTokens.removeValue(endpoint);
                 tokenToEndpointMap.removeValue(endpoint);
-                topology.addEndpoint(endpoint);
+                topologyBuilder.addEndpoint(endpoint);
                 leavingEndpoints.remove(endpoint);
                 replacementToOriginal.remove(endpoint);
                 removeFromMoving(endpoint); // also removing this endpoint from moving
@@ -203,6 +219,7 @@
                     }
                 }
             }
+            topology = topologyBuilder.build();
 
             if (shouldSortTokens)
                 sortedTokens = sortTokens();
@@ -313,6 +330,7 @@
         lock.writeLock().lock();
         try
         {
+
             InetAddress oldEndpoint;
 
             for (Token token : tokens)
@@ -366,12 +384,28 @@
 
     public Optional<InetAddress> getReplacementNode(InetAddress endpoint)
     {
-        return Optional.fromNullable(replacementToOriginal.inverse().get(endpoint));
+        lock.readLock().lock();
+        try
+        {
+            return Optional.ofNullable(replacementToOriginal.inverse().get(endpoint));
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
     }
 
     public Optional<InetAddress> getReplacingNode(InetAddress endpoint)
     {
-        return Optional.fromNullable((replacementToOriginal.get(endpoint)));
+        lock.readLock().lock();
+        try
+        {
+            return Optional.ofNullable((replacementToOriginal.get(endpoint)));
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
     }
 
     public void removeBootstrapTokens(Collection<Token> tokens)
@@ -415,7 +449,6 @@
         assert endpoint != null;
 
         lock.writeLock().lock();
-
         try
         {
             movingEndpoints.add(Pair.create(token, endpoint));
@@ -435,7 +468,7 @@
         {
             bootstrapTokens.removeValue(endpoint);
             tokenToEndpointMap.removeValue(endpoint);
-            topology.removeEndpoint(endpoint);
+            topology = topology.unbuild().removeEndpoint(endpoint).build();
             leavingEndpoints.remove(endpoint);
             if (replacementToOriginal.remove(endpoint) != null)
             {
@@ -454,7 +487,7 @@
     /**
      * This is called when the snitch properties for this endpoint are updated, see CASSANDRA-10238.
      */
-    public void updateTopology(InetAddress endpoint)
+    public Topology updateTopology(InetAddress endpoint)
     {
         assert endpoint != null;
 
@@ -462,8 +495,9 @@
         try
         {
             logger.info("Updating topology for {}", endpoint);
-            topology.updateEndpoint(endpoint);
+            topology = topology.unbuild().updateEndpoint(endpoint).build();
             invalidateCachedRings();
+            return topology;
         }
         finally
         {
@@ -475,14 +509,15 @@
      * This is called when the snitch properties for many endpoints are updated, it will update
      * the topology mappings of any endpoints whose snitch has changed, see CASSANDRA-10238.
      */
-    public void updateTopology()
+    public Topology updateTopology()
     {
         lock.writeLock().lock();
         try
         {
             logger.info("Updating topology for all endpoints that have changed");
-            topology.updateEndpoints();
+            topology = topology.unbuild().updateEndpoints().build();
             invalidateCachedRings();
+            return topology;
         }
         finally
         {
@@ -575,7 +610,6 @@
         assert endpoint != null;
 
         lock.readLock().lock();
-
         try
         {
             for (Pair<Token, InetAddress> pair : movingEndpoints)
@@ -605,7 +639,8 @@
         {
             return new TokenMetadata(SortedBiMultiValMap.create(tokenToEndpointMap, null, inetaddressCmp),
                                      HashBiMap.create(endpointToHostIdMap),
-                                     new Topology(topology));
+                                     topology,
+                                     partitioner);
         }
         finally
         {
@@ -674,7 +709,6 @@
     public TokenMetadata cloneAfterAllSettled()
     {
         lock.readLock().lock();
-
         try
         {
             TokenMetadata metadata = cloneOnlyTokenMap();
@@ -791,50 +825,49 @@
     public void calculatePendingRanges(AbstractReplicationStrategy strategy, String keyspaceName)
     {
         // avoid race between both branches - do not use a lock here as this will block any other unrelated operations!
+        long startedAt = System.currentTimeMillis();
         synchronized (pendingRanges)
         {
-            if (bootstrapTokens.isEmpty() && leavingEndpoints.isEmpty() && movingEndpoints.isEmpty())
-            {
-                if (logger.isTraceEnabled())
-                    logger.trace("No bootstrapping, leaving or moving nodes -> empty pending ranges for {}", keyspaceName);
+            // create clone of current state
+            BiMultiValMap<Token, InetAddress> bootstrapTokensClone;
+            Set<InetAddress> leavingEndpointsClone;
+            Set<Pair<Token, InetAddress>> movingEndpointsClone;
+            TokenMetadata metadata;
 
-                pendingRanges.put(keyspaceName, new PendingRangeMaps());
-            }
-            else
+            lock.readLock().lock();
+            try
             {
+                if (bootstrapTokens.isEmpty() && leavingEndpoints.isEmpty() && movingEndpoints.isEmpty())
+                {
+                    if (logger.isTraceEnabled())
+                        logger.trace("No bootstrapping, leaving or moving nodes -> empty pending ranges for {}", keyspaceName);
+
+                    pendingRanges.put(keyspaceName, new PendingRangeMaps());
+
+                    return;
+                }
+
                 if (logger.isDebugEnabled())
                     logger.debug("Starting pending range calculation for {}", keyspaceName);
 
-                long startedAt = System.currentTimeMillis();
-
-                // create clone of current state
-                BiMultiValMap<Token, InetAddress> bootstrapTokens = new BiMultiValMap<>();
-                Set<InetAddress> leavingEndpoints = new HashSet<>();
-                Set<Pair<Token, InetAddress>> movingEndpoints = new HashSet<>();
-                TokenMetadata metadata;
-
-                lock.readLock().lock();
-                try
-                {
-                    bootstrapTokens.putAll(this.bootstrapTokens);
-                    leavingEndpoints.addAll(this.leavingEndpoints);
-                    movingEndpoints.addAll(this.movingEndpoints);
-                    metadata = this.cloneOnlyTokenMap();
-                }
-                finally
-                {
-                    lock.readLock().unlock();
-                }
-
-                pendingRanges.put(keyspaceName, calculatePendingRanges(strategy, metadata, bootstrapTokens,
-                                                                       leavingEndpoints, movingEndpoints));
-                long took = System.currentTimeMillis() - startedAt;
-
-                if (logger.isDebugEnabled())
-                    logger.debug("Pending range calculation for {} completed (took: {}ms)", keyspaceName, took);
-                if (logger.isTraceEnabled())
-                    logger.trace("Calculated pending ranges for {}:\n{}", keyspaceName, (pendingRanges.isEmpty() ? "<empty>" : printPendingRanges()));
+                bootstrapTokensClone  = new BiMultiValMap<>(this.bootstrapTokens);
+                leavingEndpointsClone = new HashSet<>(this.leavingEndpoints);
+                movingEndpointsClone = new HashSet<>(this.movingEndpoints);
+                metadata = this.cloneOnlyTokenMap();
             }
+            finally
+            {
+                lock.readLock().unlock();
+            }
+
+            pendingRanges.put(keyspaceName, calculatePendingRanges(strategy, metadata, bootstrapTokensClone,
+                                                                   leavingEndpointsClone, movingEndpointsClone));
+            long took = System.currentTimeMillis() - startedAt;
+
+            if (logger.isDebugEnabled())
+                logger.debug("Pending range calculation for {} completed (took: {}ms)", keyspaceName, took);
+            if (logger.isTraceEnabled())
+                logger.trace("Calculated pending ranges for {}:\n{}", keyspaceName, (pendingRanges.isEmpty() ? "<empty>" : printPendingRanges()));
         }
     }
 
@@ -944,7 +977,7 @@
     {
         List tokens = sortedTokens();
         int index = Collections.binarySearch(tokens, token);
-        assert index >= 0 : token + " not found in " + StringUtils.join(tokenToEndpointMap.keySet(), ", ");
+        assert index >= 0 : token + " not found in " + tokenToEndpointMapKeysAsStrings();
         return (Token) (index == 0 ? tokens.get(tokens.size() - 1) : tokens.get(index - 1));
     }
 
@@ -952,17 +985,30 @@
     {
         List tokens = sortedTokens();
         int index = Collections.binarySearch(tokens, token);
-        assert index >= 0 : token + " not found in " + StringUtils.join(tokenToEndpointMap.keySet(), ", ");
+        assert index >= 0 : token + " not found in " + tokenToEndpointMapKeysAsStrings();
         return (Token) ((index == (tokens.size() - 1)) ? tokens.get(0) : tokens.get(index + 1));
     }
 
+    private String tokenToEndpointMapKeysAsStrings()
+    {
+        lock.readLock().lock();
+        try
+        {
+            return StringUtils.join(tokenToEndpointMap.keySet(), ", ");
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
+    }
+
     /** @return a copy of the bootstrapping tokens map */
     public BiMultiValMap<Token, InetAddress> getBootstrapTokens()
     {
         lock.readLock().lock();
         try
         {
-            return new BiMultiValMap<Token, InetAddress>(bootstrapTokens);
+            return new BiMultiValMap<>(bootstrapTokens);
         }
         finally
         {
@@ -1041,7 +1087,7 @@
     public static Iterator<Token> ringIterator(final ArrayList<Token> ring, Token start, boolean includeMin)
     {
         if (ring.isEmpty())
-            return includeMin ? Iterators.singletonIterator(StorageService.getPartitioner().getMinimumToken())
+            return includeMin ? Iterators.singletonIterator(start.getPartitioner().getMinimumToken())
                               : Iterators.<Token>emptyIterator();
 
         final boolean insertMin = includeMin && !ring.get(0).isMinimum();
@@ -1057,7 +1103,7 @@
                 {
                     // return minimum for index == -1
                     if (j == -1)
-                        return StorageService.getPartitioner().getMinimumToken();
+                        return start.getPartitioner().getMinimumToken();
                     // return ring token for other indexes
                     return ring.get(j);
                 }
@@ -1087,7 +1133,7 @@
             pendingRanges.clear();
             movingEndpoints.clear();
             sortedTokens.clear();
-            topology.clear();
+            topology = Topology.empty();
             invalidateCachedRings();
         }
         finally
@@ -1243,6 +1289,11 @@
         cachedTokenMap.set(null);
     }
 
+    public DecoratedKey decorateKey(ByteBuffer key)
+    {
+        return partitioner.decorateKey(key);
+    }
+
     /**
      * Tracks the assignment of racks and endpoints in each datacenter for all the "normal" endpoints
      * in this TokenMetadata. This allows faster calculation of endpoints in NetworkTopologyStrategy.
@@ -1250,114 +1301,22 @@
     public static class Topology
     {
         /** multi-map of DC to endpoints in that DC */
-        private final Multimap<String, InetAddress> dcEndpoints;
+        private final ImmutableMultimap<String, InetAddress> dcEndpoints;
         /** map of DC to multi-map of rack to endpoints in that rack */
-        private final Map<String, Multimap<String, InetAddress>> dcRacks;
+        private final ImmutableMap<String, ImmutableMultimap<String, InetAddress>> dcRacks;
         /** reverse-lookup map for endpoint to current known dc/rack assignment */
-        private final Map<InetAddress, Pair<String, String>> currentLocations;
+        private final ImmutableMap<InetAddress, Pair<String, String>> currentLocations;
 
-        Topology()
+        private Topology(Builder builder)
         {
-            dcEndpoints = HashMultimap.create();
-            dcRacks = new HashMap<>();
-            currentLocations = new HashMap<>();
-        }
+            this.dcEndpoints = ImmutableMultimap.copyOf(builder.dcEndpoints);
 
-        void clear()
-        {
-            dcEndpoints.clear();
-            dcRacks.clear();
-            currentLocations.clear();
-        }
+            ImmutableMap.Builder<String, ImmutableMultimap<String, InetAddress>> dcRackBuilder = ImmutableMap.builder();
+            for (Map.Entry<String, Multimap<String, InetAddress>> entry : builder.dcRacks.entrySet())
+                dcRackBuilder.put(entry.getKey(), ImmutableMultimap.copyOf(entry.getValue()));
+            this.dcRacks = dcRackBuilder.build();
 
-        /**
-         * construct deep-copy of other
-         */
-        Topology(Topology other)
-        {
-            dcEndpoints = HashMultimap.create(other.dcEndpoints);
-            dcRacks = new HashMap<>();
-            for (String dc : other.dcRacks.keySet())
-                dcRacks.put(dc, HashMultimap.create(other.dcRacks.get(dc)));
-            currentLocations = new HashMap<>(other.currentLocations);
-        }
-
-        /**
-         * Stores current DC/rack assignment for ep
-         */
-        void addEndpoint(InetAddress ep)
-        {
-            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
-            String dc = snitch.getDatacenter(ep);
-            String rack = snitch.getRack(ep);
-            Pair<String, String> current = currentLocations.get(ep);
-            if (current != null)
-            {
-                if (current.left.equals(dc) && current.right.equals(rack))
-                    return;
-                doRemoveEndpoint(ep, current);
-            }
-
-            doAddEndpoint(ep, dc, rack);
-        }
-
-        private void doAddEndpoint(InetAddress ep, String dc, String rack)
-        {
-            dcEndpoints.put(dc, ep);
-
-            if (!dcRacks.containsKey(dc))
-                dcRacks.put(dc, HashMultimap.<String, InetAddress>create());
-            dcRacks.get(dc).put(rack, ep);
-
-            currentLocations.put(ep, Pair.create(dc, rack));
-        }
-
-        /**
-         * Removes current DC/rack assignment for ep
-         */
-        void removeEndpoint(InetAddress ep)
-        {
-            if (!currentLocations.containsKey(ep))
-                return;
-
-            doRemoveEndpoint(ep, currentLocations.remove(ep));
-        }
-
-        private void doRemoveEndpoint(InetAddress ep, Pair<String, String> current)
-        {
-            dcRacks.get(current.left).remove(current.right, ep);
-            dcEndpoints.remove(current.left, ep);
-        }
-
-        void updateEndpoint(InetAddress ep)
-        {
-            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
-            if (snitch == null || !currentLocations.containsKey(ep))
-                return;
-
-           updateEndpoint(ep, snitch);
-        }
-
-        void updateEndpoints()
-        {
-            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
-            if (snitch == null)
-                return;
-
-            for (InetAddress ep : currentLocations.keySet())
-                updateEndpoint(ep, snitch);
-        }
-
-        private void updateEndpoint(InetAddress ep, IEndpointSnitch snitch)
-        {
-            Pair<String, String> current = currentLocations.get(ep);
-            String dc = snitch.getDatacenter(ep);
-            String rack = snitch.getRack(ep);
-            if (dc.equals(current.left) && rack.equals(current.right))
-                return;
-
-            doRemoveEndpoint(ep, current);
-            doAddEndpoint(ep, dc, rack);
+            this.currentLocations = ImmutableMap.copyOf(builder.currentLocations);
         }
 
         /**
@@ -1371,9 +1330,141 @@
         /**
          * @return map of DC to multi-map of rack to endpoints in that rack
          */
-        public Map<String, Multimap<String, InetAddress>> getDatacenterRacks()
+        public ImmutableMap<String, ImmutableMultimap<String, InetAddress>> getDatacenterRacks()
         {
             return dcRacks;
         }
+
+        Builder unbuild()
+        {
+            return new Builder(this);
+        }
+
+        static Builder builder()
+        {
+            return new Builder();
+        }
+
+        static Topology empty()
+        {
+            return builder().build();
+        }
+
+        private static class Builder
+        {
+            /** multi-map of DC to endpoints in that DC */
+            private final Multimap<String, InetAddress> dcEndpoints;
+            /** map of DC to multi-map of rack to endpoints in that rack */
+            private final Map<String, Multimap<String, InetAddress>> dcRacks;
+            /** reverse-lookup map for endpoint to current known dc/rack assignment */
+            private final Map<InetAddress, Pair<String, String>> currentLocations;
+
+            Builder()
+            {
+                this.dcEndpoints = HashMultimap.create();
+                this.dcRacks = new HashMap<>();
+                this.currentLocations = new HashMap<>();
+            }
+
+            Builder(Topology from)
+            {
+                this.dcEndpoints = HashMultimap.create(from.dcEndpoints);
+
+                this.dcRacks = Maps.newHashMapWithExpectedSize(from.dcRacks.size());
+                for (Map.Entry<String, ImmutableMultimap<String, InetAddress>> entry : from.dcRacks.entrySet())
+                    dcRacks.put(entry.getKey(), HashMultimap.create(entry.getValue()));
+
+                this.currentLocations = new HashMap<>(from.currentLocations);
+            }
+
+            /**
+             * Stores current DC/rack assignment for ep
+             */
+            Builder addEndpoint(InetAddress ep)
+            {
+                IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
+                String dc = snitch.getDatacenter(ep);
+                String rack = snitch.getRack(ep);
+                Pair<String, String> current = currentLocations.get(ep);
+                if (current != null)
+                {
+                    if (current.left.equals(dc) && current.right.equals(rack))
+                        return this;
+                    doRemoveEndpoint(ep, current);
+                }
+
+                doAddEndpoint(ep, dc, rack);
+                return this;
+            }
+
+            private void doAddEndpoint(InetAddress ep, String dc, String rack)
+            {
+                dcEndpoints.put(dc, ep);
+
+                if (!dcRacks.containsKey(dc))
+                    dcRacks.put(dc, HashMultimap.<String, InetAddress>create());
+                dcRacks.get(dc).put(rack, ep);
+
+                currentLocations.put(ep, Pair.create(dc, rack));
+            }
+
+            /**
+             * Removes current DC/rack assignment for ep
+             */
+            Builder removeEndpoint(InetAddress ep)
+            {
+                if (!currentLocations.containsKey(ep))
+                    return this;
+
+                doRemoveEndpoint(ep, currentLocations.remove(ep));
+                return this;
+            }
+
+            private void doRemoveEndpoint(InetAddress ep, Pair<String, String> current)
+            {
+                dcRacks.get(current.left).remove(current.right, ep);
+                dcEndpoints.remove(current.left, ep);
+            }
+
+            Builder updateEndpoint(InetAddress ep)
+            {
+                IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
+                if (snitch == null || !currentLocations.containsKey(ep))
+                    return this;
+
+                updateEndpoint(ep, snitch);
+                return this;
+            }
+
+            Builder updateEndpoints()
+            {
+                IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
+                if (snitch == null)
+                    return this;
+
+                for (InetAddress ep : currentLocations.keySet())
+                    updateEndpoint(ep, snitch);
+
+                return this;
+            }
+
+            private void updateEndpoint(InetAddress ep, IEndpointSnitch snitch)
+            {
+                Pair<String, String> current = currentLocations.get(ep);
+                String dc = snitch.getDatacenter(ep);
+                String rack = snitch.getRack(ep);
+                if (dc.equals(current.left) && rack.equals(current.right))
+                    return;
+
+                doRemoveEndpoint(ep, current);
+                doAddEndpoint(ep, dc, rack);
+            }
+
+            Topology build()
+            {
+                return new Topology(this);
+            }
+        }
+
     }
 }

diff --git a/src/java/org/apache/cassandra/metrics/BufferPoolMetrics.java b/src/java/org/apache/cassandra/metrics/BufferPoolMetrics.java
new file mode 100644
index 0000000..107717d
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/BufferPoolMetrics.java

@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import com.codahale.metrics.Gauge;
+import com.codahale.metrics.Meter;
+import com.codahale.metrics.RatioGauge;
+import org.apache.cassandra.utils.memory.BufferPool;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public class BufferPoolMetrics
+{
+    private static final MetricNameFactory factory = new DefaultNameFactory("BufferPool");
+
+    /** Total number of misses */
+    public final Meter misses;
+
+    /** Total size of buffer pools, in bytes */
+    public final Gauge<Long> size;
+
+    public BufferPoolMetrics()
+    {
+        misses = Metrics.meter(factory.createMetricName("Misses"));
+
+        size = Metrics.register(factory.createMetricName("Size"), new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                return BufferPool.sizeInBytes();
+            }
+        });
+    }
+}

diff --git a/src/java/org/apache/cassandra/metrics/CacheMetrics.java b/src/java/org/apache/cassandra/metrics/CacheMetrics.java
index 8b00e1c..151268b 100644
--- a/src/java/org/apache/cassandra/metrics/CacheMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/CacheMetrics.java

@@ -37,8 +37,14 @@
     public final Meter hits;
     /** Total number of cache requests */
     public final Meter requests;
-    /** cache hit rate */
+    /** all time cache hit rate */
     public final Gauge<Double> hitRate;
+    /** 1m hit rate */
+    public final Gauge<Double> oneMinuteHitRate;
+    /** 5m hit rate */
+    public final Gauge<Double> fiveMinuteHitRate;
+    /** 15m hit rate */
+    public final Gauge<Double> fifteenMinuteHitRate;
     /** Total size of cache, in bytes */
     public final Gauge<Long> size;
     /** Total number of cache entries */
@@ -71,6 +77,27 @@
                 return Ratio.of(hits.getCount(), requests.getCount());
             }
         });
+        oneMinuteHitRate = Metrics.register(factory.createMetricName("OneMinuteHitRate"), new RatioGauge()
+        {
+            protected Ratio getRatio()
+            {
+                return Ratio.of(hits.getOneMinuteRate(), requests.getOneMinuteRate());
+            }
+        });
+        fiveMinuteHitRate = Metrics.register(factory.createMetricName("FiveMinuteHitRate"), new RatioGauge()
+        {
+            protected Ratio getRatio()
+            {
+                return Ratio.of(hits.getFiveMinuteRate(), requests.getFiveMinuteRate());
+            }
+        });
+        fifteenMinuteHitRate = Metrics.register(factory.createMetricName("FifteenMinuteHitRate"), new RatioGauge()
+        {
+            protected Ratio getRatio()
+            {
+                return Ratio.of(hits.getFifteenMinuteRate(), requests.getFifteenMinuteRate());
+            }
+        });
         size = Metrics.register(factory.createMetricName("Size"), new Gauge<Long>()
         {
             public Long getValue()

diff --git a/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java b/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java
index d525a26..e455dc0 100644
--- a/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java
+++ b/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java

@@ -52,6 +52,13 @@
         return counter;
     }
 
+    public Counter counter(MetricName name, MetricName alias)
+    {
+        Counter counter = counter(name);
+        registerAlias(name, alias);
+        return counter;
+    }
+
     public Meter meter(MetricName name)
     {
         Meter meter = meter(name.getMetricName());
@@ -60,6 +67,13 @@
         return meter;
     }
 
+    public Meter meter(MetricName name, MetricName alias)
+    {
+        Meter meter = meter(name);
+        registerAlias(name, alias);
+        return meter;
+    }
+
     public Histogram histogram(MetricName name, boolean considerZeroes)
     {
         Histogram histogram = register(name, new ClearableHistogram(new DecayingEstimatedHistogramReservoir(considerZeroes)));
@@ -68,6 +82,13 @@
         return histogram;
     }
 
+    public Histogram histogram(MetricName name, MetricName alias, boolean considerZeroes)
+    {
+        Histogram histogram = histogram(name, considerZeroes);
+        registerAlias(name, alias);
+        return histogram;
+    }
+
     public Timer timer(MetricName name)
     {
         Timer timer = register(name, new Timer(new DecayingEstimatedHistogramReservoir()));
@@ -76,6 +97,13 @@
         return timer;
     }
 
+    public Timer timer(MetricName name, MetricName alias)
+    {
+        Timer timer = timer(name);
+        registerAlias(name, alias);
+        return timer;
+    }
+
     public <T extends Metric> T register(MetricName name, T metric)
     {
         try
@@ -91,6 +119,13 @@
         }
     }
 
+    public <T extends Metric> T register(MetricName name, MetricName aliasName, T metric)
+    {
+        T ret = register(name, metric);
+        registerAlias(name, aliasName);
+        return ret;
+    }
+
     public boolean remove(MetricName name)
     {
         boolean removed = remove(name.getMetricName());
@@ -103,6 +138,16 @@
         return removed;
     }
 
+    public boolean remove(MetricName name, MetricName alias)
+    {
+        if (remove(name))
+        {
+            removeAlias(alias);
+            return true;
+        }
+        return false;
+    }
+
     public void registerMBean(Metric metric, ObjectName name)
     {
         AbstractBean mbean;
@@ -134,6 +179,22 @@
         catch (Exception ignored) {}
     }
 
+    private void registerAlias(MetricName existingName, MetricName aliasName)
+    {
+        Metric existing = Metrics.getMetrics().get(existingName.getMetricName());
+        assert existing != null : existingName + " not registered";
+
+        registerMBean(existing, aliasName.getMBeanName());
+    }
+
+    private void removeAlias(MetricName name)
+    {
+        try
+        {
+            MBeanWrapper.instance.unregisterMBean(name.getMBeanName());
+        } catch (Exception ignored) {}
+    }
+
     public interface MetricMBean
     {
         ObjectName objectName();

diff --git a/src/java/org/apache/cassandra/metrics/ClientMetrics.java b/src/java/org/apache/cassandra/metrics/ClientMetrics.java
index 4a384eb..08f0531 100644
--- a/src/java/org/apache/cassandra/metrics/ClientMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/ClientMetrics.java

@@ -18,9 +18,14 @@
  */
 package org.apache.cassandra.metrics;
 
+import java.util.Collection;
+import java.util.Collections;
 import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import com.codahale.metrics.Gauge;
+import com.codahale.metrics.Meter;
+import org.apache.cassandra.transport.Server;
 
 import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
 
@@ -28,13 +33,40 @@
 public class ClientMetrics
 {
     private static final MetricNameFactory factory = new DefaultNameFactory("Client");
-    
     public static final ClientMetrics instance = new ClientMetrics();
-    
+
+    private volatile boolean initialized = false;
+
+    private Collection<Server> servers = Collections.emptyList();
+
+    private AtomicInteger pausedConnections;
+    private Gauge<Integer> pausedConnectionsGauge;
+    private Meter requestDiscarded;
+
     private ClientMetrics()
     {
     }
 
+    public void pauseConnection() { pausedConnections.incrementAndGet(); }
+    public void unpauseConnection() { pausedConnections.decrementAndGet(); }
+    public void markRequestDiscarded() { requestDiscarded.mark(); }
+
+    public synchronized void init(Collection<Server> servers)
+    {
+        if (initialized)
+            return;
+
+        this.servers = servers;
+
+        registerGauge("connectedNativeClients", this::countConnectedClients);
+
+        pausedConnections = new AtomicInteger();
+        pausedConnectionsGauge = registerGauge("PausedConnections", pausedConnections::get);
+        requestDiscarded = registerMeter("RequestDiscarded");
+
+        initialized = true;
+    }
+
     public void addCounter(String name, final Callable<Integer> provider)
     {
         Metrics.register(factory.createMetricName(name), new Gauge<Integer>()
@@ -51,4 +83,24 @@
             }
         });
     }
+
+    private int countConnectedClients()
+    {
+        int count = 0;
+
+        for (Server server : servers)
+            count += server.getConnectedClients();
+
+        return count;
+    }
+
+    private <T> Gauge<T> registerGauge(String name, Gauge<T> gauge)
+    {
+        return Metrics.register(factory.createMetricName(name), gauge);
+    }
+
+    private Meter registerMeter(String name)
+    {
+        return Metrics.meter(factory.createMetricName(name));
+    }
 }

diff --git a/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java b/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java
deleted file mode 100644
index 40ed2e4..0000000
--- a/src/java/org/apache/cassandra/metrics/ColumnFamilyMetrics.java
+++ /dev/null

@@ -1,797 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.metrics;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.ConcurrentMap;
-
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-import com.codahale.metrics.*;
-import com.codahale.metrics.Timer;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Memtable;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.utils.EstimatedHistogram;
-import org.apache.cassandra.utils.TopKSampler;
-
-import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
-
-
-/**
- * Metrics for {@link ColumnFamilyStore}.
- */
-public class ColumnFamilyMetrics
-{
-
-    /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and overwritten rows. */
-    public final Gauge<Long> memtableOnHeapSize;
-    /** Total amount of data stored in the memtable that resides off-heap, including column related overhead and overwritten rows. */
-    public final Gauge<Long> memtableOffHeapSize;
-    /** Total amount of live data stored in the memtable, excluding any data structure overhead */
-    public final Gauge<Long> memtableLiveDataSize;
-    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides on-heap. */
-    public final Gauge<Long> allMemtablesOnHeapSize;
-    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides off-heap. */
-    public final Gauge<Long> allMemtablesOffHeapSize;
-    /** Total amount of live data stored in the memtables (2i and pending flush memtables included) that resides off-heap, excluding any data structure overhead */
-    public final Gauge<Long> allMemtablesLiveDataSize;
-    /** Total number of columns present in the memtable. */
-    public final Gauge<Long> memtableColumnsCount;
-    /** Number of times flush has resulted in the memtable being switched out. */
-    public final Counter memtableSwitchCount;
-    /** Current compression ratio for all SSTables */
-    public final Gauge<Double> compressionRatio;
-    /** Histogram of estimated row size (in bytes). */
-    public final Gauge<long[]> estimatedRowSizeHistogram;
-    /** Approximate number of keys in table. */
-    public final Gauge<Long> estimatedRowCount;
-    /** Histogram of estimated number of columns. */
-    public final Gauge<long[]> estimatedColumnCountHistogram;
-    /** Histogram of the number of sstable data files accessed per read */
-    public final ColumnFamilyHistogram sstablesPerReadHistogram;
-    /** (Local) read metrics */
-    public final LatencyMetrics readLatency;
-    /** (Local) range slice metrics */
-    public final LatencyMetrics rangeLatency;
-    /** (Local) write metrics */
-    public final LatencyMetrics writeLatency;
-    /** Estimated number of tasks pending for this column family */
-    public final Counter pendingFlushes;
-    /** Estimate of number of pending compactios for this CF */
-    public final Gauge<Integer> pendingCompactions;
-    /** Number of SSTables on disk for this CF */
-    public final Gauge<Integer> liveSSTableCount;
-    /** Disk space used by SSTables belonging to this CF */
-    public final Counter liveDiskSpaceUsed;
-    /** Total disk space used by SSTables belonging to this CF, including obsolete ones waiting to be GC'd */
-    public final Counter totalDiskSpaceUsed;
-    /** Size of the smallest compacted row */
-    public final Gauge<Long> minRowSize;
-    /** Size of the largest compacted row */
-    public final Gauge<Long> maxRowSize;
-    /** Size of the smallest compacted row */
-    public final Gauge<Long> meanRowSize;
-    /** Number of false positives in bloom filter */
-    public final Gauge<Long> bloomFilterFalsePositives;
-    /** Number of false positives in bloom filter from last read */
-    public final Gauge<Long> recentBloomFilterFalsePositives;
-    /** False positive ratio of bloom filter */
-    public final Gauge<Double> bloomFilterFalseRatio;
-    /** False positive ratio of bloom filter from last read */
-    public final Gauge<Double> recentBloomFilterFalseRatio;
-    /** Disk space used by bloom filter */
-    public final Gauge<Long> bloomFilterDiskSpaceUsed;
-    /** Off heap memory used by bloom filter */
-    public final Gauge<Long> bloomFilterOffHeapMemoryUsed;
-    /** Off heap memory used by index summary */
-    public final Gauge<Long> indexSummaryOffHeapMemoryUsed;
-    /** Off heap memory used by compression meta data*/
-    public final Gauge<Long> compressionMetadataOffHeapMemoryUsed;
-    /** Key cache hit rate  for this CF */
-    public final Gauge<Double> keyCacheHitRate;
-    /** Tombstones scanned in queries on this CF */
-    public final ColumnFamilyHistogram tombstoneScannedHistogram;
-    /** Live cells scanned in queries on this CF */
-    public final ColumnFamilyHistogram liveScannedHistogram;
-    /** Column update time delta on this CF */
-    public final ColumnFamilyHistogram colUpdateTimeDeltaHistogram;
-    /** Disk space used by snapshot files which */
-    public final Gauge<Long> trueSnapshotsSize;
-    /** Row cache hits, but result out of range */
-    public final Counter rowCacheHitOutOfRange;
-    /** Number of row cache hits */
-    public final Counter rowCacheHit;
-    /** Number of row cache misses */
-    public final Counter rowCacheMiss;
-    /** CAS Prepare metrics */
-    public final LatencyMetrics casPrepare;
-    /** CAS Propose metrics */
-    public final LatencyMetrics casPropose;
-    /** CAS Commit metrics */
-    public final LatencyMetrics casCommit;
-
-    public final Timer coordinatorReadLatency;
-    public final Timer coordinatorScanLatency;
-
-    /** Time spent waiting for free memtable space, either on- or off-heap */
-    public final Histogram waitingOnFreeMemtableSpace;
-
-    private final MetricNameFactory factory;
-    private static final MetricNameFactory globalNameFactory = new AllColumnFamilyMetricNameFactory();
-
-    public final Counter speculativeRetries;
-
-    public final static LatencyMetrics globalReadLatency = new LatencyMetrics(globalNameFactory, "Read");
-    public final static LatencyMetrics globalWriteLatency = new LatencyMetrics(globalNameFactory, "Write");
-    public final static LatencyMetrics globalRangeLatency = new LatencyMetrics(globalNameFactory, "Range");
-    
-    public final Map<Sampler, TopKSampler<ByteBuffer>> samplers;
-    /**
-     * stores metrics that will be rolled into a single global metric
-     */
-    public final static ConcurrentMap<String, Set<Metric>> allColumnFamilyMetrics = Maps.newConcurrentMap();
-    
-    /**
-     * Stores all metric names created that can be used when unregistering
-     */
-    public final static Set<String> all = Sets.newHashSet();
-
-    private interface GetHistogram
-    {
-        public EstimatedHistogram getHistogram(SSTableReader reader);
-    }
-
-    private static long[] combineHistograms(Iterable<SSTableReader> sstables, GetHistogram getHistogram)
-    {
-        Iterator<SSTableReader> iterator = sstables.iterator();
-        if (!iterator.hasNext())
-        {
-            return new long[0];
-        }
-        long[] firstBucket = getHistogram.getHistogram(iterator.next()).getBuckets(false);
-        long[] values = new long[firstBucket.length];
-        System.arraycopy(firstBucket, 0, values, 0, values.length);
-
-        while (iterator.hasNext())
-        {
-            long[] nextBucket = getHistogram.getHistogram(iterator.next()).getBuckets(false);
-            if (nextBucket.length > values.length)
-            {
-                long[] newValues = new long[nextBucket.length];
-                System.arraycopy(firstBucket, 0, newValues, 0, firstBucket.length);
-                for (int i = 0; i < newValues.length; i++)
-                {
-                    newValues[i] += nextBucket[i];
-                }
-                values = newValues;
-            }
-            else
-            {
-                for (int i = 0; i < values.length; i++)
-                {
-                    values[i] += nextBucket[i];
-                }
-            }
-        }
-        return values;
-    }
-    
-    /**
-     * Creates metrics for given {@link ColumnFamilyStore}.
-     *
-     * @param cfs ColumnFamilyStore to measure metrics
-     */
-    public ColumnFamilyMetrics(final ColumnFamilyStore cfs)
-    {
-        factory = new ColumnFamilyMetricNameFactory(cfs);
-
-        samplers = Maps.newHashMap();
-        for (Sampler sampler : Sampler.values())
-        {
-            samplers.put(sampler, new TopKSampler<ByteBuffer>());
-        }
-
-        memtableColumnsCount = createColumnFamilyGauge("MemtableColumnsCount", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                return cfs.getTracker().getView().getCurrentMemtable().getOperations();
-            }
-        });
-        memtableOnHeapSize = createColumnFamilyGauge("MemtableOnHeapSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                return cfs.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
-            }
-        });
-        memtableOffHeapSize = createColumnFamilyGauge("MemtableOffHeapSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                return cfs.getTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
-            }
-        });
-        memtableLiveDataSize = createColumnFamilyGauge("MemtableLiveDataSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                return cfs.getTracker().getView().getCurrentMemtable().getLiveDataSize();
-            }
-        });
-        allMemtablesOnHeapSize = createColumnFamilyGauge("AllMemtablesHeapSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long size = 0;
-                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
-                    size += cfs2.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
-                return size;
-            }
-        });
-        allMemtablesOffHeapSize = createColumnFamilyGauge("AllMemtablesOffHeapSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long size = 0;
-                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
-                    size += cfs2.getTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
-                return size;
-            }
-        });
-        allMemtablesLiveDataSize = createColumnFamilyGauge("AllMemtablesLiveDataSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long size = 0;
-                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
-                    size += cfs2.getTracker().getView().getCurrentMemtable().getLiveDataSize();
-                return size;
-            }
-        });
-        memtableSwitchCount = createColumnFamilyCounter("MemtableSwitchCount");
-        estimatedRowSizeHistogram = Metrics.register(factory.createMetricName("EstimatedRowSizeHistogram"), new Gauge<long[]>()
-        {
-            public long[] getValue()
-            {
-                return combineHistograms(cfs.getSSTables(), new GetHistogram()
-                {
-                    public EstimatedHistogram getHistogram(SSTableReader reader)
-                    {
-                        return reader.getEstimatedRowSize();
-                    }
-                });
-            }
-        });
-        estimatedRowCount = Metrics.register(factory.createMetricName("EstimatedRowCount"), new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long memtablePartitions = 0;
-                for (Memtable memtable : cfs.getTracker().getView().getAllMemtables())
-                    memtablePartitions += memtable.partitionCount();
-                return SSTableReader.getApproximateKeyCount(cfs.getSSTables()) + memtablePartitions;
-            }
-        });
-        estimatedColumnCountHistogram = Metrics.register(factory.createMetricName("EstimatedColumnCountHistogram"), new Gauge<long[]>()
-        {
-            public long[] getValue()
-            {
-                return combineHistograms(cfs.getSSTables(), new GetHistogram()
-                {
-                    public EstimatedHistogram getHistogram(SSTableReader reader)
-                    {
-                        return reader.getEstimatedColumnCount();
-                    }
-                });
-            }
-        });
-        sstablesPerReadHistogram = createColumnFamilyHistogram("SSTablesPerReadHistogram", cfs.keyspace.metric.sstablesPerReadHistogram, true);
-        compressionRatio = createColumnFamilyGauge("CompressionRatio", new Gauge<Double>()
-        {
-            public Double getValue()
-            {
-                double sum = 0;
-                int total = 0;
-                for (SSTableReader sstable : cfs.getSSTables())
-                {
-                    if (sstable.getCompressionRatio() != MetadataCollector.NO_COMPRESSION_RATIO)
-                    {
-                        sum += sstable.getCompressionRatio();
-                        total++;
-                    }
-                }
-                return total != 0 ? sum / total : 0;
-            }
-        }, new Gauge<Double>() // global gauge
-        {
-            public Double getValue()
-            {
-                double sum = 0;
-                int total = 0;
-                for (Keyspace keyspace : Keyspace.all())
-                {
-                    for (SSTableReader sstable : keyspace.getAllSSTables())
-                    {
-                        if (sstable.getCompressionRatio() != MetadataCollector.NO_COMPRESSION_RATIO)
-                        {
-                            sum += sstable.getCompressionRatio();
-                            total++;
-                        }
-                    }
-                }
-                return total != 0 ? sum / total : 0;
-            }
-        });
-        readLatency = new LatencyMetrics(factory, "Read", cfs.keyspace.metric.readLatency, globalReadLatency);
-        writeLatency = new LatencyMetrics(factory, "Write", cfs.keyspace.metric.writeLatency, globalWriteLatency);
-        rangeLatency = new LatencyMetrics(factory, "Range", cfs.keyspace.metric.rangeLatency, globalRangeLatency);
-        pendingFlushes = createColumnFamilyCounter("PendingFlushes");
-        pendingCompactions = createColumnFamilyGauge("PendingCompactions", new Gauge<Integer>()
-        {
-            public Integer getValue()
-            {
-                return cfs.getCompactionStrategy().getEstimatedRemainingTasks();
-            }
-        });
-        liveSSTableCount = createColumnFamilyGauge("LiveSSTableCount", new Gauge<Integer>()
-        {
-            public Integer getValue()
-            {
-                return cfs.getTracker().getSSTables().size();
-            }
-        });
-        liveDiskSpaceUsed = createColumnFamilyCounter("LiveDiskSpaceUsed");
-        totalDiskSpaceUsed = createColumnFamilyCounter("TotalDiskSpaceUsed");
-        minRowSize = createColumnFamilyGauge("MinRowSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long min = 0;
-                for (SSTableReader sstable : cfs.getSSTables())
-                {
-                    if (min == 0 || sstable.getEstimatedRowSize().min() < min)
-                        min = sstable.getEstimatedRowSize().min();
-                }
-                return min;
-            }
-        }, new Gauge<Long>() // global gauge
-        {
-            public Long getValue()
-            {
-                long min = Long.MAX_VALUE;
-                for (Metric cfGauge : allColumnFamilyMetrics.get("MinRowSize"))
-                {
-                    min = Math.min(min, ((Gauge<? extends Number>) cfGauge).getValue().longValue());
-                }
-                return min;
-            }
-        });
-        maxRowSize = createColumnFamilyGauge("MaxRowSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long max = 0;
-                for (SSTableReader sstable : cfs.getSSTables())
-                {
-                    if (sstable.getEstimatedRowSize().max() > max)
-                        max = sstable.getEstimatedRowSize().max();
-                }
-                return max;
-            }
-        }, new Gauge<Long>() // global gauge
-        {
-            public Long getValue()
-            {
-                long max = 0;
-                for (Metric cfGauge : allColumnFamilyMetrics.get("MaxRowSize"))
-                {
-                    max = Math.max(max, ((Gauge<? extends Number>) cfGauge).getValue().longValue());
-                }
-                return max;
-            }
-        });
-        meanRowSize = createColumnFamilyGauge("MeanRowSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long sum = 0;
-                long count = 0;
-                for (SSTableReader sstable : cfs.getSSTables())
-                {
-                    long n = sstable.getEstimatedRowSize().count();
-                    sum += sstable.getEstimatedRowSize().mean() * n;
-                    count += n;
-                }
-                return count > 0 ? sum / count : 0;
-            }
-        }, new Gauge<Long>() // global gauge
-        {
-            public Long getValue()
-            {
-                long sum = 0;
-                long count = 0;
-                for (Keyspace keyspace : Keyspace.all())
-                {
-                    for (SSTableReader sstable : keyspace.getAllSSTables())
-                    {
-                        long n = sstable.getEstimatedRowSize().count();
-                        sum += sstable.getEstimatedRowSize().mean() * n;
-                        count += n;
-                    }
-                }
-                return count > 0 ? sum / count : 0;
-            }
-        });
-        bloomFilterFalsePositives = createColumnFamilyGauge("BloomFilterFalsePositives", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long count = 0L;
-                for (SSTableReader sstable: cfs.getSSTables())
-                    count += sstable.getBloomFilterFalsePositiveCount();
-                return count;
-            }
-        });
-        recentBloomFilterFalsePositives = createColumnFamilyGauge("RecentBloomFilterFalsePositives", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long count = 0L;
-                for (SSTableReader sstable : cfs.getSSTables())
-                    count += sstable.getRecentBloomFilterFalsePositiveCount();
-                return count;
-            }
-        });
-        bloomFilterFalseRatio = createColumnFamilyGauge("BloomFilterFalseRatio", new Gauge<Double>()
-        {
-            public Double getValue()
-            {
-                long falseCount = 0L;
-                long trueCount = 0L;
-                for (SSTableReader sstable : cfs.getSSTables())
-                {
-                    falseCount += sstable.getBloomFilterFalsePositiveCount();
-                    trueCount += sstable.getBloomFilterTruePositiveCount();
-                }
-                if (falseCount == 0L && trueCount == 0L)
-                    return 0d;
-                return (double) falseCount / (trueCount + falseCount);
-            }
-        }, new Gauge<Double>() // global gauge
-        {
-            public Double getValue()
-            {
-                long falseCount = 0L;
-                long trueCount = 0L;
-                for (Keyspace keyspace : Keyspace.all())
-                {
-                    for (SSTableReader sstable : keyspace.getAllSSTables())
-                    {
-                        falseCount += sstable.getBloomFilterFalsePositiveCount();
-                        trueCount += sstable.getBloomFilterTruePositiveCount();
-                    }
-                }
-                if (falseCount == 0L && trueCount == 0L)
-                    return 0d;
-                return (double) falseCount / (trueCount + falseCount);
-            }
-        });
-        recentBloomFilterFalseRatio = createColumnFamilyGauge("RecentBloomFilterFalseRatio", new Gauge<Double>()
-        {
-            public Double getValue()
-            {
-                long falseCount = 0L;
-                long trueCount = 0L;
-                for (SSTableReader sstable: cfs.getSSTables())
-                {
-                    falseCount += sstable.getRecentBloomFilterFalsePositiveCount();
-                    trueCount += sstable.getRecentBloomFilterTruePositiveCount();
-                }
-                if (falseCount == 0L && trueCount == 0L)
-                    return 0d;
-                return (double) falseCount / (trueCount + falseCount);
-            }
-        }, new Gauge<Double>() // global gauge
-        {
-            public Double getValue()
-            {
-                long falseCount = 0L;
-                long trueCount = 0L;
-                for (Keyspace keyspace : Keyspace.all())
-                {
-                    for (SSTableReader sstable : keyspace.getAllSSTables())
-                    {
-                        falseCount += sstable.getRecentBloomFilterFalsePositiveCount();
-                        trueCount += sstable.getRecentBloomFilterTruePositiveCount();
-                    }
-                }
-                if (falseCount == 0L && trueCount == 0L)
-                    return 0d;
-                return (double) falseCount / (trueCount + falseCount);
-            }
-        });
-        bloomFilterDiskSpaceUsed = createColumnFamilyGauge("BloomFilterDiskSpaceUsed", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long total = 0;
-                for (SSTableReader sst : cfs.getSSTables())
-                    total += sst.getBloomFilterSerializedSize();
-                return total;
-            }
-        });
-        bloomFilterOffHeapMemoryUsed = createColumnFamilyGauge("BloomFilterOffHeapMemoryUsed", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long total = 0;
-                for (SSTableReader sst : cfs.getSSTables())
-                    total += sst.getBloomFilterOffHeapSize();
-                return total;
-            }
-        });
-        indexSummaryOffHeapMemoryUsed = createColumnFamilyGauge("IndexSummaryOffHeapMemoryUsed", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long total = 0;
-                for (SSTableReader sst : cfs.getSSTables())
-                    total += sst.getIndexSummaryOffHeapSize();
-                return total;
-            }
-        });
-        compressionMetadataOffHeapMemoryUsed = createColumnFamilyGauge("CompressionMetadataOffHeapMemoryUsed", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long total = 0;
-                for (SSTableReader sst : cfs.getSSTables())
-                    total += sst.getCompressionMetadataOffHeapSize();
-                return total;
-            }
-        });
-        speculativeRetries = createColumnFamilyCounter("SpeculativeRetries");
-        keyCacheHitRate = Metrics.register(factory.createMetricName("KeyCacheHitRate"), new RatioGauge()
-        {
-            @Override
-            public Ratio getRatio()
-            {
-                return Ratio.of(getNumerator(), getDenominator());
-            }
-
-            protected double getNumerator()
-            {
-                long hits = 0L;
-                for (SSTableReader sstable : cfs.getSSTables())
-                    hits += sstable.getKeyCacheHit();
-                return hits;
-            }
-
-            protected double getDenominator()
-            {
-                long requests = 0L;
-                for (SSTableReader sstable : cfs.getSSTables())
-                    requests += sstable.getKeyCacheRequest();
-                return Math.max(requests, 1); // to avoid NaN.
-            }
-        });
-        tombstoneScannedHistogram = createColumnFamilyHistogram("TombstoneScannedHistogram", cfs.keyspace.metric.tombstoneScannedHistogram, false);
-        liveScannedHistogram = createColumnFamilyHistogram("LiveScannedHistogram", cfs.keyspace.metric.liveScannedHistogram, false);
-        colUpdateTimeDeltaHistogram = createColumnFamilyHistogram("ColUpdateTimeDeltaHistogram", cfs.keyspace.metric.colUpdateTimeDeltaHistogram, false);
-        coordinatorReadLatency = Metrics.timer(factory.createMetricName("CoordinatorReadLatency"));
-        coordinatorScanLatency = Metrics.timer(factory.createMetricName("CoordinatorScanLatency"));
-        waitingOnFreeMemtableSpace = Metrics.histogram(factory.createMetricName("WaitingOnFreeMemtableSpace"), false);
-
-        trueSnapshotsSize = createColumnFamilyGauge("SnapshotsSize", new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                return cfs.trueSnapshotsSize();
-            }
-        });
-        rowCacheHitOutOfRange = createColumnFamilyCounter("RowCacheHitOutOfRange");
-        rowCacheHit = createColumnFamilyCounter("RowCacheHit");
-        rowCacheMiss = createColumnFamilyCounter("RowCacheMiss");
-
-        casPrepare = new LatencyMetrics(factory, "CasPrepare", cfs.keyspace.metric.casPrepare);
-        casPropose = new LatencyMetrics(factory, "CasPropose", cfs.keyspace.metric.casPropose);
-        casCommit = new LatencyMetrics(factory, "CasCommit", cfs.keyspace.metric.casCommit);
-    }
-
-    public void updateSSTableIterated(int count)
-    {
-        sstablesPerReadHistogram.update(count);
-    }
-
-    /**
-     * Release all associated metrics.
-     */
-    public void release()
-    {
-        for(String name : all)
-        {
-            allColumnFamilyMetrics.get(name).remove(Metrics.getMetrics().get(factory.createMetricName(name).getMetricName()));
-            Metrics.remove(factory.createMetricName(name));
-        }
-        readLatency.release();
-        writeLatency.release();
-        rangeLatency.release();
-        Metrics.remove(factory.createMetricName("EstimatedRowSizeHistogram"));
-        Metrics.remove(factory.createMetricName("EstimatedRowCount"));
-        Metrics.remove(factory.createMetricName("EstimatedColumnCountHistogram"));
-        Metrics.remove(factory.createMetricName("KeyCacheHitRate"));
-        Metrics.remove(factory.createMetricName("CoordinatorReadLatency"));
-        Metrics.remove(factory.createMetricName("CoordinatorScanLatency"));
-        Metrics.remove(factory.createMetricName("WaitingOnFreeMemtableSpace"));
-    }
-
-
-    /**
-     * Create a gauge that will be part of a merged version of all column families.  The global gauge
-     * will merge each CF gauge by adding their values 
-     */
-    protected <T extends Number> Gauge<T> createColumnFamilyGauge(final String name, Gauge<T> gauge)
-    {
-        return createColumnFamilyGauge(name, gauge, new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                long total = 0;
-                for (Metric cfGauge : allColumnFamilyMetrics.get(name))
-                {
-                    total = total + ((Gauge<? extends Number>) cfGauge).getValue().longValue();
-                }
-                return total;
-            }
-        });
-    }
-    
-    /**
-     * Create a gauge that will be part of a merged version of all column families.  The global gauge
-     * is defined as the globalGauge parameter
-     */
-    protected <G,T> Gauge<T> createColumnFamilyGauge(String name, Gauge<T> gauge, Gauge<G> globalGauge)
-    {
-        Gauge<T> cfGauge = Metrics.register(factory.createMetricName(name), gauge);
-        if (register(name, cfGauge))
-        {
-            Metrics.register(globalNameFactory.createMetricName(name), globalGauge);
-        }
-        return cfGauge;
-    }
-    
-    /**
-     * Creates a counter that will also have a global counter thats the sum of all counters across 
-     * different column families
-     */
-    protected Counter createColumnFamilyCounter(final String name)
-    {
-        Counter cfCounter = Metrics.counter(factory.createMetricName(name));
-        if (register(name, cfCounter))
-        {
-            Metrics.register(globalNameFactory.createMetricName(name), new Gauge<Long>()
-            {
-                public Long getValue()
-                {
-                    long total = 0;
-                    for (Metric cfGauge : allColumnFamilyMetrics.get(name))
-                    {
-                        total += ((Counter) cfGauge).getCount();
-                    }
-                    return total;
-                }
-            });
-        }
-        return cfCounter;
-    }
-    
-    /**
-     * Create a histogram-like interface that will register both a CF, keyspace and global level
-     * histogram and forward any updates to both
-     */
-    protected ColumnFamilyHistogram createColumnFamilyHistogram(String name, Histogram keyspaceHistogram, boolean considerZeroes)
-    {
-        Histogram cfHistogram = Metrics.histogram(factory.createMetricName(name), considerZeroes);
-        register(name, cfHistogram);
-        return new ColumnFamilyHistogram(cfHistogram, keyspaceHistogram, Metrics.histogram(globalNameFactory.createMetricName(name), considerZeroes));
-    }
-
-    /**
-     * Registers a metric to be removed when unloading CF.
-     * @return true if first time metric with that name has been registered
-     */
-    private boolean register(String name, Metric metric)
-    { 
-        boolean ret = allColumnFamilyMetrics.putIfAbsent(name,  new HashSet<Metric>()) == null;
-        allColumnFamilyMetrics.get(name).add(metric);
-        all.add(name);
-        return ret;
-    }
-    
-    public static class ColumnFamilyHistogram
-    {
-        public final Histogram[] all;
-        public final Histogram cf;
-        private ColumnFamilyHistogram(Histogram cf, Histogram keyspace, Histogram global)
-        {
-            this.cf = cf;
-            this.all = new Histogram[]{cf, keyspace, global};
-        }
-
-        public void update(long i)
-        {
-            for(Histogram histo : all)
-            {
-                histo.update(i);
-            }
-        }
-    }
-    
-    static class ColumnFamilyMetricNameFactory implements MetricNameFactory
-    {
-        private final String keyspaceName;
-        private final String columnFamilyName;
-        private final boolean isIndex;
-
-        ColumnFamilyMetricNameFactory(ColumnFamilyStore cfs)
-        {
-            this.keyspaceName = cfs.keyspace.getName();
-            this.columnFamilyName = cfs.name;
-            isIndex = cfs.isIndex();
-        }
-
-        public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
-        {
-            String groupName = ColumnFamilyMetrics.class.getPackage().getName();
-            String type = isIndex ? "IndexColumnFamily" : "ColumnFamily";
-
-            StringBuilder mbeanName = new StringBuilder();
-            mbeanName.append(groupName).append(":");
-            mbeanName.append("type=").append(type);
-            mbeanName.append(",keyspace=").append(keyspaceName);
-            mbeanName.append(",scope=").append(columnFamilyName);
-            mbeanName.append(",name=").append(metricName);
-
-            return new CassandraMetricsRegistry.MetricName(groupName, type, metricName, keyspaceName + "." + columnFamilyName, mbeanName.toString());
-        }
-    }
-    
-    static class AllColumnFamilyMetricNameFactory implements MetricNameFactory
-    {
-        public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
-        {
-            String groupName = ColumnFamilyMetrics.class.getPackage().getName(); 
-            StringBuilder mbeanName = new StringBuilder();
-            mbeanName.append(groupName).append(":");
-            mbeanName.append("type=ColumnFamily");
-            mbeanName.append(",name=").append(metricName);
-            return new CassandraMetricsRegistry.MetricName(groupName, "ColumnFamily", metricName, "all", mbeanName.toString());
-        }
-    }
-
-    public static enum Sampler
-    {
-        READS, WRITES
-    }
-}

diff --git a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
index eb00728..19eadc8 100644
--- a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java

@@ -62,7 +62,7 @@
                 for (String keyspaceName : Schema.instance.getKeyspaces())
                 {
                     for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
-                        n += cfs.getCompactionStrategy().getEstimatedRemainingTasks();
+                        n += cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
                 }
                 // add number of currently running compactions
                 return n + compactions.size();

diff --git a/src/java/org/apache/cassandra/metrics/DroppedMessageMetrics.java b/src/java/org/apache/cassandra/metrics/DroppedMessageMetrics.java
index 58c80fb..6d16f8b 100644
--- a/src/java/org/apache/cassandra/metrics/DroppedMessageMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/DroppedMessageMetrics.java

@@ -32,7 +32,11 @@
 
     public DroppedMessageMetrics(MessagingService.Verb verb)
     {
-        MetricNameFactory factory = new DefaultNameFactory("DroppedMessage", verb.toString());
+        this(new DefaultNameFactory("DroppedMessage", verb.toString()));
+    }
+
+    public DroppedMessageMetrics(MetricNameFactory factory)
+    {
         dropped = Metrics.meter(factory.createMetricName("Dropped"));
     }
 }

diff --git a/src/java/org/apache/cassandra/metrics/FileCacheMetrics.java b/src/java/org/apache/cassandra/metrics/FileCacheMetrics.java
deleted file mode 100644
index c240c03..0000000
--- a/src/java/org/apache/cassandra/metrics/FileCacheMetrics.java
+++ /dev/null

@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.metrics;
-
-import com.codahale.metrics.Gauge;
-import com.codahale.metrics.Meter;
-import com.codahale.metrics.RatioGauge;
-import org.apache.cassandra.service.FileCacheService;
-
-import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
-
-
-public class FileCacheMetrics
-{
-    private static final MetricNameFactory factory = new DefaultNameFactory("FileCache");
-
-    /** Total number of hits */
-    public final Meter hits;
-    /** Total number of requests */
-    public final Meter requests;
-    /** hit rate */
-    public final Gauge<Double> hitRate;
-    /** Total size of file cache, in bytes */
-    public final Gauge<Long> size;
-
-    public FileCacheMetrics()
-    {
-        hits = Metrics.meter(factory.createMetricName("Hits"));
-        requests = Metrics.meter(factory.createMetricName("Requests"));
-        hitRate = Metrics.register(factory.createMetricName("HitRate"), new RatioGauge()
-        {
-            @Override
-            public Ratio getRatio()
-            {
-                return Ratio.of(hits.getCount(), requests.getCount());
-            }
-        });
-        size = Metrics.register(factory.createMetricName("Size"), new Gauge<Long>()
-        {
-            public Long getValue()
-            {
-                return FileCacheService.instance.sizeInBytes();
-            }
-        });
-    }
-}

diff --git a/src/java/org/apache/cassandra/metrics/HintedHandoffMetrics.java b/src/java/org/apache/cassandra/metrics/HintedHandoffMetrics.java
index e44279a..51f6569 100644
--- a/src/java/org/apache/cassandra/metrics/HintedHandoffMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/HintedHandoffMetrics.java

@@ -21,7 +21,6 @@
 import java.util.Map.Entry;
 
 import com.codahale.metrics.Counter;
-import org.apache.cassandra.db.HintedHandOffManager;
 import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.utils.UUIDGen;
 import org.slf4j.Logger;
@@ -34,7 +33,7 @@
 import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
 
 /**
- * Metrics for {@link HintedHandOffManager}.
+ * Metrics for {@link org.apache.cassandra.hints.HintsService}.
  */
 public class HintedHandoffMetrics
 {

diff --git a/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java b/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java
new file mode 100644
index 0000000..062f67d
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java

@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+/**
+ * Metrics for {@link HintsService}.
+ */
+public final class HintsServiceMetrics
+{
+}

diff --git a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java
index 369f323..ef62034 100644
--- a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java

@@ -21,7 +21,7 @@
 
 import com.codahale.metrics.Gauge;
 import com.codahale.metrics.Histogram;
-import com.codahale.metrics.MetricRegistry;
+import com.codahale.metrics.Timer;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 
@@ -30,7 +30,6 @@
 
 import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
 
-
 /**
  * Metrics for {@link ColumnFamilyStore}.
  */
@@ -38,9 +37,9 @@
 {
     /** Total amount of live data stored in the memtable, excluding any data structure overhead */
     public final Gauge<Long> memtableLiveDataSize;
-    /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and overwritten rows. */
+    /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and partitions overwritten. */
     public final Gauge<Long> memtableOnHeapDataSize;
-    /** Total amount of data stored in the memtable that resides off-heap, including column related overhead and overwritten rows. */
+    /** Total amount of data stored in the memtable that resides off-heap, including column related overhead and partitions overwritten. */
     public final Gauge<Long> memtableOffHeapDataSize;
     /** Total amount of live data stored in the memtables (2i and pending flush memtables included) that resides off-heap, excluding any data structure overhead */
     public final Gauge<Long> allMemtablesLiveDataSize;
@@ -82,13 +81,17 @@
     public final Histogram liveScannedHistogram;
     /** Column update time delta on this Keyspace */
     public final Histogram colUpdateTimeDeltaHistogram;
+    /** time taken acquiring the partition lock for materialized view updates on this keyspace */
+    public final Timer viewLockAcquireTime;
+    /** time taken during the local read of a materialized view update */
+    public final Timer viewReadTime;
     /** CAS Prepare metric */
     public final LatencyMetrics casPrepare;
     /** CAS Propose metrics */
     public final LatencyMetrics casPropose;
     /** CAS Commit metrics */
     public final LatencyMetrics casCommit;
-    
+
     public final MetricNameFactory factory;
     private Keyspace keyspace;
     
@@ -106,125 +109,127 @@
         keyspace = ks;
         memtableColumnsCount = createKeyspaceGauge("MemtableColumnsCount", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.memtableColumnsCount.getValue();
             }
         });
         memtableLiveDataSize = createKeyspaceGauge("MemtableLiveDataSize", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.memtableLiveDataSize.getValue();
             }
         }); 
         memtableOnHeapDataSize = createKeyspaceGauge("MemtableOnHeapDataSize", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.memtableOnHeapSize.getValue();
             }
         });
         memtableOffHeapDataSize = createKeyspaceGauge("MemtableOffHeapDataSize", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.memtableOffHeapSize.getValue();
             }
         });
         allMemtablesLiveDataSize = createKeyspaceGauge("AllMemtablesLiveDataSize", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.allMemtablesLiveDataSize.getValue();
             }
         });
         allMemtablesOnHeapDataSize = createKeyspaceGauge("AllMemtablesOnHeapDataSize", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.allMemtablesOnHeapSize.getValue();
             }
         });
         allMemtablesOffHeapDataSize = createKeyspaceGauge("AllMemtablesOffHeapDataSize", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.allMemtablesOffHeapSize.getValue();
             }
         });
         memtableSwitchCount = createKeyspaceGauge("MemtableSwitchCount", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.memtableSwitchCount.getCount();
             }
         });
         pendingCompactions = createKeyspaceGauge("PendingCompactions", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return (long) metric.pendingCompactions.getValue();
             }
         });
         pendingFlushes = createKeyspaceGauge("PendingFlushes", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return (long) metric.pendingFlushes.getCount();
             }
         });
         liveDiskSpaceUsed = createKeyspaceGauge("LiveDiskSpaceUsed", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.liveDiskSpaceUsed.getCount();
             }
         });
         totalDiskSpaceUsed = createKeyspaceGauge("TotalDiskSpaceUsed", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.totalDiskSpaceUsed.getCount();
             }
         });
         bloomFilterDiskSpaceUsed = createKeyspaceGauge("BloomFilterDiskSpaceUsed", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.bloomFilterDiskSpaceUsed.getValue();
             }
         });
         bloomFilterOffHeapMemoryUsed = createKeyspaceGauge("BloomFilterOffHeapMemoryUsed", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.bloomFilterOffHeapMemoryUsed.getValue();
             }
         });
         indexSummaryOffHeapMemoryUsed = createKeyspaceGauge("IndexSummaryOffHeapMemoryUsed", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.indexSummaryOffHeapMemoryUsed.getValue();
             }
         });
         compressionMetadataOffHeapMemoryUsed = createKeyspaceGauge("CompressionMetadataOffHeapMemoryUsed", new MetricValue()
         {
-            public Long getValue(ColumnFamilyMetrics metric)
+            public Long getValue(TableMetrics metric)
             {
                 return metric.compressionMetadataOffHeapMemoryUsed.getValue();
             }
         });
-        // latency metrics for ColumnFamilyMetrics to update
+        // latency metrics for TableMetrics to update
         readLatency = new LatencyMetrics(factory, "Read");
         writeLatency = new LatencyMetrics(factory, "Write");
         rangeLatency = new LatencyMetrics(factory, "Range");
-        // create histograms for ColumnFamilyMetrics to replicate updates to
+        // create histograms for TableMetrics to replicate updates to
         sstablesPerReadHistogram = Metrics.histogram(factory.createMetricName("SSTablesPerReadHistogram"), true);
         tombstoneScannedHistogram = Metrics.histogram(factory.createMetricName("TombstoneScannedHistogram"), false);
         liveScannedHistogram = Metrics.histogram(factory.createMetricName("LiveScannedHistogram"), false);
         colUpdateTimeDeltaHistogram = Metrics.histogram(factory.createMetricName("ColUpdateTimeDeltaHistogram"), false);
+        viewLockAcquireTime =  Metrics.timer(factory.createMetricName("ViewLockAcquireTime"));
+        viewReadTime = Metrics.timer(factory.createMetricName("ViewReadTime"));
         // add manually since histograms do not use createKeyspaceGauge method
         allMetrics.addAll(Lists.newArrayList("SSTablesPerReadHistogram", "TombstoneScannedHistogram", "LiveScannedHistogram"));
 
@@ -258,7 +263,7 @@
          * @param metric of a column family in this keyspace
          * @return current value of a metric
          */
-        public Long getValue(ColumnFamilyMetrics metric);
+        public Long getValue(TableMetrics metric);
     }
 
     /**
@@ -295,7 +300,7 @@
 
         public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
         {
-            String groupName = ColumnFamilyMetrics.class.getPackage().getName();
+            String groupName = TableMetrics.class.getPackage().getName();
 
             StringBuilder mbeanName = new StringBuilder();
             mbeanName.append(groupName).append(":");

diff --git a/src/java/org/apache/cassandra/metrics/LatencyMetrics.java b/src/java/org/apache/cassandra/metrics/LatencyMetrics.java
index a2eef68..a1915b1 100644
--- a/src/java/org/apache/cassandra/metrics/LatencyMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/LatencyMetrics.java

@@ -43,6 +43,7 @@
     private List<LatencyMetrics> parents = Lists.newArrayList();
     
     protected final MetricNameFactory factory;
+    protected final MetricNameFactory aliasFactory;
     protected final String namePrefix;
 
     /**
@@ -76,11 +77,25 @@
      */
     public LatencyMetrics(MetricNameFactory factory, String namePrefix)
     {
+        this(factory, null, namePrefix);
+    }
+
+    public LatencyMetrics(MetricNameFactory factory, MetricNameFactory aliasFactory, String namePrefix)
+    {
         this.factory = factory;
+        this.aliasFactory = aliasFactory;
         this.namePrefix = namePrefix;
 
-        latency = Metrics.timer(factory.createMetricName(namePrefix + "Latency"));
-        totalLatency = Metrics.counter(factory.createMetricName(namePrefix + "TotalLatency"));
+        if (aliasFactory == null)
+        {
+            latency = Metrics.timer(factory.createMetricName(namePrefix + "Latency"));
+            totalLatency = Metrics.counter(factory.createMetricName(namePrefix + "TotalLatency"));
+        }
+        else
+        {
+            latency = Metrics.timer(factory.createMetricName(namePrefix + "Latency"), aliasFactory.createMetricName(namePrefix + "Latency"));
+            totalLatency = Metrics.counter(factory.createMetricName(namePrefix + "TotalLatency"), aliasFactory.createMetricName(namePrefix + "TotalLatency"));
+        }
     }
     
     /**
@@ -93,7 +108,7 @@
      */
     public LatencyMetrics(MetricNameFactory factory, String namePrefix, LatencyMetrics ... parents)
     {
-        this(factory, namePrefix);
+        this(factory, null, namePrefix);
         this.parents.addAll(ImmutableList.copyOf(parents));
     }
 
@@ -111,7 +126,15 @@
 
     public void release()
     {
-        Metrics.remove(factory.createMetricName(namePrefix + "Latency"));
-        Metrics.remove(factory.createMetricName(namePrefix + "TotalLatency"));
+        if (aliasFactory == null)
+        {
+            Metrics.remove(factory.createMetricName(namePrefix + "Latency"));
+            Metrics.remove(factory.createMetricName(namePrefix + "TotalLatency"));
+        }
+        else
+        {
+            Metrics.remove(factory.createMetricName(namePrefix + "Latency"), aliasFactory.createMetricName(namePrefix + "Latency"));
+            Metrics.remove(factory.createMetricName(namePrefix + "TotalLatency"), aliasFactory.createMetricName(namePrefix + "TotalLatency"));
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java
new file mode 100644
index 0000000..1f4803e
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java

@@ -0,0 +1,914 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+
+import com.codahale.metrics.*;
+import com.codahale.metrics.Timer;
+import com.google.common.collect.Maps;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.TopKSampler;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+/**
+ * Metrics for {@link ColumnFamilyStore}.
+ */
+public class TableMetrics
+{
+
+    /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and partitions overwritten. */
+    public final Gauge<Long> memtableOnHeapSize;
+    /** Total amount of data stored in the memtable that resides off-heap, including column related overhead and partitions overwritten. */
+    public final Gauge<Long> memtableOffHeapSize;
+    /** Total amount of live data stored in the memtable, excluding any data structure overhead */
+    public final Gauge<Long> memtableLiveDataSize;
+    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides on-heap. */
+    public final Gauge<Long> allMemtablesOnHeapSize;
+    /** Total amount of data stored in the memtables (2i and pending flush memtables included) that resides off-heap. */
+    public final Gauge<Long> allMemtablesOffHeapSize;
+    /** Total amount of live data stored in the memtables (2i and pending flush memtables included) that resides off-heap, excluding any data structure overhead */
+    public final Gauge<Long> allMemtablesLiveDataSize;
+    /** Total number of columns present in the memtable. */
+    public final Gauge<Long> memtableColumnsCount;
+    /** Number of times flush has resulted in the memtable being switched out. */
+    public final Counter memtableSwitchCount;
+    /** Current compression ratio for all SSTables */
+    public final Gauge<Double> compressionRatio;
+    /** Histogram of estimated partition size (in bytes). */
+    public final Gauge<long[]> estimatedPartitionSizeHistogram;
+    /** Approximate number of keys in table. */
+    public final Gauge<Long> estimatedPartitionCount;
+    /** Histogram of estimated number of columns. */
+    public final Gauge<long[]> estimatedColumnCountHistogram;
+    /** Histogram of the number of sstable data files accessed per read */
+    public final TableHistogram sstablesPerReadHistogram;
+    /** (Local) read metrics */
+    public final LatencyMetrics readLatency;
+    /** (Local) range slice metrics */
+    public final LatencyMetrics rangeLatency;
+    /** (Local) write metrics */
+    public final LatencyMetrics writeLatency;
+    /** Estimated number of tasks pending for this table */
+    public final Counter pendingFlushes;
+    /** Estimate of number of pending compactios for this table */
+    public final Gauge<Integer> pendingCompactions;
+    /** Number of SSTables on disk for this CF */
+    public final Gauge<Integer> liveSSTableCount;
+    /** Disk space used by SSTables belonging to this table */
+    public final Counter liveDiskSpaceUsed;
+    /** Total disk space used by SSTables belonging to this table, including obsolete ones waiting to be GC'd */
+    public final Counter totalDiskSpaceUsed;
+    /** Size of the smallest compacted partition */
+    public final Gauge<Long> minPartitionSize;
+    /** Size of the largest compacted partition */
+    public final Gauge<Long> maxPartitionSize;
+    /** Size of the smallest compacted partition */
+    public final Gauge<Long> meanPartitionSize;
+    /** Number of false positives in bloom filter */
+    public final Gauge<Long> bloomFilterFalsePositives;
+    /** Number of false positives in bloom filter from last read */
+    public final Gauge<Long> recentBloomFilterFalsePositives;
+    /** False positive ratio of bloom filter */
+    public final Gauge<Double> bloomFilterFalseRatio;
+    /** False positive ratio of bloom filter from last read */
+    public final Gauge<Double> recentBloomFilterFalseRatio;
+    /** Disk space used by bloom filter */
+    public final Gauge<Long> bloomFilterDiskSpaceUsed;
+    /** Off heap memory used by bloom filter */
+    public final Gauge<Long> bloomFilterOffHeapMemoryUsed;
+    /** Off heap memory used by index summary */
+    public final Gauge<Long> indexSummaryOffHeapMemoryUsed;
+    /** Off heap memory used by compression meta data*/
+    public final Gauge<Long> compressionMetadataOffHeapMemoryUsed;
+    /** Key cache hit rate  for this CF */
+    public final Gauge<Double> keyCacheHitRate;
+    /** Tombstones scanned in queries on this CF */
+    public final TableHistogram tombstoneScannedHistogram;
+    /** Live cells scanned in queries on this CF */
+    public final TableHistogram liveScannedHistogram;
+    /** Column update time delta on this CF */
+    public final TableHistogram colUpdateTimeDeltaHistogram;
+    /** time taken acquiring the partition lock for materialized view updates for this table */
+    public final TableTimer viewLockAcquireTime;
+    /** time taken during the local read of a materialized view update */
+    public final TableTimer viewReadTime;
+    /** Disk space used by snapshot files which */
+    public final Gauge<Long> trueSnapshotsSize;
+    /** Row cache hits, but result out of range */
+    public final Counter rowCacheHitOutOfRange;
+    /** Number of row cache hits */
+    public final Counter rowCacheHit;
+    /** Number of row cache misses */
+    public final Counter rowCacheMiss;
+    /** CAS Prepare metrics */
+    public final LatencyMetrics casPrepare;
+    /** CAS Propose metrics */
+    public final LatencyMetrics casPropose;
+    /** CAS Commit metrics */
+    public final LatencyMetrics casCommit;
+
+    public final Timer coordinatorReadLatency;
+    public final Timer coordinatorScanLatency;
+
+    /** Time spent waiting for free memtable space, either on- or off-heap */
+    public final Histogram waitingOnFreeMemtableSpace;
+
+    private final MetricNameFactory factory;
+    private final MetricNameFactory aliasFactory;
+    private static final MetricNameFactory globalFactory = new AllTableMetricNameFactory("Table");
+    private static final MetricNameFactory globalAliasFactory = new AllTableMetricNameFactory("ColumnFamily");
+
+    public final Counter speculativeRetries;
+
+    public final static LatencyMetrics globalReadLatency = new LatencyMetrics(globalFactory, globalAliasFactory, "Read");
+    public final static LatencyMetrics globalWriteLatency = new LatencyMetrics(globalFactory, globalAliasFactory, "Write");
+    public final static LatencyMetrics globalRangeLatency = new LatencyMetrics(globalFactory, globalAliasFactory, "Range");
+
+    public final Meter readRepairRequests;
+    public final Meter shortReadProtectionRequests;
+    public final Meter replicaSideFilteringProtectionRequests;
+
+    public final Map<Sampler, TopKSampler<ByteBuffer>> samplers;
+    /**
+     * stores metrics that will be rolled into a single global metric
+     */
+    public final static ConcurrentMap<String, Set<Metric>> allTableMetrics = Maps.newConcurrentMap();
+
+    /**
+     * Stores all metric names created that can be used when unregistering, optionally mapped to an alias name.
+     */
+    public final static Map<String, String> all = Maps.newHashMap();
+
+    private interface GetHistogram
+    {
+        EstimatedHistogram getHistogram(SSTableReader reader);
+    }
+
+    private static long[] combineHistograms(Iterable<SSTableReader> sstables, GetHistogram getHistogram)
+    {
+        Iterator<SSTableReader> iterator = sstables.iterator();
+        if (!iterator.hasNext())
+        {
+            return new long[0];
+        }
+        long[] firstBucket = getHistogram.getHistogram(iterator.next()).getBuckets(false);
+        long[] values = new long[firstBucket.length];
+        System.arraycopy(firstBucket, 0, values, 0, values.length);
+
+        while (iterator.hasNext())
+        {
+            long[] nextBucket = getHistogram.getHistogram(iterator.next()).getBuckets(false);
+            if (nextBucket.length > values.length)
+            {
+                long[] newValues = new long[nextBucket.length];
+                System.arraycopy(firstBucket, 0, newValues, 0, firstBucket.length);
+                for (int i = 0; i < newValues.length; i++)
+                {
+                    newValues[i] += nextBucket[i];
+                }
+                values = newValues;
+            }
+            else
+            {
+                for (int i = 0; i < values.length; i++)
+                {
+                    values[i] += nextBucket[i];
+                }
+            }
+        }
+        return values;
+    }
+
+    /**
+     * Creates metrics for given {@link ColumnFamilyStore}.
+     *
+     * @param cfs ColumnFamilyStore to measure metrics
+     */
+    public TableMetrics(final ColumnFamilyStore cfs)
+    {
+        factory = new TableMetricNameFactory(cfs, "Table");
+        aliasFactory = new TableMetricNameFactory(cfs, "ColumnFamily");
+
+        samplers = Maps.newHashMap();
+        for (Sampler sampler : Sampler.values())
+        {
+            samplers.put(sampler, new TopKSampler<>());
+        }
+
+        memtableColumnsCount = createTableGauge("MemtableColumnsCount", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                return cfs.getTracker().getView().getCurrentMemtable().getOperations();
+            }
+        });
+        memtableOnHeapSize = createTableGauge("MemtableOnHeapSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                return cfs.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
+            }
+        });
+        memtableOffHeapSize = createTableGauge("MemtableOffHeapSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                return cfs.getTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
+            }
+        });
+        memtableLiveDataSize = createTableGauge("MemtableLiveDataSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                return cfs.getTracker().getView().getCurrentMemtable().getLiveDataSize();
+            }
+        });
+        allMemtablesOnHeapSize = createTableGauge("AllMemtablesHeapSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long size = 0;
+                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
+                    size += cfs2.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
+                return size;
+            }
+        });
+        allMemtablesOffHeapSize = createTableGauge("AllMemtablesOffHeapSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long size = 0;
+                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
+                    size += cfs2.getTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
+                return size;
+            }
+        });
+        allMemtablesLiveDataSize = createTableGauge("AllMemtablesLiveDataSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long size = 0;
+                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
+                    size += cfs2.getTracker().getView().getCurrentMemtable().getLiveDataSize();
+                return size;
+            }
+        });
+        memtableSwitchCount = createTableCounter("MemtableSwitchCount");
+        estimatedPartitionSizeHistogram = Metrics.register(factory.createMetricName("EstimatedPartitionSizeHistogram"),
+                                                           aliasFactory.createMetricName("EstimatedRowSizeHistogram"),
+                                                           new Gauge<long[]>()
+                                                           {
+                                                               public long[] getValue()
+                                                               {
+                                                                   return combineHistograms(cfs.getSSTables(SSTableSet.CANONICAL), new GetHistogram()
+                                                                   {
+                                                                       public EstimatedHistogram getHistogram(SSTableReader reader)
+                                                                       {
+                                                                           return reader.getEstimatedPartitionSize();
+                                                                       }
+                                                                   });
+                                                               }
+                                                           });
+        estimatedPartitionCount = Metrics.register(factory.createMetricName("EstimatedPartitionCount"),
+                                                   aliasFactory.createMetricName("EstimatedRowCount"),
+                                                   new Gauge<Long>()
+                                                   {
+                                                       public Long getValue()
+                                                       {
+                                                           long memtablePartitions = 0;
+                                                           for (Memtable memtable : cfs.getTracker().getView().getAllMemtables())
+                                                               memtablePartitions += memtable.partitionCount();
+                                                           return SSTableReader.getApproximateKeyCount(cfs.getSSTables(SSTableSet.CANONICAL)) + memtablePartitions;
+                                                       }
+                                                   });
+        estimatedColumnCountHistogram = Metrics.register(factory.createMetricName("EstimatedColumnCountHistogram"),
+                                                         aliasFactory.createMetricName("EstimatedColumnCountHistogram"),
+                                                         new Gauge<long[]>()
+                                                         {
+                                                             public long[] getValue()
+                                                             {
+                                                                 return combineHistograms(cfs.getSSTables(SSTableSet.CANONICAL), new GetHistogram()
+                                                                 {
+                                                                     public EstimatedHistogram getHistogram(SSTableReader reader)
+                                                                     {
+                                                                         return reader.getEstimatedColumnCount();
+                                                                     }
+                                                                 });
+            }
+        });
+        sstablesPerReadHistogram = createTableHistogram("SSTablesPerReadHistogram", cfs.keyspace.metric.sstablesPerReadHistogram, true);
+        compressionRatio = createTableGauge("CompressionRatio", new Gauge<Double>()
+        {
+            public Double getValue()
+            {
+                double sum = 0;
+                int total = 0;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
+                {
+                    if (sstable.getCompressionRatio() != MetadataCollector.NO_COMPRESSION_RATIO)
+                    {
+                        sum += sstable.getCompressionRatio();
+                        total++;
+                    }
+                }
+                return total != 0 ? sum / total : 0;
+            }
+        }, new Gauge<Double>() // global gauge
+        {
+            public Double getValue()
+            {
+                double sum = 0;
+                int total = 0;
+                for (Keyspace keyspace : Keyspace.all())
+                {
+                    for (SSTableReader sstable : keyspace.getAllSSTables(SSTableSet.CANONICAL))
+                    {
+                        if (sstable.getCompressionRatio() != MetadataCollector.NO_COMPRESSION_RATIO)
+                        {
+                            sum += sstable.getCompressionRatio();
+                            total++;
+                        }
+                    }
+                }
+                return total != 0 ? sum / total : 0;
+            }
+        });
+        readLatency = new LatencyMetrics(factory, "Read", cfs.keyspace.metric.readLatency, globalReadLatency);
+        writeLatency = new LatencyMetrics(factory, "Write", cfs.keyspace.metric.writeLatency, globalWriteLatency);
+        rangeLatency = new LatencyMetrics(factory, "Range", cfs.keyspace.metric.rangeLatency, globalRangeLatency);
+        pendingFlushes = createTableCounter("PendingFlushes");
+        pendingCompactions = createTableGauge("PendingCompactions", new Gauge<Integer>()
+        {
+            public Integer getValue()
+            {
+                return cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
+            }
+        });
+        liveSSTableCount = createTableGauge("LiveSSTableCount", new Gauge<Integer>()
+        {
+            public Integer getValue()
+            {
+                return cfs.getTracker().getView().liveSSTables().size();
+            }
+        });
+        liveDiskSpaceUsed = createTableCounter("LiveDiskSpaceUsed");
+        totalDiskSpaceUsed = createTableCounter("TotalDiskSpaceUsed");
+        minPartitionSize = createTableGauge("MinPartitionSize", "MinRowSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long min = 0;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
+                {
+                    if (min == 0 || sstable.getEstimatedPartitionSize().min() < min)
+                        min = sstable.getEstimatedPartitionSize().min();
+                }
+                return min;
+            }
+        }, new Gauge<Long>() // global gauge
+        {
+            public Long getValue()
+            {
+                long min = Long.MAX_VALUE;
+                for (Metric cfGauge : allTableMetrics.get("MinPartitionSize"))
+                {
+                    min = Math.min(min, ((Gauge<? extends Number>) cfGauge).getValue().longValue());
+                }
+                return min;
+            }
+        });
+        maxPartitionSize = createTableGauge("MaxPartitionSize", "MaxRowSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long max = 0;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
+                {
+                    if (sstable.getEstimatedPartitionSize().max() > max)
+                        max = sstable.getEstimatedPartitionSize().max();
+                }
+                return max;
+            }
+        }, new Gauge<Long>() // global gauge
+        {
+            public Long getValue()
+            {
+                long max = 0;
+                for (Metric cfGauge : allTableMetrics.get("MaxPartitionSize"))
+                {
+                    max = Math.max(max, ((Gauge<? extends Number>) cfGauge).getValue().longValue());
+                }
+                return max;
+            }
+        });
+        meanPartitionSize = createTableGauge("MeanPartitionSize", "MeanRowSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long sum = 0;
+                long count = 0;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
+                {
+                    long n = sstable.getEstimatedPartitionSize().count();
+                    sum += sstable.getEstimatedPartitionSize().mean() * n;
+                    count += n;
+                }
+                return count > 0 ? sum / count : 0;
+            }
+        }, new Gauge<Long>() // global gauge
+        {
+            public Long getValue()
+            {
+                long sum = 0;
+                long count = 0;
+                for (Keyspace keyspace : Keyspace.all())
+                {
+                    for (SSTableReader sstable : keyspace.getAllSSTables(SSTableSet.CANONICAL))
+                    {
+                        long n = sstable.getEstimatedPartitionSize().count();
+                        sum += sstable.getEstimatedPartitionSize().mean() * n;
+                        count += n;
+                    }
+                }
+                return count > 0 ? sum / count : 0;
+            }
+        });
+        bloomFilterFalsePositives = createTableGauge("BloomFilterFalsePositives", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long count = 0L;
+                for (SSTableReader sstable: cfs.getSSTables(SSTableSet.LIVE))
+                    count += sstable.getBloomFilterFalsePositiveCount();
+                return count;
+            }
+        });
+        recentBloomFilterFalsePositives = createTableGauge("RecentBloomFilterFalsePositives", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long count = 0L;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                    count += sstable.getRecentBloomFilterFalsePositiveCount();
+                return count;
+            }
+        });
+        bloomFilterFalseRatio = createTableGauge("BloomFilterFalseRatio", new Gauge<Double>()
+        {
+            public Double getValue()
+            {
+                long falseCount = 0L;
+                long trueCount = 0L;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                {
+                    falseCount += sstable.getBloomFilterFalsePositiveCount();
+                    trueCount += sstable.getBloomFilterTruePositiveCount();
+                }
+                if (falseCount == 0L && trueCount == 0L)
+                    return 0d;
+                return (double) falseCount / (trueCount + falseCount);
+            }
+        }, new Gauge<Double>() // global gauge
+        {
+            public Double getValue()
+            {
+                long falseCount = 0L;
+                long trueCount = 0L;
+                for (Keyspace keyspace : Keyspace.all())
+                {
+                    for (SSTableReader sstable : keyspace.getAllSSTables(SSTableSet.LIVE))
+                    {
+                        falseCount += sstable.getBloomFilterFalsePositiveCount();
+                        trueCount += sstable.getBloomFilterTruePositiveCount();
+                    }
+                }
+                if (falseCount == 0L && trueCount == 0L)
+                    return 0d;
+                return (double) falseCount / (trueCount + falseCount);
+            }
+        });
+        recentBloomFilterFalseRatio = createTableGauge("RecentBloomFilterFalseRatio", new Gauge<Double>()
+        {
+            public Double getValue()
+            {
+                long falseCount = 0L;
+                long trueCount = 0L;
+                for (SSTableReader sstable: cfs.getSSTables(SSTableSet.LIVE))
+                {
+                    falseCount += sstable.getRecentBloomFilterFalsePositiveCount();
+                    trueCount += sstable.getRecentBloomFilterTruePositiveCount();
+                }
+                if (falseCount == 0L && trueCount == 0L)
+                    return 0d;
+                return (double) falseCount / (trueCount + falseCount);
+            }
+        }, new Gauge<Double>() // global gauge
+        {
+            public Double getValue()
+            {
+                long falseCount = 0L;
+                long trueCount = 0L;
+                for (Keyspace keyspace : Keyspace.all())
+                {
+                    for (SSTableReader sstable : keyspace.getAllSSTables(SSTableSet.LIVE))
+                    {
+                        falseCount += sstable.getRecentBloomFilterFalsePositiveCount();
+                        trueCount += sstable.getRecentBloomFilterTruePositiveCount();
+                    }
+                }
+                if (falseCount == 0L && trueCount == 0L)
+                    return 0d;
+                return (double) falseCount / (trueCount + falseCount);
+            }
+        });
+        bloomFilterDiskSpaceUsed = createTableGauge("BloomFilterDiskSpaceUsed", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long total = 0;
+                for (SSTableReader sst : cfs.getSSTables(SSTableSet.CANONICAL))
+                    total += sst.getBloomFilterSerializedSize();
+                return total;
+            }
+        });
+        bloomFilterOffHeapMemoryUsed = createTableGauge("BloomFilterOffHeapMemoryUsed", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long total = 0;
+                for (SSTableReader sst : cfs.getSSTables(SSTableSet.LIVE))
+                    total += sst.getBloomFilterOffHeapSize();
+                return total;
+            }
+        });
+        indexSummaryOffHeapMemoryUsed = createTableGauge("IndexSummaryOffHeapMemoryUsed", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long total = 0;
+                for (SSTableReader sst : cfs.getSSTables(SSTableSet.LIVE))
+                    total += sst.getIndexSummaryOffHeapSize();
+                return total;
+            }
+        });
+        compressionMetadataOffHeapMemoryUsed = createTableGauge("CompressionMetadataOffHeapMemoryUsed", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long total = 0;
+                for (SSTableReader sst : cfs.getSSTables(SSTableSet.LIVE))
+                    total += sst.getCompressionMetadataOffHeapSize();
+                return total;
+            }
+        });
+        speculativeRetries = createTableCounter("SpeculativeRetries");
+        keyCacheHitRate = Metrics.register(factory.createMetricName("KeyCacheHitRate"),
+                                           aliasFactory.createMetricName("KeyCacheHitRate"),
+                                           new RatioGauge()
+        {
+            @Override
+            public Ratio getRatio()
+            {
+                return Ratio.of(getNumerator(), getDenominator());
+            }
+
+            protected double getNumerator()
+            {
+                long hits = 0L;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                    hits += sstable.getKeyCacheHit();
+                return hits;
+            }
+
+            protected double getDenominator()
+            {
+                long requests = 0L;
+                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                    requests += sstable.getKeyCacheRequest();
+                return Math.max(requests, 1); // to avoid NaN.
+            }
+        });
+        tombstoneScannedHistogram = createTableHistogram("TombstoneScannedHistogram", cfs.keyspace.metric.tombstoneScannedHistogram, false);
+        liveScannedHistogram = createTableHistogram("LiveScannedHistogram", cfs.keyspace.metric.liveScannedHistogram, false);
+        colUpdateTimeDeltaHistogram = createTableHistogram("ColUpdateTimeDeltaHistogram", cfs.keyspace.metric.colUpdateTimeDeltaHistogram, false);
+        coordinatorReadLatency = Metrics.timer(factory.createMetricName("CoordinatorReadLatency"));
+        coordinatorScanLatency = Metrics.timer(factory.createMetricName("CoordinatorScanLatency"));
+        waitingOnFreeMemtableSpace = Metrics.histogram(factory.createMetricName("WaitingOnFreeMemtableSpace"), false);
+
+        // We do not want to capture view mutation specific metrics for a view
+        // They only makes sense to capture on the base table
+        if (cfs.metadata.isView())
+        {
+            viewLockAcquireTime = null;
+            viewReadTime = null;
+        }
+        else
+        {
+            viewLockAcquireTime = createTableTimer("ViewLockAcquireTime", cfs.keyspace.metric.viewLockAcquireTime);
+            viewReadTime = createTableTimer("ViewReadTime", cfs.keyspace.metric.viewReadTime);
+        }
+
+        trueSnapshotsSize = createTableGauge("SnapshotsSize", new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                return cfs.trueSnapshotsSize();
+            }
+        });
+        rowCacheHitOutOfRange = createTableCounter("RowCacheHitOutOfRange");
+        rowCacheHit = createTableCounter("RowCacheHit");
+        rowCacheMiss = createTableCounter("RowCacheMiss");
+
+        casPrepare = new LatencyMetrics(factory, "CasPrepare", cfs.keyspace.metric.casPrepare);
+        casPropose = new LatencyMetrics(factory, "CasPropose", cfs.keyspace.metric.casPropose);
+        casCommit = new LatencyMetrics(factory, "CasCommit", cfs.keyspace.metric.casCommit);
+
+        readRepairRequests = createTableMeter("ReadRepairRequests");
+        shortReadProtectionRequests = createTableMeter("ShortReadProtectionRequests");
+        replicaSideFilteringProtectionRequests = createTableMeter("ReplicaSideFilteringProtectionRequests");
+    }
+
+    public void updateSSTableIterated(int count)
+    {
+        sstablesPerReadHistogram.update(count);
+    }
+
+    /**
+     * Release all associated metrics.
+     */
+    public void release()
+    {
+        for(Map.Entry<String, String> entry : all.entrySet())
+        {
+            final CassandraMetricsRegistry.MetricName name = factory.createMetricName(entry.getKey());
+            final Metric metric = Metrics.getMetrics().get(name.getMetricName());
+            if (metric != null)
+            {  // Metric will be null if it's a view metric we are releasing. Views have null for ViewLockAcquireTime and ViewLockReadTime
+                final CassandraMetricsRegistry.MetricName alias = aliasFactory.createMetricName(entry.getValue());
+                allTableMetrics.get(entry.getKey()).remove(metric);
+                Metrics.remove(name, alias);
+            }
+        }
+        readLatency.release();
+        writeLatency.release();
+        rangeLatency.release();
+        Metrics.remove(factory.createMetricName("EstimatedPartitionSizeHistogram"), aliasFactory.createMetricName("EstimatedRowSizeHistogram"));
+        Metrics.remove(factory.createMetricName("EstimatedPartitionCount"), aliasFactory.createMetricName("EstimatedRowCount"));
+        Metrics.remove(factory.createMetricName("EstimatedColumnCountHistogram"), aliasFactory.createMetricName("EstimatedColumnCountHistogram"));
+        Metrics.remove(factory.createMetricName("KeyCacheHitRate"), aliasFactory.createMetricName("KeyCacheHitRate"));
+        Metrics.remove(factory.createMetricName("CoordinatorReadLatency"), aliasFactory.createMetricName("CoordinatorReadLatency"));
+        Metrics.remove(factory.createMetricName("CoordinatorScanLatency"), aliasFactory.createMetricName("CoordinatorScanLatency"));
+        Metrics.remove(factory.createMetricName("WaitingOnFreeMemtableSpace"), aliasFactory.createMetricName("WaitingOnFreeMemtableSpace"));
+    }
+
+
+    /**
+     * Create a gauge that will be part of a merged version of all column families.  The global gauge
+     * will merge each CF gauge by adding their values
+     */
+    protected <T extends Number> Gauge<T> createTableGauge(final String name, Gauge<T> gauge)
+    {
+        return createTableGauge(name, gauge, new Gauge<Long>()
+        {
+            public Long getValue()
+            {
+                long total = 0;
+                for (Metric cfGauge : allTableMetrics.get(name))
+                {
+                    total = total + ((Gauge<? extends Number>) cfGauge).getValue().longValue();
+                }
+                return total;
+            }
+        });
+    }
+
+    /**
+     * Create a gauge that will be part of a merged version of all column families.  The global gauge
+     * is defined as the globalGauge parameter
+     */
+    protected <G,T> Gauge<T> createTableGauge(String name, Gauge<T> gauge, Gauge<G> globalGauge)
+    {
+        return createTableGauge(name, name, gauge, globalGauge);
+    }
+
+    protected <G,T> Gauge<T> createTableGauge(String name, String alias, Gauge<T> gauge, Gauge<G> globalGauge)
+    {
+        Gauge<T> cfGauge = Metrics.register(factory.createMetricName(name), aliasFactory.createMetricName(alias), gauge);
+        if (register(name, alias, cfGauge))
+        {
+            Metrics.register(globalFactory.createMetricName(name), globalAliasFactory.createMetricName(alias), globalGauge);
+        }
+        return cfGauge;
+    }
+
+    /**
+     * Creates a counter that will also have a global counter thats the sum of all counters across
+     * different column families
+     */
+    protected Counter createTableCounter(final String name)
+    {
+        return createTableCounter(name, name);
+    }
+
+    protected Counter createTableCounter(final String name, final String alias)
+    {
+        Counter cfCounter = Metrics.counter(factory.createMetricName(name), aliasFactory.createMetricName(alias));
+        if (register(name, alias, cfCounter))
+        {
+            Metrics.register(globalFactory.createMetricName(name),
+                             globalAliasFactory.createMetricName(alias),
+                             new Gauge<Long>()
+            {
+                public Long getValue()
+                {
+                    long total = 0;
+                    for (Metric cfGauge : allTableMetrics.get(name))
+                    {
+                        total += ((Counter) cfGauge).getCount();
+                    }
+                    return total;
+                }
+            });
+        }
+        return cfCounter;
+    }
+
+    private Meter createTableMeter(final String name)
+    {
+        return createTableMeter(name, name);
+    }
+
+    private Meter createTableMeter(final String name, final String alias)
+    {
+        Meter tableMeter = Metrics.meter(factory.createMetricName(name), aliasFactory.createMetricName(alias));
+        register(name, alias, tableMeter);
+        return tableMeter;
+    }
+
+    /**
+     * Create a histogram-like interface that will register both a CF, keyspace and global level
+     * histogram and forward any updates to both
+     */
+    protected TableHistogram createTableHistogram(String name, Histogram keyspaceHistogram, boolean considerZeroes)
+    {
+        return createTableHistogram(name, name, keyspaceHistogram, considerZeroes);
+    }
+
+    protected TableHistogram createTableHistogram(String name, String alias, Histogram keyspaceHistogram, boolean considerZeroes)
+    {
+        Histogram cfHistogram = Metrics.histogram(factory.createMetricName(name), aliasFactory.createMetricName(alias), considerZeroes);
+        register(name, alias, cfHistogram);
+        return new TableHistogram(cfHistogram,
+                                  keyspaceHistogram,
+                                  Metrics.histogram(globalFactory.createMetricName(name),
+                                                    globalAliasFactory.createMetricName(alias),
+                                                    considerZeroes));
+    }
+
+    protected TableTimer createTableTimer(String name, Timer keyspaceTimer)
+    {
+        return createTableTimer(name, name, keyspaceTimer);
+    }
+
+    protected TableTimer createTableTimer(String name, String alias, Timer keyspaceTimer)
+    {
+        Timer cfTimer = Metrics.timer(factory.createMetricName(name), aliasFactory.createMetricName(alias));
+        register(name, alias, cfTimer);
+        return new TableTimer(cfTimer,
+                              keyspaceTimer,
+                              Metrics.timer(globalFactory.createMetricName(name),
+                                            globalAliasFactory.createMetricName(alias)));
+    }
+
+    /**
+     * Registers a metric to be removed when unloading CF.
+     * @return true if first time metric with that name has been registered
+     */
+    private boolean register(String name, String alias, Metric metric)
+    {
+        boolean ret = allTableMetrics.putIfAbsent(name, ConcurrentHashMap.newKeySet()) == null;
+        allTableMetrics.get(name).add(metric);
+        all.put(name, alias);
+        return ret;
+    }
+
+    public static class TableHistogram
+    {
+        public final Histogram[] all;
+        public final Histogram cf;
+        private TableHistogram(Histogram cf, Histogram keyspace, Histogram global)
+        {
+            this.cf = cf;
+            this.all = new Histogram[]{cf, keyspace, global};
+        }
+
+        public void update(long i)
+        {
+            for(Histogram histo : all)
+            {
+                histo.update(i);
+            }
+        }
+    }
+
+    public static class TableTimer
+    {
+        public final Timer[] all;
+        public final Timer cf;
+        private TableTimer(Timer cf, Timer keyspace, Timer global)
+        {
+            this.cf = cf;
+            this.all = new Timer[]{cf, keyspace, global};
+        }
+
+        public void update(long i, TimeUnit unit)
+        {
+            for(Timer timer : all)
+            {
+                timer.update(i, unit);
+            }
+        }
+    }
+
+    static class TableMetricNameFactory implements MetricNameFactory
+    {
+        private final String keyspaceName;
+        private final String tableName;
+        private final boolean isIndex;
+        private final String type;
+
+        TableMetricNameFactory(ColumnFamilyStore cfs, String type)
+        {
+            this.keyspaceName = cfs.keyspace.getName();
+            this.tableName = cfs.name;
+            this.isIndex = cfs.isIndex();
+            this.type = type;
+        }
+
+        public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
+        {
+            String groupName = TableMetrics.class.getPackage().getName();
+            String type = isIndex ? "Index" + this.type : this.type;
+
+            StringBuilder mbeanName = new StringBuilder();
+            mbeanName.append(groupName).append(":");
+            mbeanName.append("type=").append(type);
+            mbeanName.append(",keyspace=").append(keyspaceName);
+            mbeanName.append(",scope=").append(tableName);
+            mbeanName.append(",name=").append(metricName);
+
+            return new CassandraMetricsRegistry.MetricName(groupName, type, metricName, keyspaceName + "." + tableName, mbeanName.toString());
+        }
+    }
+
+    static class AllTableMetricNameFactory implements MetricNameFactory
+    {
+        private final String type;
+        public AllTableMetricNameFactory(String type)
+        {
+            this.type = type;
+        }
+
+        public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
+        {
+            String groupName = TableMetrics.class.getPackage().getName();
+            StringBuilder mbeanName = new StringBuilder();
+            mbeanName.append(groupName).append(":");
+            mbeanName.append("type=" + type);
+            mbeanName.append(",name=").append(metricName);
+            return new CassandraMetricsRegistry.MetricName(groupName, type, metricName, "all", mbeanName.toString());
+        }
+    }
+
+    public enum Sampler
+    {
+        READS, WRITES
+    }
+}

diff --git a/src/java/org/apache/cassandra/metrics/ViewWriteMetrics.java b/src/java/org/apache/cassandra/metrics/ViewWriteMetrics.java
new file mode 100644
index 0000000..df98865
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/ViewWriteMetrics.java

@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.metrics;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Timer;
+import com.codahale.metrics.Gauge;
+
+public class ViewWriteMetrics extends ClientRequestMetrics
+{
+    public final Counter viewReplicasAttempted;
+    public final Counter viewReplicasSuccess;
+    // time between when mutation is applied to local memtable to when CL.ONE is achieved on MV
+    public final Timer viewWriteLatency;
+
+    public ViewWriteMetrics(String scope) {
+        super(scope);
+        viewReplicasAttempted = Metrics.counter(factory.createMetricName("ViewReplicasAttempted"));
+        viewReplicasSuccess = Metrics.counter(factory.createMetricName("ViewReplicasSuccess"));
+        viewWriteLatency = Metrics.timer(factory.createMetricName("ViewWriteLatency"));
+        Metrics.register(factory.createMetricName("ViewPendingMutations"), new Gauge<Long>()
+                {
+                    public Long getValue()
+                    {
+                        return viewReplicasAttempted.getCount() - viewReplicasSuccess.getCount();
+                    }
+                });
+    }
+
+    public void release()
+    {
+        super.release();
+        Metrics.remove(factory.createMetricName("ViewReplicasAttempted"));
+        Metrics.remove(factory.createMetricName("ViewReplicasSuccess"));
+        Metrics.remove(factory.createMetricName("ViewWriteLatency"));
+        Metrics.remove(factory.createMetricName("ViewPendingMutations"));
+    }
+}

diff --git a/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java b/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java
index f7dc240..b97b836 100644
--- a/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java
+++ b/src/java/org/apache/cassandra/net/IncomingStreamingConnection.java

@@ -18,8 +18,6 @@
 package org.apache.cassandra.net;
 
 import java.io.Closeable;
-import java.io.DataInput;
-import java.io.DataInputStream;
 import java.io.IOException;
 import java.net.Socket;
 import java.util.Set;
@@ -27,6 +25,8 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.streaming.StreamResultFuture;
 import org.apache.cassandra.streaming.messages.StreamInitMessage;
@@ -52,6 +52,7 @@
     }
 
     @Override
+    @SuppressWarnings("resource") // Not closing constructed DataInputPlus's as the stream needs to remain open.
     public void run()
     {
         try
@@ -61,7 +62,7 @@
             if (version != StreamMessage.CURRENT_VERSION)
                 throw new IOException(String.format("Received stream using protocol version %d (my version %d). Terminating connection", version, StreamMessage.CURRENT_VERSION));
 
-            DataInput input = new DataInputStream(socket.getInputStream());
+            DataInputPlus input = new DataInputStreamPlus(socket.getInputStream());
             StreamInitMessage init = StreamInitMessage.serializer.deserialize(input, version);
 
             //Set SO_TIMEOUT on follower side

diff --git a/src/java/org/apache/cassandra/net/IncomingTcpConnection.java b/src/java/org/apache/cassandra/net/IncomingTcpConnection.java
index 6ac9cd1..e79da31 100644
--- a/src/java/org/apache/cassandra/net/IncomingTcpConnection.java
+++ b/src/java/org/apache/cassandra/net/IncomingTcpConnection.java

@@ -38,7 +38,8 @@
 import org.xerial.snappy.SnappyInputStream;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.UnknownColumnFamilyException;
-import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.NIODataInputStream;
 
 public class IncomingTcpConnection extends Thread implements Closeable
@@ -131,6 +132,7 @@
         }
     }
 
+    @SuppressWarnings("resource") // Not closing constructed DataInputPlus's as the stream needs to remain open.
     private void receiveMessages() throws IOException
     {
         // handshake (true) endpoint versions
@@ -139,10 +141,13 @@
         // to connect with, the other node will disconnect
         out.writeInt(MessagingService.current_version);
         out.flush();
-        DataInput in = new DataInputStream(socket.getInputStream());
-        int maxVersion = in.readInt();
+
         // outbound side will reconnect if necessary to upgrade version
-        assert version <= MessagingService.current_version;
+        if (version > MessagingService.current_version)
+            throw new IOException("Peer-used messaging version " + version + " is larger than max supported " + MessagingService.current_version);
+
+        DataInputPlus in = new DataInputStreamPlus(socket.getInputStream());
+        int maxVersion = in.readInt();
         from = CompactEndpointSerializationHelper.deserialize(in);
         // record the (true) version of the endpoint
         MessagingService.instance().setVersion(from, maxVersion);
@@ -153,20 +158,19 @@
             logger.trace("Upgrading incoming connection to be compressed");
             if (version < MessagingService.VERSION_21)
             {
-                in = new DataInputStream(new SnappyInputStream(socket.getInputStream()));
+                in = new DataInputStreamPlus(new SnappyInputStream(socket.getInputStream()));
             }
             else
             {
                 LZ4FastDecompressor decompressor = LZ4Factory.fastestInstance().fastDecompressor();
                 Checksum checksum = XXHashFactory.fastestInstance().newStreamingHash32(OutboundTcpConnection.LZ4_HASH_SEED).asChecksum();
-                in = new DataInputStream(new LZ4BlockInputStream(socket.getInputStream(),
+                in = new DataInputStreamPlus(new LZ4BlockInputStream(socket.getInputStream(),
                                                                  decompressor,
                                                                  checksum));
             }
         }
         else
         {
-            @SuppressWarnings("resource")
             ReadableByteChannel channel = socket.getChannel();
             in = new NIODataInputStream(channel != null ? channel : Channels.newChannel(socket.getInputStream()), BUFFER_SIZE);
         }
@@ -178,7 +182,7 @@
         }
     }
 
-    private InetAddress receiveMessage(DataInput input, int version) throws IOException
+    private InetAddress receiveMessage(DataInputPlus input, int version) throws IOException
     {
         int id;
         if (version < MessagingService.VERSION_20)

diff --git a/src/java/org/apache/cassandra/net/MessageDeliveryTask.java b/src/java/org/apache/cassandra/net/MessageDeliveryTask.java
index 4211f5a..ce6eebc 100644
--- a/src/java/org/apache/cassandra/net/MessageDeliveryTask.java
+++ b/src/java/org/apache/cassandra/net/MessageDeliveryTask.java

@@ -24,8 +24,8 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.filter.TombstoneOverwhelmingException;
-import org.apache.cassandra.db.index.IndexNotAvailableException;
 import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.index.IndexNotAvailableException;
 
 public class MessageDeliveryTask implements Runnable
 {

diff --git a/src/java/org/apache/cassandra/net/MessageIn.java b/src/java/org/apache/cassandra/net/MessageIn.java
index 10260c2..64b8e81 100644
--- a/src/java/org/apache/cassandra/net/MessageIn.java
+++ b/src/java/org/apache/cassandra/net/MessageIn.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.net;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.Collections;
@@ -27,10 +26,10 @@
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 
 public class MessageIn<T>
@@ -57,7 +56,7 @@
         return new MessageIn<T>(from, payload, parameters, verb, version);
     }
 
-    public static <T2> MessageIn<T2> read(DataInput in, int version, int id) throws IOException
+    public static <T2> MessageIn<T2> read(DataInputPlus in, int version, int id) throws IOException
     {
         InetAddress from = CompactEndpointSerializationHelper.deserialize(in);
 
@@ -89,7 +88,7 @@
             if (callback == null)
             {
                 // reply for expired callback.  we'll have to skip it.
-                FileUtils.skipBytesFully(in, payloadSize);
+                in.skipBytesFully(payloadSize);
                 return null;
             }
             serializer = (IVersionedSerializer<T2>) callback.serializer;

diff --git a/src/java/org/apache/cassandra/net/MessageOut.java b/src/java/org/apache/cassandra/net/MessageOut.java
index 1e291c2..09ff63b 100644
--- a/src/java/org/apache/cassandra/net/MessageOut.java
+++ b/src/java/org/apache/cassandra/net/MessageOut.java

@@ -30,6 +30,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
@@ -109,7 +110,7 @@
     {
         CompactEndpointSerializationHelper.serialize(from, out);
 
-        out.writeInt(verb.ordinal());
+        out.writeInt(MessagingService.Verb.convertForMessagingServiceVersion(verb, version).ordinal());
         out.writeInt(parameters.size());
         for (Map.Entry<String, byte[]> entry : parameters.entrySet())
         {
@@ -129,18 +130,18 @@
     {
         int size = CompactEndpointSerializationHelper.serializedSize(from);
 
-        size += TypeSizes.NATIVE.sizeof(verb.ordinal());
-        size += TypeSizes.NATIVE.sizeof(parameters.size());
+        size += TypeSizes.sizeof(verb.ordinal());
+        size += TypeSizes.sizeof(parameters.size());
         for (Map.Entry<String, byte[]> entry : parameters.entrySet())
         {
-            size += TypeSizes.NATIVE.sizeof(entry.getKey());
-            size += TypeSizes.NATIVE.sizeof(entry.getValue().length);
+            size += TypeSizes.sizeof(entry.getKey());
+            size += TypeSizes.sizeof(entry.getValue().length);
             size += entry.getValue().length;
         }
 
         long longSize = payloadSize(version);
         assert longSize <= Integer.MAX_VALUE; // larger values are supported in sstables but not messages
-        size += TypeSizes.NATIVE.sizeof((int) longSize);
+        size += TypeSizes.sizeof((int) longSize);
         size += longSize;
         return size;
     }

diff --git a/src/java/org/apache/cassandra/net/MessagingService.java b/src/java/org/apache/cassandra/net/MessagingService.java
index f125b09..b6f12a5 100644
--- a/src/java/org/apache/cassandra/net/MessagingService.java
+++ b/src/java/org/apache/cassandra/net/MessagingService.java

@@ -36,11 +36,8 @@
 import com.google.common.collect.Sets;
 
 import org.cliffc.high_scale_lib.NonBlockingHashMap;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.concurrent.ExecutorLocal;
 import org.apache.cassandra.concurrent.ExecutorLocals;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.concurrent.Stage;
@@ -49,6 +46,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.batchlog.Batch;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.BootStrapper;
 import org.apache.cassandra.dht.IPartitioner;
@@ -57,10 +55,14 @@
 import org.apache.cassandra.gms.GossipDigestAck;
 import org.apache.cassandra.gms.GossipDigestAck2;
 import org.apache.cassandra.gms.GossipDigestSyn;
+import org.apache.cassandra.hints.HintMessage;
+import org.apache.cassandra.hints.HintResponse;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.locator.ILatencySubscriber;
+import org.apache.cassandra.metrics.CassandraMetricsRegistry;
 import org.apache.cassandra.metrics.ConnectionMetrics;
 import org.apache.cassandra.metrics.DroppedMessageMetrics;
 import org.apache.cassandra.repair.messages.RepairMessage;
@@ -75,6 +77,10 @@
 
 public final class MessagingService implements MessagingServiceMBean
 {
+    // Required to allow schema migrations while upgrading within the minor 3.0.x versions to 3.0.14.
+    // See CASSANDRA-13004 for details.
+    public final static boolean FORCE_3_0_PROTOCOL_VERSION = Boolean.getBoolean("cassandra.force_3_0_protocol_version");
+
     public static final String MBEAN_NAME = "org.apache.cassandra.net:type=MessagingService";
 
     // 8 bits version, so don't waste versions
@@ -82,7 +88,9 @@
     public static final int VERSION_20 = 7;
     public static final int VERSION_21 = 8;
     public static final int VERSION_22 = 9;
-    public static final int current_version = VERSION_22;
+    public static final int VERSION_30 = 10;
+    public static final int VERSION_3014 = 11;
+    public static final int current_version = FORCE_3_0_PROTOCOL_VERSION ? VERSION_30 : VERSION_3014;
 
     public static final String FAILURE_CALLBACK_PARAM = "CAL_BAC";
     public static final byte[] ONE_BYTE = new byte[1];
@@ -94,17 +102,18 @@
     public static final int PROTOCOL_MAGIC = 0xCA552DFA;
 
     private boolean allNodesAtLeast22 = true;
+    private boolean allNodesAtLeast30 = true;
 
     /* All verb handler identifiers */
     public enum Verb
     {
         MUTATION,
-        @Deprecated BINARY,
+        HINT,
         READ_REPAIR,
         READ,
         REQUEST_RESPONSE, // client-initiated reads and writes
-        @Deprecated STREAM_INITIATE,
-        @Deprecated STREAM_INITIATE_DONE,
+        BATCH_STORE,  // was @Deprecated STREAM_INITIATE,
+        BATCH_REMOVE, // was @Deprecated STREAM_INITIATE_DONE,
         @Deprecated STREAM_REPLY,
         @Deprecated STREAM_REQUEST,
         RANGE_SLICE,
@@ -131,16 +140,34 @@
         _TRACE, // dummy verb so we can use MS.droppedMessagesMap
         ECHO,
         REPAIR_MESSAGE,
-        // use as padding for backwards compatability where a previous version needs to validate a verb from the future.
         PAXOS_PREPARE,
         PAXOS_PROPOSE,
         PAXOS_COMMIT,
-        PAGED_RANGE,
-        // remember to add new verbs at the end, since we serialize by ordinal
-        UNUSED_1,
+        @Deprecated PAGED_RANGE,
+        PING,
+
+        // UNUSED verbs were used as padding for backward/forward compatability before 4.0,
+        // but it wasn't quite as bullet/future proof as needed. We still need to keep these entries
+        // around, at least for a major rev or two (post-4.0). see CASSANDRA-13993 for a discussion.
+        // For now, though, the UNUSED are legacy values (placeholders, basically) that should only be used
+        // for correctly adding VERBs that need to be emergency additions to 3.0/3.11.
+        // We can reclaim them (their id's, to be correct) in future versions, if desired, though.
         UNUSED_2,
         UNUSED_3,
+        UNUSED_4,
+        UNUSED_5,
         ;
+        // remember to add new verbs at the end, since we serialize by ordinal
+
+        // This is to support a "late" choice of the verb based on the messaging service version.
+        // See CASSANDRA-12249 for more details.
+        public static Verb convertForMessagingServiceVersion(Verb verb, int version)
+        {
+            if (verb == PAGED_RANGE && version >= VERSION_30)
+                return RANGE_SLICE;
+
+            return verb;
+        }
     }
 
     public static final EnumMap<MessagingService.Verb, Stage> verbStages = new EnumMap<MessagingService.Verb, Stage>(MessagingService.Verb.class)
@@ -148,10 +175,13 @@
         put(Verb.MUTATION, Stage.MUTATION);
         put(Verb.COUNTER_MUTATION, Stage.COUNTER_MUTATION);
         put(Verb.READ_REPAIR, Stage.MUTATION);
+        put(Verb.HINT, Stage.MUTATION);
         put(Verb.TRUNCATE, Stage.MUTATION);
         put(Verb.PAXOS_PREPARE, Stage.MUTATION);
         put(Verb.PAXOS_PROPOSE, Stage.MUTATION);
         put(Verb.PAXOS_COMMIT, Stage.MUTATION);
+        put(Verb.BATCH_STORE, Stage.MUTATION);
+        put(Verb.BATCH_REMOVE, Stage.MUTATION);
 
         put(Verb.READ, Stage.READ);
         put(Verb.RANGE_SLICE, Stage.READ);
@@ -184,9 +214,10 @@
         put(Verb.SNAPSHOT, Stage.MISC);
         put(Verb.ECHO, Stage.GOSSIP);
 
-        put(Verb.UNUSED_1, Stage.INTERNAL_RESPONSE);
         put(Verb.UNUSED_2, Stage.INTERNAL_RESPONSE);
         put(Verb.UNUSED_3, Stage.INTERNAL_RESPONSE);
+
+        put(Verb.PING, Stage.READ);
     }};
 
     /**
@@ -205,9 +236,9 @@
 
         put(Verb.MUTATION, Mutation.serializer);
         put(Verb.READ_REPAIR, Mutation.serializer);
-        put(Verb.READ, ReadCommand.serializer);
-        put(Verb.RANGE_SLICE, RangeSliceCommand.serializer);
-        put(Verb.PAGED_RANGE, PagedRangeCommand.serializer);
+        put(Verb.READ, ReadCommand.readSerializer);
+        put(Verb.RANGE_SLICE, ReadCommand.rangeSliceSerializer);
+        put(Verb.PAGED_RANGE, ReadCommand.pagedRangeSerializer);
         put(Verb.BOOTSTRAP_TOKEN, BootStrapper.StringSerializer.instance);
         put(Verb.REPAIR_MESSAGE, RepairMessage.serializer);
         put(Verb.GOSSIP_DIGEST_ACK, GossipDigestAck.serializer);
@@ -222,6 +253,10 @@
         put(Verb.PAXOS_PREPARE, Commit.serializer);
         put(Verb.PAXOS_PROPOSE, Commit.serializer);
         put(Verb.PAXOS_COMMIT, Commit.serializer);
+        put(Verb.HINT, HintMessage.serializer);
+        put(Verb.BATCH_STORE, Batch.serializer);
+        put(Verb.BATCH_REMOVE, UUIDSerializer.serializer);
+        put(Verb.PING, PingMessage.serializer);
     }};
 
     /**
@@ -230,10 +265,11 @@
     public static final EnumMap<Verb, IVersionedSerializer<?>> callbackDeserializers = new EnumMap<Verb, IVersionedSerializer<?>>(Verb.class)
     {{
         put(Verb.MUTATION, WriteResponse.serializer);
+        put(Verb.HINT, HintResponse.serializer);
         put(Verb.READ_REPAIR, WriteResponse.serializer);
         put(Verb.COUNTER_MUTATION, WriteResponse.serializer);
-        put(Verb.RANGE_SLICE, RangeSliceReply.serializer);
-        put(Verb.PAGED_RANGE, RangeSliceReply.serializer);
+        put(Verb.RANGE_SLICE, ReadResponse.rangeSliceSerializer);
+        put(Verb.PAGED_RANGE, ReadResponse.rangeSliceSerializer);
         put(Verb.READ, ReadResponse.serializer);
         put(Verb.TRUNCATE, TruncateResponse.serializer);
         put(Verb.SNAPSHOT, null);
@@ -245,6 +281,9 @@
 
         put(Verb.PAXOS_PREPARE, PrepareResponse.serializer);
         put(Verb.PAXOS_PROPOSE, BooleanSerializer.serializer);
+
+        put(Verb.BATCH_STORE, WriteResponse.serializer);
+        put(Verb.BATCH_REMOVE, WriteResponse.serializer);
     }};
 
     /* This records all the results mapped by message Id */
@@ -258,7 +297,7 @@
     {
         public static final CallbackDeterminedSerializer instance = new CallbackDeterminedSerializer();
 
-        public Object deserialize(DataInput in, int version) throws IOException
+        public Object deserialize(DataInputPlus in, int version) throws IOException
         {
             throw new UnsupportedOperationException();
         }
@@ -277,7 +316,7 @@
     /* Lookup table for registering message handlers based on the verb. */
     private final Map<Verb, IVerbHandler> verbHandlers;
 
-    private final ConcurrentMap<InetAddress, OutboundTcpConnectionPool> connectionManagers = new NonBlockingHashMap<InetAddress, OutboundTcpConnectionPool>();
+    private final ConcurrentMap<InetAddress, OutboundTcpConnectionPool> connectionManagers = new NonBlockingHashMap<>();
 
     private static final Logger logger = LoggerFactory.getLogger(MessagingService.class);
     private static final int LOG_DROPPED_INTERVAL_IN_MS = 5000;
@@ -293,11 +332,14 @@
     public static final EnumSet<Verb> DROPPABLE_VERBS = EnumSet.of(Verb._TRACE,
                                                                    Verb.MUTATION,
                                                                    Verb.COUNTER_MUTATION,
+                                                                   Verb.HINT,
                                                                    Verb.READ_REPAIR,
                                                                    Verb.READ,
                                                                    Verb.RANGE_SLICE,
                                                                    Verb.PAGED_RANGE,
-                                                                   Verb.REQUEST_RESPONSE);
+                                                                   Verb.REQUEST_RESPONSE,
+                                                                   Verb.BATCH_STORE,
+                                                                   Verb.BATCH_REMOVE);
 
 
     private static final class DroppedMessages
@@ -308,12 +350,26 @@
 
         DroppedMessages(Verb verb)
         {
-            this.metrics = new DroppedMessageMetrics(verb);
+            this(new DroppedMessageMetrics(verb));
+        }
+
+        DroppedMessages(DroppedMessageMetrics metrics)
+        {
+            this.metrics = metrics;
             this.droppedInternalTimeout = new AtomicInteger(0);
             this.droppedCrossNodeTimeout = new AtomicInteger(0);
         }
-
     }
+
+    @VisibleForTesting
+    public void resetDroppedMessagesMap(String scope)
+    {
+        for (Verb verb : droppedMessagesMap.keySet())
+            droppedMessagesMap.put(verb, new DroppedMessages(new DroppedMessageMetrics(metricName -> {
+                return new CassandraMetricsRegistry.MetricName("DroppedMessages", metricName, scope);
+            })));
+    }
+
     // total dropped message counts for server lifetime
     private final Map<Verb, DroppedMessages> droppedMessagesMap = new EnumMap<>(Verb.class);
 
@@ -330,6 +386,11 @@
         messageSinks.add(sink);
     }
 
+    public void removeMessageSink(IMessageSink sink)
+    {
+        messageSinks.remove(sink);
+    }
+
     public void clearMessageSinks()
     {
         messageSinks.clear();
@@ -361,7 +422,7 @@
             droppedMessagesMap.put(verb, new DroppedMessages(verb));
 
         listenGate = new SimpleCondition();
-        verbHandlers = new EnumMap<Verb, IVerbHandler>(Verb.class);
+        verbHandlers = new EnumMap<>(Verb.class);
         if (!testOnly)
         {
             Runnable logDropped = new Runnable()
@@ -622,7 +683,9 @@
                            ConsistencyLevel consistencyLevel,
                            boolean allowHints)
     {
-        assert message.verb == Verb.MUTATION || message.verb == Verb.COUNTER_MUTATION || message.verb == Verb.PAXOS_COMMIT;
+        assert message.verb == Verb.MUTATION
+            || message.verb == Verb.COUNTER_MUTATION
+            || message.verb == Verb.PAXOS_COMMIT;
         int messageId = nextId();
 
         CallbackInfo previous = callbacks.put(messageId,
@@ -753,11 +816,18 @@
      */
     public void shutdown()
     {
+        shutdown(true);
+    }
+    public void shutdown(boolean gracefully)
+    {
         logger.info("Waiting for messaging service to quiesce");
         // We may need to schedule hints on the mutation stage, so it's erroneous to shut down the mutation stage first
         assert !StageManager.getStage(Stage.MUTATION).isShutdown();
 
         // the important part
+        if (!gracefully)
+            callbacks.reset();
+
         if (!callbacks.shutdownBlocking())
             logger.warn("Failed to wait for messaging service callbacks shutdown");
 
@@ -766,6 +836,7 @@
         {
             clearMessageSinks();
             for (SocketThread th : socketThreads)
+            {
                 try
                 {
                     th.close();
@@ -773,9 +844,9 @@
                 catch (IOException e)
                 {
                     // see https://issues.apache.org/jira/browse/CASSANDRA-10545
-                    handleIOException(e);
+                    handleIOExceptionOnClose(e);
                 }
-
+            }
             connectionManagers.values().forEach(OutboundTcpConnectionPool::close);
         }
         catch (IOException e)
@@ -841,6 +912,11 @@
         return allNodesAtLeast22;
     }
 
+    public boolean areAllNodesAtLeast30()
+    {
+        return allNodesAtLeast30;
+    }
+
     /**
      * @return the last version associated with address, or @param version if this is the first such version
      */
@@ -850,12 +926,14 @@
 
         if (version < VERSION_22)
             allNodesAtLeast22 = false;
+        if (version < VERSION_30)
+            allNodesAtLeast30 = false;
 
         Integer v = versions.put(endpoint, version);
 
-        // if the version was increased to 2.2 or later, see if all nodes are >= 2.2 now
-        if (v != null && v < VERSION_22 && version >= VERSION_22)
-            refreshAllNodesAtLeast22();
+        // if the version was increased to 2.2 or later see if the min version across the cluster has changed
+        if (v != null && (v < VERSION_30 && version >= VERSION_22))
+            refreshAllNodeMinVersions();
 
         return v == null ? version : v;
     }
@@ -864,23 +942,35 @@
     {
         logger.trace("Resetting version for {}", endpoint);
         Integer removed = versions.remove(endpoint);
-        if (removed != null && removed <= VERSION_22)
-            refreshAllNodesAtLeast22();
+        if (removed != null && Math.min(removed, current_version) <= VERSION_30)
+            refreshAllNodeMinVersions();
     }
 
-    private void refreshAllNodesAtLeast22()
+    private void refreshAllNodeMinVersions()
     {
-        for (Integer version: versions.values())
+        boolean anyNodeLowerThan30 = false;
+        for (Integer version : versions.values())
         {
-            if (version < VERSION_22)
+            if (version < MessagingService.VERSION_30)
+            {
+                anyNodeLowerThan30 = true;
+                allNodesAtLeast30 = false;
+            }
+
+            if (version < MessagingService.VERSION_22)
             {
                 allNodesAtLeast22 = false;
                 return;
             }
         }
         allNodesAtLeast22 = true;
+        allNodesAtLeast30 = !anyNodeLowerThan30;
     }
 
+    /**
+     * Returns the messaging-version as announced by the given node but capped
+     * to the min of the version as announced by the node and {@link #current_version}.
+     */
     public int getVersion(InetAddress endpoint)
     {
         Integer v = versions.get(endpoint);
@@ -899,6 +989,9 @@
         return getVersion(InetAddress.getByName(endpoint));
     }
 
+    /**
+     * Returns the messaging-version exactly as announced by the given endpoint.
+     */
     public int getRawVersion(InetAddress endpoint)
     {
         Integer v = versions.get(endpoint);
@@ -1047,7 +1140,8 @@
             catch (IOException e)
             {
                 // see https://issues.apache.org/jira/browse/CASSANDRA-8220
-                handleIOException(e);
+                // see https://issues.apache.org/jira/browse/CASSANDRA-12513
+                handleIOExceptionOnClose(e);
             }
             for (Closeable connection : connections)
             {
@@ -1061,15 +1155,24 @@
         }
     }
 
-    private static void handleIOException(IOException e) throws IOException
+    private static void handleIOExceptionOnClose(IOException e) throws IOException
     {
         // dirty hack for clean shutdown on OSX w/ Java >= 1.8.0_20
-        // see https://bugs.openjdk.java.net/browse/JDK-8050499
-        if ((!"Unknown error: 316".equals(e.getMessage()) || !"Mac OS X".equals(System.getProperty("os.name"))) &&
-            !"Thread signal failed".equals(e.getMessage()) && // handle shutdown for in-JVM dtests
-            !"Bad file descriptor".equals(e.getMessage()) &&
-            !"No such file or directory".equals(e.getMessage()))
-            throw e;
+        // see https://bugs.openjdk.java.net/browse/JDK-8050499;
+        // also CASSANDRA-12513
+        if ("Mac OS X".equals(System.getProperty("os.name")))
+        {
+            switch (e.getMessage())
+            {
+                case "Unknown error: 316":
+                case "No such file or directory":
+                case "Bad file descriptor":
+                case "Thread signal failed":
+                    return;
+            }
+        }
+
+        throw e;
     }
 
     public Map<String, Integer> getLargeMessagePendingTasks()
@@ -1178,13 +1281,21 @@
 
     public static IPartitioner globalPartitioner()
     {
-        return DatabaseDescriptor.getPartitioner();
+        return StorageService.instance.getTokenMetadata().partitioner;
+    }
+
+    public static void validatePartitioner(Collection<? extends AbstractBounds<?>> allBounds)
+    {
+        for (AbstractBounds<?> bounds : allBounds)
+            validatePartitioner(bounds);
     }
 
     public static void validatePartitioner(AbstractBounds<?> bounds)
     {
         if (globalPartitioner() != bounds.left.getPartitioner())
-            throw new AssertionError();
+            throw new AssertionError(String.format("Partitioner in bounds serialization. Expected %s, was %s.",
+                                                   globalPartitioner().getClass().getName(),
+                                                   bounds.left.getPartitioner().getClass().getName()));
     }
 
     @VisibleForTesting

diff --git a/src/java/org/apache/cassandra/net/OutboundTcpConnection.java b/src/java/org/apache/cassandra/net/OutboundTcpConnection.java
index e8346d8..8df10b1 100644
--- a/src/java/org/apache/cassandra/net/OutboundTcpConnection.java
+++ b/src/java/org/apache/cassandra/net/OutboundTcpConnection.java

@@ -31,6 +31,7 @@
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.zip.Checksum;
@@ -45,6 +46,7 @@
 import net.jpountz.lz4.LZ4Factory;
 import net.jpountz.xxhash.XXHashFactory;
 
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.io.util.WrappedDataOutputStreamPlus;
@@ -61,6 +63,7 @@
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.util.concurrent.Uninterruptibles;
 
 public class OutboundTcpConnection extends Thread
@@ -115,9 +118,14 @@
         if (coalescingWindow < 0)
             throw new ExceptionInInitializerError(
                     "Value provided for coalescing window must be greather than 0: " + coalescingWindow);
+
+        int otc_backlog_expiration_interval_in_ms = DatabaseDescriptor.getOtcBacklogExpirationInterval();
+        if (otc_backlog_expiration_interval_in_ms != Config.otc_backlog_expiration_interval_ms_default)
+            logger.info("OutboundTcpConnection backlog expiration interval set to to {}ms", otc_backlog_expiration_interval_in_ms);
+
     }
 
-    private static final MessageOut CLOSE_SENTINEL = new MessageOut(MessagingService.Verb.INTERNAL_RESPONSE);
+    private static final MessageOut<?> CLOSE_SENTINEL = new MessageOut<MessagingService.Verb>(MessagingService.Verb.INTERNAL_RESPONSE);
     private volatile boolean isStopped = false;
 
     private static final int OPEN_RETRY_DELAY = 100; // ms between retries
@@ -127,6 +135,11 @@
     static final int LZ4_HASH_SEED = 0x9747b28c;
 
     private final BlockingQueue<QueuedMessage> backlog = new LinkedBlockingQueue<>();
+    private static final String BACKLOG_PURGE_SIZE_PROPERTY = PREFIX + "otc_backlog_purge_size";
+    @VisibleForTesting
+    static final int BACKLOG_PURGE_SIZE = Integer.getInteger(BACKLOG_PURGE_SIZE_PROPERTY, 1024);
+    private final AtomicBoolean backlogExpirationActive = new AtomicBoolean(false);
+    private volatile long backlogNextExpirationTime;
 
     private final OutboundTcpConnectionPool poolReference;
 
@@ -136,13 +149,22 @@
     private volatile long completed;
     private final AtomicLong dropped = new AtomicLong();
     private volatile int currentMsgBufferCount = 0;
-    private int targetVersion = MessagingService.current_version;
+    private volatile int targetVersion;
 
     public OutboundTcpConnection(OutboundTcpConnectionPool pool)
     {
         super("MessagingService-Outgoing-" + pool.endPoint());
         this.poolReference = pool;
         cs = newCoalescingStrategy(pool.endPoint().getHostAddress());
+
+        // We want to use the most precise version we know because while there is version detection on connect(),
+        // the target version might be accessed by the pool (in getConnection()) before we actually connect (as we
+        // connect when the first message is submitted). Note however that the only case where we'll connect
+        // without knowing the true version of a node is if that node is a seed (otherwise, we can't know a node
+        // unless it has been gossiped to us or it has connected to us and in both case this sets the version) and
+        // in that case we won't rely on that targetVersion before we're actually connected and so the version
+        // detection in connect() will do its job.
+        targetVersion = MessagingService.instance().getVersion(pool.endPoint());
     }
 
     private static boolean isLocalDC(InetAddress targetHost)
@@ -154,11 +176,11 @@
 
     public void enqueue(MessageOut<?> message, int id)
     {
-        if (backlog.size() > 1024)
-            expireMessages();
+        long nanoTime = System.nanoTime();
+        expireMessages(nanoTime);
         try
         {
-            backlog.put(new QueuedMessage(message, id));
+            backlog.put(new QueuedMessage(message, id, nanoTime));
         }
         catch (InterruptedException e)
         {
@@ -166,6 +188,18 @@
         }
     }
 
+    /**
+     * This is a helper method for unit testing. Disclaimer: Do not use this method outside unit tests, as
+     * this method is iterating the queue which can be an expensive operation (CPU time, queue locking).
+     * 
+     * @return true, if the queue contains at least one expired element
+     */
+    @VisibleForTesting // (otherwise = VisibleForTesting.NONE)
+    boolean backlogContainsExpiredMessages(long nowNanos)
+    {
+        return backlog.stream().anyMatch(entry -> entry.isTimedOut(nowNanos));
+    }
+
     void closeSocket(boolean destroyThread)
     {
         isStopped = destroyThread; // Exit loop to stop the thread
@@ -204,9 +238,8 @@
                 throw new AssertionError(e);
             }
 
-            currentMsgBufferCount = drainedMessages.size();
+            int count = currentMsgBufferCount = drainedMessages.size();
 
-            int count = drainedMessages.size();
             //The timestamp of the first message has already been provided to the coalescing strategy
             //so skip logging it.
             inner:
@@ -223,14 +256,16 @@
                         continue;
                     }
 
-                    if (qm.isTimedOut())
+                    if (qm.isTimedOut(System.nanoTime()))
                         dropped.incrementAndGet();
                     else if (socket != null || connect())
                         writeConnected(qm, count == 1 && backlog.isEmpty());
                     else
                     {
-                        // clear out the queue, else gossip messages back up.
-                        drainedMessages.clear();
+                        // Not connected! Clear out the queue, else gossip messages back up. Update dropped
+                        // statistics accordingly. Hint: The statistics may be slightly too low, if messages
+                        // are added between the calls of backlog.size() and backlog.clear()
+                        dropped.addAndGet(backlog.size());
                         backlog.clear();
                         currentMsgBufferCount = 0;
                         break inner;
@@ -245,6 +280,8 @@
                 }
                 currentMsgBufferCount = --count;
             }
+            // Update dropped statistics by the number of unprocessed drainedMessages
+            dropped.addAndGet(currentMsgBufferCount);
             drainedMessages.clear();
         }
     }
@@ -334,7 +371,7 @@
         }
     }
 
-    private void writeInternal(MessageOut message, int id, long timestamp) throws IOException
+    private void writeInternal(MessageOut<?> message, int id, long timestamp) throws IOException
     {
         out.writeInt(MessagingService.PROTOCOL_MAGIC);
 
@@ -502,13 +539,13 @@
             catch (SSLHandshakeException e)
             {
                 logger.error("SSL handshake error for outbound connection to " + socket, e);
-                socket = null;
+                disconnect();
                 // SSL errors won't be recoverable within timeout period so we'll just abort
                 return false;
             }
             catch (IOException e)
             {
-                socket = null;
+                disconnect();
                 if (logger.isTraceEnabled())
                     logger.trace("unable to connect to " + poolReference.endPoint(), e);
                 Uninterruptibles.sleepUninterruptibly(OPEN_RETRY_DELAY, TimeUnit.MILLISECONDS);
@@ -521,31 +558,27 @@
     {
         final AtomicInteger version = new AtomicInteger(NO_VERSION);
         final CountDownLatch versionLatch = new CountDownLatch(1);
-        new Thread("HANDSHAKE-" + poolReference.endPoint())
+        new Thread(NamedThreadFactory.threadLocalDeallocator(() ->
         {
-            @Override
-            public void run()
+            try
             {
-                try
-                {
-                    logger.info("Handshaking version with {}", poolReference.endPoint());
-                    version.set(inputStream.readInt());
-                }
-                catch (IOException ex)
-                {
-                    final String msg = "Cannot handshake version with " + poolReference.endPoint();
-                    if (logger.isTraceEnabled())
-                        logger.trace(msg, ex);
-                    else
-                        logger.info(msg);
-                }
-                finally
-                {
-                    //unblock the waiting thread on either success or fail
-                    versionLatch.countDown();
-                }
+                logger.info("Handshaking version with {}", poolReference.endPoint());
+                version.set(inputStream.readInt());
             }
-        }.start();
+            catch (IOException ex)
+            {
+                final String msg = "Cannot handshake version with " + poolReference.endPoint();
+                if (logger.isTraceEnabled())
+                    logger.trace(msg, ex);
+                else
+                    logger.info(msg);
+            }
+            finally
+            {
+                //unblock the waiting thread on either success or fail
+                versionLatch.countDown();
+            }
+        }),"HANDSHAKE-" + poolReference.endPoint()).start();
 
         try
         {
@@ -558,18 +591,53 @@
         return version.get();
     }
 
-    private void expireMessages()
+    /**
+     * Expire elements from the queue if the queue is pretty full and expiration is not already in progress.
+     * This method will only remove droppable expired entries. If no such element exists, nothing is removed from the queue.
+     * 
+     * @param timestampNanos The current time as from System.nanoTime()
+     */
+    @VisibleForTesting
+    void expireMessages(long timestampNanos)
     {
-        Iterator<QueuedMessage> iter = backlog.iterator();
-        while (iter.hasNext())
+        if (backlog.size() <= BACKLOG_PURGE_SIZE)
+            return; // Plenty of space
+
+        if (backlogNextExpirationTime - timestampNanos > 0)
+            return; // Expiration is not due.
+
+        /**
+         * Expiration is an expensive process. Iterating the queue locks the queue for both writes and
+         * reads during iter.next() and iter.remove(). Thus letting only a single Thread do expiration.
+         */
+        if (backlogExpirationActive.compareAndSet(false, true))
         {
-            QueuedMessage qm = iter.next();
-            if (!qm.droppable)
-                continue;
-            if (!qm.isTimedOut())
-                return;
-            iter.remove();
-            dropped.incrementAndGet();
+            try
+            {
+                Iterator<QueuedMessage> iter = backlog.iterator();
+                while (iter.hasNext())
+                {
+                    QueuedMessage qm = iter.next();
+                    if (!qm.droppable)
+                        continue;
+                    if (!qm.isTimedOut(timestampNanos))
+                        continue;
+                    iter.remove();
+                    dropped.incrementAndGet();
+                }
+
+                if (logger.isTraceEnabled())
+                {
+                    long duration = TimeUnit.NANOSECONDS.toMicros(System.nanoTime() - timestampNanos);
+                    logger.trace("Expiration of {} took {}μs", getName(), duration);
+                }
+            }
+            finally
+            {
+                long backlogExpirationIntervalNanos = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getOtcBacklogExpirationInterval());
+                backlogNextExpirationTime = timestampNanos + backlogExpirationIntervalNanos;
+                backlogExpirationActive.set(false);
+            }
         }
     }
 
@@ -581,18 +649,19 @@
         final long timestampNanos;
         final boolean droppable;
 
-        QueuedMessage(MessageOut<?> message, int id)
+        QueuedMessage(MessageOut<?> message, int id, long timestampNanos)
         {
             this.message = message;
             this.id = id;
-            this.timestampNanos = System.nanoTime();
+            this.timestampNanos = timestampNanos;
             this.droppable = MessagingService.DROPPABLE_VERBS.contains(message.verb);
         }
 
         /** don't drop a non-droppable message just because it's timestamp is expired */
-        boolean isTimedOut()
+        boolean isTimedOut(long nowNanos)
         {
-            return droppable && timestampNanos < System.nanoTime() - TimeUnit.MILLISECONDS.toNanos(message.getTimeout());
+            long messageTimeoutNanos = TimeUnit.MILLISECONDS.toNanos(message.getTimeout());
+            return droppable && nowNanos - timestampNanos  > messageTimeoutNanos;
         }
 
         boolean shouldRetry()
@@ -610,7 +679,7 @@
     {
         RetriedQueuedMessage(QueuedMessage msg)
         {
-            super(msg.message, msg.id);
+            super(msg.message, msg.id, msg.timestampNanos);
         }
 
         boolean shouldRetry()

diff --git a/src/java/org/apache/cassandra/net/PingMessage.java b/src/java/org/apache/cassandra/net/PingMessage.java
new file mode 100644
index 0000000..8eaf23e
--- /dev/null
+++ b/src/java/org/apache/cassandra/net/PingMessage.java

@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.net;
+
+import java.io.IOException;
+
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * A backport of the version from 4.0, intentionnaly added as versions 4.0 or greater aren't guaranteed
+ * to know the c* versions they communicate with before they connect.
+ *
+ * It is intentional that no {@link IVerbHandler} is provided as we do not want process the message;
+ * the intent is to not break the stream by leaving it in an unclean state, with unconsumed bytes.
+ * We do, however, assign a {@link org.apache.cassandra.concurrent.StageManager} stage
+ * to maintain proper message flow.
+ * See CASSANDRA-13393 for a discussion.
+ */
+public class PingMessage
+{
+    public static IVersionedSerializer<PingMessage> serializer = new PingMessageSerializer();
+
+    public static class PingMessageSerializer implements IVersionedSerializer<PingMessage>
+    {
+        public void serialize(PingMessage t, DataOutputPlus out, int version)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public PingMessage deserialize(DataInputPlus in, int version) throws IOException
+        {
+            // throw away the one byte of the payload
+            in.readByte();
+            return new PingMessage();
+        }
+
+        public long serializedSize(PingMessage t, int version)
+        {
+            return 1;
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/net/ResourceLimits.java b/src/java/org/apache/cassandra/net/ResourceLimits.java
new file mode 100644
index 0000000..f8d24d7
--- /dev/null
+++ b/src/java/org/apache/cassandra/net/ResourceLimits.java

@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+
+public abstract class ResourceLimits
+{
+    /**
+     * Represents permits to utilise a resource and ways to allocate and release them.
+     *
+     * Two implementations are currently provided:
+     * 1. {@link Concurrent}, for shared limits, which is thread-safe;
+     * 2. {@link Basic}, for limits that are not shared between threads, is not thread-safe.
+     */
+    public interface Limit
+    {
+        /**
+         * @return total amount of permits represented by this {@link Limit} - the capacity
+         */
+        long limit();
+
+        /**
+         * @return remaining, unallocated permit amount
+         */
+        long remaining();
+
+        /**
+         * @return amount of permits currently in use
+         */
+        long using();
+
+        /**
+         * Attempts to allocate an amount of permits from this limit. If allocated, <em>MUST</em> eventually
+         * be released back with {@link #release(long)}.
+         *
+         * @return {@code true} if the allocation was successful, {@code false} otherwise
+         */
+        boolean tryAllocate(long amount);
+
+        /**
+         * Allocates an amount independent of permits available from this limit. <em>MUST</em> eventually
+         * be released back with {@link #release(long)}.
+         *
+         */
+        void allocate(long amount);
+
+        /**
+         * @param amount return the amount of permits back to this limit
+         * @return {@code ABOVE_LIMIT} if there aren't enough permits available even after the release, or
+         *         {@code BELOW_LIMIT} if there are enough permits available after the releaese.
+         */
+        Outcome release(long amount);
+    }
+
+    /**
+     * A thread-safe permit container.
+     */
+    public static class Concurrent implements Limit
+    {
+        private final long limit;
+
+        private volatile long using;
+        private static final AtomicLongFieldUpdater<Concurrent> usingUpdater =
+            AtomicLongFieldUpdater.newUpdater(Concurrent.class, "using");
+
+        public Concurrent(long limit)
+        {
+            this.limit = limit;
+        }
+
+        public long limit()
+        {
+            return limit;
+        }
+
+        public long remaining()
+        {
+            return limit - using;
+        }
+
+        public long using()
+        {
+            return using;
+        }
+
+        public boolean tryAllocate(long amount)
+        {
+            long current, next;
+            do
+            {
+                current = using;
+                next = current + amount;
+
+                if (next > limit)
+                    return false;
+            }
+            while (!usingUpdater.compareAndSet(this, current, next));
+
+            return true;
+        }
+
+        public void allocate(long amount)
+        {
+            long current, next;
+            do
+            {
+                current = using;
+                next = current + amount;
+            } while (!usingUpdater.compareAndSet(this, current, next));
+        }
+
+        public Outcome release(long amount)
+        {
+            assert amount >= 0;
+            long using = usingUpdater.addAndGet(this, -amount);
+            assert using >= 0;
+            return using >= limit ? Outcome.ABOVE_LIMIT : Outcome.BELOW_LIMIT;
+        }
+    }
+
+    /**
+     * A cheaper, thread-unsafe permit container to be used for unshared limits.
+     */
+    static class Basic implements Limit
+    {
+        private final long limit;
+        private long using;
+
+        Basic(long limit)
+        {
+            this.limit = limit;
+        }
+
+        public long limit()
+        {
+            return limit;
+        }
+
+        public long remaining()
+        {
+            return limit - using;
+        }
+
+        public long using()
+        {
+            return using;
+        }
+
+        public boolean tryAllocate(long amount)
+        {
+            if (using + amount > limit)
+                return false;
+
+            using += amount;
+            return true;
+        }
+
+        public void allocate(long amount)
+        {
+            using += amount;
+        }
+
+        public Outcome release(long amount)
+        {
+            assert amount >= 0 && amount <= using;
+            using -= amount;
+            return using >= limit ? Outcome.ABOVE_LIMIT : Outcome.BELOW_LIMIT;
+        }
+    }
+
+    /**
+     * A convenience class that groups a per-endpoint limit with the global one
+     * to allow allocating/releasing permits from/to both limits as one logical operation.
+     */
+    public static class EndpointAndGlobal
+    {
+        final Limit endpoint;
+        final Limit global;
+
+        public EndpointAndGlobal(Limit endpoint, Limit global)
+        {
+            this.endpoint = endpoint;
+            this.global = global;
+        }
+
+        public Limit endpoint()
+        {
+            return endpoint;
+        }
+
+        public Limit global()
+        {
+            return global;
+        }
+
+        /**
+         * @return {@code INSUFFICIENT_GLOBAL} if there weren't enough permits in the global limit, or
+         *         {@code INSUFFICIENT_ENDPOINT} if there weren't enough permits in the per-endpoint limit, or
+         *         {@code SUCCESS} if there were enough permits to take from both.
+         */
+        public Outcome tryAllocate(long amount)
+        {
+            if (!global.tryAllocate(amount))
+                return Outcome.INSUFFICIENT_GLOBAL;
+
+            if (endpoint.tryAllocate(amount))
+                return Outcome.SUCCESS;
+
+            global.release(amount);
+            return Outcome.INSUFFICIENT_ENDPOINT;
+        }
+
+        public void allocate(long amount)
+        {
+            global.allocate(amount);
+            endpoint.allocate(amount);
+        }
+
+        public Outcome release(long amount)
+        {
+            Outcome endpointReleaseOutcome = endpoint.release(amount);
+            Outcome globalReleaseOutcome = global.release(amount);
+            return (endpointReleaseOutcome == Outcome.ABOVE_LIMIT || globalReleaseOutcome == Outcome.ABOVE_LIMIT)
+                   ? Outcome.ABOVE_LIMIT : Outcome.BELOW_LIMIT;
+        }
+    }
+
+    public enum Outcome { SUCCESS, INSUFFICIENT_ENDPOINT, INSUFFICIENT_GLOBAL, BELOW_LIMIT, ABOVE_LIMIT }
+}

diff --git a/src/java/org/apache/cassandra/net/WriteCallbackInfo.java b/src/java/org/apache/cassandra/net/WriteCallbackInfo.java
index c1fb98d..9ecc385 100644
--- a/src/java/org/apache/cassandra/net/WriteCallbackInfo.java
+++ b/src/java/org/apache/cassandra/net/WriteCallbackInfo.java

@@ -29,9 +29,8 @@
 
 public class WriteCallbackInfo extends CallbackInfo
 {
-    private final MessageOut sentMessage;
-    private final ConsistencyLevel consistencyLevel;
-    private final boolean allowHints;
+    // either a Mutation, or a Paxos Commit (MessageOut)
+    private final Object mutation;
 
     public WriteCallbackInfo(InetAddress target,
                              IAsyncCallback callback,
@@ -42,25 +41,34 @@
     {
         super(target, callback, serializer, true);
         assert message != null;
-        this.sentMessage = message;
-        this.consistencyLevel = consistencyLevel;
-        this.allowHints = allowHints;
+        this.mutation = shouldHint(allowHints, message, consistencyLevel);
         //Local writes shouldn't go through messaging service (https://issues.apache.org/jira/browse/CASSANDRA-10477)
         assert (!target.equals(FBUtilities.getBroadcastAddress()));
     }
 
-    Mutation mutation()
-    {
-        return sentMessage.verb == MessagingService.Verb.PAXOS_COMMIT
-             ? ((Commit) sentMessage.payload).makeMutation()
-             : (Mutation) sentMessage.payload;
-    }
-
     public boolean shouldHint()
     {
-        return allowHints
-            && sentMessage.verb != MessagingService.Verb.COUNTER_MUTATION
-            && consistencyLevel != ConsistencyLevel.ANY
-            && StorageProxy.shouldHint(target);
+        return mutation != null && StorageProxy.shouldHint(target);
     }
+
+    public Mutation mutation()
+    {
+        return getMutation(mutation);
+    }
+
+    private static Mutation getMutation(Object object)
+    {
+        assert object instanceof Commit || object instanceof Mutation : object;
+        return object instanceof Commit ? ((Commit) object).makeMutation()
+                                        : (Mutation) object;
+    }
+
+    private static Object shouldHint(boolean allowHints, MessageOut sentMessage, ConsistencyLevel consistencyLevel)
+    {
+        return allowHints
+               && sentMessage.verb != MessagingService.Verb.COUNTER_MUTATION
+               && consistencyLevel != ConsistencyLevel.ANY
+               ? sentMessage.payload : null;
+    }
+
 }

diff --git a/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java b/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java
index 15230ea..56d6130 100644
--- a/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java
+++ b/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java

@@ -21,8 +21,8 @@
 
 public class SSTableAddedNotification implements INotification
 {
-    public final SSTableReader added;
-    public SSTableAddedNotification(SSTableReader added)
+    public final Iterable<SSTableReader> added;
+    public SSTableAddedNotification(Iterable<SSTableReader> added)
     {
         this.added = added;
     }

diff --git a/src/java/org/apache/cassandra/repair/LocalSyncTask.java b/src/java/org/apache/cassandra/repair/LocalSyncTask.java
index daace01..5d43868 100644
--- a/src/java/org/apache/cassandra/repair/LocalSyncTask.java
+++ b/src/java/org/apache/cassandra/repair/LocalSyncTask.java

@@ -47,9 +47,9 @@
 
     private final long repairedAt;
 
-    public LocalSyncTask(RepairJobDesc desc, TreeResponse r1, TreeResponse r2, long repairedAt)
+    public LocalSyncTask(RepairJobDesc desc, InetAddress firstEndpoint, InetAddress secondEndpoint, List<Range<Token>> rangesToSync, long repairedAt)
     {
-        super(desc, r1, r2);
+        super(desc, firstEndpoint, secondEndpoint, rangesToSync);
         this.repairedAt = repairedAt;
     }
 
@@ -61,7 +61,7 @@
     {
         InetAddress local = FBUtilities.getBroadcastAddress();
         // We can take anyone of the node as source or destination, however if one is localhost, we put at source to avoid a forwarding
-        InetAddress dst = r2.endpoint.equals(local) ? r1.endpoint : r2.endpoint;
+        InetAddress dst = secondEndpoint.equals(local) ? firstEndpoint : secondEndpoint;
         InetAddress preferred = SystemKeyspace.getPreferredIP(dst);
 
         String message = String.format("Performing streaming repair of %d ranges with %s", differences.size(), dst);
@@ -110,7 +110,7 @@
 
     public void onSuccess(StreamState result)
     {
-        String message = String.format("Sync complete using session %s between %s and %s on %s", desc.sessionId, r1.endpoint, r2.endpoint, desc.columnFamily);
+        String message = String.format("Sync complete using session %s between %s and %s on %s", desc.sessionId, firstEndpoint, secondEndpoint, desc.columnFamily);
         logger.info("[repair #{}] {}", desc.sessionId, message);
         Tracing.traceRepair(message);
         set(stat);

diff --git a/src/java/org/apache/cassandra/repair/NodePair.java b/src/java/org/apache/cassandra/repair/NodePair.java
index bb6be04..a73c61a 100644
--- a/src/java/org/apache/cassandra/repair/NodePair.java
+++ b/src/java/org/apache/cassandra/repair/NodePair.java

@@ -17,13 +17,13 @@
  */
 package org.apache.cassandra.repair;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 
 import com.google.common.base.Objects;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 
@@ -69,7 +69,7 @@
             CompactEndpointSerializationHelper.serialize(nodePair.endpoint2, out);
         }
 
-        public NodePair deserialize(DataInput in, int version) throws IOException
+        public NodePair deserialize(DataInputPlus in, int version) throws IOException
         {
             InetAddress ep1 = CompactEndpointSerializationHelper.deserialize(in);
             InetAddress ep2 = CompactEndpointSerializationHelper.deserialize(in);

diff --git a/src/java/org/apache/cassandra/repair/RemoteSyncTask.java b/src/java/org/apache/cassandra/repair/RemoteSyncTask.java
index ededc40..5af815a 100644
--- a/src/java/org/apache/cassandra/repair/RemoteSyncTask.java
+++ b/src/java/org/apache/cassandra/repair/RemoteSyncTask.java

@@ -41,15 +41,15 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(RemoteSyncTask.class);
 
-    public RemoteSyncTask(RepairJobDesc desc, TreeResponse r1, TreeResponse r2)
+    public RemoteSyncTask(RepairJobDesc desc, InetAddress firstEndpoint, InetAddress secondEndpoint, List<Range<Token>> rangesToSync)
     {
-        super(desc, r1, r2);
+        super(desc, firstEndpoint, secondEndpoint, rangesToSync);
     }
 
     protected void startSync(List<Range<Token>> differences)
     {
         InetAddress local = FBUtilities.getBroadcastAddress();
-        SyncRequest request = new SyncRequest(desc, local, r1.endpoint, r2.endpoint, differences);
+        SyncRequest request = new SyncRequest(desc, local, firstEndpoint, secondEndpoint, differences);
         String message = String.format("Forwarding streaming repair of %d ranges to %s (to be streamed with %s)", request.ranges.size(), request.src, request.dst);
         logger.info("[repair #{}] {}", desc.sessionId, message);
         Tracing.traceRepair(message);
@@ -64,7 +64,7 @@
         }
         else
         {
-            setException(new RepairException(desc, String.format("Sync failed between %s and %s", r1.endpoint, r2.endpoint)));
+            setException(new RepairException(desc, String.format("Sync failed between %s and %s", firstEndpoint, secondEndpoint)));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/RepairJob.java b/src/java/org/apache/cassandra/repair/RepairJob.java
index a92233b..5443bf8 100644
--- a/src/java/org/apache/cassandra/repair/RepairJob.java
+++ b/src/java/org/apache/cassandra/repair/RepairJob.java

@@ -20,14 +20,18 @@
 import java.net.InetAddress;
 import java.util.*;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.util.concurrent.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MerkleTrees;
 import org.apache.cassandra.utils.Pair;
 
 /**
@@ -48,21 +52,14 @@
      *
      * @param session RepairSession that this RepairJob belongs
      * @param columnFamily name of the ColumnFamily to repair
-     * @param parallelismDegree how to run repair job in parallel
-     * @param repairedAt when the repair occurred (millis)
-     * @param taskExecutor Executor to run various repair tasks
      */
-    public RepairJob(RepairSession session,
-                     String columnFamily,
-                     RepairParallelism parallelismDegree,
-                     long repairedAt,
-                     ListeningExecutorService taskExecutor)
+    public RepairJob(RepairSession session, String columnFamily)
     {
         this.session = session;
-        this.desc = new RepairJobDesc(session.parentRepairSession, session.getId(), session.keyspace, columnFamily, session.getRange());
-        this.repairedAt = repairedAt;
-        this.taskExecutor = taskExecutor;
-        this.parallelismDegree = parallelismDegree;
+        this.desc = new RepairJobDesc(session.parentRepairSession, session.getId(), session.keyspace, columnFamily, session.getRanges());
+        this.repairedAt = session.repairedAt;
+        this.taskExecutor = session.taskExecutor;
+        this.parallelismDegree = session.parallelismDegree;
     }
 
     /**
@@ -110,35 +107,9 @@
         // When all validations complete, submit sync tasks
         ListenableFuture<List<SyncStat>> syncResults = Futures.transform(validations, new AsyncFunction<List<TreeResponse>, List<SyncStat>>()
         {
-            public ListenableFuture<List<SyncStat>> apply(List<TreeResponse> trees) throws Exception
+            public ListenableFuture<List<SyncStat>> apply(List<TreeResponse> trees)
             {
-                InetAddress local = FBUtilities.getLocalAddress();
-
-                List<SyncTask> syncTasks = new ArrayList<>();
-                // We need to difference all trees one against another
-                for (int i = 0; i < trees.size() - 1; ++i)
-                {
-                    TreeResponse r1 = trees.get(i);
-                    for (int j = i + 1; j < trees.size(); ++j)
-                    {
-                        TreeResponse r2 = trees.get(j);
-                        SyncTask task;
-                        if (r1.endpoint.equals(local) || r2.endpoint.equals(local))
-                        {
-                            task = new LocalSyncTask(desc, r1, r2, repairedAt);
-                        }
-                        else
-                        {
-                            task = new RemoteSyncTask(desc, r1, r2);
-                            // RemoteSyncTask expects SyncComplete message sent back.
-                            // Register task to RepairSession to receive response.
-                            session.waitForSync(Pair.create(desc, new NodePair(r1.endpoint, r2.endpoint)), (RemoteSyncTask) task);
-                        }
-                        syncTasks.add(task);
-                        taskExecutor.submit(task);
-                    }
-                }
-                return Futures.allAsList(syncTasks);
+                return Futures.allAsList(createSyncTasks(trees, FBUtilities.getLocalAddress()));
             }
         }, taskExecutor);
 
@@ -167,6 +138,39 @@
         Futures.getUnchecked(validations);
     }
 
+    @VisibleForTesting
+    List<SyncTask> createSyncTasks(List<TreeResponse> trees, InetAddress local)
+    {
+        List<SyncTask> syncTasks = new ArrayList<>();
+        // We need to difference all trees one against another
+        for (int i = 0; i < trees.size() - 1; ++i)
+        {
+            TreeResponse r1 = trees.get(i);
+            for (int j = i + 1; j < trees.size(); ++j)
+            {
+                TreeResponse r2 = trees.get(j);
+                SyncTask task;
+
+                List<Range<Token>> differences = MerkleTrees.difference(r1.trees, r2.trees);
+
+                if (r1.endpoint.equals(local) || r2.endpoint.equals(local))
+                {
+                    task = new LocalSyncTask(desc, r1.endpoint, r2.endpoint, differences, repairedAt);
+                }
+                else
+                {
+                    task = new RemoteSyncTask(desc, r1.endpoint, r2.endpoint, differences);
+                    // RemoteSyncTask expects SyncComplete message sent back.
+                    // Register task to RepairSession to receive response.
+                    session.waitForSync(Pair.create(desc, new NodePair(r1.endpoint, r2.endpoint)), (RemoteSyncTask) task);
+                }
+                syncTasks.add(task);
+                taskExecutor.submit(task);
+            }
+        }
+        return syncTasks;
+    }
+
     /**
      * Creates {@link ValidationTask} and submit them to task executor in parallel.
      *
@@ -178,7 +182,7 @@
         String message = String.format("Requesting merkle trees for %s (to %s)", desc.columnFamily, endpoints);
         logger.info("[repair #{}] {}", desc.sessionId, message);
         Tracing.traceRepair(message);
-        int gcBefore = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily).gcBefore(System.currentTimeMillis());
+        int gcBefore = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily).gcBefore(FBUtilities.nowInSeconds());
         List<ListenableFuture<TreeResponse>> tasks = new ArrayList<>(endpoints.size());
         for (InetAddress endpoint : endpoints)
         {
@@ -198,7 +202,7 @@
         String message = String.format("Requesting merkle trees for %s (to %s)", desc.columnFamily, endpoints);
         logger.info("[repair #{}] {}", desc.sessionId, message);
         Tracing.traceRepair(message);
-        int gcBefore = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily).gcBefore(System.currentTimeMillis());
+        int gcBefore = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily).gcBefore(FBUtilities.nowInSeconds());
         List<ListenableFuture<TreeResponse>> tasks = new ArrayList<>(endpoints.size());
 
         Queue<InetAddress> requests = new LinkedList<>(endpoints);
@@ -240,7 +244,7 @@
         String message = String.format("Requesting merkle trees for %s (to %s)", desc.columnFamily, endpoints);
         logger.info("[repair #{}] {}", desc.sessionId, message);
         Tracing.traceRepair(message);
-        int gcBefore = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily).gcBefore(System.currentTimeMillis());
+        int gcBefore = Keyspace.open(desc.keyspace).getColumnFamilyStore(desc.columnFamily).gcBefore(FBUtilities.nowInSeconds());
         List<ListenableFuture<TreeResponse>> tasks = new ArrayList<>(endpoints.size());
 
         Map<String, Queue<InetAddress>> requestsByDatacenter = new HashMap<>();

diff --git a/src/java/org/apache/cassandra/repair/RepairJobDesc.java b/src/java/org/apache/cassandra/repair/RepairJobDesc.java
index 8382136..05adbf9 100644
--- a/src/java/org/apache/cassandra/repair/RepairJobDesc.java
+++ b/src/java/org/apache/cassandra/repair/RepairJobDesc.java

@@ -17,8 +17,9 @@
  */
 package org.apache.cassandra.repair;
 
-import java.io.DataInput;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.UUID;
 
 import com.google.common.base.Objects;
@@ -28,6 +29,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -47,21 +49,21 @@
     public final String keyspace;
     public final String columnFamily;
     /** repairing range  */
-    public final Range<Token> range;
+    public final Collection<Range<Token>> ranges;
 
-    public RepairJobDesc(UUID parentSessionId, UUID sessionId, String keyspace, String columnFamily, Range<Token> range)
+    public RepairJobDesc(UUID parentSessionId, UUID sessionId, String keyspace, String columnFamily, Collection<Range<Token>> ranges)
     {
         this.parentSessionId = parentSessionId;
         this.sessionId = sessionId;
         this.keyspace = keyspace;
         this.columnFamily = columnFamily;
-        this.range = range;
+        this.ranges = ranges;
     }
 
     @Override
     public String toString()
     {
-        return "[repair #" + sessionId + " on " + keyspace + "/" + columnFamily + ", " + range + "]";
+        return "[repair #" + sessionId + " on " + keyspace + "/" + columnFamily + ", " + ranges + "]";
     }
 
     @Override
@@ -74,7 +76,7 @@
 
         if (!columnFamily.equals(that.columnFamily)) return false;
         if (!keyspace.equals(that.keyspace)) return false;
-        if (range != null ? !range.equals(that.range) : that.range != null) return false;
+        if (ranges != null ? that.ranges == null || (ranges.size() != that.ranges.size()) || (ranges.size() == that.ranges.size() && !ranges.containsAll(that.ranges)) : that.ranges != null) return false;
         if (!sessionId.equals(that.sessionId)) return false;
         if (parentSessionId != null ? !parentSessionId.equals(that.parentSessionId) : that.parentSessionId != null) return false;
 
@@ -84,7 +86,7 @@
     @Override
     public int hashCode()
     {
-        return Objects.hashCode(sessionId, keyspace, columnFamily, range);
+        return Objects.hashCode(sessionId, keyspace, columnFamily, ranges);
     }
 
     private static class RepairJobDescSerializer implements IVersionedSerializer<RepairJobDesc>
@@ -100,11 +102,13 @@
             UUIDSerializer.serializer.serialize(desc.sessionId, out, version);
             out.writeUTF(desc.keyspace);
             out.writeUTF(desc.columnFamily);
-            MessagingService.validatePartitioner(desc.range);
-            AbstractBounds.tokenSerializer.serialize(desc.range, out, version);
+            MessagingService.validatePartitioner(desc.ranges);
+            out.writeInt(desc.ranges.size());
+            for (Range<Token> rt : desc.ranges)
+                AbstractBounds.tokenSerializer.serialize(rt, out, version);
         }
 
-        public RepairJobDesc deserialize(DataInput in, int version) throws IOException
+        public RepairJobDesc deserialize(DataInputPlus in, int version) throws IOException
         {
             UUID parentSessionId = null;
             if (version >= MessagingService.VERSION_21)
@@ -115,8 +119,19 @@
             UUID sessionId = UUIDSerializer.serializer.deserialize(in, version);
             String keyspace = in.readUTF();
             String columnFamily = in.readUTF();
-            Range<Token> range = (Range<Token>)AbstractBounds.tokenSerializer.deserialize(in, MessagingService.globalPartitioner(), version);
-            return new RepairJobDesc(parentSessionId, sessionId, keyspace, columnFamily, range);
+
+            int nRanges = in.readInt();
+            Collection<Range<Token>> ranges = new ArrayList<>();
+            Range<Token> range;
+
+            for (int i = 0; i < nRanges; i++)
+            {
+                range = (Range<Token>) AbstractBounds.tokenSerializer.deserialize(in,
+                        MessagingService.globalPartitioner(), version);
+                ranges.add(range);
+            }
+
+            return new RepairJobDesc(parentSessionId, sessionId, keyspace, columnFamily, ranges);
         }
 
         public long serializedSize(RepairJobDesc desc, int version)
@@ -124,14 +139,18 @@
             int size = 0;
             if (version >= MessagingService.VERSION_21)
             {
-                size += TypeSizes.NATIVE.sizeof(desc.parentSessionId != null);
+                size += TypeSizes.sizeof(desc.parentSessionId != null);
                 if (desc.parentSessionId != null)
                     size += UUIDSerializer.serializer.serializedSize(desc.parentSessionId, version);
             }
             size += UUIDSerializer.serializer.serializedSize(desc.sessionId, version);
-            size += TypeSizes.NATIVE.sizeof(desc.keyspace);
-            size += TypeSizes.NATIVE.sizeof(desc.columnFamily);
-            size += AbstractBounds.tokenSerializer.serializedSize(desc.range, version);
+            size += TypeSizes.sizeof(desc.keyspace);
+            size += TypeSizes.sizeof(desc.columnFamily);
+            size += TypeSizes.sizeof(desc.ranges.size());
+            for (Range<Token> rt : desc.ranges)
+            {
+                size += AbstractBounds.tokenSerializer.serializedSize(rt, version);
+            }
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java
index 1701e9a..edcb4f9 100644
--- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java
+++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java

@@ -28,10 +28,8 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -41,7 +39,6 @@
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.messages.*;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.CassandraVersion;
 
 /**
  * Handles all repair related message.
@@ -59,7 +56,6 @@
         {
             switch (message.payload.messageType)
             {
-                case PREPARE_GLOBAL_MESSAGE:
                 case PREPARE_MESSAGE:
                     PrepareMessage prepareMessage = (PrepareMessage) message.payload;
                     logger.debug("Preparing, {}", prepareMessage);
@@ -75,18 +71,13 @@
                         }
                         columnFamilyStores.add(columnFamilyStore);
                     }
-                    CassandraVersion peerVersion = SystemKeyspace.getReleaseVersion(message.from);
-                    // note that we default isGlobal to true since old version always default to true:
-                    boolean isGlobal = peerVersion == null ||
-                                       peerVersion.compareTo(ActiveRepairService.SUPPORTS_GLOBAL_PREPARE_FLAG_VERSION) < 0 ||
-                                       message.payload.messageType.equals(RepairMessage.Type.PREPARE_GLOBAL_MESSAGE);
-                    logger.debug("Received prepare message: global message = {}, peerVersion = {},", message.payload.messageType.equals(RepairMessage.Type.PREPARE_GLOBAL_MESSAGE), peerVersion);
                     ActiveRepairService.instance.registerParentRepairSession(prepareMessage.parentRepairSession,
                                                                              message.from,
                                                                              columnFamilyStores,
                                                                              prepareMessage.ranges,
                                                                              prepareMessage.isIncremental,
-                                                                             isGlobal);
+                                                                             prepareMessage.timestamp,
+                                                                             prepareMessage.isGlobal);
                     MessagingService.instance().sendReply(new MessageOut(MessagingService.Verb.INTERNAL_RESPONSE), id, message.from);
                     break;
 
@@ -106,14 +97,13 @@
                     }
                     else
                     {
-                        final Range<Token> repairingRange = desc.range;
                         cfs.snapshot(desc.sessionId.toString(), new Predicate<SSTableReader>()
                         {
                             public boolean apply(SSTableReader sstable)
                             {
                                 return sstable != null &&
-                                       !(sstable.partitioner instanceof LocalPartitioner) && // exclude SSTables from 2i
-                                       new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(Collections.singleton(repairingRange));
+                                       !sstable.metadata.isIndex() && // exclude SSTables from 2i
+                                       new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(desc.ranges);
                             }
                         }, true); //ephemeral snapshot, if repair fails, it will be cleaned next startup
                     }

diff --git a/src/java/org/apache/cassandra/repair/RepairRunnable.java b/src/java/org/apache/cassandra/repair/RepairRunnable.java
index 0f2e839..774409f 100644
--- a/src/java/org/apache/cassandra/repair/RepairRunnable.java
+++ b/src/java/org/apache/cassandra/repair/RepairRunnable.java

@@ -45,6 +45,7 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.repair.messages.RepairOption;
 import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.tracing.TraceKeyspace;
@@ -53,6 +54,7 @@
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.WrappedRunnable;
 import org.apache.cassandra.utils.progress.ProgressEvent;
@@ -101,7 +103,7 @@
 
     protected void fireErrorAndComplete(String tag, int progressCount, int totalProgress, String message)
     {
-        fireProgressEvent(tag, new ProgressEvent(ProgressEventType.ERROR, progressCount, totalProgress, message));
+        fireProgressEvent(tag, new ProgressEvent(ProgressEventType.ERROR, progressCount, totalProgress, String.format("Repair command #%d failed with error %s", cmd, message)));
         fireProgressEvent(tag, new ProgressEvent(ProgressEventType.COMPLETE, progressCount, totalProgress, String.format("Repair command #%d finished with error", cmd)));
     }
 
@@ -156,7 +158,7 @@
         }
 
         final Set<InetAddress> allNeighbors = new HashSet<>();
-        Map<Range, Set<InetAddress>> rangeToNeighbors = new HashMap<>();
+        List<Pair<Set<InetAddress>, ? extends Collection<Range<Token>>>> commonRanges = new ArrayList<>();
 
         //pre-calculate output of getLocalRanges and pass it to getNeighbors to increase performance and prevent
         //calculation multiple times
@@ -166,12 +168,14 @@
         {
             for (Range<Token> range : options.getRanges())
             {
-                    Set<InetAddress> neighbors = ActiveRepairService.getNeighbors(keyspace, keyspaceLocalRanges,
-                                                                                  range, options.getDataCenters(),
-                                                                                  options.getHosts());
-                    rangeToNeighbors.put(range, neighbors);
-                    allNeighbors.addAll(neighbors);
+                Set<InetAddress> neighbors = ActiveRepairService.getNeighbors(keyspace, keyspaceLocalRanges, range,
+                                                                              options.getDataCenters(),
+                                                                              options.getHosts());
+
+                addRangeToNeighbors(commonRanges, range, neighbors);
+                allNeighbors.addAll(neighbors);
             }
+
             progress.incrementAndGet();
         }
         catch (IllegalArgumentException e)
@@ -225,13 +229,13 @@
                                                                                                                          "internal"));
 
         List<ListenableFuture<RepairSessionResult>> futures = new ArrayList<>(options.getRanges().size());
-        for (Range<Token> range : options.getRanges())
+        for (Pair<Set<InetAddress>, ? extends Collection<Range<Token>>> p : commonRanges)
         {
             final RepairSession session = ActiveRepairService.instance.submitRepairSession(parentSession,
-                                                              range,
+                                                              p.right,
                                                               keyspace,
                                                               options.getParallelism(),
-                                                              rangeToNeighbors.get(range),
+                                                              p.left,
                                                               repairedAt,
                                                               executor,
                                                               cfnames);
@@ -248,7 +252,7 @@
                      * for backward-compatibility support.
                      */
                     String message = String.format("Repair session %s for range %s finished", session.getId(),
-                                                   session.getRange().toString());
+                                                   session.getRanges().toString());
                     logger.info(message);
                     fireProgressEvent(tag, new ProgressEvent(ProgressEventType.PROGRESS,
                                                              progress.incrementAndGet(),
@@ -264,7 +268,7 @@
                      * for backward-compatibility support.
                      */
                     String message = String.format("Repair session %s for range %s failed with error %s",
-                                                   session.getId(), session.getRange().toString(), t.getMessage());
+                                                   session.getId(), session.getRanges().toString(), t.getMessage());
                     logger.error(message, t);
                     fireProgressEvent(tag, new ProgressEvent(ProgressEventType.PROGRESS,
                                                              progress.incrementAndGet(),
@@ -290,7 +294,7 @@
                 {
                     if (sessionResult != null)
                     {
-                        successfulRanges.add(sessionResult.range);
+                        successfulRanges.addAll(sessionResult.ranges);
                     }
                     else
                     {
@@ -350,9 +354,27 @@
         });
     }
 
+    private void addRangeToNeighbors(List<Pair<Set<InetAddress>, ? extends Collection<Range<Token>>>> neighborRangeList, Range<Token> range, Set<InetAddress> neighbors)
+    {
+        for (int i = 0; i < neighborRangeList.size(); i++)
+        {
+            Pair<Set<InetAddress>, ? extends Collection<Range<Token>>> p = neighborRangeList.get(i);
+
+            if (p.left.containsAll(neighbors))
+            {
+                p.right.add(range);
+                return;
+            }
+        }
+
+        List<Range<Token>> ranges = new ArrayList<>();
+        ranges.add(range);
+        neighborRangeList.add(Pair.create(neighbors, ranges));
+    }
+
     private Thread createQueryThread(final int cmd, final UUID sessionId)
     {
-        return new Thread(new WrappedRunnable()
+        return new Thread(NamedThreadFactory.threadLocalDeallocator(new WrappedRunnable()
         {
             // Query events within a time interval that overlaps the last by one second. Ignore duplicates. Ignore local traces.
             // Wake up upon local trace activity. Query when notified of trace activity with a timeout that doubles every two timeouts.
@@ -364,7 +386,7 @@
 
                 String format = "select event_id, source, activity from %s.%s where session_id = ? and event_id > ? and event_id < ?;";
                 String query = String.format(format, TraceKeyspace.NAME, TraceKeyspace.EVENTS);
-                SelectStatement statement = (SelectStatement) QueryProcessor.parseStatement(query).prepare().statement;
+                SelectStatement statement = (SelectStatement) QueryProcessor.parseStatement(query).prepare(ClientState.forInternalCalls()).statement;
 
                 ByteBuffer sessionIdBytes = ByteBufferUtil.bytes(sessionId);
                 InetAddress source = FBUtilities.getBroadcastAddress();
@@ -419,6 +441,6 @@
                     seen[si].clear();
                 }
             }
-        });
+        }));
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/RepairSession.java b/src/java/org/apache/cassandra/repair/RepairSession.java
index 70bfaa6..ac8e0a9 100644
--- a/src/java/org/apache/cassandra/repair/RepairSession.java
+++ b/src/java/org/apache/cassandra/repair/RepairSession.java

@@ -24,6 +24,7 @@
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Lists;
 import com.google.common.util.concurrent.*;
 import org.slf4j.Logger;
@@ -36,13 +37,13 @@
 import org.apache.cassandra.gms.*;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
 import org.apache.cassandra.utils.Pair;
 
 /**
- * Coordinates the (active) repair of a token range.
+ * Coordinates the (active) repair of a list of non overlapping token ranges.
  *
- * A given RepairSession repairs a set of replicas for a given range on a list
+ * A given RepairSession repairs a set of replicas for a given set of ranges on a list
  * of column families. For each of the column family to repair, RepairSession
  * creates a {@link RepairJob} that handles the repair of that CF.
  *
@@ -53,8 +54,8 @@
  *      validationComplete()).
  *   </li>
  *   <li>Synchronization phase: once all trees are received, the job compares each tree with
- *      all the other using a so-called {@link SyncTask}. If there is difference between 2 trees, the
- *      concerned SyncTask will start a streaming of the difference between the 2 endpoint concerned.
+ *      all the others and creates a {@link SyncTask} for each diverging replica. If there are differences
+ *      between 2 trees, the concerned SyncTask stream the differences between the 2 endpoints concerned.
  *   </li>
  * </ol>
  * The job is done once all its SyncTasks are done (i.e. have either computed no differences
@@ -63,11 +64,11 @@
  * A given session will execute the first phase (validation phase) of each of it's job
  * sequentially. In other words, it will start the first job and only start the next one
  * once that first job validation phase is complete. This is done so that the replica only
- * create one merkle tree at a time, which is our way to ensure that such creation starts
+ * create one merkle tree per range at a time, which is our way to ensure that such creation starts
  * roughly at the same time on every node (see CASSANDRA-2816). However the synchronization
  * phases are allowed to run concurrently (with each other and with validation phases).
  *
- * A given RepairJob has 2 modes: either sequential or not (isSequential flag). If sequential,
+ * A given RepairJob has 2 modes: either sequential or not (RepairParallelism). If sequential,
  * it will requests merkle tree creation from each replica in sequence (though in that case
  * we still first send a message to each node to flush and snapshot data so each merkle tree
  * creation is still done on similar data, even if the actual creation is not
@@ -87,9 +88,9 @@
     private final String[] cfnames;
     public final RepairParallelism parallelismDegree;
     /** Range to repair */
-    public final Range<Token> range;
+    public final Collection<Range<Token>> ranges;
     public final Set<InetAddress> endpoints;
-    private final long repairedAt;
+    public final long repairedAt;
 
     private final AtomicBoolean isFailed = new AtomicBoolean(false);
 
@@ -99,7 +100,7 @@
     private final ConcurrentMap<Pair<RepairJobDesc, NodePair>, RemoteSyncTask> syncingTasks = new ConcurrentHashMap<>();
 
     // Tasks(snapshot, validate request, differencing, ...) are run on taskExecutor
-    private final ListeningExecutorService taskExecutor = MoreExecutors.listeningDecorator(DebuggableThreadPoolExecutor.createCachedThreadpoolWithMaxSize("RepairJobTask"));
+    public final ListeningExecutorService taskExecutor;
 
     private volatile boolean terminated = false;
 
@@ -108,7 +109,7 @@
      *
      * @param parentRepairSession the parent sessions id
      * @param id this sessions id
-     * @param range range to repair
+     * @param ranges ranges to repair
      * @param keyspace name of keyspace
      * @param parallelismDegree specifies the degree of parallelism when calculating the merkle trees
      * @param endpoints the data centers that should be part of the repair; null for all DCs
@@ -117,7 +118,7 @@
      */
     public RepairSession(UUID parentRepairSession,
                          UUID id,
-                         Range<Token> range,
+                         Collection<Range<Token>> ranges,
                          String keyspace,
                          RepairParallelism parallelismDegree,
                          Set<InetAddress> endpoints,
@@ -131,9 +132,16 @@
         this.parallelismDegree = parallelismDegree;
         this.keyspace = keyspace;
         this.cfnames = cfnames;
-        this.range = range;
+        this.ranges = ranges;
         this.endpoints = endpoints;
         this.repairedAt = repairedAt;
+        this.taskExecutor = MoreExecutors.listeningDecorator(createExecutor());
+    }
+
+    @VisibleForTesting
+    protected DebuggableThreadPoolExecutor createExecutor()
+    {
+        return DebuggableThreadPoolExecutor.createCachedThreadpoolWithMaxSize("RepairJobTask");
     }
 
     public UUID getId()
@@ -141,9 +149,9 @@
         return id;
     }
 
-    public Range<Token> getRange()
+    public Collection<Range<Token>> getRanges()
     {
-        return range;
+        return ranges;
     }
 
     public void waitForValidation(Pair<RepairJobDesc, InetAddress> key, ValidationTask task)
@@ -161,9 +169,9 @@
      *
      * @param desc repair job description
      * @param endpoint endpoint that sent merkle tree
-     * @param tree calculated merkle tree, or null if validation failed
+     * @param trees calculated merkle trees, or null if validation failed
      */
-    public void validationComplete(RepairJobDesc desc, InetAddress endpoint, MerkleTree tree)
+    public void validationComplete(RepairJobDesc desc, InetAddress endpoint, MerkleTrees trees)
     {
         ValidationTask task = validating.remove(Pair.create(desc, endpoint));
         if (task == null)
@@ -175,7 +183,7 @@
         String message = String.format("Received merkle tree for %s from %s", desc.columnFamily, endpoint);
         logger.info("[repair #{}] {}", getId(), message);
         Tracing.traceRepair(message);
-        task.treeReceived(tree);
+        task.treesReceived(trees);
     }
 
     /**
@@ -198,6 +206,12 @@
         task.syncComplete(success);
     }
 
+    @VisibleForTesting
+    Map<Pair<RepairJobDesc, NodePair>, RemoteSyncTask> getSyncingTasks()
+    {
+        return Collections.unmodifiableMap(syncingTasks);
+    }
+
     private String repairedNodes()
     {
         StringBuilder sb = new StringBuilder();
@@ -221,15 +235,15 @@
         if (terminated)
             return;
 
-        logger.info(String.format("[repair #%s] new session: will sync %s on range %s for %s.%s", getId(), repairedNodes(), range, keyspace, Arrays.toString(cfnames)));
-        Tracing.traceRepair("Syncing range {}", range);
-        SystemDistributedKeyspace.startRepairs(getId(), parentRepairSession, keyspace, cfnames, range, endpoints);
+        logger.info(String.format("[repair #%s] new session: will sync %s on range %s for %s.%s", getId(), repairedNodes(), ranges, keyspace, Arrays.toString(cfnames)));
+        Tracing.traceRepair("Syncing range {}", ranges);
+        SystemDistributedKeyspace.startRepairs(getId(), parentRepairSession, keyspace, cfnames, ranges, endpoints);
 
         if (endpoints.isEmpty())
         {
-            logger.info("[repair #{}] {}", getId(), message = String.format("No neighbors to repair with on range %s: session completed", range));
+            logger.info("[repair #{}] {}", getId(), message = String.format("No neighbors to repair with on range %s: session completed", ranges));
             Tracing.traceRepair(message);
-            set(new RepairSessionResult(id, keyspace, range, Lists.<RepairResult>newArrayList()));
+            set(new RepairSessionResult(id, keyspace, ranges, Lists.<RepairResult>newArrayList()));
             SystemDistributedKeyspace.failRepairs(getId(), keyspace, cfnames, new RuntimeException(message));
             return;
         }
@@ -252,7 +266,7 @@
         List<ListenableFuture<RepairResult>> jobs = new ArrayList<>(cfnames.length);
         for (String cfname : cfnames)
         {
-            RepairJob job = new RepairJob(this, cfname, parallelismDegree, repairedAt, taskExecutor);
+            RepairJob job = new RepairJob(this, cfname);
             executor.execute(job);
             jobs.add(job);
         }
@@ -264,8 +278,8 @@
             {
                 // this repair session is completed
                 logger.info("[repair #{}] {}", getId(), "Session completed successfully");
-                Tracing.traceRepair("Completed sync of range {}", range);
-                set(new RepairSessionResult(id, keyspace, range, results));
+                Tracing.traceRepair("Completed sync of range {}", ranges);
+                set(new RepairSessionResult(id, keyspace, ranges, results));
 
                 taskExecutor.shutdown();
                 // mark this session as terminated

diff --git a/src/java/org/apache/cassandra/repair/RepairSessionResult.java b/src/java/org/apache/cassandra/repair/RepairSessionResult.java
index 4551608..d4fff37 100644
--- a/src/java/org/apache/cassandra/repair/RepairSessionResult.java
+++ b/src/java/org/apache/cassandra/repair/RepairSessionResult.java

@@ -30,14 +30,14 @@
 {
     public final UUID sessionId;
     public final String keyspace;
-    public final Range<Token> range;
+    public final Collection<Range<Token>> ranges;
     public final Collection<RepairResult> repairJobResults;
 
-    public RepairSessionResult(UUID sessionId, String keyspace, Range<Token> range, Collection<RepairResult> repairJobResults)
+    public RepairSessionResult(UUID sessionId, String keyspace, Collection<Range<Token>> ranges, Collection<RepairResult> repairJobResults)
     {
         this.sessionId = sessionId;
         this.keyspace = keyspace;
-        this.range = range;
+        this.ranges = ranges;
         this.repairJobResults = repairJobResults;
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/SyncTask.java b/src/java/org/apache/cassandra/repair/SyncTask.java
index 7350a66..c96caf4 100644
--- a/src/java/org/apache/cassandra/repair/SyncTask.java
+++ b/src/java/org/apache/cassandra/repair/SyncTask.java

@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.repair;
 
-import java.util.ArrayList;
+import java.net.InetAddress;
 import java.util.List;
 
 import com.google.common.util.concurrent.AbstractFuture;
@@ -27,10 +27,9 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.MerkleTree;
 
 /**
- * SyncTask will calculate the difference of MerkleTree between two nodes
+ * SyncTask takes the difference of MerkleTrees between two nodes
  * and perform necessary operation to repair replica.
  */
 public abstract class SyncTask extends AbstractFuture<SyncStat> implements Runnable
@@ -38,16 +37,19 @@
     private static Logger logger = LoggerFactory.getLogger(SyncTask.class);
 
     protected final RepairJobDesc desc;
-    protected final TreeResponse r1;
-    protected final TreeResponse r2;
+    protected final InetAddress firstEndpoint;
+    protected final InetAddress secondEndpoint;
+
+    private final List<Range<Token>> rangesToSync;
 
     protected volatile SyncStat stat;
 
-    public SyncTask(RepairJobDesc desc, TreeResponse r1, TreeResponse r2)
+    public SyncTask(RepairJobDesc desc, InetAddress firstEndpoint, InetAddress secondEndpoint, List<Range<Token>> rangesToSync)
     {
         this.desc = desc;
-        this.r1 = r1;
-        this.r2 = r2;
+        this.firstEndpoint = firstEndpoint;
+        this.secondEndpoint = secondEndpoint;
+        this.rangesToSync = rangesToSync;
     }
 
     /**
@@ -55,26 +57,22 @@
      */
     public void run()
     {
-        // compare trees, and collect differences
-        List<Range<Token>> differences = new ArrayList<>();
-        differences.addAll(MerkleTree.difference(r1.tree, r2.tree));
-
-        stat = new SyncStat(new NodePair(r1.endpoint, r2.endpoint), differences.size());
+        stat = new SyncStat(new NodePair(firstEndpoint, secondEndpoint), rangesToSync.size());
 
         // choose a repair method based on the significance of the difference
-        String format = String.format("[repair #%s] Endpoints %s and %s %%s for %s", desc.sessionId, r1.endpoint, r2.endpoint, desc.columnFamily);
-        if (differences.isEmpty())
+        String format = String.format("[repair #%s] Endpoints %s and %s %%s for %s", desc.sessionId, firstEndpoint, secondEndpoint, desc.columnFamily);
+        if (rangesToSync.isEmpty())
         {
             logger.info(String.format(format, "are consistent"));
-            Tracing.traceRepair("Endpoint {} is consistent with {} for {}", r1.endpoint, r2.endpoint, desc.columnFamily);
+            Tracing.traceRepair("Endpoint {} is consistent with {} for {}", firstEndpoint, secondEndpoint, desc.columnFamily);
             set(stat);
             return;
         }
 
         // non-0 difference: perform streaming repair
-        logger.info(String.format(format, "have " + differences.size() + " range(s) out of sync"));
-        Tracing.traceRepair("Endpoint {} has {} range(s) out of sync with {} for {}", r1.endpoint, differences.size(), r2.endpoint, desc.columnFamily);
-        startSync(differences);
+        logger.info(String.format(format, "have " + rangesToSync.size() + " range(s) out of sync"));
+        Tracing.traceRepair("Endpoint {} has {} range(s) out of sync with {} for {}", firstEndpoint, rangesToSync.size(), secondEndpoint, desc.columnFamily);
+        startSync(rangesToSync);
     }
 
     public SyncStat getCurrentStat()

diff --git a/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java b/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java
index 2509597..a922b28 100644
--- a/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java
+++ b/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java

@@ -22,35 +22,49 @@
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 import java.util.Set;
 import java.util.UUID;
 
 import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Sets;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.Tables;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
 public final class SystemDistributedKeyspace
 {
-    private static Logger logger = LoggerFactory.getLogger(SystemDistributedKeyspace.class);
+    private SystemDistributedKeyspace()
+    {
+    }
+
+    private static final Logger logger = LoggerFactory.getLogger(SystemDistributedKeyspace.class);
 
     public static final String NAME = "system_distributed";
 
+    /**
+     * Generation is used as a timestamp for automatic table creation on startup.
+     * If you make any changes to the tables below, make sure to increment the
+     * generation and document your change here.
+     *
+     * gen 0: original definition in 2.2
+     * gen 1: (pre-)add options column to parent_repair_history in 3.0, 3.11
+     * gen 2: (pre-)add coordinator_port and participants_v2 columns to repair_history in 3.0, 3.11, 4.0
+     */
+    public static final long GENERATION = 2;
+
     public static final String REPAIR_HISTORY = "repair_history";
 
     public static final String PARENT_REPAIR_HISTORY = "parent_repair_history";
@@ -66,7 +80,9 @@
                      + "range_begin text,"
                      + "range_end text,"
                      + "coordinator inet,"
+                     + "coordinator_port int,"
                      + "participants set<inet>,"
+                     + "participants_v2 set<text>,"
                      + "exception_message text,"
                      + "exception_stacktrace text,"
                      + "status text,"
@@ -87,6 +103,7 @@
                      + "exception_stacktrace text,"
                      + "requested_ranges set<text>,"
                      + "successful_ranges set<text>,"
+                     + "options map<text, text>,"
                      + "PRIMARY KEY (parent_id))");
 
     private static CFMetaData compile(String name, String description, String schema)
@@ -95,10 +112,9 @@
                          .comment(description);
     }
 
-    public static KSMetaData definition()
+    public static KeyspaceMetadata metadata()
     {
-        List<CFMetaData> tables = Arrays.asList(RepairHistory, ParentRepairHistory);
-        return new KSMetaData(NAME, SimpleStrategy.class, ImmutableMap.of("replication_factor", "3"), true, tables);
+        return KeyspaceMetadata.create(NAME, KeyspaceParams.simple(3), Tables.of(RepairHistory, ParentRepairHistory));
     }
 
     public static void startParentRepair(UUID parent_id, String keyspaceName, String[] cfnames, Collection<Range<Token>> ranges)
@@ -128,7 +144,7 @@
         processSilent(fmtQuery);
     }
 
-    public static void startRepairs(UUID id, UUID parent_id, String keyspaceName, String[] cfnames, Range<Token> range, Iterable<InetAddress> endpoints)
+    public static void startRepairs(UUID id, UUID parent_id, String keyspaceName, String[] cfnames, Collection<Range<Token>> ranges, Iterable<InetAddress> endpoints)
     {
         String coordinator = FBUtilities.getBroadcastAddress().getHostAddress();
         Set<String> participants = Sets.newHashSet(coordinator);
@@ -142,17 +158,20 @@
 
         for (String cfname : cfnames)
         {
-            String fmtQry = String.format(query, NAME, REPAIR_HISTORY,
-                                          keyspaceName,
-                                          cfname,
-                                          id.toString(),
-                                          parent_id.toString(),
-                                          range.left.toString(),
-                                          range.right.toString(),
-                                          coordinator,
-                                          Joiner.on("', '").join(participants),
-                    RepairState.STARTED.toString());
-            processSilent(fmtQry);
+            for (Range<Token> range : ranges)
+            {
+                String fmtQry = String.format(query, NAME, REPAIR_HISTORY,
+                                              keyspaceName,
+                                              cfname,
+                                              id.toString(),
+                                              parent_id.toString(),
+                                              range.left.toString(),
+                                              range.right.toString(),
+                                              coordinator,
+                                              Joiner.on("', '").join(participants),
+                                              RepairState.STARTED.toString());
+                processSilent(fmtQry);
+            }
         }
     }
 

diff --git a/src/java/org/apache/cassandra/repair/TreeResponse.java b/src/java/org/apache/cassandra/repair/TreeResponse.java
index eede4ee..c898b36 100644
--- a/src/java/org/apache/cassandra/repair/TreeResponse.java
+++ b/src/java/org/apache/cassandra/repair/TreeResponse.java

@@ -19,7 +19,7 @@
 
 import java.net.InetAddress;
 
-import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
 
 /**
  * Merkle tree response sent from given endpoint.
@@ -27,11 +27,11 @@
 public class TreeResponse
 {
     public final InetAddress endpoint;
-    public final MerkleTree tree;
+    public final MerkleTrees trees;
 
-    public TreeResponse(InetAddress endpoint, MerkleTree tree)
+    public TreeResponse(InetAddress endpoint, MerkleTrees trees)
     {
         this.endpoint = endpoint;
-        this.tree = tree;
+        this.trees = trees;
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/ValidationTask.java b/src/java/org/apache/cassandra/repair/ValidationTask.java
index a52ec4f..bd866d2 100644
--- a/src/java/org/apache/cassandra/repair/ValidationTask.java
+++ b/src/java/org/apache/cassandra/repair/ValidationTask.java

@@ -18,13 +18,17 @@
 package org.apache.cassandra.repair;
 
 import java.net.InetAddress;
+import java.util.Map;
 
 import com.google.common.util.concurrent.AbstractFuture;
 
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.RepairException;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.messages.ValidationRequest;
 import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
 
 /**
  * ValidationTask sends {@link ValidationRequest} to a replica.
@@ -53,19 +57,19 @@
     }
 
     /**
-     * Receive MerkleTree from replica node.
+     * Receive MerkleTrees from replica node.
      *
-     * @param tree MerkleTree that is sent from replica. Null if validation failed on replica node.
+     * @param trees MerkleTrees that is sent from replica. Null if validation failed on replica node.
      */
-    public void treeReceived(MerkleTree tree)
+    public void treesReceived(MerkleTrees trees)
     {
-        if (tree == null)
+        if (trees == null)
         {
             setException(new RepairException(desc, "Validation failed in " + endpoint));
         }
         else
         {
-            set(new TreeResponse(endpoint, tree));
+            set(new TreeResponse(endpoint, trees));
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/Validator.java b/src/java/org/apache/cassandra/repair/Validator.java
index 8dbb4cf..9baa358 100644
--- a/src/java/org/apache/cassandra/repair/Validator.java
+++ b/src/java/org/apache/cassandra/repair/Validator.java

@@ -31,13 +31,17 @@
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.repair.messages.ValidationComplete;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MerkleTree;
 import org.apache.cassandra.utils.MerkleTree.RowHash;
+import org.apache.cassandra.utils.MerkleTrees;
 
 /**
  * Handles the building of a merkle tree for a column family.
@@ -58,11 +62,11 @@
 
     // null when all rows with the min token have been consumed
     private long validated;
-    private MerkleTree tree;
+    private MerkleTrees trees;
     // current range being updated
     private MerkleTree.TreeRange range;
     // iterator for iterating sub ranges (MT's leaves)
-    private MerkleTree.TreeRangeIterator ranges;
+    private MerkleTrees.TreeRangeIterator ranges;
     // last key seen
     private DecoratedKey lastKey;
 
@@ -82,9 +86,9 @@
         this.evenTreeDistribution = evenTreeDistribution;
     }
 
-    public void prepare(ColumnFamilyStore cfs, MerkleTree tree)
+    public void prepare(ColumnFamilyStore cfs, MerkleTrees tree)
     {
-        this.tree = tree;
+        this.trees = tree;
 
         if (!tree.partitioner().preservesOrder() || evenTreeDistribution)
         {
@@ -94,31 +98,36 @@
         else
         {
             List<DecoratedKey> keys = new ArrayList<>();
-            for (DecoratedKey sample : cfs.keySamples(desc.range))
+            Random random = new Random();
+            
+            for (Range<Token> range : tree.ranges())
             {
-                assert desc.range.contains(sample.getToken()): "Token " + sample.getToken() + " is not within range " + desc.range;
-                keys.add(sample);
-            }
-
-            if (keys.isEmpty())
-            {
-                // use an even tree distribution
-                tree.init();
-            }
-            else
-            {
-                int numkeys = keys.size();
-                Random random = new Random();
-                // sample the column family using random keys from the index
-                while (true)
+                for (DecoratedKey sample : cfs.keySamples(range))
                 {
-                    DecoratedKey dk = keys.get(random.nextInt(numkeys));
-                    if (!tree.split(dk.getToken()))
-                        break;
+                    assert range.contains(sample.getToken()) : "Token " + sample.getToken() + " is not within range " + desc.ranges;
+                    keys.add(sample);
+                }
+
+                if (keys.isEmpty())
+                {
+                    // use an even tree distribution
+                    tree.init(range);
+                }
+                else
+                {
+                    int numKeys = keys.size();
+                    // sample the column family using random keys from the index
+                    while (true)
+                    {
+                        DecoratedKey dk = keys.get(random.nextInt(numKeys));
+                        if (!tree.split(dk.getToken()))
+                            break;
+                    }
+                    keys.clear();
                 }
             }
         }
-        logger.debug("Prepared AEService tree of size {} for {}", tree.size(), desc);
+        logger.debug("Prepared AEService trees of size {} for {}", trees.size(), desc);
         ranges = tree.invalids();
     }
 
@@ -128,32 +137,43 @@
      *
      * @param row Row to add hash
      */
-    public void add(AbstractCompactedRow row)
+    public void add(UnfilteredRowIterator partition)
     {
-        assert desc.range.contains(row.key.getToken()) : row.key.getToken() + " is not contained in " + desc.range;
-        assert lastKey == null || lastKey.compareTo(row.key) < 0
-               : "row " + row.key + " received out of order wrt " + lastKey;
-        lastKey = row.key;
+        assert Range.isInRanges(partition.partitionKey().getToken(), desc.ranges) : partition.partitionKey().getToken() + " is not contained in " + desc.ranges;
+        assert lastKey == null || lastKey.compareTo(partition.partitionKey()) < 0
+               : "partition " + partition.partitionKey() + " received out of order wrt " + lastKey;
+        lastKey = partition.partitionKey();
 
         if (range == null)
             range = ranges.next();
 
         // generate new ranges as long as case 1 is true
-        while (!range.contains(row.key.getToken()))
+        if (!findCorrectRange(lastKey.getToken()))
         {
             // add the empty hash, and move to the next range
-            range.ensureHashInitialised();
-            range = ranges.next();
+            ranges = trees.invalids();
+            findCorrectRange(lastKey.getToken());
         }
 
+        assert range.contains(lastKey.getToken()) : "Token not in MerkleTree: " + lastKey.getToken();
         // case 3 must be true: mix in the hashed row
-        RowHash rowHash = rowHash(row);
+        RowHash rowHash = rowHash(partition);
         if (rowHash != null)
         {
             range.addHash(rowHash);
         }
     }
 
+    public boolean findCorrectRange(Token t)
+    {
+        while (!range.contains(t) && ranges.hasNext())
+        {
+            range = ranges.next();
+        }
+
+        return range.contains(t);
+    }
+
     static class CountingDigest extends MessageDigest
     {
         private long count;
@@ -193,21 +213,16 @@
 
     }
 
-    private MerkleTree.RowHash rowHash(AbstractCompactedRow row)
+    private MerkleTree.RowHash rowHash(UnfilteredRowIterator partition)
     {
         validated++;
         // MerkleTree uses XOR internally, so we want lots of output bits here
         CountingDigest digest = new CountingDigest(FBUtilities.newMessageDigest("SHA-256"));
-        row.update(digest);
+        UnfilteredRowIterators.digest(null, partition, digest, MessagingService.current_version);
         // only return new hash for merkle tree in case digest was updated - see CASSANDRA-8979
-        if (digest.count > 0)
-        {
-            return new MerkleTree.RowHash(row.key.getToken(), digest.digest(), digest.count);
-        }
-        else
-        {
-            return null;
-        }
+        return digest.count > 0
+             ? new MerkleTree.RowHash(partition.partitionKey().getToken(), digest.digest(), digest.count)
+             : null;
     }
 
     /**
@@ -223,9 +238,9 @@
         {
             // log distribution of rows in tree
             logger.debug("Validated {} partitions for {}.  Partitions per leaf are:", validated, desc.sessionId);
-            tree.histogramOfRowCountPerLeaf().log(logger);
+            trees.logRowCountPerLeaf(logger);
             logger.debug("Validated {} partitions for {}.  Partition sizes are:", validated, desc.sessionId);
-            tree.histogramOfRowSizePerLeaf().log(logger);
+            trees.logRowSizePerLeaf(logger);
         }
     }
 
@@ -234,8 +249,8 @@
     {
         assert ranges != null : "Validator was not prepared()";
 
-        if (range != null)
-            range.ensureHashInitialised();
+        ranges = trees.invalids();
+
         while (ranges.hasNext())
         {
             range = ranges.next();
@@ -266,6 +281,6 @@
             logger.info(String.format("[repair #%s] Sending completed merkle tree to %s for %s.%s", desc.sessionId, initiator, desc.keyspace, desc.columnFamily));
             Tracing.traceRepair("Sending completed merkle tree to {} for {}.{}", initiator, desc.keyspace, desc.columnFamily);
         }
-        MessagingService.instance().sendOneWay(new ValidationComplete(desc, tree).createMessage(), initiator);
+        MessagingService.instance().sendOneWay(new ValidationComplete(desc, trees).createMessage(), initiator);
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java b/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java
index b554500..a29cc87 100644
--- a/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java
+++ b/src/java/org/apache/cassandra/repair/messages/AnticompactionRequest.java

@@ -17,15 +17,16 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.Objects;
 import java.util.UUID;
 
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -46,6 +47,23 @@
         this.successfulRanges = ranges;
     }
 
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof AnticompactionRequest))
+            return false;
+        AnticompactionRequest other = (AnticompactionRequest)o;
+        return messageType == other.messageType &&
+               parentRepairSession.equals(other.parentRepairSession) &&
+               successfulRanges.equals(other.successfulRanges);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType, parentRepairSession, successfulRanges);
+    }
+
     public static class AnticompactionRequestSerializer implements MessageSerializer<AnticompactionRequest>
     {
         public void serialize(AnticompactionRequest message, DataOutputPlus out, int version) throws IOException
@@ -59,7 +77,7 @@
             }
         }
 
-        public AnticompactionRequest deserialize(DataInput in, int version) throws IOException
+        public AnticompactionRequest deserialize(DataInputPlus in, int version) throws IOException
         {
             UUID parentRepairSession = UUIDSerializer.serializer.deserialize(in, version);
             int rangeCount = in.readInt();
@@ -72,6 +90,7 @@
         public long serializedSize(AnticompactionRequest message, int version)
         {
             long size = UUIDSerializer.serializer.serializedSize(message.parentRepairSession, version);
+            size += Integer.BYTES; // count of items in successfulRanges
             for (Range<Token> r : message.successfulRanges)
                 size += Range.tokenSerializer.serializedSize(r, version);
             return size;

diff --git a/src/java/org/apache/cassandra/repair/messages/CleanupMessage.java b/src/java/org/apache/cassandra/repair/messages/CleanupMessage.java
index 6d702ce..69d147a 100644
--- a/src/java/org/apache/cassandra/repair/messages/CleanupMessage.java
+++ b/src/java/org/apache/cassandra/repair/messages/CleanupMessage.java

@@ -17,10 +17,11 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
+import java.util.Objects;
 import java.util.UUID;
 
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.UUIDSerializer;
 
@@ -40,6 +41,22 @@
         this.parentRepairSession = parentRepairSession;
     }
 
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof CleanupMessage))
+            return false;
+        CleanupMessage other = (CleanupMessage) o;
+        return messageType == other.messageType &&
+               parentRepairSession.equals(other.parentRepairSession);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType, parentRepairSession);
+    }
+
     public static class CleanupMessageSerializer implements MessageSerializer<CleanupMessage>
     {
         public void serialize(CleanupMessage message, DataOutputPlus out, int version) throws IOException
@@ -47,7 +64,7 @@
             UUIDSerializer.serializer.serialize(message.parentRepairSession, out, version);
         }
 
-        public CleanupMessage deserialize(DataInput in, int version) throws IOException
+        public CleanupMessage deserialize(DataInputPlus in, int version) throws IOException
         {
             UUID parentRepairSession = UUIDSerializer.serializer.deserialize(in, version);
             return new CleanupMessage(parentRepairSession);

diff --git a/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java b/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java
index 3a00376..b3efeae 100644
--- a/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java
+++ b/src/java/org/apache/cassandra/repair/messages/PrepareMessage.java

@@ -17,16 +17,17 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.Objects;
 import java.util.UUID;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -34,32 +35,49 @@
 
 public class PrepareMessage extends RepairMessage
 {
-    public final static MessageSerializer serializer = new PrepareMessageSerializer(false);
-    public final static MessageSerializer globalSerializer = new PrepareMessageSerializer(true);
+    public final static MessageSerializer serializer = new PrepareMessageSerializer();
     public final List<UUID> cfIds;
     public final Collection<Range<Token>> ranges;
 
     public final UUID parentRepairSession;
     public final boolean isIncremental;
+    public final long timestamp;
+    public final boolean isGlobal;
 
-    public PrepareMessage(UUID parentRepairSession, List<UUID> cfIds, Collection<Range<Token>> ranges, boolean isIncremental, boolean isGlobal)
+    public PrepareMessage(UUID parentRepairSession, List<UUID> cfIds, Collection<Range<Token>> ranges, boolean isIncremental, long timestamp, boolean isGlobal)
     {
-        super(isGlobal ? Type.PREPARE_GLOBAL_MESSAGE : Type.PREPARE_MESSAGE, null);
+        super(Type.PREPARE_MESSAGE, null);
         this.parentRepairSession = parentRepairSession;
         this.cfIds = cfIds;
         this.ranges = ranges;
         this.isIncremental = isIncremental;
+        this.timestamp = timestamp;
+        this.isGlobal = isGlobal;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof PrepareMessage))
+            return false;
+        PrepareMessage other = (PrepareMessage) o;
+        return messageType == other.messageType &&
+               parentRepairSession.equals(other.parentRepairSession) &&
+               isIncremental == other.isIncremental &&
+               isGlobal == other.isGlobal &&
+               timestamp == other.timestamp &&
+               cfIds.equals(other.cfIds) &&
+               ranges.equals(other.ranges);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType, parentRepairSession, isGlobal, isIncremental, timestamp, cfIds, ranges);
     }
 
     public static class PrepareMessageSerializer implements MessageSerializer<PrepareMessage>
     {
-        private final boolean isGlobal;
-
-        public PrepareMessageSerializer(boolean global)
-        {
-            this.isGlobal = global;
-        }
-
         public void serialize(PrepareMessage message, DataOutputPlus out, int version) throws IOException
         {
             out.writeInt(message.cfIds.size());
@@ -73,9 +91,11 @@
                 Range.tokenSerializer.serialize(r, out, version);
             }
             out.writeBoolean(message.isIncremental);
+            out.writeLong(message.timestamp);
+            out.writeBoolean(message.isGlobal);
         }
 
-        public PrepareMessage deserialize(DataInput in, int version) throws IOException
+        public PrepareMessage deserialize(DataInputPlus in, int version) throws IOException
         {
             int cfIdCount = in.readInt();
             List<UUID> cfIds = new ArrayList<>(cfIdCount);
@@ -87,22 +107,24 @@
             for (int i = 0; i < rangeCount; i++)
                 ranges.add((Range<Token>) Range.tokenSerializer.deserialize(in, MessagingService.globalPartitioner(), version));
             boolean isIncremental = in.readBoolean();
-
-            return new PrepareMessage(parentRepairSession, cfIds, ranges, isIncremental, isGlobal);
+            long timestamp = in.readLong();
+            boolean isGlobal = in.readBoolean();
+            return new PrepareMessage(parentRepairSession, cfIds, ranges, isIncremental, timestamp, isGlobal);
         }
 
         public long serializedSize(PrepareMessage message, int version)
         {
             long size;
-            TypeSizes sizes = TypeSizes.NATIVE;
-            size = sizes.sizeof(message.cfIds.size());
+            size = TypeSizes.sizeof(message.cfIds.size());
             for (UUID cfId : message.cfIds)
                 size += UUIDSerializer.serializer.serializedSize(cfId, version);
             size += UUIDSerializer.serializer.serializedSize(message.parentRepairSession, version);
-            size += sizes.sizeof(message.ranges.size());
+            size += TypeSizes.sizeof(message.ranges.size());
             for (Range<Token> r : message.ranges)
                 size += Range.tokenSerializer.serializedSize(r, version);
-            size += sizes.sizeof(message.isIncremental);
+            size += TypeSizes.sizeof(message.isIncremental);
+            size += TypeSizes.sizeof(message.timestamp);
+            size += TypeSizes.sizeof(message.isGlobal);
             return size;
         }
     }
@@ -115,6 +137,8 @@
                 ", ranges=" + ranges +
                 ", parentRepairSession=" + parentRepairSession +
                 ", isIncremental="+isIncremental +
+                ", timestamp=" + timestamp +
+                ", isGlobal=" + isGlobal +
                 '}';
     }
 }

diff --git a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java
index 82e474f..55fdb66 100644
--- a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java
+++ b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
@@ -45,7 +45,6 @@
         SYNC_COMPLETE(3, SyncComplete.serializer),
         ANTICOMPACTION_REQUEST(4, AnticompactionRequest.serializer),
         PREPARE_MESSAGE(5, PrepareMessage.serializer),
-        PREPARE_GLOBAL_MESSAGE(8, PrepareMessage.globalSerializer),
         SNAPSHOT(6, SnapshotMessage.serializer),
         CLEANUP(7, CleanupMessage.serializer);
 
@@ -91,7 +90,7 @@
             message.messageType.serializer.serialize(message, out, version);
         }
 
-        public RepairMessage deserialize(DataInput in, int version) throws IOException
+        public RepairMessage deserialize(DataInputPlus in, int version) throws IOException
         {
             RepairMessage.Type messageType = RepairMessage.Type.fromByte(in.readByte());
             return messageType.serializer.deserialize(in, version);

diff --git a/src/java/org/apache/cassandra/repair/messages/RepairOption.java b/src/java/org/apache/cassandra/repair/messages/RepairOption.java
index 44a1e57..9d60ad7 100644
--- a/src/java/org/apache/cassandra/repair/messages/RepairOption.java
+++ b/src/java/org/apache/cassandra/repair/messages/RepairOption.java

@@ -158,6 +158,10 @@
                 }
                 Token parsedBeginToken = partitioner.getTokenFactory().fromString(rangeStr[0].trim());
                 Token parsedEndToken = partitioner.getTokenFactory().fromString(rangeStr[1].trim());
+                if (parsedBeginToken.equals(parsedEndToken))
+                {
+                    throw new IllegalArgumentException("Start and end tokens must be different.");
+                }
                 ranges.add(new Range<>(parsedBeginToken, parsedEndToken));
             }
         }

diff --git a/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java b/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java
index caccc82..d4737d3 100644
--- a/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java
+++ b/src/java/org/apache/cassandra/repair/messages/SnapshotMessage.java

@@ -17,9 +17,10 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
+import java.util.Objects;
 
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.RepairJobDesc;
 
@@ -32,6 +33,21 @@
         super(Type.SNAPSHOT, desc);
     }
 
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof SnapshotMessage))
+            return false;
+        SnapshotMessage other = (SnapshotMessage) o;
+        return messageType == other.messageType;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType);
+    }
+
     public static class SnapshotMessageSerializer implements MessageSerializer<SnapshotMessage>
     {
         public void serialize(SnapshotMessage message, DataOutputPlus out, int version) throws IOException
@@ -39,7 +55,7 @@
             RepairJobDesc.serializer.serialize(message.desc, out, version);
         }
 
-        public SnapshotMessage deserialize(DataInput in, int version) throws IOException
+        public SnapshotMessage deserialize(DataInputPlus in, int version) throws IOException
         {
             RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version);
             return new SnapshotMessage(desc);

diff --git a/src/java/org/apache/cassandra/repair/messages/SyncComplete.java b/src/java/org/apache/cassandra/repair/messages/SyncComplete.java
index c9548ca..178e710 100644
--- a/src/java/org/apache/cassandra/repair/messages/SyncComplete.java
+++ b/src/java/org/apache/cassandra/repair/messages/SyncComplete.java

@@ -17,11 +17,12 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
+import java.util.Objects;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.NodePair;
 import org.apache.cassandra.repair.RepairJobDesc;
@@ -53,6 +54,24 @@
         this.success = success;
     }
 
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof SyncComplete))
+            return false;
+        SyncComplete other = (SyncComplete)o;
+        return messageType == other.messageType &&
+               desc.equals(other.desc) &&
+               success == other.success &&
+               nodes.equals(other.nodes);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType, desc, success, nodes);
+    }
+
     private static class SyncCompleteSerializer implements MessageSerializer<SyncComplete>
     {
         public void serialize(SyncComplete message, DataOutputPlus out, int version) throws IOException
@@ -62,7 +81,7 @@
             out.writeBoolean(message.success);
         }
 
-        public SyncComplete deserialize(DataInput in, int version) throws IOException
+        public SyncComplete deserialize(DataInputPlus in, int version) throws IOException
         {
             RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version);
             NodePair nodes = NodePair.serializer.deserialize(in, version);
@@ -73,7 +92,7 @@
         {
             long size = RepairJobDesc.serializer.serializedSize(message.desc, version);
             size += NodePair.serializer.serializedSize(message.nodes, version);
-            size += TypeSizes.NATIVE.sizeof(message.success);
+            size += TypeSizes.sizeof(message.success);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/repair/messages/SyncRequest.java b/src/java/org/apache/cassandra/repair/messages/SyncRequest.java
index 68aaf4d..e31cc6c 100644
--- a/src/java/org/apache/cassandra/repair/messages/SyncRequest.java
+++ b/src/java/org/apache/cassandra/repair/messages/SyncRequest.java

@@ -17,17 +17,18 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.Objects;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 import org.apache.cassandra.net.MessagingService;
@@ -57,6 +58,26 @@
         this.ranges = ranges;
     }
 
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof SyncRequest))
+            return false;
+        SyncRequest req = (SyncRequest)o;
+        return messageType == req.messageType &&
+               desc.equals(req.desc) &&
+               initiator.equals(req.initiator) &&
+               src.equals(req.src) &&
+               dst.equals(req.dst) &&
+               ranges.equals(req.ranges);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType, desc, initiator, src, dst, ranges);
+    }
+
     public static class SyncRequestSerializer implements MessageSerializer<SyncRequest>
     {
         public void serialize(SyncRequest message, DataOutputPlus out, int version) throws IOException
@@ -73,7 +94,7 @@
             }
         }
 
-        public SyncRequest deserialize(DataInput in, int version) throws IOException
+        public SyncRequest deserialize(DataInputPlus in, int version) throws IOException
         {
             RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version);
             InetAddress owner = CompactEndpointSerializationHelper.deserialize(in);
@@ -90,7 +111,7 @@
         {
             long size = RepairJobDesc.serializer.serializedSize(message.desc, version);
             size += 3 * CompactEndpointSerializationHelper.serializedSize(message.initiator);
-            size += TypeSizes.NATIVE.sizeof(message.ranges.size());
+            size += TypeSizes.sizeof(message.ranges.size());
             for (Range<Token> range : message.ranges)
                 size += AbstractBounds.tokenSerializer.serializedSize(range, version);
             return size;

diff --git a/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java b/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java
index 8328979..704bffb 100644
--- a/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java
+++ b/src/java/org/apache/cassandra/repair/messages/ValidationComplete.java

@@ -17,13 +17,14 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
+import java.util.Objects;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.RepairJobDesc;
-import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
 
 /**
  * ValidationComplete message is sent when validation compaction completed successfully.
@@ -34,24 +35,42 @@
 {
     public static MessageSerializer serializer = new ValidationCompleteSerializer();
 
-    /** true if validation success, false otherwise */
-    public final boolean success;
     /** Merkle hash tree response. Null if validation failed. */
-    public final MerkleTree tree;
+    public final MerkleTrees trees;
 
     public ValidationComplete(RepairJobDesc desc)
     {
         super(Type.VALIDATION_COMPLETE, desc);
-        this.success = false;
-        this.tree = null;
+        trees = null;
     }
 
-    public ValidationComplete(RepairJobDesc desc, MerkleTree tree)
+    public ValidationComplete(RepairJobDesc desc, MerkleTrees trees)
     {
         super(Type.VALIDATION_COMPLETE, desc);
-        assert tree != null;
-        this.success = true;
-        this.tree = tree;
+        assert trees != null;
+        this.trees = trees;
+    }
+
+    public boolean success()
+    {
+        return trees != null;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof ValidationComplete))
+            return false;
+
+        ValidationComplete other = (ValidationComplete)o;
+        return messageType == other.messageType &&
+               desc.equals(other.desc);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(messageType, desc);
     }
 
     private static class ValidationCompleteSerializer implements MessageSerializer<ValidationComplete>
@@ -59,31 +78,31 @@
         public void serialize(ValidationComplete message, DataOutputPlus out, int version) throws IOException
         {
             RepairJobDesc.serializer.serialize(message.desc, out, version);
-            out.writeBoolean(message.success);
-            if (message.success)
-                MerkleTree.serializer.serialize(message.tree, out, version);
+            out.writeBoolean(message.success());
+            if (message.trees != null)
+                MerkleTrees.serializer.serialize(message.trees, out, version);
         }
 
-        public ValidationComplete deserialize(DataInput in, int version) throws IOException
+        public ValidationComplete deserialize(DataInputPlus in, int version) throws IOException
         {
             RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version);
-            if (in.readBoolean())
+            boolean success = in.readBoolean();
+
+            if (success)
             {
-                MerkleTree tree = MerkleTree.serializer.deserialize(in, version);
-                return new ValidationComplete(desc, tree);
+                MerkleTrees trees = MerkleTrees.serializer.deserialize(in, version);
+                return new ValidationComplete(desc, trees);
             }
-            else
-            {
-                return new ValidationComplete(desc);
-            }
+
+            return new ValidationComplete(desc);
         }
 
         public long serializedSize(ValidationComplete message, int version)
         {
             long size = RepairJobDesc.serializer.serializedSize(message.desc, version);
-            size += TypeSizes.NATIVE.sizeof(message.success);
-            if (message.success)
-                size += MerkleTree.serializer.serializedSize(message.tree, version);
+            size += TypeSizes.sizeof(message.success());
+            if (message.trees != null)
+                size += MerkleTrees.serializer.serializedSize(message.trees, version);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java b/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java
index 43bcf23..0dfab6a 100644
--- a/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java
+++ b/src/java/org/apache/cassandra/repair/messages/ValidationRequest.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.repair.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.repair.RepairJobDesc;
 
@@ -73,7 +73,7 @@
             out.writeInt(message.gcBefore);
         }
 
-        public ValidationRequest deserialize(DataInput dis, int version) throws IOException
+        public ValidationRequest deserialize(DataInputPlus dis, int version) throws IOException
         {
             RepairJobDesc desc = RepairJobDesc.serializer.deserialize(dis, version);
             return new ValidationRequest(desc, dis.readInt());
@@ -82,7 +82,7 @@
         public long serializedSize(ValidationRequest message, int version)
         {
             long size = RepairJobDesc.serializer.serializedSize(message.desc, version);
-            size += TypeSizes.NATIVE.sizeof(message.gcBefore);
+            size += TypeSizes.sizeof(message.gcBefore);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/scheduler/RoundRobinScheduler.java b/src/java/org/apache/cassandra/scheduler/RoundRobinScheduler.java
index c98c0fe..61dfa50 100644
--- a/src/java/org/apache/cassandra/scheduler/RoundRobinScheduler.java
+++ b/src/java/org/apache/cassandra/scheduler/RoundRobinScheduler.java

@@ -25,6 +25,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.config.RequestSchedulerOptions;
 import org.cliffc.high_scale_lib.NonBlockingHashMap;
 
@@ -69,7 +70,7 @@
                 }
             }
         };
-        Thread scheduler = new Thread(runnable, "REQUEST-SCHEDULER");
+        Thread scheduler = new Thread(NamedThreadFactory.threadLocalDeallocator(runnable), "REQUEST-SCHEDULER");
         scheduler.start();
         logger.info("Started the RoundRobin Request Scheduler");
     }

diff --git a/src/java/org/apache/cassandra/schema/CQLTypeParser.java b/src/java/org/apache/cassandra/schema/CQLTypeParser.java
new file mode 100644
index 0000000..c79de88
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/CQLTypeParser.java

@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UserType;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public final class CQLTypeParser
+{
+    private static final ImmutableSet<String> PRIMITIVE_TYPES;
+
+    static
+    {
+        ImmutableSet.Builder<String> builder = ImmutableSet.builder();
+        for (CQL3Type.Native primitive : CQL3Type.Native.values())
+            builder.add(primitive.name().toLowerCase());
+        PRIMITIVE_TYPES = builder.build();
+    }
+
+    public static AbstractType<?> parse(String keyspace, String unparsed, Types userTypes)
+    {
+        String lowercased = unparsed.toLowerCase();
+
+        // fast path for the common case of a primitive type
+        if (PRIMITIVE_TYPES.contains(lowercased))
+            return CQL3Type.Native.valueOf(unparsed.toUpperCase()).getType();
+
+        // special-case top-level UDTs
+        UserType udt = userTypes.getNullable(bytes(lowercased));
+        if (udt != null)
+            return udt;
+
+        return parseRaw(unparsed).prepareInternal(keyspace, userTypes).getType();
+    }
+
+    static CQL3Type.Raw parseRaw(String type)
+    {
+        return CQLFragmentParser.parseAny(CqlParser::comparatorType, type, "CQL type");
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/CachingParams.java b/src/java/org/apache/cassandra/schema/CachingParams.java
new file mode 100644
index 0000000..1976835
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/CachingParams.java

@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+import static java.lang.String.format;
+
+// CQL: {'keys' : 'ALL'|'NONE', 'rows_per_partition': '200'|'NONE'|'ALL'}
+public final class CachingParams
+{
+    public enum Option
+    {
+        KEYS,
+        ROWS_PER_PARTITION;
+
+        @Override
+        public String toString()
+        {
+            return name().toLowerCase();
+        }
+    }
+
+    private static final String ALL = "ALL";
+    private static final String NONE = "NONE";
+
+    static final boolean DEFAULT_CACHE_KEYS = true;
+    static final int DEFAULT_ROWS_PER_PARTITION_TO_CACHE = 0;
+
+    public static final CachingParams CACHE_NOTHING = new CachingParams(false, 0);
+    public static final CachingParams CACHE_KEYS = new CachingParams(true, 0);
+    public static final CachingParams CACHE_EVERYTHING = new CachingParams(true, Integer.MAX_VALUE);
+
+    @VisibleForTesting
+    public static CachingParams DEFAULT = new CachingParams(DEFAULT_CACHE_KEYS, DEFAULT_ROWS_PER_PARTITION_TO_CACHE);
+
+    final boolean cacheKeys;
+    final int rowsPerPartitionToCache;
+
+    public CachingParams(boolean cacheKeys, int rowsPerPartitionToCache)
+    {
+        this.cacheKeys = cacheKeys;
+        this.rowsPerPartitionToCache = rowsPerPartitionToCache;
+    }
+
+    public boolean cacheKeys()
+    {
+        return cacheKeys;
+    }
+
+    public boolean cacheRows()
+    {
+        return rowsPerPartitionToCache > 0;
+    }
+
+    public boolean cacheAllRows()
+    {
+        return rowsPerPartitionToCache == Integer.MAX_VALUE;
+    }
+
+    public int rowsPerPartitionToCache()
+    {
+        return rowsPerPartitionToCache;
+    }
+
+    public static CachingParams fromMap(Map<String, String> map)
+    {
+        Map<String, String> copy = new HashMap<>(map);
+
+        String keys = copy.remove(Option.KEYS.toString());
+        boolean cacheKeys = keys != null && keysFromString(keys);
+
+        String rows = copy.remove(Option.ROWS_PER_PARTITION.toString());
+        int rowsPerPartitionToCache = rows == null
+                                    ? 0
+                                    : rowsPerPartitionFromString(rows);
+
+        if (!copy.isEmpty())
+        {
+            throw new ConfigurationException(format("Invalid caching sub-options %s: only '%s' and '%s' are allowed",
+                                                    copy.keySet(),
+                                                    Option.KEYS,
+                                                    Option.ROWS_PER_PARTITION));
+        }
+
+        return new CachingParams(cacheKeys, rowsPerPartitionToCache);
+    }
+
+    public Map<String, String> asMap()
+    {
+        return ImmutableMap.of(Option.KEYS.toString(),
+                               keysAsString(),
+                               Option.ROWS_PER_PARTITION.toString(),
+                               rowsPerPartitionAsString());
+    }
+
+    private static boolean keysFromString(String value)
+    {
+        if (value.equalsIgnoreCase(ALL))
+            return true;
+
+        if (value.equalsIgnoreCase(NONE))
+            return false;
+
+        throw new ConfigurationException(format("Invalid value '%s' for caching sub-option '%s': only '%s' and '%s' are allowed",
+                                                value,
+                                                Option.KEYS,
+                                                ALL,
+                                                NONE));
+    }
+
+    String keysAsString()
+    {
+        return cacheKeys ? ALL : NONE;
+    }
+
+    private static int rowsPerPartitionFromString(String value)
+    {
+        if (value.equalsIgnoreCase(ALL))
+            return Integer.MAX_VALUE;
+
+        if (value.equalsIgnoreCase(NONE))
+            return 0;
+
+        if (StringUtils.isNumeric(value))
+            return Integer.parseInt(value);
+
+        throw new ConfigurationException(format("Invalid value '%s' for caching sub-option '%s':"
+                                                + " only '%s', '%s', and integer values are allowed",
+                                                value,
+                                                Option.ROWS_PER_PARTITION,
+                                                ALL,
+                                                NONE));
+    }
+
+    String rowsPerPartitionAsString()
+    {
+        if (rowsPerPartitionToCache == 0)
+            return NONE;
+        else if (rowsPerPartitionToCache == Integer.MAX_VALUE)
+            return ALL;
+        else
+            return Integer.toString(rowsPerPartitionToCache);
+    }
+
+    @Override
+    public String toString()
+    {
+        return format("{'%s' : '%s', '%s' : '%s'}",
+                      Option.KEYS,
+                      keysAsString(),
+                      Option.ROWS_PER_PARTITION,
+                      rowsPerPartitionAsString());
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof CachingParams))
+            return false;
+
+        CachingParams c = (CachingParams) o;
+
+        return cacheKeys == c.cacheKeys && rowsPerPartitionToCache == c.rowsPerPartitionToCache;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(cacheKeys, rowsPerPartitionToCache);
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/CompactionParams.java b/src/java/org/apache/cassandra/schema/CompactionParams.java
new file mode 100644
index 0000000..720efa3
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/CompactionParams.java

@@ -0,0 +1,304 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
+import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
+import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static java.lang.String.format;
+
+public final class CompactionParams
+{
+    private static final Logger logger = LoggerFactory.getLogger(CompactionParams.class);
+
+    public enum Option
+    {
+        CLASS,
+        ENABLED,
+        MIN_THRESHOLD,
+        MAX_THRESHOLD;
+
+        @Override
+        public String toString()
+        {
+            return name().toLowerCase();
+        }
+    }
+
+    public static final int DEFAULT_MIN_THRESHOLD = 4;
+    public static final int DEFAULT_MAX_THRESHOLD = 32;
+
+    public static final boolean DEFAULT_ENABLED = true;
+
+    public static final Map<String, String> DEFAULT_THRESHOLDS =
+        ImmutableMap.of(Option.MIN_THRESHOLD.toString(), Integer.toString(DEFAULT_MIN_THRESHOLD),
+                        Option.MAX_THRESHOLD.toString(), Integer.toString(DEFAULT_MAX_THRESHOLD));
+
+    public static final CompactionParams DEFAULT =
+        new CompactionParams(SizeTieredCompactionStrategy.class, DEFAULT_THRESHOLDS, DEFAULT_ENABLED);
+
+    private final Class<? extends AbstractCompactionStrategy> klass;
+    private final ImmutableMap<String, String> options;
+    private final boolean isEnabled;
+
+    private CompactionParams(Class<? extends AbstractCompactionStrategy> klass, Map<String, String> options, boolean isEnabled)
+    {
+        this.klass = klass;
+        this.options = ImmutableMap.copyOf(options);
+        this.isEnabled = isEnabled;
+    }
+
+    public static CompactionParams create(Class<? extends AbstractCompactionStrategy> klass, Map<String, String> options)
+    {
+        boolean isEnabled = options.containsKey(Option.ENABLED.toString())
+                          ? Boolean.parseBoolean(options.get(Option.ENABLED.toString()))
+                          : DEFAULT_ENABLED;
+
+        Map<String, String> allOptions = new HashMap<>(options);
+        if (supportsThresholdParams(klass))
+        {
+            allOptions.putIfAbsent(Option.MIN_THRESHOLD.toString(), Integer.toString(DEFAULT_MIN_THRESHOLD));
+            allOptions.putIfAbsent(Option.MAX_THRESHOLD.toString(), Integer.toString(DEFAULT_MAX_THRESHOLD));
+        }
+
+        return new CompactionParams(klass, allOptions, isEnabled);
+    }
+
+    public static CompactionParams scts(Map<String, String> options)
+    {
+        return create(SizeTieredCompactionStrategy.class, options);
+    }
+
+    public static CompactionParams lcs(Map<String, String> options)
+    {
+        return create(LeveledCompactionStrategy.class, options);
+    }
+
+    public int minCompactionThreshold()
+    {
+        String threshold = options.get(Option.MIN_THRESHOLD.toString());
+        return threshold == null
+             ? DEFAULT_MIN_THRESHOLD
+             : Integer.parseInt(threshold);
+    }
+
+    public int maxCompactionThreshold()
+    {
+        String threshold = options.get(Option.MAX_THRESHOLD.toString());
+        return threshold == null
+             ? DEFAULT_MAX_THRESHOLD
+             : Integer.parseInt(threshold);
+    }
+
+    public void validate()
+    {
+        try
+        {
+            Map<?, ?> unknownOptions = (Map) klass.getMethod("validateOptions", Map.class).invoke(null, options);
+            if (!unknownOptions.isEmpty())
+            {
+                throw new ConfigurationException(format("Properties specified %s are not understood by %s",
+                                                        unknownOptions.keySet(),
+                                                        klass.getSimpleName()));
+            }
+        }
+        catch (NoSuchMethodException e)
+        {
+            logger.warn("Compaction strategy {} does not have a static validateOptions method. Validation ignored",
+                        klass.getName());
+        }
+        catch (InvocationTargetException e)
+        {
+            if (e.getTargetException() instanceof ConfigurationException)
+                throw (ConfigurationException) e.getTargetException();
+
+            Throwable cause = e.getCause() == null
+                            ? e
+                            : e.getCause();
+
+            throw new ConfigurationException(format("%s.validateOptions() threw an error: %s %s",
+                                                    klass.getName(),
+                                                    cause.getClass().getName(),
+                                                    cause.getMessage()),
+                                             e);
+        }
+        catch (IllegalAccessException e)
+        {
+            throw new ConfigurationException("Cannot access method validateOptions in " + klass.getName(), e);
+        }
+
+        String minThreshold = options.get(Option.MIN_THRESHOLD.toString());
+        if (minThreshold != null && !StringUtils.isNumeric(minThreshold))
+        {
+            throw new ConfigurationException(format("Invalid value %s for '%s' compaction sub-option - must be an integer",
+                                                    minThreshold,
+                                                    Option.MIN_THRESHOLD));
+        }
+
+        String maxThreshold = options.get(Option.MAX_THRESHOLD.toString());
+        if (maxThreshold != null && !StringUtils.isNumeric(maxThreshold))
+        {
+            throw new ConfigurationException(format("Invalid value %s for '%s' compaction sub-option - must be an integer",
+                                                    maxThreshold,
+                                                    Option.MAX_THRESHOLD));
+        }
+
+        if (minCompactionThreshold() <= 0 || maxCompactionThreshold() <= 0)
+        {
+            throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been removed,"
+                                             + " set the compaction option 'enabled' to false instead.");
+        }
+
+        if (minCompactionThreshold() <= 1)
+        {
+            throw new ConfigurationException(format("Min compaction threshold cannot be less than 2 (got %d)",
+                                                    minCompactionThreshold()));
+        }
+
+        if (minCompactionThreshold() > maxCompactionThreshold())
+        {
+            throw new ConfigurationException(format("Min compaction threshold (got %d) cannot be greater than max compaction threshold (got %d)",
+                                                    minCompactionThreshold(),
+                                                    maxCompactionThreshold()));
+        }
+    }
+
+    double defaultBloomFilterFbChance()
+    {
+        return klass.equals(LeveledCompactionStrategy.class) ? 0.1 : 0.01;
+    }
+
+    public Class<? extends AbstractCompactionStrategy> klass()
+    {
+        return klass;
+    }
+
+    /**
+     * All strategy options - excluding 'class'.
+     */
+    public Map<String, String> options()
+    {
+        return options;
+    }
+
+    public boolean isEnabled()
+    {
+        return isEnabled;
+    }
+
+    public static CompactionParams fromMap(Map<String, String> map)
+    {
+        Map<String, String> options = new HashMap<>(map);
+
+        String className = options.remove(Option.CLASS.toString());
+        if (className == null)
+        {
+            throw new ConfigurationException(format("Missing sub-option '%s' for the '%s' option",
+                                                    Option.CLASS,
+                                                    TableParams.Option.COMPACTION));
+        }
+
+        return create(classFromName(className), options);
+    }
+
+    private static Class<? extends AbstractCompactionStrategy> classFromName(String name)
+    {
+        String className = name.contains(".")
+                         ? name
+                         : "org.apache.cassandra.db.compaction." + name;
+        Class<AbstractCompactionStrategy> strategyClass = FBUtilities.classForName(className, "compaction strategy");
+
+        if (!AbstractCompactionStrategy.class.isAssignableFrom(strategyClass))
+        {
+            throw new ConfigurationException(format("Compaction strategy class %s is not derived from AbstractReplicationStrategy",
+                                                    className));
+        }
+
+        return strategyClass;
+    }
+
+    /*
+     * LCS doesn't, STCS and DTCS do
+     */
+    @SuppressWarnings("unchecked")
+    public static boolean supportsThresholdParams(Class<? extends AbstractCompactionStrategy> klass)
+    {
+        try
+        {
+            Map<String, String> unrecognizedOptions =
+                (Map<String, String>) klass.getMethod("validateOptions", Map.class)
+                                           .invoke(null, DEFAULT_THRESHOLDS);
+
+            return unrecognizedOptions.isEmpty();
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public Map<String, String> asMap()
+    {
+        Map<String, String> map = new HashMap<>(options());
+        map.put(Option.CLASS.toString(), klass.getName());
+        return map;
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("class", klass.getName())
+                          .add("options", options)
+                          .toString();
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof CompactionParams))
+            return false;
+
+        CompactionParams cp = (CompactionParams) o;
+
+        return klass.equals(cp.klass) && options.equals(cp.options);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(klass, options);
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/CompressionParams.java b/src/java/org/apache/cassandra/schema/CompressionParams.java
new file mode 100644
index 0000000..cd1686f
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/CompressionParams.java

@@ -0,0 +1,551 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.lang3.builder.EqualsBuilder;
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.compress.*;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+import static java.lang.String.format;
+
+@SuppressWarnings("deprecation")
+public final class CompressionParams
+{
+    private static final Logger logger = LoggerFactory.getLogger(CompressionParams.class);
+
+    private static volatile boolean hasLoggedSsTableCompressionWarning;
+    private static volatile boolean hasLoggedChunkLengthWarning;
+    private static volatile boolean hasLoggedCrcCheckChanceWarning;
+
+    public static final int DEFAULT_CHUNK_LENGTH = 65536;
+    public static final IVersionedSerializer<CompressionParams> serializer = new Serializer();
+
+    public static final String CLASS = "class";
+    public static final String CHUNK_LENGTH_IN_KB = "chunk_length_in_kb";
+    public static final String ENABLED = "enabled";
+
+    public static final CompressionParams DEFAULT = new CompressionParams(LZ4Compressor.instance,
+                                                                          DEFAULT_CHUNK_LENGTH,
+                                                                          Collections.emptyMap());
+
+    private static final String CRC_CHECK_CHANCE_WARNING = "The option crc_check_chance was deprecated as a compression option. " +
+                                                           "You should specify it as a top-level table option instead";
+
+    @Deprecated public static final String SSTABLE_COMPRESSION = "sstable_compression";
+    @Deprecated public static final String CHUNK_LENGTH_KB = "chunk_length_kb";
+    @Deprecated public static final String CRC_CHECK_CHANCE = "crc_check_chance";
+
+    private final ICompressor sstableCompressor;
+    private final Integer chunkLength;
+    private final ImmutableMap<String, String> otherOptions; // Unrecognized options, can be used by the compressor
+
+    private volatile double crcCheckChance = 1.0;
+
+    public static CompressionParams fromMap(Map<String, String> opts)
+    {
+        Map<String, String> options = copyOptions(opts);
+
+        String sstableCompressionClass;
+
+        if (!opts.isEmpty() && isEnabled(opts) && !containsSstableCompressionClass(opts))
+            throw new ConfigurationException(format("Missing sub-option '%s' for the 'compression' option.", CLASS));
+
+        if (!removeEnabled(options))
+        {
+            sstableCompressionClass = null;
+
+            if (!options.isEmpty())
+                throw new ConfigurationException(format("If the '%s' option is set to false no other options must be specified", ENABLED));
+        }
+        else
+        {
+            sstableCompressionClass = removeSstableCompressionClass(options);
+        }
+
+        Integer chunkLength = removeChunkLength(options);
+
+        CompressionParams cp = new CompressionParams(sstableCompressionClass, chunkLength, options);
+        cp.validate();
+
+        return cp;
+    }
+
+    public Class<? extends ICompressor> klass()
+    {
+        return sstableCompressor.getClass();
+    }
+
+    public static CompressionParams noCompression()
+    {
+        return new CompressionParams((ICompressor) null, DEFAULT_CHUNK_LENGTH, Collections.emptyMap());
+    }
+
+    public static CompressionParams snappy()
+    {
+        return snappy(null);
+    }
+
+    public static CompressionParams snappy(Integer chunkLength)
+    {
+        return new CompressionParams(SnappyCompressor.instance, chunkLength, Collections.emptyMap());
+    }
+
+    public static CompressionParams deflate()
+    {
+        return deflate(null);
+    }
+
+    public static CompressionParams deflate(Integer chunkLength)
+    {
+        return new CompressionParams(DeflateCompressor.instance, chunkLength, Collections.emptyMap());
+    }
+
+    public static CompressionParams lz4()
+    {
+        return lz4(null);
+    }
+
+    public static CompressionParams lz4(Integer chunkLength)
+    {
+        return new CompressionParams(LZ4Compressor.instance, chunkLength, Collections.emptyMap());
+    }
+
+    public CompressionParams(String sstableCompressorClass, Integer chunkLength, Map<String, String> otherOptions) throws ConfigurationException
+    {
+        this(createCompressor(parseCompressorClass(sstableCompressorClass), otherOptions), chunkLength, otherOptions);
+    }
+
+    private CompressionParams(ICompressor sstableCompressor, Integer chunkLength, Map<String, String> otherOptions) throws ConfigurationException
+    {
+        this.sstableCompressor = sstableCompressor;
+        this.chunkLength = chunkLength;
+        this.otherOptions = ImmutableMap.copyOf(otherOptions);
+    }
+
+    public CompressionParams copy()
+    {
+        return new CompressionParams(sstableCompressor, chunkLength, otherOptions);
+    }
+
+    /**
+     * Checks if compression is enabled.
+     * @return {@code true} if compression is enabled, {@code false} otherwise.
+     */
+    public boolean isEnabled()
+    {
+        return sstableCompressor != null;
+    }
+
+    /**
+     * Returns the SSTable compressor.
+     * @return the SSTable compressor or {@code null} if compression is disabled.
+     */
+    public ICompressor getSstableCompressor()
+    {
+        return sstableCompressor;
+    }
+
+    public ImmutableMap<String, String> getOtherOptions()
+    {
+        return otherOptions;
+    }
+
+    public int chunkLength()
+    {
+        return chunkLength == null ? DEFAULT_CHUNK_LENGTH : chunkLength;
+    }
+
+    private static Class<?> parseCompressorClass(String className) throws ConfigurationException
+    {
+        if (className == null || className.isEmpty())
+            return null;
+
+        className = className.contains(".") ? className : "org.apache.cassandra.io.compress." + className;
+        try
+        {
+            return Class.forName(className);
+        }
+        catch (Exception e)
+        {
+            throw new ConfigurationException("Could not create Compression for type " + className, e);
+        }
+    }
+
+    private static ICompressor createCompressor(Class<?> compressorClass, Map<String, String> compressionOptions) throws ConfigurationException
+    {
+        if (compressorClass == null)
+        {
+            if (!compressionOptions.isEmpty())
+                throw new ConfigurationException("Unknown compression options (" + compressionOptions.keySet() + ") since no compression class found");
+            return null;
+        }
+
+        if (compressionOptions.containsKey(CRC_CHECK_CHANCE))
+        {
+            if (!hasLoggedCrcCheckChanceWarning)
+            {
+                logger.warn(CRC_CHECK_CHANCE_WARNING);
+                hasLoggedCrcCheckChanceWarning = true;
+            }
+            compressionOptions.remove(CRC_CHECK_CHANCE);
+        }
+
+        try
+        {
+            Method method = compressorClass.getMethod("create", Map.class);
+            ICompressor compressor = (ICompressor)method.invoke(null, compressionOptions);
+            // Check for unknown options
+            for (String provided : compressionOptions.keySet())
+                if (!compressor.supportedOptions().contains(provided))
+                    throw new ConfigurationException("Unknown compression options " + provided);
+            return compressor;
+        }
+        catch (NoSuchMethodException e)
+        {
+            throw new ConfigurationException("create method not found", e);
+        }
+        catch (SecurityException e)
+        {
+            throw new ConfigurationException("Access forbiden", e);
+        }
+        catch (IllegalAccessException e)
+        {
+            throw new ConfigurationException("Cannot access method create in " + compressorClass.getName(), e);
+        }
+        catch (InvocationTargetException e)
+        {
+            if (e.getTargetException() instanceof ConfigurationException)
+                throw (ConfigurationException) e.getTargetException();
+
+            Throwable cause = e.getCause() == null
+                            ? e
+                            : e.getCause();
+
+            throw new ConfigurationException(format("%s.create() threw an error: %s %s",
+                                                    compressorClass.getSimpleName(),
+                                                    cause.getClass().getName(),
+                                                    cause.getMessage()),
+                                             e);
+        }
+        catch (ExceptionInInitializerError e)
+        {
+            throw new ConfigurationException("Cannot initialize class " + compressorClass.getName());
+        }
+    }
+
+    public static ICompressor createCompressor(ParameterizedClass compression) throws ConfigurationException {
+        return createCompressor(parseCompressorClass(compression.class_name), copyOptions(compression.parameters));
+    }
+
+    private static Map<String, String> copyOptions(Map<? extends CharSequence, ? extends CharSequence> co)
+    {
+        if (co == null || co.isEmpty())
+            return Collections.<String, String>emptyMap();
+
+        Map<String, String> compressionOptions = new HashMap<>();
+        for (Map.Entry<? extends CharSequence, ? extends CharSequence> entry : co.entrySet())
+            compressionOptions.put(entry.getKey().toString(), entry.getValue().toString());
+        return compressionOptions;
+    }
+
+    /**
+     * Parse the chunk length (in KB) and returns it as bytes.
+     * 
+     * @param chLengthKB the length of the chunk to parse
+     * @return the chunk length in bytes
+     * @throws ConfigurationException if the chunk size is too large
+     */
+    private static Integer parseChunkLength(String chLengthKB) throws ConfigurationException
+    {
+        if (chLengthKB == null)
+            return null;
+
+        try
+        {
+            int parsed = Integer.parseInt(chLengthKB);
+            if (parsed > Integer.MAX_VALUE / 1024)
+                throw new ConfigurationException(format("Value of %s is too large (%s)", CHUNK_LENGTH_IN_KB,parsed));
+            return 1024 * parsed;
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException("Invalid value for " + CHUNK_LENGTH_IN_KB, e);
+        }
+    }
+
+    /**
+     * Removes the chunk length option from the specified set of option.
+     *
+     * @param options the options
+     * @return the chunk length value
+     */
+    private static Integer removeChunkLength(Map<String, String> options)
+    {
+        if (options.containsKey(CHUNK_LENGTH_IN_KB))
+        {
+            if (options.containsKey(CHUNK_LENGTH_KB))
+            {
+                throw new ConfigurationException(format("The '%s' option must not be used if the chunk length is already specified by the '%s' option",
+                                                        CHUNK_LENGTH_KB,
+                                                        CHUNK_LENGTH_IN_KB));
+            }
+
+            return parseChunkLength(options.remove(CHUNK_LENGTH_IN_KB));
+        }
+
+        if (options.containsKey(CHUNK_LENGTH_KB))
+        {
+            if (!hasLoggedChunkLengthWarning)
+            {
+                hasLoggedChunkLengthWarning = true;
+                logger.warn(format("The %s option has been deprecated. You should use %s instead",
+                                   CHUNK_LENGTH_KB,
+                                   CHUNK_LENGTH_IN_KB));
+            }
+
+            return parseChunkLength(options.remove(CHUNK_LENGTH_KB));
+        }
+
+        return null;
+    }
+
+    /**
+     * Returns {@code true} if the specified options contains the name of the compression class to be used,
+     * {@code false} otherwise.
+     *
+     * @param options the options
+     * @return {@code true} if the specified options contains the name of the compression class to be used,
+     * {@code false} otherwise.
+     */
+    public static boolean containsSstableCompressionClass(Map<String, String> options)
+    {
+        return options.containsKey(CLASS) || options.containsKey(SSTABLE_COMPRESSION);
+    }
+
+    /**
+     * Removes the option specifying the name of the compression class
+     *
+     * @param options the options
+     * @return the name of the compression class
+     */
+    private static String removeSstableCompressionClass(Map<String, String> options)
+    {
+        if (options.containsKey(CLASS))
+        {
+            if (options.containsKey(SSTABLE_COMPRESSION))
+                throw new ConfigurationException(format("The '%s' option must not be used if the compression algorithm is already specified by the '%s' option",
+                                                        SSTABLE_COMPRESSION,
+                                                        CLASS));
+
+            String clazz = options.remove(CLASS);
+            if (clazz.isEmpty())
+                throw new ConfigurationException(format("The '%s' option must not be empty. To disable compression use 'enabled' : false", CLASS));
+
+            return clazz;
+        }
+
+        if (options.containsKey(SSTABLE_COMPRESSION) && !hasLoggedSsTableCompressionWarning)
+        {
+            hasLoggedSsTableCompressionWarning = true;
+            logger.warn(format("The %s option has been deprecated. You should use %s instead",
+                               SSTABLE_COMPRESSION,
+                               CLASS));
+        }
+
+        return options.remove(SSTABLE_COMPRESSION);
+    }
+
+    /**
+     * Returns {@code true} if the options contains the {@code enabled} option and that its value is
+     * {@code true}, otherwise returns {@code false}.
+     *
+     * @param options the options
+     * @return {@code true} if the options contains the {@code enabled} option and that its value is
+     * {@code true}, otherwise returns {@code false}.
+     */
+    public static boolean isEnabled(Map<String, String> options)
+    {
+        String enabled = options.get(ENABLED);
+        return enabled == null || Boolean.parseBoolean(enabled);
+    }
+
+    /**
+     * Removes the {@code enabled} option from the specified options.
+     *
+     * @param options the options
+     * @return the value of the {@code enabled} option
+     */
+    private static boolean removeEnabled(Map<String, String> options)
+    {
+        String enabled = options.remove(ENABLED);
+        return enabled == null || Boolean.parseBoolean(enabled);
+    }
+
+    // chunkLength must be a power of 2 because we assume so when
+    // computing the chunk number from an uncompressed file offset (see
+    // CompressedRandomAccessReader.decompresseChunk())
+    public void validate() throws ConfigurationException
+    {
+        // if chunk length was not set (chunkLength == null), this is fine, default will be used
+        if (chunkLength != null)
+        {
+            if (chunkLength <= 0)
+                throw new ConfigurationException("Invalid negative or null " + CHUNK_LENGTH_IN_KB);
+
+            int c = chunkLength;
+            boolean found = false;
+            while (c != 0)
+            {
+                if ((c & 0x01) != 0)
+                {
+                    if (found)
+                        throw new ConfigurationException(CHUNK_LENGTH_IN_KB + " must be a power of 2");
+                    else
+                        found = true;
+                }
+                c >>= 1;
+            }
+        }
+    }
+
+    public Map<String, String> asMap()
+    {
+        if (!isEnabled())
+            return Collections.singletonMap(ENABLED, "false");
+
+        Map<String, String> options = new HashMap<>(otherOptions);
+        options.put(CLASS, sstableCompressor.getClass().getName());
+        options.put(CHUNK_LENGTH_IN_KB, chunkLengthInKB());
+
+        return options;
+    }
+
+    public String chunkLengthInKB()
+    {
+        return String.valueOf(chunkLength() / 1024);
+    }
+
+    public void setCrcCheckChance(double crcCheckChance)
+    {
+        this.crcCheckChance = crcCheckChance;
+    }
+
+    public double getCrcCheckChance()
+    {
+        return crcCheckChance;
+    }
+
+    @Override
+    public boolean equals(Object obj)
+    {
+        if (obj == this)
+        {
+            return true;
+        }
+        else if (obj == null || obj.getClass() != getClass())
+        {
+            return false;
+        }
+
+        CompressionParams cp = (CompressionParams) obj;
+        return new EqualsBuilder()
+            .append(sstableCompressor, cp.sstableCompressor)
+            .append(chunkLength(), cp.chunkLength())
+            .append(otherOptions, cp.otherOptions)
+            .isEquals();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return new HashCodeBuilder(29, 1597)
+            .append(sstableCompressor)
+            .append(chunkLength())
+            .append(otherOptions)
+            .toHashCode();
+    }
+
+    static class Serializer implements IVersionedSerializer<CompressionParams>
+    {
+        public void serialize(CompressionParams parameters, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeUTF(parameters.sstableCompressor.getClass().getSimpleName());
+            out.writeInt(parameters.otherOptions.size());
+            for (Map.Entry<String, String> entry : parameters.otherOptions.entrySet())
+            {
+                out.writeUTF(entry.getKey());
+                out.writeUTF(entry.getValue());
+            }
+            out.writeInt(parameters.chunkLength());
+        }
+
+        public CompressionParams deserialize(DataInputPlus in, int version) throws IOException
+        {
+            String compressorName = in.readUTF();
+            int optionCount = in.readInt();
+            Map<String, String> options = new HashMap<>();
+            for (int i = 0; i < optionCount; ++i)
+            {
+                String key = in.readUTF();
+                String value = in.readUTF();
+                options.put(key, value);
+            }
+            int chunkLength = in.readInt();
+            CompressionParams parameters;
+            try
+            {
+                parameters = new CompressionParams(compressorName, chunkLength, options);
+            }
+            catch (ConfigurationException e)
+            {
+                throw new RuntimeException("Cannot create CompressionParams for parameters", e);
+            }
+            return parameters;
+        }
+
+        public long serializedSize(CompressionParams parameters, int version)
+        {
+            long size = TypeSizes.sizeof(parameters.sstableCompressor.getClass().getSimpleName());
+            size += TypeSizes.sizeof(parameters.otherOptions.size());
+            for (Map.Entry<String, String> entry : parameters.otherOptions.entrySet())
+            {
+                size += TypeSizes.sizeof(entry.getKey());
+                size += TypeSizes.sizeof(entry.getValue());
+            }
+            size += TypeSizes.sizeof(parameters.chunkLength());
+            return size;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Functions.java b/src/java/org/apache/cassandra/schema/Functions.java
new file mode 100644
index 0000000..c65f58d
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Functions.java

@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.google.common.collect.ImmutableMultimap;
+
+import org.apache.cassandra.cql3.functions.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+
+import static com.google.common.collect.Iterables.filter;
+
+/**
+ * An immutable container for a keyspace's UDAs and UDFs (and, in case of {@link org.apache.cassandra.db.SystemKeyspace},
+ * native functions and aggregates).
+ */
+public final class Functions implements Iterable<Function>
+{
+    private final ImmutableMultimap<FunctionName, Function> functions;
+
+    private Functions(Builder builder)
+    {
+        functions = builder.functions.build();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Functions none()
+    {
+        return builder().build();
+    }
+
+    public static Functions of(Function... funs)
+    {
+        return builder().add(funs).build();
+    }
+
+    public Iterator<Function> iterator()
+    {
+        return functions.values().iterator();
+    }
+
+    public Stream<Function> stream()
+    {
+        return functions.values().stream();
+    }
+
+    /**
+     * @return a stream of keyspace's UDFs
+     */
+    public Stream<UDFunction> udfs()
+    {
+        return stream().filter(f -> f instanceof UDFunction).map(f -> (UDFunction) f);
+    }
+
+    /**
+     * @return a stream of keyspace's UDAs
+     */
+    public Stream<UDAggregate> udas()
+    {
+        return stream().filter(f -> f instanceof UDAggregate).map(f -> (UDAggregate) f);
+    }
+
+    /**
+     * @return a collection of aggregates that use the provided function as either a state or a final function
+     * @param function the referree function
+     */
+    public Collection<UDAggregate> aggregatesUsingFunction(Function function)
+    {
+        return udas().filter(uda -> uda.hasReferenceTo(function)).collect(Collectors.toList());
+    }
+
+    /**
+     * Get all function overloads with the specified name
+     *
+     * @param name fully qualified function name
+     * @return an empty list if the function name is not found; a non-empty collection of {@link Function} otherwise
+     */
+    public Collection<Function> get(FunctionName name)
+    {
+        return functions.get(name);
+    }
+
+    /**
+     * Find the function with the specified name
+     *
+     * @param name fully qualified function name
+     * @param argTypes function argument types
+     * @return an empty {@link Optional} if the function name is not found; a non-empty optional of {@link Function} otherwise
+     */
+    public Optional<Function> find(FunctionName name, List<AbstractType<?>> argTypes)
+    {
+        return get(name).stream()
+                        .filter(fun -> typesMatch(fun.argTypes(), argTypes))
+                        .findAny();
+    }
+
+    /*
+     * We need to compare the CQL3 representation of the type because comparing
+     * the AbstractType will fail for example if a UDT has been changed.
+     * Reason is that UserType.equals() takes the field names and types into account.
+     * Example CQL sequence that would fail when comparing AbstractType:
+     *    CREATE TYPE foo ...
+     *    CREATE FUNCTION bar ( par foo ) RETURNS foo ...
+     *    ALTER TYPE foo ADD ...
+     * or
+     *    ALTER TYPE foo ALTER ...
+     * or
+     *    ALTER TYPE foo RENAME ...
+     */
+    public static boolean typesMatch(AbstractType<?> t1, AbstractType<?> t2)
+    {
+        return t1.asCQL3Type().toString().equals(t2.asCQL3Type().toString());
+    }
+
+    public static boolean typesMatch(List<AbstractType<?>> t1, List<AbstractType<?>> t2)
+    {
+        if (t1.size() != t2.size())
+            return false;
+
+        for (int i = 0; i < t1.size(); i++)
+            if (!typesMatch(t1.get(i), t2.get(i)))
+                return false;
+
+        return true;
+    }
+
+    public static int typeHashCode(AbstractType<?> t)
+    {
+        return t.asCQL3Type().toString().hashCode();
+    }
+
+    public static int typeHashCode(List<AbstractType<?>> types)
+    {
+        int h = 0;
+        for (AbstractType<?> type : types)
+            h = h * 31 + typeHashCode(type);
+        return h;
+    }
+
+    /**
+     * Create a Functions instance with the provided function added
+     */
+    public Functions with(Function fun)
+    {
+        if (find(fun.name(), fun.argTypes()).isPresent())
+            throw new IllegalStateException(String.format("Function %s already exists", fun.name()));
+
+        return builder().add(this).add(fun).build();
+    }
+
+    /**
+     * Creates a Functions instance with the function with the provided name and argument types removed
+     */
+    public Functions without(FunctionName name, List<AbstractType<?>> argTypes)
+    {
+        Function fun =
+            find(name, argTypes).orElseThrow(() -> new IllegalStateException(String.format("Function %s doesn't exists", name)));
+
+        return builder().add(filter(this, f -> f != fun)).build();
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Functions && functions.equals(((Functions) o).functions));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return functions.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return functions.values().toString();
+    }
+
+    public static final class Builder
+    {
+        final ImmutableMultimap.Builder<FunctionName, Function> functions = new ImmutableMultimap.Builder<>();
+
+        private Builder()
+        {
+            // we need deterministic iteration order; otherwise Functions.equals() breaks down
+            functions.orderValuesBy((f1, f2) -> Integer.compare(f1.hashCode(), f2.hashCode()));
+        }
+
+        public Functions build()
+        {
+            return new Functions(this);
+        }
+
+        public Builder add(Function fun)
+        {
+            functions.put(fun.name(), fun);
+            return this;
+        }
+
+        public Builder add(Function... funs)
+        {
+            for (Function fun : funs)
+                add(fun);
+            return this;
+        }
+
+        public  Builder add(Iterable<? extends Function> funs)
+        {
+            funs.forEach(this::add);
+            return this;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/IndexMetadata.java b/src/java/org/apache/cassandra/schema/IndexMetadata.java
new file mode 100644
index 0000000..7c60a64
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/IndexMetadata.java

@@ -0,0 +1,276 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.schema;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Maps;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDSerializer;
+
+/**
+ * An immutable representation of secondary index metadata.
+ */
+public final class IndexMetadata
+{
+    private static final Logger logger = LoggerFactory.getLogger(IndexMetadata.class);
+
+    public static final Serializer serializer = new Serializer();
+
+    public enum Kind
+    {
+        KEYS, CUSTOM, COMPOSITES
+    }
+
+    // UUID for serialization. This is a deterministic UUID generated from the index name
+    // Both the id and name are guaranteed unique per keyspace.
+    public final UUID id;
+    public final String name;
+    public final Kind kind;
+    public final Map<String, String> options;
+
+    private IndexMetadata(String name,
+                          Map<String, String> options,
+                          Kind kind)
+    {
+        this.id = UUID.nameUUIDFromBytes(name.getBytes());
+        this.name = name;
+        this.options = options == null ? ImmutableMap.of() : ImmutableMap.copyOf(options);
+        this.kind = kind;
+    }
+
+    public static IndexMetadata fromLegacyMetadata(CFMetaData cfm,
+                                                   ColumnDefinition column,
+                                                   String name,
+                                                   Kind kind,
+                                                   Map<String, String> options)
+    {
+        Map<String, String> newOptions = new HashMap<>();
+        if (options != null)
+            newOptions.putAll(options);
+
+        IndexTarget target;
+        if (newOptions.containsKey(IndexTarget.INDEX_KEYS_OPTION_NAME))
+        {
+            newOptions.remove(IndexTarget.INDEX_KEYS_OPTION_NAME);
+            target = new IndexTarget(column.name, IndexTarget.Type.KEYS);
+        }
+        else if (newOptions.containsKey(IndexTarget.INDEX_ENTRIES_OPTION_NAME))
+        {
+            newOptions.remove(IndexTarget.INDEX_KEYS_OPTION_NAME);
+            target = new IndexTarget(column.name, IndexTarget.Type.KEYS_AND_VALUES);
+        }
+        else
+        {
+            if (column.type.isCollection() && !column.type.isMultiCell())
+            {
+                target = new IndexTarget(column.name, IndexTarget.Type.FULL);
+            }
+            else
+            {
+                target = new IndexTarget(column.name, IndexTarget.Type.VALUES);
+            }
+        }
+        newOptions.put(IndexTarget.TARGET_OPTION_NAME, target.asCqlString(cfm));
+        return new IndexMetadata(name, newOptions, kind);
+    }
+
+    public static IndexMetadata fromSchemaMetadata(String name, Kind kind, Map<String, String> options)
+    {
+        return new IndexMetadata(name, options, kind);
+    }
+
+    public static IndexMetadata fromIndexTargets(CFMetaData cfm,
+                                                 List<IndexTarget> targets,
+                                                 String name,
+                                                 Kind kind,
+                                                 Map<String, String> options)
+    {
+        Map<String, String> newOptions = new HashMap<>(options);
+        newOptions.put(IndexTarget.TARGET_OPTION_NAME, targets.stream()
+                                                              .map(target -> target.asCqlString(cfm))
+                                                              .collect(Collectors.joining(", ")));
+        return new IndexMetadata(name, newOptions, kind);
+    }
+
+    public static boolean isNameValid(String name)
+    {
+        return name != null && !name.isEmpty() && name.matches("\\w+");
+    }
+
+    public static String getDefaultIndexName(String cfName, String root)
+    {
+        if (root == null)
+            return (cfName + "_" + "idx").replaceAll("\\W", "");
+        else
+            return (cfName + "_" + root + "_idx").replaceAll("\\W", "");
+    }
+
+    public void validate(CFMetaData cfm)
+    {
+        if (!isNameValid(name))
+            throw new ConfigurationException("Illegal index name " + name);
+
+        if (kind == null)
+            throw new ConfigurationException("Index kind is null for index " + name);
+
+        if (kind == Kind.CUSTOM)
+        {
+            if (options == null || !options.containsKey(IndexTarget.CUSTOM_INDEX_OPTION_NAME))
+                throw new ConfigurationException(String.format("Required option missing for index %s : %s",
+                                                               name, IndexTarget.CUSTOM_INDEX_OPTION_NAME));
+            String className = options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME);
+            Class<Index> indexerClass = FBUtilities.classForName(className, "custom indexer");
+            if(!Index.class.isAssignableFrom(indexerClass))
+                throw new ConfigurationException(String.format("Specified Indexer class (%s) does not implement the Indexer interface", className));
+            validateCustomIndexOptions(cfm, indexerClass, options);
+        }
+    }
+
+    private void validateCustomIndexOptions(CFMetaData cfm,
+                                            Class<? extends Index> indexerClass,
+                                            Map<String, String> options)
+    throws ConfigurationException
+    {
+        try
+        {
+            Map<String, String> filteredOptions =
+                Maps.filterKeys(options,key -> !key.equals(IndexTarget.CUSTOM_INDEX_OPTION_NAME));
+
+            if (filteredOptions.isEmpty())
+                return;
+
+            Map<?,?> unknownOptions;
+            try
+            {
+                unknownOptions = (Map) indexerClass.getMethod("validateOptions", Map.class, CFMetaData.class).invoke(null, filteredOptions, cfm);
+            }
+            catch (NoSuchMethodException e)
+            {
+                unknownOptions = (Map) indexerClass.getMethod("validateOptions", Map.class).invoke(null, filteredOptions);
+            }
+
+            if (!unknownOptions.isEmpty())
+                throw new ConfigurationException(String.format("Properties specified %s are not understood by %s", unknownOptions.keySet(), indexerClass.getSimpleName()));
+        }
+        catch (NoSuchMethodException e)
+        {
+            logger.info("Indexer {} does not have a static validateOptions method. Validation ignored",
+                        indexerClass.getName());
+        }
+        catch (InvocationTargetException e)
+        {
+            if (e.getTargetException() instanceof ConfigurationException)
+                throw (ConfigurationException) e.getTargetException();
+            throw new ConfigurationException("Failed to validate custom indexer options: " + options);
+        }
+        catch (ConfigurationException e)
+        {
+            throw e;
+        }
+        catch (Exception e)
+        {
+            throw new ConfigurationException("Failed to validate custom indexer options: " + options);
+        }
+    }
+
+    public boolean isCustom()
+    {
+        return kind == Kind.CUSTOM;
+    }
+
+    public boolean isKeys()
+    {
+        return kind == Kind.KEYS;
+    }
+
+    public boolean isComposites()
+    {
+        return kind == Kind.COMPOSITES;
+    }
+
+    public int hashCode()
+    {
+        return Objects.hashCode(id, name, kind, options);
+    }
+
+    public boolean equalsWithoutName(IndexMetadata other)
+    {
+        return Objects.equal(kind, other.kind)
+            && Objects.equal(options, other.options);
+    }
+
+    public boolean equals(Object obj)
+    {
+        if (obj == this)
+            return true;
+
+        if (!(obj instanceof IndexMetadata))
+            return false;
+
+        IndexMetadata other = (IndexMetadata)obj;
+
+        return Objects.equal(id, other.id) && Objects.equal(name, other.name) && equalsWithoutName(other);
+    }
+
+    public String toString()
+    {
+        return new ToStringBuilder(this)
+            .append("id", id.toString())
+            .append("name", name)
+            .append("kind", kind)
+            .append("options", options)
+            .build();
+    }
+
+    public static class Serializer
+    {
+        public void serialize(IndexMetadata metadata, DataOutputPlus out, int version) throws IOException
+        {
+            UUIDSerializer.serializer.serialize(metadata.id, out, version);
+        }
+
+        public IndexMetadata deserialize(DataInputPlus in, int version, CFMetaData cfm) throws IOException
+        {
+            UUID id = UUIDSerializer.serializer.deserialize(in, version);
+            return cfm.getIndexes().get(id).orElseThrow(() -> new UnknownIndexException(cfm, id));
+        }
+
+        public long serializedSize(IndexMetadata metadata, int version)
+        {
+            return UUIDSerializer.serializer.serializedSize(metadata.id, version);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Indexes.java b/src/java/org/apache/cassandra/schema/Indexes.java
new file mode 100644
index 0000000..49a1d3b
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Indexes.java

@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.schema;
+
+import java.util.*;
+
+import com.google.common.collect.ImmutableMap;
+
+import org.apache.cassandra.config.Schema;
+
+import static com.google.common.collect.Iterables.filter;
+
+/**
+ * For backwards compatibility, in the first instance an IndexMetadata must have
+ * TargetType.COLUMN and its Set of target columns must contain only a single
+ * ColumnIdentifier. Hence, this is what is enforced by the public factory methods
+ * on IndexMetadata.
+ * These constraints, along with the internal datastructures here will be relaxed as
+ * support is added for multiple target columns per-index and for indexes with
+ * TargetType.ROW
+ */
+public class Indexes implements Iterable<IndexMetadata>
+{
+    private final ImmutableMap<String, IndexMetadata> indexesByName;
+    private final ImmutableMap<UUID, IndexMetadata> indexesById;
+
+    private Indexes(Builder builder)
+    {
+        indexesByName = builder.indexesByName.build();
+        indexesById = builder.indexesById.build();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Indexes none()
+    {
+        return builder().build();
+    }
+
+    public Iterator<IndexMetadata> iterator()
+    {
+        return indexesByName.values().iterator();
+    }
+
+    public int size()
+    {
+        return indexesByName.size();
+    }
+
+    public boolean isEmpty()
+    {
+        return indexesByName.isEmpty();
+    }
+
+    /**
+     * Get the index with the specified name
+     *
+     * @param name a non-qualified index name
+     * @return an empty {@link Optional} if the named index is not found; a non-empty optional of {@link IndexMetadata} otherwise
+     */
+    public Optional<IndexMetadata> get(String name)
+    {
+        return Optional.ofNullable(indexesByName.get(name));
+    }
+
+    /**
+     * Answer true if contains an index with the specified name.
+     * @param name a non-qualified index name.
+     * @return true if the named index is found; false otherwise
+     */
+    public boolean has(String name)
+    {
+        return indexesByName.containsKey(name);
+    }
+
+    /**
+     * Get the index with the specified id
+     *
+     * @param name a UUID which identifies an index
+     * @return an empty {@link Optional} if no index with the specified id is found; a non-empty optional of
+     *         {@link IndexMetadata} otherwise
+     */
+
+    public Optional<IndexMetadata> get(UUID id)
+    {
+        return Optional.ofNullable(indexesById.get(id));
+    }
+
+    /**
+     * Answer true if contains an index with the specified id.
+     * @param name a UUID which identifies an index.
+     * @return true if an index with the specified id is found; false otherwise
+     */
+    public boolean has(UUID id)
+    {
+        return indexesById.containsKey(id);
+    }
+
+    /**
+     * Create a SecondaryIndexes instance with the provided index added
+     */
+    public Indexes with(IndexMetadata index)
+    {
+        if (get(index.name).isPresent())
+            throw new IllegalStateException(String.format("Index %s already exists", index.name));
+
+        return builder().add(this).add(index).build();
+    }
+
+    /**
+     * Creates a SecondaryIndexes instance with the index with the provided name removed
+     */
+    public Indexes without(String name)
+    {
+        IndexMetadata index = get(name).orElseThrow(() -> new IllegalStateException(String.format("Index %s doesn't exist", name)));
+        return builder().add(filter(this, v -> v != index)).build();
+    }
+
+    /**
+     * Creates a SecondaryIndexes instance which contains an updated index definition
+     */
+    public Indexes replace(IndexMetadata index)
+    {
+        return without(index.name).with(index);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Indexes && indexesByName.equals(((Indexes) o).indexesByName));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return indexesByName.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return indexesByName.values().toString();
+    }
+
+    public static String getAvailableIndexName(String ksName, String cfName, String indexNameRoot)
+    {
+
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ksName);
+        Set<String> existingNames = ksm == null ? new HashSet<>() : ksm.existingIndexNames(null);
+        String baseName = IndexMetadata.getDefaultIndexName(cfName, indexNameRoot);
+        String acceptedName = baseName;
+        int i = 0;
+        while (existingNames.contains(acceptedName))
+            acceptedName = baseName + '_' + (++i);
+
+        return acceptedName;
+    }
+
+    public static final class Builder
+    {
+        final ImmutableMap.Builder<String, IndexMetadata> indexesByName = new ImmutableMap.Builder<>();
+        final ImmutableMap.Builder<UUID, IndexMetadata> indexesById = new ImmutableMap.Builder<>();
+
+        private Builder()
+        {
+        }
+
+        public Indexes build()
+        {
+            return new Indexes(this);
+        }
+
+        public Builder add(IndexMetadata index)
+        {
+            indexesByName.put(index.name, index);
+            indexesById.put(index.id, index);
+            return this;
+        }
+
+        public Builder add(Iterable<IndexMetadata> indexes)
+        {
+            indexes.forEach(this::add);
+            return this;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java b/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java
new file mode 100644
index 0000000..76ba27d
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java

@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+
+import javax.annotation.Nullable;
+
+import com.google.common.base.Objects;
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+/**
+ * An immutable representation of keyspace metadata (name, params, tables, types, and functions).
+ */
+public final class KeyspaceMetadata
+{
+    public final String name;
+    public final KeyspaceParams params;
+    public final Tables tables;
+    public final Views views;
+    public final Types types;
+    public final Functions functions;
+
+    private KeyspaceMetadata(String name, KeyspaceParams params, Tables tables, Views views, Types types, Functions functions)
+    {
+        this.name = name;
+        this.params = params;
+        this.tables = tables;
+        this.views = views;
+        this.types = types;
+        this.functions = functions;
+    }
+
+    public static KeyspaceMetadata create(String name, KeyspaceParams params)
+    {
+        return new KeyspaceMetadata(name, params, Tables.none(), Views.none(), Types.none(), Functions.none());
+    }
+
+    public static KeyspaceMetadata create(String name, KeyspaceParams params, Tables tables)
+    {
+        return new KeyspaceMetadata(name, params, tables, Views.none(), Types.none(), Functions.none());
+    }
+
+    public static KeyspaceMetadata create(String name, KeyspaceParams params, Tables tables, Views views, Types types, Functions functions)
+    {
+        return new KeyspaceMetadata(name, params, tables, views, types, functions);
+    }
+
+    public KeyspaceMetadata withSwapped(KeyspaceParams params)
+    {
+        return new KeyspaceMetadata(name, params, tables, views, types, functions);
+    }
+
+    public KeyspaceMetadata withSwapped(Tables regular)
+    {
+        return new KeyspaceMetadata(name, params, regular, views, types, functions);
+    }
+
+    public KeyspaceMetadata withSwapped(Views views)
+    {
+        return new KeyspaceMetadata(name, params, tables, views, types, functions);
+    }
+
+    public KeyspaceMetadata withSwapped(Types types)
+    {
+        return new KeyspaceMetadata(name, params, tables, views, types, functions);
+    }
+
+    public KeyspaceMetadata withSwapped(Functions functions)
+    {
+        return new KeyspaceMetadata(name, params, tables, views, types, functions);
+    }
+
+    public Iterable<CFMetaData> tablesAndViews()
+    {
+        return Iterables.concat(tables, views.metadatas());
+    }
+
+    @Nullable
+    public CFMetaData getTableOrViewNullable(String tableOrViewName)
+    {
+        ViewDefinition view = views.getNullable(tableOrViewName);
+        return view == null
+             ? tables.getNullable(tableOrViewName)
+             : view.metadata;
+    }
+
+    public Set<String> existingIndexNames(String cfToExclude)
+    {
+        Set<String> indexNames = new HashSet<>();
+        for (CFMetaData table : tables)
+            if (cfToExclude == null || !table.cfName.equals(cfToExclude))
+                for (IndexMetadata index : table.getIndexes())
+                    indexNames.add(index.name);
+        return indexNames;
+    }
+
+    public Optional<CFMetaData> findIndexedTable(String indexName)
+    {
+        for (CFMetaData cfm : tablesAndViews())
+            if (cfm.getIndexes().has(indexName))
+                return Optional.of(cfm);
+
+        return Optional.empty();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(name, params, tables, views, functions, types);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof KeyspaceMetadata))
+            return false;
+
+        KeyspaceMetadata other = (KeyspaceMetadata) o;
+
+        return name.equals(other.name)
+            && params.equals(other.params)
+            && tables.equals(other.tables)
+            && views.equals(other.views)
+            && functions.equals(other.functions)
+            && types.equals(other.types);
+    }
+
+    @Override
+    public String toString()
+    {
+        return Objects.toStringHelper(this)
+                      .add("name", name)
+                      .add("params", params)
+                      .add("tables", tables)
+                      .add("views", views)
+                      .add("functions", functions)
+                      .add("types", types)
+                      .toString();
+    }
+
+    public void validate()
+    {
+        if (!CFMetaData.isNameValid(name))
+            throw new ConfigurationException(String.format("Keyspace name must not be empty, more than %s characters long, "
+                                                           + "or contain non-alphanumeric-underscore characters (got \"%s\")",
+                                                           Schema.NAME_LENGTH,
+                                                           name));
+        params.validate(name);
+        tablesAndViews().forEach(CFMetaData::validate);
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/KeyspaceParams.java b/src/java/org/apache/cassandra/schema/KeyspaceParams.java
new file mode 100644
index 0000000..2ea18ca
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/KeyspaceParams.java

@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.Map;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Objects;
+
+/**
+ * An immutable class representing keyspace parameters (durability and replication).
+ */
+public final class KeyspaceParams
+{
+    public static final boolean DEFAULT_DURABLE_WRITES = true;
+
+    /**
+     * This determines durable writes for the {@link org.apache.cassandra.db.SystemKeyspace#NAME}
+     * and {@link SchemaKeyspace#NAME} keyspaces, the only reason it is not final is for commitlog
+     * unit tests. It should only be changed for testing purposes.
+     */
+    @VisibleForTesting
+    public static boolean DEFAULT_LOCAL_DURABLE_WRITES = true;
+
+    public enum Option
+    {
+        DURABLE_WRITES,
+        REPLICATION;
+
+        @Override
+        public String toString()
+        {
+            return name().toLowerCase();
+        }
+    }
+
+    public final boolean durableWrites;
+    public final ReplicationParams replication;
+
+    public KeyspaceParams(boolean durableWrites, ReplicationParams replication)
+    {
+        this.durableWrites = durableWrites;
+        this.replication = replication;
+    }
+
+    public static KeyspaceParams create(boolean durableWrites, Map<String, String> replication)
+    {
+        return new KeyspaceParams(durableWrites, ReplicationParams.fromMap(replication));
+    }
+
+    public static KeyspaceParams local()
+    {
+        return new KeyspaceParams(DEFAULT_LOCAL_DURABLE_WRITES, ReplicationParams.local());
+    }
+
+    public static KeyspaceParams simple(int replicationFactor)
+    {
+        return new KeyspaceParams(true, ReplicationParams.simple(replicationFactor));
+    }
+
+    public static KeyspaceParams simpleTransient(int replicationFactor)
+    {
+        return new KeyspaceParams(false, ReplicationParams.simple(replicationFactor));
+    }
+
+    public static KeyspaceParams nts(Object... args)
+    {
+        return new KeyspaceParams(true, ReplicationParams.nts(args));
+    }
+
+    public void validate(String name)
+    {
+        replication.validate(name);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof KeyspaceParams))
+            return false;
+
+        KeyspaceParams p = (KeyspaceParams) o;
+
+        return durableWrites == p.durableWrites && replication.equals(p.replication);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(durableWrites, replication);
+    }
+
+    @Override
+    public String toString()
+    {
+        return Objects.toStringHelper(this)
+                      .add(Option.DURABLE_WRITES.toString(), durableWrites)
+                      .add(Option.REPLICATION.toString(), replication)
+                      .toString();
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Keyspaces.java b/src/java/org/apache/cassandra/schema/Keyspaces.java
new file mode 100644
index 0000000..8c0a63e
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Keyspaces.java

@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.Iterator;
+import java.util.function.Predicate;
+import java.util.stream.Stream;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.MapDifference;
+import com.google.common.collect.Maps;
+
+public final class Keyspaces implements Iterable<KeyspaceMetadata>
+{
+    private final ImmutableMap<String, KeyspaceMetadata> keyspaces;
+
+    private Keyspaces(Builder builder)
+    {
+        keyspaces = builder.keyspaces.build();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Keyspaces none()
+    {
+        return builder().build();
+    }
+
+    public static Keyspaces of(KeyspaceMetadata... keyspaces)
+    {
+        return builder().add(keyspaces).build();
+    }
+
+    public Iterator<KeyspaceMetadata> iterator()
+    {
+        return keyspaces.values().iterator();
+    }
+
+    public Stream<KeyspaceMetadata> stream()
+    {
+        return keyspaces.values().stream();
+    }
+
+    public Keyspaces filter(Predicate<KeyspaceMetadata> predicate)
+    {
+        Builder builder = builder();
+        stream().filter(predicate).forEach(builder::add);
+        return builder.build();
+    }
+
+    MapDifference<String, KeyspaceMetadata> diff(Keyspaces other)
+    {
+        return Maps.difference(keyspaces, other.keyspaces);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Keyspaces && keyspaces.equals(((Keyspaces) o).keyspaces));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return keyspaces.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return keyspaces.values().toString();
+    }
+
+    public static final class Builder
+    {
+        private final ImmutableMap.Builder<String, KeyspaceMetadata> keyspaces = new ImmutableMap.Builder<>();
+
+        private Builder()
+        {
+        }
+
+        public Keyspaces build()
+        {
+            return new Keyspaces(this);
+        }
+
+        public Builder add(KeyspaceMetadata keyspace)
+        {
+            keyspaces.put(keyspace.name, keyspace);
+            return this;
+        }
+
+        public Builder add(KeyspaceMetadata... keyspaces)
+        {
+            for (KeyspaceMetadata keyspace : keyspaces)
+                add(keyspace);
+            return this;
+        }
+
+        public Builder add(Iterable<KeyspaceMetadata> keyspaces)
+        {
+            keyspaces.forEach(this::add);
+            return this;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/LegacySchemaMigrator.java b/src/java/org/apache/cassandra/schema/LegacySchemaMigrator.java
new file mode 100644
index 0000000..b7f7e73
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/LegacySchemaMigrator.java

@@ -0,0 +1,1114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.functions.FunctionName;
+import org.apache.cassandra.cql3.functions.UDAggregate;
+import org.apache.cassandra.cql3.functions.UDFunction;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+import static java.lang.String.format;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.apache.cassandra.utils.FBUtilities.fromJsonMap;
+
+/**
+ * This majestic class performs migration from legacy (pre-3.0) system.schema_* schema tables to the new and glorious
+ * system_schema keyspace.
+ *
+ * The goal is to not lose any information in the migration - including the timestamps.
+ */
+@SuppressWarnings("deprecation")
+public final class LegacySchemaMigrator
+{
+    private LegacySchemaMigrator()
+    {
+    }
+
+    private static final Logger logger = LoggerFactory.getLogger(LegacySchemaMigrator.class);
+
+    static final List<CFMetaData> LegacySchemaTables =
+        ImmutableList.of(SystemKeyspace.LegacyKeyspaces,
+                         SystemKeyspace.LegacyColumnfamilies,
+                         SystemKeyspace.LegacyColumns,
+                         SystemKeyspace.LegacyTriggers,
+                         SystemKeyspace.LegacyUsertypes,
+                         SystemKeyspace.LegacyFunctions,
+                         SystemKeyspace.LegacyAggregates);
+
+    public static void migrate()
+    {
+        // read metadata from the legacy schema tables
+        Collection<Keyspace> keyspaces = readSchema();
+
+        // if already upgraded, or starting a new 3.0 node, abort early
+        if (keyspaces.isEmpty())
+        {
+            unloadLegacySchemaTables();
+            return;
+        }
+
+        // write metadata to the new schema tables
+        logger.info("Moving {} keyspaces from legacy schema tables to the new schema keyspace ({})",
+                    keyspaces.size(),
+                    SchemaKeyspace.NAME);
+        keyspaces.forEach(LegacySchemaMigrator::storeKeyspaceInNewSchemaTables);
+        keyspaces.forEach(LegacySchemaMigrator::migrateBuiltIndexesForKeyspace);
+
+        // flush the new tables before truncating the old ones
+        SchemaKeyspace.flush();
+
+        // truncate the original tables (will be snapshotted now, and will have been snapshotted by pre-flight checks)
+        logger.info("Truncating legacy schema tables");
+        truncateLegacySchemaTables();
+
+        // remove legacy schema tables from Schema, so that their presence doesn't give the users any wrong ideas
+        unloadLegacySchemaTables();
+
+        logger.info("Completed migration of legacy schema tables");
+    }
+
+    private static void migrateBuiltIndexesForKeyspace(Keyspace keyspace)
+    {
+        keyspace.tables.forEach(LegacySchemaMigrator::migrateBuiltIndexesForTable);
+    }
+
+    private static void migrateBuiltIndexesForTable(Table table)
+    {
+        table.metadata.getIndexes().forEach((index) -> migrateIndexBuildStatus(table.metadata.ksName,
+                                                                               table.metadata.cfName,
+                                                                               index));
+    }
+
+    private static void migrateIndexBuildStatus(String keyspace, String table, IndexMetadata index)
+    {
+        if (SystemKeyspace.isIndexBuilt(keyspace, table + '.' + index.name))
+        {
+            SystemKeyspace.setIndexBuilt(keyspace, index.name);
+            SystemKeyspace.setIndexRemoved(keyspace, table + '.' + index.name);
+        }
+    }
+
+    static void unloadLegacySchemaTables()
+    {
+        KeyspaceMetadata systemKeyspace = Schema.instance.getKSMetaData(SystemKeyspace.NAME);
+
+        Tables systemTables = systemKeyspace.tables;
+        for (CFMetaData table : LegacySchemaTables)
+            systemTables = systemTables.without(table.cfName);
+
+        LegacySchemaTables.forEach(Schema.instance::unload);
+        LegacySchemaTables.forEach((cfm) -> org.apache.cassandra.db.Keyspace.openAndGetStore(cfm).invalidate());
+
+        Schema.instance.setKeyspaceMetadata(systemKeyspace.withSwapped(systemTables));
+    }
+
+    private static void truncateLegacySchemaTables()
+    {
+        LegacySchemaTables.forEach(table -> Schema.instance.getColumnFamilyStoreInstance(table.cfId).truncateBlocking());
+    }
+
+    private static void storeKeyspaceInNewSchemaTables(Keyspace keyspace)
+    {
+        logger.info("Migrating keyspace {}", keyspace);
+
+        Mutation mutation = SchemaKeyspace.makeCreateKeyspaceMutation(keyspace.name, keyspace.params, keyspace.timestamp);
+        for (Table table : keyspace.tables)
+            SchemaKeyspace.addTableToSchemaMutation(table.metadata, table.timestamp, true, mutation);
+
+        for (Type type : keyspace.types)
+            SchemaKeyspace.addTypeToSchemaMutation(type.metadata, type.timestamp, mutation);
+
+        for (Function function : keyspace.functions)
+            SchemaKeyspace.addFunctionToSchemaMutation(function.metadata, function.timestamp, mutation);
+
+        for (Aggregate aggregate : keyspace.aggregates)
+            SchemaKeyspace.addAggregateToSchemaMutation(aggregate.metadata, aggregate.timestamp, mutation);
+
+        mutation.apply();
+    }
+
+    /*
+     * Read all keyspaces metadata (including nested tables, types, and functions), with their modification timestamps
+     */
+    private static Collection<Keyspace> readSchema()
+    {
+        String query = format("SELECT keyspace_name FROM %s.%s", SystemKeyspace.NAME, SystemKeyspace.LEGACY_KEYSPACES);
+        Collection<String> keyspaceNames = new ArrayList<>();
+        query(query).forEach(row -> keyspaceNames.add(row.getString("keyspace_name")));
+        keyspaceNames.removeAll(Schema.LOCAL_SYSTEM_KEYSPACE_NAMES);
+
+        Collection<Keyspace> keyspaces = new ArrayList<>();
+        keyspaceNames.forEach(name -> keyspaces.add(readKeyspace(name)));
+        return keyspaces;
+    }
+
+    private static Keyspace readKeyspace(String keyspaceName)
+    {
+        long timestamp = readKeyspaceTimestamp(keyspaceName);
+        KeyspaceParams params = readKeyspaceParams(keyspaceName);
+
+        Collection<Table> tables = readTables(keyspaceName);
+        Collection<Type> types = readTypes(keyspaceName);
+        Collection<Function> functions = readFunctions(keyspaceName);
+        Functions.Builder functionsBuilder = Functions.builder();
+        functions.forEach(udf -> functionsBuilder.add(udf.metadata));
+        Collection<Aggregate> aggregates = readAggregates(functionsBuilder.build(), keyspaceName);
+
+        return new Keyspace(timestamp, keyspaceName, params, tables, types, functions, aggregates);
+    }
+
+    /*
+     * Reading keyspace params
+     */
+
+    private static long readKeyspaceTimestamp(String keyspaceName)
+    {
+        String query = format("SELECT writeTime(durable_writes) AS timestamp FROM %s.%s WHERE keyspace_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_KEYSPACES);
+        return query(query, keyspaceName).one().getLong("timestamp");
+    }
+
+    private static KeyspaceParams readKeyspaceParams(String keyspaceName)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_KEYSPACES);
+        UntypedResultSet.Row row = query(query, keyspaceName).one();
+
+        boolean durableWrites = row.getBoolean("durable_writes");
+
+        Map<String, String> replication = new HashMap<>();
+        replication.putAll(fromJsonMap(row.getString("strategy_options")));
+        replication.put(ReplicationParams.CLASS, row.getString("strategy_class"));
+
+        return KeyspaceParams.create(durableWrites, replication);
+    }
+
+    /*
+     * Reading tables
+     */
+
+    private static Collection<Table> readTables(String keyspaceName)
+    {
+        String query = format("SELECT columnfamily_name FROM %s.%s WHERE keyspace_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_COLUMNFAMILIES);
+        Collection<String> tableNames = new ArrayList<>();
+        query(query, keyspaceName).forEach(row -> tableNames.add(row.getString("columnfamily_name")));
+
+        Collection<Table> tables = new ArrayList<>();
+        tableNames.forEach(name -> tables.add(readTable(keyspaceName, name)));
+        return tables;
+    }
+
+    private static Table readTable(String keyspaceName, String tableName)
+    {
+        long timestamp = readTableTimestamp(keyspaceName, tableName);
+        CFMetaData metadata = readTableMetadata(keyspaceName, tableName);
+        return new Table(timestamp, metadata);
+    }
+
+    private static long readTableTimestamp(String keyspaceName, String tableName)
+    {
+        String query = format("SELECT writeTime(type) AS timestamp FROM %s.%s WHERE keyspace_name = ? AND columnfamily_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_COLUMNFAMILIES);
+        return query(query, keyspaceName, tableName).one().getLong("timestamp");
+    }
+
+    private static CFMetaData readTableMetadata(String keyspaceName, String tableName)
+    {
+        String tableQuery = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND columnfamily_name = ?",
+                                   SystemKeyspace.NAME,
+                                   SystemKeyspace.LEGACY_COLUMNFAMILIES);
+        UntypedResultSet.Row tableRow = query(tableQuery, keyspaceName, tableName).one();
+
+        String columnsQuery = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND columnfamily_name = ?",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.LEGACY_COLUMNS);
+        UntypedResultSet columnRows = query(columnsQuery, keyspaceName, tableName);
+
+        String triggersQuery = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND columnfamily_name = ?",
+                                      SystemKeyspace.NAME,
+                                      SystemKeyspace.LEGACY_TRIGGERS);
+        UntypedResultSet triggerRows = query(triggersQuery, keyspaceName, tableName);
+
+        return decodeTableMetadata(tableName, tableRow, columnRows, triggerRows);
+    }
+
+    private static CFMetaData decodeTableMetadata(String tableName,
+                                                  UntypedResultSet.Row tableRow,
+                                                  UntypedResultSet columnRows,
+                                                  UntypedResultSet triggerRows)
+    {
+        String ksName = tableRow.getString("keyspace_name");
+        String cfName = tableRow.getString("columnfamily_name");
+
+        AbstractType<?> rawComparator = TypeParser.parse(tableRow.getString("comparator"));
+        AbstractType<?> subComparator = tableRow.has("subcomparator") ? TypeParser.parse(tableRow.getString("subcomparator")) : null;
+
+        boolean isSuper = "super".equals(tableRow.getString("type").toLowerCase());
+        boolean isCompound = rawComparator instanceof CompositeType || isSuper;
+
+        /*
+         * Determine whether or not the table is *really* dense
+         * We cannot trust is_dense value of true (see CASSANDRA-11502, that fixed the issue for 2.2 only, and not retroactively),
+         * but we can trust is_dense value of false.
+         */
+        Boolean rawIsDense = tableRow.has("is_dense") ? tableRow.getBoolean("is_dense") : null;
+        boolean isDense;
+        if (rawIsDense != null && !rawIsDense)
+            isDense = false;
+        else
+            isDense = calculateIsDense(rawComparator, columnRows, isSuper);
+
+        // now, if switched to sparse, remove redundant compact_value column and the last clustering column,
+        // directly copying CASSANDRA-11502 logic. See CASSANDRA-11315.
+        Iterable<UntypedResultSet.Row> filteredColumnRows = !isDense && (rawIsDense == null || rawIsDense)
+                                                          ? filterOutRedundantRowsForSparse(columnRows, isSuper, isCompound)
+                                                          : columnRows;
+
+        // We don't really use the default validator but as we have it for backward compatibility, we use it to know if it's a counter table
+        AbstractType<?> defaultValidator = TypeParser.parse(tableRow.getString("default_validator"));
+        boolean isCounter = defaultValidator instanceof CounterColumnType;
+
+        /*
+         * With CASSANDRA-5202 we stopped inferring the cf id from the combination of keyspace/table names,
+         * and started storing the generated uuids in system.schema_columnfamilies.
+         *
+         * In 3.0 we SHOULD NOT see tables like that (2.0-created, non-upgraded).
+         * But in the off-chance that we do, we generate the deterministic uuid here.
+         */
+        UUID cfId = tableRow.has("cf_id")
+                  ? tableRow.getUUID("cf_id")
+                  : CFMetaData.generateLegacyCfId(ksName, cfName);
+
+        boolean isCQLTable = !isSuper && !isDense && isCompound;
+        boolean isStaticCompactTable = !isDense && !isCompound;
+
+        // Internally, compact tables have a specific layout, see CompactTables. But when upgrading from
+        // previous versions, they may not have the expected schema, so detect if we need to upgrade and do
+        // it in createColumnsFromColumnRows.
+        // We can remove this once we don't support upgrade from versions < 3.0.
+        boolean needsUpgrade = !isCQLTable && checkNeedsUpgrade(filteredColumnRows, isSuper, isStaticCompactTable);
+
+        List<ColumnDefinition> columnDefs = createColumnsFromColumnRows(filteredColumnRows,
+                                                                        ksName,
+                                                                        cfName,
+                                                                        rawComparator,
+                                                                        subComparator,
+                                                                        isSuper,
+                                                                        isCQLTable,
+                                                                        isStaticCompactTable,
+                                                                        needsUpgrade);
+
+        if (needsUpgrade)
+        {
+            addDefinitionForUpgrade(columnDefs,
+                                    ksName,
+                                    cfName,
+                                    isStaticCompactTable,
+                                    isSuper,
+                                    rawComparator,
+                                    subComparator,
+                                    defaultValidator);
+        }
+
+        CFMetaData cfm = CFMetaData.create(ksName,
+                                           cfName,
+                                           cfId,
+                                           isDense,
+                                           isCompound,
+                                           isSuper,
+                                           isCounter,
+                                           false, // legacy schema did not contain views
+                                           columnDefs,
+                                           DatabaseDescriptor.getPartitioner());
+
+        Indexes indexes = createIndexesFromColumnRows(cfm,
+                                                      filteredColumnRows,
+                                                      ksName,
+                                                      cfName,
+                                                      rawComparator,
+                                                      subComparator,
+                                                      isSuper,
+                                                      isCQLTable,
+                                                      isStaticCompactTable,
+                                                      needsUpgrade);
+        cfm.indexes(indexes);
+
+        if (tableRow.has("dropped_columns"))
+            addDroppedColumns(cfm, rawComparator, tableRow.getMap("dropped_columns", UTF8Type.instance, LongType.instance));
+
+        return cfm.params(decodeTableParams(tableRow))
+                  .triggers(createTriggersFromTriggerRows(triggerRows));
+    }
+
+    /*
+     * We call dense a CF for which each component of the comparator is a clustering column, i.e. no
+     * component is used to store a regular column names. In other words, non-composite static "thrift"
+     * and CQL3 CF are *not* dense.
+     * We save whether the table is dense or not during table creation through CQL, but we don't have this
+     * information for table just created through thrift, nor for table prior to CASSANDRA-7744, so this
+     * method does its best to infer whether the table is dense or not based on other elements.
+     */
+    private static boolean calculateIsDense(AbstractType<?> comparator, UntypedResultSet columnRows, boolean isSuper)
+    {
+        /*
+         * As said above, this method is only here because we need to deal with thrift upgrades.
+         * Once a CF has been "upgraded", i.e. we've rebuilt and save its CQL3 metadata at least once,
+         * then we'll have saved the "is_dense" value and will be good to go.
+         *
+         * But non-upgraded thrift CF (and pre-7744 CF) will have no value for "is_dense", so we need
+         * to infer that information without relying on it in that case. And for the most part this is
+         * easy, a CF that has at least one REGULAR definition is not dense. But the subtlety is that not
+         * having a REGULAR definition may not mean dense because of CQL3 definitions that have only the
+         * PRIMARY KEY defined.
+         *
+         * So we need to recognize those special case CQL3 table with only a primary key. If we have some
+         * clustering columns, we're fine as said above. So the only problem is that we cannot decide for
+         * sure if a CF without REGULAR columns nor CLUSTERING_COLUMN definition is meant to be dense, or if it
+         * has been created in CQL3 by say:
+         *    CREATE TABLE test (k int PRIMARY KEY)
+         * in which case it should not be dense. However, we can limit our margin of error by assuming we are
+         * in the latter case only if the comparator is exactly CompositeType(UTF8Type).
+         */
+        for (UntypedResultSet.Row columnRow : columnRows)
+        {
+            if ("regular".equals(columnRow.getString("type")))
+                return false;
+        }
+
+        // If we've checked the columns for supercf and found no regulars, it's dense. Relying on the emptiness
+        // of the value column is not enough due to index calculation.
+        if (isSuper)
+            return true;
+
+        int maxClusteringIdx = -1;
+        for (UntypedResultSet.Row columnRow : columnRows)
+            if ("clustering_key".equals(columnRow.getString("type")))
+                maxClusteringIdx = Math.max(maxClusteringIdx, columnRow.has("component_index") ? columnRow.getInt("component_index") : 0);
+
+        return maxClusteringIdx >= 0
+             ? maxClusteringIdx == comparator.componentsCount() - 1
+             : !isCQL3OnlyPKComparator(comparator);
+    }
+
+    private static Iterable<UntypedResultSet.Row> filterOutRedundantRowsForSparse(UntypedResultSet columnRows, boolean isSuper, boolean isCompound)
+    {
+        Collection<UntypedResultSet.Row> filteredRows = new ArrayList<>();
+        for (UntypedResultSet.Row columnRow : columnRows)
+        {
+            String kind = columnRow.getString("type");
+
+            if (!isSuper && "compact_value".equals(kind))
+                continue;
+
+            if ("clustering_key".equals(kind) && !isSuper && !isCompound)
+                continue;
+
+            filteredRows.add(columnRow);
+        }
+
+        return filteredRows;
+    }
+
+    private static boolean isCQL3OnlyPKComparator(AbstractType<?> comparator)
+    {
+        if (!(comparator instanceof CompositeType))
+            return false;
+
+        CompositeType ct = (CompositeType)comparator;
+        return ct.types.size() == 1 && ct.types.get(0) instanceof UTF8Type;
+    }
+
+    private static TableParams decodeTableParams(UntypedResultSet.Row row)
+    {
+        TableParams.Builder params = TableParams.builder();
+
+        params.readRepairChance(row.getDouble("read_repair_chance"))
+              .dcLocalReadRepairChance(row.getDouble("local_read_repair_chance"))
+              .gcGraceSeconds(row.getInt("gc_grace_seconds"));
+
+        if (row.has("comment"))
+            params.comment(row.getString("comment"));
+
+        if (row.has("memtable_flush_period_in_ms"))
+            params.memtableFlushPeriodInMs(row.getInt("memtable_flush_period_in_ms"));
+
+        params.caching(cachingFromRow(row.getString("caching")));
+
+        if (row.has("default_time_to_live"))
+            params.defaultTimeToLive(row.getInt("default_time_to_live"));
+
+        if (row.has("speculative_retry"))
+            params.speculativeRetry(SpeculativeRetryParam.fromString(row.getString("speculative_retry")));
+
+        Map<String, String> compressionParameters = fromJsonMap(row.getString("compression_parameters"));
+        String crcCheckChance = compressionParameters.remove("crc_check_chance");
+        //crc_check_chance was promoted from a compression property to a top-level property
+        if (crcCheckChance != null)
+            params.crcCheckChance(Double.parseDouble(crcCheckChance));
+
+        params.compression(CompressionParams.fromMap(compressionParameters));
+
+        params.compaction(compactionFromRow(row));
+
+        if (row.has("min_index_interval"))
+            params.minIndexInterval(row.getInt("min_index_interval"));
+
+        if (row.has("max_index_interval"))
+            params.maxIndexInterval(row.getInt("max_index_interval"));
+
+        if (row.has("bloom_filter_fp_chance"))
+            params.bloomFilterFpChance(row.getDouble("bloom_filter_fp_chance"));
+
+        return params.build();
+    }
+
+    /**
+     *
+     * 2.1 and newer use JSON'ified map of caching parameters, but older versions had valid Strings
+     * NONE, KEYS_ONLY, ROWS_ONLY, and ALL
+     *
+     * @param caching, the string representing the table's caching options
+     * @return CachingParams object corresponding to the input string
+     */
+    @VisibleForTesting
+    public static CachingParams cachingFromRow(String caching)
+    {
+        switch(caching)
+        {
+            case "NONE":
+                return CachingParams.CACHE_NOTHING;
+            case "KEYS_ONLY":
+                return CachingParams.CACHE_KEYS;
+            case "ROWS_ONLY":
+                return new CachingParams(false, Integer.MAX_VALUE);
+            case "ALL":
+                return CachingParams.CACHE_EVERYTHING;
+            default:
+                return CachingParams.fromMap(fromJsonMap(caching));
+        }
+    }
+
+    /*
+     * The method is needed - to migrate max_compaction_threshold and min_compaction_threshold
+     * to the compaction map, where they belong.
+     *
+     * We must use reflection to validate the options because not every compaction strategy respects and supports
+     * the threshold params (LCS doesn't, STCS and DTCS do).
+     */
+    @SuppressWarnings("unchecked")
+    private static CompactionParams compactionFromRow(UntypedResultSet.Row row)
+    {
+        Class<? extends AbstractCompactionStrategy> klass =
+            CFMetaData.createCompactionStrategy(row.getString("compaction_strategy_class"));
+        Map<String, String> options = fromJsonMap(row.getString("compaction_strategy_options"));
+
+        int minThreshold = row.getInt("min_compaction_threshold");
+        int maxThreshold = row.getInt("max_compaction_threshold");
+
+        Map<String, String> optionsWithThresholds = new HashMap<>(options);
+        optionsWithThresholds.putIfAbsent(CompactionParams.Option.MIN_THRESHOLD.toString(), Integer.toString(minThreshold));
+        optionsWithThresholds.putIfAbsent(CompactionParams.Option.MAX_THRESHOLD.toString(), Integer.toString(maxThreshold));
+
+        try
+        {
+            Map<String, String> unrecognizedOptions =
+                (Map<String, String>) klass.getMethod("validateOptions", Map.class).invoke(null, optionsWithThresholds);
+
+            if (unrecognizedOptions.isEmpty())
+                options = optionsWithThresholds;
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        return CompactionParams.create(klass, options);
+    }
+
+    // Should only be called on compact tables
+    private static boolean checkNeedsUpgrade(Iterable<UntypedResultSet.Row> defs, boolean isSuper, boolean isStaticCompactTable)
+    {
+        // For SuperColumn tables, re-create a compact value column
+        if (isSuper)
+            return true;
+
+        // For static compact tables, we need to upgrade if the regular definitions haven't been converted to static yet,
+        // i.e. if we don't have a static definition yet.
+        if (isStaticCompactTable)
+            return !hasKind(defs, ColumnDefinition.Kind.STATIC);
+
+        // For dense compact tables, we need to upgrade if we don't have a compact value definition
+        return !hasRegularColumns(defs);
+    }
+
+    private static boolean hasRegularColumns(Iterable<UntypedResultSet.Row> columnRows)
+    {
+        for (UntypedResultSet.Row row : columnRows)
+        {
+            /*
+             * We need to special case and ignore the empty compact column (pre-3.0, COMPACT STORAGE, primary-key only tables),
+             * since deserializeKind() will otherwise just return a REGULAR.
+             * We want the proper EmptyType regular column to be added by addDefinitionForUpgrade(), so we need
+             * checkNeedsUpgrade() to return true in this case.
+             * See CASSANDRA-9874.
+             */
+            if (isEmptyCompactValueColumn(row))
+                return false;
+
+            if (deserializeKind(row.getString("type")) == ColumnDefinition.Kind.REGULAR)
+                return true;
+        }
+
+        return false;
+    }
+
+    private static boolean isEmptyCompactValueColumn(UntypedResultSet.Row row)
+    {
+        return "compact_value".equals(row.getString("type")) && row.getString("column_name").isEmpty();
+    }
+
+    private static void addDefinitionForUpgrade(List<ColumnDefinition> defs,
+                                                String ksName,
+                                                String cfName,
+                                                boolean isStaticCompactTable,
+                                                boolean isSuper,
+                                                AbstractType<?> rawComparator,
+                                                AbstractType<?> subComparator,
+                                                AbstractType<?> defaultValidator)
+    {
+        CompactTables.DefaultNames names = CompactTables.defaultNameGenerator(defs);
+
+        if (isSuper)
+        {
+            defs.add(ColumnDefinition.regularDef(ksName, cfName, SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN_STR, MapType.getInstance(subComparator, defaultValidator, true)));
+        }
+        else if (isStaticCompactTable)
+        {
+            defs.add(ColumnDefinition.clusteringDef(ksName, cfName, names.defaultClusteringName(), rawComparator, 0));
+            defs.add(ColumnDefinition.regularDef(ksName, cfName, names.defaultCompactValueName(), defaultValidator));
+        }
+        else
+        {
+            // For dense compact tables, we get here if we don't have a compact value column, in which case we should add it.
+            // We use EmptyType to recognize that the compact value was not declared by the user (see CreateTableStatement).
+            // If user made any writes to this column, compact value column should be initialized as bytes (see CASSANDRA-15778).
+            AbstractType<?> compactColumnType = Boolean.getBoolean("cassandra.init_dense_table_compact_value_as_bytes")
+                                                ? BytesType.instance : EmptyType.instance;
+            defs.add(ColumnDefinition.regularDef(ksName, cfName, names.defaultCompactValueName(), compactColumnType));
+        }
+    }
+
+    private static boolean hasKind(Iterable<UntypedResultSet.Row> defs, ColumnDefinition.Kind kind)
+    {
+        for (UntypedResultSet.Row row : defs)
+            if (deserializeKind(row.getString("type")) == kind)
+                return true;
+
+        return false;
+    }
+
+    /*
+     * Prior to 3.0 we used to not store the type of the dropped columns, relying on all collection info being
+     * present in the comparator, forever. That allowed us to perform certain validations in AlterTableStatement
+     * (namely not allowing to re-add incompatible collection columns, with the same name, but a different type).
+     *
+     * In 3.0, we no longer preserve the original comparator, and reconstruct it from the columns instead. That means
+     * that we should preserve the type of the dropped columns now, and, during migration, fetch the types from
+     * the original comparator if necessary.
+     */
+    private static void addDroppedColumns(CFMetaData cfm, AbstractType<?> comparator, Map<String, Long> droppedTimes)
+    {
+        AbstractType<?> last = comparator.getComponents().get(comparator.componentsCount() - 1);
+        Map<ByteBuffer, CollectionType> collections = last instanceof ColumnToCollectionType
+                                                    ? ((ColumnToCollectionType) last).defined
+                                                    : Collections.emptyMap();
+
+        for (Map.Entry<String, Long> entry : droppedTimes.entrySet())
+        {
+            String name = entry.getKey();
+            ByteBuffer nameBytes = UTF8Type.instance.decompose(name);
+            long time = entry.getValue();
+
+            AbstractType<?> type = collections.containsKey(nameBytes)
+                                 ? collections.get(nameBytes)
+                                 : BytesType.instance;
+
+            cfm.getDroppedColumns().put(nameBytes, new CFMetaData.DroppedColumn(name, null, type, time));
+        }
+    }
+
+    private static List<ColumnDefinition> createColumnsFromColumnRows(Iterable<UntypedResultSet.Row> rows,
+                                                                      String keyspace,
+                                                                      String table,
+                                                                      AbstractType<?> rawComparator,
+                                                                      AbstractType<?> rawSubComparator,
+                                                                      boolean isSuper,
+                                                                      boolean isCQLTable,
+                                                                      boolean isStaticCompactTable,
+                                                                      boolean needsUpgrade)
+    {
+        List<ColumnDefinition> columns = new ArrayList<>();
+
+        for (UntypedResultSet.Row row : rows)
+        {
+            // Skip the empty compact value column. Make addDefinitionForUpgrade() re-add the proper REGULAR one.
+            if (isEmptyCompactValueColumn(row))
+                continue;
+
+            columns.add(createColumnFromColumnRow(row,
+                                                  keyspace,
+                                                  table,
+                                                  rawComparator,
+                                                  rawSubComparator,
+                                                  isSuper,
+                                                  isCQLTable,
+                                                  isStaticCompactTable,
+                                                  needsUpgrade));
+        }
+
+        return columns;
+    }
+
+    private static ColumnDefinition createColumnFromColumnRow(UntypedResultSet.Row row,
+                                                              String keyspace,
+                                                              String table,
+                                                              AbstractType<?> rawComparator,
+                                                              AbstractType<?> rawSubComparator,
+                                                              boolean isSuper,
+                                                              boolean isCQLTable,
+                                                              boolean isStaticCompactTable,
+                                                              boolean needsUpgrade)
+    {
+        String rawKind = row.getString("type");
+
+        ColumnDefinition.Kind kind = deserializeKind(rawKind);
+        if (needsUpgrade && isStaticCompactTable && kind == ColumnDefinition.Kind.REGULAR)
+            kind = ColumnDefinition.Kind.STATIC;
+
+        int componentIndex = ColumnDefinition.NO_POSITION;
+        // Note that the component_index is not useful for non-primary key parts (it never really in fact since there is
+        // no particular ordering of non-PK columns, we only used to use it as a simplification but that's not needed
+        // anymore)
+        if (kind.isPrimaryKeyKind())
+            // We use to not have a component index when there was a single partition key, we don't anymore (#10491)
+            componentIndex = row.has("component_index") ? row.getInt("component_index") : 0;
+
+        // Note: we save the column name as string, but we should not assume that it is an UTF8 name, we
+        // we need to use the comparator fromString method
+        AbstractType<?> comparator = isCQLTable
+                                     ? UTF8Type.instance
+                                     : CompactTables.columnDefinitionComparator(rawKind, isSuper, rawComparator, rawSubComparator);
+        ColumnIdentifier name = ColumnIdentifier.getInterned(comparator.fromString(row.getString("column_name")), comparator);
+
+        AbstractType<?> validator = parseType(row.getString("validator"));
+
+        return new ColumnDefinition(keyspace, table, name, validator, componentIndex, kind);
+    }
+
+    private static Indexes createIndexesFromColumnRows(CFMetaData cfm,
+                                                       Iterable<UntypedResultSet.Row> rows,
+                                                       String keyspace,
+                                                       String table,
+                                                       AbstractType<?> rawComparator,
+                                                       AbstractType<?> rawSubComparator,
+                                                       boolean isSuper,
+                                                       boolean isCQLTable,
+                                                       boolean isStaticCompactTable,
+                                                       boolean needsUpgrade)
+    {
+        Indexes.Builder indexes = Indexes.builder();
+
+        for (UntypedResultSet.Row row : rows)
+        {
+            IndexMetadata.Kind kind = null;
+            if (row.has("index_type"))
+                kind = IndexMetadata.Kind.valueOf(row.getString("index_type"));
+
+            if (kind == null)
+                continue;
+
+            Map<String, String> indexOptions = null;
+            if (row.has("index_options"))
+                indexOptions = fromJsonMap(row.getString("index_options"));
+
+            String indexName = null;
+            if (row.has("index_name"))
+                indexName = row.getString("index_name");
+
+            ColumnDefinition column = createColumnFromColumnRow(row,
+                                                                keyspace,
+                                                                table,
+                                                                rawComparator,
+                                                                rawSubComparator,
+                                                                isSuper,
+                                                                isCQLTable,
+                                                                isStaticCompactTable,
+                                                                needsUpgrade);
+
+            indexes.add(IndexMetadata.fromLegacyMetadata(cfm, column, indexName, kind, indexOptions));
+        }
+
+        return indexes.build();
+    }
+
+    private static ColumnDefinition.Kind deserializeKind(String kind)
+    {
+        if ("clustering_key".equalsIgnoreCase(kind))
+            return ColumnDefinition.Kind.CLUSTERING;
+
+        if ("compact_value".equalsIgnoreCase(kind))
+            return ColumnDefinition.Kind.REGULAR;
+
+        return Enum.valueOf(ColumnDefinition.Kind.class, kind.toUpperCase());
+    }
+
+    private static Triggers createTriggersFromTriggerRows(UntypedResultSet rows)
+    {
+        Triggers.Builder triggers = org.apache.cassandra.schema.Triggers.builder();
+        rows.forEach(row -> triggers.add(createTriggerFromTriggerRow(row)));
+        return triggers.build();
+    }
+
+    private static TriggerMetadata createTriggerFromTriggerRow(UntypedResultSet.Row row)
+    {
+        String name = row.getString("trigger_name");
+        String classOption = row.getTextMap("trigger_options").get("class");
+        return new TriggerMetadata(name, classOption);
+    }
+
+    /*
+     * Reading user types
+     */
+
+    private static Collection<Type> readTypes(String keyspaceName)
+    {
+        String query = format("SELECT type_name FROM %s.%s WHERE keyspace_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_USERTYPES);
+        Collection<String> typeNames = new ArrayList<>();
+        query(query, keyspaceName).forEach(row -> typeNames.add(row.getString("type_name")));
+
+        Collection<Type> types = new ArrayList<>();
+        typeNames.forEach(name -> types.add(readType(keyspaceName, name)));
+        return types;
+    }
+
+    private static Type readType(String keyspaceName, String typeName)
+    {
+        long timestamp = readTypeTimestamp(keyspaceName, typeName);
+        UserType metadata = readTypeMetadata(keyspaceName, typeName);
+        return new Type(timestamp, metadata);
+    }
+
+    /*
+     * Unfortunately there is not a single REGULAR column in system.schema_usertypes, so annoyingly we cannot
+     * use the writeTime() CQL function, and must resort to a lower level.
+     */
+    private static long readTypeTimestamp(String keyspaceName, String typeName)
+    {
+        ColumnFamilyStore store = org.apache.cassandra.db.Keyspace.open(SystemKeyspace.NAME)
+                                                                  .getColumnFamilyStore(SystemKeyspace.LEGACY_USERTYPES);
+
+        ClusteringComparator comparator = store.metadata.comparator;
+        Slices slices = Slices.with(comparator, Slice.make(comparator, typeName));
+        int nowInSec = FBUtilities.nowInSeconds();
+        DecoratedKey key = store.metadata.decorateKey(AsciiType.instance.fromString(keyspaceName));
+        SinglePartitionReadCommand command = SinglePartitionReadCommand.create(store.metadata, nowInSec, key, slices);
+
+        try (OpOrder.Group op = store.readOrdering.start();
+             RowIterator partition = UnfilteredRowIterators.filter(command.queryMemtableAndDisk(store, op), nowInSec))
+        {
+            return partition.next().primaryKeyLivenessInfo().timestamp();
+        }
+    }
+
+    private static UserType readTypeMetadata(String keyspaceName, String typeName)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND type_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_USERTYPES);
+        UntypedResultSet.Row row = query(query, keyspaceName, typeName).one();
+
+        List<ByteBuffer> names =
+            row.getList("field_names", UTF8Type.instance)
+               .stream()
+               .map(ByteBufferUtil::bytes)
+               .collect(Collectors.toList());
+
+        List<AbstractType<?>> types =
+            row.getList("field_types", UTF8Type.instance)
+               .stream()
+               .map(LegacySchemaMigrator::parseType)
+               .collect(Collectors.toList());
+
+        return new UserType(keyspaceName, bytes(typeName), names, types);
+    }
+
+    /*
+     * Reading UDFs
+     */
+
+    private static Collection<Function> readFunctions(String keyspaceName)
+    {
+        String query = format("SELECT function_name, signature FROM %s.%s WHERE keyspace_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_FUNCTIONS);
+        HashMultimap<String, List<String>> functionSignatures = HashMultimap.create();
+        query(query, keyspaceName).forEach(row -> functionSignatures.put(row.getString("function_name"), row.getList("signature", UTF8Type.instance)));
+
+        Collection<Function> functions = new ArrayList<>();
+        functionSignatures.entries().forEach(pair -> functions.add(readFunction(keyspaceName, pair.getKey(), pair.getValue())));
+        return functions;
+    }
+
+    private static Function readFunction(String keyspaceName, String functionName, List<String> signature)
+    {
+        long timestamp = readFunctionTimestamp(keyspaceName, functionName, signature);
+        UDFunction metadata = readFunctionMetadata(keyspaceName, functionName, signature);
+        return new Function(timestamp, metadata);
+    }
+
+    private static long readFunctionTimestamp(String keyspaceName, String functionName, List<String> signature)
+    {
+        String query = format("SELECT writeTime(return_type) AS timestamp " +
+                              "FROM %s.%s " +
+                              "WHERE keyspace_name = ? AND function_name = ? AND signature = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_FUNCTIONS);
+        return query(query, keyspaceName, functionName, signature).one().getLong("timestamp");
+    }
+
+    private static UDFunction readFunctionMetadata(String keyspaceName, String functionName, List<String> signature)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND function_name = ? AND signature = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_FUNCTIONS);
+        UntypedResultSet.Row row = query(query, keyspaceName, functionName, signature).one();
+
+        FunctionName name = new FunctionName(keyspaceName, functionName);
+
+        List<ColumnIdentifier> argNames = new ArrayList<>();
+        if (row.has("argument_names"))
+            for (String arg : row.getList("argument_names", UTF8Type.instance))
+                argNames.add(new ColumnIdentifier(arg, true));
+
+        List<AbstractType<?>> argTypes = new ArrayList<>();
+        if (row.has("argument_types"))
+            for (String type : row.getList("argument_types", UTF8Type.instance))
+                argTypes.add(parseType(type));
+
+        AbstractType<?> returnType = parseType(row.getString("return_type"));
+
+        String language = row.getString("language");
+        String body = row.getString("body");
+        boolean calledOnNullInput = row.getBoolean("called_on_null_input");
+
+        try
+        {
+            return UDFunction.create(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
+        }
+        catch (InvalidRequestException e)
+        {
+            return UDFunction.createBrokenFunction(name, argNames, argTypes, returnType, calledOnNullInput, language, body, e);
+        }
+    }
+
+    /*
+     * Reading UDAs
+     */
+
+    private static Collection<Aggregate> readAggregates(Functions functions, String keyspaceName)
+    {
+        String query = format("SELECT aggregate_name, signature FROM %s.%s WHERE keyspace_name = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_AGGREGATES);
+        HashMultimap<String, List<String>> aggregateSignatures = HashMultimap.create();
+        query(query, keyspaceName).forEach(row -> aggregateSignatures.put(row.getString("aggregate_name"), row.getList("signature", UTF8Type.instance)));
+
+        Collection<Aggregate> aggregates = new ArrayList<>();
+        aggregateSignatures.entries().forEach(pair -> aggregates.add(readAggregate(functions, keyspaceName, pair.getKey(), pair.getValue())));
+        return aggregates;
+    }
+
+    private static Aggregate readAggregate(Functions functions, String keyspaceName, String aggregateName, List<String> signature)
+    {
+        long timestamp = readAggregateTimestamp(keyspaceName, aggregateName, signature);
+        UDAggregate metadata = readAggregateMetadata(functions, keyspaceName, aggregateName, signature);
+        return new Aggregate(timestamp, metadata);
+    }
+
+    private static long readAggregateTimestamp(String keyspaceName, String aggregateName, List<String> signature)
+    {
+        String query = format("SELECT writeTime(return_type) AS timestamp " +
+                              "FROM %s.%s " +
+                              "WHERE keyspace_name = ? AND aggregate_name = ? AND signature = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_AGGREGATES);
+        return query(query, keyspaceName, aggregateName, signature).one().getLong("timestamp");
+    }
+
+    private static UDAggregate readAggregateMetadata(Functions functions, String keyspaceName, String functionName, List<String> signature)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND aggregate_name = ? AND signature = ?",
+                              SystemKeyspace.NAME,
+                              SystemKeyspace.LEGACY_AGGREGATES);
+        UntypedResultSet.Row row = query(query, keyspaceName, functionName, signature).one();
+
+        FunctionName name = new FunctionName(keyspaceName, functionName);
+
+        List<String> types = row.getList("argument_types", UTF8Type.instance);
+
+        List<AbstractType<?>> argTypes = new ArrayList<>();
+        if (types != null)
+        {
+            argTypes = new ArrayList<>(types.size());
+            for (String type : types)
+                argTypes.add(parseType(type));
+        }
+
+        AbstractType<?> returnType = parseType(row.getString("return_type"));
+
+        FunctionName stateFunc = new FunctionName(keyspaceName, row.getString("state_func"));
+        AbstractType<?> stateType = parseType(row.getString("state_type"));
+        FunctionName finalFunc = row.has("final_func") ? new FunctionName(keyspaceName, row.getString("final_func")) : null;
+        ByteBuffer initcond = row.has("initcond") ? row.getBytes("initcond") : null;
+
+        try
+        {
+            return UDAggregate.create(functions, name, argTypes, returnType, stateFunc, finalFunc, stateType, initcond);
+        }
+        catch (InvalidRequestException reason)
+        {
+            return UDAggregate.createBroken(name, argTypes, returnType, initcond, reason);
+        }
+    }
+
+    private static UntypedResultSet query(String query, Object... values)
+    {
+        return QueryProcessor.executeOnceInternal(query, values);
+    }
+
+    private static AbstractType<?> parseType(String str)
+    {
+        return TypeParser.parse(str);
+    }
+
+    private static final class Keyspace
+    {
+        final long timestamp;
+        final String name;
+        final KeyspaceParams params;
+        final Collection<Table> tables;
+        final Collection<Type> types;
+        final Collection<Function> functions;
+        final Collection<Aggregate> aggregates;
+
+        Keyspace(long timestamp,
+                 String name,
+                 KeyspaceParams params,
+                 Collection<Table> tables,
+                 Collection<Type> types,
+                 Collection<Function> functions,
+                 Collection<Aggregate> aggregates)
+        {
+            this.timestamp = timestamp;
+            this.name = name;
+            this.params = params;
+            this.tables = tables;
+            this.types = types;
+            this.functions = functions;
+            this.aggregates = aggregates;
+        }
+    }
+
+    private static final class Table
+    {
+        final long timestamp;
+        final CFMetaData metadata;
+
+        Table(long timestamp, CFMetaData metadata)
+        {
+            this.timestamp = timestamp;
+            this.metadata = metadata;
+        }
+    }
+
+    private static final class Type
+    {
+        final long timestamp;
+        final UserType metadata;
+
+        Type(long timestamp, UserType metadata)
+        {
+            this.timestamp = timestamp;
+            this.metadata = metadata;
+        }
+    }
+
+    private static final class Function
+    {
+        final long timestamp;
+        final UDFunction metadata;
+
+        Function(long timestamp, UDFunction metadata)
+        {
+            this.timestamp = timestamp;
+            this.metadata = metadata;
+        }
+    }
+
+    private static final class Aggregate
+    {
+        final long timestamp;
+        final UDAggregate metadata;
+
+        Aggregate(long timestamp, UDAggregate metadata)
+        {
+            this.timestamp = timestamp;
+            this.metadata = metadata;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/LegacySchemaTables.java b/src/java/org/apache/cassandra/schema/LegacySchemaTables.java
deleted file mode 100644
index 8d5bf4f..0000000
--- a/src/java/org/apache/cassandra/schema/LegacySchemaTables.java
+++ /dev/null

@@ -1,1491 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.schema;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-import java.util.*;
-import java.util.concurrent.TimeUnit;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.MapDifference;
-import com.google.common.collect.Maps;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.cache.CachingOptions;
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.cql3.functions.*;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.locator.AbstractReplicationStrategy;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
-import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
-import static org.apache.cassandra.utils.FBUtilities.fromJsonMap;
-import static org.apache.cassandra.utils.FBUtilities.json;
-
-/** system.schema_* tables used to store keyspace/table/type attributes prior to C* 3.0 */
-public class LegacySchemaTables
-{
-    private static final Logger logger = LoggerFactory.getLogger(LegacySchemaTables.class);
-
-    public static final String KEYSPACES = "schema_keyspaces";
-    public static final String COLUMNFAMILIES = "schema_columnfamilies";
-    public static final String COLUMNS = "schema_columns";
-    public static final String TRIGGERS = "schema_triggers";
-    public static final String USERTYPES = "schema_usertypes";
-    public static final String FUNCTIONS = "schema_functions";
-    public static final String AGGREGATES = "schema_aggregates";
-
-    public static final List<String> ALL = Arrays.asList(KEYSPACES, COLUMNFAMILIES, COLUMNS, TRIGGERS, USERTYPES, FUNCTIONS, AGGREGATES);
-
-    private static final CFMetaData Keyspaces =
-        compile(KEYSPACES,
-                "keyspace definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "durable_writes boolean,"
-                + "strategy_class text,"
-                + "strategy_options text,"
-                + "PRIMARY KEY ((keyspace_name))) "
-                + "WITH COMPACT STORAGE");
-
-    private static final CFMetaData Columnfamilies =
-        compile(COLUMNFAMILIES,
-                "table definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "columnfamily_name text,"
-                + "bloom_filter_fp_chance double,"
-                + "caching text,"
-                + "cf_id uuid," // post-2.1 UUID cfid
-                + "comment text,"
-                + "compaction_strategy_class text,"
-                + "compaction_strategy_options text,"
-                + "comparator text,"
-                + "compression_parameters text,"
-                + "default_time_to_live int,"
-                + "default_validator text,"
-                + "dropped_columns map<text, bigint>,"
-                + "gc_grace_seconds int,"
-                + "is_dense boolean,"
-                + "key_validator text,"
-                + "local_read_repair_chance double,"
-                + "max_compaction_threshold int,"
-                + "max_index_interval int,"
-                + "memtable_flush_period_in_ms int,"
-                + "min_compaction_threshold int,"
-                + "min_index_interval int,"
-                + "read_repair_chance double,"
-                + "speculative_retry text,"
-                + "subcomparator text,"
-                + "type text,"
-                + "PRIMARY KEY ((keyspace_name), columnfamily_name))");
-
-    private static final CFMetaData Columns =
-        compile(COLUMNS,
-                "column definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "columnfamily_name text,"
-                + "column_name text,"
-                + "component_index int,"
-                + "index_name text,"
-                + "index_options text,"
-                + "index_type text,"
-                + "type text,"
-                + "validator text,"
-                + "PRIMARY KEY ((keyspace_name), columnfamily_name, column_name))");
-
-    private static final CFMetaData Triggers =
-        compile(TRIGGERS,
-                "trigger definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "columnfamily_name text,"
-                + "trigger_name text,"
-                + "trigger_options map<text, text>,"
-                + "PRIMARY KEY ((keyspace_name), columnfamily_name, trigger_name))");
-
-    private static final CFMetaData Usertypes =
-        compile(USERTYPES,
-                "user defined type definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "type_name text,"
-                + "field_names list<text>,"
-                + "field_types list<text>,"
-                + "PRIMARY KEY ((keyspace_name), type_name))");
-
-    private static final CFMetaData Functions =
-        compile(FUNCTIONS,
-                "user defined function definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "function_name text,"
-                + "signature frozen<list<text>>,"
-                + "argument_names list<text>,"
-                + "argument_types list<text>,"
-                + "body text,"
-                + "language text,"
-                + "return_type text,"
-                + "called_on_null_input boolean,"
-                + "PRIMARY KEY ((keyspace_name), function_name, signature))");
-
-    private static final CFMetaData Aggregates =
-        compile(AGGREGATES,
-                "user defined aggregate definitions",
-                "CREATE TABLE %s ("
-                + "keyspace_name text,"
-                + "aggregate_name text,"
-                + "signature frozen<list<text>>,"
-                + "argument_types list<text>,"
-                + "final_func text,"
-                + "initcond blob,"
-                + "return_type text,"
-                + "state_func text,"
-                + "state_type text,"
-                + "PRIMARY KEY ((keyspace_name), aggregate_name, signature))");
-
-    public static final List<CFMetaData> All = Arrays.asList(Keyspaces, Columnfamilies, Columns, Triggers, Usertypes, Functions, Aggregates);
-
-    private static CFMetaData compile(String name, String description, String schema)
-    {
-        return CFMetaData.compile(String.format(schema, name), SystemKeyspace.NAME)
-                         .comment(description)
-                         .gcGraceSeconds((int) TimeUnit.DAYS.toSeconds(7));
-    }
-
-    /** add entries to system.schema_* for the hardcoded system definitions */
-    public static void saveSystemKeyspaceSchema()
-    {
-        KSMetaData keyspace = Schema.instance.getKSMetaData(SystemKeyspace.NAME);
-        long timestamp = FBUtilities.timestampMicros();
-        // delete old, possibly obsolete entries in schema tables
-        for (String table : ALL)
-        {
-            executeOnceInternal(String.format("DELETE FROM system.%s USING TIMESTAMP ? WHERE keyspace_name = ?", table),
-                                timestamp,
-                                keyspace.name);
-        }
-        // (+1 to timestamp to make sure we don't get shadowed by the tombstones we just added)
-        makeCreateKeyspaceMutation(keyspace, timestamp + 1).apply();
-    }
-
-    public static Collection<KSMetaData> readSchemaFromSystemTables()
-    {
-        List<Row> serializedSchema = getSchemaPartitionsForTable(KEYSPACES);
-
-        List<KSMetaData> keyspaces = new ArrayList<>(serializedSchema.size());
-
-        for (Row partition : serializedSchema)
-        {
-            if (isEmptySchemaPartition(partition) || isSystemKeyspaceSchemaPartition(partition))
-                continue;
-
-            keyspaces.add(createKeyspaceFromSchemaPartitions(partition,
-                                                             readSchemaPartitionForKeyspace(COLUMNFAMILIES, partition.key),
-                                                             readSchemaPartitionForKeyspace(USERTYPES, partition.key)));
-
-            // Will be moved away in #6717
-            for (UDFunction function : createFunctionsFromFunctionsPartition(readSchemaPartitionForKeyspace(FUNCTIONS, partition.key)).values())
-                org.apache.cassandra.cql3.functions.Functions.addOrReplaceFunction(function);
-
-            // Will be moved away in #6717
-            for (UDAggregate aggregate : createAggregatesFromAggregatesPartition(readSchemaPartitionForKeyspace(AGGREGATES, partition.key)).values())
-                org.apache.cassandra.cql3.functions.Functions.addOrReplaceFunction(aggregate);
-        }
-
-        return keyspaces;
-    }
-
-    public static void truncateSchemaTables()
-    {
-        for (String table : ALL)
-            getSchemaCFS(table).truncateBlocking();
-    }
-
-    private static void flushSchemaTables()
-    {
-        for (String table : ALL)
-            SystemKeyspace.forceBlockingFlush(table);
-    }
-
-    /**
-     * Read schema from system keyspace and calculate MD5 digest of every row, resulting digest
-     * will be converted into UUID which would act as content-based version of the schema.
-     */
-    public static UUID calculateSchemaDigest()
-    {
-        MessageDigest digest;
-        try
-        {
-            digest = MessageDigest.getInstance("MD5");
-        }
-        catch (NoSuchAlgorithmException e)
-        {
-            throw new RuntimeException(e);
-        }
-
-        for (String table : ALL)
-        {
-            for (Row partition : getSchemaPartitionsForTable(table))
-            {
-                if (isEmptySchemaPartition(partition) || isSystemKeyspaceSchemaPartition(partition))
-                    continue;
-
-                // we want to digest only live columns
-                ColumnFamilyStore.removeDeletedColumnsOnly(partition.cf, Integer.MAX_VALUE, SecondaryIndexManager.nullUpdater);
-                partition.cf.purgeTombstones(Integer.MAX_VALUE);
-                partition.cf.updateDigest(digest);
-            }
-        }
-
-        return UUID.nameUUIDFromBytes(digest.digest());
-    }
-
-    /**
-     * @param schemaTableName The name of the table responsible for part of the schema
-     * @return CFS responsible to hold low-level serialized schema
-     */
-    private static ColumnFamilyStore getSchemaCFS(String schemaTableName)
-    {
-        return Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(schemaTableName);
-    }
-
-    /**
-     * @param schemaTableName The name of the table responsible for part of the schema.
-     * @return low-level schema representation
-     */
-    private static List<Row> getSchemaPartitionsForTable(String schemaTableName)
-    {
-        Token minToken = StorageService.getPartitioner().getMinimumToken();
-        return getSchemaCFS(schemaTableName).getRangeSlice(new Range<RowPosition>(minToken.minKeyBound(), minToken.maxKeyBound()),
-                                                           null,
-                                                           new IdentityQueryFilter(),
-                                                           Integer.MAX_VALUE,
-                                                           System.currentTimeMillis());
-    }
-
-    public static Collection<Mutation> convertSchemaToMutations()
-    {
-        Map<DecoratedKey, Mutation> mutationMap = new HashMap<>();
-
-        for (String table : ALL)
-            convertSchemaToMutations(mutationMap, table);
-
-        return mutationMap.values();
-    }
-
-    private static void convertSchemaToMutations(Map<DecoratedKey, Mutation> mutationMap, String schemaTableName)
-    {
-        for (Row partition : getSchemaPartitionsForTable(schemaTableName))
-        {
-            if (isSystemKeyspaceSchemaPartition(partition))
-                continue;
-
-            Mutation mutation = mutationMap.get(partition.key);
-            if (mutation == null)
-            {
-                mutation = new Mutation(SystemKeyspace.NAME, partition.key.getKey());
-                mutationMap.put(partition.key, mutation);
-            }
-
-            mutation.add(partition.cf);
-        }
-    }
-
-    private static Map<DecoratedKey, ColumnFamily> readSchemaForKeyspaces(String schemaTableName, Set<String> keyspaceNames)
-    {
-        Map<DecoratedKey, ColumnFamily> schema = new HashMap<>();
-
-        for (String keyspaceName : keyspaceNames)
-        {
-            Row schemaEntity = readSchemaPartitionForKeyspace(schemaTableName, keyspaceName);
-            if (schemaEntity.cf != null)
-                schema.put(schemaEntity.key, schemaEntity.cf);
-        }
-
-        return schema;
-    }
-
-    private static ByteBuffer getSchemaKSKey(String ksName)
-    {
-        return AsciiType.instance.fromString(ksName);
-    }
-
-    private static Row readSchemaPartitionForKeyspace(String schemaTableName, String keyspaceName)
-    {
-        DecoratedKey keyspaceKey = StorageService.getPartitioner().decorateKey(getSchemaKSKey(keyspaceName));
-        return readSchemaPartitionForKeyspace(schemaTableName, keyspaceKey);
-    }
-
-    private static Row readSchemaPartitionForKeyspace(String schemaTableName, DecoratedKey keyspaceKey)
-    {
-        QueryFilter filter = QueryFilter.getIdentityFilter(keyspaceKey, schemaTableName, System.currentTimeMillis());
-        return new Row(keyspaceKey, getSchemaCFS(schemaTableName).getColumnFamily(filter));
-    }
-
-    private static Row readSchemaPartitionForTable(String schemaTableName, String keyspaceName, String tableName)
-    {
-        DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(keyspaceName));
-        ColumnFamilyStore store = getSchemaCFS(schemaTableName);
-        Composite prefix = store.getComparator().make(tableName);
-        ColumnFamily cells = store.getColumnFamily(key, prefix, prefix.end(), false, Integer.MAX_VALUE, System.currentTimeMillis());
-        return new Row(key, cells);
-    }
-
-    private static boolean isEmptySchemaPartition(Row partition)
-    {
-        return partition.cf == null || (partition.cf.isMarkedForDelete() && !partition.cf.hasColumns());
-    }
-
-    private static boolean isSystemKeyspaceSchemaPartition(Row partition)
-    {
-        return getSchemaKSKey(SystemKeyspace.NAME).equals(partition.key.getKey());
-    }
-
-    /**
-     * Merge remote schema in form of mutations with local and mutate ks/cf metadata objects
-     * (which also involves fs operations on add/drop ks/cf)
-     *
-     * @param mutations the schema changes to apply
-     *
-     * @throws ConfigurationException If one of metadata attributes has invalid value
-     * @throws IOException If data was corrupted during transportation or failed to apply fs operations
-     */
-    public static synchronized void mergeSchema(Collection<Mutation> mutations) throws ConfigurationException, IOException
-    {
-        mergeSchema(mutations, true);
-        Schema.instance.updateVersionAndAnnounce();
-    }
-
-    public static synchronized void mergeSchema(Collection<Mutation> mutations, boolean doFlush) throws IOException
-    {
-        // compare before/after schemas of the affected keyspaces only
-        Set<String> keyspaces = new HashSet<>(mutations.size());
-        for (Mutation mutation : mutations)
-            keyspaces.add(ByteBufferUtil.string(mutation.key()));
-
-        // current state of the schema
-        Map<DecoratedKey, ColumnFamily> oldKeyspaces = readSchemaForKeyspaces(KEYSPACES, keyspaces);
-        Map<DecoratedKey, ColumnFamily> oldColumnFamilies = readSchemaForKeyspaces(COLUMNFAMILIES, keyspaces);
-        Map<DecoratedKey, ColumnFamily> oldTypes = readSchemaForKeyspaces(USERTYPES, keyspaces);
-        Map<DecoratedKey, ColumnFamily> oldFunctions = readSchemaForKeyspaces(FUNCTIONS, keyspaces);
-        Map<DecoratedKey, ColumnFamily> oldAggregates = readSchemaForKeyspaces(AGGREGATES, keyspaces);
-
-        for (Mutation mutation : mutations)
-            mutation.apply();
-
-        if (doFlush)
-            flushSchemaTables();
-
-        // with new data applied
-        Map<DecoratedKey, ColumnFamily> newKeyspaces = readSchemaForKeyspaces(KEYSPACES, keyspaces);
-        Map<DecoratedKey, ColumnFamily> newColumnFamilies = readSchemaForKeyspaces(COLUMNFAMILIES, keyspaces);
-        Map<DecoratedKey, ColumnFamily> newTypes = readSchemaForKeyspaces(USERTYPES, keyspaces);
-        Map<DecoratedKey, ColumnFamily> newFunctions = readSchemaForKeyspaces(FUNCTIONS, keyspaces);
-        Map<DecoratedKey, ColumnFamily> newAggregates = readSchemaForKeyspaces(AGGREGATES, keyspaces);
-
-        Set<String> keyspacesToDrop = mergeKeyspaces(oldKeyspaces, newKeyspaces);
-        mergeTables(oldColumnFamilies, newColumnFamilies);
-        mergeTypes(oldTypes, newTypes);
-        mergeFunctions(oldFunctions, newFunctions);
-        mergeAggregates(oldAggregates, newAggregates);
-
-        // it is safe to drop a keyspace only when all nested ColumnFamilies where deleted
-        for (String keyspaceToDrop : keyspacesToDrop)
-            Schema.instance.dropKeyspace(keyspaceToDrop);
-    }
-
-    private static Set<String> mergeKeyspaces(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
-    {
-        List<Row> created = new ArrayList<>();
-        List<String> altered = new ArrayList<>();
-        Set<String> dropped = new HashSet<>();
-
-        /*
-         * - we don't care about entriesOnlyOnLeft() or entriesInCommon(), because only the changes are of interest to us
-         * - of all entriesOnlyOnRight(), we only care about ones that have live columns; it's possible to have a ColumnFamily
-         *   there that only has the top-level deletion, if:
-         *      a) a pushed DROP KEYSPACE change for a keyspace hadn't ever made it to this node in the first place
-         *      b) a pulled dropped keyspace that got dropped before it could find a way to this node
-         * - of entriesDiffering(), we don't care about the scenario where both pre and post-values have zero live columns:
-         *   that means that a keyspace had been recreated and dropped, and the recreated keyspace had never found a way
-         *   to this node
-         */
-        MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
-
-        for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().hasColumns())
-                created.add(new Row(entry.getKey(), entry.getValue()));
-
-        for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
-        {
-            String keyspaceName = AsciiType.instance.compose(entry.getKey().getKey());
-
-            ColumnFamily pre  = entry.getValue().leftValue();
-            ColumnFamily post = entry.getValue().rightValue();
-
-            if (pre.hasColumns() && post.hasColumns())
-                altered.add(keyspaceName);
-            else if (pre.hasColumns())
-                dropped.add(keyspaceName);
-            else if (post.hasColumns()) // a (re)created keyspace
-                created.add(new Row(entry.getKey(), post));
-        }
-
-        for (Row row : created)
-            Schema.instance.addKeyspace(createKeyspaceFromSchemaPartition(row));
-        for (String name : altered)
-            Schema.instance.updateKeyspace(name);
-        return dropped;
-    }
-
-    // see the comments for mergeKeyspaces()
-    private static void mergeTables(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
-    {
-        List<CFMetaData> created = new ArrayList<>();
-        List<CFMetaData> altered = new ArrayList<>();
-        List<CFMetaData> dropped = new ArrayList<>();
-
-        MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
-
-        for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().hasColumns())
-                created.addAll(createTablesFromTablesPartition(new Row(entry.getKey(), entry.getValue())).values());
-
-        for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
-        {
-            String keyspaceName = AsciiType.instance.compose(entry.getKey().getKey());
-
-            ColumnFamily pre  = entry.getValue().leftValue();
-            ColumnFamily post = entry.getValue().rightValue();
-
-            if (pre.hasColumns() && post.hasColumns())
-            {
-                MapDifference<String, CFMetaData> delta =
-                    Maps.difference(Schema.instance.getKSMetaData(keyspaceName).cfMetaData(),
-                                    createTablesFromTablesPartition(new Row(entry.getKey(), post)));
-
-                dropped.addAll(delta.entriesOnlyOnLeft().values());
-                created.addAll(delta.entriesOnlyOnRight().values());
-                Iterables.addAll(altered, Iterables.transform(delta.entriesDiffering().values(), new Function<MapDifference.ValueDifference<CFMetaData>, CFMetaData>()
-                {
-                    public CFMetaData apply(MapDifference.ValueDifference<CFMetaData> pair)
-                    {
-                        return pair.rightValue();
-                    }
-                }));
-            }
-            else if (pre.hasColumns())
-            {
-                dropped.addAll(Schema.instance.getKSMetaData(keyspaceName).cfMetaData().values());
-            }
-            else if (post.hasColumns())
-            {
-                created.addAll(createTablesFromTablesPartition(new Row(entry.getKey(), post)).values());
-            }
-        }
-
-        for (CFMetaData cfm : created)
-            Schema.instance.addTable(cfm);
-        for (CFMetaData cfm : altered)
-            Schema.instance.updateTable(cfm.ksName, cfm.cfName);
-        for (CFMetaData cfm : dropped)
-            Schema.instance.dropTable(cfm.ksName, cfm.cfName);
-    }
-
-    // see the comments for mergeKeyspaces()
-    private static void mergeTypes(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
-    {
-        List<UserType> created = new ArrayList<>();
-        List<UserType> altered = new ArrayList<>();
-        List<UserType> dropped = new ArrayList<>();
-
-        MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
-
-        // New keyspace with types
-        for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().hasColumns())
-                created.addAll(createTypesFromPartition(new Row(entry.getKey(), entry.getValue())).values());
-
-        for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
-        {
-            String keyspaceName = AsciiType.instance.compose(entry.getKey().getKey());
-
-            ColumnFamily pre  = entry.getValue().leftValue();
-            ColumnFamily post = entry.getValue().rightValue();
-
-            if (pre.hasColumns() && post.hasColumns())
-            {
-                MapDifference<ByteBuffer, UserType> delta =
-                    Maps.difference(Schema.instance.getKSMetaData(keyspaceName).userTypes.getAllTypes(),
-                                    createTypesFromPartition(new Row(entry.getKey(), post)));
-
-                dropped.addAll(delta.entriesOnlyOnLeft().values());
-                created.addAll(delta.entriesOnlyOnRight().values());
-                Iterables.addAll(altered, Iterables.transform(delta.entriesDiffering().values(), new Function<MapDifference.ValueDifference<UserType>, UserType>()
-                {
-                    public UserType apply(MapDifference.ValueDifference<UserType> pair)
-                    {
-                        return pair.rightValue();
-                    }
-                }));
-            }
-            else if (pre.hasColumns())
-            {
-                dropped.addAll(Schema.instance.getKSMetaData(keyspaceName).userTypes.getAllTypes().values());
-            }
-            else if (post.hasColumns())
-            {
-                created.addAll(createTypesFromPartition(new Row(entry.getKey(), post)).values());
-            }
-        }
-
-        for (UserType type : created)
-            Schema.instance.addType(type);
-        for (UserType type : altered)
-            Schema.instance.updateType(type);
-        for (UserType type : dropped)
-            Schema.instance.dropType(type);
-    }
-
-    // see the comments for mergeKeyspaces()
-    private static void mergeFunctions(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
-    {
-        List<UDFunction> created = new ArrayList<>();
-        List<UDFunction> altered = new ArrayList<>();
-        List<UDFunction> dropped = new ArrayList<>();
-
-        MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
-
-        // New keyspace with functions
-        for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().hasColumns())
-                created.addAll(createFunctionsFromFunctionsPartition(new Row(entry.getKey(), entry.getValue())).values());
-
-        for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
-        {
-            ColumnFamily pre = entry.getValue().leftValue();
-            ColumnFamily post = entry.getValue().rightValue();
-
-            if (pre.hasColumns() && post.hasColumns())
-            {
-                MapDifference<ByteBuffer, UDFunction> delta =
-                    Maps.difference(createFunctionsFromFunctionsPartition(new Row(entry.getKey(), pre)),
-                                    createFunctionsFromFunctionsPartition(new Row(entry.getKey(), post)));
-
-                dropped.addAll(delta.entriesOnlyOnLeft().values());
-                created.addAll(delta.entriesOnlyOnRight().values());
-                Iterables.addAll(altered, Iterables.transform(delta.entriesDiffering().values(), new Function<MapDifference.ValueDifference<UDFunction>, UDFunction>()
-                {
-                    public UDFunction apply(MapDifference.ValueDifference<UDFunction> pair)
-                    {
-                        return pair.rightValue();
-                    }
-                }));
-            }
-            else if (pre.hasColumns())
-            {
-                dropped.addAll(createFunctionsFromFunctionsPartition(new Row(entry.getKey(), pre)).values());
-            }
-            else if (post.hasColumns())
-            {
-                created.addAll(createFunctionsFromFunctionsPartition(new Row(entry.getKey(), post)).values());
-            }
-        }
-
-        for (UDFunction udf : created)
-            Schema.instance.addFunction(udf);
-        for (UDFunction udf : altered)
-            Schema.instance.updateFunction(udf);
-        for (UDFunction udf : dropped)
-            Schema.instance.dropFunction(udf);
-    }
-
-    // see the comments for mergeKeyspaces()
-    private static void mergeAggregates(Map<DecoratedKey, ColumnFamily> before, Map<DecoratedKey, ColumnFamily> after)
-    {
-        List<UDAggregate> created = new ArrayList<>();
-        List<UDAggregate> altered = new ArrayList<>();
-        List<UDAggregate> dropped = new ArrayList<>();
-
-        MapDifference<DecoratedKey, ColumnFamily> diff = Maps.difference(before, after);
-
-        // New keyspace with functions
-        for (Map.Entry<DecoratedKey, ColumnFamily> entry : diff.entriesOnlyOnRight().entrySet())
-            if (entry.getValue().hasColumns())
-                created.addAll(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), entry.getValue())).values());
-
-        for (Map.Entry<DecoratedKey, MapDifference.ValueDifference<ColumnFamily>> entry : diff.entriesDiffering().entrySet())
-        {
-            ColumnFamily pre = entry.getValue().leftValue();
-            ColumnFamily post = entry.getValue().rightValue();
-
-            if (pre.hasColumns() && post.hasColumns())
-            {
-                MapDifference<ByteBuffer, UDAggregate> delta =
-                    Maps.difference(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), pre)),
-                                    createAggregatesFromAggregatesPartition(new Row(entry.getKey(), post)));
-
-                dropped.addAll(delta.entriesOnlyOnLeft().values());
-                created.addAll(delta.entriesOnlyOnRight().values());
-                Iterables.addAll(altered, Iterables.transform(delta.entriesDiffering().values(), new Function<MapDifference.ValueDifference<UDAggregate>, UDAggregate>()
-                {
-                    public UDAggregate apply(MapDifference.ValueDifference<UDAggregate> pair)
-                    {
-                        return pair.rightValue();
-                    }
-                }));
-            }
-            else if (pre.hasColumns())
-            {
-                dropped.addAll(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), pre)).values());
-            }
-            else if (post.hasColumns())
-            {
-                created.addAll(createAggregatesFromAggregatesPartition(new Row(entry.getKey(), post)).values());
-            }
-        }
-
-        for (UDAggregate udf : created)
-            Schema.instance.addAggregate(udf);
-        for (UDAggregate udf : altered)
-            Schema.instance.updateAggregate(udf);
-        for (UDAggregate udf : dropped)
-            Schema.instance.dropAggregate(udf);
-    }
-
-    /*
-     * Keyspace metadata serialization/deserialization.
-     */
-
-    public static Mutation makeCreateKeyspaceMutation(KSMetaData keyspace, long timestamp)
-    {
-        return makeCreateKeyspaceMutation(keyspace, timestamp, true);
-    }
-
-    private static Mutation makeCreateKeyspaceMutation(KSMetaData keyspace, long timestamp, boolean withTablesAndTypesAndFunctions)
-    {
-        Mutation mutation = new Mutation(SystemKeyspace.NAME, getSchemaKSKey(keyspace.name));
-        ColumnFamily cells = mutation.addOrGet(Keyspaces);
-        CFRowAdder adder = new CFRowAdder(cells, Keyspaces.comparator.builder().build(), timestamp);
-
-        adder.add("durable_writes", keyspace.durableWrites);
-        adder.add("strategy_class", keyspace.strategyClass.getName());
-        adder.add("strategy_options", json(keyspace.strategyOptions));
-
-        if (withTablesAndTypesAndFunctions)
-        {
-            for (UserType type : keyspace.userTypes.getAllTypes().values())
-                addTypeToSchemaMutation(type, timestamp, mutation);
-
-            for (CFMetaData table : keyspace.cfMetaData().values())
-                addTableToSchemaMutation(table, timestamp, true, mutation);
-        }
-
-        return mutation;
-    }
-
-    public static Mutation makeDropKeyspaceMutation(KSMetaData keyspace, long timestamp)
-    {
-        Mutation mutation = new Mutation(SystemKeyspace.NAME, getSchemaKSKey(keyspace.name));
-        for (String schemaTable : ALL)
-            mutation.delete(schemaTable, timestamp);
-        mutation.delete(SystemKeyspace.BUILT_INDEXES, timestamp);
-        return mutation;
-    }
-
-    private static KSMetaData createKeyspaceFromSchemaPartitions(Row serializedKeyspace, Row serializedTables, Row serializedTypes)
-    {
-        Collection<CFMetaData> tables = createTablesFromTablesPartition(serializedTables).values();
-        UTMetaData types = new UTMetaData(createTypesFromPartition(serializedTypes));
-        return createKeyspaceFromSchemaPartition(serializedKeyspace).cloneWith(tables, types);
-    }
-
-    public static KSMetaData createKeyspaceFromName(String keyspace)
-    {
-        Row partition = readSchemaPartitionForKeyspace(KEYSPACES, keyspace);
-
-        if (isEmptySchemaPartition(partition))
-            throw new RuntimeException(String.format("%s not found in the schema definitions keyspaceName (%s).", keyspace, KEYSPACES));
-
-        return createKeyspaceFromSchemaPartition(partition);
-    }
-
-    /**
-     * Deserialize only Keyspace attributes without nested tables or types
-     *
-     * @param partition Keyspace attributes in serialized form
-     */
-    private static KSMetaData createKeyspaceFromSchemaPartition(Row partition)
-    {
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, KEYSPACES);
-        UntypedResultSet.Row row = QueryProcessor.resultify(query, partition).one();
-        return new KSMetaData(row.getString("keyspace_name"),
-                              AbstractReplicationStrategy.getClass(row.getString("strategy_class")),
-                              fromJsonMap(row.getString("strategy_options")),
-                              row.getBoolean("durable_writes"));
-    }
-
-    /*
-     * User type metadata serialization/deserialization.
-     */
-
-    public static Mutation makeCreateTypeMutation(KSMetaData keyspace, UserType type, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-        addTypeToSchemaMutation(type, timestamp, mutation);
-        return mutation;
-    }
-
-    private static void addTypeToSchemaMutation(UserType type, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Usertypes);
-
-        Composite prefix = Usertypes.comparator.make(type.name);
-        CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-
-        adder.resetCollection("field_names");
-        adder.resetCollection("field_types");
-
-        for (int i = 0; i < type.size(); i++)
-        {
-            adder.addListEntry("field_names", type.fieldName(i));
-            adder.addListEntry("field_types", type.fieldType(i).toString());
-        }
-    }
-
-    public static Mutation dropTypeFromSchemaMutation(KSMetaData keyspace, UserType type, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-
-        ColumnFamily cells = mutation.addOrGet(Usertypes);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
-
-        Composite prefix = Usertypes.comparator.make(type.name);
-        cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
-
-        return mutation;
-    }
-
-    private static Map<ByteBuffer, UserType> createTypesFromPartition(Row partition)
-    {
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, USERTYPES);
-        Map<ByteBuffer, UserType> types = new HashMap<>();
-        for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition))
-        {
-            UserType type = createTypeFromRow(row);
-            types.put(type.name, type);
-        }
-        return types;
-    }
-
-    private static UserType createTypeFromRow(UntypedResultSet.Row row)
-    {
-        String keyspace = row.getString("keyspace_name");
-        ByteBuffer name = ByteBufferUtil.bytes(row.getString("type_name"));
-        List<String> rawColumns = row.getList("field_names", UTF8Type.instance);
-        List<String> rawTypes = row.getList("field_types", UTF8Type.instance);
-
-        List<ByteBuffer> columns = new ArrayList<>(rawColumns.size());
-        for (String rawColumn : rawColumns)
-            columns.add(ByteBufferUtil.bytes(rawColumn));
-
-        List<AbstractType<?>> types = new ArrayList<>(rawTypes.size());
-        for (String rawType : rawTypes)
-            types.add(parseType(rawType));
-
-        return new UserType(keyspace, name, columns, types);
-    }
-
-    /*
-     * Table metadata serialization/deserialization.
-     */
-
-    public static Mutation makeCreateTableMutation(KSMetaData keyspace, CFMetaData table, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-        addTableToSchemaMutation(table, timestamp, true, mutation);
-        return mutation;
-    }
-
-    private static void addTableToSchemaMutation(CFMetaData table, long timestamp, boolean withColumnsAndTriggers, Mutation mutation)
-    {
-        // For property that can be null (and can be changed), we insert tombstones, to make sure
-        // we don't keep a property the user has removed
-        ColumnFamily cells = mutation.addOrGet(Columnfamilies);
-        Composite prefix = Columnfamilies.comparator.make(table.cfName);
-        CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-
-        adder.add("cf_id", table.cfId);
-        adder.add("type", table.cfType.toString());
-
-        if (table.isSuper())
-        {
-            // We need to continue saving the comparator and subcomparator separatly, otherwise
-            // we won't know at deserialization if the subcomparator should be taken into account
-            // TODO: we should implement an on-start migration if we want to get rid of that.
-            adder.add("comparator", table.comparator.subtype(0).toString());
-            adder.add("subcomparator", table.comparator.subtype(1).toString());
-        }
-        else
-        {
-            adder.add("comparator", table.comparator.toString());
-        }
-
-        adder.add("bloom_filter_fp_chance", table.getBloomFilterFpChance());
-        adder.add("caching", table.getCaching().toString());
-        adder.add("comment", table.getComment());
-        adder.add("compaction_strategy_class", table.compactionStrategyClass.getName());
-        adder.add("compaction_strategy_options", json(table.compactionStrategyOptions));
-        adder.add("compression_parameters", json(table.compressionParameters.asThriftOptions()));
-        adder.add("default_time_to_live", table.getDefaultTimeToLive());
-        adder.add("default_validator", table.getDefaultValidator().toString());
-        adder.add("gc_grace_seconds", table.getGcGraceSeconds());
-        adder.add("key_validator", table.getKeyValidator().toString());
-        adder.add("local_read_repair_chance", table.getDcLocalReadRepairChance());
-        adder.add("max_compaction_threshold", table.getMaxCompactionThreshold());
-        adder.add("max_index_interval", table.getMaxIndexInterval());
-        adder.add("memtable_flush_period_in_ms", table.getMemtableFlushPeriod());
-        adder.add("min_compaction_threshold", table.getMinCompactionThreshold());
-        adder.add("min_index_interval", table.getMinIndexInterval());
-        adder.add("read_repair_chance", table.getReadRepairChance());
-        adder.add("speculative_retry", table.getSpeculativeRetry().toString());
-
-        for (Map.Entry<ColumnIdentifier, Long> entry : table.getDroppedColumns().entrySet())
-            adder.addMapEntry("dropped_columns", entry.getKey().toString(), entry.getValue());
-
-        adder.add("is_dense", table.getIsDense());
-
-        if (withColumnsAndTriggers)
-        {
-            for (ColumnDefinition column : table.allColumns())
-                addColumnToSchemaMutation(table, column, timestamp, mutation);
-
-            for (TriggerDefinition trigger : table.getTriggers().values())
-                addTriggerToSchemaMutation(table, trigger, timestamp, mutation);
-        }
-    }
-
-    public static Mutation makeUpdateTableMutation(KSMetaData keyspace,
-                                                   CFMetaData oldTable,
-                                                   CFMetaData newTable,
-                                                   long timestamp)
-    {
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-
-        addTableToSchemaMutation(newTable, timestamp, false, mutation);
-
-        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(oldTable.getColumnMetadata(),
-                                                                                 newTable.getColumnMetadata());
-
-        // columns that are no longer needed
-        for (ColumnDefinition column : columnDiff.entriesOnlyOnLeft().values())
-            dropColumnFromSchemaMutation(oldTable, column, timestamp, mutation);
-
-        // newly added columns
-        for (ColumnDefinition column : columnDiff.entriesOnlyOnRight().values())
-            addColumnToSchemaMutation(newTable, column, timestamp, mutation);
-
-        // old columns with updated attributes
-        for (ByteBuffer name : columnDiff.entriesDiffering().keySet())
-            addColumnToSchemaMutation(newTable, newTable.getColumnDefinition(name), timestamp, mutation);
-
-        MapDifference<String, TriggerDefinition> triggerDiff = Maps.difference(oldTable.getTriggers(), newTable.getTriggers());
-
-        // dropped triggers
-        for (TriggerDefinition trigger : triggerDiff.entriesOnlyOnLeft().values())
-            dropTriggerFromSchemaMutation(oldTable, trigger, timestamp, mutation);
-
-        // newly created triggers
-        for (TriggerDefinition trigger : triggerDiff.entriesOnlyOnRight().values())
-            addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation);
-
-        return mutation;
-    }
-
-    public static Mutation makeDropTableMutation(KSMetaData keyspace, CFMetaData table, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-
-        ColumnFamily cells = mutation.addOrGet(Columnfamilies);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
-
-        Composite prefix = Columnfamilies.comparator.make(table.cfName);
-        cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
-
-        for (ColumnDefinition column : table.allColumns())
-            dropColumnFromSchemaMutation(table, column, timestamp, mutation);
-
-        for (TriggerDefinition trigger : table.getTriggers().values())
-            dropTriggerFromSchemaMutation(table, trigger, timestamp, mutation);
-
-        // TODO: get rid of in #6717
-        ColumnFamily indexCells = mutation.addOrGet(SystemKeyspace.BuiltIndexes);
-        for (String indexName : Keyspace.open(keyspace.name).getColumnFamilyStore(table.cfName).getBuiltIndexes())
-            indexCells.addTombstone(indexCells.getComparator().makeCellName(indexName), ldt, timestamp);
-
-        return mutation;
-    }
-
-    public static CFMetaData createTableFromName(String keyspace, String table)
-    {
-        Row partition = readSchemaPartitionForTable(COLUMNFAMILIES, keyspace, table);
-
-        if (isEmptySchemaPartition(partition))
-            throw new RuntimeException(String.format("%s:%s not found in the schema definitions keyspace.", keyspace, table));
-
-        return createTableFromTablePartition(partition);
-    }
-
-    /**
-     * Deserialize tables from low-level schema representation, all of them belong to the same keyspace
-     *
-     * @return map containing name of the table and its metadata for faster lookup
-     */
-    private static Map<String, CFMetaData> createTablesFromTablesPartition(Row partition)
-    {
-        if (partition.cf == null)
-            return Collections.emptyMap();
-
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, COLUMNFAMILIES);
-        Map<String, CFMetaData> tables = new HashMap<>();
-        for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition))
-        {
-            CFMetaData cfm = createTableFromTableRow(row);
-            tables.put(cfm.cfName, cfm);
-        }
-        return tables;
-    }
-
-    public static CFMetaData createTableFromTablePartitionAndColumnsPartition(Row serializedTable, Row serializedColumns)
-    {
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, COLUMNFAMILIES);
-        return createTableFromTableRowAndColumnsPartition(QueryProcessor.resultify(query, serializedTable).one(), serializedColumns);
-    }
-
-    private static CFMetaData createTableFromTableRowAndColumnsPartition(UntypedResultSet.Row tableRow, Row serializedColumns)
-    {
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, COLUMNS);
-        return createTableFromTableRowAndColumnRows(tableRow, QueryProcessor.resultify(query, serializedColumns));
-    }
-
-    private static CFMetaData createTableFromTablePartition(Row row)
-    {
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, COLUMNFAMILIES);
-        return createTableFromTableRow(QueryProcessor.resultify(query, row).one());
-    }
-
-    /**
-     * Deserialize table metadata from low-level representation
-     *
-     * @return Metadata deserialized from schema
-     */
-    private static CFMetaData createTableFromTableRow(UntypedResultSet.Row result)
-    {
-        String ksName = result.getString("keyspace_name");
-        String cfName = result.getString("columnfamily_name");
-
-        Row serializedColumns = readSchemaPartitionForTable(COLUMNS, ksName, cfName);
-        CFMetaData cfm = createTableFromTableRowAndColumnsPartition(result, serializedColumns);
-
-        Row serializedTriggers = readSchemaPartitionForTable(TRIGGERS, ksName, cfName);
-        for (TriggerDefinition trigger : createTriggersFromTriggersPartition(serializedTriggers))
-            cfm.addTriggerDefinition(trigger);
-
-        return cfm;
-    }
-
-    public static CFMetaData createTableFromTableRowAndColumnRows(UntypedResultSet.Row result,
-                                                                  UntypedResultSet serializedColumnDefinitions)
-    {
-        String ksName = result.getString("keyspace_name");
-        String cfName = result.getString("columnfamily_name");
-
-        AbstractType<?> rawComparator = TypeParser.parse(result.getString("comparator"));
-        AbstractType<?> subComparator = result.has("subcomparator") ? TypeParser.parse(result.getString("subcomparator")) : null;
-        ColumnFamilyType cfType = ColumnFamilyType.valueOf(result.getString("type"));
-
-        AbstractType<?> fullRawComparator = CFMetaData.makeRawAbstractType(rawComparator, subComparator);
-
-        List<ColumnDefinition> columnDefs = createColumnsFromColumnRows(serializedColumnDefinitions,
-                                                                        ksName,
-                                                                        cfName,
-                                                                        fullRawComparator,
-                                                                        cfType == ColumnFamilyType.Super);
-
-        boolean isDense = result.has("is_dense")
-                        ? result.getBoolean("is_dense")
-                        : CFMetaData.calculateIsDense(fullRawComparator, columnDefs);
-
-        CellNameType comparator = CellNames.fromAbstractType(fullRawComparator, isDense);
-
-        // if we are upgrading, we use id generated from names initially
-        UUID cfId = result.has("cf_id")
-                  ? result.getUUID("cf_id")
-                  : CFMetaData.generateLegacyCfId(ksName, cfName);
-
-        CFMetaData cfm = new CFMetaData(ksName, cfName, cfType, comparator, cfId);
-        cfm.isDense(isDense);
-
-        cfm.readRepairChance(result.getDouble("read_repair_chance"));
-        cfm.dcLocalReadRepairChance(result.getDouble("local_read_repair_chance"));
-        cfm.gcGraceSeconds(result.getInt("gc_grace_seconds"));
-        cfm.defaultValidator(TypeParser.parse(result.getString("default_validator")));
-        cfm.keyValidator(TypeParser.parse(result.getString("key_validator")));
-        cfm.minCompactionThreshold(result.getInt("min_compaction_threshold"));
-        cfm.maxCompactionThreshold(result.getInt("max_compaction_threshold"));
-        if (result.has("comment"))
-            cfm.comment(result.getString("comment"));
-        if (result.has("memtable_flush_period_in_ms"))
-            cfm.memtableFlushPeriod(result.getInt("memtable_flush_period_in_ms"));
-        cfm.caching(CachingOptions.fromString(result.getString("caching")));
-        if (result.has("default_time_to_live"))
-            cfm.defaultTimeToLive(result.getInt("default_time_to_live"));
-        if (result.has("speculative_retry"))
-            cfm.speculativeRetry(CFMetaData.SpeculativeRetry.fromString(result.getString("speculative_retry")));
-        cfm.compactionStrategyClass(CFMetaData.createCompactionStrategy(result.getString("compaction_strategy_class")));
-        cfm.compressionParameters(CompressionParameters.create(fromJsonMap(result.getString("compression_parameters"))));
-        cfm.compactionStrategyOptions(fromJsonMap(result.getString("compaction_strategy_options")));
-
-        if (result.has("min_index_interval"))
-            cfm.minIndexInterval(result.getInt("min_index_interval"));
-
-        if (result.has("max_index_interval"))
-            cfm.maxIndexInterval(result.getInt("max_index_interval"));
-
-        if (result.has("bloom_filter_fp_chance"))
-            cfm.bloomFilterFpChance(result.getDouble("bloom_filter_fp_chance"));
-        else
-            cfm.bloomFilterFpChance(cfm.getBloomFilterFpChance());
-
-        if (result.has("dropped_columns"))
-            cfm.droppedColumns(convertDroppedColumns(result.getMap("dropped_columns", UTF8Type.instance, LongType.instance)));
-
-        for (ColumnDefinition cd : columnDefs)
-            cfm.addOrReplaceColumnDefinition(cd);
-
-        return cfm.rebuild();
-    }
-
-    private static Map<ColumnIdentifier, Long> convertDroppedColumns(Map<String, Long> raw)
-    {
-        Map<ColumnIdentifier, Long> converted = Maps.newHashMap();
-        for (Map.Entry<String, Long> entry : raw.entrySet())
-            converted.put(new ColumnIdentifier(entry.getKey(), true), entry.getValue());
-        return converted;
-    }
-
-    /*
-     * Column metadata serialization/deserialization.
-     */
-
-    private static void addColumnToSchemaMutation(CFMetaData table, ColumnDefinition column, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Columns);
-        Composite prefix = Columns.comparator.make(table.cfName, column.name.toString());
-        CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-
-        adder.add("validator", column.type.toString());
-        adder.add("type", serializeKind(column.kind));
-        adder.add("component_index", column.isOnAllComponents() ? null : column.position());
-        adder.add("index_name", column.getIndexName());
-        adder.add("index_type", column.getIndexType() == null ? null : column.getIndexType().toString());
-        adder.add("index_options", json(column.getIndexOptions()));
-    }
-
-    private static String serializeKind(ColumnDefinition.Kind kind)
-    {
-        // For backward compatibility we need to special case CLUSTERING_COLUMN
-        return kind == ColumnDefinition.Kind.CLUSTERING_COLUMN ? "clustering_key" : kind.toString().toLowerCase();
-    }
-
-    private static ColumnDefinition.Kind deserializeKind(String kind)
-    {
-        if (kind.equalsIgnoreCase("clustering_key"))
-            return ColumnDefinition.Kind.CLUSTERING_COLUMN;
-        return Enum.valueOf(ColumnDefinition.Kind.class, kind.toUpperCase());
-    }
-
-    private static void dropColumnFromSchemaMutation(CFMetaData table, ColumnDefinition column, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Columns);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
-
-        // Note: we do want to use name.toString(), not name.bytes directly for backward compatibility (For CQL3, this won't make a difference).
-        Composite prefix = Columns.comparator.make(table.cfName, column.name.toString());
-        cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
-    }
-
-    private static List<ColumnDefinition> createColumnsFromColumnRows(UntypedResultSet rows,
-                                                                      String keyspace,
-                                                                      String table,
-                                                                      AbstractType<?> rawComparator,
-                                                                      boolean isSuper)
-    {
-        List<ColumnDefinition> columns = new ArrayList<>();
-        for (UntypedResultSet.Row row : rows)
-            columns.add(createColumnFromColumnRow(row, keyspace, table, rawComparator, isSuper));
-        return columns;
-    }
-
-    private static ColumnDefinition createColumnFromColumnRow(UntypedResultSet.Row row,
-                                                              String keyspace,
-                                                              String table,
-                                                              AbstractType<?> rawComparator,
-                                                              boolean isSuper)
-    {
-        ColumnDefinition.Kind kind = deserializeKind(row.getString("type"));
-
-        Integer componentIndex = null;
-        if (kind == ColumnDefinition.Kind.REGULAR && isSuper)
-            componentIndex = 1; // A ColumnDefinition for super columns applies to the column component
-        else if (row.has("component_index"))
-            componentIndex = row.getInt("component_index");
-
-        // Note: we save the column name as string, but we should not assume that it is an UTF8 name, we
-        // we need to use the comparator fromString method
-        AbstractType<?> comparator = kind == ColumnDefinition.Kind.REGULAR
-                                   ? getComponentComparator(rawComparator, componentIndex)
-                                   : UTF8Type.instance;
-        ColumnIdentifier name = new ColumnIdentifier(comparator.fromString(row.getString("column_name")), comparator);
-
-        AbstractType<?> validator = parseType(row.getString("validator"));
-
-        IndexType indexType = null;
-        if (row.has("index_type"))
-            indexType = IndexType.valueOf(row.getString("index_type"));
-
-        Map<String, String> indexOptions = null;
-        if (row.has("index_options"))
-            indexOptions = fromJsonMap(row.getString("index_options"));
-
-        String indexName = null;
-        if (row.has("index_name"))
-            indexName = row.getString("index_name");
-
-        return new ColumnDefinition(keyspace, table, name, validator, indexType, indexOptions, indexName, componentIndex, kind);
-    }
-
-    private static AbstractType<?> getComponentComparator(AbstractType<?> rawComparator, Integer componentIndex)
-    {
-        return (componentIndex == null || (componentIndex == 0 && !(rawComparator instanceof CompositeType)))
-               ? rawComparator
-               : ((CompositeType)rawComparator).types.get(componentIndex);
-    }
-
-    /*
-     * Trigger metadata serialization/deserialization.
-     */
-
-    private static void addTriggerToSchemaMutation(CFMetaData table, TriggerDefinition trigger, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Triggers);
-        Composite prefix = Triggers.comparator.make(table.cfName, trigger.name);
-        CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-        adder.addMapEntry("trigger_options", "class", trigger.classOption);
-    }
-
-    private static void dropTriggerFromSchemaMutation(CFMetaData table, TriggerDefinition trigger, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Triggers);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
-
-        Composite prefix = Triggers.comparator.make(table.cfName, trigger.name);
-        cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
-    }
-
-    /**
-     * Deserialize triggers from storage-level representation.
-     *
-     * @param partition storage-level partition containing the trigger definitions
-     * @return the list of processed TriggerDefinitions
-     */
-    private static List<TriggerDefinition> createTriggersFromTriggersPartition(Row partition)
-    {
-        List<TriggerDefinition> triggers = new ArrayList<>();
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, TRIGGERS);
-        for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition))
-        {
-            String name = row.getString("trigger_name");
-            String classOption = row.getMap("trigger_options", UTF8Type.instance, UTF8Type.instance).get("class");
-            triggers.add(new TriggerDefinition(name, classOption));
-        }
-        return triggers;
-    }
-
-    /*
-     * UDF metadata serialization/deserialization.
-     */
-
-    public static Mutation makeCreateFunctionMutation(KSMetaData keyspace, UDFunction function, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-        addFunctionToSchemaMutation(function, timestamp, mutation);
-        return mutation;
-    }
-
-    private static void addFunctionToSchemaMutation(UDFunction function, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Functions);
-        Composite prefix = Functions.comparator.make(function.name().name, functionSignatureWithTypes(function));
-        CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-
-        adder.resetCollection("argument_names");
-        adder.resetCollection("argument_types");
-
-        for (int i = 0; i < function.argNames().size(); i++)
-        {
-            adder.addListEntry("argument_names", function.argNames().get(i).bytes);
-            adder.addListEntry("argument_types", function.argTypes().get(i).toString());
-        }
-
-        adder.add("body", function.body());
-        adder.add("language", function.language());
-        adder.add("return_type", function.returnType().toString());
-        adder.add("called_on_null_input", function.isCalledOnNullInput());
-    }
-
-    public static Mutation makeDropFunctionMutation(KSMetaData keyspace, UDFunction function, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-
-        ColumnFamily cells = mutation.addOrGet(Functions);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
-
-        Composite prefix = Functions.comparator.make(function.name().name, functionSignatureWithTypes(function));
-        cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
-
-        return mutation;
-    }
-
-    private static Map<ByteBuffer, UDFunction> createFunctionsFromFunctionsPartition(Row partition)
-    {
-        Map<ByteBuffer, UDFunction> functions = new HashMap<>();
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, FUNCTIONS);
-        for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition))
-        {
-            UDFunction function = createFunctionFromFunctionRow(row);
-            functions.put(functionSignatureWithNameAndTypes(function), function);
-        }
-        return functions;
-    }
-
-    private static UDFunction createFunctionFromFunctionRow(UntypedResultSet.Row row)
-    {
-        String ksName = row.getString("keyspace_name");
-        String functionName = row.getString("function_name");
-        FunctionName name = new FunctionName(ksName, functionName);
-
-        List<ColumnIdentifier> argNames = new ArrayList<>();
-        if (row.has("argument_names"))
-            for (String arg : row.getList("argument_names", UTF8Type.instance))
-                argNames.add(new ColumnIdentifier(arg, true));
-
-        List<AbstractType<?>> argTypes = new ArrayList<>();
-        if (row.has("argument_types"))
-            for (String type : row.getList("argument_types", UTF8Type.instance))
-                argTypes.add(parseType(type));
-
-        AbstractType<?> returnType = parseType(row.getString("return_type"));
-
-        String language = row.getString("language");
-        String body = row.getString("body");
-        boolean calledOnNullInput = row.getBoolean("called_on_null_input");
-
-        org.apache.cassandra.cql3.functions.Function existing = org.apache.cassandra.cql3.functions.Functions.find(name, argTypes);
-        if (existing instanceof UDFunction)
-        {
-            // This check prevents duplicate compilation of effectively the same UDF.
-            // Duplicate compilation attempts can occur on the coordinator node handling the CREATE FUNCTION
-            // statement, since CreateFunctionStatement needs to execute UDFunction.create but schema migration
-            // also needs that (since it needs to handle its own change).
-            UDFunction udf = (UDFunction) existing;
-            if (udf.argNames().equals(argNames) && // arg types checked in Functions.find call
-                udf.returnType().equals(returnType) &&
-                !udf.isAggregate() &&
-                udf.language().equals(language) &&
-                udf.body().equals(body) &&
-                udf.isCalledOnNullInput() == calledOnNullInput)
-            {
-                logger.trace("Skipping duplicate compilation of already existing UDF {}", name);
-                return udf;
-            }
-        }
-
-        try
-        {
-            return UDFunction.create(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
-        }
-        catch (InvalidRequestException e)
-        {
-            logger.error(String.format("Cannot load function '%s' from schema: this function won't be available (on this node)", name), e);
-            return UDFunction.createBrokenFunction(name, argNames, argTypes, returnType, calledOnNullInput, language, body, e);
-        }
-    }
-
-    /*
-     * Aggregate UDF metadata serialization/deserialization.
-     */
-
-    public static Mutation makeCreateAggregateMutation(KSMetaData keyspace, UDAggregate aggregate, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-        addAggregateToSchemaMutation(aggregate, timestamp, mutation);
-        return mutation;
-    }
-
-    private static void addAggregateToSchemaMutation(UDAggregate aggregate, long timestamp, Mutation mutation)
-    {
-        ColumnFamily cells = mutation.addOrGet(Aggregates);
-        Composite prefix = Aggregates.comparator.make(aggregate.name().name, functionSignatureWithTypes(aggregate));
-        CFRowAdder adder = new CFRowAdder(cells, prefix, timestamp);
-
-        adder.resetCollection("argument_types");
-        adder.add("return_type", aggregate.returnType().toString());
-        adder.add("state_func", aggregate.stateFunction().name().name);
-        adder.add("state_type", aggregate.stateType().toString());
-        adder.add("final_func", aggregate.finalFunction() != null ? aggregate.finalFunction().name().name : null);
-        adder.add("initcond", aggregate.initialCondition() != null ? aggregate.initialCondition() : null);
-
-        for (AbstractType<?> argType : aggregate.argTypes())
-            adder.addListEntry("argument_types", argType.toString());
-    }
-
-    private static Map<ByteBuffer, UDAggregate> createAggregatesFromAggregatesPartition(Row partition)
-    {
-        Map<ByteBuffer, UDAggregate> aggregates = new HashMap<>();
-        String query = String.format("SELECT * FROM %s.%s", SystemKeyspace.NAME, AGGREGATES);
-        for (UntypedResultSet.Row row : QueryProcessor.resultify(query, partition))
-        {
-            UDAggregate aggregate = createAggregateFromAggregateRow(row);
-            aggregates.put(functionSignatureWithNameAndTypes(aggregate), aggregate);
-        }
-        return aggregates;
-    }
-
-    private static UDAggregate createAggregateFromAggregateRow(UntypedResultSet.Row row)
-    {
-        String ksName = row.getString("keyspace_name");
-        String functionName = row.getString("aggregate_name");
-        FunctionName name = new FunctionName(ksName, functionName);
-
-        List<String> types = row.getList("argument_types", UTF8Type.instance);
-
-        List<AbstractType<?>> argTypes;
-        if (types == null)
-        {
-            argTypes = Collections.emptyList();
-        }
-        else
-        {
-            argTypes = new ArrayList<>(types.size());
-            for (String type : types)
-                argTypes.add(parseType(type));
-        }
-
-        AbstractType<?> returnType = parseType(row.getString("return_type"));
-
-        FunctionName stateFunc = new FunctionName(ksName, row.getString("state_func"));
-        FunctionName finalFunc = row.has("final_func") ? new FunctionName(ksName, row.getString("final_func")) : null;
-        AbstractType<?> stateType = row.has("state_type") ? parseType(row.getString("state_type")) : null;
-        ByteBuffer initcond = row.has("initcond") ? row.getBytes("initcond") : null;
-
-        try
-        {
-            return UDAggregate.create(name, argTypes, returnType, stateFunc, finalFunc, stateType, initcond);
-        }
-        catch (InvalidRequestException reason)
-        {
-            return UDAggregate.createBroken(name, argTypes, returnType, initcond, reason);
-        }
-    }
-
-    public static Mutation makeDropAggregateMutation(KSMetaData keyspace, UDAggregate aggregate, long timestamp)
-    {
-        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
-        Mutation mutation = makeCreateKeyspaceMutation(keyspace, timestamp, false);
-
-        ColumnFamily cells = mutation.addOrGet(Aggregates);
-        int ldt = (int) (System.currentTimeMillis() / 1000);
-
-        Composite prefix = Aggregates.comparator.make(aggregate.name().name, functionSignatureWithTypes(aggregate));
-        cells.addAtom(new RangeTombstone(prefix, prefix.end(), timestamp, ldt));
-
-        return mutation;
-    }
-
-    private static AbstractType<?> parseType(String str)
-    {
-        return TypeParser.parse(str);
-    }
-
-    // We allow method overloads, so a function is not uniquely identified by its name only, but
-    // also by its argument types. To distinguish overloads of given function name in the schema
-    // we use a "signature" which is just a list of it's CQL argument types (we could replace that by
-    // using a "signature" UDT that would be comprised of the function name and argument types,
-    // which we could then use as clustering column. But as we haven't yet used UDT in system tables,
-    // We'll leave that decision to #6717).
-    public static ByteBuffer functionSignatureWithTypes(AbstractFunction fun)
-    {
-        ListType<String> list = ListType.getInstance(UTF8Type.instance, false);
-        List<String> strList = new ArrayList<>(fun.argTypes().size());
-        for (AbstractType<?> argType : fun.argTypes())
-            strList.add(argType.asCQL3Type().toString());
-        return list.decompose(strList);
-    }
-
-    public static ByteBuffer functionSignatureWithNameAndTypes(AbstractFunction fun)
-    {
-        ListType<String> list = ListType.getInstance(UTF8Type.instance, false);
-        List<String> strList = new ArrayList<>(fun.argTypes().size() + 2);
-        strList.add(fun.name().keyspace);
-        strList.add(fun.name().name);
-        for (AbstractType<?> argType : fun.argTypes())
-            strList.add(argType.asCQL3Type().toString());
-        return list.decompose(strList);
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/schema/ReplicationParams.java b/src/java/org/apache/cassandra/schema/ReplicationParams.java
new file mode 100644
index 0000000..21c029e
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/ReplicationParams.java

@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.locator.*;
+import org.apache.cassandra.service.StorageService;
+
+public final class ReplicationParams
+{
+    public static final String CLASS = "class";
+
+    public final Class<? extends AbstractReplicationStrategy> klass;
+    public final ImmutableMap<String, String> options;
+
+    private ReplicationParams(Class<? extends AbstractReplicationStrategy> klass, Map<String, String> options)
+    {
+        this.klass = klass;
+        this.options = ImmutableMap.copyOf(options);
+    }
+
+    static ReplicationParams local()
+    {
+        return new ReplicationParams(LocalStrategy.class, ImmutableMap.of());
+    }
+
+    static ReplicationParams simple(int replicationFactor)
+    {
+        return new ReplicationParams(SimpleStrategy.class, ImmutableMap.of("replication_factor", Integer.toString(replicationFactor)));
+    }
+
+    static ReplicationParams nts(Object... args)
+    {
+        assert args.length % 2 == 0;
+
+        Map<String, String> options = new HashMap<>();
+        for (int i = 0; i < args.length; i += 2)
+        {
+            String dc = (String) args[i];
+            Integer rf = (Integer) args[i + 1];
+            options.put(dc, rf.toString());
+        }
+
+        return new ReplicationParams(NetworkTopologyStrategy.class, options);
+    }
+
+    public void validate(String name)
+    {
+        // Attempt to instantiate the ARS, which will throw a ConfigurationException if the options aren't valid.
+        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
+        IEndpointSnitch eps = DatabaseDescriptor.getEndpointSnitch();
+        AbstractReplicationStrategy.validateReplicationStrategy(name, klass, tmd, eps, options);
+    }
+
+    public static ReplicationParams fromMap(Map<String, String> map)
+    {
+        Map<String, String> options = new HashMap<>(map);
+        String className = options.remove(CLASS);
+        Class<? extends AbstractReplicationStrategy> klass = AbstractReplicationStrategy.getClass(className);
+        return new ReplicationParams(klass, options);
+    }
+
+    public Map<String, String> asMap()
+    {
+        Map<String, String> map = new HashMap<>(options);
+        map.put(CLASS, klass.getName());
+        return map;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof ReplicationParams))
+            return false;
+
+        ReplicationParams r = (ReplicationParams) o;
+
+        return klass.equals(r.klass) && options.equals(r.options);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(klass, options);
+    }
+
+    @Override
+    public String toString()
+    {
+        MoreObjects.ToStringHelper helper = MoreObjects.toStringHelper(this);
+        helper.add(CLASS, klass.getName());
+        for (Map.Entry<String, String> entry : options.entrySet())
+            helper.add(entry.getKey(), entry.getValue());
+        return helper.toString();
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
new file mode 100644
index 0000000..695fb4f
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java

@@ -0,0 +1,1496 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.*;
+import com.google.common.collect.Maps;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.config.CFMetaData.DroppedColumn;
+import org.apache.cassandra.config.ColumnDefinition.ClusteringOrder;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.functions.*;
+import org.apache.cassandra.cql3.statements.SelectStatement;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.view.View;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+import static java.lang.String.format;
+
+import static java.util.stream.Collectors.toList;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
+import static org.apache.cassandra.schema.CQLTypeParser.parse;
+
+/**
+ * system_schema.* tables and methods for manipulating them.
+ */
+public final class SchemaKeyspace
+{
+    private SchemaKeyspace()
+    {
+    }
+
+    private static final Logger logger = LoggerFactory.getLogger(SchemaKeyspace.class);
+
+    private static final boolean FLUSH_SCHEMA_TABLES = Boolean.valueOf(System.getProperty("cassandra.test.flush_local_schema_changes", "true"));
+    private static final boolean IGNORE_CORRUPTED_SCHEMA_TABLES = Boolean.valueOf(System.getProperty("cassandra.ignore_corrupted_schema_tables", "false"));
+
+    public static final String NAME = "system_schema";
+
+    public static final String KEYSPACES = "keyspaces";
+    public static final String TABLES = "tables";
+    public static final String COLUMNS = "columns";
+    public static final String DROPPED_COLUMNS = "dropped_columns";
+    public static final String TRIGGERS = "triggers";
+    public static final String VIEWS = "views";
+    public static final String TYPES = "types";
+    public static final String FUNCTIONS = "functions";
+    public static final String AGGREGATES = "aggregates";
+    public static final String INDEXES = "indexes";
+
+    /**
+     * The order in this list matters.
+     *
+     * When flushing schema tables, we want to flush them in a way that mitigates the effects of an abrupt shutdown whilst
+     * the tables are being flushed. On startup, we load the schema from disk before replaying the CL, so we need to
+     * try to avoid problems like reading a table without columns or types, for example. So columns and types should be
+     * flushed before tables, which should be flushed before keyspaces.
+     *
+     * When truncating, the order should be reversed. For immutable lists this is an efficient operation that simply
+     * iterates in reverse order.
+     *
+     * See CASSANDRA-12213 for more details.
+     */
+    public static final ImmutableList<String> ALL =
+        ImmutableList.of(COLUMNS, DROPPED_COLUMNS, TRIGGERS, TYPES, FUNCTIONS, AGGREGATES, INDEXES, TABLES, VIEWS, KEYSPACES);
+
+    /**
+     * Until we upgrade the messaging service version, that is version 4.0, we must preserve the old order (before CASSANDRA-12213)
+     * for digest calculations, otherwise the nodes will never agree on the schema during a rolling upgrade, see CASSANDRA-13559.
+     */
+    public static final ImmutableList<String> ALL_FOR_DIGEST =
+        ImmutableList.of(KEYSPACES, TABLES, COLUMNS, DROPPED_COLUMNS, TRIGGERS, VIEWS, TYPES, FUNCTIONS, AGGREGATES, INDEXES);
+
+    private static final CFMetaData Keyspaces =
+        compile(KEYSPACES,
+                "keyspace definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "durable_writes boolean,"
+                + "replication frozen<map<text, text>>,"
+                + "PRIMARY KEY ((keyspace_name)))");
+
+    private static final CFMetaData Tables =
+        compile(TABLES,
+                "table definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "table_name text,"
+                + "bloom_filter_fp_chance double,"
+                + "caching frozen<map<text, text>>,"
+                + "comment text,"
+                + "compaction frozen<map<text, text>>,"
+                + "compression frozen<map<text, text>>,"
+                + "crc_check_chance double,"
+                + "dclocal_read_repair_chance double,"
+                + "default_time_to_live int,"
+                + "extensions frozen<map<text, blob>>,"
+                + "flags frozen<set<text>>," // SUPER, COUNTER, DENSE, COMPOUND
+                + "gc_grace_seconds int,"
+                + "id uuid,"
+                + "max_index_interval int,"
+                + "memtable_flush_period_in_ms int,"
+                + "min_index_interval int,"
+                + "read_repair_chance double,"
+                + "speculative_retry text,"
+                + "PRIMARY KEY ((keyspace_name), table_name))");
+
+    private static final CFMetaData Columns =
+        compile(COLUMNS,
+                "column definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "table_name text,"
+                + "column_name text,"
+                + "clustering_order text,"
+                + "column_name_bytes blob,"
+                + "kind text,"
+                + "position int,"
+                + "type text,"
+                + "PRIMARY KEY ((keyspace_name), table_name, column_name))");
+
+    private static final CFMetaData DroppedColumns =
+        compile(DROPPED_COLUMNS,
+                "dropped column registry",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "table_name text,"
+                + "column_name text,"
+                + "dropped_time timestamp,"
+                + "kind text,"
+                + "type text,"
+                + "PRIMARY KEY ((keyspace_name), table_name, column_name))");
+
+    private static final CFMetaData Triggers =
+        compile(TRIGGERS,
+                "trigger definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "table_name text,"
+                + "trigger_name text,"
+                + "options frozen<map<text, text>>,"
+                + "PRIMARY KEY ((keyspace_name), table_name, trigger_name))");
+
+    private static final CFMetaData Views =
+        compile(VIEWS,
+                "view definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "view_name text,"
+                + "base_table_id uuid,"
+                + "base_table_name text,"
+                + "where_clause text,"
+                + "bloom_filter_fp_chance double,"
+                + "caching frozen<map<text, text>>,"
+                + "comment text,"
+                + "compaction frozen<map<text, text>>,"
+                + "compression frozen<map<text, text>>,"
+                + "crc_check_chance double,"
+                + "dclocal_read_repair_chance double,"
+                + "default_time_to_live int,"
+                + "extensions frozen<map<text, blob>>,"
+                + "gc_grace_seconds int,"
+                + "id uuid,"
+                + "include_all_columns boolean,"
+                + "max_index_interval int,"
+                + "memtable_flush_period_in_ms int,"
+                + "min_index_interval int,"
+                + "read_repair_chance double,"
+                + "speculative_retry text,"
+                + "PRIMARY KEY ((keyspace_name), view_name))");
+
+    private static final CFMetaData Indexes =
+        compile(INDEXES,
+                "secondary index definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "table_name text,"
+                + "index_name text,"
+                + "kind text,"
+                + "options frozen<map<text, text>>,"
+                + "PRIMARY KEY ((keyspace_name), table_name, index_name))");
+
+    private static final CFMetaData Types =
+        compile(TYPES,
+                "user defined type definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "type_name text,"
+                + "field_names frozen<list<text>>,"
+                + "field_types frozen<list<text>>,"
+                + "PRIMARY KEY ((keyspace_name), type_name))");
+
+    private static final CFMetaData Functions =
+        compile(FUNCTIONS,
+                "user defined function definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "function_name text,"
+                + "argument_types frozen<list<text>>,"
+                + "argument_names frozen<list<text>>,"
+                + "body text,"
+                + "language text,"
+                + "return_type text,"
+                + "called_on_null_input boolean,"
+                + "PRIMARY KEY ((keyspace_name), function_name, argument_types))");
+
+    private static final CFMetaData Aggregates =
+        compile(AGGREGATES,
+                "user defined aggregate definitions",
+                "CREATE TABLE %s ("
+                + "keyspace_name text,"
+                + "aggregate_name text,"
+                + "argument_types frozen<list<text>>,"
+                + "final_func text,"
+                + "initcond text,"
+                + "return_type text,"
+                + "state_func text,"
+                + "state_type text,"
+                + "PRIMARY KEY ((keyspace_name), aggregate_name, argument_types))");
+
+    public static final List<CFMetaData> ALL_TABLE_METADATA =
+        ImmutableList.of(Keyspaces, Tables, Columns, Triggers, DroppedColumns, Views, Types, Functions, Aggregates, Indexes);
+
+    private static CFMetaData compile(String name, String description, String schema)
+    {
+        return CFMetaData.compile(String.format(schema, name), NAME)
+                         .comment(description)
+                         .gcGraceSeconds((int) TimeUnit.DAYS.toSeconds(7));
+    }
+
+    public static KeyspaceMetadata metadata()
+    {
+        return KeyspaceMetadata.create(NAME, KeyspaceParams.local(), org.apache.cassandra.schema.Tables.of(ALL_TABLE_METADATA));
+    }
+
+    /**
+     * Add entries to system_schema.* for the hardcoded system keyspaces
+     */
+    public static void saveSystemKeyspacesSchema()
+    {
+        KeyspaceMetadata system = Schema.instance.getKSMetaData(SystemKeyspace.NAME);
+        KeyspaceMetadata schema = Schema.instance.getKSMetaData(NAME);
+
+        long timestamp = FBUtilities.timestampMicros();
+
+        // delete old, possibly obsolete entries in schema tables
+        for (String schemaTable : ALL)
+        {
+            String query = String.format("DELETE FROM %s.%s USING TIMESTAMP ? WHERE keyspace_name = ?", NAME, schemaTable);
+            for (String systemKeyspace : Schema.LOCAL_SYSTEM_KEYSPACE_NAMES)
+                executeOnceInternal(query, timestamp, systemKeyspace);
+        }
+
+        // (+1 to timestamp to make sure we don't get shadowed by the tombstones we just added)
+        makeCreateKeyspaceMutation(system, timestamp + 1).apply();
+        makeCreateKeyspaceMutation(schema, timestamp + 1).apply();
+    }
+
+    public static void truncate()
+    {
+        ALL.reverse().forEach(table -> getSchemaCFS(table).truncateBlocking());
+    }
+
+    static void flush()
+    {
+        if (!Boolean.getBoolean("cassandra.unsafesystem"))
+            ALL.forEach(table -> FBUtilities.waitOnFuture(getSchemaCFS(table).forceFlush()));
+    }
+
+    /**
+     * Read schema from system keyspace and calculate MD5 digest of every row, resulting digest
+     * will be converted into UUID which would act as content-based version of the schema.
+     */
+    public static UUID calculateSchemaDigest()
+    {
+        MessageDigest digest;
+        try
+        {
+            digest = MessageDigest.getInstance("MD5");
+        }
+        catch (NoSuchAlgorithmException e)
+        {
+            throw new RuntimeException(e);
+        }
+
+        for (String table : ALL_FOR_DIGEST)
+        {
+            // Due to CASSANDRA-11050 we want to exclude DROPPED_COLUMNS for schema digest computation. We can and
+            // should remove that in the next major release (so C* 4.0).
+            if (table.equals(DROPPED_COLUMNS))
+                continue;
+
+            ReadCommand cmd = getReadCommandForTableSchema(table);
+            try (ReadOrderGroup orderGroup = cmd.startOrderGroup();
+                 PartitionIterator schema = cmd.executeInternal(orderGroup))
+            {
+                while (schema.hasNext())
+                {
+                    try (RowIterator partition = schema.next())
+                    {
+                        if (!isSystemKeyspaceSchemaPartition(partition.partitionKey()))
+                            RowIterators.digest(partition, digest);
+                    }
+                }
+            }
+        }
+        return UUID.nameUUIDFromBytes(digest.digest());
+    }
+
+    /**
+     * @param schemaTableName The name of the table responsible for part of the schema
+     * @return CFS responsible to hold low-level serialized schema
+     */
+    private static ColumnFamilyStore getSchemaCFS(String schemaTableName)
+    {
+        return Keyspace.open(NAME).getColumnFamilyStore(schemaTableName);
+    }
+
+    /**
+     * @param schemaTableName The name of the table responsible for part of the schema.
+     * @return low-level schema representation
+     */
+    private static ReadCommand getReadCommandForTableSchema(String schemaTableName)
+    {
+        ColumnFamilyStore cfs = getSchemaCFS(schemaTableName);
+        return PartitionRangeReadCommand.allDataRead(cfs.metadata, FBUtilities.nowInSeconds());
+    }
+
+    public static Collection<Mutation> convertSchemaToMutations()
+    {
+        Map<DecoratedKey, Mutation> mutationMap = new HashMap<>();
+
+        for (String table : ALL)
+            convertSchemaToMutations(mutationMap, table);
+
+        return mutationMap.values();
+    }
+
+    private static void convertSchemaToMutations(Map<DecoratedKey, Mutation> mutationMap, String schemaTableName)
+    {
+        ReadCommand cmd = getReadCommandForTableSchema(schemaTableName);
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator iter = cmd.executeLocally(orderGroup))
+        {
+            while (iter.hasNext())
+            {
+                try (UnfilteredRowIterator partition = iter.next())
+                {
+                    if (isSystemKeyspaceSchemaPartition(partition.partitionKey()))
+                        continue;
+
+                    DecoratedKey key = partition.partitionKey();
+                    Mutation mutation = mutationMap.get(key);
+                    if (mutation == null)
+                    {
+                        mutation = new Mutation(NAME, key);
+                        mutationMap.put(key, mutation);
+                    }
+
+                    mutation.add(PartitionUpdate.fromIterator(partition));
+                }
+            }
+        }
+    }
+
+    private static ByteBuffer getSchemaKSKey(String ksName)
+    {
+        return AsciiType.instance.fromString(ksName);
+    }
+
+    private static boolean isSystemKeyspaceSchemaPartition(DecoratedKey partitionKey)
+    {
+        return Schema.isLocalSystemKeyspace(UTF8Type.instance.compose(partitionKey.getKey()));
+    }
+
+    /*
+     * Schema entities to mutations
+     */
+
+    public static Mutation makeCreateKeyspaceMutation(String name, KeyspaceParams params, long timestamp)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(Keyspaces, timestamp, name).clustering();
+        return adder.add(KeyspaceParams.Option.DURABLE_WRITES.toString(), params.durableWrites)
+                    .frozenMap(KeyspaceParams.Option.REPLICATION.toString(), params.replication.asMap())
+                    .build();
+    }
+
+    public static Mutation makeCreateKeyspaceMutation(KeyspaceMetadata keyspace, long timestamp)
+    {
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+
+        keyspace.tables.forEach(table -> addTableToSchemaMutation(table, timestamp, true, mutation));
+        keyspace.views.forEach(view -> addViewToSchemaMutation(view, timestamp, true, mutation));
+        keyspace.types.forEach(type -> addTypeToSchemaMutation(type, timestamp, mutation));
+        keyspace.functions.udfs().forEach(udf -> addFunctionToSchemaMutation(udf, timestamp, mutation));
+        keyspace.functions.udas().forEach(uda -> addAggregateToSchemaMutation(uda, timestamp, mutation));
+
+        return mutation;
+    }
+
+    public static Mutation makeDropKeyspaceMutation(KeyspaceMetadata keyspace, long timestamp)
+    {
+        int nowInSec = FBUtilities.nowInSeconds();
+        Mutation mutation = new Mutation(NAME, Keyspaces.decorateKey(getSchemaKSKey(keyspace.name)));
+
+        for (CFMetaData schemaTable : ALL_TABLE_METADATA)
+            mutation.add(PartitionUpdate.fullPartitionDelete(schemaTable, mutation.key(), timestamp, nowInSec));
+
+        return mutation;
+    }
+
+    public static Mutation makeCreateTypeMutation(KeyspaceMetadata keyspace, UserType type, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        addTypeToSchemaMutation(type, timestamp, mutation);
+        return mutation;
+    }
+
+    static void addTypeToSchemaMutation(UserType type, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(Types, timestamp, mutation)
+                                 .clustering(type.getNameAsString())
+                                 .frozenList("field_names", type.fieldNames().stream().map(SchemaKeyspace::bbToString).collect(toList()))
+                                 .frozenList("field_types", type.fieldTypes().stream().map(AbstractType::asCQL3Type).map(CQL3Type::toString).collect(toList()));
+
+        adder.build();
+    }
+
+    private static String bbToString(ByteBuffer bb)
+    {
+        try
+        {
+            return ByteBufferUtil.string(bb);
+        }
+        catch (CharacterCodingException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static Mutation dropTypeFromSchemaMutation(KeyspaceMetadata keyspace, UserType type, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        return RowUpdateBuilder.deleteRow(Types, timestamp, mutation, type.name);
+    }
+
+    public static Mutation makeCreateTableMutation(KeyspaceMetadata keyspace, CFMetaData table, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        addTableToSchemaMutation(table, timestamp, true, mutation);
+        return mutation;
+    }
+
+    public static void addTableToSchemaMutation(CFMetaData table, long timestamp, boolean withColumnsAndTriggers, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(Tables, timestamp, mutation).clustering(table.cfName);
+
+        addTableParamsToSchemaMutation(table.params, adder);
+
+        adder.add("id", table.cfId)
+             .frozenSet("flags", CFMetaData.flagsToStrings(table.flags()))
+             .build();
+
+        if (withColumnsAndTriggers)
+        {
+            for (ColumnDefinition column : table.allColumns())
+                addColumnToSchemaMutation(table, column, timestamp, mutation);
+
+            for (CFMetaData.DroppedColumn column : table.getDroppedColumns().values())
+                addDroppedColumnToSchemaMutation(table, column, timestamp, mutation);
+
+            for (TriggerMetadata trigger : table.getTriggers())
+                addTriggerToSchemaMutation(table, trigger, timestamp, mutation);
+
+            for (IndexMetadata index : table.getIndexes())
+                addIndexToSchemaMutation(table, index, timestamp, mutation);
+        }
+    }
+
+    private static void addTableParamsToSchemaMutation(TableParams params, RowUpdateBuilder adder)
+    {
+        adder.add("bloom_filter_fp_chance", params.bloomFilterFpChance)
+             .add("comment", params.comment)
+             .add("dclocal_read_repair_chance", params.dcLocalReadRepairChance)
+             .add("default_time_to_live", params.defaultTimeToLive)
+             .add("gc_grace_seconds", params.gcGraceSeconds)
+             .add("max_index_interval", params.maxIndexInterval)
+             .add("memtable_flush_period_in_ms", params.memtableFlushPeriodInMs)
+             .add("min_index_interval", params.minIndexInterval)
+             .add("read_repair_chance", params.readRepairChance)
+             .add("speculative_retry", params.speculativeRetry.toString())
+             .add("crc_check_chance", params.crcCheckChance)
+             .frozenMap("caching", params.caching.asMap())
+             .frozenMap("compaction", params.compaction.asMap())
+             .frozenMap("compression", params.compression.asMap())
+             .frozenMap("extensions", params.extensions);
+    }
+
+    public static Mutation makeUpdateTableMutation(KeyspaceMetadata keyspace,
+                                                   CFMetaData oldTable,
+                                                   CFMetaData newTable,
+                                                   long timestamp)
+    {
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+
+        addTableToSchemaMutation(newTable, timestamp, false, mutation);
+
+        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(oldTable.getColumnMetadata(),
+                                                                                 newTable.getColumnMetadata());
+
+        // columns that are no longer needed
+        for (ColumnDefinition column : columnDiff.entriesOnlyOnLeft().values())
+            dropColumnFromSchemaMutation(oldTable, column, timestamp, mutation);
+
+        // newly added columns
+        for (ColumnDefinition column : columnDiff.entriesOnlyOnRight().values())
+            addColumnToSchemaMutation(newTable, column, timestamp, mutation);
+
+        // old columns with updated attributes
+        for (ByteBuffer name : columnDiff.entriesDiffering().keySet())
+            addColumnToSchemaMutation(newTable, newTable.getColumnDefinition(name), timestamp, mutation);
+
+        // dropped columns
+        MapDifference<ByteBuffer, CFMetaData.DroppedColumn> droppedColumnDiff =
+            Maps.difference(oldTable.getDroppedColumns(), newTable.getDroppedColumns());
+
+        // newly dropped columns
+        for (CFMetaData.DroppedColumn column : droppedColumnDiff.entriesOnlyOnRight().values())
+            addDroppedColumnToSchemaMutation(newTable, column, timestamp, mutation);
+
+        // columns added then dropped again
+        for (ByteBuffer name : droppedColumnDiff.entriesDiffering().keySet())
+            addDroppedColumnToSchemaMutation(newTable, newTable.getDroppedColumns().get(name), timestamp, mutation);
+
+        MapDifference<String, TriggerMetadata> triggerDiff = triggersDiff(oldTable.getTriggers(), newTable.getTriggers());
+
+        // dropped triggers
+        for (TriggerMetadata trigger : triggerDiff.entriesOnlyOnLeft().values())
+            dropTriggerFromSchemaMutation(oldTable, trigger, timestamp, mutation);
+
+        // newly created triggers
+        for (TriggerMetadata trigger : triggerDiff.entriesOnlyOnRight().values())
+            addTriggerToSchemaMutation(newTable, trigger, timestamp, mutation);
+
+        MapDifference<String, IndexMetadata> indexesDiff = indexesDiff(oldTable.getIndexes(),
+                                                                       newTable.getIndexes());
+
+        // dropped indexes
+        for (IndexMetadata index : indexesDiff.entriesOnlyOnLeft().values())
+            dropIndexFromSchemaMutation(oldTable, index, timestamp, mutation);
+
+        // newly created indexes
+        for (IndexMetadata index : indexesDiff.entriesOnlyOnRight().values())
+            addIndexToSchemaMutation(newTable, index, timestamp, mutation);
+
+        // updated indexes need to be updated
+        for (MapDifference.ValueDifference<IndexMetadata> diff : indexesDiff.entriesDiffering().values())
+            addUpdatedIndexToSchemaMutation(newTable, diff.rightValue(), timestamp, mutation);
+
+        return mutation;
+    }
+
+    private static MapDifference<String, IndexMetadata> indexesDiff(Indexes before, Indexes after)
+    {
+        Map<String, IndexMetadata> beforeMap = new HashMap<>();
+        before.forEach(i -> beforeMap.put(i.name, i));
+
+        Map<String, IndexMetadata> afterMap = new HashMap<>();
+        after.forEach(i -> afterMap.put(i.name, i));
+
+        return Maps.difference(beforeMap, afterMap);
+    }
+
+    private static MapDifference<String, TriggerMetadata> triggersDiff(Triggers before, Triggers after)
+    {
+        Map<String, TriggerMetadata> beforeMap = new HashMap<>();
+        before.forEach(t -> beforeMap.put(t.name, t));
+
+        Map<String, TriggerMetadata> afterMap = new HashMap<>();
+        after.forEach(t -> afterMap.put(t.name, t));
+
+        return Maps.difference(beforeMap, afterMap);
+    }
+
+    public static Mutation makeDropTableMutation(KeyspaceMetadata keyspace, CFMetaData table, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+
+        RowUpdateBuilder.deleteRow(Tables, timestamp, mutation, table.cfName);
+
+        for (ColumnDefinition column : table.allColumns())
+            dropColumnFromSchemaMutation(table, column, timestamp, mutation);
+
+        for (CFMetaData.DroppedColumn column : table.getDroppedColumns().values())
+            dropDroppedColumnFromSchemaMutation(table, column, timestamp, mutation);
+
+        for (TriggerMetadata trigger : table.getTriggers())
+            dropTriggerFromSchemaMutation(table, trigger, timestamp, mutation);
+
+        for (IndexMetadata index : table.getIndexes())
+            dropIndexFromSchemaMutation(table, index, timestamp, mutation);
+
+        return mutation;
+    }
+
+    private static void addColumnToSchemaMutation(CFMetaData table, ColumnDefinition column, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(Columns, timestamp, mutation).clustering(table.cfName, column.name.toString());
+
+        AbstractType<?> type = column.type;
+        if (type instanceof ReversedType)
+            type = ((ReversedType) type).baseType;
+
+        adder.add("column_name_bytes", column.name.bytes)
+             .add("kind", column.kind.toString().toLowerCase())
+             .add("position", column.position())
+             .add("clustering_order", column.clusteringOrder().toString().toLowerCase())
+             .add("type", type.asCQL3Type().toString())
+             .build();
+    }
+
+    private static void dropColumnFromSchemaMutation(CFMetaData table, ColumnDefinition column, long timestamp, Mutation mutation)
+    {
+        // Note: we do want to use name.toString(), not name.bytes directly for backward compatibility (For CQL3, this won't make a difference).
+        RowUpdateBuilder.deleteRow(Columns, timestamp, mutation, table.cfName, column.name.toString());
+    }
+
+    private static void dropDroppedColumnFromSchemaMutation(CFMetaData table, DroppedColumn column, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder.deleteRow(DroppedColumns, timestamp, mutation, table.cfName, column.name);
+    }
+
+    private static void addDroppedColumnToSchemaMutation(CFMetaData table, CFMetaData.DroppedColumn column, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(DroppedColumns, timestamp, mutation).clustering(table.cfName, column.name);
+
+        adder.add("dropped_time", new Date(TimeUnit.MICROSECONDS.toMillis(column.droppedTime)))
+             .add("kind", null != column.kind ? column.kind.toString().toLowerCase() : null)
+             .add("type", expandUserTypes(column.type).asCQL3Type().toString())
+             .build();
+    }
+
+    private static void addTriggerToSchemaMutation(CFMetaData table, TriggerMetadata trigger, long timestamp, Mutation mutation)
+    {
+        new RowUpdateBuilder(Triggers, timestamp, mutation)
+            .clustering(table.cfName, trigger.name)
+            .frozenMap("options", Collections.singletonMap("class", trigger.classOption))
+            .build();
+    }
+
+    private static void dropTriggerFromSchemaMutation(CFMetaData table, TriggerMetadata trigger, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder.deleteRow(Triggers, timestamp, mutation, table.cfName, trigger.name);
+    }
+
+    public static Mutation makeCreateViewMutation(KeyspaceMetadata keyspace, ViewDefinition view, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        addViewToSchemaMutation(view, timestamp, true, mutation);
+        return mutation;
+    }
+
+    private static void addViewToSchemaMutation(ViewDefinition view, long timestamp, boolean includeColumns, Mutation mutation)
+    {
+        RowUpdateBuilder builder = new RowUpdateBuilder(Views, timestamp, mutation)
+            .clustering(view.viewName);
+
+        CFMetaData table = view.metadata;
+
+        builder.add("include_all_columns", view.includeAllColumns)
+               .add("base_table_id", view.baseTableId)
+               .add("base_table_name", view.baseTableMetadata().cfName)
+               .add("where_clause", view.whereClause)
+               .add("id", table.cfId);
+
+        addTableParamsToSchemaMutation(table.params, builder);
+
+        if (includeColumns)
+        {
+            for (ColumnDefinition column : table.allColumns())
+                addColumnToSchemaMutation(table, column, timestamp, mutation);
+
+            for (CFMetaData.DroppedColumn column : table.getDroppedColumns().values())
+                addDroppedColumnToSchemaMutation(table, column, timestamp, mutation);
+        }
+
+        builder.build();
+    }
+
+    public static Mutation makeDropViewMutation(KeyspaceMetadata keyspace, ViewDefinition view, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+
+        RowUpdateBuilder.deleteRow(Views, timestamp, mutation, view.viewName);
+
+        CFMetaData table = view.metadata;
+        for (ColumnDefinition column : table.allColumns())
+            dropColumnFromSchemaMutation(table, column, timestamp, mutation);
+
+        for (IndexMetadata index : table.getIndexes())
+            dropIndexFromSchemaMutation(table, index, timestamp, mutation);
+
+        return mutation;
+    }
+
+    public static Mutation makeUpdateViewMutation(Mutation mutation,
+                                                  ViewDefinition oldView,
+                                                  ViewDefinition newView,
+                                                  long timestamp)
+    {
+        addViewToSchemaMutation(newView, timestamp, false, mutation);
+
+        MapDifference<ByteBuffer, ColumnDefinition> columnDiff = Maps.difference(oldView.metadata.getColumnMetadata(),
+                                                                                 newView.metadata.getColumnMetadata());
+
+        // columns that are no longer needed
+        for (ColumnDefinition column : columnDiff.entriesOnlyOnLeft().values())
+            dropColumnFromSchemaMutation(oldView.metadata, column, timestamp, mutation);
+
+        // newly added columns
+        for (ColumnDefinition column : columnDiff.entriesOnlyOnRight().values())
+            addColumnToSchemaMutation(newView.metadata, column, timestamp, mutation);
+
+        // old columns with updated attributes
+        for (ByteBuffer name : columnDiff.entriesDiffering().keySet())
+            addColumnToSchemaMutation(newView.metadata, newView.metadata.getColumnDefinition(name), timestamp, mutation);
+
+        // dropped columns
+        MapDifference<ByteBuffer, CFMetaData.DroppedColumn> droppedColumnDiff =
+            Maps.difference(oldView.metadata.getDroppedColumns(), oldView.metadata.getDroppedColumns());
+
+        // newly dropped columns
+        for (CFMetaData.DroppedColumn column : droppedColumnDiff.entriesOnlyOnRight().values())
+            addDroppedColumnToSchemaMutation(oldView.metadata, column, timestamp, mutation);
+
+        // columns added then dropped again
+        for (ByteBuffer name : droppedColumnDiff.entriesDiffering().keySet())
+            addDroppedColumnToSchemaMutation(newView.metadata, newView.metadata.getDroppedColumns().get(name), timestamp, mutation);
+
+        return mutation;
+    }
+
+    private static void addIndexToSchemaMutation(CFMetaData table,
+                                                 IndexMetadata index,
+                                                 long timestamp,
+                                                 Mutation mutation)
+    {
+        RowUpdateBuilder builder = new RowUpdateBuilder(Indexes, timestamp, mutation).clustering(table.cfName, index.name);
+
+        builder.add("kind", index.kind.toString());
+        builder.frozenMap("options", index.options);
+        builder.build();
+    }
+
+    private static void dropIndexFromSchemaMutation(CFMetaData table,
+                                                    IndexMetadata index,
+                                                    long timestamp,
+                                                    Mutation mutation)
+    {
+        RowUpdateBuilder.deleteRow(Indexes, timestamp, mutation, table.cfName, index.name);
+    }
+
+    private static void addUpdatedIndexToSchemaMutation(CFMetaData table,
+                                                        IndexMetadata index,
+                                                        long timestamp,
+                                                        Mutation mutation)
+    {
+        addIndexToSchemaMutation(table, index, timestamp, mutation);
+    }
+
+    public static Mutation makeCreateFunctionMutation(KeyspaceMetadata keyspace, UDFunction function, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        addFunctionToSchemaMutation(function, timestamp, mutation);
+        return mutation;
+    }
+
+    static void addFunctionToSchemaMutation(UDFunction function, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder =
+            new RowUpdateBuilder(Functions, timestamp, mutation).clustering(function.name().name, functionArgumentsList(function));
+
+        adder.add("body", function.body())
+             .add("language", function.language())
+             .add("return_type", function.returnType().asCQL3Type().toString())
+             .add("called_on_null_input", function.isCalledOnNullInput())
+             .frozenList("argument_names", function.argNames().stream().map((c) -> bbToString(c.bytes)).collect(toList()));
+
+        adder.build();
+    }
+
+    private static List<String> functionArgumentsList(AbstractFunction fun)
+    {
+        return fun.argTypes()
+                  .stream()
+                  .map(AbstractType::asCQL3Type)
+                  .map(CQL3Type::toString)
+                  .collect(toList());
+    }
+
+    public static Mutation makeDropFunctionMutation(KeyspaceMetadata keyspace, UDFunction function, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        return RowUpdateBuilder.deleteRow(Functions, timestamp, mutation, function.name().name, functionArgumentsList(function));
+    }
+
+    public static Mutation makeCreateAggregateMutation(KeyspaceMetadata keyspace, UDAggregate aggregate, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        addAggregateToSchemaMutation(aggregate, timestamp, mutation);
+        return mutation;
+    }
+
+    static void addAggregateToSchemaMutation(UDAggregate aggregate, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder =
+            new RowUpdateBuilder(Aggregates, timestamp, mutation) .clustering(aggregate.name().name, functionArgumentsList(aggregate));
+
+        adder.add("return_type", aggregate.returnType().asCQL3Type().toString())
+             .add("state_func", aggregate.stateFunction().name().name)
+             .add("state_type", aggregate.stateType().asCQL3Type().toString())
+             .add("final_func", aggregate.finalFunction() != null ? aggregate.finalFunction().name().name : null)
+             .add("initcond", aggregate.initialCondition() != null
+                              // must use the frozen state type here, as 'null' for unfrozen collections may mean 'empty'
+                              ? aggregate.stateType().freeze().asCQL3Type().toCQLLiteral(aggregate.initialCondition(), Server.CURRENT_VERSION)
+                              : null)
+             .build();
+    }
+
+    public static Mutation makeDropAggregateMutation(KeyspaceMetadata keyspace, UDAggregate aggregate, long timestamp)
+    {
+        // Include the serialized keyspace in case the target node missed a CREATE KEYSPACE migration (see CASSANDRA-5631).
+        Mutation mutation = makeCreateKeyspaceMutation(keyspace.name, keyspace.params, timestamp);
+        return RowUpdateBuilder.deleteRow(Aggregates, timestamp, mutation, aggregate.name().name, functionArgumentsList(aggregate));
+    }
+
+    /*
+     * Fetching schema
+     */
+
+    public static Keyspaces fetchNonSystemKeyspaces()
+    {
+        return fetchKeyspacesWithout(Schema.LOCAL_SYSTEM_KEYSPACE_NAMES);
+    }
+
+    private static Keyspaces fetchKeyspacesWithout(Set<String> excludedKeyspaceNames)
+    {
+        String query = format("SELECT keyspace_name FROM %s.%s", NAME, KEYSPACES);
+
+        Keyspaces.Builder keyspaces = org.apache.cassandra.schema.Keyspaces.builder();
+        for (UntypedResultSet.Row row : query(query))
+        {
+            String keyspaceName = row.getString("keyspace_name");
+            if (!excludedKeyspaceNames.contains(keyspaceName))
+                keyspaces.add(fetchKeyspace(keyspaceName));
+        }
+        return keyspaces.build();
+    }
+
+    private static Keyspaces fetchKeyspacesOnly(Set<String> includedKeyspaceNames)
+    {
+        /*
+         * We know the keyspace names we are going to query, but we still want to run the SELECT IN
+         * query, to filter out the keyspaces that had been dropped by the applied mutation set.
+         */
+        String query = format("SELECT keyspace_name FROM %s.%s WHERE keyspace_name IN ?", NAME, KEYSPACES);
+
+        Keyspaces.Builder keyspaces = org.apache.cassandra.schema.Keyspaces.builder();
+        for (UntypedResultSet.Row row : query(query, new ArrayList<>(includedKeyspaceNames)))
+            keyspaces.add(fetchKeyspace(row.getString("keyspace_name")));
+        return keyspaces.build();
+    }
+
+    private static KeyspaceMetadata fetchKeyspace(String keyspaceName)
+    {
+        KeyspaceParams params = fetchKeyspaceParams(keyspaceName);
+        Types types = fetchTypes(keyspaceName);
+        Tables tables = fetchTables(keyspaceName, types);
+        Views views = fetchViews(keyspaceName, types);
+        Functions functions = fetchFunctions(keyspaceName, types);
+        return KeyspaceMetadata.create(keyspaceName, params, tables, views, types, functions);
+    }
+
+    private static KeyspaceParams fetchKeyspaceParams(String keyspaceName)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ?", NAME, KEYSPACES);
+
+        UntypedResultSet.Row row = query(query, keyspaceName).one();
+        boolean durableWrites = row.getBoolean(KeyspaceParams.Option.DURABLE_WRITES.toString());
+        Map<String, String> replication = row.getFrozenTextMap(KeyspaceParams.Option.REPLICATION.toString());
+        return KeyspaceParams.create(durableWrites, replication);
+    }
+
+    private static Types fetchTypes(String keyspaceName)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ?", NAME, TYPES);
+
+        Types.RawBuilder types = org.apache.cassandra.schema.Types.rawBuilder(keyspaceName);
+        for (UntypedResultSet.Row row : query(query, keyspaceName))
+        {
+            String name = row.getString("type_name");
+            List<String> fieldNames = row.getFrozenList("field_names", UTF8Type.instance);
+            List<String> fieldTypes = row.getFrozenList("field_types", UTF8Type.instance);
+            types.add(name, fieldNames, fieldTypes);
+        }
+        return types.build();
+    }
+
+    private static Tables fetchTables(String keyspaceName, Types types)
+    {
+        String query = format("SELECT table_name FROM %s.%s WHERE keyspace_name = ?", NAME, TABLES);
+
+        Tables.Builder tables = org.apache.cassandra.schema.Tables.builder();
+        for (UntypedResultSet.Row row : query(query, keyspaceName))
+        {
+            String tableName = row.getString("table_name");
+            try
+            {
+                tables.add(fetchTable(keyspaceName, tableName, types));
+            }
+            catch (MissingColumns exc)
+            {
+                String errorMsg = String.format("No partition columns found for table %s.%s in %s.%s.  This may be due to " +
+                                                "corruption or concurrent dropping and altering of a table. If this table is supposed " +
+                                                "to be dropped, {}run the following query to cleanup: " +
+                                                "\"DELETE FROM %s.%s WHERE keyspace_name = '%s' AND table_name = '%s'; " +
+                                                "DELETE FROM %s.%s WHERE keyspace_name = '%s' AND table_name = '%s';\" " +
+                                                "If the table is not supposed to be dropped, restore %s.%s sstables from backups.",
+                                                keyspaceName, tableName, NAME, COLUMNS,
+                                                NAME, TABLES, keyspaceName, tableName,
+                                                NAME, COLUMNS, keyspaceName, tableName,
+                                                NAME, COLUMNS);
+
+                if (IGNORE_CORRUPTED_SCHEMA_TABLES)
+                {
+                    logger.error(errorMsg, "", exc);
+                }
+                else
+                {
+                    logger.error(errorMsg, "restart cassandra with -Dcassandra.ignore_corrupted_schema_tables=true and ");
+                    throw exc;
+                }
+            }
+        }
+        return tables.build();
+    }
+
+    private static CFMetaData fetchTable(String keyspaceName, String tableName, Types types)
+    {
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?", NAME, TABLES);
+        UntypedResultSet rows = query(query, keyspaceName, tableName);
+        if (rows.isEmpty())
+            throw new RuntimeException(String.format("%s:%s not found in the schema definitions keyspace.", keyspaceName, tableName));
+        UntypedResultSet.Row row = rows.one();
+
+        UUID id = row.getUUID("id");
+
+        Set<CFMetaData.Flag> flags = CFMetaData.flagsFromStrings(row.getFrozenSet("flags", UTF8Type.instance));
+
+        boolean isSuper = flags.contains(CFMetaData.Flag.SUPER);
+        boolean isCounter = flags.contains(CFMetaData.Flag.COUNTER);
+        boolean isDense = flags.contains(CFMetaData.Flag.DENSE);
+        boolean isCompound = flags.contains(CFMetaData.Flag.COMPOUND);
+
+        List<ColumnDefinition> columns = fetchColumns(keyspaceName, tableName, types);
+        Map<ByteBuffer, CFMetaData.DroppedColumn> droppedColumns = fetchDroppedColumns(keyspaceName, tableName);
+        Indexes indexes = fetchIndexes(keyspaceName, tableName);
+        Triggers triggers = fetchTriggers(keyspaceName, tableName);
+
+        return CFMetaData.create(keyspaceName,
+                                 tableName,
+                                 id,
+                                 isDense,
+                                 isCompound,
+                                 isSuper,
+                                 isCounter,
+                                 false,
+                                 columns,
+                                 DatabaseDescriptor.getPartitioner())
+                         .params(createTableParamsFromRow(row))
+                         .droppedColumns(droppedColumns)
+                         .indexes(indexes)
+                         .triggers(triggers);
+    }
+
+    public static TableParams createTableParamsFromRow(UntypedResultSet.Row row)
+    {
+        return TableParams.builder()
+                          .bloomFilterFpChance(row.getDouble("bloom_filter_fp_chance"))
+                          .caching(CachingParams.fromMap(row.getFrozenTextMap("caching")))
+                          .comment(row.getString("comment"))
+                          .compaction(CompactionParams.fromMap(row.getFrozenTextMap("compaction")))
+                          .compression(CompressionParams.fromMap(row.getFrozenTextMap("compression")))
+                          .dcLocalReadRepairChance(row.getDouble("dclocal_read_repair_chance"))
+                          .defaultTimeToLive(row.getInt("default_time_to_live"))
+                          .extensions(row.getFrozenMap("extensions", UTF8Type.instance, BytesType.instance))
+                          .gcGraceSeconds(row.getInt("gc_grace_seconds"))
+                          .maxIndexInterval(row.getInt("max_index_interval"))
+                          .memtableFlushPeriodInMs(row.getInt("memtable_flush_period_in_ms"))
+                          .minIndexInterval(row.getInt("min_index_interval"))
+                          .readRepairChance(row.getDouble("read_repair_chance"))
+                          .crcCheckChance(row.getDouble("crc_check_chance"))
+                          .speculativeRetry(SpeculativeRetryParam.fromString(row.getString("speculative_retry")))
+                          .build();
+    }
+
+    private static List<ColumnDefinition> fetchColumns(String keyspace, String table, Types types)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?", NAME, COLUMNS);
+        UntypedResultSet columnRows = query(query, keyspace, table);
+        if (columnRows.isEmpty())
+            throw new MissingColumns("Columns not found in schema table for " + keyspace + "." + table);
+
+        List<ColumnDefinition> columns = new ArrayList<>();
+        columnRows.forEach(row -> columns.add(createColumnFromRow(row, types)));
+
+        if (columns.stream().noneMatch(ColumnDefinition::isPartitionKey))
+            throw new MissingColumns("No partition key columns found in schema table for " + keyspace + "." + table);
+
+        return columns;
+    }
+
+    public static ColumnDefinition createColumnFromRow(UntypedResultSet.Row row, Types types)
+    {
+        String keyspace = row.getString("keyspace_name");
+        String table = row.getString("table_name");
+
+        ColumnDefinition.Kind kind = ColumnDefinition.Kind.valueOf(row.getString("kind").toUpperCase());
+
+        int position = row.getInt("position");
+        ClusteringOrder order = ClusteringOrder.valueOf(row.getString("clustering_order").toUpperCase());
+
+        AbstractType<?> type = parse(keyspace, row.getString("type"), types);
+        if (order == ClusteringOrder.DESC)
+            type = ReversedType.getInstance(type);
+
+        ColumnIdentifier name = new ColumnIdentifier(row.getBytes("column_name_bytes"), row.getString("column_name"));
+
+        return new ColumnDefinition(keyspace, table, name, type, position, kind);
+    }
+
+    private static Map<ByteBuffer, CFMetaData.DroppedColumn> fetchDroppedColumns(String keyspace, String table)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?", NAME, DROPPED_COLUMNS);
+        Map<ByteBuffer, CFMetaData.DroppedColumn> columns = new HashMap<>();
+        for (UntypedResultSet.Row row : query(query, keyspace, table))
+        {
+            CFMetaData.DroppedColumn column = createDroppedColumnFromRow(row);
+            columns.put(UTF8Type.instance.decompose(column.name), column);
+        }
+        return columns;
+    }
+
+    private static CFMetaData.DroppedColumn createDroppedColumnFromRow(UntypedResultSet.Row row)
+    {
+        String keyspace = row.getString("keyspace_name");
+        String name = row.getString("column_name");
+
+        ColumnDefinition.Kind kind =
+            row.has("kind") ? ColumnDefinition.Kind.valueOf(row.getString("kind").toUpperCase())
+                            : null;
+        /*
+         * we never store actual UDT names in dropped column types (so that we can safely drop types if nothing refers to
+         * them anymore), so before storing dropped columns in schema we expand UDTs to tuples. See expandUserTypes method.
+         * Because of that, we can safely pass Types.none() to parse()
+         */
+        AbstractType<?> type = parse(keyspace, row.getString("type"), org.apache.cassandra.schema.Types.none());
+        long droppedTime = TimeUnit.MILLISECONDS.toMicros(row.getLong("dropped_time"));
+        return new CFMetaData.DroppedColumn(name, kind, type, droppedTime);
+    }
+
+    private static Indexes fetchIndexes(String keyspace, String table)
+    {
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?", NAME, INDEXES);
+        Indexes.Builder indexes = org.apache.cassandra.schema.Indexes.builder();
+        query(query, keyspace, table).forEach(row -> indexes.add(createIndexMetadataFromRow(row)));
+        return indexes.build();
+    }
+
+    private static IndexMetadata createIndexMetadataFromRow(UntypedResultSet.Row row)
+    {
+        String name = row.getString("index_name");
+        IndexMetadata.Kind type = IndexMetadata.Kind.valueOf(row.getString("kind"));
+        Map<String, String> options = row.getFrozenTextMap("options");
+        return IndexMetadata.fromSchemaMetadata(name, type, options);
+    }
+
+    private static Triggers fetchTriggers(String keyspace, String table)
+    {
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?", NAME, TRIGGERS);
+        Triggers.Builder triggers = org.apache.cassandra.schema.Triggers.builder();
+        query(query, keyspace, table).forEach(row -> triggers.add(createTriggerFromRow(row)));
+        return triggers.build();
+    }
+
+    private static TriggerMetadata createTriggerFromRow(UntypedResultSet.Row row)
+    {
+        String name = row.getString("trigger_name");
+        String classOption = row.getFrozenTextMap("options").get("class");
+        return new TriggerMetadata(name, classOption);
+    }
+
+    private static Views fetchViews(String keyspaceName, Types types)
+    {
+        String query = format("SELECT view_name FROM %s.%s WHERE keyspace_name = ?", NAME, VIEWS);
+
+        Views.Builder views = org.apache.cassandra.schema.Views.builder();
+        for (UntypedResultSet.Row row : query(query, keyspaceName))
+            views.add(fetchView(keyspaceName, row.getString("view_name"), types));
+        return views.build();
+    }
+
+    private static ViewDefinition fetchView(String keyspaceName, String viewName, Types types)
+    {
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND view_name = ?", NAME, VIEWS);
+        UntypedResultSet rows = query(query, keyspaceName, viewName);
+        if (rows.isEmpty())
+            throw new RuntimeException(String.format("%s:%s not found in the schema definitions keyspace.", keyspaceName, viewName));
+        UntypedResultSet.Row row = rows.one();
+
+        UUID id = row.getUUID("id");
+        UUID baseTableId = row.getUUID("base_table_id");
+        String baseTableName = row.getString("base_table_name");
+        boolean includeAll = row.getBoolean("include_all_columns");
+        String whereClause = row.getString("where_clause");
+
+        List<ColumnDefinition> columns = fetchColumns(keyspaceName, viewName, types);
+
+        Map<ByteBuffer, CFMetaData.DroppedColumn> droppedColumns = fetchDroppedColumns(keyspaceName, viewName);
+
+        CFMetaData cfm = CFMetaData.create(keyspaceName,
+                                           viewName,
+                                           id,
+                                           false,
+                                           true,
+                                           false,
+                                           false,
+                                           true,
+                                           columns,
+                                           DatabaseDescriptor.getPartitioner())
+                                   .params(createTableParamsFromRow(row))
+                                   .droppedColumns(droppedColumns);
+
+            String rawSelect = View.buildSelectStatement(baseTableName, columns, whereClause);
+            SelectStatement.RawStatement rawStatement = (SelectStatement.RawStatement) QueryProcessor.parseStatement(rawSelect);
+
+            return new ViewDefinition(keyspaceName, viewName, baseTableId, baseTableName, includeAll, rawStatement, whereClause, cfm);
+    }
+
+    private static Functions fetchFunctions(String keyspaceName, Types types)
+    {
+        Functions udfs = fetchUDFs(keyspaceName, types);
+        Functions udas = fetchUDAs(keyspaceName, udfs, types);
+
+        return org.apache.cassandra.schema.Functions.builder()
+                                                    .add(udfs)
+                                                    .add(udas)
+                                                    .build();
+    }
+
+    private static Functions fetchUDFs(String keyspaceName, Types types)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ?", NAME, FUNCTIONS);
+
+        Functions.Builder functions = org.apache.cassandra.schema.Functions.builder();
+        for (UntypedResultSet.Row row : query(query, keyspaceName))
+            functions.add(createUDFFromRow(row, types));
+        return functions.build();
+    }
+
+    private static UDFunction createUDFFromRow(UntypedResultSet.Row row, Types types)
+    {
+        String ksName = row.getString("keyspace_name");
+        String functionName = row.getString("function_name");
+        FunctionName name = new FunctionName(ksName, functionName);
+
+        List<ColumnIdentifier> argNames = new ArrayList<>();
+        for (String arg : row.getFrozenList("argument_names", UTF8Type.instance))
+            argNames.add(new ColumnIdentifier(arg, true));
+
+        List<AbstractType<?>> argTypes = new ArrayList<>();
+        for (String type : row.getFrozenList("argument_types", UTF8Type.instance))
+            argTypes.add(parse(ksName, type, types));
+
+        AbstractType<?> returnType = parse(ksName, row.getString("return_type"), types);
+
+        String language = row.getString("language");
+        String body = row.getString("body");
+        boolean calledOnNullInput = row.getBoolean("called_on_null_input");
+
+        org.apache.cassandra.cql3.functions.Function existing = Schema.instance.findFunction(name, argTypes).orElse(null);
+        if (existing instanceof UDFunction)
+        {
+            // This check prevents duplicate compilation of effectively the same UDF.
+            // Duplicate compilation attempts can occur on the coordinator node handling the CREATE FUNCTION
+            // statement, since CreateFunctionStatement needs to execute UDFunction.create but schema migration
+            // also needs that (since it needs to handle its own change).
+            UDFunction udf = (UDFunction) existing;
+            if (udf.argNames().equals(argNames) && // arg types checked in Functions.find call
+                udf.returnType().equals(returnType) &&
+                !udf.isAggregate() &&
+                udf.language().equals(language) &&
+                udf.body().equals(body) &&
+                udf.isCalledOnNullInput() == calledOnNullInput)
+            {
+                logger.trace("Skipping duplicate compilation of already existing UDF {}", name);
+                return udf;
+            }
+        }
+
+        try
+        {
+            return UDFunction.create(name, argNames, argTypes, returnType, calledOnNullInput, language, body);
+        }
+        catch (InvalidRequestException e)
+        {
+            logger.error(String.format("Cannot load function '%s' from schema: this function won't be available (on this node)", name), e);
+            return UDFunction.createBrokenFunction(name, argNames, argTypes, returnType, calledOnNullInput, language, body, e);
+        }
+    }
+
+    private static Functions fetchUDAs(String keyspaceName, Functions udfs, Types types)
+    {
+        String query = format("SELECT * FROM %s.%s WHERE keyspace_name = ?", NAME, AGGREGATES);
+
+        Functions.Builder aggregates = org.apache.cassandra.schema.Functions.builder();
+        for (UntypedResultSet.Row row : query(query, keyspaceName))
+            aggregates.add(createUDAFromRow(row, udfs, types));
+        return aggregates.build();
+    }
+
+    private static UDAggregate createUDAFromRow(UntypedResultSet.Row row, Functions functions, Types types)
+    {
+        String ksName = row.getString("keyspace_name");
+        String functionName = row.getString("aggregate_name");
+        FunctionName name = new FunctionName(ksName, functionName);
+
+        List<AbstractType<?>> argTypes =
+            row.getFrozenList("argument_types", UTF8Type.instance)
+               .stream()
+               .map(t -> parse(ksName, t, types))
+               .collect(toList());
+
+        AbstractType<?> returnType = parse(ksName, row.getString("return_type"), types);
+
+        FunctionName stateFunc = new FunctionName(ksName, (row.getString("state_func")));
+        FunctionName finalFunc = row.has("final_func") ? new FunctionName(ksName, row.getString("final_func")) : null;
+        AbstractType<?> stateType = row.has("state_type") ? parse(ksName, row.getString("state_type"), types) : null;
+        ByteBuffer initcond = row.has("initcond") ? Terms.asBytes(ksName, row.getString("initcond"), stateType) : null;
+
+        try
+        {
+            return UDAggregate.create(functions, name, argTypes, returnType, stateFunc, finalFunc, stateType, initcond);
+        }
+        catch (InvalidRequestException reason)
+        {
+            return UDAggregate.createBroken(name, argTypes, returnType, initcond, reason);
+        }
+    }
+
+    private static UntypedResultSet query(String query, Object... variables)
+    {
+        return executeInternal(query, variables);
+    }
+
+    /*
+     * Merging schema
+     */
+
+    /*
+     * Reload schema from local disk. Useful if a user made changes to schema tables by hand, or has suspicion that
+     * in-memory representation got out of sync somehow with what's on disk.
+     */
+    public static synchronized void reloadSchemaAndAnnounceVersion()
+    {
+        Keyspaces before = Schema.instance.getReplicatedKeyspaces();
+        Keyspaces after = fetchNonSystemKeyspaces();
+        mergeSchema(before, after);
+        Schema.instance.updateVersionAndAnnounce();
+    }
+
+    /**
+     * Merge remote schema in form of mutations with local and mutate ks/cf metadata objects
+     * (which also involves fs operations on add/drop ks/cf)
+     *
+     * @param mutations the schema changes to apply
+     *
+     * @throws ConfigurationException If one of metadata attributes has invalid value
+     */
+    public static synchronized void mergeSchemaAndAnnounceVersion(Collection<Mutation> mutations) throws ConfigurationException
+    {
+        mergeSchema(mutations);
+        Schema.instance.updateVersionAndAnnounce();
+    }
+
+    public static synchronized void mergeSchema(Collection<Mutation> mutations)
+    {
+        // only compare the keyspaces affected by this set of schema mutations
+        Set<String> affectedKeyspaces =
+        mutations.stream()
+                 .map(m -> UTF8Type.instance.compose(m.key().getKey()))
+                 .collect(Collectors.toSet());
+
+        // fetch the current state of schema for the affected keyspaces only
+        Keyspaces before = Schema.instance.getKeyspaces(affectedKeyspaces);
+
+        // apply the schema mutations and flush
+        mutations.forEach(Mutation::apply);
+        if (FLUSH_SCHEMA_TABLES)
+            flush();
+
+        // fetch the new state of schema from schema tables (not applied to Schema.instance yet)
+        Keyspaces after = fetchKeyspacesOnly(affectedKeyspaces);
+
+        mergeSchema(before, after);
+    }
+
+    private static synchronized void mergeSchema(Keyspaces before, Keyspaces after)
+    {
+        MapDifference<String, KeyspaceMetadata> keyspacesDiff = before.diff(after);
+
+        // dropped keyspaces
+        for (KeyspaceMetadata keyspace : keyspacesDiff.entriesOnlyOnLeft().values())
+        {
+            keyspace.functions.udas().forEach(Schema.instance::dropAggregate);
+            keyspace.functions.udfs().forEach(Schema.instance::dropFunction);
+            keyspace.views.forEach(v -> Schema.instance.dropView(v.ksName, v.viewName));
+            keyspace.tables.forEach(t -> Schema.instance.dropTable(t.ksName, t.cfName));
+            keyspace.types.forEach(Schema.instance::dropType);
+            Schema.instance.dropKeyspace(keyspace.name);
+        }
+
+        // new keyspaces
+        for (KeyspaceMetadata keyspace : keyspacesDiff.entriesOnlyOnRight().values())
+        {
+            Schema.instance.addKeyspace(KeyspaceMetadata.create(keyspace.name, keyspace.params));
+            keyspace.types.forEach(Schema.instance::addType);
+            keyspace.tables.forEach(Schema.instance::addTable);
+            keyspace.views.forEach(Schema.instance::addView);
+            keyspace.functions.udfs().forEach(Schema.instance::addFunction);
+            keyspace.functions.udas().forEach(Schema.instance::addAggregate);
+        }
+
+        // updated keyspaces
+        for (Map.Entry<String, MapDifference.ValueDifference<KeyspaceMetadata>> diff : keyspacesDiff.entriesDiffering().entrySet())
+            updateKeyspace(diff.getKey(), diff.getValue().leftValue(), diff.getValue().rightValue());
+    }
+
+    private static void updateKeyspace(String keyspaceName, KeyspaceMetadata keyspaceBefore, KeyspaceMetadata keyspaceAfter)
+    {
+        // calculate the deltas
+        MapDifference<String, CFMetaData> tablesDiff = keyspaceBefore.tables.diff(keyspaceAfter.tables);
+        MapDifference<String, ViewDefinition> viewsDiff = keyspaceBefore.views.diff(keyspaceAfter.views);
+        MapDifference<ByteBuffer, UserType> typesDiff = keyspaceBefore.types.diff(keyspaceAfter.types);
+
+        Map<Pair<FunctionName, List<String>>, UDFunction> udfsBefore = new HashMap<>();
+        keyspaceBefore.functions.udfs().forEach(f -> udfsBefore.put(Pair.create(f.name(), functionArgumentsList(f)), f));
+        Map<Pair<FunctionName, List<String>>, UDFunction> udfsAfter = new HashMap<>();
+        keyspaceAfter.functions.udfs().forEach(f -> udfsAfter.put(Pair.create(f.name(), functionArgumentsList(f)), f));
+        MapDifference<Pair<FunctionName, List<String>>, UDFunction> udfsDiff = Maps.difference(udfsBefore, udfsAfter);
+
+        Map<Pair<FunctionName, List<String>>, UDAggregate> udasBefore = new HashMap<>();
+        keyspaceBefore.functions.udas().forEach(f -> udasBefore.put(Pair.create(f.name(), functionArgumentsList(f)), f));
+        Map<Pair<FunctionName, List<String>>, UDAggregate> udasAfter = new HashMap<>();
+        keyspaceAfter.functions.udas().forEach(f -> udasAfter.put(Pair.create(f.name(), functionArgumentsList(f)), f));
+        MapDifference<Pair<FunctionName, List<String>>, UDAggregate> udasDiff = Maps.difference(udasBefore, udasAfter);
+
+        // update keyspace params, if changed
+        if (!keyspaceBefore.params.equals(keyspaceAfter.params))
+            Schema.instance.updateKeyspace(keyspaceName, keyspaceAfter.params);
+
+        // drop everything removed
+        udasDiff.entriesOnlyOnLeft().values().forEach(Schema.instance::dropAggregate);
+        udfsDiff.entriesOnlyOnLeft().values().forEach(Schema.instance::dropFunction);
+        viewsDiff.entriesOnlyOnLeft().values().forEach(v -> Schema.instance.dropView(v.ksName, v.viewName));
+        tablesDiff.entriesOnlyOnLeft().values().forEach(t -> Schema.instance.dropTable(t.ksName, t.cfName));
+        typesDiff.entriesOnlyOnLeft().values().forEach(Schema.instance::dropType);
+
+        // add everything created
+        typesDiff.entriesOnlyOnRight().values().forEach(Schema.instance::addType);
+        tablesDiff.entriesOnlyOnRight().values().forEach(Schema.instance::addTable);
+        viewsDiff.entriesOnlyOnRight().values().forEach(Schema.instance::addView);
+        udfsDiff.entriesOnlyOnRight().values().forEach(Schema.instance::addFunction);
+        udasDiff.entriesOnlyOnRight().values().forEach(Schema.instance::addAggregate);
+
+        // update everything altered
+        for (MapDifference.ValueDifference<UserType> diff : typesDiff.entriesDiffering().values())
+            Schema.instance.updateType(diff.rightValue());
+        for (MapDifference.ValueDifference<CFMetaData> diff : tablesDiff.entriesDiffering().values())
+            Schema.instance.updateTable(diff.rightValue());
+        for (MapDifference.ValueDifference<ViewDefinition> diff : viewsDiff.entriesDiffering().values())
+            Schema.instance.updateView(diff.rightValue());
+        for (MapDifference.ValueDifference<UDFunction> diff : udfsDiff.entriesDiffering().values())
+            Schema.instance.updateFunction(diff.rightValue());
+        for (MapDifference.ValueDifference<UDAggregate> diff : udasDiff.entriesDiffering().values())
+            Schema.instance.updateAggregate(diff.rightValue());
+    }
+
+    /*
+     * Type parsing and transformation
+     */
+
+    /*
+     * Recursively replaces any instances of UserType with an equivalent TupleType.
+     * We do it for dropped_columns, to allow safely dropping unused user types without retaining any references
+     * in dropped_columns.
+     */
+    private static AbstractType<?> expandUserTypes(AbstractType<?> original)
+    {
+        if (original instanceof UserType)
+            return new TupleType(expandUserTypes(((UserType) original).fieldTypes()));
+
+        if (original instanceof TupleType)
+            return new TupleType(expandUserTypes(((TupleType) original).allTypes()));
+
+        if (original instanceof ListType<?>)
+            return ListType.getInstance(expandUserTypes(((ListType<?>) original).getElementsType()), original.isMultiCell());
+
+        if (original instanceof MapType<?,?>)
+        {
+            MapType<?, ?> mt = (MapType<?, ?>) original;
+            return MapType.getInstance(expandUserTypes(mt.getKeysType()), expandUserTypes(mt.getValuesType()), mt.isMultiCell());
+        }
+
+        if (original instanceof SetType<?>)
+            return SetType.getInstance(expandUserTypes(((SetType<?>) original).getElementsType()), original.isMultiCell());
+
+        // this is very unlikely to ever happen, but it's better to be safe than sorry
+        if (original instanceof ReversedType<?>)
+            return ReversedType.getInstance(expandUserTypes(((ReversedType) original).baseType));
+
+        if (original instanceof CompositeType)
+            return CompositeType.getInstance(expandUserTypes(original.getComponents()));
+
+        return original;
+    }
+
+    private static List<AbstractType<?>> expandUserTypes(List<AbstractType<?>> types)
+    {
+        return types.stream()
+                    .map(SchemaKeyspace::expandUserTypes)
+                    .collect(toList());
+    }
+
+    @VisibleForTesting
+    static class MissingColumns extends RuntimeException
+    {
+        MissingColumns(String message)
+        {
+            super(message);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/SpeculativeRetryParam.java b/src/java/org/apache/cassandra/schema/SpeculativeRetryParam.java
new file mode 100644
index 0000000..43447f0
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/SpeculativeRetryParam.java

@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.text.DecimalFormat;
+import java.util.concurrent.TimeUnit;
+import java.util.Locale;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+import static java.lang.String.format;
+
+public final class SpeculativeRetryParam
+{
+    public enum Kind
+    {
+        NONE, CUSTOM, PERCENTILE, ALWAYS
+    }
+
+    public static final SpeculativeRetryParam NONE = none();
+    public static final SpeculativeRetryParam ALWAYS = always();
+    public static final SpeculativeRetryParam DEFAULT = percentile(99);
+
+    private final Kind kind;
+    private final double value;
+
+    // pre-processed (divided by 100 for PERCENTILE), multiplied by 1M for CUSTOM (to nanos)
+    private final double threshold;
+
+    private SpeculativeRetryParam(Kind kind, double value)
+    {
+        this.kind = kind;
+        this.value = value;
+
+        if (kind == Kind.PERCENTILE)
+            threshold = value / 100;
+        else if (kind == Kind.CUSTOM)
+            threshold = TimeUnit.MILLISECONDS.toNanos((long) value);
+        else
+            threshold = value;
+    }
+
+    public Kind kind()
+    {
+        return kind;
+    }
+
+    public double threshold()
+    {
+        return threshold;
+    }
+
+    public static SpeculativeRetryParam none()
+    {
+        return new SpeculativeRetryParam(Kind.NONE, 0);
+    }
+
+    public static SpeculativeRetryParam always()
+    {
+        return new SpeculativeRetryParam(Kind.ALWAYS, 0);
+    }
+
+    public static SpeculativeRetryParam custom(double value)
+    {
+        return new SpeculativeRetryParam(Kind.CUSTOM, value);
+    }
+
+    public static SpeculativeRetryParam percentile(double value)
+    {
+        return new SpeculativeRetryParam(Kind.PERCENTILE, value);
+    }
+
+    public static SpeculativeRetryParam fromString(String value)
+    {
+        if (value.toLowerCase(Locale.ENGLISH).endsWith("ms"))
+        {
+            try
+            {
+                return custom(Double.parseDouble(value.substring(0, value.length() - "ms".length())));
+            }
+            catch (IllegalArgumentException e)
+            {
+                throw new ConfigurationException(format("Invalid value %s for option '%s'", value, TableParams.Option.SPECULATIVE_RETRY));
+            }
+        }
+
+        if (value.toUpperCase(Locale.ENGLISH).endsWith(Kind.PERCENTILE.toString()))
+        {
+            double threshold;
+            try
+            {
+                threshold = Double.parseDouble(value.substring(0, value.length() - Kind.PERCENTILE.toString().length()));
+            }
+            catch (IllegalArgumentException e)
+            {
+                throw new ConfigurationException(format("Invalid value %s for option '%s'", value, TableParams.Option.SPECULATIVE_RETRY));
+            }
+
+            if (threshold >= 0.0 && threshold <= 100.0)
+                return percentile(threshold);
+
+            throw new ConfigurationException(format("Invalid value %s for PERCENTILE option '%s': must be between 0.0 and 100.0",
+                                                    value,
+                                                    TableParams.Option.SPECULATIVE_RETRY));
+        }
+
+        if (value.equals(Kind.NONE.toString()))
+            return NONE;
+
+        if (value.equals(Kind.ALWAYS.toString()))
+            return ALWAYS;
+
+        throw new ConfigurationException(format("Invalid value %s for option '%s'", value, TableParams.Option.SPECULATIVE_RETRY));
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof SpeculativeRetryParam))
+            return false;
+        SpeculativeRetryParam srp = (SpeculativeRetryParam) o;
+        return kind == srp.kind && threshold == srp.threshold;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(kind, threshold);
+    }
+
+    @Override
+    public String toString()
+    {
+        switch (kind)
+        {
+            case CUSTOM:
+                return format("%sms", value);
+            case PERCENTILE:
+                return format("%sPERCENTILE", new DecimalFormat("#.#####").format(value));
+            default: // NONE and ALWAYS
+                return kind.toString();
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/TableParams.java b/src/java/org/apache/cassandra/schema/TableParams.java
new file mode 100644
index 0000000..dfa8603
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/TableParams.java

@@ -0,0 +1,384 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.nio.ByteBuffer;
+import java.util.Map;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableMap;
+
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.BloomCalculations;
+import static java.lang.String.format;
+
+public final class TableParams
+{
+    public static final TableParams DEFAULT = TableParams.builder().build();
+
+    public enum Option
+    {
+        BLOOM_FILTER_FP_CHANCE,
+        CACHING,
+        COMMENT,
+        COMPACTION,
+        COMPRESSION,
+        DCLOCAL_READ_REPAIR_CHANCE,
+        DEFAULT_TIME_TO_LIVE,
+        EXTENSIONS,
+        GC_GRACE_SECONDS,
+        MAX_INDEX_INTERVAL,
+        MEMTABLE_FLUSH_PERIOD_IN_MS,
+        MIN_INDEX_INTERVAL,
+        READ_REPAIR_CHANCE,
+        SPECULATIVE_RETRY,
+        CRC_CHECK_CHANCE;
+
+        @Override
+        public String toString()
+        {
+            return name().toLowerCase();
+        }
+    }
+
+    public static final String DEFAULT_COMMENT = "";
+    public static final double DEFAULT_READ_REPAIR_CHANCE = 0.0;
+    public static final double DEFAULT_DCLOCAL_READ_REPAIR_CHANCE = 0.1;
+    public static final int DEFAULT_GC_GRACE_SECONDS = 864000; // 10 days
+    public static final int DEFAULT_DEFAULT_TIME_TO_LIVE = 0;
+    public static final int DEFAULT_MEMTABLE_FLUSH_PERIOD_IN_MS = 0;
+    public static final int DEFAULT_MIN_INDEX_INTERVAL = 128;
+    public static final int DEFAULT_MAX_INDEX_INTERVAL = 2048;
+    public static final double DEFAULT_CRC_CHECK_CHANCE = 1.0;
+
+    public final String comment;
+    public final double readRepairChance;
+    public final double dcLocalReadRepairChance;
+    public final double bloomFilterFpChance;
+    public final double crcCheckChance;
+    public final int gcGraceSeconds;
+    public final int defaultTimeToLive;
+    public final int memtableFlushPeriodInMs;
+    public final int minIndexInterval;
+    public final int maxIndexInterval;
+    public final SpeculativeRetryParam speculativeRetry;
+    public final CachingParams caching;
+    public final CompactionParams compaction;
+    public final CompressionParams compression;
+    public final ImmutableMap<String, ByteBuffer> extensions;
+
+    private TableParams(Builder builder)
+    {
+        comment = builder.comment;
+        readRepairChance = builder.readRepairChance;
+        dcLocalReadRepairChance = builder.dcLocalReadRepairChance;
+        bloomFilterFpChance = builder.bloomFilterFpChance == null
+                            ? builder.compaction.defaultBloomFilterFbChance()
+                            : builder.bloomFilterFpChance;
+        crcCheckChance = builder.crcCheckChance;
+        gcGraceSeconds = builder.gcGraceSeconds;
+        defaultTimeToLive = builder.defaultTimeToLive;
+        memtableFlushPeriodInMs = builder.memtableFlushPeriodInMs;
+        minIndexInterval = builder.minIndexInterval;
+        maxIndexInterval = builder.maxIndexInterval;
+        speculativeRetry = builder.speculativeRetry;
+        caching = builder.caching;
+        compaction = builder.compaction;
+        compression = builder.compression;
+        extensions = builder.extensions;
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Builder builder(TableParams params)
+    {
+        return new Builder().bloomFilterFpChance(params.bloomFilterFpChance)
+                            .caching(params.caching)
+                            .comment(params.comment)
+                            .compaction(params.compaction)
+                            .compression(params.compression)
+                            .dcLocalReadRepairChance(params.dcLocalReadRepairChance)
+                            .crcCheckChance(params.crcCheckChance)
+                            .defaultTimeToLive(params.defaultTimeToLive)
+                            .gcGraceSeconds(params.gcGraceSeconds)
+                            .maxIndexInterval(params.maxIndexInterval)
+                            .memtableFlushPeriodInMs(params.memtableFlushPeriodInMs)
+                            .minIndexInterval(params.minIndexInterval)
+                            .readRepairChance(params.readRepairChance)
+                            .speculativeRetry(params.speculativeRetry)
+                            .extensions(params.extensions);
+    }
+
+    public void validate()
+    {
+        compaction.validate();
+        compression.validate();
+
+        double minBloomFilterFpChanceValue = BloomCalculations.minSupportedBloomFilterFpChance();
+        if (bloomFilterFpChance <=  minBloomFilterFpChanceValue || bloomFilterFpChance > 1)
+        {
+            fail("%s must be larger than %s and less than or equal to 1.0 (got %s)",
+                 Option.BLOOM_FILTER_FP_CHANCE,
+                 minBloomFilterFpChanceValue,
+                 bloomFilterFpChance);
+        }
+
+        if (dcLocalReadRepairChance < 0 || dcLocalReadRepairChance > 1.0)
+        {
+            fail("%s must be larger than or equal to 0 and smaller than or equal to 1.0 (got %s)",
+                 Option.DCLOCAL_READ_REPAIR_CHANCE,
+                 dcLocalReadRepairChance);
+        }
+
+        if (readRepairChance < 0 || readRepairChance > 1.0)
+        {
+            fail("%s must be larger than or equal to 0 and smaller than or equal to 1.0 (got %s)",
+                 Option.READ_REPAIR_CHANCE,
+                 readRepairChance);
+        }
+
+        if (crcCheckChance < 0 || crcCheckChance > 1.0)
+        {
+            fail("%s must be larger than or equal to 0 and smaller than or equal to 1.0 (got %s)",
+                 Option.CRC_CHECK_CHANCE,
+                 crcCheckChance);
+        }
+
+        if (defaultTimeToLive < 0)
+            fail("%s must be greater than or equal to 0 (got %s)", Option.DEFAULT_TIME_TO_LIVE, defaultTimeToLive);
+
+        if (defaultTimeToLive > Attributes.MAX_TTL)
+            fail("%s must be less than or equal to %d (got %s)", Option.DEFAULT_TIME_TO_LIVE, Attributes.MAX_TTL, defaultTimeToLive);
+
+        if (gcGraceSeconds < 0)
+            fail("%s must be greater than or equal to 0 (got %s)", Option.GC_GRACE_SECONDS, gcGraceSeconds);
+
+        if (minIndexInterval < 1)
+            fail("%s must be greater than or equal to 1 (got %s)", Option.MIN_INDEX_INTERVAL, minIndexInterval);
+
+        if (maxIndexInterval < minIndexInterval)
+        {
+            fail("%s must be greater than or equal to %s (%s) (got %s)",
+                 Option.MAX_INDEX_INTERVAL,
+                 Option.MIN_INDEX_INTERVAL,
+                 minIndexInterval,
+                 maxIndexInterval);
+        }
+
+        if (memtableFlushPeriodInMs < 0)
+            fail("%s must be greater than or equal to 0 (got %s)", Option.MEMTABLE_FLUSH_PERIOD_IN_MS, memtableFlushPeriodInMs);
+    }
+
+    private static void fail(String format, Object... args)
+    {
+        throw new ConfigurationException(format(format, args));
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof TableParams))
+            return false;
+
+        TableParams p = (TableParams) o;
+
+        return comment.equals(p.comment)
+            && readRepairChance == p.readRepairChance
+            && dcLocalReadRepairChance == p.dcLocalReadRepairChance
+            && bloomFilterFpChance == p.bloomFilterFpChance
+            && crcCheckChance == p.crcCheckChance
+            && gcGraceSeconds == p.gcGraceSeconds
+            && defaultTimeToLive == p.defaultTimeToLive
+            && memtableFlushPeriodInMs == p.memtableFlushPeriodInMs
+            && minIndexInterval == p.minIndexInterval
+            && maxIndexInterval == p.maxIndexInterval
+            && speculativeRetry.equals(p.speculativeRetry)
+            && caching.equals(p.caching)
+            && compaction.equals(p.compaction)
+            && compression.equals(p.compression)
+            && extensions.equals(p.extensions);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(comment,
+                                readRepairChance,
+                                dcLocalReadRepairChance,
+                                bloomFilterFpChance,
+                                crcCheckChance,
+                                gcGraceSeconds,
+                                defaultTimeToLive,
+                                memtableFlushPeriodInMs,
+                                minIndexInterval,
+                                maxIndexInterval,
+                                speculativeRetry,
+                                caching,
+                                compaction,
+                                compression,
+                                extensions);
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add(Option.COMMENT.toString(), comment)
+                          .add(Option.READ_REPAIR_CHANCE.toString(), readRepairChance)
+                          .add(Option.DCLOCAL_READ_REPAIR_CHANCE.toString(), dcLocalReadRepairChance)
+                          .add(Option.BLOOM_FILTER_FP_CHANCE.toString(), bloomFilterFpChance)
+                          .add(Option.CRC_CHECK_CHANCE.toString(), crcCheckChance)
+                          .add(Option.GC_GRACE_SECONDS.toString(), gcGraceSeconds)
+                          .add(Option.DEFAULT_TIME_TO_LIVE.toString(), defaultTimeToLive)
+                          .add(Option.MEMTABLE_FLUSH_PERIOD_IN_MS.toString(), memtableFlushPeriodInMs)
+                          .add(Option.MIN_INDEX_INTERVAL.toString(), minIndexInterval)
+                          .add(Option.MAX_INDEX_INTERVAL.toString(), maxIndexInterval)
+                          .add(Option.SPECULATIVE_RETRY.toString(), speculativeRetry)
+                          .add(Option.CACHING.toString(), caching)
+                          .add(Option.COMPACTION.toString(), compaction)
+                          .add(Option.COMPRESSION.toString(), compression)
+                          .add(Option.EXTENSIONS.toString(), extensions)
+                          .toString();
+    }
+
+    public static final class Builder
+    {
+        private String comment = DEFAULT_COMMENT;
+        private double readRepairChance = DEFAULT_READ_REPAIR_CHANCE;
+        private double dcLocalReadRepairChance = DEFAULT_DCLOCAL_READ_REPAIR_CHANCE;
+        private Double bloomFilterFpChance;
+        public Double crcCheckChance = DEFAULT_CRC_CHECK_CHANCE;
+        private int gcGraceSeconds = DEFAULT_GC_GRACE_SECONDS;
+        private int defaultTimeToLive = DEFAULT_DEFAULT_TIME_TO_LIVE;
+        private int memtableFlushPeriodInMs = DEFAULT_MEMTABLE_FLUSH_PERIOD_IN_MS;
+        private int minIndexInterval = DEFAULT_MIN_INDEX_INTERVAL;
+        private int maxIndexInterval = DEFAULT_MAX_INDEX_INTERVAL;
+        private SpeculativeRetryParam speculativeRetry = SpeculativeRetryParam.DEFAULT;
+        private CachingParams caching = CachingParams.DEFAULT;
+        private CompactionParams compaction = CompactionParams.DEFAULT;
+        private CompressionParams compression = CompressionParams.DEFAULT;
+        private ImmutableMap<String, ByteBuffer> extensions = ImmutableMap.of();
+
+        public Builder()
+        {
+        }
+
+        public TableParams build()
+        {
+            return new TableParams(this);
+        }
+
+        public Builder comment(String val)
+        {
+            comment = val;
+            return this;
+        }
+
+        public Builder readRepairChance(double val)
+        {
+            readRepairChance = val;
+            return this;
+        }
+
+        public Builder dcLocalReadRepairChance(double val)
+        {
+            dcLocalReadRepairChance = val;
+            return this;
+        }
+
+        public Builder bloomFilterFpChance(double val)
+        {
+            bloomFilterFpChance = val;
+            return this;
+        }
+
+        public Builder crcCheckChance(double val)
+        {
+            crcCheckChance = val;
+            return this;
+        }
+
+        public Builder gcGraceSeconds(int val)
+        {
+            gcGraceSeconds = val;
+            return this;
+        }
+
+        public Builder defaultTimeToLive(int val)
+        {
+            defaultTimeToLive = val;
+            return this;
+        }
+
+        public Builder memtableFlushPeriodInMs(int val)
+        {
+            memtableFlushPeriodInMs = val;
+            return this;
+        }
+
+        public Builder minIndexInterval(int val)
+        {
+            minIndexInterval = val;
+            return this;
+        }
+
+        public Builder maxIndexInterval(int val)
+        {
+            maxIndexInterval = val;
+            return this;
+        }
+
+        public Builder speculativeRetry(SpeculativeRetryParam val)
+        {
+            speculativeRetry = val;
+            return this;
+        }
+
+        public Builder caching(CachingParams val)
+        {
+            caching = val;
+            return this;
+        }
+
+        public Builder compaction(CompactionParams val)
+        {
+            compaction = val;
+            return this;
+        }
+
+        public Builder compression(CompressionParams val)
+        {
+            compression = val;
+            return this;
+        }
+
+        public Builder extensions(Map<String, ByteBuffer> val)
+        {
+            extensions = ImmutableMap.copyOf(val);
+            return this;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Tables.java b/src/java/org/apache/cassandra/schema/Tables.java
new file mode 100644
index 0000000..4f728d4
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Tables.java

@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.Iterator;
+import java.util.Optional;
+
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.MapDifference;
+import com.google.common.collect.Maps;
+
+import org.apache.cassandra.config.CFMetaData;
+
+import static com.google.common.collect.Iterables.filter;
+
+/**
+ * An immutable container for a keyspace's Tables.
+ */
+public final class Tables implements Iterable<CFMetaData>
+{
+    private final ImmutableMap<String, CFMetaData> tables;
+
+    private Tables(Builder builder)
+    {
+        tables = builder.tables.build();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Tables none()
+    {
+        return builder().build();
+    }
+
+    public static Tables of(CFMetaData... tables)
+    {
+        return builder().add(tables).build();
+    }
+
+    public static Tables of(Iterable<CFMetaData> tables)
+    {
+        return builder().add(tables).build();
+    }
+
+    public Iterator<CFMetaData> iterator()
+    {
+        return tables.values().iterator();
+    }
+
+    public int size()
+    {
+        return tables.size();
+    }
+
+    /**
+     * Get the table with the specified name
+     *
+     * @param name a non-qualified table name
+     * @return an empty {@link Optional} if the table name is not found; a non-empty optional of {@link CFMetaData} otherwise
+     */
+    public Optional<CFMetaData> get(String name)
+    {
+        return Optional.ofNullable(tables.get(name));
+    }
+
+    /**
+     * Get the table with the specified name
+     *
+     * @param name a non-qualified table name
+     * @return null if the table name is not found; the found {@link CFMetaData} otherwise
+     */
+    @Nullable
+    public CFMetaData getNullable(String name)
+    {
+        return tables.get(name);
+    }
+
+    /**
+     * Create a Tables instance with the provided table added
+     */
+    public Tables with(CFMetaData table)
+    {
+        if (get(table.cfName).isPresent())
+            throw new IllegalStateException(String.format("Table %s already exists", table.cfName));
+
+        return builder().add(this).add(table).build();
+    }
+
+    /**
+     * Creates a Tables instance with the table with the provided name removed
+     */
+    public Tables without(String name)
+    {
+        CFMetaData table =
+            get(name).orElseThrow(() -> new IllegalStateException(String.format("Table %s doesn't exists", name)));
+
+        return builder().add(filter(this, t -> t != table)).build();
+    }
+
+    MapDifference<String, CFMetaData> diff(Tables other)
+    {
+        return Maps.difference(tables, other.tables);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Tables && tables.equals(((Tables) o).tables));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return tables.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return tables.values().toString();
+    }
+
+    public static final class Builder
+    {
+        final ImmutableMap.Builder<String, CFMetaData> tables = new ImmutableMap.Builder<>();
+
+        private Builder()
+        {
+        }
+
+        public Tables build()
+        {
+            return new Tables(this);
+        }
+
+        public Builder add(CFMetaData table)
+        {
+            tables.put(table.cfName, table);
+            return this;
+        }
+
+        public Builder add(CFMetaData... tables)
+        {
+            for (CFMetaData table : tables)
+                add(table);
+            return this;
+        }
+
+        public Builder add(Iterable<CFMetaData> tables)
+        {
+            tables.forEach(this::add);
+            return this;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/TriggerMetadata.java b/src/java/org/apache/cassandra/schema/TriggerMetadata.java
new file mode 100644
index 0000000..2e0d547
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/TriggerMetadata.java

@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.schema;
+
+import com.google.common.base.Objects;
+
+public final class TriggerMetadata
+{
+    public static final String CLASS = "class";
+
+    public final String name;
+
+    // For now, the only supported option is 'class'.
+    // Proper trigger parametrization will be added later.
+    public final String classOption;
+
+    public TriggerMetadata(String name, String classOption)
+    {
+        this.name = name;
+        this.classOption = classOption;
+    }
+
+    public static TriggerMetadata create(String name, String classOption)
+    {
+        return new TriggerMetadata(name, classOption);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof TriggerMetadata))
+            return false;
+
+        TriggerMetadata td = (TriggerMetadata) o;
+
+        return name.equals(td.name) && classOption.equals(td.classOption);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(name, classOption);
+    }
+
+    @Override
+    public String toString()
+    {
+        return Objects.toStringHelper(this)
+                      .add("name", name)
+                      .add("class", classOption)
+                      .toString();
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Triggers.java b/src/java/org/apache/cassandra/schema/Triggers.java
new file mode 100644
index 0000000..bb39f1f
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Triggers.java

@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.util.Iterator;
+import java.util.Optional;
+
+import com.google.common.collect.ImmutableMap;
+
+import static com.google.common.collect.Iterables.filter;
+
+public final class Triggers implements Iterable<TriggerMetadata>
+{
+    private final ImmutableMap<String, TriggerMetadata> triggers;
+
+    private Triggers(Builder builder)
+    {
+        triggers = builder.triggers.build();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Triggers none()
+    {
+        return builder().build();
+    }
+
+    public Iterator<TriggerMetadata> iterator()
+    {
+        return triggers.values().iterator();
+    }
+
+    public int size()
+    {
+        return triggers.size();
+    }
+
+    public boolean isEmpty()
+    {
+        return triggers.isEmpty();
+    }
+
+    /**
+     * Get the trigger with the specified name
+     *
+     * @param name a non-qualified trigger name
+     * @return an empty {@link Optional} if the trigger name is not found; a non-empty optional of {@link TriggerMetadata} otherwise
+     */
+    public Optional<TriggerMetadata> get(String name)
+    {
+        return Optional.ofNullable(triggers.get(name));
+    }
+
+    /**
+     * Create a Triggers instance with the provided trigger added
+     */
+    public Triggers with(TriggerMetadata trigger)
+    {
+        if (get(trigger.name).isPresent())
+            throw new IllegalStateException(String.format("Trigger %s already exists", trigger.name));
+
+        return builder().add(this).add(trigger).build();
+    }
+
+    /**
+     * Creates a Triggers instance with the trigger with the provided name removed
+     */
+    public Triggers without(String name)
+    {
+        TriggerMetadata trigger =
+            get(name).orElseThrow(() -> new IllegalStateException(String.format("Trigger %s doesn't exists", name)));
+
+        return builder().add(filter(this, t -> t != trigger)).build();
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Triggers && triggers.equals(((Triggers) o).triggers));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return triggers.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return triggers.values().toString();
+    }
+
+    public static final class Builder
+    {
+        final ImmutableMap.Builder<String, TriggerMetadata> triggers = new ImmutableMap.Builder<>();
+
+        private Builder()
+        {
+        }
+
+        public Triggers build()
+        {
+            return new Triggers(this);
+        }
+
+        public Builder add(TriggerMetadata trigger)
+        {
+            triggers.put(trigger.name, trigger);
+            return this;
+        }
+
+        public Builder add(Iterable<TriggerMetadata> triggers)
+        {
+            triggers.forEach(this::add);
+            return this;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Types.java b/src/java/org/apache/cassandra/schema/Types.java
new file mode 100644
index 0000000..1b71364
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Types.java

@@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import javax.annotation.Nullable;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.MapDifference;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multimap;
+
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static java.lang.String.format;
+import static com.google.common.collect.Iterables.filter;
+import static java.util.stream.Collectors.toList;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+/**
+ * An immutable container for a keyspace's UDTs.
+ */
+public final class Types implements Iterable<UserType>
+{
+    private static final Types NONE = new Types(ImmutableMap.of());
+
+    private final Map<ByteBuffer, UserType> types;
+
+    private Types(Builder builder)
+    {
+        types = builder.types.build();
+    }
+
+    /*
+     * For use in RawBuilder::build only.
+     */
+    private Types(Map<ByteBuffer, UserType> types)
+    {
+        this.types = types;
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static RawBuilder rawBuilder(String keyspace)
+    {
+        return new RawBuilder(keyspace);
+    }
+
+    public static Types none()
+    {
+        return NONE;
+    }
+
+    public static Types of(UserType... types)
+    {
+        return builder().add(types).build();
+    }
+
+    public Iterator<UserType> iterator()
+    {
+        return types.values().iterator();
+    }
+
+    /**
+     * Get the type with the specified name
+     *
+     * @param name a non-qualified type name
+     * @return an empty {@link Optional} if the type name is not found; a non-empty optional of {@link UserType} otherwise
+     */
+    public Optional<UserType> get(ByteBuffer name)
+    {
+        return Optional.ofNullable(types.get(name));
+    }
+
+    /**
+     * Get the type with the specified name
+     *
+     * @param name a non-qualified type name
+     * @return null if the type name is not found; the found {@link UserType} otherwise
+     */
+    @Nullable
+    public UserType getNullable(ByteBuffer name)
+    {
+        return types.get(name);
+    }
+
+    /**
+     * Create a Types instance with the provided type added
+     */
+    public Types with(UserType type)
+    {
+        if (get(type.name).isPresent())
+            throw new IllegalStateException(format("Type %s already exists", type.name));
+
+        return builder().add(this).add(type).build();
+    }
+
+    /**
+     * Creates a Types instance with the type with the provided name removed
+     */
+    public Types without(ByteBuffer name)
+    {
+        UserType type =
+            get(name).orElseThrow(() -> new IllegalStateException(format("Type %s doesn't exists", name)));
+
+        return builder().add(filter(this, t -> t != type)).build();
+    }
+
+    MapDifference<ByteBuffer, UserType> diff(Types other)
+    {
+        return Maps.difference(types, other.types);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Types && types.equals(((Types) o).types));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return types.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return types.values().toString();
+    }
+
+    public static final class Builder
+    {
+        final ImmutableMap.Builder<ByteBuffer, UserType> types = ImmutableMap.builder();
+
+        private Builder()
+        {
+        }
+
+        public Types build()
+        {
+            return new Types(this);
+        }
+
+        public Builder add(UserType type)
+        {
+            types.put(type.name, type);
+            return this;
+        }
+
+        public Builder add(UserType... types)
+        {
+            for (UserType type : types)
+                add(type);
+            return this;
+        }
+
+        public Builder add(Iterable<UserType> types)
+        {
+            types.forEach(this::add);
+            return this;
+        }
+    }
+
+    public static final class RawBuilder
+    {
+        final String keyspace;
+        final List<RawUDT> definitions;
+
+        private RawBuilder(String keyspace)
+        {
+            this.keyspace = keyspace;
+            this.definitions = new ArrayList<>();
+        }
+
+        /**
+         * Build a Types instance from Raw definitions.
+         *
+         * Constructs a DAG of graph dependencies and resolves them 1 by 1 in topological order.
+         */
+        public Types build()
+        {
+            if (definitions.isEmpty())
+                return Types.none();
+
+            /*
+             * build a DAG of UDT dependencies
+             */
+            Map<RawUDT, Integer> vertices = new HashMap<>(); // map values are numbers of referenced types
+            for (RawUDT udt : definitions)
+                vertices.put(udt, 0);
+
+            Multimap<RawUDT, RawUDT> adjacencyList = HashMultimap.create();
+            for (RawUDT udt1 : definitions)
+                for (RawUDT udt2 : definitions)
+                    if (udt1 != udt2 && udt1.referencesUserType(udt2))
+                        adjacencyList.put(udt2, udt1);
+
+            /*
+             * resolve dependencies in topological order, using Kahn's algorithm
+             */
+            adjacencyList.values().forEach(vertex -> vertices.put(vertex, vertices.get(vertex) + 1));
+
+            Queue<RawUDT> resolvableTypes = new LinkedList<>(); // UDTs with 0 dependencies
+            for (Map.Entry<RawUDT, Integer> entry : vertices.entrySet())
+                if (entry.getValue() == 0)
+                    resolvableTypes.add(entry.getKey());
+
+            Types types = new Types(new HashMap<>());
+            while (!resolvableTypes.isEmpty())
+            {
+                RawUDT vertex = resolvableTypes.remove();
+
+                for (RawUDT dependentType : adjacencyList.get(vertex))
+                    if (vertices.replace(dependentType, vertices.get(dependentType) - 1) == 1)
+                        resolvableTypes.add(dependentType);
+
+                UserType udt = vertex.prepare(keyspace, types);
+                types.types.put(udt.name, udt);
+            }
+
+            if (types.types.size() != definitions.size())
+                throw new ConfigurationException(format("Cannot resolve UDTs for keyspace %s: some types are missing", keyspace));
+
+            /*
+             * return an immutable copy
+             */
+            return Types.builder().add(types).build();
+        }
+
+        public void add(String name, List<String> fieldNames, List<String> fieldTypes)
+        {
+            List<CQL3Type.Raw> rawFieldTypes =
+                fieldTypes.stream()
+                          .map(CQLTypeParser::parseRaw)
+                          .collect(toList());
+
+            definitions.add(new RawUDT(name, fieldNames, rawFieldTypes));
+        }
+
+        private static final class RawUDT
+        {
+            final String name;
+            final List<String> fieldNames;
+            final List<CQL3Type.Raw> fieldTypes;
+
+            RawUDT(String name, List<String> fieldNames, List<CQL3Type.Raw> fieldTypes)
+            {
+                this.name = name;
+                this.fieldNames = fieldNames;
+                this.fieldTypes = fieldTypes;
+            }
+
+            boolean referencesUserType(RawUDT other)
+            {
+                return fieldTypes.stream().anyMatch(t -> t.referencesUserType(other.name));
+            }
+
+            UserType prepare(String keyspace, Types types)
+            {
+                List<ByteBuffer> preparedFieldNames =
+                    fieldNames.stream()
+                              .map(ByteBufferUtil::bytes)
+                              .collect(toList());
+
+                List<AbstractType<?>> preparedFieldTypes =
+                    fieldTypes.stream()
+                              .map(t -> t.prepareInternal(keyspace, types).getType())
+                              .collect(toList());
+
+                return new UserType(keyspace, bytes(name), preparedFieldNames, preparedFieldTypes);
+            }
+
+            @Override
+            public int hashCode()
+            {
+                return name.hashCode();
+            }
+
+            @Override
+            public boolean equals(Object other)
+            {
+                return name.equals(((RawUDT) other).name);
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/UnknownIndexException.java b/src/java/org/apache/cassandra/schema/UnknownIndexException.java
new file mode 100644
index 0000000..5daf631
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/UnknownIndexException.java

@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.schema;
+
+import java.io.IOException;
+import java.util.UUID;
+
+import org.apache.cassandra.config.CFMetaData;
+
+/**
+ * Exception thrown when we read an index id from a serialized ReadCommand and no corresponding IndexMetadata
+ * can be found in the CFMetaData#indexes collection. Note that this is an internal exception and is not meant
+ * to be user facing, the node reading the ReadCommand should proceed as if no index id were present.
+ */
+public class UnknownIndexException extends IOException
+{
+    public final UUID indexId;
+    public UnknownIndexException(CFMetaData metadata, UUID id)
+    {
+        super(String.format("Unknown index %s for table %s.%s", id.toString(), metadata.ksName, metadata.cfName));
+        indexId = id;
+    }
+}

diff --git a/src/java/org/apache/cassandra/schema/Views.java b/src/java/org/apache/cassandra/schema/Views.java
new file mode 100644
index 0000000..b8fdd4b
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/Views.java

@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.schema;
+
+
+import java.util.Iterator;
+import java.util.Optional;
+
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.MapDifference;
+import com.google.common.collect.Maps;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ViewDefinition;
+
+import static com.google.common.collect.Iterables.filter;
+
+public final class Views implements Iterable<ViewDefinition>
+{
+    private final ImmutableMap<String, ViewDefinition> views;
+
+    private Views(Builder builder)
+    {
+        views = builder.views.build();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Views none()
+    {
+        return builder().build();
+    }
+
+    public Iterator<ViewDefinition> iterator()
+    {
+        return views.values().iterator();
+    }
+
+    public Iterable<CFMetaData> metadatas()
+    {
+        return Iterables.transform(views.values(), view -> view.metadata);
+    }
+
+    public int size()
+    {
+        return views.size();
+    }
+
+    public boolean isEmpty()
+    {
+        return views.isEmpty();
+    }
+
+    /**
+     * Get the materialized view with the specified name
+     *
+     * @param name a non-qualified materialized view name
+     * @return an empty {@link Optional} if the materialized view name is not found; a non-empty optional of {@link ViewDefinition} otherwise
+     */
+    public Optional<ViewDefinition> get(String name)
+    {
+        return Optional.ofNullable(views.get(name));
+    }
+
+    /**
+     * Get the view with the specified name
+     *
+     * @param name a non-qualified view name
+     * @return null if the view name is not found; the found {@link ViewDefinition} otherwise
+     */
+    @Nullable
+    public ViewDefinition getNullable(String name)
+    {
+        return views.get(name);
+    }
+
+    /**
+     * Create a MaterializedViews instance with the provided materialized view added
+     */
+    public Views with(ViewDefinition view)
+    {
+        if (get(view.viewName).isPresent())
+            throw new IllegalStateException(String.format("Materialized View %s already exists", view.viewName));
+
+        return builder().add(this).add(view).build();
+    }
+
+    /**
+     * Creates a MaterializedViews instance with the materializedView with the provided name removed
+     */
+    public Views without(String name)
+    {
+        ViewDefinition materializedView =
+            get(name).orElseThrow(() -> new IllegalStateException(String.format("Materialized View %s doesn't exists", name)));
+
+        return builder().add(filter(this, v -> v != materializedView)).build();
+    }
+
+    /**
+     * Creates a MaterializedViews instance which contains an updated materialized view
+     */
+    public Views replace(ViewDefinition view, CFMetaData cfm)
+    {
+        return without(view.viewName).with(view);
+    }
+
+    MapDifference<String, ViewDefinition> diff(Views other)
+    {
+        return Maps.difference(views, other.views);
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        return this == o || (o instanceof Views && views.equals(((Views) o).views));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return views.hashCode();
+    }
+
+    @Override
+    public String toString()
+    {
+        return views.values().toString();
+    }
+
+    public static final class Builder
+    {
+        final ImmutableMap.Builder<String, ViewDefinition> views = new ImmutableMap.Builder<>();
+
+        private Builder()
+        {
+        }
+
+        public Views build()
+        {
+            return new Views(this);
+        }
+
+
+        public Builder add(ViewDefinition view)
+        {
+            views.put(view.viewName, view);
+            return this;
+        }
+
+        public Builder add(Iterable<ViewDefinition> views)
+        {
+            views.forEach(this::add);
+            return this;
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/security/SSLFactory.java b/src/java/org/apache/cassandra/security/SSLFactory.java
index a327de9..56a3a3f 100644
--- a/src/java/org/apache/cassandra/security/SSLFactory.java
+++ b/src/java/org/apache/cassandra/security/SSLFactory.java

@@ -59,14 +59,22 @@
     public static SSLServerSocket getServerSocket(EncryptionOptions options, InetAddress address, int port) throws IOException
     {
         SSLContext ctx = createSSLContext(options, true);
-        SSLServerSocket serverSocket = (SSLServerSocket)ctx.getServerSocketFactory().createServerSocket();
-        serverSocket.setReuseAddress(true);
-        String[] suites = filterCipherSuites(serverSocket.getSupportedCipherSuites(), options.cipher_suites);
-        serverSocket.setEnabledCipherSuites(suites);
-        serverSocket.setNeedClientAuth(options.require_client_auth);
-        serverSocket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
-        serverSocket.bind(new InetSocketAddress(address, port), 500);
-        return serverSocket;
+        SSLServerSocket serverSocket = (SSLServerSocket) ctx.getServerSocketFactory().createServerSocket();
+        try
+        {
+            serverSocket.setReuseAddress(true);
+            String[] suites = filterCipherSuites(serverSocket.getSupportedCipherSuites(), options.cipher_suites);
+            serverSocket.setEnabledCipherSuites(suites);
+            serverSocket.setNeedClientAuth(options.require_client_auth);
+            serverSocket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
+            serverSocket.bind(new InetSocketAddress(address, port), 500);
+            return serverSocket;
+        }
+        catch (IllegalArgumentException | SecurityException | IOException e)
+        {
+            serverSocket.close();
+            throw e;
+        }
     }
 
     /** Create a socket and connect */
@@ -74,10 +82,18 @@
     {
         SSLContext ctx = createSSLContext(options, true);
         SSLSocket socket = (SSLSocket) ctx.getSocketFactory().createSocket(address, port, localAddress, localPort);
-        String[] suites = filterCipherSuites(socket.getSupportedCipherSuites(), options.cipher_suites);
-        socket.setEnabledCipherSuites(suites);
-        socket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
-        return socket;
+        try
+        {
+            String[] suites = filterCipherSuites(socket.getSupportedCipherSuites(), options.cipher_suites);
+            socket.setEnabledCipherSuites(suites);
+            socket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
+            return socket;
+        }
+        catch (IllegalArgumentException e)
+        {
+            socket.close();
+            throw e;
+        }
     }
 
     /** Create a socket and connect, using any local address */
@@ -85,10 +101,18 @@
     {
         SSLContext ctx = createSSLContext(options, true);
         SSLSocket socket = (SSLSocket) ctx.getSocketFactory().createSocket(address, port);
-        String[] suites = filterCipherSuites(socket.getSupportedCipherSuites(), options.cipher_suites);
-        socket.setEnabledCipherSuites(suites);
-        socket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
-        return socket;
+        try
+        {
+            String[] suites = filterCipherSuites(socket.getSupportedCipherSuites(), options.cipher_suites);
+            socket.setEnabledCipherSuites(suites);
+            socket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
+            return socket;
+        }
+        catch (IllegalArgumentException e)
+        {
+            socket.close();
+            throw e;
+        }
     }
 
     /** Just create a socket */
@@ -96,10 +120,18 @@
     {
         SSLContext ctx = createSSLContext(options, true);
         SSLSocket socket = (SSLSocket) ctx.getSocketFactory().createSocket();
-        String[] suites = filterCipherSuites(socket.getSupportedCipherSuites(), options.cipher_suites);
-        socket.setEnabledCipherSuites(suites);
-        socket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
-        return socket;
+        try
+        {
+            String[] suites = filterCipherSuites(socket.getSupportedCipherSuites(), options.cipher_suites);
+            socket.setEnabledCipherSuites(suites);
+            socket.setEnabledProtocols(ACCEPTED_PROTOCOLS);
+            return socket;
+        }
+        catch (IllegalArgumentException e)
+        {
+            socket.close();
+            throw e;
+        }
     }
 
     @SuppressWarnings("resource")

diff --git a/src/java/org/apache/cassandra/serializers/AbstractTextSerializer.java b/src/java/org/apache/cassandra/serializers/AbstractTextSerializer.java
index f1de6a4..7a3afed 100644
--- a/src/java/org/apache/cassandra/serializers/AbstractTextSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/AbstractTextSerializer.java

@@ -17,12 +17,14 @@
  */
 package org.apache.cassandra.serializers;
 
-import org.apache.cassandra.utils.ByteBufferUtil;
-
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
 
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+
 public abstract class AbstractTextSerializer implements TypeSerializer<String>
 {
     private final Charset charset;
@@ -58,4 +60,16 @@
     {
         return String.class;
     }
+
+    /**
+     * Generates CQL literal for TEXT/VARCHAR/ASCII types.
+     * Caveat: it does only generate literals with single quotes and not pg-style literals.
+     */
+    @Override
+    public String toCQLLiteral(ByteBuffer buffer)
+    {
+        return buffer == null
+             ? "null"
+             : '\'' + StringUtils.replace(deserialize(buffer), "'", "''") + '\'';
+    }
 }

diff --git a/src/java/org/apache/cassandra/serializers/BytesSerializer.java b/src/java/org/apache/cassandra/serializers/BytesSerializer.java
index 4dcaa82..ed0bf77 100644
--- a/src/java/org/apache/cassandra/serializers/BytesSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/BytesSerializer.java

@@ -52,4 +52,12 @@
     {
         return ByteBuffer.class;
     }
+
+    @Override
+    public String toCQLLiteral(ByteBuffer buffer)
+    {
+        return buffer == null
+             ? "null"
+             : "0x" + toString(deserialize(buffer));
+    }
 }

diff --git a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
index 5fb3e0a..3d6be67 100644
--- a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java

@@ -50,11 +50,6 @@
         return deserializeForNativeProtocol(bytes, Server.VERSION_3);
     }
 
-    public ByteBuffer reserializeToV3(ByteBuffer bytes)
-    {
-        return serialize(deserializeForNativeProtocol(bytes, 2));
-    }
-
     public void validate(ByteBuffer bytes) throws MarshalException
     {
         // Same thing as above
@@ -76,69 +71,42 @@
 
     protected static void writeCollectionSize(ByteBuffer output, int elements, int version)
     {
-        if (version >= Server.VERSION_3)
             output.putInt(elements);
-        else
-            output.putShort((short)elements);
     }
 
     public static int readCollectionSize(ByteBuffer input, int version)
     {
-        return version >= Server.VERSION_3 ? input.getInt() : ByteBufferUtil.readShortLength(input);
+        return input.getInt();
     }
 
     protected static int sizeOfCollectionSize(int elements, int version)
     {
-        return version >= Server.VERSION_3 ? 4 : 2;
+        return 4;
     }
 
     public static void writeValue(ByteBuffer output, ByteBuffer value, int version)
     {
-        if (version >= Server.VERSION_3)
+        if (value == null)
         {
-            if (value == null)
-            {
-                output.putInt(-1);
-                return;
-            }
+            output.putInt(-1);
+            return;
+        }
 
-            output.putInt(value.remaining());
-            output.put(value.duplicate());
-        }
-        else
-        {
-            assert value != null;
-            output.putShort((short)value.remaining());
-            output.put(value.duplicate());
-        }
+        output.putInt(value.remaining());
+        output.put(value.duplicate());
     }
 
     public static ByteBuffer readValue(ByteBuffer input, int version)
     {
-        if (version >= Server.VERSION_3)
-        {
-            int size = input.getInt();
-            if (size < 0)
-                return null;
+        int size = input.getInt();
+        if (size < 0)
+            return null;
 
-            return ByteBufferUtil.readBytes(input, size);
-        }
-        else
-        {
-            return ByteBufferUtil.readBytesWithShortLength(input);
-        }
+        return ByteBufferUtil.readBytes(input, size);
     }
 
     public static int sizeOfValue(ByteBuffer value, int version)
     {
-        if (version >= Server.VERSION_3)
-        {
-            return value == null ? 4 : 4 + value.remaining();
-        }
-        else
-        {
-            assert value != null;
-            return 2 + value.remaining();
-        }
+        return value == null ? 4 : 4 + value.remaining();
     }
 }

diff --git a/src/java/org/apache/cassandra/serializers/EmptySerializer.java b/src/java/org/apache/cassandra/serializers/EmptySerializer.java
index 2ccecc5..352ef2c 100644
--- a/src/java/org/apache/cassandra/serializers/EmptySerializer.java
+++ b/src/java/org/apache/cassandra/serializers/EmptySerializer.java

@@ -28,6 +28,7 @@
 
     public Void deserialize(ByteBuffer bytes)
     {
+        validate(bytes);
         return null;
     }
 
@@ -39,7 +40,11 @@
     public void validate(ByteBuffer bytes) throws MarshalException
     {
         if (bytes.remaining() > 0)
-            throw new MarshalException("EmptyType only accept empty values");
+        {
+            throw new MarshalException("EmptyType only accept empty values. " +
+                                       "A non-empty value can be a result of a Thrift write into CQL-created dense table. " +
+                                       "See CASSANDRA-15778 for details.");
+        }
     }
 
     public String toString(Void value)

diff --git a/src/java/org/apache/cassandra/serializers/ListSerializer.java b/src/java/org/apache/cassandra/serializers/ListSerializer.java
index d2d0610..3fd0803 100644
--- a/src/java/org/apache/cassandra/serializers/ListSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/ListSerializer.java

@@ -151,14 +151,16 @@
     {
         StringBuilder sb = new StringBuilder();
         boolean isFirst = true;
+        sb.append('[');
         for (T element : value)
         {
             if (isFirst)
                 isFirst = false;
             else
-                sb.append("; ");
+                sb.append(", ");
             sb.append(elements.toString(element));
         }
+        sb.append(']');
         return sb.toString();
     }
 

diff --git a/src/java/org/apache/cassandra/serializers/MapSerializer.java b/src/java/org/apache/cassandra/serializers/MapSerializer.java
index 70cd944..fa8432a 100644
--- a/src/java/org/apache/cassandra/serializers/MapSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/MapSerializer.java

@@ -33,32 +33,38 @@
 
     public final TypeSerializer<K> keys;
     public final TypeSerializer<V> values;
+    private final Comparator<Pair<ByteBuffer, ByteBuffer>> comparator;
 
-    public static synchronized <K, V> MapSerializer<K, V> getInstance(TypeSerializer<K> keys, TypeSerializer<V> values)
+    public static synchronized <K, V> MapSerializer<K, V> getInstance(TypeSerializer<K> keys, TypeSerializer<V> values, Comparator<ByteBuffer> comparator)
     {
         Pair<TypeSerializer<?>, TypeSerializer<?>> p = Pair.<TypeSerializer<?>, TypeSerializer<?>>create(keys, values);
         MapSerializer<K, V> t = instances.get(p);
         if (t == null)
         {
-            t = new MapSerializer<K, V>(keys, values);
+            t = new MapSerializer<K, V>(keys, values, comparator);
             instances.put(p, t);
         }
         return t;
     }
 
-    private MapSerializer(TypeSerializer<K> keys, TypeSerializer<V> values)
+    private MapSerializer(TypeSerializer<K> keys, TypeSerializer<V> values, Comparator<ByteBuffer> comparator)
     {
         this.keys = keys;
         this.values = values;
+        this.comparator = (p1, p2) -> comparator.compare(p1.left, p2.left);
     }
 
     public List<ByteBuffer> serializeValues(Map<K, V> map)
     {
-        List<ByteBuffer> buffers = new ArrayList<>(map.size() * 2);
+        List<Pair<ByteBuffer, ByteBuffer>> pairs = new ArrayList<>(map.size());
         for (Map.Entry<K, V> entry : map.entrySet())
+            pairs.add(Pair.create(keys.serialize(entry.getKey()), values.serialize(entry.getValue())));
+        Collections.sort(pairs, comparator);
+        List<ByteBuffer> buffers = new ArrayList<>(pairs.size() * 2);
+        for (Pair<ByteBuffer, ByteBuffer> p : pairs)
         {
-            buffers.add(keys.serialize(entry.getKey()));
-            buffers.add(values.serialize(entry.getValue()));
+            buffers.add(p.left);
+            buffers.add(p.right);
         }
         return buffers;
     }
@@ -162,19 +168,19 @@
     public String toString(Map<K, V> value)
     {
         StringBuilder sb = new StringBuilder();
+        sb.append('{');
         boolean isFirst = true;
         for (Map.Entry<K, V> element : value.entrySet())
         {
             if (isFirst)
                 isFirst = false;
             else
-                sb.append("; ");
-            sb.append('(');
+                sb.append(", ");
             sb.append(keys.toString(element.getKey()));
-            sb.append(", ");
+            sb.append(": ");
             sb.append(values.toString(element.getValue()));
-            sb.append(')');
         }
+        sb.append('}');
         return sb.toString();
     }
 

diff --git a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java
index 0ed14d3..14fde3b 100644
--- a/src/java/org/apache/cassandra/serializers/SetSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java

@@ -28,21 +28,23 @@
     private static final Map<TypeSerializer<?>, SetSerializer> instances = new HashMap<TypeSerializer<?>, SetSerializer>();
 
     public final TypeSerializer<T> elements;
+    private final Comparator<ByteBuffer> comparator;
 
-    public static synchronized <T> SetSerializer<T> getInstance(TypeSerializer<T> elements)
+    public static synchronized <T> SetSerializer<T> getInstance(TypeSerializer<T> elements, Comparator<ByteBuffer> elementComparator)
     {
         SetSerializer<T> t = instances.get(elements);
         if (t == null)
         {
-            t = new SetSerializer<T>(elements);
+            t = new SetSerializer<T>(elements, elementComparator);
             instances.put(elements, t);
         }
         return t;
     }
 
-    private SetSerializer(TypeSerializer<T> elements)
+    private SetSerializer(TypeSerializer<T> elements, Comparator<ByteBuffer> comparator)
     {
         this.elements = elements;
+        this.comparator = comparator;
     }
 
     public List<ByteBuffer> serializeValues(Set<T> values)
@@ -50,6 +52,7 @@
         List<ByteBuffer> buffers = new ArrayList<>(values.size());
         for (T value : values)
             buffers.add(elements.serialize(value));
+        Collections.sort(buffers, comparator);
         return buffers;
     }
 
@@ -114,6 +117,7 @@
     public String toString(Set<T> value)
     {
         StringBuilder sb = new StringBuilder();
+        sb.append('{');
         boolean isFirst = true;
         for (T element : value)
         {
@@ -123,10 +127,11 @@
             }
             else
             {
-                sb.append("; ");
+                sb.append(", ");
             }
             sb.append(elements.toString(element));
         }
+        sb.append('}');
         return sb.toString();
     }
 

diff --git a/src/java/org/apache/cassandra/serializers/TimestampSerializer.java b/src/java/org/apache/cassandra/serializers/TimestampSerializer.java
index 77a5df9..9bd9a8d 100644
--- a/src/java/org/apache/cassandra/serializers/TimestampSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/TimestampSerializer.java

@@ -22,7 +22,8 @@
 import java.nio.ByteBuffer;
 import java.text.SimpleDateFormat;
 import java.text.ParseException;
-import java.util.*;
+import java.util.Date;
+import java.util.TimeZone;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang3.time.DateUtils;
@@ -73,7 +74,7 @@
             "yyyy-MM-dd'T'HH:mm:ss.SSS z",
             "yyyy-MM-dd'T'HH:mm:ss.SSS zz",
             "yyyy-MM-dd'T'HH:mm:ss.SSS zzz",
-            "yyyy-MM-dd'T'HH:mm:ss.SSSX",
+            "yyyy-MM-dd'T'HH:mm:ss.SSSX",  // UTC_FORMAT
             "yyyy-MM-dd'T'HH:mm:ss.SSSXX",
             "yyyy-MM-dd'T'HH:mm:ss.SSSXXX",
             "yyyy-MM-dd",
@@ -96,6 +97,17 @@
         }
     };
 
+    private static final String UTC_FORMAT = dateStringPatterns[40];
+    private static final ThreadLocal<SimpleDateFormat> FORMATTER_UTC = new ThreadLocal<SimpleDateFormat>()
+    {
+        protected SimpleDateFormat initialValue()
+        {
+            SimpleDateFormat sdf = new SimpleDateFormat(UTC_FORMAT);
+            sdf.setTimeZone(TimeZone.getTimeZone("UTC"));
+            return sdf;
+        }
+    };
+    
     private static final String TO_JSON_FORMAT = dateStringPatterns[19];
     private static final ThreadLocal<SimpleDateFormat> FORMATTER_TO_JSON = new ThreadLocal<SimpleDateFormat>()
     {
@@ -106,6 +118,8 @@
             return sdf;
         }
     };
+
+
     
     public static final TimestampSerializer instance = new TimestampSerializer();
 
@@ -164,8 +178,25 @@
         return value == null ? "" : FORMATTER.get().format(value);
     }
 
+    public String toStringUTC(Date value)
+    {
+        return value == null ? "" : FORMATTER_UTC.get().format(value);
+    }
+
     public Class<Date> getType()
     {
         return Date.class;
     }
+
+    /**
+     * Builds CQL literal for a timestamp using time zone UTC and fixed date format.
+     * @see #FORMATTER_UTC
+     */
+    @Override
+    public String toCQLLiteral(ByteBuffer buffer)
+    {
+        return buffer == null || !buffer.hasRemaining()
+             ? "null"
+             : FORMATTER_UTC.get().format(deserialize(buffer));
+    }
 }

diff --git a/src/java/org/apache/cassandra/serializers/TypeSerializer.java b/src/java/org/apache/cassandra/serializers/TypeSerializer.java
index cddef08..e66c36d 100644
--- a/src/java/org/apache/cassandra/serializers/TypeSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/TypeSerializer.java

@@ -34,5 +34,12 @@
     public String toString(T value);
 
     public Class<T> getType();
+
+    public default String toCQLLiteral(ByteBuffer buffer)
+    {
+        return buffer == null || !buffer.hasRemaining()
+             ? "null"
+             : toString(deserialize(buffer));
+    }
 }
 

diff --git a/src/java/org/apache/cassandra/service/AbstractReadExecutor.java b/src/java/org/apache/cassandra/service/AbstractReadExecutor.java
index 2bfd059..177fdb2 100644
--- a/src/java/org/apache/cassandra/service/AbstractReadExecutor.java
+++ b/src/java/org/apache/cassandra/service/AbstractReadExecutor.java

@@ -28,25 +28,23 @@
 
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
-import org.apache.cassandra.config.CFMetaData.SpeculativeRetry.RetryType;
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.config.ReadRepairDecision;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.ReadCommand;
-import org.apache.cassandra.db.ReadResponse;
-import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.exceptions.ReadFailureException;
 import org.apache.cassandra.exceptions.ReadTimeoutException;
 import org.apache.cassandra.exceptions.UnavailableException;
 import org.apache.cassandra.metrics.ReadRepairMetrics;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.SpeculativeRetryParam;
 import org.apache.cassandra.service.StorageProxy.LocalReadRunnable;
 import org.apache.cassandra.tracing.TraceState;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * Sends a read request to the replicas needed to satisfy a given ConsistencyLevel.
@@ -62,22 +60,24 @@
 
     protected final ReadCommand command;
     protected final List<InetAddress> targetReplicas;
-    protected final RowDigestResolver resolver;
-    protected final ReadCallback<ReadResponse, Row> handler;
+    protected final ReadCallback handler;
     protected final TraceState traceState;
 
-    AbstractReadExecutor(ReadCommand command, ConsistencyLevel consistencyLevel, List<InetAddress> targetReplicas)
+    AbstractReadExecutor(Keyspace keyspace, ReadCommand command, ConsistencyLevel consistencyLevel, List<InetAddress> targetReplicas)
     {
         this.command = command;
         this.targetReplicas = targetReplicas;
-        resolver = new RowDigestResolver(command.ksName, command.key, targetReplicas.size());
-        traceState = Tracing.instance.get();
-        handler = new ReadCallback<>(resolver, consistencyLevel, command, targetReplicas);
-    }
+        this.handler = new ReadCallback(new DigestResolver(keyspace, command, consistencyLevel, targetReplicas.size()), consistencyLevel, command, targetReplicas);
+        this.traceState = Tracing.instance.get();
 
-    private static boolean isLocalRequest(InetAddress replica)
-    {
-        return replica.equals(FBUtilities.getBroadcastAddress());
+        // Set the digest version (if we request some digests). This is the smallest version amongst all our target replicas since new nodes
+        // knows how to produce older digest but the reverse is not true.
+        // TODO: we need this when talking with pre-3.0 nodes. So if we preserve the digest format moving forward, we can get rid of this once
+        // we stop being compatible with pre-3.0 nodes.
+        int digestVersion = MessagingService.current_version;
+        for (InetAddress replica : targetReplicas)
+            digestVersion = Math.min(digestVersion, MessagingService.instance().getVersion(replica));
+        command.setDigestVersion(digestVersion);
     }
 
     protected void makeDataRequests(Iterable<InetAddress> endpoints)
@@ -88,17 +88,16 @@
 
     protected void makeDigestRequests(Iterable<InetAddress> endpoints)
     {
-        makeRequests(command.copy().setIsDigestQuery(true), endpoints);
+        makeRequests(command.copyAsDigestQuery(), endpoints);
     }
 
     private void makeRequests(ReadCommand readCommand, Iterable<InetAddress> endpoints)
     {
-        MessageOut<ReadCommand> message = null;
         boolean hasLocalEndpoint = false;
 
         for (InetAddress endpoint : endpoints)
         {
-            if (isLocalRequest(endpoint))
+            if (StorageProxy.canDoLocalRequest(endpoint))
             {
                 hasLocalEndpoint = true;
                 continue;
@@ -107,8 +106,7 @@
             if (traceState != null)
                 traceState.trace("reading {} from {}", readCommand.isDigestQuery() ? "digest" : "data", endpoint);
             logger.trace("reading {} from {}", readCommand.isDigestQuery() ? "digest" : "data", endpoint);
-            if (message == null)
-                message = readCommand.createMessage();
+            MessageOut<ReadCommand> message = readCommand.createMessage(MessagingService.instance().getVersion(endpoint));
             MessagingService.instance().sendRRWithFailure(message, endpoint, handler);
         }
 
@@ -142,7 +140,7 @@
      * wait for an answer.  Blocks until success or timeout, so it is caller's
      * responsibility to call maybeTryAdditionalReplicas first.
      */
-    public Row get() throws ReadFailureException, ReadTimeoutException, DigestMismatchException
+    public PartitionIterator get() throws ReadFailureException, ReadTimeoutException, DigestMismatchException
     {
         return handler.get();
     }
@@ -150,11 +148,14 @@
     /**
      * @return an executor appropriate for the configured speculative read policy
      */
-    public static AbstractReadExecutor getReadExecutor(ReadCommand command, ConsistencyLevel consistencyLevel) throws UnavailableException
+    public static AbstractReadExecutor getReadExecutor(SinglePartitionReadCommand command, ConsistencyLevel consistencyLevel) throws UnavailableException
     {
-        Keyspace keyspace = Keyspace.open(command.ksName);
-        List<InetAddress> allReplicas = StorageProxy.getLiveSortedEndpoints(keyspace, command.key);
-        ReadRepairDecision repairDecision = Schema.instance.getCFMetaData(command.ksName, command.cfName).newReadRepairDecision();
+        Keyspace keyspace = Keyspace.open(command.metadata().ksName);
+        List<InetAddress> allReplicas = StorageProxy.getLiveSortedEndpoints(keyspace, command.partitionKey());
+        // 11980: Excluding EACH_QUORUM reads from potential RR, so that we do not miscount DC responses
+        ReadRepairDecision repairDecision = consistencyLevel == ConsistencyLevel.EACH_QUORUM
+                                            ? ReadRepairDecision.NONE
+                                            : command.metadata().newReadRepairDecision();
         List<InetAddress> targetReplicas = consistencyLevel.filterForQuery(keyspace, allReplicas, repairDecision);
 
         // Throw UAE early if we don't have enough replicas.
@@ -166,19 +167,22 @@
             ReadRepairMetrics.attempted.mark();
         }
 
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.cfName);
-        RetryType retryType = cfs.metadata.getSpeculativeRetry().type;
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.metadata().cfId);
+        SpeculativeRetryParam retry = cfs.metadata.params.speculativeRetry;
 
         // Speculative retry is disabled *OR* there are simply no extra replicas to speculate.
-        if (retryType == RetryType.NONE || consistencyLevel.blockFor(keyspace) == allReplicas.size())
-            return new NeverSpeculatingReadExecutor(command, consistencyLevel, targetReplicas);
+        // 11980: Disable speculative retry if using EACH_QUORUM in order to prevent miscounting DC responses
+        if (retry.equals(SpeculativeRetryParam.NONE)
+            || consistencyLevel == ConsistencyLevel.EACH_QUORUM
+            || consistencyLevel.blockFor(keyspace) == allReplicas.size())
+            return new NeverSpeculatingReadExecutor(keyspace, command, consistencyLevel, targetReplicas);
 
         if (targetReplicas.size() == allReplicas.size())
         {
             // CL.ALL, RRD.GLOBAL or RRD.DC_LOCAL and a single-DC.
             // We are going to contact every node anyway, so ask for 2 full data requests instead of 1, for redundancy
             // (same amount of requests in total, but we turn 1 digest request into a full blown data request).
-            return new AlwaysSpeculatingReadExecutor(cfs, command, consistencyLevel, targetReplicas);
+            return new AlwaysSpeculatingReadExecutor(keyspace, cfs, command, consistencyLevel, targetReplicas);
         }
 
         // RRD.NONE or RRD.DC_LOCAL w/ multiple DCs.
@@ -198,17 +202,17 @@
         }
         targetReplicas.add(extraReplica);
 
-        if (retryType == RetryType.ALWAYS)
-            return new AlwaysSpeculatingReadExecutor(cfs, command, consistencyLevel, targetReplicas);
+        if (retry.equals(SpeculativeRetryParam.ALWAYS))
+            return new AlwaysSpeculatingReadExecutor(keyspace, cfs, command, consistencyLevel, targetReplicas);
         else // PERCENTILE or CUSTOM.
-            return new SpeculatingReadExecutor(cfs, command, consistencyLevel, targetReplicas);
+            return new SpeculatingReadExecutor(keyspace, cfs, command, consistencyLevel, targetReplicas);
     }
 
-    private static class NeverSpeculatingReadExecutor extends AbstractReadExecutor
+    public static class NeverSpeculatingReadExecutor extends AbstractReadExecutor
     {
-        public NeverSpeculatingReadExecutor(ReadCommand command, ConsistencyLevel consistencyLevel, List<InetAddress> targetReplicas)
+        public NeverSpeculatingReadExecutor(Keyspace keyspace, ReadCommand command, ConsistencyLevel consistencyLevel, List<InetAddress> targetReplicas)
         {
-            super(command, consistencyLevel, targetReplicas);
+            super(keyspace, command, consistencyLevel, targetReplicas);
         }
 
         public void executeAsync()
@@ -234,12 +238,13 @@
         private final ColumnFamilyStore cfs;
         private volatile boolean speculated = false;
 
-        public SpeculatingReadExecutor(ColumnFamilyStore cfs,
+        public SpeculatingReadExecutor(Keyspace keyspace,
+                                       ColumnFamilyStore cfs,
                                        ReadCommand command,
                                        ConsistencyLevel consistencyLevel,
                                        List<InetAddress> targetReplicas)
         {
-            super(command, consistencyLevel, targetReplicas);
+            super(keyspace, command, consistencyLevel, targetReplicas);
             this.cfs = cfs;
         }
 
@@ -278,14 +283,15 @@
             {
                 // Could be waiting on the data, or on enough digests.
                 ReadCommand retryCommand = command;
-                if (resolver.getData() != null)
-                    retryCommand = command.copy().setIsDigestQuery(true);
+                if (handler.resolver.isDataPresent())
+                    retryCommand = command.copyAsDigestQuery();
 
                 InetAddress extraReplica = Iterables.getLast(targetReplicas);
                 if (traceState != null)
                     traceState.trace("speculating read retry on {}", extraReplica);
                 logger.trace("speculating read retry on {}", extraReplica);
-                MessagingService.instance().sendRRWithFailure(retryCommand.createMessage(), extraReplica, handler);
+                int version = MessagingService.instance().getVersion(extraReplica);
+                MessagingService.instance().sendRRWithFailure(retryCommand.createMessage(version), extraReplica, handler);
                 speculated = true;
 
                 cfs.metric.speculativeRetries.inc();
@@ -304,12 +310,13 @@
     {
         private final ColumnFamilyStore cfs;
 
-        public AlwaysSpeculatingReadExecutor(ColumnFamilyStore cfs,
+        public AlwaysSpeculatingReadExecutor(Keyspace keyspace,
+                                             ColumnFamilyStore cfs,
                                              ReadCommand command,
                                              ConsistencyLevel consistencyLevel,
                                              List<InetAddress> targetReplicas)
         {
-            super(command, consistencyLevel, targetReplicas);
+            super(keyspace, command, consistencyLevel, targetReplicas);
             this.cfs = cfs;
         }
 

diff --git a/src/java/org/apache/cassandra/service/AbstractRowResolver.java b/src/java/org/apache/cassandra/service/AbstractRowResolver.java
deleted file mode 100644
index f362047..0000000
--- a/src/java/org/apache/cassandra/service/AbstractRowResolver.java
+++ /dev/null

@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import java.nio.ByteBuffer;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.ReadResponse;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.utils.concurrent.Accumulator;
-
-public abstract class AbstractRowResolver implements IResponseResolver<ReadResponse, Row>
-{
-    protected static final Logger logger = LoggerFactory.getLogger(AbstractRowResolver.class);
-
-    protected final String keyspaceName;
-    // Accumulator gives us non-blocking thread-safety with optimal algorithmic constraints
-    protected final Accumulator<MessageIn<ReadResponse>> replies;
-    protected final DecoratedKey key;
-
-    public AbstractRowResolver(ByteBuffer key, String keyspaceName, int maxResponseCount)
-    {
-        this.key = StorageService.getPartitioner().decorateKey(key);
-        this.keyspaceName = keyspaceName;
-        this.replies = new Accumulator<>(maxResponseCount);
-    }
-
-    public void preprocess(MessageIn<ReadResponse> message)
-    {
-        replies.add(message);
-    }
-
-    public Iterable<MessageIn<ReadResponse>> getMessages()
-    {
-        return replies;
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
index 8978034..e3ba66e 100644
--- a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
+++ b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java

@@ -47,7 +47,7 @@
     public final ConsistencyLevel consistencyLevel;
     protected final Runnable callback;
     protected final Collection<InetAddress> pendingEndpoints;
-    private final WriteType writeType;
+    protected final WriteType writeType;
     private static final AtomicIntegerFieldUpdater<AbstractWriteResponseHandler> failuresUpdater
         = AtomicIntegerFieldUpdater.newUpdater(AbstractWriteResponseHandler.class, "failures");
     private volatile int failures = 0;

diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java
index 7d56e4b..f63cb86 100644
--- a/src/java/org/apache/cassandra/service/ActiveRepairService.java
+++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java

@@ -38,12 +38,11 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.gms.ApplicationState;
@@ -54,7 +53,6 @@
 import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
 import org.apache.cassandra.gms.IFailureDetectionEventListener;
 import org.apache.cassandra.gms.VersionedValue;
-import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.net.IAsyncCallbackWithFailure;
@@ -67,6 +65,7 @@
 import org.apache.cassandra.repair.RepairSession;
 import org.apache.cassandra.repair.messages.*;
 import org.apache.cassandra.utils.CassandraVersion;
+import org.apache.cassandra.utils.Clock;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.concurrent.Ref;
@@ -130,7 +129,7 @@
      * @return Future for asynchronous call or null if there is no need to repair
      */
     public RepairSession submitRepairSession(UUID parentRepairSession,
-                                             Range<Token> range,
+                                             Collection<Range<Token>> range,
                                              String keyspace,
                                              RepairParallelism parallelismDegree,
                                              Set<InetAddress> endpoints,
@@ -206,7 +205,9 @@
      *
      * @return neighbors with whom we share the provided range
      */
-    public static Set<InetAddress> getNeighbors(String keyspaceName, Collection<Range<Token>> keyspaceLocalRanges, Range<Token> toRepair, Collection<String> dataCenters, Collection<String> hosts)
+    public static Set<InetAddress> getNeighbors(String keyspaceName, Collection<Range<Token>> keyspaceLocalRanges,
+                                                Range<Token> toRepair, Collection<String> dataCenters,
+                                                Collection<String> hosts)
     {
         StorageService ss = StorageService.instance;
         Map<Range<Token>, List<InetAddress>> replicaSets = ss.getRangeToAddressMap(keyspaceName);
@@ -220,7 +221,10 @@
             }
             else if (range.intersects(toRepair))
             {
-                throw new IllegalArgumentException("Requested range intersects a local range but is not fully contained in one; this would lead to imprecise repair");
+                throw new IllegalArgumentException(String.format("Requested range %s intersects a local range (%s) " +
+                                                                 "but is not fully contained in one; this would lead to " +
+                                                                 "imprecise repair. keyspace: %s", toRepair.toString(),
+                                                                 range.toString(), keyspaceName));
             }
         }
         if (rangeSuperSet == null || !replicaSets.containsKey(rangeSuperSet))
@@ -277,9 +281,10 @@
         return neighbors;
     }
 
-    public synchronized UUID prepareForRepair(UUID parentRepairSession, InetAddress coordinator, Set<InetAddress> endpoints, RepairOption options, List<ColumnFamilyStore> columnFamilyStores)
+    public UUID prepareForRepair(UUID parentRepairSession, InetAddress coordinator, Set<InetAddress> endpoints, RepairOption options, List<ColumnFamilyStore> columnFamilyStores)
     {
-        registerParentRepairSession(parentRepairSession, coordinator, columnFamilyStores, options.getRanges(), options.isIncremental(), options.isGlobal());
+        long timestamp = Clock.instance.currentTimeMillis();
+        registerParentRepairSession(parentRepairSession, coordinator, columnFamilyStores, options.getRanges(), options.isIncremental(), timestamp, options.isGlobal());
         final CountDownLatch prepareLatch = new CountDownLatch(endpoints.size());
         final AtomicBoolean status = new AtomicBoolean(true);
         final Set<String> failedNodes = Collections.synchronizedSet(new HashSet<String>());
@@ -311,40 +316,43 @@
         {
             if (FailureDetector.instance.isAlive(neighbour))
             {
-                CassandraVersion peerVersion = SystemKeyspace.getReleaseVersion(neighbour);
-                boolean isGlobal = options.isGlobal() && peerVersion != null && peerVersion.compareTo(SUPPORTS_GLOBAL_PREPARE_FLAG_VERSION) >= 0;
-                logger.debug("Sending prepare message: options.isGlobal = {}, peerVersion = {}", options.isGlobal(), peerVersion);
-                PrepareMessage message = new PrepareMessage(parentRepairSession, cfIds, options.getRanges(), options.isIncremental(), isGlobal);
+                PrepareMessage message = new PrepareMessage(parentRepairSession, cfIds, options.getRanges(), options.isIncremental(), timestamp, options.isGlobal());
                 MessageOut<RepairMessage> msg = message.createMessage();
                 MessagingService.instance().sendRR(msg, neighbour, callback, TimeUnit.HOURS.toMillis(1), true);
             }
             else
             {
-                status.set(false);
-                failedNodes.add(neighbour.getHostAddress());
-                prepareLatch.countDown();
+                // bailout early to avoid potentially waiting for a long time.
+                failRepair(parentRepairSession, "Endpoint not alive: " + neighbour);
             }
         }
+
         try
         {
-            prepareLatch.await(1, TimeUnit.HOURS);
+            // Failed repair is expensive so we wait for longer time.
+            if (!prepareLatch.await(1, TimeUnit.HOURS)) {
+                failRepair(parentRepairSession, "Did not get replies from all endpoints.");
+            }
         }
         catch (InterruptedException e)
         {
-            removeParentRepairSession(parentRepairSession);
-            throw new RuntimeException("Did not get replies from all endpoints. List of failed endpoint(s): " + failedNodes.toString(), e);
+            failRepair(parentRepairSession, "Interrupted while waiting for prepare repair response.");
         }
 
         if (!status.get())
         {
-            removeParentRepairSession(parentRepairSession);
-            throw new RuntimeException("Did not get positive replies from all endpoints. List of failed endpoint(s): " + failedNodes.toString());
+            failRepair(parentRepairSession, "Got negative replies from endpoints " + failedNodes);
         }
 
         return parentRepairSession;
     }
 
-    public void registerParentRepairSession(UUID parentRepairSession, InetAddress coordinator, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, boolean isIncremental, boolean isGlobal)
+    private void failRepair(UUID parentRepairSession, String errorMsg) {
+        removeParentRepairSession(parentRepairSession);
+        throw new RuntimeException(errorMsg);
+    }
+
+    public synchronized void registerParentRepairSession(UUID parentRepairSession, InetAddress coordinator, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, boolean isIncremental, long timestamp, boolean isGlobal)
     {
         if (!registeredForEndpointChanges)
         {
@@ -352,7 +360,11 @@
             FailureDetector.instance.registerFailureDetectionEventListener(this);
             registeredForEndpointChanges = true;
         }
-        parentRepairSessions.put(parentRepairSession, new ParentRepairSession(coordinator, columnFamilyStores, ranges, isIncremental, isGlobal, System.currentTimeMillis()));
+
+        if (!parentRepairSessions.containsKey(parentRepairSession))
+        {
+            parentRepairSessions.put(parentRepairSession, new ParentRepairSession(coordinator, columnFamilyStores, ranges, isIncremental, timestamp, isGlobal));
+        }
     }
 
     public Set<SSTableReader> currentlyRepairing(UUID cfId, UUID parentRepairSession)
@@ -434,7 +446,7 @@
         //in addition to other scenarios such as repairs not involving all DCs or hosts
         if (!prs.isGlobal)
         {
-            logger.info("Not a global repair, will not do anticompaction");
+            logger.info("[repair #{}] Not a global repair, will not do anticompaction", parentRepairSession);
             removeParentRepairSession(parentRepairSession);
             return Futures.immediateFuture(Collections.emptyList());
         }
@@ -448,7 +460,7 @@
             {
                 Refs<SSTableReader> sstables = prs.getActiveRepairedSSTableRefsForAntiCompaction(columnFamilyStoreEntry.getKey(), parentRepairSession);
                 ColumnFamilyStore cfs = columnFamilyStoreEntry.getValue();
-                futures.add(CompactionManager.instance.submitAntiCompaction(cfs, successfulRanges, sstables, prs.repairedAt));
+                futures.add(CompactionManager.instance.submitAntiCompaction(cfs, successfulRanges, sstables, prs.repairedAt, parentRepairSession));
             }
         }
 
@@ -475,7 +487,7 @@
         {
             case VALIDATION_COMPLETE:
                 ValidationComplete validation = (ValidationComplete) message;
-                session.validationComplete(desc, endpoint, validation.tree);
+                session.validationComplete(desc, endpoint, validation.trees);
                 break;
             case SYNC_COMPLETE:
                 // one of replica is synced.
@@ -513,7 +525,7 @@
          */
         private final Set<UUID> marked = new HashSet<>();
 
-        public ParentRepairSession(InetAddress coordinator, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, boolean isIncremental, boolean isGlobal, long repairedAt)
+        public ParentRepairSession(InetAddress coordinator, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, boolean isIncremental, long repairedAt, boolean isGlobal)
         {
             this.coordinator = coordinator;
             for (ColumnFamilyStore cfs : columnFamilyStores)
@@ -523,8 +535,8 @@
             }
             this.ranges = ranges;
             this.repairedAt = repairedAt;
-            this.isGlobal = isGlobal;
             this.isIncremental = isIncremental;
+            this.isGlobal = isGlobal;
         }
 
         /**
@@ -539,7 +551,7 @@
         {
             if (!marked.contains(cfId))
             {
-                List<SSTableReader> sstables = columnFamilyStores.get(cfId).select(isIncremental ? ColumnFamilyStore.UNREPAIRED_SSTABLES : ColumnFamilyStore.CANONICAL_SSTABLES).sstables;
+                List<SSTableReader> sstables = columnFamilyStores.get(cfId).select(View.select(SSTableSet.CANONICAL, (s) -> !isIncremental || !s.isRepaired())).sstables;
                 Set<SSTableReader> currentlyRepairing = ActiveRepairService.instance.currentlyRepairing(cfId, parentSessionId);
                 if (!Sets.intersection(currentlyRepairing, Sets.newHashSet(sstables)).isEmpty())
                 {
@@ -612,7 +624,7 @@
             {
                 throw new RuntimeException(e);
             }
-            for (SSTableReader sstable : cfs.select(ColumnFamilyStore.CANONICAL_SSTABLES).sstables)
+            for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
                 if (snapshotGenerations.contains(sstable.descriptor.generation))
                     activeSSTables.add(sstable);
             return activeSSTables;
@@ -629,7 +641,7 @@
                     {
                         return sstable != null &&
                                (!isIncremental || !sstable.isRepaired()) &&
-                               !(sstable.partitioner instanceof LocalPartitioner) && // exclude SSTables from 2i
+                               !(sstable.metadata.isIndex()) && // exclude SSTables from 2i
                                new Bounds<>(sstable.first.getToken(), sstable.last.getToken()).intersects(ranges);
                     }
                 }, true);
@@ -678,7 +690,7 @@
             Set<SSTableReader> activeSSTables = new HashSet<>();
             Set<String> activeSSTableNames = new HashSet<>();
             ColumnFamilyStore cfs = columnFamilyStores.get(cfId);
-            for (SSTableReader sstable : cfs.select(ColumnFamilyStore.CANONICAL_SSTABLES).sstables)
+            for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
             {
                 if (repairedSSTables.contains(sstable.getFilename()))
                 {

diff --git a/src/java/org/apache/cassandra/service/AsyncRepairCallback.java b/src/java/org/apache/cassandra/service/AsyncRepairCallback.java
index 6ac765b..d613f3d 100644
--- a/src/java/org/apache/cassandra/service/AsyncRepairCallback.java
+++ b/src/java/org/apache/cassandra/service/AsyncRepairCallback.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.service;
 
-import java.io.IOException;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.cassandra.concurrent.Stage;
@@ -29,11 +28,11 @@
 
 public class AsyncRepairCallback implements IAsyncCallback<ReadResponse>
 {
-    private final RowDataResolver repairResolver;
+    private final DataResolver repairResolver;
     private final int blockfor;
     protected final AtomicInteger received = new AtomicInteger(0);
 
-    public AsyncRepairCallback(RowDataResolver repairResolver, int blockfor)
+    public AsyncRepairCallback(DataResolver repairResolver, int blockfor)
     {
         this.repairResolver = repairResolver;
         this.blockfor = blockfor;
@@ -46,9 +45,9 @@
         {
             StageManager.getStage(Stage.READ_REPAIR).execute(new WrappedRunnable()
             {
-                protected void runMayThrow() throws DigestMismatchException, IOException
+                protected void runMayThrow()
                 {
-                    repairResolver.resolve();
+                    repairResolver.compareResponses();
                 }
             });
         }

diff --git a/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java b/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java
new file mode 100644
index 0000000..a1477e6
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java

@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service;
+
+import java.net.InetAddress;
+import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
+
+import org.apache.cassandra.exceptions.WriteFailureException;
+import org.apache.cassandra.exceptions.WriteTimeoutException;
+import org.apache.cassandra.net.MessageIn;
+
+public class BatchlogResponseHandler<T> extends AbstractWriteResponseHandler<T>
+{
+    AbstractWriteResponseHandler<T> wrapped;
+    BatchlogCleanup cleanup;
+    protected volatile int requiredBeforeFinish;
+    private static final AtomicIntegerFieldUpdater<BatchlogResponseHandler> requiredBeforeFinishUpdater
+            = AtomicIntegerFieldUpdater.newUpdater(BatchlogResponseHandler.class, "requiredBeforeFinish");
+
+    public BatchlogResponseHandler(AbstractWriteResponseHandler<T> wrapped, int requiredBeforeFinish, BatchlogCleanup cleanup)
+    {
+        super(wrapped.keyspace, wrapped.naturalEndpoints, wrapped.pendingEndpoints, wrapped.consistencyLevel, wrapped.callback, wrapped.writeType);
+        this.wrapped = wrapped;
+        this.requiredBeforeFinish = requiredBeforeFinish;
+        this.cleanup = cleanup;
+    }
+
+    protected int ackCount()
+    {
+        return wrapped.ackCount();
+    }
+
+    public void response(MessageIn<T> msg)
+    {
+        wrapped.response(msg);
+        if (requiredBeforeFinishUpdater.decrementAndGet(this) == 0)
+            cleanup.ackMutation();
+    }
+
+    public boolean isLatencyForSnitch()
+    {
+        return wrapped.isLatencyForSnitch();
+    }
+
+    public void onFailure(InetAddress from)
+    {
+        wrapped.onFailure(from);
+    }
+
+    public void assureSufficientLiveNodes()
+    {
+        wrapped.assureSufficientLiveNodes();
+    }
+
+    public void get() throws WriteTimeoutException, WriteFailureException
+    {
+        wrapped.get();
+    }
+
+    protected int totalBlockFor()
+    {
+        return wrapped.totalBlockFor();
+    }
+
+    protected int totalEndpoints()
+    {
+        return wrapped.totalEndpoints();
+    }
+
+    protected boolean waitingFor(InetAddress from)
+    {
+        return wrapped.waitingFor(from);
+    }
+
+    protected void signal()
+    {
+        wrapped.signal();
+    }
+
+    public static class BatchlogCleanup
+    {
+        private final BatchlogCleanupCallback callback;
+
+        protected volatile int mutationsWaitingFor;
+        private static final AtomicIntegerFieldUpdater<BatchlogCleanup> mutationsWaitingForUpdater
+            = AtomicIntegerFieldUpdater.newUpdater(BatchlogCleanup.class, "mutationsWaitingFor");
+
+        public BatchlogCleanup(int mutationsWaitingFor, BatchlogCleanupCallback callback)
+        {
+            this.mutationsWaitingFor = mutationsWaitingFor;
+            this.callback = callback;
+        }
+
+        public void ackMutation()
+        {
+            if (mutationsWaitingForUpdater.decrementAndGet(this) == 0)
+                callback.invoke();
+        }
+    }
+
+    public interface BatchlogCleanupCallback
+    {
+        void invoke();
+    }
+}

diff --git a/src/java/org/apache/cassandra/service/CASRequest.java b/src/java/org/apache/cassandra/service/CASRequest.java
index 3d86637..1db100d 100644
--- a/src/java/org/apache/cassandra/service/CASRequest.java
+++ b/src/java/org/apache/cassandra/service/CASRequest.java

@@ -17,8 +17,9 @@
  */
 package org.apache.cassandra.service;
 
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
 /**
@@ -27,19 +28,19 @@
 public interface CASRequest
 {
     /**
-     * The filter to use to fetch the value to compare for the CAS.
+     * The command to use to fetch the value to compare for the CAS.
      */
-    public IDiskAtomFilter readFilter();
+    public SinglePartitionReadCommand readCommand(int nowInSec);
 
     /**
      * Returns whether the provided CF, that represents the values fetched using the
      * readFilter(), match the CAS conditions this object stands for.
      */
-    public boolean appliesTo(ColumnFamily current) throws InvalidRequestException;
+    public boolean appliesTo(FilteredPartition current) throws InvalidRequestException;
 
     /**
      * The updates to perform of a CAS success. The values fetched using the readFilter()
      * are passed as argument.
      */
-    public ColumnFamily makeUpdates(ColumnFamily current) throws InvalidRequestException;
+    public PartitionUpdate makeUpdates(FilteredPartition current) throws InvalidRequestException;
 }

diff --git a/src/java/org/apache/cassandra/service/CacheService.java b/src/java/org/apache/cassandra/service/CacheService.java
index 3872e5f..c4d1722 100644
--- a/src/java/org/apache/cassandra/service/CacheService.java
+++ b/src/java/org/apache/cassandra/service/CacheService.java

@@ -17,11 +17,9 @@
  */
 package org.apache.cassandra.service;
 
-import java.io.DataInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.Callable;
@@ -30,27 +28,32 @@
 
 import com.google.common.util.concurrent.Futures;
 
+import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.cache.*;
 import org.apache.cassandra.cache.AutoSavingCache.CacheSerializer;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.CachedBTreePartition;
+import org.apache.cassandra.db.partitions.CachedPartition;
 import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MBeanWrapper;
 import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public class CacheService implements CacheServiceMBean
 {
@@ -349,33 +352,53 @@
             ByteBufferUtil.writeWithLength(key.cellName, out);
         }
 
-        public Future<Pair<CounterCacheKey, ClockAndCount>> deserialize(DataInputStream in, final ColumnFamilyStore cfs) throws IOException
+        public Future<Pair<CounterCacheKey, ClockAndCount>> deserialize(DataInputPlus in, final ColumnFamilyStore cfs) throws IOException
         {
             //Keyspace and CF name are deserialized by AutoSaving cache and used to fetch the CFS provided as a
             //parameter so they aren't deserialized here, even though they are serialized by this serializer
             final ByteBuffer partitionKey = ByteBufferUtil.readWithLength(in);
-            ByteBuffer cellNameBuffer = ByteBufferUtil.readWithLength(in);
+            final ByteBuffer cellName = ByteBufferUtil.readWithLength(in);
             if (cfs == null || !cfs.metadata.isCounter() || !cfs.isCounterCacheEnabled())
                 return null;
             assert(cfs.metadata.isCounter());
-            final CellName cellName = cfs.metadata.comparator.cellFromByteBuffer(cellNameBuffer);
             return StageManager.getStage(Stage.READ).submit(new Callable<Pair<CounterCacheKey, ClockAndCount>>()
             {
                 public Pair<CounterCacheKey, ClockAndCount> call() throws Exception
                 {
-                    DecoratedKey key = cfs.partitioner.decorateKey(partitionKey);
-                    QueryFilter filter = QueryFilter.getNamesFilter(key,
-                                                                    cfs.metadata.cfName,
-                                                                    FBUtilities.singleton(cellName, cfs.metadata.comparator),
-                                                                    Long.MIN_VALUE);
-                    ColumnFamily cf = cfs.getTopLevelColumns(filter, Integer.MIN_VALUE);
-                    if (cf == null)
-                        return null;
-                    Cell cell = cf.getColumn(cellName);
-                    if (cell == null || !cell.isLive(Long.MIN_VALUE))
-                        return null;
-                    ClockAndCount clockAndCount = CounterContext.instance().getLocalClockAndCount(cell.value());
-                    return Pair.create(CounterCacheKey.create(cfs.metadata.ksAndCFName, partitionKey, cellName), clockAndCount);
+                    DecoratedKey key = cfs.decorateKey(partitionKey);
+                    LegacyLayout.LegacyCellName name = LegacyLayout.decodeCellName(cfs.metadata, cellName);
+                    ColumnDefinition column = name.column;
+                    CellPath path = name.collectionElement == null ? null : CellPath.create(name.collectionElement);
+
+                    int nowInSec = FBUtilities.nowInSeconds();
+                    ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+                    if (path == null)
+                        builder.add(column);
+                    else
+                        builder.select(column, path);
+
+                    ClusteringIndexFilter filter = new ClusteringIndexNamesFilter(FBUtilities.<Clustering>singleton(name.clustering, cfs.metadata.comparator), false);
+                    SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(cfs.metadata, nowInSec, key, builder.build(), filter);
+                    try (OpOrder.Group op = cfs.readOrdering.start(); RowIterator iter = UnfilteredRowIterators.filter(cmd.queryMemtableAndDisk(cfs, op), nowInSec))
+                    {
+                        Cell cell;
+                        if (column.isStatic())
+                        {
+                            cell = iter.staticRow().getCell(column);
+                        }
+                        else
+                        {
+                            if (!iter.hasNext())
+                                return null;
+                            cell = iter.next().getCell(column);
+                        }
+
+                        if (cell == null)
+                            return null;
+
+                        ClockAndCount clockAndCount = CounterContext.instance().getLocalClockAndCount(cell.value());
+                        return Pair.create(CounterCacheKey.create(cfs.metadata.ksAndCFName, partitionKey, name.clustering, column, path), clockAndCount);
+                    }
                 }
             });
         }
@@ -385,28 +408,32 @@
     {
         public void serialize(RowCacheKey key, DataOutputPlus out, ColumnFamilyStore cfs) throws IOException
         {
-            assert(!cfs.isIndex());
+            assert(!cfs.isIndex());//Shouldn't have row cache entries for indexes
             out.write(cfs.metadata.ksAndCFBytes);
             ByteBufferUtil.writeWithLength(key.key, out);
         }
 
-        public Future<Pair<RowCacheKey, IRowCacheEntry>> deserialize(DataInputStream in, final ColumnFamilyStore cfs) throws IOException
+        public Future<Pair<RowCacheKey, IRowCacheEntry>> deserialize(DataInputPlus in, final ColumnFamilyStore cfs) throws IOException
         {
             //Keyspace and CF name are deserialized by AutoSaving cache and used to fetch the CFS provided as a
             //parameter so they aren't deserialized here, even though they are serialized by this serializer
             final ByteBuffer buffer = ByteBufferUtil.readWithLength(in);
+            final int rowsToCache = cfs.metadata.params.caching.rowsPerPartitionToCache();
             if (cfs == null  || !cfs.isRowCacheEnabled())
                 return null;
-            assert(!cfs.isIndex());
+            assert(!cfs.isIndex());//Shouldn't have row cache entries for indexes
 
             return StageManager.getStage(Stage.READ).submit(new Callable<Pair<RowCacheKey, IRowCacheEntry>>()
             {
                 public Pair<RowCacheKey, IRowCacheEntry> call() throws Exception
                 {
-                    DecoratedKey key = cfs.partitioner.decorateKey(buffer);
-                    QueryFilter cacheFilter = new QueryFilter(key, cfs.getColumnFamilyName(), cfs.readFilterForCache(), Integer.MIN_VALUE);
-                    ColumnFamily data = cfs.getTopLevelColumns(cacheFilter, Integer.MIN_VALUE);
-                    return Pair.create(new RowCacheKey(cfs.metadata.ksAndCFName, key), (IRowCacheEntry) data);
+                    DecoratedKey key = cfs.decorateKey(buffer);
+                    int nowInSec = FBUtilities.nowInSeconds();
+                    try (OpOrder.Group op = cfs.readOrdering.start(); UnfilteredRowIterator iter = SinglePartitionReadCommand.fullPartitionRead(cfs.metadata, nowInSec, key).queryMemtableAndDisk(cfs, op))
+                    {
+                        CachedPartition toCache = CachedBTreePartition.create(DataLimits.cqlLimits(rowsToCache).filter(iter, nowInSec, true), nowInSec);
+                        return Pair.create(new RowCacheKey(cfs.metadata.ksAndCFName, key), (IRowCacheEntry)toCache);
+                    }
                 }
             });
         }
@@ -416,6 +443,10 @@
     {
         public void serialize(KeyCacheKey key, DataOutputPlus out, ColumnFamilyStore cfs) throws IOException
         {
+            //Don't serialize old format entries since we didn't bother to implement serialization of both for simplicity
+            //https://issues.apache.org/jira/browse/CASSANDRA-10778
+            if (!key.desc.version.storeRows()) return;
+
             RowIndexEntry entry = CacheService.instance.keyCache.getInternal(key);
             if (entry == null)
                 return;
@@ -424,10 +455,10 @@
             ByteBufferUtil.writeWithLength(key.key, out);
             out.writeInt(key.desc.generation);
             out.writeBoolean(true);
-            key.desc.getFormat().getIndexSerializer(cfs.metadata).serialize(entry, out);
+            key.desc.getFormat().getIndexSerializer(cfs.metadata, key.desc.version, SerializationHeader.forKeyCache(cfs.metadata)).serialize(entry, out);
         }
 
-        public Future<Pair<KeyCacheKey, RowIndexEntry>> deserialize(DataInputStream input, ColumnFamilyStore cfs) throws IOException
+        public Future<Pair<KeyCacheKey, RowIndexEntry>> deserialize(DataInputPlus input, ColumnFamilyStore cfs) throws IOException
         {
             //Keyspace and CF name are deserialized by AutoSaving cache and used to fetch the CFS provided as a
             //parameter so they aren't deserialized here, even though they are serialized by this serializer
@@ -441,16 +472,23 @@
             int generation = input.readInt();
             input.readBoolean(); // backwards compatibility for "promoted indexes" boolean
             SSTableReader reader = null;
-            if (cfs == null || !cfs.isKeyCacheEnabled() || (reader = findDesc(generation, cfs.getSSTables())) == null)
+            if (cfs == null || !cfs.isKeyCacheEnabled() || (reader = findDesc(generation, cfs.getSSTables(SSTableSet.CANONICAL))) == null)
             {
-                RowIndexEntry.Serializer.skip(input);
+                // The sstable doesn't exist anymore, so we can't be sure of the exact version and assume its the current version. The only case where we'll be
+                // wrong is during upgrade, in which case we fail at deserialization. This is not a huge deal however since 1) this is unlikely enough that
+                // this won't affect many users (if any) and only once, 2) this doesn't prevent the node from starting and 3) CASSANDRA-10219 shows that this
+                // part of the code has been broken for a while without anyone noticing (it is, btw, still broken until CASSANDRA-10219 is fixed).
+                RowIndexEntry.Serializer.skip(input, BigFormat.instance.getLatestVersion());
                 return null;
             }
-            RowIndexEntry entry = reader.descriptor.getFormat().getIndexSerializer(reader.metadata).deserialize(input, reader.descriptor.version);
+            RowIndexEntry.IndexSerializer<?> indexSerializer = reader.descriptor.getFormat().getIndexSerializer(reader.metadata,
+                                                                                                                reader.descriptor.version,
+                                                                                                                SerializationHeader.forKeyCache(cfs.metadata));
+            RowIndexEntry entry = indexSerializer.deserialize(input);
             return Futures.immediateFuture(Pair.create(new KeyCacheKey(cfs.metadata.ksAndCFName, reader.descriptor, key), entry));
         }
 
-        private SSTableReader findDesc(int generation, Collection<SSTableReader> collection)
+        private SSTableReader findDesc(int generation, Iterable<SSTableReader> collection)
         {
             for (SSTableReader sstable : collection)
             {

diff --git a/src/java/org/apache/cassandra/service/CassandraDaemon.java b/src/java/org/apache/cassandra/service/CassandraDaemon.java
index a317ab3..85a002f 100644
--- a/src/java/org/apache/cassandra/service/CassandraDaemon.java
+++ b/src/java/org/apache/cassandra/service/CassandraDaemon.java

@@ -35,7 +35,6 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
-import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 
 import javax.management.ObjectName;
@@ -57,6 +56,7 @@
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.Uninterruptibles;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -65,15 +65,20 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.batchlog.LegacyBatchlogMigrator;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.StartupException;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.hints.LegacyHintsMigrator;
 import org.apache.cassandra.io.FSError;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.metrics.CassandraMetricsRegistry;
 import org.apache.cassandra.metrics.DefaultNameFactory;
 import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.schema.LegacySchemaMigrator;
+import org.apache.cassandra.cql3.functions.ThreadAwareSecurityManager;
 import org.apache.cassandra.thrift.ThriftServer;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.*;
@@ -150,7 +155,7 @@
     private static final CassandraDaemon instance = new CassandraDaemon();
 
     public Server thriftServer;
-    public Server nativeServer;
+    private NativeTransportService nativeTransportService;
 
     private final boolean runManaged;
     protected final StartupChecks startupChecks;
@@ -179,9 +184,15 @@
         if (FBUtilities.isWindows())
             WindowsFailedSnapshotTracker.deleteOldSnapshots();
 
+        maybeInitJmx();
+
+        Mx4jTool.maybeLoad();
+
+        ThreadAwareSecurityManager.install();
+
         logSystemInfo();
 
-        CLibrary.tryMlockall();
+        NativeLibrary.tryMlockall();
 
         try
         {
@@ -194,7 +205,10 @@
 
         try
         {
-            SystemKeyspace.snapshotOnVersionChange();
+            if (SystemKeyspace.snapshotOnVersionChange())
+            {
+                SystemKeyspace.migrateDataDirs();
+            }
         }
         catch (IOException e)
         {
@@ -205,8 +219,6 @@
         // This should be the first write to SystemKeyspace (CASSANDRA-11742)
         SystemKeyspace.persistLocalMetadata();
 
-        maybeInitJmx();
-
         Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler()
         {
             public void uncaughtException(Thread t, Throwable e)
@@ -235,20 +247,19 @@
             }
         });
 
+        /*
+         * Migrate pre-3.0 keyspaces, tables, types, functions, and aggregates, to their new 3.0 storage.
+         * We don't (and can't) wait for commit log replay here, but we don't need to - all schema changes force
+         * explicit memtable flushes.
+         */
+        LegacySchemaMigrator.migrate();
+
+        // Populate token metadata before flushing, for token-aware sstable partitioning (#6696)
+        StorageService.instance.populateTokenMetadata();
+
         // load schema from disk
         Schema.instance.loadFromDisk();
 
-        // clean up compaction leftovers
-        Map<Pair<String, String>, Map<Integer, UUID>> unfinishedCompactions = SystemKeyspace.getUnfinishedCompactions();
-        for (Pair<String, String> kscf : unfinishedCompactions.keySet())
-        {
-            CFMetaData cfm = Schema.instance.getCFMetaData(kscf.left, kscf.right);
-            // CFMetaData can be null if CF is already dropped
-            if (cfm != null)
-                ColumnFamilyStore.removeUnfinishedCompactionLeftovers(cfm, unfinishedCompactions.get(kscf));
-        }
-        SystemKeyspace.discardCompactionsInProgress();
-
         // clean up debris in the rest of the keyspaces
         for (String keyspaceName : Schema.instance.getKeyspaces())
         {
@@ -256,7 +267,7 @@
             if (keyspaceName.equals(SystemKeyspace.NAME))
                 continue;
 
-            for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(keyspaceName).values())
+            for (CFMetaData cfm : Schema.instance.getTablesAndViews(keyspaceName))
                 ColumnFamilyStore.scrubDataDirectories(cfm);
         }
 
@@ -308,6 +319,15 @@
             throw new RuntimeException(e);
         }
 
+        // Re-populate token metadata after commit log recover (new peers might be loaded onto system keyspace #10293)
+        StorageService.instance.populateTokenMetadata();
+
+        // migrate any legacy (pre-3.0) hints from system.hints table into the new store
+        new LegacyHintsMigrator(DatabaseDescriptor.getHintsDirectory(), DatabaseDescriptor.getMaxHintsFileSize()).migrate();
+
+        // migrate any legacy (pre-3.0) batch entries from system.batchlog to system.batches (new table format)
+        LegacyBatchlogMigrator.migrate();
+
         // enable auto compaction
         for (Keyspace keyspace : Keyspace.all())
         {
@@ -315,14 +335,38 @@
             {
                 for (final ColumnFamilyStore store : cfs.concatWithIndexes())
                 {
-                    if (store.getCompactionStrategy().shouldBeEnabled())
+                    if (store.getCompactionStrategyManager().shouldBeEnabled())
                         store.enableAutoCompaction();
                 }
             }
         }
 
+        Runnable viewRebuild = new Runnable()
+        {
+            @Override
+            public void run()
+            {
+                for (Keyspace keyspace : Keyspace.all())
+                {
+                    keyspace.viewManager.buildAllViews();
+                }
+                logger.debug("Completed submission of build tasks for any materialized views defined at startup");
+            }
+        };
+
+        ScheduledExecutors.optionalTasks.schedule(viewRebuild, StorageService.RING_DELAY, TimeUnit.MILLISECONDS);
+
         SystemKeyspace.finishStartup();
 
+        // Clean up system.size_estimates entries left lying around from missed keyspace drops (CASSANDRA-14905)
+        StorageService.instance.cleanupSizeEstimates();
+
+        // schedule periodic dumps of table size estimates into SystemKeyspace.SIZE_ESTIMATES_CF
+        // set cassandra.size_recorder_interval to 0 to disable
+        int sizeRecorderInterval = Integer.getInteger("cassandra.size_recorder_interval", 5 * 60);
+        if (sizeRecorderInterval > 0)
+            ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(SizeEstimatesRecorder.instance, 30, sizeRecorderInterval, TimeUnit.SECONDS);
+
         // start server internals
         StorageService.instance.registerDaemon(this);
         try
@@ -335,8 +379,6 @@
             exitOrFail(1, "Fatal configuration error", e);
         }
 
-        Mx4jTool.maybeLoad();
-
         // Metrics
         String metricsReporterConfigFile = System.getProperty("cassandra.metricsReporterConfigFile");
         if (metricsReporterConfigFile != null)
@@ -366,12 +408,6 @@
         // due to scheduling errors or race conditions
         ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(ColumnFamilyStore.getBackgroundCompactionTaskSubmitter(), 5, 1, TimeUnit.MINUTES);
 
-        // schedule periodic dumps of table size estimates into SystemKeyspace.SIZE_ESTIMATES_CF
-        // set cassandra.size_recorder_interval to 0 to disable
-        int sizeRecorderInterval = Integer.getInteger("cassandra.size_recorder_interval", 5 * 60);
-        if (sizeRecorderInterval > 0)
-            ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(SizeEstimatesRecorder.instance, 30, sizeRecorderInterval, TimeUnit.SECONDS);
-
         // Thrift
         InetAddress rpcAddr = DatabaseDescriptor.getRpcAddress();
         int rpcPort = DatabaseDescriptor.getRpcPort();
@@ -385,45 +421,8 @@
     public void initializeNativeTransport()
     {
         // Native transport
-        InetAddress nativeAddr = DatabaseDescriptor.getRpcAddress();
-        int nativePort = DatabaseDescriptor.getNativeTransportPort();
-        nativeServer = new org.apache.cassandra.transport.Server(nativeAddr, nativePort);
-    }
-
-    public void startNativeTransport()
-    {
-        validateTransportsCanStart();
-
-        if (nativeServer == null)
-            throw new IllegalStateException("native transport should be set up before it can be started");
-
-        nativeServer.start();
-    }
-
-    private void validateTransportsCanStart()
-    {
-        // We only start transports if bootstrap has completed and we're not in survey mode, OR if we are in
-        // survey mode and streaming has completed but we're not using auth.
-        // OR if we have not joined the ring yet.
-        if (StorageService.instance.hasJoined())
-        {
-            if (StorageService.instance.isSurveyMode())
-            {
-                if (StorageService.instance.isBootstrapMode() || DatabaseDescriptor.getAuthenticator().requireAuthentication())
-                {
-                    throw new IllegalStateException("Not starting client transports in write_survey mode as it's bootstrapping or " +
-                                                    "auth is enabled");
-                }
-            }
-            else
-            {
-                if (!SystemKeyspace.bootstrapComplete())
-                {
-                    throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap" +
-                                                    " state and resume. For more, see `nodetool help bootstrap`");
-                }
-            }
-        }
+        if (nativeTransportService == null)
+            nativeTransportService = new NativeTransportService();
     }
 
     /*
@@ -514,7 +513,8 @@
         String nativeFlag = System.getProperty("cassandra.start_native_transport");
         if ((nativeFlag != null && Boolean.parseBoolean(nativeFlag)) || (nativeFlag == null && DatabaseDescriptor.startNativeTransport()))
         {
-            nativeServer.start();
+            startNativeTransport();
+            StorageService.instance.setRpcReady(true);
         }
         else
             logger.info("Not starting native transport as requested. Use JMX (StorageService->startNativeTransport()) or nodetool (enablebinary) to start it");
@@ -536,8 +536,11 @@
         // On linux, this doesn't entirely shut down Cassandra, just the RPC server.
         // jsvc takes care of taking the rest down
         logger.info("Cassandra shutting down...");
-        thriftServer.stop();
-        nativeServer.stop();
+        if (thriftServer != null)
+            thriftServer.stop();
+        if (nativeTransportService != null)
+            nativeTransportService.destroy();
+        StorageService.instance.setRpcReady(false);
 
         // On windows, we need to stop the entire system as prunsrv doesn't have the jsvc hooks
         // We rely on the shutdown hook to drain the node
@@ -560,12 +563,10 @@
     @VisibleForTesting
     public void destroyNativeTransport() throws InterruptedException
     {
-        // In 2.2, just stopping the server works. Future versions require `destroy` to be called
-        // so we maintain the name for consistency
-        if (nativeServer != null)
+        if (nativeTransportService != null)
         {
-            nativeServer.stopAndAwaitTermination();
-            nativeServer = null;
+            nativeTransportService.destroy();
+            nativeTransportService = null;
         }
     }
 
@@ -647,6 +648,64 @@
         }
     }
 
+    public void validateTransportsCanStart()
+    {
+        // We only start transports if bootstrap has completed and we're not in survey mode, OR if we are in
+        // survey mode and streaming has completed but we're not using auth.
+        // OR if we have not joined the ring yet.
+        if (StorageService.instance.hasJoined())
+        {
+            if (StorageService.instance.isSurveyMode())
+            {
+                if (StorageService.instance.isBootstrapMode() || DatabaseDescriptor.getAuthenticator().requireAuthentication())
+                {
+                    throw new IllegalStateException("Not starting client transports in write_survey mode as it's bootstrapping or " +
+                                                    "auth is enabled");
+                }
+            }
+            else
+            {
+                if (!SystemKeyspace.bootstrapComplete())
+                {
+                    throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap" +
+                                                    " state and resume. For more, see `nodetool help bootstrap`");
+                }
+            }
+        }
+    }
+
+    public void startNativeTransport()
+    {
+        validateTransportsCanStart();
+
+        if (nativeTransportService == null)
+            throw new IllegalStateException("setup() must be called first for CassandraDaemon");
+        else
+            nativeTransportService.start();
+    }
+
+    public void stopNativeTransport()
+    {
+        if (nativeTransportService != null)
+            nativeTransportService.stop();
+    }
+
+    public boolean isNativeTransportRunning()
+    {
+        return nativeTransportService != null ? nativeTransportService.isRunning() : false;
+    }
+
+    public int getMaxNativeProtocolVersion()
+    {
+        return nativeTransportService.getMaxProtocolVersion();
+    }
+
+    public void refreshMaxNativeProtocolVersion()
+    {
+        if (nativeTransportService != null)
+            nativeTransportService.refreshMaxNegotiableProtocolVersion();
+    }
+
     /**
      * A convenience method to stop and destroy the daemon in one shot.
      */
@@ -676,28 +735,27 @@
         Uninterruptibles.sleepUninterruptibly(GOSSIP_SETTLE_MIN_WAIT_MS, TimeUnit.MILLISECONDS);
         int totalPolls = 0;
         int numOkay = 0;
-        JMXEnabledThreadPoolExecutor gossipStage = (JMXEnabledThreadPoolExecutor)StageManager.getStage(Stage.GOSSIP);
+        int epSize = Gossiper.instance.getEndpointStates().size();
         while (numOkay < GOSSIP_SETTLE_POLL_SUCCESSES_REQUIRED)
         {
             Uninterruptibles.sleepUninterruptibly(GOSSIP_SETTLE_POLL_INTERVAL_MS, TimeUnit.MILLISECONDS);
-            long completed = gossipStage.metrics.completedTasks.getValue();
-            long active = gossipStage.metrics.activeTasks.getValue();
-            long pending = gossipStage.metrics.pendingTasks.getValue();
+            int currentSize = Gossiper.instance.getEndpointStates().size();
             totalPolls++;
-            if (active == 0 && pending == 0)
+            if (currentSize == epSize)
             {
-                logger.debug("Gossip looks settled. CompletedTasks: {}", completed);
+                logger.debug("Gossip looks settled.");
                 numOkay++;
             }
             else
             {
-                logger.info("Gossip not settled after {} polls. Gossip Stage active/pending/completed: {}/{}/{}", totalPolls, active, pending, completed);
+                logger.info("Gossip not settled after {} polls.", totalPolls);
                 numOkay = 0;
             }
+            epSize = currentSize;
             if (forceAfter > 0 && totalPolls > forceAfter)
             {
-                logger.warn("Gossip not settled but startup forced by cassandra.skip_wait_for_gossip_to_settle. Gossip Stage total/active/pending/completed: {}/{}/{}/{}",
-                            totalPolls, active, pending, completed);
+                logger.warn("Gossip not settled but startup forced by cassandra.skip_wait_for_gossip_to_settle. Gossip total polls: {}",
+                            totalPolls);
                 break;
             }
         }
@@ -707,7 +765,7 @@
             logger.info("No gossip backlog; proceeding");
     }
 
-    public static void stop(String[] args) throws InterruptedException
+    public static void stop(String[] args)
     {
         instance.deactivate();
     }
@@ -737,12 +795,12 @@
     {
         public boolean isAvailable()
         {
-            return CLibrary.jnaAvailable();
+            return NativeLibrary.isAvailable();
         }
 
         public boolean isMemoryLockable()
         {
-            return CLibrary.jnaMemoryLockable();
+            return NativeLibrary.jnaMemoryLockable();
         }
     }
 
@@ -762,9 +820,6 @@
          */
         public void stop();
 
-        @VisibleForTesting
-        public void stopAndAwaitTermination();
-
         /**
          * Returns whether the server is currently running.
          */

diff --git a/src/java/org/apache/cassandra/service/ClientState.java b/src/java/org/apache/cassandra/service/ClientState.java
index 9593802..bba6ca4 100644
--- a/src/java/org/apache/cassandra/service/ClientState.java
+++ b/src/java/org/apache/cassandra/service/ClientState.java

@@ -24,7 +24,6 @@
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicLong;
 
-import com.google.common.collect.Iterables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -40,9 +39,8 @@
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.tracing.TraceKeyspace;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.CassandraVersion;
@@ -57,15 +55,16 @@
 
     private static final Set<IResource> READABLE_SYSTEM_RESOURCES = new HashSet<>();
     private static final Set<IResource> PROTECTED_AUTH_RESOURCES = new HashSet<>();
-    private static final Set<String> ALTERABLE_SYSTEM_KEYSPACES = new HashSet<>();
-    private static final Set<IResource> DROPPABLE_SYSTEM_TABLES = new HashSet<>();
+    private static final Set<IResource> DROPPABLE_SYSTEM_AUTH_TABLES = new HashSet<>();
     static
     {
         // We want these system cfs to be always readable to authenticated users since many tools rely on them
         // (nodetool, cqlsh, bulkloader, etc.)
-        for (String cf : Iterables.concat(Arrays.asList(SystemKeyspace.LOCAL, SystemKeyspace.PEERS), LegacySchemaTables.ALL))
+        for (String cf : Arrays.asList(SystemKeyspace.LOCAL, SystemKeyspace.PEERS))
             READABLE_SYSTEM_RESOURCES.add(DataResource.table(SystemKeyspace.NAME, cf));
 
+        SchemaKeyspace.ALL.forEach(table -> READABLE_SYSTEM_RESOURCES.add(DataResource.table(SchemaKeyspace.NAME, table)));
+
         if (!Config.isClientMode())
         {
             PROTECTED_AUTH_RESOURCES.addAll(DatabaseDescriptor.getAuthenticator().protectedResources());
@@ -73,20 +72,22 @@
             PROTECTED_AUTH_RESOURCES.addAll(DatabaseDescriptor.getRoleManager().protectedResources());
         }
 
-        // allow users with sufficient privileges to alter KS level options on AUTH_KS and
-        // TRACING_KS, and also to drop legacy tables (users, credentials, permissions) from
-        // AUTH_KS
-        ALTERABLE_SYSTEM_KEYSPACES.add(AuthKeyspace.NAME);
-        ALTERABLE_SYSTEM_KEYSPACES.add(TraceKeyspace.NAME);
-        DROPPABLE_SYSTEM_TABLES.add(DataResource.table(AuthKeyspace.NAME, PasswordAuthenticator.LEGACY_CREDENTIALS_TABLE));
-        DROPPABLE_SYSTEM_TABLES.add(DataResource.table(AuthKeyspace.NAME, CassandraRoleManager.LEGACY_USERS_TABLE));
-        DROPPABLE_SYSTEM_TABLES.add(DataResource.table(AuthKeyspace.NAME, CassandraAuthorizer.USER_PERMISSIONS));
+        // allow users with sufficient privileges to drop legacy tables (users, credentials, permissions) from AUTH_KS
+        DROPPABLE_SYSTEM_AUTH_TABLES.add(DataResource.table(AuthKeyspace.NAME, PasswordAuthenticator.LEGACY_CREDENTIALS_TABLE));
+        DROPPABLE_SYSTEM_AUTH_TABLES.add(DataResource.table(AuthKeyspace.NAME, CassandraRoleManager.LEGACY_USERS_TABLE));
+        DROPPABLE_SYSTEM_AUTH_TABLES.add(DataResource.table(AuthKeyspace.NAME, CassandraAuthorizer.USER_PERMISSIONS));
     }
 
     // Current user for the session
     private volatile AuthenticatedUser user;
     private volatile String keyspace;
 
+    /**
+     * Force Compact Tables to be represented as CQL ones for the current client session (simulates
+     * ALTER .. DROP COMPACT STORAGE but only for this session)
+     */
+    private volatile boolean noCompactMode;
+
     private static final QueryHandler cqlQueryHandler;
     static
     {
@@ -259,6 +260,16 @@
         keyspace = ks;
     }
 
+    public void setNoCompactMode()
+    {
+        this.noCompactMode = true;
+    }
+
+    public boolean isNoCompactMode()
+    {
+        return noCompactMode;
+    }
+
     /**
      * Attempts to login the given user.
      */
@@ -307,15 +318,21 @@
     throws UnauthorizedException, InvalidRequestException
     {
         validateKeyspace(keyspace);
+
         if (isInternal)
             return;
+
         validateLogin();
+
         preventSystemKSSchemaModification(keyspace, resource, perm);
+
         if ((perm == Permission.SELECT) && READABLE_SYSTEM_RESOURCES.contains(resource))
             return;
+
         if (PROTECTED_AUTH_RESOURCES.contains(resource))
             if ((perm == Permission.CREATE) || (perm == Permission.ALTER) || (perm == Permission.DROP))
                 throw new UnauthorizedException(String.format("%s schema is protected", resource));
+
         ensureHasPermission(perm, resource);
     }
 
@@ -363,21 +380,25 @@
 
     private void preventSystemKSSchemaModification(String keyspace, DataResource resource, Permission perm) throws UnauthorizedException
     {
-        // we only care about schema modification.
-        if (!((perm == Permission.ALTER) || (perm == Permission.DROP) || (perm == Permission.CREATE)))
+        // we only care about DDL statements
+        if (perm != Permission.ALTER && perm != Permission.DROP && perm != Permission.CREATE)
             return;
 
-        // prevent system keyspace modification
-        if (SystemKeyspace.NAME.equalsIgnoreCase(keyspace))
+        // prevent ALL local system keyspace modification
+        if (Schema.isLocalSystemKeyspace(keyspace))
             throw new UnauthorizedException(keyspace + " keyspace is not user-modifiable.");
 
-        // allow users with sufficient privileges to alter KS level options on AUTH_KS and
-        // TRACING_KS, and also to drop legacy tables (users, credentials, permissions) from
-        // AUTH_KS
-        if (ALTERABLE_SYSTEM_KEYSPACES.contains(resource.getKeyspace().toLowerCase())
-           && ((perm == Permission.ALTER && !resource.isKeyspaceLevel())
-               || (perm == Permission.DROP && !DROPPABLE_SYSTEM_TABLES.contains(resource))))
+        if (Schema.isReplicatedSystemKeyspace(keyspace))
         {
+            // allow users with sufficient privileges to alter replication params of replicated system keyspaces
+            if (perm == Permission.ALTER && resource.isKeyspaceLevel())
+                return;
+
+            // allow users with sufficient privileges to drop legacy tables in replicated system keyspaces
+            if (perm == Permission.DROP && DROPPABLE_SYSTEM_AUTH_TABLES.contains(resource))
+                return;
+
+            // prevent all other modifications of replicated system keyspaces
             throw new UnauthorizedException(String.format("Cannot %s %s", perm, resource));
         }
     }

diff --git a/src/java/org/apache/cassandra/service/DataResolver.java b/src/java/org/apache/cassandra/service/DataResolver.java
new file mode 100644
index 0000000..02d355e
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/DataResolver.java

@@ -0,0 +1,901 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service;
+
+import java.net.InetAddress;
+import java.util.*;
+import java.util.concurrent.TimeoutException;
+import java.util.function.UnaryOperator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.*;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.ExcludingBounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.exceptions.ReadTimeoutException;
+import org.apache.cassandra.net.*;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class DataResolver extends ResponseResolver
+{
+    private static final boolean DROP_OVERSIZED_READ_REPAIR_MUTATIONS =
+        Boolean.getBoolean("cassandra.drop_oversized_readrepair_mutations");
+
+    @VisibleForTesting
+    final List<AsyncOneResponse> repairResults = Collections.synchronizedList(new ArrayList<>());
+
+    private final boolean enforceStrictLiveness;
+
+    DataResolver(Keyspace keyspace, ReadCommand command, ConsistencyLevel consistency, int maxResponseCount)
+    {
+        super(keyspace, command, consistency, maxResponseCount);
+        this.enforceStrictLiveness = command.metadata().enforceStrictLiveness();
+    }
+
+    public PartitionIterator getData()
+    {
+        ReadResponse response = responses.iterator().next().payload;
+        return UnfilteredPartitionIterators.filter(response.makeIterator(command), command.nowInSec());
+    }
+
+    public boolean isDataPresent()
+    {
+        return !responses.isEmpty();
+    }
+
+    public void compareResponses()
+    {
+        // We need to fully consume the results to trigger read repairs if appropriate
+        try (PartitionIterator iterator = resolve())
+        {
+            PartitionIterators.consume(iterator);
+        }
+    }
+
+    public PartitionIterator resolve()
+    {
+        if (!needsReplicaFilteringProtection())
+        {
+            ResolveContext context = new ResolveContext(responses.size());
+            return resolveWithReadRepair(context,
+                                         i -> shortReadProtectedResponse(i, context),
+                                         UnaryOperator.identity());
+        }
+
+        return resolveWithReplicaFilteringProtection();
+    }
+
+    private boolean needsReplicaFilteringProtection()
+    {
+        return !command.rowFilter().isEmpty();
+    }
+
+    private class ResolveContext
+    {
+        private final InetAddress[] sources;
+        private final DataLimits.Counter mergedResultCounter;
+
+        private ResolveContext(int count)
+        {
+            assert count <= responses.size();
+            this.sources = new InetAddress[count];
+            for (int i = 0; i < count; i++)
+                sources[i] = responses.get(i).from;
+            this.mergedResultCounter = command.limits().newCounter(command.nowInSec(),
+                                                                   true,
+                                                                   command.selectsFullPartition(),
+                                                                   enforceStrictLiveness);
+        }
+
+        private boolean needShortReadProtection()
+        {
+            // If we have only one result, there is no read repair to do and we can't get short reads
+            // Also, so-called "short reads" stems from nodes returning only a subset of the results they have for a
+            // partition due to the limit, but that subset not being enough post-reconciliation. So if we don't have limit,
+            // don't bother protecting against short reads.
+            return sources.length > 1 && !command.limits().isUnlimited();
+        }
+    }
+
+    @FunctionalInterface
+    private interface ResponseProvider
+    {
+        UnfilteredPartitionIterator getResponse(int i);
+    }
+
+    private UnfilteredPartitionIterator shortReadProtectedResponse(int i, ResolveContext context)
+    {
+        UnfilteredPartitionIterator originalResponse = responses.get(i).payload.makeIterator(command);
+
+        return context.needShortReadProtection()
+               ? extendWithShortReadProtection(originalResponse, context.sources[i], context.mergedResultCounter)
+               : originalResponse;
+    }
+
+    private PartitionIterator resolveWithReadRepair(ResolveContext context,
+                                                    ResponseProvider responseProvider,
+                                                    UnaryOperator<PartitionIterator> preCountFilter)
+    {
+        return resolveInternal(context, new RepairMergeListener(context.sources), responseProvider, preCountFilter);
+    }
+
+    private PartitionIterator resolveWithReplicaFilteringProtection()
+    {
+        // Protecting against inconsistent replica filtering (some replica returning a row that is outdated but that
+        // wouldn't be removed by normal reconciliation because up-to-date replica have filtered the up-to-date version
+        // of that row) works in 3 steps:
+        //   1) we read the full response just to collect rows that may be outdated (the ones we got from some
+        //      replica but didn't got any response for other; it could be those other replica have filtered a more
+        //      up-to-date result). In doing so, we do not count any of such "potentially outdated" row towards the
+        //      query limit. This simulate the worst case scenario where all those "potentially outdated" rows are
+        //      indeed outdated, and thus make sure we are guaranteed to read enough results (thanks to short read
+        //      protection).
+        //   2) we query all the replica/rows we need to rule out whether those "potentially outdated" rows are outdated
+        //      or not.
+        //   3) we re-read cached copies of each replica response using the "normal" read path merge with read-repair,
+        //      but where for each replica we use their original response _plus_ the additional rows queried in the
+        //      previous step (and apply the command#rowFilter() on the full result). Since the first phase has
+        //      pessimistically collected enough results for the case where all potentially outdated results are indeed
+        //      outdated, we shouldn't need further short-read protection requests during this phase.
+
+        // We could get more responses while this method runs, which is ok (we're happy to ignore any response not here
+        // at the beginning of this method), so grab the response count once and use that through the method.
+        int count = responses.size();
+        // We need separate contexts, as each context has his own counter
+        ResolveContext firstPhaseContext = new ResolveContext(count);
+        ResolveContext secondPhaseContext = new ResolveContext(count);
+        ReplicaFilteringProtection rfp = new ReplicaFilteringProtection(keyspace, command, consistency, firstPhaseContext.sources);
+        PartitionIterator firstPhasePartitions = resolveInternal(firstPhaseContext,
+                                                                 rfp.mergeController(),
+                                                                 i -> shortReadProtectedResponse(i, firstPhaseContext),
+                                                                 UnaryOperator.identity());
+
+        // Consume the first phase partitions to populate the replica filtering protection with both those materialized
+        // partitions and the primary keys to be fetched.
+        PartitionIterators.consume(firstPhasePartitions);
+        firstPhasePartitions.close();
+
+        // After reading the entire query results the protection helper should have cached all the partitions so we can
+        // clear the responses accumulator for the sake of memory usage, given that the second phase might take long if
+        // it needs to query replicas.
+        responses.clearUnsafe();
+
+        return resolveWithReadRepair(secondPhaseContext,
+                                     rfp::queryProtectedPartitions,
+                                     results -> command.rowFilter().filter(results, command.metadata(), command.nowInSec()));
+    }
+
+    private PartitionIterator resolveInternal(ResolveContext context,
+                                              UnfilteredPartitionIterators.MergeListener mergeListener,
+                                              ResponseProvider responseProvider,
+                                              UnaryOperator<PartitionIterator> preCountFilter)
+    {
+        int count = context.sources.length;
+        List<UnfilteredPartitionIterator> results = new ArrayList<>(count);
+        for (int i = 0; i < count; i++)
+            results.add(responseProvider.getResponse(i));
+
+        /*
+         * Even though every response, individually, will honor the limit, it is possible that we will, after the merge,
+         * have more rows than the client requested. To make sure that we still conform to the original limit,
+         * we apply a top-level post-reconciliation counter to the merged partition iterator.
+         *
+         * Short read protection logic (ShortReadRowsProtection.moreContents()) relies on this counter to be applied
+         * to the current partition to work. For this reason we have to apply the counter transformation before
+         * empty partition discard logic kicks in - for it will eagerly consume the iterator.
+         *
+         * That's why the order here is: 1) merge; 2) filter rows; 3) count; 4) discard empty partitions
+         *
+         * See CASSANDRA-13747 for more details.
+         */
+
+        UnfilteredPartitionIterator merged = UnfilteredPartitionIterators.merge(results, command.nowInSec(), mergeListener);
+        FilteredPartitions filtered =
+        FilteredPartitions.filter(merged, new Filter(command.nowInSec(), command.metadata().enforceStrictLiveness()));
+        PartitionIterator counted = Transformation.apply(preCountFilter.apply(filtered), context.mergedResultCounter);
+
+        return command.isForThrift()
+               ? counted
+               : Transformation.apply(counted, new EmptyPartitionsDiscarder());
+    }
+
+    private class RepairMergeListener implements UnfilteredPartitionIterators.MergeListener
+    {
+        private final InetAddress[] sources;
+
+        private RepairMergeListener(InetAddress[] sources)
+        {
+            this.sources = sources;
+        }
+
+        public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions)
+        {
+            return new MergeListener(partitionKey, columns(versions), isReversed(versions));
+        }
+
+        private PartitionColumns columns(List<UnfilteredRowIterator> versions)
+        {
+            Columns statics = Columns.NONE;
+            Columns regulars = Columns.NONE;
+            for (UnfilteredRowIterator iter : versions)
+            {
+                if (iter == null)
+                    continue;
+
+                PartitionColumns cols = iter.columns();
+                statics = statics.mergeTo(cols.statics);
+                regulars = regulars.mergeTo(cols.regulars);
+            }
+            return new PartitionColumns(statics, regulars);
+        }
+
+        private boolean isReversed(List<UnfilteredRowIterator> versions)
+        {
+            for (UnfilteredRowIterator iter : versions)
+            {
+                if (iter == null)
+                    continue;
+
+                // Everything will be in the same order
+                return iter.isReverseOrder();
+            }
+
+            assert false : "Expected at least one iterator";
+            return false;
+        }
+
+        public void close()
+        {
+            try
+            {
+                FBUtilities.waitOnFutures(repairResults, DatabaseDescriptor.getWriteRpcTimeout());
+            }
+            catch (TimeoutException ex)
+            {
+                // We got all responses, but timed out while repairing
+                int blockFor = consistency.blockFor(keyspace);
+                if (Tracing.isTracing())
+                    Tracing.trace("Timed out while read-repairing after receiving all {} data and digest responses", blockFor);
+                else
+                    logger.debug("Timeout while read-repairing after receiving all {} data and digest responses", blockFor);
+
+                throw new ReadTimeoutException(consistency, blockFor-1, blockFor, true);
+            }
+        }
+
+        private class MergeListener implements UnfilteredRowIterators.MergeListener
+        {
+            private final DecoratedKey partitionKey;
+            private final PartitionColumns columns;
+            private final boolean isReversed;
+            private final PartitionUpdate[] repairs = new PartitionUpdate[sources.length];
+
+            private final Row.Builder[] currentRows = new Row.Builder[sources.length];
+            private final RowDiffListener diffListener;
+
+            // The partition level deletion for the merge row.
+            private DeletionTime partitionLevelDeletion;
+            // When merged has a currently open marker, its time. null otherwise.
+            private DeletionTime mergedDeletionTime;
+            // For each source, the time of the current deletion as known by the source.
+            private final DeletionTime[] sourceDeletionTime = new DeletionTime[sources.length];
+            // For each source, record if there is an open range to send as repair, and from where.
+            private final Slice.Bound[] markerToRepair = new Slice.Bound[sources.length];
+
+            private MergeListener(DecoratedKey partitionKey, PartitionColumns columns, boolean isReversed)
+            {
+                this.partitionKey = partitionKey;
+                this.columns = columns;
+                this.isReversed = isReversed;
+
+                this.diffListener = new RowDiffListener()
+                {
+                    public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original)
+                    {
+                        if (merged != null && !merged.equals(original))
+                            currentRow(i, clustering).addPrimaryKeyLivenessInfo(merged);
+                    }
+
+                    public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original)
+                    {
+                        if (merged != null && !merged.equals(original))
+                            currentRow(i, clustering).addRowDeletion(merged);
+                    }
+
+                    public void onComplexDeletion(int i, Clustering clustering, ColumnDefinition column, DeletionTime merged, DeletionTime original)
+                    {
+                        if (merged != null && !merged.equals(original))
+                            currentRow(i, clustering).addComplexDeletion(column, merged);
+                    }
+
+                    public void onCell(int i, Clustering clustering, Cell merged, Cell original)
+                    {
+                        if (merged != null && !merged.equals(original))
+                            currentRow(i, clustering).addCell(merged);
+                    }
+
+                };
+            }
+
+            private PartitionUpdate update(int i)
+            {
+                if (repairs[i] == null)
+                    repairs[i] = new PartitionUpdate(command.metadata(), partitionKey, columns, 1);
+                return repairs[i];
+            }
+
+            /**
+             * The partition level deletion with with which source {@code i} is currently repaired, or
+             * {@code DeletionTime.LIVE} if the source is not repaired on the partition level deletion (meaning it was
+             * up to date on it). The output* of this method is only valid after the call to
+             * {@link #onMergedPartitionLevelDeletion}.
+             */
+            private DeletionTime partitionLevelRepairDeletion(int i)
+            {
+                return repairs[i] == null ? DeletionTime.LIVE : repairs[i].partitionLevelDeletion();
+            }
+
+            private Row.Builder currentRow(int i, Clustering clustering)
+            {
+                if (currentRows[i] == null)
+                {
+                    currentRows[i] = BTreeRow.sortedBuilder();
+                    currentRows[i].newRow(clustering);
+                }
+                return currentRows[i];
+            }
+
+            public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, DeletionTime[] versions)
+            {
+                this.partitionLevelDeletion = mergedDeletion;
+                for (int i = 0; i < versions.length; i++)
+                {
+                    if (mergedDeletion.supersedes(versions[i]))
+                        update(i).addPartitionDeletion(mergedDeletion);
+                }
+            }
+
+            public Row onMergedRows(Row merged, Row[] versions)
+            {
+                // If a row was shadowed post merged, it must be by a partition level or range tombstone, and we handle
+                // those case directly in their respective methods (in other words, it would be inefficient to send a row
+                // deletion as repair when we know we've already send a partition level or range tombstone that covers it).
+                if (merged.isEmpty())
+                    return merged;
+
+                Rows.diff(diffListener, merged, versions);
+                for (int i = 0; i < currentRows.length; i++)
+                {
+                    if (currentRows[i] != null)
+                        update(i).add(currentRows[i].build());
+                }
+                Arrays.fill(currentRows, null);
+
+                return merged;
+            }
+
+            private DeletionTime currentDeletion()
+            {
+                return mergedDeletionTime == null ? partitionLevelDeletion : mergedDeletionTime;
+            }
+
+            public void onMergedRangeTombstoneMarkers(RangeTombstoneMarker merged, RangeTombstoneMarker[] versions)
+            {
+                try
+                {
+                    // The code for merging range tombstones is a tad complex and we had the assertions there triggered
+                    // unexpectedly in a few occasions (CASSANDRA-13237, CASSANDRA-13719). It's hard to get insights
+                    // when that happen without more context that what the assertion errors give us however, hence the
+                    // catch here that basically gather as much as context as reasonable.
+                    internalOnMergedRangeTombstoneMarkers(merged, versions);
+                }
+                catch (AssertionError e)
+                {
+                    // The following can be pretty verbose, but it's really only triggered if a bug happen, so we'd
+                    // rather get more info to debug than not.
+                    CFMetaData table = command.metadata();
+                    String details = String.format("Error merging RTs on %s.%s: command=%s, reversed=%b, merged=%s, versions=%s, sources={%s}, responses:%n %s",
+                                                   table.ksName, table.cfName,
+                                                   command.toCQLString(),
+                                                   isReversed,
+                                                   merged == null ? "null" : merged.toString(table),
+                                                   '[' + Joiner.on(", ").join(Iterables.transform(Arrays.asList(versions), rt -> rt == null ? "null" : rt.toString(table))) + ']',
+                                                   Arrays.toString(sources),
+                                                   makeResponsesDebugString());
+                    throw new AssertionError(details, e);
+                }
+            }
+
+            private String makeResponsesDebugString()
+            {
+                return Joiner.on(",\n")
+                             .join(Iterables.transform(getMessages(), m -> m.from + " => " + m.payload.toDebugString(command, partitionKey)));
+            }
+
+            private void internalOnMergedRangeTombstoneMarkers(RangeTombstoneMarker merged, RangeTombstoneMarker[] versions)
+            {
+                // The current deletion as of dealing with this marker.
+                DeletionTime currentDeletion = currentDeletion();
+
+                for (int i = 0; i < versions.length; i++)
+                {
+                    RangeTombstoneMarker marker = versions[i];
+
+                    // Update what the source now thinks is the current deletion
+                    if (marker != null)
+                        sourceDeletionTime[i] = marker.isOpen(isReversed) ? marker.openDeletionTime(isReversed) : null;
+
+                    // If merged == null, some of the source is opening or closing a marker
+                    if (merged == null)
+                    {
+                        // but if it's not this source, move to the next one
+                        if (marker == null)
+                            continue;
+
+                        // We have a close and/or open marker for a source, with nothing corresponding in merged.
+                        // Because merged is a superset, this imply that we have a current deletion (being it due to an
+                        // early opening in merged or a partition level deletion) and that this deletion will still be
+                        // active after that point. Further whatever deletion was open or is open by this marker on the
+                        // source, that deletion cannot supersedes the current one.
+                        //
+                        // But while the marker deletion (before and/or after this point) cannot supersede the current
+                        // deletion, we want to know if it's equal to it (both before and after), because in that case
+                        // the source is up to date and we don't want to include repair.
+                        //
+                        // So in practice we have 2 possible case:
+                        //  1) the source was up-to-date on deletion up to that point: then it won't be from that point
+                        //     on unless it's a boundary and the new opened deletion time is also equal to the current
+                        //     deletion (note that this implies the boundary has the same closing and opening deletion
+                        //     time, which should generally not happen, but can due to legacy reading code not avoiding
+                        //     this for a while, see CASSANDRA-13237).
+                        //  2) the source wasn't up-to-date on deletion up to that point and it may now be (if it isn't
+                        //     we just have nothing to do for that marker).
+                        assert !currentDeletion.isLive() : currentDeletion.toString();
+
+                        // Is the source up to date on deletion? It's up to date if it doesn't have an open RT repair
+                        // nor an "active" partition level deletion (where "active" means that it's greater or equal
+                        // to the current deletion: if the source has a repaired partition deletion lower than the
+                        // current deletion, this means the current deletion is due to a previously open range tombstone,
+                        // and if the source isn't currently repaired for that RT, then it means it's up to date on it).
+                        DeletionTime partitionRepairDeletion = partitionLevelRepairDeletion(i);
+                        if (markerToRepair[i] == null && currentDeletion.supersedes(partitionRepairDeletion))
+                        {
+                            /*
+                             * Since there is an ongoing merged deletion, the only way we don't have an open repair for
+                             * this source is that it had a range open with the same deletion as current marker,
+                             * and the marker is closing it.
+                             */
+                            assert marker.isClose(isReversed) && currentDeletion.equals(marker.closeDeletionTime(isReversed))
+                                 : String.format("currentDeletion=%s, marker=%s", currentDeletion, marker.toString(command.metadata()));
+
+                            // and so unless it's a boundary whose opening deletion time is still equal to the current
+                            // deletion (see comment above for why this can actually happen), we have to repair the source
+                            // from that point on.
+                            if (!(marker.isOpen(isReversed) && currentDeletion.equals(marker.openDeletionTime(isReversed))))
+                                markerToRepair[i] = marker.closeBound(isReversed).invert();
+                        }
+                        // In case 2) above, we only have something to do if the source is up-to-date after that point
+                        // (which, since the source isn't up-to-date before that point, means we're opening a new deletion
+                        // that is equal to the current one).
+                        else if (marker.isOpen(isReversed) && currentDeletion.equals(marker.openDeletionTime(isReversed)))
+                        {
+                            closeOpenMarker(i, marker.openBound(isReversed).invert());
+                        }
+                    }
+                    else
+                    {
+                        // We have a change of current deletion in merged (potentially to/from no deletion at all).
+
+                        if (merged.isClose(isReversed))
+                        {
+                            // We're closing the merged range. If we've marked the source as needing to be repaired for
+                            // that range, close and add it to the repair to be sent.
+                            if (markerToRepair[i] != null)
+                                closeOpenMarker(i, merged.closeBound(isReversed));
+
+                        }
+
+                        if (merged.isOpen(isReversed))
+                        {
+                            // If we're opening a new merged range (or just switching deletion), then unless the source
+                            // is up to date on that deletion (note that we've updated what the source deleteion is
+                            // above), we'll have to sent the range to the source.
+                            DeletionTime newDeletion = merged.openDeletionTime(isReversed);
+                            DeletionTime sourceDeletion = sourceDeletionTime[i];
+                            if (!newDeletion.equals(sourceDeletion))
+                                markerToRepair[i] = merged.openBound(isReversed);
+                        }
+                    }
+                }
+
+                if (merged != null)
+                    mergedDeletionTime = merged.isOpen(isReversed) ? merged.openDeletionTime(isReversed) : null;
+            }
+
+            private void closeOpenMarker(int i, Slice.Bound close)
+            {
+                Slice.Bound open = markerToRepair[i];
+                update(i).add(new RangeTombstone(Slice.make(isReversed ? close : open, isReversed ? open : close), currentDeletion()));
+                markerToRepair[i] = null;
+            }
+
+            public void close()
+            {
+                for (int i = 0; i < repairs.length; i++)
+                    if (null != repairs[i])
+                        sendRepairMutation(repairs[i], sources[i]);
+            }
+
+            private void sendRepairMutation(PartitionUpdate partition, InetAddress destination)
+            {
+                Mutation mutation = new Mutation(partition);
+                int messagingVersion = MessagingService.instance().getVersion(destination);
+
+                int    mutationSize = (int) Mutation.serializer.serializedSize(mutation, messagingVersion);
+                int maxMutationSize = DatabaseDescriptor.getMaxMutationSize();
+
+                if (mutationSize <= maxMutationSize)
+                {
+                    Tracing.trace("Sending read-repair-mutation to {}", destination);
+                    // use a separate verb here to avoid writing hints on timeouts
+                    MessageOut<Mutation> message = mutation.createMessage(MessagingService.Verb.READ_REPAIR);
+                    repairResults.add(MessagingService.instance().sendRR(message, destination));
+                    ColumnFamilyStore.metricsFor(command.metadata().cfId).readRepairRequests.mark();
+                }
+                else if (DROP_OVERSIZED_READ_REPAIR_MUTATIONS)
+                {
+                    logger.debug("Encountered an oversized ({}/{}) read repair mutation for table {}.{}, key {}, node {}",
+                                 mutationSize,
+                                 maxMutationSize,
+                                 command.metadata().ksName,
+                                 command.metadata().cfName,
+                                 command.metadata().getKeyValidator().getString(partitionKey.getKey()),
+                                 destination);
+                }
+                else
+                {
+                    logger.warn("Encountered an oversized ({}/{}) read repair mutation for table {}.{}, key {}, node {}",
+                                mutationSize,
+                                maxMutationSize,
+                                command.metadata().ksName,
+                                command.metadata().cfName,
+                                command.metadata().getKeyValidator().getString(partitionKey.getKey()),
+                                destination);
+
+                    int blockFor = consistency.blockFor(keyspace);
+                    Tracing.trace("Timed out while read-repairing after receiving all {} data and digest responses", blockFor);
+                    throw new ReadTimeoutException(consistency, blockFor - 1, blockFor, true);
+                }
+            }
+        }
+    }
+
+    private UnfilteredPartitionIterator extendWithShortReadProtection(UnfilteredPartitionIterator partitions,
+                                                                      InetAddress source,
+                                                                      DataLimits.Counter mergedResultCounter)
+    {
+        DataLimits.Counter singleResultCounter =
+            command.limits().newCounter(command.nowInSec(), false, command.selectsFullPartition(), enforceStrictLiveness).onlyCount();
+
+        ShortReadPartitionsProtection protection =
+            new ShortReadPartitionsProtection(source, singleResultCounter, mergedResultCounter);
+
+        /*
+         * The order of extention and transformations is important here. Extending with more partitions has to happen
+         * first due to the way BaseIterator.hasMoreContents() works: only transformations applied after extension will
+         * be called on the first partition of the extended iterator.
+         *
+         * Additionally, we want singleResultCounter to be applied after SRPP, so that its applyToPartition() method will
+         * be called last, after the extension done by SRRP.applyToPartition() call. That way we preserve the same order
+         * when it comes to calling SRRP.moreContents() and applyToRow() callbacks.
+         *
+         * See ShortReadPartitionsProtection.applyToPartition() for more details.
+         */
+
+        // extend with moreContents() only if it's a range read command with no partition key specified
+        if (!command.isLimitedToOnePartition())
+            partitions = MorePartitions.extend(partitions, protection);     // register SRPP.moreContents()
+
+        partitions = Transformation.apply(partitions, protection);          // register SRPP.applyToPartition()
+        partitions = Transformation.apply(partitions, singleResultCounter); // register the per-source counter
+
+        return partitions;
+    }
+
+    /*
+     * We have a potential short read if the result from a given node contains the requested number of rows
+     * (i.e. it has stopped returning results due to the limit), but some of them haven't
+     * made it into the final post-reconciliation result due to other nodes' row, range, and/or partition tombstones.
+     *
+     * If that is the case, then that node may have more rows that we should fetch, as otherwise we could
+     * ultimately return fewer rows than required. Also, those additional rows may contain tombstones which
+     * which we also need to fetch as they may shadow rows or partitions from other replicas' results, which we would
+     * otherwise return incorrectly.
+     */
+    private class ShortReadPartitionsProtection extends Transformation<UnfilteredRowIterator> implements MorePartitions<UnfilteredPartitionIterator>
+    {
+        private final InetAddress source;
+
+        private final DataLimits.Counter singleResultCounter; // unmerged per-source counter
+        private final DataLimits.Counter mergedResultCounter; // merged end-result counter
+
+        private DecoratedKey lastPartitionKey; // key of the last observed partition
+
+        private boolean partitionsFetched; // whether we've seen any new partitions since iteration start or last moreContents() call
+
+        private ShortReadPartitionsProtection(InetAddress source, DataLimits.Counter singleResultCounter, DataLimits.Counter mergedResultCounter)
+        {
+            this.source = source;
+            this.singleResultCounter = singleResultCounter;
+            this.mergedResultCounter = mergedResultCounter;
+        }
+
+        @Override
+        public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
+        {
+            partitionsFetched = true;
+
+            lastPartitionKey = partition.partitionKey();
+
+            /*
+             * Extend for moreContents() then apply protection to track lastClustering by applyToRow().
+             *
+             * If we don't apply the transformation *after* extending the partition with MoreRows,
+             * applyToRow() method of protection will not be called on the first row of the new extension iterator.
+             */
+            ShortReadRowsProtection protection = new ShortReadRowsProtection(partition.metadata(), partition.partitionKey());
+            return Transformation.apply(MoreRows.extend(partition, protection), protection);
+        }
+
+        /*
+         * We only get here once all the rows and partitions in this iterator have been iterated over, and so
+         * if the node had returned the requested number of rows but we still get here, then some results were
+         * skipped during reconciliation.
+         */
+        public UnfilteredPartitionIterator moreContents()
+        {
+            // never try to request additional partitions from replicas if our reconciled partitions are already filled to the limit
+            assert !mergedResultCounter.isDone();
+
+            // we do not apply short read protection when we have no limits at all
+            assert !command.limits().isUnlimited();
+
+            /*
+             * If this is a single partition read command or an (indexed) partition range read command with
+             * a partition key specified, then we can't and shouldn't try fetch more partitions.
+             */
+            assert !command.isLimitedToOnePartition();
+
+            /*
+             * If the returned result doesn't have enough rows/partitions to satisfy even the original limit, don't ask for more.
+             *
+             * Can only take the short cut if there is no per partition limit set. Otherwise it's possible to hit false
+             * positives due to some rows being uncounted for in certain scenarios (see CASSANDRA-13911).
+             */
+            if (!singleResultCounter.isDone() && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
+                return null;
+
+            /*
+             * Either we had an empty iterator as the initial response, or our moreContents() call got us an empty iterator.
+             * There is no point to ask the replica for more rows - it has no more in the requested range.
+             */
+            if (!partitionsFetched)
+                return null;
+            partitionsFetched = false;
+
+            /*
+             * We are going to fetch one partition at a time for thrift and potentially more for CQL.
+             * The row limit will either be set to the per partition limit - if the command has no total row limit set, or
+             * the total # of rows remaining - if it has some. If we don't grab enough rows in some of the partitions,
+             * then future ShortReadRowsProtection.moreContents() calls will fetch the missing ones.
+             */
+            int toQuery = command.limits().count() != DataLimits.NO_LIMIT
+                        ? command.limits().count() - mergedResultCounter.counted()
+                        : command.limits().perPartitionCount();
+
+            ColumnFamilyStore.metricsFor(command.metadata().cfId).shortReadProtectionRequests.mark();
+            Tracing.trace("Requesting {} extra rows from {} for short read protection", toQuery, source);
+
+            PartitionRangeReadCommand cmd = makeFetchAdditionalPartitionReadCommand(toQuery);
+            return executeReadCommand(cmd);
+        }
+
+        private PartitionRangeReadCommand makeFetchAdditionalPartitionReadCommand(int toQuery)
+        {
+            PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
+
+            DataLimits newLimits = cmd.limits().forShortReadRetry(toQuery);
+
+            AbstractBounds<PartitionPosition> bounds = cmd.dataRange().keyRange();
+            AbstractBounds<PartitionPosition> newBounds = bounds.inclusiveRight()
+                                                        ? new Range<>(lastPartitionKey, bounds.right)
+                                                        : new ExcludingBounds<>(lastPartitionKey, bounds.right);
+            DataRange newDataRange = cmd.dataRange().forSubRange(newBounds);
+
+            return cmd.withUpdatedLimitsAndDataRange(newLimits, newDataRange);
+        }
+
+        private class ShortReadRowsProtection extends Transformation implements MoreRows<UnfilteredRowIterator>
+        {
+            private final CFMetaData metadata;
+            private final DecoratedKey partitionKey;
+
+            private Clustering lastClustering; // clustering of the last observed row
+
+            private int lastCounted = 0; // last seen recorded # before attempting to fetch more rows
+            private int lastFetched = 0; // # rows returned by last attempt to get more (or by the original read command)
+            private int lastQueried = 0; // # extra rows requested from the replica last time
+
+            private ShortReadRowsProtection(CFMetaData metadata, DecoratedKey partitionKey)
+            {
+                this.metadata = metadata;
+                this.partitionKey = partitionKey;
+            }
+
+            @Override
+            public Row applyToRow(Row row)
+            {
+                lastClustering = row.clustering();
+                return row;
+            }
+
+            /*
+             * We only get here once all the rows in this iterator have been iterated over, and so if the node
+             * had returned the requested number of rows but we still get here, then some results were skipped
+             * during reconciliation.
+             */
+            public UnfilteredRowIterator moreContents()
+            {
+                // never try to request additional rows from replicas if our reconciled partition is already filled to the limit
+                assert !mergedResultCounter.isDoneForPartition();
+
+                // we do not apply short read protection when we have no limits at all
+                assert !command.limits().isUnlimited();
+
+                /*
+                 * If the returned partition doesn't have enough rows to satisfy even the original limit, don't ask for more.
+                 *
+                 * Can only take the short cut if there is no per partition limit set. Otherwise it's possible to hit false
+                 * positives due to some rows being uncounted for in certain scenarios (see CASSANDRA-13911).
+                 */
+                if (!singleResultCounter.isDoneForPartition() && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
+                    return null;
+
+                /*
+                 * If the replica has no live rows in the partition, don't try to fetch more.
+                 *
+                 * Note that the previous branch [if (!singleResultCounter.isDoneForPartition()) return null] doesn't
+                 * always cover this scenario:
+                 * isDoneForPartition() is defined as [isDone() || rowInCurrentPartition >= perPartitionLimit],
+                 * and will return true if isDone() returns true, even if there are 0 rows counted in the current partition.
+                 *
+                 * This can happen with a range read if after 1+ rounds of short read protection requests we managed to fetch
+                 * enough extra rows for other partitions to satisfy the singleResultCounter's total row limit, but only
+                 * have tombstones in the current partition.
+                 *
+                 * One other way we can hit this condition is when the partition only has a live static row and no regular
+                 * rows. In that scenario the counter will remain at 0 until the partition is closed - which happens after
+                 * the moreContents() call.
+                 */
+                if (singleResultCounter.countedInCurrentPartition() == 0)
+                    return null;
+
+                /*
+                 * This is a table with no clustering columns, and has at most one row per partition - with EMPTY clustering.
+                 * We already have the row, so there is no point in asking for more from the partition.
+                 */
+                if (Clustering.EMPTY == lastClustering)
+                    return null;
+
+                lastFetched = singleResultCounter.countedInCurrentPartition() - lastCounted;
+                lastCounted = singleResultCounter.countedInCurrentPartition();
+
+                // getting back fewer rows than we asked for means the partition on the replica has been fully consumed
+                if (lastQueried > 0 && lastFetched < lastQueried)
+                    return null;
+
+                /*
+                 * At this point we know that:
+                 *     1. the replica returned [repeatedly?] as many rows as we asked for and potentially has more
+                 *        rows in the partition
+                 *     2. at least one of those returned rows was shadowed by a tombstone returned from another
+                 *        replica
+                 *     3. we haven't satisfied the client's limits yet, and should attempt to query for more rows to
+                 *        avoid a short read
+                 *
+                 * In the ideal scenario, we would get exactly min(a, b) or fewer rows from the next request, where a and b
+                 * are defined as follows:
+                 *     [a] limits.count() - mergedResultCounter.counted()
+                 *     [b] limits.perPartitionCount() - mergedResultCounter.countedInCurrentPartition()
+                 *
+                 * It would be naive to query for exactly that many rows, as it's possible and not unlikely
+                 * that some of the returned rows would also be shadowed by tombstones from other hosts.
+                 *
+                 * Note: we don't know, nor do we care, how many rows from the replica made it into the reconciled result;
+                 * we can only tell how many in total we queried for, and that [0, mrc.countedInCurrentPartition()) made it.
+                 *
+                 * In general, our goal should be to minimise the number of extra requests - *not* to minimise the number
+                 * of rows fetched: there is a high transactional cost for every individual request, but a relatively low
+                 * marginal cost for each extra row requested.
+                 *
+                 * As such it's better to overfetch than to underfetch extra rows from a host; but at the same
+                 * time we want to respect paging limits and not blow up spectacularly.
+                 *
+                 * Note: it's ok to retrieve more rows that necessary since singleResultCounter is not stopping and only
+                 * counts.
+                 *
+                 * With that in mind, we'll just request the minimum of (count(), perPartitionCount()) limits.
+                 *
+                 * See CASSANDRA-13794 for more details.
+                 */
+                lastQueried = Math.min(command.limits().count(), command.limits().perPartitionCount());
+
+                ColumnFamilyStore.metricsFor(metadata.cfId).shortReadProtectionRequests.mark();
+                Tracing.trace("Requesting {} extra rows from {} for short read protection", lastQueried, source);
+
+                SinglePartitionReadCommand cmd = makeFetchAdditionalRowsReadCommand(lastQueried);
+                return UnfilteredPartitionIterators.getOnlyElement(executeReadCommand(cmd), cmd);
+            }
+
+            private SinglePartitionReadCommand makeFetchAdditionalRowsReadCommand(int toQuery)
+            {
+                ClusteringIndexFilter filter = command.clusteringIndexFilter(partitionKey);
+                if (null != lastClustering)
+                    filter = filter.forPaging(metadata.comparator, lastClustering, false);
+
+                return SinglePartitionReadCommand.create(command.isForThrift(),
+                                                         command.metadata(),
+                                                         command.nowInSec(),
+                                                         command.columnFilter(),
+                                                         command.rowFilter(),
+                                                         command.limits().forShortReadRetry(toQuery),
+                                                         partitionKey,
+                                                         filter,
+                                                         command.indexMetadata());
+            }
+        }
+
+        private UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd)
+        {
+            DataResolver resolver = new DataResolver(keyspace, cmd, ConsistencyLevel.ONE, 1);
+            ReadCallback handler = new ReadCallback(resolver, ConsistencyLevel.ONE, cmd, Collections.singletonList(source));
+
+            if (StorageProxy.canDoLocalRequest(source))
+                StageManager.getStage(Stage.READ).maybeExecuteImmediately(new StorageProxy.LocalReadRunnable(cmd, handler));
+            else
+                MessagingService.instance().sendRRWithFailure(cmd.createMessage(MessagingService.current_version), source, handler);
+
+            // We don't call handler.get() because we want to preserve tombstones since we're still in the middle of merging node results.
+            handler.awaitResults();
+            assert resolver.responses.size() == 1;
+            return resolver.responses.get(0).payload.makeIterator(command);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/service/DigestResolver.java b/src/java/org/apache/cassandra/service/DigestResolver.java
new file mode 100644
index 0000000..6a528e9
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/DigestResolver.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
+import org.apache.cassandra.net.MessageIn;
+
+public class DigestResolver extends ResponseResolver
+{
+    private volatile ReadResponse dataResponse;
+
+    public DigestResolver(Keyspace keyspace, ReadCommand command, ConsistencyLevel consistency, int maxResponseCount)
+    {
+        super(keyspace, command, consistency, maxResponseCount);
+    }
+
+    @Override
+    public void preprocess(MessageIn<ReadResponse> message)
+    {
+        super.preprocess(message);
+        if (dataResponse == null && !message.payload.isDigestResponse())
+            dataResponse = message.payload;
+    }
+
+    /**
+     * Special case of resolve() so that CL.ONE reads never throw DigestMismatchException in the foreground
+     */
+    public PartitionIterator getData()
+    {
+        assert isDataPresent();
+        return UnfilteredPartitionIterators.filter(dataResponse.makeIterator(command), command.nowInSec());
+    }
+
+    /*
+     * This method handles two different scenarios:
+     *
+     * a) we're handling the initial read of data from the closest replica + digests
+     *    from the rest. In this case we check the digests against each other,
+     *    throw an exception if there is a mismatch, otherwise return the data row.
+     *
+     * b) we're checking additional digests that arrived after the minimum to handle
+     *    the requested ConsistencyLevel, i.e. asynchronous read repair check
+     */
+    public PartitionIterator resolve() throws DigestMismatchException
+    {
+        if (responses.size() == 1)
+            return getData();
+
+        if (logger.isTraceEnabled())
+            logger.trace("resolving {} responses", responses.size());
+
+        compareResponses();
+
+        return UnfilteredPartitionIterators.filter(dataResponse.makeIterator(command), command.nowInSec());
+    }
+
+    public void compareResponses() throws DigestMismatchException
+    {
+        long start = System.nanoTime();
+
+        // validate digests against each other; throw immediately on mismatch.
+        ByteBuffer digest = null;
+        for (MessageIn<ReadResponse> message : responses)
+        {
+            ReadResponse response = message.payload;
+
+            ByteBuffer newDigest = response.digest(command);
+            if (digest == null)
+                digest = newDigest;
+            else if (!digest.equals(newDigest))
+                // rely on the fact that only single partition queries use digests
+                throw new DigestMismatchException(((SinglePartitionReadCommand)command).partitionKey(), digest, newDigest);
+        }
+
+        if (logger.isTraceEnabled())
+            logger.trace("resolve: {} ms.", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
+    }
+
+    public boolean isDataPresent()
+    {
+        return dataResponse != null;
+    }
+}

diff --git a/src/java/org/apache/cassandra/service/FileCacheService.java b/src/java/org/apache/cassandra/service/FileCacheService.java
deleted file mode 100644
index 19d6a70..0000000
--- a/src/java/org/apache/cassandra/service/FileCacheService.java
+++ /dev/null

@@ -1,190 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import java.util.Queue;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentLinkedQueue;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-
-import com.google.common.cache.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.metrics.FileCacheMetrics;
-
-public class FileCacheService
-{
-    private static final Logger logger = LoggerFactory.getLogger(FileCacheService.class);
-
-    private static final long MEMORY_USAGE_THRESHOLD = DatabaseDescriptor.getFileCacheSizeInMB() * 1024L * 1024L;
-    private static final int AFTER_ACCESS_EXPIRATION = 512; // in millis
-
-    public static FileCacheService instance = new FileCacheService();
-
-    private static final AtomicLong cacheKeyIdCounter = new AtomicLong();
-    public static final class CacheKey
-    {
-        final long id;
-        public CacheKey()
-        {
-            this.id = cacheKeyIdCounter.incrementAndGet();
-        }
-        public boolean equals(Object that)
-        {
-            return that instanceof CacheKey && ((CacheKey) that).id == this.id;
-        }
-        public int hashCode()
-        {
-            return (int) id;
-        }
-    }
-
-    private static final Callable<CacheBucket> cacheForPathCreator = new Callable<CacheBucket>()
-    {
-        @Override
-        public CacheBucket call()
-        {
-            return new CacheBucket();
-        }
-    };
-
-    private static final AtomicInteger memoryUsage = new AtomicInteger();
-
-    private final Cache<CacheKey, CacheBucket> cache;
-    private final FileCacheMetrics metrics = new FileCacheMetrics();
-
-    private static final class CacheBucket
-    {
-        final ConcurrentLinkedQueue<RandomAccessReader> queue = new ConcurrentLinkedQueue<>();
-        volatile boolean discarded = false;
-    }
-
-    protected FileCacheService()
-    {
-        RemovalListener<CacheKey, CacheBucket> onRemove = new RemovalListener<CacheKey, CacheBucket>()
-        {
-            @Override
-            public void onRemoval(RemovalNotification<CacheKey, CacheBucket> notification)
-            {
-                CacheBucket bucket = notification.getValue();
-                if (bucket == null)
-                    return;
-
-                // set discarded before deallocating the readers, to ensure we don't leak any
-                bucket.discarded = true;
-                Queue<RandomAccessReader> q = bucket.queue;
-                boolean first = true;
-                for (RandomAccessReader reader = q.poll() ; reader != null ; reader = q.poll())
-                {
-                    if (logger.isDebugEnabled() && first)
-                    {
-                        logger.debug("Evicting cold readers for {}", reader.getPath());
-                        first = false;
-                    }
-                    memoryUsage.addAndGet(-1 * reader.getTotalBufferSize());
-                    reader.deallocate();
-                }
-            }
-        };
-
-        cache = CacheBuilder.newBuilder()
-                .expireAfterAccess(AFTER_ACCESS_EXPIRATION, TimeUnit.MILLISECONDS)
-                .concurrencyLevel(DatabaseDescriptor.getConcurrentReaders())
-                .removalListener(onRemove)
-                .initialCapacity(16 << 10)
-                .build();
-    }
-
-    public RandomAccessReader get(CacheKey key)
-    {
-        metrics.requests.mark();
-
-        CacheBucket bucket = getCacheFor(key);
-        RandomAccessReader result = bucket.queue.poll();
-        if (result != null)
-        {
-            metrics.hits.mark();
-            memoryUsage.addAndGet(-result.getTotalBufferSize());
-        }
-
-        return result;
-    }
-
-    private CacheBucket getCacheFor(CacheKey key)
-    {
-        try
-        {
-            return cache.get(key, cacheForPathCreator);
-        }
-        catch (ExecutionException e)
-        {
-            throw new AssertionError(e);
-        }
-    }
-
-    @SuppressWarnings("resource")
-    public void put(CacheKey cacheKey, RandomAccessReader instance)
-    {
-        int memoryUsed = memoryUsage.get();
-        if (logger.isTraceEnabled())
-            logger.trace("Estimated memory usage is {} compared to actual usage {}", memoryUsed, sizeInBytes());
-
-        CacheBucket bucket = cache.getIfPresent(cacheKey);
-        if (memoryUsed >= MEMORY_USAGE_THRESHOLD || bucket == null)
-        {
-            instance.deallocate();
-        }
-        else
-        {
-            memoryUsage.addAndGet(instance.getTotalBufferSize());
-            bucket.queue.add(instance);
-            if (bucket.discarded)
-            {
-                RandomAccessReader reader = bucket.queue.poll();
-                if (reader != null)
-                {
-                    memoryUsage.addAndGet(-1 * reader.getTotalBufferSize());
-                    reader.deallocate();
-                }
-            }
-        }
-    }
-
-    public void invalidate(CacheKey cacheKey, String path)
-    {
-        if (logger.isDebugEnabled())
-            logger.debug("Invalidating cache for {}", path);
-        cache.invalidate(cacheKey);
-    }
-
-    // TODO: this method is unsafe, as it calls getTotalBufferSize() on items that can have been discarded
-    public long sizeInBytes()
-    {
-        long n = 0;
-        for (CacheBucket bucket : cache.asMap().values())
-            for (RandomAccessReader reader : bucket.queue)
-                n += reader.getTotalBufferSize();
-        return n;
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/GCInspector.java b/src/java/org/apache/cassandra/service/GCInspector.java
index 4f93097..787d79a 100644
--- a/src/java/org/apache/cassandra/service/GCInspector.java
+++ b/src/java/org/apache/cassandra/service/GCInspector.java

@@ -41,7 +41,7 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 
-import org.apache.cassandra.io.sstable.SSTableDeletingTask;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.utils.MBeanWrapper;
 import org.apache.cassandra.utils.StatusLogger;
 
@@ -198,10 +198,10 @@
     }
 
     /*
-     * Assume that a GC type is an old generation collection so SSTableDeletingTask.rescheduleFailedTasks()
+     * Assume that a GC type is an old generation collection so TransactionLogs.rescheduleFailedTasks()
      * should be invoked.
      *
-     * Defaults to not invoking SSTableDeletingTask.rescheduleFailedTasks() on unrecognized GC names
+     * Defaults to not invoking TransactionLogs.rescheduleFailedTasks() on unrecognized GC names
      */
     private static boolean assumeGCIsOldGen(GarbageCollectorMXBean gc)
     {
@@ -219,7 +219,7 @@
                 return true;
             default:
                 //Assume not old gen otherwise, don't call
-                //SSTableDeletingTask.rescheduleFailedTasks()
+                //TransactionLogs.rescheduleFailedTasks()
                 return false;
         }
     }
@@ -291,7 +291,7 @@
 
             // if we just finished an old gen collection and we're still using a lot of memory, try to reduce the pressure
             if (gcState.assumeGCIsOldGen)
-                SSTableDeletingTask.rescheduleFailedTasks();
+                LifecycleTransaction.rescheduleFailedDeletions();
         }
     }
 

diff --git a/src/java/org/apache/cassandra/service/IReadCommand.java b/src/java/org/apache/cassandra/service/IReadCommand.java
deleted file mode 100644
index c6a129e..0000000
--- a/src/java/org/apache/cassandra/service/IReadCommand.java
+++ /dev/null

@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-public interface IReadCommand
-{
-    public String getKeyspace();
-    public long getTimeout();
-}

diff --git a/src/java/org/apache/cassandra/service/IResponseResolver.java b/src/java/org/apache/cassandra/service/IResponseResolver.java
deleted file mode 100644
index 17c8bff..0000000
--- a/src/java/org/apache/cassandra/service/IResponseResolver.java
+++ /dev/null

@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import org.apache.cassandra.net.MessageIn;
-
-public interface IResponseResolver<TMessage, TResolved> {
-
-    /**
-     * This Method resolves the responses that are passed in . for example : if
-     * its write response then all we get is true or false return values which
-     * implies if the writes were successful but for reads its more complicated
-     * you need to look at the responses and then based on differences schedule
-     * repairs . Hence you need to derive a response resolver based on your
-     * needs from this interface.
-     */
-    public TResolved resolve() throws DigestMismatchException;
-
-    public boolean isDataPresent();
-
-    /**
-     * returns the data response without comparing with any digests
-     */
-    public TResolved getData();
-
-    public void preprocess(MessageIn<TMessage> message);
-    public Iterable<MessageIn<TMessage>> getMessages();
-}

diff --git a/src/java/org/apache/cassandra/service/MigrationListener.java b/src/java/org/apache/cassandra/service/MigrationListener.java
index 358b236..19d2592 100644
--- a/src/java/org/apache/cassandra/service/MigrationListener.java
+++ b/src/java/org/apache/cassandra/service/MigrationListener.java

@@ -31,6 +31,11 @@
     {
     }
 
+    public void onCreateView(String ksName, String viewName)
+    {
+        onCreateColumnFamily(ksName, viewName);
+    }
+
     public void onCreateUserType(String ksName, String typeName)
     {
     }
@@ -47,10 +52,17 @@
     {
     }
 
-    public void onUpdateColumnFamily(String ksName, String cfName, boolean columnsDidChange)
+    // the boolean flag indicates whether the change that triggered this event may have a substantive
+    // impact on statements using the column family.
+    public void onUpdateColumnFamily(String ksName, String cfName, boolean affectsStatements)
     {
     }
 
+    public void onUpdateView(String ksName, String viewName, boolean columnsDidChange)
+    {
+        onUpdateColumnFamily(ksName, viewName, columnsDidChange);
+    }
+
     public void onUpdateUserType(String ksName, String typeName)
     {
     }
@@ -71,6 +83,11 @@
     {
     }
 
+    public void onDropView(String ksName, String viewName)
+    {
+        onDropColumnFamily(ksName, viewName);
+    }
+
     public void onDropUserType(String ksName, String typeName)
     {
     }

diff --git a/src/java/org/apache/cassandra/service/MigrationManager.java b/src/java/org/apache/cassandra/service/MigrationManager.java
index d025196..26b1aed 100644
--- a/src/java/org/apache/cassandra/service/MigrationManager.java
+++ b/src/java/org/apache/cassandra/service/MigrationManager.java

@@ -17,27 +17,21 @@
  */
 package org.apache.cassandra.service;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.*;
-import java.util.concurrent.CopyOnWriteArrayList;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
 import java.util.concurrent.*;
-
 import java.lang.management.ManagementFactory;
 import java.lang.management.RuntimeMXBean;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
 import org.apache.cassandra.cql3.functions.UDAggregate;
 import org.apache.cassandra.cql3.functions.UDFunction;
 import org.apache.cassandra.db.*;
@@ -46,10 +40,13 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.*;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.schema.Tables;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.WrappedRunnable;
 
@@ -63,8 +60,10 @@
 
     public static final int MIGRATION_DELAY_IN_MS = 60000;
 
+    private static final int MIGRATION_TASK_WAIT_IN_SECONDS = Integer.parseInt(System.getProperty("cassandra.migration_task_wait_in_seconds", "1"));
+
     private final List<MigrationListener> listeners = new CopyOnWriteArrayList<>();
-    
+
     private MigrationManager() {}
 
     public void register(MigrationListener listener)
@@ -77,7 +76,7 @@
         listeners.remove(listener);
     }
 
-    public void scheduleSchemaPull(InetAddress endpoint, EndpointState state)
+    public static void scheduleSchemaPull(InetAddress endpoint, EndpointState state)
     {
         VersionedValue value = state.getApplicationState(ApplicationState.SCHEMA);
 
@@ -114,27 +113,24 @@
         {
             // Include a delay to make sure we have a chance to apply any changes being
             // pushed out simultaneously. See CASSANDRA-5025
-            Runnable runnable = new Runnable()
+            Runnable runnable = () ->
             {
-                public void run()
+                // grab the latest version of the schema since it may have changed again since the initial scheduling
+                EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
+                if (epState == null)
                 {
-                    // grab the latest version of the schema since it may have changed again since the initial scheduling
-                    EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
-                    if (epState == null)
-                    {
-                        logger.debug("epState vanished for {}, not submitting migration task", endpoint);
-                        return;
-                    }
-                    VersionedValue value = epState.getApplicationState(ApplicationState.SCHEMA);
-                    UUID currentVersion = UUID.fromString(value.value);
-                    if (Schema.instance.getVersion().equals(currentVersion))
-                    {
-                        logger.debug("not submitting migration task for {} because our versions match", endpoint);
-                        return;
-                    }
-                    logger.debug("submitting migration task for {}", endpoint);
-                    submitMigrationTask(endpoint);
+                    logger.debug("epState vanished for {}, not submitting migration task", endpoint);
+                    return;
                 }
+                VersionedValue value = epState.getApplicationState(ApplicationState.SCHEMA);
+                UUID currentVersion = UUID.fromString(value.value);
+                if (Schema.instance.getVersion().equals(currentVersion))
+                {
+                    logger.debug("not submitting migration task for {} because our versions match", endpoint);
+                    return;
+                }
+                logger.debug("submitting migration task for {}", endpoint);
+                submitMigrationTask(endpoint);
             };
             ScheduledExecutors.nonPeriodicTasks.schedule(runnable, MIGRATION_DELAY_IN_MS, TimeUnit.MILLISECONDS);
         }
@@ -156,16 +152,41 @@
          * Don't request schema from fat clients
          */
         return MessagingService.instance().knowsVersion(endpoint)
-                && MessagingService.instance().getRawVersion(endpoint) == MessagingService.current_version
+                && is30Compatible(MessagingService.instance().getRawVersion(endpoint))
                 && !Gossiper.instance.isGossipOnlyMember(endpoint);
     }
 
+    // Since 3.0.14 protocol contains only a CASSANDRA-13004 bugfix, it is safe to accept schema changes
+    // from both 3.0 and 3.0.14.
+    private static boolean is30Compatible(int version)
+    {
+        return version == MessagingService.current_version || version == MessagingService.VERSION_3014;
+    }
+
     public static boolean isReadyForBootstrap()
     {
-        return ((ThreadPoolExecutor) StageManager.getStage(Stage.MIGRATION)).getActiveCount() == 0;
+        return MigrationTask.getInflightTasks().isEmpty();
     }
 
-    public void notifyCreateKeyspace(KSMetaData ksm)
+    public static void waitUntilReadyForBootstrap()
+    {
+        CountDownLatch completionLatch;
+        while ((completionLatch = MigrationTask.getInflightTasks().poll()) != null)
+        {
+            try
+            {
+                if (!completionLatch.await(MIGRATION_TASK_WAIT_IN_SECONDS, TimeUnit.SECONDS))
+                    logger.error("Migration task failed to complete");
+            }
+            catch (InterruptedException e)
+            {
+                Thread.currentThread().interrupt();
+                logger.error("Migration task was interrupted");
+            }
+        }
+    }
+
+    public void notifyCreateKeyspace(KeyspaceMetadata ksm)
     {
         for (MigrationListener listener : listeners)
             listener.onCreateKeyspace(ksm.name);
@@ -177,6 +198,12 @@
             listener.onCreateColumnFamily(cfm.ksName, cfm.cfName);
     }
 
+    public void notifyCreateView(ViewDefinition view)
+    {
+        for (MigrationListener listener : listeners)
+            listener.onCreateView(view.ksName, view.viewName);
+    }
+
     public void notifyCreateUserType(UserType ut)
     {
         for (MigrationListener listener : listeners)
@@ -189,14 +216,13 @@
             listener.onCreateFunction(udf.name().keyspace, udf.name().name, udf.argTypes());
     }
 
-
     public void notifyCreateAggregate(UDAggregate udf)
     {
         for (MigrationListener listener : listeners)
             listener.onCreateAggregate(udf.name().keyspace, udf.name().name, udf.argTypes());
     }
 
-    public void notifyUpdateKeyspace(KSMetaData ksm)
+    public void notifyUpdateKeyspace(KeyspaceMetadata ksm)
     {
         for (MigrationListener listener : listeners)
             listener.onUpdateKeyspace(ksm.name);
@@ -208,10 +234,19 @@
             listener.onUpdateColumnFamily(cfm.ksName, cfm.cfName, columnsDidChange);
     }
 
+    public void notifyUpdateView(ViewDefinition view, boolean columnsDidChange)
+    {
+        for (MigrationListener listener : listeners)
+            listener.onUpdateView(view.ksName, view.viewName, columnsDidChange);
+    }
+
     public void notifyUpdateUserType(UserType ut)
     {
         for (MigrationListener listener : listeners)
             listener.onUpdateUserType(ut.keyspace, ut.getNameAsString());
+
+        // FIXME: remove when we get rid of AbstractType in metadata. Doesn't really belong anywhere.
+        Schema.instance.getKSMetaData(ut.keyspace).functions.udfs().forEach(f -> f.userTypeUpdated(ut.keyspace, ut.getNameAsString()));
     }
 
     public void notifyUpdateFunction(UDFunction udf)
@@ -226,7 +261,7 @@
             listener.onUpdateAggregate(udf.name().keyspace, udf.name().name, udf.argTypes());
     }
 
-    public void notifyDropKeyspace(KSMetaData ksm)
+    public void notifyDropKeyspace(KeyspaceMetadata ksm)
     {
         for (MigrationListener listener : listeners)
             listener.onDropKeyspace(ksm.name);
@@ -238,6 +273,12 @@
             listener.onDropColumnFamily(cfm.ksName, cfm.cfName);
     }
 
+    public void notifyDropView(ViewDefinition view)
+    {
+        for (MigrationListener listener : listeners)
+            listener.onDropView(view.ksName, view.viewName);
+    }
+
     public void notifyDropUserType(UserType ut)
     {
         for (MigrationListener listener : listeners)
@@ -256,17 +297,17 @@
             listener.onDropAggregate(udf.name().keyspace, udf.name().name, udf.argTypes());
     }
 
-    public static void announceNewKeyspace(KSMetaData ksm) throws ConfigurationException
+    public static void announceNewKeyspace(KeyspaceMetadata ksm) throws ConfigurationException
     {
         announceNewKeyspace(ksm, false);
     }
 
-    public static void announceNewKeyspace(KSMetaData ksm, boolean announceLocally) throws ConfigurationException
+    public static void announceNewKeyspace(KeyspaceMetadata ksm, boolean announceLocally) throws ConfigurationException
     {
         announceNewKeyspace(ksm, FBUtilities.timestampMicros(), announceLocally);
     }
 
-    public static void announceNewKeyspace(KSMetaData ksm, long timestamp, boolean announceLocally) throws ConfigurationException
+    public static void announceNewKeyspace(KeyspaceMetadata ksm, long timestamp, boolean announceLocally) throws ConfigurationException
     {
         ksm.validate();
 
@@ -274,7 +315,7 @@
             throw new AlreadyExistsException(ksm.name);
 
         logger.info(String.format("Create new Keyspace: %s", ksm));
-        announce(LegacySchemaTables.makeCreateKeyspaceMutation(ksm, timestamp), announceLocally);
+        announce(SchemaKeyspace.makeCreateKeyspaceMutation(ksm, timestamp), announceLocally);
     }
 
     public static void announceNewColumnFamily(CFMetaData cfm) throws ConfigurationException
@@ -287,70 +328,75 @@
         announceNewColumnFamily(cfm, announceLocally, true);
     }
 
-    /**
-     * Announces the table even if the definition is already know locally.
-     * This should generally be avoided but is used internally when we want to force the most up to date version of
-     * a system table schema (Note that we don't know if the schema we force _is_ the most recent version or not, we
-     * just rely on idempotency to basically ignore that announce if it's not. That's why we can't use announceUpdateColumnFamily,
-     * it would for instance delete new columns if this is not called with the most up-to-date version)
-     *
-     * Note that this is only safe for system tables where we know the cfId is fixed and will be the same whatever version
-     * of the definition is used.
-     */
-    public static void forceAnnounceNewColumnFamily(CFMetaData cfm) throws ConfigurationException
+    private static void announceNewColumnFamily(CFMetaData cfm, boolean announceLocally, boolean throwOnDuplicate) throws ConfigurationException
     {
-        announceNewColumnFamily(cfm, false, false);
+        announceNewColumnFamily(cfm, announceLocally, throwOnDuplicate, FBUtilities.timestampMicros());
     }
 
-    private static void announceNewColumnFamily(CFMetaData cfm, boolean announceLocally, boolean throwOnDuplicate) throws ConfigurationException
+    private static void announceNewColumnFamily(CFMetaData cfm, boolean announceLocally, boolean throwOnDuplicate, long timestamp) throws ConfigurationException
     {
         cfm.validate();
 
-        KSMetaData ksm = Schema.instance.getKSMetaData(cfm.ksName);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(cfm.ksName);
         if (ksm == null)
             throw new ConfigurationException(String.format("Cannot add table '%s' to non existing keyspace '%s'.", cfm.cfName, cfm.ksName));
-        else if (throwOnDuplicate && ksm.cfMetaData().containsKey(cfm.cfName))
+        // If we have a table or a view which has the same name, we can't add a new one
+        else if (throwOnDuplicate && ksm.getTableOrViewNullable(cfm.cfName) != null)
             throw new AlreadyExistsException(cfm.ksName, cfm.cfName);
 
         logger.info(String.format("Create new table: %s", cfm));
-        announce(LegacySchemaTables.makeCreateTableMutation(ksm, cfm, FBUtilities.timestampMicros()), announceLocally);
+        announce(SchemaKeyspace.makeCreateTableMutation(ksm, cfm, timestamp), announceLocally);
+    }
+
+    public static void announceNewView(ViewDefinition view, boolean announceLocally) throws ConfigurationException
+    {
+        view.metadata.validate();
+
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(view.ksName);
+        if (ksm == null)
+            throw new ConfigurationException(String.format("Cannot add table '%s' to non existing keyspace '%s'.", view.viewName, view.ksName));
+        else if (ksm.getTableOrViewNullable(view.viewName) != null)
+            throw new AlreadyExistsException(view.ksName, view.viewName);
+
+        logger.info(String.format("Create new view: %s", view));
+        announce(SchemaKeyspace.makeCreateViewMutation(ksm, view, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceNewType(UserType newType, boolean announceLocally)
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(newType.keyspace);
-        announce(LegacySchemaTables.makeCreateTypeMutation(ksm, newType, FBUtilities.timestampMicros()), announceLocally);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(newType.keyspace);
+        announce(SchemaKeyspace.makeCreateTypeMutation(ksm, newType, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceNewFunction(UDFunction udf, boolean announceLocally)
     {
         logger.info(String.format("Create scalar function '%s'", udf.name()));
-        KSMetaData ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
-        announce(LegacySchemaTables.makeCreateFunctionMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
+        announce(SchemaKeyspace.makeCreateFunctionMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceNewAggregate(UDAggregate udf, boolean announceLocally)
     {
         logger.info(String.format("Create aggregate function '%s'", udf.name()));
-        KSMetaData ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
-        announce(LegacySchemaTables.makeCreateAggregateMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
+        announce(SchemaKeyspace.makeCreateAggregateMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
     }
 
-    public static void announceKeyspaceUpdate(KSMetaData ksm) throws ConfigurationException
+    public static void announceKeyspaceUpdate(KeyspaceMetadata ksm) throws ConfigurationException
     {
         announceKeyspaceUpdate(ksm, false);
     }
 
-    public static void announceKeyspaceUpdate(KSMetaData ksm, boolean announceLocally) throws ConfigurationException
+    public static void announceKeyspaceUpdate(KeyspaceMetadata ksm, boolean announceLocally) throws ConfigurationException
     {
         ksm.validate();
 
-        KSMetaData oldKsm = Schema.instance.getKSMetaData(ksm.name);
+        KeyspaceMetadata oldKsm = Schema.instance.getKSMetaData(ksm.name);
         if (oldKsm == null)
             throw new ConfigurationException(String.format("Cannot update non existing keyspace '%s'.", ksm.name));
 
         logger.info(String.format("Update Keyspace '%s' From %s To %s", ksm.name, oldKsm, ksm));
-        announce(LegacySchemaTables.makeCreateKeyspaceMutation(ksm, FBUtilities.timestampMicros()), announceLocally);
+        announce(SchemaKeyspace.makeCreateKeyspaceMutation(ksm.name, ksm.params, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceColumnFamilyUpdate(CFMetaData cfm) throws ConfigurationException
@@ -360,17 +406,52 @@
 
     public static void announceColumnFamilyUpdate(CFMetaData cfm, boolean announceLocally) throws ConfigurationException
     {
+        announceColumnFamilyUpdate(cfm, null, announceLocally);
+    }
+
+    public static void announceColumnFamilyUpdate(CFMetaData cfm, Collection<ViewDefinition> views, boolean announceLocally) throws ConfigurationException
+    {
         cfm.validate();
 
         CFMetaData oldCfm = Schema.instance.getCFMetaData(cfm.ksName, cfm.cfName);
         if (oldCfm == null)
             throw new ConfigurationException(String.format("Cannot update non existing table '%s' in keyspace '%s'.", cfm.cfName, cfm.ksName));
-        KSMetaData ksm = Schema.instance.getKSMetaData(cfm.ksName);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(cfm.ksName);
 
-        oldCfm.validateCompatility(cfm);
+        oldCfm.validateCompatibility(cfm);
+
+        long timestamp = FBUtilities.timestampMicros();
 
         logger.info(String.format("Update table '%s/%s' From %s To %s", cfm.ksName, cfm.cfName, oldCfm, cfm));
-        announce(LegacySchemaTables.makeUpdateTableMutation(ksm, oldCfm, cfm, FBUtilities.timestampMicros()), announceLocally);
+        Mutation mutation = SchemaKeyspace.makeUpdateTableMutation(ksm, oldCfm, cfm, timestamp);
+
+        if (views != null)
+            views.forEach(view -> addViewUpdateToMutation(view, mutation, timestamp));
+
+        announce(mutation, announceLocally);
+    }
+
+    public static void announceViewUpdate(ViewDefinition view, boolean announceLocally) throws ConfigurationException
+    {
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(view.ksName);
+        long timestamp = FBUtilities.timestampMicros();
+        Mutation mutation = SchemaKeyspace.makeCreateKeyspaceMutation(ksm.name, ksm.params, timestamp);
+        addViewUpdateToMutation(view, mutation, timestamp);
+        announce(mutation, announceLocally);
+    }
+
+    private static void addViewUpdateToMutation(ViewDefinition view, Mutation mutation, long timestamp)
+    {
+        view.metadata.validate();
+
+        ViewDefinition oldView = Schema.instance.getView(view.ksName, view.viewName);
+        if (oldView == null)
+            throw new ConfigurationException(String.format("Cannot update non existing materialized view '%s' in keyspace '%s'.", view.viewName, view.ksName));
+
+        oldView.metadata.validateCompatibility(view.metadata);
+
+        logger.info(String.format("Update view '%s/%s' From %s To %s", view.ksName, view.viewName, oldView, view));
+        SchemaKeyspace.makeUpdateViewMutation(mutation, oldView, view, timestamp);
     }
 
     public static void announceTypeUpdate(UserType updatedType, boolean announceLocally)
@@ -385,12 +466,12 @@
 
     public static void announceKeyspaceDrop(String ksName, boolean announceLocally) throws ConfigurationException
     {
-        KSMetaData oldKsm = Schema.instance.getKSMetaData(ksName);
+        KeyspaceMetadata oldKsm = Schema.instance.getKSMetaData(ksName);
         if (oldKsm == null)
             throw new ConfigurationException(String.format("Cannot drop non existing keyspace '%s'.", ksName));
 
         logger.info(String.format("Drop Keyspace '%s'", oldKsm.name));
-        announce(LegacySchemaTables.makeDropKeyspaceMutation(oldKsm, FBUtilities.timestampMicros()), announceLocally);
+        announce(SchemaKeyspace.makeDropKeyspaceMutation(oldKsm, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceColumnFamilyDrop(String ksName, String cfName) throws ConfigurationException
@@ -403,10 +484,21 @@
         CFMetaData oldCfm = Schema.instance.getCFMetaData(ksName, cfName);
         if (oldCfm == null)
             throw new ConfigurationException(String.format("Cannot drop non existing table '%s' in keyspace '%s'.", cfName, ksName));
-        KSMetaData ksm = Schema.instance.getKSMetaData(ksName);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ksName);
 
         logger.info(String.format("Drop table '%s/%s'", oldCfm.ksName, oldCfm.cfName));
-        announce(LegacySchemaTables.makeDropTableMutation(ksm, oldCfm, FBUtilities.timestampMicros()), announceLocally);
+        announce(SchemaKeyspace.makeDropTableMutation(ksm, oldCfm, FBUtilities.timestampMicros()), announceLocally);
+    }
+
+    public static void announceViewDrop(String ksName, String viewName, boolean announceLocally) throws ConfigurationException
+    {
+        ViewDefinition view = Schema.instance.getView(ksName, viewName);
+        if (view == null)
+            throw new ConfigurationException(String.format("Cannot drop non existing materialized view '%s' in keyspace '%s'.", viewName, ksName));
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ksName);
+
+        logger.info(String.format("Drop table '%s/%s'", view.ksName, view.viewName));
+        announce(SchemaKeyspace.makeDropViewMutation(ksm, view, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceTypeDrop(UserType droppedType)
@@ -416,45 +508,44 @@
 
     public static void announceTypeDrop(UserType droppedType, boolean announceLocally)
     {
-        KSMetaData ksm = Schema.instance.getKSMetaData(droppedType.keyspace);
-        announce(LegacySchemaTables.dropTypeFromSchemaMutation(ksm, droppedType, FBUtilities.timestampMicros()), announceLocally);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(droppedType.keyspace);
+        announce(SchemaKeyspace.dropTypeFromSchemaMutation(ksm, droppedType, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceFunctionDrop(UDFunction udf, boolean announceLocally)
     {
         logger.info(String.format("Drop scalar function overload '%s' args '%s'", udf.name(), udf.argTypes()));
-        KSMetaData ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
-        announce(LegacySchemaTables.makeDropFunctionMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
+        announce(SchemaKeyspace.makeDropFunctionMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
     }
 
     public static void announceAggregateDrop(UDAggregate udf, boolean announceLocally)
     {
         logger.info(String.format("Drop aggregate function overload '%s' args '%s'", udf.name(), udf.argTypes()));
-        KSMetaData ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
-        announce(LegacySchemaTables.makeDropAggregateMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(udf.name().keyspace);
+        announce(SchemaKeyspace.makeDropAggregateMutation(ksm, udf, FBUtilities.timestampMicros()), announceLocally);
+    }
+
+    static void announceGlobally(Mutation schema)
+    {
+        announce(Collections.singletonList(schema), false);
     }
 
     /**
      * actively announce a new version to active hosts via rpc
      * @param schema The schema mutation to be applied
      */
-    private static void announce(Mutation schema, boolean announceLocally)
+    static void announce(Mutation schema, boolean announceLocally)
+    {
+        announce(Collections.singletonList(schema), announceLocally);
+    }
+
+    static void announce(Collection<Mutation> schema, boolean announceLocally)
     {
         if (announceLocally)
-        {
-            try
-            {
-                LegacySchemaTables.mergeSchema(Collections.singletonList(schema), false);
-            }
-            catch (IOException e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
+            SchemaKeyspace.mergeSchema(schema);
         else
-        {
-            FBUtilities.waitOnFuture(announce(Collections.singletonList(schema)));
-        }
+            FBUtilities.waitOnFuture(announce(schema));
     }
 
     private static void pushSchemaMutation(InetAddress endpoint, Collection<Mutation> schema)
@@ -470,9 +561,9 @@
     {
         Future<?> f = StageManager.getStage(Stage.MIGRATION).submit(new WrappedRunnable()
         {
-            protected void runMayThrow() throws IOException, ConfigurationException
+            protected void runMayThrow() throws ConfigurationException
             {
-                LegacySchemaTables.mergeSchema(schema);
+                SchemaKeyspace.mergeSchemaAndAnnounceVersion(schema);
             }
         });
 
@@ -480,8 +571,8 @@
         {
             // only push schema to nodes with known and equal versions
             if (!endpoint.equals(FBUtilities.getBroadcastAddress()) &&
-                    MessagingService.instance().knowsVersion(endpoint) &&
-                    MessagingService.instance().getRawVersion(endpoint) == MessagingService.current_version)
+                MessagingService.instance().knowsVersion(endpoint) &&
+                is30Compatible(MessagingService.instance().getRawVersion(endpoint)))
                 pushSchemaMutation(endpoint, schema);
         }
 
@@ -503,16 +594,14 @@
     /**
      * Clear all locally stored schema information and reset schema to initial state.
      * Called by user (via JMX) who wants to get rid of schema disagreement.
-     *
-     * @throws IOException if schema tables truncation fails
      */
-    public static void resetLocalSchema() throws IOException
+    public static void resetLocalSchema()
     {
         logger.info("Starting local schema reset...");
 
         logger.debug("Truncating schema tables...");
 
-        LegacySchemaTables.truncateSchemaTables();
+        SchemaKeyspace.truncate();
 
         logger.debug("Clearing local schema keyspace definitions...");
 
@@ -535,6 +624,48 @@
         logger.info("Local schema reset is complete.");
     }
 
+    /**
+     * We have a set of non-local, distributed system keyspaces, e.g. system_traces, system_auth, etc.
+     * (see {@link Schema#REPLICATED_SYSTEM_KEYSPACE_NAMES}), that need to be created on cluster initialisation,
+     * and later evolved on major upgrades (sometimes minor too). This method compares the current known definitions
+     * of the tables (if the keyspace exists) to the expected, most modern ones expected by the running version of C*;
+     * if any changes have been detected, a schema Mutation will be created which, when applied, should make
+     * cluster's view of that keyspace aligned with the expected modern definition.
+     *
+     * @param keyspace   the expected modern definition of the keyspace
+     * @param generation timestamp to use for the table changes in the schema mutation
+     *
+     * @return empty Optional if the current definition is up to date, or an Optional with the Mutation that would
+     *         bring the schema in line with the expected definition.
+     */
+    static Optional<Mutation> evolveSystemKeyspace(KeyspaceMetadata keyspace, long generation)
+    {
+        Mutation mutation = null;
+
+        KeyspaceMetadata definedKeyspace = Schema.instance.getKSMetaData(keyspace.name);
+        Tables definedTables = null == definedKeyspace ? Tables.none() : definedKeyspace.tables;
+
+        for (CFMetaData table : keyspace.tables)
+        {
+            if (table.equals(definedTables.getNullable(table.cfName)))
+                continue;
+
+            if (null == mutation)
+            {
+                // for the keyspace definition itself (name, replication, durability) always use generation 0;
+                // this ensures that any changes made to replication by the user will never be overwritten.
+                mutation = SchemaKeyspace.makeCreateKeyspaceMutation(keyspace.name, keyspace.params, 0);
+            }
+
+            // for table definitions always use the provided generation; these tables, unlike their containing
+            // keyspaces, are *NOT* meant to be altered by the user; if their definitions need to change,
+            // the schema must be updated in code, and the appropriate generation must be bumped.
+            SchemaKeyspace.addTableToSchemaMutation(table, generation, true, mutation);
+        }
+
+        return Optional.ofNullable(mutation);
+    }
+
     public static class MigrationsSerializer implements IVersionedSerializer<Collection<Mutation>>
     {
         public static MigrationsSerializer instance = new MigrationsSerializer();
@@ -546,7 +677,7 @@
                 Mutation.serializer.serialize(mutation, out, version);
         }
 
-        public Collection<Mutation> deserialize(DataInput in, int version) throws IOException
+        public Collection<Mutation> deserialize(DataInputPlus in, int version) throws IOException
         {
             int count = in.readInt();
             Collection<Mutation> schema = new ArrayList<>(count);
@@ -559,7 +690,7 @@
 
         public long serializedSize(Collection<Mutation> schema, int version)
         {
-            int size = TypeSizes.NATIVE.sizeof(schema.size());
+            int size = TypeSizes.sizeof(schema.size());
             for (Mutation mutation : schema)
                 size += Mutation.serializer.serializedSize(mutation, version);
             return size;

diff --git a/src/java/org/apache/cassandra/service/MigrationTask.java b/src/java/org/apache/cassandra/service/MigrationTask.java
index b065d90..6b04756 100644
--- a/src/java/org/apache/cassandra/service/MigrationTask.java
+++ b/src/java/org/apache/cassandra/service/MigrationTask.java

@@ -20,18 +20,24 @@
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.Collection;
+import java.util.EnumSet;
+import java.util.Set;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.CountDownLatch;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.SystemKeyspace.BootstrapState;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.schema.LegacySchemaTables;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.net.IAsyncCallback;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.utils.WrappedRunnable;
 
 
@@ -39,6 +45,10 @@
 {
     private static final Logger logger = LoggerFactory.getLogger(MigrationTask.class);
 
+    private static final ConcurrentLinkedQueue<CountDownLatch> inflightTasks = new ConcurrentLinkedQueue<>();
+
+    private static final Set<BootstrapState> monitoringBootstrapStates = EnumSet.of(BootstrapState.NEEDS_BOOTSTRAP, BootstrapState.IN_PROGRESS);
+
     private final InetAddress endpoint;
 
     MigrationTask(InetAddress endpoint)
@@ -46,6 +56,11 @@
         this.endpoint = endpoint;
     }
 
+    public static ConcurrentLinkedQueue<CountDownLatch> getInflightTasks()
+    {
+        return inflightTasks;
+    }
+
     public void runMayThrow() throws Exception
     {
         if (!FailureDetector.instance.isAlive(endpoint))
@@ -65,6 +80,8 @@
 
         MessageOut message = new MessageOut<>(MessagingService.Verb.MIGRATION_REQUEST, null, MigrationManager.MigrationsSerializer.instance);
 
+        final CountDownLatch completionLatch = new CountDownLatch(1);
+
         IAsyncCallback<Collection<Mutation>> cb = new IAsyncCallback<Collection<Mutation>>()
         {
             @Override
@@ -72,16 +89,16 @@
             {
                 try
                 {
-                    LegacySchemaTables.mergeSchema(message.payload);
-                }
-                catch (IOException e)
-                {
-                    logger.error("IOException merging remote schema", e);
+                    SchemaKeyspace.mergeSchemaAndAnnounceVersion(message.payload);
                 }
                 catch (ConfigurationException e)
                 {
                     logger.error("Configuration exception merging remote schema", e);
                 }
+                finally
+                {
+                    completionLatch.countDown();
+                }
             }
 
             public boolean isLatencyForSnitch()
@@ -89,6 +106,11 @@
                 return false;
             }
         };
+
+        // Only save the latches if we need bootstrap or are bootstrapping
+        if (monitoringBootstrapStates.contains(SystemKeyspace.getBootstrapState()))
+            inflightTasks.offer(completionLatch);
+
         MessagingService.instance().sendRR(message, endpoint, cb);
     }
 }

diff --git a/src/java/org/apache/cassandra/service/NativeAccessMBean.java b/src/java/org/apache/cassandra/service/NativeAccessMBean.java
index b0c408c..0128369 100644
--- a/src/java/org/apache/cassandra/service/NativeAccessMBean.java
+++ b/src/java/org/apache/cassandra/service/NativeAccessMBean.java

@@ -19,7 +19,15 @@
 
 public interface NativeAccessMBean 
 {
+    /**
+     * Checks if the native library has been successfully linked.
+     * @return {@code true} if the library has been successfully linked, {@code false} otherwise.
+     */
     boolean isAvailable();
 
+    /**
+     * Checks if the native library is able to lock memory.
+     * @return {@code true} if the native library is able to lock memory, {@code false} otherwise.
+     */
     boolean isMemoryLockable();
 }

diff --git a/src/java/org/apache/cassandra/service/NativeTransportService.java b/src/java/org/apache/cassandra/service/NativeTransportService.java
new file mode 100644
index 0000000..587f781
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/NativeTransportService.java

@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service;
+
+import java.net.InetAddress;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import io.netty.channel.EventLoopGroup;
+import io.netty.channel.epoll.Epoll;
+import io.netty.channel.epoll.EpollEventLoopGroup;
+import io.netty.channel.nio.NioEventLoopGroup;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.metrics.ClientMetrics;
+import org.apache.cassandra.transport.ConfiguredLimit;
+import org.apache.cassandra.transport.Message;
+import org.apache.cassandra.transport.Server;
+
+/**
+ * Handles native transport server lifecycle and associated resources. Lazily initialized.
+ */
+public class NativeTransportService
+{
+
+    private static final Logger logger = LoggerFactory.getLogger(NativeTransportService.class);
+
+    private Collection<Server> servers = Collections.emptyList();
+
+    private boolean initialized = false;
+    private EventLoopGroup workerGroup;
+    private ConfiguredLimit protocolVersionLimit;
+
+    /**
+     * Creates netty thread pools and event loops.
+     */
+    @VisibleForTesting
+    synchronized void initialize()
+    {
+        if (initialized)
+            return;
+
+        if (useEpoll())
+        {
+            workerGroup = new EpollEventLoopGroup();
+            logger.info("Netty using native Epoll event loop");
+        }
+        else
+        {
+            workerGroup = new NioEventLoopGroup();
+            logger.info("Netty using Java NIO event loop");
+        }
+
+        protocolVersionLimit = ConfiguredLimit.newLimit();
+
+        int nativePort = DatabaseDescriptor.getNativeTransportPort();
+        int nativePortSSL = DatabaseDescriptor.getNativeTransportPortSSL();
+        InetAddress nativeAddr = DatabaseDescriptor.getRpcAddress();
+
+        org.apache.cassandra.transport.Server.Builder builder = new org.apache.cassandra.transport.Server.Builder()
+                                                                .withEventLoopGroup(workerGroup)
+                                                                .withProtocolVersionLimit(protocolVersionLimit)
+                                                                .withHost(nativeAddr);
+
+        if (!DatabaseDescriptor.getClientEncryptionOptions().enabled)
+        {
+            servers = Collections.singleton(builder.withSSL(false).withPort(nativePort).build());
+        }
+        else
+        {
+            if (nativePort != nativePortSSL)
+            {
+                // user asked for dedicated ssl port for supporting both non-ssl and ssl connections
+                servers = Collections.unmodifiableList(
+                                                      Arrays.asList(
+                                                                   builder.withSSL(false).withPort(nativePort).build(),
+                                                                   builder.withSSL(true).withPort(nativePortSSL).build()
+                                                      )
+                );
+            }
+            else
+            {
+                // ssl only mode using configured native port
+                servers = Collections.singleton(builder.withSSL(true).withPort(nativePort).build());
+            }
+        }
+
+        // register metrics
+        ClientMetrics.instance.init(servers);
+
+        initialized = true;
+    }
+
+    /**
+     * Starts native transport servers.
+     */
+    public void start()
+    {
+        initialize();
+        servers.forEach(Server::start);
+    }
+
+    /**
+     * Stops currently running native transport servers.
+     */
+    public void stop()
+    {
+        servers.forEach(Server::stop);
+    }
+
+    /**
+     * Ultimately stops servers and closes all resources.
+     */
+    public void destroy()
+    {
+        stop();
+        servers = Collections.emptyList();
+
+        // shutdown executors used by netty for native transport server
+        workerGroup.shutdownGracefully(3, 5, TimeUnit.SECONDS).awaitUninterruptibly();
+
+        Message.Dispatcher.shutdown();
+    }
+
+    public int getMaxProtocolVersion()
+    {
+        return protocolVersionLimit.getMaxVersion();
+    }
+
+    public void refreshMaxNegotiableProtocolVersion()
+    {
+        // lowering the max negotiable protocol version is only safe if we haven't already
+        // allowed clients to connect with a higher version. This still allows the max
+        // version to be raised, as that is safe.
+        if (initialized)
+            protocolVersionLimit.updateMaxSupportedVersion();
+    }
+
+    /**
+     * @return intend to use epoll bassed event looping
+     */
+    public static boolean useEpoll()
+    {
+        final boolean enableEpoll = Boolean.valueOf(System.getProperty("cassandra.native.epoll.enabled", "true"));
+        return enableEpoll && Epoll.isAvailable();
+    }
+
+    /**
+     * @return true in case native transport server is running
+     */
+    public boolean isRunning()
+    {
+        for (Server server : servers)
+            if (server.isRunning()) return true;
+        return false;
+    }
+
+    @VisibleForTesting
+    EventLoopGroup getWorkerGroup()
+    {
+        return workerGroup;
+    }
+
+    @VisibleForTesting
+    Collection<Server> getServers()
+    {
+        return servers;
+    }
+}

diff --git a/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java b/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java
index a7ee333..1334611 100644
--- a/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java
+++ b/src/java/org/apache/cassandra/service/PendingRangeCalculatorService.java

@@ -47,7 +47,7 @@
     // the executor will only run a single range calculation at a time while keeping at most one task queued in order
     // to trigger an update only after the most recent state change and not for each update individually
     private final JMXEnabledThreadPoolExecutor executor = new JMXEnabledThreadPoolExecutor(1, Integer.MAX_VALUE, TimeUnit.SECONDS,
-            new LinkedBlockingQueue<Runnable>(1), new NamedThreadFactory("PendingRangeCalculator"), "internal");
+            new LinkedBlockingQueue<>(1), new NamedThreadFactory("PendingRangeCalculator"), "internal");
 
     private AtomicInteger updateJobs = new AtomicInteger(0);
 
@@ -70,7 +70,7 @@
             try
             {
                 long start = System.currentTimeMillis();
-                List<String> keyspaces = Schema.instance.getNonSystemKeyspaces();
+                List<String> keyspaces = Schema.instance.getNonLocalStrategyKeyspaces();
                 for (String keyspaceName : keyspaces)
                     calculatePendingRanges(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName);
                 if (logger.isTraceEnabled())

diff --git a/src/java/org/apache/cassandra/service/RangeSliceResponseResolver.java b/src/java/org/apache/cassandra/service/RangeSliceResponseResolver.java
deleted file mode 100644
index 4242481..0000000
--- a/src/java/org/apache/cassandra/service/RangeSliceResponseResolver.java
+++ /dev/null

@@ -1,168 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import java.net.InetAddress;
-import java.util.*;
-import java.util.concurrent.ConcurrentLinkedQueue;
-
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RangeSliceReply;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.net.AsyncOneResponse;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.utils.CloseableIterator;
-import org.apache.cassandra.utils.MergeIterator;
-import org.apache.cassandra.utils.Pair;
-
-/**
- * Turns RangeSliceReply objects into row (string -> CF) maps, resolving
- * to the most recent ColumnFamily and setting up read repairs as necessary.
- */
-public class RangeSliceResponseResolver implements IResponseResolver<RangeSliceReply, Iterable<Row>>
-{
-    private static final Comparator<Pair<Row,InetAddress>> pairComparator = new Comparator<Pair<Row, InetAddress>>()
-    {
-        public int compare(Pair<Row, InetAddress> o1, Pair<Row, InetAddress> o2)
-        {
-            return o1.left.key.compareTo(o2.left.key);
-        }
-    };
-
-    private final String keyspaceName;
-    private final long timestamp;
-    private List<InetAddress> sources;
-    protected final Queue<MessageIn<RangeSliceReply>> responses = new ConcurrentLinkedQueue<>();
-    public final List<AsyncOneResponse> repairResults = new ArrayList<>();
-
-    public RangeSliceResponseResolver(String keyspaceName, long timestamp)
-    {
-        this.keyspaceName = keyspaceName;
-        this.timestamp = timestamp;
-    }
-
-    public void setSources(List<InetAddress> endpoints)
-    {
-        this.sources = endpoints;
-    }
-
-    public List<Row> getData()
-    {
-        assert !responses.isEmpty();
-        return responses.peek().payload.rows;
-    }
-
-    // Note: this would deserialize the response a 2nd time if getData was called first.
-    // (this is not currently an issue since we don't do read repair for range queries.)
-    public Iterable<Row> resolve()
-    {
-        ArrayList<RowIterator> iters = new ArrayList<>(responses.size());
-        int n = 0;
-        for (MessageIn<RangeSliceReply> response : responses)
-        {
-            RangeSliceReply reply = response.payload;
-            n = Math.max(n, reply.rows.size());
-            iters.add(new RowIterator(reply.rows.iterator(), response.from));
-        }
-        // for each row, compute the combination of all different versions seen, and repair incomplete versions
-        // TODO do we need to call close?
-        CloseableIterator<Row> iter = MergeIterator.get(iters, pairComparator, new Reducer());
-
-        List<Row> resolvedRows = new ArrayList<>(n);
-        while (iter.hasNext())
-            resolvedRows.add(iter.next());
-
-        return resolvedRows;
-    }
-
-    public void preprocess(MessageIn message)
-    {
-        responses.add(message);
-    }
-
-    public boolean isDataPresent()
-    {
-        return !responses.isEmpty();
-    }
-
-    private static class RowIterator extends AbstractIterator<Pair<Row,InetAddress>> implements CloseableIterator<Pair<Row,InetAddress>>
-    {
-        private final Iterator<Row> iter;
-        private final InetAddress source;
-
-        private RowIterator(Iterator<Row> iter, InetAddress source)
-        {
-            this.iter = iter;
-            this.source = source;
-        }
-
-        protected Pair<Row,InetAddress> computeNext()
-        {
-            return iter.hasNext() ? Pair.create(iter.next(), source) : endOfData();
-        }
-
-        public void close() {}
-    }
-
-    public Iterable<MessageIn<RangeSliceReply>> getMessages()
-    {
-        return responses;
-    }
-
-    private class Reducer extends MergeIterator.Reducer<Pair<Row,InetAddress>, Row>
-    {
-        List<ColumnFamily> versions = new ArrayList<>(sources.size());
-        List<InetAddress> versionSources = new ArrayList<InetAddress>(sources.size());
-        DecoratedKey key;
-
-        public void reduce(Pair<Row,InetAddress> current)
-        {
-            key = current.left.key;
-            versions.add(current.left.cf);
-            versionSources.add(current.right);
-        }
-
-        protected Row getReduced()
-        {
-            ColumnFamily resolved = versions.size() > 1
-                                  ? RowDataResolver.resolveSuperset(versions, timestamp)
-                                  : versions.get(0);
-            if (versions.size() < sources.size())
-            {
-                // add placeholder rows for sources that didn't have any data, so maybeScheduleRepairs sees them
-                for (InetAddress source : sources)
-                {
-                    if (!versionSources.contains(source))
-                    {
-                        versions.add(null);
-                        versionSources.add(source);
-                    }
-                }
-            }
-            // resolved can be null even if versions doesn't have all nulls because of the call to removeDeleted in resolveSuperSet
-            if (resolved != null)
-                repairResults.addAll(RowDataResolver.scheduleRepairs(resolved, keyspaceName, key, versions, versionSources));
-            versions.clear();
-            versionSources.clear();
-            return new Row(key, resolved);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/RangeSliceVerbHandler.java b/src/java/org/apache/cassandra/service/RangeSliceVerbHandler.java
deleted file mode 100644
index 0f3726c..0000000
--- a/src/java/org/apache/cassandra/service/RangeSliceVerbHandler.java
+++ /dev/null

@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import org.apache.cassandra.db.AbstractRangeCommand;
-import org.apache.cassandra.db.RangeSliceReply;
-import org.apache.cassandra.net.IVerbHandler;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.tracing.Tracing;
-
-public class RangeSliceVerbHandler implements IVerbHandler<AbstractRangeCommand>
-{
-    public void doVerb(MessageIn<AbstractRangeCommand> message, int id)
-    {
-        if (StorageService.instance.isBootstrapMode())
-        {
-            /* Don't service reads! */
-            throw new RuntimeException("Cannot service reads while bootstrapping!");
-        }
-        RangeSliceReply reply = new RangeSliceReply(message.payload.executeLocally());
-        Tracing.trace("Enqueuing response to {}", message.from);
-        MessagingService.instance().sendReply(reply.createMessage(), id, message.from);
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/ReadCallback.java b/src/java/org/apache/cassandra/service/ReadCallback.java
index 145679d..71eb0bc 100644
--- a/src/java/org/apache/cassandra/service/ReadCallback.java
+++ b/src/java/org/apache/cassandra/service/ReadCallback.java

@@ -30,8 +30,12 @@
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ReadCommand;
-import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.DuplicateRowChecker;
+import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.exceptions.ReadFailureException;
 import org.apache.cassandra.exceptions.ReadTimeoutException;
 import org.apache.cassandra.exceptions.UnavailableException;
@@ -46,16 +50,16 @@
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
-public class ReadCallback<TMessage, TResolved> implements IAsyncCallbackWithFailure<TMessage>
+public class ReadCallback implements IAsyncCallbackWithFailure<ReadResponse>
 {
     protected static final Logger logger = LoggerFactory.getLogger( ReadCallback.class );
 
-    public final IResponseResolver<TMessage, TResolved> resolver;
+    public final ResponseResolver resolver;
     private final SimpleCondition condition = new SimpleCondition();
-    final long start;
+    private final long start;
     final int blockfor;
     final List<InetAddress> endpoints;
-    private final IReadCommand command;
+    private final ReadCommand command;
     private final ConsistencyLevel consistencyLevel;
     private static final AtomicIntegerFieldUpdater<ReadCallback> recievedUpdater
             = AtomicIntegerFieldUpdater.newUpdater(ReadCallback.class, "received");
@@ -69,14 +73,17 @@
     /**
      * Constructor when response count has to be calculated and blocked for.
      */
-    public ReadCallback(IResponseResolver<TMessage, TResolved> resolver, ConsistencyLevel consistencyLevel, IReadCommand command, List<InetAddress> filteredEndpoints)
+    public ReadCallback(ResponseResolver resolver, ConsistencyLevel consistencyLevel, ReadCommand command, List<InetAddress> filteredEndpoints)
     {
-        this(resolver, consistencyLevel, consistencyLevel.blockFor(Keyspace.open(command.getKeyspace())), command, Keyspace.open(command.getKeyspace()), filteredEndpoints);
-        if (logger.isTraceEnabled())
-            logger.trace(String.format("Blockfor is %s; setting up requests to %s", blockfor, StringUtils.join(this.endpoints, ",")));
+        this(resolver,
+             consistencyLevel,
+             consistencyLevel.blockFor(Keyspace.open(command.metadata().ksName)),
+             command,
+             Keyspace.open(command.metadata().ksName),
+             filteredEndpoints);
     }
 
-    public ReadCallback(IResponseResolver<TMessage, TResolved> resolver, ConsistencyLevel consistencyLevel, int blockfor, IReadCommand command, Keyspace keyspace, List<InetAddress> endpoints)
+    public ReadCallback(ResponseResolver resolver, ConsistencyLevel consistencyLevel, int blockfor, ReadCommand command, Keyspace keyspace, List<InetAddress> endpoints)
     {
         this.command = command;
         this.keyspace = keyspace;
@@ -86,7 +93,10 @@
         this.start = System.nanoTime();
         this.endpoints = endpoints;
         // we don't support read repair (or rapid read protection) for range scans yet (CASSANDRA-6897)
-        assert !(resolver instanceof RangeSliceResponseResolver) || blockfor >= endpoints.size();
+        assert !(command instanceof PartitionRangeReadCommand) || blockfor >= endpoints.size();
+
+        if (logger.isTraceEnabled())
+            logger.trace(String.format("Blockfor is %s; setting up requests to %s", blockfor, StringUtils.join(this.endpoints, ",")));
     }
 
     public boolean await(long timePastStart, TimeUnit unit)
@@ -102,31 +112,47 @@
         }
     }
 
-    public TResolved get() throws ReadFailureException, ReadTimeoutException, DigestMismatchException
+    public void awaitResults() throws ReadFailureException, ReadTimeoutException
     {
-        if (!await(command.getTimeout(), TimeUnit.MILLISECONDS))
+        boolean signaled = await(command.getTimeout(), TimeUnit.MILLISECONDS);
+        boolean failed = blockfor + failures > endpoints.size();
+        if (signaled && !failed)
+            return;
+
+        if (Tracing.isTracing())
         {
-            // Same as for writes, see AbstractWriteResponseHandler
-            ReadTimeoutException ex = new ReadTimeoutException(consistencyLevel, received, blockfor, resolver.isDataPresent());
-            Tracing.trace("Read timeout: {}", ex.toString());
-            if (logger.isTraceEnabled())
-                logger.trace("Read timeout: {}", ex.toString());
-            throw ex;
+            String gotData = received > 0 ? (resolver.isDataPresent() ? " (including data)" : " (only digests)") : "";
+            Tracing.trace("{}; received {} of {} responses{}", new Object[]{ (failed ? "Failed" : "Timed out"), received, blockfor, gotData });
+        }
+        else if (logger.isDebugEnabled())
+        {
+            String gotData = received > 0 ? (resolver.isDataPresent() ? " (including data)" : " (only digests)") : "";
+            logger.debug("{}; received {} of {} responses{}", new Object[]{ (failed ? "Failed" : "Timed out"), received, blockfor, gotData });
         }
 
-        if (blockfor + failures > endpoints.size())
-        {
-            ReadFailureException ex = new ReadFailureException(consistencyLevel, received, failures, blockfor, resolver.isDataPresent());
-
-            if (logger.isTraceEnabled())
-                logger.trace("Read failure: {}", ex.toString());
-            throw ex;
-        }
-
-        return blockfor == 1 ? resolver.getData() : resolver.resolve();
+        // Same as for writes, see AbstractWriteResponseHandler
+        throw failed
+            ? new ReadFailureException(consistencyLevel, received, failures, blockfor, resolver.isDataPresent())
+            : new ReadTimeoutException(consistencyLevel, received, blockfor, resolver.isDataPresent());
     }
 
-    public void response(MessageIn<TMessage> message)
+
+    public PartitionIterator get() throws ReadFailureException, ReadTimeoutException, DigestMismatchException
+    {
+        awaitResults();
+
+        PartitionIterator result = blockfor == 1 ? resolver.getData() : resolver.resolve();
+        if (logger.isTraceEnabled())
+            logger.trace("Read: {} ms.", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
+        return DuplicateRowChecker.duringRead(result, endpoints);
+    }
+
+    public int blockFor()
+    {
+        return blockfor;
+    }
+
+    public void response(MessageIn<ReadResponse> message)
     {
         resolver.preprocess(message);
         int n = waitingFor(message.from)
@@ -135,7 +161,6 @@
         if (n >= blockfor && resolver.isDataPresent())
         {
             condition.signalAll();
-
             // kick off a background digest comparison if this is a result that (may have) arrived after
             // the original resolve that get() kicks off as soon as the condition is signaled
             if (blockfor < endpoints.size() && n == endpoints.size())
@@ -166,13 +191,13 @@
         return received;
     }
 
-    public void response(TMessage result)
+    public void response(ReadResponse result)
     {
-        MessageIn<TMessage> message = MessageIn.create(FBUtilities.getBroadcastAddress(),
-                                                       result,
-                                                       Collections.<String, byte[]>emptyMap(),
-                                                       MessagingService.Verb.INTERNAL_RESPONSE,
-                                                       MessagingService.current_version);
+        MessageIn<ReadResponse> message = MessageIn.create(FBUtilities.getBroadcastAddress(),
+                                                           result,
+                                                           Collections.<String, byte[]>emptyMap(),
+                                                           MessagingService.Verb.INTERNAL_RESPONSE,
+                                                           MessagingService.current_version);
         response(message);
     }
 
@@ -197,31 +222,32 @@
 
         public void run()
         {
-            // If the resolver is a RowDigestResolver, we need to do a full data read if there is a mismatch.
+            // If the resolver is a DigestResolver, we need to do a full data read if there is a mismatch.
             // Otherwise, resolve will send the repairs directly if needs be (and in that case we should never
-            // get a digest mismatch)
+            // get a digest mismatch).
             try
             {
-                resolver.resolve();
+                resolver.compareResponses();
             }
             catch (DigestMismatchException e)
             {
-                assert resolver instanceof RowDigestResolver;
+                assert resolver instanceof DigestResolver;
 
                 if (traceState != null)
                     traceState.trace("Digest mismatch: {}", e.toString());
-                if (logger.isTraceEnabled())
-                    logger.trace("Digest mismatch:", e);
+                if (logger.isDebugEnabled())
+                    logger.debug("Digest mismatch:", e);
                 
                 ReadRepairMetrics.repairedBackground.mark();
                 
-                ReadCommand readCommand = (ReadCommand) command;
-                final RowDataResolver repairResolver = new RowDataResolver(readCommand.ksName, readCommand.key, readCommand.filter(), readCommand.timestamp, endpoints.size());
+                final DataResolver repairResolver = new DataResolver(keyspace, command, consistencyLevel, endpoints.size());
                 AsyncRepairCallback repairHandler = new AsyncRepairCallback(repairResolver, endpoints.size());
 
-                MessageOut<ReadCommand> message = ((ReadCommand) command).createMessage();
                 for (InetAddress endpoint : endpoints)
+                {
+                    MessageOut<ReadCommand> message = command.createMessage(MessagingService.instance().getVersion(endpoint));
                     MessagingService.instance().sendRR(message, endpoint, repairHandler);
+                }
             }
         }
     }

diff --git a/src/java/org/apache/cassandra/service/ReplicaFilteringProtection.java b/src/java/org/apache/cassandra/service/ReplicaFilteringProtection.java
new file mode 100644
index 0000000..36d51cc
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/ReplicaFilteringProtection.java

@@ -0,0 +1,465 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service;
+
+import java.net.InetAddress;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NavigableSet;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Columns;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.exceptions.ReadTimeoutException;
+import org.apache.cassandra.exceptions.UnavailableException;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+/**
+ * Helper in charge of collecting additional queries to be done on the coordinator to protect against invalid results
+ * being included due to replica-side filtering (secondary indexes or {@code ALLOW * FILTERING}).
+ * <p>
+ * When using replica-side filtering with CL>ONE, a replica can send a stale result satisfying the filter, while updated
+ * replicas won't send a corresponding tombstone to discard that result during reconciliation. This helper identifies
+ * the rows in a replica response that don't have a corresponding row in other replica responses, and requests them by
+ * primary key to the "silent" replicas in a second fetch round.
+ * <p>
+ * See CASSANDRA-8272 and CASSANDRA-8273 for further details.
+ */
+class ReplicaFilteringProtection
+{
+    private static final Logger logger = LoggerFactory.getLogger(ReplicaFilteringProtection.class);
+
+    private final Keyspace keyspace;
+    private final ReadCommand command;
+    private final ConsistencyLevel consistency;
+    private final InetAddress[] sources;
+    private final TableMetrics tableMetrics;
+
+    /**
+     * Per-source primary keys of the rows that might be outdated so they need to be fetched.
+     * For outdated static rows we use an empty builder to signal it has to be queried.
+     */
+    private final List<SortedMap<DecoratedKey, BTreeSet.Builder<Clustering>>> rowsToFetch;
+
+    /**
+     * Per-source list of all the partitions seen by the merge listener, to be merged with the extra fetched rows.
+     */
+    private final List<List<PartitionBuilder>> originalPartitions;
+
+    ReplicaFilteringProtection(Keyspace keyspace,
+                               ReadCommand command,
+                               ConsistencyLevel consistency,
+                               InetAddress[] sources)
+    {
+        this.keyspace = keyspace;
+        this.command = command;
+        this.consistency = consistency;
+        this.sources = sources;
+        this.rowsToFetch = new ArrayList<>(sources.length);
+        this.originalPartitions = new ArrayList<>(sources.length);
+
+        for (InetAddress ignored : sources)
+        {
+            rowsToFetch.add(new TreeMap<>());
+            originalPartitions.add(new ArrayList<>());
+        }
+
+        tableMetrics = ColumnFamilyStore.metricsFor(command.metadata().cfId);
+    }
+
+    private BTreeSet.Builder<Clustering> getOrCreateToFetch(int source, DecoratedKey partitionKey)
+    {
+        return rowsToFetch.get(source).computeIfAbsent(partitionKey, k -> BTreeSet.builder(command.metadata().comparator));
+    }
+
+    /**
+     * Returns the protected results for the specified replica. These are generated fetching the extra rows and merging
+     * them with the cached original filtered results for that replica.
+     *
+     * @param source the source
+     * @return the protected results for the specified replica
+     */
+    UnfilteredPartitionIterator queryProtectedPartitions(int source)
+    {
+        UnfilteredPartitionIterator original = makeIterator(originalPartitions.get(source));
+        SortedMap<DecoratedKey, BTreeSet.Builder<Clustering>> toFetch = rowsToFetch.get(source);
+
+        if (toFetch.isEmpty())
+            return original;
+
+        // TODO: this would be more efficient if we had multi-key queries internally
+        List<UnfilteredPartitionIterator> fetched = toFetch.keySet()
+                                                           .stream()
+                                                           .map(k -> querySourceOnKey(source, k))
+                                                           .collect(Collectors.toList());
+
+        return UnfilteredPartitionIterators.merge(Arrays.asList(original, UnfilteredPartitionIterators.concat(fetched)),
+                                                  command.nowInSec(), null);
+    }
+
+    private UnfilteredPartitionIterator querySourceOnKey(int i, DecoratedKey key)
+    {
+        BTreeSet.Builder<Clustering> builder = rowsToFetch.get(i).get(key);
+        assert builder != null; // We're calling this on the result of rowsToFetch.get(i).keySet()
+
+        InetAddress source = sources[i];
+        NavigableSet<Clustering> clusterings = builder.build();
+        tableMetrics.replicaSideFilteringProtectionRequests.mark();
+        if (logger.isTraceEnabled())
+            logger.trace("Requesting rows {} in partition {} from {} for replica-side filtering protection",
+                         clusterings, key, source);
+        Tracing.trace("Requesting {} rows in partition {} from {} for replica-side filtering protection",
+                      clusterings.size(), key, source);
+
+        // build the read command taking into account that we could be requesting only in the static row
+        DataLimits limits = clusterings.isEmpty() ? DataLimits.cqlLimits(1) : DataLimits.NONE;
+        ClusteringIndexFilter filter = new ClusteringIndexNamesFilter(clusterings, command.isReversed());
+        SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(command.metadata(),
+                                                                           command.nowInSec(),
+                                                                           command.columnFilter(),
+                                                                           RowFilter.NONE,
+                                                                           limits,
+                                                                           key,
+                                                                           filter);
+        try
+        {
+            return executeReadCommand(cmd, source);
+        }
+        catch (ReadTimeoutException e)
+        {
+            int blockFor = consistency.blockFor(keyspace);
+            throw new ReadTimeoutException(consistency, blockFor - 1, blockFor, true);
+        }
+        catch (UnavailableException e)
+        {
+            int blockFor = consistency.blockFor(keyspace);
+            throw new UnavailableException(consistency, blockFor, blockFor - 1);
+        }
+    }
+
+    private UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd, InetAddress source)
+    {
+        DataResolver resolver = new DataResolver(keyspace, cmd, ConsistencyLevel.ONE, 1);
+        ReadCallback handler = new ReadCallback(resolver, ConsistencyLevel.ONE, cmd, Collections.singletonList(source));
+
+        if (StorageProxy.canDoLocalRequest(source))
+            StageManager.getStage(Stage.READ).maybeExecuteImmediately(new StorageProxy.LocalReadRunnable(cmd, handler));
+        else
+            MessagingService.instance().sendRRWithFailure(cmd.createMessage(MessagingService.current_version), source, handler);
+
+        // We don't call handler.get() because we want to preserve tombstones
+        handler.awaitResults();
+        assert resolver.responses.size() == 1;
+        return resolver.responses.get(0).payload.makeIterator(command);
+    }
+
+    /**
+     * Returns a merge listener that skips the merged rows for which any of the replicas doesn't have a version,
+     * pessimistically assuming that they are outdated. It is intended to be used during a first merge of per-replica
+     * query results to ensure we fetch enough results from the replicas to ensure we don't miss any potentially
+     * outdated result.
+     * <p>
+     * The listener will track both the accepted data and the primary keys of the rows that are considered as outdated.
+     * That way, once the query results would have been merged using this listener, further calls to
+     * {@link #queryProtectedPartitions(int)} will use the collected data to return a copy of the
+     * data originally collected from the specified replica, completed with the potentially outdated rows.
+     */
+    UnfilteredPartitionIterators.MergeListener mergeController()
+    {
+        return (partitionKey, versions) -> {
+
+            PartitionBuilder[] builders = new PartitionBuilder[sources.length];
+
+            for (int i = 0; i < sources.length; i++)
+                builders[i] = new PartitionBuilder(partitionKey, columns(versions), stats(versions));
+
+            return new UnfilteredRowIterators.MergeListener()
+            {
+                @Override
+                public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, DeletionTime[] versions)
+                {
+                    // cache the deletion time versions to be able to regenerate the original row iterator
+                    for (int i = 0; i < versions.length; i++)
+                        builders[i].setDeletionTime(versions[i]);
+                }
+
+                @Override
+                public Row onMergedRows(Row merged, Row[] versions)
+                {
+                    // cache the row versions to be able to regenerate the original row iterator
+                    for (int i = 0; i < versions.length; i++)
+                        builders[i].addRow(versions[i]);
+
+                    if (merged.isEmpty())
+                        return merged;
+
+                    boolean isPotentiallyOutdated = false;
+                    boolean isStatic = merged.isStatic();
+                    for (int i = 0; i < versions.length; i++)
+                    {
+                        Row version = versions[i];
+                        if (version == null || (isStatic && version.isEmpty()))
+                        {
+                            isPotentiallyOutdated = true;
+                            BTreeSet.Builder<Clustering> toFetch = getOrCreateToFetch(i, partitionKey);
+                            // Note that for static, we shouldn't add the clustering to the clustering set (the
+                            // ClusteringIndexNamesFilter we'll build from this later does not expect it), but the fact
+                            // we created a builder in the first place will act as a marker that the static row must be
+                            // fetched, even if no other rows are added for this partition.
+                            if (!isStatic)
+                                toFetch.add(merged.clustering());
+                        }
+                    }
+
+                    // If the row is potentially outdated (because some replica didn't send anything and so it _may_ be
+                    // an outdated result that is only present because other replica have filtered the up-to-date result
+                    // out), then we skip the row. In other words, the results of the initial merging of results by this
+                    // protection assume the worst case scenario where every row that might be outdated actually is.
+                    // This ensures that during this first phase (collecting additional row to fetch) we are guaranteed
+                    // to look at enough data to ultimately fulfill the query limit.
+                    return isPotentiallyOutdated ? null : merged;
+                }
+
+                @Override
+                public void onMergedRangeTombstoneMarkers(RangeTombstoneMarker merged, RangeTombstoneMarker[] versions)
+                {
+                    // cache the marker versions to be able to regenerate the original row iterator
+                    for (int i = 0; i < versions.length; i++)
+                        builders[i].addRangeTombstoneMarker(versions[i]);
+                }
+
+                @Override
+                public void close()
+                {
+                    for (int i = 0; i < sources.length; i++)
+                        originalPartitions.get(i).add(builders[i]);
+                }
+            };
+        };
+    }
+
+    private static PartitionColumns columns(List<UnfilteredRowIterator> versions)
+    {
+        Columns statics = Columns.NONE;
+        Columns regulars = Columns.NONE;
+        for (UnfilteredRowIterator iter : versions)
+        {
+            if (iter == null)
+                continue;
+
+            PartitionColumns cols = iter.columns();
+            statics = statics.mergeTo(cols.statics);
+            regulars = regulars.mergeTo(cols.regulars);
+        }
+        return new PartitionColumns(statics, regulars);
+    }
+
+    private static EncodingStats stats(List<UnfilteredRowIterator> iterators)
+    {
+        EncodingStats stats = EncodingStats.NO_STATS;
+        for (UnfilteredRowIterator iter : iterators)
+        {
+            if (iter == null)
+                continue;
+
+            stats = stats.mergeWith(iter.stats());
+        }
+        return stats;
+    }
+
+    private UnfilteredPartitionIterator makeIterator(List<PartitionBuilder> builders)
+    {
+        return new UnfilteredPartitionIterator()
+        {
+            final Iterator<PartitionBuilder> iterator = builders.iterator();
+
+            @Override
+            public boolean isForThrift()
+            {
+                return command.isForThrift();
+            }
+
+            @Override
+            public CFMetaData metadata()
+            {
+                return command.metadata();
+            }
+
+            @Override
+            public void close()
+            {
+                // nothing to do here
+            }
+
+            @Override
+            public boolean hasNext()
+            {
+                return iterator.hasNext();
+            }
+
+            @Override
+            public UnfilteredRowIterator next()
+            {
+                return iterator.next().build();
+            }
+        };
+    }
+
+    private class PartitionBuilder
+    {
+        private final DecoratedKey partitionKey;
+        private final PartitionColumns columns;
+        private final EncodingStats stats;
+        private DeletionTime deletionTime;
+        private Row staticRow = Rows.EMPTY_STATIC_ROW;
+        private final List<Unfiltered> contents = new ArrayList<>();
+
+        private PartitionBuilder(DecoratedKey partitionKey, PartitionColumns columns, EncodingStats stats)
+        {
+            this.partitionKey = partitionKey;
+            this.columns = columns;
+            this.stats = stats;
+        }
+
+        private void setDeletionTime(DeletionTime deletionTime)
+        {
+            this.deletionTime = deletionTime;
+        }
+
+        private void addRow(Row row)
+        {
+            if (row == null)
+                return;
+
+            if (row.isStatic())
+                staticRow = row;
+            else
+                contents.add(row);
+        }
+
+        private void addRangeTombstoneMarker(RangeTombstoneMarker marker)
+        {
+            if (marker != null)
+                contents.add(marker);
+        }
+
+        private UnfilteredRowIterator build()
+        {
+            return new UnfilteredRowIterator()
+            {
+                final Iterator<Unfiltered> iterator = contents.iterator();
+
+                @Override
+                public DeletionTime partitionLevelDeletion()
+                {
+                    return deletionTime;
+                }
+
+                @Override
+                public EncodingStats stats()
+                {
+                    return stats;
+                }
+
+                @Override
+                public CFMetaData metadata()
+                {
+                    return command.metadata();
+                }
+
+                @Override
+                public boolean isReverseOrder()
+                {
+                    return command.isReversed();
+                }
+
+                @Override
+                public PartitionColumns columns()
+                {
+                    return columns;
+                }
+
+                @Override
+                public DecoratedKey partitionKey()
+                {
+                    return partitionKey;
+                }
+
+                @Override
+                public Row staticRow()
+                {
+                    return staticRow;
+                }
+
+                @Override
+                public void close()
+                {
+                    // nothing to do here
+                }
+
+                @Override
+                public boolean hasNext()
+                {
+                    return iterator.hasNext();
+                }
+
+                @Override
+                public Unfiltered next()
+                {
+                    return iterator.next();
+                }
+            };
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/service/ResponseResolver.java b/src/java/org/apache/cassandra/service/ResponseResolver.java
new file mode 100644
index 0000000..81b18b6
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/ResponseResolver.java

@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.utils.concurrent.Accumulator;
+
+public abstract class ResponseResolver
+{
+    protected static final Logger logger = LoggerFactory.getLogger(ResponseResolver.class);
+
+    protected final Keyspace keyspace;
+    protected final ReadCommand command;
+    protected final ConsistencyLevel consistency;
+
+    // Accumulator gives us non-blocking thread-safety with optimal algorithmic constraints
+    protected final Accumulator<MessageIn<ReadResponse>> responses;
+
+    public ResponseResolver(Keyspace keyspace, ReadCommand command, ConsistencyLevel consistency, int maxResponseCount)
+    {
+        this.keyspace = keyspace;
+        this.command = command;
+        this.consistency = consistency;
+        this.responses = new Accumulator<>(maxResponseCount);
+    }
+
+    public abstract PartitionIterator getData();
+    public abstract PartitionIterator resolve() throws DigestMismatchException;
+
+    /**
+     * Compares received responses, potentially triggering a digest mismatch (for a digest resolver) and read-repairs
+     * (for a data resolver).
+     * <p>
+     * This is functionally equivalent to calling {@link #resolve()} and consuming the result, but can be slightly more
+     * efficient in some case due to the fact that we don't care about the result itself. This is used when doing
+     * asynchronous read-repairs.
+     *
+     * @throws DigestMismatchException if it's a digest resolver and the responses don't match.
+     */
+    public abstract void compareResponses() throws DigestMismatchException;
+
+    public abstract boolean isDataPresent();
+
+    public void preprocess(MessageIn<ReadResponse> message)
+    {
+        responses.add(message);
+    }
+
+    public Iterable<MessageIn<ReadResponse>> getMessages()
+    {
+        return responses;
+    }
+}

diff --git a/src/java/org/apache/cassandra/service/RowDataResolver.java b/src/java/org/apache/cassandra/service/RowDataResolver.java
deleted file mode 100644
index e15302b..0000000
--- a/src/java/org/apache/cassandra/service/RowDataResolver.java
+++ /dev/null

@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import java.net.InetAddress;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.concurrent.TimeUnit;
-
-import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.net.*;
-import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.CloseableIterator;
-import org.apache.cassandra.utils.FBUtilities;
-
-public class RowDataResolver extends AbstractRowResolver
-{
-    private int maxLiveCount = 0;
-    public List<AsyncOneResponse> repairResults = Collections.emptyList();
-    private final IDiskAtomFilter filter;
-    private final long timestamp;
-
-    public RowDataResolver(String keyspaceName, ByteBuffer key, IDiskAtomFilter qFilter, long timestamp, int maxResponseCount)
-    {
-        super(key, keyspaceName, maxResponseCount);
-        this.filter = qFilter;
-        this.timestamp = timestamp;
-    }
-
-    /*
-    * This method handles the following scenario:
-    *
-    * there was a mismatch on the initial read, so we redid the digest requests
-    * as full data reads.  In this case we need to compute the most recent version
-    * of each column, and send diffs to out-of-date replicas.
-    */
-    public Row resolve() throws DigestMismatchException
-    {
-        int replyCount = replies.size();
-        if (logger.isTraceEnabled())
-            logger.trace("resolving {} responses", replyCount);
-        long start = System.nanoTime();
-
-        ColumnFamily resolved;
-        if (replyCount > 1)
-        {
-            List<ColumnFamily> versions = new ArrayList<>(replyCount);
-            List<InetAddress> endpoints = new ArrayList<>(replyCount);
-
-            for (MessageIn<ReadResponse> message : replies)
-            {
-                ReadResponse response = message.payload;
-                ColumnFamily cf = response.row().cf;
-                assert !response.isDigestQuery() : "Received digest response to repair read from " + message.from;
-                versions.add(cf);
-                endpoints.add(message.from);
-
-                // compute maxLiveCount to prevent short reads -- see https://issues.apache.org/jira/browse/CASSANDRA-2643
-                int liveCount = cf == null ? 0 : filter.getLiveCount(cf, timestamp);
-                if (liveCount > maxLiveCount)
-                    maxLiveCount = liveCount;
-            }
-
-            resolved = resolveSuperset(versions, timestamp);
-            if (logger.isTraceEnabled())
-                logger.trace("versions merged");
-
-            // send updates to any replica that was missing part of the full row
-            // (resolved can be null even if versions doesn't have all nulls because of the call to removeDeleted in resolveSuperSet)
-            if (resolved != null)
-                repairResults = scheduleRepairs(resolved, keyspaceName, key, versions, endpoints);
-        }
-        else
-        {
-            resolved = replies.get(0).payload.row().cf;
-        }
-
-        if (logger.isTraceEnabled())
-            logger.trace("resolve: {} ms.", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
-
-        return new Row(key, resolved);
-    }
-
-    /**
-     * For each row version, compare with resolved (the superset of all row versions);
-     * if it is missing anything, send a mutation to the endpoint it come from.
-     */
-    public static List<AsyncOneResponse> scheduleRepairs(ColumnFamily resolved, String keyspaceName, DecoratedKey key, List<ColumnFamily> versions, List<InetAddress> endpoints)
-    {
-        List<AsyncOneResponse> results = new ArrayList<AsyncOneResponse>(versions.size());
-
-        for (int i = 0; i < versions.size(); i++)
-        {
-            ColumnFamily diffCf = ColumnFamily.diff(versions.get(i), resolved);
-            if (diffCf == null) // no repair needs to happen
-                continue;
-
-            // create and send the mutation message based on the diff
-            Mutation mutation = new Mutation(keyspaceName, key.getKey(), diffCf);
-            // use a separate verb here because we don't want these to be get the white glove hint-
-            // on-timeout behavior that a "real" mutation gets
-            Tracing.trace("Sending read-repair-mutation to {}", endpoints.get(i));
-            results.add(MessagingService.instance().sendRR(mutation.createMessage(MessagingService.Verb.READ_REPAIR),
-                                                           endpoints.get(i)));
-        }
-
-        return results;
-    }
-
-    static ColumnFamily resolveSuperset(Iterable<ColumnFamily> versions, long now)
-    {
-        assert Iterables.size(versions) > 0;
-
-        ColumnFamily resolved = null;
-        for (ColumnFamily cf : versions)
-        {
-            if (cf == null)
-                continue;
-
-            if (resolved == null)
-                resolved = cf.cloneMeShallow();
-            else
-                resolved.delete(cf);
-        }
-        if (resolved == null)
-            return null;
-
-        // mimic the collectCollatedColumn + removeDeleted path that getColumnFamily takes.
-        // this will handle removing columns and subcolumns that are suppressed by a row or
-        // supercolumn tombstone.
-        QueryFilter filter = new QueryFilter(null, resolved.metadata().cfName, new IdentityQueryFilter(), now);
-        List<CloseableIterator<Cell>> iters = new ArrayList<>(Iterables.size(versions));
-        for (ColumnFamily version : versions)
-            if (version != null)
-                iters.add(FBUtilities.closeableIterator(version.iterator()));
-        filter.collateColumns(resolved, iters, Integer.MIN_VALUE);
-        return ColumnFamilyStore.removeDeleted(resolved, Integer.MIN_VALUE);
-    }
-
-    public Row getData()
-    {
-        assert !replies.isEmpty();
-        return replies.get(0).payload.row();
-    }
-
-    public boolean isDataPresent()
-    {
-        return !replies.isEmpty();
-    }
-
-    public int getMaxLiveCount()
-    {
-        return maxLiveCount;
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/RowDigestResolver.java b/src/java/org/apache/cassandra/service/RowDigestResolver.java
deleted file mode 100644
index 32b26e1..0000000
--- a/src/java/org/apache/cassandra/service/RowDigestResolver.java
+++ /dev/null

@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import java.nio.ByteBuffer;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ReadResponse;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.net.MessageIn;
-
-public class RowDigestResolver extends AbstractRowResolver
-{
-    public RowDigestResolver(String keyspaceName, ByteBuffer key, int maxResponseCount)
-    {
-        super(key, keyspaceName, maxResponseCount);
-    }
-
-    /**
-     * Special case of resolve() so that CL.ONE reads never throw DigestMismatchException in the foreground
-     */
-    public Row getData()
-    {
-        for (MessageIn<ReadResponse> message : replies)
-        {
-            ReadResponse result = message.payload;
-            if (!result.isDigestQuery())
-            {
-                if (result.digest() == null)
-                    result.setDigest(ColumnFamily.digest(result.row().cf));
-
-                return result.row();
-            }
-        }
-        return null;
-    }
-
-    /*
-     * This method handles two different scenarios:
-     *
-     * a) we're handling the initial read, of data from the closest replica + digests
-     *    from the rest.  In this case we check the digests against each other,
-     *    throw an exception if there is a mismatch, otherwise return the data row.
-     *
-     * b) we're checking additional digests that arrived after the minimum to handle
-     *    the requested ConsistencyLevel, i.e. asynchronous read repair check
-     */
-    public Row resolve() throws DigestMismatchException
-    {
-        if (logger.isTraceEnabled())
-            logger.trace("resolving {} responses", replies.size());
-
-        long start = System.nanoTime();
-
-        // validate digests against each other; throw immediately on mismatch.
-        // also extract the data reply, if any.
-        ColumnFamily data = null;
-        ByteBuffer digest = null;
-
-        for (MessageIn<ReadResponse> message : replies)
-        {
-            ReadResponse response = message.payload;
-
-            ByteBuffer newDigest;
-            if (response.isDigestQuery())
-            {
-                newDigest = response.digest();
-            }
-            else
-            {
-                // note that this allows for multiple data replies, post-CASSANDRA-5932
-                data = response.row().cf;
-                if (response.digest() == null)
-                    message.payload.setDigest(ColumnFamily.digest(data));
-
-                newDigest = response.digest();
-            }
-
-            if (digest == null)
-                digest = newDigest;
-            else if (!digest.equals(newDigest))
-                throw new DigestMismatchException(key, digest, newDigest);
-        }
-
-        if (logger.isTraceEnabled())
-            logger.trace("resolve: {} ms.", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
-        return new Row(key, data);
-    }
-
-    public boolean isDataPresent()
-    {
-        for (MessageIn<ReadResponse> message : replies)
-        {
-            if (!message.payload.isDigestQuery())
-                return true;
-        }
-        return false;
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java b/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java
index a997533..179abeb 100644
--- a/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java
+++ b/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java

@@ -26,6 +26,7 @@
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.DiagnosticSnapshotService;
 
 public class SnapshotVerbHandler implements IVerbHandler<SnapshotCommand>
 {
@@ -38,6 +39,10 @@
         {
             Keyspace.clearSnapshot(command.snapshot_name, command.keyspace);
         }
+        else if (DiagnosticSnapshotService.isDiagnosticSnapshotRequest(command))
+        {
+            DiagnosticSnapshotService.snapshot(command, message.from);
+        }
         else
             Keyspace.open(command.keyspace).getColumnFamilyStore(command.column_family).snapshot(command.snapshot_name);
         logger.debug("Enqueuing response to snapshot request {} to {}", command.snapshot_name, message.from);

diff --git a/src/java/org/apache/cassandra/service/StartupChecks.java b/src/java/org/apache/cassandra/service/StartupChecks.java
index f74d3da..ae44df5 100644
--- a/src/java/org/apache/cassandra/service/StartupChecks.java
+++ b/src/java/org/apache/cassandra/service/StartupChecks.java

@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.service;
 
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.lang.management.ManagementFactory;
@@ -24,21 +25,29 @@
 import java.nio.file.*;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.*;
+import java.util.stream.Collectors;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Joiner;
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.auth.AuthKeyspace;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.StartupException;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.utils.*;
 
 /**
@@ -74,13 +83,15 @@
                                                                       checkValidLaunchDate,
                                                                       checkJMXPorts,
                                                                       inspectJvmOptions,
-                                                                      checkJnaInitialization,
+                                                                      checkNativeLibraryInitialization,
                                                                       initSigarLibrary,
+                                                                      checkMaxMapCount,
                                                                       checkDataDirs,
                                                                       checkSSTablesFormat,
                                                                       checkSystemKeyspaceState,
                                                                       checkDatacenter,
-                                                                      checkRack);
+                                                                      checkRack,
+                                                                      checkLegacyAuthTables);
 
     public StartupChecks withDefaultTests()
     {
@@ -226,13 +237,13 @@
         }
     };
 
-    public static final StartupCheck checkJnaInitialization = new StartupCheck()
+    public static final StartupCheck checkNativeLibraryInitialization = new StartupCheck()
     {
         public void execute() throws StartupException
         {
-            // Fail-fast if JNA is not available or failing to initialize properly
-            if (!CLibrary.jnaAvailable())
-                throw new StartupException(3, "JNA failing to initialize properly. ");
+            // Fail-fast if the native library could not be linked.
+            if (!NativeLibrary.isAvailable())
+                throw new StartupException(3, "The native library could not be initialized properly. ");
         }
     };
 
@@ -244,33 +255,77 @@
         }
     };
 
-    public static final StartupCheck checkDataDirs = new StartupCheck()
+    public static final StartupCheck checkMaxMapCount = new StartupCheck()
     {
-        public void execute() throws StartupException
+        private final long EXPECTED_MAX_MAP_COUNT = 1048575;
+        private final String MAX_MAP_COUNT_PATH = "/proc/sys/vm/max_map_count";
+
+        private long getMaxMapCount()
         {
-            // check all directories(data, commitlog, saved cache) for existence and permission
-            Iterable<String> dirs = Iterables.concat(Arrays.asList(DatabaseDescriptor.getAllDataFileLocations()),
-                                                     Arrays.asList(DatabaseDescriptor.getCommitLogLocation(),
-                                                                   DatabaseDescriptor.getSavedCachesLocation()));
-            for (String dataDir : dirs)
+            final Path path = Paths.get(MAX_MAP_COUNT_PATH);
+            try (final BufferedReader bufferedReader = Files.newBufferedReader(path))
             {
-                logger.debug("Checking directory {}", dataDir);
-                File dir = new File(dataDir);
-
-                // check that directories exist.
-                if (!dir.exists())
+                final String data = bufferedReader.readLine();
+                if (data != null)
                 {
-                    logger.warn("Directory {} doesn't exist", dataDir);
-                    // if they don't, failing their creation, stop cassandra.
-                    if (!dir.mkdirs())
-                        throw new StartupException(3, "Has no permission to create directory "+ dataDir);
+                    try
+                    {
+                        return Long.parseLong(data);
+                    }
+                    catch (final NumberFormatException e)
+                    {
+                        logger.warn("Unable to parse {}.", path, e);
+                    }
                 }
-
-                // if directories exist verify their permissions
-                if (!Directories.verifyFullPermissions(dir, dataDir))
-                    throw new StartupException(3, "Insufficient permissions on directory " + dataDir);
-
             }
+            catch (final IOException e)
+            {
+                logger.warn("IO exception while reading file {}.", path, e);
+            }
+            return -1;
+        }
+
+        public void execute()
+        {
+            if (!FBUtilities.hasProcFS())
+                return;
+
+            if (DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.standard &&
+                DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.standard)
+                return; // no need to check if disk access mode is only standard and not mmap
+
+            long maxMapCount = getMaxMapCount();
+            if (maxMapCount < EXPECTED_MAX_MAP_COUNT)
+                logger.warn("Maximum number of memory map areas per process (vm.max_map_count) {} " +
+                            "is too low, recommended value: {}, you can change it with sysctl.",
+                            maxMapCount, EXPECTED_MAX_MAP_COUNT);
+        }
+    };
+
+    public static final StartupCheck checkDataDirs = () ->
+    {
+        // check all directories(data, commitlog, saved cache) for existence and permission
+        Iterable<String> dirs = Iterables.concat(Arrays.asList(DatabaseDescriptor.getAllDataFileLocations()),
+                                                 Arrays.asList(DatabaseDescriptor.getCommitLogLocation(),
+                                                               DatabaseDescriptor.getSavedCachesLocation(),
+                                                               DatabaseDescriptor.getHintsDirectory().getAbsolutePath()));
+        for (String dataDir : dirs)
+        {
+            logger.debug("Checking directory {}", dataDir);
+            File dir = new File(dataDir);
+
+            // check that directories exist.
+            if (!dir.exists())
+            {
+                logger.warn("Directory {} doesn't exist", dataDir);
+                // if they don't, failing their creation, stop cassandra.
+                if (!dir.mkdirs())
+                    throw new StartupException(3, "Has no permission to create directory "+ dataDir);
+            }
+
+            // if directories exist verify their permissions
+            if (!Directories.verifyFullPermissions(dir, dataDir))
+                throw new StartupException(3, "Insufficient permissions on directory " + dataDir);
         }
     };
 
@@ -282,12 +337,13 @@
             final Set<String> nonSSTablePaths = new HashSet<>();
             nonSSTablePaths.add(FileUtils.getCanonicalPath(DatabaseDescriptor.getCommitLogLocation()));
             nonSSTablePaths.add(FileUtils.getCanonicalPath(DatabaseDescriptor.getSavedCachesLocation()));
+            nonSSTablePaths.add(FileUtils.getCanonicalPath(DatabaseDescriptor.getHintsDirectory()));
 
             FileVisitor<Path> sstableVisitor = new SimpleFileVisitor<Path>()
             {
                 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException
                 {
-                    if (!file.toString().endsWith(".db"))
+                    if (!Descriptor.isValidFile(file.getFileName().toString()))
                         return FileVisitResult.CONTINUE;
 
                     try
@@ -305,8 +361,8 @@
                 public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException
                 {
                     String name = dir.getFileName().toString();
-                    return (name.equals("snapshots")
-                            || name.equals("backups")
+                    return (name.equals(Directories.SNAPSHOT_SUBDIR)
+                            || name.equals(Directories.BACKUPS_SUBDIR)
                             || nonSSTablePaths.contains(dir.toFile().getCanonicalPath()))
                            ? FileVisitResult.SKIP_SUBTREE
                            : FileVisitResult.CONTINUE;
@@ -343,7 +399,7 @@
             // we do a one-off scrub of the system keyspace first; we can't load the list of the rest of the keyspaces,
             // until system keyspace is opened.
 
-            for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(SystemKeyspace.NAME).values())
+            for (CFMetaData cfm : Schema.instance.getTablesAndViews(SystemKeyspace.NAME))
                 ColumnFamilyStore.scrubDataDirectories(cfm);
 
             try
@@ -400,4 +456,28 @@
             }
         }
     };
+
+    public static final StartupCheck checkLegacyAuthTables = () -> checkLegacyAuthTablesMessage().ifPresent(logger::warn);
+
+    static final Set<String> LEGACY_AUTH_TABLES = ImmutableSet.of("credentials", "users", "permissions");
+
+    @VisibleForTesting
+    static Optional<String> checkLegacyAuthTablesMessage()
+    {
+        List<String> existing = new ArrayList<>(LEGACY_AUTH_TABLES).stream().filter((legacyAuthTable) ->
+            {
+                UntypedResultSet result = QueryProcessor.executeOnceInternal(String.format("SELECT table_name FROM %s.%s WHERE keyspace_name='%s' AND table_name='%s'",
+                                                                                           SchemaKeyspace.NAME,
+                                                                                           "tables",
+                                                                                           AuthKeyspace.NAME,
+                                                                                           legacyAuthTable));
+                return result != null && !result.isEmpty();
+            }).collect(Collectors.toList());
+
+        if (!existing.isEmpty())
+            return Optional.of(String.format("Legacy auth tables %s in keyspace %s still exist and have not been properly migrated.",
+                        Joiner.on(", ").join(existing), AuthKeyspace.NAME));
+        else
+            return Optional.empty();
+    };
 }

diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index b734343..c7888c4 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java

@@ -26,42 +26,49 @@
 import java.util.concurrent.atomic.AtomicLong;
 
 import com.google.common.base.Predicate;
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.cache.CacheLoader;
 import com.google.common.collect.*;
+import com.google.common.primitives.Ints;
 import com.google.common.util.concurrent.Uninterruptibles;
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.batchlog.Batch;
+import org.apache.cassandra.batchlog.BatchlogManager;
+import org.apache.cassandra.batchlog.LegacyBatchlogMigrator;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.filter.TombstoneOverwhelmingException;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.RingPosition;
-import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.view.ViewUtils;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.gms.FailureDetector;
 import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.hints.Hint;
+import org.apache.cassandra.hints.HintsService;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.locator.AbstractReplicationStrategy;
-import org.apache.cassandra.locator.IEndpointSnitch;
-import org.apache.cassandra.locator.LocalStrategy;
-import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.locator.*;
 import org.apache.cassandra.metrics.*;
 import org.apache.cassandra.net.*;
+import org.apache.cassandra.service.paxos.Commit;
+import org.apache.cassandra.service.paxos.PaxosState;
+import org.apache.cassandra.service.paxos.PrepareCallback;
+import org.apache.cassandra.service.paxos.ProposeCallback;
 import org.apache.cassandra.net.MessagingService.Verb;
-import org.apache.cassandra.service.paxos.*;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.triggers.TriggerExecutor;
 import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.AbstractIterator;
 
 public class StorageProxy implements StorageProxyMBean
 {
@@ -89,14 +96,28 @@
     private static final ClientRequestMetrics writeMetrics = new ClientRequestMetrics("Write");
     private static final CASClientRequestMetrics casWriteMetrics = new CASClientRequestMetrics("CASWrite");
     private static final CASClientRequestMetrics casReadMetrics = new CASClientRequestMetrics("CASRead");
+    private static final ViewWriteMetrics viewWriteMetrics = new ViewWriteMetrics("ViewWrite");
 
     private static final double CONCURRENT_SUBREQUESTS_MARGIN = 0.10;
 
-    private StorageProxy() {}
+    /**
+     * Introduce a maximum number of sub-ranges that the coordinator can request in parallel for range queries. Previously
+     * we would request up to the maximum number of ranges but this causes problems if the number of vnodes is large.
+     * By default we pick 10 requests per core, assuming all replicas have the same number of cores. The idea is that we
+     * don't want a burst of range requests that will back up, hurting all other queries. At the same time,
+     * we want to give range queries a chance to run if resources are available.
+     */
+    private static final int MAX_CONCURRENT_RANGE_REQUESTS = Math.max(1, Integer.getInteger("cassandra.max_concurrent_range_requests", FBUtilities.getAvailableProcessors() * 10));
+
+    private StorageProxy()
+    {
+    }
 
     static
     {
         MBeanWrapper.instance.registerMBean(instance, MBEAN_NAME);
+        HintsService.instance.registerMBean();
+        HintedHandOffManager.instance.registerMBean();
 
         standardWritePerformer = new WritePerformer()
         {
@@ -108,7 +129,7 @@
             throws OverloadedException
             {
                 assert mutation instanceof Mutation;
-                sendToHintedEndpoints((Mutation) mutation, targets, responseHandler, localDataCenter);
+                sendToHintedEndpoints((Mutation) mutation, targets, responseHandler, localDataCenter, Stage.MUTATION);
             }
         };
 
@@ -185,13 +206,13 @@
      * @return null if the operation succeeds in updating the row, or the current values corresponding to conditions.
      * (since, if the CAS doesn't succeed, it means the current value do not match the conditions).
      */
-    public static ColumnFamily cas(String keyspaceName,
-                                   String cfName,
-                                   ByteBuffer key,
-                                   CASRequest request,
-                                   ConsistencyLevel consistencyForPaxos,
-                                   ConsistencyLevel consistencyForCommit,
-                                   ClientState state)
+    public static RowIterator cas(String keyspaceName,
+                                  String cfName,
+                                  DecoratedKey key,
+                                  CASRequest request,
+                                  ConsistencyLevel consistencyForPaxos,
+                                  ConsistencyLevel consistencyForCommit,
+                                  ClientState state)
     throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException
     {
         final long start = System.nanoTime();
@@ -207,35 +228,35 @@
             while (System.nanoTime() - start < timeout)
             {
                 // for simplicity, we'll do a single liveness check at the start of each attempt
-                Pair<List<InetAddress>, Integer> p = getPaxosParticipants(keyspaceName, key, consistencyForPaxos);
+                Pair<List<InetAddress>, Integer> p = getPaxosParticipants(metadata, key, consistencyForPaxos);
                 List<InetAddress> liveEndpoints = p.left;
                 int requiredParticipants = p.right;
 
                 final Pair<UUID, Integer> pair = beginAndRepairPaxos(start, key, metadata, liveEndpoints, requiredParticipants, consistencyForPaxos, consistencyForCommit, true, state);
                 final UUID ballot = pair.left;
                 contentions += pair.right;
+
                 // read the current values and check they validate the conditions
                 Tracing.trace("Reading existing values for CAS precondition");
-                long timestamp = System.currentTimeMillis();
-                ReadCommand readCommand = ReadCommand.create(keyspaceName, key, cfName, timestamp, request.readFilter());
-                List<Row> rows = read(Arrays.asList(readCommand), consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL
-                                                                  ? ConsistencyLevel.LOCAL_QUORUM
-                                                                  : ConsistencyLevel.QUORUM);
-                ColumnFamily current = rows.get(0).cf;
-                if (current == null)
-                    current = ArrayBackedSortedColumns.factory.create(metadata);
+                SinglePartitionReadCommand readCommand = request.readCommand(FBUtilities.nowInSeconds());
+                ConsistencyLevel readConsistency = consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL ? ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.QUORUM;
+
+                FilteredPartition current;
+                try (RowIterator rowIter = readOne(readCommand, readConsistency))
+                {
+                    current = FilteredPartition.create(rowIter);
+                }
 
                 if (!request.appliesTo(current))
                 {
                     Tracing.trace("CAS precondition does not match current values {}", current);
-                    // We should not return null as this means success
                     casWriteMetrics.conditionNotMet.inc();
-                    return current;
+                    return current.rowIterator();
                 }
 
                 // finish the paxos round w/ the desired updates
                 // TODO turn null updates into delete?
-                ColumnFamily updates = request.makeUpdates(current);
+                PartitionUpdate updates = request.makeUpdates(current);
 
                 // Apply triggers to cas updates. A consideration here is that
                 // triggers emit Mutations, and so a given trigger implementation
@@ -244,9 +265,10 @@
                 // validate that the generated mutations are targetted at the same
                 // partition as the initial updates and reject (via an
                 // InvalidRequestException) any which aren't.
-                updates = TriggerExecutor.instance.execute(key, updates);
+                updates = TriggerExecutor.instance.execute(updates);
 
-                Commit proposal = Commit.newProposal(key, ballot, updates);
+
+                Commit proposal = Commit.newProposal(ballot, updates);
                 Tracing.trace("CAS precondition is met; proposing client-requested updates for {}", ballot);
                 if (proposePaxos(proposal, liveEndpoints, requiredParticipants, true, consistencyForPaxos))
                 {
@@ -298,12 +320,11 @@
         };
     }
 
-    private static Pair<List<InetAddress>, Integer> getPaxosParticipants(String keyspaceName, ByteBuffer key, ConsistencyLevel consistencyForPaxos) throws UnavailableException
+    private static Pair<List<InetAddress>, Integer> getPaxosParticipants(CFMetaData cfm, DecoratedKey key, ConsistencyLevel consistencyForPaxos) throws UnavailableException
     {
-        Token tk = StorageService.getPartitioner().getToken(key);
-        List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(keyspaceName, tk);
-        Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspaceName);
-
+        Token tk = key.getToken();
+        List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(cfm.ksName, tk);
+        Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, cfm.ksName);
         if (consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL)
         {
             // Restrict naturalEndpoints and pendingEndpoints to node in the local DC only
@@ -337,7 +358,7 @@
      * nodes have seen the mostRecentCommit.  Otherwise, return null.
      */
     private static Pair<UUID, Integer> beginAndRepairPaxos(long start,
-                                                           ByteBuffer key,
+                                                           DecoratedKey key,
                                                            CFMetaData metadata,
                                                            List<InetAddress> liveEndpoints,
                                                            int requiredParticipants,
@@ -388,7 +409,7 @@
                     casWriteMetrics.unfinishedCommit.inc();
                 else
                     casReadMetrics.unfinishedCommit.inc();
-                Commit refreshedInProgress = Commit.newProposal(inProgress.key, ballot, inProgress.update);
+                Commit refreshedInProgress = Commit.newProposal(ballot, inProgress.update);
                 if (proposePaxos(refreshedInProgress, liveEndpoints, requiredParticipants, false, consistencyForPaxos))
                 {
                     try
@@ -415,7 +436,8 @@
             // https://issues.apache.org/jira/browse/CASSANDRA-5062?focusedCommentId=13619810&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13619810)
             // Since we waited for quorum nodes, if some of them haven't seen the last commit (which may just be a timing issue, but may also
             // mean we lost messages), we pro-actively "repair" those nodes, and retry.
-            Iterable<InetAddress> missingMRC = summary.replicasMissingMostRecentCommit(metadata, ballotMicros);
+            int nowInSec = Ints.checkedCast(TimeUnit.MICROSECONDS.toSeconds(ballotMicros));
+            Iterable<InetAddress> missingMRC = summary.replicasMissingMostRecentCommit(metadata, nowInSec);
             if (Iterables.size(missingMRC) > 0)
             {
                 Tracing.trace("Repairing replicas that missed the most recent commit");
@@ -446,7 +468,7 @@
     private static PrepareCallback preparePaxos(Commit toPrepare, List<InetAddress> endpoints, int requiredParticipants, ConsistencyLevel consistencyForPaxos)
     throws WriteTimeoutException
     {
-        PrepareCallback callback = new PrepareCallback(toPrepare.key, toPrepare.update.metadata(), requiredParticipants, consistencyForPaxos);
+        PrepareCallback callback = new PrepareCallback(toPrepare.update.partitionKey(), toPrepare.update.metadata(), requiredParticipants, consistencyForPaxos);
         MessageOut<Commit> message = new MessageOut<Commit>(MessagingService.Verb.PAXOS_PREPARE, toPrepare, Commit.serializer);
         for (InetAddress target : endpoints)
             MessagingService.instance().sendRR(message, target, callback);
@@ -473,12 +495,12 @@
         return false;
     }
 
-    private static void commitPaxos(Commit proposal, ConsistencyLevel consistencyLevel, boolean shouldHint) throws WriteTimeoutException
+    private static void commitPaxos(Commit proposal, ConsistencyLevel consistencyLevel, boolean allowHints) throws WriteTimeoutException
     {
         boolean shouldBlock = consistencyLevel != ConsistencyLevel.ANY;
         Keyspace keyspace = Keyspace.open(proposal.update.metadata().ksName);
 
-        Token tk = StorageService.getPartitioner().getToken(proposal.key);
+        Token tk = proposal.update.partitionKey().getToken();
         List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(keyspace.getName(), tk);
         Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspace.getName());
 
@@ -492,22 +514,21 @@
         MessageOut<Commit> message = new MessageOut<Commit>(MessagingService.Verb.PAXOS_COMMIT, proposal, Commit.serializer);
         for (InetAddress destination : Iterables.concat(naturalEndpoints, pendingEndpoints))
         {
-
             if (FailureDetector.instance.isAlive(destination))
             {
                 if (shouldBlock)
                 {
-                    if (destination.equals(FBUtilities.getBroadcastAddress()))
+                    if (canDoLocalRequest(destination))
                         commitPaxosLocal(message, responseHandler);
                     else
-                        MessagingService.instance().sendRR(message, destination, responseHandler, shouldHint);
+                        MessagingService.instance().sendRR(message, destination, responseHandler, allowHints && shouldHint(destination));
                 }
                 else
                 {
                     MessagingService.instance().sendOneWay(message, destination);
                 }
             }
-            else if (shouldHint)
+            else if (allowHints && shouldHint(destination))
             {
                 submitHint(proposal.makeMutation(), destination, null);
             }
@@ -522,15 +543,24 @@
      * submit a fake one that executes immediately on the mutation stage, but generates the necessary backpressure
      * signal for hints
      */
-    private static void commitPaxosLocal(final MessageOut<Commit> message, final AbstractWriteResponseHandler responseHandler)
+    private static void commitPaxosLocal(final MessageOut<Commit> message, final AbstractWriteResponseHandler<?> responseHandler)
     {
         StageManager.getStage(MessagingService.verbStages.get(MessagingService.Verb.PAXOS_COMMIT)).maybeExecuteImmediately(new LocalMutationRunnable()
         {
             public void runMayThrow()
             {
-                PaxosState.commit(message.payload);
-                if (responseHandler != null)
-                    responseHandler.response(null);
+                try
+                {
+                    PaxosState.commit(message.payload);
+                    if (responseHandler != null)
+                        responseHandler.response(null);
+                }
+                catch (Exception ex)
+                {
+                    if (!(ex instanceof WriteTimeoutException))
+                        logger.error("Failed to apply paxos commit locally : {}", ex);
+                    responseHandler.onFailure(FBUtilities.getBroadcastAddress());
+                }
             }
 
             @Override
@@ -622,32 +652,152 @@
         }
     }
 
-    /** hint all the mutations (except counters, which can't be safely retried).  This means
-      * we'll re-hint any successful ones; doesn't seem worth it to track individual success
-      * just for this unusual case.
-
-      * @param mutations the mutations that require hints
-      */
+    /**
+     * Hint all the mutations (except counters, which can't be safely retried).  This means
+     * we'll re-hint any successful ones; doesn't seem worth it to track individual success
+     * just for this unusual case.
+     *
+     * Only used for CL.ANY
+     *
+     * @param mutations the mutations that require hints
+     */
     private static void hintMutations(Collection<? extends IMutation> mutations)
     {
         for (IMutation mutation : mutations)
-        {
-            if (mutation instanceof CounterMutation)
-                continue;
+            if (!(mutation instanceof CounterMutation))
+                hintMutation((Mutation) mutation);
 
-            Token tk = StorageService.getPartitioner().getToken(mutation.key());
-            List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(mutation.getKeyspaceName(), tk);
-            Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, mutation.getKeyspaceName());
-            for (InetAddress target : Iterables.concat(naturalEndpoints, pendingEndpoints))
+        Tracing.trace("Wrote hints to satisfy CL.ANY after no replicas acknowledged the write");
+    }
+
+    private static void hintMutation(Mutation mutation)
+    {
+        String keyspaceName = mutation.getKeyspaceName();
+        Token token = mutation.key().getToken();
+
+        Iterable<InetAddress> endpoints = StorageService.instance.getNaturalAndPendingEndpoints(keyspaceName, token);
+        ArrayList<InetAddress> endpointsToHint = new ArrayList<>(Iterables.size(endpoints));
+
+        // local writes can timeout, but cannot be dropped (see LocalMutationRunnable and CASSANDRA-6510),
+        // so there is no need to hint or retry.
+        for (InetAddress target : endpoints)
+            if (!target.equals(FBUtilities.getBroadcastAddress()) && shouldHint(target))
+                endpointsToHint.add(target);
+
+        submitHint(mutation, endpointsToHint, null);
+    }
+
+    public boolean appliesLocally(Mutation mutation)
+    {
+        String keyspaceName = mutation.getKeyspaceName();
+        Token token = mutation.key().getToken();
+        InetAddress local = FBUtilities.getBroadcastAddress();
+
+        return StorageService.instance.getNaturalEndpoints(keyspaceName, token).contains(local)
+               || StorageService.instance.getTokenMetadata().pendingEndpointsFor(token, keyspaceName).contains(local);
+    }
+
+    /**
+     * Use this method to have these Mutations applied
+     * across all replicas.
+     *
+     * @param mutations the mutations to be applied across the replicas
+     * @param writeCommitLog if commitlog should be written
+     * @param baseComplete time from epoch in ms that the local base mutation was(or will be) completed
+     */
+    public static void mutateMV(ByteBuffer dataKey, Collection<Mutation> mutations, boolean writeCommitLog, AtomicLong baseComplete)
+    throws UnavailableException, OverloadedException, WriteTimeoutException
+    {
+        Tracing.trace("Determining replicas for mutation");
+        final String localDataCenter = DatabaseDescriptor.getEndpointSnitch().getDatacenter(FBUtilities.getBroadcastAddress());
+
+        long startTime = System.nanoTime();
+
+
+        try
+        {
+            // if we haven't joined the ring, write everything to batchlog because paired replicas may be stale
+            final UUID batchUUID = UUIDGen.getTimeUUID();
+
+            if (StorageService.instance.isStarting() || StorageService.instance.isJoining() || StorageService.instance.isMoving())
             {
-                // local writes can timeout, but cannot be dropped (see LocalMutationRunnable and
-                // CASSANDRA-6510), so there is no need to hint or retry
-                if (!target.equals(FBUtilities.getBroadcastAddress()) && shouldHint(target))
-                    submitHint((Mutation) mutation, target, null);
+                BatchlogManager.store(Batch.createLocal(batchUUID, FBUtilities.timestampMicros(),
+                                                        mutations), writeCommitLog);
+            }
+            else
+            {
+                List<WriteResponseHandlerWrapper> wrappers = new ArrayList<>(mutations.size());
+                //non-local mutations rely on the base mutation commit-log entry for eventual consistency
+                Set<Mutation> nonLocalMutations = new HashSet<>(mutations);
+                Token baseToken = StorageService.instance.getTokenMetadata().partitioner.getToken(dataKey);
+
+                ConsistencyLevel consistencyLevel = ConsistencyLevel.ONE;
+
+                //Since the base -> view replication is 1:1 we only need to store the BL locally
+                final Collection<InetAddress> batchlogEndpoints = Collections.singleton(FBUtilities.getBroadcastAddress());
+                BatchlogResponseHandler.BatchlogCleanup cleanup = new BatchlogResponseHandler.BatchlogCleanup(mutations.size(), () -> asyncRemoveFromBatchlog(batchlogEndpoints, batchUUID));
+
+                // add a handler for each mutation - includes checking availability, but doesn't initiate any writes, yet
+                for (Mutation mutation : mutations)
+                {
+                    String keyspaceName = mutation.getKeyspaceName();
+                    Token tk = mutation.key().getToken();
+                    Optional<InetAddress> pairedEndpoint = ViewUtils.getViewNaturalEndpoint(keyspaceName, baseToken, tk);
+                    Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspaceName);
+
+                    // if there are no paired endpoints there are probably range movements going on, so we write to the local batchlog to replay later
+                    if (!pairedEndpoint.isPresent())
+                    {
+                        if (pendingEndpoints.isEmpty())
+                            logger.warn("Received base materialized view mutation for key {} that does not belong " +
+                                        "to this node. There is probably a range movement happening (move or decommission)," +
+                                        "but this node hasn't updated its ring metadata yet. Adding mutation to " +
+                                        "local batchlog to be replayed later.",
+                                        mutation.key());
+                        continue;
+                    }
+
+                    // When local node is the endpoint we can just apply the mutation locally,
+                    // unless there are pending endpoints, in which case we want to do an ordinary
+                    // write so the view mutation is sent to the pending endpoint
+                    if (pairedEndpoint.get().equals(FBUtilities.getBroadcastAddress()) && StorageService.instance.isJoined()
+                        && pendingEndpoints.isEmpty())
+                        try
+                        {
+                            mutation.apply(writeCommitLog);
+                            nonLocalMutations.remove(mutation);
+                            cleanup.ackMutation();
+                        }
+                        catch (Exception exc)
+                        {
+                            logger.error("Error applying local view update to keyspace {}: {}", mutation.getKeyspaceName(), mutation);
+                            throw exc;
+                        }
+                    else
+                    {
+                        wrappers.add(wrapViewBatchResponseHandler(mutation,
+                                                                  consistencyLevel,
+                                                                  consistencyLevel,
+                                                                  Collections.singletonList(pairedEndpoint.get()),
+                                                                  baseComplete,
+                                                                  WriteType.BATCH,
+                                                                  cleanup));
+                    }
+                }
+
+                // Apply to local batchlog memtable in this thread
+                if (!nonLocalMutations.isEmpty())
+                    BatchlogManager.store(Batch.createLocal(batchUUID, FBUtilities.timestampMicros(), nonLocalMutations), writeCommitLog);
+
+                // Perform remote writes
+                if (!wrappers.isEmpty())
+                    asyncWriteBatchedMutations(wrappers, localDataCenter, Stage.VIEW_MUTATION);
             }
         }
-
-        Tracing.trace("Wrote hint to satisfy CL.ANY after no replicas acknowledged the write");
+        finally
+        {
+            viewWriteMetrics.addNano(System.nanoTime() - startTime);
+        }
     }
 
     @SuppressWarnings("unchecked")
@@ -658,12 +808,19 @@
     {
         Collection<Mutation> augmented = TriggerExecutor.instance.execute(mutations);
 
+        boolean updatesView = Keyspace.open(mutations.iterator().next().getKeyspaceName())
+                              .viewManager
+                              .updatesAffectView(mutations, true);
+
         if (augmented != null)
-            mutateAtomically(augmented, consistencyLevel);
-        else if (mutateAtomically)
-            mutateAtomically((Collection<Mutation>) mutations, consistencyLevel);
+            mutateAtomically(augmented, consistencyLevel, updatesView);
         else
-            mutate(mutations, consistencyLevel);
+        {
+            if (mutateAtomically || updatesView)
+                mutateAtomically((Collection<Mutation>) mutations, consistencyLevel, updatesView);
+            else
+                mutate(mutations, consistencyLevel);
+        }
     }
 
     /**
@@ -674,8 +831,11 @@
      *
      * @param mutations the Mutations to be applied across the replicas
      * @param consistency_level the consistency level for the operation
+     * @param requireQuorumForRemove at least a quorum of nodes will see update before deleting batchlog
      */
-    public static void mutateAtomically(Collection<Mutation> mutations, ConsistencyLevel consistency_level)
+    public static void mutateAtomically(Collection<Mutation> mutations,
+                                        ConsistencyLevel consistency_level,
+                                        boolean requireQuorumForRemove)
     throws UnavailableException, OverloadedException, WriteTimeoutException
     {
         Tracing.trace("Determining replicas for atomic batch");
@@ -686,25 +846,43 @@
 
         try
         {
+
+            // If we are requiring quorum nodes for removal, we upgrade consistency level to QUORUM unless we already
+            // require ALL, or EACH_QUORUM. This is so that *at least* QUORUM nodes see the update.
+            ConsistencyLevel batchConsistencyLevel = requireQuorumForRemove
+                                                     ? ConsistencyLevel.QUORUM
+                                                     : consistency_level;
+
+            switch (consistency_level)
+            {
+                case ALL:
+                case EACH_QUORUM:
+                    batchConsistencyLevel = consistency_level;
+            }
+
+            final BatchlogEndpoints batchlogEndpoints = getBatchlogEndpoints(localDataCenter, batchConsistencyLevel);
+            final UUID batchUUID = UUIDGen.getTimeUUID();
+            BatchlogResponseHandler.BatchlogCleanup cleanup = new BatchlogResponseHandler.BatchlogCleanup(mutations.size(),
+                                                                                                          () -> asyncRemoveFromBatchlog(batchlogEndpoints, batchUUID));
+
             // add a handler for each mutation - includes checking availability, but doesn't initiate any writes, yet
             for (Mutation mutation : mutations)
             {
-                WriteResponseHandlerWrapper wrapper = wrapResponseHandler(mutation, consistency_level, WriteType.BATCH);
+                WriteResponseHandlerWrapper wrapper = wrapBatchResponseHandler(mutation,
+                                                                               consistency_level,
+                                                                               batchConsistencyLevel,
+                                                                               WriteType.BATCH,
+                                                                               cleanup);
                 // exit early if we can't fulfill the CL at this time.
                 wrapper.handler.assureSufficientLiveNodes();
                 wrappers.add(wrapper);
             }
 
             // write to the batchlog
-            Collection<InetAddress> batchlogEndpoints = getBatchlogEndpoints(localDataCenter, consistency_level);
-            UUID batchUUID = UUIDGen.getTimeUUID();
             syncWriteToBatchlog(mutations, batchlogEndpoints, batchUUID);
 
             // now actually perform the writes and wait for them to complete
-            syncWriteBatchedMutations(wrappers, localDataCenter);
-
-            // remove the batchlog entries asynchronously
-            asyncRemoveFromBatchlog(batchlogEndpoints, batchUUID);
+            syncWriteBatchedMutations(wrappers, localDataCenter, Stage.MUTATION);
         }
         catch (UnavailableException e)
         {
@@ -730,71 +908,99 @@
         }
     }
 
-    private static void syncWriteToBatchlog(Collection<Mutation> mutations, Collection<InetAddress> endpoints, UUID uuid)
+    public static boolean canDoLocalRequest(InetAddress replica)
+    {
+        return replica.equals(FBUtilities.getBroadcastAddress());
+    }
+
+    private static void syncWriteToBatchlog(Collection<Mutation> mutations, BatchlogEndpoints endpoints, UUID uuid)
     throws WriteTimeoutException, WriteFailureException
     {
-        AbstractWriteResponseHandler<IMutation> handler = new WriteResponseHandler<>(endpoints,
-                                                                        Collections.<InetAddress>emptyList(),
-                                                                        ConsistencyLevel.ONE,
-                                                                        Keyspace.open(SystemKeyspace.NAME),
-                                                                        null,
-                                                                        WriteType.BATCH_LOG);
+        WriteResponseHandler<?> handler = new WriteResponseHandler<>(endpoints.all,
+                                                                     Collections.<InetAddress>emptyList(),
+                                                                     endpoints.all.size() == 1 ? ConsistencyLevel.ONE : ConsistencyLevel.TWO,
+                                                                     Keyspace.open(SystemKeyspace.NAME),
+                                                                     null,
+                                                                     WriteType.BATCH_LOG);
 
-        MessageOut<Mutation> message = BatchlogManager.getBatchlogMutationFor(mutations, uuid, MessagingService.current_version)
-                                                      .createMessage();
-        for (InetAddress target : endpoints)
-        {
-            int targetVersion = MessagingService.instance().getVersion(target);
-            if (target.equals(FBUtilities.getBroadcastAddress()))
-            {
-                insertLocal(message.payload, handler);
-            }
-            else if (targetVersion == MessagingService.current_version)
-            {
-                MessagingService.instance().sendRR(message, target, handler, false);
-            }
-            else
-            {
-                MessagingService.instance().sendRR(BatchlogManager.getBatchlogMutationFor(mutations, uuid, targetVersion)
-                                                                  .createMessage(),
-                                                   target,
-                                                   handler,
-                                                   false);
-            }
-        }
+        Batch batch = Batch.createLocal(uuid, FBUtilities.timestampMicros(), mutations);
+
+        if (!endpoints.current.isEmpty())
+            syncWriteToBatchlog(handler, batch, endpoints.current);
+
+        if (!endpoints.legacy.isEmpty())
+            LegacyBatchlogMigrator.syncWriteToBatchlog(handler, batch, endpoints.legacy);
 
         handler.get();
     }
 
-    private static void asyncRemoveFromBatchlog(Collection<InetAddress> endpoints, UUID uuid)
+    private static void syncWriteToBatchlog(WriteResponseHandler<?> handler, Batch batch, Collection<InetAddress> endpoints)
+    throws WriteTimeoutException, WriteFailureException
     {
-        AbstractWriteResponseHandler<IMutation> handler = new WriteResponseHandler<>(endpoints,
-                                                                        Collections.<InetAddress>emptyList(),
-                                                                        ConsistencyLevel.ANY,
-                                                                        Keyspace.open(SystemKeyspace.NAME),
-                                                                        null,
-                                                                        WriteType.SIMPLE);
-        Mutation mutation = new Mutation(SystemKeyspace.NAME, UUIDType.instance.decompose(uuid));
-        mutation.delete(SystemKeyspace.BATCHLOG, FBUtilities.timestampMicros());
-        MessageOut<Mutation> message = mutation.createMessage();
+        MessageOut<Batch> message = new MessageOut<>(MessagingService.Verb.BATCH_STORE, batch, Batch.serializer);
+
         for (InetAddress target : endpoints)
         {
-            if (target.equals(FBUtilities.getBroadcastAddress()))
-                insertLocal(message.payload, handler);
+            logger.trace("Sending batchlog store request {} to {} for {} mutations", batch.id, target, batch.size());
+
+            if (canDoLocalRequest(target))
+                performLocally(Stage.MUTATION, () -> BatchlogManager.store(batch), handler);
             else
-                MessagingService.instance().sendRR(message, target, handler, false);
+                MessagingService.instance().sendRR(message, target, handler);
         }
     }
 
-    private static void syncWriteBatchedMutations(List<WriteResponseHandlerWrapper> wrappers, String localDataCenter)
+    private static void asyncRemoveFromBatchlog(BatchlogEndpoints endpoints, UUID uuid)
+    {
+        if (!endpoints.current.isEmpty())
+            asyncRemoveFromBatchlog(endpoints.current, uuid);
+
+        if (!endpoints.legacy.isEmpty())
+            LegacyBatchlogMigrator.asyncRemoveFromBatchlog(endpoints.legacy, uuid);
+    }
+
+    private static void asyncRemoveFromBatchlog(Collection<InetAddress> endpoints, UUID uuid)
+    {
+        MessageOut<UUID> message = new MessageOut<>(MessagingService.Verb.BATCH_REMOVE, uuid, UUIDSerializer.serializer);
+        for (InetAddress target : endpoints)
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Sending batchlog remove request {} to {}", uuid, target);
+
+            if (canDoLocalRequest(target))
+                performLocally(Stage.MUTATION, () -> BatchlogManager.remove(uuid));
+            else
+                MessagingService.instance().sendOneWay(message, target);
+        }
+    }
+
+    private static void asyncWriteBatchedMutations(List<WriteResponseHandlerWrapper> wrappers, String localDataCenter, Stage stage)
+    {
+        for (WriteResponseHandlerWrapper wrapper : wrappers)
+        {
+            Iterable<InetAddress> endpoints = Iterables.concat(wrapper.handler.naturalEndpoints, wrapper.handler.pendingEndpoints);
+
+            try
+            {
+                sendToHintedEndpoints(wrapper.mutation, endpoints, wrapper.handler, localDataCenter, stage);
+            }
+            catch (OverloadedException | WriteTimeoutException e)
+            {
+                wrapper.handler.onFailure(FBUtilities.getBroadcastAddress());
+            }
+        }
+    }
+
+    private static void syncWriteBatchedMutations(List<WriteResponseHandlerWrapper> wrappers, String localDataCenter, Stage stage)
     throws WriteTimeoutException, OverloadedException
     {
         for (WriteResponseHandlerWrapper wrapper : wrappers)
         {
             Iterable<InetAddress> endpoints = Iterables.concat(wrapper.handler.naturalEndpoints, wrapper.handler.pendingEndpoints);
-            sendToHintedEndpoints(wrapper.mutation, endpoints, wrapper.handler, localDataCenter);
+            sendToHintedEndpoints(wrapper.mutation, endpoints, wrapper.handler, localDataCenter, stage);
         }
 
+
         for (WriteResponseHandlerWrapper wrapper : wrappers)
             wrapper.handler.get();
     }
@@ -824,7 +1030,7 @@
         String keyspaceName = mutation.getKeyspaceName();
         AbstractReplicationStrategy rs = Keyspace.open(keyspaceName).getReplicationStrategy();
 
-        Token tk = StorageService.getPartitioner().getToken(mutation.key());
+        Token tk = mutation.key().getToken();
         List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(keyspaceName, tk);
         Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspaceName);
 
@@ -837,25 +1043,56 @@
         return responseHandler;
     }
 
-    // same as above except does not initiate writes (but does perform availability checks).
-    private static WriteResponseHandlerWrapper wrapResponseHandler(Mutation mutation, ConsistencyLevel consistency_level, WriteType writeType)
+    // same as performWrites except does not initiate writes (but does perform availability checks).
+    private static WriteResponseHandlerWrapper wrapBatchResponseHandler(Mutation mutation,
+                                                                        ConsistencyLevel consistency_level,
+                                                                        ConsistencyLevel batchConsistencyLevel,
+                                                                        WriteType writeType,
+                                                                        BatchlogResponseHandler.BatchlogCleanup cleanup)
     {
-        AbstractReplicationStrategy rs = Keyspace.open(mutation.getKeyspaceName()).getReplicationStrategy();
+        Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName());
+        AbstractReplicationStrategy rs = keyspace.getReplicationStrategy();
         String keyspaceName = mutation.getKeyspaceName();
-        Token tk = StorageService.getPartitioner().getToken(mutation.key());
+        Token tk = mutation.key().getToken();
         List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(keyspaceName, tk);
         Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspaceName);
-        AbstractWriteResponseHandler<IMutation> responseHandler = rs.getWriteResponseHandler(naturalEndpoints, pendingEndpoints, consistency_level, null, writeType);
-        return new WriteResponseHandlerWrapper(responseHandler, mutation);
+        AbstractWriteResponseHandler<IMutation> writeHandler = rs.getWriteResponseHandler(naturalEndpoints, pendingEndpoints, consistency_level, null, writeType);
+        BatchlogResponseHandler<IMutation> batchHandler = new BatchlogResponseHandler<>(writeHandler, batchConsistencyLevel.blockFor(keyspace), cleanup);
+        return new WriteResponseHandlerWrapper(batchHandler, mutation);
+    }
+
+    /**
+     * Same as performWrites except does not initiate writes (but does perform availability checks).
+     * Keeps track of ViewWriteMetrics
+     */
+    private static WriteResponseHandlerWrapper wrapViewBatchResponseHandler(Mutation mutation,
+                                                                            ConsistencyLevel consistency_level,
+                                                                            ConsistencyLevel batchConsistencyLevel,
+                                                                            List<InetAddress> naturalEndpoints,
+                                                                            AtomicLong baseComplete,
+                                                                            WriteType writeType,
+                                                                            BatchlogResponseHandler.BatchlogCleanup cleanup)
+    {
+        Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName());
+        AbstractReplicationStrategy rs = keyspace.getReplicationStrategy();
+        String keyspaceName = mutation.getKeyspaceName();
+        Token tk = mutation.key().getToken();
+        Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspaceName);
+        AbstractWriteResponseHandler<IMutation> writeHandler = rs.getWriteResponseHandler(naturalEndpoints, pendingEndpoints, consistency_level, () -> {
+            long delay = Math.max(0, System.currentTimeMillis() - baseComplete.get());
+            viewWriteMetrics.viewWriteLatency.update(delay, TimeUnit.MILLISECONDS);
+        }, writeType);
+        BatchlogResponseHandler<IMutation> batchHandler = new ViewWriteMetricsWrapped(writeHandler, batchConsistencyLevel.blockFor(keyspace), cleanup);
+        return new WriteResponseHandlerWrapper(batchHandler, mutation);
     }
 
     // used by atomic_batch_mutate to decouple availability check from the write itself, caches consistency level and endpoints.
     private static class WriteResponseHandlerWrapper
     {
-        final AbstractWriteResponseHandler<IMutation> handler;
+        final BatchlogResponseHandler<IMutation> handler;
         final Mutation mutation;
 
-        WriteResponseHandlerWrapper(AbstractWriteResponseHandler<IMutation> handler, Mutation mutation)
+        WriteResponseHandlerWrapper(BatchlogResponseHandler<IMutation> handler, Mutation mutation)
         {
             this.handler = handler;
             this.mutation = mutation;
@@ -863,13 +1100,38 @@
     }
 
     /*
+     * A class to filter batchlog endpoints into legacy endpoints (version < 3.0) or not.
+     */
+    private static final class BatchlogEndpoints
+    {
+        public final Collection<InetAddress> all;
+        public final Collection<InetAddress> current;
+        public final Collection<InetAddress> legacy;
+
+        BatchlogEndpoints(Collection<InetAddress> endpoints)
+        {
+            all = endpoints;
+            current = new ArrayList<>(2);
+            legacy = new ArrayList<>(2);
+
+            for (InetAddress ep : endpoints)
+            {
+                if (MessagingService.instance().getVersion(ep) >= MessagingService.VERSION_30)
+                    current.add(ep);
+                else
+                    legacy.add(ep);
+            }
+        }
+    }
+
+    /*
      * Replicas are picked manually:
      * - replicas should be alive according to the failure detector
      * - replicas should be in the local datacenter
      * - choose min(2, number of qualifying candiates above)
      * - allow the local node to be the only replica only if it's a single-node DC
      */
-    private static Collection<InetAddress> getBatchlogEndpoints(String localDataCenter, ConsistencyLevel consistencyLevel)
+    private static BatchlogEndpoints getBatchlogEndpoints(String localDataCenter, ConsistencyLevel consistencyLevel)
     throws UnavailableException
     {
         TokenMetadata.Topology topology = StorageService.instance.getTokenMetadata().cachedOnlyTokenMap().getTopology();
@@ -880,12 +1142,12 @@
         if (chosenEndpoints.isEmpty())
         {
             if (consistencyLevel == ConsistencyLevel.ANY)
-                return Collections.singleton(FBUtilities.getBroadcastAddress());
+                return new BatchlogEndpoints(Collections.singleton(FBUtilities.getBroadcastAddress()));
 
             throw new UnavailableException(ConsistencyLevel.ONE, 1, 0);
         }
 
-        return chosenEndpoints;
+        return new BatchlogEndpoints(chosenEndpoints);
     }
 
     /**
@@ -908,7 +1170,8 @@
     public static void sendToHintedEndpoints(final Mutation mutation,
                                              Iterable<InetAddress> targets,
                                              AbstractWriteResponseHandler<IMutation> responseHandler,
-                                             String localDataCenter)
+                                             String localDataCenter,
+                                             Stage stage)
     throws OverloadedException
     {
         // extra-datacenter replicas, grouped by dc
@@ -917,7 +1180,7 @@
         MessageOut<Mutation> message = null;
 
         boolean insertLocal = false;
-
+        ArrayList<InetAddress> endpointsToHint = null;
 
         for (InetAddress destination : targets)
         {
@@ -925,10 +1188,11 @@
 
             if (FailureDetector.instance.isAlive(destination))
             {
-                if (destination.equals(FBUtilities.getBroadcastAddress()))
+                if (canDoLocalRequest(destination))
                 {
                     insertLocal = true;
-                } else
+                }
+                else
                 {
                     // belongs on a different server
                     if (message == null)
@@ -939,31 +1203,37 @@
                     if (localDataCenter.equals(dc))
                     {
                         MessagingService.instance().sendRR(message, destination, responseHandler, true);
-                    } else
+                    }
+                    else
                     {
                         Collection<InetAddress> messages = (dcGroups != null) ? dcGroups.get(dc) : null;
                         if (messages == null)
                         {
-                            messages = new ArrayList<InetAddress>(3); // most DCs will have <= 3 replicas
+                            messages = new ArrayList<>(3); // most DCs will have <= 3 replicas
                             if (dcGroups == null)
-                                dcGroups = new HashMap<String, Collection<InetAddress>>();
+                                dcGroups = new HashMap<>();
                             dcGroups.put(dc, messages);
                         }
                         messages.add(destination);
                     }
                 }
-            } else
+            }
+            else
             {
-                if (!shouldHint(destination))
-                    continue;
-
-                // Schedule a local hint
-                submitHint(mutation, destination, responseHandler);
+                if (shouldHint(destination))
+                {
+                    if (endpointsToHint == null)
+                        endpointsToHint = new ArrayList<>(Iterables.size(targets));
+                    endpointsToHint.add(destination);
+                }
             }
         }
 
+        if (endpointsToHint != null)
+            submitHint(mutation, endpointsToHint, responseHandler);
+
         if (insertLocal)
-            insertLocal(mutation, responseHandler);
+            performLocally(stage, mutation::apply, responseHandler);
 
         if (dcGroups != null)
         {
@@ -976,7 +1246,7 @@
         }
     }
 
-    private static void checkHintOverload(InetAddress destination) throws OverloadedException
+    private static void checkHintOverload(InetAddress destination)
     {
         // avoid OOMing due to excess hints.  we need to do this check even for "live" nodes, since we can
         // still generate hints for those if it's overloaded or simply dead but not yet known-to-be-dead.
@@ -992,71 +1262,6 @@
         }
     }
 
-    private static AtomicInteger getHintsInProgressFor(InetAddress destination)
-    {
-        try
-        {
-            return hintsInProgress.load(destination);
-        }
-        catch (Exception e)
-        {
-            throw new AssertionError(e);
-        }
-    }
-
-    public static Future<Void> submitHint(final Mutation mutation,
-                                          final InetAddress target,
-                                          final AbstractWriteResponseHandler<IMutation> responseHandler)
-    {
-        // local write that time out should be handled by LocalMutationRunnable
-        assert !target.equals(FBUtilities.getBroadcastAddress()) : target;
-
-        HintRunnable runnable = new HintRunnable(target)
-        {
-            public void runMayThrow()
-            {
-                int ttl = HintedHandOffManager.calculateHintTTL(mutation);
-                if (ttl > 0)
-                {
-                    logger.trace("Adding hint for {}", target);
-                    writeHintForMutation(mutation, System.currentTimeMillis(), ttl, target);
-                    // Notify the handler only for CL == ANY
-                    if (responseHandler != null && responseHandler.consistencyLevel == ConsistencyLevel.ANY)
-                        responseHandler.response(null);
-                } else
-                {
-                    logger.debug("Skipped writing hint for {} (ttl {})", target, ttl);
-                }
-            }
-        };
-
-        return submitHint(runnable);
-    }
-
-    private static Future<Void> submitHint(HintRunnable runnable)
-    {
-        StorageMetrics.totalHintsInProgress.inc();
-        getHintsInProgressFor(runnable.target).incrementAndGet();
-        return (Future<Void>) StageManager.getStage(Stage.MUTATION).submit(runnable);
-    }
-
-    /**
-     * @param now current time in milliseconds - relevant for hint replay handling of truncated CFs
-     */
-    public static void writeHintForMutation(Mutation mutation, long now, int ttl, InetAddress target)
-    {
-        assert ttl > 0;
-
-        UUID hostId = StorageService.instance.getTokenMetadata().getHostId(target);
-        if (hostId != null)
-        {
-            HintedHandOffManager.instance.hintFor(mutation, now, ttl, Pair.create(target, hostId)).apply();
-            StorageMetrics.totalHints.inc();
-        }
-        else
-            logger.debug("Discarding hint for endpoint not part of ring: {}", target);
-    }
-
     private static void sendMessagesToNonlocalDC(MessageOut<? extends IMutation> message,
                                                  Collection<InetAddress> targets,
                                                  AbstractWriteResponseHandler<IMutation> handler)
@@ -1093,22 +1298,46 @@
         }
     }
 
-    private static void insertLocal(final Mutation mutation, final AbstractWriteResponseHandler<IMutation> responseHandler)
+    private static void performLocally(Stage stage, final Runnable runnable)
     {
-
-        StageManager.getStage(Stage.MUTATION).maybeExecuteImmediately(new LocalMutationRunnable()
+        StageManager.getStage(stage).maybeExecuteImmediately(new LocalMutationRunnable()
         {
             public void runMayThrow()
             {
                 try
                 {
-                    mutation.apply();
-                    responseHandler.response(null);
+                    runnable.run();
                 }
                 catch (Exception ex)
                 {
-                    logger.error("Failed to apply mutation locally : {}", ex.getMessage());
-                    responseHandler.onFailure(FBUtilities.getBroadcastAddress());
+                    logger.error("Failed to apply mutation locally : {}", ex);
+                }
+            }
+
+            @Override
+            protected Verb verb()
+            {
+                return MessagingService.Verb.MUTATION;
+            }
+        });
+    }
+
+    private static void performLocally(Stage stage, final Runnable runnable, final IAsyncCallbackWithFailure<?> handler)
+    {
+        StageManager.getStage(stage).maybeExecuteImmediately(new LocalMutationRunnable()
+        {
+            public void runMayThrow()
+            {
+                try
+                {
+                    runnable.run();
+                    handler.response(null);
+                }
+                catch (Exception ex)
+                {
+                    if (!(ex instanceof WriteTimeoutException))
+                        logger.error("Failed to apply mutation locally : {}", ex);
+                    handler.onFailure(FBUtilities.getBroadcastAddress());
                 }
             }
 
@@ -1147,7 +1376,7 @@
             // Exit now if we can't fulfill the CL here instead of forwarding to the leader replica
             String keyspaceName = cm.getKeyspaceName();
             AbstractReplicationStrategy rs = Keyspace.open(keyspaceName).getReplicationStrategy();
-            Token tk = StorageService.getPartitioner().getToken(cm.key());
+            Token tk = cm.key().getToken();
             List<InetAddress> naturalEndpoints = StorageService.instance.getNaturalEndpoints(keyspaceName, tk);
             Collection<InetAddress> pendingEndpoints = StorageService.instance.getTokenMetadata().pendingEndpointsFor(tk, keyspaceName);
 
@@ -1172,31 +1401,38 @@
      * is unclear we want to mix those latencies with read latencies, so this
      * may be a bit involved.
      */
-    private static InetAddress findSuitableEndpoint(String keyspaceName, ByteBuffer key, String localDataCenter, ConsistencyLevel cl) throws UnavailableException
+    private static InetAddress findSuitableEndpoint(String keyspaceName, DecoratedKey key, String localDataCenter, ConsistencyLevel cl) throws UnavailableException
     {
         Keyspace keyspace = Keyspace.open(keyspaceName);
         IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
-        List<InetAddress> endpoints = StorageService.instance.getLiveNaturalEndpoints(keyspace, key);
+        List<InetAddress> endpoints = new ArrayList<>();
+        StorageService.instance.getLiveNaturalEndpoints(keyspace, key, endpoints);
+
+        // CASSANDRA-13043: filter out those endpoints not accepting clients yet, maybe because still bootstrapping
+        endpoints.removeIf(endpoint -> !StorageService.instance.isRpcReady(endpoint));
+
+        // TODO have a way to compute the consistency level
         if (endpoints.isEmpty())
-            // TODO have a way to compute the consistency level
             throw new UnavailableException(cl, cl.blockFor(keyspace), 0);
 
-        List<InetAddress> localEndpoints = new ArrayList<InetAddress>();
+        List<InetAddress> localEndpoints = new ArrayList<>(endpoints.size());
+
         for (InetAddress endpoint : endpoints)
-        {
             if (snitch.getDatacenter(endpoint).equals(localDataCenter))
                 localEndpoints.add(endpoint);
-        }
+
         if (localEndpoints.isEmpty())
         {
+            // If the consistency required is local then we should not involve other DCs
+            if (cl.isDatacenterLocal())
+                throw new UnavailableException(cl, cl.blockFor(keyspace), 0);
+
             // No endpoint in local DC, pick the closest endpoint according to the snitch
             snitch.sortByProximity(FBUtilities.getBroadcastAddress(), endpoints);
             return endpoints.get(0);
         }
-        else
-        {
-            return localEndpoints.get(ThreadLocalRandom.current().nextInt(localEndpoints.size()));
-        }
+
+        return localEndpoints.get(ThreadLocalRandom.current().nextInt(localEndpoints.size()));
     }
 
     // Must be called on a replica of the mutation. This replica becomes the
@@ -1233,62 +1469,74 @@
                 Set<InetAddress> remotes = Sets.difference(ImmutableSet.copyOf(targets),
                                                            ImmutableSet.of(FBUtilities.getBroadcastAddress()));
                 if (!remotes.isEmpty())
-                    sendToHintedEndpoints(result, remotes, responseHandler, localDataCenter);
+                    sendToHintedEndpoints(result, remotes, responseHandler, localDataCenter, Stage.COUNTER_MUTATION);
             }
         };
     }
 
-    private static boolean systemKeyspaceQuery(List<ReadCommand> cmds)
+    private static boolean systemKeyspaceQuery(List<? extends ReadCommand> cmds)
     {
         for (ReadCommand cmd : cmds)
-            if (!cmd.ksName.equals(SystemKeyspace.NAME))
+            if (!Schema.isLocalSystemKeyspace(cmd.metadata().ksName))
                 return false;
         return true;
     }
 
-    public static List<Row> read(List<ReadCommand> commands, ConsistencyLevel consistencyLevel)
+    public static RowIterator readOne(SinglePartitionReadCommand command, ConsistencyLevel consistencyLevel)
+    throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException
+    {
+        return readOne(command, consistencyLevel, null);
+    }
+
+    public static RowIterator readOne(SinglePartitionReadCommand command, ConsistencyLevel consistencyLevel, ClientState state)
+    throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException
+    {
+        return PartitionIterators.getOnlyElement(read(SinglePartitionReadCommand.Group.one(command), consistencyLevel, state), command);
+    }
+
+    public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel)
     throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException
     {
         // When using serial CL, the ClientState should be provided
         assert !consistencyLevel.isSerialConsistency();
-        return read(commands, consistencyLevel, null);
+        return read(group, consistencyLevel, null);
     }
 
     /**
      * Performs the actual reading of a row out of the StorageService, fetching
      * a specific set of column names from a given column family.
      */
-    public static List<Row> read(List<ReadCommand> commands, ConsistencyLevel consistencyLevel, ClientState state)
+    public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, ClientState state)
     throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException
     {
-        if (StorageService.instance.isBootstrapMode() && !systemKeyspaceQuery(commands))
+        if (StorageService.instance.isBootstrapMode() && !systemKeyspaceQuery(group.commands))
         {
             readMetrics.unavailables.mark();
             throw new IsBootstrappingException();
         }
 
         return consistencyLevel.isSerialConsistency()
-             ? readWithPaxos(commands, consistencyLevel, state)
-             : readRegular(commands, consistencyLevel);
+             ? readWithPaxos(group, consistencyLevel, state)
+             : readRegular(group, consistencyLevel);
     }
 
-    private static List<Row> readWithPaxos(List<ReadCommand> commands, ConsistencyLevel consistencyLevel, ClientState state)
+    private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, ClientState state)
     throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException
     {
         assert state != null;
+        if (group.commands.size() > 1)
+            throw new InvalidRequestException("SERIAL/LOCAL_SERIAL consistency may only be requested for one partition at a time");
 
         long start = System.nanoTime();
-        List<Row> rows = null;
+        SinglePartitionReadCommand command = group.commands.get(0);
+        CFMetaData metadata = command.metadata();
+        DecoratedKey key = command.partitionKey();
 
+        PartitionIterator result = null;
         try
         {
             // make sure any in-progress paxos writes are done (i.e., committed to a majority of replicas), before performing a quorum read
-            if (commands.size() > 1)
-                throw new InvalidRequestException("SERIAL/LOCAL_SERIAL consistency may only be requested for one row at a time");
-            ReadCommand command = commands.get(0);
-
-            CFMetaData metadata = Schema.instance.getCFMetaData(command.ksName, command.cfName);
-            Pair<List<InetAddress>, Integer> p = getPaxosParticipants(command.ksName, command.key, consistencyLevel);
+            Pair<List<InetAddress>, Integer> p = getPaxosParticipants(metadata, key, consistencyLevel);
             List<InetAddress> liveEndpoints = p.left;
             int requiredParticipants = p.right;
 
@@ -1296,22 +1544,23 @@
             final ConsistencyLevel consistencyForCommitOrFetch = consistencyLevel == ConsistencyLevel.LOCAL_SERIAL
                                                                                    ? ConsistencyLevel.LOCAL_QUORUM
                                                                                    : ConsistencyLevel.QUORUM;
+
             try
             {
-                final Pair<UUID, Integer> pair = beginAndRepairPaxos(start, command.key, metadata, liveEndpoints, requiredParticipants, consistencyLevel, consistencyForCommitOrFetch, false, state);
+                final Pair<UUID, Integer> pair = beginAndRepairPaxos(start, key, metadata, liveEndpoints, requiredParticipants, consistencyLevel, consistencyForCommitOrFetch, false, state);
                 if (pair.right > 0)
                     casReadMetrics.contention.update(pair.right);
             }
             catch (WriteTimeoutException e)
             {
-                throw new ReadTimeoutException(consistencyLevel, 0, consistencyLevel.blockFor(Keyspace.open(command.ksName)), false);
+                throw new ReadTimeoutException(consistencyLevel, 0, consistencyLevel.blockFor(Keyspace.open(metadata.ksName)), false);
             }
             catch (WriteFailureException e)
             {
                 throw new ReadFailureException(consistencyLevel, e.received, e.failures, e.blockFor, false);
             }
 
-            rows = fetchRows(commands, consistencyForCommitOrFetch);
+            result = fetchRows(group.commands, consistencyForCommitOrFetch);
         }
         catch (UnavailableException e)
         {
@@ -1336,23 +1585,28 @@
             long latency = System.nanoTime() - start;
             readMetrics.addNano(latency);
             casReadMetrics.addNano(latency);
-            // TODO avoid giving every command the same latency number.  Can fix this in CASSADRA-5329
-            for (ReadCommand command : commands)
-                Keyspace.open(command.ksName).getColumnFamilyStore(command.cfName).metric.coordinatorReadLatency.update(latency, TimeUnit.NANOSECONDS);
+            Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).metric.coordinatorReadLatency.update(latency, TimeUnit.NANOSECONDS);
         }
 
-        return rows;
+        return result;
     }
 
-    private static List<Row> readRegular(List<ReadCommand> commands, ConsistencyLevel consistencyLevel)
+    @SuppressWarnings("resource")
+    private static PartitionIterator readRegular(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel)
     throws UnavailableException, ReadFailureException, ReadTimeoutException
     {
         long start = System.nanoTime();
-        List<Row> rows = null;
-
         try
         {
-            rows = fetchRows(commands, consistencyLevel);
+            PartitionIterator result = fetchRows(group.commands, consistencyLevel);
+            // Note that the only difference between the command in a group must be the partition key on which
+            // they applied.
+            boolean enforceStrictLiveness = group.commands.get(0).metadata().enforceStrictLiveness();
+            // If we have more than one command, then despite each read command honoring the limit, the total result
+            // might not honor it and so we should enforce it
+            if (group.commands.size() > 1)
+                result = group.limits().filter(result, group.nowInSec(), group.selectsFullPartition(), enforceStrictLiveness);
+            return result;
         }
         catch (UnavailableException e)
         {
@@ -1374,11 +1628,9 @@
             long latency = System.nanoTime() - start;
             readMetrics.addNano(latency);
             // TODO avoid giving every command the same latency number.  Can fix this in CASSADRA-5329
-            for (ReadCommand command : commands)
-                Keyspace.open(command.ksName).getColumnFamilyStore(command.cfName).metric.coordinatorReadLatency.update(latency, TimeUnit.NANOSECONDS);
+            for (ReadCommand command : group.commands)
+                Keyspace.openAndGetStore(command.metadata()).metric.coordinatorReadLatency.update(latency, TimeUnit.NANOSECONDS);
         }
-
-        return rows;
     }
 
     /**
@@ -1392,182 +1644,142 @@
      * 4. If the digests (if any) match the data return the data
      * 5. else carry out read repair by getting data from all the nodes.
      */
-    private static List<Row> fetchRows(List<ReadCommand> initialCommands, ConsistencyLevel consistencyLevel)
+    private static PartitionIterator fetchRows(List<SinglePartitionReadCommand> commands, ConsistencyLevel consistencyLevel)
     throws UnavailableException, ReadFailureException, ReadTimeoutException
     {
-        List<Row> rows = new ArrayList<>(initialCommands.size());
-        // (avoid allocating a new list in the common case of nothing-to-retry)
-        List<ReadCommand> commandsToRetry = Collections.emptyList();
+        int cmdCount = commands.size();
 
-        do
+        SinglePartitionReadLifecycle[] reads = new SinglePartitionReadLifecycle[cmdCount];
+        for (int i = 0; i < cmdCount; i++)
+            reads[i] = new SinglePartitionReadLifecycle(commands.get(i), consistencyLevel);
+
+        for (int i = 0; i < cmdCount; i++)
+            reads[i].doInitialQueries();
+
+        for (int i = 0; i < cmdCount; i++)
+            reads[i].maybeTryAdditionalReplicas();
+
+        for (int i = 0; i < cmdCount; i++)
+            reads[i].awaitResultsAndRetryOnDigestMismatch();
+
+        for (int i = 0; i < cmdCount; i++)
+            if (!reads[i].isDone())
+                reads[i].maybeAwaitFullDataRead();
+
+        List<PartitionIterator> results = new ArrayList<>(cmdCount);
+        for (int i = 0; i < cmdCount; i++)
         {
-            List<ReadCommand> commands = commandsToRetry.isEmpty() ? initialCommands : commandsToRetry;
-            AbstractReadExecutor[] readExecutors = new AbstractReadExecutor[commands.size()];
+            assert reads[i].isDone();
+            results.add(reads[i].getResult());
+        }
 
-            if (!commandsToRetry.isEmpty())
-                Tracing.trace("Retrying {} commands", commandsToRetry.size());
+        return PartitionIterators.concat(results);
+    }
 
-            // send out read requests
-            for (int i = 0; i < commands.size(); i++)
+    private static class SinglePartitionReadLifecycle
+    {
+        private final SinglePartitionReadCommand command;
+        private final AbstractReadExecutor executor;
+        private final ConsistencyLevel consistency;
+
+        private PartitionIterator result;
+        private ReadCallback repairHandler;
+
+        SinglePartitionReadLifecycle(SinglePartitionReadCommand command, ConsistencyLevel consistency)
+        {
+            this.command = command;
+            this.executor = AbstractReadExecutor.getReadExecutor(command, consistency);
+            this.consistency = consistency;
+        }
+
+        boolean isDone()
+        {
+            return result != null;
+        }
+
+        void doInitialQueries()
+        {
+            executor.executeAsync();
+        }
+
+        void maybeTryAdditionalReplicas()
+        {
+            executor.maybeTryAdditionalReplicas();
+        }
+
+        void awaitResultsAndRetryOnDigestMismatch() throws ReadFailureException, ReadTimeoutException
+        {
+            try
             {
-                ReadCommand command = commands.get(i);
-                assert !command.isDigestQuery();
-
-                AbstractReadExecutor exec = AbstractReadExecutor.getReadExecutor(command, consistencyLevel);
-                exec.executeAsync();
-                readExecutors[i] = exec;
+                result = executor.get();
             }
-
-            for (AbstractReadExecutor exec : readExecutors)
-                exec.maybeTryAdditionalReplicas();
-
-            // read results and make a second pass for any digest mismatches
-            List<ReadCommand> repairCommands = null;
-            List<ReadCallback<ReadResponse, Row>> repairResponseHandlers = null;
-            for (AbstractReadExecutor exec: readExecutors)
+            catch (DigestMismatchException ex)
             {
-                try
+                Tracing.trace("Digest mismatch: {}", ex);
+
+                ReadRepairMetrics.repairedBlocking.mark();
+
+                // Do a full data read to resolve the correct response (and repair node that need be)
+                Keyspace keyspace = Keyspace.open(command.metadata().ksName);
+                DataResolver resolver = new DataResolver(keyspace, command, ConsistencyLevel.ALL, executor.handler.endpoints.size());
+                repairHandler = new ReadCallback(resolver,
+                                                 ConsistencyLevel.ALL,
+                                                 executor.getContactedReplicas().size(),
+                                                 command,
+                                                 keyspace,
+                                                 executor.handler.endpoints);
+
+                for (InetAddress endpoint : executor.getContactedReplicas())
                 {
-                    Row row = exec.get();
-                    if (row != null)
-                    {
-                        row = exec.command.maybeTrim(row);
-                        rows.add(row);
-                    }
-
-                    if (logger.isTraceEnabled())
-                        logger.trace("Read: {} ms.", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - exec.handler.start));
-                }
-                catch (ReadTimeoutException|ReadFailureException ex)
-                {
-                    int blockFor = consistencyLevel.blockFor(Keyspace.open(exec.command.getKeyspace()));
-                    int responseCount = exec.handler.getReceivedCount();
-                    String gotData = responseCount > 0
-                                   ? exec.resolver.isDataPresent() ? " (including data)" : " (only digests)"
-                                   : "";
-
-                    boolean isTimeout = ex instanceof ReadTimeoutException;
-                    if (Tracing.isTracing())
-                    {
-                        Tracing.trace("{}; received {} of {} responses{}",
-                                      isTimeout ? "Timed out" : "Failed", responseCount, blockFor, gotData);
-                    }
-                    else if (logger.isDebugEnabled())
-                    {
-                        logger.debug("Read {}; received {} of {} responses{}", (isTimeout ? "timeout" : "failure"), responseCount, blockFor, gotData);
-                    }
-                    throw ex;
-                }
-                catch (DigestMismatchException ex)
-                {
-                    Tracing.trace("Digest mismatch: {}", ex);
-
-                    ReadRepairMetrics.repairedBlocking.mark();
-
-                    // Do a full data read to resolve the correct response (and repair node that need be)
-                    RowDataResolver resolver = new RowDataResolver(exec.command.ksName, exec.command.key, exec.command.filter(), exec.command.timestamp, exec.handler.endpoints.size());
-                    ReadCallback<ReadResponse, Row> repairHandler = new ReadCallback<>(resolver,
-                                                                                       ConsistencyLevel.ALL,
-                                                                                       exec.getContactedReplicas().size(),
-                                                                                       exec.command,
-                                                                                       Keyspace.open(exec.command.getKeyspace()),
-                                                                                       exec.handler.endpoints);
-
-                    if (repairCommands == null)
-                    {
-                        repairCommands = new ArrayList<>();
-                        repairResponseHandlers = new ArrayList<>();
-                    }
-                    repairCommands.add(exec.command);
-                    repairResponseHandlers.add(repairHandler);
-
-                    MessageOut<ReadCommand> message = exec.command.createMessage();
-                    for (InetAddress endpoint : exec.getContactedReplicas())
-                    {
-                        Tracing.trace("Enqueuing full data read to {}", endpoint);
-                        MessagingService.instance().sendRRWithFailure(message, endpoint, repairHandler);
-                    }
+                    MessageOut<ReadCommand> message = command.createMessage(MessagingService.instance().getVersion(endpoint));
+                    Tracing.trace("Enqueuing full data read to {}", endpoint);
+                    MessagingService.instance().sendRRWithFailure(message, endpoint, repairHandler);
                 }
             }
+        }
 
-            commandsToRetry.clear();
+        void maybeAwaitFullDataRead() throws ReadTimeoutException
+        {
+            // There wasn't a digest mismatch, we're good
+            if (repairHandler == null)
+                return;
 
-            // read the results for the digest mismatch retries
-            if (repairResponseHandlers != null)
+            // Otherwise, get the result from the full-data read and check that it's not a short read
+            try
             {
-                for (int i = 0; i < repairCommands.size(); i++)
-                {
-                    ReadCommand command = repairCommands.get(i);
-                    ReadCallback<ReadResponse, Row> handler = repairResponseHandlers.get(i);
-
-                    Row row;
-                    try
-                    {
-                        row = handler.get();
-                    }
-                    catch (DigestMismatchException e)
-                    {
-                        throw new AssertionError(e); // full data requested from each node here, no digests should be sent
-                    }
-                    catch (ReadTimeoutException e)
-                    {
-                        if (Tracing.isTracing())
-                            Tracing.trace("Timed out waiting on digest mismatch repair requests");
-                        else
-                            logger.trace("Timed out waiting on digest mismatch repair requests");
-                        // the caught exception here will have CL.ALL from the repair command,
-                        // not whatever CL the initial command was at (CASSANDRA-7947)
-                        int blockFor = consistencyLevel.blockFor(Keyspace.open(command.getKeyspace()));
-                        throw new ReadTimeoutException(consistencyLevel, blockFor-1, blockFor, true);
-                    }
-
-                    RowDataResolver resolver = (RowDataResolver)handler.resolver;
-                    try
-                    {
-                        // wait for the repair writes to be acknowledged, to minimize impact on any replica that's
-                        // behind on writes in case the out-of-sync row is read multiple times in quick succession
-                        FBUtilities.waitOnFutures(resolver.repairResults, DatabaseDescriptor.getWriteRpcTimeout());
-                    }
-                    catch (TimeoutException e)
-                    {
-                        if (Tracing.isTracing())
-                            Tracing.trace("Timed out waiting on digest mismatch repair acknowledgements");
-                        else
-                            logger.trace("Timed out waiting on digest mismatch repair acknowledgements");
-                        int blockFor = consistencyLevel.blockFor(Keyspace.open(command.getKeyspace()));
-                        throw new ReadTimeoutException(consistencyLevel, blockFor-1, blockFor, true);
-                    }
-
-                    // retry any potential short reads
-                    ReadCommand retryCommand = command.maybeGenerateRetryCommand(resolver, row);
-                    if (retryCommand != null)
-                    {
-                        Tracing.trace("Issuing retry for read command");
-                        if (commandsToRetry == Collections.EMPTY_LIST)
-                            commandsToRetry = new ArrayList<>();
-                        commandsToRetry.add(retryCommand);
-                        continue;
-                    }
-
-                    if (row != null)
-                    {
-                        row = command.maybeTrim(row);
-                        rows.add(row);
-                    }
-                }
+                result = repairHandler.get();
             }
-        } while (!commandsToRetry.isEmpty());
+            catch (DigestMismatchException e)
+            {
+                throw new AssertionError(e); // full data requested from each node here, no digests should be sent
+            }
+            catch (ReadTimeoutException e)
+            {
+                if (Tracing.isTracing())
+                    Tracing.trace("Timed out waiting on digest mismatch repair requests");
+                else
+                    logger.trace("Timed out waiting on digest mismatch repair requests");
+                // the caught exception here will have CL.ALL from the repair command,
+                // not whatever CL the initial command was at (CASSANDRA-7947)
+                int blockFor = consistency.blockFor(Keyspace.open(command.metadata().ksName));
+                throw new ReadTimeoutException(consistency, blockFor-1, blockFor, true);
+            }
+        }
 
-        return rows;
+        PartitionIterator getResult()
+        {
+            assert result != null;
+            return result;
+        }
     }
 
     static class LocalReadRunnable extends DroppableRunnable
     {
         private final ReadCommand command;
-        private final ReadCallback<ReadResponse, Row> handler;
+        private final ReadCallback handler;
         private final long start = System.nanoTime();
 
-        LocalReadRunnable(ReadCommand command, ReadCallback<ReadResponse, Row> handler)
+        LocalReadRunnable(ReadCommand command, ReadCallback handler)
         {
             super(MessagingService.Verb.READ);
             this.command = command;
@@ -1578,43 +1790,11 @@
         {
             try
             {
-                Keyspace keyspace = Keyspace.open(command.ksName);
-                Row r = command.getRow(keyspace);
-                ReadResponse result = ReadVerbHandler.getResponse(command, r);
+                try (ReadOrderGroup orderGroup = command.startOrderGroup(); UnfilteredPartitionIterator iterator = command.executeLocally(orderGroup))
+                {
+                    handler.response(command.createResponse(iterator));
+                }
                 MessagingService.instance().addLatency(FBUtilities.getBroadcastAddress(), TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
-                handler.response(result);
-            }
-            catch (Throwable t)
-            {
-                handler.onFailure(FBUtilities.getBroadcastAddress());
-                if (t instanceof TombstoneOverwhelmingException)
-                    logger.error(t.getMessage());
-                else
-                    throw t;
-            }
-        }
-    }
-
-    static class LocalRangeSliceRunnable extends DroppableRunnable
-    {
-        private final AbstractRangeCommand command;
-        private final ReadCallback<RangeSliceReply, Iterable<Row>> handler;
-        private final long start = System.nanoTime();
-
-        LocalRangeSliceRunnable(AbstractRangeCommand command, ReadCallback<RangeSliceReply, Iterable<Row>> handler)
-        {
-            super(MessagingService.Verb.RANGE_SLICE);
-            this.command = command;
-            this.handler = handler;
-        }
-
-        protected void runMayThrow()
-        {
-            try
-            {
-                RangeSliceReply result = new RangeSliceReply(command.executeLocally());
-                MessagingService.instance().addLatency(FBUtilities.getBroadcastAddress(), TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
-                handler.response(result);
             }
             catch (Throwable t)
             {
@@ -1629,10 +1809,10 @@
 
     public static List<InetAddress> getLiveSortedEndpoints(Keyspace keyspace, ByteBuffer key)
     {
-        return getLiveSortedEndpoints(keyspace, StorageService.getPartitioner().decorateKey(key));
+        return getLiveSortedEndpoints(keyspace, StorageService.instance.getTokenMetadata().decorateKey(key));
     }
 
-    private static List<InetAddress> getLiveSortedEndpoints(Keyspace keyspace, RingPosition pos)
+    public static List<InetAddress> getLiveSortedEndpoints(Keyspace keyspace, RingPosition pos)
     {
         List<InetAddress> liveEndpoints = StorageService.instance.getLiveNaturalEndpoints(keyspace, pos);
         DatabaseDescriptor.getEndpointSnitch().sortByProximity(FBUtilities.getBroadcastAddress(), liveEndpoints);
@@ -1652,321 +1832,402 @@
     }
 
     /**
-     * Estimate the number of result rows (either cql3 rows or storage rows, as called for by the command) per
+     * Estimate the number of result rows (either cql3 rows or "thrift" rows, as called for by the command) per
      * range in the ring based on our local data.  This assumes that ranges are uniformly distributed across the cluster
      * and that the queried data is also uniformly distributed.
      */
-    private static float estimateResultRowsPerRange(AbstractRangeCommand command, Keyspace keyspace)
+    private static float estimateResultsPerRange(PartitionRangeReadCommand command, Keyspace keyspace)
     {
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.columnFamily);
-        float resultRowsPerRange = Float.POSITIVE_INFINITY;
-        if (command.rowFilter != null && !command.rowFilter.isEmpty())
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.metadata().cfId);
+        Index index = command.getIndex(cfs);
+        float maxExpectedResults = index == null
+                                 ? command.limits().estimateTotalResults(cfs)
+                                 : index.getEstimatedResultRows();
+
+        // adjust maxExpectedResults by the number of tokens this node has and the replication factor for this ks
+        return (maxExpectedResults / DatabaseDescriptor.getNumTokens()) / keyspace.getReplicationStrategy().getReplicationFactor();
+    }
+
+    @VisibleForTesting
+    public static class RangeForQuery
+    {
+        public final AbstractBounds<PartitionPosition> range;
+        public final List<InetAddress> liveEndpoints;
+        public final List<InetAddress> filteredEndpoints;
+        public final int vnodeCount;
+
+        public RangeForQuery(AbstractBounds<PartitionPosition> range,
+                             List<InetAddress> liveEndpoints,
+                             List<InetAddress> filteredEndpoints,
+                             int vnodeCount)
         {
-            List<SecondaryIndexSearcher> searchers = cfs.indexManager.getIndexSearchersForQuery(command.rowFilter);
-            if (searchers.isEmpty())
+            this.range = range;
+            this.liveEndpoints = liveEndpoints;
+            this.filteredEndpoints = filteredEndpoints;
+            this.vnodeCount = vnodeCount;
+        }
+
+        public int vnodeCount()
+        {
+            return vnodeCount;
+        }
+    }
+
+    @VisibleForTesting
+    public static class RangeIterator extends AbstractIterator<RangeForQuery>
+    {
+        private final Keyspace keyspace;
+        private final ConsistencyLevel consistency;
+        private final Iterator<? extends AbstractBounds<PartitionPosition>> ranges;
+        private final int rangeCount;
+
+        public RangeIterator(PartitionRangeReadCommand command, Keyspace keyspace, ConsistencyLevel consistency)
+        {
+            this.keyspace = keyspace;
+            this.consistency = consistency;
+
+            List<? extends AbstractBounds<PartitionPosition>> l = keyspace.getReplicationStrategy() instanceof LocalStrategy
+                                                          ? command.dataRange().keyRange().unwrap()
+                                                          : getRestrictedRanges(command.dataRange().keyRange());
+            this.ranges = l.iterator();
+            this.rangeCount = l.size();
+        }
+
+        public int rangeCount()
+        {
+            return rangeCount;
+        }
+
+        protected RangeForQuery computeNext()
+        {
+            if (!ranges.hasNext())
+                return endOfData();
+
+            AbstractBounds<PartitionPosition> range = ranges.next();
+            List<InetAddress> liveEndpoints = getLiveSortedEndpoints(keyspace, range.right);
+            return new RangeForQuery(range,
+                                     liveEndpoints,
+                                     consistency.filterForQuery(keyspace, liveEndpoints),
+                                     1);
+        }
+    }
+
+    @VisibleForTesting
+    public static class RangeMerger extends AbstractIterator<RangeForQuery>
+    {
+        private final Keyspace keyspace;
+        private final ConsistencyLevel consistency;
+        private final PeekingIterator<RangeForQuery> ranges;
+
+        public RangeMerger(Iterator<RangeForQuery> iterator, Keyspace keyspace, ConsistencyLevel consistency)
+        {
+            this.keyspace = keyspace;
+            this.consistency = consistency;
+            this.ranges = Iterators.peekingIterator(iterator);
+        }
+
+        protected RangeForQuery computeNext()
+        {
+            if (!ranges.hasNext())
+                return endOfData();
+
+            RangeForQuery current = ranges.next();
+
+            // getRestrictedRange has broken the queried range into per-[vnode] token ranges, but this doesn't take
+            // the replication factor into account. If the intersection of live endpoints for 2 consecutive ranges
+            // still meets the CL requirements, then we can merge both ranges into the same RangeSliceCommand.
+            while (ranges.hasNext())
             {
-                resultRowsPerRange = calculateResultRowsUsingEstimatedKeys(cfs);
+                // If the current range right is the min token, we should stop merging because CFS.getRangeSlice
+                // don't know how to deal with a wrapping range.
+                // Note: it would be slightly more efficient to have CFS.getRangeSlice on the destination nodes unwraps
+                // the range if necessary and deal with it. However, we can't start sending wrapped range without breaking
+                // wire compatibility, so It's likely easier not to bother;
+                if (current.range.right.isMinimum())
+                    break;
+
+                RangeForQuery next = ranges.peek();
+
+                List<InetAddress> merged = intersection(current.liveEndpoints, next.liveEndpoints);
+
+                // Check if there is enough endpoint for the merge to be possible.
+                if (!consistency.isSufficientLiveNodes(keyspace, merged))
+                    break;
+
+                List<InetAddress> filteredMerged = consistency.filterForQuery(keyspace, merged);
+
+                // Estimate whether merging will be a win or not
+                if (!DatabaseDescriptor.getEndpointSnitch().isWorthMergingForRangeQuery(filteredMerged, current.filteredEndpoints, next.filteredEndpoints))
+                    break;
+
+                // If we get there, merge this range and the next one
+                int vnodeCount = current.vnodeCount + next.vnodeCount;
+                current = new RangeForQuery(current.range.withNewRight(next.range.right), merged, filteredMerged, vnodeCount);
+                ranges.next(); // consume the range we just merged since we've only peeked so far
+            }
+            return current;
+        }
+    }
+
+    private static class SingleRangeResponse extends AbstractIterator<RowIterator> implements PartitionIterator
+    {
+        private final ReadCallback handler;
+        private PartitionIterator result;
+
+        private SingleRangeResponse(ReadCallback handler)
+        {
+            this.handler = handler;
+        }
+
+        private void waitForResponse() throws ReadTimeoutException
+        {
+            if (result != null)
+                return;
+
+            try
+            {
+                result = handler.get();
+            }
+            catch (DigestMismatchException e)
+            {
+                throw new AssertionError(e); // no digests in range slices yet
+            }
+        }
+
+        protected RowIterator computeNext()
+        {
+            waitForResponse();
+            return result.hasNext() ? result.next() : endOfData();
+        }
+
+        public void close()
+        {
+            if (result != null)
+                result.close();
+        }
+    }
+
+    public static class RangeCommandIterator extends AbstractIterator<RowIterator> implements PartitionIterator
+    {
+        private final Iterator<RangeForQuery> ranges;
+        private final int totalRangeCount;
+        private final PartitionRangeReadCommand command;
+        private final Keyspace keyspace;
+        private final ConsistencyLevel consistency;
+        private final boolean enforceStrictLiveness;
+
+        private final long startTime;
+        private DataLimits.Counter counter;
+        private PartitionIterator sentQueryIterator;
+
+        private final int maxConcurrencyFactor;
+        private int concurrencyFactor;
+        // The two following "metric" are maintained to improve the concurrencyFactor
+        // when it was not good enough initially.
+        private int liveReturned;
+        private int rangesQueried;
+        private int batchesRequested = 0;
+
+        public RangeCommandIterator(Iterator<RangeForQuery> ranges,
+                                    PartitionRangeReadCommand command,
+                                    int concurrencyFactor,
+                                    int maxConcurrencyFactor,
+                                    int totalRangeCount,
+                                    Keyspace keyspace,
+                                    ConsistencyLevel consistency)
+        {
+            this.command = command;
+            this.concurrencyFactor = concurrencyFactor;
+            this.maxConcurrencyFactor = maxConcurrencyFactor;
+            this.startTime = System.nanoTime();
+            this.ranges = ranges;
+            this.totalRangeCount = totalRangeCount;
+            this.consistency = consistency;
+            this.keyspace = keyspace;
+            this.enforceStrictLiveness = command.metadata().enforceStrictLiveness();
+        }
+
+        public RowIterator computeNext()
+        {
+            try
+            {
+                while (sentQueryIterator == null || !sentQueryIterator.hasNext())
+                {
+                    // If we don't have more range to handle, we're done
+                    if (!ranges.hasNext())
+                        return endOfData();
+
+                    // else, sends the next batch of concurrent queries (after having close the previous iterator)
+                    if (sentQueryIterator != null)
+                    {
+                        sentQueryIterator.close();
+
+                        // It's not the first batch of queries and we're not done, so we we can use what has been
+                        // returned so far to improve our rows-per-range estimate and update the concurrency accordingly
+                        updateConcurrencyFactor();
+                    }
+                    sentQueryIterator = sendNextRequests();
+                }
+
+                return sentQueryIterator.next();
+            }
+            catch (UnavailableException e)
+            {
+                rangeMetrics.unavailables.mark();
+                throw e;
+            }
+            catch (ReadTimeoutException e)
+            {
+                rangeMetrics.timeouts.mark();
+                throw e;
+            }
+            catch (ReadFailureException e)
+            {
+                rangeMetrics.failures.mark();
+                throw e;
+            }
+        }
+
+        private void updateConcurrencyFactor()
+        {
+            liveReturned += counter.counted();
+
+            concurrencyFactor = computeConcurrencyFactor(totalRangeCount, rangesQueried, maxConcurrencyFactor, command.limits().count(), liveReturned);
+        }
+
+        @VisibleForTesting
+        public static int computeConcurrencyFactor(int totalRangeCount, int rangesQueried, int maxConcurrencyFactor, int limit, int liveReturned)
+        {
+            maxConcurrencyFactor = Math.max(1, Math.min(maxConcurrencyFactor, totalRangeCount - rangesQueried));
+            if (liveReturned == 0)
+            {
+                // we haven't actually gotten any results, so query up to the limit if not results so far
+                Tracing.trace("Didn't get any response rows; new concurrent requests: {}", maxConcurrencyFactor);
+                return maxConcurrencyFactor;
+            }
+
+            // Otherwise, compute how many rows per range we got on average and pick a concurrency factor
+            // that should allow us to fetch all remaining rows with the next batch of (concurrent) queries.
+            int remainingRows = limit - liveReturned;
+            float rowsPerRange = (float)liveReturned / (float)rangesQueried;
+            int concurrencyFactor = Math.max(1, Math.min(maxConcurrencyFactor, Math.round(remainingRows / rowsPerRange)));
+            logger.trace("Didn't get enough response rows; actual rows per range: {}; remaining rows: {}, new concurrent requests: {}",
+                         rowsPerRange, remainingRows, concurrencyFactor);
+
+            return concurrencyFactor;
+        }
+
+        private SingleRangeResponse query(RangeForQuery toQuery)
+        {
+            PartitionRangeReadCommand rangeCommand = command.forSubRange(toQuery.range);
+
+            DataResolver resolver = new DataResolver(keyspace, rangeCommand, consistency, toQuery.filteredEndpoints.size());
+
+            int blockFor = consistency.blockFor(keyspace);
+            int minResponses = Math.min(toQuery.filteredEndpoints.size(), blockFor);
+            List<InetAddress> minimalEndpoints = toQuery.filteredEndpoints.subList(0, minResponses);
+            ReadCallback handler = new ReadCallback(resolver, consistency, rangeCommand, minimalEndpoints);
+
+            handler.assureSufficientLiveNodes();
+
+            if (toQuery.filteredEndpoints.size() == 1 && canDoLocalRequest(toQuery.filteredEndpoints.get(0)))
+            {
+                StageManager.getStage(Stage.READ).execute(new LocalReadRunnable(rangeCommand, handler));
             }
             else
             {
-                // Secondary index query (cql3 or otherwise).  Estimate result rows based on most selective 2ary index.
-                for (SecondaryIndexSearcher searcher : searchers)
+                for (InetAddress endpoint : toQuery.filteredEndpoints)
                 {
-                    // use our own mean column count as our estimate for how many matching rows each node will have
-                    SecondaryIndex highestSelectivityIndex = searcher.highestSelectivityIndex(command.rowFilter);
-                    resultRowsPerRange = highestSelectivityIndex == null ? resultRowsPerRange : Math.min(resultRowsPerRange, highestSelectivityIndex.estimateResultRows());
+                    MessageOut<ReadCommand> message = rangeCommand.createMessage(MessagingService.instance().getVersion(endpoint));
+                    Tracing.trace("Enqueuing request to {}", endpoint);
+                    MessagingService.instance().sendRRWithFailure(message, endpoint, handler);
                 }
             }
-        }
-        else if (!command.countCQL3Rows())
-        {
-            // non-cql3 query
-            resultRowsPerRange = cfs.estimateKeys();
-        }
-        else
-        {
-            resultRowsPerRange = calculateResultRowsUsingEstimatedKeys(cfs);
+
+            return new SingleRangeResponse(handler);
         }
 
-        // adjust resultRowsPerRange by the number of tokens this node has and the replication factor for this ks
-        return (resultRowsPerRange / DatabaseDescriptor.getNumTokens()) / keyspace.getReplicationStrategy().getReplicationFactor();
-    }
+        private PartitionIterator sendNextRequests()
+        {
+            List<PartitionIterator> concurrentQueries = new ArrayList<>(concurrencyFactor);
+            for (int i = 0; i < concurrencyFactor && ranges.hasNext();)
+            {
+                RangeForQuery range = ranges.next();
+                concurrentQueries.add(query(range));
+                rangesQueried += range.vnodeCount();
+                i += range.vnodeCount();
+            }
+            batchesRequested++;
 
-    private static float calculateResultRowsUsingEstimatedKeys(ColumnFamilyStore cfs)
-    {
-        if (cfs.metadata.comparator.isDense())
-        {
-            // one storage row per result row, so use key estimate directly
-            return cfs.estimateKeys();
+            Tracing.trace("Submitted {} concurrent range requests", concurrentQueries.size());
+            // We want to count the results for the sake of updating the concurrency factor (see updateConcurrencyFactor) but we don't want to
+            // enforce any particular limit at this point (this could break code than rely on postReconciliationProcessing), hence the DataLimits.NONE.
+            counter = DataLimits.NONE.newCounter(command.nowInSec(), true, command.selectsFullPartition(), enforceStrictLiveness);
+            return counter.applyTo(PartitionIterators.concat(concurrentQueries));
         }
-        else
+
+        public void close()
         {
-            float resultRowsPerStorageRow = ((float) cfs.getMeanColumns()) / cfs.metadata.regularColumns().size();
-            return resultRowsPerStorageRow * (cfs.estimateKeys());
+            try
+            {
+                if (sentQueryIterator != null)
+                    sentQueryIterator.close();
+            }
+            finally
+            {
+                long latency = System.nanoTime() - startTime;
+                rangeMetrics.addNano(latency);
+                Keyspace.openAndGetStore(command.metadata()).metric.coordinatorScanLatency.update(latency, TimeUnit.NANOSECONDS);
+            }
+        }
+
+        @VisibleForTesting
+        public int rangesQueried()
+        {
+            return rangesQueried;
+        }
+
+        @VisibleForTesting
+        public int batchesRequested()
+        {
+            return batchesRequested;
         }
     }
 
-    public static List<Row> getRangeSlice(AbstractRangeCommand command, ConsistencyLevel consistency_level)
-    throws UnavailableException, ReadFailureException, ReadTimeoutException
+    @SuppressWarnings("resource")
+    public static PartitionIterator getRangeSlice(PartitionRangeReadCommand command, ConsistencyLevel consistencyLevel)
     {
         Tracing.trace("Computing ranges to query");
-        long startTime = System.nanoTime();
 
-        Keyspace keyspace = Keyspace.open(command.keyspace);
-        List<Row> rows;
-        // now scan until we have enough results
-        try
-        {
-            int liveRowCount = 0;
-            boolean countLiveRows = command.countCQL3Rows() || command.ignoredTombstonedPartitions();
-            rows = new ArrayList<>();
+        Keyspace keyspace = Keyspace.open(command.metadata().ksName);
+        RangeIterator ranges = new RangeIterator(command, keyspace, consistencyLevel);
 
-            // when dealing with LocalStrategy keyspaces, we can skip the range splitting and merging (which can be
-            // expensive in clusters with vnodes)
-            List<? extends AbstractBounds<RowPosition>> ranges;
-            if (keyspace.getReplicationStrategy() instanceof LocalStrategy)
-                ranges = command.keyRange.unwrap();
-            else
-                ranges = getRestrictedRanges(command.keyRange);
+        // our estimate of how many result rows there will be per-range
+        float resultsPerRange = estimateResultsPerRange(command, keyspace);
+        // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
+        // fetch enough rows in the first round
+        resultsPerRange -= resultsPerRange * CONCURRENT_SUBREQUESTS_MARGIN;
+        int maxConcurrencyFactor = Math.min(ranges.rangeCount(), MAX_CONCURRENT_RANGE_REQUESTS);
+        int concurrencyFactor = resultsPerRange == 0.0
+                              ? 1
+                              : Math.max(1, Math.min(maxConcurrencyFactor, (int) Math.ceil(command.limits().count() / resultsPerRange)));
+        logger.trace("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
+                     resultsPerRange, command.limits().count(), ranges.rangeCount(), concurrencyFactor);
+        Tracing.trace("Submitting range requests on {} ranges with a concurrency of {} ({} rows per range expected)", ranges.rangeCount(), concurrencyFactor, resultsPerRange);
 
-            // determine the number of rows to be fetched and the concurrency factor
-            int rowsToBeFetched = command.limit();
-            int concurrencyFactor;
-            if (command.requiresScanningAllRanges())
-            {
-                // all nodes must be queried
-                rowsToBeFetched *= ranges.size();
-                concurrencyFactor = ranges.size();
-                logger.debug("Requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
-                             command.limit(),
-                             ranges.size(),
-                             concurrencyFactor);
-                Tracing.trace("Submitting range requests on {} ranges with a concurrency of {}",
-                              ranges.size(), concurrencyFactor);
-            }
-            else
-            {
-                // our estimate of how many result rows there will be per-range
-                float resultRowsPerRange = estimateResultRowsPerRange(command, keyspace);
-                // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
-                // fetch enough rows in the first round
-                resultRowsPerRange -= resultRowsPerRange * CONCURRENT_SUBREQUESTS_MARGIN;
-                concurrencyFactor = resultRowsPerRange == 0.0
-                                  ? 1
-                                  : Math.max(1, Math.min(ranges.size(), (int) Math.ceil(command.limit() / resultRowsPerRange)));
-
-                logger.trace("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
-                             resultRowsPerRange,
-                             command.limit(),
-                             ranges.size(),
-                             concurrencyFactor);
-                Tracing.trace("Submitting range requests on {} ranges with a concurrency of {} ({} rows per range expected)",
-                              ranges.size(),
-                              concurrencyFactor,
-                              resultRowsPerRange);
-            }
-
-            boolean haveSufficientRows = false;
-            int i = 0;
-            AbstractBounds<RowPosition> nextRange = null;
-            List<InetAddress> nextEndpoints = null;
-            List<InetAddress> nextFilteredEndpoints = null;
-            while (i < ranges.size())
-            {
-                List<Pair<AbstractRangeCommand, ReadCallback<RangeSliceReply, Iterable<Row>>>> scanHandlers = new ArrayList<>(concurrencyFactor);
-                int concurrentFetchStartingIndex = i;
-                int concurrentRequests = 0;
-                while ((i - concurrentFetchStartingIndex) < concurrencyFactor)
-                {
-                    AbstractBounds<RowPosition> range = nextRange == null
-                                                      ? ranges.get(i)
-                                                      : nextRange;
-                    List<InetAddress> liveEndpoints = nextEndpoints == null
-                                                    ? getLiveSortedEndpoints(keyspace, range.right)
-                                                    : nextEndpoints;
-                    List<InetAddress> filteredEndpoints = nextFilteredEndpoints == null
-                                                        ? consistency_level.filterForQuery(keyspace, liveEndpoints)
-                                                        : nextFilteredEndpoints;
-                    ++i;
-                    ++concurrentRequests;
-
-                    // getRestrictedRange has broken the queried range into per-[vnode] token ranges, but this doesn't take
-                    // the replication factor into account. If the intersection of live endpoints for 2 consecutive ranges
-                    // still meets the CL requirements, then we can merge both ranges into the same RangeSliceCommand.
-                    while (i < ranges.size())
-                    {
-                        nextRange = ranges.get(i);
-                        nextEndpoints = getLiveSortedEndpoints(keyspace, nextRange.right);
-                        nextFilteredEndpoints = consistency_level.filterForQuery(keyspace, nextEndpoints);
-
-                        // If the current range right is the min token, we should stop merging because CFS.getRangeSlice
-                        // don't know how to deal with a wrapping range.
-                        // Note: it would be slightly more efficient to have CFS.getRangeSlice on the destination nodes unwraps
-                        // the range if necessary and deal with it. However, we can't start sending wrapped range without breaking
-                        // wire compatibility, so It's likely easier not to bother;
-                        if (range.right.isMinimum())
-                            break;
-
-                        List<InetAddress> merged = intersection(liveEndpoints, nextEndpoints);
-
-                        // Check if there is enough endpoint for the merge to be possible.
-                        if (!consistency_level.isSufficientLiveNodes(keyspace, merged))
-                            break;
-
-                        List<InetAddress> filteredMerged = consistency_level.filterForQuery(keyspace, merged);
-
-                        // Estimate whether merging will be a win or not
-                        if (!DatabaseDescriptor.getEndpointSnitch().isWorthMergingForRangeQuery(filteredMerged, filteredEndpoints, nextFilteredEndpoints))
-                            break;
-
-                        // If we get there, merge this range and the next one
-                        range = range.withNewRight(nextRange.right);
-                        liveEndpoints = merged;
-                        filteredEndpoints = filteredMerged;
-                        ++i;
-                    }
-
-                    AbstractRangeCommand nodeCmd = command.forSubRange(range);
-
-                    // collect replies and resolve according to consistency level
-                    RangeSliceResponseResolver resolver = new RangeSliceResponseResolver(nodeCmd.keyspace, command.timestamp);
-                    List<InetAddress> minimalEndpoints = filteredEndpoints.subList(0, Math.min(filteredEndpoints.size(), consistency_level.blockFor(keyspace)));
-                    ReadCallback<RangeSliceReply, Iterable<Row>> handler = new ReadCallback<>(resolver, consistency_level, nodeCmd, minimalEndpoints);
-                    handler.assureSufficientLiveNodes();
-                    resolver.setSources(filteredEndpoints);
-                    if (filteredEndpoints.size() == 1
-                        && filteredEndpoints.get(0).equals(FBUtilities.getBroadcastAddress()))
-                    {
-                        StageManager.getStage(Stage.READ).execute(new LocalRangeSliceRunnable(nodeCmd, handler));
-                    }
-                    else
-                    {
-                        MessageOut<? extends AbstractRangeCommand> message = nodeCmd.createMessage();
-                        for (InetAddress endpoint : filteredEndpoints)
-                        {
-                            Tracing.trace("Enqueuing request to {}", endpoint);
-                            MessagingService.instance().sendRRWithFailure(message, endpoint, handler);
-                        }
-                    }
-                    scanHandlers.add(Pair.create(nodeCmd, handler));
-                }
-                Tracing.trace("Submitted {} concurrent range requests covering {} ranges", concurrentRequests, i - concurrentFetchStartingIndex);
-
-                List<AsyncOneResponse> repairResponses = new ArrayList<>();
-                for (Pair<AbstractRangeCommand, ReadCallback<RangeSliceReply, Iterable<Row>>> cmdPairHandler : scanHandlers)
-                {
-                    ReadCallback<RangeSliceReply, Iterable<Row>> handler = cmdPairHandler.right;
-                    RangeSliceResponseResolver resolver = (RangeSliceResponseResolver)handler.resolver;
-
-                    try
-                    {
-                        for (Row row : handler.get())
-                        {
-                            rows.add(row);
-                            if (countLiveRows)
-                                liveRowCount += row.getLiveCount(command.predicate, command.timestamp);
-                        }
-                        repairResponses.addAll(resolver.repairResults);
-                    }
-                    catch (ReadTimeoutException|ReadFailureException ex)
-                    {
-                        // we timed out or failed waiting for responses
-                        int blockFor = consistency_level.blockFor(keyspace);
-                        int responseCount = resolver.responses.size();
-                        String gotData = responseCount > 0
-                                         ? resolver.isDataPresent() ? " (including data)" : " (only digests)"
-                                         : "";
-
-                        boolean isTimeout = ex instanceof ReadTimeoutException;
-                        if (Tracing.isTracing())
-                        {
-                            Tracing.trace("{}; received {} of {} responses{} for range {} of {}",
-                                          (isTimeout ? "Timed out" : "Failed"), responseCount, blockFor, gotData, i, ranges.size());
-                        }
-                        else if (logger.isDebugEnabled())
-                        {
-                            logger.debug("Range slice {}; received {} of {} responses{} for range {} of {}",
-                                         (isTimeout ? "timeout" : "failure"), responseCount, blockFor, gotData, i, ranges.size());
-                        }
-                        throw ex;
-                    }
-                    catch (DigestMismatchException e)
-                    {
-                        throw new AssertionError(e); // no digests in range slices yet
-                    }
-
-                    // if we're done, great, otherwise, move to the next range
-                    int count = countLiveRows ? liveRowCount : rows.size();
-                    if (count >= rowsToBeFetched)
-                    {
-                        haveSufficientRows = true;
-                        break;
-                    }
-                }
-
-                try
-                {
-                    FBUtilities.waitOnFutures(repairResponses, DatabaseDescriptor.getWriteRpcTimeout());
-                }
-                catch (TimeoutException ex)
-                {
-                    // We got all responses, but timed out while repairing
-                    int blockFor = consistency_level.blockFor(keyspace);
-                    if (Tracing.isTracing())
-                        Tracing.trace("Timed out while read-repairing after receiving all {} data and digest responses", blockFor);
-                    else
-                        logger.debug("Range slice timeout while read-repairing after receiving all {} data and digest responses", blockFor);
-                    throw new ReadTimeoutException(consistency_level, blockFor-1, blockFor, true);
-                }
-
-                if (haveSufficientRows)
-                    return command.postReconciliationProcessing(rows);
-
-                // we didn't get enough rows in our concurrent fetch; recalculate our concurrency factor
-                // based on the results we've seen so far (as long as we still have ranges left to query)
-                if (i < ranges.size())
-                {
-                    float fetchedRows = countLiveRows ? liveRowCount : rows.size();
-                    float remainingRows = rowsToBeFetched - fetchedRows;
-                    float actualRowsPerRange;
-                    if (fetchedRows == 0.0)
-                    {
-                        // we haven't actually gotten any results, so query all remaining ranges at once
-                        actualRowsPerRange = 0.0f;
-                        concurrencyFactor = ranges.size() - i;
-                    }
-                    else
-                    {
-                        actualRowsPerRange = fetchedRows / i;
-                        concurrencyFactor = Math.max(1, Math.min(ranges.size() - i, Math.round(remainingRows / actualRowsPerRange)));
-                    }
-                    logger.trace("Didn't get enough response rows; actual rows per range: {}; remaining rows: {}, new concurrent requests: {}",
-                                 actualRowsPerRange, (int) remainingRows, concurrencyFactor);
-                }
-            }
-        }
-        catch (ReadTimeoutException e)
-        {
-            rangeMetrics.timeouts.mark();
-            throw e;
-        }
-        catch (UnavailableException e)
-        {
-            rangeMetrics.unavailables.mark();
-            throw e;
-        }
-        catch (ReadFailureException e)
-        {
-            rangeMetrics.failures.mark();
-            throw e;
-        }
-        finally
-        {
-            long latency = System.nanoTime() - startTime;
-            rangeMetrics.addNano(latency);
-            Keyspace.open(command.keyspace).getColumnFamilyStore(command.columnFamily).metric.coordinatorScanLatency.update(latency, TimeUnit.NANOSECONDS);
-        }
-        return command.postReconciliationProcessing(rows);
+        // Note that in general, a RangeCommandIterator will honor the command limit for each range, but will not enforce it globally.
+        RangeMerger mergedRanges = new RangeMerger(ranges, keyspace, consistencyLevel);
+        RangeCommandIterator rangeCommandIterator = new RangeCommandIterator(mergedRanges,
+                                                                             command,
+                                                                             concurrencyFactor,
+                                                                             maxConcurrencyFactor,
+                                                                             ranges.rangeCount(),
+                                                                             keyspace,
+                                                                             consistencyLevel);
+        return command.limits().filter(command.postReconciliationProcessing(rangeCommandIterator),
+                                       command.nowInSec(),
+                                       command.selectsFullPartition(),
+                                       command.metadata().enforceStrictLiveness());
     }
 
     public Map<String, List<String>> getSchemaVersions()
@@ -2100,19 +2361,30 @@
         return DatabaseDescriptor.hintedHandoffEnabled();
     }
 
-    public Set<String> getHintedHandoffEnabledByDC()
-    {
-        return DatabaseDescriptor.hintedHandoffEnabledByDC();
-    }
-
     public void setHintedHandoffEnabled(boolean b)
     {
-        DatabaseDescriptor.setHintedHandoffEnabled(b);
+        synchronized (StorageService.instance)
+        {
+            if (b)
+                StorageService.instance.checkServiceAllowedToStart("hinted handoff");
+
+            DatabaseDescriptor.setHintedHandoffEnabled(b);
+        }
     }
 
-    public void setHintedHandoffEnabledByDCList(String dcNames)
+    public void enableHintsForDC(String dc)
     {
-        DatabaseDescriptor.setHintedHandoffEnabled(dcNames);
+        DatabaseDescriptor.enableHintsForDC(dc);
+    }
+
+    public void disableHintsForDC(String dc)
+    {
+        DatabaseDescriptor.disableHintsForDC(dc);
+    }
+
+    public Set<String> getHintedHandoffDisabledDCs()
+    {
+        return DatabaseDescriptor.hintedHandoffDisabledDCs();
     }
 
     public int getMaxHintWindow()
@@ -2129,21 +2401,21 @@
     {
         if (DatabaseDescriptor.hintedHandoffEnabled())
         {
-            if (DatabaseDescriptor.shouldHintByDC())
+            Set<String> disabledDCs = DatabaseDescriptor.hintedHandoffDisabledDCs();
+            if (!disabledDCs.isEmpty())
             {
                 final String dc = DatabaseDescriptor.getEndpointSnitch().getDatacenter(ep);
-                // Disable DC specific hints
-                if (!DatabaseDescriptor.hintedHandoffEnabled(dc))
+                if (disabledDCs.contains(dc))
                 {
+                    Tracing.trace("Not hinting {} since its data center {} has been disabled {}", ep, dc, disabledDCs);
                     return false;
                 }
             }
-
             boolean hintWindowExpired = Gossiper.instance.getEndpointDowntime(ep) > DatabaseDescriptor.getMaxHintWindow();
             if (hintWindowExpired)
             {
-                HintedHandOffManager.instance.metrics.incrPastWindow(ep);
-                Tracing.trace("Not hinting {} which has been down {}ms", ep, Gossiper.instance.getEndpointDowntime(ep));
+                HintsService.instance.metrics.incrPastWindow(ep);
+                Tracing.trace("Not hinting {} which has been down {} ms", ep, Gossiper.instance.getEndpointDowntime(ep));
             }
             return !hintWindowExpired;
         }
@@ -2218,6 +2490,24 @@
     }
 
     /**
+     * This class captures metrics for views writes.
+     */
+    private static class ViewWriteMetricsWrapped extends BatchlogResponseHandler<IMutation>
+    {
+        public ViewWriteMetricsWrapped(AbstractWriteResponseHandler<IMutation> writeHandler, int i, BatchlogCleanup cleanup)
+        {
+            super(writeHandler, i, cleanup);
+            viewWriteMetrics.viewReplicasAttempted.inc(totalEndpoints());
+        }
+
+        public void response(MessageIn<IMutation> msg)
+        {
+            super.response(msg);
+            viewWriteMetrics.viewReplicasSuccess.inc();
+        }
+    }
+
+    /**
      * A Runnable that aborts if it doesn't start running before it times out
      */
     private static abstract class DroppableRunnable implements Runnable
@@ -2248,6 +2538,11 @@
             }
         }
 
+        protected MessagingService.Verb verb()
+        {
+            return verb;
+        }
+
         abstract protected void runMayThrow() throws Exception;
     }
 
@@ -2266,7 +2561,7 @@
             {
                 if (MessagingService.DROPPABLE_VERBS.contains(verb()))
                     MessagingService.instance().incrementDroppedMessages(verb);
-                HintRunnable runnable = new HintRunnable(FBUtilities.getBroadcastAddress())
+                HintRunnable runnable = new HintRunnable(Collections.singleton(FBUtilities.getBroadcastAddress()))
                 {
                     protected void runMayThrow() throws Exception
                     {
@@ -2297,11 +2592,11 @@
      */
     private abstract static class HintRunnable implements Runnable
     {
-        public final InetAddress target;
+        public final Collection<InetAddress> targets;
 
-        protected HintRunnable(InetAddress target)
+        protected HintRunnable(Collection<InetAddress> targets)
         {
-            this.target = target;
+            this.targets = targets;
         }
 
         public void run()
@@ -2316,8 +2611,9 @@
             }
             finally
             {
-                StorageMetrics.totalHintsInProgress.dec();
-                getHintsInProgressFor(target).decrementAndGet();
+                StorageMetrics.totalHintsInProgress.dec(targets.size());
+                for (InetAddress target : targets)
+                    getHintsInProgressFor(target).decrementAndGet();
             }
         }
 
@@ -2350,6 +2646,64 @@
             logger.warn("Some hints were not written before shutdown.  This is not supposed to happen.  You should (a) run repair, and (b) file a bug report");
     }
 
+    private static AtomicInteger getHintsInProgressFor(InetAddress destination)
+    {
+        try
+        {
+            return hintsInProgress.load(destination);
+        }
+        catch (Exception e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    public static Future<Void> submitHint(Mutation mutation, InetAddress target, AbstractWriteResponseHandler<IMutation> responseHandler)
+    {
+        return submitHint(mutation, Collections.singleton(target), responseHandler);
+    }
+
+    public static Future<Void> submitHint(Mutation mutation,
+                                          Collection<InetAddress> targets,
+                                          AbstractWriteResponseHandler<IMutation> responseHandler)
+    {
+        HintRunnable runnable = new HintRunnable(targets)
+        {
+            public void runMayThrow()
+            {
+                Set<InetAddress> validTargets = new HashSet<>(targets.size());
+                Set<UUID> hostIds = new HashSet<>(targets.size());
+                for (InetAddress target : targets)
+                {
+                    UUID hostId = StorageService.instance.getHostIdForEndpoint(target);
+                    if (hostId != null)
+                    {
+                        hostIds.add(hostId);
+                        validTargets.add(target);
+                    }
+                    else
+                        logger.debug("Discarding hint for endpoint not part of ring: {}", target);
+                }
+                logger.trace("Adding hints for {}", validTargets);
+                HintsService.instance.write(hostIds, Hint.create(mutation, System.currentTimeMillis()));
+                validTargets.forEach(HintsService.instance.metrics::incrCreatedHints);
+                // Notify the handler only for CL == ANY
+                if (responseHandler != null && responseHandler.consistencyLevel == ConsistencyLevel.ANY)
+                    responseHandler.response(null);
+            }
+        };
+
+        return submitHint(runnable);
+    }
+
+    private static Future<Void> submitHint(HintRunnable runnable)
+    {
+        StorageMetrics.totalHintsInProgress.inc(runnable.targets.size());
+        for (InetAddress target : runnable.targets)
+            getHintsInProgressFor(target).incrementAndGet();
+        return (Future<Void>) StageManager.getStage(Stage.MUTATION).submit(runnable);
+    }
+
     public Long getRpcTimeout() { return DatabaseDescriptor.getRpcTimeout(); }
     public void setRpcTimeout(Long timeoutInMillis) { DatabaseDescriptor.setRpcTimeout(timeoutInMillis); }
 
@@ -2390,4 +2744,66 @@
     public long getReadRepairRepairedBackground() {
         return ReadRepairMetrics.repairedBackground.getCount();
     }
+
+    public int getOtcBacklogExpirationInterval() {
+        return DatabaseDescriptor.getOtcBacklogExpirationInterval();
+    }
+
+    public void setOtcBacklogExpirationInterval(int intervalInMillis) {
+        DatabaseDescriptor.setOtcBacklogExpirationInterval(intervalInMillis);
+    }
+
+    @Override
+    public boolean getSnapshotOnDuplicateRowDetectionEnabled()
+    {
+        return DatabaseDescriptor.snapshotOnDuplicateRowDetection();
+    }
+
+    @Override
+    public void enableSnapshotOnDuplicateRowDetection()
+    {
+        DatabaseDescriptor.setSnapshotOnDuplicateRowDetection(true);
+    }
+
+    @Override
+    public void disableSnapshotOnDuplicateRowDetection()
+    {
+        DatabaseDescriptor.setSnapshotOnDuplicateRowDetection(false);
+    }
+
+    @Override
+    public boolean getCheckForDuplicateRowsDuringReads()
+    {
+        return DatabaseDescriptor.checkForDuplicateRowsDuringReads();
+    }
+
+    @Override
+    public void enableCheckForDuplicateRowsDuringReads()
+    {
+        DatabaseDescriptor.setCheckForDuplicateRowsDuringReads(true);
+    }
+
+    @Override
+    public void disableCheckForDuplicateRowsDuringReads()
+    {
+        DatabaseDescriptor.setCheckForDuplicateRowsDuringReads(false);
+    }
+
+    @Override
+    public boolean getCheckForDuplicateRowsDuringCompaction()
+    {
+        return DatabaseDescriptor.checkForDuplicateRowsDuringCompaction();
+    }
+
+    @Override
+    public void enableCheckForDuplicateRowsDuringCompaction()
+    {
+        DatabaseDescriptor.setCheckForDuplicateRowsDuringCompaction(true);
+    }
+
+    @Override
+    public void disableCheckForDuplicateRowsDuringCompaction()
+    {
+        DatabaseDescriptor.setCheckForDuplicateRowsDuringCompaction(false);
+    }
 }

diff --git a/src/java/org/apache/cassandra/service/StorageProxyMBean.java b/src/java/org/apache/cassandra/service/StorageProxyMBean.java
index e619892..047934c 100644
--- a/src/java/org/apache/cassandra/service/StorageProxyMBean.java
+++ b/src/java/org/apache/cassandra/service/StorageProxyMBean.java

@@ -21,13 +21,16 @@
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
+
 public interface StorageProxyMBean
 {
     public long getTotalHints();
     public boolean getHintedHandoffEnabled();
-    public Set<String> getHintedHandoffEnabledByDC();
     public void setHintedHandoffEnabled(boolean b);
-    public void setHintedHandoffEnabledByDCList(String dcs);
+    public void enableHintsForDC(String dc);
+    public void disableHintsForDC(String dc);
+    public Set<String> getHintedHandoffDisabledDCs();
     public int getMaxHintWindow();
     public void setMaxHintWindow(int ms);
     public int getMaxHintsInProgress();
@@ -58,6 +61,20 @@
     public long getReadRepairRepairedBlocking();
     public long getReadRepairRepairedBackground();
 
+    public int getOtcBacklogExpirationInterval();
+    public void setOtcBacklogExpirationInterval(int intervalInMillis);
+
     /** Returns each live node's schema version */
     public Map<String, List<String>> getSchemaVersions();
+
+    void enableSnapshotOnDuplicateRowDetection();
+    void disableSnapshotOnDuplicateRowDetection();
+    boolean getSnapshotOnDuplicateRowDetectionEnabled();
+
+    boolean getCheckForDuplicateRowsDuringReads();
+    void enableCheckForDuplicateRowsDuringReads();
+    void disableCheckForDuplicateRowsDuringReads();
+    boolean getCheckForDuplicateRowsDuringCompaction();
+    void enableCheckForDuplicateRowsDuringCompaction();
+    void disableCheckForDuplicateRowsDuringCompaction();
 }

diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index b1d8e26..d287788 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java

@@ -22,18 +22,18 @@
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
-import java.sql.Time;
 import java.util.*;
 import java.util.Map.Entry;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
+import javax.annotation.Nullable;
 import javax.management.*;
 import javax.management.openmbean.TabularData;
 import javax.management.openmbean.TabularDataSupport;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
+import com.google.common.base.Preconditions;
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
 import com.google.common.util.concurrent.*;
@@ -45,20 +45,32 @@
 import ch.qos.logback.classic.jmx.JMXConfiguratorMBean;
 import ch.qos.logback.classic.spi.ILoggingEvent;
 import ch.qos.logback.core.Appender;
+import ch.qos.logback.core.hook.DelayingShutdownHook;
 import org.apache.cassandra.auth.AuthKeyspace;
 import org.apache.cassandra.auth.AuthMigrationListener;
+import org.apache.cassandra.batchlog.BatchRemoveVerbHandler;
+import org.apache.cassandra.batchlog.BatchStoreVerbHandler;
+import org.apache.cassandra.batchlog.BatchlogManager;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.concurrent.StageManager;
-import org.apache.cassandra.config.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.ViewDefinition;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token.TokenFactory;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.gms.*;
-import org.apache.cassandra.io.sstable.SSTableDeletingTask;
+import org.apache.cassandra.hints.HintVerbHandler;
+import org.apache.cassandra.hints.HintsService;
 import org.apache.cassandra.io.sstable.SSTableLoader;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.locator.*;
@@ -66,6 +78,8 @@
 import org.apache.cassandra.net.*;
 import org.apache.cassandra.repair.*;
 import org.apache.cassandra.repair.messages.RepairOption;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.service.paxos.CommitVerbHandler;
 import org.apache.cassandra.service.paxos.PrepareVerbHandler;
 import org.apache.cassandra.service.paxos.ProposeVerbHandler;
@@ -80,7 +94,12 @@
 import org.apache.cassandra.utils.progress.jmx.JMXProgressSupport;
 import org.apache.cassandra.utils.progress.jmx.LegacyJMXProgressSupport;
 
+import static java.util.Arrays.asList;
 import static java.util.concurrent.TimeUnit.MINUTES;
+import static java.util.stream.Collectors.toList;
+import static org.apache.cassandra.index.SecondaryIndexManager.getIndexName;
+import static org.apache.cassandra.index.SecondaryIndexManager.isIndexColumnFamily;
+import static org.apache.cassandra.service.MigrationManager.evolveSystemKeyspace;
 
 /**
  * This abstraction contains the token/identifier of this node
@@ -118,21 +137,22 @@
     /* This abstraction maintains the token/endpoint metadata information */
     private TokenMetadata tokenMetadata = new TokenMetadata();
 
-    public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
+    public volatile VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(tokenMetadata.partitioner);
 
     private Thread drainOnShutdown = null;
-    private volatile boolean inShutdownHook = false;
+    private volatile boolean isShutdown = false;
 
     public static final StorageService instance = new StorageService();
 
+    @Deprecated
     public boolean isInShutdownHook()
     {
-        return inShutdownHook;
+        return isShutdown();
     }
 
-    public static IPartitioner getPartitioner()
+    public boolean isShutdown()
     {
-        return DatabaseDescriptor.getPartitioner();
+        return isShutdown;
     }
 
     public Collection<Range<Token>> getLocalRanges(String keyspaceName)
@@ -161,17 +181,19 @@
     /* we bootstrap but do NOT join the ring unless told to do so */
     private boolean isSurveyMode = Boolean.parseBoolean(System.getProperty
             ("cassandra.write_survey", "false"));
+
     /* true if node is rebuilding and receiving data */
     private final AtomicBoolean isRebuilding = new AtomicBoolean();
 
     private boolean initialized;
     private volatile boolean joined = false;
+    private final AtomicBoolean authSetupCalled = new AtomicBoolean(false);
 
     /* the probability for tracing any particular request, 0 disables tracing and 1 enables for all */
     private double traceProbability = 0.0;
 
     private static enum Mode { STARTING, NORMAL, JOINING, LEAVING, DECOMMISSIONED, MOVING, DRAINING, DRAINED }
-    private Mode operationMode = Mode.STARTING;
+    private volatile Mode operationMode = Mode.STARTING;
 
     /* Used for tracking drain progress */
     private volatile int totalCFs, remainingCFs;
@@ -195,8 +217,6 @@
 
     private final StreamStateStore streamStateStore = new StreamStateStore();
 
-    private final AtomicBoolean doneAuthSetup = new AtomicBoolean(false);
-
     public boolean isSurveyMode()
     {
         return isSurveyMode;
@@ -242,7 +262,7 @@
         /* register the verb handlers */
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.MUTATION, new MutationVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ_REPAIR, new ReadRepairVerbHandler());
-        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ, new ReadVerbHandler());
+        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.READ, new ReadCommandVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.RANGE_SLICE, new RangeSliceVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAGED_RANGE, new RangeSliceVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.COUNTER_MUTATION, new CounterMutationVerbHandler());
@@ -250,6 +270,7 @@
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAXOS_PREPARE, new PrepareVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAXOS_PROPOSE, new ProposeVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.PAXOS_COMMIT, new CommitVerbHandler());
+        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.HINT, new HintVerbHandler());
 
         // see BootStrapper for a summary of how the bootstrap verbs interact
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.REPLICATION_FINISHED, new ReplicationFinishedVerbHandler());
@@ -268,6 +289,9 @@
 
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.SNAPSHOT, new SnapshotVerbHandler());
         MessagingService.instance().registerVerbHandlers(MessagingService.Verb.ECHO, new EchoVerbHandler());
+
+        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.BATCH_STORE, new BatchStoreVerbHandler());
+        MessagingService.instance().registerVerbHandlers(MessagingService.Verb.BATCH_REMOVE, new BatchRemoveVerbHandler());
     }
 
     public void registerDaemon(CassandraDaemon daemon)
@@ -297,10 +321,12 @@
     }
 
     // should only be called via JMX
-    public void startGossiping()
+    public synchronized void startGossiping()
     {
         if (!initialized)
         {
+            checkServiceAllowedToStart("gossip");
+
             logger.warn("Starting gossip by operator request");
             Collection<Token> tokens = SystemKeyspace.getSavedTokens();
 
@@ -326,8 +352,10 @@
     }
 
     // should only be called via JMX
-    public void startRPCServer()
+    public synchronized void startRPCServer()
     {
+        checkServiceAllowedToStart("thrift");
+
         if (daemon == null)
         {
             throw new IllegalStateException("No configured daemon");
@@ -342,20 +370,19 @@
             {
                 if (StorageService.instance.isBootstrapMode() || DatabaseDescriptor.getAuthenticator().requireAuthentication())
                 {
-                    throw new IllegalStateException("Not starting RPC server in write_survey mode as " +
-                            "it's bootstrapping or auth is enabled");
+                    throw new IllegalStateException("Not starting RPC server in write_survey mode as it's bootstrapping or " +
+                            "auth is enabled");
                 }
             }
             else
             {
                 if (!SystemKeyspace.bootstrapComplete())
                 {
-                    throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap" +
-                            " state and resume. For more, see `nodetool help bootstrap`");
+                    throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap state and resume. For more, see `nodetool help bootstrap`");
                 }
             }
         }
-        
+
         daemon.thriftServer.start();
     }
 
@@ -378,23 +405,9 @@
         return daemon.thriftServer.isRunning();
     }
 
-    public void startNativeTransport()
+    public synchronized void startNativeTransport()
     {
-        // We only start transports if bootstrap has completed and we're not in survey mode, OR if we are in
-        // survey mode and streaming has completed but we're not using auth.
-        // OR if we have not joined the ring yet.
-        if (hasJoined() &&
-                ((!isSurveyMode() && !SystemKeyspace.bootstrapComplete()) ||
-                (isSurveyMode() && isBootstrapMode())))
-        {
-            throw new IllegalStateException("Node is not yet bootstrapped completely. Use nodetool to check bootstrap" +
-                    " state and resume. For more, see `nodetool help bootstrap`");
-        }
-        if (hasJoined() && isSurveyMode() && !SystemKeyspace.bootstrapComplete() &&
-                DatabaseDescriptor.getAuthenticator().requireAuthentication())
-        {
-            throw new IllegalStateException("Not starting client transports as write_survey mode is enabled");
-        }
+        checkServiceAllowedToStart("native transport");
 
         if (daemon == null)
         {
@@ -403,7 +416,7 @@
 
         try
         {
-            daemon.nativeServer.start();
+            daemon.startNativeTransport();
         }
         catch (Exception e)
         {
@@ -417,17 +430,33 @@
         {
             throw new IllegalStateException("No configured daemon");
         }
-        if (daemon.nativeServer != null)
-            daemon.nativeServer.stop();
+        daemon.stopNativeTransport();
     }
 
     public boolean isNativeTransportRunning()
     {
-        if ((daemon == null) || (daemon.nativeServer == null))
+        if (daemon == null)
         {
             return false;
         }
-        return daemon.nativeServer.isRunning();
+        return daemon.isNativeTransportRunning();
+    }
+
+    public int getMaxNativeProtocolVersion()
+    {
+        if (daemon == null)
+        {
+            throw new IllegalStateException("No configured daemon");
+        }
+        return daemon.getMaxNativeProtocolVersion();
+    }
+
+    private void refreshMaxNativeProtocolVersion()
+    {
+        if (daemon != null)
+        {
+            daemon.refreshMaxNativeProtocolVersion();
+        }
     }
 
     public void stopTransports()
@@ -449,8 +478,19 @@
         }
     }
 
+    /**
+     * Set the Gossip flag RPC_READY to false and then
+     * shutdown the client services (thrift and CQL).
+     *
+     * Note that other nodes will do this for us when
+     * they get the Gossip shutdown message, so even if
+     * we don't get time to broadcast this, it is not a problem.
+     *
+     * See {@link Gossiper#markAsShutdown(InetAddress)}
+     */
     private void shutdownClientServers()
     {
+        setRpcReady(false);
         stopRPCServer();
         stopNativeTransport();
     }
@@ -492,7 +532,6 @@
 
         // make magic happen
         Map<InetAddress, EndpointState> epStates = Gossiper.instance.doShadowRound();
-
         // now that we've gossiped at least once, we should be able to find the node we're replacing
         if (epStates.get(DatabaseDescriptor.getReplaceAddress())== null)
             throw new RuntimeException("Cannot replace_address " + DatabaseDescriptor.getReplaceAddress() + " because it doesn't exist in gossip");
@@ -502,7 +541,7 @@
             VersionedValue tokensVersionedValue = epStates.get(DatabaseDescriptor.getReplaceAddress()).getApplicationState(ApplicationState.TOKENS);
             if (tokensVersionedValue == null)
                 throw new RuntimeException("Could not find tokens for " + DatabaseDescriptor.getReplaceAddress() + " to replace");
-            Collection<Token> tokens = TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(tokensVersionedValue.toBytes())));
+            Collection<Token> tokens = TokenSerializer.deserialize(tokenMetadata.partitioner, new DataInputStream(new ByteArrayInputStream(tokensVersionedValue.toBytes())));
 
             if (isReplacingSameAddress())
             {
@@ -560,6 +599,21 @@
             MessagingService.instance().listen();
     }
 
+    public void populateTokenMetadata()
+    {
+        if (Boolean.parseBoolean(System.getProperty("cassandra.load_ring_state", "true")))
+        {
+            logger.info("Populating token metadata from system tables");
+            Multimap<InetAddress, Token> loadedTokens = SystemKeyspace.loadTokens();
+            if (!shouldBootstrap()) // if we have not completed bootstrapping, we should not add ourselves as a normal token
+                loadedTokens.putAll(FBUtilities.getBroadcastAddress(), SystemKeyspace.getSavedTokens());
+            for (InetAddress ep : loadedTokens.keySet())
+                tokenMetadata.updateNormalTokens(loadedTokens.get(ep), ep);
+
+            logger.info("Token metadata: {}", tokenMetadata);
+        }
+    }
+
     public synchronized void initServer() throws ConfigurationException
     {
         initServer(RING_DELAY);
@@ -570,7 +624,7 @@
         logger.info("Cassandra version: {}", FBUtilities.getReleaseVersionString());
         logger.info("Thrift API version: {}", cassandraConstants.VERSION);
         logger.info("CQL supported versions: {} (default: {})",
-                    StringUtils.join(ClientState.getCQLSupportedVersion(), ","), ClientState.DEFAULT_CQL_VERSION);
+                StringUtils.join(ClientState.getCQLSupportedVersion(), ","), ClientState.DEFAULT_CQL_VERSION);
 
         initialized = true;
 
@@ -600,72 +654,35 @@
                 }
                 else
                 {
-                    tokenMetadata.updateNormalTokens(loadedTokens.get(ep), ep);
                     if (loadedHostIds.containsKey(ep))
                         tokenMetadata.updateHostId(loadedHostIds.get(ep), ep);
-                    Gossiper.instance.addSavedEndpoint(ep);
+                    Gossiper.runInGossipStageBlocking(() -> Gossiper.instance.addSavedEndpoint(ep));
                 }
             }
         }
 
         // daemon threads, like our executors', continue to run while shutdown hooks are invoked
-        drainOnShutdown = new Thread(new WrappedRunnable()
+        drainOnShutdown = new Thread(NamedThreadFactory.threadLocalDeallocator(new WrappedRunnable()
         {
             @Override
-            public void runMayThrow() throws InterruptedException
+            public void runMayThrow() throws InterruptedException, ExecutionException, IOException
             {
-                inShutdownHook = true;
-                ExecutorService counterMutationStage = StageManager.getStage(Stage.COUNTER_MUTATION);
-                ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
-                if (mutationStage.isShutdown() && counterMutationStage.isShutdown())
-                    return; // drained already
-
-                if (daemon != null)
-                	shutdownClientServers();
-                ScheduledExecutors.optionalTasks.shutdown();
-                Gossiper.instance.stop();
-
-                // In-progress writes originating here could generate hints to be written, so shut down MessagingService
-                // before mutation stage, so we can get all the hints saved before shutting down
-                MessagingService.instance().shutdown();
-                counterMutationStage.shutdown();
-                mutationStage.shutdown();
-                counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
-                mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
-                StorageProxy.instance.verifyNoHintsInProgress();
-
-                List<Future<?>> flushes = new ArrayList<>();
-                for (Keyspace keyspace : Keyspace.all())
-                {
-                    KSMetaData ksm = Schema.instance.getKSMetaData(keyspace.getName());
-                    if (!ksm.durableWrites)
-                    {
-                        for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
-                            flushes.add(cfs.forceFlush());
-                    }
-                }
-                try
-                {
-                    FBUtilities.waitOnFutures(flushes);
-                }
-                catch (Throwable t)
-                {
-                    JVMStabilityInspector.inspectThrowable(t);
-                    // don't let this stop us from shutting down the commitlog and other thread pools
-                    logger.warn("Caught exception while waiting for memtable flushes during shutdown hook", t);
-                }
-
-                CommitLog.instance.shutdownBlocking();
+                drain(true);
 
                 if (FBUtilities.isWindows())
                     WindowsTimer.endTimerPeriod(DatabaseDescriptor.getWindowsTimerInterval());
 
+                // Cleanup logback
+                DelayingShutdownHook logbackHook = new DelayingShutdownHook();
+                logbackHook.setContext((LoggerContext)LoggerFactory.getILoggerFactory());
+                logbackHook.run();
+
                 // wait for miscellaneous tasks like sstable and commitlog segment deletion
                 ScheduledExecutors.nonPeriodicTasks.shutdown();
                 if (!ScheduledExecutors.nonPeriodicTasks.awaitTermination(1, MINUTES))
                     logger.warn("Miscellaneous task executor still busy after one minute; proceeding with shutdown");
             }
-        }, "StorageServiceShutdownHook");
+        }), "StorageServiceShutdownHook");
         Runtime.getRuntime().addShutdownHook(drainOnShutdown);
 
         replacing = DatabaseDescriptor.isReplacing();
@@ -705,7 +722,7 @@
                 states.add(Pair.create(ApplicationState.STATUS, valueFactory.hibernate(true)));
                 Gossiper.instance.addLocalApplicationStates(states);
             }
-            doAuthSetup();
+            doAuthSetup(true);
             logger.info("Not joining ring as requested. Use JMX (StorageService->joinRing()) to initiate ring joining");
         }
     }
@@ -727,12 +744,23 @@
         return DatabaseDescriptor.isAutoBootstrap() && !SystemKeyspace.bootstrapComplete() && !DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddress());
     }
 
-    private void prepareToJoin() throws ConfigurationException
+    @VisibleForTesting
+    public void prepareToJoin() throws ConfigurationException
     {
         if (!joined)
         {
             Map<ApplicationState, VersionedValue> appStates = new EnumMap<>(ApplicationState.class);
 
+            if (SystemKeyspace.wasDecommissioned())
+            {
+                if (Boolean.getBoolean("cassandra.override_decommission"))
+                {
+                    logger.warn("This node was decommissioned, but overriding by operator request.");
+                    SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.COMPLETED);
+                }
+                else
+                    throw new ConfigurationException("This node was decommissioned and will not rejoin the ring unless cassandra.override_decommission=true has been set, or all existing data is removed and the node is bootstrapped again");
+            }
             if (replacing && !joinRing)
                 throw new ConfigurationException("Cannot set both join_ring=false and attempt to replace a node");
             if (DatabaseDescriptor.getReplaceTokens().size() > 0 || DatabaseDescriptor.getReplaceNode() != null)
@@ -758,6 +786,15 @@
             {
                 checkForEndpointCollision();
             }
+            else if (SystemKeyspace.bootstrapComplete())
+            {
+                Preconditions.checkState(!Config.isClientMode());
+                // tokens are only ever saved to system.local after bootstrap has completed and we're joining the ring,
+                // or when token update operations (move, decom) are completed
+                Collection<Token> savedTokens = SystemKeyspace.getSavedTokens();
+                if (!savedTokens.isEmpty())
+                    appStates.put(ApplicationState.TOKENS, valueFactory.tokens(savedTokens));
+            }
 
             // have to start the gossip service before we can see any info on other nodes.  this is necessary
             // for bootstrap to get the load info it needs.
@@ -781,12 +818,13 @@
                 MessagingService.instance().listen();
             LoadBroadcaster.instance.startBroadcasting();
 
-            HintedHandOffManager.instance.start();
+            HintsService.instance.startDispatch();
             BatchlogManager.instance.start();
         }
     }
 
-    private void joinTokenRing(int delay) throws ConfigurationException
+    @VisibleForTesting
+    public void joinTokenRing(int delay) throws ConfigurationException
     {
         joined = true;
 
@@ -832,12 +870,13 @@
                 }
                 Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
             }
-            // if our schema hasn't matched yet, keep sleeping until it does
+            // if our schema hasn't matched yet, wait until it has
+            // we do this by waiting for all in-flight migration requests and responses to complete
             // (post CASSANDRA-1391 we don't expect this to be necessary very often, but it doesn't hurt to be careful)
-            while (!MigrationManager.isReadyForBootstrap())
+            if (!MigrationManager.isReadyForBootstrap())
             {
                 setMode(Mode.JOINING, "waiting for schema information to complete", true);
-                Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
+                MigrationManager.waitUntilReadyForBootstrap();
             }
             setMode(Mode.JOINING, "schema complete, ready to bootstrap", true);
             setMode(Mode.JOINING, "waiting for pending range calculation", true);
@@ -865,7 +904,7 @@
                     throw new UnsupportedOperationException(s);
                 }
                 setMode(Mode.JOINING, "getting bootstrap token", true);
-                bootstrapTokens = BootStrapper.getBootstrapTokens(tokenMetadata);
+                bootstrapTokens = BootStrapper.getBootstrapTokens(tokenMetadata, FBUtilities.getBroadcastAddress());
             }
             else
             {
@@ -934,7 +973,7 @@
                 {
                     bootstrapTokens = new ArrayList<>(initialTokens.size());
                     for (String token : initialTokens)
-                        bootstrapTokens.add(getPartitioner().getTokenFactory().fromString(token));
+                        bootstrapTokens.add(getTokenFactory().fromString(token));
                     logger.info("Saved tokens not found. Using configuration value: {}", bootstrapTokens);
                 }
             }
@@ -947,9 +986,7 @@
             }
         }
 
-        // if we don't have system_traces keyspace at this point, then create it manually
-        ensureTraceKeyspace();
-        maybeAddOrUpdateKeyspace(SystemDistributedKeyspace.definition());
+        setUpDistributedSystemKeyspaces();
 
         if (!isSurveyMode)
         {
@@ -960,8 +997,10 @@
                 // remove the existing info about the replaced node.
                 if (!current.isEmpty())
                 {
-                    for (InetAddress existing : current)
-                        Gossiper.instance.replacedEndpoint(existing);
+                    Gossiper.runInGossipStageBlocking(() -> {
+                        for (InetAddress existing : current)
+                            Gossiper.instance.replacedEndpoint(existing);
+                    });
                 }
             }
             else
@@ -981,7 +1020,7 @@
     @VisibleForTesting
     public void ensureTraceKeyspace()
     {
-        maybeAddOrUpdateKeyspace(TraceKeyspace.definition());
+        evolveSystemKeyspace(TraceKeyspace.metadata(), TraceKeyspace.GENERATION).ifPresent(MigrationManager::announceGlobally);
     }
 
     public static boolean isReplacingSameAddress()
@@ -1042,65 +1081,34 @@
         setTokens(tokens);
 
         assert tokenMetadata.sortedTokens().size() > 0;
-        doAuthSetup();
+        doAuthSetup(false);
     }
 
-    private void doAuthSetup()
+    private void doAuthSetup(boolean setUpSchema)
     {
-        if (!doneAuthSetup.getAndSet(true))
+        if (!authSetupCalled.getAndSet(true))
         {
-            maybeAddOrUpdateKeyspace(AuthKeyspace.definition());
+            if (setUpSchema)
+                evolveSystemKeyspace(AuthKeyspace.metadata(), AuthKeyspace.GENERATION).ifPresent(MigrationManager::announceGlobally);
 
             DatabaseDescriptor.getRoleManager().setup();
             DatabaseDescriptor.getAuthenticator().setup();
             DatabaseDescriptor.getAuthorizer().setup();
+
             MigrationManager.instance.register(new AuthMigrationListener());
         }
     }
 
-    private void maybeAddKeyspace(KSMetaData ksm)
+    private void setUpDistributedSystemKeyspaces()
     {
-        try
-        {
-            MigrationManager.announceNewKeyspace(ksm, 0, false);
-        }
-        catch (AlreadyExistsException e)
-        {
-            logger.debug("Attempted to create new keyspace {}, but it already exists", ksm.name);
-        }
-    }
+        Collection<Mutation> changes = new ArrayList<>(3);
 
-    /**
-     * Ensure the schema of a pseudo-system keyspace (a distributed system keyspace: traces, auth and the so-called distributedKeyspace),
-     * is up to date with what we expected (creating it if it doesn't exist and updating tables that may have been upgraded).
-     */
-    private void maybeAddOrUpdateKeyspace(KSMetaData expected)
-    {
-        // Note that want to deal with the keyspace and its table a bit differently: for the keyspace definition
-        // itself, we want to create it if it doesn't exist yet, but if it does exist, we don't want to modify it,
-        // because user can modify the definition to change the replication factor (#6016) and we don't want to
-        // override it. For the tables however, we have to deal with the fact that new version can add new columns
-        // (#8162 being an example), so even if the table definition exists, we still need to force the "current"
-        // version of the schema, the one the node will be expecting.
+        evolveSystemKeyspace(            TraceKeyspace.metadata(),             TraceKeyspace.GENERATION).ifPresent(changes::add);
+        evolveSystemKeyspace(SystemDistributedKeyspace.metadata(), SystemDistributedKeyspace.GENERATION).ifPresent(changes::add);
+        evolveSystemKeyspace(             AuthKeyspace.metadata(),              AuthKeyspace.GENERATION).ifPresent(changes::add);
 
-        KSMetaData defined = Schema.instance.getKSMetaData(expected.name);
-        // If the keyspace doesn't exist, create it
-        if (defined == null)
-        {
-            maybeAddKeyspace(expected);
-            defined = Schema.instance.getKSMetaData(expected.name);
-        }
-
-        // While the keyspace exists, it might miss table or have outdated one
-        // There is also the potential for a race, as schema migrations add the bare
-        // keyspace into Schema.instance before adding its tables, so double check that
-        // all the expected tables are present
-        for (CFMetaData expectedTable : expected.cfMetaData().values())
-        {
-            CFMetaData definedTable = defined.cfMetaData().get(expectedTable.cfName);
-            if (definedTable == null || !definedTable.equals(expectedTable))
-                MigrationManager.forceAnnounceNewColumnFamily(expectedTable);
-        }
+        if (!changes.isEmpty())
+            MigrationManager.announce(changes, false);
     }
 
     public boolean isJoined()
@@ -1110,7 +1118,7 @@
 
     public void rebuild(String sourceDc)
     {
-        // check on going rebuild
+        // check ongoing rebuild
         if (!isRebuilding.compareAndSet(false, true))
         {
             throw new IllegalStateException("Node is still rebuilding. Check nodetool netstats.");
@@ -1131,7 +1139,7 @@
             if (sourceDc != null)
                 streamer.addSourceFilter(new RangeStreamer.SingleDatacenterFilter(DatabaseDescriptor.getEndpointSnitch(), sourceDc));
 
-            for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+            for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
                 streamer.addRanges(keyspaceName, getLocalRanges(keyspaceName));
 
             StreamResultFuture resultFuture = streamer.fetchAsync();
@@ -1246,7 +1254,6 @@
             tokenMetadata.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
             SystemKeyspace.removeEndpoint(DatabaseDescriptor.getReplaceAddress());
         }
-
         if (!Gossiper.instance.seenAnySeed())
             throw new IllegalStateException("Unable to contact any seeds!");
 
@@ -1260,24 +1267,11 @@
         BootStrapper bootstrapper = new BootStrapper(FBUtilities.getBroadcastAddress(), tokens, tokenMetadata);
         bootstrapper.addProgressListener(progressSupport);
         ListenableFuture<StreamState> bootstrapStream = bootstrapper.bootstrap(streamStateStore, !replacing && useStrictConsistency); // handles token update
-        Futures.addCallback(bootstrapStream, new FutureCallback<StreamState>()
-        {
-            @Override
-            public void onSuccess(StreamState streamState)
-            {
-                isBootstrapMode = false;
-                logger.info("Bootstrap completed! for the tokens {}", tokens);
-            }
-
-            @Override
-            public void onFailure(Throwable e)
-            {
-                logger.warn("Error during bootstrap.", e);
-            }
-        });
         try
         {
             bootstrapStream.get();
+            bootstrapFinished();
+            logger.info("Bootstrap completed for tokens {}", tokens);
             return true;
         }
         catch (Throwable e)
@@ -1287,6 +1281,25 @@
         }
     }
 
+    /**
+     * All MVs have been created during bootstrap, so mark them as built
+     */
+    private void markViewsAsBuilt() {
+        for (String keyspace : Schema.instance.getUserKeyspaces())
+        {
+            for (ViewDefinition view: Schema.instance.getKSMetaData(keyspace).views)
+                SystemKeyspace.finishViewBuildStatus(view.ksName, view.viewName);
+        }
+    }
+
+    /**
+     * Called when bootstrap did finish successfully
+     */
+    private void bootstrapFinished() {
+        markViewsAsBuilt();
+        isBootstrapMode = false;
+    }
+
     public boolean resumeBootstrap()
     {
         if (isBootstrapMode && SystemKeyspace.bootstrapInProgress())
@@ -1304,20 +1317,30 @@
                 @Override
                 public void onSuccess(StreamState streamState)
                 {
-                    isBootstrapMode = false;
-                    if (isSurveyMode)
+                    try
                     {
-                        logger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
+                        bootstrapFinished();
+                        if (isSurveyMode)
+                        {
+                            logger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
+                        }
+                        else
+                        {
+                            isSurveyMode = false;
+                            progressSupport.progress("bootstrap", ProgressEvent.createNotification("Joining ring..."));
+                            finishJoiningRing(bootstrapTokens);
+                        }
+                        progressSupport.progress("bootstrap", new ProgressEvent(ProgressEventType.COMPLETE, 1, 1, "Resume bootstrap complete"));
+                        if (!isNativeTransportRunning())
+                            daemon.initializeNativeTransport();
+                        daemon.start();
+                        logger.info("Resume complete");
                     }
-                    else
+                    catch(Exception e)
                     {
-                        isSurveyMode = false;
-                        progressSupport.progress("bootstrap", ProgressEvent.createNotification("Joining ring..."));
-                        finishJoiningRing(bootstrapTokens);
+                        onFailure(e);
+                        throw e;
                     }
-                    progressSupport.progress("bootstrap", new ProgressEvent(ProgressEventType.COMPLETE, 1, 1, "Resume bootstrap complete"));
-                    daemon.start();
-                    logger.info("Resume complete");
                 }
 
                 @Override
@@ -1374,9 +1397,9 @@
         return bgMonitor.getSeverity(endpoint);
     }
 
-    public void shutdownBGMonitorAndWait(long timeout, TimeUnit units) throws TimeoutException, InterruptedException
+    public void shutdownBGMonitorAndWait(long timeout, TimeUnit unit) throws TimeoutException, InterruptedException
     {
-        bgMonitor.shutdownAndWait(timeout, units);
+        bgMonitor.shutdownAndWait(timeout, unit);
     }
 
     /**
@@ -1436,7 +1459,7 @@
         // some people just want to get a visual representation of things. Allow null and set it to the first
         // non-system keyspace.
         if (keyspace == null)
-            keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
+            keyspace = Schema.instance.getNonLocalStrategyKeyspaces().get(0);
 
         Map<List<String>, List<String>> map = new HashMap<>();
         for (Map.Entry<Range<Token>, Collection<InetAddress>> entry : tokenMetadata.getPendingRangesMM(keyspace).asMap().entrySet())
@@ -1497,7 +1520,7 @@
         // some people just want to get a visual representation of things. Allow null and set it to the first
         // non-system keyspace.
         if (keyspace == null)
-            keyspace = Schema.instance.getNonSystemKeyspaces().get(0);
+            keyspace = Schema.instance.getNonLocalStrategyKeyspaces().get(0);
 
         List<Range<Token>> ranges = getAllRanges(sortedTokens);
         return constructRangeToEndpointMap(keyspace, ranges);
@@ -1561,7 +1584,7 @@
             throw new InvalidRequestException("There is no ring for the keyspace: " + keyspace);
 
         List<TokenRange> ranges = new ArrayList<>();
-        Token.TokenFactory tf = getPartitioner().getTokenFactory();
+        Token.TokenFactory tf = getTokenFactory();
 
         Map<Range<Token>, List<InetAddress>> rangeToAddressMap =
                 includeOnlyLocalDC
@@ -1618,6 +1641,11 @@
         return getTokenMetadata().getHostId(FBUtilities.getBroadcastAddress()).toString();
     }
 
+    public UUID getLocalHostUUID()
+    {
+        return getTokenMetadata().getHostId(FBUtilities.getBroadcastAddress());
+    }
+
     public Map<String, String> getHostIdMap()
     {
         return getEndpointToHostId();
@@ -1741,23 +1769,24 @@
 
             if (getTokenMetadata().isMember(endpoint))
             {
+                final ExecutorService executor = StageManager.getStage(Stage.MUTATION);
                 switch (state)
                 {
                     case RELEASE_VERSION:
-                        SystemKeyspace.updatePeerInfo(endpoint, "release_version", value.value);
+                        SystemKeyspace.updatePeerReleaseVersion(endpoint, value.value, this::refreshMaxNativeProtocolVersion, executor);
                         break;
                     case DC:
                         updateTopology(endpoint);
-                        SystemKeyspace.updatePeerInfo(endpoint, "data_center", value.value);
+                        SystemKeyspace.updatePeerInfo(endpoint, "data_center", value.value, executor);
                         break;
                     case RACK:
                         updateTopology(endpoint);
-                        SystemKeyspace.updatePeerInfo(endpoint, "rack", value.value);
+                        SystemKeyspace.updatePeerInfo(endpoint, "rack", value.value, executor);
                         break;
                     case RPC_ADDRESS:
                         try
                         {
-                            SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(value.value));
+                            SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(value.value), executor);
                         }
                         catch (UnknownHostException e)
                         {
@@ -1765,15 +1794,18 @@
                         }
                         break;
                     case SCHEMA:
-                        SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(value.value));
+                        SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(value.value), executor);
                         MigrationManager.instance.scheduleSchemaPull(endpoint, epState);
                         break;
                     case HOST_ID:
-                        SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(value.value));
+                        SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(value.value), executor);
                         break;
                     case RPC_READY:
                         notifyRpcChange(endpoint, epState.isRpcReady());
                         break;
+                    case NET_VERSION:
+                        updateNetVersion(endpoint, value);
+                        break;
                 }
             }
         }
@@ -1784,6 +1816,18 @@
         return value.value.split(VersionedValue.DELIMITER_STR, -1);
     }
 
+    private void updateNetVersion(InetAddress endpoint, VersionedValue value)
+    {
+        try
+        {
+            MessagingService.instance().setVersion(endpoint, Integer.valueOf(value.value));
+        }
+        catch (NumberFormatException e)
+        {
+            throw new AssertionError("Got invalid value for NET_VERSION application state: " + value.value);
+        }
+    }
+
     public void updateTopology(InetAddress endpoint)
     {
         if (getTokenMetadata().isMember(endpoint))
@@ -1800,23 +1844,24 @@
     private void updatePeerInfo(InetAddress endpoint)
     {
         EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
+        final ExecutorService executor = StageManager.getStage(Stage.MUTATION);
         for (Map.Entry<ApplicationState, VersionedValue> entry : epState.states())
         {
             switch (entry.getKey())
             {
                 case RELEASE_VERSION:
-                    SystemKeyspace.updatePeerInfo(endpoint, "release_version", entry.getValue().value);
+                    SystemKeyspace.updatePeerReleaseVersion(endpoint, entry.getValue().value, this::refreshMaxNativeProtocolVersion, executor);
                     break;
                 case DC:
-                    SystemKeyspace.updatePeerInfo(endpoint, "data_center", entry.getValue().value);
+                    SystemKeyspace.updatePeerInfo(endpoint, "data_center", entry.getValue().value, executor);
                     break;
                 case RACK:
-                    SystemKeyspace.updatePeerInfo(endpoint, "rack", entry.getValue().value);
+                    SystemKeyspace.updatePeerInfo(endpoint, "rack", entry.getValue().value, executor);
                     break;
                 case RPC_ADDRESS:
                     try
                     {
-                        SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(entry.getValue().value));
+                        SystemKeyspace.updatePeerInfo(endpoint, "rpc_address", InetAddress.getByName(entry.getValue().value), executor);
                     }
                     catch (UnknownHostException e)
                     {
@@ -1824,10 +1869,10 @@
                     }
                     break;
                 case SCHEMA:
-                    SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(entry.getValue().value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "schema_version", UUID.fromString(entry.getValue().value), executor);
                     break;
                 case HOST_ID:
-                    SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(entry.getValue().value));
+                    SystemKeyspace.updatePeerInfo(endpoint, "host_id", UUID.fromString(entry.getValue().value), executor);
                     break;
             }
         }
@@ -1879,18 +1924,34 @@
 
     private boolean isStatus(InetAddress endpoint, String status)
     {
-        return Gossiper.instance.getEndpointStateForEndpoint(endpoint).getStatus().equals(status);
+        EndpointState state = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
+        return state != null && state.getStatus().equals(status);
     }
 
     public boolean isRpcReady(InetAddress endpoint)
     {
-        return MessagingService.instance().getVersion(endpoint) < MessagingService.VERSION_22 ||
-                Gossiper.instance.getEndpointStateForEndpoint(endpoint).isRpcReady();
+        if (MessagingService.instance().getVersion(endpoint) < MessagingService.VERSION_22)
+            return true;
+        EndpointState state = Gossiper.instance.getEndpointStateForEndpoint(endpoint);
+        return state != null && state.isRpcReady();
     }
 
+    /**
+     * Set the RPC status. Because when draining a node we need to set the RPC
+     * status to not ready, and drain is called by the shutdown hook, it may be that value is false
+     * and there is no local endpoint state. In this case it's OK to just do nothing. Therefore,
+     * we assert that the local endpoint state is not null only when value is true.
+     *
+     * @param value - true indicates that RPC is ready, false indicates the opposite.
+     */
     public void setRpcReady(boolean value)
     {
-        Gossiper.instance.addLocalApplicationState(ApplicationState.RPC_READY, valueFactory.rpcReady(value));
+        EndpointState state = Gossiper.instance.getEndpointStateForEndpoint(FBUtilities.getBroadcastAddress());
+        // if value is false we're OK with a null state, if it is true we are not.
+        assert !value || state != null;
+
+        if (state != null)
+            Gossiper.instance.addLocalApplicationState(ApplicationState.RPC_READY, valueFactory.rpcReady(value));
     }
 
     private Collection<Token> getTokensFor(InetAddress endpoint)
@@ -1905,7 +1966,7 @@
             if (versionedValue == null)
                 return Collections.emptyList();
 
-            return TokenSerializer.deserialize(getPartitioner(), new DataInputStream(new ByteArrayInputStream(versionedValue.toBytes())));
+            return TokenSerializer.deserialize(tokenMetadata.partitioner, new DataInputStream(new ByteArrayInputStream(versionedValue.toBytes())));
         }
         catch (IOException e)
         {
@@ -1948,7 +2009,6 @@
         tokenMetadata.updateHostId(Gossiper.instance.getHostId(endpoint), endpoint);
     }
 
-
     private void handleStateBootreplacing(InetAddress newNode, String[] pieces)
     {
         InetAddress oldNode;
@@ -1985,6 +2045,85 @@
         tokenMetadata.updateHostId(Gossiper.instance.getHostId(newNode), newNode);
     }
 
+    private void ensureUpToDateTokenMetadata(String status, InetAddress endpoint)
+    {
+        Set<Token> tokens = new TreeSet<>(getTokensFor(endpoint));
+
+        if (logger.isDebugEnabled())
+            logger.debug("Node {} state {}, tokens {}", endpoint, status, tokens);
+
+        // If the node is previously unknown or tokens do not match, update tokenmetadata to
+        // have this node as 'normal' (it must have been using this token before the
+        // leave). This way we'll get pending ranges right.
+        if (!tokenMetadata.isMember(endpoint))
+        {
+            logger.info("Node {} state jump to {}", endpoint, status);
+            updateTokenMetadata(endpoint, tokens);
+        }
+        else if (!tokens.equals(new TreeSet<>(tokenMetadata.getTokens(endpoint))))
+        {
+            logger.warn("Node {} '{}' token mismatch. Long network partition?", endpoint, status);
+            updateTokenMetadata(endpoint, tokens);
+        }
+    }
+
+    private void updateTokenMetadata(InetAddress endpoint, Iterable<Token> tokens)
+    {
+        updateTokenMetadata(endpoint, tokens, new HashSet<>());
+    }
+
+    private void updateTokenMetadata(InetAddress endpoint, Iterable<Token> tokens, Set<InetAddress> endpointsToRemove)
+    {
+        Set<Token> tokensToUpdateInMetadata = new HashSet<>();
+        Set<Token> tokensToUpdateInSystemKeyspace = new HashSet<>();
+
+        for (final Token token : tokens)
+        {
+            // we don't want to update if this node is responsible for the token and it has a later startup time than endpoint.
+            InetAddress currentOwner = tokenMetadata.getEndpoint(token);
+            if (currentOwner == null)
+            {
+                logger.debug("New node {} at token {}", endpoint, token);
+                tokensToUpdateInMetadata.add(token);
+                tokensToUpdateInSystemKeyspace.add(token);
+            }
+            else if (endpoint.equals(currentOwner))
+            {
+                // set state back to normal, since the node may have tried to leave, but failed and is now back up
+                tokensToUpdateInMetadata.add(token);
+                tokensToUpdateInSystemKeyspace.add(token);
+            }
+            else if (Gossiper.instance.compareEndpointStartup(endpoint, currentOwner) > 0)
+            {
+                tokensToUpdateInMetadata.add(token);
+                tokensToUpdateInSystemKeyspace.add(token);
+
+                // currentOwner is no longer current, endpoint is.  Keep track of these moves, because when
+                // a host no longer has any tokens, we'll want to remove it.
+                Multimap<InetAddress, Token> epToTokenCopy = getTokenMetadata().getEndpointToTokenMapForReading();
+                epToTokenCopy.get(currentOwner).remove(token);
+                if (epToTokenCopy.get(currentOwner).isEmpty())
+                    endpointsToRemove.add(currentOwner);
+
+                logger.info("Nodes {} and {} have the same token {}. {} is the new owner", endpoint, currentOwner, token, endpoint);
+            }
+            else
+            {
+                logger.info("Nodes () and {} have the same token {}.  Ignoring {}", endpoint, currentOwner, token, endpoint);
+            }
+        }
+
+        tokenMetadata.updateNormalTokens(tokensToUpdateInMetadata, endpoint);
+        for (InetAddress ep : endpointsToRemove)
+        {
+            removeEndpoint(ep);
+            if (replacing && ep.equals(DatabaseDescriptor.getReplaceAddress()))
+                Gossiper.instance.replacementQuarantine(ep); // quarantine locally longer than normally; see CASSANDRA-8260
+        }
+        if (!tokensToUpdateInSystemKeyspace.isEmpty())
+            SystemKeyspace.updateTokens(endpoint, tokensToUpdateInSystemKeyspace, StageManager.getStage(Stage.MUTATION));
+    }
+
     /**
      * Handle node move to normal state. That is, node is entering token ring and participating
      * in reads.
@@ -1994,8 +2133,6 @@
     private void handleStateNormal(final InetAddress endpoint, final String status)
     {
         Collection<Token> tokens = getTokensFor(endpoint);
-        Set<Token> tokensToUpdateInMetadata = new HashSet<>();
-        Set<Token> tokensToUpdateInSystemKeyspace = new HashSet<>();
         Set<InetAddress> endpointsToRemove = new HashSet<>();
 
         if (logger.isDebugEnabled())
@@ -2063,62 +2200,11 @@
                 tokenMetadata.updateHostId(hostId, endpoint);
         }
 
-        for (final Token token : tokens)
-        {
-            // we don't want to update if this node is responsible for the token and it has a later startup time than endpoint.
-            InetAddress currentOwner = tokenMetadata.getEndpoint(token);
-            if (currentOwner == null)
-            {
-                logger.debug("New node {} at token {}", endpoint, token);
-                tokensToUpdateInMetadata.add(token);
-                tokensToUpdateInSystemKeyspace.add(token);
-            }
-            else if (endpoint.equals(currentOwner))
-            {
-                // set state back to normal, since the node may have tried to leave, but failed and is now back up
-                tokensToUpdateInMetadata.add(token);
-                tokensToUpdateInSystemKeyspace.add(token);
-            }
-            else if (Gossiper.instance.compareEndpointStartup(endpoint, currentOwner) > 0)
-            {
-                tokensToUpdateInMetadata.add(token);
-                tokensToUpdateInSystemKeyspace.add(token);
-
-                // currentOwner is no longer current, endpoint is.  Keep track of these moves, because when
-                // a host no longer has any tokens, we'll want to remove it.
-                Multimap<InetAddress, Token> epToTokenCopy = getTokenMetadata().getEndpointToTokenMapForReading();
-                epToTokenCopy.get(currentOwner).remove(token);
-                if (epToTokenCopy.get(currentOwner).size() < 1)
-                    endpointsToRemove.add(currentOwner);
-
-                logger.info(String.format("Nodes %s and %s have the same token %s.  %s is the new owner",
-                                          endpoint,
-                                          currentOwner,
-                                          token,
-                                          endpoint));
-            }
-            else
-            {
-                logger.info(String.format("Nodes %s and %s have the same token %s.  Ignoring %s",
-                                           endpoint,
-                                           currentOwner,
-                                           token,
-                                           endpoint));
-            }
-        }
-
         // capture because updateNormalTokens clears moving and member status
         boolean isMember = tokenMetadata.isMember(endpoint);
         boolean isMoving = tokenMetadata.isMoving(endpoint);
-        tokenMetadata.updateNormalTokens(tokensToUpdateInMetadata, endpoint);
-        for (InetAddress ep : endpointsToRemove)
-        {
-            removeEndpoint(ep);
-            if (replacing && DatabaseDescriptor.getReplaceAddress().equals(ep))
-                Gossiper.instance.replacementQuarantine(ep); // quarantine locally longer than normally; see CASSANDRA-8260
-        }
-        if (!tokensToUpdateInSystemKeyspace.isEmpty())
-            SystemKeyspace.updateTokens(endpoint, tokensToUpdateInSystemKeyspace);
+
+        updateTokenMetadata(endpoint, tokens, endpointsToRemove);
 
         if (isMoving || operationMode == Mode.MOVING)
         {
@@ -2140,24 +2226,11 @@
      */
     private void handleStateLeaving(InetAddress endpoint)
     {
-        Collection<Token> tokens = getTokensFor(endpoint);
-
-        if (logger.isDebugEnabled())
-            logger.debug("Node {} state leaving, tokens {}", endpoint, tokens);
-
         // If the node is previously unknown or tokens do not match, update tokenmetadata to
         // have this node as 'normal' (it must have been using this token before the
         // leave). This way we'll get pending ranges right.
-        if (!tokenMetadata.isMember(endpoint))
-        {
-            logger.info("Node {} state jump to leaving", endpoint);
-            tokenMetadata.updateNormalTokens(tokens, endpoint);
-        }
-        else if (!tokenMetadata.getTokens(endpoint).containsAll(tokens))
-        {
-            logger.warn("Node {} 'leaving' token mismatch. Long network partition?", endpoint);
-            tokenMetadata.updateNormalTokens(tokens, endpoint);
-        }
+
+        ensureUpToDateTokenMetadata(VersionedValue.STATUS_LEAVING, endpoint);
 
         // at this point the endpoint is certainly a member with this token, so let's proceed
         // normally
@@ -2190,8 +2263,10 @@
      */
     private void handleStateMoving(InetAddress endpoint, String[] pieces)
     {
+        ensureUpToDateTokenMetadata(VersionedValue.STATUS_MOVING, endpoint);
+
         assert pieces.length >= 2;
-        Token token = getPartitioner().getTokenFactory().fromString(pieces[1]);
+        Token token = getTokenFactory().fromString(pieces[1]);
 
         if (logger.isDebugEnabled())
             logger.debug("Node {} state moving, new token {}", endpoint, token);
@@ -2235,6 +2310,8 @@
             }
             else if (VersionedValue.REMOVING_TOKEN.equals(state))
             {
+                ensureUpToDateTokenMetadata(state, endpoint);
+
                 if (logger.isDebugEnabled())
                     logger.debug("Tokens {} removed manually (endpoint was {})", removeTokens, endpoint);
 
@@ -2260,12 +2337,20 @@
     private void excise(Collection<Token> tokens, InetAddress endpoint)
     {
         logger.info("Removing tokens {} for {}", tokens, endpoint);
-        HintedHandOffManager.instance.deleteHintsForEndpoint(endpoint);
+
+        UUID hostId = tokenMetadata.getHostId(endpoint);
+        if (hostId != null && tokenMetadata.isMember(endpoint))
+        {
+            // enough time for writes to expire and MessagingService timeout reporter callback to fire, which is where
+            // hints are mostly written from - using getMinRpcTimeout() / 2 for the interval.
+            long delay = DatabaseDescriptor.getMinRpcTimeout() + DatabaseDescriptor.getWriteRpcTimeout();
+            ScheduledExecutors.optionalTasks.schedule(() -> HintsService.instance.excise(hostId), delay, TimeUnit.MILLISECONDS);
+        }
+
         removeEndpoint(endpoint);
         tokenMetadata.removeEndpoint(endpoint);
         if (!tokens.isEmpty())
             tokenMetadata.removeBootstrapTokens(tokens);
-
         notifyLeft(endpoint);
         PendingRangeCalculatorService.instance.update();
     }
@@ -2279,7 +2364,7 @@
     /** unlike excise we just need this endpoint gone without going through any notifications **/
     private void removeEndpoint(InetAddress endpoint)
     {
-        Gossiper.instance.removeEndpoint(endpoint);
+        Gossiper.runInGossipStageBlocking(() -> Gossiper.instance.removeEndpoint(endpoint));
         SystemKeyspace.removeEndpoint(endpoint);
     }
 
@@ -2374,7 +2459,7 @@
 
         InetAddress myAddress = FBUtilities.getBroadcastAddress();
 
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             Multimap<Range<Token>, InetAddress> changedRanges = getChangedRangesForLeaving(keyspaceName, endpoint);
             Set<Range<Token>> myNewRanges = new HashSet<>();
@@ -2479,10 +2564,7 @@
         MigrationManager.instance.scheduleSchemaPull(endpoint, state);
 
         if (tokenMetadata.isMember(endpoint))
-        {
-            HintedHandOffManager.instance.scheduleHintDelivery(endpoint, true);
             notifyUp(endpoint);
-        }
     }
 
     public void onRemove(InetAddress endpoint)
@@ -2502,6 +2584,12 @@
         // If we have restarted before the node was even marked down, we need to reset the connection pool
         if (state.isAlive())
             onDead(endpoint, state);
+
+        // Then, the node may have been upgraded and changed its messaging protocol version. If so, we
+        // want to update that before we mark the node live again to avoid problems like CASSANDRA-11128.
+        VersionedValue netVersion = state.getApplicationState(ApplicationState.NET_VERSION);
+        if (netVersion != null)
+            updateNetVersion(endpoint, netVersion);
     }
 
 
@@ -2522,9 +2610,10 @@
         return map;
     }
 
+    // TODO
     public final void deliverHints(String host) throws UnknownHostException
     {
-        HintedHandOffManager.instance.scheduleHintDelivery(host);
+        throw new UnsupportedOperationException();
     }
 
     public Collection<Token> getLocalTokens()
@@ -2534,6 +2623,18 @@
         return tokens;
     }
 
+    @Nullable
+    public InetAddress getEndpointForHostId(UUID hostId)
+    {
+        return tokenMetadata.getEndpointForHostId(hostId);
+    }
+
+    @Nullable
+    public UUID getHostIdForEndpoint(InetAddress address)
+    {
+        return tokenMetadata.getHostId(address);
+    }
+
     /* These methods belong to the MBean interface */
 
     public List<String> getTokens()
@@ -2653,18 +2754,18 @@
         return Gossiper.instance.getCurrentGenerationNumber(FBUtilities.getBroadcastAddress());
     }
 
-    public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int forceKeyspaceCleanup(String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        return forceKeyspaceCleanup(0, keyspaceName, columnFamilies);
+        return forceKeyspaceCleanup(0, keyspaceName, tables);
     }
 
-    public int forceKeyspaceCleanup(int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int forceKeyspaceCleanup(int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        if (keyspaceName.equals(SystemKeyspace.NAME))
+        if (Schema.isLocalSystemKeyspace(keyspaceName))
             throw new RuntimeException("Cleanup of the system keyspace is neither necessary nor wise");
 
         CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
+        for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, tables))
         {
             CompactionManager.AllSSTableOpStatus oneStatus = cfStore.forceCleanup(jobs);
             if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
@@ -2673,37 +2774,37 @@
         return status.statusCode;
     }
 
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        return scrub(disableSnapshot, skipCorrupted, true, 0, keyspaceName, columnFamilies);
+        return scrub(disableSnapshot, skipCorrupted, true, 0, keyspaceName, tables);
     }
 
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        return scrub(disableSnapshot, skipCorrupted, checkData, 0, keyspaceName, columnFamilies);
+        return scrub(disableSnapshot, skipCorrupted, checkData, 0, keyspaceName, tables);
     }
 
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        return scrub(disableSnapshot, skipCorrupted, checkData, false, jobs, keyspaceName, columnFamilies);
+        return scrub(disableSnapshot, skipCorrupted, checkData, false, jobs, keyspaceName, tables);
     }
 
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows,
-                     int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
         CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
+        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, tables))
         {
-            CompactionManager.AllSSTableOpStatus oneStatus = cfStore.scrub(disableSnapshot, skipCorrupted, checkData, reinsertOverflowedTTLRows, jobs);
+            CompactionManager.AllSSTableOpStatus oneStatus = cfStore.scrub(disableSnapshot, skipCorrupted, reinsertOverflowedTTL, checkData, jobs);
             if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
                 status = oneStatus;
         }
         return status.statusCode;
     }
-    public int verify(boolean extendedVerify, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+
+    public int verify(boolean extendedVerify, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
         CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, columnFamilies))
+        for (ColumnFamilyStore cfStore : getValidColumnFamilies(false, false, keyspaceName, tableNames))
         {
             CompactionManager.AllSSTableOpStatus oneStatus = cfStore.verify(extendedVerify);
             if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
@@ -2712,15 +2813,15 @@
         return status.statusCode;
     }
 
-    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        return upgradeSSTables(keyspaceName, excludeCurrentVersion, 2, columnFamilies);
+        return upgradeSSTables(keyspaceName, excludeCurrentVersion, 0, tableNames);
     }
 
-    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, int jobs, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, int jobs, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
         CompactionManager.AllSSTableOpStatus status = CompactionManager.AllSSTableOpStatus.SUCCESSFUL;
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, true, keyspaceName, columnFamilies))
+        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, true, keyspaceName, tableNames))
         {
             CompactionManager.AllSSTableOpStatus oneStatus = cfStore.sstablesRewrite(excludeCurrentVersion, jobs);
             if (oneStatus != CompactionManager.AllSSTableOpStatus.SUCCESSFUL)
@@ -2729,9 +2830,9 @@
         return status.statusCode;
     }
 
-    public void forceKeyspaceCompaction(boolean splitOutput, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void forceKeyspaceCompaction(boolean splitOutput, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
+        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, tableNames))
         {
             cfStore.forceMajorCompaction(splitOutput);
         }
@@ -2774,29 +2875,29 @@
     }
 
     /**
-     * Takes the snapshot of a specific column family. A snapshot name must be specified.
+     * Takes the snapshot of a specific table. A snapshot name must be specified.
      *
-     * @param keyspaceName the keyspace which holds the specified column family
-     * @param columnFamilyName the column family to snapshot
+     * @param keyspaceName the keyspace which holds the specified table
+     * @param tableName the table to snapshot
      * @param tag the tag given to the snapshot; may not be null or empty
      */
-    public void takeColumnFamilySnapshot(String keyspaceName, String columnFamilyName, String tag) throws IOException
+    public void takeTableSnapshot(String keyspaceName, String tableName, String tag) throws IOException
     {
         if (keyspaceName == null)
             throw new IOException("You must supply a keyspace name");
         if (operationMode == Mode.JOINING)
             throw new IOException("Cannot snapshot until bootstrap completes");
 
-        if (columnFamilyName == null)
+        if (tableName == null)
             throw new IOException("You must supply a table name");
-        if (columnFamilyName.contains("."))
+        if (tableName.contains("."))
             throw new IllegalArgumentException("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
 
         if (tag == null || tag.equals(""))
             throw new IOException("You must supply a snapshot name.");
 
         Keyspace keyspace = getValidKeyspace(keyspaceName);
-        ColumnFamilyStore columnFamilyStore = keyspace.getColumnFamilyStore(columnFamilyName);
+        ColumnFamilyStore columnFamilyStore = keyspace.getColumnFamilyStore(tableName);
         if (columnFamilyStore.snapshotExists(tag))
             throw new IOException("Snapshot " + tag + " already exists.");
 
@@ -2809,34 +2910,34 @@
      *
      * @param tag
      *            the tag given to the snapshot; may not be null or empty
-     * @param columnFamilyList
-     *            list of columnfamily from different keyspace in the form of ks1.cf1 ks2.cf2
+     * @param tableList
+     *            list of tables from different keyspace in the form of ks1.cf1 ks2.cf2
      */
     @Override
-    public void takeMultipleColumnFamilySnapshot(String tag, String... columnFamilyList)
+    public void takeMultipleTableSnapshot(String tag, String... tableList)
             throws IOException
     {
         Map<Keyspace, List<String>> keyspaceColumnfamily = new HashMap<Keyspace, List<String>>();
-        for (String columnFamily : columnFamilyList)
+        for (String table : tableList)
         {
-            String splittedString[] = columnFamily.split("\\.");
+            String splittedString[] = table.split("\\.");
             if (splittedString.length == 2)
             {
                 String keyspaceName = splittedString[0];
-                String columnFamilyName = splittedString[1];
+                String tableName = splittedString[1];
 
                 if (keyspaceName == null)
                     throw new IOException("You must supply a keyspace name");
                 if (operationMode.equals(Mode.JOINING))
                     throw new IOException("Cannot snapshot until bootstrap completes");
 
-                if (columnFamilyName == null)
-                    throw new IOException("You must supply a column family name");
+                if (tableName == null)
+                    throw new IOException("You must supply a table name");
                 if (tag == null || tag.equals(""))
                     throw new IOException("You must supply a snapshot name.");
 
                 Keyspace keyspace = getValidKeyspace(keyspaceName);
-                ColumnFamilyStore columnFamilyStore = keyspace.getColumnFamilyStore(columnFamilyName);
+                ColumnFamilyStore columnFamilyStore = keyspace.getColumnFamilyStore(tableName);
                 // As there can be multiple column family from same keyspace check if snapshot exist for that specific
                 // columnfamily and not for whole keyspace
 
@@ -2849,7 +2950,7 @@
 
                 // Add Keyspace columnfamily to map in order to support atomicity for snapshot process.
                 // So no snapshot should happen if any one of the above conditions fail for any keyspace or columnfamily
-                keyspaceColumnfamily.get(keyspace).add(columnFamilyName);
+                keyspaceColumnfamily.get(keyspace).add(tableName);
 
             }
             else
@@ -2861,8 +2962,8 @@
 
         for (Entry<Keyspace, List<String>> entry : keyspaceColumnfamily.entrySet())
         {
-            for (String columnFamily : entry.getValue())
-                entry.getKey().snapshot(tag, columnFamily);
+            for (String table : entry.getValue())
+                entry.getKey().snapshot(tag, table);
         }
 
     }
@@ -2909,7 +3010,7 @@
         Map<String, TabularData> snapshotMap = new HashMap<>();
         for (Keyspace keyspace : Keyspace.all())
         {
-            if (SystemKeyspace.NAME.equals(keyspace.getName()))
+            if (Schema.isLocalSystemKeyspace(keyspace.getName()))
                 continue;
 
             for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
@@ -2935,7 +3036,7 @@
         long total = 0;
         for (Keyspace keyspace : Keyspace.all())
         {
-            if (SystemKeyspace.NAME.equals(keyspace.getName()))
+            if (Schema.isLocalSystemKeyspace(keyspace.getName()))
                 continue;
 
             for (ColumnFamilyStore cfStore : keyspace.getColumnFamilyStores())
@@ -2949,9 +3050,32 @@
 
     public void refreshSizeEstimates() throws ExecutionException
     {
+        cleanupSizeEstimates();
         FBUtilities.waitOnFuture(ScheduledExecutors.optionalTasks.submit(SizeEstimatesRecorder.instance));
     }
 
+    public void cleanupSizeEstimates()
+    {
+        SetMultimap<String, String> sizeEstimates = SystemKeyspace.getTablesWithSizeEstimates();
+
+        for (Entry<String, Collection<String>> tablesByKeyspace : sizeEstimates.asMap().entrySet())
+        {
+            String keyspace = tablesByKeyspace.getKey();
+            if (!Schema.instance.getKeyspaces().contains(keyspace))
+            {
+                SystemKeyspace.clearSizeEstimates(keyspace);
+            }
+            else
+            {
+                for (String table : tablesByKeyspace.getValue())
+                {
+                    if (!Schema.instance.hasCF(Pair.create(keyspace, table)))
+                        SystemKeyspace.clearSizeEstimates(keyspace, table);
+                }
+            }
+        }
+    }
+
     /**
      * @param allowIndexes Allow index CF names to be passed in
      * @param autoAddIndexes Automatically add secondary indexes if a CF has them
@@ -2968,12 +3092,12 @@
     /**
      * Flush all memtables for a keyspace and column families.
      * @param keyspaceName
-     * @param columnFamilies
+     * @param tableNames
      * @throws IOException
      */
-    public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException
+    public void forceKeyspaceFlush(String keyspaceName, String... tableNames) throws IOException
     {
-        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, columnFamilies))
+        for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, tableNames))
         {
             logger.debug("Forcing flush on keyspace {}, CF {}", keyspaceName, cfStore.name);
             cfStore.forceBlockingFlush();
@@ -2982,7 +3106,7 @@
 
     public int repairAsync(String keyspace, Map<String, String> repairSpec)
     {
-        RepairOption option = RepairOption.parse(repairSpec, getPartitioner());
+        RepairOption option = RepairOption.parse(repairSpec, tokenMetadata.partitioner);
         // if ranges are not specified
         if (option.getRanges().isEmpty())
         {
@@ -3012,9 +3136,9 @@
                                 Collection<String> hosts,
                                 boolean primaryRange,
                                 boolean fullRepair,
-                                String... columnFamilies)
+                                String... tableNames)
     {
-        return forceRepairAsync(keyspace, isSequential ? RepairParallelism.SEQUENTIAL.ordinal() : RepairParallelism.PARALLEL.ordinal(), dataCenters, hosts, primaryRange, fullRepair, columnFamilies);
+        return forceRepairAsync(keyspace, isSequential ? RepairParallelism.SEQUENTIAL.ordinal() : RepairParallelism.PARALLEL.ordinal(), dataCenters, hosts, primaryRange, fullRepair, tableNames);
     }
 
     @Deprecated
@@ -3024,7 +3148,7 @@
                                 Collection<String> hosts,
                                 boolean primaryRange,
                                 boolean fullRepair,
-                                String... columnFamilies)
+                                String... tableNames)
     {
         if (parallelismDegree < 0 || parallelismDegree > RepairParallelism.values().length - 1)
         {
@@ -3061,11 +3185,11 @@
         {
             options.getRanges().addAll(getLocalRanges(keyspace));
         }
-        if (columnFamilies != null)
+        if (tableNames != null)
         {
-            for (String columnFamily : columnFamilies)
+            for (String table : tableNames)
             {
-                options.getColumnFamilies().add(columnFamily);
+                options.getColumnFamilies().add(table);
             }
         }
         return forceRepairAsync(keyspace, options, true);
@@ -3077,14 +3201,14 @@
                                 boolean isLocal,
                                 boolean primaryRange,
                                 boolean fullRepair,
-                                String... columnFamilies)
+                                String... tableNames)
     {
         Set<String> dataCenters = null;
         if (isLocal)
         {
             dataCenters = Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter());
         }
-        return forceRepairAsync(keyspace, isSequential, dataCenters, null, primaryRange, fullRepair, columnFamilies);
+        return forceRepairAsync(keyspace, isSequential, dataCenters, null, primaryRange, fullRepair, tableNames);
     }
 
     @Deprecated
@@ -3095,11 +3219,11 @@
                                      Collection<String> dataCenters,
                                      Collection<String> hosts,
                                      boolean fullRepair,
-                                     String... columnFamilies)
+                                     String... tableNames)
     {
         return forceRepairRangeAsync(beginToken, endToken, keyspaceName,
                                      isSequential ? RepairParallelism.SEQUENTIAL.ordinal() : RepairParallelism.PARALLEL.ordinal(),
-                                     dataCenters, hosts, fullRepair, columnFamilies);
+                                     dataCenters, hosts, fullRepair, tableNames);
     }
 
     @Deprecated
@@ -3110,7 +3234,7 @@
                                      Collection<String> dataCenters,
                                      Collection<String> hosts,
                                      boolean fullRepair,
-                                     String... columnFamilies)
+                                     String... tableNames)
     {
         if (parallelismDegree < 0 || parallelismDegree > RepairParallelism.values().length - 1)
         {
@@ -3130,21 +3254,24 @@
         Collection<Range<Token>> repairingRange = createRepairRangeFrom(beginToken, endToken);
 
         RepairOption options = new RepairOption(parallelism, false, !fullRepair, false, 1, repairingRange, true);
-        options.getDataCenters().addAll(dataCenters);
+        if (dataCenters != null)
+        {
+            options.getDataCenters().addAll(dataCenters);
+        }
         if (hosts != null)
         {
             options.getHosts().addAll(hosts);
         }
-        if (columnFamilies != null)
+        if (tableNames != null)
         {
-            for (String columnFamily : columnFamilies)
+            for (String table : tableNames)
             {
-                options.getColumnFamilies().add(columnFamily);
+                options.getColumnFamilies().add(table);
             }
         }
 
         logger.info("starting user-requested repair of range {} for keyspace {} and column families {}",
-                    repairingRange, keyspaceName, columnFamilies);
+                    repairingRange, keyspaceName, tableNames);
         return forceRepairAsync(keyspaceName, options, true);
     }
 
@@ -3155,14 +3282,14 @@
                                      boolean isSequential,
                                      boolean isLocal,
                                      boolean fullRepair,
-                                     String... columnFamilies)
+                                     String... tableNames)
     {
         Set<String> dataCenters = null;
         if (isLocal)
         {
             dataCenters = Sets.newHashSet(DatabaseDescriptor.getLocalDataCenter());
         }
-        return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential, dataCenters, null, fullRepair, columnFamilies);
+        return forceRepairRangeAsync(beginToken, endToken, keyspaceName, isSequential, dataCenters, null, fullRepair, tableNames);
     }
 
     /**
@@ -3175,8 +3302,8 @@
     @VisibleForTesting
     Collection<Range<Token>> createRepairRangeFrom(String beginToken, String endToken)
     {
-        Token parsedBeginToken = getPartitioner().getTokenFactory().fromString(beginToken);
-        Token parsedEndToken = getPartitioner().getTokenFactory().fromString(endToken);
+        Token parsedBeginToken = getTokenFactory().fromString(beginToken);
+        Token parsedEndToken = getTokenFactory().fromString(endToken);
 
         // Break up given range to match ring layout in TokenMetadata
         ArrayList<Range<Token>> repairingRange = new ArrayList<>();
@@ -3203,13 +3330,18 @@
         return repairingRange;
     }
 
+    public TokenFactory getTokenFactory()
+    {
+        return tokenMetadata.partitioner.getTokenFactory();
+    }
+
     public int forceRepairAsync(String keyspace, RepairOption options, boolean legacy)
     {
         if (options.getRanges().isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor() < 2)
             return 0;
 
         int cmd = nextRepairCommand.incrementAndGet();
-        new Thread(createRepairTask(cmd, keyspace, options, legacy)).start();
+        new Thread(NamedThreadFactory.threadLocalDeallocator(createRepairTask(cmd, keyspace, options, legacy))).start();
         return cmd;
     }
 
@@ -3231,6 +3363,16 @@
         ActiveRepairService.instance.terminateSessions();
     }
 
+    public void setRepairSessionMaxTreeDepth(int depth)
+    {
+        DatabaseDescriptor.setRepairSessionMaxTreeDepth(depth);
+    }
+
+    public int getRepairSessionMaxTreeDepth()
+    {
+        return DatabaseDescriptor.getRepairSessionMaxTreeDepth();
+    }
+
     /* End of MBean interface methods */
 
     /**
@@ -3339,20 +3481,20 @@
      */
     public List<InetAddress> getNaturalEndpoints(String keyspaceName, String cf, String key)
     {
-        KSMetaData ksMetaData = Schema.instance.getKSMetaData(keyspaceName);
+        KeyspaceMetadata ksMetaData = Schema.instance.getKSMetaData(keyspaceName);
         if (ksMetaData == null)
             throw new IllegalArgumentException("Unknown keyspace '" + keyspaceName + "'");
 
-        CFMetaData cfMetaData = ksMetaData.cfMetaData().get(cf);
+        CFMetaData cfMetaData = ksMetaData.getTableOrViewNullable(cf);
         if (cfMetaData == null)
             throw new IllegalArgumentException("Unknown table '" + cf + "' in keyspace '" + keyspaceName + "'");
 
-        return getNaturalEndpoints(keyspaceName, getPartitioner().getToken(cfMetaData.getKeyValidator().fromString(key)));
+        return getNaturalEndpoints(keyspaceName, tokenMetadata.partitioner.getToken(cfMetaData.getKeyValidator().fromString(key)));
     }
 
     public List<InetAddress> getNaturalEndpoints(String keyspaceName, ByteBuffer key)
     {
-        return getNaturalEndpoints(keyspaceName, getPartitioner().getToken(key));
+        return getNaturalEndpoints(keyspaceName, tokenMetadata.partitioner.getToken(key));
     }
 
     /**
@@ -3369,6 +3511,14 @@
     }
 
     /**
+     * Returns the endpoints currently responsible for storing the token plus pending ones
+     */
+    public Iterable<InetAddress> getNaturalAndPendingEndpoints(String keyspaceName, Token token)
+    {
+        return Iterables.concat(getNaturalEndpoints(keyspaceName, token), tokenMetadata.pendingEndpointsFor(token, keyspaceName));
+    }
+
+    /**
      * This method attempts to return N endpoints that are responsible for storing the
      * specified key i.e for replication.
      *
@@ -3378,21 +3528,33 @@
      */
     public List<InetAddress> getLiveNaturalEndpoints(Keyspace keyspace, ByteBuffer key)
     {
-        return getLiveNaturalEndpoints(keyspace, getPartitioner().decorateKey(key));
+        return getLiveNaturalEndpoints(keyspace, tokenMetadata.decorateKey(key));
     }
 
     public List<InetAddress> getLiveNaturalEndpoints(Keyspace keyspace, RingPosition pos)
     {
+        List<InetAddress> liveEps = new ArrayList<>();
+        getLiveNaturalEndpoints(keyspace, pos, liveEps);
+        return liveEps;
+    }
+
+    /**
+     * This method attempts to return N endpoints that are responsible for storing the
+     * specified key i.e for replication.
+     *
+     * @param keyspace keyspace name also known as keyspace
+     * @param pos position for which we need to find the endpoint
+     * @param liveEps the list of endpoints to mutate
+     */
+    public void getLiveNaturalEndpoints(Keyspace keyspace, RingPosition pos, List<InetAddress> liveEps)
+    {
         List<InetAddress> endpoints = keyspace.getReplicationStrategy().getNaturalEndpoints(pos);
-        List<InetAddress> liveEps = new ArrayList<>(endpoints.size());
 
         for (InetAddress endpoint : endpoints)
         {
             if (FailureDetector.instance.isAlive(endpoint))
                 liveEps.add(endpoint);
         }
-
-        return liveEps;
     }
 
     public void setLoggingLevel(String classQualifier, String rawLevel) throws Exception
@@ -3473,7 +3635,7 @@
             Token token = tokens.get(index);
             Range<Token> range = new Range<>(prevToken, token);
             // always return an estimate > 0 (see CASSANDRA-7322)
-            splits.add(Pair.create(range, Math.max(cfs.metadata.getMinIndexInterval(), cfs.estimatedKeysForRange(range))));
+            splits.add(Pair.create(range, Math.max(cfs.metadata.params.minIndexInterval, cfs.estimatedKeysForRange(range))));
             prevToken = token;
         }
         return splits;
@@ -3518,7 +3680,7 @@
             throw new UnsupportedOperationException("Node in " + operationMode + " state; wait for status to become normal or restart");
 
         PendingRangeCalculatorService.instance.blockUntilFinished();
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             if (tokenMetadata.getPendingRanges(keyspaceName, FBUtilities.getBroadcastAddress()).size() > 0)
                 throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
@@ -3537,8 +3699,13 @@
             {
                 shutdownClientServers();
                 Gossiper.instance.stop();
-                MessagingService.instance().shutdown();
+                try {
+                    MessagingService.instance().shutdown();
+                } catch (IOError ioe) {
+                    logger.info("failed to shutdown message service: {}", ioe);
+                }
                 StageManager.shutdownNow();
+                SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.DECOMMISSIONED);
                 setMode(Mode.DECOMMISSIONED, true);
                 // let op be responsible for killing the process
             }
@@ -3562,7 +3729,7 @@
     {
         Map<String, Multimap<Range<Token>, InetAddress>> rangesToStream = new HashMap<>();
 
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             Multimap<Range<Token>, InetAddress> rangesMM = getChangedRangesForLeaving(keyspaceName, FBUtilities.getBroadcastAddress());
 
@@ -3591,7 +3758,7 @@
 
         setMode(Mode.LEAVING, "streaming hints to other nodes", true);
 
-        Future<StreamState> hintsSuccess = streamHints();
+        Future hintsSuccess = streamHints();
 
         // wait for the transfer runnables to signal the latch.
         logger.debug("waiting for stream acks.");
@@ -3609,13 +3776,16 @@
         onFinish.run();
     }
 
-    private Future<StreamState> streamHints()
+    private Future streamHints()
     {
-        // StreamPlan will not fail if there are zero files to transfer, so flush anyway (need to get any in-memory hints, as well)
-        ColumnFamilyStore hintsCF = Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.HINTS);
-        FBUtilities.waitOnFuture(hintsCF.forceFlush());
+        return HintsService.instance.transferHints(this::getPreferredHintsStreamTarget);
+    }
 
-        // gather all live nodes in the cluster that aren't also leaving
+    /**
+     * Find the best target to stream hints to. Currently the closest peer according to the snitch
+     */
+    private UUID getPreferredHintsStreamTarget()
+    {
         List<InetAddress> candidates = new ArrayList<>(StorageService.instance.getTokenMetadata().cloneAfterAllLeft().getAllEndpoints());
         candidates.remove(FBUtilities.getBroadcastAddress());
         for (Iterator<InetAddress> iter = candidates.iterator(); iter.hasNext(); )
@@ -3628,25 +3798,14 @@
         if (candidates.isEmpty())
         {
             logger.warn("Unable to stream hints since no live endpoints seen");
-            return Futures.immediateFuture(null);
+            throw new RuntimeException("Unable to stream hints since no live endpoints seen");
         }
         else
         {
             // stream to the closest peer as chosen by the snitch
             DatabaseDescriptor.getEndpointSnitch().sortByProximity(FBUtilities.getBroadcastAddress(), candidates);
             InetAddress hintsDestinationHost = candidates.get(0);
-            InetAddress preferred = SystemKeyspace.getPreferredIP(hintsDestinationHost);
-
-            // stream all hints -- range list will be a singleton of "the entire ring"
-            Token token = StorageService.getPartitioner().getMinimumToken();
-            List<Range<Token>> ranges = Collections.singletonList(new Range<>(token, token));
-
-            return new StreamPlan("Hints").transferRanges(hintsDestinationHost,
-                                                          preferred,
-                                                          SystemKeyspace.NAME,
-                                                          ranges,
-                                                          SystemKeyspace.HINTS)
-                                          .execute();
+            return tokenMetadata.getHostId(hintsDestinationHost);
         }
     }
 
@@ -3654,13 +3813,13 @@
     {
         try
         {
-            getPartitioner().getTokenFactory().validate(newToken);
+            getTokenFactory().validate(newToken);
         }
         catch (ConfigurationException e)
         {
             throw new IOException(e.getMessage());
         }
-        move(getPartitioner().getTokenFactory().fromString(newToken));
+        move(getTokenFactory().fromString(newToken));
     }
 
     /**
@@ -3688,7 +3847,7 @@
             throw new UnsupportedOperationException("This node has more than one token and cannot be moved thusly.");
         }
 
-        List<String> keyspacesToProcess = Schema.instance.getNonSystemKeyspaces();
+        List<String> keyspacesToProcess = Schema.instance.getNonLocalStrategyKeyspaces();
 
         PendingRangeCalculatorService.instance.blockUntilFinished();
         // checking if data is moving to this node
@@ -3946,7 +4105,7 @@
             throw new UnsupportedOperationException("This node is already processing a removal. Wait for it to complete, or use 'removenode force' if this has failed.");
 
         // Find the endpoints that are going to become responsible for data
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             // if the replication factor is 1 the data is lost so we shouldn't wait for confirmation
             if (Keyspace.open(keyspaceName).getReplicationStrategy().getReplicationFactor() == 1)
@@ -4016,6 +4175,26 @@
         return operationMode == Mode.STARTING;
     }
 
+    public boolean isMoving()
+    {
+        return operationMode == Mode.MOVING;
+    }
+
+    public boolean isJoining()
+    {
+        return operationMode == Mode.JOINING;
+    }
+
+    public boolean isDrained()
+    {
+        return operationMode == Mode.DRAINED;
+    }
+
+    public boolean isDraining()
+    {
+        return operationMode == Mode.DRAINING;
+    }
+
     public String getDrainProgress()
     {
         return String.format("Drained %s/%s ColumnFamilies", remainingCFs, totalCFs);
@@ -4023,107 +4202,170 @@
 
     /**
      * Shuts node off to writes, empties memtables and the commit log.
-     * There are two differences between drain and the normal shutdown hook:
-     * - Drain waits for in-progress streaming to complete
-     * - Drain flushes *all* columnfamilies (shutdown hook only flushes non-durable CFs)
      */
     public synchronized void drain() throws IOException, InterruptedException, ExecutionException
     {
-        inShutdownHook = true;
+        drain(false);
+    }
 
+    protected synchronized void drain(boolean isFinalShutdown) throws IOException, InterruptedException, ExecutionException
+    {
         ExecutorService counterMutationStage = StageManager.getStage(Stage.COUNTER_MUTATION);
+        ExecutorService viewMutationStage = StageManager.getStage(Stage.VIEW_MUTATION);
         ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
-        if (mutationStage.isTerminated() && counterMutationStage.isTerminated())
+
+        if (mutationStage.isTerminated()
+            && counterMutationStage.isTerminated()
+            && viewMutationStage.isTerminated())
         {
-            logger.warn("Cannot drain node (did it already happen?)");
+            if (!isFinalShutdown)
+                logger.warn("Cannot drain node (did it already happen?)");
             return;
         }
-        setMode(Mode.DRAINING, "starting drain process", true);
-        shutdownClientServers();
-        ScheduledExecutors.optionalTasks.shutdown();
-        Gossiper.instance.stop();
 
-        setMode(Mode.DRAINING, "shutting down MessageService", false);
-        MessagingService.instance().shutdown();
-
-        setMode(Mode.DRAINING, "clearing mutation stage", false);
-        counterMutationStage.shutdown();
-        mutationStage.shutdown();
-        counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
-        mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
-
-        StorageProxy.instance.verifyNoHintsInProgress();
-
-        setMode(Mode.DRAINING, "flushing column families", false);
-        // count CFs first, since forceFlush could block for the flushWriter to get a queue slot empty
-        totalCFs = 0;
-        for (Keyspace keyspace : Keyspace.nonSystem())
-            totalCFs += keyspace.getColumnFamilyStores().size();
-        remainingCFs = totalCFs;
-        // flush
-        List<Future<?>> flushes = new ArrayList<>();
-        for (Keyspace keyspace : Keyspace.nonSystem())
-        {
-            for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
-                flushes.add(cfs.forceFlush());
-        }
-        // wait for the flushes.
-        // TODO this is a godawful way to track progress, since they flush in parallel.  a long one could
-        // thus make several short ones "instant" if we wait for them later.
-        for (Future f : flushes)
-        {
-            FBUtilities.waitOnFuture(f);
-            remainingCFs--;
-        }
+        assert !isShutdown;
+        isShutdown = true;
 
         try
         {
-            /* not clear this is reasonable time, but propagated from prior embedded behaviour */
-            BatchlogManager.shutdownAndWait(1L, MINUTES);
+            setMode(Mode.DRAINING, "starting drain process", !isFinalShutdown);
+
+            try
+            {
+                /* not clear this is reasonable time, but propagated from prior embedded behaviour */
+                BatchlogManager.instance.shutdownAndWait(1L, MINUTES);
+            }
+            catch (TimeoutException t)
+            {
+                logger.error("Batchlog manager timed out shutting down", t);
+            }
+
+            HintsService.instance.pauseDispatch();
+
+            if (daemon != null)
+                shutdownClientServers();
+            ScheduledExecutors.optionalTasks.shutdown();
+            Gossiper.instance.stop();
+
+            if (!isFinalShutdown)
+                setMode(Mode.DRAINING, "shutting down MessageService", false);
+
+            // In-progress writes originating here could generate hints to be written, so shut down MessagingService
+            // before mutation stage, so we can get all the hints saved before shutting down
+            MessagingService.instance().shutdown();
+
+            if (!isFinalShutdown)
+                setMode(Mode.DRAINING, "clearing mutation stage", false);
+            viewMutationStage.shutdown();
+            counterMutationStage.shutdown();
+            mutationStage.shutdown();
+            viewMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
+            counterMutationStage.awaitTermination(3600, TimeUnit.SECONDS);
+            mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
+
+            StorageProxy.instance.verifyNoHintsInProgress();
+
+            if (!isFinalShutdown)
+                setMode(Mode.DRAINING, "flushing column families", false);
+
+            // disable autocompaction - we don't want to start any new compactions while we are draining
+            for (Keyspace keyspace : Keyspace.all())
+                for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
+                    cfs.disableAutoCompaction();
+
+            // count CFs first, since forceFlush could block for the flushWriter to get a queue slot empty
+            totalCFs = 0;
+            for (Keyspace keyspace : Keyspace.nonSystem())
+                totalCFs += keyspace.getColumnFamilyStores().size();
+            remainingCFs = totalCFs;
+            // flush
+            List<Future<?>> flushes = new ArrayList<>();
+            for (Keyspace keyspace : Keyspace.nonSystem())
+            {
+                for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
+                    flushes.add(cfs.forceFlush());
+            }
+            // wait for the flushes.
+            // TODO this is a godawful way to track progress, since they flush in parallel.  a long one could
+            // thus make several short ones "instant" if we wait for them later.
+            for (Future f : flushes)
+            {
+                try
+                {
+                    FBUtilities.waitOnFuture(f);
+                }
+                catch (Throwable t)
+                {
+                    JVMStabilityInspector.inspectThrowable(t);
+                    // don't let this stop us from shutting down the commitlog and other thread pools
+                    logger.warn("Caught exception while waiting for memtable flushes during shutdown hook", t);
+                }
+
+                remainingCFs--;
+            }
+
+            // Interrupt ongoing compactions and shutdown CM to prevent further compactions.
+            CompactionManager.instance.forceShutdown();
+            // Flush the system tables after all other tables are flushed, just in case flushing modifies any system state
+            // like CASSANDRA-5151. Don't bother with progress tracking since system data is tiny.
+            // Flush system tables after stopping compactions since they modify
+            // system tables (for example compactions can obsolete sstables and the tidiers in SSTableReader update
+            // system tables, see SSTableReader.GlobalTidy)
+            flushes.clear();
+            for (Keyspace keyspace : Keyspace.system())
+            {
+                for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
+                    flushes.add(cfs.forceFlush());
+            }
+            FBUtilities.waitOnFutures(flushes);
+
+            HintsService.instance.shutdownBlocking();
+
+            // Interrupt ongoing compactions and shutdown CM to prevent further compactions.
+            CompactionManager.instance.forceShutdown();
+
+            // whilst we've flushed all the CFs, which will have recycled all completed segments, we want to ensure
+            // there are no segments to replay, so we force the recycling of any remaining (should be at most one)
+            CommitLog.instance.forceRecycleAllSegments();
+
+            CommitLog.instance.shutdownBlocking();
+
+            // wait for miscellaneous tasks like sstable and commitlog segment deletion
+            ScheduledExecutors.nonPeriodicTasks.shutdown();
+            if (!ScheduledExecutors.nonPeriodicTasks.awaitTermination(1, MINUTES))
+                logger.warn("Failed to wait for non periodic tasks to shutdown");
+
+            ColumnFamilyStore.shutdownPostFlushExecutor();
+            setMode(Mode.DRAINED, !isFinalShutdown);
         }
-        catch (TimeoutException t)
+        catch (Throwable t)
         {
-            logger.error("Batchlog manager timed out shutting down", t);
+            logger.error("Caught an exception while draining ", t);
         }
+    }
 
-        // Interrupt on going compaction and shutdown to prevent further compaction
-        CompactionManager.instance.forceShutdown();
+    /**
+     * Some services are shutdown during draining and we should not attempt to start them again.
+     *
+     * @param service - the name of the service we are trying to start.
+     * @throws IllegalStateException - an exception that nodetool is able to convert into a message to display to the user
+     */
+    synchronized void checkServiceAllowedToStart(String service)
+    {
+        if (isDraining()) // when draining isShutdown is also true, so we check first to return a more accurate message
+            throw new IllegalStateException(String.format("Unable to start %s because the node is draining.", service));
 
-        // Flush the system tables after all other tables are flushed, just in case flushing modifies any system state
-        // like CASSANDRA-5151. Don't bother with progress tracking since system data is tiny.
-        // Flush system tables after stopping the batchlog manager and compactions since they both modify
-        // system tables (for example compactions can obsolete sstables and the tidiers in SSTableReader update
-        // system tables, see SSTableReader.GlobalTidy)
-        flushes.clear();
-        for (Keyspace keyspace : Keyspace.system())
-        {
-            for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
-                flushes.add(cfs.forceFlush());
-        }
-        FBUtilities.waitOnFutures(flushes);
-
-        // whilst we've flushed all the CFs, which will have recycled all completed segments, we want to ensure
-        // there are no segments to replay, so we force the recycling of any remaining (should be at most one)
-        CommitLog.instance.forceRecycleAllSegments();
-
-        CommitLog.instance.shutdownBlocking();
-
-        // wait for miscellaneous tasks like sstable and commitlog segment deletion
-        ScheduledExecutors.nonPeriodicTasks.shutdown();
-        if (!ScheduledExecutors.nonPeriodicTasks.awaitTermination(1, MINUTES))
-            logger.warn("Failed to wait for non periodic tasks to shutdown");
-
-        ColumnFamilyStore.shutdownPostFlushExecutor();
-
-        setMode(Mode.DRAINED, true);
+        if (isShutdown()) // do not rely on operationMode in case it gets changed to decomissioned or other
+            throw new IllegalStateException(String.format("Unable to start %s because the node was drained.", service));
     }
 
     // Never ever do this at home. Used by tests.
-    IPartitioner setPartitionerUnsafe(IPartitioner newPartitioner)
+    @VisibleForTesting
+    public IPartitioner setPartitionerUnsafe(IPartitioner newPartitioner)
     {
-        IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner();
-        DatabaseDescriptor.setPartitioner(newPartitioner);
-        valueFactory = new VersionedValue.VersionedValueFactory(getPartitioner());
+        IPartitioner oldPartitioner = DatabaseDescriptor.setPartitionerUnsafe(newPartitioner);
+        tokenMetadata = tokenMetadata.cloneWithNewPartitioner(newPartitioner);
+        valueFactory = new VersionedValue.VersionedValueFactory(newPartitioner);
         return oldPartitioner;
     }
 
@@ -4134,11 +4376,11 @@
         return old;
     }
 
-    public void truncate(String keyspace, String columnFamily) throws TimeoutException, IOException
+    public void truncate(String keyspace, String table) throws TimeoutException, IOException
     {
         try
         {
-            StorageProxy.truncateBlocking(keyspace, columnFamily);
+            StorageProxy.truncateBlocking(keyspace, table);
         }
         catch (UnavailableException e)
         {
@@ -4150,7 +4392,7 @@
     {
         List<Token> sortedTokens = tokenMetadata.sortedTokens();
         // describeOwnership returns tokens in an unspecified order, let's re-order them
-        Map<Token, Float> tokenMap = new TreeMap<Token, Float>(getPartitioner().describeOwnership(sortedTokens));
+        Map<Token, Float> tokenMap = new TreeMap<Token, Float>(tokenMetadata.partitioner.describeOwnership(sortedTokens));
         Map<InetAddress, Float> nodeMap = new LinkedHashMap<>();
         for (Map.Entry<Token, Float> entry : tokenMap.entrySet())
         {
@@ -4192,7 +4434,7 @@
 
             if (userKeyspaces.size() > 0)
             {
-                keyspace = userKeyspaces.iterator().next();
+                keyspace = userKeyspaces.get(0);
                 AbstractReplicationStrategy replicationStrategy = Schema.instance.getKeyspaceInstance(keyspace).getReplicationStrategy();
                 for (String keyspaceName : userKeyspaces)
                 {
@@ -4220,7 +4462,7 @@
         for (Collection<InetAddress> endpoints : sortedDcsToEndpoints.values())
             endpointsGroupedByDc.add(endpoints);
 
-        Map<Token, Float> tokenOwnership = getPartitioner().describeOwnership(tokenMetadata.sortedTokens());
+        Map<Token, Float> tokenOwnership = tokenMetadata.partitioner.describeOwnership(tokenMetadata.sortedTokens());
         LinkedHashMap<InetAddress, Float> finalOwnership = Maps.newLinkedHashMap();
 
         Multimap<InetAddress, Range<Token>> endpointToRanges = strategy.getAddressRanges();
@@ -4250,8 +4492,12 @@
 
     public List<String> getNonSystemKeyspaces()
     {
-        List<String> keyspaceNamesList = new ArrayList<>(Schema.instance.getNonSystemKeyspaces());
-        return Collections.unmodifiableList(keyspaceNamesList);
+        return Collections.unmodifiableList(Schema.instance.getNonSystemKeyspaces());
+    }
+
+    public List<String> getNonLocalStrategyKeyspaces()
+    {
+        return Collections.unmodifiableList(Schema.instance.getNonLocalStrategyKeyspaces());
     }
 
     public void updateSnitch(String epSnitchClassName, Boolean dynamic, Integer dynamicUpdateInterval, Integer dynamicResetInterval, Double dynamicBadnessThreshold) throws ClassNotFoundException
@@ -4430,7 +4676,6 @@
                 this.keyspace = keyspace;
                 try
                 {
-                    setPartitioner(DatabaseDescriptor.getPartitioner());
                     for (Map.Entry<Range<Token>, List<InetAddress>> entry : StorageService.instance.getRangeToAddressMap(keyspace).entrySet())
                     {
                         Range<Token> range = entry.getKey();
@@ -4455,7 +4700,7 @@
 
     public void rescheduleFailedDeletions()
     {
-        SSTableDeletingTask.rescheduleFailedTasks();
+        LifecycleTransaction.rescheduleFailedDeletions();
     }
 
     /**
@@ -4463,6 +4708,8 @@
      */
     public void loadNewSSTables(String ksName, String cfName)
     {
+        if (!isInitialized())
+            throw new RuntimeException("Not yet initialized, can't load new sstables");
         ColumnFamilyStore.loadNewSSTables(ksName, cfName);
     }
 
@@ -4472,7 +4719,7 @@
     public List<String> sampleKeyRange() // do not rename to getter - see CASSANDRA-4452 for details
     {
         List<DecoratedKey> keys = new ArrayList<>();
-        for (Keyspace keyspace : Keyspace.nonSystem())
+        for (Keyspace keyspace : Keyspace.nonLocalStrategy())
         {
             for (Range<Token> range : getPrimaryRangesForEndpoint(keyspace.getName(), FBUtilities.getBroadcastAddress()))
                 keys.addAll(keySamples(keyspace.getColumnFamilyStores(), range));
@@ -4486,7 +4733,12 @@
 
     public void rebuildSecondaryIndex(String ksName, String cfName, String... idxNames)
     {
-        ColumnFamilyStore.rebuildSecondaryIndex(ksName, cfName, idxNames);
+        String[] indices = asList(idxNames).stream()
+                                           .map(p -> isIndexColumnFamily(p) ? getIndexName(p) : p)
+                                           .collect(toList())
+                                           .toArray(new String[idxNames.length]);
+
+        ColumnFamilyStore.rebuildSecondaryIndex(ksName, cfName, indices);
     }
 
     public void resetLocalSchema() throws IOException
@@ -4494,6 +4746,11 @@
         MigrationManager.resetLocalSchema();
     }
 
+    public void reloadLocalSchema()
+    {
+        SchemaKeyspace.reloadSchemaAndAnnounceVersion();
+    }
+
     public void setTraceProbability(double probability)
     {
         this.traceProbability = probability;
@@ -4504,17 +4761,19 @@
         return traceProbability;
     }
 
-    public void disableAutoCompaction(String ks, String... columnFamilies) throws IOException
+    public void disableAutoCompaction(String ks, String... tables) throws IOException
     {
-        for (ColumnFamilyStore cfs : getValidColumnFamilies(true, true, ks, columnFamilies))
+        for (ColumnFamilyStore cfs : getValidColumnFamilies(true, true, ks, tables))
         {
             cfs.disableAutoCompaction();
         }
     }
 
-    public void enableAutoCompaction(String ks, String... columnFamilies) throws IOException
+    public synchronized void enableAutoCompaction(String ks, String... tables) throws IOException
     {
-        for (ColumnFamilyStore cfs : getValidColumnFamilies(true, true, ks, columnFamilies))
+        checkServiceAllowedToStart("auto compaction");
+
+        for (ColumnFamilyStore cfs : getValidColumnFamilies(true, true, ks, tables))
         {
             cfs.enableAutoCompaction();
         }

diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java
index 43d26c6..e22b094 100644
--- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java
+++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java

@@ -205,20 +205,20 @@
      * Takes the snapshot of a specific column family. A snapshot name must be specified.
      *
      * @param keyspaceName the keyspace which holds the specified column family
-     * @param columnFamilyName the column family to snapshot
+     * @param tableName the table to snapshot
      * @param tag the tag given to the snapshot; may not be null or empty
      */
-    public void takeColumnFamilySnapshot(String keyspaceName, String columnFamilyName, String tag) throws IOException;
+    public void takeTableSnapshot(String keyspaceName, String tableName, String tag) throws IOException;
 
     /**
      * Takes the snapshot of a multiple column family from different keyspaces. A snapshot name must be specified.
      * 
      * @param tag
      *            the tag given to the snapshot; may not be null or empty
-     * @param columnFamilyList
-     *            list of columnfamily from different keyspace in the form of ks1.cf1 ks2.cf2
+     * @param tableList
+     *            list of tables from different keyspace in the form of ks1.cf1 ks2.cf2
      */
-    public void takeMultipleColumnFamilySnapshot(String tag, String... columnFamilyList) throws IOException;
+    public void takeMultipleTableSnapshot(String tag, String... tableList) throws IOException;
 
     /**
      * Remove the snapshot with the given name from the given keyspaces.
@@ -244,56 +244,61 @@
     public void refreshSizeEstimates() throws ExecutionException;
 
     /**
+     * Removes extraneous entries in system.size_estimates.
+     */
+    public void cleanupSizeEstimates();
+
+    /**
      * Forces major compaction of a single keyspace
      */
-    public void forceKeyspaceCompaction(boolean splitOutput, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public void forceKeyspaceCompaction(boolean splitOutput, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Trigger a cleanup of keys on a single keyspace
      */
     @Deprecated
-    public int forceKeyspaceCleanup(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
-    public int forceKeyspaceCleanup(int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int forceKeyspaceCleanup(String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException;
+    public int forceKeyspaceCleanup(int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Scrub (deserialize + reserialize at the latest version, skipping bad rows if any) the given keyspace.
-     * If columnFamilies array is empty, all CFs are scrubbed.
+     * If tableNames array is empty, all CFs are scrubbed.
      *
      * Scrubbed CFs will be snapshotted first, if disableSnapshot is false
      */
     @Deprecated
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException;
     @Deprecated
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException;
     @Deprecated
     public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
 
-public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Verify (checksums of) the given keyspace.
-     * If columnFamilies array is empty, all CFs are verified.
+     * If tableNames array is empty, all CFs are verified.
      *
      * The entire sstable will be read to ensure each cell validates if extendedVerify is true
      */
-    public int verify(boolean extendedVerify, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int verify(boolean extendedVerify, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Rewrite all sstables to the latest version.
      * Unlike scrub, it doesn't skip bad rows and do not snapshot sstables first.
      */
     @Deprecated
-    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
-    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, int jobs, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, String... tableNames) throws IOException, ExecutionException, InterruptedException;
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, int jobs, String... tableNames) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Flush all memtables for the given column families, or all columnfamilies for the given keyspace
      * if none are explicitly listed.
      * @param keyspaceName
-     * @param columnFamilies
+     * @param tableNames
      * @throws IOException
      */
-    public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException;
+    public void forceKeyspaceFlush(String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException;
 
     /**
      * Invoke repair asynchronously.
@@ -312,7 +317,7 @@
      * @deprecated use {@link #repairAsync(String keyspace, Map options)} instead.
      */
     @Deprecated
-    public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts,  boolean primaryRange, boolean fullRepair, String... columnFamilies) throws IOException;
+    public int forceRepairAsync(String keyspace, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts,  boolean primaryRange, boolean fullRepair, String... tableNames) throws IOException;
 
     /**
      * Invoke repair asynchronously.
@@ -327,13 +332,13 @@
      * @return Repair command number, or 0 if nothing to repair
      */
     @Deprecated
-    public int forceRepairAsync(String keyspace, int parallelismDegree, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRange, boolean fullRepair, String... columnFamilies);
+    public int forceRepairAsync(String keyspace, int parallelismDegree, Collection<String> dataCenters, Collection<String> hosts, boolean primaryRange, boolean fullRepair, String... tableNames);
 
     /**
      * @deprecated use {@link #repairAsync(String keyspace, Map options)} instead.
      */
     @Deprecated
-    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... columnFamilies) throws IOException;
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... tableNames) throws IOException;
 
     /**
      * Same as forceRepairAsync, but handles a specified range
@@ -343,22 +348,26 @@
      * @param parallelismDegree 0: sequential, 1: parallel, 2: DC parallel
      */
     @Deprecated
-    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, int parallelismDegree, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... columnFamilies);
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, int parallelismDegree, Collection<String> dataCenters, Collection<String> hosts, boolean fullRepair, String... tableNames);
 
     /**
      * @deprecated use {@link #repairAsync(String keyspace, Map options)} instead.
      */
     @Deprecated
-    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, boolean primaryRange, boolean fullRepair, String... columnFamilies);
+    public int forceRepairAsync(String keyspace, boolean isSequential, boolean isLocal, boolean primaryRange, boolean fullRepair, String... tableNames);
 
     /**
      * @deprecated use {@link #repairAsync(String keyspace, Map options)} instead.
      */
     @Deprecated
-    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, boolean fullRepair, String... columnFamilies);
+    public int forceRepairRangeAsync(String beginToken, String endToken, String keyspaceName, boolean isSequential, boolean isLocal, boolean fullRepair, String... tableNames);
 
     public void forceTerminateAllRepairSessions();
 
+    public void setRepairSessionMaxTreeDepth(int depth);
+
+    public int getRepairSessionMaxTreeDepth();
+
     /**
      * transfer this node's data to other machines and remove it from service.
      */
@@ -419,16 +428,16 @@
     public void drain() throws IOException, InterruptedException, ExecutionException;
 
     /**
-     * Truncates (deletes) the given columnFamily from the provided keyspace.
+     * Truncates (deletes) the given table from the provided keyspace.
      * Calling truncate results in actual deletion of all data in the cluster
-     * under the given columnFamily and it will fail unless all hosts are up.
+     * under the given table and it will fail unless all hosts are up.
      * All data in the given column family will be deleted, but its definition
      * will not be affected.
      *
      * @param keyspace The keyspace to delete from
-     * @param columnFamily The column family to delete data from.
+     * @param table The column family to delete data from.
      */
-    public void truncate(String keyspace, String columnFamily)throws TimeoutException, IOException;
+    public void truncate(String keyspace, String table)throws TimeoutException, IOException;
 
     /**
      * given a list of tokens (representing the nodes in the cluster), returns
@@ -449,6 +458,8 @@
 
     public List<String> getNonSystemKeyspaces();
 
+    public List<String> getNonLocalStrategyKeyspaces();
+
     /**
      * Change endpointsnitch class and dynamic-ness (and dynamic attributes) at runtime
      * @param epSnitchClassName        the canonical path name for a class implementing IEndpointSnitch
@@ -490,6 +501,8 @@
     // allows a node that have been started without joining the ring to join it
     public void joinRing() throws IOException;
     public boolean isJoined();
+    public boolean isDrained();
+    public boolean isDraining();
 
     /** Check if currently bootstrapping.
      * Note this becomes false before {@link org.apache.cassandra.db.SystemKeyspace#bootstrapComplete()} is called,
@@ -531,12 +544,12 @@
     public void rescheduleFailedDeletions();
 
     /**
-     * Load new SSTables to the given keyspace/columnFamily
+     * Load new SSTables to the given keyspace/table
      *
      * @param ksName The parent keyspace name
      * @param cfName The ColumnFamily name where SSTables belong
      */
-    public void loadNewSSTables(String ksName, String cfName);
+    public void loadNewSSTables(String ksName, String tableName);
 
     /**
      * Return a List of Tokens representing a sample of keys across all ColumnFamilyStores.
@@ -555,6 +568,8 @@
 
     public void resetLocalSchema() throws IOException;
 
+    public void reloadLocalSchema();
+
     /**
      * Enables/Disables tracing for the whole system. Only thrift requests can start tracing currently.
      *
@@ -569,8 +584,8 @@
      */
     public double getTraceProbability();
 
-    void disableAutoCompaction(String ks, String ... columnFamilies) throws IOException;
-    void enableAutoCompaction(String ks, String ... columnFamilies) throws IOException;
+    void disableAutoCompaction(String ks, String ... tables) throws IOException;
+    void enableAutoCompaction(String ks, String ... tables) throws IOException;
 
     public void deliverHints(String host) throws UnknownHostException;
 
@@ -604,4 +619,7 @@
      * @return true if the node successfully starts resuming. (this does not mean bootstrap streaming was success.)
      */
     public boolean resumeBootstrap();
+
+    /** Returns the max version that this node will negotiate for native protocol connections */
+    public int getMaxNativeProtocolVersion();
 }

diff --git a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
index dceeecd..2eecfee 100644
--- a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java

@@ -17,157 +17,215 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.util.*;
-
-import com.google.common.annotations.VisibleForTesting;
-
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.ColumnCounter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.cassandra.service.ClientState;
 
 abstract class AbstractQueryPager implements QueryPager
 {
-    private static final Logger logger = LoggerFactory.getLogger(AbstractQueryPager.class);
-
-    private final ConsistencyLevel consistencyLevel;
-    private final boolean localQuery;
-
-    protected final CFMetaData cfm;
-    protected final IDiskAtomFilter columnFilter;
-    private final long timestamp;
+    protected final ReadCommand command;
+    protected final DataLimits limits;
+    protected final int protocolVersion;
+    private final boolean enforceStrictLiveness;
 
     private int remaining;
+
+    // This is the last key we've been reading from (or can still be reading within). This the key for
+    // which remainingInPartition makes sense: if we're starting another key, we should reset remainingInPartition
+    // (and this is done in PagerIterator). This can be null (when we start).
+    private DecoratedKey lastKey;
+    private int remainingInPartition;
+
     private boolean exhausted;
-    private boolean shouldFetchExtraRow;
 
-    protected AbstractQueryPager(ConsistencyLevel consistencyLevel,
-                                 int toFetch,
-                                 boolean localQuery,
-                                 String keyspace,
-                                 String columnFamily,
-                                 IDiskAtomFilter columnFilter,
-                                 long timestamp)
+    protected AbstractQueryPager(ReadCommand command, int protocolVersion)
     {
-        this(consistencyLevel, toFetch, localQuery, Schema.instance.getCFMetaData(keyspace, columnFamily), columnFilter, timestamp);
+        this.command = command;
+        this.protocolVersion = protocolVersion;
+        this.limits = command.limits();
+        this.enforceStrictLiveness = command.metadata().enforceStrictLiveness();
+
+        this.remaining = limits.count();
+        this.remainingInPartition = limits.perPartitionCount();
     }
 
-    protected AbstractQueryPager(ConsistencyLevel consistencyLevel,
-                                 int toFetch,
-                                 boolean localQuery,
-                                 CFMetaData cfm,
-                                 IDiskAtomFilter columnFilter,
-                                 long timestamp)
+    public ReadOrderGroup startOrderGroup()
     {
-        this.consistencyLevel = consistencyLevel;
-        this.localQuery = localQuery;
-
-        this.cfm = cfm;
-        this.columnFilter = columnFilter;
-        this.timestamp = timestamp;
-
-        this.remaining = toFetch;
+        return command.startOrderGroup();
     }
 
-
-    public List<Row> fetchPage(int pageSize) throws RequestValidationException, RequestExecutionException
+    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState) throws RequestValidationException, RequestExecutionException
     {
         if (isExhausted())
-            return Collections.emptyList();
+            return EmptyIterators.partition();
 
-        int currentPageSize = nextPageSize(pageSize);
-        List<Row> rows = filterEmpty(queryNextPage(currentPageSize, consistencyLevel, localQuery));
-
-        if (rows.isEmpty())
+        pageSize = Math.min(pageSize, remaining);
+        Pager pager = new RowPager(limits.forPaging(pageSize), command.nowInSec());
+        ReadCommand readCommand = nextPageReadCommand(pageSize);
+        if (readCommand == null)
         {
-            logger.trace("Got empty set of rows, considering pager exhausted");
             exhausted = true;
-            return Collections.emptyList();
+            return EmptyIterators.partition();
         }
-
-        int liveCount = getPageLiveCount(rows);
-        logger.trace("Fetched {} live rows", liveCount);
-
-        // Because SP.getRangeSlice doesn't trim the result (see SP.trim()), liveCount may be greater than what asked
-        // (currentPageSize). This would throw off the paging logic so we trim the excess. It's not extremely efficient
-        // but most of the time there should be nothing or very little to trim.
-        if (liveCount > currentPageSize)
-        {
-            rows = discardLast(rows, liveCount - currentPageSize);
-            liveCount = currentPageSize;
-        }
-
-        remaining -= liveCount;
-
-        // If we've got less than requested, there is no more query to do (but
-        // we still need to return the current page)
-        if (liveCount < currentPageSize)
-        {
-            logger.trace("Got result ({}) smaller than page size ({}), considering pager exhausted", liveCount,
-                    currentPageSize);
-            exhausted = true;
-        }
-
-        // If it's not the first query and the first column is the last one returned (likely
-        // but not certain since paging can race with deletes/expiration), then remove the
-        // first column.
-        if (containsPreviousLast(rows.get(0)))
-        {
-            rows = discardFirst(rows);
-            remaining++;
-        }
-        // Otherwise, if 'shouldFetchExtraRow' was set, we queried for one more than the page size,
-        // so if the page is full, trim the last entry
-        else if (shouldFetchExtraRow && !exhausted)
-        {
-            // We've asked for one more than necessary
-            rows = discardLast(rows);
-            remaining++;
-        }
-
-        logger.trace("Remaining rows to page: {}", remaining);
-
-        if (!isExhausted())
-            shouldFetchExtraRow = recordLast(rows.get(rows.size() - 1));
-
-        return rows;
+        return Transformation.apply(readCommand.execute(consistency, clientState), pager);
     }
 
-    private List<Row> filterEmpty(List<Row> result)
+    public PartitionIterator fetchPageInternal(int pageSize, ReadOrderGroup orderGroup) throws RequestValidationException, RequestExecutionException
     {
-        for (Row row : result)
+        if (isExhausted())
+            return EmptyIterators.partition();
+
+        pageSize = Math.min(pageSize, remaining);
+        RowPager pager = new RowPager(limits.forPaging(pageSize), command.nowInSec());
+        ReadCommand readCommand = nextPageReadCommand(pageSize);
+        if (readCommand == null)
         {
-            if (row.cf == null || !row.cf.hasColumns())
+            exhausted = true;
+            return EmptyIterators.partition();
+        }
+        return Transformation.apply(readCommand.executeInternal(orderGroup), pager);
+    }
+
+    public UnfilteredPartitionIterator fetchPageUnfiltered(CFMetaData cfm, int pageSize, ReadOrderGroup orderGroup)
+    {
+        if (isExhausted())
+            return EmptyIterators.unfilteredPartition(cfm, false);
+
+        pageSize = Math.min(pageSize, remaining);
+
+        ReadCommand readCommand = nextPageReadCommand(pageSize);
+        if (readCommand == null)
+        {
+            exhausted = true;
+            return EmptyIterators.unfilteredPartition(cfm, false);
+        }
+        UnfilteredPager pager = new UnfilteredPager(limits.forPaging(pageSize), command.nowInSec());
+        return Transformation.apply(readCommand.executeLocally(orderGroup), pager);
+    }
+
+    private class UnfilteredPager extends Pager<Unfiltered>
+    {
+
+        private UnfilteredPager(DataLimits pageLimits, int nowInSec)
+        {
+            super(pageLimits, nowInSec);
+        }
+
+        protected BaseRowIterator<Unfiltered> apply(BaseRowIterator<Unfiltered> partition)
+        {
+            return Transformation.apply(counter.applyTo((UnfilteredRowIterator) partition), this);
+        }
+    }
+
+    private class RowPager extends Pager<Row>
+    {
+
+        private RowPager(DataLimits pageLimits, int nowInSec)
+        {
+            super(pageLimits, nowInSec);
+        }
+
+        protected BaseRowIterator<Row> apply(BaseRowIterator<Row> partition)
+        {
+            return Transformation.apply(counter.applyTo((RowIterator) partition), this);
+        }
+    }
+
+    private abstract class Pager<T extends Unfiltered> extends Transformation<BaseRowIterator<T>>
+    {
+        private final DataLimits pageLimits;
+        protected final DataLimits.Counter counter;
+        private Row lastRow;
+        private boolean isFirstPartition = true;
+
+        private Pager(DataLimits pageLimits, int nowInSec)
+        {
+            this.counter = pageLimits.newCounter(nowInSec, true, command.selectsFullPartition(), enforceStrictLiveness);
+            this.pageLimits = pageLimits;
+        }
+
+        @Override
+        public BaseRowIterator<T> applyToPartition(BaseRowIterator<T> partition)
+        {
+            DecoratedKey key = partition.partitionKey();
+            if (lastKey == null || !lastKey.equals(key))
+                remainingInPartition = limits.perPartitionCount();
+            lastKey = key;
+
+            // If this is the first partition of this page, this could be the continuation of a partition we've started
+            // on the previous page. In which case, we could have the problem that the partition has no more "regular"
+            // rows (but the page size is such we didn't knew before) but it does has a static row. We should then skip
+            // the partition as returning it would means to the upper layer that the partition has "only" static columns,
+            // which is not the case (and we know the static results have been sent on the previous page).
+            if (isFirstPartition)
             {
-                List<Row> newResult = new ArrayList<Row>(result.size() - 1);
-                for (Row row2 : result)
+                isFirstPartition = false;
+                if (isPreviouslyReturnedPartition(key) && !partition.hasNext())
                 {
-                    if (row2.cf == null || !row2.cf.hasColumns())
-                        continue;
-
-                    newResult.add(row2);
+                    partition.close();
+                    return null;
                 }
-                return newResult;
             }
+
+            return apply(partition);
         }
-        return result;
+
+        protected abstract BaseRowIterator<T> apply(BaseRowIterator<T> partition);
+
+        @Override
+        public void onClose()
+        {
+            recordLast(lastKey, lastRow);
+
+            int counted = counter.counted();
+            remaining -= counted;
+            // If the clustering of the last row returned is a static one, it means that the partition was only
+            // containing data within the static columns. If the clustering of the last row returned is empty
+            // it means that there is only one row per partition. Therefore, in both cases there are no data remaining
+            // within the partition.
+            if (lastRow != null && (lastRow.clustering() == Clustering.STATIC_CLUSTERING
+                    || lastRow.clustering() == Clustering.EMPTY))
+            {
+                remainingInPartition = 0;
+            }
+            else
+            {
+                remainingInPartition -= counter.countedInCurrentPartition();
+            }
+            exhausted = counted < pageLimits.count();
+        }
+
+        public Row applyToStatic(Row row)
+        {
+            if (!row.isEmpty())
+                lastRow = row;
+            return row;
+        }
+
+        @Override
+        public Row applyToRow(Row row)
+        {
+            lastRow = row;
+            return row;
+        }
     }
 
-    protected void restoreState(int remaining, boolean shouldFetchExtraRow)
+    protected void restoreState(DecoratedKey lastKey, int remaining, int remainingInPartition)
     {
+        this.lastKey = lastKey;
         this.remaining = remaining;
-        this.shouldFetchExtraRow = shouldFetchExtraRow;
+        this.remainingInPartition = remainingInPartition;
     }
 
     public boolean isExhausted()
     {
-        return exhausted || remaining == 0;
+        return exhausted || remaining == 0 || ((this instanceof SinglePartitionPager) && remainingInPartition == 0);
     }
 
     public int maxRemaining()
@@ -175,227 +233,12 @@
         return remaining;
     }
 
-    public long timestamp()
+    protected int remainingInPartition()
     {
-        return timestamp;
+        return remainingInPartition;
     }
 
-    private int nextPageSize(int pageSize)
-    {
-        return Math.min(remaining, pageSize) + (shouldFetchExtraRow ? 1 : 0);
-    }
-
-    public ColumnCounter columnCounter()
-    {
-        return columnFilter.columnCounter(cfm.comparator, timestamp);
-    }
-
-    protected abstract List<Row> queryNextPage(int pageSize, ConsistencyLevel consistency, boolean localQuery) throws RequestValidationException, RequestExecutionException;
-
-    /**
-     * Checks to see if the first row of a new page contains the last row from the previous page.
-     * @param first the first row of the new page
-     * @return true if <code>first</code> contains the last from from the previous page and it is live, false otherwise
-     */
-    protected abstract boolean containsPreviousLast(Row first);
-
-    /**
-     * Saves the paging state by recording the last seen partition key and cell name (where applicable).
-     * @param last the last row in the current page
-     * @return true if an extra row should be fetched in the next page,false otherwise
-     */
-    protected abstract boolean recordLast(Row last);
-
-    protected abstract boolean isReversed();
-
-    protected List<Row> discardFirst(List<Row> rows)
-    {
-        return discardFirst(rows, 1);
-    }
-
-    @VisibleForTesting
-    List<Row> discardFirst(List<Row> rows, int toDiscard)
-    {
-        if (toDiscard == 0 || rows.isEmpty())
-            return rows;
-
-        int i = 0;
-        DecoratedKey firstKey = null;
-        ColumnFamily firstCf = null;
-        while (toDiscard > 0 && i < rows.size())
-        {
-            Row first = rows.get(i++);
-            firstKey = first.key;
-            firstCf = first.cf.cloneMeShallow(isReversed());
-            toDiscard -= isReversed()
-                       ? discardLast(first.cf, toDiscard, firstCf)
-                       : discardFirst(first.cf, toDiscard, firstCf);
-        }
-
-        // If there is less live data than to discard, all is discarded
-        if (toDiscard > 0)
-            return Collections.<Row>emptyList();
-
-        // i is the index of the first row that we are sure to keep. On top of that,
-        // we also keep firstCf is it hasn't been fully emptied by the last iteration above.
-        int count = firstCf.getColumnCount();
-        int newSize = rows.size() - (count == 0 ? i : i - 1);
-        List<Row> newRows = new ArrayList<Row>(newSize);
-        if (count != 0)
-            newRows.add(new Row(firstKey, firstCf));
-        newRows.addAll(rows.subList(i, rows.size()));
-
-        return newRows;
-    }
-
-    private List<Row> discardLast(List<Row> rows)
-    {
-        return discardLast(rows, 1);
-    }
-
-    @VisibleForTesting
-    List<Row> discardLast(List<Row> rows, int toDiscard)
-    {
-        if (toDiscard == 0 || rows.isEmpty())
-            return rows;
-
-        int i = rows.size()-1;
-        DecoratedKey lastKey = null;
-        ColumnFamily lastCf = null;
-        while (toDiscard > 0 && i >= 0)
-        {
-            Row last = rows.get(i--);
-            lastKey = last.key;
-            lastCf = last.cf.cloneMeShallow(isReversed());
-            toDiscard -= isReversed()
-                       ? discardFirst(last.cf, toDiscard, lastCf)
-                       : discardLast(last.cf, toDiscard, lastCf);
-        }
-
-        // If there is less live data than to discard, all is discarded
-        if (toDiscard > 0)
-            return Collections.<Row>emptyList();
-
-        // i is the index of the last row that we are sure to keep. On top of that,
-        // we also keep lastCf is it hasn't been fully emptied by the last iteration above.
-        int count = lastCf.getColumnCount();
-        int newSize = count == 0 ? i+1 : i+2;
-        List<Row> newRows = new ArrayList<Row>(newSize);
-        newRows.addAll(rows.subList(0, i+1));
-        if (count != 0)
-            newRows.add(new Row(lastKey, lastCf));
-
-        return newRows;
-    }
-
-    private int getPageLiveCount(List<Row> page)
-    {
-        int count = 0;
-        for (Row row : page)
-            count += columnCounter().countAll(row.cf).live();
-        return count;
-    }
-
-    private int discardFirst(ColumnFamily cf, int toDiscard, ColumnFamily newCf)
-    {
-        boolean isReversed = isReversed();
-        DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester(isReversed);
-        return isReversed
-             ? discardTail(cf, toDiscard, newCf, cf.reverseIterator(), tester)
-             : discardHead(toDiscard, newCf, cf.iterator(), tester);
-    }
-
-    private int discardLast(ColumnFamily cf, int toDiscard, ColumnFamily newCf)
-    {
-        boolean isReversed = isReversed();
-        DeletionInfo.InOrderTester tester = cf.deletionInfo().inOrderTester(isReversed);
-        return isReversed
-             ? discardHead(toDiscard, newCf, cf.reverseIterator(), tester)
-             : discardTail(cf, toDiscard, newCf, cf.iterator(), tester);
-    }
-
-    private int discardHead(int toDiscard, ColumnFamily copy, Iterator<Cell> iter, DeletionInfo.InOrderTester tester)
-    {
-        ColumnCounter counter = columnCounter();
-
-        List<Cell> staticCells = new ArrayList<>(cfm.staticColumns().size());
-
-        // Discard the first 'toDiscard' live, non-static cells
-        while (iter.hasNext())
-        {
-            Cell c = iter.next();
-
-            // if it's a static column, don't count it and save it to add to the trimmed results
-            ColumnDefinition columnDef = cfm.getColumnDefinition(c.name());
-            if (columnDef != null && columnDef.kind == ColumnDefinition.Kind.STATIC)
-            {
-                staticCells.add(c);
-                continue;
-            }
-
-            counter.count(c, tester);
-
-            // once we've discarded the required amount, add the rest
-            if (counter.live() > toDiscard)
-            {
-                for (Cell staticCell : staticCells)
-                    copy.addColumn(staticCell);
-
-                copy.addColumn(c);
-                while (iter.hasNext())
-                    copy.addColumn(iter.next());
-            }
-        }
-        int live = counter.live();
-        // We want to take into account the row even if it was containing only static columns
-        if (live == 0 && !staticCells.isEmpty())
-            live = 1;
-        return Math.min(live, toDiscard);
-    }
-
-    private int discardTail(ColumnFamily cf, int toDiscard, ColumnFamily copy, Iterator<Cell> iter, DeletionInfo.InOrderTester tester)
-    {
-        // Redoing the counting like that is not extremely efficient.
-        // This is called only for reversed slices or in the case of a race between
-        // paging and a deletion (pretty unlikely), so this is probably acceptable.
-        int liveCount = columnCounter().countAll(cf).live();
-
-        if (liveCount == toDiscard)
-            return toDiscard;
-
-        ColumnCounter counter = columnCounter();
-        // Discard the last 'toDiscard' live (so stop adding as sound as we're past 'liveCount - toDiscard')
-        while (iter.hasNext())
-        {
-            Cell c = iter.next();
-            counter.count(c, tester);
-            if (counter.live() > liveCount - toDiscard)
-                break;
-
-            copy.addColumn(c);
-        }
-        return Math.min(liveCount, toDiscard);
-    }
-
-    /**
-     * Returns the first non-static cell in the ColumnFamily.  This is necessary to avoid recording a static column
-     * as the "last" cell seen in a reversed query.  Because we will always query static columns alongside the normal
-     * data for a page, they are not a good indicator of where paging should resume.  When we begin the next page, we
-     * need to start from the last non-static cell.
-     */
-    protected Cell firstNonStaticCell(ColumnFamily cf)
-    {
-        for (Cell cell : cf)
-        {
-            ColumnDefinition def = cfm.getColumnDefinition(cell.name());
-            if (def == null || def.kind != ColumnDefinition.Kind.STATIC)
-                return cell;
-        }
-        return null;
-    }
-
-    protected static Cell lastCell(ColumnFamily cf)
-    {
-        return cf.getReverseSortedColumns().iterator().next();
-    }
+    protected abstract ReadCommand nextPageReadCommand(int pageSize);
+    protected abstract void recordLast(DecoratedKey key, Row row);
+    protected abstract boolean isPreviouslyReturnedPartition(DecoratedKey key);
 }

diff --git a/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java b/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
index 35d0971..aa268ab 100644
--- a/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
+++ b/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java

@@ -17,10 +17,12 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.util.ArrayList;
-import java.util.List;
+import org.apache.cassandra.utils.AbstractIterator;
 
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.service.ClientState;
@@ -36,56 +38,50 @@
  *
  * For now, we keep it simple (somewhat) and just do one command at a time. Provided that we make sure to not
  * create a pager unless we need to, this is probably fine. Though if we later want to get fancy, we could use the
- * cfs meanRowSize to decide if parallelizing some of the command might be worth it while being confident we don't
+ * cfs meanPartitionSize to decide if parallelizing some of the command might be worth it while being confident we don't
  * blow out memory.
  */
-class MultiPartitionPager implements QueryPager
+public class MultiPartitionPager implements QueryPager
 {
     private final SinglePartitionPager[] pagers;
-    private final long timestamp;
+    private final DataLimits limit;
+    private final boolean selectsFullPartitions;
+
+    private final int nowInSec;
 
     private int remaining;
     private int current;
 
-    MultiPartitionPager(List<ReadCommand> commands, ConsistencyLevel consistencyLevel, ClientState cState, boolean localQuery, PagingState state, int limitForQuery)
+    public MultiPartitionPager(SinglePartitionReadCommand.Group group, PagingState state, int protocolVersion)
     {
+        this.limit = group.limits();
+        this.nowInSec = group.nowInSec();
+        this.selectsFullPartitions = group.selectsFullPartition();
+
         int i = 0;
         // If it's not the beginning (state != null), we need to find where we were and skip previous commands
         // since they are done.
         if (state != null)
-            for (; i < commands.size(); i++)
-                if (commands.get(i).key.equals(state.partitionKey))
+            for (; i < group.commands.size(); i++)
+                if (group.commands.get(i).partitionKey().getKey().equals(state.partitionKey))
                     break;
 
-        if (i >= commands.size())
+        if (i >= group.commands.size())
         {
             pagers = null;
-            timestamp = -1;
             return;
         }
 
-        pagers = new SinglePartitionPager[commands.size() - i];
+        pagers = new SinglePartitionPager[group.commands.size() - i];
         // 'i' is on the first non exhausted pager for the previous page (or the first one)
-        pagers[0] = makePager(commands.get(i), consistencyLevel, cState, localQuery, state);
-        timestamp = commands.get(i).timestamp;
+        SinglePartitionReadCommand command = group.commands.get(i);
+        pagers[0] = command.getPager(state, protocolVersion);
 
         // Following ones haven't been started yet
-        for (int j = i + 1; j < commands.size(); j++)
-        {
-            ReadCommand command = commands.get(j);
-            if (command.timestamp != timestamp)
-                throw new IllegalArgumentException("All commands must have the same timestamp or weird results may happen.");
-            pagers[j - i] = makePager(command, consistencyLevel, cState, localQuery, null);
-        }
+        for (int j = i + 1; j < group.commands.size(); j++)
+            pagers[j - i] = group.commands.get(j).getPager(null, protocolVersion);
 
-        remaining = state == null ? limitForQuery : state.remaining;
-    }
-
-    private static SinglePartitionPager makePager(ReadCommand command, ConsistencyLevel consistencyLevel, ClientState cState, boolean localQuery, PagingState state)
-    {
-        return command instanceof SliceFromReadCommand
-             ? new SliceQueryPager((SliceFromReadCommand)command, consistencyLevel, cState, localQuery, state)
-             : new NamesQueryPager((SliceByNamesReadCommand)command, consistencyLevel, cState, localQuery);
+        remaining = state == null ? limit.count() : state.remaining;
     }
 
     public PagingState state()
@@ -94,8 +90,10 @@
         if (isExhausted())
             return null;
 
-        PagingState state = pagers[current].state();
-        return new PagingState(pagers[current].key(), state == null ? null : state.cellName, remaining);
+        SinglePartitionPager pager = pagers[current];
+        PagingState pagerState = pager.state();
+        // Multi-partition paging state represents a _current_ position.
+        return new PagingState(pager.key(), pagerState == null ? null : pagerState.rowMark, remaining, pager.remainingInPartition());
     }
 
     public boolean isExhausted()
@@ -113,35 +111,101 @@
         return true;
     }
 
-    public List<Row> fetchPage(int pageSize) throws RequestValidationException, RequestExecutionException
+    public ReadOrderGroup startOrderGroup()
     {
-        List<Row> result = new ArrayList<Row>();
-
-        int remainingThisQuery = Math.min(remaining, pageSize);
-        while (remainingThisQuery > 0 && !isExhausted())
+        // Note that for all pagers, the only difference is the partition key to which it applies, so in practice we
+        // can use any of the sub-pager ReadOrderGroup group to protect the whole pager
+        for (int i = current; i < pagers.length; i++)
         {
-            // isExhausted has set us on the first non-exhausted pager
-            List<Row> page = pagers[current].fetchPage(remainingThisQuery);
-            if (page.isEmpty())
-                continue;
+            if (pagers[i] != null)
+                return pagers[i].startOrderGroup();
+        }
+        throw new AssertionError("Shouldn't be called on an exhausted pager");
+    }
 
-            Row row = page.get(0);
-            int fetched = pagers[current].columnCounter().countAll(row.cf).live();
-            remaining -= fetched;
-            remainingThisQuery -= fetched;
-            result.add(row);
+    @SuppressWarnings("resource") // iter closed via countingIter
+    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState) throws RequestValidationException, RequestExecutionException
+    {
+        int toQuery = Math.min(remaining, pageSize);
+        PagersIterator iter = new PagersIterator(toQuery, consistency, clientState, null);
+        /**
+         * It's safe to set it as false since all PartitionIterators have been filtered by each SPRC.
+         */
+        boolean enforceStrictLiveness = false;
+        DataLimits.Counter counter = limit.forPaging(toQuery).newCounter(nowInSec, true, selectsFullPartitions, enforceStrictLiveness);
+        iter.setCounter(counter);
+        return counter.applyTo(iter);
+    }
+
+    @SuppressWarnings("resource") // iter closed via countingIter
+    public PartitionIterator fetchPageInternal(int pageSize, ReadOrderGroup orderGroup) throws RequestValidationException, RequestExecutionException
+    {
+        int toQuery = Math.min(remaining, pageSize);
+        PagersIterator iter = new PagersIterator(toQuery, null, null, orderGroup);
+        /**
+         * It's safe to set it as false since all PartitionIterators have been filtered by each SPRC.
+         */
+        boolean enforceStrictLiveness = false;
+        DataLimits.Counter counter = limit.forPaging(toQuery).newCounter(nowInSec, true, selectsFullPartitions, enforceStrictLiveness);
+        iter.setCounter(counter);
+        return counter.applyTo(iter);
+    }
+
+    private class PagersIterator extends AbstractIterator<RowIterator> implements PartitionIterator
+    {
+        private final int pageSize;
+        private PartitionIterator result;
+        private DataLimits.Counter counter;
+
+        // For "normal" queries
+        private final ConsistencyLevel consistency;
+        private final ClientState clientState;
+
+        // For internal queries
+        private final ReadOrderGroup orderGroup;
+
+        public PagersIterator(int pageSize, ConsistencyLevel consistency, ClientState clientState, ReadOrderGroup orderGroup)
+        {
+            this.pageSize = pageSize;
+            this.consistency = consistency;
+            this.clientState = clientState;
+            this.orderGroup = orderGroup;
         }
 
-        return result;
+        public void setCounter(DataLimits.Counter counter)
+        {
+            this.counter = counter;
+        }
+
+        protected RowIterator computeNext()
+        {
+            while (result == null || !result.hasNext())
+            {
+                if (result != null)
+                    result.close();
+
+                // This sets us on the first non-exhausted pager
+                if (isExhausted())
+                    return endOfData();
+
+                int toQuery = pageSize - counter.counted();
+                result = consistency == null
+                       ? pagers[current].fetchPageInternal(toQuery, orderGroup)
+                       : pagers[current].fetchPage(toQuery, consistency, clientState);
+            }
+            return result.next();
+        }
+
+        public void close()
+        {
+            remaining -= counter.counted();
+            if (result != null)
+                result.close();
+        }
     }
 
     public int maxRemaining()
     {
         return remaining;
     }
-
-    public long timestamp()
-    {
-        return timestamp;
-    }
 }

diff --git a/src/java/org/apache/cassandra/service/pager/NamesQueryPager.java b/src/java/org/apache/cassandra/service/pager/NamesQueryPager.java
deleted file mode 100644
index d03e582..0000000
--- a/src/java/org/apache/cassandra/service/pager/NamesQueryPager.java
+++ /dev/null

@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service.pager;
-
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.ColumnCounter;
-import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.StorageProxy;
-
-/**
- * Pager over a SliceByNamesReadCommand.
- */
-public class NamesQueryPager implements SinglePartitionPager
-{
-    private final SliceByNamesReadCommand command;
-    private final ConsistencyLevel consistencyLevel;
-    private final ClientState state;
-    private final boolean localQuery;
-
-    private volatile boolean queried;
-
-    /**
-     * For now, we'll only use this in CQL3. In there, as name query can never
-     * yield more than one CQL3 row, there is no need for paging and so this is straight-forward.
-     *
-     * For thrift, we could imagine needing to page, though even then it's very
-     * unlikely unless the pageSize is very small.
-     *
-     * In any case we currently assert in fetchPage if it's a "thrift" query (i.e. a query that
-     * count every cell individually) and the names filter asks for more than pageSize columns.
-     */
-    // Don't use directly, use QueryPagers method instead
-    NamesQueryPager(SliceByNamesReadCommand command, ConsistencyLevel consistencyLevel, ClientState state, boolean localQuery)
-    {
-        this.command = command;
-        this.consistencyLevel = consistencyLevel;
-        this.state = state;
-        this.localQuery = localQuery;
-    }
-
-    public ByteBuffer key()
-    {
-        return command.key;
-    }
-
-    public ColumnCounter columnCounter()
-    {
-        // We know NamesQueryFilter.columnCounter don't care about his argument
-        return command.filter.columnCounter(null, command.timestamp);
-    }
-
-    public PagingState state()
-    {
-        return null;
-    }
-
-    public boolean isExhausted()
-    {
-        return queried;
-    }
-
-    public List<Row> fetchPage(int pageSize) throws RequestValidationException, RequestExecutionException
-    {
-        assert command.filter.countCQL3Rows() || command.filter.columns.size() <= pageSize;
-
-        if (isExhausted())
-            return Collections.<Row>emptyList();
-
-        queried = true;
-        return localQuery
-             ? Collections.singletonList(command.getRow(Keyspace.open(command.ksName)))
-             : StorageProxy.read(Collections.<ReadCommand>singletonList(command), consistencyLevel, state);
-    }
-
-    public int maxRemaining()
-    {
-        if (queried)
-            return 0;
-
-        return command.filter.countCQL3Rows() ? 1 : command.filter.columns.size();
-    }
-
-    public long timestamp()
-    {
-        return command.timestamp;
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/pager/Pageable.java b/src/java/org/apache/cassandra/service/pager/Pageable.java
deleted file mode 100644
index d4986f7..0000000
--- a/src/java/org/apache/cassandra/service/pager/Pageable.java
+++ /dev/null

@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service.pager;
-
-import java.util.List;
-
-import org.apache.cassandra.db.ReadCommand;
-
-/**
- * Marker interface for commands that can be paged.
- */
-public interface Pageable
-{
-    public static class ReadCommands implements Pageable
-    {
-        public final List<ReadCommand> commands;
-
-        public final int limitForQuery;
-
-        public ReadCommands(List<ReadCommand> commands, int limitForQuery)
-        {
-            this.commands = commands;
-            this.limitForQuery = limitForQuery;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/pager/PagingState.java b/src/java/org/apache/cassandra/service/pager/PagingState.java
index f168880..7de7e6f 100644
--- a/src/java/org/apache/cassandra/service/pager/PagingState.java
+++ b/src/java/org/apache/cassandra/service/pager/PagingState.java

@@ -17,55 +17,56 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.io.DataInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.*;
 
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.primitives.Ints;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.LegacyLayout;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputBufferFixed;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.transport.ProtocolException;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.apache.cassandra.db.TypeSizes.sizeof;
+import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
+import static org.apache.cassandra.transport.Server.VERSION_3;
+import static org.apache.cassandra.transport.Server.VERSION_4;
+import static org.apache.cassandra.utils.ByteBufferUtil.*;
+import static org.apache.cassandra.utils.vint.VIntCoding.computeUnsignedVIntSize;
+import static org.apache.cassandra.utils.vint.VIntCoding.getUnsignedVInt;
+
+@SuppressWarnings("WeakerAccess")
 public class PagingState
 {
-    public final ByteBuffer partitionKey;
-    public final ByteBuffer cellName;
+    public final ByteBuffer partitionKey;  // Can be null for single partition queries.
+    public final RowMark rowMark;          // Can be null if not needed.
     public final int remaining;
+    public final int remainingInPartition;
 
-    public PagingState(ByteBuffer partitionKey, ByteBuffer cellName, int remaining)
+    public PagingState(ByteBuffer partitionKey, RowMark rowMark, int remaining, int remainingInPartition)
     {
-        this.partitionKey = partitionKey == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : partitionKey;
-        this.cellName = cellName == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : cellName;
+        this.partitionKey = partitionKey;
+        this.rowMark = rowMark;
         this.remaining = remaining;
+        this.remainingInPartition = remainingInPartition;
     }
 
-    public static PagingState deserialize(ByteBuffer bytes)
+    public ByteBuffer serialize(int protocolVersion)
     {
-        if (bytes == null)
-            return null;
-
+        assert rowMark == null || protocolVersion == rowMark.protocolVersion;
         try
         {
-            DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(bytes));
-            ByteBuffer pk = ByteBufferUtil.readWithShortLength(in);
-            ByteBuffer cn = ByteBufferUtil.readWithShortLength(in);
-            int remaining = in.readInt();
-            return new PagingState(pk, cn, remaining);
-        }
-        catch (IOException e)
-        {
-            throw new ProtocolException("Invalid value for the paging state");
-        }
-    }
-
-    public ByteBuffer serialize()
-    {
-        try (DataOutputBuffer out = new DataOutputBufferFixed(serializedSize()))
-        {
-            ByteBufferUtil.writeWithShortLength(partitionKey, out);
-            ByteBufferUtil.writeWithShortLength(cellName, out);
-            out.writeInt(remaining);
-            return out.buffer();
+            return protocolVersion > VERSION_3 ? modernSerialize() : legacySerialize(true);
         }
         catch (IOException e)
         {
@@ -73,16 +74,339 @@
         }
     }
 
-    private int serializedSize()
+    public int serializedSize(int protocolVersion)
     {
-        return 2 + partitionKey.remaining()
-             + 2 + cellName.remaining()
-             + 4;
+        assert rowMark == null || protocolVersion == rowMark.protocolVersion;
+
+        return protocolVersion > VERSION_3 ? modernSerializedSize() : legacySerializedSize(true);
+    }
+
+    /**
+     * It's possible to receive a V3 paging state on a V4 client session, and vice versa - so we cannot
+     * blindly rely on the protocol version provided. We must verify first that the buffer indeed contains
+     * a paging state that adheres to the protocol version provided, or, if not - see if it is in a different
+     * version, in which case we try the other format.
+     */
+    public static PagingState deserialize(ByteBuffer bytes, int protocolVersion)
+    {
+        if (bytes == null)
+            return null;
+
+        try
+        {
+            /*
+             * We can't just attempt to deser twice, as we risk to misinterpet short/vint
+             * lengths and allocate huge byte arrays for readWithVIntLength() or,
+             * to a lesser extent, readWithShortLength()
+             */
+
+            if (protocolVersion > VERSION_3)
+            {
+                if (isModernSerialized(bytes)) return modernDeserialize(bytes, protocolVersion);
+                if (isLegacySerialized(bytes)) return legacyDeserialize(bytes, VERSION_3);
+            }
+
+            if (protocolVersion < VERSION_4)
+            {
+                if (isLegacySerialized(bytes)) return legacyDeserialize(bytes, protocolVersion);
+                if (isModernSerialized(bytes)) return modernDeserialize(bytes, VERSION_4);
+            }
+        }
+        catch (IOException e)
+        {
+            throw new ProtocolException("Invalid value for the paging state");
+        }
+
+        throw new ProtocolException("Invalid value for the paging state");
+    }
+
+    /*
+     * Modern serde (> VERSION_3)
+     */
+
+    @SuppressWarnings({ "resource", "RedundantSuppression" })
+    private ByteBuffer modernSerialize() throws IOException
+    {
+        DataOutputBuffer out = new DataOutputBufferFixed(modernSerializedSize());
+        writeWithVIntLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey, out);
+        writeWithVIntLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark, out);
+        out.writeUnsignedVInt(remaining);
+        out.writeUnsignedVInt(remainingInPartition);
+        return out.buffer(false);
+    }
+
+    private static boolean isModernSerialized(ByteBuffer bytes)
+    {
+        int index = bytes.position();
+        int limit = bytes.limit();
+
+        long partitionKeyLen = getUnsignedVInt(bytes, index, limit);
+        if (partitionKeyLen < 0)
+            return false;
+        index += computeUnsignedVIntSize(partitionKeyLen) + partitionKeyLen;
+        if (index >= limit)
+            return false;
+
+        long rowMarkerLen = getUnsignedVInt(bytes, index, limit);
+        if (rowMarkerLen < 0)
+            return false;
+        index += computeUnsignedVIntSize(rowMarkerLen) + rowMarkerLen;
+        if (index >= limit)
+            return false;
+
+        long remaining = getUnsignedVInt(bytes, index, limit);
+        if (remaining < 0)
+            return false;
+        index += computeUnsignedVIntSize(remaining);
+        if (index >= limit)
+            return false;
+
+        long remainingInPartition = getUnsignedVInt(bytes, index, limit);
+        if (remainingInPartition < 0)
+            return false;
+        index += computeUnsignedVIntSize(remainingInPartition);
+        return index == limit;
+    }
+
+    @SuppressWarnings({ "resource", "RedundantSuppression" })
+    private static PagingState modernDeserialize(ByteBuffer bytes, int protocolVersion) throws IOException
+    {
+        if (protocolVersion < VERSION_4)
+            throw new IllegalArgumentException();
+
+        DataInputBuffer in = new DataInputBuffer(bytes, false);
+
+        ByteBuffer partitionKey = readWithVIntLength(in);
+        ByteBuffer rawMark = readWithVIntLength(in);
+        int remaining = Ints.checkedCast(in.readUnsignedVInt());
+        int remainingInPartition = Ints.checkedCast(in.readUnsignedVInt());
+
+        return new PagingState(partitionKey.hasRemaining() ? partitionKey : null,
+                               rawMark.hasRemaining() ? new RowMark(rawMark, protocolVersion) : null,
+                               remaining,
+                               remainingInPartition);
+    }
+
+    private int modernSerializedSize()
+    {
+        return serializedSizeWithVIntLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey)
+             + serializedSizeWithVIntLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark)
+             + sizeofUnsignedVInt(remaining)
+             + sizeofUnsignedVInt(remainingInPartition);
+    }
+
+    /*
+     * Legacy serde (< VERSION_4)
+     *
+     * There are two versions of legacy PagingState format - one used by 2.1/2.2 and one used by 3.0+.
+     * The latter includes remainingInPartition count, while the former doesn't.
+     */
+
+    @VisibleForTesting
+    @SuppressWarnings({ "resource", "RedundantSuppression" })
+    ByteBuffer legacySerialize(boolean withRemainingInPartition) throws IOException
+    {
+        DataOutputBuffer out = new DataOutputBufferFixed(legacySerializedSize(withRemainingInPartition));
+        writeWithShortLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey, out);
+        writeWithShortLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark, out);
+        out.writeInt(remaining);
+        if (withRemainingInPartition)
+            out.writeInt(remainingInPartition);
+        return out.buffer(false);
+    }
+
+    private static boolean isLegacySerialized(ByteBuffer bytes)
+    {
+        int index = bytes.position();
+        int limit = bytes.limit();
+
+        if (limit - index < 2)
+            return false;
+        short partitionKeyLen = bytes.getShort(index);
+        if (partitionKeyLen < 0)
+            return false;
+        index += 2 + partitionKeyLen;
+
+        if (limit - index < 2)
+            return false;
+        short rowMarkerLen = bytes.getShort(index);
+        if (rowMarkerLen < 0)
+            return false;
+        index += 2 + rowMarkerLen;
+
+        if (limit - index < 4)
+            return false;
+        int remaining = bytes.getInt(index);
+        if (remaining < 0)
+            return false;
+        index += 4;
+
+        // V3 encoded by 2.1/2.2 - sans remainingInPartition
+        if (index == limit)
+            return true;
+
+        if (limit - index == 4)
+        {
+            int remainingInPartition = bytes.getInt(index);
+            return remainingInPartition >= 0; // the value must make sense
+        }
+        return false;
+    }
+
+    @SuppressWarnings({ "resource", "RedundantSuppression" })
+    private static PagingState legacyDeserialize(ByteBuffer bytes, int protocolVersion) throws IOException
+    {
+        if (protocolVersion > VERSION_3)
+            throw new IllegalArgumentException();
+
+        DataInputBuffer in = new DataInputBuffer(bytes, false);
+
+        ByteBuffer partitionKey = readWithShortLength(in);
+        ByteBuffer rawMark = readWithShortLength(in);
+        int remaining = in.readInt();
+        /*
+         * 2.1/2.2 implementations of V3 protocol did not write remainingInPartition, but C* 3.0+ does, so we need
+         * to handle both variants of V3 serialization for compatibility.
+         */
+        int remainingInPartition = in.available() > 0 ? in.readInt() : Integer.MAX_VALUE;
+
+        return new PagingState(partitionKey.hasRemaining() ? partitionKey : null,
+                               rawMark.hasRemaining() ? new RowMark(rawMark, protocolVersion) : null,
+                               remaining,
+                               remainingInPartition);
+    }
+
+    @VisibleForTesting
+    int legacySerializedSize(boolean withRemainingInPartition)
+    {
+        return serializedSizeWithShortLength(null == partitionKey ? EMPTY_BYTE_BUFFER : partitionKey)
+             + serializedSizeWithShortLength(null == rowMark ? EMPTY_BYTE_BUFFER : rowMark.mark)
+             + sizeof(remaining)
+             + (withRemainingInPartition ? sizeof(remainingInPartition) : 0);
+    }
+
+    @Override
+    public final int hashCode()
+    {
+        return Objects.hash(partitionKey, rowMark, remaining, remainingInPartition);
+    }
+
+    @Override
+    public final boolean equals(Object o)
+    {
+        if(!(o instanceof PagingState))
+            return false;
+        PagingState that = (PagingState)o;
+        return Objects.equals(this.partitionKey, that.partitionKey)
+            && Objects.equals(this.rowMark, that.rowMark)
+            && this.remaining == that.remaining
+            && this.remainingInPartition == that.remainingInPartition;
     }
 
     @Override
     public String toString()
     {
-        return String.format("PagingState(key=%s, cellname=%s, remaining=%d", ByteBufferUtil.bytesToHex(partitionKey), ByteBufferUtil.bytesToHex(cellName), remaining);
+        return String.format("PagingState(key=%s, cellname=%s, remaining=%d, remainingInPartition=%d",
+                             partitionKey != null ? bytesToHex(partitionKey) : null,
+                             rowMark,
+                             remaining,
+                             remainingInPartition);
+    }
+
+    /**
+     * Marks the last row returned by paging, the one from which paging should continue.
+     * This class essentially holds a row clustering, but due to backward compatibility reasons,
+     * we need to actually store  the cell name for the last cell of the row we're marking when
+     * the protocol v3 is in use, and this class abstract that complication.
+     *
+     * See CASSANDRA-10254 for more details.
+     */
+    public static class RowMark
+    {
+        // This can be null for convenience if no row is marked.
+        private final ByteBuffer mark;
+        private final int protocolVersion;
+
+        private RowMark(ByteBuffer mark, int protocolVersion)
+        {
+            this.mark = mark;
+            this.protocolVersion = protocolVersion;
+        }
+
+        private static List<AbstractType<?>> makeClusteringTypes(CFMetaData metadata)
+        {
+            // This is the types that will be used when serializing the clustering in the paging state. We can't really use the actual clustering
+            // types however because we can't guarantee that there won't be a schema change between when we send the paging state and get it back,
+            // and said schema change could theoretically change one of the clustering types from a fixed width type to a non-fixed one
+            // (say timestamp -> blob). So we simply use a list of BytesTypes (for both reading and writting), which may be slightly inefficient
+            // for fixed-width types, but avoid any risk during schema changes.
+            int size = metadata.clusteringColumns().size();
+            List<AbstractType<?>> l = new ArrayList<>(size);
+            for (int i = 0; i < size; i++)
+                l.add(BytesType.instance);
+            return l;
+        }
+
+        public static RowMark create(CFMetaData metadata, Row row, int protocolVersion)
+        {
+            ByteBuffer mark;
+            if (protocolVersion <= VERSION_3)
+            {
+                // We need to be backward compatible with 2.1/2.2 nodes paging states. Which means we have to send
+                // the full cellname of the "last" cell in the row we get (since that's how 2.1/2.2 nodes will start after
+                // that last row if they get that paging state).
+                Iterator<Cell> cells = row.cellsInLegacyOrder(metadata, true).iterator();
+                if (!cells.hasNext())
+                {
+                    // If the last returned row has no cell, this means in 2.1/2.2 terms that we stopped on the row
+                    // marker. Note that this shouldn't happen if the table is COMPACT.
+                    assert !metadata.isCompactTable();
+                    mark = LegacyLayout.encodeCellName(metadata, row.clustering(), EMPTY_BYTE_BUFFER, null);
+                }
+                else
+                {
+                    Cell cell = cells.next();
+                    mark = LegacyLayout.encodeCellName(metadata, row.clustering(), cell.column().name.bytes, cell.column().isComplex() ? cell.path().get(0) : null);
+                }
+            }
+            else
+            {
+                // We froze the serialization version to 3.0 as we need to make this this doesn't change (that is, it has to be
+                // fix for a given version of the protocol).
+                mark = Clustering.serializer.serialize(row.clustering(), MessagingService.VERSION_30, makeClusteringTypes(metadata));
+            }
+            return new RowMark(mark, protocolVersion);
+        }
+
+        public Clustering clustering(CFMetaData metadata)
+        {
+            if (mark == null)
+                return null;
+
+            return protocolVersion <= VERSION_3
+                 ? LegacyLayout.decodeClustering(metadata, mark)
+                 : Clustering.serializer.deserialize(mark, MessagingService.VERSION_30, makeClusteringTypes(metadata));
+        }
+
+        @Override
+        public final int hashCode()
+        {
+            return Objects.hash(mark, protocolVersion);
+        }
+
+        @Override
+        public final boolean equals(Object o)
+        {
+            if(!(o instanceof RowMark))
+                return false;
+            RowMark that = (RowMark)o;
+            return Objects.equals(this.mark, that.mark) && this.protocolVersion == that.protocolVersion;
+        }
+
+        @Override
+        public String toString()
+        {
+            return mark == null ? "null" : bytesToHex(mark);
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java b/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java
new file mode 100644
index 0000000..bee4a1e
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java

@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service.pager;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+
+/**
+ * Pages a PartitionRangeReadCommand.
+ *
+ * Note: this only work for CQL3 queries for now (because thrift queries expect
+ * a different limit on the rows than on the columns, which complicates it).
+ */
+public class PartitionRangeQueryPager extends AbstractQueryPager
+{
+    private static final Logger logger = LoggerFactory.getLogger(PartitionRangeQueryPager.class);
+
+    private volatile DecoratedKey lastReturnedKey;
+    private volatile PagingState.RowMark lastReturnedRow;
+
+    public PartitionRangeQueryPager(PartitionRangeReadCommand command, PagingState state, int protocolVersion)
+    {
+        super(command, protocolVersion);
+
+        if (state != null)
+        {
+            lastReturnedKey = command.metadata().decorateKey(state.partitionKey);
+            lastReturnedRow = state.rowMark;
+            restoreState(lastReturnedKey, state.remaining, state.remainingInPartition);
+        }
+    }
+
+    public PagingState state()
+    {
+        return lastReturnedKey == null
+             ? null
+             : new PagingState(lastReturnedKey.getKey(), lastReturnedRow, maxRemaining(), remainingInPartition());
+    }
+
+    protected ReadCommand nextPageReadCommand(int pageSize)
+    throws RequestExecutionException
+    {
+        DataLimits limits;
+        DataRange fullRange = ((PartitionRangeReadCommand)command).dataRange();
+        DataRange pageRange;
+        if (lastReturnedKey == null)
+        {
+            pageRange = fullRange;
+            limits = command.limits().forPaging(pageSize);
+        }
+        // if the last key was the one of the end of the range we know that we are done
+        else if (lastReturnedKey.equals(fullRange.keyRange().right) && remainingInPartition() == 0 && lastReturnedRow == null)
+        {
+            return null;
+        }
+        else
+        {
+            // We want to include the last returned key only if we haven't achieved our per-partition limit, otherwise, don't bother.
+            // note that the distinct check should only be hit when getting queries in a mixed mode cluster where a 2.1/2.2-serialized
+            // PagingState is sent to a 3.0 node - in that case we get remainingInPartition = Integer.MAX_VALUE and we include
+            // duplicate keys. For standard non-mixed operation remainingInPartition will always be 0 for DISTINCT queries.
+            boolean includeLastKey = remainingInPartition() > 0 && lastReturnedRow != null && !command.limits().isDistinct();
+            AbstractBounds<PartitionPosition> bounds = makeKeyBounds(lastReturnedKey, includeLastKey);
+            if (includeLastKey)
+            {
+                pageRange = fullRange.forPaging(bounds, command.metadata().comparator, lastReturnedRow.clustering(command.metadata()), false);
+                limits = command.limits().forPaging(pageSize, lastReturnedKey.getKey(), remainingInPartition());
+            }
+            else
+            {
+                pageRange = fullRange.forSubRange(bounds);
+                limits = command.limits().forPaging(pageSize);
+            }
+        }
+
+        return ((PartitionRangeReadCommand) command).withUpdatedLimitsAndDataRange(limits, pageRange);
+    }
+
+    protected void recordLast(DecoratedKey key, Row last)
+    {
+        if (last != null)
+        {
+            lastReturnedKey = key;
+            if (last.clustering() != Clustering.STATIC_CLUSTERING)
+                lastReturnedRow = PagingState.RowMark.create(command.metadata(), last, protocolVersion);
+        }
+    }
+
+    protected boolean isPreviouslyReturnedPartition(DecoratedKey key)
+    {
+        // Note that lastReturnedKey can be null, but key cannot.
+        return key.equals(lastReturnedKey);
+    }
+
+    private AbstractBounds<PartitionPosition> makeKeyBounds(PartitionPosition lastReturnedKey, boolean includeLastKey)
+    {
+        AbstractBounds<PartitionPosition> bounds = ((PartitionRangeReadCommand)command).dataRange().keyRange();
+        if (bounds instanceof Range || bounds instanceof Bounds)
+        {
+            return includeLastKey
+                 ? new Bounds<PartitionPosition>(lastReturnedKey, bounds.right)
+                 : new Range<PartitionPosition>(lastReturnedKey, bounds.right);
+        }
+        else
+        {
+            return includeLastKey
+                 ? new IncludingExcludingBounds<PartitionPosition>(lastReturnedKey, bounds.right)
+                 : new ExcludingBounds<PartitionPosition>(lastReturnedKey, bounds.right);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/service/pager/QueryPager.java b/src/java/org/apache/cassandra/service/pager/QueryPager.java
index ab2dad7..cdf2b97 100644
--- a/src/java/org/apache/cassandra/service/pager/QueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/QueryPager.java

@@ -17,11 +17,13 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.util.List;
-
-import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.EmptyIterators;
+import org.apache.cassandra.db.ReadOrderGroup;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.service.ClientState;
 
 /**
  * Perform a query, paging it by page of a given size.
@@ -44,13 +46,69 @@
  */
 public interface QueryPager
 {
+    public static final QueryPager EMPTY = new QueryPager()
+    {
+        public ReadOrderGroup startOrderGroup()
+        {
+            return ReadOrderGroup.emptyGroup();
+        }
+
+        public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState) throws RequestValidationException, RequestExecutionException
+        {
+            return EmptyIterators.partition();
+        }
+
+        public PartitionIterator fetchPageInternal(int pageSize, ReadOrderGroup orderGroup) throws RequestValidationException, RequestExecutionException
+        {
+            return EmptyIterators.partition();
+        }
+
+        public boolean isExhausted()
+        {
+            return true;
+        }
+
+        public int maxRemaining()
+        {
+            return 0;
+        }
+
+        public PagingState state()
+        {
+            return null;
+        }
+    };
+
     /**
      * Fetches the next page.
      *
      * @param pageSize the maximum number of elements to return in the next page.
+     * @param consistency the consistency level to achieve for the query.
+     * @param clientState the {@code ClientState} for the query. In practice, this can be null unless
+     * {@code consistency} is a serial consistency.
      * @return the page of result.
      */
-    public List<Row> fetchPage(int pageSize) throws RequestValidationException, RequestExecutionException;
+    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState) throws RequestValidationException, RequestExecutionException;
+
+    /**
+     * Starts a new read operation.
+     * <p>
+     * This must be called before {@link fetchPageInternal} and passed to it to protect the read.
+     * The returned object <b>must</b> be closed on all path and it is thus strongly advised to
+     * use it in a try-with-ressource construction.
+     *
+     * @return a newly started order group for this {@code QueryPager}.
+     */
+    public ReadOrderGroup startOrderGroup();
+
+    /**
+     * Fetches the next page internally (in other, this does a local query).
+     *
+     * @param pageSize the maximum number of elements to return in the next page.
+     * @param orderGroup the {@code ReadOrderGroup} protecting the read.
+     * @return the page of result.
+     */
+    public PartitionIterator fetchPageInternal(int pageSize, ReadOrderGroup orderGroup) throws RequestValidationException, RequestExecutionException;
 
     /**
      * Whether or not this pager is exhausted, i.e. whether or not a call to

diff --git a/src/java/org/apache/cassandra/service/pager/QueryPagers.java b/src/java/org/apache/cassandra/service/pager/QueryPagers.java
index f933ccb..6bc1f80 100644
--- a/src/java/org/apache/cassandra/service/pager/QueryPagers.java
+++ b/src/java/org/apache/cassandra/service/pager/QueryPagers.java

@@ -17,180 +17,49 @@
  */
 package org.apache.cassandra.service.pager;
 
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.ColumnCounter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.transport.Server;
 
 /**
- * Static utility methods to create query pagers.
+ * Static utility methods for paging.
  */
 public class QueryPagers
 {
     private QueryPagers() {};
 
-    private static int maxQueried(ReadCommand command)
-    {
-        if (command instanceof SliceByNamesReadCommand)
-        {
-            NamesQueryFilter filter = ((SliceByNamesReadCommand)command).filter;
-            return filter.countCQL3Rows() ? 1 : filter.columns.size();
-        }
-        else
-        {
-            SliceQueryFilter filter = ((SliceFromReadCommand)command).filter;
-            return filter.count;
-        }
-    }
-
-    public static boolean mayNeedPaging(Pageable command, int pageSize)
-    {
-        if (command instanceof Pageable.ReadCommands)
-        {
-            List<ReadCommand> commands = ((Pageable.ReadCommands)command).commands;
-
-            // Using long on purpose, as we could overflow otherwise
-            long maxQueried = 0;
-            for (ReadCommand readCmd : commands)
-                maxQueried += maxQueried(readCmd);
-
-            return maxQueried > pageSize;
-        }
-        else if (command instanceof ReadCommand)
-        {
-            return maxQueried((ReadCommand)command) > pageSize;
-        }
-        else
-        {
-            assert command instanceof RangeSliceCommand;
-            RangeSliceCommand rsc = (RangeSliceCommand)command;
-            // We don't support paging for thrift in general because the way thrift RangeSliceCommand count rows
-            // independently of cells makes things harder (see RangeSliceQueryPager). The one case where we do
-            // get a RangeSliceCommand from CQL3 without the countCQL3Rows flag set is for DISTINCT. In that case
-            // however, the underlying sliceQueryFilter count is 1, so that the RSC limit is still a limit on the
-            // number of CQL3 rows returned.
-            assert rsc.countCQL3Rows || (rsc.predicate instanceof SliceQueryFilter && ((SliceQueryFilter)rsc.predicate).count == 1);
-            return rsc.maxResults > pageSize;
-        }
-    }
-
-    private static QueryPager pager(ReadCommand command, ConsistencyLevel consistencyLevel, ClientState cState, boolean local, PagingState state)
-    {
-        if (command instanceof SliceByNamesReadCommand)
-            return new NamesQueryPager((SliceByNamesReadCommand)command, consistencyLevel, cState, local);
-        else
-            return new SliceQueryPager((SliceFromReadCommand)command, consistencyLevel, cState, local, state);
-    }
-
-    private static QueryPager pager(Pageable command, ConsistencyLevel consistencyLevel, ClientState cState, boolean local, PagingState state)
-    {
-        if (command instanceof Pageable.ReadCommands)
-        {
-            List<ReadCommand> commands = ((Pageable.ReadCommands)command).commands;
-            if (commands.size() == 1)
-                return pager(commands.get(0), consistencyLevel, cState, local, state);
-
-            return new MultiPartitionPager(commands, consistencyLevel, cState, local, state, ((Pageable.ReadCommands) command).limitForQuery);
-        }
-        else if (command instanceof ReadCommand)
-        {
-            return pager((ReadCommand)command, consistencyLevel, cState, local, state);
-        }
-        else
-        {
-            assert command instanceof RangeSliceCommand;
-            RangeSliceCommand rangeCommand = (RangeSliceCommand)command;
-            if (rangeCommand.predicate instanceof NamesQueryFilter)
-                return new RangeNamesQueryPager(rangeCommand, consistencyLevel, local, state);
-            else
-                return new RangeSliceQueryPager(rangeCommand, consistencyLevel, local, state);
-        }
-    }
-
-    public static QueryPager pager(Pageable command, ConsistencyLevel consistencyLevel, ClientState cState)
-    {
-        return pager(command, consistencyLevel, cState, false, null);
-    }
-
-    public static QueryPager pager(Pageable command, ConsistencyLevel consistencyLevel, ClientState cState, PagingState state)
-    {
-        return pager(command, consistencyLevel, cState, false, state);
-    }
-
-    public static QueryPager localPager(Pageable command)
-    {
-        return pager(command, null, null, true, null);
-    }
-
-    /**
-     * Convenience method to (locally) page an internal row.
-     * Used to 2ndary index a wide row without dying.
-     */
-    public static Iterator<ColumnFamily> pageRowLocally(final ColumnFamilyStore cfs, ByteBuffer key, final int pageSize)
-    {
-        SliceFromReadCommand command = new SliceFromReadCommand(cfs.metadata.ksName, key, cfs.name, System.currentTimeMillis(), new IdentityQueryFilter());
-        final SliceQueryPager pager = new SliceQueryPager(command, null, null, true);
-
-        return new Iterator<ColumnFamily>()
-        {
-            // We don't use AbstractIterator because we don't want hasNext() to do an actual query
-            public boolean hasNext()
-            {
-                return !pager.isExhausted();
-            }
-
-            public ColumnFamily next()
-            {
-                try
-                {
-                    List<Row> rows = pager.fetchPage(pageSize);
-                    ColumnFamily cf = rows.isEmpty() ? null : rows.get(0).cf;
-                    return cf == null ? ArrayBackedSortedColumns.factory.create(cfs.metadata) : cf;
-                }
-                catch (Exception e)
-                {
-                    throw new RuntimeException(e);
-                }
-            }
-
-            public void remove()
-            {
-                throw new UnsupportedOperationException();
-            }
-        };
-    }
-
     /**
      * Convenience method that count (live) cells/rows for a given slice of a row, but page underneath.
      */
-    public static int countPaged(String keyspace,
-                                 String columnFamily,
-                                 ByteBuffer key,
-                                 SliceQueryFilter filter,
+    public static int countPaged(CFMetaData metadata,
+                                 DecoratedKey key,
+                                 ColumnFilter columnFilter,
+                                 ClusteringIndexFilter filter,
+                                 DataLimits limits,
                                  ConsistencyLevel consistencyLevel,
-                                 ClientState cState,
+                                 ClientState state,
                                  final int pageSize,
-                                 long now) throws RequestValidationException, RequestExecutionException
+                                 int nowInSec,
+                                 boolean isForThrift) throws RequestValidationException, RequestExecutionException
     {
-        SliceFromReadCommand command = new SliceFromReadCommand(keyspace, key, columnFamily, now, filter);
-        final SliceQueryPager pager = new SliceQueryPager(command, consistencyLevel, cState, false);
+        SinglePartitionReadCommand command = SinglePartitionReadCommand.create(isForThrift, metadata, nowInSec, columnFilter, RowFilter.NONE, limits, key, filter);
+        final SinglePartitionPager pager = new SinglePartitionPager(command, null, Server.CURRENT_VERSION);
 
-        ColumnCounter counter = filter.columnCounter(Schema.instance.getCFMetaData(keyspace, columnFamily).comparator, now);
+        int count = 0;
         while (!pager.isExhausted())
         {
-            List<Row> next = pager.fetchPage(pageSize);
-            if (!next.isEmpty())
-                counter.countAll(next.get(0).cf);
+            try (PartitionIterator iter = pager.fetchPage(pageSize, consistencyLevel, state))
+            {
+                DataLimits.Counter counter = limits.newCounter(nowInSec, true, command.selectsFullPartition(), metadata.enforceStrictLiveness());
+                PartitionIterators.consume(counter.applyTo(iter));
+                count += counter.counted();
+            }
         }
-        return counter.live();
+        return count;
     }
 }

diff --git a/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java b/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java
deleted file mode 100644
index 6b36a25..0000000
--- a/src/java/org/apache/cassandra/service/pager/RangeNamesQueryPager.java
+++ /dev/null

@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service.pager;
-
-import java.util.List;
-
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.service.StorageProxy;
-import org.apache.cassandra.service.StorageService;
-
-/**
- * Pages a RangeSliceCommand whose predicate is a name query.
- *
- * Note: this only work for NamesQueryFilter that have countCQL3Rows() set,
- * because this assumes the pageSize is counted in number of internal rows
- * returned. More precisely, this doesn't do in-row paging so this assumes
- * that the counter returned by columnCounter() will count 1 for each internal
- * row.
- */
-public class RangeNamesQueryPager extends AbstractQueryPager
-{
-    private final RangeSliceCommand command;
-    private volatile DecoratedKey lastReturnedKey;
-
-    // Don't use directly, use QueryPagers method instead
-    RangeNamesQueryPager(RangeSliceCommand command, ConsistencyLevel consistencyLevel, boolean localQuery)
-    {
-        super(consistencyLevel, command.maxResults, localQuery, command.keyspace, command.columnFamily, command.predicate, command.timestamp);
-        this.command = command;
-        assert columnFilter instanceof NamesQueryFilter && ((NamesQueryFilter)columnFilter).countCQL3Rows();
-    }
-
-    RangeNamesQueryPager(RangeSliceCommand command, ConsistencyLevel consistencyLevel, boolean localQuery, PagingState state)
-    {
-        this(command, consistencyLevel, localQuery);
-
-        if (state != null)
-        {
-            lastReturnedKey = StorageService.getPartitioner().decorateKey(state.partitionKey);
-            restoreState(state.remaining, true);
-        }
-    }
-
-    public PagingState state()
-    {
-        return lastReturnedKey == null
-             ? null
-             : new PagingState(lastReturnedKey.getKey(), null, maxRemaining());
-    }
-
-    protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistencyLevel, boolean localQuery)
-    throws RequestExecutionException
-    {
-        AbstractRangeCommand pageCmd = command.withUpdatedLimit(pageSize);
-        if (lastReturnedKey != null)
-            pageCmd = pageCmd.forSubRange(makeExcludingKeyBounds(lastReturnedKey));
-
-        return localQuery
-             ? pageCmd.executeLocally()
-             : StorageProxy.getRangeSlice(pageCmd, consistencyLevel);
-    }
-
-    protected boolean containsPreviousLast(Row first)
-    {
-        // When querying the next page, we create a bound that exclude the lastReturnedKey
-        // but unfortunately ExcludingBounds is serialized as Bounds, which includes both endpoints,
-        // so we may still get a live row with the same key as lastReturnedKey, see CASSANDRA-10509
-        return lastReturnedKey != null && lastReturnedKey.equals(first.key);
-    }
-
-    protected boolean recordLast(Row last)
-    {
-        lastReturnedKey = last.key;
-        // We return false as that means "can that last be in the next query?"
-        return false;
-    }
-
-    protected boolean isReversed()
-    {
-        return false;
-    }
-
-    private AbstractBounds<RowPosition> makeExcludingKeyBounds(RowPosition lastReturnedKey)
-    {
-        // We return a range that always exclude lastReturnedKey, since we've already
-        // returned it.
-        AbstractBounds<RowPosition> bounds = command.keyRange;
-        if (bounds instanceof Range || bounds instanceof Bounds)
-        {
-            return new Range<>(lastReturnedKey, bounds.right);
-        }
-        else
-        {
-            return new ExcludingBounds<>(lastReturnedKey, bounds.right);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java b/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java
deleted file mode 100644
index 2ccd7e7..0000000
--- a/src/java/org/apache/cassandra/service/pager/RangeSliceQueryPager.java
+++ /dev/null

@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service.pager;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.service.StorageProxy;
-import org.apache.cassandra.service.StorageService;
-
-/**
- * Pages a RangeSliceCommand whose predicate is a slice query.
- *
- * Note: this only work for CQL3 queries for now (because thrift queries expect
- * a different limit on the rows than on the columns, which complicates it).
- */
-public class RangeSliceQueryPager extends AbstractQueryPager
-{
-    private final RangeSliceCommand command;
-    private volatile DecoratedKey lastReturnedKey;
-    private volatile CellName lastReturnedName;
-
-    // Don't use directly, use QueryPagers method instead
-    RangeSliceQueryPager(RangeSliceCommand command, ConsistencyLevel consistencyLevel, boolean localQuery)
-    {
-        super(consistencyLevel, command.maxResults, localQuery, command.keyspace, command.columnFamily, command.predicate, command.timestamp);
-        this.command = command;
-        assert columnFilter instanceof SliceQueryFilter;
-    }
-
-    RangeSliceQueryPager(RangeSliceCommand command, ConsistencyLevel consistencyLevel, boolean localQuery, PagingState state)
-    {
-        this(command, consistencyLevel, localQuery);
-
-        if (state != null)
-        {
-            lastReturnedKey = StorageService.getPartitioner().decorateKey(state.partitionKey);
-            lastReturnedName = cfm.comparator.cellFromByteBuffer(state.cellName);
-            restoreState(state.remaining, true);
-        }
-    }
-
-    public PagingState state()
-    {
-        return lastReturnedKey == null
-             ? null
-             : new PagingState(lastReturnedKey.getKey(), lastReturnedName.toByteBuffer(), maxRemaining());
-    }
-
-    protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistencyLevel, boolean localQuery)
-    throws RequestExecutionException
-    {
-        SliceQueryFilter rawFilter = (SliceQueryFilter)columnFilter;
-        SliceQueryFilter sf = rawFilter.withUpdatedCount(Math.min(rawFilter.count, pageSize));
-        AbstractBounds<RowPosition> keyRange = lastReturnedKey == null ? command.keyRange : makeIncludingKeyBounds(lastReturnedKey);
-        // For DISTINCT queries we can and must ignore the lastReturnedName (see CASSANDRA-13017)
-        Composite start = lastReturnedName == null || isDistinct() ? sf.start() : lastReturnedName;
-        PagedRangeCommand pageCmd = new PagedRangeCommand(command.keyspace,
-                                                          command.columnFamily,
-                                                          command.timestamp,
-                                                          keyRange,
-                                                          sf,
-                                                          start,
-                                                          sf.finish(),
-                                                          command.rowFilter,
-                                                          pageSize,
-                                                          command.countCQL3Rows);
-
-        return localQuery
-             ? pageCmd.executeLocally()
-             : StorageProxy.getRangeSlice(pageCmd, consistencyLevel);
-    }
-
-    protected boolean containsPreviousLast(Row first)
-    {
-        if (lastReturnedKey == null || !lastReturnedKey.equals(first.key))
-            return false;
-
-        // If the query is a DISTINCT one we can stop there
-        if (isDistinct())
-            return true;
-
-        // Same as SliceQueryPager, we ignore a deleted column
-        Cell firstCell = isReversed() ? lastCell(first.cf) : firstNonStaticCell(first.cf);
-        // If the row was containing only static columns it has already been returned and we can skip it.
-        if (firstCell == null)
-            return true;
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(command.keyspace, command.columnFamily);
-        return !first.cf.deletionInfo().isDeleted(firstCell)
-                && firstCell.isLive(timestamp())
-                && firstCell.name().isSameCQL3RowAs(metadata.comparator, lastReturnedName);
-    }
-
-    protected List<Row> discardFirst(List<Row> rows)
-    {
-        if (rows.isEmpty())
-            return rows;
-
-        // Special case for distinct queries because the superclass' discardFirst keeps dropping cells
-        // until it has removed the first *live* row. In a distinct query we only fetch the first row
-        // from a given partition, which may be entirely non-live. In the case where such a non-live
-        // row is the last in page N & the first in page N+1, we would also end up discarding an
-        // additional live row from page N+1.
-        // The simplest solution is to just remove whichever row is first in the page, without bothering
-        // to do liveness checks etc.
-        if (isDistinct())
-        {
-            List<Row> newRows = new ArrayList<>(Math.max(1, rows.size() - 1));
-            newRows.addAll(rows.subList(1, rows.size()));
-            return newRows;
-        }
-
-        return super.discardFirst(rows);
-    }
-
-    private boolean isDistinct()
-    {
-        // As this pager is never used for Thrift queries, checking the countCQL3Rows is enough.
-        return !command.countCQL3Rows;
-    }
-
-    protected boolean recordLast(Row last)
-    {
-        lastReturnedKey = last.key;
-        lastReturnedName = (isReversed() ? firstNonStaticCell(last.cf) : lastCell(last.cf)).name();
-        return true;
-    }
-
-    protected boolean isReversed()
-    {
-        return ((SliceQueryFilter)command.predicate).reversed;
-    }
-
-    private AbstractBounds<RowPosition> makeIncludingKeyBounds(RowPosition lastReturnedKey)
-    {
-        // We always include lastReturnedKey since we may still be paging within a row,
-        // and PagedRangeCommand will move over if we're not anyway
-        AbstractBounds<RowPosition> bounds = command.keyRange;
-        if (bounds instanceof Range || bounds instanceof Bounds)
-        {
-            return new Bounds<RowPosition>(lastReturnedKey, bounds.right);
-        }
-        else
-        {
-            return new IncludingExcludingBounds<RowPosition>(lastReturnedKey, bounds.right);
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java b/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java
index 51bbf90..6f17284 100644
--- a/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java
+++ b/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java

@@ -19,15 +19,68 @@
 
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.db.filter.ColumnCounter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.filter.*;
 
 /**
  * Common interface to single partition queries (by slice and by name).
  *
  * For use by MultiPartitionPager.
  */
-public interface SinglePartitionPager extends QueryPager
+public class SinglePartitionPager extends AbstractQueryPager
 {
-    public ByteBuffer key();
-    public ColumnCounter columnCounter();
+    private static final Logger logger = LoggerFactory.getLogger(SinglePartitionPager.class);
+
+    private final SinglePartitionReadCommand command;
+
+    private volatile PagingState.RowMark lastReturned;
+
+    public SinglePartitionPager(SinglePartitionReadCommand command, PagingState state, int protocolVersion)
+    {
+        super(command, protocolVersion);
+        this.command = command;
+
+        if (state != null)
+        {
+            lastReturned = state.rowMark;
+            restoreState(command.partitionKey(), state.remaining, state.remainingInPartition);
+        }
+    }
+
+    public ByteBuffer key()
+    {
+        return command.partitionKey().getKey();
+    }
+
+    public DataLimits limits()
+    {
+        return command.limits();
+    }
+
+    public PagingState state()
+    {
+        return lastReturned == null
+             ? null
+             : new PagingState(null, lastReturned, maxRemaining(), remainingInPartition());
+    }
+
+    protected ReadCommand nextPageReadCommand(int pageSize)
+    {
+        return command.forPaging(lastReturned == null ? null : lastReturned.clustering(command.metadata()), pageSize);
+    }
+
+    protected void recordLast(DecoratedKey key, Row last)
+    {
+        if (last != null && last.clustering() != Clustering.STATIC_CLUSTERING)
+            lastReturned = PagingState.RowMark.create(command.metadata(), last, protocolVersion);
+    }
+
+    protected boolean isPreviouslyReturnedPartition(DecoratedKey key)
+    {
+        return lastReturned != null;
+    }
 }

diff --git a/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java b/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java
deleted file mode 100644
index 3420831..0000000
--- a/src/java/org/apache/cassandra/service/pager/SliceQueryPager.java
+++ /dev/null

@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service.pager;
-
-import java.nio.ByteBuffer;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.StorageProxy;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Pager over a SliceFromReadCommand.
- */
-public class SliceQueryPager extends AbstractQueryPager implements SinglePartitionPager
-{
-    private static final Logger logger = LoggerFactory.getLogger(SliceQueryPager.class);
-
-    private final SliceFromReadCommand command;
-    private final ClientState cstate;
-
-    private volatile CellName lastReturned;
-
-    // Don't use directly, use QueryPagers method instead
-    SliceQueryPager(SliceFromReadCommand command, ConsistencyLevel consistencyLevel, ClientState cstate, boolean localQuery)
-    {
-        super(consistencyLevel, command.filter.count, localQuery, command.ksName, command.cfName, command.filter, command.timestamp);
-        this.command = command;
-        this.cstate = cstate;
-    }
-
-    SliceQueryPager(SliceFromReadCommand command, ConsistencyLevel consistencyLevel, ClientState cstate, boolean localQuery, PagingState state)
-    {
-        this(command, consistencyLevel, cstate, localQuery);
-
-        if (state != null)
-        {
-            // The cellname can be empty if this is used in a MultiPartitionPager and we're supposed to start reading this row
-            // (because the previous page has exhausted the previous pager). See #10352 for details.
-            if (state.cellName.hasRemaining())
-                lastReturned = (CellName) cfm.comparator.fromByteBuffer(state.cellName);
-            restoreState(state.remaining, true);
-        }
-    }
-
-    public ByteBuffer key()
-    {
-        return command.key;
-    }
-
-    public PagingState state()
-    {
-        return lastReturned == null
-             ? null
-             : new PagingState(null, lastReturned.toByteBuffer(), maxRemaining());
-    }
-
-    protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistencyLevel, boolean localQuery)
-    throws RequestValidationException, RequestExecutionException
-    {
-        // For some queries, such as a DISTINCT query on static columns, the limit for slice queries will be lower
-        // than the page size (in the static example, it will be 1).  We use the min here to ensure we don't fetch
-        // more rows than we're supposed to.  See CASSANDRA-8108 for more details.
-        SliceQueryFilter filter = command.filter.withUpdatedCount(Math.min(command.filter.count, pageSize));
-        if (lastReturned != null)
-            filter = filter.withUpdatedStart(lastReturned, cfm);
-
-        logger.trace("Querying next page of slice query; new filter: {}", filter);
-        ReadCommand pageCmd = command.withUpdatedFilter(filter);
-        return localQuery
-             ? Collections.singletonList(pageCmd.getRow(Keyspace.open(command.ksName)))
-             : StorageProxy.read(Collections.singletonList(pageCmd), consistencyLevel, cstate);
-    }
-
-    protected boolean containsPreviousLast(Row first)
-    {
-        if (lastReturned == null)
-            return false;
-
-        Cell firstCell = isReversed() ? lastCell(first.cf) : firstNonStaticCell(first.cf);
-        // If the row was containing only static columns it has already been returned and we can skip it.
-        if (firstCell == null)
-            return true;
-
-        CFMetaData metadata = Schema.instance.getCFMetaData(command.getKeyspace(), command.getColumnFamilyName());
-        // Note: we only return true if the column is the lastReturned *and* it is live. If it is deleted, it is ignored by the
-        // rest of the paging code (it hasn't been counted as live in particular) and we want to act as if it wasn't there.
-
-        return !first.cf.deletionInfo().isDeleted(firstCell)
-                && firstCell.isLive(timestamp())
-                && firstCell.name().isSameCQL3RowAs(metadata.comparator, lastReturned);
-    }
-
-    protected boolean recordLast(Row last)
-    {
-        Cell lastCell = isReversed() ? firstNonStaticCell(last.cf) : lastCell(last.cf);
-        lastReturned = lastCell.name();
-        return true;
-    }
-
-    protected boolean isReversed()
-    {
-        return command.filter.reversed;
-    }
-}

diff --git a/src/java/org/apache/cassandra/service/paxos/Commit.java b/src/java/org/apache/cassandra/service/paxos/Commit.java
index 45d04f9..95bd464 100644
--- a/src/java/org/apache/cassandra/service/paxos/Commit.java
+++ b/src/java/org/apache/cassandra/service/paxos/Commit.java

@@ -1,6 +1,6 @@
 package org.apache.cassandra.service.paxos;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,30 +8,34 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
 
 
-import java.io.DataInput;
 import java.io.IOException;
-import java.util.UUID;
 import java.nio.ByteBuffer;
+import java.util.UUID;
 
 import com.google.common.base.Objects;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -40,34 +44,32 @@
 {
     public static final CommitSerializer serializer = new CommitSerializer();
 
-    public final ByteBuffer key;
     public final UUID ballot;
-    public final ColumnFamily update;
+    public final PartitionUpdate update;
 
-    public Commit(ByteBuffer key, UUID ballot, ColumnFamily update)
+    public Commit(UUID ballot, PartitionUpdate update)
     {
-        assert key != null;
         assert ballot != null;
         assert update != null;
 
-        this.key = key;
         this.ballot = ballot;
         this.update = update;
     }
 
-    public static Commit newPrepare(ByteBuffer key, CFMetaData metadata, UUID ballot)
+    public static Commit newPrepare(DecoratedKey key, CFMetaData metadata, UUID ballot)
     {
-        return new Commit(key, ballot, ArrayBackedSortedColumns.factory.create(metadata));
+        return new Commit(ballot, PartitionUpdate.emptyUpdate(metadata, key));
     }
 
-    public static Commit newProposal(ByteBuffer key, UUID ballot, ColumnFamily update)
+    public static Commit newProposal(UUID ballot, PartitionUpdate update)
     {
-        return new Commit(key, ballot, updatesWithPaxosTime(update, ballot));
+        update.updateAllTimestamp(UUIDGen.microsTimestamp(ballot));
+        return new Commit(ballot, update);
     }
 
-    public static Commit emptyCommit(ByteBuffer key, CFMetaData metadata)
+    public static Commit emptyCommit(DecoratedKey key, CFMetaData metadata)
     {
-        return new Commit(key, UUIDGen.minTimeUUID(0), ArrayBackedSortedColumns.factory.create(metadata));
+        return new Commit(UUIDGen.minTimeUUID(0), PartitionUpdate.emptyUpdate(metadata, key));
     }
 
     public boolean isAfter(Commit other)
@@ -82,8 +84,7 @@
 
     public Mutation makeMutation()
     {
-        assert update != null;
-        return new Mutation(key, update);
+        return new Mutation(update);
     }
 
     @Override
@@ -94,62 +95,52 @@
 
         Commit commit = (Commit) o;
 
-        if (!ballot.equals(commit.ballot)) return false;
-        if (!key.equals(commit.key)) return false;
-        if (!update.equals(commit.update)) return false;
-
-        return true;
+        return ballot.equals(commit.ballot) && update.equals(commit.update);
     }
 
     @Override
     public int hashCode()
     {
-        return Objects.hashCode(key, ballot, update);
-    }
-
-    private static ColumnFamily updatesWithPaxosTime(ColumnFamily updates, UUID ballot)
-    {
-        ColumnFamily cf = updates.cloneMeShallow();
-        long t = UUIDGen.microsTimestamp(ballot);
-        // For the tombstones, we use t-1 so that when insert a collection literall, the range tombstone that deletes the previous values of
-        // the collection and we want that to have a lower timestamp and our new values. Since tombstones wins over normal insert, using t-1
-        // should not be a problem in general (see #6069).
-        cf.deletionInfo().updateAllTimestamp(t-1);
-        for (Cell cell : updates)
-            cf.addAtom(cell.withUpdatedTimestamp(t));
-        return cf;
+        return Objects.hashCode(ballot, update);
     }
 
     @Override
     public String toString()
     {
-        return String.format("Commit(%s, %s, %s)", ByteBufferUtil.bytesToHex(key), ballot, update);
+        return String.format("Commit(%s, %s)", ballot, update);
     }
 
     public static class CommitSerializer implements IVersionedSerializer<Commit>
     {
         public void serialize(Commit commit, DataOutputPlus out, int version) throws IOException
         {
-            ByteBufferUtil.writeWithShortLength(commit.key, out);
+            if (version < MessagingService.VERSION_30)
+                ByteBufferUtil.writeWithShortLength(commit.update.partitionKey().getKey(), out);
+
             UUIDSerializer.serializer.serialize(commit.ballot, out, version);
-            ColumnFamily.serializer.serialize(commit.update, out, version);
+            PartitionUpdate.serializer.serialize(commit.update, out, version);
         }
 
-        public Commit deserialize(DataInput in, int version) throws IOException
+        public Commit deserialize(DataInputPlus in, int version) throws IOException
         {
-            return new Commit(ByteBufferUtil.readWithShortLength(in),
-                              UUIDSerializer.serializer.deserialize(in, version),
-                              ColumnFamily.serializer.deserialize(in,
-                                                                  ArrayBackedSortedColumns.factory,
-                                                                  ColumnSerializer.Flag.LOCAL,
-                                                                  version));
+            ByteBuffer key = null;
+            if (version < MessagingService.VERSION_30)
+                key = ByteBufferUtil.readWithShortLength(in);
+
+            UUID ballot = UUIDSerializer.serializer.deserialize(in, version);
+            PartitionUpdate update = PartitionUpdate.serializer.deserialize(in, version, SerializationHelper.Flag.LOCAL, key);
+            return new Commit(ballot, update);
         }
 
         public long serializedSize(Commit commit, int version)
         {
-            return 2 + commit.key.remaining()
-                   + UUIDSerializer.serializer.serializedSize(commit.ballot, version)
-                   + ColumnFamily.serializer.serializedSize(commit.update, version);
+            int size = 0;
+            if (version < MessagingService.VERSION_30)
+                size += ByteBufferUtil.serializedSizeWithShortLength(commit.update.partitionKey().getKey());
+
+            return size
+                 + UUIDSerializer.serializer.serializedSize(commit.ballot, version)
+                 + PartitionUpdate.serializer.serializedSize(commit.update, version);
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/service/paxos/CommitVerbHandler.java b/src/java/org/apache/cassandra/service/paxos/CommitVerbHandler.java
index 213023e..a702a4d 100644
--- a/src/java/org/apache/cassandra/service/paxos/CommitVerbHandler.java
+++ b/src/java/org/apache/cassandra/service/paxos/CommitVerbHandler.java

@@ -1,4 +1,3 @@
-package org.apache.cassandra.service.paxos;
 /*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -19,7 +18,7 @@
  * under the License.
  * 
  */
-
+package org.apache.cassandra.service.paxos;
 
 import org.apache.cassandra.db.WriteResponse;
 import org.apache.cassandra.net.IVerbHandler;
@@ -33,8 +32,7 @@
     {
         PaxosState.commit(message.payload);
 
-        WriteResponse response = new WriteResponse();
         Tracing.trace("Enqueuing acknowledge to {}", message.from);
-        MessagingService.instance().sendReply(response.createMessage(), id, message.from);
+        MessagingService.instance().sendReply(WriteResponse.createMessage(), id, message.from);
     }
 }

diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosState.java b/src/java/org/apache/cassandra/service/paxos/PaxosState.java
index fde881b..ee1ba6a 100644
--- a/src/java/org/apache/cassandra/service/paxos/PaxosState.java
+++ b/src/java/org/apache/cassandra/service/paxos/PaxosState.java

@@ -20,10 +20,12 @@
  */
 package org.apache.cassandra.service.paxos;
 
-import java.nio.ByteBuffer;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.locks.Lock;
 
+import com.google.common.base.Throwables;
 import com.google.common.util.concurrent.Striped;
+import com.google.common.util.concurrent.Uninterruptibles;
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -39,14 +41,14 @@
     private final Commit accepted;
     private final Commit mostRecentCommit;
 
-    public PaxosState(ByteBuffer key, CFMetaData metadata)
+    public PaxosState(DecoratedKey key, CFMetaData metadata)
     {
         this(Commit.emptyCommit(key, metadata), Commit.emptyCommit(key, metadata), Commit.emptyCommit(key, metadata));
     }
 
     public PaxosState(Commit promised, Commit accepted, Commit mostRecentCommit)
     {
-        assert promised.key == accepted.key && accepted.key == mostRecentCommit.key;
+        assert promised.update.partitionKey().equals(accepted.update.partitionKey()) && accepted.update.partitionKey().equals(mostRecentCommit.update.partitionKey());
         assert promised.update.metadata() == accepted.update.metadata() && accepted.update.metadata() == mostRecentCommit.update.metadata();
 
         this.promised = promised;
@@ -59,7 +61,7 @@
         long start = System.nanoTime();
         try
         {
-            Lock lock = LOCKS.get(toPrepare.key);
+            Lock lock = LOCKS.get(toPrepare.update.partitionKey());
             lock.lock();
             try
             {
@@ -68,8 +70,8 @@
                 // on some replica and not others during a new proposal (in StorageProxy.beginAndRepairPaxos()), and no
                 // amount of re-submit will fix this (because the node on which the commit has expired will have a
                 // tombstone that hides any re-submit). See CASSANDRA-12043 for details.
-                long now = UUIDGen.unixTimestamp(toPrepare.ballot);
-                PaxosState state = SystemKeyspace.loadPaxosState(toPrepare.key, toPrepare.update.metadata(), now);
+                int nowInSec = UUIDGen.unixTimestampInSec(toPrepare.ballot);
+                PaxosState state = SystemKeyspace.loadPaxosState(toPrepare.update.partitionKey(), toPrepare.update.metadata(), nowInSec);
                 if (toPrepare.isAfter(state.promised))
                 {
                     Tracing.trace("Promising ballot {}", toPrepare.ballot);
@@ -100,12 +102,12 @@
         long start = System.nanoTime();
         try
         {
-            Lock lock = LOCKS.get(proposal.key);
+            Lock lock = LOCKS.get(proposal.update.partitionKey());
             lock.lock();
             try
             {
-                long now = UUIDGen.unixTimestamp(proposal.ballot);
-                PaxosState state = SystemKeyspace.loadPaxosState(proposal.key, proposal.update.metadata(), now);
+                int nowInSec = UUIDGen.unixTimestampInSec(proposal.ballot);
+                PaxosState state = SystemKeyspace.loadPaxosState(proposal.update.partitionKey(), proposal.update.metadata(), nowInSec);
                 if (proposal.hasBallot(state.promised.ballot) || proposal.isAfter(state.promised))
                 {
                     Tracing.trace("Accepting proposal {}", proposal);

diff --git a/src/java/org/apache/cassandra/service/paxos/PrepareCallback.java b/src/java/org/apache/cassandra/service/paxos/PrepareCallback.java
index 081f457..ff81803 100644
--- a/src/java/org/apache/cassandra/service/paxos/PrepareCallback.java
+++ b/src/java/org/apache/cassandra/service/paxos/PrepareCallback.java

@@ -30,6 +30,7 @@
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
 import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DecoratedKey;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -49,7 +50,7 @@
 
     private final Map<InetAddress, Commit> commitsByReplica = new ConcurrentHashMap<InetAddress, Commit>();
 
-    public PrepareCallback(ByteBuffer key, CFMetaData metadata, int targets, ConsistencyLevel consistency)
+    public PrepareCallback(DecoratedKey key, CFMetaData metadata, int targets, ConsistencyLevel consistency)
     {
         super(targets, consistency);
         // need to inject the right key in the empty commit so comparing with empty commits in the reply works as expected
@@ -89,7 +90,7 @@
         latch.countDown();
     }
 
-    public Iterable<InetAddress> replicasMissingMostRecentCommit(CFMetaData metadata, long now)
+    public Iterable<InetAddress> replicasMissingMostRecentCommit(CFMetaData metadata, int nowInSec)
     {
         // In general, we need every replicas that have answered to the prepare (a quorum) to agree on the MRC (see
         // coment in StorageProxy.beginAndRepairPaxos(), but basically we need to make sure at least a quorum of nodes
@@ -100,8 +101,8 @@
         // explained on CASSANDRA-12043. To avoid that, we ignore a MRC that is too old, i.e. older than the TTL we set
         // on paxos tables. For such old commit, we rely on hints and repair to ensure the commit has indeed be
         // propagated to all nodes.
-        long paxosTtlMicros = SystemKeyspace.paxosTtl(metadata) * 1000 * 1000;
-        if (UUIDGen.microsTimestamp(mostRecentCommit.ballot) + paxosTtlMicros < now)
+        long paxosTtlSec = SystemKeyspace.paxosTtlSec(metadata);
+        if (UUIDGen.unixTimestampInSec(mostRecentCommit.ballot) + paxosTtlSec < nowInSec)
             return Collections.emptySet();
 
         return Iterables.filter(commitsByReplica.keySet(), new Predicate<InetAddress>()

diff --git a/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java b/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java
index e766e34..f843b8d 100644
--- a/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java
+++ b/src/java/org/apache/cassandra/service/paxos/PrepareResponse.java

@@ -1,6 +1,6 @@
 package org.apache.cassandra.service.paxos;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,29 +8,29 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
 
 
-import java.io.DataInput;
 import java.io.IOException;
-import java.nio.ByteBuffer;
+import java.util.UUID;
 
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnSerializer;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
 
 public class PrepareResponse
@@ -49,7 +49,7 @@
 
     public PrepareResponse(boolean promised, Commit inProgressCommit, Commit mostRecentCommit)
     {
-        assert inProgressCommit.key == mostRecentCommit.key;
+        assert inProgressCommit.update.partitionKey().equals(mostRecentCommit.update.partitionKey());
         assert inProgressCommit.update.metadata() == mostRecentCommit.update.metadata();
 
         this.promised = promised;
@@ -68,38 +68,52 @@
         public void serialize(PrepareResponse response, DataOutputPlus out, int version) throws IOException
         {
             out.writeBoolean(response.promised);
-            ByteBufferUtil.writeWithShortLength(response.inProgressCommit.key, out);
-            UUIDSerializer.serializer.serialize(response.inProgressCommit.ballot, out, version);
-            ColumnFamily.serializer.serialize(response.inProgressCommit.update, out, version);
-            UUIDSerializer.serializer.serialize(response.mostRecentCommit.ballot, out, version);
-            ColumnFamily.serializer.serialize(response.mostRecentCommit.update, out, version);
+            Commit.serializer.serialize(response.inProgressCommit, out, version);
+
+            if (version < MessagingService.VERSION_30)
+            {
+                UUIDSerializer.serializer.serialize(response.mostRecentCommit.ballot, out, version);
+                PartitionUpdate.serializer.serialize(response.mostRecentCommit.update, out, version);
+            }
+            else
+            {
+                Commit.serializer.serialize(response.mostRecentCommit, out, version);
+            }
         }
 
-        public PrepareResponse deserialize(DataInput in, int version) throws IOException
+        public PrepareResponse deserialize(DataInputPlus in, int version) throws IOException
         {
             boolean success = in.readBoolean();
-            ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
-            return new PrepareResponse(success,
-                                       new Commit(key,
-                                                  UUIDSerializer.serializer.deserialize(in, version),
-                                                  ColumnFamily.serializer.deserialize(in,
-                                                                                      ArrayBackedSortedColumns.factory,
-                                                                                      ColumnSerializer.Flag.LOCAL, version)),
-                                       new Commit(key,
-                                                  UUIDSerializer.serializer.deserialize(in, version),
-                                                  ColumnFamily.serializer.deserialize(in,
-                                                                                      ArrayBackedSortedColumns.factory,
-                                                                                      ColumnSerializer.Flag.LOCAL, version)));
+            Commit inProgress = Commit.serializer.deserialize(in, version);
+            Commit mostRecent;
+            if (version < MessagingService.VERSION_30)
+            {
+                UUID ballot = UUIDSerializer.serializer.deserialize(in, version);
+                PartitionUpdate update = PartitionUpdate.serializer.deserialize(in, version, SerializationHelper.Flag.LOCAL, inProgress.update.partitionKey());
+                mostRecent = new Commit(ballot, update);
+            }
+            else
+            {
+                mostRecent = Commit.serializer.deserialize(in, version);
+            }
+            return new PrepareResponse(success, inProgress, mostRecent);
         }
 
         public long serializedSize(PrepareResponse response, int version)
         {
-            return 1
-                   + 2 + response.inProgressCommit.key.remaining()
-                   + UUIDSerializer.serializer.serializedSize(response.inProgressCommit.ballot, version)
-                   + ColumnFamily.serializer.serializedSize(response.inProgressCommit.update, version)
-                   + UUIDSerializer.serializer.serializedSize(response.mostRecentCommit.ballot, version)
-                   + ColumnFamily.serializer.serializedSize(response.mostRecentCommit.update, version);
+            long size = TypeSizes.sizeof(response.promised)
+                      + Commit.serializer.serializedSize(response.inProgressCommit, version);
+
+            if (version < MessagingService.VERSION_30)
+            {
+                size += UUIDSerializer.serializer.serializedSize(response.mostRecentCommit.ballot, version);
+                size += PartitionUpdate.serializer.serializedSize(response.mostRecentCommit.update, version);
+            }
+            else
+            {
+                size += Commit.serializer.serializedSize(response.mostRecentCommit, version);
+            }
+            return size;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/streaming/ConnectionHandler.java b/src/java/org/apache/cassandra/streaming/ConnectionHandler.java
index fe551a8..aa1c615 100644
--- a/src/java/org/apache/cassandra/streaming/ConnectionHandler.java
+++ b/src/java/org/apache/cassandra/streaming/ConnectionHandler.java

@@ -37,6 +37,8 @@
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.io.util.WrappedDataOutputStreamPlus;
@@ -216,7 +218,7 @@
             if (initiator)
                 sendInitMessage();
 
-            new Thread(this, name() + "-" + session.peer).start();
+            new Thread(NamedThreadFactory.threadLocalDeallocator(this), name() + "-" + session.peer).start();
         }
 
         public ListenableFuture<?> close()

diff --git a/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java b/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java
index 5c27ff3..7e9dfd3 100644
--- a/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java
+++ b/src/java/org/apache/cassandra/streaming/DefaultConnectionFactory.java

@@ -47,15 +47,20 @@
         int attempts = 0;
         while (true)
         {
+            Socket socket = null;
             try
             {
-                Socket socket = OutboundTcpConnectionPool.newSocket(peer);
+                socket = OutboundTcpConnectionPool.newSocket(peer);
                 socket.setSoTimeout(DatabaseDescriptor.getStreamingSocketTimeout());
                 socket.setKeepAlive(true);
                 return socket;
             }
             catch (IOException e)
             {
+                if (socket != null)
+                {
+                    socket.close();
+                }
                 if (++attempts >= MAX_CONNECT_ATTEMPTS)
                     throw e;
 

diff --git a/src/java/org/apache/cassandra/streaming/StreamCoordinator.java b/src/java/org/apache/cassandra/streaming/StreamCoordinator.java
index e0948c9..2838317 100644
--- a/src/java/org/apache/cassandra/streaming/StreamCoordinator.java
+++ b/src/java/org/apache/cassandra/streaming/StreamCoordinator.java

@@ -232,7 +232,7 @@
             for (StreamSession session : streamSessions.values())
             {
                 StreamSession.State state = session.state();
-                if (state != StreamSession.State.COMPLETE && state != StreamSession.State.FAILED)
+                if (!state.isFinalState())
                     return true;
             }
             return false;
@@ -245,6 +245,7 @@
             {
                 StreamSession session = new StreamSession(peer, connecting, factory, streamSessions.size(), keepSSTableLevel, isIncremental);
                 streamSessions.put(++lastReturned, session);
+                sessionInfos.put(lastReturned, session.getSessionInfo());
                 return session;
             }
             // get
@@ -277,6 +278,7 @@
             {
                 session = new StreamSession(peer, connecting, factory, id, keepSSTableLevel, isIncremental);
                 streamSessions.put(id, session);
+                sessionInfos.put(id, session.getSessionInfo());
             }
             return session;
         }

diff --git a/src/java/org/apache/cassandra/streaming/StreamLockfile.java b/src/java/org/apache/cassandra/streaming/StreamLockfile.java
deleted file mode 100644
index 83ae5f0..0000000
--- a/src/java/org/apache/cassandra/streaming/StreamLockfile.java
+++ /dev/null

@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.streaming;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.StandardOpenOption;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.UUID;
-
-import com.google.common.base.Charsets;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.utils.JVMStabilityInspector;
-
-/**
- * Encapsulates the behavior for 'locking' any streamed sttables to a node.
- * If a process crashes while converting a set of SSTableWriters to SSTReaders
- * (meaning, some subset of SSTWs were converted, but not the entire set), we want
- * to disregard the entire set as we will surely have missing data (by definition).
- *
- * Basic behavior id to write out the names of all SSTWs to a file, one SSTW per line,
- * and then delete the file when complete (normal behavior). This should happen before
- * converting any SSTWs. Thus, the lockfile is created, some SSTWs are converted,
- * and if the process crashes, on restart, we look for any existing lockfile, and delete
- * any referenced SSTRs.
- */
-public class StreamLockfile
-{
-    public static final String FILE_EXT = ".lockfile";
-    private static final Logger logger = LoggerFactory.getLogger(StreamLockfile.class);
-
-    private final File lockfile;
-
-    public StreamLockfile(File directory, UUID uuid)
-    {
-        lockfile = new File(directory, uuid + FILE_EXT);
-    }
-
-    public StreamLockfile(File lockfile)
-    {
-        assert lockfile != null;
-        this.lockfile = lockfile;
-    }
-
-    public void create(Collection<SSTableWriter> sstables)
-    {
-        List<String> sstablePaths = new ArrayList<>(sstables.size());
-        for (SSTableWriter writer : sstables)
-        {
-            /* write out the file names *without* the 'tmp-file' flag in the file name.
-               this class will not need to clean up tmp files (on restart), CassandraDaemon does that already,
-               just make sure we delete the fully-formed SSTRs. */
-            sstablePaths.add(writer.descriptor.asType(Descriptor.Type.FINAL).baseFilename());
-        }
-
-        try
-        {
-            Files.write(lockfile.toPath(), sstablePaths, Charsets.UTF_8,
-                    StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE, StandardOpenOption.DSYNC);
-        }
-        catch (IOException e)
-        {
-            logger.warn(String.format("Could not create lockfile %s for stream session, nothing to worry too much about", lockfile), e);
-        }
-    }
-
-    public void delete()
-    {
-        FileUtils.delete(lockfile);
-    }
-
-    public void cleanup()
-    {
-        List<String> files = readLockfile(lockfile);
-        for (String file : files)
-        {
-            try
-            {
-                Descriptor desc = Descriptor.fromFilename(file, true);
-                SSTable.delete(desc, SSTable.componentsFor(desc));
-            }
-            catch (Exception e)
-            {
-                JVMStabilityInspector.inspectThrowable(e);
-                logger.warn("failed to delete a potentially stale sstable {}", file);
-            }
-        }
-    }
-
-    private List<String> readLockfile(File lockfile)
-    {
-        try
-        {
-            return Files.readAllLines(lockfile.toPath(), Charsets.UTF_8);
-        }
-        catch (IOException e)
-        {
-            logger.info("couldn't read lockfile {}, ignoring", lockfile.getAbsolutePath());
-            return Collections.emptyList();
-        }
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/streaming/StreamReader.java b/src/java/org/apache/cassandra/streaming/StreamReader.java
index c96ea22..07278cb 100644
--- a/src/java/org/apache/cassandra/streaming/StreamReader.java
+++ b/src/java/org/apache/cassandra/streaming/StreamReader.java

@@ -24,29 +24,32 @@
 import java.util.UUID;
 
 import com.google.common.base.Throwables;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.io.sstable.format.Version;
+import com.google.common.collect.UnmodifiableIterator;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.ning.compress.lzf.LZFInputStream;
 
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.SSTableSimpleIterator;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.util.RewindableDataInputStreamPlus;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.messages.FileMessageHeader;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.BytesReadTracker;
+import org.apache.cassandra.io.util.TrackedInputStream;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 
-import static org.apache.cassandra.utils.Throwables.extractIOExceptionCause;
-
 /**
  * StreamReader reads from stream and writes to SSTable.
  */
@@ -61,6 +64,7 @@
     protected final long repairedAt;
     protected final SSTableFormat.Type format;
     protected final int sstableLevel;
+    protected final SerializationHeader.Component header;
     protected final int fileSeqNum;
 
     protected Descriptor desc;
@@ -71,10 +75,11 @@
         this.cfId = header.cfId;
         this.estimatedKeys = header.estimatedKeys;
         this.sections = header.sections;
-        this.inputVersion = header.format.info.getVersion(header.version);
+        this.inputVersion = header.version;
         this.repairedAt = header.repairedAt;
         this.format = header.format;
         this.sstableLevel = header.sstableLevel;
+        this.header = header.header;
         this.fileSeqNum = header.sequenceNumber;
     }
 
@@ -83,8 +88,8 @@
      * @return SSTable transferred
      * @throws IOException if reading the remote sstable fails. Will throw an RTE if local write fails.
      */
-    @SuppressWarnings("resource")
-    public SSTableWriter read(ReadableByteChannel channel) throws IOException
+    @SuppressWarnings("resource") // channel needs to remain open, streams on top of it can't be closed
+    public SSTableMultiWriter read(ReadableByteChannel channel) throws IOException
     {
         long totalSize = totalSize();
 
@@ -103,53 +108,55 @@
                      session.planId(), fileSeqNum, session.peer, repairedAt, totalSize, cfs.keyspace.getName(),
                      cfs.getColumnFamilyName());
 
-        DataInputStream dis = new DataInputStream(new LZFInputStream(Channels.newInputStream(channel)));
-        BytesReadTracker in = new BytesReadTracker(dis);
-        SSTableWriter writer = null;
-        DecoratedKey key = null;
+        TrackedInputStream in = new TrackedInputStream(new LZFInputStream(Channels.newInputStream(channel)));
+        StreamDeserializer deserializer = new StreamDeserializer(cfs.metadata, in, inputVersion, getHeader(cfs.metadata),
+                                                                 totalSize, session.planId());
+        SSTableMultiWriter writer = null;
         try
         {
             writer = createWriter(cfs, totalSize, repairedAt, format);
             while (in.getBytesRead() < totalSize)
             {
-                key = StorageService.getPartitioner().decorateKey(ByteBufferUtil.readWithShortLength(in));
-                writeRow(key, writer, in, cfs);
-
+                writePartition(deserializer, writer);
                 // TODO move this to BytesReadTracker
                 session.progress(desc, ProgressInfo.Direction.IN, in.getBytesRead(), totalSize);
             }
             logger.debug("[Stream #{}] Finished receiving file #{} from {} readBytes = {}, totalSize = {}",
                          session.planId(), fileSeqNum, session.peer, in.getBytesRead(), totalSize);
             return writer;
-        } catch (Throwable e)
+        }
+        catch (Throwable e)
         {
-            if (key != null)
+            if (deserializer != null)
                 logger.warn("[Stream {}] Error while reading partition {} from stream on ks='{}' and table='{}'.",
-                            session.planId(), key, cfs.keyspace.getName(), cfs.getColumnFamilyName());
+                            session.planId(), deserializer.partitionKey(), cfs.keyspace.getName(), cfs.getColumnFamilyName());
             if (writer != null)
             {
-                try
-                {
-                    writer.abort();
-                }
-                catch (Throwable e2)
-                {
-                    // add abort error to original and continue so we can drain unread stream
-                    e.addSuppressed(e2);
-                }
+                writer.abort(e);
             }
             throw Throwables.propagate(e);
         }
+        finally
+        {
+            if (deserializer != null)
+                deserializer.cleanup();
+        }
     }
 
-    protected SSTableWriter createWriter(ColumnFamilyStore cfs, long totalSize, long repairedAt, SSTableFormat.Type format) throws IOException
+    protected SerializationHeader getHeader(CFMetaData metadata)
     {
-        Directories.DataDirectory localDir = cfs.directories.getWriteableLocation(totalSize);
+        return header != null? header.toHeader(metadata) : null; //pre-3.0 sstable have no SerializationHeader
+    }
+
+    protected SSTableMultiWriter createWriter(ColumnFamilyStore cfs, long totalSize, long repairedAt, SSTableFormat.Type format) throws IOException
+    {
+        Directories.DataDirectory localDir = cfs.getDirectories().getWriteableLocation(totalSize);
         if (localDir == null)
             throw new IOException("Insufficient disk space to store " + totalSize + " bytes");
-        desc = Descriptor.fromFilename(cfs.getTempSSTablePath(cfs.directories.getLocationForDisk(localDir), format));
+        desc = Descriptor.fromFilename(cfs.getSSTablePath(cfs.getDirectories().getLocationForDisk(localDir), format));
 
-        return SSTableWriter.create(desc, estimatedKeys, repairedAt, sstableLevel);
+        return cfs.createSSTableMultiWriter(desc, estimatedKeys, repairedAt, sstableLevel, getHeader(cfs.metadata),
+                session.getReceivingTask(cfId).createLifecycleNewTracker());
     }
 
     protected long totalSize()
@@ -160,8 +167,172 @@
         return size;
     }
 
-    protected void writeRow(DecoratedKey key, SSTableWriter writer, DataInput in, ColumnFamilyStore cfs) throws IOException
+    protected void writePartition(StreamDeserializer deserializer, SSTableMultiWriter writer) throws IOException
     {
-        writer.appendFromStream(key, cfs.metadata, in, inputVersion);
+        writer.append(deserializer.newPartition());
+        deserializer.checkForExceptions();
+    }
+
+    public static class StreamDeserializer extends UnmodifiableIterator<Unfiltered> implements UnfilteredRowIterator
+    {
+        public static final int INITIAL_MEM_BUFFER_SIZE = Integer.getInteger("cassandra.streamdes.initial_mem_buffer_size", 32768);
+        public static final int MAX_MEM_BUFFER_SIZE = Integer.getInteger("cassandra.streamdes.max_mem_buffer_size", 1048576);
+        public static final int MAX_SPILL_FILE_SIZE = Integer.getInteger("cassandra.streamdes.max_spill_file_size", Integer.MAX_VALUE);
+
+        public static final String BUFFER_FILE_PREFIX = "buf";
+        public static final String BUFFER_FILE_SUFFIX = "dat";
+
+        private final CFMetaData metadata;
+        private final DataInputPlus in;
+        private final SerializationHeader header;
+        private final SerializationHelper helper;
+
+        private DecoratedKey key;
+        private DeletionTime partitionLevelDeletion;
+        private SSTableSimpleIterator iterator;
+        private Row staticRow;
+        private IOException exception;
+
+        public StreamDeserializer(CFMetaData metadata, InputStream in, Version version, SerializationHeader header,
+                                  long totalSize, UUID sessionId) throws IOException
+        {
+            this.metadata = metadata;
+            // streaming pre-3.0 sstables require mark/reset support from source stream
+            if (version.correspondingMessagingVersion() < MessagingService.VERSION_30)
+            {
+                logger.trace("Initializing rewindable input stream for reading legacy sstable with {} bytes with following " +
+                             "parameters: initial_mem_buffer_size={}, max_mem_buffer_size={}, max_spill_file_size={}.",
+                             totalSize, INITIAL_MEM_BUFFER_SIZE, MAX_MEM_BUFFER_SIZE, MAX_SPILL_FILE_SIZE);
+                File bufferFile = getTempBufferFile(metadata, totalSize, sessionId);
+                this.in = new RewindableDataInputStreamPlus(in, INITIAL_MEM_BUFFER_SIZE, MAX_MEM_BUFFER_SIZE, bufferFile, MAX_SPILL_FILE_SIZE);
+            } else
+                this.in = new DataInputPlus.DataInputStreamPlus(in);
+            this.helper = new SerializationHelper(metadata, version.correspondingMessagingVersion(), SerializationHelper.Flag.PRESERVE_SIZE);
+            this.header = header;
+        }
+
+        public StreamDeserializer newPartition() throws IOException
+        {
+            key = metadata.decorateKey(ByteBufferUtil.readWithShortLength(in));
+            partitionLevelDeletion = DeletionTime.serializer.deserialize(in);
+            iterator = SSTableSimpleIterator.create(metadata, in, header, helper, partitionLevelDeletion);
+            staticRow = iterator.readStaticRow();
+            return this;
+        }
+
+        public CFMetaData metadata()
+        {
+            return metadata;
+        }
+
+        public PartitionColumns columns()
+        {
+            // We don't know which columns we'll get so assume it can be all of them
+            return metadata.partitionColumns();
+        }
+
+        public boolean isReverseOrder()
+        {
+            return false;
+        }
+
+        public DecoratedKey partitionKey()
+        {
+            return key;
+        }
+
+        public DeletionTime partitionLevelDeletion()
+        {
+            return partitionLevelDeletion;
+        }
+
+        public Row staticRow()
+        {
+            return staticRow;
+        }
+
+        public EncodingStats stats()
+        {
+            return header.stats();
+        }
+
+        public boolean hasNext()
+        {
+            try
+            {
+                return iterator.hasNext();
+            }
+            catch (IOError e)
+            {
+                if (e.getCause() != null && e.getCause() instanceof IOException)
+                {
+                    exception = (IOException)e.getCause();
+                    return false;
+                }
+                throw e;
+            }
+        }
+
+        public Unfiltered next()
+        {
+            // Note that in practice we know that IOException will be thrown by hasNext(), because that's
+            // where the actual reading happens, so we don't bother catching RuntimeException here (contrarily
+            // to what we do in hasNext)
+            Unfiltered unfiltered = iterator.next();
+            return metadata.isCounter() && unfiltered.kind() == Unfiltered.Kind.ROW
+                 ? maybeMarkLocalToBeCleared((Row) unfiltered)
+                 : unfiltered;
+        }
+
+        private Row maybeMarkLocalToBeCleared(Row row)
+        {
+            return metadata.isCounter() ? row.markCounterLocalToBeCleared() : row;
+        }
+
+        public void checkForExceptions() throws IOException
+        {
+            if (exception != null)
+                throw exception;
+        }
+
+        public void close()
+        {
+        }
+
+        /* We have a separate cleanup method because sometimes close is called before exhausting the
+           StreamDeserializer (for instance, when enclosed in an try-with-resources wrapper, such as in
+           BigTableWriter.append()).
+         */
+        public void cleanup()
+        {
+            if (in instanceof RewindableDataInputStreamPlus)
+            {
+                try
+                {
+                    ((RewindableDataInputStreamPlus) in).close(false);
+                }
+                catch (IOException e)
+                {
+                    logger.warn("Error while closing RewindableDataInputStreamPlus.", e);
+                }
+            }
+        }
+
+        private static File getTempBufferFile(CFMetaData metadata, long totalSize, UUID sessionId) throws IOException
+        {
+            ColumnFamilyStore cfs = Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName);
+            if (cfs == null)
+            {
+                // schema was dropped during streaming
+                throw new RuntimeException(String.format("CF %s.%s was dropped during streaming", metadata.ksName, metadata.cfName));
+            }
+
+            long maxSize = Math.min(MAX_SPILL_FILE_SIZE, totalSize);
+            File tmpDir = cfs.getDirectories().getTemporaryWriteableDirectoryAsFile(maxSize);
+            if (tmpDir == null)
+                throw new IOException(String.format("No sufficient disk space to stream legacy sstable from {}.{}. " +
+                                                         "Required disk space: %s.", FBUtilities.prettyPrintMemory(maxSize)));
+            return new File(tmpDir, String.format("%s-%s.%s", BUFFER_FILE_PREFIX, sessionId, BUFFER_FILE_SUFFIX));
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java b/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java
index b342edc..ea82d9b 100644
--- a/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java
+++ b/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.streaming;
 
-import java.io.File;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
@@ -26,6 +25,9 @@
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,13 +35,21 @@
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.Pair;
-
+import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.concurrent.Refs;
 
 /**
@@ -47,25 +57,34 @@
  */
 public class StreamReceiveTask extends StreamTask
 {
-    private static final ExecutorService executor = Executors.newCachedThreadPool(new NamedThreadFactory("StreamReceiveTask"));
     private static final Logger logger = LoggerFactory.getLogger(StreamReceiveTask.class);
 
+    private static final ExecutorService executor = Executors.newCachedThreadPool(new NamedThreadFactory("StreamReceiveTask"));
+
     // number of files to receive
     private final int totalFiles;
     // total size of files to receive
     private final long totalSize;
 
+    // Transaction tracking new files received
+    private final LifecycleTransaction txn;
+
     // true if task is done (either completed or aborted)
-    private boolean done = false;
+    private volatile boolean done = false;
 
     //  holds references to SSTables received
-    protected Collection<SSTableWriter> sstables;
+    protected Collection<SSTableReader> sstables;
+
+    private int remoteSSTablesReceived = 0;
 
     public StreamReceiveTask(StreamSession session, UUID cfId, int totalFiles, long totalSize)
     {
         super(session, cfId);
         this.totalFiles = totalFiles;
         this.totalSize = totalSize;
+        // this is an "offline" transaction, as we currently manually expose the sstables once done;
+        // this should be revisited at a later date, so that LifecycleTransaction manages all sstable state changes
+        this.txn = LifecycleTransaction.offline(OperationType.STREAM);
         this.sstables = new ArrayList<>(totalFiles);
     }
 
@@ -74,16 +93,32 @@
      *
      * @param sstable SSTable file received.
      */
-    public synchronized void received(SSTableWriter sstable)
+    public synchronized void received(SSTableMultiWriter sstable)
     {
         if (done)
+        {
+            logger.warn("[{}] Received sstable {} on already finished stream received task. Aborting sstable.", session.planId(),
+                        sstable.getFilename());
+            Throwables.maybeFail(sstable.abort(null));
             return;
+        }
 
-        assert cfId.equals(sstable.metadata.cfId);
+        remoteSSTablesReceived++;
+        assert cfId.equals(sstable.getCfId());
 
-        sstables.add(sstable);
+        Collection<SSTableReader> finished = null;
+        try
+        {
+            finished = sstable.finish(true);
+        }
+        catch (Throwable t)
+        {
+            Throwables.maybeFail(sstable.abort(t));
+        }
+        txn.update(finished, false);
+        sstables.addAll(finished);
 
-        if (sstables.size() == totalFiles)
+        if (remoteSSTablesReceived == totalFiles)
         {
             done = true;
             executor.submit(new OnCompletionRunnable(this));
@@ -100,6 +135,41 @@
         return totalSize;
     }
 
+    /**
+     * @return a LifecycleNewTracker whose operations are synchronised on this StreamReceiveTask.
+     */
+    public synchronized LifecycleNewTracker createLifecycleNewTracker()
+    {
+        if (done)
+            throw new RuntimeException(String.format("Stream receive task %s of cf %s already finished.", session.planId(), cfId));
+
+        return new LifecycleNewTracker()
+        {
+            @Override
+            public void trackNew(SSTable table)
+            {
+                synchronized (StreamReceiveTask.this)
+                {
+                    txn.trackNew(table);
+                }
+            }
+
+            @Override
+            public void untrackNew(SSTable table)
+            {
+                synchronized (StreamReceiveTask.this)
+                {
+                    txn.untrackNew(table);
+                }
+            }
+
+            public OperationType opType()
+            {
+                return txn.opType();
+            }
+        };
+    }
+
     private static class OnCompletionRunnable implements Runnable
     {
         private final StreamReceiveTask task;
@@ -111,70 +181,102 @@
 
         public void run()
         {
+            boolean hasViews = false;
+            ColumnFamilyStore cfs = null;
             try
             {
                 Pair<String, String> kscf = Schema.instance.getCF(task.cfId);
                 if (kscf == null)
                 {
                     // schema was dropped during streaming
-                    for (SSTableWriter writer : task.sstables)
-                        writer.abort();
                     task.sstables.clear();
+                    task.abortTransaction();
+                    task.session.taskCompleted(task);
                     return;
                 }
-                ColumnFamilyStore cfs = Keyspace.open(kscf.left).getColumnFamilyStore(kscf.right);
+                cfs = Keyspace.open(kscf.left).getColumnFamilyStore(kscf.right);
+                hasViews = !Iterables.isEmpty(View.findAll(kscf.left, kscf.right));
 
-                File lockfiledir = cfs.directories.getWriteableLocationAsFile(task.sstables.size() * 256L);
-                StreamLockfile lockfile = new StreamLockfile(lockfiledir, UUID.randomUUID());
-                lockfile.create(task.sstables);
-                List<SSTableReader> readers = new ArrayList<>();
-                for (SSTableWriter writer : task.sstables)
-                    readers.add(writer.finish(true));
-                lockfile.delete();
-                task.sstables.clear();
+                Collection<SSTableReader> readers = task.sstables;
 
                 try (Refs<SSTableReader> refs = Refs.ref(readers))
                 {
-                    // add sstables and build secondary indexes
-                    cfs.addSSTables(readers);
-                    cfs.indexManager.maybeBuildSecondaryIndexes(readers, cfs.indexManager.allIndexesNames());
-
-                    //invalidate row and counter cache
-                    if (cfs.isRowCacheEnabled() || cfs.metadata.isCounter())
+                    //We have a special path for views.
+                    //Since the view requires cleaning up any pre-existing state, we must put
+                    //all partitions through the same write path as normal mutations.
+                    //This also ensures any 2is are also updated
+                    if (hasViews)
                     {
-                        List<Bounds<Token>> boundsToInvalidate = new ArrayList<>(readers.size());
-                        for (SSTableReader sstable : readers)
-                            boundsToInvalidate.add(new Bounds<Token>(sstable.first.getToken(), sstable.last.getToken()));
-                        Set<Bounds<Token>> nonOverlappingBounds = Bounds.getNonOverlappingBounds(boundsToInvalidate);
-
-                        if (cfs.isRowCacheEnabled())
+                        for (SSTableReader reader : readers)
                         {
-                            int invalidatedKeys = cfs.invalidateRowCache(nonOverlappingBounds);
-                            if (invalidatedKeys > 0)
-                                logger.debug("[Stream #{}] Invalidated {} row cache entries on table {}.{} after stream " +
-                                             "receive task completed.", task.session.planId(), invalidatedKeys,
-                                             cfs.keyspace.getName(), cfs.getColumnFamilyName());
+                            Keyspace ks = Keyspace.open(reader.getKeyspaceName());
+                            try (ISSTableScanner scanner = reader.getScanner())
+                            {
+                                while (scanner.hasNext())
+                                {
+                                    try (UnfilteredRowIterator rowIterator = scanner.next())
+                                    {
+                                        // MV *can* be applied unsafe as we flush below before transaction is done.
+                                        ks.apply(new Mutation(PartitionUpdate.fromIterator(rowIterator)), false, true, false);
+                                    }
+                                }
+                            }
                         }
+                    }
+                    else
+                    {
+                        task.finishTransaction();
 
-                        if (cfs.metadata.isCounter())
+                        logger.debug("[Stream #{}] Received {} sstables from {} ({})", task.session.planId(), readers.size(), task.session.peer, readers);
+                        // add sstables and build secondary indexes
+                        cfs.addSSTables(readers);
+                        cfs.indexManager.buildAllIndexesBlocking(readers);
+
+                        //invalidate row and counter cache
+                        if (cfs.isRowCacheEnabled() || cfs.metadata.isCounter())
                         {
-                            int invalidatedKeys = cfs.invalidateCounterCache(nonOverlappingBounds);
-                            if (invalidatedKeys > 0)
-                                logger.debug("[Stream #{}] Invalidated {} counter cache entries on table {}.{} after stream " +
-                                             "receive task completed.", task.session.planId(), invalidatedKeys,
-                                             cfs.keyspace.getName(), cfs.getColumnFamilyName());
+                            List<Bounds<Token>> boundsToInvalidate = new ArrayList<>(readers.size());
+                            readers.forEach(sstable -> boundsToInvalidate.add(new Bounds<Token>(sstable.first.getToken(), sstable.last.getToken())));
+                            Set<Bounds<Token>> nonOverlappingBounds = Bounds.getNonOverlappingBounds(boundsToInvalidate);
+
+                            if (cfs.isRowCacheEnabled())
+                            {
+                                int invalidatedKeys = cfs.invalidateRowCache(nonOverlappingBounds);
+                                if (invalidatedKeys > 0)
+                                    logger.debug("[Stream #{}] Invalidated {} row cache entries on table {}.{} after stream " +
+                                                 "receive task completed.", task.session.planId(), invalidatedKeys,
+                                                 cfs.keyspace.getName(), cfs.getTableName());
+                            }
+
+                            if (cfs.metadata.isCounter())
+                            {
+                                int invalidatedKeys = cfs.invalidateCounterCache(nonOverlappingBounds);
+                                if (invalidatedKeys > 0)
+                                    logger.debug("[Stream #{}] Invalidated {} counter cache entries on table {}.{} after stream " +
+                                                 "receive task completed.", task.session.planId(), invalidatedKeys,
+                                                 cfs.keyspace.getName(), cfs.getTableName());
+                            }
                         }
                     }
                 }
-
                 task.session.taskCompleted(task);
             }
             catch (Throwable t)
             {
-                logger.error("Error applying streamed data: ", t);
                 JVMStabilityInspector.inspectThrowable(t);
                 task.session.onError(t);
             }
+            finally
+            {
+                //We don't keep the streamed sstables since we've applied them manually
+                //So we abort the txn and delete the streamed sstables
+                if (hasViews)
+                {
+                    if (cfs != null)
+                        cfs.forceBlockingFlush();
+                    task.abortTransaction();
+                }
+            }
         }
     }
 
@@ -190,8 +292,17 @@
             return;
 
         done = true;
-        for (SSTableWriter writer : sstables)
-            writer.abort();
+        abortTransaction();
         sstables.clear();
     }
+
+    private synchronized void abortTransaction()
+    {
+        txn.abort();
+    }
+
+    private synchronized void finishTransaction()
+    {
+        txn.finish();
+    }
 }

diff --git a/src/java/org/apache/cassandra/streaming/StreamRequest.java b/src/java/org/apache/cassandra/streaming/StreamRequest.java
index 0fe40cf..93726e7 100644
--- a/src/java/org/apache/cassandra/streaming/StreamRequest.java
+++ b/src/java/org/apache/cassandra/streaming/StreamRequest.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.streaming;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -28,6 +27,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 
@@ -65,7 +65,7 @@
                 out.writeUTF(cf);
         }
 
-        public StreamRequest deserialize(DataInput in, int version) throws IOException
+        public StreamRequest deserialize(DataInputPlus in, int version) throws IOException
         {
             String keyspace = in.readUTF();
             long repairedAt = in.readLong();
@@ -86,17 +86,17 @@
 
         public long serializedSize(StreamRequest request, int version)
         {
-            int size = TypeSizes.NATIVE.sizeof(request.keyspace);
-            size += TypeSizes.NATIVE.sizeof(request.repairedAt);
-            size += TypeSizes.NATIVE.sizeof(request.ranges.size());
+            int size = TypeSizes.sizeof(request.keyspace);
+            size += TypeSizes.sizeof(request.repairedAt);
+            size += TypeSizes.sizeof(request.ranges.size());
             for (Range<Token> range : request.ranges)
             {
                 size += Token.serializer.serializedSize(range.left, version);
                 size += Token.serializer.serializedSize(range.right, version);
             }
-            size += TypeSizes.NATIVE.sizeof(request.columnFamilies.size());
+            size += TypeSizes.sizeof(request.columnFamilies.size());
             for (String cf : request.columnFamilies)
-                size += TypeSizes.NATIVE.sizeof(cf);
+                size += TypeSizes.sizeof(cf);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/streaming/StreamResultFuture.java b/src/java/org/apache/cassandra/streaming/StreamResultFuture.java
index b299b87..c750d8c 100644
--- a/src/java/org/apache/cassandra/streaming/StreamResultFuture.java
+++ b/src/java/org/apache/cassandra/streaming/StreamResultFuture.java

@@ -202,7 +202,7 @@
 
     private synchronized void maybeComplete()
     {
-        if (!coordinator.hasActiveSessions())
+        if (finishedAllSessions())
         {
             StreamState finalState = getCurrentState();
             if (finalState.hasFailedSession())
@@ -217,4 +217,14 @@
             }
         }
     }
+
+    /**
+     * We can't use {@link StreamCoordinator#hasActiveSessions()} directly because {@link this#maybeComplete()}
+     * relies on the snapshotted state from {@link StreamCoordinator} and not the {@link StreamSession} state
+     * directly (CASSANDRA-15667), otherwise inconsistent snapshotted states may lead to completion races.
+     */
+    private boolean finishedAllSessions()
+    {
+        return coordinator.getAllSessionInfo().stream().allMatch(s -> s.state.isFinalState());
+    }
 }

diff --git a/src/java/org/apache/cassandra/streaming/StreamSession.java b/src/java/org/apache/cassandra/streaming/StreamSession.java
index 0f43f1f..bec4e84 100644
--- a/src/java/org/apache/cassandra/streaming/StreamSession.java
+++ b/src/java/org/apache/cassandra/streaming/StreamSession.java

@@ -26,10 +26,11 @@
 import java.util.concurrent.atomic.AtomicBoolean;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Function;
 import com.google.common.collect.*;
 
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.SSTableIntervalTree;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.slf4j.Logger;
@@ -38,9 +39,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.gms.*;
@@ -153,12 +152,27 @@
 
     public static enum State
     {
-        INITIALIZED,
-        PREPARING,
-        STREAMING,
-        WAIT_COMPLETE,
-        COMPLETE,
-        FAILED,
+        INITIALIZED(false),
+        PREPARING(false),
+        STREAMING(false),
+        WAIT_COMPLETE(false),
+        COMPLETE(true),
+        FAILED(true);
+
+        private final boolean finalState;
+
+        State(boolean finalState)
+        {
+            this.finalState = finalState;
+        }
+
+        /**
+         * @return true if current state is final, either COMPLETE OR FAILED.
+         */
+        public boolean isFinalState()
+        {
+            return finalState;
+        }
     }
 
     private volatile State state = State.INITIALIZED;
@@ -209,6 +223,12 @@
     }
 
 
+    StreamReceiveTask getReceivingTask(UUID cfId)
+    {
+        assert receivers.containsKey(cfId);
+        return receivers.get(cfId);
+    }
+
     /**
      * Bind this session to report to specific {@link StreamResultFuture} and
      * perform pre-streaming initialization.
@@ -273,8 +293,9 @@
      * @param flushTables flush tables?
      * @param repairedAt the time the repair started.
      */
-    public void addTransferRanges(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies, boolean flushTables, long repairedAt)
+    public synchronized void addTransferRanges(String keyspace, Collection<Range<Token>> ranges, Collection<String> columnFamilies, boolean flushTables, long repairedAt)
     {
+        failIfFinished();
         Collection<ColumnFamilyStore> stores = getColumnFamilyStores(keyspace, columnFamilies);
         if (flushTables)
             flushSSTables(stores);
@@ -292,6 +313,12 @@
         }
     }
 
+    private void failIfFinished()
+    {
+        if (state().isFinalState())
+            throw new RuntimeException(String.format("Stream %s is finished with state %s", planId(), state().name()));
+    }
+
     private Collection<ColumnFamilyStore> getColumnFamilyStores(String keyspace, Collection<String> columnFamilies)
     {
         Collection<ColumnFamilyStore> stores = new HashSet<>();
@@ -316,33 +343,30 @@
         {
             for (ColumnFamilyStore cfStore : stores)
             {
-                final List<Range<RowPosition>> keyRanges = new ArrayList<>(ranges.size());
+                final List<Range<PartitionPosition>> keyRanges = new ArrayList<>(ranges.size());
                 for (Range<Token> range : ranges)
                     keyRanges.add(Range.makeRowRange(range));
-                refs.addAll(cfStore.selectAndReference(new Function<View, List<SSTableReader>>()
-                {
-                    public List<SSTableReader> apply(View view)
+                refs.addAll(cfStore.selectAndReference(view -> {
+                    Set<SSTableReader> sstables = Sets.newHashSet();
+                    SSTableIntervalTree intervalTree = SSTableIntervalTree.build(view.select(SSTableSet.CANONICAL));
+                    for (Range<PartitionPosition> keyRange : keyRanges)
                     {
-                        SSTableIntervalTree intervalTree = SSTableIntervalTree.build(ColumnFamilyStore.CANONICAL_SSTABLES.apply(view));
-                        Set<SSTableReader> sstables = Sets.newHashSet();
-                        for (Range<RowPosition> keyRange : keyRanges)
+                        // keyRange excludes its start, while sstableInBounds is inclusive (of both start and end).
+                        // This is fine however, because keyRange has been created from a token range through Range.makeRowRange (see above).
+                        // And that later method uses the Token.maxKeyBound() method to creates the range, which return a "fake" key that
+                        // sort after all keys having the token. That "fake" key cannot however be equal to any real key, so that even
+                        // including keyRange.left will still exclude any key having the token of the original token range, and so we're
+                        // still actually selecting what we wanted.
+                        for (SSTableReader sstable : View.sstablesInBounds(keyRange.left, keyRange.right, intervalTree))
                         {
-                            // keyRange excludes its start, while sstableInBounds is inclusive (of both start and end).
-                            // This is fine however, because keyRange has been created from a token range through Range.makeRowRange (see above).
-                            // And that later method uses the Token.maxKeyBound() method to creates the range, which return a "fake" key that
-                            // sort after all keys having the token. That "fake" key cannot however be equal to any real key, so that even
-                            // including keyRange.left will still exclude any key having the token of the original token range, and so we're
-                            // still actually selecting what we wanted.
-                            for (SSTableReader sstable : View.sstablesInBounds(keyRange.left, keyRange.right, intervalTree))
-                            {
-                                if (!isIncremental || !sstable.isRepaired())
-                                    sstables.add(sstable);
-                            }
+                            if (!isIncremental || !sstable.isRepaired())
+                                sstables.add(sstable);
                         }
-
-                        logger.debug("ViewFilter for {}/{} sstables", sstables.size(), view.sstables.size());
-                        return ImmutableList.copyOf(sstables);
                     }
+
+                    if (logger.isDebugEnabled())
+                        logger.debug("ViewFilter for {}/{} sstables", sstables.size(), Iterables.size(view.select(SSTableSet.CANONICAL)));
+                    return sstables;
                 }).refs);
             }
 
@@ -366,8 +390,9 @@
         }
     }
 
-    public void addTransferFiles(Collection<SSTableStreamingSections> sstableDetails)
+    public synchronized void addTransferFiles(Collection<SSTableStreamingSections> sstableDetails)
     {
+        failIfFinished();
         Iterator<SSTableStreamingSections> iter = sstableDetails.iterator();
         while (iter.hasNext())
         {
@@ -712,8 +737,9 @@
         FBUtilities.waitOnFutures(flushes);
     }
 
-    private void prepareReceiving(StreamSummary summary)
+    private synchronized void prepareReceiving(StreamSummary summary)
     {
+        failIfFinished();
         if (summary.files > 0)
             receivers.put(summary.cfId, new StreamReceiveTask(this, summary.cfId, summary.files, summary.totalSize));
     }

diff --git a/src/java/org/apache/cassandra/streaming/StreamSummary.java b/src/java/org/apache/cassandra/streaming/StreamSummary.java
index dc332cb..c427283 100644
--- a/src/java/org/apache/cassandra/streaming/StreamSummary.java
+++ b/src/java/org/apache/cassandra/streaming/StreamSummary.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.streaming;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.UUID;
@@ -26,6 +25,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.UUIDSerializer;
@@ -88,7 +88,7 @@
             out.writeLong(summary.totalSize);
         }
 
-        public StreamSummary deserialize(DataInput in, int version) throws IOException
+        public StreamSummary deserialize(DataInputPlus in, int version) throws IOException
         {
             UUID cfId = UUIDSerializer.serializer.deserialize(in, MessagingService.current_version);
             int files = in.readInt();
@@ -99,8 +99,8 @@
         public long serializedSize(StreamSummary summary, int version)
         {
             long size = UUIDSerializer.serializer.serializedSize(summary.cfId, MessagingService.current_version);
-            size += TypeSizes.NATIVE.sizeof(summary.files);
-            size += TypeSizes.NATIVE.sizeof(summary.totalSize);
+            size += TypeSizes.sizeof(summary.files);
+            size += TypeSizes.sizeof(summary.totalSize);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java b/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java
index d08ffa9..e3d698e 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressedInputStream.java

@@ -24,7 +24,7 @@
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ThreadLocalRandom;
-import java.util.zip.Adler32;
+import java.util.function.Supplier;
 import java.util.zip.Checksum;
 
 import com.google.common.collect.Iterators;
@@ -33,7 +33,9 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.utils.ChecksumType;
 import org.apache.cassandra.utils.WrappedRunnable;
 
 /**
@@ -47,9 +49,10 @@
     private final CompressionInfo info;
     // chunk buffer
     private final BlockingQueue<byte[]> dataBuffer;
+    private final Supplier<Double> crcCheckChanceSupplier;
 
     // uncompressed bytes
-    private byte[] buffer;
+    private final byte[] buffer;
 
     // offset from the beginning of the buffer
     protected long bufferOffset = 0;
@@ -80,15 +83,16 @@
      * @param source Input source to read compressed data from
      * @param info Compression info
      */
-    public CompressedInputStream(InputStream source, CompressionInfo info)
+    public CompressedInputStream(InputStream source, CompressionInfo info, ChecksumType checksumType, Supplier<Double> crcCheckChanceSupplier)
     {
         this.info = info;
-        this.checksum =  new Adler32();
+        this.checksum =  checksumType.newInstance();
         this.buffer = new byte[info.parameters.chunkLength()];
         // buffer is limited to store up to 1024 chunks
-        this.dataBuffer = new ArrayBlockingQueue<byte[]>(Math.min(info.chunks.length, 1024));
+        this.dataBuffer = new ArrayBlockingQueue<>(Math.min(info.chunks.length, 1024));
+        this.crcCheckChanceSupplier = crcCheckChanceSupplier;
 
-        new Thread(new Reader(source, info, dataBuffer)).start();
+        new Thread(NamedThreadFactory.threadLocalDeallocator(new Reader(source, info, dataBuffer))).start();
     }
 
     public int read() throws IOException
@@ -128,11 +132,12 @@
     private void decompress(byte[] compressed) throws IOException
     {
         // uncompress
-        validBufferBytes = info.parameters.sstableCompressor.uncompress(compressed, 0, compressed.length - checksumBytes.length, buffer, 0);
+        validBufferBytes = info.parameters.getSstableCompressor().uncompress(compressed, 0, compressed.length - checksumBytes.length, buffer, 0);
         totalCompressedBytesRead += compressed.length;
 
         // validate crc randomly
-        if (info.parameters.getCrcCheckChance() > ThreadLocalRandom.current().nextDouble())
+        if (this.crcCheckChanceSupplier.get() >= 1d ||
+            this.crcCheckChanceSupplier.get() > ThreadLocalRandom.current().nextDouble())
         {
             checksum.update(compressed, 0, compressed.length - checksumBytes.length);
 

diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java b/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java
index fa1022d..bc87c8f 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressedStreamReader.java

@@ -17,16 +17,13 @@
  */
 package org.apache.cassandra.streaming.compress;
 
-import java.io.DataInputStream;
-
 import java.io.IOException;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
 
 import com.google.common.base.Throwables;
 
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -35,13 +32,11 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.ProgressInfo;
 import org.apache.cassandra.streaming.StreamReader;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.streaming.messages.FileMessageHeader;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.BytesReadTracker;
+import org.apache.cassandra.io.util.TrackedInputStream;
 import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.utils.Throwables.extractIOExceptionCause;
@@ -66,8 +61,8 @@
      * @throws java.io.IOException if reading the remote sstable fails. Will throw an RTE if local write fails.
      */
     @Override
-    @SuppressWarnings("resource")
-    public SSTableWriter read(ReadableByteChannel channel) throws IOException
+    @SuppressWarnings("resource") // channel needs to remain open, streams on top of it can't be closed
+    public SSTableMultiWriter read(ReadableByteChannel channel) throws IOException
     {
         long totalSize = totalSize();
 
@@ -86,10 +81,13 @@
                      session.planId(), fileSeqNum, session.peer, repairedAt, totalSize, cfs.keyspace.getName(),
                      cfs.getColumnFamilyName());
 
-        CompressedInputStream cis = new CompressedInputStream(Channels.newInputStream(channel), compressionInfo);
-        BytesReadTracker in = new BytesReadTracker(new DataInputStream(cis));
-        SSTableWriter writer = null;
-        DecoratedKey key = null;
+        CompressedInputStream cis = new CompressedInputStream(Channels.newInputStream(channel), compressionInfo,
+                                                              inputVersion.compressedChecksumType(), cfs::getCrcCheckChance);
+        TrackedInputStream in = new TrackedInputStream(cis);
+
+        StreamDeserializer deserializer = new StreamDeserializer(cfs.metadata, in, inputVersion, getHeader(cfs.metadata),
+                                                                 totalSize, session.planId());
+        SSTableMultiWriter writer = null;
         try
         {
             writer = createWriter(cfs, totalSize, repairedAt, format);
@@ -106,9 +104,7 @@
 
                 while (in.getBytesRead() < sectionLength)
                 {
-                    key = StorageService.getPartitioner().decorateKey(ByteBufferUtil.readWithShortLength(in));
-                    writeRow(key, writer, in, cfs);
-
+                    writePartition(deserializer, writer);
                     // when compressed, report total bytes of compressed chunks read since remoteFile.size is the sum of chunks transferred
                     session.progress(desc, ProgressInfo.Direction.IN, cis.getTotalCompressedBytesRead(), totalSize);
                 }
@@ -119,25 +115,22 @@
         }
         catch (Throwable e)
         {
-            if (key != null)
+            if (deserializer != null)
                 logger.warn("[Stream {}] Error while reading partition {} from stream on ks='{}' and table='{}'.",
-                            session.planId(), key, cfs.keyspace.getName(), cfs.getColumnFamilyName());
+                            session.planId(), deserializer.partitionKey(), cfs.keyspace.getName(), cfs.getTableName());
             if (writer != null)
             {
-                try
-                {
-                    writer.abort();
-                }
-                catch (Throwable e2)
-                {
-                    // add abort error to original and continue so we can drain unread stream
-                    e.addSuppressed(e2);
-                }
+                writer.abort(e);
             }
             if (extractIOExceptionCause(e).isPresent())
                 throw e;
             throw Throwables.propagate(e);
         }
+        finally
+        {
+            if (deserializer != null)
+                deserializer.cleanup();
+        }
     }
 
     @Override

diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressedStreamWriter.java b/src/java/org/apache/cassandra/streaming/compress/CompressedStreamWriter.java
index 99e9bd6..f37af29 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressedStreamWriter.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressedStreamWriter.java

@@ -32,6 +32,7 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.ChannelProxy;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.streaming.ProgressInfo;
 import org.apache.cassandra.streaming.StreamSession;
@@ -61,7 +62,7 @@
         long totalSize = totalSize();
         logger.debug("[Stream #{}] Start streaming file {} to {}, repairedAt = {}, totalSize = {}", session.planId(),
                      sstable.getFilename(), session.peer, sstable.getSSTableMetadata().repairedAt, totalSize);
-        try (RandomAccessReader file = sstable.openDataReader(); final ChannelProxy fc = file.getChannel())
+        try (ChannelProxy fc = sstable.getDataChannel().sharedCopy())
         {
             long progress = 0L;
             // calculate chunks to transfer. we want to send continuous chunks altogether.
@@ -84,13 +85,7 @@
                     final long bytesTransferredFinal = bytesTransferred;
                     final int toTransfer = (int) Math.min(CHUNK_SIZE, length - bytesTransferred);
                     limiter.acquire(toTransfer);
-                    long lastWrite = out.applyToChannel(new Function<WritableByteChannel, Long>()
-                    {
-                        public Long apply(WritableByteChannel wbc)
-                        {
-                            return fc.transferTo(section.left + bytesTransferredFinal, toTransfer, wbc);
-                        }
-                    });
+                    long lastWrite = out.applyToChannel((wbc) -> fc.transferTo(section.left + bytesTransferredFinal, toTransfer, wbc));
                     bytesTransferred += lastWrite;
                     progress += lastWrite;
                     session.progress(sstable.descriptor, ProgressInfo.Direction.OUT, progress, totalSize);

diff --git a/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java b/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java
index 907a1c7..bd0c2d5 100644
--- a/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java
+++ b/src/java/org/apache/cassandra/streaming/compress/CompressionInfo.java

@@ -17,13 +17,13 @@
  */
 package org.apache.cassandra.streaming.compress;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
@@ -34,9 +34,9 @@
     public static final IVersionedSerializer<CompressionInfo> serializer = new CompressionInfoSerializer();
 
     public final CompressionMetadata.Chunk[] chunks;
-    public final CompressionParameters parameters;
+    public final CompressionParams parameters;
 
-    public CompressionInfo(CompressionMetadata.Chunk[] chunks, CompressionParameters parameters)
+    public CompressionInfo(CompressionMetadata.Chunk[] chunks, CompressionParams parameters)
     {
         assert chunks != null && parameters != null;
         this.chunks = chunks;
@@ -58,10 +58,10 @@
             for (int i = 0; i < chunkCount; i++)
                 CompressionMetadata.Chunk.serializer.serialize(info.chunks[i], out, version);
             // compression params
-            CompressionParameters.serializer.serialize(info.parameters, out, version);
+            CompressionParams.serializer.serialize(info.parameters, out, version);
         }
 
-        public CompressionInfo deserialize(DataInput in, int version) throws IOException
+        public CompressionInfo deserialize(DataInputPlus in, int version) throws IOException
         {
             // chunks
             int chunkCount = in.readInt();
@@ -73,22 +73,22 @@
                 chunks[i] = CompressionMetadata.Chunk.serializer.deserialize(in, version);
 
             // compression params
-            CompressionParameters parameters = CompressionParameters.serializer.deserialize(in, version);
+            CompressionParams parameters = CompressionParams.serializer.deserialize(in, version);
             return new CompressionInfo(chunks, parameters);
         }
 
         public long serializedSize(CompressionInfo info, int version)
         {
             if (info == null)
-                return TypeSizes.NATIVE.sizeof(-1);
+                return TypeSizes.sizeof(-1);
 
             // chunks
             int chunkCount = info.chunks.length;
-            long size = TypeSizes.NATIVE.sizeof(chunkCount);
+            long size = TypeSizes.sizeof(chunkCount);
             for (int i = 0; i < chunkCount; i++)
                 size += CompressionMetadata.Chunk.serializer.serializedSize(info.chunks[i], version);
             // compression params
-            size += CompressionParameters.serializer.serializedSize(info.parameters, version);
+            size += CompressionParams.serializer.serializedSize(info.parameters, version);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java b/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java
index b555f64..44ff553 100644
--- a/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.streaming.messages;
 
-import java.io.IOException;
 import java.nio.channels.ReadableByteChannel;
 
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
@@ -27,12 +26,12 @@
 {
     public static Serializer<CompleteMessage> serializer = new Serializer<CompleteMessage>()
     {
-        public CompleteMessage deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException
+        public CompleteMessage deserialize(ReadableByteChannel in, int version, StreamSession session)
         {
             return new CompleteMessage();
         }
 
-        public void serialize(CompleteMessage message, DataOutputStreamPlus out, int version, StreamSession session) throws IOException {}
+        public void serialize(CompleteMessage message, DataOutputStreamPlus out, int version, StreamSession session) {}
     };
 
     public CompleteMessage()

diff --git a/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java b/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java
index b2af699..0e06bc0 100644
--- a/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java
+++ b/src/java/org/apache/cassandra/streaming/messages/FileMessageHeader.java

@@ -17,17 +17,20 @@
  */
 package org.apache.cassandra.streaming.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.UUID;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.compress.CompressionMetadata;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.compress.CompressionInfo;
 import org.apache.cassandra.utils.Pair;
@@ -43,7 +46,7 @@
     public final UUID cfId;
     public final int sequenceNumber;
     /** SSTable version */
-    public final String version;
+    public final Version version;
 
     /** SSTable format **/
     public final SSTableFormat.Type format;
@@ -58,19 +61,21 @@
     private final CompressionMetadata compressionMetadata;
     public final long repairedAt;
     public final int sstableLevel;
+    public final SerializationHeader.Component header;
 
     /* cached size value */
     private transient final long size;
 
     public FileMessageHeader(UUID cfId,
                              int sequenceNumber,
-                             String version,
+                             Version version,
                              SSTableFormat.Type format,
                              long estimatedKeys,
                              List<Pair<Long, Long>> sections,
                              CompressionInfo compressionInfo,
                              long repairedAt,
-                             int sstableLevel)
+                             int sstableLevel,
+                             SerializationHeader.Component header)
     {
         this.cfId = cfId;
         this.sequenceNumber = sequenceNumber;
@@ -82,18 +87,20 @@
         this.compressionMetadata = null;
         this.repairedAt = repairedAt;
         this.sstableLevel = sstableLevel;
+        this.header = header;
         this.size = calculateSize();
     }
 
     public FileMessageHeader(UUID cfId,
                              int sequenceNumber,
-                             String version,
+                             Version version,
                              SSTableFormat.Type format,
                              long estimatedKeys,
                              List<Pair<Long, Long>> sections,
                              CompressionMetadata compressionMetadata,
                              long repairedAt,
-                             int sstableLevel)
+                             int sstableLevel,
+                             SerializationHeader.Component header)
     {
         this.cfId = cfId;
         this.sequenceNumber = sequenceNumber;
@@ -105,6 +112,7 @@
         this.compressionMetadata = compressionMetadata;
         this.repairedAt = repairedAt;
         this.sstableLevel = sstableLevel;
+        this.header = header;
         this.size = calculateSize();
     }
 
@@ -182,7 +190,7 @@
         {
             UUIDSerializer.serializer.serialize(header.cfId, out, version);
             out.writeInt(header.sequenceNumber);
-            out.writeUTF(header.version);
+            out.writeUTF(header.version.toString());
 
             //We can't stream to a node that doesn't understand a new sstable format
             if (version < StreamMessage.VERSION_22 && header.format != SSTableFormat.Type.LEGACY && header.format != SSTableFormat.Type.BIG)
@@ -205,14 +213,17 @@
             CompressionInfo.serializer.serialize(compressionInfo, out, version);
             out.writeLong(header.repairedAt);
             out.writeInt(header.sstableLevel);
+
+            if (version >= StreamMessage.VERSION_30 && header.version.storeRows())
+                SerializationHeader.serializer.serialize(header.version, header.header, out);
             return compressionInfo;
         }
 
-        public FileMessageHeader deserialize(DataInput in, int version) throws IOException
+        public FileMessageHeader deserialize(DataInputPlus in, int version) throws IOException
         {
             UUID cfId = UUIDSerializer.serializer.deserialize(in, MessagingService.current_version);
             int sequenceNumber = in.readInt();
-            String sstableVersion = in.readUTF();
+            Version sstableVersion = DatabaseDescriptor.getSSTableFormat().info.getVersion(in.readUTF());
 
             SSTableFormat.Type format = SSTableFormat.Type.LEGACY;
             if (version >= StreamMessage.VERSION_22)
@@ -226,28 +237,36 @@
             CompressionInfo compressionInfo = CompressionInfo.serializer.deserialize(in, MessagingService.current_version);
             long repairedAt = in.readLong();
             int sstableLevel = in.readInt();
-            return new FileMessageHeader(cfId, sequenceNumber, sstableVersion, format, estimatedKeys, sections, compressionInfo, repairedAt, sstableLevel);
+            SerializationHeader.Component header = version >= StreamMessage.VERSION_30 && sstableVersion.storeRows()
+                                                 ? SerializationHeader.serializer.deserialize(sstableVersion, in)
+                                                 : null;
+
+            return new FileMessageHeader(cfId, sequenceNumber, sstableVersion, format, estimatedKeys, sections, compressionInfo, repairedAt, sstableLevel, header);
         }
 
         public long serializedSize(FileMessageHeader header, int version)
         {
             long size = UUIDSerializer.serializer.serializedSize(header.cfId, version);
-            size += TypeSizes.NATIVE.sizeof(header.sequenceNumber);
-            size += TypeSizes.NATIVE.sizeof(header.version);
+            size += TypeSizes.sizeof(header.sequenceNumber);
+            size += TypeSizes.sizeof(header.version.toString());
 
             if (version >= StreamMessage.VERSION_22)
-                size += TypeSizes.NATIVE.sizeof(header.format.name);
+                size += TypeSizes.sizeof(header.format.name);
 
-            size += TypeSizes.NATIVE.sizeof(header.estimatedKeys);
+            size += TypeSizes.sizeof(header.estimatedKeys);
 
-            size += TypeSizes.NATIVE.sizeof(header.sections.size());
+            size += TypeSizes.sizeof(header.sections.size());
             for (Pair<Long, Long> section : header.sections)
             {
-                size += TypeSizes.NATIVE.sizeof(section.left);
-                size += TypeSizes.NATIVE.sizeof(section.right);
+                size += TypeSizes.sizeof(section.left);
+                size += TypeSizes.sizeof(section.right);
             }
             size += CompressionInfo.serializer.serializedSize(header.compressionInfo, version);
-            size += TypeSizes.NATIVE.sizeof(header.sstableLevel);
+            size += TypeSizes.sizeof(header.sstableLevel);
+
+            if (version >= StreamMessage.VERSION_30)
+                size += SerializationHeader.serializer.serializedSize(header.version, header.header);
+
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java b/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java
index 2870c03..438cb0b 100644
--- a/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/IncomingFileMessage.java

@@ -17,14 +17,15 @@
  */
 package org.apache.cassandra.streaming.messages;
 
-import java.io.DataInputStream;
 import java.io.IOException;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
+import java.util.Optional;
 
-import com.google.common.base.Optional;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.streaming.StreamReader;
 import org.apache.cassandra.streaming.StreamSession;
@@ -43,7 +44,7 @@
         @SuppressWarnings("resource")
         public IncomingFileMessage deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException
         {
-            DataInputStream input = new DataInputStream(Channels.newInputStream(in));
+            DataInputPlus input = new DataInputStreamPlus(Channels.newInputStream(in));
             FileMessageHeader header = FileMessageHeader.serializer.deserialize(input, version);
             StreamReader reader = !header.isCompressed() ? new StreamReader(header, session)
                     : new CompressedStreamReader(header, session);
@@ -66,9 +67,9 @@
     };
 
     public FileMessageHeader header;
-    public SSTableWriter sstable;
+    public SSTableMultiWriter sstable;
 
-    public IncomingFileMessage(SSTableWriter sstable, FileMessageHeader header)
+    public IncomingFileMessage(SSTableMultiWriter sstable, FileMessageHeader header)
     {
         super(Type.FILE);
         this.header = header;

diff --git a/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java b/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java
index 30b79f8..b2621f3 100644
--- a/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/OutgoingFileMessage.java

@@ -75,13 +75,14 @@
         filename = sstable.getFilename();
         this.header = new FileMessageHeader(sstable.metadata.cfId,
                                             sequenceNumber,
-                                            sstable.descriptor.version.toString(),
+                                            sstable.descriptor.version,
                                             sstable.descriptor.formatType,
                                             estimatedKeys,
                                             sections,
                                             sstable.compression ? sstable.getCompressionMetadata() : null,
                                             repairedAt,
-                                            keepSSTableLevel ? sstable.getSSTableLevel() : 0);
+                                            keepSSTableLevel ? sstable.getSSTableLevel() : 0,
+                                            sstable.header == null ? null : sstable.header.toComponent());
     }
 
     public synchronized void serialize(DataOutputStreamPlus out, int version, StreamSession session) throws IOException

diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java
index 004df18..1f53be7 100644
--- a/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/PrepareMessage.java

@@ -23,6 +23,8 @@
 import java.util.ArrayList;
 import java.util.Collection;
 
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.streaming.StreamRequest;
 import org.apache.cassandra.streaming.StreamSession;
@@ -32,9 +34,10 @@
 {
     public static Serializer<PrepareMessage> serializer = new Serializer<PrepareMessage>()
     {
+        @SuppressWarnings("resource") // Not closing constructed DataInputPlus's as the channel needs to remain open.
         public PrepareMessage deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException
         {
-            DataInput input = new DataInputStream(Channels.newInputStream(in));
+            DataInputPlus input = new DataInputStreamPlus(Channels.newInputStream(in));
             PrepareMessage message = new PrepareMessage();
             // requests
             int numRequests = input.readInt();

diff --git a/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java b/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java
index 1255947..251b9c8 100644
--- a/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java

@@ -22,6 +22,8 @@
 import java.nio.channels.ReadableByteChannel;
 import java.util.UUID;
 
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.StreamSession;
@@ -31,9 +33,10 @@
 {
     public static Serializer<ReceivedMessage> serializer = new Serializer<ReceivedMessage>()
     {
+        @SuppressWarnings("resource") // Not closing constructed DataInputPlus's as the channel needs to remain open.
         public ReceivedMessage deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException
         {
-            DataInput input = new DataInputStream(Channels.newInputStream(in));
+            DataInputPlus input = new DataInputStreamPlus(Channels.newInputStream(in));
             return new ReceivedMessage(UUIDSerializer.serializer.deserialize(input, MessagingService.current_version), input.readInt());
         }
 

diff --git a/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java b/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java
index 6673aa1..047fb06 100644
--- a/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/RetryMessage.java

@@ -22,6 +22,8 @@
 import java.nio.channels.ReadableByteChannel;
 import java.util.UUID;
 
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.streaming.StreamSession;
@@ -35,9 +37,10 @@
 {
     public static Serializer<RetryMessage> serializer = new Serializer<RetryMessage>()
     {
+        @SuppressWarnings("resource") // Not closing constructed DataInputPlus's as the channel needs to remain open.
         public RetryMessage deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException
         {
-            DataInput input = new DataInputStream(Channels.newInputStream(in));
+            DataInputPlus input = new DataInputStreamPlus(Channels.newInputStream(in));
             return new RetryMessage(UUIDSerializer.serializer.deserialize(input, MessagingService.current_version), input.readInt());
         }
 

diff --git a/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java b/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java
index 46f49d6..4a5b6df 100644
--- a/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.streaming.messages;
 
-import java.io.IOException;
 import java.nio.channels.ReadableByteChannel;
 
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
@@ -27,12 +26,12 @@
 {
     public static Serializer<SessionFailedMessage> serializer = new Serializer<SessionFailedMessage>()
     {
-        public SessionFailedMessage deserialize(ReadableByteChannel in, int version, StreamSession session) throws IOException
+        public SessionFailedMessage deserialize(ReadableByteChannel in, int version, StreamSession session)
         {
             return new SessionFailedMessage();
         }
 
-        public void serialize(SessionFailedMessage message, DataOutputStreamPlus out, int version, StreamSession session) throws IOException {}
+        public void serialize(SessionFailedMessage message, DataOutputStreamPlus out, int version, StreamSession session) {}
     };
 
     public SessionFailedMessage()

diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java
index e8b3f82..6d807e9 100644
--- a/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.streaming.messages;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
@@ -25,6 +24,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputBufferFixed;
 import org.apache.cassandra.io.util.DataOutputPlus;
@@ -116,7 +116,7 @@
             out.writeBoolean(message.isIncremental);
         }
 
-        public StreamInitMessage deserialize(DataInput in, int version) throws IOException
+        public StreamInitMessage deserialize(DataInputPlus in, int version) throws IOException
         {
             InetAddress from = CompactEndpointSerializationHelper.deserialize(in);
             int sessionIndex = in.readInt();
@@ -131,12 +131,12 @@
         public long serializedSize(StreamInitMessage message, int version)
         {
             long size = CompactEndpointSerializationHelper.serializedSize(message.from);
-            size += TypeSizes.NATIVE.sizeof(message.sessionIndex);
+            size += TypeSizes.sizeof(message.sessionIndex);
             size += UUIDSerializer.serializer.serializedSize(message.planId, MessagingService.current_version);
-            size += TypeSizes.NATIVE.sizeof(message.description);
-            size += TypeSizes.NATIVE.sizeof(message.isForOutgoing);
-            size += TypeSizes.NATIVE.sizeof(message.keepSSTableLevel);
-            size += TypeSizes.NATIVE.sizeof(message.isIncremental);
+            size += TypeSizes.sizeof(message.description);
+            size += TypeSizes.sizeof(message.isForOutgoing);
+            size += TypeSizes.sizeof(message.keepSSTableLevel);
+            size += TypeSizes.sizeof(message.isIncremental);
             return size;
         }
     }

diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java
index f0a9ef3..eb7086f 100644
--- a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java
+++ b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java

@@ -35,7 +35,8 @@
     /** Streaming protocol version */
     public static final int VERSION_20 = 2;
     public static final int VERSION_22 = 3;
-    public static final int CURRENT_VERSION = VERSION_22;
+    public static final int VERSION_30 = 4;
+    public static final int CURRENT_VERSION = VERSION_30;
 
     public static void serialize(StreamMessage message, DataOutputStreamPlus out, int version, StreamSession session) throws IOException
     {

diff --git a/src/java/org/apache/cassandra/thrift/CassandraServer.java b/src/java/org/apache/cassandra/thrift/CassandraServer.java
index 36664ae..163eb2d 100644
--- a/src/java/org/apache/cassandra/thrift/CassandraServer.java
+++ b/src/java/org/apache/cassandra/thrift/CassandraServer.java

@@ -27,7 +27,6 @@
 import java.util.zip.DataFormatException;
 import java.util.zip.Inflater;
 
-import com.google.common.base.Function;
 import com.google.common.base.Joiner;
 import com.google.common.collect.*;
 import com.google.common.primitives.Longs;
@@ -37,13 +36,16 @@
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
 import org.apache.cassandra.cql3.statements.ParsedStatement;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.filter.ColumnSlice;
 import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.TimeUUIDType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.exceptions.*;
@@ -51,13 +53,13 @@
 import org.apache.cassandra.locator.DynamicEndpointSnitch;
 import org.apache.cassandra.metrics.ClientMetrics;
 import org.apache.cassandra.scheduler.IRequestScheduler;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.service.pager.QueryPagers;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
-import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.btree.BTreeSet;
 import org.apache.thrift.TException;
 
 public class CassandraServer implements Cassandra.Iface
@@ -84,19 +86,15 @@
         return ThriftSessionManager.instance.currentSession();
     }
 
-    protected Map<DecoratedKey, ColumnFamily> readColumnFamily(List<ReadCommand> commands, org.apache.cassandra.db.ConsistencyLevel consistency_level, ClientState cState)
+    protected PartitionIterator read(List<SinglePartitionReadCommand> commands, org.apache.cassandra.db.ConsistencyLevel consistency_level, ClientState cState)
     throws org.apache.cassandra.exceptions.InvalidRequestException, UnavailableException, TimedOutException
     {
-        // TODO - Support multiple column families per row, right now row only contains 1 column family
-        Map<DecoratedKey, ColumnFamily> columnFamilyKeyMap = new HashMap<DecoratedKey, ColumnFamily>();
-
-        List<Row> rows = null;
         try
         {
             schedule(DatabaseDescriptor.getReadRpcTimeout());
             try
             {
-                rows = StorageProxy.read(commands, consistency_level, cState);
+                return StorageProxy.read(new SinglePartitionReadCommand.Group(commands, DataLimits.NONE), consistency_level, cState);
             }
             finally
             {
@@ -105,180 +103,176 @@
         }
         catch (RequestExecutionException e)
         {
-            ThriftConversion.rethrow(e);
+            throw ThriftConversion.rethrow(e);
         }
-
-        for (Row row: rows)
-        {
-            columnFamilyKeyMap.put(row.key, row.cf);
-        }
-        return columnFamilyKeyMap;
     }
 
-    public List<ColumnOrSuperColumn> thriftifyColumns(Collection<Cell> cells, boolean reverseOrder, long now)
+    public List<ColumnOrSuperColumn> thriftifyColumns(CFMetaData metadata, Iterator<LegacyLayout.LegacyCell> cells)
     {
-        ArrayList<ColumnOrSuperColumn> thriftColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
-        for (Cell cell : cells)
+        ArrayList<ColumnOrSuperColumn> thriftColumns = new ArrayList<>();
+        while (cells.hasNext())
         {
-            if (!cell.isLive(now))
-                continue;
-
-            thriftColumns.add(thriftifyColumnWithName(cell, cell.name().toByteBuffer()));
+            LegacyLayout.LegacyCell cell = cells.next();
+            thriftColumns.add(thriftifyColumnWithName(metadata, cell, cell.name.encode(metadata)));
         }
-
-        // we have to do the reversing here, since internally we pass results around in ColumnFamily
-        // objects, which always sort their cells in the "natural" order
-        // TODO this is inconvenient for direct users of StorageProxy
-        if (reverseOrder)
-            Collections.reverse(thriftColumns);
         return thriftColumns;
     }
 
-    private ColumnOrSuperColumn thriftifyColumnWithName(Cell cell, ByteBuffer newName)
+    private ColumnOrSuperColumn thriftifyColumnWithName(CFMetaData metadata, LegacyLayout.LegacyCell cell, ByteBuffer newName)
     {
-        if (cell instanceof CounterCell)
-            return new ColumnOrSuperColumn().setCounter_column(thriftifySubCounter(cell).setName(newName));
+        if (cell.isCounter())
+            return new ColumnOrSuperColumn().setCounter_column(thriftifySubCounter(metadata, cell).setName(newName));
         else
-            return new ColumnOrSuperColumn().setColumn(thriftifySubColumn(cell).setName(newName));
+            return new ColumnOrSuperColumn().setColumn(thriftifySubColumn(cell, newName));
     }
 
-    private Column thriftifySubColumn(Cell cell)
+    private Column thriftifySubColumn(CFMetaData metadata, LegacyLayout.LegacyCell cell)
     {
-        assert !(cell instanceof CounterCell);
+        return thriftifySubColumn(cell, cell.name.encode(metadata));
+    }
 
-        Column thrift_column = new Column(cell.name().toByteBuffer()).setValue(cell.value()).setTimestamp(cell.timestamp());
-        if (cell instanceof ExpiringCell)
-        {
-            thrift_column.setTtl(((ExpiringCell) cell).getTimeToLive());
-        }
+    private Column thriftifySubColumn(LegacyLayout.LegacyCell cell, ByteBuffer name)
+    {
+        assert !cell.isCounter();
+
+        Column thrift_column = new Column(name).setValue(cell.value).setTimestamp(cell.timestamp);
+        if (cell.isExpiring())
+            thrift_column.setTtl(cell.ttl);
         return thrift_column;
     }
 
-    private List<Column> thriftifyColumnsAsColumns(Collection<Cell> cells, long now)
+    private List<Column> thriftifyColumnsAsColumns(CFMetaData metadata, Iterator<LegacyLayout.LegacyCell> cells)
     {
-        List<Column> thriftColumns = new ArrayList<Column>(cells.size());
-        for (Cell cell : cells)
-        {
-            if (!cell.isLive(now))
-                continue;
-
-            thriftColumns.add(thriftifySubColumn(cell));
-        }
+        List<Column> thriftColumns = new ArrayList<>();
+        while (cells.hasNext())
+            thriftColumns.add(thriftifySubColumn(metadata, cells.next()));
         return thriftColumns;
     }
 
-    private CounterColumn thriftifySubCounter(Cell cell)
+    private CounterColumn thriftifySubCounter(CFMetaData metadata, LegacyLayout.LegacyCell cell)
     {
-        assert cell instanceof CounterCell;
-        return new CounterColumn(cell.name().toByteBuffer(), CounterContext.instance().total(cell.value()));
+        assert cell.isCounter();
+        return new CounterColumn(cell.name.encode(metadata), CounterContext.instance().total(cell.value));
     }
 
-    private List<ColumnOrSuperColumn> thriftifySuperColumns(Collection<Cell> cells,
-                                                            boolean reverseOrder,
-                                                            long now,
+    private List<ColumnOrSuperColumn> thriftifySuperColumns(CFMetaData metadata,
+                                                            Iterator<LegacyLayout.LegacyCell> cells,
                                                             boolean subcolumnsOnly,
-                                                            boolean isCounterCF)
+                                                            boolean isCounterCF,
+                                                            boolean reversed)
     {
         if (subcolumnsOnly)
         {
-            ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
-            for (Cell cell : cells)
+            ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<>();
+            while (cells.hasNext())
             {
-                if (!cell.isLive(now))
-                    continue;
-
-                thriftSuperColumns.add(thriftifyColumnWithName(cell, SuperColumns.subName(cell.name())));
+                LegacyLayout.LegacyCell cell = cells.next();
+                thriftSuperColumns.add(thriftifyColumnWithName(metadata, cell, cell.name.superColumnSubName()));
             }
-            if (reverseOrder)
+            // Generally, cells come reversed if the query is reverse. However, this is not the case within a super column because
+            // internally a super column is a map within a row and those are never returned reversed.
+            if (reversed)
                 Collections.reverse(thriftSuperColumns);
             return thriftSuperColumns;
         }
         else
         {
             if (isCounterCF)
-                return thriftifyCounterSuperColumns(cells, reverseOrder, now);
+                return thriftifyCounterSuperColumns(metadata, cells, reversed);
             else
-                return thriftifySuperColumns(cells, reverseOrder, now);
+                return thriftifySuperColumns(cells, reversed);
         }
     }
 
-    private List<ColumnOrSuperColumn> thriftifySuperColumns(Collection<Cell> cells, boolean reverseOrder, long now)
+    private List<ColumnOrSuperColumn> thriftifySuperColumns(Iterator<LegacyLayout.LegacyCell> cells, boolean reversed)
     {
-        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
+        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<>();
         SuperColumn current = null;
-        for (Cell cell : cells)
+        while (cells.hasNext())
         {
-            if (!cell.isLive(now))
-                continue;
-
-            ByteBuffer scName = SuperColumns.scName(cell.name());
+            LegacyLayout.LegacyCell cell = cells.next();
+            ByteBuffer scName = cell.name.superColumnName();
             if (current == null || !scName.equals(current.bufferForName()))
             {
-                current = new SuperColumn(scName, new ArrayList<Column>());
+                // Generally, cells come reversed if the query is reverse. However, this is not the case within a super column because
+                // internally a super column is a map within a row and those are never returned reversed.
+                if (current != null && reversed)
+                    Collections.reverse(current.columns);
+
+                current = new SuperColumn(scName, new ArrayList<>());
                 thriftSuperColumns.add(new ColumnOrSuperColumn().setSuper_column(current));
             }
-            current.getColumns().add(thriftifySubColumn(cell).setName(SuperColumns.subName(cell.name())));
+            current.getColumns().add(thriftifySubColumn(cell, cell.name.superColumnSubName()));
         }
 
-        if (reverseOrder)
-            Collections.reverse(thriftSuperColumns);
+        if (current != null && reversed)
+            Collections.reverse(current.columns);
 
         return thriftSuperColumns;
     }
 
-    private List<ColumnOrSuperColumn> thriftifyCounterSuperColumns(Collection<Cell> cells, boolean reverseOrder, long now)
+    private List<ColumnOrSuperColumn> thriftifyCounterSuperColumns(CFMetaData metadata, Iterator<LegacyLayout.LegacyCell> cells, boolean reversed)
     {
-        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<ColumnOrSuperColumn>(cells.size());
+        ArrayList<ColumnOrSuperColumn> thriftSuperColumns = new ArrayList<>();
         CounterSuperColumn current = null;
-        for (Cell cell : cells)
+        while (cells.hasNext())
         {
-            if (!cell.isLive(now))
-                continue;
-
-            ByteBuffer scName = SuperColumns.scName(cell.name());
+            LegacyLayout.LegacyCell cell = cells.next();
+            ByteBuffer scName = cell.name.superColumnName();
             if (current == null || !scName.equals(current.bufferForName()))
             {
-                current = new CounterSuperColumn(scName, new ArrayList<CounterColumn>());
+                // Generally, cells come reversed if the query is reverse. However, this is not the case within a super column because
+                // internally a super column is a map within a row and those are never returned reversed.
+                if (current != null && reversed)
+                    Collections.reverse(current.columns);
+
+                current = new CounterSuperColumn(scName, new ArrayList<>());
                 thriftSuperColumns.add(new ColumnOrSuperColumn().setCounter_super_column(current));
             }
-            current.getColumns().add(thriftifySubCounter(cell).setName(SuperColumns.subName(cell.name())));
+            current.getColumns().add(thriftifySubCounter(metadata, cell).setName(cell.name.superColumnSubName()));
         }
-
-        if (reverseOrder)
-            Collections.reverse(thriftSuperColumns);
-
         return thriftSuperColumns;
     }
 
-    private Map<ByteBuffer, List<ColumnOrSuperColumn>> getSlice(List<ReadCommand> commands, boolean subColumnsOnly, org.apache.cassandra.db.ConsistencyLevel consistency_level, ClientState cState)
-    throws org.apache.cassandra.exceptions.InvalidRequestException, UnavailableException, TimedOutException
+    private List<ColumnOrSuperColumn> thriftifyPartition(RowIterator partition, boolean subcolumnsOnly, boolean reversed, int cellLimit)
     {
-        Map<DecoratedKey, ColumnFamily> columnFamilies = readColumnFamily(commands, consistency_level, cState);
-        Map<ByteBuffer, List<ColumnOrSuperColumn>> columnFamiliesMap = new HashMap<ByteBuffer, List<ColumnOrSuperColumn>>();
-        for (ReadCommand command: commands)
-        {
-            ColumnFamily cf = columnFamilies.get(StorageService.getPartitioner().decorateKey(command.key));
-            boolean reverseOrder = command instanceof SliceFromReadCommand && ((SliceFromReadCommand)command).filter.reversed;
-            List<ColumnOrSuperColumn> thriftifiedColumns = thriftifyColumnFamily(cf, subColumnsOnly, reverseOrder, command.timestamp);
-            columnFamiliesMap.put(command.key, thriftifiedColumns);
-        }
-
-        return columnFamiliesMap;
-    }
-
-    private List<ColumnOrSuperColumn> thriftifyColumnFamily(ColumnFamily cf, boolean subcolumnsOnly, boolean reverseOrder, long now)
-    {
-        if (cf == null || !cf.hasColumns())
+        if (partition.isEmpty())
             return EMPTY_COLUMNS;
 
-        if (cf.metadata().isSuper())
+        Iterator<LegacyLayout.LegacyCell> cells = LegacyLayout.fromRowIterator(partition).right;
+        List<ColumnOrSuperColumn> result;
+        if (partition.metadata().isSuper())
         {
-            boolean isCounterCF = cf.metadata().isCounter();
-            return thriftifySuperColumns(cf.getSortedColumns(), reverseOrder, now, subcolumnsOnly, isCounterCF);
+            boolean isCounterCF = partition.metadata().isCounter();
+            result = thriftifySuperColumns(partition.metadata(), cells, subcolumnsOnly, isCounterCF, reversed);
         }
         else
         {
-            return thriftifyColumns(cf.getSortedColumns(), reverseOrder, now);
+            result = thriftifyColumns(partition.metadata(), cells);
+        }
+
+        // Thrift count cells, but internally we only count them at "row" boundaries, which means that if the limit stops in the middle
+        // of an internal row we'll include a few additional cells. So trim it here.
+        return result.size() > cellLimit
+             ? result.subList(0, cellLimit)
+             : result;
+    }
+
+    private Map<ByteBuffer, List<ColumnOrSuperColumn>> getSlice(List<SinglePartitionReadCommand> commands, boolean subColumnsOnly, int cellLimit, org.apache.cassandra.db.ConsistencyLevel consistency_level, ClientState cState)
+    throws org.apache.cassandra.exceptions.InvalidRequestException, UnavailableException, TimedOutException
+    {
+        try (PartitionIterator results = read(commands, consistency_level, cState))
+        {
+            Map<ByteBuffer, List<ColumnOrSuperColumn>> columnFamiliesMap = new HashMap<>();
+            while (results.hasNext())
+            {
+                try (RowIterator iter = results.next())
+                {
+                    List<ColumnOrSuperColumn> thriftifiedColumns = thriftifyPartition(iter, subColumnsOnly, iter.isReverseOrder(), cellLimit);
+                    columnFamiliesMap.put(iter.partitionKey().getKey(), thriftifiedColumns);
+                }
+            }
+            return columnFamiliesMap;
         }
     }
 
@@ -303,7 +297,8 @@
             ClientState cState = state();
             String keyspace = cState.getKeyspace();
             state().hasColumnFamilyAccess(keyspace, column_parent.column_family, Permission.SELECT);
-            return getSliceInternal(keyspace, key, column_parent, System.currentTimeMillis(), predicate, consistency_level, cState);
+            List<ColumnOrSuperColumn> result = getSliceInternal(keyspace, key, column_parent, FBUtilities.nowInSeconds(), predicate, consistency_level, cState);
+            return result == null ? Collections.<ColumnOrSuperColumn>emptyList() : result;
         }
         catch (RequestValidationException e)
         {
@@ -318,13 +313,13 @@
     private List<ColumnOrSuperColumn> getSliceInternal(String keyspace,
                                                        ByteBuffer key,
                                                        ColumnParent column_parent,
-                                                       long timestamp,
+                                                       int nowInSec,
                                                        SlicePredicate predicate,
                                                        ConsistencyLevel consistency_level,
                                                        ClientState cState)
     throws org.apache.cassandra.exceptions.InvalidRequestException, UnavailableException, TimedOutException
     {
-        return multigetSliceInternal(keyspace, Collections.singletonList(key), column_parent, timestamp, predicate, consistency_level, cState).get(key);
+        return multigetSliceInternal(keyspace, Collections.singletonList(key), column_parent, nowInSec, predicate, consistency_level, cState).get(key);
     }
 
     public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level)
@@ -351,7 +346,7 @@
             ClientState cState = state();
             String keyspace = cState.getKeyspace();
             cState.hasColumnFamilyAccess(keyspace, column_parent.column_family, Permission.SELECT);
-            return multigetSliceInternal(keyspace, keys, column_parent, System.currentTimeMillis(), predicate, consistency_level, cState);
+            return multigetSliceInternal(keyspace, keys, column_parent, FBUtilities.nowInSeconds(), predicate, consistency_level, cState);
         }
         catch (RequestValidationException e)
         {
@@ -363,55 +358,192 @@
         }
     }
 
-    private SliceQueryFilter toInternalFilter(CFMetaData metadata, ColumnParent parent, SliceRange range)
+    private ClusteringIndexFilter toInternalFilter(CFMetaData metadata, ColumnParent parent, SliceRange range)
     {
-        if (metadata.isSuper())
-        {
-            CellNameType columnType = new SimpleDenseCellNameType(metadata.comparator.subtype(parent.isSetSuper_column() ? 1 : 0));
-            Composite start = columnType.fromByteBuffer(range.start);
-            Composite finish = columnType.fromByteBuffer(range.finish);
-            SliceQueryFilter filter = new SliceQueryFilter(start, finish, range.reversed, range.count);
-            return SuperColumns.fromSCSliceFilter(metadata.comparator, parent.bufferForSuper_column(), filter);
-        }
-
-        Composite start = metadata.comparator.fromByteBuffer(range.start);
-        Composite finish = metadata.comparator.fromByteBuffer(range.finish);
-        return new SliceQueryFilter(start, finish, range.reversed, range.count);
+        if (metadata.isSuper() && parent.isSetSuper_column())
+            return new ClusteringIndexNamesFilter(FBUtilities.singleton(new Clustering(parent.bufferForSuper_column()), metadata.comparator), range.reversed);
+        else
+            return new ClusteringIndexSliceFilter(makeSlices(metadata, range), range.reversed);
     }
 
-    private IDiskAtomFilter toInternalFilter(CFMetaData metadata, ColumnParent parent, SlicePredicate predicate)
+    private Slices makeSlices(CFMetaData metadata, SliceRange range)
     {
-        IDiskAtomFilter filter;
+        // Note that in thrift, the bounds are reversed if the query is reversed, but not internally.
+        ByteBuffer start = range.reversed ? range.finish : range.start;
+        ByteBuffer finish = range.reversed ? range.start : range.finish;
+        return Slices.with(metadata.comparator, Slice.make(LegacyLayout.decodeSliceBound(metadata, start, true).bound, LegacyLayout.decodeSliceBound(metadata, finish, false).bound));
+    }
 
-        if (predicate.column_names != null)
+    private ClusteringIndexFilter toInternalFilter(CFMetaData metadata, ColumnParent parent, SlicePredicate predicate)
+    throws org.apache.cassandra.exceptions.InvalidRequestException
+    {
+        try
         {
-            if (metadata.isSuper())
+            if (predicate.column_names != null)
             {
-                CellNameType columnType = new SimpleDenseCellNameType(metadata.comparator.subtype(parent.isSetSuper_column() ? 1 : 0));
-                SortedSet<CellName> s = new TreeSet<>(columnType);
-                for (ByteBuffer bb : predicate.column_names)
-                    s.add(columnType.cellFromByteBuffer(bb));
-                filter = SuperColumns.fromSCNamesFilter(metadata.comparator, parent.bufferForSuper_column(), new NamesQueryFilter(s));
+                if (metadata.isSuper())
+                {
+                    if (parent.isSetSuper_column())
+                    {
+                        return new ClusteringIndexNamesFilter(FBUtilities.singleton(new Clustering(parent.bufferForSuper_column()), metadata.comparator), false);
+                    }
+                    else
+                    {
+                        NavigableSet<Clustering> clusterings = new TreeSet<>(metadata.comparator);
+                        for (ByteBuffer bb : predicate.column_names)
+                            clusterings.add(new Clustering(bb));
+                        return new ClusteringIndexNamesFilter(clusterings, false);
+                    }
+                }
+                else
+                {
+                    NavigableSet<Clustering> clusterings = new TreeSet<>(metadata.comparator);
+                    for (ByteBuffer bb : predicate.column_names)
+                    {
+                        LegacyLayout.LegacyCellName name = LegacyLayout.decodeCellName(metadata, parent.bufferForSuper_column(), bb);
+
+                        if (!name.clustering.equals(Clustering.STATIC_CLUSTERING))
+                            clusterings.add(name.clustering);
+                    }
+
+                    // clusterings cannot include STATIC_CLUSTERING, so if the names filter is for static columns, clusterings
+                    // will be empty.  However, by requesting the static columns in our ColumnFilter, this will still work.
+                    return new ClusteringIndexNamesFilter(clusterings, false);
+                }
             }
             else
             {
-                SortedSet<CellName> s = new TreeSet<CellName>(metadata.comparator);
-                for (ByteBuffer bb : predicate.column_names)
-                    s.add(metadata.comparator.cellFromByteBuffer(bb));
-                filter = new NamesQueryFilter(s);
+                return toInternalFilter(metadata, parent, predicate.slice_range);
             }
         }
-        else
+        catch (UnknownColumnException e)
         {
-            filter = toInternalFilter(metadata, parent, predicate.slice_range);
+            throw new org.apache.cassandra.exceptions.InvalidRequestException(e.getMessage());
         }
-        return filter;
+    }
+
+    private ColumnFilter makeColumnFilter(CFMetaData metadata, ColumnParent parent, SliceRange range)
+    {
+        if (metadata.isSuper() && parent.isSetSuper_column())
+        {
+            // We want a slice of the dynamic columns
+            ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+            ColumnDefinition def = metadata.compactValueColumn();
+            ByteBuffer start = range.reversed ? range.finish : range.start;
+            ByteBuffer finish = range.reversed ? range.start : range.finish;
+            builder.slice(def, start.hasRemaining() ? CellPath.create(start) : CellPath.BOTTOM, finish.hasRemaining() ? CellPath.create(finish) : CellPath.TOP);
+
+            if (metadata.isDense())
+                return builder.build();
+
+            // We also want to add any staticly defined column if it's within the range
+            AbstractType<?> cmp = metadata.thriftColumnNameType();
+
+            for (ColumnDefinition column : metadata.partitionColumns())
+            {
+                if (SuperColumnCompatibility.isSuperColumnMapColumn(column))
+                    continue;
+
+                ByteBuffer name = column.name.bytes;
+                if (cmp.compare(name, start) < 0 || cmp.compare(finish, name) > 0)
+                    continue;
+
+                builder.add(column);
+            }
+            return builder.build();
+        }
+        return makeColumnFilter(metadata, makeSlices(metadata, range));
+    }
+
+    private ColumnFilter makeColumnFilter(CFMetaData metadata, Slices slices)
+    {
+        PartitionColumns columns = metadata.partitionColumns();
+        if (metadata.isStaticCompactTable() && !columns.statics.isEmpty())
+        {
+            PartitionColumns.Builder builder = PartitionColumns.builder();
+            builder.addAll(columns.regulars);
+            // We only want to include the static columns that are selected by the slices
+            for (ColumnDefinition def : columns.statics)
+            {
+                if (slices.selects(new Clustering(def.name.bytes)))
+                    builder.add(def);
+            }
+            columns = builder.build();
+        }
+        return ColumnFilter.selection(columns);
+    }
+
+    private ColumnFilter makeColumnFilter(CFMetaData metadata, ColumnParent parent, SlicePredicate predicate)
+    throws org.apache.cassandra.exceptions.InvalidRequestException
+    {
+        try
+        {
+            if (predicate.column_names != null)
+            {
+                if (metadata.isSuper())
+                {
+                    if (parent.isSetSuper_column())
+                    {
+                        ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+                        ColumnDefinition dynamicDef = metadata.compactValueColumn();
+                        for (ByteBuffer bb : predicate.column_names)
+                        {
+                            ColumnDefinition staticDef = metadata.getColumnDefinition(bb);
+                            if (staticDef == null)
+                                builder.select(dynamicDef, CellPath.create(bb));
+                            else
+                                builder.add(staticDef);
+                        }
+                        return builder.build();
+                    }
+                    else
+                    {
+                        return ColumnFilter.all(metadata);
+                    }
+                }
+                else
+                {
+                    PartitionColumns.Builder builder = PartitionColumns.builder();
+                    for (ByteBuffer bb : predicate.column_names)
+                    {
+                        LegacyLayout.LegacyCellName name = LegacyLayout.decodeCellName(metadata, parent.bufferForSuper_column(), bb);
+                        builder.add(name.column);
+                    }
+
+                    if (metadata.isStaticCompactTable())
+                        builder.add(metadata.compactValueColumn());
+
+                    return ColumnFilter.selection(builder.build());
+                }
+            }
+            else
+            {
+                return makeColumnFilter(metadata, parent, predicate.slice_range);
+            }
+        }
+        catch (UnknownColumnException e)
+        {
+            throw new org.apache.cassandra.exceptions.InvalidRequestException(e.getMessage());
+        }
+    }
+
+    private DataLimits getLimits(int partitionLimit, boolean countSuperColumns, SlicePredicate predicate)
+    {
+        int cellsPerPartition = predicate.slice_range == null ? Integer.MAX_VALUE : predicate.slice_range.count;
+        return getLimits(partitionLimit, countSuperColumns, cellsPerPartition);
+    }
+
+    private DataLimits getLimits(int partitionLimit, boolean countSuperColumns, int perPartitionCount)
+    {
+        return countSuperColumns
+             ? DataLimits.superColumnCountingLimits(partitionLimit, perPartitionCount)
+             : DataLimits.thriftLimits(partitionLimit, perPartitionCount);
     }
 
     private Map<ByteBuffer, List<ColumnOrSuperColumn>> multigetSliceInternal(String keyspace,
                                                                              List<ByteBuffer> keys,
                                                                              ColumnParent column_parent,
-                                                                             long timestamp,
+                                                                             int nowInSec,
                                                                              SlicePredicate predicate,
                                                                              ConsistencyLevel consistency_level,
                                                                              ClientState cState)
@@ -424,18 +556,19 @@
         org.apache.cassandra.db.ConsistencyLevel consistencyLevel = ThriftConversion.fromThrift(consistency_level);
         consistencyLevel.validateForRead(keyspace);
 
-        List<ReadCommand> commands = new ArrayList<ReadCommand>(keys.size());
-        IDiskAtomFilter filter = toInternalFilter(metadata, column_parent, predicate);
+        List<SinglePartitionReadCommand> commands = new ArrayList<>(keys.size());
+        ColumnFilter columnFilter = makeColumnFilter(metadata, column_parent, predicate);
+        ClusteringIndexFilter filter = toInternalFilter(metadata, column_parent, predicate);
+        DataLimits limits = getLimits(1, metadata.isSuper() && !column_parent.isSetSuper_column(), predicate);
 
         for (ByteBuffer key: keys)
         {
             ThriftValidation.validateKey(metadata, key);
-            // Note that we should not share a slice filter amongst the command, due to SliceQueryFilter not  being immutable
-            // due to its columnCounter used by the lastCounted() method (also see SelectStatement.getSliceCommands)
-            commands.add(ReadCommand.create(keyspace, key, column_parent.getColumn_family(), timestamp, filter.cloneShallow()));
+            DecoratedKey dk = metadata.decorateKey(key);
+            commands.add(SinglePartitionReadCommand.create(true, metadata, nowInSec, columnFilter, RowFilter.NONE, limits, dk, filter));
         }
 
-        return getSlice(commands, column_parent.isSetSuper_column(), consistencyLevel, cState);
+        return getSlice(commands, column_parent.isSetSuper_column(), limits.perPartitionCount(), consistencyLevel, cState);
     }
 
     public ColumnOrSuperColumn get(ByteBuffer key, ColumnPath column_path, ConsistencyLevel consistency_level)
@@ -466,35 +599,70 @@
 
             ThriftValidation.validateKey(metadata, key);
 
-            IDiskAtomFilter filter;
+            ColumnFilter columns;
+            ClusteringIndexFilter filter;
             if (metadata.isSuper())
             {
-                CellNameType columnType = new SimpleDenseCellNameType(metadata.comparator.subtype(column_path.column == null ? 0 : 1));
-                SortedSet<CellName> names = new TreeSet<CellName>(columnType);
-                names.add(columnType.cellFromByteBuffer(column_path.column == null ? column_path.super_column : column_path.column));
-                filter = SuperColumns.fromSCNamesFilter(metadata.comparator, column_path.column == null ? null : column_path.bufferForSuper_column(), new NamesQueryFilter(names));
+                if (column_path.column == null)
+                {
+                    // Selects a full super column
+                    columns = ColumnFilter.all(metadata);
+                }
+                else
+                {
+                    // Selects a single column within a super column
+                    ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+                    ColumnDefinition staticDef = metadata.getColumnDefinition(column_path.column);
+                    ColumnDefinition dynamicDef = metadata.compactValueColumn();
+
+                    if (staticDef != null)
+                        builder.add(staticDef);
+                    // Note that even if there is a staticDef, we still query the dynamicDef since we can't guarantee the static one hasn't
+                    // been created after data has been inserted for that definition
+                    builder.select(dynamicDef, CellPath.create(column_path.column));
+                    columns = builder.build();
+                }
+                filter = new ClusteringIndexNamesFilter(FBUtilities.singleton(new Clustering(column_path.super_column), metadata.comparator),
+                                                  false);
             }
             else
             {
-                SortedSet<CellName> names = new TreeSet<CellName>(metadata.comparator);
-                names.add(metadata.comparator.cellFromByteBuffer(column_path.column));
-                filter = new NamesQueryFilter(names);
+                LegacyLayout.LegacyCellName cellname = LegacyLayout.decodeCellName(metadata, column_path.super_column, column_path.column);
+                if (cellname.clustering == Clustering.STATIC_CLUSTERING)
+                {
+                    // Same as above: even if we're querying a static column, we still query the equivalent dynamic column and value as some
+                    // values might have been created post creation of the column (ThriftResultMerger then ensures we get only one result).
+                    ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
+                    builder.add(cellname.column);
+                    builder.add(metadata.compactValueColumn());
+                    columns = builder.build();
+                    filter = new ClusteringIndexNamesFilter(FBUtilities.singleton(new Clustering(column_path.column), metadata.comparator), false);
+                }
+                else
+                {
+                    columns = ColumnFilter.selection(PartitionColumns.of(cellname.column));
+                    filter = new ClusteringIndexNamesFilter(FBUtilities.singleton(cellname.clustering, metadata.comparator), false);
+                }
             }
 
-            long now = System.currentTimeMillis();
-            ReadCommand command = ReadCommand.create(keyspace, key, column_path.column_family, now, filter);
+            DecoratedKey dk = metadata.decorateKey(key);
+            SinglePartitionReadCommand command = SinglePartitionReadCommand.create(true, metadata, FBUtilities.nowInSeconds(), columns, RowFilter.NONE, DataLimits.NONE, dk, filter);
 
-            Map<DecoratedKey, ColumnFamily> cfamilies = readColumnFamily(Arrays.asList(command), consistencyLevel, cState);
+            try (RowIterator result = PartitionIterators.getOnlyElement(read(Arrays.asList(command), consistencyLevel, cState), command))
+            {
+                if (!result.hasNext())
+                    throw new NotFoundException();
 
-            ColumnFamily cf = cfamilies.get(StorageService.getPartitioner().decorateKey(command.key));
-
-            if (cf == null)
-                throw new NotFoundException();
-            List<ColumnOrSuperColumn> tcolumns = thriftifyColumnFamily(cf, metadata.isSuper() && column_path.column != null, false, now);
-            if (tcolumns.isEmpty())
-                throw new NotFoundException();
-            assert tcolumns.size() == 1;
-            return tcolumns.get(0);
+                List<ColumnOrSuperColumn> tcolumns = thriftifyPartition(result, metadata.isSuper() && column_path.column != null, result.isReverseOrder(), 1);
+                if (tcolumns.isEmpty())
+                    throw new NotFoundException();
+                assert tcolumns.size() == 1;
+                return tcolumns.get(0);
+            }
+        }
+        catch (UnknownColumnException e)
+        {
+            throw new InvalidRequestException(e.getMessage());
         }
         catch (RequestValidationException e)
         {
@@ -529,16 +697,16 @@
             cState.hasColumnFamilyAccess(keyspace, column_parent.column_family, Permission.SELECT);
             Keyspace keyspaceName = Keyspace.open(keyspace);
             ColumnFamilyStore cfs = keyspaceName.getColumnFamilyStore(column_parent.column_family);
-            long timestamp = System.currentTimeMillis();
+            int nowInSec = FBUtilities.nowInSeconds();
 
             if (predicate.column_names != null)
-                return getSliceInternal(keyspace, key, column_parent, timestamp, predicate, consistency_level, cState).size();
+                return getSliceInternal(keyspace, key, column_parent, nowInSec, predicate, consistency_level, cState).size();
 
             int pageSize;
             // request by page if this is a large row
             if (cfs.getMeanColumns() > 0)
             {
-                int averageColumnSize = (int) (cfs.metric.meanRowSize.getValue() / cfs.getMeanColumns());
+                int averageColumnSize = (int) (cfs.metric.meanPartitionSize.getValue() / cfs.getMeanColumns());
                 pageSize = Math.min(COUNT_PAGE_SIZE, 4 * 1024 * 1024 / averageColumnSize);
                 pageSize = Math.max(2, pageSize);
                 logger.trace("average row column size is {}; using pageSize of {}", averageColumnSize, pageSize);
@@ -551,16 +719,35 @@
             SliceRange sliceRange = predicate.slice_range == null
                                   ? new SliceRange(ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, Integer.MAX_VALUE)
                                   : predicate.slice_range;
-            SliceQueryFilter filter = toInternalFilter(cfs.metadata, column_parent, sliceRange);
 
-            return QueryPagers.countPaged(keyspace,
-                                          column_parent.column_family,
-                                          key,
+            ColumnFilter columnFilter;
+            ClusteringIndexFilter filter;
+            CFMetaData metadata = cfs.metadata;
+            if (metadata.isSuper() && !column_parent.isSetSuper_column())
+            {
+                // If we count on a super column table without having set the super column name, we're in fact interested by the count of super columns
+                columnFilter = ColumnFilter.all(metadata);
+                filter = new ClusteringIndexSliceFilter(makeSlices(metadata, sliceRange), sliceRange.reversed);
+            }
+            else
+            {
+                columnFilter = makeColumnFilter(metadata, column_parent, sliceRange);
+                filter = toInternalFilter(metadata, column_parent, sliceRange);
+            }
+
+            DataLimits limits = getLimits(1, metadata.isSuper() && !column_parent.isSetSuper_column(), predicate);
+            DecoratedKey dk = metadata.decorateKey(key);
+
+            return QueryPagers.countPaged(metadata,
+                                          dk,
+                                          columnFilter,
                                           filter,
+                                          limits,
                                           ThriftConversion.fromThrift(consistency_level),
                                           cState,
                                           pageSize,
-                                          timestamp);
+                                          nowInSec,
+                                          true);
         }
         catch (IllegalArgumentException e)
         {
@@ -608,11 +795,11 @@
             String keyspace = cState.getKeyspace();
             cState.hasColumnFamilyAccess(keyspace, column_parent.column_family, Permission.SELECT);
 
-            Map<ByteBuffer, Integer> counts = new HashMap<ByteBuffer, Integer>();
+            Map<ByteBuffer, Integer> counts = new HashMap<>();
             Map<ByteBuffer, List<ColumnOrSuperColumn>> columnFamiliesMap = multigetSliceInternal(keyspace,
                                                                                                  keys,
                                                                                                  column_parent,
-                                                                                                 System.currentTimeMillis(),
+                                                                                                 FBUtilities.nowInSeconds(),
                                                                                                  predicate,
                                                                                                  consistency_level,
                                                                                                  cState);
@@ -631,6 +818,14 @@
         }
     }
 
+    private Cell cellFromColumn(CFMetaData metadata, LegacyLayout.LegacyCellName name, Column column)
+    {
+        CellPath path = name.collectionElement == null ? null : CellPath.create(name.collectionElement);
+        return column.ttl == 0
+             ? BufferCell.live(metadata, name.column, column.timestamp, column.value, path)
+             : BufferCell.expiring(name.column, column.timestamp, column.ttl, FBUtilities.nowInSeconds(), column.value, path);
+    }
+
     private void internal_insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level)
     throws RequestValidationException, UnavailableException, TimedOutException
     {
@@ -639,36 +834,36 @@
         cState.hasColumnFamilyAccess(keyspace, column_parent.column_family, Permission.MODIFY);
 
         CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, column_parent.column_family, false);
+        if (metadata.isView())
+            throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot modify Materialized Views directly");
+
         ThriftValidation.validateKey(metadata, key);
         ThriftValidation.validateColumnParent(metadata, column_parent);
         // SuperColumn field is usually optional, but not when we're inserting
-        if (metadata.cfType == ColumnFamilyType.Super && column_parent.super_column == null)
+        if (metadata.isSuper() && column_parent.super_column == null)
         {
             throw new org.apache.cassandra.exceptions.InvalidRequestException("missing mandatory super column name for super CF " + column_parent.column_family);
         }
-        ThriftValidation.validateColumnNames(metadata, column_parent, Arrays.asList(column.name));
-        ThriftValidation.validateColumnData(metadata, key, column_parent.super_column, column);
+        ThriftValidation.validateColumnNames(metadata, column_parent, Collections.singletonList(column.name));
+        ThriftValidation.validateColumnData(metadata, column_parent.super_column, column);
 
         org.apache.cassandra.db.Mutation mutation;
         try
         {
-            CellName name = metadata.isSuper()
-                          ? metadata.comparator.makeCellName(column_parent.super_column, column.name)
-                          : metadata.comparator.cellFromByteBuffer(column.name);
+            LegacyLayout.LegacyCellName name = LegacyLayout.decodeCellName(metadata, column_parent.super_column, column.name);
+            Cell cell = cellFromColumn(metadata, name, column);
+            PartitionUpdate update = PartitionUpdate.singleRowUpdate(metadata, key, BTreeRow.singleCellRow(name.clustering, cell));
 
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cState.getKeyspace(), column_parent.column_family);
-            cf.addColumn(name, column.value, column.timestamp, column.ttl);
+            // Indexed column values cannot be larger than 64K.  See CASSANDRA-3057/4240 for more details
+            Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validate(update);
 
-            // Validate row level indexes. See CASSANDRA-10092 for more details.
-            Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validateRowLevelIndexes(key, cf);
-
-            mutation = new org.apache.cassandra.db.Mutation(cState.getKeyspace(), key, cf);
+            mutation = new org.apache.cassandra.db.Mutation(update);
         }
-        catch (MarshalException e)
+        catch (MarshalException|UnknownColumnException e)
         {
             throw new org.apache.cassandra.exceptions.InvalidRequestException(e.getMessage());
         }
-        doInsert(consistency_level, Arrays.asList(mutation));
+        doInsert(consistency_level, Collections.singletonList(mutation));
     }
 
     public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level)
@@ -736,52 +931,42 @@
             cState.hasColumnFamilyAccess(keyspace, column_family, Permission.SELECT);
 
             CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, column_family, false);
+            if (metadata.isView())
+                throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot modify Materialized Views directly");
+
             ThriftValidation.validateKey(metadata, key);
-            if (metadata.cfType == ColumnFamilyType.Super)
+            if (metadata.isSuper())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("CAS does not support supercolumns");
 
-            Iterable<ByteBuffer> names = Iterables.transform(updates, new Function<Column, ByteBuffer>()
-            {
-                public ByteBuffer apply(Column column)
-                {
-                    return column.name;
-                }
-            });
+            Iterable<ByteBuffer> names = Iterables.transform(updates, column -> column.name);
             ThriftValidation.validateColumnNames(metadata, new ColumnParent(column_family), names);
             for (Column column : updates)
-                ThriftValidation.validateColumnData(metadata, key, null, column);
+                ThriftValidation.validateColumnData(metadata, null, column);
 
-            CFMetaData cfm = Schema.instance.getCFMetaData(cState.getKeyspace(), column_family);
-            ColumnFamily cfUpdates = ArrayBackedSortedColumns.factory.create(cfm);
-            for (Column column : updates)
-                cfUpdates.addColumn(cfm.comparator.cellFromByteBuffer(column.name), column.value, column.timestamp);
+            DecoratedKey dk = metadata.decorateKey(key);
+            int nowInSec = FBUtilities.nowInSeconds();
 
-            // Validate row level indexes. See CASSANDRA-10092 for more details.
-            Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validateRowLevelIndexes(key, cfUpdates);
-
-            ColumnFamily cfExpected;
-            if (expected.isEmpty())
-            {
-                cfExpected = null;
-            }
-            else
-            {
-                cfExpected = ArrayBackedSortedColumns.factory.create(cfm);
-                for (Column column : expected)
-                    cfExpected.addColumn(cfm.comparator.cellFromByteBuffer(column.name), column.value, column.timestamp);
-            }
+            PartitionUpdate partitionUpdates = PartitionUpdate.fromIterator(LegacyLayout.toRowIterator(metadata, dk, toLegacyCells(metadata, updates, nowInSec).iterator(), nowInSec));
+            // Indexed column values cannot be larger than 64K.  See CASSANDRA-3057/4240 for more details
+            Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validate(partitionUpdates);
 
             schedule(DatabaseDescriptor.getWriteRpcTimeout());
-            ColumnFamily result = StorageProxy.cas(cState.getKeyspace(),
-                                                   column_family,
-                                                   key,
-                                                   new ThriftCASRequest(cfExpected, cfUpdates),
-                                                   ThriftConversion.fromThrift(serial_consistency_level),
-                                                   ThriftConversion.fromThrift(commit_consistency_level),
-                                                   cState);
-            return result == null
-                 ? new CASResult(true)
-                 : new CASResult(false).setCurrent_values(thriftifyColumnsAsColumns(result.getSortedColumns(), System.currentTimeMillis()));
+            try (RowIterator result = StorageProxy.cas(cState.getKeyspace(),
+                                                       column_family,
+                                                       dk,
+                                                       new ThriftCASRequest(toLegacyCells(metadata, expected, nowInSec), partitionUpdates, nowInSec),
+                                                       ThriftConversion.fromThrift(serial_consistency_level),
+                                                       ThriftConversion.fromThrift(commit_consistency_level),
+                                                       cState))
+            {
+                return result == null
+                     ? new CASResult(true)
+                     : new CASResult(false).setCurrent_values(thriftifyColumnsAsColumns(metadata, LegacyLayout.fromRowIterator(result).right));
+            }
+        }
+        catch (UnknownColumnException e)
+        {
+            throw new InvalidRequestException(e.getMessage());
         }
         catch (RequestTimeoutException e)
         {
@@ -801,14 +986,124 @@
         }
     }
 
+    private LegacyLayout.LegacyCell toLegacyCell(CFMetaData metadata, Column column, int nowInSec) throws UnknownColumnException
+    {
+        return toLegacyCell(metadata, null, column, nowInSec);
+    }
+
+    private LegacyLayout.LegacyCell toLegacyCell(CFMetaData metadata, ByteBuffer superColumnName, Column column, int nowInSec)
+    throws UnknownColumnException
+    {
+        return column.ttl > 0
+             ? LegacyLayout.LegacyCell.expiring(metadata, superColumnName, column.name, column.value, column.timestamp, column.ttl, nowInSec)
+             : LegacyLayout.LegacyCell.regular(metadata, superColumnName, column.name, column.value, column.timestamp);
+    }
+
+    private LegacyLayout.LegacyCell toLegacyDeletion(CFMetaData metadata, ByteBuffer name, long timestamp, int nowInSec)
+    throws UnknownColumnException
+    {
+        return toLegacyDeletion(metadata, null, name, timestamp, nowInSec);
+    }
+
+    private LegacyLayout.LegacyCell toLegacyDeletion(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long timestamp, int nowInSec)
+    throws UnknownColumnException
+    {
+        return LegacyLayout.LegacyCell.tombstone(metadata, superColumnName, name, timestamp, nowInSec);
+    }
+
+    private LegacyLayout.LegacyCell toCounterLegacyCell(CFMetaData metadata, CounterColumn column)
+    throws UnknownColumnException
+    {
+        return toCounterLegacyCell(metadata, null, column);
+    }
+
+    private LegacyLayout.LegacyCell toCounterLegacyCell(CFMetaData metadata, ByteBuffer superColumnName, CounterColumn column)
+    throws UnknownColumnException
+    {
+        return LegacyLayout.LegacyCell.counterUpdate(metadata, superColumnName, column.name, column.value);
+    }
+
+    private void sortAndMerge(CFMetaData metadata, List<LegacyLayout.LegacyCell> cells, int nowInSec)
+    {
+        Collections.sort(cells, LegacyLayout.legacyCellComparator(metadata));
+
+        // After sorting, if we have multiple cells for the same "cellname", we want to merge those together.
+        Comparator<LegacyLayout.LegacyCellName> comparator = LegacyLayout.legacyCellNameComparator(metadata, false);
+
+        int previous = 0; // The last element that was set
+        for (int current = 1; current < cells.size(); current++)
+        {
+            LegacyLayout.LegacyCell pc = cells.get(previous);
+            LegacyLayout.LegacyCell cc = cells.get(current);
+
+            // There is really only 2 possible comparison: < 0 or == 0 since we've sorted already
+            int cmp = comparator.compare(pc.name, cc.name);
+            if (cmp == 0)
+            {
+                // current and previous are the same cell. Merge current into previous
+                // (and so previous + 1 will be "free").
+                Conflicts.Resolution res;
+                if (metadata.isCounter())
+                {
+                    res = Conflicts.resolveCounter(pc.timestamp, pc.isLive(nowInSec), pc.value,
+                                                   cc.timestamp, cc.isLive(nowInSec), cc.value);
+
+                }
+                else
+                {
+                    res = Conflicts.resolveRegular(pc.timestamp, pc.isLive(nowInSec), pc.localDeletionTime, pc.value,
+                                                   cc.timestamp, cc.isLive(nowInSec), cc.localDeletionTime, cc.value);
+                }
+
+                switch (res)
+                {
+                    case LEFT_WINS:
+                        // The previous cell wins, we'll just ignore current
+                        break;
+                    case RIGHT_WINS:
+                        cells.set(previous, cc);
+                        break;
+                    case MERGE:
+                        assert metadata.isCounter();
+                        ByteBuffer merged = Conflicts.mergeCounterValues(pc.value, cc.value);
+                        cells.set(previous, LegacyLayout.LegacyCell.counter(pc.name, merged));
+                        break;
+                }
+            }
+            else
+            {
+                // cell.get(previous) < cells.get(current), so move current just after previous if needs be
+                ++previous;
+                if (previous != current)
+                    cells.set(previous, cc);
+            }
+        }
+
+        // The last element we want is previous, so trim anything after that
+        for (int i = cells.size() - 1; i > previous; i--)
+            cells.remove(i);
+    }
+
+    private List<LegacyLayout.LegacyCell> toLegacyCells(CFMetaData metadata, List<Column> columns, int nowInSec)
+    throws UnknownColumnException
+    {
+        List<LegacyLayout.LegacyCell> cells = new ArrayList<>(columns.size());
+        for (Column column : columns)
+            cells.add(toLegacyCell(metadata, column, nowInSec));
+
+        sortAndMerge(metadata, cells, nowInSec);
+        return cells;
+    }
+
     private List<IMutation> createMutationList(ConsistencyLevel consistency_level,
                                                Map<ByteBuffer,Map<String,List<Mutation>>> mutation_map,
                                                boolean allowCounterMutations)
-    throws RequestValidationException
+    throws RequestValidationException, InvalidRequestException
     {
         List<IMutation> mutations = new ArrayList<>();
         ThriftClientState cState = state();
         String keyspace = cState.getKeyspace();
+        int nowInSec = FBUtilities.nowInSeconds();
 
         for (Map.Entry<ByteBuffer, Map<String, List<Mutation>>> mutationEntry: mutation_map.entrySet())
         {
@@ -823,42 +1118,53 @@
             for (Map.Entry<String, List<Mutation>> columnFamilyMutations : columnFamilyToMutations.entrySet())
             {
                 String cfName = columnFamilyMutations.getKey();
+                List<Mutation> muts = columnFamilyMutations.getValue();
 
                 cState.hasColumnFamilyAccess(keyspace, cfName, Permission.MODIFY);
 
                 CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, cfName);
+                if (metadata.isView())
+                    throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot modify Materialized Views directly");
+
                 ThriftValidation.validateKey(metadata, key);
+                if (metadata.isCounter())
+                    ThriftConversion.fromThrift(consistency_level).validateCounterForWrite(metadata);
+
+                LegacyLayout.LegacyDeletionInfo delInfo = LegacyLayout.LegacyDeletionInfo.live();
+                List<LegacyLayout.LegacyCell> cells = new ArrayList<>();
+                for (Mutation m : muts)
+                {
+                    ThriftValidation.validateMutation(metadata, m);
+
+                    if (m.deletion != null)
+                    {
+                        deleteColumnOrSuperColumn(delInfo, cells, metadata, m.deletion, nowInSec);
+                    }
+                    if (m.column_or_supercolumn != null)
+                    {
+                        addColumnOrSuperColumn(cells, metadata, m.column_or_supercolumn, nowInSec);
+                    }
+                }
+
+                sortAndMerge(metadata, cells, nowInSec);
+                DecoratedKey dk = metadata.decorateKey(key);
+                PartitionUpdate update = PartitionUpdate.fromIterator(LegacyLayout.toUnfilteredRowIterator(metadata, dk, delInfo, cells.iterator()));
+
+                // Indexed column values cannot be larger than 64K.  See CASSANDRA-3057/4240 for more details
+                Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validate(update);
 
                 org.apache.cassandra.db.Mutation mutation;
                 if (metadata.isCounter())
                 {
-                    ThriftConversion.fromThrift(consistency_level).validateCounterForWrite(metadata);
-                    counterMutation = counterMutation == null ? new org.apache.cassandra.db.Mutation(keyspace, key) : counterMutation;
+                    counterMutation = counterMutation == null ? new org.apache.cassandra.db.Mutation(keyspace, dk) : counterMutation;
                     mutation = counterMutation;
                 }
                 else
                 {
-                    standardMutation = standardMutation == null ? new org.apache.cassandra.db.Mutation(keyspace, key) : standardMutation;
+                    standardMutation = standardMutation == null ? new org.apache.cassandra.db.Mutation(keyspace, dk) : standardMutation;
                     mutation = standardMutation;
                 }
-
-                for (Mutation m : columnFamilyMutations.getValue())
-                {
-                    ThriftValidation.validateMutation(metadata, key, m);
-
-                    if (m.deletion != null)
-                    {
-                        deleteColumnOrSuperColumn(mutation, metadata, m.deletion);
-                    }
-                    if (m.column_or_supercolumn != null)
-                    {
-                        addColumnOrSuperColumn(mutation, metadata, m.column_or_supercolumn);
-                    }
-                }
-
-                // Validate row level indexes. See CASSANDRA-10092 for more details.
-                ColumnFamily cf = mutation.addOrGet(metadata);
-                Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validateRowLevelIndexes(key, cf);
+                mutation.add(update);
             }
             if (standardMutation != null && !standardMutation.isEmpty())
                 mutations.add(standardMutation);
@@ -875,70 +1181,88 @@
         return mutations;
     }
 
-    private void addColumnOrSuperColumn(org.apache.cassandra.db.Mutation mutation, CFMetaData cfm, ColumnOrSuperColumn cosc)
+    private void addColumnOrSuperColumn(List<LegacyLayout.LegacyCell> cells, CFMetaData cfm, ColumnOrSuperColumn cosc, int nowInSec)
+    throws InvalidRequestException
     {
-        if (cosc.super_column != null)
+        try
         {
-            for (Column column : cosc.super_column.columns)
+            if (cosc.super_column != null)
             {
-                mutation.add(cfm.cfName, cfm.comparator.makeCellName(cosc.super_column.name, column.name), column.value, column.timestamp, column.ttl);
+                for (Column column : cosc.super_column.columns)
+                    cells.add(toLegacyCell(cfm, cosc.super_column.name, column, nowInSec));
+            }
+            else if (cosc.column != null)
+            {
+                cells.add(toLegacyCell(cfm, cosc.column, nowInSec));
+            }
+            else if (cosc.counter_super_column != null)
+            {
+                for (CounterColumn column : cosc.counter_super_column.columns)
+                    cells.add(toCounterLegacyCell(cfm, cosc.counter_super_column.name, column));
+            }
+            else // cosc.counter_column != null
+            {
+                cells.add(toCounterLegacyCell(cfm, cosc.counter_column));
             }
         }
-        else if (cosc.column != null)
+        catch (UnknownColumnException e)
         {
-            mutation.add(cfm.cfName, cfm.comparator.cellFromByteBuffer(cosc.column.name), cosc.column.value, cosc.column.timestamp, cosc.column.ttl);
-        }
-        else if (cosc.counter_super_column != null)
-        {
-            for (CounterColumn column : cosc.counter_super_column.columns)
-            {
-                mutation.addCounter(cfm.cfName, cfm.comparator.makeCellName(cosc.counter_super_column.name, column.name), column.value);
-            }
-        }
-        else // cosc.counter_column != null
-        {
-            mutation.addCounter(cfm.cfName, cfm.comparator.cellFromByteBuffer(cosc.counter_column.name), cosc.counter_column.value);
+            throw new InvalidRequestException(e.getMessage());
         }
     }
 
-    private void deleteColumnOrSuperColumn(org.apache.cassandra.db.Mutation mutation, CFMetaData cfm, Deletion del)
+    private void addRange(CFMetaData cfm, LegacyLayout.LegacyDeletionInfo delInfo, Slice.Bound start, Slice.Bound end, long timestamp, int nowInSec)
+    {
+        delInfo.add(cfm, new RangeTombstone(Slice.make(start, end), new DeletionTime(timestamp, nowInSec)));
+    }
+
+    private void deleteColumnOrSuperColumn(LegacyLayout.LegacyDeletionInfo delInfo, List<LegacyLayout.LegacyCell> cells, CFMetaData cfm, Deletion del, int nowInSec)
+    throws InvalidRequestException
     {
         if (del.predicate != null && del.predicate.column_names != null)
         {
             for (ByteBuffer c : del.predicate.column_names)
             {
-                if (del.super_column == null && cfm.isSuper())
-                    mutation.deleteRange(cfm.cfName, SuperColumns.startOf(c), SuperColumns.endOf(c), del.timestamp);
-                else if (del.super_column != null)
-                    mutation.delete(cfm.cfName, cfm.comparator.makeCellName(del.super_column, c), del.timestamp);
-                else
-                    mutation.delete(cfm.cfName, cfm.comparator.cellFromByteBuffer(c), del.timestamp);
+                try
+                {
+                    if (del.super_column == null && cfm.isSuper())
+                        addRange(cfm, delInfo, Slice.Bound.inclusiveStartOf(c), Slice.Bound.inclusiveEndOf(c), del.timestamp, nowInSec);
+                    else if (del.super_column != null)
+                        cells.add(toLegacyDeletion(cfm, del.super_column, c, del.timestamp, nowInSec));
+                    else
+                        cells.add(toLegacyDeletion(cfm, c, del.timestamp, nowInSec));
+                }
+                catch (UnknownColumnException e)
+                {
+                    throw new InvalidRequestException(e.getMessage());
+                }
             }
         }
         else if (del.predicate != null && del.predicate.slice_range != null)
         {
-            if (del.super_column == null && cfm.isSuper())
-                mutation.deleteRange(cfm.cfName,
-                                     SuperColumns.startOf(del.predicate.getSlice_range().start),
-                                     SuperColumns.endOf(del.predicate.getSlice_range().finish),
-                                     del.timestamp);
-            else if (del.super_column != null)
-                mutation.deleteRange(cfm.cfName,
-                                     cfm.comparator.makeCellName(del.super_column, del.predicate.getSlice_range().start),
-                                     cfm.comparator.makeCellName(del.super_column, del.predicate.getSlice_range().finish),
-                                     del.timestamp);
+            if (del.super_column == null)
+            {
+                LegacyLayout.LegacyBound start = LegacyLayout.decodeTombstoneBound(cfm, del.predicate.getSlice_range().start, true);
+                LegacyLayout.LegacyBound end = LegacyLayout.decodeTombstoneBound(cfm, del.predicate.getSlice_range().finish, false);
+                delInfo.add(cfm, new LegacyLayout.LegacyRangeTombstone(start, end, new DeletionTime(del.timestamp, nowInSec)));
+            }
             else
-                mutation.deleteRange(cfm.cfName,
-                                     cfm.comparator.fromByteBuffer(del.predicate.getSlice_range().start),
-                                     cfm.comparator.fromByteBuffer(del.predicate.getSlice_range().finish),
-                                     del.timestamp);
+            {
+                // Since we use a map for subcolumns, we would need range tombstone for collections to support this.
+                // And while we may want those some day, this require a bit of additional work. And since super columns
+                // are basically deprecated since a long time, and range tombstone on them has been only very recently
+                // added so that no thrift driver actually supports it to the best of my knowledge, it's likely ok to
+                // discontinue support for this. If it turns out that this is blocking the update of someone, we can
+                // decide then if we want to tackle the addition of range tombstone for collections then.
+                throw new InvalidRequestException("Cannot delete a range of subcolumns in a super column");
+            }
         }
         else
         {
             if (del.super_column != null)
-                mutation.deleteRange(cfm.cfName, SuperColumns.startOf(del.super_column), SuperColumns.endOf(del.super_column), del.timestamp);
+                addRange(cfm, delInfo, Slice.Bound.inclusiveStartOf(del.super_column), Slice.Bound.inclusiveEndOf(del.super_column), del.timestamp, nowInSec);
             else
-                mutation.delete(cfm.cfName, del.timestamp);
+                delInfo.add(new DeletionTime(del.timestamp, nowInSec));
         }
     }
 
@@ -1016,25 +1340,48 @@
         cState.hasColumnFamilyAccess(keyspace, column_path.column_family, Permission.MODIFY);
 
         CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, column_path.column_family, isCommutativeOp);
+        if (metadata.isView())
+            throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot modify Materialized Views directly");
+
         ThriftValidation.validateKey(metadata, key);
         ThriftValidation.validateColumnPathOrParent(metadata, column_path);
         if (isCommutativeOp)
             ThriftConversion.fromThrift(consistency_level).validateCounterForWrite(metadata);
 
-        org.apache.cassandra.db.Mutation mutation = new org.apache.cassandra.db.Mutation(keyspace, key);
+        DecoratedKey dk = metadata.decorateKey(key);
+
+        int nowInSec = FBUtilities.nowInSeconds();
+        PartitionUpdate update;
         if (column_path.super_column == null && column_path.column == null)
-            mutation.delete(column_path.column_family, timestamp);
-        else if (column_path.super_column == null)
-            mutation.delete(column_path.column_family, metadata.comparator.cellFromByteBuffer(column_path.column), timestamp);
-        else if (column_path.column == null)
-            mutation.deleteRange(column_path.column_family, SuperColumns.startOf(column_path.super_column), SuperColumns.endOf(column_path.super_column), timestamp);
+        {
+            update = PartitionUpdate.fullPartitionDelete(metadata, dk, timestamp, nowInSec);
+        }
+        else if (column_path.super_column != null && column_path.column == null)
+        {
+            Row row = BTreeRow.emptyDeletedRow(new Clustering(column_path.super_column), Row.Deletion.regular(new DeletionTime(timestamp, nowInSec)));
+            update = PartitionUpdate.singleRowUpdate(metadata, dk, row);
+        }
         else
-            mutation.delete(column_path.column_family, metadata.comparator.makeCellName(column_path.super_column, column_path.column), timestamp);
+        {
+            try
+            {
+                LegacyLayout.LegacyCellName name = LegacyLayout.decodeCellName(metadata, column_path.super_column, column_path.column);
+                CellPath path = name.collectionElement == null ? null : CellPath.create(name.collectionElement);
+                Cell cell = BufferCell.tombstone(name.column, timestamp, nowInSec, path);
+                update = PartitionUpdate.singleRowUpdate(metadata, dk, BTreeRow.singleCellRow(name.clustering, cell));
+            }
+            catch (UnknownColumnException e)
+            {
+                throw new org.apache.cassandra.exceptions.InvalidRequestException(e.getMessage());
+            }
+        }
+
+        org.apache.cassandra.db.Mutation mutation = new org.apache.cassandra.db.Mutation(update);
 
         if (isCommutativeOp)
-            doInsert(consistency_level, Arrays.asList(new CounterMutation(mutation, ThriftConversion.fromThrift(consistency_level))));
+            doInsert(consistency_level, Collections.singletonList(new CounterMutation(mutation, ThriftConversion.fromThrift(consistency_level))));
         else
-            doInsert(consistency_level, Arrays.asList(mutation));
+            doInsert(consistency_level, Collections.singletonList(mutation));
     }
 
     public void remove(ByteBuffer key, ColumnPath column_path, long timestamp, ConsistencyLevel consistency_level)
@@ -1116,7 +1463,7 @@
     {
         validateLogin();
 
-        KSMetaData ksm = Schema.instance.getKSMetaData(keyspaceName);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspaceName);
         if (ksm == null)
             throw new NotFoundException();
 
@@ -1154,10 +1501,8 @@
             org.apache.cassandra.db.ConsistencyLevel consistencyLevel = ThriftConversion.fromThrift(consistency_level);
             consistencyLevel.validateForRead(keyspace);
 
-            List<Row> rows = null;
-
-            IPartitioner p = StorageService.getPartitioner();
-            AbstractBounds<RowPosition> bounds;
+            IPartitioner p = metadata.partitioner;
+            AbstractBounds<PartitionPosition> bounds;
             if (range.start_key == null)
             {
                 Token.TokenFactory tokenFactory = p.getTokenFactory();
@@ -1167,32 +1512,38 @@
             }
             else
             {
-                RowPosition end = range.end_key == null
+                PartitionPosition end = range.end_key == null
                                 ? p.getTokenFactory().fromString(range.end_token).maxKeyBound()
-                                : RowPosition.ForKey.get(range.end_key, p);
-                bounds = new Bounds<RowPosition>(RowPosition.ForKey.get(range.start_key, p), end);
+                                : PartitionPosition.ForKey.get(range.end_key, p);
+                bounds = new Bounds<>(PartitionPosition.ForKey.get(range.start_key, p), end);
             }
-            long now = System.currentTimeMillis();
+            int nowInSec = FBUtilities.nowInSeconds();
             schedule(DatabaseDescriptor.getRangeRpcTimeout());
             try
             {
-                IDiskAtomFilter filter = ThriftValidation.asIFilter(predicate, metadata, column_parent.super_column);
-                rows = StorageProxy.getRangeSlice(new RangeSliceCommand(keyspace,
-                                                                        column_parent.column_family,
-                                                                        now,
-                                                                        filter,
-                                                                        bounds,
-                                                                        ThriftConversion.indexExpressionsFromThrift(range.row_filter),
-                                                                        range.count),
-                                                  consistencyLevel);
+                ColumnFilter columns = makeColumnFilter(metadata, column_parent, predicate);
+                ClusteringIndexFilter filter = toInternalFilter(metadata, column_parent, predicate);
+                DataLimits limits = getLimits(range.count, metadata.isSuper() && !column_parent.isSetSuper_column(), predicate);
+
+                PartitionRangeReadCommand cmd =
+                    PartitionRangeReadCommand.create(true,
+                                                     metadata,
+                                                     nowInSec,
+                                                     columns,
+                                                     ThriftConversion.rowFilterFromThrift(metadata, range.row_filter),
+                                                     limits,
+                                                     new DataRange(bounds, filter));
+
+                try (PartitionIterator results = StorageProxy.getRangeSlice(cmd, consistencyLevel))
+                {
+                    assert results != null;
+                    return thriftifyKeySlices(results, column_parent, limits.perPartitionCount());
+                }
             }
             finally
             {
                 release();
             }
-            assert rows != null;
-
-            return thriftifyKeySlices(rows, column_parent, predicate, now);
         }
         catch (RequestValidationException e)
         {
@@ -1237,10 +1588,8 @@
             org.apache.cassandra.db.ConsistencyLevel consistencyLevel = ThriftConversion.fromThrift(consistency_level);
             consistencyLevel.validateForRead(keyspace);
 
-            SlicePredicate predicate = new SlicePredicate().setSlice_range(new SliceRange(start_column, ByteBufferUtil.EMPTY_BYTE_BUFFER, false, -1));
-
-            IPartitioner p = StorageService.getPartitioner();
-            AbstractBounds<RowPosition> bounds;
+            IPartitioner p = metadata.partitioner;
+            AbstractBounds<PartitionPosition> bounds;
             if (range.start_key == null)
             {
                 // (token, key) is unsupported, assume (token, token)
@@ -1251,30 +1600,47 @@
             }
             else
             {
-                RowPosition end = range.end_key == null
+                PartitionPosition end = range.end_key == null
                                 ? p.getTokenFactory().fromString(range.end_token).maxKeyBound()
-                                : RowPosition.ForKey.get(range.end_key, p);
-                bounds = new Bounds<RowPosition>(RowPosition.ForKey.get(range.start_key, p), end);
+                                : PartitionPosition.ForKey.get(range.end_key, p);
+                bounds = new Bounds<>(PartitionPosition.ForKey.get(range.start_key, p), end);
             }
 
             if (range.row_filter != null && !range.row_filter.isEmpty())
                 throw new InvalidRequestException("Cross-row paging is not supported along with index clauses");
 
-            List<Row> rows;
-            long now = System.currentTimeMillis();
+            int nowInSec = FBUtilities.nowInSeconds();
             schedule(DatabaseDescriptor.getRangeRpcTimeout());
             try
             {
-                IDiskAtomFilter filter = ThriftValidation.asIFilter(predicate, metadata, null);
-                rows = StorageProxy.getRangeSlice(new RangeSliceCommand(keyspace, column_family, now, filter, bounds, null, range.count, true, true), consistencyLevel);
+                ClusteringIndexFilter filter = new ClusteringIndexSliceFilter(Slices.ALL, false);
+                DataLimits limits = getLimits(range.count, true, Integer.MAX_VALUE);
+                Clustering pageFrom = metadata.isSuper()
+                                    ? new Clustering(start_column)
+                                    : LegacyLayout.decodeCellName(metadata, start_column).clustering;
+
+                PartitionRangeReadCommand cmd =
+                    PartitionRangeReadCommand.create(true,
+                                                     metadata,
+                                                     nowInSec,
+                                                     ColumnFilter.all(metadata),
+                                                     RowFilter.NONE,
+                                                     limits,
+                                                     new DataRange(bounds, filter).forPaging(bounds, metadata.comparator, pageFrom, true));
+
+                try (PartitionIterator results = StorageProxy.getRangeSlice(cmd, consistencyLevel))
+                {
+                    return thriftifyKeySlices(results, new ColumnParent(column_family), limits.perPartitionCount());
+                }
+            }
+            catch (UnknownColumnException e)
+            {
+                throw new InvalidRequestException(e.getMessage());
             }
             finally
             {
                 release();
             }
-            assert rows != null;
-
-            return thriftifyKeySlices(rows, new ColumnParent(column_family), predicate, now);
         }
         catch (RequestValidationException e)
         {
@@ -1290,17 +1656,22 @@
         }
     }
 
-    private List<KeySlice> thriftifyKeySlices(List<Row> rows, ColumnParent column_parent, SlicePredicate predicate, long now)
+    private List<KeySlice> thriftifyKeySlices(PartitionIterator results, ColumnParent column_parent, int cellLimit)
     {
-        List<KeySlice> keySlices = new ArrayList<KeySlice>(rows.size());
-        boolean reversed = predicate.slice_range != null && predicate.slice_range.reversed;
-        for (Row row : rows)
+        try (PartitionIterator iter = results)
         {
-            List<ColumnOrSuperColumn> thriftifiedColumns = thriftifyColumnFamily(row.cf, column_parent.super_column != null, reversed, now);
-            keySlices.add(new KeySlice(row.key.getKey(), thriftifiedColumns));
-        }
+            List<KeySlice> keySlices = new ArrayList<>();
+            while (iter.hasNext())
+            {
+                try (RowIterator partition = iter.next())
+                {
+                    List<ColumnOrSuperColumn> thriftifiedColumns = thriftifyPartition(partition, column_parent.super_column != null, partition.isReverseOrder(), cellLimit);
+                    keySlices.add(new KeySlice(partition.partitionKey().getKey(), thriftifiedColumns));
+                }
+            }
 
-        return keySlices;
+            return keySlices;
+        }
     }
 
     public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level)
@@ -1331,22 +1702,31 @@
             org.apache.cassandra.db.ConsistencyLevel consistencyLevel = ThriftConversion.fromThrift(consistency_level);
             consistencyLevel.validateForRead(keyspace);
 
-            IPartitioner p = StorageService.getPartitioner();
-            AbstractBounds<RowPosition> bounds = new Bounds<RowPosition>(RowPosition.ForKey.get(index_clause.start_key, p),
-                                                                         p.getMinimumToken().minKeyBound());
+            IPartitioner p = metadata.partitioner;
+            AbstractBounds<PartitionPosition> bounds = new Bounds<>(PartitionPosition.ForKey.get(index_clause.start_key, p),
+                                                                    p.getMinimumToken().minKeyBound());
 
-            IDiskAtomFilter filter = ThriftValidation.asIFilter(column_predicate, metadata, column_parent.super_column);
-            long now = System.currentTimeMillis();
-            RangeSliceCommand command = new RangeSliceCommand(keyspace,
-                                                              column_parent.column_family,
-                                                              now,
-                                                              filter,
-                                                              bounds,
-                                                              ThriftConversion.indexExpressionsFromThrift(index_clause.expressions),
-                                                              index_clause.count);
+            int nowInSec = FBUtilities.nowInSeconds();
+            ColumnFilter columns = makeColumnFilter(metadata, column_parent, column_predicate);
+            ClusteringIndexFilter filter = toInternalFilter(metadata, column_parent, column_predicate);
+            DataLimits limits = getLimits(index_clause.count, metadata.isSuper() && !column_parent.isSetSuper_column(), column_predicate);
 
-            List<Row> rows = StorageProxy.getRangeSlice(command, consistencyLevel);
-            return thriftifyKeySlices(rows, column_parent, column_predicate, now);
+            PartitionRangeReadCommand cmd =
+                PartitionRangeReadCommand.create(true,
+                                                 metadata,
+                                                 nowInSec,
+                                                 columns,
+                                                 ThriftConversion.rowFilterFromThrift(metadata, index_clause.expressions),
+                                                 limits,
+                                                 new DataRange(bounds, filter));
+
+            // If there's a secondary index that the command can use, have it validate the request parameters.
+            cmd.maybeValidateIndex();
+
+            try (PartitionIterator results = StorageProxy.getRangeSlice(cmd, consistencyLevel))
+            {
+                return thriftifyKeySlices(results, column_parent, limits.perPartitionCount());
+            }
         }
         catch (RequestValidationException e)
         {
@@ -1367,7 +1747,7 @@
         validateLogin();
 
         Set<String> keyspaces = Schema.instance.getKeyspaces();
-        List<KsDef> ksset = new ArrayList<KsDef>(keyspaces.size());
+        List<KsDef> ksset = new ArrayList<>(keyspaces.size());
         for (String ks : keyspaces)
         {
             try
@@ -1424,7 +1804,7 @@
 
     public String describe_partitioner() throws TException
     {
-        return StorageService.getPartitioner().getClass().getName();
+        return StorageService.instance.getPartitionerName();
     }
 
     public String describe_snitch() throws TException
@@ -1439,7 +1819,7 @@
     throws TException, InvalidRequestException
     {
         List<CfSplit> splits = describe_splits_ex(cfName, start_token, end_token, keys_per_split);
-        List<String> result = new ArrayList<String>(splits.size() + 1);
+        List<String> result = new ArrayList<>(splits.size() + 1);
 
         result.add(splits.get(0).getStart_token());
         for (CfSplit cfSplit : splits)
@@ -1453,11 +1833,11 @@
     {
         try
         {
-            Token.TokenFactory tf = StorageService.getPartitioner().getTokenFactory();
+            Token.TokenFactory tf = StorageService.instance.getTokenFactory();
             Range<Token> tr = new Range<Token>(tf.fromString(start_token), tf.fromString(end_token));
             List<Pair<Range<Token>, Long>> splits =
                     StorageService.instance.getSplits(state().getKeyspace(), cfName, tr, keys_per_split);
-            List<CfSplit> result = new ArrayList<CfSplit>(splits.size());
+            List<CfSplit> result = new ArrayList<>(splits.size());
             for (Pair<Range<Token>, Long> split : splits)
                 result.add(new CfSplit(split.left.left.toString(), split.left.right.toString(), split.right));
             return result;
@@ -1503,8 +1883,7 @@
         requestScheduler.release();
     }
 
-    public String system_add_column_family(CfDef cf_def)
-    throws InvalidRequestException, SchemaDisagreementException, TException
+    public String system_add_column_family(CfDef cf_def) throws TException
     {
         logger.trace("add_column_family");
 
@@ -1515,8 +1894,7 @@
             cState.hasKeyspaceAccess(keyspace, Permission.CREATE);
             cf_def.unsetId(); // explicitly ignore any id set by client (Hector likes to set zero)
             CFMetaData cfm = ThriftConversion.fromThrift(cf_def);
-            CFMetaData.validateCompactionOptions(cfm.compactionStrategyClass, cfm.compactionStrategyOptions);
-            cfm.addDefaultIndexNames();
+            cfm.params.compaction.validate();
 
             if (!cfm.getTriggers().isEmpty())
                 state().ensureIsSuper("Only superusers are allowed to add triggers.");
@@ -1541,6 +1919,11 @@
         {
             String keyspace = cState.getKeyspace();
             cState.hasColumnFamilyAccess(keyspace, column_family, Permission.DROP);
+
+            CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, column_family);
+            if (metadata.isView())
+                throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot drop Materialized Views from Thrift");
+
             MigrationManager.announceColumnFamilyDrop(keyspace, column_family);
             return Schema.instance.getVersion().toString();
         }
@@ -1575,7 +1958,6 @@
             {
                 cf_def.unsetId(); // explicitly ignore any id set by client (same as system_add_column_family)
                 CFMetaData cfm = ThriftConversion.fromThrift(cf_def);
-                cfm.addDefaultIndexNames();
 
                 if (!cfm.getTriggers().isEmpty())
                     state().ensureIsSuper("Only superusers are allowed to add triggers.");
@@ -1651,12 +2033,16 @@
             if (oldCfm == null)
                 throw new InvalidRequestException("Could not find table definition to modify.");
 
+            if (oldCfm.isView())
+                throw new InvalidRequestException("Cannot modify Materialized View table " + oldCfm.cfName + " as it may break the schema. You should use cqlsh to modify Materialized View tables instead.");
+            if (!Iterables.isEmpty(View.findAll(cf_def.keyspace, cf_def.name)))
+                throw new InvalidRequestException("Cannot modify table with Materialized View " + oldCfm.cfName + " as it may break the schema. You should use cqlsh to modify tables with Materialized Views instead.");
+
             if (!oldCfm.isThriftCompatible())
                 throw new InvalidRequestException("Cannot modify CQL3 table " + oldCfm.cfName + " as it may break the schema. You should use cqlsh to modify CQL3 tables instead.");
 
             CFMetaData cfm = ThriftConversion.fromThriftForUpdate(cf_def, oldCfm);
-            CFMetaData.validateCompactionOptions(cfm.compactionStrategyClass, cfm.compactionStrategyOptions);
-            cfm.addDefaultIndexNames();
+            cfm.params.compaction.validate();
 
             if (!oldCfm.getTriggers().equals(cfm.getTriggers()))
                 state().ensureIsSuper("Only superusers are allowed to add or remove triggers.");
@@ -1678,6 +2064,9 @@
         {
             String keyspace = cState.getKeyspace();
             cState.hasColumnFamilyAccess(keyspace, cfname, Permission.MODIFY);
+            CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, cfname, false);
+            if (metadata.isView())
+                throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot truncate Materialized Views");
 
             if (startSessionIfRequested())
             {
@@ -1763,28 +2152,36 @@
             cState.hasColumnFamilyAccess(keyspace, column_parent.column_family, Permission.MODIFY);
 
             CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, column_parent.column_family, true);
+            if (metadata.isView())
+                throw new org.apache.cassandra.exceptions.InvalidRequestException("Cannot modify Materialized Views directly");
+
             ThriftValidation.validateKey(metadata, key);
             ThriftConversion.fromThrift(consistency_level).validateCounterForWrite(metadata);
             ThriftValidation.validateColumnParent(metadata, column_parent);
             // SuperColumn field is usually optional, but not when we're adding
-            if (metadata.cfType == ColumnFamilyType.Super && column_parent.super_column == null)
+            if (metadata.isSuper() && column_parent.super_column == null)
                 throw new InvalidRequestException("missing mandatory super column name for super CF " + column_parent.column_family);
 
             ThriftValidation.validateColumnNames(metadata, column_parent, Arrays.asList(column.name));
 
-            org.apache.cassandra.db.Mutation mutation = new org.apache.cassandra.db.Mutation(keyspace, key);
             try
             {
-                if (metadata.isSuper())
-                    mutation.addCounter(column_parent.column_family, metadata.comparator.makeCellName(column_parent.super_column, column.name), column.value);
-                else
-                    mutation.addCounter(column_parent.column_family, metadata.comparator.cellFromByteBuffer(column.name), column.value);
+                LegacyLayout.LegacyCellName name = LegacyLayout.decodeCellName(metadata, column_parent.super_column, column.name);
+
+                // See UpdateParameters.addCounter() for more details on this
+                ByteBuffer value = CounterContext.instance().createUpdate(column.value);
+                CellPath path = name.collectionElement == null ? null : CellPath.create(name.collectionElement);
+                Cell cell = BufferCell.live(metadata, name.column, FBUtilities.timestampMicros(), value, path);
+
+                PartitionUpdate update = PartitionUpdate.singleRowUpdate(metadata, key, BTreeRow.singleCellRow(name.clustering, cell));
+
+                org.apache.cassandra.db.Mutation mutation = new org.apache.cassandra.db.Mutation(update);
+                doInsert(consistency_level, Arrays.asList(new CounterMutation(mutation, ThriftConversion.fromThrift(consistency_level))));
             }
-            catch (MarshalException e)
+            catch (MarshalException|UnknownColumnException e)
             {
                 throw new InvalidRequestException(e.getMessage());
             }
-            doInsert(consistency_level, Arrays.asList(new CounterMutation(mutation, ThriftConversion.fromThrift(consistency_level))));
         }
         catch (RequestValidationException e)
         {
@@ -1813,7 +2210,7 @@
 
         try
         {
-            internal_remove(key, path, System.currentTimeMillis(), consistency_level, true);
+            internal_remove(key, path, FBUtilities.timestampMicros(), consistency_level, true);
         }
         catch (RequestValidationException e)
         {
@@ -1885,7 +2282,7 @@
 
     public CqlResult execute_cql_query(ByteBuffer query, Compression compression) throws TException
     {
-        throw new InvalidRequestException("CQL2 has been removed in Cassandra 2.2. Please use CQL3 instead");
+        throw new InvalidRequestException("CQL2 has been removed in Cassandra 3.0. Please use CQL3 instead");
     }
 
     public CqlResult execute_cql3_query(ByteBuffer query, Compression compression, ConsistencyLevel cLevel) throws TException
@@ -1907,7 +2304,8 @@
             ThriftClientState cState = state();
             return ClientState.getCQLQueryHandler().process(queryString,
                                                             cState.getQueryState(),
-                                                            QueryOptions.fromProtocolV2(ThriftConversion.fromThrift(cLevel), Collections.<ByteBuffer>emptyList()),
+                                                            QueryOptions.fromThrift(ThriftConversion.fromThrift(cLevel),
+                                                            Collections.<ByteBuffer>emptyList()),
                                                             null).toThriftResult();
         }
         catch (RequestExecutionException e)
@@ -1926,7 +2324,7 @@
 
     public CqlPreparedResult prepare_cql_query(ByteBuffer query, Compression compression) throws TException
     {
-        throw new InvalidRequestException("CQL2 has been removed in Cassandra 2.2. Please use CQL3 instead");
+        throw new InvalidRequestException("CQL2 has been removed in Cassandra 3.0. Please use CQL3 instead");
     }
 
     public CqlPreparedResult prepare_cql3_query(ByteBuffer query, Compression compression) throws TException
@@ -1939,9 +2337,7 @@
         try
         {
             cState.validateLogin();
-            return ClientState.getCQLQueryHandler().prepare(queryString,
-                                                       cState.getQueryState(),
-                                                       null).toThriftPreparedResult();
+            return ClientState.getCQLQueryHandler().prepare(queryString, cState.getQueryState(), null).toThriftPreparedResult();
         }
         catch (RequestValidationException e)
         {
@@ -1951,7 +2347,7 @@
 
     public CqlResult execute_prepared_cql_query(int itemId, List<ByteBuffer> bindVariables) throws TException
     {
-        throw new InvalidRequestException("CQL2 has been removed in Cassandra 2.2. Please use CQL3 instead");
+        throw new InvalidRequestException("CQL2 has been removed in Cassandra 3.0. Please use CQL3 instead");
     }
 
     public CqlResult execute_prepared_cql3_query(int itemId, List<ByteBuffer> bindVariables, ConsistencyLevel cLevel) throws TException
@@ -1980,7 +2376,7 @@
 
             return ClientState.getCQLQueryHandler().processPrepared(prepared.statement,
                                                                     cState.getQueryState(),
-                                                                    QueryOptions.fromProtocolV2(ThriftConversion.fromThrift(cLevel), bindVariables),
+                                                                    QueryOptions.fromThrift(ThriftConversion.fromThrift(cLevel), bindVariables),
                                                                     null).toThriftResult();
         }
         catch (RequestExecutionException e)
@@ -2020,33 +2416,41 @@
             String keyspace = cState.getKeyspace();
             state().hasColumnFamilyAccess(keyspace, request.getColumn_parent().column_family, Permission.SELECT);
             CFMetaData metadata = ThriftValidation.validateColumnFamily(keyspace, request.getColumn_parent().column_family);
-            if (metadata.cfType == ColumnFamilyType.Super)
+            if (metadata.isSuper())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("get_multi_slice does not support super columns");
             ThriftValidation.validateColumnParent(metadata, request.getColumn_parent());
             org.apache.cassandra.db.ConsistencyLevel consistencyLevel = ThriftConversion.fromThrift(request.getConsistency_level());
             consistencyLevel.validateForRead(keyspace);
-            List<ReadCommand> commands = new ArrayList<>(1);
-            ColumnSlice[] slices = new ColumnSlice[request.getColumn_slices().size()];
+
+            Slices.Builder builder = new Slices.Builder(metadata.comparator, request.getColumn_slices().size());
             for (int i = 0 ; i < request.getColumn_slices().size() ; i++)
             {
                 fixOptionalSliceParameters(request.getColumn_slices().get(i));
-                Composite start = metadata.comparator.fromByteBuffer(request.getColumn_slices().get(i).start);
-                Composite finish = metadata.comparator.fromByteBuffer(request.getColumn_slices().get(i).finish);
-                if (!start.isEmpty() && !finish.isEmpty())
-                {
-                    int compare = metadata.comparator.compare(start, finish);
-                    if (!request.reversed && compare > 0)
-                        throw new InvalidRequestException(String.format("Column slice at index %d had start greater than finish", i));
-                    else if (request.reversed && compare < 0)
-                        throw new InvalidRequestException(String.format("Reversed column slice at index %d had start less than finish", i));
-                }
-                slices[i] = new ColumnSlice(start, finish);
+                Slice.Bound start = LegacyLayout.decodeSliceBound(metadata, request.getColumn_slices().get(i).start, true).bound;
+                Slice.Bound finish = LegacyLayout.decodeSliceBound(metadata, request.getColumn_slices().get(i).finish, false).bound;
+
+                int compare = metadata.comparator.compare(start, finish);
+                if (!request.reversed && compare > 0)
+                    throw new InvalidRequestException(String.format("Column slice at index %d had start greater than finish", i));
+                else if (request.reversed && compare < 0)
+                    throw new InvalidRequestException(String.format("Reversed column slice at index %d had start less than finish", i));
+
+                builder.add(request.reversed ? Slice.make(finish, start) : Slice.make(start, finish));
             }
-            ColumnSlice[] deoverlapped = ColumnSlice.deoverlapSlices(slices, request.reversed ? metadata.comparator.reverseComparator() : metadata.comparator);
-            SliceQueryFilter filter = new SliceQueryFilter(deoverlapped, request.reversed, request.count);
+
+            Slices slices = builder.build();
+            ColumnFilter columns = makeColumnFilter(metadata, slices);
+            ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(slices, request.reversed);
+            DataLimits limits = getLimits(1, false, request.count);
+
             ThriftValidation.validateKey(metadata, request.key);
-            commands.add(ReadCommand.create(keyspace, request.key, request.column_parent.getColumn_family(), System.currentTimeMillis(), filter));
-            return getSlice(commands, request.column_parent.isSetSuper_column(), consistencyLevel, cState).entrySet().iterator().next().getValue();
+            DecoratedKey dk = metadata.decorateKey(request.key);
+            SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(true, metadata, FBUtilities.nowInSeconds(), columns, RowFilter.NONE, limits, dk, filter);
+            return getSlice(Collections.<SinglePartitionReadCommand>singletonList(cmd),
+                            false,
+                            limits.perPartitionCount(),
+                            consistencyLevel,
+                            cState).entrySet().iterator().next().getValue();
         }
         catch (RequestValidationException e)
         {
@@ -2070,7 +2474,7 @@
     }
 
     /*
-     * No-op since 2.2.
+     * No-op since 3.0.
      */
     public void set_cql_version(String version)
     {
@@ -2107,60 +2511,104 @@
 
     private static class ThriftCASRequest implements CASRequest
     {
-        private final ColumnFamily expected;
-        private final ColumnFamily updates;
+        private final CFMetaData metadata;
+        private final DecoratedKey key;
+        private final List<LegacyLayout.LegacyCell> expected;
+        private final PartitionUpdate updates;
+        private final int nowInSec;
 
-        private ThriftCASRequest(ColumnFamily expected, ColumnFamily updates)
+        private ThriftCASRequest(List<LegacyLayout.LegacyCell> expected, PartitionUpdate updates, int nowInSec)
         {
+            this.metadata = updates.metadata();
+            this.key = updates.partitionKey();
             this.expected = expected;
             this.updates = updates;
+            this.nowInSec = nowInSec;
         }
 
-        public IDiskAtomFilter readFilter()
+        public SinglePartitionReadCommand readCommand(int nowInSec)
         {
-            return expected == null || expected.isEmpty()
-                 ? new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 1)
-                 : new NamesQueryFilter(ImmutableSortedSet.copyOf(expected.getComparator(), expected.getColumnNames()));
+            if (expected.isEmpty())
+            {
+                // We want to know if the partition exists, so just fetch a single cell.
+                ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(Slices.ALL, false);
+                DataLimits limits = DataLimits.thriftLimits(1, 1);
+                return SinglePartitionReadCommand.create(true, metadata, nowInSec, ColumnFilter.all(metadata), RowFilter.NONE, limits, key, filter);
+            }
+
+            // Gather the clustering for the expected values and query those.
+            BTreeSet.Builder<Clustering> clusterings = BTreeSet.builder(metadata.comparator);
+            FilteredPartition expectedPartition =
+                FilteredPartition.create(LegacyLayout.toRowIterator(metadata, key, expected.iterator(), nowInSec));
+
+            for (Row row : expectedPartition)
+                clusterings.add(row.clustering());
+
+            PartitionColumns columns = expectedPartition.staticRow().isEmpty()
+                                     ? metadata.partitionColumns().withoutStatics()
+                                     : metadata.partitionColumns();
+            ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings.build(), false);
+            return SinglePartitionReadCommand.create(true, metadata, nowInSec, ColumnFilter.selection(columns), RowFilter.NONE, DataLimits.NONE, key, filter);
         }
 
-        public boolean appliesTo(ColumnFamily current)
+        public boolean appliesTo(FilteredPartition current)
         {
-            long now = System.currentTimeMillis();
-
-            if (!hasLiveCells(expected, now))
-                return !hasLiveCells(current, now);
-            else if (!hasLiveCells(current, now))
+            if (expected.isEmpty())
+                return current.isEmpty();
+            else if (current.isEmpty())
                 return false;
 
-            // current has been built from expected, so we know that it can't have columns
-            // that excepted don't have. So we just check that for each columns in expected:
-            //   - if it is a tombstone, whether current has no column or a tombstone;
-            //   - otherwise, that current has a live column with the same value.
-            for (Cell e : expected)
+            // Push the expected results through ThriftResultsMerger to translate any static
+            // columns into clusterings. The current partition is retrieved in the same so
+            // unless they're both handled the same, they won't match.
+            FilteredPartition expectedPartition =
+                FilteredPartition.create(
+                    UnfilteredRowIterators.filter(
+                        ThriftResultsMerger.maybeWrap(expectedToUnfilteredRowIterator(), nowInSec), nowInSec));
+
+            // Check that for everything we expected, the fetched values exists and correspond.
+            for (Row e : expectedPartition)
             {
-                Cell c = current.getColumn(e.name());
-                if (e.isLive(now))
+                Row c = current.getRow(e.clustering());
+                if (c == null)
+                    return false;
+
+                SearchIterator<ColumnDefinition, ColumnData> searchIter = c.searchIterator();
+                for (ColumnData expectedData : e)
                 {
-                    if (c == null || !c.isLive(now) || !c.value().equals(e.value()))
+                    ColumnDefinition column = expectedData.column();
+                    ColumnData currentData = searchIter.next(column);
+                    if (currentData == null)
                         return false;
-                }
-                else
-                {
-                    if (c != null && c.isLive(now))
-                        return false;
+
+                    if (column.isSimple())
+                    {
+                        if (!((Cell)currentData).value().equals(((Cell)expectedData).value()))
+                            return false;
+                    }
+                    else
+                    {
+                        ComplexColumnData currentComplexData = (ComplexColumnData)currentData;
+                        for (Cell expectedCell : (ComplexColumnData)expectedData)
+                        {
+                            Cell currentCell = currentComplexData.getCell(expectedCell.path());
+                            if (currentCell == null || !currentCell.value().equals(expectedCell.value()))
+                                return false;
+                        }
+                    }
                 }
             }
             return true;
         }
 
-        private static boolean hasLiveCells(ColumnFamily cf, long now)
-        {
-            return cf != null && !cf.hasOnlyTombstones(now);
-        }
-
-        public ColumnFamily makeUpdates(ColumnFamily current)
+        public PartitionUpdate makeUpdates(FilteredPartition current)
         {
             return updates;
         }
+
+        private UnfilteredRowIterator expectedToUnfilteredRowIterator()
+        {
+            return LegacyLayout.toUnfilteredRowIterator(metadata, key, LegacyLayout.LegacyDeletionInfo.live(), expected.iterator());
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/thrift/ThriftConversion.java b/src/java/org/apache/cassandra/thrift/ThriftConversion.java
index 04eae38..e8256a8 100644
--- a/src/java/org/apache/cassandra/thrift/ThriftConversion.java
+++ b/src/java/org/apache/cassandra/thrift/ThriftConversion.java

@@ -17,30 +17,32 @@
  */
 package org.apache.cassandra.thrift;
 
-import java.nio.ByteBuffer;
 import java.util.*;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Strings;
 import com.google.common.collect.Maps;
 
-import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.cql3.SuperColumnCompatibility;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.CompactTables;
+import org.apache.cassandra.db.LegacyLayout;
 import org.apache.cassandra.db.WriteType;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.io.compress.ICompressor;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.LocalStrategy;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.UUIDGen;
 
 /**
@@ -136,47 +138,41 @@
         return new TimedOutException();
     }
 
-    public static List<org.apache.cassandra.db.IndexExpression> indexExpressionsFromThrift(List<IndexExpression> exprs)
+    public static RowFilter rowFilterFromThrift(CFMetaData metadata, List<IndexExpression> exprs)
     {
-        if (exprs == null)
-            return null;
+        if (exprs == null || exprs.isEmpty())
+            return RowFilter.NONE;
 
-        if (exprs.isEmpty())
-            return Collections.emptyList();
-
-        List<org.apache.cassandra.db.IndexExpression> converted = new ArrayList<>(exprs.size());
+        RowFilter converted = RowFilter.forThrift(exprs.size());
         for (IndexExpression expr : exprs)
-        {
-            converted.add(new org.apache.cassandra.db.IndexExpression(expr.column_name,
-                                                                      Operator.valueOf(expr.op.name()),
-                                                                      expr.value));
-        }
+            converted.addThriftExpression(metadata, expr.column_name, Operator.valueOf(expr.op.name()), expr.value);
         return converted;
     }
 
-    public static KSMetaData fromThrift(KsDef ksd, CFMetaData... cfDefs) throws ConfigurationException
+    public static KeyspaceMetadata fromThrift(KsDef ksd, CFMetaData... cfDefs) throws ConfigurationException
     {
         Class<? extends AbstractReplicationStrategy> cls = AbstractReplicationStrategy.getClass(ksd.strategy_class);
         if (cls.equals(LocalStrategy.class))
             throw new ConfigurationException("Unable to use given strategy class: LocalStrategy is reserved for internal use.");
 
-        return new KSMetaData(ksd.name,
-                              cls,
-                              ksd.strategy_options == null ? Collections.<String, String>emptyMap() : ksd.strategy_options,
-                              ksd.durable_writes,
-                              Arrays.asList(cfDefs));
+        Map<String, String> replicationMap = new HashMap<>();
+        if (ksd.strategy_options != null)
+            replicationMap.putAll(ksd.strategy_options);
+        replicationMap.put(ReplicationParams.CLASS, cls.getName());
+
+        return KeyspaceMetadata.create(ksd.name, KeyspaceParams.create(ksd.durable_writes, replicationMap), Tables.of(cfDefs));
     }
 
-    public static KsDef toThrift(KSMetaData ksm)
+    public static KsDef toThrift(KeyspaceMetadata ksm)
     {
-        List<CfDef> cfDefs = new ArrayList<>(ksm.cfMetaData().size());
-        for (CFMetaData cfm : ksm.cfMetaData().values())
+        List<CfDef> cfDefs = new ArrayList<>();
+        for (CFMetaData cfm : ksm.tables) // do not include views
             if (cfm.isThriftCompatible()) // Don't expose CF that cannot be correctly handle by thrift; see CASSANDRA-4377 for further details
                 cfDefs.add(toThrift(cfm));
 
-        KsDef ksdef = new KsDef(ksm.name, ksm.strategyClass.getName(), cfDefs);
-        ksdef.setStrategy_options(ksm.strategyOptions);
-        ksdef.setDurable_writes(ksm.durableWrites);
+        KsDef ksdef = new KsDef(ksm.name, ksm.params.replication.klass.getName(), cfDefs);
+        ksdef.setStrategy_options(ksm.params.replication.options);
+        ksdef.setDurable_writes(ksm.params.durableWrites);
 
         return ksdef;
     }
@@ -184,104 +180,136 @@
     public static CFMetaData fromThrift(CfDef cf_def)
     throws org.apache.cassandra.exceptions.InvalidRequestException, ConfigurationException
     {
-        return internalFromThrift(cf_def, Collections.<ColumnDefinition>emptyList());
+        // This is a creation: the table is dense if it doesn't define any column_metadata
+        boolean isDense = cf_def.column_metadata == null || cf_def.column_metadata.isEmpty();
+        return internalFromThrift(cf_def, true, Collections.<ColumnDefinition>emptyList(), isDense);
     }
 
     public static CFMetaData fromThriftForUpdate(CfDef cf_def, CFMetaData toUpdate)
     throws org.apache.cassandra.exceptions.InvalidRequestException, ConfigurationException
     {
-        return internalFromThrift(cf_def, toUpdate.allColumns());
+        return internalFromThrift(cf_def, false, toUpdate.allColumns(), toUpdate.isDense());
     }
 
-    // Convert a thrift CfDef, given a list of ColumnDefinitions to copy over to the created CFMetadata before the CQL metadata are rebuild
-    private static CFMetaData internalFromThrift(CfDef cf_def, Collection<ColumnDefinition> previousCQLMetadata)
+    private static boolean isSuper(String thriftColumnType)
+    throws org.apache.cassandra.exceptions.InvalidRequestException
+    {
+        switch (thriftColumnType.toLowerCase(Locale.ENGLISH))
+        {
+            case "standard": return false;
+            case "super": return true;
+            default: throw new org.apache.cassandra.exceptions.InvalidRequestException("Invalid column type " + thriftColumnType);
+        }
+    }
+
+    /**
+     * Convert a thrift CfDef.
+     * <p>,
+     * This is used both for creation and update of CF.
+     *
+     * @param cf_def the thrift CfDef to convert.
+     * @param isCreation whether that is a new table creation or not.
+     * @param previousCQLMetadata if it is not a table creation, the previous
+     * definitions of the tables (which we use to preserve the CQL metadata).
+     * If it is a table creation, this will be empty.
+     * @param isDense whether the table is dense or not.
+     *
+     * @return the converted table definition.
+     */
+    private static CFMetaData internalFromThrift(CfDef cf_def,
+                                                 boolean isCreation,
+                                                 Collection<ColumnDefinition> previousCQLMetadata,
+                                                 boolean isDense)
     throws org.apache.cassandra.exceptions.InvalidRequestException, ConfigurationException
     {
-        ColumnFamilyType cfType = ColumnFamilyType.create(cf_def.column_type);
-        if (cfType == null)
-            throw new org.apache.cassandra.exceptions.InvalidRequestException("Invalid column type " + cf_def.column_type);
-
         applyImplicitDefaults(cf_def);
 
         try
         {
+            boolean isSuper = isSuper(cf_def.column_type);
             AbstractType<?> rawComparator = TypeParser.parse(cf_def.comparator_type);
-            AbstractType<?> subComparator = cfType == ColumnFamilyType.Standard
-                    ? null
-                    : cf_def.subcomparator_type == null ? BytesType.instance : TypeParser.parse(cf_def.subcomparator_type);
+            AbstractType<?> subComparator = isSuper
+                                          ? cf_def.subcomparator_type == null ? BytesType.instance : TypeParser.parse(cf_def.subcomparator_type)
+                                          : null;
 
-            AbstractType<?> fullRawComparator = CFMetaData.makeRawAbstractType(rawComparator, subComparator);
+            AbstractType<?> keyValidator = cf_def.isSetKey_validation_class() ? TypeParser.parse(cf_def.key_validation_class) : BytesType.instance;
+            AbstractType<?> defaultValidator = TypeParser.parse(cf_def.default_validation_class);
 
-            AbstractType<?> keyValidator = cf_def.isSetKey_validation_class() ? TypeParser.parse(cf_def.key_validation_class) : null;
-
-            // Convert the REGULAR definitions from the input CfDef
+            // Convert the definitions from the input CfDef
             List<ColumnDefinition> defs = fromThrift(cf_def.keyspace, cf_def.name, rawComparator, subComparator, cf_def.column_metadata);
 
-            // Add the keyAlias if there is one, since that's on CQL metadata that thrift can actually change (for
+            // Add the keyAlias if there is one, since that's a CQL metadata that thrift can actually change (for
             // historical reasons)
             boolean hasKeyAlias = cf_def.isSetKey_alias() && keyValidator != null && !(keyValidator instanceof CompositeType);
             if (hasKeyAlias)
-                defs.add(ColumnDefinition.partitionKeyDef(cf_def.keyspace, cf_def.name, cf_def.key_alias, keyValidator, null));
-
-            // for Thrift updates, we should be calculating denseness from just the regular columns & comparator
-            boolean isDense = CFMetaData.calculateIsDense(fullRawComparator, defs);
+                defs.add(ColumnDefinition.partitionKeyDef(cf_def.keyspace, cf_def.name, UTF8Type.instance.getString(cf_def.key_alias), keyValidator, 0));
 
             // Now add any CQL metadata that we want to copy, skipping the keyAlias if there was one
             for (ColumnDefinition def : previousCQLMetadata)
             {
-                // skip all pre-existing REGULAR columns
-                if (def.kind == ColumnDefinition.Kind.REGULAR)
+                // isPartOfCellName basically means 'is not just a CQL metadata'
+                if (def.isPartOfCellName(false, isSuper))
                     continue;
 
-                // skip previous PARTITION_KEY column def if key_alias has been set by this update already (overwritten)
                 if (def.kind == ColumnDefinition.Kind.PARTITION_KEY && hasKeyAlias)
                     continue;
 
-                // the table switched from DENSE to SPARSE by adding one or more REGULAR columns;
-                // in this case we should now drop the COMPACT_VALUE column
-                if (def.kind == ColumnDefinition.Kind.COMPACT_VALUE && !isDense)
-                    continue;
-
-                // skip CLUSTERING_COLUMN column(s) of a sparse table, if:
-                // a) this is a Standard columnfamily *OR* b) it's a Super columnfamily and the second (subcolumn) component;
-                // in other words, only keep the clustering column in sparse tables if it's the first (super) component
-                // of a super column family
-                if (def.kind == ColumnDefinition.Kind.CLUSTERING_COLUMN && !isDense)
-                    if (cfType == ColumnFamilyType.Standard || def.position() != 0)
-                        continue;
-
                 defs.add(def);
             }
 
-            CellNameType comparator = CellNames.fromAbstractType(fullRawComparator, CFMetaData.calculateIsDense(fullRawComparator, defs));
-
             UUID cfId = Schema.instance.getId(cf_def.keyspace, cf_def.name);
             if (cfId == null)
                 cfId = UUIDGen.getTimeUUID();
 
-            // set isDense now so that it doesn't get re-calculated incorrectly later in rebuild() b/c of defined clusterings
-            CFMetaData newCFMD = new CFMetaData(cf_def.keyspace, cf_def.name, cfType, comparator, cfId).isDense(isDense);
+            boolean isCompound = !isSuper && (rawComparator instanceof CompositeType);
+            boolean isCounter = defaultValidator instanceof CounterColumnType;
 
-            newCFMD.addAllColumnDefinitions(defs);
+            // If it's a thrift table creation, adds the default CQL metadata for the new table
+            if (isCreation)
+            {
+                addDefaultCQLMetadata(defs,
+                                      cf_def.keyspace,
+                                      cf_def.name,
+                                      hasKeyAlias ? null : keyValidator,
+                                      rawComparator,
+                                      subComparator,
+                                      defaultValidator,
+                                      isDense);
+            }
 
-            if (keyValidator != null)
-                newCFMD.keyValidator(keyValidator);
+            // We do not allow Thrift views, so we always set it to false
+            boolean isView = false;
+
+            CFMetaData newCFMD = CFMetaData.create(cf_def.keyspace,
+                                                   cf_def.name,
+                                                   cfId,
+                                                   isDense,
+                                                   isCompound,
+                                                   isSuper,
+                                                   isCounter,
+                                                   isView,
+                                                   defs,
+                                                   DatabaseDescriptor.getPartitioner());
+
+            // Convert any secondary indexes defined in the thrift column_metadata
+            newCFMD.indexes(indexDefsFromThrift(newCFMD,
+                                                cf_def.keyspace,
+                                                cf_def.name,
+                                                rawComparator,
+                                                subComparator,
+                                                cf_def.column_metadata));
+
             if (cf_def.isSetGc_grace_seconds())
                 newCFMD.gcGraceSeconds(cf_def.gc_grace_seconds);
-            if (cf_def.isSetMin_compaction_threshold())
-                newCFMD.minCompactionThreshold(cf_def.min_compaction_threshold);
-            if (cf_def.isSetMax_compaction_threshold())
-                newCFMD.maxCompactionThreshold(cf_def.max_compaction_threshold);
-            if (cf_def.isSetCompaction_strategy())
-                newCFMD.compactionStrategyClass(CFMetaData.createCompactionStrategy(cf_def.compaction_strategy));
-            if (cf_def.isSetCompaction_strategy_options())
-                newCFMD.compactionStrategyOptions(new HashMap<>(cf_def.compaction_strategy_options));
+
+            newCFMD.compaction(compactionParamsFromThrift(cf_def));
+
             if (cf_def.isSetBloom_filter_fp_chance())
                 newCFMD.bloomFilterFpChance(cf_def.bloom_filter_fp_chance);
             if (cf_def.isSetMemtable_flush_period_in_ms())
                 newCFMD.memtableFlushPeriod(cf_def.memtable_flush_period_in_ms);
             if (cf_def.isSetCaching() || cf_def.isSetCells_per_row_to_cache())
-                newCFMD.caching(CachingOptions.fromThrift(cf_def.caching, cf_def.cells_per_row_to_cache));
+                newCFMD.caching(cachingFromThrift(cf_def.caching, cf_def.cells_per_row_to_cache));
             if (cf_def.isSetRead_repair_chance())
                 newCFMD.readRepairChance(cf_def.read_repair_chance);
             if (cf_def.isSetDefault_time_to_live())
@@ -293,14 +321,15 @@
             if (cf_def.isSetMax_index_interval())
                 newCFMD.maxIndexInterval(cf_def.max_index_interval);
             if (cf_def.isSetSpeculative_retry())
-                newCFMD.speculativeRetry(CFMetaData.SpeculativeRetry.fromString(cf_def.speculative_retry));
+                newCFMD.speculativeRetry(SpeculativeRetryParam.fromString(cf_def.speculative_retry));
             if (cf_def.isSetTriggers())
                 newCFMD.triggers(triggerDefinitionsFromThrift(cf_def.triggers));
+            if (cf_def.isSetComment())
+                newCFMD.comment(cf_def.comment);
+            if (cf_def.isSetCompression_options())
+                newCFMD.compression(compressionParametersFromThrift(cf_def.compression_options));
 
-            return newCFMD.comment(cf_def.comment)
-                          .defaultValidator(TypeParser.parse(cf_def.default_validation_class))
-                          .compressionParameters(CompressionParameters.create(cf_def.compression_options))
-                          .rebuild();
+            return newCFMD;
         }
         catch (SyntaxException | MarshalException e)
         {
@@ -308,25 +337,100 @@
         }
     }
 
-    /** applies implicit defaults to cf definition. useful in updates */
+    @SuppressWarnings("unchecked")
+    private static CompactionParams compactionParamsFromThrift(CfDef cf_def)
+    {
+        Class<? extends AbstractCompactionStrategy> klass =
+            CFMetaData.createCompactionStrategy(cf_def.compaction_strategy);
+        Map<String, String> options = new HashMap<>(cf_def.compaction_strategy_options);
+
+        int minThreshold = cf_def.min_compaction_threshold;
+        int maxThreshold = cf_def.max_compaction_threshold;
+
+        if (CompactionParams.supportsThresholdParams(klass))
+        {
+            options.putIfAbsent(CompactionParams.Option.MIN_THRESHOLD.toString(), Integer.toString(minThreshold));
+            options.putIfAbsent(CompactionParams.Option.MAX_THRESHOLD.toString(), Integer.toString(maxThreshold));
+        }
+
+        return CompactionParams.create(klass, options);
+    }
+
+    private static CompressionParams compressionParametersFromThrift(Map<String, String> compression_options)
+    {
+        CompressionParams compressionParameter = CompressionParams.fromMap(compression_options);
+        compressionParameter.validate();
+        return compressionParameter;
+    }
+
+    private static void addDefaultCQLMetadata(Collection<ColumnDefinition> defs,
+                                              String ks,
+                                              String cf,
+                                              AbstractType<?> keyValidator,
+                                              AbstractType<?> comparator,
+                                              AbstractType<?> subComparator,
+                                              AbstractType<?> defaultValidator,
+                                              boolean isDense)
+    {
+        CompactTables.DefaultNames names = CompactTables.defaultNameGenerator(defs);
+        if (keyValidator != null)
+        {
+            if (keyValidator instanceof CompositeType)
+            {
+                List<AbstractType<?>> subTypes = ((CompositeType)keyValidator).types;
+                for (int i = 0; i < subTypes.size(); i++)
+                    defs.add(ColumnDefinition.partitionKeyDef(ks, cf, names.defaultPartitionKeyName(), subTypes.get(i), i));
+            }
+            else
+            {
+                defs.add(ColumnDefinition.partitionKeyDef(ks, cf, names.defaultPartitionKeyName(), keyValidator, 0));
+            }
+        }
+
+        if (subComparator != null)
+        {
+            // SuperColumn tables: we use a special map to hold dynamic values within a given super column
+            defs.add(ColumnDefinition.clusteringDef(ks, cf, names.defaultClusteringName(), comparator, 0));
+            defs.add(ColumnDefinition.regularDef(ks, cf, SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN_STR, MapType.getInstance(subComparator, defaultValidator, true)));
+            if (isDense)
+            {
+                defs.add(ColumnDefinition.clusteringDef(ks, cf, names.defaultClusteringName(), subComparator, 1));
+                defs.add(ColumnDefinition.regularDef(ks, cf, names.defaultCompactValueName(), defaultValidator));
+            }
+        }
+        else
+        {
+            List<AbstractType<?>> subTypes = comparator instanceof CompositeType
+                                           ? ((CompositeType)comparator).types
+                                           : Collections.<AbstractType<?>>singletonList(comparator);
+
+            for (int i = 0; i < subTypes.size(); i++)
+                defs.add(ColumnDefinition.clusteringDef(ks, cf, names.defaultClusteringName(), subTypes.get(i), i));
+
+            defs.add(ColumnDefinition.regularDef(ks, cf, names.defaultCompactValueName(), defaultValidator));
+        }
+    }
+
+    /* applies implicit defaults to cf definition. useful in updates */
+    @SuppressWarnings("deprecation")
     private static void applyImplicitDefaults(org.apache.cassandra.thrift.CfDef cf_def)
     {
         if (!cf_def.isSetComment())
             cf_def.setComment("");
         if (!cf_def.isSetMin_compaction_threshold())
-            cf_def.setMin_compaction_threshold(CFMetaData.DEFAULT_MIN_COMPACTION_THRESHOLD);
+            cf_def.setMin_compaction_threshold(CompactionParams.DEFAULT_MIN_THRESHOLD);
         if (!cf_def.isSetMax_compaction_threshold())
-            cf_def.setMax_compaction_threshold(CFMetaData.DEFAULT_MAX_COMPACTION_THRESHOLD);
-        if (cf_def.compaction_strategy == null)
-            cf_def.compaction_strategy = CFMetaData.DEFAULT_COMPACTION_STRATEGY_CLASS.getSimpleName();
-        if (cf_def.compaction_strategy_options == null)
-            cf_def.compaction_strategy_options = Collections.emptyMap();
+            cf_def.setMax_compaction_threshold(CompactionParams.DEFAULT_MAX_THRESHOLD);
+        if (!cf_def.isSetCompaction_strategy())
+            cf_def.setCompaction_strategy(CompactionParams.DEFAULT.klass().getSimpleName());
+        if (!cf_def.isSetCompaction_strategy_options())
+            cf_def.setCompaction_strategy_options(Collections.emptyMap());
         if (!cf_def.isSetCompression_options())
-            cf_def.setCompression_options(Collections.singletonMap(CompressionParameters.SSTABLE_COMPRESSION, CFMetaData.DEFAULT_COMPRESSOR));
+            cf_def.setCompression_options(Collections.singletonMap(CompressionParams.SSTABLE_COMPRESSION, CompressionParams.DEFAULT.klass().getCanonicalName()));
         if (!cf_def.isSetDefault_time_to_live())
-            cf_def.setDefault_time_to_live(CFMetaData.DEFAULT_DEFAULT_TIME_TO_LIVE);
+            cf_def.setDefault_time_to_live(TableParams.DEFAULT_DEFAULT_TIME_TO_LIVE);
         if (!cf_def.isSetDclocal_read_repair_chance())
-            cf_def.setDclocal_read_repair_chance(CFMetaData.DEFAULT_DCLOCAL_READ_REPAIR_CHANCE);
+            cf_def.setDclocal_read_repair_chance(TableParams.DEFAULT_DCLOCAL_READ_REPAIR_CHANCE);
 
         // if index_interval was set, use that for the min_index_interval default
         if (!cf_def.isSetMin_index_interval())
@@ -334,81 +438,55 @@
             if (cf_def.isSetIndex_interval())
                 cf_def.setMin_index_interval(cf_def.getIndex_interval());
             else
-                cf_def.setMin_index_interval(CFMetaData.DEFAULT_MIN_INDEX_INTERVAL);
+                cf_def.setMin_index_interval(TableParams.DEFAULT_MIN_INDEX_INTERVAL);
         }
 
         if (!cf_def.isSetMax_index_interval())
         {
             // ensure the max is at least as large as the min
-            cf_def.setMax_index_interval(Math.max(cf_def.min_index_interval, CFMetaData.DEFAULT_MAX_INDEX_INTERVAL));
+            cf_def.setMax_index_interval(Math.max(cf_def.min_index_interval, TableParams.DEFAULT_MAX_INDEX_INTERVAL));
         }
     }
 
-    /**
-     * Create CFMetaData from thrift {@link CqlRow} that contains columns from schema_columnfamilies.
-     *
-     * @param columnsRes CqlRow containing columns from schema_columnfamilies.
-     * @return CFMetaData derived from CqlRow
-     */
-    public static CFMetaData fromThriftCqlRow(CqlRow cf, CqlResult columnsRes)
-    {
-        UntypedResultSet.Row cfRow = new UntypedResultSet.Row(convertThriftCqlRow(cf));
-
-        List<Map<String, ByteBuffer>> cols = new ArrayList<>(columnsRes.rows.size());
-        for (CqlRow row : columnsRes.rows)
-            cols.add(convertThriftCqlRow(row));
-        UntypedResultSet colsRows = UntypedResultSet.create(cols);
-
-        return LegacySchemaTables.createTableFromTableRowAndColumnRows(cfRow, colsRows);
-    }
-
-    private static Map<String, ByteBuffer> convertThriftCqlRow(CqlRow row)
-    {
-        Map<String, ByteBuffer> m = new HashMap<>();
-        for (org.apache.cassandra.thrift.Column column : row.getColumns())
-            m.put(UTF8Type.instance.getString(column.bufferForName()), column.value);
-        return m;
-    }
-
     public static CfDef toThrift(CFMetaData cfm)
     {
         CfDef def = new CfDef(cfm.ksName, cfm.cfName);
-        def.setColumn_type(cfm.cfType.name());
+        def.setColumn_type(cfm.isSuper() ? "Super" : "Standard");
 
         if (cfm.isSuper())
         {
             def.setComparator_type(cfm.comparator.subtype(0).toString());
-            def.setSubcomparator_type(cfm.comparator.subtype(1).toString());
+            def.setSubcomparator_type(cfm.thriftColumnNameType().toString());
         }
         else
         {
-            def.setComparator_type(cfm.comparator.toString());
+            def.setComparator_type(LegacyLayout.makeLegacyComparator(cfm).toString());
         }
 
-        def.setComment(Strings.nullToEmpty(cfm.getComment()));
-        def.setRead_repair_chance(cfm.getReadRepairChance());
-        def.setDclocal_read_repair_chance(cfm.getDcLocalReadRepairChance());
-        def.setGc_grace_seconds(cfm.getGcGraceSeconds());
-        def.setDefault_validation_class(cfm.getDefaultValidator().toString());
+        def.setComment(cfm.params.comment);
+        def.setRead_repair_chance(cfm.params.readRepairChance);
+        def.setDclocal_read_repair_chance(cfm.params.dcLocalReadRepairChance);
+        def.setGc_grace_seconds(cfm.params.gcGraceSeconds);
+        def.setDefault_validation_class(cfm.makeLegacyDefaultValidator().toString());
         def.setKey_validation_class(cfm.getKeyValidator().toString());
-        def.setMin_compaction_threshold(cfm.getMinCompactionThreshold());
-        def.setMax_compaction_threshold(cfm.getMaxCompactionThreshold());
+        def.setMin_compaction_threshold(cfm.params.compaction.minCompactionThreshold());
+        def.setMax_compaction_threshold(cfm.params.compaction.maxCompactionThreshold());
         // We only return the alias if only one is set since thrift don't know about multiple key aliases
         if (cfm.partitionKeyColumns().size() == 1)
             def.setKey_alias(cfm.partitionKeyColumns().get(0).name.bytes);
-        def.setColumn_metadata(columnDefinitionsToThrift(cfm.allColumns()));
-        def.setCompaction_strategy(cfm.compactionStrategyClass.getName());
-        def.setCompaction_strategy_options(new HashMap<>(cfm.compactionStrategyOptions));
-        def.setCompression_options(cfm.compressionParameters.asThriftOptions());
-        def.setBloom_filter_fp_chance(cfm.getBloomFilterFpChance());
-        def.setMin_index_interval(cfm.getMinIndexInterval());
-        def.setMax_index_interval(cfm.getMaxIndexInterval());
-        def.setMemtable_flush_period_in_ms(cfm.getMemtableFlushPeriod());
-        def.setCaching(cfm.getCaching().toThriftCaching());
-        def.setCells_per_row_to_cache(cfm.getCaching().toThriftCellsPerRow());
-        def.setDefault_time_to_live(cfm.getDefaultTimeToLive());
-        def.setSpeculative_retry(cfm.getSpeculativeRetry().toString());
-        def.setTriggers(triggerDefinitionsToThrift(cfm.getTriggers().values()));
+        def.setColumn_metadata(columnDefinitionsToThrift(cfm, cfm.allColumns()));
+        def.setCompaction_strategy(cfm.params.compaction.klass().getName());
+        def.setCompaction_strategy_options(cfm.params.compaction.options());
+        def.setCompression_options(compressionParametersToThrift(cfm.params.compression));
+        def.setBloom_filter_fp_chance(cfm.params.bloomFilterFpChance);
+        def.setMin_index_interval(cfm.params.minIndexInterval);
+        def.setMax_index_interval(cfm.params.maxIndexInterval);
+        def.setMemtable_flush_period_in_ms(cfm.params.memtableFlushPeriodInMs);
+        def.setCaching(toThrift(cfm.params.caching));
+        def.setCells_per_row_to_cache(toThriftCellsPerRow(cfm.params.caching));
+        def.setDefault_time_to_live(cfm.params.defaultTimeToLive);
+        def.setSpeculative_retry(cfm.params.speculativeRetry.toString());
+        def.setTriggers(triggerDefinitionsToThrift(cfm.getTriggers()));
 
         return def;
     }
@@ -420,8 +498,8 @@
                                               ColumnDef thriftColumnDef)
     throws SyntaxException, ConfigurationException
     {
+        boolean isSuper = thriftSubcomparator != null;
         // For super columns, the componentIndex is 1 because the ColumnDefinition applies to the column component.
-        Integer componentIndex = thriftSubcomparator != null ? 1 : null;
         AbstractType<?> comparator = thriftSubcomparator == null ? thriftComparator : thriftSubcomparator;
         try
         {
@@ -432,15 +510,15 @@
             throw new ConfigurationException(String.format("Column name %s is not valid for comparator %s", ByteBufferUtil.bytesToHex(thriftColumnDef.name), comparator));
         }
 
+        // In our generic layout, we store thrift defined columns as static, but this doesn't work for super columns so we
+        // use a regular definition (and "dynamic" columns are handled in a map).
+        ColumnDefinition.Kind kind = isSuper ? ColumnDefinition.Kind.REGULAR : ColumnDefinition.Kind.STATIC;
         return new ColumnDefinition(ksName,
                                     cfName,
-                                    new ColumnIdentifier(ByteBufferUtil.clone(thriftColumnDef.name), comparator),
+                                    ColumnIdentifier.getInterned(ByteBufferUtil.clone(thriftColumnDef.name), comparator),
                                     TypeParser.parse(thriftColumnDef.validation_class),
-                                    thriftColumnDef.index_type == null ? null : org.apache.cassandra.config.IndexType.valueOf(thriftColumnDef.index_type.name()),
-                                    thriftColumnDef.index_options,
-                                    thriftColumnDef.index_name,
-                                    componentIndex,
-                                    ColumnDefinition.Kind.REGULAR);
+                                    ColumnDefinition.NO_POSITION,
+                                    kind);
     }
 
     private static List<ColumnDefinition> fromThrift(String ksName,
@@ -460,48 +538,197 @@
         return defs;
     }
 
+    private static Indexes indexDefsFromThrift(CFMetaData cfm,
+                                               String ksName,
+                                               String cfName,
+                                               AbstractType<?> thriftComparator,
+                                               AbstractType<?> thriftSubComparator,
+                                               List<ColumnDef> thriftDefs)
+    {
+        if (thriftDefs == null)
+            return Indexes.none();
+
+        Set<String> indexNames = new HashSet<>();
+        Indexes.Builder indexes = Indexes.builder();
+        for (ColumnDef def : thriftDefs)
+        {
+            if (def.isSetIndex_type())
+            {
+                ColumnDefinition column = fromThrift(ksName, cfName, thriftComparator, thriftSubComparator, def);
+
+                String indexName = def.getIndex_name();
+                // add a generated index name if none was supplied
+                if (Strings.isNullOrEmpty(indexName))
+                    indexName = Indexes.getAvailableIndexName(ksName, cfName, column.name.toString());
+
+                if (indexNames.contains(indexName))
+                    throw new ConfigurationException("Duplicate index name " + indexName);
+
+                indexNames.add(indexName);
+
+                Map<String, String> indexOptions = def.getIndex_options();
+                if (indexOptions != null && indexOptions.containsKey(IndexTarget.TARGET_OPTION_NAME))
+                        throw new ConfigurationException("Reserved index option 'target' cannot be used");
+
+                IndexMetadata.Kind kind = IndexMetadata.Kind.valueOf(def.index_type.name());
+
+                indexes.add(IndexMetadata.fromLegacyMetadata(cfm, column, indexName, kind, indexOptions));
+            }
+        }
+        return indexes.build();
+    }
+
     @VisibleForTesting
-    public static ColumnDef toThrift(ColumnDefinition column)
+    public static ColumnDef toThrift(CFMetaData cfMetaData, ColumnDefinition column)
     {
         ColumnDef cd = new ColumnDef();
 
         cd.setName(ByteBufferUtil.clone(column.name.bytes));
         cd.setValidation_class(column.type.toString());
-        cd.setIndex_type(column.getIndexType() == null ? null : org.apache.cassandra.thrift.IndexType.valueOf(column.getIndexType().name()));
-        cd.setIndex_name(column.getIndexName());
-        cd.setIndex_options(column.getIndexOptions() == null ? null : Maps.newHashMap(column.getIndexOptions()));
+
+        // we include the index in the ColumnDef iff its targets are compatible with
+        // pre-3.0 indexes AND it is the only index defined on the given column, that is:
+        //   * it is the only index on the column (i.e. with this column as its target)
+        //   * it has only a single target, which matches the pattern for pre-3.0 indexes
+        //     i.e. keys/values/entries/full, with exactly 1 argument that matches the
+        //     column name OR a simple column name (for indexes on non-collection columns)
+        // n.b. it's a guess that using a pre-compiled regex and checking the group is
+        // cheaper than compiling a new regex for each column, but as this isn't on
+        // any hot path this hasn't been verified yet.
+        IndexMetadata matchedIndex = null;
+        for (IndexMetadata index : cfMetaData.getIndexes())
+        {
+            Pair<ColumnDefinition, IndexTarget.Type> target  = CassandraIndex.parseTarget(cfMetaData, index);
+            if (target.left.equals(column))
+            {
+                // we already found an index for this column, we've no option but to
+                // ignore both of them (and any others we've yet to find)
+                if (matchedIndex != null)
+                    return cd;
+
+                matchedIndex = index;
+            }
+        }
+
+        if (matchedIndex != null)
+        {
+            cd.setIndex_type(org.apache.cassandra.thrift.IndexType.valueOf(matchedIndex.kind.name()));
+            cd.setIndex_name(matchedIndex.name);
+            Map<String, String> filteredOptions = Maps.filterKeys(matchedIndex.options,
+                                                                  s -> !IndexTarget.TARGET_OPTION_NAME.equals(s));
+            cd.setIndex_options(filteredOptions.isEmpty()
+                                ? null
+                                : Maps.newHashMap(filteredOptions));
+        }
 
         return cd;
     }
 
-    private static List<ColumnDef> columnDefinitionsToThrift(Collection<ColumnDefinition> columns)
+    private static List<ColumnDef> columnDefinitionsToThrift(CFMetaData metadata, Collection<ColumnDefinition> columns)
     {
         List<ColumnDef> thriftDefs = new ArrayList<>(columns.size());
         for (ColumnDefinition def : columns)
-            if (def.kind == ColumnDefinition.Kind.REGULAR)
-                thriftDefs.add(ThriftConversion.toThrift(def));
+            if (def.isPartOfCellName(metadata.isCQLTable(), metadata.isSuper()))
+                thriftDefs.add(ThriftConversion.toThrift(metadata, def));
         return thriftDefs;
     }
 
-    private static Map<String, TriggerDefinition> triggerDefinitionsFromThrift(List<TriggerDef> thriftDefs)
+    private static Triggers triggerDefinitionsFromThrift(List<TriggerDef> thriftDefs)
     {
-        Map<String, TriggerDefinition> triggerDefinitions = new HashMap<>();
+        Triggers.Builder triggers = Triggers.builder();
         for (TriggerDef thriftDef : thriftDefs)
-            triggerDefinitions.put(thriftDef.getName(),
-                                   new TriggerDefinition(thriftDef.getName(), thriftDef.getOptions().get(TriggerDefinition.CLASS)));
-        return triggerDefinitions;
+            triggers.add(new TriggerMetadata(thriftDef.getName(), thriftDef.getOptions().get(TriggerMetadata.CLASS)));
+        return triggers.build();
     }
 
-    private static List<TriggerDef> triggerDefinitionsToThrift(Collection<TriggerDefinition> triggers)
+    private static List<TriggerDef> triggerDefinitionsToThrift(Triggers triggers)
     {
-        List<TriggerDef> thriftDefs = new ArrayList<>(triggers.size());
-        for (TriggerDefinition def : triggers)
+        List<TriggerDef> thriftDefs = new ArrayList<>();
+        for (TriggerMetadata def : triggers)
         {
             TriggerDef td = new TriggerDef();
             td.setName(def.name);
-            td.setOptions(Collections.singletonMap(TriggerDefinition.CLASS, def.classOption));
+            td.setOptions(Collections.singletonMap(TriggerMetadata.CLASS, def.classOption));
             thriftDefs.add(td);
         }
         return thriftDefs;
     }
+
+    @SuppressWarnings("deprecation")
+    public static Map<String, String> compressionParametersToThrift(CompressionParams parameters)
+    {
+        if (!parameters.isEnabled())
+            return Collections.emptyMap();
+
+        Map<String, String> options = new HashMap<>(parameters.getOtherOptions());
+        Class<? extends ICompressor> klass = parameters.getSstableCompressor().getClass();
+        options.put(CompressionParams.SSTABLE_COMPRESSION, klass.getName());
+        options.put(CompressionParams.CHUNK_LENGTH_KB, parameters.chunkLengthInKB());
+        return options;
+    }
+
+    private static String toThrift(CachingParams caching)
+    {
+        if (caching.cacheRows() && caching.cacheKeys())
+            return "ALL";
+
+        if (caching.cacheRows())
+            return "ROWS_ONLY";
+
+        if (caching.cacheKeys())
+            return "KEYS_ONLY";
+
+        return "NONE";
+    }
+
+    private static CachingParams cachingFromTrhfit(String caching)
+    {
+        switch (caching.toUpperCase(Locale.ENGLISH))
+        {
+            case "ALL":
+                return CachingParams.CACHE_EVERYTHING;
+            case "ROWS_ONLY":
+                return new CachingParams(false, Integer.MAX_VALUE);
+            case "KEYS_ONLY":
+                return CachingParams.CACHE_KEYS;
+            case "NONE":
+                return CachingParams.CACHE_NOTHING;
+            default:
+                throw new ConfigurationException(String.format("Invalid value %s for caching parameter", caching));
+        }
+    }
+
+    private static String toThriftCellsPerRow(CachingParams caching)
+    {
+        return caching.cacheAllRows()
+             ? "ALL"
+             : String.valueOf(caching.rowsPerPartitionToCache());
+    }
+
+    private static int fromThriftCellsPerRow(String value)
+    {
+        return "ALL".equals(value)
+             ? Integer.MAX_VALUE
+             : Integer.parseInt(value);
+    }
+
+    public static CachingParams cachingFromThrift(String caching, String cellsPerRow)
+    {
+        boolean cacheKeys = true;
+        int rowsPerPartitionToCache = 0;
+
+        // if we get a caching string from thrift it is legacy, "ALL", "KEYS_ONLY" etc
+        if (caching != null)
+        {
+            CachingParams parsed = cachingFromTrhfit(caching);
+            cacheKeys = parsed.cacheKeys();
+            rowsPerPartitionToCache = parsed.rowsPerPartitionToCache();
+        }
+
+        // if we get cells_per_row from thrift, it is either "ALL" or "<number of cells to cache>".
+        if (cellsPerRow != null && rowsPerPartitionToCache > 0)
+            rowsPerPartitionToCache = fromThriftCellsPerRow(cellsPerRow);
+
+        return new CachingParams(cacheKeys, rowsPerPartitionToCache);
+    }
 }

diff --git a/src/java/org/apache/cassandra/thrift/ThriftResultsMerger.java b/src/java/org/apache/cassandra/thrift/ThriftResultsMerger.java
new file mode 100644
index 0000000..ea3fa2f
--- /dev/null
+++ b/src/java/org/apache/cassandra/thrift/ThriftResultsMerger.java

@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.thrift;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.utils.AbstractIterator;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.PeekingIterator;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.partitions.*;
+
+/**
+ * Given an iterator on a partition of a compact table, this return an iterator that merges the
+ * static row columns with the other results.
+ *
+ * Compact tables stores thrift column_metadata as static columns (see CompactTables for
+ * details). When reading for thrift however, we want to merge those static values with other
+ * results because:
+ *   1) on thrift, all "columns" are sorted together, whether or not they are declared
+ *      column_metadata.
+ *   2) it's possible that a table add a value for a "dynamic" column, and later that column
+ *      is statically defined. Merging "static" and "dynamic" columns make sure we don't miss
+ *      a value prior to the column declaration.
+ *
+ * For example, if a thrift table declare 2 columns "c1" and "c5" and the results from a query
+ * is:
+ *    Partition: static: { c1: 3, c5: 4 }
+ *                 "a" : { value : 2 }
+ *                 "c3": { value : 8 }
+ *                 "c7": { value : 1 }
+ * then this class transform it into:
+ *    Partition:   "a" : { value : 2 }
+ *                 "c1": { value : 3 }
+ *                 "c3": { value : 8 }
+ *                 "c5": { value : 4 }
+ *                 "c7": { value : 1 }
+ */
+public class ThriftResultsMerger extends Transformation<UnfilteredRowIterator>
+{
+    private final int nowInSec;
+
+    private ThriftResultsMerger(int nowInSec)
+    {
+        this.nowInSec = nowInSec;
+    }
+
+    public static UnfilteredPartitionIterator maybeWrap(UnfilteredPartitionIterator iterator, CFMetaData metadata, int nowInSec)
+    {
+        if (!metadata.isStaticCompactTable() && !metadata.isSuper())
+            return iterator;
+
+        return Transformation.apply(iterator, new ThriftResultsMerger(nowInSec));
+    }
+
+    public static UnfilteredRowIterator maybeWrap(UnfilteredRowIterator iterator, int nowInSec)
+    {
+        if (!iterator.metadata().isStaticCompactTable() && !iterator.metadata().isSuper())
+            return iterator;
+
+        return iterator.metadata().isSuper()
+             ? Transformation.apply(iterator, new SuperColumnsPartitionMerger(iterator, nowInSec))
+             : new PartitionMerger(iterator, nowInSec);
+    }
+
+    @Override
+    public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator iter)
+    {
+        return iter.metadata().isSuper()
+             ? Transformation.apply(iter, new SuperColumnsPartitionMerger(iter, nowInSec))
+             : new PartitionMerger(iter, nowInSec);
+    }
+
+    private static class PartitionMerger extends WrappingUnfilteredRowIterator
+    {
+        private final int nowInSec;
+
+        // We initialize lazily to avoid having this iterator fetch the wrapped iterator before it's actually asked for it.
+        private boolean isInit;
+
+        private Iterator<Cell> staticCells;
+
+        private final Row.Builder builder;
+        private Row nextToMerge;
+        private Unfiltered nextFromWrapped;
+
+        private PartitionMerger(UnfilteredRowIterator results, int nowInSec)
+        {
+            super(results);
+            assert results.metadata().isStaticCompactTable();
+            this.nowInSec = nowInSec;
+            this.builder = BTreeRow.sortedBuilder();
+        }
+
+        private void init()
+        {
+            assert !isInit;
+            Row staticRow = super.staticRow();
+            assert !staticRow.hasComplex();
+
+            staticCells = staticRow.cells().iterator();
+            updateNextToMerge();
+            isInit = true;
+        }
+
+        @Override
+        public Row staticRow()
+        {
+            return Rows.EMPTY_STATIC_ROW;
+        }
+
+        @Override
+        public boolean hasNext()
+        {
+            if (!isInit)
+                init();
+
+            return nextFromWrapped != null || nextToMerge != null || super.hasNext();
+        }
+
+        @Override
+        public Unfiltered next()
+        {
+            if (!isInit)
+                init();
+
+            if (nextFromWrapped == null && super.hasNext())
+                nextFromWrapped = super.next();
+
+            if (nextFromWrapped == null)
+            {
+                if (nextToMerge == null)
+                    throw new NoSuchElementException();
+
+                return consumeNextToMerge();
+            }
+
+            if (nextToMerge == null)
+                return consumeNextWrapped();
+
+            int cmp = metadata().comparator.compare(nextToMerge, nextFromWrapped);
+            if (cmp < 0)
+                return consumeNextToMerge();
+            if (cmp > 0)
+                return consumeNextWrapped();
+
+            // Same row, so merge them
+            assert nextFromWrapped instanceof Row;
+            return Rows.merge((Row)consumeNextWrapped(), consumeNextToMerge(), nowInSec);
+        }
+
+        private Unfiltered consumeNextWrapped()
+        {
+            Unfiltered toReturn = nextFromWrapped;
+            nextFromWrapped = null;
+            return toReturn;
+        }
+
+        private Row consumeNextToMerge()
+        {
+            Row toReturn = nextToMerge;
+            updateNextToMerge();
+            return toReturn;
+        }
+
+        private void updateNextToMerge()
+        {
+            if (!staticCells.hasNext())
+            {
+                // Nothing more to merge.
+                nextToMerge = null;
+                return;
+            }
+
+            Cell cell = staticCells.next();
+
+            // Given a static cell, the equivalent row uses the column name as clustering and the value as unique cell value.
+            builder.newRow(new Clustering(cell.column().name.bytes));
+            builder.addCell(new BufferCell(metadata().compactValueColumn(), cell.timestamp(), cell.ttl(), cell.localDeletionTime(), cell.value(), cell.path()));
+            nextToMerge = builder.build();
+        }
+    }
+
+    private static class SuperColumnsPartitionMerger extends Transformation
+    {
+        private final int nowInSec;
+        private final Row.Builder builder;
+        private final ColumnDefinition superColumnMapColumn;
+        private final AbstractType<?> columnComparator;
+
+        private SuperColumnsPartitionMerger(UnfilteredRowIterator applyTo, int nowInSec)
+        {
+            assert applyTo.metadata().isSuper();
+            this.nowInSec = nowInSec;
+
+            this.superColumnMapColumn = applyTo.metadata().compactValueColumn();
+            assert superColumnMapColumn != null && superColumnMapColumn.type instanceof MapType;
+
+            this.builder = BTreeRow.sortedBuilder();
+            this.columnComparator = ((MapType)superColumnMapColumn.type).nameComparator();
+        }
+
+        @Override
+        public Row applyToRow(Row row)
+        {
+            PeekingIterator<Cell> staticCells = Iterators.peekingIterator(simpleCellsIterator(row));
+            if (!staticCells.hasNext())
+                return row;
+
+            builder.newRow(row.clustering());
+
+            ComplexColumnData complexData = row.getComplexColumnData(superColumnMapColumn);
+            
+            PeekingIterator<Cell> dynamicCells;
+            if (complexData == null)
+            {
+                dynamicCells = Iterators.peekingIterator(Collections.<Cell>emptyIterator());
+            }
+            else
+            {
+                dynamicCells = Iterators.peekingIterator(complexData.iterator());
+                builder.addComplexDeletion(superColumnMapColumn, complexData.complexDeletion());
+            }
+
+            while (staticCells.hasNext() && dynamicCells.hasNext())
+            {
+                Cell staticCell = staticCells.peek();
+                Cell dynamicCell = dynamicCells.peek();
+                int cmp = columnComparator.compare(staticCell.column().name.bytes, dynamicCell.path().get(0));
+                if (cmp < 0)
+                    builder.addCell(makeDynamicCell(staticCells.next()));
+                else if (cmp > 0)
+                    builder.addCell(dynamicCells.next());
+                else
+                    builder.addCell(Cells.reconcile(makeDynamicCell(staticCells.next()), dynamicCells.next(), nowInSec));
+            }
+
+            while (staticCells.hasNext())
+                builder.addCell(makeDynamicCell(staticCells.next()));
+            while (dynamicCells.hasNext())
+                builder.addCell(dynamicCells.next());
+
+            return builder.build();
+        }
+
+        private Cell makeDynamicCell(Cell staticCell)
+        {
+            return new BufferCell(superColumnMapColumn, staticCell.timestamp(), staticCell.ttl(), staticCell.localDeletionTime(), staticCell.value(), CellPath.create(staticCell.column().name.bytes));
+        }
+
+        private Iterator<Cell> simpleCellsIterator(Row row)
+        {
+            final Iterator<Cell> cells = row.cells().iterator();
+            return new AbstractIterator<Cell>()
+            {
+                protected Cell computeNext()
+                {
+                    if (cells.hasNext())
+                    {
+                        Cell cell = cells.next();
+                        if (cell.column().isSimple())
+                            return cell;
+                    }
+                    return endOfData();
+                }
+            };
+        }
+    }
+}
+

diff --git a/src/java/org/apache/cassandra/thrift/ThriftServer.java b/src/java/org/apache/cassandra/thrift/ThriftServer.java
index 87dcd3e..44ec524 100644
--- a/src/java/org/apache/cassandra/thrift/ThriftServer.java
+++ b/src/java/org/apache/cassandra/thrift/ThriftServer.java

@@ -77,11 +77,6 @@
         }
     }
 
-    public void stopAndAwaitTermination()
-    {
-        stop();
-    }
-
     public boolean isRunning()
     {
         return server != null;

diff --git a/src/java/org/apache/cassandra/thrift/ThriftValidation.java b/src/java/org/apache/cassandra/thrift/ThriftValidation.java
index 8bdf9dc..4b208ba 100644
--- a/src/java/org/apache/cassandra/thrift/ThriftValidation.java
+++ b/src/java/org/apache/cassandra/thrift/ThriftValidation.java

@@ -18,28 +18,25 @@
 package org.apache.cassandra.thrift;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.Attributes;
-import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexManager;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ColumnToCollectionType;
-import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
 import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -108,6 +105,11 @@
     // To be used when the operation should be authorized whether this is a counter CF or not
     public static CFMetaData validateColumnFamily(String keyspaceName, String cfName) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
+        return validateColumnFamilyWithCompactMode(keyspaceName, cfName, false);
+    }
+
+    public static CFMetaData validateColumnFamilyWithCompactMode(String keyspaceName, String cfName, boolean noCompactMode) throws org.apache.cassandra.exceptions.InvalidRequestException
+    {
         validateKeyspace(keyspaceName);
         if (cfName.isEmpty())
             throw new org.apache.cassandra.exceptions.InvalidRequestException("non-empty table is required");
@@ -116,7 +118,10 @@
         if (metadata == null)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("unconfigured table " + cfName);
 
-        return metadata;
+        if (metadata.isCompactTable() && noCompactMode)
+            return metadata.asNonCompact();
+        else
+            return metadata;
     }
 
     /**
@@ -124,7 +129,7 @@
      */
     public static void validateColumnPath(CFMetaData metadata, ColumnPath column_path) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        if (metadata.cfType == ColumnFamilyType.Standard)
+        if (!metadata.isSuper())
         {
             if (column_path.super_column != null)
             {
@@ -152,7 +157,7 @@
 
     public static void validateColumnParent(CFMetaData metadata, ColumnParent column_parent) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        if (metadata.cfType == ColumnFamilyType.Standard)
+        if (!metadata.isSuper())
         {
             if (column_parent.super_column != null)
             {
@@ -169,14 +174,7 @@
     // column_path_or_parent is a ColumnPath for remove, where the "column" is optional even for a standard CF
     static void validateColumnPathOrParent(CFMetaData metadata, ColumnPath column_path_or_parent) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        if (metadata.cfType == ColumnFamilyType.Standard)
-        {
-            if (column_path_or_parent.super_column != null)
-            {
-                throw new org.apache.cassandra.exceptions.InvalidRequestException("supercolumn may not be specified for standard CF " + metadata.cfName);
-            }
-        }
-        if (metadata.cfType == ColumnFamilyType.Super)
+        if (metadata.isSuper())
         {
             if (column_path_or_parent.super_column == null && column_path_or_parent.column != null)
             {
@@ -184,6 +182,13 @@
                                                                           + metadata.cfName);
             }
         }
+        else
+        {
+            if (column_path_or_parent.super_column != null)
+            {
+                throw new org.apache.cassandra.exceptions.InvalidRequestException("supercolumn may not be specified for standard CF " + metadata.cfName);
+            }
+        }
         if (column_path_or_parent.column != null)
         {
             validateColumnNames(metadata, column_path_or_parent.super_column, Arrays.asList(column_path_or_parent.column));
@@ -194,13 +199,30 @@
         }
     }
 
+    private static AbstractType<?> getThriftColumnNameComparator(CFMetaData metadata, ByteBuffer superColumnName)
+    {
+        if (!metadata.isSuper())
+            return LegacyLayout.makeLegacyComparator(metadata);
+
+        if (superColumnName == null)
+        {
+            // comparator for super column name
+            return metadata.comparator.subtype(0);
+        }
+        else
+        {
+            // comparator for sub columns
+            return metadata.thriftColumnNameType();
+        }
+    }
+
     /**
      * Validates the column names but not the parent path or data
      */
     private static void validateColumnNames(CFMetaData metadata, ByteBuffer superColumnName, Iterable<ByteBuffer> column_names)
     throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        int maxNameLength = Cell.MAX_NAME_LENGTH;
+        int maxNameLength = LegacyLayout.MAX_CELL_NAME_LENGTH;
 
         if (superColumnName != null)
         {
@@ -208,10 +230,10 @@
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("supercolumn name length must not be greater than " + maxNameLength);
             if (superColumnName.remaining() == 0)
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("supercolumn name must not be empty");
-            if (metadata.cfType == ColumnFamilyType.Standard)
+            if (!metadata.isSuper())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("supercolumn specified to table " + metadata.cfName + " containing normal columns");
         }
-        AbstractType<?> comparator = SuperColumns.getComparatorFor(metadata, superColumnName);
+        AbstractType<?> comparator = getThriftColumnNameComparator(metadata, superColumnName);
         boolean isCQL3Table = !metadata.isThriftCompatible();
         for (ByteBuffer name : column_names)
         {
@@ -230,31 +252,28 @@
 
             if (isCQL3Table)
             {
-                // CQL3 table don't support having only part of their composite column names set
-                Composite composite = metadata.comparator.fromByteBuffer(name);
-
-                int minComponents = metadata.comparator.clusteringPrefixSize() + 1;
-                if (composite.size() < minComponents)
-                    throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Not enough components (found %d but %d expected) for column name since %s is a CQL3 table",
-                                                                                                    composite.size(), minComponents, metadata.cfName));
-
-                // Furthermore, the column name must be a declared one.
-                int columnIndex = metadata.comparator.clusteringPrefixSize();
-                ByteBuffer CQL3ColumnName = composite.get(columnIndex);
-                if (!CQL3ColumnName.hasRemaining())
-                    continue; // Row marker, ok
-
-                ColumnIdentifier columnId = new ColumnIdentifier(CQL3ColumnName, metadata.comparator.subtype(columnIndex));
-                if (metadata.getColumnDefinition(columnId) == null)
-                    throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Invalid cell for CQL3 table %s. The CQL3 column component (%s) does not correspond to a defined CQL3 column",
-                                                                                                    metadata.cfName, columnId));
-
-                // On top of that, if we have a collection component, he (CQL3) column must be a collection
-                if (metadata.comparator.hasCollections() && composite.size() == metadata.comparator.size())
+                try
                 {
-                    ColumnToCollectionType collectionType = metadata.comparator.collectionType();
-                    if (!collectionType.defined.containsKey(CQL3ColumnName))
-                        throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Invalid collection component, %s is not a collection", UTF8Type.instance.getString(CQL3ColumnName)));
+                    LegacyLayout.LegacyCellName cname = LegacyLayout.decodeCellName(metadata, name);
+                    assert cname.clustering.size() == metadata.comparator.size();
+
+                    // CQL3 table don't support having only part of their composite column names set
+                    for (int i = 0; i < cname.clustering.size(); i++)
+                    {
+                        if (cname.clustering.get(i) == null)
+                            throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Not enough components (found %d but %d expected) for column name since %s is a CQL3 table",
+                                                                                                            i, metadata.comparator.size() + 1, metadata.cfName));
+                    }
+
+
+
+                    // On top of that, if we have a collection component, the (CQL3) column must be a collection
+                    if (cname.column != null && cname.collectionElement != null && !cname.column.type.isCollection())
+                        throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Invalid collection component, %s is not a collection", cname.column.name));
+                }
+                catch (IllegalArgumentException | UnknownColumnException e)
+                {
+                    throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Error validating cell name for CQL3 table %s: %s", metadata.cfName, e.getMessage()));
                 }
             }
         }
@@ -270,13 +289,13 @@
         if (range.count < 0)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("get_slice requires non-negative count");
 
-        int maxNameLength = Cell.MAX_NAME_LENGTH;
+        int maxNameLength = LegacyLayout.MAX_CELL_NAME_LENGTH;
         if (range.start.remaining() > maxNameLength)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("range start length cannot be larger than " + maxNameLength);
         if (range.finish.remaining() > maxNameLength)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("range finish length cannot be larger than " + maxNameLength);
 
-        AbstractType<?> comparator = SuperColumns.getComparatorFor(metadata, column_parent.super_column);
+        AbstractType<?> comparator = getThriftColumnNameComparator(metadata, column_parent.super_column);
         try
         {
             comparator.validate(range.start);
@@ -296,7 +315,7 @@
         }
     }
 
-    public static void validateColumnOrSuperColumn(CFMetaData metadata, ByteBuffer key, ColumnOrSuperColumn cosc)
+    public static void validateColumnOrSuperColumn(CFMetaData metadata, ColumnOrSuperColumn cosc)
             throws org.apache.cassandra.exceptions.InvalidRequestException
     {
         boolean isCommutative = metadata.isCounter();
@@ -317,7 +336,7 @@
 
             validateTtl(metadata, cosc.column);
             validateColumnPath(metadata, new ColumnPath(metadata.cfName).setSuper_column((ByteBuffer)null).setColumn(cosc.column.name));
-            validateColumnData(metadata, key, null, cosc.column);
+            validateColumnData(metadata, null, cosc.column);
         }
 
         if (cosc.super_column != null)
@@ -328,7 +347,7 @@
             for (Column c : cosc.super_column.columns)
             {
                 validateColumnPath(metadata, new ColumnPath(metadata.cfName).setSuper_column(cosc.super_column.name).setColumn(c.name));
-                validateColumnData(metadata, key, cosc.super_column.name, c);
+                validateColumnData(metadata, cosc.super_column.name, c);
             }
         }
 
@@ -357,19 +376,19 @@
             if (column.ttl <= 0)
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("ttl must be positive");
 
-            if (column.ttl > ExpiringCell.MAX_TTL)
-                throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", column.ttl, ExpiringCell.MAX_TTL));
-            Attributes.maybeApplyExpirationDateOverflowPolicy(metadata, column.ttl, false);
+            if (column.ttl > Attributes.MAX_TTL)
+                throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("ttl is too large. requested (%d) maximum (%d)", column.ttl, Attributes.MAX_TTL));
+            ExpirationDateOverflowHandling.maybeApplyExpirationDateOverflowPolicy(metadata, column.ttl, false);
         }
         else
         {
-            Attributes.maybeApplyExpirationDateOverflowPolicy(metadata, metadata.getDefaultTimeToLive(), true);
+            ExpirationDateOverflowHandling.maybeApplyExpirationDateOverflowPolicy(metadata, metadata.params.defaultTimeToLive, true);
             // if it's not set, then it should be zero -- here we are just checking to make sure Thrift doesn't change that contract with us.
             assert column.ttl == 0;
         }
     }
 
-    public static void validateMutation(CFMetaData metadata, ByteBuffer key, Mutation mut)
+    public static void validateMutation(CFMetaData metadata, Mutation mut)
             throws org.apache.cassandra.exceptions.InvalidRequestException
     {
         ColumnOrSuperColumn cosc = mut.column_or_supercolumn;
@@ -386,7 +405,7 @@
 
         if (cosc != null)
         {
-            validateColumnOrSuperColumn(metadata, key, cosc);
+            validateColumnOrSuperColumn(metadata, cosc);
         }
         else
         {
@@ -403,7 +422,7 @@
         if (del.predicate != null)
             validateSlicePredicate(metadata, del.super_column, del.predicate);
 
-        if (metadata.cfType == ColumnFamilyType.Standard && del.super_column != null)
+        if (!metadata.isSuper() && del.super_column != null)
         {
             String msg = String.format("Deletion of super columns is not possible on a standard table (KeySpace=%s Table=%s Deletion=%s)", metadata.ksName, metadata.cfName, del);
             throw new org.apache.cassandra.exceptions.InvalidRequestException(msg);
@@ -412,7 +431,7 @@
         if (metadata.isCounter())
         {
             // forcing server timestamp even if a timestamp was set for coherence with other counter operation
-            del.timestamp = System.currentTimeMillis();
+            del.timestamp = FBUtilities.timestampMicros();
         }
         else if (!del.isSetTimestamp())
         {
@@ -435,7 +454,7 @@
     /**
      * Validates the data part of the column (everything in the column object but the name, which is assumed to be valid)
      */
-    public static void validateColumnData(CFMetaData metadata, ByteBuffer key, ByteBuffer scName, Column column) throws org.apache.cassandra.exceptions.InvalidRequestException
+    public static void validateColumnData(CFMetaData metadata, ByteBuffer scName, Column column) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
         validateTtl(metadata, column);
         if (!column.isSetValue())
@@ -443,14 +462,18 @@
         if (!column.isSetTimestamp())
             throw new org.apache.cassandra.exceptions.InvalidRequestException("Column timestamp is required");
 
-        CellName cn = scName == null
-                    ? metadata.comparator.cellFromByteBuffer(column.name)
-                    : metadata.comparator.makeCellName(scName, column.name);
         try
         {
-            AbstractType<?> validator = metadata.getValueValidator(cn);
-            if (validator != null)
-                validator.validate(column.value);
+            LegacyLayout.LegacyCellName cn = LegacyLayout.decodeCellName(metadata, scName, column.name);
+            if (cn.column.isPrimaryKeyColumn())
+                throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Cannot add primary key column %s to partition update", cn.column.name));
+
+            cn.column.validateCellValue(column.value);
+
+        }
+        catch (UnknownColumnException e)
+        {
+            throw new org.apache.cassandra.exceptions.InvalidRequestException(e.getMessage());
         }
         catch (MarshalException me)
         {
@@ -461,25 +484,9 @@
                                                                       me.getMessage(),
                                                                       metadata.ksName,
                                                                       metadata.cfName,
-                                                                      (SuperColumns.getComparatorFor(metadata, scName != null)).getString(column.name)));
+                                                                      (getThriftColumnNameComparator(metadata, scName)).getString(column.name)));
         }
 
-        // Indexed column values cannot be larger than 64K.  See CASSANDRA-3057/4240 for more details
-        SecondaryIndex failedIndex = Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager.validate(key, asDBColumn(cn, column));
-        if (failedIndex != null)
-                    throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Can't index column value of size %d for index %s in CF %s of KS %s",
-                                                                              column.value.remaining(),
-                                                                              failedIndex.getIndexName(),
-                                                                              metadata.cfName,
-                                                                              metadata.ksName));
-    }
-
-    private static Cell asDBColumn(CellName name, Column column)
-    {
-        if (column.ttl <= 0)
-            return new BufferCell(name, column.value, column.timestamp);
-        else
-            return new BufferExpiringCell(name, column.value, column.timestamp, column.ttl);
     }
 
     /**
@@ -521,7 +528,7 @@
         if (range.start_token != null && range.end_key != null)
             throw new org.apache.cassandra.exceptions.InvalidRequestException("start token + end key is not a supported key range");
 
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = metadata.partitioner;
 
         if (range.start_key != null && range.end_key != null)
         {
@@ -538,8 +545,8 @@
         else if (range.start_key != null && range.end_token != null)
         {
             // start_token/end_token can wrap, but key/token should not
-            RowPosition stop = p.getTokenFactory().fromString(range.end_token).maxKeyBound();
-            if (RowPosition.ForKey.get(range.start_key, p).compareTo(stop) > 0 && !stop.isMinimum())
+            PartitionPosition stop = p.getTokenFactory().fromString(range.end_token).maxKeyBound();
+            if (PartitionPosition.ForKey.get(range.start_key, p).compareTo(stop) > 0 && !stop.isMinimum())
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("Start key's token sorts after end token");
         }
 
@@ -580,7 +587,7 @@
             return false;
 
         SecondaryIndexManager idxManager = Keyspace.open(metadata.ksName).getColumnFamilyStore(metadata.cfName).indexManager;
-        AbstractType<?> nameValidator = SuperColumns.getComparatorFor(metadata, null);
+        AbstractType<?> nameValidator = getThriftColumnNameComparator(metadata, null);
 
         boolean isIndexed = false;
         for (IndexExpression expression : index_clause)
@@ -600,11 +607,18 @@
             if (expression.value.remaining() > 0xFFFF)
                 throw new org.apache.cassandra.exceptions.InvalidRequestException("Index expression values may not be larger than 64K");
 
-            CellName name = metadata.comparator.cellFromByteBuffer(expression.column_name);
-            AbstractType<?> valueValidator = metadata.getValueValidator(name);
+            ColumnDefinition def = metadata.getColumnDefinition(expression.column_name);
+            if (def == null)
+            {
+                if (!metadata.isCompactTable())
+                    throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("Unknown column %s", nameValidator.getString(expression.column_name)));
+
+                def = metadata.compactValueColumn();
+            }
+
             try
             {
-                valueValidator.validate(expression.value);
+                def.type.validate(expression.value);
             }
             catch (MarshalException me)
             {
@@ -614,7 +628,8 @@
                                                                                   me.getMessage()));
             }
 
-            isIndexed |= (expression.op == IndexOperator.EQ) && idxManager.indexes(name);
+            for(Index index : idxManager.listIndexes())
+                isIndexed |= index.supportsExpression(def, Operator.valueOf(expression.op.name()));
         }
 
         return isIndexed;
@@ -636,36 +651,36 @@
 
     public static void validateKeyspaceNotSystem(String modifiedKeyspace) throws org.apache.cassandra.exceptions.InvalidRequestException
     {
-        if (modifiedKeyspace.equalsIgnoreCase(SystemKeyspace.NAME))
-            throw new org.apache.cassandra.exceptions.InvalidRequestException("system keyspace is not user-modifiable");
+        if (Schema.isLocalSystemKeyspace(modifiedKeyspace))
+            throw new org.apache.cassandra.exceptions.InvalidRequestException(String.format("%s keyspace is not user-modifiable", modifiedKeyspace));
     }
 
-    public static IDiskAtomFilter asIFilter(SlicePredicate sp, CFMetaData metadata, ByteBuffer superColumn)
-    {
-        SliceRange sr = sp.slice_range;
-        IDiskAtomFilter filter;
+    //public static IDiskAtomFilter asIFilter(SlicePredicate sp, CFMetaData metadata, ByteBuffer superColumn)
+    //{
+    //    SliceRange sr = sp.slice_range;
+    //    IDiskAtomFilter filter;
 
-        CellNameType comparator = metadata.isSuper()
-                                ? new SimpleDenseCellNameType(metadata.comparator.subtype(superColumn == null ? 0 : 1))
-                                : metadata.comparator;
-        if (sr == null)
-        {
+    //    CellNameType comparator = metadata.isSuper()
+    //                            ? new SimpleDenseCellNameType(metadata.comparator.subtype(superColumn == null ? 0 : 1))
+    //                            : metadata.comparator;
+    //    if (sr == null)
+    //    {
 
-            SortedSet<CellName> ss = new TreeSet<CellName>(comparator);
-            for (ByteBuffer bb : sp.column_names)
-                ss.add(comparator.cellFromByteBuffer(bb));
-            filter = new NamesQueryFilter(ss);
-        }
-        else
-        {
-            filter = new SliceQueryFilter(comparator.fromByteBuffer(sr.start),
-                                          comparator.fromByteBuffer(sr.finish),
-                                          sr.reversed,
-                                          sr.count);
-        }
+    //        SortedSet<CellName> ss = new TreeSet<CellName>(comparator);
+    //        for (ByteBuffer bb : sp.column_names)
+    //            ss.add(comparator.cellFromByteBuffer(bb));
+    //        filter = new NamesQueryFilter(ss);
+    //    }
+    //    else
+    //    {
+    //        filter = new SliceQueryFilter(comparator.fromByteBuffer(sr.start),
+    //                                      comparator.fromByteBuffer(sr.finish),
+    //                                      sr.reversed,
+    //                                      sr.count);
+    //    }
 
-        if (metadata.isSuper())
-            filter = SuperColumns.fromSCFilter(metadata.comparator, superColumn, filter);
-        return filter;
-    }
+    //    if (metadata.isSuper())
+    //        filter = SuperColumns.fromSCFilter(metadata.comparator, superColumn, filter);
+    //    return filter;
+    //}
 }

diff --git a/src/java/org/apache/cassandra/tools/BulkLoader.java b/src/java/org/apache/cassandra/tools/BulkLoader.java
index 093a063..c1849f8 100644
--- a/src/java/org/apache/cassandra/tools/BulkLoader.java
+++ b/src/java/org/apache/cassandra/tools/BulkLoader.java

@@ -19,6 +19,8 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.net.UnknownHostException;
@@ -28,6 +30,9 @@
 import com.google.common.collect.Multimap;
 import org.apache.commons.cli.*;
 
+import com.datastax.driver.core.AuthProvider;
+import com.datastax.driver.core.JdkSSLOptions;
+import com.datastax.driver.core.PlainTextAuthProvider;
 import com.datastax.driver.core.SSLOptions;
 import javax.net.ssl.SSLContext;
 import org.apache.cassandra.config.*;
@@ -52,6 +57,7 @@
     private static final String SSL_STORAGE_PORT_OPTION = "ssl-storage-port";
     private static final String USER_OPTION = "username";
     private static final String PASSWD_OPTION = "password";
+    private static final String AUTH_PROVIDER_OPTION = "auth-provider";
     private static final String THROTTLE_MBITS = "throttle";
     private static final String INTER_DC_THROTTLE_MBITS = "inter-dc-throttle";
 
@@ -70,15 +76,14 @@
     public static void main(String args[])
     {
         Config.setClientMode(true);
-        LoaderOptions options = LoaderOptions.parseArgs(args);
+        LoaderOptions options = LoaderOptions.parseArgs(args).validateArguments();
         OutputHandler handler = new OutputHandler.SystemOutput(options.verbose, options.debug);
         SSTableLoader loader = new SSTableLoader(
-                options.directory,
+                options.directory.getAbsoluteFile(),
                 new ExternalClient(
                         options.hosts,
                         options.nativePort,
-                        options.user,
-                        options.passwd,
+                        options.authProvider,
                         options.storagePort,
                         options.sslStoragePort,
                         options.serverEncOptions,
@@ -270,7 +275,10 @@
             throw new RuntimeException("Could not create SSL Context.", e);
         }
 
-        return new SSLOptions(sslContext, clientEncryptionOptions.cipher_suites);
+        return JdkSSLOptions.builder()
+                            .withSSLContext(sslContext)
+                            .withCipherSuites(clientEncryptionOptions.cipher_suites)
+                            .build();
     }
 
     static class ExternalClient extends NativeSSTableLoaderClient
@@ -281,14 +289,13 @@
 
         public ExternalClient(Set<InetAddress> hosts,
                               int port,
-                              String user,
-                              String passwd,
+                              AuthProvider authProvider,
                               int storagePort,
                               int sslStoragePort,
                               EncryptionOptions.ServerEncryptionOptions serverEncryptionOptions,
                               SSLOptions sslOptions)
         {
-            super(hosts, port, user, passwd, sslOptions);
+            super(hosts, port, authProvider, sslOptions);
             this.storagePort = storagePort;
             this.sslStoragePort = sslStoragePort;
             this.serverEncOptions = serverEncryptionOptions;
@@ -311,6 +318,8 @@
         public int nativePort;
         public String user;
         public String passwd;
+        public String authProviderName;
+        public AuthProvider authProvider;
         public int throttle = 0;
         public int interDcThrottle = 0;
         public int storagePort;
@@ -376,6 +385,9 @@
                 if (cmd.hasOption(PASSWD_OPTION))
                     opts.passwd = cmd.getOptionValue(PASSWD_OPTION);
 
+                if (cmd.hasOption(AUTH_PROVIDER_OPTION))
+                    opts.authProviderName = cmd.getOptionValue(AUTH_PROVIDER_OPTION);
+
                 if (cmd.hasOption(INITIAL_HOST_ADDRESS_OPTION))
                 {
                     String[] nodes = cmd.getOptionValue(INITIAL_HOST_ADDRESS_OPTION).split(",");
@@ -437,38 +449,24 @@
                     config.stream_throughput_outbound_megabits_per_sec = 0;
                     config.inter_dc_stream_throughput_outbound_megabits_per_sec = 0;
                 }
+
+                if (cmd.hasOption(NATIVE_PORT_OPTION))
+                    opts.nativePort = Integer.parseInt(cmd.getOptionValue(NATIVE_PORT_OPTION));
+                else
+                    opts.nativePort = config.native_transport_port;
+                if (cmd.hasOption(STORAGE_PORT_OPTION))
+                    opts.storagePort = Integer.parseInt(cmd.getOptionValue(STORAGE_PORT_OPTION));
+                else
+                    opts.storagePort = config.storage_port;
+                if (cmd.hasOption(SSL_STORAGE_PORT_OPTION))
+                    opts.sslStoragePort = Integer.parseInt(cmd.getOptionValue(SSL_STORAGE_PORT_OPTION));
+                else
+                    opts.sslStoragePort = config.ssl_storage_port;
                 opts.throttle = config.stream_throughput_outbound_megabits_per_sec;
                 opts.interDcThrottle = config.inter_dc_stream_throughput_outbound_megabits_per_sec;
                 opts.clientEncOptions = config.client_encryption_options;
                 opts.serverEncOptions = config.server_encryption_options;
 
-                if (cmd.hasOption(NATIVE_PORT_OPTION))
-                {
-                    opts.nativePort = Integer.parseInt(cmd.getOptionValue(NATIVE_PORT_OPTION));
-                }
-                else
-                {
-                    opts.nativePort = config.native_transport_port;
-                }
-
-                if (cmd.hasOption(STORAGE_PORT_OPTION))
-                {
-                    opts.storagePort = Integer.parseInt(cmd.getOptionValue(STORAGE_PORT_OPTION));
-                }
-                else
-                {
-                    opts.storagePort = config.storage_port;
-                }
-
-                if (cmd.hasOption(SSL_STORAGE_PORT_OPTION))
-                {
-                    opts.sslStoragePort = Integer.parseInt(cmd.getOptionValue(SSL_STORAGE_PORT_OPTION));
-                }
-                else
-                {
-                    opts.sslStoragePort = config.ssl_storage_port;
-                }
-
                 if (cmd.hasOption(THROTTLE_MBITS))
                 {
                     opts.throttle = Integer.parseInt(cmd.getOptionValue(THROTTLE_MBITS));
@@ -536,6 +534,64 @@
             }
         }
 
+        public LoaderOptions validateArguments()
+        {
+            // Both username and password need to be provided
+            if ((user != null) != (passwd != null))
+                errorMsg("Username and password must both be provided", getCmdLineOptions());
+
+            if (user != null)
+            {
+                // Support for 3rd party auth providers that support plain text credentials.
+                // In this case the auth provider must provide a constructor of the form:
+                //
+                // public MyAuthProvider(String username, String password)
+                if (authProviderName != null)
+                {
+                    try
+                    {
+                        Class authProviderClass = Class.forName(authProviderName);
+                        Constructor constructor = authProviderClass.getConstructor(String.class, String.class);
+                        authProvider = (AuthProvider)constructor.newInstance(user, passwd);
+                    }
+                    catch (ClassNotFoundException e)
+                    {
+                        errorMsg("Unknown auth provider: " + e.getMessage(), getCmdLineOptions());
+                    }
+                    catch (NoSuchMethodException e)
+                    {
+                        errorMsg("Auth provider does not support plain text credentials: " + e.getMessage(), getCmdLineOptions());
+                    }
+                    catch (InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException e)
+                    {
+                        errorMsg("Could not create auth provider with plain text credentials: " + e.getMessage(), getCmdLineOptions());
+                    }
+                }
+                else
+                {
+                    // If a 3rd party auth provider wasn't provided use the driver plain text provider
+                    authProvider = new PlainTextAuthProvider(user, passwd);
+                }
+            }
+            // Alternate support for 3rd party auth providers that don't use plain text credentials.
+            // In this case the auth provider must provide a nullary constructor of the form:
+            //
+            // public MyAuthProvider()
+            else if (authProviderName != null)
+            {
+                try
+                {
+                    authProvider = (AuthProvider)Class.forName(authProviderName).newInstance();
+                }
+                catch (ClassNotFoundException | InstantiationException | IllegalAccessException e)
+                {
+                    errorMsg("Unknown auth provider" + e.getMessage(), getCmdLineOptions());
+                }
+            }
+
+            return this;
+        }
+
         private static void errorMsg(String msg, CmdLineOptions options)
         {
             System.err.println(msg);
@@ -551,13 +607,14 @@
             options.addOption(null, NOPROGRESS_OPTION,   "don't display progress");
             options.addOption("i",  IGNORE_NODES_OPTION, "NODES", "don't stream to this (comma separated) list of nodes");
             options.addOption("d",  INITIAL_HOST_ADDRESS_OPTION, "initial hosts", "Required. try to connect to these hosts (comma separated) initially for ring information");
-            options.addOption("p",  NATIVE_PORT_OPTION, "rpc port", "port used for native connection (default 9042)");
-            options.addOption("sp", STORAGE_PORT_OPTION, "storage port", "port used for internode communication (default 7000)");
-            options.addOption("ssp", SSL_STORAGE_PORT_OPTION, "ssl storage port", "port used for TLS internode communication (default 7001)");
+            options.addOption("p",  NATIVE_PORT_OPTION, "native transport port", "port used for native connection (default 9042)");
+            options.addOption("sp",  STORAGE_PORT_OPTION, "storage port", "port used for internode communication (default 7000)");
+            options.addOption("ssp",  SSL_STORAGE_PORT_OPTION, "ssl storage port", "port used for TLS internode communication (default 7001)");
             options.addOption("t",  THROTTLE_MBITS, "throttle", "throttle speed in Mbits (default unlimited)");
             options.addOption("idct",  INTER_DC_THROTTLE_MBITS, "inter-dc-throttle", "inter-datacenter throttle speed in Mbits (default unlimited)");
             options.addOption("u",  USER_OPTION, "username", "username for cassandra authentication");
             options.addOption("pw", PASSWD_OPTION, "password", "password for cassandra authentication");
+            options.addOption("ap", AUTH_PROVIDER_OPTION, "auth provider", "custom AuthProvider class name for cassandra authentication");
             options.addOption("cph", CONNECTIONS_PER_HOST, "connectionsPerHost", "number of concurrent connections-per-host.");
             // ssl connection-related options
             options.addOption("ts", SSL_TRUSTSTORE, "TRUSTSTORE", "Client SSL: full path to truststore");

diff --git a/src/java/org/apache/cassandra/tools/JsonTransformer.java b/src/java/org/apache/cassandra/tools/JsonTransformer.java
new file mode 100644
index 0000000..a5e8553
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/JsonTransformer.java

@@ -0,0 +1,559 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.tools;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.ByteBuffer;
+import java.time.Instant;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.ColumnData;
+import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker;
+import org.apache.cassandra.db.rows.RangeTombstoneBoundaryMarker;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.codehaus.jackson.JsonFactory;
+import org.codehaus.jackson.JsonGenerator;
+import org.codehaus.jackson.impl.Indenter;
+import org.codehaus.jackson.util.DefaultPrettyPrinter;
+import org.codehaus.jackson.util.DefaultPrettyPrinter.NopIndenter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class JsonTransformer
+{
+
+    private static final Logger logger = LoggerFactory.getLogger(JsonTransformer.class);
+
+    private static final JsonFactory jsonFactory = new JsonFactory();
+
+    private final JsonGenerator json;
+
+    private final CompactIndenter objectIndenter = new CompactIndenter();
+
+    private final CompactIndenter arrayIndenter = new CompactIndenter();
+
+    private final CFMetaData metadata;
+
+    private final ISSTableScanner currentScanner;
+
+    private boolean rawTime = false;
+
+    private long currentPosition = 0;
+
+    private JsonTransformer(JsonGenerator json, ISSTableScanner currentScanner, boolean rawTime, CFMetaData metadata)
+    {
+        this.json = json;
+        this.metadata = metadata;
+        this.currentScanner = currentScanner;
+        this.rawTime = rawTime;
+
+        DefaultPrettyPrinter prettyPrinter = new DefaultPrettyPrinter();
+        prettyPrinter.indentObjectsWith(objectIndenter);
+        prettyPrinter.indentArraysWith(arrayIndenter);
+        json.setPrettyPrinter(prettyPrinter);
+    }
+
+    public static void toJson(ISSTableScanner currentScanner, Stream<UnfilteredRowIterator> partitions, boolean rawTime, CFMetaData metadata, OutputStream out)
+            throws IOException
+    {
+        try (JsonGenerator json = jsonFactory.createJsonGenerator(new OutputStreamWriter(out, "UTF-8")))
+        {
+            JsonTransformer transformer = new JsonTransformer(json, currentScanner, rawTime, metadata);
+            json.writeStartArray();
+            partitions.forEach(transformer::serializePartition);
+            json.writeEndArray();
+        }
+    }
+
+    public static void keysToJson(ISSTableScanner currentScanner, Stream<DecoratedKey> keys, boolean rawTime, CFMetaData metadata, OutputStream out) throws IOException
+    {
+        try (JsonGenerator json = jsonFactory.createJsonGenerator(new OutputStreamWriter(out, "UTF-8")))
+        {
+            JsonTransformer transformer = new JsonTransformer(json, currentScanner, rawTime, metadata);
+            json.writeStartArray();
+            keys.forEach(transformer::serializePartitionKey);
+            json.writeEndArray();
+        }
+    }
+
+    private void updatePosition()
+    {
+        this.currentPosition = currentScanner.getCurrentPosition();
+    }
+
+    private void serializePartitionKey(DecoratedKey key)
+    {
+        AbstractType<?> keyValidator = metadata.getKeyValidator();
+        objectIndenter.setCompact(true);
+        try
+        {
+            arrayIndenter.setCompact(true);
+            json.writeStartArray();
+            if (keyValidator instanceof CompositeType)
+            {
+                // if a composite type, the partition has multiple keys.
+                CompositeType compositeType = (CompositeType) keyValidator;
+                ByteBuffer keyBytes = key.getKey().duplicate();
+                // Skip static data if it exists.
+                if (keyBytes.remaining() >= 2)
+                {
+                    int header = ByteBufferUtil.getShortLength(keyBytes, keyBytes.position());
+                    if ((header & 0xFFFF) == 0xFFFF)
+                    {
+                        ByteBufferUtil.readShortLength(keyBytes);
+                    }
+                }
+
+                int i = 0;
+                while (keyBytes.remaining() > 0 && i < compositeType.getComponents().size())
+                {
+                    AbstractType<?> colType = compositeType.getComponents().get(i);
+
+                    ByteBuffer value = ByteBufferUtil.readBytesWithShortLength(keyBytes);
+                    String colValue = colType.getString(value);
+
+                    json.writeString(colValue);
+
+                    byte b = keyBytes.get();
+                    if (b != 0)
+                    {
+                        break;
+                    }
+                    ++i;
+                }
+            }
+            else
+            {
+                // if not a composite type, assume a single column partition key.
+                assert metadata.partitionKeyColumns().size() == 1;
+                json.writeString(keyValidator.getString(key.getKey()));
+            }
+            json.writeEndArray();
+            objectIndenter.setCompact(false);
+            arrayIndenter.setCompact(false);
+        }
+        catch (IOException e)
+        {
+            logger.error("Failure serializing partition key.", e);
+        }
+    }
+
+    private void serializePartition(UnfilteredRowIterator partition)
+    {
+        String key = metadata.getKeyValidator().getString(partition.partitionKey().getKey());
+        try
+        {
+            json.writeStartObject();
+
+            json.writeFieldName("partition");
+            json.writeStartObject();
+            json.writeFieldName("key");
+            serializePartitionKey(partition.partitionKey());
+            json.writeNumberField("position", this.currentScanner.getCurrentPosition());
+
+            if (!partition.partitionLevelDeletion().isLive())
+                serializeDeletion(partition.partitionLevelDeletion());
+
+            json.writeEndObject();
+
+            json.writeFieldName("rows");
+            json.writeStartArray();
+            updatePosition();
+
+            if (partition.staticRow() != null)
+            {
+                if (!partition.staticRow().isEmpty())
+                    serializeRow(partition.staticRow());
+                updatePosition();
+            }
+
+            Unfiltered unfiltered;
+            while (partition.hasNext())
+            {
+                unfiltered = partition.next();
+                if (unfiltered instanceof Row)
+                {
+                    serializeRow((Row) unfiltered);
+                }
+                else if (unfiltered instanceof RangeTombstoneMarker)
+                {
+                    serializeTombstone((RangeTombstoneMarker) unfiltered);
+                }
+                updatePosition();
+            }
+
+            json.writeEndArray();
+
+            json.writeEndObject();
+        }
+        catch (IOException e)
+        {
+            logger.error("Fatal error parsing partition: {}", key, e);
+        }
+    }
+
+    private void serializeRow(Row row)
+    {
+        try
+        {
+            json.writeStartObject();
+            String rowType = row.isStatic() ? "static_block" : "row";
+            json.writeFieldName("type");
+            json.writeString(rowType);
+            json.writeNumberField("position", this.currentPosition);
+
+            // Only print clustering information for non-static rows.
+            if (!row.isStatic())
+            {
+                serializeClustering(row.clustering());
+            }
+
+            LivenessInfo liveInfo = row.primaryKeyLivenessInfo();
+            if (!liveInfo.isEmpty())
+            {
+                objectIndenter.setCompact(false);
+                json.writeFieldName("liveness_info");
+                objectIndenter.setCompact(true);
+                json.writeStartObject();
+                json.writeFieldName("tstamp");
+                json.writeString(dateString(TimeUnit.MICROSECONDS, liveInfo.timestamp()));
+                if (liveInfo.isExpiring())
+                {
+                    json.writeNumberField("ttl", liveInfo.ttl());
+                    json.writeFieldName("expires_at");
+                    json.writeString(dateString(TimeUnit.SECONDS, liveInfo.localExpirationTime()));
+                    json.writeFieldName("expired");
+                    json.writeBoolean(liveInfo.localExpirationTime() < (System.currentTimeMillis() / 1000));
+                }
+                json.writeEndObject();
+                objectIndenter.setCompact(false);
+            }
+
+            // If this is a deletion, indicate that, otherwise write cells.
+            if (!row.deletion().isLive())
+            {
+                serializeDeletion(row.deletion().time());
+            }
+            json.writeFieldName("cells");
+            json.writeStartArray();
+            for (ColumnData cd : row)
+            {
+                serializeColumnData(cd, liveInfo);
+            }
+            json.writeEndArray();
+            json.writeEndObject();
+        }
+        catch (IOException e)
+        {
+            logger.error("Fatal error parsing row.", e);
+        }
+    }
+
+    private void serializeTombstone(RangeTombstoneMarker tombstone)
+    {
+        try
+        {
+            json.writeStartObject();
+            json.writeFieldName("type");
+
+            if (tombstone instanceof RangeTombstoneBoundMarker)
+            {
+                json.writeString("range_tombstone_bound");
+                RangeTombstoneBoundMarker bm = (RangeTombstoneBoundMarker) tombstone;
+                serializeBound(bm.clustering(), bm.deletionTime());
+            }
+            else
+            {
+                assert tombstone instanceof RangeTombstoneBoundaryMarker;
+                json.writeString("range_tombstone_boundary");
+                RangeTombstoneBoundaryMarker bm = (RangeTombstoneBoundaryMarker) tombstone;
+                serializeBound(bm.openBound(false), bm.openDeletionTime(false));
+                serializeBound(bm.closeBound(false), bm.closeDeletionTime(false));
+            }
+            json.writeEndObject();
+            objectIndenter.setCompact(false);
+        }
+        catch (IOException e)
+        {
+            logger.error("Failure parsing tombstone.", e);
+        }
+    }
+
+    private void serializeBound(RangeTombstone.Bound bound, DeletionTime deletionTime) throws IOException
+    {
+        json.writeFieldName(bound.isStart() ? "start" : "end");
+        json.writeStartObject();
+        json.writeFieldName("type");
+        json.writeString(bound.isInclusive() ? "inclusive" : "exclusive");
+        serializeClustering(bound.clustering());
+        serializeDeletion(deletionTime);
+        json.writeEndObject();
+    }
+
+    private void serializeClustering(ClusteringPrefix clustering) throws IOException
+    {
+        if (clustering.size() > 0)
+        {
+            json.writeFieldName("clustering");
+            objectIndenter.setCompact(true);
+            json.writeStartArray();
+            arrayIndenter.setCompact(true);
+            List<ColumnDefinition> clusteringColumns = metadata.clusteringColumns();
+            for (int i = 0; i < clusteringColumns.size(); i++)
+            {
+                ColumnDefinition column = clusteringColumns.get(i);
+                if (i >= clustering.size())
+                {
+                    json.writeString("*");
+                }
+                else
+                {
+                    json.writeRawValue(column.cellValueType().toJSONString(clustering.get(i), Server.CURRENT_VERSION));
+                }
+            }
+            json.writeEndArray();
+            objectIndenter.setCompact(false);
+            arrayIndenter.setCompact(false);
+        }
+    }
+
+    private void serializeDeletion(DeletionTime deletion) throws IOException
+    {
+        json.writeFieldName("deletion_info");
+        objectIndenter.setCompact(true);
+        json.writeStartObject();
+        json.writeFieldName("marked_deleted");
+        json.writeString(dateString(TimeUnit.MICROSECONDS, deletion.markedForDeleteAt()));
+        json.writeFieldName("local_delete_time");
+        json.writeString(dateString(TimeUnit.SECONDS, deletion.localDeletionTime()));
+        json.writeEndObject();
+        objectIndenter.setCompact(false);
+    }
+
+    private void serializeColumnData(ColumnData cd, LivenessInfo liveInfo)
+    {
+        if (cd.column().isSimple())
+        {
+            serializeCell((Cell) cd, liveInfo);
+        }
+        else
+        {
+            ComplexColumnData complexData = (ComplexColumnData) cd;
+            if (!complexData.complexDeletion().isLive())
+            {
+                try
+                {
+                    objectIndenter.setCompact(true);
+                    json.writeStartObject();
+                    json.writeFieldName("name");
+                    AbstractType<?> type = cd.column().type;
+                    json.writeString(cd.column().name.toCQLString());
+                    serializeDeletion(complexData.complexDeletion());
+                    objectIndenter.setCompact(true);
+                    json.writeEndObject();
+                    objectIndenter.setCompact(false);
+                }
+                catch (IOException e)
+                {
+                    logger.error("Failure parsing ColumnData.", e);
+                }
+            }
+            for (Cell cell : complexData){
+                serializeCell(cell, liveInfo);
+            }
+        }
+    }
+
+    private void serializeCell(Cell cell, LivenessInfo liveInfo)
+    {
+        try
+        {
+            json.writeStartObject();
+            objectIndenter.setCompact(true);
+            json.writeFieldName("name");
+            AbstractType<?> type = cell.column().type;
+            json.writeString(cell.column().name.toCQLString());
+
+            if (type.isCollection() && type.isMultiCell()) // non-frozen collection
+            {
+                CollectionType ct = (CollectionType) type;
+                json.writeFieldName("path");
+                arrayIndenter.setCompact(true);
+                json.writeStartArray();
+                for (int i = 0; i < cell.path().size(); i++)
+                {
+                    json.writeString(ct.nameComparator().getString(cell.path().get(i)));
+                }
+                json.writeEndArray();
+                arrayIndenter.setCompact(false);
+            }
+            if (cell.isTombstone())
+            {
+                json.writeFieldName("deletion_info");
+                objectIndenter.setCompact(true);
+                json.writeStartObject();
+                json.writeFieldName("local_delete_time");
+                json.writeString(dateString(TimeUnit.SECONDS, cell.localDeletionTime()));
+                json.writeEndObject();
+                objectIndenter.setCompact(false);
+            }
+            else
+            {
+                json.writeFieldName("value");
+                json.writeRawValue(cell.column().cellValueType().toJSONString(cell.value(), Server.CURRENT_VERSION));
+            }
+            if (liveInfo.isEmpty() || cell.timestamp() != liveInfo.timestamp())
+            {
+                json.writeFieldName("tstamp");
+                json.writeString(dateString(TimeUnit.MICROSECONDS, cell.timestamp()));
+            }
+            if (cell.isExpiring() && (liveInfo.isEmpty() || cell.ttl() != liveInfo.ttl()))
+            {
+                json.writeFieldName("ttl");
+                json.writeNumber(cell.ttl());
+                json.writeFieldName("expires_at");
+                json.writeString(dateString(TimeUnit.SECONDS, cell.localDeletionTime()));
+                json.writeFieldName("expired");
+                json.writeBoolean(!cell.isLive((int) (System.currentTimeMillis() / 1000)));
+            }
+            json.writeEndObject();
+            objectIndenter.setCompact(false);
+        }
+        catch (IOException e)
+        {
+            logger.error("Failure parsing cell.", e);
+        }
+    }
+
+    private String dateString(TimeUnit from, long time)
+    {
+        long secs = from.toSeconds(time);
+        long offset = Math.floorMod(from.toNanos(time), 1000_000_000L); // nanos per sec
+        return rawTime? Long.toString(time) : Instant.ofEpochSecond(secs, offset).toString();
+    }
+
+    /**
+     * A specialized {@link Indenter} that enables a 'compact' mode which puts all subsequent json values on the same
+     * line. This is manipulated via {@link CompactIndenter#setCompact(boolean)}
+     */
+    private static final class CompactIndenter extends NopIndenter
+    {
+
+        private static final int INDENT_LEVELS = 16;
+        private final char[] indents;
+        private final int charsPerLevel;
+        private final String eol;
+        private static final String space = " ";
+
+        private boolean compact = false;
+
+        CompactIndenter()
+        {
+            this("  ", System.lineSeparator());
+        }
+
+        CompactIndenter(String indent, String eol)
+        {
+            this.eol = eol;
+
+            charsPerLevel = indent.length();
+
+            indents = new char[indent.length() * INDENT_LEVELS];
+            int offset = 0;
+            for (int i = 0; i < INDENT_LEVELS; i++)
+            {
+                indent.getChars(0, indent.length(), indents, offset);
+                offset += indent.length();
+            }
+        }
+
+        @Override
+        public boolean isInline()
+        {
+            return false;
+        }
+
+        /**
+         * Configures whether or not subsequent json values should be on the same line delimited by string or not.
+         *
+         * @param compact
+         *            Whether or not to compact.
+         */
+        public void setCompact(boolean compact)
+        {
+            this.compact = compact;
+        }
+
+        @Override
+        public void writeIndentation(JsonGenerator jg, int level)
+        {
+            try
+            {
+                if (!compact)
+                {
+                    jg.writeRaw(eol);
+                    if (level > 0)
+                    { // should we err on negative values (as there's some flaw?)
+                        level *= charsPerLevel;
+                        while (level > indents.length)
+                        { // unlike to happen but just in case
+                            jg.writeRaw(indents, 0, indents.length);
+                            level -= indents.length;
+                        }
+                        jg.writeRaw(indents, 0, level);
+                    }
+                }
+                else
+                {
+                    jg.writeRaw(space);
+                }
+            }
+            catch (IOException e)
+            {
+                e.printStackTrace();
+                System.exit(1);
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java
index 9798763..2425821 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java

@@ -25,6 +25,7 @@
 import java.lang.management.RuntimeMXBean;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
+import java.rmi.ConnectException;
 import java.rmi.server.RMIClientSocketFactory;
 import java.rmi.server.RMISocketFactory;
 import java.util.AbstractMap;
@@ -53,9 +54,10 @@
 import javax.management.remote.JMXServiceURL;
 import javax.rmi.ssl.SslRMIClientSocketFactory;
 
+import org.apache.cassandra.batchlog.BatchlogManager;
+import org.apache.cassandra.batchlog.BatchlogManagerMBean;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStoreMBean;
-import org.apache.cassandra.db.HintedHandOffManager;
 import org.apache.cassandra.db.HintedHandOffManagerMBean;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.CompactionManagerMBean;
@@ -63,10 +65,12 @@
 import org.apache.cassandra.gms.FailureDetectorMBean;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.GossiperMBean;
+import org.apache.cassandra.db.HintedHandOffManager;
 import org.apache.cassandra.locator.EndpointSnitchInfoMBean;
 import org.apache.cassandra.metrics.CassandraMetricsRegistry;
-import org.apache.cassandra.metrics.ColumnFamilyMetrics.Sampler;
+import org.apache.cassandra.metrics.TableMetrics.Sampler;
 import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.metrics.TableMetrics;
 import org.apache.cassandra.metrics.ThreadPoolMetrics;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.net.MessagingServiceMBean;
@@ -115,6 +119,7 @@
     protected CacheServiceMBean cacheService;
     protected StorageProxyMBean spProxy;
     protected HintedHandOffManagerMBean hhProxy;
+    protected BatchlogManagerMBean bmProxy;
     private boolean failed;
 
     /**
@@ -212,6 +217,8 @@
             gcProxy = JMX.newMBeanProxy(mbeanServerConn, name, GCInspectorMXBean.class);
             name = new ObjectName(Gossiper.MBEAN_NAME);
             gossProxy = JMX.newMBeanProxy(mbeanServerConn, name, GossiperMBean.class);
+            name = new ObjectName(BatchlogManager.MBEAN_NAME);
+            bmProxy = JMX.newMBeanProxy(mbeanServerConn, name, BatchlogManagerMBean.class);
         }
         catch (MalformedObjectNameException e)
         {
@@ -225,7 +232,7 @@
                 mbeanServerConn, ManagementFactory.RUNTIME_MXBEAN_NAME, RuntimeMXBean.class);
     }
 
-    private RMIClientSocketFactory getRMIClientSocketFactory() throws IOException
+    private RMIClientSocketFactory getRMIClientSocketFactory()
     {
         if (Boolean.parseBoolean(System.getProperty("ssl.enable")))
             return new SslRMIClientSocketFactory();
@@ -235,27 +242,35 @@
 
     public void close() throws IOException
     {
-        jmxc.close();
+        try
+        {
+            jmxc.close();
+        }
+        catch (ConnectException e)
+        {
+            // result of 'stopdaemon' command - i.e. if close() call fails, the daemon is shutdown
+            System.out.println("Cassandra has shutdown.");
+        }
     }
 
-    public int forceKeyspaceCleanup(int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int forceKeyspaceCleanup(int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        return ssProxy.forceKeyspaceCleanup(jobs, keyspaceName, columnFamilies);
+        return ssProxy.forceKeyspaceCleanup(jobs, keyspaceName, tables);
     }
 
-    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int scrub(boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
-        return ssProxy.scrub(disableSnapshot, skipCorrupted, checkData, reinsertOverflowedTTLRows, jobs, keyspaceName, columnFamilies);
+        return ssProxy.scrub(disableSnapshot, skipCorrupted, checkData, reinsertOverflowedTTL, jobs, keyspaceName, tables);
     }
 
-    public int verify(boolean extendedVerify, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int verify(boolean extendedVerify, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        return ssProxy.verify(extendedVerify, keyspaceName, columnFamilies);
+        return ssProxy.verify(extendedVerify, keyspaceName, tableNames);
     }
 
-    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, int jobs, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, int jobs, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        return ssProxy.upgradeSSTables(keyspaceName, excludeCurrentVersion, jobs, columnFamilies);
+        return ssProxy.upgradeSSTables(keyspaceName, excludeCurrentVersion, jobs, tableNames);
     }
 
     private void checkJobs(PrintStream out, int jobs)
@@ -264,55 +279,79 @@
             out.println(String.format("jobs (%d) is bigger than configured concurrent_compactors (%d), using at most %d threads", jobs, DatabaseDescriptor.getConcurrentCompactors(), DatabaseDescriptor.getConcurrentCompactors()));
     }
 
-    public void forceKeyspaceCleanup(PrintStream out, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void forceKeyspaceCleanup(PrintStream out, int jobs, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
         checkJobs(out, jobs);
-        if (forceKeyspaceCleanup(jobs, keyspaceName, columnFamilies) != 0)
+        switch (forceKeyspaceCleanup(jobs, keyspaceName, tableNames))
         {
-            failed = true;
-            out.println("Aborted cleaning up at least one table in keyspace "+keyspaceName+", check server logs for more information.");
+            case 1:
+                failed = true;
+                out.println("Aborted cleaning up at least one table in keyspace "+keyspaceName+", check server logs for more information.");
+                break;
+            case 2:
+                failed = true;
+                out.println("Failed marking some sstables compacting in keyspace "+keyspaceName+", check server logs for more information");
+                break;
         }
     }
 
-    public void scrub(PrintStream out, boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTLRows, int jobs, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void scrub(PrintStream out, boolean disableSnapshot, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, int jobs, String keyspaceName, String... tables) throws IOException, ExecutionException, InterruptedException
     {
         checkJobs(out, jobs);
-        if (scrub(disableSnapshot, skipCorrupted, checkData, reinsertOverflowedTTLRows, jobs, keyspaceName, columnFamilies) != 0)
+        switch (ssProxy.scrub(disableSnapshot, skipCorrupted, checkData, reinsertOverflowedTTL, jobs, keyspaceName, tables))
         {
-            failed = true;
-            out.println("Aborted scrubbing at least one table in keyspace "+keyspaceName+", check server logs for more information.");
+            case 1:
+                failed = true;
+                out.println("Aborted scrubbing at least one table in keyspace "+keyspaceName+", check server logs for more information.");
+                break;
+            case 2:
+                failed = true;
+                out.println("Failed marking some sstables compacting in keyspace "+keyspaceName+", check server logs for more information");
+                break;
         }
     }
 
-    public void verify(PrintStream out, boolean extendedVerify, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void verify(PrintStream out, boolean extendedVerify, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        if (verify(extendedVerify, keyspaceName, columnFamilies) != 0)
+        switch (verify(extendedVerify, keyspaceName, tableNames))
         {
-            failed = true;
-            out.println("Aborted verifying at least one table in keyspace "+keyspaceName+", check server logs for more information.");
+            case 1:
+                failed = true;
+                out.println("Aborted verifying at least one table in keyspace "+keyspaceName+", check server logs for more information.");
+                break;
+            case 2:
+                failed = true;
+                out.println("Failed marking some sstables compacting in keyspace "+keyspaceName+", check server logs for more information");
+                break;
         }
     }
 
 
-    public void upgradeSSTables(PrintStream out, String keyspaceName, boolean excludeCurrentVersion, int jobs, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void upgradeSSTables(PrintStream out, String keyspaceName, boolean excludeCurrentVersion, int jobs, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
         checkJobs(out, jobs);
-        if (upgradeSSTables(keyspaceName, excludeCurrentVersion, jobs, columnFamilies) != 0)
+        switch (upgradeSSTables(keyspaceName, excludeCurrentVersion, jobs, tableNames))
         {
-            failed = true;
-            out.println("Aborted upgrading sstables for atleast one table in keyspace "+keyspaceName+", check server logs for more information.");
+            case 1:
+                failed = true;
+                out.println("Aborted upgrading sstables for atleast one table in keyspace "+keyspaceName+", check server logs for more information.");
+                break;
+            case 2:
+                failed = true;
+                out.println("Failed marking some sstables compacting in keyspace "+keyspaceName+", check server logs for more information");
+                break;
         }
     }
 
 
-    public void forceKeyspaceCompaction(boolean splitOutput, String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void forceKeyspaceCompaction(boolean splitOutput, String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        ssProxy.forceKeyspaceCompaction(splitOutput, keyspaceName, columnFamilies);
+        ssProxy.forceKeyspaceCompaction(splitOutput, keyspaceName, tableNames);
     }
 
-    public void forceKeyspaceFlush(String keyspaceName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
+    public void forceKeyspaceFlush(String keyspaceName, String... tableNames) throws IOException, ExecutionException, InterruptedException
     {
-        ssProxy.forceKeyspaceFlush(keyspaceName, columnFamilies);
+        ssProxy.forceKeyspaceFlush(keyspaceName, tableNames);
     }
 
     public void repairAsync(final PrintStream out, final String keyspace, Map<String, String> options) throws IOException
@@ -521,18 +560,18 @@
      * Take a snapshot of all the keyspaces, optionally specifying only a specific column family.
      *
      * @param snapshotName the name of the snapshot.
-     * @param columnFamily the column family to snapshot or all on null
+     * @param table the table to snapshot or all on null
      * @param keyspaces the keyspaces to snapshot
      */
-    public void takeSnapshot(String snapshotName, String columnFamily, String... keyspaces) throws IOException
+    public void takeSnapshot(String snapshotName, String table, String... keyspaces) throws IOException
     {
-        if (columnFamily != null)
+        if (table != null)
         {
             if (keyspaces.length != 1)
             {
                 throw new IOException("When specifying the table for a snapshot, you must specify one and only one keyspace");
             }
-            ssProxy.takeColumnFamilySnapshot(keyspaces[0], columnFamily, snapshotName);
+            ssProxy.takeTableSnapshot(keyspaces[0], table, snapshotName);
         }
         else
             ssProxy.takeSnapshot(snapshotName, keyspaces);
@@ -543,15 +582,15 @@
      *
      * @param snapshotName
      *            the name of the snapshot.
-     * @param columnFamilyList
+     * @param tableList
      *            list of columnfamily from different keyspace in the form of ks1.cf1 ks2.cf2
      */
-    public void takeMultipleColumnFamilySnapshot(String snapshotName, String... columnFamilyList)
+    public void takeMultipleTableSnapshot(String snapshotName, String... tableList)
             throws IOException
     {
-        if (null != columnFamilyList && columnFamilyList.length != 0)
+        if (null != tableList && tableList.length != 0)
         {
-            ssProxy.takeMultipleColumnFamilySnapshot(snapshotName, columnFamilyList);
+            ssProxy.takeMultipleTableSnapshot(snapshotName, tableList);
         }
         else
         {
@@ -583,6 +622,16 @@
         return ssProxy.isJoined();
     }
 
+    public boolean isDrained()
+    {
+        return ssProxy.isDrained();
+    }
+
+    public boolean isDraining()
+    {
+        return ssProxy.isDraining();
+    }
+
     public boolean isBootstrapMode()
     {
         return ssProxy.isBootstrapMode();
@@ -635,14 +684,14 @@
         cfsProxy.setCompactionThresholds(minimumCompactionThreshold, maximumCompactionThreshold);
     }
 
-    public void disableAutoCompaction(String ks, String ... columnFamilies) throws IOException
+    public void disableAutoCompaction(String ks, String ... tables) throws IOException
     {
-        ssProxy.disableAutoCompaction(ks, columnFamilies);
+        ssProxy.disableAutoCompaction(ks, tables);
     }
 
-    public void enableAutoCompaction(String ks, String ... columnFamilies) throws IOException
+    public void enableAutoCompaction(String ks, String ... tableNames) throws IOException
     {
-        ssProxy.enableAutoCompaction(ks, columnFamilies);
+        ssProxy.enableAutoCompaction(ks, tableNames);
     }
 
     public void setIncrementalBackupsEnabled(boolean enabled)
@@ -708,11 +757,11 @@
         return ssProxy.isStarting();
     }
 
-    public void truncate(String keyspaceName, String cfName)
+    public void truncate(String keyspaceName, String tableName)
     {
         try
         {
-            ssProxy.truncate(keyspaceName, cfName);
+            ssProxy.truncate(keyspaceName, tableName);
         }
         catch (TimeoutException e)
         {
@@ -805,6 +854,11 @@
         return ssProxy.getNonSystemKeyspaces();
     }
 
+    public List<String> getNonLocalStrategyKeyspaces()
+    {
+        return ssProxy.getNonLocalStrategyKeyspaces();
+    }
+
     public String getClusterName()
     {
         return ssProxy.getClusterName();
@@ -830,9 +884,19 @@
         return spProxy.getHintedHandoffEnabled();
     }
 
-    public void enableHintedHandoff(String dcNames)
+    public void enableHintsForDC(String dc)
     {
-        spProxy.setHintedHandoffEnabledByDCList(dcNames);
+        spProxy.enableHintsForDC(dc);
+    }
+
+    public void disableHintsForDC(String dc)
+    {
+        spProxy.disableHintsForDC(dc);
+    }
+
+    public Set<String> getHintedHandoffDisabledDCs()
+    {
+        return spProxy.getHintedHandoffDisabledDCs();
     }
 
     public void pauseHintsDelivery()
@@ -1029,6 +1093,11 @@
         ssProxy.resetLocalSchema();
     }
 
+    public void reloadLocalSchema()
+    {
+        ssProxy.reloadLocalSchema();
+    }
+
     public boolean isFailed()
     {
         return failed;
@@ -1102,13 +1171,13 @@
      * Retrieve ColumnFamily metrics
      * @param ks Keyspace for which stats are to be displayed.
      * @param cf ColumnFamily for which stats are to be displayed.
-     * @param metricName View {@link org.apache.cassandra.metrics.ColumnFamilyMetrics}.
+     * @param metricName View {@link TableMetrics}.
      */
     public Object getColumnFamilyMetric(String ks, String cf, String metricName)
     {
         try
         {
-            String type = cf.contains(".") ? "IndexColumnFamily": "ColumnFamily";
+            String type = cf.contains(".") ? "IndexTable" : "Table";
             ObjectName oName = new ObjectName(String.format("org.apache.cassandra.metrics:type=%s,keyspace=%s,scope=%s,name=%s", type, ks, cf, metricName));
             switch(metricName)
             {
@@ -1120,16 +1189,16 @@
                 case "CompressionMetadataOffHeapMemoryUsed":
                 case "CompressionRatio":
                 case "EstimatedColumnCountHistogram":
-                case "EstimatedRowSizeHistogram":
-                case "EstimatedRowCount":
+                case "EstimatedPartitionSizeHistogram":
+                case "EstimatedPartitionCount":
                 case "KeyCacheHitRate":
                 case "LiveSSTableCount":
-                case "MaxRowSize":
-                case "MeanRowSize":
+                case "MaxPartitionSize":
+                case "MeanPartitionSize":
                 case "MemtableColumnsCount":
                 case "MemtableLiveDataSize":
                 case "MemtableOffHeapSize":
-                case "MinRowSize":
+                case "MinPartitionSize":
                 case "RecentBloomFilterFalsePositives":
                 case "RecentBloomFilterFalseRatio":
                 case "SnapshotsSize":
@@ -1152,7 +1221,7 @@
                 case "TombstoneScannedHistogram":
                     return JMX.newMBeanProxy(mbeanServerConn, oName, CassandraMetricsRegistry.JmxHistogramMBean.class);
                 default:
-                    throw new RuntimeException("Unknown table metric.");
+                    throw new RuntimeException("Unknown table metric " + metricName);
             }
         }
         catch (MalformedObjectNameException e)
@@ -1316,6 +1385,18 @@
         }
     }
 
+    public void replayBatchlog() throws IOException
+    {
+        try
+        {
+            bmProxy.forceBatchlogReplay();
+        }
+        catch (Exception e)
+        {
+            throw new IOException(e);
+        }
+    }
+
     public TabularData getFailureDetectorPhilValues()
     {
         try

diff --git a/src/java/org/apache/cassandra/tools/NodeTool.java b/src/java/org/apache/cassandra/tools/NodeTool.java
index b6dadd6..dbff241 100644
--- a/src/java/org/apache/cassandra/tools/NodeTool.java
+++ b/src/java/org/apache/cassandra/tools/NodeTool.java

@@ -111,6 +111,7 @@
                 RemoveNode.class,
                 Assassinate.class,
                 Repair.class,
+                ReplayBatchlog.class,
                 SetCacheCapacity.class,
                 SetHintedHandoffThrottleInKB.class,
                 SetCompactionThreshold.class,
@@ -135,6 +136,7 @@
                 EnableBackup.class,
                 DisableBackup.class,
                 ResetLocalSchema.class,
+                ReloadLocalSchema.class,
                 ReloadTriggers.class,
                 SetCacheKeysToSave.class,
                 DisableThrift.class,
@@ -145,6 +147,8 @@
                 TopPartitions.class,
                 SetLoggingLevel.class,
                 GetLoggingLevels.class,
+                DisableHintsForDC.class,
+                EnableHintsForDC.class,
                 FailureDetectorInfo.class,
                 RefreshSizeEstimates.class
         );
@@ -340,19 +344,34 @@
             return nodeClient;
         }
 
-        protected List<String> parseOptionalKeyspace(List<String> cmdArgs, NodeProbe nodeProbe)
+        protected enum KeyspaceSet
         {
-            return parseOptionalKeyspace(cmdArgs, nodeProbe, false);
+            ALL, NON_SYSTEM, NON_LOCAL_STRATEGY
         }
 
-        protected List<String> parseOptionalKeyspace(List<String> cmdArgs, NodeProbe nodeProbe, boolean includeSystemKS)
+        protected List<String> parseOptionalKeyspace(List<String> cmdArgs, NodeProbe nodeProbe)
+        {
+            return parseOptionalKeyspace(cmdArgs, nodeProbe, KeyspaceSet.ALL);
+        }
+
+        protected List<String> parseOptionalKeyspace(List<String> cmdArgs, NodeProbe nodeProbe, KeyspaceSet defaultKeyspaceSet)
         {
             List<String> keyspaces = new ArrayList<>();
 
+
             if (cmdArgs == null || cmdArgs.isEmpty())
-                keyspaces.addAll(includeSystemKS ? nodeProbe.getKeyspaces() : nodeProbe.getNonSystemKeyspaces());
+            {
+                if (defaultKeyspaceSet == KeyspaceSet.NON_LOCAL_STRATEGY)
+                    keyspaces.addAll(keyspaces = nodeProbe.getNonLocalStrategyKeyspaces());
+                else if (defaultKeyspaceSet == KeyspaceSet.NON_SYSTEM)
+                    keyspaces.addAll(keyspaces = nodeProbe.getNonSystemKeyspaces());
+                else
+                    keyspaces.addAll(nodeProbe.getKeyspaces());
+            }
             else
+            {
                 keyspaces.add(cmdArgs.get(0));
+            }
 
             for (String keyspace : keyspaces)
             {
@@ -363,7 +382,7 @@
             return Collections.unmodifiableList(keyspaces);
         }
 
-        protected String[] parseOptionalColumnFamilies(List<String> cmdArgs)
+        protected String[] parseOptionalTables(List<String> cmdArgs)
         {
             return cmdArgs.size() <= 1 ? EMPTY_STRING_ARRAY : toArray(cmdArgs.subList(1, cmdArgs.size()), String.class);
         }

diff --git a/src/java/org/apache/cassandra/tools/RepairRunner.java b/src/java/org/apache/cassandra/tools/RepairRunner.java
index 0813775..12ae5b8 100644
--- a/src/java/org/apache/cassandra/tools/RepairRunner.java
+++ b/src/java/org/apache/cassandra/tools/RepairRunner.java

@@ -110,6 +110,10 @@
             message = message + " (progress: " + (int)event.getProgressPercentage() + "%)";
         }
         out.println(message);
+        if (type == ProgressEventType.ERROR)
+        {
+            error = new RuntimeException("Repair job has failed with the error message: " + message);
+        }
         if (type == ProgressEventType.COMPLETE)
         {
             condition.signalAll();

diff --git a/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java b/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java
index 0d8c5e5..3e2ff08 100644
--- a/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java
+++ b/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java

@@ -28,11 +28,11 @@
 import com.google.common.collect.Multimap;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -70,7 +70,7 @@
 
         Keyspace ks = Keyspace.openWithoutSSTables(keyspace);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(columnfamily);
-        Directories.SSTableLister lister = cfs.directories.sstableLister().skipTemporary(true);
+        Directories.SSTableLister lister = cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true);
         Set<SSTableReader> sstables = new HashSet<>();
         for (Map.Entry<Descriptor, Set<Component>> sstable : lister.list().entrySet())
         {
@@ -93,7 +93,7 @@
             System.exit(1);
         }
 
-        int gcBefore = (int)(System.currentTimeMillis()/1000) - metadata.getGcGraceSeconds();
+        int gcBefore = (int)(System.currentTimeMillis()/1000) - metadata.params.gcGraceSeconds;
         Multimap<SSTableReader, SSTableReader> blockers = checkForExpiredSSTableBlockers(sstables, gcBefore);
         for (SSTableReader blocker : blockers.keySet())
         {

diff --git a/src/java/org/apache/cassandra/tools/SSTableExport.java b/src/java/org/apache/cassandra/tools/SSTableExport.java
index 50d6611..ac8ea61 100644
--- a/src/java/org/apache/cassandra/tools/SSTableExport.java
+++ b/src/java/org/apache/cassandra/tools/SSTableExport.java

@@ -19,367 +19,125 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.io.PrintStream;
 import java.util.*;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.commons.cli.*;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.codehaus.jackson.JsonGenerator;
-import org.codehaus.jackson.map.ObjectMapper;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.KeyIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * Export SSTables to JSON format.
  */
 public class SSTableExport
 {
-    private static final ObjectMapper jsonMapper = new ObjectMapper();
 
     private static final String KEY_OPTION = "k";
-    private static final String EXCLUDEKEY_OPTION = "x";
-    private static final String ENUMERATEKEYS_OPTION = "e";
+    private static final String DEBUG_OUTPUT_OPTION = "d";
+    private static final String EXCLUDE_KEY_OPTION = "x";
+    private static final String ENUMERATE_KEYS_OPTION = "e";
+    private static final String RAW_TIMESTAMPS = "t";
 
     private static final Options options = new Options();
     private static CommandLine cmd;
 
     static
     {
-        Option optKey = new Option(KEY_OPTION, true, "Row key");
+        Config.setClientMode(true);
+
+        Option optKey = new Option(KEY_OPTION, true, "Partition key");
         // Number of times -k <key> can be passed on the command line.
         optKey.setArgs(500);
         options.addOption(optKey);
 
-        Option excludeKey = new Option(EXCLUDEKEY_OPTION, true, "Excluded row key");
+        Option excludeKey = new Option(EXCLUDE_KEY_OPTION, true, "Excluded partition key");
         // Number of times -x <key> can be passed on the command line.
         excludeKey.setArgs(500);
         options.addOption(excludeKey);
 
-        Option optEnumerate = new Option(ENUMERATEKEYS_OPTION, false, "enumerate keys only");
+        Option optEnumerate = new Option(ENUMERATE_KEYS_OPTION, false, "enumerate partition keys only");
         options.addOption(optEnumerate);
 
-        // disabling auto close of the stream
-        jsonMapper.configure(JsonGenerator.Feature.AUTO_CLOSE_TARGET, false);
+        Option debugOutput = new Option(DEBUG_OUTPUT_OPTION, false, "CQL row per line internal representation");
+        options.addOption(debugOutput);
+
+        Option rawTimestamps = new Option(RAW_TIMESTAMPS, false, "Print raw timestamps instead of iso8601 date strings");
+        options.addOption(rawTimestamps);
     }
 
     /**
-     * Checks if PrintStream error and throw exception
+     * Construct table schema from info stored in SSTable's Stats.db
      *
-     * @param out The PrintStream to be check
+     * @param desc SSTable's descriptor
+     * @return Restored CFMetaData
+     * @throws IOException when Stats.db cannot be read
      */
-    private static void checkStream(PrintStream out) throws IOException
+    public static CFMetaData metadataFromSSTable(Descriptor desc) throws IOException
     {
-        if (out.checkError())
-            throw new IOException("Error writing output stream");
+        if (!desc.version.storeRows())
+            throw new IOException("pre-3.0 SSTable is not supported.");
+
+        EnumSet<MetadataType> types = EnumSet.of(MetadataType.STATS, MetadataType.HEADER);
+        Map<MetadataType, MetadataComponent> sstableMetadata = desc.getMetadataSerializer().deserialize(desc, types);
+        SerializationHeader.Component header = (SerializationHeader.Component) sstableMetadata.get(MetadataType.HEADER);
+        IPartitioner partitioner = FBUtilities.newPartitioner(desc);
+
+        CFMetaData.Builder builder = CFMetaData.Builder.create("keyspace", "table").withPartitioner(partitioner);
+        header.getStaticColumns().entrySet().stream()
+                .forEach(entry -> {
+                    ColumnIdentifier ident = ColumnIdentifier.getInterned(UTF8Type.instance.getString(entry.getKey()), true);
+                    builder.addStaticColumn(ident, entry.getValue());
+                });
+        header.getRegularColumns().entrySet().stream()
+                .forEach(entry -> {
+                    ColumnIdentifier ident = ColumnIdentifier.getInterned(UTF8Type.instance.getString(entry.getKey()), true);
+                    builder.addRegularColumn(ident, entry.getValue());
+                });
+        builder.addPartitionKey("PartitionKey", header.getKeyType());
+        for (int i = 0; i < header.getClusteringTypes().size(); i++)
+        {
+            builder.addClusteringColumn("clustering" + (i > 0 ? i : ""), header.getClusteringTypes().get(i));
+        }
+        return builder.build();
+    }
+
+    private static <T> Stream<T> iterToStream(Iterator<T> iter)
+    {
+        Spliterator<T> splititer = Spliterators.spliteratorUnknownSize(iter, Spliterator.IMMUTABLE);
+        return StreamSupport.stream(splititer, false);
     }
 
     /**
-     * JSON Hash Key serializer
+     * Given arguments specifying an SSTable, and optionally an output file, export the contents of the SSTable to JSON.
      *
-     * @param out   The output steam to write data
-     * @param value value to set as a key
-     */
-    private static void writeKey(PrintStream out, String value)
-    {
-        writeJSON(out, value);
-        out.print(": ");
-    }
-
-    private static List<Object> serializeAtom(OnDiskAtom atom, CFMetaData cfMetaData)
-    {
-        if (atom instanceof Cell)
-        {
-            return serializeColumn((Cell) atom, cfMetaData);
-        }
-        else
-        {
-            assert atom instanceof RangeTombstone;
-            RangeTombstone rt = (RangeTombstone) atom;
-            ArrayList<Object> serializedColumn = new ArrayList<Object>();
-            serializedColumn.add(cfMetaData.comparator.getString(rt.min));
-            serializedColumn.add(cfMetaData.comparator.getString(rt.max));
-            serializedColumn.add(rt.data.markedForDeleteAt);
-            serializedColumn.add("t");
-            serializedColumn.add(rt.data.localDeletionTime);
-            return serializedColumn;
-        }
-    }
-
-    /**
-     * Serialize a given cell to a List of Objects that jsonMapper knows how to turn into strings.  Type is
-     *
-     * human_readable_name, value, timestamp, [flag, [options]]
-     *
-     * Value is normally the human readable value as rendered by the validator, but for deleted cells we
-     * give the local deletion time instead.
-     *
-     * Flag may be exactly one of {d,e,c} for deleted, expiring, or counter:
-     *  - No options for deleted cells
-     *  - If expiring, options will include the TTL and local deletion time.
-     *  - If counter, options will include timestamp of last delete
-     *
-     * @param cell     cell presentation
-     * @param cfMetaData Column Family metadata (to get validator)
-     * @return cell as serialized list
-     */
-    private static List<Object> serializeColumn(Cell cell, CFMetaData cfMetaData)
-    {
-        CellNameType comparator = cfMetaData.comparator;
-        ArrayList<Object> serializedColumn = new ArrayList<Object>();
-
-        serializedColumn.add(comparator.getString(cell.name()));
-
-        if (cell instanceof DeletedCell)
-        {
-            serializedColumn.add(cell.getLocalDeletionTime());
-        }
-        else
-        {
-            AbstractType<?> validator = cfMetaData.getValueValidator(cell.name());
-            serializedColumn.add(validator.getString(cell.value()));
-        }
-
-        serializedColumn.add(cell.timestamp());
-
-        if (cell instanceof DeletedCell)
-        {
-            serializedColumn.add("d");
-        }
-        else if (cell instanceof ExpiringCell)
-        {
-            serializedColumn.add("e");
-            serializedColumn.add(((ExpiringCell) cell).getTimeToLive());
-            serializedColumn.add(cell.getLocalDeletionTime());
-        }
-        else if (cell instanceof CounterCell)
-        {
-            serializedColumn.add("c");
-            serializedColumn.add(((CounterCell) cell).timestampOfLastDelete());
-        }
-
-        return serializedColumn;
-    }
-
-    /**
-     * Get portion of the columns and serialize in loop while not more columns left in the row
-     *
-     * @param row SSTableIdentityIterator row representation with Column Family
-     * @param key Decorated Key for the required row
-     * @param out output stream
-     */
-    private static void serializeRow(SSTableIdentityIterator row, DecoratedKey key, PrintStream out)
-    {
-        serializeRow(row.getColumnFamily().deletionInfo(), row, row.getColumnFamily().metadata(), key, out);
-    }
-
-    private static void serializeRow(DeletionInfo deletionInfo, Iterator<OnDiskAtom> atoms, CFMetaData metadata, DecoratedKey key, PrintStream out)
-    {
-        out.print("{");
-        writeKey(out, "key");
-        writeJSON(out, metadata.getKeyValidator().getString(key.getKey()));
-        out.print(",\n");
-
-        if (!deletionInfo.isLive())
-        {
-            out.print(" ");
-            writeKey(out, "metadata");
-            out.print("{");
-            writeKey(out, "deletionInfo");
-            writeJSON(out, deletionInfo.getTopLevelDeletion());
-            out.print("}");
-            out.print(",\n");
-        }
-
-        out.print(" ");
-        writeKey(out, "cells");
-        out.print("[");
-        while (atoms.hasNext())
-        {
-            writeJSON(out, serializeAtom(atoms.next(), metadata));
-
-            if (atoms.hasNext())
-                out.print(",\n           ");
-        }
-        out.print("]");
-
-        out.print("}");
-    }
-
-    /**
-     * Enumerate row keys from an SSTableReader and write the result to a PrintStream.
-     *
-     * @param desc the descriptor of the file to export the rows from
-     * @param outs PrintStream to write the output to
-     * @param metadata Metadata to print keys in a proper format
-     * @throws IOException on failure to read/write input/output
-     */
-    public static void enumeratekeys(Descriptor desc, PrintStream outs, CFMetaData metadata)
-    throws IOException
-    {
-        try (KeyIterator iter = new KeyIterator(desc))
-        {
-            DecoratedKey lastKey = null;
-            while (iter.hasNext())
-            {
-                DecoratedKey key = iter.next();
-
-                // validate order of the keys in the sstable
-                if (lastKey != null && lastKey.compareTo(key) > 0)
-                    throw new IOException("Key out of order! " + lastKey + " > " + key);
-                lastKey = key;
-
-                outs.println(metadata.getKeyValidator().getString(key.getKey()));
-                checkStream(outs); // flushes
-            }
-        }
-    }
-
-    /**
-     * Export specific rows from an SSTable and write the resulting JSON to a PrintStream.
-     *
-     * @param desc     the descriptor of the sstable to read from
-     * @param outs     PrintStream to write the output to
-     * @param toExport the keys corresponding to the rows to export
-     * @param excludes keys to exclude from export
-     * @param metadata Metadata to print keys in a proper format
-     * @throws IOException on failure to read/write input/output
-     */
-    public static void export(Descriptor desc, PrintStream outs, Collection<String> toExport, String[] excludes, CFMetaData metadata) throws IOException
-    {
-        SSTableReader sstable = SSTableReader.open(desc);
-
-        try (RandomAccessReader dfile = sstable.openDataReader())
-        {
-            IPartitioner partitioner = sstable.partitioner;
-
-            if (excludes != null)
-                toExport.removeAll(Arrays.asList(excludes));
-
-            outs.println("[");
-
-            int i = 0;
-
-            // last key to compare order
-            DecoratedKey lastKey = null;
-
-            for (String key : toExport)
-            {
-                DecoratedKey decoratedKey = partitioner.decorateKey(metadata.getKeyValidator().fromString(key));
-
-                if (lastKey != null && lastKey.compareTo(decoratedKey) > 0)
-                    throw new IOException("Key out of order! " + lastKey + " > " + decoratedKey);
-
-                lastKey = decoratedKey;
-
-                RowIndexEntry entry = sstable.getPosition(decoratedKey, SSTableReader.Operator.EQ);
-                if (entry == null)
-                    continue;
-
-                dfile.seek(entry.position);
-                ByteBufferUtil.readWithShortLength(dfile); // row key
-                DeletionInfo deletionInfo = new DeletionInfo(DeletionTime.serializer.deserialize(dfile));
-
-                Iterator<OnDiskAtom> atomIterator = sstable.metadata.getOnDiskIterator(dfile, sstable.descriptor.version);
-                checkStream(outs);
-
-                if (i != 0)
-                    outs.println(",");
-                i++;
-                serializeRow(deletionInfo, atomIterator, sstable.metadata, decoratedKey, outs);
-            }
-
-            outs.println("\n]");
-            outs.flush();
-        }
-    }
-
-    // This is necessary to accommodate the test suite since you cannot open a Reader more
-    // than once from within the same process.
-    static void export(SSTableReader reader, PrintStream outs, String[] excludes) throws IOException
-    {
-        Set<String> excludeSet = new HashSet<String>();
-
-        if (excludes != null)
-            excludeSet = new HashSet<>(Arrays.asList(excludes));
-
-        SSTableIdentityIterator row;
-        try (ISSTableScanner scanner = reader.getScanner())
-        {
-            outs.println("[");
-
-            int i = 0;
-
-            // collecting keys to export
-            while (scanner.hasNext())
-            {
-                row = (SSTableIdentityIterator) scanner.next();
-
-                String currentKey = row.getColumnFamily().metadata().getKeyValidator().getString(row.getKey().getKey());
-
-                if (excludeSet.contains(currentKey))
-                    continue;
-                else if (i != 0)
-                    outs.println(",");
-
-                serializeRow(row, row.getKey(), outs);
-                checkStream(outs);
-
-                i++;
-            }
-
-            outs.println("\n]");
-            outs.flush();
-        }
-    }
-
-    /**
-     * Export an SSTable and write the resulting JSON to a PrintStream.
-     *
-     * @param desc     the descriptor of the sstable to read from
-     * @param outs     PrintStream to write the output to
-     * @param excludes keys to exclude from export
-     * @throws IOException on failure to read/write input/output
-     */
-    public static void export(Descriptor desc, PrintStream outs, String[] excludes) throws IOException
-    {
-        export(SSTableReader.open(desc), outs, excludes);
-    }
-
-    /**
-     * Export an SSTable and write the resulting JSON to standard out.
-     *
-     * @param desc     the descriptor of the sstable to read from
-     * @param excludes keys to exclude from export
-     * @throws IOException on failure to read/write SSTable/standard out
-     */
-    public static void export(Descriptor desc, String[] excludes) throws IOException
-    {
-        export(desc, System.out, excludes);
-    }
-
-    /**
-     * Given arguments specifying an SSTable, and optionally an output file,
-     * export the contents of the SSTable to JSON.
-     *
-     * @param args command lines arguments
-     * @throws ConfigurationException on configuration failure (wrong params given)
+     * @param args
+     *            command lines arguments
+     * @throws ConfigurationException
+     *             on configuration failure (wrong params given)
      */
     public static void main(String[] args) throws ConfigurationException
     {
-        System.err.println("WARNING: please note that sstable2json is now deprecated and will be removed in Cassandra 3.0. "
-                         + "Please see https://issues.apache.org/jira/browse/CASSANDRA-9618 for details.");
-
-        String usage = String.format("Usage: %s <sstable> [-k key [-k key [...]] -x key [-x key [...]]]%n", SSTableExport.class.getName());
-
         CommandLineParser parser = new PosixParser();
         try
         {
@@ -388,70 +146,101 @@
         catch (ParseException e1)
         {
             System.err.println(e1.getMessage());
-            System.err.println(usage);
+            printUsage();
             System.exit(1);
         }
 
-
         if (cmd.getArgs().length != 1)
         {
             System.err.println("You must supply exactly one sstable");
-            System.err.println(usage);
+            printUsage();
             System.exit(1);
         }
 
-        Util.initDatabaseDescriptor();
-
         String[] keys = cmd.getOptionValues(KEY_OPTION);
-        String[] excludes = cmd.getOptionValues(EXCLUDEKEY_OPTION);
+        HashSet<String> excludes = new HashSet<>(Arrays.asList(
+                cmd.getOptionValues(EXCLUDE_KEY_OPTION) == null
+                        ? new String[0]
+                        : cmd.getOptionValues(EXCLUDE_KEY_OPTION)));
         String ssTableFileName = new File(cmd.getArgs()[0]).getAbsolutePath();
 
-        Schema.instance.loadFromDisk(false);
-        Descriptor descriptor = Descriptor.fromFilename(ssTableFileName);
-
-        // Start by validating keyspace name
-        if (Schema.instance.getKSMetaData(descriptor.ksname) == null)
+        if (Descriptor.isLegacyFile(new File(ssTableFileName)))
         {
-            System.err.println(String.format("Filename %s references to nonexistent keyspace: %s!",
-                                             ssTableFileName, descriptor.ksname));
+            System.err.println("Unsupported legacy sstable");
             System.exit(1);
         }
-        Keyspace.setInitialized();
-        Keyspace keyspace = Keyspace.open(descriptor.ksname);
-
-        // Make it works for indexes too - find parent cf if necessary
-        String baseName = descriptor.cfname;
-        if (descriptor.cfname.contains("."))
+        if (!new File(ssTableFileName).exists())
         {
-            String[] parts = descriptor.cfname.split("\\.", 2);
-            baseName = parts[0];
-        }
-
-        // IllegalArgumentException will be thrown here if ks/cf pair does not exist
-        ColumnFamilyStore cfStore = null;
-        try
-        {
-            cfStore = keyspace.getColumnFamilyStore(baseName);
-        }
-        catch (IllegalArgumentException e)
-        {
-            System.err.println(String.format("The provided table is not part of this cassandra keyspace: keyspace = %s, table = %s",
-                                             descriptor.ksname, descriptor.cfname));
+            System.err.println("Cannot find file " + ssTableFileName);
             System.exit(1);
         }
-
+        Descriptor desc = Descriptor.fromFilename(ssTableFileName);
         try
         {
-            if (cmd.hasOption(ENUMERATEKEYS_OPTION))
+            CFMetaData metadata = metadataFromSSTable(desc);
+            if (cmd.hasOption(ENUMERATE_KEYS_OPTION))
             {
-                enumeratekeys(descriptor, System.out, cfStore.metadata);
+                try (KeyIterator iter = new KeyIterator(desc, metadata))
+                {
+                    JsonTransformer.keysToJson(null, iterToStream(iter),
+                                               cmd.hasOption(RAW_TIMESTAMPS),
+                                               metadata,
+                                               System.out);
+                }
             }
             else
             {
+                SSTableReader sstable = SSTableReader.openNoValidation(desc, metadata);
+                IPartitioner partitioner = sstable.getPartitioner();
+                final ISSTableScanner currentScanner;
                 if ((keys != null) && (keys.length > 0))
-                    export(descriptor, System.out, Arrays.asList(keys), excludes, cfStore.metadata);
+                {
+                    List<AbstractBounds<PartitionPosition>> bounds = Arrays.stream(keys)
+                            .filter(key -> !excludes.contains(key))
+                            .map(metadata.getKeyValidator()::fromString)
+                            .map(partitioner::decorateKey)
+                            .sorted()
+                            .map(DecoratedKey::getToken)
+                            .map(token -> new Bounds<>(token.minKeyBound(), token.maxKeyBound())).collect(Collectors.toList());
+                    currentScanner = sstable.getScanner(bounds.iterator());
+                }
                 else
-                    export(descriptor, excludes);
+                {
+                    currentScanner = sstable.getScanner();
+                }
+                Stream<UnfilteredRowIterator> partitions = iterToStream(currentScanner).filter(i ->
+                    excludes.isEmpty() || !excludes.contains(metadata.getKeyValidator().getString(i.partitionKey().getKey()))
+                );
+                if (cmd.hasOption(DEBUG_OUTPUT_OPTION))
+                {
+                    AtomicLong position = new AtomicLong();
+                    partitions.forEach(partition ->
+                    {
+                        position.set(currentScanner.getCurrentPosition());
+
+                        if (!partition.partitionLevelDeletion().isLive())
+                        {
+                            System.out.println("[" + metadata.getKeyValidator().getString(partition.partitionKey().getKey()) + "]@" +
+                                               position.get() + " " + partition.partitionLevelDeletion());
+                        }
+                        if (!partition.staticRow().isEmpty())
+                        {
+                            System.out.println("[" + metadata.getKeyValidator().getString(partition.partitionKey().getKey()) + "]@" +
+                                               position.get() + " " + partition.staticRow().toString(metadata, true));
+                        }
+                        partition.forEachRemaining(row ->
+                        {
+                            System.out.println(
+                                    "[" + metadata.getKeyValidator().getString(partition.partitionKey().getKey()) + "]@"
+                                            + position.get() + " " + row.toString(metadata, false, true));
+                            position.set(currentScanner.getCurrentPosition());
+                        });
+                    });
+                }
+                else
+                {
+                    JsonTransformer.toJson(currentScanner, partitions, cmd.hasOption(RAW_TIMESTAMPS), metadata, System.out);
+                }
             }
         }
         catch (IOException e)
@@ -463,15 +252,10 @@
         System.exit(0);
     }
 
-    private static void writeJSON(PrintStream out, Object value)
+    private static void printUsage()
     {
-        try
-        {
-            jsonMapper.writeValue(out, value);
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e.getMessage(), e);
-        }
+        String usage = String.format("sstabledump <sstable file path> <options>%n");
+        String header = "Dump contents of given SSTable to standard output in JSON format.";
+        new HelpFormatter().printHelp(usage, header, options, "");
     }
 }

diff --git a/src/java/org/apache/cassandra/tools/SSTableImport.java b/src/java/org/apache/cassandra/tools/SSTableImport.java
deleted file mode 100644
index ad0ffa1..0000000
--- a/src/java/org/apache/cassandra/tools/SSTableImport.java
+++ /dev/null

@@ -1,575 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.tools;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
-import org.apache.commons.cli.PosixParser;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.JVMStabilityInspector;
-import org.codehaus.jackson.JsonFactory;
-import org.codehaus.jackson.JsonParser;
-import org.codehaus.jackson.JsonToken;
-import org.codehaus.jackson.map.MappingJsonFactory;
-import org.codehaus.jackson.type.TypeReference;
-
-/**
- * Create SSTables from JSON input
- */
-public class SSTableImport
-{
-    private static final String KEYSPACE_OPTION = "K";
-    private static final String COLUMN_FAMILY_OPTION = "c";
-    private static final String KEY_COUNT_OPTION = "n";
-    private static final String IS_SORTED_OPTION = "s";
-
-    private static final Options options = new Options();
-    private static CommandLine cmd;
-
-    private Integer keyCountToImport;
-    private final boolean isSorted;
-
-    private static final JsonFactory factory = new MappingJsonFactory().configure(
-                                                                                 JsonParser.Feature.INTERN_FIELD_NAMES, false);
-
-    static
-    {
-        Option optKeyspace = new Option(KEYSPACE_OPTION, true, "Keyspace name.");
-        optKeyspace.setRequired(true);
-        options.addOption(optKeyspace);
-
-        Option optColfamily = new Option(COLUMN_FAMILY_OPTION, true, "Table name.");
-        optColfamily.setRequired(true);
-        options.addOption(optColfamily);
-
-        options.addOption(new Option(KEY_COUNT_OPTION, true, "Number of keys to import (Optional)."));
-        options.addOption(new Option(IS_SORTED_OPTION, false, "Assume JSON file as already sorted (e.g. created by sstable2json tool) (Optional)."));
-    }
-
-    private static class JsonColumn<T>
-    {
-        private ByteBuffer name;
-        private ByteBuffer value;
-        private long timestamp;
-
-        private String kind;
-        // Expiring columns
-        private int ttl;
-        private int localExpirationTime;
-
-        // Counter columns
-        private long timestampOfLastDelete;
-
-        public JsonColumn(T json, CFMetaData meta)
-        {
-            if (json instanceof List)
-            {
-                CellNameType comparator = meta.comparator;
-                List fields = (List<?>) json;
-
-                assert fields.size() >= 3 : "Cell definition should have at least 3";
-
-                name  = stringAsType((String) fields.get(0), comparator.asAbstractType());
-                timestamp = (Long) fields.get(2);
-                kind = "";
-
-                if (fields.size() > 3)
-                {
-                    kind = (String) fields.get(3);
-                    if (isExpiring())
-                    {
-                        ttl = (Integer) fields.get(4);
-                        localExpirationTime = (Integer) fields.get(5);
-                    }
-                    else if (isCounter())
-                    {
-                        timestampOfLastDelete = ((Integer) fields.get(4));
-                    }
-                    else if (isRangeTombstone())
-                    {
-                        localExpirationTime = (Integer) fields.get(4);
-                    }
-                }
-
-                if (isDeleted())
-                {
-                    value = ByteBufferUtil.bytes((Integer) fields.get(1));
-                }
-                else if (isRangeTombstone())
-                {
-                    value = stringAsType((String) fields.get(1), comparator.asAbstractType());
-                }
-                else
-                {
-                    assert meta.isCQL3Table() || name.hasRemaining() : "Cell name should not be empty";
-                    value = stringAsType((String) fields.get(1),
-                                         meta.getValueValidator(name.hasRemaining()
-                                                                ? comparator.cellFromByteBuffer(name)
-                                                                : meta.comparator.rowMarker(Composites.EMPTY)));
-                }
-            }
-        }
-
-        public boolean isDeleted()
-        {
-            return kind.equals("d");
-        }
-
-        public boolean isExpiring()
-        {
-            return kind.equals("e");
-        }
-
-        public boolean isCounter()
-        {
-            return kind.equals("c");
-        }
-
-        public boolean isRangeTombstone()
-        {
-            return kind.equals("t");
-        }
-
-        public ByteBuffer getName()
-        {
-            return name.duplicate();
-        }
-
-        public ByteBuffer getValue()
-        {
-            return value.duplicate();
-        }
-    }
-
-    public SSTableImport()
-    {
-        this(null, false);
-    }
-
-    public SSTableImport(boolean isSorted)
-    {
-        this(null, isSorted);
-    }
-
-    public SSTableImport(Integer keyCountToImport, boolean isSorted)
-    {
-        this.keyCountToImport = keyCountToImport;
-        this.isSorted = isSorted;
-    }
-
-    /**
-     * Add columns to a column family.
-     *
-     * @param row the columns associated with a row
-     * @param cfamily the column family to add columns to
-     */
-    private void addColumnsToCF(List<?> row, ColumnFamily cfamily)
-    {
-        CFMetaData cfm = cfamily.metadata();
-        assert cfm != null;
-
-        for (Object c : row)
-        {
-            JsonColumn col = new JsonColumn<List>((List) c, cfm);
-            if (col.isRangeTombstone())
-            {
-                Composite start = cfm.comparator.fromByteBuffer(col.getName());
-                Composite end = cfm.comparator.fromByteBuffer(col.getValue());
-                cfamily.addAtom(new RangeTombstone(start, end, col.timestamp, col.localExpirationTime));
-                continue;
-            }
-
-            assert cfm.isCQL3Table() || col.getName().hasRemaining() : "Cell name should not be empty";
-            CellName cname = col.getName().hasRemaining() ? cfm.comparator.cellFromByteBuffer(col.getName())
-                                                          : cfm.comparator.rowMarker(Composites.EMPTY);
-
-            if (col.isExpiring())
-            {
-                cfamily.addColumn(new BufferExpiringCell(cname, col.getValue(), col.timestamp, col.ttl, col.localExpirationTime));
-            }
-            else if (col.isCounter())
-            {
-                cfamily.addColumn(new BufferCounterCell(cname, col.getValue(), col.timestamp, col.timestampOfLastDelete));
-            }
-            else if (col.isDeleted())
-            {
-                cfamily.addTombstone(cname, col.getValue(), col.timestamp);
-            }
-            else if (col.isRangeTombstone())
-            {
-                CellName end = cfm.comparator.cellFromByteBuffer(col.getValue());
-                cfamily.addAtom(new RangeTombstone(cname, end, col.timestamp, col.localExpirationTime));
-            }
-            // cql3 row marker, see CASSANDRA-5852
-            else if (cname.isEmpty())
-            {
-                cfamily.addColumn(cfm.comparator.rowMarker(Composites.EMPTY), col.getValue(), col.timestamp);
-            }
-            else
-            {
-                cfamily.addColumn(cname, col.getValue(), col.timestamp);
-            }
-        }
-    }
-
-    private void parseMeta(Map<?, ?> map, ColumnFamily cf, ByteBuffer superColumnName)
-    {
-
-        // deletionInfo is the only metadata we store for now
-        if (map.containsKey("deletionInfo"))
-        {
-            Map<?, ?> unparsedDeletionInfo = (Map<?, ?>) map.get("deletionInfo");
-            Number number = (Number) unparsedDeletionInfo.get("markedForDeleteAt");
-            long markedForDeleteAt = number instanceof Long ? (Long) number : number.longValue();
-            int localDeletionTime = (Integer) unparsedDeletionInfo.get("localDeletionTime");
-            if (superColumnName == null)
-                cf.setDeletionInfo(new DeletionInfo(markedForDeleteAt, localDeletionTime));
-            else
-                cf.addAtom(new RangeTombstone(SuperColumns.startOf(superColumnName), SuperColumns.endOf(superColumnName), markedForDeleteAt, localDeletionTime));
-        }
-    }
-
-    /**
-     * Convert a JSON formatted file to an SSTable.
-     *
-     * @param jsonFile the file containing JSON formatted data
-     * @param keyspace keyspace the data belongs to
-     * @param cf column family the data belongs to
-     * @param ssTablePath file to write the SSTable to
-     *
-     * @throws IOException for errors reading/writing input/output
-     */
-    public int importJson(String jsonFile, String keyspace, String cf, String ssTablePath) throws IOException
-    {
-        if (Schema.instance.getCFMetaData(keyspace, cf) == null)
-            throw new IllegalArgumentException(String.format("Unknown keyspace/table %s.%s",
-                                                             keyspace,
-                                                             cf));
-
-        ColumnFamily columnFamily = ArrayBackedSortedColumns.factory.create(keyspace, cf);
-        IPartitioner partitioner = DatabaseDescriptor.getPartitioner();
-
-        int importedKeys = (isSorted) ? importSorted(jsonFile, columnFamily, ssTablePath, partitioner)
-                                      : importUnsorted(jsonFile, columnFamily, ssTablePath, partitioner);
-
-        if (importedKeys != -1)
-            System.out.printf("%d keys imported successfully.%n", importedKeys);
-
-        return importedKeys;
-    }
-
-    private int importUnsorted(String jsonFile, ColumnFamily columnFamily, String ssTablePath, IPartitioner partitioner) throws IOException
-    {
-        int importedKeys = 0;
-        long start = System.nanoTime();
-
-        Object[] data;
-        try (JsonParser parser = getParser(jsonFile))
-        {
-            data = parser.readValueAs(new TypeReference<Object[]>(){});
-        }
-
-        keyCountToImport = (keyCountToImport == null) ? data.length : keyCountToImport;
-
-        try (SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(ssTablePath), keyCountToImport, ActiveRepairService.UNREPAIRED_SSTABLE, 0))
-        {
-            System.out.printf("Importing %s keys...%n", keyCountToImport);
-
-            // sort by dk representation, but hold onto the hex version
-            SortedMap<DecoratedKey, Map<?, ?>> decoratedKeys = new TreeMap<DecoratedKey, Map<?, ?>>();
-
-            for (Object row : data)
-            {
-                Map<?, ?> rowAsMap = (Map<?, ?>) row;
-                decoratedKeys.put(partitioner.decorateKey(getKeyValidator(columnFamily).fromString((String) rowAsMap.get("key"))), rowAsMap);
-            }
-
-            for (Map.Entry<DecoratedKey, Map<?, ?>> row : decoratedKeys.entrySet())
-            {
-                if (row.getValue().containsKey("metadata"))
-                {
-                    parseMeta((Map<?, ?>) row.getValue().get("metadata"), columnFamily, null);
-                }
-
-                Object columns = row.getValue().get("cells");
-                addColumnsToCF((List<?>) columns, columnFamily);
-
-
-                writer.append(row.getKey(), columnFamily);
-                columnFamily.clear();
-
-                importedKeys++;
-
-                long current = System.nanoTime();
-
-                if (TimeUnit.NANOSECONDS.toSeconds(current - start) >= 5) // 5 secs.
-                {
-                    System.out.printf("Currently imported %d keys.%n", importedKeys);
-                    start = current;
-                }
-
-                if (keyCountToImport == importedKeys)
-                    break;
-            }
-
-            writer.finish(true);
-        }
-
-        return importedKeys;
-    }
-
-    private int importSorted(String jsonFile, ColumnFamily columnFamily, String ssTablePath,
-                             IPartitioner partitioner) throws IOException
-    {
-        int importedKeys = 0; // already imported keys count
-        long start = System.nanoTime();
-
-        try (JsonParser parser = getParser(jsonFile))
-        {
-
-            if (keyCountToImport == null)
-            {
-                keyCountToImport = 0;
-                System.out.println("Counting keys to import, please wait... (NOTE: to skip this use -n <num_keys>)");
-
-                parser.nextToken(); // START_ARRAY
-                while (parser.nextToken() != null)
-                {
-                    parser.skipChildren();
-                    if (parser.getCurrentToken() == JsonToken.END_ARRAY)
-                        break;
-
-                    keyCountToImport++;
-                }
-            }
-            System.out.printf("Importing %s keys...%n", keyCountToImport);
-        }
-
-        try (JsonParser parser = getParser(jsonFile); // renewing parser
-             SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(ssTablePath), keyCountToImport, ActiveRepairService.UNREPAIRED_SSTABLE);)
-        {
-            int lineNumber = 1;
-            DecoratedKey prevStoredKey = null;
-
-            parser.nextToken(); // START_ARRAY
-            while (parser.nextToken() != null)
-            {
-                String key = parser.getCurrentName();
-                Map<?, ?> row = parser.readValueAs(new TypeReference<Map<?, ?>>(){});
-                DecoratedKey currentKey = partitioner.decorateKey(getKeyValidator(columnFamily).fromString((String) row.get("key")));
-
-                if (row.containsKey("metadata"))
-                    parseMeta((Map<?, ?>) row.get("metadata"), columnFamily, null);
-
-                addColumnsToCF((List<?>) row.get("cells"), columnFamily);
-
-                if (prevStoredKey != null && prevStoredKey.compareTo(currentKey) != -1)
-                {
-                    System.err
-                    .printf("Line %d: Key %s is greater than previous, collection is not sorted properly. Aborting import. You might need to delete SSTables manually.%n",
-                            lineNumber, key);
-                    return -1;
-                }
-
-                // saving decorated key
-                writer.append(currentKey, columnFamily);
-                columnFamily.clear();
-
-                prevStoredKey = currentKey;
-                importedKeys++;
-                lineNumber++;
-
-                long current = System.nanoTime();
-
-                if (TimeUnit.NANOSECONDS.toSeconds(current - start) >= 5) // 5 secs.
-                {
-                    System.out.printf("Currently imported %d keys.%n", importedKeys);
-                    start = current;
-                }
-
-                if (keyCountToImport == importedKeys)
-                    break;
-
-            }
-
-            writer.finish(true);
-
-            return importedKeys;
-        }
-    }
-
-    /**
-     * Get key validator for column family
-     * @param columnFamily column family instance
-     * @return key validator for given column family
-     */
-    private AbstractType<?> getKeyValidator(ColumnFamily columnFamily) {
-        // this is a fix to support backward compatibility
-        // which allows to skip the current key validator
-        // please, take a look onto CASSANDRA-7498 for more details
-        if ("true".equals(System.getProperty("skip.key.validator", "false"))) {
-            return BytesType.instance;
-        }
-        return columnFamily.metadata().getKeyValidator();
-    }
-
-    /**
-     * Get JsonParser object for file
-     * @param fileName name of the file
-     * @return json parser instance for given file
-     * @throws IOException if any I/O error.
-     */
-    private JsonParser getParser(String fileName) throws IOException
-    {
-        return factory.createJsonParser(new File(fileName));
-    }
-
-    /**
-     * Converts JSON to an SSTable file. JSON input can either be a file specified
-     * using an optional command line argument, or supplied on standard in.
-     *
-     * @param args command line arguments
-     * @throws ParseException on failure to parse JSON input
-     * @throws ConfigurationException on configuration error.
-     */
-    public static void main(String[] args) throws ParseException, ConfigurationException
-    {
-        System.err.println("WARNING: please note that json2sstable is now deprecated and will be removed in Cassandra 3.0. "
-                         + "You should use CQLSSTableWriter if you want to write sstables directly. "
-                         + "Please see https://issues.apache.org/jira/browse/CASSANDRA-9618 for details.");
-
-        CommandLineParser parser = new PosixParser();
-
-        try
-        {
-            cmd = parser.parse(options, args);
-        }
-        catch (org.apache.commons.cli.ParseException e)
-        {
-            System.err.println(e.getMessage());
-            printProgramUsage();
-            System.exit(1);
-        }
-
-        if (cmd.getArgs().length != 2)
-        {
-            printProgramUsage();
-            System.exit(1);
-        }
-
-        String json     = cmd.getArgs()[0];
-        String ssTable  = cmd.getArgs()[1];
-        String keyspace = cmd.getOptionValue(KEYSPACE_OPTION);
-        String cfamily  = cmd.getOptionValue(COLUMN_FAMILY_OPTION);
-
-        Integer keyCountToImport = null;
-        boolean isSorted = false;
-
-        if (cmd.hasOption(KEY_COUNT_OPTION))
-        {
-            keyCountToImport = Integer.valueOf(cmd.getOptionValue(KEY_COUNT_OPTION));
-        }
-
-        if (cmd.hasOption(IS_SORTED_OPTION))
-        {
-            isSorted = true;
-        }
-
-        Util.initDatabaseDescriptor();
-
-        Schema.instance.loadFromDisk(false);
-        if (Schema.instance.getNonSystemKeyspaces().size() < 1)
-        {
-            String msg = "no non-system keyspaces are defined";
-            System.err.println(msg);
-            throw new ConfigurationException(msg);
-        }
-
-        try
-        {
-            new SSTableImport(keyCountToImport, isSorted).importJson(json, keyspace, cfamily, ssTable);
-        }
-        catch (Exception e)
-        {
-            JVMStabilityInspector.inspectThrowable(e);
-            e.printStackTrace();
-            System.err.println("ERROR: " + e.getMessage());
-            System.exit(-1);
-        }
-
-        System.exit(0);
-    }
-
-    private static void printProgramUsage()
-    {
-        System.out.printf("Usage: %s -s -K <keyspace> -c <column_family> -n <num_keys> <json> <sstable>%n%n",
-                          SSTableImport.class.getName());
-
-        System.out.println("Options:");
-        for (Object o :  options.getOptions())
-        {
-            Option opt = (Option) o;
-            System.out.println("  -" +opt.getOpt() + " - " + opt.getDescription());
-        }
-    }
-
-    /**
-     * Convert a string to bytes (ByteBuffer) according to type
-     * @param content string to convert
-     * @param type type to use for conversion
-     * @return byte buffer representation of the given string
-     */
-    private static ByteBuffer stringAsType(String content, AbstractType<?> type)
-    {
-        try
-        {
-            return type.fromString(content);
-        }
-        catch (MarshalException e)
-        {
-            throw new RuntimeException(e.getMessage());
-        }
-    }
-
-}

diff --git a/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java b/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java
index 9f0af05..915edf1 100644
--- a/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java
+++ b/src/java/org/apache/cassandra/tools/SSTableLevelResetter.java

@@ -23,6 +23,7 @@
 
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
@@ -77,7 +78,7 @@
             Keyspace keyspace = Keyspace.openWithoutSSTables(keyspaceName);
             ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(columnfamily);
             boolean foundSSTable = false;
-            for (Map.Entry<Descriptor, Set<Component>> sstable : cfs.directories.sstableLister().list().entrySet())
+            for (Map.Entry<Descriptor, Set<Component>> sstable : cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).list().entrySet())
             {
                 if (sstable.getValue().contains(Component.STATS))
                 {

diff --git a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
index 8319014..19af957 100644
--- a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
+++ b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java

@@ -23,6 +23,15 @@
 import java.util.EnumSet;
 import java.util.Map;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.metadata.*;
 
@@ -31,21 +40,37 @@
  */
 public class SSTableMetadataViewer
 {
+    private static final String GCGS_KEY = "gc_grace_seconds";
+
     /**
      * @param args a list of sstables whose metadata we're interested in
      */
     public static void main(String[] args) throws IOException
     {
         PrintStream out = System.out;
-        if (args.length == 0)
+        Option optGcgs = new Option(null, GCGS_KEY, true, "The "+GCGS_KEY+" to use when calculating droppable tombstones");
+
+        Options options = new Options();
+        options.addOption(optGcgs);
+        CommandLine cmd = null;
+        CommandLineParser parser = new PosixParser();
+        try
         {
-            out.println("Usage: sstablemetadata <sstable filenames>");
-            System.exit(1);
+            cmd = parser.parse(options, args);
+        }
+        catch (ParseException e)
+        {
+            printHelp(options, out);
         }
 
+        if (cmd.getArgs().length == 0)
+        {
+            printHelp(options, out);
+        }
+        int gcgs = Integer.parseInt(cmd.getOptionValue(GCGS_KEY, "0"));
         Util.initDatabaseDescriptor();
 
-        for (String fname : args)
+        for (String fname : cmd.getArgs())
         {
             if (new File(fname).exists())
             {
@@ -65,13 +90,13 @@
                 {
                     out.printf("Minimum timestamp: %s%n", stats.minTimestamp);
                     out.printf("Maximum timestamp: %s%n", stats.maxTimestamp);
+                    out.printf("SSTable min local deletion time: %s%n", stats.minLocalDeletionTime);
                     out.printf("SSTable max local deletion time: %s%n", stats.maxLocalDeletionTime);
                     out.printf("Compression ratio: %s%n", stats.compressionRatio);
-                    out.printf("Estimated droppable tombstones: %s%n", stats.getEstimatedDroppableTombstoneRatio((int) (System.currentTimeMillis() / 1000)));
+                    out.printf("Estimated droppable tombstones: %s%n", stats.getEstimatedDroppableTombstoneRatio((int) (System.currentTimeMillis() / 1000) - gcgs));
                     out.printf("SSTable Level: %d%n", stats.sstableLevel);
                     out.printf("Repaired at: %d%n", stats.repairedAt);
-                    out.printf("Minimum replay position: %s\n", stats.commitLogLowerBound);
-                    out.printf("Maximum replay position: %s\n", stats.commitLogUpperBound);
+                    out.printf("Replay positions covered: %s\n", stats.commitLogIntervals);
                     out.println("Estimated tombstone drop times:");
                     for (Map.Entry<Double, Long> entry : stats.estimatedTombstoneDropTime.getAsMap().entrySet())
                     {
@@ -81,9 +106,7 @@
                 }
                 if (compaction != null)
                 {
-                    out.printf("Ancestors: %s%n", compaction.ancestors.toString());
                     out.printf("Estimated cardinality: %s%n", compaction.cardinalityEstimator.cardinality());
-
                 }
             }
             else
@@ -93,10 +116,17 @@
         }
     }
 
+    private static void printHelp(Options options, PrintStream out)
+    {
+        out.println();
+        new HelpFormatter().printHelp("Usage: sstablemetadata [--"+GCGS_KEY+" n] <sstable filenames>", "Dump contents of given SSTable to standard output in JSON format.", options, "");
+        System.exit(1);
+    }
+
     private static void printHistograms(StatsMetadata metadata, PrintStream out)
     {
-        long[] offsets = metadata.estimatedRowSize.getBucketOffsets();
-        long[] ersh = metadata.estimatedRowSize.getBuckets(false);
+        long[] offsets = metadata.estimatedPartitionSize.getBucketOffsets();
+        long[] ersh = metadata.estimatedPartitionSize.getBuckets(false);
         long[] ecch = metadata.estimatedColumnCount.getBuckets(false);
 
         out.println(String.format("%-10s%18s%18s",

diff --git a/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java b/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java
index 71e4cfc..b27b07a 100644
--- a/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java
+++ b/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java

@@ -30,17 +30,14 @@
 
 import com.google.common.base.Throwables;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.utils.Pair;
 
 /**
  * Create a decent leveling for the given keyspace/column family
@@ -98,7 +95,7 @@
 
         Keyspace ks = Keyspace.openWithoutSSTables(keyspace);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(columnfamily);
-        Directories.SSTableLister lister = cfs.directories.sstableLister().skipTemporary(true);
+        Directories.SSTableLister lister = cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true);
         Set<SSTableReader> sstables = new HashSet<>();
         for (Map.Entry<Descriptor, Set<Component>> sstable : lister.list().entrySet())
         {

diff --git a/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java b/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java
index ff362cc..3608808 100644
--- a/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java
+++ b/src/java/org/apache/cassandra/tools/SSTableRepairedAtSetter.java

@@ -25,6 +25,7 @@
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.service.ActiveRepairService;
@@ -47,6 +48,9 @@
      */
     public static void main(final String[] args) throws IOException
     {
+        // Necessary since BufferPool used in RandomAccessReader needs to access DatabaseDescriptor
+        Config.setClientMode(true);
+
         PrintStream out = System.out;
         if (args.length == 0)
         {

diff --git a/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java b/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java
new file mode 100644
index 0000000..7aa07d0
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java

@@ -0,0 +1,242 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.tools;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.utils.OutputHandler;
+import org.apache.commons.cli.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.function.BiFunction;
+
+import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions;
+
+public class StandaloneSSTableUtil
+{
+    private static final String TOOL_NAME = "sstableutil";
+    private static final String TYPE_OPTION  = "type";
+    private static final String OP_LOG_OPTION  = "oplog";
+    private static final String VERBOSE_OPTION  = "verbose";
+    private static final String DEBUG_OPTION  = "debug";
+    private static final String HELP_OPTION  = "help";
+    private static final String CLEANUP_OPTION = "cleanup";
+
+    public static void main(String args[])
+    {
+        Options options = Options.parseArgs(args);
+        try
+        {
+            // load keyspace descriptions.
+            Schema.instance.loadFromDisk(false);
+
+            CFMetaData metadata = Schema.instance.getCFMetaData(options.keyspaceName, options.cfName);
+            if (metadata == null)
+                throw new IllegalArgumentException(String.format("Unknown keyspace/table %s.%s",
+                                                                 options.keyspaceName,
+                                                                 options.cfName));
+
+            OutputHandler handler = new OutputHandler.SystemOutput(options.verbose, options.debug);
+
+            if (options.cleanup)
+            {
+                handler.output("Cleanuping up...");
+                LifecycleTransaction.removeUnfinishedLeftovers(metadata);
+            }
+            else
+            {
+                handler.output("Listing files...");
+                listFiles(options, metadata, handler);
+            }
+
+            System.exit(0);
+        }
+        catch (Exception e)
+        {
+            System.err.println(e.getMessage());
+            if (options.debug)
+                e.printStackTrace(System.err);
+            System.exit(1);
+        }
+    }
+
+    private static void listFiles(Options options, CFMetaData metadata, OutputHandler handler) throws IOException
+    {
+        Directories directories = new Directories(metadata, ColumnFamilyStore.getInitialDirectories());
+
+        for (File dir : directories.getCFDirectories())
+        {
+            for (File file : LifecycleTransaction.getFiles(dir.toPath(), getFilter(options), Directories.OnTxnErr.THROW))
+                handler.output(file.getCanonicalPath());
+        }
+    }
+
+    private static BiFunction<File, Directories.FileType, Boolean> getFilter(Options options)
+    {
+        return (file, type) ->
+        {
+            switch(type)
+            {
+                case FINAL:
+                    return options.type != Options.FileType.TMP;
+                case TEMPORARY:
+                    return options.type != Options.FileType.FINAL;
+                case TXN_LOG:
+                    return options.oplogs;
+                default:
+                    throw new AssertionError();
+            }
+        };
+    }
+
+    private static class Options
+    {
+        public enum FileType
+        {
+            ALL("all", "list all files, final or temporary"),
+            TMP("tmp", "list temporary files only"),
+            FINAL("final", "list final files only");
+
+            public String option;
+            public String descr;
+            FileType(String option, String descr)
+            {
+                this.option = option;
+                this.descr = descr;
+            }
+
+            static FileType fromOption(String option)
+            {
+                for (FileType fileType : FileType.values())
+                {
+                    if (fileType.option.equals(option))
+                        return fileType;
+                }
+
+                return FileType.ALL;
+            }
+
+            static String descr()
+            {
+                StringBuilder str = new StringBuilder();
+                for (FileType fileType : FileType.values())
+                {
+                    str.append(fileType.option);
+                    str.append(" (");
+                    str.append(fileType.descr);
+                    str.append("), ");
+                }
+                return str.toString();
+            }
+        }
+
+        public final String keyspaceName;
+        public final String cfName;
+
+        public boolean debug;
+        public boolean verbose;
+        public boolean oplogs;
+        public boolean cleanup;
+        public FileType type;
+
+        private Options(String keyspaceName, String cfName)
+        {
+            this.keyspaceName = keyspaceName;
+            this.cfName = cfName;
+        }
+
+        public static Options parseArgs(String cmdArgs[])
+        {
+            CommandLineParser parser = new GnuParser();
+            CmdLineOptions options = getCmdLineOptions();
+            try
+            {
+                CommandLine cmd = parser.parse(options, cmdArgs, false);
+
+                if (cmd.hasOption(HELP_OPTION))
+                {
+                    printUsage(options);
+                    System.exit(0);
+                }
+
+                String[] args = cmd.getArgs();
+                if (args.length != 2)
+                {
+                    String msg = args.length < 2 ? "Missing arguments" : "Too many arguments";
+                    System.err.println(msg);
+                    printUsage(options);
+                    System.exit(1);
+                }
+
+                String keyspaceName = args[0];
+                String cfName = args[1];
+
+                Options opts = new Options(keyspaceName, cfName);
+
+                opts.debug = cmd.hasOption(DEBUG_OPTION);
+                opts.verbose = cmd.hasOption(VERBOSE_OPTION);
+                opts.type = FileType.fromOption(cmd.getOptionValue(TYPE_OPTION));
+                opts.oplogs = cmd.hasOption(OP_LOG_OPTION);
+                opts.cleanup = cmd.hasOption(CLEANUP_OPTION);
+
+                return opts;
+            }
+            catch (ParseException e)
+            {
+                errorMsg(e.getMessage(), options);
+                return null;
+            }
+        }
+
+        private static void errorMsg(String msg, CmdLineOptions options)
+        {
+            System.err.println(msg);
+            printUsage(options);
+            System.exit(1);
+        }
+
+        private static CmdLineOptions getCmdLineOptions()
+        {
+            CmdLineOptions options = new CmdLineOptions();
+            options.addOption("c", CLEANUP_OPTION, "clean-up any outstanding transactions");
+            options.addOption("d", DEBUG_OPTION, "display stack traces");
+            options.addOption("h", HELP_OPTION, "display this help message");
+            options.addOption("o", OP_LOG_OPTION, "include operation logs");
+            options.addOption("t", TYPE_OPTION, true, FileType.descr());
+            options.addOption("v", VERBOSE_OPTION, "verbose output");
+
+            return options;
+        }
+
+        public static void printUsage(CmdLineOptions options)
+        {
+            String usage = String.format("%s [options] <keyspace> <column_family>", TOOL_NAME);
+            StringBuilder header = new StringBuilder();
+            header.append("--\n");
+            header.append("List sstable files for the provided table." );
+            header.append("\n--\n");
+            header.append("Options are:");
+            new HelpFormatter().printHelp(usage, header.toString(), options, "");
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
index f5e84c5..4778d72 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java

@@ -92,7 +92,7 @@
             String snapshotName = "pre-scrub-" + System.currentTimeMillis();
 
             OutputHandler handler = new OutputHandler.SystemOutput(options.verbose, options.debug);
-            Directories.SSTableLister lister = cfs.directories.sstableLister().skipTemporary(true);
+            Directories.SSTableLister lister = cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true);
 
             List<SSTableReader> sstables = new ArrayList<>();
 
@@ -129,7 +129,7 @@
                     try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.SCRUB, sstable))
                     {
                         txn.obsoleteOriginals(); // make sure originals are deleted and avoid NPE if index is missing, CASSANDRA-9591
-                        try (Scrubber scrubber = new Scrubber(cfs, txn, options.skipCorrupted, handler, !options.noValidate, options.reinsertOverflowedTTL))
+                        try (Scrubber scrubber = new Scrubber(cfs, txn, options.skipCorrupted, handler, !options.noValidate, options.reinserOverflowedTTL))
                         {
                             scrubber.scrub();
                         }
@@ -151,9 +151,9 @@
             }
 
             // Check (and repair) manifests
-            checkManifest(cfs.getCompactionStrategy(), cfs, sstables);
+            checkManifest(cfs.getCompactionStrategyManager(), cfs, sstables);
             CompactionManager.instance.finishCompactionsAndShutdown(5, TimeUnit.MINUTES);
-            SSTableDeletingTask.waitForDeletions();
+            LifecycleTransaction.waitForDeletions();
             System.exit(0); // We need that to stop non daemonized threads
         }
         catch (Exception e)
@@ -165,11 +165,10 @@
         }
     }
 
-    private static void checkManifest(AbstractCompactionStrategy strategy, ColumnFamilyStore cfs, Collection<SSTableReader> sstables)
+    private static void checkManifest(CompactionStrategyManager strategyManager, ColumnFamilyStore cfs, Collection<SSTableReader> sstables)
     {
-        WrappingCompactionStrategy wrappingStrategy = (WrappingCompactionStrategy)strategy;
-        int maxSizeInMB = (int)((cfs.getCompactionStrategy().getMaxSSTableBytes()) / (1024L * 1024L));
-        if (wrappingStrategy.getWrappedStrategies().size() == 2 && wrappingStrategy.getWrappedStrategies().get(0) instanceof LeveledCompactionStrategy)
+        int maxSizeInMB = (int)((cfs.getCompactionStrategyManager().getMaxSSTableBytes()) / (1024L * 1024L));
+        if (strategyManager.getStrategies().size() == 2 && strategyManager.getStrategies().get(0) instanceof LeveledCompactionStrategy)
         {
             System.out.println("Checking leveled manifest");
             Predicate<SSTableReader> repairedPredicate = new Predicate<SSTableReader>()
@@ -207,7 +206,7 @@
         public boolean manifestCheckOnly;
         public boolean skipCorrupted;
         public boolean noValidate;
-        public boolean reinsertOverflowedTTL;
+        public boolean reinserOverflowedTTL;
 
         private Options(String keyspaceName, String cfName)
         {
@@ -248,7 +247,7 @@
                 opts.manifestCheckOnly = cmd.hasOption(MANIFEST_CHECK_OPTION);
                 opts.skipCorrupted = cmd.hasOption(SKIP_CORRUPTED_OPTION);
                 opts.noValidate = cmd.hasOption(NO_VALIDATE_OPTION);
-                opts.reinsertOverflowedTTL = cmd.hasOption(REINSERT_OVERFLOWED_TTL_OPTION);
+                opts.reinserOverflowedTTL = cmd.hasOption(REINSERT_OVERFLOWED_TTL_OPTION);
 
                 return opts;
             }

diff --git a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
index 95fed3c..57504c3 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java

@@ -154,20 +154,18 @@
                 try (LifecycleTransaction transaction = LifecycleTransaction.offline(OperationType.UNKNOWN, sstable))
                 {
                     new SSTableSplitter(cfs, transaction, options.sizeInMB).split();
-
-                    // Remove the sstable (it's been copied by split and snapshotted)
-                    sstable.markObsolete(null);
-                    sstable.selfRef().release();
                 }
                 catch (Exception e)
                 {
                     System.err.println(String.format("Error splitting %s: %s", sstable, e.getMessage()));
                     if (options.debug)
                         e.printStackTrace(System.err);
+
+                    sstable.selfRef().release();
                 }
             }
             CompactionManager.instance.finishCompactionsAndShutdown(5, TimeUnit.MINUTES);
-            SSTableDeletingTask.waitForDeletions();
+            LifecycleTransaction.waitForDeletions();
             System.exit(0); // We need that to stop non daemonized threads
         }
         catch (Exception e)

diff --git a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
index 23342bc..f2a922b 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java

@@ -64,7 +64,7 @@
             ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(options.cf);
 
             OutputHandler handler = new OutputHandler.SystemOutput(false, options.debug);
-            Directories.SSTableLister lister = cfs.directories.sstableLister();
+            Directories.SSTableLister lister = cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW);
             if (options.snapshot != null)
                 lister.onlyBackups(true).snapshots(options.snapshot);
             else
@@ -106,15 +106,7 @@
                 try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.UPGRADE_SSTABLES, sstable))
                 {
                     Upgrader upgrader = new Upgrader(cfs, txn, handler);
-                    upgrader.upgrade();
-
-                    if (!options.keepSource)
-                    {
-                        // Remove the sstable (it's been copied by upgrade)
-                        System.out.format("Deleting table %s.%n", sstable.descriptor.baseFilename());
-                        sstable.markObsolete(null);
-                        sstable.selfRef().release();
-                    }
+                    upgrader.upgrade(options.keepSource);
                 }
                 catch (Exception e)
                 {
@@ -122,9 +114,15 @@
                     if (options.debug)
                         e.printStackTrace(System.err);
                 }
+                finally
+                {
+                    // we should have released this through commit of the LifecycleTransaction,
+                    // but in case the upgrade failed (or something else went wrong) make sure we don't retain a reference
+                    sstable.selfRef().ensureReleased();
+                }
             }
             CompactionManager.instance.finishCompactionsAndShutdown(5, TimeUnit.MINUTES);
-            SSTableDeletingTask.waitForDeletions();
+            LifecycleTransaction.waitForDeletions();
             System.exit(0);
         }
         catch (Exception e)

diff --git a/src/java/org/apache/cassandra/tools/StandaloneVerifier.java b/src/java/org/apache/cassandra/tools/StandaloneVerifier.java
index fb7f218..d358882 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneVerifier.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneVerifier.java

@@ -18,10 +18,6 @@
  */
 package org.apache.cassandra.tools;
 
-import com.google.common.base.Predicate;
-import com.google.common.base.Predicates;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
@@ -35,7 +31,6 @@
 import org.apache.cassandra.utils.OutputHandler;
 import org.apache.commons.cli.*;
 
-import java.io.File;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 
@@ -71,7 +66,7 @@
             ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(options.cfName);
 
             OutputHandler handler = new OutputHandler.SystemOutput(options.verbose, options.debug);
-            Directories.SSTableLister lister = cfs.directories.sstableLister().skipTemporary(true);
+            Directories.SSTableLister lister = cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true);
 
             boolean extended = options.extended;
 

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Cleanup.java b/src/java/org/apache/cassandra/tools/nodetool/Cleanup.java
index 6c6676d..47c65c8 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Cleanup.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Cleanup.java

@@ -23,8 +23,8 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.cassandra.config.Schema;
 import io.airlift.command.Option;
-import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.tools.NodeProbe;
 import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
 
@@ -42,18 +42,19 @@
     @Override
     public void execute(NodeProbe probe)
     {
-        List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        List<String> keyspaces = parseOptionalKeyspace(args, probe, KeyspaceSet.NON_LOCAL_STRATEGY);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
-            if (SystemKeyspace.NAME.equals(keyspace))
+            if (Schema.isLocalSystemKeyspace(keyspace))
                 continue;
 
             try
             {
-                probe.forceKeyspaceCleanup(System.out, jobs, keyspace, cfnames);
-            } catch (Exception e)
+                probe.forceKeyspaceCleanup(System.out, jobs, keyspace, tableNames);
+            }
+            catch (Exception e)
             {
                 throw new RuntimeException("Error occurred during cleanup", e);
             }

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Compact.java b/src/java/org/apache/cassandra/tools/nodetool/Compact.java
index 4d04ae7..002541d 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Compact.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Compact.java

@@ -40,13 +40,13 @@
     public void execute(NodeProbe probe)
     {
         List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.forceKeyspaceCompaction(splitOutput, keyspace, cfnames);
+                probe.forceKeyspaceCompaction(splitOutput, keyspace, tableNames);
             } catch (Exception e)
             {
                 throw new RuntimeException("Error occurred during compaction", e);

diff --git a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
index e57d2ee..36b5733 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java

@@ -26,11 +26,12 @@
 import java.util.List;
 import java.util.Map;
 
+import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionManagerMBean;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.tools.NodeProbe;
-import org.apache.cassandra.tools.NodeTool;
 import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
 
 @Command(name = "compactionstats", description = "Print statistics on compactions")
@@ -62,9 +63,10 @@
                 String taskType = c.get("taskType");
                 String keyspace = c.get("keyspace");
                 String columnFamily = c.get("columnfamily");
-                String completedStr = humanReadable ? FileUtils.stringifyFileSize(completed) : Long.toString(completed);
-                String totalStr = humanReadable ? FileUtils.stringifyFileSize(total) : Long.toString(total);
                 String unit = c.get("unit");
+                boolean toFileSize = humanReadable && Unit.isFileSize(unit);
+                String completedStr = toFileSize ? FileUtils.stringifyFileSize(completed) : Long.toString(completed);
+                String totalStr = toFileSize ? FileUtils.stringifyFileSize(total) : Long.toString(total);
                 String percentComplete = total == 0 ? "n/a" : new DecimalFormat("0.00").format((double) completed / total * 100) + "%";
                 String id = c.get("compactionId");
                 addLine(lines, columnSizes, id, taskType, keyspace, columnFamily, completedStr, totalStr, unit, percentComplete);
@@ -96,10 +98,14 @@
         }
     }
 
-    private void addLine(List<String[]> lines, int[] columnSizes, String... columns) {
-        lines.add(columns);
-        for (int i = 0; i < columns.length; i++) {
-            columnSizes[i] = Math.max(columnSizes[i], columns[i].length());
+    private void addLine(List<String[]> lines, int[] columnSizes, String... columns)
+    {
+        String[] newColumns = new String[columns.length];
+        for (int i = 0; i < columns.length; i++)
+        {
+            columnSizes[i] = Math.max(columnSizes[i], columns[i] != null ? columns[i].length() : 1);
+            newColumns[i] = columns[i] != null ? columns[i] : "";
         }
+        lines.add(newColumns);
     }
-}
\ No newline at end of file
+}

diff --git a/src/java/org/apache/cassandra/tools/nodetool/DisableAutoCompaction.java b/src/java/org/apache/cassandra/tools/nodetool/DisableAutoCompaction.java
index 2f5832d..4d35ded 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/DisableAutoCompaction.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/DisableAutoCompaction.java

@@ -37,13 +37,13 @@
     public void execute(NodeProbe probe)
     {
         List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        String[] tablenames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.disableAutoCompaction(keyspace, cfnames);
+                probe.disableAutoCompaction(keyspace, tablenames);
             } catch (IOException e)
             {
                 throw new RuntimeException("Error occurred during disabling auto-compaction", e);

diff --git a/src/java/org/apache/cassandra/tools/nodetool/DisableHintsForDC.java b/src/java/org/apache/cassandra/tools/nodetool/DisableHintsForDC.java
new file mode 100644
index 0000000..7072318
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/nodetool/DisableHintsForDC.java

@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.tools.nodetool;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import io.airlift.command.Arguments;
+import io.airlift.command.Command;
+import org.apache.cassandra.tools.NodeProbe;
+import org.apache.cassandra.tools.NodeTool;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+@Command(name = "disablehintsfordc", description = "Disable hints for a data center")
+public class DisableHintsForDC extends NodeTool.NodeToolCmd
+{
+    @Arguments(usage = "<datacenter>", description = "The data center to disable")
+    private List<String> args = new ArrayList<>();
+
+    public void execute(NodeProbe probe)
+    {
+        checkArgument(args.size() == 1, "disablehintsfordc requires exactly one data center");
+
+        probe.disableHintsForDC(args.get(0));
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/tools/nodetool/EnableAutoCompaction.java b/src/java/org/apache/cassandra/tools/nodetool/EnableAutoCompaction.java
index e846187..c758df8 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/EnableAutoCompaction.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/EnableAutoCompaction.java

@@ -37,13 +37,13 @@
     public void execute(NodeProbe probe)
     {
         List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.enableAutoCompaction(keyspace, cfnames);
+                probe.enableAutoCompaction(keyspace, tableNames);
             } catch (IOException e)
             {
                 throw new RuntimeException("Error occurred during enabling auto-compaction", e);

diff --git a/src/java/org/apache/cassandra/tools/nodetool/EnableHandoff.java b/src/java/org/apache/cassandra/tools/nodetool/EnableHandoff.java
index d18d77a..149c0fc 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/EnableHandoff.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/EnableHandoff.java

@@ -17,29 +17,17 @@
  */
 package org.apache.cassandra.tools.nodetool;
 
-import static com.google.common.base.Preconditions.checkArgument;
-import io.airlift.command.Arguments;
 import io.airlift.command.Command;
 
-import java.util.ArrayList;
-import java.util.List;
-
 import org.apache.cassandra.tools.NodeProbe;
 import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
 
-@Command(name = "enablehandoff", description = "Reenable the future hints storing on the current node")
+@Command(name = "enablehandoff", description = "Reenable future hints storing on the current node")
 public class EnableHandoff extends NodeToolCmd
 {
-    @Arguments(usage = "<dc-name>,<dc-name>", description = "Enable hinted handoff only for these DCs")
-    private List<String> args = new ArrayList<>();
-
     @Override
     public void execute(NodeProbe probe)
     {
-        checkArgument(args.size() <= 1, "enablehandoff does not accept two args");
-        if(args.size() == 1)
-            probe.enableHintedHandoff(args.get(0));
-        else
-            probe.enableHintedHandoff();
+        probe.enableHintedHandoff();
     }
 }
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/tools/nodetool/EnableHintsForDC.java b/src/java/org/apache/cassandra/tools/nodetool/EnableHintsForDC.java
new file mode 100644
index 0000000..1979ebd
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/nodetool/EnableHintsForDC.java

@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.tools.nodetool;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import io.airlift.command.Arguments;
+import io.airlift.command.Command;
+import org.apache.cassandra.tools.NodeProbe;
+import org.apache.cassandra.tools.NodeTool;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+@Command(name = "enablehintsfordc", description = "Enable hints for a data center that was previsouly disabled")
+public class EnableHintsForDC extends NodeTool.NodeToolCmd
+{
+    @Arguments(usage = "<datacenter>", description = "The data center to enable")
+    private List<String> args = new ArrayList<>();
+
+    public void execute(NodeProbe probe)
+    {
+        checkArgument(args.size() == 1, "enablehintsfordc requires exactly one data center");
+
+        probe.enableHintsForDC(args.get(0));
+    }
+}

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Flush.java b/src/java/org/apache/cassandra/tools/nodetool/Flush.java
index e9038f7..f768615 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Flush.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Flush.java

@@ -36,13 +36,13 @@
     public void execute(NodeProbe probe)
     {
         List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.forceKeyspaceFlush(keyspace, cfnames);
+                probe.forceKeyspaceFlush(keyspace, tableNames);
             } catch (Exception e)
             {
                 throw new RuntimeException("Error occurred during flushing", e);

diff --git a/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java b/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java
index ee7bf34..344d9dc 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java

@@ -42,12 +42,12 @@
             final Map<String,TabularData> snapshotDetails = probe.getSnapshotDetails();
             if (snapshotDetails.isEmpty())
             {
-                System.out.printf("There are no snapshots");
+                System.out.println("There are no snapshots");
                 return;
             }
 
             final long trueSnapshotsSize = probe.trueSnapshotsSize();
-            final String format = "%-20s%-29s%-29s%-19s%-19s%n";
+            final String format = "%-40s %-29s %-29s %-19s %-19s%n";
             // display column names only once
             final List<String> indexNames = snapshotDetails.entrySet().iterator().next().getValue().getTabularType().getIndexNames();
             System.out.printf(format, (Object[]) indexNames.toArray(new String[indexNames.size()]));
@@ -69,4 +69,4 @@
             throw new RuntimeException("Error during list snapshot", e);
         }
     }
-}
\ No newline at end of file
+}

diff --git a/src/java/org/apache/cassandra/tools/nodetool/RebuildIndex.java b/src/java/org/apache/cassandra/tools/nodetool/RebuildIndex.java
index 9985b2b..5fd7327 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/RebuildIndex.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/RebuildIndex.java

@@ -31,7 +31,7 @@
 @Command(name = "rebuild_index", description = "A full rebuild of native secondary indexes for a given table")
 public class RebuildIndex extends NodeToolCmd
 {
-    @Arguments(usage = "<keyspace> <table> <indexName...>", description = "The keyspace and table name followed by a list of index names (IndexNameExample: Standard3.IdxName Standard3.IdxName1)")
+    @Arguments(usage = "<keyspace> <table> <indexName...>", description = "The keyspace and table name followed by a list of index names")
     List<String> args = new ArrayList<>();
 
     @Override

diff --git a/src/java/org/apache/cassandra/tools/nodetool/ReloadLocalSchema.java b/src/java/org/apache/cassandra/tools/nodetool/ReloadLocalSchema.java
new file mode 100644
index 0000000..78fbf2d
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/nodetool/ReloadLocalSchema.java

@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.tools.nodetool;
+
+import io.airlift.command.Command;
+import org.apache.cassandra.tools.NodeProbe;
+import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
+
+@Command(name = "reloadlocalschema", description = "Reload local node schema from system tables")
+public class ReloadLocalSchema extends NodeToolCmd
+{
+    @Override
+    public void execute(NodeProbe probe)
+    {
+        probe.reloadLocalSchema();
+    }
+}

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Repair.java b/src/java/org/apache/cassandra/tools/nodetool/Repair.java
index 7d0e207..02bfc5b 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Repair.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Repair.java

@@ -56,7 +56,7 @@
     private boolean localDC = false;
 
     @Option(title = "specific_dc", name = {"-dc", "--in-dc"}, description = "Use -dc to repair specific datacenters")
-    private List<String> specificDataCenters = new ArrayList<>();
+    private List<String> specificDataCenters = new ArrayList<>();;
 
     @Option(title = "specific_host", name = {"-hosts", "--in-hosts"}, description = "Use -hosts to repair specific hosts")
     private List<String> specificHosts = new ArrayList<>();
@@ -84,8 +84,8 @@
     @Override
     public void execute(NodeProbe probe)
     {
-        List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        List<String> keyspaces = parseOptionalKeyspace(args, probe, KeyspaceSet.NON_LOCAL_STRATEGY);
+        String[] cfnames = parseOptionalTables(args);
 
         if (primaryRange && (!specificDataCenters.isEmpty() || !specificHosts.isEmpty()))
             throw new RuntimeException("Primary range repair should be performed on all nodes in the cluster.");

diff --git a/src/java/org/apache/cassandra/tools/nodetool/ReplayBatchlog.java b/src/java/org/apache/cassandra/tools/nodetool/ReplayBatchlog.java
new file mode 100644
index 0000000..e3dcbd4
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/nodetool/ReplayBatchlog.java

@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tools.nodetool;
+
+import java.io.IOError;
+import java.io.IOException;
+
+import io.airlift.command.Command;
+import org.apache.cassandra.tools.NodeProbe;
+import org.apache.cassandra.tools.NodeTool;
+
+@Command(name = "replaybatchlog", description = "Kick off batchlog replay and wait for finish")
+public class ReplayBatchlog extends NodeTool.NodeToolCmd
+{
+    protected void execute(NodeProbe probe)
+    {
+        try
+        {
+            probe.replayBatchlog();
+        }
+        catch (IOException e)
+        {
+            throw new IOError(e);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Ring.java b/src/java/org/apache/cassandra/tools/nodetool/Ring.java
index 5102029..03d9449 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Ring.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Ring.java

@@ -65,7 +65,7 @@
             @Override
             public int compare(String first, String second)
             {
-                return ((Integer) first.length()).compareTo(second.length());
+            	return Integer.compare(first.length(), second.length());
             }
         }).length();
 

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Scrub.java b/src/java/org/apache/cassandra/tools/nodetool/Scrub.java
index 3c726b9..263291d 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Scrub.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Scrub.java

@@ -49,31 +49,33 @@
                    description = "Do not validate columns using column validator")
     private boolean noValidation = false;
 
-    @Option(title = "jobs",
-            name = {"-j", "--jobs"},
-            description = "Number of sstables to scrub simultanously, set to 0 to use all available compaction threads")
-    private int jobs = 2;
-
     @Option(title = "reinsert_overflowed_ttl",
     name = {"-r", "--reinsert-overflowed-ttl"},
     description = StandaloneScrubber.REINSERT_OVERFLOWED_TTL_OPTION_DESCRIPTION)
     private boolean reinsertOverflowedTTL = false;
 
+    @Option(title = "jobs",
+            name = {"-j", "--jobs"},
+            description = "Number of sstables to scrub simultanously, set to 0 to use all available compaction threads")
+    private int jobs = 2;
+
     @Override
     public void execute(NodeProbe probe)
     {
         List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.scrub(System.out, disableSnapshot, skipCorrupted, !noValidation, reinsertOverflowedTTL, jobs, keyspace, cfnames);
-            } catch (IllegalArgumentException e)
+                probe.scrub(System.out, disableSnapshot, skipCorrupted, !noValidation, reinsertOverflowedTTL, jobs, keyspace, tableNames);
+            }
+            catch (IllegalArgumentException e)
             {
                 throw e;
-            } catch (Exception e)
+            }
+            catch (Exception e)
             {
                 throw new RuntimeException("Error occurred during scrubbing", e);
             }

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Snapshot.java b/src/java/org/apache/cassandra/tools/nodetool/Snapshot.java
index 2318620..4f549e5 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Snapshot.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Snapshot.java

@@ -68,7 +68,7 @@
                 if (!snapshotName.isEmpty())
                     sb.append(" with snapshot name [").append(snapshotName).append("]");
                 System.out.println(sb.toString());
-                probe.takeMultipleColumnFamilySnapshot(snapshotName, ktList.split(","));
+                probe.takeMultipleTableSnapshot(snapshotName, ktList.split(","));
                 System.out.println("Snapshot directory: " + snapshotName);
             }
             else

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Status.java b/src/java/org/apache/cassandra/tools/nodetool/Status.java
index 99f745d..091040b 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Status.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Status.java

@@ -25,8 +25,10 @@
 import java.net.UnknownHostException;
 import java.text.DecimalFormat;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 
 import org.apache.cassandra.locator.EndpointSnitchInfoMBean;
@@ -46,7 +48,6 @@
     private boolean resolveIp = false;
 
     private boolean isTokenPerNode = true;
-    private int maxAddressLength = 0;
     private String format = null;
     private Collection<String> joiningNodes, leavingNodes, movingNodes, liveNodes, unreachableNodes;
     private Map<String, String> loadMap, hostIDMap;
@@ -91,13 +92,13 @@
         if (dcs.values().size() < tokensToEndpoints.keySet().size())
             isTokenPerNode = false;
 
-        findMaxAddressLength(dcs);
+        int maxAddressLength = computeMaxAddressLength(dcs);
 
         // Datacenters
         for (Map.Entry<String, SetHostStat> dc : dcs.entrySet())
         {
             String dcHeader = String.format("Datacenter: %s%n", dc.getKey());
-            System.out.printf(dcHeader);
+            System.out.print(dcHeader);
             for (int i = 0; i < (dcHeader.length() - 1); i++) System.out.print('=');
             System.out.println();
 
@@ -105,7 +106,7 @@
             System.out.println("Status=Up/Down");
             System.out.println("|/ State=Normal/Leaving/Joining/Moving");
 
-            printNodesHeader(hasEffectiveOwns, isTokenPerNode);
+            printNodesHeader(hasEffectiveOwns, isTokenPerNode, maxAddressLength);
 
             ArrayListMultimap<InetAddress, HostStat> hostToTokens = ArrayListMultimap.create();
             for (HostStat stat : dc.getValue())
@@ -115,29 +116,30 @@
             {
                 Float owns = ownerships.get(endpoint);
                 List<HostStat> tokens = hostToTokens.get(endpoint);
-                printNode(endpoint.getHostAddress(), owns, tokens, hasEffectiveOwns, isTokenPerNode);
+                printNode(endpoint.getHostAddress(), owns, tokens, hasEffectiveOwns, isTokenPerNode, maxAddressLength);
             }
         }
 
-        System.out.printf("%n" + errors.toString());
+        System.out.printf("%n" + errors);
 
     }
 
-    private void findMaxAddressLength(Map<String, SetHostStat> dcs)
+    private int computeMaxAddressLength(Map<String, SetHostStat> dcs)
     {
-        maxAddressLength = 0;
-        for (Map.Entry<String, SetHostStat> dc : dcs.entrySet())
-        {
-            for (HostStat stat : dc.getValue())
-            {
-                maxAddressLength = Math.max(maxAddressLength, stat.ipOrDns().length());
-            }
-        }
+        int maxAddressLength = 0;
+
+        Set<InetAddress> seenHosts = new HashSet<>();
+        for (SetHostStat stats : dcs.values())
+            for (HostStat stat : stats)
+                if (seenHosts.add(stat.endpoint))
+                    maxAddressLength = Math.max(maxAddressLength, stat.ipOrDns().length());
+
+        return maxAddressLength;
     }
 
-    private void printNodesHeader(boolean hasEffectiveOwns, boolean isTokenPerNode)
+    private void printNodesHeader(boolean hasEffectiveOwns, boolean isTokenPerNode, int maxAddressLength)
     {
-        String fmt = getFormat(hasEffectiveOwns, isTokenPerNode);
+        String fmt = getFormat(hasEffectiveOwns, isTokenPerNode, maxAddressLength);
         String owns = hasEffectiveOwns ? "Owns (effective)" : "Owns";
 
         if (isTokenPerNode)
@@ -146,10 +148,11 @@
             System.out.printf(fmt, "-", "-", "Address", "Load", "Tokens", owns, "Host ID", "Rack");
     }
 
-    private void printNode(String endpoint, Float owns, List<HostStat> tokens, boolean hasEffectiveOwns, boolean isTokenPerNode)
+    private void printNode(String endpoint, Float owns, List<HostStat> tokens, boolean hasEffectiveOwns,
+                           boolean isTokenPerNode, int maxAddressLength)
     {
         String status, state, load, strOwns, hostID, rack, fmt;
-        fmt = getFormat(hasEffectiveOwns, isTokenPerNode);
+        fmt = getFormat(hasEffectiveOwns, isTokenPerNode, maxAddressLength);
         if (liveNodes.contains(endpoint)) status = "U";
         else if (unreachableNodes.contains(endpoint)) status = "D";
         else status = "?";
@@ -177,9 +180,7 @@
             System.out.printf(fmt, status, state, endpointDns, load, tokens.size(), strOwns, hostID, rack);
     }
 
-    private String getFormat(
-            boolean hasEffectiveOwns,
-            boolean isTokenPerNode)
+    private String getFormat(boolean hasEffectiveOwns, boolean isTokenPerNode, int maxAddressLength)
     {
         if (format == null)
         {

diff --git a/src/java/org/apache/cassandra/tools/nodetool/StatusHandoff.java b/src/java/org/apache/cassandra/tools/nodetool/StatusHandoff.java
index 5a00069..65f6729 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/StatusHandoff.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/StatusHandoff.java

@@ -28,9 +28,12 @@
     @Override
     public void execute(NodeProbe probe)
     {
-        System.out.println(
+        System.out.println(String.format("Hinted handoff is %s",
                 probe.isHandoffEnabled()
                 ? "running"
-                : "not running");
+                : "not running"));
+
+        for (String dc : probe.getHintedHandoffDisabledDCs())
+            System.out.println(String.format("Data center %s is disabled", dc));
     }
 }
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Stop.java b/src/java/org/apache/cassandra/tools/nodetool/Stop.java
index ad1fc27..6229e65 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Stop.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Stop.java

@@ -36,7 +36,7 @@
 
     @Option(title = "compactionId",
            name = {"-id", "--compaction-id"},
-           description = "Use -id to stop a compaction by the specified id. Ids can be found in the system.compactions_in_progress table.",
+           description = "Use -id to stop a compaction by the specified id. Ids can be found in the transaction log files whose name starts with compaction_, located in the table transactions folder.",
            required = false)
     private String compactionId = "";
 

diff --git a/src/java/org/apache/cassandra/tools/nodetool/StopDaemon.java b/src/java/org/apache/cassandra/tools/nodetool/StopDaemon.java
index a0af89f..79a499a 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/StopDaemon.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/StopDaemon.java

@@ -37,6 +37,5 @@
             JVMStabilityInspector.inspectThrowable(e);
             // ignored
         }
-        System.out.println("Cassandra has shutdown.");
     }
-}
\ No newline at end of file
+}

diff --git a/src/java/org/apache/cassandra/tools/nodetool/TableHistograms.java b/src/java/org/apache/cassandra/tools/nodetool/TableHistograms.java
index 207a74e..be3f799 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/TableHistograms.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/TableHistograms.java

@@ -46,7 +46,7 @@
         String table = args.get(1);
 
         // calculate percentile of row size and column count
-        long[] estimatedRowSize = (long[]) probe.getColumnFamilyMetric(keyspace, table, "EstimatedRowSizeHistogram");
+        long[] estimatedPartitionSize = (long[]) probe.getColumnFamilyMetric(keyspace, table, "EstimatedPartitionSizeHistogram");
         long[] estimatedColumnCount = (long[]) probe.getColumnFamilyMetric(keyspace, table, "EstimatedColumnCountHistogram");
 
         // build arrays to store percentile values
@@ -54,7 +54,7 @@
         double[] estimatedColumnCountPercentiles = new double[7];
         double[] offsetPercentiles = new double[]{0.5, 0.75, 0.95, 0.98, 0.99};
 
-        if (ArrayUtils.isEmpty(estimatedRowSize) || ArrayUtils.isEmpty(estimatedColumnCount))
+        if (ArrayUtils.isEmpty(estimatedPartitionSize) || ArrayUtils.isEmpty(estimatedColumnCount))
         {
             System.err.println("No SSTables exists, unable to calculate 'Partition Size' and 'Cell Count' percentiles");
 
@@ -66,19 +66,19 @@
         }
         else
         {
-            EstimatedHistogram rowSizeHist = new EstimatedHistogram(estimatedRowSize);
+            EstimatedHistogram partitionSizeHist = new EstimatedHistogram(estimatedPartitionSize);
             EstimatedHistogram columnCountHist = new EstimatedHistogram(estimatedColumnCount);
 
-            if (rowSizeHist.isOverflowed())
+            if (partitionSizeHist.isOverflowed())
             {
-                System.err.println(String.format("Row sizes are larger than %s, unable to calculate percentiles", rowSizeHist.getLargestBucketOffset()));
+                System.err.println(String.format("Row sizes are larger than %s, unable to calculate percentiles", partitionSizeHist.getLargestBucketOffset()));
                 for (int i = 0; i < offsetPercentiles.length; i++)
                         estimatedRowSizePercentiles[i] = Double.NaN;
             }
             else
             {
                 for (int i = 0; i < offsetPercentiles.length; i++)
-                    estimatedRowSizePercentiles[i] = rowSizeHist.percentile(offsetPercentiles[i]);
+                    estimatedRowSizePercentiles[i] = partitionSizeHist.percentile(offsetPercentiles[i]);
             }
 
             if (columnCountHist.isOverflowed())
@@ -94,10 +94,10 @@
             }
 
             // min value
-            estimatedRowSizePercentiles[5] = rowSizeHist.min();
+            estimatedRowSizePercentiles[5] = partitionSizeHist.min();
             estimatedColumnCountPercentiles[5] = columnCountHist.min();
             // max value
-            estimatedRowSizePercentiles[6] = rowSizeHist.max();
+            estimatedRowSizePercentiles[6] = partitionSizeHist.max();
             estimatedColumnCountPercentiles[6] = columnCountHist.max();
         }
 

diff --git a/src/java/org/apache/cassandra/tools/nodetool/TableStats.java b/src/java/org/apache/cassandra/tools/nodetool/TableStats.java
index a1d2038..c7d0d30 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/TableStats.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/TableStats.java

@@ -186,13 +186,12 @@
                     System.out.println("\t\tOff heap memory used (total): " + format(offHeapSize, humanReadable));
                 System.out.println("\t\tSSTable Compression Ratio: " + probe.getColumnFamilyMetric(keyspaceName, tableName, "CompressionRatio"));
 
-                Object estimatedRowCount = probe.getColumnFamilyMetric(keyspaceName, tableName, "EstimatedRowCount");
-                if (Long.valueOf(-1L).equals(estimatedRowCount))
+                Object estimatedPartitionCount = probe.getColumnFamilyMetric(keyspaceName, tableName, "EstimatedPartitionCount");
+                if (Long.valueOf(-1L).equals(estimatedPartitionCount))
                 {
-                    estimatedRowCount = 0L;
+                    estimatedPartitionCount = 0L;
                 }
-
-                System.out.println("\t\tNumber of keys (estimate): " + estimatedRowCount);
+                System.out.println("\t\tNumber of partitions (estimate): " + estimatedPartitionCount);
 
                 System.out.println("\t\tMemtable cell count: " + probe.getColumnFamilyMetric(keyspaceName, tableName, "MemtableColumnsCount"));
                 System.out.println("\t\tMemtable data size: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MemtableLiveDataSize"), humanReadable));
@@ -218,9 +217,9 @@
                 if (compressionMetadataOffHeapSize != null)
                     System.out.println("\t\tCompression metadata off heap memory used: " + format(compressionMetadataOffHeapSize, humanReadable));
 
-                System.out.println("\t\tCompacted partition minimum bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MinRowSize"), humanReadable));
-                System.out.println("\t\tCompacted partition maximum bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MaxRowSize"), humanReadable));
-                System.out.println("\t\tCompacted partition mean bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MeanRowSize"), humanReadable));
+                System.out.println("\t\tCompacted partition minimum bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MinPartitionSize"), humanReadable));
+                System.out.println("\t\tCompacted partition maximum bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MaxPartitionSize"), humanReadable));
+                System.out.println("\t\tCompacted partition mean bytes: " + format((Long) probe.getColumnFamilyMetric(keyspaceName, tableName, "MeanPartitionSize"), humanReadable));
                 CassandraMetricsRegistry.JmxHistogramMBean histogram = (CassandraMetricsRegistry.JmxHistogramMBean) probe.getColumnFamilyMetric(keyspaceName, tableName, "LiveScannedHistogram");
                 System.out.println("\t\tAverage live cells per slice (last five minutes): " + histogram.getMean());
                 System.out.println("\t\tMaximum live cells per slice (last five minutes): " + histogram.getMax());

diff --git a/src/java/org/apache/cassandra/tools/nodetool/TopPartitions.java b/src/java/org/apache/cassandra/tools/nodetool/TopPartitions.java
index 35e13ce..b473a8d 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/TopPartitions.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/TopPartitions.java

@@ -33,8 +33,8 @@
 import javax.management.openmbean.OpenDataException;
 import javax.management.openmbean.TabularDataSupport;
 
-import org.apache.cassandra.metrics.ColumnFamilyMetrics;
-import org.apache.cassandra.metrics.ColumnFamilyMetrics.Sampler;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.metrics.TableMetrics.Sampler;
 import org.apache.cassandra.tools.NodeProbe;
 import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
 
@@ -51,7 +51,7 @@
     @Option(name = "-k", description = "Number of the top partitions to list (Default: 10)")
     private int topCount = 10;
     @Option(name = "-a", description = "Comma separated list of samplers to use (Default: all)")
-    private String samplers = join(ColumnFamilyMetrics.Sampler.values(), ',');
+    private String samplers = join(TableMetrics.Sampler.values(), ',');
     @Override
     public void execute(NodeProbe probe)
     {

diff --git a/src/java/org/apache/cassandra/tools/nodetool/TpStats.java b/src/java/org/apache/cassandra/tools/nodetool/TpStats.java
index f3448ab..5d3eab7 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/TpStats.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/TpStats.java

@@ -34,8 +34,8 @@
     {
         System.out.printf("%-25s%10s%10s%15s%10s%18s%n", "Pool Name", "Active", "Pending", "Completed", "Blocked", "All time blocked");
 
-        Multimap<String, String> threadPools = probe.getThreadPools();
 
+        Multimap<String, String> threadPools = probe.getThreadPools();
         for (Map.Entry<String, String> tpool : threadPools.entries())
         {
             System.out.printf("%-25s%10s%10s%15s%10s%18s%n",

diff --git a/src/java/org/apache/cassandra/tools/nodetool/UpgradeSSTable.java b/src/java/org/apache/cassandra/tools/nodetool/UpgradeSSTable.java
index fcb1ab2..82866e0 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/UpgradeSSTable.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/UpgradeSSTable.java

@@ -44,18 +44,19 @@
     @Override
     public void execute(NodeProbe probe)
     {
-        List<String> keyspaces = parseOptionalKeyspace(args, probe, true);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        List<String> keyspaces = parseOptionalKeyspace(args, probe);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.upgradeSSTables(System.out, keyspace, !includeAll, jobs, cfnames);
-            } catch (Exception e)
+                probe.upgradeSSTables(System.out, keyspace, !includeAll, jobs, tableNames);
+            }
+            catch (Exception e)
             {
                 throw new RuntimeException("Error occurred during enabling auto-compaction", e);
             }
         }
     }
-}
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/tools/nodetool/Verify.java b/src/java/org/apache/cassandra/tools/nodetool/Verify.java
index 813b761..c449366 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/Verify.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/Verify.java

@@ -42,13 +42,13 @@
     public void execute(NodeProbe probe)
     {
         List<String> keyspaces = parseOptionalKeyspace(args, probe);
-        String[] cfnames = parseOptionalColumnFamilies(args);
+        String[] tableNames = parseOptionalTables(args);
 
         for (String keyspace : keyspaces)
         {
             try
             {
-                probe.verify(System.out, extendedVerify, keyspace, cfnames);
+                probe.verify(System.out, extendedVerify, keyspace, tableNames);
             } catch (Exception e)
             {
                 throw new RuntimeException("Error occurred during verifying", e);

diff --git a/src/java/org/apache/cassandra/tracing/TraceKeyspace.java b/src/java/org/apache/cassandra/tracing/TraceKeyspace.java
index f66269d..fb70451 100644
--- a/src/java/org/apache/cassandra/tracing/TraceKeyspace.java
+++ b/src/java/org/apache/cassandra/tracing/TraceKeyspace.java

@@ -21,21 +21,39 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import com.google.common.collect.ImmutableMap;
-
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.CFRowAdder;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.Tables;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
 
 public final class TraceKeyspace
 {
+    private TraceKeyspace()
+    {
+    }
+
     public static final String NAME = "system_traces";
 
+    /**
+     * Generation is used as a timestamp for automatic table creation on startup.
+     * If you make any changes to the tables below, make sure to increment the
+     * generation and document your change here.
+     *
+     * gen 1577836800000000: (3.0) maps to Jan 1 2020; an arbitrary cut-off date by which we assume no nodes older than 2.0.2
+     *                       will ever start; see the note below for why this is necessary; actual change in 3.0:
+     *                       removed default ttl, reduced bloom filter fp chance from 0.1 to 0.01.
+     * gen 1577836800000001: (pre-)adds coordinator_port column to sessions and source_port column to events in 3.0, 3.11, 4.0
+     *
+     * * Until CASSANDRA-6016 (Oct 13, 2.0.2) and in all of 1.2, we used to create system_traces keyspace and
+     *   tables in the same way that we created the purely local 'system' keyspace - using current time on node bounce
+     *   (+1). For new definitions to take, we need to bump the generation further than that.
+     */
+    public static final long GENERATION = 1577836800000001L;
+
     public static final String SESSIONS = "sessions";
     public static final String EVENTS = "events";
 
@@ -47,6 +65,7 @@
                 + "command text,"
                 + "client inet,"
                 + "coordinator inet,"
+                + "coordinator_port int,"
                 + "duration int,"
                 + "parameters map<text, text>,"
                 + "request text,"
@@ -61,6 +80,7 @@
                 + "event_id timeuuid,"
                 + "activity text,"
                 + "source inet,"
+                + "source_port int,"
                 + "source_elapsed int,"
                 + "thread text,"
                 + "PRIMARY KEY ((session_id), event_id))");
@@ -71,10 +91,9 @@
                          .comment(description);
     }
 
-    public static KSMetaData definition()
+    public static KeyspaceMetadata metadata()
     {
-        List<CFMetaData> tables = Arrays.asList(Sessions, Events);
-        return new KSMetaData(NAME, SimpleStrategy.class, ImmutableMap.of("replication_factor", "2"), true, tables);
+        return KeyspaceMetadata.create(NAME, KeyspaceParams.simple(2), Tables.of(Sessions, Events));
     }
 
     static Mutation makeStartSessionMutation(ByteBuffer sessionId,
@@ -85,44 +104,36 @@
                                              String command,
                                              int ttl)
     {
-        Mutation mutation = new Mutation(NAME, sessionId);
-        ColumnFamily cells = mutation.addOrGet(TraceKeyspace.Sessions);
+        RowUpdateBuilder adder = new RowUpdateBuilder(Sessions, FBUtilities.timestampMicros(), ttl, sessionId)
+                                 .clustering()
+                                 .add("client", client)
+                                 .add("coordinator", FBUtilities.getBroadcastAddress())
+                                 .add("request", request)
+                                 .add("started_at", new Date(startedAt))
+                                 .add("command", command);
 
-        CFRowAdder adder = new CFRowAdder(cells, cells.metadata().comparator.builder().build(), FBUtilities.timestampMicros(), ttl);
-        adder.add("client", client)
-             .add("coordinator", FBUtilities.getBroadcastAddress())
-             .add("request", request)
-             .add("started_at", new Date(startedAt))
-             .add("command", command);
         for (Map.Entry<String, String> entry : parameters.entrySet())
             adder.addMapEntry("parameters", entry.getKey(), entry.getValue());
-
-        return mutation;
+        return adder.build();
     }
 
     static Mutation makeStopSessionMutation(ByteBuffer sessionId, int elapsed, int ttl)
     {
-        Mutation mutation = new Mutation(NAME, sessionId);
-        ColumnFamily cells = mutation.addOrGet(Sessions);
-
-        CFRowAdder adder = new CFRowAdder(cells, cells.metadata().comparator.builder().build(), FBUtilities.timestampMicros(), ttl);
-        adder.add("duration", elapsed);
-
-        return mutation;
+        return new RowUpdateBuilder(Sessions, FBUtilities.timestampMicros(), ttl, sessionId)
+               .clustering()
+               .add("duration", elapsed)
+               .build();
     }
 
     static Mutation makeEventMutation(ByteBuffer sessionId, String message, int elapsed, String threadName, int ttl)
     {
-        Mutation mutation = new Mutation(NAME, sessionId);
-        ColumnFamily cells = mutation.addOrGet(Events);
-
-        CFRowAdder adder = new CFRowAdder(cells, cells.metadata().comparator.make(UUIDGen.getTimeUUID()), FBUtilities.timestampMicros(), ttl);
-        adder.add("activity", message)
-             .add("source", FBUtilities.getBroadcastAddress())
-             .add("thread", threadName);
+        RowUpdateBuilder adder = new RowUpdateBuilder(Events, FBUtilities.timestampMicros(), ttl, sessionId)
+                                 .clustering(UUIDGen.getTimeUUID());
+        adder.add("activity", message);
+        adder.add("source", FBUtilities.getBroadcastAddress());
+        adder.add("thread", threadName);
         if (elapsed >= 0)
             adder.add("source_elapsed", elapsed);
-
-        return mutation;
+        return adder.build();
     }
 }

diff --git a/src/java/org/apache/cassandra/transport/CBUtil.java b/src/java/org/apache/cassandra/transport/CBUtil.java
index 92e2891..800a9a8 100644
--- a/src/java/org/apache/cassandra/transport/CBUtil.java
+++ b/src/java/org/apache/cassandra/transport/CBUtil.java

@@ -430,6 +430,13 @@
         return 4 + (bytes == null ? 0 : bytes.remaining());
     }
 
+    // The size of serializing a value given the size (in bytes) of said value. The provided size can be negative
+    // to indicate that the value is null.
+    public static int sizeOfValue(int valueSize)
+    {
+        return 4 + (valueSize < 0 ? 0 : valueSize);
+    }
+
     public static List<ByteBuffer> readValueList(ByteBuf cb, int protocolVersion)
     {
         int size = cb.readUnsignedShort();

diff --git a/src/java/org/apache/cassandra/transport/ConfiguredLimit.java b/src/java/org/apache/cassandra/transport/ConfiguredLimit.java
new file mode 100644
index 0000000..98518b8
--- /dev/null
+++ b/src/java/org/apache/cassandra/transport/ConfiguredLimit.java

@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.transport;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.utils.CassandraVersion;
+
+public abstract class ConfiguredLimit implements ProtocolVersionLimit
+{
+    private static final Logger logger = LoggerFactory.getLogger(ConfiguredLimit.class);
+    static final String DISABLE_MAX_PROTOCOL_AUTO_OVERRIDE = "cassandra.disable_max_protocol_auto_override";
+    static final CassandraVersion MIN_VERSION_FOR_V4 = new CassandraVersion("3.0.0");
+
+    public abstract int getMaxVersion();
+    public abstract void updateMaxSupportedVersion();
+
+    public static ConfiguredLimit newLimit()
+    {
+        if (Boolean.getBoolean(DISABLE_MAX_PROTOCOL_AUTO_OVERRIDE))
+            return new StaticLimit(Server.CURRENT_VERSION);
+
+        int fromConfig = DatabaseDescriptor.getNativeProtocolMaxVersionOverride();
+        return fromConfig != Integer.MIN_VALUE
+               ? new StaticLimit(fromConfig)
+               : new DynamicLimit(Server.CURRENT_VERSION);
+    }
+
+    private static class StaticLimit extends ConfiguredLimit
+    {
+        private final int maxVersion;
+        private StaticLimit(int maxVersion)
+        {
+            if (maxVersion < Server.MIN_SUPPORTED_VERSION || maxVersion > Server.CURRENT_VERSION)
+                throw new IllegalArgumentException(String.format("Invalid max protocol version supplied (%s); " +
+                                                                 "Values between %s and %s are supported",
+                                                                 maxVersion,
+                                                                 Server.MIN_SUPPORTED_VERSION,
+                                                                 Server.CURRENT_VERSION));
+            this.maxVersion = maxVersion;
+            logger.info("Native transport max negotiable version statically limited to {}", maxVersion);
+        }
+
+        public int getMaxVersion()
+        {
+            return maxVersion;
+        }
+
+        public void updateMaxSupportedVersion()
+        {
+            // statically configured, so this is a no-op
+        }
+    }
+
+    private static class DynamicLimit extends ConfiguredLimit
+    {
+        private volatile int maxVersion;
+        private DynamicLimit(int initialLimit)
+        {
+            maxVersion = initialLimit;
+            maybeUpdateVersion(true);
+        }
+
+        public int getMaxVersion()
+        {
+            return maxVersion;
+        }
+
+        public void updateMaxSupportedVersion()
+        {
+            maybeUpdateVersion(false);
+        }
+
+        private void maybeUpdateVersion(boolean allowLowering)
+        {
+            boolean enforceV3Cap = SystemKeyspace.loadPeerVersions()
+                                                 .values()
+                                                 .stream()
+                                                 .anyMatch(v -> v.compareTo(MIN_VERSION_FOR_V4) < 0);
+
+            if (!enforceV3Cap)
+            {
+                maxVersion = Server.CURRENT_VERSION;
+                return;
+            }
+
+            if (maxVersion > Server.VERSION_3 && !allowLowering)
+            {
+                logger.info("Detected peers which do not fully support protocol V4, but V4 was previously negotiable. " +
+                            "Not enforcing cap as this can cause issues for older client versions. After the next " +
+                            "restart the server will apply the cap");
+                return;
+            }
+            logger.info("Detected peers which do not fully support protocol V4. Capping max negotiable version to V3");
+            maxVersion = Server.VERSION_3;
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/transport/Connection.java b/src/java/org/apache/cassandra/transport/Connection.java
index af26557..2966d9b 100644
--- a/src/java/org/apache/cassandra/transport/Connection.java
+++ b/src/java/org/apache/cassandra/transport/Connection.java

@@ -29,6 +29,7 @@
     private final Tracker tracker;
 
     private volatile FrameCompressor frameCompressor;
+    private boolean throwOnOverload;
 
     public Connection(Channel channel, int version, Tracker tracker)
     {
@@ -49,6 +50,16 @@
         return frameCompressor;
     }
 
+    public void setThrowOnOverload(boolean throwOnOverload)
+    {
+        this.throwOnOverload = throwOnOverload;
+    }
+
+    public boolean isThrowOnOverload()
+    {
+        return throwOnOverload;
+    }
+
     public Tracker getTracker()
     {
         return tracker;

diff --git a/src/java/org/apache/cassandra/transport/Event.java b/src/java/org/apache/cassandra/transport/Event.java
index a3e0888..3c45c33 100644
--- a/src/java/org/apache/cassandra/transport/Event.java
+++ b/src/java/org/apache/cassandra/transport/Event.java

@@ -21,7 +21,6 @@
 import java.net.InetSocketAddress;
 import java.util.Iterator;
 import java.util.List;
-import java.util.UUID;
 
 import com.google.common.base.Objects;
 import io.netty.buffer.ByteBuf;
@@ -29,9 +28,9 @@
 public abstract class Event
 {
     public enum Type {
-        TOPOLOGY_CHANGE(Server.VERSION_1),
-        STATUS_CHANGE(Server.VERSION_1),
-        SCHEMA_CHANGE(Server.VERSION_1),
+        TOPOLOGY_CHANGE(Server.VERSION_3),
+        STATUS_CHANGE(Server.VERSION_3),
+        SCHEMA_CHANGE(Server.VERSION_3),
         TRACE_COMPLETE(Server.VERSION_4);
 
         public final int minimumVersion;

diff --git a/src/java/org/apache/cassandra/transport/Frame.java b/src/java/org/apache/cassandra/transport/Frame.java
index 66df3e7..a07551f 100644
--- a/src/java/org/apache/cassandra/transport/Frame.java
+++ b/src/java/org/apache/cassandra/transport/Frame.java

@@ -22,9 +22,6 @@
 import java.util.EnumSet;
 import java.util.List;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import io.netty.buffer.ByteBuf;
 import io.netty.channel.*;
 import io.netty.handler.codec.ByteToMessageDecoder;
@@ -36,19 +33,12 @@
 
 public class Frame
 {
-    private static final Logger logger = LoggerFactory.getLogger(Frame.class);
-
     public static final byte PROTOCOL_VERSION_MASK = 0x7f;
 
     public final Header header;
     public final ByteBuf body;
 
     /**
-     * <code>true</code> if the deprecation warning for protocol versions 1 and 2 has been logged.
-     */
-    private static boolean hasLoggedDeprecationWarning;
-
-    /**
      * An on-wire frame consists of a header and a body.
      *
      * The header is defined the following way in native protocol version 3 and later:
@@ -59,16 +49,6 @@
      *   +---------+---------+---------+---------+---------+
      *   |                length                 |
      *   +---------+---------+---------+---------+
-     *
-     *
-     * In versions 1 and 2 the header has a smaller (1 byte) stream id, and is thus defined the following way:
-     *
-     *   0         8        16        24        32
-     *   +---------+---------+---------+---------+
-     *   | version |  flags  | stream  | opcode  |
-     *   +---------+---------+---------+---------+
-     *   |                length                 |
-     *   +---------+---------+---------+---------+
      */
     private Frame(Header header, ByteBuf body)
     {
@@ -88,15 +68,14 @@
 
     public static Frame create(Message.Type type, int streamId, int version, EnumSet<Header.Flag> flags, ByteBuf body)
     {
-        Header header = new Header(version, flags, streamId, type);
+        Header header = new Header(version, flags, streamId, type, body.readableBytes());
         return new Frame(header, body);
     }
 
     public static class Header
     {
-        // 8 bytes in protocol versions 1 and 2, 8 bytes in protocol version 3 and later
-        public static final int MODERN_LENGTH = 9;
-        public static final int LEGACY_LENGTH = 8;
+        // 9 bytes in protocol version 3 and later
+        public static final int LENGTH = 9;
 
         public static final int BODY_LENGTH_SIZE = 4;
 
@@ -104,18 +83,20 @@
         public final EnumSet<Flag> flags;
         public final int streamId;
         public final Message.Type type;
+        public final long bodySizeInBytes;
 
-        private Header(int version, int flags, int streamId, Message.Type type)
+        private Header(int version, int flags, int streamId, Message.Type type, long bodySizeInBytes)
         {
-            this(version, Flag.deserialize(flags), streamId, type);
+            this(version, Flag.deserialize(flags), streamId, type, bodySizeInBytes);
         }
 
-        private Header(int version, EnumSet<Flag> flags, int streamId, Message.Type type)
+        private Header(int version, EnumSet<Flag> flags, int streamId, Message.Type type, long bodySizeInBytes)
         {
             this.version = version;
             this.flags = flags;
             this.streamId = streamId;
             this.type = type;
+            this.bodySizeInBytes = bodySizeInBytes;
         }
 
         public static enum Flag
@@ -164,10 +145,12 @@
         private int tooLongStreamId;
 
         private final Connection.Factory factory;
+        private final ProtocolVersionLimit versionCap;
 
-        public Decoder(Connection.Factory factory)
+        public Decoder(Connection.Factory factory, ProtocolVersionLimit versionCap)
         {
             this.factory = factory;
+            this.versionCap = versionCap;
         }
 
         @Override
@@ -183,48 +166,30 @@
                 return;
             }
 
-            // Wait until we have read at least the short header
-            if (buffer.readableBytes() < Header.LEGACY_LENGTH)
+            int readableBytes = buffer.readableBytes();
+            if (readableBytes == 0)
                 return;
 
             int idx = buffer.readerIndex();
 
+            // Check the first byte for the protocol version before we wait for a complete header.  Protocol versions
+            // 1 and 2 use a shorter header, so we may never have a complete header's worth of bytes.
             int firstByte = buffer.getByte(idx++);
             Message.Direction direction = Message.Direction.extractFromVersion(firstByte);
             int version = firstByte & PROTOCOL_VERSION_MASK;
+            if (version < Server.MIN_SUPPORTED_VERSION || version > versionCap.getMaxVersion())
+                throw new ProtocolException(String.format("Invalid or unsupported protocol version (%d); the lowest supported version is %d and the greatest is %d",
+                                                          version, Server.MIN_SUPPORTED_VERSION, versionCap.getMaxVersion()),
+                                            version < Server.MIN_SUPPORTED_VERSION ? version : null);
 
-            if (version > Server.CURRENT_VERSION)
-                throw new ProtocolException(String.format("Invalid or unsupported protocol version (%d); highest supported is %d ",
-                                                          version, Server.CURRENT_VERSION));
-
-            if (version < Server.VERSION_3 && !hasLoggedDeprecationWarning)
-            {
-                hasLoggedDeprecationWarning = true;
-                logger.warn("Detected connection using native protocol version {}. Both version 1 and 2"
-                          + " of the native protocol are now deprecated and support will be removed in Cassandra 3.0."
-                          + " You are encouraged to upgrade to a client driver using version 3 of the native protocol",
-                            version);
-            }
-
-            // Wait until we have the complete V3+ header
-            if (version >= Server.VERSION_3 && buffer.readableBytes() < Header.MODERN_LENGTH)
+            // Wait until we have the complete header
+            if (readableBytes < Header.LENGTH)
                 return;
 
             int flags = buffer.getByte(idx++);
 
-            int streamId, headerLength;
-            if (version >= Server.VERSION_3)
-            {
-                streamId = buffer.getShort(idx);
-                idx += 2;
-                headerLength = Header.MODERN_LENGTH;
-            }
-            else
-            {
-                streamId = buffer.getByte(idx);
-                idx++;
-                headerLength = Header.LEGACY_LENGTH;
-            }
+            int streamId = buffer.getShort(idx);
+            idx += 2;
 
             // This throws a protocol exceptions if the opcode is unknown
             Message.Type type;
@@ -239,7 +204,8 @@
 
             long bodyLength = buffer.getUnsignedInt(idx);
             idx += Header.BODY_LENGTH_SIZE;
-            long frameLength = bodyLength + headerLength;
+
+            long frameLength = bodyLength + Header.LENGTH;
             if (frameLength > MAX_FRAME_LENGTH)
             {
                 // Enter the discard mode and discard everything received so far.
@@ -278,7 +244,7 @@
                         streamId);
             }
 
-            results.add(new Frame(new Header(version, flags, streamId, type), body));
+            results.add(new Frame(new Header(version, flags, streamId, type, bodyLength), body));
         }
 
         private void fail()
@@ -306,15 +272,14 @@
         public void encode(ChannelHandlerContext ctx, Frame frame, List<Object> results)
         throws IOException
         {
-            int headerLength = frame.header.version >= Server.VERSION_3
-                             ? Header.MODERN_LENGTH
-                             : Header.LEGACY_LENGTH;
-            ByteBuf header = CBUtil.allocator.buffer(headerLength);
+            ByteBuf header = CBUtil.allocator.buffer(Header.LENGTH);
 
             Message.Type type = frame.header.type;
             header.writeByte(type.direction.addToVersion(frame.header.version));
             header.writeByte(Header.Flag.serialize(frame.header.flags));
 
+            // Continue to support writing pre-v3 headers so that we can give proper error messages to drivers that
+            // connect with the v1/v2 protocol. See CASSANDRA-11464.
             if (frame.header.version >= Server.VERSION_3)
                 header.writeShort(frame.header.streamId);
             else

diff --git a/src/java/org/apache/cassandra/transport/Message.java b/src/java/org/apache/cassandra/transport/Message.java
index 2c2048f..5202578 100644
--- a/src/java/org/apache/cassandra/transport/Message.java
+++ b/src/java/org/apache/cassandra/transport/Message.java

@@ -42,11 +42,18 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.concurrent.LocalAwareExecutorService;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.exceptions.OverloadedException;
+import org.apache.cassandra.metrics.ClientMetrics;
+import org.apache.cassandra.net.ResourceLimits;
 import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.transport.messages.*;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 
+import static org.apache.cassandra.concurrent.SharedExecutorPool.SHARED;
+
 /**
  * A message from the CQL binary protocol.
  */
@@ -150,6 +157,7 @@
     private int streamId;
     private Frame sourceFrame;
     private Map<String, ByteBuffer> customPayload;
+    protected Integer forcedProtocolVersion = null;
 
     protected Message(Type type)
     {
@@ -314,11 +322,18 @@
     @ChannelHandler.Sharable
     public static class ProtocolEncoder extends MessageToMessageEncoder<Message>
     {
+        private final ProtocolVersionLimit versionCap;
+
+        ProtocolEncoder(ProtocolVersionLimit versionCap)
+        {
+            this.versionCap = versionCap;
+        }
+
         public void encode(ChannelHandlerContext ctx, Message message, List results)
         {
             Connection connection = ctx.channel().attr(Connection.attributeKey).get();
             // The only case the connection can be null is when we send the initial STARTUP message (client side thus)
-            int version = connection == null ? Server.CURRENT_VERSION : connection.getVersion();
+            int version = connection == null ? versionCap.getMaxVersion() : connection.getVersion();
 
             EnumSet<Frame.Header.Flag> flags = EnumSet.noneOf(Frame.Header.Flag.class);
 
@@ -389,7 +404,12 @@
                     throw e;
                 }
 
-                results.add(Frame.create(message.type, message.getStreamId(), version, flags, body));
+                // if the driver attempted to connect with a protocol version lower than the minimum supported
+                // version, respond with a protocol error message with the correct frame header for that version
+                int responseVersion = message.forcedProtocolVersion == null
+                                    ? version
+                                    : message.forcedProtocolVersion;
+                results.add(Frame.create(message.type, message.getStreamId(), responseVersion, flags, body));
             }
             catch (Throwable e)
             {
@@ -398,42 +418,77 @@
         }
     }
 
-    @ChannelHandler.Sharable
     public static class Dispatcher extends SimpleChannelInboundHandler<Request>
     {
+        private static final LocalAwareExecutorService requestExecutor = SHARED.newExecutor(DatabaseDescriptor.getNativeTransportMaxThreads(),
+                                                                                            Integer.MAX_VALUE,
+                                                                                            "transport",
+                                                                                            "Native-Transport-Requests");
+
+        /**
+         * Current count of *request* bytes that are live on the channel.
+         *
+         * Note: should only be accessed while on the netty event loop.
+         */
+        private long channelPayloadBytesInFlight;
+
+        private final Server.EndpointPayloadTracker endpointPayloadTracker;
+
+        private boolean paused;
+
         private static class FlushItem
         {
             final ChannelHandlerContext ctx;
             final Object response;
             final Frame sourceFrame;
-            private FlushItem(ChannelHandlerContext ctx, Object response, Frame sourceFrame)
+            final Dispatcher dispatcher;
+
+            private FlushItem(ChannelHandlerContext ctx, Object response, Frame sourceFrame, Dispatcher dispatcher)
             {
                 this.ctx = ctx;
                 this.sourceFrame = sourceFrame;
                 this.response = response;
+                this.dispatcher = dispatcher;
+            }
+
+            public void release()
+            {
+                dispatcher.releaseItem(this);
             }
         }
 
-        private static final class Flusher implements Runnable
+        private static abstract class Flusher implements Runnable
         {
             final EventLoop eventLoop;
             final ConcurrentLinkedQueue<FlushItem> queued = new ConcurrentLinkedQueue<>();
-            final AtomicBoolean running = new AtomicBoolean(false);
+            final AtomicBoolean scheduled = new AtomicBoolean(false);
             final HashSet<ChannelHandlerContext> channels = new HashSet<>();
             final List<FlushItem> flushed = new ArrayList<>();
-            int runsSinceFlush = 0;
-            int runsWithNoWork = 0;
-            private Flusher(EventLoop eventLoop)
-            {
-                this.eventLoop = eventLoop;
-            }
+
             void start()
             {
-                if (!running.get() && running.compareAndSet(false, true))
+                if (!scheduled.get() && scheduled.compareAndSet(false, true))
                 {
                     this.eventLoop.execute(this);
                 }
             }
+
+            public Flusher(EventLoop eventLoop)
+            {
+                this.eventLoop = eventLoop;
+            }
+        }
+
+        private static final class LegacyFlusher extends Flusher
+        {
+            int runsSinceFlush = 0;
+            int runsWithNoWork = 0;
+
+            private LegacyFlusher(EventLoop eventLoop)
+            {
+                super(eventLoop);
+            }
+
             public void run()
             {
 
@@ -454,7 +509,7 @@
                     for (ChannelHandlerContext channel : channels)
                         channel.flush();
                     for (FlushItem item : flushed)
-                        item.sourceFrame.release();
+                        item.release();
 
                     channels.clear();
                     flushed.clear();
@@ -470,8 +525,8 @@
                     // either reschedule or cancel
                     if (++runsWithNoWork > 5)
                     {
-                        running.set(false);
-                        if (queued.isEmpty() || !running.compareAndSet(false, true))
+                        scheduled.set(false);
+                        if (queued.isEmpty() || !scheduled.compareAndSet(false, true))
                             return;
                     }
                 }
@@ -480,17 +535,136 @@
             }
         }
 
+        private static final class ImmediateFlusher extends Flusher
+        {
+            private ImmediateFlusher(EventLoop eventLoop)
+            {
+                super(eventLoop);
+            }
+
+            public void run()
+            {
+                boolean doneWork = false;
+                FlushItem flush;
+                scheduled.set(false);
+
+                while (null != (flush = queued.poll()))
+                {
+                    channels.add(flush.ctx);
+                    flush.ctx.write(flush.response, flush.ctx.voidPromise());
+                    flushed.add(flush);
+                    doneWork = true;
+                }
+
+                if (doneWork)
+                {
+                    for (ChannelHandlerContext channel : channels)
+                        channel.flush();
+                    for (FlushItem item : flushed)
+                        item.release();
+
+                    channels.clear();
+                    flushed.clear();
+                }
+            }
+        }
+
         private static final ConcurrentMap<EventLoop, Flusher> flusherLookup = new ConcurrentHashMap<>();
 
-        public Dispatcher()
+        private final boolean useLegacyFlusher;
+
+        public Dispatcher(boolean useLegacyFlusher, Server.EndpointPayloadTracker endpointPayloadTracker)
         {
             super(false);
+            this.useLegacyFlusher = useLegacyFlusher;
+            this.endpointPayloadTracker = endpointPayloadTracker;
         }
 
         @Override
         public void channelRead0(ChannelHandlerContext ctx, Request request)
         {
+            // if we decide to handle this message, process it outside of the netty event loop
+            if (shouldHandleRequest(ctx, request))
+                requestExecutor.submit(() -> processRequest(ctx, request));
+        }
 
+        /** This check for inflight payload to potentially discard the request should have been ideally in one of the
+         * first handlers in the pipeline (Frame::decode()). However, incase of any exception thrown between that
+         * handler (where inflight payload is incremented) and this handler (Dispatcher::channelRead0) (where inflight
+         * payload in decremented), inflight payload becomes erroneous. ExceptionHandler is not sufficient for this
+         * purpose since it does not have the frame associated with the exception.
+         *
+         * Note: this method should execute on the netty event loop.
+         */
+        private boolean shouldHandleRequest(ChannelHandlerContext ctx, Request request)
+        {
+            long frameSize = request.getSourceFrame().header.bodySizeInBytes;
+
+            ResourceLimits.EndpointAndGlobal endpointAndGlobalPayloadsInFlight = endpointPayloadTracker.endpointAndGlobalPayloadsInFlight;
+
+            // check for overloaded state by trying to allocate framesize to inflight payload trackers
+            if (endpointAndGlobalPayloadsInFlight.tryAllocate(frameSize) != ResourceLimits.Outcome.SUCCESS)
+            {
+                if (request.connection.isThrowOnOverload())
+                {
+                    // discard the request and throw an exception
+                    ClientMetrics.instance.markRequestDiscarded();
+                    logger.trace("Discarded request of size: {}. InflightChannelRequestPayload: {}, InflightEndpointRequestPayload: {}, InflightOverallRequestPayload: {}, Request: {}",
+                                 frameSize,
+                                 channelPayloadBytesInFlight,
+                                 endpointAndGlobalPayloadsInFlight.endpoint().using(),
+                                 endpointAndGlobalPayloadsInFlight.global().using(),
+                                 request);
+                    throw ErrorMessage.wrap(new OverloadedException("Server is in overloaded state. Cannot accept more requests at this point"),
+                                            request.getSourceFrame().header.streamId);
+                }
+                else
+                {
+                    // set backpressure on the channel, and handle the request
+                    endpointAndGlobalPayloadsInFlight.allocate(frameSize);
+                    ctx.channel().config().setAutoRead(false);
+                    ClientMetrics.instance.pauseConnection();
+                    paused = true;
+                }
+            }
+
+            channelPayloadBytesInFlight += frameSize;
+            return true;
+        }
+
+        /**
+         * Note: this method will be used in the {@link Flusher#run()}, which executes on the netty event loop
+         * ({@link Dispatcher#flusherLookup}). Thus, we assume the semantics and visibility of variables
+         * of being on the event loop.
+         */
+        private void releaseItem(FlushItem item)
+        {
+            long itemSize = item.sourceFrame.header.bodySizeInBytes;
+            item.sourceFrame.release();
+
+            // since the request has been processed, decrement inflight payload at channel, endpoint and global levels
+            channelPayloadBytesInFlight -= itemSize;
+            ResourceLimits.Outcome endpointGlobalReleaseOutcome = endpointPayloadTracker.endpointAndGlobalPayloadsInFlight.release(itemSize);
+
+            // now check to see if we need to reenable the channel's autoRead.
+            // If the current payload side is zero, we must reenable autoread as
+            // 1) we allow no other thread/channel to do it, and
+            // 2) there's no other events following this one (becuase we're at zero bytes in flight),
+            // so no successive to trigger the other clause in this if-block
+            ChannelConfig config = item.ctx.channel().config();
+            if (paused && (channelPayloadBytesInFlight == 0 || endpointGlobalReleaseOutcome == ResourceLimits.Outcome.BELOW_LIMIT))
+            {
+                paused = false;
+                ClientMetrics.instance.unpauseConnection();
+                config.setAutoRead(true);
+            }
+        }
+
+        /**
+         * Note: this method is not expected to execute on the netty event loop.
+         */
+        void processRequest(ChannelHandlerContext ctx, Request request)
+        {
             final Response response;
             final ServerConnection connection;
 
@@ -514,7 +688,7 @@
             {
                 JVMStabilityInspector.inspectThrowable(t);
                 UnexpectedChannelExceptionHandler handler = new UnexpectedChannelExceptionHandler(ctx.channel(), true);
-                flush(new FlushItem(ctx, ErrorMessage.fromException(t, handler).setStreamId(request.getStreamId()), request.getSourceFrame()));
+                flush(new FlushItem(ctx, ErrorMessage.fromException(t, handler).setStreamId(request.getStreamId()), request.getSourceFrame(), this));
                 return;
             }
             finally
@@ -523,7 +697,19 @@
             }
 
             logger.trace("Responding: {}, v={}", response, connection.getVersion());
-            flush(new FlushItem(ctx, response, request.getSourceFrame()));
+            flush(new FlushItem(ctx, response, request.getSourceFrame(), this));
+        }
+
+        @Override
+        public void channelInactive(ChannelHandlerContext ctx)
+        {
+            endpointPayloadTracker.release();
+            if (paused)
+            {
+                paused = false;
+                ClientMetrics.instance.unpauseConnection();
+            }
+            ctx.fireChannelInactive();
         }
 
         private void flush(FlushItem item)
@@ -532,7 +718,8 @@
             Flusher flusher = flusherLookup.get(loop);
             if (flusher == null)
             {
-                Flusher alt = flusherLookup.putIfAbsent(loop, flusher = new Flusher(loop));
+                Flusher created = useLegacyFlusher ? new LegacyFlusher(loop) : new ImmediateFlusher(loop);
+                Flusher alt = flusherLookup.putIfAbsent(loop, flusher = created);
                 if (alt != null)
                     flusher = alt;
             }
@@ -540,6 +727,14 @@
             flusher.queued.add(item);
             flusher.start();
         }
+
+        public static void shutdown()
+        {
+            if (requestExecutor != null)
+            {
+                requestExecutor.shutdown();
+            }
+        }
     }
 
     @ChannelHandler.Sharable
@@ -602,7 +797,21 @@
 
             if (!alwaysLogAtError && exception instanceof IOException)
             {
-                if (ioExceptionsAtDebugLevel.contains(exception.getMessage()))
+                String errorMessage = exception.getMessage();
+                boolean logAtTrace = false;
+
+                for (String ioException : ioExceptionsAtDebugLevel)
+                {
+                    // exceptions thrown from the netty epoll transport add the name of the function that failed
+                    // to the exception string (which is simply wrapping a JDK exception), so we can't do a simple/naive comparison
+                    if (errorMessage.contains(ioException))
+                    {
+                        logAtTrace = true;
+                        break;
+                    }
+                }
+
+                if (logAtTrace)
                 {
                     // Likely unclean client disconnects
                     logger.trace(message, exception);

diff --git a/src/java/org/apache/cassandra/transport/ProtocolException.java b/src/java/org/apache/cassandra/transport/ProtocolException.java
index 9af9dc0..a589e9b 100644
--- a/src/java/org/apache/cassandra/transport/ProtocolException.java
+++ b/src/java/org/apache/cassandra/transport/ProtocolException.java

@@ -25,13 +25,31 @@
  */
 public class ProtocolException extends RuntimeException implements TransportException
 {
+    private final Integer attemptedLowProtocolVersion;
+
     public ProtocolException(String msg)
     {
+        this(msg, null);
+    }
+
+    public ProtocolException(String msg, Integer attemptedLowProtocolVersion)
+    {
         super(msg);
+        this.attemptedLowProtocolVersion = attemptedLowProtocolVersion;
     }
 
     public ExceptionCode code()
     {
         return ExceptionCode.PROTOCOL_ERROR;
     }
+
+    /**
+     * If the ProtocolException is due to a connection being made with a protocol version that is lower
+     * than Server.MIN_SUPPORTED_VERSION, this will return that unsupported protocol version.  Otherwise,
+     * null is returned.
+     */
+    public Integer getAttemptedLowProtocolVersion()
+    {
+        return attemptedLowProtocolVersion;
+    }
 }

diff --git a/src/java/org/apache/cassandra/transport/ProtocolVersionLimit.java b/src/java/org/apache/cassandra/transport/ProtocolVersionLimit.java
new file mode 100644
index 0000000..c476efb
--- /dev/null
+++ b/src/java/org/apache/cassandra/transport/ProtocolVersionLimit.java

@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.transport;
+
+@FunctionalInterface
+public interface ProtocolVersionLimit
+{
+    public int getMaxVersion();
+
+    public static final ProtocolVersionLimit SERVER_DEFAULT = () -> Server.CURRENT_VERSION;
+}

diff --git a/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java b/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java
deleted file mode 100644
index 75dd05d..0000000
--- a/src/java/org/apache/cassandra/transport/RequestThreadPoolExecutor.java
+++ /dev/null

@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.transport;
-
-import java.util.List;
-import java.util.concurrent.TimeUnit;
-
-import io.netty.util.concurrent.AbstractEventExecutor;
-import io.netty.util.concurrent.EventExecutorGroup;
-import io.netty.util.concurrent.Future;
-import org.apache.cassandra.concurrent.LocalAwareExecutorService;
-import org.apache.cassandra.config.DatabaseDescriptor;
-
-import static org.apache.cassandra.concurrent.SharedExecutorPool.SHARED;
-
-public class RequestThreadPoolExecutor extends AbstractEventExecutor
-{
-    private final static int MAX_QUEUED_REQUESTS = Integer.getInteger("cassandra.max_queued_native_transport_requests", 128);
-    private final static String THREAD_FACTORY_ID = "Native-Transport-Requests";
-    private final LocalAwareExecutorService wrapped = SHARED.newExecutor(DatabaseDescriptor.getNativeTransportMaxThreads(),
-                                                                           MAX_QUEUED_REQUESTS,
-                                                                           "transport",
-                                                                           THREAD_FACTORY_ID);
-
-    public boolean isShuttingDown()
-    {
-        return wrapped.isShutdown();
-    }
-
-    public Future<?> shutdownGracefully(long l, long l2, TimeUnit timeUnit)
-    {
-        throw new IllegalStateException();
-    }
-
-    public Future<?> terminationFuture()
-    {
-        throw new IllegalStateException();
-    }
-
-    @Override
-    public void shutdown()
-    {
-        wrapped.shutdown();
-    }
-
-    @Override
-    public List<Runnable> shutdownNow()
-    {
-        return wrapped.shutdownNow();
-    }
-
-    public boolean isShutdown()
-    {
-        return wrapped.isShutdown();
-    }
-
-    public boolean isTerminated()
-    {
-        return wrapped.isTerminated();
-    }
-
-    public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException
-    {
-        return wrapped.awaitTermination(timeout, unit);
-    }
-
-    public EventExecutorGroup parent()
-    {
-        return null;
-    }
-
-    public boolean inEventLoop(Thread thread)
-    {
-        return false;
-    }
-
-    public void execute(Runnable command)
-    {
-        wrapped.execute(command);
-    }
-}

diff --git a/src/java/org/apache/cassandra/transport/Server.java b/src/java/org/apache/cassandra/transport/Server.java
index 418f6f7..012b326 100644
--- a/src/java/org/apache/cassandra/transport/Server.java
+++ b/src/java/org/apache/cassandra/transport/Server.java

@@ -22,21 +22,19 @@
 import java.net.InetSocketAddress;
 import java.net.UnknownHostException;
 import java.util.*;
-import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLEngine;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
 
-import com.google.common.annotations.VisibleForTesting;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import io.netty.bootstrap.ServerBootstrap;
 import io.netty.buffer.ByteBuf;
 import io.netty.channel.*;
-import io.netty.channel.epoll.Epoll;
 import io.netty.channel.epoll.EpollEventLoopGroup;
 import io.netty.channel.epoll.EpollServerSocketChannel;
 import io.netty.channel.group.ChannelGroup;
@@ -53,7 +51,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.EncryptionOptions;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.metrics.ClientMetrics;
+import org.apache.cassandra.net.ResourceLimits;
 import org.apache.cassandra.security.SSLFactory;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.transport.messages.EventMessage;
@@ -67,13 +65,12 @@
     }
 
     private static final Logger logger = LoggerFactory.getLogger(Server.class);
-    private static final boolean enableEpoll = Boolean.valueOf(System.getProperty("cassandra.native.epoll.enabled", "true"));
+    private static final boolean useEpoll = NativeTransportService.useEpoll();
 
-    public static final int VERSION_1 = 1;
-    public static final int VERSION_2 = 2;
     public static final int VERSION_3 = 3;
     public static final int VERSION_4 = 4;
     public static final int CURRENT_VERSION = VERSION_4;
+    public static final int MIN_SUPPORTED_VERSION = VERSION_3;
 
     private final ConnectionTracker connectionTracker = new ConnectionTracker();
 
@@ -86,54 +83,38 @@
     };
 
     public final InetSocketAddress socket;
+    public boolean useSSL = false;
     private final AtomicBoolean isRunning = new AtomicBoolean(false);
 
     private EventLoopGroup workerGroup;
-    private EventExecutor eventExecutorGroup;
+    private final ProtocolVersionLimit protocolVersionLimit;
 
-    public Server(InetSocketAddress socket)
+    private Server (Builder builder)
     {
-        this.socket = socket;
+        this.socket = builder.getSocket();
+        this.useSSL = builder.useSSL;
+        this.protocolVersionLimit = builder.getProtocolVersionLimit();
+
+        if (builder.workerGroup != null)
+        {
+            workerGroup = builder.workerGroup;
+        }
+        else
+        {
+            if (useEpoll)
+                workerGroup = new EpollEventLoopGroup();
+            else
+                workerGroup = new NioEventLoopGroup();
+        }
         EventNotifier notifier = new EventNotifier(this);
         StorageService.instance.register(notifier);
         MigrationManager.instance.register(notifier);
-        registerMetrics();
-    }
-
-    public Server(String hostname, int port)
-    {
-        this(new InetSocketAddress(hostname, port));
-    }
-
-    public Server(InetAddress host, int port)
-    {
-        this(new InetSocketAddress(host, port));
-    }
-
-    public Server(int port)
-    {
-        this(new InetSocketAddress(port));
-    }
-
-    public void start()
-    {
-	    if(!isRunning())
-	    {
-            run();
-	    }
     }
 
     public void stop()
     {
         if (isRunning.compareAndSet(true, false))
-            close(false);
-    }
-
-    @VisibleForTesting
-    public void stopAndAwaitTermination()
-    {
-        if (isRunning.compareAndSet(true, false))
-            close(true);
+            close();
     }
 
     public boolean isRunning()
@@ -141,35 +122,25 @@
         return isRunning.get();
     }
 
-    private void run()
+    public synchronized void start()
     {
+        if(isRunning()) 
+            return;
+
         // Configure the server.
-        eventExecutorGroup = new RequestThreadPoolExecutor();
-
-        boolean hasEpoll = enableEpoll ? Epoll.isAvailable() : false;
-        if (hasEpoll)
-        {
-            workerGroup = new EpollEventLoopGroup();
-            logger.info("Netty using native Epoll event loop");
-        }
-        else
-        {
-            workerGroup = new NioEventLoopGroup();
-            logger.info("Netty using Java NIO event loop");
-        }
-
         ServerBootstrap bootstrap = new ServerBootstrap()
-                                    .group(workerGroup)
-                                    .channel(hasEpoll ? EpollServerSocketChannel.class : NioServerSocketChannel.class)
+                                    .channel(useEpoll ? EpollServerSocketChannel.class : NioServerSocketChannel.class)
                                     .childOption(ChannelOption.TCP_NODELAY, true)
                                     .childOption(ChannelOption.SO_LINGER, 0)
                                     .childOption(ChannelOption.SO_KEEPALIVE, DatabaseDescriptor.getRpcKeepAlive())
                                     .childOption(ChannelOption.ALLOCATOR, CBUtil.allocator)
                                     .childOption(ChannelOption.WRITE_BUFFER_HIGH_WATER_MARK, 32 * 1024)
                                     .childOption(ChannelOption.WRITE_BUFFER_LOW_WATER_MARK, 8 * 1024);
+        if (workerGroup != null)
+            bootstrap = bootstrap.group(workerGroup);
 
         final EncryptionOptions.ClientEncryptionOptions clientEnc = DatabaseDescriptor.getClientEncryptionOptions();
-        if (clientEnc.enabled)
+        if (this.useSSL)
         {
             if (clientEnc.optional)
             {
@@ -189,7 +160,7 @@
 
         // Bind and start to accept incoming connections.
         logger.info("Using Netty Version: {}", Version.identify().entrySet());
-        logger.info("Starting listening for CQL clients on {}...", socket);
+        logger.info("Starting listening for CQL clients on {} ({})...", socket, this.useSSL ? "encrypted" : "unencrypted");
 
         ChannelFuture bindFuture = bootstrap.bind(socket);
         if (!bindFuture.awaitUninterruptibly().isSuccess())
@@ -197,60 +168,91 @@
 
         connectionTracker.allChannels.add(bindFuture.channel());
         isRunning.set(true);
-
-        StorageService.instance.setRpcReady(true);
     }
 
-    private void registerMetrics()
+    public int getConnectedClients()
     {
-        ClientMetrics.instance.addCounter("connectedNativeClients", new Callable<Integer>()
-        {
-            @Override
-            public Integer call() throws Exception
-            {
-                return connectionTracker.getConnectedClients();
-            }
-        });
+        return connectionTracker.getConnectedClients();
     }
-
+    
     private void close()
     {
-        close(false);
-    }
-
-    private void closeAndAwait()
-    {
-        close(true);
-    }
-
-    private void close(boolean awaitTermination)
-    {
         // Close opened connections
         connectionTracker.closeAll();
-        workerGroup.shutdownGracefully();
-        eventExecutorGroup.shutdown();
-
+        
         logger.info("Stop listening for CQL clients");
-
-        if (awaitTermination)
-        {
-            try
-            {
-                workerGroup.awaitTermination(1, TimeUnit.MINUTES);
-                eventExecutorGroup.awaitTermination(1, TimeUnit.MINUTES);
-            }
-            catch (InterruptedException e)
-            {
-                logger.error(e.getMessage());
-            }
-        }
-
-        workerGroup = null;
-        eventExecutorGroup = null;
-
-        StorageService.instance.setRpcReady(false);
     }
 
+    public static class Builder
+    {
+        private EventLoopGroup workerGroup;
+        private EventExecutor eventExecutorGroup;
+        private boolean useSSL = false;
+        private InetAddress hostAddr;
+        private int port = -1;
+        private InetSocketAddress socket;
+        private ProtocolVersionLimit versionLimit;
+
+        public Builder withSSL(boolean useSSL)
+        {
+            this.useSSL = useSSL;
+            return this;
+        }
+
+        public Builder withEventLoopGroup(EventLoopGroup eventLoopGroup)
+        {
+            this.workerGroup = eventLoopGroup;
+            return this;
+        }
+
+        public Builder withHost(InetAddress host)
+        {
+            this.hostAddr = host;
+            this.socket = null;
+            return this;
+        }
+
+        public Builder withPort(int port)
+        {
+            this.port = port;
+            this.socket = null;
+            return this;
+        }
+
+        public Builder withProtocolVersionLimit(ProtocolVersionLimit limit)
+        {
+            this.versionLimit = limit;
+            return this;
+        }
+
+        ProtocolVersionLimit getProtocolVersionLimit()
+        {
+            if (versionLimit == null)
+                throw new IllegalArgumentException("Missing protocol version limiter");
+            return versionLimit;
+        }
+
+        public Server build()
+        {
+            return new Server(this);
+        }
+
+        private InetSocketAddress getSocket()
+        {
+            if (this.socket != null)
+                return this.socket;
+            else
+            {
+                if (this.port == -1)
+                    throw new IllegalStateException("Missing port number");
+                if (this.hostAddr != null)
+                    this.socket = new InetSocketAddress(this.hostAddr, this.port);
+                else
+                    throw new IllegalStateException("Missing host");
+                return this.socket;
+            }
+        }
+    }
 
     public static class ConnectionTracker implements Connection.Tracker
     {
@@ -295,16 +297,57 @@
         }
     }
 
-    private static class Initializer extends ChannelInitializer
+    // global inflight payload across all channels across all endpoints
+    private static final ResourceLimits.Concurrent globalRequestPayloadInFlight = new ResourceLimits.Concurrent(DatabaseDescriptor.getNativeTransportMaxConcurrentRequestsInBytes());
+
+    public static class EndpointPayloadTracker
+    {
+        // inflight payload per endpoint across corresponding channels
+        private static final ConcurrentMap<InetAddress, EndpointPayloadTracker> requestPayloadInFlightPerEndpoint = new ConcurrentHashMap<>();
+
+        private final AtomicInteger refCount = new AtomicInteger(0);
+        private final InetAddress endpoint;
+
+        final ResourceLimits.EndpointAndGlobal endpointAndGlobalPayloadsInFlight = new ResourceLimits.EndpointAndGlobal(new ResourceLimits.Concurrent(DatabaseDescriptor.getNativeTransportMaxConcurrentRequestsInBytesPerIp()),
+                                                                                                                         globalRequestPayloadInFlight);
+
+        private EndpointPayloadTracker(InetAddress endpoint)
+        {
+            this.endpoint = endpoint;
+        }
+
+        public static EndpointPayloadTracker get(InetAddress endpoint)
+        {
+            while (true)
+            {
+                EndpointPayloadTracker result = requestPayloadInFlightPerEndpoint.computeIfAbsent(endpoint, EndpointPayloadTracker::new);
+                if (result.acquire())
+                    return result;
+
+                requestPayloadInFlightPerEndpoint.remove(endpoint, result);
+            }
+        }
+
+        private boolean acquire()
+        {
+            return 0 < refCount.updateAndGet(i -> i < 0 ? i : i + 1);
+        }
+
+        public void release()
+        {
+            if (-1 == refCount.updateAndGet(i -> i == 1 ? -1 : i - 1))
+                requestPayloadInFlightPerEndpoint.remove(endpoint, this);
+        }
+    }
+
+    private static class Initializer extends ChannelInitializer<Channel>
     {
         // Stateless handlers
         private static final Message.ProtocolDecoder messageDecoder = new Message.ProtocolDecoder();
-        private static final Message.ProtocolEncoder messageEncoder = new Message.ProtocolEncoder();
         private static final Frame.Decompressor frameDecompressor = new Frame.Decompressor();
         private static final Frame.Compressor frameCompressor = new Frame.Compressor();
         private static final Frame.Encoder frameEncoder = new Frame.Encoder();
         private static final Message.ExceptionHandler exceptionHandler = new Message.ExceptionHandler();
-        private static final Message.Dispatcher dispatcher = new Message.Dispatcher();
         private static final ConnectionLimitHandler connectionLimitHandler = new ConnectionLimitHandler();
 
         private final Server server;
@@ -328,14 +371,17 @@
 
             //pipeline.addLast("debug", new LoggingHandler());
 
-            pipeline.addLast("frameDecoder", new Frame.Decoder(server.connectionFactory));
+            pipeline.addLast("frameDecoder", new Frame.Decoder(server.connectionFactory, server.protocolVersionLimit));
             pipeline.addLast("frameEncoder", frameEncoder);
 
             pipeline.addLast("frameDecompressor", frameDecompressor);
             pipeline.addLast("frameCompressor", frameCompressor);
 
             pipeline.addLast("messageDecoder", messageDecoder);
-            pipeline.addLast("messageEncoder", messageEncoder);
+            pipeline.addLast("messageEncoder", new Message.ProtocolEncoder(server.protocolVersionLimit));
+
+            pipeline.addLast("executor", new Message.Dispatcher(DatabaseDescriptor.useNativeTransportLegacyFlusher(),
+                                                                EndpointPayloadTracker.get(((InetSocketAddress) channel.remoteAddress()).getAddress())));
 
             // The exceptionHandler will take care of handling exceptionCaught(...) events while still running
             // on the same EventLoop as all previous added handlers in the pipeline. This is important as the used
@@ -344,8 +390,6 @@
             // correctly handled before the handler itself is removed.
             // See https://issues.apache.org/jira/browse/CASSANDRA-13649
             pipeline.addLast("exceptionHandler", exceptionHandler);
-
-            pipeline.addLast(server.eventExecutorGroup, "executor", dispatcher);
         }
     }
 
@@ -476,8 +520,7 @@
         private final Map<InetAddress, LatestEvent> latestEvents = new ConcurrentHashMap<>();
         // We also want to delay delivering a NEW_NODE notification until the new node has set its RPC ready
         // state. This tracks the endpoints which have joined, but not yet signalled they're ready for clients
-        private final Set<InetAddress> endpointsPendingJoinedNotification =
-            Collections.newSetFromMap(new ConcurrentHashMap<InetAddress, Boolean>());
+        private final Set<InetAddress> endpointsPendingJoinedNotification = ConcurrentHashMap.newKeySet();
 
 
         private static final InetAddress bindAll;
@@ -561,7 +604,7 @@
         {
             if (endpointsPendingJoinedNotification.remove(endpoint))
                 onJoinCluster(endpoint);
-            
+
             onStatusChange(endpoint, Event.StatusChange.nodeUp(getRpcAddress(endpoint), server.socket.getPort()));
         }
 
@@ -630,7 +673,7 @@
             send(new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, ksName));
         }
 
-        public void onUpdateColumnFamily(String ksName, String cfName, boolean columnsDidChange)
+        public void onUpdateColumnFamily(String ksName, String cfName, boolean affectsStatements)
         {
             send(new Event.SchemaChange(Event.SchemaChange.Change.UPDATED, Event.SchemaChange.Target.TABLE, ksName, cfName));
         }

diff --git a/src/java/org/apache/cassandra/transport/ServerConnection.java b/src/java/org/apache/cassandra/transport/ServerConnection.java
index 29691f5..1ef6c73 100644
--- a/src/java/org/apache/cassandra/transport/ServerConnection.java
+++ b/src/java/org/apache/cassandra/transport/ServerConnection.java

@@ -110,10 +110,10 @@
         }
     }
 
-    public IAuthenticator.SaslNegotiator getSaslNegotiator()
+    public IAuthenticator.SaslNegotiator getSaslNegotiator(QueryState queryState)
     {
         if (saslNegotiator == null)
-            saslNegotiator = DatabaseDescriptor.getAuthenticator().newSaslNegotiator();
+            saslNegotiator = DatabaseDescriptor.getAuthenticator().newSaslNegotiator(queryState.getClientAddress());
         return saslNegotiator;
     }
 }

diff --git a/src/java/org/apache/cassandra/transport/SimpleClient.java b/src/java/org/apache/cassandra/transport/SimpleClient.java
index 4759c2a..7d34d98 100644
--- a/src/java/org/apache/cassandra/transport/SimpleClient.java
+++ b/src/java/org/apache/cassandra/transport/SimpleClient.java

@@ -115,10 +115,20 @@
 
     public void connect(boolean useCompression) throws IOException
     {
+        connect(useCompression, false);
+    }
+
+    public void connect(boolean useCompression, boolean throwOnOverload) throws IOException
+    {
         establishConnection();
 
         Map<String, String> options = new HashMap<>();
         options.put(StartupMessage.CQL_VERSION, "3.0.0");
+
+        if (throwOnOverload)
+            options.put(StartupMessage.THROW_ON_OVERLOAD, "1");
+        connection.setThrowOnOverload(throwOnOverload);
+
         if (useCompression)
         {
             options.put(StartupMessage.COMPRESSION, "snappy");
@@ -241,7 +251,7 @@
 
     // Stateless handlers
     private static final Message.ProtocolDecoder messageDecoder = new Message.ProtocolDecoder();
-    private static final Message.ProtocolEncoder messageEncoder = new Message.ProtocolEncoder();
+    private static final Message.ProtocolEncoder messageEncoder = new Message.ProtocolEncoder(ProtocolVersionLimit.SERVER_DEFAULT);
     private static final Frame.Decompressor frameDecompressor = new Frame.Decompressor();
     private static final Frame.Compressor frameCompressor = new Frame.Compressor();
     private static final Frame.Encoder frameEncoder = new Frame.Encoder();
@@ -264,7 +274,7 @@
             channel.attr(Connection.attributeKey).set(connection);
 
             ChannelPipeline pipeline = channel.pipeline();
-            pipeline.addLast("frameDecoder", new Frame.Decoder(connectionFactory));
+            pipeline.addLast("frameDecoder", new Frame.Decoder(connectionFactory, ProtocolVersionLimit.SERVER_DEFAULT));
             pipeline.addLast("frameEncoder", frameEncoder);
 
             pipeline.addLast("frameDecompressor", frameDecompressor);

diff --git a/src/java/org/apache/cassandra/transport/messages/AuthResponse.java b/src/java/org/apache/cassandra/transport/messages/AuthResponse.java
index 257a26a..ca7a0c3 100644
--- a/src/java/org/apache/cassandra/transport/messages/AuthResponse.java
+++ b/src/java/org/apache/cassandra/transport/messages/AuthResponse.java

@@ -71,7 +71,7 @@
     {
         try
         {
-            IAuthenticator.SaslNegotiator negotiator = ((ServerConnection) connection).getSaslNegotiator();
+            IAuthenticator.SaslNegotiator negotiator = ((ServerConnection) connection).getSaslNegotiator(queryState);
             byte[] challenge = negotiator.evaluateResponse(token);
             if (negotiator.isComplete())
             {

diff --git a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
index d86bb1a..bd2423e 100644
--- a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java

@@ -45,9 +45,6 @@
     {
         public BatchMessage decode(ByteBuf body, int version)
         {
-            if (version == 1)
-                throw new ProtocolException("BATCH messages are not support in version 1 of the protocol");
-
             byte type = body.readByte();
             int n = body.readUnsignedShort();
             List<Object> queryOrIds = new ArrayList<>(n);
@@ -63,9 +60,7 @@
                     throw new ProtocolException("Invalid query kind in BATCH messages. Must be 0 or 1 but got " + kind);
                 variables.add(CBUtil.readValueList(body, version));
             }
-            QueryOptions options = version < 3
-                                 ? QueryOptions.fromPreV3Batch(CBUtil.readConsistencyLevel(body))
-                                 : QueryOptions.codec.decode(body, version);
+            QueryOptions options = QueryOptions.codec.decode(body, version);
 
             return new BatchMessage(toType(type), queryOrIds, variables, options);
         }

diff --git a/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java b/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java
index 021db5a..8f45d4d 100644
--- a/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ErrorMessage.java

@@ -331,7 +331,18 @@
         }
 
         if (e instanceof TransportException)
-            return new ErrorMessage((TransportException)e, streamId);
+        {
+            ErrorMessage message = new ErrorMessage((TransportException) e, streamId);
+            if (e instanceof ProtocolException)
+            {
+                // if the driver attempted to connect with a protocol version lower than the minimum supported
+                // version, respond with a protocol error message with the correct frame header for that version
+                Integer attemptedLowProtocolVersion = ((ProtocolException) e).getAttemptedLowProtocolVersion();
+                if (attemptedLowProtocolVersion != null)
+                    message.forcedProtocolVersion = attemptedLowProtocolVersion;
+            }
+            return message;
+        }
 
         // Unexpected exception
         if (unexpectedExceptionHandler == null || !unexpectedExceptionHandler.apply(e))

diff --git a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
index 11a227c..e9923b4 100644
--- a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java

@@ -17,8 +17,6 @@
  */
 package org.apache.cassandra.transport.messages;
 
-import java.nio.ByteBuffer;
-import java.util.List;
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
@@ -28,7 +26,6 @@
 import org.apache.cassandra.cql3.QueryHandler;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.ParsedStatement;
-import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.exceptions.PreparedQueryNotFoundException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
@@ -45,16 +42,7 @@
         public ExecuteMessage decode(ByteBuf body, int version)
         {
             byte[] id = CBUtil.readBytes(body);
-            if (version == 1)
-            {
-                List<ByteBuffer> values = CBUtil.readValueList(body, version);
-                ConsistencyLevel consistency = CBUtil.readConsistencyLevel(body);
-                return new ExecuteMessage(MD5Digest.wrap(id), QueryOptions.fromProtocolV1(consistency, values));
-            }
-            else
-            {
-                return new ExecuteMessage(MD5Digest.wrap(id), QueryOptions.codec.decode(body, version));
-            }
+            return new ExecuteMessage(MD5Digest.wrap(id), QueryOptions.codec.decode(body, version));
         }
 
         public void encode(ExecuteMessage msg, ByteBuf dest, int version)

diff --git a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
index 6d22dd1..3b48d52 100644
--- a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java

@@ -17,20 +17,20 @@
  */
 package org.apache.cassandra.transport.messages;
 
-import java.nio.ByteBuffer;
-import java.util.Collections;
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
-import io.netty.buffer.ByteBuf;
 
+import io.netty.buffer.ByteBuf;
 import org.apache.cassandra.cql3.QueryOptions;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.transport.*;
+import org.apache.cassandra.transport.CBUtil;
+import org.apache.cassandra.transport.Message;
+import org.apache.cassandra.transport.ProtocolException;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.UUIDGen;
 
@@ -44,15 +44,7 @@
         public QueryMessage decode(ByteBuf body, int version)
         {
             String query = CBUtil.readLongString(body);
-            if (version == 1)
-            {
-                ConsistencyLevel consistency = CBUtil.readConsistencyLevel(body);
-                return new QueryMessage(query, QueryOptions.fromProtocolV1(consistency, Collections.<ByteBuffer>emptyList()));
-            }
-            else
-            {
-                return new QueryMessage(query, QueryOptions.codec.decode(body, version));
-            }
+            return new QueryMessage(query, QueryOptions.codec.decode(body, version));
         }
 
         public void encode(QueryMessage msg, ByteBuf dest, int version)
@@ -145,6 +137,6 @@
     @Override
     public String toString()
     {
-        return "QUERY " + query;
+        return "QUERY " + query + "[pageSize = " + options.getPageSize() + "]";
     }
 }

diff --git a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
index 04d8e62..92278fa 100644
--- a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java

@@ -35,6 +35,8 @@
 {
     public static final String CQL_VERSION = "CQL_VERSION";
     public static final String COMPRESSION = "COMPRESSION";
+    public static final String NO_COMPACT = "NO_COMPACT";
+    public static final String THROW_ON_OVERLOAD = "THROW_ON_OVERLOAD";
 
     public static final Message.Codec<StartupMessage> codec = new Message.Codec<StartupMessage>()
     {
@@ -97,6 +99,11 @@
             }
         }
 
+        if (options.containsKey(NO_COMPACT) && Boolean.parseBoolean(options.get(NO_COMPACT)))
+            state.getClientState().setNoCompactMode();
+
+        connection.setThrowOnOverload("1".equals(options.get(THROW_ON_OVERLOAD)));
+
         if (DatabaseDescriptor.getAuthenticator().requireAuthentication())
             return new AuthenticateMessage(DatabaseDescriptor.getAuthenticator().getClass().getName());
         else

diff --git a/src/java/org/apache/cassandra/triggers/ITrigger.java b/src/java/org/apache/cassandra/triggers/ITrigger.java
index 21aba05..ad631d1 100644
--- a/src/java/org/apache/cassandra/triggers/ITrigger.java
+++ b/src/java/org/apache/cassandra/triggers/ITrigger.java

@@ -24,11 +24,11 @@
 import java.nio.ByteBuffer;
 import java.util.Collection;
 
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.partitions.Partition;
 
 /**
- * Trigger interface, For every Mutation received by the coordinator {@link #augment(ByteBuffer, ColumnFamily)}
+ * Trigger interface, For every partition update received by the coordinator {@link #augment(Partition)}
  * is called.<p>
  *
  * <b> Contract:</b><br>
@@ -44,9 +44,8 @@
     /**
      * Called exactly once per CF update, returned mutations are atomically updated.
      *
-     * @param partitionKey - partition Key for the update.
      * @param update - update received for the CF
      * @return additional modifications to be applied along with the supplied update
      */
-    public Collection<Mutation> augment(ByteBuffer partitionKey, ColumnFamily update);
+    public Collection<Mutation> augment(Partition update);
 }

diff --git a/src/java/org/apache/cassandra/triggers/TriggerExecutor.java b/src/java/org/apache/cassandra/triggers/TriggerExecutor.java
index 973ad8b..3996127 100644
--- a/src/java/org/apache/cassandra/triggers/TriggerExecutor.java
+++ b/src/java/org/apache/cassandra/triggers/TriggerExecutor.java

@@ -22,15 +22,18 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
+import com.google.common.collect.ListMultimap;
 import com.google.common.collect.Maps;
 
-import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.TriggerMetadata;
+import org.apache.cassandra.schema.Triggers;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
 
@@ -62,14 +65,13 @@
 
     /**
      * Augment a partition update by executing triggers to generate an intermediate
-     * set of mutations, then merging the ColumnFamily from each mutation with those
+     * set of mutations, then merging the update from each mutation with those
      * supplied. This is called from @{link org.apache.cassandra.service.StorageProxy#cas}
      * which is scoped for a single partition. For that reason, any mutations generated
      * by triggers are checked to ensure that they are for the same table and partition
      * key as the primary update; if not, InvalidRequestException is thrown. If no
      * additional mutations are generated, the original updates are returned unmodified.
      *
-     * @param key partition key for the update
      * @param updates partition update to be applied, contains the merge of the original
      *                update and any generated mutations
      * @return the final update to be applied, the original update merged with any
@@ -77,22 +79,18 @@
      * @throws InvalidRequestException if any mutation generated by a trigger does not
      * apply to the exact same partition as the initial update
      */
-    public ColumnFamily execute(ByteBuffer key, ColumnFamily updates) throws InvalidRequestException
+    public PartitionUpdate execute(PartitionUpdate updates) throws InvalidRequestException
     {
-        List<Mutation> intermediate = executeInternal(key, updates);
+        List<Mutation> intermediate = executeInternal(updates);
         if (intermediate == null || intermediate.isEmpty())
             return updates;
 
-        validateForSinglePartition(updates.metadata().getKeyValidator(), updates.id(), key, intermediate);
-
-        for (Mutation mutation : intermediate)
-        {
-            for (ColumnFamily cf : mutation.getColumnFamilies())
-            {
-                updates.addAll(cf);
-            }
-        }
-        return updates;
+        List<PartitionUpdate> augmented = validateForSinglePartition(updates.metadata().cfId,
+                                                                     updates.partitionKey(),
+                                                                     intermediate);
+        // concatenate augmented and origin
+        augmented.add(updates);
+        return PartitionUpdate.merge(augmented);
     }
 
     /**
@@ -120,9 +118,9 @@
             if (mutation instanceof CounterMutation)
                 hasCounters = true;
 
-            for (ColumnFamily cf : mutation.getColumnFamilies())
+            for (PartitionUpdate upd : mutation.getPartitionUpdates())
             {
-                List<Mutation> augmentations = executeInternal(mutation.key(), cf);
+                List<Mutation> augmentations = executeInternal(upd);
                 if (augmentations == null || augmentations.isEmpty())
                     continue;
 
@@ -148,54 +146,66 @@
 
     private Collection<Mutation> mergeMutations(Iterable<Mutation> mutations)
     {
-        Map<Pair<String, ByteBuffer>, Mutation> groupedMutations = new HashMap<>();
+        ListMultimap<Pair<String, ByteBuffer>, Mutation> groupedMutations = ArrayListMultimap.create();
 
         for (Mutation mutation : mutations)
         {
-            Pair<String, ByteBuffer> key = Pair.create(mutation.getKeyspaceName(), mutation.key());
-            Mutation current = groupedMutations.get(key);
-            if (current == null)
-            {
-                // copy in case the mutation's modifications map is backed by an immutable Collections#singletonMap().
-                groupedMutations.put(key, mutation.copy());
-            }
-            else
-            {
-                current.addAll(mutation);
-            }
+            Pair<String, ByteBuffer> key = Pair.create(mutation.getKeyspaceName(), mutation.key().getKey());
+            groupedMutations.put(key, mutation);
         }
 
-        return groupedMutations.values();
+        List<Mutation> merged = new ArrayList<>(groupedMutations.size());
+        for (Pair<String, ByteBuffer> key : groupedMutations.keySet())
+            merged.add(Mutation.merge(groupedMutations.get(key)));
+
+        return merged;
     }
 
-    private void validateForSinglePartition(AbstractType<?> keyValidator,
-                                            UUID cfId,
-                                            ByteBuffer key,
-                                            Collection<Mutation> tmutations)
+    private List<PartitionUpdate> validateForSinglePartition(UUID cfId,
+                                                                   DecoratedKey key,
+                                                                   Collection<Mutation> tmutations)
     throws InvalidRequestException
     {
+        validate(tmutations);
+
+        if (tmutations.size() == 1)
+        {
+            List<PartitionUpdate> updates = Lists.newArrayList(Iterables.getOnlyElement(tmutations).getPartitionUpdates());
+            if (updates.size() > 1)
+                throw new InvalidRequestException("The updates generated by triggers are not all for the same partition");
+            validateSamePartition(cfId, key, Iterables.getOnlyElement(updates));
+            return updates;
+        }
+
+        ArrayList<PartitionUpdate> updates = new ArrayList<>(tmutations.size());
         for (Mutation mutation : tmutations)
         {
-            if (keyValidator.compare(mutation.key(), key) != 0)
-                throw new InvalidRequestException("Partition key of additional mutation does not match primary update key");
-
-            for (ColumnFamily cf : mutation.getColumnFamilies())
+            for (PartitionUpdate update : mutation.getPartitionUpdates())
             {
-                if (! cf.id().equals(cfId))
-                    throw new InvalidRequestException("table of additional mutation does not match primary update table");
+                validateSamePartition(cfId, key, update);
+                updates.add(update);
             }
         }
-        validate(tmutations);
+        return updates;
+    }
+
+    private void validateSamePartition(UUID cfId, DecoratedKey key, PartitionUpdate update)
+    throws InvalidRequestException
+    {
+        if (!key.equals(update.partitionKey()))
+            throw new InvalidRequestException("Partition key of additional mutation does not match primary update key");
+
+        if (!cfId.equals(update.metadata().cfId))
+            throw new InvalidRequestException("table of additional mutation does not match primary update table");
     }
 
     private void validate(Collection<Mutation> tmutations) throws InvalidRequestException
     {
         for (Mutation mutation : tmutations)
         {
-            QueryProcessor.validateKey(mutation.key());
-            for (ColumnFamily tcf : mutation.getColumnFamilies())
-                for (Cell cell : tcf)
-                    cell.validateFields(tcf.metadata());
+            QueryProcessor.validateKey(mutation.key().getKey());
+            for (PartitionUpdate update : mutation.getPartitionUpdates())
+                update.validate();
         }
     }
 
@@ -203,16 +213,16 @@
      * Switch class loader before using the triggers for the column family, if
      * not loaded them with the custom class loader.
      */
-    private List<Mutation> executeInternal(ByteBuffer key, ColumnFamily columnFamily)
+    private List<Mutation> executeInternal(PartitionUpdate update)
     {
-        Map<String, TriggerDefinition> triggers = columnFamily.metadata().getTriggers();
+        Triggers triggers = update.metadata().getTriggers();
         if (triggers.isEmpty())
             return null;
         List<Mutation> tmutations = Lists.newLinkedList();
         Thread.currentThread().setContextClassLoader(customClassLoader);
         try
         {
-            for (TriggerDefinition td : triggers.values())
+            for (TriggerMetadata td : triggers)
             {
                 ITrigger trigger = cachedTriggers.get(td.classOption);
                 if (trigger == null)
@@ -220,7 +230,7 @@
                     trigger = loadTriggerInstance(td.classOption);
                     cachedTriggers.put(td.classOption, trigger);
                 }
-                Collection<Mutation> temp = trigger.augment(key, columnFamily);
+                Collection<Mutation> temp = trigger.augment(update);
                 if (temp != null)
                     tmutations.addAll(temp);
             }
@@ -228,7 +238,7 @@
         }
         catch (Exception ex)
         {
-            throw new RuntimeException(String.format("Exception while creating trigger on table with ID: %s", columnFamily.id()), ex);
+            throw new RuntimeException(String.format("Exception while creating trigger on table with ID: %s", update.metadata().cfId), ex);
         }
         finally
         {

diff --git a/src/java/org/apache/cassandra/utils/AbstractIterator.java b/src/java/org/apache/cassandra/utils/AbstractIterator.java
new file mode 100644
index 0000000..dd3d73c
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/AbstractIterator.java

@@ -0,0 +1,83 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.utils;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import com.google.common.collect.PeekingIterator;
+
+public abstract class AbstractIterator<V> implements Iterator<V>, PeekingIterator<V>
+{
+
+    private static enum State { MUST_FETCH, HAS_NEXT, DONE, FAILED }
+    private State state = State.MUST_FETCH;
+    private V next;
+
+    protected V endOfData()
+    {
+        state = State.DONE;
+        return null;
+    }
+
+    protected abstract V computeNext();
+
+    public boolean hasNext()
+    {
+        switch (state)
+        {
+            case MUST_FETCH:
+                state = State.FAILED;
+                next = computeNext();
+
+            default:
+                if (state == State.DONE)
+                    return false;
+
+                state = State.HAS_NEXT;
+                return true;
+
+            case FAILED:
+                throw new IllegalStateException();
+        }
+    }
+
+    public V next()
+    {
+        if (state != State.HAS_NEXT && !hasNext())
+            throw new NoSuchElementException();
+
+        state = State.MUST_FETCH;
+        V result = next;
+        next = null;
+        return result;
+    }
+
+    public V peek()
+    {
+        if (!hasNext())
+            throw new NoSuchElementException();
+        return next;
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java b/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
index 44d8f24..b046e84 100644
--- a/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java
+++ b/src/java/org/apache/cassandra/utils/AlwaysPresentFilter.java

@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.utils;
 
+import org.apache.cassandra.utils.concurrent.Ref;
+
 public class AlwaysPresentFilter implements IFilter
 {
     public boolean isPresent(FilterKey key)
@@ -40,6 +42,10 @@
         return accumulate;
     }
 
+    public void addTo(Ref.IdentityCollection identities)
+    {
+    }
+
     public long serializedSize() { return 0; }
 
     @Override

diff --git a/src/java/org/apache/cassandra/utils/BloomFilter.java b/src/java/org/apache/cassandra/utils/BloomFilter.java
index 9de202c..ce6c638 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilter.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilter.java

@@ -19,9 +19,9 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+import org.apache.cassandra.utils.concurrent.Ref;
 import org.apache.cassandra.utils.concurrent.WrappedSharedCloseable;
 import org.apache.cassandra.utils.obs.IBitSet;
-import org.apache.cassandra.db.TypeSizes;
 
 public class BloomFilter extends WrappedSharedCloseable implements IFilter
 {
@@ -35,26 +35,31 @@
 
     public final IBitSet bitset;
     public final int hashCount;
+    /**
+     * CASSANDRA-8413: 3.0 (inverted) bloom filters have no 'static' bits caused by using the same upper bits
+     * for both bloom filter and token distribution.
+     */
+    public final boolean oldBfHashOrder;
 
-    BloomFilter(int hashCount, IBitSet bitset)
+    BloomFilter(int hashCount, IBitSet bitset, boolean oldBfHashOrder)
     {
         super(bitset);
         this.hashCount = hashCount;
         this.bitset = bitset;
+        this.oldBfHashOrder = oldBfHashOrder;
     }
 
-    BloomFilter(BloomFilter copy)
+    private BloomFilter(BloomFilter copy)
     {
         super(copy);
         this.hashCount = copy.hashCount;
         this.bitset = copy.bitset;
+        this.oldBfHashOrder = copy.oldBfHashOrder;
     }
 
-    public static final BloomFilterSerializer serializer = new BloomFilterSerializer();
-
     public long serializedSize()
     {
-        return serializer.serializedSize(this, TypeSizes.NATIVE);
+        return BloomFilterSerializer.serializedSize(this);
     }
 
     // Murmur is faster than an SHA-based approach and provides as-good collision
@@ -71,7 +76,7 @@
         long[] hash = new long[2];
         key.filterHash(hash);
         long[] indexes = new long[hashCount];
-        setIndexes(hash[0], hash[1], hashCount, max, indexes);
+        setIndexes(hash[1], hash[0], hashCount, max, indexes);
         return indexes;
     }
 
@@ -85,12 +90,19 @@
         // so that we do not need to allocate two arrays.
         long[] indexes = reusableIndexes.get();
         key.filterHash(indexes);
-        setIndexes(indexes[0], indexes[1], hashCount, bitset.capacity(), indexes);
+        setIndexes(indexes[1], indexes[0], hashCount, bitset.capacity(), indexes);
         return indexes;
     }
 
     private void setIndexes(long base, long inc, int count, long max, long[] results)
     {
+        if (oldBfHashOrder)
+        {
+            long x = inc;
+            inc = base;
+            base = x;
+        }
+
         for (int i = 0; i < count; i++)
         {
             results[i] = FBUtilities.abs(base % max);
@@ -135,4 +147,15 @@
     {
         return bitset.offHeapSize();
     }
+
+    public String toString()
+    {
+        return "BloomFilter[hashCount=" + hashCount + ";oldBfHashOrder=" + oldBfHashOrder + ";capacity=" + bitset.capacity() + ']';
+    }
+
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        super.addTo(identities);
+        bitset.addTo(identities);
+    }
 }

diff --git a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
index 6f80ac0..6f57fc8 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java

@@ -21,50 +21,48 @@
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.obs.IBitSet;
 import org.apache.cassandra.utils.obs.OffHeapBitSet;
 import org.apache.cassandra.utils.obs.OpenBitSet;
 
-class BloomFilterSerializer implements ISerializer<BloomFilter>
+final class BloomFilterSerializer
 {
-    public void serialize(BloomFilter bf, DataOutputPlus out) throws IOException
+    private BloomFilterSerializer()
+    {
+    }
+
+    public static void serialize(BloomFilter bf, DataOutputPlus out) throws IOException
     {
         out.writeInt(bf.hashCount);
         bf.bitset.serialize(out);
     }
 
-    public BloomFilter deserialize(DataInput in) throws IOException
+    public static BloomFilter deserialize(DataInput in, boolean oldBfHashOrder) throws IOException
     {
-        return deserialize(in, false);
+        return deserialize(in, false, oldBfHashOrder);
     }
 
     @SuppressWarnings("resource")
-    public BloomFilter deserialize(DataInput in, boolean offheap) throws IOException
+    public static BloomFilter deserialize(DataInput in, boolean offheap, boolean oldBfHashOrder) throws IOException
     {
         int hashes = in.readInt();
         IBitSet bs = offheap ? OffHeapBitSet.deserialize(in) : OpenBitSet.deserialize(in);
-        return createFilter(hashes, bs);
-    }
 
-    BloomFilter createFilter(int hashes, IBitSet bs)
-    {
-        return new BloomFilter(hashes, bs);
+        return new BloomFilter(hashes, bs, oldBfHashOrder);
     }
 
     /**
      * Calculates a serialized size of the given Bloom Filter
-     * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus)
-     *
      * @param bf Bloom filter to calculate serialized size
+     * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus)
      *
      * @return serialized size of the given bloom filter
      */
-    public long serializedSize(BloomFilter bf, TypeSizes typeSizes)
+    public static long serializedSize(BloomFilter bf)
     {
-        int size = typeSizes.sizeof(bf.hashCount); // hash count
-        size += bf.bitset.serializedSize(typeSizes);
+        int size = TypeSizes.sizeof(bf.hashCount); // hash count
+        size += bf.bitset.serializedSize();
         return size;
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/BooleanSerializer.java b/src/java/org/apache/cassandra/utils/BooleanSerializer.java
index 8f3abde..1fe7702 100644
--- a/src/java/org/apache/cassandra/utils/BooleanSerializer.java
+++ b/src/java/org/apache/cassandra/utils/BooleanSerializer.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.DataInput;
 import java.io.IOException;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class BooleanSerializer implements IVersionedSerializer<Boolean>
@@ -32,7 +32,7 @@
         out.writeBoolean(b);
     }
 
-    public Boolean deserialize(DataInput in, int version) throws IOException
+    public Boolean deserialize(DataInputPlus in, int version) throws IOException
     {
         return in.readBoolean();
     }

diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
index 1779e67..e5a3bb8 100644
--- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
+++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java

@@ -33,6 +33,8 @@
 import java.util.UUID;
 
 import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileUtils;
@@ -289,6 +291,12 @@
         out.write(bytes);
     }
 
+    public static void writeWithVIntLength(ByteBuffer bytes, DataOutputPlus out) throws IOException
+    {
+        out.writeUnsignedVInt(bytes.remaining());
+        out.write(bytes);
+    }
+
     public static void writeWithLength(byte[] bytes, DataOutput out) throws IOException
     {
         out.writeInt(bytes.length);
@@ -298,8 +306,8 @@
     public static void writeWithShortLength(ByteBuffer buffer, DataOutputPlus out) throws IOException
     {
         int length = buffer.remaining();
-        assert 0 <= length && length <= FBUtilities.MAX_UNSIGNED_SHORT :
-        String.format("Attempted serializing to buffer exceeded maximum of %s bytes: %s", FBUtilities.MAX_UNSIGNED_SHORT, length);
+        assert 0 <= length && length <= FBUtilities.MAX_UNSIGNED_SHORT
+            : String.format("Attempted serializing to buffer exceeded maximum of %s bytes: %s", FBUtilities.MAX_UNSIGNED_SHORT, length);
         out.writeShort(length);
         out.write(buffer);
     }
@@ -307,8 +315,8 @@
     public static void writeWithShortLength(byte[] buffer, DataOutput out) throws IOException
     {
         int length = buffer.length;
-        assert 0 <= length && length <= FBUtilities.MAX_UNSIGNED_SHORT :
-        String.format("Attempted serializing to buffer exceeded maximum of %s bytes: %s", FBUtilities.MAX_UNSIGNED_SHORT, length);
+        assert 0 <= length && length <= FBUtilities.MAX_UNSIGNED_SHORT
+            : String.format("Attempted serializing to buffer exceeded maximum of %s bytes: %s", FBUtilities.MAX_UNSIGNED_SHORT, length);
         out.writeShort(length);
         out.write(buffer);
     }
@@ -324,6 +332,36 @@
         return ByteBufferUtil.read(in, length);
     }
 
+    public static ByteBuffer readWithVIntLength(DataInputPlus in) throws IOException
+    {
+        int length = (int)in.readUnsignedVInt();
+        if (length < 0)
+            throw new IOException("Corrupt (negative) value length encountered");
+
+        return ByteBufferUtil.read(in, length);
+    }
+
+    public static int serializedSizeWithLength(ByteBuffer buffer)
+    {
+        int size = buffer.remaining();
+        return TypeSizes.sizeof(size) + size;
+    }
+
+    public static int serializedSizeWithVIntLength(ByteBuffer buffer)
+    {
+        int size = buffer.remaining();
+        return TypeSizes.sizeofUnsignedVInt(size) + size;
+    }
+
+    public static void skipWithVIntLength(DataInputPlus in) throws IOException
+    {
+        int length = (int)in.readUnsignedVInt();
+        if (length < 0)
+            throw new IOException("Corrupt (negative) value length encountered");
+
+        in.skipBytesFully(length);
+    }
+
     /* @return An unsigned short in an integer. */
     public static int readShortLength(DataInput in) throws IOException
     {
@@ -340,16 +378,21 @@
         return ByteBufferUtil.read(in, readShortLength(in));
     }
 
+    public static int serializedSizeWithShortLength(ByteBuffer buffer)
+    {
+        int size = buffer.remaining();
+        return TypeSizes.sizeof((short)size) + size;
+    }
+
     /**
      * @param in data input
      * @return null
      * @throws IOException if an I/O error occurs.
      */
-    public static ByteBuffer skipShortLength(DataInput in) throws IOException
+    public static void skipShortLength(DataInputPlus in) throws IOException
     {
         int skip = readShortLength(in);
-        FileUtils.skipBytesFully(in, skip);
-        return null;
+        in.skipBytesFully(skip);
     }
 
     public static ByteBuffer read(DataInput in, int length) throws IOException
@@ -357,9 +400,6 @@
         if (length == 0)
             return EMPTY_BYTE_BUFFER;
 
-        if (in instanceof FileDataInput)
-            return ((FileDataInput) in).readBytes(length);
-
         byte[] buff = new byte[length];
         in.readFully(buff);
         return ByteBuffer.wrap(buff);
@@ -570,12 +610,35 @@
         return prefix.equals(value.duplicate().limit(value.remaining() - diff));
     }
 
+    public static boolean canMinimize(ByteBuffer buf)
+    {
+        return buf != null && (buf.capacity() > buf.remaining() || !buf.hasArray());
+    }
+
     /** trims size of bytebuffer to exactly number of bytes in it, to do not hold too much memory */
     public static ByteBuffer minimalBufferFor(ByteBuffer buf)
     {
         return buf.capacity() > buf.remaining() || !buf.hasArray() ? ByteBuffer.wrap(getArray(buf)) : buf;
     }
 
+    public static ByteBuffer[] minimizeBuffers(ByteBuffer[] src)
+    {
+        ByteBuffer[] dst = new ByteBuffer[src.length];
+        for (int i=0; i<src.length; i++)
+            dst[i] = src[i] != null ? minimalBufferFor(src[i]) : null;
+        return dst;
+    }
+
+    public static boolean canMinimize(ByteBuffer[] src)
+    {
+        for (ByteBuffer buffer : src)
+        {
+            if (canMinimize(buffer))
+                return true;
+        }
+        return false;
+    }
+
     // Doesn't change bb position
     public static int getShortLength(ByteBuffer bb, int position)
     {

diff --git a/src/java/org/apache/cassandra/utils/BytesReadTracker.java b/src/java/org/apache/cassandra/utils/BytesReadTracker.java
deleted file mode 100644
index f363513..0000000
--- a/src/java/org/apache/cassandra/utils/BytesReadTracker.java
+++ /dev/null

@@ -1,151 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.IOException;
-
-/**
- * This class is to track bytes read from given DataInput
- */
-public class BytesReadTracker implements DataInput
-{
-
-    private long bytesRead;
-    private final DataInput source;
-
-    public BytesReadTracker(DataInput source)
-    {
-        this.source = source;
-    }
-
-    public long getBytesRead()
-    {
-        return bytesRead;
-    }
-
-    /**
-     * reset counter to @param count
-     */
-    public void reset(long count)
-    {
-        bytesRead = count;
-    }
-
-    public boolean readBoolean() throws IOException
-    {
-        boolean bool = source.readBoolean();
-        bytesRead += 1;
-        return bool;
-    }
-
-    public byte readByte() throws IOException
-    {
-        byte b = source.readByte();
-        bytesRead += 1;
-        return b;
-    }
-
-    public char readChar() throws IOException
-    {
-        char c = source.readChar();
-        bytesRead += 2;
-        return c;
-    }
-
-    public double readDouble() throws IOException
-    {
-        double d = source.readDouble();
-        bytesRead += 8;
-        return d;
-    }
-
-    public float readFloat() throws IOException
-    {
-        float f = source.readFloat();
-        bytesRead += 4;
-        return f;
-    }
-
-    public void readFully(byte[] b, int off, int len) throws IOException
-    {
-        source.readFully(b, off, len);
-        bytesRead += len;
-    }
-
-    public void readFully(byte[] b) throws IOException
-    {
-        source.readFully(b);
-        bytesRead += b.length;
-    }
-
-    public int readInt() throws IOException
-    {
-        int i = source.readInt();
-        bytesRead += 4;
-        return i;
-    }
-
-    public String readLine() throws IOException
-    {
-        // since this method is deprecated and cannot track bytes read
-        // just throw exception
-        throw new UnsupportedOperationException();
-    }
-
-    public long readLong() throws IOException
-    {
-        long l = source.readLong();
-        bytesRead += 8;
-        return l;
-    }
-
-    public short readShort() throws IOException
-    {
-        short s = source.readShort();
-        bytesRead += 2;
-        return s;
-    }
-
-    public String readUTF() throws IOException
-    {
-        return DataInputStream.readUTF(this);
-    }
-
-    public int readUnsignedByte() throws IOException
-    {
-        int i = source.readUnsignedByte();
-        bytesRead += 1;
-        return i;
-    }
-
-    public int readUnsignedShort() throws IOException
-    {
-        int i = source.readUnsignedShort();
-        bytesRead += 2;
-        return i;
-    }
-
-    public int skipBytes(int n) throws IOException
-    {
-        int skipped = source.skipBytes(n);
-        bytesRead += skipped;
-        return skipped;
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/CLibrary.java b/src/java/org/apache/cassandra/utils/CLibrary.java
deleted file mode 100644
index e3bec4f..0000000
--- a/src/java/org/apache/cassandra/utils/CLibrary.java
+++ /dev/null

@@ -1,364 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.io.FileDescriptor;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.lang.reflect.Field;
-import java.nio.channels.FileChannel;
-import java.nio.file.Paths;
-import java.nio.file.StandardOpenOption;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.sun.jna.LastErrorException;
-import com.sun.jna.Native;
-
-public final class CLibrary
-{
-    private static final Logger logger = LoggerFactory.getLogger(CLibrary.class);
-
-    private static final int MCL_CURRENT;
-    private static final int MCL_FUTURE;
-
-    private static final int ENOMEM = 12;
-
-    private static final int F_GETFL   = 3;  /* get file status flags */
-    private static final int F_SETFL   = 4;  /* set file status flags */
-    private static final int F_NOCACHE = 48; /* Mac OS X specific flag, turns cache on/off */
-    private static final int O_DIRECT  = 040000; /* fcntl.h */
-    private static final int O_RDONLY  = 00000000; /* fcntl.h */
-
-    private static final int POSIX_FADV_NORMAL     = 0; /* fadvise.h */
-    private static final int POSIX_FADV_RANDOM     = 1; /* fadvise.h */
-    private static final int POSIX_FADV_SEQUENTIAL = 2; /* fadvise.h */
-    private static final int POSIX_FADV_WILLNEED   = 3; /* fadvise.h */
-    private static final int POSIX_FADV_DONTNEED   = 4; /* fadvise.h */
-    private static final int POSIX_FADV_NOREUSE    = 5; /* fadvise.h */
-
-    static boolean jnaAvailable = true;
-    static boolean jnaLockable = false;
-
-    static
-    {
-        try
-        {
-            if (Boolean.getBoolean("cassandra.disable_clibrary"))
-            {
-                jnaAvailable = false;
-            }
-            else
-            {
-                Native.register("c");
-            }
-        }
-        catch (NoClassDefFoundError e)
-        {
-            logger.warn("JNA not found. Native methods will be disabled.");
-            jnaAvailable = false;
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            logger.warn("JNA link failure, one or more native method will be unavailable.");
-            logger.trace("JNA link failure details: {}", e.getMessage());
-        }
-        catch (NoSuchMethodError e)
-        {
-            logger.warn("Obsolete version of JNA present; unable to register C library. Upgrade to JNA 3.2.7 or later");
-            jnaAvailable = false;
-        }
-
-        if (System.getProperty("os.arch").toLowerCase().contains("ppc"))
-        {
-            if (System.getProperty("os.name").toLowerCase().contains("linux"))
-            {
-               MCL_CURRENT = 0x2000;
-               MCL_FUTURE = 0x4000;
-            }
-            else if (System.getProperty("os.name").toLowerCase().contains("aix"))
-            {
-                MCL_CURRENT = 0x100;
-                MCL_FUTURE = 0x200;
-            }
-            else
-            {
-                MCL_CURRENT = 1;
-                MCL_FUTURE = 2;
-            }
-        }
-        else
-        {
-            MCL_CURRENT = 1;
-            MCL_FUTURE = 2;
-        }
-    }
-
-    private static native int mlockall(int flags) throws LastErrorException;
-    private static native int munlockall() throws LastErrorException;
-    private static native int fcntl(int fd, int command, long flags) throws LastErrorException;
-    private static native int posix_fadvise(int fd, long offset, int len, int flag) throws LastErrorException;
-    private static native int open(String path, int flags) throws LastErrorException;
-    private static native int fsync(int fd) throws LastErrorException;
-    private static native int close(int fd) throws LastErrorException;
-
-    private static int errno(RuntimeException e)
-    {
-        assert e instanceof LastErrorException;
-        try
-        {
-            return ((LastErrorException) e).getErrorCode();
-        }
-        catch (NoSuchMethodError x)
-        {
-            logger.warn("Obsolete version of JNA present; unable to read errno. Upgrade to JNA 3.2.7 or later");
-            return 0;
-        }
-    }
-
-    private CLibrary() {}
-
-    public static boolean jnaAvailable()
-    {
-        return jnaAvailable;
-    }
-
-    public static boolean jnaMemoryLockable()
-    {
-        return jnaLockable;
-    }
-
-    public static void tryMlockall()
-    {
-        try
-        {
-            mlockall(MCL_CURRENT);
-            jnaLockable = true;
-            logger.info("JNA mlockall successful");
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            // this will have already been logged by CLibrary, no need to repeat it
-        }
-        catch (RuntimeException e)
-        {
-            if (!(e instanceof LastErrorException))
-                throw e;
-
-            if (errno(e) == ENOMEM && System.getProperty("os.name").toLowerCase().contains("linux"))
-            {
-                logger.warn("Unable to lock JVM memory (ENOMEM)."
-                        + " This can result in part of the JVM being swapped out, especially with mmapped I/O enabled."
-                        + " Increase RLIMIT_MEMLOCK or run Cassandra as root.");
-            }
-            else if (!System.getProperty("os.name").toLowerCase().contains("mac"))
-            {
-                // OS X allows mlockall to be called, but always returns an error
-                logger.warn("Unknown mlockall error {}", errno(e));
-            }
-        }
-    }
-
-    public static void trySkipCache(String path, long offset, long len)
-    {
-        trySkipCache(getfd(path), offset, len);
-    }
-
-    public static void trySkipCache(int fd, long offset, long len)
-    {
-        if (len == 0)
-            trySkipCache(fd, 0, 0);
-
-        while (len > 0)
-        {
-            int sublen = (int) Math.min(Integer.MAX_VALUE, len);
-            trySkipCache(fd, offset, sublen);
-            len -= sublen;
-            offset -= sublen;
-        }
-    }
-
-    public static void trySkipCache(int fd, long offset, int len)
-    {
-        if (fd < 0)
-            return;
-
-        try
-        {
-            if (System.getProperty("os.name").toLowerCase().contains("linux"))
-            {
-                posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED);
-            }
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            // if JNA is unavailable just skipping Direct I/O
-            // instance of this class will act like normal RandomAccessFile
-        }
-        catch (RuntimeException e)
-        {
-            if (!(e instanceof LastErrorException))
-                throw e;
-
-            logger.warn(String.format("posix_fadvise(%d, %d) failed, errno (%d).", fd, offset, errno(e)));
-        }
-    }
-
-    public static int tryFcntl(int fd, int command, int flags)
-    {
-        // fcntl return value may or may not be useful, depending on the command
-        int result = -1;
-
-        try
-        {
-            result = fcntl(fd, command, flags);
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            // if JNA is unavailable just skipping
-        }
-        catch (RuntimeException e)
-        {
-            if (!(e instanceof LastErrorException))
-                throw e;
-
-            logger.warn(String.format("fcntl(%d, %d, %d) failed, errno (%d).", fd, command, flags, errno(e)));
-        }
-
-        return result;
-    }
-
-    public static int tryOpenDirectory(String path)
-    {
-        int fd = -1;
-
-        try
-        {
-            return open(path, O_RDONLY);
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            // JNA is unavailable just skipping Direct I/O
-        }
-        catch (RuntimeException e)
-        {
-            if (!(e instanceof LastErrorException))
-                throw e;
-
-            logger.warn(String.format("open(%s, O_RDONLY) failed, errno (%d).", path, errno(e)));
-        }
-
-        return fd;
-    }
-
-    public static void trySync(int fd)
-    {
-        if (fd == -1)
-            return;
-
-        try
-        {
-            fsync(fd);
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            // JNA is unavailable just skipping Direct I/O
-        }
-        catch (RuntimeException e)
-        {
-            if (!(e instanceof LastErrorException))
-                throw e;
-
-            logger.warn(String.format("fsync(%d) failed, errno (%d).", fd, errno(e)));
-        }
-    }
-
-    public static void tryCloseFD(int fd)
-    {
-        if (fd == -1)
-            return;
-
-        try
-        {
-            close(fd);
-        }
-        catch (UnsatisfiedLinkError e)
-        {
-            // JNA is unavailable just skipping Direct I/O
-        }
-        catch (RuntimeException e)
-        {
-            if (!(e instanceof LastErrorException))
-                throw e;
-
-            logger.warn(String.format("close(%d) failed, errno (%d).", fd, errno(e)));
-        }
-    }
-
-    public static int getfd(FileChannel channel)
-    {
-        Field field = FBUtilities.getProtectedField(channel.getClass(), "fd");
-
-        try
-        {
-            return getfd((FileDescriptor)field.get(channel));
-        }
-        catch (IllegalArgumentException|IllegalAccessException e)
-        {
-            logger.warn("Unable to read fd field from FileChannel");
-        }
-        return -1;
-    }
-
-    /**
-     * Get system file descriptor from FileDescriptor object.
-     * @param descriptor - FileDescriptor objec to get fd from
-     * @return file descriptor, -1 or error
-     */
-    public static int getfd(FileDescriptor descriptor)
-    {
-        Field field = FBUtilities.getProtectedField(descriptor.getClass(), "fd");
-
-        try
-        {
-            return field.getInt(descriptor);
-        }
-        catch (Exception e)
-        {
-            JVMStabilityInspector.inspectThrowable(e);
-            logger.warn("Unable to read fd field from FileDescriptor");
-        }
-
-        return -1;
-    }
-
-    public static int getfd(String path)
-    {
-        try(FileChannel channel = FileChannel.open(Paths.get(path), StandardOpenOption.READ))
-        {
-            return getfd(channel);
-        }
-        catch (IOException e)
-        {
-            JVMStabilityInspector.inspectThrowable(e);
-            // ignore
-            return -1;
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/CRC32Factory.java b/src/java/org/apache/cassandra/utils/CRC32Factory.java
deleted file mode 100644
index a031f09..0000000
--- a/src/java/org/apache/cassandra/utils/CRC32Factory.java
+++ /dev/null

@@ -1,35 +0,0 @@
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
-*/
-package org.apache.cassandra.utils;
-
-
-/**
- * CRC Factory that uses our pure java crc for default
- */
-public class CRC32Factory extends com.github.tjake.CRC32Factory
-{
-    public static final CRC32Factory instance = new CRC32Factory();
-
-    public CRC32Factory()
-    {
-        super(PureJavaCrc32.class);
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/ChecksumType.java b/src/java/org/apache/cassandra/utils/ChecksumType.java
new file mode 100644
index 0000000..c9a1eb8
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/ChecksumType.java

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.nio.ByteBuffer;
+import java.util.zip.Checksum;
+import java.util.zip.CRC32;
+import java.util.zip.Adler32;
+
+public enum ChecksumType
+{
+    Adler32()
+    {
+
+        @Override
+        public Checksum newInstance()
+        {
+            return new Adler32();
+        }
+
+        @Override
+        public void update(Checksum checksum, ByteBuffer buf)
+        {
+            ((Adler32)checksum).update(buf);
+        }
+
+    },
+    CRC32()
+    {
+
+        @Override
+        public Checksum newInstance()
+        {
+            return new CRC32();
+        }
+
+        @Override
+        public void update(Checksum checksum, ByteBuffer buf)
+        {
+            ((CRC32)checksum).update(buf);
+        }
+
+    };
+
+    public abstract Checksum newInstance();
+
+    public abstract void update(Checksum checksum, ByteBuffer buf);
+}

diff --git a/src/java/org/apache/cassandra/utils/Clock.java b/src/java/org/apache/cassandra/utils/Clock.java
new file mode 100644
index 0000000..eb9822c
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/Clock.java

@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Wrapper around time related functions that are either implemented by using the default JVM calls
+ * or by using a custom implementation for testing purposes.
+ *
+ * See {@link #instance} for how to use a custom implementation.
+ *
+ * Please note that {@link java.time.Clock} wasn't used, as it would not be possible to provide an
+ * implementation for {@link #nanoTime()} with the exact same properties of {@link System#nanoTime()}.
+ */
+public class Clock
+{
+    private static final Logger logger = LoggerFactory.getLogger(Clock.class);
+
+    /**
+     * Static singleton object that will be instanciated by default with a system clock
+     * implementation. Set <code>cassandra.clock</code> system property to a FQCN to use a
+     * different implementation instead.
+     */
+    public static Clock instance;
+
+    static
+    {
+        String sclock = System.getProperty("cassandra.clock");
+        if (sclock == null)
+        {
+            instance = new Clock();
+        }
+        else
+        {
+            try
+            {
+                logger.debug("Using custom clock implementation: {}", sclock);
+                instance = (Clock) Class.forName(sclock).newInstance();
+            }
+            catch (Exception e)
+            {
+                logger.error(e.getMessage(), e);
+            }
+        }
+    }
+
+    /**
+     * @see System#nanoTime()
+     */
+    public long nanoTime()
+    {
+        return System.nanoTime();
+    }
+
+    /**
+     * @see System#currentTimeMillis()
+     */
+    public long currentTimeMillis()
+    {
+        return System.currentTimeMillis();
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/utils/CloseableIterator.java b/src/java/org/apache/cassandra/utils/CloseableIterator.java
index 7474f3d..57034ae 100644
--- a/src/java/org/apache/cassandra/utils/CloseableIterator.java
+++ b/src/java/org/apache/cassandra/utils/CloseableIterator.java

@@ -17,10 +17,10 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.Closeable;
 import java.util.Iterator;
 
 // so we can instantiate anonymous classes implementing both interfaces
-public interface CloseableIterator<T> extends Iterator<T>, AutoCloseable, Closeable
+public interface CloseableIterator<T> extends Iterator<T>, AutoCloseable
 {
+    public void close();
 }

diff --git a/src/java/org/apache/cassandra/utils/CoalescingStrategies.java b/src/java/org/apache/cassandra/utils/CoalescingStrategies.java
index 52d4240..d79fa15 100644
--- a/src/java/org/apache/cassandra/utils/CoalescingStrategies.java
+++ b/src/java/org/apache/cassandra/utils/CoalescingStrategies.java

@@ -35,6 +35,7 @@
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.LockSupport;
+import java.util.Locale;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
@@ -517,7 +518,7 @@
                                                     String displayName)
     {
         String classname = null;
-        String strategyCleaned = strategy.trim().toUpperCase();
+        String strategyCleaned = strategy.trim().toUpperCase(Locale.ENGLISH);
         switch(strategyCleaned)
         {
         case "MOVINGAVERAGE":

diff --git a/src/java/org/apache/cassandra/utils/CounterId.java b/src/java/org/apache/cassandra/utils/CounterId.java
index 2552178..690d4aa 100644
--- a/src/java/org/apache/cassandra/utils/CounterId.java
+++ b/src/java/org/apache/cassandra/utils/CounterId.java

@@ -46,10 +46,11 @@
     }
 
     /**
-     * Function for test purposes, do not use otherwise.
      * Pack an int in a valid CounterId so that the resulting ids respects the
      * numerical ordering. Used for creating handcrafted but easy to
      * understand contexts in unit tests (see CounterContextTest).
+     *
+     * Also used to generate a special ID for special-case update contexts (see CounterContext.createUpdate()).
      */
     public static CounterId fromInt(int n)
     {

diff --git a/src/java/org/apache/cassandra/utils/DefaultInteger.java b/src/java/org/apache/cassandra/utils/DefaultInteger.java
deleted file mode 100644
index 2a3efc7..0000000
--- a/src/java/org/apache/cassandra/utils/DefaultInteger.java
+++ /dev/null

@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-
-public class DefaultInteger
-{
-    private final int originalValue;
-    private int currentValue;
-
-    public DefaultInteger(int value)
-    {
-        originalValue = value;
-        currentValue = value;
-    }
-
-    public int value()
-    {
-        return currentValue;
-    }
-
-    public void set(int i)
-    {
-        currentValue = i;
-    }
-
-    public void reset()
-    {
-        currentValue = originalValue;
-    }
-
-    public boolean isModified()
-    {
-        return originalValue != currentValue;
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/DefaultValue.java b/src/java/org/apache/cassandra/utils/DefaultValue.java
new file mode 100644
index 0000000..5697ede
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/DefaultValue.java

@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+
+public class DefaultValue<T>
+{
+    private final T originalValue;
+    private T currentValue;
+
+    public DefaultValue(T value)
+    {
+        originalValue = value;
+        currentValue = value;
+    }
+
+    public T value()
+    {
+        return currentValue;
+    }
+
+    public void set(T i)
+    {
+        currentValue = i;
+    }
+
+    public void reset()
+    {
+        currentValue = originalValue;
+    }
+
+    public boolean isModified()
+    {
+        return originalValue != currentValue;
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/DiagnosticSnapshotService.java b/src/java/org/apache/cassandra/utils/DiagnosticSnapshotService.java
new file mode 100644
index 0000000..5c48412
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/DiagnosticSnapshotService.java

@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.net.InetAddress;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.UUID;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicLong;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+
+/**
+ * Provides a means to take snapshots when triggered by anomalous events or when the breaking of invariants is
+ * detected. When debugging certain classes of problems, having access to the relevant set of sstables when the problem
+ * is detected (or as close to then as possible) can be invaluable.
+ *
+ * This class performs two functions; on a replica where an anomaly is detected, it provides methods to issue snapshot
+ * requests to a provided set of replicas. For instance, if rows with duplicate clusterings are detected
+ * (CASSANDRA-15789) during a read, a snapshot request will be issued to all participating replicas. If detected during
+ * compaction, only the replica itself will receive the request. Requests are issued at a maximum rate of 1 per minute
+ * for any given table. Any additional triggers for the same table during the 60 second window are dropped, regardless
+ * of the replica set. This window is configurable via a system property (cassandra.diagnostic_snapshot_interval_nanos),
+ * but this is intended for use in testing only and operators are not expected to override the default.
+ *
+ * The second function performed is to handle snapshot requests on replicas. Snapshot names are prefixed with strings
+ * specific to the reason which triggered them. To manage consumption of disk space, replicas are restricted to taking
+ * a single snapshot for each prefix in a single calendar day. So if duplicate rows are detected by multiple
+ * coordinators during reads with the same replica set (or overlapping sets) on the same table, the coordinators may
+ * each issue snapshot  requests, but the replicas will only accept the first one they receive. Further requests will
+ * be dropped on the replica side.
+ */
+public class DiagnosticSnapshotService
+{
+    private static final Logger logger = LoggerFactory.getLogger(DiagnosticSnapshotService.class);
+
+    public static final DiagnosticSnapshotService instance =
+        new DiagnosticSnapshotService(Executors.newSingleThreadExecutor(new NamedThreadFactory("DiagnosticSnapshot")));
+
+    public static final String DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX = "DuplicateRows-";
+
+    private final Executor executor;
+
+    private DiagnosticSnapshotService(Executor executor)
+    {
+        this.executor = executor;
+    }
+
+    // Issue at most 1 snapshot request per minute for any given table.
+    // Replicas will only create one snapshot per day, but this stops us
+    // from swamping the network.
+    // Overridable via system property for testing.
+    private static final long SNAPSHOT_INTERVAL_NANOS = TimeUnit.MINUTES.toNanos(1);
+    private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.BASIC_ISO_DATE;
+    private final ConcurrentHashMap<UUID, AtomicLong> lastSnapshotTimes = new ConcurrentHashMap<>();
+
+    public static void duplicateRows(CFMetaData metadata, Iterable<InetAddress> replicas)
+    {
+        instance.maybeTriggerSnapshot(metadata, DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX, replicas);
+    }
+
+    public static boolean isDiagnosticSnapshotRequest(SnapshotCommand command)
+    {
+        return command.snapshot_name.startsWith(DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX);
+    }
+
+    public static void snapshot(SnapshotCommand command, InetAddress initiator)
+    {
+        Preconditions.checkArgument(isDiagnosticSnapshotRequest(command));
+        instance.maybeSnapshot(command, initiator);
+    }
+
+    public static String getSnapshotName(String prefix)
+    {
+        return String.format("%s%s", prefix, DATE_FORMAT.format(LocalDate.now()));
+    }
+
+    @VisibleForTesting
+    public void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
+    {
+        ExecutorUtils.shutdownNowAndWait(timeout, unit, executor);
+    }
+
+    private void maybeTriggerSnapshot(CFMetaData metadata, String prefix, Iterable<InetAddress> endpoints)
+    {
+        long now = System.nanoTime();
+        AtomicLong cached = lastSnapshotTimes.computeIfAbsent(metadata.cfId, u -> new AtomicLong(0));
+        long last = cached.get();
+        long interval = Long.getLong("cassandra.diagnostic_snapshot_interval_nanos", SNAPSHOT_INTERVAL_NANOS);
+        if (now - last > interval && cached.compareAndSet(last, now))
+        {
+            MessageOut<?> msg = new SnapshotCommand(metadata.ksName,
+                                                    metadata.cfName,
+                                                    getSnapshotName(prefix),
+                                                    false).createMessage();
+            for (InetAddress replica : endpoints)
+                MessagingService.instance().sendOneWay(msg, replica);
+        }
+        else
+        {
+            logger.debug("Diagnostic snapshot request dropped due to throttling");
+        }
+    }
+
+    private void maybeSnapshot(SnapshotCommand command, InetAddress initiator)
+    {
+        executor.execute(new DiagnosticSnapshotTask(command, initiator));
+    }
+
+    private static class DiagnosticSnapshotTask implements Runnable
+    {
+        final SnapshotCommand command;
+        final InetAddress from;
+
+        DiagnosticSnapshotTask(SnapshotCommand command, InetAddress from)
+        {
+            this.command = command;
+            this.from = from;
+        }
+
+        public void run()
+        {
+            try
+            {
+                Keyspace ks = Keyspace.open(command.keyspace);
+                if (ks == null)
+                {
+                    logger.info("Snapshot request received from {} for {}.{} but keyspace not found",
+                                from,
+                                command.keyspace,
+                                command.column_family);
+                    return;
+                }
+
+                ColumnFamilyStore cfs = ks.getColumnFamilyStore(command.column_family);
+                if (cfs.snapshotExists(command.snapshot_name))
+                {
+                    logger.info("Received diagnostic snapshot request from {} for {}.{}, " +
+                                "but snapshot with tag {} already exists",
+                                from,
+                                command.keyspace,
+                                command.column_family,
+                                command.snapshot_name);
+                    return;
+                }
+                logger.info("Creating snapshot requested by {} of {}.{} tag: {}",
+                            from,
+                            command.keyspace,
+                            command.column_family,
+                            command.snapshot_name);
+                cfs.snapshot(command.snapshot_name);
+            }
+            catch (IllegalArgumentException e)
+            {
+                logger.warn("Snapshot request received from {} for {}.{} but CFS not found",
+                            from,
+                            command.keyspace,
+                            command.column_family);
+            }
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/DynamicList.java b/src/java/org/apache/cassandra/utils/DynamicList.java
new file mode 100644
index 0000000..30f5160
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/DynamicList.java

@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.TreeSet;
+import java.util.concurrent.ThreadLocalRandom;
+
+// simple thread-unsafe skiplist that permits indexing/removal by position, insertion at the end
+// (though easily extended to insertion at any position, not necessary here)
+// we use it for sampling items by position for visiting writes in the pool of pending writes
+public class DynamicList<E>
+{
+
+    // represents a value and an index simultaneously; each node maintains a list
+    // of next pointers for each height in the skip-list this node participates in
+    // (a contiguous range from [0..height))
+    public static class Node<E>
+    {
+        // stores the size of each descendant
+        private final int[] size;
+        // TODO: alternate links to save space
+        private final Node<E>[] links;
+        private E value;
+
+        private Node(int height, E value)
+        {
+            this.value = value;
+            links = new Node[height * 2];
+            size = new int[height];
+            Arrays.fill(size, 1);
+        }
+
+        private int height()
+        {
+            return size.length;
+        }
+
+        private Node<E> next(int i)
+        {
+            return links[i * 2];
+        }
+
+        private Node<E> prev(int i)
+        {
+            return links[1 + i * 2];
+        }
+
+        private void setNext(int i, Node<E> next)
+        {
+            links[i * 2] = next;
+        }
+
+        private void setPrev(int i, Node<E> prev)
+        {
+            links[1 + i * 2] = prev;
+        }
+
+        private Node parent(int parentHeight)
+        {
+            Node prev = this;
+            while (true)
+            {
+                int height = prev.height();
+                if (parentHeight < height)
+                    return prev;
+                prev = prev.prev(height - 1);
+            }
+        }
+    }
+
+    private final int maxHeight;
+    private final Node<E> head;
+    private int size;
+
+    public DynamicList(int maxExpectedSize)
+    {
+        this.maxHeight = 3 + Math.max(0, (int) Math.ceil(Math.log(maxExpectedSize) / Math.log(2)));
+        head = new Node<>(maxHeight, null);
+    }
+
+    private int randomLevel()
+    {
+        return 1 + Integer.bitCount(ThreadLocalRandom.current().nextInt() & ((1 << (maxHeight - 1)) - 1));
+    }
+
+    public Node<E> append(E value)
+    {
+        return append(value, Integer.MAX_VALUE);
+    }
+
+    // add the value to the end of the list, and return the associated Node that permits efficient removal
+    // regardless of its future position in the list from other modifications
+    public Node<E> append(E value, int maxSize)
+    {
+        Node<E> newTail = new Node<>(randomLevel(), value);
+        if (size >= maxSize)
+            return null;
+        size++;
+
+        Node<E> tail = head;
+        for (int i = maxHeight - 1 ; i >= newTail.height() ; i--)
+        {
+            Node<E> next;
+            while ((next = tail.next(i)) != null)
+                tail = next;
+            tail.size[i]++;
+        }
+
+        for (int i = newTail.height() - 1 ; i >= 0 ; i--)
+        {
+            Node<E> next;
+            while ((next = tail.next(i)) != null)
+                tail = next;
+            tail.setNext(i, newTail);
+            newTail.setPrev(i, tail);
+        }
+
+        return newTail;
+    }
+
+    // remove the provided node and its associated value from the list
+    public void remove(Node<E> node)
+    {
+        assert node.value != null;
+        node.value = null;
+
+        size--;
+
+        // go up through each level in the skip list, unlinking this node; this entails
+        // simply linking each neighbour to each other, and appending the size of the
+        // current level owned by this node's index to the preceding neighbour (since
+        // ownership is defined as any node that you must visit through the index,
+        // removal of ourselves from a level means the preceding index entry is the
+        // entry point to all of the removed node's descendants)
+        for (int i = 0 ; i < node.height() ; i++)
+        {
+            Node<E> prev = node.prev(i);
+            Node<E> next = node.next(i);
+            assert prev != null;
+            prev.setNext(i, next);
+            if (next != null)
+                next.setPrev(i, prev);
+            prev.size[i] += node.size[i] - 1;
+        }
+
+        // then go up the levels, removing 1 from the size at each height above ours
+        for (int i = node.height() ; i < maxHeight ; i++)
+        {
+            // if we're at our height limit, we backtrack at our top level until we
+            // hit a neighbour with a greater height
+            while (i == node.height())
+                node = node.prev(i - 1);
+            node.size[i]--;
+        }
+    }
+
+    // retrieve the item at the provided index, or return null if the index is past the end of the list
+    public E get(int index)
+    {
+        if (index >= size)
+            return null;
+
+        index++;
+        int c = 0;
+        Node<E> finger = head;
+        for (int i = maxHeight - 1 ; i >= 0 ; i--)
+        {
+            while (c + finger.size[i] <= index)
+            {
+                c += finger.size[i];
+                finger = finger.next(i);
+            }
+        }
+
+        assert c == index;
+        return finger.value;
+    }
+
+    public int size()
+    {
+        return size;
+    }
+
+    // some quick and dirty tests to confirm the skiplist works as intended
+    // don't create a separate unit test - tools tree doesn't currently warrant them
+
+    private boolean isWellFormed()
+    {
+        for (int i = 0 ; i < maxHeight ; i++)
+        {
+            int c = 0;
+            for (Node node = head ; node != null ; node = node.next(i))
+            {
+                if (node.prev(i) != null && node.prev(i).next(i) != node)
+                    return false;
+                if (node.next(i) != null && node.next(i).prev(i) != node)
+                    return false;
+                c += node.size[i];
+                if (i + 1 < maxHeight && node.parent(i + 1).next(i + 1) == node.next(i))
+                {
+                    if (node.parent(i + 1).size[i + 1] != c)
+                        return false;
+                    c = 0;
+                }
+            }
+            if (i == maxHeight - 1 && c != size + 1)
+                return false;
+        }
+        return true;
+    }
+
+    public static void main(String[] args)
+    {
+        DynamicList<Integer> list = new DynamicList<>(20);
+        TreeSet<Integer> canon = new TreeSet<>();
+        HashMap<Integer, Node> nodes = new HashMap<>();
+        int c = 0;
+        for (int i = 0 ; i < 100000 ; i++)
+        {
+            nodes.put(c, list.append(c));
+            canon.add(c);
+            c++;
+        }
+        ThreadLocalRandom rand = ThreadLocalRandom.current();
+        assert list.isWellFormed();
+        for (int loop = 0 ; loop < 100 ; loop++)
+        {
+            System.out.println(loop);
+            for (int i = 0 ; i < 100000 ; i++)
+            {
+                int index = rand.nextInt(100000);
+                Integer seed = list.get(index);
+//                assert canon.headSet(seed, false).size() == index;
+                list.remove(nodes.remove(seed));
+                canon.remove(seed);
+                nodes.put(c, list.append(c));
+                canon.add(c);
+                c++;
+            }
+            assert list.isWellFormed();
+        }
+    }
+
+}

diff --git a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
index 1a48039..8109c98 100644
--- a/src/java/org/apache/cassandra/utils/EstimatedHistogram.java
+++ b/src/java/org/apache/cassandra/utils/EstimatedHistogram.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicLongArray;
@@ -26,8 +25,8 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-
 import org.slf4j.Logger;
 
 public class EstimatedHistogram
@@ -271,7 +270,7 @@
     }
 
     /**
-     * log.trace() every record in the histogram
+     * log.debug() every record in the histogram
      *
      * @param log
      */
@@ -302,7 +301,7 @@
             // calculation, and accept the unnecessary whitespace prefixes that will occasionally occur
             if (i == 0 && count == 0)
                 continue;
-            log.trace(String.format(formatstr, names[i], count));
+            log.debug(String.format(formatstr, names[i], count));
         }
     }
 
@@ -367,7 +366,7 @@
             }
         }
 
-        public EstimatedHistogram deserialize(DataInput in) throws IOException
+        public EstimatedHistogram deserialize(DataInputPlus in) throws IOException
         {
             int size = in.readInt();
             long[] offsets = new long[size - 1];
@@ -380,17 +379,17 @@
             return new EstimatedHistogram(offsets, buckets);
         }
 
-        public long serializedSize(EstimatedHistogram eh, TypeSizes typeSizes)
+        public long serializedSize(EstimatedHistogram eh)
         {
             int size = 0;
 
             long[] offsets = eh.getBucketOffsets();
             long[] buckets = eh.getBuckets(false);
-            size += typeSizes.sizeof(buckets.length);
+            size += TypeSizes.sizeof(buckets.length);
             for (int i = 0; i < buckets.length; i++)
             {
-                size += typeSizes.sizeof(offsets[i == 0 ? 0 : i - 1]);
-                size += typeSizes.sizeof(buckets[i]);
+                size += TypeSizes.sizeof(offsets[i == 0 ? 0 : i - 1]);
+                size += TypeSizes.sizeof(buckets[i]);
             }
             return size;
         }

diff --git a/src/java/org/apache/cassandra/utils/ExpiringMap.java b/src/java/org/apache/cassandra/utils/ExpiringMap.java
index 8359918..a6895c5 100644
--- a/src/java/org/apache/cassandra/utils/ExpiringMap.java
+++ b/src/java/org/apache/cassandra/utils/ExpiringMap.java

@@ -48,7 +48,7 @@
             assert value != null;
             this.value = value;
             this.timeout = timeout;
-            this.createdAt = System.nanoTime();
+            this.createdAt = Clock.instance.nanoTime();
         }
 
         private boolean isReadyToDieAt(long atNano)
@@ -85,7 +85,7 @@
         {
             public void run()
             {
-                long start = System.nanoTime();
+                long start = Clock.instance.nanoTime();
                 int n = 0;
                 for (Map.Entry<K, CacheableObject<V>> entry : cache.entrySet())
                 {
@@ -121,6 +121,11 @@
     public void reset()
     {
         shutdown = false;
+        clear();
+    }
+
+    public void clear()
+    {
         cache.clear();
     }
 

diff --git a/src/java/org/apache/cassandra/utils/FBUtilities.java b/src/java/org/apache/cassandra/utils/FBUtilities.java
index 69b7b5f..b560adf 100644
--- a/src/java/org/apache/cassandra/utils/FBUtilities.java
+++ b/src/java/org/apache/cassandra/utils/FBUtilities.java

@@ -19,8 +19,6 @@
 
 import java.io.*;
 import java.lang.reflect.Field;
-import java.lang.reflect.InvocationTargetException;
-import java.lang.reflect.Method;
 import java.math.BigInteger;
 import java.net.*;
 import java.nio.ByteBuffer;
@@ -28,7 +26,7 @@
 import java.security.NoSuchAlgorithmException;
 import java.util.*;
 import java.util.concurrent.*;
-import java.util.zip.Adler32;
+import java.util.zip.CRC32;
 import java.util.zip.Checksum;
 
 import javax.annotation.Nonnull;
@@ -36,7 +34,9 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Joiner;
-import com.google.common.collect.AbstractIterator;
+import com.google.common.base.Strings;
+import com.google.common.util.concurrent.Uninterruptibles;
+
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -46,12 +46,18 @@
 import org.apache.cassandra.auth.IRoleManager;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.IVersionedSerializer;
-import org.apache.cassandra.io.compress.CompressionParameters;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputBufferFixed;
 import org.apache.cassandra.io.util.FileUtils;
@@ -82,8 +88,9 @@
 
     public static int getAvailableProcessors()
     {
-        if (System.getProperty("cassandra.available_processors") != null)
-            return Integer.parseInt(System.getProperty("cassandra.available_processors"));
+        String availableProcessors = System.getProperty("cassandra.available_processors");
+        if (!Strings.isNullOrEmpty(availableProcessors))
+            return Integer.parseInt(availableProcessors);
         else
             return Runtime.getRuntime().availableProcessors();
     }
@@ -270,25 +277,6 @@
         return out;
     }
 
-    public static byte[] hash(ByteBuffer... data)
-    {
-        MessageDigest messageDigest = localMD5Digest.get();
-        for (ByteBuffer block : data)
-        {
-            if (block.hasArray())
-                messageDigest.update(block.array(), block.arrayOffset() + block.position(), block.remaining());
-            else
-                messageDigest.update(block.duplicate());
-        }
-
-        return messageDigest.digest();
-    }
-
-    public static BigInteger hashToBigInteger(ByteBuffer data)
-    {
-        return new BigInteger(hash(data)).abs();
-    }
-
     public static void sortSampledKeys(List<DecoratedKey> keys, Range<Token> range)
     {
         if (range.left.compareTo(range.right) >= 0)
@@ -385,6 +373,11 @@
         return System.currentTimeMillis() * 1000;
     }
 
+    public static int nowInSeconds()
+    {
+        return (int) (System.currentTimeMillis() / 1000);
+    }
+
     public static <T> List<T> waitOnFutures(Iterable<? extends Future<? extends T>> futures)
     {
         return waitOnFutures(futures, -1, null);
@@ -450,6 +443,63 @@
             result.get(ms, TimeUnit.MILLISECONDS);
     }
 
+    public static <T> Future<? extends T> waitOnFirstFuture(Iterable<? extends Future<? extends T>> futures)
+    {
+        return waitOnFirstFuture(futures, 100);
+    }
+    /**
+     * Only wait for the first future to finish from a list of futures. Will block until at least 1 future finishes.
+     * @param futures The futures to wait on
+     * @return future that completed.
+     */
+    public static <T> Future<? extends T> waitOnFirstFuture(Iterable<? extends Future<? extends T>> futures, long delay)
+    {
+        while (true)
+        {
+            for (Future<? extends T> f : futures)
+            {
+                if (f.isDone())
+                {
+                    try
+                    {
+                        f.get();
+                    }
+                    catch (InterruptedException e)
+                    {
+                        throw new AssertionError(e);
+                    }
+                    catch (ExecutionException e)
+                    {
+                        throw new RuntimeException(e);
+                    }
+                    return f;
+                }
+            }
+            Uninterruptibles.sleepUninterruptibly(delay, TimeUnit.MILLISECONDS);
+        }
+    }
+    /**
+     * Create a new instance of a partitioner defined in an SSTable Descriptor
+     * @param desc Descriptor of an sstable
+     * @return a new IPartitioner instance
+     * @throws IOException
+     */
+    public static IPartitioner newPartitioner(Descriptor desc) throws IOException
+    {
+        EnumSet<MetadataType> types = EnumSet.of(MetadataType.VALIDATION, MetadataType.HEADER);
+        Map<MetadataType, MetadataComponent> sstableMetadata = desc.getMetadataSerializer().deserialize(desc, types);
+        ValidationMetadata validationMetadata = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
+        SerializationHeader.Component header = (SerializationHeader.Component) sstableMetadata.get(MetadataType.HEADER);
+        if (validationMetadata.partitioner.endsWith("LocalPartitioner"))
+        {
+            return new LocalPartitioner(header.getKeyType());
+        }
+        else
+        {
+            return newPartitioner(validationMetadata.partitioner);
+        }
+    }
+
     public static IPartitioner newPartitioner(String partitionerClassName) throws ConfigurationException
     {
         if (!partitionerClassName.contains("."))
@@ -552,13 +602,18 @@
         }
     }
 
-    public static <T> SortedSet<T> singleton(T column, Comparator<? super T> comparator)
+    public static <T> NavigableSet<T> singleton(T column, Comparator<? super T> comparator)
     {
-        SortedSet<T> s = new TreeSet<T>(comparator);
+        NavigableSet<T> s = new TreeSet<T>(comparator);
         s.add(column);
         return s;
     }
 
+    public static <T> NavigableSet<T> emptySortedSet(Comparator<? super T> comparator)
+    {
+        return new TreeSet<T>(comparator);
+    }
+
     /**
      * Make straing out of the given {@code Map}.
      *
@@ -687,18 +742,32 @@
         checksum.update((v >>> 0) & 0xFF);
     }
 
-    private static Method directUpdate;
-    static
+    /**
+      * Updates checksum with the provided ByteBuffer at the given offset + length.
+      * Resets position and limit back to their original values on return.
+      * This method is *NOT* thread-safe.
+      */
+    public static void updateChecksum(CRC32 checksum, ByteBuffer buffer, int offset, int length)
     {
-        try
-        {
-            directUpdate = Adler32.class.getDeclaredMethod("update", new Class[]{ByteBuffer.class});
-            directUpdate.setAccessible(true);
-        } catch (NoSuchMethodException e)
-        {
-            logger.warn("JVM doesn't support Adler32 byte buffer access");
-            directUpdate = null;
-        }
+        int position = buffer.position();
+        int limit = buffer.limit();
+
+        buffer.position(offset).limit(offset + length);
+        checksum.update(buffer);
+
+        buffer.position(position).limit(limit);
+    }
+
+    /**
+     * Updates checksum with the provided ByteBuffer.
+     * Resets position back to its original values on return.
+     * This method is *NOT* thread-safe.
+     */
+    public static void updateChecksum(CRC32 checksum, ByteBuffer buffer)
+    {
+        int position = buffer.position();
+        checksum.update(buffer);
+        buffer.position(position);
     }
 
     private static final ThreadLocal<byte[]> threadLocalScratchBuffer = new ThreadLocal<byte[]>()
@@ -706,7 +775,7 @@
         @Override
         protected byte[] initialValue()
         {
-            return new byte[CompressionParameters.DEFAULT_CHUNK_LENGTH];
+            return new byte[CompressionParams.DEFAULT_CHUNK_LENGTH];
         }
     };
 
@@ -715,45 +784,6 @@
         return threadLocalScratchBuffer.get();
     }
 
-    //Java 7 has this method but it's private till Java 8. Thanks JDK!
-    public static boolean supportsDirectChecksum()
-    {
-        return directUpdate != null;
-    }
-
-    public static void directCheckSum(Adler32 checksum, ByteBuffer bb)
-    {
-        if (directUpdate != null)
-        {
-            try
-            {
-                directUpdate.invoke(checksum, bb);
-                return;
-            }
-            catch (IllegalAccessException e)
-            {
-                directUpdate = null;
-                logger.warn("JVM doesn't support Adler32 byte buffer access");
-            }
-            catch (InvocationTargetException e)
-            {
-                throw new RuntimeException(e);
-            }
-        }
-
-        //Fallback
-        byte[] buffer = getThreadLocalScratchBuffer();
-
-        int remaining;
-        while ((remaining = bb.remaining()) > 0)
-        {
-            remaining = Math.min(remaining, buffer.length);
-            ByteBufferUtil.arrayCopy(bb, bb.position(), buffer, 0, remaining);
-            bb.position(bb.position() + remaining);
-            checksum.update(buffer, 0, remaining);
-        }
-    }
-
     public static long abs(long index)
     {
         long negbit = index >> 63;
@@ -866,6 +896,11 @@
         digest.update((byte)  ((val >>> 0) & 0xFF));
     }
 
+    public static void updateWithBoolean(MessageDigest digest, boolean val)
+    {
+        updateWithByte(digest, val ? 0 : 1);
+    }
+
     public static void closeAll(Collection<? extends AutoCloseable> l) throws Exception
     {
         Exception toThrow = null;

diff --git a/src/java/org/apache/cassandra/utils/FastByteOperations.java b/src/java/org/apache/cassandra/utils/FastByteOperations.java
index f35d2e2..68e395c 100644
--- a/src/java/org/apache/cassandra/utils/FastByteOperations.java
+++ b/src/java/org/apache/cassandra/utils/FastByteOperations.java

@@ -104,7 +104,7 @@
         {
             String arch = System.getProperty("os.arch");
             boolean unaligned = arch.equals("i386") || arch.equals("x86")
-                                || arch.equals("amd64") || arch.equals("x86_64");
+                                || arch.equals("amd64") || arch.equals("x86_64") || arch.equals("s390x");
             if (!unaligned)
                 return new PureJavaOperations();
             try

diff --git a/src/java/org/apache/cassandra/utils/FilterFactory.java b/src/java/org/apache/cassandra/utils/FilterFactory.java
index d77500c..869f3fa 100644
--- a/src/java/org/apache/cassandra/utils/FilterFactory.java
+++ b/src/java/org/apache/cassandra/utils/FilterFactory.java

@@ -37,19 +37,19 @@
 
     public static void serialize(IFilter bf, DataOutputPlus output) throws IOException
     {
-        BloomFilter.serializer.serialize((BloomFilter) bf, output);
+        BloomFilterSerializer.serialize((BloomFilter) bf, output);
     }
 
-    public static IFilter deserialize(DataInput input, boolean offheap) throws IOException
+    public static IFilter deserialize(DataInput input, boolean offheap, boolean oldBfHashOrder) throws IOException
     {
-        return BloomFilter.serializer.deserialize(input, offheap);
+        return BloomFilterSerializer.deserialize(input, offheap, oldBfHashOrder);
     }
 
     /**
      * @return A BloomFilter with the lowest practical false positive
      *         probability for the given number of elements.
      */
-    public static IFilter getFilter(long numElements, int targetBucketsPerElem, boolean offheap)
+    public static IFilter getFilter(long numElements, int targetBucketsPerElem, boolean offheap, boolean oldBfHashOrder)
     {
         int maxBucketsPerElement = Math.max(1, BloomCalculations.maxBucketsPerElement(numElements));
         int bucketsPerElement = Math.min(targetBucketsPerElem, maxBucketsPerElement);
@@ -58,7 +58,7 @@
             logger.warn(String.format("Cannot provide an optimal BloomFilter for %d elements (%d/%d buckets per element).", numElements, bucketsPerElement, targetBucketsPerElem));
         }
         BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement);
-        return createFilter(spec.K, numElements, spec.bucketsPerElement, offheap);
+        return createFilter(spec.K, numElements, spec.bucketsPerElement, offheap, oldBfHashOrder);
     }
 
     /**
@@ -68,21 +68,21 @@
      *         Asserts that the given probability can be satisfied using this
      *         filter.
      */
-    public static IFilter getFilter(long numElements, double maxFalsePosProbability, boolean offheap)
+    public static IFilter getFilter(long numElements, double maxFalsePosProbability, boolean offheap, boolean oldBfHashOrder)
     {
         assert maxFalsePosProbability <= 1.0 : "Invalid probability";
         if (maxFalsePosProbability == 1.0)
             return new AlwaysPresentFilter();
         int bucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements);
         BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability);
-        return createFilter(spec.K, numElements, spec.bucketsPerElement, offheap);
+        return createFilter(spec.K, numElements, spec.bucketsPerElement, offheap, oldBfHashOrder);
     }
 
     @SuppressWarnings("resource")
-    private static IFilter createFilter(int hash, long numElements, int bucketsPer, boolean offheap)
+    private static IFilter createFilter(int hash, long numElements, int bucketsPer, boolean offheap, boolean oldBfHashOrder)
     {
         long numBits = (numElements * bucketsPer) + BITSET_EXCESS;
         IBitSet bitset = offheap ? new OffHeapBitSet(numBits) : new OpenBitSet(numBits);
-        return new BloomFilter(hash, bitset);
+        return new BloomFilter(hash, bitset, oldBfHashOrder);
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/HeapUtils.java b/src/java/org/apache/cassandra/utils/HeapUtils.java
index 2d068de..4c84f9b 100644
--- a/src/java/org/apache/cassandra/utils/HeapUtils.java
+++ b/src/java/org/apache/cassandra/utils/HeapUtils.java

@@ -35,14 +35,13 @@
     private static final Logger logger = LoggerFactory.getLogger(HeapUtils.class);
 
     /**
-     * Generates a HEAP dump in the directory specified by the <code>HeapDumpPath</code> JVM option
-     * or in the <code>CASSANDRA_HOME</code> directory.
+     * Generates a HEAP histogram in the log file.
      */
     public static void logHeapHistogram()
     {
         try
         {
-            logger.info("Trying to log the heap histogram using jmap");
+            logger.info("Trying to log the heap histogram using jcmd");
 
             Long processId = getProcessId();
             if (processId == null)
@@ -51,14 +50,14 @@
                 return;
             }
 
-            String jmapPath = getJmapPath();
+            String jcmdPath = getJcmdPath();
 
-            // The jmap file could not be found. In this case let's default to jmap in the hope that it is in the path.
-            String jmapCommand = jmapPath == null ? "jmap" : jmapPath;
+            // The jcmd file could not be found. In this case let's default to jcmd in the hope that it is in the path.
+            String jcmdCommand = jcmdPath == null ? "jcmd" : jcmdPath;
 
-            String[] histoCommands = new String[] {jmapCommand,
-                    "-histo",
-                    processId.toString()};
+            String[] histoCommands = new String[] {jcmdCommand,
+                    processId.toString(),
+                    "GC.class_histogram"};
 
             logProcessOutput(Runtime.getRuntime().exec(histoCommands));
         }
@@ -69,10 +68,10 @@
     }
 
     /**
-     * Retrieve the path to the JMAP executable.
-     * @return the path to the JMAP executable or null if it cannot be found.
+     * Retrieve the path to the JCMD executable.
+     * @return the path to the JCMD executable or null if it cannot be found.
      */
-    private static String getJmapPath()
+    private static String getJcmdPath()
     {
         // Searching in the JAVA_HOME is safer than searching into System.getProperty("java.home") as the Oracle
         // JVM might use the JRE which do not contains jmap.
@@ -85,7 +84,7 @@
         {
             public boolean accept(File dir, String name)
             {
-                return name.startsWith("jmap");
+                return name.startsWith("jcmd");
             }
         });
         return ArrayUtils.isEmpty(files) ? null : files[0].getPath();
@@ -116,11 +115,9 @@
      */
     private static Long getProcessId()
     {
-        // Once Java 9 is ready the process API should provide a better way to get the process ID.
-        long pid = SigarLibrary.instance.getPid();
-
+        long pid = NativeLibrary.getProcessID();
         if (pid >= 0)
-            return Long.valueOf(pid);
+            return pid;
 
         return getProcessIdFromJvmName();
     }

diff --git a/src/java/org/apache/cassandra/utils/IFilter.java b/src/java/org/apache/cassandra/utils/IFilter.java
index 2f59864..b5eb2c4 100644
--- a/src/java/org/apache/cassandra/utils/IFilter.java
+++ b/src/java/org/apache/cassandra/utils/IFilter.java

@@ -21,7 +21,7 @@
 
 public interface IFilter extends SharedCloseable
 {
-    public interface FilterKey
+    interface FilterKey
     {
         /** Places the murmur3 hash of the key in the given long array of size at least two. */
         void filterHash(long[] dest);

diff --git a/src/java/org/apache/cassandra/utils/IndexedSearchIterator.java b/src/java/org/apache/cassandra/utils/IndexedSearchIterator.java
new file mode 100644
index 0000000..597e5bb
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/IndexedSearchIterator.java

@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+public interface IndexedSearchIterator<K, V> extends SearchIterator<K, V>
+{
+    /**
+     * @return true if iterator has any elements left, false otherwise
+     */
+    public boolean hasNext();
+
+    /**
+     * @return the value just recently returned by next()
+     * @throws java.util.NoSuchElementException if next() returned null
+     */
+    public V current();
+
+    /**
+     * @return the index of the value returned by current(), and just returned by next()
+     * @throws java.util.NoSuchElementException if next() returned null
+     */
+    public int indexOfCurrent();
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/utils/IntegerInterval.java b/src/java/org/apache/cassandra/utils/IntegerInterval.java
new file mode 100644
index 0000000..03ad6e0
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/IntegerInterval.java

@@ -0,0 +1,227 @@
+package org.apache.cassandra.utils;
+
+import java.util.*;
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Lists;
+import com.google.common.primitives.Longs;
+
+/**
+ * Mutable integer interval class, thread-safe.
+ * Represents the interval [lower,upper].
+ */
+public class IntegerInterval
+{
+    volatile long interval;
+    private static AtomicLongFieldUpdater<IntegerInterval> intervalUpdater =
+            AtomicLongFieldUpdater.newUpdater(IntegerInterval.class, "interval");
+
+    private IntegerInterval(long interval)
+    {
+        this.interval = interval;
+    }
+
+    public IntegerInterval(int lower, int upper)
+    {
+        this(make(lower, upper));
+    }
+
+    public IntegerInterval(IntegerInterval src)
+    {
+        this(src.interval);
+    }
+
+    public int lower()
+    {
+        return lower(interval);
+    }
+
+    public int upper()
+    {
+        return upper(interval);
+    }
+
+    /**
+     * Expands the interval to cover the given value by extending one of its sides if necessary.
+     * Mutates this. Thread-safe.
+     */
+    public void expandToCover(int value)
+    {
+        long prev;
+        int lower;
+        int upper;
+        do
+        {
+            prev = interval;
+            upper = upper(prev);
+            lower = lower(prev);
+            if (value > upper) // common case
+                upper = value;
+            else if (value < lower)
+                lower = value;
+        }
+        while (!intervalUpdater.compareAndSet(this, prev, make(lower, upper)));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Long.hashCode(interval);
+    }
+
+    @Override
+    public boolean equals(Object obj)
+    {
+        if (getClass() != obj.getClass())
+            return false;
+        IntegerInterval other = (IntegerInterval) obj;
+        return interval == other.interval;
+    }
+
+    public String toString()
+    {
+        long interval = this.interval;
+        return "[" + lower(interval) + "," + upper(interval) + "]";
+    }
+
+    private static long make(int lower, int upper)
+    {
+        assert lower <= upper;
+        return ((lower & 0xFFFFFFFFL) << 32) | upper & 0xFFFFFFFFL;
+    }
+
+    private static int lower(long interval)
+    {
+        return (int) (interval >>> 32);
+    }
+
+    private static int upper(long interval)
+    {
+        return (int) interval;
+    }
+
+
+    /**
+     * A mutable set of closed integer intervals, stored in normalized form (i.e. where overlapping intervals are
+     * converted to a single interval covering both). Thread-safe.
+     */
+    public static class Set
+    {
+        static long[] EMPTY = new long[0];
+
+        private volatile long[] ranges = EMPTY;
+
+        /**
+         * Adds an interval to the set, performing the necessary normalization.
+         */
+        public synchronized void add(int start, int end)
+        {
+            assert start <= end;
+            long[] ranges, newRanges;
+            {
+                ranges = this.ranges; // take local copy to avoid risk of it changing in the midst of operation
+
+                // extend ourselves to cover any ranges we overlap
+                // record directly preceding our end may extend past us, so take the max of our end and its
+                int rpos = Arrays.binarySearch(ranges, ((end & 0xFFFFFFFFL) << 32) | 0xFFFFFFFFL); // floor (i.e. greatest <=) of the end position
+                if (rpos < 0)
+                    rpos = (-1 - rpos) - 1;
+                if (rpos >= 0)
+                {
+                    int extend = upper(ranges[rpos]);
+                    if (extend > end)
+                        end = extend;
+                }
+    
+                // record directly preceding our start may extend into us; if it does, we take it as our start
+                int lpos = Arrays.binarySearch(ranges, ((start & 0xFFFFFFFFL) << 32) | 0); // lower (i.e. greatest <) of the start position
+                if (lpos < 0)
+                    lpos = -1 - lpos;
+                lpos -= 1;
+                if (lpos >= 0)
+                {
+                    if (upper(ranges[lpos]) >= start)
+                    {
+                        start = lower(ranges[lpos]);
+                        --lpos;
+                    }
+                }
+    
+                newRanges = new long[ranges.length - (rpos - lpos) + 1];
+                int dest = 0;
+                for (int i = 0; i <= lpos; ++i)
+                    newRanges[dest++] = ranges[i];
+                newRanges[dest++] = make(start, end);
+                for (int i = rpos + 1; i < ranges.length; ++i)
+                    newRanges[dest++] = ranges[i];
+            }
+            this.ranges = newRanges;
+        }
+
+        /**
+         * Returns true if the set completely covers the given interval.
+         */
+        public boolean covers(IntegerInterval iv)
+        {
+            long l = iv.interval;
+            return covers(lower(l), upper(l));
+        }
+
+        /**
+         * Returns true if the set completely covers the given interval.
+         */
+        public boolean covers(int start, int end)
+        {
+            long[] ranges = this.ranges; // take local copy to avoid risk of it changing in the midst of operation
+            int rpos = Arrays.binarySearch(ranges, ((start & 0xFFFFFFFFL) << 32) | 0xFFFFFFFFL);        // floor (i.e. greatest <=) of the end position
+            if (rpos < 0)
+                rpos = (-1 - rpos) - 1;
+            if (rpos == -1)
+                return false;
+            return upper(ranges[rpos]) >= end;
+        }
+
+        /**
+         * Returns a lower bound for the whole set. Will throw if set is not empty.
+         */
+        public int lowerBound()
+        {
+            return lower(ranges[0]);
+        }
+
+        /**
+         * Returns an upper bound for the whole set. Will throw if set is not empty.
+         */
+        public int upperBound()
+        {
+            long[] ranges = this.ranges; // take local copy to avoid risk of it changing in the midst of operation
+            return upper(ranges[ranges.length - 1]);
+        }
+
+        public Collection<IntegerInterval> intervals()
+        {
+            return Lists.transform(Longs.asList(ranges), iv -> new IntegerInterval(iv));
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Arrays.hashCode(ranges);
+        }
+
+        @Override
+        public boolean equals(Object obj)
+        {
+            if (getClass() != obj.getClass())
+                return false;
+            Set other = (Set) obj;
+            return Arrays.equals(ranges, other.ranges);
+        }
+
+        public String toString()
+        {
+            return "[" + intervals().stream().map(IntegerInterval::toString).collect(Collectors.joining(", ")) + "]";
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/IntervalTree.java b/src/java/org/apache/cassandra/utils/IntervalTree.java
index e857ee7..b92112e 100644
--- a/src/java/org/apache/cassandra/utils/IntervalTree.java
+++ b/src/java/org/apache/cassandra/utils/IntervalTree.java

@@ -17,21 +17,21 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.util.*;
 
 import com.google.common.base.Joiner;
-import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.utils.AbstractIterator;
 import com.google.common.collect.Iterators;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.AsymmetricOrdering.Op;
 
@@ -329,12 +329,12 @@
          * tree is to use a custom comparator, as the comparator is *not*
          * serialized.
          */
-        public IntervalTree<C, D, I> deserialize(DataInput in, int version) throws IOException
+        public IntervalTree<C, D, I> deserialize(DataInputPlus in, int version) throws IOException
         {
             return deserialize(in, version, null);
         }
 
-        public IntervalTree<C, D, I> deserialize(DataInput in, int version, Comparator<C> comparator) throws IOException
+        public IntervalTree<C, D, I> deserialize(DataInputPlus in, int version, Comparator<C> comparator) throws IOException
         {
             try
             {
@@ -355,21 +355,16 @@
             }
         }
 
-        public long serializedSize(IntervalTree<C, D, I> it, TypeSizes typeSizes, int version)
-        {
-            long size = typeSizes.sizeof(0);
-            for (Interval<C, D> interval : it)
-            {
-                size += pointSerializer.serializedSize(interval.min, typeSizes);
-                size += pointSerializer.serializedSize(interval.max, typeSizes);
-                size += dataSerializer.serializedSize(interval.data, typeSizes);
-            }
-            return size;
-        }
-
         public long serializedSize(IntervalTree<C, D, I> it, int version)
         {
-            return serializedSize(it, TypeSizes.NATIVE, version);
+            long size = TypeSizes.sizeof(0);
+            for (Interval<C, D> interval : it)
+            {
+                size += pointSerializer.serializedSize(interval.min);
+                size += pointSerializer.serializedSize(interval.max);
+                size += dataSerializer.serializedSize(interval.data);
+            }
+            return size;
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
index 0196b04..89ef129 100644
--- a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
+++ b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java

@@ -19,12 +19,15 @@
 
 import java.io.FileNotFoundException;
 import java.net.SocketException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import com.google.common.annotations.VisibleForTesting;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSError;
@@ -109,6 +112,23 @@
         killer.killCurrentJVM(t, quiet);
     }
 
+    public static void userFunctionTimeout(Throwable t)
+    {
+        switch (DatabaseDescriptor.getUserFunctionTimeoutPolicy())
+        {
+            case die:
+                // policy to give 250ms grace time to
+                ScheduledExecutors.nonPeriodicTasks.schedule(() -> killer.killCurrentJVM(t), 250, TimeUnit.MILLISECONDS);
+                break;
+            case die_immediate:
+                killer.killCurrentJVM(t);
+                break;
+            case ignore:
+                logger.error(t.getMessage());
+                break;
+        }
+    }
+
     @VisibleForTesting
     public static Killer replaceKiller(Killer newKiller) {
         Killer oldKiller = JVMStabilityInspector.killer;
@@ -119,6 +139,8 @@
     @VisibleForTesting
     public static class Killer
     {
+        private final AtomicBoolean killing = new AtomicBoolean();
+
         /**
         * Certain situations represent "Die" conditions for the server, and if so, the reason is logged and the current JVM is killed.
         *
@@ -137,8 +159,11 @@
                 t.printStackTrace(System.err);
                 logger.error("JVM state determined to be unstable.  Exiting forcefully due to:", t);
             }
-            StorageService.instance.removeShutdownHook();
-            System.exit(100);
+            if (killing.compareAndSet(false, true))
+            {
+                StorageService.instance.removeShutdownHook();
+                System.exit(100);
+            }
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/LockedDynamicList.java b/src/java/org/apache/cassandra/utils/LockedDynamicList.java
new file mode 100644
index 0000000..cbda7e5
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/LockedDynamicList.java

@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+// simple thread-unsafe skiplist that permits indexing/removal by position, insertion at the end
+// (though easily extended to insertion at any position, not necessary here)
+// we use it for sampling items by position for visiting writes in the pool of pending writes
+public class LockedDynamicList<E> extends DynamicList<E>
+{
+
+    private final ReadWriteLock lock = new ReentrantReadWriteLock();
+
+    public LockedDynamicList(int maxExpectedSize)
+    {
+        super(maxExpectedSize);
+    }
+
+    // add the value to the end of the list, and return the associated Node that permits efficient removal
+    // regardless of its future position in the list from other modifications
+    public Node<E> append(E value, int maxSize)
+    {
+        lock.writeLock().lock();
+        try
+        {
+            return super.append(value, maxSize);
+        }
+        finally
+        {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // remove the provided node and its associated value from the list
+    public void remove(Node<E> node)
+    {
+        lock.writeLock().lock();
+        try
+        {
+            super.remove(node);
+        }
+        finally
+        {
+            lock.writeLock().unlock();
+        }
+    }
+
+    // retrieve the item at the provided index, or return null if the index is past the end of the list
+    public E get(int index)
+    {
+        lock.readLock().lock();
+        try
+        {
+            return super.get(index);
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
+    }
+
+    public int size()
+    {
+        lock.readLock().lock();
+        try
+        {
+            return super.size();
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/MBeanWrapper.java b/src/java/org/apache/cassandra/utils/MBeanWrapper.java
index 1ee787d..edee6af 100644
--- a/src/java/org/apache/cassandra/utils/MBeanWrapper.java
+++ b/src/java/org/apache/cassandra/utils/MBeanWrapper.java

@@ -19,6 +19,7 @@
 package org.apache.cassandra.utils;
 
 import java.lang.management.ManagementFactory;
+import java.util.function.Consumer;
 import javax.management.MBeanServer;
 import javax.management.MalformedObjectNameException;
 import javax.management.ObjectName;
@@ -40,37 +41,49 @@
 
     // Passing true for graceful will log exceptions instead of rethrowing them
     public void registerMBean(Object obj, ObjectName mbeanName, OnException onException);
-    public void registerMBean(Object obj, ObjectName mbeanName);
+    default void registerMBean(Object obj, ObjectName mbeanName)
+    {
+        registerMBean(obj, mbeanName, OnException.THROW);
+    }
 
     public void registerMBean(Object obj, String mbeanName, OnException onException);
-    public void registerMBean(Object obj, String mbeanName);
+    default void registerMBean(Object obj, String mbeanName)
+    {
+        registerMBean(obj, mbeanName, OnException.THROW);
+    }
 
     public boolean isRegistered(ObjectName mbeanName, OnException onException);
-    public boolean isRegistered(ObjectName mbeanName);
+    default boolean isRegistered(ObjectName mbeanName)
+    {
+        return isRegistered(mbeanName, OnException.THROW);
+    }
 
     public boolean isRegistered(String mbeanName, OnException onException);
-    public boolean isRegistered(String mbeanName);
+    default boolean isRegistered(String mbeanName)
+    {
+        return isRegistered(mbeanName, OnException.THROW);
+    }
 
     public void unregisterMBean(ObjectName mbeanName, OnException onException);
-    public void unregisterMBean(ObjectName mbeanName);
+    default void unregisterMBean(ObjectName mbeanName)
+    {
+        unregisterMBean(mbeanName, OnException.THROW);
+    }
 
     public void unregisterMBean(String mbeanName, OnException onException);
-    public void unregisterMBean(String mbeanName);
+    default void unregisterMBean(String mbeanName)
+    {
+        unregisterMBean(mbeanName, OnException.THROW);
+    }
 
     static class NoOpMBeanWrapper implements MBeanWrapper
     {
         public void registerMBean(Object obj, ObjectName mbeanName, OnException onException) {}
-        public void registerMBean(Object obj, ObjectName mbeanName) {}
         public void registerMBean(Object obj, String mbeanName, OnException onException) {}
-        public void registerMBean(Object obj, String mbeanName) {}
         public boolean isRegistered(ObjectName mbeanName, OnException onException) { return false; }
-        public boolean isRegistered(ObjectName mbeanName) { return false; }
         public boolean isRegistered(String mbeanName, OnException onException) { return false; }
-        public boolean isRegistered(String mbeanName) { return false; }
         public void unregisterMBean(ObjectName mbeanName, OnException onException) {}
-        public void unregisterMBean(ObjectName mbeanName) {}
         public void unregisterMBean(String mbeanName, OnException onException) {}
-        public void unregisterMBean(String mbeanName) {}
     }
 
     static class PlatformMBeanWrapper implements MBeanWrapper
@@ -87,10 +100,6 @@
                 onException.handler.accept(e);
             }
         }
-        public void registerMBean(Object obj, ObjectName mbeanName)
-        {
-            registerMBean(obj, mbeanName, OnException.THROW);
-        }
 
         public void registerMBean(Object obj, String mbeanName, OnException onException)
         {
@@ -103,10 +112,6 @@
                 onException.handler.accept(e);
             }
         }
-        public void registerMBean(Object obj, String mbeanName)
-        {
-            registerMBean(obj, mbeanName, OnException.THROW);
-        }
 
         public boolean isRegistered(ObjectName mbeanName, OnException onException)
         {
@@ -120,10 +125,6 @@
             }
             return false;
         }
-        public boolean isRegistered(ObjectName mbeanName)
-        {
-            return isRegistered(mbeanName, OnException.THROW);
-        }
 
         public boolean isRegistered(String mbeanName, OnException onException)
         {
@@ -137,10 +138,6 @@
             }
             return false;
         }
-        public boolean isRegistered(String mbeanName)
-        {
-            return isRegistered(mbeanName, OnException.THROW);
-        }
 
         public void unregisterMBean(ObjectName mbeanName, OnException onException)
         {
@@ -153,10 +150,6 @@
                 onException.handler.accept(e);
             }
         }
-        public void unregisterMBean(ObjectName mbeanName)
-        {
-            unregisterMBean(mbeanName, OnException.THROW);
-        }
 
         public void unregisterMBean(String mbeanName, OnException onException)
         {
@@ -169,35 +162,13 @@
                 onException.handler.accept(e);
             }
         }
-        public void unregisterMBean(String mbeanName)
-        {
-            unregisterMBean(mbeanName, OnException.THROW);
-        }
     }
 
     public enum OnException
     {
-        THROW(new Consumer<Exception>()
-        {
-            public void accept(Exception e)
-            {
-                throw new RuntimeException(e);
-            }
-        }),
-        LOG(new Consumer<Exception>()
-        {
-            public void accept(Exception e)
-            {
-                logger.error("Error in MBean wrapper: ", e);
-            }
-        }),
-        IGNORE(new Consumer<Exception>()
-        {
-            public void accept(Exception e)
-            {
-
-            }
-        });
+        THROW(e -> { throw new RuntimeException(e); }),
+        LOG(e -> { logger.error("Error in MBean wrapper: ", e); }),
+        IGNORE(e -> {});
 
         private Consumer<Exception> handler;
         OnException(Consumer<Exception> handler)
@@ -205,10 +176,4 @@
             this.handler = handler;
         }
     }
-
-    // Locally defined Consumer interface, to be compatible with Java 7. Only needed for cassandra-2.2
-    interface Consumer<T>
-    {
-        void accept(T e);
-    }
 }

diff --git a/src/java/org/apache/cassandra/utils/MergeIterator.java b/src/java/org/apache/cassandra/utils/MergeIterator.java
index e61326e..0cc5306 100644
--- a/src/java/org/apache/cassandra/utils/MergeIterator.java
+++ b/src/java/org/apache/cassandra/utils/MergeIterator.java

@@ -18,10 +18,9 @@
 package org.apache.cassandra.utils;
 
 import java.io.Closeable;
-import java.io.IOException;
 import java.util.*;
 
-import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.utils.AbstractIterator;
 
 /** Merges sorted input iterators which individually contain unique items. */
 public abstract class MergeIterator<In,Out> extends AbstractIterator<Out> implements IMergeIterator<In, Out>
@@ -35,9 +34,9 @@
         this.reducer = reducer;
     }
 
-    public static <In, Out> IMergeIterator<In, Out> get(List<? extends Iterator<In>> sources,
-                                                        Comparator<In> comparator,
-                                                        Reducer<In, Out> reducer)
+    public static <In, Out> MergeIterator<In, Out> get(List<? extends Iterator<In>> sources,
+                                                       Comparator<? super In> comparator,
+                                                       Reducer<In, Out> reducer)
     {
         if (sources.size() == 1)
         {
@@ -59,9 +58,10 @@
         {
             try
             {
-                ((Closeable)iterator).close();
+                if (iterator instanceof AutoCloseable)
+                    ((AutoCloseable)iterator).close();
             }
-            catch (IOException e)
+            catch (Exception e)
             {
                 throw new RuntimeException(e);
             }
@@ -70,28 +70,87 @@
         reducer.close();
     }
 
-    /** A MergeIterator that consumes multiple input values per output value. */
-    private static final class ManyToOne<In,Out> extends MergeIterator<In,Out>
+    /**
+     * A MergeIterator that consumes multiple input values per output value.
+     *
+     * The most straightforward way to implement this is to use a {@code PriorityQueue} of iterators, {@code poll} it to
+     * find the next item to consume, then {@code add} the iterator back after advancing. This is not very efficient as
+     * {@code poll} and {@code add} in all cases require at least {@code log(size)} comparisons (usually more than
+     * {@code 2*log(size)}) per consumed item, even if the input is suitable for fast iteration.
+     *
+     * The implementation below makes use of the fact that replacing the top element in a binary heap can be done much
+     * more efficiently than separately removing it and placing it back, especially in the cases where the top iterator
+     * is to be used again very soon (e.g. when there are large sections of the output where only a limited number of
+     * input iterators overlap, which is normally the case in many practically useful situations, e.g. levelled
+     * compaction). To further improve this particular scenario, we also use a short sorted section at the start of the
+     * queue.
+     *
+     * The heap is laid out as this (for {@code SORTED_SECTION_SIZE == 2}):
+     *                 0
+     *                 |
+     *                 1
+     *                 |
+     *                 2
+     *               /   \
+     *              3     4
+     *             / \   / \
+     *             5 6   7 8
+     *            .. .. .. ..
+     * Where each line is a <= relationship.
+     *
+     * In the sorted section we can advance with a single comparison per level, while advancing a level within the heap
+     * requires two (so that we can find the lighter element to pop up).
+     * The sorted section adds a constant overhead when data is uniformly distributed among the iterators, but may up
+     * to halve the iteration time when one iterator is dominant over sections of the merged data (as is the case with
+     * non-overlapping iterators).
+     *
+     * The iterator is further complicated by the need to avoid advancing the input iterators until an output is
+     * actually requested. To achieve this {@code consume} walks the heap to find equal items without advancing the
+     * iterators, and {@code advance} moves them and restores the heap structure before any items can be consumed.
+     * 
+     * To avoid having to do additional comparisons in consume to identify the equal items, we keep track of equality
+     * between children and their parents in the heap. More precisely, the lines in the diagram above define the
+     * following relationship:
+     *   parent <= child && (parent == child) == child.equalParent
+     * We can track, make use of and update the equalParent field without any additional comparisons.
+     *
+     * For more formal definitions and proof of correctness, see CASSANDRA-8915.
+     */
+    static final class ManyToOne<In,Out> extends MergeIterator<In,Out>
     {
-        // a queue for return: all candidates must be open and have at least one item
-        protected final PriorityQueue<Candidate<In>> queue;
-        // a stack of the last consumed candidates, so that we can lazily call 'advance()'
-        // TODO: if we had our own PriorityQueue implementation we could stash items
-        // at the end of its array, so we wouldn't need this storage
-        protected final ArrayDeque<Candidate<In>> candidates;
-        public ManyToOne(List<? extends Iterator<In>> iters, Comparator<In> comp, Reducer<In, Out> reducer)
+        protected final Candidate<In>[] heap;
+
+        /** Number of non-exhausted iterators. */
+        int size;
+
+        /**
+         * Position of the deepest, right-most child that needs advancing before we can start consuming.
+         * Because advancing changes the values of the items of each iterator, the parent-chain from any position
+         * in this range that needs advancing is not in correct order. The trees rooted at any position that does
+         * not need advancing, however, retain their prior-held binary heap property.
+         */
+        int needingAdvance;
+
+        /**
+         * The number of elements to keep in order before the binary heap starts, exclusive of the top heap element.
+         */
+        static final int SORTED_SECTION_SIZE = 4;
+
+        public ManyToOne(List<? extends Iterator<In>> iters, Comparator<? super In> comp, Reducer<In, Out> reducer)
         {
             super(iters, reducer);
-            this.queue = new PriorityQueue<>(Math.max(1, iters.size()));
-            for (Iterator<In> iter : iters)
+
+            @SuppressWarnings("unchecked")
+            Candidate<In>[] heap = new Candidate[iters.size()];
+            this.heap = heap;
+            size = 0;
+
+            for (int i = 0; i < iters.size(); i++)
             {
-                Candidate<In> candidate = new Candidate<>(iter, comp);
-                if (!candidate.advance())
-                    // was empty
-                    continue;
-                this.queue.add(candidate);
+                Candidate<In> candidate = new Candidate<>(i, iters.get(i), comp);
+                heap[size++] = candidate;
             }
-            this.candidates = new ArrayDeque<>(queue.size());
+            needingAdvance = size;
         }
 
         protected final Out computeNext()
@@ -100,59 +159,237 @@
             return consume();
         }
 
-        /** Consume values by sending them to the reducer while they are equal. */
-        protected final Out consume()
+        /**
+         * Advance all iterators that need to be advanced and place them into suitable positions in the heap.
+         *
+         * By walking the iterators backwards we know that everything after the point being processed already forms
+         * correctly ordered subheaps, thus we can build a subheap rooted at the current position by only sinking down
+         * the newly advanced iterator. Because all parents of a consumed iterator are also consumed there is no way
+         * that we can process one consumed iterator but skip over its parent.
+         *
+         * The procedure is the same as the one used for the initial building of a heap in the heapsort algorithm and
+         * has a maximum number of comparisons {@code (2 * log(size) + SORTED_SECTION_SIZE / 2)} multiplied by the
+         * number of iterators whose items were consumed at the previous step, but is also at most linear in the size of
+         * the heap if the number of consumed elements is high (as it is in the initial heap construction). With non- or
+         * lightly-overlapping iterators the procedure finishes after just one (resp. a couple of) comparisons.
+         */
+        private void advance()
         {
-            reducer.onKeyChange();
-            Candidate<In> candidate = queue.peek();
-            if (candidate == null)
-                return endOfData();
-            do
+            // Turn the set of candidates into a heap.
+            for (int i = needingAdvance - 1; i >= 0; --i)
             {
-                candidate = queue.poll();
-                candidates.push(candidate);
-                reducer.reduce(candidate.item);
+                Candidate<In> candidate = heap[i];
+                /**
+                 *  needingAdvance runs to the maximum index (and deepest-right node) that may need advancing;
+                 *  since the equal items that were consumed at-once may occur in sub-heap "veins" of equality,
+                 *  not all items above this deepest-right position may have been consumed; these already form
+                 *  valid sub-heaps and can be skipped-over entirely
+                 */
+                if (candidate.needsAdvance())
+                    replaceAndSink(candidate.advance(), i);
             }
-            while (queue.peek() != null && queue.peek().compareTo(candidate) == 0);
+        }
+
+        /**
+         * Consume all items that sort like the current top of the heap. As we cannot advance the iterators to let
+         * equivalent items pop up, we walk the heap to find them and mark them as needing advance.
+         *
+         * This relies on the equalParent flag to avoid doing any comparisons.
+         */
+        private Out consume()
+        {
+            if (size == 0)
+                return endOfData();
+
+            reducer.onKeyChange();
+            assert !heap[0].equalParent;
+            reducer.reduce(heap[0].idx, heap[0].consume());
+            final int size = this.size;
+            final int sortedSectionSize = Math.min(size, SORTED_SECTION_SIZE);
+            int i;
+            consume: {
+                for (i = 1; i < sortedSectionSize; ++i)
+                {
+                    if (!heap[i].equalParent)
+                        break consume;
+                    reducer.reduce(heap[i].idx, heap[i].consume());
+                }
+                i = Math.max(i, consumeHeap(i) + 1);
+            }
+            needingAdvance = i;
             return reducer.getReduced();
         }
 
-        /** Advance and re-enqueue all items we consumed in the last iteration. */
-        protected final void advance()
+        /**
+         * Recursively consume all items equal to equalItem in the binary subheap rooted at position idx.
+         *
+         * @return the largest equal index found in this search.
+         */
+        private int consumeHeap(int idx)
         {
-            Candidate<In> candidate;
-            while ((candidate = candidates.pollFirst()) != null)
-                if (candidate.advance())
-                    queue.add(candidate);
+            if (idx >= size || !heap[idx].equalParent)
+                return -1;
+
+            reducer.reduce(heap[idx].idx, heap[idx].consume());
+            int nextIdx = (idx << 1) - (SORTED_SECTION_SIZE - 1);
+            return Math.max(idx, Math.max(consumeHeap(nextIdx), consumeHeap(nextIdx + 1)));
+        }
+
+        /**
+         * Replace an iterator in the heap with the given position and move it down the heap until it finds its proper
+         * position, pulling lighter elements up the heap.
+         *
+         * Whenever an equality is found between two elements that form a new parent-child relationship, the child's
+         * equalParent flag is set to true if the elements are equal.
+         */
+        private void replaceAndSink(Candidate<In> candidate, int currIdx)
+        {
+            if (candidate == null)
+            {
+                // Drop iterator by replacing it with the last one in the heap.
+                candidate = heap[--size];
+                heap[size] = null; // not necessary but helpful for debugging
+            }
+            // The new element will be top of its heap, at this point there is no parent to be equal to.
+            candidate.equalParent = false;
+
+            final int size = this.size;
+            final int sortedSectionSize = Math.min(size - 1, SORTED_SECTION_SIZE);
+
+            int nextIdx;
+
+            // Advance within the sorted section, pulling up items lighter than candidate.
+            while ((nextIdx = currIdx + 1) <= sortedSectionSize)
+            {
+                if (!heap[nextIdx].equalParent) // if we were greater then an (or were the) equal parent, we are >= the child
+                {
+                    int cmp = candidate.compareTo(heap[nextIdx]);
+                    if (cmp <= 0)
+                    {
+                        heap[nextIdx].equalParent = cmp == 0;
+                        heap[currIdx] = candidate;
+                        return;
+                    }
+                }
+
+                heap[currIdx] = heap[nextIdx];
+                currIdx = nextIdx;
+            }
+            // If size <= SORTED_SECTION_SIZE, nextIdx below will be no less than size,
+            // because currIdx == sortedSectionSize == size - 1 and nextIdx becomes
+            // (size - 1) * 2) - (size - 1 - 1) == size.
+
+            // Advance in the binary heap, pulling up the lighter element from the two at each level.
+            while ((nextIdx = (currIdx * 2) - (sortedSectionSize - 1)) + 1 < size)
+            {
+                if (!heap[nextIdx].equalParent)
+                {
+                    if (!heap[nextIdx + 1].equalParent)
+                    {
+                        // pick the smallest of the two children
+                        int siblingCmp = heap[nextIdx + 1].compareTo(heap[nextIdx]);
+                        if (siblingCmp < 0)
+                            ++nextIdx;
+
+                        // if we're smaller than this, we are done, and must only restore the heap and equalParent properties
+                        int cmp = candidate.compareTo(heap[nextIdx]);
+                        if (cmp <= 0)
+                        {
+                            if (cmp == 0)
+                            {
+                                heap[nextIdx].equalParent = true;
+                                if (siblingCmp == 0) // siblingCmp == 0 => nextIdx is the left child
+                                    heap[nextIdx + 1].equalParent = true;
+                            }
+
+                            heap[currIdx] = candidate;
+                            return;
+                        }
+
+                        if (siblingCmp == 0)
+                        {
+                            // siblingCmp == 0 => nextIdx is still the left child
+                            // if the two siblings were equal, and we are inserting something greater, we will
+                            // pull up the left one; this means the right gets an equalParent
+                            heap[nextIdx + 1].equalParent = true;
+                        }
+                    }
+                    else
+                        ++nextIdx;  // descend down the path where we found the equal child
+                }
+
+                heap[currIdx] = heap[nextIdx];
+                currIdx = nextIdx;
+            }
+
+            // our loop guard ensures there are always two siblings to process; typically when we exit the loop we will
+            // be well past the end of the heap and this next condition will match...
+            if (nextIdx >= size)
+            {
+                heap[currIdx] = candidate;
+                return;
+            }
+
+            // ... but sometimes we will have one last child to compare against, that has no siblings
+            if (!heap[nextIdx].equalParent)
+            {
+                int cmp = candidate.compareTo(heap[nextIdx]);
+                if (cmp <= 0)
+                {
+                    heap[nextIdx].equalParent = cmp == 0;
+                    heap[currIdx] = candidate;
+                    return;
+                }
+            }
+
+            heap[currIdx] = heap[nextIdx];
+            heap[nextIdx] = candidate;
         }
     }
 
     // Holds and is comparable by the head item of an iterator it owns
     protected static final class Candidate<In> implements Comparable<Candidate<In>>
     {
-        private final Iterator<In> iter;
-        private final Comparator<In> comp;
+        private final Iterator<? extends In> iter;
+        private final Comparator<? super In> comp;
+        private final int idx;
         private In item;
+        boolean equalParent;
 
-        public Candidate(Iterator<In> iter, Comparator<In> comp)
+        public Candidate(int idx, Iterator<? extends In> iter, Comparator<? super In> comp)
         {
             this.iter = iter;
             this.comp = comp;
+            this.idx = idx;
         }
 
-        /** @return True if our iterator had an item, and it is now available */
-        protected boolean advance()
+        /** @return this if our iterator had an item, and it is now available, otherwise null */
+        protected Candidate<In> advance()
         {
             if (!iter.hasNext())
-                return false;
+                return null;
             item = iter.next();
-            return true;
+            return this;
         }
 
         public int compareTo(Candidate<In> that)
         {
+            assert item != null && that.item != null;
             return comp.compare(this.item, that.item);
         }
+
+        public In consume()
+        {
+            In temp = item;
+            item = null;
+            assert temp != null;
+            return temp;
+        }
+
+        public boolean needsAdvance()
+        {
+            return item == null;
+        }
     }
 
     /** Accumulator that collects values of type A, and outputs a value of type B. */
@@ -170,7 +407,7 @@
          * combine this object with the previous ones.
          * intermediate state is up to your implementation.
          */
-        public abstract void reduce(In current);
+        public abstract void reduce(int idx, In current);
 
         /** @return The last object computed by reduce */
         protected abstract Out getReduced();
@@ -202,7 +439,7 @@
             if (!source.hasNext())
                 return endOfData();
             reducer.onKeyChange();
-            reducer.reduce(source.next());
+            reducer.reduce(0, source.next());
             return reducer.getReduced();
         }
     }

diff --git a/src/java/org/apache/cassandra/utils/MerkleTree.java b/src/java/org/apache/cassandra/utils/MerkleTree.java
index 1e0f505..22b61e8 100644
--- a/src/java/org/apache/cassandra/utils/MerkleTree.java
+++ b/src/java/org/apache/cassandra/utils/MerkleTree.java

@@ -22,10 +22,13 @@
 import java.io.Serializable;
 import java.util.*;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
-import com.google.common.collect.AbstractIterator;
 import com.google.common.collect.PeekingIterator;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.IPartitionerDependentSerializer;
@@ -33,7 +36,9 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
 
 /**
  * A MerkleTree implemented as a binary tree.
@@ -57,6 +62,8 @@
  */
 public class MerkleTree implements Serializable
 {
+    private static Logger logger = LoggerFactory.getLogger(MerkleTree.class);
+
     public static final MerkleTreeSerializer serializer = new MerkleTreeSerializer();
     private static final long serialVersionUID = 2L;
 
@@ -91,7 +98,7 @@
             Hashable.serializer.serialize(mt.root, out, version);
         }
 
-        public MerkleTree deserialize(DataInput in, int version) throws IOException
+        public MerkleTree deserialize(DataInputPlus in, int version) throws IOException
         {
             byte hashdepth = in.readByte();
             long maxsize = in.readLong();
@@ -120,9 +127,9 @@
         public long serializedSize(MerkleTree mt, int version)
         {
             long size = 1 // mt.hashdepth
-                 + TypeSizes.NATIVE.sizeof(mt.maxsize)
-                 + TypeSizes.NATIVE.sizeof(mt.size)
-                 + TypeSizes.NATIVE.sizeof(mt.partitioner.getClass().getCanonicalName());
+                 + TypeSizes.sizeof(mt.maxsize)
+                 + TypeSizes.sizeof(mt.size)
+                 + TypeSizes.sizeof(mt.partitioner.getClass().getCanonicalName());
 
             // full range
             size += Token.serializer.serializedSize(mt.fullRange.left, version);
@@ -240,8 +247,20 @@
 
         if (lhash != null && rhash != null && !Arrays.equals(lhash, rhash))
         {
-            if (FULLY_INCONSISTENT == differenceHelper(ltree, rtree, diff, active))
+            if(lnode instanceof  Leaf || rnode instanceof Leaf)
+            {
+                logger.debug("Digest mismatch detected among leaf nodes {}, {}", lnode, rnode);
                 diff.add(active);
+            }
+            else
+            {
+                logger.debug("Digest mismatch detected, traversing trees [{}, {}]", ltree, rtree);
+                if (FULLY_INCONSISTENT == differenceHelper(ltree, rtree, diff, active))
+                {
+                    logger.debug("Range {} fully inconsistent", active);
+                    diff.add(active);
+                }
+            }
         }
         else if (lhash == null || rhash == null)
             diff.add(active);
@@ -255,14 +274,25 @@
      * Takes two trees and a range for which they have hashes, but are inconsistent.
      * @return FULLY_INCONSISTENT if active is inconsistent, PARTIALLY_INCONSISTENT if only a subrange is inconsistent.
      */
+    @VisibleForTesting
     static int differenceHelper(MerkleTree ltree, MerkleTree rtree, List<TreeRange> diff, TreeRange active)
     {
         if (active.depth == Byte.MAX_VALUE)
             return CONSISTENT;
 
         Token midpoint = ltree.partitioner().midpoint(active.left, active.right);
+        // sanity check for midpoint calculation, see CASSANDRA-13052
+        if (midpoint.equals(active.left) || midpoint.equals(active.right))
+        {
+            // If the midpoint equals either the left or the right, we have a range that's too small to split - we'll simply report the
+            // whole range as inconsistent
+            logger.debug("({}) No sane midpoint ({}) for range {} , marking whole range as inconsistent", active.depth, midpoint, active);
+            return FULLY_INCONSISTENT;
+        }
+
         TreeDifference left = new TreeDifference(active.left, midpoint, inc(active.depth));
         TreeDifference right = new TreeDifference(midpoint, active.right, inc(active.depth));
+        logger.debug("({}) Hashing sub-ranges [{}, {}] for {} divided by midpoint {}", active.depth, left, right, active, midpoint);
         byte[] lhash, rhash;
         Hashable lnode, rnode;
 
@@ -277,9 +307,16 @@
         int ldiff = CONSISTENT;
         boolean lreso = lhash != null && rhash != null;
         if (lreso && !Arrays.equals(lhash, rhash))
-            ldiff = differenceHelper(ltree, rtree, diff, left);
+        {
+            logger.debug("({}) Inconsistent digest on left sub-range {}: [{}, {}]", active.depth, left, lnode, rnode);
+            if (lnode instanceof Leaf) ldiff = FULLY_INCONSISTENT;
+            else ldiff = differenceHelper(ltree, rtree, diff, left);
+        }
         else if (!lreso)
+        {
+            logger.debug("({}) Left sub-range fully inconsistent {}", active.depth, right);
             ldiff = FULLY_INCONSISTENT;
+        }
 
         // see if we should recurse right
         lnode = ltree.find(right);
@@ -292,25 +329,36 @@
         int rdiff = CONSISTENT;
         boolean rreso = lhash != null && rhash != null;
         if (rreso && !Arrays.equals(lhash, rhash))
-            rdiff = differenceHelper(ltree, rtree, diff, right);
+        {
+            logger.debug("({}) Inconsistent digest on right sub-range {}: [{}, {}]", active.depth, right, lnode, rnode);
+            if (rnode instanceof Leaf) rdiff = FULLY_INCONSISTENT;
+            else rdiff = differenceHelper(ltree, rtree, diff, right);
+        }
         else if (!rreso)
+        {
+            logger.debug("({}) Right sub-range fully inconsistent {}", active.depth, right);
             rdiff = FULLY_INCONSISTENT;
+        }
 
         if (ldiff == FULLY_INCONSISTENT && rdiff == FULLY_INCONSISTENT)
         {
             // both children are fully inconsistent
+            logger.debug("({}) Fully inconsistent range [{}, {}]", active.depth, left, right);
             return FULLY_INCONSISTENT;
         }
         else if (ldiff == FULLY_INCONSISTENT)
         {
+            logger.debug("({}) Adding left sub-range to diff as fully inconsistent {}", active.depth, left);
             diff.add(left);
             return PARTIALLY_INCONSISTENT;
         }
         else if (rdiff == FULLY_INCONSISTENT)
         {
+            logger.debug("({}) Adding right sub-range to diff as fully inconsistent {}", active.depth, right);
             diff.add(right);
             return PARTIALLY_INCONSISTENT;
         }
+        logger.debug("({}) Range {} partially inconstent", active.depth, active);
         return PARTIALLY_INCONSISTENT;
     }
 
@@ -826,12 +874,15 @@
         {
             public void serialize(Inner inner, DataOutputPlus out, int version) throws IOException
             {
-                if (inner.hash == null)
-                    out.writeInt(-1);
-                else
+                if (version < MessagingService.VERSION_30)
                 {
-                    out.writeInt(inner.hash.length);
-                    out.write(inner.hash);
+                    if (inner.hash == null)
+                        out.writeInt(-1);
+                    else
+                    {
+                        out.writeInt(inner.hash.length);
+                        out.write(inner.hash);
+                    }
                 }
                 Token.serializer.serialize(inner.token, out, version);
                 Hashable.serializer.serialize(inner.lchild, out, version);
@@ -840,10 +891,13 @@
 
             public Inner deserialize(DataInput in, IPartitioner p, int version) throws IOException
             {
-                int hashLen = in.readInt();
-                byte[] hash = hashLen >= 0 ? new byte[hashLen] : null;
-                if (hash != null)
-                    in.readFully(hash);
+                if (version < MessagingService.VERSION_30)
+                {
+                    int hashLen = in.readInt();
+                    byte[] hash = hashLen >= 0 ? new byte[hashLen] : null;
+                    if (hash != null)
+                        in.readFully(hash);
+                }
                 Token token = Token.serializer.deserialize(in, p, version);
                 Hashable lchild = Hashable.serializer.deserialize(in, p, version);
                 Hashable rchild = Hashable.serializer.deserialize(in, p, version);
@@ -852,9 +906,13 @@
 
             public long serializedSize(Inner inner, int version)
             {
-                int size = inner.hash == null
-                ? TypeSizes.NATIVE.sizeof(-1)
-                        : TypeSizes.NATIVE.sizeof(inner.hash().length) + inner.hash().length;
+                long size = 0;
+                if (version < MessagingService.VERSION_30)
+                {
+                    size += inner.hash == null
+                                       ? TypeSizes.sizeof(-1)
+                                       : TypeSizes.sizeof(inner.hash().length) + inner.hash().length;
+                }
 
                 size += Token.serializer.serializedSize(inner.token, version)
                 + Hashable.serializer.serializedSize(inner.lchild, version)
@@ -909,18 +967,24 @@
             {
                 if (leaf.hash == null)
                 {
-                    out.writeInt(-1);
+                    if (version < MessagingService.VERSION_30)
+                        out.writeInt(-1);
+                    else
+                        out.writeByte(-1);
                 }
                 else
                 {
-                    out.writeInt(leaf.hash.length);
+                    if (version < MessagingService.VERSION_30)
+                        out.writeInt(leaf.hash.length);
+                    else
+                        out.writeByte(leaf.hash.length);
                     out.write(leaf.hash);
                 }
             }
 
             public Leaf deserialize(DataInput in, IPartitioner p, int version) throws IOException
             {
-                int hashLen = in.readInt();
+                int hashLen = version < MessagingService.VERSION_30 ? in.readInt() : in.readByte();
                 byte[] hash = hashLen < 0 ? null : new byte[hashLen];
                 if (hash != null)
                     in.readFully(hash);
@@ -929,9 +993,12 @@
 
             public long serializedSize(Leaf leaf, int version)
             {
-                return leaf.hash == null
-                     ? TypeSizes.NATIVE.sizeof(-1)
-                     : TypeSizes.NATIVE.sizeof(leaf.hash().length) + leaf.hash().length;
+                long size = version < MessagingService.VERSION_30 ? TypeSizes.sizeof(1) : 1;
+                if (leaf.hash != null)
+                {
+                    size += leaf.hash().length;
+                }
+                return size;
             }
         }
     }

diff --git a/src/java/org/apache/cassandra/utils/MerkleTrees.java b/src/java/org/apache/cassandra/utils/MerkleTrees.java
new file mode 100644
index 0000000..4ae55ab
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/MerkleTrees.java

@@ -0,0 +1,446 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.*;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.PeekingIterator;
+import org.slf4j.Logger;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+
+/**
+ * Wrapper class for handling of multiple MerkleTrees at once.
+ * 
+ * The MerkleTree's are divided in Ranges of non-overlapping tokens.
+ */
+public class MerkleTrees implements Iterable<Map.Entry<Range<Token>, MerkleTree>>
+{
+    public static final MerkleTreesSerializer serializer = new MerkleTreesSerializer();
+
+    private Map<Range<Token>, MerkleTree> merkleTrees = new TreeMap<>(new TokenRangeComparator());
+
+    private IPartitioner partitioner;
+
+    /**
+     * Creates empty MerkleTrees object.
+     * 
+     * @param partitioner The partitioner to use
+     */
+    public MerkleTrees(IPartitioner partitioner)
+    {
+        this(partitioner, new ArrayList<>());
+    }
+
+    private MerkleTrees(IPartitioner partitioner, Collection<MerkleTree> merkleTrees)
+    {
+        this.partitioner = partitioner;
+        addTrees(merkleTrees);
+    }
+
+    /**
+     * Get the ranges that these merkle trees covers.
+     * 
+     * @return
+     */
+    public Collection<Range<Token>> ranges()
+    {
+        return merkleTrees.keySet();
+    }
+
+    /**
+     * Get the partitioner in use.
+     * 
+     * @return
+     */
+    public IPartitioner partitioner()
+    {
+        return partitioner;
+    }
+
+    /**
+     * Add merkle tree's with the defined maxsize and ranges.
+     * 
+     * @param maxsize
+     * @param ranges
+     */
+    public void addMerkleTrees(int maxsize, Collection<Range<Token>> ranges)
+    {
+        for (Range<Token> range : ranges)
+        {
+            addMerkleTree(maxsize, range);
+        }
+    }
+
+    /**
+     * Add a MerkleTree with the defined size and range.
+     * 
+     * @param maxsize
+     * @param range
+     * @return The created merkle tree.
+     */
+    public MerkleTree addMerkleTree(int maxsize, Range<Token> range)
+    {
+        return addMerkleTree(maxsize, MerkleTree.RECOMMENDED_DEPTH, range);
+    }
+
+    @VisibleForTesting
+    public MerkleTree addMerkleTree(int maxsize, byte hashdepth, Range<Token> range)
+    {
+        MerkleTree tree = new MerkleTree(partitioner, range, hashdepth, maxsize);
+        addTree(tree);
+
+        return tree;
+    }
+
+    /**
+     * Get the MerkleTree.Range responsible for the given token.
+     * 
+     * @param t
+     * @return
+     */
+    @VisibleForTesting
+    public MerkleTree.TreeRange get(Token t)
+    {
+        return getMerkleTree(t).get(t);
+    }
+
+    /**
+     * Init all MerkleTree's with an even tree distribution.
+     */
+    public void init()
+    {
+        for (Range<Token> range : merkleTrees.keySet())
+        {
+            init(range);
+        }
+    }
+
+    /**
+     * Init a selected MerkleTree with an even tree distribution.
+     * 
+     * @param range
+     */
+    public void init(Range<Token> range)
+    {
+        merkleTrees.get(range).init();
+    }
+
+    /**
+     * Split the MerkleTree responsible for the given token.
+     * 
+     * @param t
+     * @return
+     */
+    public boolean split(Token t)
+    {
+        return getMerkleTree(t).split(t);
+    }
+
+    /**
+     * Invalidate the MerkleTree responsible for the given token.
+     * 
+     * @param t
+     */
+    @VisibleForTesting
+    public void invalidate(Token t)
+    {
+        getMerkleTree(t).invalidate(t);
+    }
+
+    /**
+     * Get the MerkleTree responsible for the given token range.
+     * 
+     * @param range
+     * @return
+     */
+    public MerkleTree getMerkleTree(Range<Token> range)
+    {
+        return merkleTrees.get(range);
+    }
+
+    public long size()
+    {
+        long size = 0;
+
+        for (MerkleTree tree : merkleTrees.values())
+        {
+            size += tree.size();
+        }
+
+        return size;
+    }
+
+    @VisibleForTesting
+    public void maxsize(Range<Token> range, int maxsize)
+    {
+        getMerkleTree(range).maxsize(maxsize);
+    }
+
+    /**
+     * Get the MerkleTree responsible for the given token.
+     * 
+     * @param t
+     * @return The given MerkleTree or null if none exist.
+     */
+    private MerkleTree getMerkleTree(Token t)
+    {
+        for (Range<Token> range : merkleTrees.keySet())
+        {
+            if (range.contains(t))
+                return merkleTrees.get(range);
+        }
+
+        throw new AssertionError("Expected tree for token " + t);
+    }
+
+    private void addTrees(Collection<MerkleTree> trees)
+    {
+        for (MerkleTree tree : trees)
+        {
+            addTree(tree);
+        }
+    }
+
+    private void addTree(MerkleTree tree)
+    {
+        assert validateNonOverlapping(tree) : "Range [" + tree.fullRange + "] is intersecting an existing range";
+
+        merkleTrees.put(tree.fullRange, tree);
+    }
+
+    private boolean validateNonOverlapping(MerkleTree tree)
+    {
+        for (Range<Token> range : merkleTrees.keySet())
+        {
+            if (tree.fullRange.intersects(range))
+                return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * Get an iterator for all the invalids generated by the MerkleTrees.
+     * 
+     * @return
+     */
+    public TreeRangeIterator invalids()
+    {
+        return new TreeRangeIterator();
+    }
+
+    /**
+     * Log the row count per leaf for all MerkleTrees.
+     * 
+     * @param logger
+     */
+    public void logRowCountPerLeaf(Logger logger)
+    {
+        for (MerkleTree tree : merkleTrees.values())
+        {
+            tree.histogramOfRowCountPerLeaf().log(logger);
+        }
+    }
+
+    /**
+     * Log the row size per leaf for all MerkleTrees.
+     * 
+     * @param logger
+     */
+    public void logRowSizePerLeaf(Logger logger)
+    {
+        for (MerkleTree tree : merkleTrees.values())
+        {
+            tree.histogramOfRowSizePerLeaf().log(logger);
+        }
+    }
+
+    @VisibleForTesting
+    public byte[] hash(Range<Token> range)
+    {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        boolean hashed = false;
+
+        try
+        {
+            for (Range<Token> rt : merkleTrees.keySet())
+            {
+                if (rt.intersects(range))
+                {
+                    byte[] bytes = merkleTrees.get(rt).hash(range);
+                    if (bytes != null)
+                    {
+                        baos.write(bytes);
+                        hashed = true;
+                    }
+                }
+            }
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException("Unable to append merkle tree hash to result");
+        }
+        
+        return hashed ? baos.toByteArray() : null;
+    }
+
+    /**
+     * Get an iterator of all ranges and their MerkleTrees.
+     */
+    public Iterator<Map.Entry<Range<Token>, MerkleTree>> iterator()
+    {
+        return merkleTrees.entrySet().iterator();
+    }
+
+    public long rowCount()
+    {
+        long totalCount = 0;
+        for (MerkleTree tree : merkleTrees.values())
+        {
+            totalCount += tree.rowCount();
+        }
+        return totalCount;
+    }
+
+    public class TreeRangeIterator extends AbstractIterator<MerkleTree.TreeRange> implements
+            Iterable<MerkleTree.TreeRange>,
+            PeekingIterator<MerkleTree.TreeRange>
+    {
+        private final Iterator<MerkleTree> it;
+
+        private MerkleTree.TreeRangeIterator current = null;
+
+        private TreeRangeIterator()
+        {
+            it = merkleTrees.values().iterator();
+        }
+
+        public MerkleTree.TreeRange computeNext()
+        {
+            if (current == null || !current.hasNext())
+                return nextIterator();
+
+            return current.next();
+        }
+
+        private MerkleTree.TreeRange nextIterator()
+        {
+            if (it.hasNext())
+            {
+                current = it.next().invalids();
+
+                return current.next();
+            }
+
+            return endOfData();
+        }
+
+        public Iterator<MerkleTree.TreeRange> iterator()
+        {
+            return this;
+        }
+    }
+
+    /**
+     * Get the differences between the two sets of MerkleTrees.
+     * 
+     * @param ltree
+     * @param rtree
+     * @return
+     */
+    public static List<Range<Token>> difference(MerkleTrees ltree, MerkleTrees rtree)
+    {
+        List<Range<Token>> differences = new ArrayList<>();
+        for (MerkleTree tree : ltree.merkleTrees.values())
+        {
+            differences.addAll(MerkleTree.difference(tree, rtree.getMerkleTree(tree.fullRange)));
+        }
+        return differences;
+    }
+
+    public static class MerkleTreesSerializer implements IVersionedSerializer<MerkleTrees>
+    {
+        public void serialize(MerkleTrees trees, DataOutputPlus out, int version) throws IOException
+        {
+            out.writeInt(trees.merkleTrees.size());
+            for (MerkleTree tree : trees.merkleTrees.values())
+            {
+                MerkleTree.serializer.serialize(tree, out, version);
+            }
+        }
+
+        public MerkleTrees deserialize(DataInputPlus in, int version) throws IOException
+        {
+            IPartitioner partitioner = null;
+            int nTrees = in.readInt();
+            Collection<MerkleTree> trees = new ArrayList<>(nTrees);
+            if (nTrees > 0)
+            {
+                for (int i = 0; i < nTrees; i++)
+                {
+                    MerkleTree tree = MerkleTree.serializer.deserialize(in, version);
+                    trees.add(tree);
+
+                    if (partitioner == null)
+                        partitioner = tree.partitioner();
+                    else
+                        assert tree.partitioner() == partitioner;
+                }
+            }
+
+            return new MerkleTrees(partitioner, trees);
+        }
+
+        public long serializedSize(MerkleTrees trees, int version)
+        {
+            assert trees != null;
+
+            long size = TypeSizes.sizeof(trees.merkleTrees.size());
+            for (MerkleTree tree : trees.merkleTrees.values())
+            {
+                size += MerkleTree.serializer.serializedSize(tree, version);
+            }
+            return size;
+        }
+
+    }
+
+    private static class TokenRangeComparator implements Comparator<Range<Token>>
+    {
+        @Override
+        public int compare(Range<Token> rt1, Range<Token> rt2)
+        {
+            if (rt1.left.compareTo(rt2.left) == 0)
+                return 0;
+
+            return rt1.compareTo(rt2);
+        }
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/NativeLibrary.java b/src/java/org/apache/cassandra/utils/NativeLibrary.java
new file mode 100644
index 0000000..0cc690e
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/NativeLibrary.java

@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.io.File;
+import java.io.FileDescriptor;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.nio.channels.FileChannel;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.sun.jna.LastErrorException;
+
+import org.apache.cassandra.io.FSWriteError;
+
+import static org.apache.cassandra.utils.NativeLibrary.OSType.LINUX;
+import static org.apache.cassandra.utils.NativeLibrary.OSType.MAC;
+import static org.apache.cassandra.utils.NativeLibrary.OSType.WINDOWS;
+import static org.apache.cassandra.utils.NativeLibrary.OSType.AIX;
+
+public final class NativeLibrary
+{
+    private static final Logger logger = LoggerFactory.getLogger(NativeLibrary.class);
+
+    public enum OSType
+    {
+        LINUX,
+        MAC,
+        WINDOWS,
+        AIX,
+        OTHER;
+    }
+
+    private static final OSType osType;
+
+    private static final int MCL_CURRENT;
+    private static final int MCL_FUTURE;
+
+    private static final int ENOMEM = 12;
+
+    private static final int F_GETFL   = 3;  /* get file status flags */
+    private static final int F_SETFL   = 4;  /* set file status flags */
+    private static final int F_NOCACHE = 48; /* Mac OS X specific flag, turns cache on/off */
+    private static final int O_DIRECT  = 040000; /* fcntl.h */
+    private static final int O_RDONLY  = 00000000; /* fcntl.h */
+
+    private static final int POSIX_FADV_NORMAL     = 0; /* fadvise.h */
+    private static final int POSIX_FADV_RANDOM     = 1; /* fadvise.h */
+    private static final int POSIX_FADV_SEQUENTIAL = 2; /* fadvise.h */
+    private static final int POSIX_FADV_WILLNEED   = 3; /* fadvise.h */
+    private static final int POSIX_FADV_DONTNEED   = 4; /* fadvise.h */
+    private static final int POSIX_FADV_NOREUSE    = 5; /* fadvise.h */
+
+    private static final NativeLibraryWrapper wrappedLibrary;
+    private static boolean jnaLockable = false;
+
+    static
+    {
+        // detect the OS type the JVM is running on and then set the CLibraryWrapper
+        // instance to a compatable implementation of CLibraryWrapper for that OS type
+        osType = getOsType();
+        switch (osType)
+        {
+            case MAC: wrappedLibrary = new NativeLibraryDarwin(); break;
+            case WINDOWS: wrappedLibrary = new NativeLibraryWindows(); break;
+            case LINUX:
+            case AIX:
+            case OTHER:
+            default: wrappedLibrary = new NativeLibraryLinux();
+        }
+
+        if (System.getProperty("os.arch").toLowerCase().contains("ppc"))
+        {
+            if (osType == LINUX)
+            {
+               MCL_CURRENT = 0x2000;
+               MCL_FUTURE = 0x4000;
+            }
+            else if (osType == AIX)
+            {
+                MCL_CURRENT = 0x100;
+                MCL_FUTURE = 0x200;
+            }
+            else
+            {
+                MCL_CURRENT = 1;
+                MCL_FUTURE = 2;
+            }
+        }
+        else
+        {
+            MCL_CURRENT = 1;
+            MCL_FUTURE = 2;
+        }
+    }
+
+    private NativeLibrary() {}
+
+    /**
+     * @return the detected OSType of the Operating System running the JVM using crude string matching
+     */
+    private static OSType getOsType()
+    {
+        String osName = System.getProperty("os.name").toLowerCase();
+        if (osName.contains("mac"))
+            return MAC;
+        else if (osName.contains("windows"))
+            return WINDOWS;
+        else if (osName.contains("aix"))
+            return AIX;
+        else
+            // fall back to the Linux impl for all unknown OS types until otherwise implicitly supported as needed
+            return LINUX;
+    }
+
+    private static int errno(RuntimeException e)
+    {
+        assert e instanceof LastErrorException;
+        try
+        {
+            return ((LastErrorException) e).getErrorCode();
+        }
+        catch (NoSuchMethodError x)
+        {
+            logger.warn("Obsolete version of JNA present; unable to read errno. Upgrade to JNA 3.2.7 or later");
+            return 0;
+        }
+    }
+
+    /**
+     * Checks if the library has been successfully linked.
+     * @return {@code true} if the library has been successfully linked, {@code false} otherwise.
+     */
+    public static boolean isAvailable()
+    {
+        return wrappedLibrary.isAvailable();
+    }
+
+    public static boolean jnaMemoryLockable()
+    {
+        return jnaLockable;
+    }
+
+    public static void tryMlockall()
+    {
+        try
+        {
+            wrappedLibrary.callMlockall(MCL_CURRENT);
+            jnaLockable = true;
+            logger.info("JNA mlockall successful");
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // this will have already been logged by CLibrary, no need to repeat it
+        }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            if (errno(e) == ENOMEM && osType == LINUX)
+            {
+                logger.warn("Unable to lock JVM memory (ENOMEM)."
+                        + " This can result in part of the JVM being swapped out, especially with mmapped I/O enabled."
+                        + " Increase RLIMIT_MEMLOCK or run Cassandra as root.");
+            }
+            else if (osType != MAC)
+            {
+                // OS X allows mlockall to be called, but always returns an error
+                logger.warn("Unknown mlockall error {}", errno(e));
+            }
+        }
+    }
+
+    public static void trySkipCache(String path, long offset, long len)
+    {
+        File f = new File(path);
+        if (!f.exists())
+            return;
+
+        try (FileInputStream fis = new FileInputStream(f))
+        {
+            trySkipCache(getfd(fis.getChannel()), offset, len, path);
+        }
+        catch (IOException e)
+        {
+            logger.warn("Could not skip cache", e);
+        }
+    }
+
+    public static void trySkipCache(int fd, long offset, long len, String path)
+    {
+        if (len == 0)
+            trySkipCache(fd, 0, 0, path);
+
+        while (len > 0)
+        {
+            int sublen = (int) Math.min(Integer.MAX_VALUE, len);
+            trySkipCache(fd, offset, sublen, path);
+            len -= sublen;
+            offset -= sublen;
+        }
+    }
+
+    public static void trySkipCache(int fd, long offset, int len, String path)
+    {
+        if (fd < 0)
+            return;
+
+        try
+        {
+            if (osType == LINUX)
+            {
+                int result = wrappedLibrary.callPosixFadvise(fd, offset, len, POSIX_FADV_DONTNEED);
+                if (result != 0)
+                    NoSpamLogger.log(
+                            logger,
+                            NoSpamLogger.Level.WARN,
+                            10,
+                            TimeUnit.MINUTES,
+                            "Failed trySkipCache on file: {} Error: " + wrappedLibrary.callStrerror(result).getString(0),
+                            path);
+            }
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // if JNA is unavailable just skipping Direct I/O
+            // instance of this class will act like normal RandomAccessFile
+        }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            logger.warn(String.format("posix_fadvise(%d, %d) failed, errno (%d).", fd, offset, errno(e)));
+        }
+    }
+
+    public static int tryFcntl(int fd, int command, int flags)
+    {
+        // fcntl return value may or may not be useful, depending on the command
+        int result = -1;
+
+        try
+        {
+            result = wrappedLibrary.callFcntl(fd, command, flags);
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // if JNA is unavailable just skipping
+        }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            logger.warn(String.format("fcntl(%d, %d, %d) failed, errno (%d).", fd, command, flags, errno(e)));
+        }
+
+        return result;
+    }
+
+    public static int tryOpenDirectory(String path)
+    {
+        int fd = -1;
+
+        try
+        {
+            return wrappedLibrary.callOpen(path, O_RDONLY);
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // JNA is unavailable just skipping Direct I/O
+        }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            logger.warn(String.format("open(%s, O_RDONLY) failed, errno (%d).", path, errno(e)));
+        }
+
+        return fd;
+    }
+
+    public static void trySync(int fd)
+    {
+        if (fd == -1)
+            return;
+
+        try
+        {
+            wrappedLibrary.callFsync(fd);
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // JNA is unavailable just skipping Direct I/O
+        }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            String errMsg = String.format("fsync(%s) failed, errno %s", fd, errno(e));
+            logger.warn(errMsg);
+            throw new FSWriteError(e, errMsg);
+        }
+    }
+
+    public static void tryCloseFD(int fd)
+    {
+        if (fd == -1)
+            return;
+
+        try
+        {
+            wrappedLibrary.callClose(fd);
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            // JNA is unavailable just skipping Direct I/O
+        }
+        catch (RuntimeException e)
+        {
+            if (!(e instanceof LastErrorException))
+                throw e;
+
+            String errMsg = String.format("close(%d) failed, errno (%d).", fd, errno(e));
+            logger.warn(errMsg);
+            throw new FSWriteError(e, errMsg);
+        }
+    }
+
+    public static int getfd(FileChannel channel)
+    {
+        Field field = FBUtilities.getProtectedField(channel.getClass(), "fd");
+
+        try
+        {
+            return getfd((FileDescriptor)field.get(channel));
+        }
+        catch (IllegalArgumentException|IllegalAccessException e)
+        {
+            logger.warn("Unable to read fd field from FileChannel");
+        }
+        return -1;
+    }
+
+    /**
+     * Get system file descriptor from FileDescriptor object.
+     * @param descriptor - FileDescriptor objec to get fd from
+     * @return file descriptor, -1 or error
+     */
+    public static int getfd(FileDescriptor descriptor)
+    {
+        Field field = FBUtilities.getProtectedField(descriptor.getClass(), "fd");
+
+        try
+        {
+            return field.getInt(descriptor);
+        }
+        catch (Exception e)
+        {
+            JVMStabilityInspector.inspectThrowable(e);
+            logger.warn("Unable to read fd field from FileDescriptor");
+        }
+
+        return -1;
+    }
+
+    /**
+     * @return the PID of the JVM or -1 if we failed to get the PID
+     */
+    public static long getProcessID()
+    {
+        try
+        {
+            return wrappedLibrary.callGetpid();
+        }
+        catch (Exception e)
+        {
+            logger.info("Failed to get PID from JNA", e);
+        }
+
+        return -1;
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/NativeLibraryDarwin.java b/src/java/org/apache/cassandra/utils/NativeLibraryDarwin.java
new file mode 100644
index 0000000..6ed18d1
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/NativeLibraryDarwin.java

@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.util.Collections;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.sun.jna.LastErrorException;
+import com.sun.jna.Native;
+import com.sun.jna.Pointer;
+
+/**
+ * A {@code NativeLibraryWrapper} implementation for Darwin/Mac.
+ * <p>
+ * When JNA is initialized, all methods that have the 'native' keyword
+ * will be attmpted to be linked against. As Java doesn't have the equivalent
+ * of a #ifdef, this means if a native method like posix_fadvise is defined in the
+ * class but not available on the target operating system (e.g.
+ * posix_fadvise is not availble on Darwin/Mac) this will cause the entire
+ * initial linking and initialization of JNA to fail. This means other
+ * native calls that are supported on that target operating system will be
+ * unavailable simply because of one native defined method not supported
+ * on the runtime operating system.
+ * @see org.apache.cassandra.utils.NativeLibraryWrapper
+ * @see NativeLibrary
+ */
+public class NativeLibraryDarwin implements NativeLibraryWrapper
+{
+    private static final Logger logger = LoggerFactory.getLogger(NativeLibraryDarwin.class);
+
+    private static boolean available;
+
+    static
+    {
+        try
+        {
+            Native.register(com.sun.jna.NativeLibrary.getInstance("c", Collections.emptyMap()));
+            available = true;
+        }
+        catch (NoClassDefFoundError e)
+        {
+            logger.warn("JNA not found. Native methods will be disabled.");
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            logger.error("Failed to link the C library against JNA. Native methods will be unavailable.", e);
+        }
+        catch (NoSuchMethodError e)
+        {
+            logger.warn("Obsolete version of JNA present; unable to register C library. Upgrade to JNA 3.2.7 or later");
+        }
+    }
+
+    private static native int mlockall(int flags) throws LastErrorException;
+    private static native int munlockall() throws LastErrorException;
+    private static native int fcntl(int fd, int command, long flags) throws LastErrorException;
+    private static native int open(String path, int flags) throws LastErrorException;
+    private static native int fsync(int fd) throws LastErrorException;
+    private static native int close(int fd) throws LastErrorException;
+    private static native Pointer strerror(int errnum) throws LastErrorException;
+    private static native long getpid() throws LastErrorException;
+
+    public int callMlockall(int flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return mlockall(flags);
+    }
+
+    public int callMunlockall() throws UnsatisfiedLinkError, RuntimeException
+    {
+        return munlockall();
+    }
+
+    public int callFcntl(int fd, int command, long flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return fcntl(fd, command, flags);
+    }
+
+    public int callPosixFadvise(int fd, long offset, int len, int flag) throws UnsatisfiedLinkError, RuntimeException
+    {
+        // posix_fadvise is not available on Darwin/Mac
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callOpen(String path, int flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return open(path, flags);
+    }
+
+    public int callFsync(int fd) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return fsync(fd);
+    }
+
+    public int callClose(int fd) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return close(fd);
+    }
+
+    public Pointer callStrerror(int errnum) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return strerror(errnum);
+    }
+
+    public long callGetpid() throws UnsatisfiedLinkError, RuntimeException
+    {
+        return getpid();
+    }
+
+    public boolean isAvailable()
+    {
+        return available;
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/NativeLibraryLinux.java b/src/java/org/apache/cassandra/utils/NativeLibraryLinux.java
new file mode 100644
index 0000000..3f21d17
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/NativeLibraryLinux.java

@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.util.Collections;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.sun.jna.LastErrorException;
+import com.sun.jna.Native;
+import com.sun.jna.Pointer;
+
+/**
+ * A {@code NativeLibraryWrapper} implementation for Linux.
+ * <p>
+ * When JNA is initialized, all methods that have the 'native' keyword
+ * will be attmpted to be linked against. As Java doesn't have the equivalent
+ * of a #ifdef, this means if a native method like posix_fadvise is defined in the
+ * class but not available on the target operating system (e.g.
+ * posix_fadvise is not availble on Darwin/Mac) this will cause the entire
+ * initial linking and initialization of JNA to fail. This means other
+ * native calls that are supported on that target operating system will be
+ * unavailable simply because of one native defined method not supported
+ * on the runtime operating system.
+ * @see org.apache.cassandra.utils.NativeLibraryWrapper
+ * @see NativeLibrary
+ */
+public class NativeLibraryLinux implements NativeLibraryWrapper
+{
+    private static boolean available;
+
+    private static final Logger logger = LoggerFactory.getLogger(NativeLibraryLinux.class);
+
+    static
+    {
+        try
+        {
+            Native.register(com.sun.jna.NativeLibrary.getInstance("c", Collections.emptyMap()));
+            available = true;
+        }
+        catch (NoClassDefFoundError e)
+        {
+            logger.warn("JNA not found. Native methods will be disabled.");
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            logger.error("Failed to link the C library against JNA. Native methods will be unavailable.", e);
+        }
+        catch (NoSuchMethodError e)
+        {
+            logger.warn("Obsolete version of JNA present; unable to register C library. Upgrade to JNA 3.2.7 or later");
+        }
+    }
+
+    private static native int mlockall(int flags) throws LastErrorException;
+    private static native int munlockall() throws LastErrorException;
+    private static native int fcntl(int fd, int command, long flags) throws LastErrorException;
+    private static native int posix_fadvise(int fd, long offset, int len, int flag) throws LastErrorException;
+    private static native int open(String path, int flags) throws LastErrorException;
+    private static native int fsync(int fd) throws LastErrorException;
+    private static native int close(int fd) throws LastErrorException;
+    private static native Pointer strerror(int errnum) throws LastErrorException;
+    private static native long getpid() throws LastErrorException;
+
+    public int callMlockall(int flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return mlockall(flags);
+    }
+
+    public int callMunlockall() throws UnsatisfiedLinkError, RuntimeException
+    {
+        return munlockall();
+    }
+
+    public int callFcntl(int fd, int command, long flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return fcntl(fd, command, flags);
+    }
+
+    public int callPosixFadvise(int fd, long offset, int len, int flag) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return posix_fadvise(fd, offset, len, flag);
+    }
+
+    public int callOpen(String path, int flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return open(path, flags);
+    }
+
+    public int callFsync(int fd) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return fsync(fd);
+    }
+
+    public int callClose(int fd) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return close(fd);
+    }
+
+    public Pointer callStrerror(int errnum) throws UnsatisfiedLinkError, RuntimeException
+    {
+        return strerror(errnum);
+    }
+
+    public long callGetpid() throws UnsatisfiedLinkError, RuntimeException
+    {
+        return getpid();
+    }
+
+    public boolean isAvailable()
+    {
+        return available;
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/NativeLibraryWindows.java b/src/java/org/apache/cassandra/utils/NativeLibraryWindows.java
new file mode 100644
index 0000000..d6514af
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/NativeLibraryWindows.java

@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.util.Collections;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.sun.jna.LastErrorException;
+import com.sun.jna.Native;
+import com.sun.jna.Pointer;
+
+/**
+ * A {@code NativeLibraryWrapper} implementation for Windows.
+ * <p> This implementation only offer support for the {@code callGetpid} method
+ * using the Windows/Kernel32 library.</p>
+ *
+ * @see org.apache.cassandra.utils.NativeLibraryWrapper
+ * @see NativeLibrary
+ */
+public class NativeLibraryWindows implements NativeLibraryWrapper
+{
+    private static boolean available;
+
+    private static final Logger logger = LoggerFactory.getLogger(NativeLibraryWindows.class);
+
+    static
+    {
+        try
+        {
+            Native.register(com.sun.jna.NativeLibrary.getInstance("kernel32", Collections.emptyMap()));
+            available = true;
+        }
+        catch (NoClassDefFoundError e)
+        {
+            logger.warn("JNA not found. Native methods will be disabled.");
+        }
+        catch (UnsatisfiedLinkError e)
+        {
+            logger.error("Failed to link the Windows/Kernel32 library against JNA. Native methods will be unavailable.", e);
+        }
+        catch (NoSuchMethodError e)
+        {
+            logger.warn("Obsolete version of JNA present; unable to register Windows/Kernel32 library. Upgrade to JNA 3.2.7 or later");
+        }
+    }
+
+    /**
+     * Retrieves the process identifier of the calling process (<a href='https://msdn.microsoft.com/en-us/library/windows/desktop/ms683180(v=vs.85).aspx'>GetCurrentProcessId function</a>).
+     *
+     * @return the process identifier of the calling process
+     */
+    private static native long GetCurrentProcessId() throws LastErrorException;
+
+    public int callMlockall(int flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callMunlockall() throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callFcntl(int fd, int command, long flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callPosixFadvise(int fd, long offset, int len, int flag) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callOpen(String path, int flags) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callFsync(int fd) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public int callClose(int fd) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    public Pointer callStrerror(int errnum) throws UnsatisfiedLinkError, RuntimeException
+    {
+        throw new UnsatisfiedLinkError();
+    }
+
+    /**
+     * @return the PID of the JVM running
+     * @throws UnsatisfiedLinkError if we fail to link against Sigar
+     * @throws RuntimeException if another unexpected error is thrown by Sigar
+     */
+    public long callGetpid() throws UnsatisfiedLinkError, RuntimeException
+    {
+        return GetCurrentProcessId();
+    }
+
+    public boolean isAvailable()
+    {
+        return available;
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/NativeLibraryWrapper.java b/src/java/org/apache/cassandra/utils/NativeLibraryWrapper.java
new file mode 100644
index 0000000..879ea88
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/NativeLibraryWrapper.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import com.sun.jna.Pointer;
+
+/**
+ * An interface to implement for using OS specific native methods.
+ * @see NativeLibrary
+ */
+interface NativeLibraryWrapper
+{
+    /**
+     * Checks if the library has been successfully linked.
+     * @return {@code true} if the library has been successfully linked, {@code false} otherwise.
+     */
+    boolean isAvailable();
+
+    int callMlockall(int flags) throws UnsatisfiedLinkError, RuntimeException;
+    int callMunlockall() throws UnsatisfiedLinkError, RuntimeException;
+    int callFcntl(int fd, int command, long flags) throws UnsatisfiedLinkError, RuntimeException;
+    int callPosixFadvise(int fd, long offset, int len, int flag) throws UnsatisfiedLinkError, RuntimeException;
+    int callOpen(String path, int flags) throws UnsatisfiedLinkError, RuntimeException;
+    int callFsync(int fd) throws UnsatisfiedLinkError, RuntimeException;
+    int callClose(int fd) throws UnsatisfiedLinkError, RuntimeException;
+    Pointer callStrerror(int errnum) throws UnsatisfiedLinkError, RuntimeException;
+    long callGetpid() throws UnsatisfiedLinkError, RuntimeException;
+}

diff --git a/src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java b/src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java
index 3ec6965..9ab4538 100644
--- a/src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java
+++ b/src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java

@@ -18,38 +18,46 @@
 package org.apache.cassandra.utils;
 
 import java.net.InetAddress;
+import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.concurrent.TimeUnit;
 
 import com.datastax.driver.core.*;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.ColumnDefinition.ClusteringOrder;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.dht.Token.TokenFactory;
 import org.apache.cassandra.io.sstable.SSTableLoader;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.CQLTypeParser;
+import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.schema.Types;
 
 public class NativeSSTableLoaderClient extends SSTableLoader.Client
 {
     protected final Map<String, CFMetaData> tables;
     private final Collection<InetAddress> hosts;
     private final int port;
-    private final String username;
-    private final String password;
+    private final AuthProvider authProvider;
     private final SSLOptions sslOptions;
 
+
     public NativeSSTableLoaderClient(Collection<InetAddress> hosts, int port, String username, String password, SSLOptions sslOptions)
     {
+        this(hosts, port, new PlainTextAuthProvider(username, password), sslOptions);
+    }
+
+    public NativeSSTableLoaderClient(Collection<InetAddress> hosts, int port, AuthProvider authProvider, SSLOptions sslOptions)
+    {
         super();
         this.tables = new HashMap<>();
         this.hosts = hosts;
         this.port = port;
-        this.username = username;
-        this.password = password;
+        this.authProvider = authProvider;
         this.sslOptions = sslOptions;
     }
 
@@ -58,19 +66,18 @@
         Cluster.Builder builder = Cluster.builder().addContactPoints(hosts).withPort(port);
         if (sslOptions != null)
             builder.withSSL(sslOptions);
-        if (username != null && password != null)
-            builder = builder.withCredentials(username, password);
+        if (authProvider != null)
+            builder = builder.withAuthProvider(authProvider);
 
         try (Cluster cluster = builder.build(); Session session = cluster.connect())
         {
 
             Metadata metadata = cluster.getMetadata();
 
-            setPartitioner(metadata.getPartitioner());
-
             Set<TokenRange> tokenRanges = metadata.getTokenRanges();
 
-            Token.TokenFactory tokenFactory = getPartitioner().getTokenFactory();
+            IPartitioner partitioner = FBUtilities.newPartitioner(metadata.getPartitioner());
+            TokenFactory tokenFactory = partitioner.getTokenFactory();
 
             for (TokenRange tokenRange : tokenRanges)
             {
@@ -78,10 +85,14 @@
                 Range<Token> range = new Range<>(tokenFactory.fromString(tokenRange.getStart().getValue().toString()),
                                                  tokenFactory.fromString(tokenRange.getEnd().getValue().toString()));
                 for (Host endpoint : endpoints)
-                    addRangeForEndpoint(range, endpoint.getAddress());
+                    addRangeForEndpoint(range, endpoint.getBroadcastAddress());
             }
 
-            tables.putAll(fetchTablesMetadata(keyspace, session));
+            Types types = fetchTypes(keyspace, session);
+
+            tables.putAll(fetchTables(keyspace, session, partitioner, types));
+            // We only need the CFMetaData for the views, so we only load that.
+            tables.putAll(fetchViews(keyspace, session, partitioner, types));
         }
     }
 
@@ -96,31 +107,131 @@
         tables.put(cfm.cfName, cfm);
     }
 
-    private static Map<String, CFMetaData> fetchTablesMetadata(String keyspace, Session session)
+    private static Types fetchTypes(String keyspace, Session session)
+    {
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ?", SchemaKeyspace.NAME, SchemaKeyspace.TYPES);
+
+        Types.RawBuilder types = Types.rawBuilder(keyspace);
+        for (Row row : session.execute(query, keyspace))
+        {
+            String name = row.getString("type_name");
+            List<String> fieldNames = row.getList("field_names", String.class);
+            List<String> fieldTypes = row.getList("field_types", String.class);
+            types.add(name, fieldNames, fieldTypes);
+        }
+        return types.build();
+    }
+
+    /*
+     * The following is a slightly simplified but otherwise duplicated version of
+     * SchemaKeyspace.createTableFromTableRowAndColumnRows().
+     * It might be safer to have a simple wrapper of the driver ResultSet/Row implementing
+     * UntypedResultSet/UntypedResultSet.Row and reuse the original method.
+     *
+     * Note: It is not safe for this class to use static methods from SchemaKeyspace (static final fields are ok)
+     * as that triggers initialization of the class, which fails in client mode.
+     */
+    private static Map<String, CFMetaData> fetchTables(String keyspace, Session session, IPartitioner partitioner, Types types)
     {
         Map<String, CFMetaData> tables = new HashMap<>();
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ?", SchemaKeyspace.NAME, SchemaKeyspace.TABLES);
 
-        String query = String.format("SELECT columnfamily_name, cf_id, type, comparator, subcomparator, is_dense FROM %s.%s WHERE keyspace_name = '%s'",
-                                     SystemKeyspace.NAME,
-                                     LegacySchemaTables.COLUMNFAMILIES,
-                                     keyspace);
-
-        for (Row row : session.execute(query))
+        for (Row row : session.execute(query, keyspace))
         {
-            String name = row.getString("columnfamily_name");
-            UUID id = row.getUUID("cf_id");
-            ColumnFamilyType type = ColumnFamilyType.valueOf(row.getString("type"));
-            AbstractType rawComparator = TypeParser.parse(row.getString("comparator"));
-            AbstractType subComparator = row.isNull("subcomparator")
-                                       ? null
-                                       : TypeParser.parse(row.getString("subcomparator"));
-            boolean isDense = row.getBool("is_dense");
-            CellNameType comparator = CellNames.fromAbstractType(CFMetaData.makeRawAbstractType(rawComparator, subComparator),
-                                                                 isDense);
-
-            tables.put(name, new CFMetaData(keyspace, name, type, comparator, id));
+            String name = row.getString("table_name");
+            tables.put(name, createTableMetadata(keyspace, session, partitioner, false, row, name, types));
         }
 
         return tables;
     }
+
+    /*
+     * In the case where we are creating View CFMetaDatas, we
+     */
+    private static Map<String, CFMetaData> fetchViews(String keyspace, Session session, IPartitioner partitioner, Types types)
+    {
+        Map<String, CFMetaData> tables = new HashMap<>();
+        String query = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ?", SchemaKeyspace.NAME, SchemaKeyspace.VIEWS);
+
+        for (Row row : session.execute(query, keyspace))
+        {
+            String name = row.getString("view_name");
+            tables.put(name, createTableMetadata(keyspace, session, partitioner, true, row, name, types));
+        }
+
+        return tables;
+    }
+
+    private static CFMetaData createTableMetadata(String keyspace,
+                                                  Session session,
+                                                  IPartitioner partitioner,
+                                                  boolean isView,
+                                                  Row row,
+                                                  String name,
+                                                  Types types)
+    {
+        UUID id = row.getUUID("id");
+        Set<CFMetaData.Flag> flags = isView ? Collections.emptySet() : CFMetaData.flagsFromStrings(row.getSet("flags", String.class));
+
+        boolean isSuper = flags.contains(CFMetaData.Flag.SUPER);
+        boolean isCounter = flags.contains(CFMetaData.Flag.COUNTER);
+        boolean isDense = flags.contains(CFMetaData.Flag.DENSE);
+        boolean isCompound = isView || flags.contains(CFMetaData.Flag.COMPOUND);
+
+        String columnsQuery = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?",
+                                            SchemaKeyspace.NAME,
+                                            SchemaKeyspace.COLUMNS);
+
+        List<ColumnDefinition> defs = new ArrayList<>();
+        for (Row colRow : session.execute(columnsQuery, keyspace, name))
+            defs.add(createDefinitionFromRow(colRow, keyspace, name, types));
+
+        CFMetaData metadata = CFMetaData.create(keyspace,
+                                                name,
+                                                id,
+                                                isDense,
+                                                isCompound,
+                                                isSuper,
+                                                isCounter,
+                                                isView,
+                                                defs,
+                                                partitioner);
+
+        String droppedColumnsQuery = String.format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ?",
+                                                   SchemaKeyspace.NAME,
+                                                   SchemaKeyspace.DROPPED_COLUMNS);
+        Map<ByteBuffer, CFMetaData.DroppedColumn> droppedColumns = new HashMap<>();
+        for (Row colRow : session.execute(droppedColumnsQuery, keyspace, name))
+        {
+            CFMetaData.DroppedColumn droppedColumn = createDroppedColumnFromRow(colRow, keyspace);
+            droppedColumns.put(UTF8Type.instance.decompose(droppedColumn.name), droppedColumn);
+        }
+        metadata.droppedColumns(droppedColumns);
+
+        return metadata;
+    }
+
+    private static ColumnDefinition createDefinitionFromRow(Row row, String keyspace, String table, Types types)
+    {
+        ClusteringOrder order = ClusteringOrder.valueOf(row.getString("clustering_order").toUpperCase());
+        AbstractType<?> type = CQLTypeParser.parse(keyspace, row.getString("type"), types);
+        if (order == ClusteringOrder.DESC)
+            type = ReversedType.getInstance(type);
+
+        ColumnIdentifier name = new ColumnIdentifier(row.getBytes("column_name_bytes"), row.getString("column_name"));
+
+        int position = row.getInt("position");
+        ColumnDefinition.Kind kind = ColumnDefinition.Kind.valueOf(row.getString("kind").toUpperCase());
+        return new ColumnDefinition(keyspace, table, name, type, position, kind);
+    }
+
+    private static CFMetaData.DroppedColumn createDroppedColumnFromRow(Row row, String keyspace)
+    {
+        String name = row.getString("column_name");
+        ColumnDefinition.Kind kind =
+            row.isNull("kind") ? null : ColumnDefinition.Kind.valueOf(row.getString("kind").toUpperCase());
+        AbstractType<?> type = CQLTypeParser.parse(keyspace, row.getString("type"), Types.none());
+        long droppedTime = TimeUnit.MILLISECONDS.toMicros(row.getTimestamp("dropped_time").getTime());
+        return new CFMetaData.DroppedColumn(name, kind, type, droppedTime);
+    }
 }

diff --git a/src/java/org/apache/cassandra/utils/NoSpamLogger.java b/src/java/org/apache/cassandra/utils/NoSpamLogger.java
index 3cc8b5e..df3d2e4 100644
--- a/src/java/org/apache/cassandra/utils/NoSpamLogger.java
+++ b/src/java/org/apache/cassandra/utils/NoSpamLogger.java

@@ -81,9 +81,9 @@
             return nowNanos - expected >= minIntervalNanos && compareAndSet(expected, nowNanos);
         }
 
-        public void log(Level l, long nowNanos, Object... objects)
+        public boolean log(Level l, long nowNanos, Object... objects)
         {
-            if (!shouldLog(nowNanos)) return;
+            if (!shouldLog(nowNanos)) return false;
 
             switch (l)
             {
@@ -99,36 +99,37 @@
                 default:
                     throw new AssertionError();
             }
+            return true;
         }
 
-        public void info(long nowNanos, Object... objects)
+        public boolean info(long nowNanos, Object... objects)
         {
-            NoSpamLogStatement.this.log(Level.INFO, nowNanos, objects);
+            return NoSpamLogStatement.this.log(Level.INFO, nowNanos, objects);
         }
 
-        public void info(Object... objects)
+        public boolean info(Object... objects)
         {
-            NoSpamLogStatement.this.info(CLOCK.nanoTime(), objects);
+            return NoSpamLogStatement.this.info(CLOCK.nanoTime(), objects);
         }
 
-        public void warn(long nowNanos, Object... objects)
+        public boolean warn(long nowNanos, Object... objects)
         {
-            NoSpamLogStatement.this.log(Level.WARN, nowNanos, objects);
+            return NoSpamLogStatement.this.log(Level.WARN, nowNanos, objects);
         }
 
-        public void warn(Object... objects)
+        public boolean warn(Object... objects)
         {
-            NoSpamLogStatement.this.warn(CLOCK.nanoTime(), objects);
+            return NoSpamLogStatement.this.warn(CLOCK.nanoTime(), objects);
         }
 
-        public void error(long nowNanos, Object... objects)
+        public boolean error(long nowNanos, Object... objects)
         {
-            NoSpamLogStatement.this.log(Level.ERROR, nowNanos, objects);
+            return NoSpamLogStatement.this.log(Level.ERROR, nowNanos, objects);
         }
 
-        public void error(Object... objects)
+        public boolean error(Object... objects)
         {
-            NoSpamLogStatement.this.error(CLOCK.nanoTime(), objects);
+            return NoSpamLogStatement.this.error(CLOCK.nanoTime(), objects);
         }
     }
 
@@ -153,16 +154,21 @@
         return wrapped;
     }
 
-    public static void log(Logger logger, Level level, long minInterval, TimeUnit unit, String message, Object... objects)
+    public static boolean log(Logger logger, Level level, long minInterval, TimeUnit unit, String message, Object... objects)
     {
-        log(logger, level, minInterval, unit, CLOCK.nanoTime(), message, objects);
+        return log(logger, level, message, minInterval, unit, CLOCK.nanoTime(), message, objects);
     }
 
-    public static void log(Logger logger, Level level, long minInterval, TimeUnit unit, long nowNanos, String message, Object... objects)
+    public static boolean log(Logger logger, Level level, String key, long minInterval, TimeUnit unit, String message, Object... objects)
+    {
+        return log(logger, level, key, minInterval, unit, CLOCK.nanoTime(), message, objects);
+    }
+
+    public static boolean log(Logger logger, Level level, String key, long minInterval, TimeUnit unit, long nowNanos, String message, Object... objects)
     {
         NoSpamLogger wrapped = getLogger(logger, minInterval, unit);
-        NoSpamLogStatement statement = wrapped.getStatement(message);
-        statement.log(level, nowNanos, objects);
+        NoSpamLogStatement statement = wrapped.getStatement(key, message);
+        return statement.log(level, nowNanos, objects);
     }
 
     public static NoSpamLogStatement getStatement(Logger logger, String message, long minInterval, TimeUnit unit)
@@ -181,38 +187,38 @@
         minIntervalNanos = timeUnit.toNanos(minInterval);
     }
 
-    public void info(long nowNanos, String s, Object... objects)
+    public boolean info(long nowNanos, String s, Object... objects)
     {
-        NoSpamLogger.this.log( Level.INFO, s, nowNanos, objects);
+        return NoSpamLogger.this.log( Level.INFO, s, nowNanos, objects);
     }
 
-    public void info(String s, Object... objects)
+    public boolean info(String s, Object... objects)
     {
-        NoSpamLogger.this.info(CLOCK.nanoTime(), s, objects);
+        return NoSpamLogger.this.info(CLOCK.nanoTime(), s, objects);
     }
 
-    public void warn(long nowNanos, String s, Object... objects)
+    public boolean warn(long nowNanos, String s, Object... objects)
     {
-        NoSpamLogger.this.log( Level.WARN, s, nowNanos, objects);
+        return NoSpamLogger.this.log( Level.WARN, s, nowNanos, objects);
     }
 
-    public void warn(String s, Object... objects)
+    public boolean warn(String s, Object... objects)
     {
-        NoSpamLogger.this.warn(CLOCK.nanoTime(), s, objects);
+        return NoSpamLogger.this.warn(CLOCK.nanoTime(), s, objects);
     }
 
-    public void error(long nowNanos, String s, Object... objects)
+    public boolean error(long nowNanos, String s, Object... objects)
     {
-        NoSpamLogger.this.log( Level.ERROR, s, nowNanos, objects);
+        return NoSpamLogger.this.log( Level.ERROR, s, nowNanos, objects);
     }
 
-    public void error(String s, Object... objects)
+    public boolean error(String s, Object... objects)
     {
-        NoSpamLogger.this.error(CLOCK.nanoTime(), s, objects);
+        return NoSpamLogger.this.error(CLOCK.nanoTime(), s, objects);
     }
 
-    public void log(Level l, String s, long nowNanos, Object... objects) {
-        NoSpamLogger.this.getStatement(s, minIntervalNanos).log(l, nowNanos, objects);
+    public boolean log(Level l, String s, long nowNanos, Object... objects) {
+        return NoSpamLogger.this.getStatement(s, minIntervalNanos).log(l, nowNanos, objects);
     }
 
     public NoSpamLogStatement getStatement(String s)
@@ -220,17 +226,27 @@
         return NoSpamLogger.this.getStatement(s, minIntervalNanos);
     }
 
+    public NoSpamLogStatement getStatement(String key, String s)
+    {
+        return NoSpamLogger.this.getStatement(key, s, minIntervalNanos);
+    }
+
     public NoSpamLogStatement getStatement(String s, long minInterval, TimeUnit unit) {
         return NoSpamLogger.this.getStatement(s, unit.toNanos(minInterval));
     }
 
     public NoSpamLogStatement getStatement(String s, long minIntervalNanos)
     {
-        NoSpamLogStatement statement = lastMessage.get(s);
+        return getStatement(s, s, minIntervalNanos);
+    }
+
+    public NoSpamLogStatement getStatement(String key, String s, long minIntervalNanos)
+    {
+        NoSpamLogStatement statement = lastMessage.get(key);
         if (statement == null)
         {
             statement = new NoSpamLogStatement(s, minIntervalNanos);
-            NoSpamLogStatement temp = lastMessage.putIfAbsent(s, statement);
+            NoSpamLogStatement temp = lastMessage.putIfAbsent(key, statement);
             if (temp != null)
                 statement = temp;
         }

diff --git a/src/java/org/apache/cassandra/utils/ObjectSizes.java b/src/java/org/apache/cassandra/utils/ObjectSizes.java
index e05dcba..e7469c1 100644
--- a/src/java/org/apache/cassandra/utils/ObjectSizes.java
+++ b/src/java/org/apache/cassandra/utils/ObjectSizes.java

@@ -23,6 +23,8 @@
 
 import java.nio.ByteBuffer;
 
+import java.util.ArrayList;
+
 import org.github.jamm.MemoryLayoutSpecification;
 import org.github.jamm.MemoryMeter;
 
@@ -111,6 +113,7 @@
     {
         return BUFFER_EMPTY_SIZE * array.length + sizeOfArray(array);
     }
+
     /**
      * Memory a byte buffer consumes
      * @param buffer ByteBuffer to calculate in memory size

diff --git a/src/java/org/apache/cassandra/utils/OverlapIterator.java b/src/java/org/apache/cassandra/utils/OverlapIterator.java
index 7c1544a..b346a62 100644
--- a/src/java/org/apache/cassandra/utils/OverlapIterator.java
+++ b/src/java/org/apache/cassandra/utils/OverlapIterator.java

@@ -17,7 +17,7 @@
  * specific language governing permissions and limitations
  * under the License.
  *
- */
+*/
 package org.apache.cassandra.utils;
 
 import java.util.*;

diff --git a/src/java/org/apache/cassandra/utils/PureJavaCrc32.java b/src/java/org/apache/cassandra/utils/PureJavaCrc32.java
deleted file mode 100644
index 17e6235..0000000
--- a/src/java/org/apache/cassandra/utils/PureJavaCrc32.java
+++ /dev/null

@@ -1,726 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-import java.nio.ByteBuffer;
-
-import com.github.tjake.ICRC32;
-
-/**
- * NOTE: You should be using CRCFactory class because it will pick a better
- * version based on your JDK version
- *
- * A pure-java implementation of the CRC32 checksum that uses
- * the same polynomial as the built-in native CRC32.
- *
- * This is to avoid the JNI overhead for certain uses of Checksumming
- * where many small pieces of data are checksummed in succession.
- *
- * The current version is ~10x to 1.8x as fast as Sun's native
- * java.util.zip.CRC32 in Java 1.6
- *
- * @see java.util.zip.CRC32
- *
- * This class is copied from hadoop-commons project and retains that formatting.
- * (The initial patch added PureJavaCrc32 was HADOOP-6148)
- */
-public class PureJavaCrc32 implements ICRC32
-{
-
-  /** the current CRC value, bit-flipped */
-  private int crc;
-
-  /** Create a new PureJavaCrc32 object. */
-  public PureJavaCrc32() {
-    reset();
-  }
-
-  @Override
-  public long getValue() {
-    return (~crc) & 0xffffffffL;
-  }
-
-  public int getCrc() {
-    return ~crc;
-  }
-
-    @Override
-  public void reset() {
-    crc = 0xffffffff;
-  }
-
-  @Override
-  public void update(byte[] b, int off, int len) {
-    int localCrc = crc;
-
-    while(len > 7) {
-      final int c0 =(b[off+0] ^ localCrc) & 0xff;
-      final int c1 =(b[off+1] ^ (localCrc >>>= 8)) & 0xff;
-      final int c2 =(b[off+2] ^ (localCrc >>>= 8)) & 0xff;
-      final int c3 =(b[off+3] ^ (localCrc >>>= 8)) & 0xff;
-      localCrc = (T[T8_7_start + c0] ^ T[T8_6_start + c1])
-          ^ (T[T8_5_start + c2] ^ T[T8_4_start + c3]);
-
-      final int c4 = b[off+4] & 0xff;
-      final int c5 = b[off+5] & 0xff;
-      final int c6 = b[off+6] & 0xff;
-      final int c7 = b[off+7] & 0xff;
-
-      localCrc ^= (T[T8_3_start + c4] ^ T[T8_2_start + c5])
-           ^ (T[T8_1_start + c6] ^ T[T8_0_start + c7]);
-
-      off += 8;
-      len -= 8;
-    }
-
-    /* loop unroll - duff's device style */
-    switch(len) {
-      case 7: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      case 6: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      case 5: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      case 4: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      case 3: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      case 2: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      case 1: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b[off++]) & 0xff)];
-      default:
-        /* nothing */
-    }
-    
-    // Publish crc out to object
-    crc = localCrc;
-  }
-
-  private static final ThreadLocal<byte[]> BUFFER = new ThreadLocal<byte[]>()
-  {
-      protected byte[] initialValue()
-      {
-          return new byte[256];
-      }
-  };
-
-    public void update(ByteBuffer b, int off, int len)
-    {
-        if (b.hasArray())
-        {
-            update(b.array(), b.arrayOffset() + off, len);
-        }
-        else if (len < 16)
-        {
-            doUpdate(b, off, len);
-        }
-        else
-        {
-            byte[] buf = BUFFER.get();
-            while (len > 0)
-            {
-                int l = Math.min(len, buf.length);
-                ByteBufferUtil.arrayCopy(b, off, buf, 0, l);
-                update(buf, 0, l);
-                len -= l;
-                off += l;
-            }
-        }
-    }
-
-    private void doUpdate(ByteBuffer b, int off, int len) {
-        int localCrc = crc;
-
-        while(len > 7) {
-            final int c0 =(b.get(off+0) ^ localCrc) & 0xff;
-            final int c1 =(b.get(off+1) ^ (localCrc >>>= 8)) & 0xff;
-            final int c2 =(b.get(off+2) ^ (localCrc >>>= 8)) & 0xff;
-            final int c3 =(b.get(off+3) ^ (localCrc >>>= 8)) & 0xff;
-            localCrc = (T[T8_7_start + c0] ^ T[T8_6_start + c1])
-                       ^ (T[T8_5_start + c2] ^ T[T8_4_start + c3]);
-
-            final int c4 = b.get(off+4) & 0xff;
-            final int c5 = b.get(off+5) & 0xff;
-            final int c6 = b.get(off+6) & 0xff;
-            final int c7 = b.get(off+7) & 0xff;
-
-            localCrc ^= (T[T8_3_start + c4] ^ T[T8_2_start + c5])
-                        ^ (T[T8_1_start + c6] ^ T[T8_0_start + c7]);
-
-            off += 8;
-            len -= 8;
-        }
-
-    /* loop unroll - duff's device style */
-        switch(len) {
-            case 7: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            case 6: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            case 5: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            case 4: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            case 3: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            case 2: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            case 1: localCrc = (localCrc >>> 8) ^ T[T8_0_start + ((localCrc ^ b.get(off++)) & 0xff)];
-            default:
-        /* nothing */
-        }
-
-        // Publish crc out to object
-        crc = localCrc;
-    }
-
-    @Override
-  final public void update(int b) {
-    crc = (crc >>> 8) ^ T[T8_0_start + ((crc ^ b) & 0xff)];
-  }
-
-  final public void updateInt(int v) {
-      update((v >>> 24) & 0xFF);
-      update((v >>> 16) & 0xFF);
-      update((v >>> 8) & 0xFF);
-      update((v >>> 0) & 0xFF);
-  }
-
-    /*
-   * CRC-32 lookup tables generated by the polynomial 0xEDB88320.
-   * See also TestPureJavaCrc32.Table.
-   */
-  private static final int T8_0_start = 0*256;
-  private static final int T8_1_start = 1*256;
-  private static final int T8_2_start = 2*256;
-  private static final int T8_3_start = 3*256;
-  private static final int T8_4_start = 4*256;
-  private static final int T8_5_start = 5*256;
-  private static final int T8_6_start = 6*256;
-  private static final int T8_7_start = 7*256;
-
-  private static final int[] T = new int[] {
-    /* T8_0 */
-    0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 
-    0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, 
-    0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 
-    0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 
-    0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 
-    0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 
-    0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 
-    0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, 
-    0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 
-    0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 
-    0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 
-    0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 
-    0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 
-    0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, 
-    0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 
-    0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 
-    0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 
-    0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, 
-    0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 
-    0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, 
-    0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 
-    0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 
-    0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 
-    0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, 
-    0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 
-    0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 
-    0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 
-    0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 
-    0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 
-    0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, 
-    0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 
-    0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, 
-    0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 
-    0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 
-    0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 
-    0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 
-    0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 
-    0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, 
-    0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 
-    0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 
-    0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 
-    0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, 
-    0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 
-    0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, 
-    0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 
-    0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 
-    0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 
-    0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, 
-    0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 
-    0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, 
-    0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 
-    0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 
-    0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 
-    0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, 
-    0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 
-    0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, 
-    0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 
-    0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 
-    0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 
-    0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, 
-    0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 
-    0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, 
-    0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 
-    0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D,
-    /* T8_1 */
-    0x00000000, 0x191B3141, 0x32366282, 0x2B2D53C3, 
-    0x646CC504, 0x7D77F445, 0x565AA786, 0x4F4196C7, 
-    0xC8D98A08, 0xD1C2BB49, 0xFAEFE88A, 0xE3F4D9CB, 
-    0xACB54F0C, 0xB5AE7E4D, 0x9E832D8E, 0x87981CCF, 
-    0x4AC21251, 0x53D92310, 0x78F470D3, 0x61EF4192, 
-    0x2EAED755, 0x37B5E614, 0x1C98B5D7, 0x05838496, 
-    0x821B9859, 0x9B00A918, 0xB02DFADB, 0xA936CB9A, 
-    0xE6775D5D, 0xFF6C6C1C, 0xD4413FDF, 0xCD5A0E9E, 
-    0x958424A2, 0x8C9F15E3, 0xA7B24620, 0xBEA97761, 
-    0xF1E8E1A6, 0xE8F3D0E7, 0xC3DE8324, 0xDAC5B265, 
-    0x5D5DAEAA, 0x44469FEB, 0x6F6BCC28, 0x7670FD69, 
-    0x39316BAE, 0x202A5AEF, 0x0B07092C, 0x121C386D, 
-    0xDF4636F3, 0xC65D07B2, 0xED705471, 0xF46B6530, 
-    0xBB2AF3F7, 0xA231C2B6, 0x891C9175, 0x9007A034, 
-    0x179FBCFB, 0x0E848DBA, 0x25A9DE79, 0x3CB2EF38, 
-    0x73F379FF, 0x6AE848BE, 0x41C51B7D, 0x58DE2A3C, 
-    0xF0794F05, 0xE9627E44, 0xC24F2D87, 0xDB541CC6, 
-    0x94158A01, 0x8D0EBB40, 0xA623E883, 0xBF38D9C2, 
-    0x38A0C50D, 0x21BBF44C, 0x0A96A78F, 0x138D96CE, 
-    0x5CCC0009, 0x45D73148, 0x6EFA628B, 0x77E153CA, 
-    0xBABB5D54, 0xA3A06C15, 0x888D3FD6, 0x91960E97, 
-    0xDED79850, 0xC7CCA911, 0xECE1FAD2, 0xF5FACB93, 
-    0x7262D75C, 0x6B79E61D, 0x4054B5DE, 0x594F849F, 
-    0x160E1258, 0x0F152319, 0x243870DA, 0x3D23419B, 
-    0x65FD6BA7, 0x7CE65AE6, 0x57CB0925, 0x4ED03864, 
-    0x0191AEA3, 0x188A9FE2, 0x33A7CC21, 0x2ABCFD60, 
-    0xAD24E1AF, 0xB43FD0EE, 0x9F12832D, 0x8609B26C, 
-    0xC94824AB, 0xD05315EA, 0xFB7E4629, 0xE2657768, 
-    0x2F3F79F6, 0x362448B7, 0x1D091B74, 0x04122A35, 
-    0x4B53BCF2, 0x52488DB3, 0x7965DE70, 0x607EEF31, 
-    0xE7E6F3FE, 0xFEFDC2BF, 0xD5D0917C, 0xCCCBA03D, 
-    0x838A36FA, 0x9A9107BB, 0xB1BC5478, 0xA8A76539, 
-    0x3B83984B, 0x2298A90A, 0x09B5FAC9, 0x10AECB88, 
-    0x5FEF5D4F, 0x46F46C0E, 0x6DD93FCD, 0x74C20E8C, 
-    0xF35A1243, 0xEA412302, 0xC16C70C1, 0xD8774180, 
-    0x9736D747, 0x8E2DE606, 0xA500B5C5, 0xBC1B8484, 
-    0x71418A1A, 0x685ABB5B, 0x4377E898, 0x5A6CD9D9, 
-    0x152D4F1E, 0x0C367E5F, 0x271B2D9C, 0x3E001CDD, 
-    0xB9980012, 0xA0833153, 0x8BAE6290, 0x92B553D1, 
-    0xDDF4C516, 0xC4EFF457, 0xEFC2A794, 0xF6D996D5, 
-    0xAE07BCE9, 0xB71C8DA8, 0x9C31DE6B, 0x852AEF2A, 
-    0xCA6B79ED, 0xD37048AC, 0xF85D1B6F, 0xE1462A2E, 
-    0x66DE36E1, 0x7FC507A0, 0x54E85463, 0x4DF36522, 
-    0x02B2F3E5, 0x1BA9C2A4, 0x30849167, 0x299FA026, 
-    0xE4C5AEB8, 0xFDDE9FF9, 0xD6F3CC3A, 0xCFE8FD7B, 
-    0x80A96BBC, 0x99B25AFD, 0xB29F093E, 0xAB84387F, 
-    0x2C1C24B0, 0x350715F1, 0x1E2A4632, 0x07317773, 
-    0x4870E1B4, 0x516BD0F5, 0x7A468336, 0x635DB277, 
-    0xCBFAD74E, 0xD2E1E60F, 0xF9CCB5CC, 0xE0D7848D, 
-    0xAF96124A, 0xB68D230B, 0x9DA070C8, 0x84BB4189, 
-    0x03235D46, 0x1A386C07, 0x31153FC4, 0x280E0E85, 
-    0x674F9842, 0x7E54A903, 0x5579FAC0, 0x4C62CB81, 
-    0x8138C51F, 0x9823F45E, 0xB30EA79D, 0xAA1596DC, 
-    0xE554001B, 0xFC4F315A, 0xD7626299, 0xCE7953D8, 
-    0x49E14F17, 0x50FA7E56, 0x7BD72D95, 0x62CC1CD4, 
-    0x2D8D8A13, 0x3496BB52, 0x1FBBE891, 0x06A0D9D0, 
-    0x5E7EF3EC, 0x4765C2AD, 0x6C48916E, 0x7553A02F, 
-    0x3A1236E8, 0x230907A9, 0x0824546A, 0x113F652B, 
-    0x96A779E4, 0x8FBC48A5, 0xA4911B66, 0xBD8A2A27, 
-    0xF2CBBCE0, 0xEBD08DA1, 0xC0FDDE62, 0xD9E6EF23, 
-    0x14BCE1BD, 0x0DA7D0FC, 0x268A833F, 0x3F91B27E, 
-    0x70D024B9, 0x69CB15F8, 0x42E6463B, 0x5BFD777A, 
-    0xDC656BB5, 0xC57E5AF4, 0xEE530937, 0xF7483876, 
-    0xB809AEB1, 0xA1129FF0, 0x8A3FCC33, 0x9324FD72,
-    /* T8_2 */
-    0x00000000, 0x01C26A37, 0x0384D46E, 0x0246BE59, 
-    0x0709A8DC, 0x06CBC2EB, 0x048D7CB2, 0x054F1685, 
-    0x0E1351B8, 0x0FD13B8F, 0x0D9785D6, 0x0C55EFE1, 
-    0x091AF964, 0x08D89353, 0x0A9E2D0A, 0x0B5C473D, 
-    0x1C26A370, 0x1DE4C947, 0x1FA2771E, 0x1E601D29, 
-    0x1B2F0BAC, 0x1AED619B, 0x18ABDFC2, 0x1969B5F5, 
-    0x1235F2C8, 0x13F798FF, 0x11B126A6, 0x10734C91, 
-    0x153C5A14, 0x14FE3023, 0x16B88E7A, 0x177AE44D, 
-    0x384D46E0, 0x398F2CD7, 0x3BC9928E, 0x3A0BF8B9, 
-    0x3F44EE3C, 0x3E86840B, 0x3CC03A52, 0x3D025065, 
-    0x365E1758, 0x379C7D6F, 0x35DAC336, 0x3418A901, 
-    0x3157BF84, 0x3095D5B3, 0x32D36BEA, 0x331101DD, 
-    0x246BE590, 0x25A98FA7, 0x27EF31FE, 0x262D5BC9, 
-    0x23624D4C, 0x22A0277B, 0x20E69922, 0x2124F315, 
-    0x2A78B428, 0x2BBADE1F, 0x29FC6046, 0x283E0A71, 
-    0x2D711CF4, 0x2CB376C3, 0x2EF5C89A, 0x2F37A2AD, 
-    0x709A8DC0, 0x7158E7F7, 0x731E59AE, 0x72DC3399, 
-    0x7793251C, 0x76514F2B, 0x7417F172, 0x75D59B45, 
-    0x7E89DC78, 0x7F4BB64F, 0x7D0D0816, 0x7CCF6221, 
-    0x798074A4, 0x78421E93, 0x7A04A0CA, 0x7BC6CAFD, 
-    0x6CBC2EB0, 0x6D7E4487, 0x6F38FADE, 0x6EFA90E9, 
-    0x6BB5866C, 0x6A77EC5B, 0x68315202, 0x69F33835, 
-    0x62AF7F08, 0x636D153F, 0x612BAB66, 0x60E9C151, 
-    0x65A6D7D4, 0x6464BDE3, 0x662203BA, 0x67E0698D, 
-    0x48D7CB20, 0x4915A117, 0x4B531F4E, 0x4A917579, 
-    0x4FDE63FC, 0x4E1C09CB, 0x4C5AB792, 0x4D98DDA5, 
-    0x46C49A98, 0x4706F0AF, 0x45404EF6, 0x448224C1, 
-    0x41CD3244, 0x400F5873, 0x4249E62A, 0x438B8C1D, 
-    0x54F16850, 0x55330267, 0x5775BC3E, 0x56B7D609, 
-    0x53F8C08C, 0x523AAABB, 0x507C14E2, 0x51BE7ED5, 
-    0x5AE239E8, 0x5B2053DF, 0x5966ED86, 0x58A487B1, 
-    0x5DEB9134, 0x5C29FB03, 0x5E6F455A, 0x5FAD2F6D, 
-    0xE1351B80, 0xE0F771B7, 0xE2B1CFEE, 0xE373A5D9, 
-    0xE63CB35C, 0xE7FED96B, 0xE5B86732, 0xE47A0D05, 
-    0xEF264A38, 0xEEE4200F, 0xECA29E56, 0xED60F461, 
-    0xE82FE2E4, 0xE9ED88D3, 0xEBAB368A, 0xEA695CBD, 
-    0xFD13B8F0, 0xFCD1D2C7, 0xFE976C9E, 0xFF5506A9, 
-    0xFA1A102C, 0xFBD87A1B, 0xF99EC442, 0xF85CAE75, 
-    0xF300E948, 0xF2C2837F, 0xF0843D26, 0xF1465711, 
-    0xF4094194, 0xF5CB2BA3, 0xF78D95FA, 0xF64FFFCD, 
-    0xD9785D60, 0xD8BA3757, 0xDAFC890E, 0xDB3EE339, 
-    0xDE71F5BC, 0xDFB39F8B, 0xDDF521D2, 0xDC374BE5, 
-    0xD76B0CD8, 0xD6A966EF, 0xD4EFD8B6, 0xD52DB281, 
-    0xD062A404, 0xD1A0CE33, 0xD3E6706A, 0xD2241A5D, 
-    0xC55EFE10, 0xC49C9427, 0xC6DA2A7E, 0xC7184049, 
-    0xC25756CC, 0xC3953CFB, 0xC1D382A2, 0xC011E895, 
-    0xCB4DAFA8, 0xCA8FC59F, 0xC8C97BC6, 0xC90B11F1, 
-    0xCC440774, 0xCD866D43, 0xCFC0D31A, 0xCE02B92D, 
-    0x91AF9640, 0x906DFC77, 0x922B422E, 0x93E92819, 
-    0x96A63E9C, 0x976454AB, 0x9522EAF2, 0x94E080C5, 
-    0x9FBCC7F8, 0x9E7EADCF, 0x9C381396, 0x9DFA79A1, 
-    0x98B56F24, 0x99770513, 0x9B31BB4A, 0x9AF3D17D, 
-    0x8D893530, 0x8C4B5F07, 0x8E0DE15E, 0x8FCF8B69, 
-    0x8A809DEC, 0x8B42F7DB, 0x89044982, 0x88C623B5, 
-    0x839A6488, 0x82580EBF, 0x801EB0E6, 0x81DCDAD1, 
-    0x8493CC54, 0x8551A663, 0x8717183A, 0x86D5720D, 
-    0xA9E2D0A0, 0xA820BA97, 0xAA6604CE, 0xABA46EF9, 
-    0xAEEB787C, 0xAF29124B, 0xAD6FAC12, 0xACADC625, 
-    0xA7F18118, 0xA633EB2F, 0xA4755576, 0xA5B73F41, 
-    0xA0F829C4, 0xA13A43F3, 0xA37CFDAA, 0xA2BE979D, 
-    0xB5C473D0, 0xB40619E7, 0xB640A7BE, 0xB782CD89, 
-    0xB2CDDB0C, 0xB30FB13B, 0xB1490F62, 0xB08B6555, 
-    0xBBD72268, 0xBA15485F, 0xB853F606, 0xB9919C31, 
-    0xBCDE8AB4, 0xBD1CE083, 0xBF5A5EDA, 0xBE9834ED,
-    /* T8_3 */
-    0x00000000, 0xB8BC6765, 0xAA09C88B, 0x12B5AFEE, 
-    0x8F629757, 0x37DEF032, 0x256B5FDC, 0x9DD738B9, 
-    0xC5B428EF, 0x7D084F8A, 0x6FBDE064, 0xD7018701, 
-    0x4AD6BFB8, 0xF26AD8DD, 0xE0DF7733, 0x58631056, 
-    0x5019579F, 0xE8A530FA, 0xFA109F14, 0x42ACF871, 
-    0xDF7BC0C8, 0x67C7A7AD, 0x75720843, 0xCDCE6F26, 
-    0x95AD7F70, 0x2D111815, 0x3FA4B7FB, 0x8718D09E, 
-    0x1ACFE827, 0xA2738F42, 0xB0C620AC, 0x087A47C9, 
-    0xA032AF3E, 0x188EC85B, 0x0A3B67B5, 0xB28700D0, 
-    0x2F503869, 0x97EC5F0C, 0x8559F0E2, 0x3DE59787, 
-    0x658687D1, 0xDD3AE0B4, 0xCF8F4F5A, 0x7733283F, 
-    0xEAE41086, 0x525877E3, 0x40EDD80D, 0xF851BF68, 
-    0xF02BF8A1, 0x48979FC4, 0x5A22302A, 0xE29E574F, 
-    0x7F496FF6, 0xC7F50893, 0xD540A77D, 0x6DFCC018, 
-    0x359FD04E, 0x8D23B72B, 0x9F9618C5, 0x272A7FA0, 
-    0xBAFD4719, 0x0241207C, 0x10F48F92, 0xA848E8F7, 
-    0x9B14583D, 0x23A83F58, 0x311D90B6, 0x89A1F7D3, 
-    0x1476CF6A, 0xACCAA80F, 0xBE7F07E1, 0x06C36084, 
-    0x5EA070D2, 0xE61C17B7, 0xF4A9B859, 0x4C15DF3C, 
-    0xD1C2E785, 0x697E80E0, 0x7BCB2F0E, 0xC377486B, 
-    0xCB0D0FA2, 0x73B168C7, 0x6104C729, 0xD9B8A04C, 
-    0x446F98F5, 0xFCD3FF90, 0xEE66507E, 0x56DA371B, 
-    0x0EB9274D, 0xB6054028, 0xA4B0EFC6, 0x1C0C88A3, 
-    0x81DBB01A, 0x3967D77F, 0x2BD27891, 0x936E1FF4, 
-    0x3B26F703, 0x839A9066, 0x912F3F88, 0x299358ED, 
-    0xB4446054, 0x0CF80731, 0x1E4DA8DF, 0xA6F1CFBA, 
-    0xFE92DFEC, 0x462EB889, 0x549B1767, 0xEC277002, 
-    0x71F048BB, 0xC94C2FDE, 0xDBF98030, 0x6345E755, 
-    0x6B3FA09C, 0xD383C7F9, 0xC1366817, 0x798A0F72, 
-    0xE45D37CB, 0x5CE150AE, 0x4E54FF40, 0xF6E89825, 
-    0xAE8B8873, 0x1637EF16, 0x048240F8, 0xBC3E279D, 
-    0x21E91F24, 0x99557841, 0x8BE0D7AF, 0x335CB0CA, 
-    0xED59B63B, 0x55E5D15E, 0x47507EB0, 0xFFEC19D5, 
-    0x623B216C, 0xDA874609, 0xC832E9E7, 0x708E8E82, 
-    0x28ED9ED4, 0x9051F9B1, 0x82E4565F, 0x3A58313A, 
-    0xA78F0983, 0x1F336EE6, 0x0D86C108, 0xB53AA66D, 
-    0xBD40E1A4, 0x05FC86C1, 0x1749292F, 0xAFF54E4A, 
-    0x322276F3, 0x8A9E1196, 0x982BBE78, 0x2097D91D, 
-    0x78F4C94B, 0xC048AE2E, 0xD2FD01C0, 0x6A4166A5, 
-    0xF7965E1C, 0x4F2A3979, 0x5D9F9697, 0xE523F1F2, 
-    0x4D6B1905, 0xF5D77E60, 0xE762D18E, 0x5FDEB6EB, 
-    0xC2098E52, 0x7AB5E937, 0x680046D9, 0xD0BC21BC, 
-    0x88DF31EA, 0x3063568F, 0x22D6F961, 0x9A6A9E04, 
-    0x07BDA6BD, 0xBF01C1D8, 0xADB46E36, 0x15080953, 
-    0x1D724E9A, 0xA5CE29FF, 0xB77B8611, 0x0FC7E174, 
-    0x9210D9CD, 0x2AACBEA8, 0x38191146, 0x80A57623, 
-    0xD8C66675, 0x607A0110, 0x72CFAEFE, 0xCA73C99B, 
-    0x57A4F122, 0xEF189647, 0xFDAD39A9, 0x45115ECC, 
-    0x764DEE06, 0xCEF18963, 0xDC44268D, 0x64F841E8, 
-    0xF92F7951, 0x41931E34, 0x5326B1DA, 0xEB9AD6BF, 
-    0xB3F9C6E9, 0x0B45A18C, 0x19F00E62, 0xA14C6907, 
-    0x3C9B51BE, 0x842736DB, 0x96929935, 0x2E2EFE50, 
-    0x2654B999, 0x9EE8DEFC, 0x8C5D7112, 0x34E11677, 
-    0xA9362ECE, 0x118A49AB, 0x033FE645, 0xBB838120, 
-    0xE3E09176, 0x5B5CF613, 0x49E959FD, 0xF1553E98, 
-    0x6C820621, 0xD43E6144, 0xC68BCEAA, 0x7E37A9CF, 
-    0xD67F4138, 0x6EC3265D, 0x7C7689B3, 0xC4CAEED6, 
-    0x591DD66F, 0xE1A1B10A, 0xF3141EE4, 0x4BA87981, 
-    0x13CB69D7, 0xAB770EB2, 0xB9C2A15C, 0x017EC639, 
-    0x9CA9FE80, 0x241599E5, 0x36A0360B, 0x8E1C516E, 
-    0x866616A7, 0x3EDA71C2, 0x2C6FDE2C, 0x94D3B949, 
-    0x090481F0, 0xB1B8E695, 0xA30D497B, 0x1BB12E1E, 
-    0x43D23E48, 0xFB6E592D, 0xE9DBF6C3, 0x516791A6, 
-    0xCCB0A91F, 0x740CCE7A, 0x66B96194, 0xDE0506F1,
-    /* T8_4 */
-    0x00000000, 0x3D6029B0, 0x7AC05360, 0x47A07AD0, 
-    0xF580A6C0, 0xC8E08F70, 0x8F40F5A0, 0xB220DC10, 
-    0x30704BC1, 0x0D106271, 0x4AB018A1, 0x77D03111, 
-    0xC5F0ED01, 0xF890C4B1, 0xBF30BE61, 0x825097D1, 
-    0x60E09782, 0x5D80BE32, 0x1A20C4E2, 0x2740ED52, 
-    0x95603142, 0xA80018F2, 0xEFA06222, 0xD2C04B92, 
-    0x5090DC43, 0x6DF0F5F3, 0x2A508F23, 0x1730A693, 
-    0xA5107A83, 0x98705333, 0xDFD029E3, 0xE2B00053, 
-    0xC1C12F04, 0xFCA106B4, 0xBB017C64, 0x866155D4, 
-    0x344189C4, 0x0921A074, 0x4E81DAA4, 0x73E1F314, 
-    0xF1B164C5, 0xCCD14D75, 0x8B7137A5, 0xB6111E15, 
-    0x0431C205, 0x3951EBB5, 0x7EF19165, 0x4391B8D5, 
-    0xA121B886, 0x9C419136, 0xDBE1EBE6, 0xE681C256, 
-    0x54A11E46, 0x69C137F6, 0x2E614D26, 0x13016496, 
-    0x9151F347, 0xAC31DAF7, 0xEB91A027, 0xD6F18997, 
-    0x64D15587, 0x59B17C37, 0x1E1106E7, 0x23712F57, 
-    0x58F35849, 0x659371F9, 0x22330B29, 0x1F532299, 
-    0xAD73FE89, 0x9013D739, 0xD7B3ADE9, 0xEAD38459, 
-    0x68831388, 0x55E33A38, 0x124340E8, 0x2F236958, 
-    0x9D03B548, 0xA0639CF8, 0xE7C3E628, 0xDAA3CF98, 
-    0x3813CFCB, 0x0573E67B, 0x42D39CAB, 0x7FB3B51B, 
-    0xCD93690B, 0xF0F340BB, 0xB7533A6B, 0x8A3313DB, 
-    0x0863840A, 0x3503ADBA, 0x72A3D76A, 0x4FC3FEDA, 
-    0xFDE322CA, 0xC0830B7A, 0x872371AA, 0xBA43581A, 
-    0x9932774D, 0xA4525EFD, 0xE3F2242D, 0xDE920D9D, 
-    0x6CB2D18D, 0x51D2F83D, 0x167282ED, 0x2B12AB5D, 
-    0xA9423C8C, 0x9422153C, 0xD3826FEC, 0xEEE2465C, 
-    0x5CC29A4C, 0x61A2B3FC, 0x2602C92C, 0x1B62E09C, 
-    0xF9D2E0CF, 0xC4B2C97F, 0x8312B3AF, 0xBE729A1F, 
-    0x0C52460F, 0x31326FBF, 0x7692156F, 0x4BF23CDF, 
-    0xC9A2AB0E, 0xF4C282BE, 0xB362F86E, 0x8E02D1DE, 
-    0x3C220DCE, 0x0142247E, 0x46E25EAE, 0x7B82771E, 
-    0xB1E6B092, 0x8C869922, 0xCB26E3F2, 0xF646CA42, 
-    0x44661652, 0x79063FE2, 0x3EA64532, 0x03C66C82, 
-    0x8196FB53, 0xBCF6D2E3, 0xFB56A833, 0xC6368183, 
-    0x74165D93, 0x49767423, 0x0ED60EF3, 0x33B62743, 
-    0xD1062710, 0xEC660EA0, 0xABC67470, 0x96A65DC0, 
-    0x248681D0, 0x19E6A860, 0x5E46D2B0, 0x6326FB00, 
-    0xE1766CD1, 0xDC164561, 0x9BB63FB1, 0xA6D61601, 
-    0x14F6CA11, 0x2996E3A1, 0x6E369971, 0x5356B0C1, 
-    0x70279F96, 0x4D47B626, 0x0AE7CCF6, 0x3787E546, 
-    0x85A73956, 0xB8C710E6, 0xFF676A36, 0xC2074386, 
-    0x4057D457, 0x7D37FDE7, 0x3A978737, 0x07F7AE87, 
-    0xB5D77297, 0x88B75B27, 0xCF1721F7, 0xF2770847, 
-    0x10C70814, 0x2DA721A4, 0x6A075B74, 0x576772C4, 
-    0xE547AED4, 0xD8278764, 0x9F87FDB4, 0xA2E7D404, 
-    0x20B743D5, 0x1DD76A65, 0x5A7710B5, 0x67173905, 
-    0xD537E515, 0xE857CCA5, 0xAFF7B675, 0x92979FC5, 
-    0xE915E8DB, 0xD475C16B, 0x93D5BBBB, 0xAEB5920B, 
-    0x1C954E1B, 0x21F567AB, 0x66551D7B, 0x5B3534CB, 
-    0xD965A31A, 0xE4058AAA, 0xA3A5F07A, 0x9EC5D9CA, 
-    0x2CE505DA, 0x11852C6A, 0x562556BA, 0x6B457F0A, 
-    0x89F57F59, 0xB49556E9, 0xF3352C39, 0xCE550589, 
-    0x7C75D999, 0x4115F029, 0x06B58AF9, 0x3BD5A349, 
-    0xB9853498, 0x84E51D28, 0xC34567F8, 0xFE254E48, 
-    0x4C059258, 0x7165BBE8, 0x36C5C138, 0x0BA5E888, 
-    0x28D4C7DF, 0x15B4EE6F, 0x521494BF, 0x6F74BD0F, 
-    0xDD54611F, 0xE03448AF, 0xA794327F, 0x9AF41BCF, 
-    0x18A48C1E, 0x25C4A5AE, 0x6264DF7E, 0x5F04F6CE, 
-    0xED242ADE, 0xD044036E, 0x97E479BE, 0xAA84500E, 
-    0x4834505D, 0x755479ED, 0x32F4033D, 0x0F942A8D, 
-    0xBDB4F69D, 0x80D4DF2D, 0xC774A5FD, 0xFA148C4D, 
-    0x78441B9C, 0x4524322C, 0x028448FC, 0x3FE4614C, 
-    0x8DC4BD5C, 0xB0A494EC, 0xF704EE3C, 0xCA64C78C,
-    /* T8_5 */
-    0x00000000, 0xCB5CD3A5, 0x4DC8A10B, 0x869472AE, 
-    0x9B914216, 0x50CD91B3, 0xD659E31D, 0x1D0530B8, 
-    0xEC53826D, 0x270F51C8, 0xA19B2366, 0x6AC7F0C3, 
-    0x77C2C07B, 0xBC9E13DE, 0x3A0A6170, 0xF156B2D5, 
-    0x03D6029B, 0xC88AD13E, 0x4E1EA390, 0x85427035, 
-    0x9847408D, 0x531B9328, 0xD58FE186, 0x1ED33223, 
-    0xEF8580F6, 0x24D95353, 0xA24D21FD, 0x6911F258, 
-    0x7414C2E0, 0xBF481145, 0x39DC63EB, 0xF280B04E, 
-    0x07AC0536, 0xCCF0D693, 0x4A64A43D, 0x81387798, 
-    0x9C3D4720, 0x57619485, 0xD1F5E62B, 0x1AA9358E, 
-    0xEBFF875B, 0x20A354FE, 0xA6372650, 0x6D6BF5F5, 
-    0x706EC54D, 0xBB3216E8, 0x3DA66446, 0xF6FAB7E3, 
-    0x047A07AD, 0xCF26D408, 0x49B2A6A6, 0x82EE7503, 
-    0x9FEB45BB, 0x54B7961E, 0xD223E4B0, 0x197F3715, 
-    0xE82985C0, 0x23755665, 0xA5E124CB, 0x6EBDF76E, 
-    0x73B8C7D6, 0xB8E41473, 0x3E7066DD, 0xF52CB578, 
-    0x0F580A6C, 0xC404D9C9, 0x4290AB67, 0x89CC78C2, 
-    0x94C9487A, 0x5F959BDF, 0xD901E971, 0x125D3AD4, 
-    0xE30B8801, 0x28575BA4, 0xAEC3290A, 0x659FFAAF, 
-    0x789ACA17, 0xB3C619B2, 0x35526B1C, 0xFE0EB8B9, 
-    0x0C8E08F7, 0xC7D2DB52, 0x4146A9FC, 0x8A1A7A59, 
-    0x971F4AE1, 0x5C439944, 0xDAD7EBEA, 0x118B384F, 
-    0xE0DD8A9A, 0x2B81593F, 0xAD152B91, 0x6649F834, 
-    0x7B4CC88C, 0xB0101B29, 0x36846987, 0xFDD8BA22, 
-    0x08F40F5A, 0xC3A8DCFF, 0x453CAE51, 0x8E607DF4, 
-    0x93654D4C, 0x58399EE9, 0xDEADEC47, 0x15F13FE2, 
-    0xE4A78D37, 0x2FFB5E92, 0xA96F2C3C, 0x6233FF99, 
-    0x7F36CF21, 0xB46A1C84, 0x32FE6E2A, 0xF9A2BD8F, 
-    0x0B220DC1, 0xC07EDE64, 0x46EAACCA, 0x8DB67F6F, 
-    0x90B34FD7, 0x5BEF9C72, 0xDD7BEEDC, 0x16273D79, 
-    0xE7718FAC, 0x2C2D5C09, 0xAAB92EA7, 0x61E5FD02, 
-    0x7CE0CDBA, 0xB7BC1E1F, 0x31286CB1, 0xFA74BF14, 
-    0x1EB014D8, 0xD5ECC77D, 0x5378B5D3, 0x98246676, 
-    0x852156CE, 0x4E7D856B, 0xC8E9F7C5, 0x03B52460, 
-    0xF2E396B5, 0x39BF4510, 0xBF2B37BE, 0x7477E41B, 
-    0x6972D4A3, 0xA22E0706, 0x24BA75A8, 0xEFE6A60D, 
-    0x1D661643, 0xD63AC5E6, 0x50AEB748, 0x9BF264ED, 
-    0x86F75455, 0x4DAB87F0, 0xCB3FF55E, 0x006326FB, 
-    0xF135942E, 0x3A69478B, 0xBCFD3525, 0x77A1E680, 
-    0x6AA4D638, 0xA1F8059D, 0x276C7733, 0xEC30A496, 
-    0x191C11EE, 0xD240C24B, 0x54D4B0E5, 0x9F886340, 
-    0x828D53F8, 0x49D1805D, 0xCF45F2F3, 0x04192156, 
-    0xF54F9383, 0x3E134026, 0xB8873288, 0x73DBE12D, 
-    0x6EDED195, 0xA5820230, 0x2316709E, 0xE84AA33B, 
-    0x1ACA1375, 0xD196C0D0, 0x5702B27E, 0x9C5E61DB, 
-    0x815B5163, 0x4A0782C6, 0xCC93F068, 0x07CF23CD, 
-    0xF6999118, 0x3DC542BD, 0xBB513013, 0x700DE3B6, 
-    0x6D08D30E, 0xA65400AB, 0x20C07205, 0xEB9CA1A0, 
-    0x11E81EB4, 0xDAB4CD11, 0x5C20BFBF, 0x977C6C1A, 
-    0x8A795CA2, 0x41258F07, 0xC7B1FDA9, 0x0CED2E0C, 
-    0xFDBB9CD9, 0x36E74F7C, 0xB0733DD2, 0x7B2FEE77, 
-    0x662ADECF, 0xAD760D6A, 0x2BE27FC4, 0xE0BEAC61, 
-    0x123E1C2F, 0xD962CF8A, 0x5FF6BD24, 0x94AA6E81, 
-    0x89AF5E39, 0x42F38D9C, 0xC467FF32, 0x0F3B2C97, 
-    0xFE6D9E42, 0x35314DE7, 0xB3A53F49, 0x78F9ECEC, 
-    0x65FCDC54, 0xAEA00FF1, 0x28347D5F, 0xE368AEFA, 
-    0x16441B82, 0xDD18C827, 0x5B8CBA89, 0x90D0692C, 
-    0x8DD55994, 0x46898A31, 0xC01DF89F, 0x0B412B3A, 
-    0xFA1799EF, 0x314B4A4A, 0xB7DF38E4, 0x7C83EB41, 
-    0x6186DBF9, 0xAADA085C, 0x2C4E7AF2, 0xE712A957, 
-    0x15921919, 0xDECECABC, 0x585AB812, 0x93066BB7, 
-    0x8E035B0F, 0x455F88AA, 0xC3CBFA04, 0x089729A1, 
-    0xF9C19B74, 0x329D48D1, 0xB4093A7F, 0x7F55E9DA, 
-    0x6250D962, 0xA90C0AC7, 0x2F987869, 0xE4C4ABCC,
-    /* T8_6 */
-    0x00000000, 0xA6770BB4, 0x979F1129, 0x31E81A9D, 
-    0xF44F2413, 0x52382FA7, 0x63D0353A, 0xC5A73E8E, 
-    0x33EF4E67, 0x959845D3, 0xA4705F4E, 0x020754FA, 
-    0xC7A06A74, 0x61D761C0, 0x503F7B5D, 0xF64870E9, 
-    0x67DE9CCE, 0xC1A9977A, 0xF0418DE7, 0x56368653, 
-    0x9391B8DD, 0x35E6B369, 0x040EA9F4, 0xA279A240, 
-    0x5431D2A9, 0xF246D91D, 0xC3AEC380, 0x65D9C834, 
-    0xA07EF6BA, 0x0609FD0E, 0x37E1E793, 0x9196EC27, 
-    0xCFBD399C, 0x69CA3228, 0x582228B5, 0xFE552301, 
-    0x3BF21D8F, 0x9D85163B, 0xAC6D0CA6, 0x0A1A0712, 
-    0xFC5277FB, 0x5A257C4F, 0x6BCD66D2, 0xCDBA6D66, 
-    0x081D53E8, 0xAE6A585C, 0x9F8242C1, 0x39F54975, 
-    0xA863A552, 0x0E14AEE6, 0x3FFCB47B, 0x998BBFCF, 
-    0x5C2C8141, 0xFA5B8AF5, 0xCBB39068, 0x6DC49BDC, 
-    0x9B8CEB35, 0x3DFBE081, 0x0C13FA1C, 0xAA64F1A8, 
-    0x6FC3CF26, 0xC9B4C492, 0xF85CDE0F, 0x5E2BD5BB, 
-    0x440B7579, 0xE27C7ECD, 0xD3946450, 0x75E36FE4, 
-    0xB044516A, 0x16335ADE, 0x27DB4043, 0x81AC4BF7, 
-    0x77E43B1E, 0xD19330AA, 0xE07B2A37, 0x460C2183, 
-    0x83AB1F0D, 0x25DC14B9, 0x14340E24, 0xB2430590, 
-    0x23D5E9B7, 0x85A2E203, 0xB44AF89E, 0x123DF32A, 
-    0xD79ACDA4, 0x71EDC610, 0x4005DC8D, 0xE672D739, 
-    0x103AA7D0, 0xB64DAC64, 0x87A5B6F9, 0x21D2BD4D, 
-    0xE47583C3, 0x42028877, 0x73EA92EA, 0xD59D995E, 
-    0x8BB64CE5, 0x2DC14751, 0x1C295DCC, 0xBA5E5678, 
-    0x7FF968F6, 0xD98E6342, 0xE86679DF, 0x4E11726B, 
-    0xB8590282, 0x1E2E0936, 0x2FC613AB, 0x89B1181F, 
-    0x4C162691, 0xEA612D25, 0xDB8937B8, 0x7DFE3C0C, 
-    0xEC68D02B, 0x4A1FDB9F, 0x7BF7C102, 0xDD80CAB6, 
-    0x1827F438, 0xBE50FF8C, 0x8FB8E511, 0x29CFEEA5, 
-    0xDF879E4C, 0x79F095F8, 0x48188F65, 0xEE6F84D1, 
-    0x2BC8BA5F, 0x8DBFB1EB, 0xBC57AB76, 0x1A20A0C2, 
-    0x8816EAF2, 0x2E61E146, 0x1F89FBDB, 0xB9FEF06F, 
-    0x7C59CEE1, 0xDA2EC555, 0xEBC6DFC8, 0x4DB1D47C, 
-    0xBBF9A495, 0x1D8EAF21, 0x2C66B5BC, 0x8A11BE08, 
-    0x4FB68086, 0xE9C18B32, 0xD82991AF, 0x7E5E9A1B, 
-    0xEFC8763C, 0x49BF7D88, 0x78576715, 0xDE206CA1, 
-    0x1B87522F, 0xBDF0599B, 0x8C184306, 0x2A6F48B2, 
-    0xDC27385B, 0x7A5033EF, 0x4BB82972, 0xEDCF22C6, 
-    0x28681C48, 0x8E1F17FC, 0xBFF70D61, 0x198006D5, 
-    0x47ABD36E, 0xE1DCD8DA, 0xD034C247, 0x7643C9F3, 
-    0xB3E4F77D, 0x1593FCC9, 0x247BE654, 0x820CEDE0, 
-    0x74449D09, 0xD23396BD, 0xE3DB8C20, 0x45AC8794, 
-    0x800BB91A, 0x267CB2AE, 0x1794A833, 0xB1E3A387, 
-    0x20754FA0, 0x86024414, 0xB7EA5E89, 0x119D553D, 
-    0xD43A6BB3, 0x724D6007, 0x43A57A9A, 0xE5D2712E, 
-    0x139A01C7, 0xB5ED0A73, 0x840510EE, 0x22721B5A, 
-    0xE7D525D4, 0x41A22E60, 0x704A34FD, 0xD63D3F49, 
-    0xCC1D9F8B, 0x6A6A943F, 0x5B828EA2, 0xFDF58516, 
-    0x3852BB98, 0x9E25B02C, 0xAFCDAAB1, 0x09BAA105, 
-    0xFFF2D1EC, 0x5985DA58, 0x686DC0C5, 0xCE1ACB71, 
-    0x0BBDF5FF, 0xADCAFE4B, 0x9C22E4D6, 0x3A55EF62, 
-    0xABC30345, 0x0DB408F1, 0x3C5C126C, 0x9A2B19D8, 
-    0x5F8C2756, 0xF9FB2CE2, 0xC813367F, 0x6E643DCB, 
-    0x982C4D22, 0x3E5B4696, 0x0FB35C0B, 0xA9C457BF, 
-    0x6C636931, 0xCA146285, 0xFBFC7818, 0x5D8B73AC, 
-    0x03A0A617, 0xA5D7ADA3, 0x943FB73E, 0x3248BC8A, 
-    0xF7EF8204, 0x519889B0, 0x6070932D, 0xC6079899, 
-    0x304FE870, 0x9638E3C4, 0xA7D0F959, 0x01A7F2ED, 
-    0xC400CC63, 0x6277C7D7, 0x539FDD4A, 0xF5E8D6FE, 
-    0x647E3AD9, 0xC209316D, 0xF3E12BF0, 0x55962044, 
-    0x90311ECA, 0x3646157E, 0x07AE0FE3, 0xA1D90457, 
-    0x579174BE, 0xF1E67F0A, 0xC00E6597, 0x66796E23, 
-    0xA3DE50AD, 0x05A95B19, 0x34414184, 0x92364A30,
-    /* T8_7 */
-    0x00000000, 0xCCAA009E, 0x4225077D, 0x8E8F07E3, 
-    0x844A0EFA, 0x48E00E64, 0xC66F0987, 0x0AC50919, 
-    0xD3E51BB5, 0x1F4F1B2B, 0x91C01CC8, 0x5D6A1C56, 
-    0x57AF154F, 0x9B0515D1, 0x158A1232, 0xD92012AC, 
-    0x7CBB312B, 0xB01131B5, 0x3E9E3656, 0xF23436C8, 
-    0xF8F13FD1, 0x345B3F4F, 0xBAD438AC, 0x767E3832, 
-    0xAF5E2A9E, 0x63F42A00, 0xED7B2DE3, 0x21D12D7D, 
-    0x2B142464, 0xE7BE24FA, 0x69312319, 0xA59B2387, 
-    0xF9766256, 0x35DC62C8, 0xBB53652B, 0x77F965B5, 
-    0x7D3C6CAC, 0xB1966C32, 0x3F196BD1, 0xF3B36B4F, 
-    0x2A9379E3, 0xE639797D, 0x68B67E9E, 0xA41C7E00, 
-    0xAED97719, 0x62737787, 0xECFC7064, 0x205670FA, 
-    0x85CD537D, 0x496753E3, 0xC7E85400, 0x0B42549E, 
-    0x01875D87, 0xCD2D5D19, 0x43A25AFA, 0x8F085A64, 
-    0x562848C8, 0x9A824856, 0x140D4FB5, 0xD8A74F2B, 
-    0xD2624632, 0x1EC846AC, 0x9047414F, 0x5CED41D1, 
-    0x299DC2ED, 0xE537C273, 0x6BB8C590, 0xA712C50E, 
-    0xADD7CC17, 0x617DCC89, 0xEFF2CB6A, 0x2358CBF4, 
-    0xFA78D958, 0x36D2D9C6, 0xB85DDE25, 0x74F7DEBB, 
-    0x7E32D7A2, 0xB298D73C, 0x3C17D0DF, 0xF0BDD041, 
-    0x5526F3C6, 0x998CF358, 0x1703F4BB, 0xDBA9F425, 
-    0xD16CFD3C, 0x1DC6FDA2, 0x9349FA41, 0x5FE3FADF, 
-    0x86C3E873, 0x4A69E8ED, 0xC4E6EF0E, 0x084CEF90, 
-    0x0289E689, 0xCE23E617, 0x40ACE1F4, 0x8C06E16A, 
-    0xD0EBA0BB, 0x1C41A025, 0x92CEA7C6, 0x5E64A758, 
-    0x54A1AE41, 0x980BAEDF, 0x1684A93C, 0xDA2EA9A2, 
-    0x030EBB0E, 0xCFA4BB90, 0x412BBC73, 0x8D81BCED, 
-    0x8744B5F4, 0x4BEEB56A, 0xC561B289, 0x09CBB217, 
-    0xAC509190, 0x60FA910E, 0xEE7596ED, 0x22DF9673, 
-    0x281A9F6A, 0xE4B09FF4, 0x6A3F9817, 0xA6959889, 
-    0x7FB58A25, 0xB31F8ABB, 0x3D908D58, 0xF13A8DC6, 
-    0xFBFF84DF, 0x37558441, 0xB9DA83A2, 0x7570833C, 
-    0x533B85DA, 0x9F918544, 0x111E82A7, 0xDDB48239, 
-    0xD7718B20, 0x1BDB8BBE, 0x95548C5D, 0x59FE8CC3, 
-    0x80DE9E6F, 0x4C749EF1, 0xC2FB9912, 0x0E51998C, 
-    0x04949095, 0xC83E900B, 0x46B197E8, 0x8A1B9776, 
-    0x2F80B4F1, 0xE32AB46F, 0x6DA5B38C, 0xA10FB312, 
-    0xABCABA0B, 0x6760BA95, 0xE9EFBD76, 0x2545BDE8, 
-    0xFC65AF44, 0x30CFAFDA, 0xBE40A839, 0x72EAA8A7, 
-    0x782FA1BE, 0xB485A120, 0x3A0AA6C3, 0xF6A0A65D, 
-    0xAA4DE78C, 0x66E7E712, 0xE868E0F1, 0x24C2E06F, 
-    0x2E07E976, 0xE2ADE9E8, 0x6C22EE0B, 0xA088EE95, 
-    0x79A8FC39, 0xB502FCA7, 0x3B8DFB44, 0xF727FBDA, 
-    0xFDE2F2C3, 0x3148F25D, 0xBFC7F5BE, 0x736DF520, 
-    0xD6F6D6A7, 0x1A5CD639, 0x94D3D1DA, 0x5879D144, 
-    0x52BCD85D, 0x9E16D8C3, 0x1099DF20, 0xDC33DFBE, 
-    0x0513CD12, 0xC9B9CD8C, 0x4736CA6F, 0x8B9CCAF1, 
-    0x8159C3E8, 0x4DF3C376, 0xC37CC495, 0x0FD6C40B, 
-    0x7AA64737, 0xB60C47A9, 0x3883404A, 0xF42940D4, 
-    0xFEEC49CD, 0x32464953, 0xBCC94EB0, 0x70634E2E, 
-    0xA9435C82, 0x65E95C1C, 0xEB665BFF, 0x27CC5B61, 
-    0x2D095278, 0xE1A352E6, 0x6F2C5505, 0xA386559B, 
-    0x061D761C, 0xCAB77682, 0x44387161, 0x889271FF, 
-    0x825778E6, 0x4EFD7878, 0xC0727F9B, 0x0CD87F05, 
-    0xD5F86DA9, 0x19526D37, 0x97DD6AD4, 0x5B776A4A, 
-    0x51B26353, 0x9D1863CD, 0x1397642E, 0xDF3D64B0, 
-    0x83D02561, 0x4F7A25FF, 0xC1F5221C, 0x0D5F2282, 
-    0x079A2B9B, 0xCB302B05, 0x45BF2CE6, 0x89152C78, 
-    0x50353ED4, 0x9C9F3E4A, 0x121039A9, 0xDEBA3937, 
-    0xD47F302E, 0x18D530B0, 0x965A3753, 0x5AF037CD, 
-    0xFF6B144A, 0x33C114D4, 0xBD4E1337, 0x71E413A9, 
-    0x7B211AB0, 0xB78B1A2E, 0x39041DCD, 0xF5AE1D53, 
-    0x2C8E0FFF, 0xE0240F61, 0x6EAB0882, 0xA201081C, 
-    0xA8C40105, 0x646E019B, 0xEAE10678, 0x264B06E6
-  };
-}

diff --git a/src/java/org/apache/cassandra/utils/RMIServerSocketFactoryImpl.java b/src/java/org/apache/cassandra/utils/RMIServerSocketFactoryImpl.java
index ea635ac..6444a65 100644
--- a/src/java/org/apache/cassandra/utils/RMIServerSocketFactoryImpl.java
+++ b/src/java/org/apache/cassandra/utils/RMIServerSocketFactoryImpl.java

@@ -21,7 +21,9 @@
 package org.apache.cassandra.utils;
 
 import java.io.IOException;
-import java.net.*;
+import java.net.InetAddress;
+import java.net.ServerSocket;
+import java.net.SocketException;
 import java.rmi.server.RMIServerSocketFactory;
 import javax.net.ServerSocketFactory;
 
@@ -29,8 +31,20 @@
 public class RMIServerSocketFactoryImpl implements RMIServerSocketFactory
 {
 
-    public ServerSocket createServerSocket(final int pPort) throws IOException  {
-        return ServerSocketFactory.getDefault().createServerSocket(pPort, 0, InetAddress.getLoopbackAddress());
+    public ServerSocket createServerSocket(final int pPort) throws IOException
+    {
+        ServerSocket socket = ServerSocketFactory.getDefault()
+                                                 .createServerSocket(pPort, 0, InetAddress.getLoopbackAddress());
+        try
+        {
+            socket.setReuseAddress(true);
+            return socket;
+        }
+        catch (SocketException e)
+        {
+            socket.close();
+            throw e;
+        }
     }
 
     public boolean equals(Object obj)

diff --git a/src/java/org/apache/cassandra/utils/SearchIterator.java b/src/java/org/apache/cassandra/utils/SearchIterator.java
index 004b02a..908053b 100644
--- a/src/java/org/apache/cassandra/utils/SearchIterator.java
+++ b/src/java/org/apache/cassandra/utils/SearchIterator.java

@@ -19,8 +19,15 @@
 
 public interface SearchIterator<K, V>
 {
-
-    public boolean hasNext();
+    /**
+     * Searches "forwards" (in direction of travel) in the iterator for the required key;
+     * if this or any key greater has already been returned by the iterator, the method may
+     * choose to return null, the correct or incorrect output, or fail an assertion.
+     *
+     * it is permitted to search past the end of the iterator, i.e. !hasNext() => next(?) == null
+     *
+     * @param key to search for
+     * @return value associated with key, if present in direction of travel
+     */
     public V next(K key);
-
 }

diff --git a/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java b/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java
index 368d3f5..e8bcee1 100644
--- a/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java
+++ b/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java

@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.utils;
 
+import java.util.Collection;
 import java.util.Comparator;
 import java.util.SortedMap;
 import java.util.TreeMap;
@@ -26,6 +27,9 @@
 
 public class SortedBiMultiValMap<K, V> extends BiMultiValMap<K, V>
 {
+    @SuppressWarnings("unchecked")
+    private static final Comparator DEFAULT_COMPARATOR = (o1, o2) -> ((Comparable) o1).compareTo(o2);
+
     protected SortedBiMultiValMap(SortedMap<K, V> forwardMap, SortedSetMultimap<V, K> reverseMap)
     {
         super(forwardMap, reverseMap);
@@ -48,28 +52,29 @@
     public static <K extends Comparable<K>, V extends Comparable<V>> SortedBiMultiValMap<K, V> create(BiMultiValMap<K, V> map)
     {
         SortedBiMultiValMap<K, V> newMap = SortedBiMultiValMap.<K,V>create();
-        newMap.forwardMap.putAll(map);
-        newMap.reverseMap.putAll(map.inverse());
+        copy(map, newMap);
         return newMap;
     }
 
     public static <K, V> SortedBiMultiValMap<K, V> create(BiMultiValMap<K, V> map, Comparator<K> keyComparator, Comparator<V> valueComparator)
     {
         SortedBiMultiValMap<K, V> newMap = create(keyComparator, valueComparator);
-        newMap.forwardMap.putAll(map);
-        newMap.reverseMap.putAll(map.inverse());
+        copy(map, newMap);
         return newMap;
     }
 
+    private static <K, V> void copy(BiMultiValMap<K, V> map, BiMultiValMap<K, V> newMap)
+    {
+        newMap.forwardMap.putAll(map.forwardMap);
+        // Put each individual TreeSet instead of Multimap#putAll(Multimap) to get linear complexity
+        // See CASSANDRA-14660
+        for (Entry<V, Collection<K>> entry : map.inverse().asMap().entrySet())
+            newMap.reverseMap.putAll(entry.getKey(), entry.getValue());
+    }
+
+    @SuppressWarnings("unchecked")
     private static <T> Comparator<T> defaultComparator()
     {
-        return new Comparator<T>()
-        {
-            @SuppressWarnings("unchecked")
-            public int compare(T o1, T o2)
-            {
-                return ((Comparable<T>) o1).compareTo(o2);
-            }
-        };
+        return DEFAULT_COMPARATOR;
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/StreamingHistogram.java b/src/java/org/apache/cassandra/utils/StreamingHistogram.java
index eb884be..6500a1a 100644
--- a/src/java/org/apache/cassandra/utils/StreamingHistogram.java
+++ b/src/java/org/apache/cassandra/utils/StreamingHistogram.java

@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.*;
 
@@ -25,6 +24,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 /**
@@ -40,20 +40,8 @@
 
     // TreeMap to hold bins of histogram.
     private final TreeMap<Double, Long> bin;
-
-    // maximum bin size for this histogram
     private final int maxBinSize;
 
-    /**
-     * Creates a new histogram with max bin size of maxBinSize
-     * @param maxBinSize maximum number of bins this histogram can have
-     */
-    public StreamingHistogram(int maxBinSize)
-    {
-        this.maxBinSize = maxBinSize;
-        bin = new TreeMap<>();
-    }
-
     private StreamingHistogram(int maxBinSize, Map<Double, Long> bin)
     {
         this.maxBinSize = maxBinSize;
@@ -61,74 +49,6 @@
     }
 
     /**
-     * Adds new point p to this histogram.
-     * @param p
-     */
-    public void update(double p)
-    {
-        update(p, 1);
-    }
-
-    /**
-     * Adds new point p with value m to this histogram.
-     * @param p
-     * @param m
-     */
-    public void update(double p, long m)
-    {
-        Long mi = bin.get(p);
-        if (mi != null)
-        {
-            // we found the same p so increment that counter
-            bin.put(p, mi + m);
-        }
-        else
-        {
-            bin.put(p, m);
-            // if bin size exceeds maximum bin size then trim down to max size
-            while (bin.size() > maxBinSize)
-            {
-                // find points p1, p2 which have smallest difference
-                Iterator<Double> keys = bin.keySet().iterator();
-                double p1 = keys.next();
-                double p2 = keys.next();
-                double smallestDiff = p2 - p1;
-                double q1 = p1, q2 = p2;
-                while (keys.hasNext())
-                {
-                    p1 = p2;
-                    p2 = keys.next();
-                    double diff = p2 - p1;
-                    if (diff < smallestDiff)
-                    {
-                        smallestDiff = diff;
-                        q1 = p1;
-                        q2 = p2;
-                    }
-                }
-                // merge those two
-                long k1 = bin.remove(q1);
-                long k2 = bin.remove(q2);
-                bin.put((q1 * k1 + q2 * k2) / (k1 + k2), k1 + k2);
-            }
-        }
-    }
-
-    /**
-     * Merges given histogram with this histogram.
-     *
-     * @param other histogram to merge
-     */
-    public void merge(StreamingHistogram other)
-    {
-        if (other == null)
-            return;
-
-        for (Map.Entry<Double, Long> entry : other.getAsMap().entrySet())
-            update(entry.getKey(), entry.getValue());
-    }
-
-    /**
      * Calculates estimated number of points in interval [-inf,b].
      *
      * @param b upper bound of a interval to calculate sum
@@ -168,6 +88,147 @@
         return Collections.unmodifiableMap(bin);
     }
 
+    public static class StreamingHistogramBuilder
+    {
+        // TreeMap to hold bins of histogram.
+        private final TreeMap<Double, Long> bin;
+
+        // Keep a second, larger buffer to spool data in, before finalizing it into `bin`
+        private final TreeMap<Double, Long> spool;
+
+        // maximum bin size for this histogram
+        private final int maxBinSize;
+
+        // maximum size of the spool
+        private final int maxSpoolSize;
+
+        // voluntarily give up resolution for speed
+        private final int roundSeconds;
+        /**
+         * Creates a new histogram with max bin size of maxBinSize
+         * @param maxBinSize maximum number of bins this histogram can have
+         */
+        public StreamingHistogramBuilder(int maxBinSize, int maxSpoolSize, int roundSeconds)
+        {
+            this.maxBinSize = maxBinSize;
+            this.maxSpoolSize = maxSpoolSize;
+            this.roundSeconds = roundSeconds;
+            bin = new TreeMap<>();
+            spool = new TreeMap<>();
+        }
+
+        public StreamingHistogram build()
+        {
+            flushHistogram();
+            return new StreamingHistogram(maxBinSize, bin);
+        }
+        /**
+         * Adds new point p to this histogram.
+         * @param p
+         */
+        public void update(double p)
+        {
+            update(p, 1);
+        }
+
+        /**
+         * Adds new point p with value m to this histogram.
+         * @param p
+         * @param m
+         */
+        public void update(double p, long m)
+        {
+            double d = p % this.roundSeconds;
+            if (d > 0)
+                p = p + (this.roundSeconds - d);
+
+            Long mi = spool.get(p);
+            if (mi != null)
+            {
+                // we found the same p so increment that counter
+                spool.put(p, mi + m);
+            }
+            else
+            {
+                spool.put(p, m);
+            }
+            if(spool.size() > maxSpoolSize)
+                flushHistogram();
+        }
+
+        /**
+         * Drain the temporary spool into the final bins
+         */
+        public void flushHistogram()
+        {
+            if(spool.size() > 0)
+            {
+                Long spoolValue;
+                Long binValue;
+
+                // Iterate over the spool, copying the value into the primary bin map
+                // and compacting that map as necessary
+                for (Map.Entry<Double, Long> entry : spool.entrySet())
+                {
+                    Double key = entry.getKey();
+                    spoolValue = entry.getValue();
+                    binValue = bin.get(key);
+
+                    if (binValue != null)
+                    {
+                        binValue += spoolValue;
+                        bin.put(key, binValue);
+                    } else
+                    {
+                        bin.put(key, spoolValue);
+                    }
+
+                    // if bin size exceeds maximum bin size then trim down to max size
+                    if (bin.size() > maxBinSize)
+                    {
+                        // find points p1, p2 which have smallest difference
+                        Iterator<Double> keys = bin.keySet().iterator();
+                        double p1 = keys.next();
+                        double p2 = keys.next();
+                        double smallestDiff = p2 - p1;
+                        double q1 = p1, q2 = p2;
+                        while (keys.hasNext()) {
+                            p1 = p2;
+                            p2 = keys.next();
+                            double diff = p2 - p1;
+                            if (diff < smallestDiff) {
+                                smallestDiff = diff;
+                                q1 = p1;
+                                q2 = p2;
+                            }
+                        }
+                        // merge those two
+                        long k1 = bin.remove(q1);
+                        long k2 = bin.remove(q2);
+                        bin.put((q1 * k1 + q2 * k2) / (k1 + k2), k1 + k2);
+                    }
+                }
+                spool.clear();
+            }
+        }
+
+        /**
+        * Merges given histogram with this histogram.
+        *
+        * @param other histogram to merge
+        */
+        public void merge(StreamingHistogram other)
+        {
+            if (other == null)
+                return;
+
+            flushHistogram();
+
+            for (Map.Entry<Double, Long> entry : other.getAsMap().entrySet())
+                update(entry.getKey(), entry.getValue());
+        }
+    }
+
     public static class StreamingHistogramSerializer implements ISerializer<StreamingHistogram>
     {
         public void serialize(StreamingHistogram histogram, DataOutputPlus out) throws IOException
@@ -182,7 +243,7 @@
             }
         }
 
-        public StreamingHistogram deserialize(DataInput in) throws IOException
+        public StreamingHistogram deserialize(DataInputPlus in) throws IOException
         {
             int maxBinSize = in.readInt();
             int size = in.readInt();
@@ -195,11 +256,11 @@
             return new StreamingHistogram(maxBinSize, tmp);
         }
 
-        public long serializedSize(StreamingHistogram histogram, TypeSizes typeSizes)
+        public long serializedSize(StreamingHistogram histogram)
         {
-            long size = typeSizes.sizeof(histogram.maxBinSize);
+            long size = TypeSizes.sizeof(histogram.maxBinSize);
             Map<Double, Long> entries = histogram.getAsMap();
-            size += typeSizes.sizeof(entries.size());
+            size += TypeSizes.sizeof(entries.size());
             // size of entries = size * (8(double) + 8(long))
             size += entries.size() * (8L + 8L);
             return size;
@@ -216,7 +277,8 @@
             return false;
 
         StreamingHistogram that = (StreamingHistogram) o;
-        return maxBinSize == that.maxBinSize && bin.equals(that.bin);
+        return maxBinSize == that.maxBinSize
+               && bin.equals(that.bin);
     }
 
     @Override
@@ -224,5 +286,4 @@
     {
         return Objects.hashCode(bin.hashCode(), maxBinSize);
     }
-
 }

diff --git a/src/java/org/apache/cassandra/utils/SyncUtil.java b/src/java/org/apache/cassandra/utils/SyncUtil.java
index 0d293aa..64d64cf 100644
--- a/src/java/org/apache/cassandra/utils/SyncUtil.java
+++ b/src/java/org/apache/cassandra/utils/SyncUtil.java

@@ -20,11 +20,7 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.FileDescriptor;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.io.SyncFailedException;
+import java.io.*;
 import java.lang.reflect.Field;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.ClosedChannelException;
@@ -180,6 +176,22 @@
         if (SKIP_SYNC)
             return;
         else
-            CLibrary.trySync(fd);
+            NativeLibrary.trySync(fd);
+    }
+
+    public static void trySyncDir(File dir)
+    {
+        if (SKIP_SYNC)
+            return;
+
+        int directoryFD = NativeLibrary.tryOpenDirectory(dir.getPath());
+        try
+        {
+            trySync(directoryFD);
+        }
+        finally
+        {
+            NativeLibrary.tryCloseFD(directoryFD);
+        }
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/Throwables.java b/src/java/org/apache/cassandra/utils/Throwables.java
index 82703c8..5ad9686 100644
--- a/src/java/org/apache/cassandra/utils/Throwables.java
+++ b/src/java/org/apache/cassandra/utils/Throwables.java

@@ -18,14 +18,26 @@
 */
 package org.apache.cassandra.utils;
 
+import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Optional;
+import java.util.stream.Stream;
 
-import com.google.common.base.Optional;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.FSWriteError;
 
-public class Throwables
+public final class Throwables
 {
+    public enum FileOpType { READ, WRITE }
 
-    public static Throwable merge(Throwable existingFail, Throwable newFail)
+    public interface DiscreteAction<E extends Exception>
+    {
+        void perform() throws E;
+    }
+
+    public static <T extends Throwable> T merge(T existingFail, T newFail)
     {
         if (existingFail == null)
             return newFail;
@@ -62,6 +74,85 @@
         return true;
     }
 
+    @SafeVarargs
+    public static <E extends Exception> void perform(DiscreteAction<? extends E> ... actions) throws E
+    {
+        perform(Stream.of(actions));
+    }
+
+    public static <E extends Exception> void perform(Stream<? extends DiscreteAction<? extends E>> stream, DiscreteAction<? extends E> ... extra) throws E
+    {
+        perform(Stream.concat(stream, Stream.of(extra)));
+    }
+
+    @SuppressWarnings("unchecked")
+    public static <E extends Exception> void perform(Stream<DiscreteAction<? extends E>> actions) throws E
+    {
+        Throwable fail = perform((Throwable) null, actions);
+        if (failIfCanCast(fail, null))
+            throw (E) fail;
+    }
+
+    public static Throwable perform(Throwable accumulate, DiscreteAction<?> ... actions)
+    {
+        return perform(accumulate, Arrays.stream(actions));
+    }
+
+    public static Throwable perform(Throwable accumulate, Stream<? extends DiscreteAction<?>> actions)
+    {
+        return perform(accumulate, actions.iterator());
+    }
+
+    public static Throwable perform(Throwable accumulate, Iterator<? extends DiscreteAction<?>> actions)
+    {
+        while (actions.hasNext())
+        {
+            DiscreteAction<?> action = actions.next();
+            try
+            {
+                action.perform();
+            }
+            catch (Throwable t)
+            {
+                accumulate = merge(accumulate, t);
+            }
+        }
+        return accumulate;
+    }
+
+    @SafeVarargs
+    public static void perform(File against, FileOpType opType, DiscreteAction<? extends IOException> ... actions)
+    {
+        perform(against.getPath(), opType, actions);
+    }
+
+    @SafeVarargs
+    public static void perform(String filePath, FileOpType opType, DiscreteAction<? extends IOException> ... actions)
+    {
+        maybeFail(perform(null, filePath, opType, actions));
+    }
+
+    @SafeVarargs
+    public static Throwable perform(Throwable accumulate, String filePath, FileOpType opType, DiscreteAction<? extends IOException> ... actions)
+    {
+        return perform(accumulate, filePath, opType, Arrays.stream(actions));
+    }
+
+    public static Throwable perform(Throwable accumulate, String filePath, FileOpType opType, Stream<DiscreteAction<? extends IOException>> actions)
+    {
+        return perform(accumulate, actions.map((action) -> () ->
+        {
+            try
+            {
+                action.perform();
+            }
+            catch (IOException e)
+            {
+                throw (opType == FileOpType.WRITE) ? new FSWriteError(e, filePath) : new FSReadError(e, filePath);
+            }
+        }));
+    }
+
     public static Throwable close(Throwable accumulate, Iterable<? extends AutoCloseable> closeables)
     {
         for (AutoCloseable closeable : closeables)
@@ -88,6 +179,6 @@
             if (cause instanceof IOException)
                 return Optional.of((IOException) cause);
         }
-        return Optional.absent();
+        return Optional.empty();
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/UUIDGen.java b/src/java/org/apache/cassandra/utils/UUIDGen.java
index 78b8b57..11c1895 100644
--- a/src/java/org/apache/cassandra/utils/UUIDGen.java
+++ b/src/java/org/apache/cassandra/utils/UUIDGen.java

@@ -25,9 +25,11 @@
 import java.util.Collection;
 import java.util.Random;
 import java.util.UUID;
+import java.util.concurrent.TimeUnit;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Charsets;
+import com.google.common.primitives.Ints;
 
 
 /**
@@ -203,6 +205,15 @@
 
     /**
      * @param uuid
+     * @return seconds since Unix epoch
+     */
+    public static int unixTimestampInSec(UUID uuid)
+    {
+        return Ints.checkedCast(TimeUnit.MILLISECONDS.toSeconds(unixTimestamp(uuid)));
+    }
+
+    /**
+     * @param uuid
      * @return microseconds since Unix epoch
      */
     public static long microsTimestamp(UUID uuid)
@@ -272,7 +283,7 @@
 
     private static long makeClockSeqAndNode()
     {
-        long clock = new Random(System.currentTimeMillis()).nextLong();
+        long clock = new SecureRandom().nextLong();
 
         long lsb = 0;
         lsb |= 0x8000000000000000L;                 // variant (2 bits)
@@ -349,7 +360,7 @@
                 messageDigest.update(addr.getAddress());
 
             // Identify the process on the load: we use both the PID and class loader hash.
-            long pid = SigarLibrary.instance.getPid();
+            long pid = NativeLibrary.getProcessID();
             if (pid < 0)
                 pid = new Random(System.currentTimeMillis()).nextLong();
             FBUtilities.updateWithLong(messageDigest, pid);

diff --git a/src/java/org/apache/cassandra/utils/UUIDSerializer.java b/src/java/org/apache/cassandra/utils/UUIDSerializer.java
index 2aa2b4e..47b6f8c 100644
--- a/src/java/org/apache/cassandra/utils/UUIDSerializer.java
+++ b/src/java/org/apache/cassandra/utils/UUIDSerializer.java

@@ -17,12 +17,12 @@
  */
 package org.apache.cassandra.utils;
 
-import java.io.DataInput;
 import java.io.IOException;
 import java.util.UUID;
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
 public class UUIDSerializer implements IVersionedSerializer<UUID>
@@ -35,13 +35,13 @@
         out.writeLong(uuid.getLeastSignificantBits());
     }
 
-    public UUID deserialize(DataInput in, int version) throws IOException
+    public UUID deserialize(DataInputPlus in, int version) throws IOException
     {
         return new UUID(in.readLong(), in.readLong());
     }
 
     public long serializedSize(UUID uuid, int version)
     {
-        return TypeSizes.NATIVE.sizeof(uuid.getMostSignificantBits()) + TypeSizes.NATIVE.sizeof(uuid.getLeastSignificantBits());
+        return TypeSizes.sizeof(uuid.getMostSignificantBits()) + TypeSizes.sizeof(uuid.getLeastSignificantBits());
     }
-}
\ No newline at end of file
+}

diff --git a/src/java/org/apache/cassandra/utils/WindowsTimer.java b/src/java/org/apache/cassandra/utils/WindowsTimer.java
index 9db8559..351751f 100644
--- a/src/java/org/apache/cassandra/utils/WindowsTimer.java
+++ b/src/java/org/apache/cassandra/utils/WindowsTimer.java

@@ -34,6 +34,10 @@
         {
             Native.register("winmm");
         }
+        catch (NoClassDefFoundError e)
+        {
+            logger.warn("JNA not found. winmm.dll cannot be registered. Performance will be negatively impacted on this node.");
+        }
         catch (Exception e)
         {
             logger.error("Failed to register winmm.dll. Performance will be negatively impacted on this node.");

diff --git a/src/java/org/apache/cassandra/utils/btree/BTree.java b/src/java/org/apache/cassandra/utils/btree/BTree.java
index 1145d12..e6e6e40 100644
--- a/src/java/org/apache/cassandra/utils/btree/BTree.java
+++ b/src/java/org/apache/cassandra/utils/btree/BTree.java

@@ -18,28 +18,32 @@
  */
 package org.apache.cassandra.utils.btree;
 
-import java.util.ArrayDeque;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Queue;
+import java.util.*;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Ordering;
 
 import org.apache.cassandra.utils.ObjectSizes;
 
-import static org.apache.cassandra.utils.btree.UpdateFunction.NoOp;
+import static com.google.common.collect.Iterables.concat;
+import static com.google.common.collect.Iterables.filter;
+import static com.google.common.collect.Iterables.transform;
+import static java.lang.Math.max;
+import static java.lang.Math.min;
 
 public class BTree
 {
     /**
      * Leaf Nodes are a raw array of values: Object[V1, V1, ...,].
      *
-     * Branch Nodes: Object[V1, V2, ..., child[&lt;V1.key], child[&lt;V2.key], ..., child[&lt; Inf]], where
+     * Branch Nodes: Object[V1, V2, ..., child[&lt;V1.key], child[&lt;V2.key], ..., child[&lt; Inf], size], where
      * each child is another node, i.e., an Object[].  Thus, the value elements in a branch node are the
-     * first half of the array, rounding down.  In our implementation, each value must include its own key;
+     * first half of the array (minus one).  In our implementation, each value must include its own key;
      * we access these via Comparator, rather than directly. 
      *
-     * So we can quickly distinguish between leaves and branches, we require that leaf nodes are always even number
-     * of elements (padded with a null, if necessary), and branches are always an odd number of elements.
+     * So we can quickly distinguish between leaves and branches, we require that leaf nodes are always an odd number
+     * of elements (padded with a null, if necessary), and branches are always an even number of elements.
      *
      * BTrees are immutable; updating one returns a new tree that reuses unmodified nodes.
      *
@@ -64,228 +68,390 @@
     static final int FAN_FACTOR = 1 << FAN_SHIFT;
 
     // An empty BTree Leaf - which is the same as an empty BTree
-    static final Object[] EMPTY_LEAF = new Object[0];
+    static final Object[] EMPTY_LEAF = new Object[1];
 
     // An empty BTree branch - used only for internal purposes in Modifier
-    static final Object[] EMPTY_BRANCH = new Object[1];
+    static final Object[] EMPTY_BRANCH = new Object[] { null, new int[0] };
 
-    /**
-     * Returns an empty BTree
-     *
-     * @return
-     */
+    // direction of iteration
+    public static enum Dir
+    {
+        ASC, DESC;
+        public Dir invert() { return this == ASC ? DESC : ASC; }
+        public static Dir asc(boolean asc) { return asc ? ASC : DESC; }
+        public static Dir desc(boolean desc) { return desc ? DESC : ASC; }
+    }
+
     public static Object[] empty()
     {
         return EMPTY_LEAF;
     }
 
-    public static <V> Object[] build(Collection<V> source, Comparator<V> comparator, boolean sorted, UpdateFunction<V> updateF)
+    public static Object[] singleton(Object value)
     {
-        return build(source, source.size(), comparator, sorted, updateF);
+        return new Object[] { value };
+    }
+
+    public static <C, K extends C, V extends C> Object[] build(Collection<K> source, UpdateFunction<K, V> updateF)
+    {
+        return buildInternal(source, source.size(), updateF);
+    }
+
+    public static <C, K extends C, V extends C> Object[] build(Iterable<K> source, UpdateFunction<K, V> updateF)
+    {
+        return buildInternal(source, -1, updateF);
     }
 
     /**
      * Creates a BTree containing all of the objects in the provided collection
      *
-     * @param source     the items to build the tree with
-     * @param comparator the comparator that defines the ordering over the items in the tree
-     * @param sorted     if false, the collection will be copied and sorted to facilitate construction
-     * @param <V>
-     * @return
+     * @param source  the items to build the tree with. MUST BE IN STRICTLY ASCENDING ORDER.
+     * @param size    the size of the source iterable
+     * @return        a btree representing the contents of the provided iterable
      */
-    public static <V> Object[] build(Iterable<V> source, int size, Comparator<V> comparator, boolean sorted, UpdateFunction<V> updateF)
+    public static <C, K extends C, V extends C> Object[] build(Iterable<K> source, int size, UpdateFunction<K, V> updateF)
     {
-        if (size < FAN_FACTOR)
+        if (size < 0)
+            throw new IllegalArgumentException(Integer.toString(size));
+        return buildInternal(source, size, updateF);
+    }
+
+    /**
+     * As build(), except:
+     * @param size    < 0 if size is unknown
+     */
+    private static <C, K extends C, V extends C> Object[] buildInternal(Iterable<K> source, int size, UpdateFunction<K, V> updateF)
+    {
+        if ((size >= 0) & (size < FAN_FACTOR))
         {
-            // pad to even length to match contract that all leaf nodes are even
-            V[] values = (V[]) new Object[size + (size & 1)];
+            if (size == 0)
+                return EMPTY_LEAF;
+            // pad to odd length to match contract that all leaf nodes are odd
+            V[] values = (V[]) new Object[size | 1];
             {
                 int i = 0;
-                for (V v : source)
-                    values[i++] = v;
+                for (K k : source)
+                    values[i++] = updateF.apply(k);
             }
-
-            // inline sorting since we're already calling toArray
-            if (!sorted)
-                Arrays.sort(values, 0, size, comparator);
-
-            // if updateF is specified
-            if (updateF != null)
-            {
-                for (int i = 0 ; i < size ; i++)
-                    values[i] = updateF.apply(values[i]);
-                updateF.allocated(ObjectSizes.sizeOfArray(values));
-            }
+            updateF.allocated(ObjectSizes.sizeOfArray(values));
             return values;
         }
 
-        if (!sorted)
-            source = sorted(source, comparator, size);
-
-        Queue<Builder> queue = modifier.get();
-        Builder builder = queue.poll();
+        Queue<TreeBuilder> queue = modifier.get();
+        TreeBuilder builder = queue.poll();
         if (builder == null)
-            builder = new Builder();
+            builder = new TreeBuilder();
         Object[] btree = builder.build(source, updateF, size);
         queue.add(builder);
         return btree;
     }
 
-    /**
-     * Returns a new BTree with the provided set inserting/replacing as necessary any equal items
-     *
-     * @param btree              the tree to update
-     * @param comparator         the comparator that defines the ordering over the items in the tree
-     * @param updateWith         the items to either insert / update
-     * @param updateWithIsSorted if false, updateWith will be copied and sorted to facilitate construction
-     * @param <V>
-     * @return
-     */
-    public static <V> Object[] update(Object[] btree, Comparator<V> comparator, Collection<V> updateWith, boolean updateWithIsSorted)
+    public static <C, K extends C, V extends C> Object[] update(Object[] btree,
+                                                                Comparator<C> comparator,
+                                                                Collection<K> updateWith,
+                                                                UpdateFunction<K, V> updateF)
     {
-        return update(btree, comparator, updateWith, updateWithIsSorted, NoOp.<V>instance());
-    }
-
-    public static <V> Object[] update(Object[] btree,
-                                      Comparator<V> comparator,
-                                      Collection<V> updateWith,
-                                      boolean updateWithIsSorted,
-                                      UpdateFunction<V> updateF)
-    {
-        return update(btree, comparator, updateWith, updateWith.size(), updateWithIsSorted, updateF);
+        return update(btree, comparator, updateWith, updateWith.size(), updateF);
     }
 
     /**
-     * Returns a new BTree with the provided set inserting/replacing as necessary any equal items
+     * Returns a new BTree with the provided collection inserting/replacing as necessary any equal items
      *
      * @param btree              the tree to update
      * @param comparator         the comparator that defines the ordering over the items in the tree
-     * @param updateWith         the items to either insert / update
-     * @param updateWithIsSorted if false, updateWith will be copied and sorted to facilitate construction
+     * @param updateWith         the items to either insert / update. MUST BE IN STRICTLY ASCENDING ORDER.
+     * @param updateWithLength   then number of elements in updateWith
      * @param updateF            the update function to apply to any pairs we are swapping, and maybe abort early
      * @param <V>
      * @return
      */
-    public static <V> Object[] update(Object[] btree,
-                                      Comparator<V> comparator,
-                                      Iterable<V> updateWith,
-                                      int updateWithLength,
-                                      boolean updateWithIsSorted,
-                                      UpdateFunction<V> updateF)
+    public static <C, K extends C, V extends C> Object[] update(Object[] btree,
+                                                                Comparator<C> comparator,
+                                                                Iterable<K> updateWith,
+                                                                int updateWithLength,
+                                                                UpdateFunction<K, V> updateF)
     {
-        if (btree.length == 0)
-            return build(updateWith, updateWithLength, comparator, updateWithIsSorted, updateF);
+        if (isEmpty(btree))
+            return build(updateWith, updateWithLength, updateF);
 
-        if (!updateWithIsSorted)
-            updateWith = sorted(updateWith, comparator, updateWithLength);
-
-        Queue<Builder> queue = modifier.get();
-        Builder builder = queue.poll();
+        Queue<TreeBuilder> queue = modifier.get();
+        TreeBuilder builder = queue.poll();
         if (builder == null)
-            builder = new Builder();
+            builder = new TreeBuilder();
         btree = builder.update(btree, comparator, updateWith, updateF);
         queue.add(builder);
         return btree;
     }
 
+    public static <K> Object[] merge(Object[] tree1, Object[] tree2, Comparator<? super K> comparator, UpdateFunction<K, K> updateF)
+    {
+        if (size(tree1) < size(tree2))
+        {
+            Object[] tmp = tree1;
+            tree1 = tree2;
+            tree2 = tmp;
+        }
+        return update(tree1, comparator, new BTreeSet<K>(tree2, comparator), updateF);
+    }
+
+    public static <V> Iterator<V> iterator(Object[] btree)
+    {
+        return iterator(btree, Dir.ASC);
+    }
+
+    public static <V> Iterator<V> iterator(Object[] btree, Dir dir)
+    {
+        return new BTreeSearchIterator<V, V>(btree, null, dir);
+    }
+
+    public static <V> Iterator<V> iterator(Object[] btree, int lb, int ub, Dir dir)
+    {
+        return new BTreeSearchIterator<V, V>(btree, null, dir, lb, ub);
+    }
+
+    public static <V> Iterable<V> iterable(Object[] btree)
+    {
+        return iterable(btree, Dir.ASC);
+    }
+
+    public static <V> Iterable<V> iterable(Object[] btree, Dir dir)
+    {
+        return () -> iterator(btree, dir);
+    }
+
+    public static <V> Iterable<V> iterable(Object[] btree, int lb, int ub, Dir dir)
+    {
+        return () -> iterator(btree, lb, ub, dir);
+    }
+
     /**
      * Returns an Iterator over the entire tree
      *
-     * @param btree    the tree to iterate over
-     * @param forwards if false, the iterator will start at the end and move backwards
+     * @param btree  the tree to iterate over
+     * @param dir    direction of iteration
      * @param <V>
      * @return
      */
-    public static <V> Cursor<V, V> slice(Object[] btree, boolean forwards)
+    public static <K, V> BTreeSearchIterator<K, V> slice(Object[] btree, Comparator<? super K> comparator, Dir dir)
     {
-        Cursor<V, V> r = new Cursor<>();
-        r.reset(btree, forwards);
-        return r;
+        return new BTreeSearchIterator<>(btree, comparator, dir);
     }
 
     /**
-     * Returns an Iterator over a sub-range of the tree
-     *
      * @param btree      the tree to iterate over
      * @param comparator the comparator that defines the ordering over the items in the tree
-     * @param start      the first item to include
-     * @param end        the last item to include
-     * @param forwards   if false, the iterator will start at end and move backwards
-     * @param <V>
-     * @return
+     * @param start      the beginning of the range to return, inclusive (in ascending order)
+     * @param end        the end of the range to return, exclusive (in ascending order)
+     * @param dir   if false, the iterator will start at the last item and move backwards
+     * @return           an Iterator over the defined sub-range of the tree
      */
-    public static <K, V extends K> Cursor<K, V> slice(Object[] btree, Comparator<K> comparator, K start, K end, boolean forwards)
+    public static <K, V extends K> BTreeSearchIterator<K, V> slice(Object[] btree, Comparator<? super K> comparator, K start, K end, Dir dir)
     {
-        Cursor<K, V> r = new Cursor<>();
-        r.reset(btree, comparator, start, end, forwards);
-        return r;
+        return slice(btree, comparator, start, true, end, false, dir);
     }
 
     /**
-     * Returns an Iterator over a sub-range of the tree
-     *
-     * @param btree      the tree to iterate over
-     * @param comparator the comparator that defines the ordering over the items in the tree
-     * @param start      the first item to include
-     * @param end        the last item to include
-     * @param forwards   if false, the iterator will start at end and move backwards
-     * @param <V>
-     * @return
+     * @param btree          the tree to iterate over
+     * @param comparator     the comparator that defines the ordering over the items in the tree
+     * @param start          low bound of the range
+     * @param startInclusive inclusivity of lower bound
+     * @param end            high bound of the range
+     * @param endInclusive   inclusivity of higher bound
+     * @param dir            direction of iteration
+     * @return               an Iterator over the defined sub-range of the tree
      */
-    public static <K, V extends K> Cursor<K, V> slice(Object[] btree, Comparator<K> comparator, K start, boolean startInclusive, K end, boolean endInclusive, boolean forwards)
+    public static <K, V extends K> BTreeSearchIterator<K, V> slice(Object[] btree, Comparator<? super K> comparator, K start, boolean startInclusive, K end, boolean endInclusive, Dir dir)
     {
-        Cursor<K, V> r = new Cursor<>();
-        r.reset(btree, comparator, start, startInclusive, end, endInclusive, forwards);
-        return r;
+        int inclusiveLowerBound = max(0,
+                                      start == null ? Integer.MIN_VALUE
+                                                    : startInclusive ? ceilIndex(btree, comparator, start)
+                                                                     : higherIndex(btree, comparator, start));
+        int inclusiveUpperBound = min(size(btree) - 1,
+                                      end == null ? Integer.MAX_VALUE
+                                                  : endInclusive ? floorIndex(btree, comparator, end)
+                                                                 : lowerIndex(btree, comparator, end));
+        return new BTreeSearchIterator<>(btree, comparator, dir, inclusiveLowerBound, inclusiveUpperBound);
     }
 
-    public static <V> V find(Object[] node, Comparator<V> comparator, V find)
+    /**
+     * @return the item in the tree that sorts as equal to the search argument, or null if no such item
+     */
+    public static <V> V find(Object[] node, Comparator<? super V> comparator, V find)
     {
         while (true)
         {
             int keyEnd = getKeyEnd(node);
-            int i = BTree.find(comparator, find, node, 0, keyEnd);
+            int i = Arrays.binarySearch((V[]) node, 0, keyEnd, find, comparator);
+
+            if (i >= 0)
+                return (V) node[i];
+
+            if (isLeaf(node))
+                return null;
+
+            i = -1 - i;
+            node = (Object[]) node[keyEnd + i];
+        }
+    }
+
+    /**
+     * Modifies the provided btree directly. THIS SHOULD NOT BE USED WITHOUT EXTREME CARE as BTrees are meant to be immutable.
+     * Finds and replaces the provided item in the tree. Both should sort as equal to each other (although this is not enforced)
+     */
+    public static <V> void replaceInSitu(Object[] node, Comparator<? super V> comparator, V find, V replace)
+    {
+        while (true)
+        {
+            int keyEnd = getKeyEnd(node);
+            int i = Arrays.binarySearch((V[]) node, 0, keyEnd, find, comparator);
+
             if (i >= 0)
             {
-                return (V) node[i];
+                assert find == node[i];
+                node[i] = replace;
+                return;
             }
-            else if (!isLeaf(node))
-            {
-                i = -i - 1;
-                node = (Object[]) node[keyEnd + i];
-            }
-            else
-            {
-                return null;
-            }
+
+            if (isLeaf(node))
+                throw new NoSuchElementException();
+
+            i = -1 - i;
+            node = (Object[]) node[keyEnd + i];
         }
     }
 
+    /**
+     * Honours result semantics of {@link Arrays#binarySearch}, as though it were performed on the tree flattened into an array
+     * @return index of item in tree, or <tt>(-(<i>insertion point</i>) - 1)</tt> if not present
+     */
+    public static <V> int findIndex(Object[] node, Comparator<? super V> comparator, V find)
+    {
+        int lb = 0;
+        while (true)
+        {
+            int keyEnd = getKeyEnd(node);
+            int i = Arrays.binarySearch((V[]) node, 0, keyEnd, find, comparator);
+            boolean exact = i >= 0;
+
+            if (isLeaf(node))
+                return exact ? lb + i : i - lb;
+
+            if (!exact)
+                i = -1 - i;
+
+            int[] sizeMap = getSizeMap(node);
+            if (exact)
+                return lb + sizeMap[i];
+            else if (i > 0)
+                lb += sizeMap[i - 1] + 1;
+
+            node = (Object[]) node[keyEnd + i];
+        }
+    }
+
+    /**
+     * @return the value at the index'th position in the tree, in tree order
+     */
+    public static <V> V findByIndex(Object[] tree, int index)
+    {
+        // WARNING: if semantics change, see also InternalCursor.seekTo, which mirrors this implementation
+        if ((index < 0) | (index >= size(tree)))
+            throw new IndexOutOfBoundsException(index + " not in range [0.." + size(tree) + ")");
+
+        Object[] node = tree;
+        while (true)
+        {
+            if (isLeaf(node))
+            {
+                int keyEnd = getLeafKeyEnd(node);
+                assert index < keyEnd;
+                return (V) node[index];
+            }
+
+            int[] sizeMap = getSizeMap(node);
+            int boundary = Arrays.binarySearch(sizeMap, index);
+            if (boundary >= 0)
+            {
+                // exact match, in this branch node
+                assert boundary < sizeMap.length - 1;
+                return (V) node[boundary];
+            }
+
+            boundary = -1 -boundary;
+            if (boundary > 0)
+            {
+                assert boundary < sizeMap.length;
+                index -= (1 + sizeMap[boundary - 1]);
+            }
+            node = (Object[]) node[getChildStart(node) + boundary];
+        }
+    }
+
+    /* since we have access to binarySearch semantics within indexOf(), we can use this to implement
+     * lower/upper/floor/higher very trivially
+     *
+     * this implementation is *not* optimal; it requires two logarithmic traversals, although the second is much cheaper
+     * (having less height, and operating over only primitive arrays), and the clarity is compelling
+     */
+
+    public static <V> int lowerIndex(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = findIndex(btree, comparator, find);
+        if (i < 0)
+            i = -1 -i;
+        return i - 1;
+    }
+
+    public static <V> V lower(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = lowerIndex(btree, comparator, find);
+        return i >= 0 ? findByIndex(btree, i) : null;
+    }
+
+    public static <V> int floorIndex(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = findIndex(btree, comparator, find);
+        if (i < 0)
+            i = -2 -i;
+        return i;
+    }
+
+    public static <V> V floor(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = floorIndex(btree, comparator, find);
+        return i >= 0 ? findByIndex(btree, i) : null;
+    }
+
+    public static <V> int higherIndex(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = findIndex(btree, comparator, find);
+        if (i < 0) i = -1 -i;
+        else i++;
+        return i;
+    }
+
+    public static <V> V higher(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = higherIndex(btree, comparator, find);
+        return i < size(btree) ? findByIndex(btree, i) : null;
+    }
+
+    public static <V> int ceilIndex(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = findIndex(btree, comparator, find);
+        if (i < 0)
+            i = -1 -i;
+        return i;
+    }
+
+    public static <V> V ceil(Object[] btree, Comparator<? super V> comparator, V find)
+    {
+        int i = ceilIndex(btree, comparator, find);
+        return i < size(btree) ? findByIndex(btree, i) : null;
+    }
 
     // UTILITY METHODS
 
-    // same basic semantics as Arrays.binarySearch, but delegates to compare() method to avoid
-    // wrapping generic Comparator with support for Special +/- infinity sentinels
-    static <V> int find(Comparator<V> comparator, Object key, Object[] a, final int fromIndex, final int toIndex)
-    {
-        int low = fromIndex;
-        int high = toIndex - 1;
-
-        while (low <= high)
-        {
-            int mid = (low + high) / 2;
-            int cmp = comparator.compare((V) key, (V) a[mid]);
-
-            if (cmp > 0)
-                low = mid + 1;
-            else if (cmp < 0)
-                high = mid - 1;
-            else
-                return mid; // key found
-        }
-        return -(low + 1);  // key not found.
-    }
-
     // get the upper bound we should search in for keys in the node
     static int getKeyEnd(Object[] node)
     {
@@ -299,29 +465,87 @@
     static int getLeafKeyEnd(Object[] node)
     {
         int len = node.length;
-        if (len == 0)
-            return 0;
-        else if (node[len - 1] == null)
-            return len - 1;
-        else
-            return len;
+        return node[len - 1] == null ? len - 1 : len;
     }
 
     // return the boundary position between keys/children for the branch node
-    static int getBranchKeyEnd(Object[] node)
+    // == number of keys, as they are indexed from zero
+    static int getBranchKeyEnd(Object[] branchNode)
     {
-        return node.length / 2;
+        return (branchNode.length / 2) - 1;
+    }
+
+    /**
+     * @return first index in a branch node containing child nodes
+     */
+    static int getChildStart(Object[] branchNode)
+    {
+        return getBranchKeyEnd(branchNode);
+    }
+
+    /**
+     * @return last index + 1 in a branch node containing child nodes
+     */
+    static int getChildEnd(Object[] branchNode)
+    {
+        return branchNode.length - 1;
+    }
+
+    /**
+     * @return number of children in a branch node
+     */
+    static int getChildCount(Object[] branchNode)
+    {
+        return branchNode.length / 2;
+    }
+
+    /**
+     * @return the size map for the branch node
+     */
+    static int[] getSizeMap(Object[] branchNode)
+    {
+        return (int[]) branchNode[getChildEnd(branchNode)];
+    }
+
+    /**
+     * @return the size map for the branch node
+     */
+    static int lookupSizeMap(Object[] branchNode, int index)
+    {
+        return getSizeMap(branchNode)[index];
+    }
+
+    // get the size from the btree's index (fails if not present)
+    public static int size(Object[] tree)
+    {
+        if (isLeaf(tree))
+            return getLeafKeyEnd(tree);
+        int length = tree.length;
+        // length - 1 == getChildEnd == getPositionOfSizeMap
+        // (length / 2) - 1 == getChildCount - 1 == position of full tree size
+        // hard code this, as will be used often;
+        return ((int[]) tree[length - 1])[(length / 2) - 1];
+    }
+
+    public static long sizeOfStructureOnHeap(Object[] tree)
+    {
+        long size = ObjectSizes.sizeOfArray(tree);
+        if (isLeaf(tree))
+            return size;
+        for (int i = getChildStart(tree) ; i < getChildEnd(tree) ; i++)
+            size += sizeOfStructureOnHeap((Object[]) tree[i]);
+        return size;
     }
 
     // returns true if the provided node is a leaf, false if it is a branch
     static boolean isLeaf(Object[] node)
     {
-        return (node.length & 1) == 0;
+        return (node.length & 1) == 1;
     }
 
     public static boolean isEmpty(Object[] tree)
     {
-        return tree.length == 0;
+        return tree == EMPTY_LEAF;
     }
 
     public static int depth(Object[] tree)
@@ -335,54 +559,522 @@
         return depth;
     }
 
-    // Special class for making certain operations easier, so we can define a +/- Inf
-    static interface Special extends Comparable<Object> { }
-    static final Special POSITIVE_INFINITY = new Special()
+    /**
+     * Fill the target array with the contents of the provided subtree, in ascending order, starting at targetOffset
+     * @param tree source
+     * @param target array
+     * @param targetOffset offset in target array
+     * @return number of items copied (size of tree)
+     */
+    public static int toArray(Object[] tree, Object[] target, int targetOffset)
     {
-        public int compareTo(Object o)
-        {
-            return o == this ? 0 : 1;
-        }
-    };
-    static final Special NEGATIVE_INFINITY = new Special()
+        return toArray(tree, 0, size(tree), target, targetOffset);
+    }
+    public static int toArray(Object[] tree, int treeStart, int treeEnd, Object[] target, int targetOffset)
     {
-        public int compareTo(Object o)
+        if (isLeaf(tree))
         {
-            return o == this ? 0 : -1;
+            int count = treeEnd - treeStart;
+            System.arraycopy(tree, treeStart, target, targetOffset, count);
+            return count;
         }
-    };
 
-    private static final ThreadLocal<Queue<Builder>> modifier = new ThreadLocal<Queue<Builder>>()
+        int newTargetOffset = targetOffset;
+        int childCount = getChildCount(tree);
+        int childOffset = getChildStart(tree);
+        for (int i = 0 ; i < childCount ; i++)
+        {
+            int childStart = treeIndexOffsetOfChild(tree, i);
+            int childEnd = treeIndexOfBranchKey(tree, i);
+            if (childStart <= treeEnd && childEnd >= treeStart)
+            {
+                newTargetOffset += toArray((Object[]) tree[childOffset + i], max(0, treeStart - childStart), min(childEnd, treeEnd) - childStart,
+                                           target, newTargetOffset);
+                if (treeStart <= childEnd && treeEnd > childEnd) // this check will always fail for the non-existent key
+                    target[newTargetOffset++] = tree[i];
+            }
+        }
+        return newTargetOffset - targetOffset;
+    }
+
+    // simple class for avoiding duplicate transformation work
+    private static class FiltrationTracker<V> implements Function<V, V>
+    {
+        final Function<? super V, ? extends V> wrapped;
+        int index;
+        boolean failed;
+
+        private FiltrationTracker(Function<? super V, ? extends V> wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        public V apply(V i)
+        {
+            V o = wrapped.apply(i);
+            if (o != null) index++;
+            else failed = true;
+            return o;
+        }
+    }
+
+    /**
+     * Takes a btree and transforms it using the provided function, filtering out any null results.
+     * The result of any transformation must sort identically wrt the other results as their originals
+     */
+    public static <V> Object[] transformAndFilter(Object[] btree, Function<? super V, ? extends V> function)
+    {
+        if (isEmpty(btree))
+            return btree;
+
+        // TODO: can be made more efficient
+        FiltrationTracker<V> wrapped = new FiltrationTracker<>(function);
+        Object[] result = transformAndFilter(btree, wrapped);
+        if (!wrapped.failed)
+            return result;
+
+        // take the already transformed bits from the head of the partial result
+        Iterable<V> head = iterable(result, 0, wrapped.index - 1, Dir.ASC);
+        // and concatenate with remainder of original tree, with transformation applied
+        Iterable<V> remainder = iterable(btree, wrapped.index + 1, size(btree) - 1, Dir.ASC);
+        remainder = filter(transform(remainder, function), (x) -> x != null);
+        Iterable<V> build = concat(head, remainder);
+
+        return buildInternal(build, -1, UpdateFunction.<V>noOp());
+    }
+
+    private static <V> Object[] transformAndFilter(Object[] btree, FiltrationTracker<V> function)
+    {
+        Object[] result = btree;
+        boolean isLeaf = isLeaf(btree);
+        int childOffset = isLeaf ? Integer.MAX_VALUE : getChildStart(btree);
+        int limit = isLeaf ? getLeafKeyEnd(btree) : btree.length - 1;
+        for (int i = 0 ; i < limit ; i++)
+        {
+            // we want to visit in iteration order, so we visit our key nodes inbetween our children
+            int idx = isLeaf ? i : (i / 2) + (i % 2 == 0 ? childOffset : 0);
+            Object current = btree[idx];
+            Object updated = idx < childOffset ? function.apply((V) current) : transformAndFilter((Object[]) current, function);
+            if (updated != current)
+            {
+                if (result == btree)
+                    result = btree.clone();
+                result[idx] = updated;
+            }
+            if (function.failed)
+                return result;
+        }
+        return result;
+    }
+
+    public static boolean equals(Object[] a, Object[] b)
+    {
+        return size(a) == size(b) && Iterators.elementsEqual(iterator(a), iterator(b));
+    }
+
+    public static int hashCode(Object[] btree)
+    {
+        // we can't just delegate to Arrays.deepHashCode(),
+        // because two equivalent trees may be represented by differently shaped trees
+        int result = 1;
+        for (Object v : iterable(btree))
+            result = 31 * result + Objects.hashCode(v);
+        return result;
+
+    }
+
+    /**
+     * tree index => index of key wrt all items in the tree laid out serially
+     *
+     * This version of the method permits requesting out-of-bounds indexes, -1 and size
+     * @param root to calculate tree index within
+     * @param keyIndex root-local index of key to calculate tree-index
+     * @return the number of items preceding the key in the whole tree of root
+     */
+    public static int treeIndexOfKey(Object[] root, int keyIndex)
+    {
+        if (isLeaf(root))
+            return keyIndex;
+        int[] sizeMap = getSizeMap(root);
+        if ((keyIndex >= 0) & (keyIndex < sizeMap.length))
+            return sizeMap[keyIndex];
+        // we support asking for -1 or size, so that we can easily use this for iterator bounds checking
+        if (keyIndex < 0)
+            return -1;
+        return sizeMap[keyIndex - 1] + 1;
+    }
+
+    /**
+     * @param keyIndex node-local index of the key to calculate index of
+     * @return keyIndex; this method is here only for symmetry and clarity
+     */
+    public static int treeIndexOfLeafKey(int keyIndex)
+    {
+        return keyIndex;
+    }
+
+    /**
+     * @param root to calculate tree-index within
+     * @param keyIndex root-local index of key to calculate tree-index of
+     * @return the number of items preceding the key in the whole tree of root
+     */
+    public static int treeIndexOfBranchKey(Object[] root, int keyIndex)
+    {
+        return lookupSizeMap(root, keyIndex);
+    }
+
+    /**
+     * @param root to calculate tree-index within
+     * @param childIndex root-local index of *child* to calculate tree-index of
+     * @return the number of items preceding the child in the whole tree of root
+     */
+    public static int treeIndexOffsetOfChild(Object[] root, int childIndex)
+    {
+        if (childIndex == 0)
+            return 0;
+        return 1 + lookupSizeMap(root, childIndex - 1);
+    }
+
+    private static final ThreadLocal<Queue<TreeBuilder>> modifier = new ThreadLocal<Queue<TreeBuilder>>()
     {
         @Override
-        protected Queue<Builder> initialValue()
+        protected Queue<TreeBuilder> initialValue()
         {
             return new ArrayDeque<>();
         }
     };
 
-    // return a sorted collection
-    private static <V> Collection<V> sorted(Iterable<V> source, Comparator<V> comparator, int size)
+    public static <V> Builder<V> builder(Comparator<? super V> comparator)
     {
-        V[] vs = (V[]) new Object[size];
-        int i = 0;
-        for (V v : source)
-            vs[i++] = v;
-        Arrays.sort(vs, comparator);
-        return Arrays.asList(vs);
+        return new Builder<>(comparator);
+    }
+
+    public static <V> Builder<V> builder(Comparator<? super V> comparator, int initialCapacity)
+    {
+        return new Builder<>(comparator);
+    }
+
+    public static class Builder<V>
+    {
+
+        // a user-defined bulk resolution, to be applied manually via resolve()
+        public static interface Resolver
+        {
+            // can return a different output type to input, so long as sort order is maintained
+            // if a resolver is present, this method will be called for every sequence of equal inputs
+            // even those with only one item
+            Object resolve(Object[] array, int lb, int ub);
+        }
+
+        // a user-defined resolver that is applied automatically on encountering two duplicate values
+        public static interface QuickResolver<V>
+        {
+            // can return a different output type to input, so long as sort order is maintained
+            // if a resolver is present, this method will be called for every sequence of equal inputs
+            // even those with only one item
+            V resolve(V a, V b);
+        }
+
+        Comparator<? super V> comparator;
+        Object[] values;
+        int count;
+        boolean detected = true; // true if we have managed to cheaply ensure sorted (+ filtered, if resolver == null) as we have added
+        boolean auto = true; // false if the user has promised to enforce the sort order and resolve any duplicates
+        QuickResolver<V> quickResolver;
+
+        protected Builder(Comparator<? super V> comparator)
+        {
+            this(comparator, 16);
+        }
+
+        protected Builder(Comparator<? super V> comparator, int initialCapacity)
+        {
+            this.comparator = comparator;
+            this.values = new Object[initialCapacity];
+        }
+
+        private Builder(Builder<V> builder)
+        {
+            this.comparator = builder.comparator;
+            this.values = Arrays.copyOf(builder.values, builder.values.length);
+            this.count = builder.count;
+            this.detected = builder.detected;
+            this.auto = builder.auto;
+            this.quickResolver = builder.quickResolver;
+        }
+
+        /**
+         * Creates a copy of this {@code Builder}.
+         * @return a copy of this {@code Builder}.
+         */
+        public Builder<V> copy()
+        {
+            return new Builder<>(this);
+        }
+
+        public Builder<V> setQuickResolver(QuickResolver<V> quickResolver)
+        {
+            this.quickResolver = quickResolver;
+            return this;
+        }
+
+        public void reuse()
+        {
+            reuse(comparator);
+        }
+
+        public void reuse(Comparator<? super V> comparator)
+        {
+            this.comparator = comparator;
+            count = 0;
+            detected = true;
+        }
+
+        public Builder<V> auto(boolean auto)
+        {
+            this.auto = auto;
+            return this;
+        }
+
+        public Builder<V> add(V v)
+        {
+            if (count == values.length)
+                values = Arrays.copyOf(values, count * 2);
+
+            Object[] values = this.values;
+            int prevCount = this.count++;
+            values[prevCount] = v;
+
+            if (auto && detected && prevCount > 0)
+            {
+                V prev = (V) values[prevCount - 1];
+                int c = comparator.compare(prev, v);
+                if (c == 0 && auto)
+                {
+                    count = prevCount;
+                    if (quickResolver != null)
+                        values[prevCount - 1] = quickResolver.resolve(prev, v);
+                }
+                else if (c > 0)
+                {
+                    detected = false;
+                }
+            }
+
+            return this;
+        }
+
+        public Builder<V> addAll(Collection<V> add)
+        {
+            if (auto && add instanceof SortedSet && equalComparators(comparator, ((SortedSet) add).comparator()))
+            {
+                // if we're a SortedSet, permit quick order-preserving addition of items
+                // if we collect all duplicates, don't bother as merge will necessarily be more expensive than sorting at end
+                return mergeAll(add, add.size());
+            }
+            detected = false;
+            if (values.length < count + add.size())
+                values = Arrays.copyOf(values, max(count + add.size(), count * 2));
+            for (V v : add)
+                values[count++] = v;
+            return this;
+        }
+
+        private static boolean equalComparators(Comparator<?> a, Comparator<?> b)
+        {
+            return a == b || (isNaturalComparator(a) && isNaturalComparator(b));
+        }
+
+        private static boolean isNaturalComparator(Comparator<?> a)
+        {
+            return a == null || a == Comparator.naturalOrder() || a == Ordering.natural();
+        }
+
+        // iter must be in sorted order!
+        private Builder<V> mergeAll(Iterable<V> add, int addCount)
+        {
+            assert auto;
+            // ensure the existing contents are in order
+            autoEnforce();
+
+            int curCount = count;
+            // we make room for curCount * 2 + addCount, so that we can copy the current values to the end
+            // if necessary for continuing the merge, and have the new values directly after the current value range
+            if (values.length < curCount * 2 + addCount)
+                values = Arrays.copyOf(values, max(curCount * 2 + addCount, curCount * 3));
+
+            if (add instanceof BTreeSet)
+            {
+                // use btree set's fast toArray method, to append directly
+                ((BTreeSet) add).toArray(values, curCount);
+            }
+            else
+            {
+                // consider calling toArray() and System.arraycopy
+                int i = curCount;
+                for (V v : add)
+                    values[i++] = v;
+            }
+            return mergeAll(addCount);
+        }
+
+        private Builder<V> mergeAll(int addCount)
+        {
+            Object[] a = values;
+            int addOffset = count;
+
+            int i = 0, j = addOffset;
+            int curEnd = addOffset, addEnd = addOffset + addCount;
+
+            // save time in cases where we already have a subset, by skipping dir
+            while (i < curEnd && j < addEnd)
+            {
+                V ai = (V) a[i], aj = (V) a[j];
+                // in some cases, such as Columns, we may have identity supersets, so perform a cheap object-identity check
+                int c = ai == aj ? 0 : comparator.compare(ai, aj);
+                if (c > 0)
+                    break;
+                else if (c == 0)
+                {
+                    if (quickResolver != null)
+                        a[i] = quickResolver.resolve(ai, aj);
+                    j++;
+                }
+                i++;
+            }
+
+            if (j == addEnd)
+                return this; // already a superset of the new values
+
+            // otherwise, copy the remaining existing values to the very end, freeing up space for merge result
+            int newCount = i;
+            System.arraycopy(a, i, a, addEnd, count - i);
+            curEnd = addEnd + (count - i);
+            i = addEnd;
+
+            while (i < curEnd && j < addEnd)
+            {
+                V ai = (V) a[i];
+                V aj = (V) a[j];
+                // could avoid one comparison if we cared, but would make this ugly
+                int c = comparator.compare(ai, aj);
+                if (c == 0)
+                {
+                    Object newValue = quickResolver == null ? ai : quickResolver.resolve(ai, aj);
+                    a[newCount++] = newValue;
+                    i++;
+                    j++;
+                }
+                else
+                {
+                    a[newCount++] =  c < 0 ? a[i++] : a[j++];
+                }
+            }
+
+            // exhausted one of the inputs; fill in remainder of the other
+            if (i < curEnd)
+            {
+                System.arraycopy(a, i, a, newCount, curEnd - i);
+                newCount += curEnd - i;
+            }
+            else if (j < addEnd)
+            {
+                if (j != newCount)
+                    System.arraycopy(a, j, a, newCount, addEnd - j);
+                newCount += addEnd - j;
+            }
+            count = newCount;
+            return this;
+        }
+
+        public boolean isEmpty()
+        {
+            return count == 0;
+        }
+
+        public Builder<V> reverse()
+        {
+            assert !auto;
+            int mid = count / 2;
+            for (int i = 0 ; i < mid ; i++)
+            {
+                Object t = values[i];
+                values[i] = values[count - (1 + i)];
+                values[count - (1 + i)] = t;
+            }
+            return this;
+        }
+
+        public Builder<V> sort()
+        {
+            Arrays.sort((V[]) values, 0, count, comparator);
+            return this;
+        }
+
+        // automatically enforce sorted+filtered
+        private void autoEnforce()
+        {
+            if (!detected && count > 1)
+            {
+                sort();
+                int prevIdx = 0;
+                V prev = (V) values[0];
+                for (int i = 1 ; i < count ; i++)
+                {
+                    V next = (V) values[i];
+                    if (comparator.compare(prev, next) != 0)
+                        values[++prevIdx] = prev = next;
+                    else if (quickResolver != null)
+                        values[prevIdx] = prev = quickResolver.resolve(prev, next);
+                }
+                count = prevIdx + 1;
+            }
+            detected = true;
+        }
+
+        public Builder<V> resolve(Resolver resolver)
+        {
+            if (count > 0)
+            {
+                int c = 0;
+                int prev = 0;
+                for (int i = 1 ; i < count ; i++)
+                {
+                    if (comparator.compare((V) values[i], (V) values[prev]) != 0)
+                    {
+                        values[c++] = resolver.resolve((V[]) values, prev, i);
+                        prev = i;
+                    }
+                }
+                values[c++] = resolver.resolve((V[]) values, prev, count);
+                count = c;
+            }
+            return this;
+        }
+
+        public Object[] build()
+        {
+            if (auto)
+                autoEnforce();
+            return BTree.build(Arrays.asList(values).subList(0, count), UpdateFunction.noOp());
+        }
     }
 
     /** simple static wrapper to calls to cmp.compare() which checks if either a or b are Special (i.e. represent an infinity) */
-    // TODO : cheaper to check for POSITIVE/NEGATIVE infinity in callers, rather than here
     static <V> int compare(Comparator<V> cmp, Object a, Object b)
     {
-        if (a instanceof Special)
-            return ((Special) a).compareTo(b);
-        if (b instanceof Special)
-            return -((Special) b).compareTo(a);
+        if (a == b)
+            return 0;
+        if (a == NEGATIVE_INFINITY | b == POSITIVE_INFINITY)
+            return -1;
+        if (b == NEGATIVE_INFINITY | a == POSITIVE_INFINITY)
+            return 1;
         return cmp.compare((V) a, (V) b);
     }
 
+    static Object POSITIVE_INFINITY = new Object();
+    static Object NEGATIVE_INFINITY = new Object();
+
     public static boolean isWellFormed(Object[] btree, Comparator<? extends Object> cmp)
     {
         return isWellFormed(cmp, btree, true, NEGATIVE_INFINITY, POSITIVE_INFINITY);
@@ -396,17 +1088,16 @@
         if (isLeaf(node))
         {
             if (isRoot)
-                return node.length <= FAN_FACTOR;
-            return node.length >= FAN_FACTOR / 2 && node.length <= FAN_FACTOR;
+                return node.length <= FAN_FACTOR + 1;
+            return node.length >= FAN_FACTOR / 2 && node.length <= FAN_FACTOR + 1;
         }
 
         int type = 0;
-        int childOffset = getBranchKeyEnd(node);
         // compare each child node with the branch element at the head of this node it corresponds with
-        for (int i = childOffset; i < node.length; i++)
+        for (int i = getChildStart(node); i < getChildEnd(node) ; i++)
         {
             Object[] child = (Object[]) node[i];
-            Object localmax = i < node.length - 1 ? node[i - childOffset] : max;
+            Object localmax = i < node.length - 2 ? node[i - getChildStart(node)] : max;
             if (!isWellFormed(cmp, child, false, min, localmax))
                 return false;
             type |= isLeaf(child) ? 1 : 2;

diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java b/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java
index 7a83238..ec16a8e 100644
--- a/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java
+++ b/src/java/org/apache/cassandra/utils/btree/BTreeSearchIterator.java

@@ -1,67 +1,163 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
 package org.apache.cassandra.utils.btree;
 
 import java.util.Comparator;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
 
-import org.apache.cassandra.utils.SearchIterator;
+import org.apache.cassandra.utils.IndexedSearchIterator;
 
-import static org.apache.cassandra.utils.btree.BTree.getKeyEnd;
+import static org.apache.cassandra.utils.btree.BTree.size;
 
-public class BTreeSearchIterator<CK, K extends CK, V> extends Path implements SearchIterator<K, V>
+public class BTreeSearchIterator<K, V> extends TreeCursor<K> implements IndexedSearchIterator<K, V>, Iterator<V>
 {
+    private final boolean forwards;
 
-    final Comparator<CK> comparator;
-    public BTreeSearchIterator(Object[] btree, Comparator<CK> comparator)
+    // for simplicity, we just always use the index feature of the btree to maintain our bounds within the tree,
+    // whether or not they are constrained
+    private int index;
+    private byte state;
+    private final int lowerBound, upperBound; // inclusive
+
+    private static final int MIDDLE = 0; // only "exists" as an absence of other states
+    private static final int ON_ITEM = 1; // may only co-exist with LAST (or MIDDLE, which is 0)
+    private static final int BEFORE_FIRST = 2; // may not coexist with any other state
+    private static final int LAST = 4; // may co-exist with ON_ITEM, in which case we are also at END
+    private static final int END = 5; // equal to LAST | ON_ITEM
+
+    public BTreeSearchIterator(Object[] btree, Comparator<? super K> comparator, BTree.Dir dir)
     {
-        init(btree);
-        this.comparator = comparator;
+        this(btree, comparator, dir, 0, size(btree)-1);
     }
 
-    public V next(K target)
+    BTreeSearchIterator(Object[] btree, Comparator<? super K> comparator, BTree.Dir dir, int lowerBound, int upperBound)
     {
-        while (depth > 0)
-        {
-            byte successorParentDepth = findSuccessorParentDepth();
-            if (successorParentDepth < 0)
-                break; // we're in last section of tree, so can only search down
-            int successorParentIndex = indexes[successorParentDepth] + 1;
-            Object[] successParentNode = path[successorParentDepth];
-            Object successorParentKey = successParentNode[successorParentIndex];
-            int c = BTree.compare(comparator, target, successorParentKey);
-            if (c < 0)
-                break;
-            if (c == 0)
-            {
-                depth = successorParentDepth;
-                indexes[successorParentDepth]++;
-                return (V) successorParentKey;
-            }
-            depth = successorParentDepth;
-            indexes[successorParentDepth]++;
-        }
-        if (find(comparator, target, Op.CEIL, true))
-            return (V) currentKey();
-        return null;
+        super(comparator, btree);
+        this.forwards = dir == BTree.Dir.ASC;
+        this.lowerBound = lowerBound;
+        this.upperBound = upperBound;
+        rewind();
+    }
+
+    /**
+     * @return 0 if we are on the last item, 1 if we are past the last item, and -1 if we are before it
+     */
+    private int compareToLast(int idx)
+    {
+        return forwards ? idx - upperBound : lowerBound - idx;
+    }
+
+    private int compareToFirst(int idx)
+    {
+        return forwards ? idx - lowerBound : upperBound - idx;
     }
 
     public boolean hasNext()
     {
-        return depth != 0 || indexes[0] != getKeyEnd(path[0]);
+        return state != END;
+    }
+
+    public V next()
+    {
+        switch (state)
+        {
+            case ON_ITEM:
+                if (compareToLast(index = moveOne(forwards)) >= 0)
+                    state = END;
+                break;
+            case BEFORE_FIRST:
+                seekTo(index = forwards ? lowerBound : upperBound);
+                state = (byte) (upperBound == lowerBound ? LAST : MIDDLE);
+            case LAST:
+            case MIDDLE:
+                state |= ON_ITEM;
+                break;
+            default:
+                throw new NoSuchElementException();
+        }
+
+        return current();
+    }
+
+    public V next(K target)
+    {
+        if (!hasNext())
+            return null;
+
+        int state = this.state;
+        boolean found = seekTo(target, forwards, (state & (ON_ITEM | BEFORE_FIRST)) != 0);
+        int index = cur.globalIndex();
+
+        V next = null;
+        if (state == BEFORE_FIRST && compareToFirst(index) < 0)
+            return null;
+
+        int compareToLast = compareToLast(index);
+        if ((compareToLast <= 0))
+        {
+            state = compareToLast < 0 ? MIDDLE : LAST;
+            if (found)
+            {
+                state |= ON_ITEM;
+                next = (V) currentValue();
+            }
+        }
+        else state = END;
+
+        this.state = (byte) state;
+        this.index = index;
+        return next;
+    }
+
+    /**
+     * Reset this Iterator to its starting position
+     */
+    public void rewind()
+    {
+        if (upperBound < lowerBound)
+        {
+            state = (byte) END;
+        }
+        else
+        {
+            // we don't move into the tree until the first request is made, so we know where to go
+            reset(forwards);
+            state = (byte) BEFORE_FIRST;
+        }
+    }
+
+    private void checkOnItem()
+    {
+        if ((state & ON_ITEM) != ON_ITEM)
+            throw new NoSuchElementException();
+    }
+
+    public V current()
+    {
+        checkOnItem();
+        return (V) currentValue();
+    }
+
+    public int indexOfCurrent()
+    {
+        checkOnItem();
+        return compareToFirst(index);
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeSet.java b/src/java/org/apache/cassandra/utils/btree/BTreeSet.java
index d80b32e..03fa1ec 100644
--- a/src/java/org/apache/cassandra/utils/btree/BTreeSet.java
+++ b/src/java/org/apache/cassandra/utils/btree/BTreeSet.java

@@ -18,27 +18,32 @@
  */
 package org.apache.cassandra.utils.btree;
 
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.NavigableSet;
-import java.util.SortedSet;
+import java.util.*;
 
-public class BTreeSet<V> implements NavigableSet<V>
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Ordering;
+
+import org.apache.cassandra.utils.btree.BTree.Dir;
+
+import static org.apache.cassandra.utils.btree.BTree.findIndex;
+import static org.apache.cassandra.utils.btree.BTree.lower;
+import static org.apache.cassandra.utils.btree.BTree.toArray;
+
+public class BTreeSet<V> implements NavigableSet<V>, List<V>
 {
-    protected final Comparator<V> comparator;
+    protected final Comparator<? super V> comparator;
     protected final Object[] tree;
 
-    public BTreeSet(Object[] tree, Comparator<V> comparator)
+    public BTreeSet(Object[] tree, Comparator<? super V> comparator)
     {
         this.tree = tree;
         this.comparator = comparator;
     }
 
-    public BTreeSet<V> update(Collection<V> updateWith, boolean isSorted)
+    public BTreeSet<V> update(Collection<V> updateWith)
     {
-        return new BTreeSet<>(BTree.update(tree, comparator, updateWith, isSorted, UpdateFunction.NoOp.<V>instance()), comparator);
+        return new BTreeSet<>(BTree.update(tree, comparator, updateWith, UpdateFunction.<V>noOp()), comparator);
     }
 
     @Override
@@ -47,33 +52,64 @@
         return comparator;
     }
 
-    protected Cursor<V, V> slice(boolean forwards, boolean permitInversion)
+    protected BTreeSearchIterator<V, V> slice(Dir dir)
     {
-        return BTree.slice(tree, forwards);
+        return BTree.slice(tree, comparator, dir);
+    }
+
+    public Object[] tree()
+    {
+        return tree;
+    }
+
+    /**
+     * The index of the item within the list, or its insertion point otherwise. i.e. binarySearch semantics
+     */
+    public int indexOf(Object item)
+    {
+        return findIndex(tree, comparator, (V) item);
+    }
+
+    /**
+     * The converse of indexOf: provided an index between 0 and size, returns the i'th item, in set order.
+     */
+    public V get(int index)
+    {
+        return BTree.<V>findByIndex(tree, index);
+    }
+
+    public int lastIndexOf(Object o)
+    {
+        return indexOf(o);
+    }
+
+    public BTreeSet<V> subList(int fromIndex, int toIndex)
+    {
+        return new BTreeRange<V>(tree, comparator, fromIndex, toIndex - 1);
     }
 
     @Override
     public int size()
     {
-        return slice(true, false).count();
+        return BTree.size(tree);
     }
 
     @Override
     public boolean isEmpty()
     {
-        return slice(true, false).hasNext();
+        return BTree.isEmpty(tree);
     }
 
     @Override
-    public Iterator<V> iterator()
+    public BTreeSearchIterator<V, V> iterator()
     {
-        return slice(true, true);
+        return slice(Dir.ASC);
     }
 
     @Override
-    public Iterator<V> descendingIterator()
+    public BTreeSearchIterator<V, V> descendingIterator()
     {
-        return slice(false, true);
+        return slice(Dir.DESC);
     }
 
     @Override
@@ -85,29 +121,37 @@
     @Override
     public <T> T[] toArray(T[] a)
     {
+        return toArray(a, 0);
+    }
+
+    public <T> T[] toArray(T[] a, int offset)
+    {
         int size = size();
-        if (a.length < size)
+        if (a.length < size + offset)
             a = Arrays.copyOf(a, size);
-        int i = 0;
-        for (V v : this)
-            a[i++] = (T) v;
+        BTree.toArray(tree, a, offset);
         return a;
     }
 
+    public Spliterator<V> spliterator()
+    {
+        return Spliterators.spliterator(this, Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.IMMUTABLE | Spliterator.NONNULL | Spliterator.SIZED);
+    }
+
     @Override
-    public NavigableSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
+    public BTreeSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
     {
         return new BTreeRange<>(tree, comparator, fromElement, fromInclusive, toElement, toInclusive);
     }
 
     @Override
-    public NavigableSet<V> headSet(V toElement, boolean inclusive)
+    public BTreeSet<V> headSet(V toElement, boolean inclusive)
     {
         return new BTreeRange<>(tree, comparator, null, true, toElement, inclusive);
     }
 
     @Override
-    public NavigableSet<V> tailSet(V fromElement, boolean inclusive)
+    public BTreeSet<V> tailSet(V fromElement, boolean inclusive)
     {
         return new BTreeRange<>(tree, comparator, fromElement, inclusive, null, true);
     }
@@ -131,15 +175,71 @@
     }
 
     @Override
+    public BTreeSet<V> descendingSet()
+    {
+        return new BTreeRange<V>(this.tree, this.comparator).descendingSet();
+    }
+
+    @Override
     public V first()
     {
-        throw new UnsupportedOperationException();
+        return get(0);
     }
 
     @Override
     public V last()
     {
-        throw new UnsupportedOperationException();
+        return get(size() - 1);
+    }
+
+    @Override
+    public V lower(V v)
+    {
+        return BTree.lower(tree, comparator, v);
+    }
+
+    @Override
+    public V floor(V v)
+    {
+        return BTree.floor(tree, comparator, v);
+    }
+
+    @Override
+    public V ceiling(V v)
+    {
+        return BTree.ceil(tree, comparator, v);
+    }
+
+    @Override
+    public V higher(V v)
+    {
+        return BTree.higher(tree, comparator, v);
+    }
+
+    @Override
+    public boolean contains(Object o)
+    {
+        return indexOf((V) o) >= 0;
+    }
+
+    @Override
+    public boolean containsAll(Collection<?> c)
+    {
+        // TODO: if we ever use this method, it can be specialized quite easily for SortedSet arguments
+        for (Object o : c)
+            if (!contains(o))
+                return false;
+        return true;
+    }
+
+    public int hashCode()
+    {
+        // we can't just delegate to Arrays.deepHashCode(),
+        // because two equivalent sets may be represented by differently shaped trees
+        int result = 1;
+        for (V v : this)
+            result = 31 * result + Objects.hashCode(v);
+        return result;
     }
 
     @Override
@@ -148,6 +248,11 @@
         throw new UnsupportedOperationException();
     }
 
+    public boolean addAll(int index, Collection<? extends V> c)
+    {
+        throw new UnsupportedOperationException();
+    }
+
     @Override
     public boolean retainAll(Collection<?> c)
     {
@@ -190,194 +295,350 @@
         throw new UnsupportedOperationException();
     }
 
-    @Override
-    public V lower(V v)
+    public V set(int index, V element)
     {
         throw new UnsupportedOperationException();
     }
 
-    @Override
-    public V floor(V v)
+    public void add(int index, V element)
     {
         throw new UnsupportedOperationException();
     }
 
-    @Override
-    public V ceiling(V v)
+    public V remove(int index)
     {
         throw new UnsupportedOperationException();
     }
 
-    @Override
-    public V higher(V v)
+    public ListIterator<V> listIterator()
     {
         throw new UnsupportedOperationException();
     }
 
-    @Override
-    public boolean contains(Object o)
+    public ListIterator<V> listIterator(int index)
     {
         throw new UnsupportedOperationException();
     }
 
-    @Override
-    public boolean containsAll(Collection<?> c)
+    public static class BTreeRange<V> extends BTreeSet<V>
     {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public NavigableSet<V> descendingSet()
-    {
-        return new BTreeRange<>(this.tree, this.comparator).descendingSet();
-    }
-
-    public static class BTreeRange<V> extends BTreeSet<V> implements NavigableSet<V>
-    {
-
-        protected final V lowerBound, upperBound;
-        protected final boolean inclusiveLowerBound, inclusiveUpperBound;
-
-        BTreeRange(Object[] tree, Comparator<V> comparator)
+        // both inclusive
+        protected final int lowerBound, upperBound;
+        BTreeRange(Object[] tree, Comparator<? super V> comparator)
         {
             this(tree, comparator, null, true, null, true);
         }
 
         BTreeRange(BTreeRange<V> from)
         {
-            this(from.tree, from.comparator, from.lowerBound, from.inclusiveLowerBound, from.upperBound, from.inclusiveUpperBound);
+            super(from.tree, from.comparator);
+            this.lowerBound = from.lowerBound;
+            this.upperBound = from.upperBound;
         }
 
-        BTreeRange(Object[] tree, Comparator<V> comparator, V lowerBound, boolean inclusiveLowerBound, V upperBound, boolean inclusiveUpperBound)
+        BTreeRange(Object[] tree, Comparator<? super V> comparator, int lowerBound, int upperBound)
         {
             super(tree, comparator);
+            if (upperBound < lowerBound - 1)
+                upperBound = lowerBound - 1;
             this.lowerBound = lowerBound;
             this.upperBound = upperBound;
-            this.inclusiveLowerBound = inclusiveLowerBound;
-            this.inclusiveUpperBound = inclusiveUpperBound;
+        }
+
+        BTreeRange(Object[] tree, Comparator<? super V> comparator, V lowerBound, boolean inclusiveLowerBound, V upperBound, boolean inclusiveUpperBound)
+        {
+            this(tree, comparator,
+                 lowerBound == null ? 0 : inclusiveLowerBound ? BTree.ceilIndex(tree, comparator, lowerBound)
+                                                              : BTree.higherIndex(tree, comparator, lowerBound),
+                 upperBound == null ? BTree.size(tree) - 1 : inclusiveUpperBound ? BTree.floorIndex(tree, comparator, upperBound)
+                                                                                 : BTree.lowerIndex(tree, comparator, upperBound));
         }
 
         // narrowing range constructor - makes this the intersection of the two ranges over the same tree b
         BTreeRange(BTreeRange<V> a, BTreeRange<V> b)
         {
-            super(a.tree, a.comparator);
+            this(a.tree, a.comparator, Math.max(a.lowerBound, b.lowerBound), Math.min(a.upperBound, b.upperBound));
             assert a.tree == b.tree;
-            final BTreeRange<V> lb, ub;
-
-            if (a.lowerBound == null)
-            {
-                lb = b;
-            }
-            else if (b.lowerBound == null)
-            {
-                lb = a;
-            }
-            else
-            {
-                int c = comparator.compare(a.lowerBound, b.lowerBound);
-                if (c < 0)
-                    lb = b;
-                else if (c > 0)
-                    lb = a;
-                else if (!a.inclusiveLowerBound)
-                    lb = a;
-                else
-                    lb = b;
-            }
-
-            if (a.upperBound == null)
-            {
-                ub = b;
-            }
-            else if (b.upperBound == null)
-            {
-                ub = a;
-            }
-            else
-            {
-                int c = comparator.compare(b.upperBound, a.upperBound);
-                if (c < 0)
-                    ub = b;
-                else if (c > 0)
-                    ub = a;
-                else if (!a.inclusiveUpperBound)
-                    ub = a;
-                else
-                    ub = b;
-            }
-
-            lowerBound = lb.lowerBound;
-            inclusiveLowerBound = lb.inclusiveLowerBound;
-            upperBound = ub.upperBound;
-            inclusiveUpperBound = ub.inclusiveUpperBound;
         }
 
         @Override
-        protected Cursor<V, V> slice(boolean forwards, boolean permitInversion)
+        protected BTreeSearchIterator<V, V> slice(Dir dir)
         {
-            return BTree.slice(tree, comparator, lowerBound, inclusiveLowerBound, upperBound, inclusiveUpperBound, forwards);
+            return new BTreeSearchIterator<>(tree, comparator, dir, lowerBound, upperBound);
         }
 
         @Override
-        public NavigableSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
+        public boolean isEmpty()
+        {
+            return upperBound < lowerBound;
+        }
+
+        public int size()
+        {
+            return (upperBound - lowerBound) + 1;
+        }
+
+        boolean outOfBounds(int i)
+        {
+            return (i < lowerBound) | (i > upperBound);
+        }
+
+        public V get(int index)
+        {
+            index += lowerBound;
+            if (outOfBounds(index))
+                throw new NoSuchElementException();
+            return super.get(index);
+        }
+
+        public int indexOf(Object item)
+        {
+            int i = super.indexOf(item);
+            boolean negate = i < 0;
+            if (negate)
+                i = -1 - i;
+            if (outOfBounds(i))
+                return i < lowerBound ? -1 : -1 - size();
+            i = i - lowerBound;
+            if (negate)
+                i = -1 -i;
+            return i;
+        }
+
+        public V lower(V v)
+        {
+            return maybe(Math.min(upperBound, BTree.lowerIndex(tree, comparator, v)));
+        }
+
+        public V floor(V v)
+        {
+            return maybe(Math.min(upperBound, BTree.floorIndex(tree, comparator, v)));
+        }
+
+        public V ceiling(V v)
+        {
+            return maybe(Math.max(lowerBound, BTree.ceilIndex(tree, comparator, v)));
+        }
+
+        public V higher(V v)
+        {
+            return maybe(Math.max(lowerBound, BTree.higherIndex(tree, comparator, v)));
+        }
+
+        private V maybe(int i)
+        {
+            if (outOfBounds(i))
+                return null;
+            return super.get(i);
+        }
+
+        @Override
+        public BTreeSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
         {
             return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, fromElement, fromInclusive, toElement, toInclusive));
         }
 
         @Override
-        public NavigableSet<V> headSet(V toElement, boolean inclusive)
+        public BTreeSet<V> headSet(V toElement, boolean inclusive)
         {
-            return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, lowerBound, true, toElement, inclusive));
+            return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, null, true, toElement, inclusive));
         }
 
         @Override
-        public NavigableSet<V> tailSet(V fromElement, boolean inclusive)
+        public BTreeSet<V> tailSet(V fromElement, boolean inclusive)
         {
             return new BTreeRange<>(this, new BTreeRange<>(tree, comparator, fromElement, inclusive, null, true));
         }
 
         @Override
-        public NavigableSet<V> descendingSet()
+        public BTreeSet<V> descendingSet()
         {
             return new BTreeDescRange<>(this);
         }
+
+        public BTreeSet<V> subList(int fromIndex, int toIndex)
+        {
+            if (fromIndex < 0 || toIndex > size())
+                throw new IndexOutOfBoundsException();
+            return new BTreeRange<V>(tree, comparator, lowerBound + fromIndex, lowerBound + toIndex - 1);
+        }
+
+        @Override
+        public <T> T[] toArray(T[] a)
+        {
+            return toArray(a, 0);
+        }
+
+        public <T> T[] toArray(T[] a, int offset)
+        {
+            if (size() + offset < a.length)
+                a = Arrays.copyOf(a, size() + offset);
+
+            BTree.toArray(tree, lowerBound, upperBound + 1, a, offset);
+            return a;
+        }
     }
 
     public static class BTreeDescRange<V> extends BTreeRange<V>
     {
         BTreeDescRange(BTreeRange<V> from)
         {
-            super(from.tree, from.comparator, from.lowerBound, from.inclusiveLowerBound, from.upperBound, from.inclusiveUpperBound);
+            super(from.tree, from.comparator, from.lowerBound, from.upperBound);
         }
 
         @Override
-        protected Cursor<V, V> slice(boolean forwards, boolean permitInversion)
+        protected BTreeSearchIterator<V, V> slice(Dir dir)
         {
-            return super.slice(permitInversion ? !forwards : forwards, false);
+            return super.slice(dir.invert());
+        }
+
+        /* Flip the methods we call for inequality searches */
+
+        public V higher(V v)
+        {
+            return super.lower(v);
+        }
+
+        public V ceiling(V v)
+        {
+            return super.floor(v);
+        }
+
+        public V floor(V v)
+        {
+            return super.ceiling(v);
+        }
+
+        public V lower(V v)
+        {
+            return super.higher(v);
+        }
+
+        public V get(int index)
+        {
+            index = upperBound - index;
+            if (outOfBounds(index))
+                throw new NoSuchElementException();
+            return BTree.findByIndex(tree, index);
+        }
+
+        public int indexOf(Object item)
+        {
+            int i = super.indexOf(item);
+            // i is in range [-1 - size()..size())
+            // so we just need to invert by adding/subtracting from size
+            return i < 0 ? -2 - size() - i  : size() - (i + 1);
+        }
+
+        public BTreeSet<V> subList(int fromIndex, int toIndex)
+        {
+            if (fromIndex < 0 || toIndex > size())
+                throw new IndexOutOfBoundsException();
+            return new BTreeDescRange<V>(new BTreeRange<V>(tree, comparator, upperBound - (toIndex - 1), upperBound - fromIndex));
         }
 
         @Override
-        public NavigableSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
+        public BTreeSet<V> subSet(V fromElement, boolean fromInclusive, V toElement, boolean toInclusive)
         {
             return super.subSet(toElement, toInclusive, fromElement, fromInclusive).descendingSet();
         }
 
         @Override
-        public NavigableSet<V> headSet(V toElement, boolean inclusive)
+        public BTreeSet<V> headSet(V toElement, boolean inclusive)
         {
             return super.tailSet(toElement, inclusive).descendingSet();
         }
 
         @Override
-        public NavigableSet<V> tailSet(V fromElement, boolean inclusive)
+        public BTreeSet<V> tailSet(V fromElement, boolean inclusive)
         {
             return super.headSet(fromElement, inclusive).descendingSet();
         }
 
         @Override
-        public NavigableSet<V> descendingSet()
+        public BTreeSet<V> descendingSet()
         {
             return new BTreeRange<>(this);
         }
+
+        public Comparator<V> comparator()
+        {
+            return (a, b) -> comparator.compare(b, a);
+        }
+
+        public <T> T[] toArray(T[] a, int offset)
+        {
+            a = super.toArray(a, offset);
+            int count = size();
+            int flip = count / 2;
+            for (int i = 0 ; i < flip ; i++)
+            {
+                int j = count - (i + 1);
+                T t = a[i + offset];
+                a[i + offset] = a[j + offset];
+                a[j + offset] = t;
+            }
+            return a;
+        }
+    }
+
+    public static class Builder<V>
+    {
+        final BTree.Builder<V> builder;
+        protected Builder(Comparator<? super V> comparator)
+        {
+            builder= BTree.builder(comparator);
+        }
+
+        public Builder<V> add(V v)
+        {
+            builder.add(v);
+            return this;
+        }
+
+        public Builder<V> addAll(Collection<V> iter)
+        {
+            builder.addAll(iter);
+            return this;
+        }
+
+        public boolean isEmpty()
+        {
+            return builder.isEmpty();
+        }
+        public BTreeSet<V> build()
+        {
+            return new BTreeSet<>(builder.build(), builder.comparator);
+        }
+    }
+
+    public static <V> Builder<V> builder(Comparator<? super V> comparator)
+    {
+        return new Builder<>(comparator);
+    }
+
+    public static <V> BTreeSet<V> wrap(Object[] btree, Comparator<V> comparator)
+    {
+        return new BTreeSet<>(btree, comparator);
+    }
+
+    public static <V extends Comparable<V>> BTreeSet<V> of(Collection<V> sortedValues)
+    {
+        return new BTreeSet<>(BTree.build(sortedValues, UpdateFunction.<V>noOp()), Ordering.<V>natural());
+    }
+
+    public static <V extends Comparable<V>> BTreeSet<V> of(V value)
+    {
+        return new BTreeSet<>(BTree.build(ImmutableList.of(value), UpdateFunction.<V>noOp()), Ordering.<V>natural());
+    }
+
+    public static <V> BTreeSet<V> empty(Comparator<? super V> comparator)
+    {
+        return new BTreeSet<>(BTree.empty(), comparator);
+    }
+
+    public static <V> BTreeSet<V> of(Comparator<? super V> comparator, V value)
+    {
+        return new BTreeSet<>(BTree.build(ImmutableList.of(value), UpdateFunction.<V>noOp()), comparator);
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/btree/Builder.java b/src/java/org/apache/cassandra/utils/btree/Builder.java
deleted file mode 100644
index aa4c5dc..0000000
--- a/src/java/org/apache/cassandra/utils/btree/Builder.java
+++ /dev/null

@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.utils.btree;
-
-import java.util.Comparator;
-
-import static org.apache.cassandra.utils.btree.BTree.EMPTY_LEAF;
-import static org.apache.cassandra.utils.btree.BTree.FAN_SHIFT;
-import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
-
-/**
- * A class for constructing a new BTree, either from an existing one and some set of modifications
- * or a new tree from a sorted collection of items.
- * <p/>
- * This is a fairly heavy-weight object, so a ThreadLocal instance is created for making modifications to a tree
- */
-final class Builder
-{
-    private final NodeBuilder rootBuilder = new NodeBuilder();
-
-    /**
-     * At the highest level, we adhere to the classic b-tree insertion algorithm:
-     *
-     * 1. Add to the appropriate leaf
-     * 2. Split the leaf if necessary, add the median to the parent
-     * 3. Split the parent if necessary, etc.
-     *
-     * There is one important difference: we don't actually modify the original tree, but copy each node that we
-     * modify.  Note that every node on the path to the key being inserted or updated will be modified; this
-     * implies that at a minimum, the root node will be modified for every update, so every root is a "snapshot"
-     * of a tree that can be iterated or sliced without fear of concurrent modifications.
-     *
-     * The NodeBuilder class handles the details of buffering the copied contents of the original tree and
-     * adding in our changes.  Since NodeBuilder maintains parent/child references, it also handles parent-splitting
-     * (easy enough, since any node affected by the split will already be copied into a NodeBuilder).
-     *
-     * One other difference from the simple algorithm is that we perform modifications in bulk;
-     * we assume @param source has been sorted, e.g. by BTree.update, so the update of each key resumes where
-     * the previous left off.
-     */
-    public <V> Object[] update(Object[] btree, Comparator<V> comparator, Iterable<V> source, UpdateFunction<V> updateF)
-    {
-        assert updateF != null;
-
-        NodeBuilder current = rootBuilder;
-        current.reset(btree, POSITIVE_INFINITY, updateF, comparator);
-
-        for (V key : source)
-        {
-            while (true)
-            {
-                if (updateF.abortEarly())
-                {
-                    rootBuilder.clear();
-                    return null;
-                }
-                NodeBuilder next = current.update(key);
-                if (next == null)
-                    break;
-                // we were in a subtree from a previous key that didn't contain this new key;
-                // retry against the correct subtree
-                current = next;
-            }
-        }
-
-        // finish copying any remaining keys from the original btree
-        while (true)
-        {
-            NodeBuilder next = current.finish();
-            if (next == null)
-                break;
-            current = next;
-        }
-
-        // updating with POSITIVE_INFINITY means that current should be back to the root
-        assert current.isRoot();
-
-        Object[] r = current.toNode();
-        current.clear();
-        return r;
-    }
-
-    public <V> Object[] build(Iterable<V> source, UpdateFunction<V> updateF, int size)
-    {
-        assert updateF != null;
-
-        NodeBuilder current = rootBuilder;
-        // we descend only to avoid wasting memory; in update() we will often descend into existing trees
-        // so here we want to descend also, so we don't have lg max(N) depth in both directions
-        while ((size >>= FAN_SHIFT) > 0)
-            current = current.ensureChild();
-
-        current.reset(EMPTY_LEAF, POSITIVE_INFINITY, updateF, null);
-        for (V key : source)
-            current.addNewKey(key);
-
-        current = current.ascendToRoot();
-
-        Object[] r = current.toNode();
-        current.clear();
-        return r;
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/btree/Cursor.java b/src/java/org/apache/cassandra/utils/btree/Cursor.java
deleted file mode 100644
index 6814d26..0000000
--- a/src/java/org/apache/cassandra/utils/btree/Cursor.java
+++ /dev/null

@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.utils.btree;
-
-import java.util.Comparator;
-import java.util.Iterator;
-
-import static org.apache.cassandra.utils.btree.BTree.NEGATIVE_INFINITY;
-import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
-import static org.apache.cassandra.utils.btree.BTree.getLeafKeyEnd;
-import static org.apache.cassandra.utils.btree.BTree.isLeaf;
-
-/**
- * An extension of Path which provides a public interface for iterating over or counting a subrange of the tree
- *
- * @param <V>
- */
-public final class Cursor<K, V extends K> extends Path implements Iterator<V>
-{
-    /*
-     * Conceptually, a Cursor derives two Paths, one for the first object in the slice requested (inclusive),
-     * and one for the last (exclusive).  Then hasNext just checks, have we reached the last yet, and next
-     * calls successor() to get to the next item in the Tree.
-     *
-     * To optimize memory use, we summarize the last Path as just endNode/endIndex, and inherit from Path for
-     *
-     * the first one.
-     */
-
-    // the last node covered by the requested range
-    private Object[] endNode;
-    // the index within endNode that signals we're finished -- that is, endNode[endIndex] is NOT part of the Cursor
-    private byte endIndex;
-
-    private boolean forwards;
-
-    /**
-     * Reset this cursor for the provided tree, to iterate over its entire range
-     *
-     * @param btree    the tree to iterate over
-     * @param forwards if false, the cursor will start at the end and move backwards
-     */
-    public void reset(Object[] btree, boolean forwards)
-    {
-        _reset(btree, null, NEGATIVE_INFINITY, false, POSITIVE_INFINITY, false, forwards);
-    }
-
-    /**
-     * Reset this cursor for the provided tree, to iterate between the provided start and end
-     *
-     * @param btree      the tree to iterate over
-     * @param comparator the comparator that defines the ordering over the items in the tree
-     * @param lowerBound the first item to include, inclusive
-     * @param upperBound the last item to include, exclusive
-     * @param forwards   if false, the cursor will start at the end and move backwards
-     */
-    public void reset(Object[] btree, Comparator<K> comparator, K lowerBound, K upperBound, boolean forwards)
-    {
-        _reset(btree, comparator, lowerBound, true, upperBound, false, forwards);
-    }
-
-    /**
-     * Reset this cursor for the provided tree, to iterate between the provided start and end
-     *
-     * @param btree               the tree to iterate over
-     * @param comparator          the comparator that defines the ordering over the items in the tree
-     * @param lowerBound          the first item to include
-     * @param inclusiveLowerBound should include start in the iterator, if present in the tree
-     * @param upperBound          the last item to include
-     * @param inclusiveUpperBound should include end in the iterator, if present in the tree
-     * @param forwards            if false, the cursor will start at the end and move backwards
-     */
-    public void reset(Object[] btree, Comparator<K> comparator, K lowerBound, boolean inclusiveLowerBound, K upperBound, boolean inclusiveUpperBound, boolean forwards)
-    {
-        _reset(btree, comparator, lowerBound, inclusiveLowerBound, upperBound, inclusiveUpperBound, forwards);
-    }
-
-    private void _reset(Object[] btree, Comparator<K> comparator, Object lowerBound, boolean inclusiveLowerBound, Object upperBound, boolean inclusiveUpperBound, boolean forwards)
-    {
-        init(btree);
-        if (lowerBound == null)
-            lowerBound = NEGATIVE_INFINITY;
-        if (upperBound == null)
-            upperBound = POSITIVE_INFINITY;
-
-        this.forwards = forwards;
-
-        Path findLast = new Path(this.path.length, btree);
-        if (forwards)
-        {
-            findLast.find(comparator, upperBound, inclusiveUpperBound ? Op.HIGHER : Op.CEIL, true);
-            find(comparator, lowerBound, inclusiveLowerBound ? Op.CEIL : Op.HIGHER, true);
-        }
-        else
-        {
-            findLast.find(comparator, lowerBound, inclusiveLowerBound ? Op.LOWER : Op.FLOOR, false);
-            find(comparator, upperBound, inclusiveUpperBound ? Op.FLOOR : Op.LOWER, false);
-        }
-        int c = this.compareTo(findLast, forwards);
-        if (forwards ? c > 0 : c < 0)
-        {
-            endNode = currentNode();
-            endIndex = currentIndex();
-        }
-        else
-        {
-            endNode = findLast.currentNode();
-            endIndex = findLast.currentIndex();
-        }
-    }
-
-    public boolean hasNext()
-    {
-        return path[depth] != endNode || indexes[depth] != endIndex;
-    }
-
-    public V next()
-    {
-        Object r = currentKey();
-        if (forwards)
-            successor();
-        else
-            predecessor();
-        return (V) r;
-    }
-
-    public int count()
-    {
-        if (!forwards)
-            throw new IllegalStateException("Count can only be run on forward cursors");
-        int count = 0;
-        int next;
-        while ((next = consumeNextLeaf()) >= 0)
-            count += next;
-        return count;
-    }
-
-    /**
-     * @return the number of objects consumed by moving out of the next (possibly current) leaf
-     */
-    private int consumeNextLeaf()
-    {
-        Object[] node = currentNode();
-        int r = 0;
-
-        if (!isLeaf(node))
-        {
-            // if we're not in a leaf, then calling successor once will take us to a leaf, since the next
-            // key will be in the leftmost subtree of whichever branch is next.  For instance, if we
-            // are in the root node of the tree depicted by http://cis.stvincent.edu/html/tutorials/swd/btree/btree1.gif,
-            // successor() will take us to the leaf containing N and O.
-            int i = currentIndex();
-            if (node == endNode && i == endIndex)
-                return -1;
-            r = 1;
-            successor();
-            node = currentNode();
-        }
-
-        if (node == endNode)
-        {
-            // only count up to endIndex, and don't call successor()
-            if (currentIndex() == endIndex)
-                return r > 0 ? r : -1;
-            r += endIndex - currentIndex();
-            setIndex(endIndex);
-            return r;
-        }
-
-        // count the remaining objects in this leaf
-        int keyEnd = getLeafKeyEnd(node);
-        r += keyEnd - currentIndex();
-        setIndex(keyEnd);
-        successor();
-        return r;
-    }
-
-    public void remove()
-    {
-        throw new UnsupportedOperationException();
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java b/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java
index f683ec4..93f76fe 100644
--- a/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java
+++ b/src/java/org/apache/cassandra/utils/btree/NodeBuilder.java

@@ -23,12 +23,7 @@
 import java.util.Arrays;
 import java.util.Comparator;
 
-import static org.apache.cassandra.utils.btree.BTree.EMPTY_BRANCH;
-import static org.apache.cassandra.utils.btree.BTree.FAN_FACTOR;
-import static org.apache.cassandra.utils.btree.BTree.compare;
-import static org.apache.cassandra.utils.btree.BTree.find;
-import static org.apache.cassandra.utils.btree.BTree.getKeyEnd;
-import static org.apache.cassandra.utils.btree.BTree.isLeaf;
+import static org.apache.cassandra.utils.btree.BTree.*;
 
 /**
  * Represents a level / stack item of in progress modifications to a BTree.
@@ -151,7 +146,7 @@
             }
             else
             {
-                i = find(comparator, key, copyFrom, i + 1, copyFromKeyEnd);
+                i = Arrays.binarySearch(copyFrom, i + 1, copyFromKeyEnd, key, comparator);
                 found = i >= 0;
                 if (!found)
                     i = -i - 1;
@@ -167,7 +162,7 @@
                 return null;
             key = next;
         }
-        else if (i == copyFromKeyEnd && compare(comparator, key, upperBound) >= 0)
+        else if (i == copyFromKeyEnd && compareUpperBound(comparator, key, upperBound) >= 0)
             owns = false;
 
         if (isLeaf(copyFrom))
@@ -239,6 +234,10 @@
         return ascend();
     }
 
+    private static <V> int compareUpperBound(Comparator<V> comparator, Object value, Object upperBound)
+    {
+        return upperBound == POSITIVE_INFINITY ? -1 : comparator.compare((V)value, (V)upperBound);
+    }
 
     // UTILITY METHODS FOR IMPLEMENTATION OF UPDATE/BUILD/DELETE
 
@@ -263,7 +262,8 @@
     // builds a new root BTree node - must be called on root of operation
     Object[] toNode()
     {
-        assert buildKeyPosition <= FAN_FACTOR && (buildKeyPosition > 0 || copyFrom.length > 0) : buildKeyPosition;
+        // we permit building empty trees as some constructions do not know in advance how many items they will contain
+        assert buildKeyPosition <= FAN_FACTOR : buildKeyPosition;
         return buildFromRange(0, buildKeyPosition, isLeaf(copyFrom), false);
     }
 
@@ -385,14 +385,25 @@
         Object[] a;
         if (isLeaf)
         {
-            a = new Object[keyLength + (keyLength & 1)];
+            a = new Object[keyLength | 1];
             System.arraycopy(buildKeys, offset, a, 0, keyLength);
         }
         else
         {
-            a = new Object[1 + (keyLength * 2)];
+            a = new Object[2 + (keyLength * 2)];
             System.arraycopy(buildKeys, offset, a, 0, keyLength);
             System.arraycopy(buildChildren, offset, a, keyLength, keyLength + 1);
+
+            // calculate the indexOffsets of each key in this node, within the sub-tree rooted at this node
+            int[] indexOffsets = new int[keyLength + 1];
+            int size = BTree.size((Object[]) a[keyLength]);
+            for (int i = 0 ; i < keyLength ; i++)
+            {
+                indexOffsets[i] = size;
+                size += 1 + BTree.size((Object[]) a[keyLength + 1 + i]);
+            }
+            indexOffsets[keyLength] = size;
+            a[a.length - 1] = indexOffsets;
         }
         if (isExtra)
             updateFunction.allocated(ObjectSizes.sizeOfArray(a));

diff --git a/src/java/org/apache/cassandra/utils/btree/NodeCursor.java b/src/java/org/apache/cassandra/utils/btree/NodeCursor.java
new file mode 100644
index 0000000..e9fa89e
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/NodeCursor.java

@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Arrays;
+import java.util.Comparator;
+
+import static org.apache.cassandra.utils.btree.BTree.*;
+
+/**
+ * A class for searching within one node of a btree: a linear chain (stack) of these is built of tree height
+ * to form a Cursor. Some corollaries of the basic building block operations in TreeCursor (moveOne and seekTo),
+ * along with some other methods for helping implement movement between two NodeCursor
+ *
+ * The behaviour is not dissimilar to that of NodeBuilder and TreeBuilder, wherein functions that may move
+ * us to a different node pass us the node we should move to, from which we continue our operations.
+ * @param <K>
+ */
+class NodeCursor<K>
+{
+    // TODO: consider splitting forwards from backwards
+    final NodeCursor<K> parent, child;
+    final Comparator<? super K> comparator;
+
+    boolean inChild;
+    // if !inChild, this is the key position we are currently on;
+    // if inChild, this is the child position we are currently descending into
+    int position;
+    Object[] node;
+    int nodeOffset;
+
+    NodeCursor(Object[] node, NodeCursor<K> parent, Comparator<? super K> comparator)
+    {
+        this.node = node;
+        this.parent = parent;
+        this.comparator = comparator;
+        // a well formed b-tree (text book, or ours) must be balanced, so by building a stack following the left-most branch
+        // we have a stack capable of visiting any path in the tree
+        this.child = BTree.isLeaf(node) ? null : new NodeCursor<>((Object[]) node[getChildStart(node)], this, comparator);
+    }
+
+    void resetNode(Object[] node, int nodeOffset)
+    {
+        this.node = node;
+        this.nodeOffset = nodeOffset;
+    }
+
+    /**
+     * adapt child position to key position within branch, knowing it is safe to do so
+     */
+    void safeAdvanceIntoBranchFromChild(boolean forwards)
+    {
+        if (!forwards)
+            --position;
+    }
+
+    /**
+     * adapt child position to key position within branch, and return if this was successful or we're now out of bounds
+     */
+    boolean advanceIntoBranchFromChild(boolean forwards)
+    {
+        return forwards ? position < getBranchKeyEnd(node) : --position >= 0;
+    }
+
+    boolean advanceLeafNode(boolean forwards)
+    {
+        return forwards ? ++position < getLeafKeyEnd(node)
+                        : --position >= 0;
+    }
+
+    /**
+     * @return the upper/lower bound of the child we are currently descended in
+     */
+    K bound(boolean upper)
+    {
+        return (K) node[position - (upper ? 0 : 1)];
+    }
+
+    /**
+     * The parent that covers a range wider than ourselves, either ascending or descending,
+     * i.e. that defines the upper or lower bound on the subtree rooted at our node
+     * @param upper
+     * @return the NodeCursor parent that can tell us the upper/lower bound of ourselves
+     */
+    NodeCursor<K> boundIterator(boolean upper)
+    {
+        NodeCursor<K> bound = this.parent;
+        while (bound != null && (upper ? bound.position >= getChildCount(bound.node) - 1
+                                       : bound.position <= 0))
+            bound = bound.parent;
+        return bound;
+    }
+
+    /**
+     * look for the provided key in this node, in the specified direction:
+     * forwards => ceil search; otherwise floor
+     *
+     * we require that the node's "current" key (including the relevant bound if we are a parent we have ascended into)
+     * be already excluded by the search. this is useful for the following reasons:
+     *   1: we must ensure we never go backwards, so excluding that key from our binary search prevents our
+     *      descending into a child we have already visited (without any further checks)
+     *   2: we already check the bounds as we search upwards for our natural parent;
+     *   3: we want to cheaply check sequential access, so we always check the first key we're on anyway (if it can be done easily)
+     */
+    boolean seekInNode(K key, boolean forwards)
+    {
+        int position = this.position;
+        int lb, ub;
+        if (forwards)
+        {
+            lb = position + 1;
+            ub = getKeyEnd(node);
+        }
+        else
+        {
+            ub = position;
+            lb = 0;
+        }
+
+        int find = Arrays.binarySearch((K[]) node, lb, ub, key, comparator);
+        if (find >= 0)
+        {
+            // exact key match, so we're in the correct node already. return success
+            this.position = find;
+            inChild = false;
+            return true;
+        }
+
+        // if we are a branch, and we are an inequality match, the direction of travel doesn't matter
+        // so we only need to modify if we are going backwards on a leaf node, to produce floor semantics
+        int delta = isLeaf() & !forwards ? -1 : 0;
+        this.position = delta -1 -find;
+        return false;
+    }
+
+    NodeCursor<K> descendToFirstChild(boolean forwards)
+    {
+        if (isLeaf())
+        {
+            position = forwards ? 0 : getLeafKeyEnd(node) - 1;
+            return null;
+        }
+        inChild = true;
+        position = forwards ? 0 : getChildCount(node) - 1;
+        return descend();
+    }
+
+    // descend into the child at "position"
+    NodeCursor<K> descend()
+    {
+        Object[] childNode = (Object[]) node[position + getChildStart(node)];
+        int childOffset = nodeOffset + treeIndexOffsetOfChild(node, position);
+        child.resetNode(childNode, childOffset);
+        inChild = true;
+        return child;
+    }
+
+    boolean isLeaf()
+    {
+        return child == null;
+    }
+
+    int globalIndex()
+    {
+        return nodeOffset + treeIndexOfKey(node, position);
+    }
+
+    int globalLeafIndex()
+    {
+        return nodeOffset + treeIndexOfLeafKey(position);
+    }
+
+    int globalBranchIndex()
+    {
+        return nodeOffset + treeIndexOfBranchKey(node, position);
+    }
+
+    K value()
+    {
+        return (K) node[position];
+    }
+}
\ No newline at end of file

diff --git a/src/java/org/apache/cassandra/utils/btree/Path.java b/src/java/org/apache/cassandra/utils/btree/Path.java
deleted file mode 100644
index b1b0e03..0000000
--- a/src/java/org/apache/cassandra/utils/btree/Path.java
+++ /dev/null

@@ -1,341 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.utils.btree;
-
-import java.util.Comparator;
-
-import static org.apache.cassandra.utils.btree.BTree.NEGATIVE_INFINITY;
-import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
-import static org.apache.cassandra.utils.btree.BTree.getBranchKeyEnd;
-import static org.apache.cassandra.utils.btree.BTree.getKeyEnd;
-import static org.apache.cassandra.utils.btree.BTree.getLeafKeyEnd;
-import static org.apache.cassandra.utils.btree.BTree.isLeaf;
-
-/**
- * An internal class for searching and iterating through a tree.  As it traverses the tree,
- * it adds the nodes visited to a stack.  This allows us to backtrack from a child node
- * to its parent.
- *
- * As we navigate the tree, we destructively modify this stack.
- *
- * Path is only intended to be used via Cursor.
- */
-public class Path<V>
-{
-    // operations corresponding to the ones in NavigableSet
-    static enum Op
-    {
-        CEIL,   // the least element greater than or equal to the given element
-        FLOOR,  // the greatest element less than or equal to the given element
-        HIGHER, // the least element strictly greater than the given element
-        LOWER   // the greatest element strictly less than the given element
-    }
-
-    // the path to the searched-for key
-    Object[][] path;
-    // the index within the node of our path at a given depth
-    byte[] indexes;
-    // current depth.  nothing in path[i] for i > depth is valid.
-    byte depth;
-
-    Path() { }
-    Path(int depth, Object[] btree)
-    {
-        this.path = new Object[depth][];
-        this.indexes = new byte[depth];
-        this.path[0] = btree;
-    }
-
-    void init(Object[] btree)
-    {
-        int depth = BTree.depth(btree);
-        if (path == null || path.length < depth)
-        {
-            path = new Object[depth][];
-            indexes = new byte[depth];
-        }
-        path[0] = btree;
-    }
-
-    void moveEnd(Object[] node, boolean forwards)
-    {
-        push(node, getKeyEnd(node));
-        if (!forwards)
-            predecessor();
-    }
-
-    void moveStart(Object[] node, boolean forwards)
-    {
-        push(node, -1);
-        if (forwards)
-            successor();
-    }
-
-    /**
-     * Find the provided key in the tree rooted at node, and store the root to it in the path
-     *
-     * @param comparator the comparator defining the order on the tree
-     * @param target     the key to search for
-     * @param mode       the type of search to perform
-     * @param forwards   if the path should be setup for forward or backward iteration
-     * @param <K>
-     */
-    <K> boolean find(Comparator<K> comparator, Object target, Op mode, boolean forwards)
-    {
-        // TODO : should not require parameter 'forwards' - consider modifying index to represent both
-        // child and key position, as opposed to just key position (which necessitates a different value depending
-        // on which direction you're moving in. Prerequisite for making Path public and using to implement general
-        // search
-
-        Object[] node = path[depth];
-        int lb = indexes[depth];
-        assert lb == 0 || forwards;
-        pop();
-
-        if (target instanceof BTree.Special)
-        {
-            if (target == POSITIVE_INFINITY)
-                moveEnd(node, forwards);
-            else if (target == NEGATIVE_INFINITY)
-                moveStart(node, forwards);
-            else
-                throw new AssertionError();
-            return false;
-        }
-
-        while (true)
-        {
-            int keyEnd = getKeyEnd(node);
-
-            // search for the target in the current node
-            int i = BTree.find(comparator, target, node, lb, keyEnd);
-            lb = 0;
-            if (i >= 0)
-            {
-                // exact match. transform exclusive bounds into the correct index by moving back or forwards one
-                push(node, i);
-                switch (mode)
-                {
-                    case HIGHER:
-                        successor();
-                        break;
-                    case LOWER:
-                        predecessor();
-                }
-                return true;
-            }
-            i = -i - 1;
-
-            // traverse into the appropriate child
-            if (!isLeaf(node))
-            {
-                push(node, forwards ? i - 1 : i);
-                node = (Object[]) node[keyEnd + i];
-                continue;
-            }
-
-            // bottom of the tree and still not found.  pick the right index to satisfy Op
-            switch (mode)
-            {
-                case FLOOR:
-                case LOWER:
-                    i--;
-            }
-
-            if (i < 0)
-            {
-                push(node, 0);
-                predecessor();
-            }
-            else if (i >= keyEnd)
-            {
-                push(node, keyEnd - 1);
-                successor();
-            }
-            else
-            {
-                push(node, i);
-            }
-
-            return false;
-        }
-    }
-
-    boolean isRoot()
-    {
-        return depth == 0;
-    }
-
-    void pop()
-    {
-        depth--;
-    }
-
-    Object[] currentNode()
-    {
-        return path[depth];
-    }
-
-    byte currentIndex()
-    {
-        return indexes[depth];
-    }
-
-    void push(Object[] node, int index)
-    {
-        path[++depth] = node;
-        indexes[depth] = (byte) index;
-    }
-
-    void setIndex(int index)
-    {
-        indexes[depth] = (byte) index;
-    }
-
-    byte findSuccessorParentDepth()
-    {
-        byte depth = this.depth;
-        depth--;
-        while (depth >= 0)
-        {
-            int ub = indexes[depth] + 1;
-            Object[] node = path[depth];
-            if (ub < getBranchKeyEnd(node))
-                return depth;
-            depth--;
-        }
-        return -1;
-    }
-
-    // move to the next key in the tree
-    void successor()
-    {
-        Object[] node = currentNode();
-        int i = currentIndex();
-
-        if (!isLeaf(node))
-        {
-            // if we're on a key in a branch, we MUST have a descendant either side of us,
-            // so we always go down the left-most child until we hit a leaf
-            node = (Object[]) node[getBranchKeyEnd(node) + i + 1];
-            while (!isLeaf(node))
-            {
-                push(node, -1);
-                node = (Object[]) node[getBranchKeyEnd(node)];
-            }
-            push(node, 0);
-            return;
-        }
-
-        // if we haven't reached the end of this leaf, just increment our index and return
-        i += 1;
-        if (i < getLeafKeyEnd(node))
-        {
-            // moved to the next key in the same leaf
-            setIndex(i);
-            return;
-        }
-
-        // we've reached the end of this leaf,
-        // so go up until we reach something we've not finished visiting
-        while (!isRoot())
-        {
-            pop();
-            i = currentIndex() + 1;
-            node = currentNode();
-            if (i < getKeyEnd(node))
-            {
-                setIndex(i);
-                return;
-            }
-        }
-
-        // we've visited the last key in the root node, so we're done
-        setIndex(getKeyEnd(node));
-    }
-
-    // move to the previous key in the tree
-    void predecessor()
-    {
-        Object[] node = currentNode();
-        int i = currentIndex();
-
-        if (!isLeaf(node))
-        {
-            // if we're on a key in a branch, we MUST have a descendant either side of us
-            // so we always go down the right-most child until we hit a leaf
-            node = (Object[]) node[getBranchKeyEnd(node) + i];
-            while (!isLeaf(node))
-            {
-                i = getBranchKeyEnd(node);
-                push(node, i);
-                node = (Object[]) node[i * 2];
-            }
-            push(node, getLeafKeyEnd(node) - 1);
-            return;
-        }
-
-        // if we haven't reached the beginning of this leaf, just decrement our index and return
-        i -= 1;
-        if (i >= 0)
-        {
-            setIndex(i);
-            return;
-        }
-
-        // we've reached the beginning of this leaf,
-        // so go up until we reach something we've not finished visiting
-        while (!isRoot())
-        {
-            pop();
-            i = currentIndex() - 1;
-            if (i >= 0)
-            {
-                setIndex(i);
-                return;
-            }
-        }
-
-        // we've visited the last key in the root node, so we're done
-        setIndex(-1);
-    }
-
-    Object currentKey()
-    {
-        return currentNode()[currentIndex()];
-    }
-
-    int compareTo(Path<V> that, boolean forwards)
-    {
-        int d = Math.min(this.depth, that.depth);
-        for (int i = 0; i <= d; i++)
-        {
-            int c = this.indexes[i] - that.indexes[i];
-            if (c != 0)
-                return c;
-        }
-        // identical indices up to depth, so if somebody is lower depth they are on a later item if iterating forwards
-        // and an earlier item if iterating backwards, as the node at max common depth must be a branch if they are
-        // different depths, and branches that are currently descended into lag the child index they are in when iterating forwards,
-        // i.e. if they are in child 0 they record an index of -1 forwards, or 0 when backwards
-        d = this.depth - that.depth;
-        return forwards ? d : -d;
-    }
-}
-

diff --git a/src/java/org/apache/cassandra/utils/btree/TreeBuilder.java b/src/java/org/apache/cassandra/utils/btree/TreeBuilder.java
new file mode 100644
index 0000000..024902e
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/TreeBuilder.java

@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Comparator;
+
+import static org.apache.cassandra.utils.btree.BTree.EMPTY_LEAF;
+import static org.apache.cassandra.utils.btree.BTree.FAN_SHIFT;
+import static org.apache.cassandra.utils.btree.BTree.POSITIVE_INFINITY;
+
+/**
+ * A class for constructing a new BTree, either from an existing one and some set of modifications
+ * or a new tree from a sorted collection of items.
+ * <p/>
+ * This is a fairly heavy-weight object, so a ThreadLocal instance is created for making modifications to a tree
+ */
+final class TreeBuilder
+{
+    private final NodeBuilder rootBuilder = new NodeBuilder();
+
+    /**
+     * At the highest level, we adhere to the classic b-tree insertion algorithm:
+     *
+     * 1. Add to the appropriate leaf
+     * 2. Split the leaf if necessary, add the median to the parent
+     * 3. Split the parent if necessary, etc.
+     *
+     * There is one important difference: we don't actually modify the original tree, but copy each node that we
+     * modify.  Note that every node on the path to the key being inserted or updated will be modified; this
+     * implies that at a minimum, the root node will be modified for every update, so every root is a "snapshot"
+     * of a tree that can be iterated or sliced without fear of concurrent modifications.
+     *
+     * The NodeBuilder class handles the details of buffering the copied contents of the original tree and
+     * adding in our changes.  Since NodeBuilder maintains parent/child references, it also handles parent-splitting
+     * (easy enough, since any node affected by the split will already be copied into a NodeBuilder).
+     *
+     * One other difference from the simple algorithm is that we perform modifications in bulk;
+     * we assume @param source has been sorted, e.g. by BTree.update, so the update of each key resumes where
+     * the previous left off.
+     */
+    public <C, K extends C, V extends C> Object[] update(Object[] btree, Comparator<C> comparator, Iterable<K> source, UpdateFunction<K, V> updateF)
+    {
+        assert updateF != null;
+
+        NodeBuilder current = rootBuilder;
+        current.reset(btree, POSITIVE_INFINITY, updateF, comparator);
+
+        for (K key : source)
+        {
+            while (true)
+            {
+                if (updateF.abortEarly())
+                {
+                    rootBuilder.clear();
+                    return null;
+                }
+                NodeBuilder next = current.update(key);
+                if (next == null)
+                    break;
+                // we were in a subtree from a previous key that didn't contain this new key;
+                // retry against the correct subtree
+                current = next;
+            }
+        }
+
+        // finish copying any remaining keys from the original btree
+        while (true)
+        {
+            NodeBuilder next = current.finish();
+            if (next == null)
+                break;
+            current = next;
+        }
+
+        // updating with POSITIVE_INFINITY means that current should be back to the root
+        assert current.isRoot();
+
+        Object[] r = current.toNode();
+        current.clear();
+        return r;
+    }
+
+    public <C, K extends C, V extends C> Object[] build(Iterable<K> source, UpdateFunction<K, V> updateF, int size)
+    {
+        assert updateF != null;
+
+        NodeBuilder current = rootBuilder;
+        // we descend only to avoid wasting memory; in update() we will often descend into existing trees
+        // so here we want to descend also, so we don't have lg max(N) depth in both directions
+        while ((size >>= FAN_SHIFT) > 0)
+            current = current.ensureChild();
+
+        current.reset(EMPTY_LEAF, POSITIVE_INFINITY, updateF, null);
+        for (K key : source)
+            current.addNewKey(key);
+
+        current = current.ascendToRoot();
+
+        Object[] r = current.toNode();
+        current.clear();
+        return r;
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/btree/TreeCursor.java b/src/java/org/apache/cassandra/utils/btree/TreeCursor.java
new file mode 100644
index 0000000..5e55698
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/btree/TreeCursor.java

@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils.btree;
+
+import java.util.Arrays;
+import java.util.Comparator;
+
+import static org.apache.cassandra.utils.btree.BTree.*;
+
+/**
+ * Supports two basic operations for moving around a BTree, either forwards or backwards:
+ * moveOne(), and seekTo()
+ *
+ * These two methods, along with movement to the start/end, permit us to construct any desired
+ * movement around a btree, without much cognitive burden.
+ *
+ * This TreeCursor is (and has its methods) package private. This is to avoid polluting the BTreeSearchIterator
+ * that extends it, and uses its functionality. If this class is useful for wider consumption, a public extension
+ * class can be provided, that just makes all of its methods public.
+ */
+class TreeCursor<K> extends NodeCursor<K>
+{
+    // TODO: spend some time optimising compiler inlining decisions: many of these methods have only one primary call-site
+
+    NodeCursor<K> cur;
+
+    TreeCursor(Comparator<? super K> comparator, Object[] node)
+    {
+        super(node, null, comparator);
+    }
+
+    /**
+     * Move the cursor to either the first or last item in the btree
+     * @param start true if should move to the first item; false moves to the last
+     */
+    void reset(boolean start)
+    {
+        cur = root();
+        root().inChild = false;
+        // this is a corrupt position, but we ensure we never use it except to start our search from
+        root().position = start ? -1 : getKeyEnd(root().node);
+    }
+
+    /**
+     * move the Cursor one item, either forwards or backwards
+     * @param forwards direction of travel
+     * @return false iff the cursor is exhausted in the direction of travel
+     */
+    int moveOne(boolean forwards)
+    {
+        NodeCursor<K> cur = this.cur;
+        if (cur.isLeaf())
+        {
+            // if we're a leaf, we try to step forwards inside ourselves
+            if (cur.advanceLeafNode(forwards))
+                return cur.globalLeafIndex();
+
+            // if we fail, we just find our bounding parent
+            this.cur = cur = moveOutOfLeaf(forwards, cur, root());
+            return cur.globalIndex();
+        }
+
+        // otherwise we descend directly into our next child
+        if (forwards)
+            ++cur.position;
+        cur = cur.descend();
+
+        // and go to its first item
+        NodeCursor<K> next;
+        while ( null != (next = cur.descendToFirstChild(forwards)) )
+            cur = next;
+
+        this.cur = cur;
+        return cur.globalLeafIndex();
+    }
+
+    /**
+     * seeks from the current position, forwards or backwards, for the provided key
+     * while the direction could be inferred (or ignored), it is required so that (e.g.) we do not infinitely loop on bad inputs
+     * if there is no such key, it moves to the key that would naturally follow/succeed it (i.e. it behaves as ceil when ascending; floor when descending)
+     */
+    boolean seekTo(K key, boolean forwards, boolean skipOne)
+    {
+        NodeCursor<K> cur = this.cur;
+
+        /**
+         * decide if we will "try one" value by itself, as a sequential access;
+         * we actually *require* that we try the "current key" for any node before we call seekInNode on it.
+         *
+         * if we are already on a value, we just check it irregardless of if it is a leaf or not;
+         * if we are not, we have already excluded it (as we have consumed it), so:
+         *    if we are on a branch we consider that good enough;
+         *    otherwise, we move onwards one, and we try the new value
+         *
+         */
+        boolean tryOne = !skipOne;
+        if ((!tryOne & cur.isLeaf()) && !(tryOne = (cur.advanceLeafNode(forwards) || (cur = moveOutOfLeaf(forwards, cur, null)) != null)))
+        {
+            // we moved out of the tree; return out-of-bounds
+            this.cur = root();
+            return false;
+        }
+
+        if (tryOne)
+        {
+            // we're presently on a value we can (and *must*) cheaply test
+            K test = cur.value();
+
+            int cmp;
+            if (key == test) cmp = 0; // check object identity first, since we utilise that in some places and it's very cheap
+            else cmp = comparator.compare(test, key); // order of provision matters for asymmetric comparators
+            if (forwards ? cmp >= 0 : cmp <= 0)
+            {
+                // we've either matched, or excluded the value from being present
+                this.cur = cur;
+                return cmp == 0;
+            }
+        }
+
+        // if we failed to match with the cheap test, first look to see if we're even in the correct sub-tree
+        while (cur != root())
+        {
+            NodeCursor<K> bound = cur.boundIterator(forwards);
+            if (bound == null)
+                break; // we're all that's left
+
+            int cmpbound = comparator.compare(bound.bound(forwards), key); // order of provision matters for asymmetric comparators
+            if (forwards ? cmpbound > 0 : cmpbound < 0)
+                break; //  already in correct sub-tree
+
+            // bound is on-or-before target, so ascend to that bound and continue looking upwards
+            cur = bound;
+            cur.safeAdvanceIntoBranchFromChild(forwards);
+            if (cmpbound == 0) // it was an exact match, so terminate here
+            {
+                this.cur = cur;
+                return true;
+            }
+        }
+
+        // we must now be able to find our target in the sub-tree rooted at cur
+        boolean match;
+        while (!(match = cur.seekInNode(key, forwards)) && !cur.isLeaf())
+        {
+            cur = cur.descend();
+            cur.position = forwards ? -1 : getKeyEnd(cur.node);
+        }
+
+        if (!match)
+            cur = ensureValidLocation(forwards, cur);
+
+        this.cur = cur;
+        assert !cur.inChild;
+        return match;
+    }
+
+    /**
+     * ensures a leaf node we have seeked in, is not positioned outside of its bounds,
+     * by moving us into its parents (if any); if it is the root, we're permitted to be out-of-bounds
+     * as this indicates exhaustion
+     */
+    private NodeCursor<K> ensureValidLocation(boolean forwards, NodeCursor<K> cur)
+    {
+        assert cur.isLeaf();
+        int position = cur.position;
+        // if we're out of bounds of the leaf, move once in direction of travel
+        if ((position < 0) | (position >= getLeafKeyEnd(cur.node)))
+            cur = moveOutOfLeaf(forwards, cur, root());
+        return cur;
+    }
+
+    /**
+     * move out of a leaf node that is currently out of (its own) bounds
+     * @return null if we're now out-of-bounds of the whole tree
+     */
+    private <K> NodeCursor<K> moveOutOfLeaf(boolean forwards, NodeCursor<K> cur, NodeCursor<K> ifFail)
+    {
+        while (true)
+        {
+            cur = cur.parent;
+            if (cur == null)
+            {
+                root().inChild = false;
+                return ifFail;
+            }
+            if (cur.advanceIntoBranchFromChild(forwards))
+                break;
+        }
+        cur.inChild = false;
+        return cur;
+    }
+
+    /**
+     * resets the cursor and seeks to the specified position; does not assume locality or take advantage of the cursor's current position
+     */
+    void seekTo(int index)
+    {
+        if ((index < 0) | (index >= BTree.size(rootNode())))
+        {
+            if ((index < -1) | (index > BTree.size(rootNode())))
+                throw new IndexOutOfBoundsException(index + " not in range [0.." + BTree.size(rootNode()) + ")");
+            reset(index == -1);
+            return;
+        }
+
+        NodeCursor<K> cur = this.cur;
+        cur = root();
+        assert cur.nodeOffset == 0;
+        while (true)
+        {
+            int relativeIndex = index - cur.nodeOffset; // index within subtree rooted at cur
+            Object[] node = cur.node;
+
+            if (cur.isLeaf())
+            {
+                assert relativeIndex < getLeafKeyEnd(node);
+                cur.position = relativeIndex;
+                this.cur = cur;
+                return;
+            }
+
+            int[] sizeMap = getSizeMap(node);
+            int boundary = Arrays.binarySearch(sizeMap, relativeIndex);
+            if (boundary >= 0)
+            {
+                // exact match, in this branch node
+                assert boundary < sizeMap.length - 1;
+                cur.position = boundary;
+                cur.inChild = false;
+                this.cur = cur;
+                return;
+            }
+
+            cur.inChild = true;
+            cur.position = -1 -boundary;
+            cur = cur.descend();
+        }
+    }
+
+    private NodeCursor<K> root()
+    {
+        return this;
+    }
+
+    Object[] rootNode()
+    {
+        return this.node;
+    }
+
+    K currentValue()
+    {
+        return cur.value();
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java b/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java
index 9f45031..0ab10c2 100644
--- a/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java
+++ b/src/java/org/apache/cassandra/utils/btree/UpdateFunction.java

@@ -18,21 +18,21 @@
  */
 package org.apache.cassandra.utils.btree;
 
+import java.util.function.BiFunction;
+
 import com.google.common.base.Function;
 /**
  * An interface defining a function to be applied to both the object we are replacing in a BTree and
  * the object that is intended to replace it, returning the object to actually replace it.
- *
- * @param <V>
  */
-public interface UpdateFunction<V> extends Function<V, V>
+public interface UpdateFunction<K, V> extends Function<K, V>
 {
     /**
      * @param replacing the value in the original tree we have matched
      * @param update the value in the updating collection that matched
      * @return the value to insert into the new tree
      */
-    V apply(V replacing, V update);
+    V apply(V replacing, K update);
 
     /**
      * @return true if we should fail the update
@@ -44,37 +44,29 @@
      */
     void allocated(long heapSize);
 
-    public static final class NoOp<V> implements UpdateFunction<V>
+    public static final class Simple<V> implements UpdateFunction<V, V>
     {
-
-        private static final NoOp INSTANCE = new NoOp();
-        public static <V> NoOp<V> instance()
+        private final BiFunction<V, V, V> wrapped;
+        public Simple(BiFunction<V, V, V> wrapped)
         {
-            return INSTANCE;
-        }
-        
-        private NoOp()
-        {
+            this.wrapped = wrapped;
         }
 
-        public V apply(V replacing, V update)
-        {
-            return update;
-        }
+        public V apply(V v) { return v; }
+        public V apply(V replacing, V update) { return wrapped.apply(replacing, update); }
+        public boolean abortEarly() { return false; }
+        public void allocated(long heapSize) { }
 
-        public V apply(V update)
+        public static <V> Simple<V> of(BiFunction<V, V, V> f)
         {
-            return update;
-        }
-
-        public boolean abortEarly()
-        {
-            return false;
-        }
-
-        public void allocated(long heapSize)
-        {
+            return new Simple<>(f);
         }
     }
 
+    static final Simple<Object> noOp = Simple.of((a, b) -> a);
+
+    public static <K> UpdateFunction<K, K> noOp()
+    {
+        return (UpdateFunction<K, K>) noOp;
+    }
 }

diff --git a/src/java/org/apache/cassandra/utils/concurrent/Accumulator.java b/src/java/org/apache/cassandra/utils/concurrent/Accumulator.java
index baecb34..ca9bb09 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/Accumulator.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/Accumulator.java

@@ -18,6 +18,7 @@
 */
 package org.apache.cassandra.utils.concurrent;
 
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 
@@ -100,6 +101,11 @@
         return presentCount;
     }
 
+    public int capacity()
+    {
+        return values.length;
+    }
+
     public Iterator<E> iterator()
     {
         return new Iterator<E>()
@@ -130,4 +136,16 @@
             throw new IndexOutOfBoundsException();
         return (E) values[i];
     }
+
+    /**
+     * Removes all of the elements from this accumulator.
+     *
+     * This method is not thread-safe when used concurrently with {@link #add(Object)}.
+     */
+    public void clearUnsafe()
+    {
+        nextIndexUpdater.set(this, 0);
+        presentCountUpdater.set(this, 0);
+        Arrays.fill(values, null);
+    }
 }

diff --git a/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java b/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java
index b80fe99..497eec3 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/OpOrder.java

@@ -240,6 +240,37 @@
             }
         }
 
+        public boolean isFinished()
+        {
+            return next.prev == null;
+        }
+
+        public boolean isOldestLiveGroup()
+        {
+            return prev == null;
+        }
+
+        public void await()
+        {
+            while (!isFinished())
+            {
+                WaitQueue.Signal signal = waiting.register();
+                if (isFinished())
+                {
+                    signal.cancel();
+                    return;
+                }
+                else
+                    signal.awaitUninterruptibly();
+            }
+            assert running == FINISHED;
+        }
+
+        public OpOrder.Group prev()
+        {
+            return prev;
+        }
+
         /**
          * called once we know all operations started against this Ordered have completed,
          * however we do not know if operations against its ancestors have completed, or
@@ -390,35 +421,14 @@
         }
 
         /**
-         * @return true if all operations started prior to barrier.issue() have completed
-         */
-        public boolean allPriorOpsAreFinished()
-        {
-            Group current = orderOnOrBefore;
-            if (current == null)
-                throw new IllegalStateException("This barrier needs to have issue() called on it before prior operations can complete");
-            if (current.next.prev == null)
-                return true;
-            return false;
-        }
-
-        /**
          * wait for all operations started prior to issuing the barrier to complete
          */
         public void await()
         {
-            while (!allPriorOpsAreFinished())
-            {
-                WaitQueue.Signal signal = register();
-                if (allPriorOpsAreFinished())
-                {
-                    signal.cancel();
-                    return;
-                }
-                else
-                    signal.awaitUninterruptibly();
-            }
-            assert orderOnOrBefore.running == FINISHED;
+            Group current = orderOnOrBefore;
+            if (current == null)
+                throw new IllegalStateException("This barrier needs to have issue() called on it before prior operations can complete");
+            current.await();
         }
 
         /**

diff --git a/src/java/org/apache/cassandra/utils/concurrent/Ref.java b/src/java/org/apache/cassandra/utils/concurrent/Ref.java
index c009032..933c498 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/Ref.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/Ref.java

@@ -21,23 +21,40 @@
 package org.apache.cassandra.utils.concurrent;
 
 import java.lang.ref.PhantomReference;
+import java.lang.ref.Reference;
 import java.lang.ref.ReferenceQueue;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Set;
+import java.lang.ref.WeakReference;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.*;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 
-import com.google.common.annotations.VisibleForTesting;
 import org.apache.cassandra.concurrent.InfiniteLoopExecutor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.concurrent.InfiniteLoopExecutor.InterruptibleRunnable;
-import org.apache.cassandra.utils.ExecutorUtils;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
 
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.Memory;
+import org.apache.cassandra.io.util.SafeMemory;
+import org.apache.cassandra.utils.ExecutorUtils;
+import org.apache.cassandra.utils.NoSpamLogger;
+import org.apache.cassandra.utils.Pair;
+import org.cliffc.high_scale_lib.NonBlockingHashMap;
+
+import static java.util.Collections.emptyList;
+import org.apache.cassandra.concurrent.InfiniteLoopExecutor.InterruptibleRunnable;
+
+import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
+import static org.apache.cassandra.utils.ExecutorUtils.shutdownNow;
 import static org.apache.cassandra.utils.Throwables.maybeFail;
 import static org.apache.cassandra.utils.Throwables.merge;
 
@@ -260,7 +277,7 @@
 
     // the object that manages the actual cleaning up; this does not reference the target object
     // so that we can detect when references are lost to the resource itself, and still cleanup afterwards
-    // the Tidy object MUST not contain any references to the object we are managing
+    // the Tidy object MUST NOT contain any references to the object we are managing
     static final class GlobalState
     {
         // we need to retain a reference to each of the PhantomReference instances
@@ -304,7 +321,8 @@
                 globallyExtant.remove(this);
                 try
                 {
-                    tidy.tidy();
+                    if (tidy != null)
+                        tidy.tidy();
                 }
                 catch (Throwable t)
                 {
@@ -321,19 +339,36 @@
 
         public String toString()
         {
-            return tidy.getClass() + "@" + System.identityHashCode(tidy) + ":" + tidy.name();
+            if (tidy != null)
+                return tidy.getClass() + "@" + System.identityHashCode(tidy) + ":" + tidy.name();
+            return "@" + System.identityHashCode(this);
         }
     }
 
-    private static final Set<GlobalState> globallyExtant = Collections.newSetFromMap(new ConcurrentHashMap<GlobalState, Boolean>());
+    private static final Class<?>[] concurrentIterableClasses = new Class<?>[] {
+        ConcurrentLinkedQueue.class,
+        ConcurrentLinkedDeque.class,
+        ConcurrentSkipListSet.class,
+        CopyOnWriteArrayList.class,
+        CopyOnWriteArraySet.class,
+        DelayQueue.class,
+        NonBlockingHashMap.class,
+    };
+    static final Set<Class<?>> concurrentIterables = Collections.newSetFromMap(new IdentityHashMap<>());
+    private static final Set<GlobalState> globallyExtant = Collections.newSetFromMap(new ConcurrentHashMap<>());
     static final ReferenceQueue<Object> referenceQueue = new ReferenceQueue<>();
-    private static final InfiniteLoopExecutor EXEC = new InfiniteLoopExecutor("Reference-Reaper", new InterruptibleRunnable()
+    private static final InfiniteLoopExecutor EXEC = new InfiniteLoopExecutor("Reference-Reaper", Ref::reapOneReference).start();
+    static final ScheduledExecutorService STRONG_LEAK_DETECTOR = !DEBUG_ENABLED ? null : Executors.newScheduledThreadPool(1, new NamedThreadFactory("Strong-Reference-Leak-Detector"));
+    static
     {
-        public void run() throws InterruptedException
+        if (DEBUG_ENABLED)
         {
-            reapOneReference();
+            STRONG_LEAK_DETECTOR.scheduleAtFixedRate(new Visitor(), 1, 15, TimeUnit.MINUTES);
+            STRONG_LEAK_DETECTOR.scheduleAtFixedRate(new StrongLeakDetector(), 2, 15, TimeUnit.MINUTES);
         }
-    }).start();
+        concurrentIterables.addAll(Arrays.asList(concurrentIterableClasses));
+    }
+
     private static void reapOneReference() throws InterruptedException
     {
         Object obj = referenceQueue.remove(100);
@@ -343,9 +378,337 @@
         }
     }
 
+    static final Deque<InProgressVisit> inProgressVisitPool = new ArrayDeque<InProgressVisit>();
+
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    static InProgressVisit newInProgressVisit(Object o, List<Field> fields, Field field, String name)
+    {
+        Preconditions.checkNotNull(o);
+        InProgressVisit ipv = inProgressVisitPool.pollLast();
+        if (ipv == null)
+            ipv = new InProgressVisit();
+
+        ipv.o = o;
+        if (o instanceof Object[])
+            ipv.collectionIterator = Arrays.asList((Object[])o).iterator();
+        else if (o instanceof ConcurrentMap)
+        {
+            ipv.isMapIterator = true;
+            ipv.collectionIterator = ((Map)o).entrySet().iterator();
+        }
+        else if (concurrentIterables.contains(o.getClass()) | o instanceof BlockingQueue)
+            ipv.collectionIterator = ((Iterable)o).iterator();
+
+        ipv.fields = fields;
+        ipv.field = field;
+        ipv.name = name;
+        return ipv;
+    }
+
+    static void returnInProgressVisit(InProgressVisit ipv)
+    {
+        if (inProgressVisitPool.size() > 1024)
+            return;
+        ipv.name = null;
+        ipv.fields = null;
+        ipv.o = null;
+        ipv.fieldIndex = 0;
+        ipv.field = null;
+        ipv.collectionIterator = null;
+        ipv.mapEntryValue = null;
+        ipv.isMapIterator = false;
+        inProgressVisitPool.offer(ipv);
+    }
+
+    /*
+     * Stack state for walking an object graph.
+     * Field index is the index of the current field being fetched.
+     */
+    @SuppressWarnings({ "rawtypes"})
+    static class InProgressVisit
+    {
+        String name;
+        List<Field> fields;
+        Object o;
+        int fieldIndex = 0;
+        Field field;
+
+        //Need to know if Map.Entry should be returned or traversed as an object
+        boolean isMapIterator;
+        //If o is a ConcurrentMap, BlockingQueue, or Object[], this is populated with an iterator over the contents
+        Iterator<Object> collectionIterator;
+        //If o is a ConcurrentMap the entry set contains keys and values. The key is returned as the first child
+        //And the associated value is stashed here and returned next
+        Object mapEntryValue;
+
+        private Field nextField()
+        {
+            if (fields.isEmpty())
+                return null;
+
+            if (fieldIndex >= fields.size())
+                return null;
+
+            Field retval = fields.get(fieldIndex);
+            fieldIndex++;
+            return retval;
+        }
+
+        Pair<Object, Field> nextChild() throws IllegalAccessException
+        {
+            //If the last child returned was a key from a map, the value from that entry is stashed
+            //so it can be returned next
+            if (mapEntryValue != null)
+            {
+                Pair<Object, Field> retval = Pair.create(mapEntryValue, field);
+                mapEntryValue = null;
+                return retval;
+            }
+
+            //If o is a ConcurrentMap, BlockingQueue, or Object[], then an iterator will be stored to return the elements
+            if (collectionIterator != null)
+            {
+                if (!collectionIterator.hasNext())
+                    return null;
+                Object nextItem = null;
+                //Find the next non-null element to traverse since returning null will cause the visitor to stop
+                while (collectionIterator.hasNext() && (nextItem = collectionIterator.next()) == null){}
+                if (nextItem != null)
+                {
+                    if (isMapIterator & nextItem instanceof Map.Entry)
+                    {
+                        Map.Entry entry = (Map.Entry)nextItem;
+                        mapEntryValue = entry.getValue();
+                        return Pair.create(entry.getKey(), field);
+                    }
+                    return Pair.create(nextItem, field);
+                }
+                else
+                {
+                    return null;
+                }
+            }
+
+            //Basic traversal of an object by its member fields
+            //Don't return null values as that indicates no more objects
+            while (true)
+            {
+                Field nextField = nextField();
+                if (nextField == null)
+                    return null;
+
+                //A weak reference isn't strongly reachable
+                //subclasses of WeakReference contain strong references in their fields, so those need to be traversed
+                //The weak reference fields are in the common Reference class base so filter those out
+                if (o instanceof WeakReference & nextField.getDeclaringClass() == Reference.class)
+                    continue;
+
+                Object nextObject = nextField.get(o);
+                if (nextObject != null)
+                    return Pair.create(nextField.get(o), nextField);
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            return field == null ? name : field.toString() + "-" + o.getClass().getName();
+        }
+    }
+
+    static class Visitor implements Runnable
+    {
+        final Deque<InProgressVisit> path = new ArrayDeque<>();
+        final Set<Object> visited = Collections.newSetFromMap(new IdentityHashMap<>());
+        @VisibleForTesting
+        int lastVisitedCount;
+        @VisibleForTesting
+        long iterations = 0;
+        GlobalState visiting;
+        Set<GlobalState> haveLoops;
+
+        public void run()
+        {
+            try
+            {
+                for (GlobalState globalState : globallyExtant)
+                {
+                    if (globalState.tidy == null)
+                        continue;
+
+                    // do a graph exploration of the GlobalState, since it should be shallow; if it references itself, we have a problem
+                    path.clear();
+                    visited.clear();
+                    lastVisitedCount = 0;
+                    iterations = 0;
+                    visited.add(globalState);
+                    visiting = globalState;
+                    traverse(globalState.tidy);
+                }
+            }
+            catch (Throwable t)
+            {
+                t.printStackTrace();
+            }
+            finally
+            {
+                lastVisitedCount = visited.size();
+                path.clear();
+                visited.clear();
+            }
+        }
+
+        /*
+         * Searches for an indirect strong reference between rootObject and visiting.
+         */
+        void traverse(final RefCounted.Tidy rootObject)
+        {
+            path.offer(newInProgressVisit(rootObject, getFields(rootObject.getClass()), null, rootObject.name()));
+
+            InProgressVisit inProgress = null;
+            while (inProgress != null || !path.isEmpty())
+            {
+                //If necessary fetch the next object to start tracing
+                if (inProgress == null)
+                    inProgress = path.pollLast();
+
+                try
+                {
+                    Pair<Object, Field> p = inProgress.nextChild();
+                    Object child = null;
+                    Field field = null;
+
+                    if (p != null)
+                    {
+                        iterations++;
+                        child = p.left;
+                        field = p.right;
+                    }
+
+                    if (child != null && visited.add(child))
+                    {
+                        path.offer(inProgress);
+                        inProgress = newInProgressVisit(child, getFields(child.getClass()), field, null);
+                        continue;
+                    }
+                    else if (visiting == child)
+                    {
+                        if (haveLoops != null)
+                            haveLoops.add(visiting);
+                        NoSpamLogger.log(logger,
+                                NoSpamLogger.Level.ERROR,
+                                rootObject.getClass().getName(),
+                                1,
+                                TimeUnit.SECONDS,
+                                "Strong self-ref loop detected {}",
+                                path);
+                    }
+                    else if (child == null)
+                    {
+                        returnInProgressVisit(inProgress);
+                        inProgress = null;
+                        continue;
+                    }
+                }
+                catch (IllegalAccessException e)
+                {
+                    NoSpamLogger.log(logger, NoSpamLogger.Level.ERROR, 5, TimeUnit.MINUTES, "Could not fully check for self-referential leaks", e);
+                }
+            }
+        }
+    }
+
+    static final Map<Class<?>, List<Field>> fieldMap = new HashMap<>();
+    static List<Field> getFields(Class<?> clazz)
+    {
+        if (clazz == null || clazz == PhantomReference.class || clazz == Class.class || java.lang.reflect.Member.class.isAssignableFrom(clazz))
+            return emptyList();
+        List<Field> fields = fieldMap.get(clazz);
+        if (fields != null)
+            return fields;
+        fieldMap.put(clazz, fields = new ArrayList<>());
+        for (Field field : clazz.getDeclaredFields())
+        {
+            if (field.getType().isPrimitive() || Modifier.isStatic(field.getModifiers()))
+                continue;
+            field.setAccessible(true);
+            fields.add(field);
+        }
+        fields.addAll(getFields(clazz.getSuperclass()));
+        return fields;
+    }
+
+    public static class IdentityCollection
+    {
+        final Set<Tidy> candidates;
+        public IdentityCollection(Set<Tidy> candidates)
+        {
+            this.candidates = candidates;
+        }
+
+        public void add(Ref<?> ref)
+        {
+            candidates.remove(ref.state.globalState.tidy);
+        }
+        public void add(SelfRefCounted<?> ref)
+        {
+            add(ref.selfRef());
+        }
+        public void add(SharedCloseable ref)
+        {
+            if (ref instanceof SharedCloseableImpl)
+                add((SharedCloseableImpl)ref);
+        }
+        public void add(SharedCloseableImpl ref)
+        {
+            add(ref.ref);
+        }
+        public void add(Memory memory)
+        {
+            if (memory instanceof SafeMemory)
+                ((SafeMemory) memory).addTo(this);
+        }
+    }
+
+    private static class StrongLeakDetector implements Runnable
+    {
+        Set<Tidy> candidates = new HashSet<>();
+
+        public void run()
+        {
+            final Set<Tidy> candidates = Collections.newSetFromMap(new IdentityHashMap<>());
+            for (GlobalState state : globallyExtant)
+                candidates.add(state.tidy);
+            removeExpected(candidates);
+            this.candidates.retainAll(candidates);
+            if (!this.candidates.isEmpty())
+            {
+                List<String> names = new ArrayList<>();
+                for (Tidy tidy : this.candidates)
+                    names.add(tidy.name());
+                logger.warn("Strong reference leak candidates detected: {}", names);
+            }
+            this.candidates = candidates;
+        }
+
+        private void removeExpected(Set<Tidy> candidates)
+        {
+            final Ref.IdentityCollection expected = new Ref.IdentityCollection(candidates);
+            for (Keyspace ks : Keyspace.all())
+            {
+                for (ColumnFamilyStore cfs : ks.getColumnFamilyStores())
+                {
+                    View view = cfs.getTracker().getView();
+                    for (SSTableReader reader : view.allKnownSSTables())
+                        reader.addTo(expected);
+                }
+            }
+        }
+    }
+
     @VisibleForTesting
     public static void shutdownReferenceReaper(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
     {
-        ExecutorUtils.shutdownNowAndWait(timeout, unit, EXEC);
+        ExecutorUtils.shutdownNowAndWait(timeout, unit, EXEC, STRONG_LEAK_DETECTOR);
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/concurrent/SharedCloseable.java b/src/java/org/apache/cassandra/utils/concurrent/SharedCloseable.java
index a3a1863..d643d1d 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/SharedCloseable.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/SharedCloseable.java

@@ -33,4 +33,5 @@
     public SharedCloseable sharedCopy();
     public Throwable close(Throwable accumulate);
 
+    public void addTo(Ref.IdentityCollection identities);
 }

diff --git a/src/java/org/apache/cassandra/utils/concurrent/SharedCloseableImpl.java b/src/java/org/apache/cassandra/utils/concurrent/SharedCloseableImpl.java
index d85fd54..023df8f 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/SharedCloseableImpl.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/SharedCloseableImpl.java

@@ -49,4 +49,9 @@
     {
         return ref.ensureReleased(accumulate);
     }
+
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        identities.add(ref);
+    }
 }

diff --git a/src/java/org/apache/cassandra/utils/concurrent/Transactional.java b/src/java/org/apache/cassandra/utils/concurrent/Transactional.java
index 85c3de5..afc4fdf 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/Transactional.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/Transactional.java

@@ -41,7 +41,12 @@
  * If everything completes normally, then on exiting the try block the auto close method will invoke cleanup
  * to release any temporary state/resources
  *
- * No exceptions should be thrown during commit; if they are, it is not at all clear what the correct behaviour
+ * All exceptions and assertions that may be thrown should be checked and ruled out during commit preparation.
+ * Commit should generally never throw an exception unless there is a real correctness-affecting exception that
+ * cannot be moved to prepareToCommit, in which case this operation MUST be executed before any other commit
+ * methods in the object graph.
+ *
+ * If exceptions are generated by commit after this initial moment, it is not at all clear what the correct behaviour
  * of the system should be, and so simply logging the exception is likely best (since it may have been an issue
  * during cleanup, say), and rollback cannot now occur. As such all exceptions and assertions that may be thrown
  * should be checked and ruled out during commit preparation.
@@ -52,7 +57,6 @@
  */
 public interface Transactional extends AutoCloseable
 {
-
     /**
      * A simple abstract implementation of Transactional behaviour.
      * In general this should be used as the base class for any transactional implementations.
@@ -60,9 +64,9 @@
      * If the implementation wraps any internal Transactional objects, it must proxy every
      * commit() and abort() call onto each internal object to ensure correct behaviour
      */
-    public static abstract class AbstractTransactional implements Transactional
+    abstract class AbstractTransactional implements Transactional
     {
-        public static enum State
+        public enum State
         {
             IN_PROGRESS,
             READY_TO_COMMIT,
@@ -70,7 +74,6 @@
             ABORTED;
         }
 
-        private boolean permitRedundantTransitions;
         private State state = State.IN_PROGRESS;
 
         // the methods for actually performing the necessary behaviours, that are themselves protected against
@@ -84,7 +87,8 @@
         // Transactional objects will perform cleanup in the commit() or abort() calls
 
         /**
-         * perform an exception-safe pre-abort cleanup; this will still be run *after* commit
+         * perform an exception-safe pre-abort/commit cleanup;
+         * this will be run after prepareToCommit (so before commit), and before abort
          */
         protected Throwable doPreCleanup(Throwable accumulate){ return accumulate; }
 
@@ -104,12 +108,9 @@
          */
         public final Throwable commit(Throwable accumulate)
         {
-            if (permitRedundantTransitions && state == State.COMMITTED)
-                return accumulate;
             if (state != State.READY_TO_COMMIT)
                 throw new IllegalStateException("Cannot commit unless READY_TO_COMMIT; state is " + state);
             accumulate = doCommit(accumulate);
-            accumulate = doPreCleanup(accumulate);
             accumulate = doPostCleanup(accumulate);
             state = State.COMMITTED;
             return accumulate;
@@ -161,12 +162,11 @@
          */
         public final void prepareToCommit()
         {
-            if (permitRedundantTransitions && state == State.READY_TO_COMMIT)
-                return;
             if (state != State.IN_PROGRESS)
                 throw new IllegalStateException("Cannot prepare to commit unless IN_PROGRESS; state is " + state);
 
             doPrepare();
+            maybeFail(doPreCleanup(null));
             state = State.READY_TO_COMMIT;
         }
 
@@ -199,21 +199,19 @@
         {
             return state;
         }
-
-        protected void permitRedundantTransitions()
-        {
-            permitRedundantTransitions = true;
-        }
     }
 
     // commit should generally never throw an exception, and preferably never generate one,
     // but if it does generate one it should accumulate it in the parameter and return the result
     // IF a commit implementation has a real correctness affecting exception that cannot be moved to
     // prepareToCommit, it MUST be executed before any other commit methods in the object graph
-    public Throwable commit(Throwable accumulate);
+    Throwable commit(Throwable accumulate);
 
     // release any resources, then rollback all state changes (unless commit() has already been invoked)
-    public Throwable abort(Throwable accumulate);
+    Throwable abort(Throwable accumulate);
 
-    public void prepareToCommit();
+    void prepareToCommit();
+
+    // close() does not throw
+    public void close();
 }

diff --git a/src/java/org/apache/cassandra/utils/concurrent/WrappedSharedCloseable.java b/src/java/org/apache/cassandra/utils/concurrent/WrappedSharedCloseable.java
index 96e226c..0eefae3 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/WrappedSharedCloseable.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/WrappedSharedCloseable.java

@@ -20,6 +20,11 @@
 
 import java.util.Arrays;
 
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.utils.Throwables.maybeFail;
+import static org.apache.cassandra.utils.Throwables.merge;
+
 /**
  * An implementation of SharedCloseable that wraps a normal AutoCloseable,
  * ensuring its close method is only called when all instances of SharedCloseable have been
@@ -35,22 +40,41 @@
 
     public WrappedSharedCloseable(final AutoCloseable[] closeable)
     {
-        super(new RefCounted.Tidy()
-        {
-            public void tidy() throws Exception
-            {
-                for (AutoCloseable c : closeable)
-                    c.close();
-            }
-
-            public String name()
-            {
-                return Arrays.toString(closeable);
-            }
-        });
+        super(new Tidy(closeable));
         wrapped = closeable;
     }
 
+    static final class Tidy implements RefCounted.Tidy
+    {
+        final AutoCloseable[] closeable;
+        Tidy(AutoCloseable[] closeable)
+        {
+            this.closeable = closeable;
+        }
+
+        public void tidy() throws Exception
+        {
+            Throwable fail = null;
+            for (AutoCloseable c : closeable)
+            {
+                try
+                {
+                    c.close();
+                }
+                catch (Throwable t)
+                {
+                    fail = merge(fail, t);
+                }
+            }
+            maybeFail(fail);
+        }
+
+        public String name()
+    {
+        return Arrays.toString(closeable);
+    }
+    }
+
     protected WrappedSharedCloseable(WrappedSharedCloseable copy)
     {
         super(copy);

diff --git a/src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java b/src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java
index 0735d6e..9066335 100644
--- a/src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/AbstractAllocator.java

@@ -19,6 +19,11 @@
 
 import java.nio.ByteBuffer;
 
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.Columns;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public abstract class AbstractAllocator
@@ -40,4 +45,32 @@
     }
 
     public abstract ByteBuffer allocate(int size);
+
+    public Row.Builder cloningBTreeRowBuilder()
+    {
+        return new CloningBTreeRowBuilder(this);
+    }
+
+    private static class CloningBTreeRowBuilder extends BTreeRow.Builder
+    {
+        private final AbstractAllocator allocator;
+
+        private CloningBTreeRowBuilder(AbstractAllocator allocator)
+        {
+            super(true);
+            this.allocator = allocator;
+        }
+
+        @Override
+        public void newRow(Clustering clustering)
+        {
+            super.newRow(clustering.copy(allocator));
+        }
+
+        @Override
+        public void addCell(Cell cell)
+        {
+            super.addCell(cell.copy(allocator));
+        }
+    }
 }

diff --git a/src/java/org/apache/cassandra/utils/memory/BufferPool.java b/src/java/org/apache/cassandra/utils/memory/BufferPool.java
new file mode 100644
index 0000000..d0cea0f
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/memory/BufferPool.java

@@ -0,0 +1,851 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils.memory;
+
+import java.lang.ref.PhantomReference;
+import java.lang.ref.ReferenceQueue;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.ExecutorUtils;
+import org.apache.cassandra.utils.NoSpamLogger;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.cassandra.concurrent.InfiniteLoopExecutor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.metrics.BufferPoolMetrics;
+import org.apache.cassandra.utils.concurrent.Ref;
+
+import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
+import static org.apache.cassandra.utils.ExecutorUtils.shutdownNow;
+
+/**
+ * A pool of ByteBuffers that can be recycled.
+ */
+public class BufferPool
+{
+    /** The size of a page aligned buffer, 64KiB */
+    static final int CHUNK_SIZE = 64 << 10;
+
+    @VisibleForTesting
+    public static long MEMORY_USAGE_THRESHOLD = DatabaseDescriptor.getFileCacheSizeInMB() * 1024L * 1024L;
+
+    @VisibleForTesting
+    public static boolean ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = DatabaseDescriptor.getBufferPoolUseHeapIfExhausted();
+
+    @VisibleForTesting
+    public static boolean DISABLED = Boolean.parseBoolean(System.getProperty("cassandra.test.disable_buffer_pool", "false"));
+
+    @VisibleForTesting
+    public static boolean DEBUG = false;
+
+    private static final Logger logger = LoggerFactory.getLogger(BufferPool.class);
+    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 15L, TimeUnit.MINUTES);
+    private static final ByteBuffer EMPTY_BUFFER = ByteBuffer.allocateDirect(0);
+
+    /** A global pool of chunks (page aligned buffers) */
+    private static final GlobalPool globalPool = new GlobalPool();
+
+    /** A thread local pool of chunks, where chunks come from the global pool */
+    private static final ThreadLocal<LocalPool> localPool = new ThreadLocal<LocalPool>() {
+        @Override
+        protected LocalPool initialValue()
+        {
+            return new LocalPool();
+        }
+    };
+
+    public static ByteBuffer get(int size)
+    {
+        if (DISABLED)
+            return allocate(size, ALLOCATE_ON_HEAP_WHEN_EXAHUSTED);
+        else
+            return takeFromPool(size, ALLOCATE_ON_HEAP_WHEN_EXAHUSTED);
+    }
+
+    public static ByteBuffer get(int size, BufferType bufferType)
+    {
+        boolean direct = bufferType == BufferType.OFF_HEAP;
+        if (DISABLED || !direct)
+            return allocate(size, !direct);
+        else
+            return takeFromPool(size, !direct);
+    }
+
+    /** Unlike the get methods, this will return null if the pool is exhausted */
+    public static ByteBuffer tryGet(int size)
+    {
+        if (DISABLED)
+            return allocate(size, ALLOCATE_ON_HEAP_WHEN_EXAHUSTED);
+        else
+            return maybeTakeFromPool(size, ALLOCATE_ON_HEAP_WHEN_EXAHUSTED);
+    }
+
+    private static ByteBuffer allocate(int size, boolean onHeap)
+    {
+        return onHeap
+               ? ByteBuffer.allocate(size)
+               : ByteBuffer.allocateDirect(size);
+    }
+
+    private static ByteBuffer takeFromPool(int size, boolean allocateOnHeapWhenExhausted)
+    {
+        ByteBuffer ret = maybeTakeFromPool(size, allocateOnHeapWhenExhausted);
+        if (ret != null)
+            return ret;
+
+        if (logger.isTraceEnabled())
+            logger.trace("Requested buffer size {} has been allocated directly due to lack of capacity", size);
+
+        return localPool.get().allocate(size, allocateOnHeapWhenExhausted);
+    }
+
+    private static ByteBuffer maybeTakeFromPool(int size, boolean allocateOnHeapWhenExhausted)
+    {
+        if (size < 0)
+            throw new IllegalArgumentException("Size must be positive (" + size + ")");
+
+        if (size == 0)
+            return EMPTY_BUFFER;
+
+        if (size > CHUNK_SIZE)
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Requested buffer size {} is bigger than {}, allocating directly", size, CHUNK_SIZE);
+
+            return localPool.get().allocate(size, allocateOnHeapWhenExhausted);
+        }
+
+        return localPool.get().get(size);
+    }
+
+    public static void put(ByteBuffer buffer)
+    {
+        if (!(DISABLED || buffer.hasArray()))
+            localPool.get().put(buffer);
+    }
+
+    /** This is not thread safe and should only be used for unit testing. */
+    @VisibleForTesting
+    static void reset()
+    {
+        localPool.get().reset();
+        globalPool.reset();
+    }
+
+    @VisibleForTesting
+    static Chunk currentChunk()
+    {
+        return localPool.get().chunks[0];
+    }
+
+    @VisibleForTesting
+    static int numChunks()
+    {
+        int ret = 0;
+        for (Chunk chunk : localPool.get().chunks)
+        {
+            if (chunk != null)
+                ret++;
+        }
+        return ret;
+    }
+
+    @VisibleForTesting
+    static void assertAllRecycled()
+    {
+        globalPool.debug.check();
+    }
+
+    public static long sizeInBytes()
+    {
+        return globalPool.sizeInBytes();
+    }
+
+    static final class Debug
+    {
+        long recycleRound = 1;
+        final Queue<Chunk> allChunks = new ConcurrentLinkedQueue<>();
+        void register(Chunk chunk)
+        {
+            allChunks.add(chunk);
+        }
+        void recycle(Chunk chunk)
+        {
+            chunk.lastRecycled = recycleRound;
+        }
+        void check()
+        {
+            for (Chunk chunk : allChunks)
+                assert chunk.lastRecycled == recycleRound;
+            recycleRound++;
+        }
+    }
+
+    /**
+     * A queue of page aligned buffers, the chunks, which have been sliced from bigger chunks,
+     * the macro-chunks, also page aligned. Macro-chunks are allocated as long as we have not exceeded the
+     * memory maximum threshold, MEMORY_USAGE_THRESHOLD and are never released.
+     *
+     * This class is shared by multiple thread local pools and must be thread-safe.
+     */
+    static final class GlobalPool
+    {
+        /** The size of a bigger chunk, 1-mbit, must be a multiple of CHUNK_SIZE */
+        static final int MACRO_CHUNK_SIZE = 1 << 20;
+
+        static
+        {
+            assert Integer.bitCount(CHUNK_SIZE) == 1; // must be a power of 2
+            assert Integer.bitCount(MACRO_CHUNK_SIZE) == 1; // must be a power of 2
+            assert MACRO_CHUNK_SIZE % CHUNK_SIZE == 0; // must be a multiple
+
+            if (DISABLED)
+                logger.info("Global buffer pool is disabled, allocating {}", ALLOCATE_ON_HEAP_WHEN_EXAHUSTED ? "on heap" : "off heap");
+            else
+                logger.info("Global buffer pool is enabled, when pool is exahusted (max is {} mb) it will allocate {}",
+                            MEMORY_USAGE_THRESHOLD / (1024L * 1024L),
+                            ALLOCATE_ON_HEAP_WHEN_EXAHUSTED ? "on heap" : "off heap");
+        }
+
+        private final Debug debug = new Debug();
+        private final Queue<Chunk> macroChunks = new ConcurrentLinkedQueue<>();
+        // TODO (future): it would be preferable to use a CLStack to improve cache occupancy; it would also be preferable to use "CoreLocal" storage
+        private final Queue<Chunk> chunks = new ConcurrentLinkedQueue<>();
+        private final AtomicLong memoryUsage = new AtomicLong();
+
+        /** Return a chunk, the caller will take owership of the parent chunk. */
+        public Chunk get()
+        {
+            while (true)
+            {
+                Chunk chunk = chunks.poll();
+                if (chunk != null)
+                    return chunk;
+
+                if (!allocateMoreChunks())
+                    // give it one last attempt, in case someone else allocated before us
+                    return chunks.poll();
+            }
+        }
+
+        /**
+         * This method might be called by multiple threads and that's fine if we add more
+         * than one chunk at the same time as long as we don't exceed the MEMORY_USAGE_THRESHOLD.
+         */
+        private boolean allocateMoreChunks()
+        {
+            while (true)
+            {
+                long cur = memoryUsage.get();
+                if (cur + MACRO_CHUNK_SIZE > MEMORY_USAGE_THRESHOLD)
+                {
+                    noSpamLogger.info("Maximum memory usage reached ({} bytes), cannot allocate chunk of {} bytes",
+                                      MEMORY_USAGE_THRESHOLD, MACRO_CHUNK_SIZE);
+                    return false;
+                }
+                if (memoryUsage.compareAndSet(cur, cur + MACRO_CHUNK_SIZE))
+                    break;
+            }
+
+            // allocate a large chunk
+            Chunk chunk = new Chunk(allocateDirectAligned(MACRO_CHUNK_SIZE));
+            chunk.acquire(null);
+            macroChunks.add(chunk);
+            for (int i = 0 ; i < MACRO_CHUNK_SIZE ; i += CHUNK_SIZE)
+            {
+                Chunk add = new Chunk(chunk.get(CHUNK_SIZE));
+                chunks.add(add);
+                if (DEBUG)
+                    debug.register(add);
+            }
+
+            return true;
+        }
+
+        public void recycle(Chunk chunk)
+        {
+            chunks.add(chunk);
+        }
+
+        public long sizeInBytes()
+        {
+            return memoryUsage.get();
+        }
+
+        /** This is not thread safe and should only be used for unit testing. */
+        @VisibleForTesting
+        void reset()
+        {
+            while (!chunks.isEmpty())
+                chunks.poll().reset();
+
+            while (!macroChunks.isEmpty())
+                macroChunks.poll().reset();
+
+            memoryUsage.set(0);
+        }
+    }
+
+    /**
+     * A thread local class that grabs chunks from the global pool for this thread allocations.
+     * Only one thread can do the allocations but multiple threads can release the allocations.
+     */
+    static final class LocalPool
+    {
+        private final static BufferPoolMetrics metrics = new BufferPoolMetrics();
+        // a microqueue of Chunks:
+        //  * if any are null, they are at the end;
+        //  * new Chunks are added to the last null index
+        //  * if no null indexes available, the smallest is swapped with the last index, and this replaced
+        //  * this results in a queue that will typically be visited in ascending order of available space, so that
+        //    small allocations preferentially slice from the Chunks with the smallest space available to furnish them
+        // WARNING: if we ever change the size of this, we must update removeFromLocalQueue, and addChunk
+        private final Chunk[] chunks = new Chunk[3];
+        private byte chunkCount = 0;
+
+        public LocalPool()
+        {
+            localPoolReferences.add(new LocalPoolRef(this, localPoolRefQueue));
+        }
+
+        private Chunk addChunkFromGlobalPool()
+        {
+            Chunk chunk = globalPool.get();
+            if (chunk == null)
+                return null;
+
+            addChunk(chunk);
+            return chunk;
+        }
+
+        private void addChunk(Chunk chunk)
+        {
+            chunk.acquire(this);
+
+            if (chunkCount < 3)
+            {
+                chunks[chunkCount++] = chunk;
+                return;
+            }
+
+            int smallestChunkIdx = 0;
+            if (chunks[1].free() < chunks[0].free())
+                smallestChunkIdx = 1;
+            if (chunks[2].free() < chunks[smallestChunkIdx].free())
+                smallestChunkIdx = 2;
+
+            chunks[smallestChunkIdx].release();
+            if (smallestChunkIdx != 2)
+                chunks[smallestChunkIdx] = chunks[2];
+            chunks[2] = chunk;
+        }
+
+        public ByteBuffer get(int size)
+        {
+            for (Chunk chunk : chunks)
+            { // first see if our own chunks can serve this buffer
+                if (chunk == null)
+                    break;
+
+                ByteBuffer buffer = chunk.get(size);
+                if (buffer != null)
+                    return buffer;
+            }
+
+            // else ask the global pool
+            Chunk chunk = addChunkFromGlobalPool();
+            if (chunk != null)
+                return chunk.get(size);
+
+           return null;
+        }
+
+        private ByteBuffer allocate(int size, boolean onHeap)
+        {
+            metrics.misses.mark();
+            return BufferPool.allocate(size, onHeap);
+        }
+
+        public void put(ByteBuffer buffer)
+        {
+            Chunk chunk = Chunk.getParentChunk(buffer);
+            if (chunk == null)
+            {
+                FileUtils.clean(buffer);
+                return;
+            }
+
+            LocalPool owner = chunk.owner;
+            // ask the free method to take exclusive ownership of the act of recycling
+            // if we are either: already not owned by anyone, or owned by ourselves
+            long free = chunk.free(buffer, owner == null | owner == this);
+            if (free == 0L)
+            {
+                // 0L => we own recycling responsibility, so must recycle;
+                chunk.recycle();
+                // if we are also the owner, we must remove the Chunk from our local queue
+                if (owner == this)
+                    removeFromLocalQueue(chunk);
+            }
+            else if (((free == -1L) && owner != this) && chunk.owner == null)
+            {
+                // although we try to take recycle ownership cheaply, it is not always possible to do so if the owner is racing to unset.
+                // we must also check after completely freeing if the owner has since been unset, and try to recycle
+                chunk.tryRecycle();
+            }
+        }
+
+        private void removeFromLocalQueue(Chunk chunk)
+        {
+            // since we only have three elements in the queue, it is clearer, easier and faster to just hard code the options
+            if (chunks[0] == chunk)
+            {   // remove first by shifting back second two
+                chunks[0] = chunks[1];
+                chunks[1] = chunks[2];
+            }
+            else if (chunks[1] == chunk)
+            {   // remove second by shifting back last
+                chunks[1] = chunks[2];
+            }
+            else assert chunks[2] == chunk;
+            // whatever we do, the last element myst be null
+            chunks[2] = null;
+            chunkCount--;
+        }
+
+        @VisibleForTesting
+        void reset()
+        {
+            chunkCount = 0;
+            for (int i = 0; i < chunks.length; i++)
+            {
+                if (chunks[i] != null)
+                {
+                    chunks[i].owner = null;
+                    chunks[i].freeSlots = 0L;
+                    chunks[i].recycle();
+                    chunks[i] = null;
+                }
+            }
+        }
+    }
+
+    private static final class LocalPoolRef extends  PhantomReference<LocalPool>
+    {
+        private final Chunk[] chunks;
+        public LocalPoolRef(LocalPool localPool, ReferenceQueue<? super LocalPool> q)
+        {
+            super(localPool, q);
+            chunks = localPool.chunks;
+        }
+
+        public void release()
+        {
+            for (int i = 0 ; i < chunks.length ; i++)
+            {
+                if (chunks[i] != null)
+                {
+                    chunks[i].release();
+                    chunks[i] = null;
+                }
+            }
+        }
+    }
+
+    private static final ConcurrentLinkedQueue<LocalPoolRef> localPoolReferences = new ConcurrentLinkedQueue<>();
+
+    private static final ReferenceQueue<Object> localPoolRefQueue = new ReferenceQueue<>();
+    private static final InfiniteLoopExecutor EXEC = new InfiniteLoopExecutor("LocalPool-Cleaner", BufferPool::cleanupOneReference).start();
+
+    private static void cleanupOneReference() throws InterruptedException
+    {
+        Object obj = localPoolRefQueue.remove(100);
+        if (obj instanceof LocalPoolRef)
+        {
+            ((LocalPoolRef) obj).release();
+            localPoolReferences.remove(obj);
+        }
+    }
+
+    private static ByteBuffer allocateDirectAligned(int capacity)
+    {
+        int align = MemoryUtil.pageSize();
+        if (Integer.bitCount(align) != 1)
+            throw new IllegalArgumentException("Alignment must be a power of 2");
+
+        ByteBuffer buffer = ByteBuffer.allocateDirect(capacity + align);
+        long address = MemoryUtil.getAddress(buffer);
+        long offset = address & (align -1); // (address % align)
+
+        if (offset == 0)
+        { // already aligned
+            buffer.limit(capacity);
+        }
+        else
+        { // shift by offset
+            int pos = (int)(align - offset);
+            buffer.position(pos);
+            buffer.limit(pos + capacity);
+        }
+
+        return buffer.slice();
+    }
+
+    /**
+     * A memory chunk: it takes a buffer (the slab) and slices it
+     * into smaller buffers when requested.
+     *
+     * It divides the slab into 64 units and keeps a long mask, freeSlots,
+     * indicating if a unit is in use or not. Each bit in freeSlots corresponds
+     * to a unit, if the bit is set then the unit is free (available for allocation)
+     * whilst if it is not set then the unit is in use.
+     *
+     * When we receive a request of a given size we round up the size to the nearest
+     * multiple of allocation units required. Then we search for n consecutive free units,
+     * where n is the number of units required. We also align to page boundaries.
+     *
+     * When we reiceve a release request we work out the position by comparing the buffer
+     * address to our base address and we simply release the units.
+     */
+    final static class Chunk
+    {
+        private final ByteBuffer slab;
+        private final long baseAddress;
+        private final int shift;
+
+        private volatile long freeSlots;
+        private static final AtomicLongFieldUpdater<Chunk> freeSlotsUpdater = AtomicLongFieldUpdater.newUpdater(Chunk.class, "freeSlots");
+
+        // the pool that is _currently allocating_ from this Chunk
+        // if this is set, it means the chunk may not be recycled because we may still allocate from it;
+        // if it has been unset the local pool has finished with it, and it may be recycled
+        private volatile LocalPool owner;
+        private long lastRecycled;
+        private final Chunk original;
+
+        Chunk(Chunk recycle)
+        {
+            assert recycle.freeSlots == 0L;
+            this.slab = recycle.slab;
+            this.baseAddress = recycle.baseAddress;
+            this.shift = recycle.shift;
+            this.freeSlots = -1L;
+            this.original = recycle.original;
+            if (DEBUG)
+                globalPool.debug.recycle(original);
+        }
+
+        Chunk(ByteBuffer slab)
+        {
+            assert !slab.hasArray();
+            this.slab = slab;
+            this.baseAddress = MemoryUtil.getAddress(slab);
+
+            // The number of bits by which we need to shift to obtain a unit
+            // "31 &" is because numberOfTrailingZeros returns 32 when the capacity is zero
+            this.shift = 31 & (Integer.numberOfTrailingZeros(slab.capacity() / 64));
+            // -1 means all free whilst 0 means all in use
+            this.freeSlots = slab.capacity() == 0 ? 0L : -1L;
+            this.original = DEBUG ? this : null;
+        }
+
+        /**
+         * Acquire the chunk for future allocations: set the owner and prep
+         * the free slots mask.
+         */
+        void acquire(LocalPool owner)
+        {
+            assert this.owner == null;
+            this.owner = owner;
+        }
+
+        /**
+         * Set the owner to null and return the chunk to the global pool if the chunk is fully free.
+         * This method must be called by the LocalPool when it is certain that
+         * the local pool shall never try to allocate any more buffers from this chunk.
+         */
+        void release()
+        {
+            this.owner = null;
+            tryRecycle();
+        }
+
+        void tryRecycle()
+        {
+            assert owner == null;
+            if (isFree() && freeSlotsUpdater.compareAndSet(this, -1L, 0L))
+                recycle();
+        }
+
+        void recycle()
+        {
+            assert freeSlots == 0L;
+            globalPool.recycle(new Chunk(this));
+        }
+
+        /**
+         * We stash the chunk in the attachment of a buffer
+         * that was returned by get(), this method simply
+         * retrives the chunk that sliced a buffer, if any.
+         */
+        static Chunk getParentChunk(ByteBuffer buffer)
+        {
+            Object attachment = MemoryUtil.getAttachment(buffer);
+
+            if (attachment instanceof Chunk)
+                return (Chunk) attachment;
+
+            if (attachment instanceof Ref)
+                return ((Ref<Chunk>) attachment).get();
+
+            return null;
+        }
+
+        ByteBuffer setAttachment(ByteBuffer buffer)
+        {
+            if (Ref.DEBUG_ENABLED)
+                MemoryUtil.setAttachment(buffer, new Ref<>(this, null));
+            else
+                MemoryUtil.setAttachment(buffer, this);
+
+            return buffer;
+        }
+
+        boolean releaseAttachment(ByteBuffer buffer)
+        {
+            Object attachment = MemoryUtil.getAttachment(buffer);
+            if (attachment == null)
+                return false;
+
+            if (attachment instanceof Ref)
+                ((Ref<Chunk>) attachment).release();
+
+            return true;
+        }
+
+        @VisibleForTesting
+        void reset()
+        {
+            Chunk parent = getParentChunk(slab);
+            if (parent != null)
+                parent.free(slab, false);
+            else
+                FileUtils.clean(slab);
+        }
+
+        @VisibleForTesting
+        long setFreeSlots(long val)
+        {
+            long ret = freeSlots;
+            freeSlots = val;
+            return ret;
+        }
+
+        int capacity()
+        {
+            return 64 << shift;
+        }
+
+        final int unit()
+        {
+            return 1 << shift;
+        }
+
+        final boolean isFree()
+        {
+            return freeSlots == -1L;
+        }
+
+        /** The total free size */
+        int free()
+        {
+            return Long.bitCount(freeSlots) * unit();
+        }
+
+        /**
+         * Return the next available slice of this size. If
+         * we have exceeded the capacity we return null.
+         */
+        ByteBuffer get(int size)
+        {
+            // how many multiples of our units is the size?
+            // we add (unit - 1), so that when we divide by unit (>>> shift), we effectively round up
+            int slotCount = (size - 1 + unit()) >>> shift;
+
+            // if we require more than 64 slots, we cannot possibly accommodate the allocation
+            if (slotCount > 64)
+                return null;
+
+            // convert the slotCount into the bits needed in the bitmap, but at the bottom of the register
+            long slotBits = -1L >>> (64 - slotCount);
+
+            // in order that we always allocate page aligned results, we require that any allocation is "somewhat" aligned
+            // i.e. any single unit allocation can go anywhere; any 2 unit allocation must begin in one of the first 3 slots
+            // of a page; a 3 unit must go in the first two slots; and any four unit allocation must be fully page-aligned
+
+            // to achieve this, we construct a searchMask that constrains the bits we find to those we permit starting
+            // a match from. as we find bits, we remove them from the mask to continue our search.
+            // this has an odd property when it comes to concurrent alloc/free, as we can safely skip backwards if
+            // a new slot is freed up, but we always make forward progress (i.e. never check the same bits twice),
+            // so running time is bounded
+            long searchMask = 0x1111111111111111L;
+            searchMask *= 15L >>> ((slotCount - 1) & 3);
+            // i.e. switch (slotCount & 3)
+            // case 1: searchMask = 0xFFFFFFFFFFFFFFFFL
+            // case 2: searchMask = 0x7777777777777777L
+            // case 3: searchMask = 0x3333333333333333L
+            // case 0: searchMask = 0x1111111111111111L
+
+            // truncate the mask, removing bits that have too few slots proceeding them
+            searchMask &= -1L >>> (slotCount - 1);
+
+            // this loop is very unroll friendly, and would achieve high ILP, but not clear if the compiler will exploit this.
+            // right now, not worth manually exploiting, but worth noting for future
+            while (true)
+            {
+                long cur = freeSlots;
+                // find the index of the lowest set bit that also occurs in our mask (i.e. is permitted alignment, and not yet searched)
+                // we take the index, rather than finding the lowest bit, since we must obtain it anyway, and shifting is more efficient
+                // than multiplication
+                int index = Long.numberOfTrailingZeros(cur & searchMask);
+
+                // if no bit was actually found, we cannot serve this request, so return null.
+                // due to truncating the searchMask this immediately terminates any search when we run out of indexes
+                // that could accommodate the allocation, i.e. is equivalent to checking (64 - index) < slotCount
+                if (index == 64)
+                    return null;
+
+                // remove this bit from our searchMask, so we don't return here next round
+                searchMask ^= 1L << index;
+                // if our bits occur starting at the index, remove ourselves from the bitmask and return
+                long candidate = slotBits << index;
+                if ((candidate & cur) == candidate)
+                {
+                    // here we are sure we will manage to CAS successfully without changing candidate because
+                    // there is only one thread allocating at the moment, the concurrency is with the release
+                    // operations only
+                    while (true)
+                    {
+                        // clear the candidate bits (freeSlots &= ~candidate)
+                        if (freeSlotsUpdater.compareAndSet(this, cur, cur & ~candidate))
+                            break;
+
+                        cur = freeSlots;
+                        // make sure no other thread has cleared the candidate bits
+                        assert ((candidate & cur) == candidate);
+                    }
+                    return get(index << shift, size);
+                }
+            }
+        }
+
+        private ByteBuffer get(int offset, int size)
+        {
+            slab.limit(offset + size);
+            slab.position(offset);
+
+            return setAttachment(slab.slice());
+        }
+
+        /**
+         * Round the size to the next unit multiple.
+         */
+        int roundUp(int v)
+        {
+            return BufferPool.roundUp(v, unit());
+        }
+
+        /**
+         * Release a buffer. Return:
+         *    0L if the buffer must be recycled after the call;
+         *   -1L if it is free (and so we should tryRecycle if owner is now null)
+         *    some other value otherwise
+         **/
+        long free(ByteBuffer buffer, boolean tryRelease)
+        {
+            if (!releaseAttachment(buffer))
+                return 1L;
+
+            long address = MemoryUtil.getAddress(buffer);
+            assert (address >= baseAddress) & (address <= baseAddress + capacity());
+
+            int position = (int)(address - baseAddress);
+            int size = roundUp(buffer.capacity());
+
+            position >>= shift;
+            int slotCount = size >> shift;
+
+            long slotBits = (1L << slotCount) - 1;
+            long shiftedSlotBits = (slotBits << position);
+
+            if (slotCount == 64)
+            {
+                assert size == capacity();
+                assert position == 0;
+                shiftedSlotBits = -1L;
+            }
+
+            long next;
+            while (true)
+            {
+                long cur = freeSlots;
+                next = cur | shiftedSlotBits;
+                assert next == (cur ^ shiftedSlotBits); // ensure no double free
+                if (tryRelease && (next == -1L))
+                    next = 0L;
+                if (freeSlotsUpdater.compareAndSet(this, cur, next))
+                    return next;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("[slab %s, slots bitmap %s, capacity %d, free %d]", slab, Long.toBinaryString(freeSlots), capacity(), free());
+        }
+    }
+
+    @VisibleForTesting
+    public static int roundUpNormal(int size)
+    {
+        return roundUp(size, CHUNK_SIZE / 64);
+    }
+
+    private static int roundUp(int size, int unit)
+    {
+        int mask = unit - 1;
+        return (size + mask) & ~mask;
+    }
+
+    @VisibleForTesting
+    public static void shutdownLocalCleaner(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
+    {
+        shutdownNow(Arrays.asList(EXEC));
+        awaitTermination(timeout, unit, Arrays.asList(EXEC));
+    }
+}

diff --git a/src/java/org/apache/cassandra/utils/memory/HeapPool.java b/src/java/org/apache/cassandra/utils/memory/HeapPool.java
index d28dbf7..57242c4 100644
--- a/src/java/org/apache/cassandra/utils/memory/HeapPool.java
+++ b/src/java/org/apache/cassandra/utils/memory/HeapPool.java

@@ -19,11 +19,7 @@
 package org.apache.cassandra.utils.memory;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
 
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public class HeapPool extends MemtablePool
@@ -43,7 +39,7 @@
         return new Allocator(this);
     }
 
-    public static class Allocator extends MemtableBufferAllocator
+    private static class Allocator extends MemtableBufferAllocator
     {
         Allocator(HeapPool pool)
         {
@@ -55,48 +51,5 @@
             super.onHeap().allocate(size, opGroup);
             return ByteBuffer.allocate(size);
         }
-
-        public DataReclaimer reclaimer()
-        {
-            return new Reclaimer();
-        }
-
-        private class Reclaimer implements DataReclaimer
-        {
-            List<Cell> delayed;
-
-            public Reclaimer reclaim(Cell cell)
-            {
-                if (delayed == null)
-                    delayed = new ArrayList<>();
-                delayed.add(cell);
-                return this;
-            }
-
-            public Reclaimer reclaimImmediately(Cell cell)
-            {
-                onHeap().released(cell.name().dataSize() + cell.value().remaining());
-                return this;
-            }
-
-            public Reclaimer reclaimImmediately(DecoratedKey key)
-            {
-                onHeap().released(key.getKey().remaining());
-                return this;
-            }
-
-            public void cancel()
-            {
-                if (delayed != null)
-                    delayed.clear();
-            }
-
-            public void commit()
-            {
-                if (delayed != null)
-                    for (Cell cell : delayed)
-                        reclaimImmediately(cell);
-            }
-        }
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java b/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java
index 25d2dd8..22ecbf5 100644
--- a/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java
+++ b/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java

@@ -36,6 +36,7 @@
     private static final long DIRECT_BYTE_BUFFER_CAPACITY_OFFSET;
     private static final long DIRECT_BYTE_BUFFER_LIMIT_OFFSET;
     private static final long DIRECT_BYTE_BUFFER_POSITION_OFFSET;
+    private static final long DIRECT_BYTE_BUFFER_ATTACHMENT_OFFSET;
     private static final Class<?> BYTE_BUFFER_CLASS;
     private static final long BYTE_BUFFER_OFFSET_OFFSET;
     private static final long BYTE_BUFFER_HB_OFFSET;
@@ -49,8 +50,10 @@
     static
     {
         String arch = System.getProperty("os.arch");
+        // Note that s390x architecture are not officially supported and adding it here is only done out of convenience
+        // for those that want to run C* on this architecture at their own risk (see #11214)
         UNALIGNED = arch.equals("i386") || arch.equals("x86")
-                || arch.equals("amd64") || arch.equals("x86_64");
+                || arch.equals("amd64") || arch.equals("x86_64") || arch.equals("s390x");
         INVERTED_ORDER = UNALIGNED && !BIG_ENDIAN;
         try
         {
@@ -62,6 +65,7 @@
             DIRECT_BYTE_BUFFER_CAPACITY_OFFSET = unsafe.objectFieldOffset(Buffer.class.getDeclaredField("capacity"));
             DIRECT_BYTE_BUFFER_LIMIT_OFFSET = unsafe.objectFieldOffset(Buffer.class.getDeclaredField("limit"));
             DIRECT_BYTE_BUFFER_POSITION_OFFSET = unsafe.objectFieldOffset(Buffer.class.getDeclaredField("position"));
+            DIRECT_BYTE_BUFFER_ATTACHMENT_OFFSET = unsafe.objectFieldOffset(clazz.getDeclaredField("att"));
             DIRECT_BYTE_BUFFER_CLASS = clazz;
 
             clazz = ByteBuffer.allocate(0).getClass();
@@ -77,6 +81,17 @@
         }
     }
 
+    public static int pageSize()
+    {
+        return unsafe.pageSize();
+    }
+
+    public static long getAddress(ByteBuffer buffer)
+    {
+        assert buffer.getClass() == DIRECT_BYTE_BUFFER_CLASS;
+        return unsafe.getLong(buffer, DIRECT_BYTE_BUFFER_ADDRESS_OFFSET);
+    }
+
     public static long allocate(long size)
     {
         return Native.malloc(size);
@@ -177,9 +192,21 @@
         unsafe.putInt(instance, DIRECT_BYTE_BUFFER_LIMIT_OFFSET, length);
     }
 
+    public static Object getAttachment(ByteBuffer instance)
+    {
+        assert instance.getClass() == DIRECT_BYTE_BUFFER_CLASS;
+        return unsafe.getObject(instance, DIRECT_BYTE_BUFFER_ATTACHMENT_OFFSET);
+    }
+
+    public static void setAttachment(ByteBuffer instance, Object next)
+    {
+        assert instance.getClass() == DIRECT_BYTE_BUFFER_CLASS;
+        unsafe.putObject(instance, DIRECT_BYTE_BUFFER_ATTACHMENT_OFFSET, next);
+    }
+
     public static ByteBuffer duplicateDirectByteBuffer(ByteBuffer source, ByteBuffer hollowBuffer)
     {
-        assert(source.isDirect());
+        assert source.getClass() == DIRECT_BYTE_BUFFER_CLASS;
         unsafe.putLong(hollowBuffer, DIRECT_BYTE_BUFFER_ADDRESS_OFFSET, unsafe.getLong(source, DIRECT_BYTE_BUFFER_ADDRESS_OFFSET));
         unsafe.putInt(hollowBuffer, DIRECT_BYTE_BUFFER_POSITION_OFFSET, unsafe.getInt(source, DIRECT_BYTE_BUFFER_POSITION_OFFSET));
         unsafe.putInt(hollowBuffer, DIRECT_BYTE_BUFFER_LIMIT_OFFSET, unsafe.getInt(source, DIRECT_BYTE_BUFFER_LIMIT_OFFSET));
@@ -187,17 +214,6 @@
         return hollowBuffer;
     }
 
-    public static ByteBuffer duplicateByteBuffer(ByteBuffer source, ByteBuffer hollowBuffer)
-    {
-        assert(!source.isDirect());
-        unsafe.putInt(hollowBuffer, DIRECT_BYTE_BUFFER_POSITION_OFFSET, unsafe.getInt(source, DIRECT_BYTE_BUFFER_POSITION_OFFSET));
-        unsafe.putInt(hollowBuffer, DIRECT_BYTE_BUFFER_LIMIT_OFFSET, unsafe.getInt(source, DIRECT_BYTE_BUFFER_LIMIT_OFFSET));
-        unsafe.putInt(hollowBuffer, DIRECT_BYTE_BUFFER_CAPACITY_OFFSET, unsafe.getInt(source, DIRECT_BYTE_BUFFER_CAPACITY_OFFSET));
-        unsafe.putInt(hollowBuffer, BYTE_BUFFER_OFFSET_OFFSET, unsafe.getInt(source, BYTE_BUFFER_OFFSET_OFFSET));
-        unsafe.putObject(hollowBuffer, BYTE_BUFFER_HB_OFFSET, unsafe.getObject(source, BYTE_BUFFER_HB_OFFSET));
-        return hollowBuffer;
-    }
-
     public static long getLongByByte(long address)
     {
         if (BIG_ENDIAN)

diff --git a/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java b/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java
index f5e743c..8383ddc 100644
--- a/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/MemtableAllocator.java

@@ -22,6 +22,7 @@
 
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.concurrent.WaitQueue;
 
@@ -58,12 +59,8 @@
         this.offHeap = offHeap;
     }
 
-    public abstract Cell clone(Cell cell, CFMetaData cfm, OpOrder.Group writeOp);
-    public abstract CounterCell clone(CounterCell cell, CFMetaData cfm, OpOrder.Group writeOp);
-    public abstract DeletedCell clone(DeletedCell cell, CFMetaData cfm, OpOrder.Group writeOp);
-    public abstract ExpiringCell clone(ExpiringCell cell, CFMetaData cfm, OpOrder.Group writeOp);
+    public abstract Row.Builder rowBuilder(OpOrder.Group opGroup);
     public abstract DecoratedKey clone(DecoratedKey key, OpOrder.Group opGroup);
-    public abstract DataReclaimer reclaimer();
 
     public SubAllocator onHeap()
     {
@@ -104,41 +101,6 @@
         return state == LifeCycle.LIVE;
     }
 
-    public static interface DataReclaimer
-    {
-        public DataReclaimer reclaim(Cell cell);
-        public DataReclaimer reclaimImmediately(Cell cell);
-        public DataReclaimer reclaimImmediately(DecoratedKey key);
-        public void cancel();
-        public void commit();
-    }
-
-    public static final DataReclaimer NO_OP = new DataReclaimer()
-    {
-        public DataReclaimer reclaim(Cell cell)
-        {
-            return this;
-        }
-
-        public DataReclaimer reclaimImmediately(Cell cell)
-        {
-            return this;
-        }
-
-        public DataReclaimer reclaimImmediately(DecoratedKey key)
-        {
-            return this;
-        }
-
-        @Override
-        public void cancel()
-        {}
-
-        @Override
-        public void commit()
-        {}
-    };
-
     /** Mark the BB as unused, permitting it to be reclaimed */
     public static final class SubAllocator
     {
@@ -185,7 +147,12 @@
                     acquired(size);
                     return;
                 }
-                WaitQueue.Signal signal = opGroup.isBlockingSignal(parent.hasRoom().register());
+                if (opGroup.isBlocking())
+                {
+                    allocated(size);
+                    return;
+                }
+                WaitQueue.Signal signal = opGroup.isBlockingSignal(parent.hasRoom().register(parent.blockedTimerContext()));
                 boolean allocated = parent.tryAllocate(size);
                 if (allocated || opGroup.isBlocking())
                 {

diff --git a/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java b/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java
index 7034d76..fb35b38 100644
--- a/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/MemtableBufferAllocator.java

@@ -20,40 +20,20 @@
 import java.nio.ByteBuffer;
 
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.BufferDecoratedKey;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.CounterCell;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.DeletedCell;
-import org.apache.cassandra.db.ExpiringCell;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public abstract class MemtableBufferAllocator extends MemtableAllocator
 {
-
     protected MemtableBufferAllocator(SubAllocator onHeap, SubAllocator offHeap)
     {
         super(onHeap, offHeap);
     }
 
-    public Cell clone(Cell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    public Row.Builder rowBuilder(OpOrder.Group writeOp)
     {
-        return cell.localCopy(cfm, allocator(writeOp));
-    }
-
-    public CounterCell clone(CounterCell cell, CFMetaData cfm, OpOrder.Group writeOp)
-    {
-        return cell.localCopy(cfm, allocator(writeOp));
-    }
-
-    public DeletedCell clone(DeletedCell cell, CFMetaData cfm, OpOrder.Group writeOp)
-    {
-        return cell.localCopy(cfm, allocator(writeOp));
-    }
-
-    public ExpiringCell clone(ExpiringCell cell, CFMetaData cfm, OpOrder.Group writeOp)
-    {
-        return cell.localCopy(cfm, allocator(writeOp));
+        return allocator(writeOp).cloningBTreeRowBuilder();
     }
 
     public DecoratedKey clone(DecoratedKey key, OpOrder.Group writeOp)

diff --git a/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java b/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java
index 628b8c0..b905d2c 100644
--- a/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java
+++ b/src/java/org/apache/cassandra/utils/memory/MemtableCleanerThread.java

@@ -68,16 +68,10 @@
     }
 
     private final Runnable trigger;
-    private MemtableCleanerThread(final Clean<P> clean)
+    private MemtableCleanerThread(Clean<P> clean)
     {
         super(clean.pool.getClass().getSimpleName() + "Cleaner", clean);
-        this.trigger = new Runnable()
-        {
-            public void run()
-            {
-                clean.wait.signal();
-            }
-        };
+        this.trigger = clean.wait::signal;
     }
 
     MemtableCleanerThread(P pool, Runnable cleaner)

diff --git a/src/java/org/apache/cassandra/utils/memory/MemtablePool.java b/src/java/org/apache/cassandra/utils/memory/MemtablePool.java
index 9c4824a..8061566 100644
--- a/src/java/org/apache/cassandra/utils/memory/MemtablePool.java
+++ b/src/java/org/apache/cassandra/utils/memory/MemtablePool.java

@@ -24,8 +24,11 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
-import org.apache.cassandra.utils.ExecutorUtils;
+import com.codahale.metrics.Timer;
+import org.apache.cassandra.metrics.CassandraMetricsRegistry;
+import org.apache.cassandra.metrics.DefaultNameFactory;
 import org.apache.cassandra.utils.concurrent.WaitQueue;
+import org.apache.cassandra.utils.ExecutorUtils;
 
 
 /**
@@ -40,6 +43,8 @@
     public final SubPool onHeap;
     public final SubPool offHeap;
 
+    public final Timer blockedOnAllocating;
+
     final WaitQueue hasRoom = new WaitQueue();
 
     MemtablePool(long maxOnHeapMemory, long maxOffHeapMemory, float cleanThreshold, Runnable cleaner)
@@ -47,6 +52,8 @@
         this.onHeap = getSubPool(maxOnHeapMemory, cleanThreshold);
         this.offHeap = getSubPool(maxOffHeapMemory, cleanThreshold);
         this.cleaner = getCleaner(cleaner);
+        blockedOnAllocating = CassandraMetricsRegistry.Metrics.timer(new DefaultNameFactory("MemtablePool")
+                                                                         .createMetricName("BlockedOnAllocation"));
         if (this.cleaner != null)
             this.cleaner.start();
     }
@@ -69,7 +76,6 @@
         ExecutorUtils.shutdownNowAndWait(timeout, unit, cleaner);
     }
 
-
     public abstract MemtableAllocator newAllocator();
 
     /**
@@ -222,6 +228,11 @@
         {
             return hasRoom;
         }
+
+        public Timer.Context blockedTimerContext()
+        {
+            return blockedOnAllocating.time();
+        }
     }
 
     private static final AtomicLongFieldUpdater<SubPool> reclaimingUpdater = AtomicLongFieldUpdater.newUpdater(SubPool.class, "reclaiming");

diff --git a/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java b/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
index 98878c0..3d4ec16 100644
--- a/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java

@@ -24,17 +24,9 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.CounterCell;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.DeletedCell;
-import org.apache.cassandra.db.ExpiringCell;
-import org.apache.cassandra.db.NativeCell;
-import org.apache.cassandra.db.NativeCounterCell;
 import org.apache.cassandra.db.NativeDecoratedKey;
-import org.apache.cassandra.db.NativeDeletedCell;
-import org.apache.cassandra.db.NativeExpiringCell;
+import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 
 /**
@@ -70,28 +62,10 @@
         super(pool.onHeap.newAllocator(), pool.offHeap.newAllocator());
     }
 
-    @Override
-    public Cell clone(Cell cell, CFMetaData cfm, OpOrder.Group writeOp)
+    public Row.Builder rowBuilder(OpOrder.Group opGroup)
     {
-        return new NativeCell(this, writeOp, cell);
-    }
-
-    @Override
-    public CounterCell clone(CounterCell cell, CFMetaData cfm, OpOrder.Group writeOp)
-    {
-        return new NativeCounterCell(this, writeOp, cell);
-    }
-
-    @Override
-    public DeletedCell clone(DeletedCell cell, CFMetaData cfm, OpOrder.Group writeOp)
-    {
-        return new NativeDeletedCell(this, writeOp, cell);
-    }
-
-    @Override
-    public ExpiringCell clone(ExpiringCell cell, CFMetaData cfm, OpOrder.Group writeOp)
-    {
-        return new NativeExpiringCell(this, writeOp, cell);
+        // TODO
+        throw new UnsupportedOperationException();
     }
 
     public DecoratedKey clone(DecoratedKey key, OpOrder.Group writeOp)
@@ -99,12 +73,6 @@
         return new NativeDecoratedKey(key.getToken(), this, writeOp, key.getKey());
     }
 
-    @Override
-    public MemtableAllocator.DataReclaimer reclaimer()
-    {
-        return NO_OP;
-    }
-
     public long allocate(int size, OpOrder.Group opGroup)
     {
         assert size >= 0;
@@ -219,11 +187,6 @@
         private AtomicInteger nextFreeOffset = new AtomicInteger(0);
 
         /**
-         * Total number of allocations satisfied from this buffer
-         */
-        private AtomicInteger allocCount = new AtomicInteger();
-
-        /**
          * Create an uninitialized region. Note that memory is not allocated yet, so
          * this is cheap.
          *
@@ -238,34 +201,24 @@
         /**
          * Try to allocate <code>size</code> bytes from the region.
          *
-         * @return the successful allocation, or null to indicate not-enough-space
+         * @return the successful allocation, or -1 to indicate not-enough-space
          */
         long allocate(int size)
         {
-            while (true)
-            {
-                int oldOffset = nextFreeOffset.get();
+            int newOffset = nextFreeOffset.getAndAdd(size);
 
-                if (oldOffset + size > capacity) // capacity == remaining
-                    return -1;
+            if (newOffset + size > capacity)
+                // this region is full
+                return -1;
 
-                // Try to atomically claim this region
-                if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size))
-                {
-                    // we got the alloc
-                    allocCount.incrementAndGet();
-                    return peer + oldOffset;
-                }
-                // we raced and lost alloc, try again
-            }
+            return peer + newOffset;
         }
 
         @Override
         public String toString()
         {
             return "Region@" + System.identityHashCode(this) +
-                    " allocs=" + allocCount.get() + "waste=" +
-                    (capacity - nextFreeOffset.get());
+                    "waste=" + Math.max(0, capacity - nextFreeOffset.get());
         }
     }
 

diff --git a/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java b/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
index 8ffead1..5a8ec18 100644
--- a/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java

@@ -106,11 +106,6 @@
         }
     }
 
-    public DataReclaimer reclaimer()
-    {
-        return NO_OP;
-    }
-
     public void setDiscarded()
     {
         for (Region region : offHeapRegions)
@@ -177,11 +172,6 @@
         private AtomicInteger nextFreeOffset = new AtomicInteger(0);
 
         /**
-         * Total number of allocations satisfied from this buffer
-         */
-        private AtomicInteger allocCount = new AtomicInteger();
-
-        /**
          * Create an uninitialized region. Note that memory is not allocated yet, so
          * this is cheap.
          *
@@ -199,30 +189,20 @@
          */
         public ByteBuffer allocate(int size)
         {
-            while (true)
-            {
-                int oldOffset = nextFreeOffset.get();
+            int newOffset = nextFreeOffset.getAndAdd(size);
 
-                if (oldOffset + size > data.capacity()) // capacity == remaining
-                    return null;
+            if (newOffset + size > data.capacity())
+                // this region is full
+                return null;
 
-                // Try to atomically claim this region
-                if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size))
-                {
-                    // we got the alloc
-                    allocCount.incrementAndGet();
-                    return (ByteBuffer) data.duplicate().position(oldOffset).limit(oldOffset + size);
-                }
-                // we raced and lost alloc, try again
-            }
+            return (ByteBuffer) data.duplicate().position((newOffset)).limit(newOffset + size);
         }
 
         @Override
         public String toString()
         {
             return "Region@" + System.identityHashCode(this) +
-                   " allocs=" + allocCount.get() + "waste=" +
-                   (data.capacity() - nextFreeOffset.get());
+                   "waste=" + Math.max(0, data.capacity() - nextFreeOffset.get());
         }
     }
 }

diff --git a/src/java/org/apache/cassandra/utils/obs/IBitSet.java b/src/java/org/apache/cassandra/utils/obs/IBitSet.java
index ed7e54b..15ff361 100644
--- a/src/java/org/apache/cassandra/utils/obs/IBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/IBitSet.java

@@ -21,7 +21,7 @@
 import java.io.DataOutput;
 import java.io.IOException;
 
-import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.utils.concurrent.Ref;
 
 public interface IBitSet extends Closeable
 {
@@ -46,7 +46,7 @@
 
     public void serialize(DataOutput out) throws IOException;
 
-    public long serializedSize(TypeSizes type);
+    public long serializedSize();
 
     public void clear();
 
@@ -57,4 +57,6 @@
      * @return the amount of memory in bytes used off heap
      */
     public long offHeapSize();
+
+    public void addTo(Ref.IdentityCollection identities);
 }

diff --git a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
index 46c1bd0..8593a11 100644
--- a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java

@@ -23,6 +23,7 @@
 
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.util.Memory;
+import org.apache.cassandra.utils.concurrent.Ref;
 
 /**
  * Off-heap bitset,
@@ -67,6 +68,11 @@
         return bytes.size();
     }
 
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        identities.add(bytes);
+    }
+
     public boolean get(long index)
     {
         long i = index >> 3;
@@ -108,7 +114,7 @@
         out.writeInt((int) (bytes.size() / 8));
         for (long i = 0; i < bytes.size();)
         {
-            long value = ((bytes.getByte(i++) & 0xff) << 0) 
+            long value = ((bytes.getByte(i++) & 0xff) << 0)
                        + ((bytes.getByte(i++) & 0xff) << 8)
                        + ((bytes.getByte(i++) & 0xff) << 16)
                        + ((long) (bytes.getByte(i++) & 0xff) << 24)
@@ -120,9 +126,9 @@
         }
     }
 
-    public long serializedSize(TypeSizes type)
+    public long serializedSize()
     {
-        return type.sizeof((int) bytes.size()) + bytes.size();
+        return TypeSizes.sizeof((int) bytes.size()) + bytes.size();
     }
 
     @SuppressWarnings("resource")

diff --git a/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java b/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java
index e793f6c..82e6929 100644
--- a/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/OpenBitSet.java

@@ -23,6 +23,7 @@
 import java.io.IOException;
 
 import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.utils.concurrent.Ref;
 
 /**
  * <p>
@@ -115,7 +116,11 @@
       return 0;
   }
 
- /**
+    public void addTo(Ref.IdentityCollection identities)
+    {
+    }
+
+    /**
   * Returns the current capacity of this set.  Included for
   * compatibility.  This is *not* equal to {@link #cardinality}
   */
@@ -416,16 +421,16 @@
     }
 }
 
-  public long serializedSize(TypeSizes type) {
+  public long serializedSize() {
     int bitLength = getNumWords();
     int pageSize = getPageSize();
     int pageCount = getPageCount();
 
-    long size = type.sizeof(bitLength); // length
+    long size = TypeSizes.sizeof(bitLength); // length
     for (int p = 0; p < pageCount; p++) {
       long[] bits = getPage(p);
       for (int i = 0; i < pageSize && bitLength-- > 0; i++)
-        size += type.sizeof(bits[i]); // bucket
+        size += TypeSizes.sizeof(bits[i]); // bucket
     }
     return size;
   }

diff --git a/src/java/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupport.java b/src/java/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupport.java
index 438e411..e0439af 100644
--- a/src/java/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupport.java
+++ b/src/java/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupport.java

@@ -18,14 +18,13 @@
 
 package org.apache.cassandra.utils.progress.jmx;
 
+import java.util.Optional;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.regex.Pattern;
 import javax.management.Notification;
 import javax.management.NotificationBroadcasterSupport;
 import javax.management.ObjectName;
 
-import com.google.common.base.Optional;
-
 import org.apache.cassandra.utils.progress.ProgressEvent;
 import org.apache.cassandra.utils.progress.ProgressListener;
 
@@ -36,7 +35,7 @@
  */
 public class LegacyJMXProgressSupport implements ProgressListener
 {
-    protected static final Pattern SESSION_FAILED_MATCHER = Pattern.compile("Repair session .* for range .* failed with error .*");
+    protected static final Pattern SESSION_FAILED_MATCHER = Pattern.compile("Repair session .* for range .* failed with error .*|Repair command .* failed with error .*");
     protected static final Pattern SESSION_SUCCESS_MATCHER = Pattern.compile("Repair session .* for range .* finished");
 
     private final AtomicLong notificationSerialNumber = new AtomicLong();
@@ -76,7 +75,7 @@
             result[1] = status.get().ordinal();
             return Optional.of(result);
         }
-        return Optional.absent();
+        return Optional.empty();
     }
 
     protected static Optional<Status> getStatus(ProgressEvent event)
@@ -87,6 +86,7 @@
                 return Optional.of(Status.STARTED);
             case COMPLETE:
                 return Optional.of(Status.FINISHED);
+            case ERROR:
             case PROGRESS:
                 if (SESSION_FAILED_MATCHER.matcher(event.getMessage()).matches())
                 {
@@ -98,7 +98,7 @@
                 }
         }
 
-        return Optional.absent();
+        return Optional.empty();
     }
 
     protected static int getCmd(String tag)

diff --git a/src/java/org/apache/cassandra/utils/vint/EncodedDataInputStream.java b/src/java/org/apache/cassandra/utils/vint/EncodedDataInputStream.java
deleted file mode 100644
index bee8ab0..0000000
--- a/src/java/org/apache/cassandra/utils/vint/EncodedDataInputStream.java
+++ /dev/null

@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils.vint;
-
-import java.io.DataInput;
-import java.io.IOException;
-
-import org.apache.cassandra.io.util.AbstractDataInput;
-
-/**
- * Borrows idea from
- * https://developers.google.com/protocol-buffers/docs/encoding#varints
- *
- * Should be used with EncodedDataOutputStream
- */
-public class EncodedDataInputStream extends AbstractDataInput implements DataInput
-{
-    private DataInput input;
-
-    public EncodedDataInputStream(DataInput input)
-    {
-        this.input = input;
-    }
-
-    public int skipBytes(int n) throws IOException
-    {
-        return input.skipBytes(n);
-    }
-
-    public int read() throws IOException
-    {
-        return input.readByte() & 0xFF;
-    }
-
-    public void seek(long position)
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public long getPosition()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    public long getPositionLimit()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    protected long length()
-    {
-        throw new UnsupportedOperationException();
-    }
-
-    /* as all of the integer types could be decoded using VInt we can use single method vintEncode */
-
-    public int readInt() throws IOException
-    {
-        return (int) vintDecode();
-    }
-
-    public long readLong() throws IOException
-    {
-        return vintDecode();
-    }
-
-    public int readUnsignedShort() throws IOException
-    {
-        return (short) vintDecode();
-    }
-    
-    public short readShort() throws IOException
-    {
-        return (short) vintDecode();
-    }
-
-    private long vintDecode() throws IOException
-    {
-        byte firstByte = input.readByte();
-        int len = vintDecodeSize(firstByte);
-        if (len == 1)
-            return firstByte;
-        long i = 0;
-        for (int idx = 0; idx < len - 1; idx++)
-        {
-            byte b = input.readByte();
-            i = i << 8;
-            i = i | (b & 0xFF);
-        }
-        return (vintIsNegative(firstByte) ? (i ^ -1L) : i);
-    }
-
-    private int vintDecodeSize(byte value)
-    {
-        if (value >= -112)
-        {
-            return 1;
-        }
-        else if (value < -120)
-        {
-            return -119 - value;
-        }
-        return -111 - value;
-    }
-
-    private boolean vintIsNegative(byte value)
-    {
-        return value < -120 || (value >= -112 && value < 0);
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/vint/EncodedDataOutputStream.java b/src/java/org/apache/cassandra/utils/vint/EncodedDataOutputStream.java
deleted file mode 100644
index fe43ff2..0000000
--- a/src/java/org/apache/cassandra/utils/vint/EncodedDataOutputStream.java
+++ /dev/null

@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils.vint;
-
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.cassandra.io.util.UnbufferedDataOutputStreamPlus;
-
-/**
- * Borrows idea from
- * https://developers.google.com/protocol-buffers/docs/encoding#varints
- */
-public class EncodedDataOutputStream extends UnbufferedDataOutputStreamPlus
-{
-    private OutputStream out;
-
-    public EncodedDataOutputStream(OutputStream out)
-    {
-        this.out = out;
-    }
-
-    public void write(int b) throws IOException
-    {
-        out.write(b);
-    }
-
-    public void write(byte[] b) throws IOException
-    {
-        out.write(b);
-    }
-
-    public void write(byte[] b, int off, int len) throws IOException
-    {
-        out.write(b, off, len);
-    }
-
-    /* as all of the integer types could be encoded using VInt we can use single method vintEncode */
-
-    public void writeInt(int v) throws IOException
-    {
-        vintEncode(v);
-    }
-
-    public void writeLong(long v) throws IOException
-    {
-        vintEncode(v);
-    }
-
-    public void writeShort(int v) throws IOException
-    {
-        vintEncode(v);
-    }
-
-    private void vintEncode(long i) throws IOException
-    {
-        if (i >= -112 && i <= 127)
-        {
-            writeByte((byte) i);
-            return;
-        }
-        int len = -112;
-        if (i < 0)
-        {
-            i ^= -1L; // take one's complement'
-            len = -120;
-        }
-        long tmp = i;
-        while (tmp != 0)
-        {
-            tmp = tmp >> 8;
-            len--;
-        }
-        writeByte((byte) len);
-        len = (len < -120) ? -(len + 120) : -(len + 112);
-        for (int idx = len; idx != 0; idx--)
-        {
-            int shiftbits = (idx - 1) * 8;
-            long mask = 0xFFL << shiftbits;
-            writeByte((byte) ((i & mask) >> shiftbits));
-        }
-    }
-}

diff --git a/src/java/org/apache/cassandra/utils/vint/VIntCoding.java b/src/java/org/apache/cassandra/utils/vint/VIntCoding.java
new file mode 100644
index 0000000..27448e2
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/vint/VIntCoding.java

@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+package org.apache.cassandra.utils.vint;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import net.nicoulaj.compilecommand.annotations.Inline;
+
+/**
+ * Borrows idea from
+ * https://developers.google.com/protocol-buffers/docs/encoding#varints
+ */
+public class VIntCoding
+{
+
+    public static long readUnsignedVInt(DataInput input) throws IOException {
+        int firstByte = input.readByte();
+
+        //Bail out early if this is one byte, necessary or it fails later
+        if (firstByte >= 0)
+            return firstByte;
+
+        int size = numberOfExtraBytesToRead(firstByte);
+        long retval = firstByte & firstByteValueMask(size);
+        for (int ii = 0; ii < size; ii++)
+        {
+            byte b = input.readByte();
+            retval <<= 8;
+            retval |= b & 0xff;
+        }
+
+        return retval;
+    }
+
+    /**
+     * Note this method is the same as {@link #readUnsignedVInt(DataInput)},
+     * except that we do *not* block if there are not enough bytes in the buffer
+     * to reconstruct the value.
+     *
+     * WARNING: this method is only safe for vints we know to be representable by a positive long value.
+     *
+     * @return -1 if there are not enough bytes in the input to read the value; else, the vint unsigned value.
+     */
+    public static long getUnsignedVInt(ByteBuffer input, int readerIndex)
+    {
+        return getUnsignedVInt(input, readerIndex, input.limit());
+    }
+
+    public static long getUnsignedVInt(ByteBuffer input, int readerIndex, int readerLimit)
+    {
+        if (readerIndex >= readerLimit)
+            return -1;
+
+        int firstByte = input.get(readerIndex++);
+
+        //Bail out early if this is one byte, necessary or it fails later
+        if (firstByte >= 0)
+            return firstByte;
+
+        int size = numberOfExtraBytesToRead(firstByte);
+        if (readerIndex + size > readerLimit)
+            return -1;
+
+        long retval = firstByte & firstByteValueMask(size);
+        for (int ii = 0; ii < size; ii++)
+        {
+            byte b = input.get(readerIndex++);
+            retval <<= 8;
+            retval |= b & 0xff;
+        }
+
+        return retval;
+    }
+
+    public static long readVInt(DataInput input) throws IOException {
+        return decodeZigZag64(readUnsignedVInt(input));
+    }
+
+    // & this with the first byte to give the value part for a given extraBytesToRead encoded in the byte
+    public static int firstByteValueMask(int extraBytesToRead)
+    {
+        // by including the known 0bit in the mask, we can use this for encodeExtraBytesToRead
+        return 0xff >> extraBytesToRead;
+    }
+
+    public static int encodeExtraBytesToRead(int extraBytesToRead)
+    {
+        // because we have an extra bit in the value mask, we just need to invert it
+        return ~firstByteValueMask(extraBytesToRead);
+    }
+
+    public static int numberOfExtraBytesToRead(int firstByte)
+    {
+        // we count number of set upper bits; so if we simply invert all of the bits, we're golden
+        // this is aided by the fact that we only work with negative numbers, so when upcast to an int all
+        // of the new upper bits are also set, so by inverting we set all of them to zero
+        return Integer.numberOfLeadingZeros(~firstByte) - 24;
+    }
+
+    protected static final ThreadLocal<byte[]> encodingBuffer = new ThreadLocal<byte[]>()
+    {
+        @Override
+        public byte[] initialValue()
+        {
+            return new byte[9];
+        }
+    };
+
+    public static void writeUnsignedVInt(long value, DataOutput output) throws IOException {
+        int size = VIntCoding.computeUnsignedVIntSize(value);
+        if (size == 1)
+        {
+            output.write((int)value);
+            return;
+        }
+
+        output.write(VIntCoding.encodeVInt(value, size), 0, size);
+    }
+
+    @Inline
+    public static byte[] encodeVInt(long value, int size) {
+        byte encodingSpace[] = encodingBuffer.get();
+        int extraBytes = size - 1;
+
+        for (int i = extraBytes ; i >= 0; --i)
+        {
+            encodingSpace[i] = (byte) value;
+            value >>= 8;
+        }
+        encodingSpace[0] |= VIntCoding.encodeExtraBytesToRead(extraBytes);
+        return encodingSpace;
+    }
+
+    public static void writeVInt(long value, DataOutput output) throws IOException {
+        writeUnsignedVInt(encodeZigZag64(value), output);
+    }
+
+    /**
+     * Decode a ZigZag-encoded 64-bit value.  ZigZag encodes signed integers
+     * into values that can be efficiently encoded with varint.  (Otherwise,
+     * negative values must be sign-extended to 64 bits to be varint encoded,
+     * thus always taking 10 bytes on the wire.)
+     *
+     * @param n An unsigned 64-bit integer, stored in a signed int because
+     *          Java has no explicit unsigned support.
+     * @return A signed 64-bit integer.
+     */
+    public static long decodeZigZag64(final long n) {
+        return (n >>> 1) ^ -(n & 1);
+    }
+
+    /**
+     * Encode a ZigZag-encoded 64-bit value.  ZigZag encodes signed integers
+     * into values that can be efficiently encoded with varint.  (Otherwise,
+     * negative values must be sign-extended to 64 bits to be varint encoded,
+     * thus always taking 10 bytes on the wire.)
+     *
+     * @param n A signed 64-bit integer.
+     * @return An unsigned 64-bit integer, stored in a signed int because
+     *         Java has no explicit unsigned support.
+     */
+    public static long encodeZigZag64(final long n) {
+        // Note:  the right-shift must be arithmetic
+        return (n << 1) ^ (n >> 63);
+    }
+
+    /** Compute the number of bytes that would be needed to encode a varint. */
+    public static int computeVIntSize(final long param) {
+        return computeUnsignedVIntSize(encodeZigZag64(param));
+    }
+
+    /** Compute the number of bytes that would be needed to encode an unsigned varint. */
+    public static int computeUnsignedVIntSize(final long value) {
+        int magnitude = Long.numberOfLeadingZeros(value | 1); // | with 1 to ensure magntiude <= 63, so (63 - 1) / 7 <= 8
+        return 9 - ((magnitude - 1) / 7);
+    }
+}

diff --git a/src/resources/org/apache/cassandra/cql3/functions/JavaSourceUDF.txt b/src/resources/org/apache/cassandra/cql3/functions/JavaSourceUDF.txt
index f57b01e..4bd3601 100644
--- a/src/resources/org/apache/cassandra/cql3/functions/JavaSourceUDF.txt
+++ b/src/resources/org/apache/cassandra/cql3/functions/JavaSourceUDF.txt

@@ -1,42 +1,28 @@
-package org.apache.cassandra.cql3.udf.gen;
+package #package_name#;
 
 import java.nio.ByteBuffer;
 import java.util.List;
-import com.datastax.driver.core.DataType;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.cql3.functions.JavaSourceUDFFactory;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.exceptions.FunctionExecutionException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
 
-public final class #class_name# extends org.apache.cassandra.cql3.functions.UDFunction
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+import com.datastax.driver.core.TypeCodec;
+
+public final class #class_name# extends JavaUDF
 {
-    public #class_name#(FunctionName name, List<ColumnIdentifier> argNames, List<AbstractType<?>> argTypes,
-                        DataType[] argDataTypes, AbstractType<?> returnType, DataType returnDataType, boolean calledOnNullInput, String body)
+    public #class_name#(TypeCodec<Object> returnCodec, TypeCodec<Object>[] argCodecs)
     {
-        super(name, argNames, argTypes, argDataTypes, returnType, returnDataType, calledOnNullInput, "java", body);
+        super(returnCodec, argCodecs);
     }
 
-    protected ByteBuffer executeUserDefined(int protocolVersion, List<ByteBuffer> params) throws InvalidRequestException
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
     {
-        try
-        {
-            #return_type# result = executeInternal(
+        #return_type# result = #execute_internal_name#(
 #arguments#
-            );
-            return decompose(protocolVersion, result);
-        }
-        catch (Throwable t)
-        {
-            logger.debug("Invocation of function '{}' failed", this, t);
-            if (t instanceof VirtualMachineError)
-                throw (VirtualMachineError)t;
-            throw FunctionExecutionException.create(this, t);
-        }
+        );
+        return super.decompose(protocolVersion, result);
     }
 
-    private #return_type# executeInternal(#argument_list#)
+    private #return_type# #execute_internal_name#(#argument_list#)
     {
 #body#
     }

diff --git a/test/burn/org/apache/cassandra/io/sstable/LongSegmentedFileBoundaryTest.java b/test/burn/org/apache/cassandra/io/sstable/LongSegmentedFileBoundaryTest.java
deleted file mode 100644
index 4913b32..0000000
--- a/test/burn/org/apache/cassandra/io/sstable/LongSegmentedFileBoundaryTest.java
+++ /dev/null

@@ -1,324 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.io.sstable;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.Random;
-
-import com.google.common.io.Files;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import junit.framework.Assert;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Config;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.dht.ByteOrderedPartitioner;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputStreamPlus;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.io.util.MmappedSegmentedFile;
-import org.apache.cassandra.io.util.MmappedSegmentedFile.Builder.Boundaries;
-import org.apache.cassandra.io.util.WrappedDataOutputStreamPlus;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class LongSegmentedFileBoundaryTest
-{
-    @BeforeClass
-    public static void setup() throws Exception
-    {
-        SchemaLoader.cleanupAndLeaveDirs();
-        Keyspace.setInitialized();
-        StorageService.instance.initServer();
-    }
-
-    @AfterClass
-    public static void tearDown()
-    {
-        Config.setClientMode(false);
-    }
-
-    @Test
-    public void testRandomBoundaries()
-    {
-        long[] candidates = new long[1 + (1 << 16)];
-        int[] indexesToCheck = new int[1 << 8];
-        Random random = new Random();
-
-        for (int run = 0; run < 100; run++)
-        {
-
-            long seed = random.nextLong();
-            random.setSeed(seed);
-            System.out.println("Seed: " + seed);
-
-            // at least 1Ki, and as many as 256Ki, boundaries
-            int candidateCount = (1 + random.nextInt(candidates.length >> 10)) << 10;
-            generateBoundaries(random, candidateCount, candidates, indexesToCheck);
-
-            Boundaries builder = new Boundaries();
-            int nextIndexToCheck = indexesToCheck[0];
-            int checkCount = 0;
-            System.out.printf("[0..%d)", candidateCount);
-            for (int i = 1; i < candidateCount - 1; i++)
-            {
-                if (i == nextIndexToCheck)
-                {
-                    if (checkCount % 20 == 0)
-                        System.out.printf(" %d", i);
-                    // grow number of samples logarithmically; work will still increase superlinearly, as size of dataset grows linearly
-                    int sampleCount = 1 << (31 - Integer.numberOfLeadingZeros(++checkCount));
-                    checkBoundarySample(random, candidates, i, sampleCount, builder);
-                    // select out next index to check (there may be dups, so skip them)
-                    while ((nextIndexToCheck = checkCount == indexesToCheck.length ? candidateCount : indexesToCheck[checkCount]) == i)
-                        checkCount++;
-                }
-
-                builder.addCandidate(candidates[i]);
-            }
-            System.out.println();
-            checkBoundaries(candidates, candidateCount - 1, builder, candidates[candidateCount - 1]);
-            Assert.assertEquals(candidateCount, nextIndexToCheck);
-        }
-    }
-
-    private static void generateBoundaries(Random random, int candidateCount, long[] candidates, int[] indexesToCheck)
-    {
-        // average averageBoundarySize is 4MiB, max 4GiB, min 4KiB
-        long averageBoundarySize = (4L << 10) * random.nextInt(1 << 20);
-        long prev = 0;
-        for (int i = 1 ; i < candidateCount ; i++)
-            candidates[i] = prev += Math.max(1, averageBoundarySize + (random.nextGaussian() * averageBoundarySize));
-
-        // generate indexes we will corroborate our behaviour on
-        for (int i = 0 ; i < indexesToCheck.length ; i++)
-            indexesToCheck[i] = 1 + random.nextInt(candidateCount - 2);
-        Arrays.sort(indexesToCheck);
-    }
-
-    private static void checkBoundarySample(Random random, long[] candidates, int candidateCount, int sampleCount, Boundaries builder)
-    {
-        for (int i = 0 ; i < sampleCount ; i++)
-        {
-            // pick a number exponentially less likely to be near the beginning, since we test that area earlier
-            int position = 0 ;
-            while (position <= 0)
-                position = candidateCount / (Integer.lowestOneBit(random.nextInt()));
-            long upperBound = candidates[position];
-            long lowerBound = random.nextBoolean() ? (rand(random, 0, upperBound) / (Integer.lowestOneBit(random.nextInt())))
-                                                   : candidates[Math.max(0, position - random.nextInt(64))];
-            long length = rand(random, lowerBound, upperBound);
-            checkBoundaries(candidates, candidateCount, builder, length);
-        }
-        checkBoundaries(candidates, candidateCount, builder, candidates[candidateCount]);
-    }
-
-    private static long rand(Random random, long lowerBound, long upperBound)
-    {
-        if (upperBound == lowerBound)
-            return upperBound;
-        return lowerBound + ((random.nextLong() & Long.MAX_VALUE) % (upperBound - lowerBound));
-    }
-
-    private static void checkBoundaries(long[] candidates, int candidateCount, Boundaries builder, long length)
-    {
-        if (length == 0)
-            return;
-
-        long[] boundaries = new long[(int) (10 + 2 * (length / Integer.MAX_VALUE))];
-        int count = 1;
-        int prev = 0;
-        while (true)
-        {
-            int p = candidates[prev + 1] - boundaries[count - 1] >= Integer.MAX_VALUE
-                    ? prev + 1
-                    : Arrays.binarySearch(candidates, prev, candidateCount, boundaries[count - 1] + Integer.MAX_VALUE);
-            if (p < 0) p = -2 -p;
-            if (p >= candidateCount - 1 || candidates[p] >= length)
-                break;
-            boundaries[count++] = candidates[p];
-            if (candidates[p + 1] >= length)
-                break;
-            prev = p;
-        }
-        if (candidates[candidateCount - 1] < length && length - boundaries[count - 1] >= Integer.MAX_VALUE)
-            boundaries[count++] = candidates[candidateCount - 1];
-        boundaries[count++] = length;
-        final long[] canon = Arrays.copyOf(boundaries, count);
-        final long[] check = builder.finish(length, false);
-        if (!Arrays.equals(canon, check))
-            Assert.assertTrue("\n" + Arrays.toString(canon) + "\n" + Arrays.toString(check), Arrays.equals(canon, check));
-    }
-
-    @Test
-    public void testBoundariesAndRepairSmall() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1, 1 << 16);
-    }
-
-    @Test
-    public void testBoundariesAndRepairMedium() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1, 1 << 20);
-    }
-
-    @Test
-    public void testBoundariesAndRepairLarge() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1, 100 << 20);
-    }
-
-    @Test
-    public void testBoundariesAndRepairHuge() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1, Integer.MAX_VALUE - 1024);
-    }
-
-    @Test
-    public void testBoundariesAndRepairTooHuge() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1, Integer.MAX_VALUE);
-    }
-
-    @Test
-    public void testBoundariesAndRepairHugeIndex() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1 << 7, 1 << 15);
-    }
-
-    @Test
-    public void testBoundariesAndRepairReallyHugeIndex() throws InvalidRequestException, IOException
-    {
-        testBoundariesAndRepair(1 << 14, 1 << 15);
-    }
-
-    private void testBoundariesAndRepair(int rows, int rowSize) throws InvalidRequestException, IOException
-    {
-        String KS = "cql_keyspace";
-        String TABLE = "table1";
-
-        File tempdir = Files.createTempDir();
-        try
-        {
-            Assert.assertTrue(DatabaseDescriptor.getColumnIndexSize() < rowSize);
-            Assert.assertTrue(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap);
-            Assert.assertTrue(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap);
-            Assert.assertTrue(StorageService.getPartitioner() instanceof ByteOrderedPartitioner);
-            File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KS + File.separator + TABLE);
-            Assert.assertTrue(dataDir.mkdirs());
-
-            String schema = "CREATE TABLE cql_keyspace.table" + (rows > 1 ? "2" : "1") + " (k bigint, v1 blob, v2 blob, v3 blob, v4 blob, v5 blob, PRIMARY KEY (k" + (rows > 1 ? ", v1" : "") + ")) WITH compression = { 'sstable_compression':'' };";
-            String insert = "INSERT INTO cql_keyspace.table" + (rows > 1 ? "2" : "1") + " (k, v1, v2, v3, v4, v5) VALUES (?, ?, ?, ?, ?, ?)";
-
-            CQLSSTableWriter.Builder builder = CQLSSTableWriter.builder()
-                                                      .inDirectory(dataDir)
-                                                      .forTable(schema)
-                                                      .withPartitioner(StorageService.getPartitioner())
-                                                      .using(insert)
-                                                      .sorted();
-            CQLSSTableWriter writer = builder.build();
-
-            // write 8Gb of decorated keys
-            ByteBuffer[] value = new ByteBuffer[rows];
-            for (int row = 0 ; row < rows ; row++)
-            {
-                // if we're using clustering columns, the clustering key is replicated across every other column
-                value[row] = ByteBuffer.allocate(rowSize / (rows > 1 ? 8 : 5));
-                value[row].putInt(0, row);
-            }
-            long targetSize = 8L << 30;
-            long dk = 0;
-            long size = 0;
-            long dkSize = rowSize * rows;
-            while (size < targetSize)
-            {
-                for (int row = 0 ; row < rows ; row++)
-                    writer.addRow(dk, value[row], value[row], value[row], value[row], value[row]);
-                size += dkSize;
-                dk++;
-            }
-
-            Descriptor descriptor = writer.getCurrentDescriptor().asType(Descriptor.Type.FINAL);
-            writer.close();
-
-            // open (and close) the reader so that the summary file is created
-            SSTableReader reader = SSTableReader.open(descriptor);
-            reader.selfRef().release();
-
-            // then check the boundaries are reasonable, and corrupt them
-            checkThenCorruptBoundaries(descriptor, rows * rowSize < Integer.MAX_VALUE);
-
-            // then check that reopening corrects the corruption
-            reader = SSTableReader.open(descriptor);
-            reader.selfRef().release();
-            checkThenCorruptBoundaries(descriptor, rows * rowSize < Integer.MAX_VALUE);
-        }
-        finally
-        {
-            FileUtils.deleteRecursive(tempdir);
-        }
-    }
-
-    private static void checkThenCorruptBoundaries(Descriptor descriptor, boolean expectDataMmappable) throws IOException
-    {
-        File summaryFile = new File(descriptor.filenameFor(Component.SUMMARY));
-        DataInputStream iStream = new DataInputStream(new FileInputStream(summaryFile));
-        IndexSummary indexSummary = IndexSummary.serializer.deserialize(iStream, StorageService.getPartitioner(), true, CFMetaData.DEFAULT_MIN_INDEX_INTERVAL, CFMetaData.DEFAULT_MAX_INDEX_INTERVAL);
-        ByteBuffer first = ByteBufferUtil.readWithLength(iStream);
-        ByteBuffer last = ByteBufferUtil.readWithLength(iStream);
-        MmappedSegmentedFile.Builder ibuilder = new MmappedSegmentedFile.Builder();
-        MmappedSegmentedFile.Builder dbuilder = new MmappedSegmentedFile.Builder();
-        ibuilder.deserializeBounds(iStream);
-        dbuilder.deserializeBounds(iStream);
-        iStream.close();
-        // index file cannot generally be non-mmappable, as index entries cannot be larger than MAX_SEGMENT_SIZE (due to promotedSize being encoded as an int)
-        assertBoundaries(descriptor.filenameFor(Component.PRIMARY_INDEX), true, ibuilder.boundaries());
-        assertBoundaries(descriptor.filenameFor(Component.DATA), expectDataMmappable, dbuilder.boundaries());
-
-        DataOutputStreamPlus oStream = new WrappedDataOutputStreamPlus(new FileOutputStream(summaryFile));
-        IndexSummary.serializer.serialize(indexSummary, oStream, true);
-        ByteBufferUtil.writeWithLength(first, oStream);
-        ByteBufferUtil.writeWithLength(last, oStream);
-        oStream.writeInt(1);
-        oStream.writeLong(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)).length());
-        oStream.writeLong(new File(descriptor.filenameFor(Component.DATA)).length());
-        oStream.close();
-    }
-
-    private static void assertBoundaries(String path, boolean expectMmappable, long[] boundaries)
-    {
-        long length = new File(path).length();
-        long prev = boundaries[0];
-        for (int i = 1 ; i <= boundaries.length && prev < length ; i++)
-        {
-            long boundary = i == boundaries.length ? length : boundaries[i];
-            Assert.assertEquals(String.format("[%d, %d), %d of %d", boundary, prev, i, boundaries.length),
-                                expectMmappable, boundary - prev <= Integer.MAX_VALUE);
-            prev = boundary;
-        }
-    }
-
-}

diff --git a/test/burn/org/apache/cassandra/utils/LongBTreeTest.java b/test/burn/org/apache/cassandra/utils/LongBTreeTest.java
index 5b91a5e..c052015 100644
--- a/test/burn/org/apache/cassandra/utils/LongBTreeTest.java
+++ b/test/burn/org/apache/cassandra/utils/LongBTreeTest.java

@@ -18,16 +18,10 @@
  */
 package org.apache.cassandra.utils;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.NavigableMap;
-import java.util.NavigableSet;
-import java.util.Random;
-import java.util.TreeMap;
-import java.util.TreeSet;
+import java.lang.annotation.Annotation;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutionException;
@@ -36,36 +30,43 @@
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.Consumer;
 
 import com.google.common.base.Function;
-import com.google.common.base.Predicate;
-import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.ListenableFutureTask;
 import org.junit.Assert;
 import org.junit.Test;
 
-
 import com.codahale.metrics.MetricRegistry;
 import com.codahale.metrics.Snapshot;
 import com.codahale.metrics.Timer;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
-import org.apache.cassandra.utils.btree.BTree;
-import org.apache.cassandra.utils.btree.BTreeSearchIterator;
-import org.apache.cassandra.utils.btree.BTreeSet;
-import org.apache.cassandra.utils.btree.UpdateFunction;
+import org.apache.cassandra.utils.btree.*;
 
-// TODO : should probably lower fan-factor for tests to make them more intensive
+import static com.google.common.base.Predicates.notNull;
+import static com.google.common.collect.Iterables.filter;
+import static com.google.common.collect.Iterables.transform;
+import static java.util.Comparator.naturalOrder;
+import static java.util.Comparator.reverseOrder;
+import static org.apache.cassandra.utils.btree.BTree.iterable;
+import static org.junit.Assert.assertTrue;
+
 public class LongBTreeTest
 {
-    private static final int ITERATIONS = 10000;
 
+    private static final boolean DEBUG = false;
+    private static int perThreadTrees = 100;
+    private static int minTreeSize = 4;
+    private static int maxTreeSize = 10000;
+    private static int threads = DEBUG ? 1 : Runtime.getRuntime().availableProcessors() * 8;
     private static final MetricRegistry metrics = new MetricRegistry();
     private static final Timer BTREE_TIMER = metrics.timer(MetricRegistry.name(BTree.class, "BTREE"));
     private static final Timer TREE_TIMER = metrics.timer(MetricRegistry.name(BTree.class, "TREE"));
-    private static final ExecutorService MODIFY = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors(), new NamedThreadFactory("MODIFY"));
-    private static final ExecutorService COMPARE = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors(), new NamedThreadFactory("COMPARE"));
+    private static final ExecutorService MODIFY = Executors.newFixedThreadPool(threads, new NamedThreadFactory("MODIFY"));
+    private static final ExecutorService COMPARE = DEBUG ? MODIFY : Executors.newFixedThreadPool(threads, new NamedThreadFactory("COMPARE"));
     private static final RandomAbort<Integer> SPORADIC_ABORT = new RandomAbort<>(new Random(), 0.0001f);
 
     static
@@ -73,110 +74,607 @@
         System.setProperty("cassandra.btree.fanfactor", "4");
     }
 
-    @Test
-    public void testOversizedMiddleInsert()
-    {
-        TreeSet<Integer> canon = new TreeSet<>();
-        for (int i = 0 ; i < ITERATIONS ; i++)
-            canon.add(i);
-        Object[] btree = BTree.build(Arrays.asList(Integer.MIN_VALUE, Integer.MAX_VALUE), ICMP, true, null);
-        btree = BTree.update(btree, ICMP, canon, true);
-        canon.add(Integer.MIN_VALUE);
-        canon.add(Integer.MAX_VALUE);
-        Assert.assertTrue(BTree.isWellFormed(btree, ICMP));
-        testEqual("Oversize", BTree.<Integer>slice(btree, true), canon.iterator());
-    }
-
-    @Test
-    public void testIndividualInsertsSmallOverlappingRange() throws ExecutionException, InterruptedException
-    {
-        testInsertions(ITERATIONS, 50, 1, 1, true);
-    }
-
-    @Test
-    public void testBatchesSmallOverlappingRange() throws ExecutionException, InterruptedException
-    {
-        testInsertions(ITERATIONS, 50, 1, 5, true);
-    }
-
-    @Test
-    public void testIndividualInsertsMediumSparseRange() throws ExecutionException, InterruptedException
-    {
-        testInsertions(ITERATIONS, 500, 10, 1, true);
-    }
-
-    @Test
-    public void testBatchesMediumSparseRange() throws ExecutionException, InterruptedException
-    {
-        testInsertions(ITERATIONS, 500, 10, 10, true);
-    }
-
-    @Test
-    public void testLargeBatchesLargeRange() throws ExecutionException, InterruptedException
-    {
-        testInsertions(ITERATIONS * 10, 5000, 3, 100, true);
-    }
-
-    @Test
-    public void testSlicingSmallRandomTrees() throws ExecutionException, InterruptedException
-    {
-        testInsertions(10000, 50, 10, 10, false);
-    }
+    /************************** TEST ACCESS ********************************************/
 
     @Test
     public void testSearchIterator() throws InterruptedException
     {
+        final int perTreeSelections = 100;
+        testRandomSelection(perThreadTrees, perTreeSelections,
+        (test) -> {
+            IndexedSearchIterator<Integer, Integer> iter1 = test.testAsSet.iterator();
+            IndexedSearchIterator<Integer, Integer> iter2 = test.testAsList.iterator();
+            return (key) ->
+            {
+                Integer found1 = iter1.hasNext() ? iter1.next(key) : null;
+                Integer found2 = iter2.hasNext() ? iter2.next(key) : null;
+                Assert.assertSame(found1, found2);
+                if (found1 != null)
+                    Assert.assertEquals(iter1.indexOfCurrent(), iter2.indexOfCurrent());
+
+                int index = Collections.binarySearch(test.canonicalList, key, test.comparator);
+                if (index < 0)
+                {
+                    Assert.assertNull(found1);
+                }
+                else
+                {
+                    Assert.assertEquals(key, found1);
+                    Assert.assertEquals(index, iter1.indexOfCurrent());
+                }
+
+                // check that by advancing the same key again we get null, but only do it on one of the two iterators
+                // to ensure they both advance differently
+                if (ThreadLocalRandom.current().nextBoolean())
+                    Assert.assertNull(iter1.next(key));
+                else
+                    Assert.assertNull(iter2.next(key));
+            };
+        });
+    }
+
+    @Test
+    public void testInequalityLookups() throws InterruptedException
+    {
+        final int perTreeSelections = 2;
+        testRandomSelectionOfSet(perThreadTrees, perTreeSelections,
+                                 (test, canonical) -> {
+                                     if (!canonical.isEmpty() || !test.isEmpty())
+                                     {
+                                         Assert.assertEquals(canonical.isEmpty(), test.isEmpty());
+                                         Assert.assertEquals(canonical.first(), test.first());
+                                         Assert.assertEquals(canonical.last(), test.last());
+                                     }
+                                     return (key) ->
+                                     {
+                                         Assert.assertEquals(test.ceiling(key), canonical.ceiling(key));
+                                         Assert.assertEquals(test.higher(key), canonical.higher(key));
+                                         Assert.assertEquals(test.floor(key), canonical.floor(key));
+                                         Assert.assertEquals(test.lower(key), canonical.lower(key));
+                                     };
+                                 });
+    }
+
+    @Test
+    public void testListIndexes() throws InterruptedException
+    {
+        testRandomSelectionOfList(perThreadTrees, 4,
+                                  (test, canonical, cmp) ->
+                                  (key) ->
+                                  {
+                                      int javaIndex = Collections.binarySearch(canonical, key, cmp);
+                                      int btreeIndex = test.indexOf(key);
+                                      Assert.assertEquals(javaIndex, btreeIndex);
+                                      if (javaIndex >= 0)
+                                          Assert.assertEquals(canonical.get(javaIndex), test.get(btreeIndex));
+                                  }
+        );
+    }
+
+    @Test
+    public void testToArray() throws InterruptedException
+    {
+        testRandomSelection(perThreadTrees, 4,
+                            (selection) ->
+                            {
+                                Integer[] array = new Integer[selection.canonicalList.size() + 1];
+                                selection.testAsList.toArray(array, 1);
+                                Assert.assertEquals(null, array[0]);
+                                for (int j = 0; j < selection.canonicalList.size(); j++)
+                                    Assert.assertEquals(selection.canonicalList.get(j), array[j + 1]);
+                            });
+    }
+
+    private static final class CountingFunction implements Function<Integer, Integer>
+    {
+        final Function<Integer, Integer> wrapped;
+        int count = 0;
+        protected CountingFunction(Function<Integer, Integer> wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+        public Integer apply(Integer integer)
+        {
+            count++;
+            return wrapped.apply(integer);
+        }
+    }
+
+    @Test
+    public void testTransformAndFilter() throws InterruptedException
+    {
+        testRandomSelection(perThreadTrees, 4, false, false, false,
+                            (selection) ->
+                            {
+                                Map<Integer, Integer> update = new LinkedHashMap<>();
+                                for (Integer i : selection.testKeys)
+                                    update.put(i, new Integer(i));
+
+                                CountingFunction function;
+                                Object[] original = selection.testAsSet.tree();
+                                Object[] transformed;
+
+                                // test replacing none, leaving all present
+                                function = new CountingFunction((x) -> x);
+                                transformed = BTree.transformAndFilter(original, function);
+                                Assert.assertEquals(BTree.size(original), function.count);
+                                Assert.assertSame(original, transformed);
+
+                                // test replacing some, leaving all present
+                                function = new CountingFunction((x) -> update.containsKey(x) ? update.get(x) : x);
+                                transformed = BTree.transformAndFilter(original, function);
+                                Assert.assertEquals(BTree.size(original), function.count);
+                                assertSame(transform(selection.canonicalList, function.wrapped), iterable(transformed));
+
+                                // test replacing some, removing some
+                                function = new CountingFunction(update::get);
+                                transformed = BTree.transformAndFilter(original, function);
+                                Assert.assertEquals(BTree.size(original), function.count);
+                                assertSame(filter(transform(selection.canonicalList, function.wrapped), notNull()), iterable(transformed));
+
+                                // test replacing none, removing some
+                                function = new CountingFunction((x) -> update.containsKey(x) ? null : x);
+                                transformed = BTree.transformAndFilter(selection.testAsList.tree(), function);
+                                Assert.assertEquals(BTree.size(original), function.count);
+                                assertSame(filter(transform(selection.canonicalList, function.wrapped), notNull()), iterable(transformed));
+                            });
+    }
+
+    private static void assertSame(Iterable<Integer> i1, Iterable<Integer> i2)
+    {
+        assertSame(i1.iterator(), i2.iterator());
+    }
+
+    private static void assertSame(Iterator<Integer> i1, Iterator<Integer> i2)
+    {
+        while (i1.hasNext() && i2.hasNext())
+            Assert.assertSame(i1.next(), i2.next());
+        Assert.assertEquals(i1.hasNext(), i2.hasNext());
+    }
+
+    private void testRandomSelectionOfList(int perThreadTrees, int perTreeSelections, BTreeListTestFactory testRun) throws InterruptedException
+    {
+        testRandomSelection(perThreadTrees, perTreeSelections,
+                            (BTreeTestFactory) (selection) -> testRun.get(selection.testAsList, selection.canonicalList, selection.comparator));
+    }
+
+    private void testRandomSelectionOfSet(int perThreadTrees, int perTreeSelections, BTreeSetTestFactory testRun) throws InterruptedException
+    {
+        testRandomSelection(perThreadTrees, perTreeSelections,
+                            (BTreeTestFactory) (selection) -> testRun.get(selection.testAsSet, selection.canonicalSet));
+    }
+
+    static interface BTreeSetTestFactory
+    {
+        TestEachKey get(BTreeSet<Integer> test, NavigableSet<Integer> canonical);
+    }
+
+    static interface BTreeListTestFactory
+    {
+        TestEachKey get(BTreeSet<Integer> test, List<Integer> canonical, Comparator<Integer> comparator);
+    }
+
+    static interface BTreeTestFactory
+    {
+        TestEachKey get(RandomSelection test);
+    }
+
+    static interface TestEachKey
+    {
+        void testOne(Integer value);
+    }
+
+    private void testRandomSelection(int perThreadTrees, int perTreeSelections, BTreeTestFactory testRun) throws InterruptedException
+    {
+        testRandomSelection(perThreadTrees, perTreeSelections, (selection) -> {
+            TestEachKey testEachKey = testRun.get(selection);
+            for (Integer key : selection.testKeys)
+                testEachKey.testOne(key);
+        });
+    }
+
+    private void testRandomSelection(int perThreadTrees, int perTreeSelections, Consumer<RandomSelection> testRun) throws InterruptedException
+    {
+        testRandomSelection(perThreadTrees, perTreeSelections, true, true, true, testRun);
+    }
+
+    private void testRandomSelection(int perThreadTrees, int perTreeSelections, boolean narrow, boolean mixInNotPresentItems, boolean permitReversal, Consumer<RandomSelection> testRun) throws InterruptedException
+    {
         int threads = Runtime.getRuntime().availableProcessors();
         final CountDownLatch latch = new CountDownLatch(threads);
         final AtomicLong errors = new AtomicLong();
         final AtomicLong count = new AtomicLong();
-        final int perThreadTrees = 100;
-        final int perTreeSelections = 100;
         final long totalCount = threads * perThreadTrees * perTreeSelections;
         for (int t = 0 ; t < threads ; t++)
         {
-            MODIFY.execute(new Runnable()
+            Runnable runnable = new Runnable()
             {
                 public void run()
                 {
-                    ThreadLocalRandom random = ThreadLocalRandom.current();
-                    for (int i = 0 ; i < perThreadTrees ; i++)
+                    try
                     {
-                        Object[] tree = randomTree(10000, random);
-                        for (int j = 0 ; j < perTreeSelections ; j++)
+                        for (int i = 0 ; i < perThreadTrees ; i++)
                         {
-                            BTreeSearchIterator<Integer, Integer, Integer> searchIterator = new BTreeSearchIterator<>(tree, ICMP);
-                            for (Integer key : randomSelection(tree, random))
-                                if (key != searchIterator.next(key))
-                                    errors.incrementAndGet();
-                            searchIterator = new BTreeSearchIterator<Integer, Integer, Integer>(tree, ICMP);
-                            for (Integer key : randomMix(tree, random))
-                                if (key != searchIterator.next(key))
-                                    if (BTree.find(tree, ICMP, key) == key)
-                                        errors.incrementAndGet();
-                            count.incrementAndGet();
+                            RandomTree tree = randomTree(minTreeSize, maxTreeSize);
+                            for (int j = 0 ; j < perTreeSelections ; j++)
+                            {
+                                testRun.accept(tree.select(narrow, mixInNotPresentItems, permitReversal));
+                                count.incrementAndGet();
+                            }
                         }
                     }
+                    catch (Throwable t)
+                    {
+                        errors.incrementAndGet();
+                        t.printStackTrace();
+                    }
                     latch.countDown();
                 }
-            });
+            };
+            MODIFY.execute(runnable);
         }
         while (latch.getCount() > 0)
         {
-            latch.await(10L, TimeUnit.SECONDS);
-            System.out.println(String.format("%.0f%% complete %s", 100 * count.get() / (double) totalCount, errors.get() > 0 ? ("Errors: " + errors.get()) : ""));
-            assert errors.get() == 0;
+            for (int i = 0 ; i < 10L ; i++)
+            {
+                latch.await(1L, TimeUnit.SECONDS);
+                Assert.assertEquals(0, errors.get());
+            }
+            log("%.1f%% complete %s", 100 * count.get() / (double) totalCount, errors.get() > 0 ? ("Errors: " + errors.get()) : "");
         }
     }
 
-    private static void testInsertions(int totalCount, int perTestCount, int testKeyRatio, int modificationBatchSize, boolean quickEquality) throws ExecutionException, InterruptedException
+    private static class RandomSelection
+    {
+        final List<Integer> testKeys;
+        final NavigableSet<Integer> canonicalSet;
+        final List<Integer> canonicalList;
+        final BTreeSet<Integer> testAsSet;
+        final BTreeSet<Integer> testAsList;
+        final Comparator<Integer> comparator;
+
+        private RandomSelection(List<Integer> testKeys, NavigableSet<Integer> canonicalSet, BTreeSet<Integer> testAsSet,
+                                List<Integer> canonicalList, BTreeSet<Integer> testAsList, Comparator<Integer> comparator)
+        {
+            this.testKeys = testKeys;
+            this.canonicalList = canonicalList;
+            this.canonicalSet = canonicalSet;
+            this.testAsSet = testAsSet;
+            this.testAsList = testAsList;
+            this.comparator = comparator;
+        }
+    }
+
+    private static class RandomTree
+    {
+        final NavigableSet<Integer> canonical;
+        final BTreeSet<Integer> test;
+
+        private RandomTree(NavigableSet<Integer> canonical, BTreeSet<Integer> test)
+        {
+            this.canonical = canonical;
+            this.test = test;
+        }
+
+        RandomSelection select(boolean narrow, boolean mixInNotPresentItems, boolean permitReversal)
+        {
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+            NavigableSet<Integer> canonicalSet = this.canonical;
+            BTreeSet<Integer> testAsSet = this.test;
+            List<Integer> canonicalList = new ArrayList<>(canonicalSet);
+            BTreeSet<Integer> testAsList = this.test;
+
+            Assert.assertEquals(canonicalSet.size(), testAsSet.size());
+            Assert.assertEquals(canonicalList.size(), testAsList.size());
+
+            // sometimes select keys first, so we cover full range
+            List<Integer> allKeys = randomKeys(canonical, mixInNotPresentItems);
+            List<Integer> keys = allKeys;
+
+            int narrowCount = random.nextInt(3);
+            while (narrow && canonicalList.size() > 10 && keys.size() > 10 && narrowCount-- > 0)
+            {
+                boolean useLb = random.nextBoolean();
+                boolean useUb = random.nextBoolean();
+                if (!(useLb | useUb))
+                    continue;
+
+                // select a range smaller than the total span when we have more narrowing iterations left
+                int indexRange = keys.size() / (narrowCount + 1);
+
+                boolean lbInclusive = true;
+                Integer lbKey = canonicalList.get(0);
+                int lbKeyIndex = 0, lbIndex = 0;
+                boolean ubInclusive = true;
+                Integer ubKey = canonicalList.get(canonicalList.size() - 1);
+                int ubKeyIndex = keys.size(), ubIndex = canonicalList.size();
+
+                if (useLb)
+                {
+                    lbKeyIndex = random.nextInt(0, indexRange - 1);
+                    Integer candidate = keys.get(lbKeyIndex);
+                    if (useLb = (candidate > lbKey && candidate <= ubKey))
+                    {
+                        lbInclusive = random.nextBoolean();
+                        lbKey = keys.get(lbKeyIndex);
+                        lbIndex = Collections.binarySearch(canonicalList, lbKey);
+                        if (lbIndex >= 0 && !lbInclusive) lbIndex++;
+                        else if (lbIndex < 0) lbIndex = -1 -lbIndex;
+                    }
+                }
+                if (useUb)
+                {
+                    ubKeyIndex = random.nextInt(Math.max(lbKeyIndex, keys.size() - indexRange), keys.size() - 1);
+                    Integer candidate = keys.get(ubKeyIndex);
+                    if (useUb = (candidate < ubKey && candidate >= lbKey))
+                    {
+                        ubInclusive = random.nextBoolean();
+                        ubKey = keys.get(ubKeyIndex);
+                        ubIndex = Collections.binarySearch(canonicalList, ubKey);
+                        if (ubIndex >= 0 && ubInclusive) { ubIndex++; }
+                        else if (ubIndex < 0) ubIndex = -1 -ubIndex;
+                    }
+                }
+                if (ubIndex < lbIndex) { ubIndex = lbIndex; ubKey = lbKey; ubInclusive = false; }
+
+                canonicalSet = !useLb ? canonicalSet.headSet(ubKey, ubInclusive)
+                                      : !useUb ? canonicalSet.tailSet(lbKey, lbInclusive)
+                                               : canonicalSet.subSet(lbKey, lbInclusive, ubKey, ubInclusive);
+                testAsSet = !useLb ? testAsSet.headSet(ubKey, ubInclusive)
+                                   : !useUb ? testAsSet.tailSet(lbKey, lbInclusive)
+                                            : testAsSet.subSet(lbKey, lbInclusive, ubKey, ubInclusive);
+
+                keys = keys.subList(lbKeyIndex, ubKeyIndex);
+                canonicalList = canonicalList.subList(lbIndex, ubIndex);
+                testAsList = testAsList.subList(lbIndex, ubIndex);
+
+                Assert.assertEquals(canonicalSet.size(), testAsSet.size());
+                Assert.assertEquals(canonicalList.size(), testAsList.size());
+            }
+
+            // possibly restore full set of keys, to test case where we are provided existing keys that are out of bounds
+            if (keys != allKeys && random.nextBoolean())
+                keys = allKeys;
+
+            Comparator<Integer> comparator = naturalOrder();
+            if (permitReversal && random.nextBoolean())
+            {
+                if (allKeys != keys)
+                    keys = new ArrayList<>(keys);
+                if (canonicalSet != canonical)
+                    canonicalList = new ArrayList<>(canonicalList);
+                Collections.reverse(keys);
+                Collections.reverse(canonicalList);
+                testAsList = testAsList.descendingSet();
+
+                canonicalSet = canonicalSet.descendingSet();
+                testAsSet = testAsSet.descendingSet();
+                comparator = reverseOrder();
+            }
+
+            Assert.assertEquals(canonicalSet.size(), testAsSet.size());
+            Assert.assertEquals(canonicalList.size(), testAsList.size());
+            if (!canonicalSet.isEmpty())
+            {
+                Assert.assertEquals(canonicalSet.first(), canonicalList.get(0));
+                Assert.assertEquals(canonicalSet.last(), canonicalList.get(canonicalList.size() - 1));
+                Assert.assertEquals(canonicalSet.first(), testAsSet.first());
+                Assert.assertEquals(canonicalSet.last(), testAsSet.last());
+                Assert.assertEquals(canonicalSet.first(), testAsList.get(0));
+                Assert.assertEquals(canonicalSet.last(), testAsList.get(testAsList.size() - 1));
+            }
+
+            return new RandomSelection(keys, canonicalSet, testAsSet, canonicalList, testAsList, comparator);
+        }
+    }
+
+    private static RandomTree randomTree(int minSize, int maxSize)
+    {
+        // perform most of our tree constructions via update, as this is more efficient; since every run uses this
+        // we test builder disproportionately more often than if it had its own test anyway
+        return ThreadLocalRandom.current().nextFloat() < 0.95 ? randomTreeByUpdate(minSize, maxSize)
+                                                              : randomTreeByBuilder(minSize, maxSize);
+    }
+
+    private static RandomTree randomTreeByUpdate(int minSize, int maxSize)
+    {
+        assert minSize > 3;
+        TreeSet<Integer> canonical = new TreeSet<>();
+
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        int targetSize = random.nextInt(minSize, maxSize);
+        int maxModificationSize = random.nextInt(2, targetSize);
+        Object[] accmumulate = BTree.empty();
+        int curSize = 0;
+        while (curSize < targetSize)
+        {
+            int nextSize = maxModificationSize == 1 ? 1 : random.nextInt(1, maxModificationSize);
+            TreeSet<Integer> build = new TreeSet<>();
+            for (int i = 0 ; i < nextSize ; i++)
+            {
+                Integer next = random.nextInt();
+                build.add(next);
+                canonical.add(next);
+            }
+            accmumulate = BTree.update(accmumulate, naturalOrder(), build, UpdateFunction.<Integer>noOp());
+            curSize += nextSize;
+            maxModificationSize = Math.min(maxModificationSize, targetSize - curSize);
+        }
+        return new RandomTree(canonical, BTreeSet.<Integer>wrap(accmumulate, naturalOrder()));
+    }
+
+    private static RandomTree randomTreeByBuilder(int minSize, int maxSize)
+    {
+        assert minSize > 3;
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        BTree.Builder<Integer> builder = BTree.builder(naturalOrder());
+
+        int targetSize = random.nextInt(minSize, maxSize);
+        int maxModificationSize = (int) Math.sqrt(targetSize);
+
+        TreeSet<Integer> canonical = new TreeSet<>();
+
+        int curSize = 0;
+        TreeSet<Integer> ordered = new TreeSet<>();
+        List<Integer> shuffled = new ArrayList<>();
+        while (curSize < targetSize)
+        {
+            int nextSize = maxModificationSize <= 1 ? 1 : random.nextInt(1, maxModificationSize);
+
+            // leave a random selection of previous values
+            (random.nextBoolean() ? ordered.headSet(random.nextInt()) : ordered.tailSet(random.nextInt())).clear();
+            shuffled = new ArrayList<>(shuffled.subList(0, shuffled.size() < 2 ? 0 : random.nextInt(shuffled.size() / 2)));
+
+            for (int i = 0 ; i < nextSize ; i++)
+            {
+                Integer next = random.nextInt();
+                ordered.add(next);
+                shuffled.add(next);
+                canonical.add(next);
+            }
+
+            switch (random.nextInt(5))
+            {
+                case 0:
+                    builder.addAll(ordered);
+                    break;
+                case 1:
+                    builder.addAll(BTreeSet.of(ordered));
+                    break;
+                case 2:
+                    for (Integer i : ordered)
+                        builder.add(i);
+                case 3:
+                    builder.addAll(shuffled);
+                    break;
+                case 4:
+                    for (Integer i : shuffled)
+                        builder.add(i);
+            }
+
+            curSize += nextSize;
+            maxModificationSize = Math.min(maxModificationSize, targetSize - curSize);
+        }
+
+        BTreeSet<Integer> btree = BTreeSet.<Integer>wrap(builder.build(), naturalOrder());
+        Assert.assertEquals(canonical.size(), btree.size());
+        return new RandomTree(canonical, btree);
+    }
+
+    // select a random subset of the keys, with an optional random population of keys inbetween those that are present
+    // return a value with the search position
+    private static List<Integer> randomKeys(Iterable<Integer> canonical, boolean mixInNotPresentItems)
+    {
+        ThreadLocalRandom rnd = ThreadLocalRandom.current();
+        boolean useFake = mixInNotPresentItems && rnd.nextBoolean();
+        final float fakeRatio = rnd.nextFloat();
+        List<Integer> results = new ArrayList<>();
+        Long fakeLb = (long) Integer.MIN_VALUE, fakeUb = null;
+        Integer max = null;
+        for (Integer v : canonical)
+        {
+            if (    !useFake
+                ||  (fakeUb == null ? v - 1 : fakeUb) <= fakeLb + 1
+                ||  rnd.nextFloat() < fakeRatio)
+            {
+                // if we cannot safely construct a fake value, or our randomizer says not to, we emit the next real value
+                results.add(v);
+                fakeLb = v.longValue();
+                fakeUb = null;
+            }
+            else
+            {
+                // otherwise we emit a fake value in the range immediately proceeding the last real value, and not
+                // exceeding the real value that would have proceeded (ignoring any other suppressed real values since)
+                if (fakeUb == null)
+                    fakeUb = v.longValue() - 1;
+                long mid = (fakeLb + fakeUb) / 2;
+                assert mid < fakeUb;
+                results.add((int) mid);
+                fakeLb = mid;
+            }
+            max = v;
+        }
+        if (useFake && max != null && max < Integer.MAX_VALUE)
+            results.add(max + 1);
+        final float useChance = rnd.nextFloat();
+        return Lists.newArrayList(filter(results, (x) -> rnd.nextFloat() < useChance));
+    }
+
+    /************************** TEST MUTATION ********************************************/
+
+    @Test
+    public void testOversizedMiddleInsert()
+    {
+        TreeSet<Integer> canon = new TreeSet<>();
+        for (int i = 0 ; i < 10000000 ; i++)
+            canon.add(i);
+        Object[] btree = BTree.build(Arrays.asList(Integer.MIN_VALUE, Integer.MAX_VALUE), UpdateFunction.noOp());
+        btree = BTree.update(btree, naturalOrder(), canon, UpdateFunction.<Integer>noOp());
+        canon.add(Integer.MIN_VALUE);
+        canon.add(Integer.MAX_VALUE);
+        assertTrue(BTree.isWellFormed(btree, naturalOrder()));
+        testEqual("Oversize", BTree.iterator(btree), canon.iterator());
+    }
+
+    @Test
+    public void testIndividualInsertsSmallOverlappingRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(50, 1, 1, true);
+    }
+
+    @Test
+    public void testBatchesSmallOverlappingRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(50, 1, 5, true);
+    }
+
+    @Test
+    public void testIndividualInsertsMediumSparseRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(perThreadTrees / 10, 500, 10, 1, true);
+    }
+
+    @Test
+    public void testBatchesMediumSparseRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(500, 10, 10, true);
+    }
+
+    @Test
+    public void testLargeBatchesLargeRange() throws ExecutionException, InterruptedException
+    {
+        testInsertions(perThreadTrees / 10, Math.max(maxTreeSize, 5000), 3, 100, true);
+    }
+
+    @Test
+    public void testRandomRangeAndBatches() throws ExecutionException, InterruptedException
+    {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        int treeSize = random.nextInt(maxTreeSize / 10, maxTreeSize * 10);
+        for (int i = 0 ; i < perThreadTrees / 10 ; i++)
+            testInsertions(threads * 10, treeSize, random.nextInt(1, 100) / 10f, treeSize / 100, true);
+    }
+
+    @Test
+    public void testSlicingSmallRandomTrees() throws ExecutionException, InterruptedException
+    {
+        testInsertions(50, 10, 10, false);
+    }
+
+    private static void testInsertions(int perTestCount, float testKeyRatio, int modificationBatchSize, boolean quickEquality) throws ExecutionException, InterruptedException
+    {
+        int tests = perThreadTrees * threads;
+        testInsertions(tests, perTestCount, testKeyRatio, modificationBatchSize, quickEquality);
+    }
+
+    private static void testInsertions(int tests, int perTestCount, float testKeyRatio, int modificationBatchSize, boolean quickEquality) throws ExecutionException, InterruptedException
     {
         int batchesPerTest = perTestCount / modificationBatchSize;
-        int maximumRunLength = 100;
-        int testKeyRange = perTestCount * testKeyRatio;
-        int tests = totalCount / perTestCount;
-        System.out.println(String.format("Performing %d tests of %d operations, with %.2f max size/key-range ratio in batches of ~%d ops",
-                tests, perTestCount, 1 / (float) testKeyRatio, modificationBatchSize));
+        int testKeyRange = (int) (perTestCount * testKeyRatio);
+        long totalCount = (long) perTestCount * tests;
+        log("Performing %d tests of %d operations, with %.2f max size/key-range ratio in batches of ~%d ops",
+            tests, perTestCount, 1 / testKeyRatio, modificationBatchSize);
 
         // if we're not doing quick-equality, we can spam with garbage for all the checks we perform, so we'll split the work into smaller chunks
         int chunkSize = quickEquality ? tests : (int) (100000 / Math.pow(perTestCount, 2));
@@ -185,30 +683,33 @@
             final List<ListenableFutureTask<List<ListenableFuture<?>>>> outer = new ArrayList<>();
             for (int i = 0 ; i < chunkSize ; i++)
             {
-                outer.add(doOneTestInsertions(testKeyRange, maximumRunLength, modificationBatchSize, batchesPerTest, quickEquality));
+                int maxRunLength = modificationBatchSize == 1 ? 1 : ThreadLocalRandom.current().nextInt(1, modificationBatchSize);
+                outer.add(doOneTestInsertions(testKeyRange, maxRunLength, modificationBatchSize, batchesPerTest, quickEquality));
             }
 
             final List<ListenableFuture<?>> inner = new ArrayList<>();
-            int complete = 0;
-            int reportInterval = totalCount / 100;
-            int lastReportAt = 0;
+            long complete = 0;
+            int reportInterval = Math.max(1000, (int) (totalCount / 10000));
+            long lastReportAt = 0;
             for (ListenableFutureTask<List<ListenableFuture<?>>> f : outer)
             {
                 inner.addAll(f.get());
                 complete += perTestCount;
                 if (complete - lastReportAt >= reportInterval)
                 {
-                    System.out.println(String.format("Completed %d of %d operations", (chunk * perTestCount) + complete, totalCount));
+                    long done = (chunk * perTestCount) + complete;
+                    float ratio = done / (float) totalCount;
+                    log("Completed %.1f%% (%d of %d operations)", ratio * 100, done, totalCount);
                     lastReportAt = complete;
                 }
             }
             Futures.allAsList(inner).get();
         }
         Snapshot snap = BTREE_TIMER.getSnapshot();
-        System.out.println(String.format("btree   : %.2fns, %.2fns, %.2fns", snap.getMedian(), snap.get95thPercentile(), snap.get999thPercentile()));
+        log("btree: %.2fns, %.2fns, %.2fns", snap.getMedian(), snap.get95thPercentile(), snap.get999thPercentile());
         snap = TREE_TIMER.getSnapshot();
-        System.out.println(String.format("snaptree: %.2fns, %.2fns, %.2fns", snap.getMedian(), snap.get95thPercentile(), snap.get999thPercentile()));
-        System.out.println("Done");
+        log("java: %.2fns, %.2fns, %.2fns", snap.getMedian(), snap.get95thPercentile(), snap.get999thPercentile());
+        log("Done");
     }
 
     private static ListenableFutureTask<List<ListenableFuture<?>>> doOneTestInsertions(final int upperBound, final int maxRunLength, final int averageModsPerIteration, final int iterations, final boolean quickEquality)
@@ -222,11 +723,11 @@
                 NavigableMap<Integer, Integer> canon = new TreeMap<>();
                 Object[] btree = BTree.empty();
                 final TreeMap<Integer, Integer> buffer = new TreeMap<>();
-                final Random rnd = new Random();
+                ThreadLocalRandom rnd = ThreadLocalRandom.current();
                 for (int i = 0 ; i < iterations ; i++)
                 {
                     buffer.clear();
-                    int mods = (averageModsPerIteration >> 1) + 1 + rnd.nextInt(averageModsPerIteration);
+                    int mods = rnd.nextInt(1, averageModsPerIteration * 2);
                     while (mods > 0)
                     {
                         int v = rnd.nextInt(upperBound);
@@ -246,24 +747,27 @@
                     ctxt = BTREE_TIMER.time();
                     Object[] next = null;
                     while (next == null)
-                        next = BTree.update(btree, ICMP, buffer.keySet(), true, SPORADIC_ABORT);
+                        next = BTree.update(btree, naturalOrder(), buffer.keySet(), SPORADIC_ABORT);
                     btree = next;
                     ctxt.stop();
 
-                    if (!BTree.isWellFormed(btree, ICMP))
+                    if (!BTree.isWellFormed(btree, naturalOrder()))
                     {
-                        System.out.println("ERROR: Not well formed");
+                        log("ERROR: Not well formed");
                         throw new AssertionError("Not well formed!");
                     }
                     if (quickEquality)
-                        testEqual("", BTree.<Integer>slice(btree, true), canon.keySet().iterator());
+                        testEqual("", BTree.iterator(btree), canon.keySet().iterator());
                     else
                         r.addAll(testAllSlices("RND", btree, new TreeSet<>(canon.keySet())));
                 }
                 return r;
             }
         });
-        MODIFY.execute(f);
+        if (DEBUG)
+            f.run();
+        else
+            MODIFY.execute(f);
         return f;
     }
 
@@ -276,30 +780,21 @@
         for (int i = 0 ; i < 128 ; i++)
         {
             String id = String.format("[0..%d)", canon.size());
-            System.out.println("Testing " + id);
+            log("Testing " + id);
             Futures.allAsList(testAllSlices(id, cur, canon)).get();
             Object[] next = null;
             while (next == null)
-                next = BTree.update(cur, ICMP, Arrays.asList(i), true, SPORADIC_ABORT);
+                next = BTree.update(cur, naturalOrder(), Arrays.asList(i), SPORADIC_ABORT);
             cur = next;
             canon.add(i);
         }
     }
 
-    static final Comparator<Integer> ICMP = new Comparator<Integer>()
-    {
-        @Override
-        public int compare(Integer o1, Integer o2)
-        {
-            return Integer.compare(o1, o2);
-        }
-    };
-
     private static List<ListenableFuture<?>> testAllSlices(String id, Object[] btree, NavigableSet<Integer> canon)
     {
         List<ListenableFuture<?>> waitFor = new ArrayList<>();
-        testAllSlices(id + " ASC", new BTreeSet<>(btree, ICMP), canon, true, waitFor);
-        testAllSlices(id + " DSC", new BTreeSet<>(btree, ICMP).descendingSet(), canon.descendingSet(), false, waitFor);
+        testAllSlices(id + " ASC", new BTreeSet<>(btree, naturalOrder()), canon, true, waitFor);
+        testAllSlices(id + " DSC", new BTreeSet<Integer>(btree, naturalOrder()).descendingSet(), canon.descendingSet(), false, waitFor);
         return waitFor;
     }
 
@@ -309,10 +804,10 @@
         for (Integer lb : range(canon.size(), Integer.MIN_VALUE, ascending))
         {
             // test head/tail sets
-            testOneSlice(String.format("%s->[%d..)", id, lb), btree.headSet(lb, true), canon.headSet(lb, true), results);
-            testOneSlice(String.format("%s->(%d..)", id, lb), btree.headSet(lb, false), canon.headSet(lb, false), results);
-            testOneSlice(String.format("%s->(..%d]", id, lb), btree.tailSet(lb, true), canon.tailSet(lb, true), results);
-            testOneSlice(String.format("%s->(..%d]", id, lb), btree.tailSet(lb, false), canon.tailSet(lb, false), results);
+            testOneSlice(String.format("%s->[..%d)", id, lb), btree.headSet(lb, true), canon.headSet(lb, true), results);
+            testOneSlice(String.format("%s->(..%d)", id, lb), btree.headSet(lb, false), canon.headSet(lb, false), results);
+            testOneSlice(String.format("%s->(%d..]", id, lb), btree.tailSet(lb, true), canon.tailSet(lb, true), results);
+            testOneSlice(String.format("%s->(%d..]", id, lb), btree.tailSet(lb, false), canon.tailSet(lb, false), results);
             for (Integer ub : range(canon.size(), lb, ascending))
             {
                 // test subsets
@@ -340,14 +835,17 @@
             }
         }, null);
         results.add(f);
-        COMPARE.execute(f);
+        if (DEBUG)
+            f.run();
+        else
+            COMPARE.execute(f);
     }
 
     private static void test(String id, int test, int expect)
     {
         if (test != expect)
         {
-            System.out.println(String.format("%s: Expected %d, Got %d", id, expect, test));
+            log("%s: Expected %d, Got %d", id, expect, test);
         }
     }
 
@@ -358,20 +856,20 @@
         {
             Object i = btree.next();
             Object j = canon.next();
-            if (!i.equals(j))
+            if (!Objects.equals(i, j))
             {
-                System.out.println(String.format("%s: Expected %d, Got %d", id, j, i));
+                log("%s: Expected %d, Got %d", id, j, i);
                 equal = false;
             }
         }
         while (btree.hasNext())
         {
-            System.out.println(String.format("%s: Expected <Nil>, Got %d", id, btree.next()));
+            log("%s: Expected <Nil>, Got %d", id, btree.next());
             equal = false;
         }
         while (canon.hasNext())
         {
-            System.out.println(String.format("%s: Expected %d, Got Nil", id, canon.next()));
+            log("%s: Expected %d, Got Nil", id, canon.next());
             equal = false;
         }
         if (!equal)
@@ -429,48 +927,7 @@
         };
     }
 
-    private static Object[] randomTree(int maxSize, Random random)
-    {
-        TreeSet<Integer> build = new TreeSet<>();
-        int size = random.nextInt(maxSize);
-        for (int i = 0 ; i < size ; i++)
-        {
-            build.add(random.nextInt());
-        }
-        return BTree.build(build, ICMP, true, UpdateFunction.NoOp.<Integer>instance());
-    }
-
-    private static Iterable<Integer> randomSelection(Object[] iter, final Random rnd)
-    {
-        final float proportion = rnd.nextFloat();
-        return Iterables.filter(new BTreeSet<>(iter, ICMP), new Predicate<Integer>()
-        {
-            public boolean apply(Integer integer)
-            {
-                return rnd.nextFloat() < proportion;
-            }
-        });
-    }
-
-    private static Iterable<Integer> randomMix(Object[] iter, final Random rnd)
-    {
-        final float proportion = rnd.nextFloat();
-        return Iterables.transform(new BTreeSet<>(iter, ICMP), new Function<Integer, Integer>()
-        {
-            long last = Integer.MIN_VALUE;
-
-            public Integer apply(Integer v)
-            {
-                long last = this.last;
-                this.last = v;
-                if (rnd.nextFloat() < proportion)
-                    return v;
-                return (int)((v - last) / 2);
-            }
-        });
-    }
-
-    private static final class RandomAbort<V> implements UpdateFunction<V>
+    private static final class RandomAbort<V> implements UpdateFunction<V, V>
     {
         final Random rnd;
         final float chance;
@@ -492,7 +949,6 @@
 
         public void allocated(long heapSize)
         {
-
         }
 
         public V apply(V v)
@@ -500,4 +956,58 @@
             return v;
         }
     }
+
+    public static void main(String[] args) throws ExecutionException, InterruptedException, InvocationTargetException, IllegalAccessException
+    {
+        for (String arg : args)
+        {
+            if (arg.startsWith("fan="))
+                System.setProperty("cassandra.btree.fanfactor", arg.substring(4));
+            else if (arg.startsWith("min="))
+                minTreeSize = Integer.parseInt(arg.substring(4));
+            else if (arg.startsWith("max="))
+                maxTreeSize = Integer.parseInt(arg.substring(4));
+            else if (arg.startsWith("count="))
+                perThreadTrees = Integer.parseInt(arg.substring(6));
+            else
+                exit();
+        }
+
+        List<Method> methods = new ArrayList<>();
+        for (Method m : LongBTreeTest.class.getDeclaredMethods())
+        {
+            if (m.getParameters().length > 0)
+                continue;
+            for (Annotation annotation : m.getAnnotations())
+                if (annotation.annotationType() == Test.class)
+                    methods.add(m);
+        }
+
+        LongBTreeTest test = new LongBTreeTest();
+        Collections.sort(methods, (a, b) -> a.getName().compareTo(b.getName()));
+        log(Lists.transform(methods, (m) -> m.getName()).toString());
+        for (Method m : methods)
+        {
+            log(m.getName());
+            m.invoke(test);
+        }
+        log("success");
+    }
+
+    private static void exit()
+    {
+        log("usage: fan=<int> min=<int> max=<int> count=<int>");
+        log("fan:   btree fanout");
+        log("min:   minimum btree size (must be >= 4)");
+        log("max:   maximum btree size (must be >= 4)");
+        log("count: number of trees to assign each core, for each test");
+    }
+
+    private static void log(String formatstr, Object ... args)
+    {
+        args = Arrays.copyOf(args, args.length + 1);
+        System.arraycopy(args, 0, args, 1, args.length - 1);
+        args[0] = System.currentTimeMillis();
+        System.out.printf("%tT: " + formatstr + "\n", args);
+    }
 }

diff --git a/test/burn/org/apache/cassandra/utils/memory/LongBufferPoolTest.java b/test/burn/org/apache/cassandra/utils/memory/LongBufferPoolTest.java
new file mode 100644
index 0000000..66abe5a
--- /dev/null
+++ b/test/burn/org/apache/cassandra/utils/memory/LongBufferPoolTest.java

@@ -0,0 +1,596 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.nio.ByteBuffer;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import com.google.common.util.concurrent.Uninterruptibles;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.utils.DynamicList;
+
+import static org.junit.Assert.*;
+
+/**
+ * Long BufferPool test - make sure that the BufferPool allocates and recycles
+ * ByteBuffers under heavy concurrent usage.
+ *
+ * The test creates two groups of threads
+ *
+ * - the burn producer/consumer pair that allocates 1/10 poolSize and then returns
+ *   all the memory to the pool. 50% is freed by the producer, 50% passed to the consumer thread.
+ *
+ * - a ring of worker threads that allocate buffers and either immediately free them,
+ *   or pass to the next worker thread for it to be freed on it's behalf.  Periodically
+ *   all memory is freed by the thread.
+ *
+ * While the burn/worker threads run, the original main thread checks that all of the threads are still
+ * making progress every 10s (no locking issues, or exits from assertion failures),
+ * and that every chunk has been freed at least once during the previous cycle (if that was possible).
+ *
+ * The test does not expect to survive out-of-memory errors, so needs sufficient heap memory
+ * for non-direct buffers and the debug tracking objects that check the allocate buffers.
+ * (The timing is very interesting when Xmx is lowered to increase garbage collection pauses, but do
+ * not set it too low).
+ */
+public class LongBufferPoolTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(LongBufferPoolTest.class);
+
+    private static final int AVG_BUFFER_SIZE = 16 << 10;
+    private static final int STDEV_BUFFER_SIZE = 10 << 10; // picked to ensure exceeding buffer size is rare, but occurs
+    private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
+
+    @Test
+    public void testAllocate() throws InterruptedException, ExecutionException
+    {
+        testAllocate(Runtime.getRuntime().availableProcessors() * 2, TimeUnit.MINUTES.toNanos(2L), 16 << 20);
+    }
+
+    private static final class BufferCheck
+    {
+        final ByteBuffer buffer;
+        final long val;
+        DynamicList.Node<BufferCheck> listnode;
+
+        private BufferCheck(ByteBuffer buffer, long val)
+        {
+            this.buffer = buffer;
+            this.val = val;
+        }
+
+        void validate()
+        {
+            ByteBuffer read = buffer.duplicate();
+            while (read.remaining() > 8)
+                assert read.getLong() == val;
+        }
+
+        void init()
+        {
+            ByteBuffer write = buffer.duplicate();
+            while (write.remaining() > 8)
+                write.putLong(val);
+        }
+    }
+
+    private static final class TestEnvironment
+    {
+        final int threadCount;
+        final long duration;
+        final int poolSize;
+        final long until;
+        final CountDownLatch latch;
+        final SPSCQueue<BufferCheck>[] sharedRecycle;
+        final AtomicBoolean[] makingProgress;
+        final AtomicBoolean burnFreed;
+        final AtomicBoolean[] freedAllMemory;
+        final ExecutorService executorService;
+        final List<Future<Boolean>> threadResultFuture;
+        final int targetSizeQuanta;
+
+        TestEnvironment(int threadCount, long duration, int poolSize)
+        {
+            this.threadCount = threadCount;
+            this.duration = duration;
+            this.poolSize = poolSize;
+            until = System.nanoTime() + duration;
+            latch = new CountDownLatch(threadCount);
+            sharedRecycle = new SPSCQueue[threadCount];
+            makingProgress = new AtomicBoolean[threadCount];
+            burnFreed = new AtomicBoolean(false);
+            freedAllMemory = new AtomicBoolean[threadCount];
+            executorService = Executors.newFixedThreadPool(threadCount + 2);
+            threadResultFuture = new ArrayList<>(threadCount);
+
+            for (int i = 0; i < sharedRecycle.length; i++)
+            {
+                sharedRecycle[i] = new SPSCQueue<>();
+                makingProgress[i] = new AtomicBoolean(false);
+                freedAllMemory[i] = new AtomicBoolean(false);
+            }
+
+            // Divide the poolSize across our threads, deliberately over-subscribing it.  Threads
+            // allocate a different amount of memory each - 1*quanta, 2*quanta, ... N*quanta.
+            // Thread0 is always going to be a single CHUNK, then to allocate increasing amounts
+            // using their own algorithm the targetSize should be poolSize / targetSizeQuanta.
+            //
+            // This should divide double the poolSize across the working threads,
+            // plus CHUNK_SIZE for thread0 and 1/10 poolSize for the burn producer/consumer pair.
+            targetSizeQuanta = 2 * poolSize / sum1toN(threadCount - 1);
+        }
+
+        void addCheckedFuture(Future<Boolean> future)
+        {
+            threadResultFuture.add(future);
+        }
+
+        int countStalledThreads()
+        {
+            int stalledThreads = 0;
+
+            for (AtomicBoolean progress : makingProgress)
+            {
+                if (!progress.getAndSet(false))
+                    stalledThreads++;
+            }
+            return stalledThreads;
+        }
+
+        int countDoneThreads()
+        {
+            int doneThreads = 0;
+            for (Future<Boolean> r : threadResultFuture)
+            {
+                if (r.isDone())
+                    doneThreads++;
+            }
+            return doneThreads;
+        }
+
+        void assertCheckedThreadsSucceeded()
+        {
+            try
+            {
+                for (Future<Boolean> r : threadResultFuture)
+                    assertTrue(r.get());
+            }
+            catch (InterruptedException ex)
+            {
+                // If interrupted while checking, restart and check everything.
+                assertCheckedThreadsSucceeded();
+            }
+            catch (ExecutionException ex)
+            {
+                fail("Checked thread threw exception: " + ex.toString());
+            }
+        }
+    }
+
+    public void testAllocate(int threadCount, long duration, int poolSize) throws InterruptedException, ExecutionException
+    {
+        System.out.println(String.format("%s - testing %d threads for %dm",
+                                         DATE_FORMAT.format(new Date()),
+                                         threadCount,
+                                         TimeUnit.NANOSECONDS.toMinutes(duration)));
+        long prevPoolSize = BufferPool.MEMORY_USAGE_THRESHOLD;
+        logger.info("Overriding configured BufferPool.MEMORY_USAGE_THRESHOLD={} and enabling BufferPool.DEBUG", poolSize);
+        BufferPool.MEMORY_USAGE_THRESHOLD = poolSize;
+        BufferPool.DEBUG = true;
+
+        TestEnvironment testEnv = new TestEnvironment(threadCount, duration, poolSize);
+
+        startBurnerThreads(testEnv);
+
+        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++)
+            testEnv.addCheckedFuture(startWorkerThread(testEnv, threadIdx));
+
+        while (!testEnv.latch.await(10L, TimeUnit.SECONDS))
+        {
+            int stalledThreads = testEnv.countStalledThreads();
+            int doneThreads = testEnv.countDoneThreads();
+
+            if (doneThreads == 0) // If any threads have completed, they will stop making progress/recycling buffers.
+            {                     // Assertions failures on the threads will be caught below.
+                assert stalledThreads == 0;
+                boolean allFreed = testEnv.burnFreed.getAndSet(false);
+                for (AtomicBoolean freedMemory : testEnv.freedAllMemory)
+                    allFreed = allFreed && freedMemory.getAndSet(false);
+                if (allFreed)
+                    BufferPool.assertAllRecycled();
+                else
+                    logger.info("All threads did not free all memory in this time slot - skipping buffer recycle check");
+            }
+        }
+
+        for (SPSCQueue<BufferCheck> queue : testEnv.sharedRecycle)
+        {
+            BufferCheck check;
+            while ( null != (check = queue.poll()) )
+            {
+                check.validate();
+                BufferPool.put(check.buffer);
+            }
+        }
+
+        assertEquals(0, testEnv.executorService.shutdownNow().size());
+
+        logger.info("Reverting BufferPool.MEMORY_USAGE_THRESHOLD={}", prevPoolSize);
+        BufferPool.MEMORY_USAGE_THRESHOLD = prevPoolSize;
+        BufferPool.DEBUG = false;
+
+        testEnv.assertCheckedThreadsSucceeded();
+
+        System.out.println(String.format("%s - finished.",
+                                         DATE_FORMAT.format(new Date())));
+    }
+
+    private Future<Boolean> startWorkerThread(TestEnvironment testEnv, final int threadIdx)
+    {
+        return testEnv.executorService.submit(new TestUntil(testEnv.until)
+        {
+            final int targetSize = threadIdx == 0 ? BufferPool.CHUNK_SIZE : testEnv.targetSizeQuanta * threadIdx;
+            final SPSCQueue<BufferCheck> shareFrom = testEnv.sharedRecycle[threadIdx];
+            final DynamicList<BufferCheck> checks = new DynamicList<>((int) Math.max(1, targetSize / (1 << 10)));
+            final SPSCQueue<BufferCheck> shareTo = testEnv.sharedRecycle[(threadIdx + 1) % testEnv.threadCount];
+            final ThreadLocalRandom rand = ThreadLocalRandom.current();
+            int totalSize = 0;
+            int freeingSize = 0;
+            int size = 0;
+
+            void checkpoint()
+            {
+                if (!testEnv.makingProgress[threadIdx].get())
+                    testEnv.makingProgress[threadIdx].set(true);
+            }
+
+            void testOne() throws Exception
+            {
+
+                long currentTargetSize = (rand.nextInt(testEnv.poolSize / 1024) == 0 || !testEnv.freedAllMemory[threadIdx].get()) ? 0 : targetSize;
+                int spinCount = 0;
+                while (totalSize > currentTargetSize - freeingSize)
+                {
+                    // free buffers until we're below our target size
+                    if (checks.size() == 0)
+                    {
+                        // if we're out of buffers to free, we're waiting on our neighbour to free them;
+                        // first check if the consuming neighbour has caught up, and if so mark that free
+                        if (shareTo.exhausted)
+                        {
+                            totalSize -= freeingSize;
+                            freeingSize = 0;
+                        }
+                        else if (!recycleFromNeighbour())
+                        {
+                            if (++spinCount > 1000 && System.nanoTime() > until)
+                                return;
+                            // otherwise, free one of our other neighbour's buffers if can; and otherwise yield
+                            Thread.yield();
+                        }
+                        continue;
+                    }
+
+                    // pick a random buffer, with preference going to earlier ones
+                    BufferCheck check = sample();
+                    checks.remove(check.listnode);
+                    check.validate();
+
+                    size = BufferPool.roundUpNormal(check.buffer.capacity());
+                    if (size > BufferPool.CHUNK_SIZE)
+                        size = 0;
+
+                    // either share to free, or free immediately
+                    if (rand.nextBoolean())
+                    {
+                        shareTo.add(check);
+                        freeingSize += size;
+                        // interleave this with potentially messing with the other neighbour's stuff
+                        recycleFromNeighbour();
+                    }
+                    else
+                    {
+                        check.validate();
+                        BufferPool.put(check.buffer);
+                        totalSize -= size;
+                    }
+                }
+
+                if (currentTargetSize == 0)
+                    testEnv.freedAllMemory[threadIdx].compareAndSet(false, true);
+
+                // allocate a new buffer
+                size = (int) Math.max(1, AVG_BUFFER_SIZE + (STDEV_BUFFER_SIZE * rand.nextGaussian()));
+                if (size <= BufferPool.CHUNK_SIZE)
+                {
+                    totalSize += BufferPool.roundUpNormal(size);
+                    allocate(size);
+                }
+                else if (rand.nextBoolean())
+                {
+                    allocate(size);
+                }
+                else
+                {
+                    // perform a burst allocation to exhaust all available memory
+                    while (totalSize < testEnv.poolSize)
+                    {
+                        size = (int) Math.max(1, AVG_BUFFER_SIZE + (STDEV_BUFFER_SIZE * rand.nextGaussian()));
+                        if (size <= BufferPool.CHUNK_SIZE)
+                        {
+                            allocate(size);
+                            totalSize += BufferPool.roundUpNormal(size);
+                        }
+                    }
+                }
+
+                // validate a random buffer we have stashed
+                checks.get(rand.nextInt(checks.size())).validate();
+
+                // free all of our neighbour's remaining shared buffers
+                while (recycleFromNeighbour());
+            }
+
+            void cleanup()
+            {
+                while (checks.size() > 0)
+                {
+                    BufferCheck check = checks.get(0);
+                    BufferPool.put(check.buffer);
+                    checks.remove(check.listnode);
+                }
+                testEnv.latch.countDown();
+            }
+
+            boolean recycleFromNeighbour()
+            {
+                BufferCheck check = shareFrom.poll();
+                if (check == null)
+                    return false;
+                check.validate();
+                BufferPool.put(check.buffer);
+                return true;
+            }
+
+            BufferCheck allocate(int size)
+            {
+                ByteBuffer buffer = BufferPool.get(size);
+                assertNotNull(buffer);
+                BufferCheck check = new BufferCheck(buffer, rand.nextLong());
+                assertEquals(size, buffer.capacity());
+                assertEquals(0, buffer.position());
+                check.init();
+                check.listnode = checks.append(check);
+                return check;
+            }
+
+            BufferCheck sample()
+            {
+                // sample with preference to first elements:
+                // element at index n will be selected with likelihood (size - n) / sum1ToN(size)
+                int size = checks.size();
+
+                // pick a random number between 1 and sum1toN(size)
+                int sampleRange = sum1toN(size);
+                int sampleIndex = rand.nextInt(sampleRange);
+
+                // then binary search for the N, such that [sum1ToN(N), sum1ToN(N+1)) contains this random number
+                int moveBy = Math.max(size / 4, 1);
+                int index = size / 2;
+                while (true)
+                {
+                    int baseSampleIndex = sum1toN(index);
+                    int endOfSampleIndex = sum1toN(index + 1);
+                    if (sampleIndex >= baseSampleIndex)
+                    {
+                        if (sampleIndex < endOfSampleIndex)
+                            break;
+                        index += moveBy;
+                    }
+                    else index -= moveBy;
+                    moveBy = Math.max(moveBy / 2, 1);
+                }
+
+                // this gives us the inverse of our desired value, so just subtract it from the last index
+                index = size - (index + 1);
+
+                return checks.get(index);
+            }
+        });
+    }
+
+    private void startBurnerThreads(TestEnvironment testEnv)
+    {
+        // setup some high churn allocate/deallocate, without any checking
+        final SPSCQueue<ByteBuffer> burn = new SPSCQueue<>();
+        final CountDownLatch doneAdd = new CountDownLatch(1);
+        testEnv.addCheckedFuture(testEnv.executorService.submit(new TestUntil(testEnv.until)
+        {
+            int count = 0;
+            final ThreadLocalRandom rand = ThreadLocalRandom.current();
+            void testOne() throws Exception
+            {
+                if (count * BufferPool.CHUNK_SIZE >= testEnv.poolSize / 10)
+                {
+                    if (burn.exhausted)
+                    {
+                        count = 0;
+                        testEnv.burnFreed.compareAndSet(false, true);
+                    } else
+                    {
+                        Thread.yield();
+                    }
+                    return;
+                }
+
+                ByteBuffer buffer = BufferPool.tryGet(BufferPool.CHUNK_SIZE);
+                if (buffer == null)
+                {
+                    Thread.yield();
+                    return;
+                }
+
+                // 50/50 chance of returning the buffer from the producer thread, or
+                // pass it on to the consumer.
+                if (rand.nextBoolean())
+                    BufferPool.put(buffer);
+                else
+                    burn.add(buffer);
+
+                count++;
+            }
+            void cleanup()
+            {
+                doneAdd.countDown();
+            }
+        }));
+        testEnv.threadResultFuture.add(testEnv.executorService.submit(new TestUntil(testEnv.until)
+        {
+            void testOne() throws Exception
+            {
+                ByteBuffer buffer = burn.poll();
+                if (buffer == null)
+                {
+                    Thread.yield();
+                    return;
+                }
+                BufferPool.put(buffer);
+            }
+            void cleanup()
+            {
+                Uninterruptibles.awaitUninterruptibly(doneAdd);
+            }
+        }));
+    }
+
+    static abstract class TestUntil implements Callable<Boolean>
+    {
+        final long until;
+        protected TestUntil(long until)
+        {
+            this.until = until;
+        }
+
+        abstract void testOne() throws Exception;
+        void checkpoint() {}
+        void cleanup() {}
+
+        public Boolean call() throws Exception
+        {
+            try
+            {
+                while (System.nanoTime() < until)
+                {
+                    checkpoint();
+                    for (int i = 0 ; i < 100 ; i++)
+                        testOne();
+                }
+            }
+            catch (Exception ex)
+            {
+                logger.error("Got exception {}, current chunk {}",
+                             ex.getMessage(),
+                             BufferPool.currentChunk());
+                ex.printStackTrace();
+                return false;
+            }
+            catch (Throwable tr) // for java.lang.OutOfMemoryError
+            {
+                logger.error("Got throwable {}, current chunk {}",
+                             tr.getMessage(),
+                             BufferPool.currentChunk());
+                tr.printStackTrace();
+                return false;
+            }
+            finally
+            {
+                cleanup();
+            }
+            return true;
+        }
+    }
+
+    public static void main(String[] args)
+    {
+        try
+        {
+            new LongBufferPoolTest().testAllocate(Runtime.getRuntime().availableProcessors(),
+                                                  TimeUnit.HOURS.toNanos(2L), 16 << 20);
+            System.exit(0);
+        }
+        catch (Throwable tr)
+        {
+            System.out.println(String.format("Test failed - %s", tr.getMessage()));
+            System.exit(1); // Force exit so that non-daemon threads like REQUEST-SCHEDULER do not hang the process on failure
+        }
+    }
+
+    /**
+     * A single producer, single consumer queue.
+     */
+    private static final class SPSCQueue<V>
+    {
+        static final class Node<V>
+        {
+            volatile Node<V> next;
+            final V value;
+            Node(V value)
+            {
+                this.value = value;
+            }
+        }
+
+        private volatile boolean exhausted = true;
+        Node<V> head = new Node<>(null);
+        Node<V> tail = head;
+
+        void add(V value)
+        {
+            exhausted = false;
+            tail = tail.next = new Node<>(value);
+        }
+
+        V poll()
+        {
+            Node<V> next = head.next;
+            if (next == null)
+            {
+                // this is racey, but good enough for our purposes
+                exhausted = true;
+                return null;
+            }
+            head = next;
+            return next.value;
+        }
+    }
+
+    private static int sum1toN(int n)
+    {
+        return (n * (n + 1)) / 2;
+    }
+}

diff --git a/test/conf/cassandra.yaml b/test/conf/cassandra.yaml
index 3d3de84..1dba284 100644
--- a/test/conf/cassandra.yaml
+++ b/test/conf/cassandra.yaml

@@ -3,11 +3,12 @@
 # Consider the effects on 'o.a.c.i.s.LegacySSTableTest' before changing schemas in this file.
 #
 cluster_name: Test Cluster
-memtable_allocation_type: offheap_objects
+memtable_allocation_type: heap_buffers
 commitlog_sync: batch
 commitlog_sync_batch_window_in_ms: 1.0
 commitlog_segment_size_in_mb: 5
 commitlog_directory: build/test/cassandra/commitlog
+hints_directory: build/test/cassandra/hints
 partitioner: org.apache.cassandra.dht.ByteOrderedPartitioner
 listen_address: 127.0.0.1
 storage_port: 7010
@@ -39,3 +40,4 @@
 row_cache_class_name: org.apache.cassandra.cache.OHCProvider
 row_cache_size_in_mb: 16
 enable_user_defined_functions: true
+enable_scripted_user_defined_functions: true

diff --git a/test/conf/cassandra_pig.yaml b/test/conf/cassandra_pig.yaml
deleted file mode 100644
index 68615cf..0000000
--- a/test/conf/cassandra_pig.yaml
+++ /dev/null

@@ -1,41 +0,0 @@
-#
-# Warning!
-# Consider the effects on 'o.a.c.i.s.LegacySSTableTest' before changing schemas in this file.
-#
-cluster_name: Test Cluster
-memtable_allocation_type: offheap_objects
-commitlog_sync: batch
-commitlog_sync_batch_window_in_ms: 1.0
-commitlog_segment_size_in_mb: 5
-commitlog_directory: build/test/cassandra/commitlog
-partitioner: org.apache.cassandra.dht.Murmur3Partitioner
-listen_address: 127.0.0.1
-storage_port: 7010
-rpc_port: 9170
-start_native_transport: true
-native_transport_port: 9042
-column_index_size_in_kb: 4
-saved_caches_directory: build/test/cassandra/saved_caches
-data_file_directories:
-    - build/test/cassandra/data
-disk_access_mode: mmap
-seed_provider:
-    - class_name: org.apache.cassandra.locator.SimpleSeedProvider
-      parameters:
-          - seeds: "127.0.0.1"
-endpoint_snitch: org.apache.cassandra.locator.SimpleSnitch
-dynamic_snitch: true
-request_scheduler: org.apache.cassandra.scheduler.RoundRobinScheduler
-request_scheduler_id: keyspace
-server_encryption_options:
-    internode_encryption: none
-    keystore: conf/.keystore
-    keystore_password: cassandra
-    truststore: conf/.truststore
-    truststore_password: cassandra
-incremental_backups: true
-concurrent_compactors: 4
-compaction_throughput_mb_per_sec: 0
-row_cache_class_name: org.apache.cassandra.cache.OHCProvider
-row_cache_size_in_mb: 16
-enable_user_defined_functions: true

diff --git a/test/conf/logback-test.xml b/test/conf/logback-test.xml
index 6d75aaf..72550fe 100644
--- a/test/conf/logback-test.xml
+++ b/test/conf/logback-test.xml

@@ -17,13 +17,18 @@
  under the License.
 -->
 
-<configuration debug="false">
+<configuration debug="false" scan="true">
+  <!-- Shutdown hook ensures that async appender flushes -->
   <shutdownHook class="ch.qos.logback.core.hook.DelayingShutdownHook"/>
 
+  <!-- Status listener is used to wrap stdout/stderr and tee to log file -->
+  <statusListener class="org.apache.cassandra.LogbackStatusListener" />
+
   <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
-    <file>./build/test/logs/system.log</file>
+
+    <file>./build/test/logs/${cassandra.testtag}/TEST-${suitename}.log</file>
     <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
-      <fileNamePattern>./build/test/logs/system.log.%i.zip</fileNamePattern>
+      <fileNamePattern>./build/test/logs/${cassandra.testtag}/TEST-${suitename}.log.%i.gz</fileNamePattern>
       <minIndex>1</minIndex>
       <maxIndex>20</maxIndex>
     </rollingPolicy>
@@ -31,42 +36,39 @@
     <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
       <maxFileSize>20MB</maxFileSize>
     </triggeringPolicy>
+
     <encoder>
       <pattern>%-5level [%thread] %date{ISO8601} %msg%n</pattern>
-      <immediateFlush>false</immediateFlush>
     </encoder>
+    <immediateFlush>false</immediateFlush>
   </appender>
   
-  <appender name="ASYNCFILE" class="ch.qos.logback.classic.AsyncAppender">
-      <discardingThreshold>0</discardingThreshold>
-      <maxFlushTime>0</maxFlushTime>
-      <queueSize>1024</queueSize>
-      <appender-ref ref="FILE"/>
+  <appender name="STDOUT" target="System.out" class="org.apache.cassandra.ConsoleAppender">
+    <encoder>
+      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
+    </encoder>
+    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+      <level>INFO</level>
+    </filter>
   </appender>
 
-    <appender name="STDERR" target="System.err" class="ch.qos.logback.core.ConsoleAppender">
-    <encoder>
-      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
-    </encoder>
-    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
-      <level>WARN</level>
-    </filter>
-  </appender>
-  
-  <appender name="STDOUT" target="System.out" class="ch.qos.logback.core.ConsoleAppender">
-    <encoder>
-      <pattern>%-5level %date{HH:mm:ss,SSS} %msg%n</pattern>
-    </encoder>
-    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
-      <level>WARN</level>
-    </filter>
+  <appender name="TEE" class="org.apache.cassandra.TeeingAppender">
+      <appender-ref ref="FILE"/>
+      <appender-ref ref="STDOUT"/>
   </appender>
 
   <logger name="org.apache.hadoop" level="WARN"/>
 
+  <!-- Do not change the name of this appender. LogbackStatusListener uses the thread name
+       tied to the appender name to know when to write to real stdout/stderr vs forwarding to logback -->
+  <appender name="ASYNC" class="ch.qos.logback.classic.AsyncAppender">
+      <discardingThreshold>0</discardingThreshold>
+      <maxFlushTime>0</maxFlushTime>
+      <queueSize>1024</queueSize>
+      <appender-ref ref="TEE"/>
+  </appender>
+
   <root level="DEBUG">
-    <appender-ref ref="ASYNCFILE" />
-    <appender-ref ref="STDERR" />
-    <appender-ref ref="STDOUT" />
+    <appender-ref ref="ASYNC" />
   </root>
 </configuration>

diff --git a/test/data/bloom-filter/ka/foo.cql b/test/data/bloom-filter/ka/foo.cql
new file mode 100644
index 0000000..c4aed6a
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo.cql

@@ -0,0 +1,64 @@
+create keyspace foo with replication = {'class':'SimpleStrategy', 'replication_factor':1};
+use foo ;
+create table atable ( pk int primary key, val int);
+insert into atable (pk, val) VALUES ( 1,1);
+insert into atable (pk, val) VALUES ( 2,2);
+insert into atable (pk, val) VALUES ( 3,3);
+insert into atable (pk, val) VALUES ( 4,4);
+insert into atable (pk, val) VALUES ( 5,5);
+insert into atable (pk, val) VALUES ( 6,6);
+insert into atable (pk, val) VALUES ( 7,7);
+insert into atable (pk, val) VALUES ( 8,8);
+insert into atable (pk, val) VALUES ( 9,9);
+insert into atable (pk, val) VALUES ( 10,10);
+
+
+
+[
+{"key": "5",
+ "cells": [["","",1428529571195019],
+           ["val","5",1428529571195019]]},
+{"key": "10",
+ "cells": [["","",1428529588242944],
+           ["val","10",1428529588242944]]},
+{"key": "1",
+ "cells": [["","",1428529563371015],
+           ["val","1",1428529563371015]]},
+{"key": "8",
+ "cells": [["","",1428529582362836],
+           ["val","8",1428529582362836]]},
+{"key": "2",
+ "cells": [["","",1428529565275080],
+           ["val","2",1428529565275080]]},
+{"key": "4",
+ "cells": [["","",1428529569331171],
+           ["val","4",1428529569331171]]},
+{"key": "7",
+ "cells": [["","",1428529575898967],
+           ["val","7",1428529575898967]]},
+{"key": "6",
+ "cells": [["","",1428529573027018],
+           ["val","6",1428529573027018]]},
+{"key": "9",
+ "cells": [["","",1428529585667042],
+           ["val","9",1428529585667042]]},
+{"key": "3",
+ "cells": [["","",1428529567379095],
+           ["val","3",1428529567379095]]}
+]
+
+
+
+SSTable: test/data/legacy-sstables/ka/foo/foo-atable-ka-1
+Partitioner: org.apache.cassandra.dht.Murmur3Partitioner
+Bloom Filter FP chance: 0,010000
+Minimum timestamp: 1428529563371015
+Maximum timestamp: 1428529588242944
+SSTable max local deletion time: 2147483647
+Compression ratio: 0.4
+Estimated droppable tombstones: 0.0
+SSTable Level: 0
+Repaired at: 0
+ReplayPosition(segmentId=1428529465658, position=6481)
+Estimated tombstone drop times:%n
+

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-CompressionInfo.db b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..0c9c6e6
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Data.db b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Data.db
new file mode 100644
index 0000000..c6a646b
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Data.db
Binary files differ

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Digest.sha1 b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Digest.sha1
new file mode 100644
index 0000000..e2c1b2a
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Digest.sha1

@@ -0,0 +1 @@
+4153355033
\ No newline at end of file

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Filter.db b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Filter.db
new file mode 100644
index 0000000..ea01eae
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Filter.db
Binary files differ

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Index.db b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Index.db
new file mode 100644
index 0000000..480fd51
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Index.db
Binary files differ

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Statistics.db b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Statistics.db
new file mode 100644
index 0000000..037e0b4
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Summary.db b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Summary.db
new file mode 100644
index 0000000..602ec06
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-Summary.db
Binary files differ

diff --git a/test/data/bloom-filter/ka/foo/foo-atable-ka-1-TOC.txt b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-TOC.txt
new file mode 100644
index 0000000..8fb2ca0
--- /dev/null
+++ b/test/data/bloom-filter/ka/foo/foo-atable-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Statistics.db
+Filter.db
+Index.db
+Summary.db
+Digest.sha1
+Data.db
+TOC.txt

diff --git a/test/data/bloom-filter/la/foo/la-1-big-CompressionInfo.db b/test/data/bloom-filter/la/foo/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..0c9c6e6
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/bloom-filter/la/foo/la-1-big-Data.db b/test/data/bloom-filter/la/foo/la-1-big-Data.db
new file mode 100644
index 0000000..dc3536d
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-Data.db
Binary files differ

diff --git a/test/data/bloom-filter/la/foo/la-1-big-Digest.adler32 b/test/data/bloom-filter/la/foo/la-1-big-Digest.adler32
new file mode 100644
index 0000000..d6952e2
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+3417187619
\ No newline at end of file

diff --git a/test/data/bloom-filter/la/foo/la-1-big-Filter.db b/test/data/bloom-filter/la/foo/la-1-big-Filter.db
new file mode 100644
index 0000000..533a611
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/bloom-filter/la/foo/la-1-big-Index.db b/test/data/bloom-filter/la/foo/la-1-big-Index.db
new file mode 100644
index 0000000..480fd51
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-Index.db
Binary files differ

diff --git a/test/data/bloom-filter/la/foo/la-1-big-Statistics.db b/test/data/bloom-filter/la/foo/la-1-big-Statistics.db
new file mode 100644
index 0000000..b8127c2
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/bloom-filter/la/foo/la-1-big-Summary.db b/test/data/bloom-filter/la/foo/la-1-big-Summary.db
new file mode 100644
index 0000000..602ec06
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/bloom-filter/la/foo/la-1-big-TOC.txt b/test/data/bloom-filter/la/foo/la-1-big-TOC.txt
new file mode 100644
index 0000000..ee47456
--- /dev/null
+++ b/test/data/bloom-filter/la/foo/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+CompressionInfo.db
+Summary.db
+Index.db
+TOC.txt
+Filter.db
+Statistics.db
+Digest.adler32

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-CompressionInfo.db b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-CompressionInfo.db
deleted file mode 100644
index 44d2e59..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-CompressionInfo.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Data.db b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Data.db
deleted file mode 100644
index f75c4e6..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Data.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Filter.db b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Filter.db
deleted file mode 100644
index 8f0a999..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Filter.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Index.db b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Index.db
deleted file mode 100644
index da84fbc..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Index.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Statistics.db b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Statistics.db
deleted file mode 100644
index 0762615..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Statistics.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Summary.db b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Summary.db
deleted file mode 100644
index 6eb7650..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-Summary.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-TOC.txt b/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-TOC.txt
deleted file mode 100644
index cf6efa8..0000000
--- a/test/data/corrupt-sstables/Keyspace1-Standard3-jb-1-TOC.txt
+++ /dev/null

@@ -1,7 +0,0 @@
-CompressionInfo.db
-TOC.txt
-Filter.db
-Statistics.db
-Data.db
-Summary.db
-Index.db

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-CRC.db b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-CRC.db
deleted file mode 100644
index fc23cfe..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-CRC.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Data.db b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Data.db
deleted file mode 100644
index a4157d3..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Data.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Digest.sha1 b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Digest.sha1
deleted file mode 100644
index fb42fa9..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Digest.sha1
+++ /dev/null

@@ -1 +0,0 @@
-3265926428
\ No newline at end of file

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Filter.db b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Filter.db
deleted file mode 100644
index eb0ae30..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Filter.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Index.db b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Index.db
deleted file mode 100644
index 69a2fce..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Index.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Statistics.db b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Statistics.db
deleted file mode 100644
index 1cba196..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Statistics.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Summary.db b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Summary.db
deleted file mode 100644
index 190922a..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-Summary.db
+++ /dev/null
Binary files differ

diff --git a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-TOC.txt b/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-TOC.txt
deleted file mode 100644
index 503f64d..0000000
--- a/test/data/corrupt-sstables/Keyspace1-StandardInteger1-ka-2-TOC.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-Digest.sha1

-Summary.db

-Filter.db

-Index.db

-Statistics.db

-Data.db

-CRC.db

-TOC.txt


diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-CompressionInfo.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..307eeb3
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Data.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Data.db
new file mode 100644
index 0000000..175a5b6
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Data.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Digest.adler32 b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Digest.adler32
new file mode 100644
index 0000000..ad624d2
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Digest.adler32

@@ -0,0 +1 @@
+408097082
\ No newline at end of file

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Filter.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Filter.db
new file mode 100644
index 0000000..00a88b4
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Filter.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Index.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Index.db
new file mode 100644
index 0000000..c3b42d8
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Index.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Statistics.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Statistics.db
new file mode 100644
index 0000000..056cf17
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Summary.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Summary.db
new file mode 100644
index 0000000..453753f
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-Summary.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-TOC.txt b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-TOC.txt
new file mode 100644
index 0000000..ceb1dab
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_2_0/lb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Digest.adler32
+TOC.txt
+Filter.db
+Data.db
+Index.db
+Statistics.db
+Summary.db

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-CompressionInfo.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-CompressionInfo.db
new file mode 100644
index 0000000..3c39b5d
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Data.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Data.db
new file mode 100644
index 0000000..1f90815
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Data.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Digest.crc32 b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Digest.crc32
new file mode 100644
index 0000000..eeb8a5f
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Digest.crc32

@@ -0,0 +1 @@
+3332428483
\ No newline at end of file

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Filter.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Filter.db
new file mode 100644
index 0000000..f9c2d6e
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Filter.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Index.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Index.db
new file mode 100644
index 0000000..b077026
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Index.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Statistics.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Statistics.db
new file mode 100644
index 0000000..0b49b88
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Statistics.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Summary.db b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Summary.db
new file mode 100644
index 0000000..4547a94
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-Summary.db
Binary files differ

diff --git a/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-TOC.txt b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-TOC.txt
new file mode 100644
index 0000000..9a29338
--- /dev/null
+++ b/test/data/invalid-legacy-sstables/Keyspace1/cf_with_duplicates_3_0/mb-3-big-TOC.txt

@@ -0,0 +1,8 @@
+Statistics.db
+Digest.crc32
+Summary.db
+Index.db
+TOC.txt
+CompressionInfo.db
+Filter.db
+Data.db

diff --git a/test/data/legacy-commitlog/2.2-lz4/CommitLog-5-1438186885380.log b/test/data/legacy-commitlog/2.2-lz4/CommitLog-5-1438186885380.log
new file mode 100644
index 0000000..b98304a
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2-lz4/CommitLog-5-1438186885380.log
Binary files differ

diff --git a/test/data/legacy-commitlog/2.2-lz4/CommitLog-5-1438186885381.log b/test/data/legacy-commitlog/2.2-lz4/CommitLog-5-1438186885381.log
new file mode 100644
index 0000000..adac94f
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2-lz4/CommitLog-5-1438186885381.log
Binary files differ

diff --git a/test/data/legacy-commitlog/2.2-lz4/hash.txt b/test/data/legacy-commitlog/2.2-lz4/hash.txt
new file mode 100644
index 0000000..20aa6e5
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2-lz4/hash.txt

@@ -0,0 +1,5 @@
+#CommitLog upgrade test, version 2.2.0-SNAPSHOT
+#Wed Jul 29 19:21:31 EEST 2015
+cells=6052
+hash=1274136076
+cfid=dc32ce20-360d-11e5-826c-afadad37221d

diff --git a/test/data/legacy-commitlog/2.2-snappy/CommitLog-5-1438186915514.log b/test/data/legacy-commitlog/2.2-snappy/CommitLog-5-1438186915514.log
new file mode 100644
index 0000000..e69dfb7
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2-snappy/CommitLog-5-1438186915514.log
Binary files differ

diff --git a/test/data/legacy-commitlog/2.2-snappy/CommitLog-5-1438186915515.log b/test/data/legacy-commitlog/2.2-snappy/CommitLog-5-1438186915515.log
new file mode 100644
index 0000000..3e06675
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2-snappy/CommitLog-5-1438186915515.log
Binary files differ

diff --git a/test/data/legacy-commitlog/2.2-snappy/hash.txt b/test/data/legacy-commitlog/2.2-snappy/hash.txt
new file mode 100644
index 0000000..f3dd72e
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2-snappy/hash.txt

@@ -0,0 +1,5 @@
+#CommitLog upgrade test, version 2.2.0-SNAPSHOT
+#Wed Jul 29 19:22:01 EEST 2015
+cells=6051
+hash=881633109
+cfid=ee2fe860-360d-11e5-951c-afadad37221d

diff --git a/test/data/legacy-commitlog/2.2/CommitLog-5-1438186815314.log b/test/data/legacy-commitlog/2.2/CommitLog-5-1438186815314.log
new file mode 100644
index 0000000..5032519
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2/CommitLog-5-1438186815314.log
Binary files differ

diff --git a/test/data/legacy-commitlog/2.2/CommitLog-5-1438186815315.log b/test/data/legacy-commitlog/2.2/CommitLog-5-1438186815315.log
new file mode 100644
index 0000000..34a02fe
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2/CommitLog-5-1438186815315.log
Binary files differ

diff --git a/test/data/legacy-commitlog/2.2/hash.txt b/test/data/legacy-commitlog/2.2/hash.txt
new file mode 100644
index 0000000..64f9dbb
--- /dev/null
+++ b/test/data/legacy-commitlog/2.2/hash.txt

@@ -0,0 +1,5 @@
+#CommitLog upgrade test, version 2.2.0-SNAPSHOT
+#Wed Jul 29 19:20:21 EEST 2015
+cells=6366
+hash=-802535821
+cfid=b28a7000-360d-11e5-ae92-afadad37221d

diff --git a/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db
index 83c68ce..1fbe040 100644
--- a/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db
+++ b/test/data/legacy-sstables/jb/Keyspace1/Keyspace1-Standard1-jb-0-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..6d49922
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Data.db
new file mode 100644
index 0000000..326498b
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Index.db
new file mode 100644
index 0000000..44b89c4
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Statistics.db
new file mode 100644
index 0000000..a9a404a
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Summary.db
new file mode 100644
index 0000000..266c494
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-TOC.txt
new file mode 100644
index 0000000..abc3147
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust/legacy_tables-legacy_jb_clust-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+CompressionInfo.db
+Statistics.db
+Filter.db
+Data.db
+TOC.txt
+Index.db
+Summary.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..5eddda7
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Data.db
new file mode 100644
index 0000000..61ef270
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Index.db
new file mode 100644
index 0000000..9e18f8e
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Statistics.db
new file mode 100644
index 0000000..ab83acc
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Summary.db
new file mode 100644
index 0000000..896a529
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-TOC.txt
new file mode 100644
index 0000000..b67360a
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_compact/legacy_tables-legacy_jb_clust_compact-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+Data.db
+CompressionInfo.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+Filter.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..fe2e257
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Data.db
new file mode 100644
index 0000000..12c8fdc
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Index.db
new file mode 100644
index 0000000..51ddf91
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Statistics.db
new file mode 100644
index 0000000..a5eff40
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Summary.db
new file mode 100644
index 0000000..750a780
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-TOC.txt
new file mode 100644
index 0000000..abc3147
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter/legacy_tables-legacy_jb_clust_counter-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+CompressionInfo.db
+Statistics.db
+Filter.db
+Data.db
+TOC.txt
+Index.db
+Summary.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..34d459d
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Data.db
new file mode 100644
index 0000000..b511d30
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Index.db
new file mode 100644
index 0000000..10df1e8
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Statistics.db
new file mode 100644
index 0000000..aa3c757
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Summary.db
new file mode 100644
index 0000000..896a529
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-TOC.txt
new file mode 100644
index 0000000..b67360a
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_clust_counter_compact/legacy_tables-legacy_jb_clust_counter_compact-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+Data.db
+CompressionInfo.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+Filter.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..c80e64c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Data.db
new file mode 100644
index 0000000..401fe93
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Index.db
new file mode 100644
index 0000000..f0717e0
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Statistics.db
new file mode 100644
index 0000000..a2bcfaf
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Summary.db
new file mode 100644
index 0000000..af5e781
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-TOC.txt
new file mode 100644
index 0000000..abc3147
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple/legacy_tables-legacy_jb_simple-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+CompressionInfo.db
+Statistics.db
+Filter.db
+Data.db
+TOC.txt
+Index.db
+Summary.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..d530b73
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Data.db
new file mode 100644
index 0000000..c7e8586
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Index.db
new file mode 100644
index 0000000..d2ec218
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Statistics.db
new file mode 100644
index 0000000..792e733
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Summary.db
new file mode 100644
index 0000000..af5e781
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-TOC.txt
new file mode 100644
index 0000000..b67360a
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_compact/legacy_tables-legacy_jb_simple_compact-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+Data.db
+CompressionInfo.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+Filter.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..9c3416e
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Data.db
new file mode 100644
index 0000000..b72f790
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Index.db
new file mode 100644
index 0000000..932936c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Statistics.db
new file mode 100644
index 0000000..6baf1de
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Summary.db
new file mode 100644
index 0000000..af5e781
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-TOC.txt
new file mode 100644
index 0000000..abc3147
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter/legacy_tables-legacy_jb_simple_counter-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+CompressionInfo.db
+Statistics.db
+Filter.db
+Data.db
+TOC.txt
+Index.db
+Summary.db

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-CompressionInfo.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-CompressionInfo.db
new file mode 100644
index 0000000..01c5478
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Data.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Data.db
new file mode 100644
index 0000000..f545b04
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Filter.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Index.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Index.db
new file mode 100644
index 0000000..48c153c
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Statistics.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Statistics.db
new file mode 100644
index 0000000..8657050
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Summary.db b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Summary.db
new file mode 100644
index 0000000..af5e781
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-TOC.txt b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-TOC.txt
new file mode 100644
index 0000000..b67360a
--- /dev/null
+++ b/test/data/legacy-sstables/jb/legacy_tables/legacy_jb_simple_counter_compact/legacy_tables-legacy_jb_simple_counter_compact-jb-1-TOC.txt

@@ -0,0 +1,7 @@
+Data.db
+CompressionInfo.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..b5b5246
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db
new file mode 100644
index 0000000..18cf478
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1
new file mode 100644
index 0000000..f37a2b3
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1

@@ -0,0 +1 @@
+1576541413
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db
new file mode 100644
index 0000000..7a31048
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db
new file mode 100644
index 0000000..5e4995c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db
new file mode 100644
index 0000000..d4b0526
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db
new file mode 100644
index 0000000..38cc933
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt
new file mode 100644
index 0000000..db5ac46
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+TOC.txt
+Digest.sha1
+Filter.db
+Statistics.db
+CompressionInfo.db
+Summary.db
+Index.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..bb15937
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Data.db
new file mode 100644
index 0000000..9f946ab
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Digest.sha1
new file mode 100644
index 0000000..ec58891
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Digest.sha1

@@ -0,0 +1 @@
+2454867855
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Filter.db
new file mode 100644
index 0000000..606783d
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Index.db
new file mode 100644
index 0000000..bcf40a1
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Statistics.db
new file mode 100644
index 0000000..d30baa5
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Summary.db
new file mode 100644
index 0000000..a4d9a6e
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-TOC.txt
new file mode 100644
index 0000000..141f12c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14803/legacy_tables-legacy_ka_14803-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Summary.db
+Data.db
+Index.db
+Digest.sha1
+CompressionInfo.db
+TOC.txt
+Filter.db
+Statistics.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..4a87419
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Data.db
new file mode 100644
index 0000000..007cc50
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Digest.sha1
new file mode 100644
index 0000000..71e6242
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Digest.sha1

@@ -0,0 +1 @@
+4060752841
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Filter.db
new file mode 100644
index 0000000..7a31048
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Index.db
new file mode 100644
index 0000000..7245332
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Statistics.db
new file mode 100644
index 0000000..f4b26ee0
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Summary.db
new file mode 100644
index 0000000..c1784f4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-TOC.txt
new file mode 100644
index 0000000..db5ac46
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14873/legacy_tables-legacy_ka_14873-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+TOC.txt
+Digest.sha1
+Filter.db
+Statistics.db
+CompressionInfo.db
+Summary.db
+Index.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..cf8c97a
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Data.db
new file mode 100644
index 0000000..19c7d79
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Digest.sha1
new file mode 100644
index 0000000..66d3a1c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Digest.sha1

@@ -0,0 +1 @@
+2565739962
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Filter.db
new file mode 100644
index 0000000..1b7fa17
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Index.db
new file mode 100644
index 0000000..a34ee93
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Statistics.db
new file mode 100644
index 0000000..405c3e3
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Summary.db
new file mode 100644
index 0000000..9756785
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-TOC.txt
new file mode 100644
index 0000000..7c351d8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14912/legacy_tables-legacy_ka_14912-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Filter.db
+Index.db
+Summary.db
+Data.db
+CompressionInfo.db
+Digest.sha1
+Statistics.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..69a8355
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Data.db
new file mode 100644
index 0000000..7acbf92
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Digest.sha1
new file mode 100644
index 0000000..fef7106
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Digest.sha1

@@ -0,0 +1 @@
+4293822635
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Index.db
new file mode 100644
index 0000000..44b89c4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Statistics.db
new file mode 100644
index 0000000..5f07da5
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-TOC.txt
new file mode 100644
index 0000000..7be41d8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust/legacy_tables-legacy_ka_clust-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Digest.sha1
+CompressionInfo.db
+Data.db
+Statistics.db
+Summary.db
+TOC.txt
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..654094e
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Data.db
new file mode 100644
index 0000000..4c87e07
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Digest.sha1
new file mode 100644
index 0000000..4690757
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Digest.sha1

@@ -0,0 +1 @@
+1331331706
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Index.db
new file mode 100644
index 0000000..9e18f8e
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Statistics.db
new file mode 100644
index 0000000..ab55258
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Summary.db
new file mode 100644
index 0000000..774cbd1
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-TOC.txt
new file mode 100644
index 0000000..7f7fe79
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_compact/legacy_tables-legacy_ka_clust_compact-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+TOC.txt
+Statistics.db
+Summary.db
+Index.db
+Data.db
+Digest.sha1
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..3c7291c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Data.db
new file mode 100644
index 0000000..3566e5a
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Digest.sha1
new file mode 100644
index 0000000..a679541
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Digest.sha1

@@ -0,0 +1 @@
+2539906592
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Index.db
new file mode 100644
index 0000000..51ddf91
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Statistics.db
new file mode 100644
index 0000000..36e9dc2
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-TOC.txt
new file mode 100644
index 0000000..7be41d8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter/legacy_tables-legacy_ka_clust_counter-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Digest.sha1
+CompressionInfo.db
+Data.db
+Statistics.db
+Summary.db
+TOC.txt
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..e3b71a4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Data.db
new file mode 100644
index 0000000..90d42a5
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Digest.sha1
new file mode 100644
index 0000000..52e6552
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Digest.sha1

@@ -0,0 +1 @@
+2793875907
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Index.db
new file mode 100644
index 0000000..10df1e8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Statistics.db
new file mode 100644
index 0000000..8360ed5
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Summary.db
new file mode 100644
index 0000000..774cbd1
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-TOC.txt
new file mode 100644
index 0000000..7f7fe79
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_clust_counter_compact/legacy_tables-legacy_ka_clust_counter_compact-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+TOC.txt
+Statistics.db
+Summary.db
+Index.db
+Data.db
+Digest.sha1
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-CompressionInfo.db
new file mode 100644
index 0000000..d320406
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Data.db
new file mode 100644
index 0000000..775b68c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Digest.sha1
new file mode 100644
index 0000000..63993fc
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Digest.sha1

@@ -0,0 +1 @@
+3417730863
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Filter.db
new file mode 100644
index 0000000..aa97e86
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Index.db
new file mode 100644
index 0000000..f425226
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Statistics.db
new file mode 100644
index 0000000..2580202
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Summary.db
new file mode 100644
index 0000000..c85b4a8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-TOC.txt
new file mode 100644
index 0000000..3fc5eec
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_compacted_multi_block_rt/legacy_tables-legacy_ka_compacted_multi_block_rt-ka-4-TOC.txt

@@ -0,0 +1,8 @@
+Summary.db
+Digest.sha1
+CompressionInfo.db
+TOC.txt
+Filter.db
+Data.db
+Index.db
+Statistics.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..01bde10
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Data.db
new file mode 100644
index 0000000..4c891d2
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Digest.sha1
new file mode 100644
index 0000000..e71840b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3389985016
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Filter.db
new file mode 100644
index 0000000..b6728a1
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Index.db
new file mode 100644
index 0000000..64e12cd
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Statistics.db
new file mode 100644
index 0000000..1361f7c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Summary.db
new file mode 100644
index 0000000..76791c7
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-TOC.txt
new file mode 100644
index 0000000..402e1ab
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_bytes/legacy_tables-legacy_ka_cql_created_dense_table_with_bytes-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Digest.sha1
+CompressionInfo.db
+Summary.db
+Statistics.db
+Data.db
+Index.db
+TOC.txt
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..6f36650
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Data.db
new file mode 100644
index 0000000..bdad431
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Digest.sha1
new file mode 100644
index 0000000..f9e4b9c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Digest.sha1

@@ -0,0 +1 @@
+1334250623
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Filter.db
new file mode 100644
index 0000000..b6728a1
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Index.db
new file mode 100644
index 0000000..64e12cd
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Statistics.db
new file mode 100644
index 0000000..13dc64a
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Summary.db
new file mode 100644
index 0000000..76791c7
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-TOC.txt
new file mode 100644
index 0000000..402e1ab
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_cql_created_dense_table_with_int/legacy_tables-legacy_ka_cql_created_dense_table_with_int-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Digest.sha1
+CompressionInfo.db
+Summary.db
+Statistics.db
+Data.db
+Index.db
+TOC.txt
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..2336902
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Data.db
new file mode 100644
index 0000000..e7a9fd7
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Digest.sha1
new file mode 100644
index 0000000..bfe4bc3
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3995406674
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Filter.db
new file mode 100644
index 0000000..606783d
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Index.db
new file mode 100644
index 0000000..1faa378
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Statistics.db
new file mode 100644
index 0000000..0070c96
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Summary.db
new file mode 100644
index 0000000..d2adbfa
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-TOC.txt
new file mode 100644
index 0000000..3fc5eec
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_flushed_multi_block_rt/legacy_tables-legacy_ka_flushed_multi_block_rt-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Summary.db
+Digest.sha1
+CompressionInfo.db
+TOC.txt
+Filter.db
+Data.db
+Index.db
+Statistics.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..ecd3ddb
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Data.db
new file mode 100644
index 0000000..d1e4e2f
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Digest.sha1
new file mode 100644
index 0000000..bce117c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Digest.sha1

@@ -0,0 +1 @@
+76435450
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Filter.db
new file mode 100644
index 0000000..00a88b4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Index.db
new file mode 100644
index 0000000..9ba4894
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Statistics.db
new file mode 100644
index 0000000..a57d32b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Summary.db
new file mode 100644
index 0000000..d60d8f4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-TOC.txt
new file mode 100644
index 0000000..25fc863
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed/legacy_tables-legacy_ka_indexed-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+Summary.db
+TOC.txt
+Statistics.db
+Digest.sha1
+Filter.db
+Index.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..09c4cfa
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Data.db
new file mode 100644
index 0000000..40ee3c6
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Digest.sha1
new file mode 100644
index 0000000..55ac08c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3851004816
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Filter.db
new file mode 100644
index 0000000..00a88b4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Index.db
new file mode 100644
index 0000000..fb6ceed
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Statistics.db
new file mode 100644
index 0000000..b08f500
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Summary.db
new file mode 100644
index 0000000..d60d8f4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-TOC.txt
new file mode 100644
index 0000000..6865eca
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_indexed_static/legacy_tables-legacy_ka_indexed_static-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+Data.db
+Index.db
+Statistics.db
+TOC.txt
+Digest.sha1
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..9a33154
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Data.db
new file mode 100644
index 0000000..80a7c46
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Digest.sha1
new file mode 100644
index 0000000..de07755
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Digest.sha1

@@ -0,0 +1 @@
+1973536272
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Filter.db
new file mode 100644
index 0000000..dfcab1f
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Index.db
new file mode 100644
index 0000000..9fefd10
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Statistics.db
new file mode 100644
index 0000000..77c6233
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Summary.db
new file mode 100644
index 0000000..0c15fd4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-TOC.txt
new file mode 100644
index 0000000..a78243a
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/Keyspace1-legacy_ka_repeated_rt-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+TOC.txt
+Digest.sha1
+Index.db
+CompressionInfo.db
+Filter.db
+Summary.db
+Statistics.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..c80e64c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Data.db
new file mode 100644
index 0000000..b29a26a
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Digest.sha1
new file mode 100644
index 0000000..c889c8d
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Digest.sha1

@@ -0,0 +1 @@
+2802392853
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Index.db
new file mode 100644
index 0000000..f0717e0
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Statistics.db
new file mode 100644
index 0000000..2af5467
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-TOC.txt
new file mode 100644
index 0000000..7be41d8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple/legacy_tables-legacy_ka_simple-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Digest.sha1
+CompressionInfo.db
+Data.db
+Statistics.db
+Summary.db
+TOC.txt
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..d530b73
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Data.db
new file mode 100644
index 0000000..6a38c52
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Digest.sha1
new file mode 100644
index 0000000..be8e5fb
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Digest.sha1

@@ -0,0 +1 @@
+606280675
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Index.db
new file mode 100644
index 0000000..d2ec218
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Statistics.db
new file mode 100644
index 0000000..e3fd855
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Summary.db
new file mode 100644
index 0000000..af8ad8b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-TOC.txt
new file mode 100644
index 0000000..7f7fe79
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_compact/legacy_tables-legacy_ka_simple_compact-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+TOC.txt
+Statistics.db
+Summary.db
+Index.db
+Data.db
+Digest.sha1
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..9c3416e
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Data.db
new file mode 100644
index 0000000..1aee64c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Digest.sha1
new file mode 100644
index 0000000..3da96e6
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3671794375
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Index.db
new file mode 100644
index 0000000..932936c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Statistics.db
new file mode 100644
index 0000000..fa74e4b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-TOC.txt
new file mode 100644
index 0000000..7be41d8
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter/legacy_tables-legacy_ka_simple_counter-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Digest.sha1
+CompressionInfo.db
+Data.db
+Statistics.db
+Summary.db
+TOC.txt
+Filter.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..01c5478
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Data.db
new file mode 100644
index 0000000..5f4a7db
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Digest.sha1
new file mode 100644
index 0000000..a71f766
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Digest.sha1

@@ -0,0 +1 @@
+616768162
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Index.db
new file mode 100644
index 0000000..48c153c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Statistics.db
new file mode 100644
index 0000000..4a6e940
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Summary.db
new file mode 100644
index 0000000..af8ad8b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-TOC.txt
new file mode 100644
index 0000000..7f7fe79
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_simple_counter_compact/legacy_tables-legacy_ka_simple_counter_compact-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+TOC.txt
+Statistics.db
+Summary.db
+Index.db
+Data.db
+Digest.sha1
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-CRC.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-CRC.db
new file mode 100644
index 0000000..ee733ee
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-CRC.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Data.db
new file mode 100644
index 0000000..6cf2e4c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Digest.sha1
new file mode 100644
index 0000000..f419fd2
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3673239127
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Filter.db
similarity index 100%
rename from test/data/negative-local-expiration-test/table1/lb-1-big-Filter.db
rename to test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Index.db
new file mode 100644
index 0000000..d6d8130
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Statistics.db
new file mode 100644
index 0000000..281b3da
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Summary.db
new file mode 100644
index 0000000..f2a5cd5
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-TOC.txt
new file mode 100644
index 0000000..497e06b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Statistics.db
+CRC.db
+Data.db
+TOC.txt
+Filter.db
+Index.db
+Digest.sha1
+Summary.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-CompressionInfo.db
new file mode 100644
index 0000000..26a0dbe
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Data.db
new file mode 100644
index 0000000..c805f7d
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Digest.sha1
new file mode 100644
index 0000000..0c696fb
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Digest.sha1

@@ -0,0 +1 @@
+2529627719
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Filter.db
new file mode 100644
index 0000000..5543328
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Index.db
new file mode 100644
index 0000000..fbdd950
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Statistics.db
new file mode 100644
index 0000000..0e471d4
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-TOC.txt
new file mode 100644
index 0000000..1222811
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names/legacy_tables-legacy_ka_with_illegal_cell_names-ka-2-TOC.txt

@@ -0,0 +1,8 @@
+Digest.sha1
+Data.db
+Filter.db
+Summary.db
+Index.db
+TOC.txt
+CompressionInfo.db
+Statistics.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..908f3b1
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Data.db
new file mode 100644
index 0000000..33b88a0
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Digest.sha1
new file mode 100644
index 0000000..20deb5b
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3340111295
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Filter.db
new file mode 100644
index 0000000..5543328
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Index.db
new file mode 100644
index 0000000..fbdd950
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Statistics.db
new file mode 100644
index 0000000..f83575c
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Summary.db
new file mode 100644
index 0000000..9b90005
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-TOC.txt
new file mode 100644
index 0000000..8d621be
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_2/legacy_tables-legacy_ka_with_illegal_cell_names_2-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+TOC.txt
+Filter.db
+Summary.db
+CompressionInfo.db
+Statistics.db
+Digest.sha1
+Index.db

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-CRC.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-CRC.db
new file mode 100644
index 0000000..82ca06a
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-CRC.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Data.db
new file mode 100644
index 0000000..269a739
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Digest.sha1
new file mode 100644
index 0000000..7c85191
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Digest.sha1

@@ -0,0 +1 @@
+1999183849
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Filter.db
new file mode 100644
index 0000000..f3f7da5
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Index.db
new file mode 100644
index 0000000..bff0123
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Statistics.db
new file mode 100644
index 0000000..febb2be
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-TOC.txt
new file mode 100644
index 0000000..c360dbf
--- /dev/null
+++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_with_illegal_cell_names_indexed/legacy_tables-legacy_ka_with_illegal_cell_names_indexed-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+CRC.db
+Statistics.db
+TOC.txt
+Data.db
+Index.db
+Summary.db
+Digest.sha1
+Filter.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..13701c4
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Data.db
new file mode 100644
index 0000000..f04344a
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Digest.adler32
new file mode 100644
index 0000000..d6157b2
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+1633775217
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Index.db
new file mode 100644
index 0000000..44b89c4
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Statistics.db
new file mode 100644
index 0000000..a54d94d
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-TOC.txt
new file mode 100644
index 0000000..dec3a3f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Digest.adler32
+Filter.db
+Summary.db
+Data.db
+Statistics.db
+TOC.txt
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..2a72f70
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Data.db
new file mode 100644
index 0000000..6bc08d2
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Digest.adler32
new file mode 100644
index 0000000..943dd1e
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+1372047449
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Index.db
new file mode 100644
index 0000000..9e18f8e
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Statistics.db
new file mode 100644
index 0000000..b2fd408
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Summary.db
new file mode 100644
index 0000000..6cd998f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-TOC.txt
new file mode 100644
index 0000000..0aef810
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_compact/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Statistics.db
+Digest.adler32
+CompressionInfo.db
+Summary.db
+Data.db
+Filter.db
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..0bdb82a
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Data.db
new file mode 100644
index 0000000..76d4cbc
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Digest.adler32
new file mode 100644
index 0000000..e704111
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+287946299
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Index.db
new file mode 100644
index 0000000..51ddf91
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Statistics.db
new file mode 100644
index 0000000..b6ad155
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-TOC.txt
new file mode 100644
index 0000000..dec3a3f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Digest.adler32
+Filter.db
+Summary.db
+Data.db
+Statistics.db
+TOC.txt
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..d4dec70
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Data.db
new file mode 100644
index 0000000..63ee721
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Digest.adler32
new file mode 100644
index 0000000..577407e
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+2583914481
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Index.db
new file mode 100644
index 0000000..10df1e8
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Statistics.db
new file mode 100644
index 0000000..2bfc59d
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Summary.db
new file mode 100644
index 0000000..6cd998f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-TOC.txt
new file mode 100644
index 0000000..0aef810
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_clust_counter_compact/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Statistics.db
+Digest.adler32
+CompressionInfo.db
+Summary.db
+Data.db
+Filter.db
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..c80e64c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Data.db
new file mode 100644
index 0000000..ae136f5
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Digest.adler32
new file mode 100644
index 0000000..dacf8ac
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+4239203875
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Index.db
new file mode 100644
index 0000000..f0717e0
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Statistics.db
new file mode 100644
index 0000000..49b9275
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-TOC.txt
new file mode 100644
index 0000000..dec3a3f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Digest.adler32
+Filter.db
+Summary.db
+Data.db
+Statistics.db
+TOC.txt
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..d530b73
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Data.db
new file mode 100644
index 0000000..2e912a1
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Digest.adler32
new file mode 100644
index 0000000..c07a57f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+278403976
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Index.db
new file mode 100644
index 0000000..d2ec218
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Statistics.db
new file mode 100644
index 0000000..a81e03e
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Summary.db
new file mode 100644
index 0000000..6cd998f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-TOC.txt
new file mode 100644
index 0000000..0aef810
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_compact/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Statistics.db
+Digest.adler32
+CompressionInfo.db
+Summary.db
+Data.db
+Filter.db
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..9c3416e
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Data.db
new file mode 100644
index 0000000..010bd1a
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Digest.adler32
new file mode 100644
index 0000000..562547a
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+590029692
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Index.db
new file mode 100644
index 0000000..932936c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Statistics.db
new file mode 100644
index 0000000..525a4b1
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Summary.db
new file mode 100644
index 0000000..35b5e22
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-TOC.txt
new file mode 100644
index 0000000..dec3a3f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Digest.adler32
+Filter.db
+Summary.db
+Data.db
+Statistics.db
+TOC.txt
+Index.db

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-CompressionInfo.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..01c5478
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Data.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Data.db
new file mode 100644
index 0000000..323ff37
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Digest.adler32 b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Digest.adler32
new file mode 100644
index 0000000..92237e7
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+2048991053
\ No newline at end of file

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Filter.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Filter.db
new file mode 100644
index 0000000..c3cb27c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Index.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Index.db
new file mode 100644
index 0000000..48c153c
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Statistics.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Statistics.db
new file mode 100644
index 0000000..37324a7
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Summary.db b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Summary.db
new file mode 100644
index 0000000..6cd998f
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-TOC.txt b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-TOC.txt
new file mode 100644
index 0000000..0aef810
--- /dev/null
+++ b/test/data/legacy-sstables/la/legacy_tables/legacy_la_simple_counter_compact/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Statistics.db
+Digest.adler32
+CompressionInfo.db
+Summary.db
+Data.db
+Filter.db
+Index.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..aae310b
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db
new file mode 100644
index 0000000..ad9731c
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..f7cb5fb
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+4135005735
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db
new file mode 100644
index 0000000..55ee8d5
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db
new file mode 100644
index 0000000..955a443
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-TOC.txt
new file mode 100644
index 0000000..32fa9e1
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+Digest.crc32
+Data.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..11e69f9
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Data.db
new file mode 100644
index 0000000..9af7ba2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..46e277c
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+231958969
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Index.db
new file mode 100644
index 0000000..10fce6d
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Statistics.db
new file mode 100644
index 0000000..70a08ed
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-TOC.txt
new file mode 100644
index 0000000..a29a600
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_compact/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Statistics.db
+Digest.crc32
+Summary.db
+Data.db
+TOC.txt
+Filter.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..3d81003
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db
new file mode 100644
index 0000000..5d37d77
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..eeb0d82
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+745105579
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db
new file mode 100644
index 0000000..5ad5400
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db
new file mode 100644
index 0000000..c7292e9
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-TOC.txt
new file mode 100644
index 0000000..32fa9e1
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+Digest.crc32
+Data.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..2c92e35
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Data.db
new file mode 100644
index 0000000..f8dadef
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..b7dbadc
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+3570926375
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Index.db
new file mode 100644
index 0000000..5697f6f
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Statistics.db
new file mode 100644
index 0000000..45dbcfe
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-TOC.txt
new file mode 100644
index 0000000..a29a600
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_counter_compact/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Statistics.db
+Digest.crc32
+Summary.db
+Data.db
+TOC.txt
+Filter.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..0b7faea
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db
new file mode 100644
index 0000000..642d343
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..f73c6e6
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+2206574354
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Index.db
new file mode 100644
index 0000000..b3094bf
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db
new file mode 100644
index 0000000..e3b5546
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-TOC.txt
new file mode 100644
index 0000000..32fa9e1
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+Digest.crc32
+Data.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..adb7fc4
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Data.db
new file mode 100644
index 0000000..69aec90
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..da73bd8
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+3668325305
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Index.db
new file mode 100644
index 0000000..56f29df
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Statistics.db
new file mode 100644
index 0000000..d4b841a
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-TOC.txt
new file mode 100644
index 0000000..a29a600
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_compact/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Statistics.db
+Digest.crc32
+Summary.db
+Data.db
+TOC.txt
+Filter.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..0d9c077
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db
new file mode 100644
index 0000000..48c30e2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..8442491
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+3039840784
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Index.db
new file mode 100644
index 0000000..59e65ca
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db
new file mode 100644
index 0000000..7eb326b
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-TOC.txt
new file mode 100644
index 0000000..32fa9e1
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Filter.db
+Digest.crc32
+Data.db
+Index.db
+Summary.db
+TOC.txt
+Statistics.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-CompressionInfo.db
new file mode 100644
index 0000000..56c95a8
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Data.db
new file mode 100644
index 0000000..b30b1c4
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Digest.crc32
new file mode 100644
index 0000000..18ba1ff
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Digest.crc32

@@ -0,0 +1 @@
+3769133549
\ No newline at end of file

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Index.db
new file mode 100644
index 0000000..d094f73
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Statistics.db
new file mode 100644
index 0000000..b81ea33
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-TOC.txt
new file mode 100644
index 0000000..a29a600
--- /dev/null
+++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_simple_counter_compact/ma-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Index.db
+Statistics.db
+Digest.crc32
+Summary.db
+Data.db
+TOC.txt
+Filter.db
+CompressionInfo.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..e32d12f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Data.db
new file mode 100644
index 0000000..790cbe1
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..66d1d34
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+163939259
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Index.db
new file mode 100644
index 0000000..38d0537
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Statistics.db
new file mode 100644
index 0000000..a53b7c1
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..e3d4377
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Data.db
new file mode 100644
index 0000000..c5aa7ce
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..4e3bf89
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+3387396134
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Index.db
new file mode 100644
index 0000000..54afd0c
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Statistics.db
new file mode 100644
index 0000000..9d9b085
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_compact/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..5a34d6e
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Data.db
new file mode 100644
index 0000000..55ad88d
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..86240b5
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+346435755
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Index.db
new file mode 100644
index 0000000..d1a401b
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Statistics.db
new file mode 100644
index 0000000..4f3f45a
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..32fa731
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Data.db
new file mode 100644
index 0000000..ca10874
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..98f5784
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+3740016396
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Index.db
new file mode 100644
index 0000000..3661d77
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Statistics.db
new file mode 100644
index 0000000..bb1e3fb
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_counter_compact/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..288663f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Data.db
new file mode 100644
index 0000000..6a2f28e
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..fb255bd
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+655951031
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Index.db
new file mode 100644
index 0000000..b3094bf
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Statistics.db
new file mode 100644
index 0000000..55dcdce
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..adb7fc4
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Data.db
new file mode 100644
index 0000000..5cd0481
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..7d2bdbe
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+3883019031
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Index.db
new file mode 100644
index 0000000..56f29df
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Statistics.db
new file mode 100644
index 0000000..dff24fe
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_compact/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..19d25c9
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Data.db
new file mode 100644
index 0000000..77a1394
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..172b695
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+2876949266
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Index.db
new file mode 100644
index 0000000..59e65ca
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Statistics.db
new file mode 100644
index 0000000..4c67715
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-CompressionInfo.db
new file mode 100644
index 0000000..56c95a8
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Data.db
new file mode 100644
index 0000000..00a7a39
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Digest.crc32
new file mode 100644
index 0000000..f9f2fff
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Digest.crc32

@@ -0,0 +1 @@
+1214766167
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Index.db
new file mode 100644
index 0000000..d094f73
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Statistics.db
new file mode 100644
index 0000000..33c33f2
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-TOC.txt
new file mode 100644
index 0000000..fe9581f
--- /dev/null
+++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_simple_counter_compact/mb-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Summary.db
+TOC.txt
+Digest.crc32
+Filter.db
+Statistics.db
+Data.db
+Index.db

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..2df95c9
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Data.db
new file mode 100644
index 0000000..c90b58d
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..76480b1
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+2048618157
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Index.db
new file mode 100644
index 0000000..3c716e9
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Statistics.db
new file mode 100644
index 0000000..43beef3
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..69c19f3
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Data.db
new file mode 100644
index 0000000..7027017
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..4b4078b
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+892998706
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Index.db
new file mode 100644
index 0000000..b2f5171
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Statistics.db
new file mode 100644
index 0000000..535d7df
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_compact/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..42876b0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Data.db
new file mode 100644
index 0000000..8de00de
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..70a8c08
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+1609623183
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Index.db
new file mode 100644
index 0000000..690d2e4
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Statistics.db
new file mode 100644
index 0000000..d5bf83b
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..5ff1f27
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Data.db
new file mode 100644
index 0000000..7c47153
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..a6875fa
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+1205036423
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Index.db
new file mode 100644
index 0000000..b31055b
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Statistics.db
new file mode 100644
index 0000000..3463560
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_counter_compact/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..df694ed
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Data.db
new file mode 100644
index 0000000..e3e3637
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..394acb4
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+4091794686
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Filter.db
new file mode 100644
index 0000000..b58e394
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Index.db
new file mode 100644
index 0000000..e27f0f6
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Statistics.db
new file mode 100644
index 0000000..491277f
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Summary.db
new file mode 100644
index 0000000..7756279
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-TOC.txt
new file mode 100644
index 0000000..52b155b
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_inaccurate_min_max/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Digest.crc32
+CompressionInfo.db
+TOC.txt
+Summary.db
+Statistics.db
+Index.db
+Data.db
+Filter.db

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..0b7faea
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Data.db
new file mode 100644
index 0000000..5862341
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..ee0485a
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+34605693
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Index.db
new file mode 100644
index 0000000..b3094bf
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Statistics.db
new file mode 100644
index 0000000..124f9a8
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..adb7fc4
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Data.db
new file mode 100644
index 0000000..4a00428
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..36c7d92
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+4017973941
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Index.db
new file mode 100644
index 0000000..56f29df
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Statistics.db
new file mode 100644
index 0000000..ac35208
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_compact/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..0d9c077
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Data.db
new file mode 100644
index 0000000..8aadb48
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..7bb4450
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+1545836769
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Index.db
new file mode 100644
index 0000000..59e65ca
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Statistics.db
new file mode 100644
index 0000000..c707d9e
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..56c95a8
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Data.db
new file mode 100644
index 0000000..6a5f57f
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..45b9e94
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+4272819930
\ No newline at end of file

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Filter.db
new file mode 100644
index 0000000..2e1d5d2
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Index.db
new file mode 100644
index 0000000..d094f73
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Statistics.db
new file mode 100644
index 0000000..c3299a0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Summary.db
new file mode 100644
index 0000000..9b24e04
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-TOC.txt
new file mode 100644
index 0000000..8ef7cb0
--- /dev/null
+++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_simple_counter_compact/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+TOC.txt
+Summary.db
+Filter.db
+Statistics.db
+Index.db
+Data.db
+CompressionInfo.db
+Digest.crc32

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-CompressionInfo.db b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-CompressionInfo.db
new file mode 100644
index 0000000..d9446df
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-CompressionInfo.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Data.db b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Data.db
new file mode 100644
index 0000000..f7b696d
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Digest.sha1 b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Digest.sha1
new file mode 100644
index 0000000..55756dd
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Digest.sha1

@@ -0,0 +1 @@
+3043896114
\ No newline at end of file

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Filter.db b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Filter.db
new file mode 100644
index 0000000..3015f10
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Filter.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Index.db b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Index.db
new file mode 100644
index 0000000..c8b59fb
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Statistics.db b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Statistics.db
new file mode 100644
index 0000000..8535f6a
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Statistics.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Summary.db b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Summary.db
new file mode 100644
index 0000000..d9ce8c2
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-Summary.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-TOC.txt b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-TOC.txt
new file mode 100644
index 0000000..7dc8930
--- /dev/null
+++ b/test/data/migration-sstables/2.1/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/system-compactions_in_progress-ka-1-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+TOC.txt
+Filter.db
+Statistics.db
+Summary.db
+Index.db
+Digest.sha1
+CompressionInfo.db

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-CompressionInfo.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-CompressionInfo.db
new file mode 100644
index 0000000..b867db8
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-CompressionInfo.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Data.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Data.db
new file mode 100644
index 0000000..f14d86d
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Digest.sha1 b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Digest.sha1
new file mode 100644
index 0000000..2f4daa9
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Digest.sha1

@@ -0,0 +1 @@
+4283441474
\ No newline at end of file

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Filter.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Filter.db
new file mode 100644
index 0000000..a5bdd8e
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Filter.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Index.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Index.db
new file mode 100644
index 0000000..5d71315
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Statistics.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Statistics.db
new file mode 100644
index 0000000..aeb2bb8
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Statistics.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Summary.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Summary.db
new file mode 100644
index 0000000..602ec06
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-Summary.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-TOC.txt b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-TOC.txt
new file mode 100644
index 0000000..7dc8930
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-ka-3-TOC.txt

@@ -0,0 +1,8 @@
+Data.db
+TOC.txt
+Filter.db
+Statistics.db
+Summary.db
+Index.db
+Digest.sha1
+CompressionInfo.db

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmp-ka-4-Data.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmp-ka-4-Data.db
new file mode 100644
index 0000000..f14d86d
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmp-ka-4-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmp-ka-4-Index.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmp-ka-4-Index.db
new file mode 100644
index 0000000..5d71315
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmp-ka-4-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmplink-ka-4-Data.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmplink-ka-4-Data.db
new file mode 100644
index 0000000..f14d86d
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmplink-ka-4-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmplink-ka-4-Index.db b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmplink-ka-4-Index.db
new file mode 100644
index 0000000..5d71315
--- /dev/null
+++ b/test/data/migration-sstables/2.1/test/foo-0094ac203e7411e59149ef9f87394ca6/test-foo-tmplink-ka-4-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-CompressionInfo.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-CompressionInfo.db
new file mode 100644
index 0000000..f7a81f0
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Data.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Data.db
new file mode 100644
index 0000000..2d5e60a
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Digest.adler32 b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Digest.adler32
new file mode 100644
index 0000000..deffbd1
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Digest.adler32

@@ -0,0 +1 @@
+2055934203
\ No newline at end of file

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Filter.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Filter.db
new file mode 100644
index 0000000..a749417
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Filter.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Index.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Index.db
new file mode 100644
index 0000000..d3923ab
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Statistics.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Statistics.db
new file mode 100644
index 0000000..664bfa5
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Statistics.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Summary.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Summary.db
new file mode 100644
index 0000000..a74f96f
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-Summary.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-TOC.txt b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-TOC.txt
new file mode 100644
index 0000000..92dc9fe
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/la-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Statistics.db
+Summary.db
+TOC.txt
+Filter.db
+Data.db
+CompressionInfo.db
+Digest.adler32
+Index.db

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-la-2-big-Data.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-la-2-big-Data.db
new file mode 100644
index 0000000..2d5e60a
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-la-2-big-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-la-2-big-Index.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-la-2-big-Index.db
new file mode 100644
index 0000000..d3923ab
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-la-2-big-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-lb-3-big-Data.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-lb-3-big-Data.db
new file mode 100644
index 0000000..2d5e60a
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-lb-3-big-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-lb-3-big-Index.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-lb-3-big-Index.db
new file mode 100644
index 0000000..d3923ab
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmp-lb-3-big-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmplink-la-2-big-Data.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmplink-la-2-big-Data.db
new file mode 100644
index 0000000..2d5e60a
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmplink-la-2-big-Data.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmplink-la-2-big-Index.db b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmplink-la-2-big-Index.db
new file mode 100644
index 0000000..d3923ab
--- /dev/null
+++ b/test/data/migration-sstables/2.2/keyspace1/test-dfcc85801bc811e5aa694b06169f4ffa/tmplink-la-2-big-Index.db
Binary files differ

diff --git a/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435108403246-compactions_in_progress/manifest.json b/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435108403246-compactions_in_progress/manifest.json
new file mode 100644
index 0000000..d5fdb4f
--- /dev/null
+++ b/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435108403246-compactions_in_progress/manifest.json

@@ -0,0 +1 @@
+{"files":[]}

diff --git a/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435298241281-upgrade-3.0.0-SNAPSHOT-2.2.0-rc1-SNAPSHOT/manifest.json b/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435298241281-upgrade-3.0.0-SNAPSHOT-2.2.0-rc1-SNAPSHOT/manifest.json
new file mode 100644
index 0000000..d5fdb4f
--- /dev/null
+++ b/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435298241281-upgrade-3.0.0-SNAPSHOT-2.2.0-rc1-SNAPSHOT/manifest.json

@@ -0,0 +1 @@
+{"files":[]}

diff --git a/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435298241532-compactions_in_progress/manifest.json b/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435298241532-compactions_in_progress/manifest.json
new file mode 100644
index 0000000..d5fdb4f
--- /dev/null
+++ b/test/data/migration-sstables/2.2/system/compactions_in_progress-55080ab05d9c388690a4acb25fe1f77b/snapshots/1435298241532-compactions_in_progress/manifest.json

@@ -0,0 +1 @@
+{"files":[]}

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table1/lb-1-big-CompressionInfo.db
deleted file mode 100644
index d7cc13b..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-CompressionInfo.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-Data.db b/test/data/negative-local-expiration-test/table1/lb-1-big-Data.db
deleted file mode 100644
index 51213c2..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-Data.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-Digest.adler32 b/test/data/negative-local-expiration-test/table1/lb-1-big-Digest.adler32
deleted file mode 100644
index d5b12df..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-Digest.adler32
+++ /dev/null

@@ -1 +0,0 @@
-2292388625
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-Index.db b/test/data/negative-local-expiration-test/table1/lb-1-big-Index.db
deleted file mode 100644
index 3ab96ee..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-Index.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-Statistics.db b/test/data/negative-local-expiration-test/table1/lb-1-big-Statistics.db
deleted file mode 100644
index e8cc7e0..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-Statistics.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-Summary.db b/test/data/negative-local-expiration-test/table1/lb-1-big-Summary.db
deleted file mode 100644
index 1a3f81f..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-Summary.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/lb-1-big-TOC.txt b/test/data/negative-local-expiration-test/table1/lb-1-big-TOC.txt
deleted file mode 100644
index 26c7025..0000000
--- a/test/data/negative-local-expiration-test/table1/lb-1-big-TOC.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-Statistics.db
-Summary.db
-Digest.adler32
-Data.db
-Index.db
-CompressionInfo.db
-Filter.db
-TOC.txt

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table1/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..d759cec
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-Data.db b/test/data/negative-local-expiration-test/table1/mc-1-big-Data.db
new file mode 100644
index 0000000..e7a72da
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-Digest.crc32 b/test/data/negative-local-expiration-test/table1/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..a3c633a
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+203700622
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-Filter.db b/test/data/negative-local-expiration-test/table1/mc-1-big-Filter.db
new file mode 100644
index 0000000..a397f35
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-Index.db b/test/data/negative-local-expiration-test/table1/mc-1-big-Index.db
new file mode 100644
index 0000000..d742724
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-Statistics.db b/test/data/negative-local-expiration-test/table1/mc-1-big-Statistics.db
new file mode 100644
index 0000000..faf367b
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-Summary.db b/test/data/negative-local-expiration-test/table1/mc-1-big-Summary.db
new file mode 100644
index 0000000..66cf70f
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table1/mc-1-big-TOC.txt b/test/data/negative-local-expiration-test/table1/mc-1-big-TOC.txt
new file mode 100644
index 0000000..45113dc
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table1/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Data.db
+Summary.db
+Filter.db
+Statistics.db
+TOC.txt
+Digest.crc32
+Index.db

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table2/lb-1-big-CompressionInfo.db
deleted file mode 100644
index 38373b4..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-CompressionInfo.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-Data.db b/test/data/negative-local-expiration-test/table2/lb-1-big-Data.db
deleted file mode 100644
index 762a229..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-Data.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-Digest.adler32 b/test/data/negative-local-expiration-test/table2/lb-1-big-Digest.adler32
deleted file mode 100644
index ae89849..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-Digest.adler32
+++ /dev/null

@@ -1 +0,0 @@
-3829731931
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-Filter.db b/test/data/negative-local-expiration-test/table2/lb-1-big-Filter.db
deleted file mode 100644
index f8e53be..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-Filter.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-Index.db b/test/data/negative-local-expiration-test/table2/lb-1-big-Index.db
deleted file mode 100644
index 38a6e4c..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-Index.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-Statistics.db b/test/data/negative-local-expiration-test/table2/lb-1-big-Statistics.db
deleted file mode 100644
index 64dab43..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-Statistics.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-Summary.db b/test/data/negative-local-expiration-test/table2/lb-1-big-Summary.db
deleted file mode 100644
index 1a3f81f..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-Summary.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/lb-1-big-TOC.txt b/test/data/negative-local-expiration-test/table2/lb-1-big-TOC.txt
deleted file mode 100644
index 26c7025..0000000
--- a/test/data/negative-local-expiration-test/table2/lb-1-big-TOC.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-Statistics.db
-Summary.db
-Digest.adler32
-Data.db
-Index.db
-CompressionInfo.db
-Filter.db
-TOC.txt

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table2/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..1759c09
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-Data.db b/test/data/negative-local-expiration-test/table2/mc-1-big-Data.db
new file mode 100644
index 0000000..c1de572
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-Digest.crc32 b/test/data/negative-local-expiration-test/table2/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..0403b5b
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+82785930
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-Filter.db b/test/data/negative-local-expiration-test/table2/mc-1-big-Filter.db
new file mode 100644
index 0000000..a397f35
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-Index.db b/test/data/negative-local-expiration-test/table2/mc-1-big-Index.db
new file mode 100644
index 0000000..a0477eb
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-Statistics.db b/test/data/negative-local-expiration-test/table2/mc-1-big-Statistics.db
new file mode 100644
index 0000000..e9d6577
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-Summary.db b/test/data/negative-local-expiration-test/table2/mc-1-big-Summary.db
new file mode 100644
index 0000000..66cf70f
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table2/mc-1-big-TOC.txt b/test/data/negative-local-expiration-test/table2/mc-1-big-TOC.txt
new file mode 100644
index 0000000..45113dc
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table2/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+CompressionInfo.db
+Data.db
+Summary.db
+Filter.db
+Statistics.db
+TOC.txt
+Digest.crc32
+Index.db

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table3/lb-1-big-CompressionInfo.db
deleted file mode 100644
index 04a7384..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-CompressionInfo.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-Data.db b/test/data/negative-local-expiration-test/table3/lb-1-big-Data.db
deleted file mode 100644
index 33145df..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-Data.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-Digest.adler32 b/test/data/negative-local-expiration-test/table3/lb-1-big-Digest.adler32
deleted file mode 100644
index 2a542cd..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-Digest.adler32
+++ /dev/null

@@ -1 +0,0 @@
-3574474340
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-Filter.db b/test/data/negative-local-expiration-test/table3/lb-1-big-Filter.db
deleted file mode 100644
index f8e53be..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-Filter.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-Index.db b/test/data/negative-local-expiration-test/table3/lb-1-big-Index.db
deleted file mode 100644
index 5fb34e8..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-Index.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-Statistics.db b/test/data/negative-local-expiration-test/table3/lb-1-big-Statistics.db
deleted file mode 100644
index 51203ae..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-Statistics.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-Summary.db b/test/data/negative-local-expiration-test/table3/lb-1-big-Summary.db
deleted file mode 100644
index 1a3f81f..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-Summary.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/lb-1-big-TOC.txt b/test/data/negative-local-expiration-test/table3/lb-1-big-TOC.txt
deleted file mode 100644
index 26c7025..0000000
--- a/test/data/negative-local-expiration-test/table3/lb-1-big-TOC.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-Statistics.db
-Summary.db
-Digest.adler32
-Data.db
-Index.db
-CompressionInfo.db
-Filter.db
-TOC.txt

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table3/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..b4de068
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-Data.db b/test/data/negative-local-expiration-test/table3/mc-1-big-Data.db
new file mode 100644
index 0000000..e96f772
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-Digest.crc32 b/test/data/negative-local-expiration-test/table3/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..459804b
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+3064924389
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-Filter.db b/test/data/negative-local-expiration-test/table3/mc-1-big-Filter.db
new file mode 100644
index 0000000..a397f35
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-Index.db b/test/data/negative-local-expiration-test/table3/mc-1-big-Index.db
new file mode 100644
index 0000000..807a27b
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-Statistics.db b/test/data/negative-local-expiration-test/table3/mc-1-big-Statistics.db
new file mode 100644
index 0000000..1ee01e6
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-Summary.db b/test/data/negative-local-expiration-test/table3/mc-1-big-Summary.db
new file mode 100644
index 0000000..66cf70f
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table3/mc-1-big-TOC.txt b/test/data/negative-local-expiration-test/table3/mc-1-big-TOC.txt
new file mode 100644
index 0000000..f445537
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table3/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Summary.db
+TOC.txt
+Filter.db
+Index.db
+Digest.crc32
+CompressionInfo.db
+Data.db
+Statistics.db

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table4/lb-1-big-CompressionInfo.db
deleted file mode 100644
index c814fef..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-CompressionInfo.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-Data.db b/test/data/negative-local-expiration-test/table4/lb-1-big-Data.db
deleted file mode 100644
index f40e71f..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-Data.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-Digest.adler32 b/test/data/negative-local-expiration-test/table4/lb-1-big-Digest.adler32
deleted file mode 100644
index e6675e4..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-Digest.adler32
+++ /dev/null

@@ -1 +0,0 @@
-2405377913
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-Filter.db b/test/data/negative-local-expiration-test/table4/lb-1-big-Filter.db
deleted file mode 100644
index f8e53be..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-Filter.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-Index.db b/test/data/negative-local-expiration-test/table4/lb-1-big-Index.db
deleted file mode 100644
index 8291383..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-Index.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-Statistics.db b/test/data/negative-local-expiration-test/table4/lb-1-big-Statistics.db
deleted file mode 100644
index 2217c2d..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-Statistics.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-Summary.db b/test/data/negative-local-expiration-test/table4/lb-1-big-Summary.db
deleted file mode 100644
index 1a3f81f..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-Summary.db
+++ /dev/null
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/lb-1-big-TOC.txt b/test/data/negative-local-expiration-test/table4/lb-1-big-TOC.txt
deleted file mode 100644
index 26c7025..0000000
--- a/test/data/negative-local-expiration-test/table4/lb-1-big-TOC.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-Statistics.db
-Summary.db
-Digest.adler32
-Data.db
-Index.db
-CompressionInfo.db
-Filter.db
-TOC.txt

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-CompressionInfo.db b/test/data/negative-local-expiration-test/table4/mc-1-big-CompressionInfo.db
new file mode 100644
index 0000000..5d22c04
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-CompressionInfo.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-Data.db b/test/data/negative-local-expiration-test/table4/mc-1-big-Data.db
new file mode 100644
index 0000000..a22a7a3
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-Data.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-Digest.crc32 b/test/data/negative-local-expiration-test/table4/mc-1-big-Digest.crc32
new file mode 100644
index 0000000..db7a6c7
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-Digest.crc32

@@ -0,0 +1 @@
+1803989939
\ No newline at end of file

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-Filter.db b/test/data/negative-local-expiration-test/table4/mc-1-big-Filter.db
new file mode 100644
index 0000000..a397f35
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-Filter.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-Index.db b/test/data/negative-local-expiration-test/table4/mc-1-big-Index.db
new file mode 100644
index 0000000..6397b5e
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-Index.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-Statistics.db b/test/data/negative-local-expiration-test/table4/mc-1-big-Statistics.db
new file mode 100644
index 0000000..4ee9294
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-Statistics.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-Summary.db b/test/data/negative-local-expiration-test/table4/mc-1-big-Summary.db
new file mode 100644
index 0000000..66cf70f
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-Summary.db
Binary files differ

diff --git a/test/data/negative-local-expiration-test/table4/mc-1-big-TOC.txt b/test/data/negative-local-expiration-test/table4/mc-1-big-TOC.txt
new file mode 100644
index 0000000..f445537
--- /dev/null
+++ b/test/data/negative-local-expiration-test/table4/mc-1-big-TOC.txt

@@ -0,0 +1,8 @@
+Summary.db
+TOC.txt
+Filter.db
+Index.db
+Digest.crc32
+CompressionInfo.db
+Data.db
+Statistics.db

diff --git a/test/data/serialization/2.1/utils.BloomFilter1000.bin b/test/data/serialization/2.1/utils.BloomFilter1000.bin
new file mode 100644
index 0000000..619679c
--- /dev/null
+++ b/test/data/serialization/2.1/utils.BloomFilter1000.bin
Binary files differ

diff --git a/test/data/serialization/3.0/gms.EndpointState.bin b/test/data/serialization/3.0/gms.EndpointState.bin
new file mode 100644
index 0000000..a230ae1
--- /dev/null
+++ b/test/data/serialization/3.0/gms.EndpointState.bin
Binary files differ

diff --git a/test/data/serialization/3.0/gms.Gossip.bin b/test/data/serialization/3.0/gms.Gossip.bin
new file mode 100644
index 0000000..af5ac57
--- /dev/null
+++ b/test/data/serialization/3.0/gms.Gossip.bin
Binary files differ

diff --git a/test/data/serialization/3.0/service.SyncComplete.bin b/test/data/serialization/3.0/service.SyncComplete.bin
new file mode 100644
index 0000000..73ea4b4
--- /dev/null
+++ b/test/data/serialization/3.0/service.SyncComplete.bin
Binary files differ

diff --git a/test/data/serialization/3.0/service.SyncRequest.bin b/test/data/serialization/3.0/service.SyncRequest.bin
new file mode 100644
index 0000000..7e09777
--- /dev/null
+++ b/test/data/serialization/3.0/service.SyncRequest.bin
Binary files differ

diff --git a/test/data/serialization/3.0/service.ValidationComplete.bin b/test/data/serialization/3.0/service.ValidationComplete.bin
new file mode 100644
index 0000000..b8f0fb9
--- /dev/null
+++ b/test/data/serialization/3.0/service.ValidationComplete.bin
Binary files differ

diff --git a/test/data/serialization/3.0/service.ValidationRequest.bin b/test/data/serialization/3.0/service.ValidationRequest.bin
new file mode 100644
index 0000000..a00763b
--- /dev/null
+++ b/test/data/serialization/3.0/service.ValidationRequest.bin
Binary files differ

diff --git a/test/data/serialization/3.0/utils.BloomFilter.bin b/test/data/serialization/3.0/utils.BloomFilter.bin
new file mode 100644
index 0000000..e8bfb4f
--- /dev/null
+++ b/test/data/serialization/3.0/utils.BloomFilter.bin
Binary files differ

diff --git a/test/data/serialization/3.0/utils.BloomFilter1000.bin b/test/data/serialization/3.0/utils.BloomFilter1000.bin
new file mode 100644
index 0000000..cde99c9
--- /dev/null
+++ b/test/data/serialization/3.0/utils.BloomFilter1000.bin
Binary files differ

diff --git a/test/data/serialization/3.0/utils.EstimatedHistogram.bin b/test/data/serialization/3.0/utils.EstimatedHistogram.bin
new file mode 100644
index 0000000..bedd39b
--- /dev/null
+++ b/test/data/serialization/3.0/utils.EstimatedHistogram.bin
Binary files differ

diff --git a/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java b/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java
index 0085f1c..3cb8dac 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java

@@ -613,7 +613,7 @@
         InstanceClassLoader cl = (InstanceClassLoader) thread.getContextClassLoader();
         get(cl.getInstanceId()).uncaughtException(thread, error);
     }
-
+    
     @Override
     public void close()
     {

diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
index 6553fb9..2f2b525 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java

@@ -40,9 +40,7 @@
 import org.apache.cassandra.distributed.api.SimpleQueryResult;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.service.pager.Pageable;
 import org.apache.cassandra.service.pager.QueryPager;
-import org.apache.cassandra.service.pager.QueryPagers;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.messages.ResultMessage;
@@ -85,7 +83,7 @@
 
     private SimpleQueryResult executeInternal(String query, ConsistencyLevel consistencyLevelOrigin, Object[] boundValues)
     {
-        ClientState clientState = ClientState.forInternalCalls();
+        ClientState clientState = makeFakeClientState();
         CQLStatement prepared = QueryProcessor.getStatement(query, clientState).statement;
         List<ByteBuffer> boundBBValues = new ArrayList<>();
         ConsistencyLevel consistencyLevel = ConsistencyLevel.valueOf(consistencyLevelOrigin.name());
@@ -122,40 +120,39 @@
             throw new IllegalArgumentException("Page size should be strictly positive but was " + pageSize);
 
         return instance.sync(() -> {
+            ClientState clientState = makeFakeClientState();
             ConsistencyLevel consistencyLevel = ConsistencyLevel.valueOf(consistencyLevelOrigin.name());
-            CQLStatement prepared = QueryProcessor.getStatement(query, ClientState.forInternalCalls()).statement;
+            CQLStatement prepared = QueryProcessor.getStatement(query, clientState).statement;
             List<ByteBuffer> boundBBValues = new ArrayList<>();
             for (Object boundValue : boundValues)
             {
                 boundBBValues.add(ByteBufferUtil.objectToBytes(boundValue));
             }
 
-            prepared.validate(QueryState.forInternalCalls().getClientState());
+            prepared.validate(clientState);
             assert prepared instanceof SelectStatement : "Only SELECT statements can be executed with paging";
 
-            ClientState clientState = QueryState.forInternalCalls().getClientState();
             SelectStatement selectStatement = (SelectStatement) prepared;
-            QueryOptions queryOptions = QueryOptions.create(toCassandraCL(consistencyLevel),
-                                                            boundBBValues,
-                                                            false,
-                                                            pageSize,
-                                                            null,
-                                                            null,
-                                                            Server.CURRENT_VERSION);
-            Pageable pageable = selectStatement.getPageableCommand(queryOptions);
+
+            QueryPager pager = selectStatement.getQuery(QueryOptions.create(toCassandraCL(consistencyLevel),
+                                                                            boundBBValues,
+                                                                            false,
+                                                                            pageSize,
+                                                                            null,
+                                                                            null,
+                                                                            Server.CURRENT_VERSION),
+                                                        FBUtilities.nowInSeconds())
+                                              .getPager(null, Server.CURRENT_VERSION);
 
             // Usually pager fetches a single page (see SelectStatement#execute). We need to iterate over all
             // of the results lazily.
-            QueryPager pager = QueryPagers.pager(pageable, toCassandraCL(consistencyLevel), clientState, null);
-            Iterator<Object[]> iter = RowUtil.toObjects(selectStatement.getResultMetadata().names,
-                                                        UntypedResultSet.create(selectStatement,
-                                                                                pager,
-                                                                                pageSize).iterator());
+            UntypedResultSet rs = UntypedResultSet.create(selectStatement, toCassandraCL(consistencyLevel), clientState, pager, pageSize);
+            Iterator<Object[]> it = new Iterator<Object[]>() {
+                Iterator<Object[]> iter = RowUtil.toObjects(rs);
 
-            // We have to make sure iterator is not running on main thread.
-            Iterator<Object[]> it =  new Iterator<Object[]>() {
                 public boolean hasNext()
                 {
+                    // We have to make sure iterator is not running on main thread.
                     return instance.sync(() -> iter.hasNext()).call();
                 }
 
@@ -164,7 +161,7 @@
                     return instance.sync(() -> iter.next()).call();
                 }
             };
-            return QueryResults.fromObjectArrayIterator(RowUtil.getColumnNames(selectStatement.getResultMetadata().names), it);
+            return QueryResults.fromObjectArrayIterator(RowUtil.getColumnNames(rs.metadata()), it);
         }).call();
     }
 

diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
index 7ed29fd..90e6787 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java

@@ -18,8 +18,6 @@
 
 package org.apache.cassandra.distributed.impl;
 
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.net.InetAddress;
@@ -43,6 +41,7 @@
 import javax.management.Notification;
 import javax.management.NotificationListener;
 
+import org.apache.cassandra.batchlog.BatchlogManager;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.concurrent.SharedExecutorPool;
 import org.apache.cassandra.concurrent.StageManager;
@@ -53,9 +52,7 @@
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.statements.ParsedStatement;
-import org.apache.cassandra.db.BatchlogManager;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.HintedHandOffManager;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.SystemKeyspace;
@@ -78,15 +75,20 @@
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.VersionedValue;
+import org.apache.cassandra.hints.HintsService;
+import org.apache.cassandra.index.SecondaryIndexManager;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.IndexSummaryManager;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.net.CompactEndpointSerializationHelper;
 import org.apache.cassandra.net.IMessageSink;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.LegacySchemaMigrator;
 import org.apache.cassandra.service.CassandraDaemon;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.PendingRangeCalculatorService;
@@ -98,6 +100,7 @@
 import org.apache.cassandra.tracing.TraceState;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.utils.DiagnosticSnapshotService;
 import org.apache.cassandra.utils.ExecutorUtils;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.NanoTimeToCurrentTimeMillis;
@@ -105,11 +108,12 @@
 import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.concurrent.Ref;
+import org.apache.cassandra.utils.memory.BufferPool;
 
 import static java.util.concurrent.TimeUnit.MINUTES;
 import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
-import static org.apache.cassandra.distributed.api.Feature.NETWORK;
 import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL;
+import static org.apache.cassandra.distributed.api.Feature.NETWORK;
 
 public class Instance extends IsolatedExecutor implements IInvokableInstance
 {
@@ -305,7 +309,7 @@
             IVersionedSerializer serializer = MessagingService.instance().verbSerializers.get(messageIn.verb);
             CompactEndpointSerializationHelper.serialize(from.getAddress(), out);
 
-            out.writeInt(messageIn.verb.ordinal());
+            out.writeInt(MessagingService.Verb.convertForMessagingServiceVersion(messageIn.verb, version).ordinal());
             out.writeInt(messageIn.parameters.size());
             for (Map.Entry<String, byte[]> entry : messageIn.parameters.entrySet())
             {
@@ -395,7 +399,7 @@
     public static Pair<MessageIn<Object>, Integer> deserializeMessage(IMessage imessage)
     {
         // Based on org.apache.cassandra.net.IncomingTcpConnection.receiveMessage
-        try (DataInputStream input = new DataInputStream(new ByteArrayInputStream(imessage.bytes())))
+        try (DataInputBuffer input = new DataInputBuffer(imessage.bytes()))
         {
             int version = imessage.version();
             if (version > MessagingService.current_version)
@@ -418,6 +422,8 @@
             int partial = input.readInt();
 
             return Pair.create(MessageIn.read(input, version, id), partial);
+            //long currentTime = ApproximateTime.currentTimeMillis();
+            //return MessageIn.read(input, version, id, MessageIn.readConstructionTime(imessage.from().getAddress(), input, currentTime));
         }
         catch (IOException e)
         {
@@ -506,9 +512,10 @@
                 DatabaseDescriptor.setDaemonInitialized();
                 DatabaseDescriptor.createAllDirectories();
 
-                // We need to persist this as soon as possible after startup checks.
+                // We need to  persist this as soon as possible after startup checks.
                 // This should be the first write to SystemKeyspace (CASSANDRA-11742)
                 SystemKeyspace.persistLocalMetadata();
+                LegacySchemaMigrator.migrate();
 
                 try
                 {
@@ -584,6 +591,7 @@
     private void mkdirs()
     {
         new File(config.getString("saved_caches_directory")).mkdirs();
+        new File(config.getString("hints_directory")).mkdirs();
         new File(config.getString("commitlog_directory")).mkdirs();
         for (String dir : (String[]) config.get("data_file_directories"))
             new File(dir).mkdirs();
@@ -622,14 +630,18 @@
             for (int i = 0; i < tokens.size(); i++)
             {
                 InetSocketAddress ep = hosts.get(i);
-                Gossiper.instance.initializeNodeUnsafe(ep.getAddress(), hostIds.get(i), 1);
-                Gossiper.instance.injectApplicationState(ep.getAddress(),
-                        ApplicationState.TOKENS,
-                        new VersionedValue.VersionedValueFactory(partitioner).tokens(Collections.singleton(tokens.get(i))));
-                storageService.onChange(ep.getAddress(),
-                        ApplicationState.STATUS,
-                        new VersionedValue.VersionedValueFactory(partitioner).normal(Collections.singleton(tokens.get(i))));
-                Gossiper.instance.realMarkAlive(ep.getAddress(), Gossiper.instance.getEndpointStateForEndpoint(ep.getAddress()));
+                UUID hostId = hostIds.get(i);
+                Token token = tokens.get(i);
+                Gossiper.runInGossipStageBlocking(() -> {
+                    Gossiper.instance.initializeNodeUnsafe(ep.getAddress(), hostId, 1);
+                    Gossiper.instance.injectApplicationState(ep.getAddress(),
+                                                             ApplicationState.TOKENS,
+                                                             new VersionedValue.VersionedValueFactory(partitioner).tokens(Collections.singleton(token)));
+                    storageService.onChange(ep.getAddress(),
+                                            ApplicationState.STATUS,
+                                            new VersionedValue.VersionedValueFactory(partitioner).normal(Collections.singleton(token)));
+                    Gossiper.instance.realMarkAlive(ep.getAddress(), Gossiper.instance.getEndpointStateForEndpoint(ep.getAddress()));
+                });
                 int messagingVersion = cluster.get(ep).isShutdown()
                                        ? MessagingService.current_version
                                        : Math.min(MessagingService.current_version, cluster.get(ep).getMessagingVersion());
@@ -654,6 +666,9 @@
     @Override
     public Future<Void> shutdown(boolean graceful)
     {
+        if (!graceful)
+            MessagingService.instance().shutdown(false);
+
         Future<?> future = async((ExecutorService executor) -> {
             Throwable error = null;
 
@@ -666,36 +681,38 @@
                 StorageService.instance.shutdownServer();
 
                 error = parallelRun(error, executor,
-                    () -> NanoTimeToCurrentTimeMillis.shutdown(MINUTES.toMillis(1L))
+                                    () -> NanoTimeToCurrentTimeMillis.shutdown(MINUTES.toMillis(1L))
                 );
             }
 
             error = parallelRun(error, executor,
-                                MessagingService.instance()::shutdown
-            );
-
-            error = parallelRun(error, executor,
                                 () -> Gossiper.instance.stopShutdownAndWait(1L, MINUTES),
                                 CompactionManager.instance::forceShutdown,
-                                () -> BatchlogManager.shutdownAndWait(1L, MINUTES),
-                                () -> HintedHandOffManager.instance.shutdownAndWait(1L, MINUTES),
+                                () -> BatchlogManager.instance.shutdownAndWait(1L, MINUTES),
+                                HintsService.instance::shutdownBlocking,
                                 () -> StreamCoordinator.shutdownAndWait(1L, MINUTES),
+                                () -> SecondaryIndexManager.shutdownAndWait(1L, MINUTES),
                                 () -> IndexSummaryManager.instance.shutdownAndWait(1L, MINUTES),
                                 () -> ColumnFamilyStore.shutdownExecutorsAndWait(1L, MINUTES),
                                 () -> PendingRangeCalculatorService.instance.shutdownExecutor(1L, MINUTES),
+                                () -> BufferPool.shutdownLocalCleaner(1L, MINUTES),
                                 () -> StorageService.instance.shutdownBGMonitorAndWait(1L, MINUTES),
                                 () -> Ref.shutdownReferenceReaper(1L, MINUTES),
                                 () -> Memtable.MEMORY_POOL.shutdownAndWait(1L, MINUTES),
                                 () -> SSTableReader.shutdownBlocking(1L, MINUTES),
-                                () -> ScheduledExecutors.shutdownAndWait(1L, MINUTES)
+                                () -> DiagnosticSnapshotService.instance.shutdownAndWait(1L, MINUTES)
             );
             error = parallelRun(error, executor,
-                                CommitLog.instance::shutdownBlocking
+                                () -> ScheduledExecutors.shutdownAndWait(1L, MINUTES),
+                                MessagingService.instance()::shutdown
             );
             error = parallelRun(error, executor,
                                 () -> StageManager.shutdownAndWait(1L, MINUTES),
                                 () -> SharedExecutorPool.SHARED.shutdownAndWait(1L, MINUTES)
             );
+            error = parallelRun(error, executor,
+                                CommitLog.instance::shutdownBlocking
+            );
 
             Throwables.maybeFail(error);
         }).apply(isolatedExecutor);

diff --git a/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java b/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java
index d13a0b6..cfdcc80 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java

@@ -69,7 +69,7 @@
                            String saved_caches_directory,
                            String[] data_file_directories,
                            String commitlog_directory,
-//                           String hints_directory,
+                           String hints_directory,
 //                           String cdc_directory,
                            String initial_token)
     {
@@ -84,14 +84,14 @@
                 .set("saved_caches_directory", saved_caches_directory)
                 .set("data_file_directories", data_file_directories)
                 .set("commitlog_directory", commitlog_directory)
-//                .set("hints_directory", hints_directory)
+                .set("hints_directory", hints_directory)
 //                .set("cdc_directory", cdc_directory)
                 .set("initial_token", initial_token)
                 .set("partitioner", "org.apache.cassandra.dht.Murmur3Partitioner")
                 .set("start_native_transport", true)
                 .set("concurrent_writes", 2)
                 .set("concurrent_counter_writes", 2)
-//                .set("concurrent_materialized_view_writes", 2)
+                .set("concurrent_materialized_view_writes", 2)
                 .set("concurrent_reads", 2)
                 .set("memtable_flush_writers", 1)
                 .set("concurrent_compactors", 1)
@@ -273,7 +273,7 @@
                                   String.format("%s/node%d/saved_caches", root, nodeNum),
                                   new String[] { String.format("%s/node%d/data", root, nodeNum) },
                                   String.format("%s/node%d/commitlog", root, nodeNum),
-//                                  String.format("%s/node%d/hints", root, nodeNum),
+                                  String.format("%s/node%d/hints", root, nodeNum),
 //                                  String.format("%s/node%d/cdc", root, nodeNum),
                                   token);
     }

diff --git a/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java b/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java
index 625b4aa..f3eb327 100644
--- a/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java
+++ b/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java

@@ -27,6 +27,7 @@
 
 import com.google.common.collect.Multimap;
 
+import org.apache.cassandra.batchlog.BatchlogManager;
 import org.apache.cassandra.db.ColumnFamilyStoreMBean;
 import org.apache.cassandra.db.HintedHandOffManager;
 import org.apache.cassandra.db.Keyspace;
@@ -96,6 +97,7 @@
         hhProxy = HintedHandOffManager.instance;
         gcProxy = new GCInspector();
         gossProxy = Gossiper.instance;
+        bmProxy = BatchlogManager.instance;
         memProxy = ManagementFactory.getMemoryMXBean();
         runtimeProxy = ManagementFactory.getRuntimeMXBean();
     }

diff --git a/test/distributed/org/apache/cassandra/distributed/test/GossipTest.java b/test/distributed/org/apache/cassandra/distributed/test/GossipTest.java
new file mode 100644
index 0000000..83c62c8
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/GossipTest.java

@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test;
+
+import java.net.InetAddress;
+import java.util.Collection;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.LockSupport;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Iterables;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.gms.ApplicationState;
+import org.apache.cassandra.gms.EndpointState;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
+import static org.apache.cassandra.distributed.api.Feature.NETWORK;
+
+public class GossipTest extends TestBaseImpl
+{
+
+    @Test
+    public void nodeDownDuringMove() throws Throwable
+    {
+        int liveCount = 1;
+        System.setProperty("cassandra.ring_delay_ms", "5000"); // down from 30s default
+        System.setProperty("cassandra.consistent.rangemovement", "false");
+        System.setProperty("cassandra.consistent.simultaneousmoves.allow", "true");
+        try (Cluster cluster = Cluster.build(2 + liveCount)
+                                      .withConfig(config -> config.with(NETWORK).with(GOSSIP))
+                                      .createWithoutStarting())
+        {
+            int fail = liveCount + 1;
+            int late = fail + 1;
+            for (int i = 1 ; i <= liveCount ; ++i)
+                cluster.get(i).startup();
+            cluster.get(fail).startup();
+            Collection<String> expectTokens = cluster.get(fail).callsOnInstance(() ->
+                StorageService.instance.getTokenMetadata().getTokens(FBUtilities.getBroadcastAddress())
+                                       .stream().map(Object::toString).collect(Collectors.toList())
+            ).call();
+
+            InetAddress failAddress = cluster.get(fail).broadcastAddress().getAddress();
+            // wait for NORMAL state
+            for (int i = 1 ; i <= liveCount ; ++i)
+            {
+                cluster.get(i).acceptsOnInstance((InetAddress endpoint) -> {
+                    EndpointState ep;
+                    while (null == (ep = Gossiper.instance.getEndpointStateForEndpoint(endpoint))
+                           || ep.getApplicationState(ApplicationState.STATUS) == null
+                           || !ep.getApplicationState(ApplicationState.STATUS).value.startsWith("NORMAL"))
+                        LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(10L));
+                }).accept(failAddress);
+            }
+
+            // set ourselves to MOVING, and wait for it to propagate
+            cluster.get(fail).runOnInstance(() -> {
+
+                Token token = Iterables.getFirst(StorageService.instance.getTokenMetadata().getTokens(FBUtilities.getBroadcastAddress()), null);
+                Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.moving(token));
+            });
+
+            for (int i = 1 ; i <= liveCount ; ++i)
+            {
+                cluster.get(i).acceptsOnInstance((InetAddress endpoint) -> {
+                    EndpointState ep;
+                    while (null == (ep = Gossiper.instance.getEndpointStateForEndpoint(endpoint))
+                           || (ep.getApplicationState(ApplicationState.STATUS) == null
+                               || !ep.getApplicationState(ApplicationState.STATUS).value.startsWith("MOVING")))
+                        LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(10L));
+                }).accept(failAddress);
+            }
+
+            cluster.get(fail).shutdown(false).get();
+            cluster.get(late).startup();
+            cluster.get(late).acceptsOnInstance((InetAddress endpoint) -> {
+                EndpointState ep;
+                while (null == (ep = Gossiper.instance.getEndpointStateForEndpoint(endpoint))
+                       || !ep.getApplicationState(ApplicationState.STATUS).value.startsWith("MOVING"))
+                    LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(10L));
+            }).accept(failAddress);
+
+            Collection<String> tokens = cluster.get(late).appliesOnInstance((InetAddress endpoint) ->
+                StorageService.instance.getTokenMetadata().getTokens(failAddress)
+                                       .stream().map(Object::toString).collect(Collectors.toList())
+            ).apply(failAddress);
+
+            Assert.assertEquals(expectTokens, tokens);
+        }
+    }
+    
+}

diff --git a/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java b/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java
index 062f401..f4398da 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java

@@ -39,7 +39,6 @@
 
 public class MessageFiltersTest extends TestBaseImpl
 {
-
     @Test
     public void simpleInboundFiltersTest()
     {
@@ -69,7 +68,7 @@
         String MSG2 = "msg2";
 
         MessageFilters filters = new MessageFilters();
-        Permit permit = inbound ? filters::permitInbound : filters::permitOutbound;
+        Permit permit = inbound ? (from, to, msg) -> filters.permitInbound(from, to, msg) : (from, to, msg) -> filters.permitOutbound(from, to, msg);
 
         IMessageFilters.Filter filter = filters.allVerbs().inbound(inbound).from(1).drop();
         Assert.assertFalse(permit.test(i1, i2, msg(VERB1, MSG1)));

diff --git a/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java
new file mode 100644
index 0000000..3f50bd4
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java

@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test;
+
+import java.util.Iterator;
+
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+
+public class ReadRepairTest extends TestBaseImpl
+{
+    @Test
+    public void emptyRangeTombstones1() throws Throwable
+    {
+        try (Cluster cluster = init(Cluster.create(2)))
+        {
+            cluster.schemaChange("CREATE TABLE distributed_test_keyspace.tbl (\n" +
+                                 "    key text,\n" +
+                                 "    column1 int,\n" +
+                                 "    PRIMARY KEY (key, column1)\n" +
+                                 ") WITH CLUSTERING ORDER BY (column1 ASC)");
+            cluster.get(1).executeInternal("DELETE FROM distributed_test_keyspace.tbl WHERE key=? AND column1>? AND column1<?;",
+                                           "test", Integer.MIN_VALUE, Integer.MAX_VALUE);
+            cluster.coordinator(2).execute("SELECT * FROM distributed_test_keyspace.tbl WHERE key = ? and column1 > ? and column1 <= ?",
+                                                 ConsistencyLevel.ALL,
+                                                 "test", 10, 10);
+            cluster.coordinator(2).execute("SELECT * FROM distributed_test_keyspace.tbl WHERE key = ? and column1 > ? and column1 <= ?",
+                                                 ConsistencyLevel.ALL,
+                                                 "test", 11, 11);
+            cluster.get(2).executeInternal("DELETE FROM distributed_test_keyspace.tbl WHERE key=? AND column1>? AND column1<?;",
+                                           "test", Integer.MIN_VALUE, Integer.MAX_VALUE);
+        }
+    }
+
+    @Test
+    public void emptyRangeTombstonesFromPaging() throws Throwable
+    {
+        try (Cluster cluster = init(Cluster.create(2)))
+        {
+            cluster.schemaChange("CREATE TABLE distributed_test_keyspace.tbl (\n" +
+                                 "    key text,\n" +
+                                 "    column1 int,\n" +
+                                 "    PRIMARY KEY (key, column1)\n" +
+                                 ") WITH CLUSTERING ORDER BY (column1 ASC)");
+
+            cluster.get(1).executeInternal("DELETE FROM distributed_test_keyspace.tbl USING TIMESTAMP 10 WHERE key=? AND column1>? AND column1<?;",
+                                           "test", Integer.MIN_VALUE, Integer.MAX_VALUE);
+
+            for (int i = 0; i < 100; i++)
+                cluster.coordinator(1).execute("INSERT INTO distributed_test_keyspace.tbl (key, column1) VALUES (?, ?) USING TIMESTAMP 30", ConsistencyLevel.ALL, "test", i);
+
+            consume(cluster.coordinator(2).executeWithPaging("SELECT * FROM distributed_test_keyspace.tbl WHERE key = ? and column1 >= ? and column1 <= ?",
+                                           ConsistencyLevel.ALL, 1,
+                                           "test", 8, 12));
+
+            consume(cluster.coordinator(2).executeWithPaging("SELECT * FROM distributed_test_keyspace.tbl WHERE key = ? and column1 >= ? and column1 <= ?",
+                                                             ConsistencyLevel.ALL, 1,
+                                                             "test", 16, 20));
+            cluster.get(2).executeInternal("DELETE FROM distributed_test_keyspace.tbl WHERE key=? AND column1>? AND column1<?;",
+                                           "test", Integer.MIN_VALUE, Integer.MAX_VALUE);
+        }
+    }
+
+    private void consume(Iterator<Object[]> it)
+    {
+        while (it.hasNext())
+            it.next();
+    }
+}

diff --git a/test/distributed/org/apache/cassandra/distributed/test/SharedClusterTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/SharedClusterTestBase.java
new file mode 100644
index 0000000..c502af2
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/SharedClusterTestBase.java

@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test;
+
+import java.io.IOException;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ICluster;
+
+public class SharedClusterTestBase extends TestBaseImpl
+{
+    protected static ICluster cluster;
+
+    @BeforeClass
+    public static void before() throws IOException
+    {
+        cluster = init(Cluster.build().withNodes(3).start());
+    }
+
+    @AfterClass
+    public static void after() throws Exception
+    {
+        cluster.close();
+    }
+
+    @After
+    public void afterEach()
+    {
+        cluster.schemaChange("DROP KEYSPACE IF EXISTS " + KEYSPACE);
+        init(cluster);
+    }
+}

diff --git a/test/distributed/org/apache/cassandra/distributed/test/SimpleReadWritePathTest.java b/test/distributed/org/apache/cassandra/distributed/test/SimpleReadWritePathTest.java
deleted file mode 100644
index 8c9e8af..0000000
--- a/test/distributed/org/apache/cassandra/distributed/test/SimpleReadWritePathTest.java
+++ /dev/null

@@ -1,225 +0,0 @@
-package org.apache.cassandra.distributed.test;
-
-import java.io.IOException;
-
-import org.apache.cassandra.distributed.api.ConsistencyLevel;
-import org.apache.cassandra.distributed.api.ICluster;
-
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.apache.cassandra.distributed.api.Feature.NETWORK;
-
-import static org.apache.cassandra.distributed.shared.AssertUtils.*;
-
-// TODO: this test should be removed after running in-jvm dtests is set up via the shared API repository
-public class SimpleReadWritePathTest extends TestBaseImpl
-{
-    private static final TestBaseImpl impl = new TestBaseImpl();
-    private static ICluster cluster;
-
-    @BeforeClass
-    public static void before() throws IOException
-    {
-        cluster = init(impl.builder().withNodes(3).start());
-    }
-
-    @AfterClass
-    public static void after() throws Exception
-    {
-        cluster.close();
-    }
-
-    @After
-    public void afterEach()
-    {
-        cluster.schemaChange("DROP KEYSPACE IF EXISTS " + KEYSPACE);
-        init(cluster);
-    }
-
-    @Test
-    public void coordinatorReadTest() throws Throwable
-    {
-        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-
-        cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)");
-        cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 2, 2)");
-        cluster.get(3).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 3, 3)");
-
-        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ?",
-                                                  ConsistencyLevel.ALL,
-                                                  1),
-                   row(1, 1, 1),
-                   row(1, 2, 2),
-                   row(1, 3, 3));
-    }
-
-    @Test
-    public void largeMessageTest() throws Throwable
-    {
-        int largeMessageThreshold = 1024 * 64;
-        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v text, PRIMARY KEY (pk, ck))");
-        StringBuilder builder = new StringBuilder();
-        for (int i = 0; i < largeMessageThreshold; i++)
-            builder.append('a');
-        String s = builder.toString();
-        cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, ?)",
-                                       ConsistencyLevel.ALL,
-                                       s);
-        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ?",
-                                                  ConsistencyLevel.ALL,
-                                                  1),
-                   row(1, 1, s));
-    }
-
-    @Test
-    public void coordinatorWriteTest() throws Throwable
-    {
-        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-
-        cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)",
-                                       ConsistencyLevel.QUORUM);
-
-        for (int i = 0; i < 3; i++)
-        {
-            assertRows(cluster.get(1).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"),
-                       row(1, 1, 1));
-        }
-
-        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1",
-                                                  ConsistencyLevel.QUORUM),
-                   row(1, 1, 1));
-    }
-
-    @Test
-    public void readRepairTest() throws Throwable
-    {
-
-        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-
-        cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)");
-        cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)");
-
-        assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"));
-
-        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1",
-                                                  ConsistencyLevel.ALL), // ensure node3 in preflist
-                   row(1, 1, 1));
-
-        // Verify that data got repaired to the third node
-        assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"),
-                   row(1, 1, 1));
-    }
-
-    @Test
-    public void simplePagedReadsTest() throws Throwable
-    {
-
-        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-
-        int size = 100;
-        Object[][] results = new Object[size][];
-        for (int i = 0; i < size; i++)
-        {
-            cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
-                                           ConsistencyLevel.QUORUM,
-                                           i, i);
-            results[i] = new Object[]{ 1, i, i };
-        }
-
-        // Make sure paged read returns same results with different page sizes
-        for (int pageSize : new int[]{ 1, 2, 3, 5, 10, 20, 50 })
-        {
-            assertRows(cluster.coordinator(1).executeWithPaging("SELECT * FROM " + KEYSPACE + ".tbl",
-                                                                ConsistencyLevel.QUORUM,
-                                                                pageSize),
-                       results);
-        }
-    }
-
-    @Test
-    public void pagingWithRepairTest() throws Throwable
-    {
-
-        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-
-        int size = 100;
-        Object[][] results = new Object[size][];
-        for (int i = 0; i < size; i++)
-        {
-            // Make sure that data lands on different nodes and not coordinator
-            cluster.get(i % 2 == 0 ? 2 : 3).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
-                                                            i, i);
-
-            results[i] = new Object[]{ 1, i, i };
-        }
-
-        // Make sure paged read returns same results with different page sizes
-        for (int pageSize : new int[]{ 1, 2, 3, 5, 10, 20, 50 })
-        {
-            assertRows(cluster.coordinator(1).executeWithPaging("SELECT * FROM " + KEYSPACE + ".tbl",
-                                                                ConsistencyLevel.ALL,
-                                                                pageSize),
-                       results);
-        }
-
-        assertRows(cluster.get(1).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl"),
-                   results);
-    }
-
-    @Test
-    public void pagingTests() throws Throwable
-    {
-        try (ICluster singleNode = init(builder().withNodes(1).withSubnet(1).start()))
-        {
-            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-            singleNode.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
-
-            for (int i = 0; i < 10; i++)
-            {
-                for (int j = 0; j < 10; j++)
-                {
-                    cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
-                                                   ConsistencyLevel.QUORUM,
-                                                   i, j, i + i);
-                    singleNode.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
-                                                      ConsistencyLevel.QUORUM,
-                                                      i, j, i + i);
-                }
-            }
-
-            int[] pageSizes = new int[]{ 1, 2, 3, 5, 10, 20, 50 };
-            String[] statements = new String[]{ "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 LIMIT 3",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5 LIMIT 2",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10 LIMIT 2",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 ORDER BY ck DESC",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5 ORDER BY ck DESC",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10 ORDER BY ck DESC",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 ORDER BY ck DESC LIMIT 3",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5 ORDER BY ck DESC LIMIT 2",
-                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10 ORDER BY ck DESC LIMIT 2",
-                                                "SELECT DISTINCT pk FROM " + KEYSPACE + ".tbl LIMIT 3",
-                                                "SELECT DISTINCT pk FROM " + KEYSPACE + ".tbl WHERE pk IN (3,5,8,10)",
-                                                "SELECT DISTINCT pk FROM " + KEYSPACE + ".tbl WHERE pk IN (3,5,8,10) LIMIT 2"
-            };
-            for (String statement : statements)
-            {
-                for (int pageSize : pageSizes)
-                {
-                    assertRows(cluster.coordinator(1)
-                                      .executeWithPaging(statement,
-                                                         ConsistencyLevel.QUORUM, pageSize),
-                               singleNode.coordinator(1)
-                                         .executeWithPaging(statement,
-                                                            ConsistencyLevel.QUORUM, Integer.MAX_VALUE));
-                }
-            }
-        }
-    }
-}

diff --git a/test/distributed/org/apache/cassandra/distributed/test/SimpleReadWriteTest.java b/test/distributed/org/apache/cassandra/distributed/test/SimpleReadWriteTest.java
new file mode 100644
index 0000000..75e5ba9
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/SimpleReadWriteTest.java

@@ -0,0 +1,377 @@
+package org.apache.cassandra.distributed.test;
+
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.api.ICluster;
+import org.apache.cassandra.distributed.api.IInvokableInstance;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+import static org.junit.Assert.assertEquals;
+
+import static org.apache.cassandra.distributed.shared.AssertUtils.*;
+
+// TODO: this test should be removed after running in-jvm dtests is set up via the shared API repository
+public class SimpleReadWriteTest extends SharedClusterTestBase
+{
+    @Test
+    public void coordinatorReadTest() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+
+        cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)");
+        cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 2, 2)");
+        cluster.get(3).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 3, 3)");
+
+        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ?",
+                                                  ConsistencyLevel.ALL,
+                                                  1),
+                   row(1, 1, 1),
+                   row(1, 2, 2),
+                   row(1, 3, 3));
+    }
+
+    @Test
+    public void largeMessageTest() throws Throwable
+    {
+        int largeMessageThreshold = 1024 * 64;
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v text, PRIMARY KEY (pk, ck))");
+        StringBuilder builder = new StringBuilder();
+        for (int i = 0; i < largeMessageThreshold; i++)
+            builder.append('a');
+        String s = builder.toString();
+        cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, ?)",
+                                       ConsistencyLevel.ALL,
+                                       s);
+        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ?",
+                                                  ConsistencyLevel.ALL,
+                                                  1),
+                   row(1, 1, s));
+    }
+
+    @Test
+    public void coordinatorWriteTest() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+
+        cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)",
+                                       ConsistencyLevel.QUORUM);
+
+        for (int i = 0; i < 3; i++)
+        {
+            assertRows(cluster.get(1).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"),
+                       row(1, 1, 1));
+        }
+
+        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1",
+                                                  ConsistencyLevel.QUORUM),
+                   row(1, 1, 1));
+    }
+
+    @Test
+    public void readRepairTest() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+
+        cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)");
+        cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)");
+
+        assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"));
+
+        assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1",
+                                                  ConsistencyLevel.ALL), // ensure node3 in preflist
+                   row(1, 1, 1));
+
+        // Verify that data got repaired to the third node
+        assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"),
+                   row(1, 1, 1));
+    }
+
+    @Test
+    public void writeWithSchemaDisagreement() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v1 int, PRIMARY KEY (pk, ck))");
+
+        cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1) VALUES (1, 1, 1)");
+        cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1) VALUES (1, 1, 1)");
+        cluster.get(3).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1) VALUES (1, 1, 1)");
+
+        // Introduce schema disagreement
+        cluster.schemaChange("ALTER TABLE " + KEYSPACE + ".tbl ADD v2 int", 1);
+
+        Exception thrown = null;
+        try
+        {
+            cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1, v2) VALUES (2, 2, 2, 2)",
+                                           ConsistencyLevel.QUORUM);
+        }
+        catch (RuntimeException e)
+        {
+            thrown = e;
+        }
+
+        Assert.assertTrue(thrown.getMessage().contains("Exception occurred on node"));
+        Assert.assertTrue(thrown.getCause().getCause().getCause().getMessage().contains("Unknown column v2 during deserialization"));
+    }
+
+    @Test
+    public void readWithSchemaDisagreement() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v1 int, PRIMARY KEY (pk, ck))");
+
+        cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1) VALUES (1, 1, 1)");
+        cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1) VALUES (1, 1, 1)");
+        cluster.get(3).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v1) VALUES (1, 1, 1)");
+
+        // Introduce schema disagreement
+        cluster.schemaChange("ALTER TABLE " + KEYSPACE + ".tbl ADD v2 int", 1);
+
+        Exception thrown = null;
+        try
+        {
+            assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1",
+                                                      ConsistencyLevel.ALL),
+                       row(1, 1, 1, null));
+        }
+        catch (Exception e)
+        {
+            thrown = e;
+        }
+
+        Assert.assertTrue(thrown.getMessage().contains("Exception occurred on node"));
+        Assert.assertTrue(thrown.getCause().getCause().getCause().getMessage().contains("Unknown column v2 during deserialization"));
+    }
+
+    @Test
+    public void simplePagedReadsTest() throws Throwable
+    {
+
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+
+        int size = 100;
+        Object[][] results = new Object[size][];
+        for (int i = 0; i < size; i++)
+        {
+            cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
+                                           ConsistencyLevel.QUORUM,
+                                           i, i);
+            results[i] = new Object[]{ 1, i, i };
+        }
+
+        // Make sure paged read returns same results with different page sizes
+        for (int pageSize : new int[]{ 1, 2, 3, 5, 10, 20, 50 })
+        {
+            assertRows(cluster.coordinator(1).executeWithPaging("SELECT * FROM " + KEYSPACE + ".tbl",
+                                                                ConsistencyLevel.QUORUM,
+                                                                pageSize),
+                       results);
+        }
+    }
+
+    @Test
+    public void pagingWithRepairTest() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+
+        int size = 100;
+        Object[][] results = new Object[size][];
+        for (int i = 0; i < size; i++)
+        {
+            // Make sure that data lands on different nodes and not coordinator
+            cluster.get(i % 2 == 0 ? 2 : 3).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
+                                                            i, i);
+
+            results[i] = new Object[]{ 1, i, i };
+        }
+
+        // Make sure paged read returns same results with different page sizes
+        for (int pageSize : new int[]{ 1, 2, 3, 5, 10, 20, 50 })
+        {
+            assertRows(cluster.coordinator(1).executeWithPaging("SELECT * FROM " + KEYSPACE + ".tbl",
+                                                                ConsistencyLevel.ALL,
+                                                                pageSize),
+                       results);
+        }
+
+        assertRows(cluster.get(1).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl"),
+                   results);
+    }
+
+    @Test
+    public void pagingTests() throws Throwable
+    {
+        try (ICluster singleNode = init(builder().withNodes(1).withSubnet(1).start()))
+        {
+            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+            singleNode.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+
+            for (int i = 0; i < 10; i++)
+            {
+                for (int j = 0; j < 10; j++)
+                {
+                    cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
+                                                   ConsistencyLevel.QUORUM,
+                                                   i, j, i + i);
+                    singleNode.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, ?, ?)",
+                                                      ConsistencyLevel.QUORUM,
+                                                      i, j, i + i);
+                }
+            }
+
+            int[] pageSizes = new int[]{ 1, 2, 3, 5, 10, 20, 50 };
+            String[] statements = new String[]{ "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 LIMIT 3",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5 LIMIT 2",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10 LIMIT 2",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 ORDER BY ck DESC",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5 ORDER BY ck DESC",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10 ORDER BY ck DESC",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 ORDER BY ck DESC LIMIT 3",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck >= 5 ORDER BY ck DESC LIMIT 2",
+                                                "SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1 AND ck > 5 AND ck <= 10 ORDER BY ck DESC LIMIT 2",
+                                                "SELECT DISTINCT pk FROM " + KEYSPACE + ".tbl LIMIT 3",
+                                                "SELECT DISTINCT pk FROM " + KEYSPACE + ".tbl WHERE pk IN (3,5,8,10)",
+                                                "SELECT DISTINCT pk FROM " + KEYSPACE + ".tbl WHERE pk IN (3,5,8,10) LIMIT 2"
+            };
+            for (String statement : statements)
+            {
+                for (int pageSize : pageSizes)
+                {
+                    assertRows(cluster.coordinator(1)
+                                      .executeWithPaging(statement,
+                                                         ConsistencyLevel.QUORUM, pageSize),
+                               singleNode.coordinator(1)
+                                         .executeWithPaging(statement,
+                                                            ConsistencyLevel.QUORUM, Integer.MAX_VALUE));
+                }
+            }
+        }
+    }
+
+    @Test
+    public void metricsCountQueriesTest() throws Throwable
+    {
+        cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
+        for (int i = 0; i < 100; i++)
+            cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (?,?,?)", ConsistencyLevel.ALL, i, i, i);
+
+        long readCount1 = readCount((IInvokableInstance) cluster.get(1));
+        long readCount2 = readCount((IInvokableInstance) cluster.get(2));
+        for (int i = 0; i < 100; i++)
+            cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ? and ck = ?", ConsistencyLevel.ALL, i, i);
+
+        readCount1 = readCount((IInvokableInstance) cluster.get(1)) - readCount1;
+        readCount2 = readCount((IInvokableInstance) cluster.get(2)) - readCount2;
+        assertEquals(readCount1, readCount2);
+        assertEquals(100, readCount1);
+    }
+
+
+    @Test
+    public void skippedSSTableWithPartitionDeletionTest() throws Throwable
+    {
+        try (Cluster cluster = init(Cluster.create(2)))
+        {
+            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY(pk, ck))");
+            // insert a partition tombstone on node 1, the deletion timestamp should end up being the sstable's minTimestamp
+            cluster.get(1).executeInternal("DELETE FROM " + KEYSPACE + ".tbl USING TIMESTAMP 1 WHERE pk = 0");
+            // and a row from a different partition, to provide the sstable's min/max clustering
+            cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1) USING TIMESTAMP 2");
+            cluster.get(1).flush(KEYSPACE);
+            // expect a single sstable, where minTimestamp equals the timestamp of the partition delete
+            cluster.get(1).runOnInstance(() -> {
+                Set<SSTableReader> sstables = Keyspace.open(KEYSPACE)
+                                                      .getColumnFamilyStore("tbl")
+                                                      .getLiveSSTables();
+                assertEquals(1, sstables.size());
+                assertEquals(1, sstables.iterator().next().getMinTimestamp());
+            });
+
+            // on node 2, add a row for the deleted partition with an older timestamp than the deletion so it should be shadowed
+            cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (0, 10, 10) USING TIMESTAMP 0");
+
+
+            Object[][] rows = cluster.coordinator(1)
+                                     .execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk=0 AND ck > 5",
+                                              ConsistencyLevel.ALL);
+            assertEquals(0, rows.length);
+        }
+    }
+
+    @Test
+    public void skippedSSTableWithPartitionDeletionShadowingDataOnAnotherNode() throws Throwable
+    {
+        try (Cluster cluster = init(Cluster.create(2)))
+        {
+            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY(pk, ck))");
+            // insert a partition tombstone on node 1, the deletion timestamp should end up being the sstable's minTimestamp
+            cluster.get(1).executeInternal("DELETE FROM " + KEYSPACE + ".tbl USING TIMESTAMP 1 WHERE pk = 0");
+            // and a row from a different partition, to provide the sstable's min/max clustering
+            cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1) USING TIMESTAMP 1");
+            cluster.get(1).flush(KEYSPACE);
+            // sstable 1 has minTimestamp == maxTimestamp == 1 and is skipped due to its min/max clusterings. Now we
+            // insert a row which is not shadowed by the partition delete and flush to a second sstable. Importantly,
+            // this sstable's minTimestamp is > than the maxTimestamp of the first sstable. This would cause the first
+            // sstable not to be reincluded in the merge input, but we can't really make that decision as we don't
+            // know what data and/or tombstones are present on other nodes
+            cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (0, 6, 6) USING TIMESTAMP 2");
+            cluster.get(1).flush(KEYSPACE);
+
+            // on node 2, add a row for the deleted partition with an older timestamp than the deletion so it should be shadowed
+            cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (0, 10, 10) USING TIMESTAMP 0");
+
+            Object[][] rows = cluster.coordinator(1)
+                                     .execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk=0 AND ck > 5",
+                                              ConsistencyLevel.ALL);
+            // we expect that the row from node 2 (0, 10, 10) was shadowed by the partition delete, but the row from
+            // node 1 (0, 6, 6) was not.
+            assertRows(rows, new Object[] {0, 6 ,6});
+        }
+    }
+
+    @Test
+    public void skippedSSTableWithPartitionDeletionShadowingDataOnAnotherNode2() throws Throwable
+    {
+        // don't not add skipped sstables back just because the partition delete ts is < the local min ts
+
+        try (Cluster cluster = init(Cluster.create(2)))
+        {
+            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY(pk, ck))");
+            // insert a partition tombstone on node 1, the deletion timestamp should end up being the sstable's minTimestamp
+            cluster.get(1).executeInternal("DELETE FROM " + KEYSPACE + ".tbl USING TIMESTAMP 1 WHERE pk = 0");
+            // and a row from a different partition, to provide the sstable's min/max clustering
+            cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1) USING TIMESTAMP 3");
+            cluster.get(1).flush(KEYSPACE);
+            // sstable 1 has minTimestamp == maxTimestamp == 1 and is skipped due to its min/max clusterings. Now we
+            // insert a row which is not shadowed by the partition delete and flush to a second sstable. The first sstable
+            // has a maxTimestamp > than the min timestamp of all sstables, so it is a candidate for reinclusion to the
+            // merge. Hoever, the second sstable's minTimestamp is > than the partition delete. This would  cause the
+            // first sstable not to be reincluded in the merge input, but we can't really make that decision as we don't
+            // know what data and/or tombstones are present on other nodes
+            cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (0, 6, 6) USING TIMESTAMP 2");
+            cluster.get(1).flush(KEYSPACE);
+
+            // on node 2, add a row for the deleted partition with an older timestamp than the deletion so it should be shadowed
+            cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (0, 10, 10) USING TIMESTAMP 0");
+
+            Object[][] rows = cluster.coordinator(1)
+                                     .execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk=0 AND ck > 5",
+                                              ConsistencyLevel.ALL);
+            // we expect that the row from node 2 (0, 10, 10) was shadowed by the partition delete, but the row from
+            // node 1 (0, 6, 6) was not.
+            assertRows(rows, new Object[] {0, 6 ,6});
+        }
+    }
+
+    private long readCount(IInvokableInstance instance)
+    {
+        return instance.callOnInstance(() -> Keyspace.open(KEYSPACE).getColumnFamilyStore("tbl").metric.readLatency.latency.getCount());
+    }
+}

diff --git a/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java b/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java
index 1850c63..2328df0 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java

@@ -27,8 +27,6 @@
 
 public class TestBaseImpl extends DistributedTestBase
 {
-    protected static final TestBaseImpl impl = new TestBaseImpl();
-
     @After
     public void afterEach() {
         super.afterEach();

diff --git a/test/distributed/org/apache/cassandra/distributed/test/TestLocator.java b/test/distributed/org/apache/cassandra/distributed/test/TestLocator.java
index a7ad400..e566910 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/TestLocator.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/TestLocator.java

@@ -39,14 +39,18 @@
     public static void main(String[] args) throws Throwable
     {
         String outputFileName = defaultOutputFileName;
-        if (args.length == 1)
+        if (args.length >= 1)
         {
             outputFileName = args[0];
         }
+        String testPackage = TestLocator.testPackage;
+        if (args.length == 2)
+            testPackage = args[1];
         try (FileWriter fileWriter = new FileWriter(outputFileName);
              PrintWriter printWriter = new PrintWriter(fileWriter))
         {
             printWriter.println("#!/bin/bash");
+            printWriter.println("ret=0");
             for (Class testClass : locateClasses(testPackage))
             {
                 for (Method method : testClass.getMethods())
@@ -57,8 +61,10 @@
                     printWriter.println(String.format(testCommandFormat,
                                                       testClass.getName(),
                                                       method.getName()));
+                    printWriter.println("if [ $? -ne 0 ]; then ret=1; fi");
                 }
             }
+            printWriter.println("exit $ret");
         }
     }
 

diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java
new file mode 100644
index 0000000..f138861
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorage2to3UpgradeTest.java

@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.upgrade;
+
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.api.ICoordinator;
+import org.apache.cassandra.distributed.shared.DistributedTestBase;
+import org.apache.cassandra.distributed.shared.Versions;
+import static org.apache.cassandra.distributed.shared.AssertUtils.*;
+
+public class CompactStorage2to3UpgradeTest extends UpgradeTestBase
+{
+    @Test
+    public void multiColumn() throws Throwable
+    {
+        new TestCase()
+        .upgrade(Versions.Major.v22, Versions.Major.v30)
+        .setup(cluster -> {
+            assert cluster.size() == 3;
+            int rf = cluster.size() - 1;
+            assert rf == 2;
+            cluster.schemaChange("CREATE KEYSPACE ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': " + (cluster.size() - 1) + "};");
+            cluster.schemaChange("CREATE TABLE ks.tbl (pk int, v1 int, v2 text, PRIMARY KEY (pk)) WITH COMPACT STORAGE");
+            ICoordinator coordinator = cluster.coordinator(1);
+            // these shouldn't be replicated by the 3rd node
+            coordinator.execute("INSERT INTO ks.tbl (pk, v1, v2) VALUES (3, 3, '3')", ConsistencyLevel.ALL);
+            coordinator.execute("INSERT INTO ks.tbl (pk, v1, v2) VALUES (9, 9, '9')", ConsistencyLevel.ALL);
+            for (int i = 0; i < cluster.size(); i++)
+            {
+                int nodeNum = i + 1;
+                System.out.println(String.format("****** node %s: %s", nodeNum, cluster.get(nodeNum).config()));
+            }
+        })
+        .runAfterNodeUpgrade(((cluster, node) -> {
+            if (node != 2)
+                return;
+
+            Object[][] rows = cluster.coordinator(3).execute("SELECT * FROM ks.tbl LIMIT 2", ConsistencyLevel.ALL);
+            Object[][] expected = {
+            row(9, 9, "9"),
+            row(3, 3, "3")
+            };
+            assertRows(rows, expected);
+        })).run();
+    }
+
+    @Test
+    public void singleColumn() throws Throwable
+    {
+        new TestCase()
+        .upgrade(Versions.Major.v22, Versions.Major.v30)
+        .setup(cluster -> {
+            assert cluster.size() == 3;
+            int rf = cluster.size() - 1;
+            assert rf == 2;
+            cluster.schemaChange("CREATE KEYSPACE ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': " + (cluster.size() - 1) + "};");
+            cluster.schemaChange("CREATE TABLE ks.tbl (pk int, v int, PRIMARY KEY (pk)) WITH COMPACT STORAGE");
+            ICoordinator coordinator = cluster.coordinator(1);
+            // these shouldn't be replicated by the 3rd node
+            coordinator.execute("INSERT INTO ks.tbl (pk, v) VALUES (3, 3)", ConsistencyLevel.ALL);
+            coordinator.execute("INSERT INTO ks.tbl (pk, v) VALUES (9, 9)", ConsistencyLevel.ALL);
+            for (int i = 0; i < cluster.size(); i++)
+            {
+                int nodeNum = i + 1;
+                System.out.println(String.format("****** node %s: %s", nodeNum, cluster.get(nodeNum).config()));
+            }
+        })
+        .runAfterNodeUpgrade(((cluster, node) -> {
+
+            if (node < 2)
+                return;
+
+            Object[][] rows = cluster.coordinator(3).execute("SELECT * FROM ks.tbl LIMIT 2", ConsistencyLevel.ALL);
+            Object[][] expected = {
+            row(9, 9),
+            row(3, 3)
+            };
+            assertRows(rows, expected);
+        })).run();
+    }
+}
\ No newline at end of file

diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRangeTombstoneTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRangeTombstoneTest.java
new file mode 100644
index 0000000..e4b3a17
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRangeTombstoneTest.java

@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.upgrade;
+
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.shared.DistributedTestBase;
+import org.apache.cassandra.distributed.shared.Versions;
+
+import static java.lang.String.format;
+import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows;
+import static org.apache.cassandra.distributed.shared.AssertUtils.row;
+
+/**
+ * Tests related to the handle of range tombstones during 2.x to 3.x upgrades.
+ */
+public class MixedModeRangeTombstoneTest extends UpgradeTestBase
+{
+    /**
+     * Tests the interaction of range tombstones covering multiple rows and collection tombsones within the covered
+     * rows.
+     *
+     * <p>This test reproduces the issue of CASSANDRA-15805.
+     */
+    @Test
+    public void multiRowsRangeTombstoneAndCollectionTombstoneInteractionTest() throws Throwable {
+        String tableName = DistributedTestBase.KEYSPACE + ".t";
+        String schema = "CREATE TABLE " + tableName + " (" +
+                        "  k int," +
+                        "  c1 text," +
+                        "  c2 text," +
+                        "  a text," +
+                        "  b set<text>," +
+                        "  c text," +
+                        "  PRIMARY KEY((k), c1, c2)" +
+                        " )";
+
+
+        new TestCase()
+        .nodes(2)
+        .upgrade(Versions.Major.v22, Versions.Major.v30)
+        .setup(cluster -> {
+            cluster.schemaChange(schema);
+            cluster.coordinator(1).execute(format("DELETE FROM %s USING TIMESTAMP 1 WHERE k = 0 AND c1 = 'A'", tableName), ConsistencyLevel.ALL);
+            cluster.coordinator(1).execute(format("INSERT INTO %s(k, c1, c2, a, b, c) VALUES (0, 'A', 'X', 'foo', {'whatever'}, 'bar') USING TIMESTAMP 2", tableName), ConsistencyLevel.ALL);
+            cluster.coordinator(1).execute(format("DELETE b FROM %s USING TIMESTAMP 3 WHERE k = 0 AND c1 = 'A' and c2 = 'X'", tableName), ConsistencyLevel.ALL);
+            cluster.get(1).flush(DistributedTestBase.KEYSPACE);
+            cluster.get(2).flush(DistributedTestBase.KEYSPACE);
+        })
+        .runAfterNodeUpgrade((cluster, node) -> {
+            assertRows(cluster.coordinator(node).execute(format("SELECT * FROM %s", tableName), ConsistencyLevel.ALL),
+                       row(0, "A", "X", "foo", null, "bar"));
+        })
+        .run();
+    }
+}

diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairTest.java
index fabf172..e9391e0 100644
--- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairTest.java

@@ -18,12 +18,18 @@
 
 package org.apache.cassandra.distributed.upgrade;
 
+import java.util.Arrays;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterators;
 import org.junit.Test;
 
+import org.apache.cassandra.distributed.UpgradeableCluster;
 import org.apache.cassandra.distributed.api.ConsistencyLevel;
 import org.apache.cassandra.distributed.shared.DistributedTestBase;
 import org.apache.cassandra.distributed.shared.Versions;
 
+import static org.junit.Assert.fail;
 
 public class MixedModeReadRepairTest extends UpgradeTestBase
 {
@@ -50,4 +56,82 @@
         .runAfterClusterUpgrade((cluster) -> cluster.get(2).forceCompact(DistributedTestBase.KEYSPACE, "tbl"))
         .run();
     }
+
+    @Test
+    public void mixedModeReadRepairDuplicateRows() throws Throwable
+    {
+        final String[] workload1 = new String[]
+        {
+            "DELETE FROM " + DistributedTestBase.KEYSPACE + ".tbl USING TIMESTAMP 1 WHERE pk = 1 AND ck = 2;",
+            "INSERT INTO " + DistributedTestBase.KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, {'a':'b'}) USING TIMESTAMP 3;",
+            "INSERT INTO " + DistributedTestBase.KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 2, {'c':'d'}) USING TIMESTAMP 3;",
+            "INSERT INTO " + DistributedTestBase.KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 3, {'e':'f'}) USING TIMESTAMP 3;",
+        };
+
+        final String[] workload2 = new String[]
+        {
+            "INSERT INTO " + DistributedTestBase.KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 2, {'g':'h'}) USING TIMESTAMP 5;",
+        };
+
+        new TestCase()
+        .nodes(2)
+        .upgrade(Versions.Major.v22, Versions.Major.v30)
+        .setup((cluster) ->
+        {
+            cluster.schemaChange("CREATE TABLE " + DistributedTestBase.KEYSPACE + ".tbl (pk int, ck int, v map<text, text>, PRIMARY KEY (pk, ck));");
+        })
+        .runAfterNodeUpgrade((cluster, node) ->
+        {
+            if (node == 2)
+                return;
+
+            // now node1 is 3.0 and node2 is 2.2
+            for (int i = 0; i < workload1.length; i++ )
+                cluster.coordinator(2).execute(workload1[i], ConsistencyLevel.QUORUM);
+
+            cluster.get(1).flush(KEYSPACE);
+            cluster.get(2).flush(KEYSPACE);
+
+            validate(cluster, 2, false);
+
+            for (int i = 0; i < workload2.length; i++ )
+                cluster.coordinator(2).execute(workload2[i], ConsistencyLevel.QUORUM);
+
+            cluster.get(1).flush(KEYSPACE);
+            cluster.get(2).flush(KEYSPACE);
+
+            validate(cluster, 1, true);
+        })
+        .run();
+    }
+
+    private void validate(UpgradeableCluster cluster, int nodeid, boolean local)
+    {
+        String query = "SELECT * FROM " + KEYSPACE + ".tbl";
+
+        Iterator<Object[]> iter = local
+                                ? Iterators.forArray(cluster.get(nodeid).executeInternal(query))
+                                : cluster.coordinator(nodeid).executeWithPaging(query, ConsistencyLevel.ALL, 2);
+
+        Object[] prevRow = null;
+        Object prevClustering = null;
+
+        while (iter.hasNext())
+        {
+            Object[] row = iter.next();
+            Object clustering = row[1];
+
+            if (clustering.equals(prevClustering))
+            {
+                fail(String.format("Duplicate rows on node %d in %s mode: \n%s\n%s",
+                                   nodeid,
+                                   local ? "local" : "distributed",
+                                   Arrays.toString(prevRow),
+                                   Arrays.toString(row)));
+            }
+
+            prevRow = row;
+            prevClustering = clustering;
+        }
+    }
 }

diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java
index 93ae78e..81e580d 100644
--- a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java

@@ -18,6 +18,9 @@
 
 package org.apache.cassandra.distributed.upgrade;
 
+import java.util.Iterator;
+
+import com.google.common.collect.Iterators;
 import org.junit.Test;
 
 import org.apache.cassandra.distributed.api.ConsistencyLevel;
@@ -33,7 +36,7 @@
     public void upgradeTest() throws Throwable
     {
         new TestCase()
-        .upgrade(Versions.Major.v22, Versions.Major.v30, Versions.Major.v3X)
+        .upgrade(Versions.Major.v22, Versions.Major.v30)
         .setup((cluster) -> {
             cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))");
 
@@ -43,12 +46,44 @@
         })
         .runAfterClusterUpgrade((cluster) -> {
             assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ?",
-                                                      ConsistencyLevel.ALL,
-                                                      1),
-                       row(1, 1, 1),
-                       row(1, 2, 2),
-                       row(1, 3, 3));
+                                                                          ConsistencyLevel.ALL,
+                                                                          1),
+                                           row(1, 1, 1),
+                                           row(1, 2, 2),
+                                           row(1, 3, 3));
         }).run();
     }
 
+    @Test
+    public void mixedModePagingTest() throws Throwable
+    {
+        new TestCase()
+        .upgrade(Versions.Major.v22, Versions.Major.v30)
+        .nodes(2)
+        .nodesToUpgrade(2)
+        .setup((cluster) -> {
+            cluster.schemaChange("ALTER KEYSPACE " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}");
+            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage");
+            for (int i = 0; i < 100; i++)
+                for (int j = 0; j < 200; j++)
+                    cluster.coordinator(2).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (?, ?, 1)", ConsistencyLevel.ALL, i, j);
+            cluster.forEach((i) -> i.flush(KEYSPACE));
+            for (int i = 0; i < 100; i++)
+                for (int j = 10; j < 30; j++)
+                    cluster.coordinator(2).execute("DELETE FROM " + KEYSPACE + ".tbl where pk=? and ck=?", ConsistencyLevel.ALL, i, j);
+            cluster.forEach((i) -> i.flush(KEYSPACE));
+        })
+        .runAfterClusterUpgrade((cluster) -> {
+            for (int i = 0; i < 100; i++)
+            {
+                for (int pageSize = 10; pageSize < 100; pageSize++)
+                {
+                    Iterator<Object[]> res = cluster.coordinator(1).executeWithPaging("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = ?",
+                                                                                      ConsistencyLevel.ALL,
+                                                                                      pageSize, i);
+                    Assert.assertEquals(180, Iterators.size(res));
+                }
+            }
+        }).run();
+    }
 }
\ No newline at end of file

diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java
index d540442..4f0c700 100644
--- a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java
+++ b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java

@@ -31,6 +31,7 @@
 import org.apache.cassandra.distributed.UpgradeableCluster;
 import org.apache.cassandra.distributed.api.ICluster;
 import org.apache.cassandra.distributed.api.IInstanceConfig;
+import org.apache.cassandra.distributed.api.IUpgradeableInstance;
 import org.apache.cassandra.distributed.impl.Instance;
 import org.apache.cassandra.distributed.shared.DistributedTestBase;
 import org.apache.cassandra.distributed.shared.Versions;
@@ -172,7 +173,7 @@
 
                     for (Version version : upgrade.upgrade)
                     {
-                        for (int n : nodesToUpgrade)
+                        for (int n=1; n<=nodesToUpgrade.size(); n++)
                         {
                             cluster.get(n).shutdown().get();
                             cluster.get(n).setVersion(version);

diff --git a/test/long/org/apache/cassandra/cql3/DropKeyspaceCommitLogRecycleTest.java b/test/long/org/apache/cassandra/cql3/DropKeyspaceCommitLogRecycleTest.java
deleted file mode 100644
index a0bacea..0000000
--- a/test/long/org/apache/cassandra/cql3/DropKeyspaceCommitLogRecycleTest.java
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import org.junit.After;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.SchemaLoader;
-
-import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
-
-/**
- * Base class for CQL tests.
- */
-public class DropKeyspaceCommitLogRecycleTest
-{
-    protected static final Logger logger = LoggerFactory.getLogger(DropKeyspaceCommitLogRecycleTest.class);
-
-    private static final String KEYSPACE = "cql_test_keyspace";
-    private static final String KEYSPACE2 = "cql_test_keyspace2";
-
-    static
-    {
-        // Once per-JVM is enough
-        SchemaLoader.prepareServer();
-    }
-
-    private void create(boolean both)
-    {
-        executeOnceInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", KEYSPACE));
-        executeOnceInternal(String.format("CREATE TABLE %s.test (k1 int, k2 int, v int, PRIMARY KEY (k1, k2))", KEYSPACE));
-        
-        if (both)
-        {
-            executeOnceInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", KEYSPACE2));
-            executeOnceInternal(String.format("CREATE TABLE %s.test (k1 int, k2 int, v int, PRIMARY KEY (k1, k2))", KEYSPACE2));
-        }
-    }
-
-    private void insert()
-    {
-        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (0, 0, 0)", KEYSPACE));
-        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (1, 1, 1)", KEYSPACE));
-        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (2, 2, 2)", KEYSPACE));
-
-        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (0, 0, 0)", KEYSPACE2));
-        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (1, 1, 1)", KEYSPACE2));
-        executeOnceInternal(String.format("INSERT INTO %s.test (k1, k2, v) VALUES (2, 2, 2)", KEYSPACE2));       
-    }
-
-    private void drop(boolean both)
-    {
-        executeOnceInternal(String.format("DROP KEYSPACE IF EXISTS %s", KEYSPACE));
-        if (both)
-            executeOnceInternal(String.format("DROP KEYSPACE IF EXISTS %s", KEYSPACE2));
-    }
-
-    @Test
-    public void testRecycle()
-    {
-        for (int i = 0 ; i < 1000 ; i++)
-        {
-            create(i == 0);
-            insert();
-            drop(false);
-        }
-    }
-
-    @After
-    public void afterTest() throws Throwable
-    {
-        drop(true);
-    }
-}

diff --git a/test/long/org/apache/cassandra/cql3/ViewLongTest.java b/test/long/org/apache/cassandra/cql3/ViewLongTest.java
new file mode 100644
index 0000000..68931e2
--- /dev/null
+++ b/test/long/org/apache/cassandra/cql3/ViewLongTest.java

@@ -0,0 +1,416 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CyclicBarrier;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.Row;
+import com.datastax.driver.core.exceptions.NoHostAvailableException;
+import com.datastax.driver.core.exceptions.WriteTimeoutException;
+import org.apache.cassandra.concurrent.SEPExecutor;
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.batchlog.BatchlogManager;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.WrappedRunnable;
+
+public class ViewLongTest extends CQLTester
+{
+    int protocolVersion = 4;
+    private final List<String> views = new ArrayList<>();
+
+    @BeforeClass
+    public static void startup()
+    {
+        requireNetwork();
+    }
+    @Before
+    public void begin()
+    {
+        views.clear();
+    }
+
+    @After
+    public void end() throws Throwable
+    {
+        for (String viewName : views)
+            executeNet(protocolVersion, "DROP MATERIALIZED VIEW " + viewName);
+    }
+
+    private void createView(String name, String query) throws Throwable
+    {
+        executeNet(protocolVersion, String.format(query, name));
+        // If exception is thrown, the view will not be added to the list; since it shouldn't have been created, this is
+        // the desired behavior
+        views.add(name);
+    }
+
+    @Test
+    public void testConflictResolution() throws Throwable
+    {
+        final int writers = 96;
+        final int insertsPerWriter = 50;
+        final Map<Integer, Exception> failedWrites = new ConcurrentHashMap<>();
+
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)");
+
+        CyclicBarrier semaphore = new CyclicBarrier(writers);
+
+        Thread[] threads = new Thread[writers];
+        for (int i = 0; i < writers; i++)
+        {
+            final int writer = i;
+            Thread t = new Thread(new WrappedRunnable()
+            {
+                public void runMayThrow()
+                {
+                    try
+                    {
+                        int writerOffset = writer * insertsPerWriter;
+                        semaphore.await();
+                        for (int i = 0; i < insertsPerWriter; i++)
+                        {
+                            try
+                            {
+                                executeNet(protocolVersion, "INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP 1",
+                                           1,
+                                           1,
+                                           i + writerOffset);
+                            }
+                            catch (NoHostAvailableException|WriteTimeoutException e)
+                            {
+                                failedWrites.put(i + writerOffset, e);
+                            }
+                        }
+                    }
+                    catch (Throwable e)
+                    {
+                        throw new RuntimeException(e);
+                    }
+                }
+            });
+            t.start();
+            threads[i] = t;
+        }
+
+        for (int i = 0; i < writers; i++)
+            threads[i].join();
+
+        for (int i = 0; i < writers * insertsPerWriter; i++)
+        {
+            if (executeNet(protocolVersion, "SELECT COUNT(*) FROM system.batchlog").one().getLong(0) == 0)
+                break;
+            try
+            {
+                // This will throw exceptions whenever there are exceptions trying to push the view values out, caused
+                // by the view becoming overwhelmed.
+                BatchlogManager.instance.startBatchlogReplay().get();
+            }
+            catch (Throwable ignore)
+            {
+
+            }
+        }
+
+        int value = executeNet(protocolVersion, "SELECT c FROM %s WHERE a = 1 AND b = 1").one().getInt("c");
+
+        List<Row> rows = executeNet(protocolVersion, "SELECT c FROM " + keyspace() + ".mv").all();
+
+        boolean containsC = false;
+        StringBuilder others = new StringBuilder();
+        StringBuilder overlappingFailedWrites = new StringBuilder();
+        for (Row row : rows)
+        {
+            int c = row.getInt("c");
+            if (c == value)
+                containsC = true;
+            else
+            {
+                if (others.length() != 0)
+                    others.append(' ');
+                others.append(c);
+                if (failedWrites.containsKey(c))
+                {
+                    if (overlappingFailedWrites.length() != 0)
+                        overlappingFailedWrites.append(' ');
+                    overlappingFailedWrites.append(c)
+                                           .append(':')
+                                           .append(failedWrites.get(c).getMessage());
+                }
+            }
+        }
+
+        if (rows.size() > 1)
+        {
+            throw new AssertionError(String.format("Expected 1 row, but found %d; %s c = %d, and (%s) of which (%s) failed to insert", rows.size(), containsC ? "found row with" : "no rows contained", value, others, overlappingFailedWrites));
+        }
+        else if (rows.isEmpty())
+        {
+            throw new AssertionError(String.format("Could not find row with c = %d", value));
+        }
+        else if (rows.size() == 1 && !containsC)
+        {
+            throw new AssertionError(String.format("Single row had c = %d, expected %d", rows.get(0).getInt("c"), value));
+        }
+    }
+
+    @Test
+    public void testExpiredLivenessInfoWithDefaultTTLWithFlush() throws Throwable
+    {
+        testExpiredLivenessInfoWithDefaultTTL(true);
+    }
+
+    @Test
+    public void testExpiredLivenessInfoWithDefaultTTLWithoutFlush() throws Throwable
+    {
+        testExpiredLivenessInfoWithDefaultTTL(false);
+    }
+
+    private void testExpiredLivenessInfoWithDefaultTTL(boolean flush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (field1 int,field2 int,date int,PRIMARY KEY ((field1), field2)) WITH default_time_to_live = 5;");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW mv AS SELECT * FROM %%s WHERE field1 IS NOT NULL AND field2 IS NOT NULL AND date IS NOT NULL PRIMARY KEY ((field1), date, field2) WITH CLUSTERING ORDER BY (date desc, field2 asc);");
+
+        updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 111);", flush);
+        assertRows(execute("select * from %s"), row(1, 2, 111));
+        assertRows(execute("select * from mv"), row(1, 111, 2));
+
+        updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 222);", flush);
+        assertRows(execute("select * from %s"), row(1, 2, 222));
+        assertRows(execute("select * from mv"), row(1, 222, 2));
+
+        updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 333);", flush);
+
+        assertRows(execute("select * from %s"), row(1, 2, 333));
+        assertRows(execute("select * from mv"), row(1, 333, 2));
+
+        if (flush)
+        {
+            Keyspace.open(keyspace()).getColumnFamilyStore("mv").forceMajorCompaction();
+            assertRows(execute("select * from %s"), row(1, 2, 333));
+            assertRows(execute("select * from mv"), row(1, 333, 2));
+        }
+
+        // wait for ttl, data should be removed
+        updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 444);", flush);
+        assertRows(execute("select * from %s"), row(1, 2, 444));
+        assertRows(execute("select * from mv"), row(1, 444, 2));
+
+        Thread.sleep(5000);
+        assertRows(execute("select * from %s"));
+        assertRows(execute("select * from mv"));
+
+        // shadow mv with date=555 and then update it back to live, wait for ttl
+        updateView("update %s set date=555 where field1=1 and field2=2;");
+        updateView("update %s set date=666 where field1=1 and field2=2;");
+        updateViewWithFlush("update %s set date=555 where field1=1 and field2=2;", flush);
+        assertRows(execute("select * from %s"), row(1, 2, 555));
+        assertRows(execute("select * from mv"), row(1, 555, 2));
+
+        Thread.sleep(5000);
+        assertRows(execute("select * from %s"));
+        assertRows(execute("select * from mv"));
+
+        // test user-provided ttl for table with/without default-ttl
+        for (boolean withDefaultTTL : Arrays.asList(true, false))
+        {
+            execute("TRUNCATE %s");
+            if (withDefaultTTL)
+                execute("ALTER TABLE %s with default_time_to_live=" + (withDefaultTTL ? 10 : 0));
+            updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 666) USING TTL 1000;", flush);
+
+            assertRows(execute("select * from %s"), row(1, 2, 666));
+            assertRows(execute("select * from mv"), row(1, 666, 2));
+
+            updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 777) USING TTL 1100;", flush);
+            assertRows(execute("select * from %s"), row(1, 2, 777));
+            assertRows(execute("select * from mv"), row(1, 777, 2));
+
+            updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 888) USING TTL 800;", flush);
+
+            assertRows(execute("select * from %s"), row(1, 2, 888));
+            assertRows(execute("select * from mv"), row(1, 888, 2));
+
+            if (flush)
+            {
+                Keyspace.open(keyspace()).getColumnFamilyStore("mv").forceMajorCompaction();
+                assertRows(execute("select * from %s"), row(1, 2, 888));
+                assertRows(execute("select * from mv"), row(1, 888, 2));
+            }
+
+            // wait for ttl, data should be removed
+            updateViewWithFlush("insert into %s (field1, field2, date) values (1, 2, 999) USING TTL 5;", flush);
+            assertRows(execute("select * from %s"), row(1, 2, 999));
+            assertRows(execute("select * from mv"), row(1, 999, 2));
+
+            Thread.sleep(5000);
+            assertRows(execute("select * from %s"));
+            assertRows(execute("select * from mv"));
+
+            // shadow mv with date=555 and then update it back to live with ttl=5, wait for ttl to expire
+            updateViewWithFlush("update %s  USING TTL 800 set date=555 where field1=1 and field2=2;", flush);
+            assertRows(execute("select * from %s"), row(1, 2, 555));
+            assertRows(execute("select * from mv"), row(1, 555, 2));
+
+            updateViewWithFlush("update %s set date=666 where field1=1 and field2=2;", flush);
+            assertRows(execute("select * from %s"), row(1, 2, 666));
+            assertRows(execute("select * from mv"), row(1, 666, 2));
+
+            updateViewWithFlush("update %s USING TTL 5 set date=555 where field1=1 and field2=2;", flush);
+            assertRows(execute("select * from %s"), row(1, 2, 555));
+            assertRows(execute("select * from mv"), row(1, 555, 2));
+
+            Thread.sleep(5000);
+            assertRows(execute("select * from %s"));
+            assertRows(execute("select * from mv"));
+        }
+    }
+
+    @Test
+    public void testExpiredLivenessInfoWithUnselectedColumnAndDefaultTTLWithFlush() throws Throwable
+    {
+        testExpiredLivenessInfoWithUnselectedColumnAndDefaultTTL(true);
+    }
+
+    @Test
+    public void testExpiredLivenessInfoWithUnselectedColumnAndDefaultTTLWithoutFlush() throws Throwable
+    {
+        testExpiredLivenessInfoWithUnselectedColumnAndDefaultTTL(false);
+    }
+
+    private void testExpiredLivenessInfoWithUnselectedColumnAndDefaultTTL(boolean flush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int,c int,a int, b int, PRIMARY KEY ((k), c)) WITH default_time_to_live = 1000;");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW mv AS SELECT k,c,a FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL "
+                           + "PRIMARY KEY (c, k)");
+
+        // table default ttl
+        updateViewWithFlush("UPDATE %s SET b = 111 WHERE k = 1 AND c = 2", flush);
+        assertRows(execute("select k,c,a,b from %s"), row(1, 2, null, 111));
+        assertRows(execute("select k,c,a from mv"), row(1, 2, null));
+
+        updateViewWithFlush("UPDATE %s SET b = null WHERE k = 1 AND c = 2", flush);
+        assertRows(execute("select k,c,a,b from %s"));
+        assertRows(execute("select k,c,a from mv"));
+
+        updateViewWithFlush("UPDATE %s SET b = 222 WHERE k = 1 AND c = 2", flush);
+        assertRows(execute("select k,c,a,b from %s"), row(1, 2, null, 222));
+        assertRows(execute("select k,c,a from mv"), row(1, 2, null));
+
+        updateViewWithFlush("DELETE b FROM %s WHERE k = 1 AND c = 2", flush);
+        assertRows(execute("select k,c,a,b from %s"));
+        assertRows(execute("select k,c,a from mv"));
+
+        if (flush)
+        {
+            Keyspace.open(keyspace()).getColumnFamilyStore("mv").forceMajorCompaction();
+            assertRows(execute("select k,c,a,b from %s"));
+            assertRows(execute("select k,c,a from mv"));
+        }
+
+        // test user-provided ttl for table with/without default-ttl
+        for (boolean withDefaultTTL : Arrays.asList(true, false))
+        {
+            execute("TRUNCATE %s");
+            if (withDefaultTTL)
+                execute("ALTER TABLE %s with default_time_to_live=" + (withDefaultTTL ? 10 : 0));
+
+            updateViewWithFlush("UPDATE %s USING TTL 100 SET b = 666 WHERE k = 1 AND c = 2", flush);
+            assertRows(execute("select k,c,a,b from %s"), row(1, 2, null, 666));
+            assertRows(execute("select k,c,a from mv"), row(1, 2, null));
+
+            updateViewWithFlush("UPDATE %s USING TTL 90  SET b = null WHERE k = 1 AND c = 2", flush);
+            if (flush)
+                FBUtilities.waitOnFutures(Keyspace.open(keyspace()).flush());
+            assertRows(execute("select k,c,a,b from %s"));
+            assertRows(execute("select k,c,a from mv"));
+
+            updateViewWithFlush("UPDATE %s USING TTL 80  SET b = 777 WHERE k = 1 AND c = 2", flush);
+            assertRows(execute("select k,c,a,b from %s"), row(1, 2, null, 777));
+            assertRows(execute("select k,c,a from mv"), row(1, 2, null));
+
+            updateViewWithFlush("DELETE b FROM %s WHERE k = 1 AND c = 2", flush);
+            assertRows(execute("select k,c,a,b from %s"));
+            assertRows(execute("select k,c,a from mv"));
+
+            updateViewWithFlush("UPDATE %s USING TTL 110  SET b = 888 WHERE k = 1 AND c = 2", flush);
+            assertRows(execute("select k,c,a,b from %s"), row(1, 2, null, 888));
+            assertRows(execute("select k,c,a from mv"), row(1, 2, null));
+
+            updateViewWithFlush("UPDATE %s USING TTL 5  SET b = 999 WHERE k = 1 AND c = 2", flush);
+            assertRows(execute("select k,c,a,b from %s"), row(1, 2, null, 999));
+            assertRows(execute("select k,c,a from mv"), row(1, 2, null));
+
+            Thread.sleep(5000); // wait for ttl expired
+
+            if (flush)
+            {
+                Keyspace.open(keyspace()).getColumnFamilyStore("mv").forceMajorCompaction();
+                assertRows(execute("select k,c,a,b from %s"));
+                assertRows(execute("select k,c,a from mv"));
+            }
+        }
+    }
+
+    private void updateView(String query, Object... params) throws Throwable
+    {
+        updateViewWithFlush(query, false, params);
+    }
+
+    private void updateViewWithFlush(String query, boolean flush, Object... params) throws Throwable
+    {
+        executeNet(protocolVersion, query, params);
+        while (!(((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getPendingTasks() == 0
+                && ((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getActiveCount() == 0))
+        {
+            Thread.sleep(1);
+        }
+        if (flush)
+            Keyspace.open(keyspace()).flush();
+    }
+}

diff --git a/test/long/org/apache/cassandra/db/LongFlushMemtableTest.java b/test/long/org/apache/cassandra/db/LongFlushMemtableTest.java
deleted file mode 100644
index 24993c8..0000000
--- a/test/long/org/apache/cassandra/db/LongFlushMemtableTest.java
+++ /dev/null

@@ -1,86 +0,0 @@
-package org.apache.cassandra.db;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class LongFlushMemtableTest
-{
-    public static final String KEYSPACE1 = "LongFlushMemtableTest";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.loadSchema();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1));
-    }
-
-    @Test
-    public void testFlushMemtables() throws ConfigurationException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        for (int i = 0; i < 100; i++)
-        {
-            CFMetaData metadata = CFMetaData.denseCFMetaData(keyspace.getName(), "_CF" + i, UTF8Type.instance);
-            MigrationManager.announceNewColumnFamily(metadata);
-        }
-
-        for (int j = 0; j < 200; j++)
-        {
-            for (int i = 0; i < 100; i++)
-            {
-                Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key" + j));
-                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "_CF" + i);
-                // don't cheat by allocating this outside of the loop; that defeats the purpose of deliberately using lots of memory
-                ByteBuffer value = ByteBuffer.allocate(100000);
-                cf.addColumn(new BufferCell(Util.cellname("c"), value));
-                rm.add(cf);
-                rm.applyUnsafe();
-            }
-        }
-
-        int flushes = 0;
-        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-        {
-            if (cfs.name.startsWith("_CF"))
-                flushes += cfs.metric.memtableSwitchCount.getCount();
-        }
-        assert flushes > 0;
-    }
-}
-

diff --git a/test/long/org/apache/cassandra/db/LongKeyspaceTest.java b/test/long/org/apache/cassandra/db/LongKeyspaceTest.java
deleted file mode 100644
index fe22da8..0000000
--- a/test/long/org/apache/cassandra/db/LongKeyspaceTest.java
+++ /dev/null

@@ -1,82 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.db;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.WrappedRunnable;
-import static org.apache.cassandra.Util.column;
-
-import org.apache.cassandra.Util;
-
-public class LongKeyspaceTest
-{
-    public static final String KEYSPACE1 = "LongKeyspaceTest";
-    public static final String CF_STANDARD = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
-    }
-
-    @Test
-    public void testGetRowMultiColumn() throws Throwable
-    {
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-
-        for (int i = 1; i < 5000; i += 100)
-        {
-            Mutation rm = new Mutation(KEYSPACE1, Util.dk("key" + i).getKey());
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-            for (int j = 0; j < i; j++)
-                cf.addColumn(column("c" + j, "v" + j, 1L));
-            rm.add(cf);
-            rm.applyUnsafe();
-        }
-
-        Runnable verify = new WrappedRunnable()
-        {
-            public void runMayThrow() throws Exception
-            {
-                ColumnFamily cf;
-                for (int i = 1; i < 5000; i += 100)
-                {
-                    for (int j = 0; j < i; j++)
-                    {
-                        cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, Util.dk("key" + i), "c" + j));
-                        KeyspaceTest.assertColumns(cf, "c" + j);
-                    }
-                }
-
-            }
-        };
-        KeyspaceTest.reTest(keyspace.getColumnFamilyStore("Standard1"), verify);
-    }
-}

diff --git a/test/long/org/apache/cassandra/db/commitlog/ComitLogStress.java b/test/long/org/apache/cassandra/db/commitlog/ComitLogStress.java
deleted file mode 100644
index b4efd49..0000000
--- a/test/long/org/apache/cassandra/db/commitlog/ComitLogStress.java
+++ /dev/null

@@ -1,97 +0,0 @@
-package org.apache.cassandra.db.commitlog;
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-
-
-import java.nio.ByteBuffer;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicLong;
-
-import org.apache.cassandra.Util;
-import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
-import org.apache.cassandra.concurrent.NamedThreadFactory;
-import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.UUIDGen;
-
-public class ComitLogStress
-{
-
-    public static final String format = "%s,%s,%s,%s,%s,%s";
-
-    public static void main(String[] args) throws Exception {
-        int NUM_THREADS = Runtime.getRuntime().availableProcessors();
-        if (args.length >= 1) {
-            NUM_THREADS = Integer.parseInt(args[0]);
-            System.out.println("Setting num threads to: " + NUM_THREADS);
-        }
-        ExecutorService executor = new JMXEnabledThreadPoolExecutor(NUM_THREADS, NUM_THREADS, 60,
-                TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(10 * NUM_THREADS), new NamedThreadFactory("Stress"), "");
-        ScheduledExecutorService scheduled = Executors.newScheduledThreadPool(1);
-
-        org.apache.cassandra.SchemaLoader.loadSchema();
-        org.apache.cassandra.SchemaLoader.schemaDefinition(""); // leave def. blank to maintain old behaviour
-        final AtomicLong count = new AtomicLong();
-        final long start = System.currentTimeMillis();
-        System.out.println(String.format(format, "seconds", "max_mb", "allocated_mb", "free_mb", "diffrence", "count"));
-        scheduled.scheduleAtFixedRate(new Runnable() {
-            long lastUpdate = 0;
-
-            public void run() {
-                Runtime runtime = Runtime.getRuntime();
-                long maxMemory = mb(runtime.maxMemory());
-                long allocatedMemory = mb(runtime.totalMemory());
-                long freeMemory = mb(runtime.freeMemory());
-                long temp = count.get();
-                System.out.println(String.format(format, ((System.currentTimeMillis() - start) / 1000),
-                        maxMemory, allocatedMemory, freeMemory, (temp - lastUpdate), lastUpdate));
-                lastUpdate = temp;
-            }
-        }, 1, 1, TimeUnit.SECONDS);
-
-        while (true) {
-            executor.execute(new CommitlogExecutor());
-            count.incrementAndGet();
-        }
-    }
-
-    private static long mb(long maxMemory) {
-        return maxMemory / (1024 * 1024);
-    }
-
-    static final String keyString = UUIDGen.getTimeUUID().toString();
-    public static class CommitlogExecutor implements Runnable {
-        public void run() {
-            String ks = "Keyspace1";
-            ByteBuffer key = ByteBufferUtil.bytes(keyString);
-            for (int i=0; i<100; ++i) {
-                Mutation mutation = new Mutation(ks, key);
-                mutation.add("Standard1", Util.cellname("name"), ByteBufferUtil.bytes("value" + i),
-                        System.currentTimeMillis());
-                CommitLog.instance.add(mutation);
-            }
-        }
-    }
-}

diff --git a/test/long/org/apache/cassandra/db/commitlog/CommitLogStressTest.java b/test/long/org/apache/cassandra/db/commitlog/CommitLogStressTest.java
index 4604c49..02b26c7 100644
--- a/test/long/org/apache/cassandra/db/commitlog/CommitLogStressTest.java
+++ b/test/long/org/apache/cassandra/db/commitlog/CommitLogStressTest.java

@@ -21,15 +21,13 @@
  *
  */
 
-import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -48,15 +46,19 @@
 import org.junit.Test;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.config.Config.CommitLogSync;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.ParameterizedClass;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnSerializer;
 import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.io.util.FastByteArrayInputStream;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
 
 public class CommitLogStressTest
 {
@@ -111,7 +113,6 @@
             initialize();
 
             CommitLogStressTest tester = new CommitLogStressTest();
-            tester.cleanDir();
             tester.testFixedSize();
         }
         catch (Throwable e)
@@ -245,7 +246,7 @@
             verifySizes(commitLog);
 
             commitLog.discardCompletedSegments(Schema.instance.getCFMetaData("Keyspace1", "Standard1").cfId,
-                                               discardedPos);
+                    ReplayPosition.NONE, discardedPos);
             threads.clear();
             System.out.format("Discarded at %s\n", discardedPos);
             verifySizes(commitLog);
@@ -355,8 +356,7 @@
                 }
                 double time = (System.currentTimeMillis() - start) / 1000.0;
                 double avg = (temp / time);
-                System.out
-                        .println(
+                System.out.println(
                         String.format("second %d mem max %.0fmb allocated %.0fmb free %.0fmb mutations %d since start %d avg %.3f content %.1fmb ondisk %.1fmb transfer %.3fmb",
                                       ((System.currentTimeMillis() - start) / 1000),
                                       mb(maxMemory),
@@ -422,20 +422,20 @@
             {
                 if (rl != null)
                     rl.acquire();
-                String ks = "Keyspace1";
                 ByteBuffer key = randomBytes(16, rand);
-                Mutation mutation = new Mutation(ks, key);
 
+                UpdateBuilder builder = UpdateBuilder.create(Schema.instance.getCFMetaData("Keyspace1", "Standard1"), Util.dk(key));
                 for (int ii = 0; ii < numCells; ii++)
                 {
                     int sz = randomSize ? rand.nextInt(cellSize) : cellSize;
                     ByteBuffer bytes = randomBytes(sz, rand);
-                    mutation.add("Standard1", Util.cellname("name" + ii), bytes, System.currentTimeMillis());
+                    builder.newRow("name" + ii).add("val", bytes);
                     hash = hash(hash, bytes);
                     ++cells;
                     dataSize += sz;
                 }
-                rp = commitLog.add(mutation);
+
+                rp = commitLog.add(new Mutation(builder.build()));
                 counter.incrementAndGet();
             }
         }
@@ -463,13 +463,13 @@
                 // Skip over this mutation.
                 return;
 
-            FastByteArrayInputStream bufIn = new FastByteArrayInputStream(inputBuffer, 0, size);
+            DataInputPlus bufIn = new DataInputBuffer(inputBuffer, 0, size);
             Mutation mutation;
             try
             {
-                mutation = Mutation.serializer.deserialize(new DataInputStream(bufIn),
+                mutation = Mutation.serializer.deserialize(bufIn,
                                                            desc.getMessagingVersion(),
-                                                           ColumnSerializer.Flag.LOCAL);
+                                                           SerializationHelper.Flag.LOCAL);
             }
             catch (IOException e)
             {
@@ -477,18 +477,24 @@
                 throw new AssertionError(e);
             }
 
-            for (ColumnFamily cf : mutation.getColumnFamilies())
+            for (PartitionUpdate cf : mutation.getPartitionUpdates())
             {
-                for (Cell c : cf.getSortedColumns())
+
+                Iterator<Row> rowIterator = cf.iterator();
+
+                while (rowIterator.hasNext())
                 {
-                    if (new String(c.name().toByteBuffer().array(), StandardCharsets.UTF_8).startsWith("name"))
+                    Row row = rowIterator.next();
+                    if (!(UTF8Type.instance.compose(row.clustering().get(0)).startsWith("name")))
+                        continue;
+
+                    for (Cell cell : row.cells())
                     {
-                        hash = hash(hash, c.value());
+                        hash = hash(hash, cell.value());
                         ++cells;
                     }
                 }
             }
         }
-
     }
 }

diff --git a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
index e6c8f56..d684e11 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java

@@ -26,16 +26,18 @@
 import org.junit.Before;
 import org.junit.Test;
 
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.SSTableUtils;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import static org.junit.Assert.assertEquals;
@@ -48,14 +50,12 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
-        Map<String, String> compactionOptions = new HashMap<>();
-        compactionOptions.put("tombstone_compaction_interval", "1");
+        Map<String, String> compactionOptions = Collections.singletonMap("tombstone_compaction_interval", "1");
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD)
-                                                .compactionStrategyOptions(compactionOptions));
+                                                .compaction(CompactionParams.scts(compactionOptions)));
     }
 
     @Before
@@ -93,7 +93,7 @@
         testCompaction(100, 800, 5);
     }
 
-    protected void testCompaction(int sstableCount, int rowsPerSSTable, int colsPerRow) throws Exception
+    protected void testCompaction(int sstableCount, int partitionsPerSSTable, int rowsPerPartition) throws Exception
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -103,38 +103,37 @@
         ArrayList<SSTableReader> sstables = new ArrayList<>();
         for (int k = 0; k < sstableCount; k++)
         {
-            SortedMap<String,ColumnFamily> rows = new TreeMap<String,ColumnFamily>();
-            for (int j = 0; j < rowsPerSSTable; j++)
+            SortedMap<String, PartitionUpdate> rows = new TreeMap<>();
+            for (int j = 0; j < partitionsPerSSTable; j++)
             {
                 String key = String.valueOf(j);
-                Cell[] cols = new Cell[colsPerRow];
-                for (int i = 0; i < colsPerRow; i++)
-                {
-                    // last sstable has highest timestamps
-                    cols[i] = Util.column(String.valueOf(i), String.valueOf(i), k);
-                }
-                rows.put(key, SSTableUtils.createCF(KEYSPACE1, CF_STANDARD, Long.MIN_VALUE, Integer.MIN_VALUE, cols));
+                // last sstable has highest timestamps
+                UpdateBuilder builder = UpdateBuilder.create(store.metadata, String.valueOf(j))
+                                                     .withTimestamp(k);
+                for (int i = 0; i < rowsPerPartition; i++)
+                    builder.newRow(String.valueOf(i)).add("val", String.valueOf(i));
+                rows.put(key, builder.build());
             }
-            SSTableReader sstable = SSTableUtils.prepare().write(rows);
-            sstables.add(sstable);
-            store.addSSTable(sstable);
+            Collection<SSTableReader> readers = SSTableUtils.prepare().write(rows);
+            sstables.addAll(readers);
+            store.addSSTables(readers);
         }
 
         // give garbage collection a bit of time to catch up
         Thread.sleep(1000);
 
         long start = System.nanoTime();
-        final int gcBefore = (int) (System.currentTimeMillis() / 1000) - Schema.instance.getCFMetaData(KEYSPACE1, "Standard1").getGcGraceSeconds();
+        final int gcBefore = (int) (System.currentTimeMillis() / 1000) - Schema.instance.getCFMetaData(KEYSPACE1, "Standard1").params.gcGraceSeconds;
         try (LifecycleTransaction txn = store.getTracker().tryModify(sstables, OperationType.COMPACTION))
         {
             assert txn != null : "Cannot markCompacting all sstables";
-            new CompactionTask(store, txn, gcBefore, false).execute(null);
+            new CompactionTask(store, txn, gcBefore).execute(null);
         }
         System.out.println(String.format("%s: sstables=%d rowsper=%d colsper=%d: %d ms",
                                          this.getClass().getName(),
                                          sstableCount,
-                                         rowsPerSSTable,
-                                         colsPerRow,
+                                         partitionsPerSSTable,
+                                         rowsPerPartition,
                                          TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start)));
     }
 
@@ -147,7 +146,7 @@
         cfs.clearUnsafe();
 
         final int ROWS_PER_SSTABLE = 10;
-        final int SSTABLES = cfs.metadata.getMinIndexInterval() * 3 / ROWS_PER_SSTABLE;
+        final int SSTABLES = cfs.metadata.params.minIndexInterval * 3 / ROWS_PER_SSTABLE;
 
         // disable compaction while flushing
         cfs.disableAutoCompaction();
@@ -157,23 +156,23 @@
         for (int j = 0; j < SSTABLES; j++) {
             for (int i = 0; i < ROWS_PER_SSTABLE; i++) {
                 DecoratedKey key = Util.dk(String.valueOf(i % 2));
-                Mutation rm = new Mutation(KEYSPACE1, key.getKey());
                 long timestamp = j * ROWS_PER_SSTABLE + i;
-                rm.add("Standard1", Util.cellname(String.valueOf(i / 2)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp);
                 maxTimestampExpected = Math.max(timestamp, maxTimestampExpected);
-                rm.apply();
+                UpdateBuilder.create(cfs.metadata, key)
+                             .withTimestamp(timestamp)
+                             .newRow(String.valueOf(i / 2)).add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                             .apply();
+
                 inserted.add(key);
             }
             cfs.forceBlockingFlush();
             CompactionsTest.assertMaxTimestamp(cfs, maxTimestampExpected);
-            assertEquals(inserted.toString(), inserted.size(), Util.getRangeSlice(cfs).size());
+
+            assertEquals(inserted.toString(), inserted.size(), Util.getAll(Util.cmd(cfs).build()).size());
         }
 
         forceCompactions(cfs);
-
-        assertEquals(inserted.size(), Util.getRangeSlice(cfs).size());
+        assertEquals(inserted.toString(), inserted.size(), Util.getAll(Util.cmd(cfs).build()).size());
 
         // make sure max timestamp of compacted sstables is recorded properly after compaction.
         CompactionsTest.assertMaxTimestamp(cfs, maxTimestampExpected);
@@ -196,7 +195,7 @@
             FBUtilities.waitOnFutures(compactions);
         } while (CompactionManager.instance.getPendingTasks() > 0 || CompactionManager.instance.getActiveCompactions() > 0);
 
-        if (cfs.getSSTables().size() > 1)
+        if (cfs.getLiveSSTables().size() > 1)
         {
             CompactionManager.instance.performMaximal(cfs, false);
         }

diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
index 8e63006..562de22 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java

@@ -21,7 +21,10 @@
 import java.util.*;
 import java.util.concurrent.*;
 
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import com.google.common.collect.Lists;
+
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
@@ -30,10 +33,11 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertTrue;
@@ -51,14 +55,11 @@
         leveledOptions.put("sstable_size_in_mb", "1");
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLVL)
-                                                .compactionStrategyClass(LeveledCompactionStrategy.class)
-                                                .compactionStrategyOptions(leveledOptions),
+                                                .compaction(CompactionParams.lcs(leveledOptions)),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLVL2)
-                                                .compactionStrategyClass(LeveledCompactionStrategy.class)
-                                                .compactionStrategyOptions(leveledOptions));
+                                                .compaction(CompactionParams.lcs(leveledOptions)));
     }
 
     @Test
@@ -70,8 +71,7 @@
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(cfname);
         store.disableAutoCompaction();
 
-        WrappingCompactionStrategy strategy = ((WrappingCompactionStrategy) store.getCompactionStrategy());
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) strategy.getWrappedStrategies().get(1);
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy)store.getCompactionStrategyManager().getStrategies().get(1);
 
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
 
@@ -83,11 +83,11 @@
         for (int r = 0; r < rows; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(ksname, key.getKey());
+            UpdateBuilder builder = UpdateBuilder.create(store.metadata, key);
             for (int c = 0; c < columns; c++)
-            {
-                rm.add(cfname, Util.cellname("column" + c), value, 0);
-            }
+                builder.newRow("column" + c).add("val", value);
+
+            Mutation rm = new Mutation(builder.build());
             rm.apply();
             store.forceBlockingFlush();
         }
@@ -150,42 +150,22 @@
     public void testLeveledScanner() throws Exception
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARDLVL2);
-        WrappingCompactionStrategy strategy = ((WrappingCompactionStrategy) store.getCompactionStrategy());
-        final LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) strategy.getWrappedStrategies().get(1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARDLVL2);
+        store.disableAutoCompaction();
+
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy)store.getCompactionStrategyManager().getStrategies().get(1);
 
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
 
-        // Enough data to have a level 1 and 2
-        int rows = 128;
-        int columns = 10;
-
-        // Adds enough data to trigger multiple sstable per level
-        for (int r = 0; r < rows; r++)
-        {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDLVL2, Util.cellname("column" + c), value, 0);
-            }
-            rm.apply();
-            store.forceBlockingFlush();
-        }
-
-        value = ByteBuffer.wrap(new byte[10 * 1024]); // 10 KB value
-        LeveledCompactionStrategyTest.waitForLeveling(store);
-        // wait for higher-level compactions to finish
-        store.disableAutoCompaction();
         // Adds 10 partitions
         for (int r = 0; r < 10; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            UpdateBuilder builder = UpdateBuilder.create(store.metadata, key);
             for (int c = 0; c < 10; c++)
-            {
-                rm.add(CF_STANDARDLVL2, Util.cellname("column" + c), value, 0);
-            }
+                builder.newRow("column" + c).add("val", value);
+
+            Mutation rm = new Mutation(builder.build());
             rm.apply();
         }
 
@@ -196,7 +176,7 @@
         {
             public Void call() throws Exception
             {
-                Collection<SSTableReader> allSSTables = store.getSSTables();
+                Iterable<SSTableReader> allSSTables = store.getSSTables(SSTableSet.LIVE);
                 for (SSTableReader sstable : allSSTables)
                 {
                     if (sstable.getSSTableLevel() == 0)
@@ -207,7 +187,7 @@
                     }
                 }
 
-                try (AbstractCompactionStrategy.ScannerList scannerList = lcs.getScanners(allSSTables))
+                try (AbstractCompactionStrategy.ScannerList scannerList = lcs.getScanners(Lists.newArrayList(allSSTables)))
                 {
                     //Verify that leveled scanners will always iterate in ascending order (CASSANDRA-9935)
                     for (ISSTableScanner scanner : scannerList.scanners)
@@ -215,17 +195,19 @@
                         DecoratedKey lastKey = null;
                         while (scanner.hasNext())
                         {
-                            OnDiskAtomIterator row = scanner.next();
+                            UnfilteredRowIterator row = scanner.next();
                             if (lastKey != null)
                             {
-                                assertTrue("row " + row.getKey() + " received out of order wrt " + lastKey, row.getKey().compareTo(lastKey) >= 0);
+                                assertTrue("row " + row.partitionKey() + " received out of order wrt " + lastKey, row.partitionKey().compareTo(lastKey) >= 0);
                             }
-                            lastKey = row.getKey();
+                            lastKey = row.partitionKey();
                         }
                     }
                 }
                 return null;
             }
-        }, true);
+        }, true, true);
+
+
     }
 }

diff --git a/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java b/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java
new file mode 100644
index 0000000..1b36c55
--- /dev/null
+++ b/test/long/org/apache/cassandra/dht/tokenallocator/ReplicationAwareTokenAllocatorTest.java

@@ -0,0 +1,715 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.dht.tokenallocator;
+
+import java.util.*;
+
+import junit.framework.Assert;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Token;
+
+public class ReplicationAwareTokenAllocatorTest
+{
+    private static final int MAX_VNODE_COUNT = 64;
+
+    private static final int TARGET_CLUSTER_SIZE = 250;
+
+    interface TestReplicationStrategy extends ReplicationStrategy<Unit>
+    {
+        void addUnit(Unit n);
+
+        void removeUnit(Unit n);
+
+        /**
+         * Returns a list of all replica units for given token.
+         */
+        List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens);
+
+        /**
+         * Returns the start of the token span that is replicated in this token.
+         * Note: Though this is not trivial to see, the replicated span is always contiguous. A token in the same
+         * group acts as a barrier; if one is not found the token replicates everything up to the replica'th distinct
+         * group seen in front of it.
+         */
+        Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens);
+
+        /**
+         * Multiplier for the acceptable disbalance in the cluster. With some strategies it is harder to achieve good
+         * results.
+         */
+        public double spreadExpectation();
+    }
+
+    static class NoReplicationStrategy implements TestReplicationStrategy
+    {
+        public List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens)
+        {
+            return Collections.singletonList(sortedTokens.ceilingEntry(token).getValue());
+        }
+
+        public Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens)
+        {
+            return sortedTokens.lowerKey(token);
+        }
+
+        public String toString()
+        {
+            return "No replication";
+        }
+
+        public void addUnit(Unit n)
+        {
+        }
+
+        public void removeUnit(Unit n)
+        {
+        }
+
+        public int replicas()
+        {
+            return 1;
+        }
+
+        public boolean sameGroup(Unit n1, Unit n2)
+        {
+            return false;
+        }
+
+        public Object getGroup(Unit unit)
+        {
+            return unit;
+        }
+
+        public double spreadExpectation()
+        {
+            return 1;
+        }
+    }
+
+    static class SimpleReplicationStrategy implements TestReplicationStrategy
+    {
+        int replicas;
+
+        public SimpleReplicationStrategy(int replicas)
+        {
+            super();
+            this.replicas = replicas;
+        }
+
+        public List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens)
+        {
+            List<Unit> endpoints = new ArrayList<Unit>(replicas);
+
+            token = sortedTokens.ceilingKey(token);
+            if (token == null)
+                token = sortedTokens.firstKey();
+            Iterator<Unit> iter = Iterables.concat(sortedTokens.tailMap(token, true).values(), sortedTokens.values()).iterator();
+            while (endpoints.size() < replicas)
+            {
+                if (!iter.hasNext())
+                    return endpoints;
+                Unit ep = iter.next();
+                if (!endpoints.contains(ep))
+                    endpoints.add(ep);
+            }
+            return endpoints;
+        }
+
+        public Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens)
+        {
+            Set<Unit> seenUnits = Sets.newHashSet();
+            int unitsFound = 0;
+
+            for (Map.Entry<Token, Unit> en : Iterables.concat(
+                                                             sortedTokens.headMap(token, false).descendingMap().entrySet(),
+                                                             sortedTokens.descendingMap().entrySet()))
+            {
+                Unit n = en.getValue();
+                // Same group as investigated unit is a break; anything that could replicate in it replicates there.
+                if (n == unit)
+                    break;
+
+                if (seenUnits.add(n))
+                {
+                    if (++unitsFound == replicas)
+                        break;
+                }
+                token = en.getKey();
+            }
+            return token;
+        }
+
+        public void addUnit(Unit n)
+        {
+        }
+
+        public void removeUnit(Unit n)
+        {
+        }
+
+        public String toString()
+        {
+            return String.format("Simple %d replicas", replicas);
+        }
+
+        public int replicas()
+        {
+            return replicas;
+        }
+
+        public boolean sameGroup(Unit n1, Unit n2)
+        {
+            return false;
+        }
+
+        public Unit getGroup(Unit unit)
+        {
+            // The unit is the group.
+            return unit;
+        }
+
+        public double spreadExpectation()
+        {
+            return 1;
+        }
+    }
+
+    static abstract class GroupReplicationStrategy implements TestReplicationStrategy
+    {
+        final int replicas;
+        final Map<Unit, Integer> groupMap;
+
+        public GroupReplicationStrategy(int replicas)
+        {
+            this.replicas = replicas;
+            this.groupMap = Maps.newHashMap();
+        }
+
+        public List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens)
+        {
+            List<Unit> endpoints = new ArrayList<Unit>(replicas);
+            BitSet usedGroups = new BitSet();
+
+            if (sortedTokens.isEmpty())
+                return endpoints;
+
+            token = sortedTokens.ceilingKey(token);
+            if (token == null)
+                token = sortedTokens.firstKey();
+            Iterator<Unit> iter = Iterables.concat(sortedTokens.tailMap(token, true).values(), sortedTokens.values()).iterator();
+            while (endpoints.size() < replicas)
+            {
+                // For simlicity assuming list can't be exhausted before finding all replicas.
+                Unit ep = iter.next();
+                int group = groupMap.get(ep);
+                if (!usedGroups.get(group))
+                {
+                    endpoints.add(ep);
+                    usedGroups.set(group);
+                }
+            }
+            return endpoints;
+        }
+
+        public Token lastReplicaToken(Token token, NavigableMap<Token, Unit> sortedTokens)
+        {
+            BitSet usedGroups = new BitSet();
+            int groupsFound = 0;
+
+            token = sortedTokens.ceilingKey(token);
+            if (token == null)
+                token = sortedTokens.firstKey();
+            for (Map.Entry<Token, Unit> en :
+            Iterables.concat(sortedTokens.tailMap(token, true).entrySet(),
+                             sortedTokens.entrySet()))
+            {
+                Unit ep = en.getValue();
+                int group = groupMap.get(ep);
+                if (!usedGroups.get(group))
+                {
+                    usedGroups.set(group);
+                    if (++groupsFound >= replicas)
+                        return en.getKey();
+                }
+            }
+            return token;
+        }
+
+        public Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens)
+        {
+            // replicated ownership
+            int unitGroup = groupMap.get(unit);   // unit must be already added
+            BitSet seenGroups = new BitSet();
+            int groupsFound = 0;
+
+            for (Map.Entry<Token, Unit> en : Iterables.concat(
+                                                             sortedTokens.headMap(token, false).descendingMap().entrySet(),
+                                                             sortedTokens.descendingMap().entrySet()))
+            {
+                Unit n = en.getValue();
+                int ngroup = groupMap.get(n);
+                // Same group as investigated unit is a break; anything that could replicate in it replicates there.
+                if (ngroup == unitGroup)
+                    break;
+
+                if (!seenGroups.get(ngroup))
+                {
+                    if (++groupsFound == replicas)
+                        break;
+                    seenGroups.set(ngroup);
+                }
+                token = en.getKey();
+            }
+            return token;
+        }
+
+        public String toString()
+        {
+            Map<Integer, Integer> idToSize = instanceToCount(groupMap);
+            Map<Integer, Integer> sizeToCount = Maps.newTreeMap();
+            sizeToCount.putAll(instanceToCount(idToSize));
+            return String.format("%s strategy, %d replicas, group size to count %s", getClass().getSimpleName(), replicas, sizeToCount);
+        }
+
+        @Override
+        public int replicas()
+        {
+            return replicas;
+        }
+
+        public boolean sameGroup(Unit n1, Unit n2)
+        {
+            return groupMap.get(n1).equals(groupMap.get(n2));
+        }
+
+        public void removeUnit(Unit n)
+        {
+            groupMap.remove(n);
+        }
+
+        public Integer getGroup(Unit unit)
+        {
+            return groupMap.get(unit);
+        }
+
+        public double spreadExpectation()
+        {
+            return 1.5;   // Even balanced racks get disbalanced when they lose nodes.
+        }
+    }
+
+    private static <T> Map<T, Integer> instanceToCount(Map<?, T> map)
+    {
+        Map<T, Integer> idToCount = Maps.newHashMap();
+        for (Map.Entry<?, T> en : map.entrySet())
+        {
+            Integer old = idToCount.get(en.getValue());
+            idToCount.put(en.getValue(), old != null ? old + 1 : 1);
+        }
+        return idToCount;
+    }
+
+    /**
+     * Group strategy spreading units into a fixed number of groups.
+     */
+    static class FixedGroupCountReplicationStrategy extends GroupReplicationStrategy
+    {
+        int groupId;
+        int groupCount;
+
+        public FixedGroupCountReplicationStrategy(int replicas, int groupCount)
+        {
+            super(replicas);
+            assert groupCount >= replicas;
+            groupId = 0;
+            this.groupCount = groupCount;
+        }
+
+        public void addUnit(Unit n)
+        {
+            groupMap.put(n, groupId++ % groupCount);
+        }
+    }
+
+    /**
+     * Group strategy with a fixed number of units per group.
+     */
+    static class BalancedGroupReplicationStrategy extends GroupReplicationStrategy
+    {
+        int groupId;
+        int groupSize;
+
+        public BalancedGroupReplicationStrategy(int replicas, int groupSize)
+        {
+            super(replicas);
+            groupId = 0;
+            this.groupSize = groupSize;
+        }
+
+        public void addUnit(Unit n)
+        {
+            groupMap.put(n, groupId++ / groupSize);
+        }
+    }
+
+    static class UnbalancedGroupReplicationStrategy extends GroupReplicationStrategy
+    {
+        int groupId;
+        int nextSize;
+        int num;
+        int minGroupSize;
+        int maxGroupSize;
+        Random rand;
+
+        public UnbalancedGroupReplicationStrategy(int replicas, int minGroupSize, int maxGroupSize, Random rand)
+        {
+            super(replicas);
+            groupId = -1;
+            nextSize = 0;
+            num = 0;
+            this.maxGroupSize = maxGroupSize;
+            this.minGroupSize = minGroupSize;
+            this.rand = rand;
+        }
+
+        public void addUnit(Unit n)
+        {
+            if (++num > nextSize)
+            {
+                nextSize = minGroupSize + rand.nextInt(maxGroupSize - minGroupSize + 1);
+                ++groupId;
+                num = 0;
+            }
+            groupMap.put(n, groupId);
+        }
+
+        public double spreadExpectation()
+        {
+            return 2;
+        }
+    }
+
+    static Map<Unit, Double> evaluateReplicatedOwnership(ReplicationAwareTokenAllocator<Unit> t)
+    {
+        Map<Unit, Double> ownership = Maps.newHashMap();
+        Iterator<Token> it = t.sortedTokens.keySet().iterator();
+        if (!it.hasNext())
+            return ownership;
+
+        Token current = it.next();
+        while (it.hasNext())
+        {
+            Token next = it.next();
+            addOwnership(t, current, next, ownership);
+            current = next;
+        }
+        addOwnership(t, current, t.sortedTokens.firstKey(), ownership);
+
+        return ownership;
+    }
+
+    private static void addOwnership(ReplicationAwareTokenAllocator<Unit> t, Token current, Token next, Map<Unit, Double> ownership)
+    {
+        TestReplicationStrategy ts = (TestReplicationStrategy) t.strategy;
+        double size = current.size(next);
+        Token representative = t.partitioner.midpoint(current, next);
+        for (Unit n : ts.getReplicas(representative, t.sortedTokens))
+        {
+            Double v = ownership.get(n);
+            ownership.put(n, v != null ? v + size : size);
+        }
+    }
+
+    private static double replicatedTokenOwnership(Token token, NavigableMap<Token, Unit> sortedTokens, ReplicationStrategy<Unit> strategy)
+    {
+        TestReplicationStrategy ts = (TestReplicationStrategy) strategy;
+        Token next = sortedTokens.higherKey(token);
+        if (next == null)
+            next = sortedTokens.firstKey();
+        return ts.replicationStart(token, sortedTokens.get(token), sortedTokens).size(next);
+    }
+
+    static interface TokenCount
+    {
+        int tokenCount(int perUnitCount, Random rand);
+
+        double spreadExpectation();
+    }
+
+    static TokenCount fixedTokenCount = new TokenCount()
+    {
+        public int tokenCount(int perUnitCount, Random rand)
+        {
+            return perUnitCount;
+        }
+
+        public double spreadExpectation()
+        {
+            return 4;  // High tolerance to avoid flakiness.
+        }
+    };
+
+    static TokenCount varyingTokenCount = new TokenCount()
+    {
+        public int tokenCount(int perUnitCount, Random rand)
+        {
+            if (perUnitCount == 1) return 1;
+            // 25 to 175%
+            return rand.nextInt(perUnitCount * 3 / 2) + (perUnitCount + 3) / 4;
+        }
+
+        public double spreadExpectation()
+        {
+            return 8;  // High tolerance to avoid flakiness.
+        }
+    };
+
+    Murmur3Partitioner partitioner = new Murmur3Partitioner();
+    Random seededRand = new Random(2);
+
+    private void random(Map<Token, Unit> map, TestReplicationStrategy rs, int unitCount, TokenCount tc, int perUnitCount)
+    {
+        System.out.format("\nRandom generation of %d units with %d tokens each\n", unitCount, perUnitCount);
+        Random rand = seededRand;
+        for (int i = 0; i < unitCount; i++)
+        {
+            Unit unit = new Unit();
+            rs.addUnit(unit);
+            int tokens = tc.tokenCount(perUnitCount, rand);
+            for (int j = 0; j < tokens; j++)
+            {
+                map.put(partitioner.getRandomToken(rand), unit);
+            }
+        }
+    }
+
+    @Test
+    public void testExistingCluster()
+    {
+        for (int rf = 1; rf <= 5; ++rf)
+        {
+            for (int perUnitCount = 1; perUnitCount <= MAX_VNODE_COUNT; perUnitCount *= 4)
+            {
+                testExistingCluster(perUnitCount, fixedTokenCount, new SimpleReplicationStrategy(rf));
+                testExistingCluster(perUnitCount, varyingTokenCount, new SimpleReplicationStrategy(rf));
+                if (rf == 1) continue;  // Replication strategy doesn't matter for RF = 1.
+                for (int groupSize = 4; groupSize <= 64 && groupSize * rf * 4 < TARGET_CLUSTER_SIZE; groupSize *= 4)
+                {
+                    testExistingCluster(perUnitCount, fixedTokenCount, new BalancedGroupReplicationStrategy(rf, groupSize));
+                    testExistingCluster(perUnitCount, varyingTokenCount, new UnbalancedGroupReplicationStrategy(rf, groupSize / 2, groupSize * 2, seededRand));
+                }
+                testExistingCluster(perUnitCount, fixedTokenCount, new FixedGroupCountReplicationStrategy(rf, rf * 2));
+            }
+        }
+    }
+
+    public void testExistingCluster(int perUnitCount, TokenCount tc, TestReplicationStrategy rs)
+    {
+        System.out.println("Testing existing cluster, target " + perUnitCount + " vnodes, replication " + rs);
+        final int targetClusterSize = TARGET_CLUSTER_SIZE;
+        NavigableMap<Token, Unit> tokenMap = Maps.newTreeMap();
+
+        random(tokenMap, rs, targetClusterSize / 2, tc, perUnitCount);
+
+        ReplicationAwareTokenAllocator<Unit> t = new ReplicationAwareTokenAllocator<>(tokenMap, rs, partitioner);
+        grow(t, targetClusterSize * 9 / 10, tc, perUnitCount, false);
+        grow(t, targetClusterSize, tc, perUnitCount, true);
+        loseAndReplace(t, targetClusterSize / 10, tc, perUnitCount);
+        System.out.println();
+    }
+
+    @Test
+    public void testNewCluster()
+    {
+        Util.flakyTest(this::flakyTestNewCluster,
+                       5,
+                       "It tends to fail sometimes due to the random selection of the tokens in the first few nodes.");
+    }
+
+    public void flakyTestNewCluster()
+    {
+        // This test is flaky because the selection of the tokens for the first RF nodes (which is random, with an
+        // uncontrolled seed) can sometimes cause a pathological situation where the algorithm will find a (close to)
+        // ideal distribution of tokens for some number of nodes, which in turn will inevitably cause it to go into a
+        // bad (unacceptable to the test criteria) distribution after adding one more node.
+
+        // This should happen very rarely, unless something is broken in the token allocation code.
+
+        for (int rf = 2; rf <= 5; ++rf)
+        {
+            for (int perUnitCount = 1; perUnitCount <= MAX_VNODE_COUNT; perUnitCount *= 4)
+            {
+                testNewCluster(perUnitCount, fixedTokenCount, new SimpleReplicationStrategy(rf));
+                testNewCluster(perUnitCount, varyingTokenCount, new SimpleReplicationStrategy(rf));
+                if (rf == 1) continue;  // Replication strategy doesn't matter for RF = 1.
+                for (int groupSize = 4; groupSize <= 64 && groupSize * rf * 8 < TARGET_CLUSTER_SIZE; groupSize *= 4)
+                {
+                    testNewCluster(perUnitCount, fixedTokenCount, new BalancedGroupReplicationStrategy(rf, groupSize));
+                    testNewCluster(perUnitCount, varyingTokenCount, new UnbalancedGroupReplicationStrategy(rf, groupSize / 2, groupSize * 2, seededRand));
+                }
+                testNewCluster(perUnitCount, fixedTokenCount, new FixedGroupCountReplicationStrategy(rf, rf * 2));
+            }
+        }
+    }
+
+    public void testNewCluster(int perUnitCount, TokenCount tc, TestReplicationStrategy rs)
+    {
+        System.out.println("Testing new cluster, target " + perUnitCount + " vnodes, replication " + rs);
+        final int targetClusterSize = TARGET_CLUSTER_SIZE;
+        NavigableMap<Token, Unit> tokenMap = Maps.newTreeMap();
+
+        ReplicationAwareTokenAllocator<Unit> t = new ReplicationAwareTokenAllocator<>(tokenMap, rs, partitioner);
+        grow(t, targetClusterSize * 2 / 5, tc, perUnitCount, false);
+        grow(t, targetClusterSize, tc, perUnitCount, true);
+        loseAndReplace(t, targetClusterSize / 5, tc, perUnitCount);
+        System.out.println();
+    }
+
+    private void loseAndReplace(ReplicationAwareTokenAllocator<Unit> t, int howMany, TokenCount tc, int perUnitCount)
+    {
+        int fullCount = t.unitCount();
+        System.out.format("Losing %d units. ", howMany);
+        for (int i = 0; i < howMany; ++i)
+        {
+            Unit u = t.unitFor(partitioner.getRandomToken(seededRand));
+            t.removeUnit(u);
+            ((TestReplicationStrategy) t.strategy).removeUnit(u);
+        }
+        // Grow half without verifying.
+        grow(t, (t.unitCount() + fullCount * 3) / 4, tc, perUnitCount, false);
+        // Metrics should be back to normal by now. Check that they remain so.
+        grow(t, fullCount, tc, perUnitCount, true);
+    }
+
+    static class Summary
+    {
+        double min = 1;
+        double max = 1;
+        double stddev = 0;
+
+        void update(SummaryStatistics stat)
+        {
+            min = Math.min(min, stat.getMin());
+            max = Math.max(max, stat.getMax());
+            stddev = Math.max(stddev, stat.getStandardDeviation());
+        }
+
+        public String toString()
+        {
+            return String.format("max %.2f min %.2f stddev %.4f", max, min, stddev);
+        }
+    }
+
+    public void grow(ReplicationAwareTokenAllocator<Unit> t, int targetClusterSize, TokenCount tc, int perUnitCount, boolean verifyMetrics)
+    {
+        int size = t.unitCount();
+        Summary su = new Summary();
+        Summary st = new Summary();
+        Random rand = new Random(targetClusterSize + perUnitCount);
+        TestReplicationStrategy strategy = (TestReplicationStrategy) t.strategy;
+        if (size < targetClusterSize)
+        {
+            System.out.format("Adding %d unit(s) using %s...", targetClusterSize - size, t.toString());
+            long time = System.currentTimeMillis();
+            while (size < targetClusterSize)
+            {
+                int tokens = tc.tokenCount(perUnitCount, rand);
+                Unit unit = new Unit();
+                strategy.addUnit(unit);
+                t.addUnit(unit, tokens);
+                ++size;
+                if (verifyMetrics)
+                    updateSummary(t, su, st, false);
+            }
+            System.out.format(" Done in %.3fs\n", (System.currentTimeMillis() - time) / 1000.0);
+            if (verifyMetrics)
+            {
+                updateSummary(t, su, st, true);
+                double maxExpected = 1.0 + tc.spreadExpectation() * strategy.spreadExpectation() / (perUnitCount * t.replicas);
+                if (su.max > maxExpected)
+                {
+                    Assert.fail(String.format("Expected max unit size below %.4f, was %.4f", maxExpected, su.max));
+                }
+                // We can't verify lower side range as small loads can't always be fixed.
+            }
+        }
+    }
+
+
+    private void updateSummary(ReplicationAwareTokenAllocator<Unit> t, Summary su, Summary st, boolean print)
+    {
+        int size = t.sortedTokens.size();
+        double inverseAverage = 1.0 * size / t.strategy.replicas();
+
+        Map<Unit, Double> ownership = evaluateReplicatedOwnership(t);
+        SummaryStatistics unitStat = new SummaryStatistics();
+        for (Map.Entry<Unit, Double> en : ownership.entrySet())
+            unitStat.addValue(en.getValue() * inverseAverage / t.unitToTokens.get(en.getKey()).size());
+        su.update(unitStat);
+
+        SummaryStatistics tokenStat = new SummaryStatistics();
+        for (Token tok : t.sortedTokens.keySet())
+            tokenStat.addValue(replicatedTokenOwnership(tok, t.sortedTokens, t.strategy) * inverseAverage);
+        st.update(tokenStat);
+
+        if (print)
+        {
+            System.out.format("Size %d(%d)   \tunit %s  token %s   %s\n",
+                              t.unitCount(), size,
+                              mms(unitStat),
+                              mms(tokenStat),
+                              t.strategy);
+            System.out.format("Worst intermediate unit\t%s  token %s\n", su, st);
+        }
+    }
+
+
+    private static String mms(SummaryStatistics s)
+    {
+        return String.format("max %.2f min %.2f stddev %.4f", s.getMax(), s.getMin(), s.getStandardDeviation());
+    }
+
+
+    int nextUnitId = 0;
+
+    final class Unit implements Comparable<Unit>
+    {
+        int unitId = nextUnitId++;
+
+        public String toString()
+        {
+            return Integer.toString(unitId);
+        }
+
+        @Override
+        public int compareTo(Unit o)
+        {
+            return Integer.compare(unitId, o.unitId);
+        }
+    }
+}
\ No newline at end of file

diff --git a/test/long/org/apache/cassandra/hints/HintsWriteThenReadTest.java b/test/long/org/apache/cassandra/hints/HintsWriteThenReadTest.java
new file mode 100644
index 0000000..fd880cb
--- /dev/null
+++ b/test/long/org/apache/cassandra/hints/HintsWriteThenReadTest.java

@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.util.Iterator;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import java.util.zip.CRC32;
+
+import com.google.common.collect.Iterables;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static junit.framework.Assert.assertTrue;
+
+import static org.apache.cassandra.Util.dk;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class HintsWriteThenReadTest
+{
+    private static final String KEYSPACE = "hints_write_then_read_test";
+    private static final String TABLE = "table";
+
+    private static final int HINTS_COUNT = 10_000_000;
+
+    @Test
+    public void testWriteReadCycle() throws IOException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE, TABLE));
+
+        HintsDescriptor descriptor = new HintsDescriptor(UUID.randomUUID(), System.currentTimeMillis());
+
+        File directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            testWriteReadCycle(directory, descriptor);
+        }
+        finally
+        {
+            directory.deleteOnExit();
+        }
+    }
+
+    private void testWriteReadCycle(File directory, HintsDescriptor descriptor) throws IOException
+    {
+        // write HINTS_COUNT hints to a file
+        writeHints(directory, descriptor);
+
+        // calculate the checksum of the file, then compare to the .crc32 checksum file content
+        verifyChecksum(directory, descriptor);
+
+        // iterate over the written hints, make sure they are all present
+        verifyHints(directory, descriptor);
+    }
+
+    private void writeHints(File directory, HintsDescriptor descriptor) throws IOException
+    {
+        try (HintsWriter writer = HintsWriter.create(directory, descriptor))
+        {
+            write(writer, descriptor.timestamp);
+        }
+    }
+
+    private static void verifyChecksum(File directory, HintsDescriptor descriptor) throws IOException
+    {
+        File hintsFile = new File(directory, descriptor.fileName());
+        File checksumFile = new File(directory, descriptor.checksumFileName());
+
+        assertTrue(checksumFile.exists());
+
+        String actualChecksum = Integer.toHexString(calculateChecksum(hintsFile));
+        String expectedChecksum = Files.readAllLines(checksumFile.toPath()).iterator().next();
+
+        assertEquals(expectedChecksum, actualChecksum);
+    }
+
+    private void verifyHints(File directory, HintsDescriptor descriptor)
+    {
+        long baseTimestamp = descriptor.timestamp;
+        int index = 0;
+
+        try (HintsReader reader = HintsReader.open(new File(directory, descriptor.fileName())))
+        {
+            for (HintsReader.Page page : reader)
+            {
+                Iterator<Hint> hints = page.hintsIterator();
+                while (hints.hasNext())
+                {
+                    Hint hint = hints.next();
+
+                    long timestamp = baseTimestamp + index;
+                    Mutation mutation = hint.mutation;
+
+                    assertEquals(timestamp, hint.creationTime);
+                    assertEquals(dk(bytes(index)), mutation.key());
+
+                    Row row = mutation.getPartitionUpdates().iterator().next().iterator().next();
+                    assertEquals(1, Iterables.size(row.cells()));
+                    assertEquals(bytes(index), row.clustering().get(0));
+                    Cell cell = row.cells().iterator().next();
+                    assertNotNull(cell);
+                    assertEquals(bytes(index), cell.value());
+                    assertEquals(timestamp * 1000, cell.timestamp());
+
+                    index++;
+                }
+            }
+        }
+
+        assertEquals(index, HINTS_COUNT);
+    }
+
+    private void write(HintsWriter writer, long timestamp) throws IOException
+    {
+        ByteBuffer buffer = ByteBuffer.allocateDirect(256 * 1024);
+        try (HintsWriter.Session session = writer.newSession(buffer))
+        {
+            write(session, timestamp);
+        }
+        FileUtils.clean(buffer);
+    }
+
+    private void write(HintsWriter.Session session, long timestamp) throws IOException
+    {
+        for (int i = 0; i < HINTS_COUNT; i++)
+            session.append(createHint(i, timestamp));
+    }
+
+    private static Hint createHint(int idx, long baseTimestamp)
+    {
+        long timestamp = baseTimestamp + idx;
+        return Hint.create(createMutation(idx, TimeUnit.MILLISECONDS.toMicros(timestamp)), timestamp);
+    }
+
+    private static Mutation createMutation(int index, long timestamp)
+    {
+        CFMetaData table = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+        return new RowUpdateBuilder(table, timestamp, bytes(index))
+               .clustering(bytes(index))
+               .add("val", bytes(index))
+               .build();
+    }
+
+    private static int calculateChecksum(File file) throws IOException
+    {
+        CRC32 crc = new CRC32();
+        byte[] buffer = new byte[FBUtilities.MAX_UNSIGNED_SHORT];
+
+        try (InputStream in = Files.newInputStream(file.toPath()))
+        {
+            int bytesRead;
+            while((bytesRead = in.read(buffer)) != -1)
+                crc.update(buffer, 0, bytesRead);
+        }
+
+        return (int) crc.getValue();
+    }
+}

diff --git a/test/long/org/apache/cassandra/io/compress/CompressorPerformance.java b/test/long/org/apache/cassandra/io/compress/CompressorPerformance.java
index 3612412..17122f5 100644
--- a/test/long/org/apache/cassandra/io/compress/CompressorPerformance.java
+++ b/test/long/org/apache/cassandra/io/compress/CompressorPerformance.java

@@ -28,7 +28,7 @@
 public class CompressorPerformance
 {
 
-    static public void testPerformances() throws IOException
+    public static void testPerformances() throws IOException
     {
         for (ICompressor compressor: new ICompressor[] {
                 SnappyCompressor.instance,  // warm up
@@ -58,7 +58,7 @@
     static ByteBuffer dataSource;
     static int bufLen;
 
-    static private void testPerformance(ICompressor compressor, BufferType in, BufferType out) throws IOException
+    private static void testPerformance(ICompressor compressor, BufferType in, BufferType out) throws IOException
     {
         int len = dataSource.capacity();
         int bufLen = compressor.initialCompressedBufferLength(len);

diff --git a/test/long/org/apache/cassandra/io/sstable/CQLSSTableWriterLongTest.java b/test/long/org/apache/cassandra/io/sstable/CQLSSTableWriterLongTest.java
index ee719d1..b48336f 100644
--- a/test/long/org/apache/cassandra/io/sstable/CQLSSTableWriterLongTest.java
+++ b/test/long/org/apache/cassandra/io/sstable/CQLSSTableWriterLongTest.java

@@ -30,9 +30,6 @@
 import org.junit.Test;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.Config;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.service.StorageService;
 
 public class CQLSSTableWriterLongTest
@@ -40,9 +37,7 @@
     @BeforeClass
     public static void setup() throws Exception
     {
-        DatabaseDescriptor.setDaemonInitialized();
-        SchemaLoader.cleanupAndLeaveDirs();
-        Keyspace.setInitialized();
+        SchemaLoader.prepareServer();
         StorageService.instance.initServer();
     }
 
@@ -88,7 +83,6 @@
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .forTable(schema)
-                                                  .withPartitioner(StorageService.instance.getPartitioner())
                                                   .using(insert)
                                                   .withBufferSizeInMB(1)
                                                   .build();

diff --git a/test/long/org/apache/cassandra/locator/DynamicEndpointSnitchLongTest.java b/test/long/org/apache/cassandra/locator/DynamicEndpointSnitchLongTest.java
index 841f73e..0d66fa9 100644
--- a/test/long/org/apache/cassandra/locator/DynamicEndpointSnitchLongTest.java
+++ b/test/long/org/apache/cassandra/locator/DynamicEndpointSnitchLongTest.java

@@ -23,10 +23,11 @@
 import java.net.InetAddress;
 import java.util.*;
 
+import org.junit.Test;
+
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.StorageService;
-import org.junit.Test;
 
 import org.apache.cassandra.utils.FBUtilities;
 

diff --git a/test/long/org/apache/cassandra/utils/LongBloomFilterTest.java b/test/long/org/apache/cassandra/utils/LongBloomFilterTest.java
index 8d916a0..c50296d 100644
--- a/test/long/org/apache/cassandra/utils/LongBloomFilterTest.java
+++ b/test/long/org/apache/cassandra/utils/LongBloomFilterTest.java

@@ -19,11 +19,15 @@
 package org.apache.cassandra.utils;
 
 import java.util.Random;
+import java.util.concurrent.ThreadLocalRandom;
 
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.cassandra.utils.FilterFactory.getFilter;
+import static org.apache.cassandra.utils.FilterTestHelper.testFalsePositives;
+
 public class LongBloomFilterTest
 {
     private static final Logger logger = LoggerFactory.getLogger(LongBloomFilterTest.class);
@@ -34,39 +38,164 @@
     @Test
     public void testBigInt()
     {
+        testBigInt(false);
+        testBigInt(true);
+    }
+    private static void testBigInt(boolean oldBfHashOrder)
+    {
         int size = 10 * 1000 * 1000;
-        IFilter bf = FilterFactory.getFilter(size, FilterTestHelper.spec.bucketsPerElement, false);
-        double fp = FilterTestHelper.testFalsePositives(bf,
-                                                        new KeyGenerator.IntGenerator(size),
-                                                        new KeyGenerator.IntGenerator(size, size * 2));
-        logger.info("Bloom filter false positive: {}", fp);
+        IFilter bf = getFilter(size, FilterTestHelper.spec.bucketsPerElement, false, oldBfHashOrder);
+        double fp = testFalsePositives(bf,
+                                       new KeyGenerator.IntGenerator(size),
+                                       new KeyGenerator.IntGenerator(size, size * 2));
+        logger.info("Bloom filter false positive for oldBfHashOrder={}: {}", oldBfHashOrder, fp);
     }
 
     @Test
     public void testBigRandom()
     {
+        testBigRandom(false);
+        testBigRandom(true);
+    }
+    private static void testBigRandom(boolean oldBfHashOrder)
+    {
         int size = 10 * 1000 * 1000;
-        IFilter bf = FilterFactory.getFilter(size, FilterTestHelper.spec.bucketsPerElement, false);
-        double fp = FilterTestHelper.testFalsePositives(bf,
-                                                        new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size),
-                                                        new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size));
-        logger.info("Bloom filter false positive: {}", fp);
+        IFilter bf = getFilter(size, FilterTestHelper.spec.bucketsPerElement, false, oldBfHashOrder);
+        double fp = testFalsePositives(bf,
+                                       new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size),
+                                       new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size));
+        logger.info("Bloom filter false positive for oldBfHashOrder={}: {}", oldBfHashOrder, fp);
+    }
+
+    /**
+     * NB: needs to run with -mx1G
+     */
+    @Test
+    public void testConstrained()
+    {
+        testConstrained(false);
+        testConstrained(true);
+    }
+    private static void testConstrained(boolean oldBfHashOrder)
+    {
+        int size = 10 * 1000 * 1000;
+        try (IFilter bf = getFilter(size, 0.01, false, oldBfHashOrder))
+        {
+            double fp = testFalsePositives(bf,
+                                           new KeyGenerator.IntGenerator(size),
+                                           new KeyGenerator.IntGenerator(size, size * 2));
+            logger.info("Bloom filter false positive for oldBfHashOrder={}: {}", oldBfHashOrder, fp);
+        }
+    }
+
+    private static void testConstrained(double targetFp, int elements, boolean oldBfHashOrder, int staticBitCount, long ... staticBits)
+    {
+        for (long bits : staticBits)
+        {
+            try (IFilter bf = getFilter(elements, targetFp, false, oldBfHashOrder);)
+            {
+                SequentialHashGenerator gen = new SequentialHashGenerator(staticBitCount, bits);
+                long[] hash = new long[2];
+                for (int i = 0 ; i < elements ; i++)
+                {
+                    gen.nextHash(hash);
+                    bf.add(filterKey(hash[0], hash[1]));
+                }
+                int falsePositiveCount = 0;
+                for (int i = 0 ; i < elements ; i++)
+                {
+                    gen.nextHash(hash);
+                    if (bf.isPresent(filterKey(hash[0], hash[1])))
+                        falsePositiveCount++;
+                }
+                double fp = falsePositiveCount / (double) elements;
+                double ratio = fp/targetFp;
+                System.out.printf("%.2f, ", ratio);
+            }
+        }
+        System.out.printf("%d elements, %d static bits, %.2f target\n", elements, staticBitCount, targetFp);
+    }
+
+    private static IFilter.FilterKey filterKey(final long hash1, final long hash2)
+    {
+        return new IFilter.FilterKey()
+        {
+            public void filterHash(long[] dest)
+            {
+                dest[0] = hash1;
+                dest[1] = hash2;
+            }
+        };
+    }
+
+    @Test
+    public void testBffp()
+    {
+        bffp(false);
+        bffp(true);
+    }
+
+    private static void bffp(boolean flipInputs)
+    {
+        System.out.println("Bloom filter false posiitive with flipInputs=" + flipInputs);
+        long[] staticBits = staticBits(4, 0);
+        testConstrained(0.01d, 10 << 20, flipInputs, 0, staticBits);
+        testConstrained(0.01d, 1 << 20, flipInputs, 6, staticBits);
+        testConstrained(0.01d, 10 << 20, flipInputs, 6, staticBits);
+        testConstrained(0.01d, 1 << 19, flipInputs, 10, staticBits);
+        testConstrained(0.01d, 1 << 20, flipInputs, 10, staticBits);
+        testConstrained(0.01d, 10 << 20, flipInputs, 10, staticBits);
+        testConstrained(0.1d, 10 << 20, flipInputs, 0, staticBits);
+        testConstrained(0.1d, 10 << 20, flipInputs, 8, staticBits);
+        testConstrained(0.1d, 10 << 20, flipInputs, 10, staticBits);
+    }
+
+    static long[] staticBits(int random, long ... fixed)
+    {
+        long[] result = new long[random + fixed.length];
+        System.arraycopy(fixed, 0, result, 0, fixed.length);
+        for (int i = 0 ; i < random ; i++)
+            result[fixed.length + i] = ThreadLocalRandom.current().nextLong();
+        return result;
+    }
+
+    private static class SequentialHashGenerator
+    {
+        final long mask;
+        final long staticBits;
+        int next;
+        private SequentialHashGenerator(int staticBitCount, long staticBits) {
+            this.mask = -1 >>> staticBitCount;
+            this.staticBits = staticBits & ~mask;
+        }
+        void nextHash(long[] fill)
+        {
+            MurmurHash.hash3_x64_128(ByteBufferUtil.bytes(next), 0, 4, 0, fill);
+            fill[0] &= mask;
+            fill[0] |= staticBits;
+            next++;
+        }
     }
 
     @Test
     public void timeit()
     {
+        timeit(false);
+        timeit(true);
+    }
+    private static void timeit(boolean oldBfHashOrder)
+    {
         int size = 300 * FilterTestHelper.ELEMENTS;
-        IFilter bf = FilterFactory.getFilter(size, FilterTestHelper.spec.bucketsPerElement, false);
+        IFilter bf = getFilter(size, FilterTestHelper.spec.bucketsPerElement, false, oldBfHashOrder);
         double sumfp = 0;
         for (int i = 0; i < 10; i++)
         {
-            FilterTestHelper.testFalsePositives(bf,
-                                                new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size),
-                                                new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size));
+            testFalsePositives(bf,
+                               new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size),
+                               new KeyGenerator.RandomStringGenerator(new Random().nextInt(), size));
 
             bf.clear();
         }
-        logger.info("Bloom filter mean false positive: {}", sumfp / 10);
+        logger.info("Bloom filter mean false positive for oldBfHashOrder={}: {}", oldBfHashOrder, sumfp / 10);
     }
 }

diff --git a/test/microbench/org/apache/cassandra/test/microbench/StreamingHistogramBench.java b/test/microbench/org/apache/cassandra/test/microbench/StreamingHistogramBench.java
new file mode 100644
index 0000000..c1ecf6d
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/StreamingHistogramBench.java

@@ -0,0 +1,403 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench;
+
+
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+import org.apache.cassandra.utils.StreamingHistogram;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 3, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+@Threads(1)
+@State(Scope.Benchmark)
+public class StreamingHistogramBench
+{
+    StreamingHistogram.StreamingHistogramBuilder streamingHistogram;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram2;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram3;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram4;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram5;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram6;
+    StreamingHistogram.StreamingHistogramBuilder streamingHistogram60;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram60;
+    StreamingHistogram.StreamingHistogramBuilder newStreamingHistogram100x60;
+
+    StreamingHistogram.StreamingHistogramBuilder narrowstreamingHistogram;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram2;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram3;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram4;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram5;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram6;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram60;
+    StreamingHistogram.StreamingHistogramBuilder narrowstreamingHistogram60;
+    StreamingHistogram.StreamingHistogramBuilder narrownewStreamingHistogram100x60;
+
+    StreamingHistogram.StreamingHistogramBuilder sparsestreamingHistogram;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram2;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram3;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram4;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram5;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram6;
+    StreamingHistogram.StreamingHistogramBuilder sparsestreamingHistogram60;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram60;
+    StreamingHistogram.StreamingHistogramBuilder sparsenewStreamingHistogram100x60;
+
+    static int[] ttls = new int[10000000];
+    static int[] narrowttls = new int[10000000];
+    static int[] sparsettls = new int[10000000];
+    static
+    {
+        Random random = new Random();
+        for(int i = 0 ; i < 10000000; i++)
+        {
+            // Seconds in a day
+            ttls[i] = random.nextInt(86400);
+            // Seconds in 3 hours
+            narrowttls[i] = random.nextInt(14400);
+            // Seconds in a minute
+            sparsettls[i] = random.nextInt(60);
+        }
+    }
+
+    @Setup(Level.Trial)
+    public void setup() throws Throwable
+    {
+
+        streamingHistogram = new StreamingHistogram.StreamingHistogramBuilder(100, 0, 1);
+        newStreamingHistogram = new StreamingHistogram.StreamingHistogramBuilder(100, 1000, 1);
+        newStreamingHistogram2 = new StreamingHistogram.StreamingHistogramBuilder(100, 10000, 1);
+        newStreamingHistogram3 = new StreamingHistogram.StreamingHistogramBuilder(100, 100000, 1);
+        newStreamingHistogram4 = new StreamingHistogram.StreamingHistogramBuilder(50, 100000, 1);
+        newStreamingHistogram5 = new StreamingHistogram.StreamingHistogramBuilder(50, 10000,1 );
+        newStreamingHistogram6 = new StreamingHistogram.StreamingHistogramBuilder(100, 1000000, 1);
+        streamingHistogram60 = new StreamingHistogram.StreamingHistogramBuilder(100, 0, 60);
+        newStreamingHistogram60 = new StreamingHistogram.StreamingHistogramBuilder(100, 100000, 60);
+        newStreamingHistogram100x60 = new StreamingHistogram.StreamingHistogramBuilder(100, 10000, 60);
+
+        narrowstreamingHistogram = new StreamingHistogram.StreamingHistogramBuilder(100, 0, 1);
+        narrownewStreamingHistogram = new StreamingHistogram.StreamingHistogramBuilder(100, 1000, 1);
+        narrownewStreamingHistogram2 = new StreamingHistogram.StreamingHistogramBuilder(100, 10000, 1);
+        narrownewStreamingHistogram3 = new StreamingHistogram.StreamingHistogramBuilder(100, 100000, 1);
+        narrownewStreamingHistogram4 = new StreamingHistogram.StreamingHistogramBuilder(50, 100000, 1);
+        narrownewStreamingHistogram5 = new StreamingHistogram.StreamingHistogramBuilder(50, 10000, 1);
+        narrownewStreamingHistogram6 = new StreamingHistogram.StreamingHistogramBuilder(100, 1000000, 1);
+        narrowstreamingHistogram60 = new StreamingHistogram.StreamingHistogramBuilder(100, 0, 60);
+        narrownewStreamingHistogram60 = new StreamingHistogram.StreamingHistogramBuilder(100, 100000, 60);
+        narrownewStreamingHistogram100x60 = new StreamingHistogram.StreamingHistogramBuilder(100, 10000, 60);
+
+
+        sparsestreamingHistogram = new StreamingHistogram.StreamingHistogramBuilder(100, 0, 1);
+        sparsenewStreamingHistogram = new StreamingHistogram.StreamingHistogramBuilder(100, 1000, 1);
+        sparsenewStreamingHistogram2 = new StreamingHistogram.StreamingHistogramBuilder(100, 10000, 1);
+        sparsenewStreamingHistogram3 = new StreamingHistogram.StreamingHistogramBuilder(100, 100000, 1);
+        sparsenewStreamingHistogram4 = new StreamingHistogram.StreamingHistogramBuilder(50, 100000, 1);
+        sparsenewStreamingHistogram5 = new StreamingHistogram.StreamingHistogramBuilder(50, 10000, 1);
+        sparsenewStreamingHistogram6 = new StreamingHistogram.StreamingHistogramBuilder(100, 1000000, 1);
+        sparsestreamingHistogram60 = new StreamingHistogram.StreamingHistogramBuilder(100, 0, 60);
+        sparsenewStreamingHistogram60 = new StreamingHistogram.StreamingHistogramBuilder(100, 100000, 60);
+        sparsenewStreamingHistogram100x60 = new StreamingHistogram.StreamingHistogramBuilder(100, 10000, 60);
+
+    }
+
+    @TearDown(Level.Trial)
+    public void teardown() throws IOException, ExecutionException, InterruptedException
+    {
+
+    }
+
+    @Benchmark
+    public void existingSH() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            streamingHistogram.update(ttls[i]);
+        streamingHistogram.build();
+    }
+
+    @Benchmark
+    public void newSH10x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram.update(ttls[i]);
+        newStreamingHistogram.build();
+
+    }
+
+    @Benchmark
+    public void newSH100x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram2.update(ttls[i]);
+        newStreamingHistogram2.build();
+
+    }
+
+    @Benchmark
+    public void newSH1000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram3.update(ttls[i]);
+        newStreamingHistogram3.build();
+
+    }
+
+    @Benchmark
+    public void newSH10000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram6.update(ttls[i]);
+        newStreamingHistogram6.build();
+
+    }
+
+
+    @Benchmark
+    public void newSH50and1000() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram4.update(ttls[i]);
+        newStreamingHistogram4.build();
+
+    }
+
+    @Benchmark
+    public void newSH50and100x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram5.update(ttls[i]);
+        newStreamingHistogram5.build();
+
+    }
+
+    @Benchmark
+    public void streaminghistogram60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            streamingHistogram60.update(sparsettls[i]);
+        streamingHistogram60.build();
+
+    }
+
+    @Benchmark
+    public void newstreaminghistogram1000x60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram60.update(sparsettls[i]);
+        newStreamingHistogram60.build();
+    }
+
+    @Benchmark
+    public void newstreaminghistogram100x60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            newStreamingHistogram100x60.update(sparsettls[i]);
+        newStreamingHistogram100x60.build();
+    }
+
+
+    @Benchmark
+    public void narrowexistingSH() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrowstreamingHistogram.update(narrowttls[i]);
+        narrowstreamingHistogram.build();
+    }
+
+    @Benchmark
+    public void narrownewSH10x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram.update(narrowttls[i]);
+        narrownewStreamingHistogram.build();
+
+    }
+
+    @Benchmark
+    public void narrownewSH100x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram2.update(narrowttls[i]);
+        narrownewStreamingHistogram2.build();
+
+    }
+
+    @Benchmark
+    public void narrownewSH1000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram3.update(narrowttls[i]);
+        narrownewStreamingHistogram3.build();
+
+    }
+
+    @Benchmark
+    public void narrownewSH10000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram6.update(ttls[i]);
+        narrownewStreamingHistogram6.build();
+
+    }
+
+
+    @Benchmark
+    public void narrownewSH50and1000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram4.update(narrowttls[i]);
+        narrownewStreamingHistogram4.build();
+
+    }
+
+    @Benchmark
+    public void narrownewSH50and100x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram5.update(narrowttls[i]);
+        narrownewStreamingHistogram5.build();
+
+    }
+
+    @Benchmark
+    public void narrowstreaminghistogram60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrowstreamingHistogram60.update(sparsettls[i]);
+        narrowstreamingHistogram60.build();
+
+    }
+
+    @Benchmark
+    public void narrownewstreaminghistogram1000x60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram60.update(sparsettls[i]);
+        narrownewStreamingHistogram60.build();
+
+    }
+
+    @Benchmark
+    public void narrownewstreaminghistogram100x60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            narrownewStreamingHistogram100x60.update(sparsettls[i]);
+        narrownewStreamingHistogram100x60.build();
+
+    }
+
+
+    @Benchmark
+    public void sparseexistingSH() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsestreamingHistogram.update(sparsettls[i]);
+        sparsestreamingHistogram.build();
+    }
+
+    @Benchmark
+    public void sparsenewSH10x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram.update(sparsettls[i]);
+        sparsenewStreamingHistogram.build();
+
+    }
+
+    @Benchmark
+    public void sparsenewSH100x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram2.update(sparsettls[i]);
+        sparsenewStreamingHistogram2.build();
+
+    }
+
+    @Benchmark
+    public void sparsenewSH1000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram3.update(sparsettls[i]);
+        sparsenewStreamingHistogram3.build();
+
+    }
+
+    @Benchmark
+    public void sparsenewSH10000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram6.update(ttls[i]);
+        sparsenewStreamingHistogram6.build();
+    }
+
+
+    @Benchmark
+    public void sparsenewSH50and1000x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram4.update(sparsettls[i]);
+        sparsenewStreamingHistogram4.build();
+
+    }
+
+    @Benchmark
+    public void sparsenewSH50and100x() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram5.update(sparsettls[i]);
+        sparsenewStreamingHistogram5.build();
+
+    }
+
+    @Benchmark
+    public void sparsestreaminghistogram60s() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsestreamingHistogram60.update(sparsettls[i]);
+        sparsestreamingHistogram60.build();
+
+    }
+
+    @Benchmark
+    public void sparsenewstreaminghistogram1000x60() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram60.update(sparsettls[i]);
+        sparsenewStreamingHistogram60.build();
+
+    }
+
+    @Benchmark
+    public void sparsenewstreaminghistogram100x60() throws Throwable
+    {
+        for(int i = 0 ; i < ttls.length; i++)
+            sparsenewStreamingHistogram100x60.update(sparsettls[i]);
+        sparsenewStreamingHistogram100x60.build();
+
+    }
+}

diff --git a/test/pig/org/apache/cassandra/pig/CqlRecordReaderTest.java b/test/pig/org/apache/cassandra/pig/CqlRecordReaderTest.java
deleted file mode 100644
index b2a74b6..0000000
--- a/test/pig/org/apache/cassandra/pig/CqlRecordReaderTest.java
+++ /dev/null

@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.pig;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.pig.data.Tuple;
-import org.apache.thrift.TException;
-
-public class CqlRecordReaderTest extends PigTestBase
-{
-    private static String[] statements = {
-        "DROP KEYSPACE IF EXISTS cql3ks",
-        "CREATE KEYSPACE cql3ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};",
-        "USE cql3ks;",
-
-        "CREATE TABLE cqltable (" +
-        "pk1 int," +
-        "pk2 int," +
-        "pk3 int," +
-        "ck1 int," +
-        "ck2 int," +
-        "data text," +
-        "primary key((pk1,pk2,pk3),ck1,ck2));",
-        "INSERT INTO cqltable(pk1, pk2, pk3, ck1, ck2, data) VALUES (11, 12, 13, 14, 15, 'value1');",
-
-        "CREATE TABLE \"MixedCaseCqlTable\" (" +
-        "pk1 int," +
-        "\"PK2\" int," +
-        "pk3 int," +
-        "\"CK1\" int," +
-        "ck2 int," +
-        "data text," +
-        "primary key((pk1,\"PK2\",pk3),\"CK1\",ck2));",
-        "INSERT INTO \"MixedCaseCqlTable\"(pk1, \"PK2\", pk3, \"CK1\", ck2, data) VALUES (11, 12, 13, 14, 15, 'value1');",
-    };
-
-    @BeforeClass
-    public static void setup() throws IOException, ConfigurationException, TException
-    {
-        startCassandra();
-        executeCQLStatements(statements);
-        startHadoopCluster();
-    }
-
-    @Test
-    public void defaultCqlQueryTest() throws Exception
-    {
-        String initialQuery = "rows = LOAD 'cql://cql3ks/cqltable?" + defaultParameters + nativeParameters + "' USING CqlNativeStorage();";
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 11);
-            Assert.assertEquals(t.get(1), 12);
-            Assert.assertEquals(t.get(2), 13);
-            Assert.assertEquals(t.get(3), 14);
-            Assert.assertEquals(t.get(4), 15);
-            Assert.assertEquals(t.get(5), "value1");
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void defaultMixedCaseCqlQueryTest() throws Exception
-    {
-        String initialQuery = "rows = LOAD 'cql://cql3ks/MixedCaseCqlTable?" + defaultParameters + nativeParameters + "' USING CqlNativeStorage();";
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 11);
-            Assert.assertEquals(t.get(1), 12);
-            Assert.assertEquals(t.get(2), 13);
-            Assert.assertEquals(t.get(3), 14);
-            Assert.assertEquals(t.get(4), 15);
-            Assert.assertEquals(t.get(5), "value1");
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void selectColumnsTest() throws Exception
-    {
-        String initialQuery = "rows = LOAD 'cql://cql3ks/cqltable?" + defaultParameters + nativeParameters + "&columns=ck1%2Cck2%2Cdata' USING CqlNativeStorage();";
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 11);
-            Assert.assertEquals(t.get(1), 12);
-            Assert.assertEquals(t.get(2), 13);
-            Assert.assertEquals(t.get(3), 14);
-            Assert.assertEquals(t.get(4), 15);
-            Assert.assertEquals(t.get(5), "value1");
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void whereClauseTest() throws Exception
-    {
-        String initialQuery = "rows = LOAD 'cql://cql3ks/cqltable?" + defaultParameters + nativeParameters + "&where_clause=ck1%3d14' USING CqlNativeStorage();";
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 11);
-            Assert.assertEquals(t.get(1), 12);
-            Assert.assertEquals(t.get(2), 13);
-            Assert.assertEquals(t.get(3), 14);
-            Assert.assertEquals(t.get(4), 15);
-            Assert.assertEquals(t.get(5), "value1");
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-}

diff --git a/test/pig/org/apache/cassandra/pig/CqlTableDataTypeTest.java b/test/pig/org/apache/cassandra/pig/CqlTableDataTypeTest.java
deleted file mode 100644
index ca01901..0000000
--- a/test/pig/org/apache/cassandra/pig/CqlTableDataTypeTest.java
+++ /dev/null

@@ -1,479 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.pig;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.cassandra.db.marshal.TimeUUIDType;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.utils.Hex;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.Tuple;
-import org.apache.thrift.TException;
-
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class CqlTableDataTypeTest extends PigTestBase
-{
-    //ASCII    (AsciiType.instance),
-    //BIGINT   (LongType.instance),
-    //BLOB     (BytesType.instance),
-    //BOOLEAN  (BooleanType.instance),
-    //COUNTER  (CounterColumnType.instance),
-    //DECIMAL  (DecimalType.instance),
-    //DOUBLE   (DoubleType.instance),
-    //FLOAT    (FloatType.instance),
-    //INET     (InetAddressType.instance),
-    //INT      (Int32Type.instance),
-    //TEXT     (UTF8Type.instance),
-    //TIMESTAMP(DateType.instance),
-    //UUID     (UUIDType.instance),
-    //VARCHAR  (UTF8Type.instance),
-    //VARINT   (IntegerType.instance),
-    //TIMEUUID (TimeUUIDType.instance);
-    //SET
-    //LIST
-    //MAP
-    //Create table to test the above data types
-    private static String[] statements = {
-            "DROP KEYSPACE IF EXISTS cql3ks",
-            "CREATE KEYSPACE cql3ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}",
-            "USE cql3ks;",
-
-            "CREATE TABLE cqltable (" +
-            "key int primary key," +
-            "col_ascii ascii," +
-            "col_bigint bigint," +
-            "col_blob blob," +
-            "col_boolean boolean," +
-            "col_decimal decimal," +
-            "col_double double," +
-            "col_float float," +
-            "col_inet inet," +
-            "col_int int," +
-            "col_text text," +
-            "col_timestamp timestamp," +
-            "col_uuid uuid," +
-            "col_varchar varchar," +
-            "col_varint varint," +
-            "col_timeuuid timeuuid);",
-
-            "CREATE TABLE settable (" +
-            "key int primary key," +
-            "col_set_ascii set<ascii>," +
-            "col_set_bigint set<bigint>," +
-            "col_set_blob set<blob>," +
-            "col_set_boolean set<boolean>," +
-            "col_set_decimal set<decimal>," +
-            "col_set_double set<double>," +
-            "col_set_float set<float>," +
-            "col_set_inet set<inet>," +
-            "col_set_int set<int>," +
-            "col_set_text set<text>," +
-            "col_set_timestamp set<timestamp>," +
-            "col_set_uuid set<uuid>," +
-            "col_set_varchar set<varchar>," +
-            "col_set_varint set<varint>," +
-            "col_set_timeuuid set<timeuuid>);",
-
-            "CREATE TABLE listtable (" +
-            "key int primary key," +
-            "col_list_ascii list<ascii>," +
-            "col_list_bigint list<bigint>," +
-            "col_list_blob list<blob>," +
-            "col_list_boolean list<boolean>," +
-            "col_list_decimal list<decimal>," +
-            "col_list_double list<double>," +
-            "col_list_float list<float>," +
-            "col_list_inet list<inet>," +
-            "col_list_int list<int>," +
-            "col_list_text list<text>," +
-            "col_list_timestamp list<timestamp>," +
-            "col_list_uuid list<uuid>," +
-            "col_list_varchar list<varchar>," +
-            "col_list_varint list<varint>," +
-            "col_list_timeuuid list<timeuuid>);",
-
-            "CREATE TABLE maptable (" +
-            "key int primary key," +
-            "col_map_ascii map<ascii, ascii>," +
-            "col_map_bigint map<bigint, bigint>," +
-            "col_map_blob map<blob, blob>," +
-            "col_map_boolean map<boolean, boolean>," +
-            "col_map_decimal map<decimal, decimal>," +
-            "col_map_double map<double, double>," +
-            "col_map_float map<float, float>," +
-            "col_map_inet map<inet, inet>," +
-            "col_map_int map<int, int>," +
-            "col_map_text map<text, text>," +
-            "col_map_timestamp map<timestamp, timestamp>," +
-            "col_map_uuid map<uuid, uuid>," +
-            "col_map_varchar map<varchar, varchar>," +
-            "col_map_varint map<varint, varint>," +
-            "col_map_timeuuid map<timeuuid, timeuuid>);",
-        
-            "INSERT INTO cqltable(key, col_ascii) VALUES (1, 'ascii');",
-            "INSERT INTO cqltable(key, col_bigint) VALUES (1, 12345678);",
-            "INSERT INTO cqltable(key, col_blob) VALUES (1, 0x23446c6c6f);",
-            "INSERT INTO cqltable(key, col_boolean) VALUES (1, false);",
-            "INSERT INTO cqltable(key, col_decimal) VALUES (1, 23.4567);",
-            "INSERT INTO cqltable(key, col_double) VALUES (1, 12345678.12345678);",
-            "INSERT INTO cqltable(key, col_float) VALUES (1, 123.12);",
-            "INSERT INTO cqltable(key, col_inet) VALUES (1, '127.0.0.1');",
-            "INSERT INTO cqltable(key, col_int) VALUES (1, 123);",
-            "INSERT INTO cqltable(key, col_text) VALUES (1, 'text');",
-            "INSERT INTO cqltable(key, col_timestamp) VALUES (1, '2011-02-03T04:05:00+0000');",
-            "INSERT INTO cqltable(key, col_timeuuid) VALUES (1, maxTimeuuid('2013-01-01 00:05+0000'));",
-            "INSERT INTO cqltable(key, col_uuid) VALUES (1, 550e8400-e29b-41d4-a716-446655440000);",
-            "INSERT INTO cqltable(key, col_varchar) VALUES (1, 'varchar');",
-            "INSERT INTO cqltable(key, col_varint) VALUES (1, 123);",
-
-            "INSERT INTO settable(key, col_set_ascii) VALUES (1, {'ascii1', 'ascii2'});",
-            "INSERT INTO settable(key, col_set_bigint) VALUES (1, {12345678, 12345679});",
-            "INSERT INTO settable(key, col_set_blob) VALUES (1, {0x68656c6c6f, 0x68656c6c6e});",
-            "INSERT INTO settable(key, col_set_boolean) VALUES (1, {false, true});",
-            "INSERT INTO settable(key, col_set_decimal) VALUES (1, {23.4567, 23.4568});",
-            "INSERT INTO settable(key, col_set_double) VALUES (1, {12345678.12345678, 12345678.12345679});",
-            "INSERT INTO settable(key, col_set_float) VALUES (1, {123.12, 123.13});",
-            "INSERT INTO settable(key, col_set_inet) VALUES (1, {'127.0.0.1', '127.0.0.2'});",
-            "INSERT INTO settable(key, col_set_int) VALUES (1, {123, 124});",
-            "INSERT INTO settable(key, col_set_text) VALUES (1, {'text1', 'text2'});",
-            "INSERT INTO settable(key, col_set_timestamp) VALUES (1, {'2011-02-03T04:05:00+0000', '2011-02-04T04:05:00+0000'});",
-            "INSERT INTO settable(key, col_set_timeuuid) VALUES (1, {e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f, e23f450f-53a6-11e2-7f7f-7f7f7f7f7f77});",      
-            "INSERT INTO settable(key, col_set_uuid) VALUES (1, {550e8400-e29b-41d4-a716-446655440000, 550e8400-e29b-41d4-a716-446655440001});",
-            "INSERT INTO settable(key, col_set_varchar) VALUES (1, {'varchar1', 'varchar2'});",
-            "INSERT INTO settable(key, col_set_varint) VALUES (1, {123, 124});",
-
-            "INSERT INTO listtable(key, col_list_ascii) VALUES (1, ['ascii2', 'ascii1']);",
-            "INSERT INTO listtable(key, col_list_bigint) VALUES (1, [12345679, 12345678]);",
-            "INSERT INTO listtable(key, col_list_blob) VALUES (1, [0x68656c6c6e, 0x68656c6c6f]);",
-            "INSERT INTO listtable(key, col_list_boolean) VALUES (1, [true, false]);",
-            "INSERT INTO listtable(key, col_list_decimal) VALUES (1, [23.4568, 23.4567]);",
-            "INSERT INTO listtable(key, col_list_double) VALUES (1, [12345678.12345679, 12345678.12345678]);",
-            "INSERT INTO listtable(key, col_list_float) VALUES (1, [123.13, 123.12]);",
-            "INSERT INTO listtable(key, col_list_inet) VALUES (1, ['127.0.0.2', '127.0.0.1']);",
-            "INSERT INTO listtable(key, col_list_int) VALUES (1, [124, 123]);",
-            "INSERT INTO listtable(key, col_list_text) VALUES (1, ['text2', 'text1']);",
-            "INSERT INTO listtable(key, col_list_timestamp) VALUES (1, ['2011-02-04T04:05:00+0000', '2011-02-03T04:05:00+0000']);",
-            "INSERT INTO listtable(key, col_list_timeuuid) VALUES (1, [e23f450f-53a6-11e2-7f7f-7f7f7f7f7f77, e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f]);",
-            "INSERT INTO listtable(key, col_list_uuid) VALUES (1, [550e8400-e29b-41d4-a716-446655440001, 550e8400-e29b-41d4-a716-446655440000]);",
-            "INSERT INTO listtable(key, col_list_varchar) VALUES (1, ['varchar2', 'varchar1']);",
-            "INSERT INTO listtable(key, col_list_varint) VALUES (1, [124, 123]);",
-
-            "INSERT INTO maptable(key, col_map_ascii) VALUES (1, {'ascii1' : 'ascii2'});",
-            "INSERT INTO maptable(key, col_map_bigint) VALUES (1, {12345678 : 12345679});",
-            "INSERT INTO maptable(key, col_map_blob) VALUES (1, {0x68656c6c6f : 0x68656c6c6e});",
-            "INSERT INTO maptable(key, col_map_boolean) VALUES (1, {false : true});",
-            "INSERT INTO maptable(key, col_map_decimal) VALUES (1, {23.4567 : 23.4568});",
-            "INSERT INTO maptable(key, col_map_double) VALUES (1, {12345678.12345678 : 12345678.12345679});",
-            "INSERT INTO maptable(key, col_map_float) VALUES (1, {123.12 : 123.13});",
-            "INSERT INTO maptable(key, col_map_inet) VALUES (1, {'127.0.0.1' : '127.0.0.2'});",
-            "INSERT INTO maptable(key, col_map_int) VALUES (1, {123 : 124});",
-            "INSERT INTO maptable(key, col_map_text) VALUES (1, {'text1' : 'text2'});",
-            "INSERT INTO maptable(key, col_map_timestamp) VALUES (1, {'2011-02-03T04:05:00+0000' : '2011-02-04T04:05:00+0000'});",
-            "INSERT INTO maptable(key, col_map_timeuuid) VALUES (1, {e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f : e23f450f-53a6-11e2-7f7f-7f7f7f7f7f77});",      
-            "INSERT INTO maptable(key, col_map_uuid) VALUES (1, {550e8400-e29b-41d4-a716-446655440000 : 550e8400-e29b-41d4-a716-446655440001});",
-            "INSERT INTO maptable(key, col_map_varchar) VALUES (1, {'varchar1' : 'varchar2'});",
-            "INSERT INTO maptable(key, col_map_varint) VALUES (1, {123 : 124});",
-
-            "CREATE TABLE countertable (key int primary key, col_counter counter);",            
-            "UPDATE countertable SET col_counter = col_counter + 3 WHERE key = 1;",
-    };
-
-    @BeforeClass
-    public static void setup() throws IOException, ConfigurationException, TException
-    {
-        startCassandra();
-        executeCQLStatements(statements);
-        startHadoopCluster();
-    }
-
-    @Test
-    public void testCqlNativeStorageRegularType() throws IOException
-    {
-        //input_cql=select * from cqltable where token(key) > ? and token(key) <= ?
-        cqlTableTest("rows = LOAD 'cql://cql3ks/cqltable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20cqltable%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-
-        //input_cql=select * from countertable where token(key) > ? and token(key) <= ?
-        counterTableTest("cc_rows = LOAD 'cql://cql3ks/countertable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20countertable%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void cqlTableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("rows");
-        //{key: int, 
-        //col_ascii: chararray, 
-        //col_bigint: long, 
-        //col_blob: bytearray, 
-        //col_boolean: bytearray,
-        //col_decimal: chararray, 
-        //col_double: double, 
-        //col_float: float, 
-        //col_inet: chararray, 
-        //col_int: int,
-        //col_text: chararray, 
-        //col_timestamp: long, 
-        //col_timeuuid: bytearray, 
-        //col_uuid: chararray,
-        //col_varchar: chararray, 
-        //col_varint: int}
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 1);
-            Assert.assertEquals(t.get(1), "ascii");
-            Assert.assertEquals(t.get(2), 12345678L);
-            Assert.assertEquals(t.get(3), new DataByteArray(Hex.hexToBytes("23446c6c6f")));
-            Assert.assertEquals(t.get(4), false);
-            Assert.assertEquals(t.get(5), "23.4567");
-            Assert.assertEquals(t.get(6), 12345678.12345678d);
-            Assert.assertEquals(t.get(7), 123.12f);
-            Assert.assertEquals(t.get(8), "127.0.0.1");
-            Assert.assertEquals(t.get(9), 123);
-            Assert.assertEquals(t.get(10), "text");
-            Assert.assertEquals(t.get(11), 1296705900000L);
-            Assert.assertEquals(t.get(12), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f").array())));
-            Assert.assertEquals(t.get(13), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440000").array())));
-            Assert.assertEquals(t.get(14), "varchar");
-            Assert.assertEquals(t.get(15), 123);
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    private void counterTableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple>  it = pig.openIterator("cc_rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 1);
-            Assert.assertEquals(t.get(1), 3L);
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void testCqlNativeStorageSetType() throws IOException
-    {
-        //input_cql=select * from settable where token(key) > ? and token(key) <= ?
-        settableTest("set_rows = LOAD 'cql://cql3ks/settable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20settable%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void settableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("set_rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 1);
-            Tuple innerTuple = (Tuple) t.get(1);
-            Assert.assertEquals(innerTuple.get(0), "ascii1");
-            Assert.assertEquals(innerTuple.get(1), "ascii2");
-            innerTuple = (Tuple) t.get(2);
-            Assert.assertEquals(innerTuple.get(0), 12345678L);
-            Assert.assertEquals(innerTuple.get(1), 12345679L);
-            innerTuple = (Tuple) t.get(3);
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray(Hex.hexToBytes("68656c6c6e")));
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray(Hex.hexToBytes("68656c6c6f")));
-            innerTuple = (Tuple) t.get(4);
-            Assert.assertEquals(innerTuple.get(0), false);
-            Assert.assertEquals(innerTuple.get(1), true);
-            innerTuple = (Tuple) t.get(5);
-            Assert.assertEquals(innerTuple.get(0), "23.4567");
-            Assert.assertEquals(innerTuple.get(1), "23.4568");
-            innerTuple = (Tuple) t.get(6);
-            Assert.assertEquals(innerTuple.get(0), 12345678.12345678d);
-            Assert.assertEquals(innerTuple.get(1), 12345678.12345679d);
-            innerTuple = (Tuple) t.get(7);
-            Assert.assertEquals(innerTuple.get(0), 123.12f);
-            Assert.assertEquals(innerTuple.get(1), 123.13f);
-            innerTuple = (Tuple) t.get(8);
-            Assert.assertEquals(innerTuple.get(0), "127.0.0.1");
-            Assert.assertEquals(innerTuple.get(1), "127.0.0.2");
-            innerTuple = (Tuple) t.get(9);
-            Assert.assertEquals(innerTuple.get(0), 123);
-            Assert.assertEquals(innerTuple.get(1), 124);
-            innerTuple = (Tuple) t.get(10);
-            Assert.assertEquals(innerTuple.get(0), "text1");
-            Assert.assertEquals(innerTuple.get(1), "text2");
-            innerTuple = (Tuple) t.get(11);
-            Assert.assertEquals(innerTuple.get(0), 1296705900000L);
-            Assert.assertEquals(innerTuple.get(1), 1296792300000L);
-            innerTuple = (Tuple) t.get(12);
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f77").array())));
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f").array())));
-            innerTuple = (Tuple) t.get(13);
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440000").array())));
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440001").array())));
-            innerTuple = (Tuple) t.get(14);
-            Assert.assertEquals(innerTuple.get(0), "varchar1");
-            Assert.assertEquals(innerTuple.get(1), "varchar2");  
-            innerTuple = (Tuple) t.get(15);
-            Assert.assertEquals(innerTuple.get(0), 123);
-            Assert.assertEquals(innerTuple.get(1), 124);
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void testCqlNativeStorageListType() throws IOException
-    {
-        //input_cql=select * from listtable where token(key) > ? and token(key) <= ?
-        listtableTest("list_rows = LOAD 'cql://cql3ks/listtable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20listtable%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void listtableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("list_rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 1);
-            Tuple innerTuple = (Tuple) t.get(1);
-            Assert.assertEquals(innerTuple.get(1), "ascii1");
-            Assert.assertEquals(innerTuple.get(0), "ascii2");
-            innerTuple = (Tuple) t.get(2);
-            Assert.assertEquals(innerTuple.get(1), 12345678L);
-            Assert.assertEquals(innerTuple.get(0), 12345679L);
-            innerTuple = (Tuple) t.get(3);
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray(Hex.hexToBytes("68656c6c6f")));
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray(Hex.hexToBytes("68656c6c6e")));
-            innerTuple = (Tuple) t.get(4);
-            Assert.assertEquals(innerTuple.get(1), false);
-            Assert.assertEquals(innerTuple.get(0), true);
-            innerTuple = (Tuple) t.get(5);
-            Assert.assertEquals(innerTuple.get(1), "23.4567");
-            Assert.assertEquals(innerTuple.get(0), "23.4568");
-            innerTuple = (Tuple) t.get(6);
-            Assert.assertEquals(innerTuple.get(1), 12345678.12345678d);
-            Assert.assertEquals(innerTuple.get(0), 12345678.12345679d);
-            innerTuple = (Tuple) t.get(7);
-            Assert.assertEquals(innerTuple.get(1), 123.12f);
-            Assert.assertEquals(innerTuple.get(0), 123.13f);
-            innerTuple = (Tuple) t.get(8);
-            Assert.assertEquals(innerTuple.get(1), "127.0.0.1");
-            Assert.assertEquals(innerTuple.get(0), "127.0.0.2");
-            innerTuple = (Tuple) t.get(9);
-            Assert.assertEquals(innerTuple.get(1), 123);
-            Assert.assertEquals(innerTuple.get(0), 124);
-            innerTuple = (Tuple) t.get(10);
-            Assert.assertEquals(innerTuple.get(1), "text1");
-            Assert.assertEquals(innerTuple.get(0), "text2");
-            innerTuple = (Tuple) t.get(11);
-            Assert.assertEquals(innerTuple.get(1), 1296705900000L);
-            Assert.assertEquals(innerTuple.get(0), 1296792300000L);
-            innerTuple = (Tuple) t.get(12);
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f").array())));
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f77").array())));
-            innerTuple = (Tuple) t.get(13);
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440000").array())));
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440001").array())));
-            innerTuple = (Tuple) t.get(14);
-            Assert.assertEquals(innerTuple.get(1), "varchar1");
-            Assert.assertEquals(innerTuple.get(0), "varchar2");  
-            innerTuple = (Tuple) t.get(15);
-            Assert.assertEquals(innerTuple.get(1), 123);
-            Assert.assertEquals(innerTuple.get(0), 124);
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void testCqlNativeStorageMapType() throws IOException
-    {
-        //input_cql=select * from maptable where token(key) > ? and token(key) <= ?
-        maptableTest("map_rows = LOAD 'cql://cql3ks/maptable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20maptable%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void maptableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("map_rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 1);
-            Tuple innerTuple = (Tuple) ((Tuple) t.get(1)).get(0);
-            Assert.assertEquals(innerTuple.get(0), "ascii1");
-            Assert.assertEquals(innerTuple.get(1), "ascii2");
-            innerTuple = (Tuple) ((Tuple) t.get(2)).get(0);
-            Assert.assertEquals(innerTuple.get(0), 12345678L);
-            Assert.assertEquals(innerTuple.get(1), 12345679L);
-            innerTuple = (Tuple) ((Tuple) t.get(3)).get(0);
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray(Hex.hexToBytes("68656c6c6f")));
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray(Hex.hexToBytes("68656c6c6e")));
-            innerTuple = (Tuple) ((Tuple) t.get(4)).get(0);
-            Assert.assertEquals(innerTuple.get(0), false);
-            Assert.assertEquals(innerTuple.get(1), true);
-            innerTuple = (Tuple) ((Tuple) t.get(5)).get(0);
-            Assert.assertEquals(innerTuple.get(0), "23.4567");
-            Assert.assertEquals(innerTuple.get(1), "23.4568");
-            innerTuple = (Tuple) ((Tuple) t.get(6)).get(0);
-            Assert.assertEquals(innerTuple.get(0), 12345678.12345678d);
-            Assert.assertEquals(innerTuple.get(1), 12345678.12345679d);
-            innerTuple = (Tuple) ((Tuple) t.get(7)).get(0);
-            Assert.assertEquals(innerTuple.get(0), 123.12f);
-            Assert.assertEquals(innerTuple.get(1), 123.13f);
-            innerTuple = (Tuple) ((Tuple) t.get(8)).get(0);
-            Assert.assertEquals(innerTuple.get(0), "127.0.0.1");
-            Assert.assertEquals(innerTuple.get(1), "127.0.0.2");
-            innerTuple = (Tuple) ((Tuple) t.get(9)).get(0);
-            Assert.assertEquals(innerTuple.get(0), 123);
-            Assert.assertEquals(innerTuple.get(1), 124);
-            innerTuple = (Tuple) ((Tuple) t.get(10)).get(0);
-            Assert.assertEquals(innerTuple.get(0), "text1");
-            Assert.assertEquals(innerTuple.get(1), "text2");
-            innerTuple = (Tuple) ((Tuple) t.get(11)).get(0);
-            Assert.assertEquals(innerTuple.get(0), 1296705900000L);
-            Assert.assertEquals(innerTuple.get(1), 1296792300000L);
-            innerTuple = (Tuple) ((Tuple) t.get(12)).get(0);
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f").array())));
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f77").array())));
-            innerTuple = (Tuple) ((Tuple) t.get(13)).get(0);
-            Assert.assertEquals(innerTuple.get(0), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440000").array())));
-            Assert.assertEquals(innerTuple.get(1), new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440001").array())));
-            innerTuple = (Tuple) ((Tuple) t.get(14)).get(0);
-            Assert.assertEquals(innerTuple.get(0), "varchar1");
-            Assert.assertEquals(innerTuple.get(1), "varchar2");  
-            innerTuple = (Tuple) ((Tuple) t.get(15)).get(0);
-            Assert.assertEquals(innerTuple.get(0), 123);
-            Assert.assertEquals(innerTuple.get(1), 124);
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-}

diff --git a/test/pig/org/apache/cassandra/pig/CqlTableTest.java b/test/pig/org/apache/cassandra/pig/CqlTableTest.java
deleted file mode 100644
index 3902fce..0000000
--- a/test/pig/org/apache/cassandra/pig/CqlTableTest.java
+++ /dev/null

@@ -1,303 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.pig;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.pig.data.Tuple;
-import org.apache.thrift.TException;
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class CqlTableTest extends PigTestBase
-{    
-    private static String[] statements = {
-            "DROP KEYSPACE IF EXISTS cql3ks",
-            "CREATE KEYSPACE cql3ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}",
-            "USE cql3ks;",
-
-            "CREATE TABLE cqltable (key1 text, key2 int, column1 int, column2 float, primary key(key1, key2))",
-            "INSERT INTO cqltable (key1, key2, column1, column2) values ('key1', 111, 100, 10.1)",
-            "CREATE TABLE compactcqltable (key1 text, column1 int, column2 float, primary key(key1)) WITH COMPACT STORAGE",
-            "INSERT INTO compactcqltable (key1, column1, column2) values ('key1', 100, 10.1)",
-
-            "CREATE TABLE test (a int PRIMARY KEY, b int);",
-            "CREATE INDEX test_b on test (b);",
-
-            "CREATE TABLE moredata (x int PRIMARY KEY, y int);",
-            "CREATE TABLE test_bulk (a int PRIMARY KEY, b int);",
-            "INSERT INTO test_bulk (a,b) VALUES (1,1);",
-            "INSERT INTO test_bulk (a,b) VALUES (2,2);",
-            "INSERT INTO test_bulk (a,b) VALUES (3,3);",
-            "INSERT INTO test (a,b) VALUES (1,1);",
-            "INSERT INTO test (a,b) VALUES (2,2);",
-            "INSERT INTO test (a,b) VALUES (3,3);",
-            "INSERT INTO moredata (x, y) VALUES (4,4);",
-            "INSERT INTO moredata (x, y) VALUES (5,5);",
-            "INSERT INTO moredata (x, y) VALUES (6,6);",
-
-            "CREATE TABLE compotable (a int, b int, c text, d text, PRIMARY KEY (a,b,c));",
-            "INSERT INTO compotable (a, b , c , d ) VALUES ( 1,1,'One','match');",
-            "INSERT INTO compotable (a, b , c , d ) VALUES ( 2,2,'Two','match');",
-            "INSERT INTO compotable (a, b , c , d ) VALUES ( 3,3,'Three','match');",
-            "INSERT INTO compotable (a, b , c , d ) VALUES ( 4,4,'Four','match');",
-
-            "create table compmore (id int PRIMARY KEY, x int, y int, z text, data text);",
-            "INSERT INTO compmore (id, x, y, z,data) VALUES (1,5,6,'Fix','nomatch');",
-            "INSERT INTO compmore (id, x, y, z,data) VALUES (2,6,5,'Sive','nomatch');",
-            "INSERT INTO compmore (id, x, y, z,data) VALUES (3,7,7,'Seven','match');",
-            "INSERT INTO compmore (id, x, y, z,data) VALUES (4,8,8,'Eight','match');",
-            "INSERT INTO compmore (id, x, y, z,data) VALUES (5,9,10,'Ninen','nomatch');",
-
-            "CREATE TABLE collectiontable(m text PRIMARY KEY, n map<text, text>);",
-            "UPDATE collectiontable SET n['key1'] = 'value1' WHERE m = 'book1';",
-            "UPDATE collectiontable SET n['key2'] = 'value2' WHERE m = 'book2';",
-            "UPDATE collectiontable SET n['key3'] = 'value3' WHERE m = 'book3';",
-            "UPDATE collectiontable SET n['key4'] = 'value4' WHERE m = 'book4';",
-            "CREATE TABLE nulltable(m text PRIMARY KEY, n map<text, text>);",
-            "UPDATE nulltable SET n['key1'] = 'value1' WHERE m = 'book1';",
-            "UPDATE nulltable SET n['key2'] = 'value2' WHERE m = 'book2';",
-            "UPDATE nulltable SET n['key3'] = 'value3' WHERE m = 'book3';",
-            "UPDATE nulltable SET n['key4'] = 'value4' WHERE m = 'book4';",
-    };
-
-    @BeforeClass
-    public static void setup() throws IOException, ConfigurationException, TException
-    {
-        startCassandra();
-        executeCQLStatements(statements);
-        startHadoopCluster();
-    }
-
-    @Test
-    public void testCqlNativeStorageSchema() throws IOException
-    {
-        //input_cql=select * from cqltable where token(key1) > ? and token(key1) <= ?
-        cqlTableSchemaTest("rows = LOAD 'cql://cql3ks/cqltable?" + defaultParameters + nativeParameters +  "&input_cql=select%20*%20from%20cqltable%20where%20token(key1)%20%3E%20%3F%20and%20token(key1)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-
-        //input_cql=select * from compactcqltable where token(key1) > ? and token(key1) <= ?
-        compactCqlTableSchemaTest("rows = LOAD 'cql://cql3ks/compactcqltable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20compactcqltable%20where%20token(key1)%20%3E%20%3F%20and%20token(key1)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void compactCqlTableSchemaTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple>  it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0).toString(), "key1");
-            Assert.assertEquals(t.get(1), 100);
-            Assert.assertEquals(t.get(2), 10.1f);
-            Assert.assertEquals(3, t.size());
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    private void cqlTableSchemaTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-        Iterator<Tuple> it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0).toString(), "key1");
-            Assert.assertEquals(t.get(1), 111);
-            Assert.assertEquals(t.get(2), 100);
-            Assert.assertEquals(t.get(3), 10.1f);
-            Assert.assertEquals(4, t.size());
-        }
-        else
-        {
-            Assert.fail("Failed to get data for query " + initialQuery);
-        }
-    }
-
-    @Test
-    public void testCqlNativeStorageSingleKeyTable() throws IOException
-    {
-        //input_cql=select * from moredata where token(x) > ? and token(x) <= ?
-        singleKeyTableTest("moretestvalues= LOAD 'cql://cql3ks/moredata?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20moredata%20where%20token(x)%20%3E%20%3F%20and%20token(x)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void singleKeyTableTest(String initialQuery) throws IOException
-    {
-        pig.setBatchOn();
-        pig.registerQuery(initialQuery);
-        pig.registerQuery("insertformat= FOREACH moretestvalues GENERATE TOTUPLE(TOTUPLE('a',x)),TOTUPLE(y);");
-        pig.registerQuery("STORE insertformat INTO 'cql://cql3ks/test?" + defaultParameters + nativeParameters + "&output_query=UPDATE+cql3ks.test+set+b+%3D+%3F' USING CqlNativeStorage();");
-        pig.executeBatch();
-        //(5,5)
-        //(6,6)
-        //(4,4)
-        //(2,2)
-        //(3,3)
-        //(1,1)
-        //input_cql=select * from test where token(a) > ? and token(a) <= ?
-        pig.registerQuery("result= LOAD 'cql://cql3ks/test?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20test%20where%20token(a)%20%3E%20%3F%20and%20token(a)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-        Iterator<Tuple> it = pig.openIterator("result");
-        int count = 0;
-        while (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), t.get(1));
-            count ++;
-        }
-        Assert.assertEquals(6, count);
-    }
-
-    @Test
-    public void testCqlNativeStorageCompositeKeyTable() throws IOException
-    {
-        //input_cql=select * from compmore where token(id) > ? and token(id) <= ?
-        compositeKeyTableTest("moredata= LOAD 'cql://cql3ks/compmore?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20compmore%20where%20token(id)%20%3E%20%3F%20and%20token(id)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void compositeKeyTableTest(String initialQuery) throws IOException
-    {
-        pig.setBatchOn();
-        pig.registerQuery(initialQuery);
-        pig.registerQuery("insertformat = FOREACH moredata GENERATE TOTUPLE (TOTUPLE('a',x),TOTUPLE('b',y), TOTUPLE('c',z)),TOTUPLE(data);");
-        pig.registerQuery("STORE insertformat INTO 'cql://cql3ks/compotable?" + defaultParameters + nativeParameters + "&output_query=UPDATE%20cql3ks.compotable%20SET%20d%20%3D%20%3F' USING CqlNativeStorage();");
-        pig.executeBatch();
-
-        //(5,6,Fix,nomatch)
-        //(3,3,Three,match)
-        //(1,1,One,match)
-        //(2,2,Two,match)
-        //(7,7,Seven,match)
-        //(8,8,Eight,match)
-        //(6,5,Sive,nomatch)
-        //(4,4,Four,match)
-        //(9,10,Ninen,nomatch)
-        //input_cql=select * from compotable where token(a) > ? and token(a) <= ?
-        pig.registerQuery("result= LOAD 'cql://cql3ks/compotable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20compotable%20where%20token(a)%20%3E%20%3F%20and%20token(a)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-        Iterator<Tuple> it = pig.openIterator("result");
-        int count = 0;
-        while (it.hasNext()) {
-            it.next();
-            count ++;
-        }
-        Assert.assertEquals(count, 9);
-    }
-
-    @Test
-    public void testCqlNativeStorageCollectionColumnTable() throws IOException
-    {
-        //input_cql=select * from collectiontable where token(m) > ? and token(m) <= ?
-        CollectionColumnTableTest("collectiontable= LOAD 'cql://cql3ks/collectiontable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20collectiontable%20where%20token(m)%20%3E%20%3F%20and%20token(m)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void CollectionColumnTableTest(String initialQuery) throws IOException
-    {
-        pig.setBatchOn();
-        pig.registerQuery(initialQuery);
-        pig.registerQuery("recs= FOREACH collectiontable GENERATE TOTUPLE(TOTUPLE('m', m) ), TOTUPLE(TOTUPLE('map', TOTUPLE('m', 'mm'), TOTUPLE('n', 'nn')));");
-        pig.registerQuery("STORE recs INTO 'cql://cql3ks/collectiontable?" + defaultParameters + nativeParameters + "&output_query=update+cql3ks.collectiontable+set+n+%3D+%3F' USING CqlNativeStorage();");
-        pig.executeBatch();
-
-        //(book2,((m,mm),(n,nn)))
-        //(book3,((m,mm),(n,nn)))
-        //(book4,((m,mm),(n,nn)))
-        //(book1,((m,mm),(n,nn)))
-        //input_cql=select * from collectiontable where token(m) > ? and token(m) <= ?
-        pig.registerQuery("result= LOAD 'cql://cql3ks/collectiontable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20collectiontable%20where%20token(m)%20%3E%20%3F%20and%20token(m)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-        Iterator<Tuple> it = pig.openIterator("result");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Tuple t1 = (Tuple) t.get(1);
-            Assert.assertEquals(t1.size(), 2);
-            Tuple element1 = (Tuple) t1.get(0);
-            Tuple element2 = (Tuple) t1.get(1);
-            Assert.assertEquals(element1.get(0), "m");
-            Assert.assertEquals(element1.get(1), "mm");
-            Assert.assertEquals(element2.get(0), "n");
-            Assert.assertEquals(element2.get(1), "nn");
-        }
-        else
-        {
-            Assert.fail("Can't fetch any data");
-        }
-    }
-
-    @Test
-    public void testCqlNativeStorageNullTuples() throws IOException
-    {
-        //input_cql=select * from collectiontable where token(m) > ? and token(m) <= ?
-        NullTupleTest("nulltable= LOAD 'cql://cql3ks/collectiontable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20nulltable%20where%20token(m)%20%3E%20%3F%20and%20token(m)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-    }
-
-    private void NullTupleTest(String initialQuery) throws IOException
-    {
-        pig.setBatchOn();
-        pig.registerQuery(initialQuery);
-        pig.registerQuery("recs= FOREACH nulltable GENERATE TOTUPLE(TOTUPLE('m', m) ), TOTUPLE(TOTUPLE('map', TOTUPLE('m', null), TOTUPLE('n', null)));");
-        pig.registerQuery("STORE recs INTO 'cql://cql3ks/nulltable?" + defaultParameters + nativeParameters + "&output_query=update+cql3ks.nulltable+set+n+%3D+%3F' USING CqlNativeStorage();");
-        pig.executeBatch();
-
-        pig.registerQuery("result= LOAD 'cql://cql3ks/nulltable?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20nulltable%20where%20token(m)%20%3E%20%3F%20and%20token(m)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-        Iterator<Tuple> it = pig.openIterator("result");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Tuple t1 = (Tuple) t.get(1);
-            Assert.assertEquals(t1.size(), 2);
-            Tuple element1 = (Tuple) t1.get(0);
-            Tuple element2 = (Tuple) t1.get(1);
-            Assert.assertEquals(element1.get(0), "m");
-            Assert.assertEquals(element1.get(1), "");
-            Assert.assertEquals(element2.get(0), "n");
-            Assert.assertEquals(element2.get(1), "");
-        }
-        else
-        {
-            Assert.fail("Can't fetch any data");
-        }
-    }
-
-    @Test
-    public void testCqlStorageSingleKeyTableBulkLoad() throws TException, IOException
-    {
-        pig.setBatchOn();
-        //input_cql=select * from moredata where token(x) > ? and token(x) <= ?
-        pig.registerQuery("moretestvalues= LOAD 'cql://cql3ks/moredata?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20moredata%20where%20token(x)%20%3E%20%3F%20and%20token(x)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-        pig.registerQuery("insertformat= FOREACH moretestvalues GENERATE TOTUPLE(x, y);");
-        pig.registerQuery("STORE insertformat INTO 'cql://cql3ks/test_bulk?" + defaultParameters + nativeParameters +  "&bulk_output_format=true&bulk_cf_schema=CREATE%20TABLE%20cql3ks.test_bulk%20(a%20int%20PRIMARY%20KEY%2C%20b%20int)&bulk_insert_statement=Insert%20into%20cql3ks.test_bulk(a%2C%20b)%20values(%3F%2C%3F)' USING CqlNativeStorage();");
-        pig.executeBatch();
-
-        //(5,5)
-        //(6,6)
-        //(4,4)
-        //(2,2)
-        //(3,3)
-        //(1,1)
-        //input_cql=select * from test_bulk1 where token(a) > ? and token(a) <= ?
-        pig.registerQuery("result= LOAD 'cql://cql3ks/test_bulk?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20test_bulk%20where%20token(a)%20%3E%20%3F%20and%20token(a)%20%3C%3D%20%3F' USING CqlNativeStorage();");
-        Iterator<Tuple> it = pig.openIterator("result");
-        int count = 0;
-        while (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), t.get(1));
-            count ++;
-        }
-        Assert.assertEquals(6, count);
-     }
-}

diff --git a/test/pig/org/apache/cassandra/pig/PigTestBase.java b/test/pig/org/apache/cassandra/pig/PigTestBase.java
deleted file mode 100644
index a8a9de5..0000000
--- a/test/pig/org/apache/cassandra/pig/PigTestBase.java
+++ /dev/null

@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.pig;
-
-import java.io.IOException;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.TypeParser;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.service.EmbeddedCassandraService;
-import org.apache.cassandra.thrift.Cassandra;
-import org.apache.cassandra.thrift.Compression;
-import org.apache.cassandra.thrift.ConsistencyLevel;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.pig.ExecType;
-import org.apache.pig.PigServer;
-import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
-import org.apache.pig.impl.PigContext;
-import org.apache.pig.test.MiniCluster;
-import org.apache.thrift.TException;
-import org.apache.thrift.protocol.TBinaryProtocol;
-import org.apache.thrift.protocol.TProtocol;
-import org.apache.thrift.transport.TFramedTransport;
-import org.apache.thrift.transport.TSocket;
-import org.apache.thrift.transport.TTransport;
-import org.apache.thrift.transport.TTransportException;
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.Before;
-
-public class PigTestBase extends SchemaLoader
-{
-    protected static EmbeddedCassandraService cassandra;
-    protected static Configuration conf;
-    protected static MiniCluster cluster; 
-    protected static PigServer pig;
-    protected static String defaultParameters= "init_address=localhost&rpc_port=9170&partitioner=org.apache.cassandra.dht.Murmur3Partitioner";
-    protected static String nativeParameters = "&core_conns=2&max_conns=10&min_simult_reqs=3&max_simult_reqs=10&native_timeout=10000000"  +
-                                               "&native_read_timeout=10000000&send_buff_size=4096&receive_buff_size=4096&solinger=3" +
-                                               "&tcp_nodelay=true&reuse_address=true&keep_alive=true&native_port=9042";
-
-    static
-    {
-        System.setProperty("logback.configurationFile", "logback-test.xml");
-        System.setProperty("cassandra.config", "cassandra_pig.yaml");
-    }
-
-    @AfterClass
-    public static void oneTimeTearDown() throws Exception {
-        cluster.shutDown();
-    }
-
-    @Before
-    public void beforeTest() throws Exception {
-        pig = new PigServer(new PigContext(ExecType.LOCAL, ConfigurationUtil.toProperties(conf)));
-        PigContext.initializeImportList("org.apache.cassandra.hadoop.pig");   
-    }
-
-    @After
-    public void tearDown() throws Exception {
-        pig.shutdown();
-    }
-
-    protected static Cassandra.Client getClient() throws TTransportException
-    {
-        TTransport tr = new TFramedTransport(new TSocket("localhost", 9170));
-        TProtocol proto = new TBinaryProtocol(tr);
-        Cassandra.Client client = new Cassandra.Client(proto);
-        tr.open();
-        return client;
-    }
-
-    protected static void startCassandra() throws IOException
-    {
-        Schema.instance.clear(); // Schema are now written on disk and will be reloaded
-        cassandra = new EmbeddedCassandraService();
-        cassandra.start();
-    }
-
-    protected static void startHadoopCluster()
-    {
-        cluster = MiniCluster.buildCluster();
-        conf = cluster.getConfiguration();
-    }
-
-    protected AbstractType parseType(String type) throws IOException
-    {
-        try
-        {
-            return TypeParser.parse(type);
-        }
-        catch (ConfigurationException | SyntaxException e)
-        {
-            throw new IOException(e);
-        }
-    }
-
-    protected static void executeCQLStatements(String[] statements) throws TException
-    {
-        Cassandra.Client client = getClient();
-
-        for (String statement : statements)
-        {
-            System.out.println("Executing statement: " + statement);
-            client.execute_cql3_query(ByteBufferUtil.bytes(statement), Compression.NONE, ConsistencyLevel.ONE);
-        }
-    }
-}

diff --git a/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyDataTypeTest.java b/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyDataTypeTest.java
deleted file mode 100644
index 3ddb94e..0000000
--- a/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyDataTypeTest.java
+++ /dev/null

@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.pig;
-
-import java.io.IOException;
-
-import org.apache.cassandra.db.marshal.TimeUUIDType;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.utils.Hex;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.Tuple;
-import org.apache.thrift.TException;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static junit.framework.Assert.assertEquals;
-
-public class ThriftColumnFamilyDataTypeTest extends PigTestBase
-{
-    private static String[] statements = {
-            "DROP KEYSPACE IF EXISTS thrift_ks",
-            "CREATE KEYSPACE thrift_ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};",
-            "USE thrift_ks;",
-
-            "CREATE TABLE some_app (" +
-            "key text PRIMARY KEY," +
-            "col_ascii ascii," +
-            "col_bigint bigint," +
-            "col_blob blob," +
-            "col_boolean boolean," +
-            "col_decimal decimal," +
-            "col_double double," +
-            "col_float float," +
-            "col_inet inet," +
-            "col_int int," +
-            "col_text text," +
-            "col_timestamp timestamp," +
-            "col_timeuuid timeuuid," +
-            "col_uuid uuid," +
-            "col_varint varint)" +
-            " WITH COMPACT STORAGE;",
-
-            "INSERT INTO some_app (key, col_ascii, col_bigint, col_blob, col_boolean, col_decimal, col_double, col_float," +
-                "col_inet, col_int, col_text, col_timestamp, col_uuid, col_varint, col_timeuuid) " +
-                    "VALUES ('foo', 'ascii', 12345678, 0xDEADBEEF, false, 23.345, 2.7182818284590451, 23.45, '127.0.0.1', 23, 'hello', " +
-                        "'2011-02-03T04:05:00+0000', 550e8400-e29b-41d4-a716-446655440000, 12345, e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f);",
-
-            "CREATE TABLE cc (key text, name text, value counter, PRIMARY KEY (key, name)) WITH COMPACT STORAGE",
-
-            "UPDATE cc SET value = value + 3 WHERE key = 'chuck' AND name = 'kick'",
-    };
-
-    @BeforeClass
-    public static void setup() throws IOException, ConfigurationException, TException
-    {
-        startCassandra();
-        executeCQLStatements(statements);
-        startHadoopCluster();
-    }
-
-    @Test
-    public void testCassandraStorageDataType() throws IOException
-    {
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-        Tuple t = pig.openIterator("rows").next();
-
-        // key
-        assertEquals("foo", t.get(0));
-
-        // col_ascii
-        Tuple column = (Tuple) t.get(1);
-        assertEquals("ascii", column.get(1));
-
-        // col_bigint
-        column = (Tuple) t.get(2);
-        assertEquals(12345678L, column.get(1));
-
-        // col_blob
-        column = (Tuple) t.get(3);
-        assertEquals(new DataByteArray(Hex.hexToBytes("DEADBEEF")), column.get(1));
-
-        // col_boolean
-        column = (Tuple) t.get(4);
-        assertEquals(false, column.get(1));
-
-        // col_decimal
-        column = (Tuple) t.get(5);
-        assertEquals("23.345", column.get(1));
-
-        // col_double
-        column = (Tuple) t.get(6);
-        assertEquals(2.7182818284590451d, column.get(1));
-
-        // col_float
-        column = (Tuple) t.get(7);
-        assertEquals(23.45f, column.get(1));
-
-        // col_inet
-        column = (Tuple) t.get(8);
-        assertEquals("127.0.0.1", column.get(1));
-
-        // col_int
-        column = (Tuple) t.get(9);
-        assertEquals(23, column.get(1));
-
-        // col_text
-        column = (Tuple) t.get(10);
-        assertEquals("hello", column.get(1));
-
-        // col_timestamp
-        column = (Tuple) t.get(11);
-        assertEquals(1296705900000L, column.get(1));
-
-        // col_timeuuid
-        column = (Tuple) t.get(12);
-        assertEquals(new DataByteArray((TimeUUIDType.instance.fromString("e23f450f-53a6-11e2-7f7f-7f7f7f7f7f7f").array())), column.get(1));
-
-        // col_uuid
-        column = (Tuple) t.get(13);
-        assertEquals(new DataByteArray((UUIDType.instance.fromString("550e8400-e29b-41d4-a716-446655440000").array())), column.get(1));
-
-        // col_varint
-        column = (Tuple) t.get(14);
-        assertEquals(12345, column.get(1));
-
-        pig.registerQuery("cc_rows = LOAD 'cassandra://thrift_ks/cc?" + defaultParameters + "' USING CassandraStorage();");
-        t = pig.openIterator("cc_rows").next();
-
-        assertEquals("chuck", t.get(0));
-
-        DataBag columns = (DataBag) t.get(1);
-        column = columns.iterator().next();
-        assertEquals("kick", column.get(0));
-        assertEquals(3L, column.get(1));
-    }
-}

diff --git a/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java b/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java
deleted file mode 100644
index 60d04d3..0000000
--- a/test/pig/org/apache/cassandra/pig/ThriftColumnFamilyTest.java
+++ /dev/null

@@ -1,727 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.pig;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.thrift.Cassandra;
-import org.apache.cassandra.thrift.ColumnOrSuperColumn;
-import org.apache.cassandra.thrift.ColumnPath;
-import org.apache.cassandra.thrift.ConsistencyLevel;
-import org.apache.cassandra.thrift.NotFoundException;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.Tuple;
-import org.apache.thrift.TException;
-
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class ThriftColumnFamilyTest extends PigTestBase
-{    
-    private static String[] statements = {
-            "DROP KEYSPACE IF EXISTS thrift_ks",
-            "CREATE KEYSPACE thrift_ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};",
-            "USE thrift_ks;",
-
-            "CREATE TABLE some_app (" +
-            "key text PRIMARY KEY," +
-            "name text," +
-            "vote_type text," +
-            "rating int," +
-            "score bigint," +
-            "percent float," +
-            "atomic_weight double," +
-            "created timestamp)" +
-            " WITH COMPACT STORAGE;",
-
-            "CREATE INDEX ON some_app(name);",
-
-            "INSERT INTO some_app (key, name, vote_type, rating, score, percent, atomic_weight, created) " +
-                    "VALUES ('foo', 'User Foo', 'like', 8, 125000, 85.0, 2.7182818284590451, 1335890877);",
-
-            "INSERT INTO some_app (key, name, vote_type, rating, score, percent, atomic_weight, created) " +
-                    "VALUES ('bar', 'User Bar', 'like', 9, 15000, 35.0, 3.1415926535897931, 1335890877);",
-
-            "INSERT INTO some_app (key, name, vote_type, rating, score, percent, atomic_weight, created) " +
-                    "VALUES ('baz', 'User Baz', 'dislike', 3, 512000, 95.3, 1.61803399, 1335890877);",
-
-            "INSERT INTO some_app (key, name, vote_type, rating, score, percent, atomic_weight, created) " +
-                    "VALUES ('qux', 'User Qux', 'dislike', 2, 12000, 64.7, 0.660161815846869, 1335890877);",
-
-            "CREATE TABLE copy_of_some_app (" +
-            "key text PRIMARY KEY," +
-            "name text," +
-            "vote_type text," +
-            "rating int," +
-            "score bigint," +
-            "percent float," +
-            "atomic_weight double," +
-            "created timestamp)" +
-            " WITH COMPACT STORAGE;",
-
-            "CREATE INDEX ON copy_of_some_app(name);",
-
-            "CREATE TABLE u8 (" +
-            "key text," +
-            "column1 text," +
-            "value blob," +
-            "PRIMARY KEY (key, column1))" +
-            " WITH COMPACT STORAGE",
-
-            "INSERT INTO u8 (key, column1, value) VALUES ('foo', 'x', asciiAsBlob('Z'))",
-
-            "CREATE TABLE bytes (" +
-            "key blob," +
-            "column1 text," +
-            "value blob," +
-            "PRIMARY KEY (key, column1))" +
-            " WITH COMPACT STORAGE",
-
-            "INSERT INTO bytes (key, column1, value) VALUES (asciiAsBlob('foo'), 'x', asciiAsBlob('Z'))",
-
-            "CREATE TABLE cc (key text, name text, value counter, PRIMARY KEY (key, name)) WITH COMPACT STORAGE",
-
-            "UPDATE cc SET value = value + 3 WHERE key = 'chuck' AND name = 'kick'",
-            "UPDATE cc SET value = value + 1 WHERE key = 'chuck' AND name = 'fist'",
-
-            "CREATE TABLE compo (" +
-            "key text," +
-            "column1 text," +
-            "column2 text," +
-            "value text," +
-            "PRIMARY KEY (key, column1, column2))" +
-            " WITH COMPACT STORAGE",
-
-            "INSERT INTO compo (key, column1, column2, value) VALUES ('punch', 'bruce', 'lee', 'ouch');",
-            "INSERT INTO compo (key, column1, column2, value) VALUES ('punch', 'bruce', 'bruce', 'hunh?');",
-            "INSERT INTO compo (key, column1, column2, value) VALUES ('kick', 'bruce', 'lee', 'oww');",
-            "INSERT INTO compo (key, column1, column2, value) VALUES ('kick', 'bruce', 'bruce', 'watch it, mate');",
-
-            "CREATE TABLE compo_int (" +
-            "key text," +
-            "column1 bigint," +
-            "column2 bigint," +
-            "value text," +
-            "PRIMARY KEY (key, column1, column2))" +
-            " WITH COMPACT STORAGE",
-
-            "INSERT INTO compo_int (key, column1, column2, value) VALUES ('clock', 1, 0, 'z');",
-            "INSERT INTO compo_int (key, column1, column2, value) VALUES ('clock', 1, 30, 'zzzz');",
-            "INSERT INTO compo_int (key, column1, column2, value) VALUES ('clock', 2, 30, 'daddy?');",
-            "INSERT INTO compo_int (key, column1, column2, value) VALUES ('clock', 6, 30, 'coffee...');",
-
-            "CREATE TABLE compo_int_copy (" +
-            "key text," +
-            "column1 bigint," +
-            "column2 bigint," +
-            "value text," +
-            "PRIMARY KEY (key, column1, column2))" +
-            " WITH COMPACT STORAGE",
-
-            "CREATE TABLE compo_key (" +
-            "key text," +
-            "column1 bigint," +
-            "column2 bigint," +
-            "value text," +
-            "PRIMARY KEY ((key, column1), column2))" +
-            " WITH COMPACT STORAGE",
-
-            "INSERT INTO compo_key (key, column1, column2, value) VALUES ('clock', 10, 1, 'z');",
-            "INSERT INTO compo_key (key, column1, column2, value) VALUES ('clock', 20, 1, 'zzzz');",
-            "INSERT INTO compo_key (key, column1, column2, value) VALUES ('clock', 30, 2, 'daddy?');",
-            "INSERT INTO compo_key (key, column1, column2, value) VALUES ('clock', 40, 6, 'coffee...');",
-
-            "CREATE TABLE compo_key_copy (" +
-            "key text," +
-            "column1 bigint," +
-            "column2 bigint," +
-            "value text," +
-            "PRIMARY KEY ((key, column1), column2))" +
-            " WITH COMPACT STORAGE",
-    };
-
-    private static String[] deleteCopyOfSomeAppTableData = {
-            "use thrift_ks;",
-            "DELETE FROM copy_of_some_app WHERE key = 'foo';",
-            "DELETE FROM copy_of_some_app WHERE key = 'bar';",
-            "DELETE FROM copy_of_some_app WHERE key = 'baz';",
-            "DELETE FROM copy_of_some_app WHERE key = 'qux';",
-    };
-
-    @BeforeClass
-    public static void setup() throws IOException, ConfigurationException, TException
-    {
-        startCassandra();
-        executeCQLStatements(statements);
-        startHadoopCluster();
-    }
-
-    @Test
-    public void testCqlNativeStorage() throws IOException
-    {
-        //regular thrift column families
-        //input_cql=select * from "some_app" where token(key) > ? and token(key) <= ?
-        cqlStorageTest("data = load 'cql://thrift_ks/some_app?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20%22some_app%22%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' using CqlNativeStorage();");
-
-        //Test counter column family
-        //input_cql=select * from "cc" where token(key) > ? and token(key) <= ?
-        cqlStorageCounterTableTest("cc_data = load 'cql://thrift_ks/cc?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20%22cc%22%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' using CqlNativeStorage();");
-
-        //Test composite column family
-        //input_cql=select * from "compo" where token(key) > ? and token(key) <= ?
-        cqlStorageCompositeTableTest("compo_data = load 'cql://thrift_ks/compo?" + defaultParameters + nativeParameters + "&input_cql=select%20*%20from%20%22compo%22%20where%20token(key)%20%3E%20%3F%20and%20token(key)%20%3C%3D%20%3F' using CqlNativeStorage();");
-    }
-
-    private void cqlStorageTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-
-        //(bar,3.141592653589793,1335890877,User Bar,35.0,9,15000,like)
-        //(baz,1.61803399,1335890877,User Baz,95.3,3,512000,dislike)
-        //(foo,2.718281828459045,1335890877,User Foo,85.0,8,125000,like)
-        //(qux,0.660161815846869,1335890877,User Qux,64.7,2,12000,dislike)
-
-        //{key: chararray,atomic_weight: double,created: long,name: chararray,percent: float,rating: int,score: long,vote_type: chararray}
-        Iterator<Tuple> it = pig.openIterator("data");
-        int count = 0;
-        while (it.hasNext()) {
-            count ++;
-            Tuple t = it.next();
-            if ("bar".equals(t.get(0)))
-            {
-                Assert.assertEquals(t.get(1), 3.141592653589793d);
-                Assert.assertEquals(t.get(3), "User Bar");
-                Assert.assertEquals(t.get(4), 35.0f);
-                Assert.assertEquals(t.get(5), 9);
-                Assert.assertEquals(t.get(6), 15000L);
-                Assert.assertEquals(t.get(7), "like");
-            }
-            else if ("baz".equals(t.get(0)))
-            {
-                Assert.assertEquals(t.get(1), 1.61803399d);
-                Assert.assertEquals(t.get(3), "User Baz");
-                Assert.assertEquals(t.get(4), 95.3f);
-                Assert.assertEquals(t.get(5), 3);
-                Assert.assertEquals(t.get(6), 512000L);
-                Assert.assertEquals(t.get(7), "dislike");
-            }
-            else if ("foo".equals(t.get(0)))
-            {
-                Assert.assertEquals(t.get(0), "foo");
-                Assert.assertEquals(t.get(1), 2.718281828459045d);
-                Assert.assertEquals(t.get(3), "User Foo");
-                Assert.assertEquals(t.get(4), 85.0f);
-                Assert.assertEquals(t.get(5), 8);
-                Assert.assertEquals(t.get(6), 125000L);
-                Assert.assertEquals(t.get(7), "like");
-            }
-            else if ("qux".equals(t.get(0)))
-            {
-                Assert.assertEquals(t.get(0), "qux");
-                Assert.assertEquals(t.get(1), 0.660161815846869d);
-                Assert.assertEquals(t.get(3), "User Qux");
-                Assert.assertEquals(t.get(4), 64.7f);
-                Assert.assertEquals(t.get(5), 2);
-                Assert.assertEquals(t.get(6), 12000L);
-                Assert.assertEquals(t.get(7), "dislike");
-            }
-        }
-        Assert.assertEquals(count, 4);
-    }
-
-    private void cqlStorageCounterTableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-
-        //(chuck,fist,1)
-        //(chuck,kick,3)
-
-        // {key: chararray,column1: chararray,value: long}
-        Iterator<Tuple> it = pig.openIterator("cc_data");
-        int count = 0;
-        while (it.hasNext()) {
-            count ++;
-            Tuple t = it.next();
-            if ("chuck".equals(t.get(0)) && "fist".equals(t.get(1)))
-                Assert.assertEquals(t.get(2), 1L);
-            else if ("chuck".equals(t.get(0)) && "kick".equals(t.get(1)))
-                Assert.assertEquals(t.get(2), 3L);
-        }
-        Assert.assertEquals(count, 2);
-    }
-
-    private void cqlStorageCompositeTableTest(String initialQuery) throws IOException
-    {
-        pig.registerQuery(initialQuery);
-
-        //(kick,bruce,bruce,watch it, mate)
-        //(kick,bruce,lee,oww)
-        //(punch,bruce,bruce,hunh?)
-        //(punch,bruce,lee,ouch)
-
-        //{key: chararray,column1: chararray,column2: chararray,value: chararray}
-        Iterator<Tuple> it = pig.openIterator("compo_data");
-        int count = 0;
-        while (it.hasNext()) {
-            count ++;
-            Tuple t = it.next();
-            if ("kick".equals(t.get(0)) && "bruce".equals(t.get(1)) && "bruce".equals(t.get(2)))
-                Assert.assertEquals(t.get(3), "watch it, mate");
-            else if ("kick".equals(t.get(0)) && "bruce".equals(t.get(1)) && "lee".equals(t.get(2)))
-                Assert.assertEquals(t.get(3), "oww");
-            else if ("punch".equals(t.get(0)) && "bruce".equals(t.get(1)) && "bruce".equals(t.get(2)))
-                Assert.assertEquals(t.get(3), "hunh?");
-            else if ("punch".equals(t.get(0)) && "bruce".equals(t.get(1)) && "lee".equals(t.get(2)))
-                Assert.assertEquals(t.get(3), "ouch");
-        }
-        Assert.assertEquals(count, 4);
-    }
-
-    @Test
-    public void testCassandraStorageSchema() throws IOException
-    {
-        //results: (qux,(atomic_weight,0.660161815846869),(created,1335890877),(name,User Qux),(percent,64.7),
-        //(rating,2),(score,12000),(vote_type,dislike))
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-
-        //schema: {key: chararray,atomic_weight: (name: chararray,value: double),created: (name: chararray,value: long),
-        //name: (name: chararray,value: chararray),percent: (name: chararray,value: float),
-        //rating: (name: chararray,value: int),score: (name: chararray,value: long),
-        //vote_type: (name: chararray,value: chararray),columns: {(name: chararray,value: chararray)}}
-        Iterator<Tuple> it = pig.openIterator("rows");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            String rowKey =  t.get(0).toString();
-            if ("qux".equals(rowKey))
-            {
-                Tuple column = (Tuple) t.get(1);
-                Assert.assertEquals(column.get(0), "atomic_weight");
-                Assert.assertEquals(column.get(1), 0.660161815846869d);
-                column = (Tuple) t.get(3);
-                Assert.assertEquals(column.get(0), "name");
-                Assert.assertEquals(column.get(1), "User Qux");
-                column = (Tuple) t.get(4);
-                Assert.assertEquals(column.get(0), "percent");
-                Assert.assertEquals(column.get(1), 64.7f);
-                column = (Tuple) t.get(5);
-                Assert.assertEquals(column.get(0), "rating");
-                Assert.assertEquals(column.get(1), 2);
-                column = (Tuple) t.get(6);
-                Assert.assertEquals(column.get(0), "score");
-                Assert.assertEquals(column.get(1), 12000L);
-                column = (Tuple) t.get(7);
-                Assert.assertEquals(column.get(0), "vote_type");
-                Assert.assertEquals(column.get(1), "dislike");
-            }
-        }
-    }
-
-    @Test
-    public void testCassandraStorageFullCopy() throws IOException, TException
-    {
-        pig.setBatchOn();
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-        //full copy
-        pig.registerQuery("STORE rows INTO 'cassandra://thrift_ks/copy_of_some_app?" + defaultParameters + "' USING CassandraStorage();");
-        pig.executeBatch();
-        Assert.assertEquals("User Qux", getColumnValue("thrift_ks", "copy_of_some_app", "name", "qux", "UTF8Type"));
-        Assert.assertEquals("dislike", getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "qux", "UTF8Type"));
-        Assert.assertEquals("64.7", getColumnValue("thrift_ks", "copy_of_some_app", "percent", "qux", "FloatType"));
-    }
-
-    @Test
-    public void testCassandraStorageSingleTupleCopy() throws IOException, TException
-    {
-        executeCQLStatements(deleteCopyOfSomeAppTableData);
-        pig.setBatchOn();
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-        //single tuple
-        pig.registerQuery("onecol = FOREACH rows GENERATE key, percent;");
-        pig.registerQuery("STORE onecol INTO 'cassandra://thrift_ks/copy_of_some_app?" + defaultParameters + "' USING CassandraStorage();");
-        pig.executeBatch();
-        String value = null;
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "name", "qux", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "qux", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-        Assert.assertEquals("64.7", getColumnValue("thrift_ks", "copy_of_some_app", "percent", "qux", "FloatType"));
-    }
-
-    @Test
-    public void testCassandraStorageBagOnlyCopy() throws IOException, TException
-    {
-        executeCQLStatements(deleteCopyOfSomeAppTableData);
-        pig.setBatchOn();
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-        //bag only
-        pig.registerQuery("other = FOREACH rows GENERATE key, columns;");
-        pig.registerQuery("STORE other INTO 'cassandra://thrift_ks/copy_of_some_app?" + defaultParameters + "' USING CassandraStorage();");
-        pig.executeBatch();
-        String value = null;
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "name", "qux", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "qux", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "percent", "qux", "FloatType");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-    }
-
-    @Test
-    public void testCassandraStorageFilter() throws IOException, TException
-    {
-        executeCQLStatements(deleteCopyOfSomeAppTableData);
-        pig.setBatchOn();
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-
-        //filter
-        pig.registerQuery("likes = FILTER rows by vote_type.value eq 'like' and rating.value > 5;");
-        pig.registerQuery("STORE likes INTO 'cassandra://thrift_ks/copy_of_some_app?" + defaultParameters + "' USING CassandraStorage();");
-        pig.executeBatch();
-
-        Assert.assertEquals("like", getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "bar", "UTF8Type"));
-        Assert.assertEquals("like", getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "foo", "UTF8Type"));
-        String value = null;
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "qux", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "baz", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-
-        executeCQLStatements(deleteCopyOfSomeAppTableData);
-        pig.setBatchOn();
-        pig.registerQuery("rows = LOAD 'cassandra://thrift_ks/some_app?" + defaultParameters + "' USING CassandraStorage();");
-        pig.registerQuery("dislikes_extras = FILTER rows by vote_type.value eq 'dislike';");
-        pig.registerQuery("STORE dislikes_extras INTO 'cassandra://thrift_ks/copy_of_some_app?" + defaultParameters + "' USING CassandraStorage();");
-        pig.executeBatch();
-        Assert.assertEquals("dislike", getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "baz", "UTF8Type"));
-        Assert.assertEquals("dislike", getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "qux", "UTF8Type"));
-        value = null;
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "bar", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-        try
-        {
-            value = getColumnValue("thrift_ks", "copy_of_some_app", "vote_type", "foo", "UTF8Type");
-        }
-        catch (NotFoundException e)
-        {
-            Assert.assertTrue(true);
-        }
-        if (value != null)
-            Assert.fail();
-    }
-
-    @Test
-    public void testCassandraStorageJoin() throws IOException
-    {
-        //test key types with a join
-        pig.registerQuery("U8 = load 'cassandra://thrift_ks/u8?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("Bytes = load 'cassandra://thrift_ks/bytes?" + defaultParameters + "' using CassandraStorage();");
-
-        //cast key to chararray
-        pig.registerQuery("b = foreach Bytes generate (chararray)key, columns;");
-
-        //key in Bytes is a bytearray, U8 chararray
-        //(foo,{(x,Z)},foo,{(x,Z)})
-        pig.registerQuery("a = join Bytes by key, U8 by key;");
-        Iterator<Tuple> it = pig.openIterator("a");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), new DataByteArray("foo".getBytes()));
-            DataBag columns = (DataBag) t.get(1);
-            Iterator<Tuple> iter = columns.iterator();
-            Tuple t1 = iter.next();
-            Assert.assertEquals(t1.get(0), "x");
-            Assert.assertEquals(t1.get(1), new DataByteArray("Z".getBytes()));
-            String column = (String) t.get(2);
-            Assert.assertEquals(column, "foo");
-            columns = (DataBag) t.get(3);
-            iter = columns.iterator();
-            Tuple t2 = iter.next();
-            Assert.assertEquals(t2.get(0), "x");
-            Assert.assertEquals(t2.get(1), new DataByteArray("Z".getBytes()));
-        }
-        //key should now be cast into a chararray
-        //(foo,{(x,Z)},foo,{(x,Z)})
-        pig.registerQuery("c = join b by (chararray)key, U8 by (chararray)key;");
-        it = pig.openIterator("c");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), "foo");
-            DataBag columns = (DataBag) t.get(1);
-            Iterator<Tuple> iter = columns.iterator();
-            Tuple t1 = iter.next();
-            Assert.assertEquals(t1.get(0), "x");
-            Assert.assertEquals(t1.get(1), new DataByteArray("Z".getBytes()));
-            String column = (String) t.get(2);
-            Assert.assertEquals(column, "foo");
-            columns = (DataBag) t.get(3);
-            iter = columns.iterator();
-            Tuple t2 = iter.next();
-            Assert.assertEquals(t2.get(0), "x");
-            Assert.assertEquals(t2.get(1), new DataByteArray("Z".getBytes()));
-        }
-    }
-
-    @Test
-    public void testCassandraStorageCounterCF() throws IOException
-    {
-        //Test counter column family support
-        pig.registerQuery("CC = load 'cassandra://thrift_ks/cc?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("total_hits = foreach CC generate key, SUM(columns.value);");
-        //(chuck,4)
-        Tuple t = pig.openIterator("total_hits").next();
-        Assert.assertEquals(t.get(0), "chuck");
-        Assert.assertEquals(t.get(1), 4l);
-    }
-
-    @Test
-    public void testCassandraStorageCompositeColumnCF() throws IOException
-    {
-        //Test CompositeType
-        pig.registerQuery("compo = load 'cassandra://thrift_ks/compo?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("compo = foreach compo generate key as method, flatten(columns);");
-        pig.registerQuery("lee = filter compo by columns::name == ('bruce','lee');");
-
-        //(kick,(bruce,lee),oww)
-        //(punch,(bruce,lee),ouch)
-        Iterator<Tuple> it = pig.openIterator("lee");
-        int count = 0;
-        while (it.hasNext()) {
-            count ++;
-            Tuple t = it.next();
-            Tuple t1 = (Tuple) t.get(1);
-            Assert.assertEquals(t1.get(0), "bruce");
-            Assert.assertEquals(t1.get(1), "lee");
-            if ("kick".equals(t.get(0)))
-                Assert.assertEquals(t.get(2), "oww");
-            else if ("kick".equals(t.get(0)))
-                Assert.assertEquals(t.get(2), "ouch");
-        }
-        Assert.assertEquals(count, 2);
-        pig.registerQuery("night = load 'cassandra://thrift_ks/compo_int?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("night = foreach night generate flatten(columns);");
-        pig.registerQuery("night = foreach night generate (int)columns::name.$0+(double)columns::name.$1/60 as hour, columns::value as noise;");
-
-        //What happens at the darkest hour?
-        pig.registerQuery("darkest = filter night by hour > 2 and hour < 5;");
-
-        //(2.5,daddy?)
-        it = pig.openIterator("darkest");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), 2.5d);
-            Assert.assertEquals(t.get(1), "daddy?");
-        }
-        pig.setBatchOn();
-        pig.registerQuery("compo_int_rows = LOAD 'cassandra://thrift_ks/compo_int?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("STORE compo_int_rows INTO 'cassandra://thrift_ks/compo_int_copy?" + defaultParameters + "' using CassandraStorage();");
-        pig.executeBatch();
-        pig.registerQuery("compocopy_int_rows = LOAD 'cassandra://thrift_ks/compo_int_copy?" + defaultParameters + "' using CassandraStorage();");
-        //(clock,{((1,0),z),((1,30),zzzz),((2,30),daddy?),((6,30),coffee...)})
-        it = pig.openIterator("compocopy_int_rows");
-        count = 0;
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Assert.assertEquals(t.get(0), "clock");
-            DataBag columns = (DataBag) t.get(1);
-            for (Tuple t1 : columns)
-            {
-                count++;
-                Tuple inner = (Tuple) t1.get(0);
-                if ((Long) inner.get(0) == 1L && (Long) inner.get(1) == 0L)
-                    Assert.assertEquals(t1.get(1), "z");
-                else if ((Long) inner.get(0) == 1L && (Long) inner.get(1) == 30L)
-                    Assert.assertEquals(t1.get(1), "zzzz");
-                else if ((Long) inner.get(0) == 2L && (Long) inner.get(1) == 30L)
-                    Assert.assertEquals(t1.get(1), "daddy?");
-                else if ((Long) inner.get(0) == 6L && (Long) inner.get(1) == 30L)
-                    Assert.assertEquals(t1.get(1), "coffee...");
-            }
-            Assert.assertEquals(count, 4);
-        }
-    }
-
-    @Test
-    public void testCassandraStorageCompositeKeyCF() throws IOException
-    {
-        //Test CompositeKey
-        pig.registerQuery("compokeys = load 'cassandra://thrift_ks/compo_key?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("compokeys = filter compokeys by key.$1 == 40;");
-        //((clock,40),{(6,coffee...)})
-        Iterator<Tuple> it = pig.openIterator("compokeys");
-        if (it.hasNext()) {
-            Tuple t = it.next();
-            Tuple key = (Tuple) t.get(0); 
-            Assert.assertEquals(key.get(0), "clock");
-            Assert.assertEquals(key.get(1), 40L);
-            DataBag columns = (DataBag) t.get(1);
-            Iterator<Tuple> iter = columns.iterator();
-            if (iter.hasNext())
-            {
-                Tuple t1 = iter.next();
-                Assert.assertEquals(t1.get(0), 6L);
-                Assert.assertEquals(t1.get(1), "coffee...");
-            }
-        }
-        pig.setBatchOn();
-        pig.registerQuery("compo_key_rows = LOAD 'cassandra://thrift_ks/compo_key?" + defaultParameters + "' using CassandraStorage();");
-        pig.registerQuery("STORE compo_key_rows INTO 'cassandra://thrift_ks/compo_key_copy?" + defaultParameters + "' using CassandraStorage();");
-        pig.executeBatch();
-        pig.registerQuery("compo_key_copy_rows = LOAD 'cassandra://thrift_ks/compo_key_copy?" + defaultParameters + "' using CassandraStorage();");
-        //((clock,10),{(1,z)})
-        //((clock,20),{(1,zzzz)})
-        //((clock,30),{(2,daddy?)})
-        //((clock,40),{(6,coffee...)})
-        it = pig.openIterator("compo_key_copy_rows");
-        int count = 0;
-        while (it.hasNext()) {
-            Tuple t = it.next();
-            count ++;
-            Tuple key = (Tuple) t.get(0); 
-            if ("clock".equals(key.get(0)) && (Long) key.get(1) == 10L)
-            {
-                DataBag columns = (DataBag) t.get(1);
-                Iterator<Tuple> iter = columns.iterator();
-                if (iter.hasNext())
-                {
-                    Tuple t1 = iter.next();
-                    Assert.assertEquals(t1.get(0), 1L);
-                    Assert.assertEquals(t1.get(1), "z");
-                }
-            }
-            else if ("clock".equals(key.get(0)) && (Long) key.get(1) == 40L)
-            {
-                DataBag columns = (DataBag) t.get(1);
-                Iterator<Tuple> iter = columns.iterator();
-                if (iter.hasNext())
-                {
-                    Tuple t1 = iter.next();
-                    Assert.assertEquals(t1.get(0), 6L);
-                    Assert.assertEquals(t1.get(1), "coffee...");
-                }
-            }
-            else if ("clock".equals(key.get(0)) && (Long) key.get(1) == 20L)
-            {
-                DataBag columns = (DataBag) t.get(1);
-                Iterator<Tuple> iter = columns.iterator();
-                if (iter.hasNext())
-                {
-                    Tuple t1 = iter.next();
-                    Assert.assertEquals(t1.get(0), 1L);
-                    Assert.assertEquals(t1.get(1), "zzzz");
-                }
-            }
-            else if ("clock".equals(key.get(0)) && (Long) key.get(1) == 30L)
-            {
-                DataBag columns = (DataBag) t.get(1);
-                Iterator<Tuple> iter = columns.iterator();
-                if (iter.hasNext())
-                {
-                    Tuple t1 = iter.next();
-                    Assert.assertEquals(t1.get(0), 2L);
-                    Assert.assertEquals(t1.get(1), "daddy?");
-                }
-            }
-        }
-        Assert.assertEquals(4, count);
-    }
-
-    private String getColumnValue(String ks, String cf, String colName, String key, String validator) throws TException, IOException
-    {
-        Cassandra.Client client = getClient();
-        client.set_keyspace(ks);
-
-        ByteBuffer key_user_id = ByteBufferUtil.bytes(key);
-        ColumnPath cp = new ColumnPath(cf);
-        cp.column = ByteBufferUtil.bytes(colName);
-
-        // read
-        ColumnOrSuperColumn got = client.get(key_user_id, cp, ConsistencyLevel.ONE);
-        return parseType(validator).getString(got.getColumn().value);
-    }
-}

diff --git a/test/pig/org/apache/pig/test/MiniCluster.java b/test/pig/org/apache/pig/test/MiniCluster.java
deleted file mode 100644
index 95acb84..0000000
--- a/test/pig/org/apache/pig/test/MiniCluster.java
+++ /dev/null

@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.pig.test;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.mapred.MiniMRCluster;
-
-public class MiniCluster extends MiniGenericCluster {
-    private MiniMRCluster m_mr = null;
-    public MiniCluster() {
-        super();
-    }
-
-    @Override
-    protected void setupMiniDfsAndMrClusters() {
-        try {
-            System.setProperty("hadoop.log.dir", "build/test/logs");
-            final int dataNodes = 4;     // There will be 4 data nodes
-            final int taskTrackers = 4;  // There will be 4 task tracker nodes
-
-            // Create the configuration hadoop-site.xml file
-            File conf_dir = new File("build/classes/");
-            conf_dir.mkdirs();
-            File conf_file = new File(conf_dir, "hadoop-site.xml");
-
-            conf_file.delete();
-
-            // Builds and starts the mini dfs and mapreduce clusters
-            Configuration config = new Configuration();
-            if (FBUtilities.isWindows())
-                config.set("fs.file.impl", WindowsLocalFileSystem.class.getName());
-            m_dfs = new MiniDFSCluster(config, dataNodes, true, null);
-            m_fileSys = m_dfs.getFileSystem();
-            m_mr = new MiniMRCluster(taskTrackers, m_fileSys.getUri().toString(), 1);
-
-            // Write the necessary config info to hadoop-site.xml
-            m_conf = m_mr.createJobConf();
-            m_conf.setInt("mapred.submit.replication", 2);
-            m_conf.set("dfs.datanode.address", "0.0.0.0:0");
-            m_conf.set("dfs.datanode.http.address", "0.0.0.0:0");
-            m_conf.set("mapred.map.max.attempts", "2");
-            m_conf.set("mapred.reduce.max.attempts", "2");
-            m_conf.set("pig.jobcontrol.sleep", "100");
-            try (OutputStream os = new FileOutputStream(conf_file))
-            {
-                m_conf.writeXml(os);
-            }
-
-            // Set the system properties needed by Pig
-            System.setProperty("cluster", m_conf.get("mapred.job.tracker"));
-            System.setProperty("namenode", m_conf.get("fs.default.name"));
-            System.setProperty("junit.hadoop.conf", conf_dir.getPath());
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    @Override
-    protected void shutdownMiniMrClusters() {
-        if (m_mr != null)
-            m_mr.shutdown();
-        m_mr = null;
-    }
-}

diff --git a/test/pig/org/apache/pig/test/MiniGenericCluster.java b/test/pig/org/apache/pig/test/MiniGenericCluster.java
deleted file mode 100644
index ac3f5bc..0000000
--- a/test/pig/org/apache/pig/test/MiniGenericCluster.java
+++ /dev/null

@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.pig.test;
-
-import java.io.*;
-import java.util.Properties;
-
-import org.apache.hadoop.hdfs.MiniDFSCluster;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
-
-/**
- * This class builds a single instance of itself with the Singleton 
- * design pattern. While building the single instance, it sets up a 
- * mini cluster that actually consists of a mini DFS cluster and a 
- * mini MapReduce cluster on the local machine and also sets up the 
- * environment for Pig to run on top of the mini cluster.
- *
- * This class is the base class for MiniCluster, which has slightly
- * difference among different versions of hadoop. MiniCluster implementation
- * is located in $PIG_HOME/shims.
- */
-abstract public class MiniGenericCluster {
-    protected MiniDFSCluster m_dfs = null;
-    protected FileSystem m_fileSys = null;
-    protected Configuration m_conf = null;
-    
-    protected final static MiniCluster INSTANCE = new MiniCluster();
-    protected static boolean isSetup = true;
-    
-    protected MiniGenericCluster() {
-        setupMiniDfsAndMrClusters();
-    }
-    
-    abstract protected void setupMiniDfsAndMrClusters();
-    
-    /**
-     * Returns the single instance of class MiniClusterBuilder that
-     * represents the resouces for a mini dfs cluster and a mini 
-     * mapreduce cluster. 
-     */
-    public static MiniCluster buildCluster() {
-        if(! isSetup){
-            INSTANCE.setupMiniDfsAndMrClusters();
-            isSetup = true;
-        }
-        return INSTANCE;
-    }
-
-    public void shutDown(){
-        INSTANCE.shutdownMiniDfsAndMrClusters();
-    }
-    
-    protected void finalize() {
-        shutdownMiniDfsAndMrClusters();
-    }
-    
-    protected void shutdownMiniDfsAndMrClusters() {
-        isSetup = false;
-        shutdownMiniDfsClusters();
-        shutdownMiniMrClusters();
-    }
-    
-    protected void shutdownMiniDfsClusters() {
-        try {
-            if (m_fileSys != null) { m_fileSys.close(); }
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-        if (m_dfs != null) { m_dfs.shutdown(); }
-        m_fileSys = null;
-        m_dfs = null;
-    }
-    
-    abstract protected void shutdownMiniMrClusters();
-
-    public Properties getProperties() {
-        errorIfNotSetup();
-        return ConfigurationUtil.toProperties(m_conf);
-    }
-
-    public Configuration getConfiguration() {
-        return new Configuration(m_conf);
-    }
-
-    public void setProperty(String name, String value) {
-        errorIfNotSetup();
-        m_conf.set(name, value);
-    }
-    
-    public FileSystem getFileSystem() {
-        errorIfNotSetup();
-        return m_fileSys;
-    }
-    
-    /**
-     * Throw RunTimeException if isSetup is false
-     */
-    private void errorIfNotSetup(){
-        if(isSetup)
-            return;
-        String msg = "function called on MiniCluster that has been shutdown";
-        throw new RuntimeException(msg);
-    }
-}

diff --git a/test/pig/org/apache/pig/test/WindowsLocalFileSystem.java b/test/pig/org/apache/pig/test/WindowsLocalFileSystem.java
deleted file mode 100644
index 9193341..0000000
--- a/test/pig/org/apache/pig/test/WindowsLocalFileSystem.java
+++ /dev/null

@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.pig.test;
-
-import java.io.IOException;
-
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.permission.FsPermission;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Workaround for https://issues.apache.org/jira/browse/HADOOP-7682 used to allow the Pig-tests to run on Cygwin on
- * a Windows box. This workaround was suggested by Joshua Caplan in the comments of HADOOP-7682.
- */
-public final class WindowsLocalFileSystem extends LocalFileSystem
-{
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
-    public WindowsLocalFileSystem()
-    {
-        logger.warn("Using {} instead of org.apache.hadoop.fs.LocalFileSystem to avoid the problem linked to HADOOP-7682. " +
-                    "IOException thrown when setting permissions will be swallowed.", getClass().getName());
-    }
-
-    @Override
-    public boolean mkdirs(Path path, FsPermission permission) throws IOException
-    {
-        boolean result = super.mkdirs(path);
-        setPermission(path, permission);
-        return result;
-    }
-
-    @Override
-    public void setPermission(Path p, FsPermission permission) throws IOException
-    {
-        try
-        {
-            super.setPermission(p, permission);
-        }
-        catch (IOException e)
-        {
-            // Just swallow the Exception as logging it produces too much output.
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/AbstractSerializationsTester.java b/test/unit/org/apache/cassandra/AbstractSerializationsTester.java
index ebfa79d..3a1f348 100644
--- a/test/unit/org/apache/cassandra/AbstractSerializationsTester.java
+++ b/test/unit/org/apache/cassandra/AbstractSerializationsTester.java

@@ -20,12 +20,13 @@
 package org.apache.cassandra;
 
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.net.MessagingService;
 
-import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
@@ -35,7 +36,7 @@
 
 public class AbstractSerializationsTester
 {
-    protected static final String CUR_VER = System.getProperty("cassandra.version", "2.1");
+    protected static final String CUR_VER = System.getProperty("cassandra.version", "3.0");
     protected static final Map<String, Integer> VERSION_MAP = new HashMap<String, Integer> ()
     {{
         put("0.7", 1);
@@ -43,6 +44,8 @@
         put("1.2", MessagingService.VERSION_12);
         put("2.0", MessagingService.VERSION_20);
         put("2.1", MessagingService.VERSION_21);
+        put("2.2", MessagingService.VERSION_22);
+        put("3.0", MessagingService.VERSION_30);
     }};
 
     protected static final boolean EXECUTE_WRITES = Boolean.getBoolean("cassandra.test-serialization-writes");
@@ -59,17 +62,28 @@
         assert out.getLength() == serializer.serializedSize(obj, getVersion());
     }
 
-    protected static DataInputStream getInput(String name) throws IOException
+    protected static DataInputStreamPlus getInput(String name) throws IOException
     {
-        File f = new File("test/data/serialization/" + CUR_VER + "/" + name);
+        return getInput(CUR_VER, name);
+    }
+
+    protected static DataInputStreamPlus getInput(String version, String name) throws IOException
+    {
+        File f = new File("test/data/serialization/" + version + '/' + name);
         assert f.exists() : f.getPath();
-        return new DataInputStream(new FileInputStream(f));
+        return new DataInputPlus.DataInputStreamPlus(new FileInputStream(f));
     }
 
     @SuppressWarnings("resource")
     protected static DataOutputStreamPlus getOutput(String name) throws IOException
     {
-        File f = new File("test/data/serialization/" + CUR_VER + "/" + name);
+        return getOutput(CUR_VER, name);
+    }
+
+    @SuppressWarnings("resource")
+    protected static DataOutputStreamPlus getOutput(String version, String name) throws IOException
+    {
+        File f = new File("test/data/serialization/" + version + '/' + name);
         f.getParentFile().mkdirs();
         return new BufferedDataOutputStreamPlus(new FileOutputStream(f).getChannel());
     }

diff --git a/test/unit/org/apache/cassandra/CassandraBriefJUnitResultFormatter.java b/test/unit/org/apache/cassandra/CassandraBriefJUnitResultFormatter.java
index 084858f..a6c5997 100644
--- a/test/unit/org/apache/cassandra/CassandraBriefJUnitResultFormatter.java
+++ b/test/unit/org/apache/cassandra/CassandraBriefJUnitResultFormatter.java

@@ -49,6 +49,8 @@
 
     private static final String tag = System.getProperty("cassandra.testtag", "");
 
+    private static final Boolean keepBriefBrief = Boolean.getBoolean("cassandra.keepBriefBrief");
+
     /**
      * Where to write the log to.
      */
@@ -145,7 +147,12 @@
      * @param suite the test suite
      */
     public void endTestSuite(JUnitTest suite) {
-        StringBuffer sb = new StringBuffer("Tests run: ");
+        StringBuffer sb = new StringBuffer("Testsuite: ");
+        String n = suite.getName();
+        if (n != null && !tag.isEmpty())
+            n = n + "-" + tag;
+        sb.append(n);
+        sb.append(" Tests run: ");
         sb.append(suite.runCount());
         sb.append(", Failures: ");
         sb.append(suite.failureCount());
@@ -160,7 +167,7 @@
         sb.append(StringUtils.LINE_SEP);
 
         // append the err and output streams to the log
-        if (systemOutput != null && systemOutput.length() > 0) {
+        if (!keepBriefBrief && systemOutput != null && systemOutput.length() > 0) {
             sb.append("------------- Standard Output ---------------")
                     .append(StringUtils.LINE_SEP)
                     .append(systemOutput)
@@ -168,7 +175,7 @@
                     .append(StringUtils.LINE_SEP);
         }
 
-        if (systemError != null && systemError.length() > 0) {
+        if (!keepBriefBrief && systemError != null && systemError.length() > 0) {
             sb.append("------------- Standard Error -----------------")
                     .append(StringUtils.LINE_SEP)
                     .append(systemError)

diff --git a/test/unit/org/apache/cassandra/CassandraXMLJUnitResultFormatter.java b/test/unit/org/apache/cassandra/CassandraXMLJUnitResultFormatter.java
index 066315d..b342b45 100644
--- a/test/unit/org/apache/cassandra/CassandraXMLJUnitResultFormatter.java
+++ b/test/unit/org/apache/cassandra/CassandraXMLJUnitResultFormatter.java

@@ -74,6 +74,17 @@
 
     private static final String tag = System.getProperty("cassandra.testtag", "");
 
+    /*
+     * Set the property for the test suite name so that log configuration can pick it up
+     * and log to a file specific to this test suite
+     */
+    static
+    {
+        String command = System.getProperty("sun.java.command");
+        String args[] = command.split(" ");
+        System.setProperty("suitename", args[1]);
+    }
+
     /**
      * The XML document.
      */

diff --git a/test/unit/org/apache/cassandra/ConsoleAppender.java b/test/unit/org/apache/cassandra/ConsoleAppender.java
new file mode 100644
index 0000000..aa8af1e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/ConsoleAppender.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+
+import ch.qos.logback.core.OutputStreamAppender;
+import ch.qos.logback.core.status.Status;
+import ch.qos.logback.core.status.WarnStatus;
+
+public class ConsoleAppender<E> extends OutputStreamAppender<E>
+{
+    private String target = "System.out";
+
+    public void setTarget(String target)
+    {
+        if(!(target.equals("System.out") || target.equals("System.err")))
+        {
+            Status status = new WarnStatus("[" + target + "] should be one of System.out or System.err", this);
+            status.add(new WarnStatus("Using default target System.out", this));
+            addStatus(status);
+            return;
+        }
+        this.target = target;
+    }
+
+    public String getTarget()
+    {
+        return target;
+    }
+
+    @Override
+    public void start()
+    {
+        @SuppressWarnings("resource")
+        final PrintStream targetStream = target.equals("System.out") ? LogbackStatusListener.originalOut : LogbackStatusListener.originalErr;
+        setOutputStream(new OutputStream() {
+            @Override
+            public void write(int b)
+            {
+                targetStream.write(b);
+            }
+
+            @Override
+            public void write(byte[] b) throws IOException
+            {
+                targetStream.write(b);
+            }
+
+            @Override
+            public void write(byte[] b, int off, int len)
+            {
+                targetStream.write(b, off, len);
+            }
+
+            @Override
+            public void flush()
+            {
+                targetStream.flush();
+            }
+        });
+        super.start();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/EmbeddedServer.java b/test/unit/org/apache/cassandra/EmbeddedServer.java
deleted file mode 100644
index 25754ea..0000000
--- a/test/unit/org/apache/cassandra/EmbeddedServer.java
+++ /dev/null

@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.cassandra.service.CassandraDaemon;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-
-public class EmbeddedServer extends SchemaLoader
-{
-    protected static CassandraDaemon daemon = null;
-
-    enum GatewayService
-    {
-        Thrift
-    }
-
-    public static GatewayService getDaemonGatewayService()
-    {
-        return GatewayService.Thrift;
-    }
-
-    static ExecutorService executor = Executors.newSingleThreadExecutor();
-
-    @BeforeClass
-    public static void startCassandra()
-
-    {
-        executor.execute(new Runnable()
-        {
-            public void run()
-            {
-                switch (getDaemonGatewayService())
-                {
-                    case Thrift:
-                    default:
-                        daemon = new org.apache.cassandra.service.CassandraDaemon();
-                }
-                daemon.activate();
-            }
-        });
-        try
-        {
-            TimeUnit.SECONDS.sleep(3);
-        }
-        catch (InterruptedException e)
-        {
-            throw new AssertionError(e);
-        }
-    }
-
-    @AfterClass
-    public static void stopCassandra() throws Exception
-    {
-        if (daemon != null)
-        {
-            daemon.deactivate();
-        }
-        executor.shutdown();
-        executor.shutdownNow();
-    }
-
-}

diff --git a/test/unit/org/apache/cassandra/LogbackStatusListener.java b/test/unit/org/apache/cassandra/LogbackStatusListener.java
new file mode 100644
index 0000000..1f95bd4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/LogbackStatusListener.java

@@ -0,0 +1,538 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Locale;
+
+import org.slf4j.ILoggerFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.LoggerContext;
+import ch.qos.logback.classic.spi.LoggerContextListener;
+import ch.qos.logback.core.status.Status;
+import ch.qos.logback.core.status.StatusListener;
+import org.apache.cassandra.distributed.shared.InstanceClassLoader;
+
+/*
+ * Listen for logback readiness and then redirect stdout/stderr to logback
+ */
+public class LogbackStatusListener implements StatusListener, LoggerContextListener
+{
+
+    public static final PrintStream originalOut = System.out;
+    public static final PrintStream originalErr = System.err;
+
+    private volatile boolean hadPreInstallError = false;
+    private volatile boolean haveInstalled = false;
+    private volatile boolean haveRegisteredListener = false;
+
+    private PrintStream replacementOut;
+    private PrintStream replacementErr;
+
+    @Override
+    public void addStatusEvent(Status s)
+    {
+        if (!haveInstalled && (s.getLevel() != 0 || s.getEffectiveLevel() != 0))
+        {
+            // if we encounter an error during setup, we're not sure what state we're in, so we just don't switch
+            // we should log this fact, though, so that we know that we're not necessarily capturing stdout
+            LoggerFactory.getLogger(LogbackStatusListener.class)
+                         .warn("Encountered non-info status in logger setup; aborting stdout capture: '" + s.getMessage() + '\'');
+            hadPreInstallError = true;
+        }
+
+        if (hadPreInstallError)
+            return;
+
+        if (s.getMessage().startsWith("Registering current configuration as safe fallback point"))
+        {
+            onStart(null);
+        }
+
+        if (haveInstalled && !haveRegisteredListener)
+        {
+            // we register ourselves as a listener after the fact, because we enable ourselves before the LoggerFactory
+            // is properly initialised, hence before it can accept any LoggerContextListener registrations
+            tryRegisterListener();
+        }
+
+        if (s.getMessage().equals("Logback context being closed via shutdown hook"))
+        {
+            onStop(null);
+        }
+    }
+
+    private static PrintStream wrapLogger(Logger logger, PrintStream original, String encodingProperty, boolean error) throws Exception
+    {
+        final String encoding = System.getProperty(encodingProperty);
+        OutputStream os = new ToLoggerOutputStream(logger, encoding, error);
+        return encoding != null ? new WrappedPrintStream(os, true, encoding, original)
+                                : new WrappedPrintStream(os, true, original);
+    }
+
+    private static class ToLoggerOutputStream extends ByteArrayOutputStream
+    {
+        final Logger logger;
+        final String encoding;
+        final boolean error;
+
+        private ToLoggerOutputStream(Logger logger, String encoding, boolean error)
+        {
+            this.logger = logger;
+            this.encoding = encoding;
+            this.error = error;
+        }
+
+        @Override
+        public void flush() throws IOException
+        {
+            try
+            {
+                //Filter out stupid PrintStream empty flushes
+                if (size() == 0) return;
+
+                //Filter out newlines, log framework provides its own
+                if (size() == 1)
+                {
+                    byte[] bytes = toByteArray();
+                    if (bytes[0] == 0xA)
+                        return;
+                }
+
+                //Filter out Windows newline
+                if (size() == 2)
+                {
+                    byte[] bytes = toByteArray();
+                    if (bytes[0] == 0xD && bytes[1] == 0xA)
+                        return;
+                }
+
+                String statement;
+                if (encoding != null)
+                    statement = new String(toByteArray(), encoding);
+                else
+                    statement = new String(toByteArray());
+
+                if (error)
+                    logger.error(statement);
+                else
+                    logger.info(statement);
+            }
+            finally
+            {
+                reset();
+            }
+        }
+    };
+
+    private static class WrappedPrintStream extends PrintStream
+    {
+        private long asyncAppenderThreadId = Long.MIN_VALUE;
+        private final PrintStream original;
+
+        public WrappedPrintStream(OutputStream out, boolean autoFlush, PrintStream original)
+        {
+            super(out, autoFlush);
+            this.original = original;
+        }
+
+        public WrappedPrintStream(OutputStream out, boolean autoFlush, String encoding, PrintStream original) throws UnsupportedEncodingException
+        {
+            super(out, autoFlush, encoding);
+            this.original = original;
+        }
+
+        /*
+         * Long and the short of it is that we don't want to serve logback a fake System.out/err.
+         * ConsoleAppender is replaced so it always goes to the real System.out/err, but logback itself
+         * will at times try to log to System.out/err when it has issues.
+         *
+         * Now here is the problem. There is a deadlock if a thread logs to System.out, blocks on the async
+         * appender queue, and the async appender thread tries to log to System.out directly as part of some
+         * internal logback issue.
+         *
+         * So to prevent this we have to exhaustively check before locking in the PrintStream and forward
+         * to real System.out/err if it is the async appender
+         */
+        private boolean isAsyncAppender()
+        {
+            //Set the thread id based on the name
+            Thread currentThread = Thread.currentThread();
+            long currentThreadId = currentThread.getId();
+            if (asyncAppenderThreadId == Long.MIN_VALUE &&
+                currentThread.getName().equals("AsyncAppender-Worker-ASYNC") &&
+                !InstanceClassLoader.wasLoadedByAnInstanceClassLoader(currentThread.getClass()))
+            {
+                asyncAppenderThreadId = currentThreadId;
+            }
+            if (currentThreadId == asyncAppenderThreadId)
+                original.println("Was in async appender");
+            return currentThreadId == asyncAppenderThreadId;
+        }
+
+        @Override
+        public void flush()
+        {
+            if (isAsyncAppender())
+                original.flush();
+            else
+                super.flush();
+        }
+
+        @Override
+        public void close()
+        {
+            if (isAsyncAppender())
+                original.close();
+            else
+                super.flush();
+        }
+
+        @Override
+        public void write(int b)
+        {
+            if (isAsyncAppender())
+                original.write(b);
+            else
+                super.write(b);
+        }
+
+        @Override
+        public void write(byte[] buf, int off, int len)
+        {
+            if (isAsyncAppender())
+                original.write(buf, off, len);
+            else
+                super.write(buf, off, len);
+        }
+
+        @Override
+        public void print(boolean b)
+        {
+            if (isAsyncAppender())
+                original.print(b);
+            else
+                super.print(b);
+        }
+
+        @Override
+        public void print(char c)
+        {
+            if (isAsyncAppender())
+                original.print(c);
+            else
+                super.print(c);
+        }
+
+        @Override
+        public void print(int i)
+        {
+            if (isAsyncAppender())
+                original.print(i);
+            else
+                super.print(i);
+        }
+
+        @Override
+        public void print(long l)
+        {
+            if (isAsyncAppender())
+                original.print(l);
+            else
+                super.print(l);
+        }
+
+        @Override
+        public void print(float f)
+        {
+            if (isAsyncAppender())
+                original.print(f);
+            else
+                super.print(f);
+        }
+
+        @Override
+        public void print(double d)
+        {
+            if (isAsyncAppender())
+                original.print(d);
+            else
+                super.print(d);
+        }
+
+        @Override
+        public void print(char[] s)
+        {
+            if(isAsyncAppender())
+                original.println(s);
+            else
+                super.print(s);
+        }
+
+        @Override
+        public void print(String s)
+        {
+            if (isAsyncAppender())
+                original.print(s);
+            else
+                super.print(s);
+        }
+
+        @Override
+        public void print(Object obj)
+        {
+            if (isAsyncAppender())
+                original.print(obj);
+            else
+                super.print(obj);
+        }
+
+        @Override
+        public void println()
+        {
+            if (isAsyncAppender())
+                original.println();
+            else
+                super.println();
+        }
+
+        @Override
+        public void println(boolean v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(char v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(int v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(long v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(float v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(double v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(char[] v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(String v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public void println(Object v)
+        {
+            if (isAsyncAppender())
+                original.println(v);
+            else
+                super.println(v);
+        }
+
+        @Override
+        public PrintStream printf(String format, Object... args)
+        {
+            if (isAsyncAppender())
+                return original.printf(format, args);
+            else
+                return super.printf(format, args);
+        }
+
+        @Override
+        public PrintStream printf(Locale l, String format, Object... args)
+        {
+            if (isAsyncAppender())
+                return original.printf(l, format, args);
+            else
+                return super.printf(l, format, args);
+        }
+
+        @Override
+        public PrintStream format(String format, Object... args)
+        {
+            if (isAsyncAppender())
+                return original.format(format, args);
+            else
+                return super.format(format, args);
+        }
+
+        @Override
+        public PrintStream format(Locale l, String format, Object... args)
+        {
+            if (isAsyncAppender())
+                return original.format(l, format, args);
+            else
+                return super.format(l, format, args);
+        }
+
+        @Override
+        public PrintStream append(CharSequence csq)
+        {
+            if (isAsyncAppender())
+                return original.append(csq);
+            else
+                return super.append(csq);
+        }
+
+        @Override
+        public PrintStream append(CharSequence csq, int start, int end)
+        {
+            if (isAsyncAppender())
+                return original.append(csq, start, end);
+            else
+                return super.append(csq, start, end);
+        }
+
+        @Override
+        public PrintStream append(char c)
+        {
+            if (isAsyncAppender())
+                return original.append(c);
+            else
+                return super.append(c);
+        }    }
+
+    public boolean isResetResistant()
+    {
+        return false;
+    }
+
+    public synchronized void onStart(LoggerContext loggerContext)
+    {
+        if (!hadPreInstallError && !haveInstalled)
+        {
+            if (InstanceClassLoader.wasLoadedByAnInstanceClassLoader(getClass())
+                || System.out.getClass().getName().contains("LogbackStatusListener"))
+            {
+                // don't operate if we're a dtest node, or if we're not the first to swap System.out for some other reason
+                hadPreInstallError = true;
+                return;
+            }
+            try
+            {
+                Logger stdoutLogger = LoggerFactory.getLogger("stdout");
+                Logger stderrLogger = LoggerFactory.getLogger("stderr");
+
+                replacementOut = wrapLogger(stdoutLogger, originalOut, "sun.stdout.encoding", false);
+                System.setOut(replacementOut);
+                replacementErr = wrapLogger(stderrLogger, originalErr, "sun.stderr.encoding", true);
+                System.setErr(replacementErr);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+            haveInstalled = true;
+        }
+    }
+
+    public synchronized void onReset(LoggerContext loggerContext)
+    {
+        onStop(loggerContext);
+    }
+
+    public synchronized void onStop(LoggerContext loggerContext)
+    {
+        if (haveInstalled)
+        {
+            if (replacementOut != null) replacementOut.flush();
+            if (replacementErr != null) replacementErr.flush();
+            System.setErr(originalErr);
+            System.setOut(originalOut);
+            hadPreInstallError = false;
+            haveInstalled = false;
+            haveRegisteredListener = false;
+            if (haveRegisteredListener)
+            {
+                ((LoggerContext)LoggerFactory.getILoggerFactory()).removeListener(this);
+            }
+        }
+    }
+
+    public void onLevelChange(ch.qos.logback.classic.Logger logger, Level level)
+    {
+    }
+
+    private synchronized void tryRegisterListener()
+    {
+        if (haveInstalled && !haveRegisteredListener)
+        {
+            ILoggerFactory factory = LoggerFactory.getILoggerFactory();
+            if (factory instanceof LoggerContext)
+            {
+                ((LoggerContext) factory).addListener(this);
+                haveRegisteredListener = true;
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/MockSchema.java b/test/unit/org/apache/cassandra/MockSchema.java
index c71c98b..a406290 100644
--- a/test/unit/org/apache/cassandra/MockSchema.java
+++ b/test/unit/org/apache/cassandra/MockSchema.java

@@ -21,18 +21,14 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.RandomAccessFile;
-import java.util.Set;
+import java.util.*;
 import java.util.concurrent.atomic.AtomicInteger;
 
-import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
 
-import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.SimpleSparseCellNameType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.io.sstable.Component;
@@ -46,8 +42,11 @@
 import org.apache.cassandra.io.util.ChannelProxy;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.Memory;
+import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.io.util.SegmentedFile;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.AlwaysPresentFilter;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -60,10 +59,10 @@
         indexSummary = new IndexSummary(Murmur3Partitioner.instance, offsets, 0, Memory.allocate(4), 0, 0, 0, 1);
     }
     private static final AtomicInteger id = new AtomicInteger();
-    public static final Keyspace ks = Keyspace.mockKS(new KSMetaData("mockks", SimpleStrategy.class, ImmutableMap.of("replication_factor", "1"), false));
+    public static final Keyspace ks = Keyspace.mockKS(KeyspaceMetadata.create("mockks", KeyspaceParams.simpleTransient(1)));
 
-    private static final IndexSummary indexSummary;
-    private static final SegmentedFile segmentedFile = new BufferedSegmentedFile(new ChannelProxy(temp("mocksegmentedfile")), 0);
+    public static final IndexSummary indexSummary;
+    private static final SegmentedFile segmentedFile = new BufferedSegmentedFile(new ChannelProxy(temp("mocksegmentedfile")), RandomAccessReader.DEFAULT_BUFFER_SIZE, 0);
 
     public static Memtable memtable(ColumnFamilyStore cfs)
     {
@@ -87,11 +86,10 @@
 
     public static SSTableReader sstable(int generation, int size, boolean keepRef, ColumnFamilyStore cfs)
     {
-        Descriptor descriptor = new Descriptor(cfs.directories.getDirectoryForNewSSTables(),
+        Descriptor descriptor = new Descriptor(cfs.getDirectories().getDirectoryForNewSSTables(),
                                                cfs.keyspace.getName(),
                                                cfs.getColumnFamilyName(),
-                                               generation,
-                                               Descriptor.Type.FINAL);
+                                               generation);
         Set<Component> components = ImmutableSet.of(Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.TOC);
         for (Component component : components)
         {
@@ -119,12 +117,13 @@
                 throw new RuntimeException(e);
             }
         }
+        SerializationHeader header = SerializationHeader.make(cfs.metadata, Collections.emptyList());
         StatsMetadata metadata = (StatsMetadata) new MetadataCollector(cfs.metadata.comparator)
-                                                 .finalizeMetadata(Murmur3Partitioner.instance.getClass().getCanonicalName(), 0.01f, -1)
+                                                 .finalizeMetadata(cfs.metadata.partitioner.getClass().getCanonicalName(), 0.01f, -1, header)
                                                  .get(MetadataType.STATS);
-        SSTableReader reader = SSTableReader.internalOpen(descriptor, components, cfs.metadata, Murmur3Partitioner.instance,
+        SSTableReader reader = SSTableReader.internalOpen(descriptor, components, cfs.metadata,
                                                           segmentedFile.sharedCopy(), segmentedFile.sharedCopy(), indexSummary.sharedCopy(),
-                                                          new AlwaysPresentFilter(), 1L, metadata, SSTableReader.OpenReason.NORMAL);
+                                                          new AlwaysPresentFilter(), 1L, metadata, SSTableReader.OpenReason.NORMAL, header);
         reader.first = reader.last = readerBounds(generation);
         if (!keepRef)
             reader.selfRef().release();
@@ -133,18 +132,25 @@
 
     public static ColumnFamilyStore newCFS()
     {
-        String cfname = "mockcf" + (id.incrementAndGet());
-        CFMetaData metadata = newCFMetaData(ks.getName(), cfname);
-        return new ColumnFamilyStore(ks, cfname, Murmur3Partitioner.instance, 0, metadata, new Directories(metadata), false, false);
+        return newCFS(ks.getName());
     }
 
-    private static CFMetaData newCFMetaData(String ksname, String cfname)
+    public static ColumnFamilyStore newCFS(String ksname)
     {
-        CFMetaData metadata = new CFMetaData(ksname,
-                                             cfname,
-                                             ColumnFamilyType.Standard,
-                                             new SimpleSparseCellNameType(UTF8Type.instance));
-        metadata.caching(CachingOptions.NONE);
+        String cfname = "mockcf" + (id.incrementAndGet());
+        CFMetaData metadata = newCFMetaData(ksname, cfname);
+        return new ColumnFamilyStore(ks, cfname, 0, metadata, new Directories(metadata), false, false);
+    }
+
+    public static CFMetaData newCFMetaData(String ksname, String cfname)
+    {
+        CFMetaData metadata = CFMetaData.Builder.create(ksname, cfname)
+                                                .addPartitionKey("key", UTF8Type.instance)
+                                                .addClusteringColumn("col", UTF8Type.instance)
+                                                .addRegularColumn("value", UTF8Type.instance)
+                                                .withPartitioner(Murmur3Partitioner.instance)
+                                                .build();
+        metadata.caching(CachingParams.CACHE_NOTHING);
         return metadata;
     }
 

diff --git a/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java b/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java
index 9023b11..3bdb192 100644
--- a/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java
+++ b/test/unit/org/apache/cassandra/OffsetAwareConfigurationLoader.java

@@ -54,6 +54,7 @@
 
         config.commitlog_directory += File.pathSeparator + offset;
         config.saved_caches_directory += File.pathSeparator + offset;
+        config.hints_directory += File.pathSeparator + offset;
         for (int i = 0; i < config.data_file_directories.length; i++)
             config.data_file_directories[i] += File.pathSeparator + offset;
 

diff --git a/test/unit/org/apache/cassandra/SchemaLoader.java b/test/unit/org/apache/cassandra/SchemaLoader.java
index 45748a9..026aba8 100644
--- a/test/unit/org/apache/cassandra/SchemaLoader.java
+++ b/test/unit/org/apache/cassandra/SchemaLoader.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -19,37 +19,29 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.*;
 
 import org.junit.After;
 import org.junit.BeforeClass;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.db.commitlog.CommitLog;
-import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
-import org.apache.cassandra.db.index.PerRowSecondaryIndexTest;
-import org.apache.cassandra.db.index.SecondaryIndex;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.io.compress.SnappyCompressor;
+import org.apache.cassandra.index.StubIndex;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.locator.AbstractReplicationStrategy;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
 
 public class SchemaLoader
 {
-    private static Logger logger = LoggerFactory.getLogger(SchemaLoader.class);
-
     @BeforeClass
     public static void loadSchema() throws ConfigurationException
     {
@@ -71,27 +63,7 @@
 
     public static void prepareServer()
     {
-        // Cleanup first
-        try
-        {
-            cleanupAndLeaveDirs();
-        }
-        catch (IOException e)
-        {
-            logger.error("Failed to cleanup and recreate directories and files.");
-            throw new RuntimeException(e);
-        }
-
-        Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler()
-        {
-            public void uncaughtException(Thread t, Throwable e)
-            {
-                logger.error("Fatal exception in thread " + t, e);
-            }
-        });
-
-        DatabaseDescriptor.setDaemonInitialized();
-        Keyspace.setInitialized();
+       CQLTester.prepareServer();
     }
 
     public static void startGossiper()
@@ -102,7 +74,7 @@
 
     public static void schemaDefinition(String testName) throws ConfigurationException
     {
-        List<KSMetaData> schema = new ArrayList<KSMetaData>();
+        List<KeyspaceMetadata> schema = new ArrayList<KeyspaceMetadata>();
 
         // A whole bucket of shorthand
         String ks1 = testName + "Keyspace1";
@@ -118,13 +90,6 @@
         String ks_prsi = testName + "PerRowSecondaryIndex";
         String ks_cql = testName + "cql_keyspace";
 
-        Class<? extends AbstractReplicationStrategy> simple = SimpleStrategy.class;
-
-        Map<String, String> opts_rf1 = KSMetaData.optsWithRF(1);
-        Map<String, String> opts_rf2 = KSMetaData.optsWithRF(2);
-        Map<String, String> opts_rf3 = KSMetaData.optsWithRF(3);
-        Map<String, String> opts_rf5 = KSMetaData.optsWithRF(5);
-
         AbstractType bytes = BytesType.instance;
 
         AbstractType<?> composite = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{BytesType.instance, TimeUUIDType.instance, IntegerType.instance}));
@@ -143,176 +108,147 @@
         leveledOptions.put("sstable_size_in_mb", "1");
 
         // Keyspace 1
-        schema.add(KSMetaData.testMetadata(ks1,
-                                           simple,
-                                           opts_rf1,
-
-                                           // Column Families
-                                           standardCFMD(ks1, "Standard1").compactionStrategyOptions(compactionOptions),
-                                           standardCFMD(ks1, "Standard2"),
-                                           standardCFMD(ks1, "Standard3"),
-                                           standardCFMD(ks1, "Standard4"),
-                                           standardCFMD(ks1, "StandardGCGS0").gcGraceSeconds(0),
-                                           standardCFMD(ks1, "StandardLong1"),
-                                           standardCFMD(ks1, "StandardLong2"),
-                                           CFMetaData.denseCFMetaData(ks1, "ValuesWithQuotes", BytesType.instance).defaultValidator(UTF8Type.instance),
-                                           superCFMD(ks1, "Super1", LongType.instance),
-                                           superCFMD(ks1, "Super2", LongType.instance),
-                                           superCFMD(ks1, "Super3", LongType.instance),
-                                           superCFMD(ks1, "Super4", UTF8Type.instance),
-                                           superCFMD(ks1, "Super5", bytes),
-                                           superCFMD(ks1, "Super6", LexicalUUIDType.instance, UTF8Type.instance),
-                                           indexCFMD(ks1, "Indexed1", true),
-                                           indexCFMD(ks1, "Indexed2", false),
-                                           CFMetaData.denseCFMetaData(ks1, "StandardInteger1", IntegerType.instance),
-                                           CFMetaData.denseCFMetaData(ks1, "StandardLong3", IntegerType.instance),
-                                           CFMetaData.denseCFMetaData(ks1, "Counter1", bytes).defaultValidator(CounterColumnType.instance),
-                                           CFMetaData.denseCFMetaData(ks1, "SuperCounter1", bytes, bytes).defaultValidator(CounterColumnType.instance),
-                                           superCFMD(ks1, "SuperDirectGC", BytesType.instance).gcGraceSeconds(0),
-                                           jdbcSparseCFMD(ks1, "JdbcInteger", IntegerType.instance).addColumnDefinition(integerColumn(ks1, "JdbcInteger")),
-                                           jdbcSparseCFMD(ks1, "JdbcUtf8", UTF8Type.instance).addColumnDefinition(utf8Column(ks1, "JdbcUtf8")),
-                                           jdbcCFMD(ks1, "JdbcLong", LongType.instance),
-                                           jdbcCFMD(ks1, "JdbcBytes", bytes),
-                                           jdbcCFMD(ks1, "JdbcAscii", AsciiType.instance),
-                                           CFMetaData.denseCFMetaData(ks1, "StandardComposite", composite),
-                                           CFMetaData.denseCFMetaData(ks1, "StandardComposite2", compositeMaxMin),
-                                           CFMetaData.denseCFMetaData(ks1, "StandardDynamicComposite", dynamicComposite),
-                                           standardCFMD(ks1, "StandardLeveled")
-                                                                               .compactionStrategyClass(LeveledCompactionStrategy.class)
-                                                                               .compactionStrategyOptions(leveledOptions),
-                                           standardCFMD(ks1, "StandardLeveled2")
-                                                                               .compactionStrategyClass(LeveledCompactionStrategy.class)
-                                                                               .compactionStrategyOptions(leveledOptions),
-                                           standardCFMD(ks1, "legacyleveled")
-                                                                               .compactionStrategyClass(LeveledCompactionStrategy.class)
-                                                                               .compactionStrategyOptions(leveledOptions),
-                                           standardCFMD(ks1, "StandardLowIndexInterval").minIndexInterval(8)
-                                                                                        .maxIndexInterval(256)
-                                                                                        .caching(CachingOptions.NONE),
-                                           standardCFMD(ks1, "UUIDKeys").keyValidator(UUIDType.instance),
-                                           CFMetaData.denseCFMetaData(ks1, "MixedTypes", LongType.instance).keyValidator(UUIDType.instance).defaultValidator(BooleanType.instance),
-                                           CFMetaData.denseCFMetaData(ks1, "MixedTypesComposite", composite).keyValidator(composite).defaultValidator(BooleanType.instance),
-                                           standardCFMD(ks1, "AsciiKeys").keyValidator(AsciiType.instance)
-        ));
+        schema.add(KeyspaceMetadata.create(ks1,
+                KeyspaceParams.simple(1),
+                Tables.of(
+                // Column Families
+                standardCFMD(ks1, "Standard1").compaction(CompactionParams.scts(compactionOptions)),
+                standardCFMD(ks1, "Standard2"),
+                standardCFMD(ks1, "Standard3"),
+                standardCFMD(ks1, "Standard4"),
+                standardCFMD(ks1, "StandardGCGS0").gcGraceSeconds(0),
+                standardCFMD(ks1, "StandardLong1"),
+                standardCFMD(ks1, "StandardLong2"),
+                //CFMetaData.Builder.create(ks1, "ValuesWithQuotes").build(),
+                superCFMD(ks1, "Super1", LongType.instance),
+                superCFMD(ks1, "Super2", LongType.instance),
+                superCFMD(ks1, "Super3", LongType.instance),
+                superCFMD(ks1, "Super4", UTF8Type.instance),
+                superCFMD(ks1, "Super5", bytes),
+                superCFMD(ks1, "Super6", LexicalUUIDType.instance, UTF8Type.instance),
+                keysIndexCFMD(ks1, "Indexed1", true),
+                keysIndexCFMD(ks1, "Indexed2", false),
+                //CFMetaData.Builder.create(ks1, "StandardInteger1").withColumnNameComparator(IntegerType.instance).build(),
+                //CFMetaData.Builder.create(ks1, "StandardLong3").withColumnNameComparator(IntegerType.instance).build(),
+                //CFMetaData.Builder.create(ks1, "Counter1", false, false, true).build(),
+                //CFMetaData.Builder.create(ks1, "SuperCounter1", false, false, true, true).build(),
+                superCFMD(ks1, "SuperDirectGC", BytesType.instance).gcGraceSeconds(0),
+//                jdbcCFMD(ks1, "JdbcInteger", IntegerType.instance).addColumnDefinition(integerColumn(ks1, "JdbcInteger")),
+                jdbcCFMD(ks1, "JdbcUtf8", UTF8Type.instance).addColumnDefinition(utf8Column(ks1, "JdbcUtf8")),
+                jdbcCFMD(ks1, "JdbcLong", LongType.instance),
+                jdbcCFMD(ks1, "JdbcBytes", bytes),
+                jdbcCFMD(ks1, "JdbcAscii", AsciiType.instance),
+                //CFMetaData.Builder.create(ks1, "StandardComposite", false, true, false).withColumnNameComparator(composite).build(),
+                //CFMetaData.Builder.create(ks1, "StandardComposite2", false, true, false).withColumnNameComparator(compositeMaxMin).build(),
+                //CFMetaData.Builder.create(ks1, "StandardDynamicComposite", false, true, false).withColumnNameComparator(dynamicComposite).build(),
+                standardCFMD(ks1, "StandardLeveled").compaction(CompactionParams.lcs(leveledOptions)),
+                standardCFMD(ks1, "legacyleveled").compaction(CompactionParams.lcs(leveledOptions)),
+                standardCFMD(ks1, "StandardLowIndexInterval").minIndexInterval(8)
+                                                             .maxIndexInterval(256)
+                                                             .caching(CachingParams.CACHE_NOTHING)
+                //CFMetaData.Builder.create(ks1, "UUIDKeys").addPartitionKey("key",UUIDType.instance).build(),
+                //CFMetaData.Builder.create(ks1, "MixedTypes").withColumnNameComparator(LongType.instance).addPartitionKey("key", UUIDType.instance).build(),
+                //CFMetaData.Builder.create(ks1, "MixedTypesComposite", false, true, false).withColumnNameComparator(composite).addPartitionKey("key", composite).build(),
+                //CFMetaData.Builder.create(ks1, "AsciiKeys").addPartitionKey("key", AsciiType.instance).build()
+        )));
 
         // Keyspace 2
-        schema.add(KSMetaData.testMetadata(ks2,
-                                           simple,
-                                           opts_rf1,
-
-                                           // Column Families
-                                           standardCFMD(ks2, "Standard1"),
-                                           standardCFMD(ks2, "Standard3"),
-                                           superCFMD(ks2, "Super3", bytes),
-                                           superCFMD(ks2, "Super4", TimeUUIDType.instance),
-                                           indexCFMD(ks2, "Indexed1", true),
-                                           compositeIndexCFMD(ks2, "Indexed2", true),
-                                           compositeIndexCFMD(ks2, "Indexed3", true).gcGraceSeconds(0)));
+        schema.add(KeyspaceMetadata.create(ks2,
+                KeyspaceParams.simple(1),
+                Tables.of(
+                // Column Families
+                standardCFMD(ks2, "Standard1"),
+                standardCFMD(ks2, "Standard3"),
+                superCFMD(ks2, "Super3", bytes),
+                superCFMD(ks2, "Super4", TimeUUIDType.instance),
+                keysIndexCFMD(ks2, "Indexed1", true),
+                compositeIndexCFMD(ks2, "Indexed2", true),
+                compositeIndexCFMD(ks2, "Indexed3", true).gcGraceSeconds(0))));
 
         // Keyspace 3
-        schema.add(KSMetaData.testMetadata(ks3,
-                                           simple,
-                                           opts_rf5,
-
-                                           // Column Families
-                                           standardCFMD(ks3, "Standard1"),
-                                           indexCFMD(ks3, "Indexed1", true)));
+        schema.add(KeyspaceMetadata.create(ks3,
+                KeyspaceParams.simple(5),
+                Tables.of(
+                standardCFMD(ks3, "Standard1"),
+                keysIndexCFMD(ks3, "Indexed1", true))));
 
         // Keyspace 4
-        schema.add(KSMetaData.testMetadata(ks4,
-                                           simple,
-                                           opts_rf3,
-
-                                           // Column Families
-                                           standardCFMD(ks4, "Standard1"),
-                                           standardCFMD(ks4, "Standard3"),
-                                           superCFMD(ks4, "Super3", bytes),
-                                           superCFMD(ks4, "Super4", TimeUUIDType.instance),
-                                           CFMetaData.denseCFMetaData(ks4, "Super5", TimeUUIDType.instance, bytes)));
+        schema.add(KeyspaceMetadata.create(ks4,
+                KeyspaceParams.simple(3),
+                Tables.of(
+                standardCFMD(ks4, "Standard1"),
+                standardCFMD(ks4, "Standard3"),
+                superCFMD(ks4, "Super3", bytes),
+                superCFMD(ks4, "Super4", TimeUUIDType.instance),
+                superCFMD(ks4, "Super5", TimeUUIDType.instance, BytesType.instance))));
 
         // Keyspace 5
-        schema.add(KSMetaData.testMetadata(ks5,
-                                           simple,
-                                           opts_rf2,
-                                           standardCFMD(ks5, "Standard1"),
-                                           standardCFMD(ks5, "Counter1")
-                                                   .defaultValidator(CounterColumnType.instance)));
-
+        schema.add(KeyspaceMetadata.create(ks5,
+                KeyspaceParams.simple(2),
+                Tables.of(standardCFMD(ks5, "Standard1"))));
         // Keyspace 6
-        schema.add(KSMetaData.testMetadata(ks6,
-                                           simple,
-                                           opts_rf1,
-                                           indexCFMD(ks6, "Indexed1", true)));
+        schema.add(KeyspaceMetadata.create(ks6,
+                KeyspaceParams.simple(1),
+                Tables.of(keysIndexCFMD(ks6, "Indexed1", true))));
 
         // KeyCacheSpace
-        schema.add(KSMetaData.testMetadata(ks_kcs,
-                                           simple,
-                                           opts_rf1,
-                                           standardCFMD(ks_kcs, "Standard1"),
-                                           standardCFMD(ks_kcs, "Standard2"),
-                                           standardCFMD(ks_kcs, "Standard3")));
+        schema.add(KeyspaceMetadata.create(ks_kcs,
+                KeyspaceParams.simple(1),
+                Tables.of(
+                standardCFMD(ks_kcs, "Standard1"),
+                standardCFMD(ks_kcs, "Standard2"),
+                standardCFMD(ks_kcs, "Standard3"))));
 
         // RowCacheSpace
-        schema.add(KSMetaData.testMetadata(ks_rcs,
-                                           simple,
-                                           opts_rf1,
-                                           standardCFMD(ks_rcs, "CFWithoutCache").caching(CachingOptions.NONE),
-                                           standardCFMD(ks_rcs, "CachedCF").caching(CachingOptions.ALL),
-                                           standardCFMD(ks_rcs, "CachedIntCF").
-                                                   defaultValidator(IntegerType.instance).
-                                                   caching(new CachingOptions(new CachingOptions.KeyCache(CachingOptions.KeyCache.Type.ALL),
-                                                                                  new CachingOptions.RowCache(CachingOptions.RowCache.Type.HEAD, 100)))));
+        schema.add(KeyspaceMetadata.create(ks_rcs,
+                KeyspaceParams.simple(1),
+                Tables.of(
+                standardCFMD(ks_rcs, "CFWithoutCache").caching(CachingParams.CACHE_NOTHING),
+                standardCFMD(ks_rcs, "CachedCF").caching(CachingParams.CACHE_EVERYTHING),
+                standardCFMD(ks_rcs, "CachedNoClustering", 1, IntegerType.instance, IntegerType.instance, null).caching(CachingParams.CACHE_EVERYTHING),
+                standardCFMD(ks_rcs, "CachedIntCF").
+                        caching(new CachingParams(true, 100)))));
 
         // CounterCacheSpace
-        schema.add(KSMetaData.testMetadata(ks_ccs,
-                                           simple,
-                                           opts_rf1,
-                                           standardCFMD(ks_ccs, "Counter1").defaultValidator(CounterColumnType.instance),
-                                           standardCFMD(ks_ccs, "Counter2").defaultValidator(CounterColumnType.instance)));
+        /*schema.add(KeyspaceMetadata.testMetadata(ks_ccs,
+                simple,
+                opts_rf1,
+                CFMetaData.Builder.create(ks_ccs, "Counter1", false, false, true).build(),
+                CFMetaData.Builder.create(ks_ccs, "Counter1", false, false, true).build()));*/
 
-        schema.add(KSMetaData.testMetadataNotDurable(ks_nocommit,
-                                                     simple,
-                                                     opts_rf1,
-                                                     standardCFMD(ks_nocommit, "Standard1")));
-
-        // PerRowSecondaryIndexTest
-        schema.add(KSMetaData.testMetadata(ks_prsi,
-                                           simple,
-                                           opts_rf1,
-                                           perRowIndexedCFMD(ks_prsi, "Indexed1")));
+        schema.add(KeyspaceMetadata.create(ks_nocommit, KeyspaceParams.simpleTransient(1), Tables.of(
+                standardCFMD(ks_nocommit, "Standard1"))));
 
         // CQLKeyspace
-        schema.add(KSMetaData.testMetadata(ks_cql,
-                                           simple,
-                                           opts_rf1,
+        schema.add(KeyspaceMetadata.create(ks_cql, KeyspaceParams.simple(1), Tables.of(
 
-                                           // Column Families
-                                           CFMetaData.compile("CREATE TABLE table1 ("
-                                                              + "k int PRIMARY KEY,"
-                                                              + "v1 text,"
-                                                              + "v2 int"
-                                                              + ")", ks_cql),
+                // Column Families
+                CFMetaData.compile("CREATE TABLE table1 ("
+                        + "k int PRIMARY KEY,"
+                        + "v1 text,"
+                        + "v2 int"
+                        + ")", ks_cql),
 
-                                           CFMetaData.compile("CREATE TABLE table2 ("
-                                                              + "k text,"
-                                                              + "c text,"
-                                                              + "v text,"
-                                                              + "PRIMARY KEY (k, c))", ks_cql),
-                                           CFMetaData.compile("CREATE TABLE foo ("
-                                                   + "bar text, "
-                                                   + "baz text, "
-                                                   + "qux text, "
-                                                   + "PRIMARY KEY(bar, baz) ) "
-                                                   + "WITH COMPACT STORAGE", ks_cql),
-                                           CFMetaData.compile("CREATE TABLE foofoo ("
-                                                   + "bar text, "
-                                                   + "baz text, "
-                                                   + "qux text, "
-                                                   + "quz text, "
-                                                   + "foo text, "
-                                                   + "PRIMARY KEY((bar, baz), qux, quz) ) "
-                                                   + "WITH COMPACT STORAGE", ks_cql)
-                                           ));
+                CFMetaData.compile("CREATE TABLE table2 ("
+                        + "k text,"
+                        + "c text,"
+                        + "v text,"
+                        + "PRIMARY KEY (k, c))", ks_cql),
+                CFMetaData.compile("CREATE TABLE foo ("
+                        + "bar text, "
+                        + "baz text, "
+                        + "qux text, "
+                        + "PRIMARY KEY(bar, baz) ) "
+                        + "WITH COMPACT STORAGE", ks_cql),
+                CFMetaData.compile("CREATE TABLE foofoo ("
+                        + "bar text, "
+                        + "baz text, "
+                        + "qux text, "
+                        + "quz text, "
+                        + "foo text, "
+                        + "PRIMARY KEY((bar, baz), qux, quz) ) "
+                        + "WITH COMPACT STORAGE", ks_cql)
+        )));
 
 
         if (Boolean.parseBoolean(System.getProperty("cassandra.test.compression", "false")))
@@ -320,144 +256,261 @@
 
         // if you're messing with low-level sstable stuff, it can be useful to inject the schema directly
         // Schema.instance.load(schemaDefinition());
-        for (KSMetaData ksm : schema)
+        for (KeyspaceMetadata ksm : schema)
             MigrationManager.announceNewKeyspace(ksm, false);
     }
 
-    public static void createKeyspace(String keyspaceName,
-                                      Class<? extends AbstractReplicationStrategy> strategy,
-                                      Map<String, String> options,
-                                      CFMetaData... cfmetas) throws ConfigurationException
+    public static void createKeyspace(String name, KeyspaceParams params, CFMetaData... tables)
     {
-        createKeyspace(keyspaceName, true, true, strategy, options, cfmetas);
+        MigrationManager.announceNewKeyspace(KeyspaceMetadata.create(name, params, Tables.of(tables)), true);
     }
 
-    public static void createKeyspace(String keyspaceName,
-                                      boolean durable,
-                                      boolean announceLocally,
-                                      Class<? extends AbstractReplicationStrategy> strategy,
-                                      Map<String, String> options,
-                                      CFMetaData... cfmetas) throws ConfigurationException
+    public static void createKeyspace(String name, KeyspaceParams params, Tables tables, Types types)
     {
-        KSMetaData ksm = durable ? KSMetaData.testMetadata(keyspaceName, strategy, options, cfmetas)
-                                 : KSMetaData.testMetadataNotDurable(keyspaceName, strategy, options, cfmetas);
-        MigrationManager.announceNewKeyspace(ksm, announceLocally);
+        MigrationManager.announceNewKeyspace(KeyspaceMetadata.create(name, params, tables, Views.none(), types, Functions.none()), true);
     }
 
     public static ColumnDefinition integerColumn(String ksName, String cfName)
     {
         return new ColumnDefinition(ksName,
                                     cfName,
-                                    new ColumnIdentifier(IntegerType.instance.fromString("42"), IntegerType.instance),
+                                    ColumnIdentifier.getInterned(IntegerType.instance.fromString("42"), IntegerType.instance),
                                     UTF8Type.instance,
-                                    null,
-                                    null,
-                                    null,
-                                    null,
+                                    ColumnDefinition.NO_POSITION,
                                     ColumnDefinition.Kind.REGULAR);
     }
 
-    private static ColumnDefinition utf8Column(String ksName, String cfName)
+    public static ColumnDefinition utf8Column(String ksName, String cfName)
     {
         return new ColumnDefinition(ksName,
                                     cfName,
-                                    new ColumnIdentifier("fortytwo", true),
+                                    ColumnIdentifier.getInterned("fortytwo", true),
                                     UTF8Type.instance,
-                                    null,
-                                    null,
-                                    null,
-                                    null,
+                                    ColumnDefinition.NO_POSITION,
                                     ColumnDefinition.Kind.REGULAR);
     }
 
     public static CFMetaData perRowIndexedCFMD(String ksName, String cfName)
     {
         final Map<String, String> indexOptions = Collections.singletonMap(
-                                                      SecondaryIndex.CUSTOM_INDEX_OPTION_NAME,
-                                                      PerRowSecondaryIndexTest.TestIndex.class.getName());
+                                                      IndexTarget.CUSTOM_INDEX_OPTION_NAME,
+                                                      StubIndex.class.getName());
 
-        CFMetaData cfm =  CFMetaData.sparseCFMetaData(ksName, cfName, AsciiType.instance).keyValidator(AsciiType.instance);
+        CFMetaData cfm =  CFMetaData.Builder.create(ksName, cfName)
+                .addPartitionKey("key", AsciiType.instance)
+                .build();
 
-        ByteBuffer cName = ByteBufferUtil.bytes("indexed");
-        return cfm.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(cfm, cName, AsciiType.instance, null)
-                                                                .setIndex("indexe1", IndexType.CUSTOM, indexOptions));
+        ColumnDefinition indexedColumn = ColumnDefinition.regularDef(ksName, cfName, "indexed", AsciiType.instance);
+        cfm.addOrReplaceColumnDefinition(indexedColumn);
+
+        cfm.indexes(
+            cfm.getIndexes()
+               .with(IndexMetadata.fromIndexTargets(cfm,
+                                                    Collections.singletonList(new IndexTarget(indexedColumn.name,
+                                                                                              IndexTarget.Type.VALUES)),
+                                                    "indexe1",
+                                                    IndexMetadata.Kind.CUSTOM,
+                                                    indexOptions)));
+        return cfm;
     }
 
-    private static void useCompression(List<KSMetaData> schema)
+    private static void useCompression(List<KeyspaceMetadata> schema)
     {
-        for (KSMetaData ksm : schema)
-        {
-            for (CFMetaData cfm : ksm.cfMetaData().values())
-            {
-                cfm.compressionParameters(new CompressionParameters(SnappyCompressor.instance));
-            }
-        }
+        for (KeyspaceMetadata ksm : schema)
+            for (CFMetaData cfm : ksm.tablesAndViews())
+                cfm.compression(CompressionParams.snappy());
+    }
+
+    public static CFMetaData counterCFMD(String ksName, String cfName)
+    {
+        return CFMetaData.Builder.create(ksName, cfName, false, true, true)
+                .addPartitionKey("key", AsciiType.instance)
+                .addClusteringColumn("name", AsciiType.instance)
+                .addRegularColumn("val", CounterColumnType.instance)
+                .addRegularColumn("val2", CounterColumnType.instance)
+                .build()
+                .compression(getCompressionParameters());
     }
 
     public static CFMetaData standardCFMD(String ksName, String cfName)
     {
-        return CFMetaData.denseCFMetaData(ksName, cfName, BytesType.instance).compressionParameters(getCompressionParameters());
+        return standardCFMD(ksName, cfName, 1, AsciiType.instance);
     }
 
-    public static CFMetaData standardCFMD(String ksName, String cfName, AbstractType<?> comparator)
+    public static CFMetaData standardCFMD(String ksName, String cfName, int columnCount, AbstractType<?> keyType)
     {
-        return CFMetaData.denseCFMetaData(ksName, cfName, comparator).compressionParameters(getCompressionParameters());
+        return standardCFMD(ksName, cfName, columnCount, keyType, AsciiType.instance);
+    }
+
+    public static CFMetaData standardCFMD(String ksName, String cfName, int columnCount, AbstractType<?> keyType, AbstractType<?> valType)
+    {
+        return standardCFMD(ksName, cfName, columnCount, keyType, valType, AsciiType.instance);
+    }
+
+    public static CFMetaData standardCFMD(String ksName, String cfName, int columnCount, AbstractType<?> keyType, AbstractType<?> valType, AbstractType<?> clusteringType)
+    {
+        CFMetaData.Builder builder;
+        builder = CFMetaData.Builder.create(ksName, cfName)
+                                    .addPartitionKey("key", keyType)
+                                    .addRegularColumn("val", valType);
+
+        if(clusteringType != null)
+            builder = builder.addClusteringColumn("name", clusteringType);
+
+        for (int i = 0; i < columnCount; i++)
+            builder.addRegularColumn("val" + i, AsciiType.instance);
+
+        return builder.build()
+                      .compression(getCompressionParameters());
+    }
+
+    public static CFMetaData staticCFMD(String ksName, String cfName)
+    {
+        return CFMetaData.Builder.create(ksName, cfName)
+                                 .addPartitionKey("key", AsciiType.instance)
+                                 .addClusteringColumn("cols", AsciiType.instance)
+                                 .addStaticColumn("val", AsciiType.instance)
+                                 .addRegularColumn("val2", AsciiType.instance)
+                                 .build();
+    }
+
+
+    public static CFMetaData denseCFMD(String ksName, String cfName)
+    {
+        return denseCFMD(ksName, cfName, AsciiType.instance);
+    }
+    public static CFMetaData denseCFMD(String ksName, String cfName, AbstractType cc)
+    {
+        return denseCFMD(ksName, cfName, cc, null);
+    }
+    public static CFMetaData denseCFMD(String ksName, String cfName, AbstractType cc, AbstractType subcc)
+    {
+        AbstractType comp = cc;
+        if (subcc != null)
+            comp = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{cc, subcc}));
+
+        return CFMetaData.Builder.createDense(ksName, cfName, subcc != null, false)
+            .addPartitionKey("key", AsciiType.instance)
+            .addClusteringColumn("cols", comp)
+            .addRegularColumn("val", AsciiType.instance)
+            .build()
+            .compression(getCompressionParameters());
     }
 
     public static CFMetaData superCFMD(String ksName, String cfName, AbstractType subcc)
     {
-        return superCFMD(ksName, cfName, BytesType.instance, subcc).compressionParameters(getCompressionParameters());
+        return superCFMD(ksName, cfName, BytesType.instance, subcc);
     }
 
     public static CFMetaData superCFMD(String ksName, String cfName, AbstractType cc, AbstractType subcc)
     {
-        return CFMetaData.denseCFMetaData(ksName, cfName, cc, subcc).compressionParameters(getCompressionParameters());
+        return CFMetaData.Builder.createSuper(ksName, cfName, false)
+                                 .addPartitionKey("key", BytesType.instance)
+                                 .addClusteringColumn("column1", cc)
+                                 .addRegularColumn("", MapType.getInstance(AsciiType.instance, subcc, true))
+                                 .build();
+
+    }
+    public static CFMetaData compositeIndexCFMD(String ksName, String cfName, boolean withIndex) throws ConfigurationException
+    {
+        // the withIndex flag exists to allow tests index creation
+        // on existing columns
+        CFMetaData cfm = CFMetaData.Builder.create(ksName, cfName)
+                .addPartitionKey("key", AsciiType.instance)
+                .addClusteringColumn("c1", AsciiType.instance)
+                .addRegularColumn("birthdate", LongType.instance)
+                .addRegularColumn("notbirthdate", LongType.instance)
+                .build();
+
+        if (withIndex)
+            cfm.indexes(
+                cfm.getIndexes()
+                   .with(IndexMetadata.fromIndexTargets(cfm,
+                                                        Collections.singletonList(
+                                                            new IndexTarget(new ColumnIdentifier("birthdate", true),
+                                                                            IndexTarget.Type.VALUES)),
+                                                        "birthdate_key_index",
+                                                        IndexMetadata.Kind.COMPOSITES,
+                                                        Collections.EMPTY_MAP)));
+
+        return cfm.compression(getCompressionParameters());
     }
 
-    public static CFMetaData indexCFMD(String ksName, String cfName, final Boolean withIdxType) throws ConfigurationException
+    public static CFMetaData compositeMultipleIndexCFMD(String ksName, String cfName) throws ConfigurationException
     {
-        CFMetaData cfm = CFMetaData.sparseCFMetaData(ksName, cfName, BytesType.instance).keyValidator(AsciiType.instance);
+        CFMetaData cfm = CFMetaData.Builder.create(ksName, cfName)
+                                           .addPartitionKey("key", AsciiType.instance)
+                                           .addClusteringColumn("c1", AsciiType.instance)
+                                           .addRegularColumn("birthdate", LongType.instance)
+                                           .addRegularColumn("notbirthdate", LongType.instance)
+                                           .build();
 
-        ByteBuffer cName = ByteBufferUtil.bytes("birthdate");
-        IndexType keys = withIdxType ? IndexType.KEYS : null;
-        return cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, cName, LongType.instance, null)
-                                                       .setIndex(withIdxType ? ByteBufferUtil.bytesToHex(cName) : null, keys, null))
-                                      .compressionParameters(getCompressionParameters());
+        cfm.indexes(
+            cfm.getIndexes()
+               .with(IndexMetadata.fromIndexTargets(cfm,
+                                                    Collections.singletonList(
+                                                    new IndexTarget(new ColumnIdentifier("birthdate", true),
+                                                                    IndexTarget.Type.VALUES)),
+                                                    "birthdate_key_index",
+                                                    IndexMetadata.Kind.COMPOSITES,
+                                                    Collections.EMPTY_MAP))
+               .with(IndexMetadata.fromIndexTargets(cfm,
+                                                    Collections.singletonList(
+                                                    new IndexTarget(new ColumnIdentifier("notbirthdate", true),
+                                                                    IndexTarget.Type.VALUES)),
+                                                    "notbirthdate_key_index",
+                                                    IndexMetadata.Kind.COMPOSITES,
+                                                    Collections.EMPTY_MAP))
+        );
+
+
+        return cfm.compression(getCompressionParameters());
     }
 
-    public static CFMetaData compositeIndexCFMD(String ksName, String cfName, final Boolean withIdxType) throws ConfigurationException
-    {
-        final CompositeType composite = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{UTF8Type.instance, UTF8Type.instance})); 
-        CFMetaData cfm = CFMetaData.sparseCFMetaData(ksName, cfName, composite);
 
-        ByteBuffer cName = ByteBufferUtil.bytes("col1");
-        IndexType idxType = withIdxType ? IndexType.COMPOSITES : null;
-        return cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, cName, UTF8Type.instance, 1)
-                                                       .setIndex(withIdxType ? "col1_idx" : null, idxType, Collections.<String, String>emptyMap()))
-                                       .compressionParameters(getCompressionParameters());
+    public static CFMetaData keysIndexCFMD(String ksName, String cfName, boolean withIndex) throws ConfigurationException
+    {
+        CFMetaData cfm = CFMetaData.Builder.createDense(ksName, cfName, false, false)
+                                           .addPartitionKey("key", AsciiType.instance)
+                                           .addClusteringColumn("c1", AsciiType.instance)
+                                           .addStaticColumn("birthdate", LongType.instance)
+                                           .addStaticColumn("notbirthdate", LongType.instance)
+                                           .addRegularColumn("value", LongType.instance)
+                                           .build();
+
+        if (withIndex)
+            cfm.indexes(
+                cfm.getIndexes()
+                   .with(IndexMetadata.fromIndexTargets(cfm,
+                                                        Collections.singletonList(
+                                                            new IndexTarget(new ColumnIdentifier("birthdate", true),
+                                                                            IndexTarget.Type.VALUES)),
+                                                         "birthdate_composite_index",
+                                                         IndexMetadata.Kind.KEYS,
+                                                         Collections.EMPTY_MAP)));
+
+
+        return cfm.compression(getCompressionParameters());
     }
     
-    private static CFMetaData jdbcCFMD(String ksName, String cfName, AbstractType comp)
+    public static CFMetaData jdbcCFMD(String ksName, String cfName, AbstractType comp)
     {
-        return CFMetaData.denseCFMetaData(ksName, cfName, comp).defaultValidator(comp).compressionParameters(getCompressionParameters());
+        return CFMetaData.Builder.create(ksName, cfName).addPartitionKey("key", BytesType.instance)
+                                                        .build()
+                                                        .compression(getCompressionParameters());
     }
 
-    public static CFMetaData jdbcSparseCFMD(String ksName, String cfName, AbstractType comp)
-    {
-        return CFMetaData.sparseCFMetaData(ksName, cfName, comp).defaultValidator(comp).compressionParameters(getCompressionParameters());
-    }
-
-    public static CompressionParameters getCompressionParameters()
+    public static CompressionParams getCompressionParameters()
     {
         return getCompressionParameters(null);
     }
 
-    public static CompressionParameters getCompressionParameters(Integer chunkSize)
+    public static CompressionParams getCompressionParameters(Integer chunkSize)
     {
         if (Boolean.parseBoolean(System.getProperty("cassandra.test.compression", "false")))
-            return new CompressionParameters(SnappyCompressor.instance, chunkSize, Collections.<String, String>emptyMap());
-        else
-            return new CompressionParameters(null);
+            return CompressionParams.snappy(chunkSize);
+
+        return CompressionParams.noCompression();
     }
 
     public static void cleanupAndLeaveDirs() throws IOException
@@ -507,25 +560,19 @@
 
     public static void insertData(String keyspace, String columnFamily, int offset, int numberOfRows)
     {
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace, columnFamily);
+
         for (int i = offset; i < offset + numberOfRows; i++)
         {
-            ByteBuffer key = ByteBufferUtil.bytes("key" + i);
-            Mutation mutation = new Mutation(keyspace, key);
-            mutation.add(columnFamily, Util.cellname("col" + i), ByteBufferUtil.bytes("val" + i), System.currentTimeMillis());
-            mutation.applyUnsafe();
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes("key"+i));
+            if (cfm.clusteringColumns() != null && !cfm.clusteringColumns().isEmpty())
+                builder.clustering(ByteBufferUtil.bytes("col"+ i)).add("val", ByteBufferUtil.bytes("val" + i));
+            else
+                builder.add("val", ByteBufferUtil.bytes("val"+i));
+            builder.build().apply();
         }
     }
 
-    /* usually used to populate the cache */
-    public static void readData(String keyspace, String columnFamily, int offset, int numberOfRows)
-    {
-        ColumnFamilyStore store = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
-        for (int i = offset; i < offset + numberOfRows; i++)
-        {
-            DecoratedKey key = Util.dk("key" + i);
-            store.getColumnFamily(Util.namesQueryFilter(store, key, "col" + i));
-        }
-    }
 
     public static void cleanupSavedCaches()
     {

diff --git a/test/unit/org/apache/cassandra/TeeingAppender.java b/test/unit/org/apache/cassandra/TeeingAppender.java
new file mode 100644
index 0000000..5022ab1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/TeeingAppender.java

@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra;
+
+import java.util.Iterator;
+
+import ch.qos.logback.core.Appender;
+import ch.qos.logback.core.UnsynchronizedAppenderBase;
+import ch.qos.logback.core.spi.AppenderAttachable;
+import ch.qos.logback.core.spi.AppenderAttachableImpl;
+
+public class TeeingAppender<E> extends UnsynchronizedAppenderBase<E> implements AppenderAttachable<E>
+{
+    AppenderAttachableImpl<E> aai = new AppenderAttachableImpl<>();
+
+    @Override
+    protected void append(E e)
+    {
+        aai.appendLoopOnAppenders(e);
+    }
+
+    @Override
+    public void addAppender(Appender<E> appender)
+    {
+        aai.addAppender(appender);
+    }
+
+    @Override
+    public void detachAndStopAllAppenders()
+    {
+        aai.detachAndStopAllAppenders();
+    }
+
+    @Override
+    public boolean detachAppender(Appender<E> appender)
+    {
+        return aai.detachAppender(appender);
+    }
+
+    @Override
+    public boolean detachAppender(String name)
+    {
+        return aai.detachAppender(name);
+    }
+
+    @Override
+    public Appender<E> getAppender(String name)
+    {
+        return aai.getAppender(name);
+    }
+
+    @Override
+    public boolean isAttached(Appender<E> appender)
+    {
+        return aai.isAttached(appender);
+    }
+
+    @Override
+    public Iterator<Appender<E>> iteratorForAppenders()
+    {
+        return aai.iteratorForAppenders();
+    }
+
+    @Override
+    public void stop()
+    {
+        try
+        {
+            if (started)
+                detachAndStopAllAppenders();
+        }
+        finally
+        {
+            super.stop();
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/UpdateBuilder.java b/test/unit/org/apache/cassandra/UpdateBuilder.java
new file mode 100644
index 0000000..3a5fbe6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/UpdateBuilder.java

@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.service.StorageService;
+
+
+/**
+ * Convenience object to create updates to a single partition.
+ *
+ * This is not unlike RowUpdateBuilder except that it allows to create update to multiple rows more easily.
+ * It is also aimed at unit tests so favor convenience over efficiency.
+ */
+public class UpdateBuilder
+{
+    private final PartitionUpdate update;
+    private RowUpdateBuilder currentRow;
+    private long timestamp = FBUtilities.timestampMicros();
+
+    private UpdateBuilder(CFMetaData metadata, DecoratedKey partitionKey)
+    {
+        this.update = new PartitionUpdate(metadata, partitionKey, metadata.partitionColumns(), 4);
+    }
+
+    public static UpdateBuilder create(CFMetaData metadata, Object... partitionKey)
+    {
+        return new UpdateBuilder(metadata, makeKey(metadata, partitionKey));
+    }
+
+    public UpdateBuilder withTimestamp(long timestamp)
+    {
+        this.timestamp = timestamp;
+        return this;
+    }
+
+    public UpdateBuilder newRow(Object... clustering)
+    {
+        maybeBuildCurrentRow();
+        currentRow = new RowUpdateBuilder(update, timestamp, 0);
+        if (clustering.length > 0)
+            currentRow.clustering(clustering);
+        return this;
+    }
+
+    public UpdateBuilder add(String column, Object value)
+    {
+        assert currentRow != null;
+        currentRow.add(column, value);
+        return this;
+    }
+
+    public PartitionUpdate build()
+    {
+        maybeBuildCurrentRow();
+        return update;
+    }
+
+    public IMutation makeMutation()
+    {
+        Mutation m = new Mutation(build());
+        return update.metadata().isCounter()
+             ? new CounterMutation(m, ConsistencyLevel.ONE)
+             : m;
+    }
+
+    public void apply()
+    {
+        Mutation m = new Mutation(build());
+        if (update.metadata().isCounter())
+            new CounterMutation(m, ConsistencyLevel.ONE).apply();
+        else
+            m.apply();
+    }
+
+    public void applyUnsafe()
+    {
+        assert !update.metadata().isCounter() : "Counters have currently no applyUnsafe() option";
+        new Mutation(build()).applyUnsafe();
+    }
+
+    private void maybeBuildCurrentRow()
+    {
+        if (currentRow != null)
+        {
+            currentRow.build();
+            currentRow = null;
+        }
+    }
+
+    private static DecoratedKey makeKey(CFMetaData metadata, Object[] partitionKey)
+    {
+        if (partitionKey.length == 1 && partitionKey[0] instanceof DecoratedKey)
+            return (DecoratedKey)partitionKey[0];
+
+        ByteBuffer key = CFMetaData.serializePartitionKey(metadata.getKeyValidatorAsClusteringComparator().make(partitionKey));
+        return metadata.decorateKey(key);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index f6b4771..a49440d 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java

@@ -3,8 +3,7 @@
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
+ * distributed with this work for additional information * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
@@ -20,28 +19,40 @@
  *
  */
 
-import java.io.*;
+import java.io.Closeable;
+import java.io.EOFException;
+import java.io.IOError;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.function.Supplier;
 
-import com.google.common.base.Supplier;
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterators;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.Directories.DataDirectory;
 import org.apache.cassandra.db.compaction.AbstractCompactionTask;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.dht.IPartitioner;
+
 import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.VersionedValue;
@@ -50,111 +61,85 @@
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CounterId;
+import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 public class Util
 {
     private static List<UUID> hostIdPool = new ArrayList<UUID>();
 
+    public static IPartitioner testPartitioner()
+    {
+        return DatabaseDescriptor.getPartitioner();
+    }
+
     public static DecoratedKey dk(String key)
     {
-        return StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(key));
+        return testPartitioner().decorateKey(ByteBufferUtil.bytes(key));
     }
 
     public static DecoratedKey dk(String key, AbstractType<?> type)
     {
-        return StorageService.getPartitioner().decorateKey(type.fromString(key));
+        return testPartitioner().decorateKey(type.fromString(key));
     }
 
     public static DecoratedKey dk(ByteBuffer key)
     {
-        return StorageService.getPartitioner().decorateKey(key);
+        return testPartitioner().decorateKey(key);
     }
 
-    public static RowPosition rp(String key)
+    public static PartitionPosition rp(String key)
     {
-        return rp(key, StorageService.getPartitioner());
+        return rp(key, testPartitioner());
     }
 
-    public static RowPosition rp(String key, IPartitioner partitioner)
+    public static PartitionPosition rp(String key, IPartitioner partitioner)
     {
-        return RowPosition.ForKey.get(ByteBufferUtil.bytes(key), partitioner);
+        return PartitionPosition.ForKey.get(ByteBufferUtil.bytes(key), partitioner);
     }
 
-    public static CellName cellname(ByteBuffer... bbs)
+    public static Cell getRegularCell(CFMetaData metadata, Row row, String name)
     {
-        if (bbs.length == 1)
-            return CellNames.simpleDense(bbs[0]);
-        else
-            return CellNames.compositeDense(bbs);
+        ColumnDefinition column = metadata.getColumnDefinition(ByteBufferUtil.bytes(name));
+        assert column != null;
+        return row.getCell(column);
     }
 
-    public static CellName cellname(String... strs)
+    public static Clustering clustering(ClusteringComparator comparator, Object... o)
     {
-        ByteBuffer[] bbs = new ByteBuffer[strs.length];
-        for (int i = 0; i < strs.length; i++)
-            bbs[i] = ByteBufferUtil.bytes(strs[i]);
-        return cellname(bbs);
-    }
-
-    public static CellName cellname(int i)
-    {
-        return CellNames.simpleDense(ByteBufferUtil.bytes(i));
-    }
-
-    public static CellName cellname(long l)
-    {
-        return CellNames.simpleDense(ByteBufferUtil.bytes(l));
-    }
-
-    public static Cell column(String name, String value, long timestamp)
-    {
-        return new BufferCell(cellname(name), ByteBufferUtil.bytes(value), timestamp);
-    }
-
-    public static Cell column(String name, long value, long timestamp)
-    {
-        return new BufferCell(cellname(name), ByteBufferUtil.bytes(value), timestamp);
-    }
-
-    public static Cell column(String clusterKey, String name, long value, long timestamp)
-    {
-        return new BufferCell(cellname(clusterKey, name), ByteBufferUtil.bytes(value), timestamp);
-    }
-
-    public static Cell expiringColumn(String name, String value, long timestamp, int ttl)
-    {
-        return new BufferExpiringCell(cellname(name), ByteBufferUtil.bytes(value), timestamp, ttl);
+        return comparator.make(o);
     }
 
     public static Token token(String key)
     {
-        return StorageService.getPartitioner().getToken(ByteBufferUtil.bytes(key));
+        return testPartitioner().getToken(ByteBufferUtil.bytes(key));
     }
 
-    public static Range<RowPosition> range(String left, String right)
+    public static Range<PartitionPosition> range(String left, String right)
     {
-        return new Range<RowPosition>(rp(left), rp(right));
+        return new Range<>(rp(left), rp(right));
     }
 
-    public static Range<RowPosition> range(IPartitioner p, String left, String right)
+    public static Range<PartitionPosition> range(IPartitioner p, String left, String right)
     {
-        return new Range<RowPosition>(rp(left, p), rp(right, p));
+        return new Range<>(rp(left, p), rp(right, p));
     }
 
-    public static Bounds<RowPosition> bounds(String left, String right)
+    //Test helper to make an iterator iterable once
+    public static <T> Iterable<T> once(final Iterator<T> source)
     {
-        return new Bounds<RowPosition>(rp(left), rp(right));
-    }
-
-    public static void addMutation(Mutation rm, String columnFamilyName, String superColumnName, long columnName, String value, long timestamp)
-    {
-        CellName cname = superColumnName == null
-                       ? CellNames.simpleDense(getBytes(columnName))
-                       : CellNames.compositeDense(ByteBufferUtil.bytes(superColumnName), getBytes(columnName));
-        rm.add(columnFamilyName, cname, ByteBufferUtil.bytes(value), timestamp);
+        return new Iterable<T>()
+        {
+            private AtomicBoolean exhausted = new AtomicBoolean();
+            public Iterator<T> iterator()
+            {
+                Preconditions.checkState(!exhausted.getAndSet(true));
+                return source;
+            }
+        };
     }
 
     public static ByteBuffer getBytes(long v)
@@ -175,39 +160,6 @@
         return bb;
     }
 
-    public static ByteBuffer getBytes(short v)
-    {
-        byte[] bytes = new byte[2];
-        ByteBuffer bb = ByteBuffer.wrap(bytes);
-        bb.putShort(v);
-        bb.rewind();
-        return bb;
-    }
-
-    public static ByteBuffer getBytes(byte v)
-    {
-        byte[] bytes = new byte[1];
-        ByteBuffer bb = ByteBuffer.wrap(bytes);
-        bb.put(v);
-        bb.rewind();
-        return bb;
-    }
-
-    public static List<Row> getRangeSlice(ColumnFamilyStore cfs)
-    {
-        return getRangeSlice(cfs, null);
-    }
-
-    public static List<Row> getRangeSlice(ColumnFamilyStore cfs, ByteBuffer superColumn)
-    {
-        IDiskAtomFilter filter = superColumn == null
-                               ? new IdentityQueryFilter()
-                               : new SliceQueryFilter(SuperColumns.startOf(superColumn), SuperColumns.endOf(superColumn), false, Integer.MAX_VALUE);
-
-        Token min = StorageService.getPartitioner().getMinimumToken();
-        return cfs.getRangeSlice(Bounds.makeRowBounds(min, min), null, filter, 10000);
-    }
-
     /**
      * Writes out a bunch of mutations for a single column family.
      *
@@ -228,23 +180,11 @@
         return store;
     }
 
-    public static ColumnFamily getColumnFamily(Keyspace keyspace, DecoratedKey key, String cfName)
-    {
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(cfName);
-        assert cfStore != null : "Table " + cfName + " has not been defined";
-        return cfStore.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-    }
-
     public static boolean equalsCounterId(CounterId n, ByteBuffer context, int offset)
     {
         return CounterId.wrap(context, context.position() + offset).equals(n);
     }
 
-    public static ColumnFamily cloneAndRemoveDeleted(ColumnFamily cf, int gcBefore)
-    {
-        return ColumnFamilyStore.removeDeleted(cf.cloneMe(), gcBefore);
-    }
-
     /**
      * Creates initial set of nodes and tokens. Nodes are added to StorageService as 'normal'
      */
@@ -284,15 +224,15 @@
     public static Future<?> compactAll(ColumnFamilyStore cfs, int gcBefore)
     {
         List<Descriptor> descriptors = new ArrayList<>();
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             descriptors.add(sstable.descriptor);
         return CompactionManager.instance.submitUserDefined(cfs, descriptors, gcBefore);
     }
 
     public static void compact(ColumnFamilyStore cfs, Collection<SSTableReader> sstables)
     {
-        int gcBefore = cfs.gcBefore(System.currentTimeMillis());
-        AbstractCompactionTask task = cfs.getCompactionStrategy().getUserDefinedTask(sstables, gcBefore);
+        int gcBefore = cfs.gcBefore(FBUtilities.nowInSeconds());
+        AbstractCompactionTask task = cfs.getCompactionStrategyManager().getUserDefinedTask(sstables, gcBefore);
         task.execute(null);
     }
 
@@ -318,53 +258,269 @@
         assert thrown : exception.getName() + " not received";
     }
 
-    public static QueryFilter namesQueryFilter(ColumnFamilyStore cfs, DecoratedKey key)
+    public static AbstractReadCommandBuilder.SinglePartitionBuilder cmd(ColumnFamilyStore cfs, Object... partitionKey)
     {
-        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
-        return QueryFilter.getNamesFilter(key, cfs.name, s, System.currentTimeMillis());
+        return new AbstractReadCommandBuilder.SinglePartitionBuilder(cfs, makeKey(cfs.metadata, partitionKey));
     }
 
-    public static QueryFilter namesQueryFilter(ColumnFamilyStore cfs, DecoratedKey key, String... names)
+    public static AbstractReadCommandBuilder.PartitionRangeBuilder cmd(ColumnFamilyStore cfs)
     {
-        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
-        for (String str : names)
-            s.add(cellname(str));
-        return QueryFilter.getNamesFilter(key, cfs.name, s, System.currentTimeMillis());
+        return new AbstractReadCommandBuilder.PartitionRangeBuilder(cfs);
     }
 
-    public static QueryFilter namesQueryFilter(ColumnFamilyStore cfs, DecoratedKey key, CellName... names)
+    static DecoratedKey makeKey(CFMetaData metadata, Object... partitionKey)
     {
-        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
-        for (CellName n : names)
-            s.add(n);
-        return QueryFilter.getNamesFilter(key, cfs.name, s, System.currentTimeMillis());
+        if (partitionKey.length == 1 && partitionKey[0] instanceof DecoratedKey)
+            return (DecoratedKey)partitionKey[0];
+
+        ByteBuffer key = CFMetaData.serializePartitionKey(metadata.getKeyValidatorAsClusteringComparator().make(partitionKey));
+        return metadata.decorateKey(key);
     }
 
-    public static NamesQueryFilter namesFilter(ColumnFamilyStore cfs, String... names)
+    public static void assertEmptyUnfiltered(ReadCommand command)
     {
-        SortedSet<CellName> s = new TreeSet<CellName>(cfs.getComparator());
-        for (String str : names)
-            s.add(cellname(str));
-        return new NamesQueryFilter(s);
-    }
-
-    public static String string(ByteBuffer bb)
-    {
-        try
+        try (ReadOrderGroup orderGroup = command.startOrderGroup(); UnfilteredPartitionIterator iterator = command.executeLocally(orderGroup))
         {
-            return ByteBufferUtil.string(bb);
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
+            if (iterator.hasNext())
+            {
+                try (UnfilteredRowIterator partition = iterator.next())
+                {
+                    throw new AssertionError("Expected no results for query " + command.toCQLString() + " but got key " + command.metadata().getKeyValidator().getString(partition.partitionKey().getKey()));
+                }
+            }
         }
     }
 
-    public static RangeTombstone tombstone(String start, String finish, long timestamp, int localtime)
+    public static void assertEmpty(ReadCommand command)
     {
-        Composite startName = CellNames.simpleDense(ByteBufferUtil.bytes(start));
-        Composite endName = CellNames.simpleDense(ByteBufferUtil.bytes(finish));
-        return new RangeTombstone(startName, endName, timestamp , localtime);
+        try (ReadOrderGroup orderGroup = command.startOrderGroup(); PartitionIterator iterator = command.executeInternal(orderGroup))
+        {
+            if (iterator.hasNext())
+            {
+                try (RowIterator partition = iterator.next())
+                {
+                    throw new AssertionError("Expected no results for query " + command.toCQLString() + " but got key " + command.metadata().getKeyValidator().getString(partition.partitionKey().getKey()));
+                }
+            }
+        }
+    }
+
+    public static List<ImmutableBTreePartition> getAllUnfiltered(ReadCommand command)
+    {
+        List<ImmutableBTreePartition> results = new ArrayList<>();
+        try (ReadOrderGroup orderGroup = command.startOrderGroup(); UnfilteredPartitionIterator iterator = command.executeLocally(orderGroup))
+        {
+            while (iterator.hasNext())
+            {
+                try (UnfilteredRowIterator partition = iterator.next())
+                {
+                    results.add(ImmutableBTreePartition.create(partition));
+                }
+            }
+        }
+        return results;
+    }
+
+    public static List<FilteredPartition> getAll(ReadCommand command)
+    {
+        List<FilteredPartition> results = new ArrayList<>();
+        try (ReadOrderGroup orderGroup = command.startOrderGroup(); PartitionIterator iterator = command.executeInternal(orderGroup))
+        {
+            while (iterator.hasNext())
+            {
+                try (RowIterator partition = iterator.next())
+                {
+                    results.add(FilteredPartition.create(partition));
+                }
+            }
+        }
+        return results;
+    }
+
+    public static Row getOnlyRowUnfiltered(ReadCommand cmd)
+    {
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator iterator = cmd.executeLocally(orderGroup))
+        {
+            assert iterator.hasNext() : "Expecting one row in one partition but got nothing";
+            try (UnfilteredRowIterator partition = iterator.next())
+            {
+                assert !iterator.hasNext() : "Expecting a single partition but got more";
+
+                assert partition.hasNext() : "Expecting one row in one partition but got an empty partition";
+                Row row = ((Row)partition.next());
+                assert !partition.hasNext() : "Expecting a single row but got more";
+                return row;
+            }
+        }
+    }
+
+    public static Row getOnlyRow(ReadCommand cmd)
+    {
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); PartitionIterator iterator = cmd.executeInternal(orderGroup))
+        {
+            assert iterator.hasNext() : "Expecting one row in one partition but got nothing";
+            try (RowIterator partition = iterator.next())
+            {
+                assert !iterator.hasNext() : "Expecting a single partition but got more";
+                assert partition.hasNext() : "Expecting one row in one partition but got an empty partition";
+                Row row = partition.next();
+                assert !partition.hasNext() : "Expecting a single row but got more";
+                return row;
+            }
+        }
+    }
+
+    public static ImmutableBTreePartition getOnlyPartitionUnfiltered(ReadCommand cmd)
+    {
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator iterator = cmd.executeLocally(orderGroup))
+        {
+            assert iterator.hasNext() : "Expecting a single partition but got nothing";
+            try (UnfilteredRowIterator partition = iterator.next())
+            {
+                assert !iterator.hasNext() : "Expecting a single partition but got more";
+                return ImmutableBTreePartition.create(partition);
+            }
+        }
+    }
+
+    public static FilteredPartition getOnlyPartition(ReadCommand cmd)
+    {
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); PartitionIterator iterator = cmd.executeInternal(orderGroup))
+        {
+            assert iterator.hasNext() : "Expecting a single partition but got nothing";
+            try (RowIterator partition = iterator.next())
+            {
+                assert !iterator.hasNext() : "Expecting a single partition but got more";
+                return FilteredPartition.create(partition);
+            }
+        }
+    }
+
+    public static UnfilteredRowIterator apply(Mutation mutation)
+    {
+        mutation.apply();
+        assert mutation.getPartitionUpdates().size() == 1;
+        return mutation.getPartitionUpdates().iterator().next().unfilteredIterator();
+    }
+
+    public static Cell cell(ColumnFamilyStore cfs, Row row, String columnName)
+    {
+        ColumnDefinition def = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes(columnName));
+        assert def != null;
+        return row.getCell(def);
+    }
+
+    public static Row row(Partition partition, Object... clustering)
+    {
+        return partition.getRow(partition.metadata().comparator.make(clustering));
+    }
+
+    public static void assertCellValue(Object value, ColumnFamilyStore cfs, Row row, String columnName)
+    {
+        Cell cell = cell(cfs, row, columnName);
+        assert cell != null : "Row " + row.toString(cfs.metadata) + " has no cell for " + columnName;
+        assertEquals(value, cell.column().type.compose(cell.value()));
+    }
+
+    public static void consume(UnfilteredRowIterator iter)
+    {
+        try (UnfilteredRowIterator iterator = iter)
+        {
+            while (iter.hasNext())
+                iter.next();
+        }
+    }
+
+    public static int size(PartitionIterator iter)
+    {
+        int size = 0;
+        while (iter.hasNext())
+        {
+            ++size;
+            iter.next().close();
+        }
+        return size;
+    }
+
+    public static CBuilder getCBuilderForCFM(CFMetaData cfm)
+    {
+        List<ColumnDefinition> clusteringColumns = cfm.clusteringColumns();
+        List<AbstractType<?>> types = new ArrayList<>(clusteringColumns.size());
+        for (ColumnDefinition def : clusteringColumns)
+            types.add(def.type);
+        return CBuilder.create(new ClusteringComparator(types));
+    }
+
+    public static boolean equal(UnfilteredRowIterator a, UnfilteredRowIterator b)
+    {
+        return Objects.equals(a.columns(), b.columns())
+            && Objects.equals(a.metadata(), b.metadata())
+            && Objects.equals(a.isReverseOrder(), b.isReverseOrder())
+            && Objects.equals(a.partitionKey(), b.partitionKey())
+            && Objects.equals(a.partitionLevelDeletion(), b.partitionLevelDeletion())
+            && Objects.equals(a.staticRow(), b.staticRow())
+            && Objects.equals(a.stats(), b.stats())
+            && Iterators.elementsEqual(a, b);
+    }
+
+    // moved & refactored from KeyspaceTest in < 3.0
+    public static void assertColumns(Row row, String... expectedColumnNames)
+    {
+        Iterator<Cell> cells = row == null ? Iterators.<Cell>emptyIterator() : row.cells().iterator();
+        String[] actual = Iterators.toArray(Iterators.transform(cells, new Function<Cell, String>()
+        {
+            public String apply(Cell cell)
+            {
+                return cell.column().name.toString();
+            }
+        }), String.class);
+
+        assert Arrays.equals(actual, expectedColumnNames)
+        : String.format("Columns [%s])] is not expected [%s]",
+                        ((row == null) ? "" : row.columns().toString()),
+                        StringUtils.join(expectedColumnNames, ","));
+    }
+
+    public static void assertColumn(CFMetaData cfm, Row row, String name, String value, long timestamp)
+    {
+        Cell cell = row.getCell(cfm.getColumnDefinition(new ColumnIdentifier(name, true)));
+        assertColumn(cell, value, timestamp);
+    }
+
+    public static void assertColumn(Cell cell, String value, long timestamp)
+    {
+        assertNotNull(cell);
+        assertEquals(0, ByteBufferUtil.compareUnsigned(cell.value(), ByteBufferUtil.bytes(value)));
+        assertEquals(timestamp, cell.timestamp());
+    }
+
+    public static void assertClustering(CFMetaData cfm, Row row, Object... clusteringValue)
+    {
+        assertEquals(row.clustering().size(), clusteringValue.length);
+        assertEquals(0, cfm.comparator.compare(row.clustering(), cfm.comparator.make(clusteringValue)));
+    }
+
+    public static PartitionerSwitcher switchPartitioner(IPartitioner p)
+    {
+        return new PartitionerSwitcher(p);
+    }
+
+    public static class PartitionerSwitcher implements AutoCloseable
+    {
+        final IPartitioner oldP;
+        final IPartitioner newP;
+
+        public PartitionerSwitcher(IPartitioner partitioner)
+        {
+            newP = partitioner;
+            oldP = StorageService.instance.setPartitionerUnsafe(partitioner);
+        }
+
+        public void close()
+        {
+            IPartitioner p = StorageService.instance.setPartitionerUnsafe(oldP);
+            assert p == newP;
+        }
     }
 
     public static void spinAssertEquals(Object expected, Supplier<Object> s, int timeoutInSeconds)
@@ -383,4 +539,113 @@
     {
         thread.join(10000);
     }
+
+    public static AssertionError runCatchingAssertionError(Runnable test)
+    {
+        try
+        {
+            test.run();
+            return null;
+        }
+        catch (AssertionError e)
+        {
+            return e;
+        }
+    }
+
+    /**
+     * Wrapper function used to run a test that can sometimes flake for uncontrollable reasons.
+     *
+     * If the given test fails on the first run, it is executed the given number of times again, expecting all secondary
+     * runs to succeed. If they do, the failure is understood as a flake and the test is treated as passing.
+     *
+     * Do not use this if the test is deterministic and its success is not influenced by external factors (such as time,
+     * selection of random seed, network failures, etc.). If the test can be made independent of such factors, it is
+     * probably preferable to do so rather than use this method.
+     *
+     * @param test The test to run.
+     * @param rerunsOnFailure How many times to re-run it if it fails. All reruns must pass.
+     * @param message Message to send to System.err on initial failure.
+     */
+    public static void flakyTest(Runnable test, int rerunsOnFailure, String message)
+    {
+        AssertionError e = runCatchingAssertionError(test);
+        if (e == null)
+            return;     // success
+        System.err.format("Test failed. %s%n"
+                        + "Re-running %d times to verify it isn't failing more often than it should.%n"
+                        + "Failure was: %s%n", message, rerunsOnFailure, e);
+        e.printStackTrace();
+
+        int rerunsFailed = 0;
+        for (int i = 0; i < rerunsOnFailure; ++i)
+        {
+            AssertionError t = runCatchingAssertionError(test);
+            if (t != null)
+            {
+                ++rerunsFailed;
+                e.addSuppressed(t);
+            }
+        }
+        if (rerunsFailed > 0)
+        {
+            System.err.format("Test failed in %d of the %d reruns.%n", rerunsFailed, rerunsOnFailure);
+            throw e;
+        }
+
+        System.err.println("All reruns succeeded. Failure treated as flake.");
+    }
+
+    // for use with Optional in tests, can be used as an argument to orElseThrow
+    public static Supplier<AssertionError> throwAssert(final String message)
+    {
+        return () -> new AssertionError(message);
+    }
+
+    public static class UnfilteredSource extends AbstractUnfilteredRowIterator implements UnfilteredRowIterator
+    {
+        Iterator<Unfiltered> content;
+
+        public UnfilteredSource(CFMetaData cfm, DecoratedKey partitionKey, Row staticRow, Iterator<Unfiltered> content)
+        {
+            super(cfm,
+                  partitionKey,
+                  DeletionTime.LIVE,
+                  cfm.partitionColumns(),
+                  staticRow != null ? staticRow : Rows.EMPTY_STATIC_ROW,
+                  false,
+                  EncodingStats.NO_STATS);
+            this.content = content;
+        }
+
+        @Override
+        protected Unfiltered computeNext()
+        {
+            return content.hasNext() ? content.next() : endOfData();
+        }
+    }
+
+    public static UnfilteredPartitionIterator executeLocally(PartitionRangeReadCommand command,
+                                                             ColumnFamilyStore cfs,
+                                                             ReadOrderGroup orderGroup)
+    {
+        return command.queryStorage(cfs, orderGroup);
+    }
+
+    public static Closeable markDirectoriesUnwriteable(ColumnFamilyStore cfs)
+    {
+        try
+        {
+            for ( ; ; )
+            {
+                DataDirectory dir = cfs.getDirectories().getWriteableLocation(1);
+                DisallowedDirectories.maybeMarkUnwritable(cfs.getDirectories().getLocationForDisk(dir));
+            }
+        }
+        catch (IOError e)
+        {
+            // Expected -- marked all directories as unwritable
+        }
+        return () -> DisallowedDirectories.clearUnwritableUnsafe();
+    }
 }

diff --git a/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java b/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java
new file mode 100644
index 0000000..0285049
--- /dev/null
+++ b/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java

@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.auth;
+
+
+import java.nio.charset.StandardCharsets;
+
+import com.google.common.collect.Iterables;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.Authenticator;
+import com.datastax.driver.core.PlainTextAuthProvider;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.exceptions.AuthenticationException;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.apache.cassandra.auth.CassandraRoleManager.*;
+import static org.apache.cassandra.auth.PasswordAuthenticator.*;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mindrot.jbcrypt.BCrypt.hashpw;
+import static org.mindrot.jbcrypt.BCrypt.gensalt;
+
+public class PasswordAuthenticatorTest extends CQLTester
+{
+
+    private static PasswordAuthenticator authenticator = new PasswordAuthenticator();
+
+    @Test
+    public void testCheckpw() throws Exception
+    {
+        // Valid and correct
+        assertTrue(checkpw(DEFAULT_SUPERUSER_PASSWORD, hashpw(DEFAULT_SUPERUSER_PASSWORD, gensalt(getGensaltLogRounds()))));
+        assertTrue(checkpw(DEFAULT_SUPERUSER_PASSWORD, hashpw(DEFAULT_SUPERUSER_PASSWORD, gensalt(4))));
+        assertTrue(checkpw(DEFAULT_SUPERUSER_PASSWORD, hashpw(DEFAULT_SUPERUSER_PASSWORD, gensalt(31))));
+
+        // Valid but incorrect hashes
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, hashpw("incorrect0", gensalt(4))));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, hashpw("incorrect1", gensalt(10))));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, hashpw("incorrect2", gensalt(31))));
+
+        // Invalid hash values, the jBCrypt library implementation
+        // throws an exception which we catch and treat as a failure
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, ""));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "0"));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD,
+                            "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"));
+
+        // Format is structurally right, but actually invalid
+        // bad salt version
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$5x$10$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+        // invalid number of rounds, multiple salt versions but it's the rounds that are incorrect
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$2$02$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$2a$02$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$2$99$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$2a$99$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+        // unpadded rounds
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$2$6$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+        assertFalse(checkpw(DEFAULT_SUPERUSER_PASSWORD, "$2a$6$abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ01234"));
+    }
+
+    @Test(expected = AuthenticationException.class)
+    public void testEmptyUsername()
+    {
+        testDecodeIllegalUserAndPwd("", "pwd");
+    }
+
+    @Test(expected = AuthenticationException.class)
+    public void testEmptyPassword()
+    {
+        testDecodeIllegalUserAndPwd("user", "");
+    }
+
+    @Test(expected = AuthenticationException.class)
+    public void testNULUsername0()
+    {
+        byte[] user = {'u', 's', PasswordAuthenticator.NUL, 'e', 'r'};
+        testDecodeIllegalUserAndPwd(new String(user, StandardCharsets.UTF_8), "pwd");
+    }
+
+    @Test(expected = AuthenticationException.class)
+    public void testNULUsername1()
+    {
+        testDecodeIllegalUserAndPwd(new String(new byte[4]), "pwd");
+    }
+
+    @Test(expected = AuthenticationException.class)
+    public void testNULPassword0()
+    {
+        byte[] pwd = {'p', 'w', PasswordAuthenticator.NUL, 'd'};
+        testDecodeIllegalUserAndPwd("user", new String(pwd, StandardCharsets.UTF_8));
+    }
+
+    @Test(expected = AuthenticationException.class)
+    public void testNULPassword1()
+    {
+        testDecodeIllegalUserAndPwd("user", new String(new byte[4]));
+    }
+
+    private void testDecodeIllegalUserAndPwd(String username, String password)
+    {
+        SaslNegotiator negotiator = authenticator.newSaslNegotiator(null);
+        Authenticator clientAuthenticator = (new PlainTextAuthProvider(username, password))
+                                            .newAuthenticator(null, null);
+
+        negotiator.evaluateResponse(clientAuthenticator.initialResponse());
+        negotiator.getAuthenticatedUser();
+    }
+
+    @BeforeClass
+    public static void setUp()
+    {
+        SchemaLoader.createKeyspace(AuthKeyspace.NAME,
+                                    KeyspaceParams.simple(1),
+                                    Iterables.toArray(AuthKeyspace.metadata().tables, CFMetaData.class));
+        authenticator.setup();
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        schemaChange("DROP KEYSPACE " + AuthKeyspace.NAME);
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/batchlog/BatchTest.java b/test/unit/org/apache/cassandra/batchlog/BatchTest.java
new file mode 100644
index 0000000..b7a4100
--- /dev/null
+++ b/test/unit/org/apache/cassandra/batchlog/BatchTest.java

@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.UUID;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.assertEquals;
+
+public class BatchTest
+{
+    private static final String KEYSPACE = "BatchRequestTest";
+    private static final String CF_STANDARD = "Standard";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF_STANDARD, 1, BytesType.instance));
+    }
+
+    @Test
+    public void testSerialization() throws IOException
+    {
+        CFMetaData cfm = Keyspace.open(KEYSPACE).getColumnFamilyStore(CF_STANDARD).metadata;
+
+        long now = FBUtilities.timestampMicros();
+        int version = MessagingService.current_version;
+        UUID uuid = UUIDGen.getTimeUUID();
+
+        List<Mutation> mutations = new ArrayList<>(10);
+        for (int i = 0; i < 10; i++)
+        {
+            mutations.add(new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), bytes(i))
+                          .clustering("name" + i)
+                          .add("val", "val" + i)
+                          .build());
+        }
+
+        Batch batch1 = Batch.createLocal(uuid, now, mutations);
+        assertEquals(uuid, batch1.id);
+        assertEquals(now, batch1.creationTime);
+        assertEquals(mutations, batch1.decodedMutations);
+
+        DataOutputBuffer out = new DataOutputBuffer();
+        Batch.serializer.serialize(batch1, out, version);
+
+        assertEquals(out.getLength(), Batch.serializer.serializedSize(batch1, version));
+
+        DataInputPlus dis = new DataInputBuffer(out.getData());
+        Batch batch2 = Batch.serializer.deserialize(dis, version);
+
+        assertEquals(batch1.id, batch2.id);
+        assertEquals(batch1.creationTime, batch2.creationTime);
+        assertEquals(batch1.decodedMutations.size(), batch2.encodedMutations.size());
+
+        Iterator<Mutation> it1 = batch1.decodedMutations.iterator();
+        Iterator<ByteBuffer> it2 = batch2.encodedMutations.iterator();
+        while (it1.hasNext())
+        {
+            try (DataInputBuffer in = new DataInputBuffer(it2.next().array()))
+            {
+                assertEquals(it1.next().toString(), Mutation.serializer.deserialize(in, version).toString());
+            }
+        }
+    }
+
+    /**
+     * This is just to test decodeMutations() when deserializing,
+     * since Batch will never be serialized at a version 2.2.
+     * @throws IOException
+     */
+    @Test
+    public void testSerializationNonCurrentVersion() throws IOException
+    {
+        CFMetaData cfm = Keyspace.open(KEYSPACE).getColumnFamilyStore(CF_STANDARD).metadata;
+
+        long now = FBUtilities.timestampMicros();
+        int version = MessagingService.VERSION_22;
+        UUID uuid = UUIDGen.getTimeUUID();
+
+        List<Mutation> mutations = new ArrayList<>(10);
+        for (int i = 0; i < 10; i++)
+        {
+            mutations.add(new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), bytes(i))
+                          .clustering("name" + i)
+                          .add("val", "val" + i)
+                          .build());
+        }
+
+        Batch batch1 = Batch.createLocal(uuid, now, mutations);
+        assertEquals(uuid, batch1.id);
+        assertEquals(now, batch1.creationTime);
+        assertEquals(mutations, batch1.decodedMutations);
+
+        DataOutputBuffer out = new DataOutputBuffer();
+        Batch.serializer.serialize(batch1, out, version);
+
+        assertEquals(out.getLength(), Batch.serializer.serializedSize(batch1, version));
+
+        DataInputPlus dis = new DataInputBuffer(out.getData());
+        Batch batch2 = Batch.serializer.deserialize(dis, version);
+
+        assertEquals(batch1.id, batch2.id);
+        assertEquals(batch1.creationTime, batch2.creationTime);
+        assertEquals(batch1.decodedMutations.size(), batch2.decodedMutations.size());
+
+        Iterator<Mutation> it1 = batch1.decodedMutations.iterator();
+        Iterator<Mutation> it2 = batch2.decodedMutations.iterator();
+        while (it1.hasNext())
+            assertEquals(it1.next().toString(), it2.next().toString());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/batchlog/BatchlogEndpointFilterTest.java b/test/unit/org/apache/cassandra/batchlog/BatchlogEndpointFilterTest.java
new file mode 100644
index 0000000..7db1cfa
--- /dev/null
+++ b/test/unit/org/apache/cassandra/batchlog/BatchlogEndpointFilterTest.java

@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+
+import com.google.common.collect.ImmutableMultimap;
+import com.google.common.collect.Multimap;
+import org.junit.Test;
+import org.junit.matchers.JUnitMatchers;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+public class BatchlogEndpointFilterTest
+{
+    private static final String LOCAL = "local";
+
+    @Test
+    public void shouldSelect2hostsFromNonLocalRacks() throws UnknownHostException
+    {
+        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
+                .put(LOCAL, InetAddress.getByName("0"))
+                .put(LOCAL, InetAddress.getByName("00"))
+                .put("1", InetAddress.getByName("1"))
+                .put("1", InetAddress.getByName("11"))
+                .put("2", InetAddress.getByName("2"))
+                .put("2", InetAddress.getByName("22"))
+                .build();
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
+        assertThat(result.size(), is(2));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("11")));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("22")));
+    }
+
+    @Test
+    public void shouldSelectHostFromLocal() throws UnknownHostException
+    {
+        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
+                .put(LOCAL, InetAddress.getByName("0"))
+                .put(LOCAL, InetAddress.getByName("00"))
+                .put("1", InetAddress.getByName("1"))
+                .build();
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
+        assertThat(result.size(), is(2));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("1")));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("0")));
+    }
+
+    @Test
+    public void shouldReturnAsIsIfNoEnoughEndpoints() throws UnknownHostException
+    {
+        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
+                .put(LOCAL, InetAddress.getByName("0"))
+                .build();
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
+        assertThat(result.size(), is(1));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("0")));
+    }
+
+    @Test
+    public void shouldSelectTwoRandomHostsFromSingleOtherRack() throws UnknownHostException
+    {
+        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
+                .put(LOCAL, InetAddress.getByName("0"))
+                .put(LOCAL, InetAddress.getByName("00"))
+                .put("1", InetAddress.getByName("1"))
+                .put("1", InetAddress.getByName("11"))
+                .put("1", InetAddress.getByName("111"))
+                .build();
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
+        // result should be the last two non-local replicas
+        // (Collections.shuffle has been replaced with Collections.reverse for testing)
+        assertThat(result.size(), is(2));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("11")));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("111")));
+    }
+
+    @Test
+    public void shouldSelectTwoRandomHostsFromSingleRack() throws UnknownHostException
+    {
+        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
+                .put(LOCAL, InetAddress.getByName("1"))
+                .put(LOCAL, InetAddress.getByName("11"))
+                .put(LOCAL, InetAddress.getByName("111"))
+                .put(LOCAL, InetAddress.getByName("1111"))
+                .build();
+        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
+        // result should be the last two non-local replicas
+        // (Collections.shuffle has been replaced with Collections.reverse for testing)
+        assertThat(result.size(), is(2));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("111")));
+        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("1111")));
+    }
+
+    private static class TestEndpointFilter extends BatchlogManager.EndpointFilter
+    {
+        TestEndpointFilter(String localRack, Multimap<String, InetAddress> endpoints)
+        {
+            super(localRack, endpoints);
+        }
+
+        @Override
+        protected boolean isValid(InetAddress input)
+        {
+            // We will use always alive non-localhost endpoints
+            return true;
+        }
+
+        @Override
+        protected int getRandomInt(int bound)
+        {
+            // We don't need random behavior here
+            return bound - 1;
+        }
+
+        @Override
+        protected void shuffle(List<?> list)
+        {
+            // We don't need random behavior here
+            Collections.reverse(list);
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java b/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java
new file mode 100644
index 0000000..dd5444f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java

@@ -0,0 +1,500 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.batchlog;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.Lists;
+
+import org.junit.*;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.Util.PartitionerSwitcher;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.commitlog.ReplayPosition;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+import static org.junit.Assert.*;
+
+public class BatchlogManagerTest
+{
+    private static final String KEYSPACE1 = "BatchlogManagerTest1";
+    private static final String CF_STANDARD1 = "Standard1";
+    private static final String CF_STANDARD2 = "Standard2";
+    private static final String CF_STANDARD3 = "Standard3";
+    private static final String CF_STANDARD4 = "Standard4";
+    private static final String CF_STANDARD5 = "Standard5";
+
+    static PartitionerSwitcher sw;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        sw = Util.switchPartitioner(Murmur3Partitioner.instance);
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1, 1, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2, 1, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD3, 1, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD4, 1, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD5, 1, BytesType.instance));
+    }
+
+    @AfterClass
+    public static void cleanup()
+    {
+        sw.close();
+    }
+
+    @Before
+    @SuppressWarnings("deprecation")
+    public void setUp() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        InetAddress localhost = InetAddress.getByName("127.0.0.1");
+        metadata.updateNormalToken(Util.token("A"), localhost);
+        metadata.updateHostId(UUIDGen.getTimeUUID(), localhost);
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).truncateBlocking();
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.LEGACY_BATCHLOG).truncateBlocking();
+    }
+
+    @Test
+    public void testDelete()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        CFMetaData cfm = cfs.metadata;
+        new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes("1234"))
+                .clustering("c")
+                .add("val", "val" + 1234)
+                .build()
+                .applyUnsafe();
+
+        DecoratedKey dk = cfs.decorateKey(ByteBufferUtil.bytes("1234"));
+        ImmutableBTreePartition results = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, dk).build());
+        Iterator<Row> iter = results.iterator();
+        assert iter.hasNext();
+
+        Mutation mutation = new Mutation(PartitionUpdate.fullPartitionDelete(cfm,
+                                                         dk,
+                                                         FBUtilities.timestampMicros(),
+                                                         FBUtilities.nowInSeconds()));
+        mutation.applyUnsafe();
+
+        Util.assertEmpty(Util.cmd(cfs, dk).build());
+    }
+
+    @Test
+    public void testReplay() throws Exception
+    {
+        testReplay(false);
+    }
+
+    @Test
+    public void testLegacyReplay() throws Exception
+    {
+        testReplay(true);
+    }
+
+    @SuppressWarnings("deprecation")
+    private static void testReplay(boolean legacy) throws Exception
+    {
+        long initialAllBatches = BatchlogManager.instance.countAllBatches();
+        long initialReplayedBatches = BatchlogManager.instance.getTotalBatchesReplayed();
+
+        CFMetaData cfm = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1).metadata;
+
+        // Generate 1000 mutations (100 batches of 10 mutations each) and put them all into the batchlog.
+        // Half batches (50) ready to be replayed, half not.
+        for (int i = 0; i < 100; i++)
+        {
+            List<Mutation> mutations = new ArrayList<>(10);
+            for (int j = 0; j < 10; j++)
+            {
+                mutations.add(new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(i))
+                              .clustering("name" + j)
+                              .add("val", "val" + j)
+                              .build());
+            }
+
+            long timestamp = i < 50
+                           ? (System.currentTimeMillis() - BatchlogManager.getBatchlogTimeout())
+                           : (System.currentTimeMillis() + BatchlogManager.getBatchlogTimeout());
+
+            if (legacy)
+                LegacyBatchlogMigrator.store(Batch.createLocal(UUIDGen.getTimeUUID(timestamp, i), timestamp * 1000, mutations), MessagingService.current_version);
+            else
+                BatchlogManager.store(Batch.createLocal(UUIDGen.getTimeUUID(timestamp, i), timestamp * 1000, mutations));
+        }
+
+        if (legacy)
+        {
+            Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.LEGACY_BATCHLOG).forceBlockingFlush();
+            LegacyBatchlogMigrator.migrate();
+        }
+
+        // Flush the batchlog to disk (see CASSANDRA-6822).
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+
+        assertEquals(100, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+        assertEquals(0, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
+
+        // Force batchlog replay and wait for it to complete.
+        BatchlogManager.instance.startBatchlogReplay().get();
+
+        // Ensure that the first half, and only the first half, got replayed.
+        assertEquals(50, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+        assertEquals(50, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
+
+        for (int i = 0; i < 100; i++)
+        {
+            String query = String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD1, i);
+            UntypedResultSet result = executeInternal(query);
+            assertNotNull(result);
+            if (i < 50)
+            {
+                Iterator<UntypedResultSet.Row> it = result.iterator();
+                assertNotNull(it);
+                for (int j = 0; j < 10; j++)
+                {
+                    assertTrue(it.hasNext());
+                    UntypedResultSet.Row row = it.next();
+
+                    assertEquals(ByteBufferUtil.bytes(i), row.getBytes("key"));
+                    assertEquals("name" + j, row.getString("name"));
+                    assertEquals("val" + j, row.getString("val"));
+                }
+
+                assertFalse(it.hasNext());
+            }
+            else
+            {
+                assertTrue(result.isEmpty());
+            }
+        }
+
+        // Ensure that no stray mutations got somehow applied.
+        UntypedResultSet result = executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", KEYSPACE1, CF_STANDARD1));
+        assertNotNull(result);
+        assertEquals(500, result.one().getLong("count"));
+    }
+
+    @Test
+    public void testTruncatedReplay() throws InterruptedException, ExecutionException
+    {
+        CFMetaData cf2 = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD2);
+        CFMetaData cf3 = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD3);
+        // Generate 2000 mutations (1000 batchlog entries) and put them all into the batchlog.
+        // Each batchlog entry with a mutation for Standard2 and Standard3.
+        // In the middle of the process, 'truncate' Standard2.
+        for (int i = 0; i < 1000; i++)
+        {
+            Mutation mutation1 = new RowUpdateBuilder(cf2, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(i))
+                .clustering("name" + i)
+                .add("val", "val" + i)
+                .build();
+            Mutation mutation2 = new RowUpdateBuilder(cf3, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(i))
+                .clustering("name" + i)
+                .add("val", "val" + i)
+                .build();
+
+            List<Mutation> mutations = Lists.newArrayList(mutation1, mutation2);
+
+            // Make sure it's ready to be replayed, so adjust the timestamp.
+            long timestamp = System.currentTimeMillis() - BatchlogManager.getBatchlogTimeout();
+
+            if (i == 500)
+                SystemKeyspace.saveTruncationRecord(Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2),
+                                                    timestamp,
+                                                    ReplayPosition.NONE);
+
+            // Adjust the timestamp (slightly) to make the test deterministic.
+            if (i >= 500)
+                timestamp++;
+            else
+                timestamp--;
+
+            BatchlogManager.store(Batch.createLocal(UUIDGen.getTimeUUID(timestamp, i), FBUtilities.timestampMicros(), mutations));
+        }
+
+        // Flush the batchlog to disk (see CASSANDRA-6822).
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+
+        // Force batchlog replay and wait for it to complete.
+        BatchlogManager.instance.startBatchlogReplay().get();
+
+        // We should see half of Standard2-targeted mutations written after the replay and all of Standard3 mutations applied.
+        for (int i = 0; i < 1000; i++)
+        {
+            UntypedResultSet result = executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD2,i));
+            assertNotNull(result);
+            if (i >= 500)
+            {
+                assertEquals(ByteBufferUtil.bytes(i), result.one().getBytes("key"));
+                assertEquals("name" + i, result.one().getString("name"));
+                assertEquals("val" + i, result.one().getString("val"));
+            }
+            else
+            {
+                assertTrue(result.isEmpty());
+            }
+        }
+
+        for (int i = 0; i < 1000; i++)
+        {
+            UntypedResultSet result = executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD3, i));
+            assertNotNull(result);
+            assertEquals(ByteBufferUtil.bytes(i), result.one().getBytes("key"));
+            assertEquals("name" + i, result.one().getString("name"));
+            assertEquals("val" + i, result.one().getString("val"));
+        }
+    }
+
+    @Test
+    @SuppressWarnings("deprecation")
+    public void testConversion() throws Exception
+    {
+        long initialAllBatches = BatchlogManager.instance.countAllBatches();
+        long initialReplayedBatches = BatchlogManager.instance.getTotalBatchesReplayed();
+        CFMetaData cfm = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD4);
+
+        // Generate 1400 version 2.0 mutations and put them all into the batchlog.
+        // Half ready to be replayed, half not.
+        for (int i = 0; i < 1400; i++)
+        {
+            Mutation mutation = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(i))
+                .clustering("name" + i)
+                .add("val", "val" + i)
+                .build();
+
+            long timestamp = i < 700
+                           ? (System.currentTimeMillis() - BatchlogManager.getBatchlogTimeout())
+                           : (System.currentTimeMillis() + BatchlogManager.getBatchlogTimeout());
+
+
+            Mutation batchMutation = LegacyBatchlogMigrator.getStoreMutation(Batch.createLocal(UUIDGen.getTimeUUID(timestamp, i),
+                                                                                               TimeUnit.MILLISECONDS.toMicros(timestamp),
+                                                                                               Collections.singleton(mutation)),
+                                                                             MessagingService.VERSION_20);
+            assertTrue(LegacyBatchlogMigrator.isLegacyBatchlogMutation(batchMutation));
+            LegacyBatchlogMigrator.handleLegacyMutation(batchMutation);
+        }
+
+        // Mix in 100 current version mutations, 50 ready for replay.
+        for (int i = 1400; i < 1500; i++)
+        {
+            Mutation mutation = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(i))
+                .clustering("name" + i)
+                .add("val", "val" + i)
+                .build();
+
+            long timestamp = i < 1450
+                           ? (System.currentTimeMillis() - BatchlogManager.getBatchlogTimeout())
+                           : (System.currentTimeMillis() + BatchlogManager.getBatchlogTimeout());
+
+
+            BatchlogManager.store(Batch.createLocal(UUIDGen.getTimeUUID(timestamp, i),
+                                                    FBUtilities.timestampMicros(),
+                                                    Collections.singleton(mutation)));
+        }
+
+        // Flush the batchlog to disk (see CASSANDRA-6822).
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+
+        assertEquals(1500, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+        assertEquals(0, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
+
+        UntypedResultSet result = executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", SystemKeyspace.NAME, SystemKeyspace.LEGACY_BATCHLOG));
+        assertNotNull(result);
+        assertEquals("Count in blog legacy", 0, result.one().getLong("count"));
+        result = executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", SystemKeyspace.NAME, SystemKeyspace.BATCHES));
+        assertNotNull(result);
+        assertEquals("Count in blog", 1500, result.one().getLong("count"));
+
+        // Force batchlog replay and wait for it to complete.
+        BatchlogManager.instance.performInitialReplay();
+
+        // Ensure that the first half, and only the first half, got replayed.
+        assertEquals(750, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+        assertEquals(750, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
+
+        for (int i = 0; i < 1500; i++)
+        {
+            result = executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD4, i));
+            assertNotNull(result);
+            if (i < 700 || i >= 1400 && i < 1450)
+            {
+                assertEquals(ByteBufferUtil.bytes(i), result.one().getBytes("key"));
+                assertEquals("name" + i, result.one().getString("name"));
+                assertEquals("val" + i, result.one().getString("val"));
+            }
+            else
+            {
+                assertTrue("Present at " + i, result.isEmpty());
+            }
+        }
+
+        // Ensure that no stray mutations got somehow applied.
+        result = executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", KEYSPACE1, CF_STANDARD4));
+        assertNotNull(result);
+        assertEquals(750, result.one().getLong("count"));
+
+        // Ensure batchlog is left as expected.
+        result = executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", SystemKeyspace.NAME, SystemKeyspace.BATCHES));
+        assertNotNull(result);
+        assertEquals("Count in blog after initial replay", 750, result.one().getLong("count"));
+        result = executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", SystemKeyspace.NAME, SystemKeyspace.LEGACY_BATCHLOG));
+        assertNotNull(result);
+        assertEquals("Count in blog legacy after initial replay ", 0, result.one().getLong("count"));
+    }
+
+    @Test
+    public void testAddBatch() throws IOException
+    {
+        long initialAllBatches = BatchlogManager.instance.countAllBatches();
+        CFMetaData cfm = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD5).metadata;
+
+        long timestamp = (System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2) * 1000;
+        UUID uuid = UUIDGen.getTimeUUID();
+
+        // Add a batch with 10 mutations
+        List<Mutation> mutations = new ArrayList<>(10);
+        for (int j = 0; j < 10; j++)
+        {
+            mutations.add(new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(j))
+                          .clustering("name" + j)
+                          .add("val", "val" + j)
+                          .build());
+        }
+
+
+        BatchlogManager.store(Batch.createLocal(uuid, timestamp, mutations));
+        Assert.assertEquals(initialAllBatches + 1, BatchlogManager.instance.countAllBatches());
+
+        String query = String.format("SELECT count(*) FROM %s.%s where id = %s",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.BATCHES,
+                                     uuid);
+        UntypedResultSet result = executeInternal(query);
+        assertNotNull(result);
+        assertEquals(1L, result.one().getLong("count"));
+    }
+
+    @Test
+    public void testRemoveBatch()
+    {
+        long initialAllBatches = BatchlogManager.instance.countAllBatches();
+        CFMetaData cfm = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD5).metadata;
+
+        long timestamp = (System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2) * 1000;
+        UUID uuid = UUIDGen.getTimeUUID();
+
+        // Add a batch with 10 mutations
+        List<Mutation> mutations = new ArrayList<>(10);
+        for (int j = 0; j < 10; j++)
+        {
+            mutations.add(new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(j))
+                          .clustering("name" + j)
+                          .add("val", "val" + j)
+                          .build());
+        }
+
+        // Store the batch
+        BatchlogManager.store(Batch.createLocal(uuid, timestamp, mutations));
+        Assert.assertEquals(initialAllBatches + 1, BatchlogManager.instance.countAllBatches());
+
+        // Remove the batch
+        BatchlogManager.remove(uuid);
+
+        assertEquals(initialAllBatches, BatchlogManager.instance.countAllBatches());
+
+        String query = String.format("SELECT count(*) FROM %s.%s where id = %s",
+                                     SystemKeyspace.NAME,
+                                     SystemKeyspace.BATCHES,
+                                     uuid);
+        UntypedResultSet result = executeInternal(query);
+        assertNotNull(result);
+        assertEquals(0L, result.one().getLong("count"));
+    }
+
+    // CASSANRDA-9223
+    @Test
+    public void testReplayWithNoPeers() throws Exception
+    {
+        StorageService.instance.getTokenMetadata().removeEndpoint(InetAddress.getByName("127.0.0.1"));
+
+        long initialAllBatches = BatchlogManager.instance.countAllBatches();
+        long initialReplayedBatches = BatchlogManager.instance.getTotalBatchesReplayed();
+
+        CFMetaData cfm = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1).metadata;
+
+        long timestamp = (System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2) * 1000;
+        UUID uuid = UUIDGen.getTimeUUID();
+
+        // Add a batch with 10 mutations
+        List<Mutation> mutations = new ArrayList<>(10);
+        for (int j = 0; j < 10; j++)
+        {
+            mutations.add(new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), ByteBufferUtil.bytes(j))
+                          .clustering("name" + j)
+                          .add("val", "val" + j)
+                          .build());
+        }
+        BatchlogManager.store(Batch.createLocal(uuid, timestamp, mutations));
+        assertEquals(1, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+
+        // Flush the batchlog to disk (see CASSANDRA-6822).
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+
+        assertEquals(1, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+        assertEquals(0, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
+
+        // Force batchlog replay and wait for it to complete.
+        BatchlogManager.instance.startBatchlogReplay().get();
+
+        // Replay should be cancelled as there are no peers in the ring.
+        assertEquals(1, BatchlogManager.instance.countAllBatches() - initialAllBatches);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
index 475e436..0c7e8a5 100644
--- a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
+++ b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java

@@ -17,6 +17,10 @@
  */
 package org.apache.cassandra.cache;
 
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.Assert;
 import org.junit.BeforeClass;
@@ -24,13 +28,8 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -44,9 +43,11 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+                                    KeyspaceParams.simple(1),
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD1)
+                                                      .addPartitionKey("pKey", AsciiType.instance)
+                                                      .addRegularColumn("col1", AsciiType.instance)
+                                                      .build());
     }
 
     @Test
@@ -55,16 +56,17 @@
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
         for (int i = 0; i < 2; i++)
         {
-            Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key1"));
-            rm.add(CF_STANDARD1, Util.cellname("c1"), ByteBufferUtil.bytes(i), 0);
-            rm.applyUnsafe();
+            ColumnDefinition colDef = ColumnDefinition.regularDef(cfs.metadata, ByteBufferUtil.bytes("col1"), AsciiType.instance);
+            RowUpdateBuilder rowBuilder = new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), "key1");
+            rowBuilder.add(colDef, "val1");
+            rowBuilder.build().apply();
             cfs.forceBlockingFlush();
         }
 
-        Assert.assertEquals(2, cfs.getSSTables().size());
+        Assert.assertEquals(2, cfs.getLiveSSTables().size());
 
         // preheat key cache
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             sstable.getPosition(Util.dk("key1"), SSTableReader.Operator.EQ);
 
         AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = CacheService.instance.keyCache;
@@ -77,7 +79,7 @@
 
         // then load saved
         keyCache.loadSavedAsync().get();
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             Assert.assertNotNull(keyCache.get(new KeyCacheKey(cfs.metadata.ksAndCFName, sstable.descriptor, ByteBufferUtil.bytes("key1"))));
     }
 }

diff --git a/test/unit/org/apache/cassandra/cache/CacheProviderTest.java b/test/unit/org/apache/cassandra/cache/CacheProviderTest.java
index bfcfa59..a4173d6 100644
--- a/test/unit/org/apache/cassandra/cache/CacheProviderTest.java
+++ b/test/unit/org/apache/cassandra/cache/CacheProviderTest.java

@@ -1,5 +1,4 @@
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -16,30 +15,36 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- *
  */
 package org.apache.cassandra.cache;
 
-
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.BeforeClass;
 import org.junit.Test;
+import static org.junit.Assert.*;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.utils.Pair;
 
 import com.googlecode.concurrentlinkedhashmap.Weighers;
 
-import static org.apache.cassandra.Util.column;
-import static org.junit.Assert.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
 
 public class CacheProviderTest
 {
@@ -52,59 +57,85 @@
     private static final String KEYSPACE1 = "CacheProviderTest1";
     private static final String CF_STANDARD1 = "Standard1";
 
+    private static CFMetaData cfm;
+
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
+
+        cfm = CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD1)
+                                        .addPartitionKey("pKey", AsciiType.instance)
+                                        .addRegularColumn("col1", AsciiType.instance)
+                                        .build();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+                                    KeyspaceParams.simple(1),
+                                    cfm);
     }
 
-    private void simpleCase(ColumnFamily cf, ICache<MeasureableString, IRowCacheEntry> cache)
+    private CachedBTreePartition createPartition()
     {
-        cache.put(key1, cf);
+        PartitionUpdate update = new RowUpdateBuilder(cfm, System.currentTimeMillis(), "key1")
+                                 .add("col1", "val1")
+                                 .buildUpdate();
+
+        return CachedBTreePartition.create(update.unfilteredIterator(), FBUtilities.nowInSeconds());
+    }
+
+    private void simpleCase(CachedBTreePartition partition, ICache<MeasureableString, IRowCacheEntry> cache)
+    {
+        cache.put(key1, partition);
         assertNotNull(cache.get(key1));
 
-        assertDigests(cache.get(key1), cf);
-        cache.put(key2, cf);
-        cache.put(key3, cf);
-        cache.put(key4, cf);
-        cache.put(key5, cf);
+        assertDigests(cache.get(key1), partition);
+        cache.put(key2, partition);
+        cache.put(key3, partition);
+        cache.put(key4, partition);
+        cache.put(key5, partition);
 
         assertEquals(CAPACITY, cache.size());
     }
 
-    private void assertDigests(IRowCacheEntry one, ColumnFamily two)
+    private void assertDigests(IRowCacheEntry one, CachedBTreePartition two)
     {
-        // CF does not implement .equals
-        assertTrue(one instanceof ColumnFamily);
-        assertEquals(ColumnFamily.digest((ColumnFamily)one), ColumnFamily.digest(two));
+        assertTrue(one instanceof CachedBTreePartition);
+        try
+        {
+            MessageDigest d1 = MessageDigest.getInstance("MD5");
+            MessageDigest d2 = MessageDigest.getInstance("MD5");
+            UnfilteredRowIterators.digest(null, ((CachedBTreePartition) one).unfilteredIterator(), d1, MessagingService.current_version);
+            UnfilteredRowIterators.digest(null, ((CachedBTreePartition) two).unfilteredIterator(), d2, MessagingService.current_version);
+            assertTrue(MessageDigest.isEqual(d1.digest(), d2.digest()));
+        }
+        catch (NoSuchAlgorithmException e)
+        {
+            throw new RuntimeException(e);
+        }
     }
 
-    // TODO this isn't terribly useful
-    private void concurrentCase(final ColumnFamily cf, final ICache<MeasureableString, IRowCacheEntry> cache) throws InterruptedException
+    private void concurrentCase(final CachedBTreePartition partition, final ICache<MeasureableString, IRowCacheEntry> cache) throws InterruptedException
     {
-        Runnable runable = new Runnable()
+        final long startTime = System.currentTimeMillis() + 500;
+        Runnable runnable = new Runnable()
         {
             public void run()
             {
-                for (int j = 0; j < 10; j++)
+                while (System.currentTimeMillis() < startTime) {}
+                for (int j = 0; j < 1000; j++)
                 {
-                    cache.put(key1, cf);
-                    cache.put(key2, cf);
-                    cache.put(key3, cf);
-                    cache.put(key4, cf);
-                    cache.put(key5, cf);
+                    cache.put(key1, partition);
+                    cache.put(key2, partition);
+                    cache.put(key3, partition);
+                    cache.put(key4, partition);
+                    cache.put(key5, partition);
                 }
             }
         };
 
-        List<Thread> threads = new ArrayList<Thread>(100);
+        List<Thread> threads = new ArrayList<>(100);
         for (int i = 0; i < 100; i++)
         {
-            Thread thread = new Thread(runable);
+            Thread thread = new Thread(runnable);
             threads.add(thread);
             thread.start();
         }
@@ -112,21 +143,13 @@
             thread.join();
     }
 
-    private ColumnFamily createCF()
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        cf.addColumn(column("vijay", "great", 1));
-        cf.addColumn(column("awesome", "vijay", 1));
-        return cf;
-    }
-
     @Test
     public void testSerializingCache() throws InterruptedException
     {
         ICache<MeasureableString, IRowCacheEntry> cache = SerializingCache.create(CAPACITY, Weighers.<RefCountedMemory>singleton(), new SerializingCacheProvider.RowCacheSerializer());
-        ColumnFamily cf = createCF();
-        simpleCase(cf, cache);
-        concurrentCase(cf, cache);
+        CachedBTreePartition partition = createPartition();
+        simpleCase(partition, cache);
+        concurrentCase(partition, cache);
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/concurrent/SEPExecutorTest.java b/test/unit/org/apache/cassandra/concurrent/SEPExecutorTest.java
index 0d61ad8..8526dd0 100644
--- a/test/unit/org/apache/cassandra/concurrent/SEPExecutorTest.java
+++ b/test/unit/org/apache/cassandra/concurrent/SEPExecutorTest.java

@@ -47,19 +47,13 @@
         OutputStream nullOutputStream = new OutputStream() {
             public void write(int b) { }
         };
-        final PrintStream nullPrintSteam = new PrintStream(nullOutputStream);
+        PrintStream nullPrintSteam = new PrintStream(nullOutputStream);
 
         for (int idx = 0; idx < 20; idx++)
         {
-            final ExecutorService es = sharedPool.newExecutor(FBUtilities.getAvailableProcessors(), Integer.MAX_VALUE, "STAGE", run + MAGIC + idx);
+            ExecutorService es = sharedPool.newExecutor(FBUtilities.getAvailableProcessors(), Integer.MAX_VALUE, "STAGE", run + MAGIC + idx);
             // Write to black hole
-            es.execute(new Runnable()
-            {
-                public void run()
-                {
-                    nullPrintSteam.println("TEST" + es);
-                }
-            });
+            es.execute(() -> nullPrintSteam.println("TEST" + es));
         }
 
         // shutdown does not guarantee that threads are actually dead once it exits, only that they will stop promptly afterwards

diff --git a/test/unit/org/apache/cassandra/config/CFMetaDataTest.java b/test/unit/org/apache/cassandra/config/CFMetaDataTest.java
new file mode 100644
index 0000000..9d91df3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/config/CFMetaDataTest.java

@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.config;
+
+import java.util.*;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.schema.Types;
+import org.apache.cassandra.thrift.CfDef;
+import org.apache.cassandra.thrift.ColumnDef;
+import org.apache.cassandra.thrift.IndexType;
+import org.apache.cassandra.thrift.ThriftConversion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class CFMetaDataTest
+{
+    private static final String KEYSPACE1 = "CFMetaDataTest1";
+    private static final String CF_STANDARD1 = "Standard1";
+
+    private static List<ColumnDef> columnDefs = new ArrayList<ColumnDef>();
+
+    static
+    {
+        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col1"), AsciiType.class.getCanonicalName())
+                                    .setIndex_name("col1Index")
+                                    .setIndex_type(IndexType.KEYS));
+
+        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col2"), UTF8Type.class.getCanonicalName())
+                                    .setIndex_name("col2Index")
+                                    .setIndex_type(IndexType.KEYS));
+
+        Map<String, String> customIndexOptions = new HashMap<>();
+        customIndexOptions.put("option1", "value1");
+        customIndexOptions.put("option2", "value2");
+        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col3"), Int32Type.class.getCanonicalName())
+                                    .setIndex_name("col3Index")
+                                    .setIndex_type(IndexType.CUSTOM)
+                                    .setIndex_options(customIndexOptions));
+    }
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+    }
+
+    @Test
+    public void testThriftConversion() throws Exception
+    {
+        CfDef cfDef = new CfDef().setDefault_validation_class(AsciiType.class.getCanonicalName())
+                                 .setComment("Test comment")
+                                 .setColumn_metadata(columnDefs)
+                                 .setKeyspace(KEYSPACE1)
+                                 .setName(CF_STANDARD1);
+
+        // convert Thrift to CFMetaData
+        CFMetaData cfMetaData = ThriftConversion.fromThrift(cfDef);
+
+        CfDef thriftCfDef = new CfDef();
+        thriftCfDef.keyspace = KEYSPACE1;
+        thriftCfDef.name = CF_STANDARD1;
+        thriftCfDef.default_validation_class = cfDef.default_validation_class;
+        thriftCfDef.comment = cfDef.comment;
+        thriftCfDef.column_metadata = new ArrayList<>();
+        for (ColumnDef columnDef : columnDefs)
+        {
+            ColumnDef c = new ColumnDef();
+            c.name = ByteBufferUtil.clone(columnDef.name);
+            c.validation_class = columnDef.getValidation_class();
+            c.index_name = columnDef.getIndex_name();
+            c.index_type = columnDef.getIndex_type();
+            if (columnDef.isSetIndex_options())
+                c.setIndex_options(columnDef.getIndex_options());
+            thriftCfDef.column_metadata.add(c);
+        }
+
+        CfDef converted = ThriftConversion.toThrift(cfMetaData);
+
+        assertEquals(thriftCfDef.keyspace, converted.keyspace);
+        assertEquals(thriftCfDef.name, converted.name);
+        assertEquals(thriftCfDef.default_validation_class, converted.default_validation_class);
+        assertEquals(thriftCfDef.comment, converted.comment);
+        assertEquals(new HashSet<>(thriftCfDef.column_metadata), new HashSet<>(converted.column_metadata));
+    }
+
+    @Test
+    public void testConversionsInverses() throws Exception
+    {
+        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        {
+            for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+            {
+                CFMetaData cfm = cfs.metadata;
+                if (!cfm.isThriftCompatible())
+                    continue;
+
+                checkInverses(cfm);
+
+                // Testing with compression to catch #3558
+                CFMetaData withCompression = cfm.copy();
+                withCompression.compression(CompressionParams.snappy(32768));
+                checkInverses(withCompression);
+            }
+        }
+    }
+
+    private void checkInverses(CFMetaData cfm) throws Exception
+    {
+        KeyspaceMetadata keyspace = Schema.instance.getKSMetaData(cfm.ksName);
+
+        // Test thrift conversion
+        CFMetaData before = cfm;
+        CFMetaData after = ThriftConversion.fromThriftForUpdate(ThriftConversion.toThrift(before), before);
+        assert before.equals(after) : String.format("%n%s%n!=%n%s", before, after);
+
+        // Test schema conversion
+        Mutation rm = SchemaKeyspace.makeCreateTableMutation(keyspace, cfm, FBUtilities.timestampMicros());
+        PartitionUpdate cfU = rm.getPartitionUpdate(Schema.instance.getId(SchemaKeyspace.NAME, SchemaKeyspace.TABLES));
+        PartitionUpdate cdU = rm.getPartitionUpdate(Schema.instance.getId(SchemaKeyspace.NAME, SchemaKeyspace.COLUMNS));
+
+        UntypedResultSet.Row tableRow = QueryProcessor.resultify(String.format("SELECT * FROM %s.%s", SchemaKeyspace.NAME, SchemaKeyspace.TABLES),
+                                                                 UnfilteredRowIterators.filter(cfU.unfilteredIterator(), FBUtilities.nowInSeconds()))
+                                                      .one();
+        TableParams params = SchemaKeyspace.createTableParamsFromRow(tableRow);
+
+        UntypedResultSet columnsRows = QueryProcessor.resultify(String.format("SELECT * FROM %s.%s", SchemaKeyspace.NAME, SchemaKeyspace.COLUMNS),
+                                                                UnfilteredRowIterators.filter(cdU.unfilteredIterator(), FBUtilities.nowInSeconds()));
+        Set<ColumnDefinition> columns = new HashSet<>();
+        for (UntypedResultSet.Row row : columnsRows)
+            columns.add(SchemaKeyspace.createColumnFromRow(row, Types.none()));
+
+        assertEquals(cfm.params, params);
+        assertEquals(new HashSet<>(cfm.allColumns()), columns);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java b/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java
index 2bee0c3..933d231 100644
--- a/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java
+++ b/test/unit/org/apache/cassandra/config/ColumnDefinitionTest.java

@@ -23,7 +23,6 @@
 import org.junit.Assert;
 import org.junit.Test;
 
-import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.thrift.ThriftConversion;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -33,12 +32,14 @@
     @Test
     public void testSerializeDeserialize() throws Exception
     {
-        CFMetaData cfm = CFMetaData.denseCFMetaData("ks", "cf", UTF8Type.instance);
+        CFMetaData cfm = CFMetaData.Builder.create("ks", "cf", true, false, false)
+                         .addPartitionKey("pkey", AsciiType.instance)
+                         .addClusteringColumn("name", AsciiType.instance)
+                         .addRegularColumn("val", AsciiType.instance)
+                         .build();
 
-        ColumnDefinition cd0 = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("TestColumnDefinitionName0"), BytesType.instance, null)
-                                               .setIndex("random index name 0", IndexType.KEYS, null);
-
-        ColumnDefinition cd1 = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("TestColumnDefinition1"), LongType.instance, null);
+        ColumnDefinition cd0 = ColumnDefinition.staticDef(cfm, ByteBufferUtil.bytes("TestColumnDefinitionName0"), BytesType.instance);
+        ColumnDefinition cd1 = ColumnDefinition.staticDef(cfm, ByteBufferUtil.bytes("TestColumnDefinition1"), LongType.instance);
 
         testSerializeDeserialize(cfm, cd0);
         testSerializeDeserialize(cfm, cd1);
@@ -46,7 +47,7 @@
 
     protected void testSerializeDeserialize(CFMetaData cfm, ColumnDefinition cd) throws Exception
     {
-        ColumnDefinition newCd = ThriftConversion.fromThrift(cfm.ksName, cfm.cfName, cfm.comparator.asAbstractType(), null, ThriftConversion.toThrift(cd));
+        ColumnDefinition newCd = ThriftConversion.fromThrift(cfm.ksName, cfm.cfName, cfm.comparator.subtype(0), null, ThriftConversion.toThrift(cfm, cd));
         Assert.assertNotSame(cd, newCd);
         Assert.assertEquals(cd.hashCode(), newCd.hashCode());
         Assert.assertEquals(cd, newCd);

diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
index 7409535..4a43388 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java

@@ -25,8 +25,7 @@
 import java.net.NetworkInterface;
 import java.util.Enumeration;
 
-import junit.framework.Assert;
-
+import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -36,13 +35,15 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.MigrationManager;
 import org.apache.cassandra.thrift.ThriftConversion;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class DatabaseDescriptorTest
@@ -59,7 +60,7 @@
         // test serialization of all defined test CFs.
         for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
         {
-            for (CFMetaData cfm : Schema.instance.getKeyspaceMetaData(keyspaceName).values())
+            for (CFMetaData cfm : Schema.instance.getTablesAndViews(keyspaceName))
             {
                 CFMetaData cfmDupe = ThriftConversion.fromThrift(ThriftConversion.toThrift(cfm));
                 assertNotNull(cfmDupe);
@@ -74,8 +75,8 @@
         for (String ks : Schema.instance.getNonSystemKeyspaces())
         {
             // Not testing round-trip on the KsDef via serDe() because maps
-            KSMetaData ksm = Schema.instance.getKSMetaData(ks);
-            KSMetaData ksmDupe = ThriftConversion.fromThrift(ThriftConversion.toThrift(ksm));
+            KeyspaceMetadata ksm = Schema.instance.getKSMetaData(ks);
+            KeyspaceMetadata ksmDupe = ThriftConversion.fromThrift(ThriftConversion.toThrift(ksm));
             assertNotNull(ksmDupe);
             assertEquals(ksm, ksmDupe);
         }
@@ -95,14 +96,14 @@
         try
         {
             // add a few.
-            MigrationManager.announceNewKeyspace(KSMetaData.testMetadata("ks0", SimpleStrategy.class, KSMetaData.optsWithRF(3)));
-            MigrationManager.announceNewKeyspace(KSMetaData.testMetadata("ks1", SimpleStrategy.class, KSMetaData.optsWithRF(3)));
+            MigrationManager.announceNewKeyspace(KeyspaceMetadata.create("ks0", KeyspaceParams.simple(3)));
+            MigrationManager.announceNewKeyspace(KeyspaceMetadata.create("ks1", KeyspaceParams.simple(3)));
 
             assertNotNull(Schema.instance.getKSMetaData("ks0"));
             assertNotNull(Schema.instance.getKSMetaData("ks1"));
 
-            Schema.instance.clearKeyspaceDefinition(Schema.instance.getKSMetaData("ks0"));
-            Schema.instance.clearKeyspaceDefinition(Schema.instance.getKSMetaData("ks1"));
+            Schema.instance.clearKeyspaceMetadata(Schema.instance.getKSMetaData("ks0"));
+            Schema.instance.clearKeyspaceMetadata(Schema.instance.getKSMetaData("ks1"));
 
             assertNull(Schema.instance.getKSMetaData("ks0"));
             assertNull(Schema.instance.getKSMetaData("ks1"));
@@ -139,7 +140,7 @@
         public Config loadConfig() throws ConfigurationException
         {
             Config testConfig = new Config();
-            testConfig.cluster_name = "ConfigurationLoader Test";;
+            testConfig.cluster_name = "ConfigurationLoader Test";
             return testConfig;
         }
     }
@@ -272,4 +273,39 @@
         DatabaseDescriptor.applyAddressConfig(testConfig);
 
     }
+
+    @Test
+    public void testRepairSessionSizeToggles()
+    {
+        int previousDepth = DatabaseDescriptor.getRepairSessionMaxTreeDepth();
+        try
+        {
+            Assert.assertEquals(18, DatabaseDescriptor.getRepairSessionMaxTreeDepth());
+            DatabaseDescriptor.setRepairSessionMaxTreeDepth(10);
+            Assert.assertEquals(10, DatabaseDescriptor.getRepairSessionMaxTreeDepth());
+
+            try
+            {
+                DatabaseDescriptor.setRepairSessionMaxTreeDepth(9);
+                fail("Should have received a ConfigurationException for depth of 9");
+            }
+            catch (ConfigurationException ignored) { }
+            Assert.assertEquals(10, DatabaseDescriptor.getRepairSessionMaxTreeDepth());
+
+            try
+            {
+                DatabaseDescriptor.setRepairSessionMaxTreeDepth(-20);
+                fail("Should have received a ConfigurationException for depth of -20");
+            }
+            catch (ConfigurationException ignored) { }
+            Assert.assertEquals(10, DatabaseDescriptor.getRepairSessionMaxTreeDepth());
+
+            DatabaseDescriptor.setRepairSessionMaxTreeDepth(22);
+            Assert.assertEquals(22, DatabaseDescriptor.getRepairSessionMaxTreeDepth());
+        }
+        finally
+        {
+            DatabaseDescriptor.setRepairSessionMaxTreeDepth(previousDepth);
+        }
+    }
 }

diff --git a/test/unit/org/apache/cassandra/config/KSMetaDataTest.java b/test/unit/org/apache/cassandra/config/KSMetaDataTest.java
deleted file mode 100644
index 39be913..0000000
--- a/test/unit/org/apache/cassandra/config/KSMetaDataTest.java
+++ /dev/null

@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.cassandra.config;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.cassandra.locator.SimpleStrategy;
-
-import org.junit.Test;
-
-import static org.junit.Assert.assertTrue;
-
-public class KSMetaDataTest
-{
-    @Test
-    public void testToStringHasStrategyOptions() throws Exception
-    {
-        Map<String, String> options = new HashMap<String, String>();
-        options.put("key1", "value1");
-        options.put("key2", "value2");
-        options.put("key3", "value3");
-
-        KSMetaData ksMeta = new KSMetaData("test", SimpleStrategy.class, options, true);
-
-        assertTrue(ksMeta.toString().contains(options.toString()));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/config/LegacySchemaTablesTest.java b/test/unit/org/apache/cassandra/config/LegacySchemaTablesTest.java
deleted file mode 100644
index f630c88..0000000
--- a/test/unit/org/apache/cassandra/config/LegacySchemaTablesTest.java
+++ /dev/null

@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.config;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.HashMap;
-import java.util.HashSet;
-
-import com.google.common.collect.Iterables;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.AsciiType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.compress.*;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.schema.LegacySchemaTables;
-import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.CfDef;
-import org.apache.cassandra.thrift.ColumnDef;
-import org.apache.cassandra.thrift.IndexType;
-import org.apache.cassandra.thrift.ThriftConversion;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-public class LegacySchemaTablesTest
-{
-    private static final String KEYSPACE1 = "CFMetaDataTest1";
-    private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_STANDARD2 = "Standard2";
-
-    private static List<ColumnDef> columnDefs = new ArrayList<ColumnDef>();
-
-    static
-    {
-        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col1"), AsciiType.class.getCanonicalName())
-                                    .setIndex_name("col1Index")
-                                    .setIndex_type(IndexType.KEYS));
-
-        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col2"), UTF8Type.class.getCanonicalName())
-                                    .setIndex_name("col2Index")
-                                    .setIndex_type(IndexType.KEYS));
-    }
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-    }
-
-    @Test
-    public void testIsDenseRecalculation()
-    {
-        // 1.a start with a dense CF
-        CfDef cfDef0 = new CfDef().setDefault_validation_class(BytesType.class.getCanonicalName())
-                                  .setComparator_type(UTF8Type.class.getCanonicalName())
-                                  .setColumn_metadata(Collections.<ColumnDef>emptyList())
-                                  .setKeyspace(KEYSPACE1)
-                                  .setName(CF_STANDARD2);
-        CFMetaData cfm0 = ThriftConversion.fromThrift(cfDef0);
-        MigrationManager.announceNewColumnFamily(cfm0, true);
-
-        // 1.b validate that the cf is dense, has a single compact value and a clustering column, and no regulars
-        CFMetaData current = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD2);
-        assertTrue(current.getIsDense());
-        assertNotNull(current.compactValueColumn());
-        assertEquals(0, Iterables.size(current.regularAndStaticColumns()));
-        assertEquals(1, current.clusteringColumns().size());
-
-        // 2.a add a column to the table
-        CfDef cfDef1 = ThriftConversion.toThrift(current);
-        List<ColumnDef> colDefs =
-            Collections.singletonList(new ColumnDef(ByteBufferUtil.bytes("col1"), AsciiType.class.getCanonicalName()));
-        cfDef1.setColumn_metadata(colDefs);
-        CFMetaData cfm1 = ThriftConversion.fromThriftForUpdate(cfDef1, current);
-        MigrationManager.announceColumnFamilyUpdate(cfm1, true);
-
-        // 2.b validate that the cf is sparse now, had no compact value column or clustering column, and 1 regular
-        current = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD2);
-        assertFalse(current.getIsDense());
-        assertNull(current.compactValueColumn());
-        assertEquals(1, Iterables.size(current.regularAndStaticColumns()));
-        assertEquals(0, current.clusteringColumns().size());
-
-        // 3.a remove the column
-        CfDef cfDef2 = ThriftConversion.toThrift(current);
-        cfDef2.setColumn_metadata(Collections.<ColumnDef>emptyList());
-        CFMetaData cfm2 = ThriftConversion.fromThriftForUpdate(cfDef2, current);
-        MigrationManager.announceColumnFamilyUpdate(cfm2, true);
-
-        // 3.b validate that the cf is dense, has a single compact value and a clustering column, and no regulars
-        current = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD2);
-        assertTrue(current.getIsDense());
-        assertNotNull(current.compactValueColumn());
-        assertEquals(0, Iterables.size(current.regularAndStaticColumns()));
-        assertEquals(1, current.clusteringColumns().size());
-    }
-
-    @Test
-    public void testThriftConversion() throws Exception
-    {
-        CfDef cfDef = new CfDef().setDefault_validation_class(AsciiType.class.getCanonicalName())
-                                 .setComment("Test comment")
-                                 .setColumn_metadata(columnDefs)
-                                 .setKeyspace(KEYSPACE1)
-                                 .setName(CF_STANDARD1);
-
-        // convert Thrift to CFMetaData
-        CFMetaData cfMetaData = ThriftConversion.fromThrift(cfDef);
-
-        CfDef thriftCfDef = new CfDef();
-        thriftCfDef.keyspace = KEYSPACE1;
-        thriftCfDef.name = CF_STANDARD1;
-        thriftCfDef.default_validation_class = cfDef.default_validation_class;
-        thriftCfDef.comment = cfDef.comment;
-        thriftCfDef.column_metadata = new ArrayList<>();
-        for (ColumnDef columnDef : columnDefs)
-        {
-            ColumnDef c = new ColumnDef();
-            c.name = ByteBufferUtil.clone(columnDef.name);
-            c.validation_class = columnDef.getValidation_class();
-            c.index_name = columnDef.getIndex_name();
-            c.index_type = IndexType.KEYS;
-            thriftCfDef.column_metadata.add(c);
-        }
-
-        CfDef converted = ThriftConversion.toThrift(cfMetaData);
-
-        assertEquals(thriftCfDef.keyspace, converted.keyspace);
-        assertEquals(thriftCfDef.name, converted.name);
-        assertEquals(thriftCfDef.default_validation_class, converted.default_validation_class);
-        assertEquals(thriftCfDef.comment, converted.comment);
-        assertEquals(new HashSet<>(thriftCfDef.column_metadata), new HashSet<>(converted.column_metadata));
-    }
-
-    @Test
-    public void testConversionsInverses() throws Exception
-    {
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
-        {
-            for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
-            {
-                CFMetaData cfm = cfs.metadata;
-                if (!cfm.isThriftCompatible())
-                    continue;
-
-                checkInverses(cfm);
-
-                // Testing with compression to catch #3558
-                CFMetaData withCompression = cfm.copy();
-                withCompression.compressionParameters(new CompressionParameters(SnappyCompressor.instance, 32768, new HashMap<String, String>()));
-                checkInverses(withCompression);
-            }
-        }
-    }
-
-    private void checkInverses(CFMetaData cfm) throws Exception
-    {
-        DecoratedKey k = StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(cfm.ksName));
-        KSMetaData keyspace = Schema.instance.getKSMetaData(cfm.ksName);
-
-        // Test thrift conversion
-        CFMetaData before = cfm;
-        CFMetaData after = ThriftConversion.fromThriftForUpdate(ThriftConversion.toThrift(before), before);
-        assert before.equals(after) : String.format("%n%s%n!=%n%s", before, after);
-
-        // Test schema conversion
-        Mutation rm = LegacySchemaTables.makeCreateTableMutation(keyspace, cfm, FBUtilities.timestampMicros());
-        ColumnFamily serializedCf = rm.getColumnFamily(Schema.instance.getId(SystemKeyspace.NAME, LegacySchemaTables.COLUMNFAMILIES));
-        ColumnFamily serializedCD = rm.getColumnFamily(Schema.instance.getId(SystemKeyspace.NAME, LegacySchemaTables.COLUMNS));
-        CFMetaData newCfm = LegacySchemaTables.createTableFromTablePartitionAndColumnsPartition(new Row(k, serializedCf), new Row(k, serializedCD));
-        assert cfm.equals(newCfm) : String.format("%n%s%n!=%n%s", cfm, newCfm);
-    }
-}

diff --git a/test/unit/org/apache/cassandra/cql3/BatchTests.java b/test/unit/org/apache/cassandra/cql3/BatchTests.java
index 95fe612..260db4e 100644
--- a/test/unit/org/apache/cassandra/cql3/BatchTests.java
+++ b/test/unit/org/apache/cassandra/cql3/BatchTests.java

@@ -30,16 +30,16 @@
 
 import java.io.IOException;
 
-public class BatchTests
+public class BatchTests extends  CQLTester
 {
     private static EmbeddedCassandraService cassandra;
 
     private static Cluster cluster;
     private static Session session;
 
-
     private static PreparedStatement counter;
     private static PreparedStatement noncounter;
+    private static PreparedStatement clustering;
 
     @BeforeClass()
     public static void setup() throws ConfigurationException, IOException
@@ -60,65 +60,92 @@
                 "  id int PRIMARY KEY,\n" +
                 "  val counter,\n" +
                 ");");
+        session.execute("CREATE TABLE junit.clustering (\n" +
+                "  id int,\n" +
+                "  clustering1 int,\n" +
+                "  clustering2 int,\n" +
+                "  clustering3 int,\n" +
+                "  val text, \n" +
+                " PRIMARY KEY(id, clustering1, clustering2, clustering3)" +
+                ");");
 
 
         noncounter = session.prepare("insert into junit.noncounter(id, val)values(?,?)");
         counter = session.prepare("update junit.counter set val = val + ? where id = ?");
+        clustering = session.prepare("insert into junit.clustering(id, clustering1, clustering2, clustering3, val) values(?,?,?,?,?)");
     }
 
     @Test(expected = InvalidQueryException.class)
     public void testMixedInCounterBatch()
     {
-       sendBatch(BatchStatement.Type.COUNTER, true, true);
+       sendBatch(BatchStatement.Type.COUNTER, true, true, false);
     }
 
     @Test(expected = InvalidQueryException.class)
     public void testMixedInLoggedBatch()
     {
-        sendBatch(BatchStatement.Type.LOGGED, true, true);
+        sendBatch(BatchStatement.Type.LOGGED, true, true, false);
     }
 
     @Test(expected = InvalidQueryException.class)
     public void testMixedInUnLoggedBatch()
     {
-        sendBatch(BatchStatement.Type.UNLOGGED, true, true);
+        sendBatch(BatchStatement.Type.UNLOGGED, true, true, false);
     }
 
     @Test(expected = InvalidQueryException.class)
     public void testNonCounterInCounterBatch()
     {
-        sendBatch(BatchStatement.Type.COUNTER, false, true);
+        sendBatch(BatchStatement.Type.COUNTER, false, true, false);
     }
 
     @Test
     public void testNonCounterInLoggedBatch()
     {
-        sendBatch(BatchStatement.Type.LOGGED, false, true);
+        sendBatch(BatchStatement.Type.LOGGED, false, true, false);
     }
 
     @Test
     public void testNonCounterInUnLoggedBatch()
     {
-        sendBatch(BatchStatement.Type.UNLOGGED, false, true);
+        sendBatch(BatchStatement.Type.UNLOGGED, false, true, false);
     }
 
     @Test
     public void testCounterInCounterBatch()
     {
-        sendBatch(BatchStatement.Type.COUNTER, true, false);
+        sendBatch(BatchStatement.Type.COUNTER, true, false, false);
     }
 
     @Test
     public void testCounterInUnLoggedBatch()
     {
-        sendBatch(BatchStatement.Type.UNLOGGED, true, false);
+        sendBatch(BatchStatement.Type.UNLOGGED, true, false, false);
     }
 
+    @Test
+    public void testTableWithClusteringInLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.LOGGED, false, false, true);
+    }
+
+    @Test
+    public void testTableWithClusteringInUnLoggedBatch()
+    {
+        sendBatch(BatchStatement.Type.UNLOGGED, false, false, true);
+    }
+
+    @Test
+    public void testEmptyBatch()
+    {
+        session.execute("BEGIN BATCH APPLY BATCH");
+        session.execute("BEGIN UNLOGGED BATCH APPLY BATCH");
+    }
 
     @Test(expected = InvalidQueryException.class)
     public void testCounterInLoggedBatch()
     {
-        sendBatch(BatchStatement.Type.LOGGED, true, false);
+        sendBatch(BatchStatement.Type.LOGGED, true, false, false);
     }
 
     @Test(expected = InvalidQueryException.class)
@@ -133,12 +160,10 @@
         session.execute(b);
     }
 
-
-
-    public void sendBatch(BatchStatement.Type type, boolean addCounter, boolean addNonCounter)
+    public void sendBatch(BatchStatement.Type type, boolean addCounter, boolean addNonCounter, boolean addClustering)
     {
 
-        assert addCounter || addNonCounter;
+        assert addCounter || addNonCounter || addClustering;
         BatchStatement b = new BatchStatement(type);
 
         for (int i = 0; i < 10; i++)
@@ -148,6 +173,11 @@
 
             if (addCounter)
                 b.add(counter.bind((long)i, i));
+
+            if (addClustering)
+            {
+                b.add(clustering.bind(i, i, i, i, "foo"));
+            }
         }
 
         session.execute(b);

diff --git a/test/unit/org/apache/cassandra/cql3/CQL3TypeLiteralTest.java b/test/unit/org/apache/cassandra/cql3/CQL3TypeLiteralTest.java
new file mode 100644
index 0000000..02ed1a8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CQL3TypeLiteralTest.java

@@ -0,0 +1,752 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.regex.Pattern;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.serializers.*;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test functionality to re-create a CQL literal from its serialized representation.
+ * This test uses some randomness to generate the values and nested structures (collections,tuples,UDTs).
+ */
+public class CQL3TypeLiteralTest
+{
+    private static final Pattern QUOTE = Pattern.compile("'");
+
+    /**
+     * Container holding the expected CQL literal for a type and serialized value.
+     * The CQL literal is generated independently from the code in {@link CQL3Type}.
+     */
+    static class Value
+    {
+        final String expected;
+        final CQL3Type cql3Type;
+        final ByteBuffer value;
+
+        Value(String expected, CQL3Type cql3Type, ByteBuffer value)
+        {
+            this.expected = expected;
+            this.cql3Type = cql3Type;
+            this.value = value;
+        }
+    }
+
+    static final Map<CQL3Type.Native, List<Value>> nativeTypeValues = new EnumMap<>(CQL3Type.Native.class);
+
+    static void addNativeValue(String expected, CQL3Type.Native cql3Type, ByteBuffer value)
+    {
+        List<Value> l = nativeTypeValues.get(cql3Type);
+        if (l == null)
+            nativeTypeValues.put(cql3Type, l = new ArrayList<>());
+        l.add(new Value(expected, cql3Type, value));
+    }
+
+    static
+    {
+        // Add some (random) values for each native type.
+        // Also adds null values and empty values, if the type allows this.
+
+        for (int i = 0; i < 20; i++)
+        {
+            String v = randString(true);
+            addNativeValue(quote(v), CQL3Type.Native.ASCII, AsciiSerializer.instance.serialize(v));
+        }
+        addNativeValue("''", CQL3Type.Native.ASCII, AsciiSerializer.instance.serialize(""));
+        addNativeValue("''", CQL3Type.Native.ASCII, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.ASCII, null);
+
+        for (int i = 0; i < 20; i++)
+        {
+            String v = randString(false);
+            addNativeValue(quote(v), CQL3Type.Native.TEXT, UTF8Serializer.instance.serialize(v));
+        }
+        addNativeValue("''", CQL3Type.Native.TEXT, UTF8Serializer.instance.serialize(""));
+        addNativeValue("''", CQL3Type.Native.TEXT, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.TEXT, null);
+
+        for (int i = 0; i < 20; i++)
+        {
+            String v = randString(false);
+            addNativeValue(quote(v), CQL3Type.Native.VARCHAR, UTF8Serializer.instance.serialize(v));
+        }
+        addNativeValue("''", CQL3Type.Native.VARCHAR, UTF8Serializer.instance.serialize(""));
+        addNativeValue("''", CQL3Type.Native.VARCHAR, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.VARCHAR, null);
+
+        addNativeValue("0", CQL3Type.Native.BIGINT, LongType.instance.decompose(0L));
+        for (int i = 0; i < 20; i++)
+        {
+            long v = randLong();
+            addNativeValue(Long.toString(v), CQL3Type.Native.BIGINT, LongType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.BIGINT, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.BIGINT, null);
+
+        addNativeValue("0", CQL3Type.Native.COUNTER, LongType.instance.decompose(0L));
+        for (int i = 0; i < 20; i++)
+        {
+            long v = randLong();
+            addNativeValue(Long.toString(v), CQL3Type.Native.COUNTER, LongType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.COUNTER, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.COUNTER, null);
+
+        addNativeValue("0", CQL3Type.Native.INT, Int32Type.instance.decompose(0));
+        for (int i = 0; i < 20; i++)
+        {
+            int v = randInt();
+            addNativeValue(Integer.toString(v), CQL3Type.Native.INT, Int32Type.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.INT, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.INT, null);
+
+        addNativeValue("0", CQL3Type.Native.SMALLINT, ShortType.instance.decompose((short) 0));
+        for (int i = 0; i < 20; i++)
+        {
+            short v = randShort();
+            addNativeValue(Short.toString(v), CQL3Type.Native.SMALLINT, ShortType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.SMALLINT, null);
+
+        addNativeValue("0", CQL3Type.Native.TINYINT, ByteType.instance.decompose((byte) 0));
+        for (int i = 0; i < 20; i++)
+        {
+            byte v = randByte();
+            addNativeValue(Short.toString(v), CQL3Type.Native.TINYINT, ByteType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.TINYINT, null);
+
+        addNativeValue("0.0", CQL3Type.Native.FLOAT, FloatType.instance.decompose((float) 0));
+        for (int i = 0; i < 20; i++)
+        {
+            float v = randFloat();
+            addNativeValue(Float.toString(v), CQL3Type.Native.FLOAT, FloatType.instance.decompose(v));
+        }
+        addNativeValue("NaN", CQL3Type.Native.FLOAT, FloatType.instance.decompose(Float.NaN));
+        addNativeValue("null", CQL3Type.Native.FLOAT, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.FLOAT, null);
+
+        addNativeValue("0.0", CQL3Type.Native.DOUBLE, DoubleType.instance.decompose((double) 0));
+        for (int i = 0; i < 20; i++)
+        {
+            double v = randDouble();
+            addNativeValue(Double.toString(v), CQL3Type.Native.DOUBLE, DoubleType.instance.decompose(v));
+        }
+        addNativeValue("NaN", CQL3Type.Native.DOUBLE, DoubleType.instance.decompose(Double.NaN));
+        addNativeValue("null", CQL3Type.Native.DOUBLE, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.DOUBLE, null);
+
+        addNativeValue("0", CQL3Type.Native.DECIMAL, DecimalType.instance.decompose(BigDecimal.ZERO));
+        for (int i = 0; i < 20; i++)
+        {
+            BigDecimal v = BigDecimal.valueOf(randDouble());
+            addNativeValue(v.toString(), CQL3Type.Native.DECIMAL, DecimalType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.DECIMAL, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.DECIMAL, null);
+
+        addNativeValue("0", CQL3Type.Native.VARINT, IntegerType.instance.decompose(BigInteger.ZERO));
+        for (int i = 0; i < 20; i++)
+        {
+            BigInteger v = BigInteger.valueOf(randLong());
+            addNativeValue(v.toString(), CQL3Type.Native.VARINT, IntegerType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.VARINT, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.VARINT, null);
+
+        // boolean doesn't have that many possible values...
+        addNativeValue("false", CQL3Type.Native.BOOLEAN, BooleanType.instance.decompose(false));
+        addNativeValue("true", CQL3Type.Native.BOOLEAN, BooleanType.instance.decompose(true));
+        addNativeValue("null", CQL3Type.Native.BOOLEAN, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.BOOLEAN, null);
+
+        // (mostly generates date values with surreal values like in year 14273)
+        for (int i = 0; i < 20; i++)
+        {
+            int v = randInt();
+            addNativeValue(SimpleDateSerializer.instance.toString(v), CQL3Type.Native.DATE, SimpleDateSerializer.instance.serialize(v));
+        }
+        addNativeValue("null", CQL3Type.Native.DATE, null);
+
+        for (int i = 0; i < 100; i++)
+        {
+            long v = randLong(24L * 60 * 60 * 1000 * 1000 * 1000);
+            addNativeValue(TimeSerializer.instance.toString(v), CQL3Type.Native.TIME, TimeSerializer.instance.serialize(v));
+        }
+        addNativeValue("null", CQL3Type.Native.TIME, null);
+
+        // (mostly generates timestamp values with surreal values like in year 14273)
+        for (int i = 0; i < 20; i++)
+        {
+            long v = randLong();
+            addNativeValue(TimestampSerializer.instance.toStringUTC(new Date(v)), CQL3Type.Native.TIMESTAMP, TimestampType.instance.fromString(Long.toString(v)));
+        }
+        addNativeValue("null", CQL3Type.Native.TIMESTAMP, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.TIMESTAMP, null);
+
+        for (int i = 0; i < 20; i++)
+        {
+            UUID v = UUIDGen.getTimeUUID(randLong(System.currentTimeMillis()));
+            addNativeValue(v.toString(), CQL3Type.Native.TIMEUUID, TimeUUIDType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.TIMEUUID, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.TIMEUUID, null);
+
+        for (int i = 0; i < 20; i++)
+        {
+            UUID v = UUID.randomUUID();
+            addNativeValue(v.toString(), CQL3Type.Native.UUID, UUIDType.instance.decompose(v));
+        }
+        addNativeValue("null", CQL3Type.Native.UUID, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.UUID, null);
+
+        for (int i = 0; i < 20; i++)
+        {
+            ByteBuffer v = randBytes();
+            addNativeValue("0x" + BytesSerializer.instance.toString(v), CQL3Type.Native.BLOB, BytesType.instance.decompose(v));
+        }
+        addNativeValue("0x", CQL3Type.Native.BLOB, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.BLOB, null);
+
+        for (int i = 0; i < 20; i++)
+        {
+            InetAddress v;
+            try
+            {
+                v = InetAddress.getByAddress(new byte[]{ randByte(), randByte(), randByte(), randByte() });
+            }
+            catch (UnknownHostException e)
+            {
+                throw new RuntimeException(e);
+            }
+            addNativeValue(v.getHostAddress(), CQL3Type.Native.INET, InetAddressSerializer.instance.serialize(v));
+        }
+        addNativeValue("null", CQL3Type.Native.INET, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        addNativeValue("null", CQL3Type.Native.INET, null);
+    }
+
+    @Test
+    public void testNative()
+    {
+        // test each native type against each supported protocol version (although it doesn't make sense to
+        // iterate through all protocol versions as of C* 3.0).
+
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
+        {
+            for (Map.Entry<CQL3Type.Native, List<Value>> entry : nativeTypeValues.entrySet())
+            {
+                for (Value value : entry.getValue())
+                {
+                    compareCqlLiteral(version, value);
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testCollectionWithNatives()
+    {
+        // test 100 collections with varying element/key/value types against each supported protocol version,
+        // type of collection is randomly chosen
+
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
+        {
+            for (int n = 0; n < 100; n++)
+            {
+                Value value = generateCollectionValue(version, randomCollectionType(0), true);
+                compareCqlLiteral(version, value);
+            }
+        }
+    }
+
+    @Test
+    public void testCollectionNullAndEmpty()
+    {
+        // An empty collection is one with a size of 0 (note that rely on the fact that protocol version < 3 are not
+        // supported anymore and so the size of a collection is always on 4 bytes).
+        ByteBuffer emptyCollection = ByteBufferUtil.bytes(0);
+
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
+        {
+            for (boolean frozen : Arrays.asList(true, false))
+            {
+                // empty
+                Value value = new Value("[]", ListType.getInstance(UTF8Type.instance, frozen).asCQL3Type(), emptyCollection);
+                compareCqlLiteral(version, value);
+                value = new Value("{}", SetType.getInstance(UTF8Type.instance, frozen).asCQL3Type(), emptyCollection);
+                compareCqlLiteral(version, value);
+                value = new Value("{}", MapType.getInstance(UTF8Type.instance, UTF8Type.instance, frozen).asCQL3Type(), emptyCollection);
+                compareCqlLiteral(version, value);
+
+                // null
+                value = new Value("null", ListType.getInstance(UTF8Type.instance, frozen).asCQL3Type(), null);
+                compareCqlLiteral(version, value);
+                value = new Value("null", SetType.getInstance(UTF8Type.instance, frozen).asCQL3Type(), null);
+                compareCqlLiteral(version, value);
+                value = new Value("null", MapType.getInstance(UTF8Type.instance, UTF8Type.instance, frozen).asCQL3Type(), null);
+                compareCqlLiteral(version, value);
+            }
+        }
+    }
+
+    @Test
+    public void testTupleWithNatives()
+    {
+        // test 100 tuples with varying element/key/value types against each supported protocol version
+
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
+        {
+            for (int n = 0; n < 100; n++)
+            {
+                Value value = generateTupleValue(version, randomTupleType(0), true);
+                compareCqlLiteral(version, value);
+            }
+        }
+    }
+
+    @Test
+    public void testUserDefinedWithNatives()
+    {
+        // test 100 UDTs with varying element/key/value types against each supported protocol version
+
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
+        {
+            for (int n = 0; n < 100; n++)
+            {
+                Value value = generateUserDefinedValue(version, randomUserType(0), true);
+                compareCqlLiteral(version, value);
+            }
+        }
+    }
+
+    @Test
+    public void testNested()
+    {
+        // This is the "nice" part of this unit test - it tests (probably) nested type structures
+        // like 'tuple<map, list<user>, tuple, user>' or 'map<tuple<int, text>, set<inet>>' with
+        // random types  against each supported protocol version.
+
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
+        {
+            for (int n = 0; n < 100; n++)
+            {
+                Value value = randomNested(version);
+                compareCqlLiteral(version, value);
+            }
+        }
+    }
+
+    static void compareCqlLiteral(int version, Value value)
+    {
+        ByteBuffer buffer = value.value != null ? value.value.duplicate() : null;
+        String msg = "Failed to get expected value for type " + value.cql3Type + " / " + value.cql3Type.getType() + " with protocol-version " + version + " expected:\"" + value.expected + '"';
+        try
+        {
+            assertEquals(msg,
+                         value.expected,
+                         value.cql3Type.toCQLLiteral(buffer, version));
+        }
+        catch (RuntimeException e)
+        {
+            throw new RuntimeException(msg, e);
+        }
+    }
+
+    static Value randomNested(int version)
+    {
+        AbstractType type = randomNestedType(2);
+
+        return generateAnyValue(version, type.asCQL3Type());
+    }
+
+    /**
+     * Generates type of randomly nested type structures.
+     */
+    static AbstractType randomNestedType(int level)
+    {
+        if (level == 0)
+            return randomNativeType();
+        switch (randInt(level == 2 ? 3 : 4))
+        {
+            case 0:
+                return randomCollectionType(level - 1);
+            case 1:
+                return randomTupleType(level - 1);
+            case 2:
+                return randomUserType(level - 1);
+            case 3:
+                return randomNativeType();
+        }
+        throw new AssertionError();
+    }
+
+    static Value generateCollectionValue(int version, CollectionType collectionType, boolean allowNull)
+    {
+        StringBuilder expected = new StringBuilder();
+        ByteBuffer buffer;
+
+        if (allowNull && randBool(0.05d))
+        {
+            expected.append("null");
+            buffer = null;
+        }
+        else
+        {
+            int size = randInt(20);
+
+            CQL3Type elements;
+            CQL3Type values = null;
+            char bracketOpen;
+            char bracketClose;
+            switch (collectionType.kind)
+            {
+                case LIST:
+                    elements = ((ListType) collectionType).getElementsType().asCQL3Type();
+                    bracketOpen = '[';
+                    bracketClose = ']';
+                    break;
+                case SET:
+                    elements = ((SetType) collectionType).getElementsType().asCQL3Type();
+                    bracketOpen = '{';
+                    bracketClose = '}';
+                    break;
+                case MAP:
+                    elements = ((MapType) collectionType).getKeysType().asCQL3Type();
+                    values = ((MapType) collectionType).getValuesType().asCQL3Type();
+                    bracketOpen = '{';
+                    bracketClose = '}';
+                    break;
+                default:
+                    throw new AssertionError();
+            }
+
+            expected.append(bracketOpen);
+            Collection<ByteBuffer> buffers = new ArrayList<>();
+            Set<ByteBuffer> added = new HashSet<>();
+            for (int i = 0; i < size; i++)
+            {
+                Value el = generateAnyValue(version, elements);
+                if (!added.add(el.value))
+                    continue;
+
+                buffers.add(el.value.duplicate());
+                if (expected.length() > 1)
+                    expected.append(", ");
+                expected.append(el.cql3Type.toCQLLiteral(el.value, version));
+
+                if (collectionType.kind == CollectionType.Kind.MAP)
+                {
+                    // add map value
+                    el = generateAnyValue(version, values);
+                    buffers.add(el.value.duplicate());
+                    expected.append(": ");
+                    expected.append(el.cql3Type.toCQLLiteral(el.value, version));
+                }
+            }
+            expected.append(bracketClose);
+            buffer = CollectionSerializer.pack(buffers, added.size(), version);
+        }
+
+        return new Value(expected.toString(), collectionType.asCQL3Type(), buffer);
+    }
+
+    /**
+     * Generates a value for any type or type structure.
+     */
+    static Value generateAnyValue(int version, CQL3Type type)
+    {
+        if (type instanceof CQL3Type.Native)
+            return generateNativeValue(type, false);
+        if (type instanceof CQL3Type.Tuple)
+            return generateTupleValue(version, (TupleType) type.getType(), false);
+        if (type instanceof CQL3Type.UserDefined)
+            return generateUserDefinedValue(version, (UserType) type.getType(), false);
+        if (type instanceof CQL3Type.Collection)
+            return generateCollectionValue(version, (CollectionType) type.getType(), false);
+        throw new AssertionError();
+    }
+
+    static Value generateTupleValue(int version, TupleType tupleType, boolean allowNull)
+    {
+        StringBuilder expected = new StringBuilder();
+        ByteBuffer buffer;
+
+        if (allowNull && randBool(0.05d))
+        {
+            // generate 'null' collection
+            expected.append("null");
+            buffer = null;
+        }
+        else
+        {
+            expected.append('(');
+
+            // # of fields in this value
+            int fields = tupleType.size();
+            if (randBool(0.2d))
+                fields = randInt(fields);
+
+            ByteBuffer[] buffers = new ByteBuffer[fields];
+            for (int i = 0; i < fields; i++)
+            {
+                AbstractType<?> fieldType = tupleType.type(i);
+
+                if (i > 0)
+                    expected.append(", ");
+
+                if (allowNull && randBool(.1))
+                {
+                    expected.append("null");
+                    continue;
+                }
+
+                Value value = generateAnyValue(version, fieldType.asCQL3Type());
+                expected.append(value.expected);
+                buffers[i] = value.value.duplicate();
+            }
+            expected.append(')');
+            buffer = TupleType.buildValue(buffers);
+        }
+
+        return new Value(expected.toString(), tupleType.asCQL3Type(), buffer);
+    }
+
+    static Value generateUserDefinedValue(int version, UserType userType, boolean allowNull)
+    {
+        StringBuilder expected = new StringBuilder();
+        ByteBuffer buffer;
+
+        if (allowNull && randBool(0.05d))
+        {
+            // generate 'null' collection
+            expected.append("null");
+            buffer = null;
+        }
+        else
+        {
+            expected.append('{');
+
+            // # of fields in this value
+            int fields = userType.size();
+            if (randBool(0.2d))
+                fields = randInt(fields);
+
+            ByteBuffer[] buffers = new ByteBuffer[fields];
+            for (int i = 0; i < fields; i++)
+            {
+                AbstractType<?> fieldType = userType.type(i);
+
+                if (i > 0)
+                    expected.append(", ");
+
+                expected.append(ColumnIdentifier.maybeQuote(userType.fieldNameAsString(i)));
+                expected.append(": ");
+
+                if (randBool(.1))
+                {
+                    expected.append("null");
+                    continue;
+                }
+
+                Value value = generateAnyValue(version, fieldType.asCQL3Type());
+                expected.append(value.expected);
+                buffers[i] = value.value.duplicate();
+            }
+            expected.append('}');
+            buffer = TupleType.buildValue(buffers);
+        }
+
+        return new Value(expected.toString(), userType.asCQL3Type(), buffer);
+    }
+
+    static Value generateNativeValue(CQL3Type type, boolean allowNull)
+    {
+        List<Value> values = nativeTypeValues.get(type);
+        assert values != null : type.toString() + " needs to be defined";
+        while (true)
+        {
+            Value v = values.get(randInt(values.size()));
+            if (allowNull || v.value != null)
+                return v;
+        }
+    }
+
+    static CollectionType randomCollectionType(int level)
+    {
+        CollectionType.Kind kind = CollectionType.Kind.values()[randInt(CollectionType.Kind.values().length)];
+        switch (kind)
+        {
+            case LIST:
+            case SET:
+                return ListType.getInstance(randomNestedType(level), randBool());
+            case MAP:
+                return MapType.getInstance(randomNestedType(level), randomNestedType(level), randBool());
+        }
+        throw new AssertionError();
+    }
+
+    static TupleType randomTupleType(int level)
+    {
+        int typeCount = 2 + randInt(5);
+        List<AbstractType<?>> types = new ArrayList<>();
+        for (int i = 0; i < typeCount; i++)
+            types.add(randomNestedType(level));
+        return new TupleType(types);
+    }
+
+    static UserType randomUserType(int level)
+    {
+        int typeCount = 2 + randInt(5);
+        List<ByteBuffer> names = new ArrayList<>();
+        List<AbstractType<?>> types = new ArrayList<>();
+        for (int i = 0; i < typeCount; i++)
+        {
+            names.add(UTF8Type.instance.fromString('f' + randLetters(i)));
+            types.add(randomNestedType(level));
+        }
+        return new UserType("ks", UTF8Type.instance.fromString("u" + randInt(1000000)), names, types);
+    }
+
+    //
+    // Following methods are just helper methods. Mostly to generate many kinds of random values.
+    //
+
+    private static String randLetters(int len)
+    {
+        StringBuilder sb = new StringBuilder(len);
+        while (len-- > 0)
+        {
+            int i = randInt(52);
+            if (i < 26)
+                sb.append((char) ('A' + i));
+            else
+                sb.append((char) ('a' + i - 26));
+        }
+        return sb.toString();
+    }
+
+    static AbstractType randomNativeType()
+    {
+        while (true)
+        {
+            CQL3Type.Native t = CQL3Type.Native.values()[randInt(CQL3Type.Native.values().length)];
+            if (t != CQL3Type.Native.EMPTY)
+                return t.getType();
+        }
+    }
+
+    static boolean randBool()
+    {
+        return randBool(0.5d);
+    }
+
+    static boolean randBool(double probability)
+    {
+        return ThreadLocalRandom.current().nextDouble() < probability;
+    }
+
+    static long randLong()
+    {
+        return ThreadLocalRandom.current().nextLong();
+    }
+
+    static long randLong(long max)
+    {
+        return ThreadLocalRandom.current().nextLong(max);
+    }
+
+    static int randInt()
+    {
+        return ThreadLocalRandom.current().nextInt();
+    }
+
+    static int randInt(int max)
+    {
+        return ThreadLocalRandom.current().nextInt(max);
+    }
+
+    static short randShort()
+    {
+        return (short) ThreadLocalRandom.current().nextInt();
+    }
+
+    static byte randByte()
+    {
+        return (byte) ThreadLocalRandom.current().nextInt();
+    }
+
+    static double randDouble()
+    {
+        return ThreadLocalRandom.current().nextDouble();
+    }
+
+    static float randFloat()
+    {
+        return ThreadLocalRandom.current().nextFloat();
+    }
+
+    static String randString(boolean ascii)
+    {
+        int l = randInt(20);
+        StringBuilder sb = new StringBuilder(l);
+        for (int i = 0; i < l; i++)
+        {
+            if (randBool(.05))
+                sb.append('\'');
+            else
+            {
+                char c = (char) (ascii ? randInt(128) : randShort());
+                sb.append(c);
+            }
+        }
+        return UTF8Serializer.instance.deserialize(UTF8Serializer.instance.serialize(sb.toString()));
+    }
+
+    static ByteBuffer randBytes()
+    {
+        int l = randInt(20);
+        byte[] v = new byte[l];
+        for (int i = 0; i < l; i++)
+        {
+            v[i] = randByte();
+        }
+        return ByteBuffer.wrap(v);
+    }
+
+    private static String quote(String v)
+    {
+        return '\'' + QUOTE.matcher(v).replaceAll("''") + '\'';
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 416a4b2..95366c2 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java

@@ -18,6 +18,7 @@
 package org.apache.cassandra.cql3;
 
 import java.io.File;
+import java.io.IOException;
 import java.math.BigDecimal;
 import java.math.BigInteger;
 import java.net.InetAddress;
@@ -28,43 +29,47 @@
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
 
 import com.google.common.base.Objects;
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import org.junit.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static junit.framework.Assert.assertNotNull;
-
 import com.datastax.driver.core.*;
 import com.datastax.driver.core.ResultSet;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.metrics.ClientMetrics;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.functions.FunctionName;
+import org.apache.cassandra.cql3.functions.ThreadAwareSecurityManager;
 import org.apache.cassandra.cql3.statements.ParsedStatement;
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.db.marshal.TupleType;
-import org.apache.cassandra.exceptions.CassandraException;
+import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.transport.ConfiguredLimit;
 import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.assertNotNull;
 
 /**
  * Base class for CQL tests.
@@ -76,37 +81,41 @@
     public static final String KEYSPACE = "cql_test_keyspace";
     public static final String KEYSPACE_PER_TEST = "cql_test_keyspace_alt";
     protected static final boolean USE_PREPARED_VALUES = Boolean.valueOf(System.getProperty("cassandra.test.use_prepared", "true"));
+    protected static final boolean REUSE_PREPARED = Boolean.valueOf(System.getProperty("cassandra.test.reuse_prepared", "true"));
     protected static final long ROW_CACHE_SIZE_IN_MB = Integer.valueOf(System.getProperty("cassandra.test.row_cache_size_in_mb", "0"));
     private static final AtomicInteger seqNumber = new AtomicInteger();
-    protected static final ByteBuffer TOO_BIG = ByteBuffer.allocate(1024 * 65);
+    protected static final ByteBuffer TOO_BIG = ByteBuffer.allocate(FBUtilities.MAX_UNSIGNED_SHORT + 1024);
 
     private static org.apache.cassandra.transport.Server server;
     protected static final int nativePort;
     protected static final InetAddress nativeAddr;
-    private static final Cluster[] cluster;
-    private static final Session[] session;
+    protected static ConfiguredLimit protocolVersionLimit;
+    private static final Map<Integer, Cluster> clusters = new HashMap<>();
+    private static final Map<Integer, Session> sessions = new HashMap<>();
 
-    public static int maxProtocolVersion;
-    static {
-        int version;
-        for (version = 1; version <= Server.CURRENT_VERSION; )
+    private static boolean isServerPrepared = false;
+
+    public static final List<Integer> PROTOCOL_VERSIONS;
+    static
+    {
+        // The latest versions might not be supported yet by the java driver
+        ImmutableList.Builder<Integer> builder = ImmutableList.builder();
+        for (int version = Server.MIN_SUPPORTED_VERSION; version <= Server.CURRENT_VERSION; version++)
         {
             try
             {
-                ProtocolVersion.fromInt(++version);
+                ProtocolVersion.fromInt(version);
+                builder.add(version);
             }
             catch (IllegalArgumentException e)
             {
-                version--;
                 break;
             }
         }
-        maxProtocolVersion = version;
-        cluster = new Cluster[maxProtocolVersion];
-        session = new Session[maxProtocolVersion];
+        PROTOCOL_VERSIONS = builder.build();
 
         // Once per-JVM is enough
-        SchemaLoader.prepareServer();
+        prepareServer();
 
         nativeAddr = InetAddress.getLoopbackAddress();
 
@@ -126,6 +135,7 @@
 
     public static ResultMessage lastSchemaChangeResult;
 
+    private List<String> keyspaces = new ArrayList<>();
     private List<String> tables = new ArrayList<>();
     private List<String> types = new ArrayList<>();
     private List<String> functions = new ArrayList<>();
@@ -134,7 +144,87 @@
     // We don't use USE_PREPARED_VALUES in the code below so some test can foce value preparation (if the result
     // is not expected to be the same without preparation)
     private boolean usePrepared = USE_PREPARED_VALUES;
-    private static final boolean reusePrepared = Boolean.valueOf(System.getProperty("cassandra.test.reuse_prepared", "true"));
+    private static boolean reusePrepared = REUSE_PREPARED;
+
+    public static void prepareServer()
+    {
+        if (isServerPrepared)
+            return;
+
+        // Cleanup first
+        try
+        {
+            cleanupAndLeaveDirs();
+        }
+        catch (IOException e)
+        {
+            logger.error("Failed to cleanup and recreate directories.");
+            throw new RuntimeException(e);
+        }
+
+        Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler()
+        {
+            public void uncaughtException(Thread t, Throwable e)
+            {
+                logger.error("Fatal exception in thread " + t, e);
+            }
+        });
+
+        ThreadAwareSecurityManager.install();
+
+        DatabaseDescriptor.setDaemonInitialized();
+        Keyspace.setInitialized();
+        isServerPrepared = true;
+    }
+
+    public static void cleanupAndLeaveDirs() throws IOException
+    {
+        // We need to stop and unmap all CLS instances prior to cleanup() or we'll get failures on Windows.
+        CommitLog.instance.stopUnsafe(true);
+        mkdirs();
+        cleanup();
+        mkdirs();
+        CommitLog.instance.restartUnsafe();
+    }
+
+    public static void cleanup()
+    {
+        // clean up commitlog
+        String[] directoryNames = { DatabaseDescriptor.getCommitLogLocation(), };
+        for (String dirName : directoryNames)
+        {
+            File dir = new File(dirName);
+            if (!dir.exists())
+                throw new RuntimeException("No such directory: " + dir.getAbsolutePath());
+            FileUtils.deleteRecursive(dir);
+        }
+
+        cleanupSavedCaches();
+
+        // clean up data directory which are stored as data directory/keyspace/data files
+        for (String dirName : DatabaseDescriptor.getAllDataFileLocations())
+        {
+            File dir = new File(dirName);
+            if (!dir.exists())
+                throw new RuntimeException("No such directory: " + dir.getAbsolutePath());
+            FileUtils.deleteRecursive(dir);
+        }
+    }
+
+    public static void mkdirs()
+    {
+        DatabaseDescriptor.createAllDirectories();
+    }
+
+    public static void cleanupSavedCaches()
+    {
+        File cachesDir = new File(DatabaseDescriptor.getSavedCachesLocation());
+
+        if (!cachesDir.exists() || !cachesDir.isDirectory())
+            return;
+
+        FileUtils.delete(cachesDir.listFiles());
+    }
 
     @BeforeClass
     public static void setUpClass()
@@ -142,17 +232,15 @@
         if (ROW_CACHE_SIZE_IN_MB > 0)
             DatabaseDescriptor.setRowCacheSizeInMB(ROW_CACHE_SIZE_IN_MB);
 
-        DatabaseDescriptor.setPartitioner(Murmur3Partitioner.instance);
+        StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance);
     }
 
     @AfterClass
     public static void tearDownClass()
     {
-        for (Session sess : session)
-            if (sess != null)
+        for (Session sess : sessions.values())
                 sess.close();
-        for (Cluster cl : cluster)
-            if (cl != null)
+        for (Cluster cl : clusters.values())
                 cl.close();
 
         if (server != null)
@@ -178,11 +266,14 @@
 
         // Restore standard behavior in case it was changed
         usePrepared = USE_PREPARED_VALUES;
+        reusePrepared = REUSE_PREPARED;
 
+        final List<String> keyspacesToDrop = copy(keyspaces);
         final List<String> tablesToDrop = copy(tables);
         final List<String> typesToDrop = copy(types);
         final List<String> functionsToDrop = copy(functions);
         final List<String> aggregatesToDrop = copy(aggregates);
+        keyspaces = null;
         tables = null;
         types = null;
         functions = null;
@@ -207,8 +298,11 @@
                     for (int i = typesToDrop.size() - 1; i >= 0; i--)
                         schemaChange(String.format("DROP TYPE IF EXISTS %s.%s", KEYSPACE, typesToDrop.get(i)));
 
+                    for (int i = keyspacesToDrop.size() - 1; i >= 0; i--)
+                        schemaChange(String.format("DROP KEYSPACE IF EXISTS %s", keyspacesToDrop.get(i)));
+
                     // Dropping doesn't delete the sstables. It's not a huge deal but it's cleaner to cleanup after us
-                    // Thas said, we shouldn't delete blindly before the SSTableDeletingTask for the table we drop
+                    // Thas said, we shouldn't delete blindly before the TransactionLogs.SSTableTidier for the table we drop
                     // have run or they will be unhappy. Since those taks are scheduled on StorageService.tasks and that's
                     // mono-threaded, just push a task on the queue to find when it's empty. No perfect but good enough.
 
@@ -238,29 +332,75 @@
         if (server != null)
             return;
 
+        prepareNetwork();
+        initializeNetwork();
+    }
+
+    protected static void prepareNetwork()
+    {
         SystemKeyspace.finishStartup();
         StorageService.instance.initServer();
         SchemaLoader.startGossiper();
+    }
 
-        server = new org.apache.cassandra.transport.Server(nativeAddr, nativePort);
+    protected static void reinitializeNetwork()
+    {
+        if (server != null && server.isRunning())
+        {
+            server.stop();
+            server = null;
+        }
+        List<CloseFuture> futures = new ArrayList<>();
+        for (Cluster cluster : clusters.values())
+            futures.add(cluster.closeAsync());
+        for (Session session : sessions.values())
+            futures.add(session.closeAsync());
+        FBUtilities.waitOnFutures(futures);
+        clusters.clear();
+        sessions.clear();
+
+        initializeNetwork();
+    }
+
+    private static void initializeNetwork()
+    {
+        protocolVersionLimit = ConfiguredLimit.newLimit();
+        server = new Server.Builder().withHost(nativeAddr)
+                                     .withPort(nativePort)
+                                     .withProtocolVersionLimit(protocolVersionLimit)
+                                     .build();
+        ClientMetrics.instance.init(Collections.singleton(server));
         server.start();
 
-        for (int version = 1; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
         {
-            if (cluster[version-1] != null)
+            if (clusters.containsKey(version))
                 continue;
 
-            cluster[version-1] = Cluster.builder().addContactPoints(nativeAddr)
-                                  .withClusterName("Test Cluster")
-                                  .withPort(nativePort)
-                                  .withProtocolVersion(ProtocolVersion.fromInt(version))
-                                  .build();
-            session[version-1] = cluster[version-1].connect();
+            if (version > protocolVersionLimit.getMaxVersion())
+                continue;
+
+            Cluster cluster = Cluster.builder()
+                                     .addContactPoints(nativeAddr)
+                                     .withClusterName("Test Cluster-v" + version)
+                                     .withPort(nativePort)
+                                     .withProtocolVersion(ProtocolVersion.fromInt(version))
+                                     .build();
+            clusters.put(version, cluster);
+            sessions.put(version, cluster.connect());
 
             logger.info("Started Java Driver instance for protocol version {}", version);
         }
     }
 
+    protected void updateMaxNegotiableProtocolVersion()
+    {
+        if (protocolVersionLimit == null)
+            throw new IllegalStateException("Native transport server has not been initialized");
+
+        protocolVersionLimit.updateMaxSupportedVersion();
+    }
+
     protected void dropPerTestKeyspace() throws Throwable
     {
         execute(String.format("DROP KEYSPACE IF EXISTS %s", KEYSPACE_PER_TEST));
@@ -275,27 +415,69 @@
         return list.isEmpty() ? Collections.<String>emptyList() : new ArrayList<>(list);
     }
 
+    public ColumnFamilyStore getCurrentColumnFamilyStore(String keyspace)
+    {
+        String currentTable = currentTable();
+        return currentTable == null
+             ? null
+             : Keyspace.open(keyspace).getColumnFamilyStore(currentTable);
+    }
+
+    public ColumnFamilyStore getCurrentColumnFamilyStore()
+    {
+        return getCurrentColumnFamilyStore(KEYSPACE);
+    }
+
     public void flush()
     {
-        try
-        {
-            String currentTable = currentTable();
-            if (currentTable != null)
-                Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable).forceFlush().get();
-        }
-        catch (InterruptedException | ExecutionException e)
-        {
-            throw new RuntimeException(e);
-        }
+        flush(KEYSPACE);
+    }
+
+    public void flush(String keyspace)
+    {
+        ColumnFamilyStore store = getCurrentColumnFamilyStore(keyspace);
+        if (store != null)
+            store.forceBlockingFlush();
+    }
+
+    public void disableCompaction(String keyspace)
+    {
+        ColumnFamilyStore store = getCurrentColumnFamilyStore(keyspace);
+        if (store != null)
+            store.disableAutoCompaction();
+    }
+
+    public void flush(boolean forceFlush)
+    {
+        if (forceFlush)
+            flush();
+    }
+
+    @FunctionalInterface
+    public interface CheckedFunction {
+        void apply() throws Throwable;
+    }
+
+    /**
+     * Runs the given function before and after a flush of sstables.  This is useful for checking that behavior is
+     * the same whether data is in memtables or sstables.
+     * @param runnable
+     * @throws Throwable
+     */
+    public void beforeAndAfterFlush(CheckedFunction runnable) throws Throwable
+    {
+        runnable.apply();
+        flush();
+        runnable.apply();
     }
 
     public void compact()
     {
         try
         {
-            String currentTable = currentTable();
-            if (currentTable != null)
-                Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable).forceMajorCompaction();
+            ColumnFamilyStore store = getCurrentColumnFamilyStore();
+            if (store != null)
+                store.forceMajorCompaction();
         }
         catch (InterruptedException | ExecutionException e)
         {
@@ -303,11 +485,28 @@
         }
     }
 
+    public void disableCompaction()
+    {
+        disableCompaction(KEYSPACE);
+    }
+
+    public void enableCompaction(String keyspace)
+    {
+        ColumnFamilyStore store = getCurrentColumnFamilyStore(keyspace);
+        if (store != null)
+            store.enableAutoCompaction();
+    }
+
+    public void enableCompaction()
+    {
+        enableCompaction(KEYSPACE);
+    }
+
     public void cleanupCache()
     {
-        String currentTable = currentTable();
-        if (currentTable != null)
-            Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable).cleanupCache();
+        ColumnFamilyStore store = getCurrentColumnFamilyStore();
+        if (store != null)
+            store.cleanupCache();
     }
 
     public static FunctionName parseFunctionName(String qualifiedName)
@@ -370,6 +569,11 @@
         this.usePrepared = USE_PREPARED_VALUES;
     }
 
+    protected void disablePreparedReuseForTest()
+    {
+        this.reusePrepared = false;
+    }
+
     protected String createType(String query)
     {
         String typeName = "type_" + seqNumber.getAndIncrement();
@@ -410,10 +614,31 @@
         schemaChange(fullQuery);
     }
 
+    protected String createKeyspace(String query)
+    {
+        String currentKeyspace = createKeyspaceName();
+        String fullQuery = String.format(query, currentKeyspace);
+        logger.info(fullQuery);
+        schemaChange(fullQuery);
+        return currentKeyspace;
+    }
+
+    protected String createKeyspaceName()
+    {
+        String currentKeyspace = "keyspace_" + seqNumber.getAndIncrement();
+        keyspaces.add(currentKeyspace);
+        return currentKeyspace;
+    }
+
     protected String createTable(String query)
     {
+        return createTable(KEYSPACE, query);
+    }
+
+    protected String createTable(String keyspace, String query)
+    {
         String currentTable = createTableName();
-        String fullQuery = formatQuery(query);
+        String fullQuery = formatQuery(keyspace, query);
         logger.info(fullQuery);
         schemaChange(fullQuery);
         return currentTable;
@@ -428,8 +653,7 @@
 
     protected void createTableMayThrow(String query) throws Throwable
     {
-        String currentTable = "table_" + seqNumber.getAndIncrement();
-        tables.add(currentTable);
+        String currentTable = createTableName();
         String fullQuery = formatQuery(query);
         logger.info(fullQuery);
         QueryProcessor.executeOnceInternal(fullQuery);
@@ -451,16 +675,24 @@
 
     protected void dropTable(String query)
     {
-        String fullQuery = String.format(query, KEYSPACE + "." + currentTable());
-        logger.info(fullQuery);
-        schemaChange(fullQuery);
+        dropFormattedTable(String.format(query, KEYSPACE + "." + currentTable()));
+    }
+
+    protected void dropFormattedTable(String formattedQuery)
+    {
+        logger.info(formattedQuery);
+        schemaChange(formattedQuery);
     }
 
     protected void createIndex(String query)
     {
-        String fullQuery = formatQuery(query);
-        logger.info(fullQuery);
-        schemaChange(fullQuery);
+        createFormattedIndex(formatQuery(query));
+    }
+
+    protected void createFormattedIndex(String formattedQuery)
+    {
+        logger.info(formattedQuery);
+        schemaChange(formattedQuery);
     }
 
     /**
@@ -472,13 +704,12 @@
     {
         long start = System.currentTimeMillis();
         boolean indexCreated = false;
-        String indedName = String.format("%s.%s", table, index);
         while (!indexCreated)
         {
             Object[][] results = getRows(execute("select index_name from system.\"IndexInfo\" where table_name = ?", keyspace));
             for(int i = 0; i < results.length; i++)
             {
-                if (indedName.equals(results[i][0]))
+                if (index.equals(results[i][0]))
                 {
                     indexCreated = true;
                     break;
@@ -538,6 +769,7 @@
         }
         catch (Exception e)
         {
+            logger.info("Error performing schema change", e);
             throw new RuntimeException("Error setting schema for test (query was: " + query + ")", e);
         }
     }
@@ -549,37 +781,49 @@
 
     protected com.datastax.driver.core.ResultSet executeNet(int protocolVersion, String query, Object... values) throws Throwable
     {
-        requireNetwork();
-
-        return session[protocolVersion-1].execute(formatQuery(query), values);
+        return sessionNet(protocolVersion).execute(formatQuery(query), values);
     }
 
     protected com.datastax.driver.core.ResultSet executeNetWithPaging(String query, int pageSize) throws Throwable
     {
-        return sessionNet(maxProtocolVersion).execute(new SimpleStatement(formatQuery(query)).setFetchSize(pageSize));
+        return sessionNet().execute(new SimpleStatement(formatQuery(query)).setFetchSize(pageSize));
+    }
+
+    protected Session sessionNet()
+    {
+        return sessionNet(PROTOCOL_VERSIONS.get(PROTOCOL_VERSIONS.size() - 1));
     }
 
     protected Session sessionNet(int protocolVersion)
     {
         requireNetwork();
 
-        return session[protocolVersion-1];
+        return sessions.get(protocolVersion);
     }
 
-    private String formatQuery(String query)
+    protected String formatQuery(String query)
+    {
+        return formatQuery(KEYSPACE, query);
+    }
+
+    protected final String formatQuery(String keyspace, String query)
     {
         String currentTable = currentTable();
-        return currentTable == null ? query : String.format(query, KEYSPACE + "." + currentTable);
+        return currentTable == null ? query : String.format(query, keyspace + "." + currentTable);
     }
 
     protected UntypedResultSet execute(String query, Object... values) throws Throwable
     {
-        query = formatQuery(query);
+        return executeFormattedQuery(formatQuery(query), values);
+    }
 
+    protected UntypedResultSet executeFormattedQuery(String query, Object... values) throws Throwable
+    {
         UntypedResultSet rs;
         if (usePrepared)
         {
-            logger.info("Executing: {} with values {}", query, formatAllValues(values));
+            if (logger.isTraceEnabled())
+                logger.trace("Executing: {} with values {}", query, formatAllValues(values));
             if (reusePrepared)
             {
                 rs = QueryProcessor.executeInternal(query, transformValues(values));
@@ -599,16 +843,25 @@
         else
         {
             query = replaceValues(query, values);
-            logger.info("Executing: {}", query);
+            if (logger.isTraceEnabled())
+                logger.trace("Executing: {}", query);
             rs = QueryProcessor.executeOnceInternal(query);
         }
         if (rs != null)
-            logger.info("Got {} rows", rs.size());
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Got {} rows", rs.size());
+        }
         return rs;
     }
 
     protected void assertRowsNet(int protocolVersion, ResultSet result, Object[]... rows)
     {
+        // necessary as we need cluster objects to supply CodecRegistry.
+        // It's reasonably certain that the network setup has already been done
+        // by the time we arrive at this point, but adding this check doesn't hurt
+        requireNetwork();
+
         if (result == null)
         {
             if (rows.length > 0)
@@ -631,19 +884,21 @@
             for (int j = 0; j < meta.size(); j++)
             {
                 DataType type = meta.getType(j);
-                ByteBuffer expectedByteValue = type.serialize(expected[j], ProtocolVersion.fromInt(protocolVersion));
-                int expectedBytes = expectedByteValue.remaining();
+                com.datastax.driver.core.TypeCodec<Object> codec = clusters.get(protocolVersion).getConfiguration()
+                                                                                                .getCodecRegistry()
+                                                                                                .codecFor(type);
+                ByteBuffer expectedByteValue = codec.serialize(expected[j], ProtocolVersion.fromInt(protocolVersion));
+                int expectedBytes = expectedByteValue == null ? -1 : expectedByteValue.remaining();
                 ByteBuffer actualValue = actual.getBytesUnsafe(meta.getName(j));
-                int actualBytes = actualValue.remaining();
-
+                int actualBytes = actualValue == null ? -1 : actualValue.remaining();
                 if (!Objects.equal(expectedByteValue, actualValue))
                     Assert.fail(String.format("Invalid value for row %d column %d (%s of type %s), " +
                                               "expected <%s> (%d bytes) but got <%s> (%d bytes) " +
                                               "(using protocol version %d)",
                                               i, j, meta.getName(j), type,
-                                              type.format(expected[j]),
+                                              codec.format(expected[j]),
                                               expectedBytes,
-                                              type.format(type.deserialize(actualValue, ProtocolVersion.fromInt(protocolVersion))),
+                                              codec.format(codec.deserialize(actualValue, ProtocolVersion.fromInt(protocolVersion))),
                                               actualBytes,
                                               protocolVersion));
             }
@@ -667,10 +922,10 @@
 
     protected void assertRowsNet(ResultSet result, Object[]... rows)
     {
-        assertRowsNet(maxProtocolVersion, result, rows);
+        assertRowsNet(PROTOCOL_VERSIONS.get(PROTOCOL_VERSIONS.size() - 1), result, rows);
     }
 
-    protected void assertRows(UntypedResultSet result, Object[]... rows)
+    public static void assertRows(UntypedResultSet result, Object[]... rows)
     {
         if (result == null)
         {
@@ -687,18 +942,18 @@
             Object[] expected = rows[i];
             UntypedResultSet.Row actual = iter.next();
 
-            Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), expected.length, meta.size());
+            Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), expected == null ? 1 : expected.length, meta.size());
 
             for (int j = 0; j < meta.size(); j++)
             {
                 ColumnSpecification column = meta.get(j);
-                ByteBuffer expectedByteValue = makeByteBuffer(expected[j], column.type);
+                ByteBuffer expectedByteValue = makeByteBuffer(expected == null ? null : expected[j], column.type);
                 ByteBuffer actualValue = actual.getBytes(column.name.toString());
 
                 if (!Objects.equal(expectedByteValue, actualValue))
                 {
-                    Object actualValueDecoded = column.type.getSerializer().deserialize(actualValue);
-                    if (!actualValueDecoded.equals(expected[j]))
+                    Object actualValueDecoded = actualValue == null ? null : column.type.getSerializer().deserialize(actualValue);
+                    if (!Objects.equal(expected[j], actualValueDecoded))
                         Assert.fail(String.format("Invalid value for row %d column %d (%s of type %s), expected <%s> but got <%s>",
                                                   i,
                                                   j,
@@ -718,12 +973,115 @@
                 iter.next();
                 i++;
             }
-            Assert.fail(String.format("Got less rows than expected. Expected %d but got %d.", rows.length, i));
+            Assert.fail(String.format("Got more rows than expected. Expected %d but got %d.", rows.length, i));
         }
 
         Assert.assertTrue(String.format("Got %s rows than expected. Expected %d but got %d", rows.length>i ? "less" : "more", rows.length, i), i == rows.length);
     }
 
+    /**
+     * Like assertRows(), but ignores the ordering of rows.
+     */
+    public static void assertRowsIgnoringOrder(UntypedResultSet result, Object[]... rows)
+    {
+        assertRowsIgnoringOrderInternal(result, false, rows);
+    }
+
+    public static void assertRowsIgnoringOrderAndExtra(UntypedResultSet result, Object[]... rows)
+    {
+        assertRowsIgnoringOrderInternal(result, true, rows);
+    }
+
+    private static void assertRowsIgnoringOrderInternal(UntypedResultSet result, boolean ignoreExtra, Object[]... rows)
+    {
+        if (result == null)
+        {
+            if (rows.length > 0)
+                Assert.fail(String.format("No rows returned by query but %d expected", rows.length));
+            return;
+        }
+
+        List<ColumnSpecification> meta = result.metadata();
+
+        Set<List<ByteBuffer>> expectedRows = new HashSet<>(rows.length);
+        for (Object[] expected : rows)
+        {
+            Assert.assertEquals("Invalid number of (expected) values provided for row", expected.length, meta.size());
+            List<ByteBuffer> expectedRow = new ArrayList<>(meta.size());
+            for (int j = 0; j < meta.size(); j++)
+                expectedRow.add(makeByteBuffer(expected[j], meta.get(j).type));
+            expectedRows.add(expectedRow);
+        }
+
+        Set<List<ByteBuffer>> actualRows = new HashSet<>(result.size());
+        for (UntypedResultSet.Row actual : result)
+        {
+            List<ByteBuffer> actualRow = new ArrayList<>(meta.size());
+            for (int j = 0; j < meta.size(); j++)
+                actualRow.add(actual.getBytes(meta.get(j).name.toString()));
+            actualRows.add(actualRow);
+        }
+
+        com.google.common.collect.Sets.SetView<List<ByteBuffer>> extra = com.google.common.collect.Sets.difference(actualRows, expectedRows);
+        com.google.common.collect.Sets.SetView<List<ByteBuffer>> missing = com.google.common.collect.Sets.difference(expectedRows, actualRows);
+        if ((!ignoreExtra && !extra.isEmpty()) || !missing.isEmpty())
+        {
+            List<String> extraRows = makeRowStrings(extra, meta);
+            List<String> missingRows = makeRowStrings(missing, meta);
+            StringBuilder sb = new StringBuilder();
+            if (!extra.isEmpty())
+            {
+                sb.append("Got ").append(extra.size()).append(" extra row(s) ");
+                if (!missing.isEmpty())
+                    sb.append("and ").append(missing.size()).append(" missing row(s) ");
+                sb.append("in result.  Extra rows:\n    ");
+                sb.append(extraRows.stream().collect(Collectors.joining("\n    ")));
+                if (!missing.isEmpty())
+                    sb.append("\nMissing Rows:\n    ").append(missingRows.stream().collect(Collectors.joining("\n    ")));
+                Assert.fail(sb.toString());
+            }
+
+            if (!missing.isEmpty())
+                Assert.fail("Missing " + missing.size() + " row(s) in result: \n    " + missingRows.stream().collect(Collectors.joining("\n    ")));
+        }
+
+        assert ignoreExtra || expectedRows.size() == actualRows.size();
+    }
+
+    protected static List<String> makeRowStrings(UntypedResultSet resultSet)
+    {
+        List<List<ByteBuffer>> rows = new ArrayList<>();
+        for (UntypedResultSet.Row row : resultSet)
+        {
+            List<ByteBuffer> values = new ArrayList<>();
+            for (ColumnSpecification columnSpecification : resultSet.metadata())
+            {
+                values.add(row.getBytes(columnSpecification.name.toString()));
+            }
+            rows.add(values);
+        }
+
+        return makeRowStrings(rows, resultSet.metadata());
+    }
+
+    private static List<String> makeRowStrings(Iterable<List<ByteBuffer>> rows, List<ColumnSpecification> meta)
+    {
+        List<String> strings = new ArrayList<>();
+        for (List<ByteBuffer> row : rows)
+        {
+            StringBuilder sb = new StringBuilder("row(");
+            for (int j = 0; j < row.size(); j++)
+            {
+                ColumnSpecification column = meta.get(j);
+                sb.append(column.name.toString()).append("=").append(formatValue(row.get(j), column.type));
+                if (j < (row.size() - 1))
+                    sb.append(", ");
+            }
+            strings.add(sb.append(")").toString());
+        }
+        return strings;
+    }
+
     protected void assertRowCount(UntypedResultSet result, int numExpectedRows)
     {
         if (result == null)
@@ -806,7 +1164,7 @@
         assertRows(execute("SELECT * FROM %s"), rows);
     }
 
-    protected Object[] row(Object... expected)
+    public static Object[] row(Object... expected)
     {
         return expected;
     }
@@ -814,7 +1172,7 @@
     protected void assertEmpty(UntypedResultSet result) throws Throwable
     {
         if (result != null && !result.isEmpty())
-            throw new AssertionError(String.format("Expected empty result but got %d rows", result.size()));
+            throw new AssertionError(String.format("Expected empty result but got %d rows: %s \n", result.size(), makeRowStrings(result)));
     }
 
     protected void assertInvalid(String query, Object... values) throws Throwable
@@ -834,15 +1192,30 @@
 
     protected void assertInvalidThrowMessage(String errorMessage, Class<? extends Throwable> exception, String query, Object... values) throws Throwable
     {
+        assertInvalidThrowMessage(Integer.MIN_VALUE, errorMessage, exception, query, values);
+    }
+
+    // if a protocol version > Integer.MIN_VALUE is supplied, executes
+    // the query via the java driver, mimicking a real client.
+    protected void assertInvalidThrowMessage(int protocolVersion,
+                                             String errorMessage,
+                                             Class<? extends Throwable> exception,
+                                             String query,
+                                             Object... values) throws Throwable
+    {
         try
         {
-            execute(query, values);
+            if (protocolVersion == Integer.MIN_VALUE)
+                execute(query, values);
+            else
+                executeNet(protocolVersion, query, values);
+
             String q = USE_PREPARED_VALUES
                        ? query + " (values: " + formatAllValues(values) + ")"
                        : replaceValues(query, values);
             Assert.fail("Query should be invalid but no error was thrown. Query is: " + q);
         }
-        catch (CassandraException e)
+        catch (Exception e)
         {
             if (exception != null && !exception.isAssignableFrom(e.getClass()))
             {
@@ -1172,6 +1545,12 @@
         return m;
     }
 
+    protected com.datastax.driver.core.TupleType tupleTypeOf(int protocolVersion, DataType...types)
+    {
+        requireNetwork();
+        return clusters.get(protocolVersion).getMetadata().newTupleType(types);
+    }
+
     // Attempt to find an AbstracType from a value (for serialization/printing sake).
     // Will work as long as we use types we know of, which is good enough for testing
     private static AbstractType typeFor(Object value)

diff --git a/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java b/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java
index c8b3a2f..71524c5 100644
--- a/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ColumnConditionTest.java

@@ -17,27 +17,29 @@
  */
 package org.apache.cassandra.cql3;
 
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.junit.Test;
+
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.db.BufferCell;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.serializers.Int32Serializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.junit.Test;
+import org.apache.cassandra.utils.UUIDGen;
 
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import static org.apache.cassandra.utils.ByteBufferUtil.UNSET_BYTE_BUFFER;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.fail;
+import static org.junit.Assert.*;
 
 public class ColumnConditionTest
 {
+    private static final CellPath LIST_PATH = CellPath.create(ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes()));
+
     public static final ByteBuffer ZERO = Int32Type.instance.fromString("0");
     public static final ByteBuffer ONE = Int32Type.instance.fromString("1");
     public static final ByteBuffer TWO = Int32Type.instance.fromString("2");
@@ -50,11 +52,15 @@
         Cell cell = null;
         if (columnValue != null)
         {
-            CompoundSparseCellNameType nameType = new CompoundSparseCellNameType(Collections.EMPTY_LIST);
-            ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), Int32Type.instance, null, null, null, null, null);
-            cell = new BufferCell(nameType.create(Composites.EMPTY, definition), columnValue);
+            ColumnDefinition definition = ColumnDefinition.regularDef("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true));
+            cell = testCell(definition, columnValue, LIST_PATH);
         }
-        return bound.isSatisfiedByValue(conditionValue, cell, Int32Type.instance, bound.operator, 1234);
+        return bound.isSatisfiedByValue(conditionValue, cell, Int32Type.instance, bound.operator);
+    }
+
+    private static Cell testCell(ColumnDefinition column, ByteBuffer value, CellPath path)
+    {
+        return new BufferCell(column, 0L, Cell.NO_TTL, Cell.NO_DELETION_TIME, value, path);
     }
 
     private static void assertThrowsIRE(ColumnCondition.Bound bound, ByteBuffer conditionValue, ByteBuffer columnValue)
@@ -69,7 +75,7 @@
     @Test
     public void testSimpleBoundIsSatisfiedByValue() throws InvalidRequestException
     {
-        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), Int32Type.instance, null, null, null, null, null);
+        ColumnDefinition definition = ColumnDefinition.regularDef("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true));
 
         // EQ
         ColumnCondition condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Operator.EQ);
@@ -83,7 +89,6 @@
         assertTrue(isSatisfiedBy(bound, null, null));
         assertFalse(isSatisfiedBy(bound, ONE, null));
         assertFalse(isSatisfiedBy(bound, null, ONE));
-        assertThrowsIRE(bound, UNSET_BYTE_BUFFER, ONE);
 
         // NEQ
         condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Operator.NEQ);
@@ -97,7 +102,6 @@
         assertFalse(isSatisfiedBy(bound, null, null));
         assertTrue(isSatisfiedBy(bound, ONE, null));
         assertTrue(isSatisfiedBy(bound, null, ONE));
-        assertThrowsIRE(bound, UNSET_BYTE_BUFFER, ONE);
 
         // LT
         condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Operator.LT);
@@ -110,7 +114,6 @@
         assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
         assertThrowsIRE(bound, null, ONE);
         assertFalse(isSatisfiedBy(bound, ONE, null));
-        assertThrowsIRE(bound, UNSET_BYTE_BUFFER, ONE);
 
         // LTE
         condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Operator.LTE);
@@ -123,7 +126,6 @@
         assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
         assertThrowsIRE(bound, null, ONE);
         assertFalse(isSatisfiedBy(bound, ONE, null));
-        assertThrowsIRE(bound, UNSET_BYTE_BUFFER, ONE);
 
         // GT
         condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Operator.GT);
@@ -136,7 +138,6 @@
         assertFalse(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
         assertThrowsIRE(bound, null, ONE);
         assertFalse(isSatisfiedBy(bound, ONE, null));
-        assertThrowsIRE(bound, UNSET_BYTE_BUFFER, ONE);
 
         // GT
         condition = ColumnCondition.condition(definition, new Constants.Value(ONE), Operator.GTE);
@@ -149,7 +150,6 @@
         assertTrue(isSatisfiedBy(bound, ByteBufferUtil.EMPTY_BYTE_BUFFER, ByteBufferUtil.EMPTY_BYTE_BUFFER));
         assertThrowsIRE(bound, null, ONE);
         assertFalse(isSatisfiedBy(bound, ONE, null));
-        assertThrowsIRE(bound, UNSET_BYTE_BUFFER, ONE);
     }
 
     private static List<ByteBuffer> list(ByteBuffer... values)
@@ -162,8 +162,8 @@
         CFMetaData cfm = CFMetaData.compile("create table foo(a int PRIMARY KEY, b int, c list<int>)", "ks");
         Map<ByteBuffer, CollectionType> typeMap = new HashMap<>();
         typeMap.put(ByteBufferUtil.bytes("c"), ListType.getInstance(Int32Type.instance, true));
-        CompoundSparseCellNameType.WithCollection nameType = new CompoundSparseCellNameType.WithCollection(Collections.EMPTY_LIST, ColumnToCollectionType.getInstance(typeMap));
-        ColumnDefinition definition = new ColumnDefinition(cfm, ByteBufferUtil.bytes("c"), ListType.getInstance(Int32Type.instance, true), 0, ColumnDefinition.Kind.REGULAR);
+
+        ColumnDefinition definition = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("c"), ListType.getInstance(Int32Type.instance, true));
 
         List<Cell> cells = new ArrayList<>(columnValues.size());
         if (columnValues != null)
@@ -172,7 +172,7 @@
             {
                 ByteBuffer key = Int32Serializer.instance.serialize(i);
                 ByteBuffer value = columnValues.get(i);
-                cells.add(new BufferCell(nameType.create(Composites.EMPTY, definition, key), value));
+                cells.add(testCell(definition, value, CellPath.create(key)));
             };
         }
 
@@ -183,7 +183,7 @@
     // sets use the same check as lists
     public void testListCollectionBoundAppliesTo() throws InvalidRequestException
     {
-        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), ListType.getInstance(Int32Type.instance, true), null, null, null, null, null);
+        ColumnDefinition definition = ColumnDefinition.regularDef("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true));
 
         // EQ
         ColumnCondition condition = ColumnCondition.condition(definition, null, new Lists.Value(Arrays.asList(ONE)), Operator.EQ);
@@ -294,8 +294,7 @@
         CFMetaData cfm = CFMetaData.compile("create table foo(a int PRIMARY KEY, b int, c set<int>)", "ks");
         Map<ByteBuffer, CollectionType> typeMap = new HashMap<>();
         typeMap.put(ByteBufferUtil.bytes("c"), SetType.getInstance(Int32Type.instance, true));
-        CompoundSparseCellNameType.WithCollection nameType = new CompoundSparseCellNameType.WithCollection(Collections.EMPTY_LIST, ColumnToCollectionType.getInstance(typeMap));
-        ColumnDefinition definition = new ColumnDefinition(cfm, ByteBufferUtil.bytes("c"), SetType.getInstance(Int32Type.instance, true), 0, ColumnDefinition.Kind.REGULAR);
+        ColumnDefinition definition = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("c"), SetType.getInstance(Int32Type.instance, true));
 
         List<Cell> cells = new ArrayList<>(columnValues.size());
         if (columnValues != null)
@@ -303,7 +302,7 @@
             for (int i = 0; i < columnValues.size(); i++)
             {
                 ByteBuffer key = columnValues.get(i);
-                cells.add(new BufferCell(nameType.create(Composites.EMPTY, definition, key), ByteBufferUtil.EMPTY_BYTE_BUFFER));
+                cells.add(testCell(definition, ByteBufferUtil.EMPTY_BYTE_BUFFER, CellPath.create(key)));
             };
         }
 
@@ -313,7 +312,7 @@
     @Test
     public void testSetCollectionBoundAppliesTo() throws InvalidRequestException
     {
-        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("c", true), SetType.getInstance(Int32Type.instance, true), null, null, null, null, null);
+        ColumnDefinition definition = ColumnDefinition.regularDef("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true));
 
         // EQ
         ColumnCondition condition = ColumnCondition.condition(definition, null, new Sets.Value(set(ONE)), Operator.EQ);
@@ -427,14 +426,13 @@
         CFMetaData cfm = CFMetaData.compile("create table foo(a int PRIMARY KEY, b map<int, int>)", "ks");
         Map<ByteBuffer, CollectionType> typeMap = new HashMap<>();
         typeMap.put(ByteBufferUtil.bytes("b"), MapType.getInstance(Int32Type.instance, Int32Type.instance, true));
-        CompoundSparseCellNameType.WithCollection nameType = new CompoundSparseCellNameType.WithCollection(Collections.EMPTY_LIST, ColumnToCollectionType.getInstance(typeMap));
-        ColumnDefinition definition = new ColumnDefinition(cfm, ByteBufferUtil.bytes("b"), MapType.getInstance(Int32Type.instance, Int32Type.instance, true), 0, ColumnDefinition.Kind.REGULAR);
+        ColumnDefinition definition = ColumnDefinition.regularDef(cfm, ByteBufferUtil.bytes("b"), MapType.getInstance(Int32Type.instance, Int32Type.instance, true));
 
         List<Cell> cells = new ArrayList<>(columnValues.size());
         if (columnValues != null)
         {
             for (Map.Entry<ByteBuffer, ByteBuffer> entry : columnValues.entrySet())
-                cells.add(new BufferCell(nameType.create(Composites.EMPTY, definition, entry.getKey()), entry.getValue()));
+                cells.add(testCell(definition, entry.getValue(), CellPath.create(entry.getKey())));
         }
 
         return bound.mapAppliesTo(MapType.getInstance(Int32Type.instance, Int32Type.instance, true), cells.iterator(), conditionValues, bound.operator);
@@ -443,7 +441,7 @@
     @Test
     public void testMapCollectionBoundIsSatisfiedByValue() throws InvalidRequestException
     {
-        ColumnDefinition definition = new ColumnDefinition("ks", "cf", new ColumnIdentifier("b", true), MapType.getInstance(Int32Type.instance, Int32Type.instance, true), null, null, null, null, null);
+        ColumnDefinition definition = ColumnDefinition.regularDef("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true));
 
         Map<ByteBuffer, ByteBuffer> placeholderMap = new TreeMap<>();
         placeholderMap.put(ONE, ONE);
@@ -581,4 +579,4 @@
         assertTrue(mapAppliesTo(bound, map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE), map(ByteBufferUtil.EMPTY_BYTE_BUFFER, ONE)));
         assertTrue(mapAppliesTo(bound, map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER), map(ONE, ByteBufferUtil.EMPTY_BYTE_BUFFER)));
     }
-}
\ No newline at end of file
+}

diff --git a/test/unit/org/apache/cassandra/cql3/ColumnIdentifierTest.java b/test/unit/org/apache/cassandra/cql3/ColumnIdentifierTest.java
new file mode 100644
index 0000000..3a34ad5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ColumnIdentifierTest.java

@@ -0,0 +1,98 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.cql3;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class ColumnIdentifierTest
+{
+
+    @Test
+    public void testComparisonMethod()
+    {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        byte[] commonBytes = new byte[10];
+        byte[] aBytes = new byte[16];
+        byte[] bBytes = new byte[16];
+        for (int i = 0 ; i < 100000 ; i++)
+        {
+            int commonLength = random.nextInt(0, 10);
+            random.nextBytes(commonBytes);
+            random.nextBytes(aBytes);
+            random.nextBytes(bBytes);
+            System.arraycopy(commonBytes, 0, aBytes, 0, commonLength);
+            System.arraycopy(commonBytes, 0, bBytes, 0, commonLength);
+            int aLength = random.nextInt(commonLength, 16);
+            int bLength = random.nextInt(commonLength, 16);
+            ColumnIdentifier a = new ColumnIdentifier(ByteBuffer.wrap(aBytes, 0, aLength), BytesType.instance);
+            ColumnIdentifier b = new ColumnIdentifier(ByteBuffer.wrap(bBytes, 0, bLength), BytesType.instance);
+            Assert.assertEquals("" + i, compareResult(a.compareTo(b)), compareResult(ByteBufferUtil.compareUnsigned(a.bytes, b.bytes)));
+        }
+    }
+
+    private static int compareResult(int v)
+    {
+        return v < 0 ? -1 : v > 0 ? 1 : 0;
+    }
+
+    @Test
+    public void testInternedCache()
+    {
+        AbstractType<?> utf8Type = UTF8Type.instance;
+        AbstractType<?> bytesType = BytesType.instance;
+
+        byte[] bytes = new byte [] { 0x63, (byte) 0x32 };
+        String text = "c2"; // the UTF-8 encoding of this string is the same as bytes, 0x630x32
+
+        ColumnIdentifier c1 = ColumnIdentifier.getInterned(ByteBuffer.wrap(bytes), bytesType);
+        ColumnIdentifier c2 = ColumnIdentifier.getInterned(utf8Type, utf8Type.fromString(text), text);
+        ColumnIdentifier c3 = ColumnIdentifier.getInterned(text, true);
+
+        Assert.assertTrue(c1.isInterned());
+        Assert.assertTrue(c2.isInterned());
+        Assert.assertTrue(c3.isInterned());
+
+        Assert.assertEquals("6332", c1.toString());
+        Assert.assertEquals(text, c2.toString());
+        Assert.assertEquals(text, c3.toString());
+    }
+
+    @Test
+    public void testInterningUsesMinimalByteBuffer()
+    {
+        byte[] bytes = new byte[2];
+        bytes[0] = 0x63;
+        ByteBuffer byteBuffer = ByteBuffer.wrap(bytes);
+        byteBuffer.limit(1);
+
+        ColumnIdentifier c1 = ColumnIdentifier.getInterned(byteBuffer, UTF8Type.instance);
+
+        Assert.assertEquals(2, byteBuffer.capacity());
+        Assert.assertEquals(1, c1.bytes.capacity());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/DeleteTest.java b/test/unit/org/apache/cassandra/cql3/DeleteTest.java
index 812d729..3c95a6c 100644
--- a/test/unit/org/apache/cassandra/cql3/DeleteTest.java
+++ b/test/unit/org/apache/cassandra/cql3/DeleteTest.java

@@ -17,26 +17,17 @@
  */
 package org.apache.cassandra.cql3;
 
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
 
-import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.ConsistencyLevel;
 import com.datastax.driver.core.PreparedStatement;
 import com.datastax.driver.core.ResultSetFuture;
 import com.datastax.driver.core.Session;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.service.EmbeddedCassandraService;
-import org.junit.Assert;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
 
-public class DeleteTest extends SchemaLoader
+public class DeleteTest extends CQLTester
 {
-    private static EmbeddedCassandraService cassandra;
-
-    private static Cluster cluster;
-    private static Session session;
     private static PreparedStatement pstmtI;
     private static PreparedStatement pstmtU;
     private static PreparedStatement pstmtD;
@@ -46,16 +37,13 @@
     private static PreparedStatement pstmt4;
     private static PreparedStatement pstmt5;
 
-    @BeforeClass
-    public static void setup() throws Exception
+    @Before
+    public void prepare() throws Exception
     {
-        Schema.instance.clear();
+        // Schema.instance.clear();
 
-        cassandra = new EmbeddedCassandraService();
-        cassandra.start();
-
-        cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build();
-        session = cluster.connect();
+        Session session = sessionNet();
+        session.getCluster().getConfiguration().getQueryOptions().setConsistencyLevel(ConsistencyLevel.ONE);
 
         session.execute("drop keyspace if exists junit;");
         session.execute("create keyspace junit WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 2 };");
@@ -105,15 +93,10 @@
         pstmt5 = session.prepare("select id, cid, inh_c, val from junit.tpc_inherit_c where id=? and cid=?");
     }
 
-    @AfterClass
-    public static void tearDown() throws Exception
-    {
-        cluster.close();
-    }
-
     @Test
     public void lostDeletesTest()
     {
+        Session session = sessionNet();
 
         for (int i = 0; i < 500; i++)
         {
@@ -150,6 +133,8 @@
     }
 
     private ResultSetFuture[] load() {
+        Session session = sessionNet();
+
         return new ResultSetFuture[]{
                 session.executeAsync(pstmt1.bind(1, 1)),
                 session.executeAsync(pstmt2.bind(1, 1)),

diff --git a/test/unit/org/apache/cassandra/cql3/DistinctQueryPagingTest.java b/test/unit/org/apache/cassandra/cql3/DistinctQueryPagingTest.java
new file mode 100644
index 0000000..f433179
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/DistinctQueryPagingTest.java

@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class DistinctQueryPagingTest extends CQLTester
+{
+    /**
+     * Migrated from cql_tests.py:TestCQL.test_select_distinct()
+     */
+    @Test
+    public void testSelectDistinct() throws Throwable
+    {
+        // Test a regular (CQL3) table.
+        createTable("CREATE TABLE %s (pk0 int, pk1 int, ck0 int, val int, PRIMARY KEY((pk0, pk1), ck0))");
+
+        for (int i = 0; i < 3; i++)
+        {
+            execute("INSERT INTO %s (pk0, pk1, ck0, val) VALUES (?, ?, 0, 0)", i, i);
+            execute("INSERT INTO %s (pk0, pk1, ck0, val) VALUES (?, ?, 1, 1)", i, i);
+        }
+
+        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 1"),
+                   row(0, 0));
+
+        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 3"),
+                   row(0, 0),
+                   row(2, 2),
+                   row(1, 1));
+
+        // Test selection validation.
+        assertInvalidMessage("queries must request all the partition key columns", "SELECT DISTINCT pk0 FROM %s");
+        assertInvalidMessage("queries must only request partition key columns", "SELECT DISTINCT pk0, pk1, ck0 FROM %s");
+
+        // Test a 'compact storage' table.
+        createTable("CREATE TABLE %s (pk0 int, pk1 int, val int, PRIMARY KEY((pk0, pk1))) WITH COMPACT STORAGE");
+
+        for (int i = 0; i < 3; i++)
+            execute("INSERT INTO %s (pk0, pk1, val) VALUES (?, ?, ?)", i, i, i);
+
+        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 1"),
+                   row(0, 0));
+
+        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 3"),
+                   row(0, 0),
+                   row(2, 2),
+                   row(1, 1));
+
+        // Test a 'wide row' thrift table.
+        createTable("CREATE TABLE %s (pk int, name text, val int, PRIMARY KEY(pk, name)) WITH COMPACT STORAGE");
+
+        for (int i = 0; i < 3; i++)
+        {
+            execute("INSERT INTO %s (pk, name, val) VALUES (?, 'name0', 0)", i);
+            execute("INSERT INTO %s (pk, name, val) VALUES (?, 'name1', 1)", i);
+        }
+
+        assertRows(execute("SELECT DISTINCT pk FROM %s LIMIT 1"),
+                   row(1));
+
+        assertRows(execute("SELECT DISTINCT pk FROM %s LIMIT 3"),
+                   row(1),
+                   row(0),
+                   row(2));
+    }
+
+    /**
+     * Migrated from cql_tests.py:TestCQL.test_select_distinct_with_deletions()
+     */
+    @Test
+    public void testSelectDistinctWithDeletions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, c int, v int)");
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", i, i, i);
+
+        Object[][] rows = getRows(execute("SELECT DISTINCT k FROM %s"));
+        Assert.assertEquals(10, rows.length);
+        Object key_to_delete = rows[3][0];
+
+        execute("DELETE FROM %s WHERE k=?", key_to_delete);
+
+        rows = getRows(execute("SELECT DISTINCT k FROM %s"));
+        Assert.assertEquals(9, rows.length);
+
+        rows = getRows(execute("SELECT DISTINCT k FROM %s LIMIT 5"));
+        Assert.assertEquals(5, rows.length);
+
+        rows = getRows(execute("SELECT DISTINCT k FROM %s"));
+        Assert.assertEquals(9, rows.length);
+    }
+
+    @Test
+    public void testSelectDistinctWithWhereClause() throws Throwable {
+        createTable("CREATE TABLE %s (k int, a int, b int, PRIMARY KEY (k, a))");
+        createIndex("CREATE INDEX ON %s (b)");
+
+        for (int i = 0; i < 10; i++)
+        {
+            execute("INSERT INTO %s (k, a, b) VALUES (?, ?, ?)", i, i, i);
+            execute("INSERT INTO %s (k, a, b) VALUES (?, ?, ?)", i, i * 10, i * 10);
+        }
+
+        String distinctQueryErrorMsg = "SELECT DISTINCT with WHERE clause only supports restriction by partition key and/or static columns.";
+        assertInvalidMessage(distinctQueryErrorMsg,
+                             "SELECT DISTINCT k FROM %s WHERE a >= 80 ALLOW FILTERING");
+
+        assertInvalidMessage(distinctQueryErrorMsg,
+                             "SELECT DISTINCT k FROM %s WHERE k IN (1, 2, 3) AND a = 10");
+
+        assertInvalidMessage(distinctQueryErrorMsg,
+                             "SELECT DISTINCT k FROM %s WHERE b = 5");
+
+        assertRows(execute("SELECT DISTINCT k FROM %s WHERE k = 1"),
+                   row(1));
+        assertRows(execute("SELECT DISTINCT k FROM %s WHERE k IN (5, 6, 7)"),
+                   row(5),
+                   row(6),
+                   row(7));
+
+        // With static columns
+        createTable("CREATE TABLE %s (k int, a int, s int static, b int, PRIMARY KEY (k, a))");
+        createIndex("CREATE INDEX ON %s (b)");
+        for (int i = 0; i < 10; i++)
+        {
+            execute("INSERT INTO %s (k, a, b, s) VALUES (?, ?, ?, ?)", i, i, i, i);
+            execute("INSERT INTO %s (k, a, b, s) VALUES (?, ?, ?, ?)", i, i * 10, i * 10, i * 10);
+        }
+
+        assertRows(execute("SELECT DISTINCT s FROM %s WHERE k = 5"),
+                   row(50));
+        assertRows(execute("SELECT DISTINCT s FROM %s WHERE k IN (5, 6, 7)"),
+                   row(50),
+                   row(60),
+                   row(70));
+    }
+
+    @Test
+    public void testSelectDistinctWithWhereClauseOnStaticColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, a int, s int static, s1 int static, b int, PRIMARY KEY (k, a))");
+
+        for (int i = 0; i < 10; i++)
+        {
+            execute("INSERT INTO %s (k, a, b, s, s1) VALUES (?, ?, ?, ?, ?)", i, i, i, i, i);
+            execute("INSERT INTO %s (k, a, b, s, s1) VALUES (?, ?, ?, ?, ?)", i, i * 10, i * 10, i * 10, i * 10);
+        }
+
+        execute("INSERT INTO %s (k, a, b, s, s1) VALUES (?, ?, ?, ?, ?)", 2, 10, 10, 10, 10);
+
+        assertRows(execute("SELECT DISTINCT k, s, s1 FROM %s WHERE s = 90 AND s1 = 90 ALLOW FILTERING"),
+                   row(9, 90, 90));
+
+        assertRows(execute("SELECT DISTINCT k, s, s1 FROM %s WHERE s = 90 AND s1 = 90 ALLOW FILTERING"),
+                   row(9, 90, 90));
+
+        assertRows(execute("SELECT DISTINCT k, s, s1 FROM %s WHERE s = 10 AND s1 = 10 ALLOW FILTERING"),
+                   row(1, 10, 10),
+                   row(2, 10, 10));
+
+        assertRows(execute("SELECT DISTINCT k, s, s1 FROM %s WHERE k = 1 AND s = 10 AND s1 = 10 ALLOW FILTERING"),
+                   row(1, 10, 10));
+    }
+
+    @Test
+    public void testSelectDistinctWithStaticColumnsAndPaging() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, s int static, c int, d int, primary key (a, b));");
+
+        // Test with only static data
+        for (int i = 0; i < 5; i++)
+            execute("INSERT INTO %s (a, s) VALUES (?, ?)", i, i);
+
+        testSelectDistinctWithPaging();
+
+        // Test with a mix of partition with rows and partitions without rows
+        for (int i = 0; i < 5; i++)
+        {
+            if (i % 2 == 0)
+            {
+                for (int j = 1; j < 4; j++)
+                {
+                    execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", i, j, j, i + j);
+                }
+            }
+        }
+
+        testSelectDistinctWithPaging();
+
+        // Test with all partition with rows
+        for (int i = 0; i < 5; i++)
+        {
+            for (int j = 1; j < 4; j++)
+            {
+                execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", i, j, j, i + j);
+            }
+        }
+
+        testSelectDistinctWithPaging();
+    }
+
+    private void testSelectDistinctWithPaging() throws Throwable
+    {
+        for (int pageSize = 1; pageSize < 7; pageSize++)
+        {
+            // Range query
+            assertRowsNet(executeNetWithPaging("SELECT DISTINCT a, s FROM %s", pageSize),
+                          row(1, 1),
+                          row(0, 0),
+                          row(2, 2),
+                          row(4, 4),
+                          row(3, 3));
+
+            assertRowsNet(executeNetWithPaging("SELECT DISTINCT a, s FROM %s LIMIT 3", pageSize),
+                          row(1, 1),
+                          row(0, 0),
+                          row(2, 2));
+
+            assertRowsNet(executeNetWithPaging("SELECT DISTINCT a, s FROM %s WHERE s >= 2 ALLOW FILTERING", pageSize),
+                          row(2, 2),
+                          row(4, 4),
+                          row(3, 3));
+
+            // Multi partition query
+            assertRowsNet(executeNetWithPaging("SELECT DISTINCT a, s FROM %s WHERE a IN (1, 2, 3, 4);", pageSize),
+                          row(1, 1),
+                          row(2, 2),
+                          row(3, 3),
+                          row(4, 4));
+
+            assertRowsNet(executeNetWithPaging("SELECT DISTINCT a, s FROM %s WHERE a IN (1, 2, 3, 4) LIMIT 3;", pageSize),
+                          row(1, 1),
+                          row(2, 2),
+                          row(3, 3));
+
+            assertRowsNet(executeNetWithPaging("SELECT DISTINCT a, s FROM %s WHERE a IN (1, 2, 3, 4) AND s >= 2 ALLOW FILTERING;", pageSize),
+                          row(2, 2),
+                          row(3, 3),
+                          row(4, 4));
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/IndexQueryPagingTest.java b/test/unit/org/apache/cassandra/cql3/IndexQueryPagingTest.java
new file mode 100644
index 0000000..fd1e661
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/IndexQueryPagingTest.java

@@ -0,0 +1,131 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.Session;
+import com.datastax.driver.core.SimpleStatement;
+import com.datastax.driver.core.Statement;
+
+import static org.junit.Assert.assertEquals;
+
+public class IndexQueryPagingTest extends CQLTester
+{
+    /*
+     * Some simple tests to verify the behaviour of paging during
+     * 2i queries. We only use a single index type (CompositesIndexOnRegular)
+     * as the code we want to exercise here is in their abstract
+     * base class.
+     */
+
+    @Test
+    public void pagingOnRegularColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    " k1 int," +
+                    " v1 int," +
+                    "PRIMARY KEY (k1))");
+        createIndex("CREATE INDEX ON %s(v1)");
+
+        int rowCount = 3;
+        for (int i=0; i<rowCount; i++)
+            execute("INSERT INTO %s (k1, v1) VALUES (?, ?)", i, 0);
+
+        executePagingQuery("SELECT * FROM %s WHERE v1=0", rowCount);
+    }
+
+    @Test
+    public void pagingOnRegularColumnWithPartitionRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    " k1 int," +
+                    " c1 int," +
+                    " v1 int," +
+                    "PRIMARY KEY (k1, c1))");
+        createIndex("CREATE INDEX ON %s(v1)");
+
+        int partitions = 3;
+        int rowCount = 3;
+        for (int i=0; i<partitions; i++)
+            for (int j=0; j<rowCount; j++)
+                execute("INSERT INTO %s (k1, c1, v1) VALUES (?, ?, ?)", i, j, 0);
+
+        executePagingQuery("SELECT * FROM %s WHERE k1=0 AND v1=0", rowCount);
+    }
+
+    @Test
+    public void pagingOnRegularColumnWithClusteringRestrictions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    " k1 int," +
+                    " c1 int," +
+                    " v1 int," +
+                    "PRIMARY KEY (k1, c1))");
+        createIndex("CREATE INDEX ON %s(v1)");
+
+        int partitions = 3;
+        int rowCount = 3;
+        for (int i=0; i<partitions; i++)
+            for (int j=0; j<rowCount; j++)
+                execute("INSERT INTO %s (k1, c1, v1) VALUES (?, ?, ?)", i, j, 0);
+
+        executePagingQuery("SELECT * FROM %s WHERE k1=0 AND c1>=0 AND c1<=3 AND v1=0", rowCount);
+    }
+
+    @Test
+    public void testPagingOnPartitionsWithoutClusteringColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int PRIMARY KEY, v int)");
+        createIndex("CREATE INDEX on %s(v)");
+
+        execute("INSERT INTO %s (pk, v) VALUES (201, 200);");
+        execute("INSERT INTO %s (pk, v) VALUES (202, 200);");
+        execute("INSERT INTO %s (pk, v) VALUES (203, 200);");
+        execute("INSERT INTO %s (pk, v) VALUES (100, 100);");
+
+        for (int pageSize = 1; pageSize < 10; pageSize++)
+        {
+            assertRowsNet(executeNetWithPaging("select * from %s where v = 200 and pk = 201;", pageSize),
+                          row(201, 200));
+
+            assertRowsNet(executeNetWithPaging("select * from %s where v = 200;", pageSize),
+                          row(201, 200),
+                          row(203, 200),
+                          row(202, 200));
+
+            assertRowsNet(executeNetWithPaging("select * from %s where v = 100;", pageSize),
+                          row(100, 100));
+        }
+    }
+
+    private void executePagingQuery(String cql, int rowCount)
+    {
+        // Execute an index query which should return all rows,
+        // setting the fetch size < than the row count. Assert
+        // that all rows are returned, so we know that paging
+        // of the results was involved.
+        Session session = sessionNet();
+        Statement stmt = new SimpleStatement(String.format(cql, KEYSPACE + '.' + currentTable()));
+        stmt.setFetchSize(rowCount - 1);
+        assertEquals(rowCount, session.execute(stmt).all().size());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
index 8471682..3f87343 100644
--- a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
+++ b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java

@@ -18,31 +18,35 @@
 
 package org.apache.cassandra.cql3;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
-import com.google.common.collect.ImmutableSet;
-
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.metrics.CacheMetrics;
 import org.apache.cassandra.metrics.CassandraMetricsRegistry;
+import org.apache.cassandra.schema.CachingParams;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.Pair;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.assertNull;
+import org.apache.cassandra.utils.Pair;
+
 
 public class KeyCacheCqlTest extends CQLTester
 {
 
-    static final String commonColumnsDef =
+    private static final String commonColumnsDef =
     "part_key_a     int," +
     "part_key_b     text," +
     "clust_key_a    int," +
@@ -50,8 +54,9 @@
     "clust_key_c    frozen<list<text>>," + // to make it really big
     "col_text       text," +
     "col_int        int," +
-    "col_long       bigint,";
-    static final String commonColumns =
+    "col_long       bigint," +
+    "col_blob       blob,";
+    private static final String commonColumns =
     "part_key_a," +
     "part_key_b," +
     "clust_key_a," +
@@ -61,6 +66,152 @@
     "col_int," +
     "col_long";
 
+    // 1200 chars
+    private static final String longString =
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";
+
+    /**
+     * Prevent system tables from populating the key cache to ensure that
+     * the test can reliably check the size of the key cache size and its metrics.
+     * Test tables will be created with caching enabled manually in the CQL statement,
+     * see {@link KeyCacheCqlTest#createTable(String)}.
+     *
+     * Then call the base class initialization, which must be done after disabling the key cache.
+     */
+    @BeforeClass
+    public static void setUpClass()
+    {
+        CachingParams.DEFAULT = CachingParams.CACHE_NOTHING;
+        CQLTester.setUpClass();
+    }
+
+    /**
+     * Create a table in KEYSPACE_PER_TEST_PER_TEST because it will get dropped synchronously by CQLTester after
+     * each test, whereas the default keyspace gets dropped asynchronously and this may cause unexpected
+     * flush operations during a test, which would change the expected result of metrics.
+     *
+     * Then add manual caching, since by default we have disabled cachinng for all other tables, to ensure
+     * that we can assert on the key cache size and metrics.
+     */
+    @Override
+    protected String createTable(String query)
+    {
+        return super.createTable(KEYSPACE_PER_TEST, query + " WITH caching = { 'keys' : 'ALL', 'rows_per_partition' : '0' }");
+    }
+
+    @Override
+    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    {
+        return executeFormattedQuery(formatQuery(KEYSPACE_PER_TEST, query), values);
+    }
+
+    @Override
+    protected void createIndex(String query)
+    {
+        createFormattedIndex(formatQuery(KEYSPACE_PER_TEST, query));
+    }
+
+    @Override
+    protected void dropTable(String query)
+    {
+        dropFormattedTable(String.format(query, KEYSPACE_PER_TEST + "." + currentTable()));
+    }
+
+    @Test
+    public void testSliceQueries() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk text, ck1 int, ck2 int, val text, vpk text, vck1 int, vck2 int, PRIMARY KEY (pk, ck1, ck2))");
+
+        for (int pkInt = 0; pkInt < 20; pkInt++)
+        {
+            String pk = Integer.toString(pkInt);
+            for (int ck1 = 0; ck1 < 10; ck1++)
+            {
+                for (int ck2 = 0; ck2 < 10; ck2++)
+                {
+                    execute("INSERT INTO %s (pk, ck1, ck2, val, vpk, vck1, vck2) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                            pk, ck1, ck2, makeStringValue(pk, ck1, ck2), pk, ck1, ck2);
+                }
+            }
+        }
+
+        StorageService.instance.forceKeyspaceFlush(KEYSPACE_PER_TEST);
+
+        for (int pkInt = 0; pkInt < 20; pkInt++)
+        {
+            String pk = Integer.toString(pkInt);
+            assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=?", pk),
+                              pk, 0, 10, 0, 10);
+
+            for (int ck1 = 0; ck1 < 10; ck1++)
+            {
+                assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1=?", pk, ck1),
+                                  pk, ck1, ck1+1, 0, 10);
+
+                assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1<?", pk, ck1),
+                                  pk, 0, ck1, 0, 10);
+                assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1>?", pk, ck1),
+                                  pk, ck1+1, 10, 0, 10);
+                assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1<=?", pk, ck1),
+                                  pk, 0, ck1+1, 0, 10);
+                assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1>=?", pk, ck1),
+                                  pk, ck1, 10, 0, 10);
+
+                for (int ck2 = 0; ck2 < 10; ck2++)
+                {
+                    assertRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1=? AND ck2=?", pk, ck1, ck2),
+                               new Object[]{ makeStringValue(pk, ck1, ck2), pk, ck1, ck2 });
+
+                    assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1=? AND ck2<?", pk, ck1, ck2),
+                                      pk, ck1, ck1+1, 0, ck2);
+                    assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1=? AND ck2>?", pk, ck1, ck2),
+                                      pk, ck1, ck1+1, ck2+1, 10);
+                    assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1=? AND ck2<=?", pk, ck1, ck2),
+                                      pk, ck1, ck1+1, 0, ck2+1);
+                    assertClusterRows(execute("SELECT val, vpk, vck1, vck2 FROM %s WHERE pk=? AND ck1=? AND ck2>=?", pk, ck1, ck2),
+                                      pk, ck1, ck1+1, ck2, 10);
+                }
+            }
+        }
+    }
+
+    private static void assertClusterRows(UntypedResultSet rows, String pk, int ck1from, int ck1to, int ck2from, int ck2to)
+    {
+        String info = "pk=" + pk + ", ck1from=" + ck1from + ", ck1to=" + ck1to + ", ck2from=" + ck2from + ", ck2to=" + ck2to;
+        Iterator<UntypedResultSet.Row> iter = rows.iterator();
+        int cnt = 0;
+        int expect = (ck1to - ck1from) * (ck2to - ck2from);
+        for (int ck1 = ck1from; ck1 < ck1to; ck1++)
+        {
+            for (int ck2 = ck2from; ck2 < ck2to; ck2++)
+            {
+                assertTrue("expected " + expect + " (already got " + cnt + ") rows, but more rows are available for " + info, iter.hasNext());
+                UntypedResultSet.Row row = iter.next();
+                assertEquals(makeStringValue(pk, ck1, ck2), row.getString("val"));
+                assertEquals(pk, row.getString("vpk"));
+                assertEquals(ck1, row.getInt("vck1"));
+                assertEquals(ck2, row.getInt("vck2"));
+            }
+        }
+        assertFalse("expected " + expect + " (already got " + cnt + ") rows, but more rows are available for " + info, iter.hasNext());
+    }
+
+    private static String makeStringValue(String pk, int ck1, int ck2)
+    {
+        return longString + ',' + pk + ',' + ck1 + ',' + ck2;
+    }
+
     @Test
     public void test2iKeyCachePaths() throws Throwable
     {
@@ -81,10 +232,8 @@
 
         long hits = metrics.hits.getCount();
         long requests = metrics.requests.getCount();
-        assertEquals(4900, hits);
-        assertEquals(5250, requests);
-
-        //
+        assertEquals(0, hits);
+        assertEquals(210, requests);
 
         for (int i = 0; i < 10; i++)
         {
@@ -98,8 +247,8 @@
         metrics = CacheService.instance.keyCache.getMetrics();
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(10000, hits);
-        assertEquals(10500, requests);
+        assertEquals(200, hits);
+        assertEquals(420, requests);
 
         CacheService.instance.keyCache.submitWrite(Integer.MAX_VALUE).get();
 
@@ -126,7 +275,7 @@
         //Test Schema.getColumnFamilyStoreIncludingIndexes, several null check paths
         //are defensive and unreachable
         assertNull(Schema.instance.getColumnFamilyStoreIncludingIndexes(Pair.create("foo", "bar")));
-        assertNull(Schema.instance.getColumnFamilyStoreIncludingIndexes(Pair.create(KEYSPACE, "bar")));
+        assertNull(Schema.instance.getColumnFamilyStoreIncludingIndexes(Pair.create(KEYSPACE_PER_TEST, "bar")));
 
         dropTable("DROP TABLE %s");
         Schema.instance.updateVersion();
@@ -160,8 +309,8 @@
 
         long hits = metrics.hits.getCount();
         long requests = metrics.requests.getCount();
-        assertEquals(4900, hits);
-        assertEquals(5250, requests);
+        assertEquals(0, hits);
+        assertEquals(210, requests);
 
         //
 
@@ -177,8 +326,8 @@
         metrics = CacheService.instance.keyCache.getMetrics();
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(10000, hits);
-        assertEquals(10500, requests);
+        assertEquals(200, hits);
+        assertEquals(420, requests);
 
         dropTable("DROP TABLE %s");
 
@@ -195,22 +344,120 @@
         while(iter.hasNext())
         {
             KeyCacheKey key = iter.next();
-            Assert.assertFalse(key.ksAndCFName.left.equals("KEYSPACE"));
+            Assert.assertFalse(key.ksAndCFName.left.equals("KEYSPACE_PER_TEST"));
             Assert.assertFalse(key.ksAndCFName.right.startsWith(table));
         }
     }
 
+    @Test
+    public void testKeyCacheNonClustered() throws Throwable
+    {
+        String table = createTable("CREATE TABLE %s ("
+                                   + commonColumnsDef
+                                   + "PRIMARY KEY ((part_key_a, part_key_b)))");
+        insertData(table, null, false);
+        clearCache();
+
+        for (int i = 0; i < 10; i++)
+        {
+            assertRows(execute("SELECT col_text FROM %s WHERE part_key_a = ? AND part_key_b = ?", i, Integer.toOctalString(i)),
+                       new Object[]{ String.valueOf(i) + '-' + String.valueOf(0) });
+        }
+
+        CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
+        long hits = metrics.hits.getCount();
+        long requests = metrics.requests.getCount();
+        assertEquals(0, hits);
+        assertEquals(10, requests);
+
+        for (int i = 0; i < 100; i++)
+        {
+            assertRows(execute("SELECT col_text FROM %s WHERE part_key_a = ? AND part_key_b = ?", i, Integer.toOctalString(i)),
+                       new Object[]{ String.valueOf(i) + '-' + String.valueOf(0) });
+        }
+
+        hits = metrics.hits.getCount();
+        requests = metrics.requests.getCount();
+        assertEquals(10, hits);
+        assertEquals(120, requests);
+    }
+
+    @Test
+    public void testKeyCacheClustered() throws Throwable
+    {
+        String table = createTable("CREATE TABLE %s ("
+                                   + commonColumnsDef
+                                   + "PRIMARY KEY ((part_key_a, part_key_b),clust_key_a,clust_key_b,clust_key_c))");
+        insertData(table, null, true);
+        clearCache();
+
+        // query on partition key
+
+        // 10 queries, each 50 result rows
+        for (int i = 0; i < 10; i++)
+        {
+            assertEquals(50, execute("SELECT col_text FROM %s WHERE part_key_a = ? AND part_key_b = ?", i, Integer.toOctalString(i)).size());
+        }
+
+        CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
+        long hits = metrics.hits.getCount();
+        long requests = metrics.requests.getCount();
+        assertEquals(0, hits);
+        assertEquals(10, requests);
+
+        // 10 queries, each 50 result rows
+        for (int i = 0; i < 10; i++)
+        {
+            assertEquals(50, execute("SELECT col_text FROM %s WHERE part_key_a = ? AND part_key_b = ?", i, Integer.toOctalString(i)).size());
+        }
+
+        metrics = CacheService.instance.keyCache.getMetrics();
+        hits = metrics.hits.getCount();
+        requests = metrics.requests.getCount();
+        assertEquals(10, hits);
+        assertEquals(10 + 10, requests);
+
+        // 100 queries - must get a hit in key-cache
+        for (int i = 0; i < 10; i++)
+        {
+            for (int c = 0; c < 10; c++)
+            {
+                assertRows(execute("SELECT col_text, col_long FROM %s WHERE part_key_a = ? AND part_key_b = ? and clust_key_a = ?", i, Integer.toOctalString(i), c),
+                           new Object[]{ String.valueOf(i) + '-' + String.valueOf(c), (long) c });
+            }
+        }
+
+        metrics = CacheService.instance.keyCache.getMetrics();
+        hits = metrics.hits.getCount();
+        requests = metrics.requests.getCount();
+        assertEquals(10 + 100, hits);
+        assertEquals(20 + 100, requests);
+
+        // 5000 queries - first 10 partitions already in key cache
+        for (int i = 0; i < 100; i++)
+        {
+            for (int c = 0; c < 50; c++)
+            {
+                assertRows(execute("SELECT col_text, col_long FROM %s WHERE part_key_a = ? AND part_key_b = ? and clust_key_a = ?", i, Integer.toOctalString(i), c),
+                           new Object[]{ String.valueOf(i) + '-' + String.valueOf(c), (long) c });
+            }
+        }
+
+        hits = metrics.hits.getCount();
+        requests = metrics.requests.getCount();
+        assertEquals(110 + 4910, hits);
+        assertEquals(120 + 5500, requests);
+    }
+
     // Inserts 100 partitions split over 10 sstables (flush after 10 partitions).
     // Clustered tables receive 50 CQL rows per partition.
     private void insertData(String table, String index, boolean withClustering) throws Throwable
     {
-        StorageService.instance.disableAutoCompaction(KEYSPACE, table);
-        Keyspace.open(KEYSPACE).getColumnFamilyStore(table).forceFlush().get();
-        Keyspace.open(KEYSPACE).getColumnFamilyStore(table).truncateBlocking();
+        prepareTable(table);
         if (index != null)
         {
-            StorageService.instance.disableAutoCompaction(KEYSPACE, table + '.' + index);
-            Keyspace.open(KEYSPACE).getColumnFamilyStore(table).indexManager.getIndexesByNames(ImmutableSet.of(table + "." + index)).iterator().next().forceBlockingFlush();
+            StorageService.instance.disableAutoCompaction(KEYSPACE_PER_TEST, table + '.' + index);
+            Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).indexManager.getIndexByName(index).getBlockingFlushTask().call();
         }
 
         for (int i = 0; i < 100; i++)
@@ -233,13 +480,20 @@
 
             if (i % 10 == 9)
             {
-                Keyspace.open(KEYSPACE).getColumnFamilyStore(table).forceFlush().get();
+                Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).forceFlush().get();
                 if (index != null)
-                    Keyspace.open(KEYSPACE).getColumnFamilyStore(table).indexManager.getIndexesByNames(ImmutableSet.of(table + "." + index)).iterator().next().forceBlockingFlush();
+                    Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).indexManager.getIndexByName(index).getBlockingFlushTask().call();
             }
         }
     }
 
+    private static void prepareTable(String table) throws IOException, InterruptedException, java.util.concurrent.ExecutionException
+    {
+        StorageService.instance.disableAutoCompaction(KEYSPACE_PER_TEST, table);
+        Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).forceFlush().get();
+        Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).truncateBlocking();
+    }
+
     private static List<String> makeList(String value)
     {
         List<String> list = new ArrayList<>(50);
@@ -252,11 +506,7 @@
 
     private static void clearCache()
     {
-        for (String name : ImmutableSet.copyOf(CassandraMetricsRegistry.Metrics.getMetrics().keySet()))
-        {
-            CassandraMetricsRegistry.Metrics.remove(name);
-        }
-
+        CassandraMetricsRegistry.Metrics.getNames().forEach(CassandraMetricsRegistry.Metrics::remove);
         CacheService.instance.keyCache.clear();
         CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
         Assert.assertEquals(0, metrics.entries.getValue().intValue());

diff --git a/test/unit/org/apache/cassandra/cql3/ListsTest.java b/test/unit/org/apache/cassandra/cql3/ListsTest.java
new file mode 100644
index 0000000..9ca0010
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ListsTest.java

@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
+
+import com.google.common.collect.Iterators;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Lists.PrecisionTime;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDGen;
+
+public class ListsTest extends CQLTester
+{
+    private static final long DEFAULT_MILLIS = 424242424242L;
+    private static final int DEFAULT_NANOS = PrecisionTime.MAX_NANOS;
+
+    @Test
+    public void testPrecisionTime_getNext_simple()
+    {
+        PrecisionTime.set(DEFAULT_MILLIS, DEFAULT_NANOS);
+
+        long millis = DEFAULT_MILLIS - 100;
+        int count = 1;
+        PrecisionTime next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(millis, next.millis);
+        Assert.assertEquals(DEFAULT_NANOS - count, next.nanos);
+
+        next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(millis, next.millis);
+        Assert.assertEquals(DEFAULT_NANOS - (count * 2), next.nanos);
+    }
+
+    @Test
+    public void testPrecisionTime_getNext_Mulitple()
+    {
+        PrecisionTime.set(DEFAULT_MILLIS, DEFAULT_NANOS);
+
+        long millis = DEFAULT_MILLIS - 100;
+        int count = DEFAULT_NANOS / 2;
+        PrecisionTime next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(millis, next.millis);
+        Assert.assertEquals(DEFAULT_NANOS - count, next.nanos);
+    }
+
+    @Test
+    public void testPrecisionTime_getNext_RollOverNanos()
+    {
+        final int remainingNanos = 0;
+        PrecisionTime.set(DEFAULT_MILLIS, remainingNanos);
+
+        long millis = DEFAULT_MILLIS;
+        int count = 1;
+        PrecisionTime next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(millis - 1, next.millis);
+        Assert.assertEquals(DEFAULT_NANOS - count, next.nanos);
+
+        next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(millis - 1, next.millis);
+        Assert.assertEquals(DEFAULT_NANOS - (count * 2), next.nanos);
+    }
+
+    @Test
+    public void testPrecisionTime_getNext_BorkedClock()
+    {
+        final int remainingNanos = 1;
+        PrecisionTime.set(DEFAULT_MILLIS, remainingNanos);
+
+        long millis = DEFAULT_MILLIS + 100;
+        int count = 1;
+        PrecisionTime next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(DEFAULT_MILLIS, next.millis);
+        Assert.assertEquals(remainingNanos - count, next.nanos);
+
+        // this should roll the clock
+        next = PrecisionTime.getNext(millis, count);
+        Assert.assertEquals(DEFAULT_MILLIS - 1, next.millis);
+        Assert.assertEquals(DEFAULT_NANOS - count, next.nanos);
+    }
+
+    @Test
+    public void testPrepender_SmallList()
+    {
+        List<ByteBuffer> terms = new ArrayList<>();
+        terms.add(ByteBufferUtil.bytes(1));
+        terms.add(ByteBufferUtil.bytes(2));
+        terms.add(ByteBufferUtil.bytes(3));
+        terms.add(ByteBufferUtil.bytes(4));
+        terms.add(ByteBufferUtil.bytes(5));
+        testPrepender_execute(terms);
+    }
+
+    @Test
+    public void testPrepender_HugeList()
+    {
+        List<ByteBuffer> terms = new ArrayList<>();
+        // create a large enough array, then remove some off the end, just to make it an odd size
+        for (int i = 0; i < PrecisionTime.MAX_NANOS * 4 - 287; i++)
+            terms.add(ByteBufferUtil.bytes(i));
+        testPrepender_execute(terms);
+    }
+
+    private void testPrepender_execute(List<ByteBuffer> terms)
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<text>)");
+        CFMetaData metaData = currentTableMetadata();
+
+        ColumnDefinition columnDefinition = metaData.getColumnDefinition(ByteBufferUtil.bytes("l"));
+        Term term = new Lists.Value(terms);
+        Lists.Prepender prepender = new Lists.Prepender(columnDefinition, term);
+
+        ByteBuffer keyBuf = ByteBufferUtil.bytes("key");
+        DecoratedKey key = Murmur3Partitioner.instance.decorateKey(keyBuf);
+        UpdateParameters parameters = new UpdateParameters(metaData, null, null, System.currentTimeMillis(), 1000, Collections.emptyMap());
+        Clustering clustering = new Clustering(ByteBufferUtil.bytes(1));
+        parameters.newRow(clustering);
+        prepender.execute(key, parameters);
+
+        Row row = parameters.buildRow();
+        Assert.assertEquals(terms.size(), Iterators.size(row.cells().iterator()));
+
+        int idx = 0;
+        UUID last = null;
+        for (Cell cell : row.cells())
+        {
+            UUID uuid = UUIDGen.getUUID(cell.path().get(0));
+
+            if (last != null)
+                Assert.assertTrue(last.compareTo(uuid) < 0);
+            last = uuid;
+
+            Assert.assertEquals(String.format("different values found: expected: '%d', found '%d'", ByteBufferUtil.toInt(terms.get(idx)), ByteBufferUtil.toInt(cell.value())),
+                                terms.get(idx), cell.value());
+            idx++;
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/NonNativeTimestampTest.java b/test/unit/org/apache/cassandra/cql3/NonNativeTimestampTest.java
index 80c5e3b3..37dc560 100644
--- a/test/unit/org/apache/cassandra/cql3/NonNativeTimestampTest.java
+++ b/test/unit/org/apache/cassandra/cql3/NonNativeTimestampTest.java

@@ -23,63 +23,31 @@
 import java.util.Arrays;
 import java.util.Collections;
 
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.service.EmbeddedCassandraService;
-import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
 import static junit.framework.Assert.assertEquals;
 import static junit.framework.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-public class NonNativeTimestampTest extends SchemaLoader
+public class NonNativeTimestampTest extends CQLTester
 {
-    @BeforeClass
-    public static void setup() throws Exception
-    {
-        Schema.instance.clear();
-        EmbeddedCassandraService cassandra = new EmbeddedCassandraService();
-        cassandra.start();
-    }
-
     @Test
-    public void setServerTimestampForNonCqlNativeStatements() throws RequestValidationException, RequestExecutionException
+    public void setServerTimestampForNonCqlNativeStatements() throws Throwable
     {
-        String createKsCQL = "CREATE KEYSPACE non_native_ts_test" +
-                             " WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };";
-        String createTableCQL = "CREATE TABLE non_native_ts_test.table_0 (k int PRIMARY KEY, v int)";
-        String insertCQL = "INSERT INTO non_native_ts_test.table_0 (k, v) values (1, ?)";
-        String selectCQL = "SELECT v, writetime(v) AS wt FROM non_native_ts_test.table_0 WHERE k = 1";
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v int)");
 
-        QueryProcessor.instance.process(createKsCQL,
-                                        QueryState.forInternalCalls(),
-                                        QueryOptions.forInternalCalls(Collections.<ByteBuffer>emptyList()));
-        QueryProcessor.instance.process(createTableCQL,
-                                        QueryState.forInternalCalls(),
-                                        QueryOptions.forInternalCalls(Collections.<ByteBuffer>emptyList()));
-        QueryProcessor.instance.process(insertCQL,
-                                        QueryState.forInternalCalls(),
-                                        QueryOptions.forInternalCalls(ConsistencyLevel.ONE,
-                                                                      Arrays.asList(ByteBufferUtil.bytes(2))));
-        UntypedResultSet.Row row = QueryProcessor.instance.executeInternal(selectCQL).one();
+        execute("INSERT INTO %s (k, v) values (1, ?)", 2);
+
+        UntypedResultSet.Row row = execute("SELECT v, writetime(v) AS wt FROM %s WHERE k = 1").one();
         assertEquals(2, row.getInt("v"));
         long timestamp1 = row.getLong("wt");
         assertFalse(timestamp1 == -1l);
 
         // per CASSANDRA-8246 the two updates will have the same (incorrect)
         // timestamp, so reconcilliation is by value and the "older" update wins
-        QueryProcessor.instance.process(insertCQL,
-                                        QueryState.forInternalCalls(),
-                                        QueryOptions.forInternalCalls(ConsistencyLevel.ONE,
-                                                                      Arrays.asList(ByteBufferUtil.bytes(1))));
-        row = QueryProcessor.executeInternal(selectCQL).one();
+        execute("INSERT INTO %s (k, v) values (1, ?)", 1);
+
+        row = execute("SELECT v, writetime(v) AS wt FROM %s WHERE k = 1").one();
         assertEquals(1, row.getInt("v"));
         assertTrue(row.getLong("wt") > timestamp1);
     }

diff --git a/test/unit/org/apache/cassandra/cql3/OutOfSpaceBase.java b/test/unit/org/apache/cassandra/cql3/OutOfSpaceBase.java
deleted file mode 100644
index eda2413..0000000
--- a/test/unit/org/apache/cassandra/cql3/OutOfSpaceBase.java
+++ /dev/null

@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import static junit.framework.Assert.fail;
-
-import java.io.IOError;
-import java.util.UUID;
-import java.util.concurrent.ExecutionException;
-
-import org.junit.After;
-import org.junit.Assert;
-
-import org.apache.cassandra.db.DisallowedDirectories;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Directories.DataDirectory;
-import org.apache.cassandra.db.commitlog.CommitLog;
-import org.apache.cassandra.db.commitlog.CommitLogSegment;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.FSWriteError;
-
-/**
- * Test that exceptions during flush are treated according to the disk failure policy.
- * We cannot recover after a failed flush due to postFlushExecutor being stuck, so each test needs to run separately.
- */
-public class OutOfSpaceBase extends CQLTester
-{
-    public void makeTable() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b));");
-
-        for (int i = 0; i < 10; i++)
-            execute("INSERT INTO %s (a, b, c) VALUES ('key', 'column" + i + "', null);");
-    }
-
-    public void markDirectoriesUnwriteable()
-    {
-        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
-        try
-        {
-            for ( ; ; )
-            {
-                DataDirectory dir = cfs.directories.getWriteableLocation(1);
-                DisallowedDirectories.maybeMarkUnwritable(cfs.directories.getLocationForDisk(dir));
-            }
-        }
-        catch (IOError e)
-        {
-            // Expected -- marked all directories as unwritable
-        }
-    }
-
-    public void flushAndExpectError() throws InterruptedException, ExecutionException
-    {
-        try
-        {
-            Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()).forceFlush().get();
-            fail("FSWriteError expected.");
-        }
-        catch (ExecutionException e)
-        {
-            // Correct path.
-            Assert.assertTrue(e.getCause() instanceof FSWriteError);
-        }
-
-        // Make sure commit log wasn't discarded.
-        UUID cfid = currentTableMetadata().cfId;
-        for (CommitLogSegment segment : CommitLog.instance.allocator.getActiveSegments())
-            if (segment.getDirtyCFIDs().contains(cfid))
-                return;
-        fail("Expected commit log to remain dirty for the affected table.");
-    }
-
-
-    @After
-    public void afterTest() throws Throwable
-    {
-        // Override CQLTester's afterTest method; clean-up will fail due to flush failing.
-    }
-}

diff --git a/test/unit/org/apache/cassandra/cql3/OutOfSpaceDieTest.java b/test/unit/org/apache/cassandra/cql3/OutOfSpaceDieTest.java
deleted file mode 100644
index 46d71e4..0000000
--- a/test/unit/org/apache/cassandra/cql3/OutOfSpaceDieTest.java
+++ /dev/null

@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import static junit.framework.Assert.fail;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import org.apache.cassandra.config.Config.DiskFailurePolicy;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.utils.JVMStabilityInspector;
-import org.apache.cassandra.utils.KillerForTests;
-
-/**
- * Test that exceptions during flush are treated according to the disk failure policy.
- */
-public class OutOfSpaceDieTest extends OutOfSpaceBase
-{
-    @Test
-    public void testFlushUnwriteableDie() throws Throwable
-    {
-        makeTable();
-        markDirectoriesUnwriteable();
-
-        KillerForTests killerForTests = new KillerForTests();
-        JVMStabilityInspector.Killer originalKiller = JVMStabilityInspector.replaceKiller(killerForTests);
-        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
-        try
-        {
-            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.die);
-            flushAndExpectError();
-            Assert.assertTrue(killerForTests.wasKilled());
-            Assert.assertFalse(killerForTests.wasKilledQuietly()); //only killed quietly on startup failure
-        }
-        finally
-        {
-            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
-            JVMStabilityInspector.replaceKiller(originalKiller);
-        }
-
-        makeTable();
-        try
-        {
-            flush();
-            fail("Subsequent flushes expected to fail.");
-        }
-        catch (RuntimeException e)
-        {
-            // correct path
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/cql3/OutOfSpaceIgnoreTest.java b/test/unit/org/apache/cassandra/cql3/OutOfSpaceIgnoreTest.java
deleted file mode 100644
index 854de80..0000000
--- a/test/unit/org/apache/cassandra/cql3/OutOfSpaceIgnoreTest.java
+++ /dev/null

@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import static junit.framework.Assert.fail;
-
-import org.junit.Test;
-
-import org.apache.cassandra.config.Config.DiskFailurePolicy;
-import org.apache.cassandra.config.DatabaseDescriptor;
-
-/**
- * Test that exceptions during flush are treated according to the disk failure policy.
- */
-public class OutOfSpaceIgnoreTest extends OutOfSpaceBase
-{
-    @Test
-    public void testFlushUnwriteableIgnore() throws Throwable
-    {
-        makeTable();
-        markDirectoriesUnwriteable();
-
-        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
-        try
-        {
-            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.ignore);
-            flushAndExpectError();
-        }
-        finally
-        {
-            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
-        }
-
-        makeTable();
-        try
-        {
-            flush();
-            fail("Subsequent flushes expected to fail.");
-        }
-        catch (RuntimeException e)
-        {
-            // correct path
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/cql3/OutOfSpaceStopTest.java b/test/unit/org/apache/cassandra/cql3/OutOfSpaceStopTest.java
deleted file mode 100644
index b48df56..0000000
--- a/test/unit/org/apache/cassandra/cql3/OutOfSpaceStopTest.java
+++ /dev/null

@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.cql3;
-
-import static junit.framework.Assert.fail;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import org.apache.cassandra.config.Config.DiskFailurePolicy;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.gms.Gossiper;
-
-/**
- * Test that exceptions during flush are treated according to the disk failure policy.
- */
-public class OutOfSpaceStopTest extends OutOfSpaceBase
-{
-    @Test
-    public void testFlushUnwriteableStop() throws Throwable
-    {
-        makeTable();
-        markDirectoriesUnwriteable();
-
-        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
-        try
-        {
-            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.stop);
-            flushAndExpectError();
-            Assert.assertFalse(Gossiper.instance.isEnabled());
-        }
-        finally
-        {
-            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
-        }
-
-        makeTable();
-        try
-        {
-            flush();
-            fail("Subsequent flushes expected to fail.");
-        }
-        catch (RuntimeException e)
-        {
-            // correct path
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java b/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java
new file mode 100644
index 0000000..26e7fe2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java

@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import static junit.framework.Assert.fail;
+
+import java.io.Closeable;
+import java.util.UUID;
+import java.util.concurrent.ExecutionException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.Config.DiskFailurePolicy;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.commitlog.CommitLogSegment;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.KillerForTests;
+
+/**
+ * Test that TombstoneOverwhelmingException gets thrown when it should be and doesn't when it shouldn't be.
+ */
+public class OutOfSpaceTest extends CQLTester
+{
+    @Test
+    public void testFlushUnwriteableDie() throws Throwable
+    {
+        makeTable();
+
+        KillerForTests killerForTests = new KillerForTests();
+        JVMStabilityInspector.Killer originalKiller = JVMStabilityInspector.replaceKiller(killerForTests);
+        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
+        try (Closeable c = Util.markDirectoriesUnwriteable(getCurrentColumnFamilyStore()))
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.die);
+            flushAndExpectError();
+            Assert.assertTrue(killerForTests.wasKilled());
+            Assert.assertFalse(killerForTests.wasKilledQuietly()); //only killed quietly on startup failure
+        }
+        finally
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
+            JVMStabilityInspector.replaceKiller(originalKiller);
+        }
+    }
+
+    @Test
+    public void testFlushUnwriteableStop() throws Throwable
+    {
+        makeTable();
+
+        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
+        try (Closeable c = Util.markDirectoriesUnwriteable(getCurrentColumnFamilyStore()))
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.stop);
+            flushAndExpectError();
+            Assert.assertFalse(Gossiper.instance.isEnabled());
+        }
+        finally
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
+        }
+    }
+
+    @Test
+    public void testFlushUnwriteableIgnore() throws Throwable
+    {
+        makeTable();
+
+        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
+        try (Closeable c = Util.markDirectoriesUnwriteable(getCurrentColumnFamilyStore()))
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.ignore);
+            flushAndExpectError();
+        }
+        finally
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
+        }
+
+        // Next flush should succeed.
+        flush();
+    }
+
+    public void makeTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b text, c text, PRIMARY KEY (a, b));");
+
+        // insert exactly the amount of tombstones that shouldn't trigger an exception
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s (a, b, c) VALUES ('key', 'column" + i + "', null);");
+    }
+
+    public void flushAndExpectError() throws InterruptedException, ExecutionException
+    {
+        try
+        {
+            Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()).forceFlush().get();
+            fail("FSWriteError expected.");
+        }
+        catch (ExecutionException e)
+        {
+            // Correct path.
+            Assert.assertTrue(e.getCause() instanceof FSWriteError);
+        }
+
+        // Make sure commit log wasn't discarded.
+        UUID cfid = currentTableMetadata().cfId;
+        for (CommitLogSegment segment : CommitLog.instance.allocator.getActiveSegments())
+            if (segment.getDirtyCFIDs().contains(cfid))
+                return;
+        fail("Expected commit log to remain dirty for the affected table.");
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/PagingTest.java b/test/unit/org/apache/cassandra/cql3/PagingTest.java
index 9c7041b..ea1eb43 100644
--- a/test/unit/org/apache/cassandra/cql3/PagingTest.java
+++ b/test/unit/org/apache/cassandra/cql3/PagingTest.java

@@ -59,7 +59,7 @@
     @BeforeClass
     public static void setup() throws Exception
     {
-        DatabaseDescriptor.setPartitioner(new Murmur3Partitioner());
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
         EmbeddedCassandraService cassandra = new EmbeddedCassandraService();
         cassandra.start();
 

diff --git a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
index b5a28df..e01b812 100644
--- a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java

@@ -24,12 +24,15 @@
 import com.datastax.driver.core.Cluster;
 import com.datastax.driver.core.PreparedStatement;
 import com.datastax.driver.core.Session;
+import com.datastax.driver.core.exceptions.SyntaxError;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.index.StubIndex;
 import org.apache.cassandra.service.EmbeddedCassandraService;
 
-import static junit.framework.Assert.assertEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
 public class PreparedStatementsTest extends SchemaLoader
 {
@@ -127,4 +130,37 @@
 
         assertEquals(1, session.execute(preparedSelect.bind(1)).all().size());
     }
+
+    @Test
+    public void prepareAndExecuteWithCustomExpressions() throws Throwable
+    {
+        session.execute(dropKsStatement);
+        session.execute(createKsStatement);
+        String table = "custom_expr_test";
+        String index = "custom_index";
+
+        session.execute(String.format("CREATE TABLE IF NOT EXISTS %s.%s (id int PRIMARY KEY, cid int, val text);",
+                                      KEYSPACE, table));
+        session.execute(String.format("CREATE CUSTOM INDEX %s ON %s.%s(val) USING '%s'",
+                                      index, KEYSPACE, table, StubIndex.class.getName()));
+        session.execute(String.format("INSERT INTO %s.%s(id, cid, val) VALUES (0, 0, 'test')", KEYSPACE, table));
+
+        PreparedStatement prepared1 = session.prepare(String.format("SELECT * FROM %s.%s WHERE expr(%s, 'foo')",
+                                                                    KEYSPACE, table, index));
+        assertEquals(1, session.execute(prepared1.bind()).all().size());
+
+        PreparedStatement prepared2 = session.prepare(String.format("SELECT * FROM %s.%s WHERE expr(%s, ?)",
+                                                                    KEYSPACE, table, index));
+        assertEquals(1, session.execute(prepared2.bind("foo bar baz")).all().size());
+
+        try
+        {
+            session.prepare(String.format("SELECT * FROM %s.%s WHERE expr(?, 'foo bar baz')", KEYSPACE, table));
+            fail("Expected syntax exception, but none was thrown");
+        }
+        catch(SyntaxError e)
+        {
+            assertEquals("Bind variables cannot be used for index names", e.getMessage());
+        }
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java b/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
new file mode 100644
index 0000000..01a2afd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java

@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.Random;
+
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class QueryWithIndexedSSTableTest extends CQLTester
+{
+    @Test
+    public void queryIndexedSSTableTest() throws Throwable
+    {
+        // That test reproduces the bug from CASSANDRA-10903 and the fact we have a static column is
+        // relevant to that reproduction in particular as it forces a slightly different code path that
+        // if there wasn't a static.
+
+        int ROWS = 1000;
+        int VALUE_LENGTH = 100;
+
+        createTable("CREATE TABLE %s (k int, t int, s text static, v text, PRIMARY KEY (k, t))");
+
+        // We create a partition that is big enough that the underlying sstable will be indexed
+        // For that, we use a large-ish number of row, and a value that isn't too small.
+        String text = TombstonesWithIndexedSSTableTest.makeRandomString(VALUE_LENGTH);
+        for (int i = 0; i < ROWS; i++)
+            execute("INSERT INTO %s(k, t, v) VALUES (?, ?, ?)", 0, i, text + i);
+
+        flush();
+        compact();
+
+        // Sanity check that we're testing what we want to test, that is that we're reading from an indexed
+        // sstable. Note that we'll almost surely have a single indexed sstable in practice, but it's theorically
+        // possible for a compact strategy to yield more than that and as long as one is indexed we're pretty
+        // much testing what we want. If this check ever fails on some specific setting, we'll have to either
+        // tweak ROWS and VALUE_LENGTH, or skip the test on those settings.
+        DecoratedKey dk = Util.dk(ByteBufferUtil.bytes(0));
+        boolean hasIndexed = false;
+        for (SSTableReader sstable : getCurrentColumnFamilyStore().getLiveSSTables())
+        {
+            RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
+            hasIndexed |= indexEntry != null && indexEntry.isIndexed();
+        }
+        assert hasIndexed;
+
+        assertRowCount(execute("SELECT s FROM %s WHERE k = ?", 0), ROWS);
+        assertRowCount(execute("SELECT s FROM %s WHERE k = ? ORDER BY t DESC", 0), ROWS);
+
+        assertRowCount(execute("SELECT DISTINCT s FROM %s WHERE k = ?", 0), 1);
+        assertRowCount(execute("SELECT DISTINCT s FROM %s WHERE k = ? ORDER BY t DESC", 0), 1);
+    }
+
+    // Creates a random string 
+    public static String makeRandomSt(int length)
+    {
+        Random random = new Random();
+        char[] chars = new char[26];
+        int i = 0;
+        for (char c = 'a'; c <= 'z'; c++)
+            chars[i++] = c;
+        return new String(chars);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/RangeTombstoneMergeTest.java b/test/unit/org/apache/cassandra/cql3/RangeTombstoneMergeTest.java
deleted file mode 100644
index 71634e9..0000000
--- a/test/unit/org/apache/cassandra/cql3/RangeTombstoneMergeTest.java
+++ /dev/null

@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.cql3;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import com.google.common.collect.Iterables;
-
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.cassandra.Util;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-
-public class RangeTombstoneMergeTest extends CQLTester
-{
-    @Before
-    public void before() throws Throwable
-    {
-        createTable("CREATE TABLE %s(" +
-                "  key text," +
-                "  column text," +
-                "  data text," +
-                "  extra text," +
-                "  PRIMARY KEY(key, column)" +
-                ");");
-
-        // If the sstable only contains tombstones during compaction it seems that the sstable either gets removed or isn't created (but that could probably be a separate JIRA issue).
-        execute("INSERT INTO %s (key, column, data) VALUES (?, ?, ?)", "1", "1", "1");
-    }
-
-    @Test
-    public void testEqualMerge() throws Throwable
-    {
-        addRemoveAndFlush();
-
-        for (int i=0; i<3; ++i)
-        {
-            addRemoveAndFlush();
-            compact();
-        }
-
-        assertOneTombstone();
-    }
-
-    @Test
-    public void testRangeMerge() throws Throwable
-    {
-        addRemoveAndFlush();
-
-        execute("INSERT INTO %s (key, column, data, extra) VALUES (?, ?, ?, ?)", "1", "2", "2", "2");
-        execute("DELETE extra FROM %s WHERE key=? AND column=?", "1", "2");
-
-        flush();
-        compact();
-
-        execute("DELETE FROM %s WHERE key=? AND column=?", "1", "2");
-
-        flush();
-        compact();
-
-        assertOneTombstone();
-    }
-
-    void assertOneTombstone() throws Throwable
-    {
-        assertRows(execute("SELECT column FROM %s"),
-                   row("1"));
-        assertAllRows(row("1", "1", "1", null));
-
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
-        ColumnFamily cf = cfs.getColumnFamily(Util.dk("1"), Composites.EMPTY, Composites.EMPTY, false, 100, System.currentTimeMillis());
-        assertTrue(cf.deletionInfo().hasRanges());
-        assertEquals(1, cf.deletionInfo().rangeCount());    // Ranges merged during CF construction
-
-        assertEquals(1, cfs.getSSTables().size());
-        SSTableReader reader = Iterables.get(cfs.getSSTables(), 0);
-        assertEquals(1, countTombstones(reader));           // See CASSANDRA-7953.
-    }
-
-    void addRemoveAndFlush() throws Throwable
-    {
-        execute("INSERT INTO %s (key, column, data) VALUES (?, ?, ?)", "1", "2", "2");
-        execute("DELETE FROM %s WHERE key=? AND column=?", "1", "2");
-        flush();
-    }
-
-    int countTombstones(SSTableReader reader)
-    {
-        int tombstones = 0;
-        ISSTableScanner partitions = reader.getScanner();
-        while (partitions.hasNext())
-        {
-            OnDiskAtomIterator iter = partitions.next();
-            while (iter.hasNext())
-            {
-                OnDiskAtom atom = iter.next();
-                if (atom instanceof RangeTombstone)
-                    ++tombstones;
-            }
-        }
-        return tombstones;
-    }
-}

diff --git a/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java b/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java
new file mode 100644
index 0000000..aaf9824
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java

@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.exceptions.SyntaxException;
+
+public class ReservedKeywordsTest
+{
+    @Test
+    public void testReservedWordsForColumns() throws Exception
+    {
+        for (String reservedWord : ReservedKeywords.reservedKeywords)
+        {
+            try
+            {
+                QueryProcessor.parseStatement(String.format("ALTER TABLE ks.t ADD %s TEXT", reservedWord));
+                Assert.fail(String.format("Reserved keyword %s should not have parsed", reservedWord));
+            }
+            catch (SyntaxException ignore)
+            {
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/SerializationMirrorTest.java b/test/unit/org/apache/cassandra/cql3/SerializationMirrorTest.java
new file mode 100644
index 0000000..49f77a7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/SerializationMirrorTest.java

@@ -0,0 +1,63 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.cql3;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class SerializationMirrorTest extends CQLTester
+{
+
+    @Test
+    public void testManyClusterings() throws Throwable
+    {
+        StringBuilder table = new StringBuilder("CREATE TABLE %s (a TEXT");
+        StringBuilder cols = new StringBuilder();
+        StringBuilder args = new StringBuilder("?");
+        List<Object> vals = new ArrayList<>();
+        vals.add("a");
+        for (int i = 0 ; i < 40 ; i++)
+        {
+            table.append(", c").append(i).append(" text");
+            cols.append(", c").append(i);
+            if (ThreadLocalRandom.current().nextBoolean())
+                vals.add(Integer.toString(i));
+            else
+                vals.add("");
+            args.append(",?");
+        }
+        args.append(",?");
+        vals.add("value");
+        table.append(", v text, PRIMARY KEY ((a)").append(cols).append("))");
+        createTable(table.toString());
+
+        execute("INSERT INTO %s (a" + cols + ", v) VALUES (" + args+ ")", vals.toArray());
+        flush();
+        UntypedResultSet.Row row = execute("SELECT * FROM %s").one();
+        for (int i = 0 ; i < row.getColumns().size() ; i++)
+            Assert.assertEquals(vals.get(i), row.getString(i == 0 ? "a" : i < 41 ? "c" + (i - 1) : "v"));
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java b/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java
new file mode 100644
index 0000000..f32bcc6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java

@@ -0,0 +1,549 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+public class SimpleQueryTest extends CQLTester
+{
+    @Test
+    public void testStaticCompactTables() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text PRIMARY KEY, v1 int, v2 text) WITH COMPACT STORAGE");
+
+        execute("INSERT INTO %s (k, v1, v2) values (?, ?, ?)", "first", 1, "value1");
+        execute("INSERT INTO %s (k, v1, v2) values (?, ?, ?)", "second", 2, "value2");
+        execute("INSERT INTO %s (k, v1, v2) values (?, ?, ?)", "third", 3, "value3");
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ?", "first"),
+            row("first", 1, "value1")
+        );
+
+        assertRows(execute("SELECT v2 FROM %s WHERE k = ?", "second"),
+            row("value2")
+        );
+
+        // Murmur3 order
+        assertRows(execute("SELECT * FROM %s"),
+            row("third",  3, "value3"),
+            row("second", 2, "value2"),
+            row("first",  1, "value1")
+        );
+    }
+
+    @Test
+    public void testDynamicCompactTables() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v text, PRIMARY KEY (k, t));");
+
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key", 1, "v11");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key", 2, "v12");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key", 3, "v13");
+
+        flush();
+
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key", 4, "v14");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key", 5, "v15");
+
+        assertRows(execute("SELECT * FROM %s"),
+            row("key",  1, "v11"),
+            row("key",  2, "v12"),
+            row("key",  3, "v13"),
+            row("key",  4, "v14"),
+            row("key",  5, "v15")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t > ?", "key", 3),
+            row("key",  4, "v14"),
+            row("key",  5, "v15")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t >= ? AND t < ?", "key", 2, 4),
+            row("key",  2, "v12"),
+            row("key",  3, "v13")
+        );
+
+        // Reversed queries
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? ORDER BY t DESC", "key"),
+            row("key",  5, "v15"),
+            row("key",  4, "v14"),
+            row("key",  3, "v13"),
+            row("key",  2, "v12"),
+            row("key",  1, "v11")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t > ? ORDER BY t DESC", "key", 3),
+            row("key",  5, "v15"),
+            row("key",  4, "v14")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t >= ? AND t < ? ORDER BY t DESC", "key", 2, 4),
+            row("key",  3, "v13"),
+            row("key",  2, "v12")
+        );
+    }
+
+    @Test
+    public void testTableWithoutClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text PRIMARY KEY, v1 int, v2 text);");
+
+        execute("INSERT INTO %s (k, v1, v2) values (?, ?, ?)", "first", 1, "value1");
+        execute("INSERT INTO %s (k, v1, v2) values (?, ?, ?)", "second", 2, "value2");
+        execute("INSERT INTO %s (k, v1, v2) values (?, ?, ?)", "third", 3, "value3");
+
+        flush();
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ?", "first"),
+            row("first", 1, "value1")
+        );
+
+        assertRows(execute("SELECT v2 FROM %s WHERE k = ?", "second"),
+            row("value2")
+        );
+
+        assertRows(execute("SELECT * FROM %s"),
+            row("third",  3, "value3"),
+            row("second", 2, "value2"),
+            row("first",  1, "value1")
+        );
+    }
+
+    @Test
+    public void testTableWithOneClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v1 text, v2 text, PRIMARY KEY (k, t));");
+
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 1, "v11", "v21");
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 2, "v12", "v22");
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 3, "v13", "v23");
+
+        flush();
+
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 4, "v14", "v24");
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 5, "v15", "v25");
+
+        assertRows(execute("SELECT * FROM %s"),
+            row("key",  1, "v11", "v21"),
+            row("key",  2, "v12", "v22"),
+            row("key",  3, "v13", "v23"),
+            row("key",  4, "v14", "v24"),
+            row("key",  5, "v15", "v25")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t > ?", "key", 3),
+            row("key",  4, "v14", "v24"),
+            row("key",  5, "v15", "v25")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t >= ? AND t < ?", "key", 2, 4),
+            row("key",  2, "v12", "v22"),
+            row("key",  3, "v13", "v23")
+        );
+
+        // Reversed queries
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? ORDER BY t DESC", "key"),
+            row("key",  5, "v15", "v25"),
+            row("key",  4, "v14", "v24"),
+            row("key",  3, "v13", "v23"),
+            row("key",  2, "v12", "v22"),
+            row("key",  1, "v11", "v21")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t > ? ORDER BY t DESC", "key", 3),
+            row("key",  5, "v15", "v25"),
+            row("key",  4, "v14", "v24")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t >= ? AND t < ? ORDER BY t DESC", "key", 2, 4),
+            row("key",  3, "v13", "v23"),
+            row("key",  2, "v12", "v22")
+        );
+    }
+
+    @Test
+    public void testTableWithReverseClusteringOrder() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v1 text, v2 text, PRIMARY KEY (k, t)) WITH CLUSTERING ORDER BY (t DESC);");
+
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 1, "v11", "v21");
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 2, "v12", "v22");
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 3, "v13", "v23");
+
+        flush();
+
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 4, "v14", "v24");
+        execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", 5, "v15", "v25");
+
+        assertRows(execute("SELECT * FROM %s"),
+            row("key",  5, "v15", "v25"),
+            row("key",  4, "v14", "v24"),
+            row("key",  3, "v13", "v23"),
+            row("key",  2, "v12", "v22"),
+            row("key",  1, "v11", "v21")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? ORDER BY t ASC", "key"),
+            row("key",  1, "v11", "v21"),
+            row("key",  2, "v12", "v22"),
+            row("key",  3, "v13", "v23"),
+            row("key",  4, "v14", "v24"),
+            row("key",  5, "v15", "v25")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t > ?", "key", 3),
+            row("key",  5, "v15", "v25"),
+            row("key",  4, "v14", "v24")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t >= ? AND t < ?", "key", 2, 4),
+            row("key",  3, "v13", "v23"),
+            row("key",  2, "v12", "v22")
+        );
+
+        // Reversed queries
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? ORDER BY t DESC", "key"),
+            row("key",  5, "v15", "v25"),
+            row("key",  4, "v14", "v24"),
+            row("key",  3, "v13", "v23"),
+            row("key",  2, "v12", "v22"),
+            row("key",  1, "v11", "v21")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t > ? ORDER BY t DESC", "key", 3),
+            row("key",  5, "v15", "v25"),
+            row("key",  4, "v14", "v24")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t >= ? AND t < ? ORDER BY t DESC", "key", 2, 4),
+            row("key",  3, "v13", "v23"),
+            row("key",  2, "v12", "v22")
+        );
+    }
+
+    @Test
+    public void testTableWithTwoClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t1 text, t2 int, v text, PRIMARY KEY (k, t1, t2));");
+
+        execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", "v1", 1, "v1");
+        execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", "v1", 2, "v2");
+        execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", "v2", 1, "v3");
+        execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", "v2", 2, "v4");
+        execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", "v2", 3, "v5");
+        flush();
+
+        assertRows(execute("SELECT * FROM %s"),
+            row("key",  "v1", 1, "v1"),
+            row("key",  "v1", 2, "v2"),
+            row("key",  "v2", 1, "v3"),
+            row("key",  "v2", 2, "v4"),
+            row("key",  "v2", 3, "v5")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t1 >= ?", "key", "v2"),
+            row("key",  "v2", 1, "v3"),
+            row("key",  "v2", 2, "v4"),
+            row("key",  "v2", 3, "v5")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE k = ? AND t1 >= ? ORDER BY t1 DESC", "key", "v2"),
+            row("key",  "v2", 3, "v5"),
+            row("key",  "v2", 2, "v4"),
+            row("key",  "v2", 1, "v3")
+        );
+    }
+
+    @Test
+    public void testTableWithLargePartition() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t1 int, t2 int, v text, PRIMARY KEY (k, t1, t2));");
+
+        for (int t1 = 0; t1 < 20; t1++)
+            for (int t2 = 0; t2 < 10; t2++)
+                execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", t1, t2, "someSemiLargeTextForValue_" + t1 + "_" + t2);
+
+        flush();
+
+        Object[][] expected = new Object[10][];
+        for (int t2 = 0; t2 < 10; t2++)
+            expected[t2] = row("key", 15, t2);
+
+        assertRows(execute("SELECT k, t1, t2 FROM %s WHERE k=? AND t1=?", "key", 15), expected);
+
+        Object[][] expectedReverse = new Object[10][];
+        for (int t2 = 9; t2 >= 0; t2--)
+            expectedReverse[9 - t2] = row("key", 15, t2);
+
+        assertRows(execute("SELECT k, t1, t2 FROM %s WHERE k=? AND t1=? ORDER BY t1 DESC, t2 DESC", "key", 15), expectedReverse);
+    }
+
+    @Test
+    public void testRowDeletion() throws Throwable
+    {
+        int N = 4;
+
+        createTable("CREATE TABLE %s (k text, t int, v1 text, v2 int, PRIMARY KEY (k, t));");
+
+        for (int t = 0; t < N; t++)
+            execute("INSERT INTO %s (k, t, v1, v2) values (?, ?, ?, ?)", "key", t, "v" + t, t + 10);
+
+        flush();
+
+        for (int i = 0; i < N / 2; i++)
+            execute("DELETE FROM %s WHERE k=? AND t=?", "key", i * 2);
+
+        Object[][] expected = new Object[N/2][];
+        for (int i = 0; i < N / 2; i++)
+        {
+            int t = i * 2 + 1;
+            expected[i] = row("key", t, "v" + t, t + 10);
+        }
+
+        assertRows(execute("SELECT * FROM %s"), expected);
+    }
+
+    @Test
+    public void testRangeTombstones() throws Throwable
+    {
+        int N = 100;
+
+        createTable("CREATE TABLE %s (k text, t1 int, t2 int, v text, PRIMARY KEY (k, t1, t2));");
+
+        for (int t1 = 0; t1 < 3; t1++)
+            for (int t2 = 0; t2 < N; t2++)
+                execute("INSERT INTO %s (k, t1, t2, v) values (?, ?, ?, ?)", "key", t1, t2, "someSemiLargeTextForValue_" + t1 + "_" + t2);
+
+        flush();
+
+        execute("DELETE FROM %s WHERE k=? AND t1=?", "key", 1);
+
+        flush();
+
+        Object[][] expected = new Object[2*N][];
+        for (int t2 = 0; t2 < N; t2++)
+        {
+            expected[t2] = row("key", 0, t2, "someSemiLargeTextForValue_0_" + t2);
+            expected[N + t2] = row("key", 2, t2, "someSemiLargeTextForValue_2_" + t2);
+        }
+
+        assertRows(execute("SELECT * FROM %s"), expected);
+    }
+
+    @Test
+    public void test2ndaryIndexes() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v text, PRIMARY KEY (k, t));");
+
+        execute("CREATE INDEX ON %s(v)");
+
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key1", 1, "foo");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key1", 2, "bar");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key2", 1, "foo");
+
+        flush();
+
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key2", 2, "foo");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key2", 3, "bar");
+
+        assertRows(execute("SELECT * FROM %s WHERE v = ?", "foo"),
+            row("key1",  1, "foo"),
+            row("key2",  1, "foo"),
+            row("key2",  2, "foo")
+        );
+
+        assertRows(execute("SELECT * FROM %s WHERE v = ?", "bar"),
+            row("key1",  2, "bar"),
+            row("key2",  3, "bar")
+        );
+    }
+
+    @Test
+    public void testStaticColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, s text static, v text, PRIMARY KEY (k, t));");
+
+        execute("INSERT INTO %s (k, t, v, s) values (?, ?, ?, ?)", "key1", 1, "foo1", "st1");
+        execute("INSERT INTO %s (k, t, v, s) values (?, ?, ?, ?)", "key1", 2, "foo2", "st2");
+
+        flush();
+
+        execute("INSERT INTO %s (k, t, v, s) values (?, ?, ?, ?)", "key1", 3, "foo3", "st3");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key1", 4, "foo4");
+
+        assertRows(execute("SELECT * FROM %s"),
+            row("key1",  1, "st3", "foo1"),
+            row("key1",  2, "st3", "foo2"),
+            row("key1",  3, "st3", "foo3"),
+            row("key1",  4, "st3", "foo4")
+        );
+
+        assertRows(execute("SELECT s FROM %s WHERE k = ?", "key1"),
+            row("st3"),
+            row("st3"),
+            row("st3"),
+            row("st3")
+        );
+
+        assertRows(execute("SELECT DISTINCT s FROM %s WHERE k = ?", "key1"),
+            row("st3")
+        );
+    }
+
+    @Test
+    public void testDistinct() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v text, PRIMARY KEY (k, t));");
+
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key1", 1, "foo1");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key1", 2, "foo2");
+
+        flush();
+
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key1", 3, "foo3");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key2", 4, "foo4");
+        execute("INSERT INTO %s (k, t, v) values (?, ?, ?)", "key2", 5, "foo5");
+
+        assertRows(execute("SELECT DISTINCT k FROM %s"),
+            row("key1"),
+            row("key2")
+        );
+    }
+
+    @Test
+    public void collectionDeletionTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, s set<int>);");
+
+        execute("INSERT INTO %s (k, s) VALUES (?, ?)", 1, set(1));
+
+        flush();
+
+        execute("INSERT INTO %s (k, s) VALUES (?, ?)", 1, set(2));
+
+        assertRows(execute("SELECT s FROM %s WHERE k = ?", 1),
+            row(set(2))
+        );
+    }
+
+    @Test
+    public void limitWithMultigetTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v int);");
+
+        execute("INSERT INTO %s (k, v) VALUES (?, ?)", 0, 0);
+        execute("INSERT INTO %s (k, v) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (k, v) VALUES (?, ?)", 2, 2);
+        execute("INSERT INTO %s (k, v) VALUES (?, ?)", 3, 3);
+
+        assertRows(execute("SELECT v FROM %s WHERE k IN ? LIMIT ?", list(0, 1, 2, 3), 2),
+            row(0),
+            row(1)
+        );
+    }
+
+    @Test
+    public void staticDistinctTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s ( k int, p int, s int static, PRIMARY KEY (k, p))");
+
+        execute("INSERT INTO %s (k, p) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (k, p) VALUES (?, ?)", 1, 2);
+
+        assertRows(execute("SELECT k, s FROM %s"),
+            row(1, null),
+            row(1, null)
+        );
+        assertRows(execute("SELECT DISTINCT k, s FROM %s"),
+            row(1, null)
+        );
+        assertRows(execute("SELECT DISTINCT s FROM %s WHERE k=?", 1),
+            row((Object)null)
+        );
+        assertEmpty(execute("SELECT DISTINCT s FROM %s WHERE k=?", 2));
+    }
+
+    @Test
+    public void testCompactStorageUpdateWithNull() throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                "clustering_1 int," +
+                "value int," +
+                " PRIMARY KEY (partitionKey, clustering_1)) WITH COMPACT STORAGE");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 1, 1)");
+
+        flush();
+
+        execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ?", null, 0, 0);
+
+        assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1) IN ((?), (?))", 0, 0, 1),
+            row(0, 1, 1)
+        );
+    }
+
+    @Test
+    public void test2ndaryIndexBug() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 int, c2 int, v int, PRIMARY KEY(k, c1, c2))");
+
+        execute("CREATE INDEX v_idx ON %s(v)");
+
+        execute("INSERT INTO %s (k, c1, c2, v) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        execute("INSERT INTO %s (k, c1, c2, v) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+
+        assertRows(execute("SELECT * FROM %s WHERE v=?", 0),
+            row(0, 0, 0, 0),
+            row(0, 1, 0, 0)
+        );
+
+        flush();
+
+        execute("DELETE FROM %s WHERE k=? AND c1=?", 0, 1);
+
+        flush();
+
+        assertRows(execute("SELECT * FROM %s WHERE v=?", 0),
+            row(0, 0, 0, 0)
+        );
+    }
+
+    @Test
+    public void testSStableTimestampOrdering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, v1 int, v2 int, PRIMARY KEY (k1))");
+        disableCompaction();
+
+        // sstable1
+        execute("INSERT INTO %s(k1,v1,v2) VALUES(1,1,1)  USING TIMESTAMP 5");
+        flush();
+
+        // sstable2
+        execute("INSERT INTO %s(k1,v1,v2) VALUES(1,1,2)  USING TIMESTAMP 8");
+        flush();
+
+        execute("INSERT INTO %s(k1) VALUES(1)  USING TIMESTAMP 7");
+        execute("DELETE FROM %s USING TIMESTAMP 6 WHERE k1 = 1");
+
+        assertRows(execute("SELECT * FROM %s WHERE k1=1"), row(1, 1, 2));
+    } 
+}

diff --git a/test/unit/org/apache/cassandra/cql3/ThriftCompatibilityTest.java b/test/unit/org/apache/cassandra/cql3/ThriftCompatibilityTest.java
index 7b72ef8..ff2af56 100644
--- a/test/unit/org/apache/cassandra/cql3/ThriftCompatibilityTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ThriftCompatibilityTest.java

@@ -17,40 +17,49 @@
  */
 package org.apache.cassandra.cql3;
 
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.junit.BeforeClass;
+import java.util.Arrays;
+import java.util.Collections;
+
 import org.junit.Test;
 
+import com.sun.org.apache.xerces.internal.impl.xs.models.CMNodeFactory;
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.thrift.CfDef;
+import org.apache.cassandra.thrift.ColumnDef;
+import org.apache.cassandra.thrift.ThriftConversion;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
 public class ThriftCompatibilityTest extends SchemaLoader
 {
-    @BeforeClass
-    public static void defineSchema() throws Exception
-    {
-        // The before class annotation of SchemaLoader will prepare the service so no need to do it here
-        SchemaLoader.createKeyspace("thriftcompat",
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    jdbcSparseCFMD("thriftcompat", "JdbcInteger", Int32Type.instance)
-                                            .addColumnDefinition(integerColumn("thriftcompat", "JdbcInteger")));
-    }
-
-    private static UntypedResultSet execute(String query)
-    {
-        return QueryProcessor.executeInternal(String.format(query));
-    }
-
-    /** Test For CASSANDRA-8178 */
-    @Test
+    @Test // test for CASSANDRA-8178
     public void testNonTextComparator() throws Throwable
     {
+        ColumnDef column = new ColumnDef();
+        column.setName(bytes(42))
+              .setValidation_class(UTF8Type.instance.toString());
+
+        CfDef cf = new CfDef("thriftcompat", "JdbcInteger");
+        cf.setColumn_type("Standard")
+          .setComparator_type(Int32Type.instance.toString())
+          .setDefault_validation_class(UTF8Type.instance.toString())
+          .setKey_validation_class(BytesType.instance.toString())
+          .setColumn_metadata(Collections.singletonList(column));
+
+        SchemaLoader.createKeyspace("thriftcompat", KeyspaceParams.simple(1), ThriftConversion.fromThrift(cf));
+
         // the comparator is IntegerType, and there is a column named 42 with a UTF8Type validation type
         execute("INSERT INTO \"thriftcompat\".\"JdbcInteger\" (key, \"42\") VALUES (0x00000001, 'abc')");
         execute("UPDATE \"thriftcompat\".\"JdbcInteger\" SET \"42\" = 'abc' WHERE key = 0x00000001");
@@ -61,4 +70,43 @@
         assertEquals(ByteBufferUtil.bytes(1), row.getBytes("key"));
         assertEquals("abc", row.getString("42"));
     }
+
+    @Test // test for CASSANDRA-9867
+    public void testDropCompactStaticColumn()
+    {
+        ColumnDef column1 = new ColumnDef();
+        column1.setName(bytes(42))
+              .setValidation_class(UTF8Type.instance.toString());
+
+        ColumnDef column2 = new ColumnDef();
+        column2.setName(bytes(25))
+               .setValidation_class(UTF8Type.instance.toString());
+
+        CfDef cf = new CfDef("thriftks", "staticcompact");
+        cf.setColumn_type("Standard")
+          .setComparator_type(Int32Type.instance.toString())
+          .setDefault_validation_class(UTF8Type.instance.toString())
+          .setKey_validation_class(BytesType.instance.toString())
+          .setColumn_metadata(Arrays.asList(column1, column2));
+
+        SchemaLoader.createKeyspace("thriftks", KeyspaceParams.simple(1), ThriftConversion.fromThrift(cf));
+        CFMetaData cfm = Schema.instance.getCFMetaData("thriftks", "staticcompact");
+
+        // assert the both columns are in the metadata
+        assertTrue(cfm.getColumnMetadata().containsKey(bytes(42)));
+        assertTrue(cfm.getColumnMetadata().containsKey(bytes(25)));
+
+        // remove column2
+        cf.setColumn_metadata(Collections.singletonList(column1));
+        MigrationManager.announceColumnFamilyUpdate(ThriftConversion.fromThriftForUpdate(cf, cfm), true);
+
+        // assert that it's gone from metadata
+        assertTrue(cfm.getColumnMetadata().containsKey(bytes(42)));
+        assertFalse(cfm.getColumnMetadata().containsKey(bytes(25)));
+    }
+
+    private static UntypedResultSet execute(String query)
+    {
+        return QueryProcessor.executeInternal(query);
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java b/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
new file mode 100644
index 0000000..3042acd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java

@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.Random;
+
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.io.sstable.IndexHelper;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class TombstonesWithIndexedSSTableTest extends CQLTester
+{
+    @Test
+    public void testTombstoneBoundariesInIndexCached() throws Throwable
+    {
+        testTombstoneBoundariesInIndex("ALL");
+    }
+
+    @Test
+    public void testTombstoneBoundariesInIndexNotCached() throws Throwable
+    {
+        testTombstoneBoundariesInIndex("NONE");
+    }
+
+    public void testTombstoneBoundariesInIndex(String cacheKeys) throws Throwable
+    {
+        // That test reproduces the bug from CASSANDRA-11158 where a range tombstone boundary in the column index would
+        // cause an assertion failure.
+
+        int ROWS = 1000;
+        int VALUE_LENGTH = 100;
+
+        createTable("CREATE TABLE %s (k int, t int, s text static, v text, PRIMARY KEY (k, t)) WITH caching = { 'keys' : '" + cacheKeys + "' }");
+
+        // We create a partition that is big enough that the underlying sstable will be indexed
+        // For that, we use a large-ish number of row, and a value that isn't too small.
+        String text = makeRandomString(VALUE_LENGTH);
+        for (int i = 0; i < ROWS; i++)
+            execute("INSERT INTO %s(k, t, v) VALUES (?, ?, ?)", 0, i, text + i);
+
+        DecoratedKey dk = Util.dk(ByteBufferUtil.bytes(0));
+        int minDeleted = ROWS;
+        int maxDeleted = 0;
+
+        // Place some range deletions around an indexed location to get a tombstone boundary as the index's firstName.
+        // Because we insert a tombstone before it, the index position may move, so repeat procedure until the index
+        // boundary hits a tombstone boundary.
+        deletionLoop:
+        while (true)
+        {
+            flush();
+            compact();
+
+            int indexedRow = -1;
+            for (SSTableReader sstable : getCurrentColumnFamilyStore().getLiveSSTables())
+            {
+                // The line below failed with key caching off (CASSANDRA-11158)
+                @SuppressWarnings("unchecked")
+                RowIndexEntry<IndexHelper.IndexInfo> indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
+                if (indexEntry != null && indexEntry.isIndexed())
+                {
+                    ClusteringPrefix firstName = indexEntry.columnsIndex().get(1).firstName;
+                    if (firstName.kind().isBoundary())
+                        break deletionLoop;
+                    indexedRow = Int32Type.instance.compose(firstName.get(0));
+                }
+            }
+            assert indexedRow >= 0;
+            minDeleted = Math.min(minDeleted, indexedRow - 2);
+            maxDeleted = Math.max(maxDeleted, indexedRow + 5);
+
+            execute("DELETE FROM %s WHERE k = 0 AND t >= ? AND t < ?", indexedRow - 2, indexedRow + 3);
+            execute("DELETE FROM %s WHERE k = 0 AND t >= ? AND t < ?", indexedRow, indexedRow + 5);
+        }
+
+        flush();
+        // The line below failed with key caching on (CASSANDRA-11158)
+        compact();
+
+        assertRowCount(execute("SELECT s FROM %s WHERE k = ?", 0), ROWS - (maxDeleted - minDeleted));
+        assertRowCount(execute("SELECT s FROM %s WHERE k = ? ORDER BY t DESC", 0), ROWS - (maxDeleted - minDeleted));
+
+        assertRowCount(execute("SELECT DISTINCT s FROM %s WHERE k = ?", 0), 1);
+        assertRowCount(execute("SELECT DISTINCT s FROM %s WHERE k = ? ORDER BY t DESC", 0), 1);
+    }
+
+    // Creates a random string
+    public static String makeRandomString(int length)
+    {
+        Random random = new Random();
+        char[] chars = new char[length];
+        for (int i = 0; i < length; ++i)
+            chars[i++] = (char) ('a' + random.nextInt('z' - 'a' + 1));
+        return new String(chars);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/UDHelperTest.java b/test/unit/org/apache/cassandra/cql3/UDHelperTest.java
index 4a5e78e..288cd86 100644
--- a/test/unit/org/apache/cassandra/cql3/UDHelperTest.java
+++ b/test/unit/org/apache/cassandra/cql3/UDHelperTest.java

@@ -54,6 +54,10 @@
 {
     static class UFTestCustomType extends AbstractType<String>
     {
+        protected UFTestCustomType()
+        {
+            super(ComparisonType.CUSTOM);
+        }
 
         public ByteBuffer fromString(String source) throws MarshalException
         {
@@ -70,7 +74,7 @@
             return UTF8Type.instance.getSerializer();
         }
 
-        public int compare(ByteBuffer o1, ByteBuffer o2)
+        public int compareCustom(ByteBuffer o1, ByteBuffer o2)
         {
             return o1.compareTo(o2);
         }

diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
new file mode 100644
index 0000000..bb0e269
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java

@@ -0,0 +1,1406 @@
+package org.apache.cassandra.cql3;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import org.apache.cassandra.concurrent.SEPExecutor;
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.utils.FBUtilities;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import com.google.common.base.Objects;
+
+public class ViewComplexTest extends CQLTester
+{
+    int protocolVersion = 4;
+    private final List<String> views = new ArrayList<>();
+
+    @BeforeClass
+    public static void startup()
+    {
+        requireNetwork();
+    }
+    @Before
+    public void begin()
+    {
+        views.clear();
+    }
+
+    @After
+    public void end() throws Throwable
+    {
+        for (String viewName : views)
+            executeNet(protocolVersion, "DROP MATERIALIZED VIEW " + viewName);
+    }
+
+    private void createView(String name, String query) throws Throwable
+    {
+        executeNet(protocolVersion, String.format(query, name));
+        // If exception is thrown, the view will not be added to the list; since it shouldn't have been created, this is
+        // the desired behavior
+        views.add(name);
+    }
+
+    private void updateView(String query, Object... params) throws Throwable
+    {
+        updateViewWithFlush(query, false, params);
+    }
+
+    private void updateViewWithFlush(String query, boolean flush, Object... params) throws Throwable
+    {
+        executeNet(protocolVersion, query, params);
+        while (!(((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getPendingTasks() == 0
+                && ((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getActiveCount() == 0))
+        {
+            Thread.sleep(1);
+        }
+        if (flush)
+            Keyspace.open(keyspace()).flush();
+    }
+
+    // for now, unselected column cannot be fully supported, SEE CASSANDRA-13826
+    @Ignore
+    @Test
+    public void testPartialDeleteUnselectedColumn() throws Throwable
+    {
+        boolean flush = true;
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        createTable("CREATE TABLE %s (k int, c int, a int, b int, PRIMARY KEY (k, c))");
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT k,c FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (k,c)");
+        Keyspace ks = Keyspace.open(keyspace());
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        updateView("UPDATE %s USING TIMESTAMP 10 SET b=1 WHERE k=1 AND c=1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, 1));
+        assertRows(execute("SELECT * from mv"), row(1, 1));
+        updateView("DELETE b FROM %s USING TIMESTAMP 11 WHERE k=1 AND c=1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertEmpty(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+        updateView("UPDATE %s USING TIMESTAMP 1 SET a=1 WHERE k=1 AND c=1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRows(execute("SELECT * from %s"), row(1, 1, 1, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1));
+
+        execute("truncate %s;");
+
+        // removal generated by unselected column should not shadow PK update with smaller timestamp
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 18 SET a=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, 1, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1));
+
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 20 SET a=null WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"));
+        assertRows(execute("SELECT * from mv"));
+
+        updateViewWithFlush("INSERT INTO %s(k,c) VALUES(1,1) USING TIMESTAMP 15", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1));
+    }
+
+    @Test
+    public void testPartialDeleteSelectedColumnWithFlush() throws Throwable
+    {
+        testPartialDeleteSelectedColumn(true);
+    }
+
+    @Test
+    public void testPartialDeleteSelectedColumnWithoutFlush() throws Throwable
+    {
+        testPartialDeleteSelectedColumn(false);
+    }
+
+    private void testPartialDeleteSelectedColumn(boolean flush) throws Throwable
+    {
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        createTable("CREATE TABLE %s (k int, c int, a int, b int, e int, f int, PRIMARY KEY (k, c))");
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT a, b FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (k,c)");
+        Keyspace ks = Keyspace.open(keyspace());
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 10 SET b=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, 1, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, 1));
+
+        updateViewWithFlush("DELETE b FROM %s USING TIMESTAMP 11 WHERE k=1 AND c=1", flush);
+        assertEmpty(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 1 SET a=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, 1, null, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, 1, null));
+
+        updateViewWithFlush("DELETE a FROM %s USING TIMESTAMP 1 WHERE k=1 AND c=1", flush);
+        assertEmpty(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+
+        // view livenessInfo should not be affected by selected column ts or tb
+        updateViewWithFlush("INSERT INTO %s(k,c) VALUES(1,1) USING TIMESTAMP 0", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, null, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 12 SET b=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, 1, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, 1));
+
+        updateViewWithFlush("DELETE b FROM %s USING TIMESTAMP 13 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, null, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateViewWithFlush("DELETE FROM %s USING TIMESTAMP 14 WHERE k=1 AND c=1", flush);
+        assertEmpty(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+
+        updateViewWithFlush("INSERT INTO %s(k,c) VALUES(1,1) USING TIMESTAMP 15", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, null, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateViewWithFlush("UPDATE %s USING TTL 3 SET b=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, 1, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, 1));
+
+        TimeUnit.SECONDS.sleep(4);
+
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, null, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateViewWithFlush("DELETE FROM %s USING TIMESTAMP 15 WHERE k=1 AND c=1", flush);
+        assertEmpty(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+
+        execute("truncate %s;");
+
+        // removal generated by unselected column should not shadow selected column with smaller timestamp
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 18 SET e=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, null, null, 1, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 18 SET e=null WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"));
+        assertRows(execute("SELECT * from mv"));
+
+        updateViewWithFlush("UPDATE %s USING TIMESTAMP 16 SET a=1 WHERE k=1 AND c=1", flush);
+        assertRows(execute("SELECT * from %s"), row(1, 1, 1, null, null, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, 1, null));
+    }
+
+    @Test
+    public void testUpdateColumnInViewPKWithTTLWithFlush() throws Throwable
+    {
+        // CASSANDRA-13657
+        testUpdateColumnInViewPKWithTTL(true);
+    }
+
+    @Test
+    public void testUpdateColumnInViewPKWithTTLWithoutFlush() throws Throwable
+    {
+        // CASSANDRA-13657
+        testUpdateColumnInViewPKWithTTL(false);
+    }
+
+    private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
+    {
+        // CASSANDRA-13657 if base column used in view pk is ttled, then view row is considered dead
+        createTable("create table %s (k int primary key, a int, b int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND a IS NOT NULL PRIMARY KEY (a, k)");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        updateView("UPDATE %s SET a = 1 WHERE k = 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT * from %s"), row(1, 1, null));
+        assertRows(execute("SELECT * from mv"), row(1, 1, null));
+
+        updateView("DELETE a FROM %s WHERE k = 1");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+
+        updateView("INSERT INTO %s (k) VALUES (1);");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT * from %s"), row(1, null, null));
+        assertEmpty(execute("SELECT * from mv"));
+
+        updateView("UPDATE %s USING TTL 5 SET a = 10 WHERE k = 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT * from %s"), row(1, 10, null));
+        assertRows(execute("SELECT * from mv"), row(10, 1, null));
+
+        updateView("UPDATE %s SET b = 100 WHERE k = 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT * from %s"), row(1, 10, 100));
+        assertRows(execute("SELECT * from mv"), row(10, 1, 100));
+
+        Thread.sleep(5000);
+
+        // 'a' is TTL of 5 and removed.
+        assertRows(execute("SELECT * from %s"), row(1, null, 100));
+        assertEmpty(execute("SELECT * from mv"));
+        assertEmpty(execute("SELECT * from mv WHERE k = ? AND a = ?", 1, 10));
+
+        updateView("DELETE b FROM %s WHERE k=1");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT * from %s"), row(1, null, null));
+        assertEmpty(execute("SELECT * from mv"));
+
+        updateView("DELETE FROM %s WHERE k=1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertEmpty(execute("SELECT * from %s"));
+        assertEmpty(execute("SELECT * from mv"));
+    }
+
+    @Test
+    public void testUpdateColumnNotInViewWithFlush() throws Throwable
+    {
+        testUpdateColumnNotInView(true);
+    }
+
+    @Test
+    public void testUpdateColumnNotInViewWithoutFlush() throws Throwable
+    {
+        // CASSANDRA-13127
+        testUpdateColumnNotInView(false);
+    }
+
+    private void testUpdateColumnNotInView(boolean flush) throws Throwable
+    {
+        // CASSANDRA-13127: if base column not selected in view are alive, then pk of view row should be alive
+        String baseTable = createTable("create table %s (p int, c int, v1 int, v2 int, primary key(p, c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT p, c FROM %%s WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        updateView("UPDATE %s USING TIMESTAMP 0 SET v1 = 1 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, 1, null));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        updateView("DELETE v1 FROM %s USING TIMESTAMP 1 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+        assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+
+        // shadowed by tombstone
+        updateView("UPDATE %s USING TIMESTAMP 1 SET v1 = 1 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+        assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+
+        updateView("UPDATE %s USING TIMESTAMP 2 SET v2 = 1 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        updateView("DELETE v1 FROM %s USING TIMESTAMP 3 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        updateView("DELETE v2 FROM %s USING TIMESTAMP 4 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+        assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+
+        updateView("UPDATE %s USING TTL 3 SET v2 = 1 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        Thread.sleep(TimeUnit.SECONDS.toMillis(3));
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+
+        updateView("UPDATE %s SET v2 = 1 WHERE p = 0 AND c = 0");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        assertInvalidMessage(String.format("Cannot drop column v2 on base table %s with materialized views.", baseTable), "ALTER TABLE %s DROP v2");
+        // // drop unselected base column, unselected metadata should be removed, thus view row is dead
+        // updateView("ALTER TABLE %s DROP v2");
+        // assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+        // assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+        // assertRowsIgnoringOrder(execute("SELECT * from %s"));
+        // assertRowsIgnoringOrder(execute("SELECT * from mv"));
+    }
+
+    @Test
+    public void testPartialUpdateWithUnselectedCollectionsWithFlush() throws Throwable
+    {
+        testPartialUpdateWithUnselectedCollections(true);
+    }
+
+    @Test
+    public void testPartialUpdateWithUnselectedCollectionsWithoutFlush() throws Throwable
+    {
+        testPartialUpdateWithUnselectedCollections(false);
+    }
+
+    public void testPartialUpdateWithUnselectedCollections(boolean flush) throws Throwable
+    {
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        String baseTable = createTable("CREATE TABLE %s (k int, c int, a int, b int, l list<int>, s set<int>, m map<int,int>, PRIMARY KEY (k, c))");
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT a, b FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, k)");
+        Keyspace ks = Keyspace.open(keyspace());
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        updateView("UPDATE %s SET l=l+[1,2,3] WHERE k = 1 AND c = 1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateView("UPDATE %s SET l=l-[1,2] WHERE k = 1 AND c = 1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        updateView("UPDATE %s SET b=3 WHERE k=1 AND c=1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRows(execute("SELECT * from mv"), row(1, 1, null, 3));
+
+        updateView("UPDATE %s SET b=null, l=l-[3], s=s-{3} WHERE k = 1 AND c = 1");
+        if (flush)
+        {
+            FBUtilities.waitOnFutures(ks.flush());
+            ks.getColumnFamilyStore("mv").forceMajorCompaction();
+        }
+        assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"));
+        assertRowsIgnoringOrder(execute("SELECT * from mv"));
+
+        updateView("UPDATE %s SET m=m+{3:3}, l=l-[1], s=s-{2} WHERE k = 1 AND c = 1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"), row(1, 1, null, null));
+        assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 1, null, null));
+
+        assertInvalidMessage(String.format("Cannot drop column m on base table %s with materialized views.", baseTable), "ALTER TABLE %s DROP m");
+        // executeNet(protocolVersion, "ALTER TABLE %s DROP m");
+        // ks.getColumnFamilyStore("mv").forceMajorCompaction();
+        // assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s WHERE k = 1 AND c = 1"));
+        // assertRowsIgnoringOrder(execute("SELECT * from mv WHERE k = 1 AND c = 1"));
+        // assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"));
+        // assertRowsIgnoringOrder(execute("SELECT * from mv"));
+    }
+
+    @Test
+    public void testUnselectedColumnsTTLWithFlush() throws Throwable
+    {
+        // CASSANDRA-13127
+        testUnselectedColumnsTTL(true);
+    }
+
+    @Test
+    public void testUnselectedColumnsTTLWithoutFlush() throws Throwable
+    {
+        // CASSANDRA-13127
+        testUnselectedColumnsTTL(false);
+    }
+
+    private void testUnselectedColumnsTTL(boolean flush) throws Throwable
+    {
+        // CASSANDRA-13127 not ttled unselected column in base should keep view row alive
+        createTable("create table %s (p int, c int, v int, primary key(p, c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT p, c FROM %%s WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        updateViewWithFlush("INSERT INTO %s (p, c) VALUES (0, 0) USING TTL 3;", flush);
+
+        updateViewWithFlush("UPDATE %s USING TTL 1000 SET v = 0 WHERE p = 0 and c = 0;", flush);
+
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        Thread.sleep(3000);
+
+        UntypedResultSet.Row row = execute("SELECT v, ttl(v) from %s WHERE c = ? AND p = ?", 0, 0).one();
+        assertTrue("row should have value of 0", row.getInt("v") == 0);
+        assertTrue("row should have ttl less than 1000", row.getInt("ttl(v)") < 1000);
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        updateViewWithFlush("DELETE FROM %s WHERE p = 0 and c = 0;", flush);
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+
+        updateViewWithFlush("INSERT INTO %s (p, c) VALUES (0, 0) ", flush);
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        // already have a live row, no need to apply the unselected cell ttl
+        updateViewWithFlush("UPDATE %s USING TTL 3 SET v = 0 WHERE p = 0 and c = 0;", flush);
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+
+        updateViewWithFlush("INSERT INTO %s (p, c) VALUES (1, 1) USING TTL 3", flush);
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 1, 1), row(1, 1));
+
+        Thread.sleep(4000);
+
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 1, 1));
+
+        // unselected should keep view row alive
+        updateViewWithFlush("UPDATE %s SET v = 0 WHERE p = 1 and c = 1;", flush);
+        assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 1, 1), row(1, 1));
+
+    }
+
+    @Test
+    public void testRangeDeletionWithFlush() throws Throwable
+    {
+        testRangeDeletion(true);
+    }
+
+    @Test
+    public void testRangeDeletionWithoutFlush() throws Throwable
+    {
+        testRangeDeletion(false);
+    }
+
+    public void testRangeDeletion(boolean flush) throws Throwable
+    {
+        // for partition range deletion, need to know that existing row is shadowed instead of not existed.
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv_test1",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (a, b)");
+
+        Keyspace ks = Keyspace.open(keyspace());
+        ks.getColumnFamilyStore("mv_test1").disableAutoCompaction();
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 0", 1, 1, 1, 1);
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
+
+        // remove view row
+        updateView("UPDATE %s using timestamp 1 set b = null WHERE a=1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+        // remove base row, no view updated generated.
+        updateView("DELETE FROM %s using timestamp 2 where a=1");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+
+        // restor view row with b,c column. d is still tombstone
+        updateView("UPDATE %s using timestamp 3 set b = 1,c = 1 where a=1"); // upsert
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, null));
+    }
+
+    @Test
+    public void testBaseTTLWithSameTimestampTest() throws Throwable
+    {
+        // CASSANDRA-13127 when liveness timestamp tie, greater localDeletionTime should win if both are expiring.
+        createTable("create table %s (p int, c int, v int, primary key(p, c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) using timestamp 1;");
+
+        FBUtilities.waitOnFutures(ks.flush());
+
+        updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING TTL 3 and timestamp 1;");
+
+        FBUtilities.waitOnFutures(ks.flush());
+
+        Thread.sleep(4000);
+
+        assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+
+        // reversed order
+        execute("truncate %s;");
+
+        updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING TTL 3 and timestamp 1;");
+
+        FBUtilities.waitOnFutures(ks.flush());
+
+        updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING timestamp 1;");
+
+        FBUtilities.waitOnFutures(ks.flush());
+
+        Thread.sleep(4000);
+
+        assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+
+    }
+
+    @Test
+    public void testCommutativeRowDeletionFlush() throws Throwable
+    {
+        // CASSANDRA-13409
+        testCommutativeRowDeletion(true);
+    }
+
+    @Test
+    public void testCommutativeRowDeletionWithoutFlush() throws Throwable
+    {
+        // CASSANDRA-13409
+        testCommutativeRowDeletion(false);
+    }
+
+    private void testCommutativeRowDeletion(boolean flush) throws Throwable
+    {
+        // CASSANDRA-13409 new update should not resurrect previous deleted data in view
+        createTable("create table %s (p int primary key, v1 int, v2 int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select * from %%s where p is not null and v1 is not null primary key (v1, p);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        // sstable-1, Set initial values TS=1
+        updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(3, 1L));
+        // sstable-2
+        updateView("Delete from %s using timestamp 2 where p = 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"));
+        // sstable-3
+        updateView("Insert into %s (p, v1) values (3, 1) using timestamp 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+        // sstable-4
+        updateView("UPdate %s using timestamp 4 set v1 = 2 where p = 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, null, null));
+        // sstable-5
+        updateView("UPdate %s using timestamp 5 set v1 = 1 where p = 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+
+        if (flush)
+        {
+            // compact sstable 2 and 4, 5;
+            ColumnFamilyStore cfs = ks.getColumnFamilyStore("mv");
+            List<String> sstables = cfs.getLiveSSTables()
+                                       .stream()
+                                       .sorted((s1, s2) -> s1.descriptor.generation - s2.descriptor.generation)
+                                       .map(s -> s.getFilename())
+                                       .collect(Collectors.toList());
+            String dataFiles = String.join(",", Arrays.asList(sstables.get(1), sstables.get(3), sstables.get(4)));
+            CompactionManager.instance.forceUserDefinedCompaction(dataFiles);
+            assertEquals(3, cfs.getLiveSSTables().size());
+        }
+        // regular tombstone should be retained after compaction
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+    }
+
+    @Test
+    public void testUnselectedColumnWithExpiredLivenessInfo() throws Throwable
+    {
+        boolean flush = true;
+        createTable("create table %s (k int, c int, a int, b int, PRIMARY KEY(k, c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select k,c,b from %%s where c is not null and k is not null primary key (c, k);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        // sstable-1, Set initial values TS=1
+        updateViewWithFlush("UPDATE %s SET a = 1 WHERE k = 1 AND c = 1;", flush);
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE k = 1 AND c = 1;"),
+                                row(1, 1, 1, null));
+        assertRowsIgnoringOrder(execute("SELECT k,c,b from mv WHERE k = 1 AND c = 1;"),
+                                row(1, 1, null));
+
+        // sstable-2
+        updateViewWithFlush("INSERT INTO %s(k,c) VALUES(1,1) USING TTL 5", flush);
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE k = 1 AND c = 1;"),
+                                row(1, 1, 1, null));
+        assertRowsIgnoringOrder(execute("SELECT k,c,b from mv WHERE k = 1 AND c = 1;"),
+                                row(1, 1, null));
+
+        Thread.sleep(5001);
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE k = 1 AND c = 1;"),
+                                row(1, 1, 1, null));
+        assertRowsIgnoringOrder(execute("SELECT k,c,b from mv WHERE k = 1 AND c = 1;"),
+                                row(1, 1, null));
+
+        // sstable-3
+        updateViewWithFlush("Update %s set a = null where k = 1 AND c = 1;", flush);
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE k = 1 AND c = 1;"));
+        assertRowsIgnoringOrder(execute("SELECT k,c,b from mv WHERE k = 1 AND c = 1;"));
+
+        // sstable-4
+        updateViewWithFlush("Update %s USING TIMESTAMP 1 set b = 1 where k = 1 AND c = 1;", flush);
+
+        assertRowsIgnoringOrder(execute("SELECT * from %s WHERE k = 1 AND c = 1;"),
+                                row(1, 1, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT k,c,b from mv WHERE k = 1 AND c = 1;"),
+                                row(1, 1, 1));
+    }
+
+    @Test
+    public void testUpdateWithColumnTimestampSmallerThanPkWithFlush() throws Throwable
+    {
+        testUpdateWithColumnTimestampSmallerThanPk(true);
+    }
+
+    @Test
+    public void testUpdateWithColumnTimestampSmallerThanPkWithoutFlush() throws Throwable
+    {
+        testUpdateWithColumnTimestampSmallerThanPk(false);
+    }
+
+    public void testUpdateWithColumnTimestampSmallerThanPk(boolean flush) throws Throwable
+    {
+        createTable("create table %s (p int primary key, v1 int, v2 int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select * from %%s where p is not null and v1 is not null primary key (v1, p);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        // reset value
+        updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 6;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
+        // increase pk's timestamp to 20
+        updateView("Insert into %s (p) values (3) using timestamp 20;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
+        // change v1's to 2 and remove existing view row with ts7
+        updateView("UPdate %s using timestamp 7 set v1 = 2 where p = 3;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, 3, 6L));
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv limit 1"), row(2, 3, 3, 6L));
+        // change v1's to 1 and remove existing view row with ts8
+        updateView("UPdate %s using timestamp 8 set v1 = 1 where p = 3;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
+    }
+
+    @Test
+    public void testExpiredLivenessLimitWithFlush() throws Throwable
+    {
+        // CASSANDRA-13883
+        testExpiredLivenessLimit(true);
+    }
+
+    @Test
+    public void testExpiredLivenessLimitWithoutFlush() throws Throwable
+    {
+        // CASSANDRA-13883
+        testExpiredLivenessLimit(false);
+    }
+
+    private void testExpiredLivenessLimit(boolean flush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, a int, b int);");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND a IS NOT NULL PRIMARY KEY (k, a);");
+        createView("mv2", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND a IS NOT NULL PRIMARY KEY (a, k);");
+        ks.getColumnFamilyStore("mv1").disableAutoCompaction();
+        ks.getColumnFamilyStore("mv2").disableAutoCompaction();
+
+        for (int i = 1; i <= 100; i++)
+            updateView("INSERT INTO %s(k, a, b) VALUES (?, ?, ?);", i, i, i);
+        for (int i = 1; i <= 100; i++)
+        {
+            if (i % 50 == 0)
+                continue;
+            // create expired liveness
+            updateView("DELETE a FROM %s WHERE k = ?;", i);
+        }
+        if (flush)
+        {
+            ks.getColumnFamilyStore("mv1").forceBlockingFlush();
+            ks.getColumnFamilyStore("mv2").forceBlockingFlush();
+        }
+
+        for (String view : Arrays.asList("mv1", "mv2"))
+        {
+            // paging
+            assertEquals(1, executeNetWithPaging(String.format("SELECT k,a,b FROM %s limit 1", view), 1).all().size());
+            assertEquals(2, executeNetWithPaging(String.format("SELECT k,a,b FROM %s limit 2", view), 1).all().size());
+            assertEquals(2, executeNetWithPaging(String.format("SELECT k,a,b FROM %s", view), 1).all().size());
+            assertRowsNet(executeNetWithPaging(String.format("SELECT k,a,b FROM %s ", view), 1),
+                          row(50, 50, 50),
+                          row(100, 100, 100));
+            // limit
+            assertEquals(1, execute(String.format("SELECT k,a,b FROM %s limit 1", view)).size());
+            assertRowsIgnoringOrder(execute(String.format("SELECT k,a,b FROM %s limit 2", view)),
+                                    row(50, 50, 50),
+                                    row(100, 100, 100));
+        }
+    }
+
+    @Test
+    public void testUpdateWithColumnTimestampBiggerThanPkWithFlush() throws Throwable
+    {
+        // CASSANDRA-11500
+        testUpdateWithColumnTimestampBiggerThanPk(true);
+    }
+
+    @Test
+    public void testUpdateWithColumnTimestampBiggerThanPkWithoutFlush() throws Throwable
+    {
+        // CASSANDRA-11500
+        testUpdateWithColumnTimestampBiggerThanPk(false);
+    }
+
+    public void testUpdateWithColumnTimestampBiggerThanPk(boolean flush) throws Throwable
+    {
+        // CASSANDRA-11500 able to shadow old view row with column ts greater tahn pk's ts and re-insert the view row
+        String baseTable = createTable("CREATE TABLE %s (k int PRIMARY KEY, a int, b int);");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND a IS NOT NULL PRIMARY KEY (k, a);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+        updateView("DELETE FROM %s USING TIMESTAMP 0 WHERE k = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        // sstable-1, Set initial values TS=1
+        updateView("INSERT INTO %s(k, a, b) VALUES (1, 1, 1) USING TIMESTAMP 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 1));
+        updateView("UPDATE %s USING TIMESTAMP 10 SET b = 2 WHERE k = 1;");
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+        updateView("UPDATE %s USING TIMESTAMP 2 SET a = 2 WHERE k = 1;");
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 2, 2));
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        ks.getColumnFamilyStore("mv").forceMajorCompaction();
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 2, 2));
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv limit 1"), row(1, 2, 2));
+        updateView("UPDATE %s USING TIMESTAMP 11 SET a = 1 WHERE k = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, 1, 2));
+
+        // set non-key base column as tombstone, view row is removed with shadowable
+        updateView("UPDATE %s USING TIMESTAMP 12 SET a = null WHERE k = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"));
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, null, 2));
+
+        // column b should be alive
+        updateView("UPDATE %s USING TIMESTAMP 13 SET a = 1 WHERE k = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+        assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, 1, 2));
+
+        assertInvalidMessage(String.format("Cannot drop column a on base table %s with materialized views.", baseTable), "ALTER TABLE %s DROP a");
+    }
+
+    @Test
+    public void testNonBaseColumnInViewPkWithFlush() throws Throwable
+    {
+        testNonBaseColumnInViewPk(true);
+    }
+
+    @Test
+    public void testNonBaseColumnInViewPkWithoutFlush() throws Throwable
+    {
+        testNonBaseColumnInViewPk(true);
+    }
+
+    public void testNonBaseColumnInViewPk(boolean flush) throws Throwable
+    {
+        createTable("create table %s (p1 int, p2 int, v1 int, v2 int, primary key (p1,p2))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select * from %%s where p1 is not null and p2 is not null primary key (p2, p1)"
+                           + " with gc_grace_seconds=5;");
+        ColumnFamilyStore cfs = ks.getColumnFamilyStore("mv");
+        cfs.disableAutoCompaction();
+
+        updateView("UPDATE %s USING TIMESTAMP 1 set v1 =1 where p1 = 1 AND p2 = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, 1, null));
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, 1, null));
+
+        updateView("UPDATE %s USING TIMESTAMP 2 set v1 = null, v2 = 1 where p1 = 1 AND p2 = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, 1));
+
+        updateView("UPDATE %s USING TIMESTAMP 2 set v2 = null where p1 = 1 AND p2 = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"));
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"));
+
+        updateView("INSERT INTO %s (p1,p2) VALUES(1,1) USING TIMESTAMP 3;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, null));
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, null));
+
+        updateView("DELETE FROM %s USING TIMESTAMP 4 WHERE p1 =1 AND p2 = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"));
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"));
+
+        updateView("UPDATE %s USING TIMESTAMP 5 set v2 = 1 where p1 = 1 AND p2 = 1;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, 1));
+        assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, 1));
+    }
+
+    @Test
+    public void testStrictLivenessTombstone() throws Throwable
+    {
+        createTable("create table %s (p int primary key, v1 int, v2 int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select * from %%s where p is not null and v1 is not null primary key (v1, p)"
+                           + " with gc_grace_seconds=5;");
+        ColumnFamilyStore cfs = ks.getColumnFamilyStore("mv");
+        cfs.disableAutoCompaction();
+
+        updateView("Insert into %s (p, v1, v2) values (1, 1, 1) ;");
+        assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
+
+        updateView("Update %s set v1 = null WHERE p = 1");
+        FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"));
+
+        cfs.forceMajorCompaction(); // before gc grace second, strict-liveness tombstoned dead row remains
+        assertEquals(1, cfs.getLiveSSTables().size());
+
+        Thread.sleep(6000);
+        assertEquals(1, cfs.getLiveSSTables().size()); // no auto compaction.
+
+        cfs.forceMajorCompaction(); // after gc grace second, no data left
+        assertEquals(0, cfs.getLiveSSTables().size());
+
+        updateView("Update %s using ttl 5 set v1 = 1 WHERE p = 1");
+        FBUtilities.waitOnFutures(ks.flush());
+        assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
+
+        cfs.forceMajorCompaction(); // before ttl+gc_grace_second, strict-liveness ttled dead row remains
+        assertEquals(1, cfs.getLiveSSTables().size());
+        assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
+
+        Thread.sleep(5500); // after expired, before gc_grace_second
+        cfs.forceMajorCompaction();// before ttl+gc_grace_second, strict-liveness ttled dead row remains
+        assertEquals(1, cfs.getLiveSSTables().size());
+        assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"));
+
+        Thread.sleep(5500); // after expired + gc_grace_second
+        assertEquals(1, cfs.getLiveSSTables().size()); // no auto compaction.
+
+        cfs.forceMajorCompaction(); // after gc grace second, no data left
+        assertEquals(0, cfs.getLiveSSTables().size());
+    }
+
+    @Test
+    public void testCellTombstoneAndShadowableTombstonesWithFlush() throws Throwable
+    {
+        testCellTombstoneAndShadowableTombstones(true);
+    }
+
+    @Test
+    public void testCellTombstoneAndShadowableTombstonesWithoutFlush() throws Throwable
+    {
+        testCellTombstoneAndShadowableTombstones(false);
+    }
+
+    private void testCellTombstoneAndShadowableTombstones(boolean flush) throws Throwable
+    {
+        createTable("create table %s (p int primary key, v1 int, v2 int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select * from %%s where p is not null and v1 is not null primary key (v1, p);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        // sstable 1, Set initial values TS=1
+        updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(3, 1L));
+        // sstable 2
+        updateView("UPdate %s using timestamp 2 set v2 = null where p = 3");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3),
+                                row(null, null));
+        // sstable 3
+        updateView("UPdate %s using timestamp 3 set v1 = 2 where p = 3");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, null, null));
+        // sstable 4
+        updateView("UPdate %s using timestamp 4 set v1 = 1 where p = 3");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+
+        if (flush)
+        {
+            // compact sstable 2 and 3;
+            ColumnFamilyStore cfs = ks.getColumnFamilyStore("mv");
+            List<String> sstables = cfs.getLiveSSTables()
+                                       .stream()
+                                       .sorted(Comparator.comparingInt(s -> s.descriptor.generation))
+                                       .map(s -> s.getFilename())
+                                       .collect(Collectors.toList());
+            String dataFiles = String.join(",", Arrays.asList(sstables.get(1), sstables.get(2)));
+            CompactionManager.instance.forceUserDefinedCompaction(dataFiles);
+        }
+        // cell-tombstone in sstable 4 is not compacted away, because the shadowable tombstone is shadowed by new row.
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+        assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv limit 1"), row(1, 3, null, null));
+    }
+
+    @Test
+    public void complexTimestampDeletionTestWithFlush() throws Throwable
+    {
+        complexTimestampWithbaseNonPKColumnsInViewPKDeletionTest(true);
+        complexTimestampWithbasePKColumnsInViewPKDeletionTest(true);
+    }
+
+    @Test
+    public void complexTimestampDeletionTestWithoutFlush() throws Throwable
+    {
+        complexTimestampWithbaseNonPKColumnsInViewPKDeletionTest(false);
+        complexTimestampWithbasePKColumnsInViewPKDeletionTest(false);
+    }
+
+    private void complexTimestampWithbasePKColumnsInViewPKDeletionTest(boolean flush) throws Throwable
+    {
+        createTable("create table %s (p1 int, p2 int, v1 int, v2 int, primary key(p1, p2))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv2",
+                   "create materialized view %s as select * from %%s where p1 is not null and p2 is not null primary key (p2, p1);");
+        ks.getColumnFamilyStore("mv2").disableAutoCompaction();
+
+        // Set initial values TS=1
+        updateView("Insert into %s (p1, p2, v1, v2) values (1, 2, 3, 4) using timestamp 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                row(3, 4, 1L));
+        // remove row/mv TS=2
+        updateView("Delete from %s using timestamp 2 where p1 = 1 and p2 = 2;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        // view are empty
+        assertRowsIgnoringOrder(execute("SELECT * from mv2"));
+        // insert PK with TS=3
+        updateView("Insert into %s (p1, p2) values (1, 2) using timestamp 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        // deleted column in MV remained dead
+        assertRowsIgnoringOrder(execute("SELECT * from mv2"), row(2, 1, null, null));
+
+        ks.getColumnFamilyStore("mv2").forceMajorCompaction();
+        assertRowsIgnoringOrder(execute("SELECT * from mv2"), row(2, 1, null, null));
+
+        // reset values
+        updateView("Insert into %s (p1, p2, v1, v2) values (1, 2, 3, 4) using timestamp 10;");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                row(3, 4, 10L));
+
+        updateView("UPDATE %s using timestamp 20 SET v2 = 5 WHERE p1 = 1 and p2 = 2");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                row(3, 5, 20L));
+
+        updateView("DELETE FROM %s using timestamp 10 WHERE p1 = 1 and p2 = 2");
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                row(null, 5, 20L));
+    }
+
+    public void complexTimestampWithbaseNonPKColumnsInViewPKDeletionTest(boolean flush) throws Throwable
+    {
+        createTable("create table %s (p int primary key, v1 int, v2 int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv",
+                   "create materialized view %s as select * from %%s where p is not null and v1 is not null primary key (v1, p);");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        // Set initial values TS=1
+        updateView("Insert into %s (p, v1, v2) values (3, 1, 5) using timestamp 1;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(5, 1L));
+        // remove row/mv TS=2
+        updateView("Delete from %s using timestamp 2 where p = 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        // view are empty
+        assertRowsIgnoringOrder(execute("SELECT * from mv"));
+        // insert PK with TS=3
+        updateView("Insert into %s (p, v1) values (3, 1) using timestamp 3;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        // deleted column in MV remained dead
+        assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 3, null));
+
+        // insert values TS=2, it should be considered dead due to previous tombstone
+        updateView("Insert into %s (p, v1, v2) values (3, 1, 5) using timestamp 2;");
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        // deleted column in MV remained dead
+        assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 3, null));
+        assertRowsIgnoringOrder(execute("SELECT * from mv limit 1"), row(1, 3, null));
+
+        // insert values TS=2, it should be considered dead due to previous tombstone
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 3 SET v2 = ? WHERE p = ?", 4, 3);
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        assertRows(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 4, 3L));
+
+        ks.getColumnFamilyStore("mv").forceMajorCompaction();
+        assertRows(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 4, 3L));
+        assertRows(execute("SELECT v1, p, v2, WRITETIME(v2) from mv limit 1"), row(1, 3, 4, 3L));
+    }
+
+    @Test
+    public void testMVWithDifferentColumnsWithFlush() throws Throwable
+    {
+        testMVWithDifferentColumns(true);
+    }
+
+    @Test
+    public void testMVWithDifferentColumnsWithoutFlush() throws Throwable
+    {
+        testMVWithDifferentColumns(false);
+    }
+
+    private void testMVWithDifferentColumns(boolean flush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, f int, PRIMARY KEY(a, b))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        List<String> viewNames = new ArrayList<>();
+        List<String> mvStatements = Arrays.asList(
+                                                  // all selected
+                                                  "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (a,b)",
+                                                  // unselected e,f
+                                                  "CREATE MATERIALIZED VIEW %s AS SELECT c,d FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (a,b)",
+                                                  // no selected
+                                                  "CREATE MATERIALIZED VIEW %s AS SELECT a,b FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (a,b)",
+                                                  // all selected, re-order keys
+                                                  "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (b,a)",
+                                                  // unselected e,f, re-order keys
+                                                  "CREATE MATERIALIZED VIEW %s AS SELECT a,b,c,d FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (b,a)",
+                                                  // no selected, re-order keys
+                                                  "CREATE MATERIALIZED VIEW %s AS SELECT a,b FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (b,a)");
+
+        Keyspace ks = Keyspace.open(keyspace());
+
+        for (int i = 0; i < mvStatements.size(); i++)
+        {
+            String name = "mv" + i;
+            viewNames.add(name);
+            createView(name, mvStatements.get(i));
+            ks.getColumnFamilyStore(name).disableAutoCompaction();
+        }
+
+        // insert
+        updateViewWithFlush("INSERT INTO %s (a,b,c,d,e,f) VALUES(1,1,1,1,1,1) using timestamp 1", flush);
+        assertBaseViews(row(1, 1, 1, 1, 1, 1), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 2 SET c=0, d=0 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, 0, 0, 1, 1), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 2 SET e=0, f=0 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, 0, 0, 0, 0), viewNames);
+
+        updateViewWithFlush("DELETE FROM %s using timestamp 2 WHERE a=1 AND b=1", flush);
+        assertBaseViews(null, viewNames);
+
+        // partial update unselected, selected
+        updateViewWithFlush("UPDATE %s using timestamp 3 SET f=1 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, null, null, null, 1), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 4 SET e = 1, f=null WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, null, null, 1, null), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 4 SET e = null WHERE a=1 AND b=1", flush);
+        assertBaseViews(null, viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 5 SET c = 1 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, 1, null, null, null), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 5 SET c = null WHERE a=1 AND b=1", flush);
+        assertBaseViews(null, viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 6 SET d = 1 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, null, 1, null, null), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 7 SET d = null WHERE a=1 AND b=1", flush);
+        assertBaseViews(null, viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 8 SET f = 1 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, null, null, null, 1), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 6 SET c = 1 WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, 1, null, null, 1), viewNames);
+
+        // view row still alive due to c=1@6
+        updateViewWithFlush("UPDATE %s using timestamp 8 SET f = null WHERE a=1 AND b=1", flush);
+        assertBaseViews(row(1, 1, 1, null, null, null), viewNames);
+
+        updateViewWithFlush("UPDATE %s using timestamp 6 SET c = null WHERE a=1 AND b=1", flush);
+        assertBaseViews(null, viewNames);
+    }
+
+    private void assertBaseViews(Object[] row, List<String> viewNames) throws Throwable
+    {
+        UntypedResultSet result = execute("SELECT * FROM %s");
+        if (row == null)
+            assertRowsIgnoringOrder(result);
+        else
+            assertRowsIgnoringOrder(result, row);
+        for (int i = 0; i < viewNames.size(); i++)
+            assertBaseView(result, execute(String.format("SELECT * FROM %s", viewNames.get(i))), viewNames.get(i));
+    }
+
+    private void assertBaseView(UntypedResultSet base, UntypedResultSet view, String mv)
+    {
+        List<ColumnSpecification> baseMeta = base.metadata();
+        List<ColumnSpecification> viewMeta = view.metadata();
+
+        Iterator<UntypedResultSet.Row> iter = base.iterator();
+        Iterator<UntypedResultSet.Row> viewIter = view.iterator();
+
+        List<UntypedResultSet.Row> baseData = com.google.common.collect.Lists.newArrayList(iter);
+        List<UntypedResultSet.Row> viewData = com.google.common.collect.Lists.newArrayList(viewIter);
+
+        if (baseData.size() != viewData.size())
+            fail(String.format("Mismatch number of rows in view %s: <%s>, in base <%s>",
+                               mv,
+                               makeRowStrings(view),
+                               makeRowStrings(base)));
+        if (baseData.size() == 0)
+            return;
+        if (viewData.size() != 1)
+            fail(String.format("Expect only one row in view %s, but got <%s>",
+                               mv,
+                               makeRowStrings(view)));
+
+        UntypedResultSet.Row row = baseData.get(0);
+        UntypedResultSet.Row viewRow = viewData.get(0);
+
+        Map<String, ByteBuffer> baseValues = new HashMap<>();
+        for (int j = 0; j < baseMeta.size(); j++)
+        {
+            ColumnSpecification column = baseMeta.get(j);
+            ByteBuffer actualValue = row.getBytes(column.name.toString());
+            baseValues.put(column.name.toString(), actualValue);
+        }
+        for (int j = 0; j < viewMeta.size(); j++)
+        {
+            ColumnSpecification column = viewMeta.get(j);
+            String name = column.name.toString();
+            ByteBuffer viewValue = viewRow.getBytes(name);
+            if (!baseValues.containsKey(name))
+            {
+                fail(String.format("Extra column: %s with value %s in view", name, column.type.compose(viewValue)));
+            }
+            else if (!Objects.equal(baseValues.get(name), viewValue))
+            {
+                fail(String.format("Non equal column: %s, expected <%s> but got <%s>",
+                                   name,
+                                   column.type.compose(baseValues.get(name)),
+                                   column.type.compose(viewValue)));
+            }
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
new file mode 100644
index 0000000..fe618b6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java

@@ -0,0 +1,1306 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.util.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import junit.framework.Assert;
+
+import org.apache.cassandra.db.SystemKeyspace;
+
+public class ViewFilteringTest extends CQLTester
+{
+    int protocolVersion = 4;
+    private final List<String> views = new ArrayList<>();
+
+    @BeforeClass
+    public static void startup()
+    {
+        requireNetwork();
+    }
+    @Before
+    public void begin()
+    {
+        views.clear();
+    }
+
+    @After
+    public void end() throws Throwable
+    {
+        for (String viewName : views)
+            executeNet(protocolVersion, "DROP MATERIALIZED VIEW " + viewName);
+    }
+
+    private void createView(String name, String query) throws Throwable
+    {
+        executeNet(protocolVersion, String.format(query, name));
+        // If exception is thrown, the view will not be added to the list; since it shouldn't have been created, this is
+        // the desired behavior
+        views.add(name);
+    }
+
+    private void dropView(String name) throws Throwable
+    {
+        executeNet(protocolVersion, "DROP MATERIALIZED VIEW " + name);
+        views.remove(name);
+    }
+
+    @Test
+    public void testMVCreationSelectRestrictions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, PRIMARY KEY((a, b), c, d))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        // IS NOT NULL is required on all PK statements that are not otherwise restricted
+        List<String> badStatements = Arrays.asList(
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL AND c IS NOT NULL AND d is NOT NULL PRIMARY KEY ((a, b), c, d)",
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND c IS NOT NULL AND d is NOT NULL PRIMARY KEY ((a, b), c, d)",
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND d is NOT NULL PRIMARY KEY ((a, b), c, d)",
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c is NOT NULL PRIMARY KEY ((a, b), c, d)",
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = ? AND b IS NOT NULL AND c is NOT NULL PRIMARY KEY ((a, b), c, d)",
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = blobAsInt(?) AND b IS NOT NULL AND c is NOT NULL PRIMARY KEY ((a, b), c, d)",
+        "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s PRIMARY KEY (a, b, c, d)"
+        );
+
+        for (String badStatement : badStatements)
+        {
+            try
+            {
+                createView("mv1_test", badStatement);
+                Assert.fail("Create MV statement should have failed due to missing IS NOT NULL restriction: " + badStatement);
+            }
+            catch (InvalidQueryException exc) {}
+        }
+
+            List<String> goodStatements = Arrays.asList(
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND c IS NOT NULL AND d is NOT NULL PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c = 1 AND d IS NOT NULL PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c = 1 AND d = 1 PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND c = 1 AND d = 1 PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND c > 1 AND d IS NOT NULL PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND c = 1 AND d IN (1, 2, 3) PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND (c, d) = (1, 1) PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND (c, d) > (1, 1) PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND (c, d) IN ((1, 1), (2, 2)) PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = (int) 1 AND b = 1 AND c = 1 AND d = 1 PRIMARY KEY ((a, b), c, d)",
+            "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = blobAsInt(intAsBlob(1)) AND b = 1 AND c = 1 AND d = 1 PRIMARY KEY ((a, b), c, d)"
+            );
+
+        for (int i = 0; i < goodStatements.size(); i++)
+        {
+            try
+            {
+                createView("mv" + i + "_test", goodStatements.get(i));
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException("MV creation failed: " + goodStatements.get(i), e);
+            }
+
+            try
+            {
+                executeNet(protocolVersion, "ALTER MATERIALIZED VIEW mv" + i + "_test WITH compaction = { 'class' : 'LeveledCompactionStrategy' }");
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException("MV alter failed: " + goodStatements.get(i), e);
+            }
+        }
+
+        try
+        {
+            createView("mv_foo", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b IS NOT NULL AND c IS NOT NULL AND d is NOT NULL PRIMARY KEY ((a, b), c, d)");
+            Assert.fail("Partial partition key restriction should not be allowed");
+        }
+        catch (InvalidQueryException exc) {}
+    }
+
+    @Test
+    public void testCaseSensitivity() throws Throwable
+    {
+        createTable("CREATE TABLE %s (\"theKey\" int, \"theClustering\" int, \"the\"\"Value\" int, PRIMARY KEY (\"theKey\", \"theClustering\"))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        execute("INSERT INTO %s (\"theKey\", \"theClustering\", \"the\"\"Value\") VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (\"theKey\", \"theClustering\", \"the\"\"Value\") VALUES (?, ?, ?)", 0, 1, 0);
+        execute("INSERT INTO %s (\"theKey\", \"theClustering\", \"the\"\"Value\") VALUES (?, ?, ?)", 1, 0, 0);
+        execute("INSERT INTO %s (\"theKey\", \"theClustering\", \"the\"\"Value\") VALUES (?, ?, ?)", 1, 1, 0);
+
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s " +
+                              "WHERE \"theKey\" = 1 AND \"theClustering\" = 1 AND \"the\"\"Value\" IS NOT NULL " +
+                              "PRIMARY KEY (\"theKey\", \"theClustering\")");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test"))
+            Thread.sleep(10);
+        createView("mv_test2", "CREATE MATERIALIZED VIEW %s AS SELECT \"theKey\", \"theClustering\", \"the\"\"Value\" FROM %%s " +
+                               "WHERE \"theKey\" = 1 AND \"theClustering\" = 1 AND \"the\"\"Value\" IS NOT NULL " +
+                               "PRIMARY KEY (\"theKey\", \"theClustering\")");
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test2"))
+            Thread.sleep(10);
+
+        for (String mvname : Arrays.asList("mv_test", "mv_test2"))
+        {
+            assertRowsIgnoringOrder(execute("SELECT \"theKey\", \"theClustering\", \"the\"\"Value\" FROM " + mvname),
+                                    row(1, 1, 0)
+            );
+        }
+
+        executeNet(protocolVersion, "ALTER TABLE %s RENAME \"theClustering\" TO \"Col\"");
+
+        for (String mvname : Arrays.asList("mv_test", "mv_test2"))
+        {
+            assertRowsIgnoringOrder(execute("SELECT \"theKey\", \"Col\", \"the\"\"Value\" FROM " + mvname),
+                                    row(1, 1, 0)
+            );
+        }
+    }
+
+    @Test
+    public void testFilterWithFunction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 0, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, 3);
+
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s " +
+                              "WHERE a = blobAsInt(intAsBlob(1)) AND b IS NOT NULL " +
+                              "PRIMARY KEY (a, b)");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test"))
+            Thread.sleep(10);
+
+        assertRows(execute("SELECT a, b, c FROM mv_test"),
+                   row(1, 0, 2),
+                   row(1, 1, 3)
+        );
+
+        executeNet(protocolVersion, "ALTER TABLE %s RENAME a TO foo");
+
+        assertRows(execute("SELECT foo, b, c FROM mv_test"),
+                   row(1, 0, 2),
+                   row(1, 1, 3)
+        );
+    }
+
+    @Test
+    public void testFilterWithTypecast() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 0, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, 3);
+
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s " +
+                              "WHERE a = (int) 1 AND b IS NOT NULL " +
+                              "PRIMARY KEY (a, b)");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test"))
+            Thread.sleep(10);
+
+        assertRows(execute("SELECT a, b, c FROM mv_test"),
+                   row(1, 0, 2),
+                   row(1, 1, 3)
+        );
+
+        executeNet(protocolVersion, "ALTER TABLE %s RENAME a TO foo");
+
+        assertRows(execute("SELECT foo, b, c FROM mv_test"),
+                   row(1, 0, 2),
+                   row(1, 1, 3)
+        );
+    }
+
+    @Test
+    public void testPartitionKeyRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where a = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b IS NOT NULL AND c IS NOT NULL PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 1, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 0, 0, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 0, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 0, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 0, 1, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 0, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertEmpty(execute("SELECT * FROM mv_test" + i));
+        }
+    }
+
+    @Test
+    public void testCompoundPartitionKeyRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY ((a, b), c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where a = 1 and b = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b = 1 AND c IS NOT NULL PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 0, 0, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 0, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 0, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 0, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 0, 1, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 1, 1);
+            assertEmpty(execute("SELECT * FROM mv_test" + i));
+        }
+    }
+
+    @Test
+    public void testCompoundPartitionKeyRestrictionsNotIncludeAll() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY ((a, b), c))");
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+        // only accept rows where a = 1 and b = 1, don't include column d in the selection
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT a, b, c FROM %%s WHERE a = 1 AND b = 1 AND c IS NOT NULL PRIMARY KEY ((a, b), c)");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test"))
+            Thread.sleep(10);
+
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 0),
+                   row(1, 1, 1)
+        );
+
+        // insert new rows that do not match the filter
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 1, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 0, 0);
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 0),
+                   row(1, 1, 1)
+        );
+
+        // insert new row that does match the filter
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 0),
+                   row(1, 1, 1),
+                   row(1, 1, 2)
+        );
+
+        // update rows that don't match the filter
+        execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 0, 0, 0);
+        execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 0, 0);
+        execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 0, 1, 0);
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 0),
+                   row(1, 1, 1),
+                   row(1, 1, 2)
+        );
+
+        // update a row that does match the filter
+        execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 0),
+                   row(1, 1, 1),
+                   row(1, 1, 2)
+        );
+
+        // delete rows that don't match the filter
+        execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 0, 0, 0);
+        execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 0, 0);
+        execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 0, 1, 0);
+        execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 0),
+                   row(1, 1, 1),
+                   row(1, 1, 2)
+        );
+
+        // delete a row that does match the filter
+        execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+        assertRows(execute("SELECT * FROM mv_test"),
+                   row(1, 1, 1),
+                   row(1, 1, 2)
+        );
+
+        // delete a partition that matches the filter
+        execute("DELETE FROM %s WHERE a = ? AND b = ?", 1, 1);
+        assertEmpty(execute("SELECT * FROM mv_test"));
+    }
+
+    @Test
+    public void testClusteringKeyEQRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where b = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b = 1 AND c IS NOT NULL PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 2, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, 0, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 2, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0)
+            );
+
+            dropView("mv_test" + i);
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    public void testClusteringKeySliceRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b >= 1 AND c IS NOT NULL PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, -1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, -1, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, -1, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0)
+            );
+
+            dropView("mv_test" + i);
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    public void testClusteringKeyINRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where b = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IN (1, 2) AND c IS NOT NULL PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, -1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, -1, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, -1, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0)
+            );
+
+            dropView("mv_test" + i);
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    public void testClusteringKeyMultiColumnRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, -1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where b = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND (b, c) >= (1, 0) PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, -1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 1, -1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 2, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, -1, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, -1, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, -1);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, -1, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 0, 1),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 1, 2, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 1, 0, 0),
+                                    row(0, 1, 1, 0)
+            );
+
+            dropView("mv_test" + i);
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    public void testClusteringKeyFilteringRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, -1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where b = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c = 1 PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 2, 1, -1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, -1, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 2, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 2, 1, 1, 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 2),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, -1);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, -1, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 0, 0);
+            execute("DELETE FROM %s WHERE a = ? AND b = ?", 0, -1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 2),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(1, 0, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0)
+            );
+
+            // insert a partition with one matching and one non-matching row using a batch (CASSANDRA-10614)
+            String tableName = KEYSPACE + "." + currentTable();
+            execute("BEGIN BATCH " +
+                    "INSERT INTO " + tableName + " (a, b, c, d) VALUES (?, ?, ?, ?); " +
+                    "INSERT INTO " + tableName + " (a, b, c, d) VALUES (?, ?, ?, ?); " +
+                    "APPLY BATCH",
+                    4, 4, 0, 0,
+                    4, 4, 1, 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(0, 0, 1, 0),
+                                    row(0, 1, 1, 0),
+                                    row(4, 4, 1, 1)
+            );
+
+            dropView("mv_test" + i);
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    public void testPartitionKeyAndClusteringKeyFilteringRestrictions() throws Throwable
+    {
+        List<String> mvPrimaryKeys = Arrays.asList("((a, b), c)", "((b, a), c)", "(a, b, c)", "(c, b, a)", "((c, a), b)");
+        for (int i = 0; i < mvPrimaryKeys.size(); i++)
+        {
+            createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+
+            execute("USE " + keyspace());
+            executeNet(protocolVersion, "USE " + keyspace());
+
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, -1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 0);
+
+            logger.info("Testing MV primary key: {}", mvPrimaryKeys.get(i));
+
+            // only accept rows where b = 1
+            createView("mv_test" + i, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a = 1 AND b IS NOT NULL AND c = 1 PRIMARY KEY " + mvPrimaryKeys.get(i));
+
+            while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test" + i))
+                Thread.sleep(10);
+
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new rows that do not match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 0);
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 0, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0)
+            );
+
+            // insert new row that does match the filter
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // update rows that don't match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 1, 1, -1, 0);
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 0, 1, 1, 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // update a row that does match the filter
+            execute("UPDATE %s SET d = ? WHERE a = ? AND b = ? AND c = ?", 2, 1, 1, 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 2),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete rows that don't match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, -1);
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 2, 0, 1);
+            execute("DELETE FROM %s WHERE a = ?", 0);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 1, 1, 2),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete a row that does match the filter
+            execute("DELETE FROM %s WHERE a = ? AND b = ? AND c = ?", 1, 1, 1);
+            assertRowsIgnoringOrder(execute("SELECT a, b, c, d FROM mv_test" + i),
+                                    row(1, 0, 1, 0),
+                                    row(1, 2, 1, 0)
+            );
+
+            // delete a partition that matches the filter
+            execute("DELETE FROM %s WHERE a = ?", 1);
+            assertEmpty(execute("SELECT a, b, c, d FROM mv_test" + i));
+
+            dropView("mv_test" + i);
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    public void testAllTypes() throws Throwable
+    {
+        String myType = createType("CREATE TYPE %s (a int, b uuid, c set<text>)");
+        String columnNames = "asciival, " +
+                             "bigintval, " +
+                             "blobval, " +
+                             "booleanval, " +
+                             "dateval, " +
+                             "decimalval, " +
+                             "doubleval, " +
+                             "floatval, " +
+                             "inetval, " +
+                             "intval, " +
+                             "textval, " +
+                             "timeval, " +
+                             "timestampval, " +
+                             "timeuuidval, " +
+                             "uuidval," +
+                             "varcharval, " +
+                             "varintval, " +
+                             "frozenlistval, " +
+                             "frozensetval, " +
+                             "frozenmapval, " +
+                             "tupleval, " +
+                             "udtval";
+
+        createTable(
+                    "CREATE TABLE %s (" +
+                            "asciival ascii, " +
+                            "bigintval bigint, " +
+                            "blobval blob, " +
+                            "booleanval boolean, " +
+                            "dateval date, " +
+                            "decimalval decimal, " +
+                            "doubleval double, " +
+                            "floatval float, " +
+                            "inetval inet, " +
+                            "intval int, " +
+                            "textval text, " +
+                            "timeval time, " +
+                            "timestampval timestamp, " +
+                            "timeuuidval timeuuid, " +
+                            "uuidval uuid," +
+                            "varcharval varchar, " +
+                            "varintval varint, " +
+                            "frozenlistval frozen<list<int>>, " +
+                            "frozensetval frozen<set<uuid>>, " +
+                            "frozenmapval frozen<map<ascii, int>>," +
+                            "tupleval frozen<tuple<int, ascii, uuid>>," +
+                            "udtval frozen<" + myType + ">, " +
+                            "PRIMARY KEY (" + columnNames + "))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+
+        createView(
+                   "mv_test",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " +
+                           "asciival = 'abc' AND " +
+                           "bigintval = 123 AND " +
+                           "blobval = 0xfeed AND " +
+                           "booleanval = true AND " +
+                           "dateval = '1987-03-23' AND " +
+                           "decimalval = 123.123 AND " +
+                           "doubleval = 123.123 AND " +
+                           "floatval = 123.123 AND " +
+                           "inetval = '127.0.0.1' AND " +
+                           "intval = 123 AND " +
+                           "textval = 'abc' AND " +
+                           "timeval = '07:35:07.000111222' AND " +
+                           "timestampval = 123123123 AND " +
+                           "timeuuidval = 6BDDC89A-5644-11E4-97FC-56847AFE9799 AND " +
+                           "uuidval = 6BDDC89A-5644-11E4-97FC-56847AFE9799 AND " +
+                           "varcharval = 'abc' AND " +
+                           "varintval = 123123123 AND " +
+                           "frozenlistval = [1, 2, 3] AND " +
+                           "frozensetval = {6BDDC89A-5644-11E4-97FC-56847AFE9799} AND " +
+                           "frozenmapval = {'a': 1, 'b': 2} AND " +
+                           "tupleval = (1, 'foobar', 6BDDC89A-5644-11E4-97FC-56847AFE9799) AND " +
+                           "udtval = {a: 1, b: 6BDDC89A-5644-11E4-97FC-56847AFE9799, c: {'foo', 'bar'}} " +
+                           "PRIMARY KEY (" + columnNames + ")");
+
+        execute("INSERT INTO %s (" + columnNames + ") VALUES (" +
+                "'abc'," +
+                "123," +
+                "0xfeed," +
+                "true," +
+                "'1987-03-23'," +
+                "123.123," +
+                "123.123," +
+                "123.123," +
+                "'127.0.0.1'," +
+                "123," +
+                "'abc'," +
+                "'07:35:07.000111222'," +
+                "123123123," +
+                "6BDDC89A-5644-11E4-97FC-56847AFE9799," +
+                "6BDDC89A-5644-11E4-97FC-56847AFE9799," +
+                "'abc'," +
+                "123123123," +
+                "[1, 2, 3]," +
+                "{6BDDC89A-5644-11E4-97FC-56847AFE9799}," +
+                "{'a': 1, 'b': 2}," +
+                "(1, 'foobar', 6BDDC89A-5644-11E4-97FC-56847AFE9799)," +
+                "{a: 1, b: 6BDDC89A-5644-11E4-97FC-56847AFE9799, c: {'foo', 'bar'}})");
+
+        assert !execute("SELECT * FROM mv_test").isEmpty();
+
+        executeNet(protocolVersion, "ALTER TABLE %s RENAME inetval TO foo");
+        assert !execute("SELECT * FROM mv_test").isEmpty();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java b/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java
new file mode 100644
index 0000000..113fdf2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java

@@ -0,0 +1,686 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
+import java.util.UUID;
+
+import junit.framework.Assert;
+
+import org.apache.cassandra.concurrent.SEPExecutor;
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.serializers.SimpleDateSerializer;
+import org.apache.cassandra.serializers.TimeSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+
+
+public class ViewSchemaTest extends CQLTester
+{
+    int protocolVersion = 4;
+    private final List<String> views = new ArrayList<>();
+
+    @BeforeClass
+    public static void startup()
+    {
+        requireNetwork();
+    }
+    @Before
+    public void begin()
+    {
+        views.clear();
+    }
+
+    @After
+    public void end() throws Throwable
+    {
+        for (String viewName : views)
+            executeNet(protocolVersion, "DROP MATERIALIZED VIEW " + viewName);
+    }
+
+    private void createView(String name, String query) throws Throwable
+    {
+        executeNet(protocolVersion, String.format(query, name));
+        // If exception is thrown, the view will not be added to the list; since it shouldn't have been created, this is
+        // the desired behavior
+        views.add(name);
+    }
+
+    private void updateView(String query, Object... params) throws Throwable
+    {
+        executeNet(protocolVersion, query, params);
+        while (!(((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getPendingTasks() == 0
+                 && ((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getActiveCount() == 0))
+        {
+            Thread.sleep(1);
+        }
+    }
+
+    @Test
+    public void testCaseSensitivity() throws Throwable
+    {
+        createTable("CREATE TABLE %s (\"theKey\" int, \"theClustering\" int, \"theValue\" int, PRIMARY KEY (\"theKey\", \"theClustering\"))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        execute("INSERT INTO %s (\"theKey\", \"theClustering\", \"theValue\") VALUES (?, ?, ?)", 0, 0, 0);
+
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s " +
+                              "WHERE \"theKey\" IS NOT NULL AND \"theClustering\" IS NOT NULL AND \"theValue\" IS NOT NULL " +
+                              "PRIMARY KEY (\"theKey\", \"theClustering\")");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test"))
+            Thread.sleep(10);
+        createView("mv_test2", "CREATE MATERIALIZED VIEW %s AS SELECT \"theKey\", \"theClustering\", \"theValue\" FROM %%s " +
+                               "WHERE \"theKey\" IS NOT NULL AND \"theClustering\" IS NOT NULL AND \"theValue\" IS NOT NULL " +
+                               "PRIMARY KEY (\"theKey\", \"theClustering\")");
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test2"))
+            Thread.sleep(10);
+
+        for (String mvname : Arrays.asList("mv_test", "mv_test2"))
+        {
+            assertRows(execute("SELECT \"theKey\", \"theClustering\", \"theValue\" FROM " + mvname),
+               row(0, 0, 0)
+            );
+        }
+
+        executeNet(protocolVersion, "ALTER TABLE %s RENAME \"theClustering\" TO \"Col\"");
+
+        for (String mvname : Arrays.asList("mv_test", "mv_test2"))
+        {
+            assertRows(execute("SELECT \"theKey\", \"Col\", \"theValue\" FROM " + mvname),
+                       row(0, 0, 0)
+            );
+        }
+    }
+
+    @Test
+    public void testAccessAndSchema() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "PRIMARY KEY((k, asciival)))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv1_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE bigintval IS NOT NULL AND k IS NOT NULL AND asciival IS NOT NULL PRIMARY KEY (bigintval, k, asciival)");
+        updateView("INSERT INTO %s(k,asciival,bigintval)VALUES(?,?,?)", 0, "foo", 1L);
+
+        try
+        {
+            updateView("INSERT INTO mv1_test(k,asciival,bigintval) VALUES(?,?,?)", 1, "foo", 2L);
+            Assert.fail("Shouldn't be able to modify a MV directly");
+        }
+        catch (Exception e)
+        {
+        }
+
+        try
+        {
+            executeNet(protocolVersion, "ALTER TABLE mv1_test ADD foo text");
+            Assert.fail("Should not be able to use alter table with MV");
+        }
+        catch (Exception e)
+        {
+        }
+
+        try
+        {
+            executeNet(protocolVersion, "ALTER TABLE mv1_test WITH compaction = { 'class' : 'LeveledCompactionStrategy' }");
+            Assert.fail("Should not be able to use alter table with MV");
+        }
+        catch (Exception e)
+        {
+        }
+
+        executeNet(protocolVersion, "ALTER MATERIALIZED VIEW mv1_test WITH compaction = { 'class' : 'LeveledCompactionStrategy' }");
+
+        //Test alter add
+        executeNet(protocolVersion, "ALTER TABLE %s ADD foo text");
+        CFMetaData metadata = Schema.instance.getCFMetaData(keyspace(), "mv1_test");
+        Assert.assertNotNull(metadata.getColumnDefinition(ByteBufferUtil.bytes("foo")));
+
+        updateView("INSERT INTO %s(k,asciival,bigintval,foo)VALUES(?,?,?,?)", 0, "foo", 1L, "bar");
+        assertRows(execute("SELECT foo from %s"), row("bar"));
+
+        //Test alter rename
+        executeNet(protocolVersion, "ALTER TABLE %s RENAME asciival TO bar");
+
+        assertRows(execute("SELECT bar from %s"), row("foo"));
+        metadata = Schema.instance.getCFMetaData(keyspace(), "mv1_test");
+        Assert.assertNotNull(metadata.getColumnDefinition(ByteBufferUtil.bytes("bar")));
+    }
+
+
+    @Test
+    public void testTwoTablesOneView() throws Throwable
+    {
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createTable("CREATE TABLE " + keyspace() + ".dummy_table (" +
+                "j int, " +
+                "intval int, " +
+                "PRIMARY KEY (j))");
+
+        createTable("CREATE TABLE " + keyspace() + ".real_base (" +
+                "k int, " +
+                "intval int, " +
+                "PRIMARY KEY (k))");
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM " + keyspace() + ".real_base WHERE k IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, k)");
+        createView("mv2", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM " + keyspace() + ".dummy_table WHERE j IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, j)");
+
+        updateView("INSERT INTO " + keyspace() + ".real_base (k, intval) VALUES (?, ?)", 0, 0);
+        assertRows(execute("SELECT k, intval FROM " + keyspace() + ".real_base WHERE k = ?", 0), row(0, 0));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 0), row(0, 0));
+
+        updateView("INSERT INTO " + keyspace() + ".real_base (k, intval) VALUES (?, ?)", 0, 1);
+        assertRows(execute("SELECT k, intval FROM " + keyspace() + ".real_base WHERE k = ?", 0), row(0, 1));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 1), row(0, 1));
+
+        assertRows(execute("SELECT k, intval FROM " + keyspace() + ".real_base WHERE k = ?", 0), row(0, 1));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 1), row(0, 1));
+
+        updateView("INSERT INTO " + keyspace() +".dummy_table (j, intval) VALUES(?, ?)", 0, 1);
+        assertRows(execute("SELECT j, intval FROM " + keyspace() + ".dummy_table WHERE j = ?", 0), row(0, 1));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 1), row(0, 1));
+    }
+
+    @Test
+    public void testReuseName() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "intval int, " +
+                    "PRIMARY KEY (k))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, k)");
+
+        updateView("INSERT INTO %s (k, intval) VALUES (?, ?)", 0, 0);
+        assertRows(execute("SELECT k, intval FROM %s WHERE k = ?", 0), row(0, 0));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 0), row(0, 0));
+
+        executeNet(protocolVersion, "DROP MATERIALIZED VIEW mv");
+        views.remove("mv");
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, k)");
+
+        updateView("INSERT INTO %s (k, intval) VALUES (?, ?)", 0, 1);
+        assertRows(execute("SELECT k, intval FROM %s WHERE k = ?", 0), row(0, 1));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 1), row(0, 1));
+    }
+
+    @Test
+    public void testAllTypes() throws Throwable
+    {
+        String myType = createType("CREATE TYPE %s (a int, b uuid, c set<text>)");
+
+        createTable("CREATE TABLE %s (" +
+                    "k int PRIMARY KEY, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "blobval blob, " +
+                    "booleanval boolean, " +
+                    "dateval date, " +
+                    "decimalval decimal, " +
+                    "doubleval double, " +
+                    "floatval float, " +
+                    "inetval inet, " +
+                    "intval int, " +
+                    "textval text, " +
+                    "timeval time, " +
+                    "timestampval timestamp, " +
+                    "timeuuidval timeuuid, " +
+                    "uuidval uuid," +
+                    "varcharval varchar, " +
+                    "varintval varint, " +
+                    "listval list<int>, " +
+                    "frozenlistval frozen<list<int>>, " +
+                    "setval set<uuid>, " +
+                    "frozensetval frozen<set<uuid>>, " +
+                    "mapval map<ascii, int>," +
+                    "frozenmapval frozen<map<ascii, int>>," +
+                    "tupleval frozen<tuple<int, ascii, uuid>>," +
+                    "udtval frozen<" + myType + ">)");
+
+        CFMetaData metadata = currentTableMetadata();
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        for (ColumnDefinition def : new HashSet<>(metadata.allColumns()))
+        {
+            try
+            {
+                createView("mv_" + def.name, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " + def.name + " IS NOT NULL AND k IS NOT NULL PRIMARY KEY (" + def.name + ",k)");
+
+                if (def.type.isMultiCell())
+                    Assert.fail("MV on a multicell should fail " + def);
+
+                if (def.isPartitionKey())
+                    Assert.fail("MV on partition key should fail " + def);
+            }
+            catch (InvalidQueryException e)
+            {
+                if (!def.type.isMultiCell() && !def.isPartitionKey())
+                    Assert.fail("MV creation failed on " + def);
+            }
+        }
+
+        // fromJson() can only be used when the receiver type is known
+        assertInvalidMessage("fromJson() cannot be used in the selection clause", "SELECT fromJson(asciival) FROM %s", 0, 0);
+
+        String func1 = createFunction(KEYSPACE, "int", "CREATE FUNCTION %s (a int) CALLED ON NULL INPUT RETURNS text LANGUAGE java AS $$ return a.toString(); $$");
+        createFunctionOverload(func1, "int", "CREATE FUNCTION %s (a text) CALLED ON NULL INPUT RETURNS text LANGUAGE java AS $$ return new String(a); $$");
+
+        // ================ ascii ================
+        updateView("INSERT INTO %s (k, asciival) VALUES (?, fromJson(?))", 0, "\"ascii text\"");
+        assertRows(execute("SELECT k, asciival FROM %s WHERE k = ?", 0), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, asciival) VALUES (?, fromJson(?))", 0, "\"ascii \\\" text\"");
+        assertRows(execute("SELECT k, asciival FROM %s WHERE k = ?", 0), row(0, "ascii \" text"));
+
+        // test that we can use fromJson() in other valid places in queries
+        assertRows(execute("SELECT asciival FROM %s WHERE k = fromJson(?)", "0"), row("ascii \" text"));
+
+        //Check the MV
+        assertRows(execute("SELECT k, udtval from mv_asciival WHERE asciival = ?", "ascii text"));
+        assertRows(execute("SELECT k, udtval from mv_asciival WHERE asciival = ?", "ascii \" text"), row(0, null));
+
+        updateView("UPDATE %s SET asciival = fromJson(?) WHERE k = fromJson(?)", "\"ascii \\\" text\"", "0");
+        assertRows(execute("SELECT k, udtval from mv_asciival WHERE asciival = ?", "ascii \" text"), row(0, null));
+
+        updateView("DELETE FROM %s WHERE k = fromJson(?)", "0");
+        assertRows(execute("SELECT k, asciival FROM %s WHERE k = ?", 0));
+        assertRows(execute("SELECT k, udtval from mv_asciival WHERE asciival = ?", "ascii \" text"));
+
+        updateView("INSERT INTO %s (k, asciival) VALUES (?, fromJson(?))", 0, "\"ascii text\"");
+        assertRows(execute("SELECT k, udtval from mv_asciival WHERE asciival = ?", "ascii text"), row(0, null));
+
+        // ================ bigint ================
+        updateView("INSERT INTO %s (k, bigintval) VALUES (?, fromJson(?))", 0, "123123123123");
+        assertRows(execute("SELECT k, bigintval FROM %s WHERE k = ?", 0), row(0, 123123123123L));
+        assertRows(execute("SELECT k, asciival from mv_bigintval WHERE bigintval = ?", 123123123123L), row(0, "ascii text"));
+
+        // ================ blob ================
+        updateView("INSERT INTO %s (k, blobval) VALUES (?, fromJson(?))", 0, "\"0x00000001\"");
+        assertRows(execute("SELECT k, blobval FROM %s WHERE k = ?", 0), row(0, ByteBufferUtil.bytes(1)));
+        assertRows(execute("SELECT k, asciival from mv_blobval WHERE blobval = ?", ByteBufferUtil.bytes(1)), row(0, "ascii text"));
+
+        // ================ boolean ================
+        updateView("INSERT INTO %s (k, booleanval) VALUES (?, fromJson(?))", 0, "true");
+        assertRows(execute("SELECT k, booleanval FROM %s WHERE k = ?", 0), row(0, true));
+        assertRows(execute("SELECT k, asciival from mv_booleanval WHERE booleanval = ?", true), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, booleanval) VALUES (?, fromJson(?))", 0, "false");
+        assertRows(execute("SELECT k, booleanval FROM %s WHERE k = ?", 0), row(0, false));
+        assertRows(execute("SELECT k, asciival from mv_booleanval WHERE booleanval = ?", true));
+        assertRows(execute("SELECT k, asciival from mv_booleanval WHERE booleanval = ?", false), row(0, "ascii text"));
+
+        // ================ date ================
+        updateView("INSERT INTO %s (k, dateval) VALUES (?, fromJson(?))", 0, "\"1987-03-23\"");
+        assertRows(execute("SELECT k, dateval FROM %s WHERE k = ?", 0), row(0, SimpleDateSerializer.dateStringToDays("1987-03-23")));
+        assertRows(execute("SELECT k, asciival from mv_dateval WHERE dateval = fromJson(?)", "\"1987-03-23\""), row(0, "ascii text"));
+
+        // ================ decimal ================
+        updateView("INSERT INTO %s (k, decimalval) VALUES (?, fromJson(?))", 0, "123123.123123");
+        assertRows(execute("SELECT k, decimalval FROM %s WHERE k = ?", 0), row(0, new BigDecimal("123123.123123")));
+        assertRows(execute("SELECT k, asciival from mv_decimalval WHERE decimalval = fromJson(?)", "123123.123123"), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, decimalval) VALUES (?, fromJson(?))", 0, "123123");
+        assertRows(execute("SELECT k, decimalval FROM %s WHERE k = ?", 0), row(0, new BigDecimal("123123")));
+        assertRows(execute("SELECT k, asciival from mv_decimalval WHERE decimalval = fromJson(?)", "123123.123123"));
+        assertRows(execute("SELECT k, asciival from mv_decimalval WHERE decimalval = fromJson(?)", "123123"), row(0, "ascii text"));
+
+        // accept strings for numbers that cannot be represented as doubles
+        updateView("INSERT INTO %s (k, decimalval) VALUES (?, fromJson(?))", 0, "\"123123.123123\"");
+        assertRows(execute("SELECT k, decimalval FROM %s WHERE k = ?", 0), row(0, new BigDecimal("123123.123123")));
+
+        updateView("INSERT INTO %s (k, decimalval) VALUES (?, fromJson(?))", 0, "\"-1.23E-12\"");
+        assertRows(execute("SELECT k, decimalval FROM %s WHERE k = ?", 0), row(0, new BigDecimal("-1.23E-12")));
+        assertRows(execute("SELECT k, asciival from mv_decimalval WHERE decimalval = fromJson(?)", "\"-1.23E-12\""), row(0, "ascii text"));
+
+        // ================ double ================
+        updateView("INSERT INTO %s (k, doubleval) VALUES (?, fromJson(?))", 0, "123123.123123");
+        assertRows(execute("SELECT k, doubleval FROM %s WHERE k = ?", 0), row(0, 123123.123123d));
+        assertRows(execute("SELECT k, asciival from mv_doubleval WHERE doubleval = fromJson(?)", "123123.123123"), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, doubleval) VALUES (?, fromJson(?))", 0, "123123");
+        assertRows(execute("SELECT k, doubleval FROM %s WHERE k = ?", 0), row(0, 123123.0d));
+        assertRows(execute("SELECT k, asciival from mv_doubleval WHERE doubleval = fromJson(?)", "123123"), row(0, "ascii text"));
+
+        // ================ float ================
+        updateView("INSERT INTO %s (k, floatval) VALUES (?, fromJson(?))", 0, "123123.123123");
+        assertRows(execute("SELECT k, floatval FROM %s WHERE k = ?", 0), row(0, 123123.123123f));
+        assertRows(execute("SELECT k, asciival from mv_floatval WHERE floatval = fromJson(?)", "123123.123123"), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, floatval) VALUES (?, fromJson(?))", 0, "123123");
+        assertRows(execute("SELECT k, floatval FROM %s WHERE k = ?", 0), row(0, 123123.0f));
+        assertRows(execute("SELECT k, asciival from mv_floatval WHERE floatval = fromJson(?)", "123123"), row(0, "ascii text"));
+
+        // ================ inet ================
+        updateView("INSERT INTO %s (k, inetval) VALUES (?, fromJson(?))", 0, "\"127.0.0.1\"");
+        assertRows(execute("SELECT k, inetval FROM %s WHERE k = ?", 0), row(0, InetAddress.getByName("127.0.0.1")));
+        assertRows(execute("SELECT k, asciival from mv_inetval WHERE inetval = fromJson(?)", "\"127.0.0.1\""), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, inetval) VALUES (?, fromJson(?))", 0, "\"::1\"");
+        assertRows(execute("SELECT k, inetval FROM %s WHERE k = ?", 0), row(0, InetAddress.getByName("::1")));
+        assertRows(execute("SELECT k, asciival from mv_inetval WHERE inetval = fromJson(?)", "\"127.0.0.1\""));
+        assertRows(execute("SELECT k, asciival from mv_inetval WHERE inetval = fromJson(?)", "\"::1\""), row(0, "ascii text"));
+
+        // ================ int ================
+        updateView("INSERT INTO %s (k, intval) VALUES (?, fromJson(?))", 0, "123123");
+        assertRows(execute("SELECT k, intval FROM %s WHERE k = ?", 0), row(0, 123123));
+        assertRows(execute("SELECT k, asciival from mv_intval WHERE intval = fromJson(?)", "123123"), row(0, "ascii text"));
+
+        // ================ text (varchar) ================
+        updateView("INSERT INTO %s (k, textval) VALUES (?, fromJson(?))", 0, "\"some \\\" text\"");
+        assertRows(execute("SELECT k, textval FROM %s WHERE k = ?", 0), row(0, "some \" text"));
+
+        updateView("INSERT INTO %s (k, textval) VALUES (?, fromJson(?))", 0, "\"\\u2013\"");
+        assertRows(execute("SELECT k, textval FROM %s WHERE k = ?", 0), row(0, "\u2013"));
+        assertRows(execute("SELECT k, asciival from mv_textval WHERE textval = fromJson(?)", "\"\\u2013\""), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, textval) VALUES (?, fromJson(?))", 0, "\"abcd\"");
+        assertRows(execute("SELECT k, textval FROM %s WHERE k = ?", 0), row(0, "abcd"));
+        assertRows(execute("SELECT k, asciival from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, "ascii text"));
+
+        // ================ time ================
+        updateView("INSERT INTO %s (k, timeval) VALUES (?, fromJson(?))", 0, "\"07:35:07.000111222\"");
+        assertRows(execute("SELECT k, timeval FROM %s WHERE k = ?", 0), row(0, TimeSerializer.timeStringToLong("07:35:07.000111222")));
+        assertRows(execute("SELECT k, asciival from mv_timeval WHERE timeval = fromJson(?)", "\"07:35:07.000111222\""), row(0, "ascii text"));
+
+        // ================ timestamp ================
+        updateView("INSERT INTO %s (k, timestampval) VALUES (?, fromJson(?))", 0, "123123123123");
+        assertRows(execute("SELECT k, timestampval FROM %s WHERE k = ?", 0), row(0, new Date(123123123123L)));
+        assertRows(execute("SELECT k, asciival from mv_timestampval WHERE timestampval = fromJson(?)", "123123123123"), row(0, "ascii text"));
+
+        updateView("INSERT INTO %s (k, timestampval) VALUES (?, fromJson(?))", 0, "\"2014-01-01\"");
+        assertRows(execute("SELECT k, timestampval FROM %s WHERE k = ?", 0), row(0, new SimpleDateFormat("y-M-d").parse("2014-01-01")));
+        assertRows(execute("SELECT k, asciival from mv_timestampval WHERE timestampval = fromJson(?)", "\"2014-01-01\""), row(0, "ascii text"));
+
+        // ================ timeuuid ================
+        updateView("INSERT INTO %s (k, timeuuidval) VALUES (?, fromJson(?))", 0, "\"6bddc89a-5644-11e4-97fc-56847afe9799\"");
+        assertRows(execute("SELECT k, timeuuidval FROM %s WHERE k = ?", 0), row(0, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")));
+
+        updateView("INSERT INTO %s (k, timeuuidval) VALUES (?, fromJson(?))", 0, "\"6BDDC89A-5644-11E4-97FC-56847AFE9799\"");
+        assertRows(execute("SELECT k, timeuuidval FROM %s WHERE k = ?", 0), row(0, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")));
+        assertRows(execute("SELECT k, asciival from mv_timeuuidval WHERE timeuuidval = fromJson(?)", "\"6BDDC89A-5644-11E4-97FC-56847AFE9799\""), row(0, "ascii text"));
+
+        // ================ uuidval ================
+        updateView("INSERT INTO %s (k, uuidval) VALUES (?, fromJson(?))", 0, "\"6bddc89a-5644-11e4-97fc-56847afe9799\"");
+        assertRows(execute("SELECT k, uuidval FROM %s WHERE k = ?", 0), row(0, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")));
+
+        updateView("INSERT INTO %s (k, uuidval) VALUES (?, fromJson(?))", 0, "\"6BDDC89A-5644-11E4-97FC-56847AFE9799\"");
+        assertRows(execute("SELECT k, uuidval FROM %s WHERE k = ?", 0), row(0, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")));
+        assertRows(execute("SELECT k, asciival from mv_uuidval WHERE uuidval = fromJson(?)", "\"6BDDC89A-5644-11E4-97FC-56847AFE9799\""), row(0, "ascii text"));
+
+        // ================ varint ================
+        updateView("INSERT INTO %s (k, varintval) VALUES (?, fromJson(?))", 0, "123123123123");
+        assertRows(execute("SELECT k, varintval FROM %s WHERE k = ?", 0), row(0, new BigInteger("123123123123")));
+        assertRows(execute("SELECT k, asciival from mv_varintval WHERE varintval = fromJson(?)", "123123123123"), row(0, "ascii text"));
+
+        // accept strings for numbers that cannot be represented as longs
+        updateView("INSERT INTO %s (k, varintval) VALUES (?, fromJson(?))", 0, "\"1234567890123456789012345678901234567890\"");
+        assertRows(execute("SELECT k, varintval FROM %s WHERE k = ?", 0), row(0, new BigInteger("1234567890123456789012345678901234567890")));
+        assertRows(execute("SELECT k, asciival from mv_varintval WHERE varintval = fromJson(?)", "\"1234567890123456789012345678901234567890\""), row(0, "ascii text"));
+
+        // ================ lists ================
+        updateView("INSERT INTO %s (k, listval) VALUES (?, fromJson(?))", 0, "[1, 2, 3]");
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(1, 2, 3)));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(1, 2, 3)));
+
+        updateView("INSERT INTO %s (k, listval) VALUES (?, fromJson(?))", 0, "[1]");
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(1)));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(1)));
+
+        updateView("UPDATE %s SET listval = listval + fromJson(?) WHERE k = ?", "[2]", 0);
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(1, 2)));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(1, 2)));
+
+        updateView("UPDATE %s SET listval = fromJson(?) + listval WHERE k = ?", "[0]", 0);
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(0, 1, 2)));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(0, 1, 2)));
+
+        updateView("UPDATE %s SET listval[1] = fromJson(?) WHERE k = ?", "10", 0);
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(0, 10, 2)));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(0, 10, 2)));
+
+        updateView("DELETE listval[1] FROM %s WHERE k = ?", 0);
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(0, 2)));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(0, 2)));
+
+        updateView("INSERT INTO %s (k, listval) VALUES (?, fromJson(?))", 0, "[]");
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, null));
+        assertRows(execute("SELECT k, listval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, null));
+
+        // frozen
+        updateView("INSERT INTO %s (k, frozenlistval) VALUES (?, fromJson(?))", 0, "[1, 2, 3]");
+        assertRows(execute("SELECT k, frozenlistval FROM %s WHERE k = ?", 0), row(0, list(1, 2, 3)));
+        assertRows(execute("SELECT k, frozenlistval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(1, 2, 3)));
+        assertRows(execute("SELECT k, textval from mv_frozenlistval where frozenlistval = fromJson(?)", "[1, 2, 3]"), row(0, "abcd"));
+
+        updateView("INSERT INTO %s (k, frozenlistval) VALUES (?, fromJson(?))", 0, "[3, 2, 1]");
+        assertRows(execute("SELECT k, frozenlistval FROM %s WHERE k = ?", 0), row(0, list(3, 2, 1)));
+        assertRows(execute("SELECT k, textval from mv_frozenlistval where frozenlistval = fromJson(?)", "[1, 2, 3]"));
+        assertRows(execute("SELECT k, textval from mv_frozenlistval where frozenlistval = fromJson(?)", "[3, 2, 1]"), row(0, "abcd"));
+        assertRows(execute("SELECT k, frozenlistval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list(3, 2, 1)));
+
+        updateView("INSERT INTO %s (k, frozenlistval) VALUES (?, fromJson(?))", 0, "[]");
+        assertRows(execute("SELECT k, frozenlistval FROM %s WHERE k = ?", 0), row(0, list()));
+        assertRows(execute("SELECT k, frozenlistval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, list()));
+
+        // ================ sets ================
+        updateView("INSERT INTO %s (k, setval) VALUES (?, fromJson(?))",
+                   0, "[\"6bddc89a-5644-11e4-97fc-56847afe9798\", \"6bddc89a-5644-11e4-97fc-56847afe9799\"]");
+        assertRows(execute("SELECT k, setval FROM %s WHERE k = ?", 0),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))))
+        );
+        assertRows(execute("SELECT k, setval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))));
+
+        // duplicates are okay, just like in CQL
+        updateView("INSERT INTO %s (k, setval) VALUES (?, fromJson(?))",
+                   0, "[\"6bddc89a-5644-11e4-97fc-56847afe9798\", \"6bddc89a-5644-11e4-97fc-56847afe9798\", \"6bddc89a-5644-11e4-97fc-56847afe9799\"]");
+        assertRows(execute("SELECT k, setval FROM %s WHERE k = ?", 0),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))))
+        );
+        assertRows(execute("SELECT k, setval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))));
+
+        updateView("UPDATE %s SET setval = setval + fromJson(?) WHERE k = ?", "[\"6bddc89a-5644-0000-97fc-56847afe9799\"]", 0);
+        assertRows(execute("SELECT k, setval FROM %s WHERE k = ?", 0),
+                   row(0, set(UUID.fromString("6bddc89a-5644-0000-97fc-56847afe9799"), UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))))
+        );
+        assertRows(execute("SELECT k, setval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, set(UUID.fromString("6bddc89a-5644-0000-97fc-56847afe9799"), UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))));
+
+        updateView("UPDATE %s SET setval = setval - fromJson(?) WHERE k = ?", "[\"6bddc89a-5644-0000-97fc-56847afe9799\"]", 0);
+        assertRows(execute("SELECT k, setval FROM %s WHERE k = ?", 0),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))))
+        );
+        assertRows(execute("SELECT k, setval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))));
+
+        updateView("INSERT INTO %s (k, setval) VALUES (?, fromJson(?))", 0, "[]");
+        assertRows(execute("SELECT k, setval FROM %s WHERE k = ?", 0), row(0, null));
+        assertRows(execute("SELECT k, setval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, null));
+
+
+        // frozen
+        updateView("INSERT INTO %s (k, frozensetval) VALUES (?, fromJson(?))",
+                   0, "[\"6bddc89a-5644-11e4-97fc-56847afe9798\", \"6bddc89a-5644-11e4-97fc-56847afe9799\"]");
+        assertRows(execute("SELECT k, frozensetval FROM %s WHERE k = ?", 0),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))))
+        );
+        assertRows(execute("SELECT k, frozensetval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, set(UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))));
+
+        updateView("INSERT INTO %s (k, frozensetval) VALUES (?, fromJson(?))",
+                   0, "[\"6bddc89a-0000-11e4-97fc-56847afe9799\", \"6bddc89a-5644-11e4-97fc-56847afe9798\"]");
+        assertRows(execute("SELECT k, frozensetval FROM %s WHERE k = ?", 0),
+                   row(0, set(UUID.fromString("6bddc89a-0000-11e4-97fc-56847afe9799"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798"))))
+        );
+        assertRows(execute("SELECT k, frozensetval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, set(UUID.fromString("6bddc89a-0000-11e4-97fc-56847afe9799"), (UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9798")))));
+
+        // ================ maps ================
+        updateView("INSERT INTO %s (k, mapval) VALUES (?, fromJson(?))", 0, "{\"a\": 1, \"b\": 2}");
+        assertRows(execute("SELECT k, mapval FROM %s WHERE k = ?", 0), row(0, map("a", 1, "b", 2)));
+        assertRows(execute("SELECT k, mapval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""), row(0, map("a", 1, "b", 2)));
+
+        updateView("UPDATE %s SET mapval[?] = ?  WHERE k = ?", "c", 3, 0);
+        assertRows(execute("SELECT k, mapval FROM %s WHERE k = ?", 0),
+                   row(0, map("a", 1, "b", 2, "c", 3))
+        );
+        assertRows(execute("SELECT k, mapval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, map("a", 1, "b", 2, "c", 3)));
+
+        updateView("UPDATE %s SET mapval[?] = ?  WHERE k = ?", "b", 10, 0);
+        assertRows(execute("SELECT k, mapval FROM %s WHERE k = ?", 0),
+                   row(0, map("a", 1, "b", 10, "c", 3))
+        );
+        assertRows(execute("SELECT k, mapval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, map("a", 1, "b", 10, "c", 3)));
+
+        updateView("DELETE mapval[?] FROM %s WHERE k = ?", "b", 0);
+        assertRows(execute("SELECT k, mapval FROM %s WHERE k = ?", 0),
+                   row(0, map("a", 1, "c", 3))
+        );
+        assertRows(execute("SELECT k, mapval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, map("a", 1, "c", 3)));
+
+        updateView("INSERT INTO %s (k, mapval) VALUES (?, fromJson(?))", 0, "{}");
+        assertRows(execute("SELECT k, mapval FROM %s WHERE k = ?", 0), row(0, null));
+        assertRows(execute("SELECT k, mapval from mv_textval WHERE textval = fromJson(?)", "\"abcd\""),
+                   row(0, null));
+
+        // frozen
+        updateView("INSERT INTO %s (k, frozenmapval) VALUES (?, fromJson(?))", 0, "{\"a\": 1, \"b\": 2}");
+        assertRows(execute("SELECT k, frozenmapval FROM %s WHERE k = ?", 0), row(0, map("a", 1, "b", 2)));
+        assertRows(execute("SELECT k, textval FROM mv_frozenmapval WHERE frozenmapval = fromJson(?)", "{\"a\": 1, \"b\": 2}"), row(0, "abcd"));
+
+        updateView("INSERT INTO %s (k, frozenmapval) VALUES (?, fromJson(?))", 0, "{\"b\": 2, \"a\": 3}");
+        assertRows(execute("SELECT k, frozenmapval FROM %s WHERE k = ?", 0), row(0, map("a", 3, "b", 2)));
+        assertRows(execute("SELECT k, frozenmapval FROM %s WHERE k = ?", 0), row(0, map("a", 3, "b", 2)));
+
+        // ================ tuples ================
+        updateView("INSERT INTO %s (k, tupleval) VALUES (?, fromJson(?))", 0, "[1, \"foobar\", \"6bddc89a-5644-11e4-97fc-56847afe9799\"]");
+        assertRows(execute("SELECT k, tupleval FROM %s WHERE k = ?", 0),
+                   row(0, tuple(1, "foobar", UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))
+        );
+        assertRows(execute("SELECT k, textval FROM mv_tupleval WHERE tupleval = ?", tuple(1, "foobar", UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))),
+                   row(0, "abcd"));
+
+        updateView("INSERT INTO %s (k, tupleval) VALUES (?, fromJson(?))", 0, "[1, null, \"6bddc89a-5644-11e4-97fc-56847afe9799\"]");
+        assertRows(execute("SELECT k, tupleval FROM %s WHERE k = ?", 0),
+                   row(0, tuple(1, null, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799")))
+        );
+        assertRows(execute("SELECT k, textval FROM mv_tupleval WHERE tupleval = ?", tuple(1, "foobar", UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))));
+        assertRows(execute("SELECT k, textval FROM mv_tupleval WHERE tupleval = ?", tuple(1, null, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"))),
+                   row(0, "abcd"));
+
+        // ================ UDTs ================
+        updateView("INSERT INTO %s (k, udtval) VALUES (?, fromJson(?))", 0, "{\"a\": 1, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}");
+        assertRows(execute("SELECT k, udtval.a, udtval.b, udtval.c FROM %s WHERE k = ?", 0),
+                   row(0, 1, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"), set("bar", "foo"))
+        );
+        assertRows(execute("SELECT k, textval FROM mv_udtval WHERE udtval = fromJson(?)", "{\"a\": 1, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}"),
+                   row(0, "abcd"));
+
+        // order of fields shouldn't matter
+        updateView("INSERT INTO %s (k, udtval) VALUES (?, fromJson(?))", 0, "{\"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"a\": 1, \"c\": [\"foo\", \"bar\"]}");
+        assertRows(execute("SELECT k, udtval.a, udtval.b, udtval.c FROM %s WHERE k = ?", 0),
+                   row(0, 1, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"), set("bar", "foo"))
+        );
+        assertRows(execute("SELECT k, textval FROM mv_udtval WHERE udtval = fromJson(?)", "{\"a\": 1, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}"),
+                   row(0, "abcd"));
+
+        // test nulls
+        updateView("INSERT INTO %s (k, udtval) VALUES (?, fromJson(?))", 0, "{\"a\": null, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}");
+        assertRows(execute("SELECT k, udtval.a, udtval.b, udtval.c FROM %s WHERE k = ?", 0),
+                   row(0, null, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"), set("bar", "foo"))
+        );
+        assertRows(execute("SELECT k, textval FROM mv_udtval WHERE udtval = fromJson(?)", "{\"a\": 1, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}"));
+        assertRows(execute("SELECT k, textval FROM mv_udtval WHERE udtval = fromJson(?)", "{\"a\": null, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}"),
+                   row(0, "abcd"));
+
+        // test missing fields
+        updateView("INSERT INTO %s (k, udtval) VALUES (?, fromJson(?))", 0, "{\"a\": 1, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\"}");
+        assertRows(execute("SELECT k, udtval.a, udtval.b, udtval.c FROM %s WHERE k = ?", 0),
+                   row(0, 1, UUID.fromString("6bddc89a-5644-11e4-97fc-56847afe9799"), null)
+        );
+        assertRows(execute("SELECT k, textval FROM mv_udtval WHERE udtval = fromJson(?)", "{\"a\": null, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\", \"c\": [\"foo\", \"bar\"]}"));
+        assertRows(execute("SELECT k, textval FROM mv_udtval WHERE udtval = fromJson(?)", "{\"a\": 1, \"b\": \"6bddc89a-5644-11e4-97fc-56847afe9799\"}"),
+                   row(0, "abcd"));
+    }
+
+    @Test
+    public void testDropTableWithMV() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                "a int," +
+                "b int," +
+                "c int," +
+                "d int," +
+                "PRIMARY KEY (a, b, c))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView(keyspace() + ".mv1",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (a, b, c)");
+
+        try
+        {
+            executeNet(protocolVersion, "DROP TABLE " + keyspace() + ".mv1");
+            Assert.fail();
+        }
+        catch (InvalidQueryException e)
+        {
+            Assert.assertEquals("Cannot use DROP TABLE on Materialized View", e.getMessage());
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/ViewTest.java b/test/unit/org/apache/cassandra/cql3/ViewTest.java
new file mode 100644
index 0000000..0d49e4b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/ViewTest.java

@@ -0,0 +1,1411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import static org.junit.Assert.*;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import com.google.common.util.concurrent.Uninterruptibles;
+
+import junit.framework.Assert;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.concurrent.SEPExecutor;
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class ViewTest extends CQLTester
+{
+    int protocolVersion = 4;
+    private final List<String> views = new ArrayList<>();
+
+    @BeforeClass
+    public static void startup()
+    {
+        requireNetwork();
+    }
+    @Before
+    public void begin()
+    {
+        views.clear();
+    }
+
+    @After
+    public void end() throws Throwable
+    {
+        for (String viewName : views)
+            executeNet(protocolVersion, "DROP MATERIALIZED VIEW " + viewName);
+    }
+
+    private void createView(String name, String query) throws Throwable
+    {
+        executeNet(protocolVersion, String.format(query, name));
+        // If exception is thrown, the view will not be added to the list; since it shouldn't have been created, this is
+        // the desired behavior
+        views.add(name);
+    }
+
+    private void updateView(String query, Object... params) throws Throwable
+    {
+        executeNet(protocolVersion, query, params);
+        while (!(((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getPendingTasks() == 0
+                && ((SEPExecutor) StageManager.getStage(Stage.VIEW_MUTATION)).getActiveCount() == 0))
+        {
+            Thread.sleep(1);
+        }
+    }
+
+    @Test
+    public void testNonExistingOnes() throws Throwable
+    {
+        assertInvalidMessage("Cannot drop non existing materialized view", "DROP MATERIALIZED VIEW " + KEYSPACE + ".view_does_not_exist");
+        assertInvalidMessage("Cannot drop non existing materialized view", "DROP MATERIALIZED VIEW keyspace_does_not_exist.view_does_not_exist");
+
+        execute("DROP MATERIALIZED VIEW IF EXISTS " + KEYSPACE + ".view_does_not_exist");
+        execute("DROP MATERIALIZED VIEW IF EXISTS keyspace_does_not_exist.view_does_not_exist");
+    }
+
+    @Test
+    public void testExistingRangeTombstoneWithFlush() throws Throwable
+    {
+        testExistingRangeTombstone(true);
+    }
+
+    @Test
+    public void testExistingRangeTombstoneWithoutFlush() throws Throwable
+    {
+        testExistingRangeTombstone(false);
+    }
+
+    public void testExistingRangeTombstone(boolean flush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, c1 int, c2 int, v1 int, v2 int, PRIMARY KEY (k1, c1, c2))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("view1",
+                   "CREATE MATERIALIZED VIEW view1 AS SELECT * FROM %%s WHERE k1 IS NOT NULL AND c1 IS NOT NULL AND c2 IS NOT NULL PRIMARY KEY (k1, c2, c1)");
+
+        updateView("DELETE FROM %s USING TIMESTAMP 10 WHERE k1 = 1 and c1=1");
+
+        if (flush)
+            Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+
+        String table = KEYSPACE + "." + currentTable();
+        updateView("BEGIN BATCH " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 0, 0, 0, 0) USING TIMESTAMP 5; " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 0, 1, 0, 1) USING TIMESTAMP 5; " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 1, 0, 1, 0) USING TIMESTAMP 5; " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 1, 1, 1, 1) USING TIMESTAMP 5; " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 1, 2, 1, 2) USING TIMESTAMP 5; " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 1, 3, 1, 3) USING TIMESTAMP 5; " +
+                "INSERT INTO " + table + " (k1, c1, c2, v1, v2) VALUES (1, 2, 0, 2, 0) USING TIMESTAMP 5; " +
+                "APPLY BATCH");
+
+        assertRowsIgnoringOrder(execute("select * from %s"),
+                                row(1, 0, 0, 0, 0),
+                                row(1, 0, 1, 0, 1),
+                                row(1, 2, 0, 2, 0));
+        assertRowsIgnoringOrder(execute("select k1,c1,c2,v1,v2 from view1"),
+                                row(1, 0, 0, 0, 0),
+                                row(1, 0, 1, 0, 1),
+                                row(1, 2, 0, 2, 0));
+    }
+
+    @Test
+    public void testPartitionTombstone() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, c1 int , val int, PRIMARY KEY (k1, c1))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("view1", "CREATE MATERIALIZED VIEW view1 AS SELECT k1 FROM %%s WHERE k1 IS NOT NULL AND c1 IS NOT NULL AND val IS NOT NULL PRIMARY KEY (val, k1, c1)");
+
+        updateView("INSERT INTO %s (k1, c1, val) VALUES (1, 2, 200)");
+        updateView("INSERT INTO %s (k1, c1, val) VALUES (1, 3, 300)");
+
+        Assert.assertEquals(2, execute("select * from %s").size());
+        Assert.assertEquals(2, execute("select * from view1").size());
+
+        updateView("DELETE FROM %s WHERE k1 = 1");
+
+        Assert.assertEquals(0, execute("select * from %s").size());
+        Assert.assertEquals(0, execute("select * from view1").size());
+    }
+
+    @Test
+    public void testClusteringKeyTombstone() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, c1 int , val int, PRIMARY KEY (k1, c1))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("view1", "CREATE MATERIALIZED VIEW view1 AS SELECT k1 FROM %%s WHERE k1 IS NOT NULL AND c1 IS NOT NULL AND val IS NOT NULL PRIMARY KEY (val, k1, c1)");
+
+        updateView("INSERT INTO %s (k1, c1, val) VALUES (1, 2, 200)");
+        updateView("INSERT INTO %s (k1, c1, val) VALUES (1, 3, 300)");
+
+        Assert.assertEquals(2, execute("select * from %s").size());
+        Assert.assertEquals(2, execute("select * from view1").size());
+
+        updateView("DELETE FROM %s WHERE k1 = 1 and c1 = 3");
+
+        Assert.assertEquals(1, execute("select * from %s").size());
+        Assert.assertEquals(1, execute("select * from view1").size());
+    }
+
+    @Test
+    public void testPrimaryKeyIsNotNull() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "PRIMARY KEY((k, asciival)))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        // Must include "IS NOT NULL" for primary keys
+        try
+        {
+            createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s");
+            Assert.fail("Should fail if no primary key is filtered as NOT NULL");
+        }
+        catch (Exception e)
+        {
+        }
+
+        // Must include both when the partition key is composite
+        try
+        {
+            createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE bigintval IS NOT NULL AND asciival IS NOT NULL PRIMARY KEY (bigintval, k, asciival)");
+            Assert.fail("Should fail if compound primary is not completely filtered as NOT NULL");
+        }
+        catch (Exception e)
+        {
+        }
+
+        dropTable("DROP TABLE %s");
+
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "PRIMARY KEY(k, asciival))");
+        try
+        {
+            createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s");
+            Assert.fail("Should fail if no primary key is filtered as NOT NULL");
+        }
+        catch (Exception e)
+        {
+        }
+
+        // Can omit "k IS NOT NULL" because we have a sinlge partition key
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE bigintval IS NOT NULL AND asciival IS NOT NULL PRIMARY KEY (bigintval, k, asciival)");
+    }
+
+    @Test
+    public void testStaticTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "c int, " +
+                    "sval text static, " +
+                    "val text, " +
+                    "PRIMARY KEY(k,c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        try
+        {
+            createView("mv_static", "CREATE MATERIALIZED VIEW %%s AS SELECT * FROM %s WHERE sval IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (sval,k,c)");
+            Assert.fail("Use of static column in a MV primary key should fail");
+        }
+        catch (InvalidQueryException e)
+        {
+        }
+
+        try
+        {
+            createView("mv_static", "CREATE MATERIALIZED VIEW %%s AS SELECT val, sval FROM %s WHERE val IS NOT NULL AND  k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val, k, c)");
+            Assert.fail("Explicit select of static column in MV should fail");
+        }
+        catch (InvalidQueryException e)
+        {
+        }
+
+        try
+        {
+            createView("mv_static", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
+            Assert.fail("Implicit select of static column in MV should fail");
+        }
+        catch (InvalidQueryException e)
+        {
+        }
+
+        createView("mv_static", "CREATE MATERIALIZED VIEW %s AS SELECT val,k,c FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
+
+        for (int i = 0; i < 100; i++)
+            updateView("INSERT into %s (k,c,sval,val)VALUES(?,?,?,?)", 0, i % 2, "bar" + i, "baz");
+
+        Assert.assertEquals(2, execute("select * from %s").size());
+
+        assertRows(execute("SELECT sval from %s"), row("bar99"), row("bar99"));
+
+        Assert.assertEquals(2, execute("select * from mv_static").size());
+
+        assertInvalid("SELECT sval from mv_static");
+    }
+
+
+    @Test
+    public void testOldTimestamps() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "c int, " +
+                    "val text, " +
+                    "PRIMARY KEY(k,c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv_tstest", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
+
+        for (int i = 0; i < 100; i++)
+            updateView("INSERT into %s (k,c,val)VALUES(?,?,?)", 0, i % 2, "baz");
+
+        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+
+        Assert.assertEquals(2, execute("select * from %s").size());
+        Assert.assertEquals(2, execute("select * from mv_tstest").size());
+
+        assertRows(execute("SELECT val from %s where k = 0 and c = 0"), row("baz"));
+        assertRows(execute("SELECT c from mv_tstest where k = 0 and val = ?", "baz"), row(0), row(1));
+
+        //Make sure an old TS does nothing
+        updateView("UPDATE %s USING TIMESTAMP 100 SET val = ? where k = ? AND c = ?", "bar", 0, 0);
+        assertRows(execute("SELECT val from %s where k = 0 and c = 0"), row("baz"));
+        assertRows(execute("SELECT c from mv_tstest where k = 0 and val = ?", "baz"), row(0), row(1));
+        assertRows(execute("SELECT c from mv_tstest where k = 0 and val = ?", "bar"));
+
+        //Latest TS
+        updateView("UPDATE %s SET val = ? where k = ? AND c = ?", "bar", 0, 0);
+        assertRows(execute("SELECT val from %s where k = 0 and c = 0"), row("bar"));
+        assertRows(execute("SELECT c from mv_tstest where k = 0 and val = ?", "bar"), row(0));
+        assertRows(execute("SELECT c from mv_tstest where k = 0 and val = ?", "baz"), row(1));
+    }
+
+    @Test
+    public void testRegularColumnTimestampUpdates() throws Throwable
+    {
+        // Regression test for CASSANDRA-10910
+
+        createTable("CREATE TABLE %s (" +
+                    "k int PRIMARY KEY, " +
+                    "c int, " +
+                    "val int)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv_rctstest", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (k,c)");
+
+        updateView("UPDATE %s SET c = ?, val = ? WHERE k = ?", 0, 0, 0);
+        updateView("UPDATE %s SET val = ? WHERE k = ?", 1, 0);
+        updateView("UPDATE %s SET c = ? WHERE k = ?", 1, 0);
+        assertRows(execute("SELECT c, k, val FROM mv_rctstest"), row(1, 0, 1));
+
+        updateView("TRUNCATE %s");
+
+        updateView("UPDATE %s USING TIMESTAMP 1 SET c = ?, val = ? WHERE k = ?", 0, 0, 0);
+        updateView("UPDATE %s USING TIMESTAMP 3 SET c = ? WHERE k = ?", 1, 0);
+        updateView("UPDATE %s USING TIMESTAMP 2 SET val = ? WHERE k = ?", 1, 0);
+        updateView("UPDATE %s USING TIMESTAMP 4 SET c = ? WHERE k = ?", 2, 0);
+        updateView("UPDATE %s USING TIMESTAMP 3 SET val = ? WHERE k = ?", 2, 0);
+
+        assertRows(execute("SELECT c, k, val FROM mv_rctstest"), row(2, 0, 2));
+        assertRows(execute("SELECT c, k, val FROM mv_rctstest limit 1"), row(2, 0, 2));
+    }
+
+    @Test
+    public void testCountersTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int PRIMARY KEY, " +
+                    "count counter)");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        try
+        {
+            createView("mv_counter", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE count IS NOT NULL AND k IS NOT NULL PRIMARY KEY (count,k)");
+            Assert.fail("MV on counter should fail");
+        }
+        catch (InvalidQueryException e)
+        {
+        }
+    }
+
+    @Test
+    public void testSuperCoumn() throws Throwable
+    {
+        String keyspace = createKeyspaceName();
+        String table = createTableName();
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.superCFMD(keyspace, table, AsciiType.instance, AsciiType.instance));
+
+        execute("USE " + keyspace);
+        executeNet(protocolVersion, "USE " + keyspace);
+
+        try
+        {
+            createView("mv_super_column", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM " + keyspace + "." + table + " WHERE key IS NOT NULL AND column1 IS NOT NULL PRIMARY KEY (key,column1)");
+            Assert.fail("MV on SuperColumn table should fail");
+        }
+        catch (InvalidQueryException e)
+        {
+            assertEquals("Materialized views are not supported on SuperColumn tables", e.getMessage());
+        }
+    }
+
+    @Test
+    public void testDurationsTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int PRIMARY KEY, " +
+                    "c int, " +
+                    "val int) WITH default_time_to_live = 60");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        // Must NOT include "default_time_to_live" for Materialized View creation
+        try
+        {
+            createView("mv_ttl1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (k,c) WITH default_time_to_live = 30");
+            Assert.fail("Should fail if TTL is provided for materialized view");
+        }
+        catch (Exception e)
+        {
+        }
+    }
+
+    @Test
+    public void testAlterMvWithTTL() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int PRIMARY KEY, " +
+                    "c int, " +
+                    "val int) WITH default_time_to_live = 60");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv_ttl2", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (k,c)");
+
+        // Must NOT include "default_time_to_live" on alter Materialized View
+        try
+        {
+            executeNet(protocolVersion, "ALTER MATERIALIZED VIEW %s WITH default_time_to_live = 30");
+            Assert.fail("Should fail if TTL is provided while altering materialized view");
+        }
+        catch (Exception e)
+        {
+        }
+    }
+
+    @Test
+    public void complexTimestampUpdateTestWithFlush() throws Throwable
+    {
+        complexTimestampUpdateTest(true);
+    }
+
+    @Test
+    public void complexTimestampUpdateTestWithoutFlush() throws Throwable
+    {
+        complexTimestampUpdateTest(false);
+    }
+
+    public void complexTimestampUpdateTest(boolean flush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, PRIMARY KEY (a, b))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+        Keyspace ks = Keyspace.open(keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, a, b)");
+        ks.getColumnFamilyStore("mv").disableAutoCompaction();
+
+        //Set initial values TS=0, leaving e null and verify view
+        executeNet(protocolVersion, "INSERT INTO %s (a, b, c, d) VALUES (0, 0, 1, 0) USING TIMESTAMP 0");
+        assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        //update c's timestamp TS=2
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 2 SET c = ? WHERE a = ? and b = ? ", 1, 0, 0);
+        assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+        // change c's value and TS=3, tombstones c=1 and adds c=0 record
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 3 SET c = ? WHERE a = ? and b = ? ", 0, 0, 0);
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+        assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0));
+
+        if(flush)
+        {
+            ks.getColumnFamilyStore("mv").forceMajorCompaction();
+            FBUtilities.waitOnFutures(ks.flush());
+        }
+
+
+        //change c's value back to 1 with TS=4, check we can see d
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 4 SET c = ? WHERE a = ? and b = ? ", 1, 0, 0);
+        if (flush)
+        {
+            ks.getColumnFamilyStore("mv").forceMajorCompaction();
+            FBUtilities.waitOnFutures(ks.flush());
+        }
+
+        assertRows(execute("SELECT d,e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, null));
+
+
+        //Add e value @ TS=1
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 1 SET e = ? WHERE a = ? and b = ? ", 1, 0, 0);
+        assertRows(execute("SELECT d,e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, 1));
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+
+        //Change d value @ TS=2
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 2 SET d = ? WHERE a = ? and b = ? ", 2, 0, 0);
+        assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(2));
+
+        if (flush)
+            FBUtilities.waitOnFutures(ks.flush());
+
+
+        //Change d value @ TS=3
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 3 SET d = ? WHERE a = ? and b = ? ", 1, 0, 0);
+        assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(1));
+
+
+        //Tombstone c
+        executeNet(protocolVersion, "DELETE FROM %s WHERE a = ? and b = ?", 0, 0);
+        assertRows(execute("SELECT d from mv"));
+
+        //Add back without D
+        executeNet(protocolVersion, "INSERT INTO %s (a, b, c) VALUES (0, 0, 1)");
+
+        //Make sure D doesn't pop back in.
+        assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row((Object) null));
+
+
+        //New partition
+        // insert a row with timestamp 0
+        executeNet(protocolVersion, "INSERT INTO %s (a, b, c, d, e) VALUES (?, ?, ?, ?, ?) USING TIMESTAMP 0", 1, 0, 0, 0, 0);
+
+        // overwrite pk and e with timestamp 1, but don't overwrite d
+        executeNet(protocolVersion, "INSERT INTO %s (a, b, c, e) VALUES (?, ?, ?, ?) USING TIMESTAMP 1", 1, 0, 0, 0);
+
+        // delete with timestamp 0 (which should only delete d)
+        executeNet(protocolVersion, "DELETE FROM %s USING TIMESTAMP 0 WHERE a = ? AND b = ?", 1, 0);
+        assertRows(execute("SELECT a, b, c, d, e from mv WHERE c = ? and a = ? and b = ?", 0, 1, 0),
+                   row(1, 0, 0, null, 0)
+        );
+
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 2 SET c = ? WHERE a = ? AND b = ?", 1, 1, 0);
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 3 SET c = ? WHERE a = ? AND b = ?", 0, 1, 0);
+        assertRows(execute("SELECT a, b, c, d, e from mv WHERE c = ? and a = ? and b = ?", 0, 1, 0),
+                   row(1, 0, 0, null, 0)
+        );
+
+        executeNet(protocolVersion, "UPDATE %s USING TIMESTAMP 3 SET d = ? WHERE a = ? AND b = ?", 0, 1, 0);
+        assertRows(execute("SELECT a, b, c, d, e from mv WHERE c = ? and a = ? and b = ?", 0, 1, 0),
+                   row(1, 0, 0, 0, 0)
+        );
+
+
+    }
+
+    @Test
+    public void testBuilderWidePartition() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "c int, " +
+                    "intval int, " +
+                    "PRIMARY KEY (k, c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+
+        for(int i = 0; i < 1024; i++)
+            execute("INSERT INTO %s (k, c, intval) VALUES (?, ?, ?)", 0, i, 0);
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND c IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, c, k)");
+
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv"))
+            Thread.sleep(1000);
+
+        assertRows(execute("SELECT count(*) from %s WHERE k = ?", 0), row(1024L));
+        assertRows(execute("SELECT count(*) from mv WHERE intval = ?", 0), row(1024L));
+    }
+
+    @Test
+    public void testRangeTombstone() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "textval1 text, " +
+                    "textval2 text, " +
+                    "PRIMARY KEY((k, asciival), bigintval, textval1)" +
+                    ")");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv_test1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE textval2 IS NOT NULL AND k IS NOT NULL AND asciival IS NOT NULL AND bigintval IS NOT NULL AND textval1 IS NOT NULL PRIMARY KEY ((textval2, k), asciival, bigintval, textval1)");
+
+        for (int i = 0; i < 100; i++)
+            updateView("INSERT into %s (k,asciival,bigintval,textval1,textval2)VALUES(?,?,?,?,?)", 0, "foo", (long) i % 2, "bar" + i, "baz");
+
+        Assert.assertEquals(50, execute("select * from %s where k = 0 and asciival = 'foo' and bigintval = 0").size());
+        Assert.assertEquals(50, execute("select * from %s where k = 0 and asciival = 'foo' and bigintval = 1").size());
+
+        Assert.assertEquals(100, execute("select * from mv_test1").size());
+
+        //Check the builder works
+        createView("mv_test2", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE textval2 IS NOT NULL AND k IS NOT NULL AND asciival IS NOT NULL AND bigintval IS NOT NULL AND textval1 IS NOT NULL PRIMARY KEY ((textval2, k), asciival, bigintval, textval1)");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test2"))
+            Thread.sleep(10);
+
+        Assert.assertEquals(100, execute("select * from mv_test2").size());
+
+        createView("mv_test3", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE textval2 IS NOT NULL AND k IS NOT NULL AND asciival IS NOT NULL AND bigintval IS NOT NULL AND textval1 IS NOT NULL PRIMARY KEY ((textval2, k), bigintval, textval1, asciival)");
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test3"))
+            Thread.sleep(10);
+
+        Assert.assertEquals(100, execute("select * from mv_test3").size());
+        Assert.assertEquals(100, execute("select asciival from mv_test3 where textval2 = ? and k = ?", "baz", 0).size());
+
+        //Write a RT and verify the data is removed from index
+        updateView("DELETE FROM %s WHERE k = ? AND asciival = ? and bigintval = ?", 0, "foo", 0L);
+
+        Assert.assertEquals(50, execute("select asciival from mv_test3 where textval2 = ? and k = ?", "baz", 0).size());
+    }
+
+
+    @Test
+    public void testRangeTombstone2() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "textval1 text, " +
+                    "PRIMARY KEY((k, asciival), bigintval)" +
+                    ")");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE textval1 IS NOT NULL AND k IS NOT NULL AND asciival IS NOT NULL AND bigintval IS NOT NULL PRIMARY KEY ((textval1, k), asciival, bigintval)");
+
+        for (int i = 0; i < 100; i++)
+            updateView("INSERT into %s (k,asciival,bigintval,textval1)VALUES(?,?,?,?)", 0, "foo", (long) i % 2, "bar" + i);
+
+        Assert.assertEquals(1, execute("select * from %s where k = 0 and asciival = 'foo' and bigintval = 0").size());
+        Assert.assertEquals(1, execute("select * from %s where k = 0 and asciival = 'foo' and bigintval = 1").size());
+
+
+        Assert.assertEquals(2, execute("select * from %s").size());
+        Assert.assertEquals(2, execute("select * from mv").size());
+
+        //Write a RT and verify the data is removed from index
+        updateView("DELETE FROM %s WHERE k = ? AND asciival = ? and bigintval = ?", 0, "foo", 0L);
+
+        Assert.assertEquals(1, execute("select * from %s").size());
+        Assert.assertEquals(1, execute("select * from mv").size());
+    }
+
+    @Test
+    public void testRangeTombstone3() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "textval1 text, " +
+                    "PRIMARY KEY((k, asciival), bigintval)" +
+                    ")");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE textval1 IS NOT NULL AND k IS NOT NULL AND asciival IS NOT NULL AND bigintval IS NOT NULL PRIMARY KEY ((textval1, k), asciival, bigintval)");
+
+        for (int i = 0; i < 100; i++)
+            updateView("INSERT into %s (k,asciival,bigintval,textval1)VALUES(?,?,?,?)", 0, "foo", (long) i % 2, "bar" + i);
+
+        Assert.assertEquals(1, execute("select * from %s where k = 0 and asciival = 'foo' and bigintval = 0").size());
+        Assert.assertEquals(1, execute("select * from %s where k = 0 and asciival = 'foo' and bigintval = 1").size());
+
+
+        Assert.assertEquals(2, execute("select * from %s").size());
+        Assert.assertEquals(2, execute("select * from mv").size());
+
+        //Write a RT and verify the data is removed from index
+        updateView("DELETE FROM %s WHERE k = ? AND asciival = ? and bigintval >= ?", 0, "foo", 0L);
+
+        Assert.assertEquals(0, execute("select * from %s").size());
+        Assert.assertEquals(0, execute("select * from mv").size());
+    }
+
+    @Test
+    public void testCompoundPartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "asciival ascii, " +
+                    "bigintval bigint, " +
+                    "PRIMARY KEY((k, asciival)))");
+
+        CFMetaData metadata = currentTableMetadata();
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        for (ColumnDefinition def : new HashSet<>(metadata.allColumns()))
+        {
+            try
+            {
+                String query = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " + def.name + " IS NOT NULL AND k IS NOT NULL "
+                               + (def.name.toString().equals("asciival") ? "" : "AND asciival IS NOT NULL ") + "PRIMARY KEY ("
+                               + def.name + ", k" + (def.name.toString().equals("asciival") ? "" : ", asciival") + ")";
+                createView("mv1_" + def.name, query);
+
+                if (def.type.isMultiCell())
+                    Assert.fail("MV on a multicell should fail " + def);
+            }
+            catch (InvalidQueryException e)
+            {
+                if (!def.type.isMultiCell() && !def.isPartitionKey())
+                    Assert.fail("MV creation failed on " + def);
+            }
+
+
+            try
+            {
+                String query = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " + def.name + " IS NOT NULL AND k IS NOT NULL "
+                               + (def.name.toString().equals("asciival") ? "" : "AND asciival IS NOT NULL ") + " PRIMARY KEY ("
+                               + def.name + ", asciival" + (def.name.toString().equals("k") ? "" : ", k") + ")";
+                createView("mv2_" + def.name, query);
+
+                if (def.type.isMultiCell())
+                    Assert.fail("MV on a multicell should fail " + def);
+            }
+            catch (InvalidQueryException e)
+            {
+                if (!def.type.isMultiCell() && !def.isPartitionKey())
+                    Assert.fail("MV creation failed on " + def);
+            }
+
+            try
+            {
+                String query = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " + def.name + " IS NOT NULL AND k IS NOT NULL "
+                               + (def.name.toString().equals("asciival") ? "" : "AND asciival IS NOT NULL ") + "PRIMARY KEY ((" + def.name + ", k), asciival)";
+                createView("mv3_" + def.name, query);
+
+                if (def.type.isMultiCell())
+                    Assert.fail("MV on a multicell should fail " + def);
+            }
+            catch (InvalidQueryException e)
+            {
+                if (!def.type.isMultiCell() && !def.isPartitionKey())
+                    Assert.fail("MV creation failed on " + def);
+            }
+
+
+            try
+            {
+                String query = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " + def.name + " IS NOT NULL AND k IS NOT NULL "
+                               + (def.name.toString().equals("asciival") ? "" : "AND asciival IS NOT NULL ") + "PRIMARY KEY ((" + def.name + ", k), asciival)";
+                createView("mv3_" + def.name, query);
+
+                Assert.fail("Should fail on duplicate name");
+            }
+            catch (Exception e)
+            {
+            }
+
+            try
+            {
+                String query = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE " + def.name + " IS NOT NULL AND k IS NOT NULL "
+                               + (def.name.toString().equals("asciival") ? "" : "AND asciival IS NOT NULL ") + "PRIMARY KEY ((" + def.name + ", k), nonexistentcolumn)";
+                createView("mv3_" + def.name, query);
+                Assert.fail("Should fail with unknown base column");
+            }
+            catch (InvalidQueryException e)
+            {
+            }
+        }
+
+        updateView("INSERT INTO %s (k, asciival, bigintval) VALUES (?, ?, fromJson(?))", 0, "ascii text", "123123123123");
+        updateView("INSERT INTO %s (k, asciival) VALUES (?, fromJson(?))", 0, "\"ascii text\"");
+        assertRows(execute("SELECT bigintval FROM %s WHERE k = ? and asciival = ?", 0, "ascii text"), row(123123123123L));
+
+        //Check the MV
+        assertRows(execute("SELECT k, bigintval from mv1_asciival WHERE asciival = ?", "ascii text"), row(0, 123123123123L));
+        assertRows(execute("SELECT k, bigintval from mv2_k WHERE asciival = ? and k = ?", "ascii text", 0), row(0, 123123123123L));
+        assertRows(execute("SELECT k from mv1_bigintval WHERE bigintval = ?", 123123123123L), row(0));
+        assertRows(execute("SELECT asciival from mv3_bigintval where bigintval = ? AND k = ?", 123123123123L, 0), row("ascii text"));
+
+
+        //UPDATE BASE
+        updateView("INSERT INTO %s (k, asciival, bigintval) VALUES (?, ?, fromJson(?))", 0, "ascii text", "1");
+        assertRows(execute("SELECT bigintval FROM %s WHERE k = ? and asciival = ?", 0, "ascii text"), row(1L));
+
+        //Check the MV
+        assertRows(execute("SELECT k, bigintval from mv1_asciival WHERE asciival = ?", "ascii text"), row(0, 1L));
+        assertRows(execute("SELECT k, bigintval from mv2_k WHERE asciival = ? and k = ?", "ascii text", 0), row(0, 1L));
+        assertRows(execute("SELECT k from mv1_bigintval WHERE bigintval = ?", 123123123123L));
+        assertRows(execute("SELECT asciival from mv3_bigintval where bigintval = ? AND k = ?", 123123123123L, 0));
+        assertRows(execute("SELECT asciival from mv3_bigintval where bigintval = ? AND k = ?", 1L, 0), row("ascii text"));
+
+
+        //test truncate also truncates all MV
+        updateView("TRUNCATE %s");
+
+        assertRows(execute("SELECT bigintval FROM %s WHERE k = ? and asciival = ?", 0, "ascii text"));
+        assertRows(execute("SELECT k, bigintval from mv1_asciival WHERE asciival = ?", "ascii text"));
+        assertRows(execute("SELECT k, bigintval from mv2_k WHERE asciival = ? and k = ?", "ascii text", 0));
+        assertRows(execute("SELECT asciival from mv3_bigintval where bigintval = ? AND k = ?", 1L, 0));
+    }
+
+    @Test
+    public void testCollections() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "intval int, " +
+                    "listval list<int>, " +
+                    "PRIMARY KEY (k))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, k)");
+
+        updateView("INSERT INTO %s (k, intval, listval) VALUES (?, ?, fromJson(?))", 0, 0, "[1, 2, 3]");
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 0), row(0, list(1, 2, 3)));
+        assertRows(execute("SELECT k, listval from mv WHERE intval = ?", 0), row(0, list(1, 2, 3)));
+
+        updateView("INSERT INTO %s (k, intval) VALUES (?, ?)", 1, 1);
+        updateView("INSERT INTO %s (k, listval) VALUES (?, fromJson(?))", 1, "[1, 2, 3]");
+        assertRows(execute("SELECT k, listval FROM %s WHERE k = ?", 1), row(1, list(1, 2, 3)));
+        assertRows(execute("SELECT k, listval from mv WHERE intval = ?", 1), row(1, list(1, 2, 3)));
+    }
+
+    @Test
+    public void testUpdate() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "intval int, " +
+                    "PRIMARY KEY (k))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND intval IS NOT NULL PRIMARY KEY (intval, k)");
+
+        updateView("INSERT INTO %s (k, intval) VALUES (?, ?)", 0, 0);
+        assertRows(execute("SELECT k, intval FROM %s WHERE k = ?", 0), row(0, 0));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 0), row(0, 0));
+
+        updateView("INSERT INTO %s (k, intval) VALUES (?, ?)", 0, 1);
+        assertRows(execute("SELECT k, intval FROM %s WHERE k = ?", 0), row(0, 1));
+        assertRows(execute("SELECT k, intval from mv WHERE intval = ?", 1), row(0, 1));
+    }
+
+    @Test
+    public void testIgnoreUpdate() throws Throwable
+    {
+        // regression test for CASSANDRA-10614
+
+        createTable("CREATE TABLE %s (" +
+                    "a int, " +
+                    "b int, " +
+                    "c int, " +
+                    "d int, " +
+                    "PRIMARY KEY (a, b))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT a, b, c FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (b, a)");
+
+        updateView("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 0, 0);
+        assertRows(execute("SELECT a, b, c from mv WHERE b = ?", 0), row(0, 0, 0));
+
+        updateView("UPDATE %s SET d = ? WHERE a = ? AND b = ?", 0, 0, 0);
+        assertRows(execute("SELECT a, b, c from mv WHERE b = ?", 0), row(0, 0, 0));
+
+        // Note: errors here may result in the test hanging when the memtables are flushed as part of the table drop,
+        // because empty rows in the memtable will cause the flush to fail.  This will result in a test timeout that
+        // should not be ignored.
+        String table = KEYSPACE + "." + currentTable();
+        updateView("BEGIN BATCH " +
+                "INSERT INTO " + table + " (a, b, c, d) VALUES (?, ?, ?, ?); " + // should be accepted
+                "UPDATE " + table + " SET d = ? WHERE a = ? AND b = ?; " +  // should be accepted
+                "APPLY BATCH",
+                0, 0, 0, 0,
+                1, 0, 1);
+        assertRows(execute("SELECT a, b, c from mv WHERE b = ?", 0), row(0, 0, 0));
+        assertRows(execute("SELECT a, b, c from mv WHERE b = ?", 1), row(0, 1, null));
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore("mv");
+        cfs.forceBlockingFlush();
+        Assert.assertEquals(1, cfs.getLiveSSTables().size());
+    }
+
+    @Test
+    public void ttlTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)");
+
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) USING TTL 3", 1, 1, 1, 1);
+
+        Thread.sleep(TimeUnit.SECONDS.toMillis(1));
+        updateView("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, 2);
+
+        Thread.sleep(TimeUnit.SECONDS.toMillis(5));
+        List<Row> results = executeNet(protocolVersion, "SELECT d FROM mv WHERE c = 2 AND a = 1 AND b = 1").all();
+        Assert.assertEquals(1, results.size());
+        Assert.assertTrue("There should be a null result given back due to ttl expiry", results.get(0).isNull(0));
+    }
+
+    @Test
+    public void ttlExpirationTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)");
+
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) USING TTL 3", 1, 1, 1, 1);
+
+        Thread.sleep(TimeUnit.SECONDS.toMillis(4));
+        Assert.assertEquals(0, executeNet(protocolVersion, "SELECT * FROM mv WHERE c = 1 AND a = 1 AND b = 1").all().size());
+    }
+
+    @Test
+    public void rowDeletionTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)");
+
+        String table = keyspace() + "." + currentTable();
+        updateView("DELETE FROM " + table + " USING TIMESTAMP 6 WHERE a = 1 AND b = 1;");
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) USING TIMESTAMP 3", 1, 1, 1, 1);
+        Assert.assertEquals(0, executeNet(protocolVersion, "SELECT * FROM mv WHERE c = 1 AND a = 1 AND b = 1").all().size());
+    }
+
+    @Test
+    public void conflictingTimestampTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE c IS NOT NULL AND a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (c, a, b)");
+
+        for (int i = 0; i < 50; i++)
+        {
+            updateView("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP 1", 1, 1, i);
+        }
+
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT c FROM mv");
+        List<Row> rows = executeNet(protocolVersion, "SELECT c FROM %s").all();
+        Assert.assertEquals("There should be exactly one row in base", 1, rows.size());
+        int expected = rows.get(0).getInt("c");
+        assertRowsNet(protocolVersion, mvRows, row(expected));
+    }
+
+    @Test
+    public void testClusteringOrder() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "PRIMARY KEY (a, b, c))" +
+                    "WITH CLUSTERING ORDER BY (b ASC, c DESC)");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (a, b, c) WITH CLUSTERING ORDER BY (b DESC)");
+        createView("mv2", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (a, c, b) WITH CLUSTERING ORDER BY (c ASC)");
+        createView("mv3", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (a, b, c)");
+        createView("mv4", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (a, c, b) WITH CLUSTERING ORDER BY (c DESC)");
+
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 1);
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 2, 2, 2);
+
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT b FROM mv1");
+        assertRowsNet(protocolVersion, mvRows,
+                      row(2),
+                      row(1));
+
+        mvRows = executeNet(protocolVersion, "SELECT c FROM mv2");
+        assertRowsNet(protocolVersion, mvRows,
+                      row(1),
+                      row(2));
+
+        mvRows = executeNet(protocolVersion, "SELECT b FROM mv3");
+        assertRowsNet(protocolVersion, mvRows,
+                      row(1),
+                      row(2));
+
+        mvRows = executeNet(protocolVersion, "SELECT c FROM mv4");
+        assertRowsNet(protocolVersion, mvRows,
+                      row(2),
+                      row(1));
+    }
+
+    @Test
+    public void testMultipleDeletes() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE b IS NOT NULL PRIMARY KEY (b, a)");
+
+        updateView("INSERT INTO %s (a, b) VALUES (?, ?)", 1, 1);
+        updateView("INSERT INTO %s (a, b) VALUES (?, ?)", 1, 2);
+        updateView("INSERT INTO %s (a, b) VALUES (?, ?)", 1, 3);
+
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT a, b FROM mv1");
+        assertRowsNet(protocolVersion, mvRows,
+                      row(1, 1),
+                      row(1, 2),
+                      row(1, 3));
+
+        updateView(String.format("BEGIN UNLOGGED BATCH " +
+                                 "DELETE FROM %s WHERE a = 1 AND b > 1 AND b < 3;" +
+                                 "DELETE FROM %s WHERE a = 1;" +
+                                 "APPLY BATCH", currentTable(), currentTable()));
+
+        mvRows = executeNet(protocolVersion, "SELECT a, b FROM mv1");
+        assertRowsNet(protocolVersion, mvRows);
+    }
+
+    @Test
+    public void testPrimaryKeyOnlyTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        // Cannot use SELECT *, as those are always handled by the includeAll shortcut in View.updateAffectsView
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT a, b FROM %%s WHERE b IS NOT NULL PRIMARY KEY (b, a)");
+
+        updateView("INSERT INTO %s (a, b) VALUES (?, ?)", 1, 1);
+
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT a, b FROM mv1");
+        assertRowsNet(protocolVersion, mvRows, row(1, 1));
+    }
+
+    @Test
+    public void testPartitionKeyOnlyTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "PRIMARY KEY ((a, b)))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        // Cannot use SELECT *, as those are always handled by the includeAll shortcut in View.updateAffectsView
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT a, b FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL PRIMARY KEY (b, a)");
+
+        updateView("INSERT INTO %s (a, b) VALUES (?, ?)", 1, 1);
+
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT a, b FROM mv1");
+        assertRowsNet(protocolVersion, mvRows, row(1, 1));
+    }
+
+    @Test
+    public void testDeleteSingleColumnInViewClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND d IS NOT NULL PRIMARY KEY (a, d, b)");
+
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT a, d, b, c FROM mv1");
+        assertRowsNet(protocolVersion, mvRows, row(0, 0, 0, 0));
+
+        updateView("DELETE c FROM %s WHERE a = ? AND b = ?", 0, 0);
+        mvRows = executeNet(protocolVersion, "SELECT a, d, b, c FROM mv1");
+        assertRowsNet(protocolVersion, mvRows, row(0, 0, 0, null));
+
+        updateView("DELETE d FROM %s WHERE a = ? AND b = ?", 0, 0);
+        mvRows = executeNet(protocolVersion, "SELECT a, d, b FROM mv1");
+        assertTrue(mvRows.isExhausted());
+    }
+
+    @Test
+    public void testDeleteSingleColumnInViewPartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "PRIMARY KEY (a, b))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+        createView("mv1", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND d IS NOT NULL PRIMARY KEY (d, a, b)");
+
+        updateView("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT a, d, b, c FROM mv1");
+        assertRowsNet(protocolVersion, mvRows, row(0, 0, 0, 0));
+
+        updateView("DELETE c FROM %s WHERE a = ? AND b = ?", 0, 0);
+        mvRows = executeNet(protocolVersion, "SELECT a, d, b, c FROM mv1");
+        assertRowsNet(protocolVersion, mvRows, row(0, 0, 0, null));
+
+        updateView("DELETE d FROM %s WHERE a = ? AND b = ?", 0, 0);
+        mvRows = executeNet(protocolVersion, "SELECT a, d, b FROM mv1");
+        assertTrue(mvRows.isExhausted());
+    }
+
+    @Test
+    public void testCollectionInView() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c map<int, text>," +
+                    "PRIMARY KEY (a))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+        createView("mvmap", "CREATE MATERIALIZED VIEW %s AS SELECT a, b FROM %%s WHERE b IS NOT NULL PRIMARY KEY (b, a)");
+
+        updateView("INSERT INTO %s (a, b) VALUES (?, ?)", 0, 0);
+        ResultSet mvRows = executeNet(protocolVersion, "SELECT a, b FROM mvmap WHERE b = ?", 0);
+        assertRowsNet(protocolVersion, mvRows, row(0, 0));
+
+        updateView("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, map(1, "1"));
+        mvRows = executeNet(protocolVersion, "SELECT a, b FROM mvmap WHERE b = ?", 1);
+        assertRowsNet(protocolVersion, mvRows, row(1, 1));
+
+        updateView("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 0, map(0, "0"));
+        mvRows = executeNet(protocolVersion, "SELECT a, b FROM mvmap WHERE b = ?", 0);
+        assertRowsNet(protocolVersion, mvRows, row(0, 0));
+    }
+
+    @Test
+    public void testMultipleNonPrimaryKeysInView() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "a int," +
+                    "b int," +
+                    "c int," +
+                    "d int," +
+                    "e int," +
+                    "PRIMARY KEY ((a, b), c))");
+
+        try
+        {
+            createView("mv_de", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL AND d IS NOT NULL AND e IS NOT NULL PRIMARY KEY ((d, a), b, e, c)");
+            Assert.fail("Should have rejected a query including multiple non-primary key base columns");
+        }
+        catch (Exception e)
+        {
+        }
+
+        try
+        {
+            createView("mv_de", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL AND d IS NOT NULL AND e IS NOT NULL PRIMARY KEY ((a, b), c, d, e)");
+            Assert.fail("Should have rejected a query including multiple non-primary key base columns");
+        }
+        catch (Exception e)
+        {
+        }
+    }
+
+    @Test
+    public void testNullInClusteringColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id1 int, id2 int, v1 text, v2 text, PRIMARY KEY (id1, id2))");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS" +
+                   "  SELECT id1, v1, id2, v2" +
+                   "  FROM %%s" +
+                   "  WHERE id1 IS NOT NULL AND v1 IS NOT NULL AND id2 IS NOT NULL" +
+                   "  PRIMARY KEY (id1, v1, id2)" +
+                   "  WITH CLUSTERING ORDER BY (v1 DESC, id2 ASC)");
+
+        execute("INSERT INTO %s (id1, id2, v1, v2) VALUES (?, ?, ?, ?)", 0, 1, "foo", "bar");
+
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM %s"), row(0, 1, "foo", "bar"));
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM mv"), row(0, "foo", 1, "bar"));
+
+        executeNet(protocolVersion, "UPDATE %s SET v1=? WHERE id1=? AND id2=?", null, 0, 1);
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM %s"), row(0, 1, null, "bar"));
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM mv"));
+
+        executeNet(protocolVersion, "UPDATE %s SET v2=? WHERE id1=? AND id2=?", "rab", 0, 1);
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM %s"), row(0, 1, null, "rab"));
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM mv"));
+    }
+
+    @Test
+    public void testReservedKeywordsInMV() throws Throwable
+    {
+        createTable("CREATE TABLE %s (\"token\" int PRIMARY KEY, \"keyspace\" int)");
+
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS" +
+                   "  SELECT \"keyspace\", \"token\"" +
+                   "  FROM %%s" +
+                   "  WHERE \"keyspace\" IS NOT NULL AND \"token\" IS NOT NULL" +
+                   "  PRIMARY KEY (\"keyspace\", \"token\")");
+
+        execute("INSERT INTO %s (\"token\", \"keyspace\") VALUES (?, ?)", 0, 1);
+
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM %s"), row(0, 1));
+        assertRowsNet(protocolVersion, executeNet(protocolVersion, "SELECT * FROM mv"), row(1, 0));
+    }
+
+    @Test
+    public void testViewBuilderResume() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int, " +
+                    "c int, " +
+                    "val text, " +
+                    "PRIMARY KEY(k,c))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        CompactionManager.instance.setCoreCompactorThreads(1);
+        CompactionManager.instance.setMaximumCompactorThreads(1);
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        cfs.disableAutoCompaction();
+
+        for (int i = 0; i < 1024; i++)
+            execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
+
+        cfs.forceBlockingFlush();
+
+        for (int i = 0; i < 1024; i++)
+            execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
+
+        cfs.forceBlockingFlush();
+
+        for (int i = 0; i < 1024; i++)
+            execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
+
+        cfs.forceBlockingFlush();
+
+        for (int i = 0; i < 1024; i++)
+            execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
+
+        cfs.forceBlockingFlush();
+
+        createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
+
+        cfs.enableAutoCompaction();
+        List<Future<?>> futures = CompactionManager.instance.submitBackground(cfs);
+
+        //Force a second MV on the same base table, which will restart the first MV builder...
+        createView("mv_test2", "CREATE MATERIALIZED VIEW %s AS SELECT val, k, c FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
+
+
+        //Compact the base table
+        FBUtilities.waitOnFutures(futures);
+
+        while (!SystemKeyspace.isViewBuilt(keyspace(), "mv_test"))
+            Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
+
+        assertRows(execute("SELECT count(*) FROM mv_test"), row(1024L));
+    }
+
+    @Test
+    public void testFrozenCollectionsWithComplicatedInnerType() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, intval int,  listval frozen<list<tuple<text,text>>>, PRIMARY KEY (k))");
+
+        execute("USE " + keyspace());
+        executeNet(protocolVersion, "USE " + keyspace());
+
+        createView("mv",
+                   "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE k IS NOT NULL AND listval IS NOT NULL PRIMARY KEY (k, listval)");
+
+        updateView("INSERT INTO %s (k, intval, listval) VALUES (?, ?, fromJson(?))",
+                   0,
+                   0,
+                   "[[\"a\", \"1\"], [\"b\", \"2\"], [\"c\", \"3\"]]");
+
+        // verify input
+        assertRows(execute("SELECT k, toJson(listval) FROM %s WHERE k = ?", 0),
+                   row(0, "[[\"a\", \"1\"], [\"b\", \"2\"], [\"c\", \"3\"]]"));
+        assertRows(execute("SELECT k, toJson(listval) from mv"),
+                   row(0, "[[\"a\", \"1\"], [\"b\", \"2\"], [\"c\", \"3\"]]"));
+
+        // update listval with the same value and it will be compared in view generator
+        updateView("INSERT INTO %s (k, listval) VALUES (?, fromJson(?))",
+                   0,
+                   "[[\"a\", \"1\"], [\"b\", \"2\"], [\"c\", \"3\"]]");
+        // verify result
+        assertRows(execute("SELECT k, toJson(listval) FROM %s WHERE k = ?", 0),
+                   row(0, "[[\"a\", \"1\"], [\"b\", \"2\"], [\"c\", \"3\"]]"));
+        assertRows(execute("SELECT k, toJson(listval) from mv"),
+                   row(0, "[[\"a\", \"1\"], [\"b\", \"2\"], [\"c\", \"3\"]]"));
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void emptyViewNameTest() throws Throwable
+    {
+        execute("CREATE MATERIALIZED VIEW \"\" AS SELECT a, b FROM tbl WHERE b IS NOT NULL PRIMARY KEY (b, a)");
+    }
+
+     @Test(expected = SyntaxException.class)
+     public void emptyBaseTableNameTest() throws Throwable
+     {
+         execute("CREATE MATERIALIZED VIEW myview AS SELECT a, b FROM \"\" WHERE b IS NOT NULL PRIMARY KEY (b, a)");
+     }
+
+    @Test
+    public void viewOnCompactTableTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, v int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        executeNet(protocolVersion, "USE " + keyspace());
+        try
+        {
+            createView("mv",
+                       "CREATE MATERIALIZED VIEW %s AS SELECT a, b, value FROM %%s WHERE b IS NOT NULL PRIMARY KEY (b, a)");
+            fail("Should have thrown an exception");
+        }
+        catch (Throwable t)
+        {
+            Assert.assertEquals("Unknown column name detected in CREATE MATERIALIZED VIEW statement : value",
+                                t.getMessage());
+        }
+    }
+
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSetTest.java b/test/unit/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSetTest.java
index bd5395a..abbd36b 100644
--- a/test/unit/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSetTest.java
+++ b/test/unit/org/apache/cassandra/cql3/restrictions/PrimaryKeyRestrictionSetTest.java

@@ -18,9 +18,9 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
 
+import com.google.common.collect.Iterables;
 import org.junit.Test;
 
 import org.apache.cassandra.config.CFMetaData;
@@ -28,11 +28,8 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.Term.MultiItemTerminal;
 import org.apache.cassandra.cql3.statements.Bound;
-import org.apache.cassandra.db.ColumnFamilyType;
-import org.apache.cassandra.db.composites.Composite;
-import org.apache.cassandra.db.composites.Composite.EOC;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.composites.CompoundSparseCellNameType;
+
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.ReversedType;
@@ -45,72 +42,72 @@
 public class PrimaryKeyRestrictionSetTest
 {
     @Test
-    public void testBoundsAsCompositesWithNoRestrictions()
+    public void testBoundsAsClusteringWithNoRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC);
 
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
     }
 
     /**
      * Test 'clustering_0 = 1' with only one clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithOneEqRestrictionsAndOneClusteringColumn()
+    public void testBoundsAsClusteringWithOneEqRestrictionsAndOneClusteringColumn()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC);
 
         ByteBuffer clustering_0 = ByteBufferUtil.bytes(1);
         Restriction eq = newSingleEq(cfMetaData, 0, clustering_0);
 
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), clustering_0, EOC.START);
+        assertStartBound(get(bounds, 0), true, clustering_0);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), clustering_0, EOC.END);
+        assertEndBound(get(bounds, 0), true, clustering_0);
     }
 
     /**
      * Test 'clustering_1 = 1' with 2 clustering columns
      */
     @Test
-    public void testBoundsAsCompositesWithOneEqRestrictionsAndTwoClusteringColumns()
+    public void testBoundsAsClusteringWithOneEqRestrictionsAndTwoClusteringColumns()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
         ByteBuffer clustering_0 = ByteBufferUtil.bytes(1);
         Restriction eq = newSingleEq(cfMetaData, 0, clustering_0);
 
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), clustering_0, EOC.START);
+        assertStartBound(get(bounds, 0), true, clustering_0);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), clustering_0, EOC.END);
+        assertEndBound(get(bounds, 0), true, clustering_0);
     }
 
     /**
      * Test 'clustering_0 IN (1, 2, 3)' with only one clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithOneInRestrictionsAndOneClusteringColumn()
+    public void testBoundsAsClusteringWithOneInRestrictionsAndOneClusteringColumn()
     {
         ByteBuffer value1 = ByteBufferUtil.bytes(1);
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
@@ -120,27 +117,27 @@
 
         Restriction in = newSingleIN(cfMetaData, 0, value1, value2, value3);
 
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(in);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value2, EOC.START);
-        assertComposite(bounds.get(2), value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), true, value2);
+        assertStartBound(get(bounds, 2), true, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
-        assertComposite(bounds.get(1), value2, EOC.END);
-        assertComposite(bounds.get(2), value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
+        assertEndBound(get(bounds, 1), true, value2);
+        assertEndBound(get(bounds, 2), true, value3);
     }
 
     /**
      * Test slice restriction (e.g 'clustering_0 > 1') with only one clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithSliceRestrictionsAndOneClusteringColumn()
+    public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
@@ -148,85 +145,85 @@
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newSingleSlice(cfMetaData, 0, Bound.START, false, value1);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(cfMetaData, 0, Bound.START, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(cfMetaData, 0, Bound.END, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
 
         slice = newSingleSlice(cfMetaData, 0, Bound.END, false, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1);
 
         slice = newSingleSlice(cfMetaData, 0, Bound.START, false, value1);
         Restriction slice2 = newSingleSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value2);
 
         slice = newSingleSlice(cfMetaData, 0, Bound.START, true, value1);
         slice2 = newSingleSlice(cfMetaData, 0, Bound.END, true, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value2);
     }
 
     /**
      * Test slice restriction (e.g 'clustering_0 > 1') with only one descending clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithSliceRestrictionsAndOneDescendingClusteringColumn()
+    public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusteringColumn()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.DESC, Sort.DESC);
 
@@ -234,85 +231,85 @@
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newSingleSlice(cfMetaData, 0, Bound.START, false, value1);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1);
 
         slice = newSingleSlice(cfMetaData, 0, Bound.START, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
 
         slice = newSingleSlice(cfMetaData, 0, Bound.END, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(cfMetaData, 0, Bound.END, false, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(cfMetaData, 0, Bound.START, false, value1);
         Restriction slice2 = newSingleSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1);
 
         slice = newSingleSlice(cfMetaData, 0, Bound.START, true, value1);
         slice2 = newSingleSlice(cfMetaData, 0, Bound.END, true, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
     }
 
     /**
      * Test 'clustering_0 = 1 AND clustering_1 IN (1, 2, 3)'
      */
     @Test
-    public void testBoundsAsCompositesWithEqAndInRestrictions()
+    public void testBoundsAsClusteringWithEqAndInRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
@@ -321,27 +318,27 @@
         ByteBuffer value3 = ByteBufferUtil.bytes(3);
         Restriction eq = newSingleEq(cfMetaData, 0, value1);
         Restriction in = newSingleIN(cfMetaData, 1, value1, value2, value3);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(in);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.START);
-        assertComposite(bounds.get(2), value1, value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value1);
+        assertStartBound(get(bounds, 1), true, value1, value2);
+        assertStartBound(get(bounds, 2), true, value1, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, value1, EOC.END);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
-        assertComposite(bounds.get(2), value1, value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value1);
+        assertEndBound(get(bounds, 1), true, value1, value2);
+        assertEndBound(get(bounds, 2), true, value1, value3);
     }
 
     /**
      * Test equal and slice restrictions (e.g 'clustering_0 = 0 clustering_1 > 1')
      */
     @Test
-    public void testBoundsAsCompositesWithEqAndSliceRestrictions()
+    public void testBoundsAsClusteringWithEqAndSliceRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
@@ -352,108 +349,108 @@
         Restriction eq = newSingleEq(cfMetaData, 0, value3);
 
         Restriction slice = newSingleSlice(cfMetaData, 1, Bound.START, false, value1);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value3, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value3);
 
         slice = newSingleSlice(cfMetaData, 1, Bound.START, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value3, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value3);
 
         slice = newSingleSlice(cfMetaData, 1, Bound.END, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value3, value1);
 
         slice = newSingleSlice(cfMetaData, 1, Bound.END, false, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value3, value1);
 
         slice = newSingleSlice(cfMetaData, 1, Bound.START, false, value1);
         Restriction slice2 = newSingleSlice(cfMetaData, 1, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value3, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value3, value2);
 
         slice = newSingleSlice(cfMetaData, 1, Bound.START, true, value1);
         slice2 = newSingleSlice(cfMetaData, 1, Bound.END, true, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq).mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value3, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value3, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value3, value2);
     }
 
     /**
      * Test '(clustering_0, clustering_1) = (1, 2)' with two clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithMultiEqRestrictions()
+    public void testBoundsAsClusteringWithMultiEqRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
         ByteBuffer value1 = ByteBufferUtil.bytes(1);
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
         Restriction eq = newMultiEq(cfMetaData, 0, value1, value2);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(eq);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
     }
 
     /**
      * Test '(clustering_0, clustering_1) IN ((1, 2), (2, 3))' with two clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithMultiInRestrictions()
+    public void testBoundsAsClusteringWithMultiInRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
@@ -461,104 +458,105 @@
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
         ByteBuffer value3 = ByteBufferUtil.bytes(3);
         Restriction in = newMultiIN(cfMetaData, 0, asList(value1, value2), asList(value2, value3));
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(in);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value2, value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), true, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
-        assertComposite(bounds.get(1), value2, value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
+        assertEndBound(get(bounds, 1), true, value2, value3);
     }
 
     /**
      * Test multi-column slice restrictions (e.g '(clustering_0) > (1)') with only one clustering column
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithOneClusteringColumn()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringColumn()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC);
 
+
         ByteBuffer value1 = ByteBufferUtil.bytes(1);
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
 
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1);
 
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value2);
 
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value2);
     }
 
     /**
@@ -566,7 +564,7 @@
      * order
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithOneDescendingClusteringColumn()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClusteringColumn()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.DESC);
 
@@ -574,85 +572,85 @@
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1);
 
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
 
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1);
 
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
     }
 
     /**
      * Test multi-column slice restrictions (e.g '(clustering_0, clustering_1) > (1, 2)')
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithTwoClusteringColumn()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringColumn()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC);
 
@@ -661,90 +659,90 @@
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1, value2);
 
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value2);
 
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value2, value1);
     }
 
     /**
      * Test multi-column slice restrictions with 2 descending clustering columns (e.g '(clustering_0, clustering_1) > (1, 2)')
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithTwoDescendingClusteringColumns()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClusteringColumns()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.DESC, Sort.DESC);
 
@@ -753,84 +751,84 @@
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1, value2);
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyStart(get(bounds, 0));
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
 
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1, value2);
 
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value2, value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value2, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
     }
 
     /**
@@ -838,7 +836,7 @@
      * (e.g '(clustering_0, clustering_1) > (1, 2)')
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithOneDescendingAndOneAscendingClusteringColumns()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAndOneAscendingClusteringColumns()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.DESC, Sort.ASC);
 
@@ -847,113 +845,113 @@
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), true, value1);
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, EOC.NONE);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), true, value1);
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), true, value1, value2);
+        assertEmptyEnd(get(bounds, 1));
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEmptyEnd(get(bounds, 1));
 
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.END);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), false, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), true, value1);
 
         // (clustering_0) > (1) AND (clustering_0, clustering1) < (2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.START);
-        assertComposite(bounds.get(1), value2, EOC.END);
+        assertStartBound(get(bounds, 0), true, value2);
+        assertStartBound(get(bounds, 1), false, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value2, value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.START);
+        assertEndBound(get(bounds, 0), false, value2, value1);
+        assertEndBound(get(bounds, 1), false, value1);
 
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value2, EOC.START);
-        assertComposite(bounds.get(1), value2, EOC.END);
-        assertComposite(bounds.get(2), value1, value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value2);
+        assertStartBound(get(bounds, 1), false, value2);
+        assertStartBound(get(bounds, 2), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value2, value1, EOC.END);
-        assertComposite(bounds.get(1), value1, EOC.START);
-        assertComposite(bounds.get(2), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value2, value1);
+        assertEndBound(get(bounds, 1), false, value1);
+        assertEndBound(get(bounds, 2), true, value1);
     }
 
     /**
@@ -961,7 +959,7 @@
      * (e.g '(clustering_0, clustering_1) > (1, 2)')
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithOneAscendingAndOneDescendingClusteringColumns()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndOneDescendingClusteringColumns()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.DESC);
 
@@ -970,97 +968,97 @@
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEmptyEnd(get(bounds, 1));
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), true, value1, value2);
+        assertEmptyEnd(get(bounds, 1));
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, EOC.NONE);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), true, value1);
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), true, value1);
 
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value2, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), false, value2);
 
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
-        assertComposite(bounds.get(2), value2, value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
+        assertStartBound(get(bounds, 2), true, value2, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
-        assertComposite(bounds.get(1), value2, EOC.START);
-        assertComposite(bounds.get(2), value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
+        assertEndBound(get(bounds, 1), false, value2);
+        assertEndBound(get(bounds, 2), true, value2);
     }
 
     /**
@@ -1068,7 +1066,7 @@
      * (e.g '(clustering_0, clustering1, clustering_3, clustering4) > (1, 2, 3, 4)')
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsWithTwoAscendingAndTwoDescendingClusteringColumns()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndTwoDescendingClusteringColumns()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC, Sort.DESC, Sort.DESC);
 
@@ -1079,148 +1077,148 @@
 
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4)
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2, value3, value4);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), false, value1, value2, value3, value4);
+        assertEmptyEnd(get(bounds, 1));
 
         // clustering_0 = 1 AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
         Restriction eq = newSingleEq(cfMetaData, 0, value1);
         slice = newMultiSlice(cfMetaData, 1, Bound.START, false, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
         restrictions = restrictions.mergeWith(eq);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 1), true, value1);
 
-        // clustering_0 = 1 AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
+        // clustering_0 IN (1, 2) AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
         Restriction in = newSingleIN(cfMetaData, 0, value1, value2);
         slice = newMultiSlice(cfMetaData, 1, Bound.START, false, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
         restrictions = restrictions.mergeWith(in);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
-        assertComposite(bounds.get(2), value2, value2, EOC.START);
-        assertComposite(bounds.get(3), value2, value2, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
+        assertStartBound(get(bounds, 2), true, value2, value2);
+        assertStartBound(get(bounds, 3), false, value2, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
-        assertComposite(bounds.get(2), value2, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(3), value2, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 1), true, value1);
+        assertEndBound(get(bounds, 2), false, value2, value2, value3, value4);
+        assertEndBound(get(bounds, 3), true, value2);
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertEmptyComposite(bounds.get(0));
+        assertEmptyEnd(get(bounds, 0));
 
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), true, value1, value2, value3, value4);
+        assertEmptyEnd(get(bounds, 1));
 
         // (clustering_0, clustering1, clustering_2, clustering_3) <= (1, 2, 3, 4)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.NONE);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), true, value1, value2, value3, value4);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), true, value1, value2);
 
         // (clustering_0, clustering1, clustering_2, clustering_3) < (1, 2, 3, 4)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.END);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), false, value1, value2, value3, value4);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), true, value1, value2);
 
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4) AND (clustering_0, clustering_1) < (2, 3)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2, value3, value4);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2, value3);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(1), value2, value3, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 1), false, value2, value3);
 
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4) AND (clustering_0, clustering1, clustering_2, clustering_3) <= (4, 3, 2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2, value3, value4);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value4, value3, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, value2,  EOC.START);
-        assertComposite(bounds.get(1), value1, value2, EOC.END);
-        assertComposite(bounds.get(2), value4, value3, value2, value1, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1, value2);
+        assertStartBound(get(bounds, 1), false, value1, value2);
+        assertStartBound(get(bounds, 2), true, value4, value3, value2, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
-        assertComposite(bounds.get(1), value4, value3, EOC.START);
-        assertComposite(bounds.get(2), value4, value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 1), false, value4, value3);
+        assertEndBound(get(bounds, 2), true, value4, value3);
     }
 
     /**
@@ -1228,7 +1226,7 @@
      * (e.g '(clustering_0, clustering1, clustering_3, clustering4) > (1, 2, 3, 4)')
      */
     @Test
-    public void testBoundsAsCompositesWithMultiSliceRestrictionsMixingAscendingDescendingClusteringColumns()
+    public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescendingColumnMix()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.DESC, Sort.ASC, Sort.DESC);
 
@@ -1239,170 +1237,169 @@
 
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4)
         Restriction slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2, value3, value4);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(3), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), true, value1, value2, value3);
+        assertStartBound(get(bounds, 2), false, value1, value2, value3);
+        assertStartBound(get(bounds, 3), false, value1);
 
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, EOC.END);
-        assertEmptyComposite(bounds.get(3));
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), false, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 2), true, value1, value2);
+        assertEmptyEnd(get(bounds, 3));
 
         // clustering_0 = 1 AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
         Restriction eq = newSingleEq(cfMetaData, 0, value1);
         slice = newMultiSlice(cfMetaData, 1, Bound.START, false, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
         restrictions = restrictions.mergeWith(eq);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), true, value1, value2, value3);
+        assertStartBound(get(bounds, 2), false, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), false, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 2), true, value1, value2);
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.END);
-        assertEmptyComposite(bounds.get(1));
+        assertEndBound(get(bounds, 0), true, value1, value2);
+        assertEmptyEnd(get(bounds, 1));
 
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(3), value1, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), true, value1, value2, value3);
+        assertStartBound(get(bounds, 2), false, value1, value2, value3);
+        assertStartBound(get(bounds, 3), false, value1);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.END);
-        assertComposite(bounds.get(2), value1, value2, EOC.END);
-        assertEmptyComposite(bounds.get(3));
-
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), true, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 2), true, value1, value2);
+        assertEmptyEnd(get(bounds, 3));
 
         // (clustering_0, clustering1, clustering_2, clustering_3) <= (1, 2, 3, 4)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, true, value1, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, value4, EOC.NONE);
-        assertComposite(bounds.get(3), value1, value2, EOC.END);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), true, value1, value2);
+        assertStartBound(get(bounds, 2), true, value1, value2, value3, value4);
+        assertStartBound(get(bounds, 3), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(3), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), false, value1, value2, value3);
+        assertEndBound(get(bounds, 2), true, value1, value2, value3);
+        assertEndBound(get(bounds, 3), true, value1);
 
         // (clustering_0, clustering1, clustering_2, clustering_3) < (1, 2, 3, 4)
         slice = newMultiSlice(cfMetaData, 0, Bound.END, false, value1, value2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertEmptyComposite(bounds.get(0));
-        assertComposite(bounds.get(1), value1, value2, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, value4, EOC.END);
-        assertComposite(bounds.get(3), value1, value2, EOC.END);
+        assertEmptyStart(get(bounds, 0));
+        assertStartBound(get(bounds, 1), true, value1, value2);
+        assertStartBound(get(bounds, 2), false, value1, value2, value3, value4);
+        assertStartBound(get(bounds, 3), false, value1, value2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(3), value1, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1);
+        assertEndBound(get(bounds, 1), false, value1, value2, value3);
+        assertEndBound(get(bounds, 2), true, value1, value2, value3);
+        assertEndBound(get(bounds, 3), true, value1);
 
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4) AND (clustering_0, clustering_1) < (2, 3)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, false, value1, value2, value3, value4);
         Restriction slice2 = newMultiSlice(cfMetaData, 0, Bound.END, false, value2, value3);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(5, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(3), value1, EOC.END);
-        assertComposite(bounds.get(4), value2, value3, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), true, value1, value2, value3);
+        assertStartBound(get(bounds, 2), false, value1, value2, value3);
+        assertStartBound(get(bounds, 3), false, value1);
+        assertStartBound(get(bounds, 4), false, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(5, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, EOC.END);
-        assertComposite(bounds.get(3), value2, EOC.START);
-        assertComposite(bounds.get(4), value2, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), false, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 2), true, value1, value2);
+        assertEndBound(get(bounds, 3), false, value2);
+        assertEndBound(get(bounds, 4), true, value2);
 
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4) AND (clustering_0, clustering1, clustering_2, clustering_3) <= (4, 3, 2, 1)
         slice = newMultiSlice(cfMetaData, 0, Bound.START, true, value1, value2, value3, value4);
         slice2 = newMultiSlice(cfMetaData, 0, Bound.END, true, value4, value3, value2, value1);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(7, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(2), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(3), value1, EOC.END);
-        assertComposite(bounds.get(4), value4, value3, EOC.START);
-        assertComposite(bounds.get(5), value4, value3, value2, value1, EOC.NONE);
-        assertComposite(bounds.get(6), value4, value3, EOC.END);
+        assertStartBound(get(bounds, 0), true, value1);
+        assertStartBound(get(bounds, 1), true, value1, value2, value3);
+        assertStartBound(get(bounds, 2), false, value1, value2, value3);
+        assertStartBound(get(bounds, 3), false, value1);
+        assertStartBound(get(bounds, 4), true, value4, value3);
+        assertStartBound(get(bounds, 5), true, value4, value3, value2, value1);
+        assertStartBound(get(bounds, 6), false, value4, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(7, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value3, value4, EOC.END);
-        assertComposite(bounds.get(2), value1, value2, EOC.END);
-        assertComposite(bounds.get(3), value4, EOC.START);
-        assertComposite(bounds.get(4), value4, value3, value2, EOC.START);
-        assertComposite(bounds.get(5), value4, value3, value2, EOC.END);
-        assertComposite(bounds.get(6), value4, EOC.END);
+        assertEndBound(get(bounds, 0), false, value1, value2);
+        assertEndBound(get(bounds, 1), true, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 2), true, value1, value2);
+        assertEndBound(get(bounds, 3), false, value4);
+        assertEndBound(get(bounds, 4), false, value4, value3, value2);
+        assertEndBound(get(bounds, 5), true, value4, value3, value2);
+        assertEndBound(get(bounds, 6), true, value4);
     }
 
     /**
      * Test mixing single and multi equals restrictions (e.g. clustering_0 = 1 AND (clustering_1, clustering_2) = (2, 3))
      */
     @Test
-    public void testBoundsAsCompositesWithSingleEqAndMultiEqRestrictions()
+    public void testBoundsAsClusteringWithSingleEqAndMultiEqRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC, Sort.ASC, Sort.ASC);
 
@@ -1414,67 +1411,67 @@
         // clustering_0 = 1 AND (clustering_1, clustering_2) = (2, 3)
         Restriction singleEq = newSingleEq(cfMetaData, 0, value1);
         Restriction multiEq = newMultiEq(cfMetaData, 1, value2, value3);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(multiEq);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3);
 
         // clustering_0 = 1 AND clustering_1 = 2 AND (clustering_2, clustering_3) = (3, 4)
         singleEq = newSingleEq(cfMetaData, 0, value1);
         Restriction singleEq2 = newSingleEq(cfMetaData, 1, value2);
         multiEq = newMultiEq(cfMetaData, 2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(singleEq2).mergeWith(multiEq);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3, value4);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3, value4);
 
         // (clustering_0, clustering_1) = (1, 2) AND clustering_2 = 3
         singleEq = newSingleEq(cfMetaData, 2, value3);
         multiEq = newMultiEq(cfMetaData, 0, value1, value2);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(multiEq);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3);
 
         // clustering_0 = 1 AND (clustering_1, clustering_2) = (2, 3) AND clustering_3 = 4
         singleEq = newSingleEq(cfMetaData, 0, value1);
         singleEq2 = newSingleEq(cfMetaData, 3, value4);
         multiEq = newMultiEq(cfMetaData, 1, value2, value3);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(multiEq).mergeWith(singleEq2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3, value4);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3, value4);
     }
 
     /**
      * Test clustering_0 = 1 AND (clustering_1, clustering_2) IN ((2, 3), (4, 5))
      */
     @Test
-    public void testBoundsAsCompositesWithSingleEqAndMultiINRestrictions()
+    public void testBoundsAsClusteringWithSingleEqAndMultiINRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC, Sort.ASC, Sort.ASC);
 
@@ -1487,49 +1484,49 @@
         // clustering_0 = 1 AND (clustering_1, clustering_2) IN ((2, 3), (4, 5))
         Restriction singleEq = newSingleEq(cfMetaData, 0, value1);
         Restriction multiIN = newMultiIN(cfMetaData, 1, asList(value2, value3), asList(value4, value5));
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(multiIN);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.START);
-        assertComposite(bounds.get(1), value1, value4, value5, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3);
+        assertStartBound(get(bounds, 1), true, value1, value4, value5);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
-        assertComposite(bounds.get(1), value1, value4, value5, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3);
+        assertEndBound(get(bounds, 1), true, value1, value4, value5);
 
         // clustering_0 = 1 AND (clustering_1, clustering_2) IN ((2, 3))
         singleEq = newSingleEq(cfMetaData, 0, value1);
         multiIN = newMultiIN(cfMetaData, 1, asList(value2, value3));
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiIN).mergeWith(singleEq);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3);
 
         // clustering_0 = 1 AND clustering_1 = 5 AND (clustering_2, clustering_3) IN ((2, 3), (4, 5))
         singleEq = newSingleEq(cfMetaData, 0, value1);
         Restriction singleEq2 = newSingleEq(cfMetaData, 1, value5);
         multiIN = newMultiIN(cfMetaData, 2, asList(value2, value3), asList(value4, value5));
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(multiIN).mergeWith(singleEq2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value5, value2, value3, EOC.START);
-        assertComposite(bounds.get(1), value1, value5, value4, value5, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value5, value2, value3);
+        assertStartBound(get(bounds, 1), true, value1, value5, value4, value5);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value5, value2, value3, EOC.END);
-        assertComposite(bounds.get(1), value1, value5, value4, value5, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value5, value2, value3);
+        assertEndBound(get(bounds, 1), true, value1, value5, value4, value5);
     }
 
     /**
@@ -1537,7 +1534,7 @@
      * (e.g. clustering_0 = 1 AND (clustering_1, clustering_2) > (2, 3))
      */
     @Test
-    public void testBoundsAsCompositesWithSingleEqAndSliceRestrictions()
+    public void testBoundsAsClusteringWithSingleEqAndSliceRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC, Sort.ASC);
 
@@ -1550,46 +1547,46 @@
         // clustering_0 = 1 AND (clustering_1, clustering_2) > (2, 3)
         Restriction singleEq = newSingleEq(cfMetaData, 0, value1);
         Restriction multiSlice = newMultiSlice(cfMetaData, 1, Bound.START, false, value2, value3);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(singleEq).mergeWith(multiSlice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1);
 
         // clustering_0 = 1 AND (clustering_1, clustering_2) > (2, 3) AND (clustering_1) < (4)
         singleEq = newSingleEq(cfMetaData, 0, value1);
         multiSlice = newMultiSlice(cfMetaData, 1, Bound.START, false, value2, value3);
         Restriction multiSlice2 = newMultiSlice(cfMetaData, 1, Bound.END, false, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiSlice2).mergeWith(singleEq).mergeWith(multiSlice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value4, EOC.START);
+        assertEndBound(get(bounds, 0), false, value1, value4);
 
         // clustering_0 = 1 AND (clustering_1, clustering_2) => (2, 3) AND (clustering_1, clustering_2) <= (4, 5)
         singleEq = newSingleEq(cfMetaData, 0, value1);
         multiSlice = newMultiSlice(cfMetaData, 1, Bound.START, true, value2, value3);
         multiSlice2 = newMultiSlice(cfMetaData, 1, Bound.END, true, value4, value5);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiSlice2).mergeWith(singleEq).mergeWith(multiSlice);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.NONE);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value4, value5, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value4, value5);
     }
 
     /**
@@ -1597,7 +1594,7 @@
      * (e.g. clustering_0 = 1 AND (clustering_1, clustering_2) > (2, 3))
      */
     @Test
-    public void testBoundsAsCompositesWithMultiEqAndSingleSliceRestrictions()
+    public void testBoundsAsClusteringWithMultiEqAndSingleSliceRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC, Sort.ASC);
 
@@ -1608,20 +1605,20 @@
         // (clustering_0, clustering_1) = (1, 2) AND clustering_2 > 3
         Restriction multiEq = newMultiEq(cfMetaData, 0, value1, value2);
         Restriction singleSlice = newSingleSlice(cfMetaData, 2, Bound.START, false, value3);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiEq).mergeWith(singleSlice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2, value3);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0),  value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
     }
 
     @Test
-    public void testBoundsAsCompositesWithSeveralMultiColumnRestrictions()
+    public void testBoundsAsClusteringWithSeveralMultiColumnRestrictions()
     {
         CFMetaData cfMetaData = newCFMetaData(Sort.ASC, Sort.ASC, Sort.ASC, Sort.ASC);
 
@@ -1634,142 +1631,106 @@
         // (clustering_0, clustering_1) = (1, 2) AND (clustering_2, clustering_3) > (3, 4)
         Restriction multiEq = newMultiEq(cfMetaData, 0, value1, value2);
         Restriction multiSlice = newMultiSlice(cfMetaData, 2, Bound.START, false, value3, value4);
-        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        PrimaryKeyRestrictions restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiEq).mergeWith(multiSlice);
 
-        List<Composite> bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        SortedSet<Slice.Bound> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
+        assertStartBound(get(bounds, 0), false, value1, value2, value3, value4);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0),  value1, value2, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2);
 
         // (clustering_0, clustering_1) = (1, 2) AND (clustering_2, clustering_3) IN ((3, 4), (4, 5))
         multiEq = newMultiEq(cfMetaData, 0, value1, value2);
         Restriction multiIN = newMultiIN(cfMetaData, 2, asList(value3, value4), asList(value4, value5));
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiEq).mergeWith(multiIN);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
-        assertComposite(bounds.get(1), value1, value2, value4, value5, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3, value4);
+        assertStartBound(get(bounds, 1), true, value1, value2, value4, value5);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
-        assertComposite(bounds.get(1), value1, value2, value4, value5, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3, value4);
+        assertEndBound(get(bounds, 1), true, value1, value2, value4, value5);
 
         // (clustering_0, clustering_1) = (1, 2) AND (clustering_2, clustering_3) = (3, 4)
         multiEq = newMultiEq(cfMetaData, 0, value1, value2);
         Restriction multiEq2 = newMultiEq(cfMetaData, 2, value3, value4);
-        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator);
+        restrictions = new PrimaryKeyRestrictionSet(cfMetaData.comparator, false);
         restrictions = restrictions.mergeWith(multiEq).mergeWith(multiEq2);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.START, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.START);
+        assertStartBound(get(bounds, 0), true, value1, value2, value3, value4);
 
-        bounds = restrictions.boundsAsComposites(cfMetaData, Bound.END, QueryOptions.DEFAULT);
+        bounds = restrictions.boundsAsClustering(Bound.END, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
-        assertComposite(bounds.get(0), value1, value2, value3, value4, EOC.END);
+        assertEndBound(get(bounds, 0), true, value1, value2, value3, value4);
     }
 
     /**
-     * Asserts that the specified <code>Composite</code> is an empty one.
+     * Asserts that the specified <code>Bound</code> is an empty start.
      *
-     * @param composite the composite to check
+     * @param bound the bound to check
      */
-    private static void assertEmptyComposite(Composite composite)
+    private static void assertEmptyStart(Slice.Bound bound)
     {
-        assertEquals(Composites.EMPTY, composite);
+        assertEquals(Slice.Bound.BOTTOM, bound);
     }
 
     /**
-     * Asserts that the specified <code>Composite</code> contains the specified element and the specified EOC.
+     * Asserts that the specified <code>Bound</code> is an empty end.
      *
-     * @param composite the composite to check
-     * @param element the expected element of the composite
-     * @param eoc the expected EOC of the composite
+     * @param bound the bound to check
      */
-    private static void assertComposite(Composite composite, ByteBuffer element, EOC eoc)
+    private static void assertEmptyEnd(Slice.Bound bound)
     {
-        assertComposite(composite, eoc, element);
+        assertEquals(Slice.Bound.TOP, bound);
     }
 
     /**
-     * Asserts that the specified <code>Composite</code> contains the 2 specified element and the specified EOC.
+     * Asserts that the specified <code>Slice.Bound</code> is a start with the specified elements.
      *
-     * @param composite the composite to check
-     * @param eoc the expected EOC of the composite
-     * @param elements the expected element of the composite
+     * @param bound the bound to check
+     * @param isInclusive if the bound is expected to be inclusive
+     * @param elements the expected elements of the clustering
      */
-    private static void assertComposite(Composite composite, ByteBuffer firstElement, ByteBuffer secondElement, EOC eoc)
+    private static void assertStartBound(Slice.Bound bound, boolean isInclusive, ByteBuffer... elements)
     {
-        assertComposite(composite, eoc, firstElement, secondElement);
+        assertBound(bound, true, isInclusive, elements);
     }
 
     /**
-     * Asserts that the specified <code>Composite</code> contains the 3 specified element and the specified EOC.
+     * Asserts that the specified <code>Slice.Bound</code> is a end with the specified elements.
      *
-     * @param composite the composite to check
-     * @param firstElement the first expected element of the composite
-     * @param secondElement the second expected element of the composite
-     * @param thirdElement the third expected element of the composite
-     * @param eoc the expected EOC of the composite
-     * @param elements the expected element of the composite
+     * @param bound the bound to check
+     * @param isInclusive if the bound is expected to be inclusive
+     * @param elements the expected elements of the clustering
      */
-    private static void assertComposite(Composite composite,
-                                        ByteBuffer firstElement,
-                                        ByteBuffer secondElement,
-                                        ByteBuffer thirdElement,
-                                        EOC eoc)
+    private static void assertEndBound(Slice.Bound bound, boolean isInclusive, ByteBuffer... elements)
     {
-        assertComposite(composite, eoc, firstElement, secondElement, thirdElement);
+        assertBound(bound, false, isInclusive, elements);
     }
 
-    /**
-     * Asserts that the specified <code>Composite</code> contains the 4 specified element and the specified EOC.
-     *
-     * @param composite the composite to check
-     * @param firstElement the first expected element of the composite
-     * @param secondElement the second expected element of the composite
-     * @param thirdElement the third expected element of the composite
-     * @param fourthElement the fourth expected element of the composite
-     * @param eoc the expected EOC of the composite
-     * @param elements the expected element of the composite
-     */
-    private static void assertComposite(Composite composite,
-                                        ByteBuffer firstElement,
-                                        ByteBuffer secondElement,
-                                        ByteBuffer thirdElement,
-                                        ByteBuffer fourthElement,
-                                        EOC eoc)
+    private static void assertBound(Slice.Bound bound, boolean isStart, boolean isInclusive, ByteBuffer... elements)
     {
-        assertComposite(composite, eoc, firstElement, secondElement, thirdElement, fourthElement);
-    }
-
-    /**
-     * Asserts that the specified <code>Composite</code> contains the specified elements and EOC.
-     *
-     * @param composite the composite to check
-     * @param eoc the expected EOC of the composite
-     * @param elements the expected elements of the composite
-     */
-    private static void assertComposite(Composite composite, EOC eoc, ByteBuffer... elements)
-    {
-        assertEquals("the composite size is not the expected one:", elements.length, composite.size());
+        assertEquals("the bound size is not the expected one:", elements.length, bound.size());
+        assertEquals("the bound should be a " + (isStart ? "start" : "end") + " but is a " + (bound.isStart() ? "start" : "end"), isStart, bound.isStart());
+        assertEquals("the bound inclusiveness is not the expected one", isInclusive, bound.isInclusive());
         for (int i = 0, m = elements.length; i < m; i++)
         {
             ByteBuffer element = elements[i];
-            assertTrue(String.format("the element %s of the composite is not the expected one: expected %s but was %s",
+            assertTrue(String.format("the element %s of the bound is not the expected one: expected %s but was %s",
                                      i,
                                      ByteBufferUtil.toInt(element),
-                                     ByteBufferUtil.toInt(composite.get(i))),
-                       element.equals(composite.get(i)));
+                                     ByteBufferUtil.toInt(bound.get(i))),
+                       element.equals(bound.get(i)));
         }
-        assertEquals("the EOC of the composite is not the expected one:", eoc, composite.eoc());
     }
 
     /**
@@ -1785,17 +1746,13 @@
         for (Sort sort : sorts)
             types.add(sort == Sort.ASC ? Int32Type.instance : ReversedType.getInstance(Int32Type.instance));
 
-        CompoundSparseCellNameType cType = new CompoundSparseCellNameType(types);
-        CFMetaData cfMetaData = new CFMetaData("keyspace", "test", ColumnFamilyType.Standard, cType);
+        CFMetaData.Builder builder = CFMetaData.Builder.create("keyspace", "test")
+                                                       .addPartitionKey("partition_key", Int32Type.instance);
 
         for (int i = 0; i < sorts.length; i++)
-        {
-            ByteBuffer name = ByteBufferUtil.bytes("clustering_" + i);
-            ColumnDefinition columnDef = ColumnDefinition.clusteringKeyDef(cfMetaData, name, types.get(i), i);
-            cfMetaData.addColumnDefinition(columnDef);
-        }
-        cfMetaData.rebuild();
-        return cfMetaData;
+            builder.addClusteringColumn("clustering_" + i, types.get(i));
+
+        return builder.build();
     }
 
     /**
@@ -1809,7 +1766,7 @@
     private static Restriction newSingleEq(CFMetaData cfMetaData, int index, ByteBuffer value)
     {
         ColumnDefinition columnDef = getClusteringColumnDefinition(cfMetaData, index);
-        return new SingleColumnRestriction.EQ(columnDef, toTerm(value));
+        return new SingleColumnRestriction.EQRestriction(columnDef, toTerm(value));
     }
 
     /**
@@ -1827,7 +1784,7 @@
         {
             columnDefinitions.add(getClusteringColumnDefinition(cfMetaData, firstIndex + i));
         }
-        return new MultiColumnRestriction.EQ(columnDefinitions, toMultiItemTerminal(values));
+        return new MultiColumnRestriction.EQRestriction(columnDefinitions, toMultiItemTerminal(values));
     }
 
     /**
@@ -1848,7 +1805,7 @@
             columnDefinitions.add(getClusteringColumnDefinition(cfMetaData, firstIndex + i));
             terms.add(toMultiItemTerminal(values[i].toArray(new ByteBuffer[0])));
         }
-        return new MultiColumnRestriction.InWithValues(columnDefinitions, terms);
+        return new MultiColumnRestriction.InRestrictionWithValues(columnDefinitions, terms);
     }
 
     /**
@@ -1862,7 +1819,7 @@
     private static Restriction newSingleIN(CFMetaData cfMetaData, int index, ByteBuffer... values)
     {
         ColumnDefinition columnDef = getClusteringColumnDefinition(cfMetaData, index);
-        return new SingleColumnRestriction.InWithValues(columnDef, toTerms(values));
+        return new SingleColumnRestriction.InRestrictionWithValues(columnDef, toTerms(values));
     }
 
     /**
@@ -1890,7 +1847,7 @@
     private static Restriction newSingleSlice(CFMetaData cfMetaData, int index, Bound bound, boolean inclusive, ByteBuffer value)
     {
         ColumnDefinition columnDef = getClusteringColumnDefinition(cfMetaData, index);
-        return new SingleColumnRestriction.Slice(columnDef, bound, inclusive, toTerm(value));
+        return new SingleColumnRestriction.SliceRestriction(columnDef, bound, inclusive, toTerm(value));
     }
 
     /**
@@ -1910,7 +1867,7 @@
         {
             columnDefinitions.add(getClusteringColumnDefinition(cfMetaData, i + firstIndex));
         }
-        return new MultiColumnRestriction.Slice(columnDefinitions, bound, inclusive, toMultiItemTerminal(values));
+        return new MultiColumnRestriction.SliceRestriction(columnDefinitions, bound, inclusive, toMultiItemTerminal(values));
     }
 
     /**
@@ -1949,6 +1906,11 @@
         return terms;
     }
 
+    private static <T> T get(SortedSet<T> set, int i)
+    {
+        return Iterables.get(set, i);
+    }
+
     private static enum Sort
     {
         ASC,

diff --git a/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java b/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java
index 8757b19..2b7a197 100644
--- a/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java
+++ b/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java

@@ -53,7 +53,7 @@
     @BeforeClass
     public static void setUpClass()
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
+        DatabaseDescriptor.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     @Test
@@ -68,7 +68,7 @@
                                 " v1 int," +
                                 " v2 ascii," +
                                 " v3 frozen<" + typeName + ">)");
-        userType = Schema.instance.getKSMetaData(KEYSPACE).userTypes.getType(ByteBufferUtil.bytes(typeName));
+        userType = Schema.instance.getKSMetaData(KEYSPACE).types.get(ByteBufferUtil.bytes(typeName)).get();
         functionName = createFunction(KEYSPACE, "int, ascii",
                                       "CREATE FUNCTION %s (i int, a ascii) " +
                                       "CALLED ON NULL INPUT " +
@@ -159,7 +159,6 @@
                                                                 .addMapping(kSpec, columnDefinition("k"))
                                                                 .addMapping(v1Spec, columnDefinition("v1"))
                                                                 .addMapping(v2Spec, columnDefinition("v2"));
-
         verify(expected, "SELECT k AS k_alias, v1 AS v1_alias, v2 AS v2_alias FROM %s");
     }
 

diff --git a/test/unit/org/apache/cassandra/cql3/validation/ThriftIllegalColumnsTest.java b/test/unit/org/apache/cassandra/cql3/validation/ThriftIllegalColumnsTest.java
new file mode 100644
index 0000000..2d922e0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/ThriftIllegalColumnsTest.java

@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.validation.operations.ThriftCQLTester;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.thrift.Cassandra;
+import org.apache.cassandra.thrift.Column;
+import org.apache.cassandra.thrift.ColumnParent;
+import org.apache.cassandra.thrift.InvalidRequestException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.thrift.ConsistencyLevel.ONE;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class ThriftIllegalColumnsTest extends ThriftCQLTester
+{
+    final String NON_COMPACT_TABLE = "t1";
+    final String COMPACT_TABLE = "t2";
+
+    @Test
+    public void testNonCompactUpdateWithPrimaryKeyColumnName() throws Throwable
+    {
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+        String table = createTable(KEYSPACE, "CREATE TABLE %s (k int, c1 int,  c2 int, v int, PRIMARY KEY (k, c1, c2))");
+
+        // A cell name which represents a primary key column
+        ByteBuffer badCellName = CompositeType.build(ByteBufferUtil.bytes(0), ByteBufferUtil.bytes(0), ByteBufferUtil.bytes("c1"));
+        // A cell name which represents a regular column
+        ByteBuffer goodCellName = CompositeType.build(ByteBufferUtil.bytes(0), ByteBufferUtil.bytes(0), ByteBufferUtil.bytes("v"));
+
+        ColumnParent parent = new ColumnParent(table);
+        ByteBuffer key = ByteBufferUtil.bytes(0);
+        Column column = new Column();
+        column.setName(badCellName);
+        column.setValue(ByteBufferUtil.bytes(999));
+        column.setTimestamp(System.currentTimeMillis());
+
+        try
+        {
+            client.insert(key, parent, column, ONE);
+            fail("Expected exception");
+        } catch (InvalidRequestException e) {
+            assertEquals("Cannot add primary key column c1 to partition update", e.getWhy());
+        }
+
+        column.setName(goodCellName);
+        client.insert(key, parent, column, ONE);
+        assertRows(execute("SELECT v from %s WHERE k = 0"), row(999));
+    }
+
+    @Test
+    public void testThriftCompactUpdateWithPrimaryKeyColumnName() throws Throwable
+    {
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+        String table = createTable(KEYSPACE, "CREATE TABLE %s (k int, v int, PRIMARY KEY (k)) WITH COMPACT STORAGE");
+
+        // A cell name which represents a primary key column
+        ByteBuffer badCellName = ByteBufferUtil.bytes("k");
+        // A cell name which represents a regular column
+        ByteBuffer goodCellName = ByteBufferUtil.bytes("v");
+
+        ColumnParent parent = new ColumnParent(table);
+        ByteBuffer key = ByteBufferUtil.bytes(0);
+        Column column = new Column();
+        column.setName(badCellName);
+        column.setValue(ByteBufferUtil.bytes(999));
+        column.setTimestamp(System.currentTimeMillis());
+        // if the table is compact, a cell name which appears to reference a primary
+        // key column is treated as a dynamic column and so the update is allowed
+        client.insert(key, parent, column, ONE);
+
+        column.setName(goodCellName);
+        client.insert(key, parent, column, ONE);
+        assertRows(execute("SELECT v from %s where k=0"), row(999));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/ThriftIntegrationTest.java b/test/unit/org/apache/cassandra/cql3/validation/ThriftIntegrationTest.java
index c7e5088..def489e 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/ThriftIntegrationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/ThriftIntegrationTest.java

@@ -29,14 +29,15 @@
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.cql3.ColumnSpecification;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.validation.operations.ThriftCQLTester;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.db.marshal.CounterColumnType;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.thrift.Cassandra;
 import org.apache.cassandra.thrift.CfDef;
 import org.apache.cassandra.thrift.Column;
@@ -66,6 +67,8 @@
     @Before
     public void setupSuperColumnFamily() throws Throwable
     {
+        StorageService.instance.setRpcReady(true);
+
         final String denseTableName = createTableName();
         final String sparseTableName =  currentSparseTable();
         final String counterTableName = currentCounterTable();
@@ -113,10 +116,14 @@
     }
 
     @Test
-    public void testReadCounter() throws Throwable
+    public void testCounterTableReads() throws Throwable
     {
         populateCounterTable();
+        beforeAndAfterFlush(this::testCounterTableReadsInternal);
+    }
 
+    private void testCounterTableReadsInternal() throws Throwable
+    {
         UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentCounterTable()));
         assertRows(resultSet,
                    row("key1", "ck1", "counter1", 10L),
@@ -149,12 +156,14 @@
                                                      Collections.singletonMap(currentCounterTable(), Arrays.asList(mutation2))),
                             ONE);
 
-        UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentCounterTable()));
-        assertRows(resultSet,
-                   row("key1", "ck1", "counter1", 11L),
-                   row("key1", "ck1", "counter2", 5L),
-                   row("key2", "ck1", "counter1", 110L),
-                   row("key2", "ck1", "counter2", 5L));
+        beforeAndAfterFlush(() -> {
+            UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentCounterTable()));
+            assertRows(resultSet,
+                       row("key1", "ck1", "counter1", 11L),
+                       row("key1", "ck1", "counter2", 5L),
+                       row("key2", "ck1", "counter1", 110L),
+                       row("key2", "ck1", "counter2", 5L));
+        });
     }
 
     @Test
@@ -169,12 +178,14 @@
         execute(String.format("UPDATE %s.%s set value = value - ? WHERE key = 'key1' AND column1 = 'ck1' AND column2 = 'counter2'", KEYSPACE, currentCounterTable()), 2L);
         execute(String.format("UPDATE %s.%s set value = value - ? WHERE key = 'key2' AND column1 = 'ck1' AND column2 = 'counter2'", KEYSPACE, currentCounterTable()), 100L);
 
-        UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentCounterTable()));
-        assertRows(resultSet,
-                   row("key1", "ck1", "counter1", 11L),
-                   row("key1", "ck1", "counter2", 3L),
-                   row("key2", "ck1", "counter1", 110L),
-                   row("key2", "ck1", "counter2", -95L));
+        beforeAndAfterFlush(() -> {
+            UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentCounterTable()));
+            assertRows(resultSet,
+                       row("key1", "ck1", "counter1", 11L),
+                       row("key1", "ck1", "counter2", 3L),
+                       row("key2", "ck1", "counter1", 110L),
+                       row("key2", "ck1", "counter2", -95L));
+        });
     }
 
     @Test
@@ -210,7 +221,7 @@
     }
 
     @Test
-    public void alterDenseTable() throws Throwable
+    public void testDenseTableAlter() throws Throwable
     {
         populateDenseTable();
 
@@ -219,33 +230,40 @@
         alterTable(String.format("ALTER TABLE %s.%s RENAME key TO renamed_key", KEYSPACE, currentDenseTable()));
         alterTable(String.format("ALTER TABLE %s.%s RENAME value TO renamed_value", KEYSPACE, currentDenseTable()));
 
-        UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentDenseTable()));
-        assertEquals("renamed_key", resultSet.metadata().get(0).name.toString());
-        assertEquals("renamed_column1", resultSet.metadata().get(1).name.toString());
-        assertEquals("renamed_column2", resultSet.metadata().get(2).name.toString());
-        assertEquals("renamed_value", resultSet.metadata().get(3).name.toString());
-        assertRows(resultSet,
-                   row("key1", "val1", 1, "value1"),
-                   row("key1", "val1", 2, "value2"),
-                   row("key1", "val2", 4, "value4"),
-                   row("key1", "val2", 5, "value5"),
-                   row("key2", "val1", 1, "value1"),
-                   row("key2", "val1", 2, "value2"),
-                   row("key2", "val2", 4, "value4"),
-                   row("key2", "val2", 5, "value5"));
+        beforeAndAfterFlush(() -> {
+            UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentDenseTable()));
+            assertEquals("renamed_key", resultSet.metadata().get(0).name.toString());
+            assertEquals("renamed_column1", resultSet.metadata().get(1).name.toString());
+                                assertEquals("renamed_column2", resultSet.metadata().get(2).name.toString());
+                                assertEquals("renamed_value", resultSet.metadata().get(3).name.toString());
+            assertRows(resultSet,
+                       row("key1", "val1", 1, "value1"),
+                       row("key1", "val1", 2, "value2"),
+                       row("key1", "val2", 4, "value4"),
+                       row("key1", "val2", 5, "value5"),
+                       row("key2", "val1", 1, "value1"),
+                       row("key2", "val1", 2, "value2"),
+                       row("key2", "val2", 4, "value4"),
+                       row("key2", "val2", 5, "value5"));
+        });
     }
 
     @Test
     public void testDenseTableReads() throws Throwable
     {
         populateDenseTable();
+        beforeAndAfterFlush(this::testDenseTableReadsInternal);
+    }
 
+    private void testDenseTableReadsInternal() throws Throwable
+    {
         UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentDenseTable()));
         assertEquals("key", resultSet.metadata().get(0).name.toString());
         assertEquals("column1", resultSet.metadata().get(1).name.toString());
         assertEquals("column2", resultSet.metadata().get(2).name.toString());
         assertEquals("value", resultSet.metadata().get(3).name.toString());
 
+
         assertRows(resultSet,
                    row("key1", "val1", 1, "value1"),
                    row("key1", "val1", 2, "value2"),
@@ -256,11 +274,6 @@
                    row("key2", "val2", 4, "value4"),
                    row("key2", "val2", 5, "value5"));
 
-        assertRows(execute(String.format("select * from %s.%s where key = 'key1' and (column1, column2) > ('val1', 1)", KEYSPACE, currentDenseTable())),
-                   row("key1", "val1", 2, "value2"),
-                   row("key1", "val2", 4, "value4"),
-                   row("key1", "val2", 5, "value5"));
-
         assertRows(execute(String.format("select * from %s.%s LIMIT 5", KEYSPACE, currentDenseTable())),
                    row("key1", "val1", 1, "value1"),
                    row("key1", "val1", 2, "value2"),
@@ -308,22 +321,28 @@
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and column1 in ('val1', 'val2') and column2 IN (1, 4)", KEYSPACE, currentDenseTable())),
                    row("key1", "val1", 1, "value1"),
                    row("key1", "val2", 4, "value4"));
+
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and column1 in ('val1', 'val2')", KEYSPACE, currentDenseTable())),
                    row("key1", "val1", 1, "value1"),
                    row("key1", "val1", 2, "value2"),
                    row("key1", "val2", 4, "value4"),
                    row("key1", "val2", 5, "value5"));
+
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and column1 in ('val1', 'val2') and column2 = 1", KEYSPACE, currentDenseTable())),
                    row("key1", "val1", 1, "value1"));
+
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and (column1, column2) = ('val2', 4)", KEYSPACE, currentDenseTable())),
                    row("key1", "val2", 4, "value4"));
+
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and (column1, column2) >= ('val2', 4)", KEYSPACE, currentDenseTable())),
                    row("key1", "val2", 4, "value4"),
                    row("key1", "val2", 5, "value5"));
+
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and (column1, column2) > ('val1', 1)", KEYSPACE, currentDenseTable())),
                    row("key1", "val1", 2, "value2"),
                    row("key1", "val2", 4, "value4"),
                    row("key1", "val2", 5, "value5"));
+
         assertRows(execute(String.format("select * from %s.%s where key = 'key1' and (column1, column2) > ('val2', 1)", KEYSPACE, currentDenseTable())),
                    row("key1", "val2", 4, "value4"),
                    row("key1", "val2", 5, "value5"));
@@ -344,7 +363,7 @@
                    row("val1", "value1"),
                    row("val1", "value2"));
 
-        assertInvalidMessage("Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables",
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                              String.format("CREATE INDEX ON %s.%s (column2)", KEYSPACE, currentDenseTable()));
         assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                              String.format("CREATE INDEX ON %s.%s (value)", KEYSPACE, currentDenseTable()));
@@ -357,9 +376,13 @@
     @Test
     public void testDenseTablePartialCqlInserts() throws Throwable
     {
-        assertInvalidMessage("Column value is mandatory for this COMPACT STORAGE table",
+        assertInvalidMessage("Column value is mandatory for SuperColumn tables",
                              String.format("INSERT INTO %s.%s (key, column1, column2) VALUES ('key1', 'val1', 1)", KEYSPACE, currentDenseTable()));
 
+        // That's slightly different from 2.X, since null map keys are not allowed
+        assertInvalidMessage("Column key is mandatory for SuperColumn tables",
+                             String.format("INSERT INTO %s.%s (key, column1, value) VALUES ('key1', 'val1', 'value1')", KEYSPACE, currentDenseTable()));
+
         execute(String.format("INSERT INTO %s.%s (key, column1, column2, value) VALUES ('key1', 'val1', 1, NULL)", KEYSPACE, currentDenseTable()));
         execute(String.format("INSERT INTO %s.%s (key, column1, column2, value) VALUES ('key1', 'val1', 1, ?)", KEYSPACE, currentDenseTable()), unset());
         assertEmpty(execute(String.format("select * from %s.%s", KEYSPACE, currentDenseTable())));
@@ -395,8 +418,12 @@
     @Test
     public void testDenseTableCqlUpdates() throws Throwable
     {
-        assertInvalidMessage("PRIMARY KEY part column2 found in SET part",
+        assertInvalidMessage("Column key is mandatory for SuperColumn tables",
                              String.format("UPDATE %s.%s SET column2 = 1, value = 'value1' WHERE key = 'key1' AND column1 = 'val1'", KEYSPACE, currentDenseTable()));
+        assertInvalidMessage("Column `column2` of type `int` found in SET part",
+                             String.format("UPDATE %s.%s SET column2 = 1, value = 'value1' WHERE key = 'key1' AND column1 = 'val1' AND column2 = 1", KEYSPACE, currentDenseTable()));
+        assertInvalidMessage("Some clustering keys are missing: column1",
+                             String.format("UPDATE %s.%s SET value = 'value1' WHERE key = 'key1' AND column2 = 1", KEYSPACE, currentDenseTable()));
 
         execute(String.format("UPDATE %s.%s SET value = 'value1' WHERE key = 'key1' AND column1 = 'val1' AND column2 = 1", KEYSPACE, currentDenseTable()));
         execute(String.format("UPDATE %s.%s SET value = 'value2' WHERE key = 'key1' AND column1 = 'val1' AND column2 = 2", KEYSPACE, currentDenseTable()));
@@ -487,8 +514,11 @@
         assertRows(execute(String.format("SELECT * FROM %s.%s", KEYSPACE, currentDenseTable())),
                    row("key3", "val1", 1, "value1"));
 
-        assertInvalidMessage("Multi-column relations cannot be used in WHERE clauses for UPDATE and DELETE statements: (column1, column2) = ('val1', 1)",
+        assertInvalidMessage("Multi-column relations cannot be used in WHERE clauses for UPDATE and DELETE statements",
                              String.format("DELETE FROM %s.%s WHERE key = 'key3' AND (column1, column2) = ('val1', 1)", KEYSPACE, currentDenseTable()));
+
+        assertInvalidMessage("Token relations cannot be used in WHERE clauses for UPDATE and DELETE statements: token(key) > token('key3')",
+                             String.format("DELETE FROM %s.%s WHERE token(key) > token('key3')", KEYSPACE, currentDenseTable()));
     }
 
     @Test
@@ -525,7 +555,11 @@
     public void testSparseTableCqlReads() throws Throwable
     {
         populateSparseTable();
+        beforeAndAfterFlush(this::testSparseTableCqlReadsInternal);
+    }
 
+    private void testSparseTableCqlReadsInternal() throws Throwable
+    {
         UntypedResultSet resultSet = execute(String.format("select * from %s.%s", KEYSPACE, currentSparseTable()));
         assertEquals("key", resultSet.metadata().get(0).name.toString());
         assertEquals("column1", resultSet.metadata().get(1).name.toString());
@@ -553,17 +587,17 @@
         resultSet = execute(String.format("select col1 as a, col2 as b, column1 as c, key as d from %s.%s WHERE key = ? AND column1 = ?", KEYSPACE, currentSparseTable()), "key1", "val2");
         assertRows(resultSet,
                    row(3L, 4L, "val2", "key1"));
-        assertEquals(resultSet.metadata().get(0).name.toString(), "a");
-        assertEquals(resultSet.metadata().get(1).name.toString(), "b");
-        assertEquals(resultSet.metadata().get(2).name.toString(), "c");
-        assertEquals(resultSet.metadata().get(3).name.toString(), "d");
+        assertEquals(resultSet.metadata().get(0).name, ColumnIdentifier.getInterned("a", true));
+        assertEquals(resultSet.metadata().get(1).name, ColumnIdentifier.getInterned("b", true));
+        assertEquals(resultSet.metadata().get(2).name, ColumnIdentifier.getInterned("c", true));
+        assertEquals(resultSet.metadata().get(3).name, ColumnIdentifier.getInterned("d", true));
 
         assertRows(execute(String.format("select col1, col2 from %s.%s WHERE key = ? AND column1 = ?", KEYSPACE, currentSparseTable()), "key1", "val2"),
                    row(3L, 4L));
 
-        assertInvalidMessage("Secondary indexes are not supported on super column families",
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                              String.format("CREATE INDEX ON %s.%s (column1)", KEYSPACE, currentSparseTable()));
-        assertInvalidMessage("Secondary indexes are not supported on super column families",
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                              String.format("CREATE INDEX ON %s.%s (col1)", KEYSPACE, currentSparseTable()));
 
         assertRows(execute(String.format("SELECT JSON * FROM %s.%s WHERE key = ? AND column1 = ?", KEYSPACE, currentSparseTable()), "key1", "val2"),
@@ -676,15 +710,15 @@
     @Test
     public void testFiltering() throws Throwable
     {
-        assertInvalidMessage("Predicates on non-primary-key columns (value) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Filtering is not supported on SuperColumn tables",
                              String.format("select * from %s.%s WHERE value = ?", KEYSPACE, currentDenseTable()),
                              "value5");
-        assertInvalidMessage("Predicates on non-primary-key columns (value) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Filtering is not supported on SuperColumn tables",
                              String.format("select * from %s.%s WHERE value = ? ALLOW FILTERING", KEYSPACE, currentDenseTable()),
                              "value5");
-        assertInvalidMessage("Predicates on non-primary-key columns (value) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Filtering is not supported on SuperColumn tables",
                              String.format("SELECT * FROM %s.%s WHERE value = 'value2' ALLOW FILTERING", KEYSPACE, currentDenseTable()));
-        assertInvalidMessage("PRIMARY KEY column \"column2\" cannot be restricted as preceding column \"column1\" is not restricted",
+        assertInvalidMessage("Filtering is not supported on SuperColumn tables",
                              String.format("SELECT * FROM %s.%s WHERE column2 = 1 ALLOW FILTERING", KEYSPACE, currentDenseTable()));
     }
 
@@ -700,8 +734,9 @@
                            "key1", "val1", 1, "value1"),
                    row(false, "key1", "val1", 1, "value1"));
 
-        execute(String.format("UPDATE %s.%s SET value = 'changed' WHERE key = ? AND column1 = ? IF value = ?", KEYSPACE, currentDenseTable()),
-                "key1", "val1", "value1");
+        // in 2.2 this query was a no-op
+        assertInvalidMessage("Lightweight transactions on SuperColumn tables are only supported with supplied SuperColumn key",
+                             String.format("UPDATE %s.%s SET value = 'changed' WHERE key = ? AND column1 = ? IF value = ?", KEYSPACE, currentDenseTable()));
 
         assertRows(execute(String.format("UPDATE %s.%s SET value = 'changed' WHERE key = ? AND column1 = ? AND column2 = ? IF value = ?", KEYSPACE, currentDenseTable()),
                            "key1", "val1", 1, "value1"),
@@ -724,6 +759,9 @@
         assertInvalidMessage("PRIMARY KEY column 'column2' cannot have IF conditions",
                              String.format("UPDATE %s.%s SET value = 'changed2' WHERE key = ? AND column1 = ? AND column2 = ? IF value > ? AND column2 = ?", KEYSPACE, currentDenseTable()));
 
+        assertInvalidMessage("Lightweight transactions on SuperColumn tables are only supported with supplied SuperColumn key",
+                             String.format("UPDATE %s.%s SET value = 'changed2' WHERE key = ? AND column1 = ? IF value > ?", KEYSPACE, currentDenseTable()));
+
         execute(String.format("DELETE FROM %s.%s WHERE key = 'key1' AND column1 = 'val1' AND column2 = 1 IF EXISTS", KEYSPACE, currentDenseTable()));
         assertEmpty(execute(String.format("SELECT * FROM %s.%s", KEYSPACE, currentDenseTable())));
 
@@ -888,4 +926,17 @@
             column.addToColumns(c);
         return column;
     }
+
+    public void beforeAndAfterFlush(CheckedFunction runnable) throws Throwable
+    {
+        runnable.apply();
+        flushAll();
+        runnable.apply();
+    }
+
+    private void flushAll()
+    {
+        for (String cfName : new String[]{ currentTable(), currentSparseTable(), currentCounterTable() })
+            Keyspace.open(KEYSPACE).getColumnFamilyStore(cfName);
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java
index 69d5a5c..918033e 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java

@@ -17,11 +17,18 @@
  */
 package org.apache.cassandra.cql3.validation.entities;
 
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.Arrays;
 import java.util.UUID;
 
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
 
 public class CollectionsTest extends CQLTester
 {
@@ -237,7 +244,7 @@
         assertInvalidMessage("Attempted to set an element on a list which is null",
                              "UPDATE %s SET l[0] = ? WHERE k=0", list("v10"));
 
-        execute("UPDATE %s SET l = l - ? WHERE k=0 ", list("v11"));
+        execute("UPDATE %s SET l = l - ? WHERE k=0", list("v11"));
 
         assertRows(execute("SELECT l FROM %s WHERE k = 0"), row((Object) null));
     }
@@ -585,9 +592,84 @@
         assertInvalid("alter table %s add v set<int>");
     }
 
-    /**
-     * Test for 9838.
-     */
+    @Test
+    public void testDropAndReaddFrozenCollection() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v frozen<set<text>>, x int)");
+        execute("insert into %s (k, v) VALUES (0, {'fffffffff'})");
+        flush();
+        execute("alter table %s drop v");
+        assertInvalid("alter table %s add v frozen<set<int>>");
+    }
+
+    @Test
+    public void testMapWithLargePartition() throws Throwable
+    {
+        Random r = new Random();
+        long seed = System.nanoTime();
+        System.out.println("Seed " + seed);
+        r.setSeed(seed);
+
+        int len = (1024 * 1024)/100;
+        createTable("CREATE TABLE %s (userid text PRIMARY KEY, properties map<int, text>) with compression = {}");
+
+        final int numKeys = 200;
+        for (int i = 0; i < numKeys; i++)
+        {
+            byte[] b = new byte[len];
+            r.nextBytes(b);
+            execute("UPDATE %s SET properties[?] = ? WHERE userid = 'user'", i, new String(b));
+        }
+
+        flush();
+
+        Object[][] rows = getRows(execute("SELECT properties from %s where userid = 'user'"));
+        assertEquals(1, rows.length);
+        assertEquals(numKeys, ((Map) rows[0][0]).size());
+    }
+
+    @Test
+    public void testMapWithTwoSStables() throws Throwable
+    {
+        createTable("CREATE TABLE %s (userid text PRIMARY KEY, properties map<int, text>) with compression = {}");
+
+        final int numKeys = 100;
+        for (int i = 0; i < numKeys; i++)
+            execute("UPDATE %s SET properties[?] = ? WHERE userid = 'user'", i, "prop_" + Integer.toString(i));
+
+        flush();
+
+        for (int i = numKeys; i < 2*numKeys; i++)
+            execute("UPDATE %s SET properties[?] = ? WHERE userid = 'user'", i, "prop_" + Integer.toString(i));
+
+        flush();
+
+        Object[][] rows = getRows(execute("SELECT properties from %s where userid = 'user'"));
+        assertEquals(1, rows.length);
+        assertEquals(numKeys * 2, ((Map) rows[0][0]).size());
+    }
+
+    @Test
+    public void testSetWithTwoSStables() throws Throwable
+    {
+        createTable("CREATE TABLE %s (userid text PRIMARY KEY, properties set<text>) with compression = {}");
+
+        final int numKeys = 100;
+        for (int i = 0; i < numKeys; i++)
+            execute("UPDATE %s SET properties = properties + ? WHERE userid = 'user'", set("prop_" + Integer.toString(i)));
+
+        flush();
+
+        for (int i = numKeys; i < 2*numKeys; i++)
+            execute("UPDATE %s SET properties = properties + ? WHERE userid = 'user'", set("prop_" + Integer.toString(i)));
+
+        flush();
+
+        Object[][] rows = getRows(execute("SELECT properties from %s where userid = 'user'"));
+        assertEquals(1, rows.length);
+        assertEquals(numKeys * 2, ((Set) rows[0][0]).size());
+    }
+
     @Test
     public void testUpdateStaticList() throws Throwable
     {
@@ -608,6 +690,182 @@
     }
 
     @Test
+    public void testListWithElementsBiggerThan64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<text>)");
+
+        byte[] bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 1);
+        String largeText = new String(bytes);
+
+        bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 2);
+        String largeText2 = new String(bytes);
+
+        execute("INSERT INTO %s(k, l) VALUES (0, ?)", list(largeText, "v2"));
+        flush();
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list(largeText, "v2")));
+
+        execute("DELETE l[?] FROM %s WHERE k = 0", 0);
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v2")));
+
+        execute("UPDATE %s SET l[?] = ? WHERE k = 0", 0, largeText);
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list(largeText)));
+
+        // Full overwrite
+        execute("UPDATE %s SET l = ? WHERE k = 0", list("v1", largeText));
+        flush();
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v1", largeText)));
+
+        execute("UPDATE %s SET l = l + ? WHERE k = 0", list("v2", largeText2));
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v1", largeText, "v2", largeText2)));
+
+        execute("UPDATE %s SET l = l - ? WHERE k = 0", list(largeText, "v2"));
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v1", largeText2)));
+
+        execute("DELETE l FROM %s WHERE k = 0");
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row((Object) null));
+
+        execute("INSERT INTO %s(k, l) VALUES (0, ['" + largeText + "', 'v2'])");
+        flush();
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list(largeText, "v2")));
+    }
+
+    @Test
+    public void testMapsWithElementsBiggerThan64K() throws Throwable
+    {
+        byte[] bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 1);
+        String largeText = new String(bytes);
+        bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 2);
+        String largeText2 = new String(bytes);
+
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, m map<text, text>)");
+
+        execute("INSERT INTO %s(k, m) VALUES (0, ?)", map("k1", largeText, largeText, "v2"));
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k1", largeText, largeText, "v2")));
+
+        execute("UPDATE %s SET m[?] = ? WHERE k = 0", "k3", largeText);
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k1", largeText, largeText, "v2", "k3", largeText)));
+
+        execute("UPDATE %s SET m[?] = ? WHERE k = 0", largeText2, "v4");
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k1", largeText, largeText, "v2", "k3", largeText, largeText2, "v4")));
+
+        execute("DELETE m[?] FROM %s WHERE k = 0", "k1");
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map(largeText, "v2", "k3", largeText, largeText2, "v4")));
+
+        execute("DELETE m[?] FROM %s WHERE k = 0", largeText2);
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map(largeText, "v2", "k3", largeText)));
+
+        // Full overwrite
+        execute("UPDATE %s SET m = ? WHERE k = 0", map("k5", largeText, largeText, "v6"));
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k5", largeText, largeText, "v6")));
+
+        execute("UPDATE %s SET m = m + ? WHERE k = 0", map("k7", largeText));
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k5", largeText, largeText, "v6", "k7", largeText)));
+
+        execute("UPDATE %s SET m = m + ? WHERE k = 0", map(largeText2, "v8"));
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k5", largeText, largeText, "v6", "k7", largeText, largeText2, "v8")));
+
+        execute("DELETE m FROM %s WHERE k = 0");
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"), row((Object) null));
+
+        execute("INSERT INTO %s(k, m) VALUES (0, {'" + largeText + "' : 'v1', 'k2' : '" + largeText + "'})");
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map(largeText, "v1", "k2", largeText)));
+    }
+
+    @Test
+    public void testSetsWithElementsBiggerThan64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, s set<text>)");
+
+        byte[] bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 1);
+        String largeText = new String(bytes);
+
+        bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 2);
+        String largeText2 = new String(bytes);
+
+        execute("INSERT INTO %s(k, s) VALUES (0, ?)", set(largeText, "v2"));
+        flush();
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set(largeText, "v2")));
+
+        execute("DELETE s[?] FROM %s WHERE k = 0", largeText);
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set("v2")));
+
+        // Full overwrite
+        execute("UPDATE %s SET s = ? WHERE k = 0", set("v1", largeText));
+        flush();
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set("v1", largeText)));
+
+        execute("UPDATE %s SET s = s + ? WHERE k = 0", set("v2", largeText2));
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set("v1", largeText, "v2", largeText2)));
+
+        execute("UPDATE %s SET s = s - ? WHERE k = 0", set(largeText, "v2"));
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set("v1", largeText2)));
+
+        execute("DELETE s FROM %s WHERE k = 0");
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row((Object) null));
+
+        execute("INSERT INTO %s(k, s) VALUES (0, {'" + largeText + "', 'v2'})");
+        flush();
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set(largeText, "v2")));
+    }
+
+    @Test
+    public void testRemovalThroughUpdate() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<int>)");
+
+         execute("INSERT INTO %s(k, l) VALUES(?, ?)", 0, list(1, 2, 3));
+         assertRows(execute("SELECT * FROM %s"), row(0, list(1, 2, 3)));
+
+         execute("UPDATE %s SET l[0] = null WHERE k=0");
+         assertRows(execute("SELECT * FROM %s"), row(0, list(2, 3)));
+    }
+
+    @Test
     public void testInvalidInputForList() throws Throwable
     {
         createTable("CREATE TABLE %s(pk int PRIMARY KEY, l list<text>)");
@@ -757,7 +1015,7 @@
                              "INSERT INTO %s (k, s) VALUES (0, ?)",
                              set(tuple(1, "1", 1.0, 1), tuple(2, "2", 2.0, 2)));
 
-        assertInvalidMessage("Invalid set literal for s: value (1, '1', 1.0, 1) is not of type tuple<int, text, double>",
+        assertInvalidMessage("Invalid set literal for s: value (1, '1', 1.0, 1) is not of type frozen<tuple<int, text, double>>",
                              "INSERT INTO %s (k, s) VALUES (0, {(1, '1', 1.0, 1)})");
 
         createTable("CREATE TABLE %s (k int PRIMARY KEY, l frozen<list<tuple<int, text, double>>>)");
@@ -765,7 +1023,7 @@
                              "INSERT INTO %s (k, l) VALUES (0, ?)",
                              list(tuple(1, "1", 1.0, 1), tuple(2, "2", 2.0, 2)));
 
-        assertInvalidMessage("Invalid list literal for l: value (1, '1', 1.0, 1) is not of type tuple<int, text, double>",
+        assertInvalidMessage("Invalid list literal for l: value (1, '1', 1.0, 1) is not of type frozen<tuple<int, text, double>>",
                              "INSERT INTO %s (k, l) VALUES (0, [(1, '1', 1.0, 1)])");
 
         createTable("CREATE TABLE %s (k int PRIMARY KEY, m frozen<map<tuple<int, text, double>, int>>)");
@@ -773,7 +1031,7 @@
                              "INSERT INTO %s (k, m) VALUES (0, ?)",
                              map(tuple(1, "1", 1.0, 1), 1, tuple(2, "2", 2.0, 2), 2));
 
-        assertInvalidMessage("Invalid map literal for m: key (1, '1', 1.0, 1) is not of type tuple<int, text, double>",
+        assertInvalidMessage("Invalid map literal for m: key (1, '1', 1.0, 1) is not of type frozen<tuple<int, text, double>>",
                              "INSERT INTO %s (k, m) VALUES (0, {(1, '1', 1.0, 1) : 1})");
 
         createTable("CREATE TABLE %s (k int PRIMARY KEY, m frozen<map<int, tuple<int, text, double>>>)");
@@ -781,7 +1039,7 @@
                              "INSERT INTO %s (k, m) VALUES (0, ?)",
                              map(1, tuple(1, "1", 1.0, 1), 2, tuple(2, "2", 2.0, 2)));
 
-        assertInvalidMessage("Invalid map literal for m: value (1, '1', 1.0, 1) is not of type tuple<int, text, double>",
+        assertInvalidMessage("Invalid map literal for m: value (1, '1', 1.0, 1) is not of type frozen<tuple<int, text, double>>",
                              "INSERT INTO %s (k, m) VALUES (0, {1 : (1, '1', 1.0, 1)})");
     }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/CountersTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/CountersTest.java
index 41b73bc..c9939c8 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/CountersTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/CountersTest.java

@@ -59,8 +59,8 @@
     @Test
     public void testRegularCounters() throws Throwable
     {
-        assertInvalidThrowMessage("Cannot add a non counter column",
-                                  ConfigurationException.class,
+        assertInvalidThrowMessage("Cannot mix counter and non counter columns in the same table",
+                                  InvalidRequestException.class,
                                   String.format("CREATE TABLE %s.%s (id bigint PRIMARY KEY, count counter, things set<text>)", KEYSPACE, createTableName()));
     }
 
@@ -113,6 +113,80 @@
         );
     }
 
+    @Test
+    public void testCounterFiltering() throws Throwable
+    {
+        for (String compactStorageClause: new String[] {"", " WITH COMPACT STORAGE"})
+        {
+            createTable("CREATE TABLE %s (k int PRIMARY KEY, a counter)" + compactStorageClause);
+
+            for (int i = 0; i < 10; i++)
+                execute("UPDATE %s SET a = a + ? WHERE k = ?", (long) i, i);
+
+            execute("UPDATE %s SET a = a + ? WHERE k = ?", 6L, 10);
+
+            // GT
+            assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE a > ? ALLOW FILTERING", 5L),
+                                    row(6, 6L),
+                                    row(7, 7L),
+                                    row(8, 8L),
+                                    row(9, 9L),
+                                    row(10, 6L));
+
+            // GTE
+            assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE a >= ? ALLOW FILTERING", 6L),
+                                    row(6, 6L),
+                                    row(7, 7L),
+                                    row(8, 8L),
+                                    row(9, 9L),
+                                    row(10, 6L));
+
+            // LT
+            assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE a < ? ALLOW FILTERING", 3L),
+                                    row(0, 0L),
+                                    row(1, 1L),
+                                    row(2, 2L));
+
+            // LTE
+            assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE a <= ? ALLOW FILTERING", 3L),
+                                    row(0, 0L),
+                                    row(1, 1L),
+                                    row(2, 2L),
+                                    row(3, 3L));
+
+            // EQ
+            assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE a = ? ALLOW FILTERING", 6L),
+                                    row(6, 6L),
+                                    row(10, 6L));
+        }
+    }
+
+    @Test
+    public void testCounterFilteringWithNull() throws Throwable
+    {
+        for (String compactStorageClause : new String[]{ "", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (k int PRIMARY KEY, a counter, b counter)" + compactStorageClause);
+            execute("UPDATE %s SET a = a + ? WHERE k = ?", 1L, 1);
+
+            assertRows(execute("SELECT * FROM %s WHERE a > ? ALLOW FILTERING", 0L),
+                       row(1, 1L, null));
+            // GT
+            assertEmpty(execute("SELECT * FROM %s WHERE b > ? ALLOW FILTERING", 1L));
+            // GTE
+            assertEmpty(execute("SELECT * FROM %s WHERE b >= ? ALLOW FILTERING", 1L));
+            // LT
+            assertEmpty(execute("SELECT * FROM %s WHERE b < ? ALLOW FILTERING", 1L));
+            // LTE
+            assertEmpty(execute("SELECT * FROM %s WHERE b <= ? ALLOW FILTERING", 1L));
+            // EQ
+            assertEmpty(execute("SELECT * FROM %s WHERE b = ? ALLOW FILTERING", 1L));
+            // with null
+            assertInvalidMessage("Invalid null value for counter increment/decrement",
+                                 "SELECT * FROM %s WHERE b = null ALLOW FILTERING");
+        }
+    }
+
     /**
      * Test for the validation bug of #9395.
      */
@@ -122,5 +196,4 @@
         assertInvalidThrowMessage("counter type is not supported for PRIMARY KEY part a",
                                   InvalidRequestException.class, String.format("CREATE TABLE %s.%s (a counter, b int, PRIMARY KEY (b, a)) WITH CLUSTERING ORDER BY (a desc);", KEYSPACE, createTableName()));
     }
-
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java
index b590843..f89163d 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java

@@ -17,21 +17,24 @@
  */
 package org.apache.cassandra.cql3.validation.entities;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.CQLTester;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.dht.ByteOrderedPartitioner;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.SyntaxException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
 import org.apache.commons.lang3.StringUtils;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertEquals;
 
@@ -40,7 +43,8 @@
     @BeforeClass
     public static void setUpClass()
     {
-        DatabaseDescriptor.setPartitioner(new ByteOrderedPartitioner());
+        // Selecting partitioner for a table is not exposed on CREATE TABLE.
+        StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     @Test
@@ -505,20 +509,6 @@
         }
     }
 
-    private void assertInvalidAlterWithMessage(String createTableStatement, String errorMessage) throws Throwable
-    {
-        try
-        {
-            alterTableMayThrow(createTableStatement);
-            Assert.fail("Expected CREATE TABLE statement to error: " + createTableStatement);
-        }
-        catch (InvalidRequestException | ConfigurationException ex)
-        {
-            Assert.assertTrue("Expected error message to contain '" + errorMessage + "', but got '" + ex.getMessage() + "'",
-                    ex.getMessage().contains(errorMessage));
-        }
-    }
-
     @Test
     public void testInvalidOperations() throws Throwable
     {
@@ -551,34 +541,6 @@
                 "frozen<> is only allowed on collections, tuples, and user-defined types");
     }
 
-    @Test
-    public void testAltering() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a int, b frozen<list<int>>, c frozen<list<int>>, PRIMARY KEY (a, b))");
-
-        alterTable("ALTER TABLE %s ALTER c TYPE frozen<list<blob>>");
-
-        assertInvalidAlterWithMessage("ALTER TABLE %s ALTER b TYPE frozen<list<blob>>",
-                                      "types are not order-compatible");
-
-        assertInvalidAlterWithMessage("ALTER TABLE %s ALTER b TYPE list<int>",
-                                      "types are not order-compatible");
-
-        assertInvalidAlterWithMessage("ALTER TABLE %s ALTER c TYPE list<blob>",
-                                      "types are incompatible");
-
-        alterTable("ALTER TABLE %s DROP c");
-        alterTable("ALTER TABLE %s ADD c frozen<set<int>>");
-        assertInvalidAlterWithMessage("ALTER TABLE %s ALTER c TYPE frozen<set<blob>>",
-                                      "types are incompatible");
-
-        alterTable("ALTER TABLE %s DROP c");
-        alterTable("ALTER TABLE %s ADD c frozen<map<int, int>>");
-        assertInvalidAlterWithMessage("ALTER TABLE %s ALTER c TYPE frozen<map<blob, int>>",
-                                      "types are incompatible");
-        alterTable("ALTER TABLE %s ALTER c TYPE frozen<map<int, blob>>");
-    }
-
     private void assertInvalidIndexCreationWithMessage(String statement, String errorMessage) throws Throwable
     {
         try
@@ -600,8 +562,9 @@
 
         // for now, we don't support indexing values or keys of collections in the primary key
         assertInvalidIndexCreationWithMessage("CREATE INDEX ON %s (full(a))", "Cannot create secondary index on partition key column");
-        assertInvalidIndexCreationWithMessage("CREATE INDEX ON %s (keys(a))", "Cannot create index on keys of frozen<map> column");
-        assertInvalidIndexCreationWithMessage("CREATE INDEX ON %s (keys(b))", "Cannot create index on keys of frozen<map> column");
+        assertInvalidIndexCreationWithMessage("CREATE INDEX ON %s (keys(a))", "Cannot create secondary index on partition key column");
+        assertInvalidIndexCreationWithMessage("CREATE INDEX ON %s (keys(b))", "Cannot create keys() index on frozen column b. " +
+                                                                              "Frozen collections only support full() indexes");
 
         createTable("CREATE TABLE %s (a int, b frozen<list<int>>, c frozen<set<int>>, d frozen<map<int, text>>, PRIMARY KEY (a, b))");
 
@@ -631,12 +594,9 @@
         assertInvalidMessage("Cannot restrict clustering columns by a CONTAINS relation without a secondary index",
                              "SELECT * FROM %s WHERE b CONTAINS ? ALLOW FILTERING", 1);
 
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE d CONTAINS KEY ?", 1);
 
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE d CONTAINS KEY ? ALLOW FILTERING", 1);
-
         assertInvalidMessage("Cannot restrict clustering columns by a CONTAINS relation without a secondary index",
                              "SELECT * FROM %s WHERE b CONTAINS ? AND d CONTAINS KEY ? ALLOW FILTERING", 1, 1);
 
@@ -747,6 +707,11 @@
             row(0, list(1, 2, 3), set(1, 2, 3), map(1, "a"))
         );
 
+        assertRows(execute("SELECT * FROM %s WHERE d CONTAINS KEY ? ALLOW FILTERING", 1),
+            row(0, list(1, 2, 3), set(1, 2, 3), map(1, "a")),
+            row(0, list(4, 5, 6), set(1, 2, 3), map(1, "a"))
+        );
+
         execute("DELETE d FROM %s WHERE a=? AND b=?", 0, list(1, 2, 3));
         assertRows(execute("SELECT * FROM %s WHERE d=?", map(1, "a")),
             row(0, list(4, 5, 6), set(1, 2, 3), map(1, "a"))
@@ -1108,4 +1073,101 @@
         TupleType tuple = new TupleType(types);
         assertEquals("TupleType(SetType(Int32Type))", clean(tuple.toString()));
     }
+
+    @Test
+    public void testListWithElementsBiggerThan64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, l frozen<list<text>>)");
+
+        byte[] bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 1);
+        String largeText = new String(bytes);
+
+        execute("INSERT INTO %s(k, l) VALUES (0, ?)", list(largeText, "v2"));
+        flush();
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list(largeText, "v2")));
+
+        // Full overwrite
+        execute("UPDATE %s SET l = ? WHERE k = 0", list("v1", largeText));
+        flush();
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v1", largeText)));
+
+        execute("DELETE l FROM %s WHERE k = 0");
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row((Object) null));
+
+        execute("INSERT INTO %s(k, l) VALUES (0, ['" + largeText + "', 'v2'])");
+        flush();
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list(largeText, "v2")));
+    }
+
+    @Test
+    public void testMapsWithElementsBiggerThan64K() throws Throwable
+    {
+        byte[] bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 1);
+        String largeText = new String(bytes);
+
+        bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 2);
+        String largeText2 = new String(bytes);
+
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, m frozen<map<text, text>>)");
+
+        execute("INSERT INTO %s(k, m) VALUES (0, ?)", map(largeText, "v1", "k2", largeText));
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+            row(map(largeText, "v1", "k2", largeText)));
+
+        // Full overwrite
+        execute("UPDATE %s SET m = ? WHERE k = 0", map("k5", largeText, largeText2, "v6"));
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map("k5", largeText, largeText2, "v6")));
+
+        execute("DELETE m FROM %s WHERE k = 0");
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"), row((Object) null));
+
+        execute("INSERT INTO %s(k, m) VALUES (0, {'" + largeText + "' : 'v1', 'k2' : '" + largeText + "'})");
+        flush();
+
+        assertRows(execute("SELECT m FROM %s WHERE k = 0"),
+                   row(map(largeText, "v1", "k2", largeText)));
+    }
+
+    @Test
+    public void testSetsWithElementsBiggerThan64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, s frozen<set<text>>)");
+
+        byte[] bytes = new byte[FBUtilities.MAX_UNSIGNED_SHORT + 10];
+        Arrays.fill(bytes, (byte) 1);
+        String largeText = new String(bytes);
+
+        execute("INSERT INTO %s(k, s) VALUES (0, ?)", set(largeText, "v1", "v2"));
+        flush();
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set(largeText, "v1", "v2")));
+
+        // Full overwrite
+        execute("UPDATE %s SET s = ? WHERE k = 0", set(largeText, "v3"));
+        flush();
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set(largeText, "v3")));
+
+        execute("DELETE s FROM %s WHERE k = 0");
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row((Object) null));
+
+        execute("INSERT INTO %s(k, s) VALUES (0, {'" + largeText + "', 'v1', 'v2'})");
+        flush();
+
+        assertRows(execute("SELECT s FROM %s WHERE k = 0"), row(set(largeText, "v1", "v2")));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java
index 9c6c96a..2d16168 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java

@@ -24,6 +24,7 @@
 
 import org.apache.cassandra.serializers.SimpleDateSerializer;
 import org.apache.cassandra.serializers.TimeSerializer;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import org.junit.Assert;
@@ -38,12 +39,13 @@
 import java.util.concurrent.*;
 import static org.junit.Assert.fail;
 
-public class JsonTest extends CQLTester
+public class
+JsonTest extends CQLTester
 {
     @BeforeClass
     public static void setUp()
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
+        StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/RowUpdateBuilderTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/RowUpdateBuilderTest.java
new file mode 100644
index 0000000..afe2455
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/RowUpdateBuilderTest.java

@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3.validation.entities;
+
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.utils.FBUtilities;
+
+// see CASSANDRA-9743, CASSANDRA-9746
+public class RowUpdateBuilderTest extends CQLTester
+{
+    @Test
+    public void testAddListEntryDurable() throws Throwable
+    {
+        testAddListEntry(false);
+    }
+
+    @Test
+    public void testAddListEntryTransient() throws Throwable
+    {
+        testAddListEntry(true);
+    }
+
+    public void testAddListEntry(boolean skipCommitLog) throws Throwable
+    {
+        createTable("CREATE TABLE %s ("
+                    + "pk text,"
+                    + "ck text,"
+                    + "l1 list<int>,"
+                    + "l2 list<int>,"
+                    + "PRIMARY KEY ((pk), ck))");
+
+        long timestamp = FBUtilities.timestampMicros();
+
+        Mutation mutation = new Mutation(keyspace(), Util.dk("test"));
+        addToMutation("row1", timestamp, mutation);
+        addToMutation("row2", timestamp, mutation);
+
+        if (skipCommitLog)
+            mutation.applyUnsafe();
+        else
+            mutation.apply();
+
+        assertRowCount(execute("SELECT ck FROM %s"), 2);
+    }
+
+    private void addToMutation(String typeName, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(getCurrentColumnFamilyStore().metadata, timestamp, mutation)
+                                 .clustering(typeName);
+
+        for (int i = 0; i < 2; i++)
+        {
+            adder.addListEntry("l1", i)
+                 .addListEntry("l2", i);
+        }
+
+        adder.build();
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexOnMapEntriesTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexOnMapEntriesTest.java
index fb0d027..b69948f 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexOnMapEntriesTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexOnMapEntriesTest.java

@@ -38,7 +38,7 @@
     @BeforeClass
     public static void setUp()
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
+        DatabaseDescriptor.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
index e3616f6..c2640a0 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java

@@ -17,38 +17,49 @@
  */
 package org.apache.cassandra.cql3.validation.entities;
 
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
 import java.nio.ByteBuffer;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.UUID;
+import java.util.*;
+import java.util.concurrent.Callable;
 import java.util.concurrent.CountDownLatch;
 
-import org.apache.cassandra.cql3.CQLTester;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.index.IndexNotAvailableException;
-import org.apache.cassandra.db.index.PerRowSecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexSearcher;
-import org.apache.cassandra.db.index.composites.CompositesSearcher;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.concurrent.OpOrder.Group;
 import org.apache.commons.lang3.StringUtils;
 import org.junit.Test;
 
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.index.IndexNotAvailableException;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.StubIndex;
+import org.apache.cassandra.index.internal.CustomCassandraIndex;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.MD5Digest;
+import org.apache.cassandra.utils.Pair;
+
+import static org.apache.cassandra.Util.throwAssert;
 import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 public class SecondaryIndexTest extends CQLTester
 {
@@ -85,13 +96,27 @@
     private void testCreateAndDropIndex(String indexName, boolean addKeyspaceOnDrop) throws Throwable
     {
         execute("USE system");
-        assertInvalidMessage("Index '" + removeQuotes(indexName.toLowerCase(Locale.US)) + "' could not be found", "DROP INDEX " + indexName + ";");
+        assertInvalidMessage(String.format("Index '%s' could not be found",
+                                           removeQuotes(indexName.toLowerCase(Locale.US))),
+                             "DROP INDEX " + indexName + ";");
 
         createTable("CREATE TABLE %s (a int primary key, b int);");
         createIndex("CREATE INDEX " + indexName + " ON %s(b);");
         createIndex("CREATE INDEX IF NOT EXISTS " + indexName + " ON %s(b);");
 
-        assertInvalidMessage("Index already exists", "CREATE INDEX " + indexName + " ON %s(b)");
+        assertInvalidMessage(String.format("Index %s already exists",
+                                           removeQuotes(indexName.toLowerCase(Locale.US))),
+                             "CREATE INDEX " + indexName + " ON %s(b)");
+
+        // IF NOT EXISTS should apply in cases where the new index differs from an existing one in name only
+        String otherIndexName = "index_" + System.nanoTime();
+        assertEquals(1, getCurrentColumnFamilyStore().metadata.getIndexes().size());
+        createIndex("CREATE INDEX IF NOT EXISTS " + otherIndexName + " ON %s(b)");
+        assertEquals(1, getCurrentColumnFamilyStore().metadata.getIndexes().size());
+        assertInvalidMessage(String.format("Index %s is a duplicate of existing index %s",
+                                           removeQuotes(otherIndexName.toLowerCase(Locale.US)),
+                                           removeQuotes(indexName.toLowerCase(Locale.US))),
+                             "CREATE INDEX " + otherIndexName + " ON %s(b)");
 
         execute("INSERT INTO %s (a, b) values (?, ?);", 0, 0);
         execute("INSERT INTO %s (a, b) values (?, ?);", 1, 1);
@@ -99,7 +124,8 @@
         execute("INSERT INTO %s (a, b) values (?, ?);", 3, 1);
 
         assertRows(execute("SELECT * FROM %s where b = ?", 1), row(1, 1), row(3, 1));
-        assertInvalidMessage("Index '" + removeQuotes(indexName.toLowerCase(Locale.US)) + "' could not be found in any of the tables of keyspace 'system'",
+        assertInvalidMessage(String.format("Index '%s' could not be found in any of the tables of keyspace 'system'",
+                                           removeQuotes(indexName.toLowerCase(Locale.US))),
                              "DROP INDEX " + indexName);
 
         if (addKeyspaceOnDrop)
@@ -112,10 +138,12 @@
             execute("DROP INDEX " + indexName);
         }
 
-        assertInvalidMessage("Predicates on non-primary-key columns (b) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s where b = ?", 1);
         dropIndex("DROP INDEX IF EXISTS " + indexName);
-        assertInvalidMessage("Index '" + removeQuotes(indexName.toLowerCase(Locale.US)) + "' could not be found", "DROP INDEX " + indexName);
+        assertInvalidMessage(String.format("Index '%s' could not be found",
+                                           removeQuotes(indexName.toLowerCase(Locale.US))),
+                             "DROP INDEX " + indexName);
     }
 
     /**
@@ -204,12 +232,10 @@
     public void testUnknownCompressionOptions() throws Throwable
     {
         String tableName = createTableName();
-        assertInvalidThrow(SyntaxException.class, String.format(
-                                                               "CREATE TABLE %s (key varchar PRIMARY KEY, password varchar, gender varchar) WITH compression_parameters:sstable_compressor = 'DeflateCompressor'", tableName));
+        assertInvalidThrow(SyntaxException.class, String.format("CREATE TABLE %s (key varchar PRIMARY KEY, password varchar, gender varchar) WITH compression_parameters:sstable_compressor = 'DeflateCompressor'", tableName));
 
-
-        assertInvalidThrow(ConfigurationException.class, String.format(
-                                                                      "CREATE TABLE %s (key varchar PRIMARY KEY, password varchar, gender varchar) WITH compression = { 'sstable_compressor': 'DeflateCompressor' }", tableName));
+        assertInvalidThrow(ConfigurationException.class, String.format("CREATE TABLE %s (key varchar PRIMARY KEY, password varchar, gender varchar) WITH compression = { 'sstable_compressor': 'DeflateCompressor' }",
+                                                                       tableName));
     }
 
     /**
@@ -394,6 +420,45 @@
         assertEmpty(execute("SELECT k, v FROM %s  WHERE m CONTAINS 4"));
     }
 
+    @Test
+    public void testSelectOnMultiIndexOnCollectionsWithNull() throws Throwable
+    {
+        createTable(" CREATE TABLE %s ( k int, v int, x text, l list<int>, s set<text>, m map<text, int>, PRIMARY KEY (k, v))");
+
+        createIndex("CREATE INDEX ON %s (x)");
+        createIndex("CREATE INDEX ON %s (v)");
+        createIndex("CREATE INDEX ON %s (s)");
+        createIndex("CREATE INDEX ON %s (m)");
+
+
+        execute("INSERT INTO %s (k, v, x, l, s, m) VALUES (0, 0, 'x', [1, 2],    {'a'},      {'a' : 1})");
+        execute("INSERT INTO %s (k, v, x, l, s, m) VALUES (0, 1, 'x', [3, 4],    {'b', 'c'}, {'a' : 1, 'b' : 2})");
+        execute("INSERT INTO %s (k, v, x, l, s, m) VALUES (0, 2, 'x', [1],       {'a', 'c'}, {'c' : 3})");
+        execute("INSERT INTO %s (k, v, x, l, s, m) VALUES (1, 0, 'x', [1, 2, 4], {},         {'b' : 1})");
+        execute("INSERT INTO %s (k, v, x, l, s, m) VALUES (1, 1, 'x', [4, 5],    {'d'},      {'a' : 1, 'b' : 3})");
+        execute("INSERT INTO %s (k, v, x, l, s, m) VALUES (1, 2, 'x', null,      null,       null)");
+
+        beforeAndAfterFlush(() -> {
+            // lists
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND l CONTAINS 1 ALLOW FILTERING"), row(1, 0), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND k = 0 AND l CONTAINS 1 ALLOW FILTERING"), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND l CONTAINS 2 ALLOW FILTERING"), row(1, 0), row(0, 0));
+            assertEmpty(execute("SELECT k, v FROM %s WHERE x = 'x' AND l CONTAINS 6 ALLOW FILTERING"));
+
+            // sets
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND s CONTAINS 'a' ALLOW FILTERING" ), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND k = 0 AND s CONTAINS 'a' ALLOW FILTERING"), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND s CONTAINS 'd' ALLOW FILTERING"), row(1, 1));
+            assertEmpty(execute("SELECT k, v FROM %s  WHERE x = 'x' AND s CONTAINS 'e' ALLOW FILTERING"));
+
+            // maps
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND m CONTAINS 1 ALLOW FILTERING"), row(1, 0), row(1, 1), row(0, 0), row(0, 1));
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND k = 0 AND m CONTAINS 1 ALLOW FILTERING"), row(0, 0), row(0, 1));
+            assertRows(execute("SELECT k, v FROM %s WHERE x = 'x' AND m CONTAINS 2 ALLOW FILTERING"), row(0, 1));
+            assertEmpty(execute("SELECT k, v FROM %s  WHERE x = 'x' AND m CONTAINS 4 ALLOW FILTERING"));
+        });
+    }
+
     /**
      * Migrated from cql_tests.py:TestCQL.map_keys_indexing()
      */
@@ -415,9 +480,6 @@
         assertRows(execute("SELECT k, v FROM %s WHERE k = 0 AND m CONTAINS KEY 'a'"), row(0, 0), row(0, 1));
         assertRows(execute("SELECT k, v FROM %s WHERE m CONTAINS KEY 'c'"), row(0, 2));
         assertEmpty(execute("SELECT k, v FROM %s  WHERE m CONTAINS KEY 'd'"));
-
-        // we're not allowed to create a value index if we already have a key one
-        assertInvalid("CREATE INDEX ON %s(m)");
     }
 
     /**
@@ -427,7 +489,7 @@
     @Test
     public void testIndexOnKeyWithReverseClustering() throws Throwable
     {
-        createTable(" CREATE TABLE %s (k1 int, k2 int, v int, PRIMARY KEY ((k1, k2), v) ) WITH CLUSTERING ORDER BY (v DESC)");
+        createTable("CREATE TABLE %s (k1 int, k2 int, v int, PRIMARY KEY ((k1, k2), v) ) WITH CLUSTERING ORDER BY (v DESC)");
 
         createIndex("CREATE INDEX ON %s (k2)");
 
@@ -465,6 +527,67 @@
         assertRows(execute("select count(*) from %s where app_name='foo' and account='bar' and last_access > 4 allow filtering"), row(1L));
     }
 
+    @Test
+    public void testSyntaxVariationsForIndexOnCollectionsValue() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, m map<int, int>, l list<int>, s set<int>, PRIMARY KEY (k))");
+        createAndDropCollectionValuesIndex("m");
+        createAndDropCollectionValuesIndex("l");
+        createAndDropCollectionValuesIndex("s");
+    }
+
+    private void createAndDropCollectionValuesIndex(String columnName) throws Throwable
+    {
+        String indexName = columnName + "_idx";
+        SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
+        createIndex(String.format("CREATE INDEX %s on %%s(%s)", indexName, columnName));
+        IndexMetadata indexDef = indexManager.getIndexByName(indexName).getIndexMetadata();
+        assertEquals(String.format("values(%s)", columnName), indexDef.options.get(IndexTarget.TARGET_OPTION_NAME));
+        dropIndex(String.format("DROP INDEX %s.%s", KEYSPACE, indexName));
+        assertFalse(indexManager.hasIndexes());
+        createIndex(String.format("CREATE INDEX %s on %%s(values(%s))", indexName, columnName));
+        assertEquals(indexDef, indexManager.getIndexByName(indexName).getIndexMetadata());
+        dropIndex(String.format("DROP INDEX %s.%s", KEYSPACE, indexName));
+    }
+
+    @Test
+    public void testCreateIndexWithQuotedColumnNames() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    " k int," +
+                    " v int, " +
+                    " lower_case_map map<int, int>," +
+                    " \"MixedCaseMap\" map<int, int>," +
+                    " lower_case_frozen_list frozen<list<int>>," +
+                    " \"UPPER_CASE_FROZEN_LIST\" frozen<list<int>>," +
+                    " \"set name with spaces\" set<int>," +
+                    " \"column_name_with\"\"escaped quote\" int," +
+                    " PRIMARY KEY (k))");
+
+        createAndDropIndexWithQuotedColumnIdentifier("\"v\"");
+        createAndDropIndexWithQuotedColumnIdentifier("keys(\"lower_case_map\")");
+        createAndDropIndexWithQuotedColumnIdentifier("keys(\"MixedCaseMap\")");
+        createAndDropIndexWithQuotedColumnIdentifier("full(\"lower_case_frozen_list\")");
+        createAndDropIndexWithQuotedColumnIdentifier("full(\"UPPER_CASE_FROZEN_LIST\")");
+        createAndDropIndexWithQuotedColumnIdentifier("values(\"set name with spaces\")");
+        createAndDropIndexWithQuotedColumnIdentifier("\"column_name_with\"\"escaped quote\"");
+    }
+
+    private void createAndDropIndexWithQuotedColumnIdentifier(String target) throws Throwable
+    {
+        String indexName = "test_mixed_case_idx";
+        createIndex(String.format("CREATE INDEX %s ON %%s(%s)", indexName, target));
+        SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
+        IndexMetadata indexDef = indexManager.getIndexByName(indexName).getIndexMetadata();
+        dropIndex(String.format("DROP INDEX %s.%s", KEYSPACE, indexName));
+        // verify we can re-create the index using the target string
+        createIndex(String.format("CREATE INDEX %s ON %%s(%s)",
+                                  indexName, indexDef.options.get(IndexTarget.TARGET_OPTION_NAME)));
+        assertEquals(indexDef, indexManager.getIndexByName(indexName).getIndexMetadata());
+        dropIndex(String.format("DROP INDEX %s.%s", KEYSPACE, indexName));
+    }
+
+
     /**
      * Test for CASSANDRA-5732, Can not query secondary index
      * migrated from cql_tests.py:TestCQL.bug_5732_test(),
@@ -476,7 +599,7 @@
     {
         String tableName = createTable("CREATE TABLE %s (k int PRIMARY KEY, v int,)");
 
-        execute("ALTER TABLE %s WITH CACHING='ALL'");
+        execute("ALTER TABLE %s WITH CACHING = { 'keys': 'ALL', 'rows_per_partition': 'ALL' }");
         execute("INSERT INTO %s (k,v) VALUES (0,0)");
         execute("INSERT INTO %s (k,v) VALUES (1,1)");
 
@@ -490,29 +613,23 @@
 
     // CASSANDRA-8280/8081
     // reject updates with indexed values where value > 64k
+    // make sure we check conditional and unconditional statements,
+    // both singly and in batches (CASSANDRA-10536)
     @Test
     public void testIndexOnCompositeValueOver64k() throws Throwable
     {
         createTable("CREATE TABLE %s(a int, b int, c blob, PRIMARY KEY (a))");
         createIndex("CREATE INDEX ON %s(c)");
         failInsert("INSERT INTO %s (a, b, c) VALUES (0, 0, ?)", ByteBuffer.allocate(TOO_BIG));
-    }
-
-    @Test
-    public void testIndexOnClusteringColumnInsertPartitionKeyAndClusteringsOver64k() throws Throwable
-    {
-        createTable("CREATE TABLE %s(a blob, b blob, c blob, d int, PRIMARY KEY (a, b, c))");
-        createIndex("CREATE INDEX ON %s(b)");
-
-        // CompositeIndexOnClusteringKey creates index entries composed of the
-        // PK plus all of the non-indexed clustering columns from the primary row
-        // so we should reject where len(a) + len(c) > 65560 as this will form the
-        // total clustering in the index table
-        ByteBuffer a = ByteBuffer.allocate(100);
-        ByteBuffer b = ByteBuffer.allocate(10);
-        ByteBuffer c = ByteBuffer.allocate(FBUtilities.MAX_UNSIGNED_SHORT - 99);
-
-        failInsert("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, 0)", a, b, c);
+        failInsert("INSERT INTO %s (a, b, c) VALUES (0, 0, ?) IF NOT EXISTS", ByteBuffer.allocate(TOO_BIG));
+        failInsert("BEGIN BATCH\n" +
+                   "INSERT INTO %s (a, b, c) VALUES (0, 0, ?);\n" +
+                   "APPLY BATCH",
+                   ByteBuffer.allocate(TOO_BIG));
+        failInsert("BEGIN BATCH\n" +
+                   "INSERT INTO %s (a, b, c) VALUES (0, 0, ?) IF NOT EXISTS;\n" +
+                   "APPLY BATCH",
+                   ByteBuffer.allocate(TOO_BIG));
     }
 
     @Test
@@ -521,40 +638,15 @@
         createTable("CREATE TABLE %s(a int, b blob, PRIMARY KEY (a)) WITH COMPACT STORAGE");
         createIndex("CREATE INDEX ON %s(b)");
         failInsert("INSERT INTO %s (a, b) VALUES (0, ?)", ByteBuffer.allocate(TOO_BIG));
-    }
-
-    @Test
-    public void testIndexOnCollectionValueInsertPartitionKeyAndCollectionKeyOver64k() throws Throwable
-    {
-        createTable("CREATE TABLE %s(a blob , b map<blob, int>, PRIMARY KEY (a))");
-        createIndex("CREATE INDEX ON %s(b)");
-
-        // A collection key > 64k by itself will be rejected from
-        // the primary table.
-        // To test index validation we need to ensure that
-        // len(b) < 64k, but len(a) + len(b) > 64k as that will
-        // form the clustering in the index table
-        ByteBuffer a = ByteBuffer.allocate(100);
-        ByteBuffer b = ByteBuffer.allocate(FBUtilities.MAX_UNSIGNED_SHORT - 100);
-
-        failInsert("UPDATE %s SET b[?] = 0 WHERE a = ?", b, a);
-    }
-
-    @Test
-    public void testIndexOnCollectionKeyInsertPartitionKeyAndClusteringOver64k() throws Throwable
-    {
-        createTable("CREATE TABLE %s(a blob, b blob, c map<blob, int>, PRIMARY KEY (a, b))");
-        createIndex("CREATE INDEX ON %s(KEYS(c))");
-
-        // Basically the same as the case with non-collection clustering
-        // CompositeIndexOnCollectionKeyy creates index entries composed of the
-        // PK plus all of the clustering columns from the primary row, except the
-        // collection element - which becomes the partition key in the index table
-        ByteBuffer a = ByteBuffer.allocate(100);
-        ByteBuffer b = ByteBuffer.allocate(FBUtilities.MAX_UNSIGNED_SHORT - 100);
-        ByteBuffer c = ByteBuffer.allocate(10);
-
-        failInsert("UPDATE %s SET c[?] = 0 WHERE a = ? and b = ?", c, a, b);
+        failInsert("INSERT INTO %s (a, b) VALUES (0, ?) IF NOT EXISTS", ByteBuffer.allocate(TOO_BIG));
+        failInsert("BEGIN BATCH\n" +
+                   "INSERT INTO %s (a, b) VALUES (0, ?);\n" +
+                   "APPLY BATCH",
+                   ByteBuffer.allocate(TOO_BIG));
+        failInsert("BEGIN BATCH\n" +
+                   "INSERT INTO %s (a, b) VALUES (0, ?) IF NOT EXISTS;\n" +
+                   "APPLY BATCH",
+                   ByteBuffer.allocate(TOO_BIG));
     }
 
     @Test
@@ -562,30 +654,29 @@
     {
         createTable("CREATE TABLE %s(a int, b int, c blob, PRIMARY KEY ((a, b)))");
         createIndex("CREATE INDEX ON %s(a)");
+        succeedInsert("INSERT INTO %s (a, b, c) VALUES (0, 0, ?) IF NOT EXISTS", ByteBuffer.allocate(TOO_BIG));
         succeedInsert("INSERT INTO %s (a, b, c) VALUES (0, 0, ?)", ByteBuffer.allocate(TOO_BIG));
-    }
+        succeedInsert("BEGIN BATCH\n" +
+                      "INSERT INTO %s (a, b, c) VALUES (0, 0, ?);\n" +
+                      "APPLY BATCH", ByteBuffer.allocate(TOO_BIG));
 
-    @Test
-    public void testIndexOnPartitionKeyWithStaticColumnAndNoRows() throws Throwable
-    {
-        createTable("CREATE TABLE %s (pk1 int, pk2 int, c int, s int static, v int, PRIMARY KEY((pk1, pk2), c))");
-        createIndex("CREATE INDEX ON %s (pk2)");
-        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 1, 9, 1);
-        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 2, 9, 2);
-        execute("INSERT INTO %s (pk1, pk2, s) VALUES (?, ?, ?)", 2, 1, 9);
-        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 3, 1, 1, 9, 1);
-
-        assertRows(execute("SELECT * FROM %s WHERE pk2 = ?", 1),
-                   row(2, 1, null, 9, null),
-                   row(1, 1, 1, 9, 1),
-                   row(1, 1, 2, 9, 2),
-                   row(3, 1, 1, 9, 1));
-
-        execute("UPDATE %s SET s=?, v=? WHERE pk1=? AND pk2=? AND c=?", 9, 1, 1, 10, 2);
-        assertRows(execute("SELECT * FROM %s WHERE pk2 = ?", 10), row(1, 10, 2, 9, 1));
-
-        execute("UPDATE %s SET s=? WHERE pk1=? AND pk2=?", 9, 1, 20);
-        assertRows(execute("SELECT * FROM %s WHERE pk2 = ?", 20), row(1, 20, null, 9, null));
+        // the indexed value passes validation, but the batch size will
+        // exceed the default failure threshold, so temporarily raise it
+        // (the non-conditional batch doesn't hit this because
+        // BatchStatement::executeInternal skips the size check but CAS
+        // path does not)
+        long batchSizeThreshold = DatabaseDescriptor.getBatchSizeFailThreshold();
+        try
+        {
+            DatabaseDescriptor.setBatchSizeFailThresholdInKB( (TOO_BIG / 1024) * 2);
+            succeedInsert("BEGIN BATCH\n" +
+                          "INSERT INTO %s (a, b, c) VALUES (1, 1, ?) IF NOT EXISTS;\n" +
+                          "APPLY BATCH", ByteBuffer.allocate(TOO_BIG));
+        }
+        finally
+        {
+            DatabaseDescriptor.setBatchSizeFailThresholdInKB((int) (batchSizeThreshold / 1024));
+        }
     }
 
     @Test
@@ -593,7 +684,29 @@
     {
         createTable("CREATE TABLE %s(a int, b int, c blob, PRIMARY KEY (a, b))");
         createIndex("CREATE INDEX ON %s(b)");
+        succeedInsert("INSERT INTO %s (a, b, c) VALUES (0, 0, ?) IF NOT EXISTS", ByteBuffer.allocate(TOO_BIG));
         succeedInsert("INSERT INTO %s (a, b, c) VALUES (0, 0, ?)", ByteBuffer.allocate(TOO_BIG));
+        succeedInsert("BEGIN BATCH\n" +
+                      "INSERT INTO %s (a, b, c) VALUES (0, 0, ?);\n" +
+                      "APPLY BATCH", ByteBuffer.allocate(TOO_BIG));
+
+        // the indexed value passes validation, but the batch size will
+        // exceed the default failure threshold, so temporarily raise it
+        // (the non-conditional batch doesn't hit this because
+        // BatchStatement::executeInternal skips the size check but CAS
+        // path does not)
+        long batchSizeThreshold = DatabaseDescriptor.getBatchSizeFailThreshold();
+        try
+        {
+            DatabaseDescriptor.setBatchSizeFailThresholdInKB( (TOO_BIG / 1024) * 2);
+            succeedInsert("BEGIN BATCH\n" +
+                          "INSERT INTO %s (a, b, c) VALUES (1, 1, ?) IF NOT EXISTS;\n" +
+                          "APPLY BATCH", ByteBuffer.allocate(TOO_BIG));
+        }
+        finally
+        {
+            DatabaseDescriptor.setBatchSizeFailThresholdInKB((int)(batchSizeThreshold / 1024));
+        }
     }
 
     @Test
@@ -604,6 +717,13 @@
         Map<Integer, ByteBuffer> map = new HashMap<>();
         map.put(0, ByteBuffer.allocate(1024 * 65));
         failInsert("INSERT INTO %s (a, b) VALUES (0, ?)", map);
+        failInsert("INSERT INTO %s (a, b) VALUES (0, ?) IF NOT EXISTS", map);
+        failInsert("BEGIN BATCH\n" +
+                   "INSERT INTO %s (a, b) VALUES (0, ?);\n" +
+                   "APPLY BATCH", map);
+        failInsert("BEGIN BATCH\n" +
+                   "INSERT INTO %s (a, b) VALUES (0, ?) IF NOT EXISTS;\n" +
+                   "APPLY BATCH", map);
     }
 
     public void failInsert(String insertCQL, Object...args) throws Throwable
@@ -681,6 +801,170 @@
     }
 
     @Test
+    public void testMultipleIndexesOnOneColumn() throws Throwable
+    {
+        String indexClassName = StubIndex.class.getName();
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY ((a), b))");
+        // uses different options otherwise the two indexes are considered duplicates
+        createIndex(String.format("CREATE CUSTOM INDEX c_idx_1 ON %%s(c) USING '%s' WITH OPTIONS = {'foo':'a'}", indexClassName));
+        createIndex(String.format("CREATE CUSTOM INDEX c_idx_2 ON %%s(c) USING '%s' WITH OPTIONS = {'foo':'b'}", indexClassName));
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        CFMetaData cfm = cfs.metadata;
+        StubIndex index1 = (StubIndex)cfs.indexManager.getIndex(cfm.getIndexes()
+                                                                   .get("c_idx_1")
+                                                                   .orElseThrow(throwAssert("index not found")));
+        StubIndex index2 = (StubIndex)cfs.indexManager.getIndex(cfm.getIndexes()
+                                                                   .get("c_idx_2")
+                                                                   .orElseThrow(throwAssert("index not found")));
+        Object[] row1a = row(0, 0, 0);
+        Object[] row1b = row(0, 0, 1);
+        Object[] row2 = row(2, 2, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", row1a);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", row1b);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", row2);
+
+        assertEquals(2, index1.rowsInserted.size());
+        assertColumnValue(0, "c", index1.rowsInserted.get(0), cfm);
+        assertColumnValue(2, "c", index1.rowsInserted.get(1), cfm);
+
+        assertEquals(2, index2.rowsInserted.size());
+        assertColumnValue(0, "c", index2.rowsInserted.get(0), cfm);
+        assertColumnValue(2, "c", index2.rowsInserted.get(1), cfm);
+
+        assertEquals(1, index1.rowsUpdated.size());
+        assertColumnValue(0, "c", index1.rowsUpdated.get(0).left, cfm);
+        assertColumnValue(1, "c", index1.rowsUpdated.get(0).right, cfm);
+
+        assertEquals(1, index2.rowsUpdated.size());
+        assertColumnValue(0, "c", index2.rowsUpdated.get(0).left, cfm);
+        assertColumnValue(1, "c", index2.rowsUpdated.get(0).right, cfm);
+    }
+
+    @Test
+    public void testDeletions() throws Throwable
+    {
+        // Test for bugs like CASSANDRA-10694.  These may not be readily visible with the built-in secondary index
+        // implementation because of the stale entry handling.
+
+        String indexClassName = StubIndex.class.getName();
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY ((a), b))");
+        createIndex(String.format("CREATE CUSTOM INDEX c_idx ON %%s(c) USING '%s'", indexClassName));
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        CFMetaData cfm = cfs.metadata;
+        StubIndex index1 = (StubIndex) cfs.indexManager.getIndex(cfm.getIndexes()
+                .get("c_idx")
+                .orElseThrow(throwAssert("index not found")));
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP 1", 0, 0, 0);
+        assertEquals(1, index1.rowsInserted.size());
+
+        execute("DELETE FROM %s USING TIMESTAMP 2 WHERE a = ? AND b = ?", 0, 0);
+        assertEquals(1, index1.rowsUpdated.size());
+        Pair<Row, Row> update = index1.rowsUpdated.get(0);
+        Row existingRow = update.left;
+        Row newRow = update.right;
+
+        // check the existing row from the update call
+        assertTrue(existingRow.deletion().isLive());
+        assertEquals(DeletionTime.LIVE, existingRow.deletion().time());
+        assertEquals(1L, existingRow.primaryKeyLivenessInfo().timestamp());
+
+        // check the new row from the update call
+        assertFalse(newRow.deletion().isLive());
+        assertEquals(2L, newRow.deletion().time().markedForDeleteAt());
+        assertFalse(newRow.cells().iterator().hasNext());
+
+        // delete the same row again
+        execute("DELETE FROM %s USING TIMESTAMP 3 WHERE a = ? AND b = ?", 0, 0);
+        assertEquals(2, index1.rowsUpdated.size());
+        update = index1.rowsUpdated.get(1);
+        existingRow = update.left;
+        newRow = update.right;
+
+        // check the new row from the update call
+        assertFalse(existingRow.deletion().isLive());
+        assertEquals(2L, existingRow.deletion().time().markedForDeleteAt());
+        assertFalse(existingRow.cells().iterator().hasNext());
+
+        // check the new row from the update call
+        assertFalse(newRow.deletion().isLive());
+        assertEquals(3L, newRow.deletion().time().markedForDeleteAt());
+        assertFalse(newRow.cells().iterator().hasNext());
+    }
+
+    @Test
+    public void testUpdatesToMemtableData() throws Throwable
+    {
+        // verify the contract specified by Index.Indexer::updateRow(oldRowData, newRowData),
+        // when a row in the memtable is updated, the indexer should be informed of:
+        // * new columns
+        // * removed columns
+        // * columns whose value, timestamp or ttl have been modified.
+        // Any columns which are unchanged by the update are not passed to the Indexer
+        // Note that for simplicity this test resets the index between each scenario
+        createTable("CREATE TABLE %s (k int, c int, v1 int, v2 int, PRIMARY KEY (k,c))");
+        createIndex(String.format("CREATE CUSTOM INDEX test_index ON %%s() USING '%s'", StubIndex.class.getName()));
+        execute("INSERT INTO %s (k, c, v1, v2) VALUES (0, 0, 0, 0) USING TIMESTAMP 0");
+
+        ColumnDefinition v1 = getCurrentColumnFamilyStore().metadata.getColumnDefinition(new ColumnIdentifier("v1", true));
+        ColumnDefinition v2 = getCurrentColumnFamilyStore().metadata.getColumnDefinition(new ColumnIdentifier("v2", true));
+
+        StubIndex index = (StubIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName("test_index");
+        assertEquals(1, index.rowsInserted.size());
+
+        // Overwrite a single value, leaving the other untouched
+        execute("UPDATE %s USING TIMESTAMP 1 SET v1=1 WHERE k=0 AND c=0");
+        assertEquals(1, index.rowsUpdated.size());
+        Row oldRow = index.rowsUpdated.get(0).left;
+        assertEquals(1, oldRow.columnCount());
+        validateCell(oldRow.getCell(v1), v1, ByteBufferUtil.bytes(0), 0);
+        Row newRow = index.rowsUpdated.get(0).right;
+        assertEquals(1, newRow.columnCount());
+        validateCell(newRow.getCell(v1), v1, ByteBufferUtil.bytes(1), 1);
+        index.reset();
+
+        // Overwrite both values
+        execute("UPDATE %s USING TIMESTAMP 2 SET v1=2, v2=2 WHERE k=0 AND c=0");
+        assertEquals(1, index.rowsUpdated.size());
+        oldRow = index.rowsUpdated.get(0).left;
+        assertEquals(2, oldRow.columnCount());
+        validateCell(oldRow.getCell(v1), v1, ByteBufferUtil.bytes(1), 1);
+        validateCell(oldRow.getCell(v2), v2, ByteBufferUtil.bytes(0), 0);
+        newRow = index.rowsUpdated.get(0).right;
+        assertEquals(2, newRow.columnCount());
+        validateCell(newRow.getCell(v1), v1, ByteBufferUtil.bytes(2), 2);
+        validateCell(newRow.getCell(v2), v2, ByteBufferUtil.bytes(2), 2);
+        index.reset();
+
+        // Delete one value
+        execute("DELETE v1 FROM %s USING TIMESTAMP 3 WHERE k=0 AND c=0");
+        assertEquals(1, index.rowsUpdated.size());
+        oldRow = index.rowsUpdated.get(0).left;
+        assertEquals(1, oldRow.columnCount());
+        validateCell(oldRow.getCell(v1), v1, ByteBufferUtil.bytes(2), 2);
+        newRow = index.rowsUpdated.get(0).right;
+        assertEquals(1, newRow.columnCount());
+        Cell newCell = newRow.getCell(v1);
+        assertTrue(newCell.isTombstone());
+        assertEquals(3, newCell.timestamp());
+        index.reset();
+
+        // Modify the liveness of the primary key, the delta rows should contain
+        // no cell data as only the pk was altered, but it should illustrate the
+        // change to the liveness info
+        execute("INSERT INTO %s(k, c) VALUES (0, 0) USING TIMESTAMP 4");
+        assertEquals(1, index.rowsUpdated.size());
+        oldRow = index.rowsUpdated.get(0).left;
+        assertEquals(0, oldRow.columnCount());
+        assertEquals(0, oldRow.primaryKeyLivenessInfo().timestamp());
+        newRow = index.rowsUpdated.get(0).right;
+        assertEquals(0, newRow.columnCount());
+        assertEquals(4, newRow.primaryKeyLivenessInfo().timestamp());
+    }
+
+    @Test
     public void testIndexQueriesWithIndexNotReady() throws Throwable
     {
         createTable("CREATE TABLE %s (pk int, ck int, value int, PRIMARY KEY (pk, ck))");
@@ -689,8 +973,7 @@
             for (int j = 0; j < 10; j++)
                 execute("INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)", i, j, i + j);
 
-        createIndex("CREATE CUSTOM INDEX testIndex ON %s (value) USING '" + IndexBlockingOnInitialization.class.getName()
-                + "'");
+        createIndex("CREATE CUSTOM INDEX testIndex ON %s (value) USING '" + IndexBlockingOnInitialization.class.getName() + "'");
         try
         {
             execute("SELECT value FROM %s WHERE value = 2");
@@ -707,6 +990,47 @@
     }
 
     @Test
+    public void droppingIndexInvalidatesPreparedStatements() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY ((a), b))");
+        createIndex("CREATE INDEX c_idx ON %s(c)");
+        MD5Digest cqlId = prepareStatement("SELECT * FROM %s.%s WHERE c=?", false).statementId;
+        Integer thriftId = prepareStatement("SELECT * FROM %s.%s WHERE c=?", true).toThriftPreparedResult().getItemId();
+
+        assertNotNull(QueryProcessor.instance.getPrepared(cqlId));
+        assertNotNull(QueryProcessor.instance.getPreparedForThrift(thriftId));
+
+        dropIndex("DROP INDEX %s.c_idx");
+
+        assertNull(QueryProcessor.instance.getPrepared(cqlId));
+        assertNull(QueryProcessor.instance.getPreparedForThrift(thriftId));
+    }
+
+    // See CASSANDRA-11021
+    @Test
+    public void testIndexesOnNonStaticColumnsWhereSchemaIncludesStaticColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int static, d int, PRIMARY KEY (a, b))");
+        createIndex("CREATE INDEX b_idx on %s(b)");
+        createIndex("CREATE INDEX d_idx on %s(d)");
+
+        execute("INSERT INTO %s (a, b, c ,d) VALUES (0, 0, 0, 0)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (1, 1, 1, 1)");
+        assertRows(execute("SELECT * FROM %s WHERE b = 0"), row(0, 0, 0, 0));
+        assertRows(execute("SELECT * FROM %s WHERE d = 1"), row(1, 1, 1, 1));
+
+        execute("UPDATE %s SET c = 2 WHERE a = 0");
+        execute("UPDATE %s SET c = 3, d = 4 WHERE a = 1 AND b = 1");
+        assertRows(execute("SELECT * FROM %s WHERE b = 0"), row(0, 0, 2, 0));
+        assertRows(execute("SELECT * FROM %s WHERE d = 4"), row(1, 1, 3, 4));
+
+        execute("DELETE FROM %s WHERE a = 0");
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1");
+        assertEmpty(execute("SELECT * FROM %s WHERE b = 0"));
+        assertEmpty(execute("SELECT * FROM %s WHERE d = 3"));
+    }
+
+    @Test
     public void testWithEmptyRestrictionValueAndSecondaryIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (pk blob, c blob, v blob, PRIMARY KEY ((pk), c))");
@@ -716,11 +1040,7 @@
         execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"));
         execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("2"), bytes("1"));
 
-        for (boolean flush : new boolean[]{false, true})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             // Test clustering columns restrictions
             assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c = textAsBlob('');"));
 
@@ -755,15 +1075,12 @@
             assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c > textAsBlob('') AND c < textAsBlob('') AND v = textAsBlob('1') ALLOW FILTERING;"));
 
             assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND (c) > (textAsBlob('')) AND (c) < (textAsBlob('')) AND v = textAsBlob('1') ALLOW FILTERING;"));
-        }
+        });
 
         execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                 bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"));
 
-        for (boolean flush : new boolean[]{false, true})
-        {
-            if (flush)
-                flush();
+        beforeAndAfterFlush(() -> {
 
             assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c = textAsBlob('');"),
                        row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1")));
@@ -809,26 +1126,23 @@
 
             // Test restrictions on non-primary key value
             assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND v = textAsBlob('');"));
-        }
+        });
 
         execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                 bytes("foo123"), bytes("3"), EMPTY_BYTE_BUFFER);
 
-        for (boolean flush : new boolean[]{false, true})
-        {
-            if (flush)
-                flush();
+        beforeAndAfterFlush(() -> {
 
             assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND v = textAsBlob('');"),
                        row(bytes("foo123"), bytes("3"), EMPTY_BYTE_BUFFER));
-        }
+        });
     }
 
     @Test
     public void testEmptyRestrictionValueWithSecondaryIndexAndCompactTables() throws Throwable
     {
         createTable("CREATE TABLE %s (pk blob, c blob, v blob, PRIMARY KEY ((pk), c)) WITH COMPACT STORAGE");
-        assertInvalidMessage("Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables",
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                             "CREATE INDEX on %s(c)");
 
         createTable("CREATE TABLE %s (pk blob PRIMARY KEY, v blob) WITH COMPACT STORAGE");
@@ -846,18 +1160,76 @@
     }
 
     @Test
-    public void testIndexOnRegularColumnWithPartitionWithoutRows() throws Throwable
+    public void testIndexOnPartitionKeyWithStaticColumnAndNoRows() throws Throwable
     {
-        createTable("CREATE TABLE %s (pk int, c int, s int static, v int, PRIMARY KEY(pk, c))");
-        createIndex("CREATE INDEX ON %s (v)");
-        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 1, 1, 9, 1);
-        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 1, 2, 9, 2);
-        execute("INSERT INTO %s (pk, s) VALUES (?, ?)", 2, 9);
-        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 3, 1, 9, 1);
-        flush();
-        execute("DELETE FROM %s WHERE pk = ? and c = ?", 3, 1);
-        assertRows(execute("SELECT * FROM %s WHERE v = ?", 1),
-                   row(1, 1, 9, 1));
+        createTable("CREATE TABLE %s (pk1 int, pk2 int, c int, s int static, v int, PRIMARY KEY((pk1, pk2), c))");
+        createIndex("CREATE INDEX ON %s (pk2)");
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 1, 9, 1);
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 2, 9, 2);
+        execute("INSERT INTO %s (pk1, pk2, s) VALUES (?, ?, ?)", 2, 1, 9);
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 3, 1, 1, 9, 1);
+
+        assertRows(execute("SELECT * FROM %s WHERE pk2 = ?", 1),
+                   row(2, 1, null, 9, null),
+                   row(1, 1, 1, 9, 1),
+                   row(1, 1, 2, 9, 2),
+                   row(3, 1, 1, 9, 1));
+
+        execute("UPDATE %s SET s=?, v=? WHERE pk1=? AND pk2=? AND c=?", 9, 1, 1, 10, 2);
+        assertRows(execute("SELECT * FROM %s WHERE pk2 = ?", 10), row(1, 10, 2, 9, 1));
+
+        execute("UPDATE %s SET s=? WHERE pk1=? AND pk2=?", 9, 1, 20);
+        assertRows(execute("SELECT * FROM %s WHERE pk2 = ?", 20), row(1, 20, null, 9, null));
+    }
+
+    private ResultMessage.Prepared prepareStatement(String cql, boolean forThrift)
+    {
+        return QueryProcessor.prepare(String.format(cql, KEYSPACE, currentTable()),
+                                      ClientState.forInternalCalls(),
+                                      forThrift);
+    }
+
+    private void validateCell(Cell cell, ColumnDefinition def, ByteBuffer val, long timestamp)
+    {
+        assertNotNull(cell);
+        assertEquals(0, def.type.compare(cell.value(), val));
+        assertEquals(timestamp, cell.timestamp());
+    }
+
+    private static void assertColumnValue(int expected, String name, Row row, CFMetaData cfm)
+    {
+        ColumnDefinition col = cfm.getColumnDefinition(new ColumnIdentifier(name, true));
+        AbstractType<?> type = col.type;
+        assertEquals(expected, type.compose(row.getCell(col).value()));
+    }
+
+    /**
+     * <code>CassandraIndex</code> that blocks during the initialization.
+     */
+    public static class IndexBlockingOnInitialization extends CustomCassandraIndex
+    {
+        private final CountDownLatch latch = new CountDownLatch(1);
+
+        public IndexBlockingOnInitialization(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+        {
+            super(baseCfs, indexDef);
+        }
+
+        @Override
+        public Callable<?> getInitializationTask()
+        {
+            return () -> {
+                latch.await();
+                return null;
+            };
+        }
+
+        @Override
+        public Callable<?> getInvalidateTask()
+        {
+            latch.countDown();
+            return super.getInvalidateTask();
+        }
     }
 
     @Test
@@ -902,109 +1274,46 @@
         assertEmpty(execute("SELECT * FROM %s WHERE a = 5"));
     }
 
-    /**
-     * Custom index used to test the behavior of the system when the index is not ready.
-     * As Custom indices cannot by <code>PerColumnSecondaryIndex</code> we use a <code>PerRowSecondaryIndex</code>
-     * to avoid the check but return a <code>CompositesSearcher</code>.
-     */
-    public static class IndexBlockingOnInitialization extends PerRowSecondaryIndex
+    @Test
+    public void testIndicesOnCompactTable() throws Throwable
     {
-        private volatile CountDownLatch latch = new CountDownLatch(1);
+        assertInvalidMessage("COMPACT STORAGE with composite PRIMARY KEY allows no more than one column not part of the PRIMARY KEY (got: v1, v2)",
+                             "CREATE TABLE test (pk int, c int, v1 int, v2 int, PRIMARY KEY(pk, c)) WITH COMPACT STORAGE");
 
-        @Override
-        public void index(ByteBuffer rowKey, ColumnFamily cf)
-        {
-            try
-            {
-                latch.await();
-            }
-            catch (InterruptedException e)
-            {
-                Thread.interrupted();
-            }
-        }
+        createTable("CREATE TABLE %s (pk int, c int, v int, PRIMARY KEY(pk, c)) WITH COMPACT STORAGE");
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
+                             "CREATE INDEX ON %s(v)");
 
-        @Override
-        public void delete(DecoratedKey key, Group opGroup)
-        {
-        }
+        createTable("CREATE TABLE %s (pk int PRIMARY KEY, v int) WITH COMPACT STORAGE");
+        createIndex("CREATE INDEX ON %s(v)");
 
-        @Override
-        public void init()
-        {
-        }
+        execute("INSERT INTO %s (pk, v) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (pk, v) VALUES (?, ?)", 2, 1);
+        execute("INSERT INTO %s (pk, v) VALUES (?, ?)", 3, 3);
 
-        @Override
-        public void reload()
-        {
-        }
+        assertRows(execute("SELECT pk, v FROM %s WHERE v = 1"),
+                   row(1, 1),
+                   row(2, 1));
 
-        @Override
-        public void validateOptions() throws ConfigurationException
-        {
-        }
+        assertRows(execute("SELECT pk, v FROM %s WHERE v = 3"),
+                   row(3, 3));
 
-        @Override
-        public String getIndexName()
-        {
-            return "testIndex";
-        }
+        assertEmpty(execute("SELECT pk, v FROM %s WHERE v = 5"));
 
-        @Override
-        protected SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
-        {
-            return new CompositesSearcher(baseCfs.indexManager, columns)
-            {
-                @Override
-                public boolean canHandleIndexClause(List<IndexExpression> clause)
-                {
-                    return true;
-                }
+        createTable("CREATE TABLE %s (pk int PRIMARY KEY, v1 int, v2 int) WITH COMPACT STORAGE");
+        createIndex("CREATE INDEX ON %s(v1)");
 
-                @Override
-                public void validate(IndexExpression indexExpression) throws InvalidRequestException
-                {
-                }
-            };
-        }
+        execute("INSERT INTO %s (pk, v1, v2) VALUES (?, ?, ?)", 1, 1, 1);
+        execute("INSERT INTO %s (pk, v1, v2) VALUES (?, ?, ?)", 2, 1, 2);
+        execute("INSERT INTO %s (pk, v1, v2) VALUES (?, ?, ?)", 3, 3, 3);
 
-        @Override
-        public void forceBlockingFlush()
-        {
-        }
+        assertRows(execute("SELECT pk, v2 FROM %s WHERE v1 = 1"),
+                   row(1, 1),
+                   row(2, 2));
 
-        @Override
-        public ColumnFamilyStore getIndexCfs()
-        {
-            return baseCfs;
-        }
+        assertRows(execute("SELECT pk, v2 FROM %s WHERE v1 = 3"),
+                   row(3, 3));
 
-        @Override
-        public void removeIndex(ByteBuffer columnName)
-        {
-            latch.countDown();
-        }
-
-        @Override
-        public void invalidate()
-        {
-        }
-
-        @Override
-        public void truncateBlocking(long truncatedAt)
-        {
-        }
-
-        @Override
-        public boolean indexes(CellName name)
-        {
-            return false;
-        }
-
-        @Override
-        public long estimateResultRows()
-        {
-            return 0;
-        }
+        assertEmpty(execute("SELECT pk, v2 FROM %s WHERE v1 = 5"));
     }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/StaticColumnsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/StaticColumnsTest.java
index cef6f1f..efa48ae 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/StaticColumnsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/StaticColumnsTest.java

@@ -268,4 +268,28 @@
         // We shouldn 't allow static when there is not clustering columns
         assertInvalid("ALTER TABLE %s ADD bar2 text static");
     }
+
+    /**
+     * Ensure that deleting and compacting a static row that should be purged doesn't throw.
+     * This is a test for #11988.
+     */
+    @Test
+    public void testStaticColumnPurging() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pkey text, ckey text, value text, static_value text static, PRIMARY KEY(pkey, ckey)) WITH gc_grace_seconds = 0");
+
+        execute("INSERT INTO %s (pkey, ckey, static_value, value) VALUES (?, ?, ?, ?)", "k1", "c1", "s1", "v1");
+
+        flush();
+
+        execute("DELETE static_value FROM %s WHERE pkey = ?", "k1");
+
+        flush();
+
+        Thread.sleep(1000);
+
+        compact();
+
+        assertRows(execute("SELECT * FROM %s"), row("k1", "c1", null, "v1"));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java
index 3e70cd0..b41163c 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java

@@ -152,4 +152,33 @@
         execute("INSERT INTO %s (k, i) VALUES (1, 1) USING TIMESTAMP ?", unset()); // treat as 'now'
     }
 
+    @Test
+    public void testTimestampsOnUnsetColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, i int)");
+        execute("INSERT INTO %s (k, i) VALUES (1, 1) USING TIMESTAMP 1;");
+        execute("INSERT INTO %s (k) VALUES (2) USING TIMESTAMP 2;");
+        execute("INSERT INTO %s (k, i) VALUES (3, 3) USING TIMESTAMP 1;");
+        assertRows(execute("SELECT k, i, writetime(i) FROM %s "),
+                   row(1, 1, 1L),
+                   row(2, null, null),
+                   row(3, 3, 1L));
+    }
+
+    @Test
+    public void testTimestampsOnUnsetColumnsWide() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int , c int, i int, PRIMARY KEY (k, c))");
+        execute("INSERT INTO %s (k, c, i) VALUES (1, 1, 1) USING TIMESTAMP 1;");
+        execute("INSERT INTO %s (k, c) VALUES (1, 2) USING TIMESTAMP 1;");
+        execute("INSERT INTO %s (k, c, i) VALUES (1, 3, 1) USING TIMESTAMP 1;");
+        execute("INSERT INTO %s (k, c) VALUES (2, 2) USING TIMESTAMP 2;");
+        execute("INSERT INTO %s (k, c, i) VALUES (3, 3, 3) USING TIMESTAMP 1;");
+        assertRows(execute("SELECT k, c, i, writetime(i) FROM %s "),
+                   row(1, 1, 1, 1L),
+                   row(1, 2, null, null),
+                   row(1, 3, 1, 1L),
+                   row(2, 2, null, null),
+                   row(3, 3, 3, 1L));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java
index 3b4fb40..bace751 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java

@@ -17,6 +17,10 @@
  */
 package org.apache.cassandra.cql3.validation.entities;
 
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Locale;
+
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
@@ -110,7 +114,7 @@
             row(0, 4, tuple(null, "1"))
         );
 
-        assertInvalidMessage("Invalid tuple literal: too many elements. Type tuple<int, text> expects 2 but got 3",
+        assertInvalidMessage("Invalid tuple literal: too many elements. Type frozen<tuple<int, text>> expects 2 but got 3",
                              "INSERT INTO %s(k, t) VALUES (1,'1:2:3')");
     }
 
@@ -121,7 +125,7 @@
 
         assertInvalidSyntax("INSERT INTO %s (k, t) VALUES (0, ())");
 
-        assertInvalidMessage("Invalid tuple literal for t: too many elements. Type tuple<int, text, double> expects 3 but got 4",
+        assertInvalidMessage("Invalid tuple literal for t: too many elements. Type frozen<tuple<int, text, double>> expects 3 but got 4",
                              "INSERT INTO %s (k, t) VALUES (0, (2, 'foo', 3.1, 'bar'))");
 
         createTable("CREATE TABLE %s (k int PRIMARY KEY, t frozen<tuple<int, tuple<int, text, double>>>)");
@@ -129,7 +133,7 @@
                              "INSERT INTO %s (k, t) VALUES (0, ?)",
                              tuple(1, tuple(1, "1", 1.0, 1)));
 
-        assertInvalidMessage("Invalid tuple literal for t: component 1 is not of type tuple<int, text, double>",
+        assertInvalidMessage("Invalid tuple literal for t: component 1 is not of type frozen<tuple<int, text, double>>",
                              "INSERT INTO %s (k, t) VALUES (0, (1, (1, '1', 1.0, 1)))");
     }
 
@@ -211,4 +215,14 @@
         assertInvalidMessage("Not enough bytes to read 0th component",
                              "INSERT INTO %s (pk, t) VALUES (?, ?)", 1, Long.MAX_VALUE);
     }
+
+    @Test
+    public void testReversedTypeTuple() throws Throwable
+    {
+        // CASSANDRA-13717
+        createTable("CREATE TABLE %s (id int, tdemo frozen<tuple<timestamp, varchar>>, primary key (id, tdemo)) with clustering order by (tdemo desc)");
+        execute("INSERT INTO %s (id, tdemo) VALUES (1, ('2017-02-03 03:05+0000','Europe'))");
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mmX", Locale.ENGLISH);
+        assertRows(execute("SELECT tdemo FROM %s"), row(tuple( df.parse("2017-02-03 03:05+0000"), "Europe")));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TypeTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TypeTest.java
index 1354fb0..60a0fdc 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/TypeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TypeTest.java

@@ -51,11 +51,11 @@
         createTable("CREATE TABLE %s (a int, b timestamp, c bigint, d varint, PRIMARY KEY (a, b, c, d))");
 
         execute("INSERT INTO %s (a, b, c, d) VALUES (0, toUnixTimestamp(now()), toTimestamp(now()), toTimestamp(now()))");
-        UntypedResultSet results = execute("SELECT * FROM %s WHERE a=0 AND b < toUnixTimestamp(now())");
+        UntypedResultSet results = execute("SELECT * FROM %s WHERE a=0 AND b <= toUnixTimestamp(now())");
         assertEquals(1, results.size());
 
         execute("INSERT INTO %s (a, b, c, d) VALUES (1, unixTimestampOf(now()), dateOf(now()), dateOf(now()))");
-        results = execute("SELECT * FROM %s WHERE a=1 AND b < toUnixTimestamp(now())");
+        results = execute("SELECT * FROM %s WHERE a=1 AND b <= toUnixTimestamp(now())");
         assertEquals(1, results.size());
     }
 
@@ -67,36 +67,4 @@
         UntypedResultSet results = execute("SELECT * FROM %s WHERE a=0 AND b < now()");
         assertEquals(1, results.size());
     }
-
-    @Test
-    // tests CASSANDRA-7797
-    public void testAlterReversedColumn() throws Throwable
-    {
-        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
-        alterTable("ALTER TABLE %s ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.TimestampType)'");
-    }
-
-    @Test
-    public void testIncompatibleReversedTypes() throws Throwable
-    {
-        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
-        try
-        {
-            alterTable("ALTER TABLE %s ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.TimeUUIDType)'");
-            fail("Expected error for ALTER statement");
-        }
-        catch (RuntimeException e) { }
-    }
-
-    @Test
-    public void testReversedAndNonReversed() throws Throwable
-    {
-        createTable("CREATE TABLE IF NOT EXISTS %s (a int, b 'org.apache.cassandra.db.marshal.DateType', PRIMARY KEY (a, b))");
-        try
-        {
-            alterTable("ALTER TABLE %s ALTER b TYPE 'org.apache.cassandra.db.marshal.ReversedType(org.apache.cassandra.db.marshal.DateType)'");
-            fail("Expected error for ALTER statement");
-        }
-        catch (RuntimeException e) { }
-    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFAuthTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFAuthTest.java
index b2b9946..e5ecc72 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UFAuthTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFAuthTest.java

@@ -25,17 +25,15 @@
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.auth.*;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.Attributes;
 import org.apache.cassandra.cql3.CQLStatement;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.cql3.functions.Functions;
 import org.apache.cassandra.cql3.statements.BatchStatement;
 import org.apache.cassandra.cql3.statements.ModificationStatement;
 import org.apache.cassandra.cql3.CQLTester;
@@ -49,8 +47,6 @@
 
 public class UFAuthTest extends CQLTester
 {
-    private static final Logger logger = LoggerFactory.getLogger(UFAuthTest.class);
-
     String roleName = "test_role";
     AuthenticatedUser user;
     RoleResource role;
@@ -261,6 +257,17 @@
     }
 
     @Test
+    public void functionInStaticColumnRestrictionInSelect() throws Throwable
+    {
+        setupTable("CREATE TABLE %s (k int, s int STATIC, v1 int, v2 int, PRIMARY KEY(k, v1))");
+        String functionName = createSimpleFunction();
+        String cql = String.format("SELECT k FROM %s WHERE k = 0 AND s = %s ALLOW FILTERING",
+                                   KEYSPACE + "." + currentTable(),
+                                   functionCall(functionName));
+        assertPermissionsOnFunction(cql, functionName);
+    }
+
+    @Test
     public void functionInRegularCondition() throws Throwable
     {
         String functionName = createSimpleFunction();
@@ -308,14 +315,14 @@
     public void systemFunctionsRequireNoExplicitPrivileges() throws Throwable
     {
         // with terminal arguments, so evaluated at prepare time
-        String cql = String.format("UPDATE %s SET v2 = 0 WHERE k = blobasint(intasblob(0))",
+        String cql = String.format("UPDATE %s SET v2 = 0 WHERE k = blobasint(intasblob(0)) and v1 = 0",
                                    KEYSPACE + "." + currentTable());
         getStatement(cql).checkAccess(clientState);
 
         // with non-terminal arguments, so evaluated at execution
         String functionName = createSimpleFunction();
         grantExecuteOnFunction(functionName);
-        cql = String.format("UPDATE %s SET v2 = 0 WHERE k = blobasint(intasblob(%s))",
+        cql = String.format("UPDATE %s SET v2 = 0 WHERE k = blobasint(intasblob(%s)) and v1 = 0",
                             KEYSPACE + "." + currentTable(),
                             functionCall(functionName));
         getStatement(cql).checkAccess(clientState);
@@ -442,6 +449,31 @@
         getStatement(cql).checkAccess(clientState);
     }
 
+    @Test
+    public void grantAndRevokeSyntaxRequiresExplicitKeyspace() throws Throwable
+    {
+        setupTable("CREATE TABLE %s (k int, s int STATIC, v1 int, v2 int, PRIMARY KEY(k, v1))");
+        String functionName = shortFunctionName(createSimpleFunction());
+        assertRequiresKeyspace(String.format("GRANT EXECUTE ON FUNCTION %s() TO %s",
+                                             functionName,
+                                             role.getRoleName()));
+        assertRequiresKeyspace(String.format("REVOKE EXECUTE ON FUNCTION %s() FROM %s",
+                                             functionName,
+                                             role.getRoleName()));
+    }
+
+    private void assertRequiresKeyspace(String cql) throws Throwable
+    {
+        try
+        {
+            getStatement(cql);
+        }
+        catch (InvalidRequestException e)
+        {
+            assertEquals("In this context function name must be explictly qualified by a keyspace", e.getMessage());
+        }
+    }
+
     private void assertPermissionsOnNestedFunctions(String innerFunction, String outerFunction) throws Throwable
     {
         String cql = String.format("SELECT k, %s FROM %s WHERE k=0",
@@ -607,12 +639,12 @@
         // It is here to avoid having to duplicate the functionality of CqlParser
         // for transforming cql types into AbstractTypes
         FunctionName fn = parseFunctionName(functionName);
-        List<Function> functions = Functions.find(fn);
+        Collection<Function> functions = Schema.instance.getFunctions(fn);
         assertEquals(String.format("Expected a single function definition for %s, but found %s",
                                    functionName,
                                    functions.size()),
                      1, functions.size());
-        return FunctionResource.function(fn.keyspace, fn.name, functions.get(0).argTypes());
+        return FunctionResource.function(fn.keyspace, fn.name, functions.iterator().next().argTypes());
     }
 
     private String functionCall(String functionName, String...args)

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFIdentificationTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFIdentificationTest.java
index ecc9d47..b2288e4 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UFIdentificationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFIdentificationTest.java

@@ -97,15 +97,15 @@
     @Test
     public void testSimpleModificationStatement() throws Throwable
     {
-        assertFunctions(cql("INSERT INTO %s (key, t_sc) VALUES (0, %s)", functionCall(tFunc, "'foo'")), tFunc);
-        assertFunctions(cql("INSERT INTO %s (key, i_cc) VALUES (0, %s)", functionCall(iFunc, "1")), iFunc);
-        assertFunctions(cql("INSERT INTO %s (key, t_cc) VALUES (0, %s)", functionCall(tFunc, "'foo'")), tFunc);
-        assertFunctions(cql("INSERT INTO %s (key, i_val) VALUES (0, %s)", functionCall(iFunc, "1")), iFunc);
-        assertFunctions(cql("INSERT INTO %s (key, l_val) VALUES (0, %s)", functionCall(lFunc, "[1]")), lFunc);
-        assertFunctions(cql("INSERT INTO %s (key, s_val) VALUES (0, %s)", functionCall(sFunc, "{1}")), sFunc);
-        assertFunctions(cql("INSERT INTO %s (key, m_val) VALUES (0, %s)", functionCall(mFunc, "{1:1}")), mFunc);
-        assertFunctions(cql("INSERT INTO %s (key, udt_val) VALUES (0,%s)", functionCall(udtFunc, "{i : 1, t : 'foo'}")), udtFunc);
-        assertFunctions(cql("INSERT INTO %s (key, u_val) VALUES (0, %s)", functionCall(uFunc, "now()")), uFunc, "system.now");
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, t_sc) VALUES (0, 0, 'A', %s)", functionCall(tFunc, "'foo'")), tFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc) VALUES (0, %s, 'A')", functionCall(iFunc, "1")), iFunc);
+        assertFunctions(cql("INSERT INTO %s (key, t_cc, i_cc) VALUES (0, %s, 1)", functionCall(tFunc, "'foo'")), tFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, i_val) VALUES (0, 0, 'A', %s)", functionCall(iFunc, "1")), iFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, l_val) VALUES (0, 0, 'A', %s)", functionCall(lFunc, "[1]")), lFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, s_val) VALUES (0, 0, 'A', %s)", functionCall(sFunc, "{1}")), sFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, m_val) VALUES (0, 0, 'A', %s)", functionCall(mFunc, "{1:1}")), mFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, udt_val) VALUES (0, 0, 'A', %s)", functionCall(udtFunc, "{i : 1, t : 'foo'}")), udtFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, u_val) VALUES (0, 0, 'A', %s)", functionCall(uFunc, "now()")), uFunc, "system.now");
     }
 
     @Test
@@ -113,48 +113,48 @@
     {
         String iFunc2 = createEchoFunction("int");
         String mapValue = String.format("{%s:%s}", functionCall(iFunc, "1"), functionCall(iFunc2, "1"));
-        assertFunctions(cql("INSERT INTO %s (key, m_val) VALUES (0, %s)", mapValue), iFunc, iFunc2);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, m_val) VALUES (0, 0, 'A', %s)", mapValue), iFunc, iFunc2);
 
         String listValue = String.format("[%s]", functionCall(iFunc, "1"));
-        assertFunctions(cql("INSERT INTO %s (key, l_val) VALUES (0, %s)", listValue), iFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, l_val) VALUES (0, 0, 'A',  %s)", listValue), iFunc);
 
         String setValue = String.format("{%s}", functionCall(iFunc, "1"));
-        assertFunctions(cql("INSERT INTO %s (key, s_val) VALUES (0, %s)", setValue), iFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, s_val) VALUES (0, 0, 'A', %s)", setValue), iFunc);
     }
 
     @Test
     public void testNonTerminalUDTLiterals() throws Throwable
     {
         String udtValue = String.format("{ i: %s, t : %s } ", functionCall(iFunc, "1"), functionCall(tFunc, "'foo'"));
-        assertFunctions(cql("INSERT INTO %s (key, udt_val) VALUES (0, %s)", udtValue), iFunc, tFunc);
+        assertFunctions(cql("INSERT INTO %s (key, i_cc, t_cc, udt_val) VALUES (0, 0, 'A', %s)", udtValue), iFunc, tFunc);
     }
 
     @Test
     public void testModificationStatementWithConditions() throws Throwable
     {
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF t_sc=%s", functionCall(tFunc, "'foo'")), tFunc);
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF i_val=%s", functionCall(iFunc, "1")), iFunc);
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF l_val=%s", functionCall(lFunc, "[1]")), lFunc);
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF s_val=%s", functionCall(sFunc, "{1}")), sFunc);
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF m_val=%s", functionCall(mFunc, "{1:1}")), mFunc);
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF t_sc=%s", functionCall(tFunc, "'foo'")), tFunc);
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF i_val=%s", functionCall(iFunc, "1")), iFunc);
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF l_val=%s", functionCall(lFunc, "[1]")), lFunc);
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF s_val=%s", functionCall(sFunc, "{1}")), sFunc);
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF m_val=%s", functionCall(mFunc, "{1:1}")), mFunc);
 
 
         String iFunc2 = createEchoFunction("int");
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF i_val IN (%s, %S)",
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF i_val IN (%s, %S)",
                             functionCall(iFunc, "1"),
                             functionCall(iFunc2, "2")),
                         iFunc, iFunc2);
 
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF u_val=%s",
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF u_val=%s",
                             functionCall(uFunc, "now()")),
                         uFunc, "system.now");
 
         // conditions on collection elements
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF l_val[%s] = %s",
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF l_val[%s] = %s",
                             functionCall(iFunc, "1"),
                             functionCall(iFunc2, "1")),
                         iFunc, iFunc2);
-        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 IF m_val[%s] = %s",
+        assertFunctions(cql("UPDATE %s SET i_val=0 WHERE key=0 AND i_cc = 0 AND t_cc = 'A' IF m_val[%s] = %s",
                             functionCall(iFunc, "1"),
                             functionCall(iFunc2, "1")),
                         iFunc, iFunc2);
@@ -200,16 +200,18 @@
     public void testSelectStatementSimpleRestrictions() throws Throwable
     {
         assertFunctions(cql("SELECT i_val FROM %s WHERE key=%s", functionCall(iFunc, "1")), iFunc);
-        assertFunctions(cql("SELECT i_val FROM %s WHERE key=0 AND i_cc=%s AND t_cc='foo'", functionCall(iFunc, "1")), iFunc);
-        assertFunctions(cql("SELECT i_val FROM %s WHERE key=0 AND i_cc=0 AND t_cc=%s", functionCall(tFunc, "'foo'")), tFunc);
+        assertFunctions(cql("SELECT i_val FROM %s WHERE key=0 AND t_sc=%s ALLOW FILTERING", functionCall(tFunc, "'foo'")), tFunc);
+        assertFunctions(cql("SELECT i_val FROM %s WHERE key=0 AND i_cc=%s AND t_cc='foo' ALLOW FILTERING", functionCall(iFunc, "1")), iFunc);
+        assertFunctions(cql("SELECT i_val FROM %s WHERE key=0 AND i_cc=0 AND t_cc=%s ALLOW FILTERING", functionCall(tFunc, "'foo'")), tFunc);
 
         String iFunc2 = createEchoFunction("int");
         String tFunc2 = createEchoFunction("text");
-        assertFunctions(cql("SELECT i_val FROM %s WHERE key=%s AND i_cc=%s AND t_cc=%s",
+        assertFunctions(cql("SELECT i_val FROM %s WHERE key=%s AND t_sc=%s AND i_cc=%s AND t_cc=%s ALLOW FILTERING",
                             functionCall(iFunc, "1"),
+                            functionCall(tFunc, "'foo'"),
                             functionCall(iFunc2, "1"),
                             functionCall(tFunc2, "'foo'")),
-                        iFunc, iFunc2, tFunc2);
+                        iFunc, tFunc, iFunc2, tFunc2);
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFJavaTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFJavaTest.java
index caef808..15c8e18 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UFJavaTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFJavaTest.java

@@ -18,7 +18,6 @@
 
 package org.apache.cassandra.cql3.validation.entities;
 
-import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
@@ -27,7 +26,6 @@
 import java.util.TreeSet;
 
 import org.junit.Assert;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
 import com.datastax.driver.core.DataType;
@@ -35,27 +33,17 @@
 import com.datastax.driver.core.TupleType;
 import com.datastax.driver.core.TupleValue;
 import com.datastax.driver.core.UDTValue;
-import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.cql3.functions.Functions;
-import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.exceptions.FunctionExecutionException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.transport.Server;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class UFJavaTest extends CQLTester
 {
-    @BeforeClass
-    public static void setUp()
-    {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
-    }
-
-
     @Test
     public void testJavaFunctionNoParameters() throws Throwable
     {
@@ -70,7 +58,7 @@
                                       "LANGUAGE JAVA\n" +
                                       "AS '" +functionBody + "';");
 
-        assertRows(execute("SELECT language, body FROM system.schema_functions WHERE keyspace_name=? AND function_name=?",
+        assertRows(execute("SELECT language, body FROM system_schema.functions WHERE keyspace_name=? AND function_name=?",
                            KEYSPACE, parseFunctionName(fName).name),
                    row("java", functionBody));
 
@@ -174,7 +162,7 @@
 
         FunctionName fNameName = parseFunctionName(fName);
 
-        assertRows(execute("SELECT language, body FROM system.schema_functions WHERE keyspace_name=? AND function_name=?",
+        assertRows(execute("SELECT language, body FROM system_schema.functions WHERE keyspace_name=? AND function_name=?",
                            fNameName.keyspace, fNameName.name),
                    row("java", functionBody));
 
@@ -236,7 +224,7 @@
 
         FunctionName fNameName = parseFunctionName(fName);
 
-        assertRows(execute("SELECT language, body FROM system.schema_functions WHERE keyspace_name=? AND function_name=?",
+        assertRows(execute("SELECT language, body FROM system_schema.functions WHERE keyspace_name=? AND function_name=?",
                            fNameName.keyspace, fNameName.name),
                    row("java", functionBody));
 
@@ -267,7 +255,7 @@
 
         FunctionName fNameName = parseFunctionName(fName);
 
-        assertRows(execute("SELECT language, body FROM system.schema_functions WHERE keyspace_name=? AND function_name=?",
+        assertRows(execute("SELECT language, body FROM system_schema.functions WHERE keyspace_name=? AND function_name=?",
                            fNameName.keyspace, fNameName.name),
                    row("java", functionBody));
 
@@ -300,7 +288,7 @@
 
         FunctionName fNameName = parseFunctionName(fName);
 
-        assertRows(execute("SELECT language, body FROM system.schema_functions WHERE keyspace_name=? AND function_name=?",
+        assertRows(execute("SELECT language, body FROM system_schema.functions WHERE keyspace_name=? AND function_name=?",
                            fNameName.keyspace, fNameName.name),
                    row("java", functionBody));
     }
@@ -342,7 +330,7 @@
                    row(list, set, map));
 
         // same test - but via native protocol
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
             assertRowsNet(version,
                           executeNet(version, "SELECT " + fList + "(lst), " + fSet + "(st), " + fMap + "(mp) FROM %s WHERE key = 1"),
                           row(list, set, map));
@@ -436,12 +424,17 @@
         assertRows(execute("SELECT " + fTup4 + "(tup) FROM %s WHERE key = 1"),
                    row(map));
 
-        TupleType tType = TupleType.of(DataType.cdouble(),
-                                       DataType.list(DataType.cdouble()),
-                                       DataType.set(DataType.text()),
-                                       DataType.map(DataType.cint(), DataType.cboolean()));
+        // same test - but via native protocol
+        // we use protocol V3 here to encode the expected version because the server
+        // always serializes Collections using V3 - see CollectionSerializer's
+        // serialize and deserialize methods.
+        TupleType tType = tupleTypeOf(Server.VERSION_3,
+                                      DataType.cdouble(),
+                                      DataType.list(DataType.cdouble()),
+                                      DataType.set(DataType.text()),
+                                      DataType.map(DataType.cint(), DataType.cboolean()));
         TupleValue tup = tType.newValue(1d, list, set, map);
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
         {
             assertRowsNet(version,
                           executeNet(version, "SELECT " + fTup0 + "(tup) FROM %s WHERE key = 1"),
@@ -468,7 +461,7 @@
         createTable("CREATE TABLE %s (key int primary key, udt frozen<" + KEYSPACE + '.' + type + ">)");
         execute("INSERT INTO %s (key, udt) VALUES (1, {txt: 'one', i:1})");
 
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
         {
             executeNet(version, "USE " + KEYSPACE);
 
@@ -532,7 +525,7 @@
         assertRows(execute("SELECT " + fUdt2 + "(udt) FROM %s WHERE key = 1"),
                    row(1));
 
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
         {
             List<Row> rowsNet = executeNet(version, "SELECT " + fUdt0 + "(udt) FROM %s WHERE key = 1").all();
             Assert.assertEquals(1, rowsNet.size());
@@ -658,7 +651,7 @@
                               "AS $$return " +
                               "     udt.getString(\"txt\");$$;",
                               fName1replace, type));
-        Assert.assertEquals(1, Functions.find(parseFunctionName(fName1replace)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(fName1replace)).size());
         execute(String.format("CREATE OR REPLACE FUNCTION %s( udt %s ) " +
                               "CALLED ON NULL INPUT " +
                               "RETURNS int " +
@@ -666,7 +659,7 @@
                               "AS $$return " +
                               "     Integer.valueOf(udt.getInt(\"i\"));$$;",
                               fName2replace, type));
-        Assert.assertEquals(1, Functions.find(parseFunctionName(fName2replace)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(fName2replace)).size());
         execute(String.format("CREATE OR REPLACE FUNCTION %s( udt %s ) " +
                               "CALLED ON NULL INPUT " +
                               "RETURNS double " +
@@ -674,7 +667,7 @@
                               "AS $$return " +
                               "     Double.valueOf(udt.getDouble(\"added\"));$$;",
                               fName3replace, type));
-        Assert.assertEquals(1, Functions.find(parseFunctionName(fName3replace)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(fName3replace)).size());
         execute(String.format("CREATE OR REPLACE FUNCTION %s( udt %s ) " +
                               "RETURNS NULL ON NULL INPUT " +
                               "RETURNS %s " +
@@ -682,7 +675,7 @@
                               "AS $$return " +
                               "     udt;$$;",
                               fName4replace, type, type));
-        Assert.assertEquals(1, Functions.find(parseFunctionName(fName4replace)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(fName4replace)).size());
 
         assertRows(execute("SELECT " + fName1replace + "(udt) FROM %s WHERE key = 2"),
                    row("two"));
@@ -757,398 +750,22 @@
         assertRows(execute("SELECT " + fName1 + "(lst), " + fName2 + "(st), " + fName3 + "(mp) FROM %s WHERE key = 1"),
                    row("three", "one", "two"));
 
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
             assertRowsNet(version,
                           executeNet(version, "SELECT " + fName1 + "(lst), " + fName2 + "(st), " + fName3 + "(mp) FROM %s WHERE key = 1"),
                           row("three", "one", "two"));
     }
 
     @Test
-    public void testFunctionWithFrozenSetType() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<set<int>>)");
-        createIndex("CREATE INDEX ON %s (FULL(b))");
-
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, set());
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, set(1, 2, 3));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, set(4, 5, 6));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, set(7, 8, 9));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenSetArg(values frozen<set<int>>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS int " +
-                             "LANGUAGE java\n" +
-                             "AS 'int sum = 0; for (Object value : values) {sum += value;} return sum;';");
-
-        assertInvalidMessage("The function return type should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values set<int>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS frozen<set<int>> " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values;';");
-
-        String functionName = createFunction(KEYSPACE,
-                                             "set<int>",
-                                             "CREATE FUNCTION %s (values set<int>) " +
-                                             "CALLED ON NULL INPUT " +
-                                             "RETURNS int " +
-                                             "LANGUAGE java\n" +
-                                             "AS 'int sum = 0; for (Object value : values) {sum += ((Integer) value);} return sum;';");
-
-        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s"),
-                   row(0, 0),
-                   row(1, 6),
-                   row(2, 15),
-                   row(3, 24));
-
-        functionName = createFunction(KEYSPACE,
-                                      "set<int>",
-                                      "CREATE FUNCTION %s (values set<int>) " +
-                                      "CALLED ON NULL INPUT " +
-                                      "RETURNS set<int> " +
-                                      "LANGUAGE java\n" +
-                                      "AS 'return values;';");
-
-        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", set(1, 2, 3)),
-                   row(1));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "DROP FUNCTION " + functionName + "(frozen<set<int>>);");
-    }
-
-    @Test
-    public void testFunctionWithFrozenListType() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<list<int>>)");
-        createIndex("CREATE INDEX ON %s (FULL(b))");
-
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, list());
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, list(1, 2, 3));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, list(4, 5, 6));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, list(7, 8, 9));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<list<int>>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS int " +
-                             "LANGUAGE java\n" +
-                             "AS 'int sum = 0; for (Object value : values) {sum += value;} return sum;';");
-
-        assertInvalidMessage("The function return type should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values list<int>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS frozen<list<int>> " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values;';");
-
-        String functionName = createFunction(KEYSPACE,
-                                             "list<int>",
-                                             "CREATE FUNCTION %s (values list<int>) " +
-                                             "CALLED ON NULL INPUT " +
-                                             "RETURNS int " +
-                                             "LANGUAGE java\n" +
-                                             "AS 'int sum = 0; for (Object value : values) {sum += ((Integer) value);} return sum;';");
-
-        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s"),
-                   row(0, 0),
-                   row(1, 6),
-                   row(2, 15),
-                   row(3, 24));
-
-        functionName = createFunction(KEYSPACE,
-                                      "list<int>",
-                                      "CREATE FUNCTION %s (values list<int>) " +
-                                      "CALLED ON NULL INPUT " +
-                                      "RETURNS list<int> " +
-                                      "LANGUAGE java\n" +
-                                      "AS 'return values;';");
-
-        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", set(1, 2, 3)),
-                   row(1));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "DROP FUNCTION " + functionName + "(frozen<list<int>>);");
-    }
-
-    @Test
-    public void testFunctionWithFrozenMapType() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<map<int, int>>)");
-        createIndex("CREATE INDEX ON %s (FULL(b))");
-
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, map());
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, map(1, 1, 2, 2, 3, 3));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, map(4, 4, 5, 5, 6, 6));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, map(7, 7, 8, 8, 9, 9));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<map<int, int>>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS int " +
-                             "LANGUAGE java\n" +
-                             "AS 'int sum = 0; for (Object value : values.values()) {sum += value;} return sum;';");
-
-        assertInvalidMessage("The function return type should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values map<int, int>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS frozen<map<int, int>> " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values;';");
-
-        String functionName = createFunction(KEYSPACE,
-                                             "map<int, int>",
-                                             "CREATE FUNCTION %s (values map<int, int>) " +
-                                             "CALLED ON NULL INPUT " +
-                                             "RETURNS int " +
-                                             "LANGUAGE java\n" +
-                                             "AS 'int sum = 0; for (Object value : values.values()) {sum += ((Integer) value);} return sum;';");
-
-        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s"),
-                   row(0, 0),
-                   row(1, 6),
-                   row(2, 15),
-                   row(3, 24));
-
-        functionName = createFunction(KEYSPACE,
-                                      "map<int, int>",
-                                      "CREATE FUNCTION %s (values map<int, int>) " +
-                                      "CALLED ON NULL INPUT " +
-                                      "RETURNS map<int, int> " +
-                                      "LANGUAGE java\n" +
-                                      "AS 'return values;';");
-
-        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", map(1, 1, 2, 2, 3, 3)),
-                   row(1));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "DROP FUNCTION " + functionName + "(frozen<map<int, int>>);");
-    }
-
-    @Test
-    public void testFunctionWithFrozenTupleType() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<tuple<int, int>>)");
-        createIndex("CREATE INDEX ON %s (b)");
-
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, tuple());
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, tuple(1, 2));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, tuple(4, 5));
-        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, tuple(7, 8));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<tuple<int, int>>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS text " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values.toString();';");
-
-        assertInvalidMessage("The function return type should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values tuple<int, int>) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS frozen<tuple<int, int>> " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values;';");
-
-        String functionName = createFunction(KEYSPACE,
-                                             "tuple<int, int>",
-                                             "CREATE FUNCTION %s (values tuple<int, int>) " +
-                                             "CALLED ON NULL INPUT " +
-                                             "RETURNS text " +
-                                             "LANGUAGE java\n" +
-                                             "AS 'return values.toString();';");
-
-        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s"),
-                   row(0, "(null, null)"),
-                   row(1, "(1, 2)"),
-                   row(2, "(4, 5)"),
-                   row(3, "(7, 8)"));
-
-        functionName = createFunction(KEYSPACE,
-                                      "tuple<int, int>",
-                                      "CREATE FUNCTION %s (values tuple<int, int>) " +
-                                      "CALLED ON NULL INPUT " +
-                                      "RETURNS tuple<int, int> " +
-                                      "LANGUAGE java\n" +
-                                      "AS 'return values;';");
-
-        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", tuple(1, 2)),
-                   row(1));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "DROP FUNCTION " + functionName + "(frozen<tuple<int, int>>);");
-    }
-
-    @Test
-    public void testFunctionWithFrozenUDType() throws Throwable
-    {
-        String myType = createType("CREATE TYPE %s (f int)");
-        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<" + myType + ">)");
-        createIndex("CREATE INDEX ON %s (b)");
-
-        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 0, 0);
-        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 1, 1);
-        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 2, 4);
-        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 3, 7);
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<" + myType + ">) " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS text " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values.toString();';");
-
-        assertInvalidMessage("The function return type should not be frozen",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values " + myType + ") " +
-                             "CALLED ON NULL INPUT " +
-                             "RETURNS frozen<" + myType + "> " +
-                             "LANGUAGE java\n" +
-                             "AS 'return values;';");
-
-        String functionName = createFunction(KEYSPACE,
-                                             myType,
-                                             "CREATE FUNCTION %s (values " + myType + ") " +
-                                             "CALLED ON NULL INPUT " +
-                                             "RETURNS text " +
-                                             "LANGUAGE java\n" +
-                                             "AS 'return values.toString();';");
-
-        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s"),
-                   row(0, "{f:0}"),
-                   row(1, "{f:1}"),
-                   row(2, "{f:4}"),
-                   row(3, "{f:7}"));
-
-        functionName = createFunction(KEYSPACE,
-                                      myType,
-                                      "CREATE FUNCTION %s (values " + myType + ") " +
-                                      "CALLED ON NULL INPUT " +
-                                      "RETURNS " + myType + " " +
-                                      "LANGUAGE java\n" +
-                                      "AS 'return values;';");
-
-        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "({f: ?})", 1),
-                   row(1));
-
-        assertInvalidMessage("The function arguments should not be frozen",
-                             "DROP FUNCTION " + functionName + "(frozen<" + myType + ">);");
-    }
-
-    @Test
-    public void testEmptyString() throws Throwable
-    {
-        createTable("CREATE TABLE %s (key int primary key, sval text, aval ascii, bval blob, empty_int int)");
-        execute("INSERT INTO %s (key, sval, aval, bval, empty_int) VALUES (?, ?, ?, ?, blobAsInt(0x))", 1, "", "", ByteBuffer.allocate(0));
-
-        String fNameSRC = createFunction(KEYSPACE_PER_TEST, "text",
-                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
-                                         "CALLED ON NULL INPUT " +
-                                         "RETURNS text " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return val;'");
-
-        String fNameSCC = createFunction(KEYSPACE_PER_TEST, "text",
-                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
-                                         "CALLED ON NULL INPUT " +
-                                         "RETURNS text " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return \"\";'");
-
-        String fNameSRN = createFunction(KEYSPACE_PER_TEST, "text",
-                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
-                                         "RETURNS NULL ON NULL INPUT " +
-                                         "RETURNS text " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return val;'");
-
-        String fNameSCN = createFunction(KEYSPACE_PER_TEST, "text",
-                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
-                                         "RETURNS NULL ON NULL INPUT " +
-                                         "RETURNS text " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return \"\";'");
-
-        String fNameBRC = createFunction(KEYSPACE_PER_TEST, "blob",
-                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
-                                         "CALLED ON NULL INPUT " +
-                                         "RETURNS blob " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return val;'");
-
-        String fNameBCC = createFunction(KEYSPACE_PER_TEST, "blob",
-                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
-                                         "CALLED ON NULL INPUT " +
-                                         "RETURNS blob " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return ByteBuffer.allocate(0);'");
-
-        String fNameBRN = createFunction(KEYSPACE_PER_TEST, "blob",
-                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
-                                         "RETURNS NULL ON NULL INPUT " +
-                                         "RETURNS blob " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return val;'");
-
-        String fNameBCN = createFunction(KEYSPACE_PER_TEST, "blob",
-                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
-                                         "RETURNS NULL ON NULL INPUT " +
-                                         "RETURNS blob " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return ByteBuffer.allocate(0);'");
-
-        String fNameIRC = createFunction(KEYSPACE_PER_TEST, "int",
-                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
-                                         "CALLED ON NULL INPUT " +
-                                         "RETURNS int " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return val;'");
-
-        String fNameICC = createFunction(KEYSPACE_PER_TEST, "int",
-                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
-                                         "CALLED ON NULL INPUT " +
-                                         "RETURNS int " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return 0;'");
-
-        String fNameIRN = createFunction(KEYSPACE_PER_TEST, "int",
-                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
-                                         "RETURNS NULL ON NULL INPUT " +
-                                         "RETURNS int " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return val;'");
-
-        String fNameICN = createFunction(KEYSPACE_PER_TEST, "int",
-                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
-                                         "RETURNS NULL ON NULL INPUT " +
-                                         "RETURNS int " +
-                                         "LANGUAGE JAVA\n" +
-                                         "AS 'return 0;'");
-
-        assertRows(execute("SELECT " + fNameSRC + "(sval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSRN + "(sval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSCC + "(sval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSCN + "(sval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSRC + "(aval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSRN + "(aval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSCC + "(aval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameSCN + "(aval) FROM %s"), row(""));
-        assertRows(execute("SELECT " + fNameBRC + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
-        assertRows(execute("SELECT " + fNameBRN + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
-        assertRows(execute("SELECT " + fNameBCC + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
-        assertRows(execute("SELECT " + fNameBCN + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
-        assertRows(execute("SELECT " + fNameIRC + "(empty_int) FROM %s"), row(new Object[]{null}));
-        assertRows(execute("SELECT " + fNameIRN + "(empty_int) FROM %s"), row(new Object[]{null}));
-        assertRows(execute("SELECT " + fNameICC + "(empty_int) FROM %s"), row(0));
-        assertRows(execute("SELECT " + fNameICN + "(empty_int) FROM %s"), row(new Object[]{null}));
-    }
-
-    @Test
     public void testAllNativeTypes() throws Throwable
     {
         StringBuilder sig = new StringBuilder();
         StringBuilder args = new StringBuilder();
         for (CQL3Type.Native type : CQL3Type.Native.values())
         {
+            if (type == CQL3Type.Native.EMPTY)
+                continue;
+
             if (sig.length() > 0)
                 sig.append(',');
             sig.append(type.toString());
@@ -1166,6 +783,9 @@
 
         for (CQL3Type.Native type : CQL3Type.Native.values())
         {
+            if (type == CQL3Type.Native.EMPTY)
+                continue;
+
             createFunction(KEYSPACE_PER_TEST, type.toString(),
                            "CREATE OR REPLACE FUNCTION %s(val " + type.toString() + ") " +
                            "RETURNS NULL ON NULL INPUT " +

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFPureScriptTupleCollectionTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFPureScriptTupleCollectionTest.java
new file mode 100644
index 0000000..7465a2a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFPureScriptTupleCollectionTest.java

@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.DataType;
+import com.datastax.driver.core.TupleType;
+import com.datastax.driver.core.TupleValue;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.transport.Server;
+
+public class UFPureScriptTupleCollectionTest extends CQLTester
+{
+    // Just JavaScript UDFs to check how UDF - especially security/class-loading/sandboxing stuff -
+    // behaves, if no Java UDF has been executed before.
+
+    // Do not add any other test here!
+    // See CASSANDRA-10141
+
+    @Test
+    public void testJavascriptTupleTypeCollection() throws Throwable
+    {
+        String tupleTypeDef = "tuple<double, list<double>, set<text>, map<int, boolean>>";
+        createTable("CREATE TABLE %s (key int primary key, tup frozen<" + tupleTypeDef + ">)");
+
+        String fTup1 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
+                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
+                                      "RETURNS NULL ON NULL INPUT " +
+                                      "RETURNS tuple<double, list<double>, set<text>, map<int, boolean>> " +
+                                      "LANGUAGE javascript\n" +
+                                      "AS $$" +
+                                      "       tup;$$;");
+        String fTup2 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
+                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
+                                      "RETURNS NULL ON NULL INPUT " +
+                                      "RETURNS double " +
+                                      "LANGUAGE javascript\n" +
+                                      "AS $$" +
+                                      "       tup.getDouble(0);$$;");
+        String fTup3 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
+                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
+                                      "RETURNS NULL ON NULL INPUT " +
+                                      "RETURNS list<double> " +
+                                      "LANGUAGE javascript\n" +
+                                      "AS $$" +
+                                      "       tup.getList(1, java.lang.Double.class);$$;");
+        String fTup4 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
+                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
+                                      "RETURNS NULL ON NULL INPUT " +
+                                      "RETURNS set<text> " +
+                                      "LANGUAGE javascript\n" +
+                                      "AS $$" +
+                                      "       tup.getSet(2, java.lang.String.class);$$;");
+        String fTup5 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
+                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
+                                      "RETURNS NULL ON NULL INPUT " +
+                                      "RETURNS map<int, boolean> " +
+                                      "LANGUAGE javascript\n" +
+                                      "AS $$" +
+                                      "       tup.getMap(3, java.lang.Integer.class, java.lang.Boolean.class);$$;");
+
+        List<Double> list = Arrays.asList(1d, 2d, 3d);
+        Set<String> set = new TreeSet<>(Arrays.asList("one", "three", "two"));
+        Map<Integer, Boolean> map = new TreeMap<>();
+        map.put(1, true);
+        map.put(2, false);
+        map.put(3, true);
+
+        Object t = tuple(1d, list, set, map);
+
+        execute("INSERT INTO %s (key, tup) VALUES (1, ?)", t);
+
+        assertRows(execute("SELECT " + fTup1 + "(tup) FROM %s WHERE key = 1"),
+                   row(t));
+        assertRows(execute("SELECT " + fTup2 + "(tup) FROM %s WHERE key = 1"),
+                   row(1d));
+        assertRows(execute("SELECT " + fTup3 + "(tup) FROM %s WHERE key = 1"),
+                   row(list));
+        assertRows(execute("SELECT " + fTup4 + "(tup) FROM %s WHERE key = 1"),
+                   row(set));
+        assertRows(execute("SELECT " + fTup5 + "(tup) FROM %s WHERE key = 1"),
+                   row(map));
+
+        // same test - but via native protocol
+        // we use protocol V3 here to encode the expected version because the server
+        // always serializes Collections using V3 - see CollectionSerializer's
+        // serialize and deserialize methods.
+        TupleType tType = tupleTypeOf(Server.VERSION_3,
+                                      DataType.cdouble(),
+                                      DataType.list(DataType.cdouble()),
+                                      DataType.set(DataType.text()),
+                                      DataType.map(DataType.cint(),
+                                                   DataType.cboolean()));
+        TupleValue tup = tType.newValue(1d, list, set, map);
+        for (int version : PROTOCOL_VERSIONS)
+        {
+            assertRowsNet(version,
+                          executeNet(version, "SELECT " + fTup1 + "(tup) FROM %s WHERE key = 1"),
+                          row(tup));
+            assertRowsNet(version,
+                          executeNet(version, "SELECT " + fTup2 + "(tup) FROM %s WHERE key = 1"),
+                          row(1d));
+            assertRowsNet(version,
+                          executeNet(version, "SELECT " + fTup3 + "(tup) FROM %s WHERE key = 1"),
+                          row(list));
+            assertRowsNet(version,
+                          executeNet(version, "SELECT " + fTup4 + "(tup) FROM %s WHERE key = 1"),
+                          row(set));
+            assertRowsNet(version,
+                          executeNet(version, "SELECT " + fTup5 + "(tup) FROM %s WHERE key = 1"),
+                          row(map));
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFScriptTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFScriptTest.java
index d3050a5..9c931e8 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UFScriptTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFScriptTest.java

@@ -30,28 +30,21 @@
 import java.util.UUID;
 
 import org.junit.Assert;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import com.datastax.driver.core.DataType;
-import com.datastax.driver.core.TupleType;
-import com.datastax.driver.core.TupleValue;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.exceptions.FunctionExecutionException;
-import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.utils.UUIDGen;
 
 public class UFScriptTest extends CQLTester
 {
-    @BeforeClass
-    public static void setUp()
-    {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
-    }
+    // Just JavaScript UDFs to check how UDF - especially security/class-loading/sandboxing stuff -
+    // behaves, if no Java UDF has been executed before.
+
+    // Do not add any other test here - especially none using Java UDFs
 
     @Test
     public void testJavascriptSimpleCollections() throws Throwable
@@ -92,7 +85,7 @@
         assertRows(execute("SELECT " + fName1 + "(lst), " + fName2 + "(st), " + fName3 + "(mp) FROM %s WHERE key = 1"),
                    row(list, set, map));
 
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
             assertRowsNet(version,
                           executeNet(version, "SELECT " + fName1 + "(lst), " + fName2 + "(st), " + fName3 + "(mp) FROM %s WHERE key = 1"),
                           row(list, set, map));
@@ -122,96 +115,6 @@
     }
 
     @Test
-    public void testJavascriptTupleTypeCollection() throws Throwable
-    {
-        String tupleTypeDef = "tuple<double, list<double>, set<text>, map<int, boolean>>";
-        createTable("CREATE TABLE %s (key int primary key, tup frozen<" + tupleTypeDef + ">)");
-
-        String fTup1 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
-                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
-                                      "RETURNS NULL ON NULL INPUT " +
-                                      "RETURNS tuple<double, list<double>, set<text>, map<int, boolean>> " +
-                                      "LANGUAGE javascript\n" +
-                                      "AS $$" +
-                                      "       tup;$$;");
-        String fTup2 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
-                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
-                                      "RETURNS NULL ON NULL INPUT " +
-                                      "RETURNS double " +
-                                      "LANGUAGE javascript\n" +
-                                      "AS $$" +
-                                      "       tup.getDouble(0);$$;");
-        String fTup3 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
-                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
-                                      "RETURNS NULL ON NULL INPUT " +
-                                      "RETURNS list<double> " +
-                                      "LANGUAGE javascript\n" +
-                                      "AS $$" +
-                                      "       tup.getList(1, java.lang.Class.forName(\"java.lang.Double\"));$$;");
-        String fTup4 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
-                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
-                                      "RETURNS NULL ON NULL INPUT " +
-                                      "RETURNS set<text> " +
-                                      "LANGUAGE javascript\n" +
-                                      "AS $$" +
-                                      "       tup.getSet(2, java.lang.Class.forName(\"java.lang.String\"));$$;");
-        String fTup5 = createFunction(KEYSPACE_PER_TEST, tupleTypeDef,
-                                      "CREATE FUNCTION %s( tup " + tupleTypeDef + " ) " +
-                                      "RETURNS NULL ON NULL INPUT " +
-                                      "RETURNS map<int, boolean> " +
-                                      "LANGUAGE javascript\n" +
-                                      "AS $$" +
-                                      "       tup.getMap(3, java.lang.Class.forName(\"java.lang.Integer\"), java.lang.Class.forName(\"java.lang.Boolean\"));$$;");
-
-        List<Double> list = Arrays.asList(1d, 2d, 3d);
-        Set<String> set = new TreeSet<>(Arrays.asList("one", "three", "two"));
-        Map<Integer, Boolean> map = new TreeMap<>();
-        map.put(1, true);
-        map.put(2, false);
-        map.put(3, true);
-
-        Object t = tuple(1d, list, set, map);
-
-        execute("INSERT INTO %s (key, tup) VALUES (1, ?)", t);
-
-        assertRows(execute("SELECT " + fTup1 + "(tup) FROM %s WHERE key = 1"),
-                   row(t));
-        assertRows(execute("SELECT " + fTup2 + "(tup) FROM %s WHERE key = 1"),
-                   row(1d));
-        assertRows(execute("SELECT " + fTup3 + "(tup) FROM %s WHERE key = 1"),
-                   row(list));
-        assertRows(execute("SELECT " + fTup4 + "(tup) FROM %s WHERE key = 1"),
-                   row(set));
-        assertRows(execute("SELECT " + fTup5 + "(tup) FROM %s WHERE key = 1"),
-                   row(map));
-
-        // same test - but via native protocol
-        TupleType tType = TupleType.of(DataType.cdouble(),
-                                       DataType.list(DataType.cdouble()),
-                                       DataType.set(DataType.text()),
-                                       DataType.map(DataType.cint(), DataType.cboolean()));
-        TupleValue tup = tType.newValue(1d, list, set, map);
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
-        {
-            assertRowsNet(version,
-                          executeNet(version, "SELECT " + fTup1 + "(tup) FROM %s WHERE key = 1"),
-                          row(tup));
-            assertRowsNet(version,
-                          executeNet(version, "SELECT " + fTup2 + "(tup) FROM %s WHERE key = 1"),
-                          row(1d));
-            assertRowsNet(version,
-                          executeNet(version, "SELECT " + fTup3 + "(tup) FROM %s WHERE key = 1"),
-                          row(list));
-            assertRowsNet(version,
-                          executeNet(version, "SELECT " + fTup4 + "(tup) FROM %s WHERE key = 1"),
-                          row(set));
-            assertRowsNet(version,
-                          executeNet(version, "SELECT " + fTup5 + "(tup) FROM %s WHERE key = 1"),
-                          row(map));
-        }
-    }
-
-    @Test
     public void testJavascriptUserType() throws Throwable
     {
         String type = createType("CREATE TYPE %s (txt text, i int)");
@@ -221,7 +124,7 @@
         String fUdt1 = createFunction(KEYSPACE, type,
                                       "CREATE FUNCTION %s( udt " + type + " ) " +
                                       "RETURNS NULL ON NULL INPUT " +
-                                      "RETURNS " + type + " " +
+                                      "RETURNS " + type + ' ' +
                                       "LANGUAGE javascript\n" +
                                       "AS $$" +
                                       "     udt;$$;");
@@ -301,7 +204,7 @@
                    row("three", "one", "two"));
 
         // same test - but via native protocol
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
             assertRowsNet(version,
                           executeNet(version, cqlSelect),
                           row("three", "one", "two"));
@@ -324,7 +227,7 @@
 
         FunctionName fNameName = parseFunctionName(fName);
 
-        assertRows(execute("SELECT language, body FROM system.schema_functions WHERE keyspace_name=? AND function_name=?",
+        assertRows(execute("SELECT language, body FROM system_schema.functions WHERE keyspace_name=? AND function_name=?",
                            fNameName.keyspace, fNameName.name),
                    row("javascript", functionBody));
 
@@ -374,28 +277,6 @@
     }
 
     @Test
-    public void testJavascriptCompileFailure() throws Throwable
-    {
-        assertInvalidMessage("Failed to compile function 'cql_test_keyspace.scrinv'",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".scrinv(val double) " +
-                             "RETURNS NULL ON NULL INPUT " +
-                             "RETURNS double " +
-                             "LANGUAGE javascript\n" +
-                             "AS 'foo bar';");
-    }
-
-    @Test
-    public void testScriptInvalidLanguage() throws Throwable
-    {
-        assertInvalidMessage("Invalid language 'artificial_intelligence' for function 'cql_test_keyspace.scrinv'",
-                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".scrinv(val double) " +
-                             "RETURNS NULL ON NULL INPUT " +
-                             "RETURNS double " +
-                             "LANGUAGE artificial_intelligence\n" +
-                             "AS 'question for 42?';");
-    }
-
-    @Test
     public void testScriptReturnTypeCasting() throws Throwable
     {
         createTable("CREATE TABLE %s (key int primary key, val double)");
@@ -501,4 +382,46 @@
                        row(1, expected1, expected2));
         }
     }
+
+    @Test
+    public void testJavascriptDisabled() throws Throwable
+    {
+        createTable("CREATE TABLE %s (key int primary key, val double)");
+
+        DatabaseDescriptor.enableScriptedUserDefinedFunctions(false);
+        try
+        {
+            assertInvalid("CREATE OR REPLACE FUNCTION " + KEYSPACE + ".assertNotEnabled(val double) " +
+                          "RETURNS NULL ON NULL INPUT " +
+                          "RETURNS double " +
+                          "LANGUAGE javascript\n" +
+                          "AS 'Math.sin(val);';");
+        }
+        finally
+        {
+            DatabaseDescriptor.enableScriptedUserDefinedFunctions(true);
+        }
+    }
+
+    @Test
+    public void testJavascriptCompileFailure() throws Throwable
+    {
+        assertInvalidMessage("Failed to compile function 'cql_test_keyspace.scrinv'",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".scrinv(val double) " +
+                             "RETURNS NULL ON NULL INPUT " +
+                             "RETURNS double " +
+                             "LANGUAGE javascript\n" +
+                             "AS 'foo bar';");
+    }
+
+    @Test
+    public void testScriptInvalidLanguage() throws Throwable
+    {
+        assertInvalidMessage("Invalid language 'artificial_intelligence' for function 'cql_test_keyspace.scrinv'",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".scrinv(val double) " +
+                             "RETURNS NULL ON NULL INPUT " +
+                             "RETURNS double " +
+                             "LANGUAGE artificial_intelligence\n" +
+                             "AS 'question for 42?';");
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFSecurityTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFSecurityTest.java
new file mode 100644
index 0000000..4e45a8a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFSecurityTest.java

@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities;
+
+import java.security.AccessControlException;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.functions.UDHelper;
+import org.apache.cassandra.exceptions.FunctionExecutionException;
+import org.apache.cassandra.service.ClientWarn;
+
+public class UFSecurityTest extends CQLTester
+{
+    @Test
+    public void testSecurityPermissions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (key int primary key, dval double)");
+        execute("INSERT INTO %s (key, dval) VALUES (?, ?)", 1, 1d);
+
+        // Java UDFs
+
+        try
+        {
+            String fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                          "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                          "RETURNS NULL ON NULL INPUT " +
+                                          "RETURNS double " +
+                                          "LANGUAGE JAVA\n" +
+                                          "AS 'System.getProperty(\"foo.bar.baz\"); return 0d;';");
+            execute("SELECT " + fName + "(dval) FROM %s WHERE key=1");
+            Assert.fail();
+        }
+        catch (FunctionExecutionException e)
+        {
+            assertAccessControlException("System.getProperty(\"foo.bar.baz\"); return 0d;", e);
+        }
+
+        String[][] typesAndSources =
+        {
+        {"", "try { Class.forName(\"" + UDHelper.class.getName() + "\"); } catch (Exception e) { throw new RuntimeException(e); } return 0d;"},
+        {"sun.misc.Unsafe",         "sun.misc.Unsafe.getUnsafe(); return 0d;"},
+        {"",                        "try { Class.forName(\"sun.misc.Unsafe\"); } catch (Exception e) { throw new RuntimeException(e); } return 0d;"},
+        {"java.nio.file.FileSystems", "try {" +
+                                      "     java.nio.file.FileSystems.getDefault(); return 0d;" +
+                                      "} catch (Exception t) {" +
+                                      "     throw new RuntimeException(t);" +
+                                      '}'},
+        {"java.nio.channels.FileChannel", "try {" +
+                                          "     java.nio.channels.FileChannel.open(java.nio.file.FileSystems.getDefault().getPath(\"/etc/passwd\")).close(); return 0d;" +
+                                          "} catch (Exception t) {" +
+                                          "     throw new RuntimeException(t);" +
+                                          '}'},
+        {"java.nio.channels.SocketChannel", "try {" +
+                                            "     java.nio.channels.SocketChannel.open().close(); return 0d;" +
+                                            "} catch (Exception t) {" +
+                                            "     throw new RuntimeException(t);" +
+                                            '}'},
+        {"java.io.FileInputStream", "try {" +
+                                    "     new java.io.FileInputStream(\"./foobar\").close(); return 0d;" +
+                                    "} catch (Exception t) {" +
+                                    "     throw new RuntimeException(t);" +
+                                    '}'},
+        {"java.lang.Runtime",       "try {" +
+                                    "     java.lang.Runtime.getRuntime(); return 0d;" +
+                                    "} catch (Exception t) {" +
+                                    "     throw new RuntimeException(t);" +
+                                    '}'},
+        {"org.apache.cassandra.service.StorageService",
+         "try {" +
+         "     org.apache.cassandra.service.StorageService v = org.apache.cassandra.service.StorageService.instance; v.isShutdown(); return 0d;" +
+         "} catch (Exception t) {" +
+         "     throw new RuntimeException(t);" +
+         '}'},
+        {"java.net.ServerSocket",   "try {" +
+                                    "     new java.net.ServerSocket().bind(); return 0d;" +
+                                    "} catch (Exception t) {" +
+                                    "     throw new RuntimeException(t);" +
+                                    '}'},
+        {"java.io.FileOutputStream","try {" +
+                                    "     new java.io.FileOutputStream(\".foo\"); return 0d;" +
+                                    "} catch (Exception t) {" +
+                                    "     throw new RuntimeException(t);" +
+                                    '}'},
+        {"java.lang.Runtime",       "try {" +
+                                    "     java.lang.Runtime.getRuntime().exec(\"/tmp/foo\"); return 0d;" +
+                                    "} catch (Exception t) {" +
+                                    "     throw new RuntimeException(t);" +
+                                    '}'}
+        };
+
+        for (String[] typeAndSource : typesAndSources)
+        {
+            assertInvalidMessage(typeAndSource[0] + " cannot be resolved",
+                                 "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".invalid_class_access(val double) " +
+                                 "RETURNS NULL ON NULL INPUT " +
+                                 "RETURNS double " +
+                                 "LANGUAGE JAVA\n" +
+                                 "AS '" + typeAndSource[1] + "';");
+        }
+
+        // JavaScript UDFs
+
+        try
+        {
+            String fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                          "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                          "RETURNS NULL ON NULL INPUT " +
+                                          "RETURNS double " +
+                                          "LANGUAGE javascript\n" +
+                                          "AS 'org.apache.cassandra.service.StorageService.instance.isShutdown(); 0;';");
+            execute("SELECT " + fName + "(dval) FROM %s WHERE key=1");
+            Assert.fail("Javascript security check failed");
+        }
+        catch (FunctionExecutionException e)
+        {
+            assertAccessControlException("", e);
+        }
+
+        String[] javascript =
+        {
+        "java.lang.management.ManagmentFactory.getThreadMXBean(); 0;",
+        "new java.io.FileInputStream(\"/tmp/foo\"); 0;",
+        "new java.io.FileOutputStream(\"/tmp/foo\"); 0;",
+        "java.nio.file.FileSystems.getDefault().createFileExclusively(\"./foo_bar_baz\"); 0;",
+        "java.nio.channels.FileChannel.open(java.nio.file.FileSystems.getDefault().getPath(\"/etc/passwd\")); 0;",
+        "java.nio.channels.SocketChannel.open(); 0;",
+        "new java.net.ServerSocket().bind(null); 0;",
+        "var thread = new java.lang.Thread(); thread.start(); 0;",
+        "java.lang.System.getProperty(\"foo.bar.baz\"); 0;",
+        "java.lang.Runtime.getRuntime().exec(\"/tmp/foo\"); 0;",
+        "java.lang.Runtime.getRuntime().loadLibrary(\"foobar\"); 0;",
+        "java.lang.Runtime.getRuntime().loadLibrary(\"foobar\"); 0;",
+        // TODO these (ugly) calls are still possible - these can consume CPU (as one could do with an evil loop, too)
+//        "java.lang.Runtime.getRuntime().traceMethodCalls(true); 0;",
+//        "java.lang.Runtime.getRuntime().gc(); 0;",
+//        "java.lang.Runtime.getRuntime(); 0;",
+        };
+
+        for (String script : javascript)
+        {
+            try
+            {
+                String fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                              "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                              "RETURNS NULL ON NULL INPUT " +
+                                              "RETURNS double " +
+                                              "LANGUAGE javascript\n" +
+                                              "AS '" + script + "';");
+                execute("SELECT " + fName + "(dval) FROM %s WHERE key=1");
+                Assert.fail("Javascript security check failed: " + script);
+            }
+            catch (FunctionExecutionException e)
+            {
+                assertAccessControlException(script, e);
+            }
+        }
+
+        String script = "java.lang.Class.forName(\"java.lang.System\"); 0;";
+        String fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                      "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                      "RETURNS NULL ON NULL INPUT " +
+                                      "RETURNS double " +
+                                      "LANGUAGE javascript\n" +
+                                      "AS '" + script + "';");
+        assertInvalidThrowMessage("Java reflection not supported when class filter is present",
+                                  FunctionExecutionException.class,
+                                  "SELECT " + fName + "(dval) FROM %s WHERE key=1");
+    }
+
+    private static void assertAccessControlException(String script, FunctionExecutionException e)
+    {
+        for (Throwable t = e; t != null && t != t.getCause(); t = t.getCause())
+            if (t instanceof AccessControlException)
+                return;
+        Assert.fail("no AccessControlException for " + script + " (got " + e + ')');
+    }
+
+    @Test
+    public void testAmokUDF() throws Throwable
+    {
+        createTable("CREATE TABLE %s (key int primary key, dval double)");
+        execute("INSERT INTO %s (key, dval) VALUES (?, ?)", 1, 1d);
+
+        long udfWarnTimeout = DatabaseDescriptor.getUserDefinedFunctionWarnTimeout();
+        long udfFailTimeout = DatabaseDescriptor.getUserDefinedFunctionFailTimeout();
+        int maxTries = 5;
+        for (int i = 1; i <= maxTries; i++)
+        {
+            try
+            {
+                // short timeout
+                DatabaseDescriptor.setUserDefinedFunctionWarnTimeout(10);
+                DatabaseDescriptor.setUserDefinedFunctionFailTimeout(250);
+                // don't kill the unit test... - default policy is "die"
+                DatabaseDescriptor.setUserFunctionTimeoutPolicy(Config.UserFunctionTimeoutPolicy.ignore);
+
+                ClientWarn.instance.captureWarnings();
+                String fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                              "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                              "RETURNS NULL ON NULL INPUT " +
+                                              "RETURNS double " +
+                                              "LANGUAGE JAVA\n" +
+                                              "AS 'long t=System.currentTimeMillis()+110; while (t>System.currentTimeMillis()) { }; return 0d;'");
+                execute("SELECT " + fName + "(dval) FROM %s WHERE key=1");
+                List<String> warnings = ClientWarn.instance.getWarnings();
+                Assert.assertNotNull(warnings);
+                Assert.assertFalse(warnings.isEmpty());
+                ClientWarn.instance.resetWarnings();
+
+                // Java UDF
+
+                fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                       "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                       "RETURNS NULL ON NULL INPUT " +
+                                       "RETURNS double " +
+                                       "LANGUAGE JAVA\n" +
+                                       "AS 'long t=System.currentTimeMillis()+500; while (t>System.currentTimeMillis()) { }; return 0d;';");
+                assertInvalidMessage("ran longer than 250ms", "SELECT " + fName + "(dval) FROM %s WHERE key=1");
+
+                // Javascript UDF
+
+                fName = createFunction(KEYSPACE_PER_TEST, "double",
+                                       "CREATE OR REPLACE FUNCTION %s(val double) " +
+                                       "RETURNS NULL ON NULL INPUT " +
+                                       "RETURNS double " +
+                                       "LANGUAGE JAVASCRIPT\n" +
+                                       "AS 'var t=java.lang.System.currentTimeMillis()+500; while (t>java.lang.System.currentTimeMillis()) { }; 0;';");
+                assertInvalidMessage("ran longer than 250ms", "SELECT " + fName + "(dval) FROM %s WHERE key=1");
+
+                return;
+            }
+            catch (Error | RuntimeException e)
+            {
+                if (i == maxTries)
+                    throw e;
+            }
+            finally
+            {
+                // reset to defaults
+                DatabaseDescriptor.setUserDefinedFunctionWarnTimeout(udfWarnTimeout);
+                DatabaseDescriptor.setUserDefinedFunctionFailTimeout(udfFailTimeout);
+            }
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFTest.java
index d4d2a10..6e6af19 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UFTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFTest.java

@@ -17,53 +17,40 @@
  */
 package org.apache.cassandra.cql3.validation.entities;
 
+import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import java.util.UUID;
 
 import org.junit.Assert;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import com.datastax.driver.core.Row;
 import com.datastax.driver.core.exceptions.InvalidQueryException;
-import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.functions.FunctionName;
-import org.apache.cassandra.cql3.functions.Functions;
 import org.apache.cassandra.cql3.functions.UDFunction;
 import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.transport.Event;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.transport.messages.ResultMessage;
-import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class UFTest extends CQLTester
 {
-    @BeforeClass
-    public static void setUp()
-    {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
-    }
-
     @Test
     public void testNonExistingOnes() throws Throwable
     {
-        assertInvalidMessage("Cannot drop non existing function", "DROP FUNCTION " + KEYSPACE + ".func_does_not_exist");
-        assertInvalidMessage("Cannot drop non existing function", "DROP FUNCTION " + KEYSPACE + ".func_does_not_exist(int,text)");
-        assertInvalidMessage("Cannot drop non existing function", "DROP FUNCTION keyspace_does_not_exist.func_does_not_exist");
-        assertInvalidMessage("Cannot drop non existing function", "DROP FUNCTION keyspace_does_not_exist.func_does_not_exist(int,text)");
+        assertInvalidThrowMessage("Cannot drop non existing function", InvalidRequestException.class, "DROP FUNCTION " + KEYSPACE + ".func_does_not_exist");
+        assertInvalidThrowMessage("Cannot drop non existing function", InvalidRequestException.class, "DROP FUNCTION " + KEYSPACE + ".func_does_not_exist(int,text)");
+        assertInvalidThrowMessage("Cannot drop non existing function", InvalidRequestException.class, "DROP FUNCTION keyspace_does_not_exist.func_does_not_exist");
+        assertInvalidThrowMessage("Cannot drop non existing function", InvalidRequestException.class, "DROP FUNCTION keyspace_does_not_exist.func_does_not_exist(int,text)");
 
         execute("DROP FUNCTION IF EXISTS " + KEYSPACE + ".func_does_not_exist");
         execute("DROP FUNCTION IF EXISTS " + KEYSPACE + ".func_does_not_exist(int,text)");
@@ -127,16 +114,16 @@
 
         FunctionName fSinName = parseFunctionName(fSin);
 
-        Assert.assertEquals(1, Functions.find(parseFunctionName(fSin)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(fSin)).size());
 
-        assertRows(execute("SELECT function_name, language FROM system.schema_functions WHERE keyspace_name=?", KEYSPACE_PER_TEST),
+        assertRows(execute("SELECT function_name, language FROM system_schema.functions WHERE keyspace_name=?", KEYSPACE_PER_TEST),
                    row(fSinName.name, "java"));
 
         dropPerTestKeyspace();
 
-        assertRows(execute("SELECT function_name, language FROM system.schema_functions WHERE keyspace_name=?", KEYSPACE_PER_TEST));
+        assertRows(execute("SELECT function_name, language FROM system_schema.functions WHERE keyspace_name=?", KEYSPACE_PER_TEST));
 
-        Assert.assertEquals(0, Functions.find(fSinName).size());
+        Assert.assertEquals(0, Schema.instance.getFunctions(fSinName).size());
     }
 
     @Test
@@ -153,7 +140,7 @@
 
         FunctionName fSinName = parseFunctionName(fSin);
 
-        Assert.assertEquals(1, Functions.find(parseFunctionName(fSin)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(fSin)).size());
 
         // create a pairs of Select and Inserts. One statement in each pair uses the function so when we
         // drop it those statements should be removed from the cache in QueryProcessor. The other statements
@@ -191,7 +178,7 @@
                 "LANGUAGE java " +
                 "AS 'return Double.valueOf(Math.sin(input));'");
 
-        Assert.assertEquals(1, Functions.find(fSinName).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(fSinName).size());
 
         preparedSelect1= QueryProcessor.prepare(
                                          String.format("SELECT key, %s(d) FROM %s.%s", fSin, KEYSPACE, currentTable()),
@@ -306,7 +293,7 @@
                                         "RETURNS double " +
                                         "LANGUAGE javascript " +
                                         "AS 'input'");
-        Assert.assertEquals(1, Functions.find(parseFunctionName(function)).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(parseFunctionName(function)).size());
 
         List<ResultMessage.Prepared> prepared = new ArrayList<>();
         // prepare statements which use the function to provide a DelayedValue
@@ -679,7 +666,7 @@
     @Test
     public void testFunctionNonExistingKeyspace() throws Throwable
     {
-        assertInvalidMessage("to non existing keyspace",
+        assertInvalidMessage("Keyspace this_ks_does_not_exist doesn't exist",
                              "CREATE OR REPLACE FUNCTION this_ks_does_not_exist.jnft(val double) " +
                              "RETURNS NULL ON NULL INPUT " +
                              "RETURNS double " +
@@ -692,7 +679,7 @@
     {
         dropPerTestKeyspace();
 
-        assertInvalidMessage("to non existing keyspace",
+        assertInvalidMessage("Keyspace " + KEYSPACE_PER_TEST + " doesn't exist",
                              "CREATE OR REPLACE FUNCTION " + KEYSPACE_PER_TEST + ".jnft(val double) " +
                              "RETURNS NULL ON NULL INPUT " +
                              "RETURNS double " +
@@ -724,115 +711,6 @@
     }
 
     @Test
-    public void testComplexNullValues() throws Throwable
-    {
-        String type = KEYSPACE + '.' + createType("CREATE TYPE %s (txt text, i int)");
-
-        createTable("CREATE TABLE %s (key int primary key, lst list<double>, st set<text>, mp map<int, boolean>," +
-                    "tup frozen<tuple<double, text, int, boolean>>, udt frozen<" + type + ">)");
-
-        String fList = createFunction(KEYSPACE, "list<double>",
-                                      "CREATE FUNCTION %s( coll list<double> ) " +
-                                      "CALLED ON NULL INPUT " +
-                                      "RETURNS list<double> " +
-                                      "LANGUAGE java\n" +
-                                      "AS $$return coll;$$;");
-        String fSet = createFunction(KEYSPACE, "set<text>",
-                                     "CREATE FUNCTION %s( coll set<text> ) " +
-                                     "CALLED ON NULL INPUT " +
-                                     "RETURNS set<text> " +
-                                     "LANGUAGE java\n" +
-                                     "AS $$return coll;$$;");
-        String fMap = createFunction(KEYSPACE, "map<int, boolean>",
-                                     "CREATE FUNCTION %s( coll map<int, boolean> ) " +
-                                     "CALLED ON NULL INPUT " +
-                                     "RETURNS map<int, boolean> " +
-                                     "LANGUAGE java\n" +
-                                     "AS $$return coll;$$;");
-        String fTup = createFunction(KEYSPACE, "tuple<double, text, int, boolean>",
-                                     "CREATE FUNCTION %s( val tuple<double, text, int, boolean> ) " +
-                                     "CALLED ON NULL INPUT " +
-                                     "RETURNS tuple<double, text, int, boolean> " +
-                                     "LANGUAGE java\n" +
-                                     "AS $$return val;$$;");
-        String fUdt = createFunction(KEYSPACE, type,
-                                     "CREATE FUNCTION %s( val " + type + " ) " +
-                                     "CALLED ON NULL INPUT " +
-                                     "RETURNS " + type + " " +
-                                     "LANGUAGE java\n" +
-                                     "AS $$return val;$$;");
-        List<Double> list = Arrays.asList(1d, 2d, 3d);
-        Set<String> set = new TreeSet<>(Arrays.asList("one", "three", "two"));
-        Map<Integer, Boolean> map = new TreeMap<>();
-        map.put(1, true);
-        map.put(2, false);
-        map.put(3, true);
-        Object t = tuple(1d, "one", 42, false);
-
-        execute("INSERT INTO %s (key, lst, st, mp, tup, udt) VALUES (1, ?, ?, ?, ?, {txt: 'one', i:1})", list, set, map, t);
-        execute("INSERT INTO %s (key, lst, st, mp, tup, udt) VALUES (2, ?, ?, ?, ?, null)", null, null, null, null);
-
-        execute("SELECT " +
-                fList + "(lst), " +
-                fSet + "(st), " +
-                fMap + "(mp), " +
-                fTup + "(tup), " +
-                fUdt + "(udt) FROM %s WHERE key = 1");
-        UntypedResultSet.Row row = execute("SELECT " +
-                                           fList + "(lst) as l, " +
-                                           fSet + "(st) as s, " +
-                                           fMap + "(mp) as m, " +
-                                           fTup + "(tup) as t, " +
-                                           fUdt + "(udt) as u " +
-                                           "FROM %s WHERE key = 1").one();
-        Assert.assertNotNull(row.getBytes("l"));
-        Assert.assertNotNull(row.getBytes("s"));
-        Assert.assertNotNull(row.getBytes("m"));
-        Assert.assertNotNull(row.getBytes("t"));
-        Assert.assertNotNull(row.getBytes("u"));
-        row = execute("SELECT " +
-                      fList + "(lst) as l, " +
-                      fSet + "(st) as s, " +
-                      fMap + "(mp) as m, " +
-                      fTup + "(tup) as t, " +
-                      fUdt + "(udt) as u " +
-                      "FROM %s WHERE key = 2").one();
-        Assert.assertNull(row.getBytes("l"));
-        Assert.assertNull(row.getBytes("s"));
-        Assert.assertNull(row.getBytes("m"));
-        Assert.assertNull(row.getBytes("t"));
-        Assert.assertNull(row.getBytes("u"));
-
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
-        {
-            Row r = executeNet(version, "SELECT " +
-                                        fList + "(lst) as l, " +
-                                        fSet + "(st) as s, " +
-                                        fMap + "(mp) as m, " +
-                                        fTup + "(tup) as t, " +
-                                        fUdt + "(udt) as u " +
-                                        "FROM %s WHERE key = 1").one();
-            Assert.assertNotNull(r.getBytesUnsafe("l"));
-            Assert.assertNotNull(r.getBytesUnsafe("s"));
-            Assert.assertNotNull(r.getBytesUnsafe("m"));
-            Assert.assertNotNull(r.getBytesUnsafe("t"));
-            Assert.assertNotNull(r.getBytesUnsafe("u"));
-            r = executeNet(version, "SELECT " +
-                                    fList + "(lst) as l, " +
-                                    fSet + "(st) as s, " +
-                                    fMap + "(mp) as m, " +
-                                    fTup + "(tup) as t, " +
-                                    fUdt + "(udt) as u " +
-                                    "FROM %s WHERE key = 2").one();
-            Assert.assertNull(r.getBytesUnsafe("l"));
-            Assert.assertNull(r.getBytesUnsafe("s"));
-            Assert.assertNull(r.getBytesUnsafe("m"));
-            Assert.assertNull(r.getBytesUnsafe("t"));
-            Assert.assertNull(r.getBytesUnsafe("u"));
-        }
-    }
-
-    @Test
     public void testUserTypeDrop() throws Throwable
     {
         String type = KEYSPACE + '.' + createType("CREATE TYPE %s (txt text, i int)");
@@ -849,7 +727,7 @@
 
         FunctionName fNameName = parseFunctionName(fName);
 
-        Assert.assertEquals(1, Functions.find(fNameName).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(fNameName).size());
 
         ResultMessage.Prepared prepared = QueryProcessor.prepare(String.format("SELECT key, %s(udt) FROM %s.%s", fName, KEYSPACE, currentTable()),
                                                                  ClientState.forInternalCalls(), false);
@@ -866,7 +744,7 @@
         Assert.assertNull(QueryProcessor.instance.getPrepared(prepared.statementId));
 
         // function stays
-        Assert.assertEquals(1, Functions.find(fNameName).size());
+        Assert.assertEquals(1, Schema.instance.getFunctions(fNameName).size());
     }
 
     @Test
@@ -880,139 +758,6 @@
                              "AS '\"foo bar\";';");
     }
 
-    private static class TypesTestDef
-    {
-        final String udfType;
-        final String tableType;
-        final String columnName;
-        final Object referenceValue;
-
-        String fCheckArgAndReturn;
-
-        String fCalledOnNull;
-        String fReturnsNullOnNull;
-
-        TypesTestDef(String udfType, String tableType, String columnName, Object referenceValue)
-        {
-            this.udfType = udfType;
-            this.tableType = tableType;
-            this.columnName = columnName;
-            this.referenceValue = referenceValue;
-        }
-    }
-
-    @Test
-    public void testTypesWithAndWithoutNulls() throws Throwable
-    {
-        // test various combinations of types against UDFs with CALLED ON NULL or RETURNS NULL ON NULL
-
-        String type = createType("CREATE TYPE %s (txt text, i int)");
-
-        TypesTestDef[] typeDefs =
-        {
-        //                udf type,            table type,                 column, reference value
-        new TypesTestDef("timestamp", "timestamp", "ts", new Date()),
-        new TypesTestDef("date", "date", "dt", 12345),
-        new TypesTestDef("time", "time", "tim", 12345L),
-        new TypesTestDef("uuid", "uuid", "uu", UUID.randomUUID()),
-        new TypesTestDef("timeuuid", "timeuuid", "tu", UUIDGen.getTimeUUID()),
-        new TypesTestDef("tinyint", "tinyint", "ti", (byte) 42),
-        new TypesTestDef("smallint", "smallint", "si", (short) 43),
-        new TypesTestDef("int", "int", "i", 44),
-        new TypesTestDef("bigint", "bigint", "b", 45L),
-        new TypesTestDef("float", "float", "f", 46f),
-        new TypesTestDef("double", "double", "d", 47d),
-        new TypesTestDef("boolean", "boolean", "x", true),
-        new TypesTestDef("ascii", "ascii", "a", "tqbfjutld"),
-        new TypesTestDef("text", "text", "t", "k\u00f6lsche jung"),
-        //new TypesTestDef(type,                 "frozen<" + type + '>',     "u",    null),
-        new TypesTestDef("tuple<int, text>", "frozen<tuple<int, text>>", "tup", tuple(1, "foo"))
-        };
-
-        String createTableDDL = "CREATE TABLE %s (key int PRIMARY KEY";
-        String insertDML = "INSERT INTO %s (key";
-        List<Object> values = new ArrayList<>();
-        for (TypesTestDef typeDef : typeDefs)
-        {
-            createTableDDL += ", " + typeDef.columnName + ' ' + typeDef.tableType;
-            insertDML += ", " + typeDef.columnName;
-            String typeName = typeDef.udfType;
-            typeDef.fCheckArgAndReturn = createFunction(KEYSPACE,
-                                                        typeName,
-                                                        "CREATE OR REPLACE FUNCTION %s(val " + typeName + ") " +
-                                                        "CALLED ON NULL INPUT " +
-                                                        "RETURNS " + typeName + ' ' +
-                                                        "LANGUAGE java\n" +
-                                                        "AS 'return val;';");
-            typeDef.fCalledOnNull = createFunction(KEYSPACE,
-                                                   typeName,
-                                                   "CREATE OR REPLACE FUNCTION %s(val " + typeName + ") " +
-                                                   "CALLED ON NULL INPUT " +
-                                                   "RETURNS text " +
-                                                   "LANGUAGE java\n" +
-                                                   "AS 'return \"called\";';");
-            typeDef.fReturnsNullOnNull = createFunction(KEYSPACE,
-                                                        typeName,
-                                                        "CREATE OR REPLACE FUNCTION %s(val " + typeName + ") " +
-                                                        "RETURNS NULL ON NULL INPUT " +
-                                                        "RETURNS text " +
-                                                        "LANGUAGE java\n" +
-                                                        "AS 'return \"called\";';");
-            values.add(typeDef.referenceValue);
-        }
-
-        createTableDDL += ')';
-        createTable(createTableDDL);
-
-        insertDML += ") VALUES (1";
-        for (TypesTestDef ignored : typeDefs)
-            insertDML += ", ?";
-        insertDML += ')';
-
-        execute(insertDML, values.toArray());
-
-        // second row with null values
-        for (int i = 0; i < values.size(); i++)
-            values.set(i, null);
-        execute(insertDML.replace('1', '2'), values.toArray());
-
-        // check argument input + return
-        for (TypesTestDef typeDef : typeDefs)
-        {
-            assertRows(execute("SELECT " + typeDef.fCheckArgAndReturn + '(' + typeDef.columnName + ") FROM %s WHERE key = 1"),
-                       row(new Object[]{ typeDef.referenceValue }));
-        }
-
-        // check for CALLED ON NULL INPUT with non-null arguments
-        for (TypesTestDef typeDef : typeDefs)
-        {
-            assertRows(execute("SELECT " + typeDef.fCalledOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 1"),
-                       row(new Object[]{ "called" }));
-        }
-
-        // check for CALLED ON NULL INPUT with null arguments
-        for (TypesTestDef typeDef : typeDefs)
-        {
-            assertRows(execute("SELECT " + typeDef.fCalledOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 2"),
-                       row(new Object[]{ "called" }));
-        }
-
-        // check for RETURNS NULL ON NULL INPUT with non-null arguments
-        for (TypesTestDef typeDef : typeDefs)
-        {
-            assertRows(execute("SELECT " + typeDef.fReturnsNullOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 1"),
-                       row(new Object[]{ "called" }));
-        }
-
-        // check for RETURNS NULL ON NULL INPUT with null arguments
-        for (TypesTestDef typeDef : typeDefs)
-        {
-            assertRows(execute("SELECT " + typeDef.fReturnsNullOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 2"),
-                       row(new Object[]{ null }));
-        }
-
-    }
-
     @Test
     public void testReplaceAllowNulls() throws Throwable
     {
@@ -1067,10 +812,18 @@
                                       "LANGUAGE JAVA\n" +
                                       "AS 'throw new RuntimeException();';");
 
-        UDFunction f = (UDFunction) Functions.find(parseFunctionName(fName)).get(0);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(KEYSPACE_PER_TEST);
+        UDFunction f = (UDFunction) ksm.functions.get(parseFunctionName(fName)).iterator().next();
 
-        Functions.addOrReplaceFunction(UDFunction.createBrokenFunction(f.name(), f.argNames(), f.argTypes(), f.returnType(), true,
-                                                                       "java", f.body(), new InvalidRequestException("foo bar is broken")));
+        UDFunction broken = UDFunction.createBrokenFunction(f.name(),
+                                                            f.argNames(),
+                                                            f.argTypes(),
+                                                            f.returnType(),
+                                                            true,
+                                                            "java",
+                                                            f.body(),
+                                                            new InvalidRequestException("foo bar is broken"));
+        Schema.instance.setKeyspaceMetadata(ksm.withSwapped(ksm.functions.without(f.name(), f.argTypes()).with(broken)));
 
         assertInvalidThrowMessage("foo bar is broken", InvalidRequestException.class,
                                   "SELECT key, " + fName + "(dval) FROM %s");
@@ -1089,7 +842,7 @@
                                       "LANGUAGE JAVA\n" +
                                       "AS 'throw new RuntimeException();'");
 
-        for (int version = Server.VERSION_2; version <= maxProtocolVersion; version++)
+        for (int version : PROTOCOL_VERSIONS)
         {
             try
             {
@@ -1108,4 +861,136 @@
             }
         }
     }
+
+    @Test
+    public void testEmptyString() throws Throwable
+    {
+        createTable("CREATE TABLE %s (key int primary key, sval text, aval ascii, bval blob, empty_int int)");
+        execute("INSERT INTO %s (key, sval, aval, bval, empty_int) VALUES (?, ?, ?, ?, blobAsInt(0x))", 1, "", "", ByteBuffer.allocate(0));
+
+        String fNameSRC = createFunction(KEYSPACE_PER_TEST, "text",
+                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
+                                         "CALLED ON NULL INPUT " +
+                                         "RETURNS text " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return val;'");
+
+        String fNameSCC = createFunction(KEYSPACE_PER_TEST, "text",
+                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
+                                         "CALLED ON NULL INPUT " +
+                                         "RETURNS text " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return \"\";'");
+
+        String fNameSRN = createFunction(KEYSPACE_PER_TEST, "text",
+                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
+                                         "RETURNS NULL ON NULL INPUT " +
+                                         "RETURNS text " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return val;'");
+
+        String fNameSCN = createFunction(KEYSPACE_PER_TEST, "text",
+                                         "CREATE OR REPLACE FUNCTION %s(val text) " +
+                                         "RETURNS NULL ON NULL INPUT " +
+                                         "RETURNS text " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return \"\";'");
+
+        String fNameBRC = createFunction(KEYSPACE_PER_TEST, "blob",
+                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
+                                         "CALLED ON NULL INPUT " +
+                                         "RETURNS blob " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return val;'");
+
+        String fNameBCC = createFunction(KEYSPACE_PER_TEST, "blob",
+                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
+                                         "CALLED ON NULL INPUT " +
+                                         "RETURNS blob " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return ByteBuffer.allocate(0);'");
+
+        String fNameBRN = createFunction(KEYSPACE_PER_TEST, "blob",
+                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
+                                         "RETURNS NULL ON NULL INPUT " +
+                                         "RETURNS blob " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return val;'");
+
+        String fNameBCN = createFunction(KEYSPACE_PER_TEST, "blob",
+                                         "CREATE OR REPLACE FUNCTION %s(val blob) " +
+                                         "RETURNS NULL ON NULL INPUT " +
+                                         "RETURNS blob " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return ByteBuffer.allocate(0);'");
+
+        String fNameIRC = createFunction(KEYSPACE_PER_TEST, "int",
+                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
+                                         "CALLED ON NULL INPUT " +
+                                         "RETURNS int " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return val;'");
+
+        String fNameICC = createFunction(KEYSPACE_PER_TEST, "int",
+                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
+                                         "CALLED ON NULL INPUT " +
+                                         "RETURNS int " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return 0;'");
+
+        String fNameIRN = createFunction(KEYSPACE_PER_TEST, "int",
+                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
+                                         "RETURNS NULL ON NULL INPUT " +
+                                         "RETURNS int " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return val;'");
+
+        String fNameICN = createFunction(KEYSPACE_PER_TEST, "int",
+                                         "CREATE OR REPLACE FUNCTION %s(val int) " +
+                                         "RETURNS NULL ON NULL INPUT " +
+                                         "RETURNS int " +
+                                         "LANGUAGE JAVA\n" +
+                                         "AS 'return 0;'");
+
+        assertRows(execute("SELECT " + fNameSRC + "(sval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSRN + "(sval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSCC + "(sval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSCN + "(sval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSRC + "(aval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSRN + "(aval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSCC + "(aval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameSCN + "(aval) FROM %s"), row(""));
+        assertRows(execute("SELECT " + fNameBRC + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertRows(execute("SELECT " + fNameBRN + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertRows(execute("SELECT " + fNameBCC + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertRows(execute("SELECT " + fNameBCN + "(bval) FROM %s"), row(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertRows(execute("SELECT " + fNameIRC + "(empty_int) FROM %s"), row(new Object[]{ null }));
+        assertRows(execute("SELECT " + fNameIRN + "(empty_int) FROM %s"), row(new Object[]{ null }));
+        assertRows(execute("SELECT " + fNameICC + "(empty_int) FROM %s"), row(0));
+        assertRows(execute("SELECT " + fNameICN + "(empty_int) FROM %s"), row(new Object[]{ null }));
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void testEmptyFunctionName() throws Throwable
+    {
+        execute("CREATE FUNCTION IF NOT EXISTS " + KEYSPACE + ".\"\" (arg int)\n" +
+                "  RETURNS NULL ON NULL INPUT\n" +
+                "  RETURNS int\n" +
+                "  LANGUAGE java\n" +
+                "  AS $$\n" +
+                "    return a;\n" +
+                "  $$");
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void testEmptyArgName() throws Throwable
+    {
+        execute("CREATE FUNCTION IF NOT EXISTS " + KEYSPACE + ".myfn (\"\" int)\n" +
+                "  RETURNS NULL ON NULL INPUT\n" +
+                "  RETURNS int\n" +
+                "  LANGUAGE java\n" +
+                "  AS $$\n" +
+                "    return a;\n" +
+                "  $$");
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFTypesTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFTypesTest.java
new file mode 100644
index 0000000..de98748
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFTypesTest.java

@@ -0,0 +1,549 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.UUID;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.datastax.driver.core.Row;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.utils.UUIDGen;
+
+public class UFTypesTest extends CQLTester
+{
+
+    @Test
+    public void testComplexNullValues() throws Throwable
+    {
+        String type = KEYSPACE + '.' + createType("CREATE TYPE %s (txt text, i int)");
+
+        createTable("CREATE TABLE %s (key int primary key, lst list<double>, st set<text>, mp map<int, boolean>," +
+                    "tup frozen<tuple<double, text, int, boolean>>, udt frozen<" + type + ">)");
+
+        String fList = createFunction(KEYSPACE, "list<double>",
+                                      "CREATE FUNCTION %s( coll list<double> ) " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS list<double> " +
+                                      "LANGUAGE java\n" +
+                                      "AS $$return coll;$$;");
+        String fSet = createFunction(KEYSPACE, "set<text>",
+                                     "CREATE FUNCTION %s( coll set<text> ) " +
+                                     "CALLED ON NULL INPUT " +
+                                     "RETURNS set<text> " +
+                                     "LANGUAGE java\n" +
+                                     "AS $$return coll;$$;");
+        String fMap = createFunction(KEYSPACE, "map<int, boolean>",
+                                     "CREATE FUNCTION %s( coll map<int, boolean> ) " +
+                                     "CALLED ON NULL INPUT " +
+                                     "RETURNS map<int, boolean> " +
+                                     "LANGUAGE java\n" +
+                                     "AS $$return coll;$$;");
+        String fTup = createFunction(KEYSPACE, "tuple<double, text, int, boolean>",
+                                     "CREATE FUNCTION %s( val tuple<double, text, int, boolean> ) " +
+                                     "CALLED ON NULL INPUT " +
+                                     "RETURNS tuple<double, text, int, boolean> " +
+                                     "LANGUAGE java\n" +
+                                     "AS $$return val;$$;");
+        String fUdt = createFunction(KEYSPACE, type,
+                                     "CREATE FUNCTION %s( val " + type + " ) " +
+                                     "CALLED ON NULL INPUT " +
+                                     "RETURNS " + type + " " +
+                                     "LANGUAGE java\n" +
+                                     "AS $$return val;$$;");
+        List<Double> list = Arrays.asList(1d, 2d, 3d);
+        Set<String> set = new TreeSet<>(Arrays.asList("one", "three", "two"));
+        Map<Integer, Boolean> map = new TreeMap<>();
+        map.put(1, true);
+        map.put(2, false);
+        map.put(3, true);
+        Object t = tuple(1d, "one", 42, false);
+
+        execute("INSERT INTO %s (key, lst, st, mp, tup, udt) VALUES (1, ?, ?, ?, ?, {txt: 'one', i:1})", list, set, map, t);
+        execute("INSERT INTO %s (key, lst, st, mp, tup, udt) VALUES (2, ?, ?, ?, ?, null)", null, null, null, null);
+
+        execute("SELECT " +
+                fList + "(lst), " +
+                fSet + "(st), " +
+                fMap + "(mp), " +
+                fTup + "(tup), " +
+                fUdt + "(udt) FROM %s WHERE key = 1");
+        UntypedResultSet.Row row = execute("SELECT " +
+                                           fList + "(lst) as l, " +
+                                           fSet + "(st) as s, " +
+                                           fMap + "(mp) as m, " +
+                                           fTup + "(tup) as t, " +
+                                           fUdt + "(udt) as u " +
+                                           "FROM %s WHERE key = 1").one();
+        Assert.assertNotNull(row.getBytes("l"));
+        Assert.assertNotNull(row.getBytes("s"));
+        Assert.assertNotNull(row.getBytes("m"));
+        Assert.assertNotNull(row.getBytes("t"));
+        Assert.assertNotNull(row.getBytes("u"));
+        row = execute("SELECT " +
+                      fList + "(lst) as l, " +
+                      fSet + "(st) as s, " +
+                      fMap + "(mp) as m, " +
+                      fTup + "(tup) as t, " +
+                      fUdt + "(udt) as u " +
+                      "FROM %s WHERE key = 2").one();
+        Assert.assertNull(row.getBytes("l"));
+        Assert.assertNull(row.getBytes("s"));
+        Assert.assertNull(row.getBytes("m"));
+        Assert.assertNull(row.getBytes("t"));
+        Assert.assertNull(row.getBytes("u"));
+
+        for (int version : PROTOCOL_VERSIONS)
+        {
+            Row r = executeNet(version, "SELECT " +
+                                        fList + "(lst) as l, " +
+                                        fSet + "(st) as s, " +
+                                        fMap + "(mp) as m, " +
+                                        fTup + "(tup) as t, " +
+                                        fUdt + "(udt) as u " +
+                                        "FROM %s WHERE key = 1").one();
+            Assert.assertNotNull(r.getBytesUnsafe("l"));
+            Assert.assertNotNull(r.getBytesUnsafe("s"));
+            Assert.assertNotNull(r.getBytesUnsafe("m"));
+            Assert.assertNotNull(r.getBytesUnsafe("t"));
+            Assert.assertNotNull(r.getBytesUnsafe("u"));
+            r = executeNet(version, "SELECT " +
+                                    fList + "(lst) as l, " +
+                                    fSet + "(st) as s, " +
+                                    fMap + "(mp) as m, " +
+                                    fTup + "(tup) as t, " +
+                                    fUdt + "(udt) as u " +
+                                    "FROM %s WHERE key = 2").one();
+            Assert.assertNull(r.getBytesUnsafe("l"));
+            Assert.assertNull(r.getBytesUnsafe("s"));
+            Assert.assertNull(r.getBytesUnsafe("m"));
+            Assert.assertNull(r.getBytesUnsafe("t"));
+            Assert.assertNull(r.getBytesUnsafe("u"));
+        }
+    }
+
+    private static class TypesTestDef
+    {
+        final String udfType;
+        final String tableType;
+        final String columnName;
+        final Object referenceValue;
+
+        String fCheckArgAndReturn;
+
+        String fCalledOnNull;
+        String fReturnsNullOnNull;
+
+        TypesTestDef(String udfType, String tableType, String columnName, Object referenceValue)
+        {
+            this.udfType = udfType;
+            this.tableType = tableType;
+            this.columnName = columnName;
+            this.referenceValue = referenceValue;
+        }
+    }
+
+    @Test
+    public void testTypesWithAndWithoutNulls() throws Throwable
+    {
+        // test various combinations of types against UDFs with CALLED ON NULL or RETURNS NULL ON NULL
+
+        String type = createType("CREATE TYPE %s (txt text, i int)");
+
+        TypesTestDef[] typeDefs =
+        {
+        //                udf type,            table type,                 column, reference value
+        new TypesTestDef("timestamp", "timestamp", "ts", new Date()),
+        new TypesTestDef("date", "date", "dt", 12345),
+        new TypesTestDef("time", "time", "tim", 12345L),
+        new TypesTestDef("uuid", "uuid", "uu", UUID.randomUUID()),
+        new TypesTestDef("timeuuid", "timeuuid", "tu", UUIDGen.getTimeUUID()),
+        new TypesTestDef("tinyint", "tinyint", "ti", (byte) 42),
+        new TypesTestDef("smallint", "smallint", "si", (short) 43),
+        new TypesTestDef("int", "int", "i", 44),
+        new TypesTestDef("bigint", "bigint", "b", 45L),
+        new TypesTestDef("float", "float", "f", 46f),
+        new TypesTestDef("double", "double", "d", 47d),
+        new TypesTestDef("boolean", "boolean", "x", true),
+        new TypesTestDef("ascii", "ascii", "a", "tqbfjutld"),
+        new TypesTestDef("text", "text", "t", "k\u00f6lsche jung"),
+        //new TypesTestDef(type,                 "frozen<" + type + '>',     "u",    null),
+        new TypesTestDef("tuple<int, text>", "frozen<tuple<int, text>>", "tup", tuple(1, "foo"))
+        };
+
+        String createTableDDL = "CREATE TABLE %s (key int PRIMARY KEY";
+        String insertDML = "INSERT INTO %s (key";
+        List<Object> values = new ArrayList<>();
+        for (TypesTestDef typeDef : typeDefs)
+        {
+            createTableDDL += ", " + typeDef.columnName + ' ' + typeDef.tableType;
+            insertDML += ", " + typeDef.columnName;
+            String typeName = typeDef.udfType;
+            typeDef.fCheckArgAndReturn = createFunction(KEYSPACE,
+                                                        typeName,
+                                                        "CREATE OR REPLACE FUNCTION %s(val " + typeName + ") " +
+                                                        "CALLED ON NULL INPUT " +
+                                                        "RETURNS " + typeName + ' ' +
+                                                        "LANGUAGE java\n" +
+                                                        "AS 'return val;';");
+            typeDef.fCalledOnNull = createFunction(KEYSPACE,
+                                                   typeName,
+                                                   "CREATE OR REPLACE FUNCTION %s(val " + typeName + ") " +
+                                                   "CALLED ON NULL INPUT " +
+                                                   "RETURNS text " +
+                                                   "LANGUAGE java\n" +
+                                                   "AS 'return \"called\";';");
+            typeDef.fReturnsNullOnNull = createFunction(KEYSPACE,
+                                                        typeName,
+                                                        "CREATE OR REPLACE FUNCTION %s(val " + typeName + ") " +
+                                                        "RETURNS NULL ON NULL INPUT " +
+                                                        "RETURNS text " +
+                                                        "LANGUAGE java\n" +
+                                                        "AS 'return \"called\";';");
+            values.add(typeDef.referenceValue);
+        }
+
+        createTableDDL += ')';
+        createTable(createTableDDL);
+
+        insertDML += ") VALUES (1";
+        for (TypesTestDef ignored : typeDefs)
+            insertDML += ", ?";
+        insertDML += ')';
+
+        execute(insertDML, values.toArray());
+
+        // second row with null values
+        for (int i = 0; i < values.size(); i++)
+            values.set(i, null);
+        execute(insertDML.replace('1', '2'), values.toArray());
+
+        // check argument input + return
+        for (TypesTestDef typeDef : typeDefs)
+        {
+            assertRows(execute("SELECT " + typeDef.fCheckArgAndReturn + '(' + typeDef.columnName + ") FROM %s WHERE key = 1"),
+                       row(new Object[]{ typeDef.referenceValue }));
+        }
+
+        // check for CALLED ON NULL INPUT with non-null arguments
+        for (TypesTestDef typeDef : typeDefs)
+        {
+            assertRows(execute("SELECT " + typeDef.fCalledOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 1"),
+                       row(new Object[]{ "called" }));
+        }
+
+        // check for CALLED ON NULL INPUT with null arguments
+        for (TypesTestDef typeDef : typeDefs)
+        {
+            assertRows(execute("SELECT " + typeDef.fCalledOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 2"),
+                       row(new Object[]{ "called" }));
+        }
+
+        // check for RETURNS NULL ON NULL INPUT with non-null arguments
+        for (TypesTestDef typeDef : typeDefs)
+        {
+            assertRows(execute("SELECT " + typeDef.fReturnsNullOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 1"),
+                       row(new Object[]{ "called" }));
+        }
+
+        // check for RETURNS NULL ON NULL INPUT with null arguments
+        for (TypesTestDef typeDef : typeDefs)
+        {
+            assertRows(execute("SELECT " + typeDef.fReturnsNullOnNull + '(' + typeDef.columnName + ") FROM %s WHERE key = 2"),
+                       row(new Object[]{ null }));
+        }
+
+    }
+
+    @Test
+    public void testFunctionWithFrozenSetType() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<set<int>>)");
+        createIndex("CREATE INDEX ON %s (FULL(b))");
+
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, set());
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, set(1, 2, 3));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, set(4, 5, 6));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, set(7, 8, 9));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenSetArg(values frozen<set<int>>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS int " +
+                             "LANGUAGE java\n" +
+                             "AS 'int sum = 0; for (Object value : values) {sum += value;} return sum;';");
+
+        assertInvalidMessage("The function return type should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values set<int>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS frozen<set<int>> " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values;';");
+
+        String functionName = createFunction(KEYSPACE,
+                                             "set<int>",
+                                             "CREATE FUNCTION %s (values set<int>) " +
+                                             "CALLED ON NULL INPUT " +
+                                             "RETURNS int " +
+                                             "LANGUAGE java\n" +
+                                             "AS 'int sum = 0; for (Object value : values) {sum += ((Integer) value);} return sum;';");
+
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 0"), row(0, 0));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 1"), row(1, 6));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 2"), row(2, 15));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 3"), row(3, 24));
+
+        functionName = createFunction(KEYSPACE,
+                                      "set<int>",
+                                      "CREATE FUNCTION %s (values set<int>) " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS set<int> " +
+                                      "LANGUAGE java\n" +
+                                      "AS 'return values;';");
+
+        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", set(1, 2, 3)),
+                   row(1));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "DROP FUNCTION " + functionName + "(frozen<set<int>>);");
+    }
+
+    @Test
+    public void testFunctionWithFrozenListType() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<list<int>>)");
+        createIndex("CREATE INDEX ON %s (FULL(b))");
+
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, list());
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, list(1, 2, 3));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, list(4, 5, 6));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, list(7, 8, 9));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<list<int>>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS int " +
+                             "LANGUAGE java\n" +
+                             "AS 'int sum = 0; for (Object value : values) {sum += value;} return sum;';");
+
+        assertInvalidMessage("The function return type should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values list<int>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS frozen<list<int>> " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values;';");
+
+        String functionName = createFunction(KEYSPACE,
+                                             "list<int>",
+                                             "CREATE FUNCTION %s (values list<int>) " +
+                                             "CALLED ON NULL INPUT " +
+                                             "RETURNS int " +
+                                             "LANGUAGE java\n" +
+                                             "AS 'int sum = 0; for (Object value : values) {sum += ((Integer) value);} return sum;';");
+
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 0"), row(0, 0));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 1"), row(1, 6));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 2"), row(2, 15));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 3"), row(3, 24));
+
+        functionName = createFunction(KEYSPACE,
+                                      "list<int>",
+                                      "CREATE FUNCTION %s (values list<int>) " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS list<int> " +
+                                      "LANGUAGE java\n" +
+                                      "AS 'return values;';");
+
+        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", set(1, 2, 3)),
+                   row(1));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "DROP FUNCTION " + functionName + "(frozen<list<int>>);");
+    }
+
+    @Test
+    public void testFunctionWithFrozenMapType() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<map<int, int>>)");
+        createIndex("CREATE INDEX ON %s (FULL(b))");
+
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, map());
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, map(1, 1, 2, 2, 3, 3));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, map(4, 4, 5, 5, 6, 6));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, map(7, 7, 8, 8, 9, 9));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<map<int, int>>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS int " +
+                             "LANGUAGE java\n" +
+                             "AS 'int sum = 0; for (Object value : values.values()) {sum += value;} return sum;';");
+
+        assertInvalidMessage("The function return type should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values map<int, int>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS frozen<map<int, int>> " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values;';");
+
+        String functionName = createFunction(KEYSPACE,
+                                             "map<int, int>",
+                                             "CREATE FUNCTION %s (values map<int, int>) " +
+                                             "CALLED ON NULL INPUT " +
+                                             "RETURNS int " +
+                                             "LANGUAGE java\n" +
+                                             "AS 'int sum = 0; for (Object value : values.values()) {sum += ((Integer) value);} return sum;';");
+
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 0"), row(0, 0));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 1"), row(1, 6));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 2"), row(2, 15));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 3"), row(3, 24));
+
+        functionName = createFunction(KEYSPACE,
+                                      "map<int, int>",
+                                      "CREATE FUNCTION %s (values map<int, int>) " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS map<int, int> " +
+                                      "LANGUAGE java\n" +
+                                      "AS 'return values;';");
+
+        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", map(1, 1, 2, 2, 3, 3)),
+                   row(1));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "DROP FUNCTION " + functionName + "(frozen<map<int, int>>);");
+    }
+
+    @Test
+    public void testFunctionWithFrozenTupleType() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<tuple<int, int>>)");
+        createIndex("CREATE INDEX ON %s (b)");
+
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, tuple());
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, tuple(1, 2));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, tuple(4, 5));
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 3, tuple(7, 8));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<tuple<int, int>>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS text " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values.toString();';");
+
+        assertInvalidMessage("The function return type should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values tuple<int, int>) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS frozen<tuple<int, int>> " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values;';");
+
+        String functionName = createFunction(KEYSPACE,
+                                             "tuple<int, int>",
+                                             "CREATE FUNCTION %s (values tuple<int, int>) " +
+                                             "CALLED ON NULL INPUT " +
+                                             "RETURNS text " +
+                                             "LANGUAGE java\n" +
+                                             "AS 'return values.toString();';");
+
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 0"), row(0, "(NULL,NULL)"));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 1"), row(1, "(1,2)"));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 2"), row(2, "(4,5)"));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 3"), row(3, "(7,8)"));
+
+        functionName = createFunction(KEYSPACE,
+                                      "tuple<int, int>",
+                                      "CREATE FUNCTION %s (values tuple<int, int>) " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS tuple<int, int> " +
+                                      "LANGUAGE java\n" +
+                                      "AS 'return values;';");
+
+        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "(?)", tuple(1, 2)),
+                   row(1));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "DROP FUNCTION " + functionName + "(frozen<tuple<int, int>>);");
+    }
+
+    @Test
+    public void testFunctionWithFrozenUDType() throws Throwable
+    {
+        String myType = createType("CREATE TYPE %s (f int)");
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<" + myType + ">)");
+        createIndex("CREATE INDEX ON %s (b)");
+
+        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 0, 0);
+        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 1, 1);
+        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 2, 4);
+        execute("INSERT INTO %s (a, b) VALUES (?, {f : ?})", 3, 7);
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".withFrozenArg(values frozen<" + myType + ">) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS text " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values.toString();';");
+
+        assertInvalidMessage("The function return type should not be frozen",
+                             "CREATE OR REPLACE FUNCTION " + KEYSPACE + ".frozenReturnType(values " + myType + ") " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS frozen<" + myType + "> " +
+                             "LANGUAGE java\n" +
+                             "AS 'return values;';");
+
+        String functionName = createFunction(KEYSPACE,
+                                             myType,
+                                             "CREATE FUNCTION %s (values " + myType + ") " +
+                                             "CALLED ON NULL INPUT " +
+                                             "RETURNS text " +
+                                             "LANGUAGE java\n" +
+                                             "AS 'return values.toString();';");
+
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 0"), row(0, "{f:0}"));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 1"), row(1, "{f:1}"));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 2"), row(2, "{f:4}"));
+        assertRows(execute("SELECT a, " + functionName + "(b) FROM %s WHERE a = 3"), row(3, "{f:7}"));
+
+        functionName = createFunction(KEYSPACE,
+                                      myType,
+                                      "CREATE FUNCTION %s (values " + myType + ") " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS " + myType + " " +
+                                      "LANGUAGE java\n" +
+                                      "AS 'return values;';");
+
+        assertRows(execute("SELECT a FROM %s WHERE b = " + functionName + "({f: ?})", 1),
+                   row(1));
+
+        assertInvalidMessage("The function arguments should not be frozen",
+                             "DROP FUNCTION " + functionName + "(frozen<" + myType + ">);");
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UFVerifierTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UFVerifierTest.java
new file mode 100644
index 0000000..0b78bf2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UFVerifierTest.java

@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.functions.UDFByteCodeVerifier;
+import org.apache.cassandra.cql3.validation.entities.udfverify.CallClone;
+import org.apache.cassandra.cql3.validation.entities.udfverify.CallComDatastax;
+import org.apache.cassandra.cql3.validation.entities.udfverify.CallFinalize;
+import org.apache.cassandra.cql3.validation.entities.udfverify.CallOrgApache;
+import org.apache.cassandra.cql3.validation.entities.udfverify.ClassWithField;
+import org.apache.cassandra.cql3.validation.entities.udfverify.ClassWithInitializer;
+import org.apache.cassandra.cql3.validation.entities.udfverify.ClassWithInitializer2;
+import org.apache.cassandra.cql3.validation.entities.udfverify.ClassWithInitializer3;
+import org.apache.cassandra.cql3.validation.entities.udfverify.ClassWithStaticInitializer;
+import org.apache.cassandra.cql3.validation.entities.udfverify.GoodClass;
+import org.apache.cassandra.cql3.validation.entities.udfverify.UseOfSynchronized;
+import org.apache.cassandra.cql3.validation.entities.udfverify.UseOfSynchronizedWithNotify;
+import org.apache.cassandra.cql3.validation.entities.udfverify.UseOfSynchronizedWithNotifyAll;
+import org.apache.cassandra.cql3.validation.entities.udfverify.UseOfSynchronizedWithWait;
+import org.apache.cassandra.cql3.validation.entities.udfverify.UseOfSynchronizedWithWaitL;
+import org.apache.cassandra.cql3.validation.entities.udfverify.UseOfSynchronizedWithWaitLI;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test the Java UDF byte code verifier.
+ */
+public class UFVerifierTest extends CQLTester
+{
+    @Test
+    public void testByteCodeVerifier()
+    {
+        new UDFByteCodeVerifier().verify(readClass(GoodClass.class));
+    }
+
+    @Test
+    public void testClassWithField()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("field declared: field")),
+                     new UDFByteCodeVerifier().verify(readClass(ClassWithField.class)));
+    }
+
+    @Test
+    public void testClassWithInitializer()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("field declared: field",
+                                                 "initializer declared")),
+                     new UDFByteCodeVerifier().verify(readClass(ClassWithInitializer.class)));
+    }
+
+    @Test
+    public void testClassWithInitializer2()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("field declared: field",
+                                                 "initializer declared")),
+                     new UDFByteCodeVerifier().verify(readClass(ClassWithInitializer2.class)));
+    }
+
+    @Test
+    public void testClassWithInitializer3()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("initializer declared")),
+                     new UDFByteCodeVerifier().verify(readClass(ClassWithInitializer3.class)));
+    }
+
+    @Test
+    public void testClassWithStaticInitializer()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("static initializer declared")),
+                     new UDFByteCodeVerifier().verify(readClass(ClassWithStaticInitializer.class)));
+    }
+
+    @Test
+    public void testUseOfSynchronized()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("use of synchronized")),
+                     new UDFByteCodeVerifier().verify(readClass(UseOfSynchronized.class)));
+    }
+
+    @Test
+    public void testUseOfSynchronizedWithNotify()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("use of synchronized", "call to java.lang.Object.notify()")),
+                     new UDFByteCodeVerifier().verify(readClass(UseOfSynchronizedWithNotify.class)));
+    }
+
+    @Test
+    public void testUseOfSynchronizedWithNotifyAll()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("use of synchronized", "call to java.lang.Object.notifyAll()")),
+                     new UDFByteCodeVerifier().verify(readClass(UseOfSynchronizedWithNotifyAll.class)));
+    }
+
+    @Test
+    public void testUseOfSynchronizedWithWait()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("use of synchronized", "call to java.lang.Object.wait()")),
+                     new UDFByteCodeVerifier().verify(readClass(UseOfSynchronizedWithWait.class)));
+    }
+
+    @Test
+    public void testUseOfSynchronizedWithWaitL()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("use of synchronized", "call to java.lang.Object.wait()")),
+                     new UDFByteCodeVerifier().verify(readClass(UseOfSynchronizedWithWaitL.class)));
+    }
+
+    @Test
+    public void testUseOfSynchronizedWithWaitI()
+    {
+        assertEquals(new HashSet<>(Arrays.asList("use of synchronized", "call to java.lang.Object.wait()")),
+                     new UDFByteCodeVerifier().verify(readClass(UseOfSynchronizedWithWaitLI.class)));
+    }
+
+    @Test
+    public void testCallClone()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("call to java.lang.Object.clone()")),
+                     new UDFByteCodeVerifier().verify(readClass(CallClone.class)));
+    }
+
+    @Test
+    public void testCallFinalize()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("call to java.lang.Object.finalize()")),
+                     new UDFByteCodeVerifier().verify(readClass(CallFinalize.class)));
+    }
+
+    @Test
+    public void testCallComDatastax()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("call to com.datastax.driver.core.DataType.cint()")),
+                     new UDFByteCodeVerifier().addDisallowedPackage("com/").verify(readClass(CallComDatastax.class)));
+    }
+
+    @Test
+    public void testCallOrgApache()
+    {
+        assertEquals(new HashSet<>(Collections.singletonList("call to org.apache.cassandra.config.DatabaseDescriptor.getClusterName()")),
+                     new UDFByteCodeVerifier().addDisallowedPackage("org/").verify(readClass(CallOrgApache.class)));
+    }
+
+    @SuppressWarnings("resource")
+    private static byte[] readClass(Class<?> clazz)
+    {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        URL res = clazz.getClassLoader().getResource(clazz.getName().replace('.', '/') + ".class");
+        assert res != null;
+        try (InputStream input = res.openConnection().getInputStream())
+        {
+            int i;
+            while ((i = input.read()) != -1)
+                out.write(i);
+            return out.toByteArray();
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Test
+    public void testInvalidByteCodeUDFs() throws Throwable
+    {
+        assertInvalidByteCode("try\n" +
+                              "{\n" +
+                              "    clone();\n" +
+                              "}\n" +
+                              "catch (CloneNotSupportedException e)\n" +
+                              "{\n" +
+                              "    throw new RuntimeException(e);\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.clone()]");
+        assertInvalidByteCode("try\n" +
+                              "{\n" +
+                              "    finalize();\n" +
+                              "}\n" +
+                              "catch (Throwable e)\n" +
+                              "{\n" +
+                              "    throw new RuntimeException(e);\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.finalize()]");
+        assertInvalidByteCode('\n' +
+                              "return 0d;\n" +
+                              "    }\n" +
+                              '\n' +
+                              "    Object field;\n" +
+                              '\n' +
+                              "    {", "Java UDF validation failed: [field declared: field]");
+        assertInvalidByteCode('\n' +
+                              "return 0d;\n" +
+                              "    }\n" +
+                              '\n' +
+                              "    final Object field;\n" +
+                              '\n' +
+                              "    {\n" +
+                              "field = new Object();", "Java UDF validation failed: [field declared: field, initializer declared]");
+        assertInvalidByteCode('\n' +
+                              "return 0d;\n" +
+                              "    }\n" +
+                              '\n' +
+                              "    Object field = new Object();\n" +
+                              '\n' +
+                              "    {\n" +
+                              "Math.sin(1d);", "Java UDF validation failed: [field declared: field, initializer declared]");
+        assertInvalidByteCode('\n' +
+                              "return 0d;\n" +
+                              "    }\n" +
+                              '\n' +
+                              "    {\n" +
+                              "Math.sin(1d);", "Java UDF validation failed: [initializer declared]");
+        assertInvalidByteCode('\n' +
+                              "return 0d;\n" +
+                              "    }\n" +
+                              '\n' +
+                              "    static\n" +
+                              "    {\n" +
+                              "Math.sin(1d);", "Java UDF validation failed: [static initializer declared]");
+        assertInvalidByteCode("synchronized (this)\n" +
+                              "{\n" +
+                              "    Math.sin(1d);\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [use of synchronized]");
+        assertInvalidByteCode("synchronized (this)\n" +
+                              "{\n" +
+                              "    notify();\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.notify(), use of synchronized]");
+        assertInvalidByteCode("synchronized (this)\n" +
+                              "{\n" +
+                              "    notifyAll();\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.notifyAll(), use of synchronized]");
+        assertInvalidByteCode("synchronized (this)\n" +
+                              "{\n" +
+                              "    try\n" +
+                              "    {\n" +
+                              "        wait();\n" +
+                              "    }\n" +
+                              "    catch (InterruptedException e)\n" +
+                              "    {\n" +
+                              "        throw new RuntimeException(e);\n" +
+                              "    }\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.wait(), use of synchronized]");
+        assertInvalidByteCode("synchronized (this)\n" +
+                              "{\n" +
+                              "    try\n" +
+                              "    {\n" +
+                              "        wait(1000L);\n" +
+                              "    }\n" +
+                              "    catch (InterruptedException e)\n" +
+                              "    {\n" +
+                              "        throw new RuntimeException(e);\n" +
+                              "    }\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.wait(), use of synchronized]");
+        assertInvalidByteCode("synchronized (this)\n" +
+                              "{\n" +
+                              "    try\n" +
+                              "    {\n" +
+                              "        wait(1000L, 100);\n" +
+                              "    }\n" +
+                              "    catch (InterruptedException e)\n" +
+                              "    {\n" +
+                              "        throw new RuntimeException(e);\n" +
+                              "    }\n" +
+                              "}\n" +
+                              "return 0d;", "Java UDF validation failed: [call to java.lang.Object.wait(), use of synchronized]");
+        assertInvalidByteCode("try {" +
+                              "     java.nio.ByteBuffer.allocateDirect(123); return 0d;" +
+                              "} catch (Exception t) {" +
+                              "     throw new RuntimeException(t);" +
+                              '}', "Java UDF validation failed: [call to java.nio.ByteBuffer.allocateDirect()]");
+        assertInvalidByteCode("try {" +
+                              "     java.net.InetAddress.getLocalHost(); return 0d;" +
+                              "} catch (Exception t) {" +
+                              "     throw new RuntimeException(t);" +
+                              '}', "Java UDF validation failed: [call to java.net.InetAddress.getLocalHost()]");
+        assertInvalidByteCode("try {" +
+                              "     java.net.InetAddress.getAllByName(\"localhost\"); return 0d;" +
+                              "} catch (Exception t) {" +
+                              "     throw new RuntimeException(t);" +
+                              '}', "Java UDF validation failed: [call to java.net.InetAddress.getAllByName()]");
+        assertInvalidByteCode("try {" +
+                              "     java.net.Inet4Address.getByName(\"127.0.0.1\"); return 0d;" +
+                              "} catch (Exception t) {" +
+                              "     throw new RuntimeException(t);" +
+                              '}', "Java UDF validation failed: [call to java.net.Inet4Address.getByName()]");
+        assertInvalidByteCode("try {" +
+                              "     java.net.Inet6Address.getByAddress(new byte[]{127,0,0,1}); return 0d;" +
+                              "} catch (Exception t) {" +
+                              "     throw new RuntimeException(t);" +
+                              '}', "Java UDF validation failed: [call to java.net.Inet6Address.getByAddress()]");
+        assertInvalidByteCode("try {" +
+                              "     java.net.NetworkInterface.getNetworkInterfaces(); return 0d;" +
+                              "} catch (Exception t) {" +
+                              "     throw new RuntimeException(t);" +
+                              '}', "Java UDF validation failed: [call to java.net.NetworkInterface.getNetworkInterfaces()]");
+    }
+
+    private void assertInvalidByteCode(String body, String error) throws Throwable
+    {
+        assertInvalidMessage(error,
+                             "CREATE FUNCTION " + KEYSPACE + ".mustBeInvalid ( input double ) " +
+                             "CALLED ON NULL INPUT " +
+                             "RETURNS double " +
+                             "LANGUAGE java AS $$" + body + "$$");
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java
index 9bafe4a..68c0b8c 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java

@@ -22,16 +22,18 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.service.StorageService;
 
 public class UserTypesTest extends CQLTester
 {
     @BeforeClass
     public static void setUpClass()
     {
-        DatabaseDescriptor.setPartitioner(new ByteOrderedPartitioner());
+        // Selecting partitioner for a table is not exposed on CREATE TABLE.
+        StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     @Test
@@ -60,7 +62,7 @@
                              "INSERT INTO %s (k, t) VALUES (0, ?)",
                              userType(1, tuple(1, "1", 1.0, 1)));
 
-        assertInvalidMessage("Invalid user type literal for t: field b is not of type tuple<int, text, double>",
+        assertInvalidMessage("Invalid user type literal for t: field b is not of type frozen<tuple<int, text, double>>",
                              "INSERT INTO %s (k, t) VALUES (0, {a: 1, b: (1, '1', 1.0, 1)})");
     }
 
@@ -198,6 +200,34 @@
     }
 
     @Test
+    public void testAlteringUserTypeNestedWithinNonFrozenMap() throws Throwable
+    {
+        String ut1 = createType("CREATE TYPE %s (a int)");
+        String columnType = KEYSPACE + "." + ut1;
+
+        createTable("CREATE TABLE %s (x int PRIMARY KEY, y map<text, frozen<" + columnType + ">>)");
+
+        execute("INSERT INTO %s (x, y) VALUES(1, {'firstValue': {a: 1}})");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, map("firstValue", userType(1))));
+
+        flush();
+
+        execute("ALTER TYPE " + columnType + " ADD b int");
+        execute("UPDATE %s SET y['secondValue'] = {a: 2, b: 2} WHERE x = 1");
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, map("firstValue", userType(1),
+                              "secondValue", userType(2, 2))));
+
+        flush();
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, map("firstValue", userType(1),
+                              "secondValue", userType(2, 2))));
+    }
+
+    @Test
     public void testAlteringUserTypeNestedWithinSet() throws Throwable
     {
         // test frozen and non-frozen collections
@@ -466,6 +496,243 @@
         assertInvalidMessage("would create a circular reference", "ALTER TYPE " + typeWithKs(type1) + " ADD needs_to_fail frozen<list<" + typeWithKs(type1) + ">>");
     }
 
+    @Test
+    public void testTypeAlterUsedInFunction() throws Throwable
+    {
+        String type1 = createType("CREATE TYPE %s (foo ascii)");
+        assertComplexInvalidAlterDropStatements(type1, type1, "{foo: 'abc'}");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        assertComplexInvalidAlterDropStatements(type1, "list<frozen<" + type1 + ">>", "[{foo: 'abc'}]");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        assertComplexInvalidAlterDropStatements(type1, "set<frozen<" + type1 + ">>", "{{foo: 'abc'}}");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        assertComplexInvalidAlterDropStatements(type1, "map<text, frozen<" + type1 + ">>", "{'key': {foo: 'abc'}}");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        String type2 = createType("CREATE TYPE %s (foo frozen<" + type1 + ">)");
+        assertComplexInvalidAlterDropStatements(type1, type2, "{foo: 'abc'}");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        type2 = createType("CREATE TYPE %s (foo frozen<" + type1 + ">)");
+        assertComplexInvalidAlterDropStatements(type1,
+                                                "list<frozen<" + type2 + ">>",
+                                                "[{foo: 'abc'}]");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        type2 = createType("CREATE TYPE %s (foo frozen<set<" + type1 + ">>)");
+        assertComplexInvalidAlterDropStatements(type1,
+                                                "map<text, frozen<" + type2 + ">>",
+                                                "{'key': {foo: {{foo: 'abc'}}}}");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        assertComplexInvalidAlterDropStatements(type1,
+                                                "tuple<text, frozen<" + type1 + ">>",
+                                                "('key', {foo: 'abc'})");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        assertComplexInvalidAlterDropStatements(type1,
+                                                "tuple<text, frozen<tuple<tuple<" + type1 + ", int>, int>>>",
+                                                "('key', (({foo: 'abc'}, 0), 0))");
+
+        type1 = createType("CREATE TYPE %s (foo ascii)");
+        type2 = createType("CREATE TYPE %s (foo frozen<set<" + type1 + ">>)");
+        assertComplexInvalidAlterDropStatements(type1,
+                                                "tuple<text, frozen<" + type2 + ">>",
+                                                "('key', {foo: {{foo: 'abc'}}})");
+    }
+
+    private void assertComplexInvalidAlterDropStatements(String type1, String fArgType, String initcond) throws Throwable
+    {
+        String f = createFunction(KEYSPACE, type1, "CREATE FUNCTION %s(arg " + fArgType + ", col int) " +
+                                                   "RETURNS NULL ON NULL INPUT " +
+                                                   "RETURNS " + fArgType + ' ' +
+                                                   "LANGUAGE java AS 'return arg;'");
+        createAggregate(KEYSPACE, "int", "CREATE AGGREGATE %s(int) " +
+                                         "SFUNC " + shortFunctionName(f) + ' ' +
+                                         "STYPE " + fArgType + ' ' +
+                                         "INITCOND " + initcond);
+        assertInvalidAlterDropStatements(type1);
+    }
+
+    private void assertInvalidAlterDropStatements(String t) throws Throwable
+    {
+        assertInvalidMessage("Cannot alter user type " + typeWithKs(t), "ALTER TYPE " + typeWithKs(t) + " RENAME foo TO bar;");
+        assertInvalidMessage("Cannot drop user type " + typeWithKs(t), "DROP TYPE " + typeWithKs(t) + ';');
+    }
+
+    @Test
+    public void testReadAfterAlteringUserTypeNestedWithinSet() throws Throwable
+    {
+        String columnType = typeWithKs(createType("CREATE TYPE %s (a int)"));
+
+        try
+        {
+            createTable("CREATE TABLE %s (x int PRIMARY KEY, y set<frozen<" + columnType + ">>)");
+            disableCompaction();
+
+            execute("INSERT INTO %s (x, y) VALUES(1, ?)", set(userType(1), userType(2)));
+            assertRows(execute("SELECT * FROM %s"), row(1, set(userType(1), userType(2))));
+            flush();
+
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                       row(1, set(userType(1), userType(2))));
+
+            execute("ALTER TYPE " + columnType + " ADD b int");
+            execute("UPDATE %s SET y = y + ? WHERE x = 1",
+                    set(userType(1, 1), userType(1, 2), userType(2, 1)));
+
+            flush();
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                           row(1, set(userType(1),
+                                      userType(1, 1),
+                                      userType(1, 2),
+                                      userType(2),
+                                      userType(2, 1))));
+
+            compact();
+
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                       row(1, set(userType(1),
+                                  userType(1, 1),
+                                  userType(1, 2),
+                                  userType(2),
+                                  userType(2, 1))));
+        }
+        finally
+        {
+            enableCompaction();
+        }
+    }
+
+    @Test
+    public void testReadAfterAlteringUserTypeNestedWithinMap() throws Throwable
+    {
+        String columnType = typeWithKs(createType("CREATE TYPE %s (a int)"));
+
+        try
+        {
+            createTable("CREATE TABLE %s (x int PRIMARY KEY, y map<frozen<" + columnType + ">, int>)");
+            disableCompaction();
+
+            execute("INSERT INTO %s (x, y) VALUES(1, ?)", map(userType(1), 1, userType(2), 2));
+            assertRows(execute("SELECT * FROM %s"), row(1, map(userType(1), 1, userType(2), 2)));
+            flush();
+
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                       row(1, map(userType(1), 1, userType(2), 2)));
+
+            execute("ALTER TYPE " + columnType + " ADD b int");
+            execute("UPDATE %s SET y = y + ? WHERE x = 1",
+                    map(userType(1, 1), 1, userType(1, 2), 1, userType(2, 1), 2));
+
+            flush();
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                           row(1, map(userType(1), 1,
+                                      userType(1, 1), 1,
+                                      userType(1, 2), 1,
+                                      userType(2), 2,
+                                      userType(2, 1), 2)));
+
+            compact();
+
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                       row(1, map(userType(1), 1,
+                                  userType(1, 1), 1,
+                                  userType(1, 2), 1,
+                                  userType(2), 2,
+                                  userType(2, 1), 2)));
+        }
+        finally
+        {
+            enableCompaction();
+        }
+    }
+
+    @Test
+    public void testReadAfterAlteringUserTypeNestedWithinList() throws Throwable
+    {
+        String columnType = typeWithKs(createType("CREATE TYPE %s (a int)"));
+
+        try
+        {
+            createTable("CREATE TABLE %s (x int PRIMARY KEY, y list<frozen<" + columnType + ">>)");
+            disableCompaction();
+
+            execute("INSERT INTO %s (x, y) VALUES(1, ?)", list(userType(1), userType(2)));
+            assertRows(execute("SELECT * FROM %s"), row(1, list(userType(1), userType(2))));
+            flush();
+
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                       row(1, list(userType(1), userType(2))));
+
+            execute("ALTER TYPE " + columnType + " ADD b int");
+            execute("UPDATE %s SET y = y + ? WHERE x = 1",
+                    list(userType(1, 1), userType(1, 2), userType(2, 1)));
+
+            flush();
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                           row(1, list(userType(1),
+                                       userType(2),
+                                       userType(1, 1),
+                                       userType(1, 2),
+                                       userType(2, 1))));
+
+            compact();
+
+            assertRows(execute("SELECT * FROM %s WHERE x = 1"),
+                       row(1, list(userType(1),
+                                   userType(2),
+                                   userType(1, 1),
+                                   userType(1, 2),
+                                   userType(2, 1))));
+        }
+        finally
+        {
+            enableCompaction();
+        }
+    }
+
+    @Test
+    public void testAlteringUserTypeNestedWithinSetWithView() throws Throwable
+    {
+        String columnType = typeWithKs(createType("CREATE TYPE %s (a int)"));
+
+        createTable("CREATE TABLE %s (pk int, c int, v int, s set<frozen<" + columnType + ">>, PRIMARY KEY (pk, c))");
+        execute("CREATE MATERIALIZED VIEW " + keyspace() + ".view1 AS SELECT c, pk, v FROM %s WHERE pk IS NOT NULL AND c IS NOT NULL AND v IS NOT NULL PRIMARY KEY (c, pk)");
+
+        execute("INSERT INTO %s (pk, c, v, s) VALUES(?, ?, ?, ?)", 1, 1, 1, set(userType(1), userType(2)));
+        flush();
+
+        execute("ALTER TYPE " + columnType + " ADD b int");
+        execute("UPDATE %s SET s = s + ?, v = ? WHERE pk = ? AND c = ?",
+                set(userType(1, 1), userType(1, 2), userType(2, 1)), 2, 1, 1);
+
+        assertRows(execute("SELECT * FROM %s WHERE pk = ? AND c = ?", 1, 1),
+                       row(1, 1,set(userType(1), userType(1, 1), userType(1, 2), userType(2), userType(2, 1)), 2));
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void emptyTypeNameTest() throws Throwable
+    {
+        execute("CREATE TYPE \"\" (a int, b int)");
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void emptyFieldNameTest() throws Throwable
+    {
+        execute("CREATE TYPE mytype (\"\" int, b int)");
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void renameColumnToEmpty() throws Throwable
+    {
+        String typeName = createType("CREATE TYPE %s (a int, b int)");
+        execute(String.format("ALTER TYPE %s.%s RENAME b TO \"\"", keyspace(), typeName));
+    }
+
     private String typeWithKs(String type1)
     {
         return keyspace() + '.' + type1;

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallClone.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallClone.java
new file mode 100644
index 0000000..c01fbe6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallClone.java

@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class CallClone extends JavaUDF
+{
+    public CallClone(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        try
+        {
+            clone();
+        }
+        catch (CloneNotSupportedException e)
+        {
+            throw new RuntimeException(e);
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallComDatastax.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallComDatastax.java
new file mode 100644
index 0000000..9cd799f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallComDatastax.java

@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.DataType;
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class CallComDatastax extends JavaUDF
+{
+    public CallComDatastax(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        DataType.cint();
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallFinalize.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallFinalize.java
new file mode 100644
index 0000000..a16bd31
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallFinalize.java

@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class CallFinalize extends JavaUDF
+{
+    public CallFinalize(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        try
+        {
+            finalize();
+        }
+        catch (Throwable e)
+        {
+            throw new RuntimeException(e);
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallOrgApache.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallOrgApache.java
new file mode 100644
index 0000000..4f511d7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/CallOrgApache.java

@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class CallOrgApache extends JavaUDF
+{
+    public CallOrgApache(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        DatabaseDescriptor.getClusterName();
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithField.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithField.java
new file mode 100644
index 0000000..d981c18
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithField.java

@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class ClassWithField extends JavaUDF
+{
+    public ClassWithField(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        return null;
+    }
+
+    Object field;
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer.java
new file mode 100644
index 0000000..f53cc24
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer.java

@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class ClassWithInitializer extends JavaUDF
+{
+    public ClassWithInitializer(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        return null;
+    }
+
+    final Object field;
+
+    {
+        field = new Object();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer2.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer2.java
new file mode 100644
index 0000000..134f9f9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer2.java

@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class ClassWithInitializer2 extends JavaUDF
+{
+    public ClassWithInitializer2(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        return null;
+    }
+
+    final Object field = new Object();
+
+    {
+        Math.sin(1d);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer3.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer3.java
new file mode 100644
index 0000000..9cd04fb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithInitializer3.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class ClassWithInitializer3 extends JavaUDF
+{
+    public ClassWithInitializer3(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        return null;
+    }
+
+    {
+        Math.sin(1d);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithStaticInitializer.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithStaticInitializer.java
new file mode 100644
index 0000000..64470ca
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/ClassWithStaticInitializer.java

@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class ClassWithStaticInitializer extends JavaUDF
+{
+    public ClassWithStaticInitializer(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        return null;
+    }
+
+    static
+    {
+        Math.sin(1d);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/GoodClass.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/GoodClass.java
new file mode 100644
index 0000000..e3bc1e2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/GoodClass.java

@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class GoodClass extends JavaUDF
+{
+    public GoodClass(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronized.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronized.java
new file mode 100644
index 0000000..2927b3e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronized.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class UseOfSynchronized extends JavaUDF
+{
+    public UseOfSynchronized(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        synchronized (this)
+        {
+            Math.sin(1d);
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithNotify.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithNotify.java
new file mode 100644
index 0000000..7ef2e1c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithNotify.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class UseOfSynchronizedWithNotify extends JavaUDF
+{
+    public UseOfSynchronizedWithNotify(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        synchronized (this)
+        {
+            notify();
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithNotifyAll.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithNotifyAll.java
new file mode 100644
index 0000000..50a3da8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithNotifyAll.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class UseOfSynchronizedWithNotifyAll extends JavaUDF
+{
+    public UseOfSynchronizedWithNotifyAll(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        synchronized (this)
+        {
+            notifyAll();
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWait.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWait.java
new file mode 100644
index 0000000..135c550
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWait.java

@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class UseOfSynchronizedWithWait extends JavaUDF
+{
+    public UseOfSynchronizedWithWait(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        synchronized (this)
+        {
+            try
+            {
+                wait();
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWaitL.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWaitL.java
new file mode 100644
index 0000000..4e49e5b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWaitL.java

@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class UseOfSynchronizedWithWaitL extends JavaUDF
+{
+    public UseOfSynchronizedWithWaitL(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        synchronized (this)
+        {
+            try
+            {
+                wait(1000L);
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWaitLI.java b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWaitLI.java
new file mode 100644
index 0000000..6770e7a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/udfverify/UseOfSynchronizedWithWaitLI.java

@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.entities.udfverify;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import com.datastax.driver.core.TypeCodec;
+import org.apache.cassandra.cql3.functions.JavaUDF;
+
+/**
+ * Used by {@link org.apache.cassandra.cql3.validation.entities.UFVerifierTest}.
+ */
+public final class UseOfSynchronizedWithWaitLI extends JavaUDF
+{
+    public UseOfSynchronizedWithWaitLI(TypeCodec<Object> returnDataType, TypeCodec<Object>[] argDataTypes)
+    {
+        super(returnDataType, argDataTypes);
+    }
+
+    protected ByteBuffer executeImpl(int protocolVersion, List<ByteBuffer> params)
+    {
+        synchronized (this)
+        {
+            try
+            {
+                wait(1000L, 100);
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+        return null;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java
index 98d7d70..d059f7d 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java

@@ -21,6 +21,8 @@
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 
+import org.junit.Test;
+
 import junit.framework.Assert;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
@@ -28,18 +30,35 @@
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.io.compress.CompressedRandomAccessReader;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.FBUtilities;
 
-import org.junit.Test;
-
 
 public class CrcCheckChanceTest extends CQLTester
 {
+
+
     @Test
-    public void testChangingCrcCheckChance() throws Throwable
+    public void testChangingCrcCheckChanceNewFormat() throws Throwable
+    {
+        testChangingCrcCheckChance(true);
+    }
+
+    @Test
+    public void testChangingCrcCheckChanceOldFormat() throws Throwable
+    {
+        testChangingCrcCheckChance(false);
+    }
+
+
+    public void testChangingCrcCheckChance(boolean newFormat) throws Throwable
     {
         //Start with crc_check_chance of 99%
-        createTable("CREATE TABLE %s (p text, c text, v text, s text static, PRIMARY KEY (p, c)) WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance' : 0.99}");
+        if (newFormat)
+            createTable("CREATE TABLE %s (p text, c text, v text, s text static, PRIMARY KEY (p, c)) WITH compression = {'sstable_compression': 'LZ4Compressor'} AND crc_check_chance = 0.99;");
+        else
+            createTable("CREATE TABLE %s (p text, c text, v text, s text static, PRIMARY KEY (p, c)) WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance' : 0.99}");
 
         execute("CREATE INDEX foo ON %s(v)");
 
@@ -47,39 +66,32 @@
         execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
         execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
 
-
         ColumnFamilyStore cfs = Keyspace.open(CQLTester.KEYSPACE).getColumnFamilyStore(currentTable());
-        ColumnFamilyStore indexCfs = cfs.indexManager.getIndexesBackedByCfs().iterator().next();
+        ColumnFamilyStore indexCfs = cfs.indexManager.getAllIndexColumnFamilyStores().iterator().next();
         cfs.forceBlockingFlush();
 
-        Assert.assertEquals(0.99, cfs.metadata.compressionParameters.getCrcCheckChance());
-        Assert.assertEquals(0.99, cfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
-        Assert.assertEquals(0.99, indexCfs.metadata.compressionParameters.getCrcCheckChance());
-        Assert.assertEquals(0.99, indexCfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+        Assert.assertEquals(0.99, cfs.getCrcCheckChance());
+        Assert.assertEquals(0.99, cfs.getLiveSSTables().iterator().next().getCrcCheckChance());
+
+        Assert.assertEquals(0.99, indexCfs.getCrcCheckChance());
+        Assert.assertEquals(0.99, indexCfs.getLiveSSTables().iterator().next().getCrcCheckChance());
 
         //Test for stack overflow
-        cfs.setCrcCheckChance(0.99);
+        if (newFormat)
+            alterTable("ALTER TABLE %s WITH crc_check_chance = 0.99");
+        else
+            alterTable("ALTER TABLE %s WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance': 0.99}");
 
         assertRows(execute("SELECT * FROM %s WHERE p=?", "p1"),
-                row("p1", "k1", "sv1", "v1"),
-                row("p1", "k2", "sv1", "v2")
+                   row("p1", "k1", "sv1", "v1"),
+                   row("p1", "k2", "sv1", "v2")
         );
 
         assertRows(execute("SELECT * FROM %s WHERE v=?", "v1"),
-                row("p1", "k1", "sv1", "v1")
+                   row("p1", "k1", "sv1", "v1")
         );
 
-
-
         //Write a few SSTables then Compact
-
-        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
-        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
-        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
-
-        cfs.forceBlockingFlush();
-
-
         execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
         execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
         execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
@@ -92,36 +104,67 @@
 
         cfs.forceBlockingFlush();
 
+        execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
+        execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
+        execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
+
+        cfs.forceBlockingFlush();
         cfs.forceMajorCompaction();
 
-        //Verify when we alter the value the live sstable readers hold the new one
-        alterTable("ALTER TABLE %s WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance': 0.01}");
+        //Now let's change via JMX
+        cfs.setCrcCheckChance(0.01);
 
-        Assert.assertEquals( 0.01, cfs.metadata.compressionParameters.getCrcCheckChance());
-        Assert.assertEquals( 0.01, cfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
-        Assert.assertEquals( 0.01, indexCfs.metadata.compressionParameters.getCrcCheckChance());
-        Assert.assertEquals( 0.01, indexCfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+        Assert.assertEquals(0.01, cfs.getCrcCheckChance());
+        Assert.assertEquals(0.01, cfs.getLiveSSTables().iterator().next().getCrcCheckChance());
+        Assert.assertEquals(0.01, indexCfs.getCrcCheckChance());
+        Assert.assertEquals(0.01, indexCfs.getLiveSSTables().iterator().next().getCrcCheckChance());
 
         assertRows(execute("SELECT * FROM %s WHERE p=?", "p1"),
-                row("p1", "k1", "sv1", "v1"),
-                row("p1", "k2", "sv1", "v2")
+                   row("p1", "k1", "sv1", "v1"),
+                   row("p1", "k2", "sv1", "v2")
         );
 
         assertRows(execute("SELECT * FROM %s WHERE v=?", "v1"),
-                row("p1", "k1", "sv1", "v1")
+                   row("p1", "k1", "sv1", "v1")
         );
 
+        //Alter again via schema
+        if (newFormat)
+            alterTable("ALTER TABLE %s WITH crc_check_chance = 0.5");
+        else
+            alterTable("ALTER TABLE %s WITH compression = {'sstable_compression': 'LZ4Compressor', 'crc_check_chance': 0.5}");
+
+        //We should be able to get the new value by accessing directly the schema metadata
+        Assert.assertEquals(0.5, cfs.metadata.params.crcCheckChance);
+
+        //but previous JMX-set value will persist until next restart
+        Assert.assertEquals(0.01, cfs.getLiveSSTables().iterator().next().getCrcCheckChance());
+        Assert.assertEquals(0.01, indexCfs.getCrcCheckChance());
+        Assert.assertEquals(0.01, indexCfs.getLiveSSTables().iterator().next().getCrcCheckChance());
 
         //Verify the call used by JMX still works
         cfs.setCrcCheckChance(0.03);
-        Assert.assertEquals( 0.03, cfs.metadata.compressionParameters.getCrcCheckChance());
-        Assert.assertEquals( 0.03, cfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
-        Assert.assertEquals( 0.03, indexCfs.metadata.compressionParameters.getCrcCheckChance());
-        Assert.assertEquals( 0.03, indexCfs.getSSTables().iterator().next().getCompressionMetadata().parameters.getCrcCheckChance());
+        Assert.assertEquals(0.03, cfs.getCrcCheckChance());
+        Assert.assertEquals(0.03, cfs.getLiveSSTables().iterator().next().getCrcCheckChance());
+        Assert.assertEquals(0.03, indexCfs.getCrcCheckChance());
+        Assert.assertEquals(0.03, indexCfs.getLiveSSTables().iterator().next().getCrcCheckChance());
 
+        // Also check that any open readers also use the updated value
+        // note: only compressed files currently perform crc checks, so only the dfile reader is relevant here
+        SSTableReader baseSSTable = cfs.getLiveSSTables().iterator().next();
+        SSTableReader idxSSTable = indexCfs.getLiveSSTables().iterator().next();
+        try (CompressedRandomAccessReader baseDataReader = (CompressedRandomAccessReader)baseSSTable.openDataReader();
+             CompressedRandomAccessReader idxDataReader = (CompressedRandomAccessReader)idxSSTable.openDataReader())
+        {
+            Assert.assertEquals(0.03, baseDataReader.getCrcCheckChance());
+            Assert.assertEquals(0.03, idxDataReader.getCrcCheckChance());
+
+            cfs.setCrcCheckChance(0.31);
+            Assert.assertEquals(0.31, baseDataReader.getCrcCheckChance());
+            Assert.assertEquals(0.31, idxDataReader.getCrcCheckChance());
+        }
     }
 
-
     @Test
     public void testDropDuringCompaction() throws Throwable
     {
@@ -143,7 +186,7 @@
         }
 
         DatabaseDescriptor.setCompactionThroughputMbPerSec(1);
-        List<Future<?>> futures = CompactionManager.instance.submitMaximal(cfs, CompactionManager.getDefaultGcBefore(cfs), false); 
+        List<Future<?>> futures = CompactionManager.instance.submitMaximal(cfs, CompactionManager.getDefaultGcBefore(cfs, FBUtilities.nowInSeconds()), false); 
         execute("DROP TABLE %s");
 
         try

diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/OverflowTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/OverflowTest.java
index 5b43599..9733eb2 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/OverflowTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/OverflowTest.java

@@ -113,8 +113,8 @@
                     + "AND gc_grace_seconds = 4 "
                     + "AND bloom_filter_fp_chance = 0.01 "
                     + "AND compaction = { 'class' : 'LeveledCompactionStrategy', 'sstable_size_in_mb' : 10 } "
-                    + "AND compression = { 'sstable_compression' : '' } "
-                    + "AND caching = 'all' ");
+                    + "AND compression = { 'enabled': false } "
+                    + "AND caching = { 'keys': 'ALL', 'rows_per_partition': 'ALL' }");
 
         execute("ALTER TABLE %s WITH "
                 + "comment = 'other comment' "
@@ -123,8 +123,8 @@
                 + "AND gc_grace_seconds = 100 "
                 + "AND bloom_filter_fp_chance = 0.1 "
                 + "AND compaction = { 'class': 'SizeTieredCompactionStrategy', 'min_sstable_size' : 42 } "
-                + "AND compression = { 'sstable_compression' : 'SnappyCompressor' } "
-                + "AND caching = 'rows_only' ");
+                + "AND compression = { 'class' : 'SnappyCompressor' } "
+                + "AND caching = { 'rows_per_partition': 'ALL' }");
     }
 
     /**

diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/RoleSyntaxTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/RoleSyntaxTest.java
index 0cb1de2..f72e3dc 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/RoleSyntaxTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/RoleSyntaxTest.java

@@ -17,8 +17,10 @@
  */
 package org.apache.cassandra.cql3.validation.miscellaneous;
 
+import org.junit.Assert;
 import org.junit.Test;
 
+import org.apache.cassandra.auth.RoleResource;
 import org.apache.cassandra.cql3.CQLTester;
 
 public class RoleSyntaxTest extends CQLTester
@@ -158,4 +160,25 @@
         assertValidSyntax("LIST ROLES OF \"r1\"");
         assertValidSyntax("LIST ROLES OF $$ r '1' $$");
     }
+
+    @Test
+    public void roleNameTest()
+    {
+        // we used to split on all "/" which meant role names containing a / would trigger an exception in RoleResource.fromName()
+        RoleResource t1 = RoleResource.role("ki/ng");
+        RoleResource t2 = RoleResource.role("emperor");
+        RoleResource t3 = RoleResource.role("aeou/!@*%");
+        RoleResource t4 = RoleResource.role("do$\\$P#?:");
+        RoleResource t5 = RoleResource.root();
+        RoleResource r1 = RoleResource.fromName("roles/ki/ng");
+        RoleResource r2 = RoleResource.fromName("roles/emperor");
+        RoleResource r3 = RoleResource.fromName("roles/aeou/!@*%");
+        RoleResource r4 = RoleResource.fromName("roles/do$\\$P#?:");
+        RoleResource r5 = RoleResource.fromName("roles");
+        Assert.assertEquals(t1, r1);
+        Assert.assertEquals(t2, r2);
+        Assert.assertEquals(t3, r3);
+        Assert.assertEquals(t4, r4);
+        Assert.assertEquals(t5, r5);
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java
index 2a2ca7b..288cbe1 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java

@@ -34,11 +34,11 @@
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 9999");
         cfs.forceBlockingFlush();
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime);
         cfs.forceMajorCompaction();
-        metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime);
     }
@@ -51,12 +51,12 @@
         execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 10000");
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
         cfs.forceBlockingFlush();
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime, 5);
@@ -70,12 +70,12 @@
         execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 10000");
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1");
         cfs.forceBlockingFlush();
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime, 5);
@@ -89,13 +89,13 @@
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
         cfs.forceBlockingFlush();
-        assertEquals(1, cfs.getSSTables().size());
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        assertEquals(1, cfs.getLiveSSTables().size());
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(System.currentTimeMillis()/1000, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
@@ -109,13 +109,13 @@
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1");
 
         cfs.forceBlockingFlush();
-        assertEquals(1, cfs.getSSTables().size());
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        assertEquals(1, cfs.getLiveSSTables().size());
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(System.currentTimeMillis()/1000, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
@@ -129,17 +129,18 @@
         execute("INSERT INTO %s (a) VALUES (1) USING TIMESTAMP 9999");
 
         cfs.forceBlockingFlush();
-        assertEquals(1, cfs.getSSTables().size());
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        assertEquals(1, cfs.getLiveSSTables().size());
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime);
         cfs.forceMajorCompaction();
-        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);
     }
+
     @Test
     public void testTrackMetadata_rowMarkerDelete() throws Throwable
     {
@@ -147,13 +148,13 @@
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a=1");
         cfs.forceBlockingFlush();
-        assertEquals(1, cfs.getSSTables().size());
-        StatsMetadata metadata = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        assertEquals(1, cfs.getLiveSSTables().size());
+        StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(9999, metadata.maxTimestamp);
         assertEquals(System.currentTimeMillis()/1000, metadata.maxLocalDeletionTime, 5);
         cfs.forceMajorCompaction();
-        StatsMetadata metadata2 = cfs.getSSTables().iterator().next().getSSTableMetadata();
+        StatsMetadata metadata2 = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(metadata.maxLocalDeletionTime, metadata2.maxLocalDeletionTime);
         assertEquals(metadata.minTimestamp, metadata2.minTimestamp);
         assertEquals(metadata.maxTimestamp, metadata2.maxTimestamp);

diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
new file mode 100644
index 0000000..2cf518a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java

@@ -0,0 +1,136 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.cql3.validation.miscellaneous;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.metrics.ClearableHistogram;
+
+/**
+ * Tests for checking how many sstables we access during cql queries.
+ */
+public class SSTablesIteratedTest extends CQLTester
+{
+    private void executeAndCheck(String query, int numSSTables, Object[]... rows) throws Throwable
+    {
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore(KEYSPACE_PER_TEST);
+
+        ((ClearableHistogram) cfs.metric.sstablesPerReadHistogram.cf).clear(); // resets counts
+
+        assertRows(execute(query), rows);
+
+        long numSSTablesIterated = cfs.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(); // max sstables read
+        assertEquals(String.format("Expected %d sstables iterated but got %d instead, with %d live sstables",
+                                   numSSTables, numSSTablesIterated, cfs.getLiveSSTables().size()),
+                     numSSTables,
+                     numSSTablesIterated);
+    }
+
+    @Override
+    protected String createTable(String query)
+    {
+        String ret = super.createTable(KEYSPACE_PER_TEST, query);
+        disableCompaction(KEYSPACE_PER_TEST);
+        return ret;
+    }
+
+    @Override
+    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    {
+        return executeFormattedQuery(formatQuery(KEYSPACE_PER_TEST, query), values);
+    }
+
+    @Override
+    public void flush()
+    {
+        super.flush(KEYSPACE_PER_TEST);
+    }
+
+    @Test
+    public void testSinglePartitionQuery() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, c int, v text, PRIMARY KEY (pk, c))");
+
+        execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", 1, 40, "41");
+        execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", 2, 10, "12");
+        flush();
+
+        execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", 1, 10, "11");
+        execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", 3, 30, "33");
+        flush();
+
+        execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", 1, 20, "21");
+        execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", 2, 40, "42");
+        execute("UPDATE %s SET v = '12' WHERE pk = 2 AND c = 10");
+        flush();
+
+        // Test with all the table being merged
+        executeAndCheck("SELECT * FROM %s WHERE pk = 1", 3,
+                        row(1, 10, "11"),
+                        row(1, 20, "21"),
+                        row(1, 40, "41"));
+
+        // Test with only 2 of the 3 SSTables being merged
+        executeAndCheck("SELECT * FROM %s WHERE pk = 2", 2,
+                        row(2, 10, "12"),
+                        row(2, 40, "42"));
+
+        executeAndCheck("SELECT * FROM %s WHERE pk = 2 ORDER BY c DESC", 2,
+                        row(2, 40, "42"),
+                        row(2, 10, "12"));
+
+        // Test with only 2 of the 3 SSTables being merged and a Slice filter
+        executeAndCheck("SELECT * FROM %s WHERE pk = 2 AND c > 20", 2,
+                        row(2, 40, "42"));
+
+        executeAndCheck("SELECT * FROM %s WHERE pk = 2 AND c > 20 ORDER BY c DESC", 2,
+                        row(2, 40, "42"));
+
+        // Test with only 2 of the 3 SSTables being merged and a Name filter
+        // This test checks the SinglePartitionReadCommand::queryMemtableAndSSTablesInTimestampOrder which is only
+        // used for ClusteringIndexNamesFilter when there are no multi-cell columns
+        executeAndCheck("SELECT * FROM %s WHERE pk = 2 AND c = 10", 2,
+                        row(2, 10, "12"));
+
+        // For partition range queries the metric must not be updated. The reason being that range queries simply
+        // scan all the SSTables containing data within the partition range. Due to that they might pollute the metric
+        // and give a wrong view of the system.
+        executeAndCheck("SELECT * FROM %s", 0,
+                        row(1, 10, "11"),
+                        row(1, 20, "21"),
+                        row(1, 40, "41"),
+                        row(2, 10, "12"),
+                        row(2, 40, "42"),
+                        row(3, 30, "33"));
+
+        executeAndCheck("SELECT * FROM %s WHERE token(pk) = token(1)", 0,
+                        row(1, 10, "11"),
+                        row(1, 20, "21"),
+                        row(1, 40, "41"));
+
+        assertInvalidMessage("ORDER BY is only supported when the partition key is restricted by an EQ or an IN",
+                             "SELECT * FROM %s WHERE token(pk) = token(1) ORDER BY C DESC");
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java
index e7f47a2..b5db77e 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java

@@ -18,27 +18,47 @@
 package org.apache.cassandra.cql3.validation.operations;
 
 import java.math.BigDecimal;
+import java.math.MathContext;
+import java.math.BigInteger;
+import java.math.RoundingMode;
 import java.nio.ByteBuffer;
 import java.text.SimpleDateFormat;
+import java.util.Arrays;
 import java.util.Calendar;
+import java.util.Collections;
 import java.util.Date;
+import java.util.Locale;
 import java.util.TimeZone;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.DoubleStream;
 
 import org.apache.commons.lang3.time.DateUtils;
 
 import org.junit.Test;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import ch.qos.logback.classic.LoggerContext;
+import ch.qos.logback.classic.spi.TurboFilterList;
+import ch.qos.logback.classic.turbo.ReconfigureOnChangeFilter;
+import ch.qos.logback.classic.turbo.TurboFilter;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.UntypedResultSet.Row;
-import org.apache.cassandra.cql3.functions.Functions;
 import org.apache.cassandra.cql3.functions.UDAggregate;
 import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.DynamicCompositeType;
+import org.apache.cassandra.db.marshal.TypeParser;
 import org.apache.cassandra.exceptions.FunctionExecutionException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.transport.Event;
+import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.transport.messages.ResultMessage;
 
 import static org.junit.Assert.assertEquals;
@@ -51,10 +71,10 @@
     @Test
     public void testNonExistingOnes() throws Throwable
     {
-        assertInvalidMessage("Cannot drop non existing aggregate", "DROP AGGREGATE " + KEYSPACE + ".aggr_does_not_exist");
-        assertInvalidMessage("Cannot drop non existing aggregate", "DROP AGGREGATE " + KEYSPACE + ".aggr_does_not_exist(int,text)");
-        assertInvalidMessage("Cannot drop non existing aggregate", "DROP AGGREGATE keyspace_does_not_exist.aggr_does_not_exist");
-        assertInvalidMessage("Cannot drop non existing aggregate", "DROP AGGREGATE keyspace_does_not_exist.aggr_does_not_exist(int,text)");
+        assertInvalidThrowMessage("Cannot drop non existing aggregate", InvalidRequestException.class, "DROP AGGREGATE " + KEYSPACE + ".aggr_does_not_exist");
+        assertInvalidThrowMessage("Cannot drop non existing aggregate", InvalidRequestException.class, "DROP AGGREGATE " + KEYSPACE + ".aggr_does_not_exist(int,text)");
+        assertInvalidThrowMessage("Cannot drop non existing aggregate", InvalidRequestException.class, "DROP AGGREGATE keyspace_does_not_exist.aggr_does_not_exist");
+        assertInvalidThrowMessage("Cannot drop non existing aggregate", InvalidRequestException.class, "DROP AGGREGATE keyspace_does_not_exist.aggr_does_not_exist(int,text)");
 
         execute("DROP AGGREGATE IF EXISTS " + KEYSPACE + ".aggr_does_not_exist");
         execute("DROP AGGREGATE IF EXISTS " + KEYSPACE + ".aggr_does_not_exist(int,text)");
@@ -387,7 +407,7 @@
         assertRows(execute("SELECT " + copySign + "(c, d) FROM %s"), row(1.2), row(-1.3), row(1.4));
         assertRows(execute("SELECT max(" + copySign + "(c, d)) FROM %s"), row(1.4));
         assertRows(execute("SELECT " + copySign + "(c, max(c)) FROM %s"), row(1.2));
-        assertRows(execute("SELECT " + copySign + "(max(c), c) FROM %s"), row(-1.4));
+        assertRows(execute("SELECT " + copySign + "(max(c), c) FROM %s"), row(-1.4));;
     }
 
     @Test
@@ -863,6 +883,9 @@
                                    "FINALFUNC " + shortFunctionName(fFinal) + " " +
                                    "INITCOND 42");
 
+        assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(a)),
+                   row("42"));
+
         // 42 + 1 + 2 + 3 = 48
         assertRows(execute("SELECT " + a + "(b) FROM %s"), row("48"));
 
@@ -907,6 +930,51 @@
     }
 
     @Test
+    public void testJavaAggregateEmpty() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int primary key, b int)");
+
+        String fState = createFunction(KEYSPACE,
+                                       "int, int",
+                                       "CREATE FUNCTION %s(a int, b int) " +
+                                       "CALLED ON NULL INPUT " +
+                                       "RETURNS int " +
+                                       "LANGUAGE java " +
+                                       "AS 'return Integer.valueOf((a!=null?a.intValue():0) + b.intValue());'");
+
+        String a = createAggregate(KEYSPACE,
+                                   "int, int",
+                                   "CREATE AGGREGATE %s(int) " +
+                                   "SFUNC " + shortFunctionName(fState) + " " +
+                                   "STYPE int");
+
+        assertRows(execute("SELECT " + a + "(b) FROM %s"), row(new Object[]{null}));
+    }
+
+    @Test
+    public void testJavaAggregateStateEmpty() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int primary key, b uuid)");
+
+        String fState = createFunction(KEYSPACE,
+                                       "int, int",
+                                       "CREATE FUNCTION %s(state map<uuid, int>, type uuid) " +
+                                       "RETURNS NULL ON NULL INPUT " +
+                                       "RETURNS map<uuid, int> " +
+                                       "LANGUAGE java " +
+                                       "AS 'return state;'");
+
+        String a = createAggregate(KEYSPACE,
+                                   "int, int",
+                                   "CREATE AGGREGATE %s(uuid) " +
+                                   "SFUNC " + shortFunctionName(fState) + " " +
+                                   "STYPE map<uuid, int> " +
+                                   "INITCOND {}");
+
+        assertRows(execute("SELECT " + a + "(b) FROM %s"), row(Collections.emptyMap()));
+    }
+
+    @Test
     public void testJavaAggregateComplex() throws Throwable
     {
         createTable("CREATE TABLE %s (a int primary key, b int)");
@@ -950,6 +1018,9 @@
                                    "FINALFUNC " + shortFunctionName(fFinal) + " " +
                                    "INITCOND (0, 0)");
 
+        assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(a)),
+                   row("(0, 0)"));
+
         // 1 + 2 + 3 = 6 / 3 = 2
         assertRows(execute("SELECT " + a + "(b) FROM %s"), row(2d));
 
@@ -1263,10 +1334,16 @@
                                    "SFUNC " + shortFunctionName(fState) + " " +
                                    "STYPE int ");
 
-        UDAggregate f = (UDAggregate) Functions.find(parseFunctionName(a)).get(0);
+        KeyspaceMetadata ksm = Schema.instance.getKSMetaData(keyspace());
+        UDAggregate f = (UDAggregate) ksm.functions.get(parseFunctionName(a)).iterator().next();
 
-        Functions.addOrReplaceFunction(UDAggregate.createBroken(f.name(), f.argTypes(), f.returnType(),
-                                                                null, new InvalidRequestException("foo bar is broken")));
+        UDAggregate broken = UDAggregate.createBroken(f.name(),
+                                                      f.argTypes(),
+                                                      f.returnType(),
+                                                      null,
+                                                      new InvalidRequestException("foo bar is broken"));
+
+        Schema.instance.setKeyspaceMetadata(ksm.withSwapped(ksm.functions.without(f.name(), f.argTypes()).with(broken)));
 
         assertInvalidThrowMessage("foo bar is broken", InvalidRequestException.class,
                                   "SELECT " + a + "(val) FROM %s");
@@ -1412,6 +1489,9 @@
                                              "FINALFUNC " + parseFunctionName(fFinal).name + ' ' +
                                              "INITCOND null");
 
+        assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(aggregation)),
+                   row((Object) null));
+
         assertRows(execute("SELECT " + aggregation + "(b) FROM %s"),
                    row(set(7, 8, 9)));
 
@@ -1648,6 +1728,9 @@
                                       "FINALFUNC " + shortFunctionName(fCONf) + ' ' +
                                       "INITCOND ''");
 
+        assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(aCON)),
+                   row("''"));
+
         String fRNON = createFunction(KEYSPACE,
                                       "text, text",
                                       "CREATE FUNCTION %s(a text, b text) " +
@@ -1686,6 +1769,116 @@
     }
 
     @Test
+    public void testEmptyListAndNullInitcond() throws Throwable
+    {
+        String f = createFunction(KEYSPACE,
+                                      "list, int",
+                                      "CREATE FUNCTION %s(s list<text>, i int) " +
+                                      "CALLED ON NULL INPUT " +
+                                      "RETURNS list<text> " +
+                                      "LANGUAGE java " +
+                                      "AS 'if (i != null) s.add(String.valueOf(i)); return s;'");
+
+        String a = createAggregate(KEYSPACE,
+                                       "int",
+                                       "CREATE AGGREGATE %s(int) " +
+                                       "SFUNC " + shortFunctionName(f) + ' ' +
+                                       "STYPE list<text> " +
+                                       "INITCOND [  ]");
+
+        assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(a)),
+                   row("[]"));
+
+        createTable("CREATE TABLE %s (a int primary key, b int)");
+        execute("INSERT INTO %s (a, b) VALUES (1, 1)");
+        execute("INSERT INTO %s (a, b) VALUES (2, null)");
+        execute("INSERT INTO %s (a, b) VALUES (3, 2)");
+        assertRows(execute("SELECT " + a + "(b) FROM %s"), row(Arrays.asList("1", "2")));
+    }
+
+    @Test
+    public void testLogbackReload() throws Throwable
+    {
+        // see https://issues.apache.org/jira/browse/CASSANDRA-11033
+
+        // make logback's scan interval 1ms - boilerplate, but necessary for this test
+        configureLogbackScanPeriod(1L);
+        try
+        {
+
+            createTable("CREATE TABLE %s (" +
+                        "   year int PRIMARY KEY," +
+                        "   country text," +
+                        "   title text)");
+
+            String[] countries = Locale.getISOCountries();
+            ThreadLocalRandom rand = ThreadLocalRandom.current();
+            for (int i = 0; i < 10000; i++)
+            {
+                execute("INSERT INTO %s (year, country, title) VALUES (1980,?,?)",
+                        countries[rand.nextInt(countries.length)],
+                        "title-" + i);
+            }
+
+            String albumCountByCountry = createFunction(KEYSPACE,
+                                                        "map<text,bigint>,text,text",
+                                                        "CREATE FUNCTION IF NOT EXISTS %s(state map<text,bigint>,country text, album_title text)\n" +
+                                                        " RETURNS NULL ON NULL INPUT\n" +
+                                                        " RETURNS map<text,bigint>\n" +
+                                                        " LANGUAGE java\n" +
+                                                        " AS $$\n" +
+                                                        "   if(state.containsKey(country)) {\n" +
+                                                        "       Long newCount = (Long)state.get(country) + 1;\n" +
+                                                        "       state.put(country, newCount);\n" +
+                                                        "   } else {\n" +
+                                                        "       state.put(country, 1L);\n" +
+                                                        "   }\n" +
+                                                        "   return state;\n" +
+                                                        " $$;");
+
+            String releasesByCountry = createAggregate(KEYSPACE,
+                                                       "text, text",
+                                                       " CREATE AGGREGATE IF NOT EXISTS %s(text, text)\n" +
+                                                       " SFUNC " + shortFunctionName(albumCountByCountry) + '\n' +
+                                                       " STYPE map<text,bigint>\n" +
+                                                       " INITCOND { };");
+
+            long tEnd = System.currentTimeMillis() + 150;
+            while (System.currentTimeMillis() < tEnd)
+            {
+                execute("SELECT " + releasesByCountry + "(country,title) FROM %s WHERE year=1980");
+            }
+        }
+        finally
+        {
+            configureLogbackScanPeriod(60000L);
+        }
+    }
+
+    private static void configureLogbackScanPeriod(long millis)
+    {
+        Logger l = LoggerFactory.getLogger(AggregationTest.class);
+        ch.qos.logback.classic.Logger logbackLogger = (ch.qos.logback.classic.Logger) l;
+        LoggerContext ctx = logbackLogger.getLoggerContext();
+        TurboFilterList turboFilterList = ctx.getTurboFilterList();
+        boolean done = false;
+        for (TurboFilter turboFilter : turboFilterList)
+        {
+            if (turboFilter instanceof ReconfigureOnChangeFilter)
+            {
+                ReconfigureOnChangeFilter reconfigureFilter = (ReconfigureOnChangeFilter) turboFilter;
+                reconfigureFilter.setContext(ctx);
+                reconfigureFilter.setRefreshPeriod(millis);
+                reconfigureFilter.stop();
+                reconfigureFilter.start(); // start() sets the next check timestammp
+                done = true;
+                break;
+            }
+        }
+        assertTrue("ReconfigureOnChangeFilter not in logback's turbo-filter list - do that by adding scan=\"true\" to logback-test.xml's configuration element", done);
+    }
+
+    @Test
     public void testOrReplaceOptionals() throws Throwable
     {
         String fState = createFunction(KEYSPACE,
@@ -1717,26 +1910,221 @@
                            "STYPE list<text> ";
 
         // Test replacing INITCOND
-        for (String condition : new String[]{"", "INITCOND null"})
-        {
-            execute(ddlPrefix + "INITCOND [  ] ");
-            checkOptionals(a, null, ByteBuffer.allocate(4));
+        execute(ddlPrefix + "INITCOND [  ] ");
+        checkOptionals(a, null, "[]");
 
-            execute(ddlPrefix + condition);
-            checkOptionals(a, null, null);
-        }
+        execute(ddlPrefix);
+        checkOptionals(a, null, null);
+
+        execute(ddlPrefix + "INITCOND [  ] ");
+        checkOptionals(a, null, "[]");
+
+        execute(ddlPrefix + "INITCOND null");
+        checkOptionals(a, null, null);
 
         // Test replacing FINALFUNC
-        execute(ddlPrefix + "FINALFUNC " + shortFunctionName(fFinal) + " ");
+        execute(ddlPrefix + "FINALFUNC " + shortFunctionName(fFinal) + ' ');
         checkOptionals(a, shortFunctionName(fFinal), null);
 
         execute(ddlPrefix);
         checkOptionals(a, null, null);
     }
 
-    private void checkOptionals(String aggregateName, String finalFunc, ByteBuffer initCond) throws Throwable
+    private void checkOptionals(String aggregateName, String finalFunc, String initCond) throws Throwable
     {
-        assertRows(execute("SELECT final_func, initcond FROM system.schema_aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(aggregateName)),
+        assertRows(execute("SELECT final_func, initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, shortFunctionName(aggregateName)),
                    row(finalFunc, initCond));
     }
+
+    public void testCustomTypeInitcond() throws Throwable
+    {
+        try
+        {
+            String type = "DynamicCompositeType(s => UTF8Type, i => Int32Type)";
+
+            executeNet(Server.CURRENT_VERSION,
+                       "CREATE FUNCTION " + KEYSPACE + ".f11064(i 'DynamicCompositeType(s => UTF8Type, i => Int32Type)')\n" +
+                       "RETURNS NULL ON NULL INPUT\n" +
+                       "RETURNS '" + type + "'\n" +
+                       "LANGUAGE java\n" +
+                       "AS 'return i;'");
+
+            // create aggregate using the 'composite syntax' for composite types
+            executeNet(Server.CURRENT_VERSION,
+                       "CREATE AGGREGATE " + KEYSPACE + ".a11064()\n" +
+                       "SFUNC f11064 " +
+                       "STYPE '" + type + "'\n" +
+                       "INITCOND 's@foo:i@32'");
+
+            AbstractType<?> compositeType = TypeParser.parse(type);
+            ByteBuffer compositeTypeValue = compositeType.fromString("s@foo:i@32");
+            String compositeTypeString = compositeType.asCQL3Type().toCQLLiteral(compositeTypeValue, Server.CURRENT_VERSION);
+            // ensure that the composite type is serialized using the 'blob syntax'
+            assertTrue(compositeTypeString.startsWith("0x"));
+
+            // ensure that the composite type is 'serialized' using the 'blob syntax' in the schema
+            assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, "a11064"),
+                       row(compositeTypeString));
+
+            // create aggregate using the 'blob syntax' for composite types
+            executeNet(Server.CURRENT_VERSION,
+                       "CREATE AGGREGATE " + KEYSPACE + ".a11064_2()\n" +
+                       "SFUNC f11064 " +
+                       "STYPE '" + type + "'\n" +
+                       "INITCOND " + compositeTypeString);
+
+            // ensure that the composite type is 'serialized' using the 'blob syntax' in the schema
+            assertRows(execute("SELECT initcond FROM system_schema.aggregates WHERE keyspace_name=? AND aggregate_name=?", KEYSPACE, "a11064_2"),
+                       row(compositeTypeString));
+        }
+        finally
+        {
+            try
+            {
+                execute("DROP AGGREGATE " + KEYSPACE + ".a11064_2");
+            }
+            catch (Exception ignore)
+            {
+            }
+            try
+            {
+                execute("DROP AGGREGATE " + KEYSPACE + ".a11064");
+            }
+            catch (Exception ignore)
+            {
+            }
+            try
+            {
+                execute("DROP FUNCTION " + KEYSPACE + ".f11064");
+            }
+            catch (Exception ignore)
+            {
+            }
+        }
+    }
+
+    @Test
+    public void testArithmeticCorrectness() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, val decimal)");
+        execute("insert into %s (bucket, val) values (1, 0.25)");
+        execute("insert into %s (bucket, val) values (2, 0.25)");
+        execute("insert into %s (bucket, val) values (3, 0.5);");
+
+        BigDecimal a = new BigDecimal("0.25");
+        a = a.add(new BigDecimal("0.25"));
+        a = a.add(new BigDecimal("0.5"));
+        a = a.divide(new BigDecimal(3), RoundingMode.HALF_EVEN);
+
+        assertRows(execute("select avg(val) from %s where bucket in (1, 2, 3);"),
+                   row(a));
+    }
+
+    @Test
+    public void testAggregatesWithoutOverflow() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, v1 tinyint, v2 smallint, v3 int, v4 bigint, v5 varint)");
+        for (int i = 1; i <= 3; i++)
+            execute("insert into %s (bucket, v1, v2, v3, v4, v5) values (?, ?, ?, ?, ?, ?)", i,
+                    (byte) ((Byte.MAX_VALUE / 3) + i), (short) ((Short.MAX_VALUE / 3) + i), (Integer.MAX_VALUE / 3) + i, (Long.MAX_VALUE / 3) + i,
+                    BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.valueOf(i)));
+
+        assertRows(execute("select avg(v1), avg(v2), avg(v3), avg(v4), avg(v5) from %s where bucket in (1, 2, 3);"),
+                   row((byte) ((Byte.MAX_VALUE / 3) + 2), (short) ((Short.MAX_VALUE / 3) + 2), (Integer.MAX_VALUE / 3) + 2, (Long.MAX_VALUE / 3) + 2,
+                       BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.valueOf(2))));
+
+        for (int i = 1; i <= 3; i++)
+            execute("insert into %s (bucket, v1, v2, v3, v4, v5) values (?, ?, ?, ?, ?, ?)", i + 3,
+                    (byte) (100 + i), (short) (100 + i), 100 + i, 100L + i, BigInteger.valueOf(100 + i));
+
+        assertRows(execute("select avg(v1), avg(v2), avg(v3), avg(v4), avg(v5) from %s where bucket in (4, 5, 6);"),
+                   row((byte) 102, (short) 102, 102, 102L, BigInteger.valueOf(102)));
+    }
+
+    @Test
+    public void testAggregateOverflow() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, v1 tinyint, v2 smallint, v3 int, v4 bigint, v5 varint)");
+        for (int i = 1; i <= 3; i++)
+            execute("insert into %s (bucket, v1, v2, v3, v4, v5) values (?, ?, ?, ?, ?, ?)", i,
+                    Byte.MAX_VALUE, Short.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE, BigInteger.valueOf(Long.MAX_VALUE).multiply(BigInteger.valueOf(2)));
+
+        assertRows(execute("select avg(v1), avg(v2), avg(v3), avg(v4), avg(v5) from %s where bucket in (1, 2, 3);"),
+                   row(Byte.MAX_VALUE, Short.MAX_VALUE, Integer.MAX_VALUE, Long.MAX_VALUE, BigInteger.valueOf(Long.MAX_VALUE).multiply(BigInteger.valueOf(2))));
+
+        execute("truncate %s");
+
+        for (int i = 1; i <= 3; i++)
+            execute("insert into %s (bucket, v1, v2, v3, v4, v5) values (?, ?, ?, ?, ?, ?)", i,
+                    Byte.MIN_VALUE, Short.MIN_VALUE, Integer.MIN_VALUE, Long.MIN_VALUE, BigInteger.valueOf(Long.MIN_VALUE).multiply(BigInteger.valueOf(2)));
+
+        assertRows(execute("select avg(v1), avg(v2), avg(v3), avg(v4), avg(v5) from %s where bucket in (1, 2, 3);"),
+                   row(Byte.MIN_VALUE, Short.MIN_VALUE, Integer.MIN_VALUE, Long.MIN_VALUE, BigInteger.valueOf(Long.MIN_VALUE).multiply(BigInteger.valueOf(2))));
+
+    }
+
+    @Test
+    public void testDoubleAggregatesPrecision() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, v1 float, v2 double, v3 decimal)");
+
+        for (int i = 1; i <= 3; i++)
+            execute("insert into %s (bucket, v1, v2, v3) values (?, ?, ?, ?)", i,
+                    Float.MAX_VALUE, Double.MAX_VALUE, BigDecimal.valueOf(Double.MAX_VALUE).add(BigDecimal.valueOf(2)));
+
+        assertRows(execute("select avg(v1), avg(v2), avg(v3) from %s where bucket in (1, 2, 3);"),
+                   row(Float.MAX_VALUE, Double.MAX_VALUE, BigDecimal.valueOf(Double.MAX_VALUE).add(BigDecimal.valueOf(2))));
+
+        execute("insert into %s (bucket, v1, v2, v3) values (?, ?, ?, ?)", 4, (float) 100.10, 100.10, BigDecimal.valueOf(100.10));
+        execute("insert into %s (bucket, v1, v2, v3) values (?, ?, ?, ?)", 5, (float) 110.11, 110.11, BigDecimal.valueOf(110.11));
+        execute("insert into %s (bucket, v1, v2, v3) values (?, ?, ?, ?)", 6, (float) 120.12, 120.12, BigDecimal.valueOf(120.12));
+
+        assertRows(execute("select avg(v1), avg(v2), avg(v3) from %s where bucket in (4, 5, 6);"),
+                   row((float) 110.11, 110.11, BigDecimal.valueOf(110.11)));
+    }
+
+    @Test
+    public void testNan() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, v1 float, v2 double)");
+
+        for (int i = 1; i <= 10; i++)
+            if (i != 5)
+                execute("insert into %s (bucket, v1, v2) values (?, ?, ?)", i, (float) i, (double) i);
+
+        execute("insert into %s (bucket, v1, v2) values (?, ?, ?)", 5, Float.NaN, Double.NaN);
+
+        assertRows(execute("select avg(v1), avg(v2) from %s where bucket in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);"),
+                   row(Float.NaN, Double.NaN));
+    }
+
+    @Test
+    public void testInfinity() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, v1 float, v2 double)");
+        for (boolean positive: new boolean[] { true, false})
+        {
+            final float FLOAT_INFINITY = positive ? Float.POSITIVE_INFINITY : Float.NEGATIVE_INFINITY;
+            final double DOUBLE_INFINITY = positive ? Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY;
+
+            for (int i = 1; i <= 10; i++)
+                if (i != 5)
+                    execute("insert into %s (bucket, v1, v2) values (?, ?, ?)", i, (float) i, (double) i);
+
+            execute("insert into %s (bucket, v1, v2) values (?, ?, ?)", 5, FLOAT_INFINITY, DOUBLE_INFINITY);
+
+            assertRows(execute("select avg(v1), avg(v2) from %s where bucket in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);"),
+                       row(FLOAT_INFINITY, DOUBLE_INFINITY));
+            execute("truncate %s");
+        }
+    }
+
+    @Test
+    public void testSumPrecision() throws Throwable
+    {
+        createTable("create table %s (bucket int primary key, v1 float, v2 double, v3 decimal)");
+
+        for (int i = 1; i <= 17; i++)
+            execute("insert into %s (bucket, v1, v2, v3) values (?, ?, ?, ?)", i, (float) (i / 10.0), i / 10.0, BigDecimal.valueOf(i / 10.0));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
index 2cc8a18..1efefc3 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java

@@ -17,15 +17,19 @@
  */
 package org.apache.cassandra.cql3.validation.operations;
 
+import org.junit.Assert;
+import org.junit.Test;
+
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.IntegerType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.schema.SchemaKeyspace;
 
-import org.junit.Test;
-
+import static java.lang.String.format;
 import static org.junit.Assert.assertEquals;
 
 public class AlterTest extends CQLTester
@@ -49,6 +53,7 @@
 
         assertRows(execute("SELECT * FROM %s;"), row("test", "first test"));
     }
+
     @Test
     public void testAddMap() throws Throwable
     {
@@ -81,6 +86,7 @@
         execute("UPDATE %s set myCollection = ['second element'] WHERE id = 'test';");
         assertRows(execute("SELECT * FROM %s;"), row("test", "first test", list("second element")));
     }
+
     @Test
     public void testDropListAndAddMapWithSameName() throws Throwable
     {
@@ -92,6 +98,77 @@
     }
 
     @Test
+    public void testDropWithTimestamp() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id int, c1 int, v1 int, todrop int, PRIMARY KEY (id, c1));");
+        for (int i = 0; i < 5; i++)
+            execute("INSERT INTO %s (id, c1, v1, todrop) VALUES (?, ?, ?, ?) USING TIMESTAMP ?", 1, i, i, i, 10000L * i);
+
+        // flush is necessary since otherwise the values of `todrop` will get discarded during
+        // alter statement
+        flush();
+        execute("ALTER TABLE %s DROP todrop USING TIMESTAMP 20000;");
+        execute("ALTER TABLE %s ADD todrop int;");
+        execute("INSERT INTO %s (id, c1, v1, todrop) VALUES (?, ?, ?, ?) USING TIMESTAMP ?", 1, 100, 100, 100, 30000L);
+        assertRows(execute("SELECT id, c1, v1, todrop FROM %s"),
+                   row(1, 0, 0, null),
+                   row(1, 1, 1, null),
+                   row(1, 2, 2, null),
+                   row(1, 3, 3, 3),
+                   row(1, 4, 4, 4),
+                   row(1, 100, 100, 100));
+    }
+
+    @Test
+    public void testDropStaticWithTimestamp() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id int, c1 int, v1 int, todrop int static, PRIMARY KEY (id, c1));");
+        for (int i = 0; i < 5; i++)
+            execute("INSERT INTO %s (id, c1, v1, todrop) VALUES (?, ?, ?, ?) USING TIMESTAMP ?", 1, i, i, i, 10000L * i);
+
+        // flush is necessary since otherwise the values of `todrop` will get discarded during
+        // alter statement
+        flush();
+        execute("ALTER TABLE %s DROP todrop USING TIMESTAMP 20000;");
+        execute("ALTER TABLE %s ADD todrop int static;");
+        execute("INSERT INTO %s (id, c1, v1, todrop) VALUES (?, ?, ?, ?) USING TIMESTAMP ?", 1, 100, 100, 100, 30000L);
+        // static column value with largest timestmap will be available again
+        assertRows(execute("SELECT id, c1, v1, todrop FROM %s"),
+                   row(1, 0, 0, 4),
+                   row(1, 1, 1, 4),
+                   row(1, 2, 2, 4),
+                   row(1, 3, 3, 4),
+                   row(1, 4, 4, 4),
+                   row(1, 100, 100, 4));
+    }
+
+    @Test
+    public void testDropMultipleWithTimestamp() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id int, c1 int, v1 int, todrop1 int, todrop2 int, PRIMARY KEY (id, c1));");
+        for (int i = 0; i < 5; i++)
+            execute("INSERT INTO %s (id, c1, v1, todrop1, todrop2) VALUES (?, ?, ?, ?, ?) USING TIMESTAMP ?", 1, i, i, i, i, 10000L * i);
+
+        // flush is necessary since otherwise the values of `todrop1` and `todrop2` will get discarded during
+        // alter statement
+        flush();
+        execute("ALTER TABLE %s DROP todrop1 USING TIMESTAMP 20000;");
+        execute("ALTER TABLE %s DROP todrop2 USING TIMESTAMP 20000;");
+        execute("ALTER TABLE %s ADD todrop1 int;");
+        execute("ALTER TABLE %s ADD todrop2 int;");
+
+        execute("INSERT INTO %s (id, c1, v1, todrop1, todrop2) VALUES (?, ?, ?, ?, ?) USING TIMESTAMP ?", 1, 100, 100, 100, 100, 40000L);
+        assertRows(execute("SELECT id, c1, v1, todrop1, todrop2 FROM %s"),
+                   row(1, 0, 0, null, null),
+                   row(1, 1, 1, null, null),
+                   row(1, 2, 2, null, null),
+                   row(1, 3, 3, 3, 3),
+                   row(1, 4, 4, 4, 4),
+                   row(1, 100, 100, 100, 100));
+    }
+
+
+    @Test
     public void testChangeStrategyWithUnquotedAgrument() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY);");
@@ -108,12 +185,12 @@
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
         alterTable("ALTER TABLE %s WITH min_index_interval=256 AND max_index_interval=512");
-        assertEquals(256, cfs.metadata.getMinIndexInterval());
-        assertEquals(512, cfs.metadata.getMaxIndexInterval());
+        assertEquals(256, cfs.metadata.params.minIndexInterval);
+        assertEquals(512, cfs.metadata.params.maxIndexInterval);
 
-        alterTable("ALTER TABLE %s WITH caching = 'none'");
-        assertEquals(256, cfs.metadata.getMinIndexInterval());
-        assertEquals(512, cfs.metadata.getMaxIndexInterval());
+        alterTable("ALTER TABLE %s WITH caching = {}");
+        assertEquals(256, cfs.metadata.params.minIndexInterval);
+        assertEquals(512, cfs.metadata.params.maxIndexInterval);
     }
 
     /**
@@ -125,35 +202,33 @@
         assertInvalidThrow(SyntaxException.class, "CREATE KEYSPACE ks1");
         assertInvalidThrow(ConfigurationException.class, "CREATE KEYSPACE ks1 WITH replication= { 'replication_factor' : 1 }");
 
-        execute("CREATE KEYSPACE ks1 WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }");
-        execute("CREATE KEYSPACE ks2 WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 } AND durable_writes=false");
+        String ks1 = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }");
+        String ks2 = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 } AND durable_writes=false");
 
-        assertRows(execute("SELECT keyspace_name, durable_writes FROM system.schema_keyspaces"),
-                   row("ks1", true),
+        assertRowsIgnoringOrderAndExtra(execute("SELECT keyspace_name, durable_writes FROM system_schema.keyspaces"),
                    row(KEYSPACE, true),
                    row(KEYSPACE_PER_TEST, true),
-                   row("ks2", false));
+                   row(ks1, true),
+                   row(ks2, false));
 
-        execute("ALTER KEYSPACE ks1 WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 1 } AND durable_writes=False");
-        execute("ALTER KEYSPACE ks2 WITH durable_writes=true");
+        schemaChange("ALTER KEYSPACE " + ks1 + " WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 1 } AND durable_writes=False");
+        schemaChange("ALTER KEYSPACE " + ks2 + " WITH durable_writes=true");
 
-        assertRows(execute("SELECT keyspace_name, durable_writes, strategy_class FROM system.schema_keyspaces"),
-                   row("ks1", false, "org.apache.cassandra.locator.NetworkTopologyStrategy"),
-                   row(KEYSPACE, true, "org.apache.cassandra.locator.SimpleStrategy"),
-                   row(KEYSPACE_PER_TEST, true, "org.apache.cassandra.locator.SimpleStrategy"),
-                   row("ks2", true, "org.apache.cassandra.locator.SimpleStrategy"));
+        assertRowsIgnoringOrderAndExtra(execute("SELECT keyspace_name, durable_writes, replication FROM system_schema.keyspaces"),
+                   row(KEYSPACE, true, map("class", "org.apache.cassandra.locator.SimpleStrategy", "replication_factor", "1")),
+                   row(KEYSPACE_PER_TEST, true, map("class", "org.apache.cassandra.locator.SimpleStrategy", "replication_factor", "1")),
+                   row(ks1, false, map("class", "org.apache.cassandra.locator.NetworkTopologyStrategy", "dc1", "1")),
+                   row(ks2, true, map("class", "org.apache.cassandra.locator.SimpleStrategy", "replication_factor", "1")));
 
-        execute("USE ks1");
+        execute("USE " + ks1);
 
         assertInvalidThrow(ConfigurationException.class, "CREATE TABLE cf1 (a int PRIMARY KEY, b int) WITH compaction = { 'min_threshold' : 4 }");
 
         execute("CREATE TABLE cf1 (a int PRIMARY KEY, b int) WITH compaction = { 'class' : 'SizeTieredCompactionStrategy', 'min_threshold' : 7 }");
-        assertRows(execute("SELECT columnfamily_name, min_compaction_threshold FROM system.schema_columnfamilies WHERE keyspace_name='ks1'"),
-                   row("cf1", 7));
-
-        // clean-up
-        execute("DROP KEYSPACE ks1");
-        execute("DROP KEYSPACE ks2");
+        assertRows(execute("SELECT table_name, compaction FROM system_schema.tables WHERE keyspace_name='" + ks1 + "'"),
+                   row("cf1", map("class", "org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy",
+                                  "min_threshold", "7",
+                                  "max_threshold", "32")));
     }
 
     /**
@@ -176,53 +251,281 @@
                    row(1, null, null, "111"));
     }
 
-    /**
-     * Test for 7744,
-     * migrated from cql_tests.py:TestCQL.downgrade_to_compact_bug_test()
-     */
-    @Test
-    public void testDowngradeToCompact() throws Throwable
+    @Test(expected = InvalidRequestException.class)
+    public void testDropComplexAddSimpleColumn() throws Throwable
     {
         createTable("create table %s (k int primary key, v set<text>)");
-        execute("insert into %s (k, v) VALUES (0, {'f'})");
-        flush();
         execute("alter table %s drop v");
-        execute("alter table %s add v int");
+        execute("alter table %s add v text");
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testDropSimpleAddComplexColumn() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v text)");
+        execute("alter table %s drop v");
+        execute("alter table %s add v set<text>");
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testDropMultiCellAddFrozenColumn() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v set<text>)");
+        execute("alter table %s drop v");
+        execute("alter table %s add v frozen<set<text>>");
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testDropFrozenAddMultiCellColumn() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v frozen<set<text>>)");
+        execute("alter table %s drop v");
+        execute("alter table %s add v set<text>");
+    }
+
+    @Test
+    public void testDropTimeUUIDAddUUIDColumn() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v timeuuid)");
+        execute("alter table %s drop v");
+        execute("alter table %s add v uuid");
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testDropUUIDAddTimeUUIDColumn() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v uuid)");
+        execute("alter table %s drop v");
+        execute("alter table %s add v timeuuid");
+    }
+
+    @Test
+    public void testDropAddSameType() throws Throwable
+    {
+        createTable("create table %s (k int primary key, v1 timeuuid, v2 set<uuid>, v3 frozen<list<text>>)");
+
+        execute("alter table %s drop v1");
+        execute("alter table %s add v1 timeuuid");
+
+        execute("alter table %s drop v2");
+        execute("alter table %s add v2 set<uuid>");
+
+        execute("alter table %s drop v3");
+        execute("alter table %s add v3 frozen<list<text>>");
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testDropRegularAddStatic() throws Throwable
+    {
+        createTable("create table %s (k int, c int, v uuid, PRIMARY KEY (k, c))");
+        execute("alter table %s drop v");
+        execute("alter table %s add v uuid static");
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testDropStaticAddRegular() throws Throwable
+    {
+        createTable("create table %s (k int, c int, v uuid static, PRIMARY KEY (k, c))");
+        execute("alter table %s drop v");
+        execute("alter table %s add v uuid");
+    }
+
+    @Test(expected = SyntaxException.class)
+    public void renameToEmptyTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 int, v int, PRIMARY KEY (k, c1))");
+        execute("ALTER TABLE %s RENAME c1 TO \"\"");
     }
 
     @Test
     // tests CASSANDRA-9565
     public void testDoubleWith() throws Throwable
     {
-        String[] stmts = new String[] { "ALTER KEYSPACE WITH WITH DURABLE_WRITES = true",
-                                        "ALTER KEYSPACE ks WITH WITH DURABLE_WRITES = true" };
+        String[] stmts = { "ALTER KEYSPACE WITH WITH DURABLE_WRITES = true",
+                           "ALTER KEYSPACE ks WITH WITH DURABLE_WRITES = true" };
 
         for (String stmt : stmts) {
             assertInvalidSyntaxMessage("no viable alternative at input 'WITH'", stmt);
         }
     }
 
+    @Test
+    public void testAlterTableWithCompression() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "64", "class", "org.apache.cassandra.io.compress.LZ4Compressor")));
+
+        execute("ALTER TABLE %s WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_in_kb' : 32 };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "32", "class", "org.apache.cassandra.io.compress.SnappyCompressor")));
+
+        execute("ALTER TABLE %s WITH compression = { 'sstable_compression' : 'LZ4Compressor', 'chunk_length_kb' : 64 };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "64", "class", "org.apache.cassandra.io.compress.LZ4Compressor")));
+
+        execute("ALTER TABLE %s WITH compression = { 'sstable_compression' : '', 'chunk_length_kb' : 32 };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("enabled", "false")));
+
+        execute("ALTER TABLE %s WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_in_kb' : 32 };");
+        execute("ALTER TABLE %s WITH compression = { 'enabled' : 'false'};");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("enabled", "false")));
+
+        assertThrowsConfigurationException("Missing sub-option 'class' for the 'compression' option.",
+                                           "ALTER TABLE %s WITH  compression = {'chunk_length_in_kb' : 32};");
+
+        assertThrowsConfigurationException("The 'class' option must not be empty. To disable compression use 'enabled' : false",
+                                           "ALTER TABLE %s WITH  compression = { 'class' : ''};");
+
+        assertThrowsConfigurationException("If the 'enabled' option is set to false no other options must be specified",
+                                           "ALTER TABLE %s WITH compression = { 'enabled' : 'false', 'class' : 'SnappyCompressor'};");
+
+        assertThrowsConfigurationException("The 'sstable_compression' option must not be used if the compression algorithm is already specified by the 'class' option",
+                                           "ALTER TABLE %s WITH compression = { 'sstable_compression' : 'SnappyCompressor', 'class' : 'SnappyCompressor'};");
+
+        assertThrowsConfigurationException("The 'chunk_length_kb' option must not be used if the chunk length is already specified by the 'chunk_length_in_kb' option",
+                                           "ALTER TABLE %s WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_kb' : 32 , 'chunk_length_in_kb' : 32 };");
+    }
+
+    private void assertThrowsConfigurationException(String errorMsg, String alterStmt) throws Throwable
+    {
+        try
+        {
+            execute(alterStmt);
+            Assert.fail("Query should be invalid but no error was thrown. Query is: " + alterStmt);
+        }
+        catch (ConfigurationException e)
+        {
+            assertEquals(errorMsg, e.getMessage());
+        }
+    }
+
     /**
-     * tests CASSANDRA-10027
+     * Test for CASSANDRA-13337. Checks that dropping a column when a sstable contains only data for that column
+     * works properly.
      */
     @Test
-    public void testAlterColumnTypeToDate() throws Throwable
+    public void testAlterDropEmptySSTable() throws Throwable
     {
-        createTable("CREATE TABLE %s (key int PRIMARY KEY, c1 int);");
-        execute("INSERT INTO %s (key, c1) VALUES (1,1);");
-        execute("ALTER TABLE %s ALTER c1 TYPE date;");
-        assertRows(execute("SELECT * FROM %s"), row(1, 1));
+        createTable("CREATE TABLE %s(k int PRIMARY KEY, x int, y int)");
 
-        createTable("CREATE TABLE %s (key int PRIMARY KEY, c1 varint);");
-        execute("INSERT INTO %s (key, c1) VALUES (1,1);");
-        assertInvalidMessage("Cannot change c1 from type varint to type date: types are incompatible.",
-                             "ALTER TABLE %s ALTER c1 TYPE date;");
+        execute("UPDATE %s SET x = 1 WHERE k = 0");
+
+        flush();
+
+        execute("UPDATE %s SET x = 1, y = 1 WHERE k = 0");
+
+        flush();
+
+        execute("ALTER TABLE %s DROP x");
+
+        compact();
+
+        assertRows(execute("SELECT * FROM %s"), row(0, 1));
     }
 
-    @Test // tests CASSANDRA-8879
-    public void testAlterClusteringColumnTypeInCompactTable() throws Throwable
+    /**
+     * Similarly to testAlterDropEmptySSTable, checks we don't return empty rows from queries (testAlterDropEmptySSTable
+     * tests the compaction case).
+     */
+    @Test
+    public void testAlterOnlyColumnBehaviorWithFlush() throws Throwable
     {
-        createTable("CREATE TABLE %s (key blob, column1 blob, value blob, PRIMARY KEY ((key), column1)) WITH COMPACT STORAGE");
-        assertInvalidThrow(InvalidRequestException.class, "ALTER TABLE %s ALTER column1 TYPE ascii");
+        testAlterOnlyColumnBehaviorWithFlush(true);
+        testAlterOnlyColumnBehaviorWithFlush(false);
     }
+
+    private void testAlterOnlyColumnBehaviorWithFlush(boolean flushAfterInsert) throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int PRIMARY KEY, x int, y int)");
+
+        execute("UPDATE %s SET x = 1 WHERE k = 0");
+
+        assertRows(execute("SELECT * FROM %s"), row(0, 1, null));
+
+        if (flushAfterInsert)
+            flush();
+
+        execute("ALTER TABLE %s DROP x");
+
+        assertEmpty(execute("SELECT * FROM %s"));
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testAlterWithCompactStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int) WITH COMPACT STORAGE");
+
+        assertInvalidMessage("Cannot rename unknown column column1 in keyspace",
+                             "ALTER TABLE %s RENAME column1 TO column2");
+
+        assertInvalidMessage("Cannot rename unknown column value in keyspace",
+                             "ALTER TABLE %s RENAME value TO value2");
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testAlterWithCompactNonStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        assertInvalidMessage("Cannot rename unknown column column1 in keyspace",
+                             "ALTER TABLE %s RENAME column1 TO column2");
+
+        createTable("CREATE TABLE %s (a int, b int, v int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        assertInvalidMessage("Cannot rename unknown column column1 in keyspace",
+                             "ALTER TABLE %s RENAME column1 TO column2");
+    }
+
+    @Test
+    public void testAlterTableAlterType() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, PRIMARY KEY (a,b)) WITH COMPACT STORAGE");
+        assertInvalidMessage(String.format("Compact value type can only be changed to BytesType, but %s was given.",
+                                           IntegerType.instance),
+                             "ALTER TABLE %s ALTER value TYPE 'org.apache.cassandra.db.marshal.IntegerType'");
+
+        execute("ALTER TABLE %s ALTER value TYPE 'org.apache.cassandra.db.marshal.BytesType'");
+
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a,b)) WITH COMPACT STORAGE");
+        assertInvalidMessage("Altering of types is not allowed",
+                             "ALTER TABLE %s ALTER c TYPE 'org.apache.cassandra.db.marshal.BytesType'");
+
+        createTable("CREATE TABLE %s (a int, value int, PRIMARY KEY (a,value)) WITH COMPACT STORAGE");
+        assertInvalidMessage("Altering of types is not allowed",
+                             "ALTER TABLE %s ALTER value TYPE 'org.apache.cassandra.db.marshal.IntegerType'");
+        execute("ALTER TABLE %s ALTER value1 TYPE 'org.apache.cassandra.db.marshal.BytesType'");
+    }
+
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/BatchTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/BatchTest.java
index 07117fd..87d0cde 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/BatchTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/BatchTest.java

@@ -91,12 +91,11 @@
         createTable("CREATE TABLE %s (k int PRIMARY KEY, s text, i int)");
 
         // test batch and update
-        String qualifiedTable = keyspace() + "." + currentTable();
         execute("BEGIN BATCH " +
-                "INSERT INTO %s (k, s, i) VALUES (100, 'batchtext', 7); " +
-                "INSERT INTO " + qualifiedTable + " (k, s, i) VALUES (111, 'batchtext', 7); " +
-                "UPDATE " + qualifiedTable + " SET s=?, i=? WHERE k = 100; " +
-                "UPDATE " + qualifiedTable + " SET s=?, i=? WHERE k=111; " +
+                "INSERT INTO %1$s (k, s, i) VALUES (100, 'batchtext', 7); " +
+                "INSERT INTO %1$s (k, s, i) VALUES (111, 'batchtext', 7); " +
+                "UPDATE %1$s SET s=?, i=? WHERE k = 100; " +
+                "UPDATE %1$s SET s=?, i=? WHERE k=111; " +
                 "APPLY BATCH;", null, unset(), unset(), null);
         assertRows(execute("SELECT k, s, i FROM %s where k in (100,111)"),
                    row(100, null, 7),
@@ -105,8 +104,250 @@
     }
 
     @Test
+    public void testBatchRangeDelete() throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                "clustering int," +
+                "value int," +
+                " PRIMARY KEY (partitionKey, clustering)) WITH COMPACT STORAGE");
+
+        int value = 0;
+        for (int partitionKey = 0; partitionKey < 4; partitionKey++)
+            for (int clustering1 = 0; clustering1 < 5; clustering1++)
+                execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (?, ?, ?)",
+                        partitionKey, clustering1, value++);
+
+        execute("BEGIN BATCH " +
+                "DELETE FROM %1$s WHERE partitionKey = 1;" +
+                "DELETE FROM %1$s WHERE partitionKey = 0 AND  clustering >= 4;" +
+                "DELETE FROM %1$s WHERE partitionKey = 0 AND clustering <= 0;" +
+                "DELETE FROM %1$s WHERE partitionKey = 2 AND clustering >= 0 AND clustering <= 3;" +
+                "DELETE FROM %1$s WHERE partitionKey = 2 AND clustering <= 3 AND clustering >= 4;" +
+                "DELETE FROM %1$s WHERE partitionKey = 3 AND (clustering) >= (3) AND (clustering) <= (6);" +
+                "APPLY BATCH;");
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(0, 1, 1),
+                   row(0, 2, 2),
+                   row(0, 3, 3),
+                   row(2, 4, 14),
+                   row(3, 0, 15),
+                   row(3, 1, 16),
+                   row(3, 2, 17));
+    }
+
+    @Test
+    public void testBatchUpdate() throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                "clustering_1 int," +
+                "value int," +
+                " PRIMARY KEY (partitionKey, clustering_1))");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 1, 1)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 2, 2)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 3, 3)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 4, 4)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 5, 5)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 6, 6)");
+
+        execute("BEGIN BATCH " +
+                "UPDATE %1$s SET value = 7 WHERE partitionKey = 0 AND clustering_1 = 1" +
+                "UPDATE %1$s SET value = 8 WHERE partitionKey = 0 AND (clustering_1) = (2)" +
+                "UPDATE %1$s SET value = 10 WHERE partitionKey = 0 AND clustering_1 IN (3, 4)" +
+                "UPDATE %1$s SET value = 20 WHERE partitionKey = 0 AND (clustering_1) IN ((5), (6))" +
+                "APPLY BATCH;");
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(0, 0, 0),
+                   row(0, 1, 7),
+                   row(0, 2, 8),
+                   row(0, 3, 10),
+                   row(0, 4, 10),
+                   row(0, 5, 20),
+                   row(0, 6, 20));
+    }
+
+    @Test
     public void testBatchEmpty() throws Throwable
     {
-        execute("BEGIN BATCH APPLY BATCH;");
+        assertEmpty(execute("BEGIN BATCH APPLY BATCH;"));
     }
+
+    @Test
+    public void testBatchMultipleTable() throws Throwable
+    {
+        String tbl1 = KEYSPACE + "." + createTableName();
+        String tbl2 = KEYSPACE + "." + createTableName();
+
+        schemaChange(String.format("CREATE TABLE %s (k1 int PRIMARY KEY, v11 int, v12 int)", tbl1));
+        schemaChange(String.format("CREATE TABLE %s (k2 int PRIMARY KEY, v21 int, v22 int)", tbl2));
+
+        execute("BEGIN BATCH " +
+                String.format("UPDATE %s SET v11 = 1 WHERE k1 = 0;", tbl1) +
+                String.format("UPDATE %s SET v12 = 2 WHERE k1 = 0;", tbl1) +
+                String.format("UPDATE %s SET v21 = 3 WHERE k2 = 0;", tbl2) +
+                String.format("UPDATE %s SET v22 = 4 WHERE k2 = 0;", tbl2) +
+                "APPLY BATCH;");
+
+        assertRows(execute(String.format("SELECT * FROM %s", tbl1)), row(0, 1, 2));
+        assertRows(execute(String.format("SELECT * FROM %s", tbl2)), row(0, 3, 4));
+
+        flush();
+
+        assertRows(execute(String.format("SELECT * FROM %s", tbl1)), row(0, 1, 2));
+        assertRows(execute(String.format("SELECT * FROM %s", tbl2)), row(0, 3, 4));
+    }
+
+    @Test
+    public void testBatchWithInRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a,b))");
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)",1,1,1);
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)",1,2,2);
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)",1,3,3);
+
+        for (String inClause : new String[] { "()", "(1, 2)"})
+        {
+            assertInvalidMessage("IN on the clustering key columns is not supported with conditional updates",
+                                 "BEGIN BATCH " +
+                                 "UPDATE %1$s SET c = 100 WHERE a = 1 AND b = 1 IF c = 1;" +
+                                 "UPDATE %1$s SET c = 200 WHERE a = 1 AND b IN " + inClause + " IF c = 1;" +
+                                 "APPLY BATCH");
+
+            assertInvalidMessage("IN on the clustering key columns is not supported with conditional deletions",
+                                 "BEGIN BATCH " +
+                                 "UPDATE %1$s SET c = 100 WHERE a = 1 AND b = 1 IF c = 1;" +
+                                 "DELETE FROM %1$s WHERE a = 1 AND b IN " + inClause + " IF c = 1;" +
+                                 "APPLY BATCH");
+
+            assertInvalidMessage("Batch with conditions cannot span multiple partitions (you cannot use IN on the partition key)",
+                                 "BEGIN BATCH " +
+                                 "UPDATE %1$s SET c = 100 WHERE a = 1 AND b = 1 IF c = 1;" +
+                                 "UPDATE %1$s SET c = 200 WHERE a IN " + inClause + " AND b = 1 IF c = 1;" +
+                                 "APPLY BATCH");
+
+            assertInvalidMessage("Batch with conditions cannot span multiple partitions (you cannot use IN on the partition key)",
+                                 "BEGIN BATCH " +
+                                 "UPDATE %1$s SET c = 100 WHERE a = 1 AND b = 1 IF c = 1;" +
+                                 "DELETE FROM %1$s WHERE a IN " + inClause + " AND b = 1 IF c = 1;" +
+                                 "APPLY BATCH");
+        }
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1,1,1),
+                   row(1,2,2),
+                   row(1,3,3));
+    }
+
+    @Test
+    public void testBatchAndConditionalInteraction() throws Throwable
+    {
+
+        createTable(String.format("CREATE TABLE %s.clustering (\n" +
+                "  id int,\n" +
+                "  clustering1 int,\n" +
+                "  clustering2 int,\n" +
+                "  clustering3 int,\n" +
+                "  val int, \n" +
+                " PRIMARY KEY(id, clustering1, clustering2, clustering3)" +
+                ")", KEYSPACE));
+
+        execute("DELETE FROM " + KEYSPACE +".clustering WHERE id=1");
+
+        String clusteringInsert = "INSERT INTO " + KEYSPACE + ".clustering(id, clustering1, clustering2, clustering3, val) VALUES(%s, %s, %s, %s, %s); ";
+        String clusteringUpdate = "UPDATE " + KEYSPACE + ".clustering SET val=%s WHERE id=%s AND clustering1=%s AND clustering2=%s AND clustering3=%s ;";
+        String clusteringConditionalUpdate = "UPDATE " + KEYSPACE + ".clustering SET val=%s WHERE id=%s AND clustering1=%s AND clustering2=%s AND clustering3=%s IF val=%s ;";
+        String clusteringDelete = "DELETE FROM " + KEYSPACE + ".clustering WHERE id=%s AND clustering1=%s AND clustering2=%s AND clustering3=%s ;";
+        String clusteringRangeDelete = "DELETE FROM " + KEYSPACE + ".clustering WHERE id=%s AND clustering1=%s ;";
+
+
+        execute("BEGIN BATCH " + String.format(clusteringInsert, 1, 1, 1, 1, 1) + " APPLY BATCH");
+
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"), row(1, 1, 1, 1, 1));
+
+        StringBuilder cmd2 = new StringBuilder();
+        cmd2.append("BEGIN BATCH ");
+        cmd2.append(String.format(clusteringInsert, 1, 1, 1, 2, 2));
+        cmd2.append(String.format(clusteringConditionalUpdate, 11, 1, 1, 1, 1, 1));
+        cmd2.append("APPLY BATCH ");
+        execute(cmd2.toString());
+
+
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"),
+                row(1, 1, 1, 1, 11),
+                row(1, 1, 1, 2, 2)
+        );
+
+
+        StringBuilder cmd3 = new StringBuilder();
+        cmd3.append("BEGIN BATCH ");
+        cmd3.append(String.format(clusteringInsert, 1, 1, 2, 3, 23));
+        cmd3.append(String.format(clusteringConditionalUpdate, 22, 1, 1, 1, 2, 2));
+        cmd3.append(String.format(clusteringDelete, 1, 1, 1, 1));
+        cmd3.append("APPLY BATCH ");
+        execute(cmd3.toString());
+
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"),
+                row(1, 1, 1, 2, 22),
+                row(1, 1, 2, 3, 23)
+        );
+
+        StringBuilder cmd4 = new StringBuilder();
+        cmd4.append("BEGIN BATCH ");
+        cmd4.append(String.format(clusteringInsert, 1, 2, 3, 4, 1234));
+        cmd4.append(String.format(clusteringConditionalUpdate, 234, 1, 1, 1, 2, 22));
+        cmd4.append("APPLY BATCH ");
+        execute(cmd4.toString());
+
+        System.out.println(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"));
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"),
+                row(1, 1, 1, 2, 234),
+                row(1, 1, 2, 3, 23),
+                row(1, 2, 3, 4, 1234)
+        );
+
+        StringBuilder cmd5 = new StringBuilder();
+        cmd5.append("BEGIN BATCH ");
+        cmd5.append(String.format(clusteringRangeDelete, 1, 2));
+        cmd5.append(String.format(clusteringConditionalUpdate, 1234, 1, 1, 1, 2, 234));
+        cmd5.append("APPLY BATCH ");
+        execute(cmd5.toString());
+
+        System.out.println(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"));
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"),
+                row(1, 1, 1, 2, 1234),
+                row(1, 1, 2, 3, 23)
+        );
+
+        StringBuilder cmd6 = new StringBuilder();
+        cmd6.append("BEGIN BATCH ");
+        cmd6.append(String.format(clusteringUpdate, 345, 1, 3, 4, 5));
+        cmd6.append(String.format(clusteringConditionalUpdate, 1, 1, 1, 1, 2, 1234));
+        cmd6.append("APPLY BATCH ");
+        execute(cmd6.toString());
+
+        System.out.println(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"));
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"),
+                row(1, 1, 1, 2, 1),
+                row(1, 1, 2, 3, 23),
+                row(1, 3, 4, 5, 345)
+        );
+
+
+        StringBuilder cmd7 = new StringBuilder();
+        cmd7.append("BEGIN BATCH ");
+        cmd7.append(String.format(clusteringDelete, 1, 3, 4, 5));
+        cmd7.append(String.format(clusteringConditionalUpdate, 2300, 1, 1, 2, 3, 1));  // SHOULD NOT MATCH
+        cmd7.append("APPLY BATCH ");
+        execute(cmd7.toString());
+
+        System.out.println(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"));
+        assertRows(execute("SELECT * FROM " + KEYSPACE+".clustering WHERE id=1"),
+                row(1, 1, 1, 2, 1),
+                row(1, 1, 2, 3, 23),
+                row(1, 3, 4, 5, 345)
+        );
+    }
+
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java
index 398b851..1f436b9 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java

@@ -15,30 +15,31 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.cassandra.cql3.validation.operations;
 
-import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.UUID;
 
-import org.junit.Assert;
 import org.junit.Test;
 
+
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.cql3.CQLTester;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.triggers.ITrigger;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static java.lang.String.format;
 import static junit.framework.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static junit.framework.Assert.fail;
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
 
 public class CreateTest extends CQLTester
 {
@@ -403,12 +404,12 @@
     {
         createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
         execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_1", TestTrigger.class);
+        assertTriggerExists("trigger_1");
         execute("CREATE TRIGGER trigger_2 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_2", TestTrigger.class);
+        assertTriggerExists("trigger_2");
         assertInvalid("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
         execute("CREATE TRIGGER \"Trigger 3\" ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("Trigger 3", TestTrigger.class);
+        assertTriggerExists("Trigger 3");
     }
 
     @Test
@@ -417,10 +418,10 @@
         createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
 
         execute("CREATE TRIGGER IF NOT EXISTS trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_1", TestTrigger.class);
+        assertTriggerExists("trigger_1");
 
         execute("CREATE TRIGGER IF NOT EXISTS trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_1", TestTrigger.class);
+        assertTriggerExists("trigger_1");
     }
 
     @Test
@@ -429,21 +430,21 @@
         createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
 
         execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_1", TestTrigger.class);
+        assertTriggerExists("trigger_1");
 
         execute("DROP TRIGGER trigger_1 ON %s");
-        assertTriggerDoesNotExists("trigger_1", TestTrigger.class);
+        assertTriggerDoesNotExists("trigger_1");
 
         execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_1", TestTrigger.class);
+        assertTriggerExists("trigger_1");
 
         assertInvalid("DROP TRIGGER trigger_2 ON %s");
 
         execute("CREATE TRIGGER \"Trigger 3\" ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("Trigger 3", TestTrigger.class);
+        assertTriggerExists("Trigger 3");
 
         execute("DROP TRIGGER \"Trigger 3\" ON %s");
-        assertTriggerDoesNotExists("Trigger 3", TestTrigger.class);
+        assertTriggerDoesNotExists("Trigger 3");
     }
 
     @Test
@@ -452,13 +453,13 @@
         createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
 
         execute("DROP TRIGGER IF EXISTS trigger_1 ON %s");
-        assertTriggerDoesNotExists("trigger_1", TestTrigger.class);
+        assertTriggerDoesNotExists("trigger_1");
 
         execute("CREATE TRIGGER trigger_1 ON %s USING '" + TestTrigger.class.getName() + "'");
-        assertTriggerExists("trigger_1", TestTrigger.class);
+        assertTriggerExists("trigger_1");
 
         execute("DROP TRIGGER IF EXISTS trigger_1 ON %s");
-        assertTriggerDoesNotExists("trigger_1", TestTrigger.class);
+        assertTriggerDoesNotExists("trigger_1");
     }
 
     @Test
@@ -466,10 +467,10 @@
     {
         createTable("CREATE TABLE %s (a int, b int , c int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE;");
 
-        assertInvalidMessage("Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables",
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                              "CREATE INDEX ON %s (a);");
 
-        assertInvalidMessage("Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables",
+        assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
                              "CREATE INDEX ON %s (b);");
 
         assertInvalidMessage("Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns",
@@ -493,36 +494,151 @@
         assertRows(execute("SELECT * FROM %s WHERE b = ?", 4), row(2, 4));
     }
 
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testCreateIndextWithCompactStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int) WITH COMPACT STORAGE");
+        assertInvalidMessage("No column definition found for column column1",
+                             "CREATE INDEX column1_index on %s (column1)");
+        assertInvalidMessage("No column definition found for column value",
+                             "CREATE INDEX value_index on %s (value)");
+    }
+
     @Test
     // tests CASSANDRA-9565
     public void testDoubleWith() throws Throwable
     {
-        String[] stmts = new String[] { "CREATE KEYSPACE WITH WITH DURABLE_WRITES = true",
-                                        "CREATE KEYSPACE ks WITH WITH DURABLE_WRITES = true" };
+        String[] stmts = { "CREATE KEYSPACE WITH WITH DURABLE_WRITES = true",
+                           "CREATE KEYSPACE ks WITH WITH DURABLE_WRITES = true" };
 
-        for (String stmt : stmts) {
+        for (String stmt : stmts)
             assertInvalidSyntaxMessage("no viable alternative at input 'WITH'", stmt);
-        }
     }
 
-    private void assertTriggerExists(String name, Class<?> clazz)
+    @Test
+    public void testCreateTableWithCompression() throws Throwable
     {
-        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), currentTable()).copy();
-        assertTrue("the trigger does not exist", cfm.containsTriggerDefinition(TriggerDefinition.create(name,
-                                                                                                        clazz.getName())));
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "64", "class", "org.apache.cassandra.io.compress.LZ4Compressor")));
+
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                + " WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_in_kb' : 32 };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "32", "class", "org.apache.cassandra.io.compress.SnappyCompressor")));
+
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                + " WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_in_kb' : 32, 'enabled' : true };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "32", "class", "org.apache.cassandra.io.compress.SnappyCompressor")));
+
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                + " WITH compression = { 'sstable_compression' : 'SnappyCompressor', 'chunk_length_kb' : 32 };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("chunk_length_in_kb", "32", "class", "org.apache.cassandra.io.compress.SnappyCompressor")));
+
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                + " WITH compression = { 'sstable_compression' : '', 'chunk_length_kb' : 32 };");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("enabled", "false")));
+
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                + " WITH compression = { 'enabled' : 'false'};");
+
+        assertRows(execute(format("SELECT compression FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("enabled", "false")));
+
+        assertThrowsConfigurationException("Missing sub-option 'class' for the 'compression' option.",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH compression = {'chunk_length_in_kb' : 32};");
+
+        assertThrowsConfigurationException("The 'class' option must not be empty. To disable compression use 'enabled' : false",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH compression = { 'class' : ''};");
+
+        assertThrowsConfigurationException("If the 'enabled' option is set to false no other options must be specified",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH compression = { 'enabled' : 'false', 'class' : 'SnappyCompressor'};");
+
+        assertThrowsConfigurationException("If the 'enabled' option is set to false no other options must be specified",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH compression = { 'enabled' : 'false', 'chunk_length_in_kb' : 32};");
+
+        assertThrowsConfigurationException("The 'sstable_compression' option must not be used if the compression algorithm is already specified by the 'class' option",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH compression = { 'sstable_compression' : 'SnappyCompressor', 'class' : 'SnappyCompressor'};");
+
+        assertThrowsConfigurationException("The 'chunk_length_kb' option must not be used if the chunk length is already specified by the 'chunk_length_in_kb' option",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_kb' : 32 , 'chunk_length_in_kb' : 32 };");
+
+        assertThrowsConfigurationException("Unknown compression options unknownOption",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                            + " WITH compression = { 'class' : 'SnappyCompressor', 'unknownOption' : 32 };");
     }
 
-    private void assertTriggerDoesNotExists(String name, Class<?> clazz)
+     private void assertThrowsConfigurationException(String errorMsg, String createStmt) {
+         try
+         {
+             createTable(createStmt);
+             fail("Query should be invalid but no error was thrown. Query is: " + createStmt);
+         }
+         catch (RuntimeException e)
+         {
+             Throwable cause = e.getCause();
+             assertTrue("The exception should be a ConfigurationException", cause instanceof ConfigurationException);
+             assertEquals(errorMsg, cause.getMessage());
+         }
+     }
+
+    private void assertTriggerExists(String name)
     {
         CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), currentTable()).copy();
-        Assert.assertFalse("the trigger exists", cfm.containsTriggerDefinition(TriggerDefinition.create(name,
-                                                                                                        clazz.getName())));
+        assertTrue("the trigger does not exist", cfm.getTriggers().get(name).isPresent());
+    }
+
+    private void assertTriggerDoesNotExists(String name)
+    {
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace(), currentTable()).copy();
+        assertFalse("the trigger exists", cfm.getTriggers().get(name).isPresent());
     }
 
     public static class TestTrigger implements ITrigger
     {
         public TestTrigger() { }
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition update)
         {
             return Collections.emptyList();
         }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/DeleteTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/DeleteTest.java
index 6bd5f26..9d495b3 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/DeleteTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/DeleteTest.java

@@ -28,10 +28,14 @@
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
 
-import static org.junit.Assert.assertEquals;
 import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.apache.commons.lang3.StringUtils.isEmpty;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class DeleteTest extends CQLTester
 {
@@ -39,15 +43,89 @@
     @Test
     public void testRangeDeletion() throws Throwable
     {
-        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+        testRangeDeletion(true, true);
+        testRangeDeletion(false, true);
+        testRangeDeletion(true, false);
+        testRangeDeletion(false, false);
+    }
 
+    private void testRangeDeletion(boolean flushData, boolean flushTombstone) throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 1, 1, 1, 1);
-        flush();
+        flush(flushData);
         execute("DELETE FROM %s WHERE a=? AND b=?", 1, 1);
-        flush();
+        flush(flushTombstone);
         assertEmpty(execute("SELECT * FROM %s WHERE a=? AND b=? AND c=?", 1, 1, 1));
     }
 
+    @Test
+    public void testDeleteRange() throws Throwable
+    {
+        testDeleteRange(true, true);
+        testDeleteRange(false, true);
+        testDeleteRange(true, false);
+        testDeleteRange(false, false);
+    }
+
+    private void testDeleteRange(boolean flushData, boolean flushTombstone) throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 2, 1, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 2, 2, 3);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 2, 3, 4);
+        flush(flushData);
+
+        execute("DELETE FROM %s WHERE a = ? AND b >= ?", 2, 2);
+        flush(flushTombstone);
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s"),
+                                row(1, 1, 1),
+                                row(2, 1, 2));
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 2, 1),
+                   row(2, 1, 2));
+        assertEmpty(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 2, 2));
+        assertEmpty(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 2, 3));
+    }
+
+    @Test
+    public void testCrossMemSSTableMultiColumn() throws Throwable
+    {
+        testCrossMemSSTableMultiColumn(true, true);
+        testCrossMemSSTableMultiColumn(false, true);
+        testCrossMemSSTableMultiColumn(true, false);
+        testCrossMemSSTableMultiColumn(false, false);
+    }
+
+    private void testCrossMemSSTableMultiColumn(boolean flushData, boolean flushTombstone) throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 2, 1, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 2, 2, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 2, 3, 3);
+        flush(flushData);
+
+        execute("DELETE FROM %s WHERE a = ? AND (b) = (?)", 2, 2);
+        execute("DELETE FROM %s WHERE a = ? AND (b) = (?)", 2, 3);
+
+        flush(flushTombstone);
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s"),
+                                row(1, 1, 1),
+                                row(2, 1, 2));
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 2, 1),
+                   row(2, 1, 2));
+        assertEmpty(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 2, 2));
+        assertEmpty(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 2, 3));
+    }
+
+
     /**
      * Test simple deletion and in particular check for #4193 bug
      * migrated from cql_tests.py:TestCQL.deletion_test()
@@ -330,6 +408,731 @@
 
         assertEmpty(execute("select * from %s  where a=1 and b=1"));
     }
+    @Test
+    public void testDeleteWithNoClusteringColumns() throws Throwable
+    {
+        testDeleteWithNoClusteringColumns(false);
+        testDeleteWithNoClusteringColumns(true);
+    }
+
+    private void testDeleteWithNoClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] {"", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int PRIMARY KEY," +
+                                      "value int)" + compactOption);
+
+            execute("INSERT INTO %s (partitionKey, value) VALUES (0, 0)");
+            execute("INSERT INTO %s (partitionKey, value) VALUES (1, 1)");
+            execute("INSERT INTO %s (partitionKey, value) VALUES (2, 2)");
+            execute("INSERT INTO %s (partitionKey, value) VALUES (3, 3)");
+            flush(forceFlush);
+
+            execute("DELETE value FROM %s WHERE partitionKey = ?", 0);
+            flush(forceFlush);
+
+            if (isEmpty(compactOption))
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                           row(0, null));
+            }
+            else
+            {
+                assertEmpty(execute("SELECT * FROM %s WHERE partitionKey = ?", 0));
+            }
+
+            execute("DELETE FROM %s WHERE partitionKey IN (?, ?)", 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s"),
+                       row(2, 2),
+                       row(3, 3));
+
+            // test invalid queries
+
+            // token function
+            assertInvalidMessage("The token function cannot be used in WHERE clauses for DELETE statements",
+                                 "DELETE FROM %s WHERE token(partitionKey) = token(?)", 0);
+
+            // multiple time same primary key element in WHERE clause
+            assertInvalidMessage("partitionkey cannot be restricted by more than one relation if it includes an Equal",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND partitionKey = ?", 0, 1);
+
+            // unknown identifiers
+            assertInvalidMessage("Unknown identifier unknown",
+                                 "DELETE unknown FROM %s WHERE partitionKey = ?", 0);
+
+            assertInvalidMessage("Undefined name partitionkey1 in where clause ('partitionkey1 = ?')",
+                                 "DELETE FROM %s WHERE partitionKey1 = ?", 0);
+
+            // Invalid operator in the where clause
+            assertInvalidMessage("Only EQ and IN relation are supported on the partition key (unless you use the token() function)",
+                                 "DELETE FROM %s WHERE partitionKey > ? ", 0);
+
+            assertInvalidMessage("Cannot use CONTAINS on non-collection column partitionkey",
+                                 "DELETE FROM %s WHERE partitionKey CONTAINS ?", 0);
+
+            // Non primary key in the where clause
+            assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND value = ?", 0, 1);
+        }
+    }
+
+    @Test
+    public void testDeleteWithOneClusteringColumns() throws Throwable
+    {
+        testDeleteWithOneClusteringColumns(false);
+        testDeleteWithOneClusteringColumns(true);
+    }
+
+    private void testDeleteWithOneClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] {"", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering int," +
+                                      "value int," +
+                                      " PRIMARY KEY (partitionKey, clustering))" + compactOption);
+
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 0, 0)");
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 1, 1)");
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 2, 2)");
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 3, 3)");
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 4, 4)");
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 5, 5)");
+            execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (1, 0, 6)");
+            flush(forceFlush);
+
+            execute("DELETE value FROM %s WHERE partitionKey = ? AND clustering = ?", 0, 1);
+            flush(forceFlush);
+            if (isEmpty(compactOption))
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering = ?", 0, 1),
+                           row(0, 1, null));
+            }
+            else
+            {
+                assertEmpty(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering = ?", 0, 1));
+            }
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering = ?", 0, 1);
+            flush(forceFlush);
+            assertEmpty(execute("SELECT value FROM %s WHERE partitionKey = ? AND clustering = ?", 0, 1));
+
+            execute("DELETE FROM %s WHERE partitionKey IN (?, ?) AND clustering = ?", 0, 1, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?)", 0, 1),
+                       row(0, 2, 2),
+                       row(0, 3, 3),
+                       row(0, 4, 4),
+                       row(0, 5, 5));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering) IN ((?), (?))", 0, 4, 5);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?)", 0, 1),
+                       row(0, 2, 2),
+                       row(0, 3, 3));
+
+            // test invalid queries
+
+            // missing primary key element
+            assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                                 "DELETE FROM %s WHERE clustering = ?", 1);
+
+            // token function
+            assertInvalidMessage("The token function cannot be used in WHERE clauses for DELETE statements",
+                                 "DELETE FROM %s WHERE token(partitionKey) = token(?) AND clustering = ? ", 0, 1);
+
+            // multiple time same primary key element in WHERE clause
+            assertInvalidMessage("clustering cannot be restricted by more than one relation if it includes an Equal",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering = ? AND clustering = ?", 0, 1, 1);
+
+            // unknown identifiers
+            assertInvalidMessage("Unknown identifier value1",
+                                 "DELETE value1 FROM %s WHERE partitionKey = ? AND clustering = ?", 0, 1);
+
+            assertInvalidMessage("Undefined name partitionkey1 in where clause ('partitionkey1 = ?')",
+                                 "DELETE FROM %s WHERE partitionKey1 = ? AND clustering = ?", 0, 1);
+
+            assertInvalidMessage("Undefined name clustering_3 in where clause ('clustering_3 = ?')",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering_3 = ?", 0, 1);
+
+            // Invalid operator in the where clause
+            assertInvalidMessage("Only EQ and IN relation are supported on the partition key (unless you use the token() function)",
+                                 "DELETE FROM %s WHERE partitionKey > ? AND clustering = ?", 0, 1);
+
+            assertInvalidMessage("Cannot use CONTAINS on non-collection column partitionkey",
+                                 "DELETE FROM %s WHERE partitionKey CONTAINS ? AND clustering = ?", 0, 1);
+
+            // Non primary key in the where clause
+            assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering = ? AND value = ?", 0, 1, 3);
+        }
+    }
+
+    @Test
+    public void testDeleteWithTwoClusteringColumns() throws Throwable
+    {
+        testDeleteWithTwoClusteringColumns(false);
+        testDeleteWithTwoClusteringColumns(true);
+    }
+
+    private void testDeleteWithTwoClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] { "", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering_1 int," +
+                                      "clustering_2 int," +
+                                      "value int," +
+                                      " PRIMARY KEY (partitionKey, clustering_1, clustering_2))" + compactOption);
+
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 0, 0)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 1, 1)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 2, 2)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 3, 3)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 1, 1, 4)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 1, 2, 5)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (1, 0, 0, 6)");
+            flush(forceFlush);
+
+            execute("DELETE value FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 1);
+            flush(forceFlush);
+
+            if (isEmpty(compactOption))
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                                   0, 1, 1),
+                           row(0, 1, 1, null));
+            }
+            else
+            {
+                assertEmpty(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                                   0, 1, 1));
+            }
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) = (?, ?)", 0, 1, 1);
+            flush(forceFlush);
+            assertEmpty(execute("SELECT value FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                                0, 1, 1));
+
+            execute("DELETE FROM %s WHERE partitionKey IN (?, ?) AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 0, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?)", 0, 1),
+                       row(0, 0, 1, 1),
+                       row(0, 0, 2, 2),
+                       row(0, 0, 3, 3),
+                       row(0, 1, 2, 5));
+
+            Object[][] rows;
+            if (isEmpty(compactOption))
+            {
+                rows = new Object[][]{row(0, 0, 1, 1),
+                                      row(0, 0, 2, null),
+                                      row(0, 0, 3, null),
+                                      row(0, 1, 2, 5)};
+            }
+            else
+            {
+                rows = new Object[][]{row(0, 0, 1, 1), row(0, 1, 2, 5)};
+            }
+
+            execute("DELETE value FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 IN (?, ?)", 0, 0, 2, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?)", 0, 1), rows);
+
+            if (isEmpty(compactOption))
+            {
+                rows = new Object[][]{row(0, 0, 1, 1),
+                                      row(0, 0, 3, null)};
+            }
+            else
+            {
+                rows = new Object[][]{row(0, 0, 1, 1)};
+            }
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) IN ((?, ?), (?, ?))", 0, 0, 2, 1, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?)", 0, 1), rows);
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1) IN ((?), (?)) AND clustering_2 = ?", 0, 0, 2, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?)", 0, 1),
+                       row(0, 0, 1, 1));
+
+            // test invalid queries
+
+            // missing primary key element
+            assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                                 "DELETE FROM %s WHERE clustering_1 = ? AND clustering_2 = ?", 1, 1);
+
+            assertInvalidMessage("PRIMARY KEY column \"clustering_2\" cannot be restricted as preceding column \"clustering_1\" is not restricted",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering_2 = ?", 0, 1);
+
+            // token function
+            assertInvalidMessage("The token function cannot be used in WHERE clauses for DELETE statements",
+                                 "DELETE FROM %s WHERE token(partitionKey) = token(?) AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 1);
+
+            // multiple time same primary key element in WHERE clause
+            assertInvalidMessage("clustering_1 cannot be restricted by more than one relation if it includes an Equal",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ? AND clustering_1 = ?", 0, 1, 1, 1);
+
+            // unknown identifiers
+            assertInvalidMessage("Unknown identifier value1",
+                                 "DELETE value1 FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 1);
+
+            assertInvalidMessage("Undefined name partitionkey1 in where clause ('partitionkey1 = ?')",
+                                 "DELETE FROM %s WHERE partitionKey1 = ? AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 1);
+
+            assertInvalidMessage("Undefined name clustering_3 in where clause ('clustering_3 = ?')",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_3 = ?", 0, 1, 1);
+
+            // Invalid operator in the where clause
+            assertInvalidMessage("Only EQ and IN relation are supported on the partition key (unless you use the token() function)",
+                                 "DELETE FROM %s WHERE partitionKey > ? AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 1);
+
+            assertInvalidMessage("Cannot use CONTAINS on non-collection column partitionkey",
+                                 "DELETE FROM %s WHERE partitionKey CONTAINS ? AND clustering_1 = ? AND clustering_2 = ?", 0, 1, 1);
+
+            // Non primary key in the where clause
+            assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                                 "DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ? AND value = ?", 0, 1, 1, 3);
+        }
+    }
+
+    @Test
+    public void testDeleteWithRangeAndOneClusteringColumn() throws Throwable
+    {
+        testDeleteWithRangeAndOneClusteringColumn(false);
+        testDeleteWithRangeAndOneClusteringColumn(true);
+    }
+
+    private void testDeleteWithRangeAndOneClusteringColumn(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] { "", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int," +
+                                          "clustering int," +
+                                          "value int," +
+                                          " PRIMARY KEY (partitionKey, clustering))" + compactOption);
+
+            int value = 0;
+            for (int partitionKey = 0; partitionKey < 5; partitionKey++)
+                for (int clustering1 = 0; clustering1 < 5; clustering1++)
+                        execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (?, ?, ?)",
+                                partitionKey, clustering1, value++);
+
+            flush(forceFlush);
+
+            // test delete partition
+            execute("DELETE FROM %s WHERE partitionKey = ?", 1);
+            flush(forceFlush);
+            assertEmpty(execute("SELECT * FROM %s WHERE partitionKey = ?", 1));
+
+            // test slices on the first clustering column
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering >= ?", 0, 4);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 0, 0),
+                       row(0, 1, 1),
+                       row(0, 2, 2),
+                       row(0, 3, 3));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering > ?", 0, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 0, 0),
+                       row(0, 1, 1),
+                       row(0, 2, 2));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering <= ?", 0, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 1, 1),
+                       row(0, 2, 2));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering < ?", 0, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 2, 2));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering >= ? AND clustering < ?", 2, 0, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 13),
+                       row(2, 4, 14));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering > ? AND clustering <= ?", 2, 3, 5);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 13));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering < ? AND clustering > ?", 2, 3, 5);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 13));
+
+            // test multi-column slices
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering) > (?)", 3, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 3),
+                       row(3, 0, 15),
+                       row(3, 1, 16),
+                       row(3, 2, 17));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering) < (?)", 3, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 3),
+                       row(3, 1, 16),
+                       row(3, 2, 17));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering) >= (?) AND (clustering) <= (?)", 3, 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 3),
+                       row(3, 2, 17));
+
+            // Test invalid queries
+            assertInvalidMessage("Range deletions are not supported for specific columns",
+                                 "DELETE value FROM %s WHERE partitionKey = ? AND clustering >= ?", 2, 1);
+            assertInvalidMessage("Range deletions are not supported for specific columns",
+                                 "DELETE value FROM %s WHERE partitionKey = ?", 2);
+        }
+    }
+
+    @Test
+    public void testDeleteWithRangeAndTwoClusteringColumns() throws Throwable
+    {
+        testDeleteWithRangeAndTwoClusteringColumns(false);
+        testDeleteWithRangeAndTwoClusteringColumns(true);
+    }
+
+    private void testDeleteWithRangeAndTwoClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] { "", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int," +
+                    "clustering_1 int," +
+                    "clustering_2 int," +
+                    "value int," +
+                    " PRIMARY KEY (partitionKey, clustering_1, clustering_2))" + compactOption);
+
+            int value = 0;
+            for (int partitionKey = 0; partitionKey < 5; partitionKey++)
+                for (int clustering1 = 0; clustering1 < 5; clustering1++)
+                    for (int clustering2 = 0; clustering2 < 5; clustering2++) {
+                        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (?, ?, ?, ?)",
+                                partitionKey, clustering1, clustering2, value++);}
+            flush(forceFlush);
+
+            // test unspecified second clustering column
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ?", 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 < ?", 0, 2),
+                       row(0, 0, 0, 0),
+                       row(0, 0, 1, 1),
+                       row(0, 0, 2, 2),
+                       row(0, 0, 3, 3),
+                       row(0, 0, 4, 4));
+
+            // test delete partition
+            execute("DELETE FROM %s WHERE partitionKey = ?", 1);
+            flush(forceFlush);
+            assertEmpty(execute("SELECT * FROM %s WHERE partitionKey = ?", 1));
+
+            // test slices on the second clustering column
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 < ?", 0, 0, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 < ?", 0, 2),
+                       row(0, 0, 2, 2),
+                       row(0, 0, 3, 3),
+                       row(0, 0, 4, 4));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 <= ?", 0, 0, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 < ?", 0, 2),
+                       row(0, 0, 4, 4));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 = ? AND clustering_2 > ? ", 0, 2, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND  clustering_1 = ?", 0, 2),
+                       row(0, 2, 0, 10),
+                       row(0, 2, 1, 11),
+                       row(0, 2, 2, 12));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 = ? AND clustering_2 >= ? ", 0, 2, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND  clustering_1 = ?", 0, 2),
+                       row(0, 2, 0, 10));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 = ? AND clustering_2 > ? AND clustering_2 < ? ",
+                    0, 3, 1, 4);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND  clustering_1 = ?", 0, 3),
+                       row(0, 3, 0, 15),
+                       row(0, 3, 1, 16),
+                       row(0, 3, 4, 19));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 = ? AND clustering_2 > ? AND clustering_2 < ? ",
+                    0, 3, 4, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND  clustering_1 = ?", 0, 3),
+                       row(0, 3, 0, 15),
+                       row(0, 3, 1, 16),
+                       row(0, 3, 4, 19));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 = ? AND clustering_2 >= ? AND clustering_2 <= ? ",
+                    0, 3, 1, 4);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND  clustering_1 = ?", 0, 3),
+                       row(0, 3, 0, 15));
+
+            // test slices on the first clustering column
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 >= ?", 0, 4);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 0, 4, 4),
+                       row(0, 2, 0, 10),
+                       row(0, 3, 0, 15));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND  clustering_1 > ?", 0, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 0, 4, 4),
+                       row(0, 2, 0, 10),
+                       row(0, 3, 0, 15));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 < ?", 0, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 0),
+                       row(0, 3, 0, 15));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 >= ? AND clustering_1 < ?", 2, 0, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 0, 65),
+                       row(2, 3, 1, 66),
+                       row(2, 3, 2, 67),
+                       row(2, 3, 3, 68),
+                       row(2, 3, 4, 69),
+                       row(2, 4, 0, 70),
+                       row(2, 4, 1, 71),
+                       row(2, 4, 2, 72),
+                       row(2, 4, 3, 73),
+                       row(2, 4, 4, 74));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 > ? AND clustering_1 <= ?", 2, 3, 5);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 0, 65),
+                       row(2, 3, 1, 66),
+                       row(2, 3, 2, 67),
+                       row(2, 3, 3, 68),
+                       row(2, 3, 4, 69));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 < ? AND clustering_1 > ?", 2, 3, 5);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 0, 65),
+                       row(2, 3, 1, 66),
+                       row(2, 3, 2, 67),
+                       row(2, 3, 3, 68),
+                       row(2, 3, 4, 69));
+
+            // test multi-column slices
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) > (?, ?)", 2, 3, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 0, 65),
+                       row(2, 3, 1, 66),
+                       row(2, 3, 2, 67),
+                       row(2, 3, 3, 68));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) < (?, ?)", 2, 3, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 1, 66),
+                       row(2, 3, 2, 67),
+                       row(2, 3, 3, 68));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) >= (?, ?) AND (clustering_1) <= (?)", 2, 3, 2, 4);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ?", 2),
+                       row(2, 3, 1, 66));
+
+            // Test with a mix of single column and multi-column restrictions
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND (clustering_2) < (?)", 3, 0, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 = ?", 3, 0),
+                       row(3, 0, 3, 78),
+                       row(3, 0, 4, 79));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND clustering_1 IN (?, ?) AND (clustering_2) >= (?)", 3, 0, 1, 3);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 IN (?, ?)", 3, 0, 1),
+                       row(3, 1, 0, 80),
+                       row(3, 1, 1, 81),
+                       row(3, 1, 2, 82));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1) IN ((?), (?)) AND clustering_2 < ?", 3, 0, 1, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 IN (?, ?)", 3, 0, 1),
+                       row(3, 1, 1, 81),
+                       row(3, 1, 2, 82));
+
+            execute("DELETE FROM %s WHERE partitionKey = ? AND (clustering_1) = (?) AND clustering_2 >= ?", 3, 1, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 IN (?, ?)", 3, 0, 1),
+                       row(3, 1, 1, 81));
+
+            // Test invalid queries
+            assertInvalidMessage("Range deletions are not supported for specific columns",
+                                 "DELETE value FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) >= (?, ?)", 2, 3, 1);
+            assertInvalidMessage("Range deletions are not supported for specific columns",
+                                 "DELETE value FROM %s WHERE partitionKey = ? AND clustering_1 >= ?", 2, 3);
+            assertInvalidMessage("Range deletions are not supported for specific columns",
+                                 "DELETE value FROM %s WHERE partitionKey = ? AND clustering_1 = ?", 2, 3);
+            assertInvalidMessage("Range deletions are not supported for specific columns",
+                                 "DELETE value FROM %s WHERE partitionKey = ?", 2);
+        }
+    }
+
+    @Test
+    public void testDeleteWithAStaticColumn() throws Throwable
+    {
+        testDeleteWithAStaticColumn(false);
+        testDeleteWithAStaticColumn(true);
+    }
+
+    private void testDeleteWithAStaticColumn(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering_1 int," +
+                                      "clustering_2 int," +
+                                      "value int," +
+                                      "staticValue text static," +
+                                      " PRIMARY KEY (partitionKey, clustering_1, clustering_2))");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value, staticValue) VALUES (0, 0, 0, 0, 'A')");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 1, 1)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value, staticValue) VALUES (1, 0, 0, 6, 'B')");
+        flush(forceFlush);
+
+        execute("DELETE staticValue FROM %s WHERE partitionKey = ?", 0);
+        flush(forceFlush);
+        assertRows(execute("SELECT DISTINCT staticValue FROM %s WHERE partitionKey IN (?, ?)", 0, 1),
+                   row(new Object[1]), row("B"));
+
+        execute("DELETE staticValue, value FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                1, 0, 0);
+        flush(forceFlush);
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, 0, 0, null, null),
+                   row(0, 0, 0, null, 0),
+                   row(0, 0, 1, null, 1));
+
+        assertInvalidMessage("Invalid restrictions on clustering columns since the DELETE statement modifies only static columns",
+                             "DELETE staticValue FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                             0, 0, 1);
+
+        assertInvalidMessage("Invalid restrictions on clustering columns since the DELETE statement modifies only static columns",
+                             "DELETE staticValue FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) >= (?, ?)",
+                             0, 0, 1);
+    }
+
+    @Test
+    public void testDeleteWithSecondaryIndices() throws Throwable
+    {
+        testDeleteWithSecondaryIndices(false);
+        testDeleteWithSecondaryIndices(true);
+    }
+
+    private void testDeleteWithSecondaryIndices(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                "clustering_1 int," +
+                "value int," +
+                "values set<int>," +
+                " PRIMARY KEY (partitionKey, clustering_1))");
+
+        createIndex("CREATE INDEX ON %s (value)");
+        createIndex("CREATE INDEX ON %s (clustering_1)");
+        createIndex("CREATE INDEX ON %s (values)");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 0, 0, {0})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 1, 1, {0, 1})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 2, 2, {0, 1, 2})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 3, 3, {0, 1, 2, 3})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (1, 0, 4, {0, 1, 2, 3, 4})");
+
+        flush(forceFlush);
+
+        assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                             "DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND value = ?", 3, 3, 3);
+        assertInvalidMessage("Non PRIMARY KEY columns found in where clause: values",
+                             "DELETE FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND values CONTAINS ?", 3, 3, 3);
+        assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                             "DELETE FROM %s WHERE partitionKey = ? AND value = ?", 3, 3);
+        assertInvalidMessage("Non PRIMARY KEY columns found in where clause: values",
+                             "DELETE FROM %s WHERE partitionKey = ? AND values CONTAINS ?", 3, 3);
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "DELETE FROM %s WHERE clustering_1 = ?", 3);
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "DELETE FROM %s WHERE value = ?", 3);
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "DELETE FROM %s WHERE values CONTAINS ?", 3);
+    }
+
+    @Test
+    public void testDeleteWithOnlyPK() throws Throwable
+    {
+        // This is a regression test for CASSANDRA-11102
+
+        createTable("CREATE TABLE %s (k int, v int, PRIMARY KEY (k, v)) WITH gc_grace_seconds=1");
+
+        execute("INSERT INTO %s(k, v) VALUES (?, ?)", 1, 2);
+
+        execute("DELETE FROM %s WHERE k = ? AND v = ?", 1, 2);
+        execute("INSERT INTO %s(k, v) VALUES (?, ?)", 2, 3);
+
+        Thread.sleep(500);
+
+        execute("DELETE FROM %s WHERE k = ? AND v = ?", 2, 3);
+        execute("INSERT INTO %s(k, v) VALUES (?, ?)", 1, 2);
+
+        Thread.sleep(500);
+
+        flush();
+
+        assertRows(execute("SELECT * FROM %s"), row(1, 2));
+
+        Thread.sleep(1000);
+        compact();
+
+        assertRows(execute("SELECT * FROM %s"), row(1, 2));
+    }
+
+    @Test
+    public void testDeleteColumnNoClustering() throws Throwable
+    {
+        // This is a regression test for CASSANDRA-11068 (and ultimately another test for CASSANDRA-11102)
+        // Creates a table without clustering, insert a row (with a column) and only remove the column.
+        // We should still have a row (with a null column value) even post-compaction.
+
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v int) WITH gc_grace_seconds=0");
+
+        execute("INSERT INTO %s(k, v) VALUES (?, ?)", 0, 0);
+        execute("DELETE v FROM %s WHERE k=?", 0);
+
+        assertRows(execute("SELECT * FROM %s"), row(0, null));
+
+        flush();
+        assertRows(execute("SELECT * FROM %s"), row(0, null));
+
+        compact();
+        assertRows(execute("SELECT * FROM %s"), row(0, null));
+    }
 
     @Test
     public void testDeleteWithEmptyRestrictionValue() throws Throwable
@@ -344,6 +1147,24 @@
                 execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c = textAsBlob('');");
 
                 assertEmpty(execute("SELECT * FROM %s"));
+
+                execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"));
+                execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c IN (textAsBlob(''), textAsBlob('1'));");
+
+                assertEmpty(execute("SELECT * FROM %s"));
+
+                execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"));
+                execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"));
+                execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("2"), bytes("2"));
+
+                execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c > textAsBlob('')");
+
+                assertRows(execute("SELECT * FROM %s"),
+                           row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1")));
+
+                execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c >= textAsBlob('')");
+
+                assertEmpty(execute("SELECT * FROM %s"));
             }
             else
             {
@@ -352,6 +1173,23 @@
                 assertInvalid("Invalid empty or null value for column c",
                               "DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c IN (textAsBlob(''), textAsBlob('1'))");
             }
+
+            execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"));
+            execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("2"), bytes("2"));
+
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c > textAsBlob('')");
+
+            assertEmpty(execute("SELECT * FROM %s"));
+
+            execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"));
+            execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)", bytes("foo123"), bytes("2"), bytes("2"));
+
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c <= textAsBlob('')");
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c < textAsBlob('')");
+
+            assertRows(execute("SELECT * FROM %s"),
+                       row(bytes("foo123"), bytes("1"), bytes("1")),
+                       row(bytes("foo123"), bytes("2"), bytes("2")));
         }
     }
 
@@ -366,6 +1204,327 @@
             execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 = textAsBlob('');");
 
             assertEmpty(execute("SELECT * FROM %s"));
+
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("1"));
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 IN (textAsBlob(''), textAsBlob('1')) AND c2 = textAsBlob('1');");
+
+            assertEmpty(execute("SELECT * FROM %s"));
+
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("0"));
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"), bytes("1"));
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("2"), bytes("3"));
+
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 > textAsBlob('')");
+
+            assertRows(execute("SELECT * FROM %s"),
+                       row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("0")));
+
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 >= textAsBlob('')");
+
+            assertEmpty(execute("SELECT * FROM %s"));
+
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"), bytes("1"));
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("2"), bytes("3"));
+
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 > textAsBlob('')");
+
+            assertEmpty(execute("SELECT * FROM %s"));
+
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"), bytes("1"));
+            execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("2"), bytes("3"));
+
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 <= textAsBlob('')");
+            execute("DELETE FROM %s WHERE pk = textAsBlob('foo123') AND c1 < textAsBlob('')");
+
+            assertRows(execute("SELECT * FROM %s"),
+                       row(bytes("foo123"), bytes("1"), bytes("1"), bytes("1")),
+                       row(bytes("foo123"), bytes("1"), bytes("2"), bytes("3")));
         }
     }
+
+    @Test
+    public void testDeleteAndReverseQueries() throws Throwable
+    {
+        // This test insert rows in one sstable and a range tombstone covering some of those rows in another, and it
+        // validates we correctly get only the non-removed rows when doing reverse queries.
+
+        createTable("CREATE TABLE %s (k text, i int, PRIMARY KEY (k, i))");
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s(k, i) values (?, ?)", "a", i);
+
+        flush();
+
+        execute("DELETE FROM %s WHERE k = ? AND i >= ? AND i <= ?", "a", 2, 7);
+
+        assertRows(execute("SELECT i FROM %s WHERE k = ? ORDER BY i DESC", "a"),
+            row(9), row(8), row(1), row(0)
+        );
+
+        flush();
+
+        assertRows(execute("SELECT i FROM %s WHERE k = ? ORDER BY i DESC", "a"),
+            row(9), row(8), row(1), row(0)
+        );
+    }
+
+    /**
+     * Test for CASSANDRA-12829
+     */
+    @Test
+    public void testDeleteWithEmptyInRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a,b))");
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)", 1, 1, 1);
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)", 1, 2, 2);
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)", 1, 3, 3);
+
+        execute("DELETE FROM %s WHERE a IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b = 1;");
+        execute("DELETE FROM %s WHERE a = 1 AND b IN ();");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, 1, 1),
+                   row(1, 2, 2),
+                   row(1, 3, 3));
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, s int static, PRIMARY KEY ((a,b), c))");
+        execute("INSERT INTO %s (a,b,c,d,s) VALUES (?,?,?,?,?)", 1, 1, 1, 1, 1);
+        execute("INSERT INTO %s (a,b,c,d,s) VALUES (?,?,?,?,?)", 1, 1, 2, 2, 1);
+        execute("INSERT INTO %s (a,b,c,d,s) VALUES (?,?,?,?,?)", 1, 1, 3, 3, 1);
+
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b IN () AND c IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b = 1 AND c IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c = 1;");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, 1, 1, 1, 1),
+                   row(1, 1, 2, 1, 2),
+                   row(1, 1, 3, 1, 3));
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, PRIMARY KEY ((a,b), c, d))");
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)", 1, 1, 1, 1, 1);
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)", 1, 1, 1, 2, 2);
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)", 1, 1, 1, 3, 3);
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)", 1, 1, 1, 4, 4);
+
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c = 1 AND d IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c IN () AND d IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b IN () AND c IN () AND d IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c IN () AND d IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c IN () AND d = 1;");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c = 1 AND d = 1;");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c = 1 AND d IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b = 1");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, 1, 1, 1, 1),
+                   row(1, 1, 1, 2, 2),
+                   row(1, 1, 1, 3, 3),
+                   row(1, 1, 1, 4, 4));
+    }
+
+    /**
+     * Test for CASSANDRA-13152
+     */
+    @Test
+    public void testThatDeletesWithEmptyInRestrictionDoNotCreateMutations() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a,b))");
+
+        execute("DELETE FROM %s WHERE a IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b = 1;");
+        execute("DELETE FROM %s WHERE a = 1 AND b IN ();");
+
+        assertTrue("The memtable should be empty but is not", isMemtableEmpty());
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, s int static, PRIMARY KEY ((a,b), c))");
+
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b IN () AND c IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b = 1 AND c IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c = 1;");
+
+        assertTrue("The memtable should be empty but is not", isMemtableEmpty());
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, PRIMARY KEY ((a,b), c, d))");
+
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c = 1 AND d IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1 AND c IN () AND d IN ();");
+        execute("DELETE FROM %s WHERE a = 1 AND b IN () AND c IN () AND d IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c IN () AND d IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c IN () AND d = 1;");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c = 1 AND d = 1;");
+        execute("DELETE FROM %s WHERE a IN () AND b IN () AND c = 1 AND d IN ();");
+        execute("DELETE FROM %s WHERE a IN () AND b = 1");
+
+        assertTrue("The memtable should be empty but is not", isMemtableEmpty());
+    }
+
+    @Test
+    public void testQueryingOnRangeTombstoneBoundForward() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, i int, PRIMARY KEY (k, i))");
+
+        execute("INSERT INTO %s (k, i) VALUES (?, ?)", "a", 0);
+
+        execute("DELETE FROM %s WHERE k = ? AND i > ? AND i <= ?", "a", 0, 1);
+        execute("DELETE FROM %s WHERE k = ? AND i > ?", "a", 1);
+
+        flush();
+
+        assertEmpty(execute("SELECT i FROM %s WHERE k = ? AND i = ?", "a", 1));
+    }
+
+    @Test
+    public void testQueryingOnRangeTombstoneBoundReverse() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, i int, PRIMARY KEY (k, i))");
+
+        execute("INSERT INTO %s (k, i) VALUES (?, ?)", "a", 0);
+
+        execute("DELETE FROM %s WHERE k = ? AND i > ? AND i <= ?", "a", 0, 1);
+        execute("DELETE FROM %s WHERE k = ? AND i > ?", "a", 1);
+
+        flush();
+
+        assertRows(execute("SELECT i FROM %s WHERE k = ? AND i <= ? ORDER BY i DESC", "a", 1), row(0));
+    }
+
+    @Test
+    public void testReverseQueryWithRangeTombstoneOnMultipleBlocks() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, i int, v text, PRIMARY KEY (k, i))");
+
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 1200; i++)
+            sb.append('a');
+        String longText = sb.toString();
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s(k, i, v) VALUES (?, ?, ?) USING TIMESTAMP 3", "a", i*2, longText);
+
+        execute("DELETE FROM %s USING TIMESTAMP 1 WHERE k = ? AND i >= ? AND i <= ?", "a", 12, 16);
+
+        flush();
+
+        execute("INSERT INTO %s(k, i, v) VALUES (?, ?, ?) USING TIMESTAMP 0", "a", 3, longText);
+        execute("INSERT INTO %s(k, i, v) VALUES (?, ?, ?) USING TIMESTAMP 3", "a", 11, longText);
+        execute("INSERT INTO %s(k, i, v) VALUES (?, ?, ?) USING TIMESTAMP 0", "a", 15, longText);
+        execute("INSERT INTO %s(k, i, v) VALUES (?, ?, ?) USING TIMESTAMP 0", "a", 17, longText);
+
+        flush();
+
+        assertRows(execute("SELECT i FROM %s WHERE k = ? ORDER BY i DESC", "a"),
+                   row(18),
+                   row(17),
+                   row(16),
+                   row(14),
+                   row(12),
+                   row(11),
+                   row(10),
+                   row(8),
+                   row(6),
+                   row(4),
+                   row(3),
+                   row(2),
+                   row(0));
+    }
+
+    /**
+     * Test for CASSANDRA-13305
+     */
+    @Test
+    public void testWithEmptyRange() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, a int, b int, PRIMARY KEY (k, a, b))");
+
+        // Both of the following should be doing nothing, but before #13305 this inserted broken ranges. We do it twice
+        // and the follow-up delete mainly as a way to show the bug as the combination of this will trigger an assertion
+        // in RangeTombstoneList pre-#13305 showing that something wrong happened.
+        execute("DELETE FROM %s WHERE k = ? AND a >= ? AND a < ?", "a", 1, 1);
+        execute("DELETE FROM %s WHERE k = ? AND a >= ? AND a < ?", "a", 1, 1);
+
+        execute("DELETE FROM %s WHERE k = ? AND a >= ? AND a < ?", "a", 0, 2);
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+    */
+    @Test
+    public void testWithCompactStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int) WITH COMPACT STORAGE");
+        testWithCompactFormat();
+
+        // if column1 is present, hidden column is called column2
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, column1 int) WITH COMPACT STORAGE");
+        assertInvalidMessage("Undefined name column2 in where clause ('column2 = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND column2= 1");
+        assertInvalidMessage("Undefined name column2 in where clause ('column2 = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND column2 = 1 AND value1 = 1");
+        assertInvalidMessage("Unknown identifier column2",
+                             "DELETE column2 FROM %s WHERE a = 1");
+
+        // if value is present, hidden column is called value1
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, value int) WITH COMPACT STORAGE");
+        assertInvalidMessage("Undefined name value1 in where clause ('value1 = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND value1 = 1");
+        assertInvalidMessage("Undefined name value1 in where clause ('value1 = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND value1 = 1 AND column1 = 1");
+        assertInvalidMessage("Unknown identifier value1",
+                             "DELETE value1 FROM %s WHERE a = 1");
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testWithCompactNonStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b) VALUES (1, 1)");
+        execute("INSERT INTO %s (a, b) VALUES (2, 1)");
+        assertRows(execute("SELECT a, b FROM %s"),
+                   row(1, 1),
+                   row(2, 1));
+        testWithCompactFormat();
+
+        createTable("CREATE TABLE %s (a int, b int, v int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, v) VALUES (1, 1, 3)");
+        execute("INSERT INTO %s (a, b, v) VALUES (2, 1, 4)");
+        assertRows(execute("SELECT a, b, v FROM %s"),
+                   row(1, 1, 3),
+                   row(2, 1, 4));
+        testWithCompactFormat();
+    }
+
+    private void testWithCompactFormat() throws Throwable
+    {
+        assertInvalidMessage("Undefined name value in where clause ('value = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND value = 1");
+        assertInvalidMessage("Undefined name column1 in where clause ('column1 = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND column1= 1");
+        assertInvalidMessage("Undefined name value in where clause ('value = 1')",
+                             "DELETE FROM %s WHERE a = 1 AND value = 1 AND column1 = 1");
+        assertInvalidMessage("Unknown identifier value",
+                             "DELETE value FROM %s WHERE a = 1");
+        assertInvalidMessage("Unknown identifier column1",
+                             "DELETE column1 FROM %s WHERE a = 1");
+    }
+
+    /**
+     * Checks if the memtable is empty or not
+     * @return {@code true} if the memtable is empty, {@code false} otherwise.
+     */
+    private boolean isMemtableEmpty()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(currentTable());
+        return cfs.metric.allMemtablesLiveDataSize.getValue() == 0;
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/DropCompactStorageThriftTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/DropCompactStorageThriftTest.java
new file mode 100644
index 0000000..7d81018
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/DropCompactStorageThriftTest.java

@@ -0,0 +1,641 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.validation.operations;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.UUID;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.EmptyType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.thrift.Cassandra;
+import org.apache.cassandra.thrift.CfDef;
+import org.apache.cassandra.thrift.Column;
+import org.apache.cassandra.thrift.ColumnDef;
+import org.apache.cassandra.thrift.ColumnOrSuperColumn;
+import org.apache.cassandra.thrift.ColumnParent;
+import org.apache.cassandra.thrift.IndexType;
+import org.apache.cassandra.thrift.KsDef;
+import org.apache.cassandra.thrift.Mutation;
+import org.apache.cassandra.thrift.SuperColumn;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
+import static org.apache.cassandra.thrift.ConsistencyLevel.ONE;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class DropCompactStorageThriftTest extends ThriftCQLTester
+{
+    @Test
+    public void thriftCreatedTableTest() throws Throwable
+    {
+        final String KEYSPACE = "thrift_created_table_test_ks";
+        final String TABLE = "test_table_1";
+
+        CfDef cfDef = new CfDef().setDefault_validation_class(Int32Type.instance.toString())
+                                 .setKey_validation_class(AsciiType.instance.toString())
+                                 .setComparator_type(AsciiType.instance.toString())
+                                 .setColumn_metadata(Arrays.asList(new ColumnDef(ByteBufferUtil.bytes("col1"),
+                                                                                 AsciiType.instance.toString())
+                                                                   .setIndex_name("col1Index")
+                                                                   .setIndex_type(IndexType.KEYS),
+                                                                   new ColumnDef(ByteBufferUtil.bytes("col2"),
+                                                                                 AsciiType.instance.toString())
+                                                                   .setIndex_name("col2Index")
+                                                                   .setIndex_type(IndexType.KEYS)))
+                                 .setKeyspace(KEYSPACE)
+                                 .setName(TABLE);
+
+        KsDef ksDef = new KsDef(KEYSPACE,
+                                SimpleStrategy.class.getName(),
+                                Arrays.asList(cfDef));
+        ksDef.setStrategy_options(Collections.singletonMap("replication_factor", "1"));
+
+        Cassandra.Client client = getClient();
+        client.system_add_keyspace(ksDef);
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("col1"), ByteBufferUtil.bytes("val1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("col2"), ByteBufferUtil.bytes("val2")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("dynamicKey1"), ByteBufferUtil.bytes(100)),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("dynamicKey2"), ByteBufferUtil.bytes(200)),
+                      ONE);
+
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+        assertColumnType(AsciiType.instance, resultSet, "key");
+        assertColumnType(AsciiType.instance, resultSet, "column1");
+        assertColumnType(Int32Type.instance, resultSet, "value");
+        assertColumnType(AsciiType.instance, resultSet, "col1");
+        assertColumnType(AsciiType.instance, resultSet, "col2");
+
+        assertRows(resultSet,
+                   row("key1", "dynamicKey1", "val1", "val2", 100),
+                   row("key1", "dynamicKey2", "val1", "val2", 200));
+    }
+
+    @Test
+    public void thriftStaticCompatTableTest() throws Throwable
+    {
+        String KEYSPACE = keyspace();
+        String TABLE = createTable("CREATE TABLE %s (key ascii PRIMARY KEY, val ascii) WITH COMPACT STORAGE");
+
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("val"), ByteBufferUtil.bytes("val1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("dynamicKey1"), ByteBufferUtil.bytes("dynamicValue1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("dynamicKey2"), ByteBufferUtil.bytes("dynamicValue2")),
+                      ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+        assertColumnType(AsciiType.instance, resultSet, "key");
+        assertColumnType(UTF8Type.instance, resultSet, "column1");
+        assertColumnType(AsciiType.instance, resultSet, "val");
+        assertColumnType(BytesType.instance, resultSet, "value");
+
+        // Values are interpreted as bytes by default:
+        assertRows(resultSet,
+                   row("key1", "dynamicKey1", "val1", ByteBufferUtil.bytes("dynamicValue1")),
+                   row("key1", "dynamicKey2", "val1", ByteBufferUtil.bytes("dynamicValue2")));
+    }
+
+    @Test
+    public void testSparseCompactTableIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (key ascii PRIMARY KEY, val ascii) WITH COMPACT STORAGE");
+
+        // Indexes are allowed only on the sparse compact tables
+        createIndex("CREATE INDEX ON %s(val)");
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s (key, val) VALUES (?, ?)", Integer.toString(i), Integer.toString(i * 10));
+
+        alterTable("ALTER TABLE %s DROP COMPACT STORAGE");
+
+        assertRows(execute("SELECT * FROM %s WHERE val = '50'"),
+                   row("5", null, "50", null));
+        assertRows(execute("SELECT * FROM %s WHERE key = '5'"),
+                   row("5", null, "50", null));
+    }
+
+    @Test
+    public void thriftCompatTableTest() throws Throwable
+    {
+        String KEYSPACE = keyspace();
+        String TABLE = createTable("CREATE TABLE %s (pkey ascii, ckey ascii, PRIMARY KEY (pkey, ckey)) WITH COMPACT STORAGE");
+
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckeyValue1"), ByteBufferUtil.EMPTY_BYTE_BUFFER),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckeyValue2"), ByteBufferUtil.EMPTY_BYTE_BUFFER),
+                      ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+        assertColumnType(AsciiType.instance, resultSet, "pkey");
+        assertColumnType(AsciiType.instance, resultSet, "ckey");
+        assertColumnType(EmptyType.instance, resultSet, "value");
+
+        // Value is always empty
+        assertRows(resultSet,
+                   row("key1", "ckeyValue1", ByteBufferUtil.EMPTY_BYTE_BUFFER),
+                   row("key1", "ckeyValue2", ByteBufferUtil.EMPTY_BYTE_BUFFER));
+    }
+
+    @Test
+    public void thriftDenseTableTest() throws Throwable
+    {
+        String KEYSPACE = keyspace();
+        String TABLE = createTable("CREATE TABLE %s (pkey text, ckey text, v text, PRIMARY KEY (pkey, ckey)) WITH COMPACT STORAGE");
+
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckey1"), ByteBufferUtil.bytes("cvalue1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckey2"), ByteBufferUtil.bytes("cvalue2")),
+                      ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+        assertColumnType(UTF8Type.instance, resultSet, "pkey");
+        assertColumnType(UTF8Type.instance, resultSet, "ckey");
+        assertColumnType(UTF8Type.instance, resultSet, "v");
+
+        assertRows(resultSet,
+                   row("key1", "ckey1", "cvalue1"),
+                   row("key1", "ckey2", "cvalue2"));
+    }
+
+    @Test
+    public void thriftTableWithIntKey() throws Throwable
+    {
+        final String KEYSPACE = "thrift_table_with_int_key_ks";
+        final String TABLE = "test_table_1";
+
+        ByteBuffer columnName = ByteBufferUtil.bytes("columnname");
+        CfDef cfDef = new CfDef().setDefault_validation_class(UTF8Type.instance.toString())
+                                 .setKey_validation_class(BytesType.instance.toString())
+                                 .setComparator_type(BytesType.instance.toString())
+                                 .setColumn_metadata(Arrays.asList(new ColumnDef(columnName,
+                                                                                 Int32Type.instance.toString())
+                                                                   .setIndex_name("col1Index")
+                                                                   .setIndex_type(IndexType.KEYS)))
+                                 .setKeyspace(KEYSPACE)
+                                 .setName(TABLE);
+
+        KsDef ksDef = new KsDef(KEYSPACE,
+                                SimpleStrategy.class.getName(),
+                                Arrays.asList(cfDef));
+        ksDef.setStrategy_options(Collections.singletonMap("replication_factor", "1"));
+
+        Cassandra.Client client = getClient();
+        client.system_add_keyspace(ksDef);
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(columnName, ByteBufferUtil.bytes(100)),
+                      ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+
+        assertEquals(resultSet.metadata()
+                              .stream()
+                              .filter((cs) -> cs.name.toString().equals(BytesType.instance.getString(columnName)))
+                              .findFirst()
+                              .get().type,
+                     Int32Type.instance);
+
+        assertRows(resultSet,
+                   row(UTF8Type.instance.decompose("key1"), null, 100, null));
+    }
+
+    @Test
+    public void thriftCompatTableWithSupercolumnsTest() throws Throwable
+    {
+        final String KEYSPACE = "thrift_compact_table_with_supercolumns_test";
+        final String TABLE = "test_table_1";
+
+        CfDef cfDef = new CfDef().setColumn_type("Super")
+                                 .setSubcomparator_type(Int32Type.instance.toString())
+                                 .setComparator_type(AsciiType.instance.toString())
+                                 .setDefault_validation_class(AsciiType.instance.toString())
+                                 .setKey_validation_class(AsciiType.instance.toString())
+                                 .setKeyspace(KEYSPACE)
+                                 .setName(TABLE);
+
+        KsDef ksDef = new KsDef(KEYSPACE,
+                                SimpleStrategy.class.getName(),
+                                Arrays.asList(cfDef));
+        ksDef.setStrategy_options(Collections.singletonMap("replication_factor", "1"));
+
+        Cassandra.Client client = getClient();
+        client.system_add_keyspace(ksDef);
+
+        client.set_keyspace(KEYSPACE);
+
+        Mutation mutation = new Mutation();
+        ColumnOrSuperColumn csoc = new ColumnOrSuperColumn();
+        csoc.setSuper_column(getSuperColumnForInsert(ByteBufferUtil.bytes("val1"),
+                                                     Arrays.asList(getColumnForInsert(ByteBufferUtil.bytes(1), ByteBufferUtil.bytes("value1")),
+                                                                   getColumnForInsert(ByteBufferUtil.bytes(2), ByteBufferUtil.bytes("value2")),
+                                                                   getColumnForInsert(ByteBufferUtil.bytes(3), ByteBufferUtil.bytes("value3")))));
+        mutation.setColumn_or_supercolumn(csoc);
+
+        Mutation mutation2 = new Mutation();
+        ColumnOrSuperColumn csoc2 = new ColumnOrSuperColumn();
+        csoc2.setSuper_column(getSuperColumnForInsert(ByteBufferUtil.bytes("val2"),
+                                                     Arrays.asList(getColumnForInsert(ByteBufferUtil.bytes(4), ByteBufferUtil.bytes("value7")),
+                                                                   getColumnForInsert(ByteBufferUtil.bytes(5), ByteBufferUtil.bytes("value8")),
+                                                                   getColumnForInsert(ByteBufferUtil.bytes(6), ByteBufferUtil.bytes("value9")))));
+        mutation2.setColumn_or_supercolumn(csoc2);
+
+        client.batch_mutate(Collections.singletonMap(ByteBufferUtil.bytes("key1"),
+                                                     Collections.singletonMap(TABLE, Arrays.asList(mutation, mutation2))),
+                            ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+        assertColumnType(AsciiType.instance, resultSet, "key");
+        assertColumnType(AsciiType.instance, resultSet, "column1");
+        assertColumnType(MapType.getInstance(Int32Type.instance, AsciiType.instance, true), resultSet, "");
+
+        assertRows(resultSet,
+                   row("key1", "val1", map(1, "value1", 2, "value2", 3, "value3")),
+                   row("key1", "val2", map(4, "value7", 5, "value8", 6, "value9")));
+
+        assertRows(execute(String.format("SELECT \"\" FROM %s.%s;", KEYSPACE, TABLE)),
+                   row(map(1, "value1", 2, "value2", 3, "value3")),
+                   row(map(4, "value7", 5, "value8", 6, "value9")));
+
+        assertInvalidMessage("Range deletions are not supported for specific columns",
+                             String.format("DELETE \"\" FROM %s.%s WHERE key=?;", KEYSPACE, TABLE),
+                             "key1");
+
+        execute(String.format("TRUNCATE %s.%s;", KEYSPACE, TABLE));
+
+        execute(String.format("INSERT INTO %s.%s (key, column1, \"\") VALUES (?, ?, ?);", KEYSPACE, TABLE),
+                "key3", "val1", map(7, "value7", 8, "value8"));
+
+        assertRows(execute(String.format("SELECT \"\" FROM %s.%s;", KEYSPACE, TABLE)),
+                   row(map(7, "value7", 8, "value8")));
+    }
+
+    @Test
+    public void thriftCreatedTableWithCompositeColumnsTest() throws Throwable
+    {
+        final String KEYSPACE = "thrift_created_table_with_composites_test_ks";
+        final String TABLE = "test_table_1";
+
+        CompositeType type = CompositeType.getInstance(AsciiType.instance, AsciiType.instance, AsciiType.instance);
+        CfDef cfDef = new CfDef().setDefault_validation_class(AsciiType.instance.toString())
+                                 .setComparator_type(type.toString())
+                                 .setKey_validation_class(AsciiType.instance.toString())
+                                 .setKeyspace(KEYSPACE)
+                                 .setName(TABLE);
+
+        KsDef ksDef = new KsDef(KEYSPACE,
+                                SimpleStrategy.class.getName(),
+                                Arrays.asList(cfDef));
+        ksDef.setStrategy_options(Collections.singletonMap("replication_factor", "1"));
+
+        Cassandra.Client client = getClient();
+        client.system_add_keyspace(ksDef);
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(type.decompose("a", "b", "c"), ByteBufferUtil.bytes("val1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(type.decompose("d", "e", "f"), ByteBufferUtil.bytes("val2")),
+                      ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+
+        assertColumnType(AsciiType.instance, resultSet, "key");
+        assertColumnType(AsciiType.instance, resultSet, "column1");
+        assertColumnType(AsciiType.instance, resultSet, "column2");
+        assertColumnType(AsciiType.instance, resultSet, "column3");
+        assertColumnType(AsciiType.instance, resultSet, "value");
+
+        assertRows(resultSet,
+                   row("key1", "a", "b", "c", "val1"),
+                   row("key1", "d", "e", "f", "val2"));
+    }
+
+    @Test
+    public void compactTableWithoutClusteringKeyTest() throws Throwable
+    {
+        String KEYSPACE = keyspace();
+        String TABLE = createTable("CREATE TABLE %s (pkey text PRIMARY KEY, s1 text, s2 text) WITH COMPACT STORAGE");
+
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckey1"), ByteBufferUtil.bytes("val1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckey2"), ByteBufferUtil.bytes("val2")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("s1"), ByteBufferUtil.bytes("s1Val")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("s2"), ByteBufferUtil.bytes("s2Val")),
+                      ONE);
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+
+        assertColumnType(UTF8Type.instance, resultSet, "pkey");
+        assertColumnType(UTF8Type.instance, resultSet, "s1");
+        assertColumnType(UTF8Type.instance, resultSet, "s2");
+        assertColumnType(UTF8Type.instance, resultSet, "column1");
+        assertColumnType(BytesType.instance, resultSet, "value");
+
+        assertRows(resultSet,
+                   row("key1", "ckey1", "s1Val", "s2Val", ByteBufferUtil.bytes("val1")),
+                   row("key1", "ckey2", "s1Val", "s2Val", ByteBufferUtil.bytes("val2")));
+    }
+
+    @Test
+    public void denseTableTestTest() throws Throwable
+    {
+        String KEYSPACE = keyspace();
+        String TABLE = createTable("CREATE TABLE %s (pkey text PRIMARY KEY, s text) WITH COMPACT STORAGE");
+
+        Cassandra.Client client = getClient();
+        client.set_keyspace(KEYSPACE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckey1"), ByteBufferUtil.bytes("val1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("ckey2"), ByteBufferUtil.bytes("val2")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("s"), ByteBufferUtil.bytes("sval1")),
+                      ONE);
+
+        client.insert(UTF8Type.instance.decompose("key1"),
+                      new ColumnParent(TABLE),
+                      getColumnForInsert(ByteBufferUtil.bytes("s"), ByteBufferUtil.bytes("sval2")),
+                      ONE);
+
+        // `s` becomes static, `column1` becomes a clustering key, `value` becomes visible
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE;", KEYSPACE, TABLE));
+        UntypedResultSet resultSet = execute(String.format("select * from %s.%s",
+                                                           KEYSPACE, TABLE));
+        assertColumnType(UTF8Type.instance, resultSet, "pkey");
+        assertColumnType(UTF8Type.instance, resultSet, "s");
+        assertColumnType(UTF8Type.instance, resultSet, "column1");
+        assertColumnType(BytesType.instance, resultSet, "value");
+
+        assertRows(resultSet,
+                   row("key1", "ckey1", "sval2", ByteBufferUtil.bytes("val1")),
+                   row("key1", "ckey2", "sval2", ByteBufferUtil.bytes("val2")));
+    }
+
+    @Test
+    public void denseCompositeWithIndexesTest() throws Throwable
+    {
+        final String KEYSPACE = "thrift_dense_composite_table_test_ks";
+        final String TABLE = "dense_composite_table";
+
+        ByteBuffer aCol = createDynamicCompositeKey(ByteBufferUtil.bytes("a"));
+        ByteBuffer bCol = createDynamicCompositeKey(ByteBufferUtil.bytes("b"));
+        ByteBuffer cCol = createDynamicCompositeKey(ByteBufferUtil.bytes("c"));
+
+        String compositeType = "DynamicCompositeType(a => BytesType, b => TimeUUIDType, c => UTF8Type)";
+
+        CfDef cfDef = new CfDef();
+        cfDef.setName(TABLE);
+        cfDef.setComparator_type(compositeType);
+        cfDef.setKeyspace(KEYSPACE);
+
+        cfDef.setColumn_metadata(
+        Arrays.asList(new ColumnDef(aCol, "BytesType").setIndex_type(IndexType.KEYS).setIndex_name(KEYSPACE + "_a"),
+                      new ColumnDef(bCol, "BytesType").setIndex_type(IndexType.KEYS).setIndex_name(KEYSPACE + "_b"),
+                      new ColumnDef(cCol, "BytesType").setIndex_type(IndexType.KEYS).setIndex_name(KEYSPACE + "_c")));
+
+
+        KsDef ksDef = new KsDef(KEYSPACE,
+                                SimpleStrategy.class.getName(),
+                                Collections.singletonList(cfDef));
+        ksDef.setStrategy_options(Collections.singletonMap("replication_factor", "1"));
+
+        Cassandra.Client client = getClient();
+        client.system_add_keyspace(ksDef);
+        client.set_keyspace(KEYSPACE);
+
+        CFMetaData cfm = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE).metadata;
+        assertFalse(cfm.isCQLTable());
+
+        List<Pair<ColumnDefinition, IndexTarget.Type>> compactTableTargets = new ArrayList<>();
+        compactTableTargets.add(CassandraIndex.parseTarget(cfm, "a"));
+        compactTableTargets.add(CassandraIndex.parseTarget(cfm, "b"));
+        compactTableTargets.add(CassandraIndex.parseTarget(cfm, "c"));
+
+        execute(String.format("ALTER TABLE %s.%s DROP COMPACT STORAGE", KEYSPACE, TABLE));
+        cfm = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE).metadata;
+        assertTrue(cfm.isCQLTable());
+
+        List<Pair<ColumnDefinition, IndexTarget.Type>> cqlTableTargets = new ArrayList<>();
+        cqlTableTargets.add(CassandraIndex.parseTarget(cfm, "a"));
+        cqlTableTargets.add(CassandraIndex.parseTarget(cfm, "b"));
+        cqlTableTargets.add(CassandraIndex.parseTarget(cfm, "c"));
+
+        assertEquals(compactTableTargets, cqlTableTargets);
+    }
+
+    private static ByteBuffer createDynamicCompositeKey(Object... objects)
+    {
+        int length = 0;
+
+        for (Object object : objects)
+        {
+            length += 2 * Short.BYTES +  Byte.BYTES;
+            if (object instanceof String)
+                length += ((String) object).length();
+            else if (object instanceof UUID)
+                length += 2 * Long.BYTES;
+            else if (object instanceof ByteBuffer)
+                length += ((ByteBuffer) object).remaining();
+            else
+                throw new MarshalException(object.getClass().getName() + " is not recognized as a valid type for this composite");
+        }
+
+        ByteBuffer out = ByteBuffer.allocate(length);
+
+        for (Object object : objects)
+        {
+            if (object instanceof String)
+            {
+                String cast = (String) object;
+
+                out.putShort((short) (0x8000 | 's'));
+                out.putShort((short) cast.length());
+                out.put(cast.getBytes());
+                out.put((byte) 0);
+            }
+            else if (object instanceof UUID)
+            {
+                out.putShort((short) (0x8000 | 't'));
+                out.putShort((short) 16);
+                out.put(UUIDGen.decompose((UUID) object));
+                out.put((byte) 0);
+            }
+            else if (object instanceof ByteBuffer)
+            {
+                ByteBuffer bytes = ((ByteBuffer) object).duplicate();
+                out.putShort((short) (0x8000 | 'b'));
+                out.putShort((short) bytes.remaining());
+                out.put(bytes);
+                out.put((byte) 0);
+            }
+            else
+            {
+                throw new MarshalException(object.getClass().getName() + " is not recognized as a valid type for this composite");
+            }
+        }
+
+        return out;
+    }
+
+    private Column getColumnForInsert(ByteBuffer columnName, ByteBuffer value)
+    {
+        Column column = new Column();
+        column.setName(columnName);
+        column.setValue(value);
+        column.setTimestamp(System.currentTimeMillis());
+        return column;
+    }
+
+    private SuperColumn getSuperColumnForInsert(ByteBuffer columnName, List<Column> columns)
+    {
+        SuperColumn column = new SuperColumn();
+        column.setName(columnName);
+        for (Column c : columns)
+            column.addToColumns(c);
+        return column;
+    }
+
+    private static void assertColumnType(AbstractType t, UntypedResultSet resultSet, String columnName)
+    {
+        for (ColumnSpecification columnSpecification : resultSet.metadata())
+        {
+            if (columnSpecification.name.toString().equals(columnName))
+            {
+                assertEquals(t, columnSpecification.type);
+                return;
+            }
+        }
+
+        fail(String.format("Could not find a column with name '%s'", columnName));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/DropRecreateAndRestoreTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/DropRecreateAndRestoreTest.java
index 4a3a51d..f491d24 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/DropRecreateAndRestoreTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/DropRecreateAndRestoreTest.java

@@ -70,8 +70,7 @@
         {
             // Restore to point in time.
             CommitLog.instance.archiver.restorePointInTime = time;
-            CommitLog.instance.resetUnsafe(true);
-            CommitLog.instance.recover();
+            CommitLog.instance.resetUnsafe(false);
         }
         finally
         {

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/DropTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/DropTest.java
index 2c68390..692eb45 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/DropTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/DropTest.java

@@ -28,10 +28,23 @@
     public void testNonExistingOnes() throws Throwable
     {
         assertInvalidMessage("Cannot drop non existing table", "DROP TABLE " + KEYSPACE + ".table_does_not_exist");
-        assertInvalidMessage("Cannot drop non existing table", "DROP TABLE keyspace_does_not_exist.table_does_not_exist");
+        assertInvalidMessage("Cannot drop table in unknown keyspace", "DROP TABLE keyspace_does_not_exist.table_does_not_exist");
 
         execute("DROP TABLE IF EXISTS " + KEYSPACE + ".table_does_not_exist");
         execute("DROP TABLE IF EXISTS keyspace_does_not_exist.table_does_not_exist");
     }
 
+    @Test
+    public void testDropTableWithDroppedColumns() throws Throwable
+    {
+        // CASSANDRA-13730: entry should be removed from dropped_columns table when table is dropped
+        String cf = createTable("CREATE TABLE %s (k1 int, c1 int , v1 int, v2 int, PRIMARY KEY (k1, c1))");
+
+        execute("ALTER TABLE %s DROP v2");
+        execute("DROP TABLE %s");
+
+        assertRowsIgnoringOrder(execute("select * from system_schema.dropped_columns where keyspace_name = '"
+                + keyspace()
+                + "' and table_name = '" + cf + "'"));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java
index 1d532cb..e467291 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java

@@ -22,6 +22,7 @@
 
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class InsertTest extends CQLTester
 {
@@ -59,18 +60,312 @@
     }
 
     @Test
-    public void testOverlyLargeInsertPK() throws Throwable
+    public void testInsert() throws Throwable
     {
-        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY ((a), b))");
+        testInsert(false);
+        testInsert(true);
+    }
+
+    private void testInsert(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering int," +
+                                      "value int," +
+                                      " PRIMARY KEY (partitionKey, clustering))");
+
+        execute("INSERT INTO %s (partitionKey, clustering) VALUES (0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 1, 1)");
+        flush(forceFlush);
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(0, 0, null),
+                   row(0, 1, 1));
+
+        // Missing primary key columns
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "INSERT INTO %s (clustering, value) VALUES (0, 1)");
+        assertInvalidMessage("Some clustering keys are missing: clustering",
+                             "INSERT INTO %s (partitionKey, value) VALUES (0, 2)");
+
+        // multiple time the same value
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering, value, value) VALUES (0, 0, 2, 2)");
+
+        // multiple time same primary key element in WHERE clause
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering, clustering, value) VALUES (0, 0, 0, 2)");
+
+        // unknown identifiers
+        assertInvalidMessage("Unknown identifier clusteringx",
+                             "INSERT INTO %s (partitionKey, clusteringx, value) VALUES (0, 0, 2)");
+
+        assertInvalidMessage("Unknown identifier valuex",
+                             "INSERT INTO %s (partitionKey, clustering, valuex) VALUES (0, 0, 2)");
+    }
+
+    @Test
+    public void testInsertWithCompactFormat() throws Throwable
+    {
+        testInsertWithCompactFormat(false);
+        testInsertWithCompactFormat(true);
+    }
+
+    private void testInsertWithCompactFormat(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering int," +
+                                      "value int," +
+                                      " PRIMARY KEY (partitionKey, clustering)) WITH COMPACT STORAGE");
+
+        execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering, value) VALUES (0, 1, 1)");
+        flush(forceFlush);
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(0, 0, 0),
+                   row(0, 1, 1));
+
+        // Invalid Null values for the clustering key or the regular column
+        assertInvalidMessage("Some clustering keys are missing: clustering",
+                             "INSERT INTO %s (partitionKey, value) VALUES (0, 0)");
+        assertInvalidMessage("Column value is mandatory for this COMPACT STORAGE table",
+                             "INSERT INTO %s (partitionKey, clustering) VALUES (0, 0)");
+
+        // Missing primary key columns
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "INSERT INTO %s (clustering, value) VALUES (0, 1)");
+
+        // multiple time the same value
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering, value, value) VALUES (0, 0, 2, 2)");
+
+        // multiple time same primary key element in WHERE clause
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering, clustering, value) VALUES (0, 0, 0, 2)");
+
+        // unknown identifiers
+        assertInvalidMessage("Unknown identifier clusteringx",
+                             "INSERT INTO %s (partitionKey, clusteringx, value) VALUES (0, 0, 2)");
+
+        assertInvalidMessage("Unknown identifier valuex",
+                             "INSERT INTO %s (partitionKey, clustering, valuex) VALUES (0, 0, 2)");
+    }
+
+    @Test
+    public void testInsertWithTwoClusteringColumns() throws Throwable
+    {
+        testInsertWithTwoClusteringColumns(false);
+        testInsertWithTwoClusteringColumns(true);
+    }
+
+    private void testInsertWithTwoClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                    "clustering_1 int," +
+                    "clustering_2 int," +
+                    "value int," +
+                    " PRIMARY KEY (partitionKey, clustering_1, clustering_2))");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2) VALUES (0, 0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 1, 1)");
+        flush(forceFlush);
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(0, 0, 0, null),
+                   row(0, 0, 1, 1));
+
+        // Missing primary key columns
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "INSERT INTO %s (clustering_1, clustering_2, value) VALUES (0, 0, 1)");
+        assertInvalidMessage("Some clustering keys are missing: clustering_1",
+                             "INSERT INTO %s (partitionKey, clustering_2, value) VALUES (0, 0, 2)");
+
+        // multiple time the same value
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering_1, value, clustering_2, value) VALUES (0, 0, 2, 0, 2)");
+
+        // multiple time same primary key element in WHERE clause
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering_1, clustering_1, clustering_2, value) VALUES (0, 0, 0, 0, 2)");
+
+        // unknown identifiers
+        assertInvalidMessage("Unknown identifier clustering_1x",
+                             "INSERT INTO %s (partitionKey, clustering_1x, clustering_2, value) VALUES (0, 0, 0, 2)");
+
+        assertInvalidMessage("Unknown identifier valuex",
+                             "INSERT INTO %s (partitionKey, clustering_1, clustering_2, valuex) VALUES (0, 0, 0, 2)");
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testInsertWithCompactStaticFormat() throws Throwable
+    {
+        testWithCompactTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int) WITH COMPACT STORAGE");
+
+        // if column1 is present, hidden column is called column2
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, column1 int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c, column1) VALUES (1, 1, 1, 1)");
+        assertInvalidMessage("Unknown identifier column2",
+                             "INSERT INTO %s (a, b, c, column2) VALUES (1, 1, 1, 1)");
+
+        // if value is present, hidden column is called value1
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, value int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c, value) VALUES (1, 1, 1, 1)");
+        assertInvalidMessage("Unknown identifier value1",
+                             "INSERT INTO %s (a, b, c, value1) VALUES (1, 1, 1, 1)");
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testInsertWithCompactNonStaticFormat() throws Throwable
+    {
+        testWithCompactTable("CREATE TABLE %s (a int, b int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        testWithCompactTable("CREATE TABLE %s (a int, b int, v int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+    }
+
+    private void testWithCompactTable(String tableQuery) throws Throwable
+    {
+        createTable(tableQuery);
+
+        // pass correct types to the hidden columns
+        assertInvalidMessage("Unknown identifier column1",
+                             "INSERT INTO %s (a, b, column1) VALUES (?, ?, ?)",
+                             1, 1, ByteBufferUtil.bytes('a'));
+        assertInvalidMessage("Unknown identifier value",
+                             "INSERT INTO %s (a, b, value) VALUES (?, ?, ?)",
+                             1, 1, ByteBufferUtil.bytes('a'));
+        assertInvalidMessage("Unknown identifier column1",
+                             "INSERT INTO %s (a, b, column1, value) VALUES (?, ?, ?, ?)",
+                             1, 1, ByteBufferUtil.bytes('a'), ByteBufferUtil.bytes('b'));
+        assertInvalidMessage("Unknown identifier value",
+                             "INSERT INTO %s (a, b, value, column1) VALUES (?, ?, ?, ?)",
+                             1, 1, ByteBufferUtil.bytes('a'), ByteBufferUtil.bytes('b'));
+
+        // pass incorrect types to the hidden columns
+        assertInvalidMessage("Unknown identifier value",
+                             "INSERT INTO %s (a, b, value) VALUES (?, ?, ?)",
+                             1, 1, 1);
+        assertInvalidMessage("Unknown identifier column1",
+                             "INSERT INTO %s (a, b, column1) VALUES (?, ?, ?)",
+                             1, 1, 1);
+        assertEmpty(execute("SELECT * FROM %s"));
+
+        // pass null to the hidden columns
+        assertInvalidMessage("Unknown identifier value",
+                             "INSERT INTO %s (a, b, value) VALUES (?, ?, ?)",
+                             1, 1, null);
+        assertInvalidMessage("Unknown identifier column1",
+                             "INSERT INTO %s (a, b, column1) VALUES (?, ?, ?)",
+                             1, 1, null);
+    }
+
+    @Test
+    public void testInsertWithCompactStorageAndTwoClusteringColumns() throws Throwable
+    {
+        testInsertWithCompactStorageAndTwoClusteringColumns(false);
+        testInsertWithCompactStorageAndTwoClusteringColumns(true);
+    }
+
+    private void testInsertWithCompactStorageAndTwoClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering_1 int," +
+                                      "clustering_2 int," +
+                                      "value int," +
+                                      " PRIMARY KEY (partitionKey, clustering_1, clustering_2)) WITH COMPACT STORAGE");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 0, 0)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 1, 1)");
+        flush(forceFlush);
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(0, 0, null, 0),
+                   row(0, 0, 0, 0),
+                   row(0, 0, 1, 1));
+
+        // Invalid Null values for the clustering key or the regular column
+        assertInvalidMessage("PRIMARY KEY column \"clustering_2\" cannot be restricted as preceding column \"clustering_1\" is not restricted",
+                             "INSERT INTO %s (partitionKey, clustering_2, value) VALUES (0, 0, 0)");
+        assertInvalidMessage("Column value is mandatory for this COMPACT STORAGE table",
+                             "INSERT INTO %s (partitionKey, clustering_1, clustering_2) VALUES (0, 0, 0)");
+
+        // Missing primary key columns
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "INSERT INTO %s (clustering_1, clustering_2, value) VALUES (0, 0, 1)");
+        assertInvalidMessage("PRIMARY KEY column \"clustering_2\" cannot be restricted as preceding column \"clustering_1\" is not restricted",
+                             "INSERT INTO %s (partitionKey, clustering_2, value) VALUES (0, 0, 2)");
+
+        // multiple time the same value
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering_1, value, clustering_2, value) VALUES (0, 0, 2, 0, 2)");
+
+        // multiple time same primary key element in WHERE clause
+        assertInvalidMessage("The column names contains duplicates",
+                             "INSERT INTO %s (partitionKey, clustering_1, clustering_1, clustering_2, value) VALUES (0, 0, 0, 0, 2)");
+
+        // unknown identifiers
+        assertInvalidMessage("Unknown identifier clustering_1x",
+                             "INSERT INTO %s (partitionKey, clustering_1x, clustering_2, value) VALUES (0, 0, 0, 2)");
+
+        assertInvalidMessage("Unknown identifier valuex",
+                             "INSERT INTO %s (partitionKey, clustering_1, clustering_2, valuex) VALUES (0, 0, 0, 2)");
+    }
+
+    @Test
+    public void testInsertWithAStaticColumn() throws Throwable
+    {
+        testInsertWithAStaticColumn(false);
+        testInsertWithAStaticColumn(true);
+    }
+
+    private void testInsertWithAStaticColumn(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                    "clustering_1 int," +
+                    "clustering_2 int," +
+                    "value int," +
+                    "staticValue text static," +
+                    " PRIMARY KEY (partitionKey, clustering_1, clustering_2))");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, staticValue) VALUES (0, 0, 0, 'A')");
+        execute("INSERT INTO %s (partitionKey, staticValue) VALUES (1, 'B')");
+        flush(forceFlush);
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, null, null, "B", null),
+                   row(0, 0, 0, "A", null));
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (1, 0, 0, 0)");
+        flush(forceFlush);
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, 0, 0, "B", 0),
+                   row(0, 0, 0, "A", null));
+
+        // Missing primary key columns
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "INSERT INTO %s (clustering_1, clustering_2, staticValue) VALUES (0, 0, 'A')");
+        assertInvalidMessage("Some clustering keys are missing: clustering_1",
+                             "INSERT INTO %s (partitionKey, clustering_2, staticValue) VALUES (0, 0, 'A')");
+    }
+
+    @Test
+    public void testPKInsertWithValueOver64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY (a, b))");
 
         assertInvalidThrow(InvalidRequestException.class,
                            "INSERT INTO %s (a, b) VALUES (?, 'foo')", new String(TOO_BIG.array()));
     }
 
     @Test
-    public void testOverlyLargeInsertCK() throws Throwable
+    public void testCKInsertWithValueOver64K() throws Throwable
     {
-        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY ((a), b))");
+        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY (a, b))");
 
         assertInvalidThrow(InvalidRequestException.class,
                            "INSERT INTO %s (a, b) VALUES ('foo', ?)", new String(TOO_BIG.array()));

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/InsertUpdateIfConditionTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/InsertUpdateIfConditionTest.java
index 6396727..a47691a 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/InsertUpdateIfConditionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/InsertUpdateIfConditionTest.java

@@ -26,7 +26,9 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.schema.SchemaKeyspace;
 
+import static java.lang.String.format;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
@@ -115,7 +117,14 @@
         assertRows(execute("UPDATE %s SET v1 = 3, v2 = 'bar' WHERE k = 0 IF EXISTS"), row(false));
 
         // Should apply
+        assertEmpty(execute("SELECT * FROM %s WHERE k = 0"));
         assertRows(execute("DELETE FROM %s WHERE k = 0 IF v1 IN (?)", (Integer) null), row(true));
+
+        createTable(" CREATE TABLE %s (k int, c int, v1 text, PRIMARY KEY(k, c))");
+        assertInvalidMessage("IN on the clustering key columns is not supported with conditional updates",
+                             "UPDATE %s SET v1 = 'A' WHERE k = 0 AND c IN () IF EXISTS");
+        assertInvalidMessage("IN on the clustering key columns is not supported with conditional updates",
+                             "UPDATE %s SET v1 = 'A' WHERE k = 0 AND c IN (1, 2) IF EXISTS");
     }
 
     /**
@@ -150,6 +159,7 @@
                              "UPDATE %s SET v1 = 3, v2 = 'bar' WHERE k = 0 IF v1 >= ?", unset());
         assertInvalidMessage("Invalid 'unset' value in condition",
                              "UPDATE %s SET v1 = 3, v2 = 'bar' WHERE k = 0 IF v1 != ?", unset());
+
     }
 
     /**
@@ -158,7 +168,7 @@
     @Test
     public void testConditionalDelete() throws Throwable
     {
-        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int)");
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int,)");
 
         assertRows(execute("DELETE FROM %s WHERE k=1 IF EXISTS"), row(false));
 
@@ -183,29 +193,50 @@
         assertRows(execute("DELETE v1 FROM %s WHERE k=3 IF EXISTS"), row(true));
         assertRows(execute("DELETE FROM %s WHERE k=3 IF EXISTS"), row(true));
 
+        execute("INSERT INTO %s (k, v1) VALUES (4, 2)");
+        execute("UPDATE %s USING TTL 1 SET v1=2 WHERE k=4");
+        Thread.sleep(1001);
+        assertRows(execute("SELECT * FROM %s WHERE k=4"), row(4, null));
+        assertRows(execute("DELETE FROM %s WHERE k=4 IF EXISTS"), row(true));
+        assertEmpty(execute("SELECT * FROM %s WHERE k=4"));
+
         // static columns
         createTable("CREATE TABLE %s (k text, s text static, i int, v text, PRIMARY KEY (k, i) )");
 
         execute("INSERT INTO %s (k, s, i, v) VALUES ('k', 's', 0, 'v')");
         assertRows(execute("DELETE v FROM %s WHERE k='k' AND i=0 IF EXISTS"), row(true));
         assertRows(execute("DELETE FROM %s WHERE k='k' AND i=0 IF EXISTS"), row(true));
+        assertRows(execute("SELECT * FROM %s"), row("k", null, "s", null));
+        assertRows(execute("DELETE v FROM %s WHERE k='k' AND i=0 IF s = 'z'"), row(false, "s"));
+        assertRows(execute("DELETE v FROM %s WHERE k='k' AND i=0 IF v = 'z'"), row(false));
+        assertRows(execute("DELETE v FROM %s WHERE k='k' AND i=0 IF v = 'z' AND s = 'z'"), row(false, null, "s"));
         assertRows(execute("DELETE v FROM %s WHERE k='k' AND i=0 IF EXISTS"), row(false));
         assertRows(execute("DELETE FROM %s WHERE k='k' AND i=0 IF EXISTS"), row(false));
 
         // CASSANDRA-6430
-        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to use IF conditions, but column 'i' is not restricted",
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns",
                              "DELETE FROM %s WHERE k = 'k' IF EXISTS");
-        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to use IF conditions, but column 'i' is not restricted",
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns",
                              "DELETE FROM %s WHERE k = 'k' IF v = ?", "foo");
-        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to use IF conditions, but column 'k' is not restricted",
+        assertInvalidMessage("Some partition key parts are missing: k",
                              "DELETE FROM %s WHERE i = 0 IF EXISTS");
+
         assertInvalidMessage("Invalid INTEGER constant (0) for \"k\" of type text",
                              "DELETE FROM %s WHERE k = 0 AND i > 0 IF EXISTS");
-
-        assertInvalidMessage("Invalid operator > for PRIMARY KEY part i",
+        assertInvalidMessage("Invalid INTEGER constant (0) for \"k\" of type text",
+                             "DELETE FROM %s WHERE k = 0 AND i > 0 IF v = 'foo'");
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns",
                              "DELETE FROM %s WHERE k = 'k' AND i > 0 IF EXISTS");
-        assertInvalidMessage("Invalid operator > for PRIMARY KEY part i",
-                             "DELETE FROM %s WHERE k = 'k' AND i > 0 IF v = ?", "foo");
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns",
+                             "DELETE FROM %s WHERE k = 'k' AND i > 0 IF v = 'foo'");
+        assertInvalidMessage("IN on the clustering key columns is not supported with conditional deletions",
+                             "DELETE FROM %s WHERE k = 'k' AND i IN (0, 1) IF v = 'foo'");
+        assertInvalidMessage("IN on the clustering key columns is not supported with conditional deletions",
+                             "DELETE FROM %s WHERE k = 'k' AND i IN () IF v = 'foo'");
+        assertInvalidMessage("IN on the clustering key columns is not supported with conditional deletions",
+                             "DELETE FROM %s WHERE k = 'k' AND i IN (0, 1) IF EXISTS");
+        assertInvalidMessage("IN on the clustering key columns is not supported with conditional deletions",
+                             "DELETE FROM %s WHERE k = 'k' AND i IN () IF EXISTS");
 
         assertInvalidMessage("Invalid 'unset' value in condition",
                              "DELETE FROM %s WHERE k = 'k' AND i = 0 IF v = ?", unset());
@@ -220,6 +251,28 @@
         assertRows(execute("DELETE FROM %s WHERE k = 1 AND i = 2 IF s = ?", 1), row(true));
         assertEmpty(execute("SELECT * FROM %s WHERE k = 1 AND i = 2"));
         assertRows(execute("SELECT * FROM %s WHERE k = 1"), row(1, null, 1, null));
+
+        createTable("CREATE TABLE %s (k int, i int, v1 int, v2 int, s int static, PRIMARY KEY (k, i))");
+        execute("INSERT INTO %s (k, i, v1, v2, s) VALUES (?, ?, ?, ?, ?)",
+                1, 1, 1, 1, 1);
+        assertRows(execute("DELETE v1 FROM %s WHERE k = 1 AND i = 1 IF EXISTS"),
+                   row(true));
+        assertRows(execute("DELETE v2 FROM %s WHERE k = 1 AND i = 1 IF EXISTS"),
+                   row(true));
+        assertRows(execute("DELETE FROM %s WHERE k = 1 AND i = 1 IF EXISTS"),
+                   row(true));
+        assertRows(execute("select * from %s"),
+                   row(1, null, 1, null, null));
+        assertRows(execute("DELETE v1 FROM %s WHERE k = 1 AND i = 1 IF EXISTS"),
+                   row(false));
+        assertRows(execute("DELETE v1 FROM %s WHERE k = 1 AND i = 1 IF s = 5"),
+                   row(false, 1));
+        assertRows(execute("DELETE v1 FROM %s WHERE k = 1 AND i = 1 IF v1 = 1 AND v2 = 1"),
+                   row(false));
+        assertRows(execute("DELETE v1 FROM %s WHERE k = 1 AND i = 1 IF v1 = 1 AND v2 = 1 AND s = 1"),
+                   row(false, null, null, 1));
+        assertRows(execute("DELETE v1 FROM %s WHERE k = 1 AND i = 5 IF s = 1"),
+                   row(true));
     }
 
     /**
@@ -249,13 +302,17 @@
         assertRows(execute("UPDATE %s SET v='bar', version=2 WHERE id=0 AND k='k2' IF version = ?", 1), row(true));
         assertRows(execute("SELECT * FROM %s"), row(0, "k1", 2, "foo"), row(0, "k2", 2, "bar"));
 
+        // Batch output is slightly different from non-batch CAS, since a full PK is included to disambiguate
+        // cases when conditions span across multiple rows.
+        assertRows(execute("UPDATE %1$s SET version=3 WHERE id=0 IF version=1; "),
+                   row(false, 2));
         // Testing batches
         assertRows(execute("BEGIN BATCH " +
                            "UPDATE %1$s SET v='foobar' WHERE id=0 AND k='k1'; " +
                            "UPDATE %1$s SET v='barfoo' WHERE id=0 AND k='k2'; " +
                            "UPDATE %1$s SET version=3 WHERE id=0 IF version=1; " +
                            "APPLY BATCH "),
-                   row(false, 0, null, 2));
+                   row(false, 0, "k1", 2));
 
         assertRows(execute("BEGIN BATCH " +
                            "UPDATE %1$s SET v = 'foobar' WHERE id = 0 AND k = 'k1'; " +
@@ -363,23 +420,22 @@
                    row(1, 7, null, 8));
         execute("INSERT INTO %s (pk, static_col) VALUES (?, ?)", 1, 1);
 
-        assertInvalidMessage("DELETE statements must restrict all PARTITION KEY columns with equality relations in order " +
-                             "to use IF conditions on static columns, but column 'pk' is not restricted",
+        assertInvalidMessage("Some partition key parts are missing: pk",
                              "DELETE static_col FROM %s WHERE ck = ? IF static_col = ?", 1, 1);
 
-        assertInvalidMessage("Invalid restriction on clustering column ck since the DELETE statement modifies only static columns",
+        assertInvalidMessage("Invalid restrictions on clustering columns since the DELETE statement modifies only static columns",
                              "DELETE static_col FROM %s WHERE pk = ? AND ck = ? IF static_col = ?", 1, 1, 1);
 
-        assertInvalidMessage("Primary key column 'ck' must be specified in order to delete column 'value'",
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns",
                              "DELETE static_col, value FROM %s WHERE pk = ? IF static_col = ?", 1, 1);
 
         // Same query but with an invalid condition
-        assertInvalidMessage("Primary key column 'ck' must be specified in order to delete column 'value'",
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order to delete non static columns",
                              "DELETE static_col, value FROM %s WHERE pk = ? IF static_col = ?", 1, 2);
 
         // DELETE of an underspecified PRIMARY KEY should not succeed if static is not only restriction
-        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations in order " +
-                             "to use IF conditions, but column 'ck' is not restricted",
+        assertInvalidMessage("DELETE statements must restrict all PRIMARY KEY columns with equality relations" +
+                             " in order to use IF condition on non static columns",
                              "DELETE static_col FROM %s WHERE pk = ? IF value = ? AND static_col = ?", 1, 2, 1);
 
         assertRows(execute("DELETE value FROM %s WHERE pk = ? AND ck = ? IF value = ? AND static_col = ?", 1, 1, 2, 2), row(false, 2, 1));
@@ -419,20 +475,20 @@
                    row(1, 5, 2, 6),
                    row(1, 7, 2, 8));
 
-        assertInvalidMessage("Missing mandatory PRIMARY KEY part pk",
+        assertInvalidMessage("Some partition key parts are missing: pk",
                              "UPDATE %s SET static_col = ? WHERE ck = ? IF static_col = ?", 3, 1, 1);
 
-        assertInvalidMessage("Invalid restriction on clustering column ck since the UPDATE statement modifies only static columns",
+        assertInvalidMessage("Invalid restrictions on clustering columns since the UPDATE statement modifies only static columns",
                              "UPDATE %s SET static_col = ? WHERE pk = ? AND ck = ? IF static_col = ?", 3, 1, 1, 1);
 
-        assertInvalidMessage("Missing mandatory PRIMARY KEY part ck",
+        assertInvalidMessage("Some clustering keys are missing: ck",
                              "UPDATE %s SET static_col = ?, value = ? WHERE pk = ? IF static_col = ?", 3, 1, 1, 2);
 
         // Same query but with an invalid condition
-        assertInvalidMessage("Missing mandatory PRIMARY KEY part ck",
+        assertInvalidMessage("Some clustering keys are missing: ck",
                              "UPDATE %s SET static_col = ?, value = ? WHERE pk = ? IF static_col = ?", 3, 1, 1, 1);
 
-        assertInvalidMessage("Missing mandatory PRIMARY KEY part ck",
+        assertInvalidMessage("Some clustering keys are missing: ck",
                              "UPDATE %s SET static_col = ? WHERE pk = ? IF value = ? AND static_col = ?", 3, 1, 4, 2);
 
         assertRows(execute("UPDATE %s SET value = ? WHERE pk = ? AND ck = ? IF value = ? AND static_col = ?", 3, 1, 1, 3, 2), row(false, 2, 2));
@@ -815,6 +871,24 @@
         }
     }
 
+    @Test
+    public void testFrozenWithNullValues() throws Throwable
+    {
+        createTable(String.format("CREATE TABLE %%s (k int PRIMARY KEY, m %s)", "frozen<list<text>>"));
+        execute("INSERT INTO %s (k, m) VALUES (0, null)");
+
+        assertRows(execute("UPDATE %s SET m = ? WHERE k = 0 IF m = ?", list("test"), list("comparison")), row(false, null));
+
+        createTable(String.format("CREATE TABLE %%s (k int PRIMARY KEY, m %s)", "frozen<map<text,int>>"));
+        execute("INSERT INTO %s (k, m) VALUES (0, null)");
+
+        assertRows(execute("UPDATE %s SET m = ? WHERE k = 0 IF m = ?", map("test", 3), map("comparison", 2)), row(false, null));
+
+        createTable(String.format("CREATE TABLE %%s (k int PRIMARY KEY, m %s)", "frozen<set<text>>"));
+        execute("INSERT INTO %s (k, m) VALUES (0, null)");
+
+        assertRows(execute("UPDATE %s SET m = ? WHERE k = 0 IF m = ?", set("test"), set("comparison")), row(false, null));
+    }
     /**
      * Test expanded functionality from CASSANDRA-6839,
      * migrated from cql_tests.py:TestCQL.expanded_map_item_conditional_test()
@@ -918,17 +992,26 @@
 
         // create and confirm
         schemaChange("CREATE KEYSPACE IF NOT EXISTS " + keyspace + " WITH replication = { 'class':'SimpleStrategy', 'replication_factor':1} and durable_writes = true ");
-        assertRows(execute("select durable_writes from system.schema_keyspaces where keyspace_name = ?", keyspace), row(true));
+        assertRows(execute(format("select durable_writes from %s.%s where keyspace_name = ?",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.KEYSPACES),
+                           keyspace),
+                   row(true));
 
         // unsuccessful create since it's already there, confirm settings don't change
         schemaChange("CREATE KEYSPACE IF NOT EXISTS " + keyspace + " WITH replication = {'class':'SimpleStrategy', 'replication_factor':1} and durable_writes = false ");
 
-        assertRows(execute("select durable_writes from system.schema_keyspaces where keyspace_name = ?", keyspace), row(true));
+        assertRows(execute(format("select durable_writes from %s.%s where keyspace_name = ?",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.KEYSPACES),
+                           keyspace),
+                   row(true));
 
         // drop and confirm
         schemaChange("DROP KEYSPACE IF EXISTS " + keyspace);
 
-        assertEmpty(execute("select * from system.schema_keyspaces where keyspace_name = ?", keyspace));
+        assertEmpty(execute(format("select * from %s.%s where keyspace_name = ?", SchemaKeyspace.NAME, SchemaKeyspace.KEYSPACES),
+                            keyspace));
     }
 
 
@@ -947,19 +1030,19 @@
         // create and confirm
         schemaChange("CREATE TABLE IF NOT EXISTS " + fullTableName + " (id text PRIMARY KEY, value1 blob) with comment = 'foo'");
 
-        assertRows(execute("select comment from system.schema_columnfamilies where keyspace_name = ? and columnfamily_name = ?", KEYSPACE, tableName),
+        assertRows(execute("select comment from system_schema.tables where keyspace_name = ? and table_name = ?", KEYSPACE, tableName),
                    row("foo"));
 
         // unsuccessful create since it's already there, confirm settings don't change
         schemaChange("CREATE TABLE IF NOT EXISTS " + fullTableName + " (id text PRIMARY KEY, value2 blob)with comment = 'bar'");
 
-        assertRows(execute("select comment from system.schema_columnfamilies where keyspace_name = ? and columnfamily_name = ?", KEYSPACE, tableName),
+        assertRows(execute("select comment from system_schema.tables where keyspace_name = ? and table_name = ?", KEYSPACE, tableName),
                    row("foo"));
 
         // drop and confirm
         schemaChange("DROP TABLE IF EXISTS " + fullTableName);
 
-        assertEmpty(execute("select * from system.schema_columnfamilies where keyspace_name = ? and columnfamily_name = ?", KEYSPACE, tableName));
+        assertEmpty(execute("select * from system_schema.tables where keyspace_name = ? and table_name = ?", KEYSPACE, tableName));
     }
 
     /**
@@ -1003,7 +1086,11 @@
 
         // create and confirm
         execute("CREATE TYPE IF NOT EXISTS mytype (somefield int)");
-        assertRows(execute("SELECT type_name from system.schema_usertypes where keyspace_name = ? and type_name = ?", KEYSPACE, "mytype"),
+        assertRows(execute(format("SELECT type_name from %s.%s where keyspace_name = ? and type_name = ?",
+                                  SchemaKeyspace.NAME,
+                                  SchemaKeyspace.TYPES),
+                           KEYSPACE,
+                           "mytype"),
                    row("mytype"));
 
         // unsuccessful create since it 's already there
@@ -1012,7 +1099,11 @@
 
         // drop and confirm
         execute("DROP TYPE IF EXISTS mytype");
-        assertEmpty(execute("SELECT type_name from system.schema_usertypes where keyspace_name = ? and type_name = ?", KEYSPACE, "mytype"));
+        assertEmpty(execute(format("SELECT type_name from %s.%s where keyspace_name = ? and type_name = ?",
+                                   SchemaKeyspace.NAME,
+                                   SchemaKeyspace.TYPES),
+                            KEYSPACE,
+                            "mytype"));
     }
 
     @Test
@@ -1029,14 +1120,20 @@
         assertRows(execute("SELECT * FROM %s WHERE a = 6"),
                    row(6, 6, 6, "a"));
 
+        execute("INSERT INTO %s (a, b, s, d) values (7, 7, 100, 'a')");
+        assertRows(execute("UPDATE %s SET s = 7 WHERE a = 7 IF s = 101"),
+                   row(false, 100));
+        assertRows(execute("SELECT * FROM %s WHERE a = 7"),
+                   row(7, 7, 100, "a"));
+
         // pre-existing row with null in the static column
         execute("INSERT INTO %s (a, b, d) values (7, 7, 'a')");
         assertRows(execute("UPDATE %s SET s = 7 WHERE a = 7 IF s = NULL"),
-                   row(true));
+                   row(false, 100));
         assertRows(execute("SELECT * FROM %s WHERE a = 7"),
-                   row(7, 7, 7, "a"));
+                   row(7, 7, 100, "a"));
 
-        // deleting row before CAS
+        // deleting row before CAS makes it effectively non-existing
         execute("DELETE FROM %s WHERE a = 8;");
         assertRows(execute("UPDATE %s SET s = 8 WHERE a = 8 IF s = NULL"),
                    row(true));
@@ -1045,39 +1142,24 @@
     }
 
     @Test
-    public void testConditionalUpdatesWithNullValues() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a int, b int, s int static, d text, PRIMARY KEY (a, b))");
-
-        // pre-populate, leave out static column
-        for (int i = 1; i <= 5; i++)
-            execute("INSERT INTO %s (a, b) VALUES (?, ?)", i, i);
-
-        conditionalUpdatesWithNonExistingOrNullValues();
-
-        // rejected: IN doesn't contain null
-        assertRows(execute("UPDATE %s SET s = 30 WHERE a = 3 IF s IN ?", list(10,20,30)),
-                   row(false));
-        assertRows(execute("SELECT * FROM %s WHERE a = 3"),
-                   row(3, 3, null, null));
-
-        // rejected: comparing number with NULL always returns false
-        for (String operator: new String[] { ">", "<", ">=", "<=", "="})
-        {
-            assertRows(execute("UPDATE %s SET s = 50 WHERE a = 5 IF s " + operator + " ?", 3),
-                       row(false));
-            assertRows(execute("SELECT * FROM %s WHERE a = 5"),
-                       row(5, 5, null, null));
-        }
-
-    }
-
-    @Test
     public void testConditionalUpdatesWithNonExistingValues() throws Throwable
     {
         createTable("CREATE TABLE %s (a int, b int, s int static, d text, PRIMARY KEY (a, b))");
 
-        conditionalUpdatesWithNonExistingOrNullValues();
+        assertRows(execute("UPDATE %s SET s = 1 WHERE a = 1 IF s = NULL"),
+                   row(true));
+        assertRows(execute("SELECT a, s, d FROM %s WHERE a = 1"),
+                   row(1, 1, null));
+
+        assertRows(execute("UPDATE %s SET s = 2 WHERE a = 2 IF s IN (10,20,NULL)"),
+                   row(true));
+        assertRows(execute("SELECT a, s, d FROM %s WHERE a = 2"),
+                   row(2, 2, null));
+
+        assertRows(execute("UPDATE %s SET s = 4 WHERE a = 4 IF s != 4"),
+                   row(true));
+        assertRows(execute("SELECT a, s, d FROM %s WHERE a = 4"),
+                   row(4, 4, null));
 
         // rejected: IN doesn't contain null
         assertRows(execute("UPDATE %s SET s = 3 WHERE a = 3 IF s IN ?", list(10,20,30)),
@@ -1093,22 +1175,118 @@
         }
     }
 
-    private void conditionalUpdatesWithNonExistingOrNullValues() throws Throwable
+    @Test
+    public void testConditionalUpdatesWithNullValues() throws Throwable
     {
-        assertRows(execute("UPDATE %s SET s = 1 WHERE a = 1 IF s = ?", (Integer) null),
-                   row(true));
-        assertRows(execute("SELECT a, s, d FROM %s WHERE a = 1"),
-                   row(1, 1, null));
+        createTable("CREATE TABLE %s (a int, b int, s int static, d int, PRIMARY KEY (a, b))");
 
-        assertRows(execute("UPDATE %s SET s = 2 WHERE a = 2 IF s IN (?, ?, ?)", 10,20,null),
-                   row(true));
-        assertRows(execute("SELECT a, s, d FROM %s WHERE a = 2"),
-                   row(2, 2, null));
+        // pre-populate, leave out static column
+        for (int i = 1; i <= 5; i++)
+        {
+            execute("INSERT INTO %s (a, b) VALUES (?, ?)", i, 1);
+            execute("INSERT INTO %s (a, b) VALUES (?, ?)", i, 2);
+        }
 
-        assertRows(execute("UPDATE %s SET s = 4 WHERE a = 4 IF s != ?", 4 ),
+        assertRows(execute("UPDATE %s SET s = 100 WHERE a = 1 IF s = NULL"),
                    row(true));
-        assertRows(execute("SELECT a, s, d FROM %s WHERE a = 4"),
-                   row(4, 4, null));
+        assertRows(execute("SELECT a, b, s, d FROM %s WHERE a = 1"),
+                   row(1, 1, 100, null),
+                   row(1, 2, 100, null));
+
+        assertRows(execute("UPDATE %s SET s = 200 WHERE a = 2 IF s IN (10,20,NULL)"),
+                   row(true));
+        assertRows(execute("SELECT a, b, s, d FROM %s WHERE a = 2"),
+                   row(2, 1, 200, null),
+                   row(2, 2, 200, null));
+
+        // rejected: IN doesn't contain null
+        assertRows(execute("UPDATE %s SET s = 30 WHERE a = 3 IF s IN ?", list(10,20,30)),
+                   row(false, null));
+        assertRows(execute("SELECT * FROM %s WHERE a = 3"),
+                   row(3, 1, null, null),
+                   row(3, 2, null, null));
+
+        assertRows(execute("UPDATE %s SET s = 400 WHERE a = 4 IF s IN (10,20,NULL)"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 4"),
+                   row(4, 1, 400, null),
+                   row(4, 2, 400, null));
+
+        // rejected: comparing number with NULL always returns false
+        for (String operator: new String[] { ">", "<", ">=", "<=", "="})
+        {
+            assertRows(execute("UPDATE %s SET s = 50 WHERE a = 5 IF s " + operator + " 3"),
+                       row(false, null));
+            assertRows(execute("SELECT * FROM %s WHERE a = 5"),
+                       row(5, 1, null, null),
+                       row(5, 2, null, null));
+        }
+
+        assertRows(execute("UPDATE %s SET s = 500 WHERE a = 5 IF s != 5"),
+                   row(true));
+        assertRows(execute("SELECT a, b, s, d FROM %s WHERE a = 5"),
+                   row(5, 1, 500, null),
+                   row(5, 2, 500, null));
+
+        // Similar test, although with two static columns to test limits
+        createTable("CREATE TABLE %s (a int, b int, s1 int static, s2 int static, d int, PRIMARY KEY (a, b))");
+
+        for (int i = 1; i <= 5; i++)
+            for (int j = 0; j < 5; j++)
+                execute("INSERT INTO %s (a, b, d) VALUES (?, ?, ?)", i, j, i + j);
+
+        assertRows(execute("UPDATE %s SET s2 = 100 WHERE a = 1 IF s1 = NULL"),
+                   row(true));
+
+        execute("INSERT INTO %s (a, b, s1) VALUES (?, ?, ?)", 2, 2, 2);
+        assertRows(execute("UPDATE %s SET s1 = 100 WHERE a = 2 IF s2 = NULL"),
+                   row(true));
+
+        execute("INSERT INTO %s (a, b, s1) VALUES (?, ?, ?)", 2, 2, 2);
+        assertRows(execute("UPDATE %s SET s1 = 100 WHERE a = 2 IF s2 = NULL"),
+                   row(true));
+    }
+
+    @Test
+    public void testStaticsWithMultipleConditions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, s1 int static, s2 int static, d int, PRIMARY KEY (a, b))");
+
+        for (int i = 1; i <= 5; i++)
+        {
+            execute("INSERT INTO %s (a, b, d) VALUES (?, ?, ?)", i, 1, 5);
+            execute("INSERT INTO %s (a, b, d) VALUES (?, ?, ?)", i, 2, 6);
+        }
+
+        assertRows(execute("BEGIN BATCH\n"
+                           + "UPDATE %1$s SET s2 = 102 WHERE a = 1 IF s1 = null;\n"
+                           + "UPDATE %1$s SET s1 = 101 WHERE a = 1 IF s2 = null;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 1"),
+                   row(1, 1, 101, 102, 5),
+                   row(1, 2, 101, 102, 6));
+
+
+        assertRows(execute("BEGIN BATCH\n"
+                           + "UPDATE %1$s SET s2 = 202 WHERE a = 2 IF s1 = null;\n"
+                           + "UPDATE %1$s SET s1 = 201 WHERE a = 2 IF s2 = null;\n"
+                           + "UPDATE %1$s SET d = 203 WHERE a = 2 AND b = 1 IF d = 5;\n"
+                           + "UPDATE %1$s SET d = 204 WHERE a = 2 AND b = 2 IF d = 6;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+
+        assertRows(execute("SELECT * FROM %s WHERE a = 2"),
+                   row(2, 1, 201, 202, 203),
+                   row(2, 2, 201, 202, 204));
+
+        assertRows(execute("BEGIN BATCH\n"
+                           + "UPDATE %1$s SET s2 = 202 WHERE a = 20 IF s1 = null;\n"
+                           + "UPDATE %1$s SET s1 = 201 WHERE a = 20 IF s2 = null;\n"
+                           + "UPDATE %1$s SET d = 203 WHERE a = 20 AND b = 1 IF d = 5;\n"
+                           + "UPDATE %1$s SET d = 204 WHERE a = 20 AND b = 2 IF d = 6;\n"
+                           + "APPLY BATCH"),
+                   row(false));
     }
 
     @Test
@@ -1120,7 +1298,14 @@
         for (int i = 1; i <= 6; i++)
             execute("INSERT INTO %s (a, b) VALUES (?, ?)", i, i);
 
-        testConditionalUpdatesWithNonExistingOrNullValuesWithBatch();
+        // applied: null is indistiguishable from empty value, lwt condition is executed before INSERT
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, d) values (2, 2, 'a');\n"
+                           + "UPDATE %1$s SET s = 2 WHERE a = 2 IF s = null;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 2"),
+                   row(2, 2, 2, "a"));
 
         // rejected: comparing number with null value always returns false
         for (String operator: new String[] { ">", "<", ">=", "<=", "="})
@@ -1129,20 +1314,36 @@
                                + "INSERT INTO %1$s (a, b, s, d) values (3, 3, 40, 'a');\n"
                                + "UPDATE %1$s SET s = 30 WHERE a = 3 IF s " + operator + " 5;\n"
                                + "APPLY BATCH"),
-                       row(false));
+                       row(false, 3, 3, null));
             assertRows(execute("SELECT * FROM %s WHERE a = 3"),
                        row(3, 3, null, null));
         }
 
+        // applied: lwt condition is executed before INSERT, update is applied after it
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, s, d) values (4, 4, 4, 'a');\n"
+                           + "UPDATE %1$s SET s = 5 WHERE a = 4 IF s = null;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 4"),
+                   row(4, 4, 5, "a"));
+
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, s, d) values (5, 5, 5, 'a');\n"
+                           + "UPDATE %1$s SET s = 6 WHERE a = 5 IF s IN (1,2,null);\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 5"),
+                   row(5, 5, 6, "a"));
+
         // rejected: IN doesn't contain null
         assertRows(execute("BEGIN BATCH\n"
                            + "INSERT INTO %1$s (a, b, s, d) values (6, 6, 70, 'a');\n"
                            + "UPDATE %1$s SET s = 60 WHERE a = 6 IF s IN (1,2,3);\n"
                            + "APPLY BATCH"),
-                   row(false));
+                   row(false, 6, 6, null));
         assertRows(execute("SELECT * FROM %s WHERE a = 6"),
                    row(6, 6, null, null));
-
     }
 
     @Test
@@ -1150,7 +1351,38 @@
     {
         createTable("CREATE TABLE %s (a int, b int, s int static, d text, PRIMARY KEY (a, b))");
 
-        testConditionalUpdatesWithNonExistingOrNullValuesWithBatch();
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, d) values (2, 2, 'a');\n"
+                           + "UPDATE %1$s SET s = 2 WHERE a = 2 IF s = null;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 2"),
+                   row(2, 2, 2, "a"));
+
+        // applied: lwt condition is executed before INSERT, update is applied after it
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, s, d) values (4, 4, 4, 'a');\n"
+                           + "UPDATE %1$s SET s = 5 WHERE a = 4 IF s = null;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 4"),
+                   row(4, 4, 5, "a")); // Note that the update wins because 5 > 4 (we have a timestamp tie, so values are used)
+
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, s, d) values (5, 5, 5, 'a');\n"
+                           + "UPDATE %1$s SET s = 6 WHERE a = 5 IF s IN (1,2,null);\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 5"),
+                   row(5, 5, 6, "a")); // Same as above
+
+        assertRows(execute("BEGIN BATCH\n"
+                           + "INSERT INTO %1$s (a, b, s, d) values (7, 7, 7, 'a');\n"
+                           + "UPDATE %1$s SET s = 8 WHERE a = 7 IF s != 7;\n"
+                           + "APPLY BATCH"),
+                   row(true));
+        assertRows(execute("SELECT * FROM %s WHERE a = 7"),
+                   row(7, 7, 8, "a")); // Same as above
 
         // rejected: comparing number with non-existing value always returns false
         for (String operator: new String[] { ">", "<", ">=", "<=", "="})
@@ -1172,43 +1404,6 @@
         assertEmpty(execute("SELECT * FROM %s WHERE a = 6"));
     }
 
-    private void testConditionalUpdatesWithNonExistingOrNullValuesWithBatch() throws Throwable
-    {
-        // applied: null is indistiguishable from empty value, lwt condition is executed before INSERT
-        assertRows(execute("BEGIN BATCH\n"
-                           + "INSERT INTO %1$s (a, b, d) values (2, 2, 'a');\n"
-                           + "UPDATE %1$s SET s = 2 WHERE a = 2 IF s = null;\n"
-                           + "APPLY BATCH"),
-                   row(true));
-        assertRows(execute("SELECT * FROM %s WHERE a = 2"),
-                   row(2, 2, 2, "a"));
-
-        // applied: lwt condition is executed before INSERT, update is applied after it
-        assertRows(execute("BEGIN BATCH\n"
-                           + "INSERT INTO %1$s (a, b, s, d) values (4, 4, 4, 'a');\n"
-                           + "UPDATE %1$s SET s = 5 WHERE a = 4 IF s = null;\n"
-                           + "APPLY BATCH"),
-                   row(true));
-        assertRows(execute("SELECT * FROM %s WHERE a = 4"),
-                   row(4, 4, 5, "a"));
-
-        assertRows(execute("BEGIN BATCH\n"
-                           + "INSERT INTO %1$s (a, b, s, d) values (5, 5, 5, 'a');\n"
-                           + "UPDATE %1$s SET s = 6 WHERE a = 5 IF s IN (1,2,null);\n"
-                           + "APPLY BATCH"),
-                   row(true));
-        assertRows(execute("SELECT * FROM %s WHERE a = 5"),
-                   row(5, 5, 6, "a"));
-
-        assertRows(execute("BEGIN BATCH\n"
-                           + "INSERT INTO %1$s (a, b, s, d) values (7, 7, 7, 'a');\n"
-                           + "UPDATE %1$s SET s = 8 WHERE a = 7 IF s != 7;\n"
-                           + "APPLY BATCH"),
-                   row(true));
-        assertRows(execute("SELECT * FROM %s WHERE a = 7"),
-                   row(7, 7, 8, "a"));
-    }
-
     @Test
     public void testConditionalDeleteWithNullValues() throws Throwable
     {
@@ -1253,7 +1448,6 @@
     {
         createTable("CREATE TABLE %s (a int, b int, s1 int static, s2 int static, v int, PRIMARY KEY (a, b))");
 
-        // applied: null is indistiguishable from empty value, lwt condition is executed before INSERT
         assertRows(execute("BEGIN BATCH\n"
                            + "INSERT INTO %1$s (a, b, s1, v) values (2, 2, 2, 2);\n"
                            + "DELETE s1 FROM %1$s WHERE a = 2 IF s2 = null;\n"
@@ -1281,6 +1475,7 @@
                    row(false));
         assertEmpty(execute("SELECT * FROM %s WHERE a = 6"));
 
+        // Note that on equal timestamp, tombstone wins so the DELETE wins
         assertRows(execute("BEGIN BATCH\n"
                            + "INSERT INTO %1$s (a, b, s1, v) values (4, 4, 4, 4);\n"
                            + "DELETE s1 FROM %1$s WHERE a = 4 IF s2 = null;\n"
@@ -1289,6 +1484,7 @@
         assertRows(execute("SELECT * FROM %s WHERE a = 4"),
                    row(4, 4, null, null, 4));
 
+        // Note that on equal timestamp, tombstone wins so the DELETE wins
         assertRows(execute("BEGIN BATCH\n"
                            + "INSERT INTO %1$s (a, b, s1, v) VALUES (5, 5, 5, 5);\n"
                            + "DELETE s1 FROM %1$s WHERE a = 5 IF s1 IN (1,2,null);\n"
@@ -1297,6 +1493,7 @@
         assertRows(execute("SELECT * FROM %s WHERE a = 5"),
                    row(5, 5, null, null, 5));
 
+        // Note that on equal timestamp, tombstone wins so the DELETE wins
         assertRows(execute("BEGIN BATCH\n"
                            + "INSERT INTO %1$s (a, b, s1, v) values (7, 7, 7, 7);\n"
                            + "DELETE s1 FROM %1$s WHERE a = 7 IF s2 != 7;\n"
@@ -1306,6 +1503,110 @@
                    row(7, 7, null, null, 7));
     }
 
+    /**
+     * Test for CASSANDRA-12060, using a table without clustering.
+     */
+    @Test
+    public void testMultiExistConditionOnSameRowNoClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 text, v2 text)");
+
+        // Multiple inserts on the same row with not exist conditions
+        assertRows(execute("BEGIN BATCH "
+                           + "INSERT INTO %1$s (k, v1) values (0, 'foo') IF NOT EXISTS; "
+                           + "INSERT INTO %1$s (k, v2) values (0, 'bar') IF NOT EXISTS; "
+                           + "APPLY BATCH"),
+                   row(true));
+
+        assertRows(execute("SELECT * FROM %s WHERE k = 0"), row(0, "foo", "bar"));
+
+        // Same, but both insert on the same column: doing so would almost surely be a user error, but that's the
+        // original case reported in #12867, so being thorough.
+        assertRows(execute("BEGIN BATCH "
+                           + "INSERT INTO %1$s (k, v1) values (1, 'foo') IF NOT EXISTS; "
+                           + "INSERT INTO %1$s (k, v1) values (1, 'bar') IF NOT EXISTS; "
+                           + "APPLY BATCH"),
+                   row(true));
+
+        // As all statement gets the same timestamp, the biggest value ends up winning, so that's "foo"
+        assertRows(execute("SELECT * FROM %s WHERE k = 1"), row(1, "foo", null));
+
+        // Multiple deletes on the same row with exists conditions (note that this is somewhat non-sensical, one of the
+        // delete is redundant, we're just checking it doesn't break something)
+        assertRows(execute("BEGIN BATCH "
+                           + "DELETE FROM %1$s WHERE k = 0 IF EXISTS; "
+                           + "DELETE FROM %1$s WHERE k = 0 IF EXISTS; "
+                           + "APPLY BATCH"),
+                   row(true));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE k = 0"));
+
+        // Validate we can't mix different type of conditions however
+        assertInvalidMessage("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row",
+                             "BEGIN BATCH "
+                           + "INSERT INTO %1$s (k, v1) values (1, 'foo') IF NOT EXISTS; "
+                           + "DELETE FROM %1$s WHERE k = 1 IF EXISTS; "
+                           + "APPLY BATCH");
+
+        assertInvalidMessage("Cannot mix IF conditions and IF NOT EXISTS for the same row",
+                             "BEGIN BATCH "
+                             + "INSERT INTO %1$s (k, v1) values (1, 'foo') IF NOT EXISTS; "
+                             + "UPDATE %1$s SET v2 = 'bar' WHERE k = 1 IF v1 = 'foo'; "
+                             + "APPLY BATCH");
+    }
+
+    /**
+     * Test for CASSANDRA-12060, using a table with clustering.
+     */
+    @Test
+    public void testMultiExistConditionOnSameRowClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, t int, v1 text, v2 text, PRIMARY KEY (k, t))");
+
+        // Multiple inserts on the same row with not exist conditions
+        assertRows(execute("BEGIN BATCH "
+                           + "INSERT INTO %1$s (k, t, v1) values (0, 0, 'foo') IF NOT EXISTS; "
+                           + "INSERT INTO %1$s (k, t, v2) values (0, 0, 'bar') IF NOT EXISTS; "
+                           + "APPLY BATCH"),
+                   row(true));
+
+        assertRows(execute("SELECT * FROM %s WHERE k = 0"), row(0, 0, "foo", "bar"));
+
+        // Same, but both insert on the same column: doing so would almost surely be a user error, but that's the
+        // original case reported in #12867, so being thorough.
+        assertRows(execute("BEGIN BATCH "
+                           + "INSERT INTO %1$s (k, t, v1) values (1, 0, 'foo') IF NOT EXISTS; "
+                           + "INSERT INTO %1$s (k, t, v1) values (1, 0, 'bar') IF NOT EXISTS; "
+                           + "APPLY BATCH"),
+                   row(true));
+
+        // As all statement gets the same timestamp, the biggest value ends up winning, so that's "foo"
+        assertRows(execute("SELECT * FROM %s WHERE k = 1"), row(1, 0, "foo", null));
+
+        // Multiple deletes on the same row with exists conditions (note that this is somewhat non-sensical, one of the
+        // delete is redundant, we're just checking it doesn't break something)
+        assertRows(execute("BEGIN BATCH "
+                           + "DELETE FROM %1$s WHERE k = 0 AND t = 0 IF EXISTS; "
+                           + "DELETE FROM %1$s WHERE k = 0 AND t = 0 IF EXISTS; "
+                           + "APPLY BATCH"),
+                   row(true));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE k = 0"));
+
+        // Validate we can't mix different type of conditions however
+        assertInvalidMessage("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row",
+                             "BEGIN BATCH "
+                             + "INSERT INTO %1$s (k, t, v1) values (1, 0, 'foo') IF NOT EXISTS; "
+                             + "DELETE FROM %1$s WHERE k = 1 AND t = 0 IF EXISTS; "
+                             + "APPLY BATCH");
+
+        assertInvalidMessage("Cannot mix IF conditions and IF NOT EXISTS for the same row",
+                             "BEGIN BATCH "
+                             + "INSERT INTO %1$s (k, t, v1) values (1, 0, 'foo') IF NOT EXISTS; "
+                             + "UPDATE %1$s SET v2 = 'bar' WHERE k = 1 AND t = 0 IF v1 = 'foo'; "
+                             + "APPLY BATCH");
+    }
+
     @Test
     public void testInMarkerWithUDTs() throws Throwable
     {

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java
index 0ffb799..8ef4b58 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java

@@ -26,14 +26,15 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
-import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.StorageService;
 
 public class SelectLimitTest extends CQLTester
 {
     @BeforeClass
     public static void setUp()
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
+        StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
+        DatabaseDescriptor.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     /**
@@ -93,6 +94,43 @@
 
     }
 
+    @Test
+    public void testLimitInStaticTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, v int, PRIMARY KEY (k) ) WITH COMPACT STORAGE ");
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s(k, v) VALUES (?, ?)", i, i);
+
+        assertRows(execute("SELECT * FROM %s LIMIT 5"),
+                   row(0, 0),
+                   row(1, 1),
+                   row(2, 2),
+                   row(3, 3),
+                   row(4, 4));
+
+        assertRows(execute("SELECT v FROM %s LIMIT 5"),
+                   row(0),
+                   row(1),
+                   row(2),
+                   row(3),
+                   row(4));
+
+        assertRows(execute("SELECT k FROM %s LIMIT 5"),
+                   row(0),
+                   row(1),
+                   row(2),
+                   row(3),
+                   row(4));
+
+        assertRows(execute("SELECT DISTINCT k FROM %s LIMIT 5"),
+                   row(0),
+                   row(1),
+                   row(2),
+                   row(3),
+                   row(4));
+    }
+
     /**
      * Check for #7052 bug,
      * migrated from cql_tests.py:TestCQL.limit_compact_table()
@@ -125,9 +163,44 @@
                    row(1, 1),
                    row(1, 2),
                    row(1, 3));
+        assertRows(execute("SELECT * FROM %s WHERE v > 1 AND v <= 3 LIMIT 6 ALLOW FILTERING"),
+                   row(0, 2),
+                   row(0, 3),
+                   row(1, 2),
+                   row(1, 3),
+                   row(2, 2),
+                   row(2, 3));
+    }
 
-        // strict bound (v > 1) over a range of partitions is not supported for compact storage if limit is provided
-        assertInvalidThrow(InvalidRequestException.class, "SELECT * FROM %s WHERE v > 1 AND v <= 3 LIMIT 6 ALLOW FILTERING");
+    @Test
+    public void testLimitWithDeletedRowsAndStaticColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, c int, v int, s int static, PRIMARY KEY (pk, c))");
+
+        execute("INSERT INTO %s (pk, c, v, s) VALUES (1, -1, 1, 1)");
+        execute("INSERT INTO %s (pk, c, v, s) VALUES (2, -1, 1, 1)");
+        execute("INSERT INTO %s (pk, c, v, s) VALUES (3, -1, 1, 1)");
+        execute("INSERT INTO %s (pk, c, v, s) VALUES (4, -1, 1, 1)");
+        execute("INSERT INTO %s (pk, c, v, s) VALUES (5, -1, 1, 1)");
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, -1, 1, 1),
+                   row(2, -1, 1, 1),
+                   row(3, -1, 1, 1),
+                   row(4, -1, 1, 1),
+                   row(5, -1, 1, 1));
+
+        execute("DELETE FROM %s WHERE pk = 2");
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, -1, 1, 1),
+                   row(3, -1, 1, 1),
+                   row(4, -1, 1, 1),
+                   row(5, -1, 1, 1));
+
+        assertRows(execute("SELECT * FROM %s LIMIT 2"),
+                   row(1, -1, 1, 1),
+                   row(3, -1, 1, 1));
     }
 
     @Test
@@ -145,11 +218,8 @@
                         execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", i, j, i + j);
         }
 
-        for (boolean forceFlush : new boolean[]{false, true})
+        beforeAndAfterFlush(() ->
         {
-            if (forceFlush)
-                flush();
-
             assertRows(execute("SELECT * FROM %s"),
                        row(0, 0, 0, 0),
                        row(0, 2, 0, 2),
@@ -187,7 +257,7 @@
                               row(1, 1, 1, 2),
                               row(2, 1, 2, 3));
             }
-        }
+        });
 
         assertRows(execute("SELECT * FROM %s WHERE a IN (0, 1, 2, 3) AND b = 1 LIMIT 2 ALLOW FILTERING"),
                    row(1, 1, 1, 2),
@@ -198,15 +268,16 @@
                    row(2, 1, 2, 3));
 
         assertRows(execute("SELECT * FROM %s WHERE a IN (0, 1, 2, 3) AND b = 1 ORDER BY b DESC LIMIT 2 ALLOW FILTERING"),
-                   row(2, 1, 2, 3),
-                   row(1, 1, 1, 2));
+                   row(1, 1, 1, 2),
+                   row(2, 1, 2, 3));
 
         assertRows(execute("SELECT * FROM %s WHERE a IN (0, 1, 2, 3) AND b >= 1 AND b <= 1 ORDER BY b DESC LIMIT 2 ALLOW FILTERING"),
-                   row(2, 1, 2, 3),
-                   row(1, 1, 1, 2));
+                   row(1, 1, 1, 2),
+                   row(2, 1, 2, 3));
 
         execute("SELECT * FROM %s WHERE a IN (0, 1, 2, 3)"); // Load all data in the row cache
 
+        // Partition range queries
         assertRows(execute("SELECT * FROM %s WHERE b = 1 LIMIT 2 ALLOW FILTERING"),
                    row(1, 1, 1, 2),
                    row(2, 1, 2, 3));
@@ -215,6 +286,15 @@
                    row(1, 1, 1, 2),
                    row(2, 1, 2, 3));
 
+        // Multiple partitions queries
+        assertRows(execute("SELECT * FROM %s WHERE a IN (0, 1, 2) AND b = 1 LIMIT 2 ALLOW FILTERING"),
+                   row(1, 1, 1, 2),
+                   row(2, 1, 2, 3));
+
+        assertRows(execute("SELECT * FROM %s WHERE a IN (0, 1, 2) AND b >= 1 AND b <= 1 LIMIT 2 ALLOW FILTERING"),
+                   row(1, 1, 1, 2),
+                   row(2, 1, 2, 3));
+
         // Test with paging
         for (int pageSize = 1; pageSize < 4; pageSize++)
         {
@@ -229,21 +309,18 @@
 
         // With multiple clustering columns
         createTable("CREATE TABLE %s (a int, b int, c int, s int static, d int, primary key (a, b, c))"
-                + " WITH caching = {'keys': 'ALL', 'rows_per_partition' : 'ALL'}");
+           + " WITH caching = {'keys': 'ALL', 'rows_per_partition' : 'ALL'}");
 
         for (int i = 0; i < 3; i++)
         {
             execute("INSERT INTO %s (a, s) VALUES (?, ?)", i, i);
-            for (int j = 0; j < 3; j++)
-                if (!(i == 0 && j == 1))
-                    execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", i, j, j, i + j);
+                for (int j = 0; j < 3; j++)
+                    if (!(i == 0 && j == 1))
+                        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", i, j, j, i + j);
         }
 
-        for (boolean forceFlush : new boolean[]{false, true})
+        beforeAndAfterFlush(() ->
         {
-            if (forceFlush)
-                flush();
-
             assertRows(execute("SELECT * FROM %s"),
                        row(0, 0, 0, 0, 0),
                        row(0, 2, 2, 0, 2),
@@ -273,7 +350,7 @@
                               row(1, 1, 1, 1, 2),
                               row(2, 1, 1, 2, 3));
             }
-        }
+        });
 
         execute("SELECT * FROM %s WHERE a IN (0, 1, 2)"); // Load data in the row cache
 
@@ -285,6 +362,14 @@
                    row(1, 1, 1, 1, 2),
                    row(2, 1, 1, 2, 3));
 
+        assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2, 3, 4) AND b = 1 ALLOW FILTERING"),
+                   row(1, 1, 1, 1, 2),
+                   row(2, 1, 1, 2, 3));
+
+        assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2, 3, 4) AND b IN (1, 2, 3, 4) AND c >= 1 AND c <= 1 LIMIT 2 ALLOW FILTERING"),
+                   row(1, 1, 1, 1, 2),
+                   row(2, 1, 1, 2, 3));
+
         // Test with paging
         for (int pageSize = 1; pageSize < 4; pageSize++)
         {
@@ -315,13 +400,9 @@
                    row(3, 1, 9, 1),
                    row(4, 1, 9, 1));
 
-        execute("DELETE FROM %s WHERE pk = ? AND c = ?", 3, 1);
+        execute("DELETE FROM %s WHERE pk = ? and c = ?", 3, 1);
 
         // Test without paging
-        assertRows(execute("SELECT * FROM %s WHERE v = ?", 1),
-                   row(1, 1, 9, 1),
-                   row(4, 1, 9, 1));
-
         assertRows(execute("SELECT * FROM %s WHERE v = ? LIMIT 2", 1),
                    row(1, 1, 9, 1),
                    row(4, 1, 9, 1));
@@ -329,11 +410,47 @@
         // Test with paging
         for (int pageSize = 1; pageSize < 4; pageSize++)
         {
-            assertRowsNet(executeNetWithPaging("SELECT * FROM %s WHERE v = 1", pageSize),
+            assertRowsNet(executeNetWithPaging("SELECT * FROM %s WHERE v = 1 LIMIT 2", pageSize),
+                          row(1, 1, 9, 1),
+                          row(4, 1, 9, 1));
+        }
+    }
+
+    @Test
+    public void testFilteringWithPartitionWithoutRows() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, c int, s int static, v int, PRIMARY KEY(pk, c))");
+
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 1, 1, 9, 1);
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 1, 2, 9, 2);
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 3, 1, 9, 1);
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 4, 1, 9, 1);
+        flush();
+
+        assertRows(execute("SELECT * FROM %s WHERE v = ? ALLOW FILTERING", 1),
+                   row(1, 1, 9, 1),
+                   row(3, 1, 9, 1),
+                   row(4, 1, 9, 1));
+
+        execute("DELETE FROM %s WHERE pk = ? and c = ?", 3, 1);
+
+        // Test without paging
+        assertRows(execute("SELECT * FROM %s WHERE v = ? LIMIT 2 ALLOW FILTERING", 1),
+                   row(1, 1, 9, 1),
+                   row(4, 1, 9, 1));
+
+        assertRows(execute("SELECT * FROM %s WHERE pk IN ? AND v = ? LIMIT 2 ALLOW FILTERING", list(1, 3, 4), 1),
+                   row(1, 1, 9, 1),
+                   row(4, 1, 9, 1));
+
+        // Test with paging
+        for (int pageSize = 1; pageSize < 4; pageSize++)
+        {
+            assertRowsNet(executeNetWithPaging("SELECT * FROM %s WHERE v = 1 LIMIT 2 ALLOW FILTERING", pageSize),
                           row(1, 1, 9, 1),
                           row(4, 1, 9, 1));
 
-            assertRowsNet(executeNetWithPaging("SELECT * FROM %s WHERE v = 1 LIMIT 2", pageSize),
+            assertRowsNet(executeNetWithPaging("SELECT * FROM %s WHERE pk IN (1, 3, 4) AND v = 1 LIMIT 2 ALLOW FILTERING", pageSize),
                           row(1, 1, 9, 1),
                           row(4, 1, 9, 1));
         }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java
index 4e5a1e6..6fec497 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java

@@ -19,11 +19,16 @@
 
 import org.junit.Test;
 
+import java.nio.ByteBuffer;
+
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class SelectMultiColumnRelationTest extends CQLTester
 {
+    private static final ByteBuffer TOO_BIG = ByteBuffer.allocate(1024 * 65);
+
     @Test
     public void testSingleClusteringInvalidQueries() throws Throwable
     {
@@ -871,6 +876,12 @@
                              "SELECT * FROM %s WHERE (b, c) >= (?, ?) AND e = ?", 1, 1, 2);
         assertRows(execute("SELECT * FROM %s WHERE (b, c) >= (?, ?) AND e = ? ALLOW FILTERING", 1, 1, 2),
                    row(0, 1, 1, 1, 2));
+
+        assertInvalidMessage("Unsupported null value for column e",
+                             "SELECT * FROM %s WHERE (b, c) >= (?, ?) AND e = ?  ALLOW FILTERING", 1, 1, null);
+
+        assertInvalidMessage("Unsupported unset value for column e",
+                             "SELECT * FROM %s WHERE (b, c) >= (?, ?) AND e = ?  ALLOW FILTERING", 1, 1, unset());
     }
 
     @Test
@@ -901,6 +912,19 @@
     }
 
     @Test
+    public void testMultipleClusteringWithIndexAndValueOver64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b blob, c int, d int, PRIMARY KEY (a, b, c))");
+        createIndex("CREATE INDEX ON %s (b)");
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, ByteBufferUtil.bytes(1), 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, ByteBufferUtil.bytes(2), 1, 0);
+
+        assertInvalidMessage("Index expression values may not be larger than 64K",
+                             "SELECT * FROM %s WHERE (b, c) = (?, ?) AND d = ?  ALLOW FILTERING", TOO_BIG, 1, 2);
+    }
+
+    @Test
     public void testMultiplePartitionKeyAndMultiClusteringWithIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, f int, PRIMARY KEY ((a, b), c, d, e))");

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderByTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderByTest.java
index e21074b..fc48928 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderByTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderByTest.java

@@ -21,8 +21,8 @@
 
 import org.apache.cassandra.cql3.CQLTester;
 
-import static org.junit.Assert.assertTrue;
 import static java.util.Arrays.asList;
+import static org.junit.Assert.assertTrue;
 
 public class SelectOrderByTest extends CQLTester
 {
@@ -36,11 +36,7 @@
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 1);
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 2, 2);
 
-            for (boolean flush : new boolean[]{true, false})
-            {
-                if (flush)
-                    flush();
-
+            beforeAndAfterFlush(() -> {
                 assertRows(execute("SELECT * FROM %s WHERE a=? ORDER BY b ASC", 0),
                            row(0, 0, 0),
                            row(0, 1, 1),
@@ -66,7 +62,7 @@
 
                 assertRows(execute("SELECT c FROM %s WHERE a=? ORDER BY b DESC", 0),
                            row(2), row(1), row(0));
-            }
+            });
         }
     }
 
@@ -80,11 +76,7 @@
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 1);
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 2, 2);
 
-            for (boolean flush : new boolean[]{true, false})
-            {
-                if (flush)
-                    flush();
-
+            beforeAndAfterFlush(() -> {
                 // order by the only column in the selection
                 assertRows(execute("SELECT blobAsInt(intAsBlob(b)) FROM %s WHERE a=? ORDER BY b ASC", 0),
                            row(0), row(1), row(2));
@@ -101,7 +93,7 @@
 
                 assertInvalid("SELECT * FROM %s WHERE a=? ORDER BY c ASC", 0);
                 assertInvalid("SELECT * FROM %s WHERE a=? ORDER BY c DESC", 0);
-            }
+            });
         }
     }
 
@@ -117,20 +109,17 @@
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, {a: ?})", 0, 1, 1);
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, {a: ?})", 0, 2, 2);
 
-            for (boolean flush : new boolean[]{true, false})
-            {
-                if (flush)
-                    flush();
+            beforeAndAfterFlush(() -> {
                 // order by a column not in the selection
                 assertRows(execute("SELECT c.a FROM %s WHERE a=? ORDER BY b ASC", 0),
-                           row(0), row(1), row(2));
-
+                        row(0), row(1), row(2));
+    
                 assertRows(execute("SELECT c.a FROM %s WHERE a=? ORDER BY b DESC", 0),
-                           row(2), row(1), row(0));
-
+                        row(2), row(1), row(0));
+    
                 assertRows(execute("SELECT blobAsInt(intAsBlob(c.a)) FROM %s WHERE a=? ORDER BY b DESC", 0),
                            row(2), row(1), row(0));
-            }
+            });
             dropTable("DROP TABLE %s");
         }
     }
@@ -146,11 +135,7 @@
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 4);
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 2, 5);
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT * FROM %s WHERE a=? ORDER BY b ASC", 0),
                        row(0, 0, 0, 0),
                        row(0, 0, 1, 1),
@@ -207,7 +192,7 @@
                        row(0), row(1), row(2), row(3), row(4), row(5));
             assertRows(execute("SELECT d FROM %s WHERE a=? ORDER BY b DESC, c DESC", 0),
                        row(5), row(4), row(3), row(2), row(1), row(0));
-        }
+        });
     }
 
     @Test
@@ -221,11 +206,7 @@
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 1, 4);
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 2, 5);
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertInvalid("SELECT blobAsInt(intAsBlob(b)) FROM %s WHERE a=? ORDER BY c ASC", 0);
             assertInvalid("SELECT blobAsInt(intAsBlob(b)) FROM %s WHERE a=? ORDER BY c DESC", 0);
             assertInvalid("SELECT blobAsInt(intAsBlob(b)) FROM %s WHERE a=? ORDER BY b ASC, c DESC", 0);
@@ -260,7 +241,7 @@
                        row(0), row(1), row(2), row(3), row(4), row(5));
             assertRows(execute("SELECT blobAsInt(intAsBlob(d)) FROM %s WHERE a=? ORDER BY b DESC, c DESC", 0),
                        row(5), row(4), row(3), row(2), row(1), row(0));
-        }
+        });
     }
 
     /**
@@ -275,14 +256,10 @@
         for (int i = 0; i < 10; i++)
             execute("INSERT INTO %s (k, c, v) VALUES (0, ?, ?)", i, i);
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT v FROM %s WHERE k = 0 ORDER BY c DESC"),
                        row(9), row(8), row(7), row(6), row(5), row(4), row(3), row(2), row(1), row(0));
-        }
+        });
 
         createTable("CREATE TABLE %s (k int, c1 int, c2 int, v int, PRIMARY KEY (k, c1, c2)) WITH COMPACT STORAGE");
 
@@ -290,11 +267,7 @@
             for (int j = 0; j < 2; j++)
                 execute("INSERT INTO %s (k, c1, c2, v) VALUES (0, ?, ?, ?)", i, j, i * 2 + j);
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertInvalid("SELECT v FROM %s WHERE k = 0 ORDER BY c DESC");
             assertInvalid("SELECT v FROM %s WHERE k = 0 ORDER BY c2 DESC");
             assertInvalid("SELECT v FROM %s WHERE k = 0 ORDER BY k DESC");
@@ -304,7 +277,7 @@
 
             assertRows(execute("SELECT v FROM %s WHERE k = 0 ORDER BY c1"),
                        row(0), row(1), row(2), row(3), row(4), row(5), row(6), row(7));
-        }
+        });
     }
 
     /**
@@ -321,11 +294,7 @@
         execute("INSERT INTO %s (row, number, string) VALUES ('row', 3, 'three')");
         execute("INSERT INTO %s (row, number, string) VALUES ('row', 4, 'four')");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT number FROM %s WHERE row='row' AND number < 3 ORDER BY number ASC"),
                        row(1), row(2));
 
@@ -343,7 +312,7 @@
 
             assertRows(execute("SELECT number FROM %s WHERE row='row' AND number <= 3 ORDER BY number DESC"),
                        row(3), row(2), row(1));
-        }
+        });
     }
 
     /**
@@ -376,20 +345,22 @@
         execute("INSERT INTO %s (my_id, col1, value) VALUES ( 'key3', 2, 'b')");
         execute("INSERT INTO %s (my_id, col1, value) VALUES ( 'key4', 4, 'd')");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT col1 FROM %s WHERE my_id in('key1', 'key2', 'key3') ORDER BY col1"),
                        row(1), row(2), row(3));
 
+            assertRows(execute("SELECT col1 FROM %s WHERE my_id in('key1', 'key2', 'key3') ORDER BY col1 LIMIT 2"),
+                       row(1), row(2));
+
+            assertRows(execute("SELECT col1 FROM %s WHERE my_id in('key1', 'key2', 'key3') ORDER BY col1 LIMIT 10"),
+                       row(1), row(2), row(3));
+
             assertRows(execute("SELECT col1, my_id FROM %s WHERE my_id in('key1', 'key2', 'key3') ORDER BY col1"),
                        row(1, "key1"), row(2, "key3"), row(3, "key2"));
 
             assertRows(execute("SELECT my_id, col1 FROM %s WHERE my_id in('key1', 'key2', 'key3') ORDER BY col1"),
                        row("key1", 1), row("key3", 2), row("key2", 3));
-        }
+        });
 
         createTable("CREATE TABLE %s (pk1 int, pk2 int, c int, v text, PRIMARY KEY ((pk1, pk2), c) )");
         execute("INSERT INTO %s (pk1, pk2, c, v) VALUES (?, ?, ?, ?)", 1, 1, 2, "A");
@@ -397,31 +368,36 @@
         execute("INSERT INTO %s (pk1, pk2, c, v) VALUES (?, ?, ?, ?)", 1, 3, 3, "C");
         execute("INSERT INTO %s (pk1, pk2, c, v) VALUES (?, ?, ?, ?)", 1, 1, 4, "D");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
+        beforeAndAfterFlush(() -> {
+        assertRows(execute("SELECT v, ttl(v), c FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
+                   row("B", null, 1),
+                   row("A", null, 2),
+                   row("D", null, 4));
 
-            assertRows(execute("SELECT v, ttl(v), c FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
-                       row("B", null, 1),
-                       row("A", null, 2),
-                       row("D", null, 4));
+        assertRows(execute("SELECT v, ttl(v), c as name_1 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
+                   row("B", null, 1),
+                   row("A", null, 2),
+                   row("D", null, 4));
 
-            assertRows(execute("SELECT v, ttl(v), c as name_1 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
-                       row("B", null, 1),
-                       row("A", null, 2),
-                       row("D", null, 4));
+        assertRows(execute("SELECT v FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
+                   row("B"),
+                   row("A"),
+                   row("D"));
 
-            assertRows(execute("SELECT v FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
-                       row("B"),
-                       row("A"),
-                       row("D"));
+        assertRows(execute("SELECT v FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c LIMIT 2; ", 1, 1, 2),
+                   row("B"),
+                   row("A"));
 
-            assertRows(execute("SELECT v as c FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
-                       row("B"),
-                       row("A"),
-                       row("D"));
-        }
+        assertRows(execute("SELECT v FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c LIMIT 10; ", 1, 1, 2),
+                   row("B"),
+                   row("A"),
+                   row("D"));
+
+        assertRows(execute("SELECT v as c FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c; ", 1, 1, 2),
+                   row("B"),
+                   row("A"),
+                   row("D"));
+        });
 
         createTable("CREATE TABLE %s (pk1 int, pk2 int, c1 int, c2 int, v text, PRIMARY KEY ((pk1, pk2), c1, c2) )");
         execute("INSERT INTO %s (pk1, pk2, c1, c2, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 4, 4, "A");
@@ -429,11 +405,7 @@
         execute("INSERT INTO %s (pk1, pk2, c1, c2, v) VALUES (?, ?, ?, ?, ?)", 1, 3, 3, 3, "C");
         execute("INSERT INTO %s (pk1, pk2, c1, c2, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 4, 1, "D");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT v, ttl(v), c1, c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1, c2; ", 1, 1, 2),
                        row("B", null, 1, 2),
                        row("D", null, 4, 1),
@@ -453,7 +425,33 @@
                        row("B"),
                        row("D"),
                        row("A"));
-        }
+
+            assertRows(execute("SELECT v as c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1, c2 LIMIT 2; ", 1, 1, 2),
+                       row("B"),
+                       row("D"));
+
+            assertRows(execute("SELECT v as c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1, c2 LIMIT 10; ", 1, 1, 2),
+                       row("B"),
+                       row("D"),
+                       row("A"));
+
+            assertRows(execute("SELECT v as c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1 DESC , c2 DESC; ", 1, 1, 2),
+                       row("A"),
+                       row("D"),
+                       row("B"));
+
+            assertRows(execute("SELECT v as c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1 DESC , c2 DESC LIMIT 2; ", 1, 1, 2),
+                       row("A"),
+                       row("D"));
+
+            assertRows(execute("SELECT v as c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1 DESC , c2 DESC LIMIT 10; ", 1, 1, 2),
+                       row("A"),
+                       row("D"),
+                       row("B"));
+
+            assertInvalidMessage("LIMIT must be strictly positive",
+                                 "SELECT v as c2 FROM %s where pk1 = ? AND pk2 IN (?, ?) ORDER BY c1 DESC , c2 DESC LIMIT 0; ", 1, 1, 2);
+        });
     }
 
     @Test
@@ -470,11 +468,7 @@
         execute("UPDATE %s SET s = 2 WHERE a = 2");
         execute("UPDATE %s SET s = 3 WHERE a = 3");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT a, b, c, d, s FROM %s WHERE a IN (1, 2, 3) ORDER BY b DESC"),
                        row(2, 2, 2, 1, 2),
                        row(2, 2, 1, 1, 2),
@@ -502,7 +496,7 @@
                        row(1, 1, 2, 1, 1),
                        row(2, 2, 1, 1, 2),
                        row(2, 2, 2, 1, 2));
-        }
+        });
     }
 
     /**
@@ -517,11 +511,7 @@
         for(int i =0; i < 10; i++)
             execute("INSERT INTO %s (k, c, v) VALUES (0, ?, ?)", i, i);
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT c, v FROM %s WHERE k = 0 ORDER BY c ASC"),
                        row(0, 0), row(1, 1), row(2, 2), row(3, 3), row(4, 4),
                        row(5, 5), row(6, 6), row(7, 7), row(8, 8), row(9, 9));
@@ -529,7 +519,7 @@
             assertRows(execute("SELECT c, v FROM %s WHERE k = 0 ORDER BY c DESC"),
                        row(9, 9), row(8, 8), row(7, 7), row(6, 6), row(5, 5),
                        row(4, 4), row(3, 3), row(2, 2), row(1, 1), row(0, 0));
-        }
+        });
 
         createTable("CREATE TABLE %s (k int, c1 int, c2 int, v text, PRIMARY KEY (k, c1, c2)) WITH CLUSTERING ORDER BY (c1 ASC, c2 DESC)");
 
@@ -537,11 +527,7 @@
             for(int j = 0; j < 10; j++)
                 execute("INSERT INTO %s (k, c1, c2, v) VALUES (0, ?, ?, ?)", i, j, String.format("%d%d", i, j));
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertInvalid("SELECT c1, c2, v FROM %s WHERE k = 0 ORDER BY c1 ASC, c2 ASC");
             assertInvalid("SELECT c1, c2, v FROM %s WHERE k = 0 ORDER BY c1 DESC, c2 DESC");
 
@@ -564,7 +550,7 @@
                        expectedRows);
 
             assertInvalid("SELECT c1, c2, v FROM %s WHERE k = 0 ORDER BY c2 DESC, c1 ASC");
-        }
+        });
     }
 
     /**
@@ -575,14 +561,10 @@
     {
         createTable("CREATE TABLE %s (k text, c1 int, c2 int, PRIMARY KEY (k, c1, c2) ) WITH CLUSTERING ORDER BY (c1 ASC, c2 DESC)");
 
-        for (int i = 0; i < 2; i++)
-            for (int j = 0; j < 2; j++)
-                execute("INSERT INTO %s (k, c1, c2) VALUES ('foo', ?, ?)", i, j);
-
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
+        beforeAndAfterFlush(() -> {
+            for (int i = 0; i < 2; i++)
+                for (int j = 0; j < 2; j++)
+                    execute("INSERT INTO %s (k, c1, c2) VALUES ('foo', ?, ?)", i, j);
 
             assertRows(execute("SELECT c1, c2 FROM %s WHERE k = 'foo'"),
                        row(0, 1), row(0, 0), row(1, 1), row(1, 0));
@@ -596,7 +578,7 @@
             assertInvalid("SELECT c1, c2 FROM %s WHERE k = 'foo' ORDER BY c2 DESC");
             assertInvalid("SELECT c1, c2 FROM %s WHERE k = 'foo' ORDER BY c2 ASC");
             assertInvalid("SELECT c1, c2 FROM %s WHERE k = 'foo' ORDER BY c1 ASC, c2 ASC");
-        }
+        });
     }
 
     /**
@@ -610,15 +592,11 @@
         execute("INSERT INTO %s(k, c1, c2) VALUES (0, 0, 1)");
         execute("INSERT INTO %s(k, c1, c2) VALUES (0, 0, 2)");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT * FROM %s WHERE k=0 AND c1 = 0 AND c2 IN (2, 0) ORDER BY c1 DESC"),
                        row(0, 0, 2),
                        row(0, 0, 0));
-        }
+        });
     }
 
     /**
@@ -637,11 +615,7 @@
         execute("INSERT INTO %s (k, c1, c2, v) VALUES (1, 1, 1, 4)");
         execute("INSERT INTO %s (k, c1, c2, v) VALUES (1, 1, 2, 5)");
 
-        for (boolean flush : new boolean[]{true, false})
-        {
-            if (flush)
-                flush();
-
+        beforeAndAfterFlush(() -> {
             assertRows(execute("SELECT * FROM %s WHERE k=0 AND c1 = 0 AND c2 IN (2, 0)"),
                        row(0, 0, 0, 0),
                        row(0, 0, 2, 2));
@@ -681,7 +655,7 @@
 
             // since we don 't know the write times, just assert that the order matches the order we expect
             assertTrue(isFirstIntSorted(results));
-        }
+        });
     }
 
     @Test
@@ -696,11 +670,7 @@
             execute("INSERT INTO %s (col_1, col_2, col_3) VALUES(?, ?, ?)", 1, 2, 10);
             execute("INSERT INTO %s (col_1, col_2, col_3) VALUES(?, ?, ?)", 1, 2, 11);
 
-            for (boolean flush : new boolean[]{true, false})
-            {
-                if (flush)
-                    flush();
-
+            beforeAndAfterFlush(() -> {
                 assertRows(execute("select * from %s where col_1=? and col_2 IN (?, ?) order by col_3;", 1, 1, 2),
                            row(1, 1, 1),
                            row(1, 1, 2),
@@ -728,7 +698,7 @@
                            row(1, 2, 10),
                            row(1, 1, 2),
                            row(1, 1, 1));
-            }
+            });
         }
     }
 

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderedPartitionerTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderedPartitionerTest.java
index 9609906..5e82020 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderedPartitionerTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectOrderedPartitionerTest.java

@@ -39,7 +39,7 @@
     @BeforeClass
     public static void setUp()
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
+        DatabaseDescriptor.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
     }
 
     @Test
@@ -464,7 +464,7 @@
     @Test
     public void testTruncateWithCaching() throws Throwable
     {
-        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int,) WITH CACHING = ALL;");
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int) WITH CACHING = { 'keys': 'ALL', 'rows_per_partition': 'ALL' };");
 
         for (int i = 0; i < 3; i++)
             execute("INSERT INTO %s (k, v1, v2) VALUES (?, ?, ?)", i, i, i * 2);

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java
index f8e5a28..4beb1fb 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java

@@ -62,6 +62,10 @@
                              "SELECT * FROM %s WHERE c = 0 AND b <= ?", set(0));
         assertInvalidMessage("Collection column 'b' (set<int>) cannot be restricted by a 'IN' relation",
                              "SELECT * FROM %s WHERE c = 0 AND b IN (?)", set(0));
+        assertInvalidMessage("Unsupported \"!=\" relation: b != 5",
+                "SELECT * FROM %s WHERE c = 0 AND b != 5");
+        assertInvalidMessage("Unsupported restriction: b IS NOT NULL",
+                "SELECT * FROM %s WHERE c = 0 AND b IS NOT NULL");
     }
 
     @Test
@@ -142,7 +146,7 @@
 
         assertRows(execute("select * from %s where a = ? and c < ? and b in (?, ?)", "first", 7, 3, 2),
                    row("first", 2, 6, 2));
-//---
+
         assertRows(execute("select * from %s where a = ? and c >= ? and c <= ? and b in (?, ?)", "first", 6, 7, 3, 2),
                    row("first", 2, 6, 2),
                    row("first", 3, 7, 3));
@@ -297,6 +301,25 @@
     }
 
     @Test
+    public void testAllowFilteringWithIndexedColumnAndStaticColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, s int static, PRIMARY KEY(a, b))");
+        createIndex("CREATE INDEX ON %s(c)");
+
+        execute("INSERT INTO %s(a, b, c, s) VALUES(?, ?, ?, ?)", 1, 1, 1, 1);
+        execute("INSERT INTO %s(a, b, c) VALUES(?, ?, ?)", 1, 2, 1);
+        execute("INSERT INTO %s(a, s) VALUES(?, ?)", 3, 3);
+        execute("INSERT INTO %s(a, b, c, s) VALUES(?, ?, ?, ?)", 2, 1, 1, 2);
+
+        assertRows(execute("SELECT * FROM %s WHERE c = ? AND s > ? ALLOW FILTERING", 1, 1),
+                   row(2, 1, 2, 1));
+
+        assertRows(execute("SELECT * FROM %s WHERE c = ? AND s < ? ALLOW FILTERING", 1, 2),
+                   row(1, 1, 1, 1),
+                   row(1, 2, 1, 1));
+    }
+
+    @Test
     public void testIndexQueriesOnComplexPrimaryKey() throws Throwable
     {
         createTable("CREATE TABLE %s (pk0 int, pk1 int, ck0 int, ck1 int, ck2 int, value int, PRIMARY KEY ((pk0, pk1), ck0, ck1, ck2))");

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java
index ac1ba4c..469e8ca 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java

@@ -22,18 +22,17 @@
 
 import org.junit.Test;
 
-import junit.framework.Assert;
+import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
-import org.apache.cassandra.cql3.CQLTester;
-
+import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
-import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import junit.framework.Assert;
 
 /**
  * Test column ranges and ordering with static column in table
@@ -373,7 +372,7 @@
     }
 
     @Test
-    public void testSetContains() throws Throwable
+    public void testSetContainsWithIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (account text, id int, categories set<text>, PRIMARY KEY (account, id))");
         createIndex("CREATE INDEX ON %s(categories)");
@@ -406,7 +405,7 @@
     }
 
     @Test
-    public void testListContains() throws Throwable
+    public void testListContainsWithIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (account text, id int, categories list<text>, PRIMARY KEY (account, id))");
         createIndex("CREATE INDEX ON %s(categories)");
@@ -441,7 +440,7 @@
     }
 
     @Test
-    public void testListContainsWithFiltering() throws Throwable
+    public void testListContainsWithIndexAndFiltering() throws Throwable
     {
         createTable("CREATE TABLE %s (e int PRIMARY KEY, f list<text>, s int)");
         createIndex("CREATE INDEX ON %s(f)");
@@ -459,7 +458,7 @@
     }
 
     @Test
-    public void testMapKeyContains() throws Throwable
+    public void testMapKeyContainsWithIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (account text, id int, categories map<text,text>, PRIMARY KEY (account, id))");
         createIndex("CREATE INDEX ON %s(keys(categories))");
@@ -497,7 +496,7 @@
     }
 
     @Test
-    public void testMapValueContains() throws Throwable
+    public void testMapValueContainsWithIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (account text, id int, categories map<text,text>, PRIMARY KEY (account, id))");
         createIndex("CREATE INDEX ON %s(categories)");
@@ -525,11 +524,11 @@
                              "SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ?", "test", 5, unset());
 
         assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
-                             "SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ? AND categories CONTAINS ?"
-                            , "test", 5, "foo", "notPresent");
+                             "SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ? AND categories CONTAINS ?",
+                             "test", 5, "foo", "notPresent");
 
-        assertEmpty(execute("SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ? AND categories CONTAINS ? ALLOW FILTERING"
-                           , "test", 5, "foo", "notPresent"));
+        assertEmpty(execute("SELECT * FROM %s WHERE account = ? AND id = ? AND categories CONTAINS ? AND categories CONTAINS ? ALLOW FILTERING",
+                            "test", 5, "foo", "notPresent"));
     }
 
     // See CASSANDRA-7525
@@ -556,7 +555,7 @@
 
     // See CASSANDRA-8033
     @Test
-    public void testFilterForContains() throws Throwable
+    public void testFilterWithIndexForContains() throws Throwable
     {
         createTable("CREATE TABLE %s (k1 int, k2 int, v set<int>, PRIMARY KEY ((k1, k2)))");
         createIndex("CREATE INDEX ON %s(k2)");
@@ -612,7 +611,7 @@
         execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, map("lmn", "foo"));
         execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 6, map("lmn", "foo2"));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (categories) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "test", "foo");
 
         assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS KEY ?", "test", "lmn"),
@@ -635,7 +634,7 @@
         execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 5, map("lmn", "foo"));
         execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 6, map("lmn2", "foo"));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (categories) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE account = ? AND categories CONTAINS KEY ?", "test", "lmn");
 
         assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "test", "foo"),
@@ -1171,137 +1170,6 @@
     }
 
     /**
-     * Migrated from cql_tests.py:TestCQL.select_distinct_test()
-     */
-    @Test
-    public void testSelectDistinct() throws Throwable
-    {
-        // Test a regular(CQL3) table.
-        createTable("CREATE TABLE %s (pk0 int, pk1 int, ck0 int, val int, PRIMARY KEY((pk0, pk1), ck0))");
-
-        for (int i = 0; i < 3; i++)
-        {
-            execute("INSERT INTO %s (pk0, pk1, ck0, val) VALUES (?, ?, 0, 0)", i, i);
-            execute("INSERT INTO %s (pk0, pk1, ck0, val) VALUES (?, ?, 1, 1)", i, i);
-        }
-
-        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 1"),
-                   row(0, 0));
-
-        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 3"),
-                   row(0, 0),
-                   row(2, 2),
-                   row(1, 1));
-
-        // Test selection validation.
-        assertInvalidMessage("queries must request all the partition key columns", "SELECT DISTINCT pk0 FROM %s");
-        assertInvalidMessage("queries must only request partition key columns", "SELECT DISTINCT pk0, pk1, ck0 FROM %s");
-
-        //Test a 'compact storage' table.
-        createTable("CREATE TABLE %s (pk0 int, pk1 int, val int, PRIMARY KEY((pk0, pk1))) WITH COMPACT STORAGE");
-
-        for (int i = 0; i < 3; i++)
-            execute("INSERT INTO %s (pk0, pk1, val) VALUES (?, ?, ?)", i, i, i);
-
-        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 1"),
-                   row(0, 0));
-
-        assertRows(execute("SELECT DISTINCT pk0, pk1 FROM %s LIMIT 3"),
-                   row(0, 0),
-                   row(2, 2),
-                   row(1, 1));
-
-        // Test a 'wide row' thrift table.
-        createTable("CREATE TABLE %s (pk int, name text, val int, PRIMARY KEY(pk, name)) WITH COMPACT STORAGE");
-
-        for (int i = 0; i < 3; i++)
-        {
-            execute("INSERT INTO %s (pk, name, val) VALUES (?, 'name0', 0)", i);
-            execute("INSERT INTO %s (pk, name, val) VALUES (?, 'name1', 1)", i);
-        }
-
-        assertRows(execute("SELECT DISTINCT pk FROM %s LIMIT 1"),
-                   row(1));
-
-        assertRows(execute("SELECT DISTINCT pk FROM %s LIMIT 3"),
-                   row(1),
-                   row(0),
-                   row(2));
-    }
-
-    /**
-     * Migrated from cql_tests.py:TestCQL.select_distinct_with_deletions_test()
-     */
-    @Test
-    public void testSelectDistinctWithDeletions() throws Throwable
-    {
-        createTable("CREATE TABLE %s (k int PRIMARY KEY, c int, v int)");
-
-        for (int i = 0; i < 10; i++)
-            execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", i, i, i);
-
-        Object[][] rows = getRows(execute("SELECT DISTINCT k FROM %s"));
-        Assert.assertEquals(10, rows.length);
-        Object key_to_delete = rows[3][0];
-
-        execute("DELETE FROM %s WHERE k=?", key_to_delete);
-
-        rows = getRows(execute("SELECT DISTINCT k FROM %s"));
-        Assert.assertEquals(9, rows.length);
-
-        rows = getRows(execute("SELECT DISTINCT k FROM %s LIMIT 5"));
-        Assert.assertEquals(5, rows.length);
-
-        rows = getRows(execute("SELECT DISTINCT k FROM %s"));
-        Assert.assertEquals(9, rows.length);
-    }
-
-    @Test
-    public void testSelectDistinctWithWhereClause() throws Throwable {
-        createTable("CREATE TABLE %s (k int, a int, b int, PRIMARY KEY (k, a))");
-        createIndex("CREATE INDEX ON %s (b)");
-
-        for (int i = 0; i < 10; i++)
-        {
-            execute("INSERT INTO %s (k, a, b) VALUES (?, ?, ?)", i, i, i);
-            execute("INSERT INTO %s (k, a, b) VALUES (?, ?, ?)", i, i * 10, i * 10);
-        }
-
-        String distinctQueryErrorMsg = "SELECT DISTINCT with WHERE clause only supports restriction by partition key.";
-        assertInvalidMessage(distinctQueryErrorMsg,
-                             "SELECT DISTINCT k FROM %s WHERE a >= 80 ALLOW FILTERING");
-
-        assertInvalidMessage(distinctQueryErrorMsg,
-                             "SELECT DISTINCT k FROM %s WHERE k IN (1, 2, 3) AND a = 10");
-
-        assertInvalidMessage(distinctQueryErrorMsg,
-                             "SELECT DISTINCT k FROM %s WHERE b = 5");
-
-        assertRows(execute("SELECT DISTINCT k FROM %s WHERE k = 1"),
-                   row(1));
-        assertRows(execute("SELECT DISTINCT k FROM %s WHERE k IN (5, 6, 7)"),
-                   row(5),
-                   row(6),
-                   row(7));
-
-        // With static columns
-        createTable("CREATE TABLE %s (k int, a int, s int static, b int, PRIMARY KEY (k, a))");
-        createIndex("CREATE INDEX ON %s (b)");
-        for (int i = 0; i < 10; i++)
-        {
-            execute("INSERT INTO %s (k, a, b, s) VALUES (?, ?, ?, ?)", i, i, i, i);
-            execute("INSERT INTO %s (k, a, b, s) VALUES (?, ?, ?, ?)", i, i * 10, i * 10, i * 10);
-        }
-
-        assertRows(execute("SELECT DISTINCT s FROM %s WHERE k = 5"),
-                   row(50));
-        assertRows(execute("SELECT DISTINCT s FROM %s WHERE k IN (5, 6, 7)"),
-                   row(50),
-                   row(60),
-                   row(70));
-    }
-
-    /**
      * Migrated from cql_tests.py:TestCQL.bug_6327_test()
      */
     @Test
@@ -1387,95 +1255,144 @@
     }
 
     @Test
-    public void testFilteringWithoutIndices() throws Throwable
+    public void testFilteringOnStaticColumnsWithRowsWithOnlyStaticValues() throws Throwable
     {
-        createTable("CREATE TABLE %s (a int, b int, c int, d int, e map<int, int>, PRIMARY KEY (a, b))");
+        createTable("CREATE TABLE %s (a int, b int, s int static, c int, d int, primary key (a, b))");
 
-        // Checks filtering
-        assertInvalidMessage("Predicates on non-primary-key columns (c, d) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c = 1 AND d = 2 ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE a = 1 AND b = 1 AND c = 2 ALLOW FILTERING");
-        assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
-                             "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (2, 3) ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > 2 ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS 1 ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS KEY 1 ALLOW FILTERING");
+        for (int i = 0; i < 5; i++)
+        {
+            execute("INSERT INTO %s (a, s) VALUES (?, ?)", i, i);
+            if (i != 2)
+                for (int j = 0; j < 4; j++)
+                    execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", i, j, j, i + j);
+        }
 
-        // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS KEY null ALLOW FILTERING");
-
-        // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c = ? ALLOW FILTERING", unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > ? ALLOW FILTERING", unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS ? ALLOW FILTERING", unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS KEY ? ALLOW FILTERING", unset());
-
-        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY(a)) WITH COMPACT STORAGE");
-
-        execute("INSERT INTO %s (a, b, c) VALUES (1, 2, 4)");
-        execute("INSERT INTO %s (a, b, c) VALUES (2, 2, 8)");
-        execute("INSERT INTO %s (a, b, c) VALUES (3, 6, 4)");
-
-        assertRows(execute("SELECT * FROM %s WHERE c = 4 ALLOW FILTERING"),
-                   row(1, 2, 4),
-                   row(3, 6, 4));
-
-        // Checks filtering with null
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE, "SELECT * FROM %s WHERE c = null");
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE, "SELECT * FROM %s WHERE c > null");
-        assertInvalidMessage("Unsupported null value for column c", "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Unsupported null value for column c", "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
-
-        // Checks filtering with unset
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
-                             "SELECT * FROM %s WHERE c = ?", unset());
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
-                             "SELECT * FROM %s WHERE c > ?", unset());
-        assertInvalidMessage("Unsupported unset value for column c",
-                             "SELECT * FROM %s WHERE c = ? ALLOW FILTERING", unset());
-        assertInvalidMessage("Unsupported unset value for column c",
-                             "SELECT * FROM %s WHERE c > ? ALLOW FILTERING", unset());
+        assertRows(execute("SELECT * FROM %s WHERE c = 2 AND s >= 1 LIMIT 2 ALLOW FILTERING"),
+                   row(1, 2, 1, 2, 3),
+                   row(4, 2, 4, 2, 6));
     }
 
     @Test
-    public void testFilteringOnStaticColumnWithoutIndices() throws Throwable
+    public void testFilteringWithoutIndices() throws Throwable
     {
-        createTable("CREATE TABLE %s (a int, b int, s int static, c int, PRIMARY KEY (a, b))");
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, s int static, PRIMARY KEY (a, b))");
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES (1, 2, 4, 8)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (1, 3, 6, 12)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (1, 4, 4, 8)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (2, 3, 7, 12)");
+        execute("UPDATE %s SET s = 1 WHERE a = 1");
+        execute("UPDATE %s SET s = 2 WHERE a = 2");
+        execute("UPDATE %s SET s = 3 WHERE a = 3");
+
+        // Adds tomstones
+        execute("INSERT INTO %s (a, b, c, d) VALUES (1, 1, 4, 8)");
+        execute("INSERT INTO %s (a, b, c, d) VALUES (2, 2, 7, 12)");
+        execute("DELETE FROM %s WHERE a = 1 AND b = 1");
+        execute("DELETE FROM %s WHERE a = 2 AND b = 2");
+
+        flush();
 
         // Checks filtering
-        assertInvalidMessage("Predicates on non-primary-key columns (c, s) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c = 1 AND s = 2 ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (s) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE a = 1 AND b = 1 AND s = 2 ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (s) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE s > 2 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c = 4 AND d = 8");
+
+        assertRows(execute("SELECT * FROM %s WHERE c = 4 AND d = 8 ALLOW FILTERING"),
+                   row(1, 2, 1, 4, 8),
+                   row(1, 4, 1, 4, 8));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND d = 8");
+
+        assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND d = 8 ALLOW FILTERING"),
+                   row(1, 4, 1, 4, 8));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE s = 1 AND d = 12");
+
+        assertRows(execute("SELECT * FROM %s WHERE s = 1 AND d = 12 ALLOW FILTERING"),
+                   row(1, 3, 1, 6, 12));
+
+        assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
+                             "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7)");
+
+        assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
+                             "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING");
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE c > 4 ALLOW FILTERING"),
+                   row(1, 3, 1, 6, 12),
+                   row(2, 3, 2, 7, 12));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                "SELECT * FROM %s WHERE s > 1");
+
+        assertRows(execute("SELECT * FROM %s WHERE s > 1 ALLOW FILTERING"),
+                   row(2, 3, 2, 7, 12),
+                   row(3, null, 3, null, null));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE b < 3 AND c <= 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE b < 3 AND c <= 4 ALLOW FILTERING"),
+                   row(1, 2, 1, 4, 8));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c >= 3 AND c <= 6");
+
+        assertRows(execute("SELECT * FROM %s WHERE c >= 3 AND c <= 6 ALLOW FILTERING"),
+                   row(1, 2, 1, 4, 8),
+                   row(1, 3, 1, 6, 12),
+                   row(1, 4, 1, 4, 8));
+
+        assertRows(execute("SELECT * FROM %s WHERE s >= 1 LIMIT 2 ALLOW FILTERING"),
+                   row(1, 2, 1, 4, 8),
+                   row(1, 3, 1, 6, 12));
 
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (s) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE s = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (s) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c = null");
+        assertInvalidMessage("Unsupported null value for column c",
+                             "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > null");
+        assertInvalidMessage("Unsupported null value for column c",
+                             "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE s > null");
+        assertInvalidMessage("Unsupported null value for column s",
                              "SELECT * FROM %s WHERE s > null ALLOW FILTERING");
 
         // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (s) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE s = ? ALLOW FILTERING", unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (s) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE s > ? ALLOW FILTERING", unset());
+        assertInvalidMessage("Unsupported unset value for column c",
+                             "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
+                             unset());
+        assertInvalidMessage("Unsupported unset value for column s",
+                             "SELECT * FROM %s WHERE s = ? ALLOW FILTERING",
+                             unset());
+        assertInvalidMessage("Unsupported unset value for column c",
+                             "SELECT * FROM %s WHERE c > ? ALLOW FILTERING",
+                             unset());
+    }
+
+    @Test
+    public void testIndexQueryWithCompositePartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (p1 int, p2 int, v int, PRIMARY KEY ((p1, p2)))");
+        assertInvalidMessage("Partition key parts: p2 must be restricted as other parts are",
+                             "SELECT * FROM %s WHERE p1 = 1 AND v = 3 ALLOW FILTERING");
+
+        createIndex("CREATE INDEX ON %s(v)");
+
+        execute("INSERT INTO %s(p1, p2, v) values (?, ?, ?)", 1, 1, 3);
+        execute("INSERT INTO %s(p1, p2, v) values (?, ?, ?)", 1, 2, 3);
+        execute("INSERT INTO %s(p1, p2, v) values (?, ?, ?)", 2, 1, 3);
+
+        assertRows(execute("SELECT * FROM %s WHERE p1 = 1 AND v = 3 ALLOW FILTERING"),
+                   row(1, 2, 3),
+                   row(1, 1, 3));
     }
 
     @Test
@@ -1491,7 +1408,7 @@
         execute("INSERT INTO %s (a, b, c) VALUES (1, 4, 4)");
         execute("INSERT INTO %s (a, b, c) VALUES (2, 3, 7)");
 
-        // Lets add some tombstones to make sure that filtering handle them properly
+        // Adds tomstones
         execute("INSERT INTO %s (a, b, c) VALUES (1, 1, 4)");
         execute("INSERT INTO %s (a, b, c) VALUES (2, 2, 7)");
         execute("DELETE FROM %s WHERE a = 1 AND b = 1");
@@ -1500,30 +1417,54 @@
         flush();
 
         // Checks filtering
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = 4 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = 4 ALLOW FILTERING"),
+                   row(1, 4, 4));
+
+        assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
+                             "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7)");
 
         assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
                              "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > 4 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE c > 4 ALLOW FILTERING"),
+                   row(1, 3, 6),
+                   row(2, 3, 7));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE b < 3 AND c <= 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE b < 3 AND c <= 4 ALLOW FILTERING"),
+                   row(1, 2, 4));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c >= 3 AND c <= 6");
+
+        assertRows(execute("SELECT * FROM %s WHERE c >= 3 AND c <= 6 ALLOW FILTERING"),
+                   row(1, 2, 4),
+                   row(1, 3, 6),
+                   row(1, 4, 4));
 
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c = null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c > null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
 
-        // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        // // Checks filtering with unset
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c > ? ALLOW FILTERING",
                              unset());
 
@@ -1537,7 +1478,7 @@
         execute("INSERT INTO %s (a, b, c) VALUES (3, 2, 4)");
         execute("INSERT INTO %s (a, b, c) VALUES (4, 1, 7)");
 
-        // Lets add some tombstones to make sure that filtering handle them properly
+        // Adds tomstones
         execute("INSERT INTO %s (a, b, c) VALUES (0, 1, 4)");
         execute("INSERT INTO %s (a, b, c) VALUES (5, 2, 7)");
         execute("DELETE FROM %s WHERE a = 0");
@@ -1590,7 +1531,7 @@
         assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
 
-        // Checks filtering with unset
+        // // Checks filtering with unset
         assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
                              unset());
@@ -1612,51 +1553,79 @@
         flush();
 
         // Checks filtering for lists
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c CONTAINS 2");
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND c CONTAINS 3 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
         // Checks filtering for sets
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE d CONTAINS 4 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE d CONTAINS 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE d CONTAINS 4 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE d CONTAINS 4 AND d CONTAINS 6 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
         // Checks filtering for maps
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS 2 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE e CONTAINS 2");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS KEY 2 ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE e CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE e CONTAINS KEY 1 ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE e[1] = 6 ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)));
+
+        assertRows(execute("SELECT * FROM %s WHERE e CONTAINS KEY 1 AND e CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND d CONTAINS 4 AND e CONTAINS KEY 3 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column d",
                              "SELECT * FROM %s WHERE d CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS KEY null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null map key for column e",
                              "SELECT * FROM %s WHERE e[null] = 2 ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null map value for column e",
                              "SELECT * FROM %s WHERE e[1] = null ALLOW FILTERING");
 
         // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column d",
                              "SELECT * FROM %s WHERE d CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS KEY ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset map key for column e",
                              "SELECT * FROM %s WHERE e[?] = 2 ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset map value for column e",
                              "SELECT * FROM %s WHERE e[1] = ? ALLOW FILTERING",
                              unset());
     }
@@ -1674,52 +1643,112 @@
         flush();
 
         // Checks filtering for lists
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c = [3, 2] ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c = [3, 2]");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > [1, 5] AND c < [3, 6] ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE c = [3, 2] ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > [1, 5] AND c < [3, 6]");
+
+        assertRows(execute("SELECT * FROM %s WHERE c > [1, 5] AND c < [3, 6] ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c >= [1, 6] AND c < [3, 3] ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                "SELECT * FROM %s WHERE c CONTAINS 2");
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND c CONTAINS 3 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
         // Checks filtering for sets
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE d = {6, 4} ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE d = {6, 4}");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE d > {4, 5} AND d < {6} ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE d = {6, 4} ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE d CONTAINS 4 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE d > {4, 5} AND d < {6}");
+
+        assertRows(execute("SELECT * FROM %s WHERE d > {4, 5} AND d < {6} ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE d >= {2, 12} AND d <= {4, 6} ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE d CONTAINS 4");
+
+        assertRows(execute("SELECT * FROM %s WHERE d CONTAINS 4 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE d CONTAINS 4 AND d CONTAINS 6 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
 
         // Checks filtering for maps
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e = {1 : 2} ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE e = {1 : 2}");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                "SELECT * FROM %s WHERE e > {1 : 4} AND e < {3 : 6} ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE e = {1 : 2} ALLOW FILTERING"),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE e CONTAINS 2 ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                "SELECT * FROM %s WHERE e > {1 : 4} AND e < {3 : 6}");
+
+        assertRows(execute("SELECT * FROM %s WHERE e > {1 : 4} AND e < {3 : 6} ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE e >= {1 : 6} AND e <= {3 : 2} ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE e CONTAINS 2");
+
+        assertRows(execute("SELECT * FROM %s WHERE e CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE e CONTAINS KEY 1 ALLOW FILTERING"),
+                   row(1, 2, list(1, 6), set(2, 12), map(1, 6)),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
 
         assertInvalidMessage("Map-entry equality predicates on frozen map column e are not supported",
                              "SELECT * FROM %s WHERE e[1] = 6 ALLOW FILTERING");
 
+        assertRows(execute("SELECT * FROM %s WHERE e CONTAINS KEY 1 AND e CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 4, list(1, 2), set(2, 4), map(1, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND d CONTAINS 4 AND e CONTAINS KEY 3 ALLOW FILTERING"),
+                   row(1, 3, list(3, 2), set(6, 4), map(3, 2)));
+
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column d",
                              "SELECT * FROM %s WHERE d = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column d",
                              "SELECT * FROM %s WHERE d CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column e",
                              "SELECT * FROM %s WHERE e = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS KEY null ALLOW FILTERING");
         assertInvalidMessage("Map-entry equality predicates on frozen map column e are not supported",
                              "SELECT * FROM %s WHERE e[null] = 2 ALLOW FILTERING");
@@ -1727,25 +1756,25 @@
                              "SELECT * FROM %s WHERE e[1] = null ALLOW FILTERING");
 
         // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column d",
                              "SELECT * FROM %s WHERE d = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (d) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column d",
                              "SELECT * FROM %s WHERE d CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column e",
                              "SELECT * FROM %s WHERE e = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (e) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column e",
                              "SELECT * FROM %s WHERE e CONTAINS KEY ? ALLOW FILTERING",
                              unset());
         assertInvalidMessage("Map-entry equality predicates on frozen map column e are not supported",
@@ -1772,43 +1801,67 @@
         flush();
 
         // Checks filtering
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = [4, 1] ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = [4, 1]");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > [4, 2] ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = [4, 1] ALLOW FILTERING"),
+                   row(1, 4, list(4, 1)));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE b <= 3 AND c < [6, 2] ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > [4, 2]");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE c > [4, 2] ALLOW FILTERING"),
+                   row(1, 3, list(6, 2)),
+                   row(2, 3, list(7, 1)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE b <= 3 AND c < [6, 2]");
+
+        assertRows(execute("SELECT * FROM %s WHERE b <= 3 AND c < [6, 2] ALLOW FILTERING"),
+                   row(1, 2, list(4, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c >= [4, 2] AND c <= [6, 4]");
+
+        assertRows(execute("SELECT * FROM %s WHERE c >= [4, 2] AND c <= [6, 4] ALLOW FILTERING"),
+                   row(1, 2, list(4, 2)),
+                   row(1, 3, list(6, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c CONTAINS 2");
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 2, list(4, 2)),
+                   row(1, 3, list(6, 2)));
 
         assertInvalidMessage("Cannot use CONTAINS KEY on non-map column c",
                              "SELECT * FROM %s WHERE c CONTAINS KEY 2 ALLOW FILTERING");
 
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND c CONTAINS 6 ALLOW FILTERING"),
+                   row(1, 3, list(6, 2)));
+
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c = null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c > null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c CONTAINS null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS null ALLOW FILTERING");
 
         // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c > ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS ? ALLOW FILTERING",
                              unset());
 
@@ -1906,43 +1959,67 @@
         flush();
 
         // Checks filtering
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = {4, 1} ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = {4, 1}");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > {4, 2} ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = {4, 1} ALLOW FILTERING"),
+                   row(1, 4, set(4, 1)));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c >= {4, 2} AND c <= {6, 4} ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > {4, 2}");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE c > {4, 2} ALLOW FILTERING"),
+                   row(1, 3, set(6, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE b <= 3 AND c < {6, 2}");
+
+        assertRows(execute("SELECT * FROM %s WHERE b <= 3 AND c < {6, 2} ALLOW FILTERING"),
+                   row(1, 2, set(2, 4)),
+                   row(2, 3, set(1, 7)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c >= {4, 2} AND c <= {6, 4}");
+
+        assertRows(execute("SELECT * FROM %s WHERE c >= {4, 2} AND c <= {6, 4} ALLOW FILTERING"),
+                   row(1, 2, set(4, 2)),
+                   row(1, 3, set(6, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c CONTAINS 2");
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 2, set(4, 2)),
+                   row(1, 3, set(6, 2)));
 
         assertInvalidMessage("Cannot use CONTAINS KEY on non-map column c",
                              "SELECT * FROM %s WHERE c CONTAINS KEY 2 ALLOW FILTERING");
 
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND c CONTAINS 6 ALLOW FILTERING"),
+                   row(1, 3, set(6, 2)));
+
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c = null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c > null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c CONTAINS null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS null ALLOW FILTERING");
 
         // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c > ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS ? ALLOW FILTERING",
                              unset());
 
@@ -2035,6 +2112,26 @@
 
         assertInvalidMessage("Index expression values may not be larger than 64K",
                              "SELECT * FROM %s WHERE c = ?  ALLOW FILTERING", TOO_BIG);
+
+        dropIndex("DROP INDEX %s.test");
+        assertEmpty(execute("SELECT * FROM %s WHERE c = ?  ALLOW FILTERING", TOO_BIG));
+    }
+
+    @Test
+    public void testPKQueryWithValueOver64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY (a, b))");
+
+        assertInvalidThrow(InvalidRequestException.class,
+                           "SELECT * FROM %s WHERE a = ?", new String(TOO_BIG.array()));
+    }
+
+    @Test
+    public void testCKQueryWithValueOver64K() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY (a, b))");
+
+        execute("SELECT * FROM %s WHERE a = 'foo' AND b = ?", new String(TOO_BIG.array()));
     }
 
     @Test
@@ -2053,45 +2150,72 @@
         flush();
 
         // Checks filtering
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = {4 : 1} ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = {4 : 1}");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c > {4 : 2} ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = {4 : 1} ALLOW FILTERING"),
+                   row(1, 4, map(4, 1)));
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE b <= 3 AND c < {6 : 2} ALLOW FILTERING");
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c > {4 : 2}");
 
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
-                             "SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE c > {4 : 2} ALLOW FILTERING"),
+                   row(1, 3, map(6, 2)),
+                   row(2, 3, map(7, 1)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE b <= 3 AND c < {6 : 2}");
+
+        assertRows(execute("SELECT * FROM %s WHERE b <= 3 AND c < {6 : 2} ALLOW FILTERING"),
+                   row(1, 2, map(4, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c >= {4 : 2} AND c <= {6 : 4}");
+
+        assertRows(execute("SELECT * FROM %s WHERE c >= {4 : 2} AND c <= {6 : 4} ALLOW FILTERING"),
+                   row(1, 2, map(4, 2)),
+                   row(1, 3, map(6, 2)));
+
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                             "SELECT * FROM %s WHERE c CONTAINS 2");
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 ALLOW FILTERING"),
+                   row(1, 2, map(4, 2)),
+                   row(1, 3, map(6, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS KEY 6 ALLOW FILTERING"),
+                   row(1, 3, map(6, 2)));
+
+        assertRows(execute("SELECT * FROM %s WHERE c CONTAINS 2 AND c CONTAINS KEY 6 ALLOW FILTERING"),
+                   row(1, 3, map(6, 2)));
 
         // Checks filtering with null
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c = null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c = null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c > null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c > null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                              "SELECT * FROM %s WHERE c CONTAINS null");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS null ALLOW FILTERING");
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported null value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS KEY null ALLOW FILTERING");
 
         // Checks filtering with unset
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c = ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c > ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS ? ALLOW FILTERING",
                              unset());
-        assertInvalidMessage("Predicates on non-primary-key columns (c) are not yet supported for non secondary index queries",
+        assertInvalidMessage("Unsupported unset value for column c",
                              "SELECT * FROM %s WHERE c CONTAINS KEY ? ALLOW FILTERING",
                              unset());
 
@@ -2255,34 +2379,6 @@
     }
 
     @Test
-    public void testOverlyLargeSelectPK() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY ((a), b))");
-
-        assertInvalidThrow(InvalidRequestException.class,
-                           "SELECT * FROM %s WHERE a = ?", new String(TOO_BIG.array()));
-    }
-
-    @Test
-    public void testOverlyLargeSelectCK() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a text, b text, PRIMARY KEY ((a), b))");
-
-        assertInvalidThrow(InvalidRequestException.class,
-                           "SELECT * FROM %s WHERE a = 'foo' AND b = ?", new String(TOO_BIG.array()));
-    }
-
-    @Test
-    public void testOverlyLargeSelectKeyIn() throws Throwable
-    {
-        createTable("CREATE TABLE %s (a text, b text, c text, d text, PRIMARY KEY ((a, b, c), d))");
-
-        assertInvalidThrow(InvalidRequestException.class,
-                           "SELECT * FROM %s WHERE a = 'foo' AND b= 'bar' AND c IN (?, ?)",
-                           new String(TOO_BIG.array()), new String(TOO_BIG.array()));
-    }
-
-    @Test
     public void testFilteringWithSecondaryIndex() throws Throwable
     {
         createTable("CREATE TABLE %s (pk int, " +
@@ -2326,24 +2422,6 @@
     }
 
     @Test
-    public void testIndexQueryWithCompositePartitionKey() throws Throwable
-    {
-        createTable("CREATE TABLE %s (p1 int, p2 int, v int, PRIMARY KEY ((p1, p2)))");
-        assertInvalidMessage("Partition key parts: p2 must be restricted as other parts are",
-                             "SELECT * FROM %s WHERE p1 = 1 AND v = 3 ALLOW FILTERING");
-
-        createIndex("CREATE INDEX ON %s(v)");
-
-        execute("INSERT INTO %s(p1, p2, v) values (?, ?, ?)", 1, 1, 3);
-        execute("INSERT INTO %s(p1, p2, v) values (?, ?, ?)", 1, 2, 3);
-        execute("INSERT INTO %s(p1, p2, v) values (?, ?, ?)", 2, 1, 3);
-
-        assertRows(execute("SELECT * FROM %s WHERE p1 = 1 AND v = 3 ALLOW FILTERING"),
-                   row(1, 2, 3),
-                   row(1, 1, 3));
-    }
-
-    @Test
     public void testEmptyRestrictionValue() throws Throwable
     {
         for (String options : new String[] { "", " WITH COMPACT STORAGE" })
@@ -2354,10 +2432,7 @@
             execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                     bytes("foo123"), bytes("2"), bytes("2"));
 
-            for (boolean flush : new boolean[]{false, true})
-            {
-                if (flush)
-                    flush();
+            beforeAndAfterFlush(() -> {
 
                 assertInvalidMessage("Key may not be empty", "SELECT * FROM %s WHERE pk = textAsBlob('');");
                 assertInvalidMessage("Key may not be empty", "SELECT * FROM %s WHERE pk IN (textAsBlob(''), textAsBlob('1'));");
@@ -2402,11 +2477,11 @@
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND (c) < (textAsBlob(''));"));
 
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c > textAsBlob('') AND c < textAsBlob('');"));
-            }
+            });
 
             if (options.contains("COMPACT"))
             {
-                assertInvalidMessage("Missing PRIMARY KEY part c",
+                assertInvalidMessage("Invalid empty or null value for column c",
                                      "INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                                      bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("4"));
             }
@@ -2415,11 +2490,7 @@
                 execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                         bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("4"));
 
-                for (boolean flush : new boolean[]{false, true})
-                {
-                    if (flush)
-                        flush();
-
+                beforeAndAfterFlush(() -> {
                     assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c = textAsBlob('');"),
                                row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("4")));
 
@@ -2463,12 +2534,19 @@
                     assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND (c) < (textAsBlob(''));"));
 
                     assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c >= textAsBlob('') AND c < textAsBlob('');"));
-                }
+                });
             }
 
             // Test restrictions on non-primary key value
-            assertInvalidMessage("Predicates on non-primary-key columns (v) are not yet supported for non secondary index queries",
-                                 "SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND v = textAsBlob('') ALLOW FILTERING;");
+            assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND v = textAsBlob('') ALLOW FILTERING;"));
+
+            execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
+                    bytes("foo123"), bytes("3"), EMPTY_BYTE_BUFFER);
+
+            beforeAndAfterFlush(() -> {
+                assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND v = textAsBlob('') ALLOW FILTERING;"),
+                           row(bytes("foo123"), bytes("3"), EMPTY_BYTE_BUFFER));
+            });
         }
     }
 
@@ -2481,10 +2559,7 @@
             execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"), bytes("1"));
             execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("2"), bytes("2"));
 
-            for (boolean flush : new boolean[]{false, true})
-            {
-                if (flush)
-                    flush();
+            beforeAndAfterFlush(() -> {
 
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c1 = textAsBlob('');"));
 
@@ -2520,16 +2595,12 @@
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c1 = textAsBlob('1') AND c2 <= textAsBlob('');"));
 
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND (c1, c2) <= (textAsBlob('1'), textAsBlob(''));"));
-            }
+            });
 
             execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)",
                     bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("4"));
 
-            for (boolean flush : new boolean[]{false, true})
-            {
-                if (flush)
-                    flush();
-
+            beforeAndAfterFlush(() -> {
                 assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c1 = textAsBlob('');"),
                            row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("4")));
 
@@ -2560,7 +2631,7 @@
                            row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("4")));
 
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND (c1, c2) < (textAsBlob(''), textAsBlob('1'));"));
-            }
+            });
         }
     }
 
@@ -2584,10 +2655,7 @@
                     bytes("2"),
                     bytes("2"));
 
-            for (boolean flush : new boolean[]{false, true})
-            {
-                if (flush)
-                    flush();
+            beforeAndAfterFlush(() -> {
 
                 assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c > textAsBlob('')" + orderingClause),
                            row(bytes("foo123"), bytes("2"), bytes("2")),
@@ -2600,11 +2668,12 @@
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c < textAsBlob('')" + orderingClause));
 
                 assertEmpty(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c <= textAsBlob('')" + orderingClause));
-            }
+
+            });
 
             if (options.contains("COMPACT"))
             {
-                assertInvalidMessage("Missing PRIMARY KEY part c",
+                assertInvalidMessage("Invalid empty or null value for column c",
                                      "INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                                      bytes("foo123"),
                                      EMPTY_BYTE_BUFFER,
@@ -2615,10 +2684,7 @@
                 execute("INSERT INTO %s (pk, c, v) VALUES (?, ?, ?)",
                         bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("4"));
 
-                for (boolean flush : new boolean[]{false, true})
-                {
-                    if (flush)
-                        flush();
+                beforeAndAfterFlush(() -> {
 
                     assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c IN (textAsBlob(''), textAsBlob('1'))" + orderingClause),
                                row(bytes("foo123"), bytes("1"), bytes("1")),
@@ -2637,7 +2703,7 @@
 
                     assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c <= textAsBlob('')" + orderingClause),
                                row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("4")));
-                }
+                });
             }
         }
     }
@@ -2656,10 +2722,7 @@
             execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("1"), bytes("1"));
             execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)", bytes("foo123"), bytes("1"), bytes("2"), bytes("2"));
 
-            for (boolean flush : new boolean[]{false, true})
-            {
-                if (flush)
-                    flush();
+            beforeAndAfterFlush(() -> {
 
                 assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c1 > textAsBlob('')" + orderingClause),
                            row(bytes("foo123"), bytes("1"), bytes("2"), bytes("2")),
@@ -2676,15 +2739,12 @@
                 assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c1 = textAsBlob('1') AND c2 >= textAsBlob('')" + orderingClause),
                            row(bytes("foo123"), bytes("1"), bytes("2"), bytes("2")),
                            row(bytes("foo123"), bytes("1"), bytes("1"), bytes("1")));
-            }
+            });
 
             execute("INSERT INTO %s (pk, c1, c2, v) VALUES (?, ?, ?, ?)",
                     bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("4"));
 
-            for (boolean flush : new boolean[]{false, true})
-            {
-                if (flush)
-                    flush();
+            beforeAndAfterFlush(() -> {
 
                 assertRows(execute("SELECT * FROM %s WHERE pk = textAsBlob('foo123') AND c1 IN (textAsBlob(''), textAsBlob('1')) AND c2 = textAsBlob('1')" + orderingClause),
                            row(bytes("foo123"), bytes("1"), bytes("1"), bytes("1")),
@@ -2702,7 +2762,186 @@
                            row(bytes("foo123"), bytes("1"), bytes("2"), bytes("2")),
                            row(bytes("foo123"), bytes("1"), bytes("1"), bytes("1")),
                            row(bytes("foo123"), EMPTY_BYTE_BUFFER, bytes("1"), bytes("4")));
-            }
+            });
         }
     }
+
+    @Test
+    public void testFilteringOnCollectionsWithNull() throws Throwable
+    {
+        createTable(" CREATE TABLE %s ( k int, v int, l list<int>, s set<text>, m map<text, int>, PRIMARY KEY (k, v))");
+
+        createIndex("CREATE INDEX ON %s (v)");
+        createIndex("CREATE INDEX ON %s (s)");
+        createIndex("CREATE INDEX ON %s (m)");
+
+
+        execute("INSERT INTO %s (k, v, l, s, m) VALUES (0, 0, [1, 2],    {'a'},      {'a' : 1})");
+        execute("INSERT INTO %s (k, v, l, s, m) VALUES (0, 1, [3, 4],    {'b', 'c'}, {'a' : 1, 'b' : 2})");
+        execute("INSERT INTO %s (k, v, l, s, m) VALUES (0, 2, [1],       {'a', 'c'}, {'c' : 3})");
+        execute("INSERT INTO %s (k, v, l, s, m) VALUES (1, 0, [1, 2, 4], {},         {'b' : 1})");
+        execute("INSERT INTO %s (k, v, l, s, m) VALUES (1, 1, [4, 5],    {'d'},      {'a' : 1, 'b' : 3})");
+        execute("INSERT INTO %s (k, v, l, s, m) VALUES (1, 2, null,      null,       null)");
+
+        beforeAndAfterFlush(() -> {
+            // lists
+            assertRows(execute("SELECT k, v FROM %s WHERE l CONTAINS 1 ALLOW FILTERING"), row(1, 0), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE k = 0 AND l CONTAINS 1 ALLOW FILTERING"), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE l CONTAINS 2 ALLOW FILTERING"), row(1, 0), row(0, 0));
+            assertEmpty(execute("SELECT k, v FROM %s WHERE l CONTAINS 6 ALLOW FILTERING"));
+
+            // sets
+            assertRows(execute("SELECT k, v FROM %s WHERE s CONTAINS 'a' ALLOW FILTERING" ), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE k = 0 AND s CONTAINS 'a' ALLOW FILTERING"), row(0, 0), row(0, 2));
+            assertRows(execute("SELECT k, v FROM %s WHERE s CONTAINS 'd' ALLOW FILTERING"), row(1, 1));
+            assertEmpty(execute("SELECT k, v FROM %s  WHERE s CONTAINS 'e' ALLOW FILTERING"));
+
+            // maps
+            assertRows(execute("SELECT k, v FROM %s WHERE m CONTAINS 1 ALLOW FILTERING"), row(1, 0), row(1, 1), row(0, 0), row(0, 1));
+            assertRows(execute("SELECT k, v FROM %s WHERE k = 0 AND m CONTAINS 1 ALLOW FILTERING"), row(0, 0), row(0, 1));
+            assertRows(execute("SELECT k, v FROM %s WHERE m CONTAINS 2 ALLOW FILTERING"), row(0, 1));
+            assertEmpty(execute("SELECT k, v FROM %s  WHERE m CONTAINS 4 ALLOW FILTERING"));
+
+            assertRows(execute("SELECT k, v FROM %s WHERE m CONTAINS KEY 'a' ALLOW FILTERING"), row(1, 1), row(0, 0), row(0, 1));
+            assertRows(execute("SELECT k, v FROM %s WHERE k = 0 AND m CONTAINS KEY 'a' ALLOW FILTERING"), row(0, 0), row(0, 1));
+            assertRows(execute("SELECT k, v FROM %s WHERE k = 0 AND m CONTAINS KEY 'c' ALLOW FILTERING"), row(0, 2));
+        });
+    }
+
+    @Test
+    public void testMixedTTLOnColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, i int)");
+        execute("INSERT INTO %s (k) VALUES (2);");
+        execute("INSERT INTO %s (k, i) VALUES (1, 1) USING TTL 100;");
+        execute("INSERT INTO %s (k, i) VALUES (3, 3) USING TTL 100;");
+        assertRows(execute("SELECT k, i FROM %s "),
+                   row(1, 1),
+                   row(2, null),
+                   row(3, 3));
+
+        UntypedResultSet rs = execute("SELECT k, i, ttl(i) AS name_ttl FROM %s");
+        assertEquals("name_ttl", rs.metadata().get(2).name.toString());
+        int i = 0;
+        for (UntypedResultSet.Row row : rs)
+        {
+            if ( i % 2 == 0) // Every odd row has a null i/ttl
+                assertTrue(row.getInt("name_ttl") >= 90 && row.getInt("name_ttl") <= 100);
+            else
+                assertTrue(row.has("name_ttl") == false);
+
+            i++;
+        }
+
+    }
+
+    @Test
+    public void testMixedTTLOnColumnsWide() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, i int, PRIMARY KEY (k, c))");
+        execute("INSERT INTO %s (k, c) VALUES (2, 2);");
+        execute("INSERT INTO %s (k, c, i) VALUES (1, 1, 1) USING TTL 100;");
+        execute("INSERT INTO %s (k, c) VALUES (1, 2) ;");
+        execute("INSERT INTO %s (k, c, i) VALUES (1, 3, 3) USING TTL 100;");
+        execute("INSERT INTO %s (k, c, i) VALUES (3, 3, 3) USING TTL 100;");
+        assertRows(execute("SELECT k, c, i FROM %s "),
+                   row(1, 1, 1),
+                   row(1, 2, null),
+                   row(1, 3, 3),
+                   row(2, 2, null),
+                   row(3, 3, 3));
+
+        UntypedResultSet rs = execute("SELECT k, c, i, ttl(i) AS name_ttl FROM %s");
+        assertEquals("name_ttl", rs.metadata().get(3).name.toString());
+        int i = 0;
+        for (UntypedResultSet.Row row : rs)
+        {
+            if ( i % 2 == 0) // Every odd row has a null i/ttl
+                assertTrue(row.getInt("name_ttl") >= 90 && row.getInt("name_ttl") <= 100);
+            else
+                assertTrue(row.has("name_ttl") == false);
+
+            i++;
+        }
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testWithCompactStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c) VALUES (1, 1, 1)");
+        execute("INSERT INTO %s (a, b, c) VALUES (2, 1, 1)");
+        assertRows(execute("SELECT a, b, c FROM %s"),
+                   row(1, 1, 1),
+                   row(2, 1, 1));
+        testWithCompactFormat();
+
+        // if column column1 is present, hidden column is called column2
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, column1 int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c, column1) VALUES (1, 1, 1, 1)");
+        execute("INSERT INTO %s (a, b, c, column1) VALUES (2, 1, 1, 2)");
+        assertRows(execute("SELECT a, b, c, column1 FROM %s"),
+                   row(1, 1, 1, 1),
+                   row(2, 1, 1, 2));
+        assertInvalidMessage("Undefined name column2 in selection clause",
+                             "SELECT a, column2, value FROM %s");
+
+        // if column value is present, hidden column is called value1
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, value int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c, value) VALUES (1, 1, 1, 1)");
+        execute("INSERT INTO %s (a, b, c, value) VALUES (2, 1, 1, 2)");
+        assertRows(execute("SELECT a, b, c, value FROM %s"),
+                   row(1, 1, 1, 1),
+                   row(2, 1, 1, 2));
+        assertInvalidMessage("Undefined name value1 in selection clause",
+                             "SELECT a, value1, value FROM %s");
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+    */
+    @Test
+    public void testWithCompactNonStaticFormat() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b) VALUES (1, 1)");
+        execute("INSERT INTO %s (a, b) VALUES (2, 1)");
+        assertRows(execute("SELECT a, b FROM %s"),
+                   row(1, 1),
+                   row(2, 1));
+        testWithCompactFormat();
+
+        createTable("CREATE TABLE %s (a int, b int, v int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, v) VALUES (1, 1, 3)");
+        execute("INSERT INTO %s (a, b, v) VALUES (2, 1, 4)");
+        assertRows(execute("SELECT a, b, v FROM %s"),
+                   row(1, 1, 3),
+                   row(2, 1, 4));
+        testWithCompactFormat();
+    }
+
+    private void testWithCompactFormat() throws Throwable
+    {
+        assertInvalidMessage("Order by on unknown column value",
+                             "SELECT * FROM %s WHERE a IN (1,2,3) ORDER BY value ASC");
+        assertInvalidMessage("Order by on unknown column column1",
+                             "SELECT * FROM %s WHERE a IN (1,2,3) ORDER BY column1 ASC");
+        assertInvalidMessage("Undefined name column1 in selection clause",
+                             "SELECT column1 FROM %s");
+        assertInvalidMessage("Undefined name value in selection clause",
+                             "SELECT value FROM %s");
+        assertInvalidMessage("Undefined name value in selection clause",
+                             "SELECT value, column1 FROM %s");
+        assertInvalid("Undefined name column1 in where clause ('column1 = NULL')",
+                      "SELECT * FROM %s WHERE column1 = null ALLOW FILTERING");
+        assertInvalid("Undefined name value in where clause ('value = NULL')",
+                      "SELECT * FROM %s WHERE value = null ALLOW FILTERING");
+        assertInvalidMessage("Undefined name column1 in selection clause",
+                             "SELECT WRITETIME(column1) FROM %s");
+        assertInvalidMessage("Undefined name value in selection clause",
+                             "SELECT WRITETIME(value) FROM %s");
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java
index b1eaac1..fc70974 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java

@@ -12,10 +12,10 @@
 import org.apache.cassandra.cql3.Attributes;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.BufferExpiringCell;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.ExpiringCell;
+import org.apache.cassandra.db.ExpirationDateOverflowHandling;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.rows.AbstractCell;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -25,7 +25,7 @@
 {
     public static String NEGATIVE_LOCAL_EXPIRATION_TEST_DIR = "test/data/negative-local-expiration-test/%s";
 
-    public static int MAX_TTL = ExpiringCell.MAX_TTL;
+    public static int MAX_TTL = Attributes.MAX_TTL;
 
     public static final String SIMPLE_NOCLUSTERING = "table1";
     public static final String SIMPLE_CLUSTERING = "table2";
@@ -37,7 +37,7 @@
     {
         createTable("CREATE TABLE %s (k int PRIMARY KEY, i int)");
         // insert with low TTL should not be denied
-        execute("INSERT INTO %s (k, i) VALUES (1, 1) USING TTL ?", 10); // max ttl
+        execute("INSERT INTO %s (k, i) VALUES (1, 1) USING TTL ?", 10);
 
         try
         {
@@ -61,7 +61,7 @@
         execute("TRUNCATE %s");
 
         // insert with low TTL should not be denied
-        execute("UPDATE %s USING TTL ? SET i = 1 WHERE k = 2", 5); // max ttl
+        execute("UPDATE %s USING TTL ? SET i = 1 WHERE k = 2", 5);
 
         try
         {
@@ -84,7 +84,6 @@
         }
     }
 
-
     @Test
     public void testTTLDefaultLimit() throws Throwable
     {
@@ -97,7 +96,7 @@
         {
             assertTrue(e.getCause()
                         .getMessage()
-                        .contains("default_time_to_live cannot be smaller than 0"));
+                        .contains("default_time_to_live must be greater than or equal to 0 (got -1)"));
         }
         try
         {
@@ -119,9 +118,32 @@
     }
 
     @Test
-    public void testRejectExpirationDateOverflowPolicy() throws Throwable
+    public void testCapWarnExpirationOverflowPolicy() throws Throwable
     {
-        Attributes.policy = Attributes.ExpirationDateOverflowPolicy.REJECT;
+        // We don't test that the actual warn is logged here, only on dtest
+        testCapExpirationDateOverflowPolicy(ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy.CAP);
+    }
+
+    @Test
+    public void testCapNoWarnExpirationOverflowPolicy() throws Throwable
+    {
+        testCapExpirationDateOverflowPolicy(ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy.CAP_NOWARN);
+    }
+
+    @Test
+    public void testCapNoWarnExpirationOverflowPolicyDefaultTTL() throws Throwable
+    {
+        ExpirationDateOverflowHandling.policy = ExpirationDateOverflowHandling.policy.CAP_NOWARN;
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, i int) WITH default_time_to_live=" + MAX_TTL);
+        execute("INSERT INTO %s (k, i) VALUES (1, 1)");
+        checkTTLIsCapped("i");
+        ExpirationDateOverflowHandling.policy = ExpirationDateOverflowHandling.policy.REJECT;
+    }
+
+    @Test
+    public void testRejectExpirationOverflowPolicy() throws Throwable
+    {
+        //ExpirationDateOverflowHandling.expirationDateOverflowPolicy = ExpirationDateOverflowHandling.expirationDateOverflowPolicy.REJECT;
         createTable("CREATE TABLE %s (k int PRIMARY KEY, i int)");
         try
         {
@@ -143,45 +165,6 @@
     }
 
     @Test
-    public void testCapExpirationDatePolicyDefaultTTL() throws Throwable
-    {
-        Attributes.policy = Attributes.ExpirationDateOverflowPolicy.CAP;
-        createTable("CREATE TABLE %s (k int PRIMARY KEY, i int) WITH default_time_to_live=" + MAX_TTL);
-        execute("INSERT INTO %s (k, i) VALUES (1, 1)");
-        checkTTLIsCapped("i");
-        Attributes.policy = Attributes.ExpirationDateOverflowPolicy.REJECT;
-    }
-
-    @Test
-    public void testCapExpirationDatePolicyPerRequest() throws Throwable
-    {
-        // Test cap policy
-        Attributes.policy = Attributes.ExpirationDateOverflowPolicy.CAP;
-
-        // simple column, clustering, flush
-        baseCapExpirationDateOverflowTest(true, true, true);
-        // simple column, clustering, noflush
-        baseCapExpirationDateOverflowTest(true, true, false);
-        // simple column, noclustering, flush
-        baseCapExpirationDateOverflowTest(true, false, true);
-        // simple column, noclustering, noflush
-        baseCapExpirationDateOverflowTest(true, false, false);
-        // complex column, clustering, flush
-        baseCapExpirationDateOverflowTest(false, true, true);
-        // complex column, clustering, noflush
-        baseCapExpirationDateOverflowTest(false, true, false);
-        // complex column, noclustering, flush
-        baseCapExpirationDateOverflowTest(false, false, true);
-        // complex column, noclustering, noflush
-        baseCapExpirationDateOverflowTest(false, false, false);
-        // complex column, noclustering, flush
-        baseCapExpirationDateOverflowTest(false, false, false);
-
-        // Return to previous policy
-        Attributes.policy = Attributes.ExpirationDateOverflowPolicy.REJECT;
-    }
-
-    @Test
     public void testRecoverOverflowedExpirationWithScrub() throws Throwable
     {
         baseTestRecoverOverflowedExpiration(false, false);
@@ -189,28 +172,40 @@
         baseTestRecoverOverflowedExpiration(true, true);
     }
 
-    public void baseCapExpirationDateOverflowTest(boolean simple, boolean clustering, boolean flush) throws Throwable
+    public void testCapExpirationDateOverflowPolicy(ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy policy) throws Throwable
+    {
+        ExpirationDateOverflowHandling.policy = policy;
+
+        // simple column, clustering, flush
+        testCapExpirationDateOverflowPolicy(true, true, true);
+        // simple column, clustering, noflush
+        testCapExpirationDateOverflowPolicy(true, true, false);
+        // simple column, noclustering, flush
+        testCapExpirationDateOverflowPolicy(true, false, true);
+        // simple column, noclustering, noflush
+        testCapExpirationDateOverflowPolicy(true, false, false);
+        // complex column, clustering, flush
+        testCapExpirationDateOverflowPolicy(false, true, true);
+        // complex column, clustering, noflush
+        testCapExpirationDateOverflowPolicy(false, true, false);
+        // complex column, noclustering, flush
+        testCapExpirationDateOverflowPolicy(false, false, true);
+        // complex column, noclustering, noflush
+        testCapExpirationDateOverflowPolicy(false, false, false);
+
+        // Return to previous policy
+        ExpirationDateOverflowHandling.policy = ExpirationDateOverflowHandling.ExpirationDateOverflowPolicy.REJECT;
+    }
+
+    public void testCapExpirationDateOverflowPolicy(boolean simple, boolean clustering, boolean flush) throws Throwable
     {
         // Create Table
-        if (simple)
-        {
-            if (clustering)
-                createTable("create table %s (k int, a int, b int, primary key(k, a))");
-            else
-                createTable("create table %s (k int primary key, a int, b int)");
-        }
-        else
-        {
-            if (clustering)
-                createTable("create table %s (k int, a int, b set<text>, primary key(k, a))");
-            else
-                createTable("create table %s (k int primary key, a int, b set<text>)");
-        }
+        createTable(simple, clustering);
 
         // Insert data with INSERT and UPDATE
         if (simple)
         {
-            execute("INSERT INTO %s (k, a, b) VALUES (?, ?, ?) USING TTL " + MAX_TTL, 2, 2, 2);
+            execute("INSERT INTO %s (k, a) VALUES (?, ?) USING TTL " + MAX_TTL, 2, 2);
             if (clustering)
                 execute("UPDATE %s USING TTL " + MAX_TTL + " SET b = 1 WHERE k = 1 AND a = 1;");
             else
@@ -256,108 +251,6 @@
         testRecoverOverflowedExpirationWithScrub(false, false, runScrub, reinsertOverflowedTTL);
     }
 
-    private void verifyData(boolean simple) throws Throwable
-    {
-        if (simple)
-        {
-            assertRows(execute("SELECT * from %s"), row(1, 1, 1), row(2, 2, 2));
-        }
-        else
-        {
-            assertRows(execute("SELECT * from %s"), row(1, 1, set("v11", "v12", "v13", "v14")), row(2, 2, set("v21", "v22", "v23", "v24")));
-        }
-        // Cannot retrieve TTL from collections
-        if (simple)
-            checkTTLIsCapped("b");
-    }
-
-    /**
-     * Verify that the computed TTL is approximately equal to the maximum allowed ttl given the
-     * {@link ExpiringCell#getLocalDeletionTime()} field limitation (CASSANDRA-14092)
-     */
-    private void checkTTLIsCapped(String field) throws Throwable
-    {
-
-        // TTL is computed dynamically from row expiration time, so if it is
-        // equal or higher to the minimum max TTL we compute before the query
-        // we are fine.
-        int minMaxTTL = computeMaxTTL();
-        UntypedResultSet execute = execute("SELECT ttl(" + field + ") FROM %s");
-        for (UntypedResultSet.Row row : execute)
-        {
-            int ttl = row.getInt("ttl(" + field + ")");
-            assertTrue(ttl >= minMaxTTL);
-        }
-    }
-
-    /**
-     * The max TTL is computed such that the TTL summed with the current time is equal to the maximum
-     * allowed expiration time {@link BufferExpiringCell#getLocalDeletionTime()} (2038-01-19T03:14:06+00:00)
-     */
-    private int computeMaxTTL()
-    {
-        int nowInSecs = (int) (System.currentTimeMillis() / 1000);
-        return BufferExpiringCell.MAX_DELETION_TIME - nowInSecs;
-    }
-
-    public void testRecoverOverflowedExpirationWithScrub(boolean simple, boolean clustering, boolean runScrub, boolean reinsertOverflowedTTL) throws Throwable
-    {
-        if (reinsertOverflowedTTL)
-        {
-            assert runScrub;
-        }
-
-        createTable(simple, clustering);
-
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(currentTable());
-
-        assertEquals(0, cfs.getSSTables().size());
-
-        copySSTablesToTableDir(currentTable(), simple, clustering);
-
-        cfs.loadNewSSTables();
-
-        if (runScrub)
-        {
-            cfs.scrub(true, false, false, reinsertOverflowedTTL, 1);
-        }
-
-        if (reinsertOverflowedTTL)
-        {
-            if (simple)
-                assertRows(execute("SELECT * from %s"), row(1, 1, 1), row(2, 2, 2));
-            else
-                assertRows(execute("SELECT * from %s"), row(1, 1, set("v11", "v12", "v13", "v14")), row(2, 2, set("v21", "v22", "v23", "v24")));
-
-            cfs.forceMajorCompaction();
-
-            if (simple)
-                assertRows(execute("SELECT * from %s"), row(1, 1, 1), row(2, 2, 2));
-            else
-                assertRows(execute("SELECT * from %s"), row(1, 1, set("v11", "v12", "v13", "v14")), row(2, 2, set("v21", "v22", "v23", "v24")));
-        }
-        else
-        {
-            assertEmpty(execute("SELECT * from %s"));
-        }
-    }
-
-    private void copySSTablesToTableDir(String table, boolean simple, boolean clustering) throws IOException
-    {
-        File destDir = Keyspace.open(keyspace()).getColumnFamilyStore(table).directories.getCFDirectories().iterator().next();
-        File sourceDir = getTableDir(table, simple, clustering);
-        for (File file : sourceDir.listFiles())
-        {
-            copyFile(file, destDir);
-        }
-    }
-
-    private static File getTableDir(String table, boolean simple, boolean clustering)
-    {
-        return new File(String.format(NEGATIVE_LOCAL_EXPIRATION_TEST_DIR, getTableName(simple, clustering)));
-    }
-
     private void createTable(boolean simple, boolean clustering)
     {
         if (simple)
@@ -376,7 +269,104 @@
         }
     }
 
-    private static File getTableDir(boolean simple, boolean clustering)
+    private void verifyData(boolean simple) throws Throwable
+    {
+        if (simple)
+        {
+            assertRows(execute("SELECT * from %s"), row(1, 1, 1), row(2, 2, null));
+        }
+        else
+        {
+            assertRows(execute("SELECT * from %s"), row(1, 1, set("v11", "v12", "v13", "v14")), row(2, 2, set("v21", "v22", "v23", "v24")));
+        }
+        // Cannot retrieve TTL from collections
+        if (simple)
+            checkTTLIsCapped("b");
+    }
+
+    /**
+     * Verify that the computed TTL is equal to the maximum allowed ttl given the
+     * {@link AbstractCell#localDeletionTime()} field limitation (CASSANDRA-14092)
+     */
+    private void checkTTLIsCapped(String field) throws Throwable
+    {
+
+        // TTL is computed dynamically from row expiration time, so if it is
+        // equal or higher to the minimum max TTL we compute before the query
+        // we are fine.
+        int minMaxTTL = computeMaxTTL();
+        UntypedResultSet execute = execute("SELECT ttl(" + field + ") FROM %s WHERE k = 1");
+        for (UntypedResultSet.Row row : execute)
+        {
+            int ttl = row.getInt("ttl(" + field + ")");
+            assertTrue(ttl >= minMaxTTL);
+        }
+    }
+
+    /**
+     * The max TTL is computed such that the TTL summed with the current time is equal to the maximum
+     * allowed expiration time {@link org.apache.cassandra.db.rows.Cell#MAX_DELETION_TIME} (2038-01-19T03:14:06+00:00)
+     */
+    private int computeMaxTTL()
+    {
+        int nowInSecs = (int) (System.currentTimeMillis() / 1000);
+        return AbstractCell.MAX_DELETION_TIME - nowInSecs;
+    }
+
+    public void testRecoverOverflowedExpirationWithScrub(boolean simple, boolean clustering, boolean runScrub, boolean reinsertOverflowedTTL) throws Throwable
+    {
+        if (reinsertOverflowedTTL)
+        {
+            assert runScrub;
+        }
+
+        createTable(simple, clustering);
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(currentTable());
+
+        assertEquals(0, cfs.getLiveSSTables().size());
+
+        copySSTablesToTableDir(currentTable(), simple, clustering);
+
+        cfs.loadNewSSTables();
+
+        if (runScrub)
+        {
+            cfs.scrub(true, false, true, reinsertOverflowedTTL, 1);
+        }
+
+        if (reinsertOverflowedTTL)
+        {
+            if (simple)
+                assertRows(execute("SELECT * from %s"), row(1, 1, 1), row(2, 2, null));
+            else
+                assertRows(execute("SELECT * from %s"), row(1, 1, set("v11", "v12", "v13", "v14")), row(2, 2, set("v21", "v22", "v23", "v24")));
+
+            cfs.forceMajorCompaction();
+
+            if (simple)
+                assertRows(execute("SELECT * from %s"), row(1, 1, 1), row(2, 2, null));
+            else
+                assertRows(execute("SELECT * from %s"), row(1, 1, set("v11", "v12", "v13", "v14")), row(2, 2, set("v21", "v22", "v23", "v24")));
+        }
+        else
+        {
+            assertEmpty(execute("SELECT * from %s"));
+        }
+    }
+
+    private void copySSTablesToTableDir(String table, boolean simple, boolean clustering) throws IOException
+    {
+        File destDir = Keyspace.open(keyspace()).getColumnFamilyStore(table).getDirectories().getCFDirectories().iterator().next();
+        File sourceDir = getTableDir(table, simple, clustering);
+        for (File file : sourceDir.listFiles())
+        {
+            copyFile(file, destDir);
+        }
+    }
+
+    private static File getTableDir(String table, boolean simple, boolean clustering)
     {
         return new File(String.format(NEGATIVE_LOCAL_EXPIRATION_TEST_DIR, getTableName(simple, clustering)));
     }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/ThriftCQLTester.java b/test/unit/org/apache/cassandra/cql3/validation/operations/ThriftCQLTester.java
index 7947317..5d4d1a0 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/ThriftCQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/ThriftCQLTester.java

@@ -53,7 +53,6 @@
     @BeforeClass
     public static void setup() throws Exception
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
         StorageService.instance.initServer(0);
 
         if (thriftServer == null || ! thriftServer.isRunning())
@@ -77,11 +76,11 @@
         return getClient(FBUtilities.getLocalAddress().getHostName(), thriftPort);
     }
 
-	public Cassandra.Client getClient(String hostname, int thriftPort) throws Throwable
+    public Cassandra.Client getClient(String hostname, int thriftPort) throws Throwable
 	{
         if (client == null)
             client = new Cassandra.Client(new TBinaryProtocol(new TFramedTransportFactory().openTransport(hostname, thriftPort)));
 
         return client;
-	}
+    }
 }

diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java
index c5d153f..8a9be19 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/UpdateTest.java

@@ -18,45 +18,21 @@
 
 package org.apache.cassandra.cql3.validation.operations;
 
+import java.util.Arrays;
+
+import org.junit.Assert;
 import org.junit.Test;
 
+import static org.apache.commons.lang3.StringUtils.isEmpty;
+import static org.junit.Assert.assertTrue;
+
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class UpdateTest extends CQLTester
 {
-    /**
-     * Test altering the type of a column, including the one in the primary key (#4041)
-     * migrated from cql_tests.py:TestCQL.update_type_test()
-     */
-    @Test
-    public void testUpdateColumnType() throws Throwable
-    {
-        createTable("CREATE TABLE %s (k text, c text, s set <text>, v text, PRIMARY KEY(k, c))");
-
-        // using utf8 character so that we can see the transition to BytesType
-        execute("INSERT INTO %s (k, c, v, s) VALUES ('ɸ', 'ɸ', 'ɸ', {'ɸ'})");
-
-        assertRows(execute("SELECT * FROM %s"),
-                   row("ɸ", "ɸ", set("ɸ"), "ɸ"));
-
-        execute("ALTER TABLE %s ALTER v TYPE blob");
-        assertRows(execute("SELECT * FROM %s"),
-                   row("ɸ", "ɸ", set("ɸ"), ByteBufferUtil.bytes("ɸ")));
-
-        execute("ALTER TABLE %s ALTER k TYPE blob");
-        assertRows(execute("SELECT * FROM %s"),
-                   row(ByteBufferUtil.bytes("ɸ"), "ɸ", set("ɸ"), ByteBufferUtil.bytes("ɸ")));
-
-        execute("ALTER TABLE %s ALTER c TYPE blob");
-        assertRows(execute("SELECT * FROM %s"),
-                   row(ByteBufferUtil.bytes("ɸ"), ByteBufferUtil.bytes("ɸ"), set("ɸ"), ByteBufferUtil.bytes("ɸ")));
-
-        execute("ALTER TABLE %s ALTER s TYPE set<blob>");
-        assertRows(execute("SELECT * FROM %s"),
-                   row(ByteBufferUtil.bytes("ɸ"), ByteBufferUtil.bytes("ɸ"), set(ByteBufferUtil.bytes("ɸ")), ByteBufferUtil.bytes("ɸ")));
-    }
-
     @Test
     public void testTypeCasts() throws Throwable
     {
@@ -83,4 +59,610 @@
         assertInvalid("UPDATE %s SET d = (int)3 WHERE k = ?", 0);
         assertInvalid("UPDATE %s SET i = (double)3 WHERE k = ?", 0);
     }
+
+    @Test
+    public void testUpdate() throws Throwable
+    {
+        testUpdate(false);
+        testUpdate(true);
+    }
+
+    private void testUpdate(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] {"", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int," +
+                    "clustering_1 int," +
+                    "value int," +
+                    " PRIMARY KEY (partitionKey, clustering_1))" + compactOption);
+
+            execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 0, 0)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 1, 1)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 2, 2)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (0, 3, 3)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, value) VALUES (1, 0, 4)");
+
+            flush(forceFlush);
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ?", 7, 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT value FROM %s WHERE partitionKey = ? AND clustering_1 = ?",
+                               0, 1),
+                       row(7));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND (clustering_1) = (?)", 8, 0, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT value FROM %s WHERE partitionKey = ? AND clustering_1 = ?",
+                               0, 2),
+                       row(8));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey IN (?, ?) AND clustering_1 = ?", 9, 0, 1, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?) AND clustering_1 = ?",
+                               0, 1, 0),
+                       row(0, 0, 9),
+                       row(1, 0, 9));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey IN ? AND clustering_1 = ?", 19, Arrays.asList(0, 1), 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN ? AND clustering_1 = ?",
+                               Arrays.asList(0, 1), 0),
+                       row(0, 0, 19),
+                       row(1, 0, 19));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 IN (?, ?)", 10, 0, 1, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 IN (?, ?)",
+                               0, 1, 0),
+                       row(0, 0, 10),
+                       row(0, 1, 10));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND (clustering_1) IN ((?), (?))", 20, 0, 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1) IN ((?), (?))",
+                               0, 0, 1),
+                       row(0, 0, 20),
+                       row(0, 1, 20));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ?", null, 0, 0);
+            flush(forceFlush);
+
+            if (isEmpty(compactOption))
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1) IN ((?), (?))",
+                                   0, 0, 1),
+                           row(0, 0, null),
+                           row(0, 1, 20));
+            }
+            else
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1) IN ((?), (?))",
+                                   0, 0, 1),
+                           row(0, 1, 20));
+            }
+
+            // test invalid queries
+
+            // missing primary key element
+            assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                                 "UPDATE %s SET value = ? WHERE clustering_1 = ? ", 7, 1);
+
+            assertInvalidMessage("Some clustering keys are missing: clustering_1",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ?", 7, 0);
+
+            assertInvalidMessage("Some clustering keys are missing: clustering_1",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ?", 7, 0);
+
+            // token function
+            assertInvalidMessage("The token function cannot be used in WHERE clauses for UPDATE statements",
+                                 "UPDATE %s SET value = ? WHERE token(partitionKey) = token(?) AND clustering_1 = ?",
+                                 7, 0, 1);
+
+            // multiple time the same value
+            assertInvalidSyntax("UPDATE %s SET value = ?, value = ? WHERE partitionKey = ? AND clustering_1 = ?", 7, 0, 1);
+
+            // multiple time same primary key element in WHERE clause
+            assertInvalidMessage("clustering_1 cannot be restricted by more than one relation if it includes an Equal",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_1 = ?", 7, 0, 1, 1);
+
+            // unknown identifiers
+            assertInvalidMessage("Unknown identifier value1",
+                                 "UPDATE %s SET value1 = ? WHERE partitionKey = ? AND clustering_1 = ?", 7, 0, 1);
+
+            assertInvalidMessage("Undefined name partitionkey1 in where clause ('partitionkey1 = ?')",
+                                 "UPDATE %s SET value = ? WHERE partitionKey1 = ? AND clustering_1 = ?", 7, 0, 1);
+
+            assertInvalidMessage("Undefined name clustering_3 in where clause ('clustering_3 = ?')",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_3 = ?", 7, 0, 1);
+
+            // Invalid operator in the where clause
+            assertInvalidMessage("Only EQ and IN relation are supported on the partition key (unless you use the token() function)",
+                                 "UPDATE %s SET value = ? WHERE partitionKey > ? AND clustering_1 = ?", 7, 0, 1);
+
+            assertInvalidMessage("Cannot use CONTAINS on non-collection column partitionkey",
+                                 "UPDATE %s SET value = ? WHERE partitionKey CONTAINS ? AND clustering_1 = ?", 7, 0, 1);
+
+            assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND value = ?", 7, 0, 1, 3);
+
+            assertInvalidMessage("Slice restrictions are not supported on the clustering columns in UPDATE statements",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 > ?", 7, 0, 1);
+        }
+    }
+
+    @Test
+    public void testUpdateWithSecondaryIndices() throws Throwable
+    {
+        testUpdateWithSecondaryIndices(false);
+        testUpdateWithSecondaryIndices(true);
+    }
+
+    private void testUpdateWithSecondaryIndices(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                "clustering_1 int," +
+                "value int," +
+                "values set<int>," +
+                " PRIMARY KEY (partitionKey, clustering_1))");
+
+        createIndex("CREATE INDEX ON %s (value)");
+        createIndex("CREATE INDEX ON %s (clustering_1)");
+        createIndex("CREATE INDEX ON %s (values)");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 0, 0, {0})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 1, 1, {0, 1})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 2, 2, {0, 1, 2})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (0, 3, 3, {0, 1, 2, 3})");
+        execute("INSERT INTO %s (partitionKey, clustering_1, value, values) VALUES (1, 0, 4, {0, 1, 2, 3, 4})");
+
+        flush(forceFlush);
+
+        assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                             "UPDATE %s SET values= {6} WHERE partitionKey = ? AND clustering_1 = ? AND value = ?", 3, 3, 3);
+        assertInvalidMessage("Non PRIMARY KEY columns found in where clause: values",
+                             "UPDATE %s SET value= ? WHERE partitionKey = ? AND clustering_1 = ? AND values CONTAINS ?", 6, 3, 3, 3);
+        assertInvalidMessage("Some clustering keys are missing: clustering_1",
+                             "UPDATE %s SET values= {6} WHERE partitionKey = ? AND value = ?", 3, 3);
+        assertInvalidMessage("Some clustering keys are missing: clustering_1",
+                             "UPDATE %s SET value= ? WHERE partitionKey = ? AND values CONTAINS ?", 6, 3, 3);
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "UPDATE %s SET values= {6} WHERE clustering_1 = ?", 3);
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "UPDATE %s SET values= {6} WHERE value = ?", 3);
+        assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                             "UPDATE %s SET value= ? WHERE values CONTAINS ?", 6, 3);
+    }
+
+    @Test
+    public void testUpdateWithTwoClusteringColumns() throws Throwable
+    {
+        testUpdateWithTwoClusteringColumns(false);
+        testUpdateWithTwoClusteringColumns(true);
+    }
+
+    private void testUpdateWithTwoClusteringColumns(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] { "", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey int," +
+                    "clustering_1 int," +
+                    "clustering_2 int," +
+                    "value int," +
+                    " PRIMARY KEY (partitionKey, clustering_1, clustering_2))" + compactOption);
+
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 0, 0)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 1, 1)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 2, 2)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 3, 3)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 1, 1, 4)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 1, 2, 5)");
+            execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (1, 0, 0, 6)");
+            flush(forceFlush);
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 1, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT value FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                               0, 1, 1),
+                       row(7));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND (clustering_1, clustering_2) = (?, ?)", 8, 0, 1, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT value FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                               0, 1, 2),
+                       row(8));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey IN (?, ?) AND clustering_1 = ? AND clustering_2 = ?", 9, 0, 1, 0, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN (?, ?) AND clustering_1 = ? AND clustering_2 = ?",
+                               0, 1, 0, 0),
+                       row(0, 0, 0, 9),
+                       row(1, 0, 0, 9));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey IN ? AND clustering_1 = ? AND clustering_2 = ?", 9, Arrays.asList(0, 1), 0, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey IN ? AND clustering_1 = ? AND clustering_2 = ?",
+                               Arrays.asList(0, 1), 0, 0),
+                       row(0, 0, 0, 9),
+                       row(1, 0, 0, 9));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 IN (?, ?)", 12, 0, 1, 1, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 IN (?, ?)",
+                               0, 1, 1, 2),
+                       row(0, 1, 1, 12),
+                       row(0, 1, 2, 12));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 IN (?, ?) AND clustering_2 IN (?, ?)", 10, 0, 1, 0, 1, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 IN (?, ?) AND clustering_2 IN (?, ?)",
+                               0, 1, 0, 1, 2),
+                       row(0, 0, 1, 10),
+                       row(0, 0, 2, 10),
+                       row(0, 1, 1, 10),
+                       row(0, 1, 2, 10));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND (clustering_1, clustering_2) IN ((?, ?), (?, ?))", 20, 0, 0, 2, 1, 2);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) IN ((?, ?), (?, ?))",
+                               0, 0, 2, 1, 2),
+                       row(0, 0, 2, 20),
+                       row(0, 1, 2, 20));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?", null, 0, 0, 2);
+            flush(forceFlush);
+
+            if (isEmpty(compactOption))
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) IN ((?, ?), (?, ?))",
+                                   0, 0, 2, 1, 2),
+                           row(0, 0, 2, null),
+                           row(0, 1, 2, 20));
+            }
+            else
+            {
+                assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND (clustering_1, clustering_2) IN ((?, ?), (?, ?))",
+                                   0, 0, 2, 1, 2),
+                           row(0, 1, 2, 20));
+            }
+
+            // test invalid queries
+
+            // missing primary key element
+            assertInvalidMessage("Some partition key parts are missing: partitionkey",
+                                 "UPDATE %s SET value = ? WHERE clustering_1 = ? AND clustering_2 = ?", 7, 1, 1);
+
+            String errorMsg = isEmpty(compactOption) ? "Some clustering keys are missing: clustering_1"
+                                                     : "PRIMARY KEY column \"clustering_2\" cannot be restricted as preceding column \"clustering_1\" is not restricted";
+
+            assertInvalidMessage(errorMsg,
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_2 = ?", 7, 0, 1);
+
+            assertInvalidMessage("Some clustering keys are missing: clustering_1, clustering_2",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ?", 7, 0);
+
+            // token function
+            assertInvalidMessage("The token function cannot be used in WHERE clauses for UPDATE statements",
+                                 "UPDATE %s SET value = ? WHERE token(partitionKey) = token(?) AND clustering_1 = ? AND clustering_2 = ?",
+                                 7, 0, 1, 1);
+
+            // multiple time the same value
+            assertInvalidSyntax("UPDATE %s SET value = ?, value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 1, 1);
+
+            // multiple time same primary key element in WHERE clause
+            assertInvalidMessage("clustering_1 cannot be restricted by more than one relation if it includes an Equal",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ? AND clustering_1 = ?", 7, 0, 1, 1, 1);
+
+            // unknown identifiers
+            assertInvalidMessage("Unknown identifier value1",
+                                 "UPDATE %s SET value1 = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 1, 1);
+
+            assertInvalidMessage("Undefined name partitionkey1 in where clause ('partitionkey1 = ?')",
+                                 "UPDATE %s SET value = ? WHERE partitionKey1 = ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 1, 1);
+
+            assertInvalidMessage("Undefined name clustering_3 in where clause ('clustering_3 = ?')",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_3 = ?", 7, 0, 1, 1);
+
+            // Invalid operator in the where clause
+            assertInvalidMessage("Only EQ and IN relation are supported on the partition key (unless you use the token() function)",
+                                 "UPDATE %s SET value = ? WHERE partitionKey > ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 1, 1);
+
+            assertInvalidMessage("Cannot use CONTAINS on non-collection column partitionkey",
+                                 "UPDATE %s SET value = ? WHERE partitionKey CONTAINS ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 1, 1);
+
+            assertInvalidMessage("Non PRIMARY KEY columns found in where clause: value",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ? AND value = ?", 7, 0, 1, 1, 3);
+
+            assertInvalidMessage("Slice restrictions are not supported on the clustering columns in UPDATE statements",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND clustering_1 > ?", 7, 0, 1);
+
+            assertInvalidMessage("Slice restrictions are not supported on the clustering columns in UPDATE statements",
+                                 "UPDATE %s SET value = ? WHERE partitionKey = ? AND (clustering_1, clustering_2) > (?, ?)", 7, 0, 1, 1);
+        }
+    }
+
+    @Test
+    public void testUpdateWithMultiplePartitionKeyComponents() throws Throwable
+    {
+        testUpdateWithMultiplePartitionKeyComponents(false);
+        testUpdateWithMultiplePartitionKeyComponents(true);
+    }
+
+    public void testUpdateWithMultiplePartitionKeyComponents(boolean forceFlush) throws Throwable
+    {
+        for (String compactOption : new String[] { "", " WITH COMPACT STORAGE" })
+        {
+            createTable("CREATE TABLE %s (partitionKey_1 int," +
+                    "partitionKey_2 int," +
+                    "clustering_1 int," +
+                    "clustering_2 int," +
+                    "value int," +
+                    " PRIMARY KEY ((partitionKey_1, partitionKey_2), clustering_1, clustering_2))" + compactOption);
+
+            execute("INSERT INTO %s (partitionKey_1, partitionKey_2, clustering_1, clustering_2, value) VALUES (0, 0, 0, 0, 0)");
+            execute("INSERT INTO %s (partitionKey_1, partitionKey_2, clustering_1, clustering_2, value) VALUES (0, 1, 0, 1, 1)");
+            execute("INSERT INTO %s (partitionKey_1, partitionKey_2, clustering_1, clustering_2, value) VALUES (0, 1, 1, 1, 2)");
+            execute("INSERT INTO %s (partitionKey_1, partitionKey_2, clustering_1, clustering_2, value) VALUES (1, 0, 0, 1, 3)");
+            execute("INSERT INTO %s (partitionKey_1, partitionKey_2, clustering_1, clustering_2, value) VALUES (1, 1, 0, 1, 3)");
+            flush(forceFlush);
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey_1 = ? AND partitionKey_2 = ? AND clustering_1 = ? AND clustering_2 = ?", 7, 0, 0, 0, 0);
+            flush(forceFlush);
+            assertRows(execute("SELECT value FROM %s WHERE partitionKey_1 = ? AND partitionKey_2 = ? AND clustering_1 = ? AND clustering_2 = ?",
+                               0, 0, 0, 0),
+                       row(7));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey_1 IN (?, ?) AND partitionKey_2 = ? AND clustering_1 = ? AND clustering_2 = ?", 9, 0, 1, 1, 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s WHERE partitionKey_1 IN (?, ?) AND partitionKey_2 = ? AND clustering_1 = ? AND clustering_2 = ?",
+                               0, 1, 1, 0, 1),
+                       row(0, 1, 0, 1, 9),
+                       row(1, 1, 0, 1, 9));
+
+            execute("UPDATE %s SET value = ? WHERE partitionKey_1 IN (?, ?) AND partitionKey_2 IN (?, ?) AND clustering_1 = ? AND clustering_2 = ?", 10, 0, 1, 0, 1, 0, 1);
+            flush(forceFlush);
+            assertRows(execute("SELECT * FROM %s"),
+                       row(0, 0, 0, 0, 7),
+                       row(0, 0, 0, 1, 10),
+                       row(0, 1, 0, 1, 10),
+                       row(0, 1, 1, 1, 2),
+                       row(1, 0, 0, 1, 10),
+                       row(1, 1, 0, 1, 10));
+
+            // missing primary key element
+            assertInvalidMessage("Some partition key parts are missing: partitionkey_2",
+                                 "UPDATE %s SET value = ? WHERE partitionKey_1 = ? AND clustering_1 = ? AND clustering_2 = ?", 7, 1, 1);
+        }
+    }
+
+    @Test
+    public void testUpdateWithAStaticColumn() throws Throwable
+    {
+        testUpdateWithAStaticColumn(false);
+        testUpdateWithAStaticColumn(true);
+    }
+
+    private void testUpdateWithAStaticColumn(boolean forceFlush) throws Throwable
+    {
+        createTable("CREATE TABLE %s (partitionKey int," +
+                                      "clustering_1 int," +
+                                      "clustering_2 int," +
+                                      "value int," +
+                                      "staticValue text static," +
+                                      " PRIMARY KEY (partitionKey, clustering_1, clustering_2))");
+
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value, staticValue) VALUES (0, 0, 0, 0, 'A')");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value) VALUES (0, 0, 1, 1)");
+        execute("INSERT INTO %s (partitionKey, clustering_1, clustering_2, value, staticValue) VALUES (1, 0, 0, 6, 'B')");
+        flush(forceFlush);
+
+        execute("UPDATE %s SET staticValue = ? WHERE partitionKey = ?", "A2", 0);
+        flush(forceFlush);
+
+        assertRows(execute("SELECT DISTINCT staticValue FROM %s WHERE partitionKey = ?", 0),
+                   row("A2"));
+
+        assertInvalidMessage("Some clustering keys are missing: clustering_1, clustering_2",
+                             "UPDATE %s SET staticValue = ?, value = ? WHERE partitionKey = ?", "A2", 7, 0);
+
+        execute("UPDATE %s SET staticValue = ?, value = ?  WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                "A3", 7, 0, 0, 1);
+        flush(forceFlush);
+        assertRows(execute("SELECT * FROM %s WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                           0, 0, 1),
+                   row(0, 0, 1, "A3", 7));
+
+        assertInvalidMessage("Invalid restrictions on clustering columns since the UPDATE statement modifies only static columns",
+                             "UPDATE %s SET staticValue = ? WHERE partitionKey = ? AND clustering_1 = ? AND clustering_2 = ?",
+                             "A3", 0, 0, 1);
+    }
+
+    @Test
+    public void testUpdateWithStaticList() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int," +
+                                      "clustering int," +
+                                      "value int," +
+                                      "l list<text> static," +
+                                      " PRIMARY KEY (k, clustering))");
+
+        execute("INSERT INTO %s(k, clustering, value, l) VALUES (?, ?, ?, ?)", 0, 0, 0 ,list("v1", "v2", "v3"));
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v1", "v2", "v3")));
+
+        execute("UPDATE %s SET l[?] = ? WHERE k = ?", 1, "v4", 0);
+
+        assertRows(execute("SELECT l FROM %s WHERE k = 0"), row(list("v1", "v4", "v3")));
+    }
+
+    /**
+     * Test for CASSANDRA-12829
+     */
+    @Test
+    public void testUpdateWithEmptyInRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a,b))");
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)",1,1,1);
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)",1,2,2);
+        execute("INSERT INTO %s (a,b,c) VALUES (?,?,?)",1,3,3);
+
+        assertInvalidMessage("Some clustering keys are missing: b",
+                             "UPDATE %s SET c = 100 WHERE a IN ();");
+        execute("UPDATE %s SET c = 100 WHERE a IN () AND b IN ();");
+        execute("UPDATE %s SET c = 100 WHERE a IN () AND b = 1;");
+        execute("UPDATE %s SET c = 100 WHERE a = 1 AND b IN ();");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1,1,1),
+                   row(1,2,2),
+                   row(1,3,3));
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, s int static, PRIMARY KEY ((a,b), c))");
+        execute("INSERT INTO %s (a,b,c,d,s) VALUES (?,?,?,?,?)",1,1,1,1,1);
+        execute("INSERT INTO %s (a,b,c,d,s) VALUES (?,?,?,?,?)",1,1,2,2,1);
+        execute("INSERT INTO %s (a,b,c,d,s) VALUES (?,?,?,?,?)",1,1,3,3,1);
+
+        execute("UPDATE %s SET d = 100 WHERE a = 1 AND b = 1 AND c IN ();");
+        execute("UPDATE %s SET d = 100 WHERE a = 1 AND b IN () AND c IN ();");
+        execute("UPDATE %s SET d = 100 WHERE a IN () AND b IN () AND c IN ();");
+        execute("UPDATE %s SET d = 100 WHERE a IN () AND b IN () AND c = 1;");
+        execute("UPDATE %s SET d = 100 WHERE a IN () AND b = 1 AND c IN ();");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1,1,1,1,1),
+                   row(1,1,2,1,2),
+                   row(1,1,3,1,3));
+
+        // No clustering keys restricted, update whole partition
+        execute("UPDATE %s set s = 100 where a = 1 AND b = 1;");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1,1,1,100,1),
+                   row(1,1,2,100,2),
+                   row(1,1,3,100,3));
+
+        execute("UPDATE %s set s = 200 where a = 1 AND b IN ();");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1,1,1,100,1),
+                   row(1,1,2,100,2),
+                   row(1,1,3,100,3));
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, PRIMARY KEY ((a,b), c, d))");
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)",1,1,1,1,1);
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)",1,1,1,2,2);
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)",1,1,1,3,3);
+        execute("INSERT INTO %s (a,b,c,d,e) VALUES (?,?,?,?,?)",1,1,1,4,4);
+
+        execute("UPDATE %s SET e = 100 WHERE a = 1 AND b = 1 AND c = 1 AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a = 1 AND b = 1 AND c IN () AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a = 1 AND b IN () AND c IN () AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c IN () AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c IN () AND d = 1;");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c = 1 AND d = 1;");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c = 1 AND d IN ();");
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1,1,1,1,1),
+                   row(1,1,1,2,2),
+                   row(1,1,1,3,3),
+                   row(1,1,1,4,4));
+    }
+
+    /**
+     * Test for CASSANDRA-13152
+     */
+    @Test
+    public void testThatUpdatesWithEmptyInRestrictionDoNotCreateMutations() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a,b))");
+
+        execute("UPDATE %s SET c = 100 WHERE a IN () AND b = 1;");
+        execute("UPDATE %s SET c = 100 WHERE a = 1 AND b IN ();");
+
+        assertTrue("The memtable should be empty but is not", isMemtableEmpty());
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, s int static, PRIMARY KEY ((a,b), c))");
+
+        execute("UPDATE %s SET d = 100 WHERE a = 1 AND b = 1 AND c IN ();");
+        execute("UPDATE %s SET d = 100 WHERE a = 1 AND b IN () AND c IN ();");
+        execute("UPDATE %s SET d = 100 WHERE a IN () AND b IN () AND c IN ();");
+        execute("UPDATE %s SET d = 100 WHERE a IN () AND b IN () AND c = 1;");
+        execute("UPDATE %s SET d = 100 WHERE a IN () AND b = 1 AND c IN ();");
+
+        assertTrue("The memtable should be empty but is not", isMemtableEmpty());
+
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, e int, PRIMARY KEY ((a,b), c, d))");
+
+        execute("UPDATE %s SET e = 100 WHERE a = 1 AND b = 1 AND c = 1 AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a = 1 AND b = 1 AND c IN () AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a = 1 AND b IN () AND c IN () AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c IN () AND d IN ();");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c IN () AND d = 1;");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c = 1 AND d = 1;");
+        execute("UPDATE %s SET e = 100 WHERE a IN () AND b IN () AND c = 1 AND d IN ();");
+
+        assertTrue("The memtable should be empty but is not", isMemtableEmpty());
+    }
+
+    /**
+     * Checks if the memtable is empty or not
+     * @return {@code true} if the memtable is empty, {@code false} otherwise.
+     */
+    private boolean isMemtableEmpty()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(currentTable());
+        return cfs.metric.allMemtablesLiveDataSize.getValue() == 0;
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testUpdateWithCompactStaticFormat() throws Throwable
+    {
+        testWithCompactFormat("CREATE TABLE %s (a int PRIMARY KEY, b int, c int) WITH COMPACT STORAGE");
+
+        assertInvalidMessage("Undefined name column1 in where clause ('column1 = ?')",
+                             "UPDATE %s SET b = 1 WHERE column1 = ?",
+                             ByteBufferUtil.bytes('a'));
+        assertInvalidMessage("Undefined name value in where clause ('value = ?')",
+                             "UPDATE %s SET b = 1 WHERE value = ?",
+                             ByteBufferUtil.bytes('a'));
+
+        // if column1 is present, hidden column is called column2
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, column1 int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c, column1) VALUES (1, 1, 1, 1)");
+        execute("UPDATE %s SET column1 = 6 WHERE a = 1");
+        assertInvalidMessage("Unknown identifier column2", "UPDATE %s SET column2 = 6 WHERE a = 0");
+        assertInvalidMessage("Unknown identifier value", "UPDATE %s SET value = 6 WHERE a = 0");
+
+        // if value is present, hidden column is called value1
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b int, c int, value int) WITH COMPACT STORAGE");
+        execute("INSERT INTO %s (a, b, c, value) VALUES (1, 1, 1, 1)");
+        execute("UPDATE %s SET value = 6 WHERE a = 1");
+        assertInvalidMessage("Unknown identifier column1", "UPDATE %s SET column1 = 6 WHERE a = 1");
+        assertInvalidMessage("Unknown identifier value1", "UPDATE %s SET value1 = 6 WHERE a = 1");
+    }
+
+    /**
+     * Test for CASSANDRA-13917
+     */
+    @Test
+    public void testUpdateWithCompactNonStaticFormat() throws Throwable
+    {
+        testWithCompactFormat("CREATE TABLE %s (a int, b int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+        testWithCompactFormat("CREATE TABLE %s (a int, b int, v int, PRIMARY KEY (a, b)) WITH COMPACT STORAGE");
+    }
+
+    private void testWithCompactFormat(String tableQuery) throws Throwable
+    {
+        createTable(tableQuery);
+        // pass correct types to hidden columns
+        assertInvalidMessage("Unknown identifier column1",
+                             "UPDATE %s SET column1 = ? WHERE a = 0",
+                             ByteBufferUtil.bytes('a'));
+        assertInvalidMessage("Unknown identifier value",
+                             "UPDATE %s SET value = ? WHERE a = 0",
+                             ByteBufferUtil.bytes('a'));
+
+        // pass incorrect types to hidden columns
+        assertInvalidMessage("Unknown identifier column1", "UPDATE %s SET column1 = 6 WHERE a = 0");
+        assertInvalidMessage("Unknown identifier value", "UPDATE %s SET value = 6 WHERE a = 0");
+    }
 }

diff --git a/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java b/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
deleted file mode 100644
index 0fdabe9..0000000
--- a/test/unit/org/apache/cassandra/db/ArrayBackedSortedColumnsTest.java
+++ /dev/null

@@ -1,426 +0,0 @@
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-package org.apache.cassandra.db;
-
-import java.util.*;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.junit.Assert.*;
-
-import com.google.common.collect.Sets;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.SearchIterator;
-import org.apache.cassandra.utils.BatchRemoveIterator;
-
-public class ArrayBackedSortedColumnsTest
-{
-    private static final String KEYSPACE1 = "ArrayBackedSortedColumnsTest";
-    private static final String CF_STANDARD1 = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-    }
-
-    @Test
-    public void testAdd()
-    {
-        testAddInternal(false);
-        testAddInternal(true);
-    }
-
-    private CFMetaData metadata()
-    {
-        return Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD1);
-    }
-
-    private void testAddInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-        int[] values = new int[]{ 1, 2, 2, 3 };
-
-        for (int i = 0; i < values.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        Iterator<Cell> iter = map.iterator();
-        assertEquals("1st column", 1, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("2nd column", 2, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("3rd column", 3, iter.next().name().toByteBuffer().getInt(0));
-    }
-
-    @Test
-    public void testOutOfOrder()
-    {
-        testAddOutOfOrder(false);
-        testAddOutOfOrder(false);
-    }
-
-    private void testAddOutOfOrder(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-
-        int[] values = new int[]{ 1, 2, 1, 3, 4, 4, 5, 5, 1, 2, 6, 6, 6, 1, 2, 3 };
-        for (int i = 0; i < values.length; ++i)
-            cells.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        assertEquals(6, cells.getColumnCount());
-
-        Iterator<Cell> iter = cells.iterator();
-        assertEquals(1, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(2, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(3, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(4, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(5, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(6, iter.next().name().toByteBuffer().getInt(0));
-
-        // Add more values
-        values = new int[]{ 11, 15, 12, 12, 12, 16, 10, 8, 8, 7, 4, 4, 5 };
-        for (int i = 0; i < values.length; ++i)
-            cells.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        assertEquals(13, cells.getColumnCount());
-
-        iter = cells.reverseIterator();
-        assertEquals(16, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(15, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(12, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(11, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(10, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(8,  iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(7, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(6, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(5, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(4, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(3, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(2, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals(1, iter.next().name().toByteBuffer().getInt(0));
-    }
-
-    @Test
-    public void testGetColumn()
-    {
-        testGetColumnInternal(true);
-        testGetColumnInternal(false);
-    }
-
-    private void testGetColumnInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-
-        int[] values = new int[]{ -1, 20, 44, 55, 27, 27, 17, 1, 9, 89, 33, 44, 0, 9 };
-        for (int i = 0; i < values.length; ++i)
-            cells.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        for (int i : values)
-            assertEquals(i, cells.getColumn(type.makeCellName(i)).name().toByteBuffer().getInt(0));
-    }
-
-    @Test
-    public void testAddAll()
-    {
-        testAddAllInternal(false);
-        testAddAllInternal(true);
-    }
-
-    private void testAddAllInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-        ColumnFamily map2 = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-
-        int[] values1 = new int[]{ 1, 3, 5, 6 };
-        int[] values2 = new int[]{ 2, 4, 5, 6 };
-
-        for (int i = 0; i < values1.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values1[reversed ? values1.length - 1 - i : i])));
-
-        for (int i = 0; i < values2.length; ++i)
-            map2.addColumn(new BufferCell(type.makeCellName(values2[reversed ? values2.length - 1 - i : i])));
-
-        map2.addAll(map);
-
-        Iterator<Cell> iter = map2.iterator();
-        assertEquals("1st column", 1, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("2nd column", 2, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("3rd column", 3, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("4st column", 4, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("5st column", 5, iter.next().name().toByteBuffer().getInt(0));
-        assertEquals("6st column", 6, iter.next().name().toByteBuffer().getInt(0));
-    }
-
-    @Test
-    public void testGetCollection()
-    {
-        testGetCollectionInternal(false);
-        testGetCollectionInternal(true);
-    }
-
-    private void testGetCollectionInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-        int[] values = new int[]{ 1, 2, 3, 5, 9 };
-
-        List<Cell> sorted = new ArrayList<>();
-        for (int v : values)
-            sorted.add(new BufferCell(type.makeCellName(v)));
-        List<Cell> reverseSorted = new ArrayList<>(sorted);
-        Collections.reverse(reverseSorted);
-
-        for (int i = 0; i < values.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        assertSame(sorted, map.getSortedColumns());
-        assertSame(reverseSorted, map.getReverseSortedColumns());
-    }
-
-    @Test
-    public void testIterator()
-    {
-        testIteratorInternal(false);
-        //testIteratorInternal(true);
-    }
-
-    private void testIteratorInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-
-        int[] values = new int[]{ 1, 2, 3, 5, 9 };
-
-        for (int i = 0; i < values.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        assertSame(new int[]{ 3, 2, 1 }, map.reverseIterator(new ColumnSlice[]{ new ColumnSlice(type.make(3), Composites.EMPTY) }));
-        assertSame(new int[]{ 3, 2, 1 }, map.reverseIterator(new ColumnSlice[]{ new ColumnSlice(type.make(4), Composites.EMPTY) }));
-
-        assertSame(map.iterator(), map.iterator(ColumnSlice.ALL_COLUMNS_ARRAY));
-    }
-
-    @Test
-    public void testSearchIterator()
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), false);
-
-        int[] values = new int[]{ 1, 2, 3, 5, 9, 15, 21, 22 };
-
-        for (int i = 0; i < values.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values[i])));
-
-        SearchIterator<CellName, Cell> iter = map.searchIterator();
-        for (int i = 0 ; i < values.length ; i++)
-            assertSame(values[i], iter.next(type.makeCellName(values[i])));
-
-        iter = map.searchIterator();
-        for (int i = 0 ; i < values.length ; i+=2)
-            assertSame(values[i], iter.next(type.makeCellName(values[i])));
-
-        iter = map.searchIterator();
-        for (int i = 0 ; i < values.length ; i+=4)
-            assertSame(values[i], iter.next(type.makeCellName(values[i])));
-
-        iter = map.searchIterator();
-        for (int i = 0 ; i < values.length ; i+=1)
-        {
-            if (i % 2 == 0)
-            {
-                Cell cell = iter.next(type.makeCellName(values[i] - 1));
-                if (i > 0 && values[i - 1] == values[i] - 1)
-                    assertSame(values[i - 1], cell);
-                else
-                    assertNull(cell);
-            }
-        }
-    }
-
-    private <T> void assertSame(Iterable<T> c1, Iterable<T> c2)
-    {
-        assertSame(c1.iterator(), c2.iterator());
-    }
-
-    private <T> void assertSame(Iterator<T> iter1, Iterator<T> iter2)
-    {
-        while (iter1.hasNext() && iter2.hasNext())
-            assertEquals(iter1.next(), iter2.next());
-        if (iter1.hasNext() || iter2.hasNext())
-            fail("The collection don't have the same size");
-    }
-
-    private void assertSame(int name, Cell cell)
-    {
-        int value = ByteBufferUtil.toInt(cell.name().toByteBuffer());
-        assert name == value : "Expected " + name + " but got " + value;
-    }
-    private void assertSame(int[] names, Iterator<Cell> iter)
-    {
-        for (int name : names)
-        {
-            assert iter.hasNext() : "Expected " + name + " but no more result";
-            int value = ByteBufferUtil.toInt(iter.next().name().toByteBuffer());
-            assert name == value : "Expected " + name + " but got " + value;
-        }
-    }
-
-    @Test
-    public void testRemove()
-    {
-        testRemoveInternal(false);
-        testRemoveInternal(true);
-    }
-
-    private void testRemoveInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-
-        int[] values = new int[]{ 1, 2, 2, 3 };
-
-        for (int i = 0; i < values.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        Iterator<Cell> iter = map.getReverseSortedColumns().iterator();
-        assertTrue(iter.hasNext());
-        iter.next();
-        iter.remove();
-        assertTrue(iter.hasNext());
-        iter.next();
-        iter.remove();
-        assertTrue(iter.hasNext());
-        iter.next();
-        iter.remove();
-        assertTrue(!iter.hasNext());
-    }
-
-    @Test(expected = IllegalStateException.class)
-    public void testBatchRemoveTwice()
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), false);
-        map.addColumn(new BufferCell(type.makeCellName(1)));
-        map.addColumn(new BufferCell(type.makeCellName(2)));
-
-        BatchRemoveIterator<Cell> batchIter = map.batchRemoveIterator();
-        batchIter.next();
-        batchIter.remove();
-        batchIter.remove();
-    }
-
-    @Test(expected = IllegalStateException.class)
-    public void testBatchCommitTwice()
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), false);
-        map.addColumn(new BufferCell(type.makeCellName(1)));
-        map.addColumn(new BufferCell(type.makeCellName(2)));
-
-        BatchRemoveIterator<Cell> batchIter = map.batchRemoveIterator();
-        batchIter.next();
-        batchIter.remove();
-        batchIter.commit();
-        batchIter.commit();
-    }
-
-    @Test
-    public void testBatchRemove()
-    {
-        testBatchRemoveInternal(false);
-        testBatchRemoveInternal(true);
-    }
-
-    public void testBatchRemoveInternal(boolean reversed)
-    {
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), reversed);
-        int[] values = new int[]{ 1, 2, 3, 5 };
-
-        for (int i = 0; i < values.length; ++i)
-            map.addColumn(new BufferCell(type.makeCellName(values[reversed ? values.length - 1 - i : i])));
-
-        BatchRemoveIterator<Cell> batchIter = map.batchRemoveIterator();
-        batchIter.next();
-        batchIter.remove();
-        batchIter.next();
-        batchIter.remove();
-
-        assertEquals("1st column before commit", 1, map.iterator().next().name().toByteBuffer().getInt(0));
-
-        batchIter.commit();
-
-        assertEquals("1st column after commit", 3, map.iterator().next().name().toByteBuffer().getInt(0));
-    }
-
-    @Test
-    public void testBatchRemoveCopy()
-    {
-        // Test delete some random columns and check the result
-        CellNameType type = new SimpleDenseCellNameType(Int32Type.instance);
-        ColumnFamily map = ArrayBackedSortedColumns.factory.create(metadata(), false);
-        int n = 127;
-        int[] values = new int[n];
-        for (int i = 0; i < n; i++)
-            values[i] = i;
-        Set<Integer> toRemove = Sets.newHashSet(3, 12, 13, 15, 58, 103, 112);
-
-        for (int value : values)
-            map.addColumn(new BufferCell(type.makeCellName(value)));
-
-        BatchRemoveIterator<Cell> batchIter = map.batchRemoveIterator();
-        while (batchIter.hasNext())
-            if (toRemove.contains(batchIter.next().name().toByteBuffer().getInt(0)))
-                batchIter.remove();
-
-        batchIter.commit();
-
-        int expected = 0;
-        while (toRemove.contains(expected))
-            expected++;
-
-        for (Cell column : map)
-        {
-            assertEquals(expected, column.name().toByteBuffer().getInt(0));
-            expected++;
-            while (toRemove.contains(expected))
-                expected++;
-        }
-        assertEquals(expected, n);
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java b/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java
deleted file mode 100644
index 70d1d0c..0000000
--- a/test/unit/org/apache/cassandra/db/BatchlogManagerTest.java
+++ /dev/null

@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.net.InetAddress;
-import java.util.Collections;
-import java.util.List;
-import java.util.concurrent.ExecutionException;
-
-import com.google.common.collect.Lists;
-import org.junit.BeforeClass;
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.UUIDGen;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-
-public class BatchlogManagerTest
-{
-    private static final String KEYSPACE1 = "BatchlogManagerTest1";
-    private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_STANDARD2 = "Standard2";
-    private static final String CF_STANDARD3 = "Standard3";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                SimpleStrategy.class,
-                KSMetaData.optsWithRF(1),
-                SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
-                SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD3));
-    }
-
-    @Before
-    public void setUp() throws Exception
-    {
-        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
-        InetAddress localhost = InetAddress.getByName("127.0.0.1");
-        metadata.updateNormalToken(Util.token("A"), localhost);
-        metadata.updateHostId(UUIDGen.getTimeUUID(), localhost);
-    }
-
-    @Test
-    public void testReplay() throws Exception
-    {
-        long initialAllBatches = BatchlogManager.instance.countAllBatches();
-        long initialReplayedBatches = BatchlogManager.instance.getTotalBatchesReplayed();
-
-        // Generate 1000 mutations and put them all into the batchlog.
-        // Half (500) ready to be replayed, half not.
-        CellNameType comparator = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1").metadata.comparator;
-        for (int i = 0; i < 1000; i++)
-        {
-            Mutation mutation = new Mutation(KEYSPACE1, bytes(i));
-            mutation.add("Standard1", comparator.makeCellName(bytes(i)), bytes(i), System.currentTimeMillis());
-
-            long timestamp = i < 500
-                           ? (System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2) * 1000
-                           : Long.MAX_VALUE;
-
-            BatchlogManager.getBatchlogMutationFor(Collections.singleton(mutation),
-                                                   UUIDGen.getTimeUUID(),
-                                                   MessagingService.current_version,
-                                                   timestamp)
-                           .applyUnsafe();
-        }
-
-        // Flush the batchlog to disk (see CASSANDRA-6822).
-        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHLOG).forceBlockingFlush();
-
-        assertEquals(1000, BatchlogManager.instance.countAllBatches() - initialAllBatches);
-        assertEquals(0, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
-
-        // Force batchlog replay and wait for it to complete.
-        BatchlogManager.instance.startBatchlogReplay().get();
-
-        // Ensure that the first half, and only the first half, got replayed.
-        assertEquals(500, BatchlogManager.instance.countAllBatches() - initialAllBatches);
-        assertEquals(500, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
-
-        for (int i = 0; i < 1000; i++)
-        {
-            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD1, i));
-            if (i < 500)
-            {
-                assertEquals(bytes(i), result.one().getBytes("key"));
-                assertEquals(bytes(i), result.one().getBytes("column1"));
-                assertEquals(bytes(i), result.one().getBytes("value"));
-            }
-            else
-            {
-                assertTrue(result.isEmpty());
-            }
-        }
-
-        // Ensure that no stray mutations got somehow applied.
-        UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT count(*) FROM \"%s\".\"%s\"", KEYSPACE1, CF_STANDARD1));
-        assertEquals(500, result.one().getLong("count"));
-    }
-
-    @Test
-    public void testTruncatedReplay() throws InterruptedException, ExecutionException
-    {
-        CellNameType comparator2 = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard2").metadata.comparator;
-        CellNameType comparator3 = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard3").metadata.comparator;
-        // Generate 2000 mutations (1000 batchlog entries) and put them all into the batchlog.
-        // Each batchlog entry with a mutation for Standard2 and Standard3.
-        // In the middle of the process, 'truncate' Standard2.
-        for (int i = 0; i < 1000; i++)
-        {
-            Mutation mutation1 = new Mutation(KEYSPACE1, bytes(i));
-            mutation1.add("Standard2", comparator2.makeCellName(bytes(i)), bytes(i), 0);
-            Mutation mutation2 = new Mutation(KEYSPACE1, bytes(i));
-            mutation2.add("Standard3", comparator3.makeCellName(bytes(i)), bytes(i), 0);
-            List<Mutation> mutations = Lists.newArrayList(mutation1, mutation2);
-
-            // Make sure it's ready to be replayed, so adjust the timestamp.
-            long timestamp = System.currentTimeMillis() - DatabaseDescriptor.getWriteRpcTimeout() * 2;
-
-            if (i == 500)
-                SystemKeyspace.saveTruncationRecord(Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard2"),
-                                                    timestamp,
-                                                    ReplayPosition.NONE);
-
-            // Adjust the timestamp (slightly) to make the test deterministic.
-            if (i >= 500)
-                timestamp++;
-            else
-                timestamp--;
-
-            BatchlogManager.getBatchlogMutationFor(mutations,
-                                                   UUIDGen.getTimeUUID(),
-                                                   MessagingService.current_version,
-                                                   timestamp * 1000)
-                           .applyUnsafe();
-        }
-
-        // Flush the batchlog to disk (see CASSANDRA-6822).
-        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.BATCHLOG).forceFlush();
-
-        // Force batchlog replay and wait for it to complete.
-        BatchlogManager.instance.startBatchlogReplay().get();
-
-        // We should see half of Standard2-targeted mutations written after the replay and all of Standard3 mutations applied.
-        for (int i = 0; i < 1000; i++)
-        {
-            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD2,i));
-            if (i >= 500)
-            {
-                assertEquals(bytes(i), result.one().getBytes("key"));
-                assertEquals(bytes(i), result.one().getBytes("column1"));
-                assertEquals(bytes(i), result.one().getBytes("value"));
-            }
-            else
-            {
-                assertTrue(result.isEmpty());
-            }
-        }
-
-        for (int i = 0; i < 1000; i++)
-        {
-            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE key = intAsBlob(%d)", KEYSPACE1, CF_STANDARD3, i));
-            assertEquals(bytes(i), result.one().getBytes("key"));
-            assertEquals(bytes(i), result.one().getBytes("column1"));
-            assertEquals(bytes(i), result.one().getBytes("value"));
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/CellTest.java b/test/unit/org/apache/cassandra/db/CellTest.java
index 493dbbf..22f1b78 100644
--- a/test/unit/org/apache/cassandra/db/CellTest.java
+++ b/test/unit/org/apache/cassandra/db/CellTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.db;
 /*
- * 
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,26 +15,62 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
  */
-
+package org.apache.cassandra.db;
 
 import java.nio.ByteBuffer;
+import java.util.List;
 
-import org.junit.Test;
+import com.google.common.collect.Lists;
 
 import junit.framework.Assert;
-import org.apache.cassandra.Util;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.NativeAllocator;
-import org.apache.cassandra.utils.memory.NativePool;
+import org.apache.cassandra.utils.FBUtilities;
 
 public class CellTest
 {
+    private static final String KEYSPACE1 = "CellTest";
+    private static final String CF_STANDARD1 = "Standard1";
+    private static final String CF_COLLECTION = "Collection1";
 
-    private static final OpOrder order = new OpOrder();
-    private static NativeAllocator allocator = new NativePool(Integer.MAX_VALUE, Integer.MAX_VALUE, 1f, null).newAllocator();
+    private static final CFMetaData cfm = SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1);
+    private static final CFMetaData cfm2 = CFMetaData.Builder.create(KEYSPACE1, CF_COLLECTION)
+                                                             .addPartitionKey("k", IntegerType.instance)
+                                                             .addClusteringColumn("c", IntegerType.instance)
+                                                             .addRegularColumn("v", IntegerType.instance)
+                                                             .addRegularColumn("m", MapType.getInstance(IntegerType.instance, IntegerType.instance, true))
+                                                             .build();
+
+    private static final CFMetaData fakeMetadata = CFMetaData.createFake("fakeKS", "fakeTable");
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1), cfm, cfm2);
+    }
+
+    private static ColumnDefinition fakeColumn(String name, AbstractType<?> type)
+    {
+        return new ColumnDefinition(fakeMetadata.ksName,
+                                    fakeMetadata.cfName,
+                                    ColumnIdentifier.getInterned(name, false),
+                                    type,
+                                    ColumnDefinition.NO_POSITION,
+                                    ColumnDefinition.Kind.REGULAR);
+    }
 
     @Test
     public void testConflictingTypeEquality()
@@ -49,35 +83,195 @@
                 // don't test equality for both sides native, as this is based on CellName resolution
                 if (lhs && rhs)
                     continue;
-                Cell a = expiring("a", "a", 1, 1, lhs);
-                Cell b = regular("a", "a", 1, rhs);
+                Cell a = expiring(cfm, "val", "a", 1, 1);
+                Cell b = regular(cfm, "val", "a", 1);
                 Assert.assertNotSame(a, b);
                 Assert.assertNotSame(b, a);
-                a = deleted("a", 1, 1, lhs);
-                b = regular("a", ByteBufferUtil.bytes(1), 1, rhs);
+
+                a = deleted(cfm, "val", 1, 1);
                 Assert.assertNotSame(a, b);
                 Assert.assertNotSame(b, a);
             }
         }
     }
 
+    private void assertValid(Cell cell)
+    {
+        try
+        {
+            cell.validate();
+        }
+        catch (Exception e)
+        {
+            Assert.fail("Cell should be valid but got error: " + e);
+        }
+    }
+
+    private void assertInvalid(Cell cell)
+    {
+        try
+        {
+            cell.validate();
+            Assert.fail("Cell " + cell + " should be invalid");
+        }
+        catch (MarshalException e)
+        {
+            // Note that we shouldn't get anything else than a MarshalException so let other escape and fail the test
+        }
+    }
+
+    @Test
+    public void testValidate()
+    {
+        ColumnDefinition c;
+
+        // Valid cells
+        c = fakeColumn("c", Int32Type.instance);
+        assertValid(BufferCell.live(fakeMetadata, c, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertValid(BufferCell.live(fakeMetadata, c, 0, ByteBufferUtil.bytes(4)));
+
+        assertValid(BufferCell.expiring(c, 0, 4, 4, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        assertValid(BufferCell.expiring(c, 0, 4, 4, ByteBufferUtil.bytes(4)));
+
+        assertValid(BufferCell.tombstone(c, 0, 4));
+
+        // Invalid value (we don't all empty values for smallint)
+        c = fakeColumn("c", ShortType.instance);
+        assertInvalid(BufferCell.live(fakeMetadata, c, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        // But this should be valid even though the underlying value is an empty BB (catches bug #11618)
+        assertValid(BufferCell.tombstone(c, 0, 4));
+        // And of course, this should be valid with a proper value
+        assertValid(BufferCell.live(fakeMetadata, c, 0, ByteBufferUtil.bytes((short)4)));
+
+        // Invalid ttl
+        assertInvalid(BufferCell.expiring(c, 0, -4, 4, ByteBufferUtil.bytes(4)));
+        // Invalid local deletion times
+        assertInvalid(BufferCell.expiring(c, 0, 4, -4, ByteBufferUtil.bytes(4)));
+        assertInvalid(BufferCell.expiring(c, 0, 4, Cell.NO_DELETION_TIME, ByteBufferUtil.bytes(4)));
+
+        c = fakeColumn("c", MapType.getInstance(Int32Type.instance, Int32Type.instance, true));
+        // Valid cell path
+        assertValid(BufferCell.live(fakeMetadata, c, 0, ByteBufferUtil.bytes(4), CellPath.create(ByteBufferUtil.bytes(4))));
+        // Invalid cell path (int values should be 0 or 4 bytes)
+        assertInvalid(BufferCell.live(fakeMetadata, c, 0, ByteBufferUtil.bytes(4), CellPath.create(ByteBufferUtil.bytes((long)4))));
+    }
+
     @Test
     public void testExpiringCellReconile()
     {
         // equal
-        Assert.assertEquals(0, testExpiring("a", "a", 1, 1, null, null, null, null));
+        Assert.assertEquals(0, testExpiring("val", "a", 1, 1, null, null, null, null));
 
         // newer timestamp
-        Assert.assertEquals(-1, testExpiring("a", "a", 2, 1, null, null, 1L, null));
-        Assert.assertEquals(-1, testExpiring("a", "a", 2, 1, null, "b", 1L, 2));
+        Assert.assertEquals(-1, testExpiring("val", "a", 2, 1, null, null, 1L, null));
+        Assert.assertEquals(-1, testExpiring("val", "a", 2, 1, null, "val", 1L, 2));
 
-        // newer TTL
-        Assert.assertEquals(-1, testExpiring("a", "a", 1, 2, null, null, null, 1));
-        Assert.assertEquals(1, testExpiring("a", "a", 1, 2, null, "b", null, 1));
+        Assert.assertEquals(-1, testExpiring("val", "a", 1, 2, null, null, null, 1));
+        Assert.assertEquals(1, testExpiring("val", "a", 1, 2, null, "val", null, 1));
 
         // newer value
-        Assert.assertEquals(-1, testExpiring("a", "b", 2, 1, null, "a", null, null));
-        Assert.assertEquals(-1, testExpiring("a", "b", 2, 1, null, "a", null, 2));
+        Assert.assertEquals(-1, testExpiring("val", "b", 2, 1, null, "a", null, null));
+        Assert.assertEquals(-1, testExpiring("val", "b", 2, 1, null, "a", null, 2));
+    }
+
+    class SimplePurger implements DeletionPurger
+    {
+        private final int gcBefore;
+
+        public SimplePurger(int gcBefore)
+        {
+            this.gcBefore = gcBefore;
+        }
+
+        public boolean shouldPurge(long timestamp, int localDeletionTime)
+        {
+            return localDeletionTime < gcBefore;
+        }
+    }
+
+    /**
+     * tombstones shouldn't be purged if localDeletionTime is greater than gcBefore
+     */
+    @Test
+    public void testNonPurgableTombstone()
+    {
+        int now = 100;
+        Cell cell = deleted(cfm, "val", now, now);
+        Cell purged = cell.purge(new SimplePurger(now - 1), now + 1);
+        Assert.assertEquals(cell, purged);
+    }
+
+    @Test
+    public void testPurgeableTombstone()
+    {
+        int now = 100;
+        Cell cell = deleted(cfm, "val", now, now);
+        Cell purged = cell.purge(new SimplePurger(now + 1), now + 1);
+        Assert.assertNull(purged);
+    }
+
+    @Test
+    public void testLiveExpiringCell()
+    {
+        int now = 100;
+        Cell cell = expiring(cfm, "val", "a", now, now + 10);
+        Cell purged = cell.purge(new SimplePurger(now), now + 1);
+        Assert.assertEquals(cell, purged);
+    }
+
+    /**
+     * cells that have expired should be converted to tombstones with an local deletion time
+     * of the cell's local expiration time, minus it's ttl
+     */
+    @Test
+    public void testExpiredTombstoneConversion()
+    {
+        int now = 100;
+        Cell cell = expiring(cfm, "val", "a", now, 10, now + 10);
+        Cell purged = cell.purge(new SimplePurger(now), now + 11);
+        Assert.assertEquals(deleted(cfm, "val", now, now), purged);
+    }
+
+    /**
+     * if the tombstone created by an expiring cell has a local deletion time less than gcBefore,
+     * it should be purged
+     */
+    @Test
+    public void testPurgeableExpiringCell()
+    {
+        int now = 100;
+        Cell cell = expiring(cfm, "val", "a", now, 10, now + 10);
+        Cell purged = cell.purge(new SimplePurger(now + 1), now + 11);
+        Assert.assertNull(purged);
+    }
+
+    private static ByteBuffer bb(int i)
+    {
+        return ByteBufferUtil.bytes(i);
+    }
+
+    @Test
+    public void testComplexCellReconcile()
+    {
+        ColumnDefinition m = cfm2.getColumnDefinition(new ColumnIdentifier("m", false));
+        int now1 = FBUtilities.nowInSeconds();
+        long ts1 = now1*1000000L;
+
+
+        Cell r1m1 = BufferCell.live(cfm2, m, ts1, bb(1), CellPath.create(bb(1)));
+        Cell r1m2 = BufferCell.live(cfm2, m, ts1, bb(2), CellPath.create(bb(2)));
+        List<Cell> cells1 = Lists.newArrayList(r1m1, r1m2);
+
+        int now2 = now1 + 1;
+        long ts2 = now2*1000000L;
+        Cell r2m2 = BufferCell.live(cfm2, m, ts2, bb(1), CellPath.create(bb(2)));
+        Cell r2m3 = BufferCell.live(cfm2, m, ts2, bb(2), CellPath.create(bb(3)));
+        Cell r2m4 = BufferCell.live(cfm2, m, ts2, bb(3), CellPath.create(bb(4)));
+        List<Cell> cells2 = Lists.newArrayList(r2m2, r2m3, r2m4);
+
+        RowBuilder builder = new RowBuilder();
+        Cells.reconcileComplex(m, cells1.iterator(), cells2.iterator(), DeletionTime.LIVE, builder, now2 + 1);
+        Assert.assertEquals(Lists.newArrayList(r1m1, r2m2, r2m3, r2m4), builder.cells);
     }
 
     private int testExpiring(String n1, String v1, long t1, int et1, String n2, String v2, Long t2, Integer et2)
@@ -90,52 +284,35 @@
             t2 = t1;
         if (et2 == null)
             et2 = et1;
-        int result = testExpiring(n1, v1, t1, et1, false, n2, v2, t2, et2, false);
-        Assert.assertEquals(result, testExpiring(n1, v1, t1, et1, false, n2, v2, t2, et2, true));
-        Assert.assertEquals(result, testExpiring(n1, v1, t1, et1, true, n2, v2, t2, et2, false));
-        Assert.assertEquals(result, testExpiring(n1, v1, t1, et1, true, n2, v2, t2, et2, true));
-        return result;
+        Cell c1 = expiring(cfm, n1, v1, t1, et1);
+        Cell c2 = expiring(cfm, n2, v2, t2, et2);
+
+        int now = FBUtilities.nowInSeconds();
+        if (Cells.reconcile(c1, c2, now) == c1)
+            return Cells.reconcile(c2, c1, now) == c1 ? -1 : 0;
+        return Cells.reconcile(c2, c1, now) == c2 ? 1 : 0;
     }
 
-    private int testExpiring(String n1, String v1, long t1, int et1, boolean native1, String n2, String v2, long t2, int et2, boolean native2)
+    private Cell regular(CFMetaData cfm, String columnName, String value, long timestamp)
     {
-        Cell c1 = expiring(n1, v1, t1, et1, native1);
-        Cell c2 = expiring(n2, v2, t2, et2, native2);
-        return reconcile(c1, c2);
+        ColumnDefinition cdef = cfm.getColumnDefinition(ByteBufferUtil.bytes(columnName));
+        return BufferCell.live(cfm, cdef, timestamp, ByteBufferUtil.bytes(value));
     }
 
-    int reconcile(Cell c1, Cell c2)
+    private Cell expiring(CFMetaData cfm, String columnName, String value, long timestamp, int localExpirationTime)
     {
-        if (c1.reconcile(c2) == c1)
-            return c2.reconcile(c1) == c1 ? -1 : 0;
-        return c2.reconcile(c1) == c2 ? 1 : 0;
+        return expiring(cfm, columnName, value, timestamp, 1, localExpirationTime);
     }
 
-    private Cell expiring(String name, String value, long timestamp, int expirationTime, boolean nativeCell)
+    private Cell expiring(CFMetaData cfm, String columnName, String value, long timestamp, int ttl, int localExpirationTime)
     {
-        ExpiringCell cell = new BufferExpiringCell(Util.cellname(name), ByteBufferUtil.bytes(value), timestamp, 1, expirationTime);
-        if (nativeCell)
-            cell = new NativeExpiringCell(allocator, order.getCurrent(), cell);
-        return cell;
+        ColumnDefinition cdef = cfm.getColumnDefinition(ByteBufferUtil.bytes(columnName));
+        return new BufferCell(cdef, timestamp, ttl, localExpirationTime, ByteBufferUtil.bytes(value), null);
     }
 
-    private Cell regular(String name, ByteBuffer value, long timestamp, boolean nativeCell)
+    private Cell deleted(CFMetaData cfm, String columnName, int localDeletionTime, long timestamp)
     {
-        Cell cell = new BufferCell(Util.cellname(name), value, timestamp);
-        if (nativeCell)
-            cell = new NativeCell(allocator, order.getCurrent(), cell);
-        return cell;
-    }
-    private Cell regular(String name, String value, long timestamp, boolean nativeCell)
-    {
-        return regular(name, ByteBufferUtil.bytes(value), timestamp, nativeCell);
-    }
-
-    private Cell deleted(String name, int localDeletionTime, long timestamp, boolean nativeCell)
-    {
-        DeletedCell cell = new BufferDeletedCell(Util.cellname(name), localDeletionTime, timestamp);
-        if (nativeCell)
-            cell = new NativeDeletedCell(allocator, order.getCurrent(), cell);
-        return cell;
+        ColumnDefinition cdef = cfm.getColumnDefinition(ByteBufferUtil.bytes(columnName));
+        return BufferCell.tombstone(cdef, timestamp, localDeletionTime);
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/CleanupTest.java b/test/unit/org/apache/cassandra/db/CleanupTest.java
index 5777af4..d4c613d 100644
--- a/test/unit/org/apache/cassandra/db/CleanupTest.java
+++ b/test/unit/org/apache/cassandra/db/CleanupTest.java

@@ -24,41 +24,55 @@
 import java.nio.ByteBuffer;
 import java.util.AbstractMap;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 
+import com.google.common.collect.Sets;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.index.SecondaryIndex;
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.locator.AbstractNetworkTopologySnitch;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class CleanupTest
 {
     public static final int LOOPS = 200;
     public static final String KEYSPACE1 = "CleanupTest1";
-    public static final String CF1 = "Indexed1";
-    public static final String CF2 = "Standard1";
+    public static final String CF_INDEXED1 = "Indexed1";
+    public static final String CF_STANDARD1 = "Standard1";
+
+    public static final String KEYSPACE2 = "CleanupTestMultiDc";
+    public static final String CF_INDEXED2 = "Indexed2";
+    public static final String CF_STANDARD2 = "Standard2";
+
+    public static final String KEYSPACE3 = "CleanupSkipSSTables";
+    public static final String CF_STANDARD3 = "Standard3";
+
     public static final ByteBuffer COLUMN = ByteBufferUtil.bytes("birthdate");
     public static final ByteBuffer VALUE = ByteBuffer.allocate(8);
     static
@@ -72,10 +86,33 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF2),
-                                    SchemaLoader.indexCFMD(KEYSPACE1, CF1, true));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEXED1, true));
+
+
+        DatabaseDescriptor.setEndpointSnitch(new AbstractNetworkTopologySnitch()
+        {
+            @Override
+            public String getRack(InetAddress endpoint)
+            {
+                return "RC1";
+            }
+
+            @Override
+            public String getDatacenter(InetAddress endpoint)
+            {
+                return "DC1";
+            }
+        });
+
+        SchemaLoader.createKeyspace(KEYSPACE2,
+                                    KeyspaceParams.nts("DC1", 1),
+                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD2),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE2, CF_INDEXED2, true));
+        SchemaLoader.createKeyspace(KEYSPACE3,
+                                    KeyspaceParams.nts("DC1", 1),
+                                    SchemaLoader.standardCFMD(KEYSPACE3, CF_STANDARD3));
     }
 
     @Test
@@ -84,19 +121,15 @@
         StorageService.instance.getTokenMetadata().clearUnsafe();
 
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2);
-
-        List<Row> rows;
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
 
         // insert data and verify we get it back w/ range query
-        fillCF(cfs, LOOPS);
+        fillCF(cfs, "val", LOOPS);
 
         // record max timestamps of the sstables pre-cleanup
         List<Long> expectedMaxTimestamps = getMaxTimestampList(cfs);
 
-        rows = Util.getRangeSlice(cfs);
-        assertEquals(LOOPS, rows.size());
-
+        assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).build()).size());
         // with one token in the ring, owned by the local node, cleanup should be a no-op
         CompactionManager.instance.performCleanup(cfs, 2);
 
@@ -104,35 +137,29 @@
         assert expectedMaxTimestamps.equals(getMaxTimestampList(cfs));
 
         // check data is still there
-        rows = Util.getRangeSlice(cfs);
-        assertEquals(LOOPS, rows.size());
+        assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).build()).size());
     }
 
     @Test
     public void testCleanupWithIndexes() throws IOException, ExecutionException, InterruptedException
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_INDEXED1);
 
-        List<Row> rows;
 
         // insert data and verify we get it back w/ range query
-        fillCF(cfs, LOOPS);
-        rows = Util.getRangeSlice(cfs);
-        assertEquals(LOOPS, rows.size());
+        fillCF(cfs, "birthdate", LOOPS);
+        assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).build()).size());
 
-        SecondaryIndex index = cfs.indexManager.getIndexForColumn(COLUMN);
+        ColumnDefinition cdef = cfs.metadata.getColumnDefinition(COLUMN);
+        String indexName = "birthdate_key_index";
         long start = System.nanoTime();
-        while (!index.isIndexBuilt(COLUMN) && System.nanoTime() - start < TimeUnit.SECONDS.toNanos(10))
+        while (!cfs.getBuiltIndexes().contains(indexName) && System.nanoTime() - start < TimeUnit.SECONDS.toNanos(10))
             Thread.sleep(10);
 
-        // verify we get it back w/ index query too
-        IndexExpression expr = new IndexExpression(COLUMN, Operator.EQ, VALUE);
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        rows = keyspace.getColumnFamilyStore(CF1).search(range, clause, filter, Integer.MAX_VALUE);
-        assertEquals(LOOPS, rows.size());
+        RowFilter cf = RowFilter.create();
+        cf.add(cdef, Operator.EQ, VALUE);
+        assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).filterOn("birthdate", Operator.EQ, VALUE).build()).size());
 
         // we don't allow cleanup when the local host has no range to avoid wipping up all data when a node has not join the ring.
         // So to make sure cleanup erase everything here, we give the localhost the tiniest possible range.
@@ -146,15 +173,13 @@
         CompactionManager.instance.performCleanup(cfs, 2);
 
         // row data should be gone
-        rows = Util.getRangeSlice(cfs);
-        assertEquals(0, rows.size());
+        assertEquals(0, Util.getAll(Util.cmd(cfs).build()).size());
 
         // not only should it be gone but there should be no data on disk, not even tombstones
-        assert cfs.getSSTables().isEmpty();
+        assert cfs.getLiveSSTables().isEmpty();
 
         // 2ary indexes should result in no results, too (although tombstones won't be gone until compacted)
-        rows = cfs.search(range, clause, filter, Integer.MAX_VALUE);
-        assertEquals(0, rows.size());
+        assertEquals(0, Util.getAll(Util.cmd(cfs).filterOn("birthdate", Operator.EQ, VALUE).build()).size());
     }
 
     @Test
@@ -163,16 +188,12 @@
         StorageService.instance.getTokenMetadata().clearUnsafe();
 
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2);
-
-        List<Row> rows;
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
 
         // insert data and verify we get it back w/ range query
-        fillCF(cfs, LOOPS);
+        fillCF(cfs, "val", LOOPS);
 
-        rows = Util.getRangeSlice(cfs);
-
-        assertEquals(LOOPS, rows.size());
+        assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).build()).size());
         TokenMetadata tmd = StorageService.instance.getTokenMetadata();
 
         byte[] tk1 = new byte[1], tk2 = new byte[1];
@@ -182,21 +203,84 @@
         tmd.updateNormalToken(new BytesToken(tk2), InetAddress.getByName("127.0.0.2"));
         CompactionManager.instance.performCleanup(cfs, 2);
 
-        rows = Util.getRangeSlice(cfs);
-        assertEquals(0, rows.size());
+        assertEquals(0, Util.getAll(Util.cmd(cfs).build()).size());
     }
 
     @Test
+    public void testCleanupWithNoTokenRange() throws Exception
+    {
+
+        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
+        tmd.clearUnsafe();
+        tmd.updateHostId(UUID.randomUUID(), InetAddress.getByName("127.0.0.1"));
+        byte[] tk1 = {2};
+        tmd.updateNormalToken(new BytesToken(tk1), InetAddress.getByName("127.0.0.1"));
+
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE2);
+        keyspace.setMetadata(KeyspaceMetadata.create(KEYSPACE2, KeyspaceParams.nts("DC1", 1)));
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD2);
+
+        // insert data and verify we get it back w/ range query
+        fillCF(cfs, "val", LOOPS);
+        assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).build()).size());
+
+        // remove replication on DC1
+        keyspace.setMetadata(KeyspaceMetadata.create(KEYSPACE2, KeyspaceParams.nts("DC1", 0)));
+
+        // clear token range for localhost on DC1
+
+        CompactionManager.instance.performCleanup(cfs, 2);
+        assertEquals(0, Util.getAll(Util.cmd(cfs).build()).size());
+        assertTrue(cfs.getLiveSSTables().isEmpty());
+    }
+
+    @Test
+    public void testCleanupSkippingSSTables() throws UnknownHostException, ExecutionException, InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE3);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD3);
+        cfs.disableAutoCompaction();
+        for (byte i = 0; i < 100; i++)
+        {
+            new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), ByteBuffer.wrap(new byte[] {i}))
+                .clustering(COLUMN)
+                .add("val", VALUE)
+                .build()
+                .applyUnsafe();
+            cfs.forceBlockingFlush();
+        }
+        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
+        tmd.clearUnsafe();
+        tmd.updateHostId(UUID.randomUUID(), InetAddress.getByName("127.0.0.1"));
+        tmd.updateNormalToken(token(new byte[] {50}), InetAddress.getByName("127.0.0.1"));
+        Set<SSTableReader> beforeFirstCleanup = Sets.newHashSet(cfs.getLiveSSTables());
+        // single token - 127.0.0.1 owns everything, cleanup should be noop
+        cfs.forceCleanup(2);
+        assertEquals(beforeFirstCleanup, cfs.getLiveSSTables());
+        tmd.updateNormalToken(token(new byte[] {120}), InetAddress.getByName("127.0.0.2"));
+        cfs.forceCleanup(2);
+        for (SSTableReader sstable : cfs.getLiveSSTables())
+        {
+            assertEquals(sstable.first, sstable.last); // single-token sstables
+            assertTrue(sstable.first.getToken().compareTo(token(new byte[]{50})) <= 0);
+            // with single-token sstables they should all either be skipped or dropped:
+            assertTrue(beforeFirstCleanup.contains(sstable));
+        }
+    }
+
+
+    @Test
     public void testNeedsCleanup() throws Exception
     {
         // setup
         StorageService.instance.getTokenMetadata().clearUnsafe();
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF1);
-        fillCF(cfs, LOOPS);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        fillCF(cfs, "val", LOOPS);
 
         // prepare SSTable and some useful tokens
-        SSTableReader ssTable = cfs.getSSTables().iterator().next();
+        SSTableReader ssTable = cfs.getLiveSSTables().iterator().next();
         final Token ssTableMin = ssTable.first.getToken();
         final Token ssTableMax = ssTable.last.getToken();
 
@@ -236,6 +320,7 @@
                 add(entry(true, Arrays.asList(range(ssTableMin, ssTableMax)))); // first token of SSTable is not owned
                 add(entry(false, Arrays.asList(range(before4, max)))); // first token of SSTable is not owned
                 add(entry(false, Arrays.asList(range(min, before1), range(before2, before3), range(before4, max)))); // SSTable owned by the last range
+                add(entry(true, Collections.EMPTY_LIST)); // empty token range means discard entire sstable
             }
         };
 
@@ -258,7 +343,7 @@
         return new Range<>(from, to);
     }
 
-    protected void fillCF(ColumnFamilyStore cfs, int rowsPerSSTable)
+    protected void fillCF(ColumnFamilyStore cfs, String colName, int rowsPerSSTable)
     {
         CompactionManager.instance.disableAutoCompaction();
 
@@ -266,10 +351,11 @@
         {
             String key = String.valueOf(i);
             // create a row and update the birthdate value, test that the index query fetches the new version
-            Mutation rm;
-            rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes(key));
-            rm.add(cfs.name, Util.cellname(COLUMN), VALUE, System.currentTimeMillis());
-            rm.applyUnsafe();
+            new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), ByteBufferUtil.bytes(key))
+                    .clustering(COLUMN)
+                    .add(colName, VALUE)
+                    .build()
+                    .applyUnsafe();
         }
 
         cfs.forceBlockingFlush();
@@ -278,7 +364,7 @@
     protected List<Long> getMaxTimestampList(ColumnFamilyStore cfs)
     {
         List<Long> list = new LinkedList<Long>();
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             list.add(sstable.getMaxTimestamp());
         return list;
     }

diff --git a/test/unit/org/apache/cassandra/db/CollationControllerCQLTest.java b/test/unit/org/apache/cassandra/db/CollationControllerCQLTest.java
deleted file mode 100644
index 376678a..0000000
--- a/test/unit/org/apache/cassandra/db/CollationControllerCQLTest.java
+++ /dev/null

@@ -1,46 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.db;
-
-import org.junit.Test;
-
-import org.apache.cassandra.cql3.CQLTester;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import static org.junit.Assert.assertTrue;
-
-public class CollationControllerCQLTest extends CQLTester
-{
-    @Test
-    public void partitionLevelDeletionTest() throws Throwable
-    {
-        createTable("CREATE TABLE %s (bucket_id TEXT,name TEXT,data TEXT,PRIMARY KEY (bucket_id, name))");
-        execute("insert into %s (bucket_id, name, data) values ('8772618c9009cf8f5a5e0c18', 'test', 'hello')");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
-        execute("insert into %s (bucket_id, name, data) values ('8772618c9009cf8f5a5e0c19', 'test2', 'hello');");
-        execute("delete from %s where bucket_id = '8772618c9009cf8f5a5e0c18'");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
-        UntypedResultSet res = execute("select * from %s where bucket_id = '8772618c9009cf8f5a5e0c18' and name = 'test'");
-        assertTrue(res.isEmpty());
-    }
-
-    private ColumnFamilyStore getCurrentColumnFamilyStore()
-    {
-        return Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/CollationControllerTest.java b/test/unit/org/apache/cassandra/db/CollationControllerTest.java
deleted file mode 100644
index c227816..0000000
--- a/test/unit/org/apache/cassandra/db/CollationControllerTest.java
+++ /dev/null

@@ -1,138 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
-import static org.junit.Assert.assertEquals;
-
-public class CollationControllerTest
-{
-    private static final String KEYSPACE1 = "CollationControllerTest";
-    private static final String CF = "Standard1";
-    private static final String CFGCGRACE = "StandardGCGS0";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CFGCGRACE).gcGraceSeconds(0));
-    }
-
-    @Test
-    public void getTopLevelColumnsSkipsSSTablesModifiedBeforeRowDelete() 
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-        
-        // add data
-        rm = new Mutation(keyspace.getName(), dk.getKey());
-        rm.add(cfs.name, Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-        
-        // remove
-        rm = new Mutation(keyspace.getName(), dk.getKey());
-        rm.delete(cfs.name, 10);
-        rm.applyUnsafe();
-        
-        // add another mutation because sstable maxtimestamp isn't set
-        // correctly during flush if the most recent mutation is a row delete
-        rm = new Mutation(keyspace.getName(), Util.dk("key2").getKey());
-        rm.add(cfs.name, Util.cellname("Column1"), ByteBufferUtil.bytes("zxcv"), 20);
-        rm.applyUnsafe();
-        
-        cfs.forceBlockingFlush();
-
-        // add yet one more mutation
-        rm = new Mutation(keyspace.getName(), dk.getKey());
-        rm.add(cfs.name, Util.cellname("Column1"), ByteBufferUtil.bytes("foobar"), 30);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        // A NamesQueryFilter goes down one code path (through collectTimeOrderedData())
-        // It should only iterate the last flushed sstable, since it probably contains the most recent value for Column1
-        QueryFilter filter = Util.namesQueryFilter(cfs, dk, "Column1");
-        CollationController controller = new CollationController(cfs, filter, Integer.MIN_VALUE);
-        controller.getTopLevelColumns(true);
-        assertEquals(1, controller.getSstablesIterated());
-
-        // SliceQueryFilter goes down another path (through collectAllData())
-        // We will read "only" the last sstable in that case, but because the 2nd sstable has a tombstone that is more
-        // recent than the maxTimestamp of the very first sstable we flushed, we should only read the 2 first sstables.
-        filter = QueryFilter.getIdentityFilter(dk, cfs.name, System.currentTimeMillis());
-        controller = new CollationController(cfs, filter, Integer.MIN_VALUE);
-        controller.getTopLevelColumns(true);
-        assertEquals(2, controller.getSstablesIterated());
-    }
-
-    @Test
-    public void ensureTombstonesAppliedAfterGCGS()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CFGCGRACE);
-        cfs.disableAutoCompaction();
-
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-        CellName cellName = Util.cellname("Column1");
-
-        // add data
-        rm = new Mutation(keyspace.getName(), dk.getKey());
-        rm.add(cfs.name, cellName, ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        // remove
-        rm = new Mutation(keyspace.getName(), dk.getKey());
-        rm.delete(cfs.name, cellName, 0);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        // use "realistic" query times since we'll compare these numbers to the local deletion time of the tombstone
-        QueryFilter filter;
-        long queryAt = System.currentTimeMillis() + 1000;
-        int gcBefore = cfs.gcBefore(queryAt);
-
-        filter = QueryFilter.getNamesFilter(dk, cfs.name, FBUtilities.singleton(cellName, cfs.getComparator()), queryAt);
-        CollationController controller = new CollationController(cfs, filter, gcBefore);
-        assert ColumnFamilyStore.removeDeleted(controller.getTopLevelColumns(true), gcBefore) == null;
-
-        filter = QueryFilter.getIdentityFilter(dk, cfs.name, queryAt);
-        controller = new CollationController(cfs, filter, gcBefore);
-        assert ColumnFamilyStore.removeDeleted(controller.getTopLevelColumns(true), gcBefore) == null;
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java
index 2d89e09..2f7aaa5 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java

@@ -17,22 +17,18 @@
  */
 package org.apache.cassandra.db;
 
-import java.nio.ByteBuffer;
 import java.util.Collection;
 
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-import com.google.common.base.Supplier;
-
 import static org.junit.Assert.assertEquals;
-import static org.apache.cassandra.Util.cellname;
 
 public class ColumnFamilyMetricTest
 {
@@ -41,8 +37,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace("Keyspace1",
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD("Keyspace1", "Standard2"));
     }
 
@@ -50,23 +45,24 @@
     public void testSizeMetric()
     {
         Keyspace keyspace = Keyspace.open("Keyspace1");
-        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
-        store.disableAutoCompaction();
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard2");
+        cfs.disableAutoCompaction();
 
-        store.truncateBlocking();
+        cfs.truncateBlocking();
 
-        assertEquals(0, store.metric.liveDiskSpaceUsed.getCount());
-        assertEquals(0, store.metric.totalDiskSpaceUsed.getCount());
+        assertEquals(0, cfs.metric.liveDiskSpaceUsed.getCount());
+        assertEquals(0, cfs.metric.totalDiskSpaceUsed.getCount());
 
         for (int j = 0; j < 10; j++)
         {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation("Keyspace1", key);
-            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros(), String.valueOf(j))
+                    .clustering("0")
+                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
         }
-        store.forceBlockingFlush();
-        Collection<SSTableReader> sstables = store.getSSTables();
+        cfs.forceBlockingFlush();
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
         long size = 0;
         for (SSTableReader reader : sstables)
         {
@@ -74,34 +70,16 @@
         }
 
         // size metrics should show the sum of all SSTable sizes
-        assertEquals(size, store.metric.liveDiskSpaceUsed.getCount());
-        assertEquals(size, store.metric.totalDiskSpaceUsed.getCount());
+        assertEquals(size, cfs.metric.liveDiskSpaceUsed.getCount());
+        assertEquals(size, cfs.metric.totalDiskSpaceUsed.getCount());
 
-        store.truncateBlocking();
+        cfs.truncateBlocking();
 
         // after truncate, size metrics should be down to 0
-        Util.spinAssertEquals(
-                0L,
-                new Supplier<Object>()
-                {
-                    public Long get()
-                    {
-                        return store.metric.liveDiskSpaceUsed.getCount();
-                    }
-                },
-                30);
-        Util.spinAssertEquals(
-                0L,
-                new Supplier<Object>()
-                {
-                    public Long get()
-                    {
-                        return store.metric.totalDiskSpaceUsed.getCount();
-                    }
-                },
-                30);
+        Util.spinAssertEquals(0L, () -> cfs.metric.liveDiskSpaceUsed.getCount(), 30);
+        Util.spinAssertEquals(0L, () -> cfs.metric.totalDiskSpaceUsed.getCount(), 30);
 
-        store.enableAutoCompaction();
+        cfs.enableAutoCompaction();
     }
 
     @Test
@@ -113,18 +91,21 @@
         // This confirms another test/set up did not overflow the histogram
         store.metric.colUpdateTimeDeltaHistogram.cf.getSnapshot().get999thPercentile();
 
-        ByteBuffer key = ByteBufferUtil.bytes(4242);
-        Mutation m = new Mutation("Keyspace1", key);
-        m.add("Standard2", cellname("0"), ByteBufferUtil.bytes("0"), 0);
-        m.apply();
+        new RowUpdateBuilder(store.metadata, 0, "4242")
+            .clustering("0")
+            .add("val", ByteBufferUtil.bytes("0"))
+            .build()
+            .applyUnsafe();
 
         // The histogram should not have overflowed on the first write
         store.metric.colUpdateTimeDeltaHistogram.cf.getSnapshot().get999thPercentile();
 
-        m = new Mutation("Keyspace1", key);
         // smallest time delta that would overflow the histogram if unfiltered
-        m.add("Standard2", cellname("0"), ByteBufferUtil.bytes("1"), 18165375903307L);
-        m.apply();
+        new RowUpdateBuilder(store.metadata, 18165375903307L, "4242")
+            .clustering("0")
+            .add("val", ByteBufferUtil.bytes("0"))
+            .build()
+            .applyUnsafe();
 
         // CASSANDRA-11117 - update with large timestamp delta should not overflow the histogram
         store.metric.colUpdateTimeDeltaHistogram.cf.getSnapshot().get999thPercentile();

diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreCQLHelperTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreCQLHelperTest.java
new file mode 100644
index 0000000..714b61a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreCQLHelperTest.java

@@ -0,0 +1,677 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.io.FileReader;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.*;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.io.Files;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.*;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.statements.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.schema.*;
+import org.apache.cassandra.utils.*;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class ColumnFamilyStoreCQLHelperTest extends CQLTester
+{
+    @Before
+    public void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+    }
+
+    @Test
+    public void testUserTypesCQL()
+    {
+        String keyspace = "cql_test_keyspace_user_types";
+        String table = "test_table_user_types";
+
+        UserType typeA = new UserType(keyspace, ByteBufferUtil.bytes("a"),
+                                      Arrays.asList(ByteBufferUtil.bytes("a1"),
+                                                    ByteBufferUtil.bytes("a2"),
+                                                    ByteBufferUtil.bytes("a3")),
+                                      Arrays.asList(IntegerType.instance,
+                                                    IntegerType.instance,
+                                                    IntegerType.instance));
+
+        UserType typeB = new UserType(keyspace, ByteBufferUtil.bytes("b"),
+                                      Arrays.asList(ByteBufferUtil.bytes("b1"),
+                                                    ByteBufferUtil.bytes("b2"),
+                                                    ByteBufferUtil.bytes("b3")),
+                                      Arrays.asList(typeA,
+                                                    typeA,
+                                                    typeA));
+
+        UserType typeC = new UserType(keyspace, ByteBufferUtil.bytes("c"),
+                                      Arrays.asList(ByteBufferUtil.bytes("c1"),
+                                                    ByteBufferUtil.bytes("c2"),
+                                                    ByteBufferUtil.bytes("c3")),
+                                      Arrays.asList(typeB,
+                                                    typeB,
+                                                    typeB));
+
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addClusteringColumn("ck1", IntegerType.instance)
+                                           .addRegularColumn("reg1", typeC)
+                                           .addRegularColumn("reg2", ListType.getInstance(IntegerType.instance, false))
+                                           .addRegularColumn("reg3", MapType.getInstance(AsciiType.instance, IntegerType.instance, true))
+                                           .build();
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    Tables.of(cfm),
+                                    Types.of(typeA, typeB, typeC));
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertEquals(ImmutableList.of("CREATE TYPE cql_test_keyspace_user_types.a(a1 varint, a2 varint, a3 varint);",
+                                      "CREATE TYPE cql_test_keyspace_user_types.b(b1 frozen<a>, b2 frozen<a>, b3 frozen<a>);",
+                                      "CREATE TYPE cql_test_keyspace_user_types.c(c1 frozen<b>, c2 frozen<b>, c3 frozen<b>);"),
+                     ColumnFamilyStoreCQLHelper.getUserTypesAsCQL(cfs.metadata));
+    }
+
+    @Test
+    public void testDroppedColumnsCQL()
+    {
+        String keyspace = "cql_test_keyspace_dropped_columns";
+        String table = "test_table_dropped_columns";
+
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addClusteringColumn("ck1", IntegerType.instance)
+                                           .addRegularColumn("reg1", IntegerType.instance)
+                                           .addRegularColumn("reg2", IntegerType.instance)
+                                           .addRegularColumn("reg3", IntegerType.instance)
+                                           .build();
+
+
+        ColumnDefinition reg1 = cfm.getColumnDefinition(ByteBufferUtil.bytes("reg1"));
+        ColumnDefinition reg2 = cfm.getColumnDefinition(ByteBufferUtil.bytes("reg2"));
+        ColumnDefinition reg3 = cfm.getColumnDefinition(ByteBufferUtil.bytes("reg3"));
+
+        cfm.removeColumnDefinition(reg1);
+        cfm.removeColumnDefinition(reg2);
+        cfm.removeColumnDefinition(reg3);
+
+        cfm.recordColumnDrop(reg1, 10000);
+        cfm.recordColumnDrop(reg2, 20000);
+        cfm.recordColumnDrop(reg3, 30000);
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertEquals(ImmutableList.of("ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP reg1 USING TIMESTAMP 10000;",
+                                      "ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP reg3 USING TIMESTAMP 30000;",
+                                      "ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP reg2 USING TIMESTAMP 20000;"),
+                     ColumnFamilyStoreCQLHelper.getDroppedColumnsAsCQL(cfs.metadata));
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS cql_test_keyspace_dropped_columns.test_table_dropped_columns (\n" +
+        "\tpk1 varint,\n" +
+        "\tck1 varint,\n" +
+        "\treg1 varint,\n" +
+        "\treg3 varint,\n" +
+        "\treg2 varint,\n" +
+        "\tPRIMARY KEY (pk1, ck1))"));
+    }
+
+    @Test
+    public void testReaddedColumns()
+    {
+        String keyspace = "cql_test_keyspace_readded_columns";
+        String table = "test_table_readded_columns";
+
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addClusteringColumn("ck1", IntegerType.instance)
+                                           .addRegularColumn("reg1", IntegerType.instance)
+                                           .addStaticColumn("reg2", IntegerType.instance)
+                                           .addRegularColumn("reg3", IntegerType.instance)
+                                           .build();
+
+        ColumnDefinition reg1 = cfm.getColumnDefinition(ByteBufferUtil.bytes("reg1"));
+        ColumnDefinition reg2 = cfm.getColumnDefinition(ByteBufferUtil.bytes("reg2"));
+
+        cfm.removeColumnDefinition(reg1);
+        cfm.removeColumnDefinition(reg2);
+
+        cfm.recordColumnDrop(reg1, 10000);
+        cfm.recordColumnDrop(reg2, 20000);
+
+        cfm.addColumnDefinition(reg1);
+        cfm.addColumnDefinition(reg2);
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        // when re-adding, column is present in CREATE, then in DROP and then in ADD again, to record DROP with a proper timestamp
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS cql_test_keyspace_readded_columns.test_table_readded_columns (\n" +
+        "\tpk1 varint,\n" +
+        "\tck1 varint,\n" +
+        "\treg2 varint static,\n" +
+        "\treg1 varint,\n" +
+        "\treg3 varint,\n" +
+        "\tPRIMARY KEY (pk1, ck1))"));
+
+        assertEquals(ImmutableList.of("ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns DROP reg1 USING TIMESTAMP 10000;",
+                                      "ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns ADD reg1 varint;",
+                                      "ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns DROP reg2 USING TIMESTAMP 20000;",
+                                      "ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns ADD reg2 varint static;"),
+                     ColumnFamilyStoreCQLHelper.getDroppedColumnsAsCQL(cfs.metadata));
+    }
+
+    @Test
+    public void testCfmColumnsCQL()
+    {
+        String keyspace = "cql_test_keyspace_create_table";
+        String table = "test_table_create_table";
+
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addPartitionKey("pk2", AsciiType.instance)
+                                           .addClusteringColumn("ck1", ReversedType.getInstance(IntegerType.instance))
+                                           .addClusteringColumn("ck2", IntegerType.instance)
+                                           .addStaticColumn("st1", AsciiType.instance)
+                                           .addRegularColumn("reg1", AsciiType.instance)
+                                           .addRegularColumn("reg2", ListType.getInstance(IntegerType.instance, false))
+                                           .addRegularColumn("reg3", MapType.getInstance(AsciiType.instance, IntegerType.instance, true))
+                                           .build();
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS cql_test_keyspace_create_table.test_table_create_table (\n" +
+        "\tpk1 varint,\n" +
+        "\tpk2 ascii,\n" +
+        "\tck1 varint,\n" +
+        "\tck2 varint,\n" +
+        "\tst1 ascii static,\n" +
+        "\treg1 ascii,\n" +
+        "\treg2 frozen<list<varint>>,\n" +
+        "\treg3 map<ascii, varint>,\n" +
+        "\tPRIMARY KEY ((pk1, pk2), ck1, ck2))\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND CLUSTERING ORDER BY (ck1 DESC, ck2 ASC)"));
+    }
+
+    @Test
+    public void testCfmCompactStorageCQL()
+    {
+        String keyspace = "cql_test_keyspace_compact";
+        String table = "test_table_compact";
+
+        CFMetaData cfm = CFMetaData.Builder.createDense(keyspace, table, true, false)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addPartitionKey("pk2", AsciiType.instance)
+                                           .addClusteringColumn("ck1", ReversedType.getInstance(IntegerType.instance))
+                                           .addClusteringColumn("ck2", IntegerType.instance)
+                                           .addRegularColumn("reg", IntegerType.instance)
+                                           .build();
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS cql_test_keyspace_compact.test_table_compact (\n" +
+        "\tpk1 varint,\n" +
+        "\tpk2 ascii,\n" +
+        "\tck1 varint,\n" +
+        "\tck2 varint,\n" +
+        "\treg varint,\n" +
+        "\tPRIMARY KEY ((pk1, pk2), ck1, ck2))\n" +
+        "\tWITH ID = " + cfm.cfId + "\n" +
+        "\tAND COMPACT STORAGE\n" +
+        "\tAND CLUSTERING ORDER BY (ck1 DESC, ck2 ASC)"));
+    }
+
+    @Test
+    public void testCfmCounterCQL()
+    {
+        String keyspace = "cql_test_keyspace_counter";
+        String table = "test_table_counter";
+
+        CFMetaData cfm = CFMetaData.Builder.createDense(keyspace, table, true, true)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addPartitionKey("pk2", AsciiType.instance)
+                                           .addClusteringColumn("ck1", ReversedType.getInstance(IntegerType.instance))
+                                           .addClusteringColumn("ck2", IntegerType.instance)
+                                           .addRegularColumn("cnt", CounterColumnType.instance)
+                                           .build();
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS cql_test_keyspace_counter.test_table_counter (\n" +
+        "\tpk1 varint,\n" +
+        "\tpk2 ascii,\n" +
+        "\tck1 varint,\n" +
+        "\tck2 varint,\n" +
+        "\tcnt counter,\n" +
+        "\tPRIMARY KEY ((pk1, pk2), ck1, ck2))\n" +
+        "\tWITH ID = " + cfm.cfId + "\n" +
+        "\tAND COMPACT STORAGE\n" +
+        "\tAND CLUSTERING ORDER BY (ck1 DESC, ck2 ASC)"));
+    }
+
+    @Test
+    public void testCfmOptionsCQL()
+    {
+        String keyspace = "cql_test_keyspace_options";
+        String table = "test_table_options";
+
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addClusteringColumn("cl1", IntegerType.instance)
+                                           .addRegularColumn("reg1", AsciiType.instance)
+                                           .build();
+
+        cfm.recordColumnDrop(cfm.getColumnDefinition(ByteBuffer.wrap("reg1".getBytes())), FBUtilities.timestampMicros());
+        cfm.bloomFilterFpChance(1.0);
+        cfm.comment("comment");
+        cfm.compaction(CompactionParams.lcs(Collections.singletonMap("sstable_size_in_mb", "1")));
+        cfm.compression(CompressionParams.lz4(1 << 16));
+        cfm.dcLocalReadRepairChance(0.2);
+        cfm.crcCheckChance(0.3);
+        cfm.defaultTimeToLive(4);
+        cfm.gcGraceSeconds(5);
+        cfm.minIndexInterval(6);
+        cfm.maxIndexInterval(7);
+        cfm.memtableFlushPeriod(8);
+        cfm.readRepairChance(0.9);
+        cfm.speculativeRetry(SpeculativeRetryParam.always());
+        cfm.extensions(ImmutableMap.of("ext1",
+                                       ByteBuffer.wrap("val1".getBytes())));
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).endsWith(
+        "AND bloom_filter_fp_chance = 1.0\n" +
+        "\tAND dclocal_read_repair_chance = 0.2\n" +
+        "\tAND crc_check_chance = 0.3\n" +
+        "\tAND default_time_to_live = 4\n" +
+        "\tAND gc_grace_seconds = 5\n" +
+        "\tAND min_index_interval = 6\n" +
+        "\tAND max_index_interval = 7\n" +
+        "\tAND memtable_flush_period_in_ms = 8\n" +
+        "\tAND read_repair_chance = 0.9\n" +
+        "\tAND speculative_retry = 'ALWAYS'\n" +
+        "\tAND comment = 'comment'\n" +
+        "\tAND caching = { 'keys': 'ALL', 'rows_per_partition': 'NONE' }\n" +
+        "\tAND compaction = { 'class': 'org.apache.cassandra.db.compaction.LeveledCompactionStrategy', 'sstable_size_in_mb': '1' }\n" +
+        "\tAND compression = { 'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor' }\n" +
+        "\tAND extensions = { 'ext1': 0x76616c31 };"
+        ));
+    }
+
+    @Test
+    public void testCfmIndexJson()
+    {
+        String keyspace = "cql_test_keyspace_3";
+        String table = "test_table_3";
+
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk1", IntegerType.instance)
+                                           .addClusteringColumn("cl1", IntegerType.instance)
+                                           .addRegularColumn("reg1", AsciiType.instance)
+                                           .build();
+
+        cfm.indexes(cfm.getIndexes()
+                       .with(IndexMetadata.fromIndexTargets(cfm,
+                                                            Collections.singletonList(new IndexTarget(cfm.getColumnDefinition(ByteBufferUtil.bytes("reg1")).name,
+                                                                                                      IndexTarget.Type.VALUES)),
+                                                            "indexName",
+                                                            IndexMetadata.Kind.COMPOSITES,
+                                                            Collections.emptyMap()))
+                       .with(IndexMetadata.fromIndexTargets(cfm,
+                                                            Collections.singletonList(new IndexTarget(cfm.getColumnDefinition(ByteBufferUtil.bytes("reg1")).name,
+                                                                                                      IndexTarget.Type.KEYS)),
+                                                            "indexName2",
+                                                            IndexMetadata.Kind.COMPOSITES,
+                                                            Collections.emptyMap()))
+                       .with(IndexMetadata.fromIndexTargets(cfm,
+                                                            Collections.singletonList(new IndexTarget(cfm.getColumnDefinition(ByteBufferUtil.bytes("reg1")).name,
+                                                                                                      IndexTarget.Type.KEYS_AND_VALUES)),
+                                                            "indexName3",
+                                                            IndexMetadata.Kind.COMPOSITES,
+                                                            Collections.emptyMap())));
+
+
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        assertEquals(ImmutableList.of("CREATE INDEX \"indexName\" ON cql_test_keyspace_3.test_table_3 (reg1);",
+                                      "CREATE INDEX \"indexName2\" ON cql_test_keyspace_3.test_table_3 (reg1);",
+                                      "CREATE INDEX \"indexName3\" ON cql_test_keyspace_3.test_table_3 (reg1);"),
+                     ColumnFamilyStoreCQLHelper.getIndexesAsCQL(cfs.metadata));
+    }
+
+    private final static String SNAPSHOT = "testsnapshot";
+
+    @Test
+    public void testSnapshot() throws Throwable
+    {
+        String typeA = createType("CREATE TYPE %s (a1 varint, a2 varint, a3 varint);");
+        String typeB = createType("CREATE TYPE %s (b1 frozen<" + typeA + ">, b2 frozen<" + typeA + ">, b3 frozen<" + typeA + ">);");
+        String typeC = createType("CREATE TYPE %s (c1 frozen<" + typeB + ">, c2 frozen<" + typeB + ">, c3 frozen<" + typeB + ">);");
+
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint," +
+                                       "pk2 ascii," +
+                                       "ck1 varint," +
+                                       "ck2 varint," +
+                                       "reg1 frozen<" + typeC + ">," +
+                                       "reg2 int," +
+                                       "reg3 int," +
+                                       "PRIMARY KEY ((pk1, pk2), ck1, ck2)) WITH " +
+                                       "CLUSTERING ORDER BY (ck1 ASC, ck2 DESC);");
+
+        alterTable("ALTER TABLE %s DROP reg3 USING TIMESTAMP 10000;");
+        alterTable("ALTER TABLE %s ADD reg3 int;");
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s (pk1, pk2, ck1, ck2, reg1, reg2) VALUES (?, ?, ?, ?, ?, ?)", i, i + 1, i + 2, i + 3, null, i + 5);
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+        cfs.snapshot(SNAPSHOT);
+
+        String schema = Files.toString(cfs.getDirectories().getSnapshotSchemaFile(SNAPSHOT), Charset.defaultCharset());
+        assertTrue(schema.contains(String.format("CREATE TYPE %s.%s(a1 varint, a2 varint, a3 varint);", keyspace(), typeA)));
+        assertTrue(schema.contains(String.format("CREATE TYPE %s.%s(a1 varint, a2 varint, a3 varint);", keyspace(), typeA)));
+        assertTrue(schema.contains(String.format("CREATE TYPE %s.%s(b1 frozen<%s>, b2 frozen<%s>, b3 frozen<%s>);", keyspace(), typeB, typeA, typeA, typeA)));
+        assertTrue(schema.contains(String.format("CREATE TYPE %s.%s(c1 frozen<%s>, c2 frozen<%s>, c3 frozen<%s>);", keyspace(), typeC, typeB, typeB, typeB)));
+
+        schema = schema.substring(schema.indexOf("CREATE TABLE")); // trim to ensure order
+
+        assertTrue(schema.startsWith("CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+                                     "\tpk1 varint,\n" +
+                                     "\tpk2 ascii,\n" +
+                                     "\tck1 varint,\n" +
+                                     "\tck2 varint,\n" +
+                                     "\treg1 frozen<" + typeC + ">,\n" +
+                                     "\treg2 int,\n" +
+                                     "\treg3 int,\n" +
+                                     "\tPRIMARY KEY ((pk1, pk2), ck1, ck2))\n" +
+                                     "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+                                     "\tAND CLUSTERING ORDER BY (ck1 ASC, ck2 DESC)"));
+
+        schema = schema.substring(schema.indexOf("ALTER"));
+        assertTrue(schema.startsWith(String.format("ALTER TABLE %s.%s DROP reg3 USING TIMESTAMP 10000;", keyspace(), tableName)));
+        assertTrue(schema.contains(String.format("ALTER TABLE %s.%s ADD reg3 int;", keyspace(), tableName)));
+
+        JSONObject manifest = (JSONObject) new JSONParser().parse(new FileReader(cfs.getDirectories().getSnapshotManifestFile(SNAPSHOT)));
+        JSONArray files = (JSONArray) manifest.get("files");
+        Assert.assertEquals(1, files.size());
+    }
+
+    @Test
+    public void testSystemKsSnapshot() throws Throwable
+    {
+        ColumnFamilyStore cfs = Keyspace.open("system").getColumnFamilyStore("peers");
+        cfs.snapshot(SNAPSHOT);
+
+        Assert.assertTrue(cfs.getDirectories().getSnapshotManifestFile(SNAPSHOT).exists());
+        Assert.assertFalse(cfs.getDirectories().getSnapshotSchemaFile(SNAPSHOT).exists());
+    }
+
+    @Test
+    public void testDroppedType() throws Throwable
+    {
+        String typeA = createType("CREATE TYPE %s (a1 varint, a2 varint, a3 varint);");
+        String typeB = createType("CREATE TYPE %s (b1 frozen<" + typeA + ">, b2 frozen<" + typeA + ">, b3 frozen<" + typeA + ">);");
+
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint," +
+                                       "ck1 varint," +
+                                       "reg1 frozen<" + typeB + ">," +
+                                       "reg2 varint," +
+                                       "PRIMARY KEY (pk1, ck1));");
+
+        alterTable("ALTER TABLE %s DROP reg1 USING TIMESTAMP 10000;");
+
+        Runnable validate = () -> {
+            try
+            {
+                ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+                cfs.snapshot(SNAPSHOT);
+                String schema = Files.toString(cfs.getDirectories().getSnapshotSchemaFile(SNAPSHOT), Charset.defaultCharset());
+
+                // When both column and it's type are dropped, the type in column definition gets substituted with a tuple
+                assertTrue(schema.startsWith("CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+                                             "\tpk1 varint,\n" +
+                                             "\tck1 varint,\n" +
+                                             "\treg2 varint,\n" +
+                                             "\treg1 frozen<tuple<frozen<tuple<varint, varint, varint>>, frozen<tuple<varint, varint, varint>>, frozen<tuple<varint, varint, varint>>>>,\n" +
+                                             "\tPRIMARY KEY (pk1, ck1))"));
+                assertTrue(schema.contains("ALTER TABLE " + keyspace() + "." + tableName + " DROP reg1 USING TIMESTAMP 10000;"));
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        };
+
+        // Validate before and after the type drop
+        validate.run();
+        schemaChange("DROP TYPE " + keyspace() + "." + typeB);
+        schemaChange("DROP TYPE " + keyspace() + "." + typeA);
+        validate.run();
+    }
+
+    @Test
+    public void testDenseTable() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint PRIMARY KEY," +
+                                       "reg1 int)" +
+                                       " WITH COMPACT STORAGE");
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+        "\tpk1 varint PRIMARY KEY,\n" +
+        "\treg1 int)\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+
+    @Test
+    public void testStaticCompactTable() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint PRIMARY KEY," +
+                                       "reg1 int," +
+                                       "reg2 int)" +
+                                       " WITH COMPACT STORAGE");
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+        "\tpk1 varint PRIMARY KEY,\n" +
+        "\treg1 int,\n" +
+        "\treg2 int)\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+
+    @Test
+    public void testStaticCompactWithCounters() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint PRIMARY KEY," +
+                                       "reg1 counter," +
+                                       "reg2 counter)" +
+                                       " WITH COMPACT STORAGE");
+
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+        "\tpk1 varint PRIMARY KEY,\n" +
+        "\treg1 counter,\n" +
+        "\treg2 counter)\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+
+    @Test
+    public void testDenseCompactTableWithoutRegulars() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint," +
+                                       "ck1 int," +
+                                       "PRIMARY KEY (pk1, ck1))" +
+                                       " WITH COMPACT STORAGE");
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+        "\tpk1 varint,\n" +
+        "\tck1 int,\n" +
+        "\tPRIMARY KEY (pk1, ck1))\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+
+    @Test
+    public void testCompactDynamic() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE IF NOT EXISTS %s (" +
+                                       "pk1 varint," +
+                                       "ck1 int," +
+                                       "reg int," +
+                                       "PRIMARY KEY (pk1, ck1))" +
+                                       " WITH COMPACT STORAGE");
+
+        ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(tableName);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS " + keyspace() + "." + tableName + " (\n" +
+        "\tpk1 varint,\n" +
+        "\tck1 int,\n" +
+        "\treg int,\n" +
+        "\tPRIMARY KEY (pk1, ck1))\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+
+    @Test
+    public void testDynamicComposite() throws Throwable
+    {
+        Map<Byte, AbstractType<?>> aliases = new HashMap<>();
+        aliases.put((byte)'a', BytesType.instance);
+        aliases.put((byte)'b', BytesType.instance);
+        aliases.put((byte)'c', BytesType.instance);
+
+        String DYNAMIC_COMPOSITE = "dynamic_composite";
+        AbstractType<?> dynamicComposite = DynamicCompositeType.getInstance(aliases);
+
+        SchemaLoader.createKeyspace(DYNAMIC_COMPOSITE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.denseCFMD(DYNAMIC_COMPOSITE, DYNAMIC_COMPOSITE, dynamicComposite));
+
+        ColumnFamilyStore cfs = Keyspace.open(DYNAMIC_COMPOSITE).getColumnFamilyStore(DYNAMIC_COMPOSITE);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "CREATE TABLE IF NOT EXISTS " + DYNAMIC_COMPOSITE + "." + DYNAMIC_COMPOSITE + " (\n" +
+        "\tkey ascii,\n" +
+        "\tcols 'org.apache.cassandra.db.marshal.DynamicCompositeType(a=>org.apache.cassandra.db.marshal.BytesType,b=>org.apache.cassandra.db.marshal.BytesType,c=>org.apache.cassandra.db.marshal.BytesType)',\n" +
+        "\tval ascii,\n" +
+        "\tPRIMARY KEY (key, cols))\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+
+    @Test
+    public void superColumnFamilyTest() throws Throwable
+    {
+        final String KEYSPACE = "thrift_compact_table_with_supercolumns_test";
+        final String TABLE = "test_table_1";
+
+        CFMetaData cfm = CFMetaData.Builder.createSuper(KEYSPACE, TABLE, false)
+                                           .addPartitionKey("key", BytesType.instance)
+                                           .addClusteringColumn("column1", AsciiType.instance)
+                                           .addRegularColumn("", MapType.getInstance(Int32Type.instance, AsciiType.instance, true))
+                                           .build();
+
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE);
+
+        assertTrue(ColumnFamilyStoreCQLHelper.getCFMetadataAsCQL(cfs.metadata, true).startsWith(
+        "/*\n" +
+        "Warning: Table " + KEYSPACE + "." + TABLE + " omitted because it has constructs not compatible with CQL (was created via legacy API).\n\n" +
+        "Approximate structure, for reference:\n" +
+        "(this should not be used to reproduce this schema)\n\n" +
+        "CREATE TABLE IF NOT EXISTS " + KEYSPACE + "." + TABLE + " (\n" +
+        "\tkey blob,\n" +
+        "\tcolumn1 ascii,\n" +
+        "\t\"\" map<int, ascii>,\n" +
+        "\tPRIMARY KEY (key, column1))\n" +
+        "\tWITH ID = " + cfs.metadata.cfId + "\n" +
+        "\tAND COMPACT STORAGE"));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
index 4d452c6..f7152ff 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java

@@ -22,158 +22,77 @@
 import java.io.FileReader;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.UUID;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
+import java.util.*;
 
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Sets;
-
-import org.apache.cassandra.db.index.PerRowSecondaryIndexTest;
-import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.commons.lang3.ArrayUtils;
-import org.apache.commons.lang3.StringUtils;
+import org.junit.Before;
 import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
-import org.apache.cassandra.OrderedJUnit4ClassRunner;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.db.marshal.LexicalUUIDType;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.ExcludingBounds;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.IncludingExcludingBounds;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.metrics.ClearableHistogram;
-import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.thrift.SlicePredicate;
-import org.apache.cassandra.thrift.SliceRange;
-import org.apache.cassandra.thrift.ThriftValidation;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
-import org.apache.cassandra.utils.WrappedRunnable;
 import org.json.simple.JSONArray;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.Util.dk;
-import static org.apache.cassandra.Util.rp;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 
+import com.google.common.collect.Iterators;
+import org.apache.cassandra.*;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.metrics.ClearableHistogram;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.WrappedRunnable;
+import static junit.framework.Assert.assertNotNull;
+
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class ColumnFamilyStoreTest
 {
-    static byte[] bytes1, bytes2;
     public static final String KEYSPACE1 = "ColumnFamilyStoreTest1";
     public static final String KEYSPACE2 = "ColumnFamilyStoreTest2";
-    public static final String KEYSPACE3 = "ColumnFamilyStoreTest3";
-    public static final String KEYSPACE4 = "PerRowSecondaryIndex";
     public static final String CF_STANDARD1 = "Standard1";
     public static final String CF_STANDARD2 = "Standard2";
-    public static final String CF_STANDARD3 = "Standard3";
-    public static final String CF_STANDARD4 = "Standard4";
-    public static final String CF_STANDARD5 = "Standard5";
-    public static final String CF_STANDARD6 = "Standard6";
-    public static final String CF_STANDARDINT = "StandardInteger1";
     public static final String CF_SUPER1 = "Super1";
     public static final String CF_SUPER6 = "Super6";
     public static final String CF_INDEX1 = "Indexed1";
-    public static final String CF_INDEX2 = "Indexed2";
-    public static final String CF_INDEX3 = "Indexed3";
-
-    static
-    {
-        Random random = new Random();
-        bytes1 = new byte[1024];
-        bytes2 = new byte[128];
-        random.nextBytes(bytes1);
-        random.nextBytes(bytes2);
-    }
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD3),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD4),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD5),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD6),
-                                    SchemaLoader.indexCFMD(KEYSPACE1, CF_INDEX1, true),
-                                    SchemaLoader.indexCFMD(KEYSPACE1, CF_INDEX2, false),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER1, LongType.instance),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER6, LexicalUUIDType.instance, UTF8Type.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDINT, IntegerType.instance));
+                                    SchemaLoader.keysIndexCFMD(KEYSPACE1, CF_INDEX1, true));
+                                    // TODO: Fix superCFMD failing on legacy table creation. Seems to be applying composite comparator to partition key
+                                    // SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER1, LongType.instance));
+                                    // SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER6, "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee", LexicalUUIDType.instance, UTF8Type.instance),
         SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD1),
-                                    SchemaLoader.indexCFMD(KEYSPACE2, CF_INDEX1, true),
-                                    SchemaLoader.compositeIndexCFMD(KEYSPACE2, CF_INDEX2, true),
-                                    SchemaLoader.compositeIndexCFMD(KEYSPACE2, CF_INDEX3, true).gcGraceSeconds(0));
-        SchemaLoader.createKeyspace(KEYSPACE3,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(5),
-                                    SchemaLoader.indexCFMD(KEYSPACE3, CF_INDEX1, true));
-        SchemaLoader.createKeyspace(KEYSPACE4,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.perRowIndexedCFMD(KEYSPACE4, "Indexed1"));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD1));
+    }
+
+    @Before
+    public void truncateCFS()
+    {
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1).truncateBlocking();
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2).truncateBlocking();
+        // Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_SUPER1).truncateBlocking();
+
+        Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1).truncateBlocking();
     }
 
     @Test
@@ -182,21 +101,23 @@
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
-        cfs.truncateBlocking();
 
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key1"));
-        rm.add(CF_STANDARD1, cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "key1")
+                .clustering("Column1")
+                .add("val", "asdf")
+                .build()
+                .applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key1"));
-        rm.add(CF_STANDARD1, cellname("Column1"), ByteBufferUtil.bytes("asdf"), 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, "key1")
+                .clustering("Column1")
+                .add("val", "asdf")
+                .build()
+                .applyUnsafe();
         cfs.forceBlockingFlush();
 
         ((ClearableHistogram)cfs.metric.sstablesPerReadHistogram.cf).clear(); // resets counts
-        cfs.getColumnFamily(Util.namesQueryFilter(cfs, Util.dk("key1"), "Column1"));
+        Util.getAll(Util.cmd(cfs, "key1").includeRow("c1").build());
         assertEquals(1, cfs.metric.sstablesPerReadHistogram.cf.getCount());
     }
 
@@ -205,820 +126,128 @@
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
-        cfs.truncateBlocking();
 
         List<Mutation> rms = new LinkedList<>();
-        Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key1"));
-        rm.add(CF_STANDARD1, cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.add(CF_STANDARD1, cellname("Column2"), ByteBufferUtil.bytes("asdf"), 0);
-        rms.add(rm);
+        rms.add(new RowUpdateBuilder(cfs.metadata, 0, "key1")
+                .clustering("Column1")
+                .add("val", "asdf")
+                .build());
+
         Util.writeColumnFamily(rms);
 
-        List<SSTableReader> ssTables = keyspace.getAllSSTables();
+        List<SSTableReader> ssTables = keyspace.getAllSSTables(SSTableSet.LIVE);
         assertEquals(1, ssTables.size());
         ssTables.get(0).forceFilterFailures();
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("key2"), CF_STANDARD1, System.currentTimeMillis()));
-        assertNull(cf);
+        Util.assertEmpty(Util.cmd(cfs, "key2").build());
     }
 
     @Test
     public void testEmptyRow() throws Exception
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD2);
-        Mutation rm;
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD2);
 
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key1"));
-        rm.delete(CF_STANDARD2, System.currentTimeMillis());
-        rm.applyUnsafe();
+        RowUpdateBuilder.deleteRow(cfs.metadata, FBUtilities.timestampMicros(), "key1", "Column1").applyUnsafe();
 
         Runnable r = new WrappedRunnable()
         {
             public void runMayThrow() throws IOException
             {
-                QueryFilter sliceFilter = QueryFilter.getSliceFilter(Util.dk("key1"), CF_STANDARD2, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
-                ColumnFamily cf = store.getColumnFamily(sliceFilter);
-                assertTrue(cf.isMarkedForDelete());
-                assertFalse(cf.hasColumns());
-
-                QueryFilter namesFilter = Util.namesQueryFilter(store, Util.dk("key1"), "a");
-                cf = store.getColumnFamily(namesFilter);
-                assertTrue(cf.isMarkedForDelete());
-                assertFalse(cf.hasColumns());
+                Row toCheck = Util.getOnlyRowUnfiltered(Util.cmd(cfs, "key1").build());
+                Iterator<Cell> iter = toCheck.cells().iterator();
+                assert(Iterators.size(iter) == 0);
             }
         };
 
-        KeyspaceTest.reTest(store, r);
+        reTest(cfs, r);
     }
 
-    @Test
-    public void testFilterWithNullCF() throws Exception
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
-        final Row row = new Row(Util.dk("key1"), null);
+    // TODO: Implement this once we have hooks to super columns available in CQL context
+//    @Test
+//    public void testDeleteSuperRowSticksAfterFlush() throws Throwable
+//    {
+//        String keyspaceName = KEYSPACE1;
+//        String cfName= CF_SUPER1;
+//
+//        Keyspace keyspace = Keyspace.open(keyspaceName);
+//        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
+//
+//        ByteBuffer scfName = ByteBufferUtil.bytes("SuperDuper");
+//        DecoratedKey key = Util.dk("flush-resurrection");
+//
+//        // create an isolated sstable.
+//        putColSuper(cfs, key, 0, ByteBufferUtil.bytes("val"), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes("val1"));
 
-        ColumnFamilyStore.AbstractScanIterator iterator = new ColumnFamilyStore.AbstractScanIterator()
-        {
-            Iterator<Row> it = Collections.singletonList(row).iterator();
+//        putColsSuper(cfs, key, scfName,
+//                new BufferCell(cellname(1L), ByteBufferUtil.bytes("val1"), 1),
+//                new BufferCell(cellname(2L), ByteBufferUtil.bytes("val2"), 1),
+//                new BufferCell(cellname(3L), ByteBufferUtil.bytes("val3"), 1));
+//        cfs.forceBlockingFlush();
+//
+//        // insert, don't flush.
+//        putColsSuper(cfs, key, scfName,
+//                new BufferCell(cellname(4L), ByteBufferUtil.bytes("val4"), 1),
+//                new BufferCell(cellname(5L), ByteBufferUtil.bytes("val5"), 1),
+//                new BufferCell(cellname(6L), ByteBufferUtil.bytes("val6"), 1));
+//
+//        // verify insert.
+//        final SlicePredicate sp = new SlicePredicate();
+//        sp.setSlice_range(new SliceRange());
+//        sp.getSlice_range().setCount(100);
+//        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
+//        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
+//
+//        assertRowAndColCount(1, 6, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+//
+//        // delete
+//        Mutation rm = new Mutation(keyspace.getName(), key.getKey());
+//        rm.deleteRange(cfName, SuperColumns.startOf(scfName), SuperColumns.endOf(scfName), 2);
+//        rm.applyUnsafe();
+//
+//        // verify delete.
+//        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+//
+//        // flush
+//        cfs.forceBlockingFlush();
+//
+//        // re-verify delete.
+//        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+//
+//        // late insert.
+//        putColsSuper(cfs, key, scfName,
+//                new BufferCell(cellname(4L), ByteBufferUtil.bytes("val4"), 1L),
+//                new BufferCell(cellname(7L), ByteBufferUtil.bytes("val7"), 1L));
+//
+//        // re-verify delete.
+//        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+//
+//        // make sure new writes are recognized.
+//        putColsSuper(cfs, key, scfName,
+//                new BufferCell(cellname(3L), ByteBufferUtil.bytes("val3"), 3),
+//                new BufferCell(cellname(8L), ByteBufferUtil.bytes("val8"), 3),
+//                new BufferCell(cellname(9L), ByteBufferUtil.bytes("val9"), 3));
+//        assertRowAndColCount(1, 3, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
+//    }
 
-            protected Row computeNext()
-            {
-                return it.hasNext() ? it.next() : endOfData();
-            }
-
-            @Override
-            public void close()
-            {
-            }
-        };
-
-        ExtendedFilter filter = ExtendedFilter.create(
-                cfs,
-                DataRange.allData(DatabaseDescriptor.getPartitioner()), null, 1, true, System.currentTimeMillis());
-
-        List<Row> list = cfs.filter(iterator, filter);
-        assert 1 == list.size();
-        assert list.get(0).key == row.key;
-        assert null == list.get(0).cf;
-    }
-
-    @Test
-    public void testSkipStartKey()
-    {
-        ColumnFamilyStore cfs = insertKey1Key2();
-
-        IPartitioner p = StorageService.getPartitioner();
-        List<Row> result = cfs.getRangeSlice(Util.range(p, "key1", "key2"),
-                                             null,
-                                             Util.namesFilter(cfs, "asdf"),
-                                             10);
-        assertEquals(1, result.size());
-        assert result.get(0).key.getKey().equals(ByteBufferUtil.bytes("key2"));
-    }
-
-    @Test
-    public void testIndexScan()
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_INDEX1);
-        Mutation rm;
-        CellName nobirthdate = cellname("notbirthdate");
-        CellName birthdate = cellname("birthdate");
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k2"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k3"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k4aaaa"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(3L), 0);
-        rm.applyUnsafe();
-
-        // basic single-expression query
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = cfs.search(range, clause, filter, 100);
-
-        assert rows != null;
-        assert rows.size() == 2 : StringUtils.join(rows, ",");
-
-        String key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
-        assert "k1".equals( key ) : key;
-
-        key = new String(rows.get(1).key.getKey().array(), rows.get(1).key.getKey().position(), rows.get(1).key.getKey().remaining());
-        assert "k3".equals(key) : key;
-
-        assert ByteBufferUtil.bytes(1L).equals( rows.get(0).cf.getColumn(birthdate).value());
-        assert ByteBufferUtil.bytes(1L).equals( rows.get(1).cf.getColumn(birthdate).value());
-
-        // add a second expression
-        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), Operator.GTE, ByteBufferUtil.bytes(2L));
-        clause = Arrays.asList(expr, expr2);
-        rows = cfs.search(range, clause, filter, 100);
-
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
-        assert "k3".equals( key );
-
-        // same query again, but with resultset not including the subordinate expression
-        rows = cfs.search(range, clause, Util.namesFilter(cfs, "birthdate"), 100);
-
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
-        assert "k3".equals( key );
-
-        assert rows.get(0).cf.getColumnCount() == 1 : rows.get(0).cf;
-
-        // once more, this time with a slice rowset that needs to be expanded
-        SliceQueryFilter emptyFilter = new SliceQueryFilter(Composites.EMPTY, Composites.EMPTY, false, 0);
-        rows = cfs.search(range, clause, emptyFilter, 100);
-
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = new String(rows.get(0).key.getKey().array(), rows.get(0).key.getKey().position(), rows.get(0).key.getKey().remaining());
-        assert "k3".equals( key );
-
-        assertFalse(rows.get(0).cf.hasColumns());
-
-        // query with index hit but rejected by secondary clause, with a small enough count that just checking count
-        // doesn't tell the scan loop that it's done
-        IndexExpression expr3 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), Operator.EQ, ByteBufferUtil.bytes(-1L));
-        clause = Arrays.asList(expr, expr3);
-        rows = cfs.search(range, clause, filter, 100);
-
-        assert rows.isEmpty();
-    }
-
-    @Test
-    public void testLargeScan()
-    {
-        Mutation rm;
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_INDEX1);
-        for (int i = 0; i < 100; i++)
-        {
-            rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key" + i));
-            rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(34L), 0);
-            rm.add(CF_INDEX1, cellname("notbirthdate"), ByteBufferUtil.bytes((long) (i % 2)), 0);
-            rm.applyUnsafe();
-        }
-
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(34L));
-        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr, expr2);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = cfs.search(range, clause, filter, 100);
-
-        assert rows != null;
-        assert rows.size() == 50 : rows.size();
-        Set<DecoratedKey> keys = new HashSet<DecoratedKey>();
-        // extra check that there are no duplicate results -- see https://issues.apache.org/jira/browse/CASSANDRA-2406
-        for (Row row : rows)
-            keys.add(row.key);
-        assert rows.size() == keys.size();
-    }
-
-    @Test
-    public void testIndexDeletions() throws IOException
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE3).getColumnFamilyStore(CF_INDEX1);
-        Mutation rm;
-
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = cfs.search(range, clause, filter, 100);
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        String key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-
-        // delete the column directly
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.delete(CF_INDEX1, cellname("birthdate"), 1);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.isEmpty();
-
-        // verify that it's not being indexed under the deletion column value either
-        Cell deletion = rm.getColumnFamilies().iterator().next().iterator().next();
-        ByteBuffer deletionLong = ByteBufferUtil.bytes((long) ByteBufferUtil.toInt(deletion.value()));
-        IndexExpression expr0 = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, deletionLong);
-        List<IndexExpression> clause0 = Arrays.asList(expr0);
-        rows = cfs.search(range, clause0, filter, 100);
-        assert rows.isEmpty();
-
-        // resurrect w/ a newer timestamp
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(1L), 2);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-
-        // verify that row and delete w/ older timestamp does nothing
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.delete(CF_INDEX1, 1);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-
-        // similarly, column delete w/ older timestamp should do nothing
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.delete(CF_INDEX1, cellname("birthdate"), 1);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-
-        // delete the entire row (w/ newer timestamp this time)
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.delete(CF_INDEX1, 3);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.isEmpty() : StringUtils.join(rows, ",");
-
-        // make sure obsolete mutations don't generate an index entry
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(1L), 3);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.isEmpty() : StringUtils.join(rows, ",");
-
-        // try insert followed by row delete in the same mutation
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(1L), 1);
-        rm.delete(CF_INDEX1, 2);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.isEmpty() : StringUtils.join(rows, ",");
-
-        // try row delete followed by insert in the same mutation
-        rm = new Mutation(KEYSPACE3, ByteBufferUtil.bytes("k1"));
-        rm.delete(CF_INDEX1, 3);
-        rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(1L), 4);
-        rm.applyUnsafe();
-        rows = cfs.search(range, clause, filter, 100);
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-    }
-
-    @Test
-    public void testIndexUpdate() throws IOException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE2);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_INDEX1);
-        CellName birthdate = cellname("birthdate");
-
-        // create a row and update the birthdate value, test that the index query fetches the new version
-        Mutation rm;
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 1);
-        rm.applyUnsafe();
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(2L), 2);
-        rm.applyUnsafe();
-
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = cfs.search(range, clause, filter, 100);
-        assert rows.size() == 0;
-
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(2L));
-        clause = Arrays.asList(expr);
-        rows = keyspace.getColumnFamilyStore(CF_INDEX1).search(range, clause, filter, 100);
-        String key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-
-        // update the birthdate value with an OLDER timestamp, and test that the index ignores this
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(3L), 0);
-        rm.applyUnsafe();
-
-        rows = keyspace.getColumnFamilyStore(CF_INDEX1).search(range, clause, filter, 100);
-        key = ByteBufferUtil.string(rows.get(0).key.getKey());
-        assert "k1".equals( key );
-
-    }
-
-    @Test
-    public void testIndexUpdateOverwritingExpiringColumns() throws Exception
-    {
-        // see CASSANDRA-7268
-        Keyspace keyspace = Keyspace.open(KEYSPACE2);
-
-        // create a row and update the birthdate value with an expiring column
-        Mutation rm;
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k100"));
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(100L), 1, 1000);
-        rm.applyUnsafe();
-
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(100L));
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-
-        // requires a 1s sleep because we calculate local expiry time as (now() / 1000) + ttl
-        TimeUnit.SECONDS.sleep(1);
-
-        // now overwrite with the same name/value/ttl, but the local expiry time will be different
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k100"));
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(100L), 1, 1000);
-        rm.applyUnsafe();
-
-        rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-
-        // check that modifying the indexed value using the same timestamp behaves as expected
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k101"));
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(101L), 1, 1000);
-        rm.applyUnsafe();
-
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(101L));
-        clause = Arrays.asList(expr);
-        rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-
-        TimeUnit.SECONDS.sleep(1);
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("k101"));
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(102L), 1, 1000);
-        rm.applyUnsafe();
-        // search for the old value
-        rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-        // and for the new
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(102L));
-        clause = Arrays.asList(expr);
-        rows = keyspace.getColumnFamilyStore("Indexed1").search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-    }
-
-    @Test
-    public void testDeleteOfInconsistentValuesInKeysIndex() throws Exception
-    {
-        String keySpace = KEYSPACE2;
-        String cfName = CF_INDEX1;
-
-        Keyspace keyspace = Keyspace.open(keySpace);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.truncateBlocking();
-
-        ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
-        CellName colName = cellname("birthdate");
-        ByteBuffer val1 = ByteBufferUtil.bytes(1L);
-        ByteBuffer val2 = ByteBufferUtil.bytes(2L);
-
-        // create a row and update the "birthdate" value, test that the index query fetches this version
-        Mutation rm;
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, colName, val1, 0);
-        rm.applyUnsafe();
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, val1);
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-
-        // force a flush, so our index isn't being read from a memtable
-        keyspace.getColumnFamilyStore(cfName).forceBlockingFlush();
-
-        // now apply another update, but force the index update to be skipped
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, colName, val2, 1);
-        keyspace.apply(rm, true, false);
-
-        // Now searching the index for either the old or new value should return 0 rows
-        // because the new value was not indexed and the old value should be ignored
-        // (and in fact purged from the index cf).
-        // first check for the old value
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-        // now check for the updated value
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, val2);
-        clause = Arrays.asList(expr);
-        filter = new IdentityQueryFilter();
-        range = Util.range("", "");
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-
-        // now, reset back to the original value, still skipping the index update, to
-        // make sure the value was expunged from the index when it was discovered to be inconsistent
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, colName, ByteBufferUtil.bytes(1L), 3);
-        keyspace.apply(rm, true, false);
-
-        expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        clause = Arrays.asList(expr);
-        filter = new IdentityQueryFilter();
-        range = Util.range("", "");
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-    }
-
-    @Test
-    public void testDeleteOfInconsistentValuesFromCompositeIndex() throws Exception
-    {
-        String keySpace = KEYSPACE2;
-        String cfName = CF_INDEX2;
-
-        Keyspace keyspace = Keyspace.open(keySpace);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.truncateBlocking();
-
-        ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
-        ByteBuffer clusterKey = ByteBufferUtil.bytes("ck1");
-        ByteBuffer colName = ByteBufferUtil.bytes("col1");
-
-        CellNameType baseComparator = cfs.getComparator();
-        CellName compositeName = baseComparator.makeCellName(clusterKey, colName);
-
-        ByteBuffer val1 = ByteBufferUtil.bytes("v1");
-        ByteBuffer val2 = ByteBufferUtil.bytes("v2");
-
-        // create a row and update the author value
-        Mutation rm;
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, compositeName, val1, 0);
-        rm.applyUnsafe();
-
-        // test that the index query fetches this version
-        IndexExpression expr = new IndexExpression(colName, Operator.EQ, val1);
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-
-        // force a flush and retry the query, so our index isn't being read from a memtable
-        keyspace.getColumnFamilyStore(cfName).forceBlockingFlush();
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(1, rows.size());
-
-        // now apply another update, but force the index update to be skipped
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, compositeName, val2, 1);
-        keyspace.apply(rm, true, false);
-
-        // Now searching the index for either the old or new value should return 0 rows
-        // because the new value was not indexed and the old value should be ignored
-        // (and in fact purged from the index cf).
-        // first check for the old value
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-        // now check for the updated value
-        expr = new IndexExpression(colName, Operator.EQ, val2);
-        clause = Arrays.asList(expr);
-        filter = new IdentityQueryFilter();
-        range = Util.range("", "");
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-
-        // now, reset back to the original value, still skipping the index update, to
-        // make sure the value was expunged from the index when it was discovered to be inconsistent
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, compositeName, val1, 2);
-        keyspace.apply(rm, true, false);
-
-        expr = new IndexExpression(colName, Operator.EQ, val1);
-        clause = Arrays.asList(expr);
-        filter = new IdentityQueryFilter();
-        range = Util.range("", "");
-        rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-    }
-
-    // See CASSANDRA-6098
-    @Test
-    public void testDeleteCompositeIndex() throws Exception
-    {
-        String keySpace = KEYSPACE2;
-        String cfName = CF_INDEX3; // has gcGrace 0
-
-        Keyspace keyspace = Keyspace.open(keySpace);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.truncateBlocking();
-
-        ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
-        ByteBuffer clusterKey = ByteBufferUtil.bytes("ck1");
-        ByteBuffer colName = ByteBufferUtil.bytes("col1");
-
-        CellNameType baseComparator = cfs.getComparator();
-        CellName compositeName = baseComparator.makeCellName(clusterKey, colName);
-
-        ByteBuffer val1 = ByteBufferUtil.bytes("v2");
-
-        // Insert indexed value.
-        Mutation rm;
-        rm = new Mutation(keySpace, rowKey);
-        rm.add(cfName, compositeName, val1, 0);
-        rm.applyUnsafe();
-
-        // Now delete the value and flush too.
-        rm = new Mutation(keySpace, rowKey);
-        rm.delete(cfName, 1);
-        rm.applyUnsafe();
-
-        // We want the data to be gcable, but even if gcGrace == 0, we still need to wait 1 second
-        // since we won't gc on a tie.
-        try { Thread.sleep(1000); } catch (Exception e) {}
-
-        // Read the index and we check we do get no value (and no NPE)
-        // Note: the index will return the entry because it hasn't been deleted (we
-        // haven't read yet nor compacted) but the data read itself will return null
-        IndexExpression expr = new IndexExpression(colName, Operator.EQ, val1);
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = keyspace.getColumnFamilyStore(cfName).search(range, clause, filter, 100);
-        assertEquals(0, rows.size());
-    }
-
-    // See CASSANDRA-2628
-    @Test
-    public void testIndexScanWithLimitOne()
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_INDEX1);
-        Mutation rm;
-
-        CellName nobirthdate = cellname("notbirthdate");
-        CellName birthdate = cellname("birthdate");
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("kk1"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("kk2"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("kk3"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("kk4"));
-        rm.add(CF_INDEX1, nobirthdate, ByteBufferUtil.bytes(2L), 0);
-        rm.add(CF_INDEX1, birthdate, ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
-
-        // basic single-expression query
-        IndexExpression expr1 = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        IndexExpression expr2 = new IndexExpression(ByteBufferUtil.bytes("notbirthdate"), Operator.GT, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr1, expr2);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = cfs.search(range, clause, filter, 1);
-
-        assert rows != null;
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-    }
-
-    @Test
-    public void testIndexCreate() throws IOException, InterruptedException, ExecutionException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_INDEX2);
-
-        // create a row and update the birthdate value, test that the index query fetches the new version
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX2, cellname("birthdate"), ByteBufferUtil.bytes(1L), 1);
-        rm.applyUnsafe();
-
-        ColumnDefinition old = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("birthdate"));
-        ColumnDefinition cd = ColumnDefinition.regularDef(cfs.metadata, old.name.bytes, old.type, null).setIndex("birthdate_index", IndexType.KEYS, null);
-        Future<?> future = cfs.indexManager.addIndexedColumn(cd);
-        future.get();
-        // we had a bug (CASSANDRA-2244) where index would get created but not flushed -- check for that
-        assert cfs.indexManager.getIndexForColumn(cd.name.bytes).getIndexCfs().getSSTables().size() > 0;
-
-        queryBirthdate(keyspace);
-
-        // validate that drop clears it out & rebuild works (CASSANDRA-2320)
-        SecondaryIndex indexedCfs = cfs.indexManager.getIndexForColumn(ByteBufferUtil.bytes("birthdate"));
-        cfs.indexManager.removeIndexedColumn(ByteBufferUtil.bytes("birthdate"));
-        assert !indexedCfs.isIndexBuilt(ByteBufferUtil.bytes("birthdate"));
-
-        // rebuild & re-query
-        future = cfs.indexManager.addIndexedColumn(cd);
-        future.get();
-        queryBirthdate(keyspace);
-    }
-
-    private void queryBirthdate(Keyspace keyspace) throws CharacterCodingException
-    {
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr);
-        IDiskAtomFilter filter = new IdentityQueryFilter();
-        List<Row> rows = keyspace.getColumnFamilyStore(CF_INDEX2).search(Util.range("", ""), clause, filter, 100);
-        assert rows.size() == 1 : StringUtils.join(rows, ",");
-        assertEquals("k1", ByteBufferUtil.string(rows.get(0).key.getKey()));
-    }
-
-    @Test
-    public void testCassandra6778() throws CharacterCodingException
-    {
-        String cfname = CF_STANDARDINT;
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
-
-        // insert two columns that represent the same integer but have different binary forms (the
-        // second one is padded with extra zeros)
-        Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        CellName column1 = cellname(ByteBuffer.wrap(new byte[]{1}));
-        rm.add(cfname, column1, ByteBufferUtil.bytes("data1"), 1);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        CellName column2 = cellname(ByteBuffer.wrap(new byte[]{0, 0, 1}));
-        rm.add(cfname, column2, ByteBufferUtil.bytes("data2"), 2);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        // fetch by the first column name; we should get the second version of the column value
-        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(
-            KEYSPACE1, ByteBufferUtil.bytes("k1"), cfname, System.currentTimeMillis(),
-            new NamesQueryFilter(FBUtilities.singleton(column1, cfs.getComparator())));
-
-        ColumnFamily cf = cmd.getRow(keyspace).cf;
-        assertEquals(1, cf.getColumnCount());
-        Cell cell = cf.getColumn(column1);
-        assertEquals("data2", ByteBufferUtil.string(cell.value()));
-        assertEquals(column2, cell.name());
-
-        // fetch by the second column name; we should get the second version of the column value
-        cmd = new SliceByNamesReadCommand(
-            KEYSPACE1, ByteBufferUtil.bytes("k1"), cfname, System.currentTimeMillis(),
-            new NamesQueryFilter(FBUtilities.singleton(column2, cfs.getComparator())));
-
-        cf = cmd.getRow(keyspace).cf;
-        assertEquals(1, cf.getColumnCount());
-        cell = cf.getColumn(column2);
-        assertEquals("data2", ByteBufferUtil.string(cell.value()));
-        assertEquals(column2, cell.name());
-    }
-
-    @Test
-    public void testInclusiveBounds()
-    {
-        ColumnFamilyStore cfs = insertKey1Key2();
-
-        List<Row> result = cfs.getRangeSlice(Util.bounds("key1", "key2"),
-                                             null,
-                                             Util.namesFilter(cfs, "asdf"),
-                                             10);
-        assertEquals(2, result.size());
-        assert result.get(0).key.getKey().equals(ByteBufferUtil.bytes("key1"));
-    }
-
-    @Test
-    public void testDeleteSuperRowSticksAfterFlush() throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName= CF_SUPER1;
-        ByteBuffer scfName = ByteBufferUtil.bytes("SuperDuper");
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        DecoratedKey key = Util.dk("flush-resurrection");
-
-        // create an isolated sstable.
-        putColsSuper(cfs, key, scfName,
-                new BufferCell(cellname(1L), ByteBufferUtil.bytes("val1"), 1),
-                new BufferCell(cellname(2L), ByteBufferUtil.bytes("val2"), 1),
-                new BufferCell(cellname(3L), ByteBufferUtil.bytes("val3"), 1));
-        cfs.forceBlockingFlush();
-
-        // insert, don't flush.
-        putColsSuper(cfs, key, scfName,
-                new BufferCell(cellname(4L), ByteBufferUtil.bytes("val4"), 1),
-                new BufferCell(cellname(5L), ByteBufferUtil.bytes("val5"), 1),
-                new BufferCell(cellname(6L), ByteBufferUtil.bytes("val6"), 1));
-
-        // verify insert.
-        final SlicePredicate sp = new SlicePredicate();
-        sp.setSlice_range(new SliceRange());
-        sp.getSlice_range().setCount(100);
-        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
-        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
-
-        assertRowAndColCount(1, 6, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
-
-        // delete
-        Mutation rm = new Mutation(keyspace.getName(), key.getKey());
-        rm.deleteRange(cfName, SuperColumns.startOf(scfName), SuperColumns.endOf(scfName), 2);
-        rm.applyUnsafe();
-
-        // verify delete.
-        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
-
-        // flush
-        cfs.forceBlockingFlush();
-
-        // re-verify delete.
-        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
-
-        // late insert.
-        putColsSuper(cfs, key, scfName,
-                new BufferCell(cellname(4L), ByteBufferUtil.bytes("val4"), 1L),
-                new BufferCell(cellname(7L), ByteBufferUtil.bytes("val7"), 1L));
-
-        // re-verify delete.
-        assertRowAndColCount(1, 0, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
-
-        // make sure new writes are recognized.
-        putColsSuper(cfs, key, scfName,
-                new BufferCell(cellname(3L), ByteBufferUtil.bytes("val3"), 3),
-                new BufferCell(cellname(8L), ByteBufferUtil.bytes("val8"), 3),
-                new BufferCell(cellname(9L), ByteBufferUtil.bytes("val9"), 3));
-        assertRowAndColCount(1, 3, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, scfName), 100));
-    }
-
-    private static void assertRowAndColCount(int rowCount, int colCount, boolean isDeleted, Collection<Row> rows) throws CharacterCodingException
-    {
-        assert rows.size() == rowCount : "rowcount " + rows.size();
-        for (Row row : rows)
-        {
-            assert row.cf != null : "cf was null";
-            assert row.cf.getColumnCount() == colCount : "colcount " + row.cf.getColumnCount() + "|" + str(row.cf);
-            if (isDeleted)
-                assert row.cf.isMarkedForDelete() : "cf not marked for delete";
-        }
-    }
-
-    private static String str(ColumnFamily cf) throws CharacterCodingException
-    {
-        StringBuilder sb = new StringBuilder();
-        for (Cell col : cf.getSortedColumns())
-            sb.append(String.format("(%s,%s,%d),", ByteBufferUtil.string(col.name().toByteBuffer()), ByteBufferUtil.string(col.value()), col.timestamp()));
-        return sb.toString();
-    }
-
-    private static void putColsSuper(ColumnFamilyStore cfs, DecoratedKey key, ByteBuffer scfName, Cell... cols) throws Throwable
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.keyspace.getName(), cfs.name);
-        for (Cell col : cols)
-            cf.addColumn(col.withUpdatedName(CellNames.compositeDense(scfName, col.name().toByteBuffer())));
-        Mutation rm = new Mutation(cfs.keyspace.getName(), key.getKey(), cf);
-        rm.applyUnsafe();
-    }
-
-    private static void putColsStandard(ColumnFamilyStore cfs, DecoratedKey key, Cell... cols) throws Throwable
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.keyspace.getName(), cfs.name);
-        for (Cell col : cols)
-            cf.addColumn(col);
-        Mutation rm = new Mutation(cfs.keyspace.getName(), key.getKey(), cf);
-        rm.applyUnsafe();
-    }
+//    private static void assertRowAndColCount(int rowCount, int colCount, boolean isDeleted, Collection<Row> rows) throws CharacterCodingException
+//    {
+//        assert rows.size() == rowCount : "rowcount " + rows.size();
+//        for (Row row : rows)
+//        {
+//            assert row.cf != null : "cf was null";
+//            assert row.cf.getColumnCount() == colCount : "colcount " + row.cf.getColumnCount() + "|" + str(row.cf);
+//            if (isDeleted)
+//                assert row.cf.isMarkedForDelete() : "cf not marked for delete";
+//        }
+//    }
+//
+//    private static String str(ColumnFamily cf) throws CharacterCodingException
+//    {
+//        StringBuilder sb = new StringBuilder();
+//        for (Cell col : cf.getSortedColumns())
+//            sb.append(String.format("(%s,%s,%d),", ByteBufferUtil.string(col.name().toByteBuffer()), ByteBufferUtil.string(col.value()), col.timestamp()));
+//        return sb.toString();
+//    }
 
     @Test
     public void testDeleteStandardRowSticksAfterFlush() throws Throwable
@@ -1028,546 +257,52 @@
         String cfName = CF_STANDARD1;
         Keyspace keyspace = Keyspace.open(keyspaceName);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        DecoratedKey key = Util.dk("f-flush-resurrection");
 
-        SlicePredicate sp = new SlicePredicate();
-        sp.setSlice_range(new SliceRange());
-        sp.getSlice_range().setCount(100);
-        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
-        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
+        ByteBuffer col = ByteBufferUtil.bytes("val");
+        ByteBuffer val = ByteBufferUtil.bytes("val1");
 
         // insert
-        putColsStandard(cfs, key, column("col1", "val1", 1), column("col2", "val2", 1));
-        assertRowAndColCount(1, 2, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        ColumnDefinition newCol = ColumnDefinition.regularDef(cfs.metadata, ByteBufferUtil.bytes("val2"), AsciiType.instance);
+        new RowUpdateBuilder(cfs.metadata, 0, "key1").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "key2").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        assertRangeCount(cfs, col, val, 2);
 
         // flush.
         cfs.forceBlockingFlush();
 
         // insert, don't flush
-        putColsStandard(cfs, key, column("col3", "val3", 1), column("col4", "val4", 1));
-        assertRowAndColCount(1, 4, false, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        new RowUpdateBuilder(cfs.metadata, 1, "key3").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, "key4").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        assertRangeCount(cfs, col, val, 4);
 
         // delete (from sstable and memtable)
-        Mutation rm = new Mutation(keyspace.getName(), key.getKey());
-        rm.delete(cfs.name, 2);
-        rm.applyUnsafe();
+        RowUpdateBuilder.deleteRow(cfs.metadata, 5, "key1", "Column1").applyUnsafe();
+        RowUpdateBuilder.deleteRow(cfs.metadata, 5, "key3", "Column1").applyUnsafe();
 
         // verify delete
-        assertRowAndColCount(1, 0, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRangeCount(cfs, col, val, 2);
 
         // flush
         cfs.forceBlockingFlush();
 
         // re-verify delete. // first breakage is right here because of CASSANDRA-1837.
-        assertRowAndColCount(1, 0, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRangeCount(cfs, col, val, 2);
 
         // simulate a 'late' insertion that gets put in after the deletion. should get inserted, but fail on read.
-        putColsStandard(cfs, key, column("col5", "val5", 1), column("col2", "val2", 1));
+        new RowUpdateBuilder(cfs.metadata, 2, "key1").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 2, "key3").clustering("Column1").add("val", "val1").build().applyUnsafe();
 
         // should still be nothing there because we deleted this row. 2nd breakage, but was undetected because of 1837.
-        assertRowAndColCount(1, 0, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        assertRangeCount(cfs, col, val, 2);
 
         // make sure that new writes are recognized.
-        putColsStandard(cfs, key, column("col6", "val6", 3), column("col7", "val7", 3));
-        assertRowAndColCount(1, 2, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
+        new RowUpdateBuilder(cfs.metadata, 10, "key5").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 10, "key6").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        assertRangeCount(cfs, col, val, 4);
 
         // and it remains so after flush. (this wasn't failing before, but it's good to check.)
         cfs.forceBlockingFlush();
-        assertRowAndColCount(1, 2, true, cfs.getRangeSlice(Util.range("f", "g"), null, ThriftValidation.asIFilter(sp, cfs.metadata, null), 100));
-    }
-
-
-    private ColumnFamilyStore insertKey1Key2()
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1);
-        List<Mutation> rms = new LinkedList<>();
-        Mutation rm;
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("key1"));
-        rm.add(CF_STANDARD1, cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rms.add(rm);
-        Util.writeColumnFamily(rms);
-
-        rm = new Mutation(KEYSPACE2, ByteBufferUtil.bytes("key2"));
-        rm.add(CF_STANDARD1, cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rms.add(rm);
-        return Util.writeColumnFamily(rms);
-    }
-
-    @Test
-    public void testBackupAfterFlush() throws Throwable
-    {
-        ColumnFamilyStore cfs = insertKey1Key2();
-
-        for (int version = 1; version <= 2; ++version)
-        {
-            Descriptor existing = new Descriptor(cfs.directories.getDirectoryForNewSSTables(), KEYSPACE2, CF_STANDARD1, version, Descriptor.Type.FINAL);
-            Descriptor desc = new Descriptor(Directories.getBackupsDirectory(existing), KEYSPACE2, CF_STANDARD1, version, Descriptor.Type.FINAL);
-            for (Component c : new Component[]{ Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.STATS })
-                assertTrue("can not find backedup file:" + desc.filenameFor(c), new File(desc.filenameFor(c)).exists());
-        }
-    }
-
-    // CASSANDRA-3467.  the key here is that supercolumn and subcolumn comparators are different
-    @Test
-    public void testSliceByNamesCommandOnUUIDTypeSCF() throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName = CF_SUPER6;
-        ByteBuffer superColName = LexicalUUIDType.instance.fromString("a4ed3562-0e8e-4b41-bdfd-c45a2774682d");
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        DecoratedKey key = Util.dk("slice-get-uuid-type");
-
-        // Insert a row with one supercolumn and multiple subcolumns
-        putColsSuper(cfs, key, superColName, new BufferCell(cellname("a"), ByteBufferUtil.bytes("A"), 1),
-                                             new BufferCell(cellname("b"), ByteBufferUtil.bytes("B"), 1));
-
-        // Get the entire supercolumn like normal
-        ColumnFamily cfGet = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertEquals(ByteBufferUtil.bytes("A"), cfGet.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a"))).value());
-        assertEquals(ByteBufferUtil.bytes("B"), cfGet.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b"))).value());
-
-        // Now do the SliceByNamesCommand on the supercolumn, passing both subcolumns in as columns to get
-        SortedSet<CellName> sliceColNames = new TreeSet<CellName>(cfs.metadata.comparator);
-        sliceColNames.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a")));
-        sliceColNames.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b")));
-        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.getKey(), cfName, System.currentTimeMillis(), new NamesQueryFilter(sliceColNames));
-        ColumnFamily cfSliced = cmd.getRow(keyspace).cf;
-
-        // Make sure the slice returns the same as the straight get
-        assertEquals(ByteBufferUtil.bytes("A"), cfSliced.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a"))).value());
-        assertEquals(ByteBufferUtil.bytes("B"), cfSliced.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b"))).value());
-    }
-
-    @Test
-    public void testSliceByNamesCommandOldMetadata() throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName= CF_STANDARD6;
-        DecoratedKey key = Util.dk("slice-name-old-metadata");
-        CellName cname = cellname("c1");
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.truncateBlocking();
-
-        // Create a cell a 'high timestamp'
-        putColsStandard(cfs, key, new BufferCell(cname, ByteBufferUtil.bytes("a"), 2));
-        cfs.forceBlockingFlush();
-
-        // Nuke the metadata and reload that sstable
-        Collection<SSTableReader> ssTables = cfs.getSSTables();
-        assertEquals(1, ssTables.size());
-        cfs.clearUnsafe();
-        assertEquals(0, cfs.getSSTables().size());
-
-        SSTableReader sstable = ssTables.iterator().next();
-        File statsFile = new File(sstable.descriptor.filenameFor(Component.STATS));
-        assert statsFile.exists();
-        boolean deleted = statsFile.delete();
-        assert deleted : "Cannot delete " + statsFile;
-        cfs.loadNewSSTables();
-
-        // Add another cell with a lower timestamp
-        putColsStandard(cfs, key, new BufferCell(cname, ByteBufferUtil.bytes("b"), 1));
-
-        // Test fetching the cell by name returns the first cell
-        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.getKey(), cfName, System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(cname, cfs.getComparator())));
-        ColumnFamily cf = cmd.getRow(keyspace).cf;
-        Cell cell = cf.getColumn(cname);
-        assert cell.value().equals(ByteBufferUtil.bytes("a")) : "expecting a, got " + ByteBufferUtil.string(cell.value());
-
-        Keyspace.clear(KEYSPACE1); // CASSANDRA-7195
-    }
-
-    private static void assertTotalColCount(Collection<Row> rows, int expectedCount)
-    {
-        int columns = 0;
-        for (Row row : rows)
-        {
-            columns += row.getLiveCount(new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, expectedCount), System.currentTimeMillis());
-        }
-        assert columns == expectedCount : "Expected " + expectedCount + " live columns but got " + columns + ": " + rows;
-    }
-
-
-    @Test
-    public void testRangeSliceColumnsLimit() throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        Cell[] cols = new Cell[5];
-        for (int i = 0; i < 5; i++)
-            cols[i] = column("c" + i, "value", 1);
-
-        putColsStandard(cfs, Util.dk("a"), cols[0], cols[1], cols[2], cols[3], cols[4]);
-        putColsStandard(cfs, Util.dk("b"), cols[0], cols[1]);
-        putColsStandard(cfs, Util.dk("c"), cols[0], cols[1], cols[2], cols[3]);
-        cfs.forceBlockingFlush();
-
-        SlicePredicate sp = new SlicePredicate();
-        sp.setSlice_range(new SliceRange());
-        sp.getSlice_range().setCount(1);
-        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
-        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
-
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              3,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            3);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              5,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            5);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              8,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            8);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              10,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            10);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              100,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            11);
-
-        // Check that when querying by name, we always include all names for a
-        // gien row even if it means returning more columns than requested (this is necesseray for CQL)
-        sp = new SlicePredicate();
-        sp.setColumn_names(Arrays.asList(
-            ByteBufferUtil.bytes("c0"),
-            ByteBufferUtil.bytes("c1"),
-            ByteBufferUtil.bytes("c2")
-        ));
-
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              1,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            3);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              4,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            5);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              5,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            5);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              6,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            8);
-        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
-                                              null,
-                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
-                                              100,
-                                              System.currentTimeMillis(),
-                                              true,
-                                              false),
-                            8);
-    }
-
-    @Test
-    public void testRangeSlicePaging() throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        Cell[] cols = new Cell[4];
-        for (int i = 0; i < 4; i++)
-            cols[i] = column("c" + i, "value", 1);
-
-        DecoratedKey ka = Util.dk("a");
-        DecoratedKey kb = Util.dk("b");
-        DecoratedKey kc = Util.dk("c");
-
-        RowPosition min = Util.rp("");
-
-        putColsStandard(cfs, ka, cols[0], cols[1], cols[2], cols[3]);
-        putColsStandard(cfs, kb, cols[0], cols[1], cols[2]);
-        putColsStandard(cfs, kc, cols[0], cols[1], cols[2], cols[3]);
-        cfs.forceBlockingFlush();
-
-        SlicePredicate sp = new SlicePredicate();
-        sp.setSlice_range(new SliceRange());
-        sp.getSlice_range().setCount(1);
-        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
-        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
-
-        Collection<Row> rows;
-        Row row, row1, row2;
-        IDiskAtomFilter filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
-
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(Util.range("", ""), filter, null, 3, true, true, System.currentTimeMillis()));
-        assert rows.size() == 1 : "Expected 1 row, got " + toString(rows);
-        row = rows.iterator().next();
-        assertColumnNames(row, "c0", "c1", "c2");
-
-        sp.getSlice_range().setStart(ByteBufferUtil.getArray(ByteBufferUtil.bytes("c2")));
-        filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(ka, min), filter, null, 3, true, true, System.currentTimeMillis()));
-        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
-        Iterator<Row> iter = rows.iterator();
-        row1 = iter.next();
-        row2 = iter.next();
-        assertColumnNames(row1, "c2", "c3");
-        assertColumnNames(row2, "c0");
-
-        sp.getSlice_range().setStart(ByteBufferUtil.getArray(ByteBufferUtil.bytes("c0")));
-        filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(row2.key, min), filter, null, 3, true, true, System.currentTimeMillis()));
-        assert rows.size() == 1 : "Expected 1 row, got " + toString(rows);
-        row = rows.iterator().next();
-        assertColumnNames(row, "c0", "c1", "c2");
-
-        sp.getSlice_range().setStart(ByteBufferUtil.getArray(ByteBufferUtil.bytes("c2")));
-        filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(row.key, min), filter, null, 3, true, true, System.currentTimeMillis()));
-        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
-        iter = rows.iterator();
-        row1 = iter.next();
-        row2 = iter.next();
-        assertColumnNames(row1, "c2");
-        assertColumnNames(row2, "c0", "c1");
-
-        // Paging within bounds
-        SliceQueryFilter sf = new SliceQueryFilter(cellname("c1"),
-                                                   cellname("c2"),
-                                                   false,
-                                                   0);
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(ka, kc), sf, cellname("c2"), cellname("c1"), null, 2, true, System.currentTimeMillis()));
-        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
-        iter = rows.iterator();
-        row1 = iter.next();
-        row2 = iter.next();
-        assertColumnNames(row1, "c2");
-        assertColumnNames(row2, "c1");
-
-        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<RowPosition>(kb, kc), sf, cellname("c1"), cellname("c1"), null, 10, true, System.currentTimeMillis()));
-        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
-        iter = rows.iterator();
-        row1 = iter.next();
-        row2 = iter.next();
-        assertColumnNames(row1, "c1", "c2");
-        assertColumnNames(row2, "c1");
-    }
-
-    private static String toString(Collection<Row> rows)
-    {
-        try
-        {
-            StringBuilder sb = new StringBuilder();
-            for (Row row : rows)
-            {
-                sb.append("{");
-                sb.append(ByteBufferUtil.string(row.key.getKey()));
-                sb.append(":");
-                if (row.cf != null && !row.cf.isEmpty())
-                {
-                    for (Cell c : row.cf)
-                        sb.append(" ").append(row.cf.getComparator().getString(c.name()));
-                }
-                sb.append("} ");
-            }
-            return sb.toString();
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
-    private static void assertColumnNames(Row row, String ... columnNames) throws Exception
-    {
-        if (row == null || row.cf == null)
-            throw new AssertionError("The row should not be empty");
-
-        Iterator<Cell> columns = row.cf.getSortedColumns().iterator();
-        Iterator<String> names = Arrays.asList(columnNames).iterator();
-
-        while (columns.hasNext())
-        {
-            Cell c = columns.next();
-            assert names.hasNext() : "Got more columns that expected (first unexpected column: " + ByteBufferUtil.string(c.name().toByteBuffer()) + ")";
-            String n = names.next();
-            assert c.name().toByteBuffer().equals(ByteBufferUtil.bytes(n)) : "Expected " + n + ", got " + ByteBufferUtil.string(c.name().toByteBuffer());
-        }
-        assert !names.hasNext() : "Missing expected column " + names.next();
-    }
-
-    private static DecoratedKey idk(int i)
-    {
-        return Util.dk(String.valueOf(i));
-    }
-
-    @Test
-    public void testRangeSliceInclusionExclusion() throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        Cell[] cols = new Cell[5];
-        for (int i = 0; i < 5; i++)
-            cols[i] = column("c" + i, "value", 1);
-
-        for (int i = 0; i <= 9; i++)
-        {
-            putColsStandard(cfs, idk(i), column("name", "value", 1));
-        }
-        cfs.forceBlockingFlush();
-
-        SlicePredicate sp = new SlicePredicate();
-        sp.setSlice_range(new SliceRange());
-        sp.getSlice_range().setCount(1);
-        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
-        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
-        IDiskAtomFilter qf = ThriftValidation.asIFilter(sp, cfs.metadata, null);
-
-        List<Row> rows;
-
-        // Start and end inclusive
-        rows = cfs.getRangeSlice(new Bounds<RowPosition>(rp("2"), rp("7")), null, qf, 100);
-        assert rows.size() == 6;
-        assert rows.get(0).key.equals(idk(2));
-        assert rows.get(rows.size() - 1).key.equals(idk(7));
-
-        // Start and end excluded
-        rows = cfs.getRangeSlice(new ExcludingBounds<RowPosition>(rp("2"), rp("7")), null, qf, 100);
-        assert rows.size() == 4;
-        assert rows.get(0).key.equals(idk(3));
-        assert rows.get(rows.size() - 1).key.equals(idk(6));
-
-        // Start excluded, end included
-        rows = cfs.getRangeSlice(new Range<RowPosition>(rp("2"), rp("7")), null, qf, 100);
-        assert rows.size() == 5;
-        assert rows.get(0).key.equals(idk(3));
-        assert rows.get(rows.size() - 1).key.equals(idk(7));
-
-        // Start included, end excluded
-        rows = cfs.getRangeSlice(new IncludingExcludingBounds<RowPosition>(rp("2"), rp("7")), null, qf, 100);
-        assert rows.size() == 5;
-        assert rows.get(0).key.equals(idk(2));
-        assert rows.get(rows.size() - 1).key.equals(idk(6));
-    }
-
-    @Test
-    public void testKeysSearcher() throws Exception
-    {
-        // Create secondary index and flush to disk
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_INDEX1);
-
-        store.truncateBlocking();
-
-        for (int i = 0; i < 10; i++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k" + i));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add(CF_INDEX1, cellname("birthdate"), LongType.instance.decompose(1L), System.currentTimeMillis());
-            rm.applyUnsafe();
-        }
-
-        store.forceBlockingFlush();
-
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, LongType.instance.decompose(1L));
-        // explicitly tell to the KeysSearcher to use column limiting for rowsPerQuery to trigger bogus columnsRead--; (CASSANDRA-3996)
-        List<Row> rows = store.search(store.makeExtendedFilter(Util.range("", ""), new IdentityQueryFilter(), Arrays.asList(expr), 10, true, false, System.currentTimeMillis()));
-
-        assert rows.size() == 10;
-    }
-
-    @SuppressWarnings("unchecked")
-    @Test
-    public void testMultiRangeSomeEmptyNoIndex() throws Throwable
-    {
-        // in order not to change thrift interfaces at this stage we build SliceQueryFilter
-        // directly instead of using QueryFilter to build it for us
-        ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colA")),
-                new ColumnSlice(cellname("colC"), cellname("colE")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
-
-        ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colI")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colE"), cellname("colC")),
-                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
-
-        String tableName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace table = Keyspace.open(tableName);
-        ColumnFamilyStore cfs = table.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        String[] letters = new String[] { "a", "b", "c", "d", "i" };
-        Cell[] cols = new Cell[letters.length];
-        for (int i = 0; i < cols.length; i++)
-        {
-            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
-                    ByteBuffer.wrap(new byte[1]), 1);
-        }
-
-        putColsStandard(cfs, dk("a"), cols);
-
-        cfs.forceBlockingFlush();
-
-        SliceQueryFilter multiRangeForward = new SliceQueryFilter(ranges, false, 100);
-        SliceQueryFilter multiRangeForwardWithCounting = new SliceQueryFilter(ranges, false, 3);
-        SliceQueryFilter multiRangeReverse = new SliceQueryFilter(rangesReversed, true, 100);
-        SliceQueryFilter multiRangeReverseWithCounting = new SliceQueryFilter(rangesReversed, true, 3);
-
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForward, "a", "colA", "colC", "colD", "colI");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForwardWithCounting, "a", "colA", "colC", "colD");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverse, "a", "colI", "colD", "colC", "colA");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverseWithCounting, "a", "colI", "colD", "colC");
+        assertRangeCount(cfs, col, val, 4);
     }
 
     @Test
@@ -1583,14 +318,14 @@
         //cleanup any previous test gargbage
         cfs.clearSnapshot("");
 
-        Mutation rm;
-        for (int i = 0; i < 100; i++)
+        int numRows = 1000;
+        long[] colValues = new long [numRows * 2]; // each row has two columns
+        for (int i = 0; i < colValues.length; i+=2)
         {
-            rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("key" + i));
-            rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(34L), 0);
-            rm.add(CF_INDEX1, cellname("notbirthdate"), ByteBufferUtil.bytes((long) (i % 2)), 0);
-            rm.applyUnsafe();
+            colValues[i] = (i % 4 == 0 ? 1L : 2L); // index column
+            colValues[i+1] = 3L; //other column
         }
+        ScrubTest.fillIndexCF(cfs, false, colValues);
 
         cfs.snapshot("nonEphemeralSnapshot", null, false);
         cfs.snapshot("ephemeralSnapshot", null, true);
@@ -1600,7 +335,7 @@
         assertTrue(snapshotDetails.containsKey("ephemeralSnapshot"));
         assertTrue(snapshotDetails.containsKey("nonEphemeralSnapshot"));
 
-        ColumnFamilyStore.clearEphemeralSnapshots(cfs.directories);
+        ColumnFamilyStore.clearEphemeralSnapshots(cfs.getDirectories());
 
         snapshotDetails = cfs.getSnapshotDetails();
         assertEquals(1, snapshotDetails.size());
@@ -1610,755 +345,176 @@
         cfs.clearSnapshot("");
     }
 
-    @SuppressWarnings("unchecked")
     @Test
-    public void testMultiRangeSomeEmptyIndexed() throws Throwable
+    public void testBackupAfterFlush() throws Throwable
     {
-        // in order not to change thrift interfaces at this stage we build SliceQueryFilter
-        // directly instead of using QueryFilter to build it for us
-        ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colA")),
-                new ColumnSlice(cellname("colC"), cellname("colE")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
-
-        ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY,  cellname("colI")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colE"), cellname("colC")),
-                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
-
-        String tableName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace table = Keyspace.open(tableName);
-        ColumnFamilyStore cfs = table.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        String[] letters = new String[] { "a", "b", "c", "d", "i" };
-        Cell[] cols = new Cell[letters.length];
-        for (int i = 0; i < cols.length; i++)
-        {
-            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
-                    ByteBuffer.wrap(new byte[1366]), 1);
-        }
-
-        putColsStandard(cfs, dk("a"), cols);
-
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1);
+        new RowUpdateBuilder(cfs.metadata, 0, ByteBufferUtil.bytes("key1")).clustering("Column1").add("val", "asdf").build().applyUnsafe();
+        cfs.forceBlockingFlush();
+        new RowUpdateBuilder(cfs.metadata, 0, ByteBufferUtil.bytes("key2")).clustering("Column1").add("val", "asdf").build().applyUnsafe();
         cfs.forceBlockingFlush();
 
-        SliceQueryFilter multiRangeForward = new SliceQueryFilter(ranges, false, 100);
-        SliceQueryFilter multiRangeForwardWithCounting = new SliceQueryFilter(ranges, false, 3);
-        SliceQueryFilter multiRangeReverse = new SliceQueryFilter(rangesReversed, true, 100);
-        SliceQueryFilter multiRangeReverseWithCounting = new SliceQueryFilter(rangesReversed, true, 3);
-
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForward, "a", "colA", "colC", "colD", "colI");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForwardWithCounting, "a", "colA", "colC", "colD");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverse, "a", "colI", "colD", "colC", "colA");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverseWithCounting, "a", "colI", "colD", "colC");
+        for (int version = 1; version <= 2; ++version)
+        {
+            Descriptor existing = new Descriptor(cfs.getDirectories().getDirectoryForNewSSTables(), KEYSPACE2, CF_STANDARD1, version);
+            Descriptor desc = new Descriptor(Directories.getBackupsDirectory(existing), KEYSPACE2, CF_STANDARD1, version);
+            for (Component c : new Component[]{ Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.STATS })
+                assertTrue("Cannot find backed-up file:" + desc.filenameFor(c), new File(desc.filenameFor(c)).exists());
+        }
     }
 
-    @SuppressWarnings("unchecked")
-    @Test
-    public void testMultiRangeContiguousNoIndex() throws Throwable
+    // TODO: Fix once we have working supercolumns in 8099
+//    // CASSANDRA-3467.  the key here is that supercolumn and subcolumn comparators are different
+//    @Test
+//    public void testSliceByNamesCommandOnUUIDTypeSCF() throws Throwable
+//    {
+//        String keyspaceName = KEYSPACE1;
+//        String cfName = CF_SUPER6;
+//        ByteBuffer superColName = LexicalUUIDType.instance.fromString("a4ed3562-0e8e-4b41-bdfd-c45a2774682d");
+//        Keyspace keyspace = Keyspace.open(keyspaceName);
+//        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
+//        DecoratedKey key = Util.dk("slice-get-uuid-type");
+//
+//        // Insert a row with one supercolumn and multiple subcolumns
+//        putColsSuper(cfs, key, superColName, new BufferCell(cellname("a"), ByteBufferUtil.bytes("A"), 1),
+//                                             new BufferCell(cellname("b"), ByteBufferUtil.bytes("B"), 1));
+//
+//        // Get the entire supercolumn like normal
+//        ColumnFamily cfGet = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
+//        assertEquals(ByteBufferUtil.bytes("A"), cfGet.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a"))).value());
+//        assertEquals(ByteBufferUtil.bytes("B"), cfGet.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b"))).value());
+//
+//        // Now do the SliceByNamesCommand on the supercolumn, passing both subcolumns in as columns to get
+//        SortedSet<CellName> sliceColNames = new TreeSet<CellName>(cfs.metadata.comparator);
+//        sliceColNames.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a")));
+//        sliceColNames.add(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b")));
+//        SliceByNamesReadCommand cmd = new SliceByNamesReadCommand(keyspaceName, key.getKey(), cfName, System.currentTimeMillis(), new NamesQueryFilter(sliceColNames));
+//        ColumnFamily cfSliced = cmd.getRow(keyspace).cf;
+//
+//        // Make sure the slice returns the same as the straight get
+//        assertEquals(ByteBufferUtil.bytes("A"), cfSliced.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("a"))).value());
+//        assertEquals(ByteBufferUtil.bytes("B"), cfSliced.getColumn(CellNames.compositeDense(superColName, ByteBufferUtil.bytes("b"))).value());
+//    }
+
+
+    // TODO: Fix once SSTableSimpleWriter's back in
+    // @see <a href="https://issues.apache.org/jira/browse/CASSANDRA-6086">CASSANDRA-6086</a>
+
+
+    // TODO: Fix once SSTableSimpleWriter's back in
+//    @Test
+//    public void testLoadNewSSTablesAvoidsOverwrites() throws Throwable
+//    {
+//        String ks = KEYSPACE1;
+//        String cf = CF_STANDARD1;
+//        ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore(cf);
+//        SSTableDeletingTask.waitForDeletions();
+//
+//        final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
+//        Directories dir = new Directories(cfs.metadata);
+//
+//        // clear old SSTables (probably left by CFS.clearUnsafe() calls in other tests)
+//        for (Map.Entry<Descriptor, Set<Component>> entry : dir.sstableLister().list().entrySet())
+//        {
+//            for (Component component : entry.getValue())
+//            {
+//                FileUtils.delete(entry.getKey().filenameFor(component));
+//            }
+//        }
+//
+//        // sanity check
+//        int existingSSTables = dir.sstableLister().list().keySet().size();
+//        assert existingSSTables == 0 : String.format("%d SSTables unexpectedly exist", existingSSTables);
+//
+//        ByteBuffer key = bytes("key");
+//
+//        SSTableSimpleWriter writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
+//                                                             cfmeta, StorageService.getPartitioner())
+//        {
+//            @Override
+//            protected SSTableWriter getWriter()
+//            {
+//                // hack for reset generation
+//                generation.set(0);
+//                return super.getWriter();
+//            }
+//        };
+//        writer.newRow(key);
+//        writer.addColumn(bytes("col"), bytes("val"), 1);
+//        writer.close();
+//
+//        writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
+//                                         cfmeta, StorageService.getPartitioner());
+//        writer.newRow(key);
+//        writer.addColumn(bytes("col"), bytes("val"), 1);
+//        writer.close();
+//
+//        Set<Integer> generations = new HashSet<>();
+//        for (Descriptor descriptor : dir.sstableLister().list().keySet())
+//            generations.add(descriptor.generation);
+//
+//        // we should have two generations: [1, 2]
+//        assertEquals(2, generations.size());
+//        assertTrue(generations.contains(1));
+//        assertTrue(generations.contains(2));
+//
+//        assertEquals(0, cfs.getLiveSSTables().size());
+//
+//        // start the generation counter at 1 again (other tests have incremented it already)
+//        cfs.resetFileIndexGenerator();
+//
+//        boolean incrementalBackupsEnabled = DatabaseDescriptor.isIncrementalBackupsEnabled();
+//        try
+//        {
+//            // avoid duplicate hardlinks to incremental backups
+//            DatabaseDescriptor.setIncrementalBackupsEnabled(false);
+//            cfs.loadNewSSTables();
+//        }
+//        finally
+//        {
+//            DatabaseDescriptor.setIncrementalBackupsEnabled(incrementalBackupsEnabled);
+//        }
+//
+//        assertEquals(2, cfs.getLiveSSTables().size());
+//        generations = new HashSet<>();
+//        for (Descriptor descriptor : dir.sstableLister().list().keySet())
+//            generations.add(descriptor.generation);
+//
+//        // normally they would get renamed to generations 1 and 2, but since those filenames already exist,
+//        // they get skipped and we end up with generations 3 and 4
+//        assertEquals(2, generations.size());
+//        assertTrue(generations.contains(3));
+//        assertTrue(generations.contains(4));
+//    }
+
+    public void reTest(ColumnFamilyStore cfs, Runnable verify) throws Exception
     {
-        // in order not to change thrift interfaces at this stage we build SliceQueryFilter
-        // directly instead of using QueryFilter to build it for us
-        ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colA")),
-                new ColumnSlice(cellname("colC"), cellname("colE")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
-
-        ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colI")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colE"), cellname("colC")),
-                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
-
-        String tableName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace table = Keyspace.open(tableName);
-        ColumnFamilyStore cfs = table.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i" };
-        Cell[] cols = new Cell[letters.length];
-        for (int i = 0; i < cols.length; i++)
-        {
-            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
-                    ByteBuffer.wrap(new byte[1]), 1);
-        }
-
-        putColsStandard(cfs, dk("a"), cols);
-
+        verify.run();
         cfs.forceBlockingFlush();
-
-        SliceQueryFilter multiRangeForward = new SliceQueryFilter(ranges, false, 100);
-        SliceQueryFilter multiRangeForwardWithCounting = new SliceQueryFilter(ranges, false, 3);
-        SliceQueryFilter multiRangeReverse = new SliceQueryFilter(rangesReversed, true, 100);
-        SliceQueryFilter multiRangeReverseWithCounting = new SliceQueryFilter(rangesReversed, true, 3);
-
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForward, "a", "colA", "colC", "colD", "colE", "colF", "colG", "colI");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForwardWithCounting, "a", "colA", "colC", "colD");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverse, "a", "colI", "colG", "colF", "colE", "colD", "colC", "colA");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverseWithCounting, "a", "colI", "colG", "colF");
-
+        verify.run();
     }
 
-    @SuppressWarnings("unchecked")
-    @Test
-    public void testMultiRangeContiguousIndexed() throws Throwable
+    private void assertRangeCount(ColumnFamilyStore cfs, ByteBuffer col, ByteBuffer val, int count)
     {
-        // in order not to change thrift interfaces at this stage we build SliceQueryFilter
-        // directly instead of using QueryFilter to build it for us
-        ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colA")),
-                new ColumnSlice(cellname("colC"), cellname("colE")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
+        assertRangeCount(cfs, cfs.metadata.getColumnDefinition(col), val, count);
+    }
 
-        ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colI")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colF"), cellname("colF")),
-                new ColumnSlice(cellname("colE"), cellname("colC")),
-                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
+    private void assertRangeCount(ColumnFamilyStore cfs, ColumnDefinition col, ByteBuffer val, int count)
+    {
 
-        String tableName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace table = Keyspace.open(tableName);
-        ColumnFamilyStore cfs = table.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i" };
-        Cell[] cols = new Cell[letters.length];
-        for (int i = 0; i < cols.length; i++)
+        int found = 0;
+        if (count != 0)
         {
-            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
-                    ByteBuffer.wrap(new byte[1366]), 1);
-        }
-
-        putColsStandard(cfs, dk("a"), cols);
-
-        cfs.forceBlockingFlush();
-
-        SliceQueryFilter multiRangeForward = new SliceQueryFilter(ranges, false, 100);
-        SliceQueryFilter multiRangeForwardWithCounting = new SliceQueryFilter(ranges, false, 3);
-        SliceQueryFilter multiRangeReverse = new SliceQueryFilter(rangesReversed, true, 100);
-        SliceQueryFilter multiRangeReverseWithCounting = new SliceQueryFilter(rangesReversed, true, 3);
-
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForward, "a", "colA", "colC", "colD", "colE", "colF", "colG", "colI");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForwardWithCounting, "a", "colA", "colC", "colD");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverse, "a", "colI", "colG", "colF", "colE", "colD", "colC", "colA");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverseWithCounting, "a", "colI", "colG", "colF");
-
-    }
-
-    @SuppressWarnings("unchecked")
-    @Test
-    public void testMultiRangeIndexed() throws Throwable
-    {
-        // in order not to change thrift interfaces at this stage we build SliceQueryFilter
-        // directly instead of using QueryFilter to build it for us
-        ColumnSlice[] ranges = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colA")),
-                new ColumnSlice(cellname("colC"), cellname("colE")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colI"), Composites.EMPTY) };
-
-        ColumnSlice[] rangesReversed = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colI")),
-                new ColumnSlice(cellname("colG"), cellname("colG")),
-                new ColumnSlice(cellname("colE"), cellname("colC")),
-                new ColumnSlice(cellname("colA"), Composites.EMPTY) };
-
-        String keyspaceName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i" };
-        Cell[] cols = new Cell[letters.length];
-        for (int i = 0; i < cols.length; i++)
-        {
-            cols[i] = new BufferCell(cellname("col" + letters[i].toUpperCase()),
-                    // use 1366 so that three cols make an index segment
-                    ByteBuffer.wrap(new byte[1366]), 1);
-        }
-
-        putColsStandard(cfs, dk("a"), cols);
-
-        cfs.forceBlockingFlush();
-
-        // this setup should generate the following row (assuming indexes are of 4Kb each):
-        // [colA, colB, colC, colD, colE, colF, colG, colH, colI]
-        // indexed as:
-        // index0 [colA, colC]
-        // index1 [colD, colF]
-        // index2 [colG, colI]
-        // and we're looking for the ranges:
-        // range0 [____, colA]
-        // range1 [colC, colE]
-        // range2 [colG, ColG]
-        // range3 [colI, ____]
-
-        SliceQueryFilter multiRangeForward = new SliceQueryFilter(ranges, false, 100);
-        SliceQueryFilter multiRangeForwardWithCounting = new SliceQueryFilter(ranges, false, 3);
-        SliceQueryFilter multiRangeReverse = new SliceQueryFilter(rangesReversed, true, 100);
-        SliceQueryFilter multiRangeReverseWithCounting = new SliceQueryFilter(rangesReversed, true, 3);
-
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForward, "a", "colA", "colC", "colD", "colE", "colG", "colI");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeForwardWithCounting, "a", "colA", "colC", "colD");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverse, "a", "colI", "colG", "colE", "colD", "colC", "colA");
-        findRowGetSlicesAndAssertColsFound(cfs, multiRangeReverseWithCounting, "a", "colI", "colG", "colE");
-
-    }
-
-    @Test
-    public void testMultipleRangesSlicesNoIndexedColumns() throws Throwable
-    {
-        // small values so that cols won't be indexed
-        testMultiRangeSlicesBehavior(prepareMultiRangeSlicesTest(10, true));
-    }
-
-    @Test
-    public void testMultipleRangesSlicesWithIndexedColumns() throws Throwable
-    {
-        // min val size before cols are indexed is 4kb while testing so lets make sure cols are indexed
-        testMultiRangeSlicesBehavior(prepareMultiRangeSlicesTest(1024, true));
-    }
-
-    @Test
-    public void testMultipleRangesSlicesInMemory() throws Throwable
-    {
-        // small values so that cols won't be indexed
-        testMultiRangeSlicesBehavior(prepareMultiRangeSlicesTest(10, false));
-    }
-
-    @Test
-    public void testRemoveUnfinishedCompactionLeftovers() throws Throwable
-    {
-        String ks = KEYSPACE1;
-        String cf = CF_STANDARD3; // should be empty
-
-        final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
-        Directories dir = new Directories(cfmeta);
-        ByteBuffer key = bytes("key");
-
-        // 1st sstable
-        SSTableSimpleWriter writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(), cfmeta, StorageService.getPartitioner());
-        writer.newRow(key);
-        writer.addColumn(bytes("col"), bytes("val"), 1);
-        writer.close();
-
-        Map<Descriptor, Set<Component>> sstables = dir.sstableLister().list();
-        assertEquals(1, sstables.size());
-
-        Map.Entry<Descriptor, Set<Component>> sstableToOpen = sstables.entrySet().iterator().next();
-        final SSTableReader sstable1 = SSTableReader.open(sstableToOpen.getKey());
-
-        // simulate incomplete compaction
-        writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
-                                         cfmeta, StorageService.getPartitioner())
-        {
-            protected SSTableWriter getWriter()
+            for (FilteredPartition partition : Util.getAll(Util.cmd(cfs).filterOn(col.name.toString(), Operator.EQ, val).build()))
             {
-                MetadataCollector collector = new MetadataCollector(cfmeta.comparator);
-                collector.addAncestor(sstable1.descriptor.generation); // add ancestor from previously written sstable
-                return SSTableWriter.create(createDescriptor(directory, metadata.ksName, metadata.cfName, DatabaseDescriptor.getSSTableFormat()),
-                        0L,
-                        ActiveRepairService.UNREPAIRED_SSTABLE,
-                        metadata,
-                        DatabaseDescriptor.getPartitioner(),
-                        collector);
-            }
-        };
-        writer.newRow(key);
-        writer.addColumn(bytes("col"), bytes("val"), 1);
-        writer.close();
-
-        // should have 2 sstables now
-        sstables = dir.sstableLister().list();
-        assertEquals(2, sstables.size());
-
-        SSTableReader sstable2 = SSTableReader.open(sstable1.descriptor);
-        UUID compactionTaskID = SystemKeyspace.startCompaction(
-                Keyspace.open(ks).getColumnFamilyStore(cf),
-                Collections.singleton(sstable2));
-
-        Map<Integer, UUID> unfinishedCompaction = new HashMap<>();
-        unfinishedCompaction.put(sstable1.descriptor.generation, compactionTaskID);
-        ColumnFamilyStore.removeUnfinishedCompactionLeftovers(cfmeta, unfinishedCompaction);
-
-        // 2nd sstable should be removed (only 1st sstable exists in set of size 1)
-        sstables = dir.sstableLister().list();
-        assertEquals(1, sstables.size());
-        assertTrue(sstables.containsKey(sstable1.descriptor));
-
-        Map<Pair<String, String>, Map<Integer, UUID>> unfinished = SystemKeyspace.getUnfinishedCompactions();
-        assertTrue(unfinished.isEmpty());
-        sstable1.selfRef().release();
-        sstable2.selfRef().release();
-    }
-
-    /**
-     * @see <a href="https://issues.apache.org/jira/browse/CASSANDRA-6086">CASSANDRA-6086</a>
-     */
-    @Test
-    public void testFailedToRemoveUnfinishedCompactionLeftovers() throws Throwable
-    {
-        final String ks = KEYSPACE1;
-        final String cf = CF_STANDARD4; // should be empty
-
-        final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
-        Directories dir = new Directories(cfmeta);
-        ByteBuffer key = bytes("key");
-
-        // Write SSTable generation 3 that has ancestors 1 and 2
-        final Set<Integer> ancestors = Sets.newHashSet(1, 2);
-        SSTableSimpleWriter writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
-                                                cfmeta, StorageService.getPartitioner())
-        {
-            protected SSTableWriter getWriter()
-            {
-                MetadataCollector collector = new MetadataCollector(cfmeta.comparator);
-                for (int ancestor : ancestors)
-                    collector.addAncestor(ancestor);
-                String file = new Descriptor(directory, ks, cf, 3, Descriptor.Type.TEMP).filenameFor(Component.DATA);
-                return SSTableWriter.create(Descriptor.fromFilename(file),
-                        0L,
-                        ActiveRepairService.UNREPAIRED_SSTABLE,
-                        metadata,
-                        StorageService.getPartitioner(),
-                        collector);
-            }
-        };
-        writer.newRow(key);
-        writer.addColumn(bytes("col"), bytes("val"), 1);
-        writer.close();
-
-        Map<Descriptor, Set<Component>> sstables = dir.sstableLister().list();
-        assert sstables.size() == 1;
-
-        Map.Entry<Descriptor, Set<Component>> sstableToOpen = sstables.entrySet().iterator().next();
-        final SSTableReader sstable1 = SSTableReader.open(sstableToOpen.getKey());
-
-        // simulate we don't have generation in compaction_history
-        Map<Integer, UUID> unfinishedCompactions = new HashMap<>();
-        UUID compactionTaskID = UUID.randomUUID();
-        for (Integer ancestor : ancestors)
-            unfinishedCompactions.put(ancestor, compactionTaskID);
-        ColumnFamilyStore.removeUnfinishedCompactionLeftovers(cfmeta, unfinishedCompactions);
-
-        // SSTable should not be deleted
-        sstables = dir.sstableLister().list();
-        assert sstables.size() == 1;
-        assert sstables.containsKey(sstable1.descriptor);
-    }
-
-    @Test
-    public void testLoadNewSSTablesAvoidsOverwrites() throws Throwable
-    {
-        String ks = KEYSPACE1;
-        String cf = CF_STANDARD5;
-        ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore(cf);
-        cfs.truncateBlocking();
-        SSTableDeletingTask.waitForDeletions();
-
-        final CFMetaData cfmeta = Schema.instance.getCFMetaData(ks, cf);
-        Directories dir = new Directories(cfs.metadata);
-
-        // clear old SSTables (probably left by CFS.clearUnsafe() calls in other tests)
-        for (Map.Entry<Descriptor, Set<Component>> entry : dir.sstableLister().list().entrySet())
-        {
-            for (Component component : entry.getValue())
-            {
-                FileUtils.delete(entry.getKey().filenameFor(component));
-            }
-        }
-
-        // sanity check
-        int existingSSTables = dir.sstableLister().list().keySet().size();
-        assert existingSSTables == 0 : String.format("%d SSTables unexpectedly exist", existingSSTables);
-
-        ByteBuffer key = bytes("key");
-
-        SSTableSimpleWriter writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
-                                                             cfmeta, StorageService.getPartitioner())
-        {
-            @Override
-            protected SSTableWriter getWriter()
-            {
-                // hack for reset generation
-                generation.set(0);
-                return super.getWriter();
-            }
-        };
-        writer.newRow(key);
-        writer.addColumn(bytes("col"), bytes("val"), 1);
-        writer.close();
-
-        writer = new SSTableSimpleWriter(dir.getDirectoryForNewSSTables(),
-                                         cfmeta, StorageService.getPartitioner());
-        writer.newRow(key);
-        writer.addColumn(bytes("col"), bytes("val"), 1);
-        writer.close();
-
-        Set<Integer> generations = new HashSet<>();
-        for (Descriptor descriptor : dir.sstableLister().list().keySet())
-            generations.add(descriptor.generation);
-
-        // we should have two generations: [1, 2]
-        assertEquals(2, generations.size());
-        assertTrue(generations.contains(1));
-        assertTrue(generations.contains(2));
-
-        assertEquals(0, cfs.getSSTables().size());
-
-        // start the generation counter at 1 again (other tests have incremented it already)
-        cfs.resetFileIndexGenerator();
-
-        boolean incrementalBackupsEnabled = DatabaseDescriptor.isIncrementalBackupsEnabled();
-        try
-        {
-            // avoid duplicate hardlinks to incremental backups
-            DatabaseDescriptor.setIncrementalBackupsEnabled(false);
-            cfs.loadNewSSTables();
-        }
-        finally
-        {
-            DatabaseDescriptor.setIncrementalBackupsEnabled(incrementalBackupsEnabled);
-        }
-
-        assertEquals(2, cfs.getSSTables().size());
-        generations = new HashSet<>();
-        for (Descriptor descriptor : dir.sstableLister().list().keySet())
-            generations.add(descriptor.generation);
-
-        // normally they would get renamed to generations 1 and 2, but since those filenames already exist,
-        // they get skipped and we end up with generations 3 and 4
-        assertEquals(2, generations.size());
-        assertTrue(generations.contains(3));
-        assertTrue(generations.contains(4));
-    }
-
-    private ColumnFamilyStore prepareMultiRangeSlicesTest(int valueSize, boolean flush) throws Throwable
-    {
-        String keyspaceName = KEYSPACE1;
-        String cfName = CF_STANDARD1;
-        Keyspace keyspace = Keyspace.open(keyspaceName);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        cfs.clearUnsafe();
-
-        String[] letters = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l" };
-        Cell[] cols = new Cell[12];
-        for (int i = 0; i < cols.length; i++)
-        {
-            cols[i] = new BufferCell(cellname("col" + letters[i]), ByteBuffer.wrap(new byte[valueSize]), 1);
-        }
-
-        for (int i = 0; i < 12; i++)
-        {
-            putColsStandard(cfs, dk(letters[i]), Arrays.copyOfRange(cols, 0, i + 1));
-        }
-
-        if (flush)
-        {
-            cfs.forceBlockingFlush();
-        }
-        else
-        {
-            // The intent is to validate memtable code, so check we really didn't flush
-            assert cfs.getSSTables().isEmpty();
-        }
-
-        return cfs;
-    }
-
-    private void testMultiRangeSlicesBehavior(ColumnFamilyStore cfs)
-    {
-        // in order not to change thrift interfaces at this stage we build SliceQueryFilter
-        // directly instead of using QueryFilter to build it for us
-        ColumnSlice[] startMiddleAndEndRanges = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colc")),
-                new ColumnSlice(cellname("colf"), cellname("colg")),
-                new ColumnSlice(cellname("colj"), Composites.EMPTY) };
-
-        ColumnSlice[] startMiddleAndEndRangesReversed = new ColumnSlice[] {
-                new ColumnSlice(Composites.EMPTY, cellname("colj")),
-                new ColumnSlice(cellname("colg"), cellname("colf")),
-                new ColumnSlice(cellname("colc"), Composites.EMPTY) };
-
-        ColumnSlice[] startOnlyRange =
-                new ColumnSlice[] { new ColumnSlice(Composites.EMPTY, cellname("colc")) };
-
-        ColumnSlice[] startOnlyRangeReversed =
-                new ColumnSlice[] { new ColumnSlice(cellname("colc"), Composites.EMPTY) };
-
-        ColumnSlice[] middleOnlyRanges =
-                new ColumnSlice[] { new ColumnSlice(cellname("colf"), cellname("colg")) };
-
-        ColumnSlice[] middleOnlyRangesReversed =
-                new ColumnSlice[] { new ColumnSlice(cellname("colg"), cellname("colf")) };
-
-        ColumnSlice[] endOnlyRanges =
-                new ColumnSlice[] { new ColumnSlice(cellname("colj"), Composites.EMPTY) };
-
-        ColumnSlice[] endOnlyRangesReversed =
-                new ColumnSlice[] { new ColumnSlice(Composites.EMPTY, cellname("colj")) };
-
-        SliceQueryFilter startOnlyFilter = new SliceQueryFilter(startOnlyRange, false,
-                Integer.MAX_VALUE);
-        SliceQueryFilter startOnlyFilterReversed = new SliceQueryFilter(startOnlyRangeReversed, true,
-                Integer.MAX_VALUE);
-        SliceQueryFilter startOnlyFilterWithCounting = new SliceQueryFilter(startOnlyRange, false, 1);
-        SliceQueryFilter startOnlyFilterReversedWithCounting = new SliceQueryFilter(startOnlyRangeReversed,
-                true, 1);
-
-        SliceQueryFilter middleOnlyFilter = new SliceQueryFilter(middleOnlyRanges,
-                false,
-                Integer.MAX_VALUE);
-        SliceQueryFilter middleOnlyFilterReversed = new SliceQueryFilter(middleOnlyRangesReversed, true,
-                Integer.MAX_VALUE);
-        SliceQueryFilter middleOnlyFilterWithCounting = new SliceQueryFilter(middleOnlyRanges, false, 1);
-        SliceQueryFilter middleOnlyFilterReversedWithCounting = new SliceQueryFilter(middleOnlyRangesReversed,
-                true, 1);
-
-        SliceQueryFilter endOnlyFilter = new SliceQueryFilter(endOnlyRanges, false,
-                Integer.MAX_VALUE);
-        SliceQueryFilter endOnlyReversed = new SliceQueryFilter(endOnlyRangesReversed, true,
-                Integer.MAX_VALUE);
-        SliceQueryFilter endOnlyWithCounting = new SliceQueryFilter(endOnlyRanges, false, 1);
-        SliceQueryFilter endOnlyWithReversedCounting = new SliceQueryFilter(endOnlyRangesReversed,
-                true, 1);
-
-        SliceQueryFilter startMiddleAndEndFilter = new SliceQueryFilter(startMiddleAndEndRanges, false,
-                Integer.MAX_VALUE);
-        SliceQueryFilter startMiddleAndEndFilterReversed = new SliceQueryFilter(startMiddleAndEndRangesReversed, true,
-                Integer.MAX_VALUE);
-        SliceQueryFilter startMiddleAndEndFilterWithCounting = new SliceQueryFilter(startMiddleAndEndRanges, false,
-                1);
-        SliceQueryFilter startMiddleAndEndFilterReversedWithCounting = new SliceQueryFilter(
-                startMiddleAndEndRangesReversed, true,
-                1);
-
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilter, "a", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversed, "a", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterWithCounting, "a", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversedWithCounting, "a", "cola");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilter, "a", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversed, "a", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterWithCounting, "a", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversedWithCounting, "a", new String[] {});
-
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyFilter, "a", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyReversed, "a", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithCounting, "a", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithReversedCounting, "a", new String[] {});
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilter, "a", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversed, "a", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterWithCounting, "a", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversedWithCounting, "a", "cola");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilter, "c", "cola", "colb", "colc");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversed, "c", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterWithCounting, "c", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversedWithCounting, "c", "colc");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilter, "c", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversed, "c", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterWithCounting, "c", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversedWithCounting, "c", new String[] {});
-
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyFilter, "c", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyReversed, "c", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithCounting, "c", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithReversedCounting, "c", new String[] {});
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilter, "c", "cola", "colb", "colc");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversed, "c", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterWithCounting, "c", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversedWithCounting, "c", "colc");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilter, "f", "cola", "colb", "colc");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversed, "f", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterWithCounting, "f", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversedWithCounting, "f", "colc");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilter, "f", "colf");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversed, "f", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterWithCounting, "f", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversedWithCounting, "f", "colf");
-
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyFilter, "f", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyReversed, "f", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithCounting, "f", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithReversedCounting, "f", new String[] {});
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilter, "f", "cola", "colb", "colc", "colf");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversed, "f", "colf", "colc", "colb",
-                "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterWithCounting, "f", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversedWithCounting, "f", "colf");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilter, "h", "cola", "colb", "colc");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversed, "h", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterWithCounting, "h", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversedWithCounting, "h", "colc");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilter, "h", "colf", "colg");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversed, "h", "colg", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterWithCounting, "h", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversedWithCounting, "h", "colg");
-
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyFilter, "h", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyReversed, "h", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithCounting, "h", new String[] {});
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithReversedCounting, "h", new String[] {});
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilter, "h", "cola", "colb", "colc", "colf",
-                "colg");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversed, "h", "colg", "colf", "colc", "colb",
-                "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterWithCounting, "h", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversedWithCounting, "h", "colg");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilter, "j", "cola", "colb", "colc");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversed, "j", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterWithCounting, "j", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversedWithCounting, "j", "colc");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilter, "j", "colf", "colg");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversed, "j", "colg", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterWithCounting, "j", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversedWithCounting, "j", "colg");
-
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyFilter, "j", "colj");
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyReversed, "j", "colj");
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithCounting, "j", "colj");
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithReversedCounting, "j", "colj");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilter, "j", "cola", "colb", "colc", "colf", "colg",
-                "colj");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversed, "j", "colj", "colg", "colf", "colc",
-                "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterWithCounting, "j", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversedWithCounting, "j", "colj");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilter, "l", "cola", "colb", "colc");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversed, "l", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterWithCounting, "l", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startOnlyFilterReversedWithCounting, "l", "colc");
-
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilter, "l", "colf", "colg");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversed, "l", "colg", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterWithCounting, "l", "colf");
-        findRowGetSlicesAndAssertColsFound(cfs, middleOnlyFilterReversedWithCounting, "l", "colg");
-
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyFilter, "l", "colj", "colk", "coll");
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyReversed, "l", "coll", "colk", "colj");
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithCounting, "l", "colj");
-        findRowGetSlicesAndAssertColsFound(cfs, endOnlyWithReversedCounting, "l", "coll");
-
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilter, "l", "cola", "colb", "colc", "colf", "colg",
-                "colj", "colk", "coll");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversed, "l", "coll", "colk", "colj", "colg",
-                "colf", "colc", "colb", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterWithCounting, "l", "cola");
-        findRowGetSlicesAndAssertColsFound(cfs, startMiddleAndEndFilterReversedWithCounting, "l", "coll");
-    }
-
-    private void findRowGetSlicesAndAssertColsFound(ColumnFamilyStore cfs, SliceQueryFilter filter, String rowKey,
-            String... colNames)
-    {
-        List<Row> rows = cfs.getRangeSlice(new Bounds<RowPosition>(rp(rowKey), rp(rowKey)),
-                                           null,
-                                           filter,
-                                           Integer.MAX_VALUE,
-                                           System.currentTimeMillis(),
-                                           false,
-                                           false);
-        assertSame("unexpected number of rows ", 1, rows.size());
-        Row row = rows.get(0);
-        Collection<Cell> cols = !filter.isReversed() ? row.cf.getSortedColumns() : row.cf.getReverseSortedColumns();
-        // printRow(cfs, new String(row.key.key.array()), cols);
-        String[] returnedColsNames = Iterables.toArray(Iterables.transform(cols, new Function<Cell, String>()
-        {
-            public String apply(Cell arg0)
-            {
-                return Util.string(arg0.name().toByteBuffer());
-            }
-        }), String.class);
-
-        assertTrue(
-                "Columns did not match. Expected: " + Arrays.toString(colNames) + " but got:"
-                        + Arrays.toString(returnedColsNames), Arrays.equals(colNames, returnedColsNames));
-        int i = 0;
-        for (Cell col : cols)
-        {
-            assertEquals(colNames[i++], Util.string(col.name().toByteBuffer()));
-        }
-    }
-
-    private void printRow(ColumnFamilyStore cfs, String rowKey, Collection<Cell> cols)
-    {
-        DecoratedKey ROW = Util.dk(rowKey);
-        System.err.println("Original:");
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(ROW, CF_STANDARD1, System.currentTimeMillis()));
-        System.err.println("Row key: " + rowKey + " Cols: "
-                + Iterables.transform(cf.getSortedColumns(), new Function<Cell, String>()
+                for (Row r : partition)
                 {
-                    public String apply(Cell arg0)
-                    {
-                        return Util.string(arg0.name().toByteBuffer());
-                    }
-                }));
-        System.err.println("Filtered:");
-        Iterable<String> transformed = Iterables.transform(cols, new Function<Cell, String>()
-        {
-            public String apply(Cell arg0)
-            {
-                return Util.string(arg0.name().toByteBuffer());
+                    if (r.getCell(col).value().equals(val))
+                        ++found;
+                }
             }
-        });
-        System.err.println("Row key: " + rowKey + " Cols: " + transformed);
-    }
-
-    @Test
-    public void testRebuildSecondaryIndex() throws IOException
-    {
-        CellName indexedCellName = cellname("indexed");
-        Mutation rm;
-
-        rm = new Mutation(KEYSPACE4, ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", indexedCellName, ByteBufferUtil.bytes("foo"), 1);
-
-        rm.apply();
-        assertTrue(Arrays.equals("k1".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
-
-        Keyspace.open("PerRowSecondaryIndex").getColumnFamilyStore("Indexed1").forceBlockingFlush();
-
-        PerRowSecondaryIndexTest.TestIndex.reset();
-
-        ColumnFamilyStore.rebuildSecondaryIndex("PerRowSecondaryIndex", "Indexed1", PerRowSecondaryIndexTest.TestIndex.class.getSimpleName());
-        assertTrue(Arrays.equals("k1".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
-
-        PerRowSecondaryIndexTest.TestIndex.reset();
-        PerRowSecondaryIndexTest.TestIndex.ACTIVE = false;
-        ColumnFamilyStore.rebuildSecondaryIndex("PerRowSecondaryIndex", "Indexed1", PerRowSecondaryIndexTest.TestIndex.class.getSimpleName());
-        assertNull(PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY);
-
-        PerRowSecondaryIndexTest.TestIndex.reset();
+        }
+        assertEquals(count, found);
     }
 
     @Test
@@ -2368,19 +524,17 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_INDEX1);
         cfs.truncateBlocking();
 
-        List<Mutation> rms = new LinkedList<>();
-        Mutation rm;
-
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        rm.add(CF_INDEX1, cellname("birthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.add(CF_INDEX1, cellname("nobirthdate"), ByteBufferUtil.bytes(1L), 0);
-        rms.add(rm);
-        Util.writeColumnFamily(rms);
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, "key")
+                                             .newRow()
+                                             .add("birthdate", 1L)
+                                             .add("notbirthdate", 2L);
+        new Mutation(builder.build()).applyUnsafe();
+        cfs.forceBlockingFlush();
 
         String snapshotName = "newSnapshot";
         cfs.snapshotWithoutFlush(snapshotName);
 
-        File snapshotManifestFile = cfs.directories.getSnapshotManifestFile(snapshotName);
+        File snapshotManifestFile = cfs.getDirectories().getSnapshotManifestFile(snapshotName);
         JSONParser parser = new JSONParser();
         JSONObject manifest = (JSONObject) parser.parse(new FileReader(snapshotManifestFile));
         JSONArray files = (JSONArray) manifest.get("files");
@@ -2395,4 +549,33 @@
         assert Directories.isSecondaryIndexFolder(new File(indexTableFile).getParentFile());
         assert indexTableFile.endsWith(baseTableFile);
     }
+
+
+    @Test
+    public void testScrubDataDirectories() throws Throwable
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+
+        ColumnFamilyStore.scrubDataDirectories(cfs.metadata);
+
+        new RowUpdateBuilder(cfs.metadata, 2, "key").clustering("name").add("val", "2").build().applyUnsafe();
+        cfs.forceBlockingFlush();
+
+        // Nuke the metadata and reload that sstable
+        Collection<SSTableReader> ssTables = cfs.getLiveSSTables();
+        assertEquals(1, ssTables.size());
+        SSTableReader ssTable = ssTables.iterator().next();
+
+        String dataFileName = ssTable.descriptor.filenameFor(Component.DATA);
+        String tmpDataFileName = ssTable.descriptor.tmpFilenameFor(Component.DATA);
+        new File(dataFileName).renameTo(new File(tmpDataFileName));
+
+        ssTable.selfRef().release();
+
+        ColumnFamilyStore.scrubDataDirectories(cfs.metadata);
+
+        List<File> ssTableFiles = new Directories(cfs.metadata).sstableLister(Directories.OnTxnErr.THROW).listFiles();
+        assertNotNull(ssTableFiles);
+        assertEquals(0, ssTableFiles.size());
+    }
 }

diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java
deleted file mode 100644
index 72ddd40..0000000
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyTest.java
+++ /dev/null

@@ -1,277 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.TreeMap;
-
-import com.google.common.collect.Iterables;
-import org.apache.cassandra.config.CFMetaData;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.CounterId;
-import org.apache.cassandra.utils.FBUtilities;
-
-import static junit.framework.Assert.assertTrue;
-
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.tombstone;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-
-public class ColumnFamilyTest
-{
-    static int version = MessagingService.current_version;
-    private static final String KEYSPACE1 = "Keyspace1";
-    private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_COUNTER1 = "Counter1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER1)
-                                                .defaultValidator(CounterColumnType.instance));
-    }
-
-    // TODO test SuperColumns more
-
-    @Test
-    public void testSingleColumn() throws IOException
-    {
-        ColumnFamily cf;
-
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        cf.addColumn(column("C", "v", 1));
-        DataOutputBuffer bufOut = new DataOutputBuffer();
-        ColumnFamily.serializer.serialize(cf, bufOut, version);
-
-        ByteArrayInputStream bufIn = new ByteArrayInputStream(bufOut.getData(), 0, bufOut.getLength());
-        cf = ColumnFamily.serializer.deserialize(new DataInputStream(bufIn), version);
-        assert cf != null;
-        assert cf.metadata().cfName.equals(CF_STANDARD1);
-        assert cf.getSortedColumns().size() == 1;
-    }
-
-    @Test
-    public void testManyColumns() throws IOException
-    {
-        ColumnFamily cf;
-
-        TreeMap<String, String> map = new TreeMap<>();
-        for (int i = 100; i < 1000; ++i)
-        {
-            map.put(Integer.toString(i), "Avinash Lakshman is a good man: " + i);
-        }
-
-        // write
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        DataOutputBuffer bufOut = new DataOutputBuffer();
-        for (String cName : map.navigableKeySet())
-        {
-            cf.addColumn(column(cName, map.get(cName), 314));
-        }
-        ColumnFamily.serializer.serialize(cf, bufOut, version);
-
-        // verify
-        ByteArrayInputStream bufIn = new ByteArrayInputStream(bufOut.getData(), 0, bufOut.getLength());
-        cf = ColumnFamily.serializer.deserialize(new DataInputStream(bufIn), version);
-        for (String cName : map.navigableKeySet())
-        {
-            ByteBuffer val = cf.getColumn(cellname(cName)).value();
-            assert new String(val.array(),val.position(),val.remaining()).equals(map.get(cName));
-        }
-        assert Iterables.size(cf.getColumnNames()) == map.size();
-    }
-
-    @Test
-    public void testGetColumnCount()
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-
-        cf.addColumn(column("col1", "", 1));
-        cf.addColumn(column("col2", "", 2));
-        cf.addColumn(column("col1", "", 3));
-
-        assert 2 == cf.getColumnCount();
-        assert 2 == cf.getSortedColumns().size();
-    }
-
-    @Test
-    public void testDigest()
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-
-        ByteBuffer digest = ColumnFamily.digest(cf);
-
-        cf.addColumn(column("col1", "", 1));
-        cf2.addColumn(column("col1", "", 1));
-
-        assert !digest.equals(ColumnFamily.digest(cf));
-
-        digest = ColumnFamily.digest(cf);
-        assert digest.equals(ColumnFamily.digest(cf2));
-
-        cf.addColumn(column("col2", "", 2));
-        assert !digest.equals(ColumnFamily.digest(cf));
-
-        digest = ColumnFamily.digest(cf);
-        cf.addColumn(column("col1", "", 3));
-        assert !digest.equals(ColumnFamily.digest(cf));
-
-        digest = ColumnFamily.digest(cf);
-        cf.delete(new DeletionTime(4, 4));
-        assert !digest.equals(ColumnFamily.digest(cf));
-
-        digest = ColumnFamily.digest(cf);
-        cf.delete(tombstone("col1", "col11", 5, 5));
-        assert !digest.equals(ColumnFamily.digest(cf));
-
-        digest = ColumnFamily.digest(cf);
-        assert digest.equals(ColumnFamily.digest(cf));
-
-        cf.delete(tombstone("col2", "col21", 5, 5));
-        assert !digest.equals(ColumnFamily.digest(cf));
-
-        digest = ColumnFamily.digest(cf);
-        cf.delete(tombstone("col1", "col11", 5, 5)); // this does not change RangeTombstoneLList
-        assert digest.equals(ColumnFamily.digest(cf));
-    }
-
-    @Test
-    public void testTimestamp()
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-
-        cf.addColumn(column("col1", "val1", 2));
-        cf.addColumn(column("col1", "val2", 2)); // same timestamp, new value
-        cf.addColumn(column("col1", "val3", 1)); // older timestamp -- should be ignored
-
-        assert ByteBufferUtil.bytes("val2").equals(cf.getColumn(cellname("col1")).value());
-    }
-
-    @Test
-    public void testMergeAndAdd()
-    {
-        ColumnFamily cf_new = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        ColumnFamily cf_old = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        ColumnFamily cf_result = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        ByteBuffer val = ByteBufferUtil.bytes("sample value");
-        ByteBuffer val2 = ByteBufferUtil.bytes("x value ");
-
-        cf_new.addColumn(cellname("col1"), val, 3);
-        cf_new.addColumn(cellname("col2"), val, 4);
-
-        cf_old.addColumn(cellname("col2"), val2, 1);
-        cf_old.addColumn(cellname("col3"), val2, 2);
-
-        cf_result.addAll(cf_new);
-        cf_result.addAll(cf_old);
-
-        assert 3 == cf_result.getColumnCount() : "Count is " + cf_new.getColumnCount();
-        //addcolumns will only add if timestamp >= old timestamp
-        assert val.equals(cf_result.getColumn(cellname("col2")).value());
-
-        // check that tombstone wins timestamp ties
-        cf_result.addTombstone(cellname("col1"), 0, 3);
-        assertFalse(cf_result.getColumn(cellname("col1")).isLive());
-        cf_result.addColumn(cellname("col1"), val2, 3);
-        assertFalse(cf_result.getColumn(cellname("col1")).isLive());
-
-        // check that column value wins timestamp ties in absence of tombstone
-        cf_result.addColumn(cellname("col3"), val, 2);
-        assert cf_result.getColumn(cellname("col3")).value().equals(val2);
-        cf_result.addColumn(cellname("col3"), ByteBufferUtil.bytes("z"), 2);
-        assert cf_result.getColumn(cellname("col3")).value().equals(ByteBufferUtil.bytes("z"));
-    }
-
-    @Test
-    public void testColumnStatsRecordsRowDeletesCorrectly()
-    {
-        long timestamp = System.currentTimeMillis();
-        int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
-
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        cf.delete(new DeletionInfo(timestamp, localDeletionTime));
-        ColumnStats stats = cf.getColumnStats();
-        assertEquals(timestamp, stats.maxTimestamp);
-
-        cf.delete(new RangeTombstone(cellname("col2"), cellname("col21"), timestamp, localDeletionTime));
-
-        stats = cf.getColumnStats();
-        assertEquals(ByteBufferUtil.bytes("col2"), stats.minColumnNames.get(0));
-        assertEquals(ByteBufferUtil.bytes("col21"), stats.maxColumnNames.get(0));
-
-        cf.delete(new RangeTombstone(cellname("col6"), cellname("col61"), timestamp, localDeletionTime));
-        stats = cf.getColumnStats();
-
-        assertEquals(ByteBufferUtil.bytes("col2"), stats.minColumnNames.get(0));
-        assertEquals(ByteBufferUtil.bytes("col61"), stats.maxColumnNames.get(0));
-    }
-
-    @Test
-    public void testCounterDeletion()
-    {
-        long timestamp = FBUtilities.timestampMicros();
-        CellName name = cellname("counter1");
-
-        BufferCounterCell counter = new BufferCounterCell(name,
-                                                          CounterContext.instance().createGlobal(CounterId.fromInt(1), 1, 1),
-                                                          timestamp);
-        BufferDeletedCell tombstone = new BufferDeletedCell(name, (int) (System.currentTimeMillis() / 1000), 0L);
-
-        // check that the tombstone won the reconcile despite the counter cell having a higher timestamp
-        assertTrue(counter.reconcile(tombstone) == tombstone);
-
-        // check that a range tombstone overrides the counter cell, even with a lower timestamp than the counter
-        ColumnFamily cf0 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_COUNTER1);
-        cf0.addColumn(counter);
-        cf0.delete(new RangeTombstone(cellname("counter0"), cellname("counter2"), 0L, (int) (System.currentTimeMillis() / 1000)));
-        assertTrue(cf0.deletionInfo().isDeleted(counter));
-        assertTrue(cf0.deletionInfo().inOrderTester(false).isDeleted(counter));
-
-        // check that a top-level deletion info overrides the counter cell, even with a lower timestamp than the counter
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_COUNTER1);
-        cf1.addColumn(counter);
-        cf1.delete(new DeletionInfo(0L, (int) (System.currentTimeMillis() / 1000)));
-        assertTrue(cf1.deletionInfo().isDeleted(counter));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/ColumnsTest.java b/test/unit/org/apache/cassandra/db/ColumnsTest.java
new file mode 100644
index 0000000..9498e8b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/ColumnsTest.java

@@ -0,0 +1,522 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.Predicate;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.junit.AfterClass;
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.MockSchema;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class ColumnsTest
+{
+
+    private static final CFMetaData cfMetaData = MockSchema.newCFS().metadata;
+
+    @Test
+    public void testDeserializeCorruption() throws IOException
+    {
+        ColumnsCheck check = randomSmall(1, 0, 3, 0);
+        Columns superset = check.columns;
+        List<ColumnDefinition> minus1 = new ArrayList<>(check.definitions);
+        minus1.remove(3);
+        Columns minus2 = check.columns
+                .without(check.columns.getSimple(3))
+                .without(check.columns.getSimple(2));
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            // serialize a subset
+            Columns.serializer.serializeSubset(minus1, superset, out);
+            try (DataInputBuffer in = new DataInputBuffer(out.toByteArray()))
+            {
+                Columns.serializer.deserializeSubset(minus2, in);
+                Assert.assertFalse(true);
+            }
+            catch (IOException e)
+            {
+            }
+        }
+    }
+
+    // this tests most of our functionality, since each subset we perform
+    // reasonably comprehensive tests of basic functionality against
+    @Test
+    public void testContainsWithoutAndMergeTo()
+    {
+        for (ColumnsCheck randomColumns : randomSmall(true))
+            testContainsWithoutAndMergeTo(randomColumns);
+    }
+
+    private void testContainsWithoutAndMergeTo(ColumnsCheck input)
+    {
+        // pick some arbitrary groupings of columns to remove at-once (to avoid factorial complexity)
+        // whatever is left after each removal, we perform this logic on again, recursively
+        List<List<ColumnDefinition>> removeGroups = shuffleAndGroup(Lists.newArrayList(input.definitions));
+        for (List<ColumnDefinition> defs : removeGroups)
+        {
+            ColumnsCheck subset = input.remove(defs);
+
+            // test contents after .without
+            subset.assertContents();
+
+            // test .contains
+            assertSubset(input.columns, subset.columns);
+
+            // test .mergeTo
+            Columns otherSubset = input.columns;
+            for (ColumnDefinition def : subset.definitions)
+            {
+                otherSubset = otherSubset.without(def);
+                assertContents(otherSubset.mergeTo(subset.columns), input.definitions);
+            }
+
+            testContainsWithoutAndMergeTo(subset);
+        }
+    }
+
+    private void assertSubset(Columns superset, Columns subset)
+    {
+        Assert.assertTrue(superset.containsAll(superset));
+        Assert.assertTrue(superset.containsAll(subset));
+        Assert.assertFalse(subset.containsAll(superset));
+    }
+
+    @Test
+    public void testSerialize() throws IOException
+    {
+        testSerialize(Columns.NONE, Collections.emptyList());
+        for (ColumnsCheck randomColumns : randomSmall(false))
+            testSerialize(randomColumns.columns, randomColumns.definitions);
+    }
+
+    private void testSerialize(Columns columns, List<ColumnDefinition> definitions) throws IOException
+    {
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            Columns.serializer.serialize(columns, out);
+            Assert.assertEquals(Columns.serializer.serializedSize(columns), out.buffer().remaining());
+            Columns deserialized = Columns.serializer.deserialize(new DataInputBuffer(out.buffer(), false), mock(columns));
+            Assert.assertEquals(columns, deserialized);
+            Assert.assertEquals(columns.hashCode(), deserialized.hashCode());
+            assertContents(deserialized, definitions);
+        }
+    }
+
+    @Test
+    public void testSerializeSmallSubset() throws IOException
+    {
+        for (ColumnsCheck randomColumns : randomSmall(true))
+            testSerializeSubset(randomColumns);
+    }
+
+    @Test
+    public void testSerializeHugeSubset() throws IOException
+    {
+        for (ColumnsCheck randomColumns : randomHuge())
+            testSerializeSubset(randomColumns);
+    }
+
+    @Test
+    public void testContainsAllWithLargeNumberOfColumns()
+    {
+        List<String> names = new ArrayList<>();
+        for (int i = 0; i < 50; i++)
+            names.add("regular_" + i);
+
+        List<ColumnDefinition> defs = new ArrayList<>();
+        addRegular(names, defs);
+
+        Columns columns = Columns.from(new HashSet<>(defs));
+
+        defs = new ArrayList<>();
+        addRegular(names.subList(0, 8), defs);
+
+        Columns subset = Columns.from(new HashSet<>(defs));
+
+        Assert.assertTrue(columns.containsAll(subset));
+    }
+
+    @Test
+    public void testStaticColumns()
+    {
+        testColumns(ColumnDefinition.Kind.STATIC);
+    }
+
+    @Test
+    public void testRegularColumns()
+    {
+        testColumns(ColumnDefinition.Kind.REGULAR);
+    }
+
+    private void testColumns(ColumnDefinition.Kind kind)
+    {
+        List<ColumnDefinition> definitions = ImmutableList.of(
+            def("a", UTF8Type.instance, kind),
+            def("b", SetType.getInstance(UTF8Type.instance, true), kind),
+            def("c", UTF8Type.instance, kind),
+            def("d", SetType.getInstance(UTF8Type.instance, true), kind),
+            def("e", UTF8Type.instance, kind),
+            def("f", SetType.getInstance(UTF8Type.instance, true), kind),
+            def("g", UTF8Type.instance, kind),
+            def("h", SetType.getInstance(UTF8Type.instance, true), kind)
+        );
+        Columns columns = Columns.from(definitions);
+
+        // test simpleColumnCount()
+        Assert.assertEquals(4, columns.simpleColumnCount());
+
+        // test simpleColumns()
+        List<ColumnDefinition> simpleColumnsExpected =
+            ImmutableList.of(definitions.get(0), definitions.get(2), definitions.get(4), definitions.get(6));
+        List<ColumnDefinition> simpleColumnsActual = new ArrayList<>();
+        Iterators.addAll(simpleColumnsActual, columns.simpleColumns());
+        Assert.assertEquals(simpleColumnsExpected, simpleColumnsActual);
+
+        // test complexColumnCount()
+        Assert.assertEquals(4, columns.complexColumnCount());
+
+        // test complexColumns()
+        List<ColumnDefinition> complexColumnsExpected =
+            ImmutableList.of(definitions.get(1), definitions.get(3), definitions.get(5), definitions.get(7));
+        List<ColumnDefinition> complexColumnsActual = new ArrayList<>();
+        Iterators.addAll(complexColumnsActual, columns.complexColumns());
+        Assert.assertEquals(complexColumnsExpected, complexColumnsActual);
+
+        // test size()
+        Assert.assertEquals(8, columns.size());
+
+        // test selectOrderIterator()
+        List<ColumnDefinition> columnsExpected = definitions;
+        List<ColumnDefinition> columnsActual = new ArrayList<>();
+        Iterators.addAll(columnsActual, columns.selectOrderIterator());
+        Assert.assertEquals(columnsExpected, columnsActual);
+    }
+
+    private void testSerializeSubset(ColumnsCheck input) throws IOException
+    {
+        testSerializeSubset(input.columns, input.columns, input.definitions);
+        testSerializeSubset(input.columns, Columns.NONE, Collections.emptyList());
+        List<List<ColumnDefinition>> removeGroups = shuffleAndGroup(Lists.newArrayList(input.definitions));
+        for (List<ColumnDefinition> defs : removeGroups)
+        {
+            Collections.sort(defs);
+            ColumnsCheck subset = input.remove(defs);
+            testSerializeSubset(input.columns, subset.columns, subset.definitions);
+        }
+    }
+
+    private void testSerializeSubset(Columns superset, Columns subset, List<ColumnDefinition> subsetDefinitions) throws IOException
+    {
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            Columns.serializer.serializeSubset(subset, superset, out);
+            Assert.assertEquals(Columns.serializer.serializedSubsetSize(subset, superset), out.buffer().remaining());
+            Columns deserialized = Columns.serializer.deserializeSubset(superset, new DataInputBuffer(out.buffer(), false));
+            Assert.assertEquals(subset, deserialized);
+            Assert.assertEquals(subset.hashCode(), deserialized.hashCode());
+            assertContents(deserialized, subsetDefinitions);
+        }
+    }
+
+    private static void assertContents(Columns columns, List<ColumnDefinition> defs)
+    {
+        Assert.assertEquals(defs, Lists.newArrayList(columns));
+        boolean hasSimple = false, hasComplex = false;
+        int firstComplexIdx = 0;
+        int i = 0;
+        Iterator<ColumnDefinition> simple = columns.simpleColumns();
+        Iterator<ColumnDefinition> complex = columns.complexColumns();
+        Iterator<ColumnDefinition> all = columns.iterator();
+        Predicate<ColumnDefinition> predicate = columns.inOrderInclusionTester();
+        for (ColumnDefinition def : defs)
+        {
+            Assert.assertEquals(def, all.next());
+            Assert.assertTrue(columns.contains(def));
+            Assert.assertTrue(predicate.test(def));
+            if (def.isSimple())
+            {
+                hasSimple = true;
+                Assert.assertEquals(i, columns.simpleIdx(def));
+                Assert.assertEquals(def, columns.getSimple(i));
+                Assert.assertEquals(def, simple.next());
+                ++firstComplexIdx;
+            }
+            else
+            {
+                Assert.assertFalse(simple.hasNext());
+                hasComplex = true;
+                Assert.assertEquals(i - firstComplexIdx, columns.complexIdx(def));
+                Assert.assertEquals(def, columns.getComplex(i - firstComplexIdx));
+                Assert.assertEquals(def, complex.next());
+            }
+            i++;
+        }
+        Assert.assertEquals(defs.isEmpty(), columns.isEmpty());
+        Assert.assertFalse(simple.hasNext());
+        Assert.assertFalse(complex.hasNext());
+        Assert.assertFalse(all.hasNext());
+        Assert.assertEquals(hasSimple, columns.hasSimple());
+        Assert.assertEquals(hasComplex, columns.hasComplex());
+
+        // check select order
+        if (!columns.hasSimple() || !columns.getSimple(0).kind.isPrimaryKeyKind())
+        {
+            List<ColumnDefinition> selectOrderDefs = new ArrayList<>(defs);
+            Collections.sort(selectOrderDefs, (a, b) -> a.name.bytes.compareTo(b.name.bytes));
+            List<ColumnDefinition> selectOrderColumns = new ArrayList<>();
+            Iterators.addAll(selectOrderColumns, columns.selectOrderIterator());
+            Assert.assertEquals(selectOrderDefs, selectOrderColumns);
+        }
+    }
+
+    private static <V> List<List<V>> shuffleAndGroup(List<V> list)
+    {
+        // first shuffle
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        for (int i = 0 ; i < list.size() - 1 ; i++)
+        {
+            int j = random.nextInt(i, list.size());
+            V v = list.get(i);
+            list.set(i, list.get(j));
+            list.set(j, v);
+        }
+
+        // then group (logarithmically, to ensure our recursive functions don't explode the state space)
+        List<List<V>> result = new ArrayList<>();
+        for (int i = 0 ; i < list.size() ;)
+        {
+            List<V> group = new ArrayList<>();
+            int maxCount = list.size() - i;
+            int count = maxCount <= 2 ? maxCount : random.nextInt(1, maxCount);
+            for (int j = 0 ; j < count ; j++)
+                group.add(list.get(i + j));
+            i += count;
+            result.add(group);
+        }
+        return result;
+    }
+
+    @AfterClass
+    public static void cleanup()
+    {
+        MockSchema.cleanup();
+    }
+
+    private static class ColumnsCheck
+    {
+        final Columns columns;
+        final List<ColumnDefinition> definitions;
+
+        private ColumnsCheck(Columns columns, List<ColumnDefinition> definitions)
+        {
+            this.columns = columns;
+            this.definitions = definitions;
+        }
+
+        private ColumnsCheck(List<ColumnDefinition> definitions)
+        {
+            this.columns = Columns.from(BTreeSet.of(definitions));
+            this.definitions = definitions;
+        }
+
+        ColumnsCheck remove(List<ColumnDefinition> remove)
+        {
+            Columns subset = columns;
+            for (ColumnDefinition def : remove)
+                subset = subset.without(def);
+            Assert.assertEquals(columns.size() - remove.size(), subset.size());
+            List<ColumnDefinition> remainingDefs = Lists.newArrayList(columns);
+            remainingDefs.removeAll(remove);
+            return new ColumnsCheck(subset, remainingDefs);
+        }
+
+        void assertContents()
+        {
+            ColumnsTest.assertContents(columns, definitions);
+        }
+    }
+
+    private static List<ColumnsCheck> randomHuge()
+    {
+        List<ColumnsCheck> result = new ArrayList<>();
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        result.add(randomHuge(random.nextInt(64, 128), 0, 0, 0));
+        result.add(randomHuge(0, random.nextInt(64, 128), 0, 0));
+        result.add(randomHuge(0, 0, random.nextInt(64, 128), 0));
+        result.add(randomHuge(0, 0, 0, random.nextInt(64, 128)));
+        result.add(randomHuge(random.nextInt(64, 128), random.nextInt(64, 128), 0, 0));
+        result.add(randomHuge(0, random.nextInt(64, 128), random.nextInt(64, 128), 0));
+        result.add(randomHuge(0, 0, random.nextInt(64, 128), random.nextInt(64, 128)));
+        result.add(randomHuge(random.nextInt(64, 128), random.nextInt(64, 128), random.nextInt(64, 128), 0));
+        result.add(randomHuge(0, random.nextInt(64, 128), random.nextInt(64, 128), random.nextInt(64, 128)));
+        result.add(randomHuge(random.nextInt(64, 128), random.nextInt(64, 128), random.nextInt(64, 128), random.nextInt(64, 128)));
+        return result;
+    }
+
+    private static List<ColumnsCheck> randomSmall(boolean permitMultiplePartitionKeys)
+    {
+        List<ColumnsCheck> random = new ArrayList<>();
+        for (int i = 1 ; i <= 3 ; i++)
+        {
+            int pkCount = permitMultiplePartitionKeys ? i - 1 : 1;
+            if (permitMultiplePartitionKeys)
+                random.add(randomSmall(i, i - 1, i - 1, i - 1));
+            random.add(randomSmall(0, 0, i, i)); // both kinds of regular, no PK
+            random.add(randomSmall(pkCount, i, i - 1, i - 1)); // PK + clustering, few or none regular
+            random.add(randomSmall(pkCount, i - 1, i, i - 1)); // PK + few or none clustering, some regular, few or none complex
+            random.add(randomSmall(pkCount, i - 1, i - 1, i)); // PK + few or none clustering or regular, some complex
+        }
+        return random;
+    }
+
+    private static ColumnsCheck randomSmall(int pkCount, int clCount, int regularCount, int complexCount)
+    {
+        List<String> names = new ArrayList<>();
+        for (char c = 'a' ; c <= 'z' ; c++)
+            names .add(Character.toString(c));
+
+        List<ColumnDefinition> result = new ArrayList<>();
+        addPartition(select(names, pkCount), result);
+        addClustering(select(names, clCount), result);
+        addRegular(select(names, regularCount), result);
+        addComplex(select(names, complexCount), result);
+        Collections.sort(result);
+        return new ColumnsCheck(result);
+    }
+
+    private static List<String> select(List<String> names, int count)
+    {
+        List<String> result = new ArrayList<>();
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        for (int i = 0 ; i < count ; i++)
+        {
+            int v = random.nextInt(names.size());
+            result.add(names.get(v));
+            names.remove(v);
+        }
+        return result;
+    }
+
+    private static ColumnsCheck randomHuge(int pkCount, int clCount, int regularCount, int complexCount)
+    {
+        List<ColumnDefinition> result = new ArrayList<>();
+        Set<String> usedNames = new HashSet<>();
+        addPartition(names(pkCount, usedNames), result);
+        addClustering(names(clCount, usedNames), result);
+        addRegular(names(regularCount, usedNames), result);
+        addComplex(names(complexCount, usedNames), result);
+        Collections.sort(result);
+        return new ColumnsCheck(result);
+    }
+
+    private static List<String> names(int count, Set<String> usedNames)
+    {
+        List<String> names = new ArrayList<>();
+        StringBuilder builder = new StringBuilder();
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        for (int i = 0 ; i < count ; i++)
+        {
+            builder.setLength(0);
+            for (int j = 0 ; j < 3 || usedNames.contains(builder.toString()) ; j++)
+                builder.append((char) random.nextInt('a', 'z' + 1));
+            String name = builder.toString();
+            names.add(name);
+            usedNames.add(name);
+        }
+        return names;
+    }
+
+    private static void addPartition(List<String> names, List<ColumnDefinition> results)
+    {
+        for (String name : names)
+            results.add(ColumnDefinition.partitionKeyDef(cfMetaData, bytes(name), UTF8Type.instance, 0));
+    }
+
+    private static void addClustering(List<String> names, List<ColumnDefinition> results)
+    {
+        int i = 0;
+        for (String name : names)
+            results.add(ColumnDefinition.clusteringDef(cfMetaData, bytes(name), UTF8Type.instance, i++));
+    }
+
+    private static void addRegular(List<String> names, List<ColumnDefinition> results)
+    {
+        for (String name : names)
+            results.add(ColumnDefinition.regularDef(cfMetaData, bytes(name), UTF8Type.instance));
+    }
+
+    private static void addComplex(List<String> names, List<ColumnDefinition> results)
+    {
+        for (String name : names)
+            results.add(ColumnDefinition.regularDef(cfMetaData, bytes(name), SetType.getInstance(UTF8Type.instance, true)));
+    }
+
+    private static ColumnDefinition def(String name, AbstractType<?> type, ColumnDefinition.Kind kind)
+    {
+        return new ColumnDefinition(cfMetaData, bytes(name), type, ColumnDefinition.NO_POSITION, kind);
+    }
+
+    private static CFMetaData mock(Columns columns)
+    {
+        if (columns.isEmpty())
+            return cfMetaData;
+        CFMetaData.Builder builder = CFMetaData.Builder.create(cfMetaData.ksName, cfMetaData.cfName);
+        boolean hasPartitionKey = false;
+        for (ColumnDefinition def : columns)
+        {
+            switch (def.kind)
+            {
+                case PARTITION_KEY:
+                    builder.addPartitionKey(def.name, def.type);
+                    hasPartitionKey = true;
+                    break;
+                case CLUSTERING:
+                    builder.addClusteringColumn(def.name, def.type);
+                    break;
+                case REGULAR:
+                    builder.addRegularColumn(def.name, def.type);
+                    break;
+            }
+        }
+        if (!hasPartitionKey)
+            builder.addPartitionKey("219894021498309239rufejsfjdksfjheiwfhjes", UTF8Type.instance);
+        return builder.build();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/CounterCacheTest.java b/test/unit/org/apache/cassandra/db/CounterCacheTest.java
index ed7921e..91157ad 100644
--- a/test/unit/org/apache/cassandra/db/CounterCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/CounterCacheTest.java

@@ -20,41 +20,47 @@
 import java.util.Collections;
 import java.util.concurrent.ExecutionException;
 
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 
-import static org.apache.cassandra.Util.cellname;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
 public class CounterCacheTest
 {
     private static final String KEYSPACE1 = "CounterCacheTest";
-    private static final String CF = "Counter1";
+    private static final String COUNTER1 = "Counter1";
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
+
+        CFMetaData counterTable = CFMetaData.Builder.create(KEYSPACE1, COUNTER1, false, true, true)
+                                  .addPartitionKey("key", Int32Type.instance)
+                                  .addClusteringColumn("name", Int32Type.instance)
+                                  .addRegularColumn("c", CounterColumnType.instance)
+                                  .build();
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF).defaultValidator(CounterColumnType.instance));
+                                    KeyspaceParams.simple(1),
+                                    counterTable);
     }
 
     @AfterClass
@@ -66,82 +72,89 @@
     @Test
     public void testReadWrite()
     {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
         cfs.truncateBlocking();
         CacheService.instance.invalidateCounterCache();
 
+        Clustering c1 = CBuilder.create(cfs.metadata.comparator).add(ByteBufferUtil.bytes(1)).build();
+        Clustering c2 = CBuilder.create(cfs.metadata.comparator).add(ByteBufferUtil.bytes(2)).build();
+        ColumnDefinition cd = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("c"));
+
         assertEquals(0, CacheService.instance.counterCache.size());
-        assertNull(cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(1), cellname(2)));
-        assertNull(cfs.getCachedCounter(bytes(2), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(2), cellname(2)));
+        assertNull(cfs.getCachedCounter(bytes(1), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(1), c2, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(2), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(2), c2, cd, null));
 
-        cfs.putCachedCounter(bytes(1), cellname(1), ClockAndCount.create(1L, 1L));
-        cfs.putCachedCounter(bytes(1), cellname(2), ClockAndCount.create(1L, 2L));
-        cfs.putCachedCounter(bytes(2), cellname(1), ClockAndCount.create(2L, 1L));
-        cfs.putCachedCounter(bytes(2), cellname(2), ClockAndCount.create(2L, 2L));
+        cfs.putCachedCounter(bytes(1), c1, cd, null, ClockAndCount.create(1L, 1L));
+        cfs.putCachedCounter(bytes(1), c2, cd, null, ClockAndCount.create(1L, 2L));
+        cfs.putCachedCounter(bytes(2), c1, cd, null, ClockAndCount.create(2L, 1L));
+        cfs.putCachedCounter(bytes(2), c2, cd, null, ClockAndCount.create(2L, 2L));
 
-        assertEquals(4, CacheService.instance.counterCache.size());
-        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), cellname(2)));
-        assertEquals(ClockAndCount.create(2L, 1L), cfs.getCachedCounter(bytes(2), cellname(1)));
-        assertEquals(ClockAndCount.create(2L, 2L), cfs.getCachedCounter(bytes(2), cellname(2)));
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), c1, cd, null));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), c2, cd, null));
+        assertEquals(ClockAndCount.create(2L, 1L), cfs.getCachedCounter(bytes(2), c1, cd, null));
+        assertEquals(ClockAndCount.create(2L, 2L), cfs.getCachedCounter(bytes(2), c2, cd, null));
     }
 
     @Test
     public void testCounterCacheInvalidate()
     {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
         cfs.truncateBlocking();
         CacheService.instance.invalidateCounterCache();
 
+        Clustering c1 = CBuilder.create(cfs.metadata.comparator).add(ByteBufferUtil.bytes(1)).build();
+        Clustering c2 = CBuilder.create(cfs.metadata.comparator).add(ByteBufferUtil.bytes(2)).build();
+        ColumnDefinition cd = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("c"));
+
         assertEquals(0, CacheService.instance.counterCache.size());
-        assertNull(cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(1), cellname(2)));
-        assertNull(cfs.getCachedCounter(bytes(2), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(2), cellname(2)));
-        assertNull(cfs.getCachedCounter(bytes(3), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(3), cellname(2)));
+        assertNull(cfs.getCachedCounter(bytes(1), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(1), c2, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(2), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(2), c2, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(3), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(3), c2, cd, null));
 
-        cfs.putCachedCounter(bytes(1), cellname(1), ClockAndCount.create(1L, 1L));
-        cfs.putCachedCounter(bytes(1), cellname(2), ClockAndCount.create(1L, 2L));
-        cfs.putCachedCounter(bytes(2), cellname(1), ClockAndCount.create(2L, 1L));
-        cfs.putCachedCounter(bytes(2), cellname(2), ClockAndCount.create(2L, 2L));
-        cfs.putCachedCounter(bytes(3), cellname(1), ClockAndCount.create(3L, 1L));
-        cfs.putCachedCounter(bytes(3), cellname(2), ClockAndCount.create(3L, 2L));
+        cfs.putCachedCounter(bytes(1), c1, cd, null, ClockAndCount.create(1L, 1L));
+        cfs.putCachedCounter(bytes(1), c2, cd, null, ClockAndCount.create(1L, 2L));
+        cfs.putCachedCounter(bytes(2), c1, cd, null, ClockAndCount.create(2L, 1L));
+        cfs.putCachedCounter(bytes(2), c2, cd, null, ClockAndCount.create(2L, 2L));
+        cfs.putCachedCounter(bytes(3), c1, cd, null, ClockAndCount.create(3L, 1L));
+        cfs.putCachedCounter(bytes(3), c2, cd, null, ClockAndCount.create(3L, 2L));
 
-        assertEquals(6, CacheService.instance.counterCache.size());
-        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), cellname(2)));
-        assertEquals(ClockAndCount.create(2L, 1L), cfs.getCachedCounter(bytes(2), cellname(1)));
-        assertEquals(ClockAndCount.create(2L, 2L), cfs.getCachedCounter(bytes(2), cellname(2)));
-        assertEquals(ClockAndCount.create(3L, 1L), cfs.getCachedCounter(bytes(3), cellname(1)));
-        assertEquals(ClockAndCount.create(3L, 2L), cfs.getCachedCounter(bytes(3), cellname(2)));
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), c1, cd, null));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), c2, cd, null));
+        assertEquals(ClockAndCount.create(2L, 1L), cfs.getCachedCounter(bytes(2), c1, cd, null));
+        assertEquals(ClockAndCount.create(2L, 2L), cfs.getCachedCounter(bytes(2), c2, cd, null));
+        assertEquals(ClockAndCount.create(3L, 1L), cfs.getCachedCounter(bytes(3), c1, cd, null));
+        assertEquals(ClockAndCount.create(3L, 2L), cfs.getCachedCounter(bytes(3), c2, cd, null));
 
-        cfs.invalidateCounterCache(Collections.singleton(new Bounds<Token>(cfs.partitioner.decorateKey(bytes(1)).getToken(),
-                                                                           cfs.partitioner.decorateKey(bytes(2)).getToken())));
+        cfs.invalidateCounterCache(Collections.singleton(new Bounds<Token>(cfs.decorateKey(bytes(1)).getToken(),
+                                                                           cfs.decorateKey(bytes(2)).getToken())));
 
         assertEquals(2, CacheService.instance.counterCache.size());
-        assertNull(cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(1), cellname(2)));
-        assertNull(cfs.getCachedCounter(bytes(2), cellname(1)));
-        assertNull(cfs.getCachedCounter(bytes(2), cellname(2)));
-        assertEquals(ClockAndCount.create(3L, 1L), cfs.getCachedCounter(bytes(3), cellname(1)));
-        assertEquals(ClockAndCount.create(3L, 2L), cfs.getCachedCounter(bytes(3), cellname(2)));
+        assertNull(cfs.getCachedCounter(bytes(1), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(1), c2, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(2), c1, cd, null));
+        assertNull(cfs.getCachedCounter(bytes(2), c2, cd, null));
+        assertEquals(ClockAndCount.create(3L, 1L), cfs.getCachedCounter(bytes(3), c1, cd, null));
+        assertEquals(ClockAndCount.create(3L, 2L), cfs.getCachedCounter(bytes(3), c2, cd, null));
     }
 
     @Test
     public void testSaveLoad() throws ExecutionException, InterruptedException, WriteTimeoutException
     {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
         cfs.truncateBlocking();
         CacheService.instance.invalidateCounterCache();
 
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addColumn(new BufferCounterUpdateCell(cellname(1), 1L, FBUtilities.timestampMicros()));
-        cells.addColumn(new BufferCounterUpdateCell(cellname(2), 2L, FBUtilities.timestampMicros()));
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(2), cells), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(1)).clustering(1).add("c", 1L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(1)).clustering(2).add("c", 2L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(2)).clustering(1).add("c", 1L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(2)).clustering(2).add("c", 2L).build(), ConsistencyLevel.ONE).apply();
+
+        assertEquals(4, CacheService.instance.counterCache.size());
 
         // flush the counter cache and invalidate
         CacheService.instance.counterCache.submitWrite(Integer.MAX_VALUE).get();
@@ -151,24 +164,28 @@
         // load from cache and validate
         CacheService.instance.counterCache.loadSaved();
         assertEquals(4, CacheService.instance.counterCache.size());
-        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), cellname(2)));
-        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(2), cellname(1)));
-        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(2), cellname(2)));
+
+        Clustering c1 = CBuilder.create(cfs.metadata.comparator).add(ByteBufferUtil.bytes(1)).build();
+        Clustering c2 = CBuilder.create(cfs.metadata.comparator).add(ByteBufferUtil.bytes(2)).build();
+        ColumnDefinition cd = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("c"));
+
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(1), c1, cd, null));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(1), c2, cd, null));
+        assertEquals(ClockAndCount.create(1L, 1L), cfs.getCachedCounter(bytes(2), c1, cd, null));
+        assertEquals(ClockAndCount.create(1L, 2L), cfs.getCachedCounter(bytes(2), c2, cd, null));
     }
 
     @Test
     public void testDroppedSaveLoad() throws ExecutionException, InterruptedException, WriteTimeoutException
     {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
         cfs.truncateBlocking();
         CacheService.instance.invalidateCounterCache();
 
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addColumn(new BufferCounterUpdateCell(cellname(1), 1L, FBUtilities.timestampMicros()));
-        cells.addColumn(new BufferCounterUpdateCell(cellname(2), 2L, FBUtilities.timestampMicros()));
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(2), cells), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(1)).clustering(1).add("c", 1L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(1)).clustering(2).add("c", 2L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(2)).clustering(1).add("c", 1L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(2)).clustering(2).add("c", 2L).build(), ConsistencyLevel.ONE).apply();
 
         // flush the counter cache and invalidate
         CacheService.instance.counterCache.submitWrite(Integer.MAX_VALUE).get();
@@ -192,15 +209,14 @@
     @Test
     public void testDisabledSaveLoad() throws ExecutionException, InterruptedException, WriteTimeoutException
     {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
         cfs.truncateBlocking();
         CacheService.instance.invalidateCounterCache();
 
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addColumn(new BufferCounterUpdateCell(cellname(1), 1L, FBUtilities.timestampMicros()));
-        cells.addColumn(new BufferCounterUpdateCell(cellname(2), 2L, FBUtilities.timestampMicros()));
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(2), cells), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(1)).clustering(1).add("c", 1L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(1)).clustering(2).add("c", 2L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(2)).clustering(1).add("c", 1L).build(), ConsistencyLevel.ONE).apply();
+        new CounterMutation(new RowUpdateBuilder(cfs.metadata, 0, bytes(2)).clustering(2).add("c", 2L).build(), ConsistencyLevel.ONE).apply();
 
         // flush the counter cache and invalidate
         CacheService.instance.counterCache.submitWrite(Integer.MAX_VALUE).get();

diff --git a/test/unit/org/apache/cassandra/db/CounterCellTest.java b/test/unit/org/apache/cassandra/db/CounterCellTest.java
index 5d4b8a8..b4c7b2a 100644
--- a/test/unit/org/apache/cassandra/db/CounterCellTest.java
+++ b/test/unit/org/apache/cassandra/db/CounterCellTest.java

@@ -18,27 +18,29 @@
 */
 package org.apache.cassandra.db;
 
-import java.security.MessageDigest;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.security.MessageDigest;
 import java.util.Arrays;
 
+import org.junit.AfterClass;
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Cells;
 import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.serializers.AsciiSerializer;
 import org.apache.cassandra.utils.*;
 
-import static org.apache.cassandra.Util.cellname;
+import static org.junit.Assert.*;
 import static org.apache.cassandra.db.context.CounterContext.ContextState;
 
 public class CounterCellTest
@@ -51,6 +53,26 @@
 
     private static final int stepLength;
 
+    private static final String KEYSPACE1 = "CounterCacheTest";
+    private static final String COUNTER1 = "Counter1";
+    private static final String STANDARD1 = "Standard1";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1),
+                                    SchemaLoader.counterCFMD(KEYSPACE1, COUNTER1));
+    }
+
+    @AfterClass
+    public static void cleanup()
+    {
+        SchemaLoader.cleanupSavedCaches();
+    }
+
     static
     {
         idLength      = CounterId.LENGTH;
@@ -58,248 +80,190 @@
         countLength   = 8; // size of long
 
         stepLength    = idLength + clockLength + countLength;
-        // TODO: CounterId accesses SystemKespace to get local host ID, so need to mark as daemon initialized
-        DatabaseDescriptor.setDaemonInitialized();
     }
 
     @Test
     public void testCreate()
     {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
         long delta = 3L;
-        CounterCell cell = new BufferCounterCell(Util.cellname("x"),
-                                           CounterContext.instance().createLocal(delta),
-                                           1L,
-                                           Long.MIN_VALUE);
 
-        Assert.assertEquals(delta, cell.total());
-        Assert.assertEquals(1, cell.value().getShort(0));
-        Assert.assertEquals(0, cell.value().getShort(2));
+        Cell cell = createLegacyCounterCell(cfs, ByteBufferUtil.bytes("val"), delta, 1);
+
+        assertEquals(delta, CounterContext.instance().total(cell.value()));
+        assertEquals(1, cell.value().getShort(0));
+        assertEquals(0, cell.value().getShort(2));
         Assert.assertTrue(CounterId.wrap(cell.value(), 4).isLocalId());
-        Assert.assertEquals(1L, cell.value().getLong(4 + idLength));
-        Assert.assertEquals(delta, cell.value().getLong(4 + idLength + clockLength));
+        assertEquals(1L, cell.value().getLong(4 + idLength));
+        assertEquals(delta, cell.value().getLong(4 + idLength + clockLength));
+
+    }
+
+    private Cell createLegacyCounterCell(ColumnFamilyStore cfs, ByteBuffer colName, long count, long ts)
+    {
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(colName);
+        ByteBuffer val = CounterContext.instance().createLocal(count);
+        return BufferCell.live(cfs.metadata, cDef, ts, val);
+    }
+
+    private Cell createCounterCell(ColumnFamilyStore cfs, ByteBuffer colName, CounterId id, long count, long ts)
+    {
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(colName);
+        ByteBuffer val = CounterContext.instance().createGlobal(id, ts, count);
+        return BufferCell.live(cfs.metadata, cDef, ts, val);
+    }
+
+    private Cell createCounterCellFromContext(ColumnFamilyStore cfs, ByteBuffer colName, ContextState context, long ts)
+    {
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(colName);
+        return BufferCell.live(cfs.metadata, cDef, ts, context.context);
+    }
+
+    private Cell createDeleted(ColumnFamilyStore cfs, ByteBuffer colName, long ts, int localDeletionTime)
+    {
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(colName);
+        return BufferCell.tombstone(cDef, ts, localDeletionTime);
     }
 
     @Test
     public void testReconcile()
     {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
+        ByteBuffer col = ByteBufferUtil.bytes("val");
+
         Cell left;
         Cell right;
-        Cell reconciled;
 
-        ByteBuffer context;
+        // both deleted, diff deletion time, same ts
+        left = createDeleted(cfs, col, 2, 5);
+        right = createDeleted(cfs, col, 2, 10);
+        assert Cells.reconcile(left, right, 10) == right;
 
-        // tombstone + tombstone
-        left  = new BufferDeletedCell(cellname("x"), 1, 1L);
-        right = new BufferDeletedCell(cellname("x"), 2, 2L);
+        // diff ts
+        right = createLegacyCounterCell(cfs, col, 1, 10);
+        assert Cells.reconcile(left, right, 10) == left;
 
-        assert left.reconcile(right).timestamp() == right.timestamp();
-        assert right.reconcile(left).timestamp() == right.timestamp();
+        // < tombstone
+        left = createDeleted(cfs, col, 6, 6);
+        right = createLegacyCounterCell(cfs, col, 1, 5);
+        assert Cells.reconcile(left, right, 10) == left;
 
-        // tombstone > live
-        left  = new BufferDeletedCell(cellname("x"), 1, 2L);
-        right = BufferCounterCell.createLocal(cellname("x"), 0L, 1L, Long.MIN_VALUE);
+        // > tombstone
+        left = createDeleted(cfs, col, 1, 1);
+        right = createLegacyCounterCell(cfs, col, 1, 5);
+        assert Cells.reconcile(left, right, 10) == left;
 
-        assert left.reconcile(right) == left;
-
-        // tombstone < live last delete
-        left  = new BufferDeletedCell(cellname("x"), 1, 1L);
-        right = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
-
-        assert left.reconcile(right) == left;
-
-        // tombstone == live last delete
-        left  = new BufferDeletedCell(cellname("x"), 1, 2L);
-        right = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
-
-        assert left.reconcile(right) == left;
-
-        // tombstone > live last delete
-        left  = new BufferDeletedCell(cellname("x"), 1, 4L);
-        right = BufferCounterCell.createLocal(cellname("x"), 0L, 9L, 1L);
-
-        assert left.reconcile(right) == left;
-
-        // live < tombstone
-        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 1L, Long.MIN_VALUE);
-        right = new BufferDeletedCell(cellname("x"), 1, 2L);
-
-        assert left.reconcile(right) == right;
-
-        // live last delete > tombstone
-        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
-        right = new BufferDeletedCell(cellname("x"), 1, 1L);
-
-        assert left.reconcile(right) == right;
-
-        // live last delete == tombstone
-        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 4L, 2L);
-        right = new BufferDeletedCell(cellname("x"), 1, 2L);
-
-        assert left.reconcile(right) == right;
-
-        // live last delete < tombstone
-        left  = BufferCounterCell.createLocal(cellname("x"), 0L, 9L, 1L);
-        right = new BufferDeletedCell(cellname("x"), 1, 4L);
-
-        assert left.reconcile(right) == right;
-
-        // live < live last delete
-        left  = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L), 1L, Long.MIN_VALUE);
-        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L), 4L, 3L);
-
-        assert left.reconcile(right) == right;
-
-        // live last delete > live
-        left  = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L), 6L, 5L);
-        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L), 4L, 3L);
-
-        assert left.reconcile(right) == left;
+        // == tombstone
+        left = createDeleted(cfs, col, 8, 8);
+        right = createLegacyCounterCell(cfs, col, 1, 8);
+        assert Cells.reconcile(left, right, 10) == left;
 
         // live + live
-        left = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 1L, 1L), 4L, Long.MIN_VALUE);
-        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(1), 2L, 3L), 1L, Long.MIN_VALUE);
+        left = createLegacyCounterCell(cfs, col, 1, 2);
+        right = createLegacyCounterCell(cfs, col, 3, 5);
+        Cell reconciled = Cells.reconcile(left, right, 10);
+        assertEquals(CounterContext.instance().total(reconciled.value()), 4);
+        assertEquals(reconciled.timestamp(), 5L);
 
-        reconciled = left.reconcile(right);
-        assert reconciled.name().equals(left.name());
-        assert ((CounterCell)reconciled).total() == 3L;
-        assert reconciled.timestamp() == 4L;
+        // Add, don't change TS
+        Cell addTen = createLegacyCounterCell(cfs, col, 10, 4);
+        reconciled = Cells.reconcile(reconciled, addTen, 10);
+        assertEquals(CounterContext.instance().total(reconciled.value()), 14);
+        assertEquals(reconciled.timestamp(), 5L);
 
-        left = reconciled;
-        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(2), 1L, 5L), 2L, Long.MIN_VALUE);
+        // Add w/new TS
+        Cell addThree = createLegacyCounterCell(cfs, col, 3, 7);
+        reconciled = Cells.reconcile(reconciled, addThree, 10);
+        assertEquals(CounterContext.instance().total(reconciled.value()), 17);
+        assertEquals(reconciled.timestamp(), 7L);
 
-        reconciled = left.reconcile(right);
-        assert reconciled.name().equals(left.name());
-        assert ((CounterCell)reconciled).total() == 8L;
-        assert reconciled.timestamp() == 4L;
+        // Confirm no deletion time
+        assert reconciled.localDeletionTime() == Integer.MAX_VALUE;
 
-        left = reconciled;
-        right = new BufferCounterCell(cellname("x"), cc.createRemote(CounterId.fromInt(2), 2L, 2L), 6L, Long.MIN_VALUE);
-
-        reconciled = left.reconcile(right);
-        assert reconciled.name().equals(left.name());
-        assert ((CounterCell)reconciled).total() == 5L;
-        assert reconciled.timestamp() == 6L;
-
-        context = reconciled.value();
-        int hd = 2; // header
-        assert hd + 2 * stepLength == context.remaining();
-
-        assert Util.equalsCounterId(CounterId.fromInt(1), context, hd);
-        assert 2L == context.getLong(hd + idLength);
-        assert 3L == context.getLong(hd + idLength + clockLength);
-
-        assert Util.equalsCounterId(CounterId.fromInt(2), context, hd + stepLength);
-        assert 2L == context.getLong(hd + stepLength + idLength);
-        assert 2L == context.getLong(hd + stepLength + idLength + clockLength);
-
-        assert ((CounterCell)reconciled).timestampOfLastDelete() == Long.MIN_VALUE;
+        Cell deleted = createDeleted(cfs, col, 8, 8);
+        reconciled = Cells.reconcile(reconciled, deleted, 10);
+        assert reconciled.localDeletionTime() == 8;
     }
 
     @Test
     public void testDiff()
     {
-        ContextState left;
-        ContextState right;
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
+        ByteBuffer col = ByteBufferUtil.bytes("val");
 
-        CounterCell leftCell;
-        CounterCell rightCell;
+        Cell leftCell;
+        Cell rightCell;
+
+        // Equal count
+        leftCell = createLegacyCounterCell(cfs, col, 2, 2);
+        rightCell = createLegacyCounterCell(cfs, col, 2, 1);
+        assertEquals(CounterContext.Relationship.EQUAL, CounterContext.instance().diff(leftCell.value(), rightCell.value()));
+
+        // Non-equal count
+        leftCell = createLegacyCounterCell(cfs, col, 1, 2);
+        rightCell = createLegacyCounterCell(cfs, col, 2, 1);
+        assertEquals(CounterContext.Relationship.DISJOINT, CounterContext.instance().diff(leftCell.value(), rightCell.value()));
 
         // timestamp
-        leftCell = BufferCounterCell.createLocal(cellname("x"), 0, 1L, Long.MIN_VALUE);
-        rightCell = BufferCounterCell.createLocal(cellname("x"), 0, 2L, Long.MIN_VALUE);
+        CounterId id = CounterId.generate();
+        leftCell = createCounterCell(cfs, col, id, 2, 2);
+        rightCell = createCounterCell(cfs, col, id, 2, 1);
+        assertEquals(CounterContext.Relationship.GREATER_THAN, CounterContext.instance().diff(leftCell.value(), rightCell.value()));
 
-        assert rightCell == leftCell.diff(rightCell);
-        assert null      == rightCell.diff(leftCell);
+        ContextState leftContext;
+        ContextState rightContext;
 
-        // timestampOfLastDelete
-        leftCell = BufferCounterCell.createLocal(cellname("x"), 0, 1L, 1L);
-        rightCell = BufferCounterCell.createLocal(cellname("x"), 0, 1L, 2L);
+        // Equal based on context w/shards etc
+        leftContext = ContextState.allocate(0, 0, 3);
+        leftContext.writeRemote(CounterId.fromInt(3), 3L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(6), 2L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        rightContext = ContextState.wrap(ByteBufferUtil.clone(leftContext.context));
 
-        assert rightCell == leftCell.diff(rightCell);
-        assert null      == rightCell.diff(leftCell);
-
-        // equality: equal nodes, all counts same
-        left = ContextState.allocate(0, 0, 3);
-        left.writeRemote(CounterId.fromInt(3), 3L, 0L);
-        left.writeRemote(CounterId.fromInt(6), 2L, 0L);
-        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
-        right = ContextState.wrap(ByteBufferUtil.clone(left.context));
-
-        leftCell  = new BufferCounterCell(cellname("x"), left.context,  1L);
-        rightCell = new BufferCounterCell(cellname("x"), right.context, 1L);
-        assert leftCell.diff(rightCell) == null;
+        leftCell = createCounterCellFromContext(cfs, col, leftContext, 1);
+        rightCell = createCounterCellFromContext(cfs, col, rightContext, 1);
+        assertEquals(CounterContext.Relationship.EQUAL, CounterContext.instance().diff(leftCell.value(), rightCell.value()));
 
         // greater than: left has superset of nodes (counts equal)
-        left = ContextState.allocate(0, 0, 4);
-        left.writeRemote(CounterId.fromInt(3), 3L, 0L);
-        left.writeRemote(CounterId.fromInt(6), 2L, 0L);
-        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
-        left.writeRemote(CounterId.fromInt(12), 0L, 0L);
+        leftContext = ContextState.allocate(0, 0, 4);
+        leftContext.writeRemote(CounterId.fromInt(3), 3L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(6), 2L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(12), 0L, 0L);
 
-        right = ContextState.allocate(0, 0, 3);
-        right.writeRemote(CounterId.fromInt(3), 3L, 0L);
-        right.writeRemote(CounterId.fromInt(6), 2L, 0L);
-        right.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        rightContext = ContextState.allocate(0, 0, 3);
+        rightContext.writeRemote(CounterId.fromInt(3), 3L, 0L);
+        rightContext.writeRemote(CounterId.fromInt(6), 2L, 0L);
+        rightContext.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        leftCell  = new BufferCounterCell(cellname("x"), left.context,  1L);
-        rightCell = new BufferCounterCell(cellname("x"), right.context, 1L);
-        assert leftCell.diff(rightCell) == null;
-
-        // less than: right has subset of nodes (counts equal)
-        assert leftCell == rightCell.diff(leftCell);
+        leftCell = createCounterCellFromContext(cfs, col, leftContext, 1);
+        rightCell = createCounterCellFromContext(cfs, col, rightContext, 1);
+        assertEquals(CounterContext.Relationship.GREATER_THAN, CounterContext.instance().diff(leftCell.value(), rightCell.value()));
+        assertEquals(CounterContext.Relationship.LESS_THAN, CounterContext.instance().diff(rightCell.value(), leftCell.value()));
 
         // disjoint: right and left have disjoint node sets
-        left = ContextState.allocate(0, 0, 3);
-        left.writeRemote(CounterId.fromInt(3), 1L, 0L);
-        left.writeRemote(CounterId.fromInt(4), 1L, 0L);
-        left.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        leftContext = ContextState.allocate(0, 0, 3);
+        leftContext.writeRemote(CounterId.fromInt(3), 1L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(4), 1L, 0L);
+        leftContext.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        right = ContextState.allocate(0, 0, 3);
-        right.writeRemote(CounterId.fromInt(3), 1L, 0L);
-        right.writeRemote(CounterId.fromInt(6), 1L, 0L);
-        right.writeRemote(CounterId.fromInt(9), 1L, 0L);
+        rightContext = ContextState.allocate(0, 0, 3);
+        rightContext.writeRemote(CounterId.fromInt(3), 1L, 0L);
+        rightContext.writeRemote(CounterId.fromInt(6), 1L, 0L);
+        rightContext.writeRemote(CounterId.fromInt(9), 1L, 0L);
 
-        leftCell  = new BufferCounterCell(cellname("x"), left.context,  1L);
-        rightCell = new BufferCounterCell(cellname("x"), right.context, 1L);
-        assert rightCell == leftCell.diff(rightCell);
-        assert leftCell  == rightCell.diff(leftCell);
-    }
-
-    @Test
-    public void testSerializeDeserialize() throws IOException
-    {
-        CounterContext.ContextState state = CounterContext.ContextState.allocate(0, 2, 2);
-        state.writeRemote(CounterId.fromInt(1), 4L, 4L);
-        state.writeLocal(CounterId.fromInt(2), 4L, 4L);
-        state.writeRemote(CounterId.fromInt(3), 4L, 4L);
-        state.writeLocal(CounterId.fromInt(4), 4L, 4L);
-
-        CellNameType type = new SimpleDenseCellNameType(UTF8Type.instance);
-        CounterCell original = new BufferCounterCell(cellname("x"), state.context, 1L);
-        byte[] serialized;
-        try (DataOutputBuffer bufOut = new DataOutputBuffer())
-        {
-            type.columnSerializer().serialize(original, bufOut);
-            serialized = bufOut.getData();
-        }
-
-
-        ByteArrayInputStream bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
-        CounterCell deserialized = (CounterCell) type.columnSerializer().deserialize(new DataInputStream(bufIn));
-        Assert.assertEquals(original, deserialized);
-
-        bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
-        CounterCell deserializedOnRemote = (CounterCell) type.columnSerializer().deserialize(new DataInputStream(bufIn), ColumnSerializer.Flag.FROM_REMOTE);
-        Assert.assertEquals(deserializedOnRemote.name(), original.name());
-        Assert.assertEquals(deserializedOnRemote.total(), original.total());
-        Assert.assertEquals(deserializedOnRemote.value(), cc.clearAllLocal(original.value()));
-        Assert.assertEquals(deserializedOnRemote.timestamp(), deserialized.timestamp());
-        Assert.assertEquals(deserializedOnRemote.timestampOfLastDelete(), deserialized.timestampOfLastDelete());
+        leftCell = createCounterCellFromContext(cfs, col, leftContext, 1);
+        rightCell = createCounterCellFromContext(cfs, col, rightContext, 1);
+        assertEquals(CounterContext.Relationship.DISJOINT, CounterContext.instance().diff(leftCell.value(), rightCell.value()));
+        assertEquals(CounterContext.Relationship.DISJOINT, CounterContext.instance().diff(rightCell.value(), leftCell.value()));
     }
 
     @Test
     public void testUpdateDigest() throws Exception
     {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
+        ByteBuffer col = ByteBufferUtil.bytes("val");
+
         MessageDigest digest1 = MessageDigest.getInstance("md5");
         MessageDigest digest2 = MessageDigest.getInstance("md5");
 
@@ -309,12 +273,34 @@
         state.writeRemote(CounterId.fromInt(3), 4L, 4L);
         state.writeLocal(CounterId.fromInt(4), 4L, 4L);
 
-        CounterCell original = new BufferCounterCell(cellname("x"), state.context, 1L);
-        CounterCell cleared = new BufferCounterCell(cellname("x"), cc.clearAllLocal(state.context), 1L);
+        Cell original = createCounterCellFromContext(cfs, col, state, 5);
 
-        original.updateDigest(digest1);
-        cleared.updateDigest(digest2);
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(col);
+        Cell cleared = BufferCell.live(cfs.metadata, cDef, 5, CounterContext.instance().clearAllLocal(state.context));
+
+        original.digest(digest1);
+        cleared.digest(digest2);
 
         assert Arrays.equals(digest1.digest(), digest2.digest());
     }
+
+    @Test
+    public void testDigestWithEmptyCells() throws Exception
+    {
+        // For DB-1881
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(COUNTER1);
+
+        ColumnDefinition emptyColDef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val2"));
+        BufferCell emptyCell = BufferCell.live(cfs.metadata, emptyColDef, 0, ByteBuffer.allocate(0));
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(0);
+        builder.newRow(new Clustering(AsciiSerializer.instance.serialize("test")));
+        builder.addCell(emptyCell);
+        Row row = builder.build();
+
+        MessageDigest digest = MessageDigest.getInstance("md5");
+        row.digest(digest);
+        assertNotNull(digest.digest());
+    }
+
 }

diff --git a/test/unit/org/apache/cassandra/db/CounterMutationTest.java b/test/unit/org/apache/cassandra/db/CounterMutationTest.java
index 0aa33c5..912dd68 100644
--- a/test/unit/org/apache/cassandra/db/CounterMutationTest.java
+++ b/test/unit/org/apache/cassandra/db/CounterMutationTest.java

@@ -17,27 +17,20 @@
  */
 package org.apache.cassandra.db;
 
-import java.nio.ByteBuffer;
-
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.CounterColumnType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.dk;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
 public class CounterMutationTest
 {
@@ -50,10 +43,9 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF1).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF2).defaultValidator(CounterColumnType.instance));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.counterCFMD(KEYSPACE1, CF1),
+                                    SchemaLoader.counterCFMD(KEYSPACE1, CF2));
     }
 
     @Test
@@ -61,28 +53,26 @@
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF1);
         cfs.truncateBlocking();
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
 
         // Do the initial update (+1)
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 1L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        addAndCheck(cfs, 1, 1);
 
         // Make another increment (+2)
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 2L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(3L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        addAndCheck(cfs, 2, 3);
 
         // Decrement to 0 (-3)
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), -3L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(0L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
-        assertEquals(ClockAndCount.create(3L, 0L), cfs.getCachedCounter(bytes(1), cellname(1)));
+        addAndCheck(cfs, -3, 0);
+    }
+
+    private void addAndCheck(ColumnFamilyStore cfs, long toAdd, long expected)
+    {
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        Mutation m = new RowUpdateBuilder(cfs.metadata, 5, "key1").clustering("cc").add("val", toAdd).build();
+        new CounterMutation(m, ConsistencyLevel.ONE).apply();
+
+        Row row = Util.getOnlyRow(Util.cmd(cfs).includeRow("cc").columns("val").build());
+        assertEquals(expected, CounterContext.instance().total(row.getCell(cDef).value()));
     }
 
     @Test
@@ -92,74 +82,79 @@
         cfs.truncateBlocking();
 
         // Do the initial update (+1, -1)
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 1L);
-        cells.addCounter(cellname(2), -1L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
-        assertEquals(-1L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+        addTwoAndCheck(cfs, 1L, 1L, -1L, -1L);
 
         // Make another increment (+2, -2)
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 2L);
-        cells.addCounter(cellname(2), -2L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(3L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
+        addTwoAndCheck(cfs, 2L, 3L, -2L, -3L);
 
         // Decrement to 0 (-3, +3)
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), -3L);
-        cells.addCounter(cellname(2), 3L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(0L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
-        assertEquals(0L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+        addTwoAndCheck(cfs, -3L, 0L, 3L, 0L);
+    }
 
-        // Check the caches, separately
-        assertEquals(ClockAndCount.create(3L, 0L), cfs.getCachedCounter(bytes(1), cellname(1)));
-        assertEquals(ClockAndCount.create(3L, 0L), cfs.getCachedCounter(bytes(1), cellname(2)));
+    private void addTwoAndCheck(ColumnFamilyStore cfs, long addOne, long expectedOne, long addTwo, long expectedTwo)
+    {
+        ColumnDefinition cDefOne = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        ColumnDefinition cDefTwo = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val2"));
+
+        Mutation m = new RowUpdateBuilder(cfs.metadata, 5, "key1")
+            .clustering("cc")
+            .add("val", addOne)
+            .add("val2", addTwo)
+            .build();
+        new CounterMutation(m, ConsistencyLevel.ONE).apply();
+
+        Row row = Util.getOnlyRow(Util.cmd(cfs).includeRow("cc").columns("val", "val2").build());
+        assertEquals(expectedOne, CounterContext.instance().total(row.getCell(cDefOne).value()));
+        assertEquals(expectedTwo, CounterContext.instance().total(row.getCell(cDefTwo).value()));
     }
 
     @Test
     public void testBatch() throws WriteTimeoutException
     {
-        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF1);
-        ColumnFamilyStore cfs2 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF2);
+        ColumnFamilyStore cfsOne = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF1);
+        ColumnFamilyStore cfsTwo = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF2);
 
-        cfs1.truncateBlocking();
-        cfs2.truncateBlocking();
+        cfsOne.truncateBlocking();
+        cfsTwo.truncateBlocking();
 
         // Do the update (+1, -1), (+2, -2)
-        ColumnFamily cells1 = ArrayBackedSortedColumns.factory.create(cfs1.metadata);
-        cells1.addCounter(cellname(1), 1L);
-        cells1.addCounter(cellname(2), -1L);
+        Mutation batch = new Mutation(KEYSPACE1, Util.dk("key1"));
+        batch.add(new RowUpdateBuilder(cfsOne.metadata, 5, "key1")
+            .clustering("cc")
+            .add("val", 1L)
+            .add("val2", -1L)
+            .build().get(cfsOne.metadata));
 
-        ColumnFamily cells2 = ArrayBackedSortedColumns.factory.create(cfs2.metadata);
-        cells2.addCounter(cellname(1), 2L);
-        cells2.addCounter(cellname(2), -2L);
+        batch.add(new RowUpdateBuilder(cfsTwo.metadata, 5, "key1")
+            .clustering("cc")
+            .add("val", 2L)
+            .add("val2", -2L)
+            .build().get(cfsTwo.metadata));
 
-        Mutation mutation = new Mutation(KEYSPACE1, bytes(1));
-        mutation.add(cells1);
-        mutation.add(cells2);
+        new CounterMutation(batch, ConsistencyLevel.ONE).apply();
 
-        new CounterMutation(mutation, ConsistencyLevel.ONE).apply();
+        ColumnDefinition c1cfs1 = cfsOne.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        ColumnDefinition c2cfs1 = cfsOne.metadata.getColumnDefinition(ByteBufferUtil.bytes("val2"));
 
-        // Validate all values
-        ColumnFamily current1 = cfs1.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        ColumnFamily current2 = cfs2.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF2, System.currentTimeMillis()));
+        Row row = Util.getOnlyRow(Util.cmd(cfsOne).includeRow("cc").columns("val", "val2").build());
+        assertEquals(1L, CounterContext.instance().total(row.getCell(c1cfs1).value()));
+        assertEquals(-1L, CounterContext.instance().total(row.getCell(c2cfs1).value()));
 
-        assertEquals(1L, CounterContext.instance().total(current1.getColumn(cellname(1)).value()));
-        assertEquals(-1L, CounterContext.instance().total(current1.getColumn(cellname(2)).value()));
-        assertEquals(2L, CounterContext.instance().total(current2.getColumn(cellname(1)).value()));
-        assertEquals(-2L, CounterContext.instance().total(current2.getColumn(cellname(2)).value()));
+        ColumnDefinition c1cfs2 = cfsTwo.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        ColumnDefinition c2cfs2 = cfsTwo.metadata.getColumnDefinition(ByteBufferUtil.bytes("val2"));
+        row = Util.getOnlyRow(Util.cmd(cfsTwo).includeRow("cc").columns("val", "val2").build());
+        assertEquals(2L, CounterContext.instance().total(row.getCell(c1cfs2).value()));
+        assertEquals(-2L, CounterContext.instance().total(row.getCell(c2cfs2).value()));
 
         // Check the caches, separately
-        assertEquals(ClockAndCount.create(1L, 1L), cfs1.getCachedCounter(bytes(1), cellname(1)));
-        assertEquals(ClockAndCount.create(1L, -1L), cfs1.getCachedCounter(bytes(1), cellname(2)));
-        assertEquals(ClockAndCount.create(1L, 2L), cfs2.getCachedCounter(bytes(1), cellname(1)));
-        assertEquals(ClockAndCount.create(1L, -2L), cfs2.getCachedCounter(bytes(1), cellname(2)));
+        CBuilder cb = CBuilder.create(cfsOne.metadata.comparator);
+        cb.add("cc");
+
+        assertEquals(ClockAndCount.create(1L, 1L), cfsOne.getCachedCounter(Util.dk("key1").getKey(), cb.build(), c1cfs1, null));
+        assertEquals(ClockAndCount.create(1L, -1L), cfsOne.getCachedCounter(Util.dk("key1").getKey(), cb.build(), c2cfs1, null));
+
+        assertEquals(ClockAndCount.create(1L, 2L), cfsTwo.getCachedCounter(Util.dk("key1").getKey(), cb.build(), c1cfs2, null));
+        assertEquals(ClockAndCount.create(1L, -2L), cfsTwo.getCachedCounter(Util.dk("key1").getKey(), cb.build(), c2cfs2, null));
     }
 
     @Test
@@ -167,67 +162,57 @@
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF1);
         cfs.truncateBlocking();
+        ColumnDefinition cOne = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        ColumnDefinition cTwo = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val2"));
 
         // Do the initial update (+1, -1)
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 1L);
-        cells.addCounter(cellname(2), 1L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(1)).value()));
-        assertEquals(1L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+        new CounterMutation(
+            new RowUpdateBuilder(cfs.metadata, 5, "key1")
+                .clustering("cc")
+                .add("val", 1L)
+                .add("val2", -1L)
+                .build(),
+            ConsistencyLevel.ONE).apply();
+
+        Row row = Util.getOnlyRow(Util.cmd(cfs).includeRow("cc").columns("val", "val2").build());
+        assertEquals(1L, CounterContext.instance().total(row.getCell(cOne).value()));
+        assertEquals(-1L, CounterContext.instance().total(row.getCell(cTwo).value()));
 
         // Remove the first counter, increment the second counter
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addTombstone(cellname(1), (int) System.currentTimeMillis() / 1000, FBUtilities.timestampMicros());
-        cells.addCounter(cellname(2), 1L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertNull(current.getColumn(cellname(1)));
-        assertEquals(2L, CounterContext.instance().total(current.getColumn(cellname(2)).value()));
+        new CounterMutation(
+            new RowUpdateBuilder(cfs.metadata, 5, "key1")
+                .clustering("cc")
+                .delete(cOne)
+                .add("val2", -5L)
+                .build(),
+            ConsistencyLevel.ONE).apply();
+
+        row = Util.getOnlyRow(Util.cmd(cfs).includeRow("cc").columns("val", "val2").build());
+        assertEquals(null, row.getCell(cOne));
+        assertEquals(-6L, CounterContext.instance().total(row.getCell(cTwo).value()));
 
         // Increment the first counter, make sure it's still shadowed by the tombstone
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 1L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertNull(current.getColumn(cellname(1)));
+        new CounterMutation(
+            new RowUpdateBuilder(cfs.metadata, 5, "key1")
+                .clustering("cc")
+                .add("val", 1L)
+                .build(),
+            ConsistencyLevel.ONE).apply();
+        row = Util.getOnlyRow(Util.cmd(cfs).includeRow("cc").columns("val", "val2").build());
+        assertEquals(null, row.getCell(cOne));
 
         // Get rid of the complete partition
-        Mutation mutation = new Mutation(KEYSPACE1, bytes(1));
-        mutation.delete(CF1, FBUtilities.timestampMicros());
-        new CounterMutation(mutation, ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertNull(current.getColumn(cellname(1)));
-        assertNull(current.getColumn(cellname(2)));
+        RowUpdateBuilder.deleteRow(cfs.metadata, 6, "key1", "cc").applyUnsafe();
+        Util.assertEmpty(Util.cmd(cfs).includeRow("cc").columns("val", "val2").build());
 
         // Increment both counters, ensure that both stay dead
-        cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 1L);
-        cells.addCounter(cellname(2), 1L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-        current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        assertNull(current.getColumn(cellname(1)));
-        assertNull(current.getColumn(cellname(2)));
-    }
-
-    @Test
-    public void testDuplicateCells() throws WriteTimeoutException
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF1);
-        cfs.truncateBlocking();
-
-        ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cells.addCounter(cellname(1), 1L);
-        cells.addCounter(cellname(1), 2L);
-        cells.addCounter(cellname(1), 3L);
-        cells.addCounter(cellname(1), 4L);
-        new CounterMutation(new Mutation(KEYSPACE1, bytes(1), cells), ConsistencyLevel.ONE).apply();
-
-        ColumnFamily current = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(bytes(1)), CF1, System.currentTimeMillis()));
-        ByteBuffer context = current.getColumn(cellname(1)).value();
-        assertEquals(10L, CounterContext.instance().total(context));
-        assertEquals(ClockAndCount.create(1L, 10L), CounterContext.instance().getLocalClockAndCount(context));
-        assertEquals(ClockAndCount.create(1L, 10L), cfs.getCachedCounter(bytes(1), cellname(1)));
+        new CounterMutation(
+            new RowUpdateBuilder(cfs.metadata, 6, "key1")
+                .clustering("cc")
+                .add("val", 1L)
+                .add("val2", 1L)
+                .build(),
+            ConsistencyLevel.ONE).apply();
+        Util.assertEmpty(Util.cmd(cfs).includeRow("cc").columns("val", "val2").build());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/DeletePartitionTest.java b/test/unit/org/apache/cassandra/db/DeletePartitionTest.java
new file mode 100644
index 0000000..a65befd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/DeletePartitionTest.java

@@ -0,0 +1,93 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
+
+public class DeletePartitionTest
+{
+    private static final String KEYSPACE1 = "RemoveColumnFamilyTest";
+    private static final String CF_STANDARD1 = "Standard1";
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+    }
+
+    @Test
+    public void testDeletePartition()
+    {
+        testDeletePartition(Util.dk("key1"), true, true);
+        testDeletePartition(Util.dk("key2"), true, false);
+        testDeletePartition(Util.dk("key3"), false, true);
+        testDeletePartition(Util.dk("key4"), false, false);
+    }
+
+    public void testDeletePartition(DecoratedKey key, boolean flushBeforeRemove, boolean flushAfterRemove)
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        ColumnDefinition column = store.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+
+        // write
+        new RowUpdateBuilder(store.metadata, 0, key.getKey())
+                .clustering("Column1")
+                .add("val", "asdf")
+                .build()
+                .applyUnsafe();
+
+        // validate that data's written
+        FilteredPartition partition = Util.getOnlyPartition(Util.cmd(store, key).build());
+        assertTrue(partition.rowCount() > 0);
+        Row r = partition.iterator().next();
+        assertTrue(r.getCell(column).value().equals(ByteBufferUtil.bytes("asdf")));
+
+        if (flushBeforeRemove)
+            store.forceBlockingFlush();
+
+        // delete the partition
+        new Mutation(KEYSPACE1, key)
+                .add(PartitionUpdate.fullPartitionDelete(store.metadata, key, 0, FBUtilities.nowInSeconds()))
+                .applyUnsafe();
+
+        if (flushAfterRemove)
+            store.forceBlockingFlush();
+
+        // validate removal
+        ImmutableBTreePartition partitionUnfiltered = Util.getOnlyPartitionUnfiltered(Util.cmd(store, key).build());
+        assertFalse(partitionUnfiltered.partitionLevelDeletion().isLive());
+        assertFalse(partitionUnfiltered.iterator().hasNext());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/DirectoriesTest.java b/test/unit/org/apache/cassandra/db/DirectoriesTest.java
index 11adef2..5ef001a 100644
--- a/test/unit/org/apache/cassandra/db/DirectoriesTest.java
+++ b/test/unit/org/apache/cassandra/db/DirectoriesTest.java

@@ -17,28 +17,37 @@
  */
 package org.apache.cassandra.db;
 
-import java.io.*;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
 import org.apache.commons.lang3.StringUtils;
-
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.Config.DiskFailurePolicy;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.IndexTarget;
 import org.apache.cassandra.db.Directories.DataDirectory;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.service.DefaultFSErrorHandler;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.utils.Pair;
 
 import static org.junit.Assert.assertEquals;
@@ -51,12 +60,9 @@
 {
     private static File tempDataDir;
     private static final String KS = "ks";
-    private static final String[] CFS = new String[] { "cf1", "ks" };
+    private static final String[] TABLES = new String[] { "cf1", "ks" };
 
-    private static final Set<CFMetaData> CFM = new HashSet<>(CFS.length);
-
-    private static final CFMetaData PARENT_CFM = new CFMetaData(KS, "cf", ColumnFamilyType.Standard, null);
-    private static final CFMetaData INDEX_CFM = new CFMetaData(KS, "cf.idx", ColumnFamilyType.Standard, null, PARENT_CFM.cfId);
+    private static final Set<CFMetaData> CFM = new HashSet<>(TABLES.length);
 
     private static final Map<String, List<File>> files = new HashMap<>();
 
@@ -64,9 +70,14 @@
     public static void beforeClass() throws IOException
     {
         FileUtils.setFSErrorHandler(new DefaultFSErrorHandler());
-        for (String cf : CFS)
+        for (String table : TABLES)
         {
-            CFM.add(new CFMetaData(KS, cf, ColumnFamilyType.Standard, null));
+            UUID tableID = CFMetaData.generateLegacyCfId(KS, table);
+            CFM.add(CFMetaData.Builder.create(KS, table)
+                                      .withId(tableID)
+                                      .addPartitionKey("thekey", UTF8Type.instance)
+                                      .addClusteringColumn("thecolumn", UTF8Type.instance)
+                                      .build());
         }
 
         tempDataDir = File.createTempFile("cassandra", "unittest");
@@ -94,22 +105,22 @@
             File dir = cfDir(cfm);
             dir.mkdirs();
 
-            createFakeSSTable(dir, cfm.cfName, 1, false, fs);
-            createFakeSSTable(dir, cfm.cfName, 2, true, fs);
+            createFakeSSTable(dir, cfm.cfName, 1, fs);
+            createFakeSSTable(dir, cfm.cfName, 2, fs);
 
             File backupDir = new File(dir, Directories.BACKUPS_SUBDIR);
             backupDir.mkdir();
-            createFakeSSTable(backupDir, cfm.cfName, 1, false, fs);
+            createFakeSSTable(backupDir, cfm.cfName, 1, fs);
 
             File snapshotDir = new File(dir, Directories.SNAPSHOT_SUBDIR + File.separator + "42");
             snapshotDir.mkdirs();
-            createFakeSSTable(snapshotDir, cfm.cfName, 1, false, fs);
+            createFakeSSTable(snapshotDir, cfm.cfName, 1, fs);
         }
     }
 
-    private static void createFakeSSTable(File dir, String cf, int gen, boolean temp, List<File> addTo) throws IOException
+    private static void createFakeSSTable(File dir, String cf, int gen, List<File> addTo) throws IOException
     {
-        Descriptor desc = new Descriptor(dir, KS, cf, gen, temp ? Descriptor.Type.TEMP : Descriptor.Type.FINAL);
+        Descriptor desc = new Descriptor(dir, KS, cf, gen);
         for (Component c : new Component[]{ Component.DATA, Component.PRIMARY_INDEX, Component.FILTER })
         {
             File f = new File(desc.filenameFor(c));
@@ -144,7 +155,7 @@
             Directories directories = new Directories(cfm);
             assertEquals(cfDir(cfm), directories.getDirectoryForNewSSTables());
 
-            Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.cfName, 1, Descriptor.Type.FINAL);
+            Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.cfName, 1);
             File snapshotDir = new File(cfDir(cfm),  File.separator + Directories.SNAPSHOT_SUBDIR + File.separator + "42");
             assertEquals(snapshotDir.getCanonicalFile(), Directories.getSnapshotDirectory(desc, "42"));
 
@@ -156,6 +167,21 @@
     @Test
     public void testSecondaryIndexDirectories()
     {
+        UUID tableID = CFMetaData.generateLegacyCfId(KS, "cf");
+        CFMetaData PARENT_CFM = CFMetaData.Builder.create(KS, "cf")
+                                  .withId(tableID)
+                                  .addPartitionKey("thekey", UTF8Type.instance)
+                                  .addClusteringColumn("col", UTF8Type.instance)
+                                  .build();
+        ColumnDefinition col = PARENT_CFM.getColumnDefinition(ByteBufferUtil.bytes("col"));
+        IndexMetadata indexDef =
+            IndexMetadata.fromIndexTargets(PARENT_CFM,
+                                           Collections.singletonList(new IndexTarget(col.name, IndexTarget.Type.VALUES)),
+                                           "idx",
+                                           IndexMetadata.Kind.KEYS,
+                                           Collections.emptyMap());
+        PARENT_CFM.indexes(PARENT_CFM.getIndexes().with(indexDef));
+        CFMetaData INDEX_CFM = CassandraIndex.indexCfsMetadata(PARENT_CFM, indexDef);
         Directories parentDirectories = new Directories(PARENT_CFM);
         Directories indexDirectories = new Directories(INDEX_CFM);
         // secondary index has its own directory
@@ -163,8 +189,8 @@
         {
             assertEquals(cfDir(INDEX_CFM), dir);
         }
-        Descriptor parentDesc = new Descriptor(parentDirectories.getDirectoryForNewSSTables(), KS, PARENT_CFM.cfName, 0, Descriptor.Type.FINAL);
-        Descriptor indexDesc = new Descriptor(indexDirectories.getDirectoryForNewSSTables(), KS, INDEX_CFM.cfName, 0, Descriptor.Type.FINAL);
+        Descriptor parentDesc = new Descriptor(parentDirectories.getDirectoryForNewSSTables(), KS, PARENT_CFM.cfName, 0);
+        Descriptor indexDesc = new Descriptor(indexDirectories.getDirectoryForNewSSTables(), KS, INDEX_CFM.cfName, 0);
 
         // snapshot dir should be created under its parent's
         File parentSnapshotDirectory = Directories.getSnapshotDirectory(parentDesc, "test");
@@ -181,9 +207,9 @@
                      indexDirectories.snapshotCreationTime("test"));
 
         // check true snapshot size
-        Descriptor parentSnapshot = new Descriptor(parentSnapshotDirectory, KS, PARENT_CFM.cfName, 0, Descriptor.Type.FINAL);
+        Descriptor parentSnapshot = new Descriptor(parentSnapshotDirectory, KS, PARENT_CFM.cfName, 0);
         createFile(parentSnapshot.filenameFor(Component.DATA), 30);
-        Descriptor indexSnapshot = new Descriptor(indexSnapshotDirectory, KS, INDEX_CFM.cfName, 0, Descriptor.Type.FINAL);
+        Descriptor indexSnapshot = new Descriptor(indexSnapshotDirectory, KS, INDEX_CFM.cfName, 0);
         createFile(indexSnapshot.filenameFor(Component.DATA), 40);
 
         assertEquals(30, parentDirectories.trueSnapshotsSize());
@@ -222,53 +248,83 @@
         for (CFMetaData cfm : CFM)
         {
             Directories directories = new Directories(cfm);
-            Directories.SSTableLister lister;
-            Set<File> listed;
-
-            // List all but no snapshot, backup
-            lister = directories.sstableLister();
-            listed = new HashSet<>(lister.listFiles());
-            for (File f : files.get(cfm.cfName))
-            {
-                if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR) || f.getPath().contains(Directories.BACKUPS_SUBDIR))
-                    assert !listed.contains(f) : f + " should not be listed";
-                else
-                    assert listed.contains(f) : f + " is missing";
-            }
-
-            // List all but including backup (but no snapshot)
-            lister = directories.sstableLister().includeBackups(true);
-            listed = new HashSet<>(lister.listFiles());
-            for (File f : files.get(cfm.cfName))
-            {
-                if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR))
-                    assert !listed.contains(f) : f + " should not be listed";
-                else
-                    assert listed.contains(f) : f + " is missing";
-            }
-
-            // Skip temporary and compacted
-            lister = directories.sstableLister().skipTemporary(true);
-            listed = new HashSet<>(lister.listFiles());
-            for (File f : files.get(cfm.cfName))
-            {
-                if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR) || f.getPath().contains(Directories.BACKUPS_SUBDIR))
-                    assert !listed.contains(f) : f + " should not be listed";
-                else if (f.getName().contains("tmp-"))
-                    assert !listed.contains(f) : f + " should not be listed";
-                else
-                    assert listed.contains(f) : f + " is missing";
-            }
+            checkFiles(cfm, directories);
         }
     }
 
+    private void checkFiles(CFMetaData cfm, Directories directories)
+    {
+        Directories.SSTableLister lister;
+        Set<File> listed;// List all but no snapshot, backup
+        lister = directories.sstableLister(Directories.OnTxnErr.THROW);
+        listed = new HashSet<>(lister.listFiles());
+        for (File f : files.get(cfm.cfName))
+        {
+            if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR) || f.getPath().contains(Directories.BACKUPS_SUBDIR))
+                assertFalse(f + " should not be listed", listed.contains(f));
+            else
+                assertTrue(f + " is missing", listed.contains(f));
+        }
+
+        // List all but including backup (but no snapshot)
+        lister = directories.sstableLister(Directories.OnTxnErr.THROW).includeBackups(true);
+        listed = new HashSet<>(lister.listFiles());
+        for (File f : files.get(cfm.cfName))
+        {
+            if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR))
+                assertFalse(f + " should not be listed", listed.contains(f));
+            else
+                assertTrue(f + " is missing", listed.contains(f));
+        }
+
+        // Skip temporary and compacted
+        lister = directories.sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true);
+        listed = new HashSet<>(lister.listFiles());
+        for (File f : files.get(cfm.cfName))
+        {
+            if (f.getPath().contains(Directories.SNAPSHOT_SUBDIR) || f.getPath().contains(Directories.BACKUPS_SUBDIR))
+                assertFalse(f + " should not be listed", listed.contains(f));
+            else if (f.getName().contains("tmp-"))
+                assertFalse(f + " should not be listed", listed.contains(f));
+            else
+                assertTrue(f + " is missing", listed.contains(f));
+        }
+    }
+
+    @Test
+    public void testTemporaryFile() throws IOException
+    {
+        for (CFMetaData cfm : CFM)
+        {
+            Directories directories = new Directories(cfm);
+
+            File tempDir = directories.getTemporaryWriteableDirectoryAsFile(10);
+            tempDir.mkdir();
+            File tempFile = new File(tempDir, "tempFile");
+            tempFile.createNewFile();
+
+            assertTrue(tempDir.exists());
+            assertTrue(tempFile.exists());
+
+            //make sure temp dir/file will not affect existing sstable listing
+            checkFiles(cfm, directories);
+
+            directories.removeTemporaryDirectories();
+
+            //make sure temp dir/file deletion will not affect existing sstable listing
+            checkFiles(cfm, directories);
+
+            assertFalse(tempDir.exists());
+            assertFalse(tempFile.exists());
+        }
+    }
 
     @Test
     public void testDiskFailurePolicy_best_effort()
     {
         DiskFailurePolicy origPolicy = DatabaseDescriptor.getDiskFailurePolicy();
-        
-        try 
+
+        try
         {
             DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.best_effort);
             // Fake a Directory creation failure
@@ -301,7 +357,7 @@
             final String n = Long.toString(System.nanoTime());
             Callable<File> directoryGetter = new Callable<File>() {
                 public File call() throws Exception {
-                    Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.cfName, 1, Descriptor.Type.FINAL);
+                    Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.cfName, 1);
                     return Directories.getSnapshotDirectory(desc, n);
                 }
             };
@@ -414,6 +470,46 @@
         }
     }
 
+    @Test
+    public void testGetLocationForDisk()
+    {
+        DataDirectory [] paths = new DataDirectory[3];
+        paths[0] = new DataDirectory(new File("/tmp/aaa"));
+        paths[1] = new DataDirectory(new File("/tmp/aa"));
+        paths[2] = new DataDirectory(new File("/tmp/a"));
+
+        for (CFMetaData cfm : CFM)
+        {
+            Directories dirs = new Directories(cfm, paths);
+            for (DataDirectory dir : paths)
+            {
+                String p = dirs.getLocationForDisk(dir).getAbsolutePath() + File.separator;
+                assertTrue(p.startsWith(dir.location.getAbsolutePath() + File.separator));
+            }
+        }
+    }
+
+    @Test
+    public void testGetLocationWithSymlinks() throws IOException
+    {
+        Path p = Files.createTempDirectory("something");
+        Path symlinktarget = Files.createDirectories(p.resolve("symlinktarget"));
+        Path ddir = Files.createDirectories(p.resolve("datadir1"));
+
+        Path p1 = Files.createDirectories(ddir.resolve("p1").resolve("ks")).getParent(); // the data dir does not include the keyspace dir
+        Path p2 = Files.createDirectories(ddir.resolve("p2"));
+        Path l1 = Files.createSymbolicLink(p2.resolve("ks"), symlinktarget);
+
+        DataDirectory path1 = new DataDirectory(p1.toFile());
+        DataDirectory path2 = new DataDirectory(p2.toFile());
+        Directories dirs = new Directories(CFM.iterator().next(), new DataDirectory[] {path1, path2});
+        dirs.getLocationForDisk(new DataDirectory(p1.toFile()));
+        dirs.getLocationForDisk(new DataDirectory(p2.toFile()));
+
+        assertTrue(dirs.getLocationForDisk(path2).toPath().startsWith(l1));
+        assertTrue(dirs.getLocationForDisk(path1).toPath().startsWith(p1));
+    }
+
     private List<Directories.DataDirectoryCandidate> getWriteableDirectories(DataDirectory[] dataDirectories, long writeSize)
     {
         // copied from Directories.getWriteableLocation(long)

diff --git a/test/unit/org/apache/cassandra/db/HintedHandOffTest.java b/test/unit/org/apache/cassandra/db/HintedHandOffTest.java
deleted file mode 100644
index 3d6e5cf..0000000
--- a/test/unit/org/apache/cassandra/db/HintedHandOffTest.java
+++ /dev/null

@@ -1,145 +0,0 @@
-/*
- * 
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- */
-package org.apache.cassandra.db;
-
-import java.net.InetAddress;
-import java.util.Map;
-import java.util.UUID;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
-
-import static org.junit.Assert.assertEquals;
-import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
-
-public class HintedHandOffTest
-{
-
-    public static final String KEYSPACE4 = "HintedHandOffTest4";
-    public static final String STANDARD1_CF = "Standard1";
-    public static final String COLUMN1 = "column1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE4,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE4, STANDARD1_CF));
-    }
-
-    // Test compaction of hints column family. It shouldn't remove all columns on compaction.
-    @Test
-    public void testCompactionOfHintsCF() throws Exception
-    {
-        // prepare hints column family
-        Keyspace systemKeyspace = Keyspace.open("system");
-        ColumnFamilyStore hintStore = systemKeyspace.getColumnFamilyStore(SystemKeyspace.HINTS);
-        hintStore.clearUnsafe();
-        hintStore.metadata.gcGraceSeconds(36000); // 10 hours
-        hintStore.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
-        hintStore.disableAutoCompaction();
-
-        // insert 1 hint
-        Mutation rm = new Mutation(KEYSPACE4, ByteBufferUtil.bytes(1));
-        rm.add(STANDARD1_CF, Util.cellname(COLUMN1), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
-
-        HintedHandOffManager.instance.hintFor(rm,
-                                              System.currentTimeMillis(),
-                                              HintedHandOffManager.calculateHintTTL(rm),
-                                              Pair.create(InetAddress.getByName("127.0.0.1"), UUID.randomUUID()))
-                                     .applyUnsafe();
-
-        // flush data to disk
-        hintStore.forceBlockingFlush();
-        assertEquals(1, hintStore.getSSTables().size());
-
-        // submit compaction
-        HintedHandOffManager.instance.compact();
-
-        // single row should not be removed because of gc_grace_seconds
-        // is 10 hours and there are no any tombstones in sstable
-        assertEquals(1, hintStore.getSSTables().size());
-    }
-
-    @Test
-    public void testHintsMetrics() throws Exception
-    {
-        for (int i = 0; i < 99; i++)
-            HintedHandOffManager.instance.metrics.incrPastWindow(InetAddress.getByName("127.0.0.1"));
-        HintedHandOffManager.instance.metrics.log();
-
-        UntypedResultSet rows = executeInternal("SELECT hints_dropped FROM system." + SystemKeyspace.PEER_EVENTS);
-        Map<UUID, Integer> returned = rows.one().getMap("hints_dropped", UUIDType.instance, Int32Type.instance);
-        assertEquals(Iterators.getLast(returned.values().iterator()).intValue(), 99);
-    }
-
-    @Test(timeout = 5000)
-    public void testTruncateHints() throws Exception
-    {
-        Keyspace systemKeyspace = Keyspace.open("system");
-        ColumnFamilyStore hintStore = systemKeyspace.getColumnFamilyStore(SystemKeyspace.HINTS);
-        hintStore.clearUnsafe();
-
-        // insert 1 hint
-        Mutation rm = new Mutation(KEYSPACE4, ByteBufferUtil.bytes(1));
-        rm.add(STANDARD1_CF, Util.cellname(COLUMN1), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
-
-        HintedHandOffManager.instance.hintFor(rm,
-                                              System.currentTimeMillis(),
-                                              HintedHandOffManager.calculateHintTTL(rm),
-                                              Pair.create(InetAddress.getByName("127.0.0.1"), UUID.randomUUID()))
-                                     .applyUnsafe();
-
-        assert getNoOfHints() == 1;
-
-        HintedHandOffManager.instance.truncateAllHints();
-
-        while(getNoOfHints() > 0)
-        {
-            Thread.sleep(100);
-        }
-
-        assert getNoOfHints() == 0;
-    }
-
-    private int getNoOfHints()
-    {
-        String req = "SELECT * FROM system.%s";
-        UntypedResultSet resultSet = executeInternal(String.format(req, SystemKeyspace.HINTS));
-        return resultSet.size();
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/KeyCacheTest.java b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
index c8caff9..515d30e 100644
--- a/test/unit/org/apache/cassandra/db/KeyCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyCacheTest.java

@@ -34,20 +34,18 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.cache.KeyCacheKey;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.composites.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
 import org.apache.cassandra.utils.concurrent.Refs;
+
 import static org.junit.Assert.assertEquals;
 
 public class KeyCacheTest
@@ -63,8 +61,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, COLUMN_FAMILY1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, COLUMN_FAMILY2),
                                     SchemaLoader.standardCFMD(KEYSPACE1, COLUMN_FAMILY3));
@@ -92,7 +89,7 @@
         store.forceBlockingFlush();
 
         // populate the cache
-        SchemaLoader.readData(KEYSPACE1, COLUMN_FAMILY2, 0, 100);
+        readData(KEYSPACE1, COLUMN_FAMILY2, 0, 100);
         assertKeyCacheSize(100, KEYSPACE1, COLUMN_FAMILY2);
 
         // really? our caches don't implement the map interface? (hence no .addAll)
@@ -143,10 +140,10 @@
         SchemaLoader.insertData(KEYSPACE1, COLUMN_FAMILY3, 0, 100);
         store.forceBlockingFlush();
 
-        Collection<SSTableReader> firstFlushTables = ImmutableList.copyOf(store.getSSTables());
+        Collection<SSTableReader> firstFlushTables = ImmutableList.copyOf(store.getLiveSSTables());
 
         // populate the cache
-        SchemaLoader.readData(KEYSPACE1, COLUMN_FAMILY3, 0, 100);
+        readData(KEYSPACE1, COLUMN_FAMILY3, 0, 100);
         assertKeyCacheSize(100, KEYSPACE1, COLUMN_FAMILY3);
 
         // insert some new data and force to disk
@@ -154,7 +151,7 @@
         store.forceBlockingFlush();
 
         // check that it's fine
-        SchemaLoader.readData(KEYSPACE1, COLUMN_FAMILY3, 100, 50);
+        readData(KEYSPACE1, COLUMN_FAMILY3, 100, 50);
         assertKeyCacheSize(150, KEYSPACE1, COLUMN_FAMILY3);
 
         // force the cache to disk
@@ -192,41 +189,22 @@
         // KeyCache should start at size 0 if we're caching X% of zero data.
         assertKeyCacheSize(0, KEYSPACE1, COLUMN_FAMILY1);
 
-        DecoratedKey key1 = Util.dk("key1");
-        DecoratedKey key2 = Util.dk("key2");
         Mutation rm;
 
         // inserts
-        rm = new Mutation(KEYSPACE1, key1.getKey());
-        rm.add(COLUMN_FAMILY1, Util.cellname("1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.applyUnsafe();
-        rm = new Mutation(KEYSPACE1, key2.getKey());
-        rm.add(COLUMN_FAMILY1, Util.cellname("2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "key1").clustering("1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "key2").clustering("2").build().applyUnsafe();
 
         // to make sure we have SSTable
         cfs.forceBlockingFlush();
 
         // reads to cache key position
-        cfs.getColumnFamily(QueryFilter.getSliceFilter(key1,
-                                                       COLUMN_FAMILY1,
-                                                       Composites.EMPTY,
-                                                       Composites.EMPTY,
-                                                       false,
-                                                       10,
-                                                       System.currentTimeMillis()));
-
-        cfs.getColumnFamily(QueryFilter.getSliceFilter(key2,
-                                                       COLUMN_FAMILY1,
-                                                       Composites.EMPTY,
-                                                       Composites.EMPTY,
-                                                       false,
-                                                       10,
-                                                       System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cfs, "key1").build());
+        Util.getAll(Util.cmd(cfs, "key2").build());
 
         assertKeyCacheSize(2, KEYSPACE1, COLUMN_FAMILY1);
 
-        Set<SSTableReader> readers = cfs.getTracker().getSSTables();
+        Set<SSTableReader> readers = cfs.getLiveSSTables();
         Refs<SSTableReader> refs = Refs.tryRef(readers);
         if (refs == null)
             throw new IllegalStateException();
@@ -241,32 +219,26 @@
 
         refs.release();
 
-        Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS);;
-        while (ScheduledExecutors.nonPeriodicTasks.getActiveCount() + ScheduledExecutors.nonPeriodicTasks.getQueue().size() > 0);
+        LifecycleTransaction.waitForDeletions();
 
         // after releasing the reference this should drop to 2
         assertKeyCacheSize(2, KEYSPACE1, COLUMN_FAMILY1);
 
         // re-read same keys to verify that key cache didn't grow further
-        cfs.getColumnFamily(QueryFilter.getSliceFilter(key1,
-                                                       COLUMN_FAMILY1,
-                                                       Composites.EMPTY,
-                                                       Composites.EMPTY,
-                                                       false,
-                                                       10,
-                                                       System.currentTimeMillis()));
-
-        cfs.getColumnFamily(QueryFilter.getSliceFilter(key2,
-                                                       COLUMN_FAMILY1,
-                                                       Composites.EMPTY,
-                                                       Composites.EMPTY,
-                                                       false,
-                                                       10,
-                                                       System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cfs, "key1").build());
+        Util.getAll(Util.cmd(cfs, "key2").build());
 
         assertKeyCacheSize(noEarlyOpen ? 4 : 2, KEYSPACE1, COLUMN_FAMILY1);
     }
 
+    private static void readData(String keyspace, String columnFamily, int startRow, int numberOfRows)
+    {
+        ColumnFamilyStore store = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
+        for (int i = 0; i < numberOfRows; i++)
+            Util.getAll(Util.cmd(store, "key" + (i + startRow)).includeRow("col" + (i + startRow)).build());
+    }
+
+
     private void assertKeyCacheSize(int expected, String keyspace, String columnFamily)
     {
         int size = 0;

diff --git a/test/unit/org/apache/cassandra/db/KeyspaceTest.java b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
index d9481ca..dd11c1c 100644
--- a/test/unit/org/apache/cassandra/db/KeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyspaceTest.java

@@ -19,639 +19,458 @@
 package org.apache.cassandra.db;
 
 import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.text.DecimalFormat;
-import java.text.NumberFormat;
 import java.util.*;
-import java.io.IOException;
 
-import com.google.common.collect.Iterables;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.commons.lang3.StringUtils;
-import org.junit.BeforeClass;
+import org.apache.cassandra.metrics.ClearableHistogram;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
 import org.junit.Test;
 
 import static org.junit.Assert.*;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.metrics.ClearableHistogram;
-import org.apache.cassandra.utils.WrappedRunnable;
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.Util.expiringColumn;
-import static org.apache.cassandra.Util.cellname;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 
-public class KeyspaceTest
+public class KeyspaceTest extends CQLTester
 {
-    private static final DecoratedKey TEST_KEY = Util.dk("key1");
-    private static final DecoratedKey TEST_SLICE_KEY = Util.dk("key1-slicerange");
-
-    private static final String KEYSPACE1 = "Keyspace1";
-    private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_STANDARD2 = "Standard2";
-    private static final String CF_STANDARDLONG = "StandardLong1";
-    private static final String CF_STANDARDCOMPOSITE2 = "StandardComposite2";
-
-    private static final String KEYSPACE2 = "Keyspace2";
-    private static final String CF_STANDARD3 = "Standard3";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        AbstractType<?> compositeMaxMin = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{BytesType.instance, IntegerType.instance}));
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLONG),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDCOMPOSITE2, compositeMaxMin));
-        SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD3));
-    }
-
-    public static void reTest(ColumnFamilyStore cfs, Runnable verify) throws Exception
-    {
-        verify.run();
-        cfs.forceBlockingFlush();
-        verify.run();
-    }
-
     @Test
     public void testGetRowNoColumns() throws Throwable
     {
-        final Keyspace keyspace = Keyspace.open(KEYSPACE2);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard3");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
 
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE2, "Standard3");
-        cf.addColumn(column("col1","val1", 1L));
-        Mutation rm = new Mutation(KEYSPACE2, TEST_KEY.getKey(), cf);
-        rm.applyUnsafe();
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", 0, 0);
 
-        Runnable verify = new WrappedRunnable()
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+
+        for (int round = 0; round < 2; round++)
         {
-            public void runMayThrow() throws Exception
-            {
-                ColumnFamily cf;
+            // slice with limit 0
+            Util.assertEmpty(Util.cmd(cfs, "0").columns("c").withLimit(0).build());
 
-                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY));
-                assertColumns(cf);
+            // slice with nothing in between the bounds
+            Util.assertEmpty(Util.cmd(cfs, "0").columns("c").fromIncl(1).toIncl(1).build());
 
-                cf = cfStore.getColumnFamily(QueryFilter.getSliceFilter(TEST_KEY, "Standard3", Composites.EMPTY, Composites.EMPTY, false, 0, System.currentTimeMillis()));
-                assertColumns(cf);
+            // fetch a non-existent name
+            Util.assertEmpty(Util.cmd(cfs, "0").columns("c").includeRow(1).build());
 
-                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY, "col99"));
-                assertColumns(cf);
-            }
-        };
-        reTest(keyspace.getColumnFamilyStore("Standard3"), verify);
+            if (round == 0)
+                cfs.forceBlockingFlush();
+        }
     }
 
     @Test
     public void testGetRowSingleColumn() throws Throwable
     {
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
 
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1","val1", 1L));
-        cf.addColumn(column("col2","val2", 1L));
-        cf.addColumn(column("col3","val3", 1L));
-        Mutation rm = new Mutation(KEYSPACE1, TEST_KEY.getKey(), cf);
-        rm.applyUnsafe();
+        for (int i = 0; i < 2; i++)
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
-        Runnable verify = new WrappedRunnable()
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+
+        for (int round = 0; round < 2; round++)
         {
-            public void runMayThrow() throws Exception
+            // slice with limit 1
+            Row row = Util.getOnlyRow(Util.cmd(cfs, "0").columns("c").withLimit(1).build());
+            assertEquals(ByteBufferUtil.bytes(0), row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false))).value());
+
+            // fetch each row by name
+            for (int i = 0; i < 2; i++)
             {
-                ColumnFamily cf;
-
-                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY, "col1"));
-                assertColumns(cf, "col1");
-
-                cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, TEST_KEY, "col3"));
-                assertColumns(cf, "col3");
+                row = Util.getOnlyRow(Util.cmd(cfs, "0").columns("c").includeRow(i).build());
+                assertEquals(ByteBufferUtil.bytes(i), row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false))).value());
             }
-        };
-        reTest(keyspace.getColumnFamilyStore("Standard1"), verify);
+
+            // fetch each row by slice
+            for (int i = 0; i < 2; i++)
+            {
+                row = Util.getOnlyRow(Util.cmd(cfs, "0").columns("c").fromIncl(i).toIncl(i).build());
+                assertEquals(ByteBufferUtil.bytes(i), row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false))).value());
+            }
+
+            if (round == 0)
+                cfs.forceBlockingFlush();
+        }
     }
 
     @Test
-    public void testGetRowSliceByRange() throws Throwable
+    public void testGetSliceBloomFilterFalsePositive() throws Throwable
     {
-        DecoratedKey key = TEST_SLICE_KEY;
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        // First write "a", "b", "c"
-        cf.addColumn(column("a", "val1", 1L));
-        cf.addColumn(column("b", "val2", 1L));
-        cf.addColumn(column("c", "val3", 1L));
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey(), cf);
-        rm.applyUnsafe();
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
 
-        cf = cfStore.getColumnFamily(key, cellname("b"), cellname("c"), false, 100, System.currentTimeMillis());
-        assertEquals(2, cf.getColumnCount());
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "1", 1, 1);
 
-        cf = cfStore.getColumnFamily(key, cellname("b"), cellname("b"), false, 100, System.currentTimeMillis());
-        assertEquals(1, cf.getColumnCount());
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
-        cf = cfStore.getColumnFamily(key, cellname("b"), cellname("c"), false, 1, System.currentTimeMillis());
-        assertEquals(1, cf.getColumnCount());
+        // check empty reads on the partitions before and after the existing one
+        for (String key : new String[]{"0", "2"})
+            Util.assertEmpty(Util.cmd(cfs, key).build());
+
+        cfs.forceBlockingFlush();
+
+        for (String key : new String[]{"0", "2"})
+            Util.assertEmpty(Util.cmd(cfs, key).build());
+
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
+        assertEquals(1, sstables.size());
+        sstables.iterator().next().forceFilterFailures();
+
+        for (String key : new String[]{"0", "2"})
+            Util.assertEmpty(Util.cmd(cfs, key).build());
     }
 
-    @Test
-    public void testGetSliceNoMatch() throws Throwable
+    private static void assertRowsInSlice(ColumnFamilyStore cfs, String key, int sliceStart, int sliceEnd, int limit, boolean reversed, String columnValuePrefix)
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard2");
-        cf.addColumn(column("col1", "val1", 1));
-        Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("row1000"), cf);
-        rm.applyUnsafe();
+        Clustering startClustering = new Clustering(ByteBufferUtil.bytes(sliceStart));
+        Clustering endClustering = new Clustering(ByteBufferUtil.bytes(sliceEnd));
+        Slices slices = Slices.with(cfs.getComparator(), Slice.make(startClustering, endClustering));
+        ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(slices, reversed);
+        SinglePartitionReadCommand command = singlePartitionSlice(cfs, key, filter, limit);
 
-        validateGetSliceNoMatch(keyspace);
-        keyspace.getColumnFamilyStore("Standard2").forceBlockingFlush();
-        validateGetSliceNoMatch(keyspace);
-
-        Collection<SSTableReader> ssTables = keyspace.getColumnFamilyStore("Standard2").getSSTables();
-        assertEquals(1, ssTables.size());
-        ssTables.iterator().next().forceFilterFailures();
-        validateGetSliceNoMatch(keyspace);
+        try (ReadOrderGroup orderGroup = command.startOrderGroup(); PartitionIterator iterator = command.executeInternal(orderGroup))
+        {
+            try (RowIterator rowIterator = iterator.next())
+            {
+                if (reversed)
+                {
+                    for (int i = sliceEnd; i >= sliceStart; i--)
+                    {
+                        Row row = rowIterator.next();
+                        Cell cell = row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false)));
+                        assertEquals(ByteBufferUtil.bytes(columnValuePrefix + i), cell.value());
+                    }
+                }
+                else
+                {
+                    for (int i = sliceStart; i <= sliceEnd; i++)
+                    {
+                        Row row = rowIterator.next();
+                        Cell cell = row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false)));
+                        assertEquals(ByteBufferUtil.bytes(columnValuePrefix + i), cell.value());
+                    }
+                }
+                assertFalse(rowIterator.hasNext());
+            }
+        }
     }
 
     @Test
     public void testGetSliceWithCutoff() throws Throwable
     {
         // tests slicing against data from one row in a memtable and then flushed to an sstable
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        final DecoratedKey ROW = Util.dk("row4");
-        final NumberFormat fmt = new DecimalFormat("000");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c text, PRIMARY KEY (a, b))");
+        String prefix = "omg!thisisthevalue!";
 
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        // at this rate, we're getting 78-79 cos/block, assuming the blocks are set to be about 4k.
-        // so if we go to 300, we'll get at least 4 blocks, which is plenty for testing.
         for (int i = 0; i < 300; i++)
-            cf.addColumn(column("col" + fmt.format(i), "omg!thisisthevalue!"+i, 1L));
-        Mutation rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-        rm.applyUnsafe();
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, prefix + i);
 
-        Runnable verify = new WrappedRunnable()
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+
+        for (int round = 0; round < 2; round++)
         {
-            public void runMayThrow() throws Exception
-            {
-                ColumnFamily cf;
+            assertRowsInSlice(cfs, "0", 96, 99, 4, false, prefix);
+            assertRowsInSlice(cfs, "0", 96, 99, 4, true, prefix);
 
-                // blocks are partitioned like this: 000-097, 098-193, 194-289, 290-299, assuming a 4k column index size.
-                assert DatabaseDescriptor.getColumnIndexSize() == 4096 : "Unexpected column index size, block boundaries won't be where tests expect them.";
+            assertRowsInSlice(cfs, "0", 100, 103, 4, false, prefix);
+            assertRowsInSlice(cfs, "0", 100, 103, 4, true, prefix);
 
-                // test forward, spanning a segment.
-                cf = cfStore.getColumnFamily(ROW, cellname("col096"), cellname("col099"), false, 4, System.currentTimeMillis());
-                assertColumns(cf, "col096", "col097", "col098", "col099");
+            assertRowsInSlice(cfs, "0", 0, 99, 100, false, prefix);
+            assertRowsInSlice(cfs, "0", 288, 299, 12, true, prefix);
 
-                // test reversed, spanning a segment.
-                cf = cfStore.getColumnFamily(ROW, cellname("col099"), cellname("col096"), true, 4, System.currentTimeMillis());
-                assertColumns(cf, "col096", "col097", "col098", "col099");
-
-                // test forward, within a segment.
-                cf = cfStore.getColumnFamily(ROW, cellname("col100"), cellname("col103"), false, 4, System.currentTimeMillis());
-                assertColumns(cf, "col100", "col101", "col102", "col103");
-
-                // test reversed, within a segment.
-                cf = cfStore.getColumnFamily(ROW, cellname("col103"), cellname("col100"), true, 4, System.currentTimeMillis());
-                assertColumns(cf, "col100", "col101", "col102", "col103");
-
-                // test forward from beginning, spanning a segment.
-                String[] strCols = new String[100]; // col000-col099
-                for (int i = 0; i < 100; i++)
-                    strCols[i] = "col" + fmt.format(i);
-                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, cellname("col099"), false, 100, System.currentTimeMillis());
-                assertColumns(cf, strCols);
-
-                // test reversed, from end, spanning a segment.
-                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, cellname("col288"), true, 12, System.currentTimeMillis());
-                assertColumns(cf, "col288", "col289", "col290", "col291", "col292", "col293", "col294", "col295", "col296", "col297", "col298", "col299");
-            }
-        };
-
-        reTest(keyspace.getColumnFamilyStore("Standard1"), verify);
+            if (round == 0)
+                cfs.forceBlockingFlush();
+        }
     }
 
     @Test
-    public void testReversedWithFlushing()
+    public void testReversedWithFlushing() throws Throwable
     {
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardLong1");
-        final DecoratedKey ROW = Util.dk("row4");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b)) WITH CLUSTERING ORDER BY (b DESC)");
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
         for (int i = 0; i < 10; i++)
-        {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "StandardLong1");
-            cf.addColumn(new BufferCell(cellname((long)i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0));
-            Mutation rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-            rm.applyUnsafe();
-        }
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
         cfs.forceBlockingFlush();
 
         for (int i = 10; i < 20; i++)
         {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "StandardLong1");
-            cf.addColumn(new BufferCell(cellname((long)i), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0));
-            Mutation rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-            rm.applyUnsafe();
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
-            cf = cfs.getColumnFamily(ROW, Composites.EMPTY, Composites.EMPTY, true, 1, System.currentTimeMillis());
-            assertEquals(1, Iterables.size(cf.getColumnNames()));
-            assertEquals(i, cf.getColumnNames().iterator().next().toByteBuffer().getLong());
+            PartitionColumns columns = PartitionColumns.of(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false)));
+            ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(Slices.ALL, false);
+            SinglePartitionReadCommand command = singlePartitionSlice(cfs, "0", filter, null);
+            try (ReadOrderGroup orderGroup = command.startOrderGroup(); PartitionIterator iterator = command.executeInternal(orderGroup))
+            {
+                try (RowIterator rowIterator = iterator.next())
+                {
+                    Row row = rowIterator.next();
+                    Cell cell = row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false)));
+                    assertEquals(ByteBufferUtil.bytes(i), cell.value());
+                }
+            }
         }
     }
 
-    private void validateGetSliceNoMatch(Keyspace keyspace)
+    private static void assertRowsInResult(ColumnFamilyStore cfs, SinglePartitionReadCommand command, int ... columnValues)
     {
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard2");
-        ColumnFamily cf;
+        try (ReadOrderGroup orderGroup = command.startOrderGroup(); PartitionIterator iterator = command.executeInternal(orderGroup))
+        {
+            if (columnValues.length == 0)
+            {
+                if (iterator.hasNext())
+                    fail("Didn't expect any results, but got rows starting with: " + iterator.next().next().toString(cfs.metadata));
+                return;
+            }
 
-        // key before the rows that exists
-        cf = cfStore.getColumnFamily(Util.dk("a"), Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
-        assertColumns(cf);
+            try (RowIterator rowIterator = iterator.next())
+            {
+                for (int expected : columnValues)
+                {
+                    Row row = rowIterator.next();
+                    Cell cell = row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("c", false)));
+                    assertEquals(
+                            String.format("Expected %s, but got %s", ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(expected)), ByteBufferUtil.bytesToHex(cell.value())),
+                            ByteBufferUtil.bytes(expected), cell.value());
+                }
+                assertFalse(rowIterator.hasNext());
+            }
+        }
+    }
 
-        // key after the rows that exist
-        cf = cfStore.getColumnFamily(Util.dk("z"), Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
-        assertColumns(cf);
+    private static ClusteringIndexSliceFilter slices(ColumnFamilyStore cfs, Integer sliceStart, Integer sliceEnd, boolean reversed)
+    {
+        Slice.Bound startBound = sliceStart == null
+                               ? Slice.Bound.BOTTOM
+                               : Slice.Bound.create(ClusteringPrefix.Kind.INCL_START_BOUND, new ByteBuffer[]{ByteBufferUtil.bytes(sliceStart)});
+        Slice.Bound endBound = sliceEnd == null
+                             ? Slice.Bound.TOP
+                             : Slice.Bound.create(ClusteringPrefix.Kind.INCL_END_BOUND, new ByteBuffer[]{ByteBufferUtil.bytes(sliceEnd)});
+        Slices slices = Slices.with(cfs.getComparator(), Slice.make(startBound, endBound));
+        return new ClusteringIndexSliceFilter(slices, reversed);
+    }
+
+    private static SinglePartitionReadCommand singlePartitionSlice(ColumnFamilyStore cfs, String key, ClusteringIndexSliceFilter filter, Integer rowLimit)
+    {
+        DataLimits limit = rowLimit == null
+                         ? DataLimits.NONE
+                         : DataLimits.cqlLimits(rowLimit);
+        return SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, limit, Util.dk(key), filter);
     }
 
     @Test
     public void testGetSliceFromBasic() throws Throwable
     {
         // tests slicing against data from one row in a memtable and then flushed to an sstable
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        final DecoratedKey ROW = Util.dk("row1");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "val1", 1L));
-        cf.addColumn(column("col3", "val3", 1L));
-        cf.addColumn(column("col4", "val4", 1L));
-        cf.addColumn(column("col5", "val5", 1L));
-        cf.addColumn(column("col7", "val7", 1L));
-        cf.addColumn(column("col9", "val9", 1L));
-        Mutation rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-        rm.applyUnsafe();
-
-        rm = new Mutation(KEYSPACE1, ROW.getKey());
-        rm.delete("Standard1", cellname("col4"), 2L);
-        rm.applyUnsafe();
-
-        Runnable verify = new WrappedRunnable()
+        for (int i = 1; i < 10; i++)
         {
-            public void runMayThrow() throws Exception
-            {
-                ColumnFamily cf;
+            if (i == 6 || i == 8)
+                continue;
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
+        }
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col5"), Composites.EMPTY, false, 2, System.currentTimeMillis());
-                assertColumns(cf, "col5", "col7");
+        execute("DELETE FROM %s WHERE a = ? AND b = ?", "0", 4);
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col4"), Composites.EMPTY, false, 2, System.currentTimeMillis());
-                assertColumns(cf, "col4", "col5", "col7");
-                assertColumns(ColumnFamilyStore.removeDeleted(cf, Integer.MAX_VALUE), "col5", "col7");
+        for (int round = 0; round < 2; round++)
+        {
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col5"), Composites.EMPTY, true, 2, System.currentTimeMillis());
-                assertColumns(cf, "col3", "col4", "col5");
+            ClusteringIndexSliceFilter filter = slices(cfs, 5, null, false);
+            SinglePartitionReadCommand command = singlePartitionSlice(cfs, "0", filter, 2);
+            assertRowsInResult(cfs, command, 5, 7);
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col6"), Composites.EMPTY, true, 2, System.currentTimeMillis());
-                assertColumns(cf, "col3", "col4", "col5");
+            command = singlePartitionSlice(cfs, "0", slices(cfs, 4, null, false), 2);
+            assertRowsInResult(cfs, command, 5, 7);
 
-                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, Composites.EMPTY, true, 2, System.currentTimeMillis());
-                assertColumns(cf, "col7", "col9");
+            command = singlePartitionSlice(cfs, "0", slices(cfs, null, 5, true), 2);
+            assertRowsInResult(cfs, command, 5, 3);
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col95"), Composites.EMPTY, false, 2, System.currentTimeMillis());
-                assertColumns(cf);
+            command = singlePartitionSlice(cfs, "0", slices(cfs, null, 6, true), 2);
+            assertRowsInResult(cfs, command, 5, 3);
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col0"), Composites.EMPTY, true, 2, System.currentTimeMillis());
-                assertColumns(cf);
-            }
-        };
+            command = singlePartitionSlice(cfs, "0", slices(cfs, null, 6, true), 2);
+            assertRowsInResult(cfs, command, 5, 3);
 
-        reTest(keyspace.getColumnFamilyStore("Standard1"), verify);
+            command = singlePartitionSlice(cfs, "0", slices(cfs, null, null, true), 2);
+            assertRowsInResult(cfs, command, 9, 7);
+
+            command = singlePartitionSlice(cfs, "0", slices(cfs, 95, null, false), 2);
+            assertRowsInResult(cfs, command);
+
+            command = singlePartitionSlice(cfs, "0", slices(cfs, null, 0, true), 2);
+            assertRowsInResult(cfs, command);
+
+            if (round == 0)
+                cfs.forceBlockingFlush();
+        }
     }
 
     @Test
     public void testGetSliceWithExpiration() throws Throwable
     {
         // tests slicing against data from one row with expiring column in a memtable and then flushed to an sstable
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        final DecoratedKey ROW = Util.dk("row5");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "val1", 1L));
-        cf.addColumn(expiringColumn("col2", "val2", 1L, 60)); // long enough not to be tombstoned
-        cf.addColumn(column("col3", "val3", 1L));
-        Mutation rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-        rm.applyUnsafe();
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", 0, 0);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TTL 60", "0", 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", 2, 2);
 
-        Runnable verify = new WrappedRunnable()
+        for (int round = 0; round < 2; round++)
         {
-            public void runMayThrow() throws Exception
-            {
-                ColumnFamily cf;
+            SinglePartitionReadCommand command = singlePartitionSlice(cfs, "0", slices(cfs, null, null, false), 2);
+            assertRowsInResult(cfs, command, 0, 1);
 
-                cf = cfStore.getColumnFamily(ROW, Composites.EMPTY, Composites.EMPTY, false, 2, System.currentTimeMillis());
-                assertColumns(cf, "col1", "col2");
-                assertColumns(ColumnFamilyStore.removeDeleted(cf, Integer.MAX_VALUE), "col1");
+            command = singlePartitionSlice(cfs, "0", slices(cfs, 1, null, false), 1);
+            assertRowsInResult(cfs, command, 1);
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col2"), Composites.EMPTY, false, 1, System.currentTimeMillis());
-                assertColumns(cf, "col2");
-                assertColumns(ColumnFamilyStore.removeDeleted(cf, Integer.MAX_VALUE));
-            }
-        };
-
-        reTest(keyspace.getColumnFamilyStore("Standard1"), verify);
+            if (round == 0)
+                cfs.forceBlockingFlush();
+        }
     }
 
     @Test
     public void testGetSliceFromAdvanced() throws Throwable
     {
         // tests slicing against data from one row spread across two sstables
-        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        final DecoratedKey ROW = Util.dk("row2");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "val1", 1L));
-        cf.addColumn(column("col2", "val2", 1L));
-        cf.addColumn(column("col3", "val3", 1L));
-        cf.addColumn(column("col4", "val4", 1L));
-        cf.addColumn(column("col5", "val5", 1L));
-        cf.addColumn(column("col6", "val6", 1L));
-        Mutation rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-        rm.applyUnsafe();
-        cfStore.forceBlockingFlush();
+        for (int i = 1; i < 7; i++)
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "valx", 2L));
-        cf.addColumn(column("col2", "valx", 2L));
-        cf.addColumn(column("col3", "valx", 2L));
-        rm = new Mutation(KEYSPACE1, ROW.getKey(), cf);
-        rm.applyUnsafe();
+        cfs.forceBlockingFlush();
 
-        Runnable verify = new WrappedRunnable()
+        // overwrite three rows with -1
+        for (int i = 1; i < 4; i++)
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, -1);
+
+        for (int round = 0; round < 2; round++)
         {
-            public void runMayThrow() throws Exception
-            {
-                ColumnFamily cf;
+            SinglePartitionReadCommand command = singlePartitionSlice(cfs, "0", slices(cfs, 2, null, false), 3);
+            assertRowsInResult(cfs, command, -1, -1, 4);
 
-                cf = cfStore.getColumnFamily(ROW, cellname("col2"), Composites.EMPTY, false, 3, System.currentTimeMillis());
-                assertColumns(cf, "col2", "col3", "col4");
-
-                ByteBuffer col = cf.getColumn(cellname("col2")).value();
-                assertEquals(ByteBufferUtil.string(col), "valx");
-
-                col = cf.getColumn(cellname("col3")).value();
-                assertEquals(ByteBufferUtil.string(col), "valx");
-
-                col = cf.getColumn(cellname("col4")).value();
-                assertEquals(ByteBufferUtil.string(col), "val4");
-            }
-        };
-
-        reTest(keyspace.getColumnFamilyStore("Standard1"), verify);
+            if (round == 0)
+                cfs.forceBlockingFlush();
+        }
     }
 
     @Test
     public void testGetSliceFromLarge() throws Throwable
     {
-        // tests slicing against 1000 columns in an sstable
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        DecoratedKey key = Util.dk("row3");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        for (int i = 1000; i < 2000; i++)
-            cf.addColumn(column("col" + i, ("v" + i), 1L));
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey(), cf);
-        rm.applyUnsafe();
-        cfStore.forceBlockingFlush();
+        // tests slicing against 1000 rows in an sstable
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
-        validateSliceLarge(cfStore);
+        for (int i = 1000; i < 2000; i++)
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
+
+        cfs.forceBlockingFlush();
+
+        validateSliceLarge(cfs);
 
         // compact so we have a big row with more than the minimum index count
-        if (cfStore.getSSTables().size() > 1)
-        {
-            CompactionManager.instance.performMaximal(cfStore, false);
-        }
+        if (cfs.getLiveSSTables().size() > 1)
+            CompactionManager.instance.performMaximal(cfs, false);
+
         // verify that we do indeed have multiple index entries
-        SSTableReader sstable = cfStore.getSSTables().iterator().next();
-        RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        RowIndexEntry indexEntry = sstable.getPosition(Util.dk("0"), SSTableReader.Operator.EQ);
         assert indexEntry.columnsIndex().size() > 2;
 
-        validateSliceLarge(cfStore);
+        validateSliceLarge(cfs);
     }
 
     @Test
-    public void testLimitSSTables() throws CharacterCodingException
+    public void testLimitSSTables() throws Throwable
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore("Standard1");
-        cfStore.disableAutoCompaction();
-        DecoratedKey key = Util.dk("row_maxmin");
-        for (int j = 0; j < 10; j++)
-        {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-            for (int i = 1000 + (j*100); i < 1000 + ((j+1)*100); i++)
-            {
-                cf.addColumn(column("col" + i, ("v" + i), i));
-            }
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey(), cf);
-            rm.applyUnsafe();
-            cfStore.forceBlockingFlush();
-        }
-        ((ClearableHistogram)cfStore.metric.sstablesPerReadHistogram.cf).clear();
-        ColumnFamily cf = cfStore.getColumnFamily(key, Composites.EMPTY, cellname("col1499"), false, 1000, System.currentTimeMillis());
-        assertEquals(cfStore.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(), 5, 0.1);
-        int i = 0;
-        for (Cell c : cf.getSortedColumns())
-        {
-            assertEquals(ByteBufferUtil.string(c.name().toByteBuffer()), "col" + (1000 + i++));
-        }
-        assertEquals(i, 500);
-        ((ClearableHistogram)cfStore.metric.sstablesPerReadHistogram.cf).clear();
-        cf = cfStore.getColumnFamily(key, cellname("col1500"), cellname("col2000"), false, 1000, System.currentTimeMillis());
-        assertEquals(cfStore.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(), 5, 0.1);
-
-        for (Cell c : cf.getSortedColumns())
-        {
-            assertEquals(ByteBufferUtil.string(c.name().toByteBuffer()), "col"+(1000 + i++));
-        }
-        assertEquals(i, 1000);
-
-        // reverse
-        ((ClearableHistogram)cfStore.metric.sstablesPerReadHistogram.cf).clear();
-        cf = cfStore.getColumnFamily(key, cellname("col2000"), cellname("col1500"), true, 1000, System.currentTimeMillis());
-        assertEquals(cfStore.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(), 5, 0.1);
-        i = 500;
-        for (Cell c : cf.getSortedColumns())
-        {
-            assertEquals(ByteBufferUtil.string(c.name().toByteBuffer()), "col"+(1000 + i++));
-        }
-        assertEquals(i, 1000);
-
-    }
-
-    @Test
-    public void testLimitSSTablesComposites()
-    {
-        /*
-        creates 10 sstables, composite columns like this:
-        ---------------------
-        k   |a0:0|a1:1|..|a9:9
-        ---------------------
-        ---------------------
-        k   |a0:10|a1:11|..|a9:19
-        ---------------------
-        ...
-        ---------------------
-        k   |a0:90|a1:91|..|a9:99
-        ---------------------
-        then we slice out col1 = a5 and col2 > 85 -> which should let us just check 2 sstables and get 2 columns
-         */
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardComposite2");
+        String tableName = createTable("CREATE TABLE %s (a text, b int, c int, PRIMARY KEY (a, b))");
+        final ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
         cfs.disableAutoCompaction();
 
-        CellNameType type = cfs.getComparator();
-        DecoratedKey key = Util.dk("k");
         for (int j = 0; j < 10; j++)
         {
-            for (int i = 0; i < 10; i++)
-            {
-                Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-                CellName colName = type.makeCellName(ByteBufferUtil.bytes("a" + i), ByteBufferUtil.bytes(j*10 + i));
-                rm.add("StandardComposite2", colName, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-                rm.applyUnsafe();
-            }
+            for (int i = 1000 + (j*100); i < 1000 + ((j+1)*100); i++)
+                execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", "0", i, i, (long)i);
+
             cfs.forceBlockingFlush();
         }
-        Composite start = type.builder().add(ByteBufferUtil.bytes("a5")).add(ByteBufferUtil.bytes(85)).build();
-        Composite finish = type.builder().add(ByteBufferUtil.bytes("a5")).build().end();
+
         ((ClearableHistogram)cfs.metric.sstablesPerReadHistogram.cf).clear();
-        ColumnFamily cf = cfs.getColumnFamily(key, start, finish, false, 1000, System.currentTimeMillis());
-        int colCount = 0;
-        for (Cell c : cf)
-            colCount++;
-        assertEquals(2, colCount);
-        assertEquals(2, cfs.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(), 0.1);
+
+        SinglePartitionReadCommand command = singlePartitionSlice(cfs, "0", slices(cfs, null, 1499, false), 1000);
+        int[] expectedValues = new int[500];
+        for (int i = 0; i < 500; i++)
+            expectedValues[i] = i + 1000;
+        assertRowsInResult(cfs, command, expectedValues);
+
+        assertEquals(5, cfs.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(), 0.1);
+        ((ClearableHistogram)cfs.metric.sstablesPerReadHistogram.cf).clear();
+
+        command = singlePartitionSlice(cfs, "0", slices(cfs, 1500, 2000, false), 1000);
+        for (int i = 0; i < 500; i++)
+            expectedValues[i] = i + 1500;
+        assertRowsInResult(cfs, command, expectedValues);
+
+        assertEquals(5, cfs.metric.sstablesPerReadHistogram.cf.getSnapshot().getMax(), 0.1);
+        ((ClearableHistogram)cfs.metric.sstablesPerReadHistogram.cf).clear();
+
+        // reverse
+        command = singlePartitionSlice(cfs, "0", slices(cfs, 1500, 2000, true), 1000);
+        for (int i = 0; i < 500; i++)
+            expectedValues[i] = 1999 - i;
+        assertRowsInResult(cfs, command, expectedValues);
     }
 
-    private void validateSliceLarge(ColumnFamilyStore cfStore) throws IOException
+    private void validateSliceLarge(ColumnFamilyStore cfs)
     {
-        DecoratedKey key = Util.dk("row3");
-        ColumnFamily cf;
-        cf = cfStore.getColumnFamily(key, cellname("col1000"), Composites.EMPTY, false, 3, System.currentTimeMillis());
-        assertColumns(cf, "col1000", "col1001", "col1002");
+        ClusteringIndexSliceFilter filter = slices(cfs, 1000, null, false);
+        SinglePartitionReadCommand command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(3), Util.dk("0"), filter);
+        assertRowsInResult(cfs, command, 1000, 1001, 1002);
 
-        ByteBuffer col;
-        col = cf.getColumn(cellname("col1000")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1000");
-        col = cf.getColumn(cellname("col1001")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1001");
-        col = cf.getColumn(cellname("col1002")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1002");
+        filter = slices(cfs, 1195, null, false);
+        command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(3), Util.dk("0"), filter);
+        assertRowsInResult(cfs, command, 1195, 1196, 1197);
 
-        cf = cfStore.getColumnFamily(key, cellname("col1195"), Composites.EMPTY, false, 3, System.currentTimeMillis());
-        assertColumns(cf, "col1195", "col1196", "col1197");
+        filter = slices(cfs, null, 1996, true);
+        command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(1000), Util.dk("0"), filter);
+        int[] expectedValues = new int[997];
+        for (int i = 0, v = 1996; v >= 1000; i++, v--)
+            expectedValues[i] = v;
+        assertRowsInResult(cfs, command, expectedValues);
 
-        col = cf.getColumn(cellname("col1195")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1195");
-        col = cf.getColumn(cellname("col1196")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1196");
-        col = cf.getColumn(cellname("col1197")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1197");
+        filter = slices(cfs, 1990, null, false);
+        command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(3), Util.dk("0"), filter);
+        assertRowsInResult(cfs, command, 1990, 1991, 1992);
 
+        filter = slices(cfs, null, null, true);
+        command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(3), Util.dk("0"), filter);
+        assertRowsInResult(cfs, command, 1999, 1998, 1997);
 
-        cf = cfStore.getColumnFamily(key, cellname("col1996"), Composites.EMPTY, true, 1000, System.currentTimeMillis());
-        Cell[] cells = cf.getSortedColumns().toArray(new Cell[0]);
-        for (int i = 1000; i < 1996; i++)
-        {
-            String expectedName = "col" + i;
-            Cell cell = cells[i - 1000];
-            assertEquals(ByteBufferUtil.string(cell.name().toByteBuffer()), expectedName);
-            assertEquals(ByteBufferUtil.string(cell.value()), ("v" + i));
-        }
+        filter = slices(cfs, null, 9000, true);
+        command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(3), Util.dk("0"), filter);
+        assertRowsInResult(cfs, command, 1999, 1998, 1997);
 
-        cf = cfStore.getColumnFamily(key, cellname("col1990"), Composites.EMPTY, false, 3, System.currentTimeMillis());
-        assertColumns(cf, "col1990", "col1991", "col1992");
-        col = cf.getColumn(cellname("col1990")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1990");
-        col = cf.getColumn(cellname("col1991")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1991");
-        col = cf.getColumn(cellname("col1992")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1992");
-
-        cf = cfStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, true, 3, System.currentTimeMillis());
-        assertColumns(cf, "col1997", "col1998", "col1999");
-        col = cf.getColumn(cellname("col1997")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1997");
-        col = cf.getColumn(cellname("col1998")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1998");
-        col = cf.getColumn(cellname("col1999")).value();
-        assertEquals(ByteBufferUtil.string(col), "v1999");
-
-        cf = cfStore.getColumnFamily(key, cellname("col9000"), Composites.EMPTY, true, 3, System.currentTimeMillis());
-        assertColumns(cf, "col1997", "col1998", "col1999");
-
-        cf = cfStore.getColumnFamily(key, cellname("col9000"), Composites.EMPTY, false, 3, System.currentTimeMillis());
-        assertColumns(cf);
-    }
-
-    public static void assertColumns(ColumnFamily container, String... columnNames)
-    {
-        Collection<Cell> cells = container == null ? new TreeSet<Cell>() : container.getSortedColumns();
-        List<String> L = new ArrayList<String>();
-        for (Cell cell : cells)
-        {
-            L.add(Util.string(cell.name().toByteBuffer()));
-        }
-
-        List<String> names = new ArrayList<String>(columnNames.length);
-
-        names.addAll(Arrays.asList(columnNames));
-
-        String[] columnNames1 = names.toArray(new String[0]);
-        String[] la = L.toArray(new String[cells.size()]);
-
-        assert Arrays.equals(la, columnNames1)
-                : String.format("Columns [%s])] is not expected [%s]",
-                                ((container == null) ? "" : CellNames.getColumnsString(container.getComparator(), cells)),
-                                StringUtils.join(columnNames1, ","));
-    }
-
-    public static void assertColumn(ColumnFamily cf, String name, String value, long timestamp)
-    {
-        assertColumn(cf.getColumn(cellname(name)), value, timestamp);
-    }
-
-    public static void assertColumn(Cell cell, String value, long timestamp)
-    {
-        assertNotNull(cell);
-        assertEquals(0, ByteBufferUtil.compareUnsigned(cell.value(), ByteBufferUtil.bytes(value)));
-        assertEquals(timestamp, cell.timestamp());
+        filter = slices(cfs, 9000, null, false);
+        command = SinglePartitionReadCommand.create(
+                cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.cqlLimits(3), Util.dk("0"), filter);
+        assertRowsInResult(cfs, command);
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/LegacyCellNameTest.java b/test/unit/org/apache/cassandra/db/LegacyCellNameTest.java
new file mode 100644
index 0000000..455fa9f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/LegacyCellNameTest.java

@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+
+import static junit.framework.Assert.assertTrue;
+
+public class LegacyCellNameTest
+{
+    @Test
+    public void testColumnSameNameAsPartitionKeyCompactStorage() throws Exception
+    {
+        CFMetaData cfm = CFMetaData.compile("CREATE TABLE cs (" +
+                                            "k int PRIMARY KEY, v int)" +
+                                            " WITH COMPACT STORAGE", "ks");
+
+        LegacyLayout.LegacyCellName cellName 
+            = LegacyLayout.decodeCellName(cfm, 
+                                          LegacyLayout.makeLegacyComparator(cfm)
+                                                      .fromString("k"));
+
+        assertTrue(cellName.column.isRegular());
+    }
+
+    @Test
+    public void testColumnSameNameAsClusteringKeyCompactStorage() throws Exception
+    {
+        CFMetaData cfm = CFMetaData.compile("CREATE TABLE cs (" +
+                                            "k int PRIMARY KEY, v int)" +
+                                            " WITH COMPACT STORAGE", "ks");
+
+        LegacyLayout.LegacyCellName cellName 
+            = LegacyLayout.decodeCellName(cfm, 
+                                          LegacyLayout.makeLegacyComparator(cfm)
+                                                      .fromString("column1"));
+
+        assertTrue(cellName.column.isRegular());
+    }
+
+    @Test
+    public void testColumnSameNameAsPartitionKeyCql3() throws Exception
+    {
+        CFMetaData cfm = CFMetaData.compile("CREATE TABLE cs (" +
+                                            "k int PRIMARY KEY, v int)", "ks");
+
+        LegacyLayout.LegacyCellName cellName 
+            = LegacyLayout.decodeCellName(cfm, 
+                                          LegacyLayout.makeLegacyComparator(cfm)
+                                                      .fromString("k"));
+
+        // When being grouped into Rows by LegacyLayout.CellGrouper,
+        // primary key columns are filtered out
+        assertTrue(cellName.column.isPrimaryKeyColumn());
+    }
+
+    @Test
+    public void testCompositeWithColumnNameSameAsClusteringKeyCql3() throws Exception
+    {
+        CFMetaData cfm = CFMetaData.compile("CREATE TABLE cs (" +
+                                            "k int, c text, v int, PRIMARY KEY(k, c))", "ks");
+
+        LegacyLayout.LegacyCellName cellName
+            = LegacyLayout.decodeCellName(cfm,
+                                          LegacyLayout.makeLegacyComparator(cfm)
+                                                      .fromString("c_value:c"));
+
+        // When being grouped into Rows by LegacyLayout.CellGrouper,
+        // primary key columns are filtered out
+        assertTrue(cellName.column.isPrimaryKeyColumn());
+    }
+
+    // This throws IllegalArgumentException not because the cellname's value matches
+    // the clustering key name, but because when converted to a Composite, the buffer
+    // contains only a single component and so has no column name component
+    @Test(expected=IllegalArgumentException.class)
+    public void testColumnSameNameAsClusteringKeyCql3() throws Exception
+    {
+        CFMetaData cfm = CFMetaData.compile("CREATE TABLE cs (" +
+                                            "k int, c text, v int, PRIMARY KEY(k, c))", "ks");
+
+        LegacyLayout.LegacyCellName cellName 
+            = LegacyLayout.decodeCellName(cfm, 
+                                          LegacyLayout.makeLegacyComparator(cfm)
+                                                      .fromString("c"));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/LegacyLayoutTest.java b/test/unit/org/apache/cassandra/db/LegacyLayoutTest.java
new file mode 100644
index 0000000..f0d2a02
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/LegacyLayoutTest.java

@@ -0,0 +1,474 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.cassandra.db.LegacyLayout.CellGrouper;
+import org.apache.cassandra.db.LegacyLayout.LegacyBound;
+import org.apache.cassandra.db.LegacyLayout.LegacyCell;
+import org.apache.cassandra.db.LegacyLayout.LegacyRangeTombstone;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIteratorSerializer;
+import org.apache.cassandra.db.transform.FilteredRows;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.serializers.Int32Serializer;
+import org.apache.cassandra.serializers.UTF8Serializer;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.utils.FBUtilities;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.Hex;
+
+import static org.apache.cassandra.net.MessagingService.VERSION_21;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.*;
+
+public class LegacyLayoutTest
+{
+    static final String KEYSPACE = "Keyspace1";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        SchemaLoader.loadSchema();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1));
+    }
+
+    @Test
+    public void testFromUnfilteredRowIterator() throws Throwable
+    {
+        CFMetaData table = CFMetaData.Builder.create("ks", "table")
+                                             .addPartitionKey("k", Int32Type.instance)
+                                             .addRegularColumn("a", SetType.getInstance(Int32Type.instance, true))
+                                             .addRegularColumn("b", SetType.getInstance(Int32Type.instance, true))
+                                             .build();
+
+        ColumnDefinition a = table.getColumnDefinition(new ColumnIdentifier("a", false));
+        ColumnDefinition b = table.getColumnDefinition(new ColumnIdentifier("b", false));
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(0);
+        builder.newRow(Clustering.EMPTY);
+        builder.addComplexDeletion(a, new DeletionTime(1L, 1));
+        builder.addComplexDeletion(b, new DeletionTime(1L, 1));
+        Row row = builder.build();
+
+        ByteBuffer key = bytes(1);
+        PartitionUpdate upd = PartitionUpdate.singleRowUpdate(table, key, row);
+
+        LegacyLayout.LegacyUnfilteredPartition p = LegacyLayout.fromUnfilteredRowIterator(null, upd.unfilteredIterator());
+        assertEquals(DeletionTime.LIVE, p.partitionDeletion);
+        assertEquals(0, p.cells.size());
+
+        LegacyLayout.LegacyRangeTombstoneList l = p.rangeTombstones;
+        assertEquals("a", l.starts[0].collectionName.name.toString());
+        assertEquals("a", l.ends[0].collectionName.name.toString());
+
+        assertEquals("b", l.starts[1].collectionName.name.toString());
+        assertEquals("b", l.ends[1].collectionName.name.toString());
+    }
+
+    /**
+     * Tests with valid sstables containing duplicate RT entries at index boundaries
+     * in 2.1 format, where DATA below is a > 1000 byte long string of letters,
+     * and the column index is set to 1kb
+
+     [
+     {"key": "1",
+     "cells": [["1:_","1:!",1513015245,"t",1513015263],
+     ["1:1:","",1513015467727335],
+     ["1:1:val1","DATA",1513015467727335],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:1:val2","DATA",1513015467727335],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:1:val3","DATA",1513015467727335],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:2:","",1513015458470156],
+     ["1:2:val1","DATA",1513015458470156],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:2:val2","DATA",1513015458470156],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:2:val3","DATA",1513015458470156],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:3:","",1513015450253602],
+     ["1:3:val1","DATA",1513015450253602],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:3:val2","DATA",1513015450253602],
+     ["1:_","1:!",1513015245,"t",1513015263],
+     ["1:3:val3","DATA",1513015450253602]]}
+     ]
+     *
+     * See CASSANDRA-14008 for details.
+     */
+    @Test
+    public void testRTBetweenColumns() throws Throwable
+    {
+        QueryProcessor.executeInternal(String.format("CREATE TABLE \"%s\".legacy_ka_repeated_rt (k1 int, c1 int, c2 int, val1 text, val2 text, val3 text, primary key (k1, c1, c2))", KEYSPACE));
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("legacy_ka_repeated_rt");
+
+        Path legacySSTableRoot = Paths.get("test/data/legacy-sstables/ka/legacy_tables/legacy_ka_repeated_rt/");
+
+        for (String filename : new String[]{ "Keyspace1-legacy_ka_repeated_rt-ka-1-CompressionInfo.db",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-Data.db",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-Digest.sha1",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-Filter.db",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-Index.db",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-Statistics.db",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-Summary.db",
+                                             "Keyspace1-legacy_ka_repeated_rt-ka-1-TOC.txt" })
+        {
+            Files.copy(Paths.get(legacySSTableRoot.toString(), filename), cfs.getDirectories().getDirectoryForNewSSTables().toPath().resolve(filename));
+        }
+
+        cfs.loadNewSSTables();
+
+        UntypedResultSet rs = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".legacy_ka_repeated_rt WHERE k1=1", KEYSPACE));
+        assertEquals(3, rs.size());
+        UntypedResultSet rs2 = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".legacy_ka_repeated_rt WHERE k1=1 AND c1=1", KEYSPACE));
+        assertEquals(3, rs2.size());
+        for (int i = 1; i <= 3; i++)
+        {
+            UntypedResultSet rs3 = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".legacy_ka_repeated_rt WHERE k1=1 AND c1=1 AND c2=%s", KEYSPACE, i));
+            assertEquals(1, rs3.size());
+
+        }
+
+    }
+
+
+    private static UnfilteredRowIterator roundTripVia21(UnfilteredRowIterator partition) throws IOException
+    {
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            LegacyLayout.serializeAsLegacyPartition(null, partition, out, VERSION_21);
+            try (DataInputBuffer in = new DataInputBuffer(out.buffer(), false))
+            {
+                return LegacyLayout.deserializeLegacyPartition(in, VERSION_21, SerializationHelper.Flag.LOCAL, partition.partitionKey().getKey());
+            }
+        }
+    }
+
+    @Test
+    public void testStaticRangeTombstoneRoundTripUnexpectedDeletion() throws Throwable
+    {
+        // this variant of the bug deletes a row with the same clustering key value as the name of the static collection
+        QueryProcessor.executeInternal(String.format("CREATE TABLE \"%s\".legacy_static_rt_rt_1 (pk int, ck1 text, ck2 text, v int, s set<text> static, primary key (pk, ck1, ck2))", KEYSPACE));
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        CFMetaData table = keyspace.getColumnFamilyStore("legacy_static_rt_rt_1").metadata;
+        ColumnDefinition v = table.getColumnDefinition(new ColumnIdentifier("v", false));
+        ColumnDefinition bug = table.getColumnDefinition(new ColumnIdentifier("s", false));
+
+        Row.Builder builder;
+        builder = BTreeRow.unsortedBuilder(0);
+        builder.newRow(Clustering.STATIC_CLUSTERING);
+        builder.addComplexDeletion(bug, new DeletionTime(1L, 1));
+        Row staticRow = builder.build();
+
+        builder = BTreeRow.unsortedBuilder(0);
+        builder.newRow(new Clustering(UTF8Serializer.instance.serialize("s"), UTF8Serializer.instance.serialize("anything")));
+        builder.addCell(new BufferCell(v, 1L, Cell.NO_TTL, Cell.NO_DELETION_TIME, Int32Serializer.instance.serialize(1), null));
+        Row row = builder.build();
+
+        DecoratedKey pk = table.decorateKey(bytes(1));
+        PartitionUpdate upd = PartitionUpdate.singleRowUpdate(table, pk, row, staticRow);
+
+        try (RowIterator before = FilteredRows.filter(upd.unfilteredIterator(), FBUtilities.nowInSeconds());
+             RowIterator after = FilteredRows.filter(roundTripVia21(upd.unfilteredIterator()), FBUtilities.nowInSeconds()))
+        {
+            while (before.hasNext() || after.hasNext())
+                assertEquals(before.hasNext() ? before.next() : null, after.hasNext() ? after.next() : null);
+        }
+    }
+
+    @Test
+    public void testStaticRangeTombstoneRoundTripCorruptRead() throws Throwable
+    {
+        // this variant of the bug corrupts the byte stream of the partition, so that a sequential read starting before
+        // this partition will fail with a CorruptSSTableException, and possible yield junk results
+        QueryProcessor.executeInternal(String.format("CREATE TABLE \"%s\".legacy_static_rt_rt_2 (pk int, ck int, nameWithLengthGreaterThan4 set<int> static, primary key (pk, ck))", KEYSPACE));
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        CFMetaData table = keyspace.getColumnFamilyStore("legacy_static_rt_rt_2").metadata;
+
+        ColumnDefinition bug = table.getColumnDefinition(new ColumnIdentifier("nameWithLengthGreaterThan4", false));
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(0);
+        builder.newRow(Clustering.STATIC_CLUSTERING);
+        builder.addComplexDeletion(bug, new DeletionTime(1L, 1));
+        Row row = builder.build();
+
+        DecoratedKey pk = table.decorateKey(bytes(1));
+        PartitionUpdate upd = PartitionUpdate.singleRowUpdate(table, pk, row);
+
+        UnfilteredRowIterator afterRoundTripVia32 = roundTripVia21(upd.unfilteredIterator());
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            // we only encounter a corruption/serialization error after writing this to a 3.0 format and reading it back
+            UnfilteredRowIteratorSerializer.serializer.serialize(afterRoundTripVia32, ColumnFilter.all(table), out, MessagingService.current_version);
+            try (DataInputBuffer in = new DataInputBuffer(out.buffer(), false);
+                 UnfilteredRowIterator afterSerialization = UnfilteredRowIteratorSerializer.serializer.deserialize(in, MessagingService.current_version, table, ColumnFilter.all(table), SerializationHelper.Flag.LOCAL))
+            {
+                while (afterSerialization.hasNext())
+                    afterSerialization.next();
+            }
+        }
+    }
+
+    @Test
+    public void testCollectionDeletionRoundTripForDroppedColumn() throws Throwable
+    {
+        // this variant of the bug deletes a row with the same clustering key value as the name of the static collection
+        QueryProcessor.executeInternal(String.format("CREATE TABLE \"%s\".legacy_rt_rt_dc (pk int, ck1 text, v int, s set<text>, primary key (pk, ck1))", KEYSPACE));
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        CFMetaData table = keyspace.getColumnFamilyStore("legacy_rt_rt_dc").metadata;
+        ColumnDefinition v = table.getColumnDefinition(new ColumnIdentifier("v", false));
+        ColumnDefinition bug = table.getColumnDefinition(new ColumnIdentifier("s", false));
+
+        Row.Builder builder;
+        builder = BTreeRow.unsortedBuilder(0);
+        builder.newRow(new Clustering(UTF8Serializer.instance.serialize("a")));
+        builder.addCell(BufferCell.live(table, v, 0L, Int32Serializer.instance.serialize(1), null));
+        builder.addComplexDeletion(bug, new DeletionTime(1L, 1));
+        Row row = builder.build();
+
+        DecoratedKey pk = table.decorateKey(bytes(1));
+        PartitionUpdate upd = PartitionUpdate.singleRowUpdate(table, pk, row);
+
+        // we need to perform the round trip in two parts here, with a column drop inbetween
+        try (RowIterator before = FilteredRows.filter(upd.unfilteredIterator(), FBUtilities.nowInSeconds());
+             DataOutputBuffer serialized21 = new DataOutputBuffer())
+        {
+            LegacyLayout.serializeAsLegacyPartition(null, upd.unfilteredIterator(), serialized21, VERSION_21);
+            QueryProcessor.executeInternal(String.format("ALTER TABLE \"%s\".legacy_rt_rt_dc DROP s", KEYSPACE));
+            try (DataInputBuffer in = new DataInputBuffer(serialized21.buffer(), false))
+            {
+                try (UnfilteredRowIterator deser21 = LegacyLayout.deserializeLegacyPartition(in, VERSION_21, SerializationHelper.Flag.LOCAL, upd.partitionKey().getKey());
+                    RowIterator after = FilteredRows.filter(deser21, FBUtilities.nowInSeconds());)
+                {
+                    while (before.hasNext() || after.hasNext())
+                        assertEquals(before.hasNext() ? before.next() : null, after.hasNext() ? after.next() : null);
+                }
+            }
+
+        }
+    }
+
+    @Test
+    public void testDecodeLegacyPagedRangeCommandSerializer() throws IOException
+    {
+        /*
+         Run on 2.1
+         public static void main(String[] args) throws IOException, ConfigurationException
+         {
+             Gossiper.instance.start((int) (System.currentTimeMillis() / 1000));
+             Keyspace.setInitialized();
+             CFMetaData cfMetaData = CFMetaData.sparseCFMetaData("ks", "cf", UTF8Type.instance)
+             .addColumnDefinition(new ColumnDefinition("ks", "cf", new ColumnIdentifier("v", true), SetType.getInstance(Int32Type.instance, false), null, null, null, null, ColumnDefinition.Kind.REGULAR));
+             KSMetaData ksMetaData = KSMetaData.testMetadata("ks", SimpleStrategy.class, KSMetaData.optsWithRF(3), cfMetaData);
+             MigrationManager.announceNewKeyspace(ksMetaData);
+             RowPosition position = RowPosition.ForKey.get(ByteBufferUtil.EMPTY_BYTE_BUFFER, new Murmur3Partitioner());
+             SliceQueryFilter filter = new IdentityQueryFilter();
+             Composite cellName = CellNames.compositeSparseWithCollection(new ByteBuffer[0], Int32Type.instance.decompose(1), new ColumnIdentifier("v", true), false);
+             try (DataOutputBuffer buffer = new DataOutputBuffer(1024))
+             {
+                 PagedRangeCommand command = new PagedRangeCommand("ks", "cf", 1, AbstractBounds.bounds(position, true, position, true), filter, cellName, filter.finish(), Collections.emptyList(), 1, true);
+                 PagedRangeCommand.serializer.serialize(command, buffer, MessagingService.current_version);
+                 System.out.println(Hex.bytesToHex(buffer.toByteArray()));
+             }
+         }
+         */
+
+        DatabaseDescriptor.setDaemonInitialized();
+        Keyspace.setInitialized();
+        CFMetaData table = CFMetaData.Builder.create("ks", "cf")
+                                             .addPartitionKey("k", Int32Type.instance)
+                                             .addRegularColumn("v", SetType.getInstance(Int32Type.instance, true))
+                                             .build();
+        SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1));
+        MigrationManager.announceNewColumnFamily(table);
+
+        byte[] bytes = Hex.hexToBytes("00026b73000263660000000000000001fffffffe01000000088000000000000000010000000880000000000000000000000100000000007fffffffffffffff000b00017600000400000001000000000000000000000101");
+        ReadCommand.legacyPagedRangeCommandSerializer.deserialize(new DataInputBuffer(bytes), VERSION_21);
+    }
+
+    @Test
+    public void testDecodeCollectionPageBoundary()
+    {
+        CFMetaData table = CFMetaData.Builder.create("ks", "cf")
+                                             .addPartitionKey("k", Int32Type.instance)
+                                             .addRegularColumn("v", SetType.getInstance(Int32Type.instance, true))
+                                             .build();
+
+        ColumnDefinition v = table.getColumnDefinition(new ColumnIdentifier("v", false));
+        ByteBuffer bound = LegacyLayout.encodeCellName(table, Clustering.EMPTY, v.name.bytes, Int32Type.instance.decompose(1));
+
+        LegacyLayout.decodeSliceBound(table, bound, true);
+    }
+
+    @Test
+    public void testAsymmetricRTBoundSerializedSize()
+    {
+        CFMetaData table = CFMetaData.Builder.create("ks", "cf")
+                                             .addPartitionKey("k", Int32Type.instance)
+                                             .addClusteringColumn("c1", Int32Type.instance)
+                                             .addClusteringColumn("c2", Int32Type.instance)
+                                             .addRegularColumn("v", Int32Type.instance)
+                                             .build();
+
+        ByteBuffer one = Int32Type.instance.decompose(1);
+        ByteBuffer two = Int32Type.instance.decompose(2);
+        PartitionUpdate p = new PartitionUpdate(table, table.decorateKey(one), table.partitionColumns(), 0);
+        p.add(new RangeTombstone(Slice.make(new Slice.Bound(ClusteringPrefix.Kind.EXCL_START_BOUND, new ByteBuffer[] { one, one }),
+                                            new Slice.Bound(ClusteringPrefix.Kind.INCL_END_BOUND, new ByteBuffer[] { two })),
+                                 new DeletionTime(1, 1)
+        ));
+
+        LegacyLayout.fromUnfilteredRowIterator(null, p.unfilteredIterator());
+        LegacyLayout.serializedSizeAsLegacyPartition(null, p.unfilteredIterator(), VERSION_21);
+    }
+
+    @Test
+    public void testCellGrouper()
+    {
+        // CREATE TABLE %s (pk int, ck int, v map<text, text>, PRIMARY KEY (pk, ck))
+        CFMetaData cfm = CFMetaData.Builder.create("ks", "table")
+                                           .addPartitionKey("pk", Int32Type.instance)
+                                           .addClusteringColumn("ck", Int32Type.instance)
+                                           .addRegularColumn("v", MapType.getInstance(UTF8Type.instance, UTF8Type.instance, true))
+                                           .build();
+        SerializationHelper helper = new SerializationHelper(cfm, MessagingService.VERSION_22, SerializationHelper.Flag.LOCAL, ColumnFilter.all(cfm));
+        LegacyLayout.CellGrouper cg = new LegacyLayout.CellGrouper(cfm, helper);
+
+        Slice.Bound startBound = Slice.Bound.create(ClusteringPrefix.Kind.INCL_START_BOUND, new ByteBuffer[] {bytes(2)});
+        Slice.Bound endBound = Slice.Bound.create(ClusteringPrefix.Kind.EXCL_END_BOUND, new ByteBuffer[] {bytes(2)});
+        LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(startBound, false, cfm.getColumnDefinition(bytes("v")));
+        LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(endBound, false, cfm.getColumnDefinition(bytes("v")));
+        LegacyLayout.LegacyRangeTombstone lrt = new LegacyLayout.LegacyRangeTombstone(start, end, new DeletionTime(2, 1588598040));
+        assertTrue(cg.addAtom(lrt));
+
+        // add a real cell
+        LegacyLayout.LegacyCell cell = new LegacyLayout.LegacyCell(LegacyLayout.LegacyCell.Kind.REGULAR,
+                                                                   new LegacyLayout.LegacyCellName(new Clustering(bytes(2)),
+                                                                                                   cfm.getColumnDefinition(bytes("v")),
+                                                                                                   bytes("g")),
+                                                                   bytes("v"), 3, Integer.MAX_VALUE, 0);
+        assertTrue(cg.addAtom(cell));
+
+        // add legacy range tombstone where collection name is null for the end bound (this gets translated to a row tombstone)
+        startBound = Slice.Bound.create(ClusteringPrefix.Kind.EXCL_START_BOUND, new ByteBuffer[] {bytes(2)});
+        endBound = Slice.Bound.create(ClusteringPrefix.Kind.EXCL_END_BOUND, new ByteBuffer[] {bytes(2)});
+        start = new LegacyLayout.LegacyBound(startBound, false, cfm.getColumnDefinition(bytes("v")));
+        end = new LegacyLayout.LegacyBound(endBound, false, null);
+        assertTrue(cg.addAtom(new LegacyLayout.LegacyRangeTombstone(start, end, new DeletionTime(1, 1588598040))));
+    }
+
+    private static LegacyCell cell(Clustering clustering, ColumnDefinition column, ByteBuffer value, long timestamp)
+    {
+        return new LegacyCell(LegacyCell.Kind.REGULAR,
+                              new LegacyLayout.LegacyCellName(clustering, column, null),
+                              value,
+                              timestamp,
+                              Cell.NO_DELETION_TIME,
+                              Cell.NO_TTL);
+    }
+
+    /**
+     * This tests that when {@link CellGrouper} gets a collection tombstone for
+     * a non-fetched collection, then that tombstone does not incorrectly stop the grouping of the current row, as
+     * was done before CASSANDRA-15805.
+     *
+     * <p>Please note that this rely on a query only _fetching_ some of the table columns, which in practice only
+     * happens for thrift queries, and thrift queries shouldn't mess up with CQL tables and collection tombstones,
+     * so this test is not of the utmost importance. Nonetheless, the pre-CASSANDRA-15805 behavior was incorrect and
+     * this ensure it is fixed.
+     */
+    @Test
+    public void testCellGrouperOnNonFecthedCollectionTombstone()
+    {
+        // CREATE TABLE %s (pk int, ck int, a text, b set<text>, c text, PRIMARY KEY (pk, ck))
+        CFMetaData cfm = CFMetaData.Builder.create("ks", "table")
+                                           .addPartitionKey("pk", Int32Type.instance)
+                                           .addClusteringColumn("ck", Int32Type.instance)
+                                           .addRegularColumn("a", UTF8Type.instance)
+                                           .addRegularColumn("b", SetType.getInstance(UTF8Type.instance, true))
+                                           .addRegularColumn("c", UTF8Type.instance)
+                                           .build();
+
+        // Creates a filter that _only_ fetches a and c, but not b.
+        ColumnFilter filter = ColumnFilter.selectionBuilder()
+                                          .add(cfm.getColumnDefinition(bytes("a")))
+                                          .add(cfm.getColumnDefinition(bytes("c")))
+                                          .build();
+        SerializationHelper helper = new SerializationHelper(cfm,
+                                                             MessagingService.VERSION_22,
+                                                             SerializationHelper.Flag.LOCAL,
+                                                             filter);
+        CellGrouper grouper = new CellGrouper(cfm, helper);
+        Clustering clustering = new Clustering(bytes(1));
+
+        // We add a cell for a, then a collection tombstone for b, and then a cell for c (for the same clustering).
+        // All those additions should return 'true' as all belong to the same row.
+        LegacyCell ca = cell(clustering, cfm.getColumnDefinition(bytes("a")), bytes("v1"), 1);
+        assertTrue(grouper.addAtom(ca));
+
+        Slice.Bound startBound = Slice.Bound.inclusiveStartOf(bytes(1));
+        Slice.Bound endBound = Slice.Bound.inclusiveEndOf(bytes(1));
+        ColumnDefinition bDef = cfm.getColumnDefinition(bytes("b"));
+        assert bDef != null;
+        LegacyBound start = new LegacyBound(startBound, false, bDef);
+        LegacyBound end = new LegacyBound(endBound, false, bDef);
+        LegacyRangeTombstone rtb = new LegacyRangeTombstone(start, end, new DeletionTime(1, 1588598040));
+        assertTrue(rtb.isCollectionTombstone()); // Ensure we're testing what we think
+        assertTrue(grouper.addAtom(rtb));
+
+        LegacyCell cc = cell(clustering, cfm.getColumnDefinition(bytes("c")), bytes("v2"), 1);
+        assertTrue(grouper.addAtom(cc));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/LegacyLayoutValidationTest.java b/test/unit/org/apache/cassandra/db/LegacyLayoutValidationTest.java
new file mode 100644
index 0000000..068d2a2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/LegacyLayoutValidationTest.java

@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterators;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.LegacyLayout.LegacyCell;
+import org.apache.cassandra.db.LegacyLayout.LegacyCellName;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.serializers.MarshalException;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
+
+public class LegacyLayoutValidationTest
+{
+    static final String KEYSPACE = "ks";
+
+    private static final CFMetaData FIXED = CFMetaData.Builder.create("ks", "cf")
+                                                              .addPartitionKey("k", Int32Type.instance)
+                                                              .addClusteringColumn("c1", Int32Type.instance)
+                                                              .addClusteringColumn("c2", Int32Type.instance)
+                                                              .addRegularColumn("v1", Int32Type.instance)
+                                                              .addRegularColumn("v2", Int32Type.instance)
+                                                              .build();
+
+    private static final CFMetaData COMPACT_FIXED = CFMetaData.Builder.create("ks", "cf", true, false, false)
+                                                                      .addPartitionKey("k", Int32Type.instance)
+                                                                      .addClusteringColumn("c", Int32Type.instance)
+                                                                      .addRegularColumn("v", Int32Type.instance)
+                                                                      .build();
+
+    private static final CFMetaData VARIABLE = CFMetaData.Builder.create("ks", "cf")
+                                                                 .addPartitionKey("k", Int32Type.instance)
+                                                                 .addClusteringColumn("c1", UTF8Type.instance)
+                                                                 .addClusteringColumn("c2", UTF8Type.instance)
+                                                                 .addRegularColumn("v1", UTF8Type.instance)
+                                                                 .addRegularColumn("v2", UTF8Type.instance)
+                                                                 .build();
+
+    private static final CFMetaData COMPACT_VARIABLE = CFMetaData.Builder.create("ks", "cf", true, false, false)
+                                                                         .addPartitionKey("k", Int32Type.instance)
+                                                                         .addClusteringColumn("c", UTF8Type.instance)
+                                                                         .addRegularColumn("v", UTF8Type.instance)
+                                                                         .build();
+
+    @Test
+    public void fixedClusteringSuccess()
+    {
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), Int32Type.instance.decompose(2));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(FIXED, clustering);
+        LegacyLayout.decodeClustering(FIXED, serialized);
+    }
+
+    @Test (expected = MarshalException.class)
+    public void fixedClusteringFailure()
+    {
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), hexToBytes("07000000000001"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(FIXED, clustering);
+        LegacyLayout.decodeClustering(FIXED, serialized);
+    }
+
+    @Test
+    public void variableClusteringSuccess()
+    {
+        Clustering clustering = new Clustering(UTF8Type.instance.decompose("one"), UTF8Type.instance.decompose("two,three"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(VARIABLE, clustering);
+        LegacyLayout.decodeClustering(VARIABLE, serialized);
+    }
+
+    @Test
+    public void fixedCompactClusteringSuccess()
+    {
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(2));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(COMPACT_FIXED, clustering);
+        LegacyLayout.decodeClustering(COMPACT_FIXED, serialized);
+    }
+
+    @Test (expected = MarshalException.class)
+    public void fixedCompactClusteringFailure()
+    {
+        Clustering clustering = new Clustering(hexToBytes("07000000000001"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(COMPACT_FIXED, clustering);
+        LegacyLayout.decodeClustering(COMPACT_FIXED, serialized);
+    }
+
+    @Test
+    public void variableCompactClusteringSuccess()
+    {
+        Clustering clustering = new Clustering(UTF8Type.instance.decompose("two,three"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(COMPACT_VARIABLE, clustering);
+        LegacyLayout.decodeClustering(COMPACT_VARIABLE, serialized);
+    }
+
+    @Test
+    public void fixedBoundSuccess()
+    {
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), Int32Type.instance.decompose(2));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(FIXED, clustering);
+        LegacyLayout.decodeSliceBound(FIXED, serialized, true);
+    }
+
+    @Test (expected = MarshalException.class)
+    public void fixedBoundFailure()
+    {
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), hexToBytes("07000000000001"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(FIXED, clustering);
+        LegacyLayout.decodeSliceBound(FIXED, serialized, true);
+    }
+
+    @Test
+    public void variableBoundSuccess()
+    {
+        Clustering clustering = new Clustering(UTF8Type.instance.decompose("one"), UTF8Type.instance.decompose("two,three"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(VARIABLE, clustering);
+        LegacyLayout.decodeSliceBound(VARIABLE, serialized, true);
+    }
+
+    @Test
+    public void fixedCompactBoundSuccess()
+    {
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(COMPACT_FIXED, clustering);
+        LegacyLayout.decodeSliceBound(COMPACT_FIXED, serialized, true);
+    }
+
+    @Test (expected = MarshalException.class)
+    public void fixedCompactBoundFailure()
+    {
+        Clustering clustering = new Clustering(hexToBytes("07000000000001"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(COMPACT_FIXED, clustering);
+        LegacyLayout.decodeSliceBound(COMPACT_FIXED, serialized, true);
+    }
+
+    @Test
+    public void variableCompactBoundSuccess()
+    {
+        Clustering clustering = new Clustering(UTF8Type.instance.decompose("one"));
+        ByteBuffer serialized = LegacyLayout.encodeClustering(COMPACT_VARIABLE, clustering);
+        LegacyLayout.decodeSliceBound(COMPACT_VARIABLE, serialized, true);
+    }
+
+    private static LegacyCell cell(CFMetaData cfm, Clustering clustering, String name, ByteBuffer value) throws UnknownColumnException
+    {
+        ColumnDefinition definition = cfm.getColumnDefinition(new ColumnIdentifier(name, false));
+
+        ByteBuffer cellName = LegacyCellName.create(clustering, definition).encode(cfm);
+        return LegacyCell.regular(cfm, null, cellName, value, 0);
+
+    }
+
+    @Test
+    public void fixedValueSuccess() throws Throwable
+    {
+        DecoratedKey dk = DatabaseDescriptor.getPartitioner().decorateKey(Int32Type.instance.decompose(1000000));
+        LegacyLayout.LegacyDeletionInfo deletionInfo = LegacyLayout.LegacyDeletionInfo.live();
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), Int32Type.instance.decompose(2));
+        Iterator<LegacyCell> cells = Iterators.forArray(cell(FIXED, clustering, "v1", Int32Type.instance.decompose(3)),
+                                                        cell(FIXED, clustering, "v2", Int32Type.instance.decompose(4)));
+        try (UnfilteredRowIterator iter = LegacyLayout.toUnfilteredRowIterator(FIXED, dk, deletionInfo, cells))
+        {
+            while (iter.hasNext())
+                iter.next();
+        }
+    }
+
+    @Test (expected = MarshalException.class)
+    public void fixedValueFailure() throws Throwable
+    {
+        DecoratedKey dk = DatabaseDescriptor.getPartitioner().decorateKey(Int32Type.instance.decompose(1000000));
+        LegacyLayout.LegacyDeletionInfo deletionInfo = LegacyLayout.LegacyDeletionInfo.live();
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), Int32Type.instance.decompose(2));
+        Iterator<LegacyCell> cells = Iterators.forArray(cell(FIXED, clustering, "v1", Int32Type.instance.decompose(3)),
+                                                        cell(FIXED, clustering, "v2", hexToBytes("0000")));
+        try (UnfilteredRowIterator iter = LegacyLayout.toUnfilteredRowIterator(FIXED, dk, deletionInfo, cells))
+        {
+            while (iter.hasNext())
+                iter.next();
+        }
+    }
+
+    @Test
+    public void variableValueSuccess() throws Throwable
+    {
+        DecoratedKey dk = DatabaseDescriptor.getPartitioner().decorateKey(Int32Type.instance.decompose(1000000));
+        LegacyLayout.LegacyDeletionInfo deletionInfo = LegacyLayout.LegacyDeletionInfo.live();
+        Clustering clustering = new Clustering(Int32Type.instance.decompose(1), Int32Type.instance.decompose(2));
+        Iterator<LegacyCell> cells = Iterators.forArray(cell(VARIABLE, clustering, "v1", UTF8Type.instance.decompose("3")),
+                                                        cell(VARIABLE, clustering, "v2", hexToBytes("0000")));
+        try (UnfilteredRowIterator iter = LegacyLayout.toUnfilteredRowIterator(VARIABLE, dk, deletionInfo, cells))
+        {
+            while (iter.hasNext())
+                iter.next();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/LivenessInfoTest.java b/test/unit/org/apache/cassandra/db/LivenessInfoTest.java
new file mode 100644
index 0000000..b08023c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/LivenessInfoTest.java

@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import static org.junit.Assert.*;
+
+import org.apache.cassandra.utils.FBUtilities;
+
+import org.junit.Test;
+
+public class LivenessInfoTest
+{
+    @Test
+    public void testSupersedes()
+    {
+        LivenessInfo first;
+        LivenessInfo second;
+        int nowInSeconds = FBUtilities.nowInSeconds();
+
+        // timestamp supersedes for normal liveness info
+        first = LivenessInfo.create(100, 0, nowInSeconds);
+        second = LivenessInfo.create(101, 0, nowInSeconds);
+        assertSupersedes(second, first);
+
+        // timestamp supersedes for ttl
+        first = LivenessInfo.create(100, 0, nowInSeconds);
+        second = LivenessInfo.expiring(99, 1, nowInSeconds);
+        assertSupersedes(first, second);
+
+        // timestamp supersedes for mv expired liveness
+        first = LivenessInfo.create(100, 0, nowInSeconds);
+        second = LivenessInfo.create(99, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds);
+        assertSupersedes(first, second);
+
+        // timestamp ties, ttl supersedes non-ttl
+        first = LivenessInfo.expiring(100, 1, nowInSeconds);
+        second = LivenessInfo.create(100, 0, nowInSeconds);
+        assertSupersedes(first, second);
+
+        // timestamp ties, greater localDeletionTime supersedes
+        first = LivenessInfo.expiring(100, 2, nowInSeconds);
+        second = LivenessInfo.expiring(100, 1, nowInSeconds);
+        assertSupersedes(first, second);
+
+        first = LivenessInfo.expiring(100, 5, nowInSeconds - 4);
+        second = LivenessInfo.expiring(100, 2, nowInSeconds);
+        assertSupersedes(second, first);
+
+        // timestamp ties, mv expired liveness supersedes normal ttl
+        first = LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds);
+        second = LivenessInfo.expiring(100, 1000, nowInSeconds);
+        assertSupersedes(first, second);
+
+        // timestamp ties, mv expired liveness supersedes non-ttl
+        first = LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds);
+        second = LivenessInfo.create(100, 0, nowInSeconds);
+        assertSupersedes(first, second);
+
+        // timestamp ties, both are mv expired liveness, local deletion time win
+        first = LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds + 1);
+        second = LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds);
+        assertSupersedes(first, second);
+    }
+
+    @Test
+    public void testIsLive()
+    {
+        int nowInSeconds = FBUtilities.nowInSeconds();
+
+        assertIsLive(LivenessInfo.create(100, 0, nowInSeconds), nowInSeconds - 3, true);
+        assertIsLive(LivenessInfo.create(100, 0, nowInSeconds), nowInSeconds, true);
+        assertIsLive(LivenessInfo.create(100, 0, nowInSeconds), nowInSeconds + 3, true);
+
+        assertIsLive(LivenessInfo.expiring(100, 2, nowInSeconds), nowInSeconds - 3, true);
+        assertIsLive(LivenessInfo.expiring(100, 2, nowInSeconds), nowInSeconds, true);
+        assertIsLive(LivenessInfo.expiring(100, 2, nowInSeconds), nowInSeconds + 3, false);
+
+        assertIsLive(LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds), nowInSeconds - 3, false);
+        assertIsLive(LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds), nowInSeconds, false);
+        assertIsLive(LivenessInfo.create(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds), nowInSeconds + 3, false);
+    }
+
+    /**
+     * left supersedes right, right doesn't supersede left.
+     */
+    private static void assertSupersedes(LivenessInfo left, LivenessInfo right)
+    {
+        assertTrue(left.supersedes(right));
+        assertFalse(right.supersedes(left));
+    }
+
+    private static void assertIsLive(LivenessInfo info, int nowInSec, boolean alive)
+    {
+        assertEquals(info.isLive(nowInSec), alive);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java b/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java
new file mode 100644
index 0000000..d690253
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java

@@ -0,0 +1,49 @@
+package org.apache.cassandra.db;
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.junit.Test;
+
+
+public class MultiKeyspaceTest extends CQLTester
+{
+    @Test
+    public void testSameTableNames() throws Throwable
+    {
+        schemaChange("CREATE KEYSPACE multikstest1 WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
+        schemaChange("CREATE TABLE multikstest1.standard1 (a int PRIMARY KEY, b int)");
+
+        schemaChange("CREATE KEYSPACE multikstest2 WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
+        schemaChange("CREATE TABLE multikstest2.standard1 (a int PRIMARY KEY, b int)");
+
+        execute("INSERT INTO multikstest1.standard1 (a, b) VALUES (0, 0)");
+        execute("INSERT INTO multikstest2.standard1 (a, b) VALUES (0, 0)");
+
+        Keyspace.open("multikstest1").flush();
+        Keyspace.open("multikstest2").flush();
+
+        assertRows(execute("SELECT * FROM multikstest1.standard1"),
+                   row(0, 0));
+        assertRows(execute("SELECT * FROM multikstest2.standard1"),
+                   row(0, 0));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/MultitableTest.java b/test/unit/org/apache/cassandra/db/MultitableTest.java
deleted file mode 100644
index fd04b76..0000000
--- a/test/unit/org/apache/cassandra/db/MultitableTest.java
+++ /dev/null

@@ -1,80 +0,0 @@
-package org.apache.cassandra.db;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import static org.apache.cassandra.Util.column;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-
-public class MultitableTest
-{
-    private static final String KEYSPACE1 = "MultitableTest1";
-    private static final String KEYSPACE2 = "MultitableTest2";
-    private static final String CF1 = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF1));
-        SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE2, CF1));
-    }
-
-    @Test
-    public void testSameCFs()
-    {
-        Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
-        Keyspace keyspace2 = Keyspace.open(KEYSPACE2);
-
-        Mutation rm;
-        DecoratedKey dk = Util.dk("keymulti");
-        ColumnFamily cf;
-
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, dk.getKey(), cf);
-        rm.applyUnsafe();
-
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE2, "Standard1");
-        cf.addColumn(column("col2", "val2", 1L));
-        rm = new Mutation(KEYSPACE2, dk.getKey(), cf);
-        rm.applyUnsafe();
-
-        keyspace1.getColumnFamilyStore("Standard1").forceBlockingFlush();
-        keyspace2.getColumnFamilyStore("Standard1").forceBlockingFlush();
-
-        assertColumns(Util.getColumnFamily(keyspace1, dk, "Standard1"), "col1");
-        assertColumns(Util.getColumnFamily(keyspace2, dk, "Standard1"), "col2");
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/NameSortTest.java b/test/unit/org/apache/cassandra/db/NameSortTest.java
index c4361d8..1da6ea6 100644
--- a/test/unit/org/apache/cassandra/db/NameSortTest.java
+++ b/test/unit/org/apache/cassandra/db/NameSortTest.java

@@ -18,20 +18,20 @@
  */
 package org.apache.cassandra.db;
 
-import static org.junit.Assert.assertEquals;
-import static org.apache.cassandra.Util.addMutation;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.Collection;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.junit.Assert.assertEquals;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -39,17 +39,14 @@
 {
     private static final String KEYSPACE1 = "NameSortTest";
     private static final String CF = "Standard1";
-    private static final String CFSUPER = "Super1";
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CFSUPER, LongType.instance));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF, 1000, AsciiType.instance));
     }
 
     @Test
@@ -76,56 +73,35 @@
     private void testNameSort(int N) throws IOException
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-
-        for (int i = 0; i < N; ++i)
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
+        for (int i = 0; i < N; i++)
         {
             ByteBuffer key = ByteBufferUtil.bytes(Integer.toString(i));
-            Mutation rm;
-
-            // standard
-            for (int j = 0; j < 8; ++j)
-            {
-                ByteBuffer bytes = j % 2 == 0 ? ByteBufferUtil.bytes("a") : ByteBufferUtil.bytes("b");
-                rm = new Mutation(KEYSPACE1, key);
-                rm.add("Standard1", Util.cellname("Cell-" + j), bytes, j);
-                rm.applyUnsafe();
-            }
-
-            // super
-            for (int j = 0; j < 8; ++j)
-            {
-                rm = new Mutation(KEYSPACE1, key);
-                for (int k = 0; k < 4; ++k)
-                {
-                    String value = (j + k) % 2 == 0 ? "a" : "b";
-                    addMutation(rm, CFSUPER, "SuperColumn-" + j, k, value, k);
-                }
-                rm.applyUnsafe();
-            }
+            RowUpdateBuilder rub = new RowUpdateBuilder(cfs.metadata, 0, key);
+            rub.clustering("cc");
+            for (int j = 0; j < 8; j++)
+                rub.add("val" + j, j % 2 == 0 ? "a" : "b");
+            rub.build().applyUnsafe();
         }
-
-        validateNameSort(keyspace, N);
-
+        validateNameSort(cfs);
         keyspace.getColumnFamilyStore("Standard1").forceBlockingFlush();
-        keyspace.getColumnFamilyStore(CFSUPER).forceBlockingFlush();
-        validateNameSort(keyspace, N);
+        validateNameSort(cfs);
     }
 
-    private void validateNameSort(Keyspace keyspace, int N) throws IOException
+    private void validateNameSort(ColumnFamilyStore cfs) throws IOException
     {
-        for (int i = 0; i < N; ++i)
+        for (FilteredPartition partition : Util.getAll(Util.cmd(cfs).build()))
         {
-            DecoratedKey key = Util.dk(Integer.toString(i));
-            ColumnFamily cf;
-
-            cf = Util.getColumnFamily(keyspace, key, "Standard1");
-            Collection<Cell> cells = cf.getSortedColumns();
-            for (Cell cell : cells)
+            for (Row r : partition)
             {
-                String name = ByteBufferUtil.string(cell.name().toByteBuffer());
-                int j = Integer.valueOf(name.substring(name.length() - 1));
-                byte[] bytes = j % 2 == 0 ? "a".getBytes() : "b".getBytes();
-                assertEquals(new String(bytes), ByteBufferUtil.string(cell.value()));
+                for (ColumnDefinition cd : r.columns())
+                {
+                    if (r.getCell(cd) == null)
+                        continue;
+                    int cellVal = Integer.valueOf(cd.name.toString().substring(cd.name.toString().length() - 1));
+                    String expected = cellVal % 2 == 0 ? "a" : "b";
+                    assertEquals(expected, ByteBufferUtil.string(r.getCell(cd).value()));
+                }
             }
         }
     }

diff --git a/test/unit/org/apache/cassandra/db/NativeCellTest.java b/test/unit/org/apache/cassandra/db/NativeCellTest.java
deleted file mode 100644
index 4145a91..0000000
--- a/test/unit/org/apache/cassandra/db/NativeCellTest.java
+++ /dev/null

@@ -1,283 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-import java.util.Arrays;
-import java.util.Random;
-import java.util.concurrent.ThreadLocalRandom;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CompoundDenseCellNameType;
-import org.apache.cassandra.db.composites.CompoundSparseCellNameType;
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
-import org.apache.cassandra.db.composites.SimpleSparseCellNameType;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.NativeAllocator;
-import org.apache.cassandra.utils.memory.NativePool;
-
-import static org.apache.cassandra.db.composites.CellNames.compositeDense;
-import static org.apache.cassandra.db.composites.CellNames.compositeSparse;
-import static org.apache.cassandra.db.composites.CellNames.simpleDense;
-import static org.apache.cassandra.db.composites.CellNames.simpleSparse;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-
-public class NativeCellTest
-{
-
-    private static final NativeAllocator nativeAllocator = new NativePool(Integer.MAX_VALUE, Integer.MAX_VALUE, 1f, null).newAllocator();
-    private static final OpOrder.Group group = new OpOrder().start();
-
-    static class Name
-    {
-        final CellName name;
-        final CellNameType type;
-        Name(CellName name, CellNameType type)
-        {
-            this.name = name;
-            this.type = type;
-        }
-    }
-
-    static ByteBuffer[] bytess(String ... strings)
-    {
-        ByteBuffer[] r = new ByteBuffer[strings.length];
-        for (int i = 0 ; i < r.length ; i++)
-            r[i] = bytes(strings[i]);
-        return r;
-    }
-
-    final static Name[] TESTS = new Name[]
-                          {
-                              new Name(simpleDense(bytes("a")), new SimpleDenseCellNameType(UTF8Type.instance)),
-                              new Name(simpleSparse(new ColumnIdentifier("a", true)), new SimpleSparseCellNameType(UTF8Type.instance)),
-                              new Name(compositeDense(bytes("a"), bytes("b")), new CompoundDenseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
-                              new Name(compositeSparse(bytess("b", "c"), new ColumnIdentifier("a", true), false), new CompoundSparseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
-                              new Name(compositeSparse(bytess("b", "c"), new ColumnIdentifier("a", true), true), new CompoundSparseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
-                              new Name(simpleDense(huge('a', 40000)), new SimpleDenseCellNameType(UTF8Type.instance)),
-                              new Name(simpleSparse(new ColumnIdentifier(hugestr('a', 40000), true)), new SimpleSparseCellNameType(UTF8Type.instance)),
-                              new Name(compositeDense(huge('a', 20000), huge('b', 20000)), new CompoundDenseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
-                              new Name(compositeSparse(huges(40000, 'b', 'c'), new ColumnIdentifier(hugestr('a', 10000), true), false), new CompoundSparseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance))),
-                              new Name(compositeSparse(huges(40000, 'b', 'c'), new ColumnIdentifier(hugestr('a', 10000), true), true), new CompoundSparseCellNameType(Arrays.<AbstractType<?>>asList(UTF8Type.instance, UTF8Type.instance)))
-                          };
-
-    private static ByteBuffer huge(char ch, int count)
-    {
-        return bytes(hugestr(ch, count));
-    }
-
-    private static ByteBuffer[] huges(int count, char ... chs)
-    {
-        ByteBuffer[] r = new ByteBuffer[chs.length];
-        for (int i = 0 ; i < chs.length ; i++)
-            r[i] = huge(chs[i], count / chs.length);
-        return r;
-    }
-
-    private static String hugestr(char ch, int count)
-    {
-        ThreadLocalRandom random = ThreadLocalRandom.current();
-        byte[] bytes = new byte[count];
-        random.nextBytes(bytes);
-        bytes[0] = (byte) ch;
-        for (int i = 0 ; i < bytes.length ; i++)
-            bytes[i] &= 0x7f;
-        return new String(bytes);
-    }
-
-    private static final CFMetaData metadata = new CFMetaData("", "", ColumnFamilyType.Standard, null);
-    static
-    {
-        try
-        {
-            metadata.addColumnDefinition(new ColumnDefinition(null, null, new ColumnIdentifier("a", true), UTF8Type.instance, null, null, null, null, null));
-        }
-        catch (ConfigurationException e)
-        {
-            throw new AssertionError();
-        }
-        // TODO: CounterId accesses SystemKespace to get local host ID, so need to mark as daemon initialized
-        DatabaseDescriptor.setDaemonInitialized();
-    }
-
-    @Test
-    public void testCells() throws IOException
-    {
-        Random rand = ThreadLocalRandom.current();
-        for (Name test : TESTS)
-        {
-            byte[] bytes = new byte[16];
-            rand.nextBytes(bytes);
-
-            // test regular Cell
-            Cell buf, nat;
-            buf = new BufferCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong());
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-            test(test, buf, nat);
-
-            // test DeletedCell
-            buf = new BufferDeletedCell(test.name, rand.nextInt(100000), rand.nextLong());
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-            test(test, buf, nat);
-
-            // test ExpiringCell
-            buf = new BufferExpiringCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong(),  rand.nextInt(100000));
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-            test(test, buf, nat);
-
-            // test CounterCell
-            buf = new BufferCounterCell(test.name, CounterContext.instance().createLocal(rand.nextLong()), rand.nextLong(),  rand.nextInt(100000));
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-            test(test, buf, nat);
-        }
-    }
-
-
-    @Test
-    public void testComparator()
-    {
-
-        Random rand = ThreadLocalRandom.current();
-        for (Name test : TESTS)
-        {
-            byte[] bytes = new byte[7];
-            byte[] bytes2 = new byte[7];
-            rand.nextBytes(bytes);
-            rand.nextBytes(bytes2);
-
-            // test regular Cell
-            Cell buf, nat, buf2, nat2;
-            buf = new BufferCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong());
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-
-            buf2 = new BufferCell(test.name, ByteBuffer.wrap(bytes2), rand.nextLong());
-            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
-
-            assert test.type.compare(buf.name(), nat.name()) == 0;
-            assert test.type.compare(buf2.name(), nat2.name()) == 0;
-
-            int val = test.type.compare(buf.name(), buf2.name());
-            assert test.type.compare(nat.name(), nat2.name()) == val;
-            assert test.type.compare(nat.name(), buf2.name()) == val;
-            assert test.type.compare(buf.name(), nat2.name()) == val;
-
-
-            // test DeletedCell
-            buf = new BufferDeletedCell(test.name, rand.nextInt(100000), rand.nextLong());
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-            buf2 = new BufferDeletedCell(test.name, rand.nextInt(100000), rand.nextLong());
-            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
-
-            assert test.type.compare(buf.name(), nat.name()) == 0;
-            assert test.type.compare(buf2.name(), nat2.name()) == 0;
-
-            val = test.type.compare(buf.name(), buf2.name());
-            assert test.type.compare(nat.name(), nat2.name()) == val;
-            assert test.type.compare(nat.name(), buf2.name()) == val;
-            assert test.type.compare(buf.name(), nat2.name()) == val;
-
-
-
-            // test ExpiringCell
-            buf = new BufferExpiringCell(test.name, ByteBuffer.wrap(bytes), rand.nextLong(),  rand.nextInt(100000));
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-
-            buf2 = new BufferExpiringCell(test.name, ByteBuffer.wrap(bytes2), rand.nextLong(),  rand.nextInt(100000));
-            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
-
-            assert test.type.compare(buf.name(), nat.name()) == 0;
-            assert test.type.compare(buf2.name(), nat2.name()) == 0;
-
-            val = test.type.compare(buf.name(), buf2.name());
-            assert test.type.compare(nat.name(), nat2.name()) == val;
-            assert test.type.compare(nat.name(), buf2.name()) == val;
-            assert test.type.compare(buf.name(), nat2.name()) == val;
-
-
-            // test CounterCell
-            buf = new BufferCounterCell(test.name, CounterContext.instance().createLocal(rand.nextLong()), rand.nextLong(),  rand.nextInt(100000));
-            nat = buf.localCopy(metadata, nativeAllocator, group);
-
-            buf2 = new BufferCounterCell(test.name, CounterContext.instance().createLocal(rand.nextLong()), rand.nextLong(),  rand.nextInt(100000));
-            nat2 = buf2.localCopy(metadata, nativeAllocator, group);
-
-            assert test.type.compare(buf.name(), nat.name()) == 0;
-            assert test.type.compare(buf2.name(), nat2.name()) == 0;
-
-            val = test.type.compare(buf.name(), buf2.name());
-            assert test.type.compare(nat.name(), nat2.name()) == val;
-            assert test.type.compare(nat.name(), buf2.name()) == val;
-            assert test.type.compare(buf.name(), nat2.name()) == val;
-
-        }
-    }
-
-    static void test(Name test, Cell buf, Cell nat) throws IOException
-    {
-        Assert.assertTrue(buf.equals(nat));
-        Assert.assertTrue(nat.equals(buf));
-        Assert.assertTrue(buf.equals(buf));
-        Assert.assertTrue(nat.equals(nat));
-
-        try
-        {
-            MessageDigest d1 = MessageDigest.getInstance("MD5");
-            MessageDigest d2 = MessageDigest.getInstance("MD5");
-            buf.updateDigest(d1);
-            nat.updateDigest(d2);
-            Assert.assertArrayEquals(d1.digest(), d2.digest());
-        }
-        catch (NoSuchAlgorithmException e)
-        {
-            throw new IllegalStateException(e);
-        }
-
-        byte[] serialized;
-        try (DataOutputBuffer bufOut = new DataOutputBuffer())
-        {
-            test.type.columnSerializer().serialize(nat, bufOut);
-            serialized = bufOut.getData();
-        }
-
-        ByteArrayInputStream bufIn = new ByteArrayInputStream(serialized, 0, serialized.length);
-        Cell deserialized = test.type.columnSerializer().deserialize(new DataInputStream(bufIn));
-        Assert.assertTrue(buf.equals(deserialized));
-
-    }
-
-
-
-}

diff --git a/test/unit/org/apache/cassandra/db/OldFormatDeserializerTest.java b/test/unit/org/apache/cassandra/db/OldFormatDeserializerTest.java
new file mode 100644
index 0000000..886b191
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/OldFormatDeserializerTest.java

@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.function.Supplier;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.UnfilteredDeserializer.OldFormatDeserializer.UnfilteredIterator;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.*;
+
+public class OldFormatDeserializerTest
+{
+    @Test
+    public void testRangeTombstones() throws Exception
+    {
+        CFMetaData metadata = CFMetaData.Builder.create("ks", "table")
+                                                .withPartitioner(Murmur3Partitioner.instance)
+                                                .addPartitionKey("k", Int32Type.instance)
+                                                .addClusteringColumn("v", Int32Type.instance)
+                                                .build();
+
+        Supplier<LegacyLayout.LegacyAtom> atomSupplier = supplier(rt(0, 10, 42),
+                                                                  rt(5, 15, 42));
+
+        UnfilteredIterator iterator = new UnfilteredIterator(metadata,
+                                                             DeletionTime.LIVE,
+                                                             new SerializationHelper(metadata, MessagingService.current_version, SerializationHelper.Flag.LOCAL),
+                                                             atomSupplier);
+
+        // As the deletion time are the same, we want this to produce a single range tombstone covering from 0 to 15.
+
+        assertTrue(iterator.hasNext());
+
+        Unfiltered first = iterator.next();
+        assertTrue(first.isRangeTombstoneMarker());
+        RangeTombstoneMarker start = (RangeTombstoneMarker)first;
+        assertTrue(start.isOpen(false));
+        assertFalse(start.isClose(false));
+        assertEquals(0, toInt(start.openBound(false)));
+        assertEquals(42, start.openDeletionTime(false).markedForDeleteAt());
+
+        Unfiltered second = iterator.next();
+        assertTrue(second.isRangeTombstoneMarker());
+        RangeTombstoneMarker end = (RangeTombstoneMarker)second;
+        assertTrue(end.isClose(false));
+        assertFalse(end.isOpen(false));
+        assertEquals(15, toInt(end.closeBound(false)));
+        assertEquals(42, end.closeDeletionTime(false).markedForDeleteAt());
+
+         assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testRangeTombstonesSameStart() throws Exception
+    {
+        CFMetaData metadata = CFMetaData.Builder.create("ks", "table")
+                                                .withPartitioner(Murmur3Partitioner.instance)
+                                                .addPartitionKey("k", Int32Type.instance)
+                                                .addClusteringColumn("v", Int32Type.instance)
+                                                .build();
+
+        // Multiple RT that have the same start (we _can_ get this in the legacy format!)
+        Supplier<LegacyLayout.LegacyAtom> atomSupplier = supplier(rt(1, 2, 3),
+                                                                  rt(1, 2, 5),
+                                                                  rt(1, 5, 4));
+
+        UnfilteredIterator iterator = new UnfilteredIterator(metadata,
+                                                             DeletionTime.LIVE,
+                                                             new SerializationHelper(metadata, MessagingService.current_version, SerializationHelper.Flag.LOCAL),
+                                                             atomSupplier);
+
+        // We should be entirely ignoring the first tombston (shadowed by 2nd one) so we should generate
+        // [1, 2]@5 (2, 5]@4 (but where both range actually form a boundary)
+
+        assertTrue(iterator.hasNext());
+
+        Unfiltered first = iterator.next();
+        System.out.println(">> " + first.toString(metadata));
+        assertTrue(first.isRangeTombstoneMarker());
+        RangeTombstoneMarker start = (RangeTombstoneMarker)first;
+        assertTrue(start.isOpen(false));
+        assertFalse(start.isClose(false));
+        assertEquals(1, toInt(start.openBound(false)));
+        assertEquals(5, start.openDeletionTime(false).markedForDeleteAt());
+
+        Unfiltered second = iterator.next();
+        assertTrue(second.isRangeTombstoneMarker());
+        RangeTombstoneMarker middle = (RangeTombstoneMarker)second;
+        assertTrue(middle.isClose(false));
+        assertTrue(middle.isOpen(false));
+        assertEquals(2, toInt(middle.closeBound(false)));
+        assertEquals(2, toInt(middle.openBound(false)));
+        assertEquals(5, middle.closeDeletionTime(false).markedForDeleteAt());
+        assertEquals(4, middle.openDeletionTime(false).markedForDeleteAt());
+
+        Unfiltered third = iterator.next();
+        assertTrue(third.isRangeTombstoneMarker());
+        RangeTombstoneMarker end = (RangeTombstoneMarker)third;
+        assertTrue(end.isClose(false));
+        assertFalse(end.isOpen(false));
+        assertEquals(5, toInt(end.closeBound(false)));
+        assertEquals(4, end.closeDeletionTime(false).markedForDeleteAt());
+
+        assertFalse(iterator.hasNext());
+    }
+
+    private static int toInt(ClusteringPrefix prefix)
+    {
+        assertTrue(prefix.size() == 1);
+        return ByteBufferUtil.toInt(prefix.get(0));
+    }
+
+    private static Supplier<LegacyLayout.LegacyAtom> supplier(LegacyLayout.LegacyAtom... atoms)
+    {
+        return new Supplier<LegacyLayout.LegacyAtom>()
+        {
+            int i = 0;
+
+            public LegacyLayout.LegacyAtom get()
+            {
+                return i >= atoms.length ? null : atoms[i++];
+            }
+        };
+    }
+
+    private static LegacyLayout.LegacyAtom rt(int start, int end, int deletion)
+    {
+        return new LegacyLayout.LegacyRangeTombstone(bound(start, true), bound(end, false), new DeletionTime(deletion, FBUtilities.nowInSeconds()));
+    }
+
+    private static LegacyLayout.LegacyBound bound(int b, boolean isStart)
+    {
+        return new LegacyLayout.LegacyBound(isStart ? Slice.Bound.inclusiveStartOf(ByteBufferUtil.bytes(b)) : Slice.Bound.inclusiveEndOf(ByteBufferUtil.bytes(b)),
+                                            false,
+                                            null);
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java b/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java
new file mode 100644
index 0000000..b567f72
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java

@@ -0,0 +1,535 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.collect.Iterators;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static org.apache.cassandra.db.ConsistencyLevel.ONE;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.cassandra.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class PartitionRangeReadTest
+{
+    public static final String KEYSPACE1 = "PartitionRangeReadTest1";
+    public static final String KEYSPACE2 = "PartitionRangeReadTest2";
+    public static final String CF_STANDARD1 = "Standard1";
+    public static final String CF_STANDARDINT = "StandardInteger1";
+    public static final String CF_COMPACT1 = "Compact1";
+
+    private static final List<InetAddress> LOCAL = Collections.singletonList(FBUtilities.getBroadcastAddress());
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
+                                    SchemaLoader.denseCFMD(KEYSPACE1, CF_STANDARDINT, IntegerType.instance),
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_COMPACT1, false, false, false)
+                                                      .addPartitionKey("key", AsciiType.instance)
+                                                      .addClusteringColumn("column1", AsciiType.instance)
+                                                      .addRegularColumn("value", AsciiType.instance)
+                                                      .addStaticColumn("val", AsciiType.instance)
+                                                      .build());
+        SchemaLoader.createKeyspace(KEYSPACE2,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD1));
+    }
+
+    @Test
+    public void testInclusiveBounds()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1);
+        new RowUpdateBuilder(cfs.metadata, 0, ByteBufferUtil.bytes("key1"))
+                .clustering("cc1")
+                .add("val", "asdf").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, ByteBufferUtil.bytes("key2"))
+                .clustering("cc2")
+                .add("val", "asdf").build().applyUnsafe();
+
+        assertEquals(2, Util.getAll(Util.cmd(cfs).fromIncl("cc1").toIncl("cc2").build()).size());
+    }
+
+    @Test
+    public void testCassandra6778() throws CharacterCodingException
+    {
+        String cfname = CF_STANDARDINT;
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        cfs.truncateBlocking();
+
+        ByteBuffer col = ByteBufferUtil.bytes("val");
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(col);
+
+        // insert two columns that represent the same integer but have different binary forms (the
+        // second one is padded with extra zeros)
+        new RowUpdateBuilder(cfs.metadata, 0, "k1")
+                .clustering(new BigInteger(new byte[]{1}))
+                .add("val", "val1")
+                .build()
+                .applyUnsafe();
+        cfs.forceBlockingFlush();
+
+        new RowUpdateBuilder(cfs.metadata, 1, "k1")
+                .clustering(new BigInteger(new byte[]{0, 0, 1}))
+                .add("val", "val2")
+                .build()
+                .applyUnsafe();
+        cfs.forceBlockingFlush();
+
+        // fetch by the first column name; we should get the second version of the column value
+        Row row = Util.getOnlyRow(Util.cmd(cfs, "k1").includeRow(new BigInteger(new byte[]{1})).build());
+        assertTrue(row.getCell(cDef).value().equals(ByteBufferUtil.bytes("val2")));
+
+        // fetch by the second column name; we should get the second version of the column value
+        row = Util.getOnlyRow(Util.cmd(cfs, "k1").includeRow(new BigInteger(new byte[]{0, 0, 1})).build());
+        assertTrue(row.getCell(cDef).value().equals(ByteBufferUtil.bytes("val2")));
+    }
+
+    @Test
+    public void testLimits()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_COMPACT1);
+        for (int i = 0; i < 10; i++)
+        {
+            new RowUpdateBuilder(cfs.metadata, 0, Integer.toString(i))
+            .add("val", "abcd")
+            .build()
+            .applyUnsafe();
+        }
+
+        assertEquals(10, Util.getAll(Util.cmd(cfs).build()).size());
+
+        for (int i = 0; i < 10; i++)
+            assertEquals(i, Util.getAll(Util.cmd(cfs).withLimit(i).build()).size());
+    }
+
+    @Test
+    public void testRangeSliceInclusionExclusion() throws Throwable
+    {
+        String keyspaceName = KEYSPACE1;
+        String cfName = CF_STANDARD1;
+        Keyspace keyspace = Keyspace.open(keyspaceName);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
+        cfs.clearUnsafe();
+
+        for (int i = 0; i < 10; ++i)
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 10, String.valueOf(i));
+            builder.clustering("c");
+            builder.add("val", String.valueOf(i));
+            builder.build().applyUnsafe();
+        }
+
+        cfs.forceBlockingFlush();
+
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+
+        List<FilteredPartition> partitions;
+
+        // Start and end inclusive
+        partitions = Util.getAll(Util.cmd(cfs).fromKeyIncl("2").toKeyIncl("7").build());
+        assertEquals(6, partitions.size());
+        assertTrue(partitions.get(0).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("2")));
+        assertTrue(partitions.get(partitions.size() - 1).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("7")));
+
+        // Start and end excluded
+        partitions = Util.getAll(Util.cmd(cfs).fromKeyExcl("2").toKeyExcl("7").build());
+        assertEquals(4, partitions.size());
+        assertTrue(partitions.get(0).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("3")));
+        assertTrue(partitions.get(partitions.size() - 1).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("6")));
+
+        // Start excluded, end included
+        partitions = Util.getAll(Util.cmd(cfs).fromKeyExcl("2").toKeyIncl("7").build());
+        assertEquals(5, partitions.size());
+        assertTrue(partitions.get(0).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("3")));
+        assertTrue(partitions.get(partitions.size() - 1).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("7")));
+
+        // Start included, end excluded
+        partitions = Util.getAll(Util.cmd(cfs).fromKeyIncl("2").toKeyExcl("7").build());
+        assertEquals(5, partitions.size());
+        assertTrue(partitions.get(0).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("2")));
+        assertTrue(partitions.get(partitions.size() - 1).iterator().next().getCell(cDef).value().equals(ByteBufferUtil.bytes("6")));
+    }
+
+        // TODO: Port or remove, depending on what DataLimits.thriftLimits (per cell) looks like
+//    @Test
+//    public void testRangeSliceColumnsLimit() throws Throwable
+//    {
+//        String keyspaceName = KEYSPACE1;
+//        String cfName = CF_STANDARD1;
+//        Keyspace keyspace = Keyspace.open(keyspaceName);
+//        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
+//        cfs.clearUnsafe();
+//
+//        Cell[] cols = new Cell[5];
+//        for (int i = 0; i < 5; i++)
+//            cols[i] = column("c" + i, "value", 1);
+//
+//        putColsStandard(cfs, Util.dk("a"), cols[0], cols[1], cols[2], cols[3], cols[4]);
+//        putColsStandard(cfs, Util.dk("b"), cols[0], cols[1]);
+//        putColsStandard(cfs, Util.dk("c"), cols[0], cols[1], cols[2], cols[3]);
+//        cfs.forceBlockingFlush();
+//
+//        SlicePredicate sp = new SlicePredicate();
+//        sp.setSlice_range(new SliceRange());
+//        sp.getSlice_range().setCount(1);
+//        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
+//        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
+//
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              3,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            3);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              5,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            5);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              8,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            8);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              10,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            10);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              100,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            11);
+//
+//        // Check that when querying by name, we always include all names for a
+//        // gien row even if it means returning more columns than requested (this is necesseray for CQL)
+//        sp = new SlicePredicate();
+//        sp.setColumn_names(Arrays.asList(
+//            ByteBufferUtil.bytes("c0"),
+//            ByteBufferUtil.bytes("c1"),
+//            ByteBufferUtil.bytes("c2")
+//        ));
+//
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              1,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            3);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              4,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            5);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              5,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            5);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              6,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            8);
+//        assertTotalColCount(cfs.getRangeSlice(Util.range("", ""),
+//                                              null,
+//                                              ThriftValidation.asIFilter(sp, cfs.metadata, null),
+//                                              100,
+//                                              System.currentTimeMillis(),
+//                                              true,
+//                                              false),
+//                            8);
+//    }
+
+    // TODO: Port or remove, depending on what DataLimits.thriftLimits (per cell) looks like
+//    @Test
+//    public void testRangeSlicePaging() throws Throwable
+//    {
+//        String keyspaceName = KEYSPACE1;
+//        String cfName = CF_STANDARD1;
+//        Keyspace keyspace = Keyspace.open(keyspaceName);
+//        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
+//        cfs.clearUnsafe();
+//
+//        Cell[] cols = new Cell[4];
+//        for (int i = 0; i < 4; i++)
+//            cols[i] = column("c" + i, "value", 1);
+//
+//        DecoratedKey ka = Util.dk("a");
+//        DecoratedKey kb = Util.dk("b");
+//        DecoratedKey kc = Util.dk("c");
+//
+//        PartitionPosition min = Util.rp("");
+//
+//        putColsStandard(cfs, ka, cols[0], cols[1], cols[2], cols[3]);
+//        putColsStandard(cfs, kb, cols[0], cols[1], cols[2]);
+//        putColsStandard(cfs, kc, cols[0], cols[1], cols[2], cols[3]);
+//        cfs.forceBlockingFlush();
+//
+//        SlicePredicate sp = new SlicePredicate();
+//        sp.setSlice_range(new SliceRange());
+//        sp.getSlice_range().setCount(1);
+//        sp.getSlice_range().setStart(ArrayUtils.EMPTY_BYTE_ARRAY);
+//        sp.getSlice_range().setFinish(ArrayUtils.EMPTY_BYTE_ARRAY);
+//
+//        Collection<Row> rows;
+//        Row row, row1, row2;
+//        IDiskAtomFilter filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
+//
+//        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(Util.range("", ""), filter, null, 3, true, true, System.currentTimeMillis()));
+//        assert rows.size() == 1 : "Expected 1 row, got " + toString(rows);
+//        row = rows.iterator().next();
+//        assertColumnNames(row, "c0", "c1", "c2");
+//
+//        sp.getSlice_range().setStart(ByteBufferUtil.getArray(ByteBufferUtil.bytes("c2")));
+//        filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
+//        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<PartitionPosition>(ka, min), filter, null, 3, true, true, System.currentTimeMillis()));
+//        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
+//        Iterator<Row> iter = rows.iterator();
+//        row1 = iter.next();
+//        row2 = iter.next();
+//        assertColumnNames(row1, "c2", "c3");
+//        assertColumnNames(row2, "c0");
+//
+//        sp.getSlice_range().setStart(ByteBufferUtil.getArray(ByteBufferUtil.bytes("c0")));
+//        filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
+//        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<PartitionPosition>(row2.key, min), filter, null, 3, true, true, System.currentTimeMillis()));
+//        assert rows.size() == 1 : "Expected 1 row, got " + toString(rows);
+//        row = rows.iterator().next();
+//        assertColumnNames(row, "c0", "c1", "c2");
+//
+//        sp.getSlice_range().setStart(ByteBufferUtil.getArray(ByteBufferUtil.bytes("c2")));
+//        filter = ThriftValidation.asIFilter(sp, cfs.metadata, null);
+//        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<PartitionPosition>(row.key, min), filter, null, 3, true, true, System.currentTimeMillis()));
+//        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
+//        iter = rows.iterator();
+//        row1 = iter.next();
+//        row2 = iter.next();
+//        assertColumnNames(row1, "c2");
+//        assertColumnNames(row2, "c0", "c1");
+//
+//        // Paging within bounds
+//        SliceQueryFilter sf = new SliceQueryFilter(cellname("c1"),
+//                                                   cellname("c2"),
+//                                                   false,
+//                                                   0);
+//        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<PartitionPosition>(ka, kc), sf, cellname("c2"), cellname("c1"), null, 2, true, System.currentTimeMillis()));
+//        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
+//        iter = rows.iterator();
+//        row1 = iter.next();
+//        row2 = iter.next();
+//        assertColumnNames(row1, "c2");
+//        assertColumnNames(row2, "c1");
+//
+//        rows = cfs.getRangeSlice(cfs.makeExtendedFilter(new Bounds<PartitionPosition>(kb, kc), sf, cellname("c1"), cellname("c1"), null, 10, true, System.currentTimeMillis()));
+//        assert rows.size() == 2 : "Expected 2 rows, got " + toString(rows);
+//        iter = rows.iterator();
+//        row1 = iter.next();
+//        row2 = iter.next();
+//        assertColumnNames(row1, "c1", "c2");
+//        assertColumnNames(row2, "c1");
+//    }
+
+    @Test
+    public void testComputeConcurrencyFactor()
+    {
+        int maxConcurrentRangeRequest = 32;
+
+        // no live row returned, fetch all remaining ranges but hit the max instead
+        int cf = StorageProxy.RangeCommandIterator.computeConcurrencyFactor(100, 30, maxConcurrentRangeRequest, 500, 0);
+        assertEquals(maxConcurrentRangeRequest, cf); // because 100 - 30 = 70 > maxConccurrentRangeRequest
+
+        // no live row returned, fetch all remaining ranges
+        cf = StorageProxy.RangeCommandIterator.computeConcurrencyFactor(100, 80, maxConcurrentRangeRequest, 500, 0);
+        assertEquals(20, cf); // because 100-80 = 20 < maxConccurrentRangeRequest
+
+        // returned half rows, fetch rangesQueried again but hit the max instead
+        cf = StorageProxy.RangeCommandIterator.computeConcurrencyFactor(100, 60, maxConcurrentRangeRequest, 480, 240);
+        assertEquals(maxConcurrentRangeRequest, cf); // because 60 > maxConccurrentRangeRequest
+
+        // returned half rows, fetch rangesQueried again
+        cf = StorageProxy.RangeCommandIterator.computeConcurrencyFactor(100, 30, maxConcurrentRangeRequest, 480, 240);
+        assertEquals(30, cf); // because 30 < maxConccurrentRangeRequest
+
+        // returned most of rows, 1 more range to fetch
+        cf = StorageProxy.RangeCommandIterator.computeConcurrencyFactor(100, 1, maxConcurrentRangeRequest, 480, 479);
+        assertEquals(1, cf); // because 1 < maxConccurrentRangeRequest
+    }
+
+    @Test
+    public void testRangeCountWithRangeMerge()
+    {
+        List<Token> tokens = setTokens(Arrays.asList(100, 200, 300, 400));
+        int vnodeCount = 0;
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        List<StorageProxy.RangeForQuery> ranges = new ArrayList<>();
+        for (int i = 0; i + 1 < tokens.size(); i++)
+        {
+            Range<PartitionPosition> range = Range.makeRowRange(tokens.get(i), tokens.get(i + 1));
+            ranges.add(new StorageProxy.RangeForQuery(range, LOCAL, LOCAL, 1));
+            vnodeCount++;
+        }
+
+        StorageProxy.RangeMerger merge = new StorageProxy.RangeMerger(ranges.iterator(), keyspace, ONE);
+        StorageProxy.RangeForQuery mergedRange = Iterators.getOnlyElement(merge);
+        // all ranges are merged as test has only one node.
+        assertEquals(vnodeCount, mergedRange.vnodeCount());
+    }
+
+    @Test
+    public void testRangeQueried()
+    {
+        List<Token> tokens = setTokens(Arrays.asList(100, 200, 300, 400));
+        int vnodeCount = tokens.size() + 1; // n tokens divide token ring into n+1 ranges
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        cfs.clearUnsafe();
+
+        int rows = 100;
+        for (int i = 0; i < rows; ++i)
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 10, String.valueOf(i));
+            builder.clustering("c");
+            builder.add("val", String.valueOf(i));
+            builder.build().applyUnsafe();
+        }
+        cfs.forceBlockingFlush();
+
+        PartitionRangeReadCommand command = (PartitionRangeReadCommand) Util.cmd(cfs).build();
+
+        // without range merger, there will be 2 batches requested: 1st batch with 1 range and 2nd batch with remaining ranges
+        Iterator<StorageProxy.RangeForQuery> ranges = rangeIterator(command, keyspace, false);
+        StorageProxy.RangeCommandIterator data = new StorageProxy.RangeCommandIterator(ranges, command, 1, 1000, vnodeCount, keyspace, ONE);
+        verifyRangeCommandIterator(data, rows, 2, vnodeCount);
+
+        // without range merger and initial cf=5, there will be 1 batches requested: 5 vnode ranges for 1st batch
+        ranges = rangeIterator(command, keyspace, false);
+        data = new StorageProxy.RangeCommandIterator(ranges, command, vnodeCount, 1000, vnodeCount, keyspace, ONE);
+        verifyRangeCommandIterator(data, rows, 1, vnodeCount);
+
+        // without range merger and max cf=1, there will be 5 batches requested: 1 vnode range per batch
+        ranges = rangeIterator(command, keyspace, false);
+        data = new StorageProxy.RangeCommandIterator(ranges, command, 1, 1, vnodeCount, keyspace, ONE);
+        verifyRangeCommandIterator(data, rows, vnodeCount, vnodeCount);
+
+        // with range merger, there will be only 1 batch requested, as all ranges share the same replica - localhost
+        ranges = rangeIterator(command, keyspace, true);
+        data = new StorageProxy.RangeCommandIterator(ranges, command, 1, 1000, vnodeCount, keyspace, ONE);
+        verifyRangeCommandIterator(data, rows, 1, vnodeCount);
+
+        // with range merger and max cf=1, there will be only 1 batch requested, as all ranges share the same replica - localhost
+        ranges = rangeIterator(command, keyspace, true);
+        data = new StorageProxy.RangeCommandIterator(ranges, command, 1, 1, vnodeCount, keyspace, ONE);
+        verifyRangeCommandIterator(data, rows, 1, vnodeCount);
+    }
+
+    private Iterator<StorageProxy.RangeForQuery> rangeIterator(PartitionRangeReadCommand command, Keyspace keyspace, boolean withRangeMerger)
+    {
+        Iterator<StorageProxy.RangeForQuery> ranges = new StorageProxy.RangeIterator(command, keyspace, ONE);
+        if (withRangeMerger)
+            ranges = new StorageProxy.RangeMerger(ranges, keyspace, ONE);
+
+        return  ranges;
+    }
+
+    private void verifyRangeCommandIterator(StorageProxy.RangeCommandIterator data, int rows, int batches, int vnodeCount)
+    {
+        int num = Util.size(data);
+        assertEquals(rows, num);
+        assertEquals(batches, data.batchesRequested());
+        assertEquals(vnodeCount, data.rangesQueried());
+    }
+
+    private List<Token> setTokens(List<Integer> values)
+    {
+        IPartitioner partitioner = DatabaseDescriptor.getPartitioner();
+        List<Token> tokens = new ArrayList<>(values.size());
+        for (Integer val : values)
+            tokens.add(partitioner.getToken(ByteBufferUtil.bytes(val)));
+
+        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
+        tmd.clearUnsafe();
+        tmd.updateNormalTokens(tokens, FBUtilities.getBroadcastAddress());
+
+        return tokens;
+    }
+}
+

diff --git a/test/unit/org/apache/cassandra/db/PartitionTest.java b/test/unit/org/apache/cassandra/db/PartitionTest.java
new file mode 100644
index 0000000..7216ab7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/PartitionTest.java

@@ -0,0 +1,192 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.assertTrue;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+public class PartitionTest
+{
+    private static final String KEYSPACE1 = "Keyspace1";
+    private static final String CF_STANDARD1 = "Standard1";
+    private static final String CF_TENCOL = "TenColumns";
+    private static final String CF_COUNTER1 = "Counter1";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_TENCOL, 10, AsciiType.instance),
+                                    SchemaLoader.denseCFMD(KEYSPACE1, CF_COUNTER1, BytesType.instance));
+    }
+
+    @Test
+    public void testSingleColumn() throws IOException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        PartitionUpdate update = new RowUpdateBuilder(cfs.metadata, 5, "key1")
+                                 .clustering("c")
+                                 .add("val", "val1")
+                                 .buildUpdate();
+
+        CachedBTreePartition partition = CachedBTreePartition.create(update.unfilteredIterator(), FBUtilities.nowInSeconds());
+
+        DataOutputBuffer bufOut = new DataOutputBuffer();
+        CachedPartition.cacheSerializer.serialize(partition, bufOut);
+
+        CachedPartition deserialized = CachedPartition.cacheSerializer.deserialize(new DataInputBuffer(bufOut.getData()));
+
+        assert deserialized != null;
+        assert deserialized.metadata().cfName.equals(CF_STANDARD1);
+        assert deserialized.partitionKey().equals(partition.partitionKey());
+    }
+
+    @Test
+    public void testManyColumns() throws IOException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_TENCOL);
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 5, "key1")
+                                   .clustering("c")
+                                   .add("val", "val1");
+
+        for (int i = 0; i < 10; i++)
+            builder.add("val" + i, "val" + i);
+
+        PartitionUpdate update = builder.buildUpdate();
+
+        CachedBTreePartition partition = CachedBTreePartition.create(update.unfilteredIterator(), FBUtilities.nowInSeconds());
+
+        DataOutputBuffer bufOut = new DataOutputBuffer();
+        CachedPartition.cacheSerializer.serialize(partition, bufOut);
+
+        CachedPartition deserialized = CachedPartition.cacheSerializer.deserialize(new DataInputBuffer(bufOut.getData()));
+
+        assertEquals(partition.columns().regulars.size(), deserialized.columns().regulars.size());
+        assertTrue(deserialized.columns().regulars.getSimple(1).equals(partition.columns().regulars.getSimple(1)));
+        assertTrue(deserialized.columns().regulars.getSimple(5).equals(partition.columns().regulars.getSimple(5)));
+
+        ColumnDefinition cDef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val8"));
+        assertTrue(partition.lastRow().getCell(cDef).value().equals(deserialized.lastRow().getCell(cDef).value()));
+        assert deserialized.partitionKey().equals(partition.partitionKey());
+    }
+
+    @Test
+    public void testDigest() throws NoSuchAlgorithmException
+    {
+        testDigest(MessagingService.current_version);
+    }
+
+    @Test
+    public void testLegacyDigest() throws NoSuchAlgorithmException
+    {
+        testDigest(MessagingService.VERSION_22);
+    }
+
+    public void testDigest(int version) throws NoSuchAlgorithmException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_TENCOL);
+
+        try
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 5, "key1").clustering("c").add("val", "val1");
+            for (int i = 0; i < 10; i++)
+                builder.add("val" + i, "val" + i);
+            builder.build().applyUnsafe();
+
+            new RowUpdateBuilder(cfs.metadata, 5, "key2").clustering("c").add("val", "val2").build().applyUnsafe();
+
+            ReadCommand cmd1 = Util.cmd(cfs, "key1").build();
+            ReadCommand cmd2 = Util.cmd(cfs, "key2").build();
+            ImmutableBTreePartition p1 = Util.getOnlyPartitionUnfiltered(cmd1);
+            ImmutableBTreePartition p2 = Util.getOnlyPartitionUnfiltered(cmd2);
+
+            MessageDigest digest1 = MessageDigest.getInstance("MD5");
+            MessageDigest digest2 = MessageDigest.getInstance("MD5");
+            UnfilteredRowIterators.digest(cmd1, p1.unfilteredIterator(), digest1, version);
+            UnfilteredRowIterators.digest(cmd2, p2.unfilteredIterator(), digest2, version);
+            assertFalse(Arrays.equals(digest1.digest(), digest2.digest()));
+
+            p1 = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, "key2").build());
+            p2 = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, "key2").build());
+            digest1 = MessageDigest.getInstance("MD5");
+            digest2 = MessageDigest.getInstance("MD5");
+            UnfilteredRowIterators.digest(cmd1, p1.unfilteredIterator(), digest1, version);
+            UnfilteredRowIterators.digest(cmd2, p2.unfilteredIterator(), digest2, version);
+            assertTrue(Arrays.equals(digest1.digest(), digest2.digest()));
+
+            p1 = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, "key2").build());
+            RowUpdateBuilder.deleteRow(cfs.metadata, 6, "key2", "c").applyUnsafe();
+            p2 = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, "key2").build());
+            digest1 = MessageDigest.getInstance("MD5");
+            digest2 = MessageDigest.getInstance("MD5");
+            UnfilteredRowIterators.digest(cmd1, p1.unfilteredIterator(), digest1, version);
+            UnfilteredRowIterators.digest(cmd2, p2.unfilteredIterator(), digest2, version);
+            assertFalse(Arrays.equals(digest1.digest(), digest2.digest()));
+        }
+        finally
+        {
+            cfs.truncateBlocking();
+        }
+    }
+
+    @Test
+    public void testColumnStatsRecordsRowDeletesCorrectly()
+    {
+        long timestamp = System.currentTimeMillis();
+        int localDeletionTime = (int) (timestamp / 1000);
+
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_TENCOL);
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 5, "key1").clustering("c").add("val", "val1");
+        for (int i = 0; i < 10; i++)
+            builder.add("val" + i, "val" + i);
+        builder.build().applyUnsafe();
+
+        RowUpdateBuilder.deleteRowAt(cfs.metadata, 10L, localDeletionTime, "key1", "c").applyUnsafe();
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, "key1").build());
+        EncodingStats stats = partition.stats();
+        assertEquals(localDeletionTime, stats.minLocalDeletionTime);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java
index 7dc7300..f40abe9 100644
--- a/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java
+++ b/test/unit/org/apache/cassandra/db/RangeTombstoneListTest.java

@@ -18,190 +18,22 @@
 */
 package org.apache.cassandra.db;
 
+import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import com.google.common.base.Joiner;
 
 import org.junit.Test;
 import static org.junit.Assert.*;
 
-import org.apache.cassandra.Util;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class RangeTombstoneListTest
 {
-    private static final Comparator<Composite> cmp = new SimpleDenseCellNameType(IntegerType.instance);
-
-    @Test
-    public void testDiff()
-    {
-        RangeTombstoneList superset;
-        RangeTombstoneList subset;
-        RangeTombstoneList diff;
-        Iterator<RangeTombstone> iter;
-
-        // no difference
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        assertNull( subset.diff(superset));
-
-        // all items in subset are contained by the first range in the superset
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        subset.add(rt(1, 2, 3));
-        subset.add(rt(3, 4, 4));
-        subset.add(rt(5, 6, 5));
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(1, 10, 10), iter.next());
-        assertRT(rt(20, 30, 10), iter.next());
-        assertRT(rt(40, 50, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // multiple subset RTs are contained by superset RTs
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        subset.add(rt(1, 2, 1));
-        subset.add(rt(3, 4, 2));
-        subset.add(rt(5, 6, 3));
-        superset.add(rt(1, 5, 2));
-        superset.add(rt(5, 6, 3));
-        superset.add(rt(6, 10, 2));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(1, 5, 2), iter.next());
-        assertRT(rt(6, 10, 2), iter.next());
-        assertFalse(iter.hasNext());
-
-        // the superset has one RT that covers the entire subset
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(1, 50, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // the superset has one RT that covers the remainder of the subset
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(20, 50, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // only the timestamp differs on one RT
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 20));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(20, 30, 20), iter.next());
-        assertFalse(iter.hasNext());
-
-        // superset has a large range on an RT at the start
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(1, 2, 3));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(1, 10, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // superset has a larger range on an RT in the middle
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 25, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(20, 30, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // superset has a larger range on an RT at the end
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 55, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(40, 55, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-         // superset has one additional RT in the middle
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(20, 30, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // superset has one additional RT at the start
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(20, 30, 10));
-        subset.add(rt(40, 50, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(1, 10, 10), iter.next());
-        assertFalse(iter.hasNext());
-
-        // superset has one additional RT at the end
-        superset = new RangeTombstoneList(cmp, 10);
-        subset = new RangeTombstoneList(cmp, 10);
-        superset.add(rt(1, 10, 10));
-        superset.add(rt(20, 30, 10));
-        superset.add(rt(40, 50, 10));
-        subset.add(rt(1, 10, 10));
-        subset.add(rt(20, 30, 10));
-        diff = subset.diff(superset);
-        iter = diff.iterator();
-        assertRT(rt(40, 50, 10), iter.next());
-        assertFalse(iter.hasNext());
-    }
+    private static final ClusteringComparator cmp = new ClusteringComparator(Int32Type.instance);
 
     @Test
     public void sortedAdditionTest()
@@ -224,7 +56,7 @@
         Iterator<RangeTombstone> iter = l.iterator();
         assertRT(rt1, iter.next());
         assertRT(rt2, iter.next());
-        assertRT(rt3, iter.next());
+        assertRT(rtei(10, 13, 1), iter.next());
 
         assert !iter.hasNext();
     }
@@ -250,7 +82,7 @@
         Iterator<RangeTombstone> iter = l.iterator();
         assertRT(rt1, iter.next());
         assertRT(rt2, iter.next());
-        assertRT(rt3, iter.next());
+        assertRT(rtei(10, 13, 1), iter.next());
 
         assert !iter.hasNext();
     }
@@ -272,18 +104,18 @@
         l.add(rt(0, 15, 1));
 
         Iterator<RangeTombstone> iter = l.iterator();
-        assertRT(rt(0, 1, 1), iter.next());
-        assertRT(rt(1, 4, 2), iter.next());
-        assertRT(rt(4, 8, 3), iter.next());
+        assertRT(rtie(0, 1, 1), iter.next());
+        assertRT(rtie(1, 4, 2), iter.next());
+        assertRT(rtie(4, 8, 3), iter.next());
         assertRT(rt(8, 13, 4), iter.next());
-        assertRT(rt(13, 15, 1), iter.next());
+        assertRT(rtei(13, 15, 1), iter.next());
         assert !iter.hasNext();
 
         RangeTombstoneList l2 = new RangeTombstoneList(cmp, initialCapacity);
         l2.add(rt(4, 10, 12L));
         l2.add(rt(0, 8, 25L));
 
-        assertEquals(25L, l2.searchDeletionTime(b(8)).markedForDeleteAt);
+        assertEquals(25L, l2.searchDeletionTime(clustering(8)).markedForDeleteAt());
     }
 
     @Test
@@ -307,9 +139,9 @@
         l1.add(rt(3, 7, 5));
 
         Iterator<RangeTombstone> iter1 = l1.iterator();
-        assertRT(rt(0, 3, 3), iter1.next());
+        assertRT(rtie(0, 3, 3), iter1.next());
         assertRT(rt(3, 7, 5), iter1.next());
-        assertRT(rt(7, 10, 3), iter1.next());
+        assertRT(rtei(7, 10, 3), iter1.next());
         assert !iter1.hasNext();
 
         RangeTombstoneList l2 = new RangeTombstoneList(cmp, 0);
@@ -330,9 +162,9 @@
         l.add(rt(1, 4, 2));
         l.add(rt(4, 10, 5));
 
-        assertEquals(2, l.searchDeletionTime(b(3)).markedForDeleteAt);
-        assertEquals(5, l.searchDeletionTime(b(4)).markedForDeleteAt);
-        assertEquals(5, l.searchDeletionTime(b(8)).markedForDeleteAt);
+        assertEquals(2, l.searchDeletionTime(clustering(3)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(4)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(8)).markedForDeleteAt());
         assertEquals(3, l.size());
     }
 
@@ -346,20 +178,20 @@
         l.add(rt(14, 15, 3));
         l.add(rt(15, 17, 6));
 
-        assertEquals(null, l.searchDeletionTime(b(-1)));
+        assertEquals(null, l.searchDeletionTime(clustering(-1)));
 
-        assertEquals(5, l.searchDeletionTime(b(0)).markedForDeleteAt);
-        assertEquals(5, l.searchDeletionTime(b(3)).markedForDeleteAt);
-        assertEquals(5, l.searchDeletionTime(b(4)).markedForDeleteAt);
+        assertEquals(5, l.searchDeletionTime(clustering(0)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(3)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(4)).markedForDeleteAt());
 
-        assertEquals(2, l.searchDeletionTime(b(5)).markedForDeleteAt);
+        assertEquals(2, l.searchDeletionTime(clustering(5)).markedForDeleteAt());
 
-        assertEquals(null, l.searchDeletionTime(b(7)));
+        assertEquals(null, l.searchDeletionTime(clustering(7)));
 
-        assertEquals(3, l.searchDeletionTime(b(14)).markedForDeleteAt);
+        assertEquals(3, l.searchDeletionTime(clustering(14)).markedForDeleteAt());
 
-        assertEquals(6, l.searchDeletionTime(b(15)).markedForDeleteAt);
-        assertEquals(null, l.searchDeletionTime(b(18)));
+        assertEquals(6, l.searchDeletionTime(clustering(15)).markedForDeleteAt());
+        assertEquals(null, l.searchDeletionTime(clustering(18)));
     }
 
     @Test
@@ -379,13 +211,12 @@
         l1.addAll(l2);
 
         Iterator<RangeTombstone> iter = l1.iterator();
-        assertRT(rt(0, 3, 5), iter.next());
-        assertRT(rt(3, 4, 7), iter.next());
-        assertRT(rt(4, 5, 7), iter.next());
-        assertRT(rt(6, 7, 2), iter.next());
+        assertRT(rtie(0, 3, 5), iter.next());
+        assertRT(rt(3, 5, 7), iter.next());
+        assertRT(rtie(6, 7, 2), iter.next());
         assertRT(rt(7, 8, 3), iter.next());
-        assertRT(rt(8, 10, 2), iter.next());
-        assertRT(rt(10, 12, 1), iter.next());
+        assertRT(rtei(8, 10, 2), iter.next());
+        assertRT(rtei(10, 12, 1), iter.next());
         assertRT(rt(14, 17, 4), iter.next());
 
         assert !iter.hasNext();
@@ -403,7 +234,7 @@
         l1.addAll(l2);
 
         Iterator<RangeTombstone> iter = l1.iterator();
-        assertRT(rt(3, 5, 2), iter.next());
+        assertRT(rtie(3, 5, 2), iter.next());
         assertRT(rt(5, 7, 7), iter.next());
 
         assert !iter.hasNext();
@@ -429,52 +260,12 @@
     }
 
     @Test
-    public void purgetTest()
+    public void addAllBugFrom9799()
     {
-        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
-        l.add(rt(0, 4, 5, 110));
-        l.add(rt(4, 6, 2, 98));
-        l.add(rt(9, 12, 1, 200));
-        l.add(rt(14, 15, 3, 3));
-        l.add(rt(15, 17, 6, 45));
-
-        l.purge(100);
-
-        Iterator<RangeTombstone> iter = l.iterator();
-        assertRT(rt(0, 4, 5, 110), iter.next());
-        assertRT(rt(9, 12, 1, 200), iter.next());
-
-        assert !iter.hasNext();
-    }
-
-    @Test
-    public void minMaxTest()
-    {
-        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
-        l.add(rt(0, 4, 5, 110));
-        l.add(rt(4, 6, 2, 98));
-        l.add(rt(9, 12, 1, 200));
-        l.add(rt(14, 15, 3, 3));
-        l.add(rt(15, 17, 6, 45));
-
-        assertEquals(1, l.minMarkedAt());
-        assertEquals(6, l.maxMarkedAt());
-    }
-
-    @Test
-    public void insertSameTest()
-    {
-        // Simple test that adding the same element multiple time ends up
-        // with that element only a single time (CASSANDRA-9485)
-
-        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
-        l.add(rt(4, 4, 5, 100));
-        l.add(rt(4, 4, 6, 110));
-        l.add(rt(4, 4, 4, 90));
-
-        Iterator<RangeTombstone> iter = l.iterator();
-        assertRT(rt(4, 4, 6, 110), iter.next());
-        assert !iter.hasNext();
+        RangeTombstoneList l1 = fromString("{ (6, 7]@4 - (7, 8)@1 - [12, 12]@0 - [13, 13]@0 - (20, 21)@3 - [27, 27]@2 - (33, 34)@2 - (35, 36]@4 - (40, 41]@0 - (42, 43)@2 - (44, 45)@3 - [47, 47]@1 - (47, 48)@0 - [55, 55]@4 - [61, 61]@4 - [67, 67]@0 - [70, 70]@4 - [77, 77]@1 - (83, 84)@1 - [90, 90]@0 - (91, 92]@4 - [93, 93]@0 - (94, 95)@2 - (100, 101]@3 - (103, 104]@0 - (108, 109]@2 - (115, 116]@3 - (116, 117]@3 - (118, 119)@4 - (125, 126)@2 - [131, 131]@1 - [132, 132]@3 - [139, 139]@0 - [145, 145]@1 - (145, 146]@3 - (147, 148]@4 - (150, 151]@1 - (156, 157)@2 - (158, 159)@2 - [164, 164]@4 - (168, 169)@0 - (171, 172)@4 - (173, 174]@0 - [179, 179]@1 - (186, 187]@4 - [191, 191]@1 }");
+        RangeTombstoneList l2 = fromString("{ (1, 12)@8 - [12, 13)@8 - [13, 18]@7 }");
+        l1.addAll(l2);
+        assertValid(l1);
     }
 
     private RangeTombstoneList makeRandom(Random rand, int size, int maxItSize, int maxItDistance, int maxMarkedAt)
@@ -483,19 +274,35 @@
 
         int prevStart = -1;
         int prevEnd = 0;
+        boolean prevStartInclusive = false;
+        boolean prevEndInclusive = false;
         for (int i = 0; i < size; i++)
         {
             int nextStart = prevEnd + rand.nextInt(maxItDistance);
             int nextEnd = nextStart + rand.nextInt(maxItSize);
 
-            // We can have an interval [x, x], but not 2 consecutives ones for the same x
-            if (nextEnd == nextStart && prevEnd == prevStart && prevEnd == nextStart)
-                nextEnd += 1 + rand.nextInt(maxItDistance);
+            boolean startInclusive = rand.nextBoolean();
+            boolean endInclusive = rand.nextBoolean();
 
-            l.add(rt(nextStart, nextEnd, rand.nextInt(maxMarkedAt)));
+            // Now make sure we create meaningful ranges
+
+            if (prevEnd == nextStart)
+                startInclusive = !prevEndInclusive;
+
+            if (nextStart == nextEnd)
+            {
+                if (startInclusive)
+                    endInclusive = true;
+                else
+                    nextEnd += 1;
+            }
+
+            l.add(rt(nextStart, startInclusive, nextEnd, endInclusive, rand.nextInt(maxMarkedAt)));
 
             prevStart = nextStart;
             prevEnd = nextEnd;
+            prevStartInclusive = startInclusive;
+            prevEndInclusive = endInclusive;
         }
         return l;
     }
@@ -537,55 +344,273 @@
         }
     }
 
+    @Test
+    public void nonSortedAdditionTestWithOneTombstoneWithEmptyEnd()
+    {
+        nonSortedAdditionTestWithOneRangeWithEmptyEnd(0);
+        nonSortedAdditionTestWithOneRangeWithEmptyEnd(10);
+    }
+
+  private static void nonSortedAdditionTestWithOneRangeWithEmptyEnd(int initialCapacity)
+    {
+        RangeTombstoneList l = new RangeTombstoneList(cmp, initialCapacity);
+        RangeTombstone rt1 = rt(1, 5, 3);
+        RangeTombstone rt2 = rt(7, 10, 2);
+        RangeTombstone rt3 = atLeast(11, 1, 0);
+
+        l.add(rt2);
+        l.add(rt3);
+        l.add(rt1);
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(rt1, iter.next());
+        assertRT(rt2, iter.next());
+        assertRT(rt3, iter.next());
+
+        assert !iter.hasNext();
+    }
+
+    @Test
+    public void addRangeWithEmptyEndWitchIncludeExistingRange()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(rt(4, 10, 3));
+        l.add(atLeast(3, 4, 0));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(atLeast(3, 4, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addRangeWithEmptyStartAndEnd()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(rt(4, 10, 3));
+        l.add(atMost(12, 4, 0));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(atMost(12, 4, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addRangeWithEmptyEndToRangeWithEmptyStartAndEnd()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(new RangeTombstone(Slice.ALL, new DeletionTime(2, 0)));
+        l.add(atLeast(12, 4, 0));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(lessThan(12, 2, 0), iter.next());
+        assertRT(atLeast(12, 4, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addRangeWithEmptyEndWitchIncludeExistingRangeWithEmptyEnd()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(atLeast(5, 3, 0));
+        l.add(atLeast(3, 4, 0));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(atLeast(3, 4, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addIncludedRangeToRangeWithEmptyEnd()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(atLeast(3, 3, 0));
+        l.add(rt(4, 10, 4));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(rtie(3, 4, 3), iter.next());
+        assertRT(rt(4, 10, 4), iter.next());
+        assertRT(greaterThan(10, 3, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addIncludedRangeWithEmptyEndToRangeWithEmptyEnd()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(atLeast(3, 3, 0));
+        l.add(atLeast(5, 4, 0));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(rtie(3, 5, 3), iter.next());
+        assertRT(atLeast(5, 4, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addRangeWithEmptyEndWitchOverlapExistingRange()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(rt(4, 10, 3));
+        l.add(atLeast(6, 4, 0));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(rtie(4, 6, 3), iter.next());
+        assertRT(atLeast(6, 4, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void addOverlappingRangeToRangeWithEmptyEnd()
+    {
+
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+
+        l.add(atLeast(3, 3, 0));
+        l.add(rt(1, 10, 4));
+
+        Iterator<RangeTombstone> iter = l.iterator();
+        assertRT(rt(1, 10, 4), iter.next());
+        assertRT(greaterThan(10, 3, 0), iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void searchTestWithEmptyStart()
+    {
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+        l.add(atMost(4, 5, 0));
+        l.add(rt(4, 6, 2));
+        l.add(rt(9, 12, 1));
+        l.add(rt(14, 15, 3));
+        l.add(rt(15, 17, 6));
+
+        assertEquals(5, l.searchDeletionTime(clustering(-1)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(0)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(3)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(4)).markedForDeleteAt());
+
+        assertEquals(2, l.searchDeletionTime(clustering(5)).markedForDeleteAt());
+
+        assertEquals(null, l.searchDeletionTime(clustering(7)));
+
+        assertEquals(3, l.searchDeletionTime(clustering(14)).markedForDeleteAt());
+
+        assertEquals(6, l.searchDeletionTime(clustering(15)).markedForDeleteAt());
+        assertEquals(null, l.searchDeletionTime(clustering(18)));
+    }
+
+    @Test
+    public void searchTestWithRangeWithEmptyEnd()
+    {
+        RangeTombstoneList l = new RangeTombstoneList(cmp, 0);
+        l.add(rt(0, 4, 5));
+        l.add(rt(4, 6, 2));
+        l.add(rt(9, 12, 1));
+        l.add(rt(14, 15, 3));
+        l.add(atLeast(15, 6, 0));
+
+        assertEquals(null, l.searchDeletionTime(clustering(-1)));
+
+        assertEquals(5, l.searchDeletionTime(clustering(0)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(3)).markedForDeleteAt());
+        assertEquals(5, l.searchDeletionTime(clustering(4)).markedForDeleteAt());
+
+        assertEquals(2, l.searchDeletionTime(clustering(5)).markedForDeleteAt());
+
+        assertEquals(null, l.searchDeletionTime(clustering(7)));
+
+        assertEquals(3, l.searchDeletionTime(clustering(14)).markedForDeleteAt());
+
+        assertEquals(6, l.searchDeletionTime(clustering(15)).markedForDeleteAt());
+        assertEquals(6, l.searchDeletionTime(clustering(1000)).markedForDeleteAt());
+    }
+
     private static void assertRT(RangeTombstone expected, RangeTombstone actual)
     {
-        assertEquals(String.format("Expected %s but got %s", toString(expected), toString(actual)), expected, actual);
+        assertTrue(String.format("%s != %s", toString(expected), toString(actual)), cmp.compare(expected.deletedSlice().start(), actual.deletedSlice().start()) == 0);
+        assertTrue(String.format("%s != %s", toString(expected), toString(actual)), cmp.compare(expected.deletedSlice().end(), actual.deletedSlice().end()) == 0);
+        assertEquals(String.format("%s != %s", toString(expected), toString(actual)), expected.deletionTime(), actual.deletionTime());
     }
 
     private static void assertValid(RangeTombstoneList l)
     {
-        // We check that ranges are in the right order and that we never have something
-        // like ...[x, x][x, x] ...
-        int prevStart = -2;
-        int prevEnd = -1;
-        for (RangeTombstone rt : l)
+        if (l.isEmpty())
+            return;
+
+        // We check that ranges are in the right order and non overlapping
+        Iterator<RangeTombstone> iter = l.iterator();
+        Slice prev = iter.next().deletedSlice();
+        assertFalse("Invalid empty slice " + prev.toString(cmp), prev.isEmpty(cmp));
+
+        while (iter.hasNext())
         {
-            int curStart = i(rt.min);
-            int curEnd = i(rt.max);
+            Slice curr = iter.next().deletedSlice();
 
-            assertTrue("Invalid " + toString(l), prevEnd <= curStart);
-            assertTrue("Invalid " + toString(l), curStart <= curEnd);
-
-            if (curStart == curEnd && prevEnd == curStart)
-                assertTrue("Invalid " + toString(l), prevStart != prevEnd);
-
-            prevStart = curStart;
-            prevEnd = curEnd;
+            assertFalse("Invalid empty slice " + curr.toString(cmp), curr.isEmpty(cmp));
+            assertTrue("Slice not in order or overlapping : " + prev.toString(cmp) + curr.toString(cmp), cmp.compare(prev.end(), curr.start()) <= 0);
         }
     }
 
     private static String toString(RangeTombstone rt)
     {
-        return String.format("[%d, %d]@%d", i(rt.min), i(rt.max), rt.data.markedForDeleteAt);
+        return String.format("%s@%d", rt.deletedSlice().toString(cmp), rt.deletionTime().markedForDeleteAt());
     }
 
     private static String toString(RangeTombstoneList l)
     {
-        StringBuilder sb = new StringBuilder();
-        sb.append("{");
+        String[] ranges = new String[l.size()];
+        int i = 0;
         for (RangeTombstone rt : l)
-            sb.append(" ").append(toString(rt));
-        return sb.append(" }").toString();
+            ranges[i++] = toString(rt);
+
+        return "{ " + Joiner.on(" - ").join(ranges) + " }";
     }
 
-    private static Composite b(int i)
+    private static RangeTombstone rangeFromString(String range)
     {
-        return Util.cellname(i);
+        Matcher matcher = Pattern.compile("([\\[(])(\\d+), (\\d+)([)\\]])@(\\d+)").matcher(range.trim());
+        matcher.matches();
+        boolean isOpenInclusive = matcher.group(1).equals("[");
+        int start = Integer.valueOf(matcher.group(2));
+        int end = Integer.valueOf(matcher.group(3));
+        boolean isCloseInclusive = matcher.group(4).equals("]");
+        long timestamp = Long.valueOf(matcher.group(5));
+        return rt(start, isOpenInclusive, end, isCloseInclusive, timestamp);
     }
 
-    private static int i(Composite c)
+    private static RangeTombstoneList fromString(String str)
     {
-        return ByteBufferUtil.toInt(c.toByteBuffer());
+        str = str.trim();
+        String[] ranges = str.substring(1, str.length() - 1).split("-", 0);
+        RangeTombstoneList l = new RangeTombstoneList(cmp, ranges.length);
+        for (String range : ranges)
+            l.add(rangeFromString(range));
+        return l;
+    }
+
+
+    private static Clustering clustering(int i)
+    {
+        return new Clustering(bb(i));
+    }
+
+    private static ByteBuffer bb(int i)
+    {
+        return ByteBufferUtil.bytes(i);
     }
 
     private static RangeTombstone rt(int start, int end, long tstamp)
@@ -593,8 +618,53 @@
         return rt(start, end, tstamp, 0);
     }
 
+    private static RangeTombstone rt(int start, boolean startInclusive, int end, boolean endInclusive, long tstamp)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.create(cmp, true, startInclusive, start), Slice.Bound.create(cmp, false, endInclusive, end)), new DeletionTime(tstamp, 0));
+    }
+
     private static RangeTombstone rt(int start, int end, long tstamp, int delTime)
     {
-        return new RangeTombstone(b(start), b(end), tstamp, delTime);
+        return new RangeTombstone(Slice.make(Slice.Bound.inclusiveStartOf(bb(start)), Slice.Bound.inclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone rtei(int start, int end, long tstamp)
+    {
+        return rtei(start, end, tstamp, 0);
+    }
+
+    private static RangeTombstone rtei(int start, int end, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.exclusiveStartOf(bb(start)), Slice.Bound.inclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone rtie(int start, int end, long tstamp)
+    {
+        return rtie(start, end, tstamp, 0);
+    }
+
+    private static RangeTombstone rtie(int start, int end, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.inclusiveStartOf(bb(start)), Slice.Bound.exclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone atLeast(int start, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.inclusiveStartOf(bb(start)), Slice.Bound.TOP), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone atMost(int end, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.BOTTOM, Slice.Bound.inclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone lessThan(int end, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.BOTTOM, Slice.Bound.exclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone greaterThan(int start, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.exclusiveStartOf(bb(start)), Slice.Bound.TOP), new DeletionTime(tstamp, delTime));
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
index bff0ddf..967a85c 100644
--- a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
+++ b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java

@@ -19,49 +19,35 @@
 package org.apache.cassandra.db;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
+import java.util.*;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Iterators;
-
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.*;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.statements.IndexTarget;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
-import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.db.index.PerColumnSecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.index.SecondaryIndexSearcher;
+import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.index.StubIndex;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.FBUtilities;
 
-import static org.apache.cassandra.Util.dk;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -76,9 +62,13 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KSNAME,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KSNAME, CFNAME, IntegerType.instance));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KSNAME,
+                                                              CFNAME,
+                                                              1,
+                                                              UTF8Type.instance,
+                                                              Int32Type.instance,
+                                                              Int32Type.instance));
     }
 
     @Test
@@ -86,58 +76,58 @@
     {
         Keyspace keyspace = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CFNAME);
+        boolean enforceStrictLiveness = cfs.metadata.enforceStrictLiveness();
 
         // Inserting data
         String key = "k1";
-        Mutation rm;
-        ColumnFamily cf;
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        UpdateBuilder builder;
+
+        builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
         for (int i = 0; i < 40; i += 2)
-            add(rm, i, 0);
-        rm.applyUnsafe();
+            builder.newRow(i).add("val", i);
+        builder.applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 10, 22, 1);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(10, 22).build().applyUnsafe();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(2);
         for (int i = 1; i < 40; i += 2)
-            add(rm, i, 2);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
+            builder.newRow(i).add("val", i);
+        builder.applyUnsafe();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 19, 27, 3);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 3, key).addRangeTombstone(19, 27).build().applyUnsafe();
         // We don't flush to test with both a range tomsbtone in memtable and in sstable
 
         // Queries by name
         int[] live = new int[]{ 4, 9, 11, 17, 28 };
         int[] dead = new int[]{ 12, 19, 21, 24, 27 };
-        SortedSet<CellName> columns = new TreeSet<CellName>(cfs.getComparator());
+
+        AbstractReadCommandBuilder.SinglePartitionBuilder cmdBuilder = Util.cmd(cfs, key);
         for (int i : live)
-            columns.add(b(i));
+            cmdBuilder.includeRow(i);
         for (int i : dead)
-            columns.add(b(i));
-        cf = cfs.getColumnFamily(QueryFilter.getNamesFilter(dk(key), CFNAME, columns, System.currentTimeMillis()));
+            cmdBuilder.includeRow(i);
+
+        Partition partition = Util.getOnlyPartitionUnfiltered(cmdBuilder.build());
+        int nowInSec = FBUtilities.nowInSeconds();
 
         for (int i : live)
-            assertTrue("Cell " + i + " should be live", isLive(cf, cf.getColumn(b(i))));
+            assertTrue("Row " + i + " should be live",
+                       partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
         for (int i : dead)
-            assertTrue("Cell " + i + " shouldn't be live", !isLive(cf, cf.getColumn(b(i))));
+            assertFalse("Row " + i + " shouldn't be live",
+                        partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
 
         // Queries by slices
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(7), b(30), false, Integer.MAX_VALUE, System.currentTimeMillis()));
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(7).toIncl(30).build());
 
         for (int i : new int[]{ 7, 8, 9, 11, 13, 15, 17, 28, 29, 30 })
-            assertTrue("Cell " + i + " should be live", isLive(cf, cf.getColumn(b(i))));
+            assertTrue("Row " + i + " should be live",
+                       partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
         for (int i : new int[]{ 10, 12, 14, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 })
-            assertTrue("Cell " + i + " shouldn't be live", !isLive(cf, cf.getColumn(b(i))));
+            assertFalse("Row " + i + " shouldn't be live",
+                        partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
     }
 
     @Test
@@ -149,141 +139,135 @@
 
         // Inserting data
         String key = "k111";
-        Mutation rm;
-        ColumnFamily cf;
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
         for (int i = 0; i < 40; i += 2)
-            add(rm, i, 0);
-        rm.applyUnsafe();
+            builder.newRow(i).add("val", i);
+        builder.applyUnsafe();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 5, 10, 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(5, 10).build().applyUnsafe();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 15, 20, 2);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 2, key).addRangeTombstone(15, 20).build().applyUnsafe();
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(11), b(14), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        Collection<RangeTombstone> rt = rangeTombstones(cf);
+        ImmutableBTreePartition partition;
+
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(11).toIncl(14).build());
+        Collection<RangeTombstone> rt = rangeTombstones(partition);
         assertEquals(0, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(11), b(15), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(11).toIncl(15).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(20), b(25), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(20).toIncl(25).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(12), b(25), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(12).toIncl(25).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(25), b(35), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(25).toIncl(35).build());
+        rt = rangeTombstones(partition);
         assertEquals(0, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(40), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(1).toIncl(40).build());
+        rt = rangeTombstones(partition);
         assertEquals(2, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(7), b(17), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(7).toIncl(17).build());
+        rt = rangeTombstones(partition);
         assertEquals(2, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(5), b(20), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(5).toIncl(20).build());
+        rt = rangeTombstones(partition);
         assertEquals(2, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(5), b(15), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(5).toIncl(20).build());
+        rt = rangeTombstones(partition);
         assertEquals(2, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(2), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(1).toIncl(2).build());
+        rt = rangeTombstones(partition);
         assertEquals(0, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(5), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(1).toIncl(5).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(1), b(10), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(1).toIncl(10).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(5), b(6), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(5).toIncl(6).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(17), b(20), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(17).toIncl(20).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, b(17), b(18), false, Integer.MAX_VALUE, System.currentTimeMillis()));
-        rt = rangeTombstones(cf);
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).fromIncl(17).toIncl(18).build());
+        rt = rangeTombstones(partition);
         assertEquals(1, rt.size());
 
-        ColumnSlice[] slices = new ColumnSlice[]{new ColumnSlice( b(1), b(10)), new ColumnSlice( b(16), b(20))};
-        IDiskAtomFilter sqf = new SliceQueryFilter(slices, false, Integer.MAX_VALUE);
-        cf = cfs.getColumnFamily( new QueryFilter(dk(key), CFNAME, sqf, System.currentTimeMillis()) );
-        rt = rangeTombstones(cf);
+        Slices.Builder sb = new Slices.Builder(cfs.getComparator());
+        sb.add(Slice.Bound.create(cfs.getComparator(), true, true, 1), Slice.Bound.create(cfs.getComparator(), false, true, 10));
+        sb.add(Slice.Bound.create(cfs.getComparator(), true, true, 16), Slice.Bound.create(cfs.getComparator(), false, true, 20));
+
+        partition = Util.getOnlyPartitionUnfiltered(SinglePartitionReadCommand.create(cfs.metadata, FBUtilities.nowInSeconds(), Util.dk(key), sb.build()));
+        rt = rangeTombstones(partition);
         assertEquals(2, rt.size());
     }
 
-    private Collection<RangeTombstone> rangeTombstones(ColumnFamily cf)
+    private Collection<RangeTombstone> rangeTombstones(ImmutableBTreePartition partition)
     {
         List<RangeTombstone> tombstones = new ArrayList<RangeTombstone>();
-        Iterators.addAll(tombstones, cf.deletionInfo().rangeIterator());
+        Iterators.addAll(tombstones, partition.deletionInfo().rangeIterator(false));
         return tombstones;
     }
 
     @Test
-    public void testTrackTimesRowTombstone() throws ExecutionException, InterruptedException
+    public void testTrackTimesPartitionTombstone() throws ExecutionException, InterruptedException
     {
         Keyspace ks = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        long timestamp = System.currentTimeMillis();
-        cf.delete(new DeletionInfo(1000, (int)(timestamp/1000)));
-        rm.apply();
+
+        int nowInSec = FBUtilities.nowInSeconds();
+        new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata, Util.dk(key), 1000, nowInSec)).apply();
         cfs.forceBlockingFlush();
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
-        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, (int)(timestamp/1000));
+
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, nowInSec);
         cfs.forceMajorCompaction();
-        sstable = cfs.getSSTables().iterator().next();
-        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, (int)(timestamp/1000));
+        sstable = cfs.getLiveSSTables().iterator().next();
+        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, nowInSec);
     }
 
     @Test
-    public void testTrackTimesRowTombstoneWithData() throws ExecutionException, InterruptedException
+    public void testTrackTimesPartitionTombstoneWithData() throws ExecutionException, InterruptedException
     {
         Keyspace ks = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        add(rm, 5, 999);
-        rm.apply();
+
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(999).newRow(5).add("val", 5).apply();
+
         key = "rt_times2";
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        int timestamp = (int)(System.currentTimeMillis()/1000);
-        cf.delete(new DeletionInfo(1000, timestamp));
-        rm.apply();
+        int nowInSec = FBUtilities.nowInSeconds();
+        new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata, Util.dk(key), 1000, nowInSec)).apply();
         cfs.forceBlockingFlush();
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
         cfs.forceMajorCompaction();
-        sstable = cfs.getSSTables().iterator().next();
+        sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
     }
+
     @Test
     public void testTrackTimesRangeTombstone() throws ExecutionException, InterruptedException
     {
@@ -291,17 +275,16 @@
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        long timestamp = System.currentTimeMillis();
-        cf.delete(new DeletionInfo(b(1), b(2), cfs.getComparator(), 1000, (int)(timestamp/1000)));
-        rm.apply();
+
+        int nowInSec = FBUtilities.nowInSeconds();
+        new RowUpdateBuilder(cfs.metadata, nowInSec, 1000L, key).addRangeTombstone(1, 2).build().apply();
         cfs.forceBlockingFlush();
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
-        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, (int)(timestamp/1000));
+
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, nowInSec);
         cfs.forceMajorCompaction();
-        sstable = cfs.getSSTables().iterator().next();
-        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, (int)(timestamp/1000));
+        sstable = cfs.getLiveSSTables().iterator().next();
+        assertTimes(sstable.getSSTableMetadata(), 1000, 1000, nowInSec);
     }
 
     @Test
@@ -311,20 +294,19 @@
         ColumnFamilyStore cfs = ks.getColumnFamilyStore(CFNAME);
         cfs.truncateBlocking();
         String key = "rt_times";
-        Mutation rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        add(rm, 5, 999);
-        rm.apply();
+
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(999).newRow(5).add("val", 5).apply();
+
         key = "rt_times2";
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        int timestamp = (int)(System.currentTimeMillis()/1000);
-        cf.delete(new DeletionInfo(b(1), b(2), cfs.getComparator(), 1000, timestamp));
-        rm.apply();
+        int nowInSec = FBUtilities.nowInSeconds();
+        new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata, Util.dk(key), 1000, nowInSec)).apply();
         cfs.forceBlockingFlush();
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+
+        cfs.forceBlockingFlush();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
         cfs.forceMajorCompaction();
-        sstable = cfs.getSSTables().iterator().next();
+        sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
     }
 
@@ -343,21 +325,19 @@
         cfs.metadata.gcGraceSeconds(2);
 
         String key = "7810";
-        Mutation rm;
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        for (int i = 10; i < 20; i++)
-            add(rm, i, 0);
-        rm.apply();
+
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
+        for (int i = 10; i < 20; i ++)
+            builder.newRow(i).add("val", i);
+        builder.apply();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        cf.delete(new DeletionInfo(b(10),b(11), cfs.getComparator(), 1, 1));
-        rm.apply();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(10, 11).build().apply();
         cfs.forceBlockingFlush();
+
         Thread.sleep(5);
         cfs.forceMajorCompaction();
-        assertEquals(8, Util.getColumnFamily(ks, Util.dk(key), CFNAME).getColumnCount());
+        assertEquals(8, Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).rowCount());
     }
 
     @Test
@@ -368,16 +348,13 @@
         cfs.metadata.gcGraceSeconds(2);
 
         String key = "7808_1";
-        Mutation rm;
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
         for (int i = 0; i < 40; i += 2)
-            add(rm, i, 0);
-        rm.apply();
+            builder.newRow(i).add("val", i);
+        builder.apply();
         cfs.forceBlockingFlush();
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        cf.delete(new DeletionInfo(1, 1));
-        rm.apply();
+
+        new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata, Util.dk(key), 1, 1)).apply();
         cfs.forceBlockingFlush();
         Thread.sleep(5);
         cfs.forceMajorCompaction();
@@ -391,26 +368,20 @@
         cfs.metadata.gcGraceSeconds(2);
 
         String key = "7808_2";
-        Mutation rm;
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        for (int i = 10; i < 20; i++)
-            add(rm, i, 0);
-        rm.apply();
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
+        for (int i = 10; i < 20; i ++)
+            builder.newRow(i).add("val", i);
+        builder.apply();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        cf.delete(new DeletionInfo(0,0));
-        rm.apply();
+        new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata, Util.dk(key), 0, 0)).apply();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        add(rm, 5, 1);
-        rm.apply();
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(1).newRow(5).add("val", 5).apply();
 
         cfs.forceBlockingFlush();
         Thread.sleep(5);
         cfs.forceMajorCompaction();
-        assertEquals(1, Util.getColumnFamily(ks, Util.dk(key), CFNAME).getColumnCount());
+        assertEquals(1, Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).rowCount());
     }
 
     @Test
@@ -419,55 +390,54 @@
         CompactionManager.instance.disableAutoCompaction();
         Keyspace keyspace = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CFNAME);
+        boolean enforceStrictLiveness = cfs.metadata.enforceStrictLiveness();
 
         // Inserting data
         String key = "k2";
-        Mutation rm;
-        ColumnFamily cf;
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
         for (int i = 0; i < 20; i++)
-            add(rm, i, 0);
-        rm.applyUnsafe();
+            builder.newRow(i).add("val", i);
+        builder.applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 5, 15, 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(5, 15).build().applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 5, 10, 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(5, 10).build().applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        cf = rm.addOrGet(CFNAME);
-        delete(cf, 5, 8, 2);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 2, key).addRangeTombstone(5, 8).build().applyUnsafe();
         cfs.forceBlockingFlush();
 
-        cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(key), CFNAME, System.currentTimeMillis()));
+        Partition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        int nowInSec = FBUtilities.nowInSeconds();
 
         for (int i = 0; i < 5; i++)
-            assertTrue("Cell " + i + " should be live", isLive(cf, cf.getColumn(b(i))));
+            assertTrue("Row " + i + " should be live",
+                       partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
         for (int i = 16; i < 20; i++)
-            assertTrue("Cell " + i + " should be live", isLive(cf, cf.getColumn(b(i))));
+            assertTrue("Row " + i + " should be live",
+                       partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
         for (int i = 5; i <= 15; i++)
-            assertTrue("Cell " + i + " shouldn't be live", !isLive(cf, cf.getColumn(b(i))));
+            assertFalse("Row " + i + " shouldn't be live",
+                        partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
 
         // Compact everything and re-test
         CompactionManager.instance.performMaximal(cfs, false);
-        cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dk(key), CFNAME, System.currentTimeMillis()));
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
 
         for (int i = 0; i < 5; i++)
-            assertTrue("Cell " + i + " should be live", isLive(cf, cf.getColumn(b(i))));
+            assertTrue("Row " + i + " should be live",
+                       partition.getRow(new Clustering(bb(i))).hasLiveData(FBUtilities.nowInSeconds(),
+                                                                           enforceStrictLiveness));
         for (int i = 16; i < 20; i++)
-            assertTrue("Cell " + i + " should be live", isLive(cf, cf.getColumn(b(i))));
+            assertTrue("Row " + i + " should be live",
+                       partition.getRow(new Clustering(bb(i))).hasLiveData(FBUtilities.nowInSeconds(),
+                                                                           enforceStrictLiveness));
         for (int i = 5; i <= 15; i++)
-            assertTrue("Cell " + i + " shouldn't be live", !isLive(cf, cf.getColumn(b(i))));
+            assertFalse("Row " + i + " shouldn't be live",
+                        partition.getRow(new Clustering(bb(i))).hasLiveData(nowInSec, enforceStrictLiveness));
     }
 
     @Test
@@ -478,33 +448,74 @@
 
         // Inserting data
         String key = "k3";
-        Mutation rm;
-        ColumnFamily cf;
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        add(rm, 2, 0);
-        rm.applyUnsafe();
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(0).newRow(2).add("val", 2).applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, ByteBufferUtil.bytes(key));
-        // Deletes everything but without being a row tombstone
-        delete(rm.addOrGet(CFNAME), 0, 10, 1);
-        add(rm, 1, 2);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(0, 10).build().applyUnsafe();
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(2).newRow(1).add("val", 1).applyUnsafe();
         cfs.forceBlockingFlush();
 
         // Get the last value of the row
-        cf = cfs.getColumnFamily(QueryFilter.getSliceFilter(dk(key), CFNAME, Composites.EMPTY, Composites.EMPTY, true, 1, System.currentTimeMillis()));
+        FilteredPartition partition = Util.getOnlyPartition(Util.cmd(cfs, key).build());
+        assertTrue(partition.rowCount() > 0);
 
-        assertFalse(cf.isEmpty());
-        int last = i(cf.getSortedColumns().iterator().next().name());
+        int last = i(partition.unfilteredIterator(ColumnFilter.all(cfs.metadata), Slices.ALL, true).next().clustering().get(0));
         assertEquals("Last column should be column 1 since column 2 has been deleted", 1, last);
     }
 
     @Test
     public void testRowWithRangeTombstonesUpdatesSecondaryIndex() throws Exception
     {
-        runCompactionWithRangeTombstoneAndCheckSecondaryIndex();
+        Keyspace table = Keyspace.open(KSNAME);
+        ColumnFamilyStore cfs = table.getColumnFamilyStore(CFNAME);
+        ByteBuffer key = ByteBufferUtil.bytes("k5");
+        ByteBuffer indexedColumnName = ByteBufferUtil.bytes("val");
+
+        cfs.truncateBlocking();
+        cfs.disableAutoCompaction();
+
+        ColumnDefinition cd = cfs.metadata.getColumnDefinition(indexedColumnName).copy();
+        IndexMetadata indexDef =
+            IndexMetadata.fromIndexTargets(cfs.metadata,
+                                           Collections.singletonList(new IndexTarget(cd.name, IndexTarget.Type.VALUES)),
+                                           "test_index",
+                                           IndexMetadata.Kind.CUSTOM,
+                                           ImmutableMap.of(IndexTarget.CUSTOM_INDEX_OPTION_NAME,
+                                                           StubIndex.class.getName()));
+
+        if (!cfs.metadata.getIndexes().get("test_index").isPresent())
+            cfs.metadata.indexes(cfs.metadata.getIndexes().with(indexDef));
+
+        Future<?> rebuild = cfs.indexManager.addIndex(indexDef);
+        // If rebuild there is, wait for the rebuild to finish so it doesn't race with the following insertions
+        if (rebuild != null)
+            rebuild.get();
+
+        StubIndex index = (StubIndex)cfs.indexManager.listIndexes()
+                                                     .stream()
+                                                     .filter(i -> "test_index".equals(i.getIndexMetadata().name))
+                                                     .findFirst()
+                                                     .orElseThrow(() -> new RuntimeException(new AssertionError("Index not found")));
+        index.reset();
+
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
+        for (int i = 0; i < 10; i++)
+            builder.newRow(i).add("val", i);
+        builder.applyUnsafe();
+        cfs.forceBlockingFlush();
+
+        new RowUpdateBuilder(cfs.metadata, 0, key).addRangeTombstone(0, 7).build().applyUnsafe();
+        cfs.forceBlockingFlush();
+
+        assertEquals(10, index.rowsInserted.size());
+
+        CompactionManager.instance.performMaximal(cfs, false);
+
+        // compacted down to single sstable
+        assertEquals(1, cfs.getLiveSSTables().size());
+
+        assertEquals(8, index.rowsDeleted.size());
     }
 
     @Test
@@ -517,269 +528,94 @@
         // remove any existing sstables before starting
         cfs.truncateBlocking();
         cfs.disableAutoCompaction();
-        cfs.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
 
-        Mutation rm = new Mutation(KSNAME, key);
+        UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, key).withTimestamp(0);
         for (int i = 0; i < 10; i += 2)
-            add(rm, i, 0);
-        rm.applyUnsafe();
+            builder.newRow(i).add("val", i);
+        builder.applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KSNAME, key);
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        for (int i = 0; i < 10; i += 2)
-            delete(cf, 0, 7, 0);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, key).addRangeTombstone(0, 7).build().applyUnsafe();
         cfs.forceBlockingFlush();
 
         // there should be 2 sstables
-        assertEquals(2, cfs.getSSTables().size());
+        assertEquals(2, cfs.getLiveSSTables().size());
 
         // compact down to single sstable
         CompactionManager.instance.performMaximal(cfs, false);
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
         // test the physical structure of the sstable i.e. rt & columns on disk
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
-        try(ISSTableScanner scanner = sstable.getScanner())
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        try (UnfilteredPartitionIterator scanner = sstable.getScanner())
         {
-            OnDiskAtomIterator iter = scanner.next();
-            int cnt = 0;
-            // after compaction, the first element should be an RT followed by the remaining non-deleted columns
-            while (iter.hasNext())
+            try (UnfilteredRowIterator iter = scanner.next())
             {
-                OnDiskAtom atom = iter.next();
-                if (cnt == 0)
-                    assertTrue(atom instanceof RangeTombstone);
-                if (cnt > 0)
-                    assertTrue(atom instanceof Cell);
-                cnt++;
+                // after compaction, we should have a single RT with a single row (the row 8)
+                Unfiltered u1 = iter.next();
+                assertTrue("Expecting open marker, got " + u1.toString(cfs.metadata), u1 instanceof RangeTombstoneMarker);
+                Unfiltered u2 = iter.next();
+                assertTrue("Expecting close marker, got " + u2.toString(cfs.metadata), u2 instanceof RangeTombstoneMarker);
+                Unfiltered u3 = iter.next();
+                assertTrue("Expecting row, got " + u3.toString(cfs.metadata), u3 instanceof Row);
             }
-            assertEquals(2, cnt);
         }
     }
 
     @Test
-    public void testCompactionOfRangeTombstonesCoveredByRowTombstone() throws Exception
-    {
-        long testTimeStamp = 1451606400L; // 01/01/2016 : 00:00:00 GMT
-        Keyspace table = Keyspace.open(KSNAME);
-        ColumnFamilyStore cfs = table.getColumnFamilyStore(CFNAME);
-        ByteBuffer key = ByteBufferUtil.bytes("k4");
-
-        // remove any existing sstables before starting
-        cfs.truncateBlocking();
-        cfs.disableAutoCompaction();
-        cfs.setCompactionStrategyClass(LeveledCompactionStrategy.class.getCanonicalName());
-
-        Mutation rm = new Mutation(KSNAME, key);
-        for (int i = 1; i < 11; i += 2, testTimeStamp += i * 10)
-            add(rm, i, testTimeStamp);
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        rm = new Mutation(KSNAME, key);
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-
-        // Write the covering row tombstone
-        cf.delete(new DeletionTime(++testTimeStamp, (int) testTimeStamp));
-
-        // Create range tombstones covered by row tombstone above.
-        for (int i = 1; i < 11; i += 2, testTimeStamp -= i * 5)
-            delete(cf, 0, 7, testTimeStamp);
-        rm.apply();
-        cfs.forceBlockingFlush();
-
-        // there should be 2 sstables
-        assertEquals(2, cfs.getSSTables().size());
-
-        // compact down to nothing
-        CompactionManager.instance.performMaximal(cfs, false);
-        assertEquals(0, cfs.getSSTables().size());
-    }
-
-    @Test
     public void testOverwritesToDeletedColumns() throws Exception
     {
         Keyspace table = Keyspace.open(KSNAME);
         ColumnFamilyStore cfs = table.getColumnFamilyStore(CFNAME);
         ByteBuffer key = ByteBufferUtil.bytes("k6");
-        ByteBuffer indexedColumnName = ByteBufferUtil.bytes(1);
+        ByteBuffer indexedColumnName = ByteBufferUtil.bytes("val");
 
         cfs.truncateBlocking();
         cfs.disableAutoCompaction();
-        cfs.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
-        if (cfs.indexManager.getIndexForColumn(indexedColumnName) == null)
-        {
-            ColumnDefinition cd = new ColumnDefinition(cfs.metadata, indexedColumnName, Int32Type.instance, null, ColumnDefinition.Kind.REGULAR);
-            cd.setIndex("test_index", IndexType.CUSTOM, ImmutableMap.of(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, TestIndex.class.getName()));
-            Future<?> rebuild = cfs.indexManager.addIndexedColumn(cd);
-            // If rebuild there is, wait for the rebuild to finish so it doesn't race with the following insertions
-            if (rebuild != null)
-                rebuild.get();
-        }
 
-        TestIndex index = ((TestIndex)cfs.indexManager.getIndexForColumn(indexedColumnName));
-        index.resetCounts();
+        ColumnDefinition cd = cfs.metadata.getColumnDefinition(indexedColumnName).copy();
+        IndexMetadata indexDef =
+            IndexMetadata.fromIndexTargets(cfs.metadata,
+                                           Collections.singletonList(new IndexTarget(cd.name, IndexTarget.Type.VALUES)),
+                                           "test_index",
+                                           IndexMetadata.Kind.CUSTOM,
+                                           ImmutableMap.of(IndexTarget.CUSTOM_INDEX_OPTION_NAME,
+                                                           StubIndex.class.getName()));
 
-        Mutation rm = new Mutation(KSNAME, key);
-        add(rm, 1, 0);
-        rm.applyUnsafe();
+        if (!cfs.metadata.getIndexes().get("test_index").isPresent())
+            cfs.metadata.indexes(cfs.metadata.getIndexes().with(indexDef));
+
+        Future<?> rebuild = cfs.indexManager.addIndex(indexDef);
+        // If rebuild there is, wait for the rebuild to finish so it doesn't race with the following insertions
+        if (rebuild != null)
+            rebuild.get();
+
+        StubIndex index = (StubIndex)cfs.indexManager.getIndexByName("test_index");
+        index.reset();
+
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(0).newRow(1).add("val", 1).applyUnsafe();
 
         // add a RT which hides the column we just inserted
-        rm = new Mutation(KSNAME, key);
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        delete(cf, 0, 1, 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 1, key).addRangeTombstone(0, 1).build().applyUnsafe();
 
         // now re-insert that column
-        rm = new Mutation(KSNAME, key);
-        add(rm, 1, 2);
-        rm.applyUnsafe();
+        UpdateBuilder.create(cfs.metadata, key).withTimestamp(2).newRow(1).add("val", 1).applyUnsafe();
 
         cfs.forceBlockingFlush();
 
         // We should have 1 insert and 1 update to the indexed "1" column
         // CASSANDRA-6640 changed index update to just update, not insert then delete
-        assertEquals(1, index.inserts.size());
-        assertEquals(1, index.updates.size());
+        assertEquals(1, index.rowsInserted.size());
+        assertEquals(1, index.rowsUpdated.size());
     }
 
-    private void runCompactionWithRangeTombstoneAndCheckSecondaryIndex() throws Exception
+    private static ByteBuffer bb(int i)
     {
-        Keyspace table = Keyspace.open(KSNAME);
-        ColumnFamilyStore cfs = table.getColumnFamilyStore(CFNAME);
-        ByteBuffer key = ByteBufferUtil.bytes("k5");
-        ByteBuffer indexedColumnName = ByteBufferUtil.bytes(1);
-
-        cfs.truncateBlocking();
-        cfs.disableAutoCompaction();
-        cfs.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getCanonicalName());
-        if (cfs.indexManager.getIndexForColumn(indexedColumnName) == null)
-        {
-            ColumnDefinition cd = ColumnDefinition.regularDef(cfs.metadata, indexedColumnName, cfs.getComparator().asAbstractType(), 0)
-                                                  .setIndex("test_index", IndexType.CUSTOM, ImmutableMap.of(SecondaryIndex.CUSTOM_INDEX_OPTION_NAME, TestIndex.class.getName()));
-            Future<?> rebuild = cfs.indexManager.addIndexedColumn(cd);
-            // If rebuild there is, wait for the rebuild to finish so it doesn't race with the following insertions
-            if (rebuild != null)
-                rebuild.get();
-
-        }
-
-        TestIndex index = ((TestIndex)cfs.indexManager.getIndexForColumn(indexedColumnName));
-        index.resetCounts();
-
-        Mutation rm = new Mutation(KSNAME, key);
-        for (int i = 0; i < 10; i++)
-            add(rm, i, 0);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        rm = new Mutation(KSNAME, key);
-        ColumnFamily cf = rm.addOrGet(CFNAME);
-        for (int i = 0; i < 10; i += 2)
-            delete(cf, 0, 7, 0);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        // We should have indexed 1 column
-        assertEquals(1, index.inserts.size());
-
-        CompactionManager.instance.performMaximal(cfs, false);
-
-        // compacted down to single sstable
-        assertEquals(1, cfs.getSSTables().size());
-
-        // verify that the 1 indexed column was removed from the index
-        assertEquals(1, index.deletes.size());
-        assertEquals(index.deletes.get(0), index.inserts.get(0));
+        return ByteBufferUtil.bytes(i);
     }
 
-    private static boolean isLive(ColumnFamily cf, Cell c)
+    private static int i(ByteBuffer bb)
     {
-        return c != null && c.isLive() && !cf.deletionInfo().isDeleted(c);
-    }
-
-    private static CellName b(int i)
-    {
-        return CellNames.simpleDense(ByteBufferUtil.bytes(i));
-    }
-
-    private static int i(CellName i)
-    {
-        return ByteBufferUtil.toInt(i.toByteBuffer());
-    }
-
-    private static void add(Mutation rm, int value, long timestamp)
-    {
-        rm.add(CFNAME, b(value), ByteBufferUtil.bytes(value), timestamp);
-    }
-
-    private static void delete(ColumnFamily cf, int from, int to, long timestamp)
-    {
-        cf.delete(new DeletionInfo(b(from),
-                                   b(to),
-                                   cf.getComparator(),
-                                   timestamp,
-                                   (int)(System.currentTimeMillis() / 1000)));
-    }
-
-    public static class TestIndex extends PerColumnSecondaryIndex
-    {
-        public List<Cell> inserts = new ArrayList<>();
-        public List<Cell> deletes = new ArrayList<>();
-        public List<Cell> updates = new ArrayList<>();
-
-        public void resetCounts()
-        {
-            inserts.clear();
-            deletes.clear();
-            updates.clear();
-        }
-
-        public void delete(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup)
-        {
-            deletes.add(col);
-        }
-
-        @Override
-        public void deleteForCleanup(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup) {}
-
-        public void insert(ByteBuffer rowKey, Cell col, OpOrder.Group opGroup)
-        {
-            inserts.add(col);
-        }
-
-        public void update(ByteBuffer rowKey, Cell oldCol, Cell col, OpOrder.Group opGroup)
-        {
-            updates.add(col);
-        }
-
-        public void init(){}
-
-        public void reload(){}
-
-        public void validateOptions() throws ConfigurationException{}
-
-        public String getIndexName(){ return "TestIndex";}
-
-        protected SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns){ return null; }
-
-        public void forceBlockingFlush(){}
-
-        public ColumnFamilyStore getIndexCfs(){ return null; }
-
-        public void removeIndex(ByteBuffer columnName){}
-
-        public void invalidate(){}
-
-        public void truncateBlocking(long truncatedAt) { }
-
-        public boolean indexes(CellName name) { return name.toByteBuffer().equals(ByteBufferUtil.bytes(1)); }
-
-        @Override
-        public long estimateResultRows() {
-            return 0;
-        }
+        return ByteBufferUtil.toInt(bb);
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/ReadMessageTest.java b/test/unit/org/apache/cassandra/db/ReadMessageTest.java
index 34f25a1..d801b32 100644
--- a/test/unit/org/apache/cassandra/db/ReadMessageTest.java
+++ b/test/unit/org/apache/cassandra/db/ReadMessageTest.java

@@ -21,119 +21,180 @@
 import static org.junit.Assert.*;
 
 import java.io.*;
-import java.nio.ByteBuffer;
-import java.util.SortedSet;
-import java.util.TreeSet;
 
 import com.google.common.base.Predicate;
+
 import org.junit.BeforeClass;
 import org.junit.Test;
-
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.commitlog.CommitLogTestReplayer;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.partitions.FilteredPartition;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
 
 public class ReadMessageTest
 {
     private static final String KEYSPACE1 = "ReadMessageTest1";
     private static final String KEYSPACENOCOMMIT = "ReadMessageTest_NoCommit";
     private static final String CF = "Standard1";
+    private static final String CF_FOR_READ_TEST = "Standard2";
+    private static final String CF_FOR_COMMIT_TEST = "Standard3";
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        CFMetaData cfForReadMetadata = CFMetaData.Builder.create(KEYSPACE1, CF_FOR_READ_TEST)
+                                                            .addPartitionKey("key", BytesType.instance)
+                                                            .addClusteringColumn("col1", AsciiType.instance)
+                                                            .addClusteringColumn("col2", AsciiType.instance)
+                                                            .addRegularColumn("a", AsciiType.instance)
+                                                            .addRegularColumn("b", AsciiType.instance).build();
+
+        CFMetaData cfForCommitMetadata1 = CFMetaData.Builder.create(KEYSPACE1, CF_FOR_COMMIT_TEST)
+                                                       .addPartitionKey("key", BytesType.instance)
+                                                       .addClusteringColumn("name", AsciiType.instance)
+                                                       .addRegularColumn("commit1", AsciiType.instance).build();
+
+        CFMetaData cfForCommitMetadata2 = CFMetaData.Builder.create(KEYSPACENOCOMMIT, CF_FOR_COMMIT_TEST)
+                                                            .addPartitionKey("key", BytesType.instance)
+                                                            .addClusteringColumn("name", AsciiType.instance)
+                                                            .addRegularColumn("commit2", AsciiType.instance).build();
+
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF),
+                                    cfForReadMetadata,
+                                    cfForCommitMetadata1);
         SchemaLoader.createKeyspace(KEYSPACENOCOMMIT,
-                                    false,
-                                    true,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACENOCOMMIT, CF));
+                                    KeyspaceParams.simpleTransient(1),
+                                    SchemaLoader.standardCFMD(KEYSPACENOCOMMIT, CF),
+                                    cfForCommitMetadata2);
     }
 
     @Test
     public void testMakeReadMessage() throws IOException
     {
-        CellNameType type = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1").getComparator();
-
-        SortedSet<CellName> colList = new TreeSet<CellName>(type);
-        colList.add(Util.cellname("col1"));
-        colList.add(Util.cellname("col2"));
-
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_FOR_READ_TEST);
         ReadCommand rm, rm2;
-        DecoratedKey dk = Util.dk("row1");
-        long ts = System.currentTimeMillis();
 
-        rm = new SliceByNamesReadCommand(KEYSPACE1, dk.getKey(), "Standard1", ts, new NamesQueryFilter(colList));
+        rm = Util.cmd(cfs, Util.dk("key1"))
+                 .includeRow("col1", "col2")
+                 .build();
         rm2 = serializeAndDeserializeReadMessage(rm);
-        assert rm2.toString().equals(rm.toString());
+        assertEquals(rm.toString(), rm2.toString());
 
-        rm = new SliceFromReadCommand(KEYSPACE1, dk.getKey(), "Standard1", ts, new SliceQueryFilter(Composites.EMPTY, Composites.EMPTY, true, 2));
+        rm = Util.cmd(cfs, Util.dk("key1"))
+                 .includeRow("col1", "col2")
+                 .reverse()
+                 .build();
         rm2 = serializeAndDeserializeReadMessage(rm);
-        assert rm2.toString().equals(rm.toString());
+        assertEquals(rm.toString(), rm2.toString());
 
-        rm = new SliceFromReadCommand(KEYSPACE1, dk.getKey(), "Standard1", ts, new SliceQueryFilter(Util.cellname("a"), Util.cellname("z"), true, 5));
+        rm = Util.cmd(cfs)
+                 .build();
         rm2 = serializeAndDeserializeReadMessage(rm);
-        assert rm2.toString().equals(rm.toString());
+        assertEquals(rm.toString(), rm2.toString());
+
+        rm = Util.cmd(cfs)
+                 .fromKeyIncl(ByteBufferUtil.bytes("key1"))
+                 .toKeyIncl(ByteBufferUtil.bytes("key2"))
+                 .build();
+        rm2 = serializeAndDeserializeReadMessage(rm);
+        assertEquals(rm.toString(), rm2.toString());
+
+        rm = Util.cmd(cfs)
+                 .columns("a")
+                 .build();
+        rm2 = serializeAndDeserializeReadMessage(rm);
+        assertEquals(rm.toString(), rm2.toString());
+
+        rm = Util.cmd(cfs)
+                 .includeRow("col1", "col2")
+                 .columns("a")
+                 .build();
+        rm2 = serializeAndDeserializeReadMessage(rm);
+        assertEquals(rm.toString(), rm2.toString());
+
+        rm = Util.cmd(cfs)
+                 .fromKeyIncl(ByteBufferUtil.bytes("key1"))
+                 .includeRow("col1", "col2")
+                 .columns("a")
+                 .build();
+        rm2 = serializeAndDeserializeReadMessage(rm);
+        assertEquals(rm.toString(), rm2.toString());
     }
 
     private ReadCommand serializeAndDeserializeReadMessage(ReadCommand rm) throws IOException
     {
-        ReadCommandSerializer rms = ReadCommand.serializer;
+        IVersionedSerializer<ReadCommand> rms = ReadCommand.serializer;
         DataOutputBuffer out = new DataOutputBuffer();
-        ByteArrayInputStream bis;
 
         rms.serialize(rm, out, MessagingService.current_version);
-        bis = new ByteArrayInputStream(out.getData(), 0, out.getLength());
-        return rms.deserialize(new DataInputStream(bis), MessagingService.current_version);
+
+        DataInputPlus dis = new DataInputBuffer(out.getData());
+        return rms.deserialize(dis, MessagingService.current_version);
     }
 
+
     @Test
     public void testGetColumn()
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        CellNameType type = keyspace.getColumnFamilyStore("Standard1").getComparator();
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF);
 
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("abcd"), 0);
-        rm.apply();
+        new RowUpdateBuilder(cfs.metadata, 0, ByteBufferUtil.bytes("key1"))
+                .clustering("Column1")
+                .add("val", ByteBufferUtil.bytes("abcd"))
+                .build()
+                .apply();
 
-        ReadCommand command = new SliceByNamesReadCommand(KEYSPACE1, dk.getKey(), "Standard1", System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(Util.cellname("Column1"), type)));
-        Row row = command.getRow(keyspace);
-        Cell col = row.cf.getColumn(Util.cellname("Column1"));
-        assertEquals(col.value(), ByteBuffer.wrap("abcd".getBytes()));
+        ColumnDefinition col = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        int found = 0;
+        for (FilteredPartition partition : Util.getAll(Util.cmd(cfs).build()))
+        {
+            for (Row r : partition)
+            {
+                if (r.getCell(col).value().equals(ByteBufferUtil.bytes("abcd")))
+                    ++found;
+            }
+        }
+        assertEquals(1, found);
     }
 
     @Test
     public void testNoCommitLog() throws Exception
     {
-        Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("row"));
-        rm.add("Standard1", Util.cellname("commit1"), ByteBufferUtil.bytes("abcd"), 0);
-        rm.apply();
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_FOR_COMMIT_TEST);
 
-        rm = new Mutation(KEYSPACENOCOMMIT, ByteBufferUtil.bytes("row"));
-        rm.add("Standard1", Util.cellname("commit2"), ByteBufferUtil.bytes("abcd"), 0);
-        rm.apply();
+        ColumnFamilyStore cfsnocommit = Keyspace.open(KEYSPACENOCOMMIT).getColumnFamilyStore(CF_FOR_COMMIT_TEST);
 
-        Checker checker = new Checker();
+        new RowUpdateBuilder(cfs.metadata, 0, ByteBufferUtil.bytes("row"))
+                .clustering("c")
+                .add("commit1", ByteBufferUtil.bytes("abcd"))
+                .build()
+                .apply();
+
+        new RowUpdateBuilder(cfsnocommit.metadata, 0, ByteBufferUtil.bytes("row"))
+                .clustering("c")
+                .add("commit2", ByteBufferUtil.bytes("abcd"))
+                .build()
+                .apply();
+
+        Checker checker = new Checker(cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("commit1")),
+                                      cfsnocommit.metadata.getColumnDefinition(ByteBufferUtil.bytes("commit2")));
         CommitLogTestReplayer.examineCommitLog(checker);
 
         assertTrue(checker.commitLogMessageFound);
@@ -142,17 +203,30 @@
 
     static class Checker implements Predicate<Mutation>
     {
+        private final ColumnDefinition withCommit;
+        private final ColumnDefinition withoutCommit;
+
         boolean commitLogMessageFound = false;
         boolean noCommitLogMessageFound = false;
 
+        public Checker(ColumnDefinition withCommit, ColumnDefinition withoutCommit)
+        {
+            this.withCommit = withCommit;
+            this.withoutCommit = withoutCommit;
+        }
+
         public boolean apply(Mutation mutation)
         {
-            for (ColumnFamily cf : mutation.getColumnFamilies())
+            for (PartitionUpdate upd : mutation.getPartitionUpdates())
             {
-                if (cf.getColumn(Util.cellname("commit1")) != null)
-                    commitLogMessageFound = true;
-                if (cf.getColumn(Util.cellname("commit2")) != null)
-                    noCommitLogMessageFound = true;
+                Row r = upd.getRow(new Clustering(ByteBufferUtil.bytes("c")));
+                if (r != null)
+                {
+                    if (r.getCell(withCommit) != null)
+                        commitLogMessageFound = true;
+                    if (r.getCell(withoutCommit) != null)
+                        noCommitLogMessageFound = true;
+                }
             }
             return true;
         }

diff --git a/test/unit/org/apache/cassandra/db/ReadResponseTest.java b/test/unit/org/apache/cassandra/db/ReadResponseTest.java
new file mode 100644
index 0000000..52ab8bb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/ReadResponseTest.java

@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.util.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.dht.IPartitioner;
+
+import static org.junit.Assert.assertEquals;
+
+public class ReadResponseTest extends CQLTester
+{
+    private IPartitioner partitionerToRestore;
+
+    @Before
+    public void setupPartitioner()
+    {
+        // Using an ordered partitioner to be able to predict keys order in the following tests.
+        partitionerToRestore = DatabaseDescriptor.setPartitionerUnsafe(ByteOrderedPartitioner.instance);
+    }
+
+    @After
+    public void resetPartitioner()
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(partitionerToRestore);
+    }
+
+    @Test
+    public void testLegacyResponseSkipWrongBounds()
+    {
+        createTable("CREATE TABLE %s (k text PRIMARY KEY)");
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+
+        // Test that if a legacy response contains keys at the boundary of the requested key range that shouldn't be present, those
+        // are properly skipped. See CASSANDRA-9857 for context.
+
+        List<ImmutableBTreePartition> responses = Arrays.asList(makePartition(cfs.metadata, "k1"),
+                                                                makePartition(cfs.metadata, "k2"),
+                                                                makePartition(cfs.metadata, "k3"));
+        ReadResponse.LegacyRemoteDataResponse response = new ReadResponse.LegacyRemoteDataResponse(responses);
+
+        assertPartitions(response.makeIterator(Util.cmd(cfs).fromKeyExcl("k1").toKeyExcl("k3").build()), "k2");
+        assertPartitions(response.makeIterator(Util.cmd(cfs).fromKeyExcl("k0").toKeyExcl("k3").build()), "k1", "k2");
+        assertPartitions(response.makeIterator(Util.cmd(cfs).fromKeyExcl("k1").toKeyExcl("k4").build()), "k2", "k3");
+
+        assertPartitions(response.makeIterator(Util.cmd(cfs).fromKeyIncl("k1").toKeyExcl("k3").build()), "k1", "k2");
+        assertPartitions(response.makeIterator(Util.cmd(cfs).fromKeyIncl("k1").toKeyExcl("k4").build()), "k1", "k2", "k3");
+    }
+
+    private void assertPartitions(UnfilteredPartitionIterator actual, String... expectedKeys)
+    {
+        int i = 0;
+        while (i < expectedKeys.length && actual.hasNext())
+        {
+            String actualKey = AsciiType.instance.getString(actual.next().partitionKey().getKey());
+            assertEquals(expectedKeys[i++], actualKey);
+        }
+
+        if (i < expectedKeys.length)
+            throw new AssertionError("Got less results than expected: " + expectedKeys[i] + " is not in the result");
+        if (actual.hasNext())
+            throw new AssertionError("Got more results than expected: first unexpected key is " + AsciiType.instance.getString(actual.next().partitionKey().getKey()));
+    }
+
+    private static ImmutableBTreePartition makePartition(CFMetaData metadata, String key)
+    {
+        return ImmutableBTreePartition.create(UnfilteredRowIterators.noRowsIterator(metadata, Util.dk(key), Rows.EMPTY_STATIC_ROW, new DeletionTime(0, 0), false));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java b/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java
deleted file mode 100644
index 3beb28e..0000000
--- a/test/unit/org/apache/cassandra/db/RecoveryManager2Test.java
+++ /dev/null

@@ -1,132 +0,0 @@
-package org.apache.cassandra.db;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import static org.apache.cassandra.Util.column;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.ParameterizedClass;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.commitlog.CommitLog;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.compress.DeflateCompressor;
-import org.apache.cassandra.io.compress.LZ4Compressor;
-import org.apache.cassandra.io.compress.SnappyCompressor;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-
-@RunWith(Parameterized.class)
-public class RecoveryManager2Test
-{
-    private static Logger logger = LoggerFactory.getLogger(RecoveryManager2Test.class);
-
-    private static final String KEYSPACE1 = "RecoveryManager2Test";
-    private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_STANDARD2 = "Standard2";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
-    }
-
-    public RecoveryManager2Test(ParameterizedClass commitLogCompression)
-    {
-        DatabaseDescriptor.setCommitLogCompression(commitLogCompression);
-    }
-
-    @Before
-    public void setUp() throws IOException
-    {
-        CommitLog.instance.resetUnsafe(true);
-    }
-
-    @Parameters()
-    public static Collection<Object[]> generateData()
-    {
-        return Arrays.asList(new Object[][] {
-                { null }, // No compression
-                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.<String, String>emptyMap()) } });
-    }
-
-    @Test
-    /* test that commit logs do not replay flushed data */
-    public void testWithFlush() throws Exception
-    {
-        // Flush everything that may be in the commit log now to start fresh
-        FBUtilities.waitOnFutures(Keyspace.open(SystemKeyspace.NAME).flush());
-
-        CompactionManager.instance.disableAutoCompaction();
-
-        // add a row to another CF so we test skipping mutations within a not-entirely-flushed CF
-        insertRow("Standard2", "key");
-
-        for (int i = 0; i < 100; i++)
-        {
-            String key = "key" + i;
-            insertRow("Standard1", key);
-        }
-
-        Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace1.getColumnFamilyStore("Standard1");
-        logger.debug("forcing flush");
-        cfs.forceBlockingFlush();
-
-        logger.debug("begin manual replay");
-        // replay the commit log (nothing on Standard1 should be replayed since everything was flushed, so only the row on Standard2
-        // will be replayed)
-        int replayed = CommitLog.instance.resetUnsafe(false);
-        assert replayed == 1 : "Expecting only 1 replayed mutation, got " + replayed;
-    }
-
-    private void insertRow(String cfname, String key) 
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, cfname);
-        cf.addColumn(column("col1", "val1", 1L));
-        Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes(key), cf);
-        rm.apply();
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java b/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java
deleted file mode 100644
index 2dd7eae..0000000
--- a/test/unit/org/apache/cassandra/db/RecoveryManager3Test.java
+++ /dev/null

@@ -1,132 +0,0 @@
-package org.apache.cassandra.db;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.ParameterizedClass;
-import org.apache.cassandra.db.commitlog.CommitLog;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.compress.DeflateCompressor;
-import org.apache.cassandra.io.compress.LZ4Compressor;
-import org.apache.cassandra.io.compress.SnappyCompressor;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.locator.SimpleStrategy;
-
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
-
-@RunWith(Parameterized.class)
-public class RecoveryManager3Test
-{
-    private static final String KEYSPACE1 = "RecoveryManager3Test1";
-    private static final String CF_STANDARD1 = "Standard1";
-
-    private static final String KEYSPACE2 = "RecoveryManager3Test2";
-    private static final String CF_STANDARD3 = "Standard3";
-
-    public RecoveryManager3Test(ParameterizedClass commitLogCompression)
-    {
-        DatabaseDescriptor.setCommitLogCompression(commitLogCompression);
-    }
-
-    @Before
-    public void setUp() throws IOException
-    {
-        CommitLog.instance.resetUnsafe(true);
-    }
-
-    @Parameters()
-    public static Collection<Object[]> generateData()
-    {
-        return Arrays.asList(new Object[][] {
-                { null }, // No compression
-                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.<String, String>emptyMap()) } });
-    }
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-        SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD3));
-    }
-
-    @Test
-    public void testMissingHeader() throws IOException
-    {
-        Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
-        Keyspace keyspace2 = Keyspace.open(KEYSPACE2);
-
-        Mutation rm;
-        DecoratedKey dk = Util.dk("keymulti");
-        ColumnFamily cf;
-
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, dk.getKey(), cf);
-        rm.apply();
-
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE2, "Standard3");
-        cf.addColumn(column("col2", "val2", 1L));
-        rm = new Mutation(KEYSPACE2, dk.getKey(), cf);
-        rm.apply();
-
-        keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
-        keyspace2.getColumnFamilyStore("Standard3").clearUnsafe();
-
-        // nuke the header
-        for (File file : new File(DatabaseDescriptor.getCommitLogLocation()).listFiles())
-        {
-            if (file.getName().endsWith(".header"))
-                FileUtils.deleteWithConfirm(file);
-        }
-
-        CommitLog.instance.resetUnsafe(false); // disassociate segments from live CL
-
-        assertColumns(Util.getColumnFamily(keyspace1, dk, "Standard1"), "col1");
-        assertColumns(Util.getColumnFamily(keyspace2, dk, "Standard3"), "col2");
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java
new file mode 100644
index 0000000..d06c112
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java

@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.compress.DeflateCompressor;
+import org.apache.cassandra.io.compress.LZ4Compressor;
+import org.apache.cassandra.io.compress.SnappyCompressor;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.utils.FBUtilities;
+
+@RunWith(Parameterized.class)
+public class RecoveryManagerFlushedTest
+{
+    private static Logger logger = LoggerFactory.getLogger(RecoveryManagerFlushedTest.class);
+
+    private static final String KEYSPACE1 = "RecoveryManager2Test";
+    private static final String CF_STANDARD1 = "Standard1";
+    private static final String CF_STANDARD2 = "Standard2";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
+    }
+
+    public RecoveryManagerFlushedTest(ParameterizedClass commitLogCompression)
+    {
+        DatabaseDescriptor.setCommitLogCompression(commitLogCompression);
+    }
+
+    @Before
+    public void setUp() throws IOException
+    {
+        CommitLog.instance.resetUnsafe(true);
+    }
+
+    @Parameters()
+    public static Collection<Object[]> generateData()
+    {
+        return Arrays.asList(new Object[][] {
+                { null }, // No compression
+                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.emptyMap()) } });
+    }
+
+    @Test
+    /* test that commit logs do not replay flushed data */
+    public void testWithFlush() throws Exception
+    {
+        // Flush everything that may be in the commit log now to start fresh
+        FBUtilities.waitOnFutures(Keyspace.open(SystemKeyspace.NAME).flush());
+        FBUtilities.waitOnFutures(Keyspace.open(SchemaKeyspace.NAME).flush());
+
+
+        CompactionManager.instance.disableAutoCompaction();
+
+        // add a row to another CF so we test skipping mutations within a not-entirely-flushed CF
+        insertRow("Standard2", "key");
+
+        for (int i = 0; i < 100; i++)
+        {
+            String key = "key" + i;
+            insertRow("Standard1", key);
+        }
+
+        Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace1.getColumnFamilyStore("Standard1");
+        logger.debug("forcing flush");
+        cfs.forceBlockingFlush();
+
+        logger.debug("begin manual replay");
+        // replay the commit log (nothing on Standard1 should be replayed since everything was flushed, so only the row on Standard2
+        // will be replayed)
+        int replayed = CommitLog.instance.resetUnsafe(false);
+        assert replayed == 1 : "Expecting only 1 replayed mutation, got " + replayed;
+    }
+
+    private void insertRow(String cfname, String key)
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        new RowUpdateBuilder(cfs.metadata, 0, key)
+            .clustering("c")
+            .add("val", "val1")
+            .build()
+            .apply();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerMissingHeaderTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerMissingHeaderTest.java
new file mode 100644
index 0000000..8ac7c5d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerMissingHeaderTest.java

@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.compress.DeflateCompressor;
+import org.apache.cassandra.io.compress.LZ4Compressor;
+import org.apache.cassandra.io.compress.SnappyCompressor;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+@RunWith(Parameterized.class)
+public class RecoveryManagerMissingHeaderTest
+{
+    private static final String KEYSPACE1 = "RecoveryManager3Test1";
+    private static final String CF_STANDARD1 = "Standard1";
+
+    private static final String KEYSPACE2 = "RecoveryManager3Test2";
+    private static final String CF_STANDARD3 = "Standard3";
+
+    public RecoveryManagerMissingHeaderTest(ParameterizedClass commitLogCompression)
+    {
+        DatabaseDescriptor.setCommitLogCompression(commitLogCompression);
+    }
+
+    @Before
+    public void setUp() throws IOException
+    {
+        CommitLog.instance.resetUnsafe(true);
+    }
+
+    @Parameters()
+    public static Collection<Object[]> generateData()
+    {
+        return Arrays.asList(new Object[][] {
+                { null }, // No compression
+                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.emptyMap()) } });
+    }
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+        SchemaLoader.createKeyspace(KEYSPACE2,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD3));
+    }
+
+    @Test
+    public void testMissingHeader() throws IOException
+    {
+        Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
+        Keyspace keyspace2 = Keyspace.open(KEYSPACE2);
+
+        DecoratedKey dk = Util.dk("keymulti");
+        UnfilteredRowIterator upd1 = Util.apply(new RowUpdateBuilder(keyspace1.getColumnFamilyStore(CF_STANDARD1).metadata, 1L, 0, "keymulti")
+                                       .clustering("col1").add("val", "1")
+                                       .build());
+
+        UnfilteredRowIterator upd2 = Util.apply(new RowUpdateBuilder(keyspace2.getColumnFamilyStore(CF_STANDARD3).metadata, 1L, 0, "keymulti")
+                                       .clustering("col1").add("val", "1")
+                                       .build());
+
+        keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
+        keyspace2.getColumnFamilyStore("Standard3").clearUnsafe();
+
+        // nuke the header
+        for (File file : new File(DatabaseDescriptor.getCommitLogLocation()).listFiles())
+        {
+            if (file.getName().endsWith(".header"))
+                FileUtils.deleteWithConfirm(file);
+        }
+
+        CommitLog.instance.resetUnsafe(false);
+
+        Assert.assertTrue(Util.equal(upd1, Util.getOnlyPartitionUnfiltered(Util.cmd(keyspace1.getColumnFamilyStore(CF_STANDARD1), dk).build()).unfilteredIterator()));
+        Assert.assertTrue(Util.equal(upd2, Util.getOnlyPartitionUnfiltered(Util.cmd(keyspace2.getColumnFamilyStore(CF_STANDARD3), dk).build()).unfilteredIterator()));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java
index 5676b99..57bd044 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerTest.java

@@ -26,35 +26,43 @@
 import java.util.concurrent.TimeUnit;
 
 import org.junit.Assert;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 import org.junit.runners.Parameterized.Parameters;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.ParameterizedClass;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogArchiver;
-import org.apache.cassandra.db.marshal.CounterColumnType;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.compress.DeflateCompressor;
 import org.apache.cassandra.io.compress.LZ4Compressor;
 import org.apache.cassandra.io.compress.SnappyCompressor;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
+import static org.junit.Assert.assertEquals;
 
 @RunWith(Parameterized.class)
 public class RecoveryManagerTest
 {
+    private static Logger logger = LoggerFactory.getLogger(RecoveryManagerTest.class);
+
     private static final String KEYSPACE1 = "RecoveryManagerTest1";
     private static final String CF_STANDARD1 = "Standard1";
+    private static final String CF_STATIC1 = "Static1";
     private static final String CF_COUNTER1 = "Counter1";
 
     private static final String KEYSPACE2 = "RecoveryManagerTest2";
@@ -65,16 +73,22 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER1).defaultValidator(CounterColumnType.instance));
+                                    SchemaLoader.counterCFMD(KEYSPACE1, CF_COUNTER1),
+                                    SchemaLoader.staticCFMD(KEYSPACE1, CF_STATIC1));
         SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD3));
     }
 
+    @Before
+    public void clearData()
+    {
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1).truncateBlocking();
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_COUNTER1).truncateBlocking();
+        Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD3).truncateBlocking();
+    }
     public RecoveryManagerTest(ParameterizedClass commitLogCompression)
     {
         DatabaseDescriptor.setCommitLogCompression(commitLogCompression);
@@ -85,9 +99,9 @@
     {
         return Arrays.asList(new Object[][] {
                 { null }, // No compression
-                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.<String, String>emptyMap()) } });
+                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.emptyMap()) } });
     }
 
     @Test
@@ -103,27 +117,22 @@
         Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
         Keyspace keyspace2 = Keyspace.open(KEYSPACE2);
 
-        Mutation rm;
-        DecoratedKey dk = Util.dk("keymulti");
-        ColumnFamily cf;
+        UnfilteredRowIterator upd1 = Util.apply(new RowUpdateBuilder(keyspace1.getColumnFamilyStore(CF_STANDARD1).metadata, 1L, 0, "keymulti")
+            .clustering("col1").add("val", "1")
+            .build());
 
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf.addColumn(column("col1", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, dk.getKey(), cf);
-        rm.apply();
-
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE2, "Standard3");
-        cf.addColumn(column("col2", "val2", 1L));
-        rm = new Mutation(KEYSPACE2, dk.getKey(), cf);
-        rm.apply();
+        UnfilteredRowIterator upd2 = Util.apply(new RowUpdateBuilder(keyspace2.getColumnFamilyStore(CF_STANDARD3).metadata, 1L, 0, "keymulti")
+                                       .clustering("col2").add("val", "1")
+                                       .build());
 
         keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
         keyspace2.getColumnFamilyStore("Standard3").clearUnsafe();
 
-        CommitLog.instance.resetUnsafe(false); // disassociate segments from live CL
+        CommitLog.instance.resetUnsafe(false);
 
-        assertColumns(Util.getColumnFamily(keyspace1, dk, "Standard1"), "col1");
-        assertColumns(Util.getColumnFamily(keyspace2, dk, "Standard3"), "col2");
+        DecoratedKey dk = Util.dk("keymulti");
+        Assert.assertTrue(Util.equal(upd1, Util.getOnlyPartitionUnfiltered(Util.cmd(keyspace1.getColumnFamilyStore(CF_STANDARD1), dk).build()).unfilteredIterator()));
+        Assert.assertTrue(Util.equal(upd2, Util.getOnlyPartitionUnfiltered(Util.cmd(keyspace2.getColumnFamilyStore(CF_STANDARD3), dk).build()).unfilteredIterator()));
     }
 
     @Test
@@ -131,89 +140,112 @@
     {
         CommitLog.instance.resetUnsafe(true);
         Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
-
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key");
-        ColumnFamily cf;
+        ColumnFamilyStore cfs = keyspace1.getColumnFamilyStore(CF_COUNTER1);
 
         for (int i = 0; i < 10; ++i)
         {
-            cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Counter1");
-            cf.addColumn(BufferCounterCell.createLocal(cellname("col"), 1L, 1L, Long.MIN_VALUE));
-            rm = new Mutation(KEYSPACE1, dk.getKey(), cf);
-            rm.apply();
+            new CounterMutation(new RowUpdateBuilder(cfs.metadata, 1L, 0, "key")
+                .clustering("cc").add("val", CounterContext.instance().createLocal(1L))
+                .build(), ConsistencyLevel.ALL).apply();
         }
 
         keyspace1.getColumnFamilyStore("Counter1").clearUnsafe();
 
-        CommitLog.instance.resetUnsafe(false); // disassociate segments from live CL
+        int replayed = CommitLog.instance.resetUnsafe(false);
 
-        cf = Util.getColumnFamily(keyspace1, dk, "Counter1");
-
-        assert cf.getColumnCount() == 1;
-        Cell c = cf.getColumn(cellname("col"));
-
-        assert c != null;
-        assert ((CounterCell)c).total() == 10L;
+        ColumnDefinition counterCol = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
+        Row row = Util.getOnlyRow(Util.cmd(cfs).includeRow("cc").columns("val").build());
+        assertEquals(10L, CounterContext.instance().total(row.getCell(counterCol).value()));
     }
 
     @Test
     public void testRecoverPIT() throws Exception
     {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
         CommitLog.instance.resetUnsafe(true);
         Date date = CommitLogArchiver.format.parse("2112:12:12 12:12:12");
         long timeMS = date.getTime() - 5000;
 
         Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
-        DecoratedKey dk = Util.dk("dkey");
         for (int i = 0; i < 10; ++i)
         {
             long ts = TimeUnit.MILLISECONDS.toMicros(timeMS + (i * 1000));
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-            cf.addColumn(column("name-" + i, "value", ts));
-            Mutation rm = new Mutation(KEYSPACE1, dk.getKey(), cf);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, ts, "name-" + i)
+                .clustering("cc")
+                .add("val", Integer.toString(i))
+                .build()
+                .apply();
         }
-        keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
-        CommitLog.instance.resetUnsafe(false); // disassociate segments from live CL
 
-        ColumnFamily cf = Util.getColumnFamily(keyspace1, dk, "Standard1");
-        Assert.assertEquals(6, cf.getColumnCount());
+        // Sanity check row count prior to clear and replay
+        assertEquals(10, Util.getAll(Util.cmd(cfs).build()).size());
+
+        keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
+        CommitLog.instance.resetUnsafe(false);
+
+        assertEquals(6, Util.getAll(Util.cmd(cfs).build()).size());
     }
 
+    @Test
+    public void testRecoverPITStatic() throws Exception
+    {
+        CommitLog.instance.resetUnsafe(true);
+        Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace1.getColumnFamilyStore(CF_STATIC1);
+        Date date = CommitLogArchiver.format.parse("2112:12:12 12:12:12");
+        long timeMS = date.getTime() - 5000;
+
+
+        for (int i = 0; i < 10; ++i)
+        {
+            long ts = TimeUnit.MILLISECONDS.toMicros(timeMS + (i * 1000));
+            new RowUpdateBuilder(cfs.metadata, ts, "name-" + i)
+            .add("val", Integer.toString(i))
+            .build()
+            .apply();
+        }
+
+        // Sanity check row count prior to clear and replay
+        assertEquals(10, Util.getAll(Util.cmd(cfs).build()).size());
+
+        keyspace1.getColumnFamilyStore(CF_STATIC1).clearUnsafe();
+        CommitLog.instance.resetUnsafe(false);
+
+        assertEquals(6, Util.getAll(Util.cmd(cfs).build()).size());
+    }
 
     @Test
     public void testRecoverPITUnordered() throws Exception
     {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
         CommitLog.instance.resetUnsafe(true);
         Date date = CommitLogArchiver.format.parse("2112:12:12 12:12:12");
         long timeMS = date.getTime();
 
         Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
-        DecoratedKey dk = Util.dk("dkey");
 
         // Col 0 and 9 are the only ones to be recovered
         for (int i = 0; i < 10; ++i)
         {
             long ts;
-            if(i==9)
+            if (i == 9)
                 ts = TimeUnit.MILLISECONDS.toMicros(timeMS - 1000);
             else
                 ts = TimeUnit.MILLISECONDS.toMicros(timeMS + (i * 1000));
 
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-            cf.addColumn(column("name-" + i, "value", ts));
-            Mutation rm = new Mutation(KEYSPACE1, dk.getKey(), cf);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, ts, "name-" + i)
+                .clustering("cc")
+                .add("val", Integer.toString(i))
+                .build()
+                .apply();
         }
 
-        ColumnFamily cf = Util.getColumnFamily(keyspace1, dk, "Standard1");
-        Assert.assertEquals(10, cf.getColumnCount());
+        // Sanity check row count prior to clear and replay
+        assertEquals(10, Util.getAll(Util.cmd(cfs).build()).size());
 
         keyspace1.getColumnFamilyStore("Standard1").clearUnsafe();
-        CommitLog.instance.resetUnsafe(false); // disassociate segments from live CL
+        CommitLog.instance.resetUnsafe(false);
 
-        cf = Util.getColumnFamily(keyspace1, dk, "Standard1");
-        Assert.assertEquals(2, cf.getColumnCount());
+        assertEquals(2, Util.getAll(Util.cmd(cfs).build()).size());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java
index 769316f..5a59f1c 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerTruncateTest.java

@@ -18,9 +18,6 @@
 */
 package org.apache.cassandra.db;
 
-import static org.apache.cassandra.Util.column;
-import static org.junit.Assert.*;
-
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
@@ -36,15 +33,15 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.ParameterizedClass;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.compress.DeflateCompressor;
 import org.apache.cassandra.io.compress.LZ4Compressor;
 import org.apache.cassandra.io.compress.SnappyCompressor;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.junit.Assert.assertTrue;
 
 /**
  * Test for the truncate operation.
@@ -54,7 +51,6 @@
 {
     private static final String KEYSPACE1 = "RecoveryManagerTruncateTest";
     private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_STANDARD2 = "Standard2";
 
     public RecoveryManagerTruncateTest(ParameterizedClass commitLogCompression)
     {
@@ -72,9 +68,9 @@
     {
         return Arrays.asList(new Object[][] {
                 { null }, // No compression
-                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.<String, String>emptyMap()) } });
+                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.emptyMap()) } });
     }
 
     @BeforeClass
@@ -82,153 +78,31 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
     }
 
     @Test
     public void testTruncate() throws IOException
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
-
-        Mutation rm;
-        ColumnFamily cf;
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
 
         // add a single cell
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        cf.addColumn(column("col1", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("keymulti"), cf);
-        rm.applyUnsafe();
-        long time = System.currentTimeMillis();
+        new RowUpdateBuilder(cfs.metadata, 0, "key1")
+            .clustering("cc")
+            .add("val", "val1")
+            .build()
+            .applyUnsafe();
 
         // Make sure data was written
-        assertNotNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col1"));
+        assertTrue(Util.getAll(Util.cmd(cfs).build()).size() > 0);
 
         // and now truncate it
         cfs.truncateBlocking();
-        CommitLog.instance.resetUnsafe(false);
+        assert 0 != CommitLog.instance.resetUnsafe(false);
 
         // and validate truncation.
-        assertNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col1"));
-        assertTrue(SystemKeyspace.getTruncatedAt(cfs.metadata.cfId) > time);
-    }
-
-    @Test
-    public void testTruncatePointInTime() throws IOException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
-
-        Mutation rm;
-        ColumnFamily cf;
-
-        // add a single cell
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        cf.addColumn(column("col2", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("keymulti"), cf);
-        rm.apply();
-
-        // Make sure data was written
-        long time = System.currentTimeMillis();
-        assertNotNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col2"));
-
-        // and now truncate it
-        cfs.truncateBlocking();
-
-        // verify truncation
-        assertNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col2"));
-
-        try
-        {
-            // Restore to point in time.
-            CommitLog.instance.archiver.restorePointInTime = time;
-            CommitLog.instance.resetUnsafe(false);
-        }
-        finally
-        {
-            CommitLog.instance.archiver.restorePointInTime = Long.MAX_VALUE;
-        }
-
-        // Validate pre-truncation data was restored.
-        assertNotNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col2"));
-        // And that we don't have a truncation record after restore time.
-        assertFalse(SystemKeyspace.getTruncatedAt(cfs.metadata.cfId) > time);
-    }
-
-    @Test
-    public void testTruncatePointInTimeReplayList() throws IOException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs1 = keyspace.getColumnFamilyStore(CF_STANDARD1);
-        ColumnFamilyStore cfs2 = keyspace.getColumnFamilyStore(CF_STANDARD2);
-
-        Mutation rm;
-        ColumnFamily cf;
-
-        // add a single cell
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD1);
-        cf.addColumn(column("col3", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("keymulti"), cf);
-        rm.apply();
-
-        // add a single cell
-        cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD2);
-        cf.addColumn(column("col4", "val1", 1L));
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("keymulti"), cf);
-        rm.apply();
-
-        // Make sure data was written
-        long time = System.currentTimeMillis();
-        assertNotNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col3"));
-        assertNotNull(getFromTable(keyspace, CF_STANDARD2, "keymulti", "col4"));
-
-        // and now truncate it
-        cfs1.truncateBlocking();
-        cfs2.truncateBlocking();
-
-        // verify truncation
-        assertNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col3"));
-        assertNull(getFromTable(keyspace, CF_STANDARD2, "keymulti", "col4"));
-
-        try
-        {
-            // Restore to point in time.
-            CommitLog.instance.archiver.restorePointInTime = time;
-            System.setProperty("cassandra.replayList", KEYSPACE1 + "." + CF_STANDARD1);
-            CommitLog.instance.resetUnsafe(false);
-        }
-        finally
-        {
-            CommitLog.instance.archiver.restorePointInTime = Long.MAX_VALUE;
-            System.clearProperty("cassandra.replayList");
-        }
-
-        // Validate pre-truncation data was restored.
-        assertNotNull(getFromTable(keyspace, CF_STANDARD1, "keymulti", "col3"));
-        // But only on the replayed table.
-        assertNull(getFromTable(keyspace, CF_STANDARD2, "keymulti", "col4"));
-
-        // And that we have the correct truncation records.
-        assertFalse(SystemKeyspace.getTruncatedAt(cfs1.metadata.cfId) > time);
-        assertTrue(SystemKeyspace.getTruncatedAt(cfs2.metadata.cfId) > time);
-    }
-
-    private Cell getFromTable(Keyspace keyspace, String cfName, String keyName, String columnName)
-    {
-        ColumnFamily cf;
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(cfName);
-        if (cfStore == null)
-        {
-            return null;
-        }
-        cf = cfStore.getColumnFamily(Util.namesQueryFilter(cfStore, Util.dk(keyName), columnName));
-        if (cf == null)
-        {
-            return null;
-        }
-        return cf.getColumn(Util.cellname(columnName));
+        Util.assertEmptyUnfiltered(Util.cmd(cfs).build());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/RemoveCellTest.java b/test/unit/org/apache/cassandra/db/RemoveCellTest.java
index 1edb964..01fe255 100644
--- a/test/unit/org/apache/cassandra/db/RemoveCellTest.java
+++ b/test/unit/org/apache/cassandra/db/RemoveCellTest.java

@@ -18,84 +18,21 @@
 */
 package org.apache.cassandra.db;
 
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
+import org.apache.cassandra.cql3.CQLTester;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class RemoveCellTest
+public class RemoveCellTest extends CQLTester
 {
-    private static final String KEYSPACE1 = "RemoveCellTest";
-    private static final String CF_STANDARD1 = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-    }
-
     @Test
-    public void testRemoveColumn()
+    public void testDeleteCell() throws Throwable
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        // remove
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.delete("Standard1", Util.cellname("Column1"), 1);
-        rm.applyUnsafe();
-
-        ColumnFamily retrieved = store.getColumnFamily(Util.namesQueryFilter(store, dk, "Column1"));
-        assertFalse(retrieved.getColumn(Util.cellname("Column1")).isLive());
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-        assertNull(Util.cloneAndRemoveDeleted(store.getColumnFamily(QueryFilter.getIdentityFilter(dk,
-                                                                                                  "Standard1",
-                                                                                                  System.currentTimeMillis())),
-                                              Integer.MAX_VALUE));
+        String tableName = createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 0, 0, 0, 0L);
+        cfs.forceBlockingFlush();
+        execute("DELETE c FROM %s USING TIMESTAMP ? WHERE a = ? AND b = ?", 1L, 0, 0);
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 0, 0), row(0, 0, null));
+        assertRows(execute("SELECT c FROM %s WHERE a = ? AND b = ?", 0, 0), row(new Object[]{null}));
     }
-
-    private static BufferDeletedCell dc(String name, int ldt, long timestamp)
-    {
-        return new BufferDeletedCell(Util.cellname(name), ldt, timestamp);
-    }
-
-    @Test
-    public void deletedColumnShouldAlwaysBeMarkedForDelete()
-    {
-        // Check for bug in #4307
-        long timestamp = System.currentTimeMillis();
-        int localDeletionTime = (int) (timestamp / 1000);
-        Cell c = dc("dc1", localDeletionTime, timestamp);
-        assertFalse("DeletedCell was not marked for delete", c.isLive(timestamp));
-
-        // Simulate a node that is 30 seconds behind
-        c = dc("dc2", localDeletionTime + 30, timestamp + 30000);
-        assertFalse("DeletedCell was not marked for delete", c.isLive(timestamp));
-
-        // Simulate a node that is 30 ahead behind
-        c = dc("dc3", localDeletionTime - 30, timestamp - 30000);
-        assertFalse("DeletedCell was not marked for delete", c.isLive(timestamp));
-    }
-
 }

diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java
deleted file mode 100644
index fec8711..0000000
--- a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyTest.java
+++ /dev/null

@@ -1,73 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.junit.Assert.assertNull;
-import org.apache.cassandra.db.filter.QueryFilter;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-
-public class RemoveColumnFamilyTest
-{
-    private static final String KEYSPACE1 = "RemoveColumnFamilyTest";
-    private static final String CF_STANDARD1 = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-    }
-
-    @Test
-    public void testRemoveColumnFamily()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
-
-        // remove
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.delete("Standard1", 1);
-        rm.applyUnsafe();
-
-        ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Standard1", System.currentTimeMillis()));
-        assert retrieved.isMarkedForDelete();
-        assertNull(retrieved.getColumn(Util.cellname("Column1")));
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java
deleted file mode 100644
index 72827d0..0000000
--- a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush1Test.java
+++ /dev/null

@@ -1,75 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.junit.Assert.assertNull;
-import org.apache.cassandra.db.filter.QueryFilter;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-
-public class RemoveColumnFamilyWithFlush1Test
-{
-    private static final String KEYSPACE1 = "RemoveColumnFamilyWithFlush1Test";
-    private static final String CF_STANDARD1 = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-    }
-
-    @Test
-    public void testRemoveColumnFamilyWithFlush1()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.add("Standard1", Util.cellname("Column2"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        // remove
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.delete("Standard1", 1);
-        rm.applyUnsafe();
-
-        ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Standard1", System.currentTimeMillis()));
-        assert retrieved.isMarkedForDelete();
-        assertNull(retrieved.getColumn(Util.cellname("Column1")));
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java b/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java
deleted file mode 100644
index ef7f7f2..0000000
--- a/test/unit/org/apache/cassandra/db/RemoveColumnFamilyWithFlush2Test.java
+++ /dev/null

@@ -1,73 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.junit.Assert.assertNull;
-import org.apache.cassandra.db.filter.QueryFilter;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-
-public class RemoveColumnFamilyWithFlush2Test
-{
-    private static final String KEYSPACE1 = "RemoveColumnFamilyWithFlush2Test";
-    private static final String CF_STANDARD1 = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
-    }
-
-    @Test
-    public void testRemoveColumnFamilyWithFlush2()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
-        // remove
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.delete("Standard1", 1);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Standard1", System.currentTimeMillis()));
-        assert retrieved.isMarkedForDelete();
-        assertNull(retrieved.getColumn(Util.cellname("Column1")));
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/RemoveSubCellTest.java b/test/unit/org/apache/cassandra/db/RemoveSubCellTest.java
deleted file mode 100644
index 3fa5c2f..0000000
--- a/test/unit/org/apache/cassandra/db/RemoveSubCellTest.java
+++ /dev/null

@@ -1,119 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.db;
-
-import java.nio.ByteBuffer;
-import java.util.concurrent.TimeUnit;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-
-import static org.apache.cassandra.Util.getBytes;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import com.google.common.util.concurrent.Uninterruptibles;
-
-
-public class RemoveSubCellTest
-{
-    private static final String KEYSPACE1 = "RemoveSubCellTest";
-    private static final String CF_SUPER1 = "Super1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER1, LongType.instance));
-    }
-
-    @Test
-    public void testRemoveSubColumn()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Super1");
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
-
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        Util.addMutation(rm, "Super1", "SC1", 1, "asdf", 0);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        CellName cname = CellNames.compositeDense(ByteBufferUtil.bytes("SC1"), getBytes(1L));
-        // remove
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.delete("Super1", cname, 1);
-        rm.applyUnsafe();
-
-        ColumnFamily retrieved = store.getColumnFamily(QueryFilter.getIdentityFilter(dk, "Super1", System.currentTimeMillis()));
-        assertFalse(retrieved.getColumn(cname).isLive());
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-    }
-
-    @Test
-    public void testRemoveSubColumnAndContainer()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Super1");
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key2");
-
-        // add data
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        Util.addMutation(rm, "Super1", "SC1", 1, "asdf", 0);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        // remove the SC
-        ByteBuffer scName = ByteBufferUtil.bytes("SC1");
-        CellName cname = CellNames.compositeDense(scName, getBytes(1L));
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.deleteRange("Super1", SuperColumns.startOf(scName), SuperColumns.endOf(scName), 1);
-        rm.applyUnsafe();
-
-        // Mark current time and make sure the next insert happens at least
-        // one second after the previous one (since gc resolution is the second)
-        QueryFilter filter = QueryFilter.getIdentityFilter(dk, "Super1", System.currentTimeMillis());
-        Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
-
-        // remove the column itself
-        rm = new Mutation(KEYSPACE1, dk.getKey());
-        rm.delete("Super1", cname, 2);
-        rm.applyUnsafe();
-
-        ColumnFamily retrieved = store.getColumnFamily(filter);
-        assertFalse(retrieved.getColumn(cname).isLive());
-        assertNull(Util.cloneAndRemoveDeleted(retrieved, Integer.MAX_VALUE));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java b/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java
new file mode 100644
index 0000000..e0d68a4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java

@@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.util.Collections;
+
+import com.google.common.collect.Iterables;
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.AbstractRow;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class RepairedDataTombstonesTest extends CQLTester
+{
+    @Test
+    public void compactionTest() throws Throwable
+    {
+        createTable("create table %s (id int, id2 int, t text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        // insert a live row to make sure that the sstables are not dropped (we test dropping in compactionDropExpiredSSTableTest() below)
+        execute("insert into %s (id, id2, t) values (999,999,'live')");
+        for (int i = 0; i < 10; i++)
+        {
+            execute("delete from %s where id=? and id2=?", 1, i);
+        }
+        flush();
+        SSTableReader repairedSSTable = getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE).iterator().next();
+        repair(getCurrentColumnFamilyStore(), repairedSSTable);
+        Thread.sleep(2000);
+        execute("insert into %s (id, id2, t) values (999,999,'live')");
+        for (int i = 10; i < 20; i++)
+        {
+            execute("delete from %s where id=? and id2=?", 1, i);
+        }
+        flush();
+        Thread.sleep(1000);
+        // at this point we have 2 sstables, one repaired and one unrepaired. Both sstables contain expired tombstones, but we should only drop the tombstones from the repaired sstable.
+        getCurrentColumnFamilyStore().forceMajorCompaction();
+        verifyIncludingPurgeable();
+        verify2IncludingPurgeable(1);
+        assertEquals(2, Iterables.size(getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE)));
+    }
+
+    @Test
+    public void compactionDropExpiredSSTableTest() throws Throwable
+    {
+        createTable("create table %s (id int, id2 int, t text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        for (int i = 0; i < 10; i++)
+        {
+            execute("delete from %s where id=? and id2=?", 1, i);
+        }
+        flush();
+        SSTableReader repairedSSTable = getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE).iterator().next();
+        repair(getCurrentColumnFamilyStore(), repairedSSTable);
+        Thread.sleep(2000);
+        for (int i = 10; i < 20; i++)
+        {
+            execute("delete from %s where id=? and id2=?", 1, i);
+        }
+        flush();
+        Thread.sleep(1000);
+        getCurrentColumnFamilyStore().forceMajorCompaction();
+        verifyIncludingPurgeable();
+        verify2IncludingPurgeable(1);
+        assertEquals(1, Iterables.size(getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE)));
+        assertFalse(getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE).iterator().next().isRepaired());
+
+    }
+
+    @Test
+    public void readTest() throws Throwable
+    {
+        createTable("create table %s (id int, id2 int, t text, t2 text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        for (int i = 0; i < 10; i++)
+        {
+            execute("update %s set t2=null where id=? and id2=?", 123, i);
+        }
+        flush();
+        SSTableReader repairedSSTable = getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE).iterator().next();
+        repair(getCurrentColumnFamilyStore(), repairedSSTable);
+        Thread.sleep(2000);
+        for (int i = 10; i < 20; i++)
+        {
+            execute("update %s set t2=null where id=? and id2=?", 123, i);
+        }
+        flush();
+        // allow gcgrace to properly expire:
+        Thread.sleep(1000);
+        // make sure we only see the unrepaired tombstones, the other ones are expired and can be purged
+        verify();
+        verify2(123);
+    }
+
+    @Test
+    public void readOnlyUnrepairedTest() throws Throwable
+    {
+        // make sure we keep all tombstones if we only have unrepaired data
+        createTable("create table %s (id int, id2 int, t text, t2 text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        for (int i = 10; i < 20; i++)
+        {
+            execute("update %s set t2=null where id=? and id2=?", 123, i);
+        }
+        flush();
+
+        // allow gcgrace to properly expire:
+        Thread.sleep(1000);
+        verifyIncludingPurgeable();
+        verify2IncludingPurgeable(123);
+    }
+
+
+    @Test
+    public void readTestRowTombstones() throws Throwable
+    {
+        createTable("create table %s (id int, id2 int, t text, t2 text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        for (int i = 0; i < 10; i++)
+        {
+            execute("delete from %s where id=? and id2=?", 1, i);
+        }
+        flush();
+        SSTableReader repairedSSTable = getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE).iterator().next();
+        repair(getCurrentColumnFamilyStore(), repairedSSTable);
+        Thread.sleep(2000);
+        for (int i = 10; i < 20; i++)
+        {
+            execute("delete from %s where id=? and id2=?", 1, i);
+        }
+        flush();
+        Thread.sleep(1000);
+        verify();
+        verify2(1);
+    }
+
+    @Test
+    public void readTestPartitionTombstones() throws Throwable
+    {
+        createTable("create table %s (id int, id2 int, t text, t2 text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        for (int i = 0; i < 10; i++)
+        {
+            execute("delete from %s where id=?", i);
+        }
+        flush();
+        SSTableReader repairedSSTable = getCurrentColumnFamilyStore().getSSTables(SSTableSet.LIVE).iterator().next();
+        repair(getCurrentColumnFamilyStore(), repairedSSTable);
+        Thread.sleep(2000);
+        for (int i = 10; i < 20; i++)
+        {
+            execute("delete from %s where id=?", i);
+        }
+        flush();
+
+        Thread.sleep(1000);
+        ReadCommand cmd = Util.cmd(getCurrentColumnFamilyStore()).build();
+        int partitionsFound = 0;
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator iterator = cmd.executeLocally(orderGroup))
+        {
+            while (iterator.hasNext())
+            {
+                partitionsFound++;
+                UnfilteredRowIterator rowIter = iterator.next();
+                int val = ByteBufferUtil.toInt(rowIter.partitionKey().getKey());
+                assertTrue("val=" + val, val >= 10 && val < 20);
+            }
+        }
+        assertEquals(10, partitionsFound);
+    }
+
+    @Test
+    public void readTestOldUnrepaired() throws Throwable
+    {
+        createTable("create table %s (id int, id2 int, t text, t2 text, primary key (id, id2)) with gc_grace_seconds=0 and compaction = {'class':'SizeTieredCompactionStrategy', 'only_purge_repaired_tombstones':true}");
+        getCurrentColumnFamilyStore().disableAutoCompaction();
+        for (int i = 0; i < 10; i++)
+        {
+            execute("delete from %s where id=1 and id2=?", i);
+        }
+        flush();
+        SSTableReader oldSSTable = getCurrentColumnFamilyStore().getLiveSSTables().iterator().next();
+        Thread.sleep(2000);
+        for (int i = 10; i < 20; i++)
+        {
+            execute("delete from %s where id=1 and id2=?", i);
+        }
+        flush();
+        for (SSTableReader sstable : getCurrentColumnFamilyStore().getLiveSSTables())
+            if (sstable != oldSSTable)
+                repair(getCurrentColumnFamilyStore(), sstable);
+        Thread.sleep(2000);
+        for (int i = 20; i < 30; i++)
+        {
+            execute("delete from %s where id=1 and id2=?", i);
+        }
+        flush();
+
+        Thread.sleep(2000);
+        // we will keep all tombstones since the oldest tombstones are unrepaired:
+        verify(30, 0, 30, false);
+        verify2(1, 30, 0, 30, false);
+    }
+
+    private void verify()
+    {
+        verify(10, 10, 20, false);
+    }
+
+    private void verifyIncludingPurgeable()
+    {
+        verify(10, 10, 20, true);
+    }
+
+    private void verify(int expectedRows, int minVal, int maxVal, boolean includePurgeable)
+    {
+        ReadCommand cmd = Util.cmd(getCurrentColumnFamilyStore()).build();
+        int foundRows = 0;
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup();
+             UnfilteredPartitionIterator iterator =
+             includePurgeable ? cmd.queryStorage(getCurrentColumnFamilyStore(), orderGroup) :
+                                cmd.executeLocally(orderGroup))
+        {
+            while (iterator.hasNext())
+            {
+                try (UnfilteredRowIterator rowIter = iterator.next())
+                {
+                    if (!rowIter.partitionKey().equals(Util.dk(ByteBufferUtil.bytes(999)))) // partition key 999 is 'live' and used to avoid sstables from being dropped
+                    {
+                        while (rowIter.hasNext())
+                        {
+                            AbstractRow row = (AbstractRow) rowIter.next();
+                            for (int i = 0; i < row.clustering().size(); i++)
+                            {
+                                foundRows++;
+                                int val = ByteBufferUtil.toInt(row.clustering().get(i));
+                                assertTrue("val=" + val, val >= minVal && val < maxVal);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        assertEquals(expectedRows, foundRows);
+    }
+
+    private void verify2(int key)
+    {
+        verify2(key, 10, 10, 20, false);
+    }
+
+    private void verify2IncludingPurgeable(int key)
+    {
+        verify2(key, 10, 10, 20, true);
+    }
+
+    private void verify2(int key, int expectedRows, int minVal, int maxVal, boolean includePurgeable)
+    {
+        ReadCommand cmd = Util.cmd(getCurrentColumnFamilyStore(), Util.dk(ByteBufferUtil.bytes(key))).build();
+        int foundRows = 0;
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup();
+             UnfilteredPartitionIterator iterator =
+             includePurgeable ? cmd.queryStorage(getCurrentColumnFamilyStore(), orderGroup) :
+                                cmd.executeLocally(orderGroup))
+        {
+            while (iterator.hasNext())
+            {
+                try (UnfilteredRowIterator rowIter = iterator.next())
+                {
+                    while (rowIter.hasNext())
+                    {
+                        AbstractRow row = (AbstractRow) rowIter.next();
+                        for (int i = 0; i < row.clustering().size(); i++)
+                        {
+                            foundRows++;
+                            int val = ByteBufferUtil.toInt(row.clustering().get(i));
+                            assertTrue("val=" + val, val >= minVal && val < maxVal);
+                        }
+                    }
+                }
+            }
+        }
+        assertEquals(expectedRows, foundRows);
+    }
+
+    public static void repair(ColumnFamilyStore cfs, SSTableReader sstable) throws IOException
+    {
+        sstable.descriptor.getMetadataSerializer().mutateRepairedAt(sstable.descriptor, 1);
+        sstable.reloadSSTableMetadata();
+        cfs.getTracker().notifySSTableRepairedStatusChanged(Collections.singleton(sstable));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java b/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java
index 3dc5ce3..a8f7e3d 100644
--- a/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/RowCacheCQLTest.java

@@ -30,11 +30,81 @@
     public void test7636() throws Throwable
     {
         CacheService.instance.setRowCacheCapacityInMB(1);
-        createTable("CREATE TABLE %s (p1 bigint, c1 int, PRIMARY KEY (p1, c1)) WITH caching = '{\"keys\":\"NONE\", \"rows_per_partition\":\"ALL\"}'");
-        execute("INSERT INTO %s (p1, c1) VALUES (123, 10)");
-        assertEmpty(execute("SELECT * FROM %s WHERE p1=123 and c1 > 1000"));
-        UntypedResultSet res = execute("SELECT * FROM %s WHERE p1=123 and c1 > 0");
+        createTable("CREATE TABLE %s (p1 bigint, c1 int, v int, PRIMARY KEY (p1, c1)) WITH caching = { 'keys': 'NONE', 'rows_per_partition': 'ALL' }");
+        execute("INSERT INTO %s (p1, c1, v) VALUES (?, ?, ?)", 123L, 10, 12);
+        assertEmpty(execute("SELECT * FROM %s WHERE p1 = ? and c1 > ?", 123L, 1000));
+        UntypedResultSet res = execute("SELECT * FROM %s WHERE p1 = ? and c1 > ?", 123L, 0);
         assertEquals(1, res.size());
-        assertEmpty(execute("SELECT * FROM %s WHERE p1=123 and c1 > 1000"));
+        assertEmpty(execute("SELECT * FROM %s WHERE p1 = ? and c1 > ?", 123L, 1000));
+    }
+
+    /**
+     * Test for CASSANDRA-13482
+     */
+    @Test
+    public void testPartialCache() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck1 int, v1 int, v2 int, primary key (pk, ck1))" +
+                    "WITH CACHING = { 'keys': 'ALL', 'rows_per_partition': '1' }");
+        assertEmpty(execute("select * from %s where pk = 10000"));
+
+        execute("DELETE FROM %s WHERE pk = 1 AND ck1 = 0");
+        execute("DELETE FROM %s WHERE pk = 1 AND ck1 = 1");
+        execute("DELETE FROM %s WHERE pk = 1 AND ck1 = 2");
+        execute("INSERT INTO %s (pk, ck1, v1, v2) VALUES (1, 1, 1, 1)");
+        execute("INSERT INTO %s (pk, ck1, v1, v2) VALUES (1, 2, 2, 2)");
+        execute("INSERT INTO %s (pk, ck1, v1, v2) VALUES (1, 3, 3, 3)");
+        execute("DELETE FROM %s WHERE pk = 1 AND ck1 = 2");
+        execute("DELETE FROM %s WHERE pk = 1 AND ck1 = 3");
+        execute("INSERT INTO %s (pk, ck1, v1, v2) VALUES (1, 4, 4, 4)");
+        execute("INSERT INTO %s (pk, ck1, v1, v2) VALUES (1, 5, 5, 5)");
+
+        assertRows(execute("select * from %s where pk = 1"),
+                   row(1, 1, 1, 1),
+                   row(1, 4, 4, 4),
+                   row(1, 5, 5, 5));
+        assertRows(execute("select * from %s where pk = 1 LIMIT 1"),
+                   row(1, 1, 1, 1));
+
+        assertRows(execute("select * from %s where pk = 1 and ck1 >=2"),
+                   row(1, 4, 4, 4),
+                   row(1, 5, 5, 5));
+        assertRows(execute("select * from %s where pk = 1 and ck1 >=2 LIMIT 1"),
+                   row(1, 4, 4, 4));
+
+        assertRows(execute("select * from %s where pk = 1 and ck1 >=2"),
+                   row(1, 4, 4, 4),
+                   row(1, 5, 5, 5));
+        assertRows(execute("select * from %s where pk = 1 and ck1 >=2 LIMIT 1"),
+                   row(1, 4, 4, 4));
+    }
+
+    @Test
+    public void testPartialCacheWithStatic() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck1 int, s int static, v1 int, primary key (pk, ck1))" +
+                    "WITH CACHING = { 'keys': 'ALL', 'rows_per_partition': '1' }");
+        assertEmpty(execute("select * from %s where pk = 10000"));
+
+        execute("INSERT INTO %s (pk, s) VALUES (1, 1)");
+        execute("INSERT INTO %s (pk, ck1, v1) VALUES (1, 2, 2)");
+        execute("INSERT INTO %s (pk, ck1, v1) VALUES (1, 3, 3)");
+
+        execute("DELETE FROM %s WHERE pk = 2 AND ck1 = 0");
+        execute("DELETE FROM %s WHERE pk = 2 AND ck1 = 1");
+        execute("DELETE FROM %s WHERE pk = 3 AND ck1 = 2");
+        execute("INSERT INTO %s (pk, s) VALUES (2, 2)");
+        execute("INSERT INTO %s (pk, ck1, v1) VALUES (2, 1, 1)");
+        execute("INSERT INTO %s (pk, ck1, v1) VALUES (2, 2, 2)");
+        execute("INSERT INTO %s (pk, ck1, v1) VALUES (2, 3, 3)");
+
+        assertRows(execute("select * from %s WHERE pk = 1"),
+                   row(1, 2, 1, 2),
+                   row(1, 3, 1, 3));
+
+        assertRows(execute("select * from %s WHERE pk = 2"),
+                   row(2, 1, 2, 1),
+                   row(2, 2, 2, 2),
+                   row(2, 3, 2, 3));
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/RowCacheTest.java b/test/unit/org/apache/cassandra/db/RowCacheTest.java
index 332714fb..06aed47 100644
--- a/test/unit/org/apache/cassandra/db/RowCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/RowCacheTest.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -21,7 +21,7 @@
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Collection;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.TreeSet;
 
@@ -29,35 +29,37 @@
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
-
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.cache.CachingOptions;
 import org.apache.cassandra.cache.RowCacheKey;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.partitions.CachedPartition;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken;
 import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.metrics.ClearableHistogram;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+
+import static org.junit.Assert.*;
 
 public class RowCacheTest
 {
     private static final String KEYSPACE_CACHED = "RowCacheTest";
     private static final String CF_CACHED = "CachedCF";
     private static final String CF_CACHEDINT = "CachedIntCF";
+    private static final String CF_CACHEDNOCLUSTER = "CachedNoClustering";
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
@@ -65,13 +67,12 @@
         System.setProperty("org.caffinitas.ohc.segmentCount", "16");
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE_CACHED,
-                SimpleStrategy.class,
-                KSMetaData.optsWithRF(1),
-                SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHED).caching(CachingOptions.ALL),
-                SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHEDINT)
-                            .defaultValidator(IntegerType.instance)
-                            .caching(new CachingOptions(new CachingOptions.KeyCache(CachingOptions.KeyCache.Type.ALL),
-                                     new CachingOptions.RowCache(CachingOptions.RowCache.Type.HEAD, 100))));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHEDNOCLUSTER, 1, AsciiType.instance, AsciiType.instance, null)
+                                                .caching(new CachingParams(true, 100)),
+                                    SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHED).caching(CachingParams.CACHE_EVERYTHING),
+                                    SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHEDINT, 1, IntegerType.instance)
+                                                .caching(new CachingParams(true, 100)));
     }
 
     @AfterClass
@@ -81,6 +82,53 @@
     }
 
     @Test
+    public void testRoundTrip() throws Exception
+    {
+        CompactionManager.instance.disableAutoCompaction();
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE_CACHED);
+        String cf = "CachedIntCF";
+        ColumnFamilyStore cachedStore  = keyspace.getColumnFamilyStore(cf);
+        long startRowCacheHits = cachedStore.metric.rowCacheHit.getCount();
+        long startRowCacheOutOfRange = cachedStore.metric.rowCacheHitOutOfRange.getCount();
+        // empty the row cache
+        CacheService.instance.invalidateRowCache();
+
+        // set global row cache size to 1 MB
+        CacheService.instance.setRowCacheCapacityInMB(1);
+
+        ByteBuffer key = ByteBufferUtil.bytes("rowcachekey");
+        DecoratedKey dk = cachedStore.decorateKey(key);
+        RowCacheKey rck = new RowCacheKey(cachedStore.metadata.ksAndCFName, dk);
+
+        RowUpdateBuilder rub = new RowUpdateBuilder(cachedStore.metadata, System.currentTimeMillis(), key);
+        rub.clustering(String.valueOf(0));
+        rub.add("val", ByteBufferUtil.bytes("val" + 0));
+        rub.build().applyUnsafe();
+
+        // populate row cache, we should not get a row cache hit;
+        Util.getAll(Util.cmd(cachedStore, dk).withLimit(1).build());
+        assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
+
+        // do another query, limit is 20, which is < 100 that we cache, we should get a hit and it should be in range
+        Util.getAll(Util.cmd(cachedStore, dk).withLimit(1).build());
+        assertEquals(++startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
+        assertEquals(startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.getCount());
+
+        CachedPartition cachedCf = (CachedPartition)CacheService.instance.rowCache.get(rck);
+        assertEquals(1, cachedCf.rowCount());
+        for (Unfiltered unfiltered : Util.once(cachedCf.unfilteredIterator(ColumnFilter.selection(cachedCf.columns()), Slices.ALL, false)))
+        {
+            Row r = (Row) unfiltered;
+            for (ColumnData c : r)
+            {
+                assertEquals(((Cell)c).value(), ByteBufferUtil.bytes("val" + 0));
+            }
+        }
+        cachedStore.truncateBlocking();
+    }
+
+    @Test
     public void testRowCache() throws Exception
     {
         CompactionManager.instance.disableAutoCompaction();
@@ -102,19 +150,25 @@
         {
             DecoratedKey key = Util.dk("key" + i);
 
-            cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
+            Util.getAll(Util.cmd(cachedStore, key).build());
             assert CacheService.instance.rowCache.size() == i + 1;
-            assert cachedStore.containsCachedRow(key); // current key should be stored in the cache
+            assert cachedStore.containsCachedParition(key); // current key should be stored in the cache
 
             // checking if cell is read correctly after cache
-            ColumnFamily cf = cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
-            Collection<Cell> cells = cf.getSortedColumns();
+            CachedPartition cp = cachedStore.getRawCachedPartition(key);
+            try (UnfilteredRowIterator ai = cp.unfilteredIterator(ColumnFilter.selection(cp.columns()), Slices.ALL, false))
+            {
+                assert ai.hasNext();
+                Row r = (Row)ai.next();
+                assertFalse(ai.hasNext());
 
-            Cell cell = cells.iterator().next();
+                Iterator<Cell> ci = r.cells().iterator();
+                assert(ci.hasNext());
+                Cell cell = ci.next();
 
-            assert cells.size() == 1;
-            assert cell.name().toByteBuffer().equals(ByteBufferUtil.bytes("col" + i));
-            assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
+                assert cell.column().name.bytes.equals(ByteBufferUtil.bytes("val"));
+                assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
+            }
         }
 
         // insert 10 more keys
@@ -124,25 +178,31 @@
         {
             DecoratedKey key = Util.dk("key" + i);
 
-            cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
-            assert cachedStore.containsCachedRow(key); // cache should be populated with the latest rows read (old ones should be popped)
+            Util.getAll(Util.cmd(cachedStore, key).build());
+            assert cachedStore.containsCachedParition(key); // cache should be populated with the latest rows read (old ones should be popped)
 
             // checking if cell is read correctly after cache
-            ColumnFamily cf = cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
-            Collection<Cell> cells = cf.getSortedColumns();
+            CachedPartition cp = cachedStore.getRawCachedPartition(key);
+            try (UnfilteredRowIterator ai = cp.unfilteredIterator(ColumnFilter.selection(cp.columns()), Slices.ALL, false))
+            {
+                assert ai.hasNext();
+                Row r = (Row)ai.next();
+                assertFalse(ai.hasNext());
 
-            Cell cell = cells.iterator().next();
+                Iterator<Cell> ci = r.cells().iterator();
+                assert(ci.hasNext());
+                Cell cell = ci.next();
 
-            assert cells.size() == 1;
-            assert cell.name().toByteBuffer().equals(ByteBufferUtil.bytes("col" + i));
-            assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
+                assert cell.column().name.bytes.equals(ByteBufferUtil.bytes("val"));
+                assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
+            }
         }
 
         // clear 100 rows from the cache
         int keysLeft = 109;
         for (int i = 109; i >= 10; i--)
         {
-            cachedStore.invalidateCachedRow(Util.dk("key" + i));
+            cachedStore.invalidateCachedPartition(Util.dk("key" + i));
             assert CacheService.instance.rowCache.size() == keysLeft;
             keysLeft--;
         }
@@ -151,6 +211,74 @@
     }
 
     @Test
+    public void testRowCacheNoClustering() throws Exception
+    {
+        CompactionManager.instance.disableAutoCompaction();
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE_CACHED);
+        ColumnFamilyStore cachedStore  = keyspace.getColumnFamilyStore(CF_CACHEDNOCLUSTER);
+
+        // empty the row cache
+        CacheService.instance.invalidateRowCache();
+
+        // set global row cache size to 1 MB
+        CacheService.instance.setRowCacheCapacityInMB(1);
+
+        // inserting 100 rows into column family
+        SchemaLoader.insertData(KEYSPACE_CACHED, CF_CACHEDNOCLUSTER, 0, 100);
+
+        // now reading rows one by one and checking if row cache grows
+        for (int i = 0; i < 100; i++)
+        {
+            DecoratedKey key = Util.dk("key" + i);
+
+            Util.getAll(Util.cmd(cachedStore, key).build());
+
+            assertEquals(CacheService.instance.rowCache.size(), i + 1);
+            assert(cachedStore.containsCachedParition(key)); // current key should be stored in the cache
+        }
+
+        // insert 10 more keys
+        SchemaLoader.insertData(KEYSPACE_CACHED, CF_CACHEDNOCLUSTER, 100, 10);
+
+        for (int i = 100; i < 110; i++)
+        {
+            DecoratedKey key = Util.dk("key" + i);
+
+            Util.getAll(Util.cmd(cachedStore, key).build());
+            assert cachedStore.containsCachedParition(key); // cache should be populated with the latest rows read (old ones should be popped)
+
+            // checking if cell is read correctly after cache
+            CachedPartition cp = cachedStore.getRawCachedPartition(key);
+            try (UnfilteredRowIterator ai = cp.unfilteredIterator(ColumnFilter.selection(cp.columns()), Slices.ALL, false))
+            {
+                assert ai.hasNext();
+                Row r = (Row)ai.next();
+                assertFalse(ai.hasNext());
+
+                Iterator<Cell> ci = r.cells().iterator();
+                assert(ci.hasNext());
+                Cell cell = ci.next();
+
+                assert cell.column().name.bytes.equals(ByteBufferUtil.bytes("val"));
+                assert cell.value().equals(ByteBufferUtil.bytes("val" + i));
+            }
+        }
+
+        // clear 100 rows from the cache
+        int keysLeft = 109;
+        for (int i = 109; i >= 10; i--)
+        {
+            cachedStore.invalidateCachedPartition(Util.dk("key" + i));
+            assert CacheService.instance.rowCache.size() == keysLeft;
+            keysLeft--;
+        }
+
+        CacheService.instance.setRowCacheCapacityInMB(0);
+
+    }
+
+    @Test
     public void testRowCacheLoad() throws Exception
     {
         CacheService.instance.setRowCacheCapacityInMB(1);
@@ -190,10 +318,10 @@
         ColumnFamilyStore store = Keyspace.open(KEYSPACE_CACHED).getColumnFamilyStore(CF_CACHED);
         assertEquals(CacheService.instance.rowCache.size(), 100);
 
-        //construct 5 ranges of 20 elements each
+        //construct 5 bounds of 20 elements each
         ArrayList<Bounds<Token>> subranges = getBounds(20);
 
-        //invalidate 3 of the 5 ranges
+        //invalidate 3 of the 5 bounds
         ArrayList<Bounds<Token>> boundsToInvalidate = Lists.newArrayList(subranges.get(0), subranges.get(2), subranges.get(4));
         int invalidatedKeys = store.invalidateRowCache(boundsToInvalidate);
         assertEquals(60, invalidatedKeys);
@@ -209,7 +337,7 @@
         TreeSet<DecoratedKey> orderedKeys = new TreeSet<>();
 
         for(Iterator<RowCacheKey> it = CacheService.instance.rowCache.keyIterator();it.hasNext();)
-            orderedKeys.add(store.partitioner.decorateKey(ByteBuffer.wrap(it.next().key)));
+            orderedKeys.add(store.decorateKey(ByteBuffer.wrap(it.next().key)));
 
         ArrayList<Bounds<Token>> boundsToInvalidate = new ArrayList<>();
         Iterator<DecoratedKey> iterator = orderedKeys.iterator();
@@ -285,41 +413,35 @@
         CacheService.instance.setRowCacheCapacityInMB(1);
 
         ByteBuffer key = ByteBufferUtil.bytes("rowcachekey");
-        DecoratedKey dk = cachedStore.partitioner.decorateKey(key);
+        DecoratedKey dk = cachedStore.decorateKey(key);
         RowCacheKey rck = new RowCacheKey(cachedStore.metadata.ksAndCFName, dk);
-        Mutation mutation = new Mutation(KEYSPACE_CACHED, key);
+        String values[] = new String[200];
         for (int i = 0; i < 200; i++)
-            mutation.add(cf, Util.cellname(i), ByteBufferUtil.bytes("val" + i), System.currentTimeMillis());
-        mutation.applyUnsafe();
+        {
+            RowUpdateBuilder rub = new RowUpdateBuilder(cachedStore.metadata, System.currentTimeMillis(), key);
+            rub.clustering(String.valueOf(i));
+            values[i] = "val" + i;
+            rub.add("val", ByteBufferUtil.bytes(values[i]));
+            rub.build().applyUnsafe();
+        }
+        Arrays.sort(values);
 
         // populate row cache, we should not get a row cache hit;
-        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
-                                                                Composites.EMPTY,
-                                                                Composites.EMPTY,
-                                                                false, 10, System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cachedStore, dk).withLimit(10).build());
         assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
 
         // do another query, limit is 20, which is < 100 that we cache, we should get a hit and it should be in range
-        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
-                                                                Composites.EMPTY,
-                                                                Composites.EMPTY,
-                                                                false, 20, System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cachedStore, dk).withLimit(10).build());
         assertEquals(++startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
         assertEquals(startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.getCount());
 
         // get a slice from 95 to 105, 95->99 are in cache, we should not get a hit and then row cache is out of range
-        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
-                                                               CellNames.simpleDense(ByteBufferUtil.bytes(95)),
-                                                               CellNames.simpleDense(ByteBufferUtil.bytes(105)),
-                                                               false, 10, System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cachedStore, dk).fromIncl(String.valueOf(210)).toExcl(String.valueOf(215)).build());
         assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
         assertEquals(++startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.getCount());
 
         // get a slice with limit > 100, we should get a hit out of range.
-        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
-                                                               Composites.EMPTY,
-                                                               Composites.EMPTY,
-                                                               false, 101, System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cachedStore, dk).withLimit(101).build());
         assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
         assertEquals(++startRowCacheOutOfRange, cachedStore.metric.rowCacheHitOutOfRange.getCount());
 
@@ -327,19 +449,28 @@
         CacheService.instance.invalidateRowCache();
 
         // try to populate row cache with a limit > rows to cache, we should still populate row cache;
-        cachedStore.getColumnFamily(QueryFilter.getSliceFilter(dk, cf,
-                                                                Composites.EMPTY,
-                                                                Composites.EMPTY,
-                                                                false, 105, System.currentTimeMillis()));
+        Util.getAll(Util.cmd(cachedStore, dk).withLimit(105).build());
         assertEquals(startRowCacheHits, cachedStore.metric.rowCacheHit.getCount());
+
         // validate the stuff in cache;
-        ColumnFamily cachedCf = (ColumnFamily)CacheService.instance.rowCache.get(rck);
-        assertEquals(cachedCf.getColumnCount(), 100);
+        CachedPartition cachedCf = (CachedPartition)CacheService.instance.rowCache.get(rck);
+        assertEquals(cachedCf.rowCount(), 100);
         int i = 0;
-        for(Cell c : cachedCf)
+
+        for (Unfiltered unfiltered : Util.once(cachedCf.unfilteredIterator(ColumnFilter.selection(cachedCf.columns()), Slices.ALL, false)))
         {
-            assertEquals(c.name(), Util.cellname(i++));
+            Row r = (Row) unfiltered;
+
+            assertEquals(r.clustering().get(0), ByteBufferUtil.bytes(values[i].substring(3)));
+
+            for (ColumnData c : r)
+            {
+                assertEquals(((Cell)c).value(), ByteBufferUtil.bytes(values[i]));
+            }
+            i++;
         }
+
+        cachedStore.truncateBlocking();
     }
 
     @Test
@@ -368,10 +499,10 @@
         {
             DecoratedKey key = Util.dk("key" + i);
 
-            cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
+            Util.getAll(Util.cmd(cachedStore, key).build());
 
             long count_before = cachedStore.metric.sstablesPerReadHistogram.cf.getCount();
-            cachedStore.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 1, System.currentTimeMillis());
+            Util.getAll(Util.cmd(cachedStore, key).build());
 
             // check that SSTablePerReadHistogram has been updated by zero,
             // so count has been increased and in a 1/2 of requests there were zero read SSTables
@@ -402,7 +533,7 @@
 
         // insert data and fill the cache
         SchemaLoader.insertData(KEYSPACE_CACHED, CF_CACHED, offset, totalKeys);
-        SchemaLoader.readData(KEYSPACE_CACHED, CF_CACHED, offset, totalKeys);
+        readData(KEYSPACE_CACHED, CF_CACHED, offset, totalKeys);
         assertEquals(totalKeys, CacheService.instance.rowCache.size());
 
         // force the cache to disk
@@ -413,4 +544,17 @@
         assertEquals(0, CacheService.instance.rowCache.size());
         assertEquals(keysToSave == Integer.MAX_VALUE ? totalKeys : keysToSave, CacheService.instance.rowCache.loadSaved());
     }
+
+    private static void readData(String keyspace, String columnFamily, int offset, int numberOfRows)
+    {
+        ColumnFamilyStore store = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
+        CFMetaData cfm = Schema.instance.getCFMetaData(keyspace, columnFamily);
+
+        for (int i = offset; i < offset + numberOfRows; i++)
+        {
+            DecoratedKey key = Util.dk("key" + i);
+            Clustering cl = new Clustering(ByteBufferUtil.bytes("col" + i));
+            Util.getAll(Util.cmd(store, key).build());
+        }
+    }
 }

diff --git a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java b/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
index 30267d9..62c88a0 100644
--- a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
+++ b/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java

@@ -17,64 +17,197 @@
  */
 package org.apache.cassandra.db;
 
+import java.io.File;
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.util.Collections;
+import java.util.List;
 
-import junit.framework.Assert;
-import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
-import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.io.sstable.IndexHelper;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.io.util.SequentialWriter;
 import org.apache.cassandra.utils.FBUtilities;
+
+import org.junit.Assert;
 import org.junit.Test;
 
-public class RowIndexEntryTest extends SchemaLoader
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+
+public class RowIndexEntryTest extends CQLTester
 {
-    @Test
-    public void testSerializedSize() throws IOException
+    private static final List<AbstractType<?>> clusterTypes = Collections.<AbstractType<?>>singletonList(LongType.instance);
+    private static final ClusteringComparator comp = new ClusteringComparator(clusterTypes);
+    private static ClusteringPrefix cn(long l)
     {
-        final RowIndexEntry<IndexHelper.IndexInfo> simple = new RowIndexEntry<>(123);
+        return Util.clustering(comp, l);
+    }
+
+    @Test
+    public void testArtificialIndexOf() throws IOException
+    {
+        CFMetaData cfMeta = CFMetaData.compile("CREATE TABLE pipe.dev_null (pk bigint, ck bigint, val text, PRIMARY KEY(pk, ck))", "foo");
+
+        DeletionTime deletionInfo = new DeletionTime(FBUtilities.timestampMicros(), FBUtilities.nowInSeconds());
+
+        SerializationHeader header = new SerializationHeader(true, cfMeta, cfMeta.partitionColumns(), EncodingStats.NO_STATS);
+        IndexHelper.IndexInfo.Serializer indexSerializer = new IndexHelper.IndexInfo.Serializer(cfMeta, BigFormat.latestVersion, header);
+
+        DataOutputBuffer dob = new DataOutputBuffer();
+        dob.writeUnsignedVInt(0);
+        DeletionTime.serializer.serialize(DeletionTime.LIVE, dob);
+        dob.writeUnsignedVInt(3);
+        int off0 = dob.getLength();
+        indexSerializer.serialize(new IndexHelper.IndexInfo(cn(0L), cn(5L), 0, 0, deletionInfo), dob);
+        int off1 = dob.getLength();
+        indexSerializer.serialize(new IndexHelper.IndexInfo(cn(10L), cn(15L), 0, 0, deletionInfo), dob);
+        int off2 = dob.getLength();
+        indexSerializer.serialize(new IndexHelper.IndexInfo(cn(20L), cn(25L), 0, 0, deletionInfo), dob);
+        dob.writeInt(off0);
+        dob.writeInt(off1);
+        dob.writeInt(off2);
+
+        @SuppressWarnings("resource") DataOutputBuffer dobRie = new DataOutputBuffer();
+        dobRie.writeUnsignedVInt(42L);
+        dobRie.writeUnsignedVInt(dob.getLength());
+        dobRie.write(dob.buffer());
+
+        ByteBuffer buf = dobRie.buffer();
+
+        RowIndexEntry<IndexHelper.IndexInfo> rie = new RowIndexEntry.Serializer(cfMeta, BigFormat.latestVersion, header).deserialize(new DataInputBuffer(buf, false));
+
+        Assert.assertEquals(42L, rie.position);
+
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(-1L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(5L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(12L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(17L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, -1));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 0));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 1));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 2));
+        Assert.assertEquals(3, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, false, 3));
+
+        Assert.assertEquals(-1, IndexHelper.indexFor(cn(-1L), rie.columnsIndex(), comp, true, -1));
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(5L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(0, IndexHelper.indexFor(cn(5L), rie.columnsIndex(), comp, true, 2));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(17L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 4));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(12L), rie.columnsIndex(), comp, true, 3));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(12L), rie.columnsIndex(), comp, true, 2));
+        Assert.assertEquals(1, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 1));
+        Assert.assertEquals(2, IndexHelper.indexFor(cn(100L), rie.columnsIndex(), comp, true, 2));
+    }
+
+    @Test
+    public void testSerializedSize() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE %s (a int, b text, c int, PRIMARY KEY(a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+
+        final RowIndexEntry simple = new RowIndexEntry(123);
 
         DataOutputBuffer buffer = new DataOutputBuffer();
-        RowIndexEntry.Serializer serializer = new RowIndexEntry.Serializer(new IndexHelper.IndexInfo.Serializer(new SimpleDenseCellNameType(UTF8Type.instance)));
+        SerializationHeader header = new SerializationHeader(true, cfs.metadata, cfs.metadata.partitionColumns(), EncodingStats.NO_STATS);
+        RowIndexEntry.Serializer serializer = new RowIndexEntry.Serializer(cfs.metadata, BigFormat.latestVersion, header);
 
         serializer.serialize(simple, buffer);
 
-        Assert.assertEquals(buffer.getLength(), serializer.serializedSize(simple));
+        assertEquals(buffer.getLength(), serializer.serializedSize(simple));
+
+        // write enough rows to ensure we get a few column index entries
+        for (int i = 0; i <= DatabaseDescriptor.getColumnIndexSize() / 4; i++)
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, "" + i, i);
+
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build());
+
+        File tempFile = File.createTempFile("row_index_entry_test", null);
+        tempFile.deleteOnExit();
+        SequentialWriter writer = SequentialWriter.open(tempFile);
+        ColumnIndex columnIndex = ColumnIndex.writeAndBuildIndex(partition.unfilteredIterator(), writer, header, BigFormat.latestVersion);
+        RowIndexEntry<IndexHelper.IndexInfo> withIndex = RowIndexEntry.create(0xdeadbeef, DeletionTime.LIVE, columnIndex);
+        IndexHelper.IndexInfo.Serializer indexSerializer = new IndexHelper.IndexInfo.Serializer(cfs.metadata, BigFormat.latestVersion, header);
+
+        // sanity check
+        assertTrue(columnIndex.columnsIndex.size() >= 3);
 
         buffer = new DataOutputBuffer();
-        Schema.instance.setKeyspaceDefinition(KSMetaData.newKeyspace("Keyspace1",
-                                                                     SimpleStrategy.class,
-                                                                     Collections.<String,String>emptyMap(),
-                                                                     false,
-                                                                     Collections.singleton(standardCFMD("Keyspace1", "Standard1"))));
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create("Keyspace1", "Standard1");
-        ColumnIndex columnIndex = new ColumnIndex.Builder(cf, ByteBufferUtil.bytes("a"), new DataOutputBuffer())
-        {{
-            int idx = 0, size = 0;
-            Cell column;
-            do
-            {
-                column = new BufferCell(CellNames.simpleDense(ByteBufferUtil.bytes("c" + idx++)), ByteBufferUtil.bytes("v"), FBUtilities.timestampMicros());
-                size += column.serializedSize(new SimpleDenseCellNameType(UTF8Type.instance), TypeSizes.NATIVE);
-
-                add(column);
-            }
-            while (size < DatabaseDescriptor.getColumnIndexSize() * 3);
-            finishAddingAtoms();
-
-        }}.build();
-
-        RowIndexEntry<IndexHelper.IndexInfo> withIndex = RowIndexEntry.create(0xdeadbeef, DeletionTime.LIVE, columnIndex);
-
         serializer.serialize(withIndex, buffer);
-        Assert.assertEquals(buffer.getLength(), serializer.serializedSize(withIndex));
+        assertEquals(buffer.getLength(), serializer.serializedSize(withIndex));
+
+        // serialization check
+
+        ByteBuffer bb = buffer.buffer();
+        DataInputBuffer input = new DataInputBuffer(bb, false);
+        serializationCheck(withIndex, indexSerializer, bb, input);
+
+        // test with an output stream that doesn't support a file-pointer
+        buffer = new DataOutputBuffer()
+        {
+            public boolean hasPosition()
+            {
+                return false;
+            }
+
+            public long position()
+            {
+                throw new UnsupportedOperationException();
+            }
+        };
+        serializer.serialize(withIndex, buffer);
+        bb = buffer.buffer();
+        input = new DataInputBuffer(bb, false);
+        serializationCheck(withIndex, indexSerializer, bb, input);
+
+        //
+
+        bb = buffer.buffer();
+        input = new DataInputBuffer(bb, false);
+        RowIndexEntry.Serializer.skip(input, BigFormat.latestVersion);
+        Assert.assertEquals(0, bb.remaining());
+    }
+
+    private void serializationCheck(RowIndexEntry<IndexHelper.IndexInfo> withIndex, IndexHelper.IndexInfo.Serializer indexSerializer, ByteBuffer bb, DataInputBuffer input) throws IOException
+    {
+        Assert.assertEquals(0xdeadbeef, input.readUnsignedVInt());
+        Assert.assertEquals(withIndex.promotedSize(indexSerializer), input.readUnsignedVInt());
+
+        Assert.assertEquals(withIndex.headerLength(), input.readUnsignedVInt());
+        Assert.assertEquals(withIndex.deletionTime(), DeletionTime.serializer.deserialize(input));
+        Assert.assertEquals(withIndex.columnsIndex().size(), input.readUnsignedVInt());
+
+        int offset = bb.position();
+        int[] offsets = new int[withIndex.columnsIndex().size()];
+        for (int i = 0; i < withIndex.columnsIndex().size(); i++)
+        {
+            int pos = bb.position();
+            offsets[i] = pos - offset;
+            IndexHelper.IndexInfo info = indexSerializer.deserialize(input);
+            int end = bb.position();
+
+            Assert.assertEquals(indexSerializer.serializedSize(info), end - pos);
+
+            Assert.assertEquals(withIndex.columnsIndex().get(i).offset, info.offset);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).width, info.width);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).endOpenMarker, info.endOpenMarker);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).firstName, info.firstName);
+            Assert.assertEquals(withIndex.columnsIndex().get(i).lastName, info.lastName);
+        }
+
+        for (int i = 0; i < withIndex.columnsIndex().size(); i++)
+            Assert.assertEquals(offsets[i], input.readInt());
+
+        Assert.assertEquals(0, bb.remaining());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/RowIterationTest.java b/test/unit/org/apache/cassandra/db/RowIterationTest.java
index ee7bf1a..3b0293c 100644
--- a/test/unit/org/apache/cassandra/db/RowIterationTest.java
+++ b/test/unit/org/apache/cassandra/db/RowIterationTest.java

@@ -18,105 +18,65 @@
 */
 package org.apache.cassandra.db;
 
-import java.net.InetAddress;
-import java.nio.ByteBuffer;
-import java.util.Set;
-import java.util.HashSet;
-
-import org.apache.cassandra.Util;
-
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.Util;
+
 import static org.junit.Assert.assertEquals;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
 
 
-public class RowIterationTest
+public class RowIterationTest extends CQLTester
 {
-    public static final String KEYSPACE1 = "RowIterationTest";
-    public static final InetAddress LOCAL = FBUtilities.getBroadcastAddress();
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
+    @Test
+    public void testRowIteration() throws Throwable
     {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, "Standard3"),
-                                    SchemaLoader.superCFMD(KEYSPACE1, "Super3", LongType.instance));
+        String tableName = createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) USING TIMESTAMP ?", i, 0, i, i, (long)i);
+        cfs.forceBlockingFlush();
+        assertEquals(10, execute("SELECT * FROM %s").size());
     }
 
     @Test
-    public void testRowIteration()
+    public void testRowIterationDeletionTime() throws Throwable
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Super3");
+        String tableName = createTable("CREATE TABLE %s (a int PRIMARY KEY, b int)");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
-        final int ROWS_PER_SSTABLE = 10;
-        Set<DecoratedKey> inserted = new HashSet<DecoratedKey>();
-        for (int i = 0; i < ROWS_PER_SSTABLE; i++) {
-            DecoratedKey key = Util.dk(String.valueOf(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add("Super3", CellNames.compositeDense(ByteBufferUtil.bytes("sc"), ByteBufferUtil.bytes(String.valueOf(i))), ByteBuffer.wrap(new byte[ROWS_PER_SSTABLE * 10 - i * 2]), i);
-            rm.applyUnsafe();
-            inserted.add(key);
-        }
-        store.forceBlockingFlush();
-        assertEquals(inserted.toString(), inserted.size(), Util.getRangeSlice(store).size());
-    }
+        execute("INSERT INTO %s (a, b) VALUES (?, ?) USING TIMESTAMP ?", 0, 0, 0L);
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE a = ?", 0L, 0);
 
-    @Test
-    public void testRowIterationDeletionTime()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        String CF_NAME = "Standard3";
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_NAME);
-        DecoratedKey key = Util.dk("key");
-
-        // Delete row in first sstable
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.delete(CF_NAME, 0);
-        rm.add(CF_NAME, Util.cellname("c"), ByteBufferUtil.bytes("values"), 0L);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
+        cfs.forceBlockingFlush();
 
         // Delete row in second sstable with higher timestamp
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.delete(CF_NAME, 1);
-        rm.add(CF_NAME, Util.cellname("c"), ByteBufferUtil.bytes("values"), 1L);
-        DeletionInfo delInfo2 = rm.getColumnFamilies().iterator().next().deletionInfo();
-        assert delInfo2.getTopLevelDeletion().markedForDeleteAt == 1L;
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
+        execute("INSERT INTO %s (a, b) VALUES (?, ?) USING TIMESTAMP ?", 0, 0, 1L);
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE a = ?", 1L, 0);
 
-        ColumnFamily cf = Util.getRangeSlice(store).get(0).cf;
-        assert cf.deletionInfo().equals(delInfo2);
+        int localDeletionTime = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build()).partitionLevelDeletion().localDeletionTime();
+
+        cfs.forceBlockingFlush();
+
+        DeletionTime dt = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build()).partitionLevelDeletion();
+        assertEquals(1L, dt.markedForDeleteAt());
+        assertEquals(localDeletionTime, dt.localDeletionTime());
     }
 
     @Test
-    public void testRowIterationDeletion()
+    public void testRowIterationDeletion() throws Throwable
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        String CF_NAME = "Standard3";
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_NAME);
-        DecoratedKey key = Util.dk("key");
+        String tableName = createTable("CREATE TABLE %s (a int PRIMARY KEY, b int)");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
         // Delete a row in first sstable
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.delete(CF_NAME, 0);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE a = ?", 0L, 0);
+        cfs.forceBlockingFlush();
 
-        ColumnFamily cf = Util.getRangeSlice(store).get(0).cf;
-        assert cf != null;
+        assertFalse(Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build()).isEmpty());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/RowTest.java b/test/unit/org/apache/cassandra/db/RowTest.java
index 910f9e1..e3f4884 100644
--- a/test/unit/org/apache/cassandra/db/RowTest.java
+++ b/test/unit/org/apache/cassandra/db/RowTest.java

@@ -18,24 +18,31 @@
 */
 package org.apache.cassandra.db;
 
-import java.util.Arrays;
-import java.util.concurrent.TimeUnit;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
 
-import com.google.common.util.concurrent.Uninterruptibles;
+import com.google.common.collect.ImmutableList;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.CellNames;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
 
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.Util.tombstone;
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 public class RowTest
@@ -43,88 +50,169 @@
     private static final String KEYSPACE1 = "RowTest";
     private static final String CF_STANDARD1 = "Standard1";
 
+    private int nowInSeconds;
+    private DecoratedKey dk;
+    private ColumnFamilyStore cfs;
+    private CFMetaData cfm;
+
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        CFMetaData cfMetadata = CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD1)
+                                                  .addPartitionKey("key", BytesType.instance)
+                                                  .addClusteringColumn("col1", AsciiType.instance)
+                                                  .addRegularColumn("a", AsciiType.instance)
+                                                  .addRegularColumn("b", AsciiType.instance)
+                                                  .build();
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+                                    KeyspaceParams.simple(1),
+                                    cfMetadata);
+    }
+
+    @Before
+    public void setup()
+    {
+        nowInSeconds = FBUtilities.nowInSeconds();
+        dk = Util.dk("key0");
+        cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        cfm = cfs.metadata;
     }
 
     @Test
-    public void testDiffColumnFamily()
+    public void testMergeRangeTombstones() throws InterruptedException
     {
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.addColumn(column("one", "onev", 0));
+        PartitionUpdate update1 = new PartitionUpdate(cfm, dk, cfm.partitionColumns(), 1);
+        writeRangeTombstone(update1, "1", "11", 123, 123);
+        writeRangeTombstone(update1, "2", "22", 123, 123);
+        writeRangeTombstone(update1, "3", "31", 123, 123);
+        writeRangeTombstone(update1, "4", "41", 123, 123);
 
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        DeletionInfo delInfo = new DeletionInfo(0, 0);
-        cf2.delete(delInfo);
+        PartitionUpdate update2 = new PartitionUpdate(cfm, dk, cfm.partitionColumns(), 1);
+        writeRangeTombstone(update2, "1", "11", 123, 123);
+        writeRangeTombstone(update2, "111", "112", 1230, 123);
+        writeRangeTombstone(update2, "2", "24", 123, 123);
+        writeRangeTombstone(update2, "3", "31", 1230, 123);
+        writeRangeTombstone(update2, "4", "41", 123, 1230);
+        writeRangeTombstone(update2, "5", "51", 123, 1230);
 
-        ColumnFamily cfDiff = cf1.diff(cf2);
-        assertFalse(cfDiff.hasColumns());
-        assertEquals(cfDiff.deletionInfo(), delInfo);
+        try (UnfilteredRowIterator merged = UnfilteredRowIterators.merge(ImmutableList.of(update1.unfilteredIterator(), update2.unfilteredIterator()), nowInSeconds))
+        {
+            Object[][] expected = new Object[][]{ { "1", "11", 123l, 123 },
+                                                  { "111", "112", 1230l, 123 },
+                                                  { "2", "24", 123l, 123 },
+                                                  { "3", "31", 1230l, 123 },
+                                                  { "4", "41", 123l, 1230 },
+                                                  { "5", "51", 123l, 1230 } };
+            int i = 0;
+            while (merged.hasNext())
+            {
+                RangeTombstoneBoundMarker openMarker = (RangeTombstoneBoundMarker)merged.next();
+                Slice.Bound openBound = openMarker.clustering();
+                DeletionTime openDeletion = new DeletionTime(openMarker.deletionTime().markedForDeleteAt(),
+                                                                   openMarker.deletionTime().localDeletionTime());
 
-        RangeTombstone tombstone1 = tombstone("1", "11", (long) 123, 123);
-        RangeTombstone tombstone1_2 = tombstone("111", "112", (long) 1230, 123);
-        RangeTombstone tombstone2_1 = tombstone("2", "22", (long) 123, 123);
-        RangeTombstone tombstone2_2 = tombstone("2", "24", (long) 123, 123);
-        RangeTombstone tombstone3_1 = tombstone("3", "31", (long) 123, 123);
-        RangeTombstone tombstone3_2 = tombstone("3", "31", (long) 1230, 123);
-        RangeTombstone tombstone4_1 = tombstone("4", "41", (long) 123, 123);
-        RangeTombstone tombstone4_2 = tombstone("4", "41", (long) 123, 1230);
-        RangeTombstone tombstone5_2 = tombstone("5", "51", (long) 123, 1230);
-        cf1.delete(tombstone1);
-        cf1.delete(tombstone2_1);
-        cf1.delete(tombstone3_1);
-        cf1.delete(tombstone4_1);
+                RangeTombstoneBoundMarker closeMarker = (RangeTombstoneBoundMarker)merged.next();
+                Slice.Bound closeBound = closeMarker.clustering();
+                DeletionTime closeDeletion = new DeletionTime(closeMarker.deletionTime().markedForDeleteAt(),
+                                                                    closeMarker.deletionTime().localDeletionTime());
 
-        cf2.delete(tombstone1);
-        cf2.delete(tombstone1_2);
-        cf2.delete(tombstone2_2);
-        cf2.delete(tombstone3_2);
-        cf2.delete(tombstone4_2);
-        cf2.delete(tombstone5_2);
-
-        cfDiff = cf1.diff(cf2);
-        assertEquals(0, cfDiff.getColumnCount());
-
-        // only tmbstones which differ in superset or have more recent timestamp to be in diff
-        delInfo.add(tombstone1_2, cf1.getComparator());
-        delInfo.add(tombstone2_2, cf1.getComparator());
-        delInfo.add(tombstone3_2, cf1.getComparator());
-        delInfo.add(tombstone5_2, cf1.getComparator());
-
-        assertEquals(delInfo, cfDiff.deletionInfo());
+                assertEquals(openDeletion, closeDeletion);
+                assertRangeTombstoneMarkers(openBound, closeBound, openDeletion, expected[i++]);
+            }
+        }
     }
 
     @Test
     public void testResolve()
     {
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.addColumn(column("one", "A", 0));
+        ColumnDefinition defA = cfm.getColumnDefinition(new ColumnIdentifier("a", true));
+        ColumnDefinition defB = cfm.getColumnDefinition(new ColumnIdentifier("b", true));
 
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf2.addColumn(column("one", "B", 1));
-        cf2.addColumn(column("two", "C", 1));
+        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSeconds);
+        builder.newRow(cfm.comparator.make("c1"));
+        writeSimpleCellValue(builder, cfm, defA, "a1", 0);
+        writeSimpleCellValue(builder, cfm, defA, "a2", 1);
+        writeSimpleCellValue(builder, cfm, defB, "b1", 1);
+        Row row = builder.build();
 
-        cf1.addAll(cf2);
-        assert Arrays.equals(cf1.getColumn(CellNames.simpleDense(ByteBufferUtil.bytes("one"))).value().array(), "B".getBytes());
-        assert Arrays.equals(cf1.getColumn(CellNames.simpleDense(ByteBufferUtil.bytes("two"))).value().array(), "C".getBytes());
+        PartitionUpdate update = PartitionUpdate.singleRowUpdate(cfm, dk, row);
+
+        Unfiltered unfiltered = update.unfilteredIterator().next();
+        assertTrue(unfiltered.kind() == Unfiltered.Kind.ROW);
+        row = (Row) unfiltered;
+        assertEquals("a2", defA.cellValueType().getString(row.getCell(defA).value()));
+        assertEquals("b1", defB.cellValueType().getString(row.getCell(defB).value()));
+        assertEquals(2, row.columns().size());
     }
 
     @Test
-    public void testExpiringColumnExpiration()
+    public void testExpiringColumnExpiration() throws IOException
     {
-        Cell c = new BufferExpiringCell(CellNames.simpleDense(ByteBufferUtil.bytes("one")), ByteBufferUtil.bytes("A"), 0, 1);
-        assertTrue(c.isLive());
+        int ttl = 1;
+        ColumnDefinition def = cfm.getColumnDefinition(new ColumnIdentifier("a", true));
 
-        // Because we keep the local deletion time with a precision of a
-        // second, we could have to wait 2 seconds in worst case scenario.
-        Uninterruptibles.sleepUninterruptibly(2, TimeUnit.SECONDS);
+        Cell cell = BufferCell.expiring(def, 0, ttl, nowInSeconds, ((AbstractType) def.cellValueType()).decompose("a1"));
 
-        assert !c.isLive() && c.timestamp() == 0;
+        PartitionUpdate update = PartitionUpdate.singleRowUpdate(cfm, dk, BTreeRow.singleCellRow(cfm.comparator.make("c1"), cell));
+        new Mutation(update).applyUnsafe();
+
+        // when we read with a nowInSeconds before the cell has expired,
+        // the PartitionIterator includes the row we just wrote
+        Row row = Util.getOnlyRow(Util.cmd(cfs, dk).includeRow("c1").withNowInSeconds(nowInSeconds).build());
+        assertEquals("a1", ByteBufferUtil.string(row.getCell(def).value()));
+
+        // when we read with a nowInSeconds after the cell has expired, the row is filtered
+        // so the PartitionIterator is empty
+        Util.assertEmpty(Util.cmd(cfs, dk).includeRow("c1").withNowInSeconds(nowInSeconds + ttl + 1).build());
+    }
+
+    @Test
+    public void testHashCode()
+    {
+        ColumnDefinition defA = cfm.getColumnDefinition(new ColumnIdentifier("a", true));
+        ColumnDefinition defB = cfm.getColumnDefinition(new ColumnIdentifier("b", true));
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSeconds);
+        builder.newRow(cfm.comparator.make("c1"));
+        writeSimpleCellValue(builder, cfm, defA, "a1", 0);
+        writeSimpleCellValue(builder, cfm, defA, "a2", 1);
+        writeSimpleCellValue(builder, cfm, defB, "b1", 1);
+        Row row = builder.build();
+
+        Map<Row, Integer> map = new HashMap<>();
+        map.put(row, 1);
+        assertEquals(Integer.valueOf(1), map.get(row));
+    }
+
+    private void assertRangeTombstoneMarkers(Slice.Bound start, Slice.Bound end, DeletionTime deletionTime, Object[] expected)
+    {
+        AbstractType clusteringType = (AbstractType)cfm.comparator.subtype(0);
+
+        assertEquals(1, start.size());
+        assertEquals(start.kind(), Slice.Bound.Kind.INCL_START_BOUND);
+        assertEquals(expected[0], clusteringType.getString(start.get(0)));
+
+        assertEquals(1, end.size());
+        assertEquals(end.kind(), Slice.Bound.Kind.INCL_END_BOUND);
+        assertEquals(expected[1], clusteringType.getString(end.get(0)));
+
+        assertEquals(expected[2], deletionTime.markedForDeleteAt());
+        assertEquals(expected[3], deletionTime.localDeletionTime());
+    }
+
+    public void writeRangeTombstone(PartitionUpdate update, Object start, Object end, long markedForDeleteAt, int localDeletionTime)
+    {
+        ClusteringComparator comparator = cfs.getComparator();
+        update.add(new RangeTombstone(Slice.make(comparator.make(start), comparator.make(end)), new DeletionTime(markedForDeleteAt, localDeletionTime)));
+    }
+
+    private void writeSimpleCellValue(Row.Builder builder,
+                                      CFMetaData cfm,
+                                      ColumnDefinition columnDefinition,
+                                      String value,
+                                      long timestamp)
+    {
+       builder.addCell(BufferCell.live(cfm, columnDefinition, timestamp, ((AbstractType) columnDefinition.cellValueType()).decompose(value)));
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index 9b1ede4..e8a3285 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.db;
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,86 +15,77 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- *
  */
+package org.apache.cassandra.db;
 
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
+import java.io.*;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.*;
 import java.util.concurrent.ExecutionException;
 
-import java.io.File;
-import java.io.IOError;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.UUIDGen;
-
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.commons.lang3.StringUtils;
-
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
+import org.apache.cassandra.*;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.compaction.Scrubber;
-import org.apache.cassandra.db.index.SecondaryIndex;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
-import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableRewriter;
-import org.apache.cassandra.OrderedJUnit4ClassRunner;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.Util;
+import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-import static org.junit.Assert.*;
-import static org.junit.Assume.assumeTrue;
-
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.column;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class ScrubTest
 {
+    public static final String INVALID_LEGACY_SSTABLE_ROOT_PROP = "invalid-legacy-sstable-root";
+
     public static final String KEYSPACE = "Keyspace1";
     public static final String CF = "Standard1";
     public static final String CF2 = "Standard2";
     public static final String CF3 = "Standard3";
-    public static final String CFI1 = "StandardInteger1";
     public static final String COUNTER_CF = "Counter1";
     public static final String CF_UUID = "UUIDKeys";
     public static final String CF_INDEX1 = "Indexed1";
     public static final String CF_INDEX2 = "Indexed2";
+    public static final String CF_INDEX1_BYTEORDERED = "Indexed1_ordered";
+    public static final String CF_INDEX2_BYTEORDERED = "Indexed2_ordered";
 
-    public static final String COL_KEYS_INDEX = "birthdate";
-    public static final String COL_COMPOSITES_INDEX = "col1";
-    public static final String COL_NON_INDEX = "notanindexcol";
+    public static final String COL_INDEX = "birthdate";
+    public static final String COL_NON_INDEX = "notbirthdate";
 
     public static final Integer COMPRESSION_CHUNK_LENGTH = 4096;
 
@@ -105,18 +94,17 @@
     {
         SchemaLoader.loadSchema();
         SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE, CF),
                                     SchemaLoader.standardCFMD(KEYSPACE, CF2),
                                     SchemaLoader.standardCFMD(KEYSPACE, CF3),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CFI1),
-                                    SchemaLoader.standardCFMD(KEYSPACE, COUNTER_CF)
-                                                .defaultValidator(CounterColumnType.instance)
-                                                .compressionParameters(SchemaLoader.getCompressionParameters(COMPRESSION_CHUNK_LENGTH)),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF_UUID).keyValidator(UUIDType.instance),
-                                    SchemaLoader.indexCFMD(KEYSPACE, CF_INDEX1, true),
-                                    SchemaLoader.compositeIndexCFMD(KEYSPACE, CF_INDEX2, true));
+                                    SchemaLoader.counterCFMD(KEYSPACE, COUNTER_CF)
+                                                .compression(SchemaLoader.getCompressionParameters(COMPRESSION_CHUNK_LENGTH)),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF_UUID, 0, UUIDType.instance),
+                                    SchemaLoader.keysIndexCFMD(KEYSPACE, CF_INDEX1, true),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE, CF_INDEX2, true),
+                                    SchemaLoader.keysIndexCFMD(KEYSPACE, CF_INDEX1_BYTEORDERED, true).copy(ByteOrderedPartitioner.instance),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE, CF_INDEX2_BYTEORDERED, true).copy(ByteOrderedPartitioner.instance));
     }
 
     @Test
@@ -127,18 +115,14 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
         cfs.clearUnsafe();
 
-        List<Row> rows;
-
         // insert data and verify we get it back w/ range query
         fillCF(cfs, 1);
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(1, rows.size());
+        assertOrderedAll(cfs, 1);
 
-        CompactionManager.instance.performScrub(cfs, false, true, 2);
+        CompactionManager.instance.performScrub(cfs, false, true, false, 2);
 
         // check data is still there
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(1, rows.size());
+        assertOrderedAll(cfs, 1);
     }
 
     @Test
@@ -155,19 +139,18 @@
 
         fillCounterCF(cfs, numPartitions);
 
-        List<Row> rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), numPartitions*10);
-        assertEquals(numPartitions, rows.size());
+        assertOrderedAll(cfs, numPartitions);
 
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         //make sure to override at most 1 chunk when compression is enabled
         overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"));
 
         // with skipCorrupted == false, the scrub is expected to fail
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(Arrays.asList(sstable), OperationType.SCRUB);
-             Scrubber scrubber = new Scrubber(cfs, txn, false, true);)
+             Scrubber scrubber = new Scrubber(cfs, txn, false, true))
         {
             scrubber.scrub();
             fail("Expected a CorruptSSTableException to be thrown");
@@ -177,7 +160,7 @@
         // with skipCorrupted == true, the corrupt rows will be skipped
         Scrubber.ScrubResult scrubResult;
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(Arrays.asList(sstable), OperationType.SCRUB);
-             Scrubber scrubber = new Scrubber(cfs, txn, true, true);)
+             Scrubber scrubber = new Scrubber(cfs, txn, true, true))
         {
             scrubResult = scrubber.scrubWithResult();
         }
@@ -198,10 +181,9 @@
             assertEquals(1, scrubResult.badRows);
             assertEquals(numPartitions-1, scrubResult.goodRows);
         }
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(scrubResult.goodRows, rows.size());
+        assertOrderedAll(cfs, scrubResult.goodRows);
     }
 
     @Test
@@ -217,10 +199,9 @@
 
         fillCounterCF(cfs, 2);
 
-        List<Row> rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(2, rows.size());
+        assertOrderedAll(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         // overwrite one row with garbage
         overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"));
@@ -243,10 +224,9 @@
             scrubber.close();
         }
 
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
         // verify that we can read all of the rows, and there is now one less row
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(1, rows.size());
+        assertOrderedAll(cfs, 1);
     }
 
     @Test
@@ -260,21 +240,17 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
         cfs.clearUnsafe();
 
-        List<Row> rows;
-
         // insert data and verify we get it back w/ range query
         fillCF(cfs, 4);
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(4, rows.size());
+        assertOrderedAll(cfs, 4);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         overrideWithGarbage(sstable, 0, 2);
 
         CompactionManager.instance.performScrub(cfs, false, true, 2);
 
         // check data is still there
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(4, rows.size());
+        assertOrderedAll(cfs, 4);
     }
 
     @Test
@@ -293,24 +269,6 @@
     }
 
     @Test
-    public void testScrubDeletedRow() throws ExecutionException, InterruptedException
-    {
-        CompactionManager.instance.disableAutoCompaction();
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2);
-        cfs.clearUnsafe();
-
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, CF2);
-        cf.delete(new DeletionInfo(0, 1)); // expired tombstone
-        Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(1), cf);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        CompactionManager.instance.performScrub(cfs, false, true, 2);
-        assert cfs.getSSTables().isEmpty();
-    }
-
-    @Test
     public void testScrubMultiRow() throws ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
@@ -318,18 +276,14 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
         cfs.clearUnsafe();
 
-        List<Row> rows;
-
         // insert data and verify we get it back w/ range query
         fillCF(cfs, 10);
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(10, rows.size());
+        assertOrderedAll(cfs, 10);
 
         CompactionManager.instance.performScrub(cfs, false, true, 2);
 
         // check data is still there
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(10, rows.size());
+        assertOrderedAll(cfs, 10);
     }
 
     @Test
@@ -340,146 +294,96 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
         cfs.clearUnsafe();
 
-        List<Row> rows;
-
         // insert data and verify we get it back w/ range query
         fillCF(cfs, 10);
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(10, rows.size());
+        assertOrderedAll(cfs, 10);
 
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete();
 
         CompactionManager.instance.performScrub(cfs, false, true, 2);
 
         // check data is still there
-        rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assertEquals(10, rows.size());
+        assertOrderedAll(cfs, 10);
     }
 
     @Test
     public void testScrubOutOfOrder() throws Exception
     {
-        CompactionManager.instance.disableAutoCompaction();
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        String columnFamily = CF3;
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(columnFamily);
-        cfs.clearUnsafe();
+        // This test assumes ByteOrderPartitioner to create out-of-order SSTable
+        IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner();
+        DatabaseDescriptor.setPartitionerUnsafe(new ByteOrderedPartitioner());
 
-        /*
-         * Code used to generate an outOfOrder sstable. The test for out-of-order key in SSTableWriter must also be commented out.
-         * The test also assumes an ordered partitioner.
-         *
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        cf.addColumn(new BufferCell(ByteBufferUtil.bytes("someName"), ByteBufferUtil.bytes("someValue"), 0L));
-
-        SSTableWriter writer = new SSTableWriter(cfs.getTempSSTablePath(new File(System.getProperty("corrupt-sstable-root"))),
-                                                 cfs.metadata.getIndexInterval(),
-                                                 cfs.metadata,
-                                                 cfs.partitioner,
-                                                 SSTableMetadata.createCollector(BytesType.instance));
-        writer.append(Util.dk("a"), cf);
-        writer.append(Util.dk("b"), cf);
-        writer.append(Util.dk("z"), cf);
-        writer.append(Util.dk("c"), cf);
-        writer.append(Util.dk("y"), cf);
-        writer.append(Util.dk("d"), cf);
-        writer.finish();
-        */
-
-        String root = System.getProperty("corrupt-sstable-root");
-        assert root != null;
-
-        File rootDir = new File(root);
-        assert rootDir.isDirectory();
-        Descriptor desc = new Descriptor("jb", rootDir, KEYSPACE, columnFamily, 1, Descriptor.Type.FINAL, SSTableFormat.Type.LEGACY);
-        CFMetaData metadata = Schema.instance.getCFMetaData(desc.ksname, desc.cfname);
-
+        // Create out-of-order SSTable
+        File tempDir = File.createTempFile("ScrubTest.testScrubOutOfOrder", "").getParentFile();
+        // create ks/cf directory
+        File tempDataDir = new File(tempDir, String.join(File.separator, KEYSPACE, CF3));
+        tempDataDir.mkdirs();
         try
         {
-            SSTableReader.open(desc, metadata);
-            fail("SSTR validation should have caught the out-of-order rows");
-        }
-        catch (IllegalStateException ise) { /* this is expected */ }
+            CompactionManager.instance.disableAutoCompaction();
+            Keyspace keyspace = Keyspace.open(KEYSPACE);
+            String columnFamily = CF3;
+            ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(columnFamily);
+            cfs.clearUnsafe();
 
-        // open without validation for scrubbing
-        Set<Component> components = new HashSet<>();
-        components.add(Component.COMPRESSION_INFO);
-        components.add(Component.DATA);
-        components.add(Component.PRIMARY_INDEX);
-        components.add(Component.FILTER);
-        components.add(Component.STATS);
-        components.add(Component.SUMMARY);
-        components.add(Component.TOC);
+            List<String> keys = Arrays.asList("t", "a", "b", "z", "c", "y", "d");
+            String filename = cfs.getSSTablePath(tempDataDir);
+            Descriptor desc = Descriptor.fromFilename(filename);
 
-        SSTableReader sstable = SSTableReader.openNoValidation(desc, components, cfs);
-        if (sstable.last.compareTo(sstable.first) < 0)
-            sstable.last = sstable.first;
-
-        try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.SCRUB, sstable);
-             Scrubber scrubber = new Scrubber(cfs, txn, false, true);)
-        {
-            scrubber.scrub();
-        }
-        cfs.loadNewSSTables();
-        List<Row> rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
-        assert isRowOrdered(rows) : "Scrub failed: " + rows;
-        assert rows.size() == 6 : "Got " + rows.size();
-    }
-
-    @Test
-    public void testScrub10791() throws Exception
-    {
-        // Table is created by StreamingTransferTest.testTransferRangeTombstones with CASSANDRA-10791 fix disabled.
-        CompactionManager.instance.disableAutoCompaction();
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        String columnFamily = CFI1;
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(columnFamily);
-        cfs.clearUnsafe();
-
-        String root = System.getProperty("corrupt-sstable-root");
-        assert root != null;
-        File rootDir = new File(root);
-        assert rootDir.isDirectory();
-        Descriptor desc = new Descriptor("ka", rootDir, KEYSPACE, columnFamily, 2, Descriptor.Type.FINAL, SSTableFormat.Type.LEGACY);
-
-        // open without validation for scrubbing
-        Set<Component> components = new HashSet<>();
-        components.add(Component.DATA);
-        components.add(Component.PRIMARY_INDEX);
-        components.add(Component.FILTER);
-        components.add(Component.STATS);
-        components.add(Component.SUMMARY);
-        components.add(Component.TOC);
-        SSTableReader sstable = SSTableReader.openNoValidation(desc, components, cfs);
-
-        try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.SCRUB, sstable);
-             Scrubber scrubber = new Scrubber(cfs, txn, false, true);)
-        {
-            scrubber.scrub();
-        }
-
-        cfs.loadNewSSTables();
-        assertEquals(7, countCells(cfs));
-    }
-
-    private int countCells(ColumnFamilyStore cfs)
-    {
-        int cellCount = 0;
-        for (SSTableReader sstable : cfs.getSSTables())
-        {
-            Iterator<OnDiskAtomIterator> it = sstable.getScanner();
-            while (it.hasNext())
+            LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.WRITE);
+            try (SSTableTxnWriter writer = new SSTableTxnWriter(txn, createTestWriter(desc, (long) keys.size(), cfs.metadata, txn)))
             {
-                Iterator<OnDiskAtom> itr = it.next();
-                while (itr.hasNext())
+
+                for (String k : keys)
                 {
-                    ++cellCount;
-                    itr.next();
+                    PartitionUpdate update = UpdateBuilder.create(cfs.metadata, Util.dk(k))
+                                                          .newRow("someName").add("val", "someValue")
+                                                          .build();
+
+                    writer.append(update.unfilteredIterator());
                 }
+                writer.finish(false);
             }
+
+            try
+            {
+                SSTableReader.open(desc, cfs.metadata);
+                fail("SSTR validation should have caught the out-of-order rows");
+            }
+            catch (CorruptSSTableException ise)
+            { /* this is expected */ }
+
+            // open without validation for scrubbing
+            Set<Component> components = new HashSet<>();
+            if (new File(desc.filenameFor(Component.COMPRESSION_INFO)).exists())
+                components.add(Component.COMPRESSION_INFO);
+            components.add(Component.DATA);
+            components.add(Component.PRIMARY_INDEX);
+            components.add(Component.FILTER);
+            components.add(Component.STATS);
+            components.add(Component.SUMMARY);
+            components.add(Component.TOC);
+
+            SSTableReader sstable = SSTableReader.openNoValidation(desc, components, cfs);
+            if (sstable.last.compareTo(sstable.first) < 0)
+                sstable.last = sstable.first;
+
+            try (LifecycleTransaction scrubTxn = LifecycleTransaction.offline(OperationType.SCRUB, sstable);
+                 Scrubber scrubber = new Scrubber(cfs, scrubTxn, false, true))
+            {
+                scrubber.scrub();
+            }
+            LifecycleTransaction.waitForDeletions();
+            cfs.loadNewSSTables();
+            assertOrderedAll(cfs, 7);
         }
-        return cellCount;
+        finally
+        {
+            FileUtils.deleteRecursive(tempDataDir);
+            // reset partitioner
+            DatabaseDescriptor.setPartitionerUnsafe(oldPartitioner);
+        }
     }
 
     private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2) throws IOException
@@ -492,17 +396,19 @@
             CompressionMetadata compData = CompressionMetadata.create(sstable.getFilename());
 
             CompressionMetadata.Chunk chunk1 = compData.chunkFor(
-                    sstable.getPosition(RowPosition.ForKey.get(key1, sstable.partitioner), SSTableReader.Operator.EQ).position);
+                    sstable.getPosition(PartitionPosition.ForKey.get(key1, sstable.getPartitioner()), SSTableReader.Operator.EQ).position);
             CompressionMetadata.Chunk chunk2 = compData.chunkFor(
-                    sstable.getPosition(RowPosition.ForKey.get(key2, sstable.partitioner), SSTableReader.Operator.EQ).position);
+                    sstable.getPosition(PartitionPosition.ForKey.get(key2, sstable.getPartitioner()), SSTableReader.Operator.EQ).position);
 
             startPosition = Math.min(chunk1.offset, chunk2.offset);
             endPosition = Math.max(chunk1.offset + chunk1.length, chunk2.offset + chunk2.length);
+
+            compData.close();
         }
         else
         { // overwrite with garbage from key1 to key2
-            long row0Start = sstable.getPosition(RowPosition.ForKey.get(key1, sstable.partitioner), SSTableReader.Operator.EQ).position;
-            long row1Start = sstable.getPosition(RowPosition.ForKey.get(key2, sstable.partitioner), SSTableReader.Operator.EQ).position;
+            long row0Start = sstable.getPosition(PartitionPosition.ForKey.get(key1, sstable.getPartitioner()), SSTableReader.Operator.EQ).position;
+            long row1Start = sstable.getPosition(PartitionPosition.ForKey.get(key2, sstable.getPartitioner()), SSTableReader.Operator.EQ).position;
             startPosition = Math.min(row0Start, row1Start);
             endPosition = Math.max(row0Start, row1Start);
         }
@@ -518,68 +424,72 @@
         file.close();
     }
 
-    private static boolean isRowOrdered(List<Row> rows)
+    private static void assertOrderedAll(ColumnFamilyStore cfs, int expectedSize)
     {
-        DecoratedKey prev = null;
-        for (Row row : rows)
-        {
-            if (prev != null && prev.compareTo(row.key) > 0)
-                return false;
-            prev = row.key;
-        }
-        return true;
+        assertOrdered(Util.cmd(cfs).build(), expectedSize);
     }
 
-    protected void fillCF(ColumnFamilyStore cfs, int rowsPerSSTable)
+    private static void assertOrdered(ReadCommand cmd, int expectedSize)
     {
-        for (int i = 0; i < rowsPerSSTable; i++)
+        int size = 0;
+        DecoratedKey prev = null;
+        for (Partition partition : Util.getAllUnfiltered(cmd))
         {
-            String key = String.valueOf(i);
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, CF);
-            cf.addColumn(column("c1", "1", 1L));
-            cf.addColumn(column("c2", "2", 1L));
-            Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
-            rm.applyUnsafe();
+            DecoratedKey current = partition.partitionKey();
+            assertTrue("key " + current + " does not sort after previous key " + prev, prev == null || prev.compareTo(current) < 0);
+            prev = current;
+            ++size;
+        }
+        assertEquals(expectedSize, size);
+    }
+
+    protected void fillCF(ColumnFamilyStore cfs, int partitionsPerSSTable)
+    {
+        for (int i = 0; i < partitionsPerSSTable; i++)
+        {
+            PartitionUpdate update = UpdateBuilder.create(cfs.metadata, String.valueOf(i))
+                                                  .newRow("r1").add("val", "1")
+                                                  .newRow("r1").add("val", "1")
+                                                  .build();
+
+            new Mutation(update).applyUnsafe();
         }
 
         cfs.forceBlockingFlush();
     }
 
-    private void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long ... values)
+    public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long ... values)
     {
         assertTrue(values.length % 2 == 0);
         for (int i = 0; i < values.length; i +=2)
         {
-            String key = String.valueOf(i);
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, cfs.name);
+            UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, String.valueOf(i));
             if (composite)
             {
-                String clusterKey = "c" + key;
-                cf.addColumn(column(clusterKey, COL_COMPOSITES_INDEX, values[i], 1L));
-                cf.addColumn(column(clusterKey, COL_NON_INDEX, values[i + 1], 1L));
+                builder.newRow("c" + i)
+                       .add(COL_INDEX, values[i])
+                       .add(COL_NON_INDEX, values[i + 1]);
             }
             else
             {
-                cf.addColumn(column(COL_KEYS_INDEX, values[i], 1L));
-                cf.addColumn(column(COL_NON_INDEX, values[i + 1], 1L));
+                builder.newRow()
+                       .add(COL_INDEX, values[i])
+                       .add(COL_NON_INDEX, values[i + 1]);
             }
-            Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
-            rm.applyUnsafe();
+            new Mutation(builder.build()).applyUnsafe();
         }
 
         cfs.forceBlockingFlush();
     }
 
-    protected void fillCounterCF(ColumnFamilyStore cfs, int rowsPerSSTable) throws WriteTimeoutException
+    protected void fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException
     {
-        for (int i = 0; i < rowsPerSSTable; i++)
+        for (int i = 0; i < partitionsPerSSTable; i++)
         {
-            String key = String.valueOf(i);
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, COUNTER_CF);
-            Mutation rm = new Mutation(KEYSPACE, ByteBufferUtil.bytes(key), cf);
-            rm.addCounter(COUNTER_CF, cellname("Column1"), 100);
-            CounterMutation cm = new CounterMutation(rm, ConsistencyLevel.ONE);
-            cm.apply();
+            PartitionUpdate update = UpdateBuilder.create(cfs.metadata, String.valueOf(i))
+                                                  .newRow("r1").add("val", 100L)
+                                                  .build();
+            new CounterMutation(new Mutation(update), ConsistencyLevel.ONE).apply();
         }
 
         cfs.forceBlockingFlush();
@@ -599,35 +509,14 @@
 
         QueryProcessor.process("CREATE TABLE \"Keyspace1\".test_scrub_validation (a text primary key, b int)", ConsistencyLevel.ONE);
         ColumnFamilyStore cfs2 = keyspace.getColumnFamilyStore("test_scrub_validation");
-        Mutation mutation = new Mutation("Keyspace1", UTF8Type.instance.decompose("key"));
-        CellNameType ct = cfs2.getComparator();
-        mutation.add("test_scrub_validation", ct.makeCellName("b"), LongType.instance.decompose(1L), System.currentTimeMillis());
-        mutation.apply();
+
+        new Mutation(UpdateBuilder.create(cfs2.metadata, "key").newRow().add("b", Int32Type.instance.decompose(1)).build()).apply();
         cfs2.forceBlockingFlush();
 
         CompactionManager.instance.performScrub(cfs2, false, false, 2);
     }
 
     /**
-     * Tests CASSANDRA-6892 (key aliases being used improperly for validation)
-     */
-    @Test
-    public void testColumnNameEqualToDefaultKeyAlias() throws ExecutionException, InterruptedException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_UUID);
-
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, CF_UUID);
-        cf.addColumn(column(CFMetaData.DEFAULT_KEY_ALIAS, "not a uuid", 1L));
-        Mutation mutation = new Mutation(KEYSPACE, ByteBufferUtil.bytes(UUIDGen.getTimeUUID()), cf);
-        mutation.applyUnsafe();
-        cfs.forceBlockingFlush();
-        CompactionManager.instance.performScrub(cfs, false, true, 2);
-
-        assertEquals(1, cfs.getSSTables().size());
-    }
-
-    /**
      * For CASSANDRA-6892 too, check that for a compact table with one cluster column, we can insert whatever
      * we want as value for the clustering column, including something that would conflict with a CQL column definition.
      */
@@ -660,74 +549,43 @@
     {
         //If the partitioner preserves the order then SecondaryIndex uses BytesType comparator,
         // otherwise it uses LocalByPartitionerType
-        setKeyComparator(BytesType.instance);
-        testScrubIndex(CF_INDEX1, COL_KEYS_INDEX, false, true);
+        testScrubIndex(CF_INDEX1_BYTEORDERED, COL_INDEX, false, true);
     }
 
     @Test /* CASSANDRA-5174 */
     public void testScrubCompositeIndex_preserveOrder() throws IOException, ExecutionException, InterruptedException
     {
-        setKeyComparator(BytesType.instance);
-        testScrubIndex(CF_INDEX2, COL_COMPOSITES_INDEX, true, true);
+        testScrubIndex(CF_INDEX2_BYTEORDERED, COL_INDEX, true, true);
     }
 
     @Test /* CASSANDRA-5174 */
     public void testScrubKeysIndex() throws IOException, ExecutionException, InterruptedException
     {
-        setKeyComparator(new LocalByPartionerType(StorageService.getPartitioner()));
-        testScrubIndex(CF_INDEX1, COL_KEYS_INDEX, false, true);
+        testScrubIndex(CF_INDEX1, COL_INDEX, false, true);
     }
 
     @Test /* CASSANDRA-5174 */
     public void testScrubCompositeIndex() throws IOException, ExecutionException, InterruptedException
     {
-        setKeyComparator(new LocalByPartionerType(StorageService.getPartitioner()));
-        testScrubIndex(CF_INDEX2, COL_COMPOSITES_INDEX, true, true);
+        testScrubIndex(CF_INDEX2, COL_INDEX, true, true);
     }
 
     @Test /* CASSANDRA-5174 */
     public void testFailScrubKeysIndex() throws IOException, ExecutionException, InterruptedException
     {
-        testScrubIndex(CF_INDEX1, COL_KEYS_INDEX, false, false);
+        testScrubIndex(CF_INDEX1, COL_INDEX, false, false);
     }
 
     @Test /* CASSANDRA-5174 */
     public void testFailScrubCompositeIndex() throws IOException, ExecutionException, InterruptedException
     {
-        testScrubIndex(CF_INDEX2, COL_COMPOSITES_INDEX, true, false);
+        testScrubIndex(CF_INDEX2, COL_INDEX, true, false);
     }
 
     @Test /* CASSANDRA-5174 */
     public void testScrubTwice() throws IOException, ExecutionException, InterruptedException
     {
-        testScrubIndex(CF_INDEX1, COL_KEYS_INDEX, false, true, true);
-    }
-
-    /** The SecondaryIndex class is used for custom indexes so to avoid
-     * making a public final field into a private field with getters
-     * and setters, we resort to this hack in order to test it properly
-     * since it can have two values which influence the scrubbing behavior.
-     * @param comparator - the key comparator we want to test
-     */
-    private void setKeyComparator(AbstractType<?> comparator)
-    {
-        try
-        {
-            Field keyComparator = SecondaryIndex.class.getDeclaredField("keyComparator");
-            keyComparator.setAccessible(true);
-            int modifiers = keyComparator.getModifiers();
-            Field modifierField = keyComparator.getClass().getDeclaredField("modifiers");
-            modifiers = modifiers & ~Modifier.FINAL;
-            modifierField.setAccessible(true);
-            modifierField.setInt(keyComparator, modifiers);
-
-            keyComparator.set(null, comparator);
-        }
-        catch (Exception ex)
-        {
-            fail("Failed to change key comparator in secondary index : " + ex.getMessage());
-            ex.printStackTrace();
-        }
+        testScrubIndex(CF_INDEX1, COL_INDEX, false, true, true);
     }
 
     private void testScrubIndex(String cfName, String colName, boolean composite, boolean ... scrubs)
@@ -748,13 +606,11 @@
         fillIndexCF(cfs, composite, colValues);
 
         // check index
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes(colName), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<Row> rows = cfs.search(Util.range("", ""), Arrays.asList(expr), new IdentityQueryFilter(), numRows);
-        assertNotNull(rows);
-        assertEquals(numRows / 2, rows.size());
+
+        assertOrdered(Util.cmd(cfs).filterOn(colName, Operator.EQ, 1L).build(), numRows / 2);
 
         // scrub index
-        Set<ColumnFamilyStore> indexCfss = cfs.indexManager.getIndexesBackedByCfs();
+        Set<ColumnFamilyStore> indexCfss = cfs.indexManager.getAllIndexColumnFamilyStores();
         assertTrue(indexCfss.size() == 1);
         for(ColumnFamilyStore indexCfs : indexCfss)
         {
@@ -763,9 +619,9 @@
                 boolean failure = !scrubs[i];
                 if (failure)
                 { //make sure the next scrub fails
-                    overrideWithGarbage(indexCfs.getSSTables().iterator().next(), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(2L));
+                    overrideWithGarbage(indexCfs.getLiveSSTables().iterator().next(), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(2L));
                 }
-                CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, true, true, true, 0);
+                CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, false, true, false,0);
                 assertEquals(failure ?
                              CompactionManager.AllSSTableOpStatus.ABORTED :
                              CompactionManager.AllSSTableOpStatus.SUCCESSFUL,
@@ -775,8 +631,118 @@
 
 
         // check index is still working
-        rows = cfs.search(Util.range("", ""), Arrays.asList(expr), new IdentityQueryFilter(), numRows);
-        assertNotNull(rows);
-        assertEquals(numRows / 2, rows.size());
+        assertOrdered(Util.cmd(cfs).filterOn(colName, Operator.EQ, 1L).build(), numRows / 2);
+    }
+
+    private static SSTableMultiWriter createTestWriter(Descriptor descriptor, long keyCount, CFMetaData metadata, LifecycleNewTracker lifecycleNewTracker)
+    {
+        SerializationHeader header = new SerializationHeader(true, metadata, metadata.partitionColumns(), EncodingStats.NO_STATS);
+        MetadataCollector collector = new MetadataCollector(metadata.comparator).sstableLevel(0);
+        return new TestMultiWriter(new TestWriter(descriptor, keyCount, 0, metadata, collector, header, lifecycleNewTracker));
+    }
+
+    private static class TestMultiWriter extends SimpleSSTableMultiWriter
+    {
+        TestMultiWriter(SSTableWriter writer)
+        {
+            super(writer);
+        }
+    }
+
+    /**
+     * Test writer that allows to write out of order SSTable.
+     */
+    private static class TestWriter extends BigTableWriter
+    {
+        TestWriter(Descriptor descriptor, long keyCount, long repairedAt, CFMetaData metadata,
+                   MetadataCollector collector, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
+        {
+            super(descriptor, keyCount, repairedAt, metadata, collector, header, lifecycleNewTracker);
+        }
+
+        @Override
+        protected long beforeAppend(DecoratedKey decoratedKey)
+        {
+            return dataFile.position();
+        }
+    }
+
+    /**
+     * Tests with invalid sstables (containing duplicate entries in 2.0 and 3.0 storage format),
+     * that were caused by upgrading from 2.x with duplicate range tombstones.
+     *
+     * See CASSANDRA-12144 for details.
+     */
+    @Test
+    public void testFilterOutDuplicates() throws Exception
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        QueryProcessor.process(String.format("CREATE TABLE \"%s\".cf_with_duplicates_3_0 (a int, b int, c int, PRIMARY KEY (a, b))", KEYSPACE), ConsistencyLevel.ONE);
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("cf_with_duplicates_3_0");
+
+        Path legacySSTableRoot = Paths.get(System.getProperty(INVALID_LEGACY_SSTABLE_ROOT_PROP),
+                                           "Keyspace1",
+                                           "cf_with_duplicates_3_0");
+
+        for (String filename : new String[]{ "mb-3-big-CompressionInfo.db",
+                                             "mb-3-big-Digest.crc32",
+                                             "mb-3-big-Index.db",
+                                             "mb-3-big-Summary.db",
+                                             "mb-3-big-Data.db",
+                                             "mb-3-big-Filter.db",
+                                             "mb-3-big-Statistics.db",
+                                             "mb-3-big-TOC.txt" })
+        {
+            Files.copy(Paths.get(legacySSTableRoot.toString(), filename), cfs.getDirectories().getDirectoryForNewSSTables().toPath().resolve(filename));
+        }
+
+        cfs.loadNewSSTables();
+
+        cfs.scrub(true, true, false, false, false, 1);
+
+        UntypedResultSet rs = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".cf_with_duplicates_3_0", KEYSPACE));
+        assertEquals(1, rs.size());
+        QueryProcessor.executeInternal(String.format("DELETE FROM \"%s\".cf_with_duplicates_3_0 WHERE a=1 AND b =2", KEYSPACE));
+        rs = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".cf_with_duplicates_3_0", KEYSPACE));
+        assertEquals(0, rs.size());
+    }
+
+    @Test
+    public void testUpgradeSstablesWithDuplicates() throws Exception
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        String cf = "cf_with_duplicates_2_0";
+        QueryProcessor.process(String.format("CREATE TABLE \"%s\".%s (a int, b int, c int, PRIMARY KEY (a, b))", KEYSPACE, cf), ConsistencyLevel.ONE);
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cf);
+
+        Path legacySSTableRoot = Paths.get(System.getProperty(INVALID_LEGACY_SSTABLE_ROOT_PROP),
+                                           "Keyspace1",
+                                           cf);
+
+        for (String filename : new String[]{ "lb-1-big-CompressionInfo.db",
+                                             "lb-1-big-Data.db",
+                                             "lb-1-big-Digest.adler32",
+                                             "lb-1-big-Filter.db",
+                                             "lb-1-big-Index.db",
+                                             "lb-1-big-Statistics.db",
+                                             "lb-1-big-Summary.db",
+                                             "lb-1-big-TOC.txt" })
+        {
+            Files.copy(Paths.get(legacySSTableRoot.toString(), filename), cfs.getDirectories().getDirectoryForNewSSTables().toPath().resolve(filename));
+        }
+
+        cfs.loadNewSSTables();
+
+        cfs.sstablesRewrite(true, 1);
+
+        UntypedResultSet rs = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".%s", KEYSPACE, cf));
+        assertEquals(1, rs.size());
+        QueryProcessor.executeInternal(String.format("DELETE FROM \"%s\".%s WHERE a=1 AND b =2", KEYSPACE, cf));
+        rs = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".%s", KEYSPACE, cf));
+        assertEquals(0, rs.size());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
new file mode 100644
index 0000000..9fb0463
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java

@@ -0,0 +1,555 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.Util.throwAssert;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class SecondaryIndexTest
+{
+    public static final String KEYSPACE1 = "SecondaryIndexTest1";
+    public static final String WITH_COMPOSITE_INDEX = "WithCompositeIndex";
+    public static final String WITH_MULTIPLE_COMPOSITE_INDEX = "WithMultipleCompositeIndex";
+    public static final String WITH_KEYS_INDEX = "WithKeysIndex";
+    public static final String COMPOSITE_INDEX_TO_BE_ADDED = "CompositeIndexToBeAdded";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE1, WITH_COMPOSITE_INDEX, true).gcGraceSeconds(0),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE1, COMPOSITE_INDEX_TO_BE_ADDED, false).gcGraceSeconds(0),
+                                    SchemaLoader.compositeMultipleIndexCFMD(KEYSPACE1, WITH_MULTIPLE_COMPOSITE_INDEX).gcGraceSeconds(0),
+                                    SchemaLoader.keysIndexCFMD(KEYSPACE1, WITH_KEYS_INDEX, true).gcGraceSeconds(0));
+    }
+
+    @Before
+    public void truncateCFS()
+    {
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX).truncateBlocking();
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(COMPOSITE_INDEX_TO_BE_ADDED).truncateBlocking();
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_MULTIPLE_COMPOSITE_INDEX).truncateBlocking();
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_KEYS_INDEX).truncateBlocking();
+    }
+
+    @Test
+    public void testIndexScan()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+
+        new RowUpdateBuilder(cfs.metadata, 0, "k1").clustering("c").add("birthdate", 1L).add("notbirthdate", 1L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "k2").clustering("c").add("birthdate", 2L).add("notbirthdate", 2L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "k3").clustering("c").add("birthdate", 1L).add("notbirthdate", 2L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "k4").clustering("c").add("birthdate", 3L).add("notbirthdate", 2L).build().applyUnsafe();
+
+        // basic single-expression query
+        List<FilteredPartition> partitions = Util.getAll(Util.cmd(cfs).fromKeyExcl("k1").toKeyIncl("k3").columns("birthdate").build());
+        assertEquals(2, partitions.size());
+        Util.assertCellValue(2L, cfs, Util.row(partitions.get(0), "c"), "birthdate");
+        Util.assertCellValue(1L, cfs, Util.row(partitions.get(1), "c"), "birthdate");
+
+        // 2 columns, 3 results
+        partitions = Util.getAll(Util.cmd(cfs).fromKeyExcl("k1").toKeyIncl("k4aaa").build());
+        assertEquals(3, partitions.size());
+
+        Row first = Util.row(partitions.get(0), "c");
+        Util.assertCellValue(2L, cfs, first, "birthdate");
+        Util.assertCellValue(2L, cfs, first, "notbirthdate");
+
+        Row second = Util.row(partitions.get(1), "c");
+        Util.assertCellValue(1L, cfs, second, "birthdate");
+        Util.assertCellValue(2L, cfs, second, "notbirthdate");
+
+        Row third = Util.row(partitions.get(2), "c");
+        Util.assertCellValue(3L, cfs, third, "birthdate");
+        Util.assertCellValue(2L, cfs, third, "notbirthdate");
+
+        // Verify getIndexSearchers finds the data for our rc
+        ReadCommand rc = Util.cmd(cfs).fromKeyIncl("k1")
+                                      .toKeyIncl("k3")
+                                      .columns("birthdate")
+                                      .filterOn("birthdate", Operator.EQ, 1L)
+                                      .build();
+
+        Index.Searcher searcher = rc.getIndex(cfs).searcherFor(rc);
+        try (ReadOrderGroup orderGroup = rc.startOrderGroup(); UnfilteredPartitionIterator pi = searcher.search(orderGroup))
+        {
+            assertTrue(pi.hasNext());
+            pi.next().close();
+        }
+
+        // Verify gt on idx scan
+        partitions = Util.getAll(Util.cmd(cfs).fromKeyIncl("k1").toKeyIncl("k4aaa") .filterOn("birthdate", Operator.GT, 1L).build());
+        int rowCount = 0;
+        for (FilteredPartition partition : partitions)
+        {
+            for (Row row : partition)
+            {
+                ++rowCount;
+                assert ByteBufferUtil.toLong(Util.cell(cfs, row, "birthdate").value()) > 1L;
+            }
+        }
+        assertEquals(2, rowCount);
+
+        // Filter on non-indexed, LT comparison
+        Util.assertEmpty(Util.cmd(cfs).fromKeyExcl("k1").toKeyIncl("k4aaa")
+                                      .filterOn("notbirthdate", Operator.NEQ, 2L)
+                                      .build());
+
+        // Hit on primary, fail on non-indexed filter
+        Util.assertEmpty(Util.cmd(cfs).fromKeyExcl("k1").toKeyIncl("k4aaa")
+                                      .filterOn("birthdate", Operator.EQ, 1L)
+                                      .filterOn("notbirthdate", Operator.NEQ, 2L)
+                                      .build());
+    }
+
+    @Test
+    public void testLargeScan()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+        ByteBuffer bBB = ByteBufferUtil.bytes("birthdate");
+        ByteBuffer nbBB = ByteBufferUtil.bytes("notbirthdate");
+
+        for (int i = 0; i < 100; i++)
+        {
+            new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros(), "key" + i)
+                    .clustering("c")
+                    .add("birthdate", 34L)
+                    .add("notbirthdate", ByteBufferUtil.bytes((long) (i % 2)))
+                    .build()
+                    .applyUnsafe();
+        }
+
+        List<FilteredPartition> partitions = Util.getAll(Util.cmd(cfs)
+                                                             .filterOn("birthdate", Operator.EQ, 34L)
+                                                             .filterOn("notbirthdate", Operator.EQ, 1L)
+                                                             .build());
+
+        Set<DecoratedKey> keys = new HashSet<>();
+        int rowCount = 0;
+
+        for (FilteredPartition partition : partitions)
+        {
+            keys.add(partition.partitionKey());
+            rowCount += partition.rowCount();
+        }
+
+        // extra check that there are no duplicate results -- see https://issues.apache.org/jira/browse/CASSANDRA-2406
+        assertEquals(rowCount, keys.size());
+        assertEquals(50, rowCount);
+    }
+
+    @Test
+    public void testCompositeIndexDeletions() throws IOException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+        ByteBuffer bBB = ByteBufferUtil.bytes("birthdate");
+        ColumnDefinition bDef = cfs.metadata.getColumnDefinition(bBB);
+        ByteBuffer col = ByteBufferUtil.bytes("birthdate");
+
+        // Confirm addition works
+        new RowUpdateBuilder(cfs.metadata, 0, "k1").clustering("c").add("birthdate", 1L).build().applyUnsafe();
+        assertIndexedOne(cfs, col, 1L);
+
+        // delete the column directly
+        RowUpdateBuilder.deleteRow(cfs.metadata, 1, "k1", "c").applyUnsafe();
+        assertIndexedNone(cfs, col, 1L);
+
+        // verify that it's not being indexed under any other value either
+        ReadCommand rc = Util.cmd(cfs).build();
+        assertNull(rc.getIndex(cfs));
+
+        // resurrect w/ a newer timestamp
+        new RowUpdateBuilder(cfs.metadata, 2, "k1").clustering("c").add("birthdate", 1L).build().apply();;
+        assertIndexedOne(cfs, col, 1L);
+
+        // verify that row and delete w/ older timestamp does nothing
+        RowUpdateBuilder.deleteRow(cfs.metadata, 1, "k1", "c").applyUnsafe();
+        assertIndexedOne(cfs, col, 1L);
+
+        // similarly, column delete w/ older timestamp should do nothing
+        new RowUpdateBuilder(cfs.metadata, 1, "k1").clustering("c").delete(bDef).build().applyUnsafe();
+        assertIndexedOne(cfs, col, 1L);
+
+        // delete the entire row (w/ newer timestamp this time)
+        // todo - checking the # of index searchers for the command is probably not the best thing to test here
+        RowUpdateBuilder.deleteRow(cfs.metadata, 3, "k1", "c").applyUnsafe();
+        rc = Util.cmd(cfs).build();
+        assertNull(rc.getIndex(cfs));
+
+        // make sure obsolete mutations don't generate an index entry
+        // todo - checking the # of index searchers for the command is probably not the best thing to test here
+        new RowUpdateBuilder(cfs.metadata, 3, "k1").clustering("c").add("birthdate", 1L).build().apply();;
+        rc = Util.cmd(cfs).build();
+        assertNull(rc.getIndex(cfs));
+    }
+
+    @Test
+    public void testCompositeIndexUpdate() throws IOException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+        ByteBuffer col = ByteBufferUtil.bytes("birthdate");
+
+        // create a row and update the birthdate value, test that the index query fetches the new version
+        new RowUpdateBuilder(cfs.metadata, 1, "testIndexUpdate").clustering("c").add("birthdate", 100L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 2, "testIndexUpdate").clustering("c").add("birthdate", 200L).build().applyUnsafe();
+
+        // Confirm old version fetch fails
+        assertIndexedNone(cfs, col, 100L);
+
+        // Confirm new works
+        assertIndexedOne(cfs, col, 200L);
+
+        // update the birthdate value with an OLDER timestamp, and test that the index ignores this
+        assertIndexedNone(cfs, col, 300L);
+        assertIndexedOne(cfs, col, 200L);
+    }
+
+    @Test
+    public void testIndexUpdateOverwritingExpiringColumns() throws Exception
+    {
+        // see CASSANDRA-7268
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+        ByteBuffer col = ByteBufferUtil.bytes("birthdate");
+
+        // create a row and update the birthdate value with an expiring column
+        new RowUpdateBuilder(cfs.metadata, 1L, 500, "K100").clustering("c").add("birthdate", 100L).build().applyUnsafe();
+        assertIndexedOne(cfs, col, 100L);
+
+        // requires a 1s sleep because we calculate local expiry time as (now() / 1000) + ttl
+        TimeUnit.SECONDS.sleep(1);
+
+        // now overwrite with the same name/value/ttl, but the local expiry time will be different
+        new RowUpdateBuilder(cfs.metadata, 1L, 500, "K100").clustering("c").add("birthdate", 100L).build().applyUnsafe();
+        assertIndexedOne(cfs, col, 100L);
+
+        // check that modifying the indexed value using the same timestamp behaves as expected
+        new RowUpdateBuilder(cfs.metadata, 1L, 500, "K101").clustering("c").add("birthdate", 101L).build().applyUnsafe();
+        assertIndexedOne(cfs, col, 101L);
+
+        TimeUnit.SECONDS.sleep(1);
+
+        new RowUpdateBuilder(cfs.metadata, 1L, 500, "K101").clustering("c").add("birthdate", 102L).build().applyUnsafe();
+        // Confirm 101 is gone
+        assertIndexedNone(cfs, col, 101L);
+
+        // Confirm 102 is there
+        assertIndexedOne(cfs, col, 102L);
+    }
+
+    @Test
+    public void testDeleteOfInconsistentValuesInKeysIndex() throws Exception
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(WITH_KEYS_INDEX);
+
+        ByteBuffer col = ByteBufferUtil.bytes("birthdate");
+
+        // create a row and update the "birthdate" value
+        new RowUpdateBuilder(cfs.metadata, 1, "k1").noRowMarker().add("birthdate", 1L).build().applyUnsafe();
+
+        // force a flush, so our index isn't being read from a memtable
+        keyspace.getColumnFamilyStore(WITH_KEYS_INDEX).forceBlockingFlush();
+
+        // now apply another update, but force the index update to be skipped
+        keyspace.apply(new RowUpdateBuilder(cfs.metadata, 2, "k1").noRowMarker().add("birthdate", 2L).build(),
+                       true,
+                       false);
+
+        // Now searching the index for either the old or new value should return 0 rows
+        // because the new value was not indexed and the old value should be ignored
+        // (and in fact purged from the index cf).
+        // first check for the old value
+        assertIndexedNone(cfs, col, 1L);
+        assertIndexedNone(cfs, col, 2L);
+
+        // now, reset back to the original value, still skipping the index update, to
+        // make sure the value was expunged from the index when it was discovered to be inconsistent
+        keyspace.apply(new RowUpdateBuilder(cfs.metadata, 3, "k1").noRowMarker().add("birthdate", 1L).build(),
+                       true,
+                       false);
+        assertIndexedNone(cfs, col, 1L);
+        ColumnFamilyStore indexCfs = cfs.indexManager.getAllIndexColumnFamilyStores().iterator().next();
+        assertIndexCfsIsEmpty(indexCfs);
+    }
+
+    @Test
+    public void testDeleteOfInconsistentValuesFromCompositeIndex() throws Exception
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        String cfName = WITH_COMPOSITE_INDEX;
+
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
+
+        ByteBuffer col = ByteBufferUtil.bytes("birthdate");
+
+        // create a row and update the author value
+        new RowUpdateBuilder(cfs.metadata, 0, "k1").clustering("c").add("birthdate", 10l).build().applyUnsafe();
+
+        // test that the index query fetches this version
+        assertIndexedOne(cfs, col, 10l);
+
+        // force a flush and retry the query, so our index isn't being read from a memtable
+        keyspace.getColumnFamilyStore(cfName).forceBlockingFlush();
+        assertIndexedOne(cfs, col, 10l);
+
+        // now apply another update, but force the index update to be skipped
+        keyspace.apply(new RowUpdateBuilder(cfs.metadata, 1, "k1").clustering("c").add("birthdate", 20l).build(),
+                       true,
+                       false);
+
+        // Now searching the index for either the old or new value should return 0 rows
+        // because the new value was not indexed and the old value should be ignored
+        // (and in fact purged from the index cf).
+        // first check for the old value
+        assertIndexedNone(cfs, col, 10l);
+        assertIndexedNone(cfs, col, 20l);
+
+        // now, reset back to the original value, still skipping the index update, to
+        // make sure the value was expunged from the index when it was discovered to be inconsistent
+        // TODO: Figure out why this is re-inserting
+        keyspace.apply(new RowUpdateBuilder(cfs.metadata, 2, "k1").clustering("c1").add("birthdate", 10l).build(), true, false);
+        assertIndexedNone(cfs, col, 20l);
+
+        ColumnFamilyStore indexCfs = cfs.indexManager.getAllIndexColumnFamilyStores().iterator().next();
+        assertIndexCfsIsEmpty(indexCfs);
+    }
+
+    // See CASSANDRA-6098
+    @Test
+    public void testDeleteCompositeIndex() throws Exception
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+
+        ByteBuffer colName = ByteBufferUtil.bytes("birthdate");
+
+        // Insert indexed value.
+        new RowUpdateBuilder(cfs.metadata, 1, "k1").clustering("c").add("birthdate", 10l).build().applyUnsafe();
+
+        // Now delete the value
+        RowUpdateBuilder.deleteRow(cfs.metadata, 2, "k1", "c").applyUnsafe();
+
+        // We want the data to be gcable, but even if gcGrace == 0, we still need to wait 1 second
+        // since we won't gc on a tie.
+        try { Thread.sleep(1000); } catch (Exception e) {}
+
+        // Read the index and we check we do get no value (and no NPE)
+        // Note: the index will return the entry because it hasn't been deleted (we
+        // haven't read yet nor compacted) but the data read itself will return null
+        assertIndexedNone(cfs, colName, 10l);
+    }
+
+    @Test
+    public void testDeleteKeysIndex() throws Exception
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_KEYS_INDEX);
+
+        ByteBuffer colName = ByteBufferUtil.bytes("birthdate");
+
+        // Insert indexed value.
+        new RowUpdateBuilder(cfs.metadata, 1, "k1").add("birthdate", 10l).build().applyUnsafe();
+
+        // Now delete the value
+        RowUpdateBuilder.deleteRow(cfs.metadata, 2, "k1").applyUnsafe();
+
+        // We want the data to be gcable, but even if gcGrace == 0, we still need to wait 1 second
+        // since we won't gc on a tie.
+        try { Thread.sleep(1000); } catch (Exception e) {}
+
+        // Read the index and we check we do get no value (and no NPE)
+        // Note: the index will return the entry because it hasn't been deleted (we
+        // haven't read yet nor compacted) but the data read itself will return null
+        assertIndexedNone(cfs, colName, 10l);
+    }
+
+    // See CASSANDRA-2628
+    @Test
+    public void testIndexScanWithLimitOne()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_COMPOSITE_INDEX);
+        Mutation rm;
+
+        new RowUpdateBuilder(cfs.metadata, 0, "kk1").clustering("c").add("birthdate", 1L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk1").clustering("c").add("notbirthdate", 1L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk2").clustering("c").add("birthdate", 1L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk2").clustering("c").add("notbirthdate", 2L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk3").clustering("c").add("birthdate", 1L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk3").clustering("c").add("notbirthdate", 2L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk4").clustering("c").add("birthdate", 1L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "kk4").clustering("c").add("notbirthdate", 2L).build().applyUnsafe();
+
+        // basic single-expression query, limit 1
+        Util.getOnlyRow(Util.cmd(cfs)
+                            .filterOn("birthdate", Operator.EQ, 1L)
+                            .filterOn("notbirthdate", Operator.EQ, 1L)
+                            .withLimit(1)
+                            .build());
+    }
+
+    @Test
+    public void testIndexCreate() throws IOException, InterruptedException, ExecutionException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COMPOSITE_INDEX_TO_BE_ADDED);
+
+        // create a row and update the birthdate value, test that the index query fetches the new version
+        new RowUpdateBuilder(cfs.metadata, 0, "k1").clustering("c").add("birthdate", 1L).build().applyUnsafe();
+
+        String indexName = "birthdate_index";
+        ColumnDefinition old = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("birthdate"));
+        IndexMetadata indexDef =
+            IndexMetadata.fromIndexTargets(cfs.metadata,
+                                           Collections.singletonList(new IndexTarget(old.name, IndexTarget.Type.VALUES)),
+                                           indexName,
+                                           IndexMetadata.Kind.COMPOSITES,
+                                           Collections.EMPTY_MAP);
+        cfs.metadata.indexes(cfs.metadata.getIndexes().with(indexDef));
+        Future<?> future = cfs.indexManager.addIndex(indexDef);
+        future.get();
+
+        // we had a bug (CASSANDRA-2244) where index would get created but not flushed -- check for that
+        // the way we find the index cfs is a bit convoluted at the moment
+        boolean flushed = false;
+        ColumnFamilyStore indexCfs = cfs.indexManager.getIndex(indexDef)
+                                                     .getBackingTable()
+                                                     .orElseThrow(throwAssert("Index not found"));
+        flushed = !indexCfs.getLiveSSTables().isEmpty();
+        assertTrue(flushed);
+        assertIndexedOne(cfs, ByteBufferUtil.bytes("birthdate"), 1L);
+
+        // validate that drop clears it out & rebuild works (CASSANDRA-2320)
+        assertTrue(cfs.getBuiltIndexes().contains(indexName));
+        cfs.indexManager.removeIndex(indexDef.name);
+        assertFalse(cfs.getBuiltIndexes().contains(indexName));
+
+        // rebuild & re-query
+        future = cfs.indexManager.addIndex(indexDef);
+        future.get();
+        assertIndexedOne(cfs, ByteBufferUtil.bytes("birthdate"), 1L);
+    }
+
+    @Test
+    public void testKeysSearcherSimple() throws Exception
+    {
+        //  Create secondary index and flush to disk
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(WITH_KEYS_INDEX);
+
+        for (int i = 0; i < 10; i++)
+            new RowUpdateBuilder(cfs.metadata, 0, "k" + i).noRowMarker().add("birthdate", 1l).build().applyUnsafe();
+
+        assertIndexedCount(cfs, ByteBufferUtil.bytes("birthdate"), 1l, 10);
+        cfs.forceBlockingFlush();
+        assertIndexedCount(cfs, ByteBufferUtil.bytes("birthdate"), 1l, 10);
+    }
+
+    @Test
+    public void testSelectivityWithMultipleIndexes()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(WITH_MULTIPLE_COMPOSITE_INDEX);
+
+        // creates rows such that birthday_index has 1 partition (key = 1L) with 4 rows -- mean row count = 4, and notbirthdate_index has 2 partitions with 2 rows each -- mean row count = 2
+        new RowUpdateBuilder(cfs.metadata, 0, "k1").clustering("c").add("birthdate", 1L).add("notbirthdate", 2L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "k2").clustering("c").add("birthdate", 1L).add("notbirthdate", 2L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "k3").clustering("c").add("birthdate", 1L).add("notbirthdate", 3L).build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 0, "k4").clustering("c").add("birthdate", 1L).add("notbirthdate", 3L).build().applyUnsafe();
+
+        cfs.forceBlockingFlush();
+        ReadCommand rc = Util.cmd(cfs)
+                             .fromKeyIncl("k1")
+                             .toKeyIncl("k3")
+                             .columns("birthdate")
+                             .filterOn("birthdate", Operator.EQ, 1L)
+                             .filterOn("notbirthdate", Operator.EQ, 0L)
+                             .build();
+
+        assertEquals("notbirthdate_key_index", rc.indexMetadata().name);
+    }
+
+    private void assertIndexedNone(ColumnFamilyStore cfs, ByteBuffer col, Object val)
+    {
+        assertIndexedCount(cfs, col, val, 0);
+    }
+    private void assertIndexedOne(ColumnFamilyStore cfs, ByteBuffer col, Object val)
+    {
+        assertIndexedCount(cfs, col, val, 1);
+    }
+    private void assertIndexedCount(ColumnFamilyStore cfs, ByteBuffer col, Object val, int count)
+    {
+        ColumnDefinition cdef = cfs.metadata.getColumnDefinition(col);
+
+        ReadCommand rc = Util.cmd(cfs).filterOn(cdef.name.toString(), Operator.EQ, ((AbstractType) cdef.cellValueType()).decompose(val)).build();
+        Index.Searcher searcher = rc.getIndex(cfs).searcherFor(rc);
+        if (count != 0)
+            assertNotNull(searcher);
+
+        try (ReadOrderGroup orderGroup = rc.startOrderGroup();
+             PartitionIterator iter = UnfilteredPartitionIterators.filter(searcher.search(orderGroup),
+                                                                          FBUtilities.nowInSeconds()))
+        {
+            assertEquals(count, Util.size(iter));
+        }
+    }
+
+    private void assertIndexCfsIsEmpty(ColumnFamilyStore indexCfs)
+    {
+        PartitionRangeReadCommand command = (PartitionRangeReadCommand)Util.cmd(indexCfs).build();
+        try (ReadOrderGroup orderGroup = command.startOrderGroup();
+             PartitionIterator iter = UnfilteredPartitionIterators.filter(Util.executeLocally(command, indexCfs, orderGroup),
+                                                                          FBUtilities.nowInSeconds()))
+        {
+            assertFalse(iter.hasNext());
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java b/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java
new file mode 100644
index 0000000..3e9f3bc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java

@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import com.google.common.io.Files;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
+import org.apache.cassandra.io.util.FileUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+
+public class SerializationHeaderTest
+{
+    private static String KEYSPACE = "SerializationHeaderTest";
+
+    @Test
+    public void testWrittenAsDifferentKind() throws Exception
+    {
+        final String tableName = "testWrittenAsDifferentKind";
+        final String schemaCqlWithStatic = String.format("CREATE TABLE %s (k int, c int, v int static, PRIMARY KEY(k, c))", tableName);
+        final String schemaCqlWithRegular = String.format("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY(k, c))", tableName);
+        ColumnIdentifier v = ColumnIdentifier.getInterned("v", false);
+        CFMetaData schemaWithStatic = CFMetaData.compile(schemaCqlWithStatic, KEYSPACE);
+        CFMetaData schemaWithRegular = CFMetaData.compile(schemaCqlWithRegular, KEYSPACE);
+        ColumnDefinition columnStatic = schemaWithStatic.getColumnDefinition(v);
+        ColumnDefinition columnRegular = schemaWithRegular.getColumnDefinition(v);
+        schemaWithStatic.recordColumnDrop(columnRegular, 0L);
+        schemaWithRegular.recordColumnDrop(columnStatic, 0L);
+
+        final AtomicInteger generation = new AtomicInteger();
+        File dir = Files.createTempDir();
+        try
+        {
+            BiFunction<CFMetaData, Function<ByteBuffer, Clustering>, Callable<Descriptor>> writer = (schema, clusteringFunction) -> () -> {
+                Descriptor descriptor = new Descriptor(BigFormat.latestVersion, dir, schema.ksName, schema.cfName, generation.incrementAndGet(), SSTableFormat.Type.BIG, Component.DIGEST_CRC32);
+
+                SerializationHeader header = SerializationHeader.makeWithoutStats(schema);
+                try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.WRITE);
+                     SSTableWriter sstableWriter = BigTableWriter.create(schema, descriptor, 1, 0L, 0, header, txn))
+                {
+                    ColumnDefinition cd = schema.getColumnDefinition(v);
+                    for (int i = 0 ; i < 5 ; ++i) {
+                        final ByteBuffer value = Int32Type.instance.decompose(i);
+                        Cell cell = BufferCell.live(schema, cd, 1L, value);
+                        Clustering clustering = clusteringFunction.apply(value);
+                        Row row = BTreeRow.singleCellRow(clustering, cell);
+                        sstableWriter.append(PartitionUpdate.singleRowUpdate(schema, value, row).unfilteredIterator());
+                    }
+                    sstableWriter.finish(false);
+                    txn.finish();
+                }
+                return descriptor;
+            };
+
+            Descriptor sstableWithRegular = writer.apply(schemaWithRegular, Clustering::new).call();
+            Descriptor sstableWithStatic = writer.apply(schemaWithStatic, value -> Clustering.STATIC_CLUSTERING).call();
+            SSTableReader readerWithStatic = SSTableReader.openNoValidation(sstableWithStatic, schemaWithRegular);
+            SSTableReader readerWithRegular = SSTableReader.openNoValidation(sstableWithRegular, schemaWithStatic);
+
+            try (ISSTableScanner partitions = readerWithStatic.getScanner()) {
+                for (int i = 0 ; i < 5 ; ++i)
+                {
+                    UnfilteredRowIterator partition = partitions.next();
+                    Assert.assertFalse(partition.hasNext());
+                    long value = Int32Type.instance.compose(partition.staticRow().getCell(columnStatic).value());
+                    Assert.assertEquals(value, (long)i);
+                }
+                Assert.assertFalse(partitions.hasNext());
+            }
+            try (ISSTableScanner partitions = readerWithRegular.getScanner()) {
+                for (int i = 0 ; i < 5 ; ++i)
+                {
+                    UnfilteredRowIterator partition = partitions.next();
+                    long value = Int32Type.instance.compose(((Row)partition.next()).getCell(columnRegular).value());
+                    Assert.assertEquals(value, (long)i);
+                    Assert.assertTrue(partition.staticRow().isEmpty());
+                    Assert.assertFalse(partition.hasNext());
+                }
+                Assert.assertFalse(partitions.hasNext());
+            }
+        }
+        finally
+        {
+            FileUtils.deleteRecursive(dir);
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/db/SerializationsTest.java b/test/unit/org/apache/cassandra/db/SerializationsTest.java
deleted file mode 100644
index a280448..0000000
--- a/test/unit/org/apache/cassandra/db/SerializationsTest.java
+++ /dev/null

@@ -1,405 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.db;
-
-import org.apache.cassandra.AbstractSerializationsTester;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.util.DataOutputStreamPlus;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.net.CallbackInfo;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.net.MessageOut;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import java.io.DataInputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.*;
-
-public class SerializationsTest extends AbstractSerializationsTester
-{
-    Statics statics = new Statics();
-
-    private static final String KEYSPACE1 = "Keyspace1";
-    private ByteBuffer startCol = ByteBufferUtil.bytes("Start");
-    private ByteBuffer stopCol = ByteBufferUtil.bytes("Stop");
-    private Composite emptyCol = Composites.EMPTY;
-    public NamesQueryFilter namesPred = new NamesQueryFilter(statics.NamedCols);
-    public NamesQueryFilter namesSCPred = new NamesQueryFilter(statics.NamedSCCols);
-    public SliceQueryFilter emptyRangePred = new SliceQueryFilter(emptyCol, emptyCol, false, 100);
-    public SliceQueryFilter nonEmptyRangePred = new SliceQueryFilter(CellNames.simpleDense(startCol), CellNames.simpleDense(stopCol), true, 100);
-    public SliceQueryFilter nonEmptyRangeSCPred = new SliceQueryFilter(CellNames.compositeDense(statics.SC, startCol), CellNames.compositeDense(statics.SC, stopCol), true, 100);
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, "Standard1"),
-                                    SchemaLoader.superCFMD(KEYSPACE1, "Super1", LongType.instance));
-    }
-
-    private void testRangeSliceCommandWrite() throws IOException
-    {
-        IPartitioner part = StorageService.getPartitioner();
-        AbstractBounds<RowPosition> bounds = Range.makeRowRange(part.getRandomToken(), part.getRandomToken());
-
-        RangeSliceCommand namesCmd = new RangeSliceCommand(statics.KS, "Standard1", statics.readTs, namesPred, bounds, 100);
-        MessageOut<RangeSliceCommand> namesCmdMsg = namesCmd.createMessage();
-        RangeSliceCommand emptyRangeCmd = new RangeSliceCommand(statics.KS, "Standard1", statics.readTs, emptyRangePred, bounds, 100);
-        MessageOut<RangeSliceCommand> emptyRangeCmdMsg = emptyRangeCmd.createMessage();
-        RangeSliceCommand regRangeCmd = new RangeSliceCommand(statics.KS, "Standard1", statics.readTs, nonEmptyRangePred, bounds, 100);
-        MessageOut<RangeSliceCommand> regRangeCmdMsg = regRangeCmd.createMessage();
-        RangeSliceCommand namesCmdSup = new RangeSliceCommand(statics.KS, "Super1", statics.readTs, namesSCPred, bounds, 100);
-        MessageOut<RangeSliceCommand> namesCmdSupMsg = namesCmdSup.createMessage();
-        RangeSliceCommand emptyRangeCmdSup = new RangeSliceCommand(statics.KS, "Super1", statics.readTs, emptyRangePred, bounds, 100);
-        MessageOut<RangeSliceCommand> emptyRangeCmdSupMsg = emptyRangeCmdSup.createMessage();
-        RangeSliceCommand regRangeCmdSup = new RangeSliceCommand(statics.KS, "Super1", statics.readTs, nonEmptyRangeSCPred, bounds, 100);
-        MessageOut<RangeSliceCommand> regRangeCmdSupMsg = regRangeCmdSup.createMessage();
-
-        DataOutputStreamPlus out = getOutput("db.RangeSliceCommand.bin");
-        namesCmdMsg.serialize(out, getVersion());
-        emptyRangeCmdMsg.serialize(out, getVersion());
-        regRangeCmdMsg.serialize(out, getVersion());
-        namesCmdSupMsg.serialize(out, getVersion());
-        emptyRangeCmdSupMsg.serialize(out, getVersion());
-        regRangeCmdSupMsg.serialize(out, getVersion());
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(namesCmd, RangeSliceCommand.serializer);
-        testSerializedSize(emptyRangeCmd, RangeSliceCommand.serializer);
-        testSerializedSize(regRangeCmd, RangeSliceCommand.serializer);
-        testSerializedSize(namesCmdSup, RangeSliceCommand.serializer);
-        testSerializedSize(emptyRangeCmdSup, RangeSliceCommand.serializer);
-        testSerializedSize(regRangeCmdSup, RangeSliceCommand.serializer);
-    }
-
-    @Test
-    public void testRangeSliceCommandRead() throws IOException
-    {
-        if (EXECUTE_WRITES)
-            testRangeSliceCommandWrite();
-
-        DataInputStream in = getInput("db.RangeSliceCommand.bin");
-        for (int i = 0; i < 6; i++)
-            MessageIn.read(in, getVersion(), -1);
-        in.close();
-    }
-
-    private void testSliceByNamesReadCommandWrite() throws IOException
-    {
-        SliceByNamesReadCommand standardCmd = new SliceByNamesReadCommand(statics.KS, statics.Key, statics.StandardCF, statics.readTs, namesPred);
-        SliceByNamesReadCommand superCmd = new SliceByNamesReadCommand(statics.KS, statics.Key, statics.SuperCF, statics.readTs, namesSCPred);
-
-        DataOutputStreamPlus out = getOutput("db.SliceByNamesReadCommand.bin");
-        SliceByNamesReadCommand.serializer.serialize(standardCmd, out, getVersion());
-        SliceByNamesReadCommand.serializer.serialize(superCmd, out, getVersion());
-        ReadCommand.serializer.serialize(standardCmd, out, getVersion());
-        ReadCommand.serializer.serialize(superCmd, out, getVersion());
-        standardCmd.createMessage().serialize(out, getVersion());
-        superCmd.createMessage().serialize(out, getVersion());
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(standardCmd, SliceByNamesReadCommand.serializer);
-        testSerializedSize(superCmd, SliceByNamesReadCommand.serializer);
-    }
-
-    @Test
-    public void testSliceByNamesReadCommandRead() throws IOException
-    {
-        if (EXECUTE_WRITES)
-            testSliceByNamesReadCommandWrite();
-
-        DataInputStream in = getInput("db.SliceByNamesReadCommand.bin");
-        assert SliceByNamesReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert SliceByNamesReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert ReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert ReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        in.close();
-    }
-
-    private void testSliceFromReadCommandWrite() throws IOException
-    {
-        SliceFromReadCommand standardCmd = new SliceFromReadCommand(statics.KS, statics.Key, statics.StandardCF, statics.readTs, nonEmptyRangePred);
-        SliceFromReadCommand superCmd = new SliceFromReadCommand(statics.KS, statics.Key, statics.SuperCF, statics.readTs, nonEmptyRangeSCPred);
-
-        DataOutputStreamPlus out = getOutput("db.SliceFromReadCommand.bin");
-        SliceFromReadCommand.serializer.serialize(standardCmd, out, getVersion());
-        SliceFromReadCommand.serializer.serialize(superCmd, out, getVersion());
-        ReadCommand.serializer.serialize(standardCmd, out, getVersion());
-        ReadCommand.serializer.serialize(superCmd, out, getVersion());
-        standardCmd.createMessage().serialize(out, getVersion());
-        superCmd.createMessage().serialize(out, getVersion());
-
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(standardCmd, SliceFromReadCommand.serializer);
-        testSerializedSize(superCmd, SliceFromReadCommand.serializer);
-    }
-
-    @Test
-    public void testSliceFromReadCommandRead() throws IOException
-    {
-        if (EXECUTE_WRITES)
-            testSliceFromReadCommandWrite();
-
-        DataInputStream in = getInput("db.SliceFromReadCommand.bin");
-        assert SliceFromReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert SliceFromReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert ReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert ReadCommand.serializer.deserialize(in, getVersion()) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        in.close();
-    }
-
-    private void testRowWrite() throws IOException
-    {
-        DataOutputStreamPlus out = getOutput("db.Row.bin");
-        Row.serializer.serialize(statics.StandardRow, out, getVersion());
-        Row.serializer.serialize(statics.SuperRow, out, getVersion());
-        Row.serializer.serialize(statics.NullRow, out, getVersion());
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(statics.StandardRow, Row.serializer);
-        testSerializedSize(statics.SuperRow, Row.serializer);
-        testSerializedSize(statics.NullRow, Row.serializer);
-    }
-
-    @Test
-    public void testRowRead() throws IOException
-    {
-        // Since every table creation generates different CF ID,
-        // we need to generate file every time
-        testRowWrite();
-
-        DataInputStream in = getInput("db.Row.bin");
-        assert Row.serializer.deserialize(in, getVersion()) != null;
-        assert Row.serializer.deserialize(in, getVersion()) != null;
-        assert Row.serializer.deserialize(in, getVersion()) != null;
-        in.close();
-    }
-
-    private void testMutationWrite() throws IOException
-    {
-        Mutation standardRowRm = new Mutation(statics.KS, statics.StandardRow);
-        Mutation superRowRm = new Mutation(statics.KS, statics.SuperRow);
-        Mutation standardRm = new Mutation(statics.KS, statics.Key, statics.StandardCf);
-        Mutation superRm = new Mutation(statics.KS, statics.Key, statics.SuperCf);
-        Map<UUID, ColumnFamily> mods = new HashMap<UUID, ColumnFamily>();
-        mods.put(statics.StandardCf.metadata().cfId, statics.StandardCf);
-        mods.put(statics.SuperCf.metadata().cfId, statics.SuperCf);
-        Mutation mixedRm = new Mutation(statics.KS, statics.Key, mods);
-
-        DataOutputStreamPlus out = getOutput("db.RowMutation.bin");
-        Mutation.serializer.serialize(standardRowRm, out, getVersion());
-        Mutation.serializer.serialize(superRowRm, out, getVersion());
-        Mutation.serializer.serialize(standardRm, out, getVersion());
-        Mutation.serializer.serialize(superRm, out, getVersion());
-        Mutation.serializer.serialize(mixedRm, out, getVersion());
-
-        standardRowRm.createMessage().serialize(out, getVersion());
-        superRowRm.createMessage().serialize(out, getVersion());
-        standardRm.createMessage().serialize(out, getVersion());
-        superRm.createMessage().serialize(out, getVersion());
-        mixedRm.createMessage().serialize(out, getVersion());
-
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(standardRowRm, Mutation.serializer);
-        testSerializedSize(superRowRm, Mutation.serializer);
-        testSerializedSize(standardRm, Mutation.serializer);
-        testSerializedSize(superRm, Mutation.serializer);
-        testSerializedSize(mixedRm, Mutation.serializer);
-    }
-
-    @Test
-    public void testMutationRead() throws IOException
-    {
-        // mutation deserialization requires being able to look up the keyspace in the schema,
-        // so we need to rewrite this each time. plus, CF ID is different for every run.
-        testMutationWrite();
-
-        DataInputStream in = getInput("db.RowMutation.bin");
-        assert Mutation.serializer.deserialize(in, getVersion()) != null;
-        assert Mutation.serializer.deserialize(in, getVersion()) != null;
-        assert Mutation.serializer.deserialize(in, getVersion()) != null;
-        assert Mutation.serializer.deserialize(in, getVersion()) != null;
-        assert Mutation.serializer.deserialize(in, getVersion()) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-        in.close();
-    }
-
-    private void testTruncateWrite() throws IOException
-    {
-        Truncation tr = new Truncation(statics.KS, "Doesn't Really Matter");
-        TruncateResponse aff = new TruncateResponse(statics.KS, "Doesn't Matter Either", true);
-        TruncateResponse neg = new TruncateResponse(statics.KS, "Still Doesn't Matter", false);
-        DataOutputStreamPlus out = getOutput("db.Truncation.bin");
-        Truncation.serializer.serialize(tr, out, getVersion());
-        TruncateResponse.serializer.serialize(aff, out, getVersion());
-        TruncateResponse.serializer.serialize(neg, out, getVersion());
-
-        tr.createMessage().serialize(out, getVersion());
-        aff.createMessage().serialize(out, getVersion());
-        neg.createMessage().serialize(out, getVersion());
-        // todo: notice how CF names weren't validated.
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(tr, Truncation.serializer);
-        testSerializedSize(aff, TruncateResponse.serializer);
-        testSerializedSize(neg, TruncateResponse.serializer);
-    }
-
-    @Test
-    public void testTruncateRead() throws IOException
-    {
-        if (EXECUTE_WRITES)
-            testTruncateWrite();
-
-        DataInputStream in = getInput("db.Truncation.bin");
-        assert Truncation.serializer.deserialize(in, getVersion()) != null;
-        assert TruncateResponse.serializer.deserialize(in, getVersion()) != null;
-        assert TruncateResponse.serializer.deserialize(in, getVersion()) != null;
-        assert MessageIn.read(in, getVersion(), -1) != null;
-
-        // set up some fake callbacks so deserialization knows that what it's deserializing is a TruncateResponse
-        MessagingService.instance().setCallbackForTests(1, new CallbackInfo(null, null, TruncateResponse.serializer, false));
-        MessagingService.instance().setCallbackForTests(2, new CallbackInfo(null, null, TruncateResponse.serializer, false));
-
-        assert MessageIn.read(in, getVersion(), 1) != null;
-        assert MessageIn.read(in, getVersion(), 2) != null;
-        in.close();
-    }
-
-    private void testWriteResponseWrite() throws IOException
-    {
-        WriteResponse aff = new WriteResponse();
-        WriteResponse neg = new WriteResponse();
-        DataOutputStreamPlus out = getOutput("db.WriteResponse.bin");
-        WriteResponse.serializer.serialize(aff, out, getVersion());
-        WriteResponse.serializer.serialize(neg, out, getVersion());
-        out.close();
-
-        // test serializedSize
-        testSerializedSize(aff, WriteResponse.serializer);
-        testSerializedSize(neg, WriteResponse.serializer);
-    }
-
-    @Test
-    public void testWriteResponseRead() throws IOException
-    {
-        if (EXECUTE_WRITES)
-            testWriteResponseWrite();
-
-        DataInputStream in = getInput("db.WriteResponse.bin");
-        assert WriteResponse.serializer.deserialize(in, getVersion()) != null;
-        assert WriteResponse.serializer.deserialize(in, getVersion()) != null;
-        in.close();
-    }
-
-    private static ByteBuffer bb(String s)
-    {
-        return ByteBufferUtil.bytes(s);
-    }
-
-    private static CellName cn(String s)
-    {
-        return CellNames.simpleDense(ByteBufferUtil.bytes(s));
-    }
-
-    private static class Statics
-    {
-        private final String KS = KEYSPACE1;
-        private final ByteBuffer Key = ByteBufferUtil.bytes("Key01");
-        private final SortedSet<CellName> NamedCols = new TreeSet<CellName>(new SimpleDenseCellNameType(BytesType.instance))
-        {{
-            add(CellNames.simpleDense(ByteBufferUtil.bytes("AAA")));
-            add(CellNames.simpleDense(ByteBufferUtil.bytes("BBB")));
-            add(CellNames.simpleDense(ByteBufferUtil.bytes("CCC")));
-        }};
-        private final ByteBuffer SC = ByteBufferUtil.bytes("SCName");
-        private final SortedSet<CellName> NamedSCCols = new TreeSet<CellName>(new CompoundDenseCellNameType(Arrays.<AbstractType<?>>asList(BytesType.instance, BytesType.instance)))
-        {{
-            add(CellNames.compositeDense(SC, ByteBufferUtil.bytes("AAA")));
-            add(CellNames.compositeDense(SC, ByteBufferUtil.bytes("BBB")));
-            add(CellNames.compositeDense(SC, ByteBufferUtil.bytes("CCC")));
-        }};
-        private final String StandardCF = "Standard1";
-        private final String SuperCF = "Super1";
-
-        private final long readTs = 1369935512292L;
-
-        private final ColumnFamily StandardCf = ArrayBackedSortedColumns.factory.create(KS, StandardCF);
-        private final ColumnFamily SuperCf = ArrayBackedSortedColumns.factory.create(KS, SuperCF);
-
-        private final Row StandardRow = new Row(Util.dk("key0"), StandardCf);
-        private final Row SuperRow = new Row(Util.dk("key1"), SuperCf);
-        private final Row NullRow = new Row(Util.dk("key2"), null);
-
-        private Statics()
-        {
-            StandardCf.addColumn(new BufferCell(cn("aaaa")));
-            StandardCf.addColumn(new BufferCell(cn("bbbb"), bb("bbbbb-value")));
-            StandardCf.addColumn(new BufferCell(cn("cccc"), bb("ccccc-value"), 1000L));
-            StandardCf.addColumn(new BufferDeletedCell(cn("dddd"), 500, 1000));
-            StandardCf.addColumn(new BufferDeletedCell(cn("eeee"), bb("eeee-value"), 1001));
-            StandardCf.addColumn(new BufferExpiringCell(cn("ffff"), bb("ffff-value"), 2000, 1000));
-            StandardCf.addColumn(new BufferExpiringCell(cn("gggg"), bb("gggg-value"), 2001, 1000, 2002));
-
-            SuperCf.addColumn(new BufferCell(CellNames.compositeDense(SC, bb("aaaa"))));
-            SuperCf.addColumn(new BufferCell(CellNames.compositeDense(SC, bb("bbbb")), bb("bbbbb-value")));
-            SuperCf.addColumn(new BufferCell(CellNames.compositeDense(SC, bb("cccc")), bb("ccccc-value"), 1000L));
-            SuperCf.addColumn(new BufferDeletedCell(CellNames.compositeDense(SC, bb("dddd")), 500, 1000));
-            SuperCf.addColumn(new BufferDeletedCell(CellNames.compositeDense(SC, bb("eeee")), bb("eeee-value"), 1001));
-            SuperCf.addColumn(new BufferExpiringCell(CellNames.compositeDense(SC, bb("ffff")), bb("ffff-value"), 2000, 1000));
-            SuperCf.addColumn(new BufferExpiringCell(CellNames.compositeDense(SC, bb("gggg")), bb("gggg-value"), 2001, 1000, 2002));
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java b/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java
new file mode 100644
index 0000000..1c891ec
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java

@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import static org.junit.Assert.assertTrue;
+
+public class SinglePartitionReadCommandCQLTest extends CQLTester
+{
+    @Test
+    public void partitionLevelDeletionTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (bucket_id TEXT,name TEXT,data TEXT,PRIMARY KEY (bucket_id, name))");
+        execute("insert into %s (bucket_id, name, data) values ('8772618c9009cf8f5a5e0c18', 'test', 'hello')");
+        getCurrentColumnFamilyStore().forceBlockingFlush();
+        execute("insert into %s (bucket_id, name, data) values ('8772618c9009cf8f5a5e0c19', 'test2', 'hello');");
+        execute("delete from %s where bucket_id = '8772618c9009cf8f5a5e0c18'");
+        getCurrentColumnFamilyStore().forceBlockingFlush();
+        UntypedResultSet res = execute("select * from %s where bucket_id = '8772618c9009cf8f5a5e0c18' and name = 'test'");
+        assertTrue(res.isEmpty());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
new file mode 100644
index 0000000..940b4f9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java

@@ -0,0 +1,468 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.Iterables;
+import com.google.common.primitives.Ints;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.statements.SelectStatement;
+import org.apache.cassandra.db.filter.AbstractClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.btree.BTreeSet;
+
+public class SinglePartitionSliceCommandTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(SinglePartitionSliceCommandTest.class);
+
+    private static final String KEYSPACE = "ks";
+    private static final String TABLE = "tbl";
+
+    private static CFMetaData cfm;
+    private static ColumnDefinition v;
+    private static ColumnDefinition s;
+
+    private static final String TABLE_SCLICES = "tbl_slices";
+    private static CFMetaData CFM_SLICES;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        cfm = CFMetaData.Builder.create(KEYSPACE, TABLE)
+                                .addPartitionKey("k", UTF8Type.instance)
+                                .addStaticColumn("s", UTF8Type.instance)
+                                .addClusteringColumn("i", IntegerType.instance)
+                                .addRegularColumn("v", UTF8Type.instance)
+                                .build();
+
+        CFM_SLICES = CFMetaData.Builder.create(KEYSPACE, TABLE_SCLICES)
+                                       .addPartitionKey("k", UTF8Type.instance)
+                                       .addClusteringColumn("c1", Int32Type.instance)
+                                       .addClusteringColumn("c2", Int32Type.instance)
+                                       .addRegularColumn("v", IntegerType.instance)
+                                       .build();
+
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1), cfm, CFM_SLICES);
+
+        cfm = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+        v = cfm.getColumnDefinition(new ColumnIdentifier("v", true));
+        s = cfm.getColumnDefinition(new ColumnIdentifier("s", true));
+
+        CFM_SLICES = Schema.instance.getCFMetaData(KEYSPACE, TABLE_SCLICES);
+    }
+
+    @Before
+    public void truncate()
+    {
+        Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE).truncateBlocking();
+        Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_SCLICES).truncateBlocking();
+    }
+
+    @Test
+    public void staticColumnsAreFiltered() throws IOException
+    {
+        DecoratedKey key = cfm.decorateKey(ByteBufferUtil.bytes("k"));
+
+        UntypedResultSet rows;
+
+        QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, s, i, v) VALUES ('k', 's', 0, 'v')");
+        QueryProcessor.executeInternal("DELETE v FROM ks.tbl WHERE k='k' AND i=0");
+        QueryProcessor.executeInternal("DELETE FROM ks.tbl WHERE k='k' AND i=0");
+        rows = QueryProcessor.executeInternal("SELECT * FROM ks.tbl WHERE k='k' AND i=0");
+
+        for (UntypedResultSet.Row row: rows)
+        {
+            logger.debug("Current: k={}, s={}, v={}", (row.has("k") ? row.getString("k") : null), (row.has("s") ? row.getString("s") : null), (row.has("v") ? row.getString("v") : null));
+        }
+
+        assert rows.isEmpty();
+
+        ColumnFilter columnFilter = ColumnFilter.selection(PartitionColumns.of(v));
+        ByteBuffer zero = ByteBufferUtil.bytes(0);
+        Slices slices = Slices.with(cfm.comparator, Slice.make(Slice.Bound.inclusiveStartOf(zero), Slice.Bound.inclusiveEndOf(zero)));
+        ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(slices, false);
+        ReadCommand cmd = SinglePartitionReadCommand.create(true,
+                                                            cfm,
+                                                            FBUtilities.nowInSeconds(),
+                                                            columnFilter,
+                                                            RowFilter.NONE,
+                                                            DataLimits.NONE,
+                                                            key,
+                                                            sliceFilter);
+
+        DataOutputBuffer out = new DataOutputBuffer((int) ReadCommand.legacyReadCommandSerializer.serializedSize(cmd, MessagingService.VERSION_21));
+        ReadCommand.legacyReadCommandSerializer.serialize(cmd, out, MessagingService.VERSION_21);
+        DataInputPlus in = new DataInputBuffer(out.buffer(), true);
+        cmd = ReadCommand.legacyReadCommandSerializer.deserialize(in, MessagingService.VERSION_21);
+
+        logger.debug("ReadCommand: {}", cmd);
+        UnfilteredPartitionIterator partitionIterator = cmd.executeLocally(ReadOrderGroup.emptyGroup());
+        ReadResponse response = ReadResponse.createDataResponse(partitionIterator, cmd);
+
+        logger.debug("creating response: {}", response);
+        partitionIterator = response.makeIterator(cmd);
+        assert partitionIterator.hasNext();
+        UnfilteredRowIterator partition = partitionIterator.next();
+
+        LegacyLayout.LegacyUnfilteredPartition rowIter = LegacyLayout.fromUnfilteredRowIterator(cmd, partition);
+        Assert.assertEquals(Collections.emptyList(), rowIter.cells);
+    }
+
+    @Test
+    public void testMultiNamesCommandWithFlush()
+    {
+        testMultiNamesOrSlicesCommand(true, false);
+    }
+
+    @Test
+    public void testMultiNamesCommandWithoutFlush()
+    {
+        testMultiNamesOrSlicesCommand(false, false);
+    }
+
+    @Test
+    public void testMultiSlicesCommandWithFlush()
+    {
+        testMultiNamesOrSlicesCommand(true, true);
+    }
+
+    @Test
+    public void testMultiSlicesCommandWithoutFlush()
+    {
+        testMultiNamesOrSlicesCommand(false, true);
+    }
+
+    private AbstractClusteringIndexFilter createClusteringFilter(int uniqueCk1, int uniqueCk2, boolean isSlice)
+    {
+        Slices.Builder slicesBuilder = new Slices.Builder(CFM_SLICES.comparator);
+        BTreeSet.Builder<Clustering> namesBuilder = BTreeSet.builder(CFM_SLICES.comparator);
+
+        for (int ck1 = 0; ck1 < uniqueCk1; ck1++)
+        {
+            for (int ck2 = 0; ck2 < uniqueCk2; ck2++)
+            {
+                if (isSlice)
+                    slicesBuilder.add(Slice.make(Util.clustering(CFM_SLICES.comparator, ck1, ck2)));
+                else
+                    namesBuilder.add(Util.clustering(CFM_SLICES.comparator, ck1, ck2));
+            }
+        }
+        if (isSlice)
+            return new ClusteringIndexSliceFilter(slicesBuilder.build(), false);
+        return new ClusteringIndexNamesFilter(namesBuilder.build(), false);
+    }
+
+    private void testMultiNamesOrSlicesCommand(boolean flush, boolean isSlice)
+    {
+        boolean isTombstone = flush || isSlice;
+        int deletionTime = 5;
+        int ck1 = 1;
+        int uniqueCk1 = 2;
+        int uniqueCk2 = 3;
+
+        DecoratedKey key = CFM_SLICES.decorateKey(ByteBufferUtil.bytes("k"));
+        QueryProcessor.executeInternal(String.format("DELETE FROM ks.tbl_slices USING TIMESTAMP %d WHERE k='k' AND c1=%d",
+                                                     deletionTime,
+                                                     ck1));
+
+        if (flush)
+            Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_SCLICES).forceBlockingFlush();
+
+        AbstractClusteringIndexFilter clusteringFilter = createClusteringFilter(uniqueCk1, uniqueCk2, isSlice);
+        ReadCommand cmd = SinglePartitionReadCommand.create(CFM_SLICES,
+                                                            FBUtilities.nowInSeconds(),
+                                                            ColumnFilter.all(CFM_SLICES),
+                                                            RowFilter.NONE,
+                                                            DataLimits.NONE,
+                                                            key,
+                                                            clusteringFilter);
+
+        UnfilteredPartitionIterator partitionIterator = cmd.executeLocally(ReadOrderGroup.emptyGroup());
+        assert partitionIterator.hasNext();
+        UnfilteredRowIterator partition = partitionIterator.next();
+
+        int count = 0;
+        boolean open = true;
+        while (partition.hasNext())
+        {
+            Unfiltered unfiltered = partition.next();
+            if (isTombstone)
+            {
+                assertTrue(unfiltered.isRangeTombstoneMarker());
+                RangeTombstoneMarker marker = (RangeTombstoneMarker) unfiltered;
+
+                // check if it's open-close pair
+                assertTrue(marker.isOpen(false) == open);
+                // check deletion time same as Range Deletion
+                if (open)
+                    assertEquals(deletionTime, marker.openDeletionTime(false).markedForDeleteAt());
+                else
+                    assertEquals(deletionTime, marker.closeDeletionTime(false).markedForDeleteAt());
+
+                // check clustering values
+                Clustering clustering = Util.clustering(CFM_SLICES.comparator, ck1, count / 2);
+                for (int i = 0; i < CFM_SLICES.comparator.size(); i++)
+                {
+                    int cmp = CFM_SLICES.comparator.compareComponent(i,
+                                                                     clustering.values[i],
+                                                                     marker.clustering().values[i]);
+                    assertEquals(0, cmp);
+                }
+                open = !open;
+            }
+            else
+            {
+                // deleted row
+                assertTrue(unfiltered.isRow());
+                Row row = (Row) unfiltered;
+                assertEquals(deletionTime, row.deletion().time().markedForDeleteAt());
+                assertEquals(0, row.columnCount()); // no btree
+            }
+            count++;
+        }
+        if (isTombstone)
+            assertEquals(uniqueCk2 * 2, count); // open and close range tombstones
+        else
+            assertEquals(uniqueCk2, count);
+    }
+
+    private void checkForS(UnfilteredPartitionIterator pi)
+    {
+        Assert.assertTrue(pi.toString(), pi.hasNext());
+        UnfilteredRowIterator ri = pi.next();
+        Assert.assertTrue(ri.columns().contains(s));
+        Row staticRow = ri.staticRow();
+        Iterator<Cell> cellIterator = staticRow.cells().iterator();
+        Assert.assertTrue(staticRow.toString(cfm, true), cellIterator.hasNext());
+        Cell cell = cellIterator.next();
+        Assert.assertEquals(s, cell.column());
+        Assert.assertEquals(ByteBufferUtil.bytesToHex(cell.value()), ByteBufferUtil.bytes("s"), cell.value());
+        Assert.assertFalse(cellIterator.hasNext());
+    }
+
+    @Test
+    public void staticColumnsAreReturned() throws IOException
+    {
+        DecoratedKey key = cfm.decorateKey(ByteBufferUtil.bytes("k1"));
+
+        QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, s) VALUES ('k1', 's')");
+        Assert.assertFalse(QueryProcessor.executeInternal("SELECT s FROM ks.tbl WHERE k='k1'").isEmpty());
+
+        ColumnFilter columnFilter = ColumnFilter.selection(PartitionColumns.of(s));
+        ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(Slices.NONE, false);
+        ReadCommand cmd = SinglePartitionReadCommand.create(true,
+                                                            cfm,
+                                                            FBUtilities.nowInSeconds(),
+                                                            columnFilter,
+                                                            RowFilter.NONE,
+                                                            DataLimits.NONE,
+                                                            key,
+                                                            sliceFilter);
+
+        // check raw iterator for static cell
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator pi = cmd.executeLocally(orderGroup))
+        {
+            checkForS(pi);
+        }
+
+        ReadResponse response;
+        DataOutputBuffer out;
+        DataInputPlus in;
+        ReadResponse dst;
+
+        // check (de)serialized iterator for memtable static cell
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator pi = cmd.executeLocally(orderGroup))
+        {
+            response = ReadResponse.createDataResponse(pi, cmd);
+        }
+
+        out = new DataOutputBuffer((int) ReadResponse.serializer.serializedSize(response, MessagingService.VERSION_30));
+        ReadResponse.serializer.serialize(response, out, MessagingService.VERSION_30);
+        in = new DataInputBuffer(out.buffer(), true);
+        dst = ReadResponse.serializer.deserialize(in, MessagingService.VERSION_30);
+        try (UnfilteredPartitionIterator pi = dst.makeIterator(cmd))
+        {
+            checkForS(pi);
+        }
+
+        // check (de)serialized iterator for sstable static cell
+        Schema.instance.getColumnFamilyStoreInstance(cfm.cfId).forceBlockingFlush();
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup(); UnfilteredPartitionIterator pi = cmd.executeLocally(orderGroup))
+        {
+            response = ReadResponse.createDataResponse(pi, cmd);
+        }
+        out = new DataOutputBuffer((int) ReadResponse.serializer.serializedSize(response, MessagingService.VERSION_30));
+        ReadResponse.serializer.serialize(response, out, MessagingService.VERSION_30);
+        in = new DataInputBuffer(out.buffer(), true);
+        dst = ReadResponse.serializer.deserialize(in, MessagingService.VERSION_30);
+        try (UnfilteredPartitionIterator pi = dst.makeIterator(cmd))
+        {
+            checkForS(pi);
+        }
+    }
+
+    @Test
+    public void toCQLStringIsSafeToCall() throws IOException
+    {
+        DecoratedKey key = cfm.decorateKey(ByteBufferUtil.bytes("k1"));
+
+        ColumnFilter columnFilter = ColumnFilter.selection(PartitionColumns.of(s));
+        Slice slice = Slice.make(Slice.Bound.BOTTOM, Slice.Bound.inclusiveEndOf(ByteBufferUtil.bytes("i1")));
+        ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(Slices.with(cfm.comparator, slice), false);
+        ReadCommand cmd = SinglePartitionReadCommand.create(true,
+                                                            cfm,
+                                                            FBUtilities.nowInSeconds(),
+                                                            columnFilter,
+                                                            RowFilter.NONE,
+                                                            DataLimits.NONE,
+                                                            key,
+                                                            sliceFilter);
+
+        String ret = cmd.toCQLString();
+        Assert.assertNotNull(ret);
+        Assert.assertFalse(ret.isEmpty());
+    }
+
+
+    public static List<Unfiltered> getUnfilteredsFromSinglePartition(String q)
+    {
+        SelectStatement stmt = (SelectStatement) QueryProcessor.parseStatement(q).prepare(ClientState.forInternalCalls()).statement;
+
+        List<Unfiltered> unfiltereds = new ArrayList<>();
+        SinglePartitionReadCommand.Group query = (SinglePartitionReadCommand.Group) stmt.getQuery(QueryOptions.DEFAULT, 0);
+        Assert.assertEquals(1, query.commands.size());
+        SinglePartitionReadCommand command = Iterables.getOnlyElement(query.commands);
+        try (ReadOrderGroup group = ReadOrderGroup.forCommand(command);
+             UnfilteredPartitionIterator partitions = command.executeLocally(group))
+        {
+            assert partitions.hasNext();
+            try (UnfilteredRowIterator partition = partitions.next())
+            {
+                while (partition.hasNext())
+                {
+                    Unfiltered next = partition.next();
+                    unfiltereds.add(next);
+                }
+            }
+            assert !partitions.hasNext();
+        }
+        return unfiltereds;
+    }
+
+    private static void assertQueryReturnsSingleRT(String query)
+    {
+        List<Unfiltered> unfiltereds = getUnfilteredsFromSinglePartition(query);
+        Assert.assertEquals(2, unfiltereds.size());
+        Assert.assertTrue(unfiltereds.get(0).isRangeTombstoneMarker());
+        Assert.assertTrue(((RangeTombstoneMarker) unfiltereds.get(0)).isOpen(false));
+        Assert.assertTrue(unfiltereds.get(1).isRangeTombstoneMarker());
+        Assert.assertTrue(((RangeTombstoneMarker) unfiltereds.get(1)).isClose(false));
+    }
+
+    private static ByteBuffer bb(int v)
+    {
+        return Int32Type.instance.decompose(v);
+    }
+
+    /**
+     * tests the bug raised in CASSANDRA-14861, where the sstable min/max can
+     * exclude range tombstones for clustering ranges not also covered by rows
+     */
+    @Test
+    public void sstableFiltering()
+    {
+        QueryProcessor.executeOnceInternal("CREATE TABLE ks.legacy_mc_inaccurate_min_max (k int, c1 int, c2 int, c3 int, v int, primary key (k, c1, c2, c3))");
+        CFMetaData metadata = Schema.instance.getCFMetaData("ks", "legacy_mc_inaccurate_min_max");
+        ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata.cfId);
+
+        QueryProcessor.executeOnceInternal("INSERT INTO ks.legacy_mc_inaccurate_min_max (k, c1, c2, c3, v) VALUES (100, 2, 2, 2, 2)");
+        QueryProcessor.executeOnceInternal("DELETE FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1");
+        assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1");
+        cfs.forceBlockingFlush();
+        assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1");
+
+        assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1 AND c3=1"); // clustering names
+
+        cfs.truncateBlocking();
+
+        long nowMillis = System.currentTimeMillis();
+        Slice slice = Slice.make(new Clustering(bb(2), bb(3)), new Clustering(bb(10), bb(10)));
+        RangeTombstone rt = new RangeTombstone(slice, new DeletionTime(TimeUnit.MILLISECONDS.toMicros(nowMillis),
+                                                                       Ints.checkedCast(TimeUnit.MILLISECONDS.toSeconds(nowMillis))));
+        PartitionUpdate update = new PartitionUpdate(cfs.metadata, bb(100), cfs.metadata.partitionColumns(), 1);
+        update.add(rt);
+        new Mutation(update).apply();
+
+        assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=3 AND c2=2");
+        cfs.forceBlockingFlush();
+        assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=3 AND c2=2");
+        assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=3 AND c2=2 AND c3=2"); // clustering names
+
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java b/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java
index c3047b8..d151f59 100644
--- a/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java

@@ -17,28 +17,41 @@
  */
 package org.apache.cassandra.db;
 
+import java.io.File;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.*;
+import java.util.concurrent.Future;
 
+import org.apache.commons.io.FileUtils;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.CassandraVersion;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 public class SystemKeyspaceTest
 {
+    private static final String MIGRATION_SSTABLES_ROOT = "migration-sstable-root";
+
+    // any file name will do but unrelated files in our folders tend to be log files or very old data files
+    private static final String UNRELATED_FILE_NAME = "system.log";
+    private static final String UNRELATED_FOLDER_NAME = "snapshot-abc";
+
     @BeforeClass
     public static void prepSnapshotTracker()
     {
@@ -74,7 +87,8 @@
     {
         BytesToken token = new BytesToken(ByteBufferUtil.bytes("token3"));
         InetAddress address = InetAddress.getByName("127.0.0.2");
-        SystemKeyspace.updateTokens(address, Collections.<Token>singletonList(token));
+        Future<?> future = SystemKeyspace.updateTokens(address, Collections.singletonList(token), StageManager.getStage(Stage.MUTATION));
+        FBUtilities.waitOnFuture(future);
         assert SystemKeyspace.loadTokens().get(address).contains(token);
         SystemKeyspace.removeEndpoint(address);
         assert !SystemKeyspace.loadTokens().containsValue(token);
@@ -130,7 +144,7 @@
 
         // Compare versions again & verify that snapshots were created for all tables in the system ks
         SystemKeyspace.snapshotOnVersionChange();
-        assertEquals(SystemKeyspace.definition().cfMetaData().size(), getSystemSnapshotFiles().size());
+        assertEquals(SystemKeyspace.metadata().tables.size(), getSystemSnapshotFiles().size());
 
         // clear out the snapshots & set the previous recorded version equal to the latest, we shouldn't
         // see any new snapshots created this time.
@@ -147,6 +161,176 @@
         Keyspace.clearSnapshot(null, SystemKeyspace.NAME);
     }
 
+    @Test
+    public void testMigrateEmptyDataDirs() throws IOException
+    {
+        File dataDir = Paths.get(DatabaseDescriptor.getAllDataFileLocations()[0]).toFile();
+        if (new File(dataDir, "Emptykeyspace1").exists())
+            FileUtils.deleteDirectory(new File(dataDir, "Emptykeyspace1"));
+        assertTrue(new File(dataDir, "Emptykeyspace1").mkdirs());
+        assertEquals(0, numLegacyFiles());
+        SystemKeyspace.migrateDataDirs();
+        assertEquals(0, numLegacyFiles());
+
+        assertTrue(new File(dataDir, "Emptykeyspace1/table1").mkdirs());
+        assertEquals(0, numLegacyFiles());
+        SystemKeyspace.migrateDataDirs();
+        assertEquals(0, numLegacyFiles());
+
+        assertTrue(new File(dataDir, "Emptykeyspace1/wrong_file").createNewFile());
+        assertEquals(0, numLegacyFiles());
+        SystemKeyspace.migrateDataDirs();
+        assertEquals(0, numLegacyFiles());
+
+    }
+
+    @Test
+    public void testMigrateDataDirs_2_1() throws IOException
+    {
+        testMigrateDataDirs("2.1", 5); // see test data for num legacy files
+    }
+
+    @Test
+    public void testMigrateDataDirs_2_2() throws IOException
+    {
+        testMigrateDataDirs("2.2", 7); // see test data for num legacy files
+    }
+
+    private void testMigrateDataDirs(String version, int numLegacyFiles) throws IOException
+    {
+        Path migrationSSTableRoot = Paths.get(System.getProperty(MIGRATION_SSTABLES_ROOT), version);
+        Path dataDir = Paths.get(DatabaseDescriptor.getAllDataFileLocations()[0]);
+
+        FileUtils.copyDirectory(migrationSSTableRoot.toFile(), dataDir.toFile());
+
+        assertEquals(numLegacyFiles, numLegacyFiles());
+
+        SystemKeyspace.migrateDataDirs();
+
+        assertEquals(0, numLegacyFiles());
+    }
+
+    private static int numLegacyFiles()
+    {
+        int ret = 0;
+        Iterable<String> dirs = Arrays.asList(DatabaseDescriptor.getAllDataFileLocations());
+        for (String dataDir : dirs)
+        {
+            File dir = new File(dataDir);
+            for (File ksdir : dir.listFiles((d, n) -> new File(d, n).isDirectory()))
+            {
+                for (File cfdir : ksdir.listFiles((d, n) -> new File(d, n).isDirectory()))
+                {
+                    if (Descriptor.isLegacyFile(cfdir))
+                    {
+                        ret++;
+                    }
+                    else
+                    {
+                        File[] legacyFiles = cfdir.listFiles((d, n) -> Descriptor.isLegacyFile(new File(d, n)));
+                        if (legacyFiles != null)
+                            ret += legacyFiles.length;
+                    }
+                }
+            }
+        }
+        return ret;
+    }
+
+    @Test
+    public void testMigrateDataDirs_UnrelatedFiles_2_1() throws IOException
+    {
+        testMigrateDataDirsWithUnrelatedFiles("2.1");
+    }
+
+    @Test
+    public void testMigrateDataDirs_UnrelatedFiles_2_2() throws IOException
+    {
+        testMigrateDataDirsWithUnrelatedFiles("2.2");
+    }
+
+    private void testMigrateDataDirsWithUnrelatedFiles(String version) throws IOException
+    {
+        Path migrationSSTableRoot = Paths.get(System.getProperty(MIGRATION_SSTABLES_ROOT), version);
+        Path dataDir = Paths.get(DatabaseDescriptor.getAllDataFileLocations()[0]);
+
+        FileUtils.copyDirectory(migrationSSTableRoot.toFile(), dataDir.toFile());
+
+        addUnRelatedFiles(dataDir);
+
+        SystemKeyspace.migrateDataDirs();
+
+        checkUnrelatedFiles(dataDir);
+    }
+
+    /**
+     * Add some extra and totally unrelated files to the data dir and its sub-folders
+     */
+    private void addUnRelatedFiles(Path dataDir) throws IOException
+    {
+        File dir = new File(dataDir.toString());
+        createAndCheck(dir, UNRELATED_FILE_NAME, false);
+        createAndCheck(dir, UNRELATED_FOLDER_NAME, true);
+
+        for (File ksdir : dir.listFiles((d, n) -> new File(d, n).isDirectory()))
+        {
+            createAndCheck(ksdir, UNRELATED_FILE_NAME, false);
+            createAndCheck(ksdir, UNRELATED_FOLDER_NAME, true);
+
+            for (File cfdir : ksdir.listFiles((d, n) -> new File(d, n).isDirectory()))
+            {
+                createAndCheck(cfdir, UNRELATED_FILE_NAME, false);
+                createAndCheck(cfdir, UNRELATED_FOLDER_NAME, true);
+            }
+        }
+    }
+
+    /**
+     * Make sure the extra files are still in the data dir and its sub-folders, then
+     * remove them.
+     */
+    private void checkUnrelatedFiles(Path dataDir) throws IOException
+    {
+        File dir = new File(dataDir.toString());
+        checkAndDelete(dir, UNRELATED_FILE_NAME, false);
+        checkAndDelete(dir, UNRELATED_FOLDER_NAME, true);
+
+        for (File ksdir : dir.listFiles((d, n) -> new File(d, n).isDirectory()))
+        {
+            checkAndDelete(ksdir, UNRELATED_FILE_NAME, false);
+            checkAndDelete(ksdir, UNRELATED_FOLDER_NAME, true);
+
+            for (File cfdir : ksdir.listFiles((d, n) -> new File(d, n).isDirectory()))
+            {
+                checkAndDelete(cfdir, UNRELATED_FILE_NAME, false);
+                checkAndDelete(cfdir, UNRELATED_FOLDER_NAME, true);
+            }
+        }
+    }
+
+    private void createAndCheck(File dir, String fileName, boolean isDir) throws IOException
+    {
+        File f = new File(dir, fileName);
+
+        if (isDir)
+            f.mkdir();
+        else
+            f.createNewFile();
+
+        assertTrue(f.exists());
+    }
+
+    private void checkAndDelete(File dir, String fileName, boolean isDir) throws IOException
+    {
+        File f = new File(dir, fileName);
+        assertTrue(f.exists());
+
+        if (isDir)
+            FileUtils.deleteDirectory(f);
+        else
+            f.delete();
+    }
+
     private String getOlderVersionString()
     {
         String version = FBUtilities.getReleaseVersionString();

diff --git a/test/unit/org/apache/cassandra/db/TimeSortTest.java b/test/unit/org/apache/cassandra/db/TimeSortTest.java
index 1d9fb10..8ae05ea 100644
--- a/test/unit/org/apache/cassandra/db/TimeSortTest.java
+++ b/test/unit/org/apache/cassandra/db/TimeSortTest.java

@@ -18,134 +18,79 @@
 */
 package org.apache.cassandra.db;
 
-import java.io.IOException;
-import java.util.*;
-
-import org.junit.BeforeClass;
+import java.util.Iterator;
 import org.junit.Test;
-import static org.junit.Assert.assertEquals;
 
-import org.apache.cassandra.SchemaLoader;
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.getBytes;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.Util;
 
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import static org.junit.Assert.assertEquals;
 
-
-public class TimeSortTest
+public class TimeSortTest extends CQLTester
 {
-    private static final String KEYSPACE1 = "TimeSortTest";
-    private static final String CF_STANDARD1 = "StandardLong1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
+    @Test
+    public void testMixedSources() throws Throwable
     {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+        String tableName = createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 0, 100, 0, 100L);
+        cfs.forceBlockingFlush();
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 0, 0, 1, 0L);
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND b >= ? LIMIT 1000", 0, 10), row(0, 100, 0));
     }
 
     @Test
-    public void testMixedSources()
+    public void testTimeSort() throws Throwable
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(CF_STANDARD1);
-        Mutation rm;
-        DecoratedKey key = Util.dk("key0");
-
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(CF_STANDARD1, cellname(100), ByteBufferUtil.bytes("a"), 100);
-        rm.applyUnsafe();
-        cfStore.forceBlockingFlush();
-
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(CF_STANDARD1, cellname(0), ByteBufferUtil.bytes("b"), 0);
-        rm.applyUnsafe();
-
-        ColumnFamily cf = cfStore.getColumnFamily(key, cellname(10), Composites.EMPTY, false, 1000, System.currentTimeMillis());
-        Collection<Cell> cells = cf.getSortedColumns();
-        assert cells.size() == 1;
-    }
-
-    @Test
-    public void testTimeSort() throws IOException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfStore = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        String tableName = createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
         for (int i = 900; i < 1000; ++i)
-        {
-            Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes(Integer.toString(i)));
             for (int j = 0; j < 8; ++j)
-            {
-                rm.add(CF_STANDARD1, cellname(j * 2), ByteBufferUtil.bytes("a"), j * 2);
-            }
-            rm.applyUnsafe();
-        }
+                execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", i, j * 2, 0, (long)j * 2);
 
-        validateTimeSort(keyspace);
-
-        cfStore.forceBlockingFlush();
-        validateTimeSort(keyspace);
+        validateTimeSort();
+        cfs.forceBlockingFlush();
+        validateTimeSort();
 
         // interleave some new data to test memtable + sstable
         DecoratedKey key = Util.dk("900");
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
         for (int j = 0; j < 4; ++j)
-        {
-            rm.add(CF_STANDARD1, cellname(j * 2 + 1), ByteBufferUtil.bytes("b"), j * 2 + 1);
-        }
-        rm.applyUnsafe();
+            execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 900, j * 2 + 1, 1, (long)j * 2 + 1);
+
         // and some overwrites
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(CF_STANDARD1, cellname(0), ByteBufferUtil.bytes("c"), 100);
-        rm.add(CF_STANDARD1, cellname(10), ByteBufferUtil.bytes("c"), 100);
-        rm.applyUnsafe();
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 900, 0, 2, 100L);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 900, 10, 2, 100L);
 
         // verify
-        ColumnFamily cf = cfStore.getColumnFamily(key, cellname(0), Composites.EMPTY, false, 1000, System.currentTimeMillis());
-        Collection<Cell> cells = cf.getSortedColumns();
-        assertEquals(12, cells.size());
-        Iterator<Cell> iter = cells.iterator();
-        Cell cell;
+        UntypedResultSet results = execute("SELECT * FROM %s WHERE a = ? AND b >= ? LIMIT 1000", 900, 0);
+        assertEquals(12, results.size());
+        Iterator<UntypedResultSet.Row> iter = results.iterator();
         for (int j = 0; j < 8; j++)
         {
-            cell = iter.next();
-            assert cell.name().toByteBuffer().equals(getBytes(j));
+            UntypedResultSet.Row row = iter.next();
+            assertEquals(j, row.getInt("b"));
         }
-        TreeSet<CellName> columnNames = new TreeSet<CellName>(cfStore.getComparator());
-        columnNames.add(cellname(10));
-        columnNames.add(cellname(0));
-        cf = cfStore.getColumnFamily(QueryFilter.getNamesFilter(Util.dk("900"), CF_STANDARD1, columnNames, System.currentTimeMillis()));
-        assert "c".equals(ByteBufferUtil.string(cf.getColumn(cellname(0)).value()));
-        assert "c".equals(ByteBufferUtil.string(cf.getColumn(cellname(10)).value()));
+
+        assertRows(execute("SELECT * FROM %s WHERE a = ? AND b IN (?, ?)", 900, 0, 10),
+                row(900, 0, 2),
+                row(900, 10, 2));
     }
 
-    private void validateTimeSort(Keyspace keyspace)
+    private void validateTimeSort() throws Throwable
     {
         for (int i = 900; i < 1000; ++i)
         {
-            DecoratedKey key = Util.dk(Integer.toString(i));
             for (int j = 0; j < 8; j += 3)
             {
-                ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
-                ColumnFamily cf = cfs.getColumnFamily(key, cellname(j * 2), Composites.EMPTY, false, 1000, System.currentTimeMillis());
-                Collection<Cell> cells = cf.getSortedColumns();
-                assert cells.size() == 8 - j;
+                UntypedResultSet results = execute("SELECT writetime(c) AS wt FROM %s WHERE a = ? AND b >= ? LIMIT 1000", i, j * 2);
+                assertEquals(8 - j, results.size());
                 int k = j;
-                for (Cell c : cells)
-                {
-                    assertEquals((k++) * 2, c.timestamp());
-
-                }
+                for (UntypedResultSet.Row row : results)
+                    assertEquals((k++) * 2, row.getLong("wt"));
             }
         }
     }

diff --git a/test/unit/org/apache/cassandra/db/TransformerTest.java b/test/unit/org/apache/cassandra/db/TransformerTest.java
new file mode 100644
index 0000000..d56d8cd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/TransformerTest.java

@@ -0,0 +1,325 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.db;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.FilteredRows;
+import org.apache.cassandra.db.transform.MoreRows;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class TransformerTest
+{
+
+    static final CFMetaData metadata = metadata();
+    static final DecoratedKey partitionKey = new BufferDecoratedKey(new Murmur3Partitioner.LongToken(0L), ByteBufferUtil.EMPTY_BYTE_BUFFER);
+    static final Row staticRow = BTreeRow.singleCellRow(Clustering.STATIC_CLUSTERING, new BufferCell(metadata.partitionColumns().columns(true).getSimple(0), 0L, 0, 0, ByteBufferUtil.bytes(-1), null));
+
+    static CFMetaData metadata()
+    {
+        CFMetaData.Builder builder = CFMetaData.Builder.create("", "");
+        builder.addPartitionKey("pk", BytesType.instance);
+        builder.addClusteringColumn("c", Int32Type.instance);
+        builder.addStaticColumn("s", Int32Type.instance);
+        builder.addRegularColumn("v", Int32Type.instance);
+        return builder.build();
+    }
+
+    // Mock Data
+
+    static abstract class AbstractBaseRowIterator<U extends Unfiltered> extends AbstractIterator<U> implements BaseRowIterator<U>
+    {
+        private final int i;
+        private boolean returned;
+
+        protected AbstractBaseRowIterator(int i)
+        {
+            this.i = i;
+        }
+
+        protected U computeNext()
+        {
+            if (returned)
+                return endOfData();
+            returned = true;
+            return (U) row(i);
+        }
+
+        public CFMetaData metadata()
+        {
+            return metadata;
+        }
+
+        public boolean isReverseOrder()
+        {
+            return false;
+        }
+
+        public PartitionColumns columns()
+        {
+            return metadata.partitionColumns();
+        }
+
+        public DecoratedKey partitionKey()
+        {
+            return partitionKey;
+        }
+
+        public Row staticRow()
+        {
+            return staticRow;
+        }
+
+        public boolean isEmpty()
+        {
+            return false;
+        }
+
+        public void close()
+        {
+        }
+    }
+
+    private static UnfilteredRowIterator unfiltered(int i)
+    {
+        class Iter extends AbstractBaseRowIterator<Unfiltered> implements UnfilteredRowIterator
+        {
+            protected Iter(int i)
+            {
+                super(i);
+            }
+
+            public DeletionTime partitionLevelDeletion()
+            {
+                return DeletionTime.LIVE;
+            }
+
+            public EncodingStats stats()
+            {
+                return EncodingStats.NO_STATS;
+            }
+        }
+        return new Iter(i);
+    }
+
+    private static RowIterator filtered(int i)
+    {
+        class Iter extends AbstractBaseRowIterator<Row> implements RowIterator
+        {
+            protected Iter(int i)
+            {
+                super(i);
+            }
+        }
+        return new Iter(i);
+    }
+
+    private static Row row(int i)
+    {
+        return BTreeRow.singleCellRow(Util.clustering(metadata.comparator, i),
+                                      new BufferCell(metadata.partitionColumns().columns(false).getSimple(0), 1L, BufferCell.NO_TTL, BufferCell.NO_DELETION_TIME, ByteBufferUtil.bytes(i), null));
+    }
+
+    // Transformations that check mock data ranges
+
+    private static Transformation expect(int from, int to, List<Check> checks)
+    {
+        Expect expect = new Expect(from, to);
+        checks.add(expect);
+        return expect;
+    }
+
+    abstract static class Check extends Transformation
+    {
+        public abstract void check();
+    }
+
+    static class Expect extends Check
+    {
+        final int from, to;
+        int cur;
+        boolean closed;
+
+        Expect(int from, int to)
+        {
+            this.from = from;
+            this.to = to;
+            this.cur = from;
+        }
+
+        public Row applyToRow(Row row)
+        {
+            Assert.assertEquals(cur++, ByteBufferUtil.toInt(row.clustering().get(0)));
+            return row;
+        }
+
+        public void onPartitionClose()
+        {
+            Assert.assertEquals(to, cur);
+            closed = true;
+        }
+
+        public void check()
+        {
+            Assert.assertTrue(closed);
+        }
+    }
+
+    // Combinations of mock data and checks for an empty, singleton, and extending (sequential) range
+
+    private static enum Filter
+    {
+        INIT, APPLY_INNER, APPLY_OUTER, NONE
+    }
+
+    private static BaseRowIterator<?> empty(Filter filter, List<Check> checks)
+    {
+        switch (filter)
+        {
+            case INIT:
+                return Transformation.apply(EmptyIterators.row(metadata, partitionKey, false), expect(0, 0, checks));
+            case APPLY_INNER:
+                return Transformation.apply(FilteredRows.filter(Transformation.apply(EmptyIterators.unfilteredRow(metadata, partitionKey, false), expect(0, 0, checks)), Integer.MAX_VALUE), expect(0, 0, checks));
+            case APPLY_OUTER:
+            case NONE:
+                return Transformation.apply(EmptyIterators.unfilteredRow(metadata, partitionKey, false), expect(0, 0, checks));
+            default:
+                throw new IllegalStateException();
+        }
+    }
+
+    private static BaseRowIterator<?> singleton(Filter filter, int i, List<Check> checks)
+    {
+        switch (filter)
+        {
+            case INIT:
+                return Transformation.apply(filtered(i), expect(i, i + 1, checks));
+            case APPLY_INNER:
+                return FilteredRows.filter(Transformation.apply(unfiltered(i), expect(i, i + 1, checks)), Integer.MAX_VALUE);
+            case APPLY_OUTER:
+            case NONE:
+                return Transformation.apply(unfiltered(i), expect(i, i + 1, checks));
+            default:
+                throw new IllegalStateException();
+        }
+    }
+
+    private static BaseRowIterator<?> extendingIterator(int count, Filter filter, List<Check> checks)
+    {
+        class RefillNested extends Expect implements MoreRows<BaseRowIterator<?>>
+        {
+            boolean returnedEmpty, returnedSingleton, returnedNested;
+            RefillNested(int from)
+            {
+                super(from, count);
+            }
+
+            public BaseRowIterator<?> moreContents()
+            {
+                // first call return an empty iterator,
+                // second call return a singleton iterator (with a function that expects to be around to receive just that item)
+                // third call return a nested version of ourselves, with a function that expects to receive all future values
+                // fourth call, return null, indicating no more iterators to return
+
+                if (!returnedEmpty)
+                {
+                    returnedEmpty = true;
+                    return empty(filter, checks);
+                }
+
+                if (!returnedSingleton)
+                {
+                    returnedSingleton = true;
+                    return singleton(filter, from, checks);
+                }
+
+                if (from + 1 >= to)
+                    return null;
+
+                if (!returnedNested)
+                {
+                    returnedNested = true;
+
+                    RefillNested refill = new RefillNested(from + 1);
+                    checks.add(refill);
+                    return refill.applyTo(empty(filter, checks));
+                }
+
+                return null;
+            }
+
+            BaseRowIterator<?> applyTo(BaseRowIterator<?> iter)
+            {
+                if (iter instanceof UnfilteredRowIterator)
+                    return Transformation.apply(MoreRows.extend((UnfilteredRowIterator) iter, this), this);
+                else
+                    return Transformation.apply(MoreRows.extend((RowIterator) iter, this), this);
+            }
+        }
+
+        RefillNested refill = new RefillNested(0);
+        checks.add(refill);
+
+        BaseRowIterator<?> iter = empty(filter, checks);
+        switch (filter)
+        {
+            case APPLY_OUTER:
+                return FilteredRows.filter((UnfilteredRowIterator) refill.applyTo(iter), Integer.MAX_VALUE);
+            case APPLY_INNER:
+            case INIT:
+            case NONE:
+                return refill.applyTo(iter);
+            default:
+                throw new IllegalStateException();
+        }
+    }
+
+    @Test
+    public void testRowExtension()
+    {
+        for (Filter filter : Filter.values())
+        {
+            List<Check> checks = new ArrayList<>();
+
+            BaseRowIterator<?> iter = extendingIterator(5, filter, checks);
+            for (int i = 0 ; i < 5 ; i++)
+            {
+                Unfiltered u = iter.next();
+                assert u instanceof Row;
+                Assert.assertEquals(i, ByteBufferUtil.toInt(u.clustering().get(0)));
+            }
+            iter.close();
+
+            for (Check check : checks)
+                check.check();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/VerifyTest.java b/test/unit/org/apache/cassandra/db/VerifyTest.java
index 27e99ab..0748270 100644
--- a/test/unit/org/apache/cassandra/db/VerifyTest.java
+++ b/test/unit/org/apache/cassandra/db/VerifyTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.db;
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,30 +15,28 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- *
  */
+package org.apache.cassandra.db;
 
 import com.google.common.base.Charsets;
+
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
+import org.apache.cassandra.UpdateBuilder;
+import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.Verifier;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
 import org.apache.cassandra.db.marshal.UUIDType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.io.FSWriteError;
-import org.apache.cassandra.io.compress.*;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.commons.lang3.StringUtils;
 import org.junit.BeforeClass;
@@ -49,13 +45,14 @@
 
 import java.io.*;
 import java.nio.file.Files;
-import java.util.HashMap;
+import java.util.Collections;
 import java.util.List;
-import java.util.zip.Adler32;
+import java.util.concurrent.ExecutionException;
+import java.util.zip.CRC32;
 import java.util.zip.CheckedInputStream;
 
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.column;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
@@ -80,25 +77,24 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
-        CompressionParameters compressionParameters = new CompressionParameters(SnappyCompressor.instance, 32768, new HashMap<String, String>());
+        CompressionParams compressionParameters = CompressionParams.snappy(32768);
 
         SchemaLoader.loadSchema();
         SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF).compressionParameters(compressionParameters),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF2).compressionParameters(compressionParameters),
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF).compression(compressionParameters),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF2).compression(compressionParameters),
                                     SchemaLoader.standardCFMD(KEYSPACE, CF3),
                                     SchemaLoader.standardCFMD(KEYSPACE, CF4),
                                     SchemaLoader.standardCFMD(KEYSPACE, CORRUPT_CF),
                                     SchemaLoader.standardCFMD(KEYSPACE, CORRUPT_CF2),
-                                    SchemaLoader.standardCFMD(KEYSPACE, COUNTER_CF, BytesType.instance).defaultValidator(CounterColumnType.instance).compressionParameters(compressionParameters),
-                                    SchemaLoader.standardCFMD(KEYSPACE, COUNTER_CF2, BytesType.instance).defaultValidator(CounterColumnType.instance).compressionParameters(compressionParameters),
-                                    SchemaLoader.standardCFMD(KEYSPACE, COUNTER_CF3, BytesType.instance).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE, COUNTER_CF4, BytesType.instance).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CORRUPTCOUNTER_CF, BytesType.instance).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CORRUPTCOUNTER_CF2, BytesType.instance).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF_UUID).keyValidator(UUIDType.instance));
+                                    SchemaLoader.counterCFMD(KEYSPACE, COUNTER_CF).compression(compressionParameters),
+                                    SchemaLoader.counterCFMD(KEYSPACE, COUNTER_CF2).compression(compressionParameters),
+                                    SchemaLoader.counterCFMD(KEYSPACE, COUNTER_CF3),
+                                    SchemaLoader.counterCFMD(KEYSPACE, COUNTER_CF4),
+                                    SchemaLoader.counterCFMD(KEYSPACE, CORRUPTCOUNTER_CF),
+                                    SchemaLoader.counterCFMD(KEYSPACE, CORRUPTCOUNTER_CF2),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF_UUID, 0, UUIDType.instance));
     }
 
 
@@ -109,11 +105,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
 
-        fillCF(cfs, KEYSPACE, CF, 2);
+        fillCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        try(Verifier verifier = new Verifier(cfs, sstable, false))
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(false);
         }
@@ -130,12 +126,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COUNTER_CF);
 
-        fillCounterCF(cfs, KEYSPACE, COUNTER_CF, 2);
+        fillCounterCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(false);
         }
@@ -152,12 +147,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2);
 
-        fillCF(cfs, KEYSPACE, CF2, 2);
+        fillCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(true);
         }
@@ -174,12 +168,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COUNTER_CF2);
 
-        fillCounterCF(cfs, KEYSPACE, COUNTER_CF2, 2);
+        fillCounterCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(true);
         }
@@ -196,12 +189,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF3);
 
-        fillCF(cfs, KEYSPACE, CF3, 2);
+        fillCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(false);
         }
@@ -218,12 +210,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COUNTER_CF3);
 
-        fillCounterCF(cfs, KEYSPACE, COUNTER_CF3, 2);
+        fillCounterCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(false);
         }
@@ -240,12 +231,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF4);
 
-        fillCF(cfs, KEYSPACE, CF4, 2);
+        fillCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(true);
         }
@@ -262,12 +252,11 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COUNTER_CF4);
 
-        fillCounterCF(cfs, KEYSPACE, COUNTER_CF4, 2);
+        fillCounterCF(cfs, 2);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(true);
         }
@@ -285,21 +274,20 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CORRUPT_CF);
 
-        fillCF(cfs, KEYSPACE, CORRUPT_CF, 2);
+        fillCF(cfs, 2);
 
-        List<Row> rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
+        Util.getAll(Util.cmd(cfs).build());
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
 
-        RandomAccessFile file = new RandomAccessFile(sstable.descriptor.filenameFor(Component.DIGEST), "rw");
+        RandomAccessFile file = new RandomAccessFile(sstable.descriptor.filenameFor(sstable.descriptor.digestComponent), "rw");
         Long correctChecksum = Long.parseLong(file.readLine());
         file.close();
 
-        writeChecksum(++correctChecksum, sstable.descriptor.filenameFor(Component.DIGEST));
+        writeChecksum(++correctChecksum, sstable.descriptor.filenameFor(sstable.descriptor.digestComponent));
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
-        try
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(false);
             fail("Expected a CorruptSSTableException to be thrown");
@@ -315,15 +303,15 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CORRUPT_CF2);
 
-        fillCF(cfs, KEYSPACE, CORRUPT_CF2, 2);
+        fillCF(cfs, 2);
 
-        List<Row> rows = cfs.getRangeSlice(Util.range("", ""), null, new IdentityQueryFilter(), 1000);
+        Util.getAll(Util.cmd(cfs).build());
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         // overwrite one row with garbage
-        long row0Start = sstable.getPosition(RowPosition.ForKey.get(ByteBufferUtil.bytes("0"), sstable.partitioner), SSTableReader.Operator.EQ).position;
-        long row1Start = sstable.getPosition(RowPosition.ForKey.get(ByteBufferUtil.bytes("1"), sstable.partitioner), SSTableReader.Operator.EQ).position;
+        long row0Start = sstable.getPosition(PartitionPosition.ForKey.get(ByteBufferUtil.bytes("0"), cfs.getPartitioner()), SSTableReader.Operator.EQ).position;
+        long row1Start = sstable.getPosition(PartitionPosition.ForKey.get(ByteBufferUtil.bytes("1"), cfs.getPartitioner()), SSTableReader.Operator.EQ).position;
         long startPosition = row0Start < row1Start ? row0Start : row1Start;
         long endPosition = row0Start < row1Start ? row1Start : row0Start;
 
@@ -333,59 +321,113 @@
         file.close();
 
         // Update the Digest to have the right Checksum
-        writeChecksum(simpleFullChecksum(sstable.getFilename()), sstable.descriptor.filenameFor(Component.DIGEST));
+        writeChecksum(simpleFullChecksum(sstable.getFilename()), sstable.descriptor.filenameFor(sstable.descriptor.digestComponent));
 
-        Verifier verifier = new Verifier(cfs, sstable, false);
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
+        {
+            // First a simple verify checking digest, which should succeed
+            try
+            {
+                verifier.verify(false);
+            }
+            catch (CorruptSSTableException err)
+            {
+                fail("Simple verify should have succeeded as digest matched");
+            }
 
-        // First a simple verify checking digest, which should succeed
-        try
+            // Now try extended verify
+            try
+            {
+                verifier.verify(true);
+
+            }
+            catch (CorruptSSTableException err)
+            {
+                return;
+            }
+            fail("Expected a CorruptSSTableException to be thrown");
+        }
+    }
+
+    @Test(expected = CorruptSSTableException.class)
+    public void testVerifyBrokenSSTableMetadata() throws IOException, WriteTimeoutException
+    {
+        CompactionManager.instance.disableAutoCompaction();
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CORRUPT_CF2);
+
+        fillCF(cfs, 2);
+
+        Util.getAll(Util.cmd(cfs).build());
+
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+
+        String filenameToCorrupt = sstable.descriptor.filenameFor(Component.STATS);
+        RandomAccessFile file = new RandomAccessFile(filenameToCorrupt, "rw");
+        file.seek(0);
+        file.writeBytes(StringUtils.repeat('z', 2));
+        file.close();
+
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
         {
             verifier.verify(false);
         }
-        catch (CorruptSSTableException err)
-        {
-            fail("Simple verify should have succeeded as digest matched");
-        }
-
-        // Now try extended verify
-        try
-        {
-            verifier.verify(true);
-
-        }
-        catch (CorruptSSTableException err)
-        {
-            return;
-        }
-        fail("Expected a CorruptSSTableException to be thrown");
-
     }
 
-    protected void fillCF(ColumnFamilyStore cfs, String keyspace, String columnFamily, int rowsPerSSTable)
+    @Test
+    public void testMutateRepair() throws IOException, ExecutionException, InterruptedException
     {
-        for (int i = 0; i < rowsPerSSTable; i++)
+        CompactionManager.instance.disableAutoCompaction();
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CORRUPT_CF2);
+
+        fillCF(cfs, 2);
+
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        sstable.descriptor.getMetadataSerializer().mutateRepairedAt(sstable.descriptor, 1);
+        sstable.reloadSSTableMetadata();
+        cfs.getTracker().notifySSTableRepairedStatusChanged(Collections.singleton(sstable));
+        assertTrue(sstable.isRepaired());
+        cfs.forceMajorCompaction();
+
+        sstable = cfs.getLiveSSTables().iterator().next();
+        Long correctChecksum;
+        try (RandomAccessFile file = new RandomAccessFile(sstable.descriptor.filenameFor(sstable.descriptor.digestComponent), "rw"))
         {
-            String key = String.valueOf(i);
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspace, columnFamily);
-            cf.addColumn(column("c1", "1", 1L));
-            cf.addColumn(column("c2", "2", 1L));
-            Mutation rm = new Mutation(keyspace, ByteBufferUtil.bytes(key), cf);
-            rm.apply();
+            correctChecksum = Long.parseLong(file.readLine());
+        }
+        writeChecksum(++correctChecksum, sstable.descriptor.filenameFor(sstable.descriptor.digestComponent));
+        try (Verifier verifier = new Verifier(cfs, sstable, false))
+        {
+            verifier.verify(false);
+            fail("should be corrupt");
+        }
+        catch (CorruptSSTableException e)
+        {}
+        assertFalse(sstable.isRepaired());
+    }
+
+
+    protected void fillCF(ColumnFamilyStore cfs, int partitionsPerSSTable)
+    {
+        for (int i = 0; i < partitionsPerSSTable; i++)
+        {
+            UpdateBuilder.create(cfs.metadata, String.valueOf(i))
+                         .newRow("c1").add("val", "1")
+                         .newRow("c2").add("val", "2")
+                         .apply();
         }
 
         cfs.forceBlockingFlush();
     }
 
-    protected void fillCounterCF(ColumnFamilyStore cfs, String keyspace, String columnFamily, int rowsPerSSTable) throws WriteTimeoutException
+    protected void fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException
     {
-        for (int i = 0; i < rowsPerSSTable; i++)
+        for (int i = 0; i < partitionsPerSSTable; i++)
         {
-            String key = String.valueOf(i);
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspace, columnFamily);
-            Mutation rm = new Mutation(keyspace, ByteBufferUtil.bytes(key), cf);
-            rm.addCounter(columnFamily, cellname("Column1"), 100);
-            CounterMutation cm = new CounterMutation(rm, ConsistencyLevel.ONE);
-            cm.apply();
+            UpdateBuilder.create(cfs.metadata, String.valueOf(i))
+                         .newRow("c1").add("val", 100L)
+                         .apply();
         }
 
         cfs.forceBlockingFlush();
@@ -394,8 +436,8 @@
     protected long simpleFullChecksum(String filename) throws IOException
     {
         FileInputStream inputStream = new FileInputStream(filename);
-        Adler32 adlerChecksum = new Adler32();
-        CheckedInputStream cinStream = new CheckedInputStream(inputStream, adlerChecksum);
+        CRC32 checksum = new CRC32();
+        CheckedInputStream cinStream = new CheckedInputStream(inputStream, checksum);
         byte[] b = new byte[128];
         while (cinStream.read(b) >= 0) {
         }

diff --git a/test/unit/org/apache/cassandra/db/columniterator/SSTableReverseIteratorTest.java b/test/unit/org/apache/cassandra/db/columniterator/SSTableReverseIteratorTest.java
new file mode 100644
index 0000000..2f183c0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/columniterator/SSTableReverseIteratorTest.java

@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.columniterator;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import com.google.common.collect.Iterables;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+public class SSTableReverseIteratorTest
+{
+    private static final String KEYSPACE = "ks";
+    private Random random;
+
+    @BeforeClass
+    public static void setupClass()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1));
+    }
+
+    @Before
+    public void setUp()
+    {
+        random = new Random(0);
+    }
+
+    private ByteBuffer bytes(int size)
+    {
+        byte[] b = new byte[size];
+        random.nextBytes(b);
+        return ByteBuffer.wrap(b);
+    }
+
+    /**
+     * SSTRI shouldn't bail out if it encounters empty blocks (due to dropped columns)
+     */
+    @Test
+    public void emptyBlockTolerance()
+    {
+        String table = "empty_block_tolerance";
+        QueryProcessor.executeInternal(String.format("CREATE TABLE %s.%s (k INT, c int, v1 blob, v2 blob, primary key (k, c))", KEYSPACE, table));
+        ColumnFamilyStore tbl = Keyspace.open(KEYSPACE).getColumnFamilyStore(table);
+        assert tbl != null;
+
+        int key = 100;
+
+        QueryProcessor.executeInternal(String.format("UPDATE %s.%s SET v1=?, v2=? WHERE k=? AND c=?", KEYSPACE, table), bytes(8), bytes(8), key, 0);
+        QueryProcessor.executeInternal(String.format("UPDATE %s.%s SET v1=? WHERE k=? AND c=?", KEYSPACE, table), bytes(0x20000), key, 1);
+        QueryProcessor.executeInternal(String.format("UPDATE %s.%s SET v1=? WHERE k=? AND c=?", KEYSPACE, table), bytes(0x20000), key, 2);
+        QueryProcessor.executeInternal(String.format("UPDATE %s.%s SET v1=? WHERE k=? AND c=?", KEYSPACE, table), bytes(0x20000), key, 3);
+
+        tbl.forceBlockingFlush();
+        SSTableReader sstable = Iterables.getOnlyElement(tbl.getLiveSSTables());
+        DecoratedKey dk = tbl.getPartitioner().decorateKey(Int32Type.instance.decompose(key));
+        RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
+        Assert.assertTrue(indexEntry.isIndexed());
+        Assert.assertTrue(indexEntry.columnsIndex().size() > 2);
+
+        // drop v1 so the first 2 index blocks only contain empty unfiltereds
+        QueryProcessor.executeInternal(String.format("ALTER TABLE %s.%s DROP v1", KEYSPACE, table));
+
+        UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT v2 FROM %s.%s WHERE k=? ORDER BY c DESC", KEYSPACE, table), key);
+        Assert.assertEquals(1, result.size());
+
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java b/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java
new file mode 100644
index 0000000..6f51eaf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java

@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.commitlog;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.commitlog.AbstractCommitLogService.SyncRunnable;
+import org.apache.cassandra.utils.Clock;
+import org.apache.cassandra.utils.FreeRunningClock;
+
+import static org.apache.cassandra.db.commitlog.AbstractCommitLogService.DEFAULT_MARKER_INTERVAL_MILLIS;
+
+public class AbstractCommitLogServiceTest
+{
+    @BeforeClass
+    public static void before()
+    {
+        DatabaseDescriptor.setCommitLogSync(Config.CommitLogSync.periodic);
+        DatabaseDescriptor.setCommitLogSyncPeriod(10 * 1000);
+    }
+
+    @Test
+    public void testConstructorSyncIsQuantized()
+    {
+        long syncTimeMillis = 10 * 1000;
+        FakeCommitLogService commitLogService = new FakeCommitLogService(syncTimeMillis);
+        Assert.assertEquals(DEFAULT_MARKER_INTERVAL_MILLIS, commitLogService.markerIntervalMillis);
+        Assert.assertEquals(syncTimeMillis, commitLogService.syncIntervalMillis);
+    }
+
+    @Test
+    public void testConstructorSyncEqualsMarkerDefault()
+    {
+        long syncTimeMillis = 100;
+        FakeCommitLogService commitLogService = new FakeCommitLogService(syncTimeMillis);
+        Assert.assertEquals(DEFAULT_MARKER_INTERVAL_MILLIS, commitLogService.markerIntervalMillis);
+        Assert.assertEquals(syncTimeMillis, commitLogService.syncIntervalMillis);
+        Assert.assertEquals(commitLogService.markerIntervalMillis, commitLogService.syncIntervalMillis);
+    }
+
+    @Test
+    public void testConstructorSyncShouldRoundUp()
+    {
+        long syncTimeMillis = 151;
+        long expectedMillis = 200;
+        FakeCommitLogService commitLogService = new FakeCommitLogService(syncTimeMillis);
+        Assert.assertEquals(DEFAULT_MARKER_INTERVAL_MILLIS, commitLogService.markerIntervalMillis);
+        Assert.assertEquals(expectedMillis, commitLogService.syncIntervalMillis);
+    }
+
+    @Test
+    public void testConstructorSyncShouldRoundDown()
+    {
+        long syncTimeMillis = 121;
+        long expectedMillis = 100;
+        FakeCommitLogService commitLogService = new FakeCommitLogService(syncTimeMillis);
+        Assert.assertEquals(DEFAULT_MARKER_INTERVAL_MILLIS, commitLogService.markerIntervalMillis);
+        Assert.assertEquals(expectedMillis, commitLogService.syncIntervalMillis);
+    }
+
+    @Test
+    public void testConstructorSyncTinyValue()
+    {
+        long syncTimeMillis = 10;
+        long expectedNanos = syncTimeMillis;
+        FakeCommitLogService commitLogService = new FakeCommitLogService(syncTimeMillis);
+        Assert.assertEquals(expectedNanos, commitLogService.markerIntervalMillis);
+        Assert.assertEquals(expectedNanos, commitLogService.syncIntervalMillis);
+    }
+
+    private static class FakeCommitLogService extends AbstractCommitLogService
+    {
+        FakeCommitLogService(long syncIntervalMillis)
+        {
+            super(new FakeCommitLog(), "This is not a real commit log", syncIntervalMillis, true);
+            lastSyncedAt = 0;
+        }
+
+        @Override
+        void start()
+        {
+            // nop
+        }
+
+        protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
+        {
+            // nop
+        }
+    }
+
+    @Test
+    public void testSync()
+    {
+        long syncTimeMillis = AbstractCommitLogService.DEFAULT_MARKER_INTERVAL_MILLIS * 2;
+        FreeRunningClock clock = new FreeRunningClock();
+        FakeCommitLogService commitLogService = new FakeCommitLogService(syncTimeMillis);
+        SyncRunnable syncRunnable = commitLogService.new SyncRunnable(clock);
+        FakeCommitLog commitLog = (FakeCommitLog) commitLogService.commitLog;
+
+        // at time 0
+        Assert.assertTrue(syncRunnable.sync());
+        Assert.assertEquals(1, commitLog.markCount.get());
+        Assert.assertEquals(0, commitLog.syncCount.get());
+
+        // at time DEFAULT_MARKER_INTERVAL_MILLIS
+        clock.advance(DEFAULT_MARKER_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
+        Assert.assertTrue(syncRunnable.sync());
+        Assert.assertEquals(2, commitLog.markCount.get());
+        Assert.assertEquals(0, commitLog.syncCount.get());
+
+        // at time DEFAULT_MARKER_INTERVAL_MILLIS * 2
+        clock.advance(DEFAULT_MARKER_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
+        Assert.assertTrue(syncRunnable.sync());
+        Assert.assertEquals(2, commitLog.markCount.get());
+        Assert.assertEquals(1, commitLog.syncCount.get());
+
+        // at time DEFAULT_MARKER_INTERVAL_MILLIS * 3, but with shutdown!
+        clock.advance(DEFAULT_MARKER_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
+        commitLogService.shutdown();
+        Assert.assertFalse(syncRunnable.sync());
+        Assert.assertEquals(2, commitLog.markCount.get());
+        Assert.assertEquals(2, commitLog.syncCount.get());
+    }
+
+    private static class FakeCommitLog extends CommitLog
+    {
+        private final AtomicInteger markCount = new AtomicInteger();
+        private final AtomicInteger syncCount = new AtomicInteger();
+
+        FakeCommitLog()
+        {
+            super(DatabaseDescriptor.getCommitLogLocation(), null);
+        }
+
+        @Override
+        CommitLog start()
+        {
+            // this is a bit dicey. we need to start the allocator, but starting the parent's executor will muck things
+            // up as it is pointing to a different executor service, not the fake one in this test class.
+            allocator.start();
+            return this;
+        }
+
+        @Override
+        public void sync(boolean syncAllSegments, boolean flush)
+        {
+            if (flush)
+                syncCount.incrementAndGet();
+            else
+                markCount.incrementAndGet();
+        }
+    }
+
+    @Test
+    public void maybeLogFlushLag_MustLog()
+    {
+        long syncTimeMillis = 10;
+        SyncRunnable syncRunnable = new FakeCommitLogService(syncTimeMillis).new SyncRunnable(new FreeRunningClock());
+        long pollStarted = 1;
+        long now = pollStarted + (syncTimeMillis * 2);
+        Assert.assertTrue(syncRunnable.maybeLogFlushLag(pollStarted, now));
+        Assert.assertEquals(now - pollStarted, syncRunnable.getTotalSyncDuration());
+    }
+
+    @Test
+    public void maybeLogFlushLag_NoLog()
+    {
+        long syncTimeMillis = 10;
+        SyncRunnable syncRunnable = new FakeCommitLogService(syncTimeMillis).new SyncRunnable(new FreeRunningClock());
+        long pollStarted = 1;
+        long now = pollStarted + (syncTimeMillis - 1);
+        Assert.assertFalse(syncRunnable.maybeLogFlushLag(pollStarted, now));
+        Assert.assertEquals(now - pollStarted, syncRunnable.getTotalSyncDuration());
+    }
+
+    /**
+     * Mostly tests that {@link SyncRunnable#totalSyncDuration} is handled correctly
+     */
+    @Test
+    public void maybeLogFlushLag_MultipleOperations()
+    {
+        long syncTimeMillis = 10;
+        SyncRunnable syncRunnable = new FakeCommitLogService(syncTimeMillis).new SyncRunnable(new FreeRunningClock());
+
+        long pollStarted = 1;
+        long now = pollStarted + (syncTimeMillis - 1);
+
+        int runCount = 12;
+        for (int i = 1; i <= runCount; i++)
+        {
+            Assert.assertFalse(syncRunnable.maybeLogFlushLag(pollStarted, now));
+            Assert.assertEquals(i * (now - pollStarted), syncRunnable.getTotalSyncDuration());
+        }
+
+        now = pollStarted + (syncTimeMillis * 2);
+        Assert.assertTrue(syncRunnable.maybeLogFlushLag(pollStarted, now));
+        Assert.assertEquals(now - pollStarted, syncRunnable.getTotalSyncDuration());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/BatchCommitLogTest.java b/test/unit/org/apache/cassandra/db/commitlog/BatchCommitLogTest.java
new file mode 100644
index 0000000..c7f7e57
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/BatchCommitLogTest.java

@@ -0,0 +1,66 @@
+package org.apache.cassandra.db.commitlog;
+
+import static org.junit.Assert.*;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class BatchCommitLogTest
+{
+    private static final long CL_BATCH_SYNC_WINDOW = 1000; // 1 second
+    private static final String KEYSPACE1 = "CommitLogTest";
+    private static final String STANDARD1 = "Standard1";
+
+    @BeforeClass
+    public static void before()
+    {
+        DatabaseDescriptor.setCommitLogSync(Config.CommitLogSync.batch);
+        DatabaseDescriptor.setCommitLogSyncBatchWindow(CL_BATCH_SYNC_WINDOW);
+
+        KeyspaceParams.DEFAULT_LOCAL_DURABLE_WRITES = false;
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance));
+        CompactionManager.instance.disableAutoCompaction();
+    }
+
+    @Test
+    public void testBatchCLSyncImmediately()
+    {
+        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+        Mutation m = new RowUpdateBuilder(cfs1.metadata, 0, "key")
+                         .clustering("bytes")
+                         .add("val", ByteBuffer.allocate(10 * 1024))
+                         .build();
+
+        long startNano = System.nanoTime();
+        CommitLog.instance.add(m);
+        long delta = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano);
+        assertTrue("Expect batch commitlog sync immediately, but took " + delta, delta < CL_BATCH_SYNC_WINDOW);
+    }
+
+    @Test
+    public void testBatchCLShutDownImmediately() throws InterruptedException
+    {
+        long startNano = System.nanoTime();
+        CommitLog.instance.shutdownBlocking();
+        long delta = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano);
+        assertTrue("Expect batch commitlog shutdown immediately, but took " + delta, delta < CL_BATCH_SYNC_WINDOW);
+        CommitLog.instance.start();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
new file mode 100644
index 0000000..8537ebb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java

@@ -0,0 +1,41 @@
+package org.apache.cassandra.db.commitlog;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+
+public class CommitLogCQLTest extends CQLTester
+{
+    @Test
+    public void testTruncateSegmentDiscard() throws Throwable
+    {
+        String otherTable = createTable("CREATE TABLE %s (idx INT, data TEXT, PRIMARY KEY(idx));");
+
+        createTable("CREATE TABLE %s (idx INT, data TEXT, PRIMARY KEY(idx));");
+        execute("INSERT INTO %s (idx, data) VALUES (?, ?)", 15, Integer.toString(15));
+        flush();
+
+        // We write something in different table to advance the commit log position. Current table remains clean.
+        execute(String.format("INSERT INTO %s.%s (idx, data) VALUES (?, ?)", keyspace(), otherTable), 16, Integer.toString(16));
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        assert cfs.getTracker().getView().getCurrentMemtable().isClean();
+        // Calling switchMemtable directly applies Flush even though memtable is empty. This can happen with some races
+        // (flush with recycling by segment manager). It should still tell commitlog that the memtable's region is clean.
+        // CASSANDRA-12436
+        cfs.switchMemtable();
+
+        execute("INSERT INTO %s (idx, data) VALUES (?, ?)", 15, Integer.toString(17));
+
+        Collection<CommitLogSegment> active = new ArrayList<>(CommitLog.instance.allocator.getActiveSegments());
+        CommitLog.instance.forceRecycleAllSegments();
+
+        // If one of the previous segments remains, it wasn't clean.
+        active.retainAll(CommitLog.instance.allocator.getActiveSegments());
+        assert active.isEmpty();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogChainedMarkersTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogChainedMarkersTest.java
new file mode 100644
index 0000000..b73275b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogChainedMarkersTest.java

@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.commitlog;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.RebufferingInputStream;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+/**
+ * Tests the commitlog to make sure we can replay it - explicitly for the case where we update the chained markers
+ * in the commit log segment but do not flush the file to disk.
+ */
+@RunWith(BMUnitRunner.class)
+public class CommitLogChainedMarkersTest
+{
+    private static final String KEYSPACE1 = "CommitLogTest";
+    private static final String STANDARD1 = "CommitLogChainedMarkersTest";
+
+    @Test
+    @BMRule(name = "force all calls to sync() to not flush to disk",
+    targetClass = "CommitLogSegment",
+    targetMethod = "sync(boolean)",
+    action = "$flush = false")
+    public void replayCommitLogWithoutFlushing() throws IOException
+    {
+        DatabaseDescriptor.setCommitLogSegmentSize(5);
+        DatabaseDescriptor.setCommitLogSync(Config.CommitLogSync.periodic);
+        DatabaseDescriptor.setCommitLogSyncPeriod(10000 * 1000);
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance));
+
+        CompactionManager.instance.disableAutoCompaction();
+
+        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+
+        byte[] entropy = new byte[1024];
+        new Random().nextBytes(entropy);
+        final Mutation m = new RowUpdateBuilder(cfs1.metadata, 0, "k")
+                           .clustering("bytes")
+                           .add("val", ByteBuffer.wrap(entropy))
+                           .build();
+
+        int samples = 10000;
+        for (int i = 0; i < samples; i++)
+            CommitLog.instance.add(m);
+
+        CommitLog.instance.sync(false, true);
+
+        Replayer replayer = new Replayer(cfs1.metadata);
+        File commitLogDir = new File(DatabaseDescriptor.getCommitLogLocation());
+        replayer.recover(commitLogDir.listFiles());
+        Assert.assertEquals(samples, replayer.count);
+    }
+
+    private static class Replayer extends CommitLogReplayer
+    {
+        private final CFMetaData cfm;
+        private int count;
+
+        Replayer(CFMetaData cfm)
+        {
+            super(CommitLog.instance, ReplayPosition.NONE, null, ReplayFilter.create());
+            this.cfm = cfm;
+        }
+
+        @Override
+        void replayMutation(byte[] inputBuffer, int size, final int entryLocation, final CommitLogDescriptor desc)
+        {
+            RebufferingInputStream bufIn = new DataInputBuffer(inputBuffer, 0, size);
+            try
+            {
+                Mutation mutation = Mutation.serializer.deserialize(bufIn,
+                                                           desc.getMessagingVersion(),
+                                                           SerializationHelper.Flag.LOCAL);
+
+                if (cfm == null || mutation.get(cfm) != null)
+                    count++;
+            }
+            catch (IOException e)
+            {
+                // Test fails.
+                throw new AssertionError(e);
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogDescriptorTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogDescriptorTest.java
index 8d63959..898c19f 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogDescriptorTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogDescriptorTest.java

@@ -28,8 +28,7 @@
 
 import org.apache.cassandra.config.ParameterizedClass;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.util.ByteBufferDataInput;
-import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.net.MessagingService;
 
 import static org.junit.Assert.assertEquals;
@@ -59,14 +58,13 @@
     {
         ByteBuffer buf = ByteBuffer.allocate(1024);
         CommitLogDescriptor.writeHeader(buf, desc);
-        long length = buf.position();
         // Put some extra data in the stream.
         buf.putDouble(0.1);
         buf.flip();
-        try (FileDataInput input = new ByteBufferDataInput(buf, "input", 0, 0))
+
+        try (DataInputBuffer input = new DataInputBuffer(buf, false))
         {
             CommitLogDescriptor read = CommitLogDescriptor.readHeader(input);
-            assertEquals("Descriptor length", length, input.getFilePointer());
             assertEquals("Descriptors", desc, read);
         }
     }
@@ -76,20 +74,21 @@
     {
         testDescriptorPersistence(new CommitLogDescriptor(11, null));
         testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_21, 13, null));
-        testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_22, 15, null));
-        testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_22, 17, new ParameterizedClass("LZ4Compressor", null)));
-        testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_22, 19,
+        testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_30, 15, null));
+        testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_30, 17, new ParameterizedClass("LZ4Compressor", null)));
+        testDescriptorPersistence(new CommitLogDescriptor(CommitLogDescriptor.VERSION_30, 19,
                 new ParameterizedClass("StubbyCompressor", ImmutableMap.of("parameter1", "value1", "flag2", "55", "argument3", "null"))));
     }
 
     @Test
     public void testDescriptorInvalidParametersSize() throws IOException
     {
-        Map<String, String> params = new HashMap<>();
-        for (int i=0; i<6000; ++i)
+        final int numberOfParameters = 65535;
+        Map<String, String> params = new HashMap<>(numberOfParameters);
+        for (int i=0; i<numberOfParameters; ++i)
             params.put("key"+i, Integer.toString(i, 16));
         try {
-            CommitLogDescriptor desc = new CommitLogDescriptor(CommitLogDescriptor.VERSION_22,
+            CommitLogDescriptor desc = new CommitLogDescriptor(CommitLogDescriptor.VERSION_30,
                                                                21,
                                                                new ParameterizedClass("LZ4Compressor", params));
             ByteBuffer buf = ByteBuffer.allocate(1024000);

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogFailurePolicyTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogFailurePolicyTest.java
index bde8ca3..79f83fe 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogFailurePolicyTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogFailurePolicyTest.java

@@ -39,6 +39,7 @@
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
+        System.setProperty("cassandra.commitlog.stop_on_errors", "true");
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentBackpressureTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentBackpressureTest.java
new file mode 100644
index 0000000..c615880
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentBackpressureTest.java

@@ -0,0 +1,140 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.commitlog;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+import java.util.concurrent.Semaphore;
+
+import com.google.common.collect.ImmutableMap;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.Config.CommitLogSync;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+/**
+ * Since this test depends on byteman rules being setup during initialization, you shouldn't add tests to this class
+ */
+@RunWith(BMUnitRunner.class)
+public class CommitLogSegmentBackpressureTest
+{
+    //Block commit log service from syncing
+    private static final Semaphore allowSync = new Semaphore(1);
+
+    private static final String KEYSPACE1 = "CommitLogTest";
+    private static final String STANDARD1 = "Standard1";
+    private static final String STANDARD2 = "Standard2";
+
+    private final static byte[] entropy = new byte[1024 * 256];
+
+    @Test
+    @BMRules(rules = {@BMRule(name = "Acquire Semaphore before sync",
+                              targetClass = "AbstractCommitLogService$SyncRunnable",
+                              targetMethod = "sync",
+                              targetLocation = "AT INVOKE org.apache.cassandra.db.commitlog.CommitLog.sync(boolean, boolean)",
+                              action = "org.apache.cassandra.db.commitlog.CommitLogSegmentBackpressureTest.allowSync.acquire()"),
+                      @BMRule(name = "Release Semaphore after sync",
+                              targetClass = "AbstractCommitLogService$SyncRunnable",
+                              targetMethod = "sync",
+                              targetLocation = "AFTER INVOKE org.apache.cassandra.db.commitlog.CommitLog.sync(boolean, boolean)",
+                              action = "org.apache.cassandra.db.commitlog.CommitLogSegmentBackpressureTest.allowSync.release()")})
+    public void testCompressedCommitLogBackpressure() throws Throwable
+    {
+        // Perform all initialization before making CommitLog.Sync blocking
+        // Doing the initialization within the method guarantee that Byteman has performed its injections when we start
+        new Random().nextBytes(entropy);
+        DatabaseDescriptor.setCommitLogCompression(new ParameterizedClass("LZ4Compressor", ImmutableMap.of()));
+        DatabaseDescriptor.setCommitLogSegmentSize(1);
+        DatabaseDescriptor.setCommitLogSync(CommitLogSync.periodic);
+        DatabaseDescriptor.setCommitLogSyncPeriod(10 * 1000);
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance));
+
+        CompactionManager.instance.disableAutoCompaction();
+
+        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+
+        final Mutation m = new RowUpdateBuilder(cfs1.metadata, 0, "k").clustering("bytes")
+                                                                      .add("val", ByteBuffer.wrap(entropy))
+                                                                      .build();
+
+        Thread dummyThread = new Thread(() -> {
+            for (int i = 0; i < 20; i++)
+                CommitLog.instance.add(m);
+        });
+
+        try
+        {
+            // Makes sure any call to CommitLog.sync is blocking
+            allowSync.acquire();
+
+            dummyThread.start();
+
+            CommitLogSegmentManager clsm = CommitLog.instance.allocator;
+
+            Util.spinAssertEquals(3, () -> clsm.getActiveSegments().size(), 5);
+
+            Thread.sleep(1000);
+
+            // Should only be able to create 3 segments (not 7) because it blocks waiting for truncation that never
+            // comes.
+            Assert.assertEquals(3, clsm.getActiveSegments().size());
+
+            clsm.getActiveSegments().forEach(segment -> clsm.recycleSegment(segment));
+
+            Util.spinAssertEquals(3, () -> clsm.getActiveSegments().size(), 5);
+        }
+        finally
+        {
+            // Allow the CommitLog.sync to perform normally.
+            allowSync.release();
+        }
+        try
+        {
+            // Wait for the dummy thread to die
+            dummyThread.join();
+        }
+        catch (InterruptedException e)
+        {
+            Thread.currentThread().interrupt();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
index c883cbd..479a090 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java

@@ -16,7 +16,6 @@
 * specific language governing permissions and limitations
 * under the License.
 */
-
 package org.apache.cassandra.db.commitlog;
 
 import java.io.*;
@@ -25,10 +24,13 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.UUID;
+import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
+import java.util.function.BiConsumer;
 import java.util.zip.CRC32;
 import java.util.zip.Checksum;
 
+import com.google.common.io.Files;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
@@ -41,35 +43,38 @@
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.ParameterizedClass;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.config.Config.DiskFailurePolicy;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.commitlog.CommitLogReplayer.CommitLogReplayException;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.composites.CellNameType;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.filter.NamesQueryFilter;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.DeflateCompressor;
 import org.apache.cassandra.io.compress.LZ4Compressor;
 import org.apache.cassandra.io.compress.SnappyCompressor;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.KillerForTests;
+import org.apache.cassandra.utils.vint.VIntCoding;
 
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 @RunWith(Parameterized.class)
 public class CommitLogTest
 {
     private static final String KEYSPACE1 = "CommitLogTest";
     private static final String KEYSPACE2 = "CommitLogTestNonDurable";
-    private static final String CF1 = "Standard1";
-    private static final String CF2 = "Standard2";
-    private static final String CF3 = "Custom1";
+    private static final String STANDARD1 = "Standard1";
+    private static final String STANDARD2 = "Standard2";
+    private static final String CUSTOM1 = "Custom1";
 
     public CommitLogTest(ParameterizedClass commitLogCompression)
     {
@@ -87,47 +92,110 @@
     {
         return Arrays.asList(new Object[][] {
                 { null }, // No compression
-                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.<String, String>emptyMap()) },
-                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.<String, String>emptyMap()) } });
+                { new ParameterizedClass(LZ4Compressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(SnappyCompressor.class.getName(), Collections.emptyMap()) },
+                { new ParameterizedClass(DeflateCompressor.class.getName(), Collections.emptyMap()) } });
     }
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        // Disable durable writes for system keyspaces to prevent system mutations, e.g. sstable_activity,
+        // to end up in CL segments and cause unexpected results in this test wrt counting CL segments,
+        // see CASSANDRA-12854
+        KeyspaceParams.DEFAULT_LOCAL_DURABLE_WRITES = false;
+
         SchemaLoader.prepareServer();
+
         CFMetaData custom = CFMetaData.compile(String.format("CREATE TABLE \"%s\" (" +
                                                              "k int," +
                                                              "c1 frozen<map<text, text>>," +
                                                              "c2 frozen<set<text>>," +
                                                              "s int static," +
                                                              "PRIMARY KEY (k, c1, c2)" +
-                                                             ");", CF3),KEYSPACE1);
+                                                             ");", CUSTOM1), KEYSPACE1);
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF2),
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance),
                                     custom);
         SchemaLoader.createKeyspace(KEYSPACE2,
-                                    false,
-                                    true,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF2));
-
+                                    KeyspaceParams.simpleTransient(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance));
         CompactionManager.instance.disableAutoCompaction();
     }
 
     @Test
     public void testRecoveryWithEmptyLog() throws Exception
     {
-        runExpecting(new WrappedRunnable() {
-            public void runMayThrow() throws Exception
-            {
-                CommitLog.instance.recover(new File[]{ tmpFile(CommitLogDescriptor.current_version) });
-            }
+        // The first empty file we expect to throw as it's invalid
+        // We need to pass the second as well, because allowTruncation will be set to true for the final segment
+        runExpecting(() -> {
+            CommitLog.instance.recover(new File[]{
+                    tmpFile(CommitLogDescriptor.current_version),
+                    tmpFile(CommitLogDescriptor.current_version)  });
+            return null;
+        }, CommitLogReplayException.class);
+    }
+
+    @Test
+    public void testRecoveryWithEmptyFinalLog() throws Exception
+    {
+        // Even though it's empty, it's the last commitlog segment, so allowTruncation=true should allow it to pass
+        CommitLog.instance.recover(new File[]{ tmpFile(CommitLogDescriptor.current_version)  });
+    }
+
+    /**
+     * Since commit log segments can be allocated before they're needed, the commit log file with the highest
+     * id isn't neccesarily the last log that we wrote to. We should remove header only logs on recover so we
+     * can tolerate truncated logs
+     */
+    @Test
+    public void testHeaderOnlyFileFiltering() throws Exception
+    {
+        File directory = Files.createTempDir();
+
+        CommitLogDescriptor desc1 = new CommitLogDescriptor(CommitLogDescriptor.current_version, 1, null);
+        CommitLogDescriptor desc2 = new CommitLogDescriptor(CommitLogDescriptor.current_version, 2, null);
+
+        ByteBuffer buffer;
+
+        // this has a header and malformed data
+        File file1 = new File(directory, desc1.fileName());
+        buffer = ByteBuffer.allocate(1024);
+        CommitLogDescriptor.writeHeader(buffer, desc1);
+        int pos = buffer.position();
+        CommitLogSegment.writeSyncMarker(desc1.id, buffer, buffer.position(), buffer.position(), buffer.position() + 128);
+        buffer.position(pos + 8);
+        buffer.putInt(5);
+        buffer.putInt(6);
+
+        try (OutputStream lout = new FileOutputStream(file1))
+        {
+            lout.write(buffer.array());
+        }
+
+        // this has only a header
+        File file2 = new File(directory, desc2.fileName());
+        buffer = ByteBuffer.allocate(1024);
+        CommitLogDescriptor.writeHeader(buffer, desc2);
+        try (OutputStream lout = new FileOutputStream(file2))
+        {
+            lout.write(buffer.array());
+        }
+
+        // one corrupt file and one header only file should be ok
+        runExpecting(() -> {
+            CommitLog.instance.recover(file1, file2);
+            return null;
+        }, null);
+
+        // 2 corrupt files and one header only file should fail
+        runExpecting(() -> {
+            CommitLog.instance.recover(file1, file1, file2);
+            return null;
         }, CommitLogReplayException.class);
     }
 
@@ -164,12 +232,11 @@
     @Test
     public void testRecoveryWithShortSize() throws Exception
     {
-        runExpecting(new WrappedRunnable()  {
-            public void runMayThrow() throws Exception {
-                byte[] data = new byte[5];
-                data[3] = 1; // Not a legacy marker, give it a fake (short) size
-                testRecovery(data, CommitLogDescriptor.VERSION_20);
-            }
+        byte[] data = new byte[5];
+        data[3] = 1; // Not a legacy marker, give it a fake (short) size
+        runExpecting(() -> {
+            testRecovery(data, CommitLogDescriptor.VERSION_20);
+            return null;
         }, CommitLogReplayException.class);
     }
 
@@ -197,11 +264,9 @@
     @Test
     public void testRecoveryWithGarbageLog_fail() throws Exception
     {
-        runExpecting(new WrappedRunnable() {
-            public void runMayThrow() throws Exception
-            {
-                testRecoveryWithGarbageLog();
-            }
+        runExpecting(() -> {
+            testRecoveryWithGarbageLog();
+            return null;
         }, CommitLogReplayException.class);
     }
 
@@ -234,26 +299,33 @@
     @Test
     public void testDontDeleteIfDirty() throws Exception
     {
+        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+        ColumnFamilyStore cfs2 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD2);
+
         // Roughly 32 MB mutation
-        Mutation rm = new Mutation(KEYSPACE1, bytes("k"));
-        rm.add(CF1, Util.cellname("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize()/4), 0);
+        Mutation m = new RowUpdateBuilder(cfs1.metadata, 0, "k")
+                     .clustering("bytes")
+                     .add("val", ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize()/4))
+                     .build();
 
         // Adding it 5 times
-        CommitLog.instance.add(rm);
-        CommitLog.instance.add(rm);
-        CommitLog.instance.add(rm);
-        CommitLog.instance.add(rm);
-        CommitLog.instance.add(rm);
+        CommitLog.instance.add(m);
+        CommitLog.instance.add(m);
+        CommitLog.instance.add(m);
+        CommitLog.instance.add(m);
+        CommitLog.instance.add(m);
 
         // Adding new mutation on another CF
-        Mutation rm2 = new Mutation(KEYSPACE1, bytes("k"));
-        rm2.add(CF2, Util.cellname("c1"), ByteBuffer.allocate(4), 0);
-        CommitLog.instance.add(rm2);
+        Mutation m2 = new RowUpdateBuilder(cfs2.metadata, 0, "k")
+                      .clustering("bytes")
+                      .add("val", ByteBuffer.allocate(4))
+                      .build();
+        CommitLog.instance.add(m2);
 
         assert CommitLog.instance.activeSegments() == 2 : "Expecting 2 segments, got " + CommitLog.instance.activeSegments();
 
-        UUID cfid2 = rm2.getColumnFamilyIds().iterator().next();
-        CommitLog.instance.discardCompletedSegments(cfid2, CommitLog.instance.getContext());
+        UUID cfid2 = m2.getColumnFamilyIds().iterator().next();
+        CommitLog.instance.discardCompletedSegments(cfid2, ReplayPosition.NONE, CommitLog.instance.getContext());
 
         // Assert we still have both our segment
         assert CommitLog.instance.activeSegments() == 2 : "Expecting 2 segments, got " + CommitLog.instance.activeSegments();
@@ -262,10 +334,14 @@
     @Test
     public void testDeleteIfNotDirty() throws Exception
     {
-        DatabaseDescriptor.getCommitLogSegmentSize();
+        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+        ColumnFamilyStore cfs2 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD2);
+
         // Roughly 32 MB mutation
-        Mutation rm = new Mutation(KEYSPACE1, bytes("k"));
-        rm.add(CF1, Util.cellname("c1"), ByteBuffer.allocate((DatabaseDescriptor.getCommitLogSegmentSize()/4) - 1), 0);
+        Mutation rm = new RowUpdateBuilder(cfs1.metadata, 0, "k")
+                      .clustering("bytes")
+                      .add("val", ByteBuffer.allocate((DatabaseDescriptor.getCommitLogSegmentSize()/4) - 1))
+                      .build();
 
         // Adding it twice (won't change segment)
         CommitLog.instance.add(rm);
@@ -275,14 +351,16 @@
 
         // "Flush": this won't delete anything
         UUID cfid1 = rm.getColumnFamilyIds().iterator().next();
-        CommitLog.instance.sync(true);
-        CommitLog.instance.discardCompletedSegments(cfid1, CommitLog.instance.getContext());
+        CommitLog.instance.sync(true, true);
+        CommitLog.instance.discardCompletedSegments(cfid1, ReplayPosition.NONE, CommitLog.instance.getContext());
 
         assert CommitLog.instance.activeSegments() == 1 : "Expecting 1 segment, got " + CommitLog.instance.activeSegments();
 
         // Adding new mutation on another CF, large enough (including CL entry overhead) that a new segment is created
-        Mutation rm2 = new Mutation(KEYSPACE1, bytes("k"));
-        rm2.add(CF2, Util.cellname("c1"), ByteBuffer.allocate((DatabaseDescriptor.getCommitLogSegmentSize()/2) - 200), 0);
+        Mutation rm2 = new RowUpdateBuilder(cfs2.metadata, 0, "k")
+                       .clustering("bytes")
+                       .add("val", ByteBuffer.allocate(DatabaseDescriptor.getMaxMutationSize() - 200))
+                       .build();
         CommitLog.instance.add(rm2);
         // also forces a new segment, since each entry-with-overhead is just under half the CL size
         CommitLog.instance.add(rm2);
@@ -295,43 +373,65 @@
         // didn't write anything on cf1 since last flush (and we flush cf2)
 
         UUID cfid2 = rm2.getColumnFamilyIds().iterator().next();
-        CommitLog.instance.discardCompletedSegments(cfid2, CommitLog.instance.getContext());
+        CommitLog.instance.discardCompletedSegments(cfid2, ReplayPosition.NONE, CommitLog.instance.getContext());
 
         // Assert we still have both our segment
         assert CommitLog.instance.activeSegments() == 1 : "Expecting 1 segment, got " + CommitLog.instance.activeSegments();
     }
 
-    private static int getMaxRecordDataSize(String keyspace, ByteBuffer key, String table, CellName column)
+    private static int getMaxRecordDataSize(String keyspace, ByteBuffer key, String cfName, String colName)
     {
-        Mutation rm = new Mutation(KEYSPACE1, bytes("k"));
-        rm.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(0), 0);
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(cfName);
+        // We don't want to allocate a size of 0 as this is optimized under the hood and our computation would
+        // break testEqualRecordLimit
+        int allocSize = 1;
+        Mutation rm = new RowUpdateBuilder(cfs.metadata, 0, key)
+                      .clustering(colName)
+                      .add("val", ByteBuffer.allocate(allocSize)).build();
 
-        int max = (DatabaseDescriptor.getCommitLogSegmentSize() / 2);
+        int max = DatabaseDescriptor.getMaxMutationSize();
         max -= CommitLogSegment.ENTRY_OVERHEAD_SIZE; // log entry overhead
-        return max - (int) Mutation.serializer.serializedSize(rm, MessagingService.current_version);
+
+        // Note that the size of the value if vint encoded. So we first compute the ovehead of the mutation without the value and it's size
+        int mutationOverhead = (int)Mutation.serializer.serializedSize(rm, MessagingService.current_version) - (VIntCoding.computeVIntSize(allocSize) + allocSize);
+        max -= mutationOverhead;
+
+        // Now, max is the max for both the value and it's size. But we want to know how much we can allocate, i.e. the size of the value.
+        int sizeOfMax = VIntCoding.computeVIntSize(max);
+        max -= sizeOfMax;
+        assert VIntCoding.computeVIntSize(max) == sizeOfMax; // sanity check that we're still encoded with the size we though we would
+        return max;
     }
 
     private static int getMaxRecordDataSize()
     {
-        return getMaxRecordDataSize(KEYSPACE1, bytes("k"), CF1, Util.cellname("c1"));
+        return getMaxRecordDataSize(KEYSPACE1, bytes("k"), STANDARD1, "bytes");
     }
 
     // CASSANDRA-3615
     @Test
     public void testEqualRecordLimit() throws Exception
     {
-        Mutation rm = new Mutation(KEYSPACE1, bytes("k"));
-        rm.add(CF1, Util.cellname("c1"), ByteBuffer.allocate(getMaxRecordDataSize()), 0);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+        Mutation rm = new RowUpdateBuilder(cfs.metadata, 0, "k")
+                      .clustering("bytes")
+                      .add("val", ByteBuffer.allocate(getMaxRecordDataSize()))
+                      .build();
+
         CommitLog.instance.add(rm);
     }
 
     @Test
     public void testExceedRecordLimit() throws Exception
     {
+        CommitLog.instance.resetUnsafe(true);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
         try
         {
-            Mutation rm = new Mutation(KEYSPACE1, bytes("k"));
-            rm.add(CF1, Util.cellname("c1"), ByteBuffer.allocate(1 + getMaxRecordDataSize()), 0);
+            Mutation rm = new RowUpdateBuilder(cfs.metadata, 0, "k")
+                          .clustering("bytes")
+                          .add("val", ByteBuffer.allocate(1 + getMaxRecordDataSize()))
+                          .build();
             CommitLog.instance.add(rm);
             throw new AssertionError("mutation larger than limit was accepted");
         }
@@ -367,7 +467,7 @@
         return logFile;
     }
 
-    protected void testRecovery(byte[] logData, int version) throws Exception
+    protected Void testRecovery(byte[] logData, int version) throws Exception
     {
         File logFile = tmpFile(version);
         try (OutputStream lout = new FileOutputStream(logFile))
@@ -376,9 +476,10 @@
             //statics make it annoying to test things correctly
             CommitLog.instance.recover(logFile.getPath()); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure*/
         }
+        return null;
     }
 
-    protected void testRecovery(CommitLogDescriptor desc, byte[] logData) throws Exception
+    protected Void testRecovery(CommitLogDescriptor desc, byte[] logData) throws Exception
     {
         File logFile = tmpFile(desc.version);
         CommitLogDescriptor fromFile = CommitLogDescriptor.fromFileName(logFile.getName());
@@ -393,29 +494,38 @@
             //statics make it annoying to test things correctly
             CommitLog.instance.recover(logFile.getPath()); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure*/
         }
+        return null;
     }
 
     @Test
     public void testRecoveryWithIdMismatch() throws Exception
     {
         CommitLogDescriptor desc = new CommitLogDescriptor(4, null);
-        final File logFile = tmpFile(desc.version);
+        File logFile = tmpFile(desc.version);
         ByteBuffer buf = ByteBuffer.allocate(1024);
         CommitLogDescriptor.writeHeader(buf, desc);
         try (OutputStream lout = new FileOutputStream(logFile))
         {
             lout.write(buf.array(), 0, buf.position());
 
-            runExpecting(new WrappedRunnable() {
-                public void runMayThrow() throws Exception
-                {
-                    CommitLog.instance.recover(logFile.getPath()); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure*/
-                }
+            runExpecting(() -> {
+                CommitLog.instance.recover(logFile.getPath()); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure*/
+                return null;
             }, CommitLogReplayException.class);
         }
     }
 
-    protected void runExpecting(Runnable r, Class<?> expected)
+    @Test
+    public void testRecoveryWithBadCompressor() throws Exception
+    {
+        CommitLogDescriptor desc = new CommitLogDescriptor(4, new ParameterizedClass("UnknownCompressor", null));
+        runExpecting(() -> {
+            testRecovery(desc, new byte[0]);
+            return null;
+        }, CommitLogReplayException.class);
+    }
+
+    protected void runExpecting(Callable<Void> r, Class<?> expected)
     {
         JVMStabilityInspector.Killer originalKiller;
         KillerForTests killerForTests;
@@ -426,107 +536,240 @@
         Throwable caught = null;
         try
         {
-            r.run();
+            r.call();
         }
-        catch (RuntimeException e)
+        catch (Throwable t)
         {
-            if (expected != e.getCause().getClass())
-                throw new AssertionError("Expected exception " + expected + ", got " + e, e);
-            caught = e;
+            if (expected != t.getClass())
+                throw new AssertionError("Expected exception " + expected + ", got " + t, t);
+            caught = t;
         }
         if (expected != null && caught == null)
             Assert.fail("Expected exception " + expected + " but call completed successfully.");
 
         JVMStabilityInspector.replaceKiller(originalKiller);
-        Assert.assertEquals("JVM killed", expected != null, killerForTests.wasKilled());
+        assertEquals("JVM killed", expected != null, killerForTests.wasKilled());
     }
 
     protected void testRecovery(final byte[] logData, Class<?> expected) throws Exception
     {
-        runExpecting(new WrappedRunnable() {
-            public void runMayThrow() throws Exception
-            {
-                testRecovery(logData, CommitLogDescriptor.VERSION_20);
-            }
-        }, expected);
-        runExpecting(new WrappedRunnable() {
-            public void runMayThrow() throws Exception
-            {
-                testRecovery(new CommitLogDescriptor(4, null), logData);
-            }
-        }, expected);
+        runExpecting(() -> testRecovery(logData, CommitLogDescriptor.VERSION_20), expected);
+        runExpecting(() -> testRecovery(new CommitLogDescriptor(4, null), logData), expected);
     }
 
     @Test
     public void testTruncateWithoutSnapshot() throws ExecutionException, InterruptedException, IOException
     {
-        boolean prev = DatabaseDescriptor.isAutoSnapshot();
-        DatabaseDescriptor.setAutoSnapshot(false);
-        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1");
-        ColumnFamilyStore cfs2 = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard2");
+        boolean originalState = DatabaseDescriptor.isAutoSnapshot();
+        try
+        {
+            CommitLog.instance.resetUnsafe(true);
+            boolean prev = DatabaseDescriptor.isAutoSnapshot();
+            DatabaseDescriptor.setAutoSnapshot(false);
+            ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+            ColumnFamilyStore cfs2 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD2);
 
-        final Mutation rm1 = new Mutation(KEYSPACE1, bytes("k"));
-        rm1.add("Standard1", Util.cellname("c1"), ByteBuffer.allocate(100), 0);
-        rm1.apply();
-        cfs1.truncateBlocking();
-        DatabaseDescriptor.setAutoSnapshot(prev);
-        final Mutation rm2 = new Mutation(KEYSPACE1, bytes("k"));
-        rm2.add("Standard2", Util.cellname("c1"), ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize() / 4), 0);
+            new RowUpdateBuilder(cfs1.metadata, 0, "k").clustering("bytes").add("val", ByteBuffer.allocate(100)).build().applyUnsafe();
+            cfs1.truncateBlocking();
+            DatabaseDescriptor.setAutoSnapshot(prev);
+            Mutation m2 = new RowUpdateBuilder(cfs2.metadata, 0, "k")
+                          .clustering("bytes")
+                          .add("val", ByteBuffer.allocate(DatabaseDescriptor.getCommitLogSegmentSize() / 4))
+                          .build();
 
-        for (int i = 0 ; i < 5 ; i++)
-            CommitLog.instance.add(rm2);
+            for (int i = 0 ; i < 5 ; i++)
+                CommitLog.instance.add(m2);
 
-        Assert.assertEquals(2, CommitLog.instance.activeSegments());
-        ReplayPosition position = CommitLog.instance.getContext();
-        for (Keyspace ks : Keyspace.system())
-            for (ColumnFamilyStore syscfs : ks.getColumnFamilyStores())
-                CommitLog.instance.discardCompletedSegments(syscfs.metadata.cfId, position);
-        CommitLog.instance.discardCompletedSegments(cfs2.metadata.cfId, position);
-        Assert.assertEquals(1, CommitLog.instance.activeSegments());
+            assertEquals(2, CommitLog.instance.activeSegments());
+            ReplayPosition position = CommitLog.instance.getContext();
+            for (Keyspace ks : Keyspace.system())
+                for (ColumnFamilyStore syscfs : ks.getColumnFamilyStores())
+                    CommitLog.instance.discardCompletedSegments(syscfs.metadata.cfId, ReplayPosition.NONE, position);
+            CommitLog.instance.discardCompletedSegments(cfs2.metadata.cfId, ReplayPosition.NONE, position);
+            assertEquals(1, CommitLog.instance.activeSegments());
+        }
+        finally
+        {
+            DatabaseDescriptor.setAutoSnapshot(originalState);
+        }
     }
 
     @Test
     public void testTruncateWithoutSnapshotNonDurable() throws IOException
     {
-        boolean prevAutoSnapshot = DatabaseDescriptor.isAutoSnapshot();
-        DatabaseDescriptor.setAutoSnapshot(false);
-        Keyspace notDurableKs = Keyspace.open(KEYSPACE2);
-        Assert.assertFalse(notDurableKs.getMetadata().durableWrites);
-        ColumnFamilyStore cfs = notDurableKs.getColumnFamilyStore("Standard1");
-        CellNameType type = notDurableKs.getColumnFamilyStore("Standard1").getComparator();
-        Mutation rm;
-        DecoratedKey dk = Util.dk("key1");
+        boolean originalState = DatabaseDescriptor.getAutoSnapshot();
+        try
+        {
+            DatabaseDescriptor.setAutoSnapshot(false);
+            Keyspace notDurableKs = Keyspace.open(KEYSPACE2);
+            Assert.assertFalse(notDurableKs.getMetadata().params.durableWrites);
 
-        // add data
-        rm = new Mutation(KEYSPACE2, dk.getKey());
-        rm.add("Standard1", Util.cellname("Column1"), ByteBufferUtil.bytes("abcd"), 0);
-        rm.apply();
+            ColumnFamilyStore cfs = notDurableKs.getColumnFamilyStore("Standard1");
+            new RowUpdateBuilder(cfs.metadata, 0, "key1")
+                .clustering("bytes").add("val", ByteBufferUtil.bytes("abcd"))
+                .build()
+                .applyUnsafe();
 
-        ReadCommand command = new SliceByNamesReadCommand(KEYSPACE2, dk.getKey(), "Standard1", System.currentTimeMillis(), new NamesQueryFilter(FBUtilities.singleton(Util.cellname("Column1"), type)));
-        Row row = command.getRow(notDurableKs);
-        Cell col = row.cf.getColumn(Util.cellname("Column1"));
-        Assert.assertEquals(col.value(), ByteBuffer.wrap("abcd".getBytes()));
-        cfs.truncateBlocking();
-        DatabaseDescriptor.setAutoSnapshot(prevAutoSnapshot);
-        row = command.getRow(notDurableKs);
-        Assert.assertEquals(null, row.cf);
+            assertTrue(Util.getOnlyRow(Util.cmd(cfs).columns("val").build())
+                            .cells().iterator().next().value().equals(ByteBufferUtil.bytes("abcd")));
+
+            cfs.truncateBlocking();
+
+            Util.assertEmpty(Util.cmd(cfs).columns("val").build());
+        }
+        finally
+        {
+            DatabaseDescriptor.setAutoSnapshot(originalState);
+        }
+    }
+
+    @Test
+    public void testUnwriteableFlushRecovery() throws ExecutionException, InterruptedException, IOException
+    {
+        CommitLog.instance.resetUnsafe(true);
+
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+
+        DiskFailurePolicy oldPolicy = DatabaseDescriptor.getDiskFailurePolicy();
+        try
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(DiskFailurePolicy.ignore);
+
+            for (int i = 0 ; i < 5 ; i++)
+            {
+                new RowUpdateBuilder(cfs.metadata, 0, "k")
+                    .clustering("c" + i).add("val", ByteBuffer.allocate(100))
+                    .build()
+                    .apply();
+
+                if (i == 2)
+                {
+                    try (Closeable c = Util.markDirectoriesUnwriteable(cfs))
+                    {
+                        cfs.forceBlockingFlush();
+                    }
+                    catch (Throwable t)
+                    {
+                        // expected. Cause (after some wrappings) should be a write error
+                        while (!(t instanceof FSWriteError))
+                            t = t.getCause();
+                    }
+                }
+                else
+                    cfs.forceBlockingFlush();
+            }
+        }
+        finally
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(oldPolicy);
+        }
+
+        CommitLog.instance.sync(true, true);
+        System.setProperty("cassandra.replayList", KEYSPACE1 + "." + STANDARD1);
+        // Currently we don't attempt to re-flush a memtable that failed, thus make sure data is replayed by commitlog.
+        // If retries work subsequent flushes should clear up error and this should change to expect 0.
+        Assert.assertEquals(1, CommitLog.instance.resetUnsafe(false));
+        System.clearProperty("cassandra.replayList");
+    }
+
+    public void testOutOfOrderFlushRecovery(BiConsumer<ColumnFamilyStore, Memtable> flushAction, boolean performCompaction)
+            throws ExecutionException, InterruptedException, IOException
+    {
+        CommitLog.instance.resetUnsafe(true);
+
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+
+        for (int i = 0 ; i < 5 ; i++)
+        {
+            new RowUpdateBuilder(cfs.metadata, 0, "k")
+                .clustering("c" + i).add("val", ByteBuffer.allocate(100))
+                .build()
+                .apply();
+
+            Memtable current = cfs.getTracker().getView().getCurrentMemtable();
+            if (i == 2)
+                current.makeUnflushable();
+
+            flushAction.accept(cfs, current);
+        }
+        if (performCompaction)
+            cfs.forceMajorCompaction();
+        // Make sure metadata saves and reads fine
+        for (SSTableReader reader : cfs.getLiveSSTables())
+            reader.reloadSSTableMetadata();
+
+        CommitLog.instance.sync(true, true);
+        System.setProperty("cassandra.replayList", KEYSPACE1 + "." + STANDARD1);
+        // In the absence of error, this should be 0 because forceBlockingFlush/forceRecycleAllSegments would have
+        // persisted all data in the commit log. Because we know there was an error, there must be something left to
+        // replay.
+        Assert.assertEquals(1, CommitLog.instance.resetUnsafe(false));
+        System.clearProperty("cassandra.replayList");
+    }
+
+    BiConsumer<ColumnFamilyStore, Memtable> flush = (cfs, current) ->
+    {
+        try
+        {
+            cfs.forceBlockingFlush();
+        }
+        catch (Throwable t)
+        {
+            // expected after makeUnflushable. Cause (after some wrappings) should be a write error
+            while (!(t instanceof FSWriteError))
+                t = t.getCause();
+            // Wait for started flushes to complete.
+            cfs.switchMemtableIfCurrent(current);
+        }
+    };
+
+    BiConsumer<ColumnFamilyStore, Memtable> recycleSegments = (cfs, current) ->
+    {
+        // Move to new commit log segment and try to flush all data. Also delete segments that no longer contain
+        // flushed data.
+        // This does not stop on errors and should retain segments for which flushing failed.
+        CommitLog.instance.forceRecycleAllSegments();
+
+        // Wait for started flushes to complete.
+        cfs.switchMemtableIfCurrent(current);
+    };
+
+    @Test
+    public void testOutOfOrderFlushRecovery() throws ExecutionException, InterruptedException, IOException
+    {
+        testOutOfOrderFlushRecovery(flush, false);
+    }
+
+    @Test
+    public void testOutOfOrderLogDiscard() throws ExecutionException, InterruptedException, IOException
+    {
+        testOutOfOrderFlushRecovery(recycleSegments, false);
+    }
+
+    @Test
+    public void testOutOfOrderFlushRecoveryWithCompaction() throws ExecutionException, InterruptedException, IOException
+    {
+        testOutOfOrderFlushRecovery(flush, true);
+    }
+
+    @Test
+    public void testOutOfOrderLogDiscardWithCompaction() throws ExecutionException, InterruptedException, IOException
+    {
+        testOutOfOrderFlushRecovery(recycleSegments, true);
     }
 
     @Test
     public void testRecoveryWithCollectionClusteringKeysStatic() throws Exception
     {
-        Mutation rm = new Mutation(KEYSPACE1, bytes(0));
 
-        CFMetaData cfm = Schema.instance.getCFMetaData(KEYSPACE1,CF3);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CUSTOM1);
+        RowUpdateBuilder rb = new RowUpdateBuilder(cfs.metadata, 0, 1);
 
-        int clusterSize = cfm.comparator.clusteringPrefixSize();
-        ByteBuffer[] elements = new ByteBuffer[clusterSize];
-        for (int i = 0; i < clusterSize; i++)
-            elements[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        rb.add("s", 2);
 
-        rm.add(CF3, CellNames.compositeSparse(elements, new ColumnIdentifier("s", true), true), bytes(1), 0);
-
+        Mutation rm = rb.build();
         CommitLog.instance.add(rm);
+
         int replayed = 0;
 
         try
@@ -543,3 +786,4 @@
 
     }
 }
+

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTestReplayer.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTestReplayer.java
index 0c46061..36973f2 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTestReplayer.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTestReplayer.java

@@ -18,27 +18,27 @@
 */
 package org.apache.cassandra.db.commitlog;
 
-import java.io.DataInputStream;
 import java.io.File;
 import java.io.IOException;
 
 import com.google.common.base.Predicate;
 
 import org.junit.Assert;
-
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.ColumnSerializer;
 import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.io.util.FastByteArrayInputStream;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.NIODataInputStream;
+import org.apache.cassandra.io.util.RebufferingInputStream;
 
 /**
  * Utility class for tests needing to examine the commitlog contents.
  */
 public class CommitLogTestReplayer extends CommitLogReplayer
 {
-    static public void examineCommitLog(Predicate<Mutation> processor) throws IOException
+    public static void examineCommitLog(Predicate<Mutation> processor) throws IOException
     {
-        CommitLog.instance.sync(true);
+        CommitLog.instance.sync(true, true);
 
         CommitLogTestReplayer replayer = new CommitLogTestReplayer(CommitLog.instance, processor);
         File commitLogDir = new File(DatabaseDescriptor.getCommitLogLocation());
@@ -61,13 +61,13 @@
     @Override
     void replayMutation(byte[] inputBuffer, int size, final int entryLocation, final CommitLogDescriptor desc)
     {
-        FastByteArrayInputStream bufIn = new FastByteArrayInputStream(inputBuffer, 0, size);
+        RebufferingInputStream bufIn = new DataInputBuffer(inputBuffer, 0, size);
         Mutation mutation;
         try
         {
-            mutation = Mutation.serializer.deserialize(new DataInputStream(bufIn),
+            mutation = Mutation.serializer.deserialize(bufIn,
                                                            desc.getMessagingVersion(),
-                                                           ColumnSerializer.Flag.LOCAL);
+                                                           SerializationHelper.Flag.LOCAL);
             Assert.assertTrue(processor.apply(mutation));
         }
         catch (IOException e)

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTest.java
index 9de2628..00a143b 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTest.java

@@ -23,7 +23,6 @@
 
 import java.io.*;
 import java.nio.ByteBuffer;
-import java.nio.charset.StandardCharsets;
 import java.util.Properties;
 import java.util.UUID;
 
@@ -39,12 +38,16 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
 import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.db.commitlog.CommitLogReplayer.CommitLogReplayException;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.KillerForTests;
+import org.apache.cassandra.db.commitlog.CommitLogReplayer.CommitLogReplayException;
 
 public class CommitLogUpgradeTest
 {
@@ -89,6 +92,24 @@
     }
 
     @Test
+
+    public void test22() throws Exception
+    {
+        testRestore(DATA_DIR + "2.2");
+    }
+
+    @Test
+    public void test22_LZ4() throws Exception
+    {
+        testRestore(DATA_DIR + "2.2-lz4");
+    }
+
+    @Test
+    public void test22_Snappy() throws Exception
+    {
+        testRestore(DATA_DIR + "2.2-snappy");
+    }
+
     public void test22_truncated() throws Exception
     {
         testRestore(DATA_DIR + "2.2-lz4-truncated");
@@ -133,8 +154,16 @@
     @BeforeClass
     static public void initialize() throws FileNotFoundException, IOException, InterruptedException
     {
+        CFMetaData metadata = CFMetaData.Builder.createDense(KEYSPACE, TABLE, false, false)
+                                                .addPartitionKey("key", AsciiType.instance)
+                                                .addClusteringColumn("col", AsciiType.instance)
+                                                .addRegularColumn("val", BytesType.instance)
+                                                .build()
+                                                .compression(SchemaLoader.getCompressionParameters());
         SchemaLoader.loadSchema();
-        SchemaLoader.schemaDefinition("");
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    metadata);
     }
 
     public void testRestore(String location) throws IOException, InterruptedException
@@ -151,21 +180,14 @@
             if (Schema.instance.getCF(cfid) == null)
             {
                 CFMetaData cfm = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
-                Schema.instance.purge(cfm);
+                Schema.instance.unload(cfm);
                 Schema.instance.load(cfm.copy(cfid));
             }
         }
 
         Hasher hasher = new Hasher();
         CommitLogTestReplayer replayer = new CommitLogTestReplayer(CommitLog.instance, hasher);
-        File[] files = new File(location).listFiles(new FilenameFilter()
-        {
-            @Override
-            public boolean accept(File dir, String name)
-            {
-                return name.endsWith(".log");
-            }
-        });
+        File[] files = new File(location).listFiles((file, name) -> name.endsWith(".log"));
         replayer.recover(files);
 
         Assert.assertEquals(cells, hasher.cells);
@@ -191,16 +213,18 @@
         @Override
         public boolean apply(Mutation mutation)
         {
-            for (ColumnFamily cf : mutation.getColumnFamilies())
+            for (PartitionUpdate update : mutation.getPartitionUpdates())
             {
-                for (Cell c : cf.getSortedColumns())
-                {
-                    if (new String(c.name().toByteBuffer().array(), StandardCharsets.UTF_8).startsWith(CELLNAME))
+                for (Row row : update)
+                    if (row.clustering().size() > 0 &&
+                        AsciiType.instance.compose(row.clustering().get(0)).startsWith(CELLNAME))
                     {
-                        hash = hash(hash, c.value());
-                        ++cells;
+                        for (Cell cell : row.cells())
+                        {
+                            hash = hash(hash, cell.value());
+                            ++cells;
+                        }
                     }
-                }
             }
             return true;
         }

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTestMaker.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTestMaker.java
index 175a8d6..3538bd1 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTestMaker.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogUpgradeTestMaker.java

@@ -36,6 +36,7 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Mutation;
@@ -231,18 +232,20 @@
                     rl.acquire();
                 String ks = KEYSPACE;
                 ByteBuffer key = randomBytes(16, tlr);
-                Mutation mutation = new Mutation(ks, key);
+
+                UpdateBuilder builder = UpdateBuilder.create(Schema.instance.getCFMetaData(KEYSPACE, TABLE), Util.dk(key));
 
                 for (int ii = 0; ii < numCells; ii++)
                 {
                     int sz = randomSize ? tlr.nextInt(cellSize) : cellSize;
                     ByteBuffer bytes = randomBytes(sz, tlr);
-                    mutation.add(TABLE, Util.cellname(CELLNAME + ii), bytes, System.currentTimeMillis());
+                    builder.newRow(CommitLogUpgradeTest.CELLNAME + ii).add("val", bytes);
                     hash = hash(hash, bytes);
                     ++cells;
                     dataSize += sz;
                 }
-                rp = commitLog.add(mutation);
+
+                rp = commitLog.add((Mutation)builder.makeMutation());
                 counter.incrementAndGet();
             }
         }

diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitlogShutdownTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitlogShutdownTest.java
new file mode 100644
index 0000000..ee3f111
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitlogShutdownTest.java

@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.commitlog;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.Random;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableMap;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+/**
+ * Since this test depends on byteman rules being setup during initialization, you shouldn't add tests to this class
+ */
+@RunWith(BMUnitRunner.class)
+public class CommitlogShutdownTest
+{
+    private static final String KEYSPACE1 = "CommitLogTest";
+    private static final String STANDARD1 = "Standard1";
+
+    private final static byte[] entropy = new byte[1024 * 256];
+
+    @Test
+    @BMRule(name = "Make removing commitlog segments slow",
+    targetClass = "CommitLogSegment",
+    targetMethod = "discard",
+    action = "Thread.sleep(50)")
+    public void testShutdownWithPendingTasks() throws Exception
+    {
+        new Random().nextBytes(entropy);
+        DatabaseDescriptor.setCommitLogCompression(new ParameterizedClass("LZ4Compressor", ImmutableMap.of()));
+        DatabaseDescriptor.setCommitLogSegmentSize(1);
+        DatabaseDescriptor.setCommitLogSync(Config.CommitLogSync.periodic);
+        DatabaseDescriptor.setCommitLogSyncPeriod(10 * 1000);
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance));
+
+                                    CompactionManager.instance.disableAutoCompaction();
+
+        CommitLog.instance.resetUnsafe(true);
+        ColumnFamilyStore cfs1 = Keyspace.open(KEYSPACE1).getColumnFamilyStore(STANDARD1);
+
+        final Mutation m = new RowUpdateBuilder(cfs1.metadata, 0, "k")
+                           .clustering("bytes")
+                           .add("val", ByteBuffer.wrap(entropy))
+                           .build();
+
+        // force creating several commitlog files
+        for (int i = 0; i < 10; i++)
+        {
+            CommitLog.instance.add(m);
+        }
+
+        // schedule discarding completed segments and immediately issue a shutdown
+        UUID cfid = m.getColumnFamilyIds().iterator().next();
+        CommitLog.instance.discardCompletedSegments(cfid, ReplayPosition.NONE, CommitLog.instance.getContext());
+        CommitLog.instance.shutdownBlocking();
+
+        // the shutdown should block until all logs except the currently active one and perhaps a new, empty one are gone
+        Assert.assertTrue(new File(CommitLog.instance.location).listFiles().length <= 2);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java b/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java
index 99cf72d..37f1731 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java

@@ -25,15 +25,15 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.db.WindowsFailedSnapshotTracker;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.SnapshotDeletingTask;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.GCInspector;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -44,29 +44,26 @@
     private static final String CF_STANDARD1 = "CF_STANDARD1";
 
     @BeforeClass
-    public static void defineSchema() throws ConfigurationException
+    public static void defineSchema() throws Exception
     {
+        GCInspector.register();
+        // Needed to init the output file where we print failed snapshots. This is called on node startup.
+        WindowsFailedSnapshotTracker.deleteOldSnapshots();
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
     }
 
     @Test
-    public void testSnapshotDeletionFailure() throws Exception
+    public void testCompactionHook() throws Exception
     {
         Assume.assumeTrue(FBUtilities.isWindows());
 
-        GCInspector.register();
-
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
         store.clearUnsafe();
 
-        // Needed to init the output file where we print failed snapshots. This is called on node startup.
-        WindowsFailedSnapshotTracker.deleteOldSnapshots();
-
         populate(10000);
         store.snapshot("snapshot1");
 
@@ -90,20 +87,20 @@
         assertEquals(0, SnapshotDeletingTask.pendingDeletionCount());
     }
 
-    private long populate(int rowCount)
-    {
+    private void populate(int rowCount) {
         long timestamp = System.currentTimeMillis();
+        CFMetaData cfm = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1).metadata;
         for (int i = 0; i <= rowCount; i++)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
             for (int j = 0; j < 10; j++)
-                rm.add(CF_STANDARD1,  Util.cellname(Integer.toString(j)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       0);
-            rm.applyUnsafe();
+            {
+                new RowUpdateBuilder(cfm, timestamp, 0, key.getKey())
+                    .clustering(Integer.toString(j))
+                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+            }
         }
-        return timestamp;
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java
new file mode 100644
index 0000000..ba6f3a1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java

@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import com.google.common.collect.Sets;
+import com.google.common.util.concurrent.Uninterruptibles;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+@RunWith(BMUnitRunner.class)
+public class AntiCompactionBytemanTest extends CQLTester
+{
+    @Test
+    @BMRules(rules = { @BMRule(name = "Insert delay after first prepareToCommit",
+             targetClass = "CompactionManager",
+             targetMethod = "antiCompactGroup",
+             condition = "not flagged(\"done\")",
+             targetLocation = "AFTER INVOKE prepareToCommit",
+             action = "Thread.sleep(2000);") } )
+    public void testRedundantTransitions() throws Throwable
+    {
+        createTable("create table %s (id int primary key, i int)");
+        execute("insert into %s (id, i) values (1, 1)");
+        execute("insert into %s (id, i) values (2, 1)");
+        getCurrentColumnFamilyStore().forceBlockingFlush();
+        UntypedResultSet res = execute("select token(id) as tok from %s");
+        Iterator<UntypedResultSet.Row> it = res.iterator();
+        List<Long> tokens = new ArrayList<>();
+        while (it.hasNext())
+        {
+            UntypedResultSet.Row r = it.next();
+            tokens.add(r.getLong("tok"));
+        }
+        tokens.sort(Long::compareTo);
+
+        long first = tokens.get(0) - 10;
+        long last = tokens.get(0) + 10;
+        Range<Token> toRepair = new Range<>(new Murmur3Partitioner.LongToken(first), new Murmur3Partitioner.LongToken(last));
+
+        AtomicBoolean failed = new AtomicBoolean(false);
+        AtomicBoolean finished = new AtomicBoolean(false);
+
+        Thread t = new Thread(() -> {
+            while (!finished.get())
+            {
+                UntypedResultSet result = null;
+                try
+                {
+                    result = execute("select id from %s");
+                }
+                catch (Throwable throwable)
+                {
+                    failed.set(true);
+                    throw new RuntimeException(throwable);
+                }
+
+                Iterator<UntypedResultSet.Row> rowIter = result.iterator();
+                Set<Integer> ids = new HashSet<>();
+                while (rowIter.hasNext())
+                {
+                    UntypedResultSet.Row r = rowIter.next();
+                    ids.add(r.getInt("id"));
+                }
+                if (!Sets.newHashSet(1,2).equals(ids))
+                {
+                    failed.set(true);
+                    return;
+                }
+                Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS);
+            }
+        });
+        t.start();
+        assertEquals(1, getCurrentColumnFamilyStore().getLiveSSTables().size());
+        SSTableReader sstableBefore = getCurrentColumnFamilyStore().getLiveSSTables().iterator().next();
+
+        try (LifecycleTransaction txn = getCurrentColumnFamilyStore().getTracker().tryModify(getCurrentColumnFamilyStore().getLiveSSTables(), OperationType.ANTICOMPACTION))
+        {
+            CompactionManager.instance.antiCompactGroup(getCurrentColumnFamilyStore(), Collections.singleton(toRepair), txn, 123);
+        }
+        finished.set(true);
+        t.join();
+        assertFalse(failed.get());
+        assertFalse(getCurrentColumnFamilyStore().getLiveSSTables().contains(sstableBefore));
+        AntiCompactionTest.assertOnDiskState(getCurrentColumnFamilyStore(), 2);
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
index abd9a4a..a85be24 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java

@@ -17,65 +17,67 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.utils.concurrent.Refs;
-import static org.hamcrest.CoreMatchers.is;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertThat;
-import static org.junit.Assert.assertTrue;
-
 import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Comparator;
 import java.util.List;
+import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.UUID;
 import java.util.concurrent.ExecutionException;
+import java.util.stream.Collectors;
 
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.locator.SimpleStrategy;
-
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.RateLimiter;
 import org.junit.BeforeClass;
 import org.junit.After;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.concurrent.Refs;
+import org.apache.cassandra.UpdateBuilder;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
 
 public class AntiCompactionTest
 {
     private static final String KEYSPACE1 = "AntiCompactionTest";
-    private static final String CF = "Standard1";
-
+    private static final String CF = "AntiCompactionTest";
+    private static CFMetaData cfm;
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
+        cfm = SchemaLoader.standardCFMD(KEYSPACE1, CF);
         SchemaLoader.createKeyspace(KEYSPACE1,
-                SimpleStrategy.class,
-                KSMetaData.optsWithRF(1),
-                SchemaLoader.standardCFMD(KEYSPACE1, CF));
+                                    KeyspaceParams.simple(1),
+                                    cfm);
     }
 
     @After
@@ -90,8 +92,8 @@
     public void antiCompactOne() throws Exception
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
-        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
-        assertEquals(store.getSSTables().size(), sstables.size());
+        Collection<SSTableReader> sstables = getUnrepairedSSTables(store);
+        assertEquals(store.getLiveSSTables().size(), sstables.size());
         Range<Token> range = new Range<Token>(new BytesToken("0".getBytes()), new BytesToken("4".getBytes()));
         List<Range<Token>> ranges = Arrays.asList(range);
 
@@ -103,31 +105,32 @@
             if (txn == null)
                 throw new IllegalStateException();
             long repairedAt = 1000;
-            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, repairedAt);
+            UUID parentRepairSession = UUID.randomUUID();
+            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, repairedAt, parentRepairSession);
         }
 
-        assertEquals(2, store.getSSTables().size());
-        for (SSTableReader sstable : store.getSSTables())
+        assertEquals(2, store.getLiveSSTables().size());
+        for (SSTableReader sstable : store.getLiveSSTables())
         {
-            try (ISSTableScanner scanner = sstable.getScanner())
+            try (ISSTableScanner scanner = sstable.getScanner((RateLimiter) null))
             {
                 while (scanner.hasNext())
                 {
-                    SSTableIdentityIterator row = (SSTableIdentityIterator) scanner.next();
+                    UnfilteredRowIterator row = scanner.next();
                     if (sstable.isRepaired())
                     {
-                        assertTrue(range.contains(row.getKey().getToken()));
+                        assertTrue(range.contains(row.partitionKey().getToken()));
                         repairedKeys++;
                     }
                     else
                     {
-                        assertFalse(range.contains(row.getKey().getToken()));
+                        assertFalse(range.contains(row.partitionKey().getToken()));
                         nonRepairedKeys++;
                     }
                 }
             }
         }
-        for (SSTableReader sstable : store.getSSTables())
+        for (SSTableReader sstable : store.getLiveSSTables())
         {
             assertFalse(sstable.isMarkedCompacted());
             assertEquals(1, sstable.selfRef().globalCount());
@@ -135,6 +138,7 @@
         assertEquals(0, store.getTracker().getCompacting().size());
         assertEquals(repairedKeys, 4);
         assertEquals(nonRepairedKeys, 6);
+        assertOnDiskState(store, 2);
     }
 
     @Test
@@ -145,192 +149,187 @@
         cfs.disableAutoCompaction();
         SSTableReader s = writeFile(cfs, 1000);
         cfs.addSSTable(s);
-        long origSize = s.bytesOnDisk();
         Range<Token> range = new Range<Token>(new BytesToken(ByteBufferUtil.bytes(0)), new BytesToken(ByteBufferUtil.bytes(500)));
-        Collection<SSTableReader> sstables = cfs.getSSTables();
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
+        UUID parentRepairSession = UUID.randomUUID();
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
              Refs<SSTableReader> refs = Refs.ref(sstables))
         {
-            CompactionManager.instance.performAnticompaction(cfs, Arrays.asList(range), refs, txn, 12345);
+            CompactionManager.instance.performAnticompaction(cfs, Arrays.asList(range), refs, txn, 12345, parentRepairSession);
         }
         long sum = 0;
-        for (SSTableReader x : cfs.getSSTables())
+        long rows = 0;
+        for (SSTableReader x : cfs.getLiveSSTables())
+        {
             sum += x.bytesOnDisk();
+            rows += x.getTotalRows();
+        }
         assertEquals(sum, cfs.metric.liveDiskSpaceUsed.getCount());
-        assertEquals(origSize, cfs.metric.liveDiskSpaceUsed.getCount(), 100000);
+        assertEquals(rows, 1000 * (1000 * 5));//See writeFile for how this number is derived
+        assertOnDiskState(cfs, 2);
     }
 
     private SSTableReader writeFile(ColumnFamilyStore cfs, int count)
     {
-        ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        for (int i = 0; i < count; i++)
-            cf.addColumn(Util.column(String.valueOf(i), "a", 1));
-        File dir = cfs.directories.getDirectoryForNewSSTables();
-        String filename = cfs.getTempSSTablePath(dir);
+        File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        String filename = cfs.getSSTablePath(dir);
 
-        try (SSTableWriter writer = SSTableWriter.create(filename, 0, 0);)
+        try (SSTableTxnWriter writer = SSTableTxnWriter.create(cfs, filename, 0, 0, new SerializationHeader(true, cfm, cfm.partitionColumns(), EncodingStats.NO_STATS)))
         {
-            for (int i = 0; i < count * 5; i++)
-                writer.append(StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(i)), cf);
-            return writer.finish(true);
+            for (int i = 0; i < count; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfm, ByteBufferUtil.bytes(i));
+                for (int j = 0; j < count * 5; j++)
+                    builder.newRow("c" + j).add("val", "value1");
+                writer.append(builder.build().unfilteredIterator());
+
+            }
+            Collection<SSTableReader> sstables = writer.finish(true);
+            assertNotNull(sstables);
+            assertEquals(1, sstables.size());
+            return sstables.iterator().next();
         }
     }
 
     public void generateSStable(ColumnFamilyStore store, String Suffix)
     {
-    long timestamp = System.currentTimeMillis();
-    for (int i = 0; i < 10; i++)
+        for (int i = 0; i < 10; i++)
         {
-            DecoratedKey key = Util.dk(Integer.toString(i) + "-" + Suffix);
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            for (int j = 0; j < 10; j++)
-                rm.add("Standard1", Util.cellname(Integer.toString(j)),
-                        ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                        timestamp,
-                        0);
-            rm.apply();
+            String localSuffix = Integer.toString(i);
+            new RowUpdateBuilder(cfm, System.currentTimeMillis(), localSuffix + "-" + Suffix)
+                    .clustering("c")
+                    .add("val", "val" + localSuffix)
+                    .build()
+                    .applyUnsafe();
         }
         store.forceBlockingFlush();
     }
 
     @Test
-    public void antiCompactTenSTC() throws Exception
-    {
-        antiCompactTen("SizeTieredCompactionStrategy");
-    }
-
-    @Test
-    public void antiCompactTenLC() throws Exception
-    {
-        antiCompactTen("LeveledCompactionStrategy");
-    }
-
-    public void antiCompactTen(String compactionStrategy) throws Exception
+    public void antiCompactTen() throws InterruptedException, IOException
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
-        store.setCompactionStrategyClass(compactionStrategy);
         store.disableAutoCompaction();
 
         for (int table = 0; table < 10; table++)
         {
             generateSStable(store,Integer.toString(table));
         }
-        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
-        assertEquals(store.getSSTables().size(), sstables.size());
+        Collection<SSTableReader> sstables = getUnrepairedSSTables(store);
+        assertEquals(store.getLiveSSTables().size(), sstables.size());
 
         Range<Token> range = new Range<Token>(new BytesToken("0".getBytes()), new BytesToken("4".getBytes()));
         List<Range<Token>> ranges = Arrays.asList(range);
 
         long repairedAt = 1000;
+        UUID parentRepairSession = UUID.randomUUID();
         try (LifecycleTransaction txn = store.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
              Refs<SSTableReader> refs = Refs.ref(sstables))
         {
-            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, repairedAt);
+            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, repairedAt, parentRepairSession);
         }
         /*
         Anticompaction will be anti-compacting 10 SSTables but will be doing this two at a time
         so there will be no net change in the number of sstables
          */
-        assertEquals(10, store.getSSTables().size());
+        assertEquals(10, store.getLiveSSTables().size());
         int repairedKeys = 0;
         int nonRepairedKeys = 0;
-        for (SSTableReader sstable : store.getSSTables())
+        for (SSTableReader sstable : store.getLiveSSTables())
         {
-            try(ISSTableScanner scanner = sstable.getScanner())
+            try (ISSTableScanner scanner = sstable.getScanner((RateLimiter) null))
             {
                 while (scanner.hasNext())
                 {
-                    SSTableIdentityIterator row = (SSTableIdentityIterator) scanner.next();
-                    if (sstable.isRepaired())
+                    try (UnfilteredRowIterator row = scanner.next())
                     {
-                        assertTrue(range.contains(row.getKey().getToken()));
-                        assertEquals(repairedAt, sstable.getSSTableMetadata().repairedAt);
-                        repairedKeys++;
-                    }
-                    else
-                    {
-                        assertFalse(range.contains(row.getKey().getToken()));
-                        assertEquals(ActiveRepairService.UNREPAIRED_SSTABLE, sstable.getSSTableMetadata().repairedAt);
-                        nonRepairedKeys++;
+                        if (sstable.isRepaired())
+                        {
+                            assertTrue(range.contains(row.partitionKey().getToken()));
+                            assertEquals(repairedAt, sstable.getSSTableMetadata().repairedAt);
+                            repairedKeys++;
+                        }
+                        else
+                        {
+                            assertFalse(range.contains(row.partitionKey().getToken()));
+                            assertEquals(ActiveRepairService.UNREPAIRED_SSTABLE, sstable.getSSTableMetadata().repairedAt);
+                            nonRepairedKeys++;
+                        }
                     }
                 }
             }
         }
         assertEquals(repairedKeys, 40);
         assertEquals(nonRepairedKeys, 60);
+        assertOnDiskState(store, 10);
     }
 
     @Test
     public void shouldMutateRepairedAt() throws InterruptedException, IOException
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
-        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
-        assertEquals(store.getSSTables().size(), sstables.size());
+        Collection<SSTableReader> sstables = getUnrepairedSSTables(store);
+        assertEquals(store.getLiveSSTables().size(), sstables.size());
         Range<Token> range = new Range<Token>(new BytesToken("/".getBytes()), new BytesToken("9999".getBytes()));
         List<Range<Token>> ranges = Arrays.asList(range);
+        UUID parentRepairSession = UUID.randomUUID();
 
         try (LifecycleTransaction txn = store.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
              Refs<SSTableReader> refs = Refs.ref(sstables))
         {
-            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, 1);
+            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, 1, parentRepairSession);
         }
 
-        SSTableReader sstable = Iterables.get(store.getSSTables(), 0);
-        assertThat(store.getSSTables().size(), is(1));
+        SSTableReader sstable = Iterables.get(store.getLiveSSTables(), 0);
+        assertThat(store.getLiveSSTables().size(), is(1));
         assertThat(sstable.isRepaired(), is(true));
         assertThat(sstable.selfRef().globalCount(), is(1));
         assertThat(store.getTracker().getCompacting().size(), is(0));
+        assertOnDiskState(store, 1);
     }
 
     @Test
     public void shouldAntiCompactSSTable() throws IOException, InterruptedException, ExecutionException
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
-        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
-        assertEquals(store.getSSTables().size(), sstables.size());
+        Collection<SSTableReader> sstables = getUnrepairedSSTables(store);
+        assertEquals(store.getLiveSSTables().size(), sstables.size());
         // SSTable range is 0 - 10, repair just a subset of the ranges (0 - 4) of the SSTable. Should result in
         // one repaired and one unrepaired SSTable
         Range<Token> range = new Range<Token>(new BytesToken("/".getBytes()), new BytesToken("4".getBytes()));
         List<Range<Token>> ranges = Arrays.asList(range);
+        UUID parentRepairSession = UUID.randomUUID();
 
         try (LifecycleTransaction txn = store.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
              Refs<SSTableReader> refs = Refs.ref(sstables))
         {
-            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, 1);
+            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, 1, parentRepairSession);
         }
 
-        Comparator<SSTableReader> generationReverseComparator = new Comparator<SSTableReader>()
-        {
-            public int compare(SSTableReader o1, SSTableReader o2)
-            {
-                return Integer.compare(o1.descriptor.generation, o2.descriptor.generation);
-            }
-        };
-
-        SortedSet<SSTableReader> sstablesSorted = new TreeSet<>(generationReverseComparator);
-        sstablesSorted.addAll(store.getSSTables());
+        SortedSet<SSTableReader> sstablesSorted = new TreeSet<>(SSTableReader.generationReverseComparator.reversed());
+        sstablesSorted.addAll(store.getLiveSSTables());
 
         SSTableReader sstable = sstablesSorted.first();
-        assertThat(store.getSSTables().size(), is(2));
+        assertThat(store.getLiveSSTables().size(), is(2));
         assertThat(sstable.isRepaired(), is(true));
         assertThat(sstable.selfRef().globalCount(), is(1));
         assertThat(store.getTracker().getCompacting().size(), is(0));
 
         // Test we don't anti-compact already repaired SSTables. repairedAt shouldn't change for the already repaired SSTable (first)
-        sstables = store.getSSTables();
+        sstables = store.getLiveSSTables();
         // Range that's a subset of the repaired SSTable's ranges, so would cause an anti-compaction (if it wasn't repaired)
         range = new Range<Token>(new BytesToken("/".getBytes()), new BytesToken("2".getBytes()));
         ranges = Arrays.asList(range);
         try (Refs<SSTableReader> refs = Refs.ref(sstables))
         {
             // use different repairedAt to ensure it doesn't change
-            ListenableFuture fut = CompactionManager.instance.submitAntiCompaction(store, ranges, refs, 200);
+            ListenableFuture fut = CompactionManager.instance.submitAntiCompaction(store, ranges, refs, 200, parentRepairSession);
             fut.get();
         }
 
         sstablesSorted.clear();
-        sstablesSorted.addAll(store.getSSTables());
+        sstablesSorted.addAll(store.getLiveSSTables());
         assertThat(sstablesSorted.size(), is(2));
         assertThat(sstablesSorted.first().isRepaired(), is(true));
         assertThat(sstablesSorted.last().isRepaired(), is(false));
@@ -348,12 +347,12 @@
         try (Refs<SSTableReader> refs = Refs.ref(sstables))
         {
             // Same repaired at, but should be changed on the repaired SSTable now
-            ListenableFuture fut = CompactionManager.instance.submitAntiCompaction(store, ranges, refs, 200);
+            ListenableFuture fut = CompactionManager.instance.submitAntiCompaction(store, ranges, refs, 200, parentRepairSession);
             fut.get();
         }
 
         sstablesSorted.clear();
-        sstablesSorted.addAll(store.getSSTables());
+        sstablesSorted.addAll(store.getLiveSSTables());
 
         assertThat(sstablesSorted.size(), is(2));
         assertThat(sstablesSorted.first().isRepaired(), is(true));
@@ -372,12 +371,12 @@
         try (Refs<SSTableReader> refs = Refs.ref(sstables))
         {
             // Both SSTables should have repairedAt of 400
-            ListenableFuture fut = CompactionManager.instance.submitAntiCompaction(store, ranges, refs, 400);
+            ListenableFuture fut = CompactionManager.instance.submitAntiCompaction(store, ranges, refs, 400, parentRepairSession);
             fut.get();
         }
 
         sstablesSorted.clear();
-        sstablesSorted.addAll(store.getSSTables());
+        sstablesSorted.addAll(store.getLiveSSTables());
 
         assertThat(sstablesSorted.size(), is(2));
         assertThat(sstablesSorted.first().isRepaired(), is(true));
@@ -387,6 +386,7 @@
         assertThat(sstablesSorted.first().selfRef().globalCount(), is(1));
         assertThat(sstablesSorted.last().selfRef().globalCount(), is(1));
         assertThat(store.getTracker().getCompacting().size(), is(0));
+        assertOnDiskState(store, 2);
     }
 
 
@@ -401,21 +401,22 @@
         {
             generateSStable(store,Integer.toString(table));
         }
-        Collection<SSTableReader> sstables = store.getUnrepairedSSTables();
-        assertEquals(store.getSSTables().size(), sstables.size());
-        
+        Collection<SSTableReader> sstables = getUnrepairedSSTables(store);
+        assertEquals(store.getLiveSSTables().size(), sstables.size());
+
         Range<Token> range = new Range<Token>(new BytesToken("-1".getBytes()), new BytesToken("-10".getBytes()));
         List<Range<Token>> ranges = Arrays.asList(range);
-
+        UUID parentRepairSession = UUID.randomUUID();
 
         try (LifecycleTransaction txn = store.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
              Refs<SSTableReader> refs = Refs.ref(sstables))
         {
-            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, 1);
+            CompactionManager.instance.performAnticompaction(store, ranges, refs, txn, 1, parentRepairSession);
         }
 
-        assertThat(store.getSSTables().size(), is(10));
-        assertThat(Iterables.get(store.getSSTables(), 0).isRepaired(), is(false));
+        assertThat(store.getLiveSSTables().size(), is(10));
+        assertThat(Iterables.get(store.getLiveSSTables(), 0).isRepaired(), is(false));
+        assertOnDiskState(store, 10);
     }
 
     private ColumnFamilyStore prepareColumnFamilyStore()
@@ -423,17 +424,13 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
         store.disableAutoCompaction();
-        long timestamp = System.currentTimeMillis();
         for (int i = 0; i < 10; i++)
         {
-            DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            for (int j = 0; j < 10; j++)
-                rm.add("Standard1", Util.cellname(Integer.toString(j)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       0);
-            rm.apply();
+            new RowUpdateBuilder(cfm, System.currentTimeMillis(), Integer.toString(i))
+                .clustering("c")
+                .add("val", "val")
+                .build()
+                .applyUnsafe();
         }
         store.forceBlockingFlush();
         return store;
@@ -446,4 +443,31 @@
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
         store.truncateBlocking();
     }
+
+    private static Set<SSTableReader> getUnrepairedSSTables(ColumnFamilyStore cfs)
+    {
+        return ImmutableSet.copyOf(cfs.getTracker().getView().sstables(SSTableSet.LIVE, (s) -> !s.isRepaired()));
+    }
+
+    public static void assertOnDiskState(ColumnFamilyStore cfs, int expectedSSTableCount)
+    {
+        LifecycleTransaction.waitForDeletions();
+        assertEquals(expectedSSTableCount, cfs.getLiveSSTables().size());
+        Set<Integer> liveGenerations = cfs.getLiveSSTables().stream().map(sstable -> sstable.descriptor.generation).collect(Collectors.toSet());
+        int fileCount = 0;
+        for (File f : cfs.getDirectories().getCFDirectories())
+        {
+            for (File sst : f.listFiles())
+            {
+                if (sst.getName().contains("Data"))
+                {
+                    Descriptor d = Descriptor.fromFilename(sst.getAbsolutePath());
+                    assertTrue(liveGenerations.contains(d.generation));
+                    fileCount++;
+                }
+            }
+        }
+        assertEquals(expectedSSTableCount, fileCount);
+    }
+
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
new file mode 100644
index 0000000..68ba6bf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java

@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collections;
+import java.util.concurrent.CountDownLatch;
+
+import com.google.common.util.concurrent.Uninterruptibles;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.metrics.CompactionMetrics;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+public class CancelCompactionsTest extends CQLTester
+{
+    @Test
+    public void testStandardCompactionTaskCancellation() throws Throwable
+    {
+        createTable("create table %s (id int primary key, something int)");
+        getCurrentColumnFamilyStore().disableAutoCompaction();
+
+        for (int i = 0; i < 10; i++)
+        {
+            execute("insert into %s (id, something) values (?,?)", i, i);
+            getCurrentColumnFamilyStore().forceBlockingFlush();
+        }
+        AbstractCompactionTask ct = null;
+
+        for (AbstractCompactionStrategy cs : getCurrentColumnFamilyStore().getCompactionStrategyManager().getStrategies())
+        {
+            ct = cs.getNextBackgroundTask(0);
+            if (ct != null)
+                break;
+        }
+        assertNotNull(ct);
+
+        CountDownLatch waitForBeginCompaction = new CountDownLatch(1);
+        CountDownLatch waitForStart = new CountDownLatch(1);
+        Iterable<CFMetaData> metadatas = Collections.singleton(getCurrentColumnFamilyStore().metadata);
+        /*
+        Here we ask strategies to pause & interrupt compactions right before calling beginCompaction in CompactionTask
+        The code running in the separate thread below mimics CFS#runWithCompactionsDisabled but we only allow
+        the real beginCompaction to be called after pausing & interrupting.
+         */
+        Thread t = new Thread(() -> {
+            Uninterruptibles.awaitUninterruptibly(waitForBeginCompaction);
+            getCurrentColumnFamilyStore().getCompactionStrategyManager().pause();
+            CompactionManager.instance.interruptCompactionFor(metadatas, false);
+            waitForStart.countDown();
+            CompactionManager.instance.waitForCessation(Collections.singleton(getCurrentColumnFamilyStore()));
+            getCurrentColumnFamilyStore().getCompactionStrategyManager().resume();
+        });
+        t.start();
+
+        try
+        {
+            ct.execute(new CompactionMetrics()
+            {
+                @Override
+                public void beginCompaction(CompactionInfo.Holder ci)
+                {
+                    waitForBeginCompaction.countDown();
+                    Uninterruptibles.awaitUninterruptibly(waitForStart);
+                    super.beginCompaction(ci);
+                }
+            });
+            fail("execute should throw CompactionInterruptedException");
+        }
+        catch (CompactionInterruptedException cie)
+        {
+            // expected
+        }
+        finally
+        {
+            ct.transaction.abort();
+            t.join();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
index 235fd49..68936f5 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java

@@ -21,109 +21,100 @@
 import java.util.*;
 
 import com.google.common.primitives.Longs;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
+import org.junit.*;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
 import org.apache.cassandra.db.compaction.writers.DefaultCompactionWriter;
 import org.apache.cassandra.db.compaction.writers.MajorLeveledCompactionWriter;
 import org.apache.cassandra.db.compaction.writers.MaxSSTableSizeWriter;
 import org.apache.cassandra.db.compaction.writers.SplittingSizeTieredCompactionWriter;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
 
 import static org.junit.Assert.assertEquals;
 
-public class CompactionAwareWriterTest
+public class CompactionAwareWriterTest extends CQLTester
 {
-    private static String KEYSPACE1 = "CompactionAwareWriterTest";
-    private static String CF = "Standard1";
+    private static final String KEYSPACE = "cawt_keyspace";
+    private static final String TABLE = "cawt_table";
+
+    private static final int ROW_PER_PARTITION = 10;
 
     @BeforeClass
-    public static void defineSchema() throws ConfigurationException
+    public static void beforeClass() throws Throwable
     {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF));
-
+        // Disabling durable write since we don't care
+        schemaChange("CREATE KEYSPACE IF NOT EXISTS " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes=false");
+        schemaChange(String.format("CREATE TABLE %s.%s (k int, t int, v blob, PRIMARY KEY (k, t))", KEYSPACE, TABLE));
     }
 
-    @Before
-    public void clear()
+    @AfterClass
+    public static void tearDownClass()
     {
-        // avoid one test affecting the next one
-        Keyspace ks = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = ks.getColumnFamilyStore(CF);
-        cfs.clearUnsafe();
+        QueryProcessor.executeInternal("DROP KEYSPACE IF EXISTS " + KEYSPACE);
+    }
+
+    private ColumnFamilyStore getColumnFamilyStore()
+    {
+        return Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE);
     }
 
     @Test
-    public void testDefaultCompactionWriter()
+    public void testDefaultCompactionWriter() throws Throwable
     {
-        Keyspace ks = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = ks.getColumnFamilyStore(CF);
+        Keyspace ks = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = ks.getColumnFamilyStore(TABLE);
+
         int rowCount = 1000;
         cfs.disableAutoCompaction();
-        populate(cfs, rowCount);
-        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getSSTables(), OperationType.COMPACTION);
+        populate(rowCount);
+        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getLiveSSTables(), OperationType.COMPACTION);
         long beforeSize = txn.originals().iterator().next().onDiskLength();
-        CompactionAwareWriter writer = new DefaultCompactionWriter(cfs, txn, txn.originals(), false, OperationType.COMPACTION);
+        CompactionAwareWriter writer = new DefaultCompactionWriter(cfs, cfs.getDirectories(), txn, txn.originals());
         int rows = compact(cfs, txn, writer);
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
         assertEquals(rowCount, rows);
-        assertEquals(beforeSize, cfs.getSSTables().iterator().next().onDiskLength());
+        assertEquals(beforeSize, cfs.getLiveSSTables().iterator().next().onDiskLength());
         validateData(cfs, rowCount);
         cfs.truncateBlocking();
     }
 
     @Test
-    public void testMaxSSTableSizeWriter()
+    public void testMaxSSTableSizeWriter() throws Throwable
     {
-        Keyspace ks = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = ks.getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = getColumnFamilyStore();
         cfs.disableAutoCompaction();
         int rowCount = 1000;
-        populate(cfs, rowCount);
-        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getSSTables(), OperationType.COMPACTION);
+        populate(rowCount);
+        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getLiveSSTables(), OperationType.COMPACTION);
         long beforeSize = txn.originals().iterator().next().onDiskLength();
         int sstableSize = (int)beforeSize/10;
-        CompactionAwareWriter writer = new MaxSSTableSizeWriter(cfs, txn, txn.originals(), sstableSize, 0, false, OperationType.COMPACTION);
+        CompactionAwareWriter writer = new MaxSSTableSizeWriter(cfs, cfs.getDirectories(), txn, txn.originals(), sstableSize, 0);
         int rows = compact(cfs, txn, writer);
-        assertEquals(10, cfs.getSSTables().size());
+        assertEquals(10, cfs.getLiveSSTables().size());
         assertEquals(rowCount, rows);
         validateData(cfs, rowCount);
         cfs.truncateBlocking();
     }
+
     @Test
-    public void testSplittingSizeTieredCompactionWriter()
+    public void testSplittingSizeTieredCompactionWriter() throws Throwable
     {
-        Keyspace ks = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = ks.getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = getColumnFamilyStore();
         cfs.disableAutoCompaction();
         int rowCount = 10000;
-        populate(cfs, rowCount);
-        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getSSTables(), OperationType.COMPACTION);
+        populate(rowCount);
+        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getLiveSSTables(), OperationType.COMPACTION);
         long beforeSize = txn.originals().iterator().next().onDiskLength();
-        CompactionAwareWriter writer = new SplittingSizeTieredCompactionWriter(cfs, txn, txn.originals(), OperationType.COMPACTION, 0);
+        CompactionAwareWriter writer = new SplittingSizeTieredCompactionWriter(cfs, cfs.getDirectories(), txn, txn.originals(), 0);
         int rows = compact(cfs, txn, writer);
         long expectedSize = beforeSize / 2;
-        List<SSTableReader> sortedSSTables = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sortedSSTables = new ArrayList<>(cfs.getLiveSSTables());
 
         Collections.sort(sortedSSTables, new Comparator<SSTableReader>()
                                 {
@@ -146,23 +137,22 @@
     }
 
     @Test
-    public void testMajorLeveledCompactionWriter()
+    public void testMajorLeveledCompactionWriter() throws Throwable
     {
-        Keyspace ks = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = ks.getColumnFamilyStore(CF);
+        ColumnFamilyStore cfs = getColumnFamilyStore();
         cfs.disableAutoCompaction();
         int rowCount = 20000;
         int targetSSTableCount = 50;
-        populate(cfs, rowCount);
-        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getSSTables(), OperationType.COMPACTION);
+        populate(rowCount);
+        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getLiveSSTables(), OperationType.COMPACTION);
         long beforeSize = txn.originals().iterator().next().onDiskLength();
         int sstableSize = (int)beforeSize/targetSSTableCount;
-        CompactionAwareWriter writer = new MajorLeveledCompactionWriter(cfs, txn, txn.originals(), sstableSize, false, OperationType.COMPACTION);
+        CompactionAwareWriter writer = new MajorLeveledCompactionWriter(cfs, cfs.getDirectories(), txn, txn.originals(), sstableSize);
         int rows = compact(cfs, txn, writer);
-        assertEquals(targetSSTableCount, cfs.getSSTables().size());
+        assertEquals(targetSSTableCount, cfs.getLiveSSTables().size());
         int [] levelCounts = new int[5];
         assertEquals(rowCount, rows);
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
             levelCounts[sstable.getSSTableLevel()]++;
         }
@@ -179,39 +169,34 @@
     {
         assert txn.originals().size() == 1;
         int rowsWritten = 0;
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(txn.originals()))
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(txn.originals());
+             CompactionController controller = new CompactionController(cfs, txn.originals(), cfs.gcBefore(nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
-            CompactionController controller = new CompactionController(cfs, txn.originals(), cfs.gcBefore(System.currentTimeMillis()));
-            ISSTableScanner scanner = scanners.scanners.get(0);
-            while(scanner.hasNext())
+            while (ci.hasNext())
             {
-                AbstractCompactedRow row = new LazilyCompactedRow(controller, Arrays.asList(scanner.next()));
-                if (writer.append(row))
+                if (writer.append(ci.next()))
                     rowsWritten++;
             }
         }
-        Collection<SSTableReader> newSSTables = writer.finish();
+        writer.finish();
         return rowsWritten;
     }
 
-    private void populate(ColumnFamilyStore cfs, int count)
+    private void populate(int count) throws Throwable
     {
-        long timestamp = System.currentTimeMillis();
-        byte [] payload = new byte[1000];
-        new Random().nextBytes(payload);
+        byte [] payload = new byte[5000];
+        new Random(42).nextBytes(payload);
         ByteBuffer b = ByteBuffer.wrap(payload);
+
         for (int i = 0; i < count; i++)
-        {
-            DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            for (int j = 0; j < 10; j++)
-                rm.add(CF,  Util.cellname(Integer.toString(j)),
-                        b,
-                        timestamp);
-            rm.applyUnsafe();
-        }
+            for (int j = 0; j < ROW_PER_PARTITION; j++)
+                execute(String.format("INSERT INTO %s.%s(k, t, v) VALUES (?, ?, ?)", KEYSPACE, TABLE), i, j, b);
+
+        ColumnFamilyStore cfs = getColumnFamilyStore();
         cfs.forceBlockingFlush();
-        if (cfs.getSSTables().size() > 1)
+        if (cfs.getLiveSSTables().size() > 1)
         {
             // we want just one big sstable to avoid doing actual compaction in compact() above
             try
@@ -223,22 +208,18 @@
                 throw new RuntimeException(t);
             }
         }
-        assert cfs.getSSTables().size() == 1 : cfs.getSSTables();
+        assert cfs.getLiveSSTables().size() == 1 : cfs.getLiveSSTables();
     }
-    private void validateData(ColumnFamilyStore cfs, int rowCount)
+
+    private void validateData(ColumnFamilyStore cfs, int rowCount) throws Throwable
     {
         for (int i = 0; i < rowCount; i++)
         {
-            ColumnFamily cf = cfs.getTopLevelColumns(QueryFilter.getIdentityFilter(Util.dk(Integer.toString(i)), CF, System.currentTimeMillis()), Integer.MAX_VALUE);
-            Iterator<Cell> iter = cf.iterator();
-            int cellCount = 0;
-            while (iter.hasNext())
-            {
-                Cell c = iter.next();
-                assertEquals(Util.cellname(Integer.toString(cellCount)), c.name());
-                cellCount++;
-            }
-            assertEquals(10, cellCount);
+            Object[][] expected = new Object[ROW_PER_PARTITION][];
+            for (int j = 0; j < ROW_PER_PARTITION; j++)
+                expected[j] = row(i, j);
+
+            assertRows(execute(String.format("SELECT k, t FROM %s.%s WHERE k = :i", KEYSPACE, TABLE), i), expected);
         }
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java
index 3184159..1b400e8 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java

@@ -20,27 +20,28 @@
 
 import java.nio.ByteBuffer;
 import java.util.Set;
+import java.util.function.Predicate;
 
-import com.google.common.base.Predicate;
 import com.google.common.collect.Sets;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.db.composites.CellName;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
-import static org.apache.cassandra.Util.cellname;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -57,10 +58,17 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF1),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF2));
+                                    KeyspaceParams.simple(1),
+                                    CFMetaData.Builder.create(KEYSPACE, CF1, true, false, false)
+                                                      .addPartitionKey("pk", AsciiType.instance)
+                                                      .addClusteringColumn("ck", AsciiType.instance)
+                                                      .addRegularColumn("val", AsciiType.instance)
+                                                      .build(),
+                                    CFMetaData.Builder.create(KEYSPACE, CF2, true, false, false)
+                                                      .addPartitionKey("pk", AsciiType.instance)
+                                                      .addClusteringColumn("ck", AsciiType.instance)
+                                                      .addRegularColumn("val", AsciiType.instance)
+                                                      .build());
     }
 
     @Test
@@ -70,15 +78,14 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF1);
         cfs.truncateBlocking();
 
-        ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
-        DecoratedKey key = DatabaseDescriptor.getPartitioner().decorateKey(rowKey);
+        DecoratedKey key = Util.dk("k1");
 
         long timestamp1 = FBUtilities.timestampMicros(); // latest timestamp
         long timestamp2 = timestamp1 - 5;
         long timestamp3 = timestamp2 - 5; // oldest timestamp
 
         // add to first memtable
-        applyMutation(CF1, rowKey, timestamp1);
+        applyMutation(cfs.metadata, key, timestamp1);
 
         // check max purgeable timestamp without any sstables
         try(CompactionController controller = new CompactionController(cfs, null, 0))
@@ -86,13 +93,13 @@
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp1); //memtable only
 
             cfs.forceBlockingFlush();
-            assertTrue(controller.getPurgeEvaluator(key).apply(Long.MAX_VALUE)); //no memtables and no sstables
+            assertTrue(controller.getPurgeEvaluator(key).test(Long.MAX_VALUE)); //no memtables and no sstables
         }
 
-        Set<SSTableReader> compacting = Sets.newHashSet(cfs.getSSTables()); // first sstable is compacting
+        Set<SSTableReader> compacting = Sets.newHashSet(cfs.getLiveSSTables()); // first sstable is compacting
 
         // create another sstable
-        applyMutation(CF1, rowKey, timestamp2);
+        applyMutation(cfs.metadata, key, timestamp2);
         cfs.forceBlockingFlush();
 
         // check max purgeable timestamp when compacting the first sstable with and without a memtable
@@ -100,7 +107,7 @@
         {
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp2);
 
-            applyMutation(CF1, rowKey, timestamp3);
+            applyMutation(cfs.metadata, key, timestamp3);
 
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp3); //second sstable and second memtable
         }
@@ -111,9 +118,9 @@
         //newest to oldest
         try (CompactionController controller = new CompactionController(cfs, null, 0))
         {
-            applyMutation(CF1, rowKey, timestamp1);
-            applyMutation(CF1, rowKey, timestamp2);
-            applyMutation(CF1, rowKey, timestamp3);
+            applyMutation(cfs.metadata, key, timestamp1);
+            applyMutation(cfs.metadata, key, timestamp2);
+            applyMutation(cfs.metadata, key, timestamp3);
 
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp3); //memtable only
         }
@@ -123,9 +130,9 @@
         //oldest to newest
         try (CompactionController controller = new CompactionController(cfs, null, 0))
         {
-            applyMutation(CF1, rowKey, timestamp3);
-            applyMutation(CF1, rowKey, timestamp2);
-            applyMutation(CF1, rowKey, timestamp1);
+            applyMutation(cfs.metadata, key, timestamp3);
+            applyMutation(cfs.metadata, key, timestamp2);
+            applyMutation(cfs.metadata, key, timestamp1);
 
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp3);
         }
@@ -138,25 +145,25 @@
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2);
         cfs.truncateBlocking();
 
-        ByteBuffer rowKey = ByteBufferUtil.bytes("k1");
+        DecoratedKey key = Util.dk("k1");
 
         long timestamp1 = FBUtilities.timestampMicros(); // latest timestamp
         long timestamp2 = timestamp1 - 5;
         long timestamp3 = timestamp2 - 5; // oldest timestamp
 
         // create sstable with tombstone that should be expired in no older timestamps
-        applyDeleteMutation(CF2, rowKey, timestamp2);
+        applyDeleteMutation(cfs.metadata, key, timestamp2);
         cfs.forceBlockingFlush();
 
         // first sstable with tombstone is compacting
-        Set<SSTableReader> compacting = Sets.newHashSet(cfs.getSSTables());
+        Set<SSTableReader> compacting = Sets.newHashSet(cfs.getLiveSSTables());
 
         // create another sstable with more recent timestamp
-        applyMutation(CF2, rowKey, timestamp1);
+        applyMutation(cfs.metadata, key, timestamp1);
         cfs.forceBlockingFlush();
 
         // second sstable is overlapping
-        Set<SSTableReader> overlapping = Sets.difference(Sets.newHashSet(cfs.getSSTables()), compacting);
+        Set<SSTableReader> overlapping = Sets.difference(Sets.newHashSet(cfs.getLiveSSTables()), compacting);
 
         // the first sstable should be expired because the overlapping sstable is newer and the gc period is later
         int gcBefore = (int) (System.currentTimeMillis() / 1000) + 5;
@@ -166,32 +173,32 @@
         assertEquals(compacting.iterator().next(), expired.iterator().next());
 
         // however if we add an older mutation to the memtable then the sstable should not be expired
-        applyMutation(CF2, rowKey, timestamp3);
+        applyMutation(cfs.metadata, key, timestamp3);
         expired = CompactionController.getFullyExpiredSSTables(cfs, compacting, overlapping, gcBefore);
         assertNotNull(expired);
         assertEquals(0, expired.size());
     }
 
-    private void applyMutation(String cf, ByteBuffer rowKey, long timestamp)
+    private void applyMutation(CFMetaData cfm, DecoratedKey key, long timestamp)
     {
-        CellName colName = cellname("birthdate");
         ByteBuffer val = ByteBufferUtil.bytes(1L);
 
-        Mutation rm = new Mutation(KEYSPACE, rowKey);
-        rm.add(cf, colName, val, timestamp);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfm, timestamp, key)
+        .clustering("ck")
+        .add("val", val)
+        .build()
+        .applyUnsafe();
     }
 
-    private void applyDeleteMutation(String cf, ByteBuffer rowKey, long timestamp)
+    private void applyDeleteMutation(CFMetaData cfm, DecoratedKey key, long timestamp)
     {
-        Mutation rm = new Mutation(KEYSPACE, rowKey);
-        rm.delete(cf, timestamp);
-        rm.applyUnsafe();
+        new Mutation(PartitionUpdate.fullPartitionDelete(cfm, key, timestamp, FBUtilities.nowInSeconds()))
+        .applyUnsafe();
     }
 
     private void assertPurgeBoundary(Predicate<Long> evaluator, long boundary)
     {
-        assertFalse(evaluator.apply(boundary));
-        assertTrue(evaluator.apply(boundary - 1));
+        assertFalse(evaluator.test(boundary));
+        assertTrue(evaluator.test(boundary - 1));
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionExecutorTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionExecutorTest.java
index c6feb3f..9b07da9 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionExecutorTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionExecutorTest.java

@@ -18,7 +18,6 @@
 
 package org.apache.cassandra.db.compaction;
 
-import java.util.concurrent.Callable;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 
@@ -68,17 +67,9 @@
     public void testFailedRunnable() throws Exception
     {
         testTaskThrowable = null;
-
         Future<?> tt = executor.submitIfRunning(
-            new Runnable()
-            {
-                @Override
-                public void run()
-                {
-                    assert false : "testFailedRunnable";
-                }
-            }, "compactionExecutorTest"
-        );
+            () -> { assert false : "testFailedRunnable"; }
+            , "compactionExecutorTest");
 
         while (!tt.isDone())
             Thread.sleep(10);
@@ -91,15 +82,7 @@
     {
         testTaskThrowable = null;
         Future<?> tt = executor.submitIfRunning(
-            new Callable<Integer>()
-            {
-                @Override
-                public Integer call() throws Exception
-                {
-                    assert false : "testFailedCallable";
-                    return 1;
-                }
-            }
+            () -> { assert false : "testFailedCallable"; return 1; }
             , "compactionExecutorTest");
 
         while (!tt.isDone())
@@ -113,14 +96,7 @@
     {
         testTaskThrowable = null;
         Future<?> tt = executor.submitIfRunning(
-        new Runnable()
-        {
-            @Override
-            public void run()
-            {
-                throw new RuntimeException("testExceptionRunnable");
-            }
-        }
+        () -> { throw new RuntimeException("testExceptionRunnable"); }
         , "compactionExecutorTest");
 
         while (!tt.isDone())

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
new file mode 100644
index 0000000..549a94d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java

@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.compaction;
+
+import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.assertCommandIssued;
+import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.iter;
+import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.makeRow;
+import static org.junit.Assert.*;
+
+import java.net.InetAddress;
+import java.util.*;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.net.*;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class CompactionIteratorTest extends CQLTester
+{
+    @Test
+    public void duplicateRowsTest() throws Throwable
+    {
+        System.setProperty("cassandra.diagnostic_snapshot_interval_nanos", "0");
+        // Create a table and insert some data. The actual rows read in the test will be synthetic
+        // but this creates an sstable on disk to be snapshotted.
+        createTable("CREATE TABLE %s (pk text, ck1 int, ck2 int, v int, PRIMARY KEY (pk, ck1, ck2))");
+        for (int i = 0; i < 10; i++)
+            execute("insert into %s (pk, ck1, ck2, v) values (?, ?, ?, ?)", "key", i, i, i);
+        flush();
+
+        DatabaseDescriptor.setSnapshotOnDuplicateRowDetection(true);
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        CFMetaData metadata = getCurrentColumnFamilyStore().metadata;
+
+        final HashMap<InetAddress, MessageOut> sentMessages = new HashMap<>();
+        IMessageSink sink = new IMessageSink()
+        {
+            public boolean allowOutgoingMessage(MessageOut message, int id, InetAddress to)
+            {
+                sentMessages.put(to, message);
+                return false;
+            }
+
+            public boolean allowIncomingMessage(MessageIn message, int id)
+            {
+                return false;
+            }
+        };
+        MessagingService.instance().addMessageSink(sink);
+
+        // no duplicates
+        sentMessages.clear();
+        iterate(cfs, iter(metadata,
+                          false,
+                          makeRow(metadata,0, 0),
+                          makeRow(metadata,0, 1),
+                          makeRow(metadata,0, 2)));
+        assertCommandIssued(sentMessages, false);
+
+        // now test with a duplicate row and see that we issue a snapshot command
+        sentMessages.clear();
+        iterate(cfs, iter(metadata,
+                          false,
+                          makeRow(metadata, 0, 0),
+                          makeRow(metadata, 0, 1),
+                          makeRow(metadata, 0, 1)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    private void iterate(ColumnFamilyStore cfs, UnfilteredPartitionIterator partitions)
+    {
+
+        try (CompactionController controller = new CompactionController(getCurrentColumnFamilyStore(), Integer.MAX_VALUE);
+             ISSTableScanner scanner = scanner(cfs, partitions);
+             CompactionIterator iter = new CompactionIterator(OperationType.COMPACTION,
+                                                              Collections.singletonList(scanner),
+                                                              controller, FBUtilities.nowInSeconds(), null))
+        {
+            while (iter.hasNext())
+            {
+                try (UnfilteredRowIterator partition = iter.next())
+                {
+                    partition.forEachRemaining(u -> {});
+                }
+            }
+        }
+    }
+
+    private ISSTableScanner scanner(final ColumnFamilyStore cfs, final UnfilteredPartitionIterator partitions)
+    {
+
+        return new ISSTableScanner()
+        {
+            public long getLengthInBytes() { return 0; }
+
+            public long getCurrentPosition() { return 0; }
+
+            public String getBackingFiles() { return cfs.getLiveSSTables().iterator().next().toString(); }
+
+            public boolean isForThrift() { return false; }
+
+            public CFMetaData metadata() { return cfs.metadata; }
+
+            public void close() { }
+
+            public boolean hasNext() { return partitions.hasNext(); }
+
+            public UnfilteredRowIterator next() { return partitions.next(); }
+        };
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
index 1d77b17..ce85dc5 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java

@@ -24,9 +24,6 @@
 
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -34,28 +31,31 @@
 
 public class CompactionsCQLTest extends CQLTester
 {
+
+    public static final int SLEEP_TIME = 5000;
+
     @Test
     public void testTriggerMinorCompactionSTCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, true);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, true);
     }
 
     @Test
     public void testTriggerMinorCompactionLCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY) WITH compaction = {'class':'LeveledCompactionStrategy', 'sstable_size_in_mb':1};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, true);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, true);
     }
 
 
@@ -63,80 +63,93 @@
     public void testTriggerMinorCompactionDTCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY) WITH compaction = {'class':'DateTieredCompactionStrategy', 'min_threshold':2};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1') using timestamp 1000"); // same timestamp = same window = minor compaction triggered
         flush();
         execute("insert into %s (id) values ('1') using timestamp 1000");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, true);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, true);
     }
 
     @Test
+    public void testTriggerMinorCompactionTWCS() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY) WITH compaction = {'class':'TimeWindowCompactionStrategy', 'min_threshold':2};");
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        execute("insert into %s (id) values ('1')");
+        flush();
+        execute("insert into %s (id) values ('1')");
+        flush();
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, true);
+    }
+
+
+    @Test
     public void testTriggerNoMinorCompactionSTCSDisabled() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':false};");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, false);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, false);
     }
 
     @Test
     public void testTriggerMinorCompactionSTCSNodetoolEnabled() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':false};");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         getCurrentColumnFamilyStore().enableAutoCompaction();
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, true);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, true);
     }
 
     @Test
     public void testTriggerNoMinorCompactionSTCSNodetoolDisabled() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':true};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         getCurrentColumnFamilyStore().disableAutoCompaction();
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, false);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, false);
     }
 
     @Test
     public void testTriggerNoMinorCompactionSTCSAlterTable() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':true};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("ALTER TABLE %s WITH compaction = {'class': 'SizeTieredCompactionStrategy', 'enabled': false}");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, false);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, false);
     }
 
     @Test
     public void testTriggerMinorCompactionSTCSAlterTable() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':false};");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("ALTER TABLE %s WITH compaction = {'class': 'SizeTieredCompactionStrategy', 'min_threshold': 2, 'enabled': true}");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
         flush();
-        waitForMinor(KEYSPACE, currentTable(), 5000, true);
+        waitForMinor(KEYSPACE, currentTable(), SLEEP_TIME, true);
     }
 
     @Test
@@ -146,16 +159,15 @@
         Map<String, String> localOptions = new HashMap<>();
         localOptions.put("class", "DateTieredCompactionStrategy");
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        WrappingCompactionStrategy wrappingCompactionStrategy = (WrappingCompactionStrategy) getCurrentColumnFamilyStore().getCompactionStrategy();
-        assertTrue(verifyStrategies(wrappingCompactionStrategy, DateTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), DateTieredCompactionStrategy.class));
         // altering something non-compaction related
         execute("ALTER TABLE %s WITH gc_grace_seconds = 1000");
         // should keep the local compaction strat
-        assertTrue(verifyStrategies(wrappingCompactionStrategy, DateTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), DateTieredCompactionStrategy.class));
         // altering a compaction option
         execute("ALTER TABLE %s WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':3}");
         // will use the new option
-        assertTrue(verifyStrategies(wrappingCompactionStrategy, SizeTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), SizeTieredCompactionStrategy.class));
     }
 
 
@@ -167,12 +179,12 @@
         localOptions.put("class", "DateTieredCompactionStrategy");
         localOptions.put("enabled", "false");
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
         localOptions.clear();
         localOptions.put("class", "DateTieredCompactionStrategy");
         // localOptions.put("enabled", "true"); - this is default!
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
     }
 
 
@@ -184,24 +196,10 @@
         localOptions.put("class", "DateTieredCompactionStrategy");
 
         getCurrentColumnFamilyStore().disableAutoCompaction();
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
 
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategy().isEnabled());
-
-    }
-
-    @Test
-    public void testTopLevelDeletion() throws Throwable
-    {
-        createTable("CREATE TABLE %s (id int PRIMARY KEY, id2 text)");
-        execute("delete from %s where id = 22");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
-        for (SSTableReader sstable : getCurrentColumnFamilyStore().getSSTables())
-            assertFalse(sstable.getSSTableMetadata().estimatedTombstoneDropTime.getAsMap().isEmpty());
-        getCurrentColumnFamilyStore().forceMajorCompaction();
-        for (SSTableReader sstable : getCurrentColumnFamilyStore().getSSTables())
-            assertFalse(sstable.getSSTableMetadata().estimatedTombstoneDropTime.getAsMap().isEmpty());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
     }
 
 
@@ -216,10 +214,10 @@
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
     }
 
-    public boolean verifyStrategies(WrappingCompactionStrategy wrappingStrategy, Class<? extends AbstractCompactionStrategy> expected)
+    public boolean verifyStrategies(CompactionStrategyManager manager, Class<? extends AbstractCompactionStrategy> expected)
     {
         boolean found = false;
-        for (AbstractCompactionStrategy actualStrategy : wrappingStrategy.getWrappedStrategies())
+        for (AbstractCompactionStrategy actualStrategy : manager.getStrategies())
         {
             if (!actualStrategy.getClass().equals(expected))
                 return false;
@@ -228,11 +226,6 @@
         return found;
     }
 
-    private ColumnFamilyStore getCurrentColumnFamilyStore()
-    {
-        return Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
-    }
-
     private void waitForMinor(String keyspace, String cf, long maxWaitTime, boolean shouldFind) throws Throwable
     {
         long startTime = System.currentTimeMillis();

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
index 4a1f2ca..f02f4c2 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java

@@ -21,35 +21,27 @@
 import java.util.Collection;
 import java.util.concurrent.ExecutionException;
 
-import org.apache.cassandra.cache.CachingOptions;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
-
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.Util;
-
-import static org.junit.Assert.assertEquals;
-import static org.apache.cassandra.db.KeyspaceTest.assertColumns;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-
-import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
-
-import static org.apache.cassandra.Util.cellname;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.junit.Assert.*;
+import static org.apache.cassandra.Util.dk;
 
 public class CompactionsPurgeTest
 {
@@ -67,26 +59,22 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
         SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD1));
         SchemaLoader.createKeyspace(KEYSPACE_CACHED,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHED).caching(CachingOptions.ALL));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHED).caching(CachingParams.CACHE_EVERYTHING));
         SchemaLoader.createKeyspace(KEYSPACE_CQL,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     CFMetaData.compile("CREATE TABLE " + CF_CQL + " ("
-                                                     + "k int PRIMARY KEY,"
-                                                     + "v1 text,"
-                                                     + "v2 int"
-                                                     + ")", KEYSPACE_CQL));
+                                            + "k int PRIMARY KEY,"
+                                            + "v1 text,"
+                                            + "v2 int"
+                                            + ")", KEYSPACE_CQL));
     }
 
     @Test
@@ -98,39 +86,40 @@
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        DecoratedKey key = Util.dk("key1");
-        Mutation rm;
+        String key = "key1";
 
         // inserts
-        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
         }
-        rm.applyUnsafe();
+
         cfs.forceBlockingFlush();
 
         // deletes
         for (int i = 0; i < 10; i++)
         {
-            rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.delete(cfName, cellname(String.valueOf(i)), 1);
-            rm.applyUnsafe();
+            RowUpdateBuilder.deleteRow(cfs.metadata, 1, key, String.valueOf(i)).applyUnsafe();
         }
         cfs.forceBlockingFlush();
 
         // resurrect one column
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
-        rm.applyUnsafe();
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 2, key);
+        builder.clustering(String.valueOf(5))
+               .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+               .build().applyUnsafe();
+
         cfs.forceBlockingFlush();
 
         // major compact and test that all columns but the resurrected one is completely gone
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
-        cfs.invalidateCachedRow(key);
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertColumns(cf, "5");
-        assertNotNull(cf.getColumn(cellname(String.valueOf(5))));
+        cfs.invalidateCachedPartition(dk(key));
+
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertEquals(1, partition.rowCount());
     }
 
     @Test
@@ -142,24 +131,22 @@
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        DecoratedKey key = Util.dk("key1");
-        Mutation rm;
+        String key = "key1";
 
         // inserts
-        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
         }
-        rm.apply();
         cfs.forceBlockingFlush();
 
         // deletes
         for (int i = 0; i < 10; i++)
         {
-            rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.delete(cfName, cellname(String.valueOf(i)), Long.MAX_VALUE);
-            rm.apply();
+            RowUpdateBuilder.deleteRow(cfs.metadata, Long.MAX_VALUE, key, String.valueOf(i)).applyUnsafe();
         }
         cfs.forceBlockingFlush();
 
@@ -167,15 +154,17 @@
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
 
         // resurrect one column
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
-        rm.apply();
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 2, key);
+        builder.clustering(String.valueOf(5))
+               .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+               .build().applyUnsafe();
+
         cfs.forceBlockingFlush();
 
-        cfs.invalidateCachedRow(key);
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertColumns(cf, "5");
-        assert cf.getColumn(cellname(String.valueOf(5))) != null;
+        cfs.invalidateCachedPartition(dk(key));
+
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertEquals(1, partition.rowCount());
     }
 
     @Test
@@ -187,38 +176,38 @@
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        DecoratedKey key = Util.dk("key1");
-        Mutation rm;
+        String key = "key1";
 
         // inserts
-        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
         }
-        rm.apply();
         cfs.forceBlockingFlush();
 
-        // delete
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.delete(cfName, Long.MAX_VALUE);
-        rm.apply();
-
+        new Mutation(KEYSPACE1, dk(key))
+            .add(PartitionUpdate.fullPartitionDelete(cfs.metadata, dk(key), Long.MAX_VALUE, FBUtilities.nowInSeconds()))
+            .applyUnsafe();
         cfs.forceBlockingFlush();
 
-        // major compact - tombstone should be purged
+        // major compact - tombstones should be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
 
         // resurrect one column
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
-        rm.apply();
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 2, key);
+        builder.clustering(String.valueOf(5))
+               .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+               .build().applyUnsafe();
+
         cfs.forceBlockingFlush();
 
-        cfs.invalidateCachedRow(key);
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertColumns(cf, "5");
-        assert cf.getColumn(cellname(String.valueOf(5))) != null;
+        cfs.invalidateCachedPartition(dk(key));
+
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertEquals(1, partition.rowCount());
     }
 
     @Test
@@ -230,38 +219,37 @@
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        DecoratedKey key = Util.dk("key1");
-        Mutation rm;
+        String key = "key1";
 
         // inserts
-        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
         }
-        rm.apply();
         cfs.forceBlockingFlush();
 
-        // delete
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.deleteRange(cfName, cellname(String.valueOf(0)), cellname(String.valueOf(9)), Long.MAX_VALUE);
-        rm.apply();
-
+        new RowUpdateBuilder(cfs.metadata, Long.MAX_VALUE, dk(key))
+            .addRangeTombstone(String.valueOf(0), String.valueOf(9)).build().applyUnsafe();
         cfs.forceBlockingFlush();
 
-        // major compact - tombstone should be purged
+        // major compact - tombstones should be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
 
         // resurrect one column
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
-        rm.apply();
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 2, key);
+        builder.clustering(String.valueOf(5))
+               .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+               .build().applyUnsafe();
+
         cfs.forceBlockingFlush();
 
-        cfs.invalidateCachedRow(key);
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertColumns(cf, "5");
-        assert cf.getColumn(cellname(String.valueOf(5))) != null;
+        cfs.invalidateCachedPartition(dk(key));
+
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertEquals(1, partition.rowCount());
     }
 
     @Test
@@ -273,26 +261,25 @@
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        Mutation rm;
         for (int k = 1; k <= 2; ++k) {
-            DecoratedKey key = Util.dk("key" + k);
+            String key = "key" + k;
 
             // inserts
-            rm = new Mutation(KEYSPACE2, key.getKey());
             for (int i = 0; i < 10; i++)
             {
-                rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+                RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+                builder.clustering(String.valueOf(i))
+                        .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                        .build().applyUnsafe();
             }
-            rm.applyUnsafe();
             cfs.forceBlockingFlush();
 
             // deletes
             for (int i = 0; i < 10; i++)
             {
-                rm = new Mutation(KEYSPACE2, key.getKey());
-                rm.delete(cfName, cellname(String.valueOf(i)), 1);
-                rm.applyUnsafe();
+                RowUpdateBuilder.deleteRow(cfs.metadata, 1, key, String.valueOf(i)).applyUnsafe();
             }
+
             cfs.forceBlockingFlush();
         }
 
@@ -302,22 +289,24 @@
         // flush, remember the current sstable and then resurrect one column
         // for first key. Then submit minor compaction on remembered sstables.
         cfs.forceBlockingFlush();
-        Collection<SSTableReader> sstablesIncomplete = cfs.getSSTables();
-        rm = new Mutation(KEYSPACE2, key1.getKey());
-        rm.add(cfName, cellname(String.valueOf(5)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2);
-        rm.applyUnsafe();
+        Collection<SSTableReader> sstablesIncomplete = cfs.getLiveSSTables();
+
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 2, "key1");
+        builder.clustering(String.valueOf(5))
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build().applyUnsafe();
+
         cfs.forceBlockingFlush();
-        cfs.getCompactionStrategy().getUserDefinedTask(sstablesIncomplete, Integer.MAX_VALUE).execute(null);
+        cfs.getCompactionStrategyManager().getUserDefinedTask(sstablesIncomplete, Integer.MAX_VALUE).execute(null);
 
         // verify that minor compaction does GC when key is provably not
         // present in a non-compacted sstable
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key2, cfName, System.currentTimeMillis()));
-        assertNull(cf);
+        Util.assertEmpty(Util.cmd(cfs, key2).build());
 
         // verify that minor compaction still GC when key is present
         // in a non-compacted sstable but the timestamp ensures we won't miss anything
-        cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key1, cfName, System.currentTimeMillis()));
-        assertEquals(1, cf.getColumnCount());
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key1).build());
+        assertEquals(1, partition.rowCount());
     }
 
     /**
@@ -331,36 +320,40 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE2);
         String cfName = "Standard1";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        Mutation rm;
-        DecoratedKey key3 = Util.dk("key3");
+        final boolean enforceStrictLiveness = cfs.metadata.enforceStrictLiveness();
+        String key3 = "key3";
 
         // inserts
-        rm = new Mutation(KEYSPACE2, key3.getKey());
-        rm.add(cfName, cellname("c1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8);
-        rm.add(cfName, cellname("c2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 8, key3)
+            .clustering("c1")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build().applyUnsafe();
+
+        new RowUpdateBuilder(cfs.metadata, 8, key3)
+        .clustering("c2")
+        .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+        .build().applyUnsafe();
+
         cfs.forceBlockingFlush();
         // delete c1
-        rm = new Mutation(KEYSPACE2, key3.getKey());
-        rm.delete(cfName, cellname("c1"), 10);
-        rm.applyUnsafe();
+        RowUpdateBuilder.deleteRow(cfs.metadata, 10, key3, "c1").applyUnsafe();
+
         cfs.forceBlockingFlush();
-        Collection<SSTableReader> sstablesIncomplete = cfs.getSSTables();
+        Collection<SSTableReader> sstablesIncomplete = cfs.getLiveSSTables();
 
         // delete c2 so we have new delete in a diffrent SSTable
-        rm = new Mutation(KEYSPACE2, key3.getKey());
-        rm.delete(cfName, cellname("c2"), 9);
-        rm.applyUnsafe();
+        RowUpdateBuilder.deleteRow(cfs.metadata, 9, key3, "c2").applyUnsafe();
         cfs.forceBlockingFlush();
 
         // compact the sstables with the c1/c2 data and the c1 tombstone
-        cfs.getCompactionStrategy().getUserDefinedTask(sstablesIncomplete, Integer.MAX_VALUE).execute(null);
+        cfs.getCompactionStrategyManager().getUserDefinedTask(sstablesIncomplete, Integer.MAX_VALUE).execute(null);
 
         // We should have both the c1 and c2 tombstones still. Since the min timestamp in the c2 tombstone
         // sstable is older than the c1 tombstone, it is invalid to throw out the c1 tombstone.
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key3, cfName, System.currentTimeMillis()));
-        assertFalse(cf.getColumn(cellname("c2")).isLive());
-        assertEquals(2, cf.getColumnCount());
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key3).build());
+        assertEquals(2, partition.rowCount());
+        for (Row row : partition)
+            assertFalse(row.hasLiveData(FBUtilities.nowInSeconds(), enforceStrictLiveness));
     }
 
     @Test
@@ -372,34 +365,33 @@
         String cfName = "Standard2";
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        DecoratedKey key = Util.dk("key1");
-        Mutation rm;
+        String key = "key1";
 
         // inserts
-        rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i < 5; i++)
         {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
         }
-        rm.applyUnsafe();
 
         // deletes
         for (int i = 0; i < 5; i++)
         {
-            rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.delete(cfName, cellname(String.valueOf(i)), 1);
-            rm.applyUnsafe();
+            RowUpdateBuilder.deleteRow(cfs.metadata, 1, key, String.valueOf(i)).applyUnsafe();
         }
         cfs.forceBlockingFlush();
-        assertEquals(String.valueOf(cfs.getSSTables()), 1, cfs.getSSTables().size()); // inserts & deletes were in the same memtable -> only deletes in sstable
+        assertEquals(String.valueOf(cfs.getLiveSSTables()), 1, cfs.getLiveSSTables().size()); // inserts & deletes were in the same memtable -> only deletes in sstable
 
         // compact and test that the row is completely gone
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
-        assertTrue(cfs.getSSTables().isEmpty());
-        ColumnFamily cf = keyspace.getColumnFamilyStore(cfName).getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertNull(String.valueOf(cf), cf);
+        assertTrue(cfs.getLiveSSTables().isEmpty());
+
+        Util.assertEmpty(Util.cmd(cfs, key).build());
     }
 
+
     @Test
     public void testCompactionPurgeCachedRow() throws ExecutionException, InterruptedException
     {
@@ -410,42 +402,35 @@
         Keyspace keyspace = Keyspace.open(keyspaceName);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
-        DecoratedKey key = Util.dk("key3");
-        Mutation rm;
+        String key = "key3";
 
         // inserts
-        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 10; i++)
         {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, 0, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
         }
+
+        // deletes partition
+        Mutation rm = new Mutation(KEYSPACE_CACHED, dk(key));
+        rm.add(PartitionUpdate.fullPartitionDelete(cfs.metadata, dk(key), 1, FBUtilities.nowInSeconds()));
         rm.applyUnsafe();
 
-        // move the key up in row cache
-        cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
+        // Adds another unrelated partition so that the sstable is not considered fully expired. We do not
+        // invalidate the row cache in that latter case.
+        new RowUpdateBuilder(cfs.metadata, 0, "key4").clustering("c").add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER).build().applyUnsafe();
 
-        // deletes row
-        rm = new Mutation(keyspaceName, key.getKey());
-        rm.delete(cfName, 1);
-        rm.applyUnsafe();
+        // move the key up in row cache (it should not be empty since we have the partition deletion info)
+        assertFalse(Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).isEmpty());
 
         // flush and major compact
         cfs.forceBlockingFlush();
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
 
-        // re-inserts with timestamp lower than delete
-        rm = new Mutation(keyspaceName, key.getKey());
-        for (int i = 0; i < 10; i++)
-        {
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        }
-        rm.applyUnsafe();
-
-        // Check that the second insert did went in
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis()));
-        assertEquals(10, cf.getColumnCount());
-        for (Cell c : cf)
-            assertTrue(c.isLive());
+        // Since we've force purging (by passing MAX_VALUE for gc_before), the row should have been invalidated and we should have no deletion info anymore
+        Util.assertEmpty(Util.cmd(cfs, key).build());
     }
 
     @Test
@@ -457,41 +442,45 @@
         String cfName = "Standard1";
         Keyspace keyspace = Keyspace.open(keyspaceName);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
-        DecoratedKey key = Util.dk("key3");
-        Mutation rm;
-        QueryFilter filter = QueryFilter.getIdentityFilter(key, cfName, System.currentTimeMillis());
+        String key = "key3";
 
         // inserts
-        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 10; i++)
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, i, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
+        }
+
+        // deletes partition with timestamp such that not all columns are deleted
+        Mutation rm = new Mutation(KEYSPACE1, dk(key));
+        rm.add(PartitionUpdate.fullPartitionDelete(cfs.metadata, dk(key), 4, FBUtilities.nowInSeconds()));
         rm.applyUnsafe();
 
-        // deletes row with timestamp such that not all columns are deleted
-        rm = new Mutation(keyspaceName, key.getKey());
-        rm.delete(cfName, 4);
-        rm.applyUnsafe();
-        ColumnFamily cf = cfs.getColumnFamily(filter);
-        assertTrue(cf.isMarkedForDelete());
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertFalse(partition.partitionLevelDeletion().isLive());
 
         // flush and major compact (with tombstone purging)
         cfs.forceBlockingFlush();
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
-        assertFalse(cfs.getColumnFamily(filter).isMarkedForDelete());
+        assertFalse(Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).isEmpty());
 
         // re-inserts with timestamp lower than delete
-        rm = new Mutation(keyspaceName, key.getKey());
         for (int i = 0; i < 5; i++)
-            rm.add(cfName, cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
-        rm.applyUnsafe();
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, i, key);
+            builder.clustering(String.valueOf(i))
+                   .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                   .build().applyUnsafe();
+        }
 
         // Check that the second insert went in
-        cf = cfs.getColumnFamily(filter);
-        assertEquals(10, cf.getColumnCount());
-        for (Cell c : cf)
-            assertTrue(c.isLive());
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertEquals(10, partition.rowCount());
     }
 
+
     @Test
     public void testRowTombstoneObservedBeforePurging()
     {
@@ -501,48 +490,48 @@
         cfs.disableAutoCompaction();
 
         // write a row out to one sstable
-        executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
-                                      keyspace, table, 1, "foo", 1));
+        QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
+                                                     keyspace, table, 1, "foo", 1));
         cfs.forceBlockingFlush();
 
-        UntypedResultSet result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(1, result.size());
 
         // write a row tombstone out to a second sstable
-        executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        QueryProcessor.executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
         cfs.forceBlockingFlush();
 
         // basic check that the row is considered deleted
-        assertEquals(2, cfs.getSSTables().size());
-        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        assertEquals(2, cfs.getLiveSSTables().size());
+        result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(0, result.size());
 
         // compact the two sstables with a gcBefore that does *not* allow the row tombstone to be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, (int) (System.currentTimeMillis() / 1000) - 10000, false));
 
         // the data should be gone, but the tombstone should still exist
-        assertEquals(1, cfs.getSSTables().size());
-        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        assertEquals(1, cfs.getLiveSSTables().size());
+        result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(0, result.size());
 
         // write a row out to one sstable
-        executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
-                                      keyspace, table, 1, "foo", 1));
+        QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
+                                                     keyspace, table, 1, "foo", 1));
         cfs.forceBlockingFlush();
-        assertEquals(2, cfs.getSSTables().size());
-        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        assertEquals(2, cfs.getLiveSSTables().size());
+        result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(1, result.size());
 
         // write a row tombstone out to a different sstable
-        executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        QueryProcessor.executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
         cfs.forceBlockingFlush();
 
         // compact the two sstables with a gcBefore that *does* allow the row tombstone to be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, (int) (System.currentTimeMillis() / 1000) + 10000, false));
 
         // both the data and the tombstone should be gone this time
-        assertEquals(0, cfs.getSSTables().size());
-        result = executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
+        assertEquals(0, cfs.getLiveSSTables().size());
+        result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(0, result.size());
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
index 471f8cf..28725c7 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java

@@ -19,40 +19,47 @@
 package org.apache.cassandra.db.compaction;
 
 import java.io.File;
-import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 
-import org.apache.cassandra.OrderedJUnit4ClassRunner;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.io.sstable.*;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
 import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Sets;
+import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.columniterator.SSTableIterator;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.ColumnData;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.service.MigrationManager;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.*;
 
@@ -60,6 +67,7 @@
 public class CompactionsTest
 {
     private static final String KEYSPACE1 = "Keyspace1";
+    private static final String CF_DENSE1 = "CF_DENSE1";
     private static final String CF_STANDARD1 = "CF_STANDARD1";
     private static final String CF_STANDARD2 = "Standard2";
     private static final String CF_STANDARD3 = "Standard3";
@@ -73,35 +81,44 @@
     {
         Map<String, String> compactionOptions = new HashMap<>();
         compactionOptions.put("tombstone_compaction_interval", "1");
+
+        // Disable tombstone histogram rounding for tests
+        System.setProperty("cassandra.streaminghistogram.roundseconds", "1");
+
         SchemaLoader.prepareServer();
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1).compactionStrategyOptions(compactionOptions),
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.denseCFMD(KEYSPACE1, CF_DENSE1)
+                                                .compaction(CompactionParams.scts(compactionOptions)),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1)
+                                                .compaction(CompactionParams.scts(compactionOptions)),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD3),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD4),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER1, LongType.instance),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CF_SUPER5, BytesType.instance),
-                                    SchemaLoader.superCFMD(KEYSPACE1, CF_SUPERGC, BytesType.instance).gcGraceSeconds(0));
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_SUPER1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_SUPER5),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_SUPERGC)
+                                                .gcGraceSeconds(0));
     }
 
-    public ColumnFamilyStore testSingleSSTableCompaction(String strategyClassName) throws Exception
+    // Test to see if sstable has enough expired columns, it is compacted itself.
+    @Test
+    public void testSingleSSTableCompaction() throws Exception
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_DENSE1);
         store.clearUnsafe();
         store.metadata.gcGraceSeconds(1);
-        store.setCompactionStrategyClass(strategyClassName);
 
         // disable compaction while flushing
         store.disableAutoCompaction();
 
-        long timestamp = populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); //ttl=3s
+        long timestamp = populate(KEYSPACE1, CF_DENSE1, 0, 9, 3); //ttl=3s
 
         store.forceBlockingFlush();
-        assertEquals(1, store.getSSTables().size());
-        long originalSize = store.getSSTables().iterator().next().uncompressedLength();
+        assertEquals(1, store.getLiveSSTables().size());
+        long originalSize = store.getLiveSSTables().iterator().next().uncompressedLength();
 
         // wait enough to force single compaction
         TimeUnit.SECONDS.sleep(5);
@@ -115,100 +132,84 @@
         } while (CompactionManager.instance.getPendingTasks() > 0 || CompactionManager.instance.getActiveCompactions() > 0);
 
         // and sstable with ttl should be compacted
-        assertEquals(1, store.getSSTables().size());
-        long size = store.getSSTables().iterator().next().uncompressedLength();
+        assertEquals(1, store.getLiveSSTables().size());
+        long size = store.getLiveSSTables().iterator().next().uncompressedLength();
         assertTrue("should be less than " + originalSize + ", but was " + size, size < originalSize);
 
         // make sure max timestamp of compacted sstables is recorded properly after compaction.
         assertMaxTimestamp(store, timestamp);
-
-        return store;
     }
 
     public static long populate(String ks, String cf, int startRowKey, int endRowKey, int ttl)
     {
         long timestamp = System.currentTimeMillis();
+        CFMetaData cfm = Keyspace.open(ks).getColumnFamilyStore(cf).metadata;
         for (int i = startRowKey; i <= endRowKey; i++)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(ks, key.getKey());
             for (int j = 0; j < 10; j++)
-                rm.add(cf,  Util.cellname(Integer.toString(j)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       j > 0 ? ttl : 0); // let first column never expire, since deleting all columns does not produce sstable
-            rm.applyUnsafe();
+            {
+                new RowUpdateBuilder(cfm, timestamp, j > 0 ? ttl : 0, key.getKey())
+                    .clustering(Integer.toString(j))
+                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+            }
         }
         return timestamp;
     }
 
-    /**
-     * Test to see if sstable has enough expired columns, it is compacted itself.
-     */
     @Test
-    public void testSingleSSTableCompactionWithSizeTieredCompaction() throws Exception
-    {
-        testSingleSSTableCompaction(SizeTieredCompactionStrategy.class.getCanonicalName());
-    }
-
-    @Test
-    public void testSingleSSTableCompactionWithLeveledCompaction() throws Exception
-    {
-        ColumnFamilyStore store = testSingleSSTableCompaction(LeveledCompactionStrategy.class.getCanonicalName());
-        WrappingCompactionStrategy strategy = (WrappingCompactionStrategy) store.getCompactionStrategy();
-        // tombstone removal compaction should not promote level
-        assert strategy.getSSTableCountPerLevel()[0] == 1;
-    }
-
-    @Test
-    public void testSuperColumnTombstones() throws IOException
+    public void testSuperColumnTombstones()
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Super1");
+        CFMetaData table = cfs.metadata;
         cfs.disableAutoCompaction();
 
         DecoratedKey key = Util.dk("tskey");
         ByteBuffer scName = ByteBufferUtil.bytes("TestSuperColumn");
 
         // a subcolumn
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add("Super1", Util.cellname(scName, ByteBufferUtil.bytes(0)),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               FBUtilities.timestampMicros());
-        rm.applyUnsafe();
+        new RowUpdateBuilder(table, FBUtilities.timestampMicros(), key.getKey())
+        .clustering(ByteBufferUtil.bytes("cols"))
+        .add("val", "val1")
+        .build().applyUnsafe();
         cfs.forceBlockingFlush();
 
         // shadow the subcolumn with a supercolumn tombstone
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.deleteRange("Super1", SuperColumns.startOf(scName), SuperColumns.endOf(scName), FBUtilities.timestampMicros());
-        rm.applyUnsafe();
+        RowUpdateBuilder.deleteRow(table, FBUtilities.timestampMicros(), key.getKey(), ByteBufferUtil.bytes("cols")).applyUnsafe();
         cfs.forceBlockingFlush();
 
         CompactionManager.instance.performMaximal(cfs, false);
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
         // check that the shadowed column is gone
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
-        AbstractBounds<RowPosition> bounds = new Bounds<RowPosition>(key, sstable.partitioner.getMinimumToken().maxKeyBound());
-        ISSTableScanner scanner = sstable.getScanner(new DataRange(bounds, new IdentityQueryFilter()));
-        OnDiskAtomIterator iter = scanner.next();
-        assertEquals(key, iter.getKey());
-        assertTrue(iter.next() instanceof RangeTombstone);
-        assertFalse(iter.hasNext());
-        scanner.close();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        AbstractBounds<PartitionPosition> bounds = new Bounds<>(key, sstable.getPartitioner().getMinimumToken().maxKeyBound());
+        UnfilteredRowIterator ai;
+        try (ISSTableScanner scanner = sstable.getScanner())
+        {
+            ai = scanner.next();
+            final Unfiltered next = ai.next();
+            assertTrue(next.isRow());
+            assertFalse(ai.hasNext());
+        }
     }
 
     @Test
     public void testUncheckedTombstoneSizeTieredCompaction() throws Exception
     {
+        Map<String, String> compactionOptions = new HashMap<>();
+        compactionOptions.put("tombstone_compaction_interval", "1");
+        compactionOptions.put("unchecked_tombstone_compaction", "false");
+
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
         store.clearUnsafe();
-        store.metadata.gcGraceSeconds(1);
-        store.metadata.compactionStrategyOptions.put("tombstone_compaction_interval", "1");
-        store.metadata.compactionStrategyOptions.put("unchecked_tombstone_compaction", "false");
+
+        MigrationManager.announceColumnFamilyUpdate(store.metadata.params(TableParams.builder(store.metadata.params).gcGraceSeconds(1).compaction(CompactionParams.scts(compactionOptions)).build()), true);
         store.reload();
-        store.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getName());
 
         // disable compaction while flushing
         store.disableAutoCompaction();
@@ -221,9 +222,9 @@
         long timestamp2 = populate(KEYSPACE1, CF_STANDARD1, 10, 19, 3); //ttl=3s
         store.forceBlockingFlush();
 
-        assertEquals(2, store.getSSTables().size());
+        assertEquals(2, store.getLiveSSTables().size());
 
-        Iterator<SSTableReader> it = store.getSSTables().iterator();
+        Iterator<SSTableReader> it = store.getLiveSSTables().iterator();
         long originalSize1 = it.next().uncompressedLength();
         long originalSize2 = it.next().uncompressedLength();
 
@@ -240,8 +241,8 @@
 
         // even though both sstables were candidate for tombstone compaction
         // it was not executed because they have an overlapping token range
-        assertEquals(2, store.getSSTables().size());
-        it = store.getSSTables().iterator();
+        assertEquals(2, store.getLiveSSTables().size());
+        it = store.getLiveSSTables().iterator();
         long newSize1 = it.next().uncompressedLength();
         long newSize2 = it.next().uncompressedLength();
         assertEquals("candidate sstable should not be tombstone-compacted because its key range overlap with other sstable",
@@ -250,7 +251,8 @@
                      originalSize2, newSize2);
 
         // now let's enable the magic property
-        store.metadata.compactionStrategyOptions.put("unchecked_tombstone_compaction", "true");
+        compactionOptions.put("unchecked_tombstone_compaction", "true");
+        MigrationManager.announceColumnFamilyUpdate(store.metadata.params(TableParams.builder(store.metadata.params).gcGraceSeconds(1).compaction(CompactionParams.scts(compactionOptions)).build()), true);
         store.reload();
 
         //submit background task again and wait for it to complete
@@ -261,8 +263,8 @@
         } while (CompactionManager.instance.getPendingTasks() > 0 || CompactionManager.instance.getActiveCompactions() > 0);
 
         //we still have 2 sstables, since they were not compacted against each other
-        assertEquals(2, store.getSSTables().size());
-        it = store.getSSTables().iterator();
+        assertEquals(2, store.getLiveSSTables().size());
+        it = store.getLiveSSTables().iterator();
         newSize1 = it.next().uncompressedLength();
         newSize2 = it.next().uncompressedLength();
         assertTrue("should be less than " + originalSize1 + ", but was " + newSize1, newSize1 < originalSize1);
@@ -275,72 +277,18 @@
     public static void assertMaxTimestamp(ColumnFamilyStore cfs, long maxTimestampExpected)
     {
         long maxTimestampObserved = Long.MIN_VALUE;
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             maxTimestampObserved = Math.max(sstable.getMaxTimestamp(), maxTimestampObserved);
         assertEquals(maxTimestampExpected, maxTimestampObserved);
     }
 
     @Test
-    public void testEchoedRow()
+    public void testDontPurgeAccidentally() throws InterruptedException
     {
-        // This test check that EchoedRow doesn't skipp rows: see CASSANDRA-2653
-
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard2");
-
-        // disable compaction while flushing
-        cfs.disableAutoCompaction();
-
-        // Insert 4 keys in two sstables. We need the sstables to have 2 rows
-        // at least to trigger what was causing CASSANDRA-2653
-        for (int i=1; i < 5; i++)
-        {
-            DecoratedKey key = Util.dk(String.valueOf(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add("Standard2", Util.cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
-            rm.applyUnsafe();
-
-            if (i % 2 == 0)
-                cfs.forceBlockingFlush();
-        }
-        Collection<SSTableReader> toCompact = cfs.getSSTables();
-        assertEquals(2, toCompact.size());
-
-        // Reinserting the same keys. We will compact only the previous sstable, but we need those new ones
-        // to make sure we use EchoedRow, otherwise it won't be used because purge can be done.
-        for (int i=1; i < 5; i++)
-        {
-            DecoratedKey key = Util.dk(String.valueOf(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add("Standard2", Util.cellname(String.valueOf(i)), ByteBufferUtil.EMPTY_BYTE_BUFFER, i);
-            rm.applyUnsafe();
-        }
-        cfs.forceBlockingFlush();
-        SSTableReader tmpSSTable = null;
-        for (SSTableReader sstable : cfs.getSSTables())
-            if (!toCompact.contains(sstable))
-                tmpSSTable = sstable;
-        assertNotNull(tmpSSTable);
-
-        // Force compaction on first sstables. Since each row is in only one sstable, we will be using EchoedRow.
-        Util.compact(cfs, toCompact);
-        assertEquals(2, cfs.getSSTables().size());
-
-        // Now, we remove the sstable that was just created to force the use of EchoedRow (so that it doesn't hide the problem)
-        cfs.markObsolete(Collections.singleton(tmpSSTable), OperationType.UNKNOWN);
-        assertEquals(1, cfs.getSSTables().size());
-
-        // Now assert we do have the 4 keys
-        assertEquals(4, Util.getRangeSlice(cfs).size());
-    }
-
-    @Test
-    public void testDontPurgeAccidentaly() throws InterruptedException
-    {
-        testDontPurgeAccidentaly("test1", "Super5");
+        testDontPurgeAccidentally("test1", CF_SUPER5);
 
         // Use CF with gc_grace=0, see last bug of CASSANDRA-2786
-        testDontPurgeAccidentaly("test1", "SuperDirectGC");
+        testDontPurgeAccidentally("test1", CF_SUPERGC);
     }
 
     @Test
@@ -349,6 +297,7 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         final String cfname = "Standard3"; // use clean(no sstable) CF
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        CFMetaData table = cfs.metadata;
 
         // disable compaction while flushing
         cfs.disableAutoCompaction();
@@ -356,14 +305,13 @@
         final int ROWS_PER_SSTABLE = 10;
         for (int i = 0; i < ROWS_PER_SSTABLE; i++) {
             DecoratedKey key = Util.dk(String.valueOf(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add(cfname, Util.cellname("col"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                   System.currentTimeMillis());
-            rm.applyUnsafe();
+            new RowUpdateBuilder(table, FBUtilities.timestampMicros(), key.getKey())
+            .clustering(ByteBufferUtil.bytes("cols"))
+            .add("val", "val1")
+            .build().applyUnsafe();
         }
         cfs.forceBlockingFlush();
-        Collection<SSTableReader> sstables = cfs.getSSTables();
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
 
         assertEquals(1, sstables.size());
         SSTableReader sstable = sstables.iterator().next();
@@ -378,11 +326,28 @@
             Thread.sleep(100);
         } while (CompactionManager.instance.getPendingTasks() > 0 || CompactionManager.instance.getActiveCompactions() > 0);
         // CF should have only one sstable with generation number advanced
-        sstables = cfs.getSSTables();
+        sstables = cfs.getLiveSSTables();
         assertEquals(1, sstables.size());
         assertEquals( prevGeneration + 1, sstables.iterator().next().descriptor.generation);
     }
 
+    public static void writeSSTableWithRangeTombstoneMaskingOneColumn(ColumnFamilyStore cfs, CFMetaData table, int[] dks) {
+        for (int dk : dks)
+        {
+            RowUpdateBuilder deletedRowUpdateBuilder = new RowUpdateBuilder(table, 1, Util.dk(Integer.toString(dk)));
+            deletedRowUpdateBuilder.clustering("01").add("val", "a"); //Range tombstone covers this (timestamp 2 > 1)
+            Clustering startClustering = new Clustering(ByteBufferUtil.bytes("0"));
+            Clustering endClustering = new Clustering(ByteBufferUtil.bytes("b"));
+            deletedRowUpdateBuilder.addRangeTombstone(new RangeTombstone(Slice.make(startClustering, endClustering), new DeletionTime(2, (int) (System.currentTimeMillis() / 1000))));
+            deletedRowUpdateBuilder.build().applyUnsafe();
+
+            RowUpdateBuilder notYetDeletedRowUpdateBuilder = new RowUpdateBuilder(table, 3, Util.dk(Integer.toString(dk)));
+            notYetDeletedRowUpdateBuilder.clustering("02").add("val", "a"); //Range tombstone doesn't cover this (timestamp 3 > 2)
+            notYetDeletedRowUpdateBuilder.build().applyUnsafe();
+        }
+        cfs.forceBlockingFlush();
+    }
+
     @Test
     public void testRangeTombstones()
     {
@@ -393,103 +358,67 @@
         // disable compaction while flushing
         cfs.disableAutoCompaction();
 
-        final CFMetaData cfmeta = cfs.metadata;
-        Directories dir = cfs.directories;
+        final CFMetaData table = cfs.metadata;
+        Directories dir = cfs.getDirectories();
 
         ArrayList<DecoratedKey> keys = new ArrayList<DecoratedKey>();
 
         for (int i=0; i < 4; i++)
         {
-            keys.add(Util.dk(""+i));
+            keys.add(Util.dk(Integer.toString(i)));
         }
 
-        ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfmeta);
-        cf.addColumn(Util.column("01", "a", 1)); // this must not resurrect
-        cf.addColumn(Util.column("a", "a", 3));
-        cf.deletionInfo().add(new RangeTombstone(Util.cellname("0"), Util.cellname("b"), 2, (int) (System.currentTimeMillis()/1000)),cfmeta.comparator);
+        int[] dks = {0, 1, 3};
+        writeSSTableWithRangeTombstoneMaskingOneColumn(cfs, table, dks);
 
-        try (SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables())), 0, 0, 0);)
-        {
-            writer.append(Util.dk("0"), cf);
-            writer.append(Util.dk("1"), cf);
-            writer.append(Util.dk("3"), cf);
+        int[] dkays = {0, 1, 2, 3};
+        writeSSTableWithRangeTombstoneMaskingOneColumn(cfs, table, dkays);
 
-            cfs.addSSTable(writer.finish(true));
-        }
-
-        try (SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables())), 0, 0, 0);)
-        {
-            writer.append(Util.dk("0"), cf);
-            writer.append(Util.dk("1"), cf);
-            writer.append(Util.dk("2"), cf);
-            writer.append(Util.dk("3"), cf);
-            cfs.addSSTable(writer.finish(true));
-        }
-
-
-        Collection<SSTableReader> toCompact = cfs.getSSTables();
+        Collection<SSTableReader> toCompact = cfs.getLiveSSTables();
         assert toCompact.size() == 2;
 
-        // Force compaction on first sstables. Since each row is in only one sstable, we will be using EchoedRow.
         Util.compact(cfs, toCompact);
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
         // Now assert we do have the 4 keys
-        assertEquals(4, Util.getRangeSlice(cfs).size());
+        assertEquals(4, Util.getAll(Util.cmd(cfs).build()).size());
 
-        ArrayList<DecoratedKey> k = new ArrayList<DecoratedKey>();
-        for (Row r : Util.getRangeSlice(cfs))
+        ArrayList<DecoratedKey> k = new ArrayList<>();
+
+        for (FilteredPartition p : Util.getAll(Util.cmd(cfs).build()))
         {
-            k.add(r.key);
-            assertEquals(ByteBufferUtil.bytes("a"),r.cf.getColumn(Util.cellname("a")).value());
-            assertNull(r.cf.getColumn(Util.cellname("01")));
-            assertEquals(3,r.cf.getColumn(Util.cellname("a")).timestamp());
+            k.add(p.partitionKey());
+            final SinglePartitionReadCommand command = SinglePartitionReadCommand.create(cfs.metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(cfs.metadata), RowFilter.NONE, DataLimits.NONE, p.partitionKey(), new ClusteringIndexSliceFilter(Slices.ALL, false));
+            try (ReadOrderGroup orderGroup = command.startOrderGroup();
+                 PartitionIterator iterator = command.executeInternal(orderGroup))
+            {
+                try (RowIterator rowIterator = iterator.next())
+                {
+                    Row row = rowIterator.next();
+                    Cell cell = row.getCell(cfs.metadata.getColumnDefinition(new ColumnIdentifier("val", false)));
+                    assertEquals(ByteBufferUtil.bytes("a"), cell.value());
+                    assertEquals(3, cell.timestamp());
+                    assertNotSame(ByteBufferUtil.bytes("01"), row.clustering().getRawValues()[0]);
+                    assertEquals(ByteBufferUtil.bytes("02"), row.clustering().getRawValues()[0]);
+                }
+            }
         }
-
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
             StatsMetadata stats = sstable.getSSTableMetadata();
-            assertEquals(ByteBufferUtil.bytes("0"), stats.minColumnNames.get(0));
-            assertEquals(ByteBufferUtil.bytes("b"), stats.maxColumnNames.get(0));
+            assertEquals(ByteBufferUtil.bytes("0"), stats.minClusteringValues.get(0));
+            assertEquals(ByteBufferUtil.bytes("b"), stats.maxClusteringValues.get(0));
         }
 
         assertEquals(keys, k);
     }
 
-    @Test
-    public void testCompactionLog() throws Exception
-    {
-        SystemKeyspace.discardCompactionsInProgress();
-
-        String cf = "Standard4";
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(cf);
-        SchemaLoader.insertData(KEYSPACE1, cf, 0, 1);
-        cfs.forceBlockingFlush();
-
-        Collection<SSTableReader> sstables = cfs.getSSTables();
-        assertFalse(sstables.isEmpty());
-        Set<Integer> generations = Sets.newHashSet(Iterables.transform(sstables, new Function<SSTableReader, Integer>()
-        {
-            public Integer apply(SSTableReader sstable)
-            {
-                return sstable.descriptor.generation;
-            }
-        }));
-        UUID taskId = SystemKeyspace.startCompaction(cfs, sstables);
-        Map<Pair<String, String>, Map<Integer, UUID>> compactionLogs = SystemKeyspace.getUnfinishedCompactions();
-        Set<Integer> unfinishedCompactions = compactionLogs.get(Pair.create(KEYSPACE1, cf)).keySet();
-        assertTrue(unfinishedCompactions.containsAll(generations));
-
-        SystemKeyspace.finishCompaction(taskId);
-        compactionLogs = SystemKeyspace.getUnfinishedCompactions();
-        assertFalse(compactionLogs.containsKey(Pair.create(KEYSPACE1, cf)));
-    }
-
-    private void testDontPurgeAccidentaly(String k, String cfname) throws InterruptedException
+    private void testDontPurgeAccidentally(String k, String cfname) throws InterruptedException
     {
         // This test catches the regression of CASSANDRA-2786
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        CFMetaData table = cfs.metadata;
 
         // disable compaction while flushing
         cfs.clearUnsafe();
@@ -497,31 +426,31 @@
 
         // Add test row
         DecoratedKey key = Util.dk(k);
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(cfname, Util.cellname(ByteBufferUtil.bytes("sc"), ByteBufferUtil.bytes("c")), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-        rm.applyUnsafe();
+        RowUpdateBuilder rowUpdateBuilder = new RowUpdateBuilder(table, 0, key);
+        rowUpdateBuilder.clustering("c").add("val", "a");
+        rowUpdateBuilder.build().applyUnsafe();
 
         cfs.forceBlockingFlush();
 
-        Collection<SSTableReader> sstablesBefore = cfs.getSSTables();
+        Collection<SSTableReader> sstablesBefore = cfs.getLiveSSTables();
 
-        QueryFilter filter = QueryFilter.getIdentityFilter(key, cfname, System.currentTimeMillis());
-        assertTrue(cfs.getColumnFamily(filter).hasColumns());
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertTrue(!partition.isEmpty());
 
+        RowUpdateBuilder deleteRowBuilder = new RowUpdateBuilder(table, 2, key);
+        deleteRowBuilder.clustering("c").delete("val");
+        deleteRowBuilder.build().applyUnsafe();
         // Remove key
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.delete(cfname, 2);
-        rm.applyUnsafe();
 
-        ColumnFamily cf = cfs.getColumnFamily(filter);
-        assertTrue("should be empty: " + cf, cf == null || !cf.hasColumns());
+        partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        assertTrue(partition.iterator().next().cells().iterator().next().isTombstone());
 
         // Sleep one second so that the removal is indeed purgeable even with gcgrace == 0
         Thread.sleep(1000);
 
         cfs.forceBlockingFlush();
 
-        Collection<SSTableReader> sstablesAfter = cfs.getSSTables();
+        Collection<SSTableReader> sstablesAfter = cfs.getLiveSSTables();
         Collection<SSTableReader> toCompact = new ArrayList<SSTableReader>();
         for (SSTableReader sstable : sstablesAfter)
             if (!sstablesBefore.contains(sstable))
@@ -529,8 +458,27 @@
 
         Util.compact(cfs, toCompact);
 
-        cf = cfs.getColumnFamily(filter);
-        assertTrue("should be empty: " + cf, cf == null || !cf.hasColumns());
+        SSTableReader newSSTable = null;
+        for (SSTableReader reader : cfs.getLiveSSTables())
+        {
+            assert !toCompact.contains(reader);
+            if (!sstablesBefore.contains(reader))
+                newSSTable = reader;
+        }
+
+        // We cannot read the data, since {@link ReadCommand#withoutPurgeableTombstones} will purge droppable tombstones
+        // but we just want to check here that compaction did *NOT* drop the tombstone, so we read from the SSTable directly
+        // instead
+        ISSTableScanner scanner = newSSTable.getScanner();
+        assertTrue(scanner.hasNext());
+        UnfilteredRowIterator rowIt = scanner.next();
+        assertTrue(rowIt.hasNext());
+        Unfiltered unfiltered = rowIt.next();
+        assertTrue(unfiltered.isRow());
+        Row row = (Row)unfiltered;
+        assertTrue(row.cells().iterator().next().isTombstone());
+        assertFalse(rowIt.hasNext());
+        assertFalse(scanner.hasNext());
     }
 
     private static Range<Token> rangeFor(int start, int end)
@@ -550,10 +498,16 @@
     private static void insertRowWithKey(int key)
     {
         long timestamp = System.currentTimeMillis();
-        DecoratedKey decoratedKey = Util.dk(String.format("%03d", key));
+        DecoratedKey dk = Util.dk(String.format("%03d", key));
+        new RowUpdateBuilder(Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1).metadata, timestamp, dk.getKey())
+                .add("val", "val")
+                .build()
+                .applyUnsafe();
+        /*
         Mutation rm = new Mutation(KEYSPACE1, decoratedKey.getKey());
         rm.add("CF_STANDARD1", Util.cellname("col"), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, 1000);
         rm.applyUnsafe();
+        */
     }
 
     @Test
@@ -578,8 +532,8 @@
         }
         store.forceBlockingFlush();
 
-        assertEquals(1, store.getSSTables().size());
-        SSTableReader sstable = store.getSSTables().iterator().next();
+        assertEquals(1, store.getLiveSSTables().size());
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
 
 
         // contiguous range spans all data

diff --git a/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java
index 5208401..1c4387c 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java

@@ -24,31 +24,39 @@
 import java.io.RandomAccessFile;
 import java.util.*;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.After;
+import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.*;
+
 import static org.junit.Assert.assertTrue;
-import static org.apache.cassandra.Util.cellname;
 
 public class CorruptedSSTablesCompactionsTest
 {
+    private static final Logger logger = LoggerFactory.getLogger(CorruptedSSTablesCompactionsTest.class);
+
+    private static Random random;
+
     private static final String KEYSPACE1 = "CorruptedSSTablesCompactionsTest";
-    private static final String CF_STANDARD1 = "Standard1";
-    // seed hardcoded to one we know works:
-    private static final Random random = new Random(1);
+    private static final String STANDARD_STCS = "Standard_STCS";
+    private static final String STANDARD_LCS = "Standard_LCS";
+    private static int maxValueSize;
 
     @After
     public void leakDetect() throws InterruptedException
@@ -62,14 +70,40 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        long seed = System.nanoTime();
+
+        //long seed = 754271160974509L; // CASSANDRA-9530: use this seed to reproduce compaction failures if reading empty rows
+        //long seed = 2080431860597L; // CASSANDRA-12359: use this seed to reproduce undetected corruptions
+        //long seed = 9823169134884L; // CASSANDRA-15879: use this seed to reproduce duplicate clusterings
+
+        logger.info("Seed {}", seed);
+        random = new Random(seed);
+
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+                                    KeyspaceParams.simple(1),
+                                    makeTable(STANDARD_STCS).compaction(CompactionParams.DEFAULT),
+                                    makeTable(STANDARD_LCS).compaction(CompactionParams.lcs(Collections.emptyMap())));
+
+        maxValueSize = DatabaseDescriptor.getMaxValueSize();
+        DatabaseDescriptor.setMaxValueSize(1024 * 1024);
         closeStdErr();
     }
 
+    /**
+     * Return a table metadata, we use types with fixed size to increase the chance of detecting corrupt data
+     */
+    private static CFMetaData makeTable(String tableName)
+    {
+        return SchemaLoader.standardCFMD(KEYSPACE1, tableName, 1, LongType.instance, LongType.instance, LongType.instance);
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.setMaxValueSize(maxValueSize);
+    }
+
     public static void closeStdErr()
     {
         // These tests generate an error message per CorruptSSTableException since it goes through
@@ -83,71 +117,79 @@
     @Test
     public void testCorruptedSSTablesWithSizeTieredCompactionStrategy() throws Exception
     {
-        testCorruptedSSTables(SizeTieredCompactionStrategy.class.getCanonicalName());
+        testCorruptedSSTables(STANDARD_STCS);
     }
 
     @Test
     public void testCorruptedSSTablesWithLeveledCompactionStrategy() throws Exception
     {
-        testCorruptedSSTables(LeveledCompactionStrategy.class.getCanonicalName());
+        testCorruptedSSTables(STANDARD_LCS);
     }
 
-    public void testCorruptedSSTables(String compactionStrategy) throws Exception
+
+    public void testCorruptedSSTables(String tableName) throws Exception
     {
         // this test does enough rows to force multiple block indexes to be used
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(tableName);
 
         final int ROWS_PER_SSTABLE = 10;
-        final int SSTABLES = cfs.metadata.getMinIndexInterval() * 2 / ROWS_PER_SSTABLE;
+        final int SSTABLES = cfs.metadata.params.minIndexInterval * 2 / ROWS_PER_SSTABLE;
+        final int SSTABLES_TO_CORRUPT = 8;
 
-        cfs.setCompactionStrategyClass(compactionStrategy);
+        assertTrue(String.format("Not enough sstables (%d), expected at least %d sstables to corrupt", SSTABLES, SSTABLES_TO_CORRUPT),
+                   SSTABLES > SSTABLES_TO_CORRUPT);
 
         // disable compaction while flushing
         cfs.disableAutoCompaction();
         //test index corruption
         //now create a few new SSTables
         long maxTimestampExpected = Long.MIN_VALUE;
-        Set<DecoratedKey> inserted = new HashSet<DecoratedKey>();
+        Set<DecoratedKey> inserted = new HashSet<>();
+
         for (int j = 0; j < SSTABLES; j++)
         {
             for (int i = 0; i < ROWS_PER_SSTABLE; i++)
             {
-                DecoratedKey key = Util.dk(String.valueOf(i % 2));
-                Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+                DecoratedKey key = Util.dk(String.valueOf(i), LongType.instance);
                 long timestamp = j * ROWS_PER_SSTABLE + i;
-                rm.add("Standard1", cellname(i / 2), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp);
+                new RowUpdateBuilder(cfs.metadata, timestamp, key.getKey())
+                        .clustering(Long.valueOf(i))
+                        .add("val", Long.valueOf(i))
+                        .build()
+                        .applyUnsafe();
                 maxTimestampExpected = Math.max(timestamp, maxTimestampExpected);
-                rm.applyUnsafe();
                 inserted.add(key);
             }
             cfs.forceBlockingFlush();
             CompactionsTest.assertMaxTimestamp(cfs, maxTimestampExpected);
-            assertEquals(inserted.toString(), inserted.size(), Util.getRangeSlice(cfs).size());
+            assertEquals(inserted.toString(), inserted.size(), Util.getAll(Util.cmd(cfs).build()).size());
         }
 
-        Collection<SSTableReader> sstables = cfs.getSSTables();
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
         int currentSSTable = 0;
-        int sstablesToCorrupt = 8;
 
         // corrupt first 'sstablesToCorrupt' SSTables
         for (SSTableReader sstable : sstables)
         {
-            if(currentSSTable + 1 > sstablesToCorrupt)
+            if (currentSSTable + 1 > SSTABLES_TO_CORRUPT)
                 break;
 
             RandomAccessFile raf = null;
 
             try
             {
+                int corruptionSize = 100;
                 raf = new RandomAccessFile(sstable.getFilename(), "rw");
                 assertNotNull(raf);
-                assertTrue(raf.length() > 20);
-                raf.seek(random.nextInt((int)(raf.length() - 20)));
+                assertTrue(raf.length() > corruptionSize);
+                long pos = random.nextInt((int)(raf.length() - corruptionSize));
+                logger.info("Corrupting sstable {} [{}] at pos {} / {}", currentSSTable, sstable.getFilename(), pos, raf.length());
+                raf.seek(pos);
                 // We want to write something large enough that the corruption cannot get undetected
                 // (even without compression)
-                byte[] corruption = new byte[20];
-                Arrays.fill(corruption, (byte)0xFF);
+                byte[] corruption = new byte[corruptionSize];
+                random.nextBytes(corruption);
                 raf.write(corruption);
 
             }
@@ -176,12 +218,10 @@
                 failures++;
                 continue;
             }
-
-            assertEquals(sstablesToCorrupt + 1, cfs.getSSTables().size());
             break;
         }
 
         cfs.truncateBlocking();
-        assertEquals(sstablesToCorrupt, failures);
+        assertEquals(SSTABLES_TO_CORRUPT, failures);
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
index 3bc830b..aa886b4 100644
--- a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -29,14 +29,14 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.db.compaction.DateTieredCompactionStrategy.getBuckets;
@@ -53,11 +53,14 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        // Disable tombstone histogram rounding for tests
+        System.setProperty("cassandra.streaminghistogram.roundseconds", "1");
+
         SchemaLoader.prepareServer();
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+                KeyspaceParams.simple(1),
+                SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
     }
 
     @Test
@@ -224,14 +227,15 @@
         for (int r = 0; r < numSSTables; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add(CF_STANDARD1, Util.cellname("column"), value, r);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, r, key.getKey())
+                .clustering("column")
+                .add("val", value).build().applyUnsafe();
+
             cfs.forceBlockingFlush();
         }
         cfs.forceBlockingFlush();
 
-        List<SSTableReader> sstrs = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
 
         List<SSTableReader> newBucket = newestBucket(Collections.singletonList(sstrs.subList(0, 2)), 4, 32, 9, 10, Long.MAX_VALUE, new SizeTieredCompactionStrategyOptions());
         assertTrue("incoming bucket should not be accepted when it has below the min threshold SSTables", newBucket.isEmpty());
@@ -259,15 +263,16 @@
         for (int r = 0; r < numSSTables; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add(CF_STANDARD1, Util.cellname("column"), value, r);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, r, key.getKey())
+                .clustering("column")
+                .add("val", value).build().applyUnsafe();
+
             cfs.forceBlockingFlush();
         }
         cfs.forceBlockingFlush();
 
         Iterable<SSTableReader> filtered;
-        List<SSTableReader> sstrs = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
 
         filtered = filterOldSSTables(sstrs, 0, 2);
         assertEquals("when maxSSTableAge is zero, no sstables should be filtered", sstrs.size(), Iterables.size(filtered));
@@ -295,18 +300,21 @@
 
         // create 2 sstables
         DecoratedKey key = Util.dk(String.valueOf("expired"));
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(CF_STANDARD1, Util.cellname("column"), value, System.currentTimeMillis(), 1);
-        rm.apply();
+        new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), 1, key.getKey())
+            .clustering("column")
+            .add("val", value).build().applyUnsafe();
+
         cfs.forceBlockingFlush();
-        SSTableReader expiredSSTable = cfs.getSSTables().iterator().next();
+        SSTableReader expiredSSTable = cfs.getLiveSSTables().iterator().next();
         Thread.sleep(10);
+
         key = Util.dk(String.valueOf("nonexpired"));
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.add(CF_STANDARD1, Util.cellname("column"), value, System.currentTimeMillis());
-        rm.apply();
+        new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), key.getKey())
+            .clustering("column")
+            .add("val", value).build().applyUnsafe();
+
         cfs.forceBlockingFlush();
-        assertEquals(cfs.getSSTables().size(), 2);
+        assertEquals(cfs.getLiveSSTables().size(), 2);
 
         Map<String, String> options = new HashMap<>();
 
@@ -315,7 +323,7 @@
         options.put(DateTieredCompactionStrategyOptions.MAX_SSTABLE_AGE_KEY, Double.toString((1d / (24 * 60 * 60))));
         options.put(DateTieredCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0");
         DateTieredCompactionStrategy dtcs = new DateTieredCompactionStrategy(cfs, options);
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             dtcs.addSSTable(sstable);
         dtcs.startup();
         assertNull(dtcs.getNextBackgroundTask((int) (System.currentTimeMillis() / 1000)));
@@ -345,9 +353,9 @@
             for (int i = 0; i < 10; i++)
             {
                 DecoratedKey key = Util.dk(String.valueOf(r));
-                Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-                rm.add(CF_STANDARD1, Util.cellname("column"), bigValue, timestamp);
-                rm.apply();
+                new RowUpdateBuilder(cfs.metadata, timestamp, key.getKey())
+                    .clustering("column")
+                    .add("val", bigValue).build().applyUnsafe();
             }
             cfs.forceBlockingFlush();
         }
@@ -355,15 +363,15 @@
         for (int r = 0; r < numSSTables / 2; r++)
         {
             DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add(CF_STANDARD1, Util.cellname("column"), value, timestamp);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, timestamp, key.getKey())
+                .clustering("column")
+                .add("val", value).build().applyUnsafe();
             cfs.forceBlockingFlush();
         }
         Map<String, String> options = new HashMap<>();
         options.put(SizeTieredCompactionStrategyOptions.MIN_SSTABLE_SIZE_KEY, "1");
         DateTieredCompactionStrategy dtcs = new DateTieredCompactionStrategy(cfs, options);
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
             dtcs.addSSTable(sstable);
         AbstractCompactionTask task = dtcs.getNextBackgroundTask(0);
         assertEquals(20, task.transaction.originals().size());

diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
index 0047678..2cda2e8 100644
--- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -20,15 +20,12 @@
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.HashMap;
+import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 import java.util.Random;
 import java.util.UUID;
-import java.util.concurrent.ExecutionException;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import junit.framework.Assert;
 import org.junit.After;
 import org.junit.Before;
@@ -42,20 +39,20 @@
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.notifications.SSTableAddedNotification;
 import org.apache.cassandra.notifications.SSTableRepairStatusChanged;
 import org.apache.cassandra.repair.RepairJobDesc;
 import org.apache.cassandra.repair.Validator;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -76,15 +73,15 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
-        Map<String, String> leveledOptions = new HashMap<>();
-        leveledOptions.put("sstable_size_in_mb", "1");
+        // Disable tombstone histogram rounding for tests
+        System.setProperty("cassandra.streaminghistogram.roundseconds", "1");
+
         SchemaLoader.prepareServer();
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDDLEVELED)
-                                                .compactionStrategyClass(LeveledCompactionStrategy.class)
-                                                .compactionStrategyOptions(leveledOptions));
+                                                .compaction(CompactionParams.lcs(Collections.singletonMap("sstable_size_in_mb", "1"))));
         }
 
     @Before
@@ -121,18 +118,15 @@
         // Adds enough data to trigger multiple sstable per level
         for (int r = 0; r < rows; r++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            UpdateBuilder update = UpdateBuilder.create(cfs.metadata, String.valueOf(r));
             for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDDLEVELED, Util.cellname("column" + c), value, 0);
-            }
-            rm.apply();
+                update.newRow("column" + c).add("val", value);
+            update.applyUnsafe();
             cfs.forceBlockingFlush();
         }
 
         waitForLeveling(cfs);
-        WrappingCompactionStrategy strategy = (WrappingCompactionStrategy) cfs.getCompactionStrategy();
+        CompactionStrategyManager strategy =  cfs.getCompactionStrategyManager();
         // Checking we're not completely bad at math
         int l1Count = strategy.getSSTableCountPerLevel()[1];
         int l2Count = strategy.getSSTableCountPerLevel()[2];
@@ -144,7 +138,7 @@
             Assert.fail();
         }
 
-        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategy().groupSSTablesForAntiCompaction(cfs.getSSTables());
+        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(cfs.getLiveSSTables());
         for (Collection<SSTableReader> sstableGroup : groupedSSTables)
         {
             int groupLevel = -1;
@@ -179,27 +173,24 @@
         // Adds enough data to trigger multiple sstable per level
         for (int r = 0; r < rows; r++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            UpdateBuilder update = UpdateBuilder.create(cfs.metadata, String.valueOf(r));
             for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDDLEVELED, Util.cellname("column" + c), value, 0);
-            }
-            rm.applyUnsafe();
+                update.newRow("column" + c).add("val", value);
+            update.applyUnsafe();
             cfs.forceBlockingFlush();
         }
 
         waitForLeveling(cfs);
-        WrappingCompactionStrategy strategy = (WrappingCompactionStrategy) cfs.getCompactionStrategy();
+        CompactionStrategyManager strategy =  cfs.getCompactionStrategyManager();
         // Checking we're not completely bad at math
         assertTrue(strategy.getSSTableCountPerLevel()[1] > 0);
         assertTrue(strategy.getSSTableCountPerLevel()[2] > 0);
 
         Range<Token> range = new Range<>(Util.token(""), Util.token(""));
-        int gcBefore = keyspace.getColumnFamilyStore(CF_STANDARDDLEVELED).gcBefore(System.currentTimeMillis());
+        int gcBefore = keyspace.getColumnFamilyStore(CF_STANDARDDLEVELED).gcBefore(FBUtilities.nowInSeconds());
         UUID parentRepSession = UUID.randomUUID();
-        ActiveRepairService.instance.registerParentRepairSession(parentRepSession, FBUtilities.getBroadcastAddress(), Arrays.asList(cfs), Arrays.asList(range), false, true);
-        RepairJobDesc desc = new RepairJobDesc(parentRepSession, UUID.randomUUID(), KEYSPACE1, CF_STANDARDDLEVELED, range);
+        ActiveRepairService.instance.registerParentRepairSession(parentRepSession, FBUtilities.getBroadcastAddress(), Arrays.asList(cfs), Arrays.asList(range), false, System.currentTimeMillis(), true);
+        RepairJobDesc desc = new RepairJobDesc(parentRepSession, UUID.randomUUID(), KEYSPACE1, CF_STANDARDDLEVELED, Arrays.asList(range));
         Validator validator = new Validator(desc, FBUtilities.getBroadcastAddress(), gcBefore);
         CompactionManager.instance.submitValidation(cfs, validator).get();
     }
@@ -209,7 +200,7 @@
      */
     public static void waitForLeveling(ColumnFamilyStore cfs) throws InterruptedException
     {
-        WrappingCompactionStrategy strategyManager = (WrappingCompactionStrategy)cfs.getCompactionStrategy();
+        CompactionStrategyManager strategyManager = cfs.getCompactionStrategyManager();
         while (true)
         {
             // since we run several compaction strategies we wait until L0 in all strategies is empty and
@@ -217,7 +208,7 @@
             // so it should be good enough
             boolean allL0Empty = true;
             boolean anyL1NonEmpty = false;
-            for (AbstractCompactionStrategy strategy : strategyManager.getWrappedStrategies())
+            for (AbstractCompactionStrategy strategy : strategyManager.getStrategies())
             {
                 if (!(strategy instanceof LeveledCompactionStrategy))
                     return;
@@ -245,18 +236,15 @@
         int columns = 10;
         for (int r = 0; r < rows; r++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            UpdateBuilder update = UpdateBuilder.create(cfs.metadata, String.valueOf(r));
             for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDDLEVELED, Util.cellname("column" + c), value, 0);
-            }
-            rm.applyUnsafe();
+                update.newRow("column" + c).add("val", value);
+            update.applyUnsafe();
             cfs.forceBlockingFlush();
         }
 
         waitForLeveling(cfs);
-        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) ((WrappingCompactionStrategy) cfs.getCompactionStrategy()).getWrappedStrategies().get(1);
+        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) (cfs.getCompactionStrategyManager()).getStrategies().get(1);
         assert strategy.getLevelSize(1) > 0;
 
         // get LeveledScanner for level 1 sstables
@@ -285,20 +273,17 @@
         // Adds enough data to trigger multiple sstable per level
         for (int r = 0; r < rows; r++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            UpdateBuilder update = UpdateBuilder.create(cfs.metadata, String.valueOf(r));
             for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDDLEVELED, Util.cellname("column" + c), value, 0);
-            }
-            rm.applyUnsafe();
+                update.newRow("column" + c).add("val", value);
+            update.applyUnsafe();
             cfs.forceBlockingFlush();
         }
         cfs.forceBlockingFlush();
-        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) ((WrappingCompactionStrategy) cfs.getCompactionStrategy()).getWrappedStrategies().get(1);
+        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) ( cfs.getCompactionStrategyManager()).getStrategies().get(1);
         cfs.forceMajorCompaction();
 
-        for (SSTableReader s : cfs.getSSTables())
+        for (SSTableReader s : cfs.getLiveSSTables())
         {
             assertTrue(s.getSSTableLevel() != 6 && s.getSSTableLevel() > 0);
             strategy.manifest.remove(s);
@@ -307,12 +292,12 @@
             strategy.manifest.add(s);
         }
         // verify that all sstables in the changed set is level 6
-        for (SSTableReader s : cfs.getSSTables())
+        for (SSTableReader s : cfs.getLiveSSTables())
             assertEquals(6, s.getSSTableLevel());
 
         int[] levels = strategy.manifest.getAllLevelSize();
         // verify that the manifest has correct amount of sstables
-        assertEquals(cfs.getSSTables().size(), levels[6]);
+        assertEquals(cfs.getLiveSSTables().size(), levels[6]);
     }
 
     @Test
@@ -329,13 +314,10 @@
         // Adds enough data to trigger multiple sstable per level
         for (int r = 0; r < rows; r++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            UpdateBuilder update = UpdateBuilder.create(cfs.metadata, String.valueOf(r));
             for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDDLEVELED, Util.cellname("column" + c), value, 0);
-            }
-            rm.applyUnsafe();
+                update.newRow("column" + c).add("val", value);
+            update.applyUnsafe();
             cfs.forceBlockingFlush();
         }
         waitForLeveling(cfs);
@@ -344,8 +326,8 @@
         while(CompactionManager.instance.isCompacting(Arrays.asList(cfs)))
             Thread.sleep(100);
 
-        WrappingCompactionStrategy strategy = (WrappingCompactionStrategy) cfs.getCompactionStrategy();
-        List<AbstractCompactionStrategy> strategies = strategy.getWrappedStrategies();
+        CompactionStrategyManager strategy =  cfs.getCompactionStrategyManager();
+        List<AbstractCompactionStrategy> strategies = strategy.getStrategies();
         LeveledCompactionStrategy repaired = (LeveledCompactionStrategy) strategies.get(0);
         LeveledCompactionStrategy unrepaired = (LeveledCompactionStrategy) strategies.get(1);
         assertEquals(0, repaired.manifest.getLevelCount() );
@@ -353,14 +335,14 @@
         assertTrue(strategy.getSSTableCountPerLevel()[1] > 0);
         assertTrue(strategy.getSSTableCountPerLevel()[2] > 0);
 
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
             assertFalse(sstable.isRepaired());
 
         int sstableCount = 0;
         for (List<SSTableReader> level : unrepaired.manifest.generations)
             sstableCount += level.size();
         // we only have unrepaired sstables:
-        assertEquals(sstableCount, cfs.getSSTables().size());
+        assertEquals(sstableCount, cfs.getLiveSSTables().size());
 
         SSTableReader sstable1 = unrepaired.manifest.generations[2].get(0);
         SSTableReader sstable2 = unrepaired.manifest.generations[1].get(0);
@@ -381,42 +363,8 @@
         assertFalse(unrepaired.manifest.generations[2].contains(sstable1));
 
         unrepaired.removeSSTable(sstable2);
-        strategy.handleNotification(new SSTableAddedNotification(sstable2), this);
+        strategy.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable2)), this);
         assertTrue(unrepaired.manifest.getLevel(1).contains(sstable2));
         assertFalse(repaired.manifest.getLevel(1).contains(sstable2));
     }
-
-    @Test
-    public void testDontRemoveLevelInfoUpgradeSSTables() throws InterruptedException, ExecutionException
-    {
-        byte [] b = new byte[100 * 1024];
-        new Random().nextBytes(b);
-        ByteBuffer value = ByteBuffer.wrap(b); // 100 KB value, make it easy to have multiple files
-
-        // Enough data to have a level 1 and 2
-        int rows = 20;
-        int columns = 10;
-
-        // Adds enough data to trigger multiple sstable per level
-        for (int r = 0; r < rows; r++)
-        {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            for (int c = 0; c < columns; c++)
-            {
-                rm.add(CF_STANDARDDLEVELED, Util.cellname("column" + c), value, 0);
-            }
-            rm.apply();
-            cfs.forceBlockingFlush();
-        }
-        waitForLeveling(cfs);
-        cfs.forceBlockingFlush();
-        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) ((WrappingCompactionStrategy) cfs.getCompactionStrategy()).getWrappedStrategies().get(1);
-        assertTrue(strategy.getAllLevelSize()[1] > 0);
-
-        cfs.disableAutoCompaction();
-        cfs.sstablesRewrite(false, 2);
-        assertTrue(strategy.getAllLevelSize()[1] > 0);
-
-    }
 }

diff --git a/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java
index 362afe7..0d5bc81 100644
--- a/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java

@@ -24,11 +24,11 @@
 
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DeletedCell;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.OnDiskAtom;
-import org.apache.cassandra.db.RangeTombstone;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
@@ -49,7 +49,7 @@
     }
 
     @Test
-    public void neverPurgeRangeTombstoneTest() throws Throwable
+    public void neverPurgeRowTombstoneTest() throws Throwable
     {
         testHelper("DELETE FROM %s WHERE a=1 AND b=2");
     }
@@ -80,9 +80,9 @@
         execute("DELETE FROM %s WHERE a=3");
         cfs.forceBlockingFlush();
         cfs.enableAutoCompaction();
-        while (cfs.getSSTables().size() > 1 || !cfs.getTracker().getCompacting().isEmpty())
+        while (cfs.getLiveSSTables().size() > 1 || !cfs.getTracker().getCompacting().isEmpty())
             Thread.sleep(100);
-        verifyContainsTombstones(cfs.getSSTables(), 3);
+        verifyContainsTombstones(cfs.getLiveSSTables(), 3);
     }
 
     private void testHelper(String deletionStatement) throws Throwable
@@ -94,7 +94,7 @@
         Thread.sleep(1000);
         cfs.forceBlockingFlush();
         cfs.forceMajorCompaction();
-        verifyContainsTombstones(cfs.getSSTables(), 1);
+        verifyContainsTombstones(cfs.getLiveSSTables(), 1);
     }
 
     private void verifyContainsTombstones(Collection<SSTableReader> sstables, int expectedTombstoneCount) throws Exception
@@ -106,16 +106,23 @@
         {
             while (scanner.hasNext())
             {
-                try (OnDiskAtomIterator iter = scanner.next())
+                try (UnfilteredRowIterator iter = scanner.next())
                 {
-                    if (iter.getColumnFamily().deletionInfo().getTopLevelDeletion().localDeletionTime < Integer.MAX_VALUE)
+                    if (!iter.partitionLevelDeletion().isLive())
                         tombstoneCount++;
 
                     while (iter.hasNext())
                     {
-                        OnDiskAtom atom = iter.next();
-                        if (atom instanceof DeletedCell || atom instanceof RangeTombstone)
-                            tombstoneCount++;
+                        Unfiltered atom = iter.next();
+                        if (atom.isRow())
+                        {
+                            Row r = (Row)atom;
+                            if (!r.deletion().isLive())
+                                tombstoneCount++;
+                            for (Cell c : r.cells())
+                                if (c.isTombstone())
+                                    tombstoneCount++;
+                        }
                     }
                 }
             }

diff --git a/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
index ec5c280..f55bf52 100644
--- a/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java

@@ -1,44 +1,41 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.db.compaction;
 
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ExecutionException;
-import java.util.Set;
+import java.util.Collections;
 import java.util.HashSet;
-
-import org.apache.cassandra.Util;
+import java.util.Set;
 
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.junit.Assert.assertEquals;
-
-import org.apache.cassandra.db.*;
-
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.junit.Assert.assertEquals;
+
 
 public class OneCompactionTest
 {
@@ -49,13 +46,11 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
-        Map<String, String> leveledOptions = new HashMap<>();
-        leveledOptions.put("sstable_size_in_mb", "1");
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1).compactionStrategyOptions(leveledOptions),
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1)
+                                                .compaction(CompactionParams.lcs(Collections.singletonMap("sstable_size_in_mb", "1"))),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
     }
 
@@ -66,18 +61,21 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(columnFamilyName);
 
-        Set<DecoratedKey> inserted = new HashSet<DecoratedKey>();
+        Set<String> inserted = new HashSet<>();
         for (int j = 0; j < insertsPerTable; j++) {
-            DecoratedKey key = Util.dk(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
-            rm.add(columnFamilyName, Util.cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.applyUnsafe();
+            String key = String.valueOf(j);
+            new RowUpdateBuilder(store.metadata, j, key)
+                .clustering("0")
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .applyUnsafe();
+
             inserted.add(key);
             store.forceBlockingFlush();
-            assertEquals(inserted.size(), Util.getRangeSlice(store).size());
+            assertEquals(inserted.size(), Util.getAll(Util.cmd(store).build()).size());
         }
         CompactionManager.instance.performMaximal(store, false);
-        assertEquals(1, store.getSSTables().size());
+        assertEquals(1, store.getLiveSSTables().size());
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
index 46d54d9..ff0f444 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -18,27 +18,32 @@
 package org.apache.cassandra.db.compaction;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.metrics.RestorableMeter;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.getBuckets;
 import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.mostInterestingBucket;
 import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.trimToThresholdWithHotness;
 import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.validateOptions;
-
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 public class SizeTieredCompactionStrategyTest
 {
@@ -48,14 +53,14 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
-        Map<String, String> leveledOptions = new HashMap<>();
-        leveledOptions.put("sstable_size_in_mb", "1");
+        // Disable tombstone histogram rounding for tests
+        System.setProperty("cassandra.streaminghistogram.roundseconds", "1");
+
         SchemaLoader.prepareServer();
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1)
-                                                .compactionStrategyOptions(leveledOptions));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
     }
 
     @Test
@@ -156,15 +161,15 @@
         int numSSTables = 3;
         for (int r = 0; r < numSSTables; r++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(r));
-            Mutation rm = new Mutation(ksname, key.getKey());
-            rm.add(cfname, Util.cellname("column"), value, 0);
-            rm.applyUnsafe();
+            String key = String.valueOf(r);
+            new RowUpdateBuilder(cfs.metadata, 0, key)
+                .clustering("column").add("val", value)
+                .build().applyUnsafe();
             cfs.forceBlockingFlush();
         }
         cfs.forceBlockingFlush();
 
-        List<SSTableReader> sstrs = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
         Pair<List<SSTableReader>, Double> bucket;
 
         List<SSTableReader> interestingBucket = mostInterestingBucket(Collections.singletonList(sstrs.subList(0, 2)), 4, 32);

diff --git a/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java b/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
index bd1e559..e0378f6 100644
--- a/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.db.compaction;
 /*
- * 
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,32 +15,36 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
  */
+package org.apache.cassandra.db.compaction;
 
-import org.junit.BeforeClass;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
 import com.google.common.collect.Multimap;
 import com.google.common.collect.Sets;
+import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.tools.SSTableExpiredBlockers;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Set;
-
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
@@ -55,11 +57,24 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        // Disable tombstone histogram rounding for tests
+        System.setProperty("cassandra.streaminghistogram.roundseconds", "1");
+
         SchemaLoader.prepareServer();
+
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+                                    KeyspaceParams.simple(1),
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD1)
+                                                      .addPartitionKey("pKey", AsciiType.instance)
+                                                      .addRegularColumn("col1", AsciiType.instance)
+                                                      .addRegularColumn("col", AsciiType.instance)
+                                                      .addRegularColumn("col311", AsciiType.instance)
+                                                      .addRegularColumn("col2", AsciiType.instance)
+                                                      .addRegularColumn("col3", AsciiType.instance)
+                                                      .addRegularColumn("col7", AsciiType.instance)
+                                                      .addRegularColumn("col8", MapType.getInstance(AsciiType.instance, AsciiType.instance, true))
+                                                      .addRegularColumn("shadow", AsciiType.instance)
+                                                      .build().gcGraceSeconds(0));
     }
 
     @Test
@@ -68,33 +83,55 @@
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1");
         cfs.disableAutoCompaction();
         cfs.metadata.gcGraceSeconds(0);
+        String key = "ttl";
+        new RowUpdateBuilder(cfs.metadata, 1L, 1, key)
+                    .add("col1", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
 
-        DecoratedKey ttlKey = Util.dk("ttl");
-        Mutation rm = new Mutation(KEYSPACE1, ttlKey.getKey());
-        rm.add("Standard1", Util.cellname("col1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 1, 1);
-        rm.add("Standard1", Util.cellname("col2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 3, 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 3L, 1, key)
+                    .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+        cfs.forceBlockingFlush();
+        new RowUpdateBuilder(cfs.metadata, 2L, 1, key)
+                    .add("col1", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
+        new RowUpdateBuilder(cfs.metadata, 5L, 1, key)
+                    .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KEYSPACE1, ttlKey.getKey());
-        rm.add("Standard1", Util.cellname("col1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 2, 1);
-        rm.add("Standard1", Util.cellname("col2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 5, 1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, 4L, 1, key)
+                    .add("col1", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
+        new RowUpdateBuilder(cfs.metadata, 7L, 1, key)
+                    .add("shadow", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KEYSPACE1, ttlKey.getKey());
-        rm.add("Standard1", Util.cellname("col1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 4, 1);
-        rm.add("Standard1", Util.cellname("shadow"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 7, 1);
-        rm.applyUnsafe();
+
+        new RowUpdateBuilder(cfs.metadata, 6L, 3, key)
+                    .add("shadow", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
+        new RowUpdateBuilder(cfs.metadata, 8L, 1, key)
+                    .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KEYSPACE1, ttlKey.getKey());
-        rm.add("Standard1", Util.cellname("shadow"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 6, 3);
-        rm.add("Standard1", Util.cellname("col2"), ByteBufferUtil.EMPTY_BYTE_BUFFER, 8, 1);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-
-        Set<SSTableReader> sstables = Sets.newHashSet(cfs.getSSTables());
+        Set<SSTableReader> sstables = Sets.newHashSet(cfs.getLiveSSTables());
         int now = (int)(System.currentTimeMillis() / 1000);
         int gcBefore = now + 2;
         Set<SSTableReader> expired = CompactionController.getFullyExpiredSSTables(
@@ -110,105 +147,112 @@
     @Test
     public void testSimpleExpire() throws InterruptedException
     {
+        testSimpleExpire(false);
+    }
+
+    @Test
+    public void testBug10944() throws InterruptedException
+    {
+        // Reproduction for CASSANDRA-10944 (at the time of the bug)
+        testSimpleExpire(true);
+    }
+
+    public void testSimpleExpire(boolean force10944Bug) throws InterruptedException
+    {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1");
+        cfs.truncateBlocking();
         cfs.disableAutoCompaction();
-        cfs.metadata.gcGraceSeconds(0);
+        // To reproduce #10944, we need our gcBefore to be equal to the locaDeletionTime. A gcGrace of 1 will (almost always) give us that.
+        cfs.metadata.gcGraceSeconds(force10944Bug ? 1 : 0);
         long timestamp = System.currentTimeMillis();
-        Mutation rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-        rm.add("Standard1", Util.cellname("col"),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               timestamp,
-               1);
-        rm.add("Standard1", Util.cellname("col7"),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               timestamp,
-               1);
+        String key = "ttl";
+        new RowUpdateBuilder(cfs.metadata, timestamp, 1, key)
+                        .add("col", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                        .add("col7", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                        .build()
+                        .applyUnsafe();
 
-        rm.applyUnsafe();
         cfs.forceBlockingFlush();
 
-        rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-                rm.add("Standard1", Util.cellname("col2"),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       1);
-                rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, timestamp, 1, key)
+            .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .addMapEntry("col8", "bar", "foo")
+            .delete("col1")
+            .build()
+            .applyUnsafe();
+
+
         cfs.forceBlockingFlush();
-        rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-        rm.add("Standard1", Util.cellname("col3"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                   timestamp,
-                   1);
-        rm.applyUnsafe();
+        // To reproduce #10944, we need to avoid the optimization that get rid of full sstable because everything
+        // is known to be gcAble, so keep some data non-expiring in that case.
+        new RowUpdateBuilder(cfs.metadata, timestamp, force10944Bug ? 0 : 1, key)
+                    .add("col3", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
+
         cfs.forceBlockingFlush();
-        rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-        rm.add("Standard1", Util.cellname("col311"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                   timestamp,
-                   1);
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, timestamp, 1, key)
+                            .add("col311", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                            .build()
+                            .applyUnsafe();
+
 
         cfs.forceBlockingFlush();
         Thread.sleep(2000); // wait for ttl to expire
-        assertEquals(4, cfs.getSSTables().size());
+        assertEquals(4, cfs.getLiveSSTables().size());
         cfs.enableAutoCompaction(true);
-        assertEquals(0, cfs.getSSTables().size());
+        assertEquals(force10944Bug ? 1 : 0, cfs.getLiveSSTables().size());
     }
 
     @Test
     public void testNoExpire() throws InterruptedException, IOException
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1");
+        cfs.truncateBlocking();
         cfs.disableAutoCompaction();
         cfs.metadata.gcGraceSeconds(0);
         long timestamp = System.currentTimeMillis();
-        Mutation rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-        rm.add("Standard1", Util.cellname("col"),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               timestamp,
-               1);
-        rm.add("Standard1", Util.cellname("col7"),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               timestamp,
-               1);
+        String key = "ttl";
+        new RowUpdateBuilder(cfs.metadata, timestamp, 1, key)
+            .add("col", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .add("col7", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
 
-        rm.applyUnsafe();
         cfs.forceBlockingFlush();
+        new RowUpdateBuilder(cfs.metadata, timestamp, 1, key)
+            .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+        cfs.forceBlockingFlush();
+        new RowUpdateBuilder(cfs.metadata, timestamp, 1, key)
+            .add("col3", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+        cfs.forceBlockingFlush();
+        String noTTLKey = "nottl";
+        new RowUpdateBuilder(cfs.metadata, timestamp, noTTLKey)
+            .add("col311", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
 
-        rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-                rm.add("Standard1", Util.cellname("col2"),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       1);
-                rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-        rm = new Mutation(KEYSPACE1, Util.dk("ttl").getKey());
-        rm.add("Standard1", Util.cellname("col3"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                   timestamp,
-                   1);
-        rm.applyUnsafe();
-        cfs.forceBlockingFlush();
-        DecoratedKey noTTLKey = Util.dk("nottl");
-        rm = new Mutation(KEYSPACE1, noTTLKey.getKey());
-        rm.add("Standard1", Util.cellname("col311"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                   timestamp);
-        rm.applyUnsafe();
         cfs.forceBlockingFlush();
         Thread.sleep(2000); // wait for ttl to expire
-        assertEquals(4, cfs.getSSTables().size());
+        assertEquals(4, cfs.getLiveSSTables().size());
         cfs.enableAutoCompaction(true);
-        assertEquals(1, cfs.getSSTables().size());
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
-        ISSTableScanner scanner = sstable.getScanner(DataRange.allData(sstable.partitioner));
+        assertEquals(1, cfs.getLiveSSTables().size());
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        ISSTableScanner scanner = sstable.getScanner(ColumnFilter.all(sstable.metadata),
+                                                     DataRange.allData(cfs.getPartitioner()),
+                                                     false,
+                                                     SSTableReadsListener.NOOP_LISTENER);
         assertTrue(scanner.hasNext());
         while(scanner.hasNext())
         {
-            OnDiskAtomIterator iter = scanner.next();
-            assertEquals(noTTLKey, iter.getKey());
+            UnfilteredRowIterator iter = scanner.next();
+            assertEquals(Util.dk(noTTLKey), iter.partitionKey());
         }
-
         scanner.close();
     }
 
@@ -220,19 +264,24 @@
         cfs.disableAutoCompaction();
         cfs.metadata.gcGraceSeconds(0);
 
-        Mutation rm = new Mutation(KEYSPACE1, Util.dk("test").getKey());
-        rm.add("Standard1", Util.cellname("col1"), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
-        rm.applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), "test")
+                .noRowMarker()
+                .add("col1", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .applyUnsafe();
+
         cfs.forceBlockingFlush();
-        SSTableReader blockingSSTable = cfs.getSSTables().iterator().next();
+        SSTableReader blockingSSTable = cfs.getSSTables(SSTableSet.LIVE).iterator().next();
         for (int i = 0; i < 10; i++)
         {
-            rm = new Mutation(KEYSPACE1, Util.dk("test").getKey());
-            rm.delete("Standard1", System.currentTimeMillis());
-            rm.applyUnsafe();
+            new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), "test")
+                            .noRowMarker()
+                            .delete("col1")
+                            .build()
+                            .applyUnsafe();
             cfs.forceBlockingFlush();
         }
-        Multimap<SSTableReader, SSTableReader> blockers = SSTableExpiredBlockers.checkForExpiredSSTableBlockers(cfs.getSSTables(), (int) (System.currentTimeMillis() / 1000) + 100);
+        Multimap<SSTableReader, SSTableReader> blockers = SSTableExpiredBlockers.checkForExpiredSSTableBlockers(cfs.getSSTables(SSTableSet.LIVE), (int) (System.currentTimeMillis() / 1000) + 100);
         assertEquals(1, blockers.keySet().size());
         assertTrue(blockers.keySet().contains(blockingSSTable));
         assertEquals(10, blockers.get(blockingSSTable).size());

diff --git a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
new file mode 100644
index 0000000..c57e0ca
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java

@@ -0,0 +1,278 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.compaction;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.Pair;
+
+import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.getWindowBoundsInMillis;
+import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.newestBucket;
+import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.validateOptions;
+
+public class TimeWindowCompactionStrategyTest extends SchemaLoader
+{
+    public static final String KEYSPACE1 = "Keyspace1";
+    private static final String CF_STANDARD1 = "Standard1";
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        // Disable tombstone histogram rounding for tests
+        System.setProperty("cassandra.streaminghistogram.roundseconds", "1");
+
+        SchemaLoader.prepareServer();
+
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+    }
+
+    @Test
+    public void testOptionsValidation() throws ConfigurationException
+    {
+        Map<String, String> options = new HashMap<>();
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "30");
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "MINUTES");
+        Map<String, String> unvalidated = validateOptions(options);
+        assertTrue(unvalidated.isEmpty());
+
+        try
+        {
+            options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "0");
+            validateOptions(options);
+            fail(String.format("%s == 0 should be rejected", TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY));
+        }
+        catch (ConfigurationException e) {}
+
+        try
+        {
+            options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "-1337");
+            validateOptions(options);
+            fail(String.format("Negative %s should be rejected", TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY));
+        }
+        catch (ConfigurationException e)
+        {
+            options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "1");
+        }
+
+        try
+        {
+            options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "MONTHS");
+            validateOptions(options);
+            fail(String.format("Invalid time units should be rejected", TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY));
+        }
+        catch (ConfigurationException e)
+        {
+            options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "MINUTES");
+        }
+
+        options.put("bad_option", "1.0");
+        unvalidated = validateOptions(options);
+        assertTrue(unvalidated.containsKey("bad_option"));
+    }
+
+
+    @Test
+    public void testTimeWindows()
+    {
+        Long tstamp1 = 1451001601000L; // 2015-12-25 @ 00:00:01, in milliseconds
+        Long tstamp2 = 1451088001000L; // 2015-12-26 @ 00:00:01, in milliseconds
+        Long lowHour = 1451001600000L; // 2015-12-25 @ 00:00:00, in milliseconds
+
+        // A 1 hour window should round down to the beginning of the hour
+        assertTrue(getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp1).left.compareTo(lowHour) == 0);
+
+        // A 1 minute window should round down to the beginning of the hour
+        assertTrue(getWindowBoundsInMillis(TimeUnit.MINUTES, 1, tstamp1).left.compareTo(lowHour) == 0);
+
+        // A 1 day window should round down to the beginning of the hour
+        assertTrue(getWindowBoundsInMillis(TimeUnit.DAYS, 1, tstamp1).left.compareTo(lowHour) == 0 );
+
+        // The 2 day window of 2015-12-25 + 2015-12-26 should round down to the beginning of 2015-12-25
+        assertTrue(getWindowBoundsInMillis(TimeUnit.DAYS, 2, tstamp2).left.compareTo(lowHour) == 0);
+
+
+        return;
+    }
+
+    @Test
+    public void testPrepBucket()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        cfs.truncateBlocking();
+        cfs.disableAutoCompaction();
+
+        ByteBuffer value = ByteBuffer.wrap(new byte[100]);
+        Long tstamp = System.currentTimeMillis();
+        Long tstamp2 =  tstamp - (2L * 3600L * 1000L);
+
+        // create 5 sstables
+        for (int r = 0; r < 3; r++)
+        {
+            DecoratedKey key = Util.dk(String.valueOf(r));
+            new RowUpdateBuilder(cfs.metadata, r, key.getKey())
+                .clustering("column")
+                .add("val", value).build().applyUnsafe();
+
+            cfs.forceBlockingFlush();
+        }
+        // Decrement the timestamp to simulate a timestamp in the past hour
+        for (int r = 3; r < 5; r++)
+        {
+            // And add progressively more cells into each sstable
+            DecoratedKey key = Util.dk(String.valueOf(r));
+            new RowUpdateBuilder(cfs.metadata, r, key.getKey())
+                .clustering("column")
+                .add("val", value).build().applyUnsafe();
+            cfs.forceBlockingFlush();
+        }
+
+        cfs.forceBlockingFlush();
+
+        HashMultimap<Long, SSTableReader> buckets = HashMultimap.create();
+        List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
+
+        // We'll put 3 sstables into the newest bucket
+        for (int i = 0 ; i < 3; i++)
+        {
+            Pair<Long,Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp );
+            buckets.put(bounds.left, sstrs.get(i));
+        }
+        List<SSTableReader> newBucket = newestBucket(buckets, 4, 32, TimeUnit.HOURS, 1, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left );
+        assertTrue("incoming bucket should not be accepted when it has below the min threshold SSTables", newBucket.isEmpty());
+
+        newBucket = newestBucket(buckets, 2, 32, TimeUnit.HOURS, 1, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
+        assertTrue("incoming bucket should be accepted when it is larger than the min threshold SSTables", !newBucket.isEmpty());
+
+        // And 2 into the second bucket (1 hour back)
+        for (int i = 3 ; i < 5; i++)
+        {
+            Pair<Long,Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp2 );
+            buckets.put(bounds.left, sstrs.get(i));
+        }
+
+        assertEquals("an sstable with a single value should have equal min/max timestamps", sstrs.get(0).getMinTimestamp(), sstrs.get(0).getMaxTimestamp());
+        assertEquals("an sstable with a single value should have equal min/max timestamps", sstrs.get(1).getMinTimestamp(), sstrs.get(1).getMaxTimestamp());
+        assertEquals("an sstable with a single value should have equal min/max timestamps", sstrs.get(2).getMinTimestamp(), sstrs.get(2).getMaxTimestamp());
+
+        // Test trim
+        int numSSTables = 40;
+        for (int r = 5; r < numSSTables; r++)
+        {
+            DecoratedKey key = Util.dk(String.valueOf(r));
+            for(int i = 0 ; i < r ; i++)
+            {
+                new RowUpdateBuilder(cfs.metadata, tstamp + r, key.getKey())
+                    .clustering("column")
+                    .add("val", value).build().applyUnsafe();
+            }
+            cfs.forceBlockingFlush();
+        }
+
+        // Reset the buckets, overfill it now
+        sstrs = new ArrayList<>(cfs.getLiveSSTables());
+        for (int i = 0 ; i < 40; i++)
+        {
+            Pair<Long,Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, sstrs.get(i).getMaxTimestamp());
+            buckets.put(bounds.left, sstrs.get(i));
+        }
+
+        newBucket = newestBucket(buckets, 4, 32, TimeUnit.DAYS, 1, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
+        assertEquals("new bucket should be trimmed to max threshold of 32", newBucket.size(),  32);
+    }
+
+
+    @Test
+    public void testDropExpiredSSTables() throws InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        cfs.truncateBlocking();
+        cfs.disableAutoCompaction();
+
+        ByteBuffer value = ByteBuffer.wrap(new byte[100]);
+
+        // create 2 sstables
+        DecoratedKey key = Util.dk(String.valueOf("expired"));
+        new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), 1, key.getKey())
+            .clustering("column")
+            .add("val", value).build().applyUnsafe();
+
+        cfs.forceBlockingFlush();
+        SSTableReader expiredSSTable = cfs.getLiveSSTables().iterator().next();
+        Thread.sleep(10);
+
+        key = Util.dk(String.valueOf("nonexpired"));
+        new RowUpdateBuilder(cfs.metadata, System.currentTimeMillis(), key.getKey())
+            .clustering("column")
+            .add("val", value).build().applyUnsafe();
+
+        cfs.forceBlockingFlush();
+        assertEquals(cfs.getLiveSSTables().size(), 2);
+
+        Map<String, String> options = new HashMap<>();
+
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "30");
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "SECONDS");
+        options.put(TimeWindowCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, "MILLISECONDS");
+        options.put(TimeWindowCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0");
+        TimeWindowCompactionStrategy twcs = new TimeWindowCompactionStrategy(cfs, options);
+        for (SSTableReader sstable : cfs.getLiveSSTables())
+            twcs.addSSTable(sstable);
+        twcs.startup();
+        assertNull(twcs.getNextBackgroundTask((int) (System.currentTimeMillis() / 1000)));
+        Thread.sleep(2000);
+        AbstractCompactionTask t = twcs.getNextBackgroundTask((int) (System.currentTimeMillis()/1000));
+        assertNotNull(t);
+        assertEquals(1, Iterables.size(t.transaction.originals()));
+        SSTableReader sstable = t.transaction.originals().iterator().next();
+        assertEquals(sstable, expiredSSTable);
+        t.transaction.abort();
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/db/composites/CTypeTest.java b/test/unit/org/apache/cassandra/db/composites/CTypeTest.java
index 496a2dc..9b261e6 100644
--- a/test/unit/org/apache/cassandra/db/composites/CTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/composites/CTypeTest.java

@@ -17,98 +17,113 @@
  */
 package org.apache.cassandra.db.composites;
 
-import com.google.common.collect.Lists;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.junit.Test;
 
-import java.util.List;
+import java.nio.ByteBuffer;
 
 public class CTypeTest
 {
-    static final List<AbstractType<?>> types = Lists.newArrayList();
-    static
-    {
-        types.add(UTF8Type.instance);
-        types.add(UUIDType.instance);
-        types.add(Int32Type.instance);
-    }
-
-    static final CellNameType cdtype = new CompoundDenseCellNameType(types);
-    static final CellNameType stype1 = new SimpleDenseCellNameType(BytesType.instance);
-    static final CellNameType stype2 = new SimpleDenseCellNameType(UUIDType.instance);
-
     @Test
     public void testCompoundType()
     {
-        Composite a1 = cdtype.makeCellName("a",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 1);
-        Composite a2 = cdtype.makeCellName("a",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 100);
-        Composite b1 = cdtype.makeCellName("a",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 1);
-        Composite b2 = cdtype.makeCellName("a",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 100);
-        Composite c1 = cdtype.makeCellName("z",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 1);
-        Composite c2 = cdtype.makeCellName("z",UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"), 100);
-        Composite d1 = cdtype.makeCellName("z",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 1);
-        Composite d2 = cdtype.makeCellName("z",UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 100);
+        CompositeType baseType = CompositeType.getInstance(AsciiType.instance, UUIDType.instance, LongType.instance);
 
-        Composite z1 = cdtype.makeCellName(ByteBufferUtil.EMPTY_BYTE_BUFFER,UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"), 100);
+        ByteBuffer a1 = baseType.builder()
+                .add(ByteBufferUtil.bytes("a"))
+                .add(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"))
+                .add(ByteBufferUtil.bytes(1)).build();
+        ByteBuffer a2 = baseType.builder()
+                .add(ByteBufferUtil.bytes("a"))
+                .add(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"))
+                .add(ByteBufferUtil.bytes(100)).build();
+        ByteBuffer b1 = baseType.builder()
+                .add(ByteBufferUtil.bytes("a"))
+                .add(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"))
+                .add(ByteBufferUtil.bytes(1)).build();
+        ByteBuffer b2 = baseType.builder()
+                .add(ByteBufferUtil.bytes("a"))
+                .add(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"))
+                .add(ByteBufferUtil.bytes(100)).build();
+        ByteBuffer c1 = baseType.builder()
+                .add(ByteBufferUtil.bytes("z"))
+                .add(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"))
+                .add(ByteBufferUtil.bytes(1)).build();
+        ByteBuffer c2 = baseType.builder()
+                .add(ByteBufferUtil.bytes("z"))
+                .add(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"))
+                .add(ByteBufferUtil.bytes(100)).build();
+        ByteBuffer d1 = baseType.builder()
+                .add(ByteBufferUtil.bytes("z"))
+                .add(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"))
+                .add(ByteBufferUtil.bytes(1)).build();
+        ByteBuffer d2 = baseType.builder()
+                .add(ByteBufferUtil.bytes("z"))
+                .add(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"))
+                .add(ByteBufferUtil.bytes(100)).build();
+        ByteBuffer z1 = baseType.builder()
+                .add(ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .add(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"))
+                .add(ByteBufferUtil.bytes(100)).build();
 
-        assert cdtype.compare(a1,a2) < 0;
-        assert cdtype.compare(a2,b1) < 0;
-        assert cdtype.compare(b1,b2) < 0;
-        assert cdtype.compare(b2,c1) < 0;
-        assert cdtype.compare(c1,c2) < 0;
-        assert cdtype.compare(c2,d1) < 0;
-        assert cdtype.compare(d1,d2) < 0;
+        assert baseType.compare(a1,a2) < 0;
+        assert baseType.compare(a2,b1) < 0;
+        assert baseType.compare(b1,b2) < 0;
+        assert baseType.compare(b2,c1) < 0;
+        assert baseType.compare(c1,c2) < 0;
+        assert baseType.compare(c2,d1) < 0;
+        assert baseType.compare(d1,d2) < 0;
 
-        assert cdtype.compare(a2,a1) > 0;
-        assert cdtype.compare(b1,a2) > 0;
-        assert cdtype.compare(b2,b1) > 0;
-        assert cdtype.compare(c1,b2) > 0;
-        assert cdtype.compare(c2,c1) > 0;
-        assert cdtype.compare(d1,c2) > 0;
-        assert cdtype.compare(d2,d1) > 0;
+        assert baseType.compare(a2,a1) > 0;
+        assert baseType.compare(b1,a2) > 0;
+        assert baseType.compare(b2,b1) > 0;
+        assert baseType.compare(c1,b2) > 0;
+        assert baseType.compare(c2,c1) > 0;
+        assert baseType.compare(d1,c2) > 0;
+        assert baseType.compare(d2,d1) > 0;
 
-        assert cdtype.compare(z1,a1) < 0;
-        assert cdtype.compare(z1,a2) < 0;
-        assert cdtype.compare(z1,b1) < 0;
-        assert cdtype.compare(z1,b2) < 0;
-        assert cdtype.compare(z1,c1) < 0;
-        assert cdtype.compare(z1,c2) < 0;
-        assert cdtype.compare(z1,d1) < 0;
-        assert cdtype.compare(z1,d2) < 0;
+        assert baseType.compare(z1,a1) < 0;
+        assert baseType.compare(z1,a2) < 0;
+        assert baseType.compare(z1,b1) < 0;
+        assert baseType.compare(z1,b2) < 0;
+        assert baseType.compare(z1,c1) < 0;
+        assert baseType.compare(z1,c2) < 0;
+        assert baseType.compare(z1,d1) < 0;
+        assert baseType.compare(z1,d2) < 0;
 
-        assert cdtype.compare(a1,a1) == 0;
-        assert cdtype.compare(a2,a2) == 0;
-        assert cdtype.compare(b1,b1) == 0;
-        assert cdtype.compare(b2,b2) == 0;
-        assert cdtype.compare(c1,c1) == 0;
-        assert cdtype.compare(c2,c2) == 0;
-        assert cdtype.compare(z1,z1) == 0;
+        assert baseType.compare(a1,a1) == 0;
+        assert baseType.compare(a2,a2) == 0;
+        assert baseType.compare(b1,b1) == 0;
+        assert baseType.compare(b2,b2) == 0;
+        assert baseType.compare(c1,c1) == 0;
+        assert baseType.compare(c2,c2) == 0;
+        assert baseType.compare(z1,z1) == 0;
     }
 
     @Test
     public void testSimpleType2()
     {
-        CellName a = stype2.makeCellName(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000"));
-        CellName z = stype2.makeCellName(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff"));
+        CompositeType baseType = CompositeType.getInstance(UUIDType.instance);
+        ByteBuffer a = baseType.builder().add(UUIDType.instance.fromString("00000000-0000-0000-0000-000000000000")).build();
+        ByteBuffer z = baseType.builder().add(UUIDType.instance.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff")).build();
 
-        assert stype2.compare(a,z) < 0;
-        assert stype2.compare(z,a) > 0;
-        assert stype2.compare(a,a) == 0;
-        assert stype2.compare(z,z) == 0;
+        assert baseType.compare(a,z) < 0;
+        assert baseType.compare(z,a) > 0;
+        assert baseType.compare(a,a) == 0;
+        assert baseType.compare(z,z) == 0;
     }
 
-
     @Test
     public void testSimpleType1()
     {
-        CellName a = stype1.makeCellName(ByteBufferUtil.bytes("a"));
-        CellName z = stype1.makeCellName(ByteBufferUtil.bytes("z"));
+        CompositeType baseType = CompositeType.getInstance(BytesType.instance);
+        ByteBuffer a = baseType.builder().add(ByteBufferUtil.bytes("a")).build();
+        ByteBuffer z = baseType.builder().add(ByteBufferUtil.bytes("z")).build();
 
-        assert stype1.compare(a,z) < 0;
-        assert stype1.compare(z,a) > 0;
-        assert stype1.compare(a,a) == 0;
-        assert stype1.compare(z,z) == 0;
+        assert baseType.compare(a,z) < 0;
+        assert baseType.compare(z,a) > 0;
+        assert baseType.compare(a,a) == 0;
+        assert baseType.compare(z,z) == 0;
     }
-
 }

diff --git a/test/unit/org/apache/cassandra/db/context/CounterContextTest.java b/test/unit/org/apache/cassandra/db/context/CounterContextTest.java
index 4f587c6..a8852f7 100644
--- a/test/unit/org/apache/cassandra/db/context/CounterContextTest.java
+++ b/test/unit/org/apache/cassandra/db/context/CounterContextTest.java

@@ -28,17 +28,19 @@
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ClockAndCount;
+import org.apache.cassandra.db.LegacyLayout.LegacyCell;
 import org.apache.cassandra.db.context.CounterContext.Relationship;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CounterId;
 
-import static org.apache.cassandra.db.context.CounterContext.ContextState;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotSame;
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 
+import static org.apache.cassandra.db.context.CounterContext.ContextState;
+
 public class CounterContextTest
 {
     private static final CounterContext cc = new CounterContext();
@@ -542,4 +544,34 @@
         assertEquals(ClockAndCount.create(0L, 0L), cc.getClockAndCountOf(state.context, CounterId.fromInt(15)));
         assertEquals(ClockAndCount.create(0L, 0L), cc.getClockAndCountOf(state.context, CounterId.fromInt(20)));
     }
+
+    @Test // see CASSANDRA-13691
+    public void testCounterUpdate()
+    {
+        /*
+         * a context with just one 'update' shard - a local shard with a hardcoded value of CounterContext.UPDATE_CLOCK_ID
+         */
+
+        ByteBuffer updateContext = CounterContext.instance().createUpdate(10L);
+
+        assertEquals(ClockAndCount.create(1L, 10L), cc.getClockAndCountOf(updateContext, CounterContext.UPDATE_CLOCK_ID));
+        assertTrue(cc.isUpdate(updateContext));
+        LegacyCell updateCell = LegacyCell.counter(null, updateContext);
+        assertTrue(updateCell.isCounterUpdate());
+
+
+        /*
+         * a context with a regular local shard sorting first and a couple others in it - should *not* be identified as an update
+         */
+
+        ContextState notUpdateContextState = ContextState.allocate(1, 1, 1);
+        notUpdateContextState.writeLocal( CounterId.fromInt(1), 1L, 10L);
+        notUpdateContextState.writeRemote(CounterId.fromInt(2), 1L, 10L);
+        notUpdateContextState.writeGlobal(CounterId.fromInt(3), 1L, 10L);
+        ByteBuffer notUpdateContext = notUpdateContextState.context;
+
+        assertFalse(cc.isUpdate(notUpdateContext));
+        LegacyCell notUpdateCell = LegacyCell.counter(null, notUpdateContext);
+        assertFalse(notUpdateCell.isCounterUpdate());
+    }
 }

diff --git a/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java b/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java
new file mode 100644
index 0000000..aa56091
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java

@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.filter;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class ColumnFilterTest
+{
+    final static ColumnFilter.Serializer serializer = new ColumnFilter.Serializer();
+
+    @Test
+    public void columnFilterSerialisationRoundTrip() throws Exception
+    {
+        CFMetaData metadata = CFMetaData.Builder.create("ks", "table")
+                                                .withPartitioner(Murmur3Partitioner.instance)
+                                                .addPartitionKey("pk", Int32Type.instance)
+                                                .addClusteringColumn("ck", Int32Type.instance)
+                                                .addRegularColumn("v1", Int32Type.instance)
+                                                .addRegularColumn("v2", Int32Type.instance)
+                                                .addRegularColumn("v3", Int32Type.instance)
+                                                .build();
+
+        ColumnDefinition v1 = metadata.getColumnDefinition(ByteBufferUtil.bytes("v1"));
+
+        testRoundTrip(ColumnFilter.all(metadata), metadata, MessagingService.VERSION_30);
+        testRoundTrip(ColumnFilter.all(metadata), metadata, MessagingService.VERSION_3014);
+
+        testRoundTrip(ColumnFilter.selection(metadata.partitionColumns().without(v1)), metadata, MessagingService.VERSION_30);
+        testRoundTrip(ColumnFilter.selection(metadata.partitionColumns().without(v1)), metadata, MessagingService.VERSION_3014);
+
+        testRoundTrip(ColumnFilter.selection(metadata, metadata.partitionColumns().without(v1)), metadata, MessagingService.VERSION_30);
+        testRoundTrip(ColumnFilter.selection(metadata, metadata.partitionColumns().without(v1)), metadata, MessagingService.VERSION_3014);
+    }
+
+    static void testRoundTrip(ColumnFilter columnFilter, CFMetaData metadata, int version) throws Exception
+    {
+        DataOutputBuffer output = new DataOutputBuffer();
+        serializer.serialize(columnFilter, output, version);
+        Assert.assertEquals(serializer.serializedSize(columnFilter, version), output.position());
+        DataInputPlus input = new DataInputBuffer(output.buffer(), false);
+        Assert.assertEquals(serializer.deserialize(input, version, metadata), columnFilter);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/filter/ColumnSliceTest.java b/test/unit/org/apache/cassandra/db/filter/ColumnSliceTest.java
deleted file mode 100644
index 8ba2665..0000000
--- a/test/unit/org/apache/cassandra/db/filter/ColumnSliceTest.java
+++ /dev/null

@@ -1,495 +0,0 @@
-/*
- * * Licensed to the Apache Software Foundation (ASF) under one
- * * or more contributor license agreements.  See the NOTICE file
- * * distributed with this work for additional information
- * * regarding copyright ownership.  The ASF licenses this file
- * * to you under the Apache License, Version 2.0 (the
- * * "License"); you may not use this file except in compliance
- * * with the License.  You may obtain a copy of the License at
- * *
- * *    http://www.apache.org/licenses/LICENSE-2.0
- * *
- * * Unless required by applicable law or agreed to in writing,
- * * software distributed under the License is distributed on an
- * * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * * KIND, either express or implied.  See the License for the
- * * specific language governing permissions and limitations
- * * under the License.
- * */
-package org.apache.cassandra.db.filter;
-
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.junit.Test;
-
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.io.sstable.ColumnNameHelper;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import static org.junit.Assert.*;
-
-public class ColumnSliceTest
-{
-    private static final CellNameType simpleIntType = new SimpleDenseCellNameType(Int32Type.instance);
-
-    @Test
-    public void testIntersectsSingleSlice()
-    {
-        List<AbstractType<?>> types = new ArrayList<>();
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
-
-        // filter falls entirely before sstable
-        ColumnSlice slice = new ColumnSlice(composite(0, 0, 0), composite(1, 0, 0));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with empty start
-        slice = new ColumnSlice(composite(), composite(1, 0, 0));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with missing components for start
-        slice = new ColumnSlice(composite(0), composite(1, 0, 0));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with missing components for start and end
-        slice = new ColumnSlice(composite(0), composite(1, 0));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-
-        // end of slice matches start of sstable for the first component, but not the second component
-        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 0, 0));
-        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with missing components for start
-        slice = new ColumnSlice(composite(0), composite(1, 0, 0));
-        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with missing components for start and end
-        slice = new ColumnSlice(composite(0), composite(1, 0));
-        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, false));
-
-        // first two components match, but not the last
-        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 1, 0));
-        assertFalse(slice.intersects(columnNames(1, 1, 1), columnNames(3, 1, 1), nameType, false));
-
-        // all three components in slice end match the start of the sstable
-        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 1, 1), columnNames(3, 1, 1), nameType, false));
-
-
-        // filter falls entirely after sstable
-        slice = new ColumnSlice(composite(4, 0, 0), composite(4, 0, 0));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with empty end
-        slice = new ColumnSlice(composite(4, 0, 0), composite());
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with missing components for end
-        slice = new ColumnSlice(composite(4, 0, 0), composite(1));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-        // same case, but with missing components for start and end
-        slice = new ColumnSlice(composite(4, 0), composite(1));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, false));
-
-
-        // start of slice matches end of sstable for the first component, but not the second component
-        slice = new ColumnSlice(composite(1, 1, 1), composite(2, 0, 0));
-        assertFalse(slice.intersects(columnNames(0, 0, 0), columnNames(1, 0, 0), nameType, false));
-
-        // start of slice matches end of sstable for the first two components, but not the last component
-        slice = new ColumnSlice(composite(1, 1, 1), composite(2, 0, 0));
-        assertFalse(slice.intersects(columnNames(0, 0, 0), columnNames(1, 1, 0), nameType, false));
-
-        // all three components in the slice start match the end of the sstable
-        slice = new ColumnSlice(composite(1, 1, 1), composite(2, 0, 0));
-        assertTrue(slice.intersects(columnNames(0, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-
-        // slice covers entire sstable (with no matching edges)
-        slice = new ColumnSlice(composite(0, 0, 0), composite(2, 0, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // same case, but with empty ends
-        slice = new ColumnSlice(composite(), composite());
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // same case, but with missing components
-        slice = new ColumnSlice(composite(0), composite(2, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // slice covers entire sstable (with matching start)
-        slice = new ColumnSlice(composite(1, 0, 0), composite(2, 0, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // slice covers entire sstable (with matching end)
-        slice = new ColumnSlice(composite(0, 0, 0), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // slice covers entire sstable (with matching start and end)
-        slice = new ColumnSlice(composite(1, 0, 0), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-
-        // slice falls entirely within sstable (with matching start)
-        slice = new ColumnSlice(composite(1, 0, 0), composite(1, 1, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // same case, but with a missing end component
-        slice = new ColumnSlice(composite(1, 0, 0), composite(1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // slice falls entirely within sstable (with matching end)
-        slice = new ColumnSlice(composite(1, 1, 0), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-        // same case, but with a missing start component
-        slice = new ColumnSlice(composite(1, 1), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(1, 1, 1), nameType, false));
-
-
-        // slice falls entirely within sstable
-        slice = new ColumnSlice(composite(1, 1, 0), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
-
-        // same case, but with a missing start component
-        slice = new ColumnSlice(composite(1, 1), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
-
-        // same case, but with a missing start and end components
-        slice = new ColumnSlice(composite(1), composite(1, 2));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
-
-        // same case, but with an equal first component and missing start and end components
-        slice = new ColumnSlice(composite(1), composite(1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
-
-        // slice falls entirely within sstable (slice start and end are the same)
-        slice = new ColumnSlice(composite(1, 1, 1), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, false));
-
-
-        // slice starts within sstable, empty end
-        slice = new ColumnSlice(composite(1, 1, 1), composite());
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // same case, but with missing end components
-        slice = new ColumnSlice(composite(1, 1, 1), composite(3));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // slice starts within sstable (matching sstable start), empty end
-        slice = new ColumnSlice(composite(1, 0, 0), composite());
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // same case, but with missing end components
-        slice = new ColumnSlice(composite(1, 0, 0), composite(3));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // slice starts within sstable (matching sstable end), empty end
-        slice = new ColumnSlice(composite(2, 0, 0), composite());
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // same case, but with missing end components
-        slice = new ColumnSlice(composite(2, 0, 0), composite(3));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-
-        // slice ends within sstable, empty end
-        slice = new ColumnSlice(composite(), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // same case, but with missing start components
-        slice = new ColumnSlice(composite(0), composite(1, 1, 1));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // slice ends within sstable (matching sstable start), empty start
-        slice = new ColumnSlice(composite(), composite(1, 0, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // same case, but with missing start components
-        slice = new ColumnSlice(composite(0), composite(1, 0, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // slice ends within sstable (matching sstable end), empty start
-        slice = new ColumnSlice(composite(), composite(2, 0, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-        // same case, but with missing start components
-        slice = new ColumnSlice(composite(0), composite(2, 0, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 0, 0), nameType, false));
-
-
-        // the slice technically falls within the sstable range, but since the first component is restricted to
-        // a single value, we can check that the second component does not fall within its min/max
-        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 3, 0));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with a missing start component
-        slice = new ColumnSlice(composite(1, 2), composite(1, 3, 0));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with a missing end component
-        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 3));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with a missing start and end components
-        slice = new ColumnSlice(composite(1, 2), composite(1, 3));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with missing start and end components and different lengths for start and end
-        slice = new ColumnSlice(composite(1, 2), composite(1));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-
-        // same as the previous set of tests, but the second component is equal in the slice start and end
-        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 2, 0));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with a missing start component
-        slice = new ColumnSlice(composite(1, 2), composite(1, 2, 0));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with a missing end component
-        slice = new ColumnSlice(composite(1, 2, 0), composite(1, 2));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same case, but with a missing start and end components
-        slice = new ColumnSlice(composite(1, 2), composite(1, 2));
-        assertFalse(slice.intersects(columnNames(1, 0, 0), columnNames(2, 1, 0), nameType, false));
-
-        // same as the previous tests, but it's the third component that doesn't fit in its range this time
-        slice = new ColumnSlice(composite(1, 1, 2), composite(1, 1, 3));
-        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(2, 2, 1), nameType, false));
-
-        // empty min/max column names
-        slice = new ColumnSlice(composite(), composite());
-        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
-
-        slice = new ColumnSlice(composite(1), composite());
-        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite(1));
-        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
-
-        slice = new ColumnSlice(composite(1), composite(1));
-        assertTrue(slice.intersects(columnNames(), columnNames(), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite());
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite(1));
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite(1));
-        assertTrue(slice.intersects(columnNames(), columnNames(2), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite(2));
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(2), composite(3));
-        assertFalse(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        // basic check on reversed slices
-        slice = new ColumnSlice(composite(1, 0, 0), composite(0, 0, 0));
-        assertFalse(slice.intersects(columnNames(2, 0, 0), columnNames(3, 0, 0), nameType, true));
-
-        slice = new ColumnSlice(composite(1, 0, 0), composite(0, 0, 0));
-        assertFalse(slice.intersects(columnNames(1, 1, 0), columnNames(3, 0, 0), nameType, true));
-
-        slice = new ColumnSlice(composite(1, 1, 1), composite(1, 1, 0));
-        assertTrue(slice.intersects(columnNames(1, 0, 0), columnNames(2, 2, 2), nameType, true));
-    }
-
-    @Test
-    public void testDifferentMinMaxLengths()
-    {
-        List<AbstractType<?>> types = new ArrayList<>();
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
-
-        // slice does intersect
-        ColumnSlice slice = new ColumnSlice(composite(), composite());
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite());
-        assertTrue(slice.intersects(columnNames(1), columnNames(1, 2), nameType, false));
-
-        slice = new ColumnSlice(composite(), composite(1));
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(1), composite());
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(1), composite(1));
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(0), composite(1, 2, 3));
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(1, 2, 3), composite(2));
-        assertTrue(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        // slice does not intersect
-        slice = new ColumnSlice(composite(2), composite(3, 4, 5));
-        assertFalse(slice.intersects(columnNames(), columnNames(1), nameType, false));
-
-        slice = new ColumnSlice(composite(0), composite(0, 1, 2));
-        assertFalse(slice.intersects(columnNames(1), columnNames(1, 2), nameType, false));
-    }
-    @Test
-    public void testColumnNameHelper()
-    {
-        List<AbstractType<?>> types = new ArrayList<>();
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
-        assertTrue(ColumnNameHelper.overlaps(columnNames(0, 0, 0), columnNames(3, 3, 3), columnNames(1, 1, 1), columnNames(2, 2, 2), nameType));
-        assertFalse(ColumnNameHelper.overlaps(columnNames(0, 0, 0), columnNames(3, 3, 3), columnNames(4, 4, 4), columnNames(5, 5, 5), nameType));
-        assertFalse(ColumnNameHelper.overlaps(columnNames(0, 0, 0), columnNames(3, 3, 3), columnNames(3, 3, 4), columnNames(5, 5, 5), nameType));
-        assertTrue(ColumnNameHelper.overlaps(columnNames(0), columnNames(3, 3, 3), columnNames(1, 1), columnNames(5), nameType));
-    }
-
-    @Test
-    public void testDeoverlapSlices()
-    {
-        ColumnSlice[] slices;
-        ColumnSlice[] deoverlapped;
-
-        // Preserve correct slices
-        slices = slices(s(0, 3), s(4, 5), s(6, 9));
-        assertSlicesValid(slices);
-        assertSlicesEquals(slices, deoverlapSlices(slices));
-
-        // Simple overlap
-        slices = slices(s(0, 3), s(2, 5), s(8, 9));
-        assertSlicesInvalid(slices);
-        assertSlicesEquals(slices(s(0, 5), s(8, 9)), deoverlapSlices(slices));
-
-        // Slice overlaps others fully
-        slices = slices(s(0, 10), s(2, 5), s(8, 9));
-        assertSlicesInvalid(slices);
-        assertSlicesEquals(slices(s(0, 10)), deoverlapSlices(slices));
-
-        // Slice with empty end overlaps others fully
-        slices = slices(s(0, -1), s(2, 5), s(8, 9));
-        assertSlicesInvalid(slices);
-        assertSlicesEquals(slices(s(0, -1)), deoverlapSlices(slices));
-
-        // Overlap with slices selecting only one element
-        slices = slices(s(0, 4), s(4, 4), s(4, 8));
-        assertSlicesInvalid(slices);
-        assertSlicesEquals(slices(s(0, 8)), deoverlapSlices(slices));
-
-        // Unordered slices (without overlap)
-        slices = slices(s(4, 8), s(0, 3), s(9, 9));
-        assertSlicesInvalid(slices);
-        assertSlicesEquals(slices(s(0, 3), s(4, 8), s(9, 9)), deoverlapSlices(slices));
-
-        // All range select but not by a single slice
-        slices = slices(s(5, -1), s(2, 5), s(-1, 2));
-        assertSlicesInvalid(slices);
-        assertSlicesEquals(slices(s(-1, -1)), deoverlapSlices(slices));
-    }
-
-    @Test
-    public void testValidateSlices()
-    {
-        assertSlicesValid(slices(s(0, 3)));
-        assertSlicesValid(slices(s(3, 3)));
-        assertSlicesValid(slices(s(3, 3), s(4, 4)));
-        assertSlicesValid(slices(s(0, 3), s(4, 5), s(6, 9)));
-        assertSlicesValid(slices(s(-1, -1)));
-        assertSlicesValid(slices(s(-1, 3), s(4, -1)));
-
-        assertSlicesInvalid(slices(s(3, 0)));
-        assertSlicesInvalid(slices(s(0, 2), s(2, 4)));
-        assertSlicesInvalid(slices(s(0, 2), s(1, 4)));
-        assertSlicesInvalid(slices(s(0, 2), s(3, 4), s(3, 4)));
-        assertSlicesInvalid(slices(s(-1, 2), s(3, -1), s(5, 9)));
-    }
-
-    private static Composite composite(Integer ... components)
-    {
-        List<AbstractType<?>> types = new ArrayList<>();
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        types.add(Int32Type.instance);
-        CompoundDenseCellNameType nameType = new CompoundDenseCellNameType(types);
-        return nameType.make((Object[]) components);
-    }
-
-    private static List<ByteBuffer> columnNames(Integer ... components)
-    {
-        List<ByteBuffer> names = new ArrayList<>(components.length);
-        for (int component : components)
-            names.add(ByteBufferUtil.bytes(component));
-        return names;
-    }
-
-    private static Composite simpleComposite(int i)
-    {
-        // We special negative values to mean EMPTY for convenience sake
-        if (i < 0)
-            return Composites.EMPTY;
-
-        return simpleIntType.make(i);
-    }
-
-    private static ColumnSlice s(int start, int finish)
-    {
-        return new ColumnSlice(simpleComposite(start), simpleComposite(finish));
-    }
-
-    private static ColumnSlice[] slices(ColumnSlice... slices)
-    {
-        return slices;
-    }
-
-    private static ColumnSlice[] deoverlapSlices(ColumnSlice[] slices)
-    {
-        return ColumnSlice.deoverlapSlices(slices, simpleIntType);
-    }
-
-    private static void assertSlicesValid(ColumnSlice[] slices)
-    {
-        assertTrue("Slices " + toString(slices) + " should be valid", ColumnSlice.validateSlices(slices, simpleIntType, false));
-    }
-
-    private static void assertSlicesInvalid(ColumnSlice[] slices)
-    {
-        assertFalse("Slices " + toString(slices) + " shouldn't be valid", ColumnSlice.validateSlices(slices, simpleIntType, false));
-    }
-
-    private static void assertSlicesEquals(ColumnSlice[] expected, ColumnSlice[] actual)
-    {
-        assertTrue("Expected " + toString(expected) + " but got " + toString(actual), Arrays.equals(expected, actual));
-    }
-
-    private static String toString(ColumnSlice[] slices)
-    {
-        StringBuilder sb = new StringBuilder().append("[");
-        for (int i = 0; i < slices.length; i++)
-        {
-            if (i > 0)
-                sb.append(", ");
-
-            ColumnSlice slice = slices[i];
-            sb.append("(");
-            sb.append(slice.start.isEmpty() ? "-1" : simpleIntType.getString(slice.start));
-            sb.append(", ");
-            sb.append(slice.finish.isEmpty() ? "-1" : simpleIntType.getString(slice.finish));
-            sb.append(")");
-        }
-        return sb.append("]").toString();
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java b/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java
new file mode 100644
index 0000000..0e15013
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java

@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.filter;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.partitions.SingletonUnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.utils.btree.BTree;
+
+public class RowFilterTest
+{
+
+    @Test
+    public void testCQLFilterClose()
+    {
+        // CASSANDRA-15126
+        CFMetaData metadata = CFMetaData.Builder.create("testks", "testcf")
+                                                .addPartitionKey("pk", Int32Type.instance)
+                                                .addStaticColumn("s", Int32Type.instance)
+                                                .addRegularColumn("r", Int32Type.instance)
+                                                .build();
+        ColumnDefinition s = metadata.getColumnDefinition(new ColumnIdentifier("s", true));
+        ColumnDefinition r = metadata.getColumnDefinition(new ColumnIdentifier("r", true));
+
+        ByteBuffer one = Int32Type.instance.decompose(1);
+        RowFilter filter = RowFilter.NONE.withNewExpressions(new ArrayList<>());
+        filter.add(s, Operator.NEQ, one);
+        AtomicBoolean closed = new AtomicBoolean();
+        UnfilteredPartitionIterator iter = filter.filter(new SingletonUnfilteredPartitionIterator(new UnfilteredRowIterator()
+        {
+            public DeletionTime partitionLevelDeletion() { return null; }
+            public EncodingStats stats() { return null; }
+            public CFMetaData metadata() { return metadata; }
+            public boolean isReverseOrder() { return false; }
+            public PartitionColumns columns() { return null; }
+            public DecoratedKey partitionKey() { return null; }
+            public boolean hasNext() { return false; }
+            public Unfiltered next() { return null; }
+            public Row staticRow()
+            {
+                return BTreeRow.create(Clustering.STATIC_CLUSTERING,
+                                       LivenessInfo.EMPTY,
+                                       Row.Deletion.LIVE,
+                                       BTree.singleton(new BufferCell(s, 1, Cell.NO_TTL, Cell.NO_DELETION_TIME, one, null)));
+            }
+            public void close()
+            {
+                closed.set(true);
+            }
+        }, false), 1);
+        Assert.assertFalse(iter.hasNext());
+        Assert.assertTrue(closed.get());
+
+        filter = RowFilter.NONE.withNewExpressions(new ArrayList<>());
+        filter.add(r, Operator.NEQ, one);
+        closed.set(false);
+        iter = filter.filter(new SingletonUnfilteredPartitionIterator(new UnfilteredRowIterator()
+        {
+            boolean hasNext = true;
+            public DeletionTime partitionLevelDeletion() { return null; }
+            public EncodingStats stats() { return null; }
+            public CFMetaData metadata() { return metadata; }
+            public boolean isReverseOrder() { return false; }
+            public PartitionColumns columns() { return null; }
+            public DecoratedKey partitionKey() { return null; }
+            public Row staticRow() { return Rows.EMPTY_STATIC_ROW; }
+            public boolean hasNext()
+            {
+                boolean r = hasNext;
+                hasNext = false;
+                return r;
+            }
+            public Unfiltered next()
+            {
+                return BTreeRow.create(Clustering.EMPTY,
+                                       LivenessInfo.EMPTY,
+                                       Row.Deletion.LIVE,
+                                       BTree.singleton(new BufferCell(r, 1, Cell.NO_TTL, Cell.NO_DELETION_TIME, one, null)));
+            }
+            public void close()
+            {
+                closed.set(true);
+            }
+        }, false), 1);
+        Assert.assertFalse(iter.hasNext());
+        Assert.assertTrue(closed.get());
+    }
+
+
+}

diff --git a/test/unit/org/apache/cassandra/db/filter/SliceTest.java b/test/unit/org/apache/cassandra/db/filter/SliceTest.java
new file mode 100644
index 0000000..606395c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/filter/SliceTest.java

@@ -0,0 +1,367 @@
+/*
+ * * Licensed to the Apache Software Foundation (ASF) under one
+ * * or more contributor license agreements.  See the NOTICE file
+ * * distributed with this work for additional information
+ * * regarding copyright ownership.  The ASF licenses this file
+ * * to you under the Apache License, Version 2.0 (the
+ * * "License"); you may not use this file except in compliance
+ * * with the License.  You may obtain a copy of the License at
+ * *
+ * *    http://www.apache.org/licenses/LICENSE-2.0
+ * *
+ * * Unless required by applicable law or agreed to in writing,
+ * * software distributed under the License is distributed on an
+ * * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * * KIND, either express or implied.  See the License for the
+ * * specific language governing permissions and limitations
+ * * under the License.
+ * */
+package org.apache.cassandra.db.filter;
+
+
+import java.awt.*;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.List;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.*;
+
+public class SliceTest
+{
+    @Test
+    public void testIntersectsSingleSlice()
+    {
+        List<AbstractType<?>> types = new ArrayList<>();
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        ClusteringComparator cc = new ClusteringComparator(types);
+
+        ClusteringPrefix.Kind sk = ClusteringPrefix.Kind.INCL_START_BOUND;
+        ClusteringPrefix.Kind ek = ClusteringPrefix.Kind.INCL_END_BOUND;
+
+        // filter falls entirely before sstable
+        Slice slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        // same case, but with empty start
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        // same case, but with missing components for start
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        // same case, but with missing components for start and end
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+
+        // end of slice matches start of sstable for the first component, but not the second component
+        slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+
+        // same case, but with missing components for start
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+
+        // same case, but with missing components for start and end
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0));
+        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+
+        // first two components match, but not the last
+        slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 0));
+        assertFalse(slice.intersects(cc, columnNames(1, 1, 1), columnNames(3, 1, 1)));
+
+        // all three components in slice end match the start of the sstable
+        slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 1, 1), columnNames(3, 1, 1)));
+
+
+        // filter falls entirely after sstable
+        slice = Slice.make(makeBound(sk, 4, 0, 0), makeBound(ek, 4, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        // same case, but with empty end
+        slice = Slice.make(makeBound(sk, 4, 0, 0), makeBound(ek));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        // same case, but with missing components for end
+        slice = Slice.make(makeBound(sk, 4, 0, 0), makeBound(ek, 1));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        // same case, but with missing components for start and end
+        slice = Slice.make(makeBound(sk, 4, 0), makeBound(ek, 1));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+
+        // start of slice matches end of sstable for the first component, but not the second component
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 2, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(0, 0, 0), columnNames(1, 0, 0)));
+
+        // start of slice matches end of sstable for the first two components, but not the last component
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 2, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(0, 0, 0), columnNames(1, 1, 0)));
+
+        // all three components in the slice start match the end of the sstable
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 2, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(0, 0, 0), columnNames(1, 1, 1)));
+
+
+        // slice covers entire sstable (with no matching edges)
+        slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 2, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // same case, but with empty ends
+        slice = Slice.make(makeBound(sk), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // same case, but with missing components
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 2, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // slice covers entire sstable (with matching start)
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // slice covers entire sstable (with matching end)
+        slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // slice covers entire sstable (with matching start and end)
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+
+        // slice falls entirely within sstable (with matching start)
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // same case, but with a missing end component
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // slice falls entirely within sstable (with matching end)
+        slice = Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+        // same case, but with a missing start component
+        slice = Slice.make(makeBound(sk, 1, 1), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+
+
+        // slice falls entirely within sstable
+        slice = Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+
+        // same case, but with a missing start component
+        slice = Slice.make(makeBound(sk, 1, 1), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+
+        // same case, but with a missing start and end components
+        slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1, 2));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+
+        // same case, but with an equal first component and missing start and end components
+        slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+
+        // slice falls entirely within sstable (slice start and end are the same)
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+
+
+        // slice starts within sstable, empty end
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // same case, but with missing end components
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 3));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // slice starts within sstable (matching sstable start), empty end
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // same case, but with missing end components
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 3));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // slice starts within sstable (matching sstable end), empty end
+        slice = Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // same case, but with missing end components
+        slice = Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+
+        // slice ends within sstable, empty end
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // same case, but with missing start components
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 1, 1));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // slice ends within sstable (matching sstable start), empty start
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // same case, but with missing start components
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // slice ends within sstable (matching sstable end), empty start
+        slice = Slice.make(makeBound(sk), makeBound(ek, 2, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // same case, but with missing start components
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 2, 0, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+
+        // empty min/max column names
+        slice = Slice.make(makeBound(sk), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+
+        slice = Slice.make(makeBound(sk, 1), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+
+        slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(2)));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek, 2));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk, 2), makeBound(ek, 3));
+        assertFalse(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        // basic check on reversed slices
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 0, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+
+        slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 0, 0, 0));
+        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+
+        slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 1, 1, 0));
+        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+    }
+
+    @Test
+    public void testDifferentMinMaxLengths()
+    {
+        List<AbstractType<?>> types = new ArrayList<>();
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        ClusteringComparator cc = new ClusteringComparator(types);
+
+        ClusteringPrefix.Kind sk = ClusteringPrefix.Kind.INCL_START_BOUND;
+        ClusteringPrefix.Kind ek = ClusteringPrefix.Kind.INCL_END_BOUND;
+
+        // slice does intersect
+        Slice slice = Slice.make(makeBound(sk), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(1), columnNames(1, 2)));
+
+        slice = Slice.make(makeBound(sk), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk, 1), makeBound(ek));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 2, 3));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk, 1, 2, 3), makeBound(ek, 2));
+        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        // slice does not intersect
+        slice = Slice.make(makeBound(sk, 2), makeBound(ek, 3, 4, 5));
+        assertFalse(slice.intersects(cc, columnNames(), columnNames(1)));
+
+        slice = Slice.make(makeBound(sk, 0), makeBound(ek, 0, 1, 2));
+        assertFalse(slice.intersects(cc, columnNames(1), columnNames(1, 2)));
+    }
+
+    @Test
+    public void testSliceNormalization()
+    {
+        List<AbstractType<?>> types = new ArrayList<>();
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        types.add(Int32Type.instance);
+        ClusteringComparator cc = new ClusteringComparator(types);
+
+        assertSlicesNormalization(cc, slices(s(0, 2), s(2, 4)), slices(s(0, 4)));
+        assertSlicesNormalization(cc, slices(s(0, 2), s(1, 4)), slices(s(0, 4)));
+        assertSlicesNormalization(cc, slices(s(0, 2), s(3, 4), s(3, 4)), slices(s(0, 2), s(3, 4)));
+        assertSlicesNormalization(cc, slices(s(-1, 3), s(-1, 4)), slices(s(-1, 4)));
+        assertSlicesNormalization(cc, slices(s(-1, 2), s(-1, 3), s(5, 9)), slices(s(-1, 3), s(5, 9)));
+    }
+
+    private static Slice.Bound makeBound(ClusteringPrefix.Kind kind, Integer... components)
+    {
+        ByteBuffer[] values = new ByteBuffer[components.length];
+        for (int i = 0; i < components.length; i++)
+        {
+            values[i] = ByteBufferUtil.bytes(components[i]);
+        }
+        return Slice.Bound.create(kind, values);
+    }
+
+    private static List<ByteBuffer> columnNames(Integer ... components)
+    {
+        List<ByteBuffer> names = new ArrayList<>(components.length);
+        for (int component : components)
+            names.add(ByteBufferUtil.bytes(component));
+        return names;
+    }
+
+    private static Slice s(int start, int finish)
+    {
+        return Slice.make(makeBound(ClusteringPrefix.Kind.INCL_START_BOUND, start),
+                          makeBound(ClusteringPrefix.Kind.INCL_END_BOUND, finish));
+    }
+
+    private Slice[] slices(Slice... slices)
+    {
+        return slices;
+    }
+
+    private static void assertSlicesNormalization(ClusteringComparator cc, Slice[] original, Slice[] expected)
+    {
+        Slices.Builder builder = new Slices.Builder(cc);
+        for (Slice s : original)
+            builder.add(s);
+        Slices slices = builder.build();
+        assertEquals(expected.length, slices.size());
+        for (int i = 0; i < expected.length; i++)
+            assertEquals(expected[i], slices.get(i));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java b/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java
deleted file mode 100644
index 19303cf..0000000
--- a/test/unit/org/apache/cassandra/db/index/PerRowSecondaryIndexTest.java
+++ /dev/null

@@ -1,454 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.index;
-
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.cassandra.service.EmbeddedCassandraService;
-import org.apache.cassandra.thrift.*;
-import org.apache.thrift.TException;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.ExtendedFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-public class PerRowSecondaryIndexTest
-{
-
-    // test that when index(key) is called on a PRSI index,
-    // the data to be indexed can be read using the supplied
-    // key. TestIndex.index(key) simply reads the data to be
-    // indexed & stashes it in a static variable for inspection
-    // in the test.
-
-    private static final String KEYSPACE1 = "PerRowSecondaryIndexTest";
-    private static final String CF_INDEXED = "Indexed1";
-    private static final String INDEXED_COLUMN = "indexed";
-
-    private static CassandraServer server;
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException, IOException, TException
-    {
-        SchemaLoader.prepareServer();
-        new EmbeddedCassandraService().start();
-        ThriftSessionManager.instance.setCurrentSocket(new InetSocketAddress(9160));
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                SimpleStrategy.class,
-                KSMetaData.optsWithRF(1),
-                SchemaLoader.perRowIndexedCFMD(KEYSPACE1, CF_INDEXED));
-        server = new CassandraServer();
-        server.set_keyspace(KEYSPACE1);
-    }
-
-    @Before
-    public void clearTestStub()
-    {
-        PerRowSecondaryIndexTest.TestIndex.reset();
-    }
-
-    @Test
-    public void testIndexInsertAndUpdate()
-    {
-        // create a row then test that the configured index instance was able to read the row
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", Util.cellname("indexed"), ByteBufferUtil.bytes("foo"), 1);
-        rm.applyUnsafe();
-
-        ColumnFamily indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
-        assertNotNull(indexedRow);
-        assertEquals(ByteBufferUtil.bytes("foo"), indexedRow.getColumn(Util.cellname("indexed")).value());
-
-        // update the row and verify what was indexed
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", Util.cellname("indexed"), ByteBufferUtil.bytes("bar"), 2);
-        rm.applyUnsafe();
-
-        indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
-        assertNotNull(indexedRow);
-        assertEquals(ByteBufferUtil.bytes("bar"), indexedRow.getColumn(Util.cellname("indexed")).value());
-        assertTrue(Arrays.equals("k1".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
-    }
-
-    @Test
-    public void testColumnDelete()
-    {
-        // issue a column delete and test that the configured index instance was notified to update
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k2"));
-        rm.delete("Indexed1", Util.cellname("indexed"), 1);
-        rm.applyUnsafe();
-
-        ColumnFamily indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
-        assertNotNull(indexedRow);
-
-        for (Cell cell : indexedRow.getSortedColumns())
-            assertFalse(cell.isLive());
-
-        assertTrue(Arrays.equals("k2".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
-    }
-
-    @Test
-    public void testRowDelete()
-    {
-        // issue a row level delete and test that the configured index instance was notified to update
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k3"));
-        rm.delete("Indexed1", 1);
-        rm.applyUnsafe();
-
-        ColumnFamily indexedRow = PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_ROW;
-        assertNotNull(indexedRow);
-        for (Cell cell : indexedRow.getSortedColumns())
-            assertFalse(cell.isLive());
-
-        assertTrue(Arrays.equals("k3".getBytes(), PerRowSecondaryIndexTest.TestIndex.LAST_INDEXED_KEY.array()));
-    }
-
-    @Test
-    public void testInvalidSearch()
-    {
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes("k4"));
-        rm.add("Indexed1", Util.cellname("indexed"), ByteBufferUtil.bytes("foo"), 1);
-        rm.apply();
-
-        // test we can search:
-        UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".\"Indexed1\" WHERE indexed = 'foo'", KEYSPACE1));
-        assertEquals(1, result.size());
-
-        // test we can't search if the searcher doesn't validate the expression:
-        try
-        {
-            QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".\"Indexed1\" WHERE indexed = 'invalid'", KEYSPACE1));
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof InvalidRequestException || (e.getCause() != null && (e.getCause() instanceof InvalidRequestException)));
-        }
-    }
-
-    @Test
-    public void testInvalidCqlInsert()
-    {
-        // test we can insert if the index validates the expression:
-        QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".\"Indexed1\" (key, indexed) VALUES ('valid','valid')", KEYSPACE1));
-
-        // test we can't insert if the index doesn't validate the key:
-        try
-        {
-            QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".\"Indexed1\" (key, indexed) VALUES ('invalid','valid')", KEYSPACE1));
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof InvalidRequestException);
-        }
-
-        // test we can't insert if the index doesn't validate the columns:
-        try
-        {
-            QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".\"Indexed1\" (key, indexed) VALUES ('valid','invalid')", KEYSPACE1));
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof InvalidRequestException);
-        }
-    }
-
-    @Test
-    public void testInvalidThriftInsert() throws IOException, TException
-    {
-
-        long timestamp = System.currentTimeMillis();
-        ColumnPath cp = new ColumnPath(CF_INDEXED);
-        ColumnParent par = new ColumnParent(CF_INDEXED);
-        cp.column = ByteBufferUtil.bytes(INDEXED_COLUMN);
-
-        // test we can insert if the index validates the expression:
-        ByteBuffer key = ByteBufferUtil.bytes("valid");
-        server.insert(key, par, new Column(key).setValue(ByteBufferUtil.bytes("valid")).setTimestamp(timestamp), ConsistencyLevel.ONE);
-
-        // test we can't insert if the index doesn't validate the key:
-        try
-        {
-            key = ByteBufferUtil.bytes("invalid");
-            server.insert(key, par, new Column(key).setValue(ByteBufferUtil.bytes("valid")).setTimestamp(timestamp), ConsistencyLevel.ONE);
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof org.apache.cassandra.thrift.InvalidRequestException);
-        }
-
-        // test we can't insert if the index doesn't validate the columns:
-        try
-        {
-            key = ByteBufferUtil.bytes("valid");
-            server.insert(key, par, new Column(key).setValue(ByteBufferUtil.bytes("invalid")).setTimestamp(timestamp), ConsistencyLevel.ONE);
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof org.apache.cassandra.thrift.InvalidRequestException);
-        }
-    }
-
-    @Test
-    public void testInvalidThriftCas() throws IOException, TException
-    {
-        // test we can insert if the index validates the expression:
-        ByteBuffer key = ByteBufferUtil.bytes("valid");
-        Column column = new Column(key).setValue(ByteBufferUtil.bytes("valid")).setTimestamp(System.currentTimeMillis());
-        server.cas(key, CF_INDEXED, Collections.<Column>emptyList(), Collections.singletonList(column), ConsistencyLevel.LOCAL_SERIAL, ConsistencyLevel.ONE);
-
-        // test we can't insert if the index doesn't validate the key:
-        try
-        {
-            key = ByteBufferUtil.bytes("invalid");
-            server.cas(key, CF_INDEXED, Collections.<Column>emptyList(), Collections.singletonList(column), ConsistencyLevel.LOCAL_SERIAL, ConsistencyLevel.ONE);
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof org.apache.cassandra.thrift.InvalidRequestException);
-        }
-
-        // test we can't insert if the index doesn't validate the columns:
-        try
-        {
-            key = ByteBufferUtil.bytes("valid");
-            column.setValue(ByteBufferUtil.bytes("invalid"));
-            server.cas(key, CF_INDEXED, Collections.<Column>emptyList(), Collections.singletonList(column), ConsistencyLevel.LOCAL_SERIAL, ConsistencyLevel.ONE);
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof org.apache.cassandra.thrift.InvalidRequestException);
-        }
-    }
-
-    @Test
-    public void testInvalidThriftBatchMutate() throws IOException, TException
-    {
-        ByteBuffer key = ByteBufferUtil.bytes("valid");
-        long timestamp = System.currentTimeMillis();
-
-        org.apache.cassandra.thrift.Mutation mutation = new org.apache.cassandra.thrift.Mutation();
-        Column column = new Column(key).setValue(ByteBufferUtil.bytes("valid")).setTimestamp(System.currentTimeMillis());
-        ColumnOrSuperColumn cosc = new ColumnOrSuperColumn();
-        cosc.setColumn(column);
-        mutation.setColumn_or_supercolumn(cosc);
-
-        server.batch_mutate(Collections.singletonMap(key, Collections.singletonMap(CF_INDEXED, Collections.singletonList(mutation))), ConsistencyLevel.ONE);
-
-        // test we can't insert if the index doesn't validate the key:
-        try
-        {
-            key = ByteBufferUtil.bytes("invalid");
-            server.batch_mutate(Collections.singletonMap(key, Collections.singletonMap(CF_INDEXED, Collections.singletonList(mutation))), ConsistencyLevel.ONE);
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof org.apache.cassandra.thrift.InvalidRequestException);
-        }
-
-        // test we can't insert if the index doesn't validate the columns:
-        try
-        {
-            key = ByteBufferUtil.bytes("valid");
-            cosc.setColumn(new Column(key).setValue(ByteBufferUtil.bytes("invalid")).setTimestamp(timestamp));
-            server.batch_mutate(Collections.singletonMap(key, Collections.singletonMap(CF_INDEXED, Collections.singletonList(mutation))), ConsistencyLevel.ONE);
-            fail("Query should have been invalid!");
-        }
-        catch (Exception e)
-        {
-            assertTrue(e instanceof org.apache.cassandra.thrift.InvalidRequestException);
-        }
-    }
-
-    public static class TestIndex extends PerRowSecondaryIndex
-    {
-        public static volatile boolean ACTIVE = true;
-        public static ColumnFamily LAST_INDEXED_ROW;
-        public static ByteBuffer LAST_INDEXED_KEY;
-
-        public static void reset()
-        {
-            ACTIVE = true;
-            LAST_INDEXED_KEY = null;
-            LAST_INDEXED_ROW = null;
-        }
-
-        @Override
-        public boolean indexes(CellName name)
-        {
-            return ACTIVE;
-        }
-        
-        @Override
-        public boolean indexes(ColumnDefinition cdef)
-        {
-            return ACTIVE;
-        }
-        
-        @Override
-        public void index(ByteBuffer rowKey, ColumnFamily cf)
-        {
-            QueryFilter filter = QueryFilter.getIdentityFilter(DatabaseDescriptor.getPartitioner().decorateKey(rowKey),
-                                                               baseCfs.getColumnFamilyName(),
-                                                               System.currentTimeMillis());
-            LAST_INDEXED_ROW = cf;
-            LAST_INDEXED_KEY = rowKey;
-        }
-
-        @Override
-        public void delete(DecoratedKey key, OpOrder.Group opGroup)
-        {
-        }
-
-        @Override
-        public void init()
-        {
-        }
-
-        @Override
-        public void reload()
-        {
-        }
-
-        @Override
-        public void validateOptions() throws ConfigurationException
-        {
-        }
-
-        @Override
-        public String getIndexName()
-        {
-            return this.getClass().getSimpleName();
-        }
-
-        @Override
-        protected SecondaryIndexSearcher createSecondaryIndexSearcher(Set<ByteBuffer> columns)
-        {
-            return new SecondaryIndexSearcher(baseCfs.indexManager, columns)
-            {
-                
-                @Override
-                public List<Row> search(ExtendedFilter filter)
-                {
-                    return Arrays.asList(new Row(LAST_INDEXED_KEY, LAST_INDEXED_ROW));
-                }
-
-                @Override
-                public void validate(IndexExpression indexExpression) throws InvalidRequestException
-                {
-                    if (indexExpression.value.equals(ByteBufferUtil.bytes("invalid")))
-                        throw new InvalidRequestException("Invalid search!");
-                }
-                
-            };
-        }
-
-        @Override
-        public void forceBlockingFlush()
-        {
-        }
-
-        @Override
-        public ColumnFamilyStore getIndexCfs()
-        {
-            return baseCfs;
-        }
-
-        @Override
-        public void removeIndex(ByteBuffer columnName)
-        {
-        }
-
-        @Override
-        public void invalidate()
-        {
-        }
-
-        @Override
-        public void truncateBlocking(long truncatedAt)
-        {
-        }
-
-        @Override
-        public long estimateResultRows() {
-            return 0;
-        }
-
-        @Override
-        public void validate(ByteBuffer key, ColumnFamily cf) throws InvalidRequestException
-        {
-            if (key.equals(ByteBufferUtil.bytes("invalid")))
-            {
-                throw new InvalidRequestException("Invalid key!");
-            }
-            for (Cell cell : cf)
-            {
-                if (cell.value().equals(ByteBufferUtil.bytes("invalid")))
-                {
-                    throw new InvalidRequestException("Invalid column!");
-                }
-            }
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java b/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java
index 18bce10..1d9f8aa 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java

@@ -18,6 +18,8 @@
 */
 package org.apache.cassandra.db.lifecycle;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -30,12 +32,13 @@
 
 import junit.framework.Assert;
 import org.apache.cassandra.MockSchema;
-import org.apache.cassandra.Util;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.big.BigTableReader;
-import org.apache.cassandra.utils.concurrent.Refs;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 
 public class HelpersTest
 {
@@ -150,19 +153,52 @@
         for (SSTableReader reader : readers)
             Assert.assertTrue(reader.isReplaced());
         accumulate = Helpers.setReplaced(readers, null);
-        Assert.assertNotNull(accumulate);
+        assertNotNull(accumulate);
     }
 
     @Test
     public void testMarkObsolete()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
+        LogTransaction txnLogs = new LogTransaction(OperationType.UNKNOWN);
         Iterable<SSTableReader> readers = Lists.newArrayList(MockSchema.sstable(1, cfs), MockSchema.sstable(2, cfs));
-        Throwable accumulate = Helpers.markObsolete(null, readers, null);
+        Iterable<SSTableReader> readersToKeep = Lists.newArrayList(MockSchema.sstable(3, cfs), MockSchema.sstable(4, cfs));
+
+        List<LogTransaction.Obsoletion> obsoletions = new ArrayList<>();
+        Helpers.prepareForObsoletion(readers, txnLogs, obsoletions, null);
+        assertNotNull(obsoletions);
+        assertEquals(2, obsoletions.size());
+
+        Throwable accumulate = Helpers.markObsolete(obsoletions, null);
         Assert.assertNull(accumulate);
         for (SSTableReader reader : readers)
             Assert.assertTrue(reader.isMarkedCompacted());
-        accumulate = Helpers.markObsolete(null, readers, null);
-        Assert.assertNotNull(accumulate);
+
+        for (SSTableReader reader : readersToKeep)
+            Assert.assertFalse(reader.isMarkedCompacted());
+
+        accumulate = Helpers.markObsolete(obsoletions, null);
+        assertNotNull(accumulate);
+
+        txnLogs.finish();
+    }
+
+    @Test
+    public void testObsoletionPerformance()
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS();
+        LogTransaction txnLogs = new LogTransaction(OperationType.UNKNOWN);
+        List<SSTableReader> readers = new ArrayList<>();
+
+        for (int i = 0; i < 10000; i++)
+        {
+            readers.add(MockSchema.sstable(i + 1, cfs));
+        }
+        long start = System.currentTimeMillis();
+
+        Helpers.prepareForObsoletion(readers.subList(0, 500), txnLogs, new ArrayList<>(),null );
+        txnLogs.finish();
+        long time = System.currentTimeMillis() - start;
+        assertTrue(time < 20000);
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java
index 737392e..0d87cc9 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java

@@ -36,9 +36,7 @@
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction.ReaderState;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction.ReaderState.Action;
-import org.apache.cassandra.io.sstable.SSTableDeletingTask;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.metrics.ColumnFamilyMetrics;
 import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.concurrent.AbstractTransactionalTest;
 import org.apache.cassandra.utils.concurrent.Transactional.AbstractTransactional.State;
@@ -254,7 +252,7 @@
 
     protected TestableTransaction newTest()
     {
-        SSTableDeletingTask.waitForDeletions();
+        LogTransaction.waitForDeletions();
         SSTableReader.resetTidying();
         return new TxnTest();
     }
@@ -404,6 +402,12 @@
             for (SSTableReader reader : concat(loggedObsolete, stagedObsolete))
                 Assert.assertTrue(reader.selfRef().globalCount() == 0);
         }
+
+        @Override
+        protected boolean commitCanThrow()
+        {
+            return true;
+        }
     }
 
     private static SSTableReader[] readersArray(int lb, int ub, ColumnFamilyStore cfs)

diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java
new file mode 100644
index 0000000..2544a0d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java

@@ -0,0 +1,1290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.io.IOError;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.file.Files;
+import java.util.*;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static junit.framework.Assert.assertNotNull;
+import static junit.framework.Assert.assertNull;
+import static junit.framework.Assert.fail;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import junit.framework.Assert;
+import org.apache.cassandra.MockSchema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.compaction.*;
+import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.util.BufferedSegmentedFile;
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SegmentedFile;
+import org.apache.cassandra.utils.AlwaysPresentFilter;
+import org.apache.cassandra.utils.concurrent.AbstractTransactionalTest;
+import org.apache.cassandra.utils.concurrent.Transactional;
+
+public class LogTransactionTest extends AbstractTransactionalTest
+{
+    private static final String KEYSPACE = "TransactionLogsTest";
+
+    @BeforeClass
+    public static void setUp()
+    {
+        MockSchema.cleanup();
+    }
+
+    protected AbstractTransactionalTest.TestableTransaction newTest() throws Exception
+    {
+        LogTransaction.waitForDeletions();
+        SSTableReader.resetTidying();
+        return new TxnTest();
+    }
+
+    private static final class TxnTest extends TestableTransaction
+    {
+        private final static class Transaction extends Transactional.AbstractTransactional implements Transactional
+        {
+            final ColumnFamilyStore cfs;
+            final LogTransaction txnLogs;
+            final File dataFolder;
+            final SSTableReader sstableOld;
+            final SSTableReader sstableNew;
+            final LogTransaction.SSTableTidier tidier;
+
+            Transaction(ColumnFamilyStore cfs, LogTransaction txnLogs) throws IOException
+            {
+                this.cfs = cfs;
+                this.txnLogs = txnLogs;
+                this.dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+                this.sstableOld = sstable(dataFolder, cfs, 0, 128);
+                this.sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+                assertNotNull(txnLogs);
+                assertNotNull(txnLogs.id());
+                Assert.assertEquals(OperationType.COMPACTION, txnLogs.type());
+
+                txnLogs.trackNew(sstableNew);
+                tidier = txnLogs.obsoleted(sstableOld);
+                assertNotNull(tidier);
+            }
+
+            protected Throwable doCommit(Throwable accumulate)
+            {
+                sstableOld.markObsolete(tidier);
+                sstableOld.selfRef().release();
+                LogTransaction.waitForDeletions();
+
+                Throwable ret = txnLogs.commit(accumulate);
+
+                sstableNew.selfRef().release();
+                return ret;
+            }
+
+            protected Throwable doAbort(Throwable accumulate)
+            {
+                tidier.abort();
+                LogTransaction.waitForDeletions();
+
+                Throwable ret = txnLogs.abort(accumulate);
+
+                sstableNew.selfRef().release();
+                sstableOld.selfRef().release();
+                return ret;
+            }
+
+            protected void doPrepare()
+            {
+                txnLogs.prepareToCommit();
+            }
+
+            void assertInProgress() throws Exception
+            {
+                assertFiles(dataFolder.getPath(), Sets.newHashSet(Iterables.concat(sstableNew.getAllFilePaths(),
+                                                                                   sstableOld.getAllFilePaths(),
+                                                                                   txnLogs.logFilePaths())));
+            }
+
+            void assertPrepared() throws Exception
+            {
+            }
+
+            void assertAborted() throws Exception
+            {
+                assertFiles(dataFolder.getPath(), new HashSet<>(sstableOld.getAllFilePaths()));
+            }
+
+            void assertCommitted() throws Exception
+            {
+                assertFiles(dataFolder.getPath(), new HashSet<>(sstableNew.getAllFilePaths()));
+            }
+        }
+
+        final Transaction txn;
+
+        private TxnTest() throws IOException
+        {
+            this(MockSchema.newCFS(KEYSPACE));
+        }
+
+        private TxnTest(ColumnFamilyStore cfs) throws IOException
+        {
+            this(cfs, new LogTransaction(OperationType.COMPACTION));
+        }
+
+        private TxnTest(ColumnFamilyStore cfs, LogTransaction txnLogs) throws IOException
+        {
+            this(new Transaction(cfs, txnLogs));
+        }
+
+        private TxnTest(Transaction txn)
+        {
+            super(txn);
+            this.txn = txn;
+        }
+
+        protected void assertInProgress() throws Exception
+        {
+            txn.assertInProgress();
+        }
+
+        protected void assertPrepared() throws Exception
+        {
+            txn.assertPrepared();
+        }
+
+        protected void assertAborted() throws Exception
+        {
+            txn.assertAborted();
+        }
+
+        protected void assertCommitted() throws Exception
+        {
+            txn.assertCommitted();
+        }
+    }
+
+    @Test
+    public void testUntrack() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+        // complete a transaction without keep the new files since they were untracked
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstableNew);
+        log.untrackNew(sstableNew);
+
+        log.finish();
+
+        sstableNew.selfRef().release();
+        Thread.sleep(1);
+        LogTransaction.waitForDeletions();
+
+        assertFiles(dataFolder.getPath(), Collections.<String>emptySet());
+    }
+
+    @Test
+    public void testCommitSameDesc() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstableOld1 = sstable(dataFolder, cfs, 0, 128);
+        SSTableReader sstableOld2 = sstable(dataFolder, cfs, 0, 256);
+        SSTableReader sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstableNew);
+
+        sstableOld1.setReplaced();
+
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstableOld2);
+        assertNotNull(tidier);
+
+        log.finish();
+
+        sstableOld2.markObsolete(tidier);
+
+        sstableOld1.selfRef().release();
+        sstableOld2.selfRef().release();
+
+        assertFiles(dataFolder.getPath(), new HashSet<>(sstableNew.getAllFilePaths()));
+
+        sstableNew.selfRef().release();
+    }
+
+    @Test
+    public void testCommitOnlyNew() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstable);
+        log.finish();
+
+        assertFiles(dataFolder.getPath(), new HashSet<>(sstable.getAllFilePaths()));
+
+        sstable.selfRef().release();
+    }
+
+    @Test
+    public void testCommitOnlyOld() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstable);
+        assertNotNull(tidier);
+
+        log.finish();
+        sstable.markObsolete(tidier);
+        sstable.selfRef().release();
+
+        assertFiles(dataFolder.getPath(), new HashSet<>());
+    }
+
+    @Test
+    public void testCommitMultipleFolders() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+
+        File origiFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        File dataFolder1 = new File(origiFolder, "1");
+        File dataFolder2 = new File(origiFolder, "2");
+        Files.createDirectories(dataFolder1.toPath());
+        Files.createDirectories(dataFolder2.toPath());
+
+        SSTableReader[] sstables = { sstable(dataFolder1, cfs, 0, 128),
+                                     sstable(dataFolder1, cfs, 1, 128),
+                                     sstable(dataFolder2, cfs, 2, 128),
+                                     sstable(dataFolder2, cfs, 3, 128)
+        };
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier[] tidiers = { log.obsoleted(sstables[0]), log.obsoleted(sstables[2]) };
+
+        log.trackNew(sstables[1]);
+        log.trackNew(sstables[3]);
+
+        log.finish();
+
+        sstables[0].markObsolete(tidiers[0]);
+        sstables[2].markObsolete(tidiers[1]);
+
+        Arrays.stream(sstables).forEach(s -> s.selfRef().release());
+        LogTransaction.waitForDeletions();
+
+        assertFiles(dataFolder1.getPath(), new HashSet<>(sstables[1].getAllFilePaths()));
+        assertFiles(dataFolder2.getPath(), new HashSet<>(sstables[3].getAllFilePaths()));
+    }
+
+    @Test
+    public void testAbortOnlyNew() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstable);
+        log.abort();
+
+        sstable.selfRef().release();
+
+        assertFiles(dataFolder.getPath(), new HashSet<>());
+    }
+
+    @Test
+    public void testAbortOnlyOld() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstable);
+        assertNotNull(tidier);
+
+        tidier.abort();
+        log.abort();
+
+        sstable.selfRef().release();
+
+        assertFiles(dataFolder.getPath(), new HashSet<>(sstable.getAllFilePaths()));
+    }
+
+    @Test
+    public void testAbortMultipleFolders() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+
+        File origiFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        File dataFolder1 = new File(origiFolder, "1");
+        File dataFolder2 = new File(origiFolder, "2");
+        Files.createDirectories(dataFolder1.toPath());
+        Files.createDirectories(dataFolder2.toPath());
+
+        SSTableReader[] sstables = { sstable(dataFolder1, cfs, 0, 128),
+                                     sstable(dataFolder1, cfs, 1, 128),
+                                     sstable(dataFolder2, cfs, 2, 128),
+                                     sstable(dataFolder2, cfs, 3, 128)
+        };
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier[] tidiers = { log.obsoleted(sstables[0]), log.obsoleted(sstables[2]) };
+
+        log.trackNew(sstables[1]);
+        log.trackNew(sstables[3]);
+
+        Arrays.stream(tidiers).forEach(LogTransaction.SSTableTidier::abort);
+        log.abort();
+
+        Arrays.stream(sstables).forEach(s -> s.selfRef().release());
+        LogTransaction.waitForDeletions();
+
+        assertFiles(dataFolder1.getPath(), new HashSet<>(sstables[0].getAllFilePaths()));
+        assertFiles(dataFolder2.getPath(), new HashSet<>(sstables[2].getAllFilePaths()));
+    }
+
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_abort() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstableOld = sstable(dataFolder, cfs, 0, 128);
+        SSTableReader sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+        // simulate tracking sstables with a failed transaction (new log file NOT deleted)
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstableNew);
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstableOld);
+
+        Set<File> tmpFiles = sstableNew.getAllFilePaths().stream().map(File::new).collect(Collectors.toSet());
+
+        sstableNew.selfRef().release();
+        sstableOld.selfRef().release();
+
+        Assert.assertEquals(tmpFiles, getTemporaryFiles(sstableNew.descriptor.directory));
+
+        // normally called at startup
+        LogTransaction.removeUnfinishedLeftovers(cfs.metadata);
+
+        // sstableOld should be only table left
+        Directories directories = new Directories(cfs.metadata);
+        Map<Descriptor, Set<Component>> sstables = directories.sstableLister(Directories.OnTxnErr.THROW).list();
+        assertEquals(1, sstables.size());
+
+        assertFiles(dataFolder.getPath(), new HashSet<>(sstableOld.getAllFilePaths()));
+
+        // complete the transaction before releasing files
+        tidier.run();
+        log.close();
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_commit() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstableOld = sstable(dataFolder, cfs, 0, 128);
+        SSTableReader sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+        // simulate tracking sstables with a committed transaction (new log file deleted)
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstableNew);
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstableOld);
+
+        //Fake a commit
+        log.txnFile().commit();
+
+        Set<File> tmpFiles = sstableOld.getAllFilePaths().stream().map(File::new).collect(Collectors.toSet());
+
+        sstableNew.selfRef().release();
+        sstableOld.selfRef().release();
+
+        Assert.assertEquals(tmpFiles, getTemporaryFiles(sstableOld.descriptor.directory));
+
+        // normally called at startup
+        LogTransaction.removeUnfinishedLeftovers(cfs.metadata);
+
+        // sstableNew should be only table left
+        Directories directories = new Directories(cfs.metadata);
+        Map<Descriptor, Set<Component>> sstables = directories.sstableLister(Directories.OnTxnErr.THROW).list();
+        assertEquals(1, sstables.size());
+
+        assertFiles(dataFolder.getPath(), new HashSet<>(sstableNew.getAllFilePaths()));
+
+        // complete the transaction to avoid LEAK errors
+        tidier.run();
+        assertNull(log.complete(null));
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_commit_multipleFolders() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+
+        File origiFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        File dataFolder1 = new File(origiFolder, "1");
+        File dataFolder2 = new File(origiFolder, "2");
+        Files.createDirectories(dataFolder1.toPath());
+        Files.createDirectories(dataFolder2.toPath());
+
+        SSTableReader[] sstables = { sstable(dataFolder1, cfs, 0, 128),
+                                     sstable(dataFolder1, cfs, 1, 128),
+                                     sstable(dataFolder2, cfs, 2, 128),
+                                     sstable(dataFolder2, cfs, 3, 128)
+        };
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier[] tidiers = { log.obsoleted(sstables[0]), log.obsoleted(sstables[2]) };
+
+        log.trackNew(sstables[1]);
+        log.trackNew(sstables[3]);
+
+        Collection<File> logFiles = log.logFiles();
+        Assert.assertEquals(2, logFiles.size());
+
+        // fake a commit
+        log.txnFile().commit();
+
+        Arrays.stream(sstables).forEach(s -> s.selfRef().release());
+
+        // test listing
+        Assert.assertEquals(sstables[0].getAllFilePaths().stream().map(File::new).collect(Collectors.toSet()),
+                            getTemporaryFiles(dataFolder1));
+        Assert.assertEquals(sstables[2].getAllFilePaths().stream().map(File::new).collect(Collectors.toSet()),
+                            getTemporaryFiles(dataFolder2));
+
+        // normally called at startup
+        LogTransaction.removeUnfinishedLeftovers(Arrays.asList(dataFolder1, dataFolder2));
+
+        // new tables should be only table left
+        assertFiles(dataFolder1.getPath(), new HashSet<>(sstables[1].getAllFilePaths()));
+        assertFiles(dataFolder2.getPath(), new HashSet<>(sstables[3].getAllFilePaths()));
+
+        // complete the transaction to avoid LEAK errors
+        Arrays.stream(tidiers).forEach(LogTransaction.SSTableTidier::run);
+        assertNull(log.complete(null));
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_abort_multipleFolders() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+
+        File origiFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        File dataFolder1 = new File(origiFolder, "1");
+        File dataFolder2 = new File(origiFolder, "2");
+        Files.createDirectories(dataFolder1.toPath());
+        Files.createDirectories(dataFolder2.toPath());
+
+        SSTableReader[] sstables = { sstable(dataFolder1, cfs, 0, 128),
+                                     sstable(dataFolder1, cfs, 1, 128),
+                                     sstable(dataFolder2, cfs, 2, 128),
+                                     sstable(dataFolder2, cfs, 3, 128)
+        };
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier[] tidiers = { log.obsoleted(sstables[0]), log.obsoleted(sstables[2]) };
+
+        log.trackNew(sstables[1]);
+        log.trackNew(sstables[3]);
+
+        Collection<File> logFiles = log.logFiles();
+        Assert.assertEquals(2, logFiles.size());
+
+        // fake an abort
+        log.txnFile().abort();
+
+        Arrays.stream(sstables).forEach(s -> s.selfRef().release());
+
+        // test listing
+        Assert.assertEquals(sstables[1].getAllFilePaths().stream().map(File::new).collect(Collectors.toSet()),
+                            getTemporaryFiles(dataFolder1));
+        Assert.assertEquals(sstables[3].getAllFilePaths().stream().map(File::new).collect(Collectors.toSet()),
+                            getTemporaryFiles(dataFolder2));
+
+        // normally called at startup
+        LogTransaction.removeUnfinishedLeftovers(Arrays.asList(dataFolder1, dataFolder2));
+
+        // old tables should be only table left
+        assertFiles(dataFolder1.getPath(), new HashSet<>(sstables[0].getAllFilePaths()));
+        assertFiles(dataFolder2.getPath(), new HashSet<>(sstables[2].getAllFilePaths()));
+
+        // complete the transaction to avoid LEAK errors
+        Arrays.stream(tidiers).forEach(LogTransaction.SSTableTidier::run);
+        assertNull(log.complete(null));
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_mismatchedFinalRecords() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert mismatched records
+            FileUtils.append(logFiles.get(0), LogRecord.makeCommit(System.currentTimeMillis()).raw);
+            FileUtils.append(logFiles.get(1), LogRecord.makeAbort(System.currentTimeMillis()).raw);
+
+        }, false);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_partialFinalRecords_first() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert a full record and a partial one
+            String finalRecord = LogRecord.makeCommit(System.currentTimeMillis()).raw;
+            int toChop = finalRecord.length() / 2;
+            FileUtils.append(logFiles.get(0), finalRecord.substring(0, finalRecord.length() - toChop));
+            FileUtils.append(logFiles.get(1), finalRecord);
+
+        }, true);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_partialFinalRecords_second() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert a full record and a partial one
+            String finalRecord = LogRecord.makeCommit(System.currentTimeMillis()).raw;
+            int toChop = finalRecord.length() / 2;
+            FileUtils.append(logFiles.get(0), finalRecord);
+            FileUtils.append(logFiles.get(1), finalRecord.substring(0, finalRecord.length() - toChop));
+
+        }, true);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_partialNonFinalRecord_first() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert a partial sstable record and a full commit record
+            String sstableRecord = LogRecord.make(LogRecord.Type.ADD, Collections.emptyList(), 0, "abc-").raw;
+            int toChop = sstableRecord.length() / 2;
+            FileUtils.append(logFiles.get(0), sstableRecord.substring(0, sstableRecord.length() - toChop));
+            FileUtils.append(logFiles.get(1), sstableRecord);
+            String finalRecord = LogRecord.makeCommit(System.currentTimeMillis()).raw;
+            FileUtils.append(logFiles.get(0), finalRecord);
+            FileUtils.append(logFiles.get(1), finalRecord);
+
+        }, false);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_partialNonFinalRecord_second() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert a partial sstable record and a full commit record
+            String sstableRecord = LogRecord.make(LogRecord.Type.ADD, Collections.emptyList(), 0, "abc-").raw;
+            int toChop = sstableRecord.length() / 2;
+            FileUtils.append(logFiles.get(0), sstableRecord);
+            FileUtils.append(logFiles.get(1), sstableRecord.substring(0, sstableRecord.length() - toChop));
+            String finalRecord = LogRecord.makeCommit(System.currentTimeMillis()).raw;
+            FileUtils.append(logFiles.get(0), finalRecord);
+            FileUtils.append(logFiles.get(1), finalRecord);
+
+        }, false);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_missingFinalRecords_first() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert only one commit record
+            FileUtils.append(logFiles.get(0), LogRecord.makeCommit(System.currentTimeMillis()).raw);
+
+        }, true);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_missingFinalRecords_second() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert only one commit record
+            FileUtils.append(logFiles.get(1), LogRecord.makeCommit(System.currentTimeMillis()).raw);
+
+        }, true);
+    }
+
+    @Test
+    public void testRemoveUnfinishedLeftovers_multipleFolders_tooManyFinalRecords() throws Throwable
+    {
+        testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(txn -> {
+            List<File> logFiles = txn.logFiles();
+            Assert.assertEquals(2, logFiles.size());
+
+            // insert mismatched records
+            FileUtils.append(logFiles.get(0), LogRecord.makeCommit(System.currentTimeMillis()).raw);
+            FileUtils.append(logFiles.get(1), LogRecord.makeCommit(System.currentTimeMillis()).raw);
+            FileUtils.append(logFiles.get(1), LogRecord.makeCommit(System.currentTimeMillis()).raw);
+
+        }, false);
+    }
+
+    private static void testRemoveUnfinishedLeftovers_multipleFolders_errorConditions(Consumer<LogTransaction> modifier, boolean shouldCommit) throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+
+        File origiFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        File dataFolder1 = new File(origiFolder, "1");
+        File dataFolder2 = new File(origiFolder, "2");
+        Files.createDirectories(dataFolder1.toPath());
+        Files.createDirectories(dataFolder2.toPath());
+
+        SSTableReader[] sstables = { sstable(dataFolder1, cfs, 0, 128),
+                                     sstable(dataFolder1, cfs, 1, 128),
+                                     sstable(dataFolder2, cfs, 2, 128),
+                                     sstable(dataFolder2, cfs, 3, 128)
+        };
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        LogTransaction.SSTableTidier[] tidiers = { log.obsoleted(sstables[0]), log.obsoleted(sstables[2]) };
+
+        log.trackNew(sstables[1]);
+        log.trackNew(sstables[3]);
+
+        // fake some error condition on the txn logs
+        modifier.accept(log);
+
+        Arrays.stream(sstables).forEach(s -> s.selfRef().release());
+
+        LogTransaction.removeUnfinishedLeftovers(Arrays.asList(dataFolder1, dataFolder2));
+        LogTransaction.waitForDeletions();
+
+        if (shouldCommit)
+        {
+            // only new sstables should still be there
+            assertFiles(dataFolder1.getPath(), new HashSet<>(sstables[1].getAllFilePaths()));
+            assertFiles(dataFolder2.getPath(), new HashSet<>(sstables[3].getAllFilePaths()));
+        }
+        else
+        {
+            // all files should still be there
+            assertFiles(dataFolder1.getPath(), Sets.newHashSet(Iterables.concat(sstables[0].getAllFilePaths(),
+                                                                                sstables[1].getAllFilePaths(),
+                                                                                Collections.singleton(log.logFilePaths().get(0)))));
+            assertFiles(dataFolder2.getPath(), Sets.newHashSet(Iterables.concat(sstables[2].getAllFilePaths(),
+                                                                                sstables[3].getAllFilePaths(),
+                                                                                Collections.singleton(log.logFilePaths().get(1)))));
+        }
+
+
+        // complete the transaction to avoid LEAK errors
+        Arrays.stream(tidiers).forEach(LogTransaction.SSTableTidier::run);
+        log.txnFile().commit(); // just anything to make sure transaction tidier will finish
+        assertNull(log.complete(null));
+    }
+
+    @Test
+    public void testGetTemporaryFiles() throws IOException
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable1 = sstable(dataFolder, cfs, 0, 128);
+
+        Set<File> tmpFiles = getTemporaryFiles(dataFolder);
+        assertNotNull(tmpFiles);
+        assertEquals(0, tmpFiles.size());
+
+        try(LogTransaction log = new LogTransaction(OperationType.WRITE))
+        {
+            Directories directories = new Directories(cfs.metadata);
+
+            File[] beforeSecondSSTable = dataFolder.listFiles(pathname -> !pathname.isDirectory());
+
+            SSTableReader sstable2 = sstable(dataFolder, cfs, 1, 128);
+            log.trackNew(sstable2);
+
+            Map<Descriptor, Set<Component>> sstables = directories.sstableLister(Directories.OnTxnErr.THROW).list();
+            assertEquals(2, sstables.size());
+
+            // this should contain sstable1, sstable2 and the transaction log file
+            File[] afterSecondSSTable = dataFolder.listFiles(pathname -> !pathname.isDirectory());
+
+            int numNewFiles = afterSecondSSTable.length - beforeSecondSSTable.length;
+            assertEquals(numNewFiles - 1, sstable2.getAllFilePaths().size()); // new files except for transaction log file
+
+            tmpFiles = getTemporaryFiles(dataFolder);
+            assertNotNull(tmpFiles);
+            assertEquals(numNewFiles - 1, tmpFiles.size());
+
+            File ssTable2DataFile = new File(sstable2.descriptor.filenameFor(Component.DATA));
+            File ssTable2IndexFile = new File(sstable2.descriptor.filenameFor(Component.PRIMARY_INDEX));
+
+            assertTrue(tmpFiles.contains(ssTable2DataFile));
+            assertTrue(tmpFiles.contains(ssTable2IndexFile));
+
+            List<File> files = directories.sstableLister(Directories.OnTxnErr.THROW).listFiles();
+            List<File> filesNoTmp = directories.sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true).listFiles();
+            assertNotNull(files);
+            assertNotNull(filesNoTmp);
+
+            assertTrue(files.contains(ssTable2DataFile));
+            assertTrue(files.contains(ssTable2IndexFile));
+
+            assertFalse(filesNoTmp.contains(ssTable2DataFile));
+            assertFalse(filesNoTmp.contains(ssTable2IndexFile));
+
+            log.finish();
+
+            //Now it should be empty since the transaction has finished
+            tmpFiles = getTemporaryFiles(dataFolder);
+            assertNotNull(tmpFiles);
+            assertEquals(0, tmpFiles.size());
+
+            filesNoTmp = directories.sstableLister(Directories.OnTxnErr.THROW).skipTemporary(true).listFiles();
+            assertNotNull(filesNoTmp);
+            assertTrue(filesNoTmp.contains(ssTable2DataFile));
+            assertTrue(filesNoTmp.contains(ssTable2IndexFile));
+
+            sstable1.selfRef().release();
+            sstable2.selfRef().release();
+        }
+    }
+
+    @Test
+    public void testGetTemporaryFilesMultipleFolders() throws IOException
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+
+        File origiFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        File dataFolder1 = new File(origiFolder, "1");
+        File dataFolder2 = new File(origiFolder, "2");
+        Files.createDirectories(dataFolder1.toPath());
+        Files.createDirectories(dataFolder2.toPath());
+
+        SSTableReader[] sstables = { sstable(dataFolder1, cfs, 0, 128),
+                                     sstable(dataFolder1, cfs, 1, 128),
+                                     sstable(dataFolder2, cfs, 2, 128),
+                                     sstable(dataFolder2, cfs, 3, 128)
+        };
+
+        // they should all have the same number of files since they are created in the same way
+        int numSStableFiles = sstables[0].getAllFilePaths().size();
+
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        for (File dataFolder : new File[] {dataFolder1, dataFolder2})
+        {
+            Set<File> tmpFiles = getTemporaryFiles(dataFolder);
+            assertNotNull(tmpFiles);
+            assertEquals(0, tmpFiles.size());
+        }
+
+        LogTransaction.SSTableTidier[] tidiers = { log.obsoleted(sstables[0]), log.obsoleted(sstables[2]) };
+
+        log.trackNew(sstables[1]);
+        log.trackNew(sstables[3]);
+
+        for (File dataFolder : new File[] {dataFolder1, dataFolder2})
+        {
+            Set<File> tmpFiles = getTemporaryFiles(dataFolder);
+            assertNotNull(tmpFiles);
+            assertEquals(numSStableFiles, tmpFiles.size());
+        }
+
+        log.finish();
+
+        for (File dataFolder : new File[] {dataFolder1, dataFolder2})
+        {
+            Set<File> tmpFiles = getTemporaryFiles(dataFolder);
+            assertNotNull(tmpFiles);
+            assertEquals(numSStableFiles, tmpFiles.size());
+        }
+
+        sstables[0].markObsolete(tidiers[0]);
+        sstables[2].markObsolete(tidiers[1]);
+
+        Arrays.stream(sstables).forEach(s -> s.selfRef().release());
+        LogTransaction.waitForDeletions();
+
+        for (File dataFolder : new File[] {dataFolder1, dataFolder2})
+        {
+            Set<File> tmpFiles = getTemporaryFiles(dataFolder);
+            assertNotNull(tmpFiles);
+            assertEquals(0, tmpFiles.size());
+        }
+
+    }
+
+    @Test
+    public void testWrongChecksumLastLine() throws IOException
+    {
+        testCorruptRecord((t, s) ->
+                          { // Fake a commit with invalid checksum
+                              long now = System.currentTimeMillis();
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("commit:[%d,0,0][%d]", now, 12345678L)));
+                          },
+                          true);
+    }
+
+    @Test
+    public void testWrongChecksumSecondFromLastLine() throws IOException
+    {
+        testCorruptRecord((t, s) ->
+                          { // Fake two lines with invalid checksum
+                              long now = System.currentTimeMillis();
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("add:[ma-3-big,%d,4][%d]", now, 12345678L)));
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("commit:[%d,0,0][%d]", now, 12345678L)));
+                          },
+                          false);
+    }
+
+    @Test
+    public void testWrongChecksumLastLineMissingFile() throws IOException
+    {
+        testCorruptRecord((t, s) ->
+                          { // Fake a commit with invalid checksum and also delete one of the old files
+                              for (String filePath : s.getAllFilePaths())
+                              {
+                                  if (filePath.endsWith("Data.db"))
+                                  {
+                                      assertTrue(FileUtils.delete(filePath));
+                                      assertNull(t.txnFile().syncFolder(null));
+                                      break;
+                                  }
+                              }
+
+                              long now = System.currentTimeMillis();
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("commit:[%d,0,0][%d]", now, 12345678L)));
+                          },
+                          false);
+    }
+
+    @Test
+    public void testWrongChecksumLastLineWrongRecordFormat() throws IOException
+    {
+        testCorruptRecord((t, s) ->
+                          { // Fake a commit with invalid checksum and a wrong record format (extra spaces)
+                              long now = System.currentTimeMillis();
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("commit:[%d ,0 ,0 ][%d]", now, 12345678L)));
+                          },
+                          true);
+    }
+
+    @Test
+    public void testMissingChecksumLastLine() throws IOException
+    {
+        testCorruptRecord((t, s) ->
+                          {
+                              // Fake a commit without a checksum
+                              long now = System.currentTimeMillis();
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("commit:[%d,0,0]", now)));
+                          },
+                          true);
+    }
+
+    @Test
+    public void testMissingChecksumSecondFromLastLine() throws IOException
+    {
+        testCorruptRecord((t, s) ->
+                          { // Fake two lines without a checksum
+                              long now = System.currentTimeMillis();
+                              t.logFiles().forEach( f -> FileUtils.append(f, String.format("add:[ma-3-big,%d,4]", now)));
+                              t.logFiles().forEach(f -> FileUtils.append(f, String.format("commit:[%d,0,0]", now)));
+                          },
+                          false);
+    }
+
+    @Test
+    public void testUnparsableLastRecord() throws IOException
+    {
+        testCorruptRecord((t, s) -> t.logFiles().forEach(f -> FileUtils.append(f, "commit:[a,b,c][12345678]")), true);
+    }
+
+    @Test
+    public void testUnparsableFirstRecord() throws IOException
+    {
+        testCorruptRecord((t, s) -> t.logFiles().forEach(f -> {
+            List<String> lines = FileUtils.readLines(f);
+            lines.add(0, "add:[a,b,c][12345678]");
+            FileUtils.replace(f, lines.toArray(new String[lines.size()]));
+        }), false);
+    }
+
+    private static void testCorruptRecord(BiConsumer<LogTransaction, SSTableReader> modifier, boolean isRecoverable) throws IOException
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstableOld = sstable(dataFolder, cfs, 0, 128);
+        SSTableReader sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+        // simulate tracking sstables with a committed transaction except the checksum will be wrong
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstableNew);
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstableOld);
+
+        // Modify the transaction log or disk state for sstableOld
+        modifier.accept(log, sstableOld);
+
+        // Sync the folder to make sure that later on removeUnfinishedLeftovers picks up
+        // any changes to the txn files done by the modifier
+        assertNull(log.txnFile().syncFolder(null));
+
+        assertNull(log.complete(null));
+
+        sstableOld.selfRef().release();
+        sstableNew.selfRef().release();
+
+        // The files on disk, for old files make sure to exclude the files that were deleted by the modifier
+        Set<String> newFiles = sstableNew.getAllFilePaths().stream().collect(Collectors.toSet());
+        Set<String> oldFiles = sstableOld.getAllFilePaths().stream().filter(p -> new File(p).exists()).collect(Collectors.toSet());
+
+        //This should filter as in progress since the last record is corrupt
+        assertFiles(newFiles, getTemporaryFiles(dataFolder));
+        assertFiles(oldFiles, getFinalFiles(dataFolder));
+
+        if (isRecoverable)
+        { // the corruption is recoverable but the commit record is unreadable so the transaction is still in progress
+
+            //This should remove new files
+            LogTransaction.removeUnfinishedLeftovers(cfs.metadata);
+
+            // make sure to exclude the old files that were deleted by the modifier
+            assertFiles(dataFolder.getPath(), oldFiles);
+        }
+        else
+        { // if an intermediate line was also modified, it should ignore the tx log file
+
+            //This should not remove any files
+            LogTransaction.removeUnfinishedLeftovers(cfs.metadata);
+
+            assertFiles(dataFolder.getPath(), Sets.newHashSet(Iterables.concat(newFiles,
+                                                                               oldFiles,
+                                                                               log.logFilePaths())));
+        }
+
+        // make sure to run the tidier to avoid any leaks in the logs
+        tidier.run();
+    }
+
+    @Test
+    public void testObsoletedDataFileUpdateTimeChanged() throws IOException
+    {
+        testObsoletedFilesChanged(sstable ->
+                                  {
+                                      // increase the modification time of the Data file
+                                      for (String filePath : sstable.getAllFilePaths())
+                                      {
+                                          if (filePath.endsWith("Data.db"))
+                                              assertTrue(new File(filePath).setLastModified(System.currentTimeMillis() + 60000)); //one minute later
+                                      }
+                                  });
+    }
+
+    private static void testObsoletedFilesChanged(Consumer<SSTableReader> modifier) throws IOException
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstableOld = sstable(dataFolder, cfs, 0, 128);
+        SSTableReader sstableNew = sstable(dataFolder, cfs, 1, 128);
+
+        // simulate tracking sstables with a committed transaction except the checksum will be wrong
+        LogTransaction log = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(log);
+
+        log.trackNew(sstableNew);
+        LogTransaction.SSTableTidier tidier = log.obsoleted(sstableOld);
+
+        //modify the old sstable files
+        modifier.accept(sstableOld);
+
+        //Fake a commit
+        log.txnFile().commit();
+
+        //This should not remove the old files
+        LogTransaction.removeUnfinishedLeftovers(cfs.metadata);
+
+        assertFiles(dataFolder.getPath(), Sets.newHashSet(Iterables.concat(
+                                                                          sstableNew.getAllFilePaths(),
+                                                                          sstableOld.getAllFilePaths(),
+                                                                          log.logFilePaths())));
+
+        sstableOld.selfRef().release();
+        sstableNew.selfRef().release();
+
+        // complete the transaction to avoid LEAK errors
+        assertNull(log.complete(null));
+
+        assertFiles(dataFolder.getPath(), Sets.newHashSet(Iterables.concat(sstableNew.getAllFilePaths(),
+                                                                           sstableOld.getAllFilePaths(),
+                                                                           log.logFilePaths())));
+
+        // make sure to run the tidier to avoid any leaks in the logs
+        tidier.run();
+    }
+
+    @Test
+    public void testGetTemporaryFilesSafeAfterObsoletion() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+        LogTransaction logs = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(logs);
+
+        LogTransaction.SSTableTidier tidier = logs.obsoleted(sstable);
+
+        logs.finish();
+
+        sstable.markObsolete(tidier);
+        sstable.selfRef().release();
+
+        // This should race with the asynchronous deletion of txn log files
+        // It doesn't matter what it returns but it should not throw because the txn
+        // was completed before deleting files (i.e. releasing sstables)
+        for (int i = 0; i < 200; i++)
+            getTemporaryFiles(dataFolder);
+    }
+
+    @Test
+    public void testGetTemporaryFilesThrowsIfCompletingAfterObsoletion() throws Throwable
+    {
+        ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE);
+        File dataFolder = new Directories(cfs.metadata).getDirectoryForNewSSTables();
+        SSTableReader sstable = sstable(dataFolder, cfs, 0, 128);
+
+        LogTransaction logs = new LogTransaction(OperationType.COMPACTION);
+        assertNotNull(logs);
+
+        LogTransaction.SSTableTidier tidier = logs.obsoleted(sstable);
+
+        sstable.markObsolete(tidier);
+        sstable.selfRef().release();
+
+        LogTransaction.waitForDeletions();
+
+        try
+        {
+            // This should race with the asynchronous deletion of txn log files
+            // it should throw because we are violating the requirement that a transaction must
+            // finish before deleting files (i.e. releasing sstables)
+            getTemporaryFiles(dataFolder);
+            fail("Expected runtime exception");
+        }
+        catch(RuntimeException e)
+        {
+            //pass as long as the cause is not an assertion
+            assertFalse(e.getCause() instanceof AssertionError);
+        }
+
+        logs.finish();
+    }
+
+    private static SSTableReader sstable(File dataFolder, ColumnFamilyStore cfs, int generation, int size) throws IOException
+    {
+        Descriptor descriptor = new Descriptor(dataFolder, cfs.keyspace.getName(), cfs.getTableName(), generation);
+        Set<Component> components = ImmutableSet.of(Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.TOC);
+        for (Component component : components)
+        {
+            File file = new File(descriptor.filenameFor(component));
+            if (!file.exists())
+                assertTrue(file.createNewFile());
+            try (RandomAccessFile raf = new RandomAccessFile(file, "rw"))
+            {
+                raf.setLength(size);
+            }
+        }
+
+        SegmentedFile dFile = new BufferedSegmentedFile(new ChannelProxy(new File(descriptor.filenameFor(Component.DATA))), RandomAccessReader.DEFAULT_BUFFER_SIZE, 0);
+        SegmentedFile iFile = new BufferedSegmentedFile(new ChannelProxy(new File(descriptor.filenameFor(Component.PRIMARY_INDEX))), RandomAccessReader.DEFAULT_BUFFER_SIZE, 0);
+
+        SerializationHeader header = SerializationHeader.make(cfs.metadata, Collections.emptyList());
+        StatsMetadata metadata = (StatsMetadata) new MetadataCollector(cfs.metadata.comparator)
+                                                 .finalizeMetadata(cfs.metadata.partitioner.getClass().getCanonicalName(), 0.01f, -1, header)
+                                                 .get(MetadataType.STATS);
+        SSTableReader reader = SSTableReader.internalOpen(descriptor,
+                                                          components,
+                                                          cfs.metadata,
+                                                          dFile,
+                                                          iFile,
+                                                          MockSchema.indexSummary.sharedCopy(),
+                                                          new AlwaysPresentFilter(),
+                                                          1L,
+                                                          metadata,
+                                                          SSTableReader.OpenReason.NORMAL,
+                                                          header);
+        reader.first = reader.last = MockSchema.readerBounds(generation);
+        return reader;
+    }
+
+    private static void assertFiles(String dirPath, Set<String> expectedFiles) throws IOException
+    {
+        assertFiles(dirPath, expectedFiles, false);
+    }
+
+    private static void assertFiles(String dirPath, Set<String> expectedFiles, boolean excludeNonExistingFiles) throws IOException
+    {
+        LogTransaction.waitForDeletions();
+
+        File dir = new File(dirPath).getCanonicalFile();
+        File[] files = dir.listFiles();
+        if (files != null)
+        {
+            for (File file : files)
+            {
+                if (file.isDirectory())
+                    continue;
+
+                String filePath = file.getPath();
+                assertTrue(String.format("%s not in [%s]", filePath, expectedFiles), expectedFiles.contains(filePath));
+                expectedFiles.remove(filePath);
+            }
+        }
+
+        if (excludeNonExistingFiles)
+        {
+            for (String filePath : expectedFiles)
+            {
+                File file = new File(filePath);
+                if (!file.exists())
+                    expectedFiles.remove(filePath);
+            }
+        }
+
+        assertTrue(expectedFiles.toString(), expectedFiles.isEmpty());
+    }
+
+    // Check either that a temporary file is expected to exist (in the existingFiles) or that
+    // it does not exist any longer (on Windows we need to check File.exists() because a list
+    // might return a file as existing even if it does not)
+    private static void assertFiles(Iterable<String> existingFiles, Set<File> temporaryFiles)
+    {
+        for (String filePath : existingFiles)
+        {
+            File file = new File(filePath);
+            assertTrue(filePath, temporaryFiles.contains(file));
+            temporaryFiles.remove(file);
+        }
+
+        for (File file : temporaryFiles)
+        {
+            if (!file.exists())
+                temporaryFiles.remove(file);
+        }
+
+        assertTrue(temporaryFiles.toString(), temporaryFiles.isEmpty());
+    }
+
+    static Set<File> getTemporaryFiles(File folder)
+    {
+        return listFiles(folder, Directories.FileType.TEMPORARY);
+    }
+
+    static Set<File> getFinalFiles(File folder)
+    {
+        return listFiles(folder, Directories.FileType.FINAL);
+    }
+
+    static Set<File> listFiles(File folder, Directories.FileType... types)
+    {
+        Collection<Directories.FileType> match = Arrays.asList(types);
+        return new LogAwareFileLister(folder.toPath(),
+                                      (file, type) -> match.contains(type),
+                                      Directories.OnTxnErr.IGNORE).list()
+                       .stream()
+                       .map(f -> {
+                           try
+                           {
+                               return f.getCanonicalFile();
+                           }
+                           catch (IOException e)
+                           {
+                               throw new IOError(e);
+                           }
+                       })
+                       .collect(Collectors.toSet());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
new file mode 100644
index 0000000..4fbbb36
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java

@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.MockSchema;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
+import org.apache.cassandra.db.compaction.CompactionController;
+import org.apache.cassandra.db.compaction.CompactionIterator;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.io.sstable.CQLSSTableWriter;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableRewriter;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests to simulate real transactions such as compactions and flushing
+ * using SSTableRewriter, ColumnFamilyStore, LifecycleTransaction, TransactionLogs, etc
+ */
+public class RealTransactionsTest extends SchemaLoader
+{
+    private static final String KEYSPACE = "TransactionLogsTest";
+    private static final String REWRITE_FINISHED_CF = "RewriteFinished";
+    private static final String REWRITE_ABORTED_CF = "RewriteAborted";
+    private static final String FLUSH_CF = "Flush";
+
+    @BeforeClass
+    public static void setUp()
+    {
+        MockSchema.cleanup();
+
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, REWRITE_FINISHED_CF),
+                                    SchemaLoader.standardCFMD(KEYSPACE, REWRITE_ABORTED_CF),
+                                    SchemaLoader.standardCFMD(KEYSPACE, FLUSH_CF));
+    }
+
+    @Test
+    public void testRewriteFinished() throws IOException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(REWRITE_FINISHED_CF);
+
+        SSTableReader oldSSTable = getSSTable(cfs, 1);
+        LifecycleTransaction txn = cfs.getTracker().tryModify(oldSSTable, OperationType.COMPACTION);
+        SSTableReader newSSTable = replaceSSTable(cfs, txn, false);
+        LogTransaction.waitForDeletions();
+
+        // both sstables are in the same folder
+        assertFiles(oldSSTable.descriptor.directory.getPath(), new HashSet<>(newSSTable.getAllFilePaths()));
+        assertFiles(newSSTable.descriptor.directory.getPath(), new HashSet<>(newSSTable.getAllFilePaths()));
+    }
+
+    @Test
+    public void testRewriteAborted() throws IOException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(REWRITE_ABORTED_CF);
+
+        SSTableReader oldSSTable = getSSTable(cfs, 1);
+        LifecycleTransaction txn = cfs.getTracker().tryModify(oldSSTable, OperationType.COMPACTION);
+
+        replaceSSTable(cfs, txn, true);
+        LogTransaction.waitForDeletions();
+
+        assertFiles(oldSSTable.descriptor.directory.getPath(), new HashSet<>(oldSSTable.getAllFilePaths()));
+    }
+
+    @Test
+    public void testFlush() throws IOException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(FLUSH_CF);
+
+        SSTableReader ssTableReader = getSSTable(cfs, 100);
+
+        String dataFolder = cfs.getLiveSSTables().iterator().next().descriptor.directory.getPath();
+        assertFiles(dataFolder, new HashSet<>(ssTableReader.getAllFilePaths()));
+    }
+
+    private SSTableReader getSSTable(ColumnFamilyStore cfs, int numPartitions) throws IOException
+    {
+        createSSTable(cfs, numPartitions);
+
+        Set<SSTableReader> sstables = new HashSet<>(cfs.getLiveSSTables());
+        assertEquals(1, sstables.size());
+        return sstables.iterator().next();
+    }
+
+    private void createSSTable(ColumnFamilyStore cfs, int numPartitions) throws IOException
+    {
+        cfs.truncateBlocking();
+
+        String schema = "CREATE TABLE \"%s\".\"%s\" (key ascii, name ascii, val ascii, val1 ascii, PRIMARY KEY (key, name))";
+        String query = "INSERT INTO \"%s\".\"%s\" (key, name, val) VALUES (?, ?, ?)";
+
+        try (CQLSSTableWriter writer = CQLSSTableWriter.builder()
+                                                       .inDirectory(cfs.getDirectories().getDirectoryForNewSSTables())
+                                                       .forTable(String.format(schema, cfs.keyspace.getName(), cfs.name))
+                                                       .using(String.format(query, cfs.keyspace.getName(), cfs.name))
+                                                       .build())
+        {
+            for (int j = 0; j < numPartitions; j ++)
+                writer.addRow(String.format("key%d", j), "col1", "0");
+        }
+
+        cfs.loadNewSSTables();
+    }
+
+    private SSTableReader replaceSSTable(ColumnFamilyStore cfs, LifecycleTransaction txn, boolean fail)
+    {
+        List<SSTableReader> newsstables = null;
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (CompactionController controller = new CompactionController(cfs, txn.originals(), cfs.gcBefore(FBUtilities.nowInSeconds())))
+        {
+            try (SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false);
+                 AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(txn.originals());
+                 CompactionIterator ci = new CompactionIterator(txn.opType(), scanners.scanners, controller, nowInSec, txn.opId())
+            )
+            {
+                long lastCheckObsoletion = System.nanoTime();
+                File directory = txn.originals().iterator().next().descriptor.directory;
+                Descriptor desc = Descriptor.fromFilename(cfs.getSSTablePath(directory));
+                CFMetaData metadata = Schema.instance.getCFMetaData(desc);
+                rewriter.switchWriter(SSTableWriter.create(metadata,
+                                                           desc,
+                                                           0,
+                                                           0,
+                                                           0,
+                                                           SerializationHeader.make(cfs.metadata, txn.originals()),
+                                                           txn));
+                while (ci.hasNext())
+                {
+                    rewriter.append(ci.next());
+
+                    if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
+                    {
+                        controller.maybeRefreshOverlaps();
+                        lastCheckObsoletion = System.nanoTime();
+                    }
+                }
+
+                if (!fail)
+                    newsstables = rewriter.finish();
+                else
+                    rewriter.abort();
+            }
+        }
+
+        assertTrue(fail || newsstables != null);
+
+        if (newsstables != null)
+        {
+            Assert.assertEquals(1, newsstables.size());
+            return newsstables.iterator().next();
+        }
+
+        return null;
+    }
+
+    private void assertFiles(String dirPath, Set<String> expectedFiles)
+    {
+        File dir = new File(dirPath);
+        for (File file : dir.listFiles())
+        {
+            if (file.isDirectory())
+                continue;
+
+            String filePath = file.getPath();
+            assertTrue(filePath, expectedFiles.contains(filePath));
+            expectedFiles.remove(filePath);
+        }
+
+        assertTrue(expectedFiles.isEmpty());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java b/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
index 04b4e4a..de1e640 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java

@@ -41,7 +41,6 @@
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.io.sstable.SSTableDeletingTask;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.notifications.*;
 import org.apache.cassandra.utils.concurrent.OpOrder;
@@ -155,6 +154,9 @@
 
         Assert.assertEquals(3, tracker.view.get().sstables.size());
 
+        for (SSTableReader reader : readers)
+            Assert.assertTrue(reader.isKeyCacheSetup());
+
         Assert.assertEquals(17 + 121 + 9, cfs.metric.liveDiskSpaceUsed.getCount());
     }
 
@@ -174,8 +176,11 @@
 
         Assert.assertEquals(3, tracker.view.get().sstables.size());
 
+        for (SSTableReader reader : readers)
+            Assert.assertTrue(reader.isKeyCacheSetup());
+
         Assert.assertEquals(17 + 121 + 9, cfs.metric.liveDiskSpaceUsed.getCount());
-        Assert.assertEquals(3, listener.senders.size());
+        Assert.assertEquals(1, listener.senders.size());
         Assert.assertEquals(tracker, listener.senders.get(0));
         Assert.assertTrue(listener.received.get(0) instanceof SSTableAddedNotification);
         DatabaseDescriptor.setIncrementalBackupsEnabled(backups);
@@ -185,9 +190,9 @@
     public void testDropSSTables()
     {
         testDropSSTables(false);
-        SSTableDeletingTask.waitForDeletions();
+        LogTransaction.waitForDeletions();
         testDropSSTables(true);
-        SSTableDeletingTask.waitForDeletions();
+        LogTransaction.waitForDeletions();
     }
 
     private void testDropSSTables(boolean invalidate)
@@ -201,57 +206,54 @@
                                                              MockSchema.sstable(2, 71, true, cfs));
         tracker.addInitialSSTables(copyOf(readers));
 
-        try
+        try (LifecycleTransaction txn = tracker.tryModify(readers.get(0), OperationType.COMPACTION))
         {
-            SSTableDeletingTask.pauseDeletions(true);
-            try (LifecycleTransaction txn = tracker.tryModify(readers.get(0), OperationType.COMPACTION))
+            if (invalidate)
             {
-                if (invalidate)
-                    cfs.invalidate(false);
-                else
-                    tracker.dropSSTables();
-                Assert.assertEquals(95, cfs.metric.totalDiskSpaceUsed.getCount());
-                Assert.assertEquals(9, cfs.metric.liveDiskSpaceUsed.getCount());
-                Assert.assertEquals(1, tracker.getView().sstables.size());
-            }
-            if (!invalidate)
-            {
-                Assert.assertEquals(1, tracker.getView().sstables.size());
-                Assert.assertEquals(readers.get(0), Iterables.getFirst(tracker.getView().sstables, null));
-                Assert.assertEquals(1, readers.get(0).selfRef().globalCount());
-                Assert.assertFalse(readers.get(0).isMarkedCompacted());
-                for (SSTableReader reader : readers.subList(1, 3))
-                {
-                    Assert.assertEquals(0, reader.selfRef().globalCount());
-                    Assert.assertTrue(reader.isMarkedCompacted());
-                }
-                Assert.assertNull(tracker.dropSSTables(new Predicate<SSTableReader>() {
-                                                           public boolean apply(SSTableReader reader)
-                                                           {
-                                                               return reader != readers.get(0);
-                                                           }
-                                                       },
-                                                       OperationType.UNKNOWN,
-                                                       null));
-                Assert.assertEquals(1, tracker.getView().sstables.size());
-                Assert.assertEquals(3, listener.received.size());
-                Assert.assertEquals(tracker, listener.senders.get(0));
-                Assert.assertEquals(2, ((SSTableListChangedNotification) listener.received.get(0)).removed.size());
-                Assert.assertEquals(0, ((SSTableListChangedNotification) listener.received.get(0)).added.size());
-                Assert.assertEquals(9, cfs.metric.liveDiskSpaceUsed.getCount());
-                readers.get(0).selfRef().release();
+                cfs.invalidate(false);
             }
             else
             {
-                Assert.assertEquals(0, tracker.getView().sstables.size());
-                Assert.assertEquals(0, cfs.metric.liveDiskSpaceUsed.getCount());
-                for (SSTableReader reader : readers)
-                    Assert.assertTrue(reader.isMarkedCompacted());
+                tracker.dropSSTables();
+                LogTransaction.waitForDeletions();
             }
+            Assert.assertEquals(9, cfs.metric.totalDiskSpaceUsed.getCount());
+            Assert.assertEquals(9, cfs.metric.liveDiskSpaceUsed.getCount());
+            Assert.assertEquals(1, tracker.getView().sstables.size());
         }
-        finally
+        if (!invalidate)
         {
-            SSTableDeletingTask.pauseDeletions(false);
+            Assert.assertEquals(1, tracker.getView().sstables.size());
+            Assert.assertEquals(readers.get(0), Iterables.getFirst(tracker.getView().sstables, null));
+            Assert.assertEquals(1, readers.get(0).selfRef().globalCount());
+            Assert.assertFalse(readers.get(0).isMarkedCompacted());
+            for (SSTableReader reader : readers.subList(1, 3))
+            {
+                Assert.assertEquals(0, reader.selfRef().globalCount());
+                Assert.assertTrue(reader.isMarkedCompacted());
+            }
+
+            Assert.assertNull(tracker.dropSSTables(reader -> reader != readers.get(0), OperationType.UNKNOWN, null));
+
+            Assert.assertEquals(1, tracker.getView().sstables.size());
+            Assert.assertEquals(3, listener.received.size());
+            Assert.assertEquals(tracker, listener.senders.get(0));
+            Assert.assertTrue(listener.received.get(0) instanceof SSTableDeletingNotification);
+            Assert.assertTrue(listener.received.get(1) instanceof  SSTableDeletingNotification);
+            Assert.assertTrue(listener.received.get(2) instanceof SSTableListChangedNotification);
+            Assert.assertEquals(readers.get(1), ((SSTableDeletingNotification) listener.received.get(0)).deleting);
+            Assert.assertEquals(readers.get(2), ((SSTableDeletingNotification)listener.received.get(1)).deleting);
+            Assert.assertEquals(2, ((SSTableListChangedNotification) listener.received.get(2)).removed.size());
+            Assert.assertEquals(0, ((SSTableListChangedNotification) listener.received.get(2)).added.size());
+            Assert.assertEquals(9, cfs.metric.liveDiskSpaceUsed.getCount());
+            readers.get(0).selfRef().release();
+        }
+        else
+        {
+            Assert.assertEquals(0, tracker.getView().sstables.size());
+            Assert.assertEquals(0, cfs.metric.liveDiskSpaceUsed.getCount());
+            for (SSTableReader reader : readers)
+                Assert.assertTrue(reader.isMarkedCompacted());
         }
     }
 
@@ -292,19 +294,17 @@
         Assert.assertTrue(tracker.getView().flushingMemtables.contains(prev1));
         Assert.assertEquals(2, tracker.getView().flushingMemtables.size());
 
-        tracker.replaceFlushed(prev1, null);
+        tracker.replaceFlushed(prev1, Collections.emptyList());
         Assert.assertEquals(1, tracker.getView().flushingMemtables.size());
         Assert.assertTrue(tracker.getView().flushingMemtables.contains(prev2));
 
         SSTableReader reader = MockSchema.sstable(0, 10, false, cfs);
-        tracker.replaceFlushed(prev2, reader);
+        tracker.replaceFlushed(prev2, Collections.singleton(reader));
         Assert.assertEquals(1, tracker.getView().sstables.size());
-        Assert.assertEquals(1, tracker.getView().premature.size());
-        tracker.permitCompactionOfFlushed(reader);
-        Assert.assertEquals(0, tracker.getView().premature.size());
         Assert.assertEquals(1, listener.received.size());
-        Assert.assertEquals(reader, ((SSTableAddedNotification) listener.received.get(0)).added);
+        Assert.assertEquals(singleton(reader), ((SSTableAddedNotification) listener.received.get(0)).added);
         listener.received.clear();
+        Assert.assertTrue(reader.isKeyCacheSetup());
         Assert.assertEquals(10, cfs.metric.liveDiskSpaceUsed.getCount());
 
         // test invalidated CFS
@@ -316,13 +316,14 @@
         tracker.markFlushing(prev1);
         reader = MockSchema.sstable(0, 10, true, cfs);
         cfs.invalidate(false);
-        tracker.replaceFlushed(prev1, reader);
-        tracker.permitCompactionOfFlushed(reader);
+        tracker.replaceFlushed(prev1, singleton(reader));
         Assert.assertEquals(0, tracker.getView().sstables.size());
         Assert.assertEquals(0, tracker.getView().flushingMemtables.size());
         Assert.assertEquals(0, cfs.metric.liveDiskSpaceUsed.getCount());
-        Assert.assertEquals(1, ((SSTableListChangedNotification) listener.received.get(0)).removed.size());
-        Assert.assertEquals(reader, (((SSTableDeletingNotification) listener.received.get(1)).deleting));
+        Assert.assertEquals(3, listener.received.size());
+        Assert.assertEquals(singleton(reader), ((SSTableAddedNotification) listener.received.get(0)).added);
+        Assert.assertTrue(listener.received.get(1) instanceof SSTableDeletingNotification);
+        Assert.assertEquals(1, ((SSTableListChangedNotification) listener.received.get(2)).removed.size());
         DatabaseDescriptor.setIncrementalBackupsEnabled(backups);
     }
 
@@ -334,8 +335,8 @@
         Tracker tracker = new Tracker(null, false);
         MockListener listener = new MockListener(false);
         tracker.subscribe(listener);
-        tracker.notifyAdded(r1);
-        Assert.assertEquals(r1, ((SSTableAddedNotification) listener.received.get(0)).added);
+        tracker.notifyAdded(singleton(r1));
+        Assert.assertEquals(singleton(r1), ((SSTableAddedNotification) listener.received.get(0)).added);
         listener.received.clear();
         tracker.notifyDeleting(r1);
         Assert.assertEquals(r1, ((SSTableDeletingNotification) listener.received.get(0)).deleting);
@@ -355,8 +356,8 @@
         MockListener failListener = new MockListener(true);
         tracker.subscribe(failListener);
         tracker.subscribe(listener);
-        Assert.assertNotNull(tracker.notifyAdded(r1, null));
-        Assert.assertEquals(r1, ((SSTableAddedNotification) listener.received.get(0)).added);
+        Assert.assertNotNull(tracker.notifyAdded(singleton(r1), null));
+        Assert.assertEquals(singleton(r1), ((SSTableAddedNotification) listener.received.get(0)).added);
         listener.received.clear();
         Assert.assertNotNull(tracker.notifySSTablesChanged(singleton(r1), singleton(r2), OperationType.COMPACTION, null));
         Assert.assertEquals(singleton(r1), ((SSTableListChangedNotification) listener.received.get(0)).removed);

diff --git a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
index 5706598..436bf18 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java

@@ -21,10 +21,12 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.Set;
 
 import com.google.common.base.Function;
 import com.google.common.base.Predicates;
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -33,7 +35,7 @@
 import org.apache.cassandra.MockSchema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Memtable;
-import org.apache.cassandra.db.RowPosition;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
@@ -59,16 +61,17 @@
         {
             for (int j = i ; j < 5 ; j++)
             {
-                RowPosition min = MockSchema.readerBounds(i);
-                RowPosition max = MockSchema.readerBounds(j);
+                PartitionPosition min = MockSchema.readerBounds(i);
+                PartitionPosition max = MockSchema.readerBounds(j);
                 for (boolean minInc : new boolean[] { true })//, false} )
                 {
                     for (boolean maxInc : new boolean[] { true })//, false} )
                     {
                         if (i == j && !(minInc && maxInc))
                             continue;
-                        AbstractBounds<RowPosition> bounds = AbstractBounds.bounds(min, minInc, max, maxInc);
-                        List<SSTableReader> r = initialView.sstablesInBounds(bounds.left, bounds.right);
+
+                        AbstractBounds<PartitionPosition> bounds = AbstractBounds.bounds(min, minInc, max, maxInc);
+                        List<SSTableReader> r = ImmutableList.copyOf(initialView.liveSSTablesInBounds(bounds.left, bounds.right));
                         Assert.assertEquals(String.format("%d(%s) %d(%s)", i, minInc, j, maxInc), j - i + (minInc ? 0 : -1) + (maxInc ? 1 : 0), r.size());
                     }
                 }
@@ -80,7 +83,7 @@
     public void testCompaction()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
-        View initialView = fakeView(0, 5, cfs);
+        View initialView = fakeView(0, 5, cfs, true);
         View cur = initialView;
         List<SSTableReader> readers = ImmutableList.copyOf(initialView.sstables);
         Assert.assertTrue(View.permitCompacting(readers).apply(cur));
@@ -97,8 +100,8 @@
         Assert.assertFalse(View.permitCompacting(readers.subList(1, 2)).apply(cur));
         Assert.assertTrue(readers.subList(2, 5).containsAll(copyOf(cur.getUncompacting(readers))));
         Assert.assertEquals(3, copyOf(cur.getUncompacting(readers)).size());
-        Assert.assertTrue(cur.nonCompactingSStables().containsAll(readers.subList(2, 5)));
-        Assert.assertEquals(3, cur.nonCompactingSStables().size());
+        Assert.assertTrue(ImmutableSet.copyOf(cur.select(SSTableSet.NONCOMPACTING)).containsAll(readers.subList(2, 5)));
+        Assert.assertEquals(3, ImmutableSet.copyOf(cur.select(SSTableSet.NONCOMPACTING)).size());
 
         // check marking already compacting readers fails with an exception
         testFailure(View.updateCompacting(emptySet(), readers.subList(0, 1)), cur);
@@ -126,9 +129,13 @@
         testFailure(View.updateCompacting(copyOf(readers.subList(0, 2)), emptySet()), cur);
         Assert.assertTrue(copyOf(concat(readers.subList(0, 1), readers.subList(2, 5))).containsAll(copyOf(cur.getUncompacting(readers))));
         Assert.assertEquals(4, copyOf(cur.getUncompacting(readers)).size());
-        Assert.assertTrue(cur.nonCompactingSStables().containsAll(readers.subList(2, 5)));
-        Assert.assertTrue(cur.nonCompactingSStables().containsAll(readers.subList(0, 1)));
-        Assert.assertEquals(4, cur.nonCompactingSStables().size());
+        Set<SSTableReader> nonCompacting = ImmutableSet.copyOf(cur.select(SSTableSet.NONCOMPACTING));
+        Assert.assertTrue(nonCompacting.containsAll(readers.subList(2, 5)));
+        Assert.assertTrue(nonCompacting.containsAll(readers.subList(0, 1)));
+        Assert.assertEquals(4, nonCompacting.size());
+
+        for (SSTableReader sstable : initialView.sstables)
+            sstable.selfRef().release();
     }
 
     private static void testFailure(Function<View, ?> function, View view)
@@ -191,7 +198,7 @@
         Assert.assertEquals(memtable3, cur.getCurrentMemtable());
 
         SSTableReader sstable = MockSchema.sstable(1, cfs);
-        cur = View.replaceFlushed(memtable1, sstable).apply(cur);
+        cur = View.replaceFlushed(memtable1, Collections.singleton(sstable)).apply(cur);
         Assert.assertEquals(0, cur.flushingMemtables.size());
         Assert.assertEquals(1, cur.liveMemtables.size());
         Assert.assertEquals(memtable3, cur.getCurrentMemtable());
@@ -201,13 +208,18 @@
 
     static View fakeView(int memtableCount, int sstableCount, ColumnFamilyStore cfs)
     {
+        return fakeView(memtableCount, sstableCount, cfs, false);
+    }
+
+    static View fakeView(int memtableCount, int sstableCount, ColumnFamilyStore cfs, boolean keepRef)
+    {
         List<Memtable> memtables = new ArrayList<>();
         List<SSTableReader> sstables = new ArrayList<>();
         for (int i = 0 ; i < memtableCount ; i++)
             memtables.add(MockSchema.memtable(cfs));
         for (int i = 0 ; i < sstableCount ; i++)
-            sstables.add(MockSchema.sstable(i, cfs));
+            sstables.add(MockSchema.sstable(i, keepRef, cfs));
         return new View(ImmutableList.copyOf(memtables), Collections.<Memtable>emptyList(), Helpers.identityMap(sstables),
-                        Collections.<SSTableReader>emptySet(), Collections.<SSTableReader>emptySet(), SSTableIntervalTree.build(sstables));
+                        Collections.<SSTableReader, SSTableReader>emptyMap(), SSTableIntervalTree.build(sstables));
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java
index 25c1c7d..cc66e71 100644
--- a/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java

@@ -19,6 +19,7 @@
 package org.apache.cassandra.db.marshal;
 
 import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
 import java.util.*;
 
 import org.junit.BeforeClass;
@@ -28,14 +29,14 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.*;
 
@@ -67,9 +68,8 @@
         AbstractType<?> composite = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{BytesType.instance, TimeUUIDType.instance, IntegerType.instance}));
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDCOMPOSITE, composite));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.denseCFMD(KEYSPACE1, CF_STANDARDCOMPOSITE, composite));
     }
 
     @Test
@@ -187,23 +187,28 @@
         ByteBuffer cname5 = createCompositeKey("test2", uuids[1], 42, false);
 
         ByteBuffer key = ByteBufferUtil.bytes("k");
-        Mutation rm = new Mutation(KEYSPACE1, key);
-        addColumn(rm, cname5);
-        addColumn(rm, cname1);
-        addColumn(rm, cname4);
-        addColumn(rm, cname2);
-        addColumn(rm, cname3);
-        rm.applyUnsafe();
 
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("k"), CF_STANDARDCOMPOSITE, System.currentTimeMillis()));
+        long ts = FBUtilities.timestampMicros();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname5).add("val", "cname5").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname1).add("val", "cname1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname4).add("val", "cname4").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname2).add("val", "cname2").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname3).add("val", "cname3").build().applyUnsafe();
 
-        Iterator<Cell> iter = cf.getSortedColumns().iterator();
+        ColumnDefinition cdef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
 
-        assert iter.next().name().toByteBuffer().equals(cname1);
-        assert iter.next().name().toByteBuffer().equals(cname2);
-        assert iter.next().name().toByteBuffer().equals(cname3);
-        assert iter.next().name().toByteBuffer().equals(cname4);
-        assert iter.next().name().toByteBuffer().equals(cname5);
+        ImmutableBTreePartition readPartition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        Iterator<Row> iter = readPartition.iterator();
+
+        compareValues(iter.next().getCell(cdef), "cname1");
+        compareValues(iter.next().getCell(cdef), "cname2");
+        compareValues(iter.next().getCell(cdef), "cname3");
+        compareValues(iter.next().getCell(cdef), "cname4");
+        compareValues(iter.next().getCell(cdef), "cname5");
+    }
+    private void compareValues(Cell c, String r) throws CharacterCodingException
+    {
+        assert ByteBufferUtil.string(c.value()).equals(r) : "Expected: {" + ByteBufferUtil.string(c.value()) + "} got: {" + r + "}";
     }
 
     @Test
@@ -214,14 +219,16 @@
             TypeParser.parse("CompositeType");
             fail("Shouldn't work");
         }
-        catch (ConfigurationException | SyntaxException e) {}
+        catch (ConfigurationException e) {}
+        catch (SyntaxException e) {}
 
         try
         {
             TypeParser.parse("CompositeType()");
             fail("Shouldn't work");
         }
-        catch (ConfigurationException | SyntaxException e) {}
+        catch (ConfigurationException e) {}
+        catch (SyntaxException e) {}
     }
 
     @Test
@@ -267,11 +274,6 @@
         }
     }
 
-    private void addColumn(Mutation rm, ByteBuffer cname)
-    {
-        rm.add(CF_STANDARDCOMPOSITE, CellNames.simpleDense(cname), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-    }
-
     private ByteBuffer createCompositeKey(String s, UUID uuid, int i, boolean lastIsOne)
     {
         ByteBuffer bytes = ByteBufferUtil.bytes(s);

diff --git a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
index 1a6ddc9..0a3c39c 100644
--- a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java

@@ -19,8 +19,9 @@
 package org.apache.cassandra.db.marshal;
 
 import java.nio.ByteBuffer;
-import java.util.Iterator;
+import java.nio.charset.CharacterCodingException;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.UUID;
 
@@ -30,13 +31,13 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.*;
 
@@ -70,9 +71,8 @@
         AbstractType<?> dynamicComposite = DynamicCompositeType.getInstance(aliases);
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDDYNCOMPOSITE, dynamicComposite));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.denseCFMD(KEYSPACE1, CF_STANDARDDYNCOMPOSITE, dynamicComposite));
     }
 
     @Test
@@ -192,23 +192,27 @@
         ByteBuffer cname5 = createDynamicCompositeKey("test2", uuids[1], 42, false);
 
         ByteBuffer key = ByteBufferUtil.bytes("k");
-        Mutation rm = new Mutation(KEYSPACE1, key);
-        addColumn(rm, cname5);
-        addColumn(rm, cname1);
-        addColumn(rm, cname4);
-        addColumn(rm, cname2);
-        addColumn(rm, cname3);
-        rm.applyUnsafe();
+        long ts = FBUtilities.timestampMicros();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname5).add("val", "cname5").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname1).add("val", "cname1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname4).add("val", "cname4").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname2).add("val", "cname2").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname3).add("val", "cname3").build().applyUnsafe();
 
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("k"), CF_STANDARDDYNCOMPOSITE, System.currentTimeMillis()));
+        ColumnDefinition cdef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
 
-        Iterator<Cell> iter = cf.getSortedColumns().iterator();
+        ImmutableBTreePartition readPartition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        Iterator<Row> iter = readPartition.iterator();
 
-        assert iter.next().name().toByteBuffer().equals(cname1);
-        assert iter.next().name().toByteBuffer().equals(cname2);
-        assert iter.next().name().toByteBuffer().equals(cname3);
-        assert iter.next().name().toByteBuffer().equals(cname4);
-        assert iter.next().name().toByteBuffer().equals(cname5);
+        compareValues(iter.next().getCell(cdef), "cname1");
+        compareValues(iter.next().getCell(cdef), "cname2");
+        compareValues(iter.next().getCell(cdef), "cname3");
+        compareValues(iter.next().getCell(cdef), "cname4");
+        compareValues(iter.next().getCell(cdef), "cname5");
+    }
+    private void compareValues(Cell c, String r) throws CharacterCodingException
+    {
+        assert ByteBufferUtil.string(c.value()).equals(r) : "Expected: {" + ByteBufferUtil.string(c.value()) + "} got: {" + r + "}";
     }
 
     @Test
@@ -224,23 +228,24 @@
         ByteBuffer cname5 = createDynamicCompositeKey("test2", uuids[1], 42, false, true);
 
         ByteBuffer key = ByteBufferUtil.bytes("kr");
-        Mutation rm = new Mutation(KEYSPACE1, key);
-        addColumn(rm, cname5);
-        addColumn(rm, cname1);
-        addColumn(rm, cname4);
-        addColumn(rm, cname2);
-        addColumn(rm, cname3);
-        rm.apply();
 
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk("kr"), CF_STANDARDDYNCOMPOSITE, System.currentTimeMillis()));
+        long ts = FBUtilities.timestampMicros();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname5).add("val", "cname5").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname1).add("val", "cname1").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname4).add("val", "cname4").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname2).add("val", "cname2").build().applyUnsafe();
+        new RowUpdateBuilder(cfs.metadata, ts, key).clustering(cname3).add("val", "cname3").build().applyUnsafe();
 
-        Iterator<Cell> iter = cf.getSortedColumns().iterator();
+        ColumnDefinition cdef = cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val"));
 
-        assert iter.next().name().toByteBuffer().equals(cname5);
-        assert iter.next().name().toByteBuffer().equals(cname4);
-        assert iter.next().name().toByteBuffer().equals(cname1); // null UUID < reversed value
-        assert iter.next().name().toByteBuffer().equals(cname3);
-        assert iter.next().name().toByteBuffer().equals(cname2);
+        ImmutableBTreePartition readPartition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
+        Iterator<Row> iter = readPartition.iterator();
+
+        compareValues(iter.next().getCell(cdef), "cname5");
+        compareValues(iter.next().getCell(cdef), "cname4");
+        compareValues(iter.next().getCell(cdef), "cname1"); // null UUID < reversed value
+        compareValues(iter.next().getCell(cdef), "cname3");
+        compareValues(iter.next().getCell(cdef), "cname2");
     }
 
     @Test
@@ -309,11 +314,6 @@
         assert !TypeParser.parse("DynamicCompositeType(a => BytesType)").isCompatibleWith(TypeParser.parse("DynamicCompositeType(a => BytesType, b => AsciiType)"));
     }
 
-    private void addColumn(Mutation rm, ByteBuffer cname)
-    {
-        rm.add(CF_STANDARDDYNCOMPOSITE, CellNames.simpleDense(cname), ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-    }
-
     private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne)
     {
         return createDynamicCompositeKey(s, uuid, i, lastIsOne, false);

diff --git a/test/unit/org/apache/cassandra/db/marshal/EmptyTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/EmptyTypeTest.java
new file mode 100644
index 0000000..423e304
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/EmptyTypeTest.java

@@ -0,0 +1,94 @@
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.mockito.Mockito;
+
+
+public class EmptyTypeTest
+{
+    @Test
+    public void isFixed()
+    {
+        Assert.assertEquals(0, EmptyType.instance.valueLengthIfFixed());
+    }
+
+    @Test
+    public void writeEmptyAllowed()
+    {
+        DataOutputPlus output = Mockito.mock(DataOutputPlus.class);
+        EmptyType.instance.writeValue(ByteBufferUtil.EMPTY_BYTE_BUFFER, output);
+
+        Mockito.verifyNoInteractions(output);
+    }
+
+    @Test
+    public void writeNonEmpty()
+    {
+        DataOutputPlus output = Mockito.mock(DataOutputPlus.class);
+        ByteBuffer rejected = ByteBuffer.wrap("this better fail".getBytes());
+
+        boolean thrown = false;
+        try
+        {
+            EmptyType.instance.writeValue(rejected, output);
+        }
+        catch (AssertionError e)
+        {
+            thrown = true;
+        }
+        Assert.assertTrue("writeValue did not reject non-empty input", thrown);
+
+        Mockito.verifyNoInteractions(output);
+    }
+
+    @Test
+    public void read()
+    {
+        DataInputPlus input = Mockito.mock(DataInputPlus.class);
+
+        ByteBuffer buffer = EmptyType.instance.readValue(input);
+        Assert.assertNotNull(buffer);
+        Assert.assertFalse("empty type returned back non-empty data", buffer.hasRemaining());
+
+        buffer = EmptyType.instance.readValue(input, 42);
+        Assert.assertNotNull(buffer);
+        Assert.assertFalse("empty type returned back non-empty data", buffer.hasRemaining());
+
+        Mockito.verifyNoInteractions(input);
+    }
+
+    @Test
+    public void decompose()
+    {
+        ByteBuffer buffer = EmptyType.instance.decompose(null);
+        Assert.assertEquals(0, buffer.remaining());
+    }
+
+    @Test
+    public void composeEmptyInput()
+    {
+        Void result = EmptyType.instance.compose(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        Assert.assertNull(result);
+    }
+
+    @Test
+    public void composeNonEmptyInput()
+    {
+        try
+        {
+            EmptyType.instance.compose(ByteBufferUtil.bytes("should fail"));
+            Assert.fail("compose is expected to reject non-empty values, but did not");
+        }
+        catch (MarshalException e) {
+            Assert.assertTrue(e.getMessage().startsWith("EmptyType only accept empty values"));
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/marshal/TimeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/TimeTypeTest.java
index 699c805..d61d2c6 100644
--- a/test/unit/org/apache/cassandra/db/marshal/TimeTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/TimeTypeTest.java

@@ -23,11 +23,14 @@
 import java.util.concurrent.TimeUnit;
 
 import org.junit.Test;
+
+import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.serializers.TimeSerializer;
 
-public class TimeTypeTest
+public class TimeTypeTest extends CQLTester
 {
-    @Test public void TestComparison()
+    @Test
+    public void testComparison()
     {
         Long t1 = TimeSerializer.timeStringToLong("01:00:00.123456789");
         Long t2 = new Long((1L * 60L * 60L * 1000L * 1000L * 1000L) + 123456789);
@@ -58,4 +61,24 @@
         b2 = TimeSerializer.instance.serialize(0L);
         assert TimeType.instance.compare(b1, b2) > 0 : "Failed > comparison against max range.";
     }
+
+    /**
+     * Check that time is correctly recognized and validated
+     */
+    @Test
+    public void testTime() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id int, cl int, ts time, PRIMARY KEY(id, cl));");
+
+        execute("INSERT into %s (id, cl, ts) values (1, 1, 42000000000);");
+        execute("INSERT into %s (id, cl, ts) values (1, 2, '42000000000');");
+        execute("INSERT into %s (id, cl, ts) values (1, 3, '00:00:42.000000000');");
+        execute("INSERT into %s (id, cl, ts) values (1, 4, ?);", 42000000000L);
+
+        assertRows(execute("SELECT * FROM %s"),
+                   row(1, 1, 42000000000L),
+                   row(1, 2, 42000000000L),
+                   row(1, 3, 42000000000L),
+                   row(1, 4, 42000000000L));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/db/marshal/TypeCompareTest.java b/test/unit/org/apache/cassandra/db/marshal/TypeCompareTest.java
index fae04a2..04b030e 100644
--- a/test/unit/org/apache/cassandra/db/marshal/TypeCompareTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/TypeCompareTest.java

@@ -93,48 +93,6 @@
     }
 
     @Test
-    public void testByte()
-    {
-        Random rng = new Random();
-        ByteBuffer[] data = new ByteBuffer[Byte.MAX_VALUE];
-        for (int i = 0; i < data.length; i++)
-        {
-            data[i] = ByteBuffer.allocate(1);
-            rng.nextBytes(data[i].array());
-        }
-
-        Arrays.sort(data, ByteType.instance);
-
-        for (int i = 1; i < data.length; i++)
-        {
-            byte b0 = data[i - 1].get(data[i - 1].position());
-            byte b1 = data[i].get(data[i].position());
-            assert b0 <= b1;
-        }
-    }
-
-    @Test
-    public void testShort()
-    {
-        Random rng = new Random();
-        ByteBuffer[] data = new ByteBuffer[1000];
-        for (int i = 0; i < data.length; i++)
-        {
-            data[i] = ByteBuffer.allocate(2);
-            rng.nextBytes(data[i].array());
-        }
-
-        Arrays.sort(data, ShortType.instance);
-
-        for (int i = 1; i < data.length; i++)
-        {
-            short s0 = data[i - 1].getShort(data[i - 1].position());
-            short s1 = data[i].getShort(data[i].position());
-            assert s0 <= s1;
-        }
-    }
-
-    @Test
     public void testInt()
     {
         Random rng = new Random();

diff --git a/test/unit/org/apache/cassandra/db/marshal/TypeParserTest.java b/test/unit/org/apache/cassandra/db/marshal/TypeParserTest.java
index 6581fc7..808a680 100644
--- a/test/unit/org/apache/cassandra/db/marshal/TypeParserTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/TypeParserTest.java

@@ -19,8 +19,12 @@
 package org.apache.cassandra.db.marshal;
 
 import org.junit.Test;
+
+import static org.junit.Assert.assertSame;
 import static org.junit.Assert.fail;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 
@@ -29,7 +33,7 @@
     @Test
     public void testParse() throws ConfigurationException, SyntaxException
     {
-        AbstractType type;
+        AbstractType<?> type;
 
         type = TypeParser.parse(null);
         assert type == BytesType.instance;
@@ -40,12 +44,6 @@
         type = TypeParser.parse("    ");
         assert type == BytesType.instance;
 
-        type = TypeParser.parse("ByteType");
-        assert type == ByteType.instance;
-
-        type = TypeParser.parse("ShortType");
-        assert type == ShortType.instance;
-
         type = TypeParser.parse("LongType");
         assert type == LongType.instance;
 
@@ -60,11 +58,11 @@
 
         type = TypeParser.parse("LongType(reversed=true)");
         assert type == ReversedType.getInstance(LongType.instance);
-        assert ((ReversedType)type).baseType == LongType.instance;
+        assert ((ReversedType<?>)type).baseType == LongType.instance;
 
         type = TypeParser.parse("LongType(reversed)");
         assert type == ReversedType.getInstance(LongType.instance);
-        assert ((ReversedType)type).baseType == LongType.instance;
+        assert ((ReversedType<?>)type).baseType == LongType.instance;
     }
 
     @Test
@@ -75,13 +73,29 @@
             TypeParser.parse("y");
             fail("Should not pass");
         }
-        catch (ConfigurationException | SyntaxException e) {}
+        catch (ConfigurationException e) {}
+        catch (SyntaxException e) {}
 
         try
         {
             TypeParser.parse("LongType(reversed@)");
             fail("Should not pass");
         }
-        catch (ConfigurationException | SyntaxException e) {}
+        catch (ConfigurationException e) {}
+        catch (SyntaxException e) {}
+    }
+
+    @Test
+    public void testParsePartitionerOrder() throws ConfigurationException, SyntaxException
+    {
+        for (IPartitioner partitioner: new IPartitioner[] { Murmur3Partitioner.instance,
+                                                            ByteOrderedPartitioner.instance,
+                                                            RandomPartitioner.instance,
+                                                            OrderPreservingPartitioner.instance })
+        {
+            AbstractType<?> type = partitioner.partitionOrdering();
+            assertSame(type, TypeParser.parse(type.toString()));
+        }
+        assertSame(DatabaseDescriptor.getPartitioner().partitionOrdering(), TypeParser.parse("PartitionerDefinedOrder"));
     }
 }

diff --git a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java
index 5ebeb64..ed5e2bf 100644
--- a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java

@@ -65,32 +65,6 @@
     }
 
     @Test
-    public void testValidShort()
-    {
-        ShortType.instance.validate(Util.getBytes((short) 5));
-        ShortType.instance.validate(Util.getBytes(Short.MAX_VALUE));
-    }
-
-    @Test(expected = MarshalException.class)
-    public void testInvalidShort()
-    {
-        ShortType.instance.validate(Util.getBytes(2057022603));
-    }
-
-    @Test
-    public void testValidByte()
-    {
-        ByteType.instance.validate(Util.getBytes((byte) 5));
-        ByteType.instance.validate(Util.getBytes(Byte.MAX_VALUE));
-    }
-
-    @Test(expected = MarshalException.class)
-    public void testInvalidByte()
-    {
-        ByteType.instance.validate(Util.getBytes(2057022603));
-    }
-
-    @Test
     public void testValidUtf8() throws UnsupportedEncodingException
     {
         assert Character.MAX_CODE_POINT == 0x0010ffff;

diff --git a/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java b/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java
new file mode 100644
index 0000000..f4c93d6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/partition/PartitionImplementationTest.java

@@ -0,0 +1,524 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partition;
+
+import static org.junit.Assert.*;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.Slice.Bound;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.partitions.AbstractBTreePartition;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.rows.Row.Deletion;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.SearchIterator;
+
+public class PartitionImplementationTest
+{
+    private static final String KEYSPACE = "PartitionImplementationTest";
+    private static final String CF = "Standard";
+
+    private static final int ENTRIES = 250;
+    private static final int TESTS = 1000;
+    private static final int KEY_RANGE = ENTRIES * 5;
+
+    private static final int TIMESTAMP = KEY_RANGE + 1;
+
+    private static CFMetaData cfm;
+    private Random rand = new Random(2);
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+
+        cfm = CFMetaData.Builder.create(KEYSPACE, CF)
+                                        .addPartitionKey("pk", AsciiType.instance)
+                                        .addClusteringColumn("ck", AsciiType.instance)
+                                        .addRegularColumn("col", AsciiType.instance)
+                                        .addStaticColumn("static_col", AsciiType.instance)
+                                        .build();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+    }
+
+    private List<Row> generateRows()
+    {
+        List<Row> content = new ArrayList<>();
+        Set<Integer> keysUsed = new HashSet<>();
+        for (int i = 0; i < ENTRIES; ++i)
+        {
+            int rk;
+            do
+            {
+                rk = rand.nextInt(KEY_RANGE);
+            }
+            while (!keysUsed.add(rk));
+            content.add(makeRow(clustering(rk), "Col" + rk));
+        }
+        return content; // not sorted
+    }
+
+    Row makeRow(Clustering clustering, String colValue)
+    {
+        ColumnDefinition defCol = cfm.getColumnDefinition(new ColumnIdentifier("col", true));
+        Row.Builder row = BTreeRow.unsortedBuilder(TIMESTAMP);
+        row.newRow(clustering);
+        row.addCell(BufferCell.live(cfm, defCol, TIMESTAMP, ByteBufferUtil.bytes(colValue)));
+        return row.build();
+    }
+
+    Row makeStaticRow()
+    {
+        ColumnDefinition defCol = cfm.getColumnDefinition(new ColumnIdentifier("static_col", true));
+        Row.Builder row = BTreeRow.unsortedBuilder(TIMESTAMP);
+        row.newRow(Clustering.STATIC_CLUSTERING);
+        row.addCell(BufferCell.live(cfm, defCol, TIMESTAMP, ByteBufferUtil.bytes("static value")));
+        return row.build();
+    }
+
+    private List<Unfiltered> generateMarkersOnly()
+    {
+        return addMarkers(new ArrayList<>());
+    }
+
+    private List<Unfiltered> generateUnfiltereds()
+    {
+        List<Unfiltered> content = new ArrayList<>(generateRows());
+        return addMarkers(content);
+    }
+
+    List<Unfiltered> addMarkers(List<Unfiltered> content)
+    {
+        List<RangeTombstoneMarker> markers = new ArrayList<>();
+        Set<Integer> delTimes = new HashSet<>();
+        for (int i = 0; i < ENTRIES / 10; ++i)
+        {
+            int delTime;
+            do
+            {
+                delTime = rand.nextInt(KEY_RANGE);
+            }
+            while (!delTimes.add(delTime));
+
+            int start = rand.nextInt(KEY_RANGE);
+            DeletionTime dt = new DeletionTime(delTime, delTime);
+            RangeTombstoneMarker open = RangeTombstoneBoundMarker.inclusiveOpen(false, clustering(start).getRawValues(), dt);
+            int end = start + rand.nextInt((KEY_RANGE - start) / 4 + 1);
+            RangeTombstoneMarker close = RangeTombstoneBoundMarker.inclusiveClose(false, clustering(end).getRawValues(), dt);
+            markers.add(open);
+            markers.add(close);
+        }
+        markers.sort(cfm.comparator);
+
+        RangeTombstoneMarker toAdd = null;
+        Set<DeletionTime> open = new HashSet<>();
+        DeletionTime current = DeletionTime.LIVE;
+        for (RangeTombstoneMarker marker : markers)
+        {
+            if (marker.isOpen(false))
+            {
+                DeletionTime delTime = marker.openDeletionTime(false);
+                open.add(delTime);
+                if (delTime.supersedes(current))
+                {
+                    if (toAdd != null)
+                    {
+                        if (cfm.comparator.compare(toAdd, marker) != 0)
+                            content.add(toAdd);
+                        else
+                        {
+                            // gotta join
+                            current = toAdd.isClose(false) ? toAdd.closeDeletionTime(false) : DeletionTime.LIVE;
+                        }
+                    }
+                    if (current != DeletionTime.LIVE)
+                        marker = RangeTombstoneBoundaryMarker.makeBoundary(false, marker.openBound(false).invert(), marker.openBound(false), current, delTime);
+                    toAdd = marker;
+                    current = delTime;
+                }
+            }
+            else
+            {
+                assert marker.isClose(false);
+                DeletionTime delTime = marker.closeDeletionTime(false);
+                boolean removed = open.remove(delTime);
+                assert removed;
+                if (current.equals(delTime))
+                {
+                    if (toAdd != null)
+                    {
+                        if (cfm.comparator.compare(toAdd, marker) != 0)
+                            content.add(toAdd);
+                        else
+                        {
+                            // gotta join
+                            current = toAdd.closeDeletionTime(false);
+                            marker = new RangeTombstoneBoundMarker(marker.closeBound(false), current);
+                        }
+                    }
+                    DeletionTime best = open.stream().max(DeletionTime::compareTo).orElse(DeletionTime.LIVE);
+                    if (best != DeletionTime.LIVE)
+                        marker = RangeTombstoneBoundaryMarker.makeBoundary(false, marker.closeBound(false), marker.closeBound(false).invert(), current, best);
+                    toAdd = marker;
+                    current = best;
+                }
+            }
+        }
+        content.add(toAdd);
+        assert current == DeletionTime.LIVE;
+        assert open.isEmpty();
+        return content;
+    }
+
+    private Clustering clustering(int i)
+    {
+        return cfm.comparator.make(String.format("Row%06d", i));
+    }
+
+    private void test(Supplier<Collection<? extends Unfiltered>> content, Row staticRow)
+    {
+        for (int i = 0; i<TESTS; ++i)
+        {
+            try
+            {
+                rand = new Random(i);
+                testIter(content, staticRow);
+            }
+            catch (Throwable t)
+            {
+                throw new AssertionError("Test failed with seed " + i, t);
+            }
+        }
+    }
+
+    private void testIter(Supplier<Collection<? extends Unfiltered>> contentSupplier, Row staticRow)
+    {
+        NavigableSet<Clusterable> sortedContent = new TreeSet<Clusterable>(cfm.comparator);
+        sortedContent.addAll(contentSupplier.get());
+        AbstractBTreePartition partition;
+        try (UnfilteredRowIterator iter = new Util.UnfilteredSource(cfm, Util.dk("pk"), staticRow, sortedContent.stream().map(x -> (Unfiltered) x).iterator()))
+        {
+            partition = ImmutableBTreePartition.create(iter);
+        }
+
+        ColumnDefinition defCol = cfm.getColumnDefinition(new ColumnIdentifier("col", true));
+        ColumnFilter cf = ColumnFilter.selectionBuilder().add(defCol).build();
+        Function<? super Clusterable, ? extends Clusterable> colFilter = x -> x instanceof Row ? ((Row) x).filter(cf, cfm) : x;
+        Slices slices = Slices.with(cfm.comparator, Slice.make(clustering(KEY_RANGE / 4), clustering(KEY_RANGE * 3 / 4)));
+        Slices multiSlices = makeSlices();
+
+        // lastRow
+        assertRowsEqual((Row) get(sortedContent.descendingSet(), x -> x instanceof Row),
+                        partition.lastRow());
+        // get(static)
+        assertRowsEqual(staticRow,
+                        partition.getRow(Clustering.STATIC_CLUSTERING));
+
+        // get
+        for (int i=0; i < KEY_RANGE; ++i)
+        {
+            Clustering cl = clustering(i);
+            assertRowsEqual(getRow(sortedContent, cl),
+                            partition.getRow(cl));
+        }
+        // isEmpty
+        assertEquals(sortedContent.isEmpty() && staticRow == null,
+                     partition.isEmpty());
+        // hasRows
+        assertEquals(sortedContent.stream().anyMatch(x -> x instanceof Row),
+                     partition.hasRows());
+
+        // iterator
+        assertIteratorsEqual(sortedContent.stream().filter(x -> x instanceof Row).iterator(),
+                             partition.iterator());
+
+        // unfiltered iterator
+        assertIteratorsEqual(sortedContent.iterator(),
+                             partition.unfilteredIterator());
+
+        // unfiltered iterator
+        assertIteratorsEqual(sortedContent.iterator(),
+                             partition.unfilteredIterator(ColumnFilter.all(cfm), Slices.ALL, false));
+        // column-filtered
+        assertIteratorsEqual(sortedContent.stream().map(colFilter).iterator(),
+                             partition.unfilteredIterator(cf, Slices.ALL, false));
+        // sliced
+        assertIteratorsEqual(slice(sortedContent, slices.get(0)),
+                             partition.unfilteredIterator(ColumnFilter.all(cfm), slices, false));
+        assertIteratorsEqual(streamOf(slice(sortedContent, slices.get(0))).map(colFilter).iterator(),
+                             partition.unfilteredIterator(cf, slices, false));
+        // randomly multi-sliced
+        assertIteratorsEqual(slice(sortedContent, multiSlices),
+                             partition.unfilteredIterator(ColumnFilter.all(cfm), multiSlices, false));
+        assertIteratorsEqual(streamOf(slice(sortedContent, multiSlices)).map(colFilter).iterator(),
+                             partition.unfilteredIterator(cf, multiSlices, false));
+        // reversed
+        assertIteratorsEqual(sortedContent.descendingIterator(),
+                             partition.unfilteredIterator(ColumnFilter.all(cfm), Slices.ALL, true));
+        assertIteratorsEqual(sortedContent.descendingSet().stream().map(colFilter).iterator(),
+                             partition.unfilteredIterator(cf, Slices.ALL, true));
+        assertIteratorsEqual(invert(slice(sortedContent, slices.get(0))),
+                             partition.unfilteredIterator(ColumnFilter.all(cfm), slices, true));
+        assertIteratorsEqual(streamOf(invert(slice(sortedContent, slices.get(0)))).map(colFilter).iterator(),
+                             partition.unfilteredIterator(cf, slices, true));
+        assertIteratorsEqual(invert(slice(sortedContent, multiSlices)),
+                             partition.unfilteredIterator(ColumnFilter.all(cfm), multiSlices, true));
+        assertIteratorsEqual(streamOf(invert(slice(sortedContent, multiSlices))).map(colFilter).iterator(),
+                             partition.unfilteredIterator(cf, multiSlices, true));
+
+        // search iterator
+        testSearchIterator(sortedContent, partition, ColumnFilter.all(cfm), false);
+        testSearchIterator(sortedContent, partition, cf, false);
+        testSearchIterator(sortedContent, partition, ColumnFilter.all(cfm), true);
+        testSearchIterator(sortedContent, partition, cf, true);
+
+        // sliceable iter
+        testSliceableIterator(sortedContent, partition, ColumnFilter.all(cfm), false);
+        testSliceableIterator(sortedContent, partition, cf, false);
+        testSliceableIterator(sortedContent, partition, ColumnFilter.all(cfm), true);
+        testSliceableIterator(sortedContent, partition, cf, true);
+    }
+
+    void testSearchIterator(NavigableSet<Clusterable> sortedContent, Partition partition, ColumnFilter cf, boolean reversed)
+    {
+        SearchIterator<Clustering, Row> searchIter = partition.searchIterator(cf, reversed);
+        int pos = reversed ? KEY_RANGE : 0;
+        int mul = reversed ? -1 : 1;
+        boolean started = false;
+        while (pos < KEY_RANGE)
+        {
+            int skip = rand.nextInt(KEY_RANGE / 10);
+            pos += skip * mul;
+            Clustering cl = clustering(pos);
+            Row row = searchIter.next(cl);  // returns row with deletion, incl. empty row with deletion
+            if (row == null && skip == 0 && started)    // allowed to return null if already reported row
+                continue;
+            started = true;
+            Row expected = getRow(sortedContent, cl);
+            assertEquals(expected == null, row == null);
+            if (row == null)
+                continue;
+            assertRowsEqual(expected.filter(cf, cfm), row);
+        }
+    }
+
+    Slices makeSlices()
+    {
+        int pos = 0;
+        Slices.Builder builder = new Slices.Builder(cfm.comparator);
+        while (pos <= KEY_RANGE)
+        {
+            int skip = rand.nextInt(KEY_RANGE / 10) * (rand.nextInt(3) + 2 / 3); // increased chance of getting 0
+            pos += skip;
+            int sz = rand.nextInt(KEY_RANGE / 10) + (skip == 0 ? 1 : 0);    // if start is exclusive need at least sz 1
+            Clustering start = clustering(pos);
+            pos += sz;
+            Clustering end = clustering(pos);
+            Slice slice = Slice.make(skip == 0 ? Bound.exclusiveStartOf(start) : Bound.inclusiveStartOf(start), Bound.inclusiveEndOf(end));
+            builder.add(slice);
+        }
+        return builder.build();
+    }
+
+    void testSliceableIterator(NavigableSet<Clusterable> sortedContent, AbstractBTreePartition partition, ColumnFilter cf, boolean reversed)
+    {
+        Function<? super Clusterable, ? extends Clusterable> colFilter = x -> x instanceof Row ? ((Row) x).filter(cf, cfm) : x;
+        Slices slices = makeSlices();
+        try (SliceableUnfilteredRowIterator sliceableIter = partition.sliceableUnfilteredIterator(cf, reversed))
+        {
+            for (Slice slice : (Iterable<Slice>) () -> directed(slices, reversed))
+                assertIteratorsEqual(streamOf(directed(slice(sortedContent, slice), reversed)).map(colFilter).iterator(),
+                                     sliceableIter.slice(slice));
+        }
+
+        // Try using sliceable as unfiltered iterator
+        try (SliceableUnfilteredRowIterator sliceableIter = partition.sliceableUnfilteredIterator(cf, reversed))
+        {
+            assertIteratorsEqual((reversed ? sortedContent.descendingSet() : sortedContent).
+                                     stream().map(colFilter).iterator(),
+                                 sliceableIter);
+        }
+    }
+
+    private<T> Iterator<T> invert(Iterator<T> slice)
+    {
+        Deque<T> dest = new LinkedList<>();
+        Iterators.addAll(dest, slice);
+        return dest.descendingIterator();
+    }
+
+    private Iterator<Clusterable> slice(NavigableSet<Clusterable> sortedContent, Slices slices)
+    {
+        return Iterators.concat(streamOf(slices).map(slice -> slice(sortedContent, slice)).iterator());
+    }
+
+    private Iterator<Clusterable> slice(NavigableSet<Clusterable> sortedContent, Slice slice)
+    {
+        // Slice bounds are inclusive bounds, equal only to markers. Matched markers should be returned as one-sided boundaries.
+        RangeTombstoneMarker prev = (RangeTombstoneMarker) sortedContent.headSet(slice.start(), true).descendingSet().stream().filter(x -> x instanceof RangeTombstoneMarker).findFirst().orElse(null);
+        RangeTombstoneMarker next = (RangeTombstoneMarker) sortedContent.tailSet(slice.end(), true).stream().filter(x -> x instanceof RangeTombstoneMarker).findFirst().orElse(null);
+        Iterator<Clusterable> result = sortedContent.subSet(slice.start(), false, slice.end(), false).iterator();
+        if (prev != null && prev.isOpen(false))
+            result = Iterators.concat(Iterators.singletonIterator(new RangeTombstoneBoundMarker(slice.start(), prev.openDeletionTime(false))), result);
+        if (next != null && next.isClose(false))
+            result = Iterators.concat(result, Iterators.singletonIterator(new RangeTombstoneBoundMarker(slice.end(), next.closeDeletionTime(false))));
+        return result;
+    }
+
+    private Iterator<Slice> directed(Slices slices, boolean reversed)
+    {
+        return directed(slices.iterator(), reversed);
+    }
+
+    private <T> Iterator<T> directed(Iterator<T> iter, boolean reversed)
+    {
+        if (!reversed)
+            return iter;
+        return invert(iter);
+    }
+
+    private <T> Stream<T> streamOf(Iterator<T> iterator)
+    {
+        Iterable<T> iterable = () -> iterator;
+        return streamOf(iterable);
+    }
+
+    <T> Stream<T> streamOf(Iterable<T> iterable)
+    {
+        return StreamSupport.stream(iterable.spliterator(), false);
+    }
+
+    private void assertIteratorsEqual(Iterator<? extends Clusterable> it1, Iterator<? extends Clusterable> it2)
+    {
+        Clusterable[] a1 = (Clusterable[]) Iterators.toArray(it1, Clusterable.class);
+        Clusterable[] a2 = (Clusterable[]) Iterators.toArray(it2, Clusterable.class);
+        if (Arrays.equals(a1, a2))
+            return;
+        String a1s = Stream.of(a1).map(x -> "\n" + (x instanceof Unfiltered ? ((Unfiltered) x).toString(cfm) : x.toString())).collect(Collectors.toList()).toString();
+        String a2s = Stream.of(a2).map(x -> "\n" + (x instanceof Unfiltered ? ((Unfiltered) x).toString(cfm) : x.toString())).collect(Collectors.toList()).toString();
+        assertArrayEquals("Arrays differ. Expected " + a1s + " was " + a2s, a1, a2);
+    }
+
+    private Row getRow(NavigableSet<Clusterable> sortedContent, Clustering cl)
+    {
+        NavigableSet<Clusterable> nexts = sortedContent.tailSet(cl, true);
+        if (nexts.isEmpty())
+            return null;
+        Row row = nexts.first() instanceof Row && cfm.comparator.compare(cl, nexts.first()) == 0 ? (Row) nexts.first() : null;
+        for (Clusterable next : nexts)
+            if (next instanceof RangeTombstoneMarker)
+            {
+                RangeTombstoneMarker rt = (RangeTombstoneMarker) next;
+                if (!rt.isClose(false))
+                    return row;
+                DeletionTime delTime = rt.closeDeletionTime(false);
+                return row == null ? BTreeRow.emptyDeletedRow(cl, Deletion.regular(delTime)) : row.filter(ColumnFilter.all(cfm), delTime, true, cfm);
+            }
+        return row;
+    }
+
+    private void assertRowsEqual(Row expected, Row actual)
+    {
+        try
+        {
+            assertEquals(expected == null, actual == null);
+            if (expected == null)
+                return;
+            assertEquals(expected.clustering(), actual.clustering());
+            assertEquals(expected.deletion(), actual.deletion());
+            assertArrayEquals(Iterables.toArray(expected.cells(), Cell.class), Iterables.toArray(expected.cells(), Cell.class));
+        } catch (Throwable t)
+        {
+            throw new AssertionError(String.format("Row comparison failed, expected %s got %s", expected, actual), t);
+        }
+    }
+
+    private static<T> T get(NavigableSet<T> sortedContent, Predicate<T> test)
+    {
+        return sortedContent.stream().filter(test).findFirst().orElse(null);
+    }
+
+    @Test
+    public void testEmpty()
+    {
+        test(() -> Collections.<Row>emptyList(), null);
+    }
+
+    @Test
+    public void testStaticOnly()
+    {
+        test(() -> Collections.<Row>emptyList(), makeStaticRow());
+    }
+
+    @Test
+    public void testRows()
+    {
+        test(this::generateRows, null);
+    }
+
+    @Test
+    public void testRowsWithStatic()
+    {
+        test(this::generateRows, makeStaticRow());
+    }
+
+    @Test
+    public void testMarkersOnly()
+    {
+        test(this::generateMarkersOnly, null);
+    }
+
+    @Test
+    public void testMarkersWithStatic()
+    {
+        test(this::generateMarkersOnly, makeStaticRow());
+    }
+
+    @Test
+    public void testUnfiltereds()
+    {
+        test(this::generateUnfiltereds, makeStaticRow());
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/db/partition/PartitionUpdateTest.java b/test/unit/org/apache/cassandra/db/partition/PartitionUpdateTest.java
new file mode 100644
index 0000000..2bd685c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/partition/PartitionUpdateTest.java

@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partition;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowAndDeletionMergeIterator;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+import static org.junit.Assert.assertEquals;
+
+
+public class PartitionUpdateTest extends CQLTester
+{
+    @Test
+    public void testOperationCount()
+    {
+        createTable("CREATE TABLE %s (key text, clustering int, a int, s int static, PRIMARY KEY(key, clustering))");
+        CFMetaData cfm = currentTableMetadata();
+
+        long timestamp = FBUtilities.timestampMicros();
+        PartitionUpdate update = new RowUpdateBuilder(cfm, timestamp, "key0").clustering(1).add("a", 1).buildUpdate();
+        Assert.assertEquals(1, update.operationCount());
+
+        update = new RowUpdateBuilder(cfm, timestamp, "key0").buildUpdate();
+        Assert.assertEquals(0, update.operationCount());
+
+        update = new RowUpdateBuilder(cfm, timestamp, "key0").add("s", 1).buildUpdate();
+        Assert.assertEquals(1, update.operationCount());
+
+        update = new RowUpdateBuilder(cfm, timestamp, "key0").add("s", 1).buildUpdate();
+        update = new RowUpdateBuilder(update, timestamp, cfm.params.defaultTimeToLive).clustering(1)
+                                                                                      .add("a", 1)
+                                                                                      .buildUpdate();
+        Assert.assertEquals(2, update.operationCount());
+    }
+
+    @Test
+    public void testMutationSize()
+    {
+        createTable("CREATE TABLE %s (key text, clustering int, a int, s int static, PRIMARY KEY(key, clustering))");
+        CFMetaData cfm = currentTableMetadata();
+
+        PartitionUpdate update = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), "key0").add("s", 1).buildUpdate();
+        int size1 = update.dataSize();
+        Assert.assertEquals(20, size1);
+
+        update = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), "key0").clustering(1).add("a", 2).buildUpdate();
+        int size2 = update.dataSize();
+        Assert.assertTrue(size1 != size2);
+
+        update = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), "key0").buildUpdate();
+        int size3 = update.dataSize();
+        Assert.assertTrue(size1 != size3);
+        Assert.assertTrue(size2 != size3);
+
+    }
+
+    @Test
+    public void testOperationCountWithCompactTable()
+    {
+        createTable("CREATE TABLE %s (key text PRIMARY KEY, a int) WITH COMPACT STORAGE");
+        CFMetaData cfm = currentTableMetadata();
+
+        PartitionUpdate update = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), "key0").add("a", 1)
+                                                                                                 .buildUpdate();
+        Assert.assertEquals(1, update.operationCount());
+
+        update = new RowUpdateBuilder(cfm, FBUtilities.timestampMicros(), "key0").buildUpdate();
+        Assert.assertEquals(0, update.operationCount());
+    }
+
+    /**
+     * Makes sure we merge duplicate rows, see CASSANDRA-15789
+     */
+    @Test
+    public void testDuplicate()
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, v map<text, text>, PRIMARY KEY (pk, ck))");
+        CFMetaData cfm = currentTableMetadata();
+
+        DecoratedKey dk = Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes(1));
+
+        List<Row> rows = new ArrayList<>();
+        Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+        builder.newRow(new Clustering(ByteBufferUtil.bytes(2)));
+        builder.addComplexDeletion(cfm.getColumnDefinition(ByteBufferUtil.bytes("v")), new DeletionTime(2, 1588586647));
+
+        Cell c = BufferCell.live(cfm, cfm.getColumnDefinition(ByteBufferUtil.bytes("v")), 3, ByteBufferUtil.bytes("h"), CellPath.create(ByteBufferUtil.bytes("g")));
+        builder.addCell(c);
+
+        Row r = builder.build();
+        rows.add(r);
+
+        builder.newRow(new Clustering(ByteBufferUtil.bytes(2)));
+        builder.addRowDeletion(new Row.Deletion(new DeletionTime(1588586647, 1), false));
+        r = builder.build();
+        rows.add(r);
+
+        RowAndDeletionMergeIterator rmi = new RowAndDeletionMergeIterator(cfm,
+                                                                          dk,
+                                                                          DeletionTime.LIVE,
+                                                                          ColumnFilter.all(cfm),
+                                                                          Rows.EMPTY_STATIC_ROW,
+                                                                          false,
+                                                                          EncodingStats.NO_STATS,
+                                                                          rows.iterator(),
+                                                                          Collections.emptyIterator(),
+                                                                          true);
+
+        PartitionUpdate pu = PartitionUpdate.fromPre30Iterator(rmi);
+        pu.iterator();
+
+        Mutation m = new Mutation(getCurrentColumnFamilyStore().keyspace.getName(), dk);
+        m.add(pu);
+        m.apply();
+        getCurrentColumnFamilyStore().forceBlockingFlush();
+
+        SSTableReader sst = getCurrentColumnFamilyStore().getLiveSSTables().iterator().next();
+        int count = 0;
+        try (ISSTableScanner scanner = sst.getScanner())
+        {
+            while (scanner.hasNext())
+            {
+                try (UnfilteredRowIterator iter = scanner.next())
+                {
+                    while (iter.hasNext())
+                    {
+                        iter.next();
+                        count++;
+                    }
+                }
+            }
+        }
+        assertEquals(1, count);
+    }
+
+    /**
+     * Makes sure we don't create duplicates when merging 2 partition updates
+     */
+    @Test
+    public void testMerge()
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, v map<text, text>, PRIMARY KEY (pk, ck))");
+        CFMetaData cfm = currentTableMetadata();
+
+        DecoratedKey dk = Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes(1));
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+        builder.newRow(new Clustering(ByteBufferUtil.bytes(2)));
+        builder.addComplexDeletion(cfm.getColumnDefinition(ByteBufferUtil.bytes("v")), new DeletionTime(2, 1588586647));
+        Cell c = BufferCell.live(cfm, cfm.getColumnDefinition(ByteBufferUtil.bytes("v")), 3, ByteBufferUtil.bytes("h"), CellPath.create(ByteBufferUtil.bytes("g")));
+        builder.addCell(c);
+        Row r = builder.build();
+
+        PartitionUpdate p1 = new PartitionUpdate(cfm, dk, cfm.partitionColumns(), 2);
+        p1.add(r);
+
+        builder.newRow(new Clustering(ByteBufferUtil.bytes(2)));
+        builder.addRowDeletion(new Row.Deletion(new DeletionTime(1588586647, 1), false));
+        r = builder.build();
+        PartitionUpdate p2 = new PartitionUpdate(cfm, dk, cfm.partitionColumns(), 2);
+        p2.add(r);
+
+        Mutation m = new Mutation(getCurrentColumnFamilyStore().keyspace.getName(), dk);
+        m.add(PartitionUpdate.merge(Lists.newArrayList(p1, p2)));
+        m.apply();
+
+        getCurrentColumnFamilyStore().forceBlockingFlush();
+
+        SSTableReader sst = getCurrentColumnFamilyStore().getLiveSSTables().iterator().next();
+        int count = 0;
+        try (ISSTableScanner scanner = sst.getScanner())
+        {
+            while (scanner.hasNext())
+            {
+                try (UnfilteredRowIterator iter = scanner.next())
+                {
+                    while (iter.hasNext())
+                    {
+                        iter.next();
+                        count++;
+                    }
+                }
+            }
+        }
+        assertEquals(1, count);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/partitions/PurgeFunctionTest.java b/test/unit/org/apache/cassandra/db/partitions/PurgeFunctionTest.java
new file mode 100644
index 0000000..1dea7f3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/partitions/PurgeFunctionTest.java

@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.partitions;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.function.Predicate;
+
+import com.google.common.collect.Iterators;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ClusteringPrefix.Kind;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public final class PurgeFunctionTest
+{
+    private static final String KEYSPACE = "PurgeFunctionTest";
+    private static final String TABLE = "table";
+
+    private CFMetaData metadata;
+    private DecoratedKey key;
+
+    private static UnfilteredPartitionIterator withoutPurgeableTombstones(UnfilteredPartitionIterator iterator, int gcBefore)
+    {
+        class WithoutPurgeableTombstones extends PurgeFunction
+        {
+            private WithoutPurgeableTombstones()
+            {
+                super(iterator.isForThrift(), FBUtilities.nowInSeconds(), gcBefore, Integer.MAX_VALUE, false, false);
+            }
+
+            protected Predicate<Long> getPurgeEvaluator()
+            {
+                return time -> true;
+            }
+        }
+
+        return Transformation.apply(iterator, new WithoutPurgeableTombstones());
+    }
+
+    @Before
+    public void setUp()
+    {
+        metadata =
+            CFMetaData.Builder
+                      .create(KEYSPACE, TABLE)
+                      .addPartitionKey("pk", UTF8Type.instance)
+                      .addClusteringColumn("ck", UTF8Type.instance)
+                      .build();
+        key = Murmur3Partitioner.instance.decorateKey(bytes("key"));
+    }
+
+    @Test
+    public void testNothingIsPurgeableASC()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 0);
+
+        UnfilteredPartitionIterator expected = iter(false
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        );
+        assertIteratorsEqual(expected, purged);
+    }
+
+    @Test
+    public void testNothingIsPurgeableDESC()
+    {
+        UnfilteredPartitionIterator original = iter(true
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 0);
+
+        UnfilteredPartitionIterator expected = iter(true
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        );
+        assertIteratorsEqual(expected, purged);
+    }
+
+    @Test
+    public void testEverythingIsPurgeableASC()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 2);
+
+        assertTrue(!purged.hasNext());
+    }
+
+    @Test
+    public void testEverythingIsPurgeableDESC()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 2);
+
+        assertTrue(!purged.hasNext());
+    }
+
+    @Test
+    public void testFirstHalfIsPurgeableASC()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 1);
+
+        UnfilteredPartitionIterator expected = iter(false
+        , bound(Kind.INCL_START_BOUND, 1L, 1, "b")
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        );
+        assertIteratorsEqual(expected, purged);
+    }
+
+    @Test
+    public void testFirstHalfIsPurgeableDESC()
+    {
+        UnfilteredPartitionIterator original = iter(true
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0L, 0, 1L, 1, "b")
+        , bound(Kind.INCL_START_BOUND, 0L, 0, "a")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 1);
+
+        UnfilteredPartitionIterator expected = iter(false
+        , bound(Kind.INCL_END_BOUND, 1L, 1, "c")
+        , bound(Kind.INCL_START_BOUND, 1L, 1, "b")
+        );
+        assertIteratorsEqual(expected, purged);
+    }
+
+    @Test
+    public void testSecondHalfIsPurgeableASC()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 1L, 1, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 1L, 1, 0L, 0, "b")
+        , bound(Kind.INCL_END_BOUND, 0L, 0, "c")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 1);
+
+        UnfilteredPartitionIterator expected = iter(false
+        , bound(Kind.INCL_START_BOUND, 1L, 1, "a")
+        , bound(Kind.EXCL_END_BOUND, 1L, 1, "b")
+        );
+        assertIteratorsEqual(expected, purged);
+    }
+
+    @Test
+    public void testSecondHalfIsPurgeableDESC()
+    {
+        UnfilteredPartitionIterator original = iter(true
+        , bound(Kind.INCL_END_BOUND, 0L, 0, "c")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 1L, 1, 0L, 0, "b")
+        , bound(Kind.INCL_START_BOUND, 1L, 1, "a")
+        );
+        UnfilteredPartitionIterator purged = withoutPurgeableTombstones(original, 1);
+
+        UnfilteredPartitionIterator expected = iter(true
+        , bound(Kind.EXCL_END_BOUND, 1L, 1, "b")
+        , bound(Kind.INCL_START_BOUND, 1L, 1, "a")
+        );
+        assertIteratorsEqual(expected, purged);
+    }
+
+    private UnfilteredPartitionIterator iter(boolean isReversedOrder, Unfiltered... unfiltereds)
+    {
+        Iterator<Unfiltered> iterator = Iterators.forArray(unfiltereds);
+
+        UnfilteredRowIterator rowIter =
+            new AbstractUnfilteredRowIterator(metadata,
+                                              key,
+                                              DeletionTime.LIVE,
+                                              metadata.partitionColumns(),
+                                              Rows.EMPTY_STATIC_ROW,
+                                              isReversedOrder,
+                                              EncodingStats.NO_STATS)
+        {
+            protected Unfiltered computeNext()
+            {
+                return iterator.hasNext() ? iterator.next() : endOfData();
+            }
+        };
+
+        return new SingletonUnfilteredPartitionIterator(rowIter, false);
+    }
+
+    private RangeTombstoneBoundMarker bound(ClusteringPrefix.Kind kind,
+                                            long timestamp,
+                                            int localDeletionTime,
+                                            Object clusteringValue)
+    {
+        ByteBuffer[] clusteringByteBuffers =
+            new ByteBuffer[] { decompose(metadata.clusteringColumns().get(0).type, clusteringValue) };
+
+        return new RangeTombstoneBoundMarker(new RangeTombstone.Bound(kind, clusteringByteBuffers),
+                                             new DeletionTime(timestamp, localDeletionTime));
+    }
+
+    private RangeTombstoneBoundaryMarker boundary(ClusteringPrefix.Kind kind,
+                                                  long closeTimestamp,
+                                                  int closeLocalDeletionTime,
+                                                  long openTimestamp,
+                                                  int openDeletionTime,
+                                                  Object clusteringValue)
+    {
+        ByteBuffer[] clusteringByteBuffers =
+            new ByteBuffer[] { decompose(metadata.clusteringColumns().get(0).type, clusteringValue) };
+
+        return new RangeTombstoneBoundaryMarker(new RangeTombstone.Bound(kind, clusteringByteBuffers),
+                                                new DeletionTime(closeTimestamp, closeLocalDeletionTime),
+                                                new DeletionTime(openTimestamp, openDeletionTime));
+    }
+
+    @SuppressWarnings("unchecked")
+    private static <T> ByteBuffer decompose(AbstractType<?> type, T value)
+    {
+        return ((AbstractType<T>) type).decompose(value);
+    }
+
+    private void assertIteratorsEqual(UnfilteredPartitionIterator iter1, UnfilteredPartitionIterator iter2)
+    {
+        while (iter1.hasNext())
+        {
+            assertTrue(iter2.hasNext());
+
+            try (UnfilteredRowIterator partition1 = iter1.next())
+            {
+                try (UnfilteredRowIterator partition2 = iter2.next())
+                {
+                    assertIteratorsEqual(partition1, partition2);
+                }
+            }
+        }
+
+        assertTrue(!iter2.hasNext());
+    }
+
+    private void assertIteratorsEqual(UnfilteredRowIterator iter1, UnfilteredRowIterator iter2)
+    {
+        while (iter1.hasNext())
+        {
+            assertTrue(iter2.hasNext());
+
+            assertEquals(iter1.next(), iter2.next());
+        }
+        assertTrue(!iter2.hasNext());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/AbstractTypeVersionComparatorTest.java b/test/unit/org/apache/cassandra/db/rows/AbstractTypeVersionComparatorTest.java
new file mode 100644
index 0000000..ad0c05c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/AbstractTypeVersionComparatorTest.java

@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.Set;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.*;
+
+import static java.util.Arrays.asList;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public class AbstractTypeVersionComparatorTest
+{
+    private UserType udtWith2Fields;
+    private UserType udtWith3Fields;
+
+    @Before
+    public void setUp()
+    {
+        udtWith2Fields = new UserType("ks",
+                                      bytes("myType"),
+                                      asList(bytes("a"), bytes("b")),
+                                      asList(Int32Type.instance, Int32Type.instance));
+        udtWith3Fields = new UserType("ks",
+                                      bytes("myType"),
+                                      asList(bytes("a"), bytes("b"), bytes("c")),
+                                      asList(Int32Type.instance, Int32Type.instance, Int32Type.instance));
+    }
+
+    @After
+    public void tearDown()
+    {
+        udtWith2Fields = null;
+        udtWith3Fields = null;
+    }
+
+    @Test
+    public void testWithTuples()
+    {
+        checkComparisonResults(new TupleType(asList(Int32Type.instance, Int32Type.instance)),
+                               new TupleType(asList(Int32Type.instance, Int32Type.instance, Int32Type.instance)));
+    }
+
+    @Test
+    public void testWithUDTs()
+    {
+        checkComparisonResults(udtWith2Fields, udtWith3Fields);
+    }
+
+    @Test
+    public void testWithUDTsNestedWithinSet()
+    {
+        for (boolean isMultiCell : new boolean[]{false, true})
+        {
+            SetType<ByteBuffer> set1 = SetType.getInstance(udtWith2Fields, isMultiCell);
+            SetType<ByteBuffer> set2 = SetType.getInstance(udtWith3Fields, isMultiCell);
+            checkComparisonResults(set1, set2);
+        }
+    }
+
+    @Test
+    public void testWithUDTsNestedWithinList()
+    {
+        for (boolean isMultiCell : new boolean[]{false, true})
+        {
+            ListType<ByteBuffer> list1 = ListType.getInstance(udtWith2Fields, isMultiCell);
+            ListType<ByteBuffer> list2 = ListType.getInstance(udtWith3Fields, isMultiCell);
+            checkComparisonResults(list1, list2);
+        }
+    }
+
+    @Test
+    public void testWithUDTsNestedWithinMap()
+    {
+        for (boolean isMultiCell : new boolean[]{false, true})
+        {
+            MapType<ByteBuffer, Integer> map1 = MapType.getInstance(udtWith2Fields, Int32Type.instance, isMultiCell);
+            MapType<ByteBuffer, Integer> map2 = MapType.getInstance(udtWith3Fields, Int32Type.instance, isMultiCell);
+            checkComparisonResults(map1, map2);
+        }
+
+        for (boolean isMultiCell : new boolean[]{false, true})
+        {
+            MapType<Integer, ByteBuffer> map1 = MapType.getInstance(Int32Type.instance, udtWith2Fields, isMultiCell);
+            MapType<Integer, ByteBuffer> map2 = MapType.getInstance(Int32Type.instance, udtWith3Fields, isMultiCell);
+            checkComparisonResults(map1, map2);
+        }
+    }
+
+    @Test
+    public void testWithUDTsNestedWithinTuple()
+    {
+        TupleType tuple1 = new TupleType(asList(udtWith2Fields, Int32Type.instance));
+        TupleType tuple2 = new TupleType(asList(udtWith3Fields, Int32Type.instance));
+        checkComparisonResults(tuple1, tuple2);
+    }
+
+    @Test
+    public void testWithUDTsNestedWithinComposite()
+    {
+        CompositeType composite1 = CompositeType.getInstance(asList(udtWith2Fields, Int32Type.instance));
+        CompositeType composite2 = CompositeType.getInstance(asList(udtWith3Fields, Int32Type.instance));
+        checkComparisonResults(composite1, composite2);
+    }
+
+    @Test
+    public void testWithDeeplyNestedUDT()
+    {
+        for (boolean isMultiCell : new boolean[]{false, true})
+        {
+            ListType<Set<ByteBuffer>> list1 = ListType.getInstance(SetType.getInstance(new TupleType(asList(udtWith2Fields, Int32Type.instance)), isMultiCell), isMultiCell);
+            ListType<Set<ByteBuffer>> list2 = ListType.getInstance(SetType.getInstance(new TupleType(asList(udtWith3Fields, Int32Type.instance)), isMultiCell), isMultiCell);
+            checkComparisonResults(list1, list2);
+        }
+    }
+
+    @Test
+    public void testInvalidComparison()
+    {
+        assertInvalidComparison("Trying to compare 2 different types: org.apache.cassandra.db.marshal.UserType(ks,6d7954797065,61:org.apache.cassandra.db.marshal.Int32Type,62:org.apache.cassandra.db.marshal.Int32Type) and org.apache.cassandra.db.marshal.Int32Type",
+                                udtWith2Fields,
+                                Int32Type.instance);
+        assertInvalidComparison("Trying to compare 2 different types: org.apache.cassandra.db.marshal.UTF8Type and org.apache.cassandra.db.marshal.InetAddressType",
+                                SetType.getInstance(UTF8Type.instance, true),
+                                SetType.getInstance(InetAddressType.instance, true));
+        assertInvalidComparison("Trying to compare 2 different types: org.apache.cassandra.db.marshal.UTF8Type and org.apache.cassandra.db.marshal.InetAddressType",
+                                ListType.getInstance(UTF8Type.instance, true),
+                                ListType.getInstance(InetAddressType.instance, true));
+        assertInvalidComparison("Trying to compare 2 different types: org.apache.cassandra.db.marshal.UTF8Type and org.apache.cassandra.db.marshal.InetAddressType",
+                                MapType.getInstance(UTF8Type.instance, IntegerType.instance, true),
+                                MapType.getInstance(InetAddressType.instance, IntegerType.instance, true));
+        assertInvalidComparison("Trying to compare 2 different types: org.apache.cassandra.db.marshal.UTF8Type and org.apache.cassandra.db.marshal.InetAddressType",
+                                MapType.getInstance(IntegerType.instance, UTF8Type.instance, true),
+                                MapType.getInstance(IntegerType.instance, InetAddressType.instance, true));
+    }
+
+    private void assertInvalidComparison(String expectedMessage, AbstractType<?> oldVersion, AbstractType<?> newVersion)
+    {
+        try
+        {
+            checkComparisonResults(oldVersion, newVersion);
+            fail("comparison doesn't throw expected IllegalArgumentException: " + expectedMessage);
+        }
+        catch (IllegalArgumentException e)
+        {
+            assertEquals(e.getMessage(), expectedMessage);
+        }
+    }
+
+    private void checkComparisonResults(AbstractType<?> oldVersion, AbstractType<?> newVersion)
+    {
+        assertEquals(0, compare(oldVersion, oldVersion));
+        assertEquals(0, compare(newVersion, newVersion));
+        assertEquals(-1, compare(oldVersion, newVersion));
+        assertEquals(1, compare(newVersion, oldVersion));
+    }
+
+    private int compare(AbstractType<?> left, AbstractType<?> right)
+    {
+        return AbstractTypeVersionComparator.INSTANCE.compare(left, right);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/DigestBackwardCompatibilityTest.java b/test/unit/org/apache/cassandra/db/rows/DigestBackwardCompatibilityTest.java
new file mode 100644
index 0000000..c8f5cb1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/DigestBackwardCompatibilityTest.java

@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.security.MessageDigest;
+
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.CounterId;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test that digest for pre-3.0 versions are properly computed (they match the value computed on pre-3.0 nodes).
+ *
+ * The concreted 'hard-coded' digests this file tests against have been generated on a 2.2 node using basically
+ * the same test file but with 2 modifications:
+ *   1. readAndDigest is modified to work on 2.2 (the actual modification is in the method as a comment)
+ *   2. the assertions are replace by simple println() of the generated digest.
+ *
+ * Note that we only compare against 2.2 since digests should be fixed between version before 3.0 (this would be a bug
+ * of previous version otherwise).
+ */
+public class DigestBackwardCompatibilityTest extends CQLTester
+{
+    private ByteBuffer readAndDigest(String partitionKey)
+    {
+        /*
+         * In 2.2, this must be replaced by:
+         *   ColumnFamily partition = getCurrentColumnFamilyStore().getColumnFamily(QueryFilter.getIdentityFilter(Util.dk(partitionKey), currentTable(), System.currentTimeMillis()));
+         *   return ColumnFamily.digest(partition);
+         */
+
+        ReadCommand cmd = Util.cmd(getCurrentColumnFamilyStore(), partitionKey).build();
+        ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(cmd);
+        MessageDigest digest = FBUtilities.threadLocalMD5Digest();
+        UnfilteredRowIterators.digest(cmd, partition.unfilteredIterator(), digest, MessagingService.VERSION_22);
+        return ByteBuffer.wrap(digest.digest());
+    }
+
+    private void assertDigest(String expected, ByteBuffer actual)
+    {
+        String toTest = ByteBufferUtil.bytesToHex(actual);
+        assertEquals(String.format("[digest from 2.2] %s != %s [digest from 3.0]", expected, toTest), expected, toTest);
+    }
+
+    @Test
+    public void testCQLTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v1 text, v2 int, PRIMARY KEY (k, t))");
+
+        String key = "someKey";
+        int N = 10;
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s(k, t, v1, v2) VALUES (?, ?, ?, ?) USING TIMESTAMP ? AND TTL ?", key, i, "v" + i, i, 1L, 200);
+
+        // ColumnFamily(table_0 [0::false:0@1!200,0:v1:false:2@1!200,0:v2:false:4@1!200,1::false:0@1!200,1:v1:false:2@1!200,1:v2:false:4@1!200,2::false:0@1!200,2:v1:false:2@1!200,2:v2:false:4@1!200,3::false:0@1!200,3:v1:false:2@1!200,3:v2:false:4@1!200,4::false:0@1!200,4:v1:false:2@1!200,4:v2:false:4@1!200,5::false:0@1!200,5:v1:false:2@1!200,5:v2:false:4@1!200,6::false:0@1!200,6:v1:false:2@1!200,6:v2:false:4@1!200,7::false:0@1!200,7:v1:false:2@1!200,7:v2:false:4@1!200,8::false:0@1!200,8:v1:false:2@1!200,8:v2:false:4@1!200,9::false:0@1!200,9:v1:false:2@1!200,9:v2:false:4@1!200,])
+        assertDigest("aa608035cf6574a97061b5c166b64939", readAndDigest(key));
+
+        // This is a cell deletion
+        execute("DELETE v1 FROM %s USING TIMESTAMP ? WHERE k = ? AND t = ?", 2L, key, 2);
+
+        // This is a range tombstone
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE k = ? AND t = ?", 3L, key, 4);
+
+        // This is a partition level deletion (but we use an older tombstone so it doesn't get rid of everything and keeps the test interesting)
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE k = ?", 0L, key);
+
+        // ColumnFamily(table_0 -{deletedAt=0, localDeletion=1441012270, ranges=[4:_-4:!, deletedAt=3, localDeletion=1441012270]}- [0::false:0@1!200,0:v1:false:2@1!200,0:v2:false:4@1!200,1::false:0@1!200,1:v1:false:2@1!200,1:v2:false:4@1!200,2::false:0@1!200,2:v1:true:4@2,2:v2:false:4@1!200,3::false:0@1!200,3:v1:false:2@1!200,3:v2:false:4@1!200,5::false:0@1!200,5:v1:false:2@1!200,5:v2:false:4@1!200,6::false:0@1!200,6:v1:false:2@1!200,6:v2:false:4@1!200,7::false:0@1!200,7:v1:false:2@1!200,7:v2:false:4@1!200,8::false:0@1!200,8:v1:false:2@1!200,8:v2:false:4@1!200,9::false:0@1!200,9:v1:false:2@1!200,9:v2:false:4@1!200,])
+        assertDigest("b5f38d2dc7b917d221f98ab1641f82bf", readAndDigest(key));
+    }
+
+    @Test
+    public void testCompactTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, t int, v text, PRIMARY KEY (k, t)) WITH COMPACT STORAGE");
+
+        String key = "someKey";
+        int N = 10;
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s(k, t, v) VALUES (?, ?, ?) USING TIMESTAMP ? AND TTL ?", key, i, "v" + i, 1L, 200);
+
+        assertDigest("44785ddd7c62c73287b448b6063645e5", readAndDigest(key));
+
+        // This is a cell deletion
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE k = ? AND t = ?", 2L, key, 2);
+
+        // This is a partition level deletion (but we use an older tombstone so it doesn't get rid of everything and keeps the test interesting)
+        execute("DELETE FROM %s USING TIMESTAMP ? WHERE k = ?", 0L, key);
+
+        assertDigest("55d9bd6335276395d83b18eb17f9abe7", readAndDigest(key));
+    }
+
+    @Test
+    public void testStaticCompactTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text PRIMARY KEY, v1 text, v2 int) WITH COMPACT STORAGE");
+
+        String key = "someKey";
+        execute("INSERT INTO %s(k, v1, v2) VALUES (?, ?, ?) USING TIMESTAMP ?", key, "v", 0, 1L);
+
+        assertDigest("d2080f9f57d6edf92da1fdaaa76573d3", readAndDigest(key));
+    }
+
+    @Test
+    public void testTableWithCollection() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text PRIMARY KEY, m map<text, text>)");
+
+        String key = "someKey";
+
+        execute("INSERT INTO %s(k, m) VALUES (?, { 'foo' : 'value1', 'bar' : 'value2' }) USING TIMESTAMP ?", key, 1L);
+
+        // ColumnFamily(table_2 -{deletedAt=-9223372036854775808, localDeletion=2147483647, ranges=[m:_-m:!, deletedAt=0, localDeletion=1441012271]}- [:false:0@1,m:626172:false:6@1,m:666f6f:false:6@1,])
+        assertDigest("708f3fc8bc8149cc3513eef300bf0182", readAndDigest(key));
+
+        // This is a collection range tombstone
+        execute("DELETE m FROM %s USING TIMESTAMP ? WHERE k = ?", 2L, key);
+
+        // ColumnFamily(table_2 -{deletedAt=-9223372036854775808, localDeletion=2147483647, ranges=[m:_-m:!, deletedAt=2, localDeletion=1441012271]}- [:false:0@1,])
+        assertDigest("f39937fc3ed96956ef507e81717fa5cd", readAndDigest(key));
+    }
+
+    @Test
+    public void testCounterTable() throws Throwable
+    {
+        /*
+         * We can't use CQL to insert counters as both the timestamp and counter ID are automatically assigned and unpredictable.
+         * So we need to built it ourselves in a way that is totally equivalent between 2.2 and 3.0 which makes the test a little
+         * bit less readable. In any case, the code to generate the equivalent mutation on 2.2 is:
+         * ColumnFamily cf = ArrayBackedSortedColumns.factory.create(getCurrentColumnFamilyStore().metadata);
+         * ByteBuffer value = CounterContext.instance().createGlobal(CounterId.fromInt(1), 1L, 42L);
+         * cf.addColumn(new BufferCounterCell(CellNames.simpleSparse(new ColumnIdentifier("c", true)) , value, 0L, Long.MIN_VALUE));
+         * new Mutation(KEYSPACE, ByteBufferUtil.bytes(key), cf).applyUnsafe();
+         *
+         * Also note that we use COMPACT STORAGE only because it has no bearing on the test and was slightly easier in 2.2 to create
+         * the mutation.
+         */
+
+        createTable("CREATE TABLE %s (k text PRIMARY KEY, c counter) WITH COMPACT STORAGE");
+
+        String key = "someKey";
+
+        CFMetaData metadata = getCurrentColumnFamilyStore().metadata;
+        ColumnDefinition column = metadata.getColumnDefinition(ByteBufferUtil.bytes("c"));
+        ByteBuffer value = CounterContext.instance().createGlobal(CounterId.fromInt(1), 1L, 42L);
+        Row row = BTreeRow.singleCellRow(Clustering.STATIC_CLUSTERING, BufferCell.live(metadata, column, 0L, value));
+
+        new Mutation(PartitionUpdate.singleRowUpdate(metadata, Util.dk(key), row)).applyUnsafe();
+
+        assertDigest("3a5f7b48c320538b4cd2f829e05c6db3", readAndDigest(key));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/RowAndDeletionMergeIteratorTest.java b/test/unit/org/apache/cassandra/db/rows/RowAndDeletionMergeIteratorTest.java
new file mode 100644
index 0000000..dd88704
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/RowAndDeletionMergeIteratorTest.java

@@ -0,0 +1,481 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.Iterator;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.db.Slice.Bound;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+import static org.junit.Assert.*;
+
+public class RowAndDeletionMergeIteratorTest
+{
+    private static final String KEYSPACE1 = "RowTest";
+    private static final String CF_STANDARD1 = "Standard1";
+
+    private int nowInSeconds;
+    private DecoratedKey dk;
+    private ColumnFamilyStore cfs;
+    private CFMetaData cfm;
+    private ColumnDefinition defA;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        CFMetaData cfMetadata = CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD1)
+                                                  .addPartitionKey("key", AsciiType.instance)
+                                                  .addClusteringColumn("col1", Int32Type.instance)
+                                                  .addRegularColumn("a", Int32Type.instance)
+                                                  .build();
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    cfMetadata);
+
+    }
+
+    @Before
+    public void setup()
+    {
+        nowInSeconds = FBUtilities.nowInSeconds();
+        dk = Util.dk("key0");
+        cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        cfm = cfs.metadata;
+        defA = cfm.getColumnDefinition(new ColumnIdentifier("a", true));
+    }
+
+    @Test
+    public void testWithNoRangeTombstones()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, Collections.emptyIterator(), false);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 0);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 1);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 2);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 3);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 4);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testWithOnlyRangeTombstones()
+    {
+        int delTime = nowInSeconds + 1;
+        long timestamp = toMillis(delTime);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(rt(1, false, 3, false, timestamp, delTime),
+                                                                                       atLeast(4, timestamp, delTime));
+        UnfilteredRowIterator iterator = createMergeIterator(Collections.emptyIterator(), rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.EXCL_START_BOUND, 1);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.EXCL_END_BOUND, 3);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.INCL_START_BOUND, 4);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.TOP);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testWithAtMostRangeTombstone()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        int delTime = nowInSeconds + 1;
+        long timestamp = toMillis(delTime);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(atMost(0, timestamp, delTime));
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.BOTTOM);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.INCL_END_BOUND, 0);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 1);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 2);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 3);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 4);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testWithGreaterThanRangeTombstone()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        int delTime = nowInSeconds + 1;
+        long timestamp = toMillis(delTime);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(greaterThan(2, timestamp, delTime));
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 0);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 1);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 2);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.EXCL_START_BOUND, 2);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.TOP);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testWithAtMostAndGreaterThanRangeTombstone()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        int delTime = nowInSeconds + 1;
+        long timestamp = toMillis(delTime);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(atMost(0, timestamp, delTime),
+                                                                                       greaterThan(2, timestamp, delTime));
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.BOTTOM);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.INCL_END_BOUND, 0);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 1);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 2);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.EXCL_START_BOUND, 2);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.TOP);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    private void assertRtMarker(Unfiltered unfiltered, ClusteringPrefix.Kind kind, int col1)
+    {
+        assertEquals(Unfiltered.Kind.RANGE_TOMBSTONE_MARKER, unfiltered.kind());
+        assertEquals(kind, unfiltered.clustering().kind());
+        assertEquals(bb(col1), unfiltered.clustering().get(0));
+    }
+
+    @Test
+    public void testWithIncludingEndExcludingStartMarker()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        int delTime1 = nowInSeconds + 1;
+        long timestamp1 = toMillis(delTime1);
+        int delTime2 = delTime1 + 1;
+        long timestamp2 = toMillis(delTime2);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(atMost(2, timestamp1, delTime1),
+                                                                                       greaterThan(2, timestamp2, delTime2));
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.BOTTOM);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.INCL_END_EXCL_START_BOUNDARY, 2);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.TOP);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testWithExcludingEndIncludingStartMarker()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        int delTime1 = nowInSeconds + 1;
+        long timestamp1 = toMillis(delTime1);
+        int delTime2 = delTime1 + 1;
+        long timestamp2 = toMillis(delTime2);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(lessThan(2, timestamp1, delTime1),
+                                                                                       atLeast(2, timestamp2, delTime2));
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.BOTTOM);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.EXCL_END_INCL_START_BOUNDARY, 2);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.TOP);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testNonShadowingTombstone()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(atMost(0, -1L, 0));
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator, rangeTombstoneIterator, false);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), Bound.BOTTOM);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 0);
+
+        assertTrue(iterator.hasNext());
+        assertRtMarker(iterator.next(), ClusteringPrefix.Kind.INCL_END_BOUND, 0);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 1);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 2);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 3);
+
+        assertTrue(iterator.hasNext());
+        assertRow(iterator.next(), 4);
+
+        assertFalse(iterator.hasNext());
+    }
+
+    @Test
+    public void testWithPartitionLevelTombstone()
+    {
+        Iterator<Row> rowIterator = createRowIterator();
+
+        int delTime = nowInSeconds - 1;
+        long timestamp = toMillis(delTime);
+
+        Iterator<RangeTombstone> rangeTombstoneIterator = createRangeTombstoneIterator(atMost(0, timestamp, delTime),
+                                                                                       greaterThan(2, timestamp, delTime));
+
+        int partitionDelTime = nowInSeconds + 1;
+        long partitionTimestamp = toMillis(partitionDelTime);
+
+        UnfilteredRowIterator iterator = createMergeIterator(rowIterator,
+                                                             rangeTombstoneIterator,
+                                                             new DeletionTime(partitionTimestamp, partitionDelTime),
+                                                             false);
+
+        assertFalse(iterator.hasNext());
+    }
+
+
+    /**
+     * RTL doesn't correctly merge range tombstones in some situations (see CASSANDRA-14894)
+     */
+    @Test
+    public void testWithNoopBoundaryMarkers()
+    {
+        PartitionUpdate update = new PartitionUpdate(cfm, dk, cfm.partitionColumns(), 1);
+        RangeTombstoneList rtl = new RangeTombstoneList(cfm.comparator, 10);
+        rtl.add(rt(1, 2, 5, 5));
+        rtl.add(rt(3, 4, 5, 5));
+        rtl.add(rt(5, 6, 5, 5));
+        rtl.add(rt(0, 8, 6, 6)); // <- supersedes all other tombstones
+
+        Assert.assertEquals(3, rtl.size());
+
+        try (UnfilteredRowIterator partition = createMergeIterator(update.iterator(), rtl.iterator(), false))
+        {
+            assertRtMarker(partition.next(), ClusteringPrefix.Kind.INCL_START_BOUND, 0);
+            assertRtMarker(partition.next(), ClusteringPrefix.Kind.INCL_END_BOUND, 8);
+            assertFalse(partition.hasNext());
+        }
+    }
+
+    private void assertRtMarker(Unfiltered unfiltered, Bound bound)
+    {
+        assertEquals(Unfiltered.Kind.RANGE_TOMBSTONE_MARKER, unfiltered.kind());
+        assertEquals(bound, unfiltered.clustering());
+    }
+
+    private void assertRow(Unfiltered unfiltered, int col1)
+    {
+        assertEquals(Unfiltered.Kind.ROW, unfiltered.kind());
+        assertEquals(cfm.comparator.make(col1), unfiltered.clustering());
+    }
+
+    private Iterator<RangeTombstone> createRangeTombstoneIterator(RangeTombstone... tombstones)
+    {
+        RangeTombstoneList list = new RangeTombstoneList(cfm.comparator, 10);
+
+        for (RangeTombstone tombstone : tombstones)
+            list.add(tombstone);
+
+        return list.iterator(Slice.ALL, false);
+    }
+
+    private Iterator<Row> createRowIterator()
+    {
+        PartitionUpdate update = new PartitionUpdate(cfm, dk, cfm.partitionColumns(), 1);
+        for (int i = 0; i < 5; i++)
+            addRow(update, i, i);
+
+        return update.iterator();
+    }
+
+    private UnfilteredRowIterator createMergeIterator(Iterator<Row> rows, Iterator<RangeTombstone> tombstones, boolean reversed)
+    {
+        return createMergeIterator(rows, tombstones, DeletionTime.LIVE, reversed);
+    }
+
+    private UnfilteredRowIterator createMergeIterator(Iterator<Row> rows,
+                                                      Iterator<RangeTombstone> tombstones,
+                                                      DeletionTime deletionTime,
+                                                      boolean reversed)
+    {
+        return new RowAndDeletionMergeIterator(cfm,
+                                               Util.dk("k"),
+                                               deletionTime,
+                                               ColumnFilter.all(cfm),
+                                               Rows.EMPTY_STATIC_ROW,
+                                               reversed,
+                                               EncodingStats.NO_STATS,
+                                               rows,
+                                               tombstones,
+                                               true);
+    }
+
+    private void addRow(PartitionUpdate update, int col1, int a)
+    {
+        update.add(BTreeRow.singleCellRow(update.metadata().comparator.make(col1), makeCell(cfm, defA, a, 0)));
+    }
+
+    private Cell makeCell(CFMetaData cfm, ColumnDefinition columnDefinition, int value, long timestamp)
+    {
+        return BufferCell.live(cfm, columnDefinition, timestamp, ((AbstractType)columnDefinition.cellValueType()).decompose(value));
+    }
+
+    private static RangeTombstone atLeast(int start, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.inclusiveStartOf(bb(start)), Slice.Bound.TOP), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone atMost(int end, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.BOTTOM, Slice.Bound.inclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone lessThan(int end, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.BOTTOM, Slice.Bound.exclusiveEndOf(bb(end))), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone greaterThan(int start, long tstamp, int delTime)
+    {
+        return new RangeTombstone(Slice.make(Slice.Bound.exclusiveStartOf(bb(start)), Slice.Bound.TOP), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone rt(int start, boolean startInclusive, int end, boolean endInclusive, long tstamp, int delTime)
+    {
+        Slice.Bound startBound = startInclusive ? Slice.Bound.inclusiveStartOf(bb(start)) : Slice.Bound.exclusiveStartOf(bb(start));
+        Slice.Bound endBound = endInclusive ? Slice.Bound.inclusiveEndOf(bb(end)) : Slice.Bound.exclusiveEndOf(bb(end));
+
+        return new RangeTombstone(Slice.make(startBound, endBound), new DeletionTime(tstamp, delTime));
+    }
+
+    private static RangeTombstone rt(int start, int end, long tstamp, int delTime)
+    {
+        return rt(start, true, end, true, tstamp, delTime);
+    }
+
+    private static ByteBuffer bb(int i)
+    {
+        return ByteBufferUtil.bytes(i);
+    }
+
+    private long toMillis(int timeInSeconds)
+    {
+        return timeInSeconds * 1000L;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/RowBuilder.java b/test/unit/org/apache/cassandra/db/rows/RowBuilder.java
new file mode 100644
index 0000000..ede2ccd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/RowBuilder.java

@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.rows;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.rows.Row.Builder;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * Instrumented Builder implementation for testing the
+ * behavior of Cells and Rows static methods
+ */
+public class RowBuilder implements Row.Builder
+{
+    public List<Cell> cells = new LinkedList<>();
+    public Clustering clustering = null;
+    public LivenessInfo livenessInfo = null;
+    public Row.Deletion deletionTime = null;
+    public List<Pair<ColumnDefinition, DeletionTime>> complexDeletions = new LinkedList<>();
+
+    @Override
+    public Builder copy()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void addCell(Cell cell)
+    {
+        cells.add(cell);
+    }
+
+    public boolean isSorted()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    public void newRow(Clustering clustering)
+    {
+        assert this.clustering == null;
+        this.clustering = clustering;
+    }
+
+    public Clustering clustering()
+    {
+        return clustering;
+    }
+
+    public void addPrimaryKeyLivenessInfo(LivenessInfo info)
+    {
+        assert livenessInfo == null;
+        livenessInfo = info;
+    }
+
+    public void addRowDeletion(Row.Deletion deletion)
+    {
+        assert deletionTime == null;
+        deletionTime = deletion;
+    }
+
+    public void addComplexDeletion(ColumnDefinition column, DeletionTime complexDeletion)
+    {
+        complexDeletions.add(Pair.create(column, complexDeletion));
+    }
+
+    public Row build()
+    {
+        throw new UnsupportedOperationException();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/RowsTest.java b/test/unit/org/apache/cassandra/db/rows/RowsTest.java
new file mode 100644
index 0000000..8683808
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/RowsTest.java

@@ -0,0 +1,677 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.rows;
+
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.partitions.PartitionStatisticsCollector;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+public class RowsTest
+{
+    private static final String KEYSPACE = "rows_test";
+    private static final String KCVM_TABLE = "kcvm";
+    private static final CFMetaData kcvm;
+    private static final ColumnDefinition v;
+    private static final ColumnDefinition m;
+    private static final Clustering c1;
+
+    static
+    {
+        kcvm = CFMetaData.Builder.create(KEYSPACE, KCVM_TABLE)
+                                 .addPartitionKey("k", IntegerType.instance)
+                                 .addClusteringColumn("c", IntegerType.instance)
+                                 .addRegularColumn("v", IntegerType.instance)
+                                 .addRegularColumn("m", MapType.getInstance(IntegerType.instance, IntegerType.instance, true))
+                                 .build();
+
+        v = kcvm.getColumnDefinition(new ColumnIdentifier("v", false));
+        m = kcvm.getColumnDefinition(new ColumnIdentifier("m", false));
+        c1 = kcvm.comparator.make(BigInteger.valueOf(1));
+    }
+
+    private static final ByteBuffer BB1 = ByteBufferUtil.bytes(1);
+    private static final ByteBuffer BB2 = ByteBufferUtil.bytes(2);
+    private static final ByteBuffer BB3 = ByteBufferUtil.bytes(3);
+    private static final ByteBuffer BB4 = ByteBufferUtil.bytes(4);
+
+    private static class MergedPair<T>
+    {
+        public final int idx;
+        public final T merged;
+        public final T original;
+
+        private MergedPair(int idx, T merged, T original)
+        {
+            this.idx = idx;
+            this.merged = merged;
+            this.original = original;
+        }
+
+        static <T> MergedPair<T> create(int i, T m, T o)
+        {
+            return new MergedPair<>(i, m, o);
+        }
+
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+
+            MergedPair<?> that = (MergedPair<?>) o;
+
+            if (idx != that.idx) return false;
+            if (merged != null ? !merged.equals(that.merged) : that.merged != null) return false;
+            return !(original != null ? !original.equals(that.original) : that.original != null);
+        }
+
+        public int hashCode()
+        {
+            int result = idx;
+            result = 31 * result + (merged != null ? merged.hashCode() : 0);
+            result = 31 * result + (original != null ? original.hashCode() : 0);
+            return result;
+        }
+
+        public String toString()
+        {
+            return "MergedPair{" +
+                   "idx=" + idx +
+                   ", merged=" + merged +
+                   ", original=" + original +
+                   '}';
+        }
+    }
+
+    private static class DiffListener implements RowDiffListener
+    {
+        int updates = 0;
+        Clustering clustering = null;
+
+        private void updateClustering(Clustering c)
+        {
+            assert clustering == null || clustering == c;
+            clustering = c;
+        }
+
+        List<MergedPair<Cell>> cells = new LinkedList<>();
+        public void onCell(int i, Clustering clustering, Cell merged, Cell original)
+        {
+            updateClustering(clustering);
+            cells.add(MergedPair.create(i, merged, original));
+            updates++;
+        }
+
+        List<MergedPair<LivenessInfo>> liveness = new LinkedList<>();
+        public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original)
+        {
+            updateClustering(clustering);
+            liveness.add(MergedPair.create(i, merged, original));
+            updates++;
+        }
+
+        List<MergedPair<Row.Deletion>> deletions = new LinkedList<>();
+        public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original)
+        {
+            updateClustering(clustering);
+            deletions.add(MergedPair.create(i, merged, original));
+            updates++;
+        }
+
+        Map<ColumnDefinition, List<MergedPair<DeletionTime>>> complexDeletions = new HashMap<>();
+        public void onComplexDeletion(int i, Clustering clustering, ColumnDefinition column, DeletionTime merged, DeletionTime original)
+        {
+            updateClustering(clustering);
+            if (!complexDeletions.containsKey(column)) complexDeletions.put(column, new LinkedList<>());
+            complexDeletions.get(column).add(MergedPair.create(i, merged, original));
+            updates++;
+        }
+    }
+
+    public static class StatsCollector implements PartitionStatisticsCollector
+    {
+        List<Cell> cells = new LinkedList<>();
+        public void update(Cell cell)
+        {
+            cells.add(cell);
+        }
+
+        List<LivenessInfo> liveness = new LinkedList<>();
+        public void update(LivenessInfo info)
+        {
+            liveness.add(info);
+        }
+
+        List<DeletionTime> deletions = new LinkedList<>();
+        public void update(DeletionTime deletion)
+        {
+            deletions.add(deletion);
+        }
+
+        long columnCount = -1;
+        public void updateColumnSetPerRow(long columnSetInRow)
+        {
+            assert columnCount < 0;
+            this.columnCount = columnSetInRow;
+        }
+
+        boolean hasLegacyCounterShards = false;
+        public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
+        {
+            this.hasLegacyCounterShards |= hasLegacyCounterShards;
+        }
+    }
+
+    private static long secondToTs(int now)
+    {
+        return now * 1000000L;
+    }
+
+    private static Row.Builder createBuilder(Clustering c, int now, ByteBuffer vVal, ByteBuffer mKey, ByteBuffer mVal)
+    {
+        long ts = secondToTs(now);
+        Row.Builder builder = BTreeRow.unsortedBuilder(now);
+        builder.newRow(c);
+        builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(kcvm, ts, now));
+        if (vVal != null)
+        {
+            builder.addCell(BufferCell.live(kcvm, v, ts, vVal));
+        }
+        if (mKey != null && mVal != null)
+        {
+            builder.addComplexDeletion(m, new DeletionTime(ts - 1, now));
+            builder.addCell(BufferCell.live(kcvm, m, ts, mVal, CellPath.create(mKey)));
+        }
+
+        return builder;
+    }
+
+    @Test
+    public void copy()
+    {
+        int now = FBUtilities.nowInSeconds();
+        long ts = secondToTs(now);
+        Row.Builder originalBuilder = BTreeRow.unsortedBuilder(now);
+        originalBuilder.newRow(c1);
+        LivenessInfo liveness = LivenessInfo.create(kcvm, ts, now);
+        originalBuilder.addPrimaryKeyLivenessInfo(liveness);
+        DeletionTime complexDeletion = new DeletionTime(ts-1, now);
+        originalBuilder.addComplexDeletion(m, complexDeletion);
+        List<Cell> expectedCells = Lists.newArrayList(BufferCell.live(kcvm, v, secondToTs(now), BB1),
+                                                      BufferCell.live(kcvm, m, secondToTs(now), BB1, CellPath.create(BB1)),
+                                                      BufferCell.live(kcvm, m, secondToTs(now), BB2, CellPath.create(BB2)));
+        expectedCells.forEach(originalBuilder::addCell);
+        // We need to use ts-1 so the deletion doesn't shadow what we've created
+        Row.Deletion rowDeletion = new Row.Deletion(new DeletionTime(ts-1, now), false);
+        originalBuilder.addRowDeletion(rowDeletion);
+
+        RowBuilder builder = new RowBuilder();
+        Rows.copy(originalBuilder.build(), builder);
+
+        Assert.assertEquals(c1, builder.clustering);
+        Assert.assertEquals(liveness, builder.livenessInfo);
+        Assert.assertEquals(rowDeletion, builder.deletionTime);
+        Assert.assertEquals(Lists.newArrayList(Pair.create(m, complexDeletion)), builder.complexDeletions);
+        Assert.assertEquals(Sets.newHashSet(expectedCells), Sets.newHashSet(builder.cells));
+    }
+
+    @Test
+    public void collectStats()
+    {
+        int now = FBUtilities.nowInSeconds();
+        long ts = secondToTs(now);
+        Row.Builder builder = BTreeRow.unsortedBuilder(now);
+        builder.newRow(c1);
+        LivenessInfo liveness = LivenessInfo.create(kcvm, ts, now);
+        builder.addPrimaryKeyLivenessInfo(liveness);
+        DeletionTime complexDeletion = new DeletionTime(ts-1, now);
+        builder.addComplexDeletion(m, complexDeletion);
+        List<Cell> expectedCells = Lists.newArrayList(BufferCell.live(kcvm, v, ts, BB1),
+                                                      BufferCell.live(kcvm, m, ts, BB1, CellPath.create(BB1)),
+                                                      BufferCell.live(kcvm, m, ts, BB2, CellPath.create(BB2)));
+        expectedCells.forEach(builder::addCell);
+        // We need to use ts-1 so the deletion doesn't shadow what we've created
+        Row.Deletion rowDeletion = new Row.Deletion(new DeletionTime(ts-1, now), false);
+        builder.addRowDeletion(rowDeletion);
+
+        StatsCollector collector = new StatsCollector();
+        Rows.collectStats(builder.build(), collector);
+
+        Assert.assertEquals(Lists.newArrayList(liveness), collector.liveness);
+        Assert.assertEquals(Sets.newHashSet(rowDeletion.time(), complexDeletion), Sets.newHashSet(collector.deletions));
+        Assert.assertEquals(Sets.newHashSet(expectedCells), Sets.newHashSet(collector.cells));
+        Assert.assertEquals(2, collector.columnCount);
+        Assert.assertFalse(collector.hasLegacyCounterShards);
+    }
+
+
+    public static void addExpectedCells(Set<MergedPair<Cell>> dst, Cell merged, Cell... inputs)
+    {
+        for (int i=0; i<inputs.length; i++)
+        {
+            dst.add(MergedPair.create(i, merged, inputs[i]));
+        }
+    }
+
+    @Test
+    public void diff()
+    {
+        int now1 = FBUtilities.nowInSeconds();
+        long ts1 = secondToTs(now1);
+        Row.Builder r1Builder = BTreeRow.unsortedBuilder(now1);
+        r1Builder.newRow(c1);
+        LivenessInfo r1Liveness = LivenessInfo.create(kcvm, ts1, now1);
+        r1Builder.addPrimaryKeyLivenessInfo(r1Liveness);
+        DeletionTime r1ComplexDeletion = new DeletionTime(ts1-1, now1);
+        r1Builder.addComplexDeletion(m, r1ComplexDeletion);
+
+        Cell r1v = BufferCell.live(kcvm, v, ts1, BB1);
+        Cell r1m1 = BufferCell.live(kcvm, m, ts1, BB1, CellPath.create(BB1));
+        Cell r1m2 = BufferCell.live(kcvm, m, ts1, BB2, CellPath.create(BB2));
+        List<Cell> r1ExpectedCells = Lists.newArrayList(r1v, r1m1, r1m2);
+
+        r1ExpectedCells.forEach(r1Builder::addCell);
+
+        int now2 = now1 + 1;
+        long ts2 = secondToTs(now2);
+        Row.Builder r2Builder = BTreeRow.unsortedBuilder(now2);
+        r2Builder.newRow(c1);
+        LivenessInfo r2Liveness = LivenessInfo.create(kcvm, ts2, now2);
+        r2Builder.addPrimaryKeyLivenessInfo(r2Liveness);
+        Cell r2v = BufferCell.live(kcvm, v, ts2, BB2);
+        Cell r2m2 = BufferCell.live(kcvm, m, ts2, BB1, CellPath.create(BB2));
+        Cell r2m3 = BufferCell.live(kcvm, m, ts2, BB2, CellPath.create(BB3));
+        Cell r2m4 = BufferCell.live(kcvm, m, ts2, BB3, CellPath.create(BB4));
+        List<Cell> r2ExpectedCells = Lists.newArrayList(r2v, r2m2, r2m3, r2m4);
+
+        r2ExpectedCells.forEach(r2Builder::addCell);
+        Row.Deletion r2RowDeletion = new Row.Deletion(new DeletionTime(ts1 - 2, now2), false);
+        r2Builder.addRowDeletion(r2RowDeletion);
+
+        Row r1 = r1Builder.build();
+        Row r2 = r2Builder.build();
+        Row merged = Rows.merge(r1, r2, now2 + 1);
+
+        Assert.assertEquals(r1ComplexDeletion, merged.getComplexColumnData(m).complexDeletion());
+
+        DiffListener listener = new DiffListener();
+        Rows.diff(listener, merged, r1, r2);
+
+        Assert.assertEquals(c1, listener.clustering);
+
+        // check cells
+        Set<MergedPair<Cell>> expectedCells = Sets.newHashSet();
+        addExpectedCells(expectedCells, r2v,  r1v,  r2v);     // v
+        addExpectedCells(expectedCells, r1m1, r1m1, null);   // m[1]
+        addExpectedCells(expectedCells, r2m2, r1m2, r2m2);   // m[2]
+        addExpectedCells(expectedCells, r2m3, null, r2m3);   // m[3]
+        addExpectedCells(expectedCells, r2m4, null, r2m4);   // m[4]
+
+        Assert.assertEquals(expectedCells.size(), listener.cells.size());
+        Assert.assertEquals(expectedCells, Sets.newHashSet(listener.cells));
+
+        // liveness
+        List<MergedPair<LivenessInfo>> expectedLiveness = Lists.newArrayList(MergedPair.create(0, r2Liveness, r1Liveness),
+                                                                             MergedPair.create(1, r2Liveness, r2Liveness));
+        Assert.assertEquals(expectedLiveness, listener.liveness);
+
+        // deletions
+        List<MergedPair<Row.Deletion>> expectedDeletions = Lists.newArrayList(MergedPair.create(0, r2RowDeletion, null),
+                                                                              MergedPair.create(1, r2RowDeletion, r2RowDeletion));
+        Assert.assertEquals(expectedDeletions, listener.deletions);
+
+        // complex deletions
+        List<MergedPair<DeletionTime>> expectedCmplxDeletions = Lists.newArrayList(MergedPair.create(0, r1ComplexDeletion, r1ComplexDeletion),
+                                                                                   MergedPair.create(1, r1ComplexDeletion, DeletionTime.LIVE));
+        Assert.assertEquals(ImmutableMap.builder().put(m, expectedCmplxDeletions).build(), listener.complexDeletions);
+    }
+
+    /**
+     * merged row has no column data
+     */
+    @Test
+    public void diffEmptyMerged()
+    {
+        int now1 = FBUtilities.nowInSeconds();
+        long ts1 = secondToTs(now1);
+        Row.Builder r1Builder = BTreeRow.unsortedBuilder(now1);
+        r1Builder.newRow(c1);
+        LivenessInfo r1Liveness = LivenessInfo.create(kcvm, ts1, now1);
+        r1Builder.addPrimaryKeyLivenessInfo(r1Liveness);
+
+        // mergedData == null
+        int now2 = now1 + 1;
+        long ts2 = secondToTs(now2);
+        Row.Builder r2Builder = BTreeRow.unsortedBuilder(now2);
+        r2Builder.newRow(c1);
+        LivenessInfo r2Liveness = LivenessInfo.create(kcvm, ts2, now2);
+        r2Builder.addPrimaryKeyLivenessInfo(r2Liveness);
+        DeletionTime r2ComplexDeletion = new DeletionTime(ts2-1, now2);
+        r2Builder.addComplexDeletion(m, r2ComplexDeletion);
+        Cell r2v = BufferCell.live(kcvm, v, ts2, BB2);
+        Cell r2m2 = BufferCell.live(kcvm, m, ts2, BB1, CellPath.create(BB2));
+        Cell r2m3 = BufferCell.live(kcvm, m, ts2, BB2, CellPath.create(BB3));
+        Cell r2m4 = BufferCell.live(kcvm, m, ts2, BB3, CellPath.create(BB4));
+        List<Cell> r2ExpectedCells = Lists.newArrayList(r2v, r2m2, r2m3, r2m4);
+
+        r2ExpectedCells.forEach(r2Builder::addCell);
+        Row.Deletion r2RowDeletion = new Row.Deletion(new DeletionTime(ts1 - 1, now2), false);
+        r2Builder.addRowDeletion(r2RowDeletion);
+
+        Row r1 = r1Builder.build();
+        Row r2 = r2Builder.build();
+
+        DiffListener listener = new DiffListener();
+        Rows.diff(listener, r1, r2);
+
+        Assert.assertEquals(c1, listener.clustering);
+
+        // check cells
+        Set<MergedPair<Cell>> expectedCells = Sets.newHashSet(MergedPair.create(0, null, r2v),   // v
+                                                              MergedPair.create(0, null, r2m2),  // m[2]
+                                                              MergedPair.create(0, null, r2m3),  // m[3]
+                                                              MergedPair.create(0, null, r2m4)); // m[4]
+
+        Assert.assertEquals(expectedCells.size(), listener.cells.size());
+        Assert.assertEquals(expectedCells, Sets.newHashSet(listener.cells));
+
+        // complex deletions
+        List<MergedPair<DeletionTime>> expectedCmplxDeletions = Lists.newArrayList(MergedPair.create(0, null, r2ComplexDeletion));
+        Assert.assertEquals(ImmutableMap.builder().put(m, expectedCmplxDeletions).build(), listener.complexDeletions);
+    }
+
+    /**
+     * input row has no column data
+     */
+    @Test
+    public void diffEmptyInput()
+    {
+        int now1 = FBUtilities.nowInSeconds();
+        long ts1 = secondToTs(now1);
+        Row.Builder r1Builder = BTreeRow.unsortedBuilder(now1);
+        r1Builder.newRow(c1);
+        LivenessInfo r1Liveness = LivenessInfo.create(kcvm, ts1, now1);
+        r1Builder.addPrimaryKeyLivenessInfo(r1Liveness);
+
+        // mergedData == null
+        int now2 = now1 + 1;
+        long ts2 = secondToTs(now2);
+        Row.Builder r2Builder = BTreeRow.unsortedBuilder(now2);
+        r2Builder.newRow(c1);
+        LivenessInfo r2Liveness = LivenessInfo.create(kcvm, ts2, now2);
+        r2Builder.addPrimaryKeyLivenessInfo(r2Liveness);
+        DeletionTime r2ComplexDeletion = new DeletionTime(ts2-1, now2);
+        r2Builder.addComplexDeletion(m, r2ComplexDeletion);
+        Cell r2v = BufferCell.live(kcvm, v, ts2, BB2);
+        Cell r2m2 = BufferCell.live(kcvm, m, ts2, BB1, CellPath.create(BB2));
+        Cell r2m3 = BufferCell.live(kcvm, m, ts2, BB2, CellPath.create(BB3));
+        Cell r2m4 = BufferCell.live(kcvm, m, ts2, BB3, CellPath.create(BB4));
+        List<Cell> r2ExpectedCells = Lists.newArrayList(r2v, r2m2, r2m3, r2m4);
+
+        r2ExpectedCells.forEach(r2Builder::addCell);
+        Row.Deletion r2RowDeletion = new Row.Deletion(new DeletionTime(ts1 - 1, now2), false);
+        r2Builder.addRowDeletion(r2RowDeletion);
+
+        Row r1 = r1Builder.build();
+        Row r2 = r2Builder.build();
+
+        DiffListener listener = new DiffListener();
+        Rows.diff(listener, r2, r1);
+
+        Assert.assertEquals(c1, listener.clustering);
+
+        // check cells
+        Set<MergedPair<Cell>> expectedCells = Sets.newHashSet(MergedPair.create(0, r2v, null),   // v
+                                                              MergedPair.create(0, r2m2, null),  // m[2]
+                                                              MergedPair.create(0, r2m3, null),  // m[3]
+                                                              MergedPair.create(0, r2m4, null)); // m[4]
+
+        Assert.assertEquals(expectedCells.size(), listener.cells.size());
+        Assert.assertEquals(expectedCells, Sets.newHashSet(listener.cells));
+
+        // complex deletions
+        List<MergedPair<DeletionTime>> expectedCmplxDeletions = Lists.newArrayList(MergedPair.create(0, r2ComplexDeletion, null));
+        Assert.assertEquals(ImmutableMap.builder().put(m, expectedCmplxDeletions).build(), listener.complexDeletions);
+    }
+
+    @Test
+    public void merge()
+    {
+        int now1 = FBUtilities.nowInSeconds();
+        Row.Builder existingBuilder = createBuilder(c1, now1, BB1, BB1, BB1);
+
+        int now2 = now1 + 1;
+        long ts2 = secondToTs(now2);
+
+        Cell expectedVCell = BufferCell.live(kcvm, v, ts2, BB2);
+        Cell expectedMCell = BufferCell.live(kcvm, m, ts2, BB2, CellPath.create(BB1));
+        DeletionTime expectedComplexDeletionTime = new DeletionTime(ts2 - 1, now2);
+
+        Row.Builder updateBuilder = createBuilder(c1, now2, null, null, null);
+        updateBuilder.addCell(expectedVCell);
+        updateBuilder.addComplexDeletion(m, expectedComplexDeletionTime);
+        updateBuilder.addCell(expectedMCell);
+
+        RowBuilder builder = new RowBuilder();
+        long td = Rows.merge(existingBuilder.build(), updateBuilder.build(), builder, now2 + 1);
+
+        Assert.assertEquals(c1, builder.clustering);
+        Assert.assertEquals(LivenessInfo.create(kcvm, ts2, now2), builder.livenessInfo);
+        Assert.assertEquals(Lists.newArrayList(Pair.create(m, new DeletionTime(ts2-1, now2))), builder.complexDeletions);
+
+        Assert.assertEquals(2, builder.cells.size());
+        Assert.assertEquals(Lists.newArrayList(expectedVCell, expectedMCell), Lists.newArrayList(builder.cells));
+        Assert.assertEquals(ts2 - secondToTs(now1), td);
+    }
+
+    @Test
+    public void mergeComplexDeletionSupersededByRowDeletion()
+    {
+        int now1 = FBUtilities.nowInSeconds();
+        Row.Builder existingBuilder = createBuilder(c1, now1, null, null, null);
+
+        int now2 = now1 + 1;
+        Row.Builder updateBuilder = createBuilder(c1, now2, null, BB1, BB1);
+        int now3 = now2 + 1;
+        Row.Deletion expectedDeletion = new Row.Deletion(new DeletionTime(secondToTs(now3), now3), false);
+        updateBuilder.addRowDeletion(expectedDeletion);
+
+        RowBuilder builder = new RowBuilder();
+        Rows.merge(existingBuilder.build(), updateBuilder.build(), builder, now3 + 1);
+
+        Assert.assertEquals(expectedDeletion, builder.deletionTime);
+        Assert.assertEquals(Collections.emptyList(), builder.complexDeletions);
+        Assert.assertEquals(Collections.emptyList(), builder.cells);
+    }
+
+    /**
+     * If a row's deletion time deletes a row's liveness info, the new row should have it's
+     * liveness info set to empty
+     */
+    @Test
+    public void mergeRowDeletionSupercedesLiveness()
+    {
+        int now1 = FBUtilities.nowInSeconds();
+        Row.Builder existingBuilder = createBuilder(c1, now1, null, null, null);
+
+        int now2 = now1 + 1;
+        Row.Builder updateBuilder = createBuilder(c1, now2, BB1, BB1, BB1);
+        int now3 = now2 + 1;
+        Row.Deletion expectedDeletion = new Row.Deletion(new DeletionTime(secondToTs(now3), now3), false);
+        updateBuilder.addRowDeletion(expectedDeletion);
+
+        RowBuilder builder = new RowBuilder();
+        Rows.merge(existingBuilder.build(), updateBuilder.build(), builder, now3 + 1);
+
+        Assert.assertEquals(expectedDeletion, builder.deletionTime);
+        Assert.assertEquals(LivenessInfo.EMPTY, builder.livenessInfo);
+        Assert.assertEquals(Collections.emptyList(), builder.complexDeletions);
+        Assert.assertEquals(Collections.emptyList(), builder.cells);
+    }
+
+    // Creates a dummy cell for a (regular) column for the provided name and without a cellPath.
+    private static Cell liveCell(ColumnDefinition name)
+    {
+        return liveCell(name, -1);
+    }
+
+    // Creates a dummy cell for a (regular) column for the provided name.
+    // If path >= 0, the cell will have a CellPath containing path as an Int32Type.
+    private static Cell liveCell(ColumnDefinition name, int path)
+    {
+        CellPath cp = path < 0 ? null : CellPath.create(ByteBufferUtil.bytes(path));
+        return new BufferCell(name, 0L, Cell.NO_TTL, Cell.NO_DELETION_TIME, ByteBuffer.allocate(1), cp);
+    }
+
+    // Assert that the cells generated by iterating iterable are the cell of cells (in the same order
+    // and with neither more nor less cells).
+    private static void assertCellOrder(Iterable<Cell> iterable, Cell... cells)
+    {
+        int i = 0;
+        for (Cell actual : iterable)
+        {
+            Assert.assertFalse(String.format("Got more rows than expected (expecting %d). First unexpected cell is %s", cells.length, actual), i >= cells.length);
+            Assert.assertEquals(cells[i++], actual);
+        }
+        Assert.assertFalse(String.format("Got less rows than expected (got %d while expecting %d).", i, cells.length), i < cells.length);
+    }
+
+    // Make a dummy row (empty clustering) with the provided cells, that are assumed to be in order
+    private static Row makeDummyRow(Cell ... cells)
+    {
+        Row.Builder builder = BTreeRow.sortedBuilder();
+        builder.newRow(Clustering.EMPTY);
+        for (Cell cell : cells)
+            builder.addCell(cell);
+
+        return builder.build();
+    }
+
+    @Test
+    public void testLegacyCellIterator()
+    {
+        // Creates a table with
+        //   - 3 Simple columns: a, c and e
+        //   - 2 Complex columns: b and d
+        CFMetaData metadata = CFMetaData.Builder.create("dummy_ks", "dummy_tbl")
+                                        .addPartitionKey("k", BytesType.instance)
+                                        .addRegularColumn("a", BytesType.instance)
+                                        .addRegularColumn("b", MapType.getInstance(Int32Type.instance, BytesType.instance, true))
+                                        .addRegularColumn("c", BytesType.instance)
+                                        .addRegularColumn("d", MapType.getInstance(Int32Type.instance, BytesType.instance, true))
+                                        .addRegularColumn("e", BytesType.instance)
+                                        .build();
+
+        ColumnDefinition a = metadata.getColumnDefinition(new ColumnIdentifier("a", false));
+        ColumnDefinition b = metadata.getColumnDefinition(new ColumnIdentifier("b", false));
+        ColumnDefinition c = metadata.getColumnDefinition(new ColumnIdentifier("c", false));
+        ColumnDefinition d = metadata.getColumnDefinition(new ColumnIdentifier("d", false));
+        ColumnDefinition e = metadata.getColumnDefinition(new ColumnIdentifier("e", false));
+
+        Row row;
+
+        // Row with only simple columns
+
+        row = makeDummyRow(liveCell(a),
+                           liveCell(c),
+                           liveCell(e));
+
+
+        assertCellOrder(row.cellsInLegacyOrder(metadata, false),
+                        liveCell(a),
+                        liveCell(c),
+                        liveCell(e));
+
+        assertCellOrder(row.cellsInLegacyOrder(metadata, true),
+                        liveCell(e),
+                        liveCell(c),
+                        liveCell(a));
+
+        // Row with only complex columns
+
+        row = makeDummyRow(liveCell(b, 1),
+                           liveCell(b, 2),
+                           liveCell(d, 3),
+                           liveCell(d, 4));
+
+
+        assertCellOrder(row.cellsInLegacyOrder(metadata, false),
+                        liveCell(b, 1),
+                        liveCell(b, 2),
+                        liveCell(d, 3),
+                        liveCell(d, 4));
+
+        assertCellOrder(row.cellsInLegacyOrder(metadata, true),
+                        liveCell(d, 4),
+                        liveCell(d, 3),
+                        liveCell(b, 2),
+                        liveCell(b, 1));
+
+        // Row with mixed simple and complex columns
+
+        row = makeDummyRow(liveCell(a),
+                           liveCell(c),
+                           liveCell(e),
+                           liveCell(b, 1),
+                           liveCell(b, 2),
+                           liveCell(d, 3),
+                           liveCell(d, 4));
+
+
+        assertCellOrder(row.cellsInLegacyOrder(metadata, false),
+                        liveCell(a),
+                        liveCell(b, 1),
+                        liveCell(b, 2),
+                        liveCell(c),
+                        liveCell(d, 3),
+                        liveCell(d, 4),
+                        liveCell(e));
+
+        assertCellOrder(row.cellsInLegacyOrder(metadata, true),
+                        liveCell(e),
+                        liveCell(d, 4),
+                        liveCell(d, 3),
+                        liveCell(c),
+                        liveCell(b, 2),
+                        liveCell(b, 1),
+                        liveCell(a));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsMergeTest.java b/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsMergeTest.java
new file mode 100644
index 0000000..7637fa0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsMergeTest.java

@@ -0,0 +1,492 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.rows;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterators;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.Slice.Bound;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.rows.Unfiltered.Kind;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class UnfilteredRowIteratorsMergeTest
+{
+    static DecoratedKey partitionKey = Util.dk("key");
+    static DeletionTime partitionLevelDeletion = DeletionTime.LIVE;
+    static CFMetaData metadata = CFMetaData.Builder.create("UnfilteredRowIteratorsMergeTest", "Test").
+            addPartitionKey("key", AsciiType.instance).
+            addClusteringColumn("clustering", Int32Type.instance).
+            addRegularColumn("data", Int32Type.instance).
+            build();
+    static Comparator<Clusterable> comparator = new ClusteringComparator(Int32Type.instance);
+    static int nowInSec = FBUtilities.nowInSeconds();
+
+    static final int RANGE = 3000;
+    static final int DEL_RANGE = 100;
+    static final int ITERATORS = 15;
+    static final int ITEMS = 300;
+
+    boolean reversed;
+
+    public UnfilteredRowIteratorsMergeTest()
+    {
+    }
+
+    @Test
+    public void testTombstoneMerge()
+    {
+        testTombstoneMerge(false, false);
+    }
+
+    @Test
+    public void testTombstoneMergeReversed()
+    {
+        testTombstoneMerge(true, false);
+    }
+
+    @Test
+    public void testTombstoneMergeIterative()
+    {
+        testTombstoneMerge(false, true);
+    }
+
+    @Test
+    public void testTombstoneMergeReversedIterative()
+    {
+        testTombstoneMerge(true, true);
+    }
+
+    @Test
+    public void testDuplicateRangeCase()
+    {
+        testForInput("67<=[98] [98]<=67",
+                     "66<=[11] [11]<71",
+                     "66<[13] [13]<67");
+    }
+
+    @SuppressWarnings("unused")
+    public void testTombstoneMerge(boolean reversed, boolean iterations)
+    {
+        for (int seed = 1; seed <= 100; ++seed)
+        {
+            this.reversed = reversed;
+            if (ITEMS <= 20)
+                System.out.println("\nSeed " + seed);
+
+            Random r = new Random(seed);
+            List<Function<Integer, Integer>> timeGenerators = ImmutableList.of(
+                    x -> -1,
+                    x -> DEL_RANGE,
+                    x -> r.nextInt(DEL_RANGE)
+                );
+            List<List<Unfiltered>> sources = new ArrayList<>(ITERATORS);
+            if (ITEMS <= 20)
+                System.out.println("Merging");
+            for (int i=0; i<ITERATORS; ++i)
+                sources.add(generateSource(r, timeGenerators.get(r.nextInt(timeGenerators.size()))));
+            List<Unfiltered> merged = merge(sources, iterations);
+    
+            if (ITEMS <= 20)
+                System.out.println("results in");
+            if (ITEMS <= 20)
+                dumpList(merged);
+            verifyEquivalent(sources, merged);
+            verifyValid(merged);
+            if (reversed)
+            {
+                Collections.reverse(merged);
+                this.reversed = false;
+                verifyValid(merged);
+            }
+        }
+    }
+
+    private List<Unfiltered> merge(List<List<Unfiltered>> sources, boolean iterations)
+    {
+        List<UnfilteredRowIterator> us = sources.stream().map(l -> new Source(l.iterator())).collect(Collectors.toList());
+        List<Unfiltered> merged = new ArrayList<>();
+        Iterators.addAll(merged, mergeIterators(us, iterations));
+        return merged;
+    }
+
+    public UnfilteredRowIterator mergeIterators(List<UnfilteredRowIterator> us, boolean iterations)
+    {
+        int now = FBUtilities.nowInSeconds();
+        if (iterations)
+        {
+            UnfilteredRowIterator mi = us.get(0);
+            int i;
+            for (i = 1; i + 2 <= ITERATORS; i += 2)
+                mi = UnfilteredRowIterators.merge(ImmutableList.of(mi, us.get(i), us.get(i+1)), now);
+            if (i + 1 <= ITERATORS)
+                mi = UnfilteredRowIterators.merge(ImmutableList.of(mi, us.get(i)), now);
+            return mi;
+        }
+        else
+        {
+            return UnfilteredRowIterators.merge(us, now);
+        }
+    }
+
+    @SuppressWarnings("unused")
+    private List<Unfiltered> generateSource(Random r, Function<Integer, Integer> timeGenerator)
+    {
+        int[] positions = new int[ITEMS + 1];
+        for (int i=0; i<ITEMS; ++i)
+            positions[i] = r.nextInt(RANGE);
+        positions[ITEMS] = RANGE;
+        Arrays.sort(positions);
+
+        List<Unfiltered> content = new ArrayList<>(ITEMS);
+        int prev = -1;
+        for (int i=0; i<ITEMS; ++i)
+        {
+            int pos = positions[i];
+            int sz = positions[i + 1] - pos;
+            if (sz == 0 && pos == prev)
+                // Filter out more than two of the same position.
+                continue;
+            if (r.nextBoolean() || pos == prev)
+            {
+                int span;
+                boolean includesStart;
+                boolean includesEnd;
+                if (pos > prev)
+                {
+                    span = r.nextInt(sz + 1);
+                    includesStart = span > 0 ? r.nextBoolean() : true;
+                    includesEnd = span > 0 ? r.nextBoolean() : true;
+                }
+                else
+                {
+                    span = 1 + r.nextInt(sz);
+                    includesStart = false;
+                    includesEnd = r.nextBoolean();
+                }
+                int deltime = r.nextInt(DEL_RANGE);
+                DeletionTime dt = new DeletionTime(deltime, deltime);
+                content.add(new RangeTombstoneBoundMarker(boundFor(pos, true, includesStart), dt));
+                content.add(new RangeTombstoneBoundMarker(boundFor(pos + span, false, includesEnd), dt));
+                prev = pos + span - (includesEnd ? 0 : 1);
+            }
+            else
+            {
+                content.add(emptyRowAt(pos, timeGenerator));
+                prev = pos;
+            }
+        }
+
+        attachBoundaries(content);
+        if (reversed)
+        {
+            Collections.reverse(content);
+        }
+        verifyValid(content);
+        if (ITEMS <= 20)
+            dumpList(content);
+        return content;
+    }
+
+    static void attachBoundaries(List<Unfiltered> content)
+    {
+        int di = 0;
+        RangeTombstoneMarker prev = null;
+        for (int si = 0; si < content.size(); ++si)
+        {
+            Unfiltered currUnfiltered = content.get(si);
+            RangeTombstoneMarker curr = currUnfiltered.kind() == Kind.RANGE_TOMBSTONE_MARKER ?
+                                        (RangeTombstoneMarker) currUnfiltered :
+                                        null;
+            if (prev != null && curr != null && prev.isClose(false) && curr.isOpen(false) && prev.clustering().invert().equals(curr.clustering()))
+            {
+                // Join. Prefer not to use merger to check its correctness.
+                RangeTombstone.Bound b = prev.clustering();
+                b = b.withNewKind(b.isInclusive() ? RangeTombstone.Bound.Kind.INCL_END_EXCL_START_BOUNDARY : RangeTombstone.Bound.Kind.EXCL_END_INCL_START_BOUNDARY);
+                prev = new RangeTombstoneBoundaryMarker(b, prev.closeDeletionTime(false), curr.openDeletionTime(false));
+                currUnfiltered = prev;
+                --di;
+            }
+            content.set(di++, currUnfiltered);
+            prev = curr;
+        }
+        for (int pos = content.size() - 1; pos >= di; --pos)
+            content.remove(pos);
+    }
+
+    void verifyValid(List<Unfiltered> list)
+    {
+        int reversedAsMultiplier = reversed ? -1 : 1;
+        try {
+            RangeTombstoneMarker prev = null;
+            Unfiltered prevUnfiltered = null;
+            for (Unfiltered unfiltered : list)
+            {
+                Assert.assertTrue("Order violation prev " + str(prevUnfiltered) + " curr " + str(unfiltered),
+                                  prevUnfiltered == null || comparator.compare(prevUnfiltered, unfiltered) * reversedAsMultiplier < 0);
+                prevUnfiltered = unfiltered;
+
+                if (unfiltered.kind() == Kind.RANGE_TOMBSTONE_MARKER)
+                {
+                    RangeTombstoneMarker curr = (RangeTombstoneMarker) unfiltered;
+                    if (prev != null)
+                    {
+                        if (curr.isClose(reversed))
+                        {
+                            Assert.assertTrue(str(unfiltered) + " follows another close marker " + str(prev), prev.isOpen(reversed));
+                            Assert.assertEquals("Deletion time mismatch for open " + str(prev) + " and close " + str(unfiltered),
+                                                prev.openDeletionTime(reversed),
+                                                curr.closeDeletionTime(reversed));
+                        }
+                        else
+                            Assert.assertFalse(str(curr) + " follows another open marker " + str(prev), prev.isOpen(reversed));
+                    }
+
+                    prev = curr;
+                }
+            }
+            Assert.assertFalse("Cannot end in open marker " + str(prev), prev != null && prev.isOpen(reversed));
+
+        } catch (AssertionError e) {
+            System.out.println(e);
+            dumpList(list);
+            throw e;
+        }
+    }
+
+    void verifyEquivalent(List<List<Unfiltered>> sources, List<Unfiltered> merged)
+    {
+        try
+        {
+            for (int i=0; i<RANGE; ++i)
+            {
+                Clusterable c = clusteringFor(i);
+                DeletionTime dt = DeletionTime.LIVE;
+                for (List<Unfiltered> source : sources)
+                {
+                    dt = deletionFor(c, source, dt);
+                }
+                Assert.assertEquals("Deletion time mismatch for position " + str(c), dt, deletionFor(c, merged));
+                if (dt == DeletionTime.LIVE)
+                {
+                    Optional<Unfiltered> sourceOpt = sources.stream().map(source -> rowFor(c, source)).filter(x -> x != null).findAny();
+                    Unfiltered mergedRow = rowFor(c, merged);
+                    Assert.assertEquals("Content mismatch for position " + str(c), str(sourceOpt.orElse(null)), str(mergedRow));
+                }
+            }
+        }
+        catch (AssertionError e)
+        {
+            System.out.println(e);
+            for (List<Unfiltered> list : sources)
+                dumpList(list);
+            System.out.println("merged");
+            dumpList(merged);
+            throw e;
+        }
+    }
+
+    private Unfiltered rowFor(Clusterable pointer, List<Unfiltered> list)
+    {
+        int index = Collections.binarySearch(list, pointer, reversed ? comparator.reversed() : comparator);
+        return index >= 0 ? list.get(index) : null;
+    }
+
+    DeletionTime deletionFor(Clusterable pointer, List<Unfiltered> list)
+    {
+        return deletionFor(pointer, list, DeletionTime.LIVE);
+    }
+
+    DeletionTime deletionFor(Clusterable pointer, List<Unfiltered> list, DeletionTime def)
+    {
+        if (list.isEmpty())
+            return def;
+
+        int index = Collections.binarySearch(list, pointer, reversed ? comparator.reversed() : comparator);
+        if (index < 0)
+            index = -1 - index;
+        else
+        {
+            Row row = (Row) list.get(index);
+            if (row.deletion().supersedes(def))
+                def = row.deletion().time();
+        }
+
+        if (index >= list.size())
+            return def;
+
+        while (--index >= 0)
+        {
+            Unfiltered unfiltered = list.get(index);
+            if (unfiltered.kind() == Kind.ROW)
+                continue;
+            RangeTombstoneMarker lower = (RangeTombstoneMarker) unfiltered;
+            if (!lower.isOpen(reversed))
+                return def;
+            return lower.openDeletionTime(reversed).supersedes(def) ? lower.openDeletionTime(reversed) : def;
+        }
+        return def;
+    }
+
+    private static Bound boundFor(int pos, boolean start, boolean inclusive)
+    {
+        return Bound.create(Bound.boundKind(start, inclusive), new ByteBuffer[] {Int32Type.instance.decompose(pos)});
+    }
+
+    private static Clustering clusteringFor(int i)
+    {
+        return new Clustering(Int32Type.instance.decompose(i));
+    }
+
+    static Row emptyRowAt(int pos, Function<Integer, Integer> timeGenerator)
+    {
+        final Clustering clustering = clusteringFor(pos);
+        final LivenessInfo live = LivenessInfo.create(metadata, timeGenerator.apply(pos), nowInSec);
+        return BTreeRow.noCellLiveRow(clustering, live);
+    }
+
+    private void dumpList(List<Unfiltered> list)
+    {
+        for (Unfiltered u : list)
+            System.out.print(str(u) + " ");
+        System.out.println();
+    }
+
+    private String str(Clusterable curr)
+    {
+        if (curr == null)
+            return "null";
+        String val = Int32Type.instance.getString(curr.clustering().get(0));
+        if (curr instanceof RangeTombstoneMarker)
+        {
+            RangeTombstoneMarker marker = (RangeTombstoneMarker) curr;
+            if (marker.isClose(reversed))
+                val = "[" + marker.closeDeletionTime(reversed).markedForDeleteAt() + "]" + (marker.closeIsInclusive(reversed) ? "<=" : "<") + val;
+            if (marker.isOpen(reversed)) 
+                val = val + (marker.openIsInclusive(reversed) ? "<=" : "<") + "[" + marker.openDeletionTime(reversed).markedForDeleteAt() + "]";
+        }
+        return val;
+    }
+
+    class Source extends AbstractUnfilteredRowIterator implements UnfilteredRowIterator
+    {
+        Iterator<Unfiltered> content;
+
+        protected Source(Iterator<Unfiltered> content)
+        {
+            super(UnfilteredRowIteratorsMergeTest.metadata,
+                  UnfilteredRowIteratorsMergeTest.partitionKey,
+                  UnfilteredRowIteratorsMergeTest.partitionLevelDeletion,
+                  UnfilteredRowIteratorsMergeTest.metadata.partitionColumns(),
+                  null,
+                  reversed,
+                  EncodingStats.NO_STATS);
+            this.content = content;
+        }
+
+        @Override
+        protected Unfiltered computeNext()
+        {
+            return content.hasNext() ? content.next() : endOfData();
+        }
+    }
+
+    public void testForInput(String... inputs)
+    {
+        List<List<Unfiltered>> sources = new ArrayList<>();
+        for (String input : inputs)
+        {
+            List<Unfiltered> source = parse(input);
+            attachBoundaries(source);
+            dumpList(source);
+            verifyValid(source);
+            sources.add(source);
+        }
+
+        List<Unfiltered> merged = merge(sources, false);
+        System.out.println("Merge to:");
+        dumpList(merged);
+        verifyEquivalent(sources, merged);
+        verifyValid(merged);
+        System.out.println();
+    }
+
+    List<Unfiltered> parse(String input)
+    {
+        String[] split = input.split(" ");
+        Pattern open = Pattern.compile("(\\d+)<(=)?\\[(\\d+)\\]");
+        Pattern close = Pattern.compile("\\[(\\d+)\\]<(=)?(\\d+)");
+        Pattern row = Pattern.compile("(\\d+)(\\[(\\d+)\\])?");
+        List<Unfiltered> out = new ArrayList<>(split.length);
+        for (String s : split)
+        {
+            Matcher m = open.matcher(s);
+            if (m.matches())
+            {
+                out.add(openMarker(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(3)), m.group(2) != null));
+                continue;
+            }
+            m = close.matcher(s);
+            if (m.matches())
+            {
+                out.add(closeMarker(Integer.parseInt(m.group(3)), Integer.parseInt(m.group(1)), m.group(2) != null));
+                continue;
+            }
+            m = row.matcher(s);
+            if (m.matches())
+            {
+                int live = m.group(3) != null ? Integer.parseInt(m.group(3)) : DEL_RANGE;
+                out.add(emptyRowAt(Integer.parseInt(m.group(1)), x -> live));
+                continue;
+            }
+            Assert.fail("Can't parse " + s);
+        }
+        return out;
+    }
+
+    private RangeTombstoneMarker openMarker(int pos, int delTime, boolean inclusive)
+    {
+        return marker(pos, delTime, true, inclusive);
+    }
+
+    private RangeTombstoneMarker closeMarker(int pos, int delTime, boolean inclusive)
+    {
+        return marker(pos, delTime, false, inclusive);
+    }
+
+    private RangeTombstoneMarker marker(int pos, int delTime, boolean isStart, boolean inclusive)
+    {
+        return new RangeTombstoneBoundMarker(Bound.create(Bound.boundKind(isStart, inclusive),
+                                                          new ByteBuffer[] {clusteringFor(pos).get(0)}),
+                                             new DeletionTime(delTime, delTime));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java b/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java
new file mode 100644
index 0000000..0387ba2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java

@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.rows;
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.EmptyIterators;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class UnfilteredRowIteratorsTest
+{
+    static final CFMetaData metadata;
+    static final ColumnDefinition v1Metadata;
+    static final ColumnDefinition v2Metadata;
+
+    static
+    {
+        metadata = CFMetaData.Builder.create("", "")
+                             .addPartitionKey("pk", Int32Type.instance)
+                                     .addClusteringColumn("ck", Int32Type.instance)
+                             .addRegularColumn("v1", Int32Type.instance)
+                             .addRegularColumn("v2", Int32Type.instance)
+                             .build();
+        v1Metadata = metadata.partitionColumns().columns(false).getSimple(0);
+        v2Metadata = metadata.partitionColumns().columns(false).getSimple(1);
+    }
+
+
+    @Test
+    public void concatTest()
+    {
+        UnfilteredRowIterator iter1, iter2, iter3, concat;
+        // simple concatenation
+        iter1 = rows(metadata.partitionColumns(), 1,
+                     row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                     row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+        iter2 = rows(metadata.partitionColumns(), 1,
+                     row(3, cell(v1Metadata, 3), cell(v2Metadata, 3)),
+                     row(4, cell(v1Metadata, 4), cell(v2Metadata, 4)));
+        concat = UnfilteredRowIterators.concat(iter1, iter2);
+        Assert.assertEquals(concat.columns(), metadata.partitionColumns());
+        assertRows(concat,
+                   row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                   row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)),
+                   row(3, cell(v1Metadata, 3), cell(v2Metadata, 3)),
+                   row(4, cell(v1Metadata, 4), cell(v2Metadata, 4)));
+
+        // concat with RHS empty iterator
+        iter1 = rows(metadata.partitionColumns(), 1,
+                     row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                     row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+        Assert.assertEquals(concat.columns(), metadata.partitionColumns());
+        assertRows(UnfilteredRowIterators.concat(iter1, EmptyIterators.unfilteredRow(metadata, dk(1), false, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE)),
+                   row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                   row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+
+        // concat with LHS empty iterator
+        iter1 = rows(metadata.partitionColumns(), 1,
+                     row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                     row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+        Assert.assertEquals(concat.columns(), metadata.partitionColumns());
+        assertRows(UnfilteredRowIterators.concat(EmptyIterators.unfilteredRow(metadata, dk(1), false, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE), iter1),
+                   row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                   row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+
+        // concat with different columns
+        iter1 = rows(metadata.partitionColumns().without(v1Metadata), 1,
+                     row(1, cell(v2Metadata, 1)), row(2, cell(v2Metadata, 2)));
+        iter2 = rows(metadata.partitionColumns().without(v2Metadata), 1,
+                     row(3, cell(v1Metadata, 3)), row(4, cell(v1Metadata, 4)));
+        concat = UnfilteredRowIterators.concat(iter1, iter2);
+        Assert.assertEquals(concat.columns(), PartitionColumns.of(v1Metadata).mergeTo(PartitionColumns.of(v2Metadata)));
+        assertRows(concat,
+                   row(1, cell(v2Metadata, 1)), row(2, cell(v2Metadata, 2)),
+                   row(3, cell(v1Metadata, 3)), row(4, cell(v1Metadata, 4)));
+
+        // concat with CQL limits
+        iter1 = rows(metadata.partitionColumns(), 1,
+                     row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                     row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+        iter2 = rows(metadata.partitionColumns(), 1,
+                     row(3, cell(v1Metadata, 3), cell(v2Metadata, 3)),
+                     row(4, cell(v1Metadata, 4), cell(v2Metadata, 4)));
+        concat = UnfilteredRowIterators.concat(DataLimits.cqlLimits(1).filter(iter1, FBUtilities.nowInSeconds(), true),
+                                               DataLimits.cqlLimits(1).filter(iter2, FBUtilities.nowInSeconds(), true));
+        Assert.assertEquals(concat.columns(), metadata.partitionColumns());
+        assertRows(concat,
+                   row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                   row(3, cell(v1Metadata, 3), cell(v2Metadata, 3)));
+
+        // concat concatenated iterators
+        iter1 = rows(metadata.partitionColumns(), 1,
+                     row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                     row(2, cell(v1Metadata, 2), cell(v2Metadata, 2)));
+        iter2 = rows(metadata.partitionColumns(), 1,
+                     row(3, cell(v1Metadata, 3), cell(v2Metadata, 3)),
+                     row(4, cell(v1Metadata, 4), cell(v2Metadata, 4)));
+
+        concat = UnfilteredRowIterators.concat(DataLimits.cqlLimits(1).filter(iter1, FBUtilities.nowInSeconds(), true),
+                                               DataLimits.cqlLimits(1).filter(iter2, FBUtilities.nowInSeconds(), true));
+
+        iter3 = rows(metadata.partitionColumns(), 1,
+                     row(4, cell(v1Metadata, 4), cell(v2Metadata, 4)),
+                     row(5, cell(v1Metadata, 5), cell(v2Metadata, 5)));
+        concat = UnfilteredRowIterators.concat(concat, DataLimits.cqlLimits(1).filter(iter3, FBUtilities.nowInSeconds(), true));
+
+        Assert.assertEquals(concat.columns(), metadata.partitionColumns());
+        assertRows(concat,
+                   row(1, cell(v1Metadata, 1), cell(v2Metadata, 1)),
+                   row(3, cell(v1Metadata, 3), cell(v2Metadata, 3)),
+                   row(4, cell(v1Metadata, 4), cell(v2Metadata, 4)));
+    }
+
+    public static void assertRows(UnfilteredRowIterator iterator, Row... rows)
+    {
+        Iterator<Row> rowsIterator = Arrays.asList(rows).iterator();
+
+        while (iterator.hasNext() && rowsIterator.hasNext())
+            Assert.assertEquals(iterator.next(), rowsIterator.next());
+
+        Assert.assertTrue(iterator.hasNext() == rowsIterator.hasNext());
+    }
+
+    public static DecoratedKey dk(int pk)
+    {
+        return new BufferDecoratedKey(new Murmur3Partitioner.LongToken(pk), ByteBufferUtil.bytes(pk));
+    }
+
+    public static UnfilteredRowIterator rows(PartitionColumns columns, int pk, Row... rows)
+    {
+        Iterator<Row> rowsIterator = Arrays.asList(rows).iterator();
+        return new AbstractUnfilteredRowIterator(metadata, dk(pk), DeletionTime.LIVE, columns, Rows.EMPTY_STATIC_ROW, false, EncodingStats.NO_STATS) {
+            protected Unfiltered computeNext()
+            {
+                return rowsIterator.hasNext() ? rowsIterator.next() : endOfData();
+            }
+        };
+    }
+
+    public Row row(int ck, Cell... columns)
+    {
+        BTreeRow.Builder builder = new BTreeRow.Builder(true);
+        builder.newRow(Util.clustering(metadata.comparator, ck));
+        for (Cell cell : columns)
+            builder.addCell(cell);
+        return builder.build();
+    }
+
+    public Cell cell(ColumnDefinition metadata, int v)
+    {
+        return new BufferCell(metadata,
+                              1L, BufferCell.NO_TTL, BufferCell.NO_DELETION_TIME, ByteBufferUtil.bytes(v), null);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java b/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java
new file mode 100644
index 0000000..78a0c8c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java

@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.transform;
+
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterators;
+import org.junit.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.net.*;
+import org.apache.cassandra.utils.DiagnosticSnapshotService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class DuplicateRowCheckerTest extends CQLTester
+{
+    ColumnFamilyStore cfs;
+    CFMetaData metadata;
+    static HashMap<InetAddress, MessageOut> sentMessages;
+
+    @BeforeClass
+    public static void setupMessaging()
+    {
+        sentMessages = new HashMap<>();
+        IMessageSink sink = new IMessageSink()
+        {
+            public boolean allowOutgoingMessage(MessageOut message, int id, InetAddress to)
+            {
+                sentMessages.put(to, message);
+                return false;
+            }
+
+            public boolean allowIncomingMessage(MessageIn message, int id)
+            {
+                return false;
+            }
+        };
+        MessagingService.instance().addMessageSink(sink);
+    }
+
+    @Before
+    public void setup() throws Throwable
+    {
+        DatabaseDescriptor.setSnapshotOnDuplicateRowDetection(true);
+        System.setProperty("cassandra.diagnostic_snapshot_interval_nanos", "0");
+        // Create a table and insert some data. The actual rows read in the test will be synthetic
+        // but this creates an sstable on disk to be snapshotted.
+        createTable("CREATE TABLE %s (pk text, ck1 int, ck2 int, v int, PRIMARY KEY (pk, ck1, ck2))");
+        for (int i = 0; i < 10; i++)
+            execute("insert into %s (pk, ck1, ck2, v) values (?, ?, ?, ?)", "key", i, i, i);
+        getCurrentColumnFamilyStore().forceBlockingFlush();
+
+        metadata = getCurrentColumnFamilyStore().metadata;
+        cfs = getCurrentColumnFamilyStore();
+        sentMessages.clear();
+    }
+
+    @Test
+    public void noDuplicates()
+    {
+        // no duplicates
+        iterate(iter(metadata,
+                     false,
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 2)));
+        assertCommandIssued(sentMessages, false);
+    }
+
+    @Test
+    public void singleDuplicateForward()
+    {
+
+        iterate(iter(metadata,
+                     false,
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 1)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    @Test
+    public void singleDuplicateReverse()
+    {
+        iterate(iter(metadata,
+                     true,
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 1)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    @Test
+    public void multipleContiguousForward()
+    {
+        iterate(iter(metadata,
+                     false,
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 1)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    @Test
+    public void multipleContiguousReverse()
+    {
+        iterate(iter(metadata,
+                     true,
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 1)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    @Test
+    public void multipleDisjointForward()
+    {
+        iterate(iter(metadata,
+                     false,
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 2),
+                     makeRow(metadata, 0, 2)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    @Test
+    public void multipleDisjointReverse()
+    {
+        iterate(iter(metadata,
+                     true,
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 0),
+                     makeRow(metadata, 0, 1),
+                     makeRow(metadata, 0, 2),
+                     makeRow(metadata, 0, 2)));
+        assertCommandIssued(sentMessages, true);
+    }
+
+    public static void assertCommandIssued(HashMap<InetAddress, MessageOut> sent, boolean isExpected)
+    {
+        assertEquals(isExpected, !sent.isEmpty());
+        if (isExpected)
+        {
+            assertEquals(1, sent.size());
+            assertTrue(sent.containsKey(FBUtilities.getBroadcastAddress()));
+            SnapshotCommand command = (SnapshotCommand) sent.get(FBUtilities.getBroadcastAddress()).payload;
+            assertTrue(command.snapshot_name.startsWith(DiagnosticSnapshotService.DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX));
+        }
+    }
+
+    private void iterate(UnfilteredPartitionIterator iter)
+    {
+        try (PartitionIterator partitions = applyChecker(iter))
+        {
+            while (partitions.hasNext())
+            {
+                try (RowIterator partition = partitions.next())
+                {
+                    partition.forEachRemaining(u -> {});
+                }
+            }
+        }
+    }
+
+    @SuppressWarnings("unchecked")
+    private static <T> ByteBuffer decompose(AbstractType<?> type, T value)
+    {
+        return ((AbstractType<T>) type).decompose(value);
+    }
+
+    public static Row makeRow(CFMetaData metadata, Object... clusteringValues)
+    {
+        ByteBuffer[] clusteringByteBuffers = new ByteBuffer[clusteringValues.length];
+        for (int i = 0; i < clusteringValues.length; i++)
+            clusteringByteBuffers[i] = decompose(metadata.clusteringColumns().get(i).type, clusteringValues[i]);
+
+        return BTreeRow.noCellLiveRow(new Clustering(clusteringByteBuffers), LivenessInfo.create(metadata, 0, 0));
+    }
+
+    private static PartitionIterator applyChecker(UnfilteredPartitionIterator unfiltered)
+    {
+        int nowInSecs = 0;
+        return DuplicateRowChecker.duringRead(FilteredPartitions.filter(unfiltered, nowInSecs),
+                                              Collections.singletonList(FBUtilities.getBroadcastAddress()));
+    }
+
+    public static UnfilteredPartitionIterator iter(CFMetaData metadata, boolean isReversedOrder, Unfiltered... unfiltereds)
+    {
+        DecoratedKey key = metadata.partitioner.decorateKey(bytes("key"));
+        Iterator<Unfiltered> iterator = Iterators.forArray(unfiltereds);
+
+        UnfilteredRowIterator rowIter = new AbstractUnfilteredRowIterator(metadata,
+                                                                          key,
+                                                                          DeletionTime.LIVE,
+                                                                          metadata.partitionColumns(),
+                                                                          Rows.EMPTY_STATIC_ROW,
+                                                                          isReversedOrder,
+                                                                          EncodingStats.NO_STATS)
+        {
+            protected Unfiltered computeNext()
+            {
+                return iterator.hasNext() ? iterator.next() : endOfData();
+            }
+        };
+
+        return new SingletonUnfilteredPartitionIterator(rowIter, false);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/transform/RTTransformationsTest.java b/test/unit/org/apache/cassandra/db/transform/RTTransformationsTest.java
new file mode 100644
index 0000000..f79b9f3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/transform/RTTransformationsTest.java

@@ -0,0 +1,482 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.transform;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.ClusteringPrefix.Kind;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.SingletonUnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.RTBoundValidator.Stage;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.db.transform.RTBoundCloser.close;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import static org.apache.cassandra.db.transform.RTBoundValidator.validate;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public final class RTTransformationsTest
+{
+    private static final String KEYSPACE = "RTBoundCloserTest";
+    private static final String TABLE = "table";
+
+    private final int nowInSec = FBUtilities.nowInSeconds();
+
+    private CFMetaData metadata;
+    private DecoratedKey key;
+
+    @Before
+    public void setUp()
+    {
+        metadata =
+            CFMetaData.Builder
+                      .create(KEYSPACE, TABLE)
+                      .addPartitionKey("pk", UTF8Type.instance)
+                      .addClusteringColumn("ck0", UTF8Type.instance)
+                      .addClusteringColumn("ck1", UTF8Type.instance)
+                      .addClusteringColumn("ck2", UTF8Type.instance)
+                      .build();
+        key = Murmur3Partitioner.instance.decorateKey(bytes("key"));
+    }
+
+    @Test
+    public void testAddsNothingWhenAlreadyClosed()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        );
+
+        UnfilteredPartitionIterator extended = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        );
+        assertIteratorsEqual(original, close(extended));
+    }
+
+    @Test
+    public void testAddsNothingWhenAlreadyClosedInReverseOrder()
+    {
+        UnfilteredPartitionIterator original = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        );
+
+        UnfilteredPartitionIterator extended = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        );
+        assertIteratorsEqual(original, close(extended));
+    }
+
+    @Test
+    public void testClosesUnclosedBound()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        );
+        UnfilteredPartitionIterator extended = close(original);
+
+        UnfilteredPartitionIterator expected = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1", "")
+        );
+        assertIteratorsEqual(expected, extended);
+    }
+
+    @Test
+    public void testClosesUnclosedBoundary()
+    {
+        UnfilteredPartitionIterator original = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 1, "a", "0")
+        , row(2, "a", "1", "")
+        );
+        UnfilteredPartitionIterator extended = close(original);
+
+        UnfilteredPartitionIterator expected = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 1, "a", "0")
+        , row(2, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 1, "a", "1", "")
+        );
+        assertIteratorsEqual(expected, extended);
+    }
+
+    @Test
+    public void testClosesUnclosedBoundInReverseOrder()
+    {
+        UnfilteredPartitionIterator original = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        );
+        UnfilteredPartitionIterator extended = close(original);
+
+        UnfilteredPartitionIterator expected = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1", "")
+        );
+        assertIteratorsEqual(expected, extended);
+    }
+
+    @Test
+    public void testClosesUnclosedBoundaryInReverseOrder()
+    {
+        UnfilteredPartitionIterator original = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a")
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 1, 0, "a", "1")
+        , row(2, "a", "0", "")
+        );
+        UnfilteredPartitionIterator extended = close(original);
+
+        UnfilteredPartitionIterator expected = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a")
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 1, 0, "a", "1")
+        , row(2, "a", "0", "")
+        , bound(Kind.INCL_START_BOUND, 1, "a", "0", "")
+        );
+
+        assertIteratorsEqual(expected, extended);
+    }
+
+    @Test
+    public void testFailsWithoutSeeingRows()
+    {
+        UnfilteredPartitionIterator iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a")
+        );
+        assertThrowsISEIterated(close(iterator));
+    }
+
+    @Test
+    public void testValidatesLegalBounds()
+    {
+        UnfilteredPartitionIterator iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+
+        , bound(Kind.INCL_START_BOUND, 0, "a", "2")
+        , row(1, "a", "2", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "2")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        drain(iterator);
+    }
+
+    @Test
+    public void testValidatesLegalBoundsInReverseOrder()
+    {
+        UnfilteredPartitionIterator iterator = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "2")
+        , row(1, "a", "2", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "2")
+
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        drain(iterator);
+    }
+
+    @Test
+    public void testValidatesLegalBoundaries()
+    {
+        UnfilteredPartitionIterator iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a")
+
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 1, "a", "1")
+        , row(2, "a", "1", "")
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 1, 0, "a", "1")
+
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 2, "a", "2")
+        , row(3, "a", "2", "")
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 2, 0, "a", "2")
+
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 3, "a", "3")
+        , row(4, "a", "3", "")
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 3, 0, "a", "3")
+
+        , bound(Kind.INCL_END_BOUND, 0, "a")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        drain(iterator);
+    }
+
+    @Test
+    public void testValidatesLegalBoundariesInReverseOrder()
+    {
+        UnfilteredPartitionIterator iterator = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a")
+
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 3, 0, "a", "3")
+        , row(4, "a", "3", "")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 3, "a", "3")
+
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 2, 0, "a", "2")
+        , row(3, "a", "2", "")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 2, "a", "2")
+
+        , boundary(Kind.INCL_END_EXCL_START_BOUNDARY, 1, 0, "a", "1")
+        , row(2, "a", "1", "")
+        , boundary(Kind.EXCL_END_INCL_START_BOUNDARY, 0, 1, "a", "1")
+
+        , bound(Kind.INCL_START_BOUND, 0, "a")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        drain(iterator);
+    }
+
+    @Test
+    public void testComplainsAboutMismatchedTimestamps()
+    {
+        UnfilteredPartitionIterator iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 1, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+    }
+
+    @Test
+    public void testComplainsAboutMismatchedTimestampsInReverseOrder()
+    {
+        UnfilteredPartitionIterator iterator = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 1, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+    }
+
+    @Test
+    public void testComplainsAboutInvalidSequence()
+    {
+        // duplicated start bound
+        UnfilteredPartitionIterator iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+
+        // duplicated end bound
+        iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+
+        // absent open bound
+        iterator = iter(false
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+
+        // absent end bound
+        iterator = iter(false
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+    }
+
+    @Test
+    public void testComplainsAboutInvalidSequenceInReveseOrder()
+    {
+        // duplicated start bound
+        UnfilteredPartitionIterator iterator = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+
+        // duplicated end bound
+        iterator = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+
+        // absent open bound
+        iterator = iter(true
+        , bound(Kind.INCL_END_BOUND, 0, "a", "1")
+        , row(1, "a", "1", "")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+
+        // absent end bound
+        iterator = iter(true
+        , row(1, "a", "1", "")
+        , bound(Kind.INCL_START_BOUND, 0, "a", "1")
+        );
+        iterator = validate(iterator, Stage.PROCESSED, true);
+        assertThrowsISEIterated(iterator);
+    }
+
+    private RangeTombstoneBoundMarker bound(ClusteringPrefix.Kind kind, long timestamp, Object... clusteringValues)
+    {
+        ByteBuffer[] clusteringByteBuffers = new ByteBuffer[clusteringValues.length];
+        for (int i = 0; i < clusteringValues.length; i++)
+            clusteringByteBuffers[i] = decompose(metadata.clusteringColumns().get(i).type, clusteringValues[i]);
+
+        return new RangeTombstoneBoundMarker(new RangeTombstone.Bound(kind, clusteringByteBuffers), new DeletionTime(timestamp, nowInSec));
+    }
+
+    private RangeTombstoneBoundaryMarker boundary(ClusteringPrefix.Kind kind, long closeTimestamp, long openTimestamp, Object... clusteringValues)
+    {
+        ByteBuffer[] clusteringByteBuffers = new ByteBuffer[clusteringValues.length];
+        for (int i = 0; i < clusteringValues.length; i++)
+            clusteringByteBuffers[i] = decompose(metadata.clusteringColumns().get(i).type, clusteringValues[i]);
+
+        return new RangeTombstoneBoundaryMarker(new RangeTombstone.Bound(kind, clusteringByteBuffers),
+                                                new DeletionTime(closeTimestamp, nowInSec),
+                                                new DeletionTime(openTimestamp, nowInSec));
+    }
+
+    private Row row(long timestamp, Object... clusteringValues)
+    {
+        ByteBuffer[] clusteringByteBuffers = new ByteBuffer[clusteringValues.length];
+        for (int i = 0; i < clusteringValues.length; i++)
+            clusteringByteBuffers[i] = decompose(metadata.clusteringColumns().get(i).type, clusteringValues[i]);
+
+        return BTreeRow.noCellLiveRow(new Clustering(clusteringByteBuffers), LivenessInfo.create(metadata, timestamp, nowInSec));
+    }
+
+    @SuppressWarnings("unchecked")
+    private static <T> ByteBuffer decompose(AbstractType<?> type, T value)
+    {
+        return ((AbstractType<T>) type).decompose(value);
+    }
+
+    private UnfilteredPartitionIterator iter(boolean isReversedOrder, Unfiltered... unfiltereds)
+    {
+        Iterator<Unfiltered> iterator = Iterators.forArray(unfiltereds);
+
+        UnfilteredRowIterator rowIter =
+            new AbstractUnfilteredRowIterator(metadata,
+                                              key,
+                                              DeletionTime.LIVE,
+                                              metadata.partitionColumns(),
+                                              Rows.EMPTY_STATIC_ROW,
+                                              isReversedOrder,
+                                              EncodingStats.NO_STATS)
+        {
+            protected Unfiltered computeNext()
+            {
+                return iterator.hasNext() ? iterator.next() : endOfData();
+            }
+        };
+
+        return new SingletonUnfilteredPartitionIterator(rowIter, false);
+    }
+
+    private void assertIteratorsEqual(UnfilteredPartitionIterator iter1, UnfilteredPartitionIterator iter2)
+    {
+        while (iter1.hasNext())
+        {
+            assertTrue(iter2.hasNext());
+
+            try (UnfilteredRowIterator partition1 = iter1.next())
+            {
+                try (UnfilteredRowIterator partition2 = iter2.next())
+                {
+                    assertIteratorsEqual(partition1, partition2);
+                }
+            }
+        }
+        assertFalse(iter2.hasNext());
+    }
+
+    private void assertIteratorsEqual(UnfilteredRowIterator iter1, UnfilteredRowIterator iter2)
+    {
+        while (iter1.hasNext())
+        {
+            assertTrue(iter2.hasNext());
+
+            assertEquals(iter1.next(), iter2.next());
+        }
+        assertFalse(iter2.hasNext());
+    }
+
+    private void assertThrowsISEIterated(UnfilteredPartitionIterator iterator)
+    {
+        Throwable t = null;
+        try
+        {
+            drain(iterator);
+        }
+        catch (Throwable e)
+        {
+            t = e;
+        }
+        assertTrue(t instanceof IllegalStateException);
+    }
+
+    private void drain(UnfilteredPartitionIterator iter)
+    {
+        while (iter.hasNext())
+        {
+            try (UnfilteredRowIterator partition = iter.next())
+            {
+                while (partition.hasNext())
+                    partition.next();
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java b/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java
new file mode 100644
index 0000000..c238f36
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java

@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.view;
+
+import java.net.InetAddress;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.locator.IEndpointSnitch;
+import org.apache.cassandra.locator.NetworkTopologyStrategy;
+import org.apache.cassandra.locator.PropertyFileSnitch;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.ReplicationParams;
+import org.apache.cassandra.service.StorageService;
+
+public class ViewUtilsTest
+{
+    @BeforeClass
+    public static void setUp() throws ConfigurationException
+    {
+        IEndpointSnitch snitch = new PropertyFileSnitch();
+        DatabaseDescriptor.setEndpointSnitch(snitch);
+        Keyspace.setInitialized();
+    }
+
+    @Test
+    public void testGetIndexNaturalEndpoint() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+
+        // DC1
+        metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
+        metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.2"));
+
+        // DC2
+        metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
+        metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
+
+        Map<String, String> replicationMap = new HashMap<>();
+        replicationMap.put(ReplicationParams.CLASS, NetworkTopologyStrategy.class.getName());
+
+        replicationMap.put("DC1", "1");
+        replicationMap.put("DC2", "1");
+
+        Keyspace.clear("Keyspace1");
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, replicationMap));
+        Schema.instance.setKeyspaceMetadata(meta);
+
+        Optional<InetAddress> naturalEndpoint = ViewUtils.getViewNaturalEndpoint("Keyspace1",
+                                                                       new StringToken("CA"),
+                                                                       new StringToken("BB"));
+
+        Assert.assertTrue(naturalEndpoint.isPresent());
+        Assert.assertEquals(InetAddress.getByName("127.0.0.2"), naturalEndpoint.get());
+    }
+
+
+    @Test
+    public void testLocalHostPreference() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+
+        // DC1
+        metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
+        metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.2"));
+
+        // DC2
+        metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
+        metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
+
+        Map<String, String> replicationMap = new HashMap<>();
+        replicationMap.put(ReplicationParams.CLASS, NetworkTopologyStrategy.class.getName());
+
+        replicationMap.put("DC1", "2");
+        replicationMap.put("DC2", "2");
+
+        Keyspace.clear("Keyspace1");
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, replicationMap));
+        Schema.instance.setKeyspaceMetadata(meta);
+
+        Optional<InetAddress> naturalEndpoint = ViewUtils.getViewNaturalEndpoint("Keyspace1",
+                                                                       new StringToken("CA"),
+                                                                       new StringToken("BB"));
+
+        Assert.assertTrue(naturalEndpoint.isPresent());
+        Assert.assertEquals(InetAddress.getByName("127.0.0.1"), naturalEndpoint.get());
+    }
+
+    @Test
+    public void testBaseTokenDoesNotBelongToLocalReplicaShouldReturnEmpty() throws Exception
+    {
+        TokenMetadata metadata = StorageService.instance.getTokenMetadata();
+        metadata.clearUnsafe();
+
+        // DC1
+        metadata.updateNormalToken(new StringToken("A"), InetAddress.getByName("127.0.0.1"));
+        metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.2"));
+
+        // DC2
+        metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.4"));
+        metadata.updateNormalToken(new StringToken("D"), InetAddress.getByName("127.0.0.5"));
+
+        Map<String, String> replicationMap = new HashMap<>();
+        replicationMap.put(ReplicationParams.CLASS, NetworkTopologyStrategy.class.getName());
+
+        replicationMap.put("DC1", "1");
+        replicationMap.put("DC2", "1");
+
+        Keyspace.clear("Keyspace1");
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, replicationMap));
+        Schema.instance.setKeyspaceMetadata(meta);
+
+        Optional<InetAddress> naturalEndpoint = ViewUtils.getViewNaturalEndpoint("Keyspace1",
+                                                                       new StringToken("AB"),
+                                                                       new StringToken("BB"));
+
+        Assert.assertFalse(naturalEndpoint.isPresent());
+    }
+}

diff --git a/test/unit/org/apache/cassandra/dht/BootStrapperTest.java b/test/unit/org/apache/cassandra/dht/BootStrapperTest.java
index ababd99..3fbe106 100644
--- a/test/unit/org/apache/cassandra/dht/BootStrapperTest.java
+++ b/test/unit/org/apache/cassandra/dht/BootStrapperTest.java

@@ -1,30 +1,38 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.dht;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.Collection;
 import java.util.HashSet;
-import java.util.Set;
+import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
+import com.google.common.collect.Lists;
+
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+
+import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -34,30 +42,42 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.dht.tokenallocator.TokenAllocation;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.IFailureDetectionEventListener;
 import org.apache.cassandra.gms.IFailureDetector;
+import org.apache.cassandra.locator.IEndpointSnitch;
+import org.apache.cassandra.locator.RackInferringSnitch;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
-
-import static org.junit.Assert.*;
+import org.apache.cassandra.utils.FBUtilities;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class BootStrapperTest
 {
+    static IPartitioner oldPartitioner;
+
     @BeforeClass
     public static void setup() throws ConfigurationException
     {
+        oldPartitioner = StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance);
         SchemaLoader.startGossiper();
         SchemaLoader.prepareServer();
         SchemaLoader.schemaDefinition("BootStrapperTest");
     }
 
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(oldPartitioner);
+    }
+
     @Test
     public void testSourceTargetComputation() throws UnknownHostException
     {
         final int[] clusterSizes = new int[] { 1, 3, 5, 10, 100};
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             int replicationFactor = Keyspace.open(keyspaceName).getReplicationStrategy().getReplicationFactor();
             for (int clusterSize : clusterSizes)
@@ -69,12 +89,12 @@
     private RangeStreamer testSourceTargetComputation(String keyspaceName, int numOldNodes, int replicationFactor) throws UnknownHostException
     {
         StorageService ss = StorageService.instance;
+        TokenMetadata tmd = ss.getTokenMetadata();
 
         generateFakeEndpoints(numOldNodes);
-        Token myToken = StorageService.getPartitioner().getRandomToken();
+        Token myToken = tmd.partitioner.getRandomToken();
         InetAddress myEndpoint = InetAddress.getByName("127.0.0.1");
 
-        TokenMetadata tmd = ss.getTokenMetadata();
         assertEquals(numOldNodes, tmd.sortedTokens().size());
         RangeStreamer s = new RangeStreamer(tmd, null, myEndpoint, "Bootstrap", true, DatabaseDescriptor.getEndpointSnitch(), new StreamStateStore());
         IFailureDetector mockFailureDetector = new IFailureDetector()
@@ -112,14 +132,138 @@
 
     private void generateFakeEndpoints(int numOldNodes) throws UnknownHostException
     {
-        TokenMetadata tmd = StorageService.instance.getTokenMetadata();
+        generateFakeEndpoints(StorageService.instance.getTokenMetadata(), numOldNodes, 1);
+    }
+
+    private void generateFakeEndpoints(TokenMetadata tmd, int numOldNodes, int numVNodes) throws UnknownHostException
+    {
         tmd.clearUnsafe();
-        IPartitioner p = StorageService.getPartitioner();
+        generateFakeEndpoints(tmd, numOldNodes, numVNodes, "0", "0");
+    }
+
+    private void generateFakeEndpoints(TokenMetadata tmd, int numOldNodes, int numVNodes, String dc, String rack) throws UnknownHostException
+    {
+        IPartitioner p = tmd.partitioner;
 
         for (int i = 1; i <= numOldNodes; i++)
         {
             // leave .1 for myEndpoint
-            tmd.updateNormalToken(p.getRandomToken(), InetAddress.getByName("127.0.0." + (i + 1)));
+            InetAddress addr = InetAddress.getByName("127." + dc + "." + rack + "." + (i + 1));
+            List<Token> tokens = Lists.newArrayListWithCapacity(numVNodes);
+            for (int j = 0; j < numVNodes; ++j)
+                tokens.add(p.getRandomToken());
+            
+            tmd.updateNormalTokens(tokens, addr);
         }
     }
+    
+    @Test
+    public void testAllocateTokens() throws UnknownHostException
+    {
+        int vn = 16;
+        String ks = "BootStrapperTestKeyspace3";
+        TokenMetadata tm = new TokenMetadata();
+        generateFakeEndpoints(tm, 10, vn);
+        InetAddress addr = FBUtilities.getBroadcastAddress();
+        allocateTokensForNode(vn, ks, tm, addr);
+    }
+
+    public void testAllocateTokensNetworkStrategy(int rackCount, int replicas) throws UnknownHostException
+    {
+        IEndpointSnitch oldSnitch = DatabaseDescriptor.getEndpointSnitch();
+        try
+        {
+            DatabaseDescriptor.setEndpointSnitch(new RackInferringSnitch());
+            int vn = 16;
+            String ks = "BootStrapperTestNTSKeyspace" + rackCount + replicas;
+            String dc = "1";
+            SchemaLoader.createKeyspace(ks, KeyspaceParams.nts(dc, replicas, "15", 15), SchemaLoader.standardCFMD(ks, "Standard1"));
+            TokenMetadata tm = StorageService.instance.getTokenMetadata();
+            tm.clearUnsafe();
+            for (int i = 0; i < rackCount; ++i)
+                generateFakeEndpoints(tm, 10, vn, dc, Integer.toString(i));
+            InetAddress addr = InetAddress.getByName("127." + dc + ".0.99");
+            allocateTokensForNode(vn, ks, tm, addr);
+            // Note: Not matching replication factor in second datacentre, but this should not affect us.
+        } finally {
+            DatabaseDescriptor.setEndpointSnitch(oldSnitch);
+        }
+    }
+
+    @Test
+    public void testAllocateTokensNetworkStrategyOneRack() throws UnknownHostException
+    {
+        testAllocateTokensNetworkStrategy(1, 3);
+    }
+
+    @Test(expected = ConfigurationException.class)
+    public void testAllocateTokensNetworkStrategyTwoRacks() throws UnknownHostException
+    {
+        testAllocateTokensNetworkStrategy(2, 3);
+    }
+
+    @Test
+    public void testAllocateTokensNetworkStrategyThreeRacks() throws UnknownHostException
+    {
+        testAllocateTokensNetworkStrategy(3, 3);
+    }
+
+    @Test
+    public void testAllocateTokensNetworkStrategyFiveRacks() throws UnknownHostException
+    {
+        testAllocateTokensNetworkStrategy(5, 3);
+    }
+
+    @Test
+    public void testAllocateTokensNetworkStrategyOneRackOneReplica() throws UnknownHostException
+    {
+        testAllocateTokensNetworkStrategy(1, 1);
+    }
+
+    private void allocateTokensForNode(int vn, String ks, TokenMetadata tm, InetAddress addr)
+    {
+        SummaryStatistics os = TokenAllocation.replicatedOwnershipStats(tm.cloneOnlyTokenMap(), Keyspace.open(ks).getReplicationStrategy(), addr);
+        Collection<Token> tokens = BootStrapper.allocateTokens(tm, addr, ks, vn);
+        assertEquals(vn, tokens.size());
+        tm.updateNormalTokens(tokens, addr);
+        SummaryStatistics ns = TokenAllocation.replicatedOwnershipStats(tm.cloneOnlyTokenMap(), Keyspace.open(ks).getReplicationStrategy(), addr);
+        verifyImprovement(os, ns);
+    }
+
+    private void verifyImprovement(SummaryStatistics os, SummaryStatistics ns)
+    {
+        if (ns.getStandardDeviation() > os.getStandardDeviation())
+        {
+            fail(String.format("Token allocation unexpectedly increased standard deviation.\nStats before:\n%s\nStats after:\n%s", os, ns));
+        }
+    }
+
+    
+    @Test
+    public void testAllocateTokensMultipleKeyspaces() throws UnknownHostException
+    {
+        // TODO: This scenario isn't supported very well. Investigate a multi-keyspace version of the algorithm.
+        int vn = 16;
+        String ks3 = "BootStrapperTestKeyspace4"; // RF = 3
+        String ks2 = "BootStrapperTestKeyspace5"; // RF = 2
+
+        TokenMetadata tm = new TokenMetadata();
+        generateFakeEndpoints(tm, 10, vn);
+        
+        InetAddress dcaddr = FBUtilities.getBroadcastAddress();
+        SummaryStatistics os3 = TokenAllocation.replicatedOwnershipStats(tm, Keyspace.open(ks3).getReplicationStrategy(), dcaddr);
+        SummaryStatistics os2 = TokenAllocation.replicatedOwnershipStats(tm, Keyspace.open(ks2).getReplicationStrategy(), dcaddr);
+        String cks = ks3;
+        String nks = ks2;
+        for (int i=11; i<=20; ++i)
+        {
+            allocateTokensForNode(vn, cks, tm, InetAddress.getByName("127.0.0." + (i + 1)));
+            String t = cks; cks = nks; nks = t;
+        }
+        
+        SummaryStatistics ns3 = TokenAllocation.replicatedOwnershipStats(tm, Keyspace.open(ks3).getReplicationStrategy(), dcaddr);
+        SummaryStatistics ns2 = TokenAllocation.replicatedOwnershipStats(tm, Keyspace.open(ks2).getReplicationStrategy(), dcaddr);
+        verifyImprovement(os3, ns3);
+        verifyImprovement(os2, ns2);
+    }
 }

diff --git a/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java b/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java
index e70e086..c4896a3 100644
--- a/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java
+++ b/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java

@@ -1,21 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.dht;
 
 public class ByteOrderedPartitionerTest extends PartitionerTestCase

diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
index e8a5ee2..ade6ec1 100644
--- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
+++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -7,38 +7,36 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.apache.cassandra.dht;
 
 import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.List;
 
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
+
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.config.*;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.*;
-
-import static org.apache.cassandra.Util.dk;
-
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
 
 /**
  * Test cases where multiple keys collides, ie have the same token.
@@ -56,19 +54,17 @@
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
-        oldPartitioner = DatabaseDescriptor.getPartitioner();
-        DatabaseDescriptor.setPartitioner(LengthPartitioner.instance);
+        oldPartitioner = StorageService.instance.setPartitionerUnsafe(LengthPartitioner.instance);
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF));
     }
 
     @AfterClass
     public static void tearDown()
     {
-        DatabaseDescriptor.setPartitioner(oldPartitioner);
+        DatabaseDescriptor.setPartitionerUnsafe(oldPartitioner);
     }
 
     @Test
@@ -82,12 +78,12 @@
         insert("key1", "key2", "key3"); // token = 4
         insert("longKey1", "longKey2"); // token = 8
 
-        List<Row> rows = cfs.getRangeSlice(new Bounds<RowPosition>(dk("k2"), dk("key2")), null, new IdentityQueryFilter(), 10000);
-        assert rows.size() == 4 : "Expecting 4 keys, got " + rows.size();
-        assert rows.get(0).key.getKey().equals(ByteBufferUtil.bytes("k2"));
-        assert rows.get(1).key.getKey().equals(ByteBufferUtil.bytes("kq"));
-        assert rows.get(2).key.getKey().equals(ByteBufferUtil.bytes("key1"));
-        assert rows.get(3).key.getKey().equals(ByteBufferUtil.bytes("key2"));
+        List<FilteredPartition> partitions = Util.getAll(Util.cmd(cfs).fromKeyIncl("k2").toKeyIncl("key2").build());
+
+        assert partitions.get(0).partitionKey().getKey().equals(ByteBufferUtil.bytes("k2"));
+        assert partitions.get(1).partitionKey().getKey().equals(ByteBufferUtil.bytes("kq"));
+        assert partitions.get(2).partitionKey().getKey().equals(ByteBufferUtil.bytes("key1"));
+        assert partitions.get(3).partitionKey().getKey().equals(ByteBufferUtil.bytes("key2"));
     }
 
     private void insert(String... keys)
@@ -98,10 +94,8 @@
 
     private void insert(String key)
     {
-        Mutation rm;
-        rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes(key));
-        rm.add(CF, Util.cellname("column"), ByteBufferUtil.bytes("asdf"), 0);
-        rm.applyUnsafe();
+        RowUpdateBuilder builder = new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE1, CF), FBUtilities.timestampMicros(), key);
+        builder.clustering("c").add("val", "asdf").build().applyUnsafe();
     }
 
     static class BigIntegerToken extends ComparableObjectToken<BigInteger>
@@ -130,122 +124,4 @@
             return 0;
         }
     }
-
-    public static class LengthPartitioner implements IPartitioner
-    {
-        public static final BigInteger ZERO = new BigInteger("0");
-        public static final BigIntegerToken MINIMUM = new BigIntegerToken("-1");
-
-        public static LengthPartitioner instance = new LengthPartitioner();
-
-        public DecoratedKey decorateKey(ByteBuffer key)
-        {
-            return new BufferDecoratedKey(getToken(key), key);
-        }
-
-        public BigIntegerToken midpoint(Token ltoken, Token rtoken)
-        {
-            // the symbolic MINIMUM token should act as ZERO: the empty bit array
-            BigInteger left = ltoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)ltoken).token;
-            BigInteger right = rtoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)rtoken).token;
-            Pair<BigInteger,Boolean> midpair = FBUtilities.midpoint(left, right, 127);
-            // discard the remainder
-            return new BigIntegerToken(midpair.left);
-        }
-
-        public BigIntegerToken getMinimumToken()
-        {
-            return MINIMUM;
-        }
-
-        public BigIntegerToken getRandomToken()
-        {
-            return new BigIntegerToken(BigInteger.valueOf(new Random().nextInt(15)));
-        }
-
-        private final Token.TokenFactory tokenFactory = new Token.TokenFactory() {
-            public ByteBuffer toByteArray(Token token)
-            {
-                BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
-                return ByteBuffer.wrap(bigIntegerToken.token.toByteArray());
-            }
-
-            public Token fromByteArray(ByteBuffer bytes)
-            {
-                return new BigIntegerToken(new BigInteger(ByteBufferUtil.getArray(bytes)));
-            }
-
-            public String toString(Token token)
-            {
-                BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
-                return bigIntegerToken.token.toString();
-            }
-
-            public Token fromString(String string)
-            {
-                return new BigIntegerToken(new BigInteger(string));
-            }
-
-            public void validate(String token) {}
-        };
-
-        public Token.TokenFactory getTokenFactory()
-        {
-            return tokenFactory;
-        }
-
-        public boolean preservesOrder()
-        {
-            return false;
-        }
-
-        public BigIntegerToken getToken(ByteBuffer key)
-        {
-            if (key.remaining() == 0)
-                return MINIMUM;
-            return new BigIntegerToken(BigInteger.valueOf(key.remaining()));
-        }
-
-        public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
-        {
-            // allTokens will contain the count and be returned, sorted_ranges is shorthand for token<->token math.
-            Map<Token, Float> allTokens = new HashMap<Token, Float>();
-            List<Range<Token>> sortedRanges = new ArrayList<Range<Token>>();
-
-            // this initializes the counts to 0 and calcs the ranges in order.
-            Token lastToken = sortedTokens.get(sortedTokens.size() - 1);
-            for (Token node : sortedTokens)
-            {
-                allTokens.put(node, new Float(0.0));
-                sortedRanges.add(new Range<Token>(lastToken, node));
-                lastToken = node;
-            }
-
-            for (String ks : Schema.instance.getKeyspaces())
-            {
-                for (CFMetaData cfmd : Schema.instance.getKSMetaData(ks).cfMetaData().values())
-                {
-                    for (Range<Token> r : sortedRanges)
-                    {
-                        // Looping over every KS:CF:Range, get the splits size and add it to the count
-                        allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, 1).size());
-                    }
-                }
-            }
-
-            // Sum every count up and divide count/total for the fractional ownership.
-            Float total = new Float(0.0);
-            for (Float f : allTokens.values())
-                total += f;
-            for (Map.Entry<Token, Float> row : allTokens.entrySet())
-                allTokens.put(row.getKey(), row.getValue() / total);
-
-            return allTokens;
-        }
-
-        public AbstractType<?> getTokenValidator()
-        {
-            return IntegerType.instance;
-        }
-    }
 }

diff --git a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
new file mode 100644
index 0000000..9cefbf2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java

@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.dht;
+
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
+import org.apache.cassandra.dht.KeyCollisionTest.BigIntegerToken;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+public class LengthPartitioner implements IPartitioner
+{
+    public static final BigInteger ZERO = new BigInteger("0");
+    public static final BigIntegerToken MINIMUM = new BigIntegerToken("-1");
+
+    public static LengthPartitioner instance = new LengthPartitioner();
+
+    public DecoratedKey decorateKey(ByteBuffer key)
+    {
+        return new BufferDecoratedKey(getToken(key), key);
+    }
+
+    public BigIntegerToken midpoint(Token ltoken, Token rtoken)
+    {
+        // the symbolic MINIMUM token should act as ZERO: the empty bit array
+        BigInteger left = ltoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)ltoken).token;
+        BigInteger right = rtoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)rtoken).token;
+        Pair<BigInteger,Boolean> midpair = FBUtilities.midpoint(left, right, 127);
+        // discard the remainder
+        return new BigIntegerToken(midpair.left);
+    }
+
+    public BigIntegerToken getMinimumToken()
+    {
+        return MINIMUM;
+    }
+
+    public BigIntegerToken getRandomToken()
+    {
+        return new BigIntegerToken(BigInteger.valueOf(new Random().nextInt(15)));
+    }
+
+    private final Token.TokenFactory tokenFactory = new Token.TokenFactory() {
+        public ByteBuffer toByteArray(Token token)
+        {
+            BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
+            return ByteBuffer.wrap(bigIntegerToken.token.toByteArray());
+        }
+
+        public Token fromByteArray(ByteBuffer bytes)
+        {
+            return new BigIntegerToken(new BigInteger(ByteBufferUtil.getArray(bytes)));
+        }
+
+        public String toString(Token token)
+        {
+            BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
+            return bigIntegerToken.token.toString();
+        }
+
+        public Token fromString(String string)
+        {
+            return new BigIntegerToken(new BigInteger(string));
+        }
+
+        public void validate(String token) {}
+    };
+
+    public Token.TokenFactory getTokenFactory()
+    {
+        return tokenFactory;
+    }
+
+    public boolean preservesOrder()
+    {
+        return false;
+    }
+
+    public BigIntegerToken getToken(ByteBuffer key)
+    {
+        if (key.remaining() == 0)
+            return MINIMUM;
+        return new BigIntegerToken(BigInteger.valueOf(key.remaining()));
+    }
+
+    public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
+    {
+        // allTokens will contain the count and be returned, sorted_ranges is shorthand for token<->token math.
+        Map<Token, Float> allTokens = new HashMap<Token, Float>();
+        List<Range<Token>> sortedRanges = new ArrayList<Range<Token>>();
+
+        // this initializes the counts to 0 and calcs the ranges in order.
+        Token lastToken = sortedTokens.get(sortedTokens.size() - 1);
+        for (Token node : sortedTokens)
+        {
+            allTokens.put(node, new Float(0.0));
+            sortedRanges.add(new Range<Token>(lastToken, node));
+            lastToken = node;
+        }
+
+        for (String ks : Schema.instance.getKeyspaces())
+        {
+            for (CFMetaData cfmd : Schema.instance.getTablesAndViews(ks))
+            {
+                for (Range<Token> r : sortedRanges)
+                {
+                    // Looping over every KS:CF:Range, get the splits size and add it to the count
+                    allTokens.put(r.right, allTokens.get(r.right) + StorageService.instance.getSplits(ks, cfmd.cfName, r, 1).size());
+                }
+            }
+        }
+
+        // Sum every count up and divide count/total for the fractional ownership.
+        Float total = new Float(0.0);
+        for (Float f : allTokens.values())
+            total += f;
+        for (Map.Entry<Token, Float> row : allTokens.entrySet())
+            allTokens.put(row.getKey(), row.getValue() / total);
+
+        return allTokens;
+    }
+
+    public AbstractType<?> getTokenValidator()
+    {
+        return IntegerType.instance;
+    }
+
+    public AbstractType<?> partitionOrdering()
+    {
+        return new PartitionerDefinedOrder(this);
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/dht/Murmur3PartitionerTest.java b/test/unit/org/apache/cassandra/dht/Murmur3PartitionerTest.java
index 9f330d3..19aba40 100644
--- a/test/unit/org/apache/cassandra/dht/Murmur3PartitionerTest.java
+++ b/test/unit/org/apache/cassandra/dht/Murmur3PartitionerTest.java

@@ -1,5 +1,4 @@
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,15 +7,13 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.apache.cassandra.dht;
 

diff --git a/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java b/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java
index 0449258..57f33e7 100644
--- a/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java
+++ b/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java

@@ -1,21 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.dht;
 
 import java.io.IOException;

diff --git a/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java b/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java
index 8080a0c..cb892a7 100644
--- a/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java
+++ b/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java

@@ -1,30 +1,34 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.dht;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
 
-import org.apache.cassandra.service.StorageService;
 import org.junit.Before;
 import org.junit.Test;
 
+import org.apache.cassandra.service.StorageService;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
 
@@ -125,7 +129,7 @@
     {
         // This call initializes StorageService, needed to populate the keyspaces.
         // TODO: This points to potential problems in the initialization sequence. Should be solved by CASSANDRA-7837.
-        StorageService.getPartitioner();
+        StorageService.instance.getKeyspaces();
 
         try
         {

diff --git a/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java b/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java
index 6b22617..5e68644 100644
--- a/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java
+++ b/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.dht;
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -9,17 +7,17 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
+package org.apache.cassandra.dht;
+
 public class RandomPartitionerTest extends PartitionerTestCase
 {
     public void initPartitioner()

diff --git a/test/unit/org/apache/cassandra/dht/RangeTest.java b/test/unit/org/apache/cassandra/dht/RangeTest.java
index 9fb49cf..627253d 100644
--- a/test/unit/org/apache/cassandra/dht/RangeTest.java
+++ b/test/unit/org/apache/cassandra/dht/RangeTest.java

@@ -1,21 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.dht;
 
 import java.nio.ByteBuffer;
@@ -28,16 +27,15 @@
 import java.util.Set;
 
 import com.google.common.base.Joiner;
-
-import static java.util.Arrays.asList;
-
-import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.junit.Test;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
-import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken;
 
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken;
+import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+
+import static java.util.Arrays.asList;
 import static org.apache.cassandra.Util.range;
 import static org.junit.Assert.*;
 
@@ -191,7 +189,7 @@
     }
 
     @SafeVarargs
-    static <T extends RingPosition<T>> void assertIntersection(Range<T> one, Range<T> two, Range<T> ... ranges)
+    static <T extends RingPosition<T>> void assertIntersection(Range<T> one, Range<T> two, Range<T>... ranges)
     {
         Set<Range<T>> correct = Range.rangeSet(ranges);
         Set<Range<T>> result1 = one.intersectionWith(two);
@@ -333,7 +331,7 @@
 
     private Range<Token> makeRange(long token1, long token2)
     {
-        return new Range<Token>(new Murmur3Partitioner.LongToken(token1), new Murmur3Partitioner.LongToken(token2));
+        return new Range<>(new Murmur3Partitioner.LongToken(token1), new Murmur3Partitioner.LongToken(token2));
     }
 
     private void assertRanges(Set<Range<Token>> result, Long ... tokens)
@@ -364,6 +362,8 @@
         assertRanges(range.subtractAll(collection), 10L, 54L, 60L, 90L);
         collection.add(makeRange(80L, 95L));
         assertRanges(range.subtractAll(collection), 10L, 54L, 60L, 80L);
+
+        assertEquals(Collections.emptySet(), range.subtractAll(Collections.singleton(range)));
     }
 
     @Test
@@ -382,6 +382,44 @@
         assertRanges(range.subtractAll(collection), 100L, 200L, 500L, 0L);
         collection.add(makeRange(1000L, 0));
         assertRanges(range.subtractAll(collection), 100L, 200L, 500L, 1000L);
+
+        assertEquals(Collections.emptySet(), range.subtractAll(Collections.singleton(range)));
+    }
+
+    @Test
+    public void testSubtractAllFromFullRingRange()
+    {
+        Range<Token> ring1 = makeRange(50L, 50L);
+        Range<Token> ring2 = makeRange(0L, 0L);
+
+        Set<Range<Token>> contained1 = Collections.singleton(makeRange(10L, 100L));
+        Set<Range<Token>> contained2 = Collections.singleton(makeRange(100L, 10L));
+
+        assertEquals(contained2, ring1.subtractAll(contained1));
+        assertEquals(contained2, ring2.subtractAll(contained1));
+        assertEquals(contained1, ring1.subtractAll(contained2));
+        assertEquals(contained1, ring2.subtractAll(contained2));
+        assertEquals(Collections.emptySet(), ring1.subtractAll(Collections.singleton(ring1)));
+        assertEquals(Collections.emptySet(), ring2.subtractAll(Collections.singleton(ring2)));
+        assertEquals(Collections.emptySet(), ring1.subtractAll(Collections.singleton(ring2)));
+    }
+
+    @Test
+    public void testSubtractFromFullRingRange()
+    {
+        Range<Token> ring1 = makeRange(50L, 50L);
+        Range<Token> ring2 = makeRange(0L, 0L);
+
+        Range<Token> contained1 = makeRange(10L, 100L);
+        Range<Token> contained2 = makeRange(100L, 10L);
+
+        assertEquals(Collections.singleton(contained2), ring1.subtract(contained1));
+        assertEquals(Collections.singleton(contained2), ring2.subtract(contained1));
+        assertEquals(Collections.singleton(contained1), ring1.subtract(contained2));
+        assertEquals(Collections.singleton(contained1), ring2.subtract(contained2));
+        assertEquals(Collections.emptySet(), ring1.subtract(ring1));
+        assertEquals(Collections.emptySet(), ring2.subtract(ring2));
+        assertEquals(Collections.emptySet(), ring1.subtract(ring2));
     }
     
     private Range<Token> makeRange(String token1, String token2)
@@ -540,7 +578,7 @@
     @Test
     public void testNormalizeNoop()
     {
-        List<Range<RowPosition>> l;
+        List<Range<PartitionPosition>> l;
 
         l = asList(range("1", "3"), range("4", "5"));
         assertNormalize(l, l);
@@ -549,7 +587,7 @@
     @Test
     public void testNormalizeSimpleOverlap()
     {
-        List<Range<RowPosition>> input, expected;
+        List<Range<PartitionPosition>> input, expected;
 
         input = asList(range("1", "4"), range("3", "5"));
         expected = asList(range("1", "5"));
@@ -563,7 +601,7 @@
     @Test
     public void testNormalizeSort()
     {
-        List<Range<RowPosition>> input, expected;
+        List<Range<PartitionPosition>> input, expected;
 
         input = asList(range("4", "5"), range("1", "3"));
         expected = asList(range("1", "3"), range("4", "5"));
@@ -573,7 +611,7 @@
     @Test
     public void testNormalizeUnwrap()
     {
-        List<Range<RowPosition>> input, expected;
+        List<Range<PartitionPosition>> input, expected;
 
         input = asList(range("9", "2"));
         expected = asList(range("", "2"), range("9", ""));
@@ -583,7 +621,7 @@
     @Test
     public void testNormalizeComplex()
     {
-        List<Range<RowPosition>> input, expected;
+        List<Range<PartitionPosition>> input, expected;
 
         input = asList(range("8", "2"), range("7", "9"), range("4", "5"));
         expected = asList(range("", "2"), range("4", "5"), range("7", ""));

diff --git a/test/unit/org/apache/cassandra/gms/ExpireEndpointTest.java b/test/unit/org/apache/cassandra/gms/ExpireEndpointTest.java
new file mode 100644
index 0000000..e7a1a64
--- /dev/null
+++ b/test/unit/org/apache/cassandra/gms/ExpireEndpointTest.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.gms;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.UUID;
+
+import org.junit.Test;
+
+import org.apache.cassandra.service.StorageService;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class ExpireEndpointTest
+{
+    @Test
+    public void testExpireEndpoint() throws UnknownHostException
+    {
+        InetAddress hostAddress = InetAddress.getByName("127.0.0.2");
+        UUID hostId = UUID.randomUUID();
+        long expireTime = System.currentTimeMillis() - 1;
+
+        Gossiper.instance.initializeNodeUnsafe(hostAddress, hostId, 1);
+
+        EndpointState endpointState = Gossiper.instance.getEndpointStateForEndpoint(hostAddress);
+        Gossiper.runInGossipStageBlocking(() -> Gossiper.instance.markDead(hostAddress, endpointState));
+        endpointState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.removedNonlocal(hostId, expireTime));
+        Gossiper.instance.addExpireTimeForEndpoint(hostAddress, expireTime);
+
+        assertTrue("Expiring endpoint not unreachable before status check", Gossiper.instance.getUnreachableMembers().contains(hostAddress));
+
+        Gossiper.instance.doStatusCheck();
+
+        assertFalse("Expired endpoint still part of live members", Gossiper.instance.getLiveMembers().contains(hostAddress));
+        assertFalse("Expired endpoint still part of unreachable members", Gossiper.instance.getUnreachableMembers().contains(hostAddress));
+        assertNull("Expired endpoint still contain endpoint state", Gossiper.instance.getEndpointStateForEndpoint(hostAddress));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/gms/FailureDetectorTest.java b/test/unit/org/apache/cassandra/gms/FailureDetectorTest.java
index af099b0..83c3500 100644
--- a/test/unit/org/apache/cassandra/gms/FailureDetectorTest.java
+++ b/test/unit/org/apache/cassandra/gms/FailureDetectorTest.java

@@ -45,8 +45,8 @@
     {
         // slow unit tests can cause problems with FailureDetector's GC pause handling
         System.setProperty("cassandra.max_local_pause_in_ms", "20000");
-
         DatabaseDescriptor.setDaemonInitialized();
+        DatabaseDescriptor.createAllDirectories();
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/gms/GossipDigestTest.java b/test/unit/org/apache/cassandra/gms/GossipDigestTest.java
index 2928b12..3191b03 100644
--- a/test/unit/org/apache/cassandra/gms/GossipDigestTest.java
+++ b/test/unit/org/apache/cassandra/gms/GossipDigestTest.java

@@ -20,11 +20,12 @@
 
 import static org.junit.Assert.*;
 
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
 import java.io.IOException;
 
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
+
 import java.net.InetAddress;
 
 import org.apache.cassandra.net.MessagingService;
@@ -48,8 +49,8 @@
         DataOutputBuffer output = new DataOutputBuffer();
         GossipDigest.serializer.serialize(expected, output, MessagingService.current_version);
 
-        ByteArrayInputStream input = new ByteArrayInputStream(output.getData(), 0, output.getLength());
-        GossipDigest actual = GossipDigest.serializer.deserialize(new DataInputStream(input), MessagingService.current_version);
+        DataInputPlus input = new DataInputBuffer(output.getData());
+        GossipDigest actual = GossipDigest.serializer.deserialize(input, MessagingService.current_version);
         assertEquals(0, expected.compareTo(actual));
     }
 }

diff --git a/test/unit/org/apache/cassandra/gms/GossiperTest.java b/test/unit/org/apache/cassandra/gms/GossiperTest.java
index eb01305..42e4483 100644
--- a/test/unit/org/apache/cassandra/gms/GossiperTest.java
+++ b/test/unit/org/apache/cassandra/gms/GossiperTest.java

@@ -21,23 +21,19 @@
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
-import org.junit.After;
-import org.junit.AfterClass;
 import org.junit.Before;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.service.StorageService;
 
@@ -47,6 +43,7 @@
 {
     static
     {
+        System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.setDaemonInitialized();
     }
     static final IPartitioner partitioner = new RandomPartitioner();
@@ -67,32 +64,118 @@
     public void testLargeGenerationJump() throws UnknownHostException, InterruptedException
     {
         Util.createInitialRing(ss, partitioner, endpointTokens, keyTokens, hosts, hostIds, 2);
-        InetAddress remoteHostAddress = hosts.get(1);
+        try
+        {
+            InetAddress remoteHostAddress = hosts.get(1);
 
-        EndpointState initialRemoteState = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress);
-        HeartBeatState initialRemoteHeartBeat = initialRemoteState.getHeartBeatState();
+            EndpointState initialRemoteState = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress);
+            HeartBeatState initialRemoteHeartBeat = initialRemoteState.getHeartBeatState();
 
-        //Util.createInitialRing should have initialized remoteHost's HeartBeatState's generation to 1
-        assertEquals(initialRemoteHeartBeat.getGeneration(), 1);
+            //Util.createInitialRing should have initialized remoteHost's HeartBeatState's generation to 1
+            assertEquals(initialRemoteHeartBeat.getGeneration(), 1);
 
-        HeartBeatState proposedRemoteHeartBeat = new HeartBeatState(initialRemoteHeartBeat.getGeneration() + Gossiper.MAX_GENERATION_DIFFERENCE + 1);
-        EndpointState proposedRemoteState = new EndpointState(proposedRemoteHeartBeat);
+            HeartBeatState proposedRemoteHeartBeat = new HeartBeatState(initialRemoteHeartBeat.getGeneration() + Gossiper.MAX_GENERATION_DIFFERENCE + 1);
+            EndpointState proposedRemoteState = new EndpointState(proposedRemoteHeartBeat);
 
-        Gossiper.instance.applyStateLocally(ImmutableMap.of(remoteHostAddress, proposedRemoteState));
+            Gossiper.instance.applyStateLocally(ImmutableMap.of(remoteHostAddress, proposedRemoteState));
 
-        //The generation should have been updated because it isn't over Gossiper.MAX_GENERATION_DIFFERENCE in the future
-        HeartBeatState actualRemoteHeartBeat = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress).getHeartBeatState();
-        assertEquals(proposedRemoteHeartBeat.getGeneration(), actualRemoteHeartBeat.getGeneration());
+            //The generation should have been updated because it isn't over Gossiper.MAX_GENERATION_DIFFERENCE in the future
+            HeartBeatState actualRemoteHeartBeat = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress).getHeartBeatState();
+            assertEquals(proposedRemoteHeartBeat.getGeneration(), actualRemoteHeartBeat.getGeneration());
 
-        //Propose a generation 10 years in the future - this should be rejected.
-        HeartBeatState badProposedRemoteHeartBeat = new HeartBeatState((int) (System.currentTimeMillis()/1000) + Gossiper.MAX_GENERATION_DIFFERENCE * 10);
-        EndpointState badProposedRemoteState = new EndpointState(badProposedRemoteHeartBeat);
+            //Propose a generation 10 years in the future - this should be rejected.
+            HeartBeatState badProposedRemoteHeartBeat = new HeartBeatState((int) (System.currentTimeMillis() / 1000) + Gossiper.MAX_GENERATION_DIFFERENCE * 10);
+            EndpointState badProposedRemoteState = new EndpointState(badProposedRemoteHeartBeat);
 
-        Gossiper.instance.applyStateLocally(ImmutableMap.of(remoteHostAddress, badProposedRemoteState));
+            Gossiper.instance.applyStateLocally(ImmutableMap.of(remoteHostAddress, badProposedRemoteState));
 
-        actualRemoteHeartBeat = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress).getHeartBeatState();
+            actualRemoteHeartBeat = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress).getHeartBeatState();
 
-        //The generation should not have been updated because it is over Gossiper.MAX_GENERATION_DIFFERENCE in the future
-        assertEquals(proposedRemoteHeartBeat.getGeneration(), actualRemoteHeartBeat.getGeneration());
+            //The generation should not have been updated because it is over Gossiper.MAX_GENERATION_DIFFERENCE in the future
+            assertEquals(proposedRemoteHeartBeat.getGeneration(), actualRemoteHeartBeat.getGeneration());
+        }
+        finally
+        {
+            // clean up the gossip states
+            Gossiper.instance.endpointStateMap.clear();
+        }
+    }
+
+    int stateChangedNum = 0;
+
+    @Test
+    public void testDuplicatedStateUpdate() throws Exception
+    {
+        VersionedValue.VersionedValueFactory valueFactory =
+            new VersionedValue.VersionedValueFactory(DatabaseDescriptor.getPartitioner());
+
+        Util.createInitialRing(ss, partitioner, endpointTokens, keyTokens, hosts, hostIds, 2);
+        try
+        {
+            InetAddress remoteHostAddress = hosts.get(1);
+
+            EndpointState initialRemoteState = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress);
+            HeartBeatState initialRemoteHeartBeat = initialRemoteState.getHeartBeatState();
+
+            //Util.createInitialRing should have initialized remoteHost's HeartBeatState's generation to 1
+            assertEquals(initialRemoteHeartBeat.getGeneration(), 1);
+
+            HeartBeatState proposedRemoteHeartBeat = new HeartBeatState(initialRemoteHeartBeat.getGeneration());
+            EndpointState proposedRemoteState = new EndpointState(proposedRemoteHeartBeat);
+
+            final Token token = DatabaseDescriptor.getPartitioner().getRandomToken();
+            VersionedValue tokensValue = valueFactory.tokens(Collections.singletonList(token));
+            proposedRemoteState.addApplicationState(ApplicationState.TOKENS, tokensValue);
+
+            Gossiper.instance.register(
+            new IEndpointStateChangeSubscriber()
+            {
+                public void onJoin(InetAddress endpoint, EndpointState epState) { }
+
+                public void beforeChange(InetAddress endpoint, EndpointState currentState, ApplicationState newStateKey, VersionedValue newValue) { }
+
+                public void onChange(InetAddress endpoint, ApplicationState state, VersionedValue value)
+                {
+                    assertEquals(ApplicationState.TOKENS, state);
+                    stateChangedNum++;
+                }
+
+                public void onAlive(InetAddress endpoint, EndpointState state) { }
+
+                public void onDead(InetAddress endpoint, EndpointState state) { }
+
+                public void onRemove(InetAddress endpoint) { }
+
+                public void onRestart(InetAddress endpoint, EndpointState state) { }
+            }
+            );
+
+            stateChangedNum = 0;
+            Gossiper.instance.applyStateLocally(ImmutableMap.of(remoteHostAddress, proposedRemoteState));
+            assertEquals(1, stateChangedNum);
+
+            HeartBeatState actualRemoteHeartBeat = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress).getHeartBeatState();
+            assertEquals(proposedRemoteHeartBeat.getGeneration(), actualRemoteHeartBeat.getGeneration());
+
+            // Clone a new HeartBeatState
+            proposedRemoteHeartBeat = new HeartBeatState(initialRemoteHeartBeat.getGeneration(), proposedRemoteHeartBeat.getHeartBeatVersion());
+            proposedRemoteState = new EndpointState(proposedRemoteHeartBeat);
+
+            // Bump the heartbeat version and use the same TOKENS state
+            proposedRemoteHeartBeat.updateHeartBeat();
+            proposedRemoteState.addApplicationState(ApplicationState.TOKENS, tokensValue);
+
+            // The following state change should only update heartbeat without updating the TOKENS state
+            Gossiper.instance.applyStateLocally(ImmutableMap.of(remoteHostAddress, proposedRemoteState));
+            assertEquals(1, stateChangedNum);
+
+            actualRemoteHeartBeat = Gossiper.instance.getEndpointStateForEndpoint(remoteHostAddress).getHeartBeatState();
+            assertEquals(proposedRemoteHeartBeat.getGeneration(), actualRemoteHeartBeat.getGeneration());
+        }
+        finally
+        {
+            // clean up the gossip states
+            Gossiper.instance.endpointStateMap.clear();
+        }
     }
 }

diff --git a/test/unit/org/apache/cassandra/gms/PendingRangeCalculatorServiceTest.java b/test/unit/org/apache/cassandra/gms/PendingRangeCalculatorServiceTest.java
index 507948c..7892de4 100644
--- a/test/unit/org/apache/cassandra/gms/PendingRangeCalculatorServiceTest.java
+++ b/test/unit/org/apache/cassandra/gms/PendingRangeCalculatorServiceTest.java

@@ -55,6 +55,7 @@
     @BeforeClass
     public static void setUp() throws ConfigurationException
     {
+        System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         SchemaLoader.prepareServer();
         StorageService.instance.initServer();
     }
@@ -67,8 +68,8 @@
             action = "org.apache.cassandra.gms.PendingRangeCalculatorServiceTest.calculationLock.lock()")
     public void testDelayedResponse() throws UnknownHostException, InterruptedException
     {
-        final InetAddress otherNodeAddr = InetAddress.getByName("127.0.0.2");
-        final UUID otherHostId = UUID.randomUUID();
+        InetAddress otherNodeAddr = InetAddress.getByName("127.0.0.2");
+        UUID otherHostId = UUID.randomUUID();
 
         // introduce node for first major state change
         Gossiper.instance.applyStateLocally(getStates(otherNodeAddr, otherHostId, 1, false));

diff --git a/test/unit/org/apache/cassandra/gms/SerializationsTest.java b/test/unit/org/apache/cassandra/gms/SerializationsTest.java
index 080ae53..e50b461 100644
--- a/test/unit/org/apache/cassandra/gms/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/gms/SerializationsTest.java

@@ -19,13 +19,15 @@
 package org.apache.cassandra.gms;
 
 import org.apache.cassandra.AbstractSerializationsTester;
+import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
+
 import org.junit.Test;
 
-import java.io.DataInputStream;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.util.ArrayList;
@@ -58,7 +60,7 @@
         if (EXECUTE_WRITES)
             testEndpointStateWrite();
 
-        DataInputStream in = getInput("gms.EndpointState.bin");
+        DataInputStreamPlus in = getInput("gms.EndpointState.bin");
         assert HeartBeatState.serializer.deserialize(in, getVersion()) != null;
         assert EndpointState.serializer.deserialize(in, getVersion()) != null;
         assert VersionedValue.serializer.deserialize(in, getVersion()) != null;
@@ -73,7 +75,9 @@
         states.put(InetAddress.getByName("127.0.0.2"), Statics.EndpointSt);
         GossipDigestAck ack = new GossipDigestAck(Statics.Digests, states);
         GossipDigestAck2 ack2 = new GossipDigestAck2(states);
-        GossipDigestSyn syn = new GossipDigestSyn("Not a real cluster name", StorageService.getPartitioner().getClass().getCanonicalName(), Statics.Digests);
+        GossipDigestSyn syn = new GossipDigestSyn("Not a real cluster name",
+                                                  StorageService.instance.getTokenMetadata().partitioner.getClass().getCanonicalName(),
+                                                  Statics.Digests);
 
         DataOutputStreamPlus out = getOutput("gms.Gossip.bin");
         for (GossipDigest gd : Statics.Digests)
@@ -98,7 +102,7 @@
             testGossipDigestWrite();
 
         int count = 0;
-        DataInputStream in = getInput("gms.Gossip.bin");
+        DataInputStreamPlus in = getInput("gms.Gossip.bin");
         while (count < Statics.Digests.size())
             assert GossipDigestAck2.serializer.deserialize(in, getVersion()) != null;
         assert GossipDigestAck.serializer.deserialize(in, getVersion()) != null;
@@ -111,9 +115,10 @@
     {
         private static HeartBeatState HeartbeatSt = new HeartBeatState(101, 201);
         private static EndpointState EndpointSt = new EndpointState(HeartbeatSt);
-        private static VersionedValue.VersionedValueFactory vvFact = new VersionedValue.VersionedValueFactory(StorageService.getPartitioner());
+        private static IPartitioner partitioner = StorageService.instance.getTokenMetadata().partitioner;
+        private static VersionedValue.VersionedValueFactory vvFact = new VersionedValue.VersionedValueFactory(partitioner);
         private static VersionedValue vv0 = vvFact.load(23d);
-        private static VersionedValue vv1 = vvFact.bootstrapping(Collections.<Token>singleton(StorageService.getPartitioner().getRandomToken()));
+        private static VersionedValue vv1 = vvFact.bootstrapping(Collections.<Token>singleton(partitioner.getRandomToken()));
         private static List<GossipDigest> Digests = new ArrayList<GossipDigest>();
 
         {

diff --git a/test/unit/org/apache/cassandra/hints/ChecksummedDataInputTest.java b/test/unit/org/apache/cassandra/hints/ChecksummedDataInputTest.java
new file mode 100644
index 0000000..323a12d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/ChecksummedDataInputTest.java

@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.zip.CRC32;
+
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+
+public class ChecksummedDataInputTest
+{
+    @Test
+    public void testReadMethods() throws IOException
+    {
+        // Make sure this array is bigger than the reader buffer size
+        // so we test updating the crc across buffer boundaries
+        byte[] b = new byte[RandomAccessReader.DEFAULT_BUFFER_SIZE * 2];
+        for (int i = 0; i < b.length; i++)
+            b[i] = (byte)i;
+
+        ByteBuffer buffer;
+
+        // fill a bytebuffer with some input
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            out.write(127);
+            out.write(b);
+            out.writeBoolean(false);
+            out.writeByte(10);
+            out.writeChar('t');
+            out.writeDouble(3.3);
+            out.writeFloat(2.2f);
+            out.writeInt(42);
+            out.writeLong(Long.MAX_VALUE);
+            out.writeShort(Short.MIN_VALUE);
+            out.writeUTF("utf");
+            out.writeVInt(67L);
+            out.writeUnsignedVInt(88L);
+            out.writeBytes("abcdefghi");
+
+            buffer = out.buffer();
+        }
+
+        // calculate expected CRC
+        CRC32 crc = new CRC32();
+        FBUtilities.updateChecksum(crc, buffer);
+
+        // save the buffer to file to create a RAR
+        File file = File.createTempFile("testReadMethods", "1");
+        file.deleteOnExit();
+        try (SequentialWriter writer = SequentialWriter.open(file))
+        {
+            writer.write(buffer);
+            writer.writeInt((int) crc.getValue());
+            writer.finish();
+        }
+
+        assertTrue(file.exists());
+        assertEquals(buffer.remaining() + 4, file.length());
+
+        try (ChecksummedDataInput reader = ChecksummedDataInput.open(file))
+        {
+            reader.limit(buffer.remaining() + 4);
+
+            // assert that we read all the right values back
+            assertEquals(127, reader.read());
+            byte[] bytes = new byte[b.length];
+            reader.readFully(bytes);
+            assertTrue(Arrays.equals(bytes, b));
+            assertEquals(false, reader.readBoolean());
+            assertEquals(10, reader.readByte());
+            assertEquals('t', reader.readChar());
+            assertEquals(3.3, reader.readDouble());
+            assertEquals(2.2f, reader.readFloat());
+            assertEquals(42, reader.readInt());
+            assertEquals(Long.MAX_VALUE, reader.readLong());
+            assertEquals(Short.MIN_VALUE, reader.readShort());
+            assertEquals("utf", reader.readUTF());
+            assertEquals(67L, reader.readVInt());
+            assertEquals(88L, reader.readUnsignedVInt());
+            assertEquals("abcdefghi", new String(ByteBufferUtil.read(reader, 9).array(), StandardCharsets.UTF_8));
+
+            // assert that the crc matches, and that we've read exactly as many bytes as expected
+            assertTrue(reader.checkCrc());
+            assertEquals(0, reader.bytesRemaining());
+
+            reader.checkLimit(0);
+        }
+    }
+
+    @Test
+    public void testResetCrc() throws IOException
+    {
+        CRC32 crc = new CRC32();
+        ByteBuffer buffer;
+
+        // fill a bytebuffer with some input
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            out.write(127);
+            out.writeBoolean(false);
+            out.writeByte(10);
+            out.writeChar('t');
+
+            buffer = out.buffer();
+            FBUtilities.updateChecksum(crc, buffer);
+            out.writeInt((int) crc.getValue());
+
+            int bufferPos = out.getLength();
+            out.writeDouble(3.3);
+            out.writeFloat(2.2f);
+            out.writeInt(42);
+
+            buffer = out.buffer();
+            buffer.position(bufferPos);
+            crc.reset();
+            FBUtilities.updateChecksum(crc, buffer);
+
+            out.writeInt((int) crc.getValue());
+            buffer = out.buffer();
+        }
+
+        // save the buffer to file to create a RAR
+        File file = File.createTempFile("testResetCrc", "1");
+        file.deleteOnExit();
+        try (SequentialWriter writer = SequentialWriter.open(file))
+        {
+            writer.write(buffer);
+            writer.finish();
+        }
+
+        assertTrue(file.exists());
+        assertEquals(buffer.remaining(), file.length());
+
+        try (ChecksummedDataInput reader = ChecksummedDataInput.open(file))
+        {
+            reader.limit(buffer.remaining());
+
+            // assert that we read all the right values back
+            assertEquals(127, reader.read());
+            assertEquals(false, reader.readBoolean());
+            assertEquals(10, reader.readByte());
+            assertEquals('t', reader.readChar());
+            assertTrue(reader.checkCrc());
+
+            reader.resetCrc();
+            assertEquals(3.3, reader.readDouble());
+            assertEquals(2.2f, reader.readFloat());
+            assertEquals(42, reader.readInt());
+            assertTrue(reader.checkCrc());
+            assertEquals(0, reader.bytesRemaining());
+        }
+    }
+
+    @Test
+    public void testFailedCrc() throws IOException
+    {
+        CRC32 crc = new CRC32();
+        ByteBuffer buffer;
+
+        // fill a bytebuffer with some input
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            out.write(127);
+            out.writeBoolean(false);
+            out.writeByte(10);
+            out.writeChar('t');
+
+            buffer = out.buffer();
+            FBUtilities.updateChecksum(crc, buffer);
+
+            // update twice so it won't match
+            FBUtilities.updateChecksum(crc, buffer);
+            out.writeInt((int) crc.getValue());
+
+            buffer = out.buffer();
+        }
+
+        // save the buffer to file to create a RAR
+        File file = File.createTempFile("testFailedCrc", "1");
+        file.deleteOnExit();
+        try (SequentialWriter writer = SequentialWriter.open(file))
+        {
+            writer.write(buffer);
+            writer.finish();
+        }
+
+        assertTrue(file.exists());
+        assertEquals(buffer.remaining(), file.length());
+
+        try (ChecksummedDataInput reader = ChecksummedDataInput.open(file))
+        {
+            reader.limit(buffer.remaining());
+
+            // assert that we read all the right values back
+            assertEquals(127, reader.read());
+            assertEquals(false, reader.readBoolean());
+            assertEquals(10, reader.readByte());
+            assertEquals('t', reader.readChar());
+            assertFalse(reader.checkCrc());
+            assertEquals(0, reader.bytesRemaining());
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintMessageTest.java b/test/unit/org/apache/cassandra/hints/HintMessageTest.java
new file mode 100644
index 0000000..7ffaa54
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintMessageTest.java

@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.util.UUID;
+
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.assertEquals;
+
+import static org.apache.cassandra.hints.HintsTestUtil.assertHintsEqual;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class HintMessageTest
+{
+    private static final String KEYSPACE = "hint_message_test";
+    private static final String TABLE = "table";
+
+    @Test
+    public void testSerializer() throws IOException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE, TABLE));
+
+        UUID hostId = UUID.randomUUID();
+        long now = FBUtilities.timestampMicros();
+
+        CFMetaData table = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+        Mutation mutation =
+            new RowUpdateBuilder(table, now, bytes("key"))
+                .clustering("column")
+                .add("val", "val" + 1234)
+                .build();
+        Hint hint = Hint.create(mutation, now / 1000);
+        HintMessage message = new HintMessage(hostId, hint);
+
+        // serialize
+        int serializedSize = (int) HintMessage.serializer.serializedSize(message, MessagingService.current_version);
+        DataOutputBuffer dob = new DataOutputBuffer();
+        HintMessage.serializer.serialize(message, dob, MessagingService.current_version);
+        assertEquals(serializedSize, dob.getLength());
+
+        // deserialize
+        DataInputPlus di = new DataInputBuffer(dob.buffer(), true);
+        HintMessage deserializedMessage = HintMessage.serializer.deserialize(di, MessagingService.current_version);
+
+        // compare before/after
+        assertEquals(hostId, deserializedMessage.hostId);
+        assertHintsEqual(message.hint, deserializedMessage.hint);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintTest.java b/test/unit/org/apache/cassandra/hints/HintTest.java
new file mode 100644
index 0000000..1d486e1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintTest.java

@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.util.Collections;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableList;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.dht.BootStrapper;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.*;
+
+import static org.apache.cassandra.Util.dk;
+import static org.apache.cassandra.hints.HintsTestUtil.assertHintsEqual;
+import static org.apache.cassandra.hints.HintsTestUtil.assertPartitionsEqual;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class HintTest
+{
+    private static final String KEYSPACE = "hint_test";
+    private static final String TABLE0 = "table_0";
+    private static final String TABLE1 = "table_1";
+    private static final String TABLE2 = "table_2";
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, TABLE0),
+                                    SchemaLoader.standardCFMD(KEYSPACE, TABLE1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, TABLE2));
+    }
+
+    @Before
+    public void resetGcGraceSeconds()
+    {
+        TokenMetadata tokenMeta = StorageService.instance.getTokenMetadata();
+        InetAddress local = FBUtilities.getBroadcastAddress();
+        tokenMeta.clearUnsafe();
+        tokenMeta.updateHostId(UUID.randomUUID(), local);
+        tokenMeta.updateNormalTokens(BootStrapper.getRandomTokens(tokenMeta, 1), local);
+
+        for (CFMetaData table : Schema.instance.getTablesAndViews(KEYSPACE))
+            table.gcGraceSeconds(TableParams.DEFAULT_GC_GRACE_SECONDS);
+    }
+
+    @Test
+    public void testSerializer() throws IOException
+    {
+        long now = FBUtilities.timestampMicros();
+        Mutation mutation = createMutation("testSerializer", now);
+        Hint hint = Hint.create(mutation, now / 1000);
+
+        // serialize
+        int serializedSize = (int) Hint.serializer.serializedSize(hint, MessagingService.current_version);
+        DataOutputBuffer dob = new DataOutputBuffer();
+        Hint.serializer.serialize(hint, dob, MessagingService.current_version);
+        assertEquals(serializedSize, dob.getLength());
+
+        // deserialize
+        DataInputPlus di = new DataInputBuffer(dob.buffer(), true);
+        Hint deserializedHint = Hint.serializer.deserialize(di, MessagingService.current_version);
+
+        // compare before/after
+        assertHintsEqual(hint, deserializedHint);
+    }
+
+    @Test
+    public void testApply()
+    {
+        long now = FBUtilities.timestampMicros();
+        String key = "testApply";
+        Mutation mutation = createMutation(key, now);
+        Hint hint = Hint.create(mutation, now / 1000);
+
+        // sanity check that there is no data inside yet
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        hint.apply();
+
+        // assert that we can read the inserted partitions
+        for (PartitionUpdate partition : mutation.getPartitionUpdates())
+            assertPartitionsEqual(partition, readPartition(key, partition.metadata().cfName));
+    }
+
+    @Test
+    public void testApplyWithTruncation()
+    {
+        long now = FBUtilities.timestampMicros();
+        String key = "testApplyWithTruncation";
+        Mutation mutation = createMutation(key, now);
+
+        // sanity check that there is no data inside yet
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        // truncate TABLE1
+        Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE1).truncateBlocking();
+
+        // create and apply a hint with creation time in the past (one second before the truncation)
+        Hint.create(mutation, now / 1000 - 1).apply();
+
+        // TABLE1 update should have been skipped and not applied, as expired
+        assertNoPartitions(key, TABLE1);
+
+        // TABLE0 and TABLE2 updates should have been applied successfully
+        assertPartitionsEqual(mutation.getPartitionUpdate(Schema.instance.getId(KEYSPACE, TABLE0)), readPartition(key, TABLE0));
+        assertPartitionsEqual(mutation.getPartitionUpdate(Schema.instance.getId(KEYSPACE, TABLE2)), readPartition(key, TABLE2));
+    }
+
+    @Test
+    public void testApplyWithRegularExpiration()
+    {
+        long now = FBUtilities.timestampMicros();
+        String key = "testApplyWithRegularExpiration";
+        Mutation mutation = createMutation(key, now);
+
+        // sanity check that there is no data inside yet
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        // lower the GC GS on TABLE0 to 0 BEFORE the hint is created
+        Schema.instance.getCFMetaData(KEYSPACE, TABLE0).gcGraceSeconds(0);
+
+        Hint.create(mutation, now / 1000).apply();
+
+        // all updates should have been skipped and not applied, as expired
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+    }
+
+    @Test
+    public void testApplyWithGCGSReducedLater()
+    {
+        long now = FBUtilities.timestampMicros();
+        String key = "testApplyWithGCGSReducedLater";
+        Mutation mutation = createMutation(key, now);
+        Hint hint = Hint.create(mutation, now / 1000);
+
+        // sanity check that there is no data inside yet
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        // lower the GC GS on TABLE0 AFTER the hint is already created
+        Schema.instance.getCFMetaData(KEYSPACE, TABLE0).gcGraceSeconds(0);
+
+        hint.apply();
+
+        // all updates should have been skipped and not applied, as expired
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+    }
+
+    @SuppressWarnings("unchecked")
+    @Test
+    public void testChangedTopology() throws Exception
+    {
+        // create a hint
+        long now = FBUtilities.timestampMicros();
+        String key = "testChangedTopology";
+        Mutation mutation = createMutation(key, now);
+        Hint hint = Hint.create(mutation, now / 1000);
+
+        // Prepare metadata with injected stale endpoint serving the mutation key.
+        TokenMetadata tokenMeta = StorageService.instance.getTokenMetadata();
+        InetAddress local = FBUtilities.getBroadcastAddress();
+        InetAddress endpoint = InetAddress.getByName("1.1.1.1");
+        UUID localId = StorageService.instance.getLocalHostUUID();
+        UUID targetId = UUID.randomUUID();
+        tokenMeta.updateHostId(targetId, endpoint);
+        tokenMeta.updateNormalTokens(ImmutableList.of(mutation.key().getToken()), endpoint);
+
+        // sanity check that there is no data inside yet
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        assert StorageProxy.instance.getHintsInProgress() == 0;
+        long totalHintCount = StorageProxy.instance.getTotalHints();
+        // Process hint message.
+        HintMessage message = new HintMessage(localId, hint);
+        MessagingService.instance().getVerbHandler(MessagingService.Verb.HINT).doVerb(
+                MessageIn.create(local, message, Collections.emptyMap(), MessagingService.Verb.HINT, MessagingService.current_version),
+                -1);
+
+        // hint should not be applied as we no longer are a replica
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        // Attempt to send to new endpoint should have been made. Node is not live hence it should now be a hint.
+        assertEquals(totalHintCount + 1, StorageProxy.instance.getTotalHints());
+    }
+
+    @SuppressWarnings("unchecked")
+    @Test
+    public void testChangedTopologyNotHintable() throws Exception
+    {
+        // create a hint
+        long now = FBUtilities.timestampMicros();
+        String key = "testChangedTopology";
+        Mutation mutation = createMutation(key, now);
+        Hint hint = Hint.create(mutation, now / 1000);
+
+        // Prepare metadata with injected stale endpoint.
+        TokenMetadata tokenMeta = StorageService.instance.getTokenMetadata();
+        InetAddress local = FBUtilities.getBroadcastAddress();
+        InetAddress endpoint = InetAddress.getByName("1.1.1.1");
+        UUID localId = StorageService.instance.getLocalHostUUID();
+        UUID targetId = UUID.randomUUID();
+        tokenMeta.updateHostId(targetId, endpoint);
+        tokenMeta.updateNormalTokens(ImmutableList.of(mutation.key().getToken()), endpoint);
+
+        // sanity check that there is no data inside yet
+        assertNoPartitions(key, TABLE0);
+        assertNoPartitions(key, TABLE1);
+        assertNoPartitions(key, TABLE2);
+
+        try
+        {
+            DatabaseDescriptor.setHintedHandoffEnabled(false);
+
+            assert StorageMetrics.totalHintsInProgress.getCount() == 0;
+            long totalHintCount = StorageMetrics.totalHints.getCount();
+            // Process hint message.
+            HintMessage message = new HintMessage(localId, hint);
+            MessagingService.instance().getVerbHandler(MessagingService.Verb.HINT).doVerb(
+                    MessageIn.create(local, message, Collections.emptyMap(), MessagingService.Verb.HINT, MessagingService.current_version),
+                    -1);
+
+            // hint should not be applied as we no longer are a replica
+            assertNoPartitions(key, TABLE0);
+            assertNoPartitions(key, TABLE1);
+            assertNoPartitions(key, TABLE2);
+
+            // Attempt to send to new endpoint should not have been made.
+            assertEquals(totalHintCount, StorageMetrics.totalHints.getCount());
+        }
+        finally
+        {
+            DatabaseDescriptor.setHintedHandoffEnabled(true);
+        }
+    }
+
+    private static Mutation createMutation(String key, long now)
+    {
+        Mutation mutation = new Mutation(KEYSPACE, dk(key));
+
+        new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE, TABLE0), now, mutation)
+            .clustering("column0")
+            .add("val", "value0")
+            .build();
+
+        new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE, TABLE1), now + 1, mutation)
+            .clustering("column1")
+            .add("val", "value1")
+            .build();
+
+        new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE, TABLE2), now + 2, mutation)
+            .clustering("column2")
+            .add("val", "value2")
+            .build();
+
+        return mutation;
+    }
+
+    private static SinglePartitionReadCommand cmd(String key, String table)
+    {
+        CFMetaData meta = Schema.instance.getCFMetaData(KEYSPACE, table);
+        return SinglePartitionReadCommand.fullPartitionRead(meta, FBUtilities.nowInSeconds(), bytes(key));
+    }
+
+    private static FilteredPartition readPartition(String key, String table)
+    {
+        return Util.getOnlyPartition(cmd(key, table));
+    }
+
+    private static void assertNoPartitions(String key, String table)
+    {
+        ReadCommand cmd = cmd(key, table);
+
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup();
+             PartitionIterator iterator = cmd.executeInternal(orderGroup))
+        {
+            assertFalse(iterator.hasNext());
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsBufferPoolTest.java b/test/unit/org/apache/cassandra/hints/HintsBufferPoolTest.java
new file mode 100644
index 0000000..7c8d0be
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsBufferPoolTest.java

@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import org.apache.cassandra.Util;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import com.google.common.collect.ImmutableList;
+
+import static junit.framework.Assert.*;
+
+import java.lang.Thread.State;
+import java.util.Queue;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+@RunWith(BMUnitRunner.class)
+public class HintsBufferPoolTest
+{
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        HintsBufferTest.defineSchema();
+    }
+
+    /*
+     * Check that the hints buffer pool will only drain a limited number of buffers
+     */
+    static volatile boolean blockedOnBackpressure = false;
+    @Test
+    @BMRule(name = "Greatest name in the world",
+            targetClass="HintsBufferPool",
+            targetMethod="switchCurrentBuffer",
+            targetLocation="AT INVOKE java.util.concurrent.BlockingQueue.take",
+            action="org.apache.cassandra.hints.HintsBufferPoolTest.blockedOnBackpressure = true;")
+    public void testBackpressure() throws Exception
+    {
+        Queue<HintsBuffer> returnedBuffers = new ConcurrentLinkedQueue<>();
+        HintsBufferPool pool = new HintsBufferPool(256, (buffer, p) -> returnedBuffers.offer(buffer));
+
+        Thread blocked = new Thread(() -> {
+            for (int ii = 0; ii < 512; ii++)
+                pool.write(ImmutableList.of(UUID.randomUUID()), HintsBufferTest.createHint(ii, ii));
+        });
+        blocked.start();
+
+        Util.spinAssertEquals(true, () -> blockedOnBackpressure, 60);
+
+        while (blocked.isAlive())
+            if (!returnedBuffers.isEmpty())
+                pool.offer(returnedBuffers.poll().recycle());
+
+        assertTrue(blockedOnBackpressure);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsBufferTest.java b/test/unit/org/apache/cassandra/hints/HintsBufferTest.java
new file mode 100644
index 0000000..78ea4f4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsBufferTest.java

@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.zip.CRC32;
+
+import com.google.common.collect.Iterables;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static junit.framework.Assert.*;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.apache.cassandra.utils.FBUtilities.updateChecksum;
+
+public class HintsBufferTest
+{
+    private static final String KEYSPACE = "hints_buffer_test";
+    private static final String TABLE = "table";
+
+    private static final int HINTS_COUNT = 300_000;
+    private static final int HINT_THREADS_COUNT = 10;
+    private static final int HOST_ID_COUNT = 10;
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE, TABLE));
+    }
+
+    @Test
+    @SuppressWarnings("resource")
+    public void testOverlyLargeAllocation()
+    {
+        // create a small, 128 bytes buffer
+        HintsBuffer buffer = HintsBuffer.create(128);
+
+        // try allocating an entry of 65 bytes (53 bytes hint + 12 bytes of overhead)
+        try
+        {
+            buffer.allocate(65 - HintsBuffer.ENTRY_OVERHEAD_SIZE);
+            fail("Allocation of the buffer should have failed but hasn't");
+        }
+        catch (IllegalArgumentException e)
+        {
+            assertEquals(String.format("Hint of %s bytes is too large - the maximum size is 64", 65 - HintsBuffer.ENTRY_OVERHEAD_SIZE),
+                         e.getMessage());
+        }
+
+        // assert that a 1-byte smaller allocation fits properly
+        try (HintsBuffer.Allocation allocation = buffer.allocate(64 - HintsBuffer.ENTRY_OVERHEAD_SIZE))
+        {
+            assertNotNull(allocation);
+        }
+    }
+
+    @Test
+    public void testWrite() throws IOException, InterruptedException
+    {
+        // generate 10 random host ids to choose from
+        UUID[] hostIds = new UUID[HOST_ID_COUNT];
+        for (int i = 0; i < hostIds.length; i++)
+            hostIds[i] = UUID.randomUUID();
+
+        // map each index to one random UUID from the previously created UUID array
+        Random random = new Random(System.currentTimeMillis());
+        UUID[] load = new UUID[HINTS_COUNT];
+        for (int i = 0; i < load.length; i++)
+            load[i] = hostIds[random.nextInt(HOST_ID_COUNT)];
+
+        // calculate the size of a single hint (they will all have an equal size in this test)
+        int hintSize = (int) Hint.serializer.serializedSize(createHint(0, System.currentTimeMillis()), MessagingService.current_version);
+        int entrySize = hintSize + HintsBuffer.ENTRY_OVERHEAD_SIZE;
+
+        // allocate a slab to fit *precisely* HINTS_COUNT hints
+        int slabSize = entrySize * HINTS_COUNT;
+        HintsBuffer buffer = HintsBuffer.create(slabSize);
+
+        // use a fixed timestamp base for all mutation timestamps
+        long baseTimestamp = System.currentTimeMillis();
+
+        // create HINT_THREADS_COUNT, start them, and wait for them to finish
+        List<Thread> threads = new ArrayList<>(HINT_THREADS_COUNT);
+        for (int i = 0; i < HINT_THREADS_COUNT; i ++)
+            threads.add(new Thread(new Writer(buffer, load, hintSize, i, baseTimestamp)));
+        threads.forEach(java.lang.Thread::start);
+        for (Thread thread : threads)
+            thread.join();
+
+        // sanity check that we are full
+        assertEquals(slabSize, buffer.capacity());
+        assertEquals(0, buffer.remaining());
+
+        // try to allocate more bytes, ensure that the allocation fails
+        assertNull(buffer.allocate(1));
+
+        // a failed allocation should automatically close the oporder
+        buffer.waitForModifications();
+
+        // a failed allocation should also automatically make the buffer as closed
+        assertTrue(buffer.isClosed());
+
+        // assert that host id set in the buffer equals to hostIds
+        assertEquals(HOST_ID_COUNT, buffer.hostIds().size());
+        assertEquals(new HashSet<>(Arrays.asList(hostIds)), buffer.hostIds());
+
+        // iterate over *every written hint*, validate its content
+        for (UUID hostId : hostIds)
+        {
+            Iterator<ByteBuffer> iter = buffer.consumingHintsIterator(hostId);
+            while (iter.hasNext())
+            {
+                int idx = validateEntry(hostId, iter.next(), baseTimestamp, load);
+                load[idx] = null; // nullify each visited entry
+            }
+        }
+
+        // assert that all the entries in load array have been visited and nullified
+        for (UUID hostId : load)
+            assertNull(hostId);
+
+        // free the buffer
+        buffer.free();
+    }
+
+    private static int validateEntry(UUID hostId, ByteBuffer buffer, long baseTimestamp, UUID[] load) throws IOException
+    {
+        CRC32 crc = new CRC32();
+        DataInputPlus di = new DataInputBuffer(buffer, true);
+
+        // read and validate size
+        int hintSize = di.readInt();
+        assertEquals(hintSize + HintsBuffer.ENTRY_OVERHEAD_SIZE, buffer.remaining());
+
+        // read and validate size crc
+        updateChecksum(crc, buffer, buffer.position(), 4);
+        assertEquals((int) crc.getValue(), di.readInt());
+
+        // read the hint and update/validate overall crc
+        Hint hint = Hint.serializer.deserialize(di, MessagingService.current_version);
+        updateChecksum(crc, buffer, buffer.position() + 8, hintSize);
+        assertEquals((int) crc.getValue(), di.readInt());
+
+        // further validate hint correctness
+        int idx = (int) (hint.creationTime - baseTimestamp);
+        assertEquals(hostId, load[idx]);
+
+        Row row = hint.mutation.getPartitionUpdates().iterator().next().iterator().next();
+        assertEquals(1, Iterables.size(row.cells()));
+
+        assertEquals(bytes(idx), row.clustering().get(0));
+        Cell cell = row.cells().iterator().next();
+        assertEquals(TimeUnit.MILLISECONDS.toMicros(baseTimestamp + idx), cell.timestamp());
+        assertEquals(bytes(idx), cell.value());
+
+        return idx;
+    }
+
+    static Hint createHint(int idx, long baseTimestamp)
+    {
+        long timestamp = baseTimestamp + idx;
+        return Hint.create(createMutation(idx, TimeUnit.MILLISECONDS.toMicros(timestamp)), timestamp);
+    }
+
+    private static Mutation createMutation(int index, long timestamp)
+    {
+        CFMetaData table = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+        return new RowUpdateBuilder(table, timestamp, bytes(index))
+                   .clustering(bytes(index))
+                   .add("val", bytes(index))
+                   .build();
+    }
+
+    static class Writer implements Runnable
+    {
+        final HintsBuffer buffer;
+        final UUID[] load;
+        final int hintSize;
+        final int index;
+        final long baseTimestamp;
+
+        Writer(HintsBuffer buffer, UUID[] load, int hintSize, int index, long baseTimestamp)
+        {
+            this.buffer = buffer;
+            this.load = load;
+            this.hintSize = hintSize;
+            this.index = index;
+            this.baseTimestamp = baseTimestamp;
+        }
+
+        public void run()
+        {
+            int hintsPerThread = HINTS_COUNT / HINT_THREADS_COUNT;
+            for (int i = index * hintsPerThread; i < (index + 1) * hintsPerThread; i++)
+            {
+                try (HintsBuffer.Allocation allocation = buffer.allocate(hintSize))
+                {
+                    Hint hint = createHint(i, baseTimestamp);
+                    allocation.write(Collections.singleton(load[i]), hint);
+                }
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsCatalogTest.java b/test/unit/org/apache/cassandra/hints/HintsCatalogTest.java
new file mode 100644
index 0000000..dcd31cf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsCatalogTest.java

@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.util.*;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import static junit.framework.Assert.*;
+import static org.apache.cassandra.Util.dk;
+
+public class HintsCatalogTest
+{
+    private static final String KEYSPACE = "hint_test";
+    private static final String TABLE0 = "table_0";
+    private static final String TABLE1 = "table_1";
+    private static final String TABLE2 = "table_2";
+    private static final int WRITE_BUFFER_SIZE = 256 << 10;
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                KeyspaceParams.simple(1),
+                SchemaLoader.standardCFMD(KEYSPACE, TABLE0),
+                SchemaLoader.standardCFMD(KEYSPACE, TABLE1),
+                SchemaLoader.standardCFMD(KEYSPACE, TABLE2));
+    }
+
+    @Test
+    public void loadCompletenessAndOrderTest() throws IOException
+    {
+        File directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            loadCompletenessAndOrderTest(directory);
+        }
+        finally
+        {
+            directory.deleteOnExit();
+        }
+    }
+
+    @Test
+    public void exciseHintFiles() throws IOException
+    {
+        File directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            exciseHintFiles(directory);
+        }
+        finally
+        {
+            directory.deleteOnExit();
+        }
+    }
+
+    private void loadCompletenessAndOrderTest(File directory) throws IOException
+    {
+        UUID hostId1 = UUID.randomUUID();
+        UUID hostId2 = UUID.randomUUID();
+
+        long timestamp1 = System.currentTimeMillis();
+        long timestamp2 = System.currentTimeMillis() + 1;
+        long timestamp3 = System.currentTimeMillis() + 2;
+        long timestamp4 = System.currentTimeMillis() + 3;
+
+        HintsDescriptor descriptor1 = new HintsDescriptor(hostId1, timestamp1);
+        HintsDescriptor descriptor2 = new HintsDescriptor(hostId2, timestamp3);
+        HintsDescriptor descriptor3 = new HintsDescriptor(hostId2, timestamp2);
+        HintsDescriptor descriptor4 = new HintsDescriptor(hostId1, timestamp4);
+
+        writeDescriptor(directory, descriptor1);
+        writeDescriptor(directory, descriptor2);
+        writeDescriptor(directory, descriptor3);
+        writeDescriptor(directory, descriptor4);
+
+        HintsCatalog catalog = HintsCatalog.load(directory, ImmutableMap.of());
+        assertEquals(2, catalog.stores().count());
+
+        HintsStore store1 = catalog.get(hostId1);
+        assertNotNull(store1);
+        assertEquals(descriptor1, store1.poll());
+        assertEquals(descriptor4, store1.poll());
+        assertNull(store1.poll());
+
+        HintsStore store2 = catalog.get(hostId2);
+        assertNotNull(store2);
+        assertEquals(descriptor3, store2.poll());
+        assertEquals(descriptor2, store2.poll());
+        assertNull(store2.poll());
+    }
+
+    private static void exciseHintFiles(File directory) throws IOException
+    {
+        UUID hostId = UUID.randomUUID();
+
+        HintsDescriptor descriptor1 = new HintsDescriptor(hostId, System.currentTimeMillis());
+        HintsDescriptor descriptor2 = new HintsDescriptor(hostId, System.currentTimeMillis() + 1);
+        HintsDescriptor descriptor3 = new HintsDescriptor(hostId, System.currentTimeMillis() + 2);
+        HintsDescriptor descriptor4 = new HintsDescriptor(hostId, System.currentTimeMillis() + 3);
+
+        createHintFile(directory, descriptor1);
+        createHintFile(directory, descriptor2);
+        createHintFile(directory, descriptor3);
+        createHintFile(directory, descriptor4);
+
+        HintsCatalog catalog = HintsCatalog.load(directory, ImmutableMap.of());
+        assertEquals(1, catalog.stores().count());
+
+        HintsStore store = catalog.get(hostId);
+
+        //should have 4 hint files
+        assertEquals(4, store.getDispatchQueueSize());
+
+        //excise store as a result it should remove all the hint files
+        catalog.exciseStore(hostId);
+
+        catalog = HintsCatalog.load(directory, ImmutableMap.of());
+        assertEquals(0, catalog.stores().count());
+        store = catalog.get(hostId);
+
+        //should have 0 hint files now
+        assertEquals(0, store.getDispatchQueueSize());
+    }
+
+    @Test
+    public void deleteHintsTest() throws IOException
+    {
+        File directory = Files.createTempDirectory(null).toFile();
+        UUID hostId1 = UUID.randomUUID();
+        UUID hostId2 = UUID.randomUUID();
+        long now = System.currentTimeMillis();
+        writeDescriptor(directory, new HintsDescriptor(hostId1, now));
+        writeDescriptor(directory, new HintsDescriptor(hostId1, now+1));
+        writeDescriptor(directory, new HintsDescriptor(hostId2, now+2));
+        writeDescriptor(directory, new HintsDescriptor(hostId2, now+3));
+
+        // load catalog containing two stores (one for each host)
+        HintsCatalog catalog = HintsCatalog.load(directory, ImmutableMap.of());
+        assertEquals(2, catalog.stores().count());
+        assertTrue(catalog.hasFiles());
+
+        // delete all hints from store 1
+        assertTrue(catalog.get(hostId1).hasFiles());
+        catalog.deleteAllHints(hostId1);
+        assertFalse(catalog.get(hostId1).hasFiles());
+        // stores are still keepts for each host, even after deleting hints
+        assertEquals(2, catalog.stores().count());
+        assertTrue(catalog.hasFiles());
+
+        // delete all hints from all stores
+        catalog.deleteAllHints();
+        assertEquals(2, catalog.stores().count());
+        assertFalse(catalog.hasFiles());
+    }
+
+    @SuppressWarnings("EmptyTryBlock")
+    private static void writeDescriptor(File directory, HintsDescriptor descriptor) throws IOException
+    {
+        try (HintsWriter ignored = HintsWriter.create(directory, descriptor))
+        {
+        }
+    }
+
+    private static Mutation createMutation(String key, long now)
+    {
+        Mutation mutation = new Mutation(KEYSPACE, dk(key));
+
+        new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE, TABLE0), now, mutation)
+                .clustering("column0")
+                .add("val", "value0")
+                .build();
+
+        new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE, TABLE1), now + 1, mutation)
+                .clustering("column1")
+                .add("val", "value1")
+                .build();
+
+        new RowUpdateBuilder(Schema.instance.getCFMetaData(KEYSPACE, TABLE2), now + 2, mutation)
+                .clustering("column2")
+                .add("val", "value2")
+                .build();
+
+        return mutation;
+    }
+
+    @SuppressWarnings("EmptyTryBlock")
+    private static void createHintFile(File directory, HintsDescriptor descriptor) throws IOException
+    {
+        try (HintsWriter writer = HintsWriter.create(directory, descriptor))
+        {
+            ByteBuffer writeBuffer = ByteBuffer.allocateDirect(WRITE_BUFFER_SIZE);
+            try (HintsWriter.Session session = writer.newSession(writeBuffer))
+            {
+                long now = FBUtilities.timestampMicros();
+                Mutation mutation = createMutation("testSerializer", now);
+                Hint hint = Hint.create(mutation, now / 1000);
+
+                session.append(hint);
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsCompressionTest.java b/test/unit/org/apache/cassandra/hints/HintsCompressionTest.java
new file mode 100644
index 0000000..656d7cd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsCompressionTest.java

@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.io.Files;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ParameterizedClass;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.io.compress.DeflateCompressor;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.cassandra.io.compress.LZ4Compressor;
+import org.apache.cassandra.io.compress.SnappyCompressor;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class HintsCompressionTest
+{
+    private static final String KEYSPACE = "hints_compression_test";
+    private static final String TABLE = "table";
+
+
+    private static Mutation createMutation(int index, long timestamp)
+    {
+        CFMetaData table = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+        return new RowUpdateBuilder(table, timestamp, bytes(index))
+               .clustering(bytes(index))
+               .add("val", bytes(index))
+               .build();
+    }
+
+    private static Hint createHint(int idx, long baseTimestamp)
+    {
+        long timestamp = baseTimestamp + idx;
+        return Hint.create(createMutation(idx, TimeUnit.MILLISECONDS.toMicros(timestamp)), timestamp);
+    }
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE, TABLE));
+    }
+
+    private ImmutableMap<String, Object> params(Class<? extends ICompressor> compressorClass)
+    {
+        ImmutableMap<String, Object> compressionParams = ImmutableMap.<String, Object>builder()
+                                                                     .put(ParameterizedClass.CLASS_NAME, compressorClass.getSimpleName())
+                                                                     .build();
+        return ImmutableMap.<String, Object>builder()
+                           .put(HintsDescriptor.COMPRESSION, compressionParams)
+                           .build();
+    }
+
+    public void multiFlushAndDeserializeTest(Class<? extends ICompressor> compressorClass) throws Exception
+    {
+        int hintNum = 0;
+        int bufferSize = HintsWriteExecutor.WRITE_BUFFER_SIZE;
+        List<Hint> hints = new LinkedList<>();
+
+        UUID hostId = UUIDGen.getTimeUUID();
+        long ts = System.currentTimeMillis();
+
+        HintsDescriptor descriptor = new HintsDescriptor(hostId, ts, params(compressorClass));
+        File dir = Files.createTempDir();
+        try (HintsWriter writer = HintsWriter.create(dir, descriptor))
+        {
+            assert writer instanceof CompressedHintsWriter;
+
+            ByteBuffer writeBuffer = ByteBuffer.allocateDirect(bufferSize);
+            try (HintsWriter.Session session = writer.newSession(writeBuffer))
+            {
+                while (session.getBytesWritten() < bufferSize * 3)
+                {
+                    Hint hint = createHint(hintNum, ts+hintNum);
+                    session.append(hint);
+                    hints.add(hint);
+                    hintNum++;
+                }
+            }
+        }
+
+        try (HintsReader reader = HintsReader.open(new File(dir, descriptor.fileName())))
+        {
+            List<Hint> deserialized = new ArrayList<>(hintNum);
+            List<InputPosition> pagePositions = new ArrayList<>(hintNum);
+
+            for (HintsReader.Page page: reader)
+            {
+                pagePositions.add(page.position);
+                Iterator<Hint> iterator = page.hintsIterator();
+                while (iterator.hasNext())
+                {
+                    deserialized.add(iterator.next());
+                }
+            }
+
+            Assert.assertEquals(hints.size(), deserialized.size());
+            hintNum = 0;
+            for (Hint expected: hints)
+            {
+                HintsTestUtil.assertHintsEqual(expected, deserialized.get(hintNum));
+                hintNum++;
+            }
+
+            // explicitely seek to each page by iterating collected page positions and check if hints still match as expected
+            int hintOffset = 0;
+            for (InputPosition pos : pagePositions)
+            {
+                reader.seek(pos);
+                HintsReader.Page page = reader.iterator().next();
+                Iterator<Hint> iterator = page.hintsIterator();
+                while (iterator.hasNext())
+                {
+                    Hint seekedHint = iterator.next();
+                    HintsTestUtil.assertHintsEqual(hints.get(hintOffset), seekedHint);
+                    hintOffset++;
+                }
+            }
+        }
+    }
+
+    @Test
+    public void lz4Compressor() throws Exception
+    {
+        multiFlushAndDeserializeTest(LZ4Compressor.class);
+    }
+
+    @Test
+    public void snappyCompressor() throws Exception
+    {
+        multiFlushAndDeserializeTest(SnappyCompressor.class);
+    }
+
+    @Test
+    public void deflateCompressor() throws Exception
+    {
+        multiFlushAndDeserializeTest(DeflateCompressor.class);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsDescriptorTest.java b/test/unit/org/apache/cassandra/hints/HintsDescriptorTest.java
new file mode 100644
index 0000000..08487d1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsDescriptorTest.java

@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.DataInput;
+import java.io.File;
+import java.io.IOException;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.Files;
+import org.junit.Test;
+
+import org.apache.cassandra.io.compress.LZ4Compressor;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotSame;
+import static junit.framework.Assert.fail;
+
+public class HintsDescriptorTest
+{
+    @Test
+    public void testSerializerNormal() throws IOException
+    {
+        UUID hostId = UUID.randomUUID();
+        int version = HintsDescriptor.CURRENT_VERSION;
+        long timestamp = System.currentTimeMillis();
+        ImmutableMap<String, Object> parameters =
+                ImmutableMap.of("compression", (Object) ImmutableMap.of("class_name", LZ4Compressor.class.getName()));
+        HintsDescriptor descriptor = new HintsDescriptor(hostId, version, timestamp, parameters);
+
+        testSerializeDeserializeLoop(descriptor);
+    }
+
+    @Test
+    public void testSerializerWithEmptyParameters() throws IOException
+    {
+        UUID hostId = UUID.randomUUID();
+        int version = HintsDescriptor.CURRENT_VERSION;
+        long timestamp = System.currentTimeMillis();
+        ImmutableMap<String, Object> parameters = ImmutableMap.of();
+        HintsDescriptor descriptor = new HintsDescriptor(hostId, version, timestamp, parameters);
+
+        testSerializeDeserializeLoop(descriptor);
+    }
+
+    @Test
+    public void testCorruptedDeserialize() throws IOException
+    {
+        UUID hostId = UUID.randomUUID();
+        int version = HintsDescriptor.CURRENT_VERSION;
+        long timestamp = System.currentTimeMillis();
+        ImmutableMap<String, Object> parameters = ImmutableMap.of();
+        HintsDescriptor descriptor = new HintsDescriptor(hostId, version, timestamp, parameters);
+
+        byte[] bytes = serializeDescriptor(descriptor);
+
+        // mess up the parameters size
+        bytes[28] = (byte) 0xFF;
+        bytes[29] = (byte) 0xFF;
+        bytes[30] = (byte) 0xFF;
+        bytes[31] = (byte) 0x7F;
+
+        // attempt to deserialize
+        try
+        {
+            deserializeDescriptor(bytes);
+            fail("Deserializing the descriptor should but didn't");
+        }
+        catch (IOException e)
+        {
+            assertEquals("Hints Descriptor CRC Mismatch", e.getMessage());
+        }
+    }
+
+    @Test
+    @SuppressWarnings("EmptyTryBlock")
+    public void testReadFromFile() throws IOException
+    {
+        UUID hostId = UUID.randomUUID();
+        int version = HintsDescriptor.CURRENT_VERSION;
+        long timestamp = System.currentTimeMillis();
+        ImmutableMap<String, Object> parameters = ImmutableMap.of();
+        HintsDescriptor expected = new HintsDescriptor(hostId, version, timestamp, parameters);
+
+        File directory = Files.createTempDir();
+        try
+        {
+            try (HintsWriter ignored = HintsWriter.create(directory, expected))
+            {
+            }
+            HintsDescriptor actual = HintsDescriptor.readFromFile(new File(directory, expected.fileName()).toPath());
+            assertEquals(expected, actual);
+        }
+        finally
+        {
+            directory.deleteOnExit();
+        }
+    }
+
+    private static void testSerializeDeserializeLoop(HintsDescriptor descriptor) throws IOException
+    {
+        // serialize to a byte array
+        byte[] bytes = serializeDescriptor(descriptor);
+        // make sure the sizes match
+        assertEquals(bytes.length, descriptor.serializedSize());
+        // deserialize back
+        HintsDescriptor deserializedDescriptor = deserializeDescriptor(bytes);
+        // compare equality
+        assertDescriptorsEqual(descriptor, deserializedDescriptor);
+    }
+
+    private static byte[] serializeDescriptor(HintsDescriptor descriptor) throws IOException
+    {
+        DataOutputBuffer dob = new DataOutputBuffer();
+        descriptor.serialize(dob);
+        return dob.toByteArray();
+    }
+
+    private static HintsDescriptor deserializeDescriptor(byte[] bytes) throws IOException
+    {
+        DataInput in = ByteStreams.newDataInput(bytes);
+        return HintsDescriptor.deserialize(in);
+    }
+
+    private static void assertDescriptorsEqual(HintsDescriptor expected, HintsDescriptor actual)
+    {
+        assertNotSame(expected, actual);
+        assertEquals(expected, actual);
+        assertEquals(expected.hashCode(), actual.hashCode());
+        assertEquals(expected.hostId, actual.hostId);
+        assertEquals(expected.version, actual.version);
+        assertEquals(expected.timestamp, actual.timestamp);
+        assertEquals(expected.parameters, actual.parameters);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsReaderTest.java b/test/unit/org/apache/cassandra/hints/HintsReaderTest.java
new file mode 100644
index 0000000..70cf6e7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsReaderTest.java

@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.util.Iterator;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.Iterables;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static org.apache.cassandra.Util.dk;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+public class HintsReaderTest
+{
+    private static final String CF_STANDARD1 = "Standard1";
+    private static final String CF_STANDARD2 = "Standard2";
+
+    private static HintsDescriptor descriptor;
+
+    private static File directory;
+
+    @BeforeClass
+    public static void defineSchema() throws Exception
+    {
+        SchemaLoader.prepareServer();
+
+        descriptor = new HintsDescriptor(UUID.randomUUID(), System.currentTimeMillis());
+    }
+
+    private static Mutation createMutation(int index, long timestamp, String ks, String tb)
+    {
+        CFMetaData table = Schema.instance.getCFMetaData(ks, tb);
+        return new RowUpdateBuilder(table, timestamp, bytes(index))
+               .clustering(bytes(index))
+               .add("val", bytes(index))
+               .build();
+    }
+
+    private void generateHints(int num, String ks) throws IOException
+    {
+        try (HintsWriter writer = HintsWriter.create(directory, descriptor))
+        {
+            ByteBuffer buffer = ByteBuffer.allocateDirect(256 * 1024);
+            try (HintsWriter.Session session = writer.newSession(buffer))
+            {
+                for (int i = 0; i < num; i++)
+                {
+                    long timestamp = descriptor.timestamp + i;
+                    Mutation m = createMutation(i, TimeUnit.MILLISECONDS.toMicros(timestamp), ks, CF_STANDARD1);
+                    session.append(Hint.create(m, timestamp));
+                    m = createMutation(i, TimeUnit.MILLISECONDS.toMicros(timestamp), ks, CF_STANDARD2);
+                    session.append(Hint.create(m, timestamp));
+                }
+            }
+            FileUtils.clean(buffer);
+        }
+    }
+
+    private void readHints(int num, int numTable) throws IOException
+    {
+        long baseTimestamp = descriptor.timestamp;
+        int index = 0;
+
+        try (HintsReader reader = HintsReader.open(new File(directory, descriptor.fileName())))
+        {
+            for (HintsReader.Page page : reader)
+            {
+                Iterator<Hint> hints = page.hintsIterator();
+                while (hints.hasNext())
+                {
+                    int i = index / numTable;
+                    Hint hint = hints.next();
+
+                    long timestamp = baseTimestamp + i;
+                    Mutation mutation = hint.mutation;
+
+                    assertEquals(timestamp, hint.creationTime);
+                    assertEquals(dk(bytes(i)), mutation.key());
+
+                    Row row = mutation.getPartitionUpdates().iterator().next().iterator().next();
+                    assertEquals(1, Iterables.size(row.cells()));
+                    assertEquals(bytes(i), row.clustering().get(0));
+                    Cell cell = row.cells().iterator().next();
+                    assertNotNull(cell);
+                    assertEquals(bytes(i), cell.value());
+                    assertEquals(timestamp * 1000, cell.timestamp());
+
+                    index++;
+                }
+            }
+        }
+
+        assertEquals(index, num);
+    }
+
+    @Test
+    public void testNormalRead() throws IOException
+    {
+        String ks = "testNormalRead";
+        SchemaLoader.createKeyspace(ks,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(ks, CF_STANDARD1),
+                                    SchemaLoader.standardCFMD(ks, CF_STANDARD2));
+        int numTable = 2;
+        directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            generateHints(3, ks);
+            readHints(3 * numTable, numTable);
+        }
+        finally
+        {
+            directory.delete();
+        }
+    }
+
+    @Test
+    public void testDroppedTableRead() throws IOException
+    {
+        String ks = "testDroppedTableRead";
+        SchemaLoader.createKeyspace(ks,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(ks, CF_STANDARD1),
+                                    SchemaLoader.standardCFMD(ks, CF_STANDARD2));
+        directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            generateHints(3, ks);
+            Schema.instance.dropTable(ks, CF_STANDARD1);
+            readHints(3, 1);
+        }
+        finally
+        {
+            directory.delete();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsServiceTest.java b/test/unit/org/apache/cassandra/hints/HintsServiceTest.java
new file mode 100644
index 0000000..ab1cbd0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsServiceTest.java

@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.net.InetAddress;
+import java.util.Collections;
+import java.util.UUID;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import javax.annotation.Nullable;
+
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.utils.MoreFutures;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.UpdateBuilder;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.gms.IFailureDetectionEventListener;
+import org.apache.cassandra.gms.IFailureDetector;
+import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.net.MockMessagingService;
+import org.apache.cassandra.net.MockMessagingSpy;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.Util.dk;
+import static org.apache.cassandra.net.MockMessagingService.verb;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class HintsServiceTest
+{
+    private static final String KEYSPACE = "hints_service_test";
+    private static final String TABLE = "table";
+
+    private final MockFailureDetector failureDetector = new MockFailureDetector();
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        StorageService.instance.initServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                KeyspaceParams.simple(1),
+                SchemaLoader.standardCFMD(KEYSPACE, TABLE));
+    }
+
+    @After
+    public void cleanup()
+    {
+        MockMessagingService.cleanup();
+    }
+
+    @Before
+    public void reinstanciateService() throws ExecutionException, InterruptedException
+    {
+        MessagingService.instance().clearMessageSinks();
+
+        if (!HintsService.instance.isShutDown())
+        {
+            HintsService.instance.shutdownBlocking();
+            HintsService.instance.deleteAllHints();
+        }
+
+        failureDetector.isAlive = true;
+        HintsService.instance = new HintsService(failureDetector);
+        HintsService.instance.startDispatch();
+    }
+
+    @Test
+    public void testDispatchHints() throws InterruptedException, ExecutionException
+    {
+        long cnt = StorageMetrics.totalHints.getCount();
+
+        // create spy for hint messages
+        MockMessagingSpy spy = sendHintsAndResponses(100, -1);
+
+        // metrics should have been updated with number of create hints
+        assertEquals(cnt + 100, StorageMetrics.totalHints.getCount());
+
+        // wait until hints have been send
+        spy.interceptMessageOut(100).get();
+        spy.interceptNoMsg(500, TimeUnit.MILLISECONDS).get();
+    }
+
+    @Test
+    public void testPauseAndResume() throws InterruptedException, ExecutionException
+    {
+        HintsService.instance.pauseDispatch();
+
+        // create spy for hint messages
+        MockMessagingSpy spy = sendHintsAndResponses(100, -1);
+
+        // we should not send any hints while paused
+        ListenableFuture<Boolean> noMessagesWhilePaused = spy.interceptNoMsg(15, TimeUnit.SECONDS);
+        Futures.addCallback(noMessagesWhilePaused, new MoreFutures.SuccessCallback<Boolean>()
+        {
+            public void onSuccess(@Nullable Boolean aBoolean)
+            {
+                HintsService.instance.resumeDispatch();
+            }
+        });
+
+        Futures.allAsList(
+                noMessagesWhilePaused,
+                spy.interceptMessageOut(100),
+                spy.interceptNoMsg(200, TimeUnit.MILLISECONDS)
+        ).get();
+    }
+
+    @Test
+    public void testPageRetry() throws InterruptedException, ExecutionException, TimeoutException
+    {
+        // create spy for hint messages, but only create responses for 5 hints
+        MockMessagingSpy spy = sendHintsAndResponses(20, 5);
+
+        Futures.allAsList(
+                // the dispatcher will always send all hints within the current page
+                // and only wait for the acks before going to the next page
+                spy.interceptMessageOut(20),
+                spy.interceptNoMsg(200, TimeUnit.MILLISECONDS),
+
+                // next tick will trigger a retry of the same page as we only replied with 5/20 acks
+                spy.interceptMessageOut(20)
+        ).get();
+
+        // marking the destination node as dead should stop sending hints
+        failureDetector.isAlive = false;
+        spy.interceptNoMsg(20, TimeUnit.SECONDS).get();
+    }
+
+    @Test
+    public void testPageSeek() throws InterruptedException, ExecutionException
+    {
+        // create spy for hint messages, stop replying after 12k (should be on 3rd page)
+        MockMessagingSpy spy = sendHintsAndResponses(20000, 12000);
+
+        // At this point the dispatcher will constantly retry the page we stopped acking,
+        // thus we receive the same hints from the page multiple times and in total more than
+        // all written hints. Lets just consume them for a while and then pause the dispatcher.
+        spy.interceptMessageOut(22000).get();
+        HintsService.instance.pauseDispatch();
+        Thread.sleep(1000);
+
+        // verify that we have a dispatch offset set for the page we're currently stuck at
+        HintsStore store = HintsService.instance.getCatalog().get(StorageService.instance.getLocalHostUUID());
+        HintsDescriptor descriptor = store.poll();
+        store.offerFirst(descriptor); // add again for cleanup during re-instanciation
+        InputPosition dispatchOffset = store.getDispatchOffset(descriptor);
+        assertTrue(dispatchOffset != null);
+        assertTrue(((ChecksummedDataInput.Position) dispatchOffset).sourcePosition > 0);
+    }
+
+    private MockMessagingSpy sendHintsAndResponses(int noOfHints, int noOfResponses)
+    {
+        // create spy for hint messages, but only create responses for noOfResponses hints
+        MessageIn<HintResponse> messageIn = MessageIn.create(FBUtilities.getBroadcastAddress(),
+                HintResponse.instance,
+                Collections.emptyMap(),
+                MessagingService.Verb.REQUEST_RESPONSE,
+                MessagingService.current_version);
+
+        MockMessagingSpy spy;
+        if (noOfResponses != -1)
+        {
+            spy = MockMessagingService.when(verb(MessagingService.Verb.HINT)).respondN(messageIn, noOfResponses);
+        }
+        else
+        {
+            spy = MockMessagingService.when(verb(MessagingService.Verb.HINT)).respond(messageIn);
+        }
+
+        // create and write noOfHints using service
+        UUID hostId = StorageService.instance.getLocalHostUUID();
+        for (int i = 0; i < noOfHints; i++)
+        {
+            long now = System.currentTimeMillis();
+            DecoratedKey dkey = dk(String.valueOf(i));
+            CFMetaData cfMetaData = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+
+            UpdateBuilder builder = UpdateBuilder.create(cfMetaData, dkey)
+                    .withTimestamp(now)
+                    .newRow("column0")
+                    .add("val", "value0");
+            Hint hint = Hint.create((Mutation) builder.makeMutation(), now);
+
+            HintsService.instance.write(hostId, hint);
+        }
+        return spy;
+    }
+
+    private static class MockFailureDetector implements IFailureDetector
+    {
+        private boolean isAlive = true;
+
+        public boolean isAlive(InetAddress ep)
+        {
+            return isAlive;
+        }
+
+        public void interpret(InetAddress ep)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void report(InetAddress ep)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void remove(InetAddress ep)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void forceConviction(InetAddress ep)
+        {
+            throw new UnsupportedOperationException();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/HintsTestUtil.java b/test/unit/org/apache/cassandra/hints/HintsTestUtil.java
new file mode 100644
index 0000000..89b532f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/HintsTestUtil.java

@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.util.UUID;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.partitions.AbstractBTreePartition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+
+final class HintsTestUtil
+{
+    static void assertMutationsEqual(Mutation expected, Mutation actual)
+    {
+        assertEquals(expected.key(), actual.key());
+        assertEquals(expected.getPartitionUpdates().size(), actual.getPartitionUpdates().size());
+
+        for (UUID id : expected.getColumnFamilyIds())
+            assertPartitionsEqual(expected.getPartitionUpdate(id), actual.getPartitionUpdate(id));
+    }
+
+    static void assertPartitionsEqual(AbstractBTreePartition expected, AbstractBTreePartition actual)
+    {
+        assertEquals(expected.partitionKey(), actual.partitionKey());
+        assertEquals(expected.deletionInfo(), actual.deletionInfo());
+        assertEquals(expected.columns(), actual.columns());
+        assertTrue(Iterators.elementsEqual(expected.iterator(), actual.iterator()));
+    }
+
+    static void assertHintsEqual(Hint expected, Hint actual)
+    {
+        assertEquals(expected.mutation.getKeyspaceName(), actual.mutation.getKeyspaceName());
+        assertEquals(expected.mutation.key(), actual.mutation.key());
+        assertEquals(expected.mutation.getColumnFamilyIds(), actual.mutation.getColumnFamilyIds());
+        for (PartitionUpdate partitionUpdate : expected.mutation.getPartitionUpdates())
+            assertPartitionsEqual(partitionUpdate, actual.mutation.getPartitionUpdate(partitionUpdate.metadata().cfId));
+        assertEquals(expected.creationTime, actual.creationTime);
+        assertEquals(expected.gcgs, actual.gcgs);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/hints/LegacyHintsMigratorTest.java b/test/unit/org/apache/cassandra/hints/LegacyHintsMigratorTest.java
new file mode 100644
index 0000000..cc97df0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/hints/LegacyHintsMigratorTest.java

@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.hints;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.util.*;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static junit.framework.Assert.assertTrue;
+
+import static org.apache.cassandra.hints.HintsTestUtil.assertMutationsEqual;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+// TODO: test split into several files
+@SuppressWarnings("deprecation")
+public class LegacyHintsMigratorTest
+{
+    private static final String KEYSPACE = "legacy_hints_migrator_test";
+    private static final String TABLE = "table";
+
+    @BeforeClass
+    public static void defineSchema()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE, TABLE));
+    }
+
+    @Test
+    public void testNothingToMigrate() throws IOException
+    {
+        File directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            testNothingToMigrate(directory);
+        }
+        finally
+        {
+            directory.deleteOnExit();
+        }
+    }
+
+    private static void testNothingToMigrate(File directory)
+    {
+        // truncate system.hints to enseure nothing inside
+        Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.LEGACY_HINTS).truncateBlocking();
+        new LegacyHintsMigrator(directory, 128 * 1024 * 1024).migrate();
+        HintsCatalog catalog = HintsCatalog.load(directory, HintsService.EMPTY_PARAMS);
+        assertEquals(0, catalog.stores().count());
+    }
+
+    @Test
+    public void testMigrationIsComplete() throws IOException
+    {
+        File directory = Files.createTempDirectory(null).toFile();
+        try
+        {
+            testMigrationIsComplete(directory);
+        }
+        finally
+        {
+            directory.deleteOnExit();
+        }
+    }
+
+    private static void testMigrationIsComplete(File directory)
+    {
+        long timestamp = System.currentTimeMillis();
+
+        // write 100 mutations for each of the 10 generated endpoints
+        Map<UUID, Queue<Mutation>> mutations = new HashMap<>();
+        for (int i = 0; i < 10; i++)
+        {
+            UUID hostId = UUID.randomUUID();
+            Queue<Mutation> queue = new LinkedList<>();
+            mutations.put(hostId, queue);
+
+            for (int j = 0; j < 100; j++)
+            {
+                Mutation mutation = createMutation(j, timestamp + j);
+                queue.offer(mutation);
+                Mutation legacyHint = createLegacyHint(mutation, timestamp, hostId);
+                legacyHint.applyUnsafe();
+            }
+        }
+
+        // run the migration
+        new LegacyHintsMigrator(directory, 128 * 1024 * 1024).migrate();
+
+        // validate that the hints table is truncated now
+        assertTrue(Keyspace.open(SystemKeyspace.NAME).getColumnFamilyStore(SystemKeyspace.LEGACY_HINTS).isEmpty());
+
+        HintsCatalog catalog = HintsCatalog.load(directory, HintsService.EMPTY_PARAMS);
+
+        // assert that we've correctly loaded 10 hints stores
+        assertEquals(10, catalog.stores().count());
+
+        // for each of the 10 stores, make sure the mutations have been migrated correctly
+        for (Map.Entry<UUID, Queue<Mutation>> entry : mutations.entrySet())
+        {
+            HintsStore store = catalog.get(entry.getKey());
+            assertNotNull(store);
+
+            HintsDescriptor descriptor = store.poll();
+            assertNotNull(descriptor);
+
+            // read all the hints
+            Queue<Hint> actualHints = new LinkedList<>();
+            try (HintsReader reader = HintsReader.open(new File(directory, descriptor.fileName())))
+            {
+                for (HintsReader.Page page : reader)
+                    page.hintsIterator().forEachRemaining(actualHints::offer);
+            }
+
+            // assert the size matches
+            assertEquals(100, actualHints.size());
+
+            // compare expected hints to actual hints
+            for (int i = 0; i < 100; i++)
+            {
+                Hint hint = actualHints.poll();
+                Mutation mutation = entry.getValue().poll();
+                int ttl = mutation.smallestGCGS();
+
+                assertEquals(timestamp, hint.creationTime);
+                assertEquals(ttl, hint.gcgs);
+                assertMutationsEqual(mutation, hint.mutation);
+            }
+        }
+    }
+
+    // legacy hint mutation creation code, copied more or less verbatim from the previous implementation
+    private static Mutation createLegacyHint(Mutation mutation, long now, UUID targetId)
+    {
+        int version = MessagingService.VERSION_21;
+        int ttl = mutation.smallestGCGS();
+        UUID hintId = UUIDGen.getTimeUUID();
+
+        ByteBuffer key = UUIDType.instance.decompose(targetId);
+        Clustering clustering = SystemKeyspace.LegacyHints.comparator.make(hintId, version);
+        ByteBuffer value = ByteBuffer.wrap(FBUtilities.serialize(mutation, Mutation.serializer, version));
+        Cell cell = BufferCell.expiring(SystemKeyspace.LegacyHints.compactValueColumn(),
+                                        now,
+                                        ttl,
+                                        FBUtilities.nowInSeconds(),
+                                        value);
+        return new Mutation(PartitionUpdate.singleRowUpdate(SystemKeyspace.LegacyHints,
+                                                            key,
+                                                            BTreeRow.singleCellRow(clustering, cell)));
+    }
+
+    private static Mutation createMutation(int index, long timestamp)
+    {
+        CFMetaData table = Schema.instance.getCFMetaData(KEYSPACE, TABLE);
+        return new RowUpdateBuilder(table, timestamp, bytes(index))
+               .clustering(bytes(index))
+               .add("val", bytes(index))
+               .build();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/index/CustomIndexTest.java b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
new file mode 100644
index 0000000..1ab08fd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/CustomIndexTest.java

@@ -0,0 +1,1138 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index;
+
+import java.util.*;
+import java.util.concurrent.Callable;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Sets;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.QueryValidationException;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.restrictions.IndexRestrictions;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.cql3.statements.ModificationStatement;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Indexes;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+import static org.apache.cassandra.Util.throwAssert;
+import static org.apache.cassandra.cql3.statements.IndexTarget.CUSTOM_INDEX_OPTION_NAME;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class CustomIndexTest extends CQLTester
+{
+    @Test
+    public void testInsertsOnCfsBackedIndex() throws Throwable
+    {
+        // test to ensure that we don't deadlock when flushing CFS backed custom indexers
+        // see CASSANDRA-10181
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b))");
+        createIndex("CREATE CUSTOM INDEX myindex ON %s(c) USING 'org.apache.cassandra.index.internal.CustomCassandraIndex'");
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 2);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 0, 1);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 2, 0, 0);
+    }
+
+    @Test
+    public void testTruncateWithNonCfsCustomIndex() throws Throwable
+    {
+        // deadlocks and times out the test in the face of the synchronisation
+        // issues described in the comments on CASSANDRA-9669
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a))");
+        createIndex("CREATE CUSTOM INDEX b_index ON %s(b) USING 'org.apache.cassandra.index.StubIndex'");
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 2);
+        getCurrentColumnFamilyStore().truncateBlocking();
+    }
+
+    @Test
+    public void indexControlsIfIncludedInBuildOnNewSSTables() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, PRIMARY KEY (a))");
+        String toInclude = "include";
+        String toExclude = "exclude";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(b) USING '%s'",
+                                  toInclude, IndexIncludedInBuild.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(b) USING '%s'",
+                                  toExclude, IndexExcludedFromBuild.class.getName()));
+
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 0, 0);
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (a, b) VALUES (?, ?)", 2, 2);
+        flush();
+
+        SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
+        IndexIncludedInBuild included = (IndexIncludedInBuild)indexManager.getIndexByName(toInclude);
+        included.reset();
+        assertTrue(included.rowsInserted.isEmpty());
+
+        IndexExcludedFromBuild excluded = (IndexExcludedFromBuild)indexManager.getIndexByName(toExclude);
+        excluded.reset();
+        assertTrue(excluded.rowsInserted.isEmpty());
+
+        indexManager.buildAllIndexesBlocking(getCurrentColumnFamilyStore().getLiveSSTables());
+
+        assertEquals(3, included.rowsInserted.size());
+        assertTrue(excluded.rowsInserted.isEmpty());
+    }
+
+    @Test
+    public void indexReceivesWriteTimeDeletionsCorrectly() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b, c))");
+        String indexName = "test_index";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(d) USING '%s'",
+                                  indexName, StubIndex.class.getName()));
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 1, 1);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 0, 2, 2);
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 3, 3);
+
+        SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
+        StubIndex index = (StubIndex)indexManager.getIndexByName(indexName);
+        assertEquals(4, index.rowsInserted.size());
+        assertTrue(index.partitionDeletions.isEmpty());
+        assertTrue(index.rangeTombstones.isEmpty());
+
+        execute("DELETE FROM %s WHERE a=0 AND b=0");
+        assertTrue(index.partitionDeletions.isEmpty());
+        assertEquals(1, index.rangeTombstones.size());
+
+        execute("DELETE FROM %s WHERE a=0");
+        assertEquals(1, index.partitionDeletions.size());
+        assertEquals(1, index.rangeTombstones.size());
+    }
+    @Test
+    public void nonCustomIndexesRequireExactlyOneTargetColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v1 int, v2 int, PRIMARY KEY (k,c))");
+
+        assertInvalidMessage("Only CUSTOM indexes support multiple columns", "CREATE INDEX multi_idx on %s(v1,v2)");
+        assertInvalidMessage("Only CUSTOM indexes can be created without specifying a target column",
+                           "CREATE INDEX no_targets on %s()");
+
+        createIndex(String.format("CREATE CUSTOM INDEX multi_idx ON %%s(v1, v2) USING '%s'", StubIndex.class.getName()));
+        assertIndexCreated("multi_idx", "v1", "v2");
+    }
+
+    @Test
+    public void rejectDuplicateColumnsInTargetList() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v1 int, v2 int, PRIMARY KEY (k,c))");
+
+        assertInvalidMessage("Duplicate column v1 in index target list",
+                             String.format("CREATE CUSTOM INDEX ON %%s(v1, v1) USING '%s'",
+                                           StubIndex.class.getName()));
+
+        assertInvalidMessage("Duplicate column v1 in index target list",
+                             String.format("CREATE CUSTOM INDEX ON %%s(v1, v1, c, c) USING '%s'",
+                                           StubIndex.class.getName()));
+    }
+
+    @Test
+    public void requireFullQualifierForFrozenCollectionTargets() throws Throwable
+    {
+        // this is really just to prove that we require the full modifier on frozen collection
+        // targets whether the index is multicolumn or not
+        createTable("CREATE TABLE %s(" +
+                    " k int," +
+                    " c int," +
+                    " fmap frozen<map<int, text>>," +
+                    " flist frozen<list<int>>," +
+                    " fset frozen<set<int>>," +
+                    " PRIMARY KEY(k,c))");
+
+        assertInvalidMessage("Cannot create keys() index on frozen column fmap. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, keys(fmap)) USING'%s'",
+                                           StubIndex.class.getName()));
+        assertInvalidMessage("Cannot create entries() index on frozen column fmap. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, entries(fmap)) USING'%s'",
+                                           StubIndex.class.getName()));
+        assertInvalidMessage("Cannot create values() index on frozen column fmap. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, fmap) USING'%s'", StubIndex.class.getName()));
+
+        assertInvalidMessage("Cannot create keys() index on frozen column flist. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, keys(flist)) USING'%s'",
+                                           StubIndex.class.getName()));
+        assertInvalidMessage("Cannot create entries() index on frozen column flist. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, entries(flist)) USING'%s'",
+                                           StubIndex.class.getName()));
+        assertInvalidMessage("Cannot create values() index on frozen column flist. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, flist) USING'%s'", StubIndex.class.getName()));
+
+        assertInvalidMessage("Cannot create keys() index on frozen column fset. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, keys(fset)) USING'%s'",
+                                           StubIndex.class.getName()));
+        assertInvalidMessage("Cannot create entries() index on frozen column fset. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, entries(fset)) USING'%s'",
+                                           StubIndex.class.getName()));
+        assertInvalidMessage("Cannot create values() index on frozen column fset. " +
+                             "Frozen collections only support full() indexes",
+                             String.format("CREATE CUSTOM INDEX ON %%s(c, fset) USING'%s'", StubIndex.class.getName()));
+
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, full(fmap)) USING'%s'", StubIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, full(flist)) USING'%s'", StubIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, full(fset)) USING'%s'", StubIndex.class.getName()));
+    }
+
+    @Test
+    public void defaultIndexNameContainsTargetColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v1 int, v2 int, PRIMARY KEY(k,c))");
+
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1, v2) USING '%s'", StubIndex.class.getName()));
+        assertEquals(1, getCurrentColumnFamilyStore().metadata.getIndexes().size());
+        assertIndexCreated(currentTable() + "_idx", "v1", "v2");
+
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, v1, v2) USING '%s'", StubIndex.class.getName()));
+        assertEquals(2, getCurrentColumnFamilyStore().metadata.getIndexes().size());
+        assertIndexCreated(currentTable() + "_idx_1", "c", "v1", "v2");
+
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, v2) USING '%s'", StubIndex.class.getName()));
+        assertEquals(3, getCurrentColumnFamilyStore().metadata.getIndexes().size());
+        assertIndexCreated(currentTable() + "_idx_2", "c", "v2");
+
+        // duplicate the previous index with some additional options and check the name is generated as expected
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, v2) USING '%s' WITH OPTIONS = {'foo':'bar'}",
+                                  StubIndex.class.getName()));
+        assertEquals(4, getCurrentColumnFamilyStore().metadata.getIndexes().size());
+        Map<String, String> options = new HashMap<>();
+        options.put("foo", "bar");
+        assertIndexCreated(currentTable() + "_idx_3", options, "c", "v2");
+    }
+
+    @Test
+    public void createMultiColumnIndexes() throws Throwable
+    {
+        // smoke test for various permutations of multicolumn indexes
+        createTable("CREATE TABLE %s (" +
+                    " pk1 int," +
+                    " pk2 int," +
+                    " c1 int," +
+                    " c2 int," +
+                    " v1 int," +
+                    " v2 int," +
+                    " mval map<text, int>," +
+                    " lval list<int>," +
+                    " sval set<int>," +
+                    " fmap frozen<map<text,int>>," +
+                    " flist frozen<list<int>>," +
+                    " fset frozen<set<int>>," +
+                    " PRIMARY KEY ((pk1, pk2), c1, c2))");
+
+        testCreateIndex("idx_1", "pk1", "pk2");
+        testCreateIndex("idx_2", "pk1", "c1");
+        testCreateIndex("idx_3", "pk1", "c2");
+        testCreateIndex("idx_4", "c1", "c2");
+        testCreateIndex("idx_5", "c2", "v1");
+        testCreateIndex("idx_6", "v1", "v2");
+        testCreateIndex("idx_7", "pk2", "c2", "v2");
+        testCreateIndex("idx_8", "pk1", "c1", "v1", "mval", "sval", "lval");
+
+        createIndex(String.format("CREATE CUSTOM INDEX inc_frozen ON %%s(" +
+                                  "  pk2, c2, v2, full(fmap), full(fset), full(flist)" +
+                                  ") USING '%s'",
+                                  StubIndex.class.getName()));
+        assertIndexCreated("inc_frozen",
+                           new HashMap<>(),
+                           ImmutableList.of(indexTarget("pk2", IndexTarget.Type.VALUES),
+                                            indexTarget("c2", IndexTarget.Type.VALUES),
+                                            indexTarget("v2", IndexTarget.Type.VALUES),
+                                            indexTarget("fmap", IndexTarget.Type.FULL),
+                                            indexTarget("fset", IndexTarget.Type.FULL),
+                                            indexTarget("flist", IndexTarget.Type.FULL)));
+
+        createIndex(String.format("CREATE CUSTOM INDEX all_teh_things ON %%s(" +
+                                  "  pk1, pk2, c1, c2, v1, v2, keys(mval), lval, sval, full(fmap), full(fset), full(flist)" +
+                                  ") USING '%s'",
+                                  StubIndex.class.getName()));
+        assertIndexCreated("all_teh_things",
+                           new HashMap<>(),
+                           ImmutableList.of(indexTarget("pk1", IndexTarget.Type.VALUES),
+                                            indexTarget("pk2", IndexTarget.Type.VALUES),
+                                            indexTarget("c1", IndexTarget.Type.VALUES),
+                                            indexTarget("c2", IndexTarget.Type.VALUES),
+                                            indexTarget("v1", IndexTarget.Type.VALUES),
+                                            indexTarget("v2", IndexTarget.Type.VALUES),
+                                            indexTarget("mval", IndexTarget.Type.KEYS),
+                                            indexTarget("lval", IndexTarget.Type.VALUES),
+                                            indexTarget("sval", IndexTarget.Type.VALUES),
+                                            indexTarget("fmap", IndexTarget.Type.FULL),
+                                            indexTarget("fset", IndexTarget.Type.FULL),
+                                            indexTarget("flist", IndexTarget.Type.FULL)));
+    }
+
+    @Test
+    public void createMultiColumnIndexIncludingUserTypeColumn() throws Throwable
+    {
+        String myType = KEYSPACE + '.' + createType("CREATE TYPE %s (a int, b int)");
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 frozen<" + myType + ">)");
+        testCreateIndex("udt_idx", "v1", "v2");
+        Indexes indexes = getCurrentColumnFamilyStore().metadata.getIndexes();
+        IndexMetadata expected = IndexMetadata.fromIndexTargets(getCurrentColumnFamilyStore().metadata,
+                                                                ImmutableList.of(indexTarget("v1", IndexTarget.Type.VALUES),
+                                                                                 indexTarget("v2", IndexTarget.Type.VALUES)),
+                                                                "udt_idx",
+                                                                IndexMetadata.Kind.CUSTOM,
+                                                                ImmutableMap.of(CUSTOM_INDEX_OPTION_NAME,
+                                                                                StubIndex.class.getName()));
+        IndexMetadata actual = indexes.get("udt_idx").orElseThrow(throwAssert("Index udt_idx not found"));
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void createIndexWithoutTargets() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v1 int, v2 int, PRIMARY KEY(k,c))");
+        // only allowed for CUSTOM indexes
+        assertInvalidMessage("Only CUSTOM indexes can be created without specifying a target column",
+                             "CREATE INDEX ON %s()");
+
+        // parentheses are mandatory
+        assertInvalidSyntax("CREATE CUSTOM INDEX ON %%s USING '%s'", StubIndex.class.getName());
+        createIndex(String.format("CREATE CUSTOM INDEX no_targets ON %%s() USING '%s'", StubIndex.class.getName()));
+        assertIndexCreated("no_targets", new HashMap<>());
+    }
+
+    @Test
+    public void testCustomIndexExpressionSyntax() throws Throwable
+    {
+        Object[] row = row(0, 0, 0, 0);
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b))");
+        String indexName = currentTable() + "_custom_index";
+        execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", row);
+
+
+        assertInvalidMessage(String.format(IndexRestrictions.INDEX_NOT_FOUND, indexName, keyspace(), currentTable()),
+                             String.format("SELECT * FROM %%s WHERE expr(%s, 'foo bar baz')", indexName));
+
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(c) USING '%s'", indexName, StubIndex.class.getName()));
+
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  String.format(IndexRestrictions.INDEX_NOT_FOUND, "no_such_index", keyspace(), currentTable()),
+                                  QueryValidationException.class,
+                                  "SELECT * FROM %s WHERE expr(no_such_index, 'foo bar baz ')");
+
+        // simple case
+        assertRows(execute(String.format("SELECT * FROM %%s WHERE expr(%s, 'foo bar baz')", indexName)), row);
+        assertRows(execute(String.format("SELECT * FROM %%s WHERE expr(\"%s\", 'foo bar baz')", indexName)), row);
+        assertRows(execute(String.format("SELECT * FROM %%s WHERE expr(%s, $$foo \" ~~~ bar Baz$$)", indexName)), row);
+
+        // multiple expressions on the same index
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  IndexRestrictions.MULTIPLE_EXPRESSIONS,
+                                  QueryValidationException.class,
+                                  String.format("SELECT * FROM %%s WHERE expr(%1$s, 'foo') AND expr(%1$s, 'bar')",
+                                                indexName));
+
+        // multiple expressions on different indexes
+        createIndex(String.format("CREATE CUSTOM INDEX other_custom_index ON %%s(d) USING '%s'", StubIndex.class.getName()));
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  IndexRestrictions.MULTIPLE_EXPRESSIONS,
+                                  QueryValidationException.class,
+                                  String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND expr(other_custom_index, 'bar')",
+                                                indexName));
+
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                                  QueryValidationException.class,
+                                  String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND d=0", indexName));
+        assertRows(execute(String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND d=0 ALLOW FILTERING", indexName)), row);
+    }
+
+    @Test
+    public void customIndexDoesntSupportCustomExpressions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b))");
+        String indexName = currentTable() + "_custom_index";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(c) USING '%s'",
+                                  indexName,
+                                  NoCustomExpressionsIndex.class.getName()));
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  String.format( IndexRestrictions.CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName),
+                                  QueryValidationException.class,
+                                  String.format("SELECT * FROM %%s WHERE expr(%s, 'foo bar baz')", indexName));
+    }
+
+    @Test
+    public void customIndexRejectsExpressionSyntax() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b))");
+        String indexName = currentTable() + "_custom_index";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(c) USING '%s'",
+                                  indexName,
+                                  AlwaysRejectIndex.class.getName()));
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  "None shall pass",
+                                  QueryValidationException.class,
+                                  String.format("SELECT * FROM %%s WHERE expr(%s, 'foo bar baz')", indexName));
+    }
+
+    @Test
+    public void customExpressionsMustTargetCustomIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b))");
+        createIndex("CREATE INDEX non_custom_index ON %s(c)");
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  String.format(IndexRestrictions.NON_CUSTOM_INDEX_IN_EXPRESSION, "non_custom_index"),
+                                  QueryValidationException.class,
+                                  "SELECT * FROM %s WHERE expr(non_custom_index, 'c=0')");
+    }
+
+    @Test
+    public void customExpressionsDisallowedInModifications() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a int, b int, c int, d int, PRIMARY KEY (a, b))");
+        String indexName = currentTable() + "_custom_index";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(c) USING '%s'",
+                                  indexName, StubIndex.class.getName()));
+
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  ModificationStatement.CUSTOM_EXPRESSIONS_NOT_ALLOWED,
+                                  QueryValidationException.class,
+                                  String.format("DELETE FROM %%s WHERE expr(%s, 'foo bar baz ')", indexName));
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  ModificationStatement.CUSTOM_EXPRESSIONS_NOT_ALLOWED,
+                                  QueryValidationException.class,
+                                  String.format("UPDATE %%s SET d=0 WHERE expr(%s, 'foo bar baz ')", indexName));
+    }
+
+    @Test
+    public void indexSelectionPrefersMostSelectiveIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s(a int, b int, c int, PRIMARY KEY (a))");
+        createIndex(String.format("CREATE CUSTOM INDEX %s_more_selective ON %%s(b) USING '%s'",
+                                  currentTable(),
+                                  SettableSelectivityIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX %s_less_selective ON %%s(c) USING '%s'",
+                                  currentTable(),
+                                  SettableSelectivityIndex.class.getName()));
+        SettableSelectivityIndex moreSelective =
+            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_more_selective");
+        SettableSelectivityIndex lessSelective =
+            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_less_selective");
+        assertEquals(0, moreSelective.searchersProvided);
+        assertEquals(0, lessSelective.searchersProvided);
+
+        // the more selective index should be chosen
+        moreSelective.setEstimatedResultRows(1);
+        lessSelective.setEstimatedResultRows(1000);
+        execute("SELECT * FROM %s WHERE b=0 AND c=0 ALLOW FILTERING");
+        assertEquals(1, moreSelective.searchersProvided);
+        assertEquals(0, lessSelective.searchersProvided);
+
+        // and adjusting the selectivity should have an observable effect
+        moreSelective.setEstimatedResultRows(10000);
+        execute("SELECT * FROM %s WHERE b=0 AND c=0 ALLOW FILTERING");
+        assertEquals(1, moreSelective.searchersProvided);
+        assertEquals(1, lessSelective.searchersProvided);
+    }
+
+    @Test
+    public void customExpressionForcesIndexSelection() throws Throwable
+    {
+        createTable("CREATE TABLE %s(a int, b int, c int, PRIMARY KEY (a))");
+        createIndex(String.format("CREATE CUSTOM INDEX %s_more_selective ON %%s(b) USING '%s'",
+                                  currentTable(),
+                                  SettableSelectivityIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX %s_less_selective ON %%s(c) USING '%s'",
+                                  currentTable(),
+                                  SettableSelectivityIndex.class.getName()));
+        SettableSelectivityIndex moreSelective =
+            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_more_selective");
+        SettableSelectivityIndex lessSelective =
+            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_less_selective");
+        assertEquals(0, moreSelective.searchersProvided);
+        assertEquals(0, lessSelective.searchersProvided);
+
+        // without a custom expression, the more selective index should be chosen
+        moreSelective.setEstimatedResultRows(1);
+        lessSelective.setEstimatedResultRows(1000);
+        execute("SELECT * FROM %s WHERE b=0 AND c=0 ALLOW FILTERING");
+        assertEquals(1, moreSelective.searchersProvided);
+        assertEquals(0, lessSelective.searchersProvided);
+
+        // when a custom expression is present, its target index should be preferred
+        execute(String.format("SELECT * FROM %%s WHERE b=0 AND expr(%s_less_selective, 'expression') ALLOW FILTERING", currentTable()));
+        assertEquals(1, moreSelective.searchersProvided);
+        assertEquals(1, lessSelective.searchersProvided);
+    }
+
+    @Test
+    public void testCustomExpressionValueType() throws Throwable
+    {
+        // verify that the type of the expression value is determined by Index::customExpressionValueType
+        createTable("CREATE TABLE %s (k int, v1 uuid, v2 blob, PRIMARY KEY(k))");
+        createIndex(String.format("CREATE CUSTOM INDEX int_index ON %%s() USING '%s'",
+                                  Int32ExpressionIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX text_index ON %%s() USING '%s'",
+                                  UTF8ExpressionIndex.class.getName()));
+
+        execute("SELECT * FROM %s WHERE expr(text_index, 'foo')");
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  "Invalid INTEGER constant (99) for \"custom index expression\" of type text",
+                                  QueryValidationException.class,
+                                  "SELECT * FROM %s WHERE expr(text_index, 99)");
+
+        execute("SELECT * FROM %s WHERE expr(int_index, 99)");
+        assertInvalidThrowMessage(Server.CURRENT_VERSION,
+                                  "Invalid STRING constant (foo) for \"custom index expression\" of type int",
+                                  QueryValidationException.class,
+                                  "SELECT * FROM %s WHERE expr(int_index, 'foo')");
+    }
+
+    @Test
+    public void reloadIndexMetadataOnBaseCfsReload() throws Throwable
+    {
+        // verify that whenever the base table CFMetadata is reloaded, a reload of the index
+        // metadata is performed
+        createTable("CREATE TABLE %s (k int, v1 int, PRIMARY KEY(k))");
+        createIndex(String.format("CREATE CUSTOM INDEX reload_counter ON %%s() USING '%s'",
+                                  CountMetadataReloadsIndex.class.getName()));
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        CountMetadataReloadsIndex index = (CountMetadataReloadsIndex)cfs.indexManager.getIndexByName("reload_counter");
+        assertEquals(0, index.reloads.get());
+
+        // reloading the CFS, even without any metadata changes invokes the index's metadata reload task
+        cfs.reload();
+        assertEquals(1, index.reloads.get());
+    }
+
+    @Test
+    public void notifyIndexersOfPartitionAndRowRemovalDuringCleanup() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k,c))");
+        createIndex(String.format("CREATE CUSTOM INDEX cleanup_index ON %%s() USING '%s'", StubIndex.class.getName()));
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        StubIndex index  = (StubIndex)cfs.indexManager.getIndexByName("cleanup_index");
+
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 1, 1);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 2, 2);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 3, 3, 3);
+        assertEquals(4, index.rowsInserted.size());
+        assertEquals(0, index.partitionDeletions.size());
+
+        ReadCommand cmd = Util.cmd(cfs, 0).build();
+        try (ReadOrderGroup orderGroup = cmd.startOrderGroup();
+             UnfilteredPartitionIterator iterator = cmd.executeLocally(orderGroup))
+        {
+            assertTrue(iterator.hasNext());
+            cfs.indexManager.deletePartition(iterator.next(), FBUtilities.nowInSeconds());
+        }
+
+        assertEquals(1, index.partitionDeletions.size());
+        assertEquals(3, index.rowsDeleted.size());
+        for (int i = 0; i < 3; i++)
+            assertEquals(index.rowsDeleted.get(i).clustering(), index.rowsInserted.get(i).clustering());
+    }
+
+    @Test
+    public void notifyIndexersOfExpiredRowsDuringCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, PRIMARY KEY (k,c))");
+        createIndex(String.format("CREATE CUSTOM INDEX row_ttl_test_index ON %%s() USING '%s'", StubIndex.class.getName()));
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        StubIndex index  = (StubIndex)cfs.indexManager.getIndexByName("row_ttl_test_index");
+
+        execute("INSERT INTO %s (k, c) VALUES (?, ?) USING TTL 1", 0, 0);
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", 0, 1);
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", 0, 2);
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", 3, 3);
+        assertEquals(4, index.rowsInserted.size());
+        // flush so that we end up with an expiring row in the first sstable
+        flush();
+
+        // let the row with the ttl expire, then force a compaction
+        TimeUnit.SECONDS.sleep(2);
+        compact();
+
+        // the index should have been notified of the expired row
+        assertEquals(1, index.rowsDeleted.size());
+        Integer deletedClustering = Int32Type.instance.compose(index.rowsDeleted.get(0).clustering().get(0));
+        assertEquals(0, deletedClustering.intValue());
+    }
+
+    @Test
+    public void validateOptions() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v1 int, v2 int, PRIMARY KEY(k,c))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, v2) USING '%s' WITH OPTIONS = {'foo':'bar'}",
+                                  IndexWithValidateOptions.class.getName()));
+        assertNotNull(IndexWithValidateOptions.options);
+        assertEquals("bar", IndexWithValidateOptions.options.get("foo"));
+    }
+
+    @Test
+    public void validateOptionsWithCFMetaData() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v1 int, v2 int, PRIMARY KEY(k,c))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, v2) USING '%s' WITH OPTIONS = {'foo':'bar'}",
+                                  IndexWithOverloadedValidateOptions.class.getName()));
+        CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+        assertEquals(cfm, IndexWithOverloadedValidateOptions.cfm);
+        assertNotNull(IndexWithOverloadedValidateOptions.options);
+        assertEquals("bar", IndexWithOverloadedValidateOptions.options.get("foo"));
+    }
+
+    @Test
+    public void testFailing2iFlush() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int PRIMARY KEY, value int)");
+        createIndex("CREATE CUSTOM INDEX IF NOT EXISTS ON %s(value) USING 'org.apache.cassandra.index.CustomIndexTest$BrokenCustom2I'");
+
+        for (int i = 0; i < 10; i++)
+            execute("INSERT INTO %s (pk, value) VALUES (?, ?)", i, i);
+
+        try
+        {
+            getCurrentColumnFamilyStore().forceBlockingFlush();
+            fail("Flush should have thrown an exception.");
+        }
+        catch (Throwable t)
+        {
+            assertTrue(t.getMessage().contains("Broken2I"));
+        }
+
+        // SSTables remain uncommitted.
+        assertEquals(1, getCurrentColumnFamilyStore().getDirectories().getDirectoryForNewSSTables().listFiles().length);
+    }
+
+    @Test
+    public void indexBuildingPagesLargePartitions() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v int, PRIMARY KEY(k,c))");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        SecondaryIndexManager indexManager = cfs.indexManager;
+        int totalRows = SimulateConcurrentFlushingIndex.ROWS_IN_PARTITION;
+        // Insert a single wide partition to be indexed
+        for (int i = 0; i < totalRows; i++)
+            execute("INSERT INTO %s (k, c, v) VALUES (0, ?, ?)", i, i);
+        cfs.forceBlockingFlush();
+
+        // Create the index, which won't automatically start building
+        String indexName = "build_single_partition_idx";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v) USING '%s'",
+                                  indexName, SimulateConcurrentFlushingIndex.class.getName()));
+        SimulateConcurrentFlushingIndex index = (SimulateConcurrentFlushingIndex) indexManager.getIndexByName(indexName);
+
+        // Index the partition with an Indexer which artificially simulates additional concurrent
+        // flush activity by periodically issuing barriers on the read & write op groupings
+        DecoratedKey targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(0));
+        indexManager.indexPartition(targetKey, Collections.singleton(index), totalRows / 10);
+
+        // When indexing is done check that:
+        // * The base table's read ordering at finish was > the one at the start (i.e. that
+        //   we didn't hold a single read OpOrder.Group for the whole operation.
+        // * That multiple write OpOrder.Groups were used to perform the writes to the index
+        // * That all operations are complete, that none of the relevant OpOrder.Groups are
+        //   marked as blocking progress and that all the barriers' ops are considered done.
+        assertTrue(index.readOrderingAtFinish.compareTo(index.readOrderingAtStart) > 0);
+        assertTrue(index.writeGroups.size() > 1);
+        assertFalse(index.readOrderingAtFinish.isBlocking());
+        index.writeGroups.forEach(group -> assertFalse(group.isBlocking()));
+        index.readBarriers.forEach(b -> assertTrue(b.getSyncPoint().isFinished()));
+        index.writeBarriers.forEach(b -> {
+            b.await(); // Keyspace.writeOrder is global, so this might be temporally blocked by other tests
+            assertTrue(b.getSyncPoint().isFinished());
+        });
+    }
+
+    @Test
+    public void partitionIndexTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v int, s int static, PRIMARY KEY(k,c))");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 1, 1);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 2, 2);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 3, 3);
+
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", 2, 2);
+
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 3, 1, 1);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 3, 2, 2);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 3, 3, 3);
+        execute("DELETE FROM %s WHERE k = ? AND c >= ?", 3, 3);
+        execute("DELETE FROM %s WHERE k = ? AND c <= ?", 3, 1);
+
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 4, 1, 1);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 4, 2, 2);
+        execute("DELETE FROM %s WHERE k = ? AND c = ?", 4, 1);
+
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 5, 1, 1);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 5, 2, 2);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 5, 3, 3);
+        execute("DELETE FROM %s WHERE k = ?", 5);
+
+        cfs.forceBlockingFlush();
+
+        String indexName = "partition_index_test_idx";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v) USING '%s'",
+                                  indexName, StubIndex.class.getName()));
+
+        SecondaryIndexManager indexManager = cfs.indexManager;
+        StubIndex index = (StubIndex) indexManager.getIndexByName(indexName);
+
+        DecoratedKey targetKey;
+        for (int pageSize = 1; pageSize <= 5; pageSize++)
+        {
+            targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(1));
+            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            assertEquals(3, index.rowsInserted.size());
+            assertEquals(0, index.rangeTombstones.size());
+            assertTrue(index.partitionDeletions.get(0).isLive());
+            index.reset();
+        }
+
+        for (int pageSize = 1; pageSize <= 5; pageSize++)
+        {
+            targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(2));
+            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            assertEquals(1, index.rowsInserted.size());
+            assertEquals(0, index.rangeTombstones.size());
+            assertTrue(index.partitionDeletions.get(0).isLive());
+            index.reset();
+        }
+
+        for (int pageSize = 1; pageSize <= 5; pageSize++)
+        {
+            targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(3));
+            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            assertEquals(1, index.rowsInserted.size());
+            assertEquals(2, index.rangeTombstones.size());
+            assertTrue(index.partitionDeletions.get(0).isLive());
+            index.reset();
+        }
+
+        for (int pageSize = 1; pageSize <= 5; pageSize++)
+        {
+            targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(5));
+            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            assertEquals(1, index.partitionDeletions.size());
+            assertFalse(index.partitionDeletions.get(0).isLive());
+            index.reset();
+        }
+    }
+
+    @Test
+    public void partitionIsNotOverIndexed() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v int, PRIMARY KEY(k,c))");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        SecondaryIndexManager indexManager = cfs.indexManager;
+
+        int totalRows = 1;
+
+        // Insert a single row partition to be indexed
+        for (int i = 0; i < totalRows; i++)
+            execute("INSERT INTO %s (k, c, v) VALUES (0, ?, ?)", i, i);
+        cfs.forceBlockingFlush();
+
+        // Create the index, which won't automatically start building
+        String indexName = "partition_overindex_test_idx";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v) USING '%s'",
+                                  indexName, StubIndex.class.getName()));
+        StubIndex index = (StubIndex) indexManager.getIndexByName(indexName);
+
+        // Index the partition
+        DecoratedKey targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(0));
+        indexManager.indexPartition(targetKey, Collections.singleton(index), totalRows);
+
+        // Assert only one partition is counted
+        assertEquals(1, index.beginCalls);
+        assertEquals(1, index.finishCalls);
+    }
+
+    @Test
+    public void rangeTombstoneTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int, c int, v int, v2 int, PRIMARY KEY(k,c))");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        SecondaryIndexManager indexManager = cfs.indexManager;
+
+        // Insert a single range tombstone
+        execute("DELETE FROM %s WHERE k=1 and c > 2");
+        cfs.forceBlockingFlush();
+
+        // Create the index, which won't automatically start building
+        String indexName = "range_tombstone_idx";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v) USING '%s'",
+                                  indexName, StubIndex.class.getName()));
+        String indexName2 = "range_tombstone_idx2";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v2) USING '%s'",
+                                  indexName2, StubIndex.class.getName()));
+
+        StubIndex index = (StubIndex) indexManager.getIndexByName(indexName);
+        StubIndex index2 = (StubIndex) indexManager.getIndexByName(indexName2);
+
+        // Index the partition
+        DecoratedKey targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(1));
+        indexManager.indexPartition(targetKey, Sets.newHashSet(index, index2), 1);
+
+        // and both indexes should have the same range tombstone
+        assertEquals(index.rangeTombstones, index2.rangeTombstones);
+    }
+
+
+    // Used for index creation above
+    public static class BrokenCustom2I extends StubIndex
+    {
+        public BrokenCustom2I(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public Callable<?> getBlockingFlushTask()
+        {
+            throw new RuntimeException("Broken2I");
+        }
+    }
+
+    private void testCreateIndex(String indexName, String... targetColumnNames) throws Throwable
+    {
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(%s) USING '%s'",
+                                  indexName,
+                                  Arrays.stream(targetColumnNames).collect(Collectors.joining(",")),
+                                  StubIndex.class.getName()));
+        assertIndexCreated(indexName, targetColumnNames);
+    }
+
+    private void assertIndexCreated(String name, String... targetColumnNames)
+    {
+        assertIndexCreated(name, new HashMap<>(), targetColumnNames);
+    }
+
+    private void assertIndexCreated(String name, Map<String, String> options, String... targetColumnNames)
+    {
+        List<IndexTarget> targets = Arrays.stream(targetColumnNames)
+                                          .map(s -> new IndexTarget(ColumnIdentifier.getInterned(s, true),
+                                                                    IndexTarget.Type.VALUES))
+                                          .collect(Collectors.toList());
+        assertIndexCreated(name, options, targets);
+    }
+
+    private void assertIndexCreated(String name, Map<String, String> options, List<IndexTarget> targets)
+    {
+        // all tests here use StubIndex as the custom index class,
+        // so add that to the map of options
+        options.put(CUSTOM_INDEX_OPTION_NAME, StubIndex.class.getName());
+        CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+        IndexMetadata expected = IndexMetadata.fromIndexTargets(cfm, targets, name, IndexMetadata.Kind.CUSTOM, options);
+        Indexes indexes = getCurrentColumnFamilyStore().metadata.getIndexes();
+        for (IndexMetadata actual : indexes)
+            if (actual.equals(expected))
+                return;
+
+        fail(String.format("Index %s not found in CFMetaData", expected));
+    }
+
+    private static IndexTarget indexTarget(String name, IndexTarget.Type type)
+    {
+        return new IndexTarget(ColumnIdentifier.getInterned(name, true), type);
+    }
+
+    public static final class CountMetadataReloadsIndex extends StubIndex
+    {
+        private final AtomicInteger reloads = new AtomicInteger(0);
+
+        public CountMetadataReloadsIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public void reset()
+        {
+            super.reset();
+            reloads.set(0);
+        }
+
+        public Callable<?> getMetadataReloadTask(IndexMetadata indexMetadata)
+        {
+            return reloads::incrementAndGet;
+        }
+    }
+
+    public static final class IndexIncludedInBuild extends StubIndex
+    {
+        public IndexIncludedInBuild(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public boolean shouldBuildBlocking()
+        {
+            return true;
+        }
+    }
+
+    public static final class UTF8ExpressionIndex extends StubIndex
+    {
+        public UTF8ExpressionIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public AbstractType<?> customExpressionValueType()
+        {
+            return UTF8Type.instance;
+        }
+    }
+
+    public static final class Int32ExpressionIndex extends StubIndex
+    {
+        public Int32ExpressionIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public AbstractType<?> customExpressionValueType()
+        {
+            return Int32Type.instance;
+        }
+    }
+
+    public static final class SettableSelectivityIndex extends StubIndex
+    {
+        private int searchersProvided = 0;
+        private long estimatedResultRows = 0;
+
+        public SettableSelectivityIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public void setEstimatedResultRows(long estimate)
+        {
+            estimatedResultRows = estimate;
+        }
+
+        public long getEstimatedResultRows()
+        {
+            return estimatedResultRows;
+        }
+
+        public Searcher searcherFor(ReadCommand command)
+        {
+                searchersProvided++;
+                return super.searcherFor(command);
+        }
+    }
+
+    public static final class IndexExcludedFromBuild extends StubIndex
+    {
+        public IndexExcludedFromBuild(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public boolean shouldBuildBlocking()
+        {
+            return false;
+        }
+    }
+
+    public static final class NoCustomExpressionsIndex extends StubIndex
+    {
+        public NoCustomExpressionsIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public AbstractType<?> customExpressionValueType()
+        {
+            return null;
+        }
+    }
+
+    public static final class AlwaysRejectIndex extends StubIndex
+    {
+        public AlwaysRejectIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public void validate(ReadCommand command) throws InvalidRequestException
+        {
+            throw new InvalidRequestException("None shall pass");
+        }
+
+        public Searcher searcherFor(ReadCommand command)
+        {
+            throw new InvalidRequestException("None shall pass (though I'd have expected to fail faster)");
+        }
+    }
+
+    public static final class IndexWithValidateOptions extends StubIndex
+    {
+        public static Map<String, String> options;
+
+        public IndexWithValidateOptions(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public static Map<String, String> validateOptions(Map<String, String> options)
+        {
+            IndexWithValidateOptions.options = options;
+            return new HashMap<>();
+        }
+    }
+
+    public static final class IndexWithOverloadedValidateOptions extends StubIndex
+    {
+        public static CFMetaData cfm;
+        public static Map<String, String> options;
+
+        public IndexWithOverloadedValidateOptions(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        public static Map<String, String> validateOptions(Map<String, String> options, CFMetaData cfm)
+        {
+            IndexWithOverloadedValidateOptions.options = options;
+            IndexWithOverloadedValidateOptions.cfm = cfm;
+            return new HashMap<>();
+        }
+    }
+
+    public static final class SimulateConcurrentFlushingIndex extends StubIndex
+    {
+        ColumnFamilyStore baseCfs;
+        AtomicInteger indexedRowCount = new AtomicInteger(0);
+
+        OpOrder.Group readOrderingAtStart = null;
+        OpOrder.Group readOrderingAtFinish = null;
+        Set<OpOrder.Group> writeGroups = new HashSet<>();
+        List<OpOrder.Barrier> readBarriers = new ArrayList<>();
+        List<OpOrder.Barrier> writeBarriers = new ArrayList<>();
+
+        static final int ROWS_IN_PARTITION = 1000;
+
+        public SimulateConcurrentFlushingIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+            this.baseCfs = baseCfs;
+        }
+
+        // When indexing an entire partition 2 potential problems can be caused by
+        // whilst holding a single read & a single write OpOrder.Group.
+        // * By holding a write group too long, flushes are blocked
+        // * Holding a read group for too long prevents the memory from flushed memtables
+        //   from being reclaimed.
+        // See CASSANDRA-12796 for details.
+        // To test that the index builder pages through a large partition, using
+        // finer grained OpOrder.Groups we write a "large" partition to disk, then
+        // kick off an index build on it, using this indexer.
+        // To simulate concurrent flush activity, we periodically issue barriers on
+        // the current read and write groups.
+        // When we're done indexing the partition, the test checks the states of the
+        // various OpOrder.Groups, which it can obtain from this index.
+
+        public Indexer indexerFor(final DecoratedKey key,
+                                  PartitionColumns columns,
+                                  int nowInSec,
+                                  OpOrder.Group opGroup,
+                                  IndexTransaction.Type transactionType)
+        {
+            if (readOrderingAtStart == null)
+                readOrderingAtStart = baseCfs.readOrdering.getCurrent();
+
+            writeGroups.add(opGroup);
+
+            return new Indexer()
+            {
+                public void begin()
+                {
+                    // to simulate other activity on base table during indexing, issue
+                    // barriers on the read and write orderings. This is analogous to
+                    // what happens when other flushes are being processed during the
+                    // indexing of a partition
+                    OpOrder.Barrier readBarrier = baseCfs.readOrdering.newBarrier();
+                    readBarrier.issue();
+                    readBarriers.add(readBarrier);
+                    OpOrder.Barrier writeBarrier = Keyspace.writeOrder.newBarrier();
+                    writeBarrier.issue();
+                    writeBarriers.add(writeBarrier);
+                }
+
+                public void insertRow(Row row)
+                {
+                    indexedRowCount.incrementAndGet();
+                }
+
+                public void finish()
+                {
+                    // we've indexed all rows in the target partition,
+                    // grab the read OpOrder.Group for the base CFS so
+                    // we can compare it with the starting group
+                    if (indexedRowCount.get() < ROWS_IN_PARTITION)
+                        readOrderingAtFinish = baseCfs.readOrdering.getCurrent();
+                }
+
+                public void partitionDelete(DeletionTime deletionTime) { }
+
+                public void rangeTombstone(RangeTombstone tombstone) { }
+
+                public void updateRow(Row oldRowData, Row newRowData) { }
+
+                public void removeRow(Row row) { }
+
+            };
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/index/StubIndex.java b/test/unit/org/apache/cassandra/index/StubIndex.java
new file mode 100644
index 0000000..569ce97
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/StubIndex.java

@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index;
+
+import java.util.*;
+import java.util.concurrent.Callable;
+import java.util.function.BiFunction;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+/**
+ * Basic custom index implementation for testing.
+ * During indexing by default it just records the updates for later inspection.
+ * At query time, the Searcher implementation simply performs a local scan of the entire target table
+ * with no further filtering applied.
+ */
+public class StubIndex implements Index
+{
+    public volatile int beginCalls;
+    public volatile int finishCalls;
+    public List<DeletionTime> partitionDeletions = new ArrayList<>();
+    public List<RangeTombstone> rangeTombstones = new ArrayList<>();
+    public List<Row> rowsInserted = new ArrayList<>();
+    public List<Row> rowsDeleted = new ArrayList<>();
+    public List<Pair<Row,Row>> rowsUpdated = new ArrayList<>();
+    private IndexMetadata indexMetadata;
+    private ColumnFamilyStore baseCfs;
+
+    public void reset()
+    {
+        rowsInserted.clear();
+        rowsDeleted.clear();
+        rowsUpdated.clear();
+        partitionDeletions.clear();
+        rangeTombstones.clear();
+    }
+
+    public StubIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+    {
+        this.baseCfs = baseCfs;
+        this.indexMetadata = metadata;
+    }
+
+    public boolean shouldBuildBlocking()
+    {
+        return false;
+    }
+
+    public boolean dependsOn(ColumnDefinition column)
+    {
+        return false;
+    }
+
+    public boolean supportsExpression(ColumnDefinition column, Operator operator)
+    {
+        return operator == Operator.EQ;
+    }
+
+    public AbstractType<?> customExpressionValueType()
+    {
+        return UTF8Type.instance;
+    }
+
+    public RowFilter getPostIndexQueryFilter(RowFilter filter)
+    {
+        return filter;
+    }
+
+    public Indexer indexerFor(final DecoratedKey key,
+                              PartitionColumns columns,
+                              int nowInSec,
+                              OpOrder.Group opGroup,
+                              IndexTransaction.Type transactionType)
+    {
+        return new Indexer()
+        {
+            public void begin()
+            {
+                beginCalls++;
+            }
+
+            public void partitionDelete(DeletionTime deletionTime)
+            {
+                partitionDeletions.add(deletionTime);
+            }
+
+            public void rangeTombstone(RangeTombstone tombstone)
+            {
+                rangeTombstones.add(tombstone);
+            }
+
+            public void insertRow(Row row)
+            {
+                rowsInserted.add(row);
+            }
+
+            public void removeRow(Row row)
+            {
+                rowsDeleted.add(row);
+            }
+
+            public void updateRow(Row oldRowData, Row newRowData)
+            {
+                rowsUpdated.add(Pair.create(oldRowData, newRowData));
+            }
+
+            public void finish()
+            {
+                finishCalls++;
+            }
+        };
+    }
+
+    public Callable<?> getInitializationTask()
+    {
+        return null;
+    }
+
+    public IndexMetadata getIndexMetadata()
+    {
+        return indexMetadata;
+    }
+
+    public void register(IndexRegistry registry){
+        registry.registerIndex(this);
+    }
+
+    public Optional<ColumnFamilyStore> getBackingTable()
+    {
+        return Optional.empty();
+    }
+
+    public Collection<ColumnDefinition> getIndexedColumns()
+    {
+        return Collections.emptySet();
+    }
+
+    public Callable<?> getBlockingFlushTask()
+    {
+        return null;
+    }
+
+    public Callable<?> getTruncateTask(long truncatedAt)
+    {
+        return null;
+    }
+
+    public Callable<?> getInvalidateTask()
+    {
+        return null;
+    }
+
+    public Callable<?> getMetadataReloadTask(IndexMetadata indexMetadata)
+    {
+        return null;
+    }
+
+    public long getEstimatedResultRows()
+    {
+        return 0;
+    }
+
+    public void validate(PartitionUpdate update) throws InvalidRequestException
+    {
+
+    }
+
+    public Searcher searcherFor(final ReadCommand command)
+    {
+        return (orderGroup) -> Util.executeLocally((PartitionRangeReadCommand)command, baseCfs, orderGroup);
+    }
+
+    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand readCommand)
+    {
+        return (iter, command) -> iter;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java b/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java
new file mode 100644
index 0000000..36c0249
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java

@@ -0,0 +1,858 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.internal;
+
+import java.nio.ByteBuffer;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.*;
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.Util.throwAssert;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Smoke tests of built-in secondary index implementations
+ */
+public class CassandraIndexTest extends CQLTester
+{
+    @Test
+    public void indexOnRegularColumn() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c));")
+                        .target("v")
+                        .withFirstRow(row(0, 0, 0))
+                        .withSecondRow(row(1, 1, 1))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("v=0")
+                        .secondQueryExpression("v=1")
+                        .updateExpression("SET v=2")
+                        .postUpdateQueryExpression("v=2")
+                        .run();
+    }
+
+    @Test
+    public void testIndexOnPartitionKeyWithPartitionWithoutRows() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk1 int, pk2 int, c int, s int static, v int, PRIMARY KEY((pk1, pk2), c))");
+        createIndex("CREATE INDEX ON %s (pk2)");
+
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 1, 9, 1);
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 1, 1, 2, 9, 2);
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 3, 1, 1, 9, 1);
+        execute("INSERT INTO %s (pk1, pk2, c, s, v) VALUES (?, ?, ?, ?, ?)", 4, 1, 1, 9, 1);
+        flush();
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk2 = ?", 1),
+                                row(1, 1, 1, 9, 1),
+                                row(1, 1, 2, 9, 2),
+                                row(3, 1, 1, 9, 1),
+                                row(4, 1, 1, 9, 1));
+
+        execute("DELETE FROM %s WHERE pk1 = ? AND pk2 = ? AND c = ?", 3, 1, 1);
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk2 = ?", 1),
+                                row(1, 1, 1, 9, 1),
+                                row(1, 1, 2, 9, 2),
+                                row(3, 1, null, 9, null),
+                                row(4, 1, 1, 9, 1));
+    }
+
+    @Test
+    public void indexOnFirstClusteringColumn() throws Throwable
+    {
+        // No update allowed on primary key columns, so this script has no update expression
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c));")
+                        .target("c")
+                        .withFirstRow(row(0, 0, 0))
+                        .withSecondRow(row(1, 1, 1))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("c=0")
+                        .secondQueryExpression("c=1")
+                        .run();
+    }
+
+    @Test
+    public void indexOnSecondClusteringColumn() throws Throwable
+    {
+        // No update allowed on primary key columns, so this script has no update expression
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c1 int, c2 int, v int, PRIMARY KEY (k, c1, c2));")
+                        .target("c2")
+                        .withFirstRow(row(0, 0, 0, 0))
+                        .withSecondRow(row(1, 1, 1, 1))
+                        .missingIndexMessage(String.format("PRIMARY KEY column \"%s\" cannot be restricted " +
+                                                           "as preceding column \"%s\" is not restricted",
+                                                           "c2", "c1"))
+                        .firstQueryExpression("c2=0")
+                        .secondQueryExpression("c2=1")
+                        .run();
+    }
+
+    @Test
+    public void indexOnFirstPartitionKeyColumn() throws Throwable
+    {
+        // No update allowed on primary key columns, so this script has no update expression
+        new TestScript().tableDefinition("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, v int, PRIMARY KEY ((k1, k2), c1, c2));")
+                        .target("k1")
+                        .withFirstRow(row(0, 0, 0, 0, 0))
+                        .withSecondRow(row(1, 1, 1, 1, 1))
+                        .missingIndexMessage("Partition key parts: k2 must be restricted as other parts are")
+                        .firstQueryExpression("k1=0")
+                        .secondQueryExpression("k1=1")
+                        .run();
+    }
+
+    @Test
+    public void indexOnSecondPartitionKeyColumn() throws Throwable
+    {
+        // No update allowed on primary key columns, so this script has no update expression
+        new TestScript().tableDefinition("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, v int, PRIMARY KEY ((k1, k2), c1, c2));")
+                        .target("k2")
+                        .withFirstRow(row(0, 0, 0, 0, 0))
+                        .withSecondRow(row(1, 1, 1, 1, 1))
+                        .missingIndexMessage("Partition key parts: k1 must be restricted as other parts are")
+                        .firstQueryExpression("k2=0")
+                        .secondQueryExpression("k2=1")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenListWithReplaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, l list<int>, PRIMARY KEY (k, c));")
+                        .target("l")
+                        .withFirstRow(row(0, 0, Lists.newArrayList(10, 20, 30)))
+                        .withSecondRow(row(1, 1, Lists.newArrayList(11, 21, 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("l CONTAINS 10")
+                        .secondQueryExpression("l CONTAINS 11")
+                        .updateExpression("SET l = [40, 50, 60]")
+                        .postUpdateQueryExpression("l CONTAINS 40")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenListWithInPlaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, l list<int>, PRIMARY KEY (k, c));")
+                        .target("l")
+                        .withFirstRow(row(0, 0, Lists.newArrayList(10, 20, 30)))
+                        .withSecondRow(row(1, 1, Lists.newArrayList(11, 21, 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("l CONTAINS 10")
+                        .secondQueryExpression("l CONTAINS 11")
+                        .updateExpression("SET l = l - [10]")
+                        .postUpdateQueryExpression("l CONTAINS 20")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenSetWithReplaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, s set<int>, PRIMARY KEY (k, c));")
+                        .target("s")
+                        .withFirstRow(row(0, 0, Sets.newHashSet(10, 20, 30)))
+                        .withSecondRow(row(1, 1, Sets.newHashSet(11, 21, 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("s CONTAINS 10")
+                        .secondQueryExpression("s CONTAINS 11")
+                        .updateExpression("SET s = {40, 50, 60}")
+                        .postUpdateQueryExpression("s CONTAINS 40")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenSetWithInPlaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, s set<int>, PRIMARY KEY (k, c));")
+                        .target("s")
+                        .withFirstRow(row(0, 0, Sets.newHashSet(10, 20, 30)))
+                        .withSecondRow(row(1, 1, Sets.newHashSet(11, 21, 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("s CONTAINS 10")
+                        .secondQueryExpression("s CONTAINS 11")
+                        .updateExpression("SET s = s - {10}")
+                        .postUpdateQueryExpression("s CONTAINS 20")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenMapValuesWithReplaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m map<text,int>, PRIMARY KEY (k, c));")
+                        .target("m")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m CONTAINS 10")
+                        .secondQueryExpression("m CONTAINS 11")
+                        .updateExpression("SET m = {'x':40, 'y':50, 'z':60}")
+                        .postUpdateQueryExpression("m CONTAINS 40")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenMapValuesWithInPlaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m map<text,int>, PRIMARY KEY (k, c));")
+                        .target("m")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m CONTAINS 10")
+                        .secondQueryExpression("m CONTAINS 11")
+                        .updateExpression("SET m['a'] = 40")
+                        .postUpdateQueryExpression("m CONTAINS 40")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenMapKeysWithReplaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m map<text,int>, PRIMARY KEY (k, c));")
+                        .target("keys(m)")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m CONTAINS KEY 'a'")
+                        .secondQueryExpression("m CONTAINS KEY 'd'")
+                        .updateExpression("SET m = {'x':40, 'y':50, 'z':60}")
+                        .postUpdateQueryExpression("m CONTAINS KEY 'x'")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenMapKeysWithInPlaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m map<text,int>, PRIMARY KEY (k, c));")
+                        .target("keys(m)")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m CONTAINS KEY 'a'")
+                        .secondQueryExpression("m CONTAINS KEY 'd'")
+                        .updateExpression("SET m['a'] = NULL")
+                        .postUpdateQueryExpression("m CONTAINS KEY 'b'")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenMapEntriesWithReplaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m map<text,int>, PRIMARY KEY (k, c));")
+                        .target("entries(m)")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m['a'] = 10")
+                        .secondQueryExpression("m['d'] = 11")
+                        .updateExpression("SET m = {'x':40, 'y':50, 'z':60}")
+                        .postUpdateQueryExpression("m['x'] = 40")
+                        .run();
+    }
+
+    @Test
+    public void indexOnNonFrozenMapEntriesWithInPlaceOperation() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m map<text,int>, PRIMARY KEY (k, c));")
+                        .target("entries(m)")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m['a'] = 10")
+                        .secondQueryExpression("m['d'] = 11")
+                        .updateExpression("SET m['a'] = 40")
+                        .postUpdateQueryExpression("m['a'] = 40")
+                        .run();
+    }
+
+    @Test
+    public void indexOnFrozenList() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, l frozen<list<int>>, PRIMARY KEY (k, c));")
+                        .target("full(l)")
+                        .withFirstRow(row(0, 0, Lists.newArrayList(10, 20, 30)))
+                        .withSecondRow(row(1, 1, Lists.newArrayList(11, 21, 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("l = [10, 20, 30]")
+                        .secondQueryExpression("l = [11, 21, 31]")
+                        .updateExpression("SET l = [40, 50, 60]")
+                        .postUpdateQueryExpression("l = [40, 50, 60]")
+                        .run();
+    }
+
+    @Test
+    public void indexOnFrozenSet() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, s frozen<set<int>>, PRIMARY KEY (k, c));")
+                        .target("full(s)")
+                        .withFirstRow(row(0, 0, Sets.newHashSet(10, 20, 30)))
+                        .withSecondRow(row(1, 1, Sets.newHashSet(11, 21, 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("s = {10, 20, 30}")
+                        .secondQueryExpression("s = {11, 21, 31}")
+                        .updateExpression("SET s = {40, 50, 60}")
+                        .postUpdateQueryExpression("s = {40, 50, 60}")
+                        .run();
+    }
+
+    @Test
+    public void indexOnFrozenMap() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, c int, m frozen<map<text,int>>, PRIMARY KEY (k, c));")
+                        .target("full(m)")
+                        .withFirstRow(row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30)))
+                        .withSecondRow(row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 31)))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("m = {'a':10, 'b':20, 'c':30}")
+                        .secondQueryExpression("m = {'d':11, 'e':21, 'f':31}")
+                        .updateExpression("SET m = {'x':40, 'y':50, 'z':60}")
+                        .postUpdateQueryExpression("m = {'x':40, 'y':50, 'z':60}")
+                        .run();
+    }
+
+    @Test
+    public void indexOnRegularColumnWithCompactStorage() throws Throwable
+    {
+        new TestScript().tableDefinition("CREATE TABLE %s (k int, v int, PRIMARY KEY (k)) WITH COMPACT STORAGE;")
+                        .target("v")
+                        .withFirstRow(row(0, 0))
+                        .withSecondRow(row(1,1))
+                        .missingIndexMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE)
+                        .firstQueryExpression("v=0")
+                        .secondQueryExpression("v=1")
+                        .updateExpression("SET v=2")
+                        .postUpdateQueryExpression("v=2")
+                        .run();
+    }
+
+    @Test
+    public void testIndexOnCompactTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, v int, PRIMARY KEY (k)) WITH COMPACT STORAGE;");
+        assertInvalidMessage("No column definition found for column value",
+                             "CREATE INDEX idx_value ON %s(value)");
+    }
+
+    @Test
+    public void indexOnClusteringColumnWithoutRegularColumns() throws Throwable
+    {
+        Object[] row1 = row("k0", "c0");
+        Object[] row2 = row("k0", "c1");
+        Object[] row3 = row("k1", "c0");
+        Object[] row4 = row("k1", "c1");
+        String tableName = createTable("CREATE TABLE %s (k text, c text, PRIMARY KEY(k, c))");
+        createIndex("CREATE INDEX no_regulars_idx ON %s(c)");
+
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", row1);
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", row2);
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", row3);
+        execute("INSERT INTO %s (k, c) VALUES (?, ?)", row4);
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c = ?", "c0"), row1, row3);
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c = ?", "c1"), row2, row4);
+        assertEmpty(execute("SELECT * FROM %s WHERE c = ?", "c3"));
+
+        dropIndex("DROP INDEX %s.no_regulars_idx");
+        createIndex("CREATE INDEX no_regulars_idx ON %s(c)");
+        assertTrue(waitForIndex(keyspace(), tableName, "no_regulars_idx"));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c = ?", "c0"), row1, row3);
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c = ?", "c1"), row2, row4);
+        assertEmpty(execute("SELECT * FROM %s WHERE c = ?", "c3"));
+    }
+
+    @Test
+    public void createIndexesOnMultipleMapDimensions() throws Throwable
+    {
+        Object[] row1 = row(0, 0, ImmutableMap.of("a", 10, "b", 20, "c", 30));
+        Object[] row2 = row(1, 1, ImmutableMap.of("d", 11, "e", 21, "f", 32));
+        createTable("CREATE TABLE %s (k int, c int, m map<text, int>, PRIMARY KEY(k, c))");
+        createIndex("CREATE INDEX ON %s(keys(m))");
+        createIndex("CREATE INDEX ON %s(m)");
+
+        execute("INSERT INTO %s (k, c, m) VALUES (?, ?, ?)", row1);
+        execute("INSERT INTO %s (k, c, m) VALUES (?, ?, ?)", row2);
+
+        assertRows(execute("SELECT * FROM %s WHERE m CONTAINS KEY 'a'"), row1);
+        assertRows(execute("SELECT * FROM %s WHERE m CONTAINS 20"), row1);
+        assertRows(execute("SELECT * FROM %s WHERE m CONTAINS KEY 'f'"), row2);
+        assertRows(execute("SELECT * FROM %s WHERE m CONTAINS 32"), row2);
+    }
+
+    @Test
+    public void insertWithTombstoneRemovesEntryFromIndex() throws Throwable
+    {
+        int key = 0;
+        int indexedValue = 99;
+        createTable("CREATE TABLE %s (k int, v int, PRIMARY KEY(k))");
+        createIndex("CREATE INDEX ON %s(v)");
+        execute("INSERT INTO %s (k, v) VALUES (?, ?)", key, indexedValue);
+
+        assertRows(execute("SELECT * FROM %s WHERE v = ?", indexedValue), row(key, indexedValue));
+        execute("DELETE v FROM %s WHERE k=?", key);
+        assertEmpty(execute("SELECT * FROM %s WHERE v = ?", indexedValue));
+    }
+
+    @Test
+    public void updateTTLOnIndexedClusteringValue() throws Throwable
+    {
+        int basePk = 1;
+        int indexedVal = 2;
+        int initialTtl = 3600;
+        createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY(k,c))");
+        createIndex("CREATE INDEX ON %s(c)");
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, 0) USING TTL ?", basePk, indexedVal, initialTtl);
+        ColumnFamilyStore baseCfs = getCurrentColumnFamilyStore();
+        ColumnFamilyStore indexCfs = baseCfs.indexManager.listIndexes()
+                                                         .iterator()
+                                                         .next()
+                                                         .getBackingTable()
+                                                         .orElseThrow(throwAssert("No index found"));
+        assertIndexRowTtl(indexCfs, indexedVal, initialTtl);
+
+        int updatedTtl = 9999;
+        execute("INSERT INTO %s (k, c ,v) VALUES (?, ?, 0) USING TTL ?", basePk, indexedVal, updatedTtl);
+
+        assertIndexRowTtl(indexCfs, indexedVal, updatedTtl);
+    }
+
+    @Test
+    public void indexBatchStatements() throws Throwable
+    {
+        // see CASSANDRA-10536
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+        createIndex("CREATE INDEX ON %s(c)");
+
+        // Multi partition batch
+        execute("BEGIN BATCH\n" +
+                "UPDATE %1$s SET c = 0 WHERE a = 0 AND b = 0;\n" +
+                "UPDATE %1$s SET c = 1 WHERE a = 1 AND b = 1;\n" +
+                "APPLY BATCH");
+        assertRows(execute("SELECT * FROM %s WHERE c = 0"), row(0, 0, 0));
+        assertRows(execute("SELECT * FROM %s WHERE c = 1"), row(1, 1, 1));
+
+        // Single Partition batch
+        execute("BEGIN BATCH\n" +
+                "UPDATE %1$s SET c = 2 WHERE a = 2 AND b = 0;\n" +
+                "UPDATE %1$s SET c = 3 WHERE a = 2 AND b = 1;\n" +
+                "APPLY BATCH");
+        assertRows(execute("SELECT * FROM %s WHERE c = 2"), row(2, 0, 2));
+        assertRows(execute("SELECT * FROM %s WHERE c = 3"), row(2, 1, 3));
+    }
+
+    @Test
+    public void indexStatementsWithConditions() throws Throwable
+    {
+        // see CASSANDRA-10536
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+        createIndex("CREATE INDEX ON %s(c)");
+
+        execute("INSERT INTO %s (a, b, c) VALUES (0, 0, 0) IF NOT EXISTS");
+        assertRows(execute("SELECT * FROM %s WHERE c = 0"), row(0, 0, 0));
+
+        execute("INSERT INTO %s (a, b, c) VALUES (0, 0, 1) IF NOT EXISTS");
+        assertEmpty(execute("SELECT * FROM %s WHERE c = 1"));
+
+        execute("UPDATE %s SET c = 1 WHERE a = 0 AND b =0 IF c = 0");
+        assertRows(execute("SELECT * FROM %s WHERE c = 1"), row(0, 0, 1));
+        assertEmpty(execute("SELECT * FROM %s WHERE c = 0"));
+
+        execute("DELETE FROM %s WHERE a = 0 AND b = 0 IF c = 0");
+        assertRows(execute("SELECT * FROM %s WHERE c = 1"), row(0, 0, 1));
+
+        execute("DELETE FROM %s WHERE a = 0 AND b = 0 IF c = 1");
+        assertEmpty(execute("SELECT * FROM %s WHERE c = 1"));
+
+        execute("BEGIN BATCH\n" +
+                "INSERT INTO %1$s (a, b, c) VALUES (2, 2, 2) IF NOT EXISTS;\n" +
+                "INSERT INTO %1$s (a, b, c) VALUES (2, 3, 3)\n" +
+                "APPLY BATCH");
+        assertRows(execute("SELECT * FROM %s WHERE c = 2"), row(2, 2, 2));
+        assertRows(execute("SELECT * FROM %s WHERE c = 3"), row(2, 3, 3));
+    }
+
+    @Test
+    public void indexCorrectlyMarkedAsBuildAndRemoved() throws Throwable
+    {
+        String selectBuiltIndexesQuery = String.format("SELECT * FROM %s.\"%s\"",
+                                                       SystemKeyspace.NAME,
+                                                       SystemKeyspace.BUILT_INDEXES);
+        UntypedResultSet rs = execute(selectBuiltIndexesQuery);
+        int initialSize = rs.size();
+
+        String indexName = "build_remove_test_idx";
+        String tableName = createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
+        createIndex(String.format("CREATE INDEX %s ON %%s(c)", indexName));
+        waitForIndex(KEYSPACE, tableName, indexName);
+        // check that there are no other rows in the built indexes table
+        rs = execute(selectBuiltIndexesQuery);
+        int sizeAfterBuild = rs.size();
+        assertRowsIgnoringOrderAndExtra(rs, row(KEYSPACE, indexName));
+
+        // rebuild the index and verify the built status table
+        getCurrentColumnFamilyStore().rebuildSecondaryIndex(indexName);
+        waitForIndex(KEYSPACE, tableName, indexName);
+
+        // check that there are no other rows in the built indexes table
+        rs = execute(selectBuiltIndexesQuery);
+        assertEquals(sizeAfterBuild, rs.size());
+        assertRowsIgnoringOrderAndExtra(rs, row(KEYSPACE, indexName));
+
+        // check that dropping the index removes it from the built indexes table
+        dropIndex("DROP INDEX %s." + indexName);
+        rs = execute(selectBuiltIndexesQuery);
+        assertEquals(initialSize, rs.size());
+        rs.forEach(row -> assertFalse(row.getString("table_name").equals(KEYSPACE)  // table_name is actually keyspace
+                                      && row.getString("index_name").equals(indexName)));
+    }
+
+    // this is slightly annoying, but we cannot read rows from the methods in Util as
+    // ReadCommand#executeInternal uses metadata retrieved via the cfId, which the index
+    // CFS inherits from the base CFS. This has the 'wrong' partitioner (the index table
+    // uses LocalPartition, the base table a real one, so we cannot read from the index
+    // table with executeInternal
+    private void assertIndexRowTtl(ColumnFamilyStore indexCfs, int indexedValue, int ttl) throws Throwable
+    {
+        DecoratedKey indexKey = indexCfs.decorateKey(ByteBufferUtil.bytes(indexedValue));
+        ClusteringIndexFilter filter = new ClusteringIndexSliceFilter(Slices.with(indexCfs.metadata.comparator,
+                                                                                  Slice.ALL),
+                                                                      false);
+        SinglePartitionReadCommand command = SinglePartitionReadCommand.create(indexCfs.metadata,
+                                                                               FBUtilities.nowInSeconds(),
+                                                                               indexKey,
+                                                                               ColumnFilter.all(indexCfs.metadata),
+                                                                               filter);
+        try (ReadOrderGroup orderGroup = ReadOrderGroup.forCommand(command);
+             UnfilteredRowIterator iter = command.queryMemtableAndDisk(indexCfs, orderGroup.indexReadOpOrderGroup()))
+        {
+            while( iter.hasNext())
+            {
+                Unfiltered unfiltered = iter.next();
+                assert (unfiltered.isRow());
+                Row indexRow = (Row) unfiltered;
+                assertEquals(ttl, indexRow.primaryKeyLivenessInfo().ttl());
+            }
+        }
+    }
+
+    // Used in order to generate the unique names for indexes
+    private static int indexCounter;
+
+    private class TestScript
+    {
+        String tableDefinition;
+        String indexName;
+        String indexTarget;
+        String queryExpression1;
+        String queryExpression2;
+        String updateExpression;
+        String postUpdateQueryExpression;
+        String missingIndexMessage;
+
+        Object[] firstRow;
+        Object[] secondRow;
+
+        TestScript target(String indexTarget)
+        {
+            this.indexTarget = indexTarget;
+            return this;
+        }
+
+        TestScript tableDefinition(String tableDefinition)
+        {
+            this.tableDefinition = tableDefinition;
+            return this;
+        }
+
+        TestScript withFirstRow(Object[] row)
+        {
+            this.firstRow = row;
+            return this;
+        }
+
+        TestScript withSecondRow(Object[] row)
+        {
+            this.secondRow = row;
+            return this;
+        }
+
+        TestScript firstQueryExpression(String queryExpression)
+        {
+            queryExpression1 = queryExpression;
+            return this;
+        }
+
+        TestScript secondQueryExpression(String queryExpression)
+        {
+            queryExpression2 = queryExpression;
+            return this;
+        }
+
+        TestScript updateExpression(String updateExpression)
+        {
+            this.updateExpression = updateExpression;
+            return this;
+        }
+
+        TestScript postUpdateQueryExpression(String queryExpression)
+        {
+            this.postUpdateQueryExpression = queryExpression;
+            return this;
+        }
+
+        TestScript missingIndexMessage(String missingIndexMessage)
+        {
+            this.missingIndexMessage = missingIndexMessage;
+            return this;
+        }
+
+        void run() throws Throwable
+        {
+            // check minimum required setup
+            assertNotNull(indexTarget);
+            assertNotNull(queryExpression1);
+            assertNotNull(queryExpression2);
+            assertNotNull(firstRow);
+            assertNotNull(secondRow);
+            assertNotNull(tableDefinition);
+            if (updateExpression != null)
+                assertNotNull(postUpdateQueryExpression);
+
+            // first, create the table as we need the CFMetaData to build the other cql statements
+            String tableName = createTable(tableDefinition);
+
+            indexName = String.format("index_%s_%d", tableName, indexCounter++);
+
+            // now setup the cql statements the test will run through. Some are dependent on
+            // the table definition, others are not.
+            String createIndexCql = String.format("CREATE INDEX %s ON %%s(%s)", indexName, indexTarget);
+            String dropIndexCql = String.format("DROP INDEX %s.%s", KEYSPACE, indexName);
+
+            String selectFirstRowCql = String.format("SELECT * FROM %%s WHERE %s", queryExpression1);
+            String selectSecondRowCql = String.format("SELECT * FROM %%s WHERE %s", queryExpression2);
+            String insertCql = getInsertCql();
+            String deleteRowCql = getDeleteRowCql();
+            String deletePartitionCql = getDeletePartitionCql();
+
+            // everything setup, run through the smoke test
+            execute(insertCql, firstRow);
+            // before creating the index, check we cannot query on the indexed column
+            assertInvalidThrowMessage(missingIndexMessage, InvalidRequestException.class, selectFirstRowCql);
+
+            // create the index, wait for it to be built then validate the indexed value
+            createIndex(createIndexCql);
+            waitForIndexBuild();
+            assertRows(execute(selectFirstRowCql), firstRow);
+            assertEmpty(execute(selectSecondRowCql));
+
+            // flush and check again
+            flush();
+            assertRows(execute(selectFirstRowCql), firstRow);
+            assertEmpty(execute(selectSecondRowCql));
+
+            // force major compaction and query again
+            compact();
+            assertRows(execute(selectFirstRowCql), firstRow);
+            assertEmpty(execute(selectSecondRowCql));
+
+            // reload the base cfs and verify queries still work as expected
+            getCurrentColumnFamilyStore().reload();
+            assertRows(execute(selectFirstRowCql), firstRow);
+            assertEmpty(execute(selectSecondRowCql));
+
+            // drop the index and assert we can no longer query using it
+            execute(dropIndexCql);
+            assertInvalidThrowMessage(missingIndexMessage, InvalidRequestException.class, selectFirstRowCql);
+            // reload the base cfs and verify again
+            getCurrentColumnFamilyStore().reload();
+            assertInvalidThrowMessage(missingIndexMessage, InvalidRequestException.class, selectFirstRowCql);
+
+            flush();
+            compact();
+
+            // insert second row, re-create the index and query for both indexed values
+            execute(insertCql, secondRow);
+            createIndex(createIndexCql);
+            waitForIndexBuild();
+            assertRows(execute(selectFirstRowCql), firstRow);
+            assertRows(execute(selectSecondRowCql), secondRow);
+
+            // modify the indexed value in the first row, assert we can query by the new value & not the original one
+            // note: this is not possible if the indexed column is part of the primary key, so we skip it in that case
+            if (includesUpdate())
+            {
+                execute(getUpdateCql(), getPrimaryKeyValues(firstRow));
+                assertEmpty(execute(selectFirstRowCql));
+                // update the select statement to query using the updated value
+                selectFirstRowCql = String.format("SELECT * FROM %%s WHERE %s", postUpdateQueryExpression);
+                // we can't check the entire row b/c we've modified something.
+                // so we just check the primary key columns, as they cannot have changed
+                assertPrimaryKeyColumnsOnly(execute(selectFirstRowCql), firstRow);
+            }
+
+            // delete row, check that it cannot be found via index query
+            execute(deleteRowCql, getPrimaryKeyValues(firstRow));
+            assertEmpty(execute(selectFirstRowCql));
+
+            // delete partition, check that its rows cannot be retrieved via index query
+            execute(deletePartitionCql, getPartitionKeyValues(secondRow));
+            assertEmpty(execute(selectSecondRowCql));
+
+            // flush & compact, then verify that deleted values stay gone
+            flush();
+            compact();
+            assertEmpty(execute(selectFirstRowCql));
+            assertEmpty(execute(selectSecondRowCql));
+
+            // add back both rows, reset the select for the first row to query on the original value & verify
+            execute(insertCql, firstRow);
+            selectFirstRowCql = String.format("SELECT * FROM %%s WHERE %s", queryExpression1);
+            assertRows(execute(selectFirstRowCql), firstRow);
+            execute(insertCql, secondRow);
+            assertRows(execute(selectSecondRowCql), secondRow);
+
+            // flush and compact, verify again & we're done
+            flush();
+            compact();
+            assertRows(execute(selectFirstRowCql), firstRow);
+            assertRows(execute(selectSecondRowCql), secondRow);
+        }
+
+        private void assertPrimaryKeyColumnsOnly(UntypedResultSet resultSet, Object[] row)
+        {
+            assertFalse(resultSet.isEmpty());
+            CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+            int columnCount = cfm.partitionKeyColumns().size();
+            if (cfm.isCompound())
+                columnCount += cfm.clusteringColumns().size();
+            Object[] expected = copyValuesFromRow(row, columnCount);
+            assertArrayEquals(expected, copyValuesFromRow(getRows(resultSet)[0], columnCount));
+        }
+
+        private String getInsertCql()
+        {
+            CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+            String columns = Joiner.on(", ")
+                                   .join(Iterators.transform(cfm.allColumnsInSelectOrder(),
+                                                             (column) -> column.name.toString()));
+            String markers = Joiner.on(", ").join(Iterators.transform(cfm.allColumnsInSelectOrder(),
+                                                                      (column) -> {
+                                                                          return "?";
+                                                                      }));
+            return String.format("INSERT INTO %%s (%s) VALUES (%s)", columns, markers);
+        }
+
+        private String getUpdateCql()
+        {
+            String whereClause = getPrimaryKeyColumns().map(column -> column.name.toString() + "=?")
+                                                       .collect(Collectors.joining(" AND "));
+            return String.format("UPDATE %%s %s WHERE %s", updateExpression, whereClause);
+        }
+
+        private String getDeleteRowCql()
+        {
+            return getPrimaryKeyColumns().map(column -> column.name.toString() + "=?")
+                                         .collect(Collectors.joining(" AND ", "DELETE FROM %s WHERE ", ""));
+        }
+
+        private String getDeletePartitionCql()
+        {
+            CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+            return StreamSupport.stream(cfm.partitionKeyColumns().spliterator(), false)
+                                .map(column -> column.name.toString() + "=?")
+                                .collect(Collectors.joining(" AND ", "DELETE FROM %s WHERE ", ""));
+        }
+
+        private Stream<ColumnDefinition> getPrimaryKeyColumns()
+        {
+            CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+            if (cfm.isCompactTable())
+                return cfm.partitionKeyColumns().stream();
+            else
+                return Stream.concat(cfm.partitionKeyColumns().stream(), cfm.clusteringColumns().stream());
+        }
+
+        private Object[] getPrimaryKeyValues(Object[] row)
+        {
+            CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+            if (cfm.isCompactTable())
+                return getPartitionKeyValues(row);
+
+            return copyValuesFromRow(row, cfm.partitionKeyColumns().size() + cfm.clusteringColumns().size());
+        }
+
+        private Object[] getPartitionKeyValues(Object[] row)
+        {
+            CFMetaData cfm = getCurrentColumnFamilyStore().metadata;
+            return copyValuesFromRow(row, cfm.partitionKeyColumns().size());
+        }
+
+        private Object[] copyValuesFromRow(Object[] row, int length)
+        {
+            Object[] values = new Object[length];
+            System.arraycopy(row, 0, values, 0, length);
+            return values;
+        }
+
+        private boolean includesUpdate()
+        {
+            return updateExpression != null;
+        }
+
+        // Spin waiting for named index to be built
+        private void waitForIndexBuild() throws Throwable
+        {
+            ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+            long maxWaitMillis = 10000;
+            long startTime = System.currentTimeMillis();
+            while (! cfs.indexManager.getBuiltIndexNames().contains(indexName))
+            {
+                Thread.sleep(100);
+                long wait = System.currentTimeMillis() - startTime;
+                if (wait > maxWaitMillis)
+                    fail(String.format("Timed out waiting for index %s to build (%s)ms", indexName, wait));
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java b/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
new file mode 100644
index 0000000..2124abe
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java

@@ -0,0 +1,660 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.internal;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Future;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.statements.IndexTarget;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.index.SecondaryIndexBuilder;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.io.sstable.ReducingKeyIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.concurrent.Refs;
+
+import static org.apache.cassandra.index.internal.CassandraIndex.getFunctions;
+import static org.apache.cassandra.index.internal.CassandraIndex.indexCfsMetadata;
+import static org.apache.cassandra.index.internal.CassandraIndex.parseTarget;
+
+/**
+ * Clone of KeysIndex used in CassandraIndexTest#testCustomIndexWithCFS to verify
+ * behaviour of flushing CFS backed CUSTOM indexes
+ */
+public class CustomCassandraIndex implements Index
+{
+    private static final Logger logger = LoggerFactory.getLogger(CassandraIndex.class);
+
+    public final ColumnFamilyStore baseCfs;
+    protected IndexMetadata metadata;
+    protected ColumnFamilyStore indexCfs;
+    protected ColumnDefinition indexedColumn;
+    protected CassandraIndexFunctions functions;
+
+    public CustomCassandraIndex(ColumnFamilyStore baseCfs, IndexMetadata indexDef)
+    {
+        this.baseCfs = baseCfs;
+        setMetadata(indexDef);
+    }
+
+    /**
+     * Returns true if an index of this type can support search predicates of the form [column] OPERATOR [value]
+     * @param indexedColumn
+     * @param operator
+     * @return
+     */
+    protected boolean supportsOperator(ColumnDefinition indexedColumn, Operator operator)
+    {
+        return operator.equals(Operator.EQ);
+    }
+
+    public ColumnDefinition getIndexedColumn()
+    {
+        return indexedColumn;
+    }
+
+    public ClusteringComparator getIndexComparator()
+    {
+        return indexCfs.metadata.comparator;
+    }
+
+    public ColumnFamilyStore getIndexCfs()
+    {
+        return indexCfs;
+    }
+
+    public void register(IndexRegistry registry)
+    {
+        registry.registerIndex(this);
+    }
+
+    public Callable<?> getInitializationTask()
+    {
+        // if we're just linking in the index on an already-built index post-restart
+        // or if the table is empty we've nothing to do. Otherwise, submit for building via SecondaryIndexBuilder
+        return isBuilt() || baseCfs.isEmpty() ? null : getBuildIndexTask();
+    }
+
+    public IndexMetadata getIndexMetadata()
+    {
+        return metadata;
+    }
+
+    public Optional<ColumnFamilyStore> getBackingTable()
+    {
+        return indexCfs == null ? Optional.empty() : Optional.of(indexCfs);
+    }
+
+    public Callable<Void> getBlockingFlushTask()
+    {
+        return () -> {
+            indexCfs.forceBlockingFlush();
+            return null;
+        };
+    }
+
+    public Callable<?> getInvalidateTask()
+    {
+        return () -> {
+            invalidate();
+            return null;
+        };
+    }
+
+    public Callable<?> getMetadataReloadTask(IndexMetadata indexDef)
+    {
+        setMetadata(indexDef);
+        return () -> {
+            indexCfs.metadata.reloadIndexMetadataProperties(baseCfs.metadata);
+            indexCfs.reload();
+            return null;
+        };
+    }
+
+    private void setMetadata(IndexMetadata indexDef)
+    {
+        metadata = indexDef;
+        Pair<ColumnDefinition, IndexTarget.Type> target = parseTarget(baseCfs.metadata, indexDef);
+        functions = getFunctions(indexDef, target);
+        CFMetaData cfm = indexCfsMetadata(baseCfs.metadata, indexDef);
+        indexCfs = ColumnFamilyStore.createColumnFamilyStore(baseCfs.keyspace,
+                                                             cfm.cfName,
+                                                             cfm,
+                                                             baseCfs.getTracker().loadsstables);
+        indexedColumn = target.left;
+    }
+
+    public Callable<?> getTruncateTask(final long truncatedAt)
+    {
+        return () -> {
+            indexCfs.discardSSTables(truncatedAt);
+            return null;
+        };
+    }
+
+    public boolean shouldBuildBlocking()
+    {
+        return true;
+    }
+
+    public boolean dependsOn(ColumnDefinition column)
+    {
+        return column.equals(indexedColumn);
+    }
+
+    public boolean supportsExpression(ColumnDefinition column, Operator operator)
+    {
+        return indexedColumn.name.equals(column.name)
+               && supportsOperator(indexedColumn, operator);
+    }
+
+    public AbstractType<?> customExpressionValueType()
+    {
+        return null;
+    }
+
+    private boolean supportsExpression(RowFilter.Expression expression)
+    {
+        return supportsExpression(expression.column(), expression.operator());
+    }
+
+    public long getEstimatedResultRows()
+    {
+        return indexCfs.getMeanColumns();
+    }
+
+    /**
+     * No post processing of query results, just return them unchanged
+     */
+    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command)
+    {
+        return (partitionIterator, readCommand) -> partitionIterator;
+    }
+
+    public RowFilter getPostIndexQueryFilter(RowFilter filter)
+    {
+        return getTargetExpression(filter.getExpressions()).map(filter::without)
+                                                           .orElse(filter);
+    }
+
+    private Optional<RowFilter.Expression> getTargetExpression(List<RowFilter.Expression> expressions)
+    {
+        return expressions.stream().filter(this::supportsExpression).findFirst();
+    }
+
+    public Index.Searcher searcherFor(ReadCommand command)
+    {
+        return null;
+    }
+
+    public void validate(PartitionUpdate update) throws InvalidRequestException
+    {
+        switch (indexedColumn.kind)
+        {
+            case PARTITION_KEY:
+                validatePartitionKey(update.partitionKey());
+                break;
+            case CLUSTERING:
+                validateClusterings(update);
+                break;
+            case REGULAR:
+                validateRows(update);
+                break;
+            case STATIC:
+                validateRows(Collections.singleton(update.staticRow()));
+                break;
+        }
+    }
+
+    protected CBuilder buildIndexClusteringPrefix(ByteBuffer partitionKey,
+                                               ClusteringPrefix prefix,
+                                               CellPath path)
+    {
+        CBuilder builder = CBuilder.create(getIndexComparator());
+        builder.add(partitionKey);
+        return builder;
+    }
+
+    protected ByteBuffer getIndexedValue(ByteBuffer partitionKey,
+                                      Clustering clustering,
+                                      CellPath path, ByteBuffer cellValue)
+    {
+        return cellValue;
+    }
+
+    public IndexEntry decodeEntry(DecoratedKey indexedValue, Row indexEntry)
+    {
+        throw new UnsupportedOperationException("KEYS indexes do not use a specialized index entry format");
+    }
+
+    public boolean isStale(Row row, ByteBuffer indexValue, int nowInSec)
+    {
+        if (row == null)
+            return true;
+
+        Cell cell = row.getCell(indexedColumn);
+
+        return (cell == null
+             || !cell.isLive(nowInSec)
+             || indexedColumn.type.compare(indexValue, cell.value()) != 0);
+    }
+
+    public Indexer indexerFor(final DecoratedKey key,
+                              final PartitionColumns columns,
+                              final int nowInSec,
+                              final OpOrder.Group opGroup,
+                              final IndexTransaction.Type transactionType)
+    {
+        if (!isPrimaryKeyIndex() && !columns.contains(indexedColumn))
+            return null;
+
+        return new Indexer()
+        {
+            public void begin()
+            {
+            }
+
+            public void partitionDelete(DeletionTime deletionTime)
+            {
+            }
+
+            public void rangeTombstone(RangeTombstone tombstone)
+            {
+            }
+
+            public void insertRow(Row row)
+            {
+                if (isPrimaryKeyIndex())
+                {
+                    indexPrimaryKey(row.clustering(),
+                                    getPrimaryKeyIndexLiveness(row),
+                                    row.deletion());
+                }
+                else
+                {
+                    if (indexedColumn.isComplex())
+                        indexCells(row.clustering(), row.getComplexColumnData(indexedColumn));
+                    else
+                        indexCell(row.clustering(), row.getCell(indexedColumn));
+                }
+            }
+
+            public void removeRow(Row row)
+            {
+                if (isPrimaryKeyIndex())
+                    indexPrimaryKey(row.clustering(), row.primaryKeyLivenessInfo(), row.deletion());
+
+                if (indexedColumn.isComplex())
+                    removeCells(row.clustering(), row.getComplexColumnData(indexedColumn));
+                else
+                    removeCell(row.clustering(), row.getCell(indexedColumn));
+            }
+
+
+            public void updateRow(Row oldRow, Row newRow)
+            {
+                if (isPrimaryKeyIndex())
+                    indexPrimaryKey(newRow.clustering(),
+                                    newRow.primaryKeyLivenessInfo(),
+                                    newRow.deletion());
+
+                if (indexedColumn.isComplex())
+                {
+                    indexCells(newRow.clustering(), newRow.getComplexColumnData(indexedColumn));
+                    removeCells(oldRow.clustering(), oldRow.getComplexColumnData(indexedColumn));
+                }
+                else
+                {
+                    indexCell(newRow.clustering(), newRow.getCell(indexedColumn));
+                    removeCell(oldRow.clustering(), oldRow.getCell(indexedColumn));
+                }
+            }
+
+            public void finish()
+            {
+            }
+
+            private void indexCells(Clustering clustering, Iterable<Cell> cells)
+            {
+                if (cells == null)
+                    return;
+
+                for (Cell cell : cells)
+                    indexCell(clustering, cell);
+            }
+
+            private void indexCell(Clustering clustering, Cell cell)
+            {
+                if (cell == null || !cell.isLive(nowInSec))
+                    return;
+
+                insert(key.getKey(),
+                       clustering,
+                       cell,
+                       LivenessInfo.create(cell.timestamp(), cell.ttl(), cell.localDeletionTime()),
+                       opGroup);
+            }
+
+            private void removeCells(Clustering clustering, Iterable<Cell> cells)
+            {
+                if (cells == null)
+                    return;
+
+                for (Cell cell : cells)
+                    removeCell(clustering, cell);
+            }
+
+            private void removeCell(Clustering clustering, Cell cell)
+            {
+                if (cell == null || !cell.isLive(nowInSec))
+                    return;
+
+                delete(key.getKey(), clustering, cell, opGroup, nowInSec);
+            }
+
+            private void indexPrimaryKey(final Clustering clustering,
+                                         final LivenessInfo liveness,
+                                         final Row.Deletion deletion)
+            {
+                if (liveness.timestamp() != LivenessInfo.NO_TIMESTAMP)
+                    insert(key.getKey(), clustering, null, liveness, opGroup);
+
+                if (!deletion.isLive())
+                    delete(key.getKey(), clustering, deletion.time(), opGroup);
+            }
+
+            private LivenessInfo getPrimaryKeyIndexLiveness(Row row)
+            {
+                long timestamp = row.primaryKeyLivenessInfo().timestamp();
+                int ttl = row.primaryKeyLivenessInfo().ttl();
+                for (Cell cell : row.cells())
+                {
+                    long cellTimestamp = cell.timestamp();
+                    if (cell.isLive(nowInSec))
+                    {
+                        if (cellTimestamp > timestamp)
+                        {
+                            timestamp = cellTimestamp;
+                            ttl = cell.ttl();
+                        }
+                    }
+                }
+                return LivenessInfo.create(baseCfs.metadata, timestamp, ttl, nowInSec);
+            }
+        };
+    }
+
+    /**
+     * Specific to internal indexes, this is called by a
+     * searcher when it encounters a stale entry in the index
+     * @param indexKey the partition key in the index table
+     * @param indexClustering the clustering in the index table
+     * @param deletion deletion timestamp etc
+     * @param opGroup the operation under which to perform the deletion
+     */
+    public void deleteStaleEntry(DecoratedKey indexKey,
+                                 Clustering indexClustering,
+                                 DeletionTime deletion,
+                                 OpOrder.Group opGroup)
+    {
+        doDelete(indexKey, indexClustering, deletion, opGroup);
+        logger.debug("Removed index entry for stale value {}", indexKey);
+    }
+
+    /**
+     * Called when adding a new entry to the index
+     */
+    private void insert(ByteBuffer rowKey,
+                        Clustering clustering,
+                        Cell cell,
+                        LivenessInfo info,
+                        OpOrder.Group opGroup)
+    {
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey,
+                                                               clustering,
+                                                               cell));
+        Row row = BTreeRow.noCellLiveRow(buildIndexClustering(rowKey, clustering, cell), info);
+        PartitionUpdate upd = partitionUpdate(valueKey, row);
+        indexCfs.apply(upd, UpdateTransaction.NO_OP, opGroup, null);
+        logger.debug("Inserted entry into index for value {}", valueKey);
+    }
+
+    /**
+     * Called when deleting entries on non-primary key columns
+     */
+    private void delete(ByteBuffer rowKey,
+                        Clustering clustering,
+                        Cell cell,
+                        OpOrder.Group opGroup,
+                        int nowInSec)
+    {
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey,
+                                                               clustering,
+                                                               cell));
+        doDelete(valueKey,
+                 buildIndexClustering(rowKey, clustering, cell),
+                 new DeletionTime(cell.timestamp(), nowInSec),
+                 opGroup);
+    }
+
+    /**
+     * Called when deleting entries from indexes on primary key columns
+     */
+    private void delete(ByteBuffer rowKey,
+                        Clustering clustering,
+                        DeletionTime deletion,
+                        OpOrder.Group opGroup)
+    {
+        DecoratedKey valueKey = getIndexKeyFor(getIndexedValue(rowKey,
+                                                               clustering,
+                                                               null));
+        doDelete(valueKey,
+                 buildIndexClustering(rowKey, clustering, null),
+                 deletion,
+                 opGroup);
+    }
+
+    private void doDelete(DecoratedKey indexKey,
+                          Clustering indexClustering,
+                          DeletionTime deletion,
+                          OpOrder.Group opGroup)
+    {
+        Row row = BTreeRow.emptyDeletedRow(indexClustering, Row.Deletion.regular(deletion));
+        PartitionUpdate upd = partitionUpdate(indexKey, row);
+        indexCfs.apply(upd, UpdateTransaction.NO_OP, opGroup, null);
+        logger.debug("Removed index entry for value {}", indexKey);
+    }
+
+    private void validatePartitionKey(DecoratedKey partitionKey) throws InvalidRequestException
+    {
+        assert indexedColumn.isPartitionKey();
+        validateIndexedValue(getIndexedValue(partitionKey.getKey(), null, null ));
+    }
+
+    private void validateClusterings(PartitionUpdate update) throws InvalidRequestException
+    {
+        assert indexedColumn.isClusteringColumn();
+        for (Row row : update)
+            validateIndexedValue(getIndexedValue(null, row.clustering(), null));
+    }
+
+    private void validateRows(Iterable<Row> rows)
+    {
+        assert !indexedColumn.isPrimaryKeyColumn();
+        for (Row row : rows)
+        {
+            if (indexedColumn.isComplex())
+            {
+                ComplexColumnData data = row.getComplexColumnData(indexedColumn);
+                if (data != null)
+                {
+                    for (Cell cell : data)
+                    {
+                        validateIndexedValue(getIndexedValue(null, null, cell.path(), cell.value()));
+                    }
+                }
+            }
+            else
+            {
+                validateIndexedValue(getIndexedValue(null, null, row.getCell(indexedColumn)));
+            }
+        }
+    }
+
+    private void validateIndexedValue(ByteBuffer value)
+    {
+        if (value != null && value.remaining() >= FBUtilities.MAX_UNSIGNED_SHORT)
+            throw new InvalidRequestException(String.format(
+                                                           "Cannot index value of size %d for index %s on %s.%s(%s) (maximum allowed size=%d)",
+                                                           value.remaining(),
+                                                           metadata.name,
+                                                           baseCfs.metadata.ksName,
+                                                           baseCfs.metadata.cfName,
+                                                           indexedColumn.name.toString(),
+                                                           FBUtilities.MAX_UNSIGNED_SHORT));
+    }
+
+    private ByteBuffer getIndexedValue(ByteBuffer rowKey,
+                                       Clustering clustering,
+                                       Cell cell)
+    {
+        return getIndexedValue(rowKey,
+                               clustering,
+                               cell == null ? null : cell.path(),
+                               cell == null ? null : cell.value()
+        );
+    }
+
+    private Clustering buildIndexClustering(ByteBuffer rowKey,
+                                            Clustering clustering,
+                                            Cell cell)
+    {
+        return buildIndexClusteringPrefix(rowKey,
+                                          clustering,
+                                          cell == null ? null : cell.path()).build();
+    }
+
+    private DecoratedKey getIndexKeyFor(ByteBuffer value)
+    {
+        return indexCfs.decorateKey(value);
+    }
+
+    private PartitionUpdate partitionUpdate(DecoratedKey valueKey, Row row)
+    {
+        return PartitionUpdate.singleRowUpdate(indexCfs.metadata, valueKey, row);
+    }
+
+    private void invalidate()
+    {
+        // interrupt in-progress compactions
+        Collection<ColumnFamilyStore> cfss = Collections.singleton(indexCfs);
+        CompactionManager.instance.interruptCompactionForCFs(cfss, true);
+        CompactionManager.instance.waitForCessation(cfss);
+        indexCfs.keyspace.writeOrder.awaitNewBarrier();
+        indexCfs.forceBlockingFlush();
+        indexCfs.readOrdering.awaitNewBarrier();
+        indexCfs.invalidate();
+    }
+
+    private boolean isBuilt()
+    {
+        return SystemKeyspace.isIndexBuilt(baseCfs.keyspace.getName(), metadata.name);
+    }
+
+    private boolean isPrimaryKeyIndex()
+    {
+        return indexedColumn.isPrimaryKeyColumn();
+    }
+
+    private Callable<?> getBuildIndexTask()
+    {
+        return () -> {
+            buildBlocking();
+            return null;
+        };
+    }
+
+    private void buildBlocking()
+    {
+        baseCfs.forceBlockingFlush();
+
+        try (ColumnFamilyStore.RefViewFragment viewFragment = baseCfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
+             Refs<SSTableReader> sstables = viewFragment.refs)
+        {
+            if (sstables.isEmpty())
+            {
+                logger.info("No SSTable data for {}.{} to build index {} from, marking empty index as built",
+                            baseCfs.metadata.ksName,
+                            baseCfs.metadata.cfName,
+                            metadata.name);
+                baseCfs.indexManager.markIndexBuilt(metadata.name);
+                return;
+            }
+
+            logger.info("Submitting index build of {} for data in {}",
+                        metadata.name,
+                        getSSTableNames(sstables));
+
+            SecondaryIndexBuilder builder = new SecondaryIndexBuilder(baseCfs,
+                                                                      Collections.singleton(this),
+                                                                      new ReducingKeyIterator(sstables));
+            Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
+            FBUtilities.waitOnFuture(future);
+            indexCfs.forceBlockingFlush();
+            baseCfs.indexManager.markIndexBuilt(metadata.name);
+        }
+        logger.info("Index build of {} complete", metadata.name);
+    }
+
+    private static String getSSTableNames(Collection<SSTableReader> sstables)
+    {
+        return StreamSupport.stream(sstables.spliterator(), false)
+                            .map(SSTableReader::toString)
+                            .collect(Collectors.joining(", "));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/ChecksummedRandomAccessReaderTest.java b/test/unit/org/apache/cassandra/io/ChecksummedRandomAccessReaderTest.java
deleted file mode 100644
index c1e43c9..0000000
--- a/test/unit/org/apache/cassandra/io/ChecksummedRandomAccessReaderTest.java
+++ /dev/null

@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.io;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.util.Arrays;
-import java.util.concurrent.ThreadLocalRandom;
-
-import org.junit.Test;
-
-import static org.junit.Assert.*;
-import org.apache.cassandra.io.util.ChecksummedRandomAccessReader;
-import org.apache.cassandra.io.util.ChecksummedSequentialWriter;
-import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.io.util.SequentialWriter;
-
-public class ChecksummedRandomAccessReaderTest
-{
-    @Test
-    public void readFully() throws IOException
-    {
-        final File data = File.createTempFile("testReadFully", "data");
-        final File crc = File.createTempFile("testReadFully", "crc");
-
-        final byte[] expected = new byte[70 * 1024];   // bit more than crc chunk size, so we can test rebuffering.
-        ThreadLocalRandom.current().nextBytes(expected);
-
-        SequentialWriter writer = ChecksummedSequentialWriter.open(data, crc);
-        writer.write(expected);
-        writer.finish();
-
-        assert data.exists();
-
-        RandomAccessReader reader = ChecksummedRandomAccessReader.open(data, crc);
-        byte[] b = new byte[expected.length];
-        reader.readFully(b);
-
-        assertArrayEquals(expected, b);
-
-        assertTrue(reader.isEOF());
-
-        reader.close();
-    }
-
-    @Test
-    public void seek() throws IOException
-    {
-        final File data = File.createTempFile("testSeek", "data");
-        final File crc = File.createTempFile("testSeek", "crc");
-
-        final byte[] dataBytes = new byte[70 * 1024];   // bit more than crc chunk size
-        ThreadLocalRandom.current().nextBytes(dataBytes);
-
-        SequentialWriter writer = ChecksummedSequentialWriter.open(data, crc);
-        writer.write(dataBytes);
-        writer.finish();
-
-        assert data.exists();
-
-        RandomAccessReader reader = ChecksummedRandomAccessReader.open(data, crc);
-
-        final int seekPosition = 66000;
-        reader.seek(seekPosition);
-
-        byte[] b = new byte[dataBytes.length - seekPosition];
-        reader.readFully(b);
-
-        byte[] expected = Arrays.copyOfRange(dataBytes, seekPosition, dataBytes.length);
-
-        assertArrayEquals(expected, b);
-
-        assertTrue(reader.isEOF());
-
-        reader.close();
-    }
-
-    @Test(expected = ChecksummedRandomAccessReader.CorruptFileException.class)
-    public void corruptionDetection() throws IOException
-    {
-        final File data = File.createTempFile("corruptionDetection", "data");
-        final File crc = File.createTempFile("corruptionDetection", "crc");
-
-        final byte[] expected = new byte[5 * 1024];
-        Arrays.fill(expected, (byte) 0);
-
-        SequentialWriter writer = ChecksummedSequentialWriter.open(data, crc);
-        writer.write(expected);
-        writer.finish();
-
-        assert data.exists();
-
-        // simulate corruption of file
-        try (RandomAccessFile dataFile = new RandomAccessFile(data, "rw"))
-        {
-            dataFile.seek(1024);
-            dataFile.write((byte) 5);
-        }
-
-        RandomAccessReader reader = ChecksummedRandomAccessReader.open(data, crc);
-        byte[] b = new byte[expected.length];
-        reader.readFully(b);
-
-        assertArrayEquals(expected, b);
-
-        assertTrue(reader.isEOF());
-
-        reader.close();
-    }
-}

diff --git a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
new file mode 100644
index 0000000..ddacc6b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java

@@ -0,0 +1,139 @@
+package org.apache.cassandra.io;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.compaction.CompactionInterruptedException;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.io.sstable.IndexSummaryManager;
+import org.apache.cassandra.io.sstable.IndexSummaryRedistribution;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class DiskSpaceMetricsTest extends CQLTester
+{
+    /**
+     * This test runs the system with normal operations and makes sure the disk metrics match reality
+     */
+    @Test
+    public void baseline() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk bigint, PRIMARY KEY (pk)) WITH min_index_interval=1");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+
+        // disable compaction so nothing changes between calculations
+        cfs.disableAutoCompaction();
+
+        // create 100 sstables
+        for (int i = 0; i < 100; i++)
+            insert(cfs, i);
+        assertDiskSpaceEqual(cfs);
+    }
+
+    /**
+     * If index summary downsampling is interrupted in the middle, the metrics still reflect the real data
+     */
+    @Test
+    public void summaryRedistribution() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk bigint, PRIMARY KEY (pk)) WITH min_index_interval=1");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+
+        // disable compaction so nothing changes between calculations
+        cfs.disableAutoCompaction();
+
+        // create 100 sstables, make sure they have more than 1 value, else sampling can't happen
+        for (int i = 0; i < 100; i++)
+            insertN(cfs, 10, i);
+        assertDiskSpaceEqual(cfs);
+
+        // summary downsample
+        for (int i = 0; i < 100; i++)
+        {
+            indexDownsampleCancelLastSSTable(cfs);
+            assertDiskSpaceEqual(cfs);
+        }
+    }
+
+    private void insert(ColumnFamilyStore cfs, long value) throws Throwable
+    {
+        insertN(cfs, 1, value);
+    }
+
+    private void insertN(ColumnFamilyStore cfs, int n, long base) throws Throwable
+    {
+        for (int i = 0; i < n; i++)
+            execute("INSERT INTO %s (pk) VALUES (?)", base + i);
+
+        // flush to write the sstable
+        cfs.forceBlockingFlush();
+    }
+
+    private void assertDiskSpaceEqual(ColumnFamilyStore cfs)
+    {
+        long liveDiskSpaceUsed = cfs.metric.liveDiskSpaceUsed.getCount();
+        long actual = 0;
+        for (SSTableReader sstable : cfs.getTracker().getView().liveSSTables())
+            actual += sstable.bytesOnDisk();
+
+        Assert.assertEquals("bytes on disk does not match current metric liveDiskSpaceUsed", actual, liveDiskSpaceUsed);
+
+        // totalDiskSpaceUsed is based off SStable delete, which is async: LogTransaction's tidy enqueues in ScheduledExecutors.nonPeriodicTasks
+        // wait for there to be no more pending sstable releases
+        LifecycleTransaction.waitForDeletions();
+        long totalDiskSpaceUsed = cfs.metric.totalDiskSpaceUsed.getCount();
+        Assert.assertEquals("bytes on disk does not match current metric totalDiskSpaceUsed", actual, totalDiskSpaceUsed);
+    }
+
+    private static void indexDownsampleCancelLastSSTable(ColumnFamilyStore cfs)
+    {
+        List<SSTableReader> sstables = Lists.newArrayList(cfs.getSSTables(SSTableSet.CANONICAL));
+        LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
+        Map<UUID, LifecycleTransaction> txns = ImmutableMap.of(cfs.metadata.cfId, txn);
+        // fail on the last file (* 3 because we call isStopRequested 3 times for each sstable, and we should fail on the last)
+        AtomicInteger countdown = new AtomicInteger(3 * sstables.size() - 1);
+        IndexSummaryRedistribution redistribution = new IndexSummaryRedistribution(Collections.emptyList(), txns, 0) {
+            public boolean isStopRequested()
+            {
+                return countdown.decrementAndGet() == 0;
+            }
+        };
+        try
+        {
+            IndexSummaryManager.redistributeSummaries(redistribution);
+            Assert.fail("Should throw CompactionInterruptedException");
+        }
+        catch (CompactionInterruptedException e)
+        {
+            // trying to get this to happen
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+        finally
+        {
+            try
+            {
+                FBUtilities.closeAll(txns.values());
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/RandomAccessReaderTest.java b/test/unit/org/apache/cassandra/io/RandomAccessReaderTest.java
deleted file mode 100644
index 8c6cc90..0000000
--- a/test/unit/org/apache/cassandra/io/RandomAccessReaderTest.java
+++ /dev/null

@@ -1,251 +0,0 @@
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-package org.apache.cassandra.io;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.UUID;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-
-import org.junit.Test;
-
-import static org.junit.Assert.*;
-import org.apache.cassandra.io.util.ChannelProxy;
-import org.apache.cassandra.io.util.FileMark;
-import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.io.util.SequentialWriter;
-
-public class RandomAccessReaderTest
-{
-    @Test
-    public void testReadFully() throws IOException
-    {
-        final File f = File.createTempFile("testReadFully", "1");
-        final String expected = "The quick brown fox jumps over the lazy dog";
-
-        SequentialWriter writer = SequentialWriter.open(f);
-        writer.write(expected.getBytes());
-        writer.finish();
-
-        assert f.exists();
-
-        ChannelProxy channel = new ChannelProxy(f);
-        RandomAccessReader reader = RandomAccessReader.open(channel);
-        assertEquals(f.getAbsolutePath(), reader.getPath());
-        assertEquals(expected.length(), reader.length());
-
-        byte[] b = new byte[expected.length()];
-        reader.readFully(b);
-        assertEquals(expected, new String(b));
-
-        assertTrue(reader.isEOF());
-        assertEquals(0, reader.bytesRemaining());
-
-        reader.close();
-        channel.close();
-    }
-
-    @Test
-    public void testReadBytes() throws IOException
-    {
-        File f = File.createTempFile("testReadBytes", "1");
-        final String expected = "The quick brown fox jumps over the lazy dog";
-
-        SequentialWriter writer = SequentialWriter.open(f);
-        writer.write(expected.getBytes());
-        writer.finish();
-
-        assert f.exists();
-
-        ChannelProxy channel = new ChannelProxy(f);
-        RandomAccessReader reader = RandomAccessReader.open(channel);
-        assertEquals(f.getAbsolutePath(), reader.getPath());
-        assertEquals(expected.length(), reader.length());
-
-        ByteBuffer b = reader.readBytes(expected.length());
-        assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
-
-        assertTrue(reader.isEOF());
-        assertEquals(0, reader.bytesRemaining());
-
-        reader.close();
-        channel.close();
-    }
-
-    @Test
-    public void testReset() throws IOException
-    {
-        File f = File.createTempFile("testMark", "1");
-        final String expected = "The quick brown fox jumps over the lazy dog";
-        final int numIterations = 10;
-
-        SequentialWriter writer = SequentialWriter.open(f);
-        for (int i = 0; i < numIterations; i++)
-            writer.write(expected.getBytes());
-        writer.finish();
-
-        assert f.exists();
-
-        ChannelProxy channel = new ChannelProxy(f);
-        RandomAccessReader reader = RandomAccessReader.open(channel);
-        assertEquals(expected.length() * numIterations, reader.length());
-
-        ByteBuffer b = reader.readBytes(expected.length());
-        assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
-
-        assertFalse(reader.isEOF());
-        assertEquals((numIterations - 1) * expected.length(), reader.bytesRemaining());
-
-        FileMark mark = reader.mark();
-        assertEquals(0, reader.bytesPastMark());
-        assertEquals(0, reader.bytesPastMark(mark));
-
-        for (int i = 0; i < (numIterations - 1); i++)
-        {
-            b = reader.readBytes(expected.length());
-            assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
-        }
-        assertTrue(reader.isEOF());
-        assertEquals(expected.length() * (numIterations -1), reader.bytesPastMark());
-        assertEquals(expected.length() * (numIterations - 1), reader.bytesPastMark(mark));
-
-        reader.reset(mark);
-        assertEquals(0, reader.bytesPastMark());
-        assertEquals(0, reader.bytesPastMark(mark));
-        assertFalse(reader.isEOF());
-        for (int i = 0; i < (numIterations - 1); i++)
-        {
-            b = reader.readBytes(expected.length());
-            assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
-        }
-
-        reader.reset();
-        assertEquals(0, reader.bytesPastMark());
-        assertEquals(0, reader.bytesPastMark(mark));
-        assertFalse(reader.isEOF());
-        for (int i = 0; i < (numIterations - 1); i++)
-        {
-            b = reader.readBytes(expected.length());
-            assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
-        }
-
-        assertTrue(reader.isEOF());
-        reader.close();
-        channel.close();
-    }
-
-    @Test
-    public void testSeekSingleThread() throws IOException, InterruptedException
-    {
-        testSeek(1);
-    }
-
-    @Test
-    public void testSeekMultipleThreads() throws IOException, InterruptedException
-    {
-        testSeek(10);
-    }
-
-    private void testSeek(int numThreads) throws IOException, InterruptedException
-    {
-        final File f = File.createTempFile("testMark", "1");
-        final String[] expected = new String[10];
-        int len = 0;
-        for (int i = 0; i < expected.length; i++)
-        {
-            expected[i] = UUID.randomUUID().toString();
-            len += expected[i].length();
-        }
-        final int totalLength = len;
-
-        SequentialWriter writer = SequentialWriter.open(f);
-        for (int i = 0; i < expected.length; i++)
-            writer.write(expected[i].getBytes());
-        writer.finish();
-
-        assert f.exists();
-
-        final ChannelProxy channel = new ChannelProxy(f);
-
-        final Runnable worker = new Runnable() {
-
-            @Override
-            public void run()
-            {
-                try
-                {
-                    RandomAccessReader reader = RandomAccessReader.open(channel);
-                    assertEquals(totalLength, reader.length());
-
-                    ByteBuffer b = reader.readBytes(expected[0].length());
-                    assertEquals(expected[0], new String(b.array(), Charset.forName("UTF-8")));
-
-                    assertFalse(reader.isEOF());
-                    assertEquals(totalLength - expected[0].length(), reader.bytesRemaining());
-
-                    long filePointer = reader.getFilePointer();
-
-                    for (int i = 1; i < expected.length; i++)
-                    {
-                        b = reader.readBytes(expected[i].length());
-                        assertEquals(expected[i], new String(b.array(), Charset.forName("UTF-8")));
-                    }
-                    assertTrue(reader.isEOF());
-
-                    reader.seek(filePointer);
-                    assertFalse(reader.isEOF());
-                    for (int i = 1; i < expected.length; i++)
-                    {
-                        b = reader.readBytes(expected[i].length());
-                        assertEquals(expected[i], new String(b.array(), Charset.forName("UTF-8")));
-                    }
-
-                    assertTrue(reader.isEOF());
-                    reader.close();
-                }
-                catch (Exception ex)
-                {
-                    ex.printStackTrace();
-                    fail(ex.getMessage());
-                }
-            }
-        };
-
-        if(numThreads == 1)
-        {
-            worker.run();
-            return;
-        }
-
-        ExecutorService executor = Executors.newFixedThreadPool(numThreads);
-        for (int i = 0; i < numThreads; i++)
-            executor.submit(worker);
-
-        executor.shutdown();
-        executor.awaitTermination(1, TimeUnit.MINUTES);
-
-        channel.close();
-    }
-}

diff --git a/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java b/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java
index 3bef89e..34ff94f 100644
--- a/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/compress/CompressedRandomAccessReaderTest.java

@@ -18,32 +18,33 @@
  */
 package org.apache.cassandra.io.compress;
 
+import java.io.EOFException;
 import java.io.File;
 import java.io.IOException;
 import java.io.RandomAccessFile;
-import java.util.Collections;
+import java.util.Arrays;
 import java.util.Random;
 
-import com.google.common.util.concurrent.RateLimiter;
 import org.junit.Test;
-
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.ChannelProxy;
-import org.apache.cassandra.io.util.CompressedPoolingSegmentedFile;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileMark;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.io.util.MmappedRegions;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.ChecksumType;
 import org.apache.cassandra.utils.SyncUtil;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 public class CompressedRandomAccessReaderTest
 {
@@ -51,16 +52,24 @@
     public void testResetAndTruncate() throws IOException
     {
         // test reset in current buffer or previous one
-        testResetAndTruncate(File.createTempFile("normal", "1"), false, 10);
-        testResetAndTruncate(File.createTempFile("normal", "2"), false, CompressionParameters.DEFAULT_CHUNK_LENGTH);
+        testResetAndTruncate(File.createTempFile("normal", "1"), false, false, 10);
+        testResetAndTruncate(File.createTempFile("normal", "2"), false, false, CompressionParams.DEFAULT_CHUNK_LENGTH);
     }
 
     @Test
     public void testResetAndTruncateCompressed() throws IOException
     {
         // test reset in current buffer or previous one
-        testResetAndTruncate(File.createTempFile("compressed", "1"), true, 10);
-        testResetAndTruncate(File.createTempFile("compressed", "2"), true, CompressionParameters.DEFAULT_CHUNK_LENGTH);
+        testResetAndTruncate(File.createTempFile("compressed", "1"), true, false, 10);
+        testResetAndTruncate(File.createTempFile("compressed", "2"), true, false, CompressionParams.DEFAULT_CHUNK_LENGTH);
+    }
+
+    @Test
+    public void testResetAndTruncateCompressedMmap() throws IOException
+    {
+        // test reset in current buffer or previous one
+        testResetAndTruncate(File.createTempFile("compressed_mmap", "1"), true, true, 10);
+        testResetAndTruncate(File.createTempFile("compressed_mmap", "2"), true, true, CompressionParams.DEFAULT_CHUNK_LENGTH);
     }
 
     @Test
@@ -68,93 +77,157 @@
     {
         File f = File.createTempFile("compressed6791_", "3");
         String filename = f.getAbsolutePath();
-        ChannelProxy channel = new ChannelProxy(f);
-        try
+        try (ChannelProxy channel = new ChannelProxy(f))
         {
 
-            MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
-            CompressedSequentialWriter writer = new CompressedSequentialWriter(f, filename + ".metadata", new CompressionParameters(SnappyCompressor.instance, 32, Collections.<String, String>emptyMap()), sstableMetadataCollector);
+            MetadataCollector sstableMetadataCollector = new MetadataCollector(new ClusteringComparator(BytesType.instance));
+            try (CompressedSequentialWriter writer = new CompressedSequentialWriter(f, filename + ".metadata", CompressionParams.snappy(32), sstableMetadataCollector))
+            {
 
-            for (int i = 0; i < 20; i++)
-                writer.write("x".getBytes());
+                for (int i = 0; i < 20; i++)
+                    writer.write("x".getBytes());
 
-            FileMark mark = writer.mark();
-            // write enough garbage to create new chunks:
-            for (int i = 0; i < 40; ++i)
-                writer.write("y".getBytes());
+                DataPosition mark = writer.mark();
+                // write enough garbage to create new chunks:
+                for (int i = 0; i < 40; ++i)
+                    writer.write("y".getBytes());
 
-            writer.resetAndTruncate(mark);
+                writer.resetAndTruncate(mark);
 
-            for (int i = 0; i < 20; i++)
-                writer.write("x".getBytes());
-            writer.finish();
+                for (int i = 0; i < 20; i++)
+                    writer.write("x".getBytes());
+                writer.finish();
+            }
 
-            CompressedRandomAccessReader reader = CompressedRandomAccessReader.open(channel, new CompressionMetadata(filename + ".metadata", f.length()));
-            String res = reader.readLine();
-            assertEquals(res, "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
-            assertEquals(40, res.length());
+            try (RandomAccessReader reader = new CompressedRandomAccessReader.Builder(channel,
+                                                                                      new CompressionMetadata(filename + ".metadata", f.length(), ChecksumType.CRC32))
+                    .build())
+            {
+                String res = reader.readLine();
+                assertEquals(res, "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
+                assertEquals(40, res.length());
+            }
         }
         finally
         {
-            // cleanup
-            channel.close();
-
             if (f.exists())
-                f.delete();
-            File metadata = new File(filename+ ".metadata");
-                if (metadata.exists())
-                    metadata.delete();
+                assertTrue(f.delete());
+            File metadata = new File(filename + ".metadata");
+            if (metadata.exists())
+                metadata.delete();
         }
     }
 
-    private void testResetAndTruncate(File f, boolean compressed, int junkSize) throws IOException
+    /**
+     * JIRA: CASSANDRA-15595 verify large position with small chunk length won't overflow chunk index
+     */
+    @Test
+    public void testChunkIndexOverflow() throws IOException
     {
-        final String filename = f.getAbsolutePath();
-        ChannelProxy channel = new ChannelProxy(f);
+        File file = File.createTempFile("chunk_idx_overflow", "1");
+        String filename = file.getAbsolutePath();
+        int chunkLength = 4096; // 4k
+
         try
         {
-            MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
-            SequentialWriter writer = compressed
-                ? new CompressedSequentialWriter(f, filename + ".metadata", new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector)
-                : SequentialWriter.open(f);
+            writeSSTable(file, CompressionParams.snappy(chunkLength), 10);
+            CompressionMetadata metadata = new CompressionMetadata(filename + ".metadata", file.length(), ChecksumType.CRC32);
 
-            writer.write("The quick ".getBytes());
-            FileMark mark = writer.mark();
-            writer.write("blue fox jumps over the lazy dog".getBytes());
+            long chunks = 2761628520L;
+            long midPosition = (chunks / 2L) * chunkLength;
+            int idx = 8 * (int) (midPosition / chunkLength); // before patch
+            assertTrue("Expect integer overflow", idx < 0);
 
-            // write enough to be sure to change chunk
-            for (int i = 0; i < junkSize; ++i)
+            try
             {
-                writer.write((byte)1);
+                metadata.chunkFor(midPosition);
+                fail("Expected to throw EOF exception with chunk idx larger than total number of chunks in the sstable");
             }
-
-            writer.resetAndTruncate(mark);
-            writer.write("brown fox jumps over the lazy dog".getBytes());
-            writer.finish();
-
-            assert f.exists();
-            RandomAccessReader reader = compressed
-                                      ? CompressedRandomAccessReader.open(channel, new CompressionMetadata(filename + ".metadata", f.length()))
-                                      : RandomAccessReader.open(f);
-            String expected = "The quick brown fox jumps over the lazy dog";
-            assertEquals(expected.length(), reader.length());
-            byte[] b = new byte[expected.length()];
-            reader.readFully(b);
-            assert new String(b).equals(expected) : "Expecting '" + expected + "', got '" + new String(b) + "'";
+            catch (CorruptSSTableException e)
+            {
+                assertTrue("Expect EOF, but got " + e.getCause(), e.getCause() instanceof EOFException);
+            }
         }
         finally
         {
-            // cleanup
-            channel.close();
+            if (file.exists())
+                assertTrue(file.delete());
+            File metadata = new File(filename + ".metadata");
+            if (metadata.exists())
+                metadata.delete();
+        }
+    }
 
+    private static void testResetAndTruncate(File f, boolean compressed, boolean usemmap, int junkSize) throws IOException
+    {
+        final String filename = f.getAbsolutePath();
+        try(ChannelProxy channel = new ChannelProxy(f))
+        {
+            writeSSTable(f, compressed ? CompressionParams.snappy() : null, junkSize);
+
+            CompressionMetadata compressionMetadata = compressed ? new CompressionMetadata(filename + ".metadata", f.length(), ChecksumType.CRC32) : null;
+            RandomAccessReader.Builder builder = compressed
+                                                 ? new CompressedRandomAccessReader.Builder(channel, compressionMetadata)
+                                                 : new RandomAccessReader.Builder(channel);
+
+            if (usemmap)
+            {
+                if (compressed)
+                    builder.regions(MmappedRegions.map(channel, compressionMetadata));
+                else
+                    builder.regions(MmappedRegions.map(channel, f.length()));
+            }
+
+            try(RandomAccessReader reader = builder.build())
+            {
+                String expected = "The quick brown fox jumps over the lazy dog";
+                assertEquals(expected.length(), reader.length());
+                byte[] b = new byte[expected.length()];
+                reader.readFully(b);
+                assert new String(b).equals(expected) : "Expecting '" + expected + "', got '" + new String(b) + '\'';
+            }
+
+            if (usemmap)
+                builder.regions.close();
+        }
+        finally
+        {
             if (f.exists())
-                f.delete();
+                assertTrue(f.delete());
             File metadata = new File(filename + ".metadata");
             if (compressed && metadata.exists())
                 metadata.delete();
         }
     }
 
+    private static void writeSSTable(File f, CompressionParams params, int junkSize) throws IOException
+    {
+        final String filename = f.getAbsolutePath();
+        MetadataCollector sstableMetadataCollector = new MetadataCollector(new ClusteringComparator(BytesType.instance));
+        try(SequentialWriter writer = params != null
+                                      ? new CompressedSequentialWriter(f, filename + ".metadata", params, sstableMetadataCollector)
+                                      : SequentialWriter.open(f))
+        {
+            writer.write("The quick ".getBytes());
+            DataPosition mark = writer.mark();
+            writer.write("blue fox jumps over the lazy dog".getBytes());
+
+            // write enough to be sure to change chunk
+            for (int i = 0; i < junkSize; ++i)
+            {
+                writer.write((byte) 1);
+            }
+
+            writer.resetAndTruncate(mark);
+            writer.write("brown fox jumps over the lazy dog".getBytes());
+            writer.finish();
+        }
+        assert f.exists();
+    }
+
+    /**
+     * If the data read out doesn't match the checksum, an exception should be thrown
+     */
     @Test
     public void testDataCorruptionDetection() throws IOException
     {
@@ -166,135 +239,74 @@
         File metadata = new File(file.getPath() + ".meta");
         metadata.deleteOnExit();
 
-        MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
-        try (SequentialWriter writer = new CompressedSequentialWriter(file, metadata.getPath(), new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector))
+        assertTrue(file.createNewFile());
+        assertTrue(metadata.createNewFile());
+
+        MetadataCollector sstableMetadataCollector = new MetadataCollector(new ClusteringComparator(BytesType.instance));
+        try (SequentialWriter writer = new CompressedSequentialWriter(file, metadata.getPath(), CompressionParams.snappy(), sstableMetadataCollector))
         {
             writer.write(CONTENT.getBytes());
             writer.finish();
         }
 
-        ChannelProxy channel = new ChannelProxy(file);
-
-        // open compression metadata and get chunk information
-        CompressionMetadata meta = new CompressionMetadata(metadata.getPath(), file.length());
-        CompressionMetadata.Chunk chunk = meta.chunkFor(0);
-
-        RandomAccessReader reader = CompressedRandomAccessReader.open(channel, meta);
-        // read and verify compressed data
-        assertEquals(CONTENT, reader.readLine());
-
-        Random random = new Random();
-        RandomAccessFile checksumModifier = null;
-
-        try
+        try(ChannelProxy channel = new ChannelProxy(file))
         {
-            checksumModifier = new RandomAccessFile(file, "rw");
-            byte[] checksum = new byte[4];
+            // open compression metadata and get chunk information
+            CompressionMetadata meta = new CompressionMetadata(metadata.getPath(), file.length(), ChecksumType.CRC32);
+            CompressionMetadata.Chunk chunk = meta.chunkFor(0);
 
-            // seek to the end of the compressed chunk
-            checksumModifier.seek(chunk.length);
-            // read checksum bytes
-            checksumModifier.read(checksum);
-            // seek back to the chunk end
-            checksumModifier.seek(chunk.length);
-
-            // lets modify one byte of the checksum on each iteration
-            for (int i = 0; i < checksum.length; i++)
-            {
-                checksumModifier.write(random.nextInt());
-                SyncUtil.sync(checksumModifier); // making sure that change was synced with disk
-
-                final RandomAccessReader r = CompressedRandomAccessReader.open(channel, meta);
-
-                Throwable exception = null;
-                try
-                {
-                    r.readLine();
-                }
-                catch (Throwable t)
-                {
-                    exception = t;
-                }
-                assertNotNull(exception);
-                assertEquals(exception.getClass(), CorruptSSTableException.class);
-                assertEquals(exception.getCause().getClass(), CorruptBlockException.class);
-
-                r.close();
+            try(RandomAccessReader reader = new CompressedRandomAccessReader.Builder(channel, meta).build())
+            {// read and verify compressed data
+                assertEquals(CONTENT, reader.readLine());
             }
 
-            // lets write original checksum and check if we can read data
-            updateChecksum(checksumModifier, chunk.length, checksum);
+            Random random = new Random();
+            try(RandomAccessFile checksumModifier = new RandomAccessFile(file, "rw"))
+            {
+                byte[] checksum = new byte[4];
 
-            reader = CompressedRandomAccessReader.open(channel, meta);
-            // read and verify compressed data
-            assertEquals(CONTENT, reader.readLine());
-            // close reader
-            reader.close();
-        }
-        finally
-        {
-            channel.close();
+                // seek to the end of the compressed chunk
+                checksumModifier.seek(chunk.length);
+                // read checksum bytes
+                checksumModifier.read(checksum);
 
-            if (checksumModifier != null)
-                checksumModifier.close();
+                byte[] corruptChecksum = new byte[4];
+                do
+                {
+                    random.nextBytes(corruptChecksum);
+                } while (Arrays.equals(corruptChecksum, checksum));
+
+                updateChecksum(checksumModifier, chunk.length, corruptChecksum);
+
+                try (final RandomAccessReader r = new CompressedRandomAccessReader.Builder(channel, meta).build())
+                {
+                    Throwable exception = null;
+                    try
+                    {
+                        r.readLine();
+                    }
+                    catch (Throwable t)
+                    {
+                        exception = t;
+                    }
+                    assertNotNull(exception);
+                    assertSame(exception.getClass(), CorruptSSTableException.class);
+                    assertSame(exception.getCause().getClass(), CorruptBlockException.class);
+                }
+
+                // lets write original checksum and check if we can read data
+                updateChecksum(checksumModifier, chunk.length, checksum);
+
+                // read and verify compressed data
+                try (RandomAccessReader cr = new CompressedRandomAccessReader.Builder(channel, meta).build())
+                {
+                    assertEquals(CONTENT, cr.readLine());
+                }
+            }
         }
     }
 
-    @Test
-    public void testThrottledReadersAreNotCached() throws IOException
-    {
-        String CONTENT = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam vitae.";
-
-        File file = new File("testThrottledReadersAreNotCached");
-        file.deleteOnExit();
-
-        File metadata = new File(file.getPath() + ".meta");
-        metadata.deleteOnExit();
-
-        MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
-        try (SequentialWriter writer = new CompressedSequentialWriter(file, metadata.getPath(), new CompressionParameters(SnappyCompressor.instance), sstableMetadataCollector))
-        {
-            writer.write(CONTENT.getBytes());
-            writer.finish();
-        }
-
-        CompressionMetadata meta = new CompressionMetadata(metadata.getPath(), file.length());
-
-        try(ChannelProxy channel = new ChannelProxy(file);
-            CompressedPoolingSegmentedFile segmentedFile = new CompressedPoolingSegmentedFile(channel, meta))
-        {
-            //The cache bucket is only initialized by a call to FileCacheService.instance.get() so first
-            // we must create a reader using the interface for accessing segments
-            FileDataInput reader = segmentedFile.getSegment(0);
-            assertNotNull(reader);
-            reader.close();
-
-            //Now we create a throttled reader, this should not be added to the cache
-            RateLimiter limiter = RateLimiter.create(1024);
-            reader = segmentedFile.createThrottledReader(limiter);
-            assertNotNull(reader);
-            assertTrue(reader instanceof CompressedThrottledReader);
-            reader.close();
-
-            //We retrieve 2 readers, neither should be a throttled reader
-            FileDataInput[] readers =
-            {
-                segmentedFile.getSegment(0),
-                segmentedFile.getSegment(0)
-            };
-
-            for (FileDataInput r : readers)
-            {
-                assertNotNull(r);
-                assertFalse(r instanceof CompressedThrottledReader);
-            }
-
-            for (FileDataInput r : readers)
-                r.close();
-        }
-    }
-
-    private void updateChecksum(RandomAccessFile file, long checksumOffset, byte[] checksum) throws IOException
+    private static void updateChecksum(RandomAccessFile file, long checksumOffset, byte[] checksum) throws IOException
     {
         file.seek(checksumOffset);
         file.write(checksum);

diff --git a/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java b/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java
index 33b4957..1bc3454 100644
--- a/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java
+++ b/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java

@@ -20,8 +20,6 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Map;
 import java.util.Random;
 import java.util.Set;
@@ -30,8 +28,7 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.junit.Assert.assertEquals;
@@ -74,24 +71,22 @@
     @Test
     public void compressionEnabled() throws Throwable
     {
-        createTable("create table %s (id int primary key, t blob) with compression = {'sstable_compression':'org.apache.cassandra.io.compress.CompressedSequentialWriterReopenTest$BadCompressor'}");
+        createTable("create table %s (id int primary key, t blob) with compression = {'class':'org.apache.cassandra.io.compress.CompressedSequentialWriterReopenTest$BadCompressor'}");
         byte [] blob = new byte[1000];
         (new Random()).nextBytes(blob);
-        Keyspace keyspace = Keyspace.open(keyspace());
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(currentTable());
-        cfs.disableAutoCompaction();
+        getCurrentColumnFamilyStore().disableAutoCompaction();
         for (int i = 0; i < 10000; i++)
         {
             execute("insert into %s (id, t) values (?, ?)", i, ByteBuffer.wrap(blob));
         }
-        cfs.forceBlockingFlush();
+        getCurrentColumnFamilyStore().forceBlockingFlush();
         for (int i = 0; i < 10000; i++)
         {
             execute("insert into %s (id, t) values (?, ?)", i, ByteBuffer.wrap(blob));
         }
-        cfs.forceBlockingFlush();
+        getCurrentColumnFamilyStore().forceBlockingFlush();
         DatabaseDescriptor.setSSTablePreempiveOpenIntervalInMB(1);
-        cfs.forceMajorCompaction();
+        getCurrentColumnFamilyStore().forceMajorCompaction();
     }
 
     public static class BadCompressor implements ICompressor
@@ -147,7 +142,7 @@
         @Override
         public Set<String> supportedOptions()
         {
-            return new HashSet<>(Arrays.asList(CompressionParameters.CRC_CHECK_CHANCE));
+            return null;
         }
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterTest.java b/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterTest.java
index bca0354..f04439a 100644
--- a/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterTest.java

@@ -33,21 +33,24 @@
 import org.junit.Test;
 
 import junit.framework.Assert;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
+
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.ChannelProxy;
-import org.apache.cassandra.io.util.FileMark;
+import org.apache.cassandra.io.util.DataPosition;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.io.util.SequentialWriter;
 import org.apache.cassandra.io.util.SequentialWriterTest;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.ChecksumType;
 
 public class CompressedSequentialWriterTest extends SequentialWriterTest
 {
-    private ICompressor compressor;
+    private CompressionParams compressionParameters;
 
     private void runTests(String testName) throws IOException
     {
@@ -55,30 +58,30 @@
         testWrite(File.createTempFile(testName + "_small", "1"), 25);
 
         // Test to confirm pipeline w/chunk-aligned data writes works
-        testWrite(File.createTempFile(testName + "_chunkAligned", "1"), CompressionParameters.DEFAULT_CHUNK_LENGTH);
+        testWrite(File.createTempFile(testName + "_chunkAligned", "1"), CompressionParams.DEFAULT_CHUNK_LENGTH);
 
         // Test to confirm pipeline on non-chunk boundaries works
-        testWrite(File.createTempFile(testName + "_large", "1"), CompressionParameters.DEFAULT_CHUNK_LENGTH * 3 + 100);
+        testWrite(File.createTempFile(testName + "_large", "1"), CompressionParams.DEFAULT_CHUNK_LENGTH * 3 + 100);
     }
 
     @Test
     public void testLZ4Writer() throws IOException
     {
-        compressor = LZ4Compressor.instance;
+        compressionParameters = CompressionParams.lz4();
         runTests("LZ4");
     }
 
     @Test
     public void testDeflateWriter() throws IOException
     {
-        compressor = DeflateCompressor.instance;
+        compressionParameters = CompressionParams.deflate();
         runTests("Deflate");
     }
 
     @Test
     public void testSnappyWriter() throws IOException
     {
-        compressor = SnappyCompressor.instance;
+        compressionParameters = CompressionParams.snappy();
         runTests("Snappy");
     }
 
@@ -89,12 +92,13 @@
 
         try
         {
-            MetadataCollector sstableMetadataCollector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
+            MetadataCollector sstableMetadataCollector = new MetadataCollector(new ClusteringComparator(Arrays.<AbstractType<?>>asList(BytesType.instance)));
+
             byte[] dataPre = new byte[bytesToTest];
             byte[] rawPost = new byte[bytesToTest];
-            try (CompressedSequentialWriter writer = new CompressedSequentialWriter(f, filename + ".metadata", new CompressionParameters(compressor), sstableMetadataCollector);)
+            try (CompressedSequentialWriter writer = new CompressedSequentialWriter(f, filename + ".metadata", compressionParameters, sstableMetadataCollector);)
             {
-                Random r = new Random();
+                Random r = new Random(42);
 
                 // Test both write with byte[] and ByteBuffer
                 r.nextBytes(dataPre);
@@ -104,18 +108,18 @@
                 dataPost.flip();
 
                 writer.write(dataPre);
-                FileMark mark = writer.mark();
+                DataPosition mark = writer.mark();
 
                 // Write enough garbage to transition chunk
-                for (int i = 0; i < CompressionParameters.DEFAULT_CHUNK_LENGTH; i++)
+                for (int i = 0; i < CompressionParams.DEFAULT_CHUNK_LENGTH; i++)
                 {
                     writer.write((byte)i);
                 }
 
-                if (bytesToTest <= CompressionParameters.DEFAULT_CHUNK_LENGTH)
-                    assertEquals(writer.getLastFlushOffset(), CompressionParameters.DEFAULT_CHUNK_LENGTH);
+                if (bytesToTest <= CompressionParams.DEFAULT_CHUNK_LENGTH)
+                    assertEquals(writer.getLastFlushOffset(), CompressionParams.DEFAULT_CHUNK_LENGTH);
                 else
-                    assertTrue(writer.getLastFlushOffset() % CompressionParameters.DEFAULT_CHUNK_LENGTH == 0);
+                    assertTrue(writer.getLastFlushOffset() % CompressionParams.DEFAULT_CHUNK_LENGTH == 0);
 
                 writer.resetAndTruncate(mark);
                 writer.write(dataPost);
@@ -123,7 +127,7 @@
             }
 
             assert f.exists();
-            RandomAccessReader reader = CompressedRandomAccessReader.open(channel, new CompressionMetadata(filename + ".metadata", f.length()));
+            RandomAccessReader reader = new CompressedRandomAccessReader.Builder(channel, new CompressionMetadata(filename + ".metadata", f.length(), ChecksumType.CRC32)).build();
             assertEquals(dataPre.length + rawPost.length, reader.length());
             byte[] result = new byte[(int)reader.length()];
 
@@ -152,7 +156,7 @@
 
     private ByteBuffer makeBB(int size)
     {
-        return compressor.preferredBufferType().allocate(size);
+        return compressionParameters.getSstableCompressor().preferredBufferType().allocate(size);
     }
 
     private final List<TestableCSW> writers = new ArrayList<>();
@@ -174,29 +178,30 @@
         final int bufferSize = 48;
         final int writeSize = 64;
         byte[] toWrite = new byte[writeSize];
+        MetadataCollector sstableMetadataCollector = new MetadataCollector(new ClusteringComparator(Arrays.<AbstractType<?>>asList(BytesType.instance)));
 
         try (SequentialWriter writer = new CompressedSequentialWriter(tempFile, offsetsFile.getPath(),
-                                                                                new CompressionParameters(LZ4Compressor.instance),new MetadataCollector(CellNames.fromAbstractType(UTF8Type.instance, false))))
+                                                                      CompressionParams.lz4(), sstableMetadataCollector))
         {
             // write bytes greather than buffer
             writer.write(toWrite);
             long flushedOffset = writer.getLastFlushOffset();
-            assertEquals(writeSize, writer.getFilePointer());
+            assertEquals(writeSize, writer.position());
             // mark thi position
-            FileMark pos = writer.mark();
+            DataPosition pos = writer.mark();
             // write another
             writer.write(toWrite);
             // another buffer should be flushed
             assertEquals(flushedOffset * 2, writer.getLastFlushOffset());
-            assertEquals(writeSize * 2, writer.getFilePointer());
+            assertEquals(writeSize * 2, writer.position());
             // reset writer
             writer.resetAndTruncate(pos);
             // current position and flushed size should be changed
-            assertEquals(writeSize, writer.getFilePointer());
+            assertEquals(writeSize, writer.position());
             assertEquals(flushedOffset, writer.getLastFlushOffset());
             // write another byte less than buffer
             writer.write(new byte[]{0});
-            assertEquals(writeSize + 1, writer.getFilePointer());
+            assertEquals(writeSize + 1, writer.position());
             // flush off set should not be increase
             assertEquals(flushedOffset, writer.getLastFlushOffset());
             writer.finish();
@@ -226,7 +231,10 @@
 
         private TestableCSW(File file, File offsetsFile) throws IOException
         {
-            this(file, offsetsFile, new CompressedSequentialWriter(file, offsetsFile.getPath(), new CompressionParameters(LZ4Compressor.instance, BUFFER_SIZE, new HashMap<String, String>()), new MetadataCollector(CellNames.fromAbstractType(UTF8Type.instance, false))));
+            this(file, offsetsFile, new CompressedSequentialWriter(file,
+                                                                   offsetsFile.getPath(),
+                                                                   CompressionParams.lz4(BUFFER_SIZE),
+                                                                   new MetadataCollector(new ClusteringComparator(UTF8Type.instance))));
         }
 
         private TestableCSW(File file, File offsetsFile, CompressedSequentialWriter sw) throws IOException
@@ -267,7 +275,6 @@
         protected void assertAborted() throws Exception
         {
             super.assertAborted();
-            Assert.assertFalse(offsetsFile.exists());
         }
 
         void cleanup()

diff --git a/test/unit/org/apache/cassandra/io/compress/CompressorTest.java b/test/unit/org/apache/cassandra/io/compress/CompressorTest.java
index 1d285ea4..1e24d03 100644
--- a/test/unit/org/apache/cassandra/io/compress/CompressorTest.java
+++ b/test/unit/org/apache/cassandra/io/compress/CompressorTest.java

@@ -33,6 +33,7 @@
 
 import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.BufferPool;
 
 public class CompressorTest
 {
@@ -86,7 +87,7 @@
 
         assertEquals(decompressedLength, len);
         assertArrayEquals(Arrays.copyOfRange(data, off, off + len),
-                Arrays.copyOfRange(restored, restoreOffset, restoreOffset + decompressedLength));
+                          Arrays.copyOfRange(restored, restoreOffset, restoreOffset + decompressedLength));
     }
 
     public void testArrayUncompress(byte[] data) throws IOException

diff --git a/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java
index dfb55a1..78964f4 100644
--- a/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java

@@ -21,20 +21,17 @@
 import java.io.File;
 import java.io.IOException;
 
-import org.junit.After;
 import org.junit.BeforeClass;
 
 import junit.framework.Assert;
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.concurrent.AbstractTransactionalTest;
 
 public class BigTableWriterTest extends AbstractTransactionalTest
@@ -49,9 +46,8 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD, 0, Int32Type.instance, AsciiType.instance, Int32Type.instance));
         cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD);
     }
 
@@ -64,49 +60,49 @@
     {
         final File file;
         final Descriptor descriptor;
-        final SSTableWriter writer;
+        final SSTableTxnWriter writer;
 
-        private TestableBTW() throws IOException
+        private TestableBTW()
         {
-            this(cfs.getTempSSTablePath(cfs.directories.getDirectoryForNewSSTables()));
+            this(cfs.getSSTablePath(cfs.getDirectories().getDirectoryForNewSSTables()));
         }
 
-        private TestableBTW(String file) throws IOException
+        private TestableBTW(String file)
         {
-            this(file, SSTableWriter.create(file, 0, 0));
+            this(file, SSTableTxnWriter.create(cfs, file, 0, 0, new SerializationHeader(true, cfs.metadata, cfs.metadata.partitionColumns(), EncodingStats.NO_STATS)));
         }
 
-        private TestableBTW(String file, SSTableWriter sw) throws IOException
+        private TestableBTW(String file, SSTableTxnWriter sw)
         {
             super(sw);
             this.file = new File(file);
             this.descriptor = Descriptor.fromFilename(file);
             this.writer = sw;
-            ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-            for (int i = 0; i < 10; i++)
-                cf.addColumn(Util.cellname(i), SSTableRewriterTest.random(0, 1000), 1);
+
             for (int i = 0; i < 100; i++)
-                writer.append(StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(i)), cf);
+            {
+                UpdateBuilder update = UpdateBuilder.create(cfs.metadata, i);
+                for (int j = 0; j < 10; j++)
+                    update.newRow(j).add("val", SSTableRewriterTest.random(0, 1000));
+                writer.append(update.build().unfilteredIterator());
+            }
         }
 
         protected void assertInProgress() throws Exception
         {
-            assertExists(Descriptor.Type.TEMP, Component.DATA, Component.PRIMARY_INDEX);
-            assertNotExists(Descriptor.Type.TEMP, Component.FILTER, Component.SUMMARY);
-            assertNotExists(Descriptor.Type.FINAL, Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
+            assertExists(Component.DATA, Component.PRIMARY_INDEX);
+            assertNotExists(Component.FILTER, Component.SUMMARY);
             Assert.assertTrue(file.length() > 0);
         }
 
         protected void assertPrepared() throws Exception
         {
-            assertNotExists(Descriptor.Type.TEMP, Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
-            assertExists(Descriptor.Type.FINAL, Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
+            assertExists(Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
         }
 
         protected void assertAborted() throws Exception
         {
-            assertNotExists(Descriptor.Type.TEMP, Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
-            assertNotExists(Descriptor.Type.FINAL, Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
+            assertNotExists(Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.SUMMARY);
             Assert.assertFalse(file.exists());
         }
 
@@ -115,15 +111,22 @@
             assertPrepared();
         }
 
-        private void assertExists(Descriptor.Type type, Component ... components)
+        @Override
+        protected boolean commitCanThrow()
         {
-            for (Component component : components)
-                Assert.assertTrue(new File(descriptor.asType(type).filenameFor(component)).exists());
+            return true;
         }
-        private void assertNotExists(Descriptor.Type type, Component ... components)
+
+        private void assertExists(Component ... components)
         {
             for (Component component : components)
-                Assert.assertFalse(type.toString() + " " + component.toString(), new File(descriptor.asType(type).filenameFor(component)).exists());
+                Assert.assertTrue(new File(descriptor.filenameFor(component)).exists());
+        }
+
+        private void assertNotExists(Component ... components)
+        {
+            for (Component component : components)
+                Assert.assertFalse(component.toString(), new File(descriptor.filenameFor(component)).exists());
         }
     }
 

diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java
index ad2d876..d38276f 100644
--- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java

@@ -20,18 +20,21 @@
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
-import java.util.Arrays;
 
 import com.google.common.io.Files;
-
-import org.junit.*;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.Test;
 
 import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.io.util.FileUtils;
 
 import static org.junit.Assert.assertEquals;
-
 import static org.junit.Assert.assertTrue;
 
 public class CQLSSTableWriterClientTest
@@ -42,6 +45,7 @@
     public void setUp()
     {
         this.testDirectory = Files.createTempDir();
+        Config.setClientMode(true);
     }
 
     @After
@@ -97,6 +101,5 @@
 
         File[] dataFiles = this.testDirectory.listFiles(filter);
         assertEquals(2, dataFiles.length);
-
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
index 5e2fffe..7d79036 100644
--- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java

@@ -23,6 +23,7 @@
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.UUID;
+import java.util.concurrent.ExecutionException;
 
 import com.google.common.collect.ImmutableMap;
 import com.google.common.io.Files;
@@ -32,6 +33,7 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -39,14 +41,15 @@
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.OutputHandler;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.fail;
 
 public class CQLSSTableWriterTest
 {
@@ -68,79 +71,65 @@
     @Test
     public void testUnsortedWriter() throws Exception
     {
-        String KS = "cql_keyspace";
-        String TABLE = "table1";
-
-        File tempdir = Files.createTempDir();
-        File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KS + File.separator + TABLE);
-        assert dataDir.mkdirs();
-
-        String schema = "CREATE TABLE cql_keyspace.table1 ("
-                      + "  k int PRIMARY KEY,"
-                      + "  v1 text,"
-                      + "  v2 int"
-                      + ")";
-        String insert = "INSERT INTO cql_keyspace.table1 (k, v1, v2) VALUES (?, ?, ?)";
-        CQLSSTableWriter writer = CQLSSTableWriter.builder()
-                                                  .inDirectory(dataDir)
-                                                  .forTable(schema)
-                                                  .withPartitioner(StorageService.getPartitioner())
-                                                  .using(insert).build();
-
-        writer.addRow(0, "test1", 24);
-        writer.addRow(1, "test2", null);
-        writer.addRow(2, "test3", 42);
-        writer.addRow(ImmutableMap.<String, Object>of("k", 3, "v2", 12));
-        writer.close();
-
-        SSTableLoader loader = new SSTableLoader(dataDir, new SSTableLoader.Client()
+        try (AutoCloseable switcher = Util.switchPartitioner(ByteOrderedPartitioner.instance))
         {
-            private String keyspace;
+            String KS = "cql_keyspace";
+            String TABLE = "table1";
 
-            public void init(String keyspace)
-            {
-                this.keyspace = keyspace;
-                for (Range<Token> range : StorageService.instance.getLocalRanges("cql_keyspace"))
-                    addRangeForEndpoint(range, FBUtilities.getBroadcastAddress());
-                setPartitioner(StorageService.getPartitioner());
-            }
+            File tempdir = Files.createTempDir();
+            File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KS + File.separator + TABLE);
+            assert dataDir.mkdirs();
 
-            public CFMetaData getTableMetadata(String tableName)
-            {
-                return Schema.instance.getCFMetaData(keyspace, tableName);
-            }
-        }, new OutputHandler.SystemOutput(false, false));
+            String schema = "CREATE TABLE cql_keyspace.table1 ("
+                          + "  k int PRIMARY KEY,"
+                          + "  v1 text,"
+                          + "  v2 int"
+                          + ")";
+            String insert = "INSERT INTO cql_keyspace.table1 (k, v1, v2) VALUES (?, ?, ?)";
+            CQLSSTableWriter writer = CQLSSTableWriter.builder()
+                                                      .inDirectory(dataDir)
+                                                      .forTable(schema)
+                                                      .using(insert).build();
 
-        loader.stream().get();
+            writer.addRow(0, "test1", 24);
+            writer.addRow(1, "test2", 44);
+            writer.addRow(2, "test3", 42);
+            writer.addRow(ImmutableMap.<String, Object>of("k", 3, "v2", 12));
 
-        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM cql_keyspace.table1;");
-        assertEquals(4, rs.size());
+            writer.close();
 
-        Iterator<UntypedResultSet.Row> iter = rs.iterator();
-        UntypedResultSet.Row row;
+            loadSSTables(dataDir, KS);
 
-        row = iter.next();
-        assertEquals(0, row.getInt("k"));
-        assertEquals("test1", row.getString("v1"));
-        assertEquals(24, row.getInt("v2"));
+            UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM cql_keyspace.table1;");
+            assertEquals(4, rs.size());
 
-        row = iter.next();
-        assertEquals(1, row.getInt("k"));
-        assertEquals("test2", row.getString("v1"));
-        assertFalse(row.has("v2"));
+            Iterator<UntypedResultSet.Row> iter = rs.iterator();
+            UntypedResultSet.Row row;
 
-        row = iter.next();
-        assertEquals(2, row.getInt("k"));
-        assertEquals("test3", row.getString("v1"));
-        assertEquals(42, row.getInt("v2"));
+            row = iter.next();
+            assertEquals(0, row.getInt("k"));
+            assertEquals("test1", row.getString("v1"));
+            assertEquals(24, row.getInt("v2"));
 
-        row = iter.next();
-        assertEquals(3, row.getInt("k"));
-        assertEquals(null, row.getBytes("v1")); // Using getBytes because we know it won't NPE
-        assertEquals(12, row.getInt("v2"));
+            row = iter.next();
+            assertEquals(1, row.getInt("k"));
+            assertEquals("test2", row.getString("v1"));
+            //assertFalse(row.has("v2"));
+            assertEquals(44, row.getInt("v2"));
+
+            row = iter.next();
+            assertEquals(2, row.getInt("k"));
+            assertEquals("test3", row.getString("v1"));
+            assertEquals(42, row.getInt("v2"));
+
+            row = iter.next();
+            assertEquals(3, row.getInt("k"));
+            assertEquals(null, row.getBytes("v1")); // Using getBytes because we know it won't NPE
+            assertEquals(12, row.getInt("v2"));
+        }
     }
 
-    @Test(expected = IllegalArgumentException.class)
+    @Test
     public void testForbidCounterUpdates() throws Exception
     {
         String KS = "cql_keyspace";
@@ -156,10 +145,18 @@
                         "  PRIMARY KEY (my_id)" +
                         ")";
         String insert = String.format("UPDATE cql_keyspace.counter1 SET my_counter = my_counter - ? WHERE my_id = ?");
-        CQLSSTableWriter.builder().inDirectory(dataDir)
-                        .forTable(schema)
-                        .withPartitioner(StorageService.instance.getPartitioner())
-                        .using(insert).build();
+        try
+        {
+            CQLSSTableWriter.builder().inDirectory(dataDir)
+                            .forTable(schema)
+                            .withPartitioner(Murmur3Partitioner.instance)
+                            .using(insert).build();
+            fail("Counter update statements should not be supported");
+        }
+        catch (IllegalArgumentException e)
+        {
+            assertEquals(e.getMessage(), "Counter update statements are not supported");
+        }
     }
 
     @Test
@@ -175,24 +172,21 @@
         File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KS + File.separator + TABLE);
         assert dataDir.mkdirs();
         String schema = "CREATE TABLE ks.test ("
-                      + "  k int,"
-                      + "  c int,"
-                      + "  v blob,"
-                      + "  PRIMARY KEY (k,c)"
+                      + "  k int PRIMARY KEY,"
+                      + "  v blob"
                       + ")";
-        String insert = "INSERT INTO ks.test (k, c, v) VALUES (?, ?, ?)";
+        String insert = "INSERT INTO ks.test (k, v) VALUES (?, ?)";
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .forTable(schema)
-                                                  .withPartitioner(StorageService.getPartitioner())
                                                   .using(insert)
                                                   .withBufferSizeInMB(1)
                                                   .build();
 
         ByteBuffer val = ByteBuffer.allocate(1024 * 1050);
 
-        writer.addRow(0, 0, val);
-        writer.addRow(0, 1, val);
+        writer.addRow(0, val);
+        writer.addRow(1, val);
         writer.close();
 
         FilenameFilter filterDataFiles = new FilenameFilter()
@@ -220,7 +214,6 @@
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(tempdir)
                                                   .forTable(schema)
-                                                  .withPartitioner(StorageService.instance.getPartitioner())
                                                   .using(insert)
                                                   .withBufferSizeInMB(1)
                                                   .build();
@@ -232,7 +225,102 @@
 
     }
 
+    @Test
+    public void testUpdateStatement() throws Exception
+    {
+        final String KS = "cql_keyspace6";
+        final String TABLE = "table6";
 
+        final String schema = "CREATE TABLE " + KS + "." + TABLE + " ("
+                              + "  k int,"
+                              + "  c1 int,"
+                              + "  c2 int,"
+                              + "  v text,"
+                              + "  PRIMARY KEY (k, c1, c2)"
+                              + ")";
+
+        File tempdir = Files.createTempDir();
+        File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KS + File.separator + TABLE);
+        assert dataDir.mkdirs();
+
+        CQLSSTableWriter writer = CQLSSTableWriter.builder()
+                                                  .inDirectory(dataDir)
+                                                  .forTable(schema)
+                                                  .using("UPDATE " + KS + "." + TABLE + " SET v = ? " +
+                                                         "WHERE k = ? AND c1 = ? AND c2 = ?")
+                                                  .build();
+
+        writer.addRow("a", 1, 2, 3);
+        writer.addRow("b", 4, 5, 6);
+        writer.addRow(null, 7, 8, 9);
+        writer.close();
+        loadSSTables(dataDir, KS);
+
+        UntypedResultSet resultSet = QueryProcessor.executeInternal("SELECT * FROM " + KS + "." + TABLE);
+        assertEquals(2, resultSet.size());
+
+        Iterator<UntypedResultSet.Row> iter = resultSet.iterator();
+        UntypedResultSet.Row r1 = iter.next();
+        assertEquals(1, r1.getInt("k"));
+        assertEquals(2, r1.getInt("c1"));
+        assertEquals(3, r1.getInt("c2"));
+        assertEquals("a", r1.getString("v"));
+        UntypedResultSet.Row r2 = iter.next();
+        assertEquals(4, r2.getInt("k"));
+        assertEquals(5, r2.getInt("c1"));
+        assertEquals(6, r2.getInt("c2"));
+        assertEquals("b", r2.getString("v"));
+        assertFalse(iter.hasNext());
+    }
+
+    @Test
+    public void testNativeFunctions() throws Exception
+    {
+        final String KS = "cql_keyspace7";
+        final String TABLE = "table7";
+
+        final String schema = "CREATE TABLE " + KS + "." + TABLE + " ("
+                              + "  k int,"
+                              + "  c1 int,"
+                              + "  c2 int,"
+                              + "  v blob,"
+                              + "  PRIMARY KEY (k, c1, c2)"
+                              + ")";
+
+        File tempdir = Files.createTempDir();
+        File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KS + File.separator + TABLE);
+        assert dataDir.mkdirs();
+
+        CQLSSTableWriter writer = CQLSSTableWriter.builder()
+                                                  .inDirectory(dataDir)
+                                                  .forTable(schema)
+                                                  .using("INSERT INTO " + KS + "." + TABLE + " (k, c1, c2, v) VALUES (?, ?, ?, textAsBlob(?))")
+                                                  .build();
+
+        writer.addRow(1, 2, 3, "abc");
+        writer.addRow(4, 5, 6, "efg");
+
+        writer.close();
+        loadSSTables(dataDir, KS);
+
+        UntypedResultSet resultSet = QueryProcessor.executeInternal("SELECT * FROM " + KS + "." + TABLE);
+        assertEquals(2, resultSet.size());
+
+        Iterator<UntypedResultSet.Row> iter = resultSet.iterator();
+        UntypedResultSet.Row r1 = iter.next();
+        assertEquals(1, r1.getInt("k"));
+        assertEquals(2, r1.getInt("c1"));
+        assertEquals(3, r1.getInt("c2"));
+        assertEquals(ByteBufferUtil.bytes("abc"), r1.getBytes("v"));
+
+        UntypedResultSet.Row r2 = iter.next();
+        assertEquals(4, r2.getInt("k"));
+        assertEquals(5, r2.getInt("c1"));
+        assertEquals(6, r2.getInt("c2"));
+        assertEquals(ByteBufferUtil.bytes("efg"), r2.getBytes("v"));
+
+        assertFalse(iter.hasNext());
+    }
 
     private static final int NUMBER_WRITES_IN_RUNNABLE = 10;
     private class WriterThread extends Thread
@@ -259,7 +347,6 @@
             CQLSSTableWriter writer = CQLSSTableWriter.builder()
                     .inDirectory(dataDir)
                     .forTable(schema)
-                    .withPartitioner(StorageService.instance.getPartitioner())
                     .using(insert).build();
 
             try
@@ -305,6 +392,14 @@
             }
         }
 
+        loadSSTables(dataDir, KS);
+
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM cql_keyspace2.table2;");
+        assertEquals(threads.length * NUMBER_WRITES_IN_RUNNABLE, rs.size());
+    }
+
+    private static void loadSSTables(File dataDir, String ks) throws ExecutionException, InterruptedException
+    {
         SSTableLoader loader = new SSTableLoader(dataDir, new SSTableLoader.Client()
         {
             private String keyspace;
@@ -312,20 +407,16 @@
             public void init(String keyspace)
             {
                 this.keyspace = keyspace;
-                for (Range<Token> range : StorageService.instance.getLocalRanges(KS))
+                for (Range<Token> range : StorageService.instance.getLocalRanges(ks))
                     addRangeForEndpoint(range, FBUtilities.getBroadcastAddress());
-                setPartitioner(StorageService.getPartitioner());
             }
 
-            public CFMetaData getTableMetadata(String tableName)
+            public CFMetaData getTableMetadata(String cfName)
             {
-                return Schema.instance.getCFMetaData(keyspace, tableName);
+                return Schema.instance.getCFMetaData(keyspace, cfName);
             }
         }, new OutputHandler.SystemOutput(false, false));
 
         loader.stream().get();
-
-        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM cql_keyspace2.table2;");
-        assertEquals(threads.length * NUMBER_WRITES_IN_RUNNABLE, rs.size());
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java b/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java
index 6354fc2..184d637 100644
--- a/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java

@@ -21,12 +21,12 @@
 import java.io.IOException;
 import java.util.UUID;
 
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.commons.lang3.StringUtils;
 import org.junit.Assert;
 import org.junit.Test;
 
 import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
@@ -76,30 +76,19 @@
     private void testFromFilenameFor(File dir)
     {
         // normal
-        checkFromFilename(new Descriptor(dir, ksname, cfname, 1, Descriptor.Type.FINAL), false);
+        checkFromFilename(new Descriptor(dir, ksname, cfname, 1), false);
         // skip component (for streaming lock file)
-        checkFromFilename(new Descriptor(dir, ksname, cfname, 2, Descriptor.Type.FINAL), true);
-        // tmp
-        checkFromFilename(new Descriptor(dir, ksname, cfname, 3, Descriptor.Type.TEMP), false);
+        checkFromFilename(new Descriptor(dir, ksname, cfname, 2), true);
+
         // secondary index
         String idxName = "myidx";
         File idxDir = new File(dir.getAbsolutePath() + File.separator + Directories.SECONDARY_INDEX_NAME_SEPARATOR + idxName);
-        checkFromFilename(new Descriptor(idxDir, ksname, cfname + Directories.SECONDARY_INDEX_NAME_SEPARATOR + idxName,
-                                         4, Descriptor.Type.FINAL), false);
-        // secondary index tmp
-        checkFromFilename(new Descriptor(idxDir, ksname, cfname + Directories.SECONDARY_INDEX_NAME_SEPARATOR + idxName,
-                                         5, Descriptor.Type.TEMP), false);
+        checkFromFilename(new Descriptor(idxDir, ksname, cfname + Directories.SECONDARY_INDEX_NAME_SEPARATOR + idxName, 4), false);
 
         // legacy version
-        checkFromFilename(new Descriptor("ja", dir, ksname, cfname, 1, Descriptor.Type.FINAL,
-                                         SSTableFormat.Type.LEGACY), false);
-        // legacy tmp
-        checkFromFilename(new Descriptor("ja", dir, ksname, cfname, 2, Descriptor.Type.TEMP, SSTableFormat.Type.LEGACY),
-                          false);
+        checkFromFilename(new Descriptor("ja", dir, ksname, cfname, 1, SSTableFormat.Type.LEGACY), false);
         // legacy secondary index
-        checkFromFilename(new Descriptor("ja", dir, ksname,
-                                         cfname + Directories.SECONDARY_INDEX_NAME_SEPARATOR + idxName, 3,
-                                         Descriptor.Type.FINAL, SSTableFormat.Type.LEGACY), false);
+        checkFromFilename(new Descriptor("ja", dir, ksname, cfname + Directories.SECONDARY_INDEX_NAME_SEPARATOR + idxName, 3, SSTableFormat.Type.LEGACY), false);
     }
 
     private void checkFromFilename(Descriptor original, boolean skipComponent)
@@ -114,7 +103,6 @@
         assertEquals(original.cfname, desc.cfname);
         assertEquals(original.version, desc.version);
         assertEquals(original.generation, desc.generation);
-        assertEquals(original.type, desc.type);
 
         if (skipComponent)
         {
@@ -131,8 +119,8 @@
     {
         // Descriptor should be equal when parent directory points to the same directory
         File dir = new File(".");
-        Descriptor desc1 = new Descriptor(dir, "ks", "cf", 1, Descriptor.Type.FINAL);
-        Descriptor desc2 = new Descriptor(dir.getAbsoluteFile(), "ks", "cf", 1, Descriptor.Type.FINAL);
+        Descriptor desc1 = new Descriptor(dir, "ks", "cf", 1);
+        Descriptor desc2 = new Descriptor(dir.getAbsoluteFile(), "ks", "cf", 1);
         assertEquals(desc1, desc2);
         assertEquals(desc1.hashCode(), desc2.hashCode());
     }
@@ -140,18 +128,18 @@
     @Test
     public void validateNames()
     {
-
+        // TODO tmp file name probably is not handled correctly after CASSANDRA-7066
         String[] names = {
              // old formats
              "system-schema_keyspaces-jb-1-Data.db",
-             "system-schema_keyspaces-tmp-jb-1-Data.db",
+             //"system-schema_keyspaces-tmp-jb-1-Data.db",
              "system-schema_keyspaces-ka-1-big-Data.db",
-             "system-schema_keyspaces-tmp-ka-1-big-Data.db",
+             //"system-schema_keyspaces-tmp-ka-1-big-Data.db",
              // 2ndary index
              "keyspace1-standard1.idx1-ka-1-big-Data.db",
              // new formats
              "la-1-big-Data.db",
-             "tmp-la-1-big-Data.db",
+             //"tmp-la-1-big-Data.db",
              // 2ndary index
              ".idx1" + File.separator + "la-1-big-Data.db",
         };

diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
index 59ef4c4..e6328de 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexHelperTest.java

@@ -19,33 +19,40 @@
 package org.apache.cassandra.io.sstable;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
-import static org.junit.Assert.*;
-
 import org.junit.Test;
 
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.utils.FBUtilities;
+
 import static org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
+import static org.junit.Assert.assertEquals;
 
 public class IndexHelperTest
 {
-    private static CellName cn(long l)
+
+    private static ClusteringComparator comp = new ClusteringComparator(Collections.<AbstractType<?>>singletonList(LongType.instance));
+    private static ClusteringPrefix cn(long l)
     {
-        return Util.cellname(l);
+        return Util.clustering(comp, l);
     }
 
     @Test
     public void testIndexHelper()
     {
-        List<IndexInfo> indexes = new ArrayList<IndexInfo>();
-        indexes.add(new IndexInfo(cn(0L), cn(5L), 0, 0));
-        indexes.add(new IndexInfo(cn(10L), cn(15L), 0, 0));
-        indexes.add(new IndexInfo(cn(20L), cn(25L), 0, 0));
+        DeletionTime deletionInfo = new DeletionTime(FBUtilities.timestampMicros(), FBUtilities.nowInSeconds());
 
-        CellNameType comp = new SimpleDenseCellNameType(IntegerType.instance);
+        List<IndexInfo> indexes = new ArrayList<>();
+        indexes.add(new IndexInfo(cn(0L), cn(5L), 0, 0, deletionInfo));
+        indexes.add(new IndexInfo(cn(10L), cn(15L), 0, 0, deletionInfo));
+        indexes.add(new IndexInfo(cn(20L), cn(25L), 0, 0, deletionInfo));
 
         assertEquals(0, IndexHelper.indexFor(cn(-1L), indexes, comp, false, -1));
         assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, false, -1));
@@ -55,16 +62,17 @@
         assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 0));
         assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 1));
         assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 2));
-        assertEquals(-1, IndexHelper.indexFor(cn(100L), indexes, comp, false, 3));
+        assertEquals(3, IndexHelper.indexFor(cn(100L), indexes, comp, false, 3));
 
         assertEquals(-1, IndexHelper.indexFor(cn(-1L), indexes, comp, true, -1));
-        assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, true, -1));
-        assertEquals(1, IndexHelper.indexFor(cn(17L), indexes, comp, true, -1));
-        assertEquals(2, IndexHelper.indexFor(cn(100L), indexes, comp, true, -1));
-        assertEquals(0, IndexHelper.indexFor(cn(100L), indexes, comp, true, 0));
-        assertEquals(1, IndexHelper.indexFor(cn(12L), indexes, comp, true, -1));
+        assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, true, 3));
+        assertEquals(0, IndexHelper.indexFor(cn(5L), indexes, comp, true, 2));
+        assertEquals(1, IndexHelper.indexFor(cn(17L), indexes, comp, true, 3));
+        assertEquals(2, IndexHelper.indexFor(cn(100L), indexes, comp, true, 3));
+        assertEquals(2, IndexHelper.indexFor(cn(100L), indexes, comp, true, 4));
+        assertEquals(1, IndexHelper.indexFor(cn(12L), indexes, comp, true, 3));
+        assertEquals(1, IndexHelper.indexFor(cn(12L), indexes, comp, true, 2));
         assertEquals(1, IndexHelper.indexFor(cn(100L), indexes, comp, true, 1));
         assertEquals(2, IndexHelper.indexFor(cn(100L), indexes, comp, true, 2));
-        assertEquals(-1, IndexHelper.indexFor(cn(100L), indexes, comp, true, 4));
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
index 385e88a..9eb63c5 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java

@@ -22,6 +22,7 @@
 import java.util.*;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Consumer;
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.Sets;
@@ -36,23 +37,26 @@
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.cache.CachingOptions;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.filter.QueryFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.metrics.CompactionMetrics;
 import org.apache.cassandra.metrics.RestorableMeter;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static com.google.common.collect.ImmutableMap.of;
 import static java.util.Arrays.asList;
+import static org.apache.cassandra.db.compaction.AntiCompactionTest.assertOnDiskState;
 import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
 import static org.apache.cassandra.io.sstable.IndexSummaryRedistribution.DOWNSAMPLE_THESHOLD;
 import static org.apache.cassandra.io.sstable.IndexSummaryRedistribution.UPSAMPLE_THRESHOLD;
@@ -60,6 +64,8 @@
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class IndexSummaryManagerTest
@@ -80,16 +86,15 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWiINTERVAL)
                                                 .minIndexInterval(8)
                                                 .maxIndexInterval(256)
-                                                .caching(CachingOptions.NONE),
+                                                .caching(CachingParams.CACHE_NOTHING),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDRACE)
                                                 .minIndexInterval(8)
                                                 .maxIndexInterval(256)
-                                                .caching(CachingOptions.NONE));
+                                                .caching(CachingParams.CACHE_NOTHING));
     }
 
     @Before
@@ -99,8 +104,8 @@
         String cfname = CF_STANDARDLOWiINTERVAL; // index interval of 8, no key caching
         Keyspace keyspace = Keyspace.open(ksname);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
-        originalMinIndexInterval = cfs.metadata.getMinIndexInterval();
-        originalMaxIndexInterval = cfs.metadata.getMaxIndexInterval();
+        originalMinIndexInterval = cfs.metadata.params.minIndexInterval;
+        originalMaxIndexInterval = cfs.metadata.params.maxIndexInterval;
         originalCapacity = IndexSummaryManager.instance.getMemoryPoolCapacityInMB();
     }
 
@@ -144,17 +149,15 @@
         return sstables;
     }
 
-    private void validateData(ColumnFamilyStore cfs, int numRows)
+    private void validateData(ColumnFamilyStore cfs, int numPartition)
     {
-        for (int i = 0; i < numRows; i++)
+        for (int i = 0; i < numPartition; i++)
         {
-            DecoratedKey key = Util.dk(String.format("%3d", i));
-            QueryFilter filter = QueryFilter.getIdentityFilter(key, cfs.getColumnFamilyName(), System.currentTimeMillis());
-            ColumnFamily row = cfs.getColumnFamily(filter);
-            assertNotNull(row);
-            Cell cell = row.getColumn(Util.cellname("column"));
+            Row row = Util.getOnlyRowUnfiltered(Util.cmd(cfs, String.format("%3d", i)).build());
+            Cell cell = row.getCell(cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes("val")));
             assertNotNull(cell);
             assertEquals(100, cell.value().array().length);
+
         }
     }
 
@@ -166,7 +169,7 @@
         }
     };
 
-    private void createSSTables(String ksname, String cfname, int numSSTables, int numRows)
+    private void createSSTables(String ksname, String cfname, int numSSTables, int numPartition)
     {
         Keyspace keyspace = Keyspace.open(ksname);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
@@ -177,12 +180,15 @@
         ByteBuffer value = ByteBuffer.wrap(new byte[100]);
         for (int sstable = 0; sstable < numSSTables; sstable++)
         {
-            for (int row = 0; row < numRows; row++)
+            for (int p = 0; p < numPartition; p++)
             {
-                DecoratedKey key = Util.dk(String.format("%3d", row));
-                Mutation rm = new Mutation(ksname, key.getKey());
-                rm.add(cfname, Util.cellname("column"), value, 0);
-                rm.applyUnsafe();
+
+                String key = String.format("%3d", p);
+                new RowUpdateBuilder(cfs.metadata, 0, key)
+                    .clustering("column")
+                    .add("val", value)
+                    .build()
+                    .applyUnsafe();
             }
             futures.add(cfs.forceFlush());
         }
@@ -197,8 +203,8 @@
                 throw new RuntimeException(e);
             }
         }
-        assertEquals(numSSTables, cfs.getSSTables().size());
-        validateData(cfs, numRows);
+        assertEquals(numSSTables, cfs.getLiveSSTables().size());
+        validateData(cfs, numPartition);
     }
 
     @Test
@@ -212,42 +218,42 @@
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
         for (SSTableReader sstable : sstables)
-            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(cfs.metadata.params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
 
         // double the min_index_interval
         cfs.metadata.minIndexInterval(originalMinIndexInterval * 2);
         IndexSummaryManager.instance.redistributeSummaries();
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
-            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
-            assertEquals(numRows / cfs.metadata.getMinIndexInterval(), sstable.getIndexSummarySize());
+            assertEquals(cfs.metadata.params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(numRows / cfs.metadata.params.minIndexInterval, sstable.getIndexSummarySize());
         }
 
         // return min_index_interval to its original value
         cfs.metadata.minIndexInterval(originalMinIndexInterval);
         IndexSummaryManager.instance.redistributeSummaries();
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
-            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
-            assertEquals(numRows / cfs.metadata.getMinIndexInterval(), sstable.getIndexSummarySize());
+            assertEquals(cfs.metadata.params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(numRows / cfs.metadata.params.minIndexInterval, sstable.getIndexSummarySize());
         }
 
         // halve the min_index_interval, but constrain the available space to exactly what we have now; as a result,
         // the summary shouldn't change
         cfs.metadata.minIndexInterval(originalMinIndexInterval / 2);
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         long summarySpace = sstable.getIndexSummaryOffHeapSize();
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(asList(sstable), OperationType.UNKNOWN))
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), summarySpace);
         }
 
-        sstable = cfs.getSSTables().iterator().next();
+        sstable = cfs.getLiveSSTables().iterator().next();
         assertEquals(originalMinIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
         assertEquals(numRows / originalMinIndexInterval, sstable.getIndexSummarySize());
 
@@ -258,7 +264,7 @@
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), (long) Math.ceil(summarySpace * 1.5));
         }
-        sstable = cfs.getSSTables().iterator().next();
+        sstable = cfs.getLiveSSTables().iterator().next();
         assertEquals(previousSize * 1.5, (double) sstable.getIndexSummarySize(), 1);
         assertEquals(previousInterval * (1.0 / 1.5), sstable.getEffectiveIndexInterval(), 0.001);
 
@@ -269,7 +275,7 @@
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), (long) Math.ceil(summarySpace / 2.0));
         }
-        sstable = cfs.getSSTables().iterator().next();
+        sstable = cfs.getLiveSSTables().iterator().next();
         assertEquals(originalMinIndexInterval * 2, sstable.getEffectiveIndexInterval(), 0.001);
         assertEquals(numRows / (originalMinIndexInterval * 2), sstable.getIndexSummarySize());
 
@@ -282,8 +288,8 @@
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), 10);
         }
-        sstable = cfs.getSSTables().iterator().next();
-        assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
+        sstable = cfs.getLiveSSTables().iterator().next();
+        assertEquals(cfs.metadata.params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
     }
 
     @Test
@@ -297,7 +303,7 @@
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -305,33 +311,33 @@
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), 10);
         }
-        sstables = new ArrayList<>(cfs.getSSTables());
+        sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
-            assertEquals(cfs.metadata.getMaxIndexInterval(), sstable.getEffectiveIndexInterval(), 0.01);
+            assertEquals(cfs.metadata.params.maxIndexInterval, sstable.getEffectiveIndexInterval(), 0.01);
 
         // halve the max_index_interval
-        cfs.metadata.maxIndexInterval(cfs.metadata.getMaxIndexInterval() / 2);
+        cfs.metadata.maxIndexInterval(cfs.metadata.params.maxIndexInterval / 2);
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN))
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), 1);
         }
-        sstables = new ArrayList<>(cfs.getSSTables());
+        sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
         {
-            assertEquals(cfs.metadata.getMaxIndexInterval(), sstable.getEffectiveIndexInterval(), 0.01);
-            assertEquals(numRows / cfs.metadata.getMaxIndexInterval(), sstable.getIndexSummarySize());
+            assertEquals(cfs.metadata.params.maxIndexInterval, sstable.getEffectiveIndexInterval(), 0.01);
+            assertEquals(numRows / cfs.metadata.params.maxIndexInterval, sstable.getIndexSummarySize());
         }
 
         // return max_index_interval to its original value
-        cfs.metadata.maxIndexInterval(cfs.metadata.getMaxIndexInterval() * 2);
+        cfs.metadata.maxIndexInterval(cfs.metadata.params.maxIndexInterval * 2);
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN))
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.cfId, txn), 1);
         }
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
-            assertEquals(cfs.metadata.getMaxIndexInterval(), sstable.getEffectiveIndexInterval(), 0.01);
-            assertEquals(numRows / cfs.metadata.getMaxIndexInterval(), sstable.getIndexSummarySize());
+            assertEquals(cfs.metadata.params.maxIndexInterval, sstable.getEffectiveIndexInterval(), 0.01);
+            assertEquals(numRows / cfs.metadata.params.maxIndexInterval, sstable.getIndexSummarySize());
         }
     }
 
@@ -346,9 +352,9 @@
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        int minSamplingLevel = (BASE_SAMPLING_LEVEL * cfs.metadata.getMinIndexInterval()) / cfs.metadata.getMaxIndexInterval();
+        int minSamplingLevel = (BASE_SAMPLING_LEVEL * cfs.metadata.params.minIndexInterval) / cfs.metadata.params.maxIndexInterval;
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -501,14 +507,17 @@
         int numRows = 256;
         for (int row = 0; row < numRows; row++)
         {
-            DecoratedKey key = Util.dk(String.valueOf(row));
-            Mutation rm = new Mutation(ksname, key.getKey());
-            rm.add(cfname, Util.cellname("column"), value, 0);
-            rm.applyUnsafe();
+            String key = String.format("%3d", row);
+            new RowUpdateBuilder(cfs.metadata, 0, key)
+            .clustering("column")
+            .add("val", value)
+            .build()
+            .applyUnsafe();
         }
+
         cfs.forceBlockingFlush();
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         assertEquals(1, sstables.size());
         SSTableReader original = sstables.get(0);
 
@@ -519,7 +528,7 @@
             {
                 sstable = sstable.cloneWithNewSummarySamplingLevel(cfs, samplingLevel);
                 assertEquals(samplingLevel, sstable.getIndexSummarySamplingLevel());
-                int expectedSize = (numRows * samplingLevel) / (sstable.metadata.getMinIndexInterval() * BASE_SAMPLING_LEVEL);
+                int expectedSize = (numRows * samplingLevel) / (sstable.metadata.params.minIndexInterval * BASE_SAMPLING_LEVEL);
                 assertEquals(expectedSize, sstable.getIndexSummarySize(), 1);
                 txn.update(sstable, true);
                 txn.checkpoint();
@@ -564,72 +573,56 @@
         {
             for (int row = 0; row < numRows; row++)
             {
-                DecoratedKey key = Util.dk(String.valueOf(row));
-                Mutation rm = new Mutation(ksname, key.getKey());
-                rm.add(cfname, Util.cellname("column"), value, 0);
-                rm.applyUnsafe();
+                String key = String.format("%3d", row);
+                new RowUpdateBuilder(cfs.metadata, 0, key)
+                .clustering("column")
+                .add("val", value)
+                .build()
+                .applyUnsafe();
             }
             cfs.forceBlockingFlush();
         }
 
-        assertTrue(manager.getAverageIndexInterval() >= cfs.metadata.getMinIndexInterval());
+        assertTrue(manager.getAverageIndexInterval() >= cfs.metadata.params.minIndexInterval);
         Map<String, Integer> intervals = manager.getIndexIntervals();
         for (Map.Entry<String, Integer> entry : intervals.entrySet())
             if (entry.getKey().contains(CF_STANDARDLOWiINTERVAL))
-                assertEquals(cfs.metadata.getMinIndexInterval(), entry.getValue(), 0.001);
+                assertEquals(cfs.metadata.params.minIndexInterval, entry.getValue(), 0.001);
 
         manager.setMemoryPoolCapacityInMB(0);
         manager.redistributeSummaries();
-        assertTrue(manager.getAverageIndexInterval() > cfs.metadata.getMinIndexInterval());
+        assertTrue(manager.getAverageIndexInterval() > cfs.metadata.params.minIndexInterval);
         intervals = manager.getIndexIntervals();
         for (Map.Entry<String, Integer> entry : intervals.entrySet())
         {
             if (entry.getKey().contains(CF_STANDARDLOWiINTERVAL))
-                assertTrue(entry.getValue() >= cfs.metadata.getMinIndexInterval());
+                assertTrue(entry.getValue() >= cfs.metadata.params.minIndexInterval);
         }
     }
 
     @Test
     public void testCancelIndex() throws Exception
     {
-        testCancelIndexHelper(new CancelFunction()
-        {
-            public void cancel(ColumnFamilyStore cfs)
-            {
-                CompactionManager.instance.stopCompaction("INDEX_SUMMARY");
-            }
-        });
+        testCancelIndexHelper((cfs) -> CompactionManager.instance.stopCompaction("INDEX_SUMMARY"));
     }
 
     @Test
     public void testCancelIndexInterrupt() throws Exception
     {
-        testCancelIndexHelper(new CancelFunction()
-        {
-            public void cancel(ColumnFamilyStore cfs)
-            {
-                CompactionManager.instance.interruptCompactionFor(Collections.singleton(cfs.metadata), false);
-            }
-        });
+        testCancelIndexHelper((cfs) -> CompactionManager.instance.interruptCompactionFor(Collections.singleton(cfs.metadata), false));
     }
 
-    private static interface CancelFunction
-    {
-        void cancel(ColumnFamilyStore cfs);
-    }
-
-    public void testCancelIndexHelper(CancelFunction cf) throws Exception
+    public void testCancelIndexHelper(Consumer<ColumnFamilyStore> cancelFunction) throws Exception
     {
         String ksname = KEYSPACE1;
         String cfname = CF_STANDARDLOWiINTERVAL; // index interval of 8, no key caching
         Keyspace keyspace = Keyspace.open(ksname);
         final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
         final int numSSTables = 4;
-        final int numTries = 4;
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        final List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        final List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -666,11 +659,11 @@
         });
         t.start();
         while (CompactionManager.instance.getActiveCompactions() == 0 && t.isAlive())
-            Thread.yield();
+            Thread.sleep(1);
         // to ensure that the stop condition check in IndexSummaryRedistribution::redistributeSummaries
         // is made *after* the halt request is made to the CompactionManager, don't allow the redistribution
         // to proceed until stopCompaction has been called.
-        cf.cancel(cfs);
+        cancelFunction.accept(cfs);
         // allows the redistribution to proceed
         barrier.countDown();
         t.join();
@@ -679,15 +672,53 @@
         assertTrue("Expected no active compactions", CompactionMetrics.getCompactions().isEmpty());
 
         Set<SSTableReader> beforeRedistributionSSTables = new HashSet<>(sstables);
-        Set<SSTableReader> afterCancelSSTables = new HashSet<>(cfs.getSSTables());
+        Set<SSTableReader> afterCancelSSTables = new HashSet<>(cfs.getLiveSSTables());
         Set<SSTableReader> disjoint = Sets.symmetricDifference(beforeRedistributionSSTables, afterCancelSSTables);
         assertTrue(String.format("Mismatched files before and after cancelling redistribution: %s",
                                  Joiner.on(",").join(disjoint)),
                    disjoint.isEmpty());
 
+        assertOnDiskState(cfs, numSSTables);
         validateData(cfs, numRows);
     }
 
+    @Test
+    public void testPauseIndexSummaryManager() throws Exception
+    {
+        String ksname = KEYSPACE1;
+        String cfname = CF_STANDARDLOWiINTERVAL; // index interval of 8, no key caching
+        Keyspace keyspace = Keyspace.open(ksname);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        int numSSTables = 4;
+        int numRows = 256;
+        createSSTables(ksname, cfname, numSSTables, numRows);
+
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        for (SSTableReader sstable : sstables)
+            sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
+
+        long singleSummaryOffHeapSpace = sstables.get(0).getIndexSummaryOffHeapSize();
+
+        // everything should get cut in half
+        assert sstables.size() == numSSTables;
+        try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN))
+        {
+            try (AutoCloseable toresume = CompactionManager.instance.pauseGlobalCompaction())
+            {
+                sstables = redistributeSummaries(Collections.emptyList(), of(cfs.metadata.cfId, txn), (singleSummaryOffHeapSpace * (numSSTables / 2)));
+                fail("The redistribution should fail - we got paused before adding to active compactions, but after marking compacting");
+            }
+        }
+        catch (CompactionInterruptedException e)
+        {
+            // expected
+        }
+        for (SSTableReader sstable : sstables)
+            assertEquals(BASE_SAMPLING_LEVEL, sstable.getIndexSummarySamplingLevel());
+        validateData(cfs, numRows);
+        assertOnDiskState(cfs, numSSTables);
+    }
+
     private static List<SSTableReader> redistributeSummaries(List<SSTableReader> compacting,
                                                              Map<UUID, LifecycleTransaction> transactions,
                                                              long memoryPoolBytes)

diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
index 77fd69a..31a57e1 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java

@@ -29,18 +29,15 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.cache.CachingOptions;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.metrics.RestorableMeter;
 import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.KeyspaceParams;
 
 import static org.junit.Assert.assertEquals;
 
@@ -54,12 +51,11 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD)
                                                 .minIndexInterval(8)
                                                 .maxIndexInterval(256)
-                                                .caching(CachingOptions.NONE));
+                                                .caching(CachingParams.CACHE_NOTHING));
     }
 
     @Test
@@ -75,31 +71,30 @@
         StorageMetrics.load.dec(load); // reset the load metric
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getSSTables());
+        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
         long oldSize = 0;
         for (SSTableReader sstable : sstables)
         {
-            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(cfs.metadata.params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
             oldSize += sstable.bytesOnDisk();
         }
 
         load = StorageMetrics.load.getCount();
-
         long others = load - oldSize; // Other SSTables size, e.g. schema and other system SSTables
 
-        int originalMinIndexInterval = cfs.metadata.getMinIndexInterval();
+        int originalMinIndexInterval = cfs.metadata.params.minIndexInterval;
         // double the min_index_interval
         cfs.metadata.minIndexInterval(originalMinIndexInterval * 2);
         IndexSummaryManager.instance.redistributeSummaries();
 
         long newSize = 0;
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
-            assertEquals(cfs.metadata.getMinIndexInterval(), sstable.getEffectiveIndexInterval(), 0.001);
-            assertEquals(numRows / cfs.metadata.getMinIndexInterval(), sstable.getIndexSummarySize());
+            assertEquals(cfs.metadata.params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
+            assertEquals(numRows / cfs.metadata.params.minIndexInterval, sstable.getIndexSummarySize());
             newSize += sstable.bytesOnDisk();
         }
         newSize += others;
@@ -122,10 +117,12 @@
         {
             for (int row = 0; row < numRows; row++)
             {
-                DecoratedKey key = Util.dk(String.format("%3d", row));
-                Mutation rm = new Mutation(ksname, key.getKey());
-                rm.add(cfname, Util.cellname("column"), value, 0);
-                rm.applyUnsafe();
+                String key = String.format("%3d", row);
+                new RowUpdateBuilder(cfs.metadata, 0, key)
+                .clustering("column")
+                .add("val", value)
+                .build()
+                .applyUnsafe();
             }
             futures.add(cfs.forceFlush());
         }
@@ -140,6 +137,6 @@
                 throw new RuntimeException(e);
             }
         }
-        assertEquals(numSSTables, cfs.getSSTables().size());
+        assertEquals(numSSTables, cfs.getLiveSSTables().size());
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java
index 7442a22..6f37d8f 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryTest.java

@@ -24,9 +24,11 @@
 import java.util.*;
 
 import com.google.common.collect.Lists;
+import org.junit.BeforeClass;
 import org.junit.Test;
+import org.junit.Assume;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner;
@@ -39,11 +41,139 @@
 import static org.apache.cassandra.io.sstable.IndexSummaryBuilder.downsample;
 import static org.apache.cassandra.io.sstable.IndexSummaryBuilder.entriesAtSamplingLevel;
 import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
-
 import static org.junit.Assert.*;
 
 public class IndexSummaryTest
 {
+    private final static Random random = new Random();
+    private final static IPartitioner partitioner = Util.testPartitioner();
+
+    @BeforeClass
+    public static void setup()
+    {
+        final long seed = System.nanoTime();
+        System.out.println("Using seed: " + seed);
+        random.setSeed(seed);
+    }
+
+    @Test
+    public void testIndexSummaryKeySizes() throws IOException
+    {
+        // On Circle CI we normally don't have enough off-heap memory for this test so ignore it
+        Assume.assumeTrue(System.getenv("CIRCLECI") == null);
+
+        testIndexSummaryProperties(32, 100);
+        testIndexSummaryProperties(64, 100);
+        testIndexSummaryProperties(100, 100);
+        testIndexSummaryProperties(1000, 100);
+        testIndexSummaryProperties(10000, 100);
+    }
+
+    private void testIndexSummaryProperties(int keySize, int numKeys) throws IOException
+    {
+        final int minIndexInterval = 1;
+        final List<DecoratedKey> keys = new ArrayList<>(numKeys);
+
+        try (IndexSummaryBuilder builder = new IndexSummaryBuilder(numKeys, minIndexInterval, BASE_SAMPLING_LEVEL))
+        {
+            for (int i = 0; i < numKeys; i++)
+            {
+                byte[] randomBytes = new byte[keySize];
+                random.nextBytes(randomBytes);
+                DecoratedKey key = partitioner.decorateKey(ByteBuffer.wrap(randomBytes));
+                keys.add(key);
+                builder.maybeAddEntry(key, i);
+            }
+
+            try(IndexSummary indexSummary = builder.build(partitioner))
+            {
+                assertEquals(numKeys, keys.size());
+                assertEquals(minIndexInterval, indexSummary.getMinIndexInterval());
+                assertEquals(numKeys, indexSummary.getMaxNumberOfEntries());
+                assertEquals(numKeys + 1, indexSummary.getEstimatedKeyCount());
+
+                for (int i = 0; i < numKeys; i++)
+                    assertEquals(keys.get(i).getKey(), ByteBuffer.wrap(indexSummary.getKey(i)));
+            }
+        }
+    }
+
+    /**
+     * Test an index summary whose total size is bigger than 2GB,
+     * the index summary builder should log an error but it should still
+     * create an index summary, albeit one that does not cover the entire sstable.
+     */
+    @Test
+    public void testLargeIndexSummary() throws IOException
+    {
+        // On Circle CI we normally don't have enough off-heap memory for this test so ignore it
+        Assume.assumeTrue(System.getenv("CIRCLECI") == null);
+
+        final int numKeys = 1000000;
+        final int keySize = 3000;
+        final int minIndexInterval = 1;
+
+        try (IndexSummaryBuilder builder = new IndexSummaryBuilder(numKeys, minIndexInterval, BASE_SAMPLING_LEVEL))
+        {
+            for (int i = 0; i < numKeys; i++)
+            {
+                byte[] randomBytes = new byte[keySize];
+                random.nextBytes(randomBytes);
+                DecoratedKey key = partitioner.decorateKey(ByteBuffer.wrap(randomBytes));
+                builder.maybeAddEntry(key, i);
+            }
+
+            try (IndexSummary indexSummary = builder.build(partitioner))
+            {
+                assertNotNull(indexSummary);
+                assertEquals(numKeys, indexSummary.getMaxNumberOfEntries());
+                assertEquals(numKeys + 1, indexSummary.getEstimatedKeyCount());
+            }
+        }
+    }
+
+    /**
+     * Test an index summary whose total size is bigger than 2GB,
+     * having updated IndexSummaryBuilder.defaultExpectedKeySize to match the size,
+     * the index summary should be downsampled automatically.
+     */
+    @Test
+    public void testLargeIndexSummaryWithExpectedSizeMatching() throws IOException
+    {
+        // On Circle CI we normally don't have enough off-heap memory for this test so ignore it
+        Assume.assumeTrue(System.getenv("CIRCLECI") == null);
+
+        final int numKeys = 1000000;
+        final int keySize = 3000;
+        final int minIndexInterval = 1;
+
+        long oldExpectedKeySize = IndexSummaryBuilder.defaultExpectedKeySize;
+        IndexSummaryBuilder.defaultExpectedKeySize = 3000;
+
+        try (IndexSummaryBuilder builder = new IndexSummaryBuilder(numKeys, minIndexInterval, BASE_SAMPLING_LEVEL))
+        {
+            for (int i = 0; i < numKeys; i++)
+            {
+                byte[] randomBytes = new byte[keySize];
+                random.nextBytes(randomBytes);
+                DecoratedKey key = partitioner.decorateKey(ByteBuffer.wrap(randomBytes));
+                builder.maybeAddEntry(key, i);
+            }
+
+            try (IndexSummary indexSummary = builder.build(partitioner))
+            {
+                assertNotNull(indexSummary);
+                assertEquals(minIndexInterval * 2, indexSummary.getMinIndexInterval());
+                assertEquals(numKeys / 2, indexSummary.getMaxNumberOfEntries());
+                assertEquals(numKeys + 2, indexSummary.getEstimatedKeyCount());
+            }
+        }
+        finally
+        {
+            IndexSummaryBuilder.defaultExpectedKeySize = oldExpectedKeySize;
+        }
+    }
+
     @Test
     public void testGetKey()
     {
@@ -82,7 +212,7 @@
         dos.writeUTF("JUNK");
         FileUtils.closeQuietly(dos);
         DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dos.toByteArray()));
-        IndexSummary is = IndexSummary.serializer.deserialize(dis, DatabaseDescriptor.getPartitioner(), false, 1, 1);
+        IndexSummary is = IndexSummary.serializer.deserialize(dis, partitioner, false, 1, 1);
         for (int i = 0; i < 100; i++)
             assertEquals(i, is.binarySearch(random.left.get(i)));
         // read the junk
@@ -126,13 +256,13 @@
             for (int i = 0; i < size; i++)
             {
                 UUID uuid = UUID.randomUUID();
-                DecoratedKey key = DatabaseDescriptor.getPartitioner().decorateKey(ByteBufferUtil.bytes(uuid));
+                DecoratedKey key = partitioner.decorateKey(ByteBufferUtil.bytes(uuid));
                 list.add(key);
             }
             Collections.sort(list);
             for (int i = 0; i < size; i++)
                 builder.maybeAddEntry(list.get(i), i);
-            IndexSummary summary = builder.build(DatabaseDescriptor.getPartitioner());
+            IndexSummary summary = builder.build(partitioner);
             return Pair.create(list, summary);
         }
         catch (IOException e)
@@ -185,7 +315,7 @@
         int downsamplingRound = 1;
         for (int samplingLevel = BASE_SAMPLING_LEVEL - 1; samplingLevel >= 1; samplingLevel--)
         {
-            try (IndexSummary downsampled = downsample(original, samplingLevel, 128, DatabaseDescriptor.getPartitioner());)
+            try (IndexSummary downsampled = downsample(original, samplingLevel, 128, partitioner);)
             {
                 assertEquals(entriesAtSamplingLevel(samplingLevel, original.getMaxNumberOfEntries()), downsampled.size());
 
@@ -210,7 +340,7 @@
         downsamplingRound = 1;
         for (int downsampleLevel = BASE_SAMPLING_LEVEL - 1; downsampleLevel >= 1; downsampleLevel--)
         {
-            IndexSummary downsampled = downsample(previous, downsampleLevel, 128, DatabaseDescriptor.getPartitioner());
+            IndexSummary downsampled = downsample(previous, downsampleLevel, 128, partitioner);
             if (previous != original)
                 previous.close();
             assertEquals(entriesAtSamplingLevel(downsampleLevel, original.getMaxNumberOfEntries()), downsampled.size());

diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
index f4b9617..510d12f 100644
--- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java

@@ -1,135 +1,621 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.io.sstable;
 
 import java.io.File;
-import java.nio.ByteBuffer;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.util.ArrayList;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
+import java.util.Random;
 
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.Version;
+import com.google.common.collect.Iterables;
+import org.junit.After;
+import org.junit.Assert;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ColumnFamily;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.DeletionInfo;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.composites.CellNameType;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.SinglePartitionSliceCommandTest;
+import org.apache.cassandra.db.compaction.Verifier;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.StreamPlan;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.apache.cassandra.cql3.CQLTester.assertRows;
+import static org.apache.cassandra.cql3.CQLTester.row;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
 /**
  * Tests backwards compatibility for SSTables
  */
 public class LegacySSTableTest
 {
-    public static final String LEGACY_SSTABLE_PROP = "legacy-sstable-root";
-    public static final String KSNAME = "Keyspace1";
-    public static final String CFNAME = "Standard1";
+    private static final Logger logger = LoggerFactory.getLogger(LegacySSTableTest.class);
 
-    public static Set<String> TEST_DATA;
+    public static final String LEGACY_SSTABLE_PROP = "legacy-sstable-root";
+
     public static File LEGACY_SSTABLE_ROOT;
 
+    /**
+     * When adding a new sstable version, add that one here.
+     * See {@link #testGenerateSstables()} to generate sstables.
+     * Take care on commit as you need to add the sstable files using {@code git add -f}
+     */
+    public static final String[] legacyVersions = {"mc", "mb", "ma", "la", "ka", "jb"};
+
+    // 1200 chars
+    static final String longString = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
+                                     "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";
+
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KSNAME,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KSNAME, CFNAME));
-        beforeClass();
-    }
-
-    public static void beforeClass()
-    {
+        StorageService.instance.initServer();
         Keyspace.setInitialized();
+        createKeyspace();
+        for (String legacyVersion : legacyVersions)
+        {
+            createTables(legacyVersion);
+        }
         String scp = System.getProperty(LEGACY_SSTABLE_PROP);
         assert scp != null;
         LEGACY_SSTABLE_ROOT = new File(scp).getAbsoluteFile();
         assert LEGACY_SSTABLE_ROOT.isDirectory();
+    }
 
-        TEST_DATA = new HashSet<String>();
-        for (int i = 100; i < 1000; ++i)
-            TEST_DATA.add(Integer.toString(i));
+    @After
+    public void tearDown()
+    {
+        for (String legacyVersion : legacyVersions)
+        {
+            truncateTables(legacyVersion);
+        }
     }
 
     /**
      * Get a descriptor for the legacy sstable at the given version.
      */
-    protected Descriptor getDescriptor(String ver)
+    protected Descriptor getDescriptor(String legacyVersion, String table)
     {
-        File directory = new File(LEGACY_SSTABLE_ROOT + File.separator + ver + File.separator + KSNAME);
-        return new Descriptor(ver, directory, KSNAME, CFNAME, 0, Descriptor.Type.FINAL, SSTableFormat.Type.LEGACY);
+        return new Descriptor(legacyVersion, getTableDir(legacyVersion, table), "legacy_tables", table, 1,
+                              BigFormat.instance.getVersion(legacyVersion).hasNewFileName()?
+                              SSTableFormat.Type.BIG :SSTableFormat.Type.LEGACY);
     }
 
-    /**
-     * Generates a test SSTable for use in this classes' tests. Uncomment and run against an older build
-     * and the output will be copied to a version subdirectory in 'LEGACY_SSTABLE_ROOT'
-     *
-    @Test
-    public void buildTestSSTable() throws IOException
-    {
-        // write the output in a version specific directory
-        Descriptor dest = getDescriptor(Descriptor.Version.current_version);
-        assert dest.directory.mkdirs() : "Could not create " + dest.directory + ". Might it already exist?";
-
-        SSTableReader ssTable = SSTableUtils.prepare().ks(KSNAME).cf(CFNAME).dest(dest).write(TEST_DATA);
-        assert ssTable.descriptor.generation == 0 :
-            "In order to create a generation 0 sstable, please run this test alone.";
-        System.out.println(">>> Wrote " + dest);
-    }
-    */
-
     @Test
-    public void testStreaming() throws Throwable
+    public void testLoadLegacyCqlTables() throws Exception
     {
-        StorageService.instance.initServer();
-
-        for (File version : LEGACY_SSTABLE_ROOT.listFiles())
-            if (Version.validate(version.getName()) && SSTableFormat.Type.LEGACY.info.getVersion(version.getName()).isCompatible())
-                testStreaming(version.getName());
+        for (String legacyVersion : legacyVersions)
+        {
+            logger.info("Loading legacy version: {}", legacyVersion);
+            loadLegacyTables(legacyVersion);
+            CacheService.instance.invalidateKeyCache();
+            long startCount = CacheService.instance.keyCache.size();
+            verifyReads(legacyVersion);
+            verifyCache(legacyVersion, startCount);
+        }
     }
 
-    private void testStreaming(String version) throws Exception
+    @Test
+    public void testStreamLegacyCqlTables() throws Exception
     {
-        SSTableReader sstable = SSTableReader.open(getDescriptor(version));
-        IPartitioner p = StorageService.getPartitioner();
+        for (String legacyVersion : legacyVersions)
+        {
+            streamLegacyTables(legacyVersion);
+            verifyReads(legacyVersion);
+        }
+    }
+    @Test
+    public void testReverseIterationOfLegacyIndexedSSTable() throws Exception
+    {
+        // During upgrades from 2.1 to 3.0, reverse queries can drop rows before upgradesstables is completed
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_indexed (" +
+                                       "  p int," +
+                                       "  c int," +
+                                       "  v1 int," +
+                                       "  v2 int," +
+                                       "  PRIMARY KEY(p, c)" +
+                                       ")");
+        loadLegacyTable("legacy_%s_indexed%s", "ka", "");
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * " +
+                                                             "FROM legacy_tables.legacy_ka_indexed " +
+                                                             "WHERE p=1 " +
+                                                             "ORDER BY c DESC");
+        assertEquals(5000, rs.size());
+    }
+
+    @Test
+    public void testReadingLegacyIndexedSSTableWithStaticColumns() throws Exception
+    {
+        // During upgrades from 2.1 to 3.0, reading from tables with static columns errors before upgradesstables
+        // is completed
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_indexed_static (" +
+                                       "  p int," +
+                                       "  c int," +
+                                       "  v1 int," +
+                                       "  v2 int," +
+                                       "  s1 int static," +
+                                       "  s2 int static," +
+                                       "  PRIMARY KEY(p, c)" +
+                                       ")");
+        loadLegacyTable("legacy_%s_indexed_static%s", "ka", "");
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * " +
+                                                             "FROM legacy_tables.legacy_ka_indexed_static " +
+                                                             "WHERE p=1 ");
+        assertEquals(5000, rs.size());
+    }
+
+    @Test
+    public void test14766() throws Exception
+    {
+        /*
+         * During upgrades from 2.1 to 3.0, reading from old sstables in reverse order could omit the very last row if the
+         * last indexed block had only two Unfiltered-s. See CASSANDRA-14766 for details.
+         *
+         * The sstable used here has two indexed blocks, with 2 cells/rows of ~500 bytes each, with column index interval of 1kb.
+         * Without the fix SELECT * returns 4 rows in ASC order, but only 3 rows in DESC order, omitting the last one.
+         */
+
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_14766 (pk int, ck int, value text, PRIMARY KEY (pk, ck));");
+        loadLegacyTable("legacy_%s_14766%s", "ka", "");
+
+        UntypedResultSet rs;
+
+        // read all rows in ASC order, expect all 4 to be returned
+        rs = QueryProcessor.executeInternal("SELECT * FROM legacy_tables.legacy_ka_14766 WHERE pk = 0 ORDER BY ck ASC;");
+        assertEquals(4, rs.size());
+
+        // read all rows in DESC order, expect all 4 to be returned
+        rs = QueryProcessor.executeInternal("SELECT * FROM legacy_tables.legacy_ka_14766 WHERE pk = 0 ORDER BY ck DESC;");
+        assertEquals(4, rs.size());
+    }
+
+    @Test
+    public void test14803() throws Exception
+    {
+        /*
+         * During upgrades from 2.1 to 3.0, reading from old sstables in reverse order could return early if the sstable
+         * reverse iterator encounters an indexed block that only covers a single row, and that row starts in the next
+         * indexed block.
+         */
+
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_14803 (k int, c int, v1 blob, v2 blob, PRIMARY KEY (k, c));");
+        loadLegacyTable("legacy_%s_14803%s", "ka", "");
+
+        UntypedResultSet forward = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM legacy_tables.legacy_ka_14803 WHERE k=100"));
+        UntypedResultSet reverse = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM legacy_tables.legacy_ka_14803 WHERE k=100 ORDER BY c DESC"));
+
+        logger.info("{} - {}", forward.size(), reverse.size());
+        Assert.assertFalse(forward.isEmpty());
+        assertEquals(forward.size(), reverse.size());
+    }
+
+    @Test
+    public void test14873() throws Exception
+    {
+        /*
+         * When reading 2.1 sstables in 3.0 in reverse order it's possible to wrongly return an empty result set if the
+         * partition being read has a static row, and the read is performed backwards.
+         */
+
+        /*
+         * Contents of the SSTable (column_index_size_in_kb: 1) below:
+         *
+         * insert into legacy_tables.legacy_ka_14873 (pkc, sc)     values (0, 0);
+         * insert into legacy_tables.legacy_ka_14873 (pkc, cc, rc) values (0, 5, '5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555');
+         * insert into legacy_tables.legacy_ka_14873 (pkc, cc, rc) values (0, 4, '4444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444444');
+         * insert into legacy_tables.legacy_ka_14873 (pkc, cc, rc) values (0, 3, '3333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333');
+         * insert into legacy_tables.legacy_ka_14873 (pkc, cc, rc) values (0, 2, '2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222');
+         * insert into legacy_tables.legacy_ka_14873 (pkc, cc, rc) values (0, 1, '1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111');
+         */
+
+        String ddl =
+            "CREATE TABLE legacy_tables.legacy_ka_14873 ("
+            + "pkc int, cc int, sc int static, rc text, PRIMARY KEY (pkc, cc)"
+            + ") WITH CLUSTERING ORDER BY (cc DESC) AND compaction = {'enabled' : 'false', 'class' : 'LeveledCompactionStrategy'};";
+        QueryProcessor.executeInternal(ddl);
+        loadLegacyTable("legacy_%s_14873%s", "ka", "");
+
+        UntypedResultSet forward =
+            QueryProcessor.executeOnceInternal(
+                String.format("SELECT * FROM legacy_tables.legacy_ka_14873 WHERE pkc = 0 AND cc > 0 ORDER BY cc DESC;"));
+
+        UntypedResultSet reverse =
+            QueryProcessor.executeOnceInternal(
+                String.format("SELECT * FROM legacy_tables.legacy_ka_14873 WHERE pkc = 0 AND cc > 0 ORDER BY cc ASC;"));
+
+        assertEquals(5, forward.size());
+        assertEquals(5, reverse.size());
+    }
+
+    @Test
+    public void testMultiBlockRangeTombstones() throws Exception
+    {
+        /**
+         * During upgrades from 2.1 to 3.0, reading old sstables in reverse order would generate invalid sequences of
+         * range tombstone bounds if their range tombstones spanned multiple column index blocks. The read would fail
+         * in different ways depending on whether the 2.1 tables were produced by a flush or a compaction.
+         */
+
+        String version = "ka";
+        for (String tableFmt : new String[]{"legacy_%s_compacted_multi_block_rt%s", "legacy_%s_flushed_multi_block_rt%s"})
+        {
+            String table = String.format(tableFmt, version, "");
+            QueryProcessor.executeOnceInternal(String.format("CREATE TABLE legacy_tables.%s " +
+                                                             "(k int, c1 int, c2 int, v1 blob, v2 blob, " +
+                                                             "PRIMARY KEY (k, c1, c2))", table));
+            loadLegacyTable(tableFmt, version, "");
+
+            UntypedResultSet forward = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM legacy_tables.%s WHERE k=100", table));
+            UntypedResultSet reverse = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM legacy_tables.%s WHERE k=100 ORDER BY c1 DESC, c2 DESC", table));
+
+            Assert.assertFalse(forward.isEmpty());
+            assertEquals(table, forward.size(), reverse.size());
+        }
+    }
+
+
+    @Test
+    public void testInaccurateSSTableMinMax() throws Exception
+    {
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_mc_inaccurate_min_max (k int, c1 int, c2 int, c3 int, v int, primary key (k, c1, c2, c3))");
+        loadLegacyTable("legacy_%s_inaccurate_min_max%s", "mc", "");
+
+        /*
+         sstable has the following mutations:
+            INSERT INTO legacy_tables.legacy_mc_inaccurate_min_max (k, c1, c2, c3, v) VALUES (100, 4, 4, 4, 4)
+            DELETE FROM legacy_tables.legacy_mc_inaccurate_min_max WHERE k=100 AND c1<3
+         */
+
+        String query = "SELECT * FROM legacy_tables.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1";
+        List<Unfiltered> unfiltereds = SinglePartitionSliceCommandTest.getUnfilteredsFromSinglePartition(query);
+        assertEquals(2, unfiltereds.size());
+        Assert.assertTrue(unfiltereds.get(0).isRangeTombstoneMarker());
+        Assert.assertTrue(((RangeTombstoneMarker) unfiltereds.get(0)).isOpen(false));
+        Assert.assertTrue(unfiltereds.get(1).isRangeTombstoneMarker());
+        Assert.assertTrue(((RangeTombstoneMarker) unfiltereds.get(1)).isClose(false));
+    }
+
+    @Test
+    public void testVerifyOldSSTables() throws Exception
+    {
+        for (String legacyVersion : legacyVersions)
+        {
+            loadLegacyTables(legacyVersion);
+            ColumnFamilyStore cfs = Keyspace.open("legacy_tables").getColumnFamilyStore(String.format("legacy_%s_simple", legacyVersion));
+            for (SSTableReader sstable : cfs.getLiveSSTables())
+            {
+                try (Verifier verifier = new Verifier(cfs, sstable, false))
+                {
+                    verifier.verify(true);
+                }
+            }
+        }
+    }
+
+    @Test
+    public void test14912() throws Exception
+    {
+        /*
+         * When reading 2.1 sstables in 3.0, collection tombstones need to be checked against
+         * the dropped columns stored in table metadata. Failure to do so can result in unreadable
+         * rows if a column with the same name but incompatible type has subsequently been added.
+         *
+         * The original (i.e. pre-any ALTER statements) table definition for this test is:
+         * CREATE TABLE legacy_tables.legacy_ka_14912 (k int PRIMARY KEY, v1 set<text>, v2 text);
+         *
+         * The SSTable loaded emulates data being written before the table is ALTERed and contains:
+         *
+         * insert into legacy_tables.legacy_ka_14912 (k, v1, v2) values (0, {}, 'abc') USING TIMESTAMP 1543244999672280;
+         * insert into legacy_tables.legacy_ka_14912 (k, v1, v2) values (1, {'abc'}, 'abc') USING TIMESTAMP 1543244999672280;
+         *
+         * The timestamps of the (generated) collection tombstones are 1543244999672279, e.g. the <TIMESTAMP of the mutation> - 1
+         */
+
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_14912 (k int PRIMARY KEY, v1 text, v2 text)");
+        loadLegacyTable("legacy_%s_14912%s", "ka", "");
+        CFMetaData cfm = Keyspace.open("legacy_tables").getColumnFamilyStore("legacy_ka_14912").metadata;
+        ColumnDefinition columnToDrop;
+
+        /*
+         * This first variant simulates the original v1 set<text> column being dropped
+         * then re-added with the text type:
+         * CREATE TABLE legacy_tables.legacy_ka_14912 (k int PRIMARY KEY, v1 set<text>, v2 text);
+         * INSERT INTO legacy_tables.legacy)ka_14912 (k, v1, v2)...
+         * ALTER TABLE legacy_tables.legacy_ka_14912 DROP v1;
+         * ALTER TABLE legacy_tables.legacy_ka_14912 ADD v1 text;
+         */
+        columnToDrop = ColumnDefinition.regularDef(cfm,
+                                                   UTF8Type.instance.fromString("v1"),
+                                                   SetType.getInstance(UTF8Type.instance, true));
+        cfm.recordColumnDrop(columnToDrop, 1543244999700000L);
+        assertExpectedRowsWithDroppedCollection(true);
+        // repeat the query, but simulate clock drift by shifting the recorded
+        // drop time forward so that it occurs before the collection timestamp
+        cfm.recordColumnDrop(columnToDrop, 1543244999600000L);
+        assertExpectedRowsWithDroppedCollection(false);
+
+        /*
+         * This second test simulates the original v1 set<text> column being dropped
+         * then re-added with some other, non-collection type (overwriting the dropped
+         * columns record), then dropping and re-adding again as text type:
+         * CREATE TABLE legacy_tables.legacy_ka_14912 (k int PRIMARY KEY, v1 set<text>, v2 text);
+         * INSERT INTO legacy_tables.legacy_ka_14912 (k, v1, v2)...
+         * ALTER TABLE legacy_tables.legacy_ka_14912 DROP v1;
+         * ALTER TABLE legacy_tables.legacy_ka_14912 ADD v1 blob;
+         * ALTER TABLE legacy_tables.legacy_ka_14912 DROP v1;
+         * ALTER TABLE legacy_tables.legacy_ka_14912 ADD v1 text;
+         */
+        columnToDrop = ColumnDefinition.regularDef(cfm,
+                                                   UTF8Type.instance.fromString("v1"),
+                                                   BytesType.instance);
+        cfm.recordColumnDrop(columnToDrop, 1543244999700000L);
+        assertExpectedRowsWithDroppedCollection(true);
+        // repeat the query, but simulate clock drift by shifting the recorded
+        // drop time forward so that it occurs before the collection timestamp
+        cfm.recordColumnDrop(columnToDrop, 1543244999600000L);
+        assertExpectedRowsWithDroppedCollection(false);
+    }
+
+    @Test
+    public void testReadingLegacyTablesWithIllegalCellNames() throws Exception {
+        /**
+         * The sstable can be generated externally with SSTableSimpleUnsortedWriter:
+         *
+         * [
+         * {"key": "1",
+         *  "cells": [["a:aa:c1","61",1555000750634000],
+         *            ["a:aa:c2","6161",1555000750634000],
+         *            ["a:aa:pk","00000001",1555000750634000],
+         *            ["a:aa:v1","aaa",1555000750634000]]},
+         * {"key": "2",
+         *  "cells": [["b:bb:c1","62",1555000750634000],
+         *            ["b:bb:c2","6262",1555000750634000],
+         *            ["b:bb:pk","00000002",1555000750634000],
+         *            ["b:bb:v1","bbb",1555000750634000]]}
+         * ]
+         * and an extra sstable with only the invalid cell name
+         * [
+         * {"key": "3",
+         *  "cells": [["a:aa:pk","68656c6c6f30",1570466358949]]}
+         * ]
+         *
+         */
+        String table = "legacy_ka_with_illegal_cell_names";
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables." + table + " (" +
+                                       " pk int," +
+                                       " c1 text," +
+                                       " c2 text," +
+                                       " v1 text," +
+                                       " PRIMARY KEY(pk, c1, c2))");
+        loadLegacyTable("legacy_%s_with_illegal_cell_names%s", "ka", "");
+        UntypedResultSet results =
+            QueryProcessor.executeOnceInternal("SELECT * FROM legacy_tables."+table);
+
+        assertRows(results, row(1, "a", "aa", "aaa"), row(2, "b", "bb", "bbb"), row (3, "a", "aa", null));
+        Keyspace.open("legacy_tables").getColumnFamilyStore(table).forceMajorCompaction();
+    }
+
+    @Test
+    public void testReadingLegacyTablesWithIllegalCellNamesPKLI() throws Exception {
+        /**
+         *
+         * Makes sure we grab the correct PKLI when we have illegal columns
+         *
+         * sstable looks like this:
+         * [
+         * {"key": "3",
+         *  "cells": [["a:aa:","",100],
+         *            ["a:aa:pk","6d656570",200]]}
+         * ]
+         */
+        /*
+        this generates the stable on 2.1:
+        CFMetaData metadata = CFMetaData.compile("create table legacy_tables.legacy_ka_with_illegal_cell_names_2 (pk int, c1 text, c2 text, v1 text, primary key (pk, c1, c2))", "legacy_tables");
+        try (SSTableSimpleUnsortedWriter writer = new SSTableSimpleUnsortedWriter(new File("/tmp/sstable21"),
+                                                                                  metadata,
+                                                                                  new ByteOrderedPartitioner(),
+                                                                                  10))
+        {
+            writer.newRow(bytes(3));
+            writer.addColumn(new BufferCell(Util.cellname("a", "aa", ""), bytes(""), 100));
+            writer.addColumn(new BufferCell(Util.cellname("a", "aa", "pk"), bytes("meep"), 200));
+        }
+        */
+        String table = "legacy_ka_with_illegal_cell_names_2";
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables." + table + " (" +
+                                       " pk int," +
+                                       " c1 text," +
+                                       " c2 text," +
+                                       " v1 text," +
+                                       " PRIMARY KEY(pk, c1, c2))");
+        loadLegacyTable("legacy_%s_with_illegal_cell_names_2%s", "ka", "");
+        ColumnFamilyStore cfs = Keyspace.open("legacy_tables").getColumnFamilyStore(table);
+        assertEquals(1, Iterables.size(cfs.getSSTables(SSTableSet.CANONICAL)));
+        cfs.forceMajorCompaction();
+        assertEquals(1, Iterables.size(cfs.getSSTables(SSTableSet.CANONICAL)));
+        SSTableReader sstable = Iterables.getFirst(cfs.getSSTables(SSTableSet.CANONICAL), null);
+        LivenessInfo livenessInfo = null;
+        try (ISSTableScanner scanner = sstable.getScanner())
+        {
+            while (scanner.hasNext())
+            {
+                try (UnfilteredRowIterator iter = scanner.next())
+                {
+                    while (iter.hasNext())
+                    {
+                        Unfiltered uf = iter.next();
+                        livenessInfo = ((Row)uf).primaryKeyLivenessInfo();
+                    }
+                }
+            }
+        }
+        assertNotNull(livenessInfo);
+        assertEquals(100, livenessInfo.timestamp());
+    }
+
+    @Test
+    public void testReadingIndexedLegacyTablesWithIllegalCellNames() throws Exception {
+        /**
+         * The sstable can be generated externally with SSTableSimpleUnsortedWriter:
+         * column_index_size_in_kb: 1
+         * [
+         *   {"key": "key",
+         *    "cells": [
+         *               ["00000:000000:a","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0],
+         *               ["00000:000000:b","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00000:000000:c","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00000:000000:z","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00001:000001:a","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0],
+         *               ["00001:000001:b","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00001:000001:c","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00001:000001:z","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               .
+         *               .
+         *               .
+         *               ["00010:000010:a","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0],
+         *               ["00010:000010:b","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00010:000010:c","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *               ["00010:000010:z","00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",0]
+         *           ]
+         *   }
+         * ]
+         * Each row in the partition contains only 1 valid cell. The ones with the column name components 'a', 'b' & 'z' are illegal as they refer to PRIMARY KEY
+         * columns, but SSTables such as this can be generated with offline tools and loaded via SSTableLoader or nodetool refresh (see CASSANDRA-15086) (see
+         * CASSANDRA-15086) Only 'c' is a valid REGULAR column in the table schema.
+         * In the initial fix for CASSANDRA-15086, the bytes read by OldFormatDeserializer for these invalid cells are not correctly accounted for, causing
+         * ReverseIndexedReader to assert that the end of a block has been reached earlier than it actually has, which in turn causes rows to be incorrectly
+         * ommitted from the results.
+         *
+         * This sstable has been crafted to hit a further potential error condition. Rows 00001:00001 and 00008:00008 interact with the index block boundaries
+         * in a very specific way; for both of these rows, the (illegal) cells 'a' & 'b', along with the valid 'c' cell are at the end of an index block, but
+         * the 'z' cell is over the boundary, in the following block. We need to ensure that the bytes consumed for the 'z' cell are properly accounted for and
+         * not counted toward those for the next row on disk.
+         */
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_with_illegal_cell_names_indexed (" +
+                                       " a text," +
+                                       " b text," +
+                                       " z text," +
+                                       " c text," +
+                                       " PRIMARY KEY(a, b, z))");
+        loadLegacyTable("legacy_%s_with_illegal_cell_names_indexed%s", "ka", "");
+        String queryForward = "SELECT * FROM legacy_tables.legacy_ka_with_illegal_cell_names_indexed WHERE a = 'key'";
+        String queryReverse = queryForward + " ORDER BY b DESC, z DESC";
+
+        List<String> forward = new ArrayList<>();
+        QueryProcessor.executeOnceInternal(queryForward).forEach(r -> forward.add(r.getString("b") + ":" +  r.getString("z")));
+
+        List<String> reverse = new ArrayList<>();
+        QueryProcessor.executeOnceInternal(queryReverse).forEach(r -> reverse.add(r.getString("b") + ":" +  r.getString("z")));
+
+        assertEquals(11, reverse.size());
+        assertEquals(11, forward.size());
+        for (int i=0; i < 11; i++)
+            assertEquals(forward.get(i), reverse.get(10 - i));
+    }
+
+    private void assertExpectedRowsWithDroppedCollection(boolean droppedCheckSuccessful)
+    {
+        for (int i=0; i<=1; i++)
+        {
+            UntypedResultSet rows =
+                QueryProcessor.executeOnceInternal(
+                    String.format("SELECT * FROM legacy_tables.legacy_ka_14912 WHERE k = %s;", i));
+            assertEquals(1, rows.size());
+            UntypedResultSet.Row row = rows.one();
+
+            // If the best-effort attempt to filter dropped columns was successful, then the row
+            // should not contain the v1 column at all. Likewise, if no column data was written,
+            // only a tombstone, then no v1 column should be present.
+            // However, if collection data was written (i.e. where k=1), then if the dropped column
+            // check didn't filter the legacy cells, we should expect an empty column value as the
+            // legacy collection tombstone won't cover it and the dropped column check doesn't filter
+            // it.
+            if (droppedCheckSuccessful || i == 0)
+                Assert.assertFalse(row.has("v1"));
+            else
+                assertEquals("", row.getString("v1"));
+
+            assertEquals("abc", row.getString("v2"));
+        }
+    }
+
+    private void streamLegacyTables(String legacyVersion) throws Exception
+    {
+        for (int compact = 0; compact <= 1; compact++)
+        {
+            logger.info("Streaming legacy version {}{}", legacyVersion, getCompactNameSuffix(compact));
+            streamLegacyTable("legacy_%s_simple%s", legacyVersion, getCompactNameSuffix(compact));
+            streamLegacyTable("legacy_%s_simple_counter%s", legacyVersion, getCompactNameSuffix(compact));
+            streamLegacyTable("legacy_%s_clust%s", legacyVersion, getCompactNameSuffix(compact));
+            streamLegacyTable("legacy_%s_clust_counter%s", legacyVersion, getCompactNameSuffix(compact));
+        }
+    }
+
+    private void streamLegacyTable(String tablePattern, String legacyVersion, String compactNameSuffix) throws Exception
+    {
+        String table = String.format(tablePattern, legacyVersion, compactNameSuffix);
+        SSTableReader sstable = SSTableReader.open(getDescriptor(legacyVersion, table));
+        IPartitioner p = sstable.getPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         ranges.add(new Range<>(p.getMinimumToken(), p.getToken(ByteBufferUtil.bytes("100"))));
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("100")), p.getMinimumToken()));
@@ -139,62 +625,305 @@
                                                                sstable.estimatedKeysForRanges(ranges), sstable.getSSTableMetadata().repairedAt));
         new StreamPlan("LegacyStreamingTest").transferFiles(FBUtilities.getBroadcastAddress(), details)
                                              .execute().get();
-
-        ColumnFamilyStore cfs = Keyspace.open(KSNAME).getColumnFamilyStore(CFNAME);
-        assert cfs.getSSTables().size() == 1;
-        sstable = cfs.getSSTables().iterator().next();
-        CellNameType type = sstable.metadata.comparator;
-        for (String keystring : TEST_DATA)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(keystring);
-            OnDiskAtomIterator iter = sstable.iterator(Util.dk(key), FBUtilities.singleton(Util.cellname(key), type));
-            ColumnFamily cf = iter.getColumnFamily();
-
-            // check not deleted (CASSANDRA-6527)
-            assert cf.deletionInfo().equals(DeletionInfo.live());
-            assert iter.next().name().toByteBuffer().equals(key);
-        }
-        sstable.selfRef().release();
     }
 
+    private static void loadLegacyTables(String legacyVersion) throws Exception
+    {
+        for (int compact = 0; compact <= 1; compact++)
+        {
+            logger.info("Preparing legacy version {}{}", legacyVersion, getCompactNameSuffix(compact));
+            loadLegacyTable("legacy_%s_simple%s", legacyVersion, getCompactNameSuffix(compact));
+            loadLegacyTable("legacy_%s_simple_counter%s", legacyVersion, getCompactNameSuffix(compact));
+            loadLegacyTable("legacy_%s_clust%s", legacyVersion, getCompactNameSuffix(compact));
+            loadLegacyTable("legacy_%s_clust_counter%s", legacyVersion, getCompactNameSuffix(compact));
+        }
+    }
+
+    private static void verifyCache(String legacyVersion, long startCount) throws InterruptedException, java.util.concurrent.ExecutionException
+    {
+        //For https://issues.apache.org/jira/browse/CASSANDRA-10778
+        //Validate whether the key cache successfully saves in the presence of old keys as
+        //well as loads the correct number of keys
+        long endCount = CacheService.instance.keyCache.size();
+        Assert.assertTrue(endCount > startCount);
+        CacheService.instance.keyCache.submitWrite(Integer.MAX_VALUE).get();
+        CacheService.instance.invalidateKeyCache();
+        assertEquals(startCount, CacheService.instance.keyCache.size());
+        CacheService.instance.keyCache.loadSaved();
+        if (BigFormat.instance.getVersion(legacyVersion).storeRows())
+            assertEquals(endCount, CacheService.instance.keyCache.size());
+        else
+            assertEquals(startCount, CacheService.instance.keyCache.size());
+    }
+
+    private static void verifyReads(String legacyVersion)
+    {
+        for (int compact = 0; compact <= 1; compact++)
+        {
+            for (int ck = 0; ck < 50; ck++)
+            {
+                String ckValue = Integer.toString(ck) + longString;
+                for (int pk = 0; pk < 5; pk++)
+                {
+                    logger.debug("for pk={} ck={}", pk, ck);
+
+                    String pkValue = Integer.toString(pk);
+                    UntypedResultSet rs;
+                    if (ck == 0)
+                    {
+                        readSimpleTable(legacyVersion, getCompactNameSuffix(compact),  pkValue);
+                        readSimpleCounterTable(legacyVersion, getCompactNameSuffix(compact), pkValue);
+                    }
+
+                    readClusteringTable(legacyVersion, getCompactNameSuffix(compact), ck, ckValue, pkValue);
+                    readClusteringCounterTable(legacyVersion, getCompactNameSuffix(compact), ckValue, pkValue);
+                }
+            }
+        }
+    }
+
+    private static void readClusteringCounterTable(String legacyVersion, String compactSuffix, String ckValue, String pkValue)
+    {
+        logger.debug("Read legacy_{}_clust_counter{}", legacyVersion, compactSuffix);
+        UntypedResultSet rs;
+        rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_clust_counter%s WHERE pk=? AND ck=?", legacyVersion, compactSuffix), pkValue, ckValue);
+        Assert.assertNotNull(rs);
+        assertEquals(1, rs.size());
+        assertEquals(1L, rs.one().getLong("val"));
+    }
+
+    private static void readClusteringTable(String legacyVersion, String compactSuffix, int ck, String ckValue, String pkValue)
+    {
+        logger.debug("Read legacy_{}_clust{}", legacyVersion, compactSuffix);
+        UntypedResultSet rs;
+        rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_clust%s WHERE pk=? AND ck=?", legacyVersion, compactSuffix), pkValue, ckValue);
+        assertLegacyClustRows(1, rs);
+
+        String ckValue2 = Integer.toString(ck < 10 ? 40 : ck - 1) + longString;
+        String ckValue3 = Integer.toString(ck > 39 ? 10 : ck + 1) + longString;
+        rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_clust%s WHERE pk=? AND ck IN (?, ?, ?)", legacyVersion, compactSuffix), pkValue, ckValue, ckValue2, ckValue3);
+        assertLegacyClustRows(3, rs);
+    }
+
+    private static void readSimpleCounterTable(String legacyVersion, String compactSuffix, String pkValue)
+    {
+        logger.debug("Read legacy_{}_simple_counter{}", legacyVersion, compactSuffix);
+        UntypedResultSet rs;
+        rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_simple_counter%s WHERE pk=?", legacyVersion, compactSuffix), pkValue);
+        Assert.assertNotNull(rs);
+        assertEquals(1, rs.size());
+        assertEquals(1L, rs.one().getLong("val"));
+    }
+
+    private static void readSimpleTable(String legacyVersion, String compactSuffix, String pkValue)
+    {
+        logger.debug("Read simple: legacy_{}_simple{}", legacyVersion, compactSuffix);
+        UntypedResultSet rs;
+        rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_simple%s WHERE pk=?", legacyVersion, compactSuffix), pkValue);
+        Assert.assertNotNull(rs);
+        assertEquals(1, rs.size());
+        assertEquals("foo bar baz", rs.one().getString("val"));
+    }
+
+    private static void createKeyspace()
+    {
+        QueryProcessor.executeInternal("CREATE KEYSPACE legacy_tables WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}");
+    }
+
+    private static void createTables(String legacyVersion)
+    {
+        for (int i=0; i<=1; i++)
+        {
+            String compactSuffix = getCompactNameSuffix(i);
+            String tableSuffix = i == 0? "" : " WITH COMPACT STORAGE";
+            QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_simple%s (pk text PRIMARY KEY, val text)%s", legacyVersion, compactSuffix, tableSuffix));
+            QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_simple_counter%s (pk text PRIMARY KEY, val counter)%s", legacyVersion, compactSuffix, tableSuffix));
+            QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_clust%s (pk text, ck text, val text, PRIMARY KEY (pk, ck))%s", legacyVersion, compactSuffix, tableSuffix));
+            QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_clust_counter%s (pk text, ck text, val counter, PRIMARY KEY (pk, ck))%s", legacyVersion, compactSuffix, tableSuffix));
+        }
+    }
+
+    private static String getCompactNameSuffix(int i)
+    {
+        return i == 0? "" : "_compact";
+    }
+
+    private static void truncateTables(String legacyVersion)
+    {
+        for (int compact = 0; compact <= 1; compact++)
+        {
+            QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_simple%s", legacyVersion, getCompactNameSuffix(compact)));
+            QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_simple_counter%s", legacyVersion, getCompactNameSuffix(compact)));
+            QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_clust%s", legacyVersion, getCompactNameSuffix(compact)));
+            QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_clust_counter%s", legacyVersion, getCompactNameSuffix(compact)));
+        }
+        CacheService.instance.invalidateCounterCache();
+        CacheService.instance.invalidateKeyCache();
+    }
+
+    private static void assertLegacyClustRows(int count, UntypedResultSet rs)
+    {
+        Assert.assertNotNull(rs);
+        assertEquals(count, rs.size());
+        for (int i = 0; i < count; i++)
+        {
+            for (UntypedResultSet.Row r : rs)
+            {
+                assertEquals(128, r.getString("val").length());
+            }
+        }
+    }
+
+    private static void loadLegacyTable(String tablePattern, String legacyVersion, String compactSuffix) throws IOException
+    {
+        String table = String.format(tablePattern, legacyVersion, compactSuffix);
+
+        logger.info("Loading legacy table {}", table);
+
+        ColumnFamilyStore cfs = Keyspace.open("legacy_tables").getColumnFamilyStore(table);
+
+        for (File cfDir : cfs.getDirectories().getCFDirectories())
+        {
+            copySstablesToTestData(legacyVersion, table, cfDir);
+        }
+
+        cfs.loadNewSSTables();
+    }
+
+
+    /**
+     * Test for CASSANDRA-15778
+     */
     @Test
-    public void testVersions() throws Throwable
-    {
-        boolean notSkipped = false;
-
-        for (File version : LEGACY_SSTABLE_ROOT.listFiles())
-        {
-            if (Version.validate(version.getName()) && SSTableFormat.Type.LEGACY.info.getVersion(version.getName()).isCompatible())
-            {
-                notSkipped = true;
-                testVersion(version.getName());
-            }
-        }
-
-        assert notSkipped;
+    public void testReadLegacyCqlCreatedTableWithBytes() throws Exception {
+        String table = "legacy_ka_cql_created_dense_table_with_bytes";
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables." + table + " (" +
+                                       " k int," +
+                                       " v text," +
+                                       " PRIMARY KEY(k, v)) WITH COMPACT STORAGE");
+        loadLegacyTable("legacy_%s_cql_created_dense_table_with_bytes%s", "ka", "");
+        QueryProcessor.executeInternal("ALTER TABLE legacy_tables." + table + " ALTER value TYPE 'org.apache.cassandra.db.marshal.BytesType';");
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM legacy_tables." + table);
+        Assert.assertNotNull(rs);
+        assertEquals(1, rs.size());
+        assertEquals(ByteBufferUtil.bytes("byte string"), rs.one().getBytes("value"));
     }
 
-    public void testVersion(String version) throws Throwable
-    {
-        try
-        {
-            SSTableReader reader = SSTableReader.open(getDescriptor(version));
-            CellNameType type = reader.metadata.comparator;
-            for (String keystring : TEST_DATA)
-            {
-                ByteBuffer key = ByteBufferUtil.bytes(keystring);
-                // confirm that the bloom filter does not reject any keys/names
-                DecoratedKey dk = reader.partitioner.decorateKey(key);
-                OnDiskAtomIterator iter = reader.iterator(dk, FBUtilities.singleton(Util.cellname(key), type));
-                assert iter.next().name().toByteBuffer().equals(key);
-            }
+    /**
+     * Test for CASSANDRA-15778
+     */
+    @Test
+    public void testReadLegacyCqlCreatedTableWithInt() throws Exception {
+        String table = "legacy_ka_cql_created_dense_table_with_int";
+        QueryProcessor.executeInternal("CREATE TABLE legacy_tables." + table + " (" +
+                                       " k int," +
+                                       " v text," +
+                                       " PRIMARY KEY(k, v)) WITH COMPACT STORAGE");
+        loadLegacyTable("legacy_%s_cql_created_dense_table_with_int%s", "ka", "");
+        QueryProcessor.executeInternal("ALTER TABLE legacy_tables." + table + " ALTER value TYPE 'org.apache.cassandra.db.marshal.BytesType';");
+        UntypedResultSet rs = QueryProcessor.executeInternal("SELECT * FROM legacy_tables." + table);
+        Assert.assertNotNull(rs);
+        assertEquals(1, rs.size());
+        assertEquals(ByteBufferUtil.bytes(0xaabbcc), rs.one().getBytes("value"));
+    }
 
-            // TODO actually test some reads
-        }
-        catch (Throwable e)
+    /**
+     * Generates sstables for 8 CQL tables (see {@link #createTables(String)}) in <i>current</i>
+     * sstable format (version) into {@code test/data/legacy-sstables/VERSION}, where
+     * {@code VERSION} matches {@link Version#getVersion() BigFormat.latestVersion.getVersion()}.
+     * <p>
+     * Run this test alone (e.g. from your IDE) when a new version is introduced or format changed
+     * during development. I.e. remove the {@code @Ignore} annotation temporarily.
+     * </p>
+     */
+    @Ignore
+    @Test
+    public void testGenerateSstables() throws Throwable
+    {
+        Random rand = new Random();
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 128; i++)
         {
-            System.err.println("Failed to read " + version);
-            throw e;
+            sb.append((char)('a' + rand.nextInt(26)));
+        }
+        String randomString = sb.toString();
+
+        for (int compact = 0; compact <= 1; compact++)
+        {
+            for (int pk = 0; pk < 5; pk++)
+            {
+                String valPk = Integer.toString(pk);
+                QueryProcessor.executeInternal(String.format("INSERT INTO legacy_tables.legacy_%s_simple%s (pk, val) VALUES ('%s', '%s')",
+                                                             BigFormat.latestVersion, getCompactNameSuffix(compact), valPk, "foo bar baz"));
+
+                QueryProcessor.executeInternal(String.format("UPDATE legacy_tables.legacy_%s_simple_counter%s SET val = val + 1 WHERE pk = '%s'",
+                                                             BigFormat.latestVersion, getCompactNameSuffix(compact), valPk));
+
+                for (int ck = 0; ck < 50; ck++)
+                {
+                    String valCk = Integer.toString(ck);
+
+                    QueryProcessor.executeInternal(String.format("INSERT INTO legacy_tables.legacy_%s_clust%s (pk, ck, val) VALUES ('%s', '%s', '%s')",
+                                                                 BigFormat.latestVersion, getCompactNameSuffix(compact), valPk, valCk + longString, randomString));
+
+                    QueryProcessor.executeInternal(String.format("UPDATE legacy_tables.legacy_%s_clust_counter%s SET val = val + 1 WHERE pk = '%s' AND ck='%s'",
+                                                                 BigFormat.latestVersion, getCompactNameSuffix(compact), valPk, valCk + longString));
+
+                }
+            }
+        }
+
+        StorageService.instance.forceKeyspaceFlush("legacy_tables");
+
+        File ksDir = new File(LEGACY_SSTABLE_ROOT, String.format("%s/legacy_tables", BigFormat.latestVersion));
+        ksDir.mkdirs();
+        for (int compact = 0; compact <= 1; compact++)
+        {
+            copySstablesFromTestData(String.format("legacy_%s_simple%s", BigFormat.latestVersion, getCompactNameSuffix(compact)), ksDir);
+            copySstablesFromTestData(String.format("legacy_%s_simple_counter%s", BigFormat.latestVersion, getCompactNameSuffix(compact)), ksDir);
+            copySstablesFromTestData(String.format("legacy_%s_clust%s", BigFormat.latestVersion, getCompactNameSuffix(compact)), ksDir);
+            copySstablesFromTestData(String.format("legacy_%s_clust_counter%s", BigFormat.latestVersion, getCompactNameSuffix(compact)), ksDir);
+        }
+    }
+
+    public static void copySstablesFromTestData(String table, File ksDir) throws IOException
+    {
+        File cfDir = new File(ksDir, table);
+        cfDir.mkdir();
+
+        for (File srcDir : Keyspace.open("legacy_tables").getColumnFamilyStore(table).getDirectories().getCFDirectories())
+        {
+            for (File file : srcDir.listFiles())
+            {
+                copyFile(cfDir, file);
+            }
+        }
+    }
+
+    private static void copySstablesToTestData(String legacyVersion, String table, File cfDir) throws IOException
+    {
+        for (File file : getTableDir(legacyVersion, table).listFiles())
+        {
+            copyFile(cfDir, file);
+        }
+    }
+
+    private static File getTableDir(String legacyVersion, String table)
+    {
+        return new File(LEGACY_SSTABLE_ROOT, String.format("%s/legacy_tables/%s", legacyVersion, table));
+    }
+
+    private static void copyFile(File cfDir, File file) throws IOException
+    {
+        byte[] buf = new byte[65536];
+        if (file.isFile())
+        {
+            File target = new File(cfDir, file.getName());
+            int rd;
+            FileInputStream is = new FileInputStream(file);
+            FileOutputStream os = new FileOutputStream(target);
+            while ((rd = is.read(buf)) >= 0)
+                os.write(buf, 0, rd);
         }
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java
new file mode 100644
index 0000000..451af25
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java

@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.*;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.*;
+import org.apache.cassandra.cache.*;
+import org.apache.cassandra.config.*;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.schema.*;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class SSTableCorruptionDetectionTest extends SSTableWriterTestBase
+{
+    private static final Logger logger = LoggerFactory.getLogger(SSTableCorruptionDetectionTest.class);
+
+    private static final int numberOfPks = 1000;
+    private static final int numberOfRuns = 100;
+    private static final int valueSize = 512 * 1024;
+    // Set corruption size larger or in comparable size to value size, otherwise
+    // chance for corruption to land in the middle of value is quite high.
+    private static final int maxCorruptionSize = 2 * 1024 * 1024;
+
+    private static final String keyspace = "SSTableCorruptionDetectionTest";
+    private static final String table = "corrupted_table";
+
+    private static int maxValueSize;
+    private static Random random;
+    private static SSTableWriter writer;
+    private static LifecycleTransaction txn;
+    private static ColumnFamilyStore cfs;
+    private static SSTableReader ssTableReader;
+
+    @BeforeClass
+    public static void setUp()
+    {
+        CFMetaData cfm = CFMetaData.Builder.create(keyspace, table)
+                                           .addPartitionKey("pk", AsciiType.instance)
+                                           .addClusteringColumn("ck1", AsciiType.instance)
+                                           .addClusteringColumn("ck2", AsciiType.instance)
+                                           .addRegularColumn("reg1", BytesType.instance)
+                                           .addRegularColumn("reg2", BytesType.instance)
+                                           .build();
+
+        cfm.compression(CompressionParams.noCompression());
+        SchemaLoader.createKeyspace(keyspace,
+                                    KeyspaceParams.simple(1),
+                                    cfm);
+
+        cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+        cfs.disableAutoCompaction();
+
+        maxValueSize = DatabaseDescriptor.getMaxValueSize();
+        DatabaseDescriptor.setMaxValueSize(1024 * 1024);
+
+        long seed = System.nanoTime();
+        logger.info("Seed {}", seed);
+        random = new Random(seed);
+
+        truncate(cfs);
+        File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        txn = LifecycleTransaction.offline(OperationType.WRITE);
+
+        // Setting up/writing large values is an expensive operation, we only want to do it once per run
+        writer = getWriter(cfs, dir, txn);
+        for (int i = 0; i < numberOfPks; i++)
+        {
+            UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, String.format("pkvalue_%07d", i)).withTimestamp(1);
+            byte[] reg1 = new byte[valueSize];
+            random.nextBytes(reg1);
+            byte[] reg2 = new byte[valueSize];
+            random.nextBytes(reg2);
+            builder.newRow("clustering_" + i, "clustering_" + (i + 1))
+                   .add("reg1", ByteBuffer.wrap(reg1))
+                   .add("reg2", ByteBuffer.wrap(reg2));
+            writer.append(builder.build().unfilteredIterator());
+        }
+        cfs.forceBlockingFlush();
+
+        ssTableReader = writer.finish(true);
+        txn.update(ssTableReader, false);
+        LifecycleTransaction.waitForDeletions();
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.setMaxValueSize(maxValueSize);
+
+        txn.abort();
+        writer.close();
+    }
+
+    @Test
+    public void testSinglePartitionIterator() throws Throwable
+    {
+        bruteForceCorruptionTest(ssTableReader, partitionIterator());
+    }
+
+    @Test
+    public void testSSTableScanner() throws Throwable
+    {
+        bruteForceCorruptionTest(ssTableReader, sstableScanner());
+    }
+
+    private void bruteForceCorruptionTest(SSTableReader ssTableReader, Consumer<SSTableReader> walker) throws Throwable
+    {
+        RandomAccessFile raf = new RandomAccessFile(ssTableReader.getFilename(), "rw");
+
+        int corruptedCounter = 0;
+
+        int fileLength = (int)raf.length(); // in current test, it does fit into int
+        for (int i = 0; i < numberOfRuns; i++)
+        {
+            final int corruptionPosition = random.nextInt(fileLength - 1); //ensure at least one byte will be corrupted
+            // corrupt max from position to end of file
+            final int corruptionSize = Math.min(maxCorruptionSize, random.nextInt(fileLength - corruptionPosition));
+
+            byte[] backup = corruptSstable(raf, corruptionPosition, corruptionSize);
+
+            try
+            {
+                walker.accept(ssTableReader);
+            }
+            catch (CorruptSSTableException t)
+            {
+                corruptedCounter++;
+            }
+            finally
+            {
+                restore(raf, corruptionPosition, backup);
+            }
+        }
+
+        assertTrue(corruptedCounter > 0);
+        FileUtils.closeQuietly(raf);
+    }
+
+    private Consumer<SSTableReader> sstableScanner()
+    {
+        return (SSTableReader sstable) -> {
+            try (ISSTableScanner scanner = sstable.getScanner())
+            {
+                while (scanner.hasNext())
+                {
+                    try (UnfilteredRowIterator rowIter = scanner.next())
+                    {
+                        if (rowIter.hasNext())
+                        {
+                            Unfiltered unfiltered = rowIter.next();
+                            if (unfiltered.isRow())
+                            {
+                                Row row = (Row) unfiltered;
+                                assertEquals(2, row.clustering().size());
+                                // no-op read
+                            }
+                        }
+                    }
+
+                }
+            }
+        };
+    }
+
+    private Consumer<SSTableReader> partitionIterator()
+    {
+        return (SSTableReader sstable) -> {
+            for (int i = 0; i < numberOfPks; i++)
+            {
+                DecoratedKey dk = Util.dk(String.format("pkvalue_%07d", i));
+                try (UnfilteredRowIterator rowIter = sstable.iterator(dk,
+                                                                      ColumnFilter.all(cfs.metadata),
+                                                                      false,
+                                                                      false,
+                                                                      SSTableReadsListener.NOOP_LISTENER))
+                {
+                    while (rowIter.hasNext())
+                    {
+                        Unfiltered unfiltered = rowIter.next();
+                        if (unfiltered.isRow())
+                        {
+                            Row row = (Row) unfiltered;
+                            assertEquals(2, row.clustering().size());
+                            // no-op read
+                        }
+                    }
+                    rowIter.close();
+                }
+            }
+        };
+    }
+
+    private byte[] corruptSstable(RandomAccessFile raf, int position, int corruptionSize) throws IOException
+    {
+        byte[] backup = new byte[corruptionSize];
+        raf.seek(position);
+        raf.read(backup);
+
+        raf.seek(position);
+        byte[] corruption = new byte[corruptionSize];
+        random.nextBytes(corruption);
+        raf.write(corruption);
+
+        return backup;
+    }
+
+    private void restore(RandomAccessFile raf, int position, byte[] backup) throws IOException
+    {
+        raf.seek(position);
+        raf.write(backup);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
index 4a51fbd..ad7523d 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java

@@ -18,92 +18,209 @@
 package org.apache.cassandra.io.sstable;
 
 import java.io.File;
+import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.CountDownLatch;
 
 import com.google.common.io.Files;
+
+import org.junit.After;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Row;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.streaming.StreamEvent;
+import org.apache.cassandra.streaming.StreamEventHandler;
+import org.apache.cassandra.streaming.StreamState;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.OutputHandler;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class SSTableLoaderTest
 {
     public static final String KEYSPACE1 = "SSTableLoaderTest";
-    public static final String CF_STANDARD = "Standard1";
+    public static final String CF_STANDARD1 = "Standard1";
+    public static final String CF_STANDARD2 = "Standard2";
+
+    private File tmpdir;
 
     @BeforeClass
     public static void defineSchema() throws Exception
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
-        setup();
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
+
+        StorageService.instance.initServer();
     }
 
-    public static void setup() throws Exception
+    @Before
+    public void setup() throws Exception
     {
-        StorageService.instance.initServer();
+        tmpdir = Files.createTempDir();
+    }
+
+    @After
+    public void cleanup()
+    {
+        try {
+            FileUtils.deleteRecursive(tmpdir);
+        } catch (FSWriteError e) {
+            /**
+             * Windows does not allow a mapped file to be deleted, so we probably forgot to clean the buffers somewhere.
+             * We force a GC here to force buffer deallocation, and then try deleting the directory again.
+             * For more information, see: http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4715154
+             * If this is not the problem, the exception will be rethrown anyway.
+             */
+            System.gc();
+            FileUtils.deleteRecursive(tmpdir);
+        }
+    }
+
+    private static final class TestClient extends SSTableLoader.Client
+    {
+        private String keyspace;
+
+        public void init(String keyspace)
+        {
+            this.keyspace = keyspace;
+            for (Range<Token> range : StorageService.instance.getLocalRanges(KEYSPACE1))
+                addRangeForEndpoint(range, FBUtilities.getBroadcastAddress());
+        }
+
+        public CFMetaData getTableMetadata(String tableName)
+        {
+            return Schema.instance.getCFMetaData(keyspace, tableName);
+        }
     }
 
     @Test
     public void testLoadingSSTable() throws Exception
     {
-        File tempdir = Files.createTempDir();
-        File dataDir = new File(tempdir.getAbsolutePath() + File.separator + KEYSPACE1 + File.separator + CF_STANDARD);
+        File dataDir = new File(tmpdir.getAbsolutePath() + File.separator + KEYSPACE1 + File.separator + CF_STANDARD1);
         assert dataDir.mkdirs();
-        CFMetaData cfmeta = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD);
-        DecoratedKey key = Util.dk("key1");
-        
-        try (SSTableSimpleUnsortedWriter writer = new SSTableSimpleUnsortedWriter(dataDir,
-                                                                             cfmeta,
-                                                                             StorageService.getPartitioner(),
-                                                                             1))
+        CFMetaData cfmeta = Schema.instance.getCFMetaData(KEYSPACE1, CF_STANDARD1);
+
+        String schema = "CREATE TABLE %s.%s (key ascii, name ascii, val ascii, val1 ascii, PRIMARY KEY (key, name))";
+        String query = "INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)";
+
+        try (CQLSSTableWriter writer = CQLSSTableWriter.builder()
+                                                       .inDirectory(dataDir)
+                                                       .forTable(String.format(schema, KEYSPACE1, CF_STANDARD1))
+                                                       .using(String.format(query, KEYSPACE1, CF_STANDARD1))
+                                                       .build())
         {
-            writer.newRow(key.getKey());
-            writer.addColumn(ByteBufferUtil.bytes("col1"), ByteBufferUtil.bytes(100), 1);
+            writer.addRow("key1", "col1", "100");
         }
 
-        SSTableLoader loader = new SSTableLoader(dataDir, new SSTableLoader.Client()
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+        cfs.forceBlockingFlush(); // wait for sstables to be on disk else we won't be able to stream them
+
+        final CountDownLatch latch = new CountDownLatch(1);
+        SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false));
+        loader.stream(Collections.emptySet(), completionStreamListener(latch)).get();
+
+        List<FilteredPartition> partitions = Util.getAll(Util.cmd(cfs).build());
+
+        assertEquals(1, partitions.size());
+        assertEquals("key1", AsciiType.instance.getString(partitions.get(0).partitionKey().getKey()));
+        assertEquals(ByteBufferUtil.bytes("100"), partitions.get(0).getRow(new Clustering(ByteBufferUtil.bytes("col1")))
+                                                                   .getCell(cfmeta.getColumnDefinition(ByteBufferUtil.bytes("val")))
+                                                                   .value());
+
+        // The stream future is signalled when the work is complete but before releasing references. Wait for release
+        // before cleanup (CASSANDRA-10118).
+        latch.await();
+    }
+
+    @Test
+    public void testLoadingIncompleteSSTable() throws Exception
+    {
+        File dataDir = new File(tmpdir.getAbsolutePath() + File.separator + KEYSPACE1 + File.separator + CF_STANDARD2);
+        assert dataDir.mkdirs();
+
+        //make sure we have no tables...
+        assertTrue(dataDir.listFiles().length == 0);
+
+        String schema = "CREATE TABLE %s.%s (key ascii, name ascii, val ascii, val1 ascii, PRIMARY KEY (key, name))";
+        String query = "INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)";
+
+        CQLSSTableWriter writer = CQLSSTableWriter.builder()
+                                                  .inDirectory(dataDir)
+                                                  .forTable(String.format(schema, KEYSPACE1, CF_STANDARD2))
+                                                  .using(String.format(query, KEYSPACE1, CF_STANDARD2))
+                                                  .withBufferSizeInMB(1)
+                                                  .build();
+
+        int NB_PARTITIONS = 5000; // Enough to write >1MB and get at least one completed sstable before we've closed the writer
+
+        for (int i = 0; i < NB_PARTITIONS; i++)
         {
-            private String keyspace;
+            for (int j = 0; j < 100; j++)
+                writer.addRow(String.format("key%d", i), String.format("col%d", j), "100");
+        }
 
-            public void init(String keyspace)
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2);
+        cfs.forceBlockingFlush(); // wait for sstables to be on disk else we won't be able to stream them
+
+        //make sure we have some tables...
+        assertTrue(dataDir.listFiles().length > 0);
+
+        final CountDownLatch latch = new CountDownLatch(2);
+        //writer is still open so loader should not load anything
+        SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false));
+        loader.stream(Collections.emptySet(), completionStreamListener(latch)).get();
+
+        List<FilteredPartition> partitions = Util.getAll(Util.cmd(cfs).build());
+
+        assertTrue(partitions.size() > 0 && partitions.size() < NB_PARTITIONS);
+
+        // now we complete the write and the second loader should load the last sstable as well
+        writer.close();
+
+        loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false));
+        loader.stream(Collections.emptySet(), completionStreamListener(latch)).get();
+
+        partitions = Util.getAll(Util.cmd(Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2)).build());
+        assertEquals(NB_PARTITIONS, partitions.size());
+
+        // The stream future is signalled when the work is complete but before releasing references. Wait for release
+        // before cleanup (CASSANDRA-10118).
+        latch.await();
+    }
+
+    StreamEventHandler completionStreamListener(final CountDownLatch latch)
+    {
+        return new StreamEventHandler() {
+            public void onFailure(Throwable arg0)
             {
-                this.keyspace = keyspace;
-                for (Range<Token> range : StorageService.instance.getLocalRanges(KEYSPACE1))
-                    addRangeForEndpoint(range, FBUtilities.getBroadcastAddress());
-                setPartitioner(StorageService.getPartitioner());
+                latch.countDown();
             }
 
-            public CFMetaData getTableMetadata(String tableName)
+            public void onSuccess(StreamState arg0)
             {
-                return Schema.instance.getCFMetaData(keyspace, tableName);
+                latch.countDown();
             }
-        }, new OutputHandler.SystemOutput(false, false));
 
-        loader.stream().get();
-
-        List<Row> rows = Util.getRangeSlice(Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD));
-        assertEquals(1, rows.size());
-        assertEquals(key, rows.get(0).key);
-        assertEquals(ByteBufferUtil.bytes(100), rows.get(0).cf.getColumn(Util.cellname("col1")).value());
+            public void handleStreamEvent(StreamEvent event) {}
+        };
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
index 755225e..f39cf3b 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java

@@ -1,5 +1,4 @@
 /*
- * 
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -7,48 +6,42 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.apache.cassandra.io.sstable;
 
-import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.concurrent.ExecutionException;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.CounterId;
 
+import static org.apache.cassandra.Util.getBytes;
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-import static org.apache.cassandra.Util.cellname;
-
 public class SSTableMetadataTest
 {
     public static final String KEYSPACE1 = "SSTableMetadataTest";
@@ -61,16 +54,18 @@
     @BeforeClass
     public static void defineSchema() throws Exception
     {
-        AbstractType<?> compositeMaxMin = CompositeType.getInstance(Arrays.asList(new AbstractType<?>[]{BytesType.instance, IntegerType.instance}));
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD3),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDCOMPOSITE2, compositeMaxMin),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER1).defaultValidator(CounterColumnType.instance));
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_STANDARDCOMPOSITE2)
+                                                      .addPartitionKey("key", AsciiType.instance)
+                                                      .addClusteringColumn("name", AsciiType.instance)
+                                                      .addClusteringColumn("int", IntegerType.instance)
+                                                      .addRegularColumn("val", AsciiType.instance).build(),
+                                    SchemaLoader.counterCFMD(KEYSPACE1, CF_COUNTER1));
     }
 
     @Test
@@ -82,40 +77,44 @@
         for(int i = 0; i < 10; i++)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
             for (int j = 0; j < 10; j++)
-                rm.add("Standard1", cellname(Integer.toString(j)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       10 + j);
-            rm.applyUnsafe();
+                new RowUpdateBuilder(store.metadata, timestamp, 10 + j, Integer.toString(i))
+                    .clustering(Integer.toString(j))
+                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
+
         }
-        Mutation rm = new Mutation(KEYSPACE1, Util.dk("longttl").getKey());
-        rm.add("Standard1", cellname("col"),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               timestamp,
-               10000);
-        rm.applyUnsafe();
+
+        new RowUpdateBuilder(store.metadata, timestamp, 10000, "longttl")
+            .clustering("col")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+
+
         store.forceBlockingFlush();
-        assertEquals(1, store.getSSTables().size());
+        assertEquals(1, store.getLiveSSTables().size());
         int ttltimestamp = (int)(System.currentTimeMillis()/1000);
         int firstDelTime = 0;
-        for(SSTableReader sstable : store.getSSTables())
+        for(SSTableReader sstable : store.getLiveSSTables())
         {
             firstDelTime = sstable.getSSTableMetadata().maxLocalDeletionTime;
             assertEquals(ttltimestamp + 10000, firstDelTime, 10);
 
         }
-        rm = new Mutation(KEYSPACE1, Util.dk("longttl2").getKey());
-        rm.add("Standard1", cellname("col"),
-               ByteBufferUtil.EMPTY_BYTE_BUFFER,
-               timestamp,
-               20000);
-        rm.applyUnsafe();
+
+        new RowUpdateBuilder(store.metadata, timestamp, 20000, "longttl2")
+        .clustering("col")
+        .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+        .build()
+        .applyUnsafe();
+
+
         ttltimestamp = (int) (System.currentTimeMillis()/1000);
         store.forceBlockingFlush();
-        assertEquals(2, store.getSSTables().size());
-        List<SSTableReader> sstables = new ArrayList<>(store.getSSTables());
+        assertEquals(2, store.getLiveSSTables().size());
+        List<SSTableReader> sstables = new ArrayList<>(store.getLiveSSTables());
         if(sstables.get(0).getSSTableMetadata().maxLocalDeletionTime < sstables.get(1).getSSTableMetadata().maxLocalDeletionTime)
         {
             assertEquals(sstables.get(0).getSSTableMetadata().maxLocalDeletionTime, firstDelTime);
@@ -127,9 +126,9 @@
             assertEquals(sstables.get(0).getSSTableMetadata().maxLocalDeletionTime, ttltimestamp + 20000, 10);
         }
 
-        Util.compact(store, store.getSSTables());
-        assertEquals(1, store.getSSTables().size());
-        for(SSTableReader sstable : store.getSSTables())
+        Util.compact(store, store.getLiveSSTables());
+        assertEquals(1, store.getLiveSSTables().size());
+        for(SSTableReader sstable : store.getLiveSSTables())
         {
             assertEquals(sstable.getSSTableMetadata().maxLocalDeletionTime, ttltimestamp + 20000, 10);
         }
@@ -153,33 +152,36 @@
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
         long timestamp = System.currentTimeMillis();
         DecoratedKey key = Util.dk("deletetest");
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
         for (int i = 0; i<5; i++)
-            rm.add("Standard2", cellname("deletecolumn" + i),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       100);
-        rm.add("Standard2", cellname("todelete"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                   timestamp,
-                   1000);
-        rm.applyUnsafe();
+            new RowUpdateBuilder(store.metadata, timestamp, 100, "deletetest")
+                .clustering("deletecolumn" + i)
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .applyUnsafe();
+
+
+        new RowUpdateBuilder(store.metadata, timestamp, 1000, "deletetest")
+        .clustering("todelete")
+        .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+        .build()
+        .applyUnsafe();
+
         store.forceBlockingFlush();
-        assertEquals(1,store.getSSTables().size());
+        assertEquals(1,store.getLiveSSTables().size());
         int ttltimestamp = (int) (System.currentTimeMillis()/1000);
         int firstMaxDelTime = 0;
-        for(SSTableReader sstable : store.getSSTables())
+        for(SSTableReader sstable : store.getLiveSSTables())
         {
             firstMaxDelTime = sstable.getSSTableMetadata().maxLocalDeletionTime;
             assertEquals(ttltimestamp + 1000, firstMaxDelTime, 10);
         }
-        rm = new Mutation(KEYSPACE1, key.getKey());
-        rm.delete("Standard2", cellname("todelete"), timestamp + 1);
-        rm.applyUnsafe();
+
+        RowUpdateBuilder.deleteRow(store.metadata, timestamp + 1, "deletetest", "todelete").applyUnsafe();
+
         store.forceBlockingFlush();
-        assertEquals(2,store.getSSTables().size());
+        assertEquals(2,store.getLiveSSTables().size());
         boolean foundDelete = false;
-        for(SSTableReader sstable : store.getSSTables())
+        for(SSTableReader sstable : store.getLiveSSTables())
         {
             if(sstable.getSSTableMetadata().maxLocalDeletionTime != firstMaxDelTime)
             {
@@ -188,9 +190,9 @@
             }
         }
         assertTrue(foundDelete);
-        Util.compact(store, store.getSSTables());
-        assertEquals(1,store.getSSTables().size());
-        for(SSTableReader sstable : store.getSSTables())
+        Util.compact(store, store.getLiveSSTables());
+        assertEquals(1,store.getLiveSSTables().size());
+        for(SSTableReader sstable : store.getLiveSSTables())
         {
             assertEquals(ttltimestamp + 100, sstable.getSSTableMetadata().maxLocalDeletionTime, 10);
         }
@@ -201,93 +203,53 @@
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard3");
-        store.getCompactionStrategy();
         for (int j = 0; j < 8; j++)
         {
-            DecoratedKey key = Util.dk("row"+j);
-            Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+            String key = "row" + j;
             for (int i = 100; i<150; i++)
             {
-                rm.add("Standard3", cellname(j + "col" + i), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
+                new RowUpdateBuilder(store.metadata, System.currentTimeMillis(), key)
+                    .clustering(j + "col" + i)
+                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .applyUnsafe();
             }
-            rm.applyUnsafe();
         }
         store.forceBlockingFlush();
-        assertEquals(1, store.getSSTables().size());
-        for (SSTableReader sstable : store.getSSTables())
+        assertEquals(1, store.getLiveSSTables().size());
+        for (SSTableReader sstable : store.getLiveSSTables())
         {
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().minColumnNames.get(0)), "0col100");
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxColumnNames.get(0)), "7col149");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().minClusteringValues.get(0)), "0col100");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxClusteringValues.get(0)), "7col149");
+            // make sure the clustering values are minimised
+            assertTrue(sstable.getSSTableMetadata().minClusteringValues.get(0).capacity() < 50);
+            assertTrue(sstable.getSSTableMetadata().maxClusteringValues.get(0).capacity() < 50);
         }
-        DecoratedKey key = Util.dk("row2");
-        Mutation rm = new Mutation(KEYSPACE1, key.getKey());
+        String key = "row2";
+
         for (int i = 101; i<299; i++)
         {
-            rm.add("Standard3", cellname(9 + "col" + i), ByteBufferUtil.EMPTY_BYTE_BUFFER, System.currentTimeMillis());
+            new RowUpdateBuilder(store.metadata, System.currentTimeMillis(), key)
+            .clustering(9 + "col" + i)
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
         }
-        rm.applyUnsafe();
 
         store.forceBlockingFlush();
         store.forceMajorCompaction();
-        assertEquals(1, store.getSSTables().size());
-        for (SSTableReader sstable : store.getSSTables())
+        assertEquals(1, store.getLiveSSTables().size());
+        for (SSTableReader sstable : store.getLiveSSTables())
         {
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().minColumnNames.get(0)), "0col100");
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxColumnNames.get(0)), "9col298");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().minClusteringValues.get(0)), "0col100");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxClusteringValues.get(0)), "9col298");
+            // and make sure the clustering values are still minimised after compaction
+            assertTrue(sstable.getSSTableMetadata().minClusteringValues.get(0).capacity() < 50);
+            assertTrue(sstable.getSSTableMetadata().maxClusteringValues.get(0).capacity() < 50);
         }
     }
 
-    @Test
-    public void testMaxMinComposites() throws CharacterCodingException, ExecutionException, InterruptedException
-    {
-        /*
-        creates two sstables, columns like this:
-        ---------------------
-        k   |a0:9|a1:8|..|a9:0
-        ---------------------
-        and
-        ---------------------
-        k2  |b0:9|b1:8|..|b9:0
-        ---------------------
-        meaning max columns are b9 and 9, min is a0 and 0
-         */
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("StandardComposite2");
-
-        CellNameType type = cfs.getComparator();
-
-        ByteBuffer key = ByteBufferUtil.bytes("k");
-        for (int i = 0; i < 10; i++)
-        {
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            CellName colName = type.makeCellName(ByteBufferUtil.bytes("a"+(9-i)), ByteBufferUtil.bytes(i));
-            rm.add("StandardComposite2", colName, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-            rm.applyUnsafe();
-        }
-        cfs.forceBlockingFlush();
-
-        key = ByteBufferUtil.bytes("k2");
-        for (int i = 0; i < 10; i++)
-        {
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            CellName colName = type.makeCellName(ByteBufferUtil.bytes("b"+(9-i)), ByteBufferUtil.bytes(i));
-            rm.add("StandardComposite2", colName, ByteBufferUtil.EMPTY_BYTE_BUFFER, 0);
-            rm.applyUnsafe();
-        }
-        cfs.forceBlockingFlush();
-        cfs.forceMajorCompaction();
-        assertEquals(cfs.getSSTables().size(), 1);
-        for (SSTableReader sstable : cfs.getSSTables())
-        {
-            assertEquals("b9", ByteBufferUtil.string(sstable.getSSTableMetadata().maxColumnNames.get(0)));
-            assertEquals(9, ByteBufferUtil.toInt(sstable.getSSTableMetadata().maxColumnNames.get(1)));
-            assertEquals("a0", ByteBufferUtil.string(sstable.getSSTableMetadata().minColumnNames.get(0)));
-            assertEquals(0, ByteBufferUtil.toInt(sstable.getSSTableMetadata().minColumnNames.get(1)));
-        }
-    }
-
-    @Test
+    /*@Test
     public void testLegacyCounterShardTracking()
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Counter1");
@@ -297,11 +259,12 @@
         state.writeGlobal(CounterId.fromInt(1), 1L, 1L);
         state.writeLocal(CounterId.fromInt(2), 1L, 1L);
         state.writeRemote(CounterId.fromInt(3), 1L, 1L);
+
         ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
         cfs.forceBlockingFlush();
-        assertTrue(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        assertTrue(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
 
         // A cell with global and remote shards
@@ -312,7 +275,7 @@
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
         cfs.forceBlockingFlush();
-        assertTrue(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        assertTrue(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
 
         // A cell with global and local shards
@@ -323,7 +286,7 @@
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
         cfs.forceBlockingFlush();
-        assertTrue(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        assertTrue(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
 
         // A cell with global only
@@ -333,7 +296,7 @@
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
         cfs.forceBlockingFlush();
-        assertFalse(cfs.getSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
+        assertFalse(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
-    }
+    } */
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
new file mode 100644
index 0000000..4ca6ec0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java

@@ -0,0 +1,706 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.*;
+import java.util.concurrent.*;
+
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner.LocalToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.MmappedRegions;
+import org.apache.cassandra.io.util.SegmentedFile;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FilterFactory;
+import org.apache.cassandra.utils.Pair;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(OrderedJUnit4ClassRunner.class)
+public class SSTableReaderTest
+{
+    public static final String KEYSPACE1 = "SSTableReaderTest";
+    public static final String CF_STANDARD = "Standard1";
+    public static final String CF_STANDARD2 = "Standard2";
+    public static final String CF_INDEXED = "Indexed1";
+    public static final String CF_STANDARDLOWINDEXINTERVAL = "StandardLowIndexInterval";
+
+    private IPartitioner partitioner;
+
+    Token t(int i)
+    {
+        return partitioner.getToken(ByteBufferUtil.bytes(String.valueOf(i)));
+    }
+
+    @BeforeClass
+    public static void defineSchema() throws Exception
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEXED, true),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL)
+                                                .minIndexInterval(8)
+                                                .maxIndexInterval(256)
+                                                .caching(CachingParams.CACHE_NOTHING));
+    }
+
+    @Test
+    public void testGetPositionsForRanges()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
+        partitioner = store.getPartitioner();
+
+        // insert data and compact to a single sstable
+        CompactionManager.instance.disableAutoCompaction();
+        for (int j = 0; j < 10; j++)
+        {
+            new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+                .clustering("0")
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .applyUnsafe();
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store, false);
+
+        List<Range<Token>> ranges = new ArrayList<Range<Token>>();
+        // 1 key
+        ranges.add(new Range<>(t(0), t(1)));
+        // 2 keys
+        ranges.add(new Range<>(t(2), t(4)));
+        // wrapping range from key to end
+        ranges.add(new Range<>(t(6), partitioner.getMinimumToken()));
+        // empty range (should be ignored)
+        ranges.add(new Range<>(t(9), t(91)));
+
+        // confirm that positions increase continuously
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+        long previous = -1;
+        for (Pair<Long,Long> section : sstable.getPositionsForRanges(ranges))
+        {
+            assert previous <= section.left : previous + " ! < " + section.left;
+            assert section.left < section.right : section.left + " ! < " + section.right;
+            previous = section.right;
+        }
+    }
+
+    @Test
+    public void testSpannedIndexPositions() throws IOException
+    {
+        int originalMaxSegmentSize = MmappedRegions.MAX_SEGMENT_SIZE;
+        MmappedRegions.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
+
+        try
+        {
+            Keyspace keyspace = Keyspace.open(KEYSPACE1);
+            ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
+            partitioner = store.getPartitioner();
+
+            // insert a bunch of data and compact to a single sstable
+            CompactionManager.instance.disableAutoCompaction();
+            for (int j = 0; j < 100; j += 2)
+            {
+                new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+                .clustering("0")
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .applyUnsafe();
+            }
+            store.forceBlockingFlush();
+            CompactionManager.instance.performMaximal(store, false);
+
+            // check that all our keys are found correctly
+            SSTableReader sstable = store.getLiveSSTables().iterator().next();
+            for (int j = 0; j < 100; j += 2)
+            {
+                DecoratedKey dk = Util.dk(String.valueOf(j));
+                FileDataInput file = sstable.getFileDataInput(sstable.getPosition(dk, SSTableReader.Operator.EQ).position);
+                DecoratedKey keyInDisk = sstable.decorateKey(ByteBufferUtil.readWithShortLength(file));
+                assert keyInDisk.equals(dk) : String.format("%s != %s in %s", keyInDisk, dk, file.getPath());
+            }
+
+            // check no false positives
+            for (int j = 1; j < 110; j += 2)
+            {
+                DecoratedKey dk = Util.dk(String.valueOf(j));
+                assert sstable.getPosition(dk, SSTableReader.Operator.EQ) == null;
+            }
+        }
+        finally
+        {
+            MmappedRegions.MAX_SEGMENT_SIZE = originalMaxSegmentSize;
+        }
+    }
+
+    @Test
+    public void testPersistentStatistics()
+    {
+
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
+        partitioner = store.getPartitioner();
+
+        for (int j = 0; j < 100; j += 2)
+        {
+            new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+            .clustering("0")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+        }
+        store.forceBlockingFlush();
+
+        clearAndLoad(store);
+        assert store.metric.maxPartitionSize.getValue() != 0;
+    }
+
+    private void clearAndLoad(ColumnFamilyStore cfs)
+    {
+        cfs.clearUnsafe();
+        cfs.loadNewSSTables();
+    }
+
+    @Test
+    public void testReadRateTracking()
+    {
+        // try to make sure CASSANDRA-8239 never happens again
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
+        partitioner = store.getPartitioner();
+
+        for (int j = 0; j < 10; j++)
+        {
+            new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+            .clustering("0")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+        }
+
+        store.forceBlockingFlush();
+
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+        assertEquals(0, sstable.getReadMeter().count());
+
+        DecoratedKey key = sstable.decorateKey(ByteBufferUtil.bytes("4"));
+        Util.getAll(Util.cmd(store, key).build());
+        assertEquals(1, sstable.getReadMeter().count());
+
+        Util.getAll(Util.cmd(store, key).includeRow("0").build());
+        assertEquals(2, sstable.getReadMeter().count());
+    }
+
+    @Test
+    public void testGetPositionsForRangesWithKeyCache()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
+        partitioner = store.getPartitioner();
+        CacheService.instance.keyCache.setCapacity(100);
+
+        // insert data and compact to a single sstable
+        CompactionManager.instance.disableAutoCompaction();
+        for (int j = 0; j < 10; j++)
+        {
+
+            new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+            .clustering("0")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store, false);
+
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+        long p2 = sstable.getPosition(k(2), SSTableReader.Operator.EQ).position;
+        long p3 = sstable.getPosition(k(3), SSTableReader.Operator.EQ).position;
+        long p6 = sstable.getPosition(k(6), SSTableReader.Operator.EQ).position;
+        long p7 = sstable.getPosition(k(7), SSTableReader.Operator.EQ).position;
+
+        Pair<Long, Long> p = sstable.getPositionsForRanges(makeRanges(t(2), t(6))).get(0);
+
+        // range are start exclusive so we should start at 3
+        assert p.left == p3;
+
+        // to capture 6 we have to stop at the start of 7
+        assert p.right == p7;
+    }
+
+    @Test
+    public void testPersistentStatisticsWithSecondaryIndex()
+    {
+        // Create secondary index and flush to disk
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_INDEXED);
+        partitioner = store.getPartitioner();
+
+        new RowUpdateBuilder(store.metadata, System.currentTimeMillis(), "k1")
+            .clustering("0")
+            .add("birthdate", 1L)
+            .build()
+            .applyUnsafe();
+
+        store.forceBlockingFlush();
+
+        // check if opening and querying works
+        assertIndexQueryWorks(store);
+    }
+    public void testGetPositionsKeyCacheStats()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
+        partitioner = store.getPartitioner();
+        CacheService.instance.keyCache.setCapacity(1000);
+
+        // insert data and compact to a single sstable
+        CompactionManager.instance.disableAutoCompaction();
+        for (int j = 0; j < 10; j++)
+        {
+            new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+            .clustering("0")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store, false);
+
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+        sstable.getPosition(k(2), SSTableReader.Operator.EQ);
+        assertEquals(0, sstable.getKeyCacheHit());
+        assertEquals(1, sstable.getBloomFilterTruePositiveCount());
+        sstable.getPosition(k(2), SSTableReader.Operator.EQ);
+        assertEquals(1, sstable.getKeyCacheHit());
+        assertEquals(2, sstable.getBloomFilterTruePositiveCount());
+        sstable.getPosition(k(15), SSTableReader.Operator.EQ);
+        assertEquals(1, sstable.getKeyCacheHit());
+        assertEquals(2, sstable.getBloomFilterTruePositiveCount());
+
+    }
+
+
+    @Test
+    public void testOpeningSSTable() throws Exception
+    {
+        String ks = KEYSPACE1;
+        String cf = "Standard1";
+
+        // clear and create just one sstable for this test
+        Keyspace keyspace = Keyspace.open(ks);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(cf);
+        store.clearUnsafe();
+        store.disableAutoCompaction();
+
+        DecoratedKey firstKey = null, lastKey = null;
+        long timestamp = System.currentTimeMillis();
+        for (int i = 0; i < store.metadata.params.minIndexInterval; i++)
+        {
+            DecoratedKey key = Util.dk(String.valueOf(i));
+            if (firstKey == null)
+                firstKey = key;
+            if (lastKey == null)
+                lastKey = key;
+            if (store.metadata.getKeyValidator().compare(lastKey.getKey(), key.getKey()) < 0)
+                lastKey = key;
+
+
+            new RowUpdateBuilder(store.metadata, timestamp, key.getKey())
+            .clustering("col")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+
+        }
+        store.forceBlockingFlush();
+
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+        Descriptor desc = sstable.descriptor;
+
+        // test to see if sstable can be opened as expected
+        SSTableReader target = SSTableReader.open(desc);
+        Assert.assertEquals(target.getIndexSummarySize(), 1);
+        Assert.assertArrayEquals(ByteBufferUtil.getArray(firstKey.getKey()), target.getIndexSummaryKey(0));
+        assert target.first.equals(firstKey);
+        assert target.last.equals(lastKey);
+
+        executeInternal(String.format("ALTER TABLE \"%s\".\"%s\" WITH bloom_filter_fp_chance = 0.3", ks, cf));
+
+        File summaryFile = new File(desc.filenameFor(Component.SUMMARY));
+        Path bloomPath = new File(desc.filenameFor(Component.FILTER)).toPath();
+        Path summaryPath = summaryFile.toPath();
+
+        long bloomModified = Files.getLastModifiedTime(bloomPath).toMillis();
+        long summaryModified = Files.getLastModifiedTime(summaryPath).toMillis();
+
+        TimeUnit.MILLISECONDS.sleep(1000); // sleep to ensure modified time will be different
+
+        // Offline tests
+        // check that bloomfilter/summary ARE NOT regenerated
+        target = SSTableReader.openNoValidation(desc, store.metadata);
+
+        assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
+        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+
+        target.selfRef().release();
+
+        // check that bloomfilter/summary ARE NOT regenerated and BF=AlwaysPresent when filter component is missing
+        Set<Component> components = SSTable.discoverComponentsFor(desc);
+        components.remove(Component.FILTER);
+        target = SSTableReader.openNoValidation(desc, components, store);
+
+        assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
+        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+        assertEquals(FilterFactory.AlwaysPresent, target.getBloomFilter());
+
+        target.selfRef().release();
+
+        // #### online tests ####
+        // check that summary & bloomfilter are not regenerated when SSTable is opened and BFFP has been changed
+        target = SSTableReader.open(desc, store.metadata);
+
+        assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
+        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+
+        target.selfRef().release();
+
+        // check that bloomfilter is recreated when it doesn't exist and this causes the summary to be recreated
+        components = SSTable.discoverComponentsFor(desc);
+        components.remove(Component.FILTER);
+
+        target = SSTableReader.open(desc, components, store.metadata);
+
+        assertTrue("Bloomfilter was not recreated", bloomModified < Files.getLastModifiedTime(bloomPath).toMillis());
+        assertTrue("Summary was not recreated", summaryModified < Files.getLastModifiedTime(summaryPath).toMillis());
+
+        target.selfRef().release();
+
+        // check that only the summary is regenerated when it is deleted
+        components.add(Component.FILTER);
+        summaryModified = Files.getLastModifiedTime(summaryPath).toMillis();
+        summaryFile.delete();
+
+        TimeUnit.MILLISECONDS.sleep(1000); // sleep to ensure modified time will be different
+        bloomModified = Files.getLastModifiedTime(bloomPath).toMillis();
+
+        target = SSTableReader.open(desc, components, store.metadata);
+
+        assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
+        assertTrue("Summary was not recreated", summaryModified < Files.getLastModifiedTime(summaryPath).toMillis());
+
+        target.selfRef().release();
+
+        // check that summary and bloomfilter is not recreated when the INDEX is missing
+        components.add(Component.SUMMARY);
+        components.remove(Component.PRIMARY_INDEX);
+
+        summaryModified = Files.getLastModifiedTime(summaryPath).toMillis();
+        target = SSTableReader.open(desc, components, store.metadata, false, false);
+
+        TimeUnit.MILLISECONDS.sleep(1000); // sleep to ensure modified time will be different
+        assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
+        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+
+        target.selfRef().release();
+    }
+
+    @Test
+    public void testLoadingSummaryUsesCorrectPartitioner() throws Exception
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Indexed1");
+
+        new RowUpdateBuilder(store.metadata, System.currentTimeMillis(), "k1")
+        .clustering("0")
+        .add("birthdate", 1L)
+        .build()
+        .applyUnsafe();
+
+        store.forceBlockingFlush();
+
+        for(ColumnFamilyStore indexCfs : store.indexManager.getAllIndexColumnFamilyStores())
+        {
+            assert indexCfs.isIndex();
+            SSTableReader sstable = indexCfs.getLiveSSTables().iterator().next();
+            assert sstable.first.getToken() instanceof LocalToken;
+
+            try (SegmentedFile.Builder ibuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode(),
+                                                                           false);
+                 SegmentedFile.Builder dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(),
+                                                                           sstable.compression))
+            {
+                sstable.saveSummary(ibuilder, dbuilder);
+            }
+            SSTableReader reopened = SSTableReader.open(sstable.descriptor);
+            assert reopened.first.getToken() instanceof LocalToken;
+            reopened.selfRef().release();
+        }
+    }
+
+    /** see CASSANDRA-5407 */
+    @Test
+    public void testGetScannerForNoIntersectingRanges() throws Exception
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
+        partitioner = store.getPartitioner();
+
+        new RowUpdateBuilder(store.metadata, 0, "k1")
+            .clustering("xyz")
+            .add("val", "abc")
+            .build()
+            .applyUnsafe();
+
+        store.forceBlockingFlush();
+        boolean foundScanner = false;
+        for (SSTableReader s : store.getLiveSSTables())
+        {
+            try (ISSTableScanner scanner = s.getScanner(new Range<Token>(t(0), t(1)), null))
+            {
+                scanner.next(); // throws exception pre 5407
+                foundScanner = true;
+            }
+        }
+        assertTrue(foundScanner);
+    }
+
+    @Test
+    public void testGetPositionsForRangesFromTableOpenedForBulkLoading() throws IOException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
+        partitioner = store.getPartitioner();
+
+        // insert data and compact to a single sstable. The
+        // number of keys inserted is greater than index_interval
+        // to ensure multiple segments in the index file
+        CompactionManager.instance.disableAutoCompaction();
+        for (int j = 0; j < 130; j++)
+        {
+
+            new RowUpdateBuilder(store.metadata, j, String.valueOf(j))
+            .clustering("0")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store, false);
+
+        // construct a range which is present in the sstable, but whose
+        // keys are not found in the first segment of the index.
+        List<Range<Token>> ranges = new ArrayList<Range<Token>>();
+        ranges.add(new Range<Token>(t(98), t(99)));
+
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+        List<Pair<Long,Long>> sections = sstable.getPositionsForRanges(ranges);
+        assert sections.size() == 1 : "Expected to find range in sstable" ;
+
+        // re-open the same sstable as it would be during bulk loading
+        Set<Component> components = Sets.newHashSet(Component.DATA, Component.PRIMARY_INDEX);
+        if (sstable.components.contains(Component.COMPRESSION_INFO))
+            components.add(Component.COMPRESSION_INFO);
+        SSTableReader bulkLoaded = SSTableReader.openForBatch(sstable.descriptor, components, store.metadata);
+        sections = bulkLoaded.getPositionsForRanges(ranges);
+        assert sections.size() == 1 : "Expected to find range in sstable opened for bulk loading";
+        bulkLoaded.selfRef().release();
+    }
+
+    @Test
+    public void testIndexSummaryReplacement() throws IOException, ExecutionException, InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("StandardLowIndexInterval"); // index interval of 8, no key caching
+        CompactionManager.instance.disableAutoCompaction();
+
+        final int NUM_PARTITIONS = 512;
+        for (int j = 0; j < NUM_PARTITIONS; j++)
+        {
+            new RowUpdateBuilder(store.metadata, j, String.format("%3d", j))
+            .clustering("0")
+            .add("val", String.format("%3d", j))
+            .build()
+            .applyUnsafe();
+
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store, false);
+
+        Collection<SSTableReader> sstables = store.getLiveSSTables();
+        assert sstables.size() == 1;
+        final SSTableReader sstable = sstables.iterator().next();
+
+        ThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(5);
+        List<Future> futures = new ArrayList<>(NUM_PARTITIONS * 2);
+        for (int i = 0; i < NUM_PARTITIONS; i++)
+        {
+            final ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", i));
+            final int index = i;
+
+            futures.add(executor.submit(new Runnable()
+            {
+                public void run()
+                {
+                    Row row = Util.getOnlyRowUnfiltered(Util.cmd(store, key).build());
+                    assertEquals(0, ByteBufferUtil.compare(String.format("%3d", index).getBytes(), row.cells().iterator().next().value()));
+                }
+            }));
+
+            futures.add(executor.submit(new Runnable()
+            {
+                public void run()
+                {
+                    Iterable<DecoratedKey> results = store.keySamples(
+                            new Range<>(sstable.getPartitioner().getMinimumToken(), sstable.getPartitioner().getToken(key)));
+                    assertTrue(results.iterator().hasNext());
+                }
+            }));
+        }
+
+        SSTableReader replacement;
+        try (LifecycleTransaction txn = store.getTracker().tryModify(Arrays.asList(sstable), OperationType.UNKNOWN))
+        {
+            replacement = sstable.cloneWithNewSummarySamplingLevel(store, 1);
+            txn.update(replacement, true);
+            txn.finish();
+        }
+        for (Future future : futures)
+            future.get();
+
+        assertEquals(sstable.estimatedKeys(), replacement.estimatedKeys(), 1);
+    }
+
+    @Test
+    public void testIndexSummaryUpsampleAndReload() throws Exception
+    {
+        int originalMaxSegmentSize = MmappedRegions.MAX_SEGMENT_SIZE;
+        MmappedRegions.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
+
+        try
+        {
+            testIndexSummaryUpsampleAndReload0();
+        }
+        finally
+        {
+            MmappedRegions.MAX_SEGMENT_SIZE = originalMaxSegmentSize;
+        }
+    }
+
+    private void testIndexSummaryUpsampleAndReload0() throws Exception
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("StandardLowIndexInterval"); // index interval of 8, no key caching
+        CompactionManager.instance.disableAutoCompaction();
+
+        final int NUM_PARTITIONS = 512;
+        for (int j = 0; j < NUM_PARTITIONS; j++)
+        {
+            new RowUpdateBuilder(store.metadata, j, String.format("%3d", j))
+            .clustering("0")
+            .add("val", String.format("%3d", j))
+            .build()
+            .applyUnsafe();
+
+        }
+        store.forceBlockingFlush();
+        CompactionManager.instance.performMaximal(store, false);
+
+        Collection<SSTableReader> sstables = store.getLiveSSTables();
+        assert sstables.size() == 1;
+        final SSTableReader sstable = sstables.iterator().next();
+
+        try (LifecycleTransaction txn = store.getTracker().tryModify(Arrays.asList(sstable), OperationType.UNKNOWN))
+        {
+            SSTableReader replacement = sstable.cloneWithNewSummarySamplingLevel(store, sstable.getIndexSummarySamplingLevel() + 1);
+            txn.update(replacement, true);
+            txn.finish();
+        }
+        SSTableReader reopen = SSTableReader.open(sstable.descriptor);
+        assert reopen.getIndexSummarySamplingLevel() == sstable.getIndexSummarySamplingLevel() + 1;
+    }
+
+    private void assertIndexQueryWorks(ColumnFamilyStore indexedCFS)
+    {
+        assert "Indexed1".equals(indexedCFS.name);
+
+        // make sure all sstables including 2ary indexes load from disk
+        for (ColumnFamilyStore cfs : indexedCFS.concatWithIndexes())
+            clearAndLoad(cfs);
+
+
+        // query using index to see if sstable for secondary index opens
+        ReadCommand rc = Util.cmd(indexedCFS).fromKeyIncl("k1").toKeyIncl("k3")
+                                             .columns("birthdate")
+                                             .filterOn("birthdate", Operator.EQ, 1L)
+                                             .build();
+        Index.Searcher searcher = rc.getIndex(indexedCFS).searcherFor(rc);
+        assertNotNull(searcher);
+        try (ReadOrderGroup orderGroup = ReadOrderGroup.forCommand(rc))
+        {
+            assertEquals(1, Util.size(UnfilteredPartitionIterators.filter(searcher.search(orderGroup), rc.nowInSec())));
+        }
+    }
+
+    private List<Range<Token>> makeRanges(Token left, Token right)
+    {
+        return Arrays.asList(new Range<>(left, right));
+    }
+
+    private DecoratedKey k(int i)
+    {
+        return new BufferDecoratedKey(t(i), ByteBufferUtil.bytes(String.valueOf(i)));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
index f50953a..bdc3b42 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java

@@ -15,102 +15,56 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.io.sstable;
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.Test;
 
-import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.Config;
+import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.CompactionController;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.compaction.LazilyCompactedRow;
+import org.apache.cassandra.db.compaction.CompactionIterator;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.db.compaction.SSTableSplitter;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.metrics.StorageMetrics;
-import org.apache.cassandra.notifications.INotification;
-import org.apache.cassandra.notifications.INotificationConsumer;
-import org.apache.cassandra.notifications.SSTableListChangedNotification;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.UUIDGen;
 
 import static org.junit.Assert.*;
 
-public class SSTableRewriterTest extends SchemaLoader
+public class SSTableRewriterTest extends SSTableWriterTestBase
 {
-    private static final String KEYSPACE = "SSTableRewriterTest";
-    private static final String CF = "Standard1";
-
-    private static Config.DiskAccessMode standardMode;
-    private static Config.DiskAccessMode indexMode;
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        if (FBUtilities.isWindows())
-        {
-            standardMode = DatabaseDescriptor.getDiskAccessMode();
-            indexMode = DatabaseDescriptor.getIndexAccessMode();
-
-            DatabaseDescriptor.setDiskAccessMode(Config.DiskAccessMode.standard);
-            DatabaseDescriptor.setIndexAccessMode(Config.DiskAccessMode.standard);
-        }
-
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE,
-                SimpleStrategy.class,
-                KSMetaData.optsWithRF(1),
-                SchemaLoader.standardCFMD(KEYSPACE, CF));
-    }
-
-    @AfterClass
-    public static void revertDiskAccess()
-    {
-        DatabaseDescriptor.setDiskAccessMode(standardMode);
-        DatabaseDescriptor.setIndexAccessMode(indexMode);
-    }
-
-    @After
-    public void truncateCF()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
-        store.truncateBlocking();
-        SSTableDeletingTask.waitForDeletions();
-    }
-
     @Test
     public void basicTest() throws InterruptedException
     {
@@ -120,33 +74,34 @@
 
         for (int j = 0; j < 100; j ++)
         {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE, key);
-            rm.add(CF, Util.cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.apply();
+            new RowUpdateBuilder(cfs.metadata, j, String.valueOf(j))
+                .clustering("0")
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .apply();
         }
         cfs.forceBlockingFlush();
-        Set<SSTableReader> sstables = new HashSet<>(cfs.getSSTables());
+        Set<SSTableReader> sstables = new HashSet<>(cfs.getLiveSSTables());
         assertEquals(1, sstables.size());
         assertEquals(sstables.iterator().next().bytesOnDisk(), cfs.metric.liveDiskSpaceUsed.getCount());
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
-             SSTableRewriter writer = new SSTableRewriter(cfs, txn, 1000, false);)
+             SSTableRewriter writer = new SSTableRewriter(txn, 1000, false);
+             CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
-            ISSTableScanner scanner = scanners.scanners.get(0);
-            CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(System.currentTimeMillis()));
-            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory));
-            while(scanner.hasNext())
+            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory, txn));
+            while(ci.hasNext())
             {
-                AbstractCompactedRow row = new LazilyCompactedRow(controller, Arrays.asList(scanner.next()));
-                writer.append(row);
+                writer.append(ci.next());
             }
             writer.finish();
         }
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
+        assertEquals(1, assertFileCounts(sstables.iterator().next().descriptor.directory.list()));
+
         validateCFS(cfs);
-        int filecounts = assertFileCounts(sstables.iterator().next().descriptor.directory.list(), 0, 0);
-        assertEquals(1, filecounts);
         truncate(cfs);
     }
     @Test
@@ -158,27 +113,27 @@
 
         SSTableReader s = writeFile(cfs, 1000);
         cfs.addSSTable(s);
-        Set<SSTableReader> sstables = new HashSet<>(cfs.getSSTables());
+        Set<SSTableReader> sstables = new HashSet<>(cfs.getLiveSSTables());
         assertEquals(1, sstables.size());
 
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
-             SSTableRewriter writer = new SSTableRewriter(cfs, txn, 1000, false, 10000000);)
+             SSTableRewriter writer = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
-            ISSTableScanner scanner = scanners.scanners.get(0);
-            CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(System.currentTimeMillis()));
-            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory));
-            while (scanner.hasNext())
+            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory, txn));
+            while (ci.hasNext())
             {
-                AbstractCompactedRow row = new LazilyCompactedRow(controller, Arrays.asList(scanner.next()));
-                writer.append(row);
+                writer.append(ci.next());
             }
             writer.finish();
         }
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
+        assertEquals(1, assertFileCounts(sstables.iterator().next().descriptor.directory.list()));
+
         validateCFS(cfs);
-        int filecounts = assertFileCounts(sstables.iterator().next().descriptor.directory.list(), 0, 0);
-        assertEquals(1, filecounts);
     }
 
     @Test
@@ -190,30 +145,31 @@
 
         SSTableReader s = writeFile(cfs, 1000);
         cfs.addSSTable(s);
-        Set<SSTableReader> sstables = new HashSet<>(cfs.getSSTables());
+        Set<SSTableReader> sstables = new HashSet<>(cfs.getLiveSSTables());
         assertEquals(1, sstables.size());
 
+        int nowInSec = FBUtilities.nowInSeconds();
         boolean checked = false;
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
+        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
-             SSTableRewriter writer = new SSTableRewriter(cfs, txn, 1000, false, 10000000))
+             SSTableRewriter writer = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
-            ISSTableScanner scanner = scanners.scanners.get(0);
-            CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(System.currentTimeMillis()));
-            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory));
-            while (scanner.hasNext())
+            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory, txn));
+            while (ci.hasNext())
             {
-                AbstractCompactedRow row = new LazilyCompactedRow(controller, Arrays.asList(scanner.next()));
+                UnfilteredRowIterator row = ci.next();
                 writer.append(row);
-                if (!checked && writer.currentWriter().getFilePointer() > 15000000)
+                if (!checked && writer.currentWriter().getFilePointer() > 1500000)
                 {
                     checked = true;
-                    for (SSTableReader sstable : cfs.getSSTables())
+                    for (SSTableReader sstable : cfs.getLiveSSTables())
                     {
                         if (sstable.openReason == SSTableReader.OpenReason.EARLY)
                         {
                             SSTableReader c = txn.current(sstables.iterator().next());
-                            Collection<Range<Token>> r = Arrays.asList(new Range<>(cfs.partitioner.getMinimumToken(), cfs.partitioner.getMinimumToken()));
+                            Collection<Range<Token>> r = Arrays.asList(new Range<>(cfs.getPartitioner().getMinimumToken(), cfs.getPartitioner().getMinimumToken()));
                             List<Pair<Long, Long>> tmplinkPositions = sstable.getPositionsForRanges(r);
                             List<Pair<Long, Long>> compactingPositions = c.getPositionsForRanges(r);
                             assertEquals(1, tmplinkPositions.size());
@@ -229,56 +185,14 @@
             assertTrue(checked);
             writer.finish();
         }
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
+        assertEquals(1, assertFileCounts(sstables.iterator().next().descriptor.directory.list()));
+
         validateCFS(cfs);
-        int filecounts = assertFileCounts(sstables.iterator().next().descriptor.directory.list(), 0, 0);
-        assertEquals(1, filecounts);
         truncate(cfs);
     }
 
     @Test
-    public void testFileRemoval() throws InterruptedException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
-        truncate(cfs);
-
-        ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        for (int i = 0; i < 100; i++)
-            cf.addColumn(Util.cellname(i), ByteBuffer.allocate(1000), 1);
-        File dir = cfs.directories.getDirectoryForNewSSTables();
-
-        try (SSTableWriter writer = getWriter(cfs, dir);)
-        {
-            for (int i = 0; i < 10000; i++)
-                writer.append(StorageService.getPartitioner().decorateKey(random(i, 10)), cf);
-            SSTableReader s = writer.setMaxDataAge(1000).openEarly();
-            assert s != null;
-            assertFileCounts(dir.list(), 2, 2);
-            for (int i = 10000; i < 20000; i++)
-                writer.append(StorageService.getPartitioner().decorateKey(random(i, 10)), cf);
-            SSTableReader s2 = writer.setMaxDataAge(1000).openEarly();
-            assertTrue(s.last.compareTo(s2.last) < 0);
-            assertFileCounts(dir.list(), 2, 2);
-            s.markObsolete(cfs.getTracker());
-            s.selfRef().release();
-            s2.selfRef().release();
-            // These checks don't work on Windows because the writer has the channel still
-            // open till .abort() is called (via the builder)
-            if (!FBUtilities.isWindows())
-            {
-                SSTableDeletingTask.waitForDeletions();
-                assertFileCounts(dir.list(), 0, 2);
-            }
-            writer.abort();
-            SSTableDeletingTask.waitForDeletions();
-            int datafiles = assertFileCounts(dir.list(), 0, 0);
-            assertEquals(datafiles, 0);
-            validateCFS(cfs);
-        }
-    }
-
-    @Test
     public void testNumberOfFilesAndSizes() throws Exception
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE);
@@ -296,17 +210,19 @@
         try (ISSTableScanner scanner = s.getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 10000000))
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while(scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+
+            while(ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
+                rewriter.append(ci.next());
                 if (rewriter.currentWriter().getOnDiskFilePointer() > 25000000)
                 {
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                     files++;
-                    assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                    assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
                     assertEquals(s.bytesOnDisk(), cfs.metric.liveDiskSpaceUsed.getCount());
                     assertEquals(s.bytesOnDisk(), cfs.metric.totalDiskSpaceUsed.getCount());
 
@@ -314,18 +230,21 @@
             }
             sstables = rewriter.finish();
         }
+
+        LifecycleTransaction.waitForDeletions();
+
         long sum = 0;
-        for (SSTableReader x : cfs.getSSTables())
+        for (SSTableReader x : cfs.getLiveSSTables())
             sum += x.bytesOnDisk();
         assertEquals(sum, cfs.metric.liveDiskSpaceUsed.getCount());
         assertEquals(startStorageMetricsLoad - sBytesOnDisk + sum, StorageMetrics.load.getCount());
         assertEquals(files, sstables.size());
-        assertEquals(files, cfs.getSSTables().size());
-        SSTableDeletingTask.waitForDeletions();
+        assertEquals(files, cfs.getLiveSSTables().size());
+        LifecycleTransaction.waitForDeletions();
 
         // tmplink and tmp files should be gone:
         assertEquals(sum, cfs.metric.totalDiskSpaceUsed.getCount());
-        assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        assertFileCounts(s.descriptor.directory.list());
         validateCFS(cfs);
     }
 
@@ -346,27 +265,29 @@
         try (ISSTableScanner scanner = s.getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 10000000))
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while(scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+
+            while(ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
+                rewriter.append(ci.next());
                 if (rewriter.currentWriter().getOnDiskFilePointer() > 25000000)
                 {
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                     files++;
-                    assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                    assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
                 }
             }
             sstables = rewriter.finish();
         }
 
         assertEquals(files, sstables.size());
-        assertEquals(files, cfs.getSSTables().size());
-        SSTableDeletingTask.waitForDeletions();
+        assertEquals(files, cfs.getLiveSSTables().size());
+        LifecycleTransaction.waitForDeletions();
 
-        assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        assertFileCounts(s.descriptor.directory.list());
         validateCFS(cfs);
     }
 
@@ -376,20 +297,28 @@
     {
         testNumberOfFiles_abort(new RewriterTest()
         {
-            public void run(ISSTableScanner scanner, CompactionController controller, SSTableReader sstable, ColumnFamilyStore cfs, SSTableRewriter rewriter)
+            public void run(ISSTableScanner scanner,
+                            CompactionController controller,
+                            SSTableReader sstable,
+                            ColumnFamilyStore cfs,
+                            SSTableRewriter rewriter,
+                            LifecycleTransaction txn)
             {
-                int files = 1;
-                while(scanner.hasNext())
+                try (CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
                 {
-                    rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
-                    if (rewriter.currentWriter().getFilePointer() > 25000000)
+                    int files = 1;
+                    while (ci.hasNext())
                     {
-                        rewriter.switchWriter(getWriter(cfs, sstable.descriptor.directory));
-                        files++;
-                        assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                        rewriter.append(ci.next());
+                        if (rewriter.currentWriter().getFilePointer() > 25000000)
+                        {
+                        rewriter.switchWriter(getWriter(cfs, sstable.descriptor.directory, txn));
+                            files++;
+                            assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                        }
                     }
+                    rewriter.abort();
                 }
-                rewriter.abort();
             }
         });
     }
@@ -399,23 +328,31 @@
     {
         testNumberOfFiles_abort(new RewriterTest()
         {
-            public void run(ISSTableScanner scanner, CompactionController controller, SSTableReader sstable, ColumnFamilyStore cfs, SSTableRewriter rewriter)
+            public void run(ISSTableScanner scanner,
+                            CompactionController controller,
+                            SSTableReader sstable,
+                            ColumnFamilyStore cfs,
+                            SSTableRewriter rewriter,
+                            LifecycleTransaction txn)
             {
-                int files = 1;
-                while(scanner.hasNext())
+                try (CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
                 {
-                    rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
-                    if (rewriter.currentWriter().getFilePointer() > 25000000)
+                    int files = 1;
+                    while (ci.hasNext())
                     {
-                        rewriter.switchWriter(getWriter(cfs, sstable.descriptor.directory));
-                        files++;
-                        assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
-                    }
-                    if (files == 3)
-                    {
-                        //testing to abort when we have nothing written in the new file
-                        rewriter.abort();
-                        break;
+                        rewriter.append(ci.next());
+                        if (rewriter.currentWriter().getFilePointer() > 25000000)
+                        {
+                        rewriter.switchWriter(getWriter(cfs, sstable.descriptor.directory, txn));
+                            files++;
+                            assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                        }
+                        if (files == 3)
+                        {
+                            //testing to abort when we have nothing written in the new file
+                            rewriter.abort();
+                            break;
+                        }
                     }
                 }
             }
@@ -427,27 +364,40 @@
     {
         testNumberOfFiles_abort(new RewriterTest()
         {
-            public void run(ISSTableScanner scanner, CompactionController controller, SSTableReader sstable, ColumnFamilyStore cfs, SSTableRewriter rewriter)
+            public void run(ISSTableScanner scanner,
+                            CompactionController controller,
+                            SSTableReader sstable,
+                            ColumnFamilyStore cfs,
+                            SSTableRewriter rewriter,
+                            LifecycleTransaction txn)
             {
-                int files = 1;
-                while(scanner.hasNext())
+                try(CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
                 {
-                    rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
-                    if (files == 1 && rewriter.currentWriter().getFilePointer() > 10000000)
+                    int files = 1;
+                    while (ci.hasNext())
                     {
-                        rewriter.switchWriter(getWriter(cfs, sstable.descriptor.directory));
-                        files++;
-                        assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                        rewriter.append(ci.next());
+                        if (files == 1 && rewriter.currentWriter().getFilePointer() > 10000000)
+                        {
+                        rewriter.switchWriter(getWriter(cfs, sstable.descriptor.directory, txn));
+                            files++;
+                            assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                        }
                     }
+                    rewriter.abort();
                 }
-                rewriter.abort();
             }
         });
     }
 
     private static interface RewriterTest
     {
-        public void run(ISSTableScanner scanner, CompactionController controller, SSTableReader sstable, ColumnFamilyStore cfs, SSTableRewriter rewriter);
+        public void run(ISSTableScanner scanner,
+                        CompactionController controller,
+                        SSTableReader sstable,
+                        ColumnFamilyStore cfs,
+                        SSTableRewriter rewriter,
+                        LifecycleTransaction txn);
     }
 
     private void testNumberOfFiles_abort(RewriterTest test) throws Exception
@@ -463,23 +413,22 @@
         DecoratedKey origLast = s.last;
         long startSize = cfs.metric.liveDiskSpaceUsed.getCount();
         Set<SSTableReader> compacting = Sets.newHashSet(s);
-
         try (ISSTableScanner scanner = s.getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 10000000);)
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 10000000, false))
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            test.run(scanner, controller, s, cfs, rewriter);
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+            test.run(scanner, controller, s, cfs, rewriter, txn);
         }
 
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
 
         assertEquals(startSize, cfs.metric.liveDiskSpaceUsed.getCount());
-        assertEquals(1, cfs.getSSTables().size());
-        assertFileCounts(s.descriptor.directory.list(), 0, 0);
-        assertEquals(cfs.getSSTables().iterator().next().first, origFirst);
-        assertEquals(cfs.getSSTables().iterator().next().last, origLast);
+        assertEquals(1, cfs.getLiveSSTables().size());
+        assertFileCounts(s.descriptor.directory.list());
+        assertEquals(cfs.getLiveSSTables().iterator().next().first, origFirst);
+        assertEquals(cfs.getLiveSSTables().iterator().next().last, origLast);
         validateCFS(cfs);
     }
 
@@ -499,17 +448,18 @@
         try (ISSTableScanner scanner = s.getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 10000000))
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while(scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+            while(ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
-                if (rewriter.currentWriter().getFilePointer() > 25000000)
+                rewriter.append(ci.next());
+                if (rewriter.currentWriter().getFilePointer() > 2500000)
                 {
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                     files++;
-                    assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                    assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
                 }
                 if (files == 3)
                 {
@@ -520,10 +470,10 @@
             }
         }
 
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
 
-        assertEquals(files - 1, cfs.getSSTables().size()); // we never wrote anything to the last file
-        assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        assertEquals(files - 1, cfs.getLiveSSTables().size()); // we never wrote anything to the last file
+        assertFileCounts(s.descriptor.directory.list());
         validateCFS(cfs);
     }
 
@@ -544,25 +494,26 @@
         try (ISSTableScanner scanner = s.getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 10000000))
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while(scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+            while(ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
+                rewriter.append(ci.next());
                 if (rewriter.currentWriter().getOnDiskFilePointer() > 25000000)
                 {
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                     files++;
-                    assertEquals(cfs.getSSTables().size(), files); // we have one original file plus the ones we have switched out.
+                    assertEquals(cfs.getLiveSSTables().size(), files); // we have one original file plus the ones we have switched out.
                 }
             }
 
             sstables = rewriter.finish();
         }
 
-        SSTableDeletingTask.waitForDeletions();
-        assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        LifecycleTransaction.waitForDeletions();
+        assertFileCounts(s.descriptor.directory.list());
         validateCFS(cfs);
     }
 
@@ -583,27 +534,27 @@
         try (ISSTableScanner scanner = s.getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 1000000);)
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 1000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()))
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while(scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+            while(ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
+                rewriter.append(ci.next());
                 if (rewriter.currentWriter().getOnDiskFilePointer() > 2500000)
                 {
-                    assertEquals(files, cfs.getSSTables().size()); // all files are now opened early
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    assertEquals(files, cfs.getLiveSSTables().size()); // all files are now opened early
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                     files++;
                 }
             }
 
             sstables = rewriter.finish();
         }
-
         assertEquals(files, sstables.size());
-        assertEquals(files, cfs.getSSTables().size());
-        SSTableDeletingTask.waitForDeletions();
-        assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        assertEquals(files, cfs.getLiveSSTables().size());
+        LifecycleTransaction.waitForDeletions();
+        assertFileCounts(s.descriptor.directory.list());
 
         validateCFS(cfs);
     }
@@ -621,10 +572,8 @@
             SSTableSplitter splitter = new SSTableSplitter(cfs, txn, 10);
             splitter.split();
 
-            assertFileCounts(s.descriptor.directory.list(), 0, 0);
-
-            s.selfRef().release();
-            SSTableDeletingTask.waitForDeletions();
+            assertFileCounts(s.descriptor.directory.list());
+            LifecycleTransaction.waitForDeletions();
 
             for (File f : s.descriptor.directory.listFiles())
             {
@@ -671,16 +620,17 @@
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = offline ? LifecycleTransaction.offline(OperationType.UNKNOWN, compacting)
                                        : cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, offline, 10000000);
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, offline, 10000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID())
         )
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while (scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+            while (ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
+                rewriter.append(ci.next());
                 if (rewriter.currentWriter().getOnDiskFilePointer() > 25000000)
                 {
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                 }
             }
             try
@@ -699,22 +649,22 @@
                 s.selfRef().release();
         }
 
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
 
-        int filecount = assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        int filecount = assertFileCounts(s.descriptor.directory.list());
         assertEquals(filecount, 1);
         if (!offline)
         {
-            assertEquals(1, cfs.getSSTables().size());
+            assertEquals(1, cfs.getLiveSSTables().size());
             validateCFS(cfs);
             truncate(cfs);
         }
         else
         {
-            assertEquals(0, cfs.getSSTables().size());
+            assertEquals(0, cfs.getLiveSSTables().size());
             cfs.truncateBlocking();
         }
-        filecount = assertFileCounts(s.descriptor.directory.list(), 0, 0);
+        filecount = assertFileCounts(s.descriptor.directory.list());
         if (offline)
         {
             // the file is not added to the CFS, therefore not truncated away above
@@ -723,7 +673,7 @@
             {
                 FileUtils.deleteRecursive(f);
             }
-            filecount = assertFileCounts(s.descriptor.directory.list(), 0, 0);
+            filecount = assertFileCounts(s.descriptor.directory.list());
         }
 
         assertEquals(0, filecount);
@@ -738,18 +688,21 @@
         truncate(cfs);
         for (int i = 0; i < 100; i++)
         {
-            DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(KEYSPACE, key.getKey());
+            String key = Integer.toString(i);
+
             for (int j = 0; j < 10; j++)
-                rm.add(CF, Util.cellname(Integer.toString(j)), ByteBufferUtil.EMPTY_BYTE_BUFFER, 100);
-            rm.apply();
+                new RowUpdateBuilder(cfs.metadata, 100, key)
+                    .clustering(Integer.toString(j))
+                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                    .build()
+                    .apply();
         }
         cfs.forceBlockingFlush();
         cfs.forceMajorCompaction();
         validateKeys(keyspace);
 
-        assertEquals(1, cfs.getSSTables().size());
-        SSTableReader s = cfs.getSSTables().iterator().next();
+        assertEquals(1, cfs.getLiveSSTables().size());
+        SSTableReader s = cfs.getLiveSSTables().iterator().next();
         Set<SSTableReader> compacting = new HashSet<>();
         compacting.add(s);
 
@@ -757,16 +710,17 @@
         try (ISSTableScanner scanner = compacting.iterator().next().getScanner();
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(cfs, txn, 1000, false, 1);
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, false, 1, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID())
         )
         {
-            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
-            while (scanner.hasNext())
+            rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
+            while (ci.hasNext())
             {
-                rewriter.append(new LazilyCompactedRow(controller, Arrays.asList(scanner.next())));
+                rewriter.append(ci.next());
                 if (keyCount % 10 == 0)
                 {
-                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory));
+                    rewriter.switchWriter(getWriter(cfs, s.descriptor.directory, txn));
                 }
                 keyCount++;
                 validateKeys(keyspace);
@@ -774,7 +728,7 @@
             rewriter.finish();
         }
         validateKeys(keyspace);
-        SSTableDeletingTask.waitForDeletions();
+        LifecycleTransaction.waitForDeletions();
         validateCFS(cfs);
         truncate(cfs);
     }
@@ -794,18 +748,18 @@
         try (ISSTableScanner scanner = sstables.iterator().next().getScanner();
              CompactionController controller = new CompactionController(cfs, sstables, 0);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
-             SSTableRewriter writer = new SSTableRewriter(cfs, txn, 1000, false, 10000000);
+             SSTableRewriter writer = new SSTableRewriter(txn, 1000, false, 10000000, false);
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID())
         )
         {
-            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory));
-            while (scanner.hasNext())
+            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory, txn));
+            while (ci.hasNext())
             {
-                AbstractCompactedRow row = new LazilyCompactedRow(controller, Collections.singletonList(scanner.next()));
-                writer.append(row);
+                writer.append(ci.next());
                 if (!checked && writer.currentWriter().getFilePointer() > 15000000)
                 {
                     checked = true;
-                    ColumnFamilyStore.ViewFragment viewFragment = cfs.select(ColumnFamilyStore.CANONICAL_SSTABLES);
+                    ColumnFamilyStore.ViewFragment viewFragment = cfs.select(View.selectFunction(SSTableSet.CANONICAL));
                     // canonical view should have only one SSTable which is not opened early.
                     assertEquals(1, viewFragment.sstables.size());
                     SSTableReader sstable = viewFragment.sstables.get(0);
@@ -827,7 +781,7 @@
 
         cfs.addSSTable(writeFile(cfs, 1000));
 
-        Collection<SSTableReader> allSSTables = cfs.getSSTables();
+        Collection<SSTableReader> allSSTables = cfs.getLiveSSTables();
         assertEquals(1, allSSTables.size());
         final Token firstToken = allSSTables.iterator().next().first.getToken();
         DatabaseDescriptor.setSSTablePreempiveOpenIntervalInMB(1);
@@ -893,56 +847,80 @@
         cfs.addSSTable(s);
         Set<SSTableReader> sstables = Sets.newHashSet(s);
         assertEquals(1, sstables.size());
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
-             SSTableRewriter writer = new SSTableRewriter(cfs, txn, 1000, false, false);
-             SSTableRewriter writer2 = new SSTableRewriter(cfs, txn, 1000, false, false))
+             SSTableRewriter writer = new SSTableRewriter(txn, 1000, false, false);
+             SSTableRewriter writer2 = new SSTableRewriter(txn, 1000, false, false);
+             CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID())
+             )
         {
-            ISSTableScanner scanner = scanners.scanners.get(0);
-            CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(System.currentTimeMillis()));
-            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory));
-            writer2.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory));
-            while (scanner.hasNext())
+            writer.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory, txn));
+            writer2.switchWriter(getWriter(cfs, sstables.iterator().next().descriptor.directory, txn));
+            while (ci.hasNext())
             {
-                AbstractCompactedRow row = new LazilyCompactedRow(controller, Collections.singletonList(scanner.next()));
-
                 if (writer.currentWriter().getFilePointer() < 15000000)
-                    writer.append(row);
+                    writer.append(ci.next());
                 else
-                    writer2.append(row);
+                    writer2.append(ci.next());
             }
             for (int i = 0; i < 5000; i++)
-            {
-                DecoratedKey key = Util.dk(ByteBufferUtil.bytes(i));
-                ColumnFamily cf = Util.getColumnFamily(keyspace, key, CF);
-                assertTrue(cf != null);
-            }
+                assertFalse(Util.getOnlyPartition(Util.cmd(cfs, ByteBufferUtil.bytes(i)).build()).isEmpty());
         }
         truncateCF();
         validateCFS(cfs);
     }
 
+    @Test
+    public void testCanonicalSSTables() throws ExecutionException, InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
+        truncate(cfs);
+
+        cfs.addSSTable(writeFile(cfs, 100));
+        Collection<SSTableReader> allSSTables = cfs.getLiveSSTables();
+        assertEquals(1, allSSTables.size());
+        final AtomicBoolean done = new AtomicBoolean(false);
+        final AtomicBoolean failed = new AtomicBoolean(false);
+        Runnable r = () -> {
+            while (!done.get())
+            {
+                Iterable<SSTableReader> sstables = cfs.getSSTables(SSTableSet.CANONICAL);
+                if (Iterables.size(sstables) != 1)
+                {
+                    failed.set(true);
+                    return;
+                }
+            }
+        };
+        Thread t = new Thread(r);
+        try
+        {
+            t.start();
+            cfs.forceMajorCompaction();
+        }
+        finally
+        {
+            done.set(true);
+            t.join(20);
+        }
+        assertFalse(failed.get());
+
+
+    }
 
     private void validateKeys(Keyspace ks)
     {
         for (int i = 0; i < 100; i++)
         {
             DecoratedKey key = Util.dk(Integer.toString(i));
-            ColumnFamily cf = Util.getColumnFamily(ks, key, CF);
-            assertTrue(cf != null);
+            ImmutableBTreePartition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(ks.getColumnFamilyStore(CF), key).build());
+            assertTrue(partition != null && partition.rowCount() > 0);
         }
     }
 
-    public static void truncate(ColumnFamilyStore cfs)
-    {
-        cfs.truncateBlocking();
-        SSTableDeletingTask.waitForDeletions();
-        Uninterruptibles.sleepUninterruptibly(10L,TimeUnit.MILLISECONDS);
-        assertEquals(0, cfs.metric.liveDiskSpaceUsed.getCount());
-        assertEquals(0, cfs.metric.totalDiskSpaceUsed.getCount());
-        validateCFS(cfs);
-    }
-
     public static SSTableReader writeFile(ColumnFamilyStore cfs, int count)
     {
         return Iterables.getFirst(writeFiles(cfs, 1, count * 5, count / 100, 1000), null);
@@ -954,86 +932,23 @@
         Set<SSTableReader> result = new LinkedHashSet<>();
         for (int f = 0 ; f < fileCount ; f++)
         {
-            File dir = cfs.directories.getDirectoryForNewSSTables();
-            String filename = cfs.getTempSSTablePath(dir);
+            File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+            String filename = cfs.getSSTablePath(dir);
 
-            SSTableWriter writer = SSTableWriter.create(filename, 0, 0);
-            int end = f == fileCount - 1 ? partitionCount : ((f + 1) * partitionCount) / fileCount;
-            for ( ; i < end ; i++)
+            try (SSTableTxnWriter writer = SSTableTxnWriter.create(cfs, filename, 0, 0, new SerializationHeader(true, cfs.metadata, cfs.metadata.partitionColumns(), EncodingStats.NO_STATS)))
             {
-                ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-                for (int j = 0; j < cellCount ; j++)
-                    cf.addColumn(Util.cellname(j), random(0, cellSize), 1);
-                writer.append(StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(i)), cf);
+                int end = f == fileCount - 1 ? partitionCount : ((f + 1) * partitionCount) / fileCount;
+                for ( ; i < end ; i++)
+                {
+                    UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, ByteBufferUtil.bytes(i));
+                    for (int j = 0; j < cellCount ; j++)
+                        builder.newRow(Integer.toString(i)).add("val", random(0, 1000));
+
+                    writer.append(builder.build().unfilteredIterator());
+                }
+                result.addAll(writer.finish(true));
             }
-            result.add(writer.finish(true));
         }
         return result;
     }
-
-    public static void validateCFS(ColumnFamilyStore cfs)
-    {
-        Set<Integer> liveDescriptors = new HashSet<>();
-        long spaceUsed = 0;
-        for (SSTableReader sstable : cfs.getSSTables())
-        {
-            assertFalse(sstable.isMarkedCompacted());
-            assertEquals(1, sstable.selfRef().globalCount());
-            liveDescriptors.add(sstable.descriptor.generation);
-            spaceUsed += sstable.bytesOnDisk();
-        }
-        for (File dir : cfs.directories.getCFDirectories())
-        {
-            for (File f : dir.listFiles())
-            {
-                if (f.getName().contains("Data"))
-                {
-                    Descriptor d = Descriptor.fromFilename(f.getAbsolutePath());
-                    assertTrue(d.toString(), liveDescriptors.contains(d.generation));
-                }
-            }
-        }
-        assertEquals(spaceUsed, cfs.metric.liveDiskSpaceUsed.getCount());
-        assertEquals(spaceUsed, cfs.metric.totalDiskSpaceUsed.getCount());
-        assertTrue(cfs.getTracker().getCompacting().isEmpty());
-        if (cfs.getSSTables().size() > 0)
-            assertFalse(CompactionManager.instance.submitMaximal(cfs, cfs.gcBefore(System.currentTimeMillis()/1000), false).isEmpty());
-    }
-
-    public static int assertFileCounts(String [] files, int expectedtmplinkCount, int expectedtmpCount)
-    {
-        int tmplinkcount = 0;
-        int tmpcount = 0;
-        int datacount = 0;
-        for (String f : files)
-        {
-            if (f.endsWith("-CRC.db"))
-                continue;
-            if (f.contains("tmplink-"))
-                tmplinkcount++;
-            else if (f.contains("tmp-"))
-                tmpcount++;
-            else if (f.contains("Data"))
-                datacount++;
-        }
-        assertEquals(expectedtmplinkCount, tmplinkcount);
-        assertEquals(expectedtmpCount, tmpcount);
-        return datacount;
-    }
-
-    public static SSTableWriter getWriter(ColumnFamilyStore cfs, File directory)
-    {
-        String filename = cfs.getTempSSTablePath(directory);
-        return SSTableWriter.create(filename, 0, 0);
-    }
-
-    public static ByteBuffer random(int i, int size)
-    {
-        byte[] bytes = new byte[size + 4];
-        ThreadLocalRandom.current().nextBytes(bytes);
-        ByteBuffer r = ByteBuffer.wrap(bytes);
-        r.putInt(0, i);
-        return r;
-    }
-
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
index f8b808d..cf57b17 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java

@@ -1,42 +1,59 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.io.sstable;
 
 import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.junit.BeforeClass;
 import com.google.common.collect.Iterables;
+import com.google.common.util.concurrent.RateLimiter;
+
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 import static org.apache.cassandra.dht.AbstractBounds.isEmpty;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 public class SSTableScannerTest
 {
@@ -48,8 +65,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE, TABLE));
     }
 
@@ -59,51 +75,51 @@
     }
 
     // we produce all DataRange variations that produce an inclusive start and exclusive end range
-    private static Iterable<DataRange> dataRanges(int start, int end)
+    private static Iterable<DataRange> dataRanges(CFMetaData metadata, int start, int end)
     {
         if (end < start)
-            return dataRanges(start, end, false, true);
-        return Iterables.concat(dataRanges(start, end, false, false),
-                                dataRanges(start, end, false, true),
-                                dataRanges(start, end, true, false),
-                                dataRanges(start, end, true, true)
+            return dataRanges(metadata, start, end, false, true);
+        return Iterables.concat(dataRanges(metadata, start, end, false, false),
+                                dataRanges(metadata, start, end, false, true),
+                                dataRanges(metadata, start, end, true, false),
+                                dataRanges(metadata, start, end, true, true)
         );
     }
 
-    private static Iterable<DataRange> dataRanges(int start, int end, boolean inclusiveStart, boolean inclusiveEnd)
+    private static Iterable<DataRange> dataRanges(CFMetaData metadata, int start, int end, boolean inclusiveStart, boolean inclusiveEnd)
     {
         List<DataRange> ranges = new ArrayList<>();
         if (start == end + 1)
         {
             assert !inclusiveStart && inclusiveEnd;
-            ranges.add(dataRange(min(start), false, max(end), true));
-            ranges.add(dataRange(min(start), false, min(end + 1), true));
-            ranges.add(dataRange(max(start - 1), false, max(end), true));
-            ranges.add(dataRange(dk(start - 1), false, dk(start - 1), true));
+            ranges.add(dataRange(metadata, min(start), false, max(end), true));
+            ranges.add(dataRange(metadata, min(start), false, min(end + 1), true));
+            ranges.add(dataRange(metadata, max(start - 1), false, max(end), true));
+            ranges.add(dataRange(metadata, dk(start - 1), false, dk(start - 1), true));
         }
         else
         {
-            for (RowPosition s : starts(start, inclusiveStart))
+            for (PartitionPosition s : starts(start, inclusiveStart))
             {
-                for (RowPosition e : ends(end, inclusiveEnd))
+                for (PartitionPosition e : ends(end, inclusiveEnd))
                 {
                     if (end < start && e.compareTo(s) > 0)
                         continue;
                     if (!isEmpty(new AbstractBounds.Boundary<>(s, inclusiveStart), new AbstractBounds.Boundary<>(e, inclusiveEnd)))
                         continue;
-                    ranges.add(dataRange(s, inclusiveStart, e, inclusiveEnd));
+                    ranges.add(dataRange(metadata, s, inclusiveStart, e, inclusiveEnd));
                 }
             }
         }
         return ranges;
     }
 
-    private static Iterable<RowPosition> starts(int key, boolean inclusive)
+    private static Iterable<PartitionPosition> starts(int key, boolean inclusive)
     {
         return Arrays.asList(min(key), max(key - 1), dk(inclusive ? key : key - 1));
     }
 
-    private static Iterable<RowPosition> ends(int key, boolean inclusive)
+    private static Iterable<PartitionPosition> ends(int key, boolean inclusive)
     {
         return Arrays.asList(max(key), min(key + 1), dk(inclusive ? key : key + 1));
     }
@@ -118,19 +134,22 @@
         return key == Integer.MIN_VALUE ? ByteOrderedPartitioner.MINIMUM : new ByteOrderedPartitioner.BytesToken(toKey(key).getBytes());
     }
 
-    private static RowPosition min(int key)
+    private static PartitionPosition min(int key)
     {
         return token(key).minKeyBound();
     }
 
-    private static RowPosition max(int key)
+    private static PartitionPosition max(int key)
     {
         return token(key).maxKeyBound();
     }
 
-    private static DataRange dataRange(RowPosition start, boolean startInclusive, RowPosition end, boolean endInclusive)
+    private static DataRange dataRange(CFMetaData metadata, PartitionPosition start, boolean startInclusive, PartitionPosition end, boolean endInclusive)
     {
-        return new DataRange(AbstractBounds.bounds(start, startInclusive, end, endInclusive), new IdentityQueryFilter());
+        Slices.Builder sb = new Slices.Builder(metadata.comparator);
+        ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(sb.build(), false);
+
+        return new DataRange(AbstractBounds.bounds(start, startInclusive, end, endInclusive), filter);
     }
 
     private static Range<Token> rangeFor(int start, int end)
@@ -147,25 +166,31 @@
         return ranges;
     }
 
-    private static void insertRowWithKey(int key)
+    private static void insertRowWithKey(CFMetaData metadata, int key)
     {
         long timestamp = System.currentTimeMillis();
-        DecoratedKey decoratedKey = Util.dk(toKey(key));
-        Mutation rm = new Mutation(KEYSPACE, decoratedKey.getKey());
-        rm.add(TABLE, Util.cellname("col"), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, 1000);
-        rm.applyUnsafe();
+
+        new RowUpdateBuilder(metadata, timestamp, toKey(key))
+            .clustering("col")
+            .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+            .build()
+            .applyUnsafe();
+
     }
 
     private static void assertScanMatches(SSTableReader sstable, int scanStart, int scanEnd, int ... boundaries)
     {
         assert boundaries.length % 2 == 0;
-        for (DataRange range : dataRanges(scanStart, scanEnd))
+        for (DataRange range : dataRanges(sstable.metadata, scanStart, scanEnd))
         {
-            try(ISSTableScanner scanner = sstable.getScanner(range))
+            try(ISSTableScanner scanner = sstable.getScanner(ColumnFilter.all(sstable.metadata),
+                                                             range,
+                                                             false,
+                                                             SSTableReadsListener.NOOP_LISTENER))
             {
                 for (int b = 0; b < boundaries.length; b += 2)
                     for (int i = boundaries[b]; i <= boundaries[b + 1]; i++)
-                        assertEquals(toKey(i), new String(scanner.next().getKey().getKey().array()));
+                        assertEquals(toKey(i), new String(scanner.next().partitionKey().getKey().array()));
                 assertFalse(scanner.hasNext());
             }
             catch (Exception e)
@@ -191,16 +216,16 @@
         store.disableAutoCompaction();
 
         for (int i = 2; i < 10; i++)
-            insertRowWithKey(i);
+            insertRowWithKey(store.metadata, i);
         store.forceBlockingFlush();
 
-        assertEquals(1, store.getSSTables().size());
-        SSTableReader sstable = store.getSSTables().iterator().next();
+        assertEquals(1, store.getLiveSSTables().size());
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
 
         // full range scan
-        ISSTableScanner scanner = sstable.getScanner();
+        ISSTableScanner scanner = sstable.getScanner(RateLimiter.create(Double.MAX_VALUE));
         for (int i = 2; i < 10; i++)
-            assertEquals(toKey(i), new String(scanner.next().getKey().getKey().array()));
+            assertEquals(toKey(i), new String(scanner.next().partitionKey().getKey().array()));
 
         scanner.close();
 
@@ -278,7 +303,7 @@
             for (int expected = rangeStart; expected <= rangeEnd; expected++)
             {
                 assertTrue(String.format("Expected to see key %03d", expected), scanner.hasNext());
-                assertEquals(toKey(expected), new String(scanner.next().getKey().getKey().array()));
+                assertEquals(toKey(expected), new String(scanner.next().partitionKey().getKey().array()));
             }
         }
         assertFalse(scanner.hasNext());
@@ -297,14 +322,14 @@
 
         for (int i = 0; i < 3; i++)
             for (int j = 2; j < 10; j++)
-                insertRowWithKey(i * 100 + j);
+                insertRowWithKey(store.metadata, i * 100 + j);
         store.forceBlockingFlush();
 
-        assertEquals(1, store.getSSTables().size());
-        SSTableReader sstable = store.getSSTables().iterator().next();
+        assertEquals(1, store.getLiveSSTables().size());
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
 
         // full range scan
-        ISSTableScanner fullScanner = sstable.getScanner();
+        ISSTableScanner fullScanner = sstable.getScanner(RateLimiter.create(Double.MAX_VALUE));
         assertScanContainsRanges(fullScanner,
                                  2, 9,
                                  102, 109,
@@ -427,14 +452,14 @@
         // disable compaction while flushing
         store.disableAutoCompaction();
 
-        insertRowWithKey(205);
+        insertRowWithKey(store.metadata, 205);
         store.forceBlockingFlush();
 
-        assertEquals(1, store.getSSTables().size());
-        SSTableReader sstable = store.getSSTables().iterator().next();
+        assertEquals(1, store.getLiveSSTables().size());
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
 
         // full range scan
-        ISSTableScanner fullScanner = sstable.getScanner();
+        ISSTableScanner fullScanner = sstable.getScanner(RateLimiter.create(Double.MAX_VALUE));
         assertScanContainsRanges(fullScanner, 205, 205);
 
         // scan three ranges separately

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java
deleted file mode 100644
index 499caf7..0000000
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableSimpleWriterTest.java
+++ /dev/null

@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.io.sstable;
-
-import java.io.File;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.service.StorageService;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-import static org.apache.cassandra.utils.ByteBufferUtil.toInt;
-
-public class SSTableSimpleWriterTest
-{
-    public static final String KEYSPACE = "SSTableSimpleWriterTest";
-    public static final String CF_STANDARDINT = "StandardInteger1";
-
-    @BeforeClass
-    public static void defineSchema() throws Exception
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE, CF_STANDARDINT));
-    }
-
-    @Test
-    public void testSSTableSimpleUnsortedWriter() throws Exception
-    {
-        final int INC = 5;
-        final int NBCOL = 10;
-
-        String keyspaceName = KEYSPACE;
-        String cfname = "StandardInteger1";
-
-        Keyspace t = Keyspace.open(keyspaceName); // make sure we create the directory
-        File dir = new Directories(Schema.instance.getCFMetaData(keyspaceName, cfname)).getDirectoryForNewSSTables();
-        assert dir.exists();
-
-        IPartitioner partitioner = StorageService.getPartitioner();
-        try (SSTableSimpleUnsortedWriter writer = new SSTableSimpleUnsortedWriter(dir, partitioner, keyspaceName, cfname, IntegerType.instance, null, 16))
-        {
-
-            int k = 0;
-    
-            // Adding a few rows first
-            for (; k < 10; ++k)
-            {
-                writer.newRow(bytes("Key" + k));
-                writer.addColumn(bytes(1), bytes("v"), 0);
-                writer.addColumn(bytes(2), bytes("v"), 0);
-                writer.addColumn(bytes(3), bytes("v"), 0);
-            }
-    
-    
-            // Testing multiple opening of the same row
-            // We'll write column 0, 5, 10, .., on the first row, then 1, 6, 11, ... on the second one, etc.
-            for (int i = 0; i < INC; ++i)
-            {
-                writer.newRow(bytes("Key" + k));
-                for (int j = 0; j < NBCOL; ++j)
-                {
-                    writer.addColumn(bytes(i + INC * j), bytes("v"), 1);
-                }
-            }
-            k++;
-    
-            // Adding a few more rows
-            for (; k < 20; ++k)
-            {
-                writer.newRow(bytes("Key" + k));
-                writer.addColumn(bytes(1), bytes("v"), 0);
-                writer.addColumn(bytes(2), bytes("v"), 0);
-                writer.addColumn(bytes(3), bytes("v"), 0);
-            }
-        }
-
-        // Now add that newly created files to the column family
-        ColumnFamilyStore cfs = t.getColumnFamilyStore(cfname);
-        cfs.loadNewSSTables();
-
-        // Check we get expected results
-        ColumnFamily cf = Util.getColumnFamily(t, Util.dk("Key10"), cfname);
-        assert cf.getColumnCount() == INC * NBCOL : "expecting " + (INC * NBCOL) + " columns, got " + cf.getColumnCount();
-        int i = 0;
-        for (Cell c : cf)
-        {
-            assert toInt(c.name().toByteBuffer()) == i : "Cell name should be " + i + ", got " + toInt(c.name().toByteBuffer());
-            assert c.value().equals(bytes("v"));
-            assert c.timestamp() == 1;
-            ++i;
-        }
-
-        cf = Util.getColumnFamily(t, Util.dk("Key19"), cfname);
-        assert cf.getColumnCount() == 3 : "expecting 3 columns, got " + cf.getColumnCount();
-    }
-}

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java b/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java
index a116b84..5c7ff02 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableUtils.java

@@ -23,12 +23,13 @@
 import java.io.IOException;
 import java.util.*;
 
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.ByteBufferUtil;
 
 import org.apache.cassandra.Util;
 import static org.junit.Assert.assertEquals;
@@ -45,7 +46,7 @@
         CFNAME = cfname;
     }
 
-    /**/
+    /*
     public static ColumnFamily createCF(long mfda, int ldt, Cell... cols)
     {
         return createCF(KEYSPACENAME, CFNAME, mfda, ldt, cols);
@@ -64,6 +65,7 @@
     {
         return tempSSTableFile(keyspaceName, cfname, 0);
     }
+    */
 
     public static File tempSSTableFile(String keyspaceName, String cfname, int generation) throws IOException
     {
@@ -74,7 +76,7 @@
         File cfDir = new File(tempdir, keyspaceName + File.separator + cfname);
         cfDir.mkdirs();
         cfDir.deleteOnExit();
-        File datafile = new File(new Descriptor(cfDir, keyspaceName, cfname, generation, Descriptor.Type.FINAL).filenameFor("Data.db"));
+        File datafile = new File(new Descriptor(cfDir, keyspaceName, cfname, generation).filenameFor(Component.DATA));
         if (!datafile.createNewFile())
             throw new IOException("unable to create file " + datafile);
         datafile.deleteOnExit();
@@ -88,40 +90,29 @@
         {
             while (slhs.hasNext())
             {
-                OnDiskAtomIterator ilhs = slhs.next();
+                UnfilteredRowIterator ilhs = slhs.next();
                 assert srhs.hasNext() : "LHS contained more rows than RHS";
-                OnDiskAtomIterator irhs = srhs.next();
+                UnfilteredRowIterator irhs = srhs.next();
                 assertContentEquals(ilhs, irhs);
             }
             assert !srhs.hasNext() : "RHS contained more rows than LHS";
         }
     }
 
-    public static void assertContentEquals(OnDiskAtomIterator lhs, OnDiskAtomIterator rhs)
+    public static void assertContentEquals(UnfilteredRowIterator lhs, UnfilteredRowIterator rhs)
     {
-        assertEquals(lhs.getKey(), rhs.getKey());
-        // check metadata
-        ColumnFamily lcf = lhs.getColumnFamily();
-        ColumnFamily rcf = rhs.getColumnFamily();
-        if (lcf == null)
-        {
-            if (rcf == null)
-                return;
-            throw new AssertionError("LHS had no content for " + rhs.getKey());
-        }
-        else if (rcf == null)
-            throw new AssertionError("RHS had no content for " + lhs.getKey());
-        assertEquals(lcf.deletionInfo(), rcf.deletionInfo());
+        assertEquals(lhs.partitionKey(), rhs.partitionKey());
+        assertEquals(lhs.partitionLevelDeletion(), rhs.partitionLevelDeletion());
         // iterate columns
         while (lhs.hasNext())
         {
-            Cell clhs = (Cell)lhs.next();
-            assert rhs.hasNext() : "LHS contained more columns than RHS for " + lhs.getKey();
-            Cell crhs = (Cell)rhs.next();
+            Unfiltered clhs = lhs.next();
+            assert rhs.hasNext() : "LHS contained more columns than RHS for " + lhs.partitionKey();
+            Unfiltered crhs = rhs.next();
 
-            assertEquals("Mismatched columns for " + lhs.getKey(), clhs, crhs);
+            assertEquals("Mismatched row/tombstone for " + lhs.partitionKey(), clhs, crhs);
         }
-        assert !rhs.hasNext() : "RHS contained more columns than LHS for " + lhs.getKey();
+        assert !rhs.hasNext() : "RHS contained more columns than LHS for " + lhs.partitionKey();
     }
 
     /**
@@ -174,61 +165,74 @@
             return this;
         }
 
-        public SSTableReader write(Set<String> keys) throws IOException
+        public Collection<SSTableReader> write(Set<String> keys) throws IOException
         {
-            Map<String, ColumnFamily> map = new HashMap<String, ColumnFamily>();
+            Map<String, PartitionUpdate> map = new HashMap<>();
             for (String key : keys)
             {
-                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(ksname, cfname);
-                cf.addColumn(new BufferCell(Util.cellname(key), ByteBufferUtil.bytes(key), 0));
-                map.put(key, cf);
+                RowUpdateBuilder builder = new RowUpdateBuilder(Schema.instance.getCFMetaData(ksname, cfname), 0, key);
+                builder.clustering(key).add("val", key);
+                map.put(key, builder.buildUpdate());
             }
             return write(map);
         }
 
-        public SSTableReader write(SortedMap<DecoratedKey, ColumnFamily> sorted) throws IOException
+        public Collection<SSTableReader> write(SortedMap<DecoratedKey, PartitionUpdate> sorted) throws IOException
         {
-            final Iterator<Map.Entry<DecoratedKey, ColumnFamily>> iter = sorted.entrySet().iterator();
+            PartitionColumns.Builder builder = PartitionColumns.builder();
+            for (PartitionUpdate update : sorted.values())
+                builder.addAll(update.columns());
+            final Iterator<Map.Entry<DecoratedKey, PartitionUpdate>> iter = sorted.entrySet().iterator();
             return write(sorted.size(), new Appender()
             {
+                public SerializationHeader header()
+                {
+                    return new SerializationHeader(true, Schema.instance.getCFMetaData(ksname, cfname), builder.build(), EncodingStats.NO_STATS);
+                }
+
                 @Override
-                public boolean append(SSTableWriter writer) throws IOException
+                public boolean append(SSTableTxnWriter writer) throws IOException
                 {
                     if (!iter.hasNext())
                         return false;
-                    Map.Entry<DecoratedKey, ColumnFamily> entry = iter.next();
-                    writer.append(entry.getKey(), entry.getValue());
+                    writer.append(iter.next().getValue().unfilteredIterator());
                     return true;
                 }
             });
         }
 
-        public SSTableReader write(Map<String, ColumnFamily> entries) throws IOException
+        public Collection<SSTableReader> write(Map<String, PartitionUpdate> entries) throws IOException
         {
-            SortedMap<DecoratedKey, ColumnFamily> sorted = new TreeMap<DecoratedKey, ColumnFamily>();
-            for (Map.Entry<String, ColumnFamily> entry : entries.entrySet())
+            SortedMap<DecoratedKey, PartitionUpdate> sorted = new TreeMap<>();
+            for (Map.Entry<String, PartitionUpdate> entry : entries.entrySet())
                 sorted.put(Util.dk(entry.getKey()), entry.getValue());
 
             return write(sorted);
         }
 
-        public SSTableReader write(int expectedSize, Appender appender) throws IOException
+        public Collection<SSTableReader> write(int expectedSize, Appender appender) throws IOException
         {
             File datafile = (dest == null) ? tempSSTableFile(ksname, cfname, generation) : new File(dest.filenameFor(Component.DATA));
-            SSTableWriter writer = SSTableWriter.create(Descriptor.fromFilename(datafile.getAbsolutePath()), expectedSize, ActiveRepairService.UNREPAIRED_SSTABLE, 0);
+            CFMetaData cfm = Schema.instance.getCFMetaData(ksname, cfname);
+            ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(cfm.cfId);
+            SerializationHeader header = appender.header();
+            SSTableTxnWriter writer = SSTableTxnWriter.create(cfs, datafile.getAbsolutePath(), expectedSize, ActiveRepairService.UNREPAIRED_SSTABLE, 0, header);
             while (appender.append(writer)) { /* pass */ }
-            SSTableReader reader = writer.finish(true);
+            Collection<SSTableReader> readers = writer.finish(true);
+
             // mark all components for removal
             if (cleanup)
-                for (Component component : reader.components)
-                    new File(reader.descriptor.filenameFor(component)).deleteOnExit();
-            return reader;
+                for (SSTableReader reader: readers)
+                    for (Component component : reader.components)
+                        new File(reader.descriptor.filenameFor(component)).deleteOnExit();
+            return readers;
         }
     }
 
     public static abstract class Appender
     {
+        public abstract SerializationHeader header();
         /** Called with an open writer until it returns false. */
-        public abstract boolean append(SSTableWriter writer) throws IOException;
+        public abstract boolean append(SSTableTxnWriter writer) throws IOException;
     }
 }

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java
new file mode 100644
index 0000000..e714c60
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java

@@ -0,0 +1,248 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.*;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static junit.framework.Assert.fail;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class SSTableWriterTest extends SSTableWriterTestBase
+{
+    @Test
+    public void testAbortTxnWithOpenEarlyShouldRemoveSSTable() throws InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
+        truncate(cfs);
+
+        File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.WRITE);
+        try (SSTableWriter writer = getWriter(cfs, dir, txn))
+        {
+            for (int i = 0; i < 10000; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, random(i, 10)).withTimestamp(1);
+                for (int j = 0; j < 100; j++)
+                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                writer.append(builder.build().unfilteredIterator());
+            }
+
+            SSTableReader s = writer.setMaxDataAge(1000).openEarly();
+            assert s != null;
+            assertFileCounts(dir.list());
+            for (int i = 10000; i < 20000; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, random(i, 10)).withTimestamp(1);
+                for (int j = 0; j < 100; j++)
+                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                writer.append(builder.build().unfilteredIterator());
+            }
+            SSTableReader s2 = writer.setMaxDataAge(1000).openEarly();
+            assertTrue(s.last.compareTo(s2.last) < 0);
+            assertFileCounts(dir.list());
+            s.selfRef().release();
+            s2.selfRef().release();
+
+            int datafiles = assertFileCounts(dir.list());
+            assertEquals(datafiles, 1);
+
+            // These checks don't work on Windows because the writer has the channel still
+            // open till .abort() is called (via the builder)
+            if (!FBUtilities.isWindows())
+            {
+                LifecycleTransaction.waitForDeletions();
+                assertFileCounts(dir.list());
+            }
+            writer.abort();
+            txn.abort();
+            LifecycleTransaction.waitForDeletions();
+            datafiles = assertFileCounts(dir.list());
+            assertEquals(datafiles, 0);
+            validateCFS(cfs);
+        }
+    }
+
+
+    @Test
+    public void testAbortTxnWithClosedWriterShouldRemoveSSTable() throws InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
+        truncate(cfs);
+
+        File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.STREAM);
+        try (SSTableWriter writer = getWriter(cfs, dir, txn))
+        {
+            for (int i = 0; i < 10000; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, random(i, 10)).withTimestamp(1);
+                for (int j = 0; j < 100; j++)
+                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                writer.append(builder.build().unfilteredIterator());
+            }
+
+            assertFileCounts(dir.list());
+            for (int i = 10000; i < 20000; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, random(i, 10)).withTimestamp(1);
+                for (int j = 0; j < 100; j++)
+                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                writer.append(builder.build().unfilteredIterator());
+            }
+            SSTableReader sstable = writer.finish(true);
+            int datafiles = assertFileCounts(dir.list());
+            assertEquals(datafiles, 1);
+
+            sstable.selfRef().release();
+            // These checks don't work on Windows because the writer has the channel still
+            // open till .abort() is called (via the builder)
+            if (!FBUtilities.isWindows())
+            {
+                LifecycleTransaction.waitForDeletions();
+                assertFileCounts(dir.list());
+            }
+
+            txn.abort();
+            LifecycleTransaction.waitForDeletions();
+            datafiles = assertFileCounts(dir.list());
+            assertEquals(datafiles, 0);
+            validateCFS(cfs);
+        }
+    }
+
+    @Test
+    public void testAbortTxnWithClosedAndOpenWriterShouldRemoveAllSSTables() throws InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
+        truncate(cfs);
+
+        File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.STREAM);
+
+        SSTableWriter writer1 = getWriter(cfs, dir, txn);
+        SSTableWriter writer2 = getWriter(cfs, dir, txn);
+        try
+        {
+            for (int i = 0; i < 10000; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, random(i, 10)).withTimestamp(1);
+                for (int j = 0; j < 100; j++)
+                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                writer1.append(builder.build().unfilteredIterator());
+            }
+
+            assertFileCounts(dir.list());
+            for (int i = 10000; i < 20000; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata, random(i, 10)).withTimestamp(1);
+                for (int j = 0; j < 100; j++)
+                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                writer2.append(builder.build().unfilteredIterator());
+            }
+            SSTableReader sstable = writer1.finish(true);
+            txn.update(sstable, false);
+
+            assertFileCounts(dir.list());
+
+            int datafiles = assertFileCounts(dir.list());
+            assertEquals(datafiles, 2);
+
+            // These checks don't work on Windows because the writer has the channel still
+            // open till .abort() is called (via the builder)
+            if (!FBUtilities.isWindows())
+            {
+                LifecycleTransaction.waitForDeletions();
+                assertFileCounts(dir.list());
+            }
+            txn.abort();
+            LifecycleTransaction.waitForDeletions();
+            datafiles = assertFileCounts(dir.list());
+            assertEquals(datafiles, 0);
+            validateCFS(cfs);
+        }
+        finally
+        {
+            writer1.close();
+            writer2.close();
+        }
+    }
+
+    @Test
+    public void testValueTooBigCorruption() throws InterruptedException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_SMALL_MAX_VALUE);
+        truncate(cfs);
+
+        File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.STREAM);
+
+        try (SSTableWriter writer1 = getWriter(cfs, dir, txn))
+        {
+            UpdateBuilder largeValue = UpdateBuilder.create(cfs.metadata, "large_value").withTimestamp(1);
+            largeValue.newRow("clustering").add("val", ByteBuffer.allocate(2 * 1024 * 1024));
+            writer1.append(largeValue.build().unfilteredIterator());
+
+            SSTableReader sstable = writer1.finish(true);
+
+            txn.update(sstable, false);
+
+            try
+            {
+                DecoratedKey dk = Util.dk("large_value");
+                UnfilteredRowIterator rowIter = sstable.iterator(dk,
+                                                                 ColumnFilter.all(cfs.metadata),
+                                                                 false,
+                                                                 false,
+                                                                 SSTableReadsListener.NOOP_LISTENER);
+                while (rowIter.hasNext())
+                {
+                    rowIter.next();
+                    // no-op read, as values may not appear expected
+                }
+                fail("Expected a CorruptSSTableException to be thrown");
+            }
+            catch (CorruptSSTableException e)
+            {
+            }
+
+            txn.abort();
+            LifecycleTransaction.waitForDeletions();
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java
new file mode 100644
index 0000000..26b1134
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java

@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.util.concurrent.Uninterruptibles;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class SSTableWriterTestBase extends SchemaLoader
+{
+
+    protected static final String KEYSPACE = "SSTableRewriterTest";
+    protected static final String CF = "Standard1";
+    protected static final String CF_SMALL_MAX_VALUE = "Standard_SmallMaxValue";
+
+    private static Config.DiskAccessMode standardMode;
+    private static Config.DiskAccessMode indexMode;
+
+    private static int maxValueSize;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        if (FBUtilities.isWindows())
+        {
+            standardMode = DatabaseDescriptor.getDiskAccessMode();
+            indexMode = DatabaseDescriptor.getIndexAccessMode();
+
+            DatabaseDescriptor.setDiskAccessMode(Config.DiskAccessMode.standard);
+            DatabaseDescriptor.setIndexAccessMode(Config.DiskAccessMode.standard);
+        }
+
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF_SMALL_MAX_VALUE));
+
+        maxValueSize = DatabaseDescriptor.getMaxValueSize();
+        DatabaseDescriptor.setMaxValueSize(1024 * 1024); // set max value size to 1MB
+    }
+
+    @AfterClass
+    public static void revertConfiguration()
+    {
+        DatabaseDescriptor.setMaxValueSize(maxValueSize);
+        DatabaseDescriptor.setDiskAccessMode(standardMode);
+        DatabaseDescriptor.setIndexAccessMode(indexMode);
+    }
+
+    @After
+    public void truncateCF()
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF);
+        store.truncateBlocking();
+        LifecycleTransaction.waitForDeletions();
+    }
+
+    public static void truncate(ColumnFamilyStore cfs)
+    {
+        cfs.truncateBlocking();
+        LifecycleTransaction.waitForDeletions();
+        Uninterruptibles.sleepUninterruptibly(10L, TimeUnit.MILLISECONDS);
+        assertEquals(0, cfs.metric.liveDiskSpaceUsed.getCount());
+        assertEquals(0, cfs.metric.totalDiskSpaceUsed.getCount());
+        validateCFS(cfs);
+    }
+
+    /**
+     * Validate the column family store by checking that all live
+     * sstables are referenced only once and are not marked as
+     * compacting. It also checks that the generation of the data
+     * files on disk is the same as that of the live sstables,
+     * to ensure that the data files on disk belong to the live
+     * sstables. Finally, it checks that the metrics contain the
+     * correct disk space used, live and total.
+     *
+     * Note that this method will submit a maximal compaction task
+     * if there are live sstables, in order to check that there is at least
+     * a maximal task when there are live sstables.
+     *
+     * This method has therefore side effects and should be called after
+     * performing any other checks on previous operations, especially
+     * checks involving files on disk.
+     *
+     * @param cfs - the column family store to validate
+     */
+    public static void validateCFS(ColumnFamilyStore cfs)
+    {
+        Set<Integer> liveDescriptors = new HashSet<>();
+        long spaceUsed = 0;
+        for (SSTableReader sstable : cfs.getLiveSSTables())
+        {
+            assertFalse(sstable.isMarkedCompacted());
+            assertEquals(1, sstable.selfRef().globalCount());
+            liveDescriptors.add(sstable.descriptor.generation);
+            spaceUsed += sstable.bytesOnDisk();
+        }
+        for (File dir : cfs.getDirectories().getCFDirectories())
+        {
+            for (File f : dir.listFiles())
+            {
+                if (f.getName().contains("Data"))
+                {
+                    Descriptor d = Descriptor.fromFilename(f.getAbsolutePath());
+                    assertTrue(d.toString(), liveDescriptors.contains(d.generation));
+                }
+            }
+        }
+        assertEquals(spaceUsed, cfs.metric.liveDiskSpaceUsed.getCount());
+        assertEquals(spaceUsed, cfs.metric.totalDiskSpaceUsed.getCount());
+        assertTrue(cfs.getTracker().getCompacting().isEmpty());
+
+        if(cfs.getLiveSSTables().size() > 0)
+            assertFalse(CompactionManager.instance.submitMaximal(cfs, cfs.gcBefore((int) (System.currentTimeMillis() / 1000)), false).isEmpty());
+    }
+
+    public static SSTableWriter getWriter(ColumnFamilyStore cfs, File directory, LifecycleTransaction txn)
+    {
+        String filename = cfs.getSSTablePath(directory);
+        return SSTableWriter.create(filename, 0, 0, new SerializationHeader(true, cfs.metadata, cfs.metadata.partitionColumns(), EncodingStats.NO_STATS), txn);
+    }
+
+    public static ByteBuffer random(int i, int size)
+    {
+        byte[] bytes = new byte[size + 4];
+        ThreadLocalRandom.current().nextBytes(bytes);
+        ByteBuffer r = ByteBuffer.wrap(bytes);
+        r.putInt(0, i);
+        return r;
+    }
+
+    public static int assertFileCounts(String [] files)
+    {
+        int tmplinkcount = 0;
+        int tmpcount = 0;
+        int datacount = 0;
+        for (String f : files)
+        {
+            if (f.endsWith("-CRC.db"))
+                continue;
+            if (f.contains("tmplink-"))
+                tmplinkcount++;
+            else if (f.contains("tmp-"))
+                tmpcount++;
+            else if (f.contains("Data"))
+                datacount++;
+        }
+        assertEquals(0, tmplinkcount);
+        assertEquals(0, tmpcount);
+        return datacount;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/sstable/format/ClientModeSSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/format/ClientModeSSTableTest.java
new file mode 100644
index 0000000..48a8af5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/format/ClientModeSSTableTest.java

@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+
+import java.io.File;
+import java.nio.ByteBuffer;
+
+import com.google.common.util.concurrent.Runnables;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.rows.SliceableUnfilteredRowIterator;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.Version;
+
+/**
+ * Tests backwards compatibility for SSTables
+ */
+public class ClientModeSSTableTest
+{
+    public static final String LEGACY_SSTABLE_PROP = "legacy-sstable-root";
+    public static final String KSNAME = "Keyspace1";
+    public static final String CFNAME = "Standard1";
+
+    public static File LEGACY_SSTABLE_ROOT;
+
+    static CFMetaData metadata;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        Config.setClientMode(true);
+
+        metadata = CFMetaData.Builder.createDense(KSNAME, CFNAME, false, false)
+                                                .addPartitionKey("key", BytesType.instance)
+                                                .addClusteringColumn("column", BytesType.instance)
+                                                .addRegularColumn("value", BytesType.instance)
+                                                .withPartitioner(ByteOrderedPartitioner.instance)
+                                                .build();
+
+        String scp = System.getProperty(LEGACY_SSTABLE_PROP);
+        assert scp != null;
+        LEGACY_SSTABLE_ROOT = new File(scp).getAbsoluteFile();
+        assert LEGACY_SSTABLE_ROOT.isDirectory();
+    }
+
+    /**
+     * Get a descriptor for the legacy sstable at the given version.
+     */
+    protected Descriptor getDescriptor(String ver)
+    {
+        File directory = new File(LEGACY_SSTABLE_ROOT + File.separator + ver + File.separator + KSNAME);
+        return new Descriptor(ver, directory, KSNAME, CFNAME, 0, SSTableFormat.Type.LEGACY);
+    }
+
+    @Test
+    public void testVersions() throws Throwable
+    {
+        boolean notSkipped = false;
+
+        for (File version : LEGACY_SSTABLE_ROOT.listFiles())
+        {
+            if (!new File(LEGACY_SSTABLE_ROOT + File.separator + version.getName() + File.separator + KSNAME).isDirectory())
+                continue;
+            if (Version.validate(version.getName()) && SSTableFormat.Type.LEGACY.info.getVersion(version.getName()).isCompatible())
+            {
+                notSkipped = true;
+                testVersion(version.getName());
+            }
+        }
+
+        assert notSkipped;
+    }
+
+    public void testVersion(String version) throws Throwable
+    {
+        SSTableReader reader = null;
+        try
+        {
+            reader = SSTableReader.openNoValidation(getDescriptor(version), metadata);
+
+            ByteBuffer key = bytes(Integer.toString(100));
+
+            try (SliceableUnfilteredRowIterator iter = reader.iterator(metadata.decorateKey(key),
+                                                                       ColumnFilter.selection(metadata.partitionColumns()),
+                                                                       false,
+                                                                       false,
+                                                                       SSTableReadsListener.NOOP_LISTENER))
+            {
+                assert iter.next().clustering().get(0).equals(key);
+            }
+        }
+        catch (Throwable e)
+        {
+            System.err.println("Failed to read " + version);
+            throw e;
+        }
+        finally
+        {
+            if (reader != null)
+            {
+                int globalTidyCount = SSTableReader.GlobalTidy.lookup.size();
+                reader.selfRef().release();
+                assert reader.selfRef().globalCount() == 0;
+
+                // await clean-up to complete if started.
+                ScheduledExecutors.nonPeriodicTasks.submit(Runnables.doNothing()).get();
+                // Ensure clean-up completed.
+                assert SSTableReader.GlobalTidy.lookup.size() < globalTidyCount;
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/sstable/format/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/format/SSTableReaderTest.java
deleted file mode 100644
index 6d07f1c..0000000
--- a/test/unit/org/apache/cassandra/io/sstable/format/SSTableReaderTest.java
+++ /dev/null

@@ -1,648 +0,0 @@
-package org.apache.cassandra.io.sstable.format;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Future;
-import java.util.concurrent.ScheduledThreadPoolExecutor;
-import java.util.concurrent.ThreadPoolExecutor;
-
-import com.google.common.collect.Sets;
-import org.apache.cassandra.cache.CachingOptions;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import org.apache.cassandra.OrderedJUnit4ClassRunner;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.Operator;
-import org.apache.cassandra.db.BufferDecoratedKey;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.IndexExpression;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
-import org.apache.cassandra.db.Row;
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.composites.Composites;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.dht.LocalPartitioner;
-import org.apache.cassandra.dht.LocalPartitioner.LocalToken;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.MmappedSegmentedFile;
-import org.apache.cassandra.io.util.SegmentedFile;
-import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
-import static org.apache.cassandra.Util.cellname;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-@RunWith(OrderedJUnit4ClassRunner.class)
-public class SSTableReaderTest
-{
-    public static final String KEYSPACE1 = "SSTableReaderTest";
-    public static final String CF_STANDARD = "Standard1";
-    public static final String CF_STANDARD2 = "Standard2";
-    public static final String CF_INDEXED = "Indexed1";
-    public static final String CF_STANDARDLOWINDEXINTERVAL = "StandardLowIndexInterval";
-
-    static Token t(int i)
-    {
-        return StorageService.getPartitioner().getToken(ByteBufferUtil.bytes(String.valueOf(i)));
-    }
-
-    @BeforeClass
-    public static void defineSchema() throws Exception
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
-                                    SchemaLoader.indexCFMD(KEYSPACE1, CF_INDEXED, true),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL)
-                                                .minIndexInterval(8)
-                                                .maxIndexInterval(256)
-                                                .caching(CachingOptions.NONE));
-    }
-
-    @Test
-    public void testGetPositionsForRanges()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
-
-        // insert data and compact to a single sstable
-        CompactionManager.instance.disableAutoCompaction();
-        for (int j = 0; j < 10; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        List<Range<Token>> ranges = new ArrayList<Range<Token>>();
-        // 1 key
-        ranges.add(new Range<Token>(t(0), t(1)));
-        // 2 keys
-        ranges.add(new Range<Token>(t(2), t(4)));
-        // wrapping range from key to end
-        ranges.add(new Range<Token>(t(6), StorageService.getPartitioner().getMinimumToken()));
-        // empty range (should be ignored)
-        ranges.add(new Range<Token>(t(9), t(91)));
-
-        // confirm that positions increase continuously
-        SSTableReader sstable = store.getSSTables().iterator().next();
-        long previous = -1;
-        for (Pair<Long,Long> section : sstable.getPositionsForRanges(ranges))
-        {
-            assert previous <= section.left : previous + " ! < " + section.left;
-            assert section.left < section.right : section.left + " ! < " + section.right;
-            previous = section.right;
-        }
-    }
-
-    @Test
-    public void testSpannedIndexPositions() throws IOException
-    {
-        long originalMaxSegmentSize = MmappedSegmentedFile.MAX_SEGMENT_SIZE;
-        MmappedSegmentedFile.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
-
-        try
-        {
-            Keyspace keyspace = Keyspace.open(KEYSPACE1);
-            ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-
-            // insert a bunch of data and compact to a single sstable
-            CompactionManager.instance.disableAutoCompaction();
-            for (int j = 0; j < 100; j += 2)
-            {
-                ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-                Mutation rm = new Mutation(KEYSPACE1, key);
-                rm.add("Standard1", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-                rm.applyUnsafe();
-            }
-            store.forceBlockingFlush();
-            CompactionManager.instance.performMaximal(store, false);
-
-            // check that all our keys are found correctly
-            SSTableReader sstable = store.getSSTables().iterator().next();
-            for (int j = 0; j < 100; j += 2)
-            {
-                DecoratedKey dk = Util.dk(String.valueOf(j));
-                FileDataInput file = sstable.getFileDataInput(sstable.getPosition(dk, SSTableReader.Operator.EQ).position);
-                DecoratedKey keyInDisk = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(file));
-                assert keyInDisk.equals(dk) : String.format("%s != %s in %s", keyInDisk, dk, file.getPath());
-            }
-
-            // check no false positives
-            for (int j = 1; j < 110; j += 2)
-            {
-                DecoratedKey dk = Util.dk(String.valueOf(j));
-                assert sstable.getPosition(dk, SSTableReader.Operator.EQ) == null;
-            }
-        }
-        finally
-        {
-            MmappedSegmentedFile.MAX_SEGMENT_SIZE = originalMaxSegmentSize;
-        }
-    }
-
-    @Test
-    public void testPersistentStatistics()
-    {
-
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-
-        for (int j = 0; j < 100; j += 2)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("Standard1", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-
-        clearAndLoad(store);
-        assert store.metric.maxRowSize.getValue() != 0;
-    }
-
-    private void clearAndLoad(ColumnFamilyStore cfs)
-    {
-        cfs.clearUnsafe();
-        cfs.loadNewSSTables();
-    }
-
-    @Test
-    public void testReadRateTracking()
-    {
-        // try to make sure CASSANDRA-8239 never happens again
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-
-        for (int j = 0; j < 10; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("Standard1", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.apply();
-        }
-        store.forceBlockingFlush();
-
-        SSTableReader sstable = store.getSSTables().iterator().next();
-        assertEquals(0, sstable.getReadMeter().count());
-
-        DecoratedKey key = sstable.partitioner.decorateKey(ByteBufferUtil.bytes("4"));
-        store.getColumnFamily(key, Composites.EMPTY, Composites.EMPTY, false, 100, 100);
-        assertEquals(1, sstable.getReadMeter().count());
-        store.getColumnFamily(key, cellname("0"), cellname("0"), false, 100, 100);
-        assertEquals(2, sstable.getReadMeter().count());
-        store.getColumnFamily(Util.namesQueryFilter(store, key, cellname("0")));
-        assertEquals(3, sstable.getReadMeter().count());
-    }
-
-    @Test
-    public void testGetPositionsForRangesWithKeyCache()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
-        CacheService.instance.keyCache.setCapacity(100);
-
-        // insert data and compact to a single sstable
-        CompactionManager.instance.disableAutoCompaction();
-        for (int j = 0; j < 10; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        SSTableReader sstable = store.getSSTables().iterator().next();
-        long p2 = sstable.getPosition(k(2), SSTableReader.Operator.EQ).position;
-        long p3 = sstable.getPosition(k(3), SSTableReader.Operator.EQ).position;
-        long p6 = sstable.getPosition(k(6), SSTableReader.Operator.EQ).position;
-        long p7 = sstable.getPosition(k(7), SSTableReader.Operator.EQ).position;
-
-        Pair<Long, Long> p = sstable.getPositionsForRanges(makeRanges(t(2), t(6))).get(0);
-
-        // range are start exclusive so we should start at 3
-        assert p.left == p3;
-
-        // to capture 6 we have to stop at the start of 7
-        assert p.right == p7;
-    }
-
-    @Test
-    public void testPersistentStatisticsWithSecondaryIndex()
-    {
-        // Create secondary index and flush to disk
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Indexed1");
-        ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k1"));
-        Mutation rm = new Mutation(KEYSPACE1, key);
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), System.currentTimeMillis());
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        // check if opening and querying works
-        assertIndexQueryWorks(store);
-    }
-    public void testGetPositionsKeyCacheStats()
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
-        CacheService.instance.keyCache.setCapacity(1000);
-
-        // insert data and compact to a single sstable
-        CompactionManager.instance.disableAutoCompaction();
-        for (int j = 0; j < 10; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation("Keyspace1", key);
-            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.apply();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        SSTableReader sstable = store.getSSTables().iterator().next();
-        sstable.getPosition(k(2), SSTableReader.Operator.EQ);
-        assertEquals(0, sstable.getKeyCacheHit());
-        assertEquals(1, sstable.getBloomFilterTruePositiveCount());
-        sstable.getPosition(k(2), SSTableReader.Operator.EQ);
-        assertEquals(1, sstable.getKeyCacheHit());
-        assertEquals(2, sstable.getBloomFilterTruePositiveCount());
-        sstable.getPosition(k(15), SSTableReader.Operator.EQ);
-        assertEquals(1, sstable.getKeyCacheHit());
-        assertEquals(2, sstable.getBloomFilterTruePositiveCount());
-
-    }
-
-
-    @Test
-    public void testOpeningSSTable() throws Exception
-    {
-        String ks = KEYSPACE1;
-        String cf = "Standard1";
-
-        // clear and create just one sstable for this test
-        Keyspace keyspace = Keyspace.open(ks);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore(cf);
-        store.clearUnsafe();
-        store.disableAutoCompaction();
-
-        DecoratedKey firstKey = null, lastKey = null;
-        long timestamp = System.currentTimeMillis();
-        for (int i = 0; i < store.metadata.getMinIndexInterval(); i++)
-        {
-            DecoratedKey key = Util.dk(String.valueOf(i));
-            if (firstKey == null)
-                firstKey = key;
-            if (lastKey == null)
-                lastKey = key;
-            if (store.metadata.getKeyValidator().compare(lastKey.getKey(), key.getKey()) < 0)
-                lastKey = key;
-            Mutation rm = new Mutation(ks, key.getKey());
-            rm.add(cf, cellname("col"),
-                   ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-
-        SSTableReader sstable = store.getSSTables().iterator().next();
-        Descriptor desc = sstable.descriptor;
-
-        // test to see if sstable can be opened as expected
-        SSTableReader target = SSTableReader.open(desc);
-        Assert.assertEquals(target.getIndexSummarySize(), 1);
-        Assert.assertArrayEquals(ByteBufferUtil.getArray(firstKey.getKey()), target.getIndexSummaryKey(0));
-        assert target.first.equals(firstKey);
-        assert target.last.equals(lastKey);
-        target.selfRef().release();
-    }
-
-    @Test
-    public void testLoadingSummaryUsesCorrectPartitioner() throws Exception
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Indexed1");
-        ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k1"));
-        Mutation rm = new Mutation(KEYSPACE1, key);
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), System.currentTimeMillis());
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-
-        ColumnFamilyStore indexCfs = store.indexManager.getIndexForColumn(ByteBufferUtil.bytes("birthdate")).getIndexCfs();
-        assert indexCfs.partitioner instanceof LocalPartitioner;
-        SSTableReader sstable = indexCfs.getSSTables().iterator().next();
-        assert sstable.first.getToken() instanceof LocalToken;
-
-        try(SegmentedFile.Builder ibuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode(), false);
-            SegmentedFile.Builder dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode(), sstable.compression))
-        {
-            sstable.saveSummary(ibuilder, dbuilder);
-        }
-        SSTableReader reopened = SSTableReader.open(sstable.descriptor);
-        assert reopened.first.getToken() instanceof LocalToken;
-        reopened.selfRef().release();
-    }
-
-    /** see CASSANDRA-5407 */
-    @Test
-    public void testGetScannerForNoIntersectingRanges() throws Exception
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
-        ByteBuffer key = ByteBufferUtil.bytes(String.valueOf("k1"));
-        Mutation rm = new Mutation(KEYSPACE1, key);
-        rm.add("Standard1", cellname("xyz"), ByteBufferUtil.bytes("abc"), 0);
-        rm.applyUnsafe();
-        store.forceBlockingFlush();
-        boolean foundScanner = false;
-        for (SSTableReader s : store.getSSTables())
-        {
-            try (ISSTableScanner scanner = s.getScanner(new Range<Token>(t(0), t(1)), null))
-            {
-                scanner.next(); // throws exception pre 5407
-                foundScanner = true;
-            }
-        }
-        assertTrue(foundScanner);
-    }
-
-    @Test
-    public void testGetPositionsForRangesFromTableOpenedForBulkLoading() throws IOException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
-
-        // insert data and compact to a single sstable. The
-        // number of keys inserted is greater than index_interval
-        // to ensure multiple segments in the index file
-        CompactionManager.instance.disableAutoCompaction();
-        for (int j = 0; j < 130; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.valueOf(j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("Standard2", cellname("0"), ByteBufferUtil.EMPTY_BYTE_BUFFER, j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        // construct a range which is present in the sstable, but whose
-        // keys are not found in the first segment of the index.
-        List<Range<Token>> ranges = new ArrayList<Range<Token>>();
-        ranges.add(new Range<Token>(t(98), t(99)));
-
-        SSTableReader sstable = store.getSSTables().iterator().next();
-        List<Pair<Long,Long>> sections = sstable.getPositionsForRanges(ranges);
-        assert sections.size() == 1 : "Expected to find range in sstable" ;
-
-        // re-open the same sstable as it would be during bulk loading
-        Set<Component> components = Sets.newHashSet(Component.DATA, Component.PRIMARY_INDEX);
-        if (sstable.compression)
-            components.add(Component.COMPRESSION_INFO);
-        SSTableReader bulkLoaded = SSTableReader.openForBatch(sstable.descriptor, components, store.metadata, sstable.partitioner);
-        sections = bulkLoaded.getPositionsForRanges(ranges);
-        assert sections.size() == 1 : "Expected to find range in sstable opened for bulk loading";
-        bulkLoaded.selfRef().release();
-    }
-
-    @Test
-    public void testIndexSummaryReplacement() throws IOException, ExecutionException, InterruptedException
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("StandardLowIndexInterval"); // index interval of 8, no key caching
-        CompactionManager.instance.disableAutoCompaction();
-
-        final int NUM_ROWS = 512;
-        for (int j = 0; j < NUM_ROWS; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("StandardLowIndexInterval", Util.cellname("0"), ByteBufferUtil.bytes(String.format("%3d", j)), j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        Collection<SSTableReader> sstables = store.getSSTables();
-        assert sstables.size() == 1;
-        final SSTableReader sstable = sstables.iterator().next();
-
-        ThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(5);
-        List<Future> futures = new ArrayList<>(NUM_ROWS * 2);
-        for (int i = 0; i < NUM_ROWS; i++)
-        {
-            final ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", i));
-            final int index = i;
-
-            futures.add(executor.submit(new Runnable()
-            {
-                public void run()
-                {
-                    ColumnFamily result = store.getColumnFamily(sstable.partitioner.decorateKey(key), Composites.EMPTY, Composites.EMPTY, false, 100, 100);
-                    assertFalse(result.isEmpty());
-                    assertEquals(0, ByteBufferUtil.compare(String.format("%3d", index).getBytes(), result.getColumn(Util.cellname("0")).value()));
-                }
-            }));
-
-            futures.add(executor.submit(new Runnable()
-            {
-                public void run()
-                {
-                    Iterable<DecoratedKey> results = store.keySamples(
-                            new Range<>(sstable.partitioner.getMinimumToken(), sstable.partitioner.getToken(key)));
-                    assertTrue(results.iterator().hasNext());
-                }
-            }));
-        }
-
-        SSTableReader replacement;
-        try (LifecycleTransaction txn = store.getTracker().tryModify(Arrays.asList(sstable), OperationType.UNKNOWN))
-        {
-            replacement = sstable.cloneWithNewSummarySamplingLevel(store, 1);
-            txn.update(replacement, true);
-            txn.finish();
-        }
-        for (Future future : futures)
-            future.get();
-
-        assertEquals(sstable.estimatedKeys(), replacement.estimatedKeys(), 1);
-    }
-
-    @Test
-    public void testIndexSummaryUpsampleAndReload() throws Exception
-    {
-        long originalMaxSegmentSize = MmappedSegmentedFile.MAX_SEGMENT_SIZE;
-        MmappedSegmentedFile.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
-
-        try
-        {
-            testIndexSummaryUpsampleAndReload0();
-        }
-        finally
-        {
-            MmappedSegmentedFile.MAX_SEGMENT_SIZE = originalMaxSegmentSize;
-        }
-    }
-
-    private void testIndexSummaryUpsampleAndReload0() throws Exception
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("StandardLowIndexInterval"); // index interval of 8, no key caching
-        CompactionManager.instance.disableAutoCompaction();
-
-        final int NUM_ROWS = 512;
-        for (int j = 0; j < NUM_ROWS; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("StandardLowIndexInterval", Util.cellname("0"), ByteBufferUtil.bytes(String.format("%3d", j)), j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        Collection<SSTableReader> sstables = store.getSSTables();
-        assert sstables.size() == 1;
-        final SSTableReader sstable = sstables.iterator().next();
-
-        try (LifecycleTransaction txn = store.getTracker().tryModify(Arrays.asList(sstable), OperationType.UNKNOWN))
-        {
-            SSTableReader replacement = sstable.cloneWithNewSummarySamplingLevel(store, sstable.getIndexSummarySamplingLevel() + 1);
-            txn.update(replacement, true);
-            txn.finish();
-        }
-        SSTableReader reopen = SSTableReader.open(sstable.descriptor);
-        assert reopen.getIndexSummarySamplingLevel() == sstable.getIndexSummarySamplingLevel() + 1;
-    }
-
-    @Test
-    public void testIndexSummaryDownsampleAndReload() throws Exception
-    {
-        long originalMaxSegmentSize = MmappedSegmentedFile.MAX_SEGMENT_SIZE;
-        MmappedSegmentedFile.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
-
-        try
-        {
-            testIndexSummaryDownsampleAndReload0();
-        }
-        finally
-        {
-            MmappedSegmentedFile.MAX_SEGMENT_SIZE = originalMaxSegmentSize;
-        }
-    }
-
-    private void testIndexSummaryDownsampleAndReload0() throws Exception
-    {
-        Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore store = keyspace.getColumnFamilyStore("StandardLowIndexInterval"); // index interval of 8, no key caching
-        CompactionManager.instance.disableAutoCompaction();
-
-        final int NUM_ROWS = 512;
-        for (int j = 0; j < NUM_ROWS; j++)
-        {
-            ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", j));
-            Mutation rm = new Mutation(KEYSPACE1, key);
-            rm.add("StandardLowIndexInterval", Util.cellname("0"), ByteBufferUtil.bytes(String.format("%3d", j)), j);
-            rm.applyUnsafe();
-        }
-        store.forceBlockingFlush();
-        CompactionManager.instance.performMaximal(store, false);
-
-        Collection<SSTableReader> sstables = store.getSSTables();
-        assert sstables.size() == 1;
-        final SSTableReader sstable = sstables.iterator().next();
-
-        try (LifecycleTransaction txn = store.getTracker().tryModify(Arrays.asList(sstable), OperationType.UNKNOWN))
-        {
-            SSTableReader replacement = sstable.cloneWithNewSummarySamplingLevel(store, sstable.getIndexSummarySamplingLevel() / 2);
-            txn.update(replacement, true);
-            txn.finish();
-        }
-        SSTableReader reopen = SSTableReader.open(sstable.descriptor);
-        assert Arrays.equals(sstable.ifile.copyReadableBounds(), reopen.ifile.copyReadableBounds());
-        assert Arrays.equals(sstable.dfile.copyReadableBounds(), reopen.dfile.copyReadableBounds());
-    }
-
-
-    private void assertIndexQueryWorks(ColumnFamilyStore indexedCFS)
-    {
-        assert "Indexed1".equals(indexedCFS.name);
-
-        // make sure all sstables including 2ary indexes load from disk
-        for (ColumnFamilyStore cfs : indexedCFS.concatWithIndexes())
-            clearAndLoad(cfs);
-
-        // query using index to see if sstable for secondary index opens
-        IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"), Operator.EQ, ByteBufferUtil.bytes(1L));
-        List<IndexExpression> clause = Arrays.asList(expr);
-        Range<RowPosition> range = Util.range("", "");
-        List<Row> rows = indexedCFS.search(range, clause, new IdentityQueryFilter(), 100);
-        assert rows.size() == 1;
-    }
-
-    private List<Range<Token>> makeRanges(Token left, Token right)
-    {
-        return Arrays.asList(new Range<>(left, right));
-    }
-
-    private DecoratedKey k(int i)
-    {
-        return new BufferDecoratedKey(t(i), ByteBufferUtil.bytes(String.valueOf(i)));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
index 19fa7c4..de12d57 100644
--- a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java

@@ -15,33 +15,33 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.io.sstable.metadata;
 
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.EnumSet;
 import java.util.Map;
-import java.util.Set;
-
-import com.google.common.collect.Sets;
 
 import org.junit.Test;
 
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.commitlog.IntervalSet;
 import org.apache.cassandra.db.commitlog.ReplayPosition;
-import org.apache.cassandra.db.composites.SimpleDenseCellNameType;
-import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
-import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.RandomAccessReader;
-import org.apache.cassandra.utils.EstimatedHistogram;
 
 import static org.junit.Assert.assertEquals;
 
@@ -55,7 +55,7 @@
         MetadataSerializer serializer = new MetadataSerializer();
         File statsFile = serialize(originalMetadata, serializer, BigFormat.latestVersion);
 
-        Descriptor desc = new Descriptor( statsFile.getParentFile(), "", "", 0, Descriptor.Type.FINAL);
+        Descriptor desc = new Descriptor( statsFile.getParentFile(), "", "", 0);
         try (RandomAccessReader in = RandomAccessReader.open(statsFile))
         {
             Map<MetadataType, MetadataComponent> deserialized = serializer.deserialize(desc, in, EnumSet.allOf(MetadataType.class));
@@ -74,52 +74,60 @@
         File statsFile = File.createTempFile(Component.STATS.name, null);
         try (DataOutputStreamPlus out = new BufferedDataOutputStreamPlus(new FileOutputStream(statsFile)))
         {
-            serializer.serialize(metadata, version, out);
+            serializer.serialize(metadata, out, version);
         }
         return statsFile;
     }
 
     public Map<MetadataType, MetadataComponent> constructMetadata()
     {
-        EstimatedHistogram rowSizes = new EstimatedHistogram(new long[] { 1L, 2L },
-                                                             new long[] { 3L, 4L, 5L });
-        EstimatedHistogram columnCounts = new EstimatedHistogram(new long[] { 6L, 7L },
-                                                                 new long[] { 8L, 9L, 10L });
-        ReplayPosition start = new ReplayPosition(11L, 12);
-        ReplayPosition end = new ReplayPosition(15L, 9);
-        long minTimestamp = 2162517136L;
-        long maxTimestamp = 4162517136L;
+        ReplayPosition club = new ReplayPosition(11L, 12);
+        ReplayPosition cllb = new ReplayPosition(9L, 12);
 
-        MetadataCollector collector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance))
-                                                      .estimatedRowSize(rowSizes)
-                                                      .estimatedColumnCount(columnCounts)
-                                                      .commitLogLowerBound(start)
-                                                      .commitLogUpperBound(end);
-        collector.updateMinTimestamp(minTimestamp);
-        collector.updateMaxTimestamp(maxTimestamp);
-
-        Set<Integer> ancestors = Sets.newHashSet(1, 2, 3, 4);
-        for (int i : ancestors)
-            collector.addAncestor(i);
+        CFMetaData cfm = SchemaLoader.standardCFMD("ks1", "cf1");
+        MetadataCollector collector = new MetadataCollector(cfm.comparator)
+                                          .commitLogIntervals(new IntervalSet(cllb, club));
 
         String partitioner = RandomPartitioner.class.getCanonicalName();
         double bfFpChance = 0.1;
-        Map<MetadataType, MetadataComponent> originalMetadata = collector.finalizeMetadata(partitioner, bfFpChance, 0);
+        Map<MetadataType, MetadataComponent> originalMetadata = collector.finalizeMetadata(partitioner, bfFpChance, 0, SerializationHeader.make(cfm, Collections.emptyList()));
         return originalMetadata;
     }
 
     @Test
-    public void testLaReadsLb() throws IOException
+    public void testLaReadLb() throws IOException
+    {
+        testOldReadsNew("la", "lb");
+    }
+
+    @Test
+    public void testMaReadMb() throws IOException
+    {
+        testOldReadsNew("ma", "mb");
+    }
+
+    @Test
+    public void testMaReadMc() throws IOException
+    {
+        testOldReadsNew("ma", "mc");
+    }
+
+    @Test
+    public void testMbReadMc() throws IOException
+    {
+        testOldReadsNew("mb", "mc");
+    }
+
+    public void testOldReadsNew(String oldV, String newV) throws IOException
     {
         Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
 
         MetadataSerializer serializer = new MetadataSerializer();
         // Write metadata in two minor formats.
-        File statsFileLb = serialize(originalMetadata, serializer, BigFormat.instance.getVersion("lb"));
-        File statsFileLa = serialize(originalMetadata, serializer, BigFormat.instance.getVersion("la"));
-
+        File statsFileLb = serialize(originalMetadata, serializer, BigFormat.instance.getVersion(newV));
+        File statsFileLa = serialize(originalMetadata, serializer, BigFormat.instance.getVersion(oldV));
         // Reading both as earlier version should yield identical results.
-        Descriptor desc = new Descriptor("la", statsFileLb.getParentFile(), "", "", 0, Descriptor.Type.FINAL, DatabaseDescriptor.getSSTableFormat());
+        Descriptor desc = new Descriptor(oldV, statsFileLb.getParentFile(), "", "", 0, DatabaseDescriptor.getSSTableFormat());
         try (RandomAccessReader inLb = RandomAccessReader.open(statsFileLb);
              RandomAccessReader inLa = RandomAccessReader.open(statsFileLa))
         {

diff --git a/test/unit/org/apache/cassandra/io/util/BufferedDataOutputStreamTest.java b/test/unit/org/apache/cassandra/io/util/BufferedDataOutputStreamTest.java
index 0c58e41..7ca2273 100644
--- a/test/unit/org/apache/cassandra/io/util/BufferedDataOutputStreamTest.java
+++ b/test/unit/org/apache/cassandra/io/util/BufferedDataOutputStreamTest.java

@@ -32,11 +32,18 @@
 import java.lang.reflect.Field;
 import java.nio.BufferOverflowException;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.WritableByteChannel;
+import java.util.Arrays;
 import java.util.Random;
 
+import org.apache.cassandra.utils.vint.VIntCoding;
 import org.junit.Test;
 
+import com.google.common.primitives.UnsignedBytes;
+import com.google.common.primitives.UnsignedInteger;
+import com.google.common.primitives.UnsignedLong;
+
 import static org.junit.Assert.*;
 
 public class BufferedDataOutputStreamTest
@@ -180,12 +187,14 @@
     private ByteArrayOutputStream canonical;
     private DataOutputStreamPlus dosp;
 
-    void setUp()
+    void setUp() throws Exception
     {
 
         generated = new ByteArrayOutputStream();
         canonical = new ByteArrayOutputStream();
         dosp = new WrappedDataOutputStreamPlus(canonical);
+        if (ndosp != null)
+            ndosp.close();
         ndosp = new BufferedDataOutputStreamPlus(adapter, 4096);
     }
 
@@ -210,7 +219,7 @@
         int action = 0;
         while (generated.size() < 1024 * 1024 * 8)
         {
-            action = r.nextInt(19);
+            action = r.nextInt(21);
 
             //System.out.println("Action " + action + " iteration " + iteration);
             iteration++;
@@ -387,6 +396,20 @@
                 }
                 break;
             }
+            case 19:
+            {
+                long val = r.nextLong();
+                VIntCoding.writeVInt(val, dosp);
+                ndosp.writeVInt(val);
+                break;
+            }
+            case 20:
+            {
+                long val = r.nextLong();
+                VIntCoding.writeUnsignedVInt(val, dosp);
+                ndosp.writeUnsignedVInt(val);
+                break;
+            }
             default:
                 fail("Shouldn't reach here");
             }
@@ -506,4 +529,129 @@
                 sb.append(twoByte);
         }
     }
+
+    /*
+     * Add values to the array with a bit set in every position
+     */
+    public static long[] enrich(long vals[])
+    {
+        long retval[] = Arrays.copyOf(vals, vals.length + 64);
+        for (int ii = 0; ii < 64; ii++)
+            retval[vals.length + ii] = 1L << ii;
+        return retval;
+   }
+
+    @Test
+    public void testVInt() throws Exception
+    {
+        setUp();
+        long testValues[] = new long[] {
+                0, 1, -1
+                ,Long.MIN_VALUE, Long.MIN_VALUE + 1, Long.MAX_VALUE, Long.MAX_VALUE - 1
+                ,Integer.MIN_VALUE, Integer.MIN_VALUE + 1, Integer.MAX_VALUE, Integer.MAX_VALUE - 1
+                ,Short.MIN_VALUE, Short.MIN_VALUE + 1, Short.MAX_VALUE, Short.MAX_VALUE - 1
+                ,Byte.MIN_VALUE, Byte.MIN_VALUE + 1, Byte.MAX_VALUE, Byte.MAX_VALUE - 1 };
+        testValues = enrich(testValues);
+
+        int expectedSize = 0;
+        for (long v : testValues)
+        {
+            expectedSize += VIntCoding.computeVIntSize(v);
+            ndosp.writeVInt(v);
+        }
+
+        ndosp.flush();
+
+        DataInputBuffer in = new DataInputBuffer(generated.toByteArray());
+        assertEquals(expectedSize, generated.toByteArray().length);
+
+        for (long v : testValues)
+        {
+            assertEquals(v, in.readVInt());
+        }
+    }
+
+    @Test
+    public void testUnsignedVInt() throws Exception
+    {
+        setUp();
+        long testValues[] = new long[] { //-1 };
+                0, 1
+                , UnsignedLong.MAX_VALUE.longValue(), UnsignedLong.MAX_VALUE.longValue() - 1, UnsignedLong.MAX_VALUE.longValue() + 1
+                , UnsignedInteger.MAX_VALUE.longValue(), UnsignedInteger.MAX_VALUE.longValue() - 1, UnsignedInteger.MAX_VALUE.longValue() + 1
+                , UnsignedBytes.MAX_VALUE, UnsignedBytes.MAX_VALUE - 1, UnsignedBytes.MAX_VALUE + 1
+                , 65536, 65536 - 1, 65536 + 1 };
+        testValues = enrich(testValues);
+
+        int expectedSize = 0;
+        for (long v : testValues)
+        {
+            expectedSize += VIntCoding.computeUnsignedVIntSize(v);
+            ndosp.writeUnsignedVInt(v);
+        }
+
+        ndosp.flush();
+
+        DataInputBuffer in = new DataInputBuffer(generated.toByteArray());
+        assertEquals(expectedSize, generated.toByteArray().length);
+
+        for (long v : testValues)
+            assertEquals(v, in.readUnsignedVInt());
+    }
+
+    @Test
+    public void testWriteSlowByteOrder() throws Exception
+    {
+        try (DataOutputBuffer dob = new DataOutputBuffer(4))
+        {
+            dob.order(ByteOrder.LITTLE_ENDIAN);
+            dob.writeLong(42);
+            assertEquals(42, ByteBuffer.wrap(dob.toByteArray()).order(ByteOrder.LITTLE_ENDIAN).getLong());
+        }
+    }
+
+    @Test
+    public void testWriteExcessSlow() throws Exception
+    {
+        try (DataOutputBuffer dob = new DataOutputBuffer(4))
+        {
+            dob.strictFlushing = true;
+            ByteBuffer buf = ByteBuffer.allocateDirect(8);
+            buf.putLong(0, 42);
+            dob.write(buf);
+            assertEquals(42, ByteBuffer.wrap(dob.toByteArray()).getLong());
+        }
+    }
+
+    @Test
+    public void testApplyToChannel() throws Exception
+    {
+        setUp();
+        Object obj = new Object();
+        Object retval = ndosp.applyToChannel( channel -> {
+            ByteBuffer buf = ByteBuffer.allocate(8);
+            buf.putLong(0, 42);
+            try
+            {
+                channel.write(buf);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+            return obj;
+        });
+        assertEquals(obj, retval);
+        assertEquals(42, ByteBuffer.wrap(generated.toByteArray()).getLong());
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testApplyToChannelThrowsForMisaligned() throws Exception
+    {
+        setUp();
+        ndosp.strictFlushing = true;
+        ndosp.applyToChannel( channel -> {
+            return null;
+        });
+    }
 }

diff --git a/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java b/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java
index 0c1583d..360d262 100644
--- a/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java
+++ b/test/unit/org/apache/cassandra/io/util/BufferedRandomAccessFileTest.java

@@ -18,8 +18,6 @@
  *
  */
 package org.apache.cassandra.io.util;
-
-import org.apache.cassandra.service.FileCacheService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.SyncUtil;
 
@@ -27,13 +25,7 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.nio.channels.ClosedChannelException;
 import java.util.Arrays;
-import java.util.concurrent.Callable;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.atomic.AtomicBoolean;
 
 import static org.apache.cassandra.Util.expectEOF;
 import static org.apache.cassandra.Util.expectException;
@@ -48,17 +40,18 @@
     public void testReadAndWrite() throws Exception
     {
         SequentialWriter w = createTempFile("braf");
+        ChannelProxy channel = new ChannelProxy(w.getPath());
 
         // writting string of data to the file
         byte[] data = "Hello".getBytes();
         w.write(data);
         assertEquals(data.length, w.length());
-        assertEquals(data.length, w.getFilePointer());
+        assertEquals(data.length, w.position());
 
         w.sync();
 
         // reading small amount of data from file, this is handled by initial buffer
-        RandomAccessReader r = RandomAccessReader.open(w);
+        RandomAccessReader r = RandomAccessReader.open(channel);
 
         byte[] buffer = new byte[data.length];
         assertEquals(data.length, r.read(buffer));
@@ -74,14 +67,14 @@
         for (int i = 0; i < bigData.length; i++)
             bigData[i] = 'd';
 
-        long initialPosition = w.getFilePointer();
+        long initialPosition = w.position();
         w.write(bigData); // writing data
-        assertEquals(w.getFilePointer(), initialPosition + bigData.length);
+        assertEquals(w.position(), initialPosition + bigData.length);
         assertEquals(w.length(), initialPosition + bigData.length); // file size should equals to last position
 
         w.sync();
 
-        r = RandomAccessReader.open(w); // re-opening file in read-only mode
+        r = RandomAccessReader.open(channel); // re-opening file in read-only mode
 
         // reading written buffer
         r.seek(initialPosition); // back to initial (before write) position
@@ -101,7 +94,7 @@
 
         // test readBytes(int) method
         r.seek(0);
-        ByteBuffer fileContent = r.readBytes((int) w.length());
+        ByteBuffer fileContent = ByteBufferUtil.read(r, (int) w.length());
         assertEquals(fileContent.limit(), w.length());
         assert ByteBufferUtil.string(fileContent).equals("Hello" + new String(bigData));
 
@@ -130,6 +123,7 @@
 
         w.finish();
         r.close();
+        channel.close();
     }
 
     @Test
@@ -142,7 +136,8 @@
         byte[] in = generateByteArray(RandomAccessReader.DEFAULT_BUFFER_SIZE);
         w.write(in);
 
-        RandomAccessReader r = RandomAccessReader.open(w);
+        ChannelProxy channel = new ChannelProxy(w.getPath());
+        RandomAccessReader r = RandomAccessReader.open(channel);
 
         // Read it into a same size array.
         byte[] out = new byte[RandomAccessReader.DEFAULT_BUFFER_SIZE];
@@ -154,6 +149,7 @@
 
         r.close();
         w.finish();
+        channel.close();
     }
 
     @Test
@@ -181,9 +177,11 @@
         w.finish();
 
         // will use cachedlength
-        RandomAccessReader r = RandomAccessReader.open(tmpFile);
-        assertEquals(lessThenBuffer.length + biggerThenBuffer.length, r.length());
-        r.close();
+        try (ChannelProxy channel = new ChannelProxy(tmpFile);
+            RandomAccessReader r = RandomAccessReader.open(channel))
+        {
+            assertEquals(lessThenBuffer.length + biggerThenBuffer.length, r.length());
+        }
     }
 
     @Test
@@ -201,30 +199,26 @@
         w.write(data);
         w.sync();
 
-        final RandomAccessReader r = RandomAccessReader.open(w);
+        final ChannelProxy channel = new ChannelProxy(w.getPath());
+        final RandomAccessReader r = RandomAccessReader.open(channel);
 
-        ByteBuffer content = r.readBytes((int) r.length());
+        ByteBuffer content = ByteBufferUtil.read(r, (int) r.length());
 
         // after reading whole file we should be at EOF
         assertEquals(0, ByteBufferUtil.compare(content, data));
         assert r.bytesRemaining() == 0 && r.isEOF();
 
         r.seek(0);
-        content = r.readBytes(10); // reading first 10 bytes
+        content = ByteBufferUtil.read(r, 10); // reading first 10 bytes
         assertEquals(ByteBufferUtil.compare(content, "cccccccccc".getBytes()), 0);
         assertEquals(r.bytesRemaining(), r.length() - content.limit());
 
         // trying to read more than file has right now
-        expectEOF(new Callable<Object>()
-        {
-            public Object call() throws IOException
-            {
-                return r.readBytes((int) r.length() + 10);
-            }
-        });
+        expectEOF(() -> ByteBufferUtil.read(r, (int) r.length() + 10));
 
         w.finish();
         r.close();
+        channel.close();
     }
 
     @Test
@@ -235,7 +229,8 @@
         w.write(data);
         w.finish();
 
-        final RandomAccessReader file = RandomAccessReader.open(w);
+        final ChannelProxy channel = new ChannelProxy(w.getPath());
+        final RandomAccessReader file = RandomAccessReader.open(channel);
 
         file.seek(0);
         assertEquals(file.getFilePointer(), 0);
@@ -246,25 +241,12 @@
         assertEquals(file.bytesRemaining(), file.length() - 20);
 
         // trying to seek past the end of the file should produce EOFException
-        expectException(new Callable<Object>()
-        {
-            public Object call()
-            {
-                file.seek(file.length() + 30);
-                return null;
-            }
-        }, IllegalArgumentException.class);
+        expectException(() -> { file.seek(file.length() + 30); return null; }, IllegalArgumentException.class);
 
-        expectException(new Callable<Object>()
-        {
-            public Object call() throws IOException
-            {
-                file.seek(-1);
-                return null;
-            }
-        }, IllegalArgumentException.class); // throws IllegalArgumentException
+        expectException(() -> { file.seek(-1); return null; }, IllegalArgumentException.class); // throws IllegalArgumentException
 
         file.close();
+        channel.close();
     }
 
     @Test
@@ -274,7 +256,8 @@
         w.write(generateByteArray(RandomAccessReader.DEFAULT_BUFFER_SIZE * 2));
         w.finish();
 
-        RandomAccessReader file = RandomAccessReader.open(w);
+        ChannelProxy channel = new ChannelProxy(w.getPath());
+        RandomAccessReader file = RandomAccessReader.open(channel);
 
         file.seek(0); // back to the beginning of the file
         assertEquals(file.skipBytes(10), 10);
@@ -294,6 +277,7 @@
         assertEquals(file.bytesRemaining(), file.length());
 
         file.close();
+        channel.close();
     }
 
     @Test
@@ -301,14 +285,15 @@
     {
         final SequentialWriter w = createTempFile("brafGetFilePointer");
 
-        assertEquals(w.getFilePointer(), 0); // initial position should be 0
+        assertEquals(w.position(), 0); // initial position should be 0
 
         w.write(generateByteArray(20));
-        assertEquals(w.getFilePointer(), 20); // position 20 after writing 20 bytes
+        assertEquals(w.position(), 20); // position 20 after writing 20 bytes
 
         w.sync();
 
-        RandomAccessReader r = RandomAccessReader.open(w);
+        ChannelProxy channel = new ChannelProxy(w.getPath());
+        RandomAccessReader r = RandomAccessReader.open(channel);
 
         // position should change after skip bytes
         r.seek(0);
@@ -322,6 +307,7 @@
 
         w.finish();
         r.close();
+        channel.close();
     }
 
     @Test
@@ -344,16 +330,11 @@
             {
                 File file1 = writeTemporaryFile(new byte[16]);
                 try (final ChannelProxy channel = new ChannelProxy(file1);
-                     final RandomAccessReader file = RandomAccessReader.open(channel, bufferSize, null))
+                     final RandomAccessReader file = new RandomAccessReader.Builder(channel)
+                                                     .bufferSize(bufferSize)
+                                                     .build())
                 {
-                    expectEOF(new Callable<Object>()
-                    {
-                        public Object call() throws IOException
-                        {
-                            file.readFully(target, offset, 17);
-                            return null;
-                        }
-                    });
+                    expectEOF(() -> { file.readFully(target, offset, 17); return null; });
                 }
             }
 
@@ -362,15 +343,11 @@
             {
                 File file1 = writeTemporaryFile(new byte[16]);
                 try (final ChannelProxy channel = new ChannelProxy(file1);
-                     final RandomAccessReader file = RandomAccessReader.open(channel, bufferSize, null))
+                     final RandomAccessReader file = new RandomAccessReader.Builder(channel).bufferSize(bufferSize).build())
                 {
-                    expectEOF(new Callable<Object>()
-                    {
-                        public Object call() throws IOException
-                        {
-                            while (true)
-                                file.readFully(target, 0, n);
-                        }
+                    expectEOF(() -> {
+                        while (true)
+                            file.readFully(target, 0, n);
                     });
                 }
             }
@@ -397,7 +374,8 @@
 
         w.sync();
 
-        RandomAccessReader r = RandomAccessReader.open(w);
+        ChannelProxy channel = new ChannelProxy(w.getPath());
+        RandomAccessReader r = RandomAccessReader.open(channel);
 
         assertEquals(r.bytesRemaining(), toWrite);
 
@@ -413,6 +391,7 @@
 
         w.finish();
         r.close();
+        channel.close();
     }
 
     @Test
@@ -427,7 +406,7 @@
             assert tmpFile.getPath().equals(r.getPath());
 
             // Create a mark and move the rw there.
-            final FileMark mark = r.mark();
+            final DataPosition mark = r.mark();
             r.reset(mark);
 
             // Expect this call to succeed.
@@ -449,26 +428,17 @@
 
         r.close(); // closing to test read after close
 
-        expectException(new Callable<Object>()
-        {
-            public Object call()
-            {
-                return r.read();
-            }
-        }, AssertionError.class);
+        expectException(() -> r.read(), NullPointerException.class);
 
-        expectException(new Callable<Object>()
-        {
-            public Object call() throws IOException
-            {
-                w.write(generateByteArray(1));
-                return null;
-            }
-        }, ClosedChannelException.class);
+        //Used to throw ClosedChannelException, but now that it extends BDOSP it just NPEs on the buffer
+        //Writing to a BufferedOutputStream that is closed generates no error
+        //Going to allow the NPE to throw to catch as a bug any use after close. Notably it won't throw NPE for a
+        //write of a 0 length, but that is kind of a corner case
+        expectException(() -> { w.write(generateByteArray(1)); return null; }, NullPointerException.class);
 
         try (RandomAccessReader copy = RandomAccessReader.open(new File(r.getPath())))
         {
-            ByteBuffer contents = copy.readBytes((int) copy.length());
+            ByteBuffer contents = ByteBufferUtil.read(copy, (int) copy.length());
 
             assertEquals(contents.limit(), data.length);
             assertEquals(ByteBufferUtil.compare(contents, data), 0);
@@ -483,10 +453,11 @@
 
         w.finish();
 
-        RandomAccessReader file = RandomAccessReader.open(w);
+        ChannelProxy channel = new ChannelProxy(w.getPath());
+        RandomAccessReader file = RandomAccessReader.open(channel);
 
         file.seek(10);
-        FileMark mark = file.mark();
+        DataPosition mark = file.mark();
 
         file.seek(file.length());
         assertTrue(file.isEOF());
@@ -508,9 +479,10 @@
         assertEquals(file.bytesPastMark(), 0);
 
         file.close();
+        channel.close();
     }
 
-    @Test (expected = AssertionError.class)
+    @Test(expected = AssertionError.class)
     public void testAssertionErrorWhenBytesPastMarkIsNegative() throws IOException
     {
         try (SequentialWriter w = createTempFile("brafAssertionErrorWhenBytesPastMarkIsNegative"))
@@ -518,7 +490,8 @@
             w.write(new byte[30]);
             w.flush();
 
-            try (RandomAccessReader r = RandomAccessReader.open(w))
+            try (ChannelProxy channel = new ChannelProxy(w.getPath());
+                 RandomAccessReader r = RandomAccessReader.open(channel))
             {
                 r.seek(10);
                 r.mark();
@@ -530,71 +503,6 @@
     }
 
     @Test
-    public void testFileCacheService() throws IOException, InterruptedException
-    {
-        //see https://issues.apache.org/jira/browse/CASSANDRA-7756
-
-        final FileCacheService.CacheKey cacheKey = new FileCacheService.CacheKey();
-        final int THREAD_COUNT = 40;
-        ExecutorService executorService = Executors.newFixedThreadPool(THREAD_COUNT);
-
-        SequentialWriter w1 = createTempFile("fscache1");
-        SequentialWriter w2 = createTempFile("fscache2");
-
-        w1.write(new byte[30]);
-        w1.finish();
-
-        w2.write(new byte[30]);
-        w2.finish();
-
-        for (int i = 0; i < 20; i++)
-        {
-
-
-            RandomAccessReader r1 = RandomAccessReader.open(w1);
-            RandomAccessReader r2 = RandomAccessReader.open(w2);
-
-
-            FileCacheService.instance.put(cacheKey, r1);
-            FileCacheService.instance.put(cacheKey, r2);
-
-            final CountDownLatch finished = new CountDownLatch(THREAD_COUNT);
-            final AtomicBoolean hadError = new AtomicBoolean(false);
-
-            for (int k = 0; k < THREAD_COUNT; k++)
-            {
-                executorService.execute( new Runnable()
-                {
-                    @Override
-                    public void run()
-                    {
-                        try
-                        {
-                            long size = FileCacheService.instance.sizeInBytes();
-
-                            while (size > 0)
-                                size = FileCacheService.instance.sizeInBytes();
-                        }
-                        catch (Throwable t)
-                        {
-                            t.printStackTrace();
-                            hadError.set(true);
-                        }
-                        finally
-                        {
-                            finished.countDown();
-                        }
-                    }
-                });
-
-            }
-
-            finished.await();
-            assert !hadError.get();
-        }
-    }
-
-    @Test
     public void testReadOnly() throws IOException
     {
         SequentialWriter file = createTempFile("brafReadOnlyTest");
@@ -613,14 +521,7 @@
         assertTrue(copy.bytesRemaining() == 0 && copy.isEOF());
 
         // can't seek past the end of the file for read-only files
-        expectException(new Callable<Object>()
-        {
-            public Object call()
-            {
-                copy.seek(copy.length() + 1);
-                return null;
-            }
-        }, IllegalArgumentException.class);
+        expectException(() -> { copy.seek(copy.length() + 1); return null; }, IllegalArgumentException.class);
 
         copy.seek(0);
         copy.skipBytes(5);
@@ -630,7 +531,7 @@
         assertTrue(!copy.isEOF());
 
         copy.seek(0);
-        ByteBuffer contents = copy.readBytes((int) copy.length());
+        ByteBuffer contents = ByteBufferUtil.read(copy, (int) copy.length());
 
         assertEquals(contents.limit(), copy.length());
         assertTrue(ByteBufferUtil.compare(contents, data) == 0);

diff --git a/test/unit/org/apache/cassandra/io/util/ByteBufferDataInputTest.java b/test/unit/org/apache/cassandra/io/util/ByteBufferDataInputTest.java
deleted file mode 100644
index af2d1dc..0000000
--- a/test/unit/org/apache/cassandra/io/util/ByteBufferDataInputTest.java
+++ /dev/null

@@ -1,67 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.io.util;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.MappedByteBuffer;
-
-import org.junit.Test;
-
-import junit.framework.Assert;
-
-public class ByteBufferDataInputTest
-{
-
-    @Test
-    public void testPositionAndSeek() throws IOException
-    {
-        ByteBufferDataInput bbdi = new ByteBufferDataInput(ByteBuffer.allocate(100), "", 15, 1);
-        Assert.assertEquals(99, bbdi.bytesRemaining());
-        Assert.assertEquals(16, bbdi.getPosition());
-        Assert.assertEquals(16, bbdi.getFilePointer());
-//        Assert.assertTrue(bbdi.markSupported());
-        FileMark mark = bbdi.mark();
-        bbdi.seek(115);
-        Assert.assertEquals(115, bbdi.getFilePointer());
-        Assert.assertEquals(115, bbdi.getPosition());
-        Assert.assertEquals(99, bbdi.bytesPastMark(mark));
-        Assert.assertTrue(bbdi.isEOF());
-        bbdi.seek(15);
-        Assert.assertEquals(15, bbdi.getFilePointer());
-        Assert.assertEquals(15, bbdi.getPosition());
-        try
-        {
-            bbdi.seek(14);
-            Assert.assertTrue(false);
-        }
-        catch (IOException t)
-        {
-        }
-        try
-        {
-            bbdi.seek(116);
-            Assert.assertTrue(false);
-        }
-        catch (IOException t)
-        {
-        }
-    }
-
-}

diff --git a/test/unit/org/apache/cassandra/io/util/ChecksummedRandomAccessReaderTest.java b/test/unit/org/apache/cassandra/io/util/ChecksummedRandomAccessReaderTest.java
new file mode 100644
index 0000000..57428af
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/ChecksummedRandomAccessReaderTest.java

@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.Arrays;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import org.apache.cassandra.io.util.ChecksummedRandomAccessReader;
+import org.apache.cassandra.io.util.ChecksummedSequentialWriter;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SequentialWriter;
+
+public class ChecksummedRandomAccessReaderTest
+{
+    @Test
+    public void readFully() throws IOException
+    {
+        final File data = File.createTempFile("testReadFully", "data");
+        final File crc = File.createTempFile("testReadFully", "crc");
+
+        final byte[] expected = new byte[70 * 1024];   // bit more than crc chunk size, so we can test rebuffering.
+        ThreadLocalRandom.current().nextBytes(expected);
+
+        SequentialWriter writer = ChecksummedSequentialWriter.open(data, crc);
+        writer.write(expected);
+        writer.finish();
+
+        assert data.exists();
+
+        RandomAccessReader reader = new ChecksummedRandomAccessReader.Builder(data, crc).build();
+        byte[] b = new byte[expected.length];
+        reader.readFully(b);
+
+        assertArrayEquals(expected, b);
+
+        assertTrue(reader.isEOF());
+
+        reader.close();
+    }
+
+    @Test
+    public void seek() throws IOException
+    {
+        final File data = File.createTempFile("testSeek", "data");
+        final File crc = File.createTempFile("testSeek", "crc");
+
+        final byte[] dataBytes = new byte[70 * 1024];   // bit more than crc chunk size
+        ThreadLocalRandom.current().nextBytes(dataBytes);
+
+        SequentialWriter writer = ChecksummedSequentialWriter.open(data, crc);
+        writer.write(dataBytes);
+        writer.finish();
+
+        assert data.exists();
+
+        RandomAccessReader reader = new ChecksummedRandomAccessReader.Builder(data, crc).build();
+
+        final int seekPosition = 66000;
+        reader.seek(seekPosition);
+
+        byte[] b = new byte[dataBytes.length - seekPosition];
+        reader.readFully(b);
+
+        byte[] expected = Arrays.copyOfRange(dataBytes, seekPosition, dataBytes.length);
+
+        assertArrayEquals(expected, b);
+
+        assertTrue(reader.isEOF());
+
+        reader.close();
+    }
+
+    @Test(expected = ChecksummedRandomAccessReader.CorruptFileException.class)
+    public void corruptionDetection() throws IOException
+    {
+        final File data = File.createTempFile("corruptionDetection", "data");
+        final File crc = File.createTempFile("corruptionDetection", "crc");
+
+        final byte[] expected = new byte[5 * 1024];
+        Arrays.fill(expected, (byte) 0);
+
+        SequentialWriter writer = ChecksummedSequentialWriter.open(data, crc);
+        writer.write(expected);
+        writer.finish();
+
+        assert data.exists();
+
+        // simulate corruption of file
+        try (RandomAccessFile dataFile = new RandomAccessFile(data, "rw"))
+        {
+            dataFile.seek(1024);
+            dataFile.write((byte) 5);
+        }
+
+        RandomAccessReader reader = new ChecksummedRandomAccessReader.Builder(data, crc).build();
+        byte[] b = new byte[expected.length];
+        reader.readFully(b);
+
+        assertArrayEquals(expected, b);
+
+        assertTrue(reader.isEOF());
+
+        reader.close();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/util/ChecksummedSequentialWriterTest.java b/test/unit/org/apache/cassandra/io/util/ChecksummedSequentialWriterTest.java
index 9731a8d..bea3aac 100644
--- a/test/unit/org/apache/cassandra/io/util/ChecksummedSequentialWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/util/ChecksummedSequentialWriterTest.java

@@ -85,7 +85,6 @@
         protected void assertAborted() throws Exception
         {
             super.assertAborted();
-            Assert.assertFalse(crcFile.exists());
         }
     }
 

diff --git a/test/unit/org/apache/cassandra/io/util/FileSegmentInputStreamTest.java b/test/unit/org/apache/cassandra/io/util/FileSegmentInputStreamTest.java
new file mode 100644
index 0000000..fcee9b7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/FileSegmentInputStreamTest.java

@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+
+import com.google.common.primitives.Ints;
+import org.junit.Test;
+
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class FileSegmentInputStreamTest
+{
+    private ByteBuffer allocateBuffer(int size)
+    {
+        ByteBuffer ret = ByteBuffer.allocate(Ints.checkedCast(size));
+        long seed = System.nanoTime();
+        //seed = 365238103404423L;
+        System.out.println("Seed " + seed);
+
+        new Random(seed).nextBytes(ret.array());
+        return ret;
+    }
+
+    @Test
+    public void testRead() throws IOException
+    {
+        testRead(0, 4096, 1024);
+        testRead(1024, 4096, 1024);
+        testRead(4096, 4096, 1024);
+    }
+
+    private void testRead(int offset, int size, int checkInterval) throws IOException
+    {
+        final ByteBuffer buffer = allocateBuffer(size);
+        final String path = buffer.toString();
+
+        FileSegmentInputStream reader = new FileSegmentInputStream(buffer.duplicate(), path, offset);
+        assertEquals(path, reader.getPath());
+
+        for (int i = offset; i < (size + offset); i += checkInterval)
+        {
+            reader.seek(i);
+            assertFalse(reader.isEOF());
+            assertEquals(i, reader.getFilePointer());
+
+            buffer.position(i - offset);
+
+            int remaining = buffer.remaining();
+            assertEquals(remaining, reader.bytesRemaining());
+            byte[] expected = new byte[buffer.remaining()];
+            buffer.get(expected);
+            assertTrue(Arrays.equals(expected, ByteBufferUtil.read(reader, remaining).array()));
+
+            assertTrue(reader.isEOF());
+            assertEquals(0, reader.bytesRemaining());
+            assertEquals(buffer.capacity() + offset, reader.getFilePointer());
+        }
+
+        reader.close();
+        reader.close();
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testMarkNotSupported() throws Exception
+    {
+        FileSegmentInputStream reader = new FileSegmentInputStream(allocateBuffer(1024), "", 0);
+        assertFalse(reader.markSupported());
+        assertEquals(0, reader.bytesPastMark(null));
+        reader.mark();
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testResetNotSupported() throws Exception
+    {
+        FileSegmentInputStream reader = new FileSegmentInputStream(allocateBuffer(1024), "", 0);
+        reader.reset(null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSeekNegative() throws Exception
+    {
+        FileSegmentInputStream reader = new FileSegmentInputStream(allocateBuffer(1024), "", 0);
+        reader.seek(-1);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSeekBeforeOffset() throws Exception
+    {
+        FileSegmentInputStream reader = new FileSegmentInputStream(allocateBuffer(1024), "", 1024);
+        reader.seek(1023);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSeekPastLength() throws Exception
+    {
+        FileSegmentInputStream reader = new FileSegmentInputStream(allocateBuffer(1024), "", 1024);
+        reader.seek(2049);
+    }
+
+    @Test(expected = EOFException.class)
+    public void testReadBytesTooMany() throws Exception
+    {
+        FileSegmentInputStream reader = new FileSegmentInputStream(allocateBuffer(1024), "", 1024);
+        ByteBufferUtil.read(reader, 2049);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/util/FileUtilsTest.java b/test/unit/org/apache/cassandra/io/util/FileUtilsTest.java
index 7110504..8d1b752 100644
--- a/test/unit/org/apache/cassandra/io/util/FileUtilsTest.java
+++ b/test/unit/org/apache/cassandra/io/util/FileUtilsTest.java

@@ -26,6 +26,7 @@
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 public class FileUtilsTest
@@ -52,4 +53,13 @@
         assertEquals(0, b.length);
     }
 
+    @Test
+    public void testIsContained()
+    {
+        assertTrue(FileUtils.isContained(new File("/tmp/abc"), new File("/tmp/abc")));
+        assertFalse(FileUtils.isContained(new File("/tmp/abc"), new File("/tmp/abcd")));
+        assertTrue(FileUtils.isContained(new File("/tmp/abc"), new File("/tmp/abc/d")));
+        assertTrue(FileUtils.isContained(new File("/tmp/abc/../abc"), new File("/tmp/abc/d")));
+        assertFalse(FileUtils.isContained(new File("/tmp/abc/../abc"), new File("/tmp/abcc")));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/io/util/MemoryTest.java b/test/unit/org/apache/cassandra/io/util/MemoryTest.java
index 9be69ac..81dee7e 100644
--- a/test/unit/org/apache/cassandra/io/util/MemoryTest.java
+++ b/test/unit/org/apache/cassandra/io/util/MemoryTest.java

@@ -18,8 +18,11 @@
 */
 package org.apache.cassandra.io.util;
 
+import java.io.EOFException;
+import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.util.Arrays;
 import java.util.concurrent.ThreadLocalRandom;
 
 import org.junit.Test;
@@ -27,6 +30,10 @@
 import junit.framework.Assert;
 import org.apache.cassandra.utils.memory.MemoryUtil;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
 public class MemoryTest
 {
 
@@ -45,6 +52,36 @@
         memory.close();
     }
 
+    @Test
+    public void testInputStream() throws IOException
+    {
+        byte[] bytes = new byte[4096];
+        ThreadLocalRandom.current().nextBytes(bytes);
+        final Memory memory = Memory.allocate(bytes.length);
+        memory.setBytes(0, bytes, 0, bytes.length);
+
+        try(MemoryInputStream stream = new MemoryInputStream(memory, 1024))
+        {
+            byte[] bb = new byte[bytes.length];
+            assertEquals(bytes.length, stream.available());
+
+            stream.readFully(bb);
+            assertEquals(0, stream.available());
+
+            assertTrue(Arrays.equals(bytes, bb));
+
+            try
+            {
+                stream.readInt();
+                fail("Expected EOF exception");
+            }
+            catch (EOFException e)
+            {
+                //pass
+            }
+        }
+    }
+
     private static void test(ByteBuffer canon, Memory memory)
     {
         ByteBuffer hollow = MemoryUtil.getHollowDirectByteBuffer();

diff --git a/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java b/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java
new file mode 100644
index 0000000..7cf7bd3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java

@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import com.google.common.primitives.Ints;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
+import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.ChecksumType;
+
+import static junit.framework.Assert.assertNull;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+public class MmappedRegionsTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(MmappedRegionsTest.class);
+
+    private static ByteBuffer allocateBuffer(int size)
+    {
+        ByteBuffer ret = ByteBuffer.allocate(Ints.checkedCast(size));
+        long seed = System.nanoTime();
+        //seed = 365238103404423L;
+        logger.info("Seed {}", seed);
+
+        new Random(seed).nextBytes(ret.array());
+        return ret;
+    }
+
+    private static File writeFile(String fileName, ByteBuffer buffer) throws IOException
+    {
+        File ret = File.createTempFile(fileName, "1");
+        ret.deleteOnExit();
+
+        try (SequentialWriter writer = SequentialWriter.open(ret))
+        {
+            writer.write(buffer);
+            writer.finish();
+        }
+
+        assert ret.exists();
+        assert ret.length() >= buffer.capacity();
+        return ret;
+
+    }
+
+    @Test
+    public void testEmpty() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(1024);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testEmpty", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            assertTrue(regions.isEmpty());
+            assertTrue(regions.isValid(channel));
+        }
+    }
+
+    @Test
+    public void testTwoSegments() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(2048);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testTwoSegments", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            regions.extend(1024);
+            for (int i = 0; i < 1024; i++)
+            {
+                MmappedRegions.Region region = regions.floor(i);
+                assertNotNull(region);
+                assertEquals(0, region.bottom());
+                assertEquals(1024, region.top());
+            }
+
+            regions.extend(2048);
+            for (int i = 0; i < 2048; i++)
+            {
+                MmappedRegions.Region region = regions.floor(i);
+                assertNotNull(region);
+                if (i < 1024)
+                {
+                    assertEquals(0, region.bottom());
+                    assertEquals(1024, region.top());
+                }
+                else
+                {
+                    assertEquals(1024, region.bottom());
+                    assertEquals(2048, region.top());
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testSmallSegmentSize() throws Exception
+    {
+        int OLD_MAX_SEGMENT_SIZE = MmappedRegions.MAX_SEGMENT_SIZE;
+        MmappedRegions.MAX_SEGMENT_SIZE = 1024;
+
+        ByteBuffer buffer = allocateBuffer(4096);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testSmallSegmentSize", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            regions.extend(1024);
+            regions.extend(2048);
+            regions.extend(4096);
+
+            final int SIZE = MmappedRegions.MAX_SEGMENT_SIZE;
+            for (int i = 0; i < buffer.capacity(); i++)
+            {
+                MmappedRegions.Region region = regions.floor(i);
+                assertNotNull(region);
+                assertEquals(SIZE * (i / SIZE), region.bottom());
+                assertEquals(SIZE + (SIZE * (i / SIZE)), region.top());
+            }
+        }
+        finally
+        {
+            MmappedRegions.MAX_SEGMENT_SIZE = OLD_MAX_SEGMENT_SIZE;
+        }
+    }
+
+    @Test
+    public void testAllocRegions() throws Exception
+    {
+        int OLD_MAX_SEGMENT_SIZE = MmappedRegions.MAX_SEGMENT_SIZE;
+        MmappedRegions.MAX_SEGMENT_SIZE = 1024;
+
+        ByteBuffer buffer = allocateBuffer(MmappedRegions.MAX_SEGMENT_SIZE * MmappedRegions.REGION_ALLOC_SIZE * 3);
+
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testAllocRegions", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            regions.extend(buffer.capacity());
+
+            final int SIZE = MmappedRegions.MAX_SEGMENT_SIZE;
+            for (int i = 0; i < buffer.capacity(); i++)
+            {
+                MmappedRegions.Region region = regions.floor(i);
+                assertNotNull(region);
+                assertEquals(SIZE * (i / SIZE), region.bottom());
+                assertEquals(SIZE + (SIZE * (i / SIZE)), region.top());
+            }
+        }
+        finally
+        {
+            MmappedRegions.MAX_SEGMENT_SIZE = OLD_MAX_SEGMENT_SIZE;
+        }
+    }
+
+    @Test
+    public void testCopy() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(128 * 1024);
+
+        MmappedRegions snapshot;
+        ChannelProxy channelCopy;
+
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testSnapshot", buffer));
+            MmappedRegions regions = MmappedRegions.map(channel, buffer.capacity() / 4))
+        {
+            // create 3 more segments, one per quater capacity
+            regions.extend(buffer.capacity() / 2);
+            regions.extend(3 * buffer.capacity() / 4);
+            regions.extend(buffer.capacity());
+
+            // make a snapshot
+            snapshot = regions.sharedCopy();
+
+            // keep the channel open
+            channelCopy = channel.sharedCopy();
+        }
+
+        assertFalse(snapshot.isCleanedUp());
+
+        final int SIZE = buffer.capacity() / 4;
+        for (int i = 0; i < buffer.capacity(); i++)
+        {
+            MmappedRegions.Region region = snapshot.floor(i);
+            assertNotNull(region);
+            assertEquals(SIZE * (i / SIZE), region.bottom());
+            assertEquals(SIZE + (SIZE * (i / SIZE)), region.top());
+
+            // check we can access the buffer
+            assertNotNull(region.buffer.duplicate().getInt());
+        }
+
+        assertNull(snapshot.close(null));
+        assertNull(channelCopy.close(null));
+        assertTrue(snapshot.isCleanedUp());
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testCopyCannotExtend() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(128 * 1024);
+
+        MmappedRegions snapshot;
+        ChannelProxy channelCopy;
+
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testSnapshotCannotExtend", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            regions.extend(buffer.capacity() / 2);
+
+            // make a snapshot
+            snapshot = regions.sharedCopy();
+
+            // keep the channel open
+            channelCopy = channel.sharedCopy();
+        }
+
+        try
+        {
+            snapshot.extend(buffer.capacity());
+        }
+        finally
+        {
+            assertNull(snapshot.close(null));
+            assertNull(channelCopy.close(null));
+        }
+    }
+
+    @Test
+    public void testExtendOutOfOrder() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(4096);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testExtendOutOfOrder", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            regions.extend(4096);
+            regions.extend(1024);
+            regions.extend(2048);
+
+            for (int i = 0; i < buffer.capacity(); i++)
+            {
+                MmappedRegions.Region region = regions.floor(i);
+                assertNotNull(region);
+                assertEquals(0, region.bottom());
+                assertEquals(4096, region.top());
+            }
+        }
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testNegativeExtend() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(1024);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testNegativeExtend", buffer));
+            MmappedRegions regions = MmappedRegions.empty(channel))
+        {
+            regions.extend(-1);
+        }
+    }
+
+    @Test
+    public void testMapForCompressionMetadata() throws Exception
+    {
+        int OLD_MAX_SEGMENT_SIZE = MmappedRegions.MAX_SEGMENT_SIZE;
+        MmappedRegions.MAX_SEGMENT_SIZE = 1024;
+
+        ByteBuffer buffer = allocateBuffer(128 * 1024);
+        File f = File.createTempFile("testMapForCompressionMetadata", "1");
+        f.deleteOnExit();
+
+        File cf = File.createTempFile(f.getName() + ".metadata", "1");
+        cf.deleteOnExit();
+
+        MetadataCollector sstableMetadataCollector = new MetadataCollector(new ClusteringComparator(BytesType.instance));
+        try(SequentialWriter writer = new CompressedSequentialWriter(f,
+                                                                     cf.getAbsolutePath(),
+                                                                     CompressionParams.snappy(),
+                                                                     sstableMetadataCollector))
+        {
+            writer.write(buffer);
+            writer.finish();
+        }
+
+        CompressionMetadata metadata = new CompressionMetadata(cf.getAbsolutePath(), f.length(), ChecksumType.CRC32);
+        try(ChannelProxy channel = new ChannelProxy(f);
+            MmappedRegions regions = MmappedRegions.map(channel, metadata))
+        {
+
+            assertFalse(regions.isEmpty());
+            int i = 0;
+            while(i < buffer.capacity())
+            {
+                CompressionMetadata.Chunk chunk = metadata.chunkFor(i);
+
+                MmappedRegions.Region region = regions.floor(chunk.offset);
+                assertNotNull(region);
+
+                ByteBuffer compressedChunk = region.buffer.duplicate();
+                assertNotNull(compressedChunk);
+                assertEquals(chunk.length + 4, compressedChunk.capacity());
+
+                assertEquals(chunk.offset, region.bottom());
+                assertEquals(chunk.offset + chunk.length + 4, region.top());
+
+                i += metadata.chunkLength();
+            }
+        }
+        finally
+        {
+            MmappedRegions.MAX_SEGMENT_SIZE = OLD_MAX_SEGMENT_SIZE;
+            metadata.close();
+        }
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testIllegalArgForMap1() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(1024);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testIllegalArgForMap1", buffer));
+            MmappedRegions regions = MmappedRegions.map(channel, 0))
+        {
+            assertTrue(regions.isEmpty());
+        }
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testIllegalArgForMap2() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(1024);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testIllegalArgForMap2", buffer));
+            MmappedRegions regions = MmappedRegions.map(channel, -1L))
+        {
+            assertTrue(regions.isEmpty());
+        }
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testIllegalArgForMap3() throws Exception
+    {
+        ByteBuffer buffer = allocateBuffer(1024);
+        try(ChannelProxy channel = new ChannelProxy(writeFile("testIllegalArgForMap3", buffer));
+            MmappedRegions regions = MmappedRegions.map(channel, null))
+        {
+            assertTrue(regions.isEmpty());
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/io/util/NIODataInputStreamTest.java b/test/unit/org/apache/cassandra/io/util/NIODataInputStreamTest.java
index 953d882..7b91ccb 100644
--- a/test/unit/org/apache/cassandra/io/util/NIODataInputStreamTest.java
+++ b/test/unit/org/apache/cassandra/io/util/NIODataInputStreamTest.java

@@ -34,10 +34,12 @@
 import java.util.Queue;
 import java.util.Random;
 
-import org.apache.cassandra.io.util.NIODataInputStream;
 import org.junit.Test;
 
 import com.google.common.base.Charsets;
+import com.google.common.primitives.UnsignedBytes;
+import com.google.common.primitives.UnsignedInteger;
+import com.google.common.primitives.UnsignedLong;
 
 import static org.junit.Assert.*;
 
@@ -146,7 +148,7 @@
 
     }
 
-    NIODataInputStream fakeStream = new NIODataInputStream(new FakeChannel(), 8);
+    NIODataInputStream fakeStream = new NIODataInputStream(new FakeChannel(), 9);
 
     @Test(expected = IOException.class)
     public void testResetThrows() throws Exception
@@ -197,17 +199,10 @@
     }
 
     @SuppressWarnings("resource")
-    @Test(expected = IllegalArgumentException.class)
-    public void testTooSmallBufferSize() throws Exception
-    {
-        new NIODataInputStream(new FakeChannel(), 4);
-    }
-
-    @SuppressWarnings("resource")
     @Test(expected = NullPointerException.class)
     public void testNullRBC() throws Exception
     {
-        new NIODataInputStream(null, 8);
+        new NIODataInputStream(null, 9);
     }
 
     @SuppressWarnings("resource")
@@ -232,7 +227,7 @@
         fos.write(new byte[10]);
         fos.seek(0);
 
-        is = new NIODataInputStream(fos.getChannel(), 8);
+        is = new NIODataInputStream(fos.getChannel(), 9);
 
         int remaining = 10;
         assertEquals(10, is.available());
@@ -246,6 +241,31 @@
         assertEquals(0, is.available());
     }
 
+    private static ReadableByteChannel wrap(final byte bytes[])
+    {
+        final ByteBuffer buf = ByteBuffer.wrap(bytes);
+        return new ReadableByteChannel()
+        {
+
+            @Override
+            public boolean isOpen() {return false;}
+
+            @Override
+            public void close() throws IOException {}
+
+            @Override
+            public int read(ByteBuffer dst) throws IOException
+            {
+                int read = Math.min(dst.remaining(), buf.remaining());
+                buf.limit(buf.position() + read);
+                dst.put(buf);
+                buf.limit(buf.capacity());
+                return read == 0 ? -1 : read;
+            }
+
+        };
+    }
+
     @SuppressWarnings("resource")
     @Test
     public void testReadUTF() throws Exception
@@ -264,23 +284,7 @@
         daos.writeUTF(BufferedDataOutputStreamTest.threeByte);
         daos.writeUTF(BufferedDataOutputStreamTest.fourByte);
 
-        NIODataInputStream is = new NIODataInputStream(new ReadableByteChannel()
-        {
-
-            @Override
-            public boolean isOpen() {return false;}
-
-            @Override
-            public void close() throws IOException {}
-
-            @Override
-            public int read(ByteBuffer dst) throws IOException
-            {
-                dst.put(baos.toByteArray());
-                return baos.toByteArray().length;
-            }
-
-        }, 4096);
+        NIODataInputStream is = new NIODataInputStream(wrap(baos.toByteArray()), 4096);
 
         assertEquals(simple, is.readUTF());
         assertEquals(BufferedDataOutputStreamTest.twoByte, is.readUTF());
@@ -288,6 +292,78 @@
         assertEquals(BufferedDataOutputStreamTest.fourByte, is.readUTF());
     }
 
+    @SuppressWarnings("resource")
+    @Test
+    public void testReadVInt() throws Exception {
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutputStreamPlus daos = new WrappedDataOutputStreamPlus(baos);
+
+        long values[] = new long[] {
+                0, 1, -1,
+                Long.MIN_VALUE, Long.MIN_VALUE + 1, Long.MAX_VALUE, Long.MAX_VALUE - 1,
+                Integer.MIN_VALUE, Integer.MIN_VALUE + 1, Integer.MAX_VALUE, Integer.MAX_VALUE - 1,
+                Short.MIN_VALUE, Short.MIN_VALUE + 1, Short.MAX_VALUE, Short.MAX_VALUE - 1,
+                Byte.MIN_VALUE, Byte.MIN_VALUE + 1, Byte.MAX_VALUE, Byte.MAX_VALUE - 1 };
+        values = BufferedDataOutputStreamTest.enrich(values);
+
+        for (long v : values)
+            daos.writeVInt(v);
+
+        daos.flush();
+
+        NIODataInputStream is = new NIODataInputStream(wrap(baos.toByteArray()), 9);
+
+        for (long v : values)
+            assertEquals(v, is.readVInt());
+
+        boolean threw = false;
+        try
+        {
+            is.readVInt();
+        }
+        catch (EOFException e)
+        {
+            threw = true;
+        }
+        assertTrue(threw);
+    }
+
+    @SuppressWarnings("resource")
+    @Test
+    public void testReadUnsignedVInt() throws Exception {
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutputStreamPlus daos = new WrappedDataOutputStreamPlus(baos);
+
+        long values[] = new long[] {
+                0, 1
+                , UnsignedLong.MAX_VALUE.longValue(), UnsignedLong.MAX_VALUE.longValue() - 1, UnsignedLong.MAX_VALUE.longValue() + 1
+                , UnsignedInteger.MAX_VALUE.longValue(), UnsignedInteger.MAX_VALUE.longValue() - 1, UnsignedInteger.MAX_VALUE.longValue() + 1
+                , UnsignedBytes.MAX_VALUE, UnsignedBytes.MAX_VALUE - 1, UnsignedBytes.MAX_VALUE + 1
+                , 65536, 65536 - 1, 65536 + 1 };
+        values = BufferedDataOutputStreamTest.enrich(values);
+
+        for (long v : values)
+            daos.writeUnsignedVInt(v);
+
+        daos.flush();
+
+        NIODataInputStream is = new NIODataInputStream(wrap(baos.toByteArray()), 9);
+
+        for (long v : values)
+            assertEquals(v, is.readUnsignedVInt());
+
+        boolean threw = false;
+        try
+        {
+            is.readUnsignedVInt();
+        }
+        catch (EOFException e)
+        {
+            threw = true;
+        }
+        assertTrue(threw);
+    }
+
     @Test
     public void testFuzz() throws Exception
     {
@@ -681,4 +757,104 @@
         assertEquals(totalRead, corpus.capacity());
         assertEquals(-1, dis.read());
     }
+
+
+    @Test
+    @SuppressWarnings({ "resource"})
+    public void testVIntRemainingBytes() throws Exception
+    {
+        for(int ii = 0; ii < 10; ii++)
+        {
+            for (int zz = 0; zz < 10; zz++)
+            {
+                if (zz + ii > 10)
+                    continue;
+
+                ByteBuffer buf = ByteBuffer.allocate(10);
+                buf.position(ii);
+
+                long value = 0;
+                if (ii > 0)
+                    value = (1L << 7 * zz) - 1;
+
+                BufferedDataOutputStreamPlus out = new DataOutputBufferFixed(buf);
+                out.writeUnsignedVInt(value);
+
+                buf.position(ii);
+                RebufferingInputStream in = new DataInputBuffer(buf, false);
+
+                assertEquals(value, in.readUnsignedVInt());
+            }
+        }
+    }
+
+    @Test
+    @SuppressWarnings({ "resource"})
+    public void testVIntSmallBuffer() throws Exception
+    {
+        for(int ii = 0; ii < 10; ii++)
+        {
+            ByteBuffer buf = ByteBuffer.allocate(Math.max(1,  ii));
+
+            long value = 0;
+            if (ii > 0)
+                value = (1L << 7 * ii) - 1;
+
+            BufferedDataOutputStreamPlus out = new DataOutputBufferFixed(buf);
+            out.writeUnsignedVInt(value);
+
+            buf.position(0);
+            RebufferingInputStream in = new DataInputBuffer(buf, false);
+
+            assertEquals(value, in.readUnsignedVInt());
+
+            boolean threw = false;
+            try
+            {
+                in.readUnsignedVInt();
+            }
+            catch (EOFException e)
+            {
+                threw = true;
+            }
+            assertTrue(threw);
+        }
+    }
+
+    @Test
+    @SuppressWarnings({ "resource"})
+    public void testVIntTruncationEOF() throws Exception
+    {
+        for(int ii = 0; ii < 10; ii++)
+        {
+            ByteBuffer buf = ByteBuffer.allocate(Math.max(1,  ii));
+
+            long value = 0;
+            if (ii > 0)
+                value = (1L << 7 * ii) - 1;
+
+            BufferedDataOutputStreamPlus out = new DataOutputBufferFixed(buf);
+            out.writeUnsignedVInt(value);
+
+            buf.position(0);
+
+            ByteBuffer truncated = ByteBuffer.allocate(buf.capacity() - 1);
+            buf.limit(buf.limit() - 1);
+            truncated.put(buf);
+            truncated.flip();
+
+            RebufferingInputStream in = new DataInputBuffer(truncated, false);
+
+            boolean threw = false;
+            try
+            {
+                in.readUnsignedVInt();
+            }
+            catch (EOFException e)
+            {
+                threw = true;
+            }
+            assertTrue(threw);
+        }
+    }
 }

diff --git a/test/unit/org/apache/cassandra/io/util/RandomAccessReaderTest.java b/test/unit/org/apache/cassandra/io/util/RandomAccessReaderTest.java
new file mode 100644
index 0000000..aad5117
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/RandomAccessReaderTest.java

@@ -0,0 +1,503 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.io.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileLock;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.channels.WritableByteChannel;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.junit.Assert.*;
+
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class RandomAccessReaderTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(RandomAccessReaderTest.class);
+
+    private static final class Parameters
+    {
+        public final long fileLength;
+        public final int bufferSize;
+
+        public BufferType bufferType;
+        public int maxSegmentSize;
+        public boolean mmappedRegions;
+        public byte[] expected;
+
+        Parameters(long fileLength, int bufferSize)
+        {
+            this.fileLength = fileLength;
+            this.bufferSize = bufferSize;
+            this.bufferType = BufferType.OFF_HEAP;
+            this.maxSegmentSize = MmappedRegions.MAX_SEGMENT_SIZE;
+            this.mmappedRegions = false;
+            this.expected = "The quick brown fox jumps over the lazy dog".getBytes(FileUtils.CHARSET);
+        }
+
+        public Parameters mmappedRegions(boolean mmappedRegions)
+        {
+            this.mmappedRegions = mmappedRegions;
+            return this;
+        }
+
+        public Parameters bufferType(BufferType bufferType)
+        {
+            this.bufferType = bufferType;
+            return this;
+        }
+
+        public Parameters maxSegmentSize(int maxSegmentSize)
+        {
+            this.maxSegmentSize = maxSegmentSize;
+            return this;
+        }
+
+        public Parameters expected(byte[] expected)
+        {
+            this.expected = expected;
+            return this;
+        }
+    }
+
+    @Test
+    public void testBufferedOffHeap() throws IOException
+    {
+        testReadFully(new Parameters(8192, 4096).bufferType(BufferType.OFF_HEAP));
+    }
+
+    @Test
+    public void testBufferedOnHeap() throws IOException
+    {
+        testReadFully(new Parameters(8192, 4096).bufferType(BufferType.ON_HEAP));
+    }
+
+    @Test
+    public void testBigBufferSize() throws IOException
+    {
+        testReadFully(new Parameters(8192, 65536).bufferType(BufferType.ON_HEAP));
+    }
+
+    @Test
+    public void testTinyBufferSize() throws IOException
+    {
+        testReadFully(new Parameters(8192, 16).bufferType(BufferType.ON_HEAP));
+    }
+
+    @Test
+    public void testOneSegment() throws IOException
+    {
+        testReadFully(new Parameters(8192, 4096).mmappedRegions(true));
+    }
+
+    @Test
+    public void testMultipleSegments() throws IOException
+    {
+        testReadFully(new Parameters(8192, 4096).mmappedRegions(true).maxSegmentSize(1024));
+    }
+
+    @Test
+    public void testVeryLarge() throws IOException
+    {
+        final long SIZE = 1L << 32; // 2GB
+        Parameters params = new Parameters(SIZE, 1 << 20); // 1MB
+
+        try(ChannelProxy channel = new ChannelProxy("abc", new FakeFileChannel(SIZE)))
+        {
+            RandomAccessReader.Builder builder = new RandomAccessReader.Builder(channel)
+                                                 .bufferType(params.bufferType)
+                                                 .bufferSize(params.bufferSize);
+
+            try(RandomAccessReader reader = builder.build())
+            {
+                assertEquals(channel.size(), reader.length());
+                assertEquals(channel.size(), reader.bytesRemaining());
+                assertEquals(Integer.MAX_VALUE, reader.available());
+
+                assertEquals(channel.size(), reader.skip(channel.size()));
+
+                assertTrue(reader.isEOF());
+                assertEquals(0, reader.bytesRemaining());
+            }
+        }
+    }
+
+    /** A fake file channel that simply increments the position and doesn't
+     * actually read anything. We use it to simulate very large files, > 2G.
+     */
+    private static final class FakeFileChannel extends FileChannel
+    {
+        private final long size;
+        private long position;
+
+        FakeFileChannel(long size)
+        {
+            this.size = size;
+        }
+
+        public int read(ByteBuffer dst)
+        {
+            int ret = dst.remaining();
+            position += ret;
+            dst.position(dst.limit());
+            return ret;
+        }
+
+        public long read(ByteBuffer[] dsts, int offset, int length)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public int write(ByteBuffer src)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public long write(ByteBuffer[] srcs, int offset, int length)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public long position()
+        {
+            return position;
+        }
+
+        public FileChannel position(long newPosition)
+        {
+            position = newPosition;
+            return this;
+        }
+
+        public long size()
+        {
+            return size;
+        }
+
+        public FileChannel truncate(long size)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public void force(boolean metaData)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public long transferTo(long position, long count, WritableByteChannel target)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public long transferFrom(ReadableByteChannel src, long position, long count)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public int read(ByteBuffer dst, long position)
+        {
+            int ret = dst.remaining();
+            this.position = position + ret;
+            dst.position(dst.limit());
+            return ret;
+        }
+
+        public int write(ByteBuffer src, long position)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public MappedByteBuffer map(MapMode mode, long position, long size)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public FileLock lock(long position, long size, boolean shared)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        public FileLock tryLock(long position, long size, boolean shared)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        protected void implCloseChannel()
+        {
+
+        }
+    }
+
+    private static File writeFile(Parameters params) throws IOException
+    {
+        final File f = File.createTempFile("testReadFully", "1");
+        f.deleteOnExit();
+
+        try(SequentialWriter writer = SequentialWriter.open(f))
+        {
+            long numWritten = 0;
+            while (numWritten < params.fileLength)
+            {
+                writer.write(params.expected);
+                numWritten += params.expected.length;
+            }
+
+            writer.finish();
+        }
+
+        assert f.exists();
+        assert f.length() >= params.fileLength;
+        return f;
+    }
+
+    private static void testReadFully(Parameters params) throws IOException
+    {
+        final File f = writeFile(params);
+        try(ChannelProxy channel = new ChannelProxy(f))
+        {
+            RandomAccessReader.Builder builder = new RandomAccessReader.Builder(channel)
+                                                 .bufferType(params.bufferType)
+                                                 .bufferSize(params.bufferSize);
+            if (params.mmappedRegions)
+                builder.regions(MmappedRegions.map(channel, f.length()));
+
+            try(RandomAccessReader reader = builder.build())
+            {
+                assertEquals(f.getAbsolutePath(), reader.getPath());
+                assertEquals(f.length(), reader.length());
+                assertEquals(f.length(), reader.bytesRemaining());
+                assertEquals(Math.min(Integer.MAX_VALUE, f.length()), reader.available());
+
+                byte[] b = new byte[params.expected.length];
+                long numRead = 0;
+                while (numRead < params.fileLength)
+                {
+                    reader.readFully(b);
+                    assertTrue(Arrays.equals(params.expected, b));
+                    numRead += b.length;
+                }
+
+                assertTrue(reader.isEOF());
+                assertEquals(0, reader.bytesRemaining());
+            }
+
+            if (builder.regions != null)
+                assertNull(builder.regions.close(null));
+        }
+    }
+
+    @Test
+    public void testReadBytes() throws IOException
+    {
+        File f = File.createTempFile("testReadBytes", "1");
+        final String expected = "The quick brown fox jumps over the lazy dog";
+
+        try(SequentialWriter writer = SequentialWriter.open(f))
+        {
+            writer.write(expected.getBytes());
+            writer.finish();
+        }
+
+        assert f.exists();
+
+        try(ChannelProxy channel = new ChannelProxy(f);
+            RandomAccessReader reader = new RandomAccessReader.Builder(channel).build())
+        {
+            assertEquals(f.getAbsolutePath(), reader.getPath());
+            assertEquals(expected.length(), reader.length());
+
+            ByteBuffer b = ByteBufferUtil.read(reader, expected.length());
+            assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
+
+            assertTrue(reader.isEOF());
+            assertEquals(0, reader.bytesRemaining());
+        }
+    }
+
+    @Test
+    public void testReset() throws IOException
+    {
+        File f = File.createTempFile("testMark", "1");
+        final String expected = "The quick brown fox jumps over the lazy dog";
+        final int numIterations = 10;
+
+        try(SequentialWriter writer = SequentialWriter.open(f))
+        {
+            for (int i = 0; i < numIterations; i++)
+                writer.write(expected.getBytes());
+            writer.finish();
+        }
+
+        assert f.exists();
+
+        try(ChannelProxy channel = new ChannelProxy(f);
+        RandomAccessReader reader = new RandomAccessReader.Builder(channel).build())
+        {
+            assertEquals(expected.length() * numIterations, reader.length());
+
+            ByteBuffer b = ByteBufferUtil.read(reader, expected.length());
+            assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
+
+            assertFalse(reader.isEOF());
+            assertEquals((numIterations - 1) * expected.length(), reader.bytesRemaining());
+
+            DataPosition mark = reader.mark();
+            assertEquals(0, reader.bytesPastMark());
+            assertEquals(0, reader.bytesPastMark(mark));
+
+            for (int i = 0; i < (numIterations - 1); i++)
+            {
+                b = ByteBufferUtil.read(reader, expected.length());
+                assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
+            }
+            assertTrue(reader.isEOF());
+            assertEquals(expected.length() * (numIterations - 1), reader.bytesPastMark());
+            assertEquals(expected.length() * (numIterations - 1), reader.bytesPastMark(mark));
+
+            reader.reset(mark);
+            assertEquals(0, reader.bytesPastMark());
+            assertEquals(0, reader.bytesPastMark(mark));
+            assertFalse(reader.isEOF());
+            for (int i = 0; i < (numIterations - 1); i++)
+            {
+                b = ByteBufferUtil.read(reader, expected.length());
+                assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
+            }
+
+            reader.reset();
+            assertEquals(0, reader.bytesPastMark());
+            assertEquals(0, reader.bytesPastMark(mark));
+            assertFalse(reader.isEOF());
+            for (int i = 0; i < (numIterations - 1); i++)
+            {
+                b = ByteBufferUtil.read(reader, expected.length());
+                assertEquals(expected, new String(b.array(), Charset.forName("UTF-8")));
+            }
+
+            assertTrue(reader.isEOF());
+        }
+    }
+
+    @Test
+    public void testSeekSingleThread() throws IOException, InterruptedException
+    {
+        testSeek(1);
+    }
+
+    @Test
+    public void testSeekMultipleThreads() throws IOException, InterruptedException
+    {
+        testSeek(10);
+    }
+
+    private static void testSeek(int numThreads) throws IOException, InterruptedException
+    {
+        final File f = File.createTempFile("testMark", "1");
+        final byte[] expected = new byte[1 << 16];
+
+        long seed = System.nanoTime();
+        //seed = 365238103404423L;
+        logger.info("Seed {}", seed);
+        Random r = new Random(seed);
+        r.nextBytes(expected);
+
+        try(SequentialWriter writer = SequentialWriter.open(f))
+        {
+            writer.write(expected);
+            writer.finish();
+        }
+
+        assert f.exists();
+
+        try(final ChannelProxy channel = new ChannelProxy(f))
+        {
+            final Runnable worker = () ->
+            {
+                try(RandomAccessReader reader = new RandomAccessReader.Builder(channel).build())
+                {
+                    assertEquals(expected.length, reader.length());
+
+                    ByteBuffer b = ByteBufferUtil.read(reader, expected.length);
+                    assertTrue(Arrays.equals(expected, b.array()));
+                    assertTrue(reader.isEOF());
+                    assertEquals(0, reader.bytesRemaining());
+
+                    reader.seek(0);
+                    b = ByteBufferUtil.read(reader, expected.length);
+                    assertTrue(Arrays.equals(expected, b.array()));
+                    assertTrue(reader.isEOF());
+                    assertEquals(0, reader.bytesRemaining());
+
+                    for (int i = 0; i < 10; i++)
+                    {
+                        int pos = r.nextInt(expected.length);
+                        reader.seek(pos);
+                        assertEquals(pos, reader.getPosition());
+
+                        ByteBuffer buf = ByteBuffer.wrap(expected, pos, expected.length - pos)
+                                                   .order(ByteOrder.BIG_ENDIAN);
+
+                        while (reader.bytesRemaining() > 4)
+                            assertEquals(buf.getInt(), reader.readInt());
+                    }
+
+                    reader.close();
+                }
+                catch (Exception ex)
+                {
+                    ex.printStackTrace();
+                    fail(ex.getMessage());
+                }
+            };
+
+            if (numThreads == 1)
+            {
+                worker.run();
+            }
+            else
+            {
+                ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+                for (int i = 0; i < numThreads; i++)
+                    executor.submit(worker);
+
+                executor.shutdown();
+                executor.awaitTermination(1, TimeUnit.MINUTES);
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/util/RewindableDataInputStreamPlusTest.java b/test/unit/org/apache/cassandra/io/util/RewindableDataInputStreamPlusTest.java
new file mode 100644
index 0000000..175ab53
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/RewindableDataInputStreamPlusTest.java

@@ -0,0 +1,539 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.IOException;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class RewindableDataInputStreamPlusTest
+{
+
+    private final int INITIAL_BUFFER_SIZE = 1;
+
+    private File file;
+
+    @Before
+    public void setup() throws Exception
+    {
+        this.file = new File(System.getProperty("java.io.tmpdir"), "subdir/test.buffer");
+    }
+
+    @Test
+    public void testMarkAndResetSimple() throws Exception
+    {
+        byte[] testData;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            // boolean
+            out.writeBoolean(true);
+            // byte
+            out.writeByte(0x1);
+            // char
+            out.writeChar('a');
+            // short
+            out.writeShort(1);
+            // int
+            out.writeInt(1);
+            // long
+            out.writeLong(1L);
+            // float
+            out.writeFloat(1.0f);
+            // double
+            out.writeDouble(1.0d);
+
+            // String
+            out.writeUTF("abc");
+            testData = baos.toByteArray();
+        }
+
+        for (int memCapacity = 0; memCapacity <= 16; memCapacity++)
+        {
+            int diskCapacity = 16 - memCapacity;
+            try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                          INITIAL_BUFFER_SIZE, memCapacity, file,
+                                                                                          diskCapacity))
+            {
+                try {
+                    //should mark before resetting
+                    reader.reset(null);
+                    fail("Should have thrown IOException");
+                } catch (IOException e) {}
+
+                assertTrue(reader.readBoolean());
+
+                reader.mark();
+
+                try {
+                    //cannot mark already marked stream
+                    reader.mark();
+                    fail("Should have thrown IllegalStateException");
+                } catch (IllegalStateException e) {}
+
+                assertEquals(0x1, reader.readByte());
+                assertEquals('a', reader.readChar());
+                assertEquals(3, reader.bytesPastMark(null));
+                reader.reset(null);
+
+                try {
+                    //cannot mark when reading from cache
+                    reader.mark();
+                    fail("Should have thrown IllegalStateException");
+                } catch (IllegalStateException e) {}
+
+                //read again previous sequence
+                assertEquals(0x1, reader.readByte());
+                assertEquals('a', reader.readChar());
+                //finish reading again previous sequence
+                assertEquals(1, reader.readShort());
+
+                reader.mark();
+                assertEquals(1, reader.readInt());
+                assertEquals(1L, reader.readLong());
+                assertEquals(1.0f, reader.readFloat(), 0);
+                assertEquals(16, reader.bytesPastMark(null));
+                reader.reset(null);
+
+                //read again previous sequence
+                assertEquals(1, reader.readInt());
+                assertEquals(1L, reader.readLong());
+                assertEquals(1.0f, reader.readFloat(), 0);
+                //finish reading again previous sequence
+
+                //mark again
+                reader.mark();
+                assertEquals(1.0d, reader.readDouble(), 0);
+                assertEquals(8, reader.bytesPastMark(null));
+                reader.reset(null);
+
+                //read again previous sequence
+                assertEquals(1.0d, reader.readDouble(), 0);
+                //finish reading again previous sequence
+
+                //mark and reset
+                reader.mark();
+                reader.reset(null);
+
+                assertEquals("abc", reader.readUTF());
+
+                //check max file size
+                assertEquals(diskCapacity, file.length());
+            }
+            assertFalse(file.exists());
+        }
+    }
+
+    @Test
+    public void testVeryLargeCapacity() throws Exception
+    {
+        byte[] testData;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            out.writeUTF("abc");
+            testData = baos.toByteArray();
+        }
+
+        try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                      INITIAL_BUFFER_SIZE, Integer.MAX_VALUE, file,
+                                                                                      Integer.MAX_VALUE))
+        {
+            reader.mark();
+            assertEquals("abc", reader.readUTF());
+            reader.reset();
+            assertEquals("abc", reader.readUTF());
+        }
+        assertFalse(file.exists());
+
+
+        baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            out.writeBoolean(true);
+            out.writeBoolean(true);
+            testData = baos.toByteArray();
+        }
+    }
+
+    @Test
+    public void testMarkAndResetBigBuffer() throws Exception
+    {
+        byte[] testData;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            // boolean
+            out.writeBoolean(true);
+            // byte
+            out.writeByte(0x1);
+            // char
+            out.writeChar('a');
+            // short
+            out.writeShort(1);
+            // int
+            out.writeInt(1);
+            // long
+            out.writeLong(1L);
+            // float
+            out.writeFloat(1.0f);
+            // double
+            out.writeDouble(1.0d);
+
+            // String
+            out.writeUTF("abc");
+            testData = baos.toByteArray();
+
+            // 1 (boolean) + 1 (byte) + 2 (char) + 2 (short) + 4 (int) + 8 (long)
+            // + 4 (float) + 8 (double) + 5 bytes (utf string)
+        }
+
+        for (int memCapacity = 0; memCapacity <= 18; memCapacity++)
+        {
+            int diskCapacity = 18 - memCapacity;
+            try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                          INITIAL_BUFFER_SIZE, memCapacity, file,
+                                                                                          diskCapacity))
+            {
+                //read a big amount before resetting
+                reader.mark();
+                assertTrue(reader.readBoolean());
+                assertEquals(0x1, reader.readByte());
+                assertEquals('a', reader.readChar());
+                assertEquals(1, reader.readShort());
+                assertEquals(1, reader.readInt());
+                assertEquals(1L, reader.readLong());
+                reader.reset();
+
+                //read from buffer
+                assertTrue(reader.readBoolean());
+                assertEquals(0x1, reader.readByte());
+                assertEquals('a', reader.readChar());
+                assertEquals(1, reader.readShort());
+                assertEquals(1, reader.readInt());
+                assertEquals(1L, reader.readLong());
+
+                assertEquals(17, reader.available());
+
+                //mark again
+                reader.mark();
+                assertEquals(1.0f, reader.readFloat(), 0);
+                assertEquals(1.0d, reader.readDouble(), 0);
+                assertEquals("abc", reader.readUTF());
+                reader.reset();
+
+                assertEquals(17, reader.available());
+
+                assertEquals(1.0f, reader.readFloat(), 0);
+                assertEquals(1.0d, reader.readDouble(), 0);
+                assertEquals("abc", reader.readUTF());
+            }
+            assertFalse(file.exists());
+        }
+    }
+
+
+    @Test
+    public void testCircularSpillFile() throws Exception
+    {
+        byte[] testData;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            // boolean
+            out.writeBoolean(true);
+            // byte
+            out.writeByte(0x1);
+            // char
+            out.writeChar('a');
+            // short
+            out.writeShort(1);
+            // int
+            out.writeInt(1);
+
+            // String
+            out.writeUTF("ab");
+            testData = baos.toByteArray();
+
+            // 1 (boolean) + 1 (byte) + 2 (char) + 2 (short) + 4 (int) + 4 bytes (utf string)
+        }
+
+        //read at most 4 bytes multiple times (and then check file size)
+        int MEM_SIZE = 0;
+        int DISK_SIZE = 4;
+        try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                      INITIAL_BUFFER_SIZE, MEM_SIZE, file,
+                                                                                      DISK_SIZE))
+        {
+            //read 2 bytes and reset
+            reader.mark();
+            assertTrue(reader.readBoolean());
+            assertEquals(0x1, reader.readByte());
+            assertEquals(2, reader.bytesPastMark(null));
+            reader.reset();
+
+            //read again previous sequence
+            assertTrue(reader.readBoolean());
+            assertEquals(0x1, reader.readByte());
+            //finish reading again previous sequence
+
+            //read 4 bytes and reset
+            reader.mark();
+            assertEquals('a', reader.readChar());
+            assertEquals(1, reader.readShort());
+            assertEquals(4, reader.bytesPastMark(null));
+            reader.reset();
+
+            //read again previous sequence
+            assertEquals('a', reader.readChar());
+            assertEquals(1, reader.readShort());
+            //finish reading again previous sequence
+
+            //read 4 bytes and reset
+            reader.mark();
+            assertEquals(1, reader.readInt());
+            assertEquals(4, reader.bytesPastMark(null));
+            reader.reset();
+
+            //read again previous sequence
+            assertEquals(1, reader.readInt());
+
+            //check max file size
+            assertEquals(DISK_SIZE, file.length());
+        }
+        assertFalse(file.exists());
+    }
+
+    @Test
+    public void testExhaustCapacity() throws Exception
+    {
+        byte[] testData;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            // boolean
+            out.writeBoolean(true);
+            // byte
+            out.writeByte(0x1);
+            // char
+            out.writeChar('a');
+            // short
+            out.writeShort(1);
+            testData = baos.toByteArray();
+        }
+
+        //test capacity exhausted when reading more than 4 bytes
+        testCapacityExhausted(testData, 0, 2);
+        testCapacityExhausted(testData, 2, 0);
+        testCapacityExhausted(testData, 1, 1);
+    }
+
+    private void testCapacityExhausted(byte[] testData, int memSize, int diskSize) throws IOException
+    {
+        try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                      INITIAL_BUFFER_SIZE, memSize, file,
+                                                                                      diskSize))
+        {
+            //read 2 bytes and reset
+            reader.mark();
+            assertTrue(reader.readBoolean());
+            assertEquals(0x1, reader.readByte());
+            assertEquals(2, reader.bytesPastMark(null));
+            reader.reset();
+
+            //read again previous sequence
+            assertTrue(reader.readBoolean());
+            assertEquals(0x1, reader.readByte());
+            //finish reading again previous sequence
+
+            reader.mark();
+            //read 3 bytes - OK
+            assertEquals('a', reader.readChar());
+            //read 1 more bytes - CAPACITY will exhaust when trying to reset :(
+            assertEquals(1, reader.readShort());
+
+            try
+            {
+                reader.reset();
+                fail("Should have thrown IOException");
+            }
+            catch (IOException e) {}
+
+            //check max file size
+            assertEquals(diskSize, file.length());
+        }
+        assertFalse(file.exists());
+    }
+
+    @Test
+    public void testMarkAndResetUnsignedRead() throws Exception
+    {
+        byte[] testData;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (DataOutputStream out = new DataOutputStream(baos))
+        {
+            // byte
+            out.writeByte(0x1);
+            // short
+            out.writeShort(2);
+            testData = baos.toByteArray();
+        }
+
+        for (int memCapacity = 0; memCapacity <= 1; memCapacity++)
+        {
+            int diskCapacity = 1 - memCapacity;
+            try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                          INITIAL_BUFFER_SIZE, memCapacity, file,
+                                                                                          diskCapacity))
+            {
+                reader.mark();
+                assertEquals(1, reader.readUnsignedByte());
+                reader.reset();
+                assertEquals(1, reader.readUnsignedByte());
+
+                //will read first byte of short 2
+                reader.mark();
+                assertEquals(0, reader.readUnsignedByte());
+                reader.reset();
+
+                assertEquals(2, reader.readUnsignedShort());
+
+                reader.mark();
+                reader.reset();
+                assertEquals(0, reader.available());
+            }
+        }
+        assertFalse(file.exists());
+    }
+
+    @Test
+    public void testMarkAndResetSkipBytes() throws Exception
+    {
+        String testStr = "1234567890";
+        byte[] testData = testStr.getBytes();
+
+        for (int memCapacity = 0; memCapacity <= 7; memCapacity++)
+        {
+            int diskCapacity = 7 - memCapacity;
+            try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                          INITIAL_BUFFER_SIZE, memCapacity, file,
+                                                                                          diskCapacity))
+            {
+                reader.mark();
+                // read first 5 bytes and rewind
+                byte[] out = new byte[5];
+                reader.readFully(out, 0, 5);
+                assertEquals("12345", new String(out));
+
+                // then skip 2 bytes (67)
+                reader.skipBytes(2);
+
+                assertEquals(7, reader.bytesPastMark(null));
+                reader.reset();
+
+                //now read part of the previously skipped bytes
+                out = new byte[5];
+                reader.readFully(out);
+                assertEquals("12345", new String(out));
+
+                //skip 3 bytes (2 from cache, 1 from stream)
+                reader.skip(3);
+
+                // mark and read 2 more bytes
+                reader.mark();
+                out = new byte[2];
+                reader.readFully(out);
+                assertEquals("90", new String(out));
+                assertEquals(0, reader.available());
+                reader.reset();
+
+                //reset and read only the next byte "9" in the third position
+                reader.readFully(out, 1, 1);
+                assertEquals("99", new String(out));
+
+                //now we read the remainder via readline
+                assertEquals(1, reader.available());
+                assertEquals("0", reader.readLine());
+
+            }
+            assertFalse(file.exists());
+        }
+    }
+
+    @Test
+    public void testMarkAndResetReadFully() throws Exception
+    {
+        String testStr = "1234567890";
+        byte[] testData = testStr.getBytes();
+
+        for (int memCapacity = 0; memCapacity <= 5; memCapacity++)
+        {
+            int diskCapacity = 5 - memCapacity;
+            try (RewindableDataInputStreamPlus reader = new RewindableDataInputStreamPlus(new ByteArrayInputStream(testData),
+                                                                                          INITIAL_BUFFER_SIZE, memCapacity, file,
+                                                                                          diskCapacity))
+            {
+                reader.mark();
+                // read first 5 bytes and rewind
+                byte[] out = new byte[5];
+                reader.readFully(out, 0, 5);
+                assertEquals("12345", new String(out));
+                reader.reset();
+
+                // read half from cache, half from parent stream
+                out = new byte[7];
+                reader.readFully(out);
+                assertEquals("1234567", new String(out));
+
+                // mark and read 3 more bytes
+                reader.mark();
+                out = new byte[3];
+                reader.readFully(out);
+                assertEquals("890", new String(out));
+                assertEquals(0, reader.available());
+                reader.reset();
+
+                //reset and read only the next byte "8" in the third position
+                reader.readFully(out, 2, 1);
+                assertEquals("898", new String(out));
+
+                //now we read the remainder via readline
+                assertEquals(2, reader.available());
+                assertEquals("90", reader.readLine());
+            }
+            assertFalse(file.exists());
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/util/SegmentedFileTest.java b/test/unit/org/apache/cassandra/io/util/SegmentedFileTest.java
new file mode 100644
index 0000000..03c10de
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/SegmentedFileTest.java

@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+import static org.junit.Assert.assertEquals;
+
+public class SegmentedFileTest
+{
+    @Test
+    public void testRoundingBufferSize()
+    {
+        assertEquals(4096, SegmentedFile.Builder.roundBufferSize(-1L));
+        assertEquals(4096, SegmentedFile.Builder.roundBufferSize(0));
+        assertEquals(4096, SegmentedFile.Builder.roundBufferSize(1));
+        assertEquals(4096, SegmentedFile.Builder.roundBufferSize(2013));
+        assertEquals(4096, SegmentedFile.Builder.roundBufferSize(4095));
+        assertEquals(4096, SegmentedFile.Builder.roundBufferSize(4096));
+        assertEquals(8192, SegmentedFile.Builder.roundBufferSize(4097));
+        assertEquals(8192, SegmentedFile.Builder.roundBufferSize(8191));
+        assertEquals(8192, SegmentedFile.Builder.roundBufferSize(8192));
+        assertEquals(12288, SegmentedFile.Builder.roundBufferSize(8193));
+        assertEquals(65536, SegmentedFile.Builder.roundBufferSize(65535));
+        assertEquals(65536, SegmentedFile.Builder.roundBufferSize(65536));
+        assertEquals(65536, SegmentedFile.Builder.roundBufferSize(65537));
+        assertEquals(65536, SegmentedFile.Builder.roundBufferSize(10000000000000000L));
+    }
+
+    @Test
+    public void testBufferSize_ssd()
+    {
+        DatabaseDescriptor.setDiskOptimizationStrategy(Config.DiskOptimizationStrategy.ssd);
+        DatabaseDescriptor.setDiskOptimizationPageCrossChance(0.1);
+
+        assertEquals(4096, SegmentedFile.Builder.bufferSize(0));
+        assertEquals(4096, SegmentedFile.Builder.bufferSize(10));
+        assertEquals(4096, SegmentedFile.Builder.bufferSize(100));
+        assertEquals(4096, SegmentedFile.Builder.bufferSize(4096));
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(4505));   // just < (4096 + 4096 * 0.1)
+        assertEquals(12288, SegmentedFile.Builder.bufferSize(4506));  // just > (4096 + 4096 * 0.1)
+
+        DatabaseDescriptor.setDiskOptimizationPageCrossChance(0.5);
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(4506));  // just > (4096 + 4096 * 0.1)
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(6143));  // < (4096 + 4096 * 0.5)
+        assertEquals(12288, SegmentedFile.Builder.bufferSize(6144));  // = (4096 + 4096 * 0.5)
+        assertEquals(12288, SegmentedFile.Builder.bufferSize(6145));  // > (4096 + 4096 * 0.5)
+
+        DatabaseDescriptor.setDiskOptimizationPageCrossChance(1.0); // never add a page
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(8191));
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(8192));
+
+        DatabaseDescriptor.setDiskOptimizationPageCrossChance(0.0); // always add a page
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(10));
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(4096));
+    }
+
+    @Test
+    public void testBufferSize_spinning()
+    {
+        DatabaseDescriptor.setDiskOptimizationStrategy(Config.DiskOptimizationStrategy.spinning);
+
+        assertEquals(4096, SegmentedFile.Builder.bufferSize(0));
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(10));
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(100));
+        assertEquals(8192, SegmentedFile.Builder.bufferSize(4096));
+        assertEquals(12288, SegmentedFile.Builder.bufferSize(4097));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/io/util/SequentialWriterTest.java b/test/unit/org/apache/cassandra/io/util/SequentialWriterTest.java
index 15d6160..4d75103 100644
--- a/test/unit/org/apache/cassandra/io/util/SequentialWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/util/SequentialWriterTest.java

@@ -103,7 +103,6 @@
         protected void assertAborted() throws Exception
         {
             Assert.assertFalse(writer.isOpen());
-            Assert.assertFalse(file.exists());
         }
 
         protected void assertCommitted() throws Exception
@@ -132,22 +131,22 @@
             // write bytes greather than buffer
             writer.write(toWrite);
             assertEquals(bufferSize, writer.getLastFlushOffset());
-            assertEquals(writeSize, writer.getFilePointer());
+            assertEquals(writeSize, writer.position());
             // mark thi position
-            FileMark pos = writer.mark();
+            DataPosition pos = writer.mark();
             // write another
             writer.write(toWrite);
             // another buffer should be flushed
             assertEquals(bufferSize * 2, writer.getLastFlushOffset());
-            assertEquals(writeSize * 2, writer.getFilePointer());
+            assertEquals(writeSize * 2, writer.position());
             // reset writer
             writer.resetAndTruncate(pos);
             // current position and flushed size should be changed
-            assertEquals(writeSize, writer.getFilePointer());
+            assertEquals(writeSize, writer.position());
             assertEquals(writeSize, writer.getLastFlushOffset());
             // write another byte less than buffer
             writer.write(new byte[]{0});
-            assertEquals(writeSize + 1, writer.getFilePointer());
+            assertEquals(writeSize + 1, writer.position());
             // flush off set should not be increase
             assertEquals(writeSize, writer.getLastFlushOffset());
             writer.finish();

diff --git a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
index bc5be46..5dc34df 100644
--- a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java

@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.locator;
 
 import java.io.IOException;
@@ -27,13 +28,13 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.VersionedValue;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.db.Keyspace;
 
 import static org.junit.Assert.assertEquals;
 
@@ -44,6 +45,7 @@
     @BeforeClass
     public static void setup() throws Exception
     {
+        System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.setDaemonInitialized();
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();

diff --git a/test/unit/org/apache/cassandra/locator/DynamicEndpointSnitchTest.java b/test/unit/org/apache/cassandra/locator/DynamicEndpointSnitchTest.java
index d27edbf..af7dc17 100644
--- a/test/unit/org/apache/cassandra/locator/DynamicEndpointSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/DynamicEndpointSnitchTest.java

@@ -1,21 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 package org.apache.cassandra.locator;
 
@@ -23,10 +22,10 @@
 import java.net.InetAddress;
 import java.util.*;
 
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.service.StorageService;
 import org.junit.Test;
 
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertEquals;

diff --git a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
index 32383d9..9d078ce 100644
--- a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.locator;
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -9,17 +7,17 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
+package org.apache.cassandra.locator;
+
 
 import java.io.IOException;
 import java.net.InetAddress;
@@ -52,6 +50,7 @@
     @BeforeClass
     public static void setup() throws Exception
     {
+        System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.setDaemonInitialized();
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();

diff --git a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
index f2450f4..04b71e9 100644
--- a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java

@@ -1,6 +1,4 @@
-package org.apache.cassandra.locator;
 /*
- *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -9,17 +7,17 @@
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
- *   http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
+package org.apache.cassandra.locator;
+
 
 import java.io.IOException;
 import java.net.InetAddress;
@@ -48,6 +46,7 @@
     @BeforeClass
     public static void setup() throws Exception
     {
+        System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.setDaemonInitialized();
         SchemaLoader.mkdirs();
         SchemaLoader.cleanup();

diff --git a/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java b/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java
index 20608ba..61c179f 100644
--- a/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/GossipingPropertyFileSnitchTest.java

@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.locator;
 
 import java.net.InetAddress;

diff --git a/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java b/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java
index a3ac416..bbfdd3b 100644
--- a/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java
+++ b/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java

@@ -1,21 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 package org.apache.cassandra.locator;
 
@@ -29,19 +28,17 @@
 import java.util.Map;
 import java.util.Set;
 
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
 import org.junit.Assert;
-
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken;
 import org.apache.cassandra.dht.Token;
-
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
+import org.apache.cassandra.exceptions.ConfigurationException;
 
 public class NetworkTopologyStrategyTest
 {

diff --git a/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java b/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java
index eceb847..6eb08c4 100644
--- a/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java
+++ b/test/unit/org/apache/cassandra/locator/OldNetworkTopologyStrategyTest.java

@@ -1,45 +1,43 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.locator;
 
-import static org.junit.Assert.assertEquals;
-
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.cassandra.config.KSMetaData;
+import org.junit.Before;
+import org.junit.Test;
+
 import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.Pair;
 
-import org.junit.Before;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
 
 public class OldNetworkTopologyStrategyTest
 {
@@ -58,14 +56,14 @@
     /**
      * 4 same rack endpoints
      *
-     * @throws UnknownHostException
+     * @throws java.net.UnknownHostException
      */
     @Test
     public void testBigIntegerEndpointsA() throws UnknownHostException
     {
         RackInferringSnitch endpointSnitch = new RackInferringSnitch();
 
-        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tmd, endpointSnitch, KSMetaData.optsWithRF(1));
+        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tmd, endpointSnitch, optsWithRF(1));
         addEndpoint("0", "5", "254.0.0.1");
         addEndpoint("10", "15", "254.0.0.2");
         addEndpoint("20", "25", "254.0.0.3");
@@ -83,14 +81,14 @@
      * 3 same rack endpoints
      * 1 external datacenter
      *
-     * @throws UnknownHostException
+     * @throws java.net.UnknownHostException
      */
     @Test
     public void testBigIntegerEndpointsB() throws UnknownHostException
     {
         RackInferringSnitch endpointSnitch = new RackInferringSnitch();
 
-        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tmd, endpointSnitch, KSMetaData.optsWithRF(1));
+        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tmd, endpointSnitch, optsWithRF(1));
         addEndpoint("0", "5", "254.0.0.1");
         addEndpoint("10", "15", "254.0.0.2");
         addEndpoint("20", "25", "254.1.0.3");
@@ -109,14 +107,14 @@
      * 1 same datacenter, different rack endpoints
      * 1 external datacenter
      *
-     * @throws UnknownHostException
+     * @throws java.net.UnknownHostException
      */
     @Test
     public void testBigIntegerEndpointsC() throws UnknownHostException
     {
         RackInferringSnitch endpointSnitch = new RackInferringSnitch();
 
-        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tmd, endpointSnitch, KSMetaData.optsWithRF(1));
+        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tmd, endpointSnitch, optsWithRF(1));
         addEndpoint("0", "5", "254.0.0.1");
         addEndpoint("10", "15", "254.0.0.2");
         addEndpoint("20", "25", "254.0.1.3");
@@ -167,7 +165,7 @@
     /**
      * test basic methods to move a node. For sure, it's not the best place, but it's easy to test
      *
-     * @throws UnknownHostException
+     * @throws java.net.UnknownHostException
      */
     @Test
     public void testMoveLeft() throws UnknownHostException
@@ -359,7 +357,7 @@
 
         TokenMetadata tokenMetadataCurrent = initTokenMetadata(tokens);
         TokenMetadata tokenMetadataAfterMove = initTokenMetadata(tokensAfterMove);
-        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tokenMetadataCurrent, endpointSnitch, KSMetaData.optsWithRF(2));
+        AbstractReplicationStrategy strategy = new OldNetworkTopologyStrategy("Keyspace1", tokenMetadataCurrent, endpointSnitch, optsWithRF(2));
 
         Collection<Range<Token>> currentRanges = strategy.getAddressRanges().get(movingNode);
         Collection<Range<Token>> updatedRanges = strategy.getPendingAddressRanges(tokenMetadataAfterMove, tokensAfterMove[movingNodeIdx], movingNode);
@@ -369,5 +367,8 @@
         return ranges;
     }
 
-
+    private static Map<String, String> optsWithRF(int rf)
+    {
+        return Collections.singletonMap("replication_factor", Integer.toString(rf));
+    }
 }

diff --git a/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java b/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java
index e9a307b..29ea4d5 100644
--- a/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java

@@ -64,6 +64,7 @@
     @Before
     public void setup() throws ConfigurationException, IOException
     {
+        System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         String confFile = FBUtilities.resourceToFile(PropertyFileSnitch.SNITCH_PROPERTIES_FILENAME);
         effectiveFile = Paths.get(confFile);
         backupFile = Paths.get(confFile + ".bak");

diff --git a/test/unit/org/apache/cassandra/locator/ReplicationStrategyEndpointCacheTest.java b/test/unit/org/apache/cassandra/locator/ReplicationStrategyEndpointCacheTest.java
index 093de9b..c811811 100644
--- a/test/unit/org/apache/cassandra/locator/ReplicationStrategyEndpointCacheTest.java
+++ b/test/unit/org/apache/cassandra/locator/ReplicationStrategyEndpointCacheTest.java

@@ -1,37 +1,38 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.locator;
 
 import java.net.InetAddress;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 import org.apache.commons.lang3.StringUtils;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.KeyspaceParams;
 
 public class ReplicationStrategyEndpointCacheTest
 {
@@ -45,8 +46,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(5));
+                                    KeyspaceParams.simple(5));
     }
 
     public void setup(Class stratClass, Map<String, String> strategyOptions) throws Exception
@@ -175,11 +175,11 @@
     private AbstractReplicationStrategy getStrategyWithNewTokenMetadata(AbstractReplicationStrategy strategy, TokenMetadata newTmd) throws ConfigurationException
     {
         return AbstractReplicationStrategy.createReplicationStrategy(
-                strategy.keyspaceName,
-                AbstractReplicationStrategy.getClass(strategy.getClass().getName()),
-                newTmd,
-                strategy.snitch,
-                strategy.configOptions);
+                                                                    strategy.keyspaceName,
+                                                                    AbstractReplicationStrategy.getClass(strategy.getClass().getName()),
+                                                                    newTmd,
+                                                                    strategy.snitch,
+                                                                    strategy.configOptions);
     }
 
 }

diff --git a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
index 61255f3..0955985 100644
--- a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
+++ b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java

@@ -1,22 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.locator;
 
 import java.net.InetAddress;
@@ -30,17 +28,22 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.dht.*;
-import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.OrderPreservingPartitioner;
 import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken;
+import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.PendingRangeCalculatorService;
 import org.apache.cassandra.service.StorageServiceAccessor;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 public class SimpleStrategyTest
 {
@@ -50,9 +53,7 @@
     public static void defineSchema() throws Exception
     {
         SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1));
+        SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1));
     }
 
     @Test
@@ -82,7 +83,7 @@
         List<Token> keyTokens = new ArrayList<Token>();
         for (int i = 0; i < 5; i++) {
             endpointTokens.add(new StringToken(String.valueOf((char)('a' + i * 2))));
-            keyTokens.add(partitioner.getToken(ByteBufferUtil.bytes(String.valueOf((char)('a' + i * 2 + 1)))));
+            keyTokens.add(partitioner.getToken(ByteBufferUtil.bytes(String.valueOf((char) ('a' + i * 2 + 1)))));
         }
         verifyGetNaturalEndpoints(endpointTokens.toArray(new Token[0]), keyTokens.toArray(new Token[0]));
     }
@@ -93,7 +94,7 @@
     {
         TokenMetadata tmd;
         AbstractReplicationStrategy strategy;
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             tmd = new TokenMetadata();
             strategy = getStrategy(keyspaceName, tmd);
@@ -148,7 +149,7 @@
         tmd.addBootstrapToken(bsToken, bootstrapEndpoint);
 
         AbstractReplicationStrategy strategy = null;
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             strategy = getStrategy(keyspaceName, tmd);
 
@@ -180,12 +181,12 @@
 
     private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd)
     {
-        KSMetaData ksmd = Schema.instance.getKSMetaData(keyspaceName);
+        KeyspaceMetadata ksmd = Schema.instance.getKSMetaData(keyspaceName);
         return AbstractReplicationStrategy.createReplicationStrategy(
-                keyspaceName,
-                ksmd.strategyClass,
-                tmd,
-                new SimpleSnitch(),
-                ksmd.strategyOptions);
+                                                                    keyspaceName,
+                                                                    ksmd.params.replication.klass,
+                                                                    tmd,
+                                                                    new SimpleSnitch(),
+                                                                    ksmd.params.replication.options);
     }
 }

diff --git a/test/unit/org/apache/cassandra/locator/TokenMetadataTest.java b/test/unit/org/apache/cassandra/locator/TokenMetadataTest.java
index fc8095d..dab7082 100644
--- a/test/unit/org/apache/cassandra/locator/TokenMetadataTest.java
+++ b/test/unit/org/apache/cassandra/locator/TokenMetadataTest.java

@@ -1,21 +1,20 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.cassandra.locator;
 
 import java.net.InetAddress;
@@ -23,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Map;
 
+import com.google.common.collect.ImmutableMultimap;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.Multimap;
 
@@ -30,18 +30,19 @@
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
-import static junit.framework.Assert.assertNotNull;
-import static org.junit.Assert.assertEquals;
-
-import static org.apache.cassandra.Util.token;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.service.StorageService;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import static org.apache.cassandra.Util.token;
+
+
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class TokenMetadataTest
 {
@@ -139,7 +140,7 @@
         assertTrue(allEndpoints.get(DATA_CENTER).contains(first));
         assertTrue(allEndpoints.get(DATA_CENTER).contains(second));
 
-        Map<String, Multimap<String, InetAddress>> racks = topology.getDatacenterRacks();
+        Map<String, ImmutableMultimap<String, InetAddress>> racks = topology.getDatacenterRacks();
         assertNotNull(racks);
         assertTrue(racks.size() == 1);
         assertTrue(racks.containsKey(DATA_CENTER));
@@ -171,7 +172,7 @@
         });
 
         tokenMetadata.updateTopology(first);
-        tokenMetadata.updateTopology(second);
+        topology = tokenMetadata.updateTopology(second);
 
         allEndpoints = topology.getDatacenterEndpoints();
         assertNotNull(allEndpoints);
@@ -237,7 +238,7 @@
         assertTrue(allEndpoints.get(DATA_CENTER).contains(first));
         assertTrue(allEndpoints.get(DATA_CENTER).contains(second));
 
-        Map<String, Multimap<String, InetAddress>> racks = topology.getDatacenterRacks();
+        Map<String, ImmutableMultimap<String, InetAddress>> racks = topology.getDatacenterRacks();
         assertNotNull(racks);
         assertTrue(racks.size() == 1);
         assertTrue(racks.containsKey(DATA_CENTER));
@@ -268,7 +269,7 @@
             }
         });
 
-        tokenMetadata.updateTopology();
+        topology = tokenMetadata.updateTopology();
 
         allEndpoints = topology.getDatacenterEndpoints();
         assertNotNull(allEndpoints);

diff --git a/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java b/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java
index a357d24..099a530 100644
--- a/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java
+++ b/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java

@@ -15,14 +15,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.metrics;
 
 import java.io.IOException;
 
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
 import com.datastax.driver.core.Cluster;
 import com.datastax.driver.core.PreparedStatement;
 import com.datastax.driver.core.Session;
-
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -33,10 +37,6 @@
 
 import static junit.framework.Assert.assertEquals;
 
-import org.junit.BeforeClass;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class CQLMetricsTest extends SchemaLoader
 {
@@ -44,7 +44,6 @@
 
     private static Cluster cluster;
     private static Session session;
-    private static PreparedStatement metricsStatement;
 
     @BeforeClass()
     public static void setup() throws ConfigurationException, IOException
@@ -64,15 +63,16 @@
     @Test
     public void testPreparedStatementsCount()
     {
-        assertEquals(0, (int) QueryProcessor.metrics.preparedStatementsCount.getValue());
-        metricsStatement = session.prepare("INSERT INTO junit.metricstest (id, val) VALUES (?, ?)");
-        assertEquals(1, (int) QueryProcessor.metrics.preparedStatementsCount.getValue());
+        int n = (int) QueryProcessor.metrics.preparedStatementsCount.getValue();
+        session.prepare("SELECT * FROM junit.metricstest WHERE id = ?");
+        assertEquals(n+1, (int) QueryProcessor.metrics.preparedStatementsCount.getValue());
     }
 
     @Test
     public void testRegularStatementsExecuted()
     {
         clearMetrics();
+        PreparedStatement metricsStatement = session.prepare("INSERT INTO junit.metricstest (id, val) VALUES (?, ?)");
 
         assertEquals(0, QueryProcessor.metrics.preparedStatementsExecuted.getCount());
         assertEquals(0, QueryProcessor.metrics.regularStatementsExecuted.getCount());
@@ -88,6 +88,7 @@
     public void testPreparedStatementsExecuted()
     {
         clearMetrics();
+        PreparedStatement metricsStatement = session.prepare("INSERT INTO junit.metricstest (id, val) VALUES (?, ?)");
 
         assertEquals(0, QueryProcessor.metrics.preparedStatementsExecuted.getCount());
         assertEquals(0, QueryProcessor.metrics.regularStatementsExecuted.getCount());
@@ -103,6 +104,7 @@
     public void testPreparedStatementsRatio()
     {
         clearMetrics();
+        PreparedStatement metricsStatement = session.prepare("INSERT INTO junit.metricstest (id, val) VALUES (?, ?)");
 
         assertEquals(Double.NaN, QueryProcessor.metrics.preparedStatementsRatio.getValue());
 

diff --git a/test/unit/org/apache/cassandra/metrics/HintedHandOffMetricsTest.java b/test/unit/org/apache/cassandra/metrics/HintedHandOffMetricsTest.java
new file mode 100644
index 0000000..1d3863a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/metrics/HintedHandOffMetricsTest.java

@@ -0,0 +1,63 @@
+/*
+ * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * 
+ */
+package org.apache.cassandra.metrics;
+
+import java.net.InetAddress;
+import java.util.Map;
+import java.util.UUID;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.hints.HintsService;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+
+public class HintedHandOffMetricsTest
+{
+    @BeforeClass
+    public static void initDD()
+    {
+        DatabaseDescriptor.setDaemonInitialized();
+    }
+
+    @Test
+    public void testHintsMetrics() throws Exception
+    {
+        DatabaseDescriptor.getHintsDirectory().mkdirs();
+
+        for (int i = 0; i < 99; i++)
+            HintsService.instance.metrics.incrPastWindow(InetAddress.getByName("127.0.0.1"));
+        HintsService.instance.metrics.log();
+
+        UntypedResultSet rows = executeInternal("SELECT hints_dropped FROM system." + SystemKeyspace.PEER_EVENTS);
+        Map<UUID, Integer> returned = rows.one().getMap("hints_dropped", UUIDType.instance, Int32Type.instance);
+        assertEquals(Iterators.getLast(returned.values().iterator()).intValue(), 99);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/metrics/LatencyMetricsTest.java b/test/unit/org/apache/cassandra/metrics/LatencyMetricsTest.java
index ae4b733..62cb88e 100644
--- a/test/unit/org/apache/cassandra/metrics/LatencyMetricsTest.java
+++ b/test/unit/org/apache/cassandra/metrics/LatencyMetricsTest.java

@@ -15,9 +15,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.metrics;
 
 import org.junit.Test;
+
 import static junit.framework.Assert.assertFalse;
 
 public class LatencyMetricsTest

diff --git a/test/unit/org/apache/cassandra/net/Matcher.java b/test/unit/org/apache/cassandra/net/Matcher.java
new file mode 100644
index 0000000..cd1b667
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/Matcher.java

@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import java.net.InetAddress;
+
+/**
+ * Predicate based on intercepted, outgoing messange and the message's destination address.
+ */
+public interface Matcher<T>
+{
+    /**
+     * @param obj intercepted outgoing message
+     * @param to  destination address
+     */
+    public boolean matches(MessageOut<T> obj, InetAddress to);
+}

diff --git a/test/unit/org/apache/cassandra/net/MatcherResponse.java b/test/unit/org/apache/cassandra/net/MatcherResponse.java
new file mode 100644
index 0000000..12a8d1b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/MatcherResponse.java

@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import java.net.InetAddress;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+
+import org.apache.cassandra.utils.Clock;
+
+/**
+ * Sends a response for an incoming message with a matching {@link Matcher}.
+ * The actual behavior by any instance of this class can be inspected by
+ * interacting with the returned {@link MockMessagingSpy}.
+ */
+public class MatcherResponse
+{
+    private final Matcher<?> matcher;
+    private final Set<Integer> sendResponses = new HashSet<>();
+    private final MockMessagingSpy spy = new MockMessagingSpy();
+    private final AtomicInteger limitCounter = new AtomicInteger(Integer.MAX_VALUE);
+    private IMessageSink sink;
+
+    MatcherResponse(Matcher<?> matcher)
+    {
+        this.matcher = matcher;
+    }
+
+    /**
+     * Do not create any responses for intercepted outbound messages.
+     */
+    public MockMessagingSpy dontReply()
+    {
+        return respond((MessageIn<?>)null);
+    }
+
+    /**
+     * Respond with provided message in reply to each intercepted outbound message.
+     * @param message   the message to use as mock reply from the cluster
+     */
+    public MockMessagingSpy respond(MessageIn<?> message)
+    {
+        return respondN(message, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Respond a limited number of times with the provided message in reply to each intercepted outbound message.
+     * @param response  the message to use as mock reply from the cluster
+     * @param limit     number of times to respond with message
+     */
+    public MockMessagingSpy respondN(final MessageIn<?> response, int limit)
+    {
+        return respondN((in, to) -> response, limit);
+    }
+
+    /**
+     * Respond with the message created by the provided function that will be called with each intercepted outbound message.
+     * @param fnResponse    function to call for creating reply based on intercepted message and target address
+     */
+    public <T, S> MockMessagingSpy respond(BiFunction<MessageOut<T>, InetAddress, MessageIn<S>> fnResponse)
+    {
+        return respondN(fnResponse, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Respond with message wrapping the payload object created by provided function called for each intercepted outbound message.
+     * The target address from the intercepted message will automatically be used as the created message's sender address.
+     * @param fnResponse    function to call for creating payload object based on intercepted message and target address
+     * @param verb          verb to use for reply message
+     */
+    public <T, S> MockMessagingSpy respondWithPayloadForEachReceiver(Function<MessageOut<T>, S> fnResponse, MessagingService.Verb verb)
+    {
+        return respondNWithPayloadForEachReceiver(fnResponse, verb, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Respond a limited number of times with message wrapping the payload object created by provided function called for
+     * each intercepted outbound message. The target address from the intercepted message will automatically be used as the
+     * created message's sender address.
+     * @param fnResponse    function to call for creating payload object based on intercepted message and target address
+     * @param verb          verb to use for reply message
+     */
+    public <T, S> MockMessagingSpy respondNWithPayloadForEachReceiver(Function<MessageOut<T>, S> fnResponse, MessagingService.Verb verb, int limit)
+    {
+        return respondN((MessageOut<T> msg, InetAddress to) -> {
+                    S payload = fnResponse.apply(msg);
+                    if (payload == null)
+                        return null;
+                    else
+                        return MessageIn.create(to, payload, Collections.emptyMap(), verb, MessagingService.current_version);
+                },
+                limit);
+    }
+
+    /**
+     * Responds to each intercepted outbound message by creating a response message wrapping the next element consumed
+     * from the provided queue. No reply will be send when the queue has been exhausted.
+     * @param cannedResponses   prepared payload messages to use for responses
+     * @param verb              verb to use for reply message
+     */
+    public <T, S> MockMessagingSpy respondWithPayloadForEachReceiver(Queue<S> cannedResponses, MessagingService.Verb verb)
+    {
+        return respondWithPayloadForEachReceiver((MessageOut<T> msg) -> cannedResponses.poll(), verb);
+    }
+
+    /**
+     * Responds to each intercepted outbound message by creating a response message wrapping the next element consumed
+     * from the provided queue. This method will block until queue elements are available.
+     * @param cannedResponses   prepared payload messages to use for responses
+     * @param verb              verb to use for reply message
+     */
+    public <T, S> MockMessagingSpy respondWithPayloadForEachReceiver(BlockingQueue<S> cannedResponses, MessagingService.Verb verb)
+    {
+        return respondWithPayloadForEachReceiver((MessageOut<T> msg) -> {
+            try
+            {
+                return cannedResponses.take();
+            }
+            catch (InterruptedException e)
+            {
+                return null;
+            }
+        }, verb);
+    }
+
+    /**
+     * Respond a limited number of times with the message created by the provided function that will be called with
+     * each intercepted outbound message.
+     * @param fnResponse    function to call for creating reply based on intercepted message and target address
+     */
+    public <T, S> MockMessagingSpy respondN(BiFunction<MessageOut<T>, InetAddress, MessageIn<S>> fnResponse, int limit)
+    {
+        limitCounter.set(limit);
+
+        assert sink == null: "destroy() must be called first to register new response";
+
+        sink = new IMessageSink()
+        {
+            public boolean allowOutgoingMessage(MessageOut message, int id, InetAddress to)
+            {
+                // prevent outgoing message from being send in case matcher indicates a match
+                // and instead send the mocked response
+                if (matcher.matches(message, to))
+                {
+                    spy.matchingMessage(message);
+
+                    if (limitCounter.decrementAndGet() < 0)
+                        return false;
+
+                    synchronized (sendResponses)
+                    {
+                        // I'm not sure about retry semantics regarding message/ID relationships, but I assume
+                        // sending a message multiple times using the same ID shouldn't happen..
+                        assert !sendResponses.contains(id) : "ID re-use for outgoing message";
+                        sendResponses.add(id);
+                    }
+                    MessageIn<?> response = fnResponse.apply(message, to);
+                    if (response != null)
+                    {
+                        CallbackInfo cb = MessagingService.instance().getRegisteredCallback(id);
+                        if (cb != null)
+                            cb.callback.response(response);
+                        else
+                            MessagingService.instance().receive(response, id, Clock.instance.currentTimeMillis(), false);
+                        spy.matchingResponse(response);
+                    }
+                    return false;
+                }
+                return true;
+            }
+
+            public boolean allowIncomingMessage(MessageIn message, int id)
+            {
+                return true;
+            }
+        };
+        MessagingService.instance().addMessageSink(sink);
+
+        return spy;
+    }
+
+    /**
+     * Stops currently registered response from being send.
+     */
+    public void destroy()
+    {
+        MessagingService.instance().removeMessageSink(sink);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/net/MessagingServiceTest.java b/test/unit/org/apache/cassandra/net/MessagingServiceTest.java
index 8631f03..3be1990 100644
--- a/test/unit/org/apache/cassandra/net/MessagingServiceTest.java
+++ b/test/unit/org/apache/cassandra/net/MessagingServiceTest.java

@@ -22,6 +22,7 @@
 
 import java.util.List;
 
+import org.junit.Before;
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
@@ -30,6 +31,13 @@
 {
     private final MessagingService messagingService = MessagingService.test();
 
+    private static int metricScopeId = 0;
+
+    @Before
+    public void before() {
+        messagingService.resetDroppedMessagesMap(Integer.toString(metricScopeId++));;
+    }
+
     @Test
     public void testDroppedMessages()
     {

diff --git a/test/unit/org/apache/cassandra/net/MockMessagingService.java b/test/unit/org/apache/cassandra/net/MockMessagingService.java
new file mode 100644
index 0000000..0412759
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/MockMessagingService.java

@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.function.Predicate;
+
+/**
+ * Starting point for mocking {@link MessagingService} interactions. Outgoing messages can be
+ * intercepted by first creating a {@link MatcherResponse} by calling {@link MockMessagingService#when(Matcher)}.
+ * Alternatively {@link Matcher}s can be created by using helper methods such as {@link #to(InetAddress)},
+ * {@link #verb(MessagingService.Verb)} or {@link #payload(Predicate)} and may also be
+ * nested using {@link MockMessagingService#all(Matcher[])} or {@link MockMessagingService#any(Matcher[])}.
+ * After each test, {@link MockMessagingService#cleanup()} must be called for free listeners registered
+ * in {@link MessagingService}.
+ */
+public class MockMessagingService
+{
+
+    private MockMessagingService()
+    {
+    }
+
+    /**
+     * Creates a MatcherResponse based on specified matcher.
+     */
+    public static MatcherResponse when(Matcher matcher)
+    {
+        return new MatcherResponse(matcher);
+    }
+
+    /**
+     * Unsubscribes any handlers added by calling {@link MessagingService#addMessageSink(IMessageSink)}.
+     * This should be called after each test.
+     */
+    public static void cleanup()
+    {
+        MessagingService.instance().clearMessageSinks();
+    }
+
+    /**
+     * Creates a matcher that will indicate if the target address of the outgoing message equals the
+     * provided address.
+     */
+    public static Matcher<InetAddress> to(String address)
+    {
+        try
+        {
+            return to(InetAddress.getByName(address));
+        }
+        catch (UnknownHostException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Creates a matcher that will indicate if the target address of the outgoing message equals the
+     * provided address.
+     */
+    public static Matcher<InetAddress> to(InetAddress address)
+    {
+        return (in, to) -> to == address || to.equals(address);
+    }
+
+    /**
+     * Creates a matcher that will indicate if the verb of the outgoing message equals the
+     * provided value.
+     */
+    public static Matcher<MessagingService.Verb> verb(MessagingService.Verb verb)
+    {
+        return (in, to) -> in.verb == verb;
+    }
+
+    /**
+     * Creates a matcher based on the result of the provided predicate called with the outgoing message.
+     */
+    public static <T> Matcher<T> message(Predicate<MessageOut<T>> fn)
+    {
+        return (msg, to) -> fn.test(msg);
+    }
+
+    /**
+     * Creates a matcher based on the result of the provided predicate called with the outgoing message's payload.
+     */
+    public static <T> Matcher<T> payload(Predicate<T> fn)
+    {
+        return (msg, to) -> fn.test(msg.payload);
+    }
+
+    /**
+     * Inverts boolean result of wrapped matcher.
+     */
+    public static <T> Matcher<T> not(Matcher<T> matcher)
+    {
+        return (o, to) -> !matcher.matches(o, to);
+    }
+
+    /**
+     * Indicates true in case all provided matchers returned true.
+     */
+    public static <T> Matcher<?> all(Matcher<?>... matchers)
+    {
+        return (MessageOut<T> out, InetAddress to) -> {
+            for (Matcher matcher : matchers)
+            {
+                if (!matcher.matches(out, to))
+                    return false;
+            }
+            return true;
+        };
+    }
+
+    /**
+     * Indicates true in case at least a single provided matcher returned true.
+     */
+    public static <T> Matcher<?> any(Matcher<?>... matchers)
+    {
+        return (MessageOut<T> out, InetAddress to) -> {
+            for (Matcher matcher : matchers)
+            {
+                if (matcher.matches(out, to))
+                    return true;
+            }
+            return false;
+        };
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/net/MockMessagingServiceTest.java b/test/unit/org/apache/cassandra/net/MockMessagingServiceTest.java
new file mode 100644
index 0000000..ed4cce8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/MockMessagingServiceTest.java

@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import java.util.Collections;
+import java.util.concurrent.ExecutionException;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.gms.EchoMessage;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.net.MockMessagingService.all;
+import static org.apache.cassandra.net.MockMessagingService.to;
+import static org.apache.cassandra.net.MockMessagingService.verb;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class MockMessagingServiceTest
+{
+    @BeforeClass
+    public static void initCluster() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        StorageService.instance.initServer();
+    }
+
+    @Before
+    public void cleanup()
+    {
+        MockMessagingService.cleanup();
+    }
+
+    @Test
+    public void testRequestResponse() throws InterruptedException, ExecutionException
+    {
+        // echo message that we like to mock as incoming reply for outgoing echo message
+        MessageIn<EchoMessage> echoMessageIn = MessageIn.create(FBUtilities.getBroadcastAddress(),
+                EchoMessage.instance,
+                Collections.emptyMap(),
+                MessagingService.Verb.ECHO,
+                MessagingService.current_version
+        );
+        MockMessagingSpy spy = MockMessagingService
+                .when(
+                        all(
+                                to(FBUtilities.getBroadcastAddress()),
+                                verb(MessagingService.Verb.ECHO)
+                        )
+                )
+                .respond(echoMessageIn);
+
+        MessageOut<EchoMessage> echoMessageOut = new MessageOut<>(MessagingService.Verb.ECHO, EchoMessage.instance, EchoMessage.serializer);
+        MessagingService.instance().sendRR(echoMessageOut, FBUtilities.getBroadcastAddress(), new IAsyncCallback()
+        {
+            public void response(MessageIn msg)
+            {
+                assertEquals(MessagingService.Verb.ECHO, msg.verb);
+                assertEquals(echoMessageIn.payload, msg.payload);
+            }
+
+            public boolean isLatencyForSnitch()
+            {
+                return false;
+            }
+        });
+
+        // we must have intercepted the outgoing message at this point
+        MessageOut<?> msg = spy.captureMessageOut().get();
+        assertEquals(1, spy.messagesIntercepted);
+        assertTrue(msg == echoMessageOut);
+
+        // and return a mocked response
+        assertEquals(1, spy.mockedMessageResponses);
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/net/MockMessagingSpy.java b/test/unit/org/apache/cassandra/net/MockMessagingSpy.java
new file mode 100644
index 0000000..80bdb39
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/MockMessagingSpy.java

@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.util.concurrent.AbstractFuture;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import junit.framework.AssertionFailedError;
+
+/**
+ * Allows inspecting the behavior of mocked messaging by observing {@link MatcherResponse}.
+ */
+public class MockMessagingSpy
+{
+    private static final Logger logger = LoggerFactory.getLogger(MockMessagingSpy.class);
+
+    public int messagesIntercepted = 0;
+    public int mockedMessageResponses = 0;
+
+    private final BlockingQueue<MessageOut<?>> interceptedMessages = new LinkedBlockingQueue<>();
+    private final BlockingQueue<MessageIn<?>> deliveredResponses = new LinkedBlockingQueue<>();
+
+    private static final Executor executor = Executors.newSingleThreadExecutor();
+
+    /**
+     * Returns a future with the first mocked incoming message that has been created and delivered.
+     */
+    public ListenableFuture<MessageIn<?>> captureMockedMessageIn()
+    {
+        return Futures.transform(captureMockedMessageInN(1), (List<MessageIn<?>> result) -> result.isEmpty() ? null : result.get(0));
+    }
+
+    /**
+     * Returns a future with the specified number mocked incoming messages that have been created and delivered.
+     */
+    public ListenableFuture<List<MessageIn<?>>> captureMockedMessageInN(int noOfMessages)
+    {
+        CapturedResultsFuture<MessageIn<?>> ret = new CapturedResultsFuture<>(noOfMessages, deliveredResponses);
+        executor.execute(ret);
+        return ret;
+    }
+
+    /**
+     * Returns a future that will indicate if a mocked incoming message has been created and delivered.
+     */
+    public ListenableFuture<Boolean> expectMockedMessageIn()
+    {
+        return expectMockedMessageIn(1);
+    }
+
+    /**
+     * Returns a future that will indicate if the specified number of mocked incoming message have been created and delivered.
+     */
+    public ListenableFuture<Boolean> expectMockedMessageIn(int noOfMessages)
+    {
+        ResultsCompletionFuture<MessageIn<?>> ret = new ResultsCompletionFuture<>(noOfMessages, deliveredResponses);
+        executor.execute(ret);
+        return ret;
+    }
+
+    /**
+     * Returns a future with the first intercepted outbound message that would have been send.
+     */
+    public ListenableFuture<MessageOut<?>> captureMessageOut()
+    {
+        return Futures.transform(captureMessageOut(1), (List<MessageOut<?>> result) -> result.isEmpty() ? null : result.get(0));
+    }
+
+    /**
+     * Returns a future with the specified number of intercepted outbound messages that would have been send.
+     */
+    public ListenableFuture<List<MessageOut<?>>> captureMessageOut(int noOfMessages)
+    {
+        CapturedResultsFuture<MessageOut<?>> ret = new CapturedResultsFuture<>(noOfMessages, interceptedMessages);
+        executor.execute(ret);
+        return ret;
+    }
+
+    /**
+     * Returns a future that will indicate if an intercepted outbound messages would have been send.
+     */
+    public ListenableFuture<Boolean> interceptMessageOut()
+    {
+        return interceptMessageOut(1);
+    }
+
+    /**
+     * Returns a future that will indicate if the specified number of intercepted outbound messages would have been send.
+     */
+    public ListenableFuture<Boolean> interceptMessageOut(int noOfMessages)
+    {
+        ResultsCompletionFuture<MessageOut<?>> ret = new ResultsCompletionFuture<>(noOfMessages, interceptedMessages);
+        executor.execute(ret);
+        return ret;
+    }
+
+    /**
+     * Returns a future that will indicate the absence of any intercepted outbound messages with the specifed period.
+     */
+    public ListenableFuture<Boolean> interceptNoMsg(long time, TimeUnit unit)
+    {
+        ResultAbsenceFuture<MessageOut<?>> ret = new ResultAbsenceFuture<>(interceptedMessages, time, unit);
+        executor.execute(ret);
+        return ret;
+    }
+
+    void matchingMessage(MessageOut<?> message)
+    {
+        messagesIntercepted++;
+        logger.trace("Received matching message: {}", message);
+        interceptedMessages.add(message);
+    }
+
+    void matchingResponse(MessageIn<?> response)
+    {
+        mockedMessageResponses++;
+        logger.trace("Responding to intercepted message: {}", response);
+        deliveredResponses.add(response);
+    }
+
+
+    private static class CapturedResultsFuture<T> extends AbstractFuture<List<T>> implements Runnable
+    {
+        private final int waitForResults;
+        private final List<T> results;
+        private final BlockingQueue<T> queue;
+
+        CapturedResultsFuture(int waitForResponses, BlockingQueue<T> queue)
+        {
+            this.waitForResults = waitForResponses;
+            results = new ArrayList<T>(waitForResponses);
+            this.queue = queue;
+        }
+
+        public void run()
+        {
+            try
+            {
+                while (results.size() < waitForResults)
+                    results.add(queue.take());
+
+                set(results);
+            }
+            catch (InterruptedException e)
+            {
+                throw new AssertionError();
+            }
+        }
+    }
+
+    private static class ResultsCompletionFuture<T> extends AbstractFuture<Boolean> implements Runnable
+    {
+        private final int waitForResults;
+        private final BlockingQueue<T> queue;
+
+        ResultsCompletionFuture(int waitForResponses, BlockingQueue<T> queue)
+        {
+            this.waitForResults = waitForResponses;
+            this.queue = queue;
+        }
+
+        public void run()
+        {
+            try
+            {
+                for (int i = 0; i < waitForResults; i++)
+                {
+                    queue.take();
+                }
+                set(true);
+            }
+            catch (InterruptedException e)
+            {
+                throw new AssertionError();
+            }
+        }
+    }
+
+    private static class ResultAbsenceFuture<T> extends AbstractFuture<Boolean> implements Runnable
+    {
+        private final BlockingQueue<T> queue;
+        private final long time;
+        private final TimeUnit unit;
+
+        ResultAbsenceFuture(BlockingQueue<T> queue, long time, TimeUnit unit)
+        {
+            this.queue = queue;
+            this.time = time;
+            this.unit = unit;
+        }
+
+        public void run()
+        {
+            try
+            {
+                T result = queue.poll(time, unit);
+                if (result != null)
+                    setException(new AssertionFailedError("Received unexpected message: " + result));
+                else
+                    set(true);
+            }
+            catch (InterruptedException e)
+            {
+                throw new AssertionError();
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/net/OutboundTcpConnectionTest.java b/test/unit/org/apache/cassandra/net/OutboundTcpConnectionTest.java
new file mode 100644
index 0000000..c09ae0f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/OutboundTcpConnectionTest.java

@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.net;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.net.MessagingService.Verb;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * The tests check whether Queue expiration in the OutboundTcpConnection behaves properly for droppable and
+ * non-droppable messages.
+ */
+public class OutboundTcpConnectionTest
+{
+    AtomicInteger messageId = new AtomicInteger(0);
+
+    final static Verb VERB_DROPPABLE = Verb.MUTATION; // Droppable, 2s timeout
+    final static Verb VERB_NONDROPPABLE = Verb.GOSSIP_DIGEST_ACK; // Not droppable
+
+    final static long NANOS_FOR_TIMEOUT = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getTimeout(VERB_DROPPABLE)*2);
+
+    
+    /**
+     * Verifies our assumptions whether a Verb can be dropped or not. The tests make use of droppabilty, and
+     * may produce wrong test results if their droppabilty is changed. 
+     */
+    @BeforeClass
+    public static void assertDroppability()
+    {
+        if (!MessagingService.DROPPABLE_VERBS.contains(VERB_DROPPABLE))
+            throw new AssertionError("Expected " + VERB_DROPPABLE + " to be droppable");
+        if (MessagingService.DROPPABLE_VERBS.contains(VERB_NONDROPPABLE))
+            throw new AssertionError("Expected " + VERB_NONDROPPABLE + " not to be droppable");
+    }
+
+    /**
+     * Tests that non-droppable messages are never expired
+     */
+    @Test
+    public void testNondroppable() throws UnknownHostException
+    {
+        OutboundTcpConnection otc = getOutboundTcpConnectionForLocalhost();
+        long nanoTimeBeforeEnqueue = System.nanoTime();
+
+        assertFalse("Fresh OutboundTcpConnection contains expired messages",
+                otc.backlogContainsExpiredMessages(nanoTimeBeforeEnqueue));
+
+        fillToPurgeSize(otc, VERB_NONDROPPABLE);
+        fillToPurgeSize(otc, VERB_NONDROPPABLE);
+        otc.expireMessages(expirationTimeNanos());
+
+        assertFalse("OutboundTcpConnection with non-droppable verbs should not expire",
+                otc.backlogContainsExpiredMessages(expirationTimeNanos()));
+    }
+
+    /**
+     * Tests that droppable messages will be dropped after they expire, but not before.
+     * 
+     * @throws UnknownHostException
+     */
+    @Test
+    public void testDroppable() throws UnknownHostException
+    {
+        OutboundTcpConnection otc = getOutboundTcpConnectionForLocalhost();
+        long nanoTimeBeforeEnqueue = System.nanoTime();
+
+        initialFill(otc, VERB_DROPPABLE);
+        assertFalse("OutboundTcpConnection with droppable verbs should not expire immediately",
+                otc.backlogContainsExpiredMessages(nanoTimeBeforeEnqueue));
+
+        otc.expireMessages(nanoTimeBeforeEnqueue);
+        assertFalse("OutboundTcpConnection with droppable verbs should not expire with enqueue-time expiration",
+                otc.backlogContainsExpiredMessages(nanoTimeBeforeEnqueue));
+
+        // Lets presume, expiration time have passed => At that time there shall be expired messages in the Queue
+        long nanoTimeWhenExpired = expirationTimeNanos();
+        assertTrue("OutboundTcpConnection with droppable verbs should have expired",
+                otc.backlogContainsExpiredMessages(nanoTimeWhenExpired));
+
+        // Using the same timestamp, lets expire them and check whether they have gone
+        otc.expireMessages(nanoTimeWhenExpired);
+        assertFalse("OutboundTcpConnection should not have expired entries",
+                otc.backlogContainsExpiredMessages(nanoTimeWhenExpired));
+
+        // Actually the previous test can be done in a harder way: As expireMessages() has run, we cannot have
+        // ANY expired values, thus lets test also against nanoTimeBeforeEnqueue
+        assertFalse("OutboundTcpConnection should not have any expired entries",
+                otc.backlogContainsExpiredMessages(nanoTimeBeforeEnqueue));
+
+    }
+
+    /**
+     * Fills the given OutboundTcpConnection with (1 + BACKLOG_PURGE_SIZE), elements. The first
+     * BACKLOG_PURGE_SIZE elements are non-droppable, the last one is a message with the given Verb and can be
+     * droppable or non-droppable.
+     */
+    private void initialFill(OutboundTcpConnection otc, Verb verb)
+    {
+        assertFalse("Fresh OutboundTcpConnection contains expired messages",
+                otc.backlogContainsExpiredMessages(System.nanoTime()));
+
+        fillToPurgeSize(otc, VERB_NONDROPPABLE);
+        MessageOut<?> messageDroppable10s = new MessageOut<>(verb);
+        otc.enqueue(messageDroppable10s, nextMessageId());
+        otc.expireMessages(System.nanoTime());
+    }
+
+    /**
+     * Returns a nano timestamp in the far future, when expiration should have been performed for VERB_DROPPABLE.
+     * The offset is chosen as 2 times of the expiration time of VERB_DROPPABLE.
+     * 
+     * @return The future nano timestamp
+     */
+    private long expirationTimeNanos()
+    {
+        return System.nanoTime() + NANOS_FOR_TIMEOUT;
+    }
+
+    private int nextMessageId()
+    {
+        return messageId.incrementAndGet();
+    }
+
+    /**
+     * Adds BACKLOG_PURGE_SIZE messages to the queue. Hint: At BACKLOG_PURGE_SIZE expiration starts to work.
+     * 
+     * @param otc
+     *            The OutboundTcpConnection
+     * @param verb
+     *            The verb that defines the message type
+     */
+    private void fillToPurgeSize(OutboundTcpConnection otc, Verb verb)
+    {
+        for (int i = 0; i < OutboundTcpConnection.BACKLOG_PURGE_SIZE; i++)
+        {
+            otc.enqueue(new MessageOut<>(verb), nextMessageId());
+        }
+    }
+
+    private OutboundTcpConnection getOutboundTcpConnectionForLocalhost() throws UnknownHostException
+    {
+        InetAddress lo = InetAddress.getByName("127.0.0.1");
+        OutboundTcpConnectionPool otcPool = new OutboundTcpConnectionPool(lo);
+        OutboundTcpConnection otc = new OutboundTcpConnection(otcPool);
+        return otc;
+    }
+}

diff --git a/test/unit/org/apache/cassandra/net/WriteCallbackInfoTest.java b/test/unit/org/apache/cassandra/net/WriteCallbackInfoTest.java
new file mode 100644
index 0000000..a994a99
--- /dev/null
+++ b/test/unit/org/apache/cassandra/net/WriteCallbackInfoTest.java

@@ -0,0 +1,79 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.net;
+
+import java.net.InetAddress;
+import java.util.UUID;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+import org.apache.cassandra.MockSchema;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.PartitionColumns;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.net.MessagingService.Verb;
+import org.apache.cassandra.service.paxos.Commit;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class WriteCallbackInfoTest
+{
+
+    @Test
+    public void testShouldHint() throws Exception
+    {
+        testShouldHint(Verb.COUNTER_MUTATION, ConsistencyLevel.ALL, true, false);
+        for (Verb verb : new Verb[] { Verb.PAXOS_COMMIT, Verb.MUTATION })
+        {
+            testShouldHint(verb, ConsistencyLevel.ALL, true, true);
+            testShouldHint(verb, ConsistencyLevel.ANY, true, false);
+            testShouldHint(verb, ConsistencyLevel.ALL, false, false);
+        }
+    }
+
+    private void testShouldHint(Verb verb, ConsistencyLevel cl, boolean allowHints, boolean expectHint) throws Exception
+    {
+        Object payload = verb == Verb.PAXOS_COMMIT
+                         ? new Commit(UUID.randomUUID(), new PartitionUpdate(MockSchema.newCFMetaData("", ""), ByteBufferUtil.EMPTY_BYTE_BUFFER, PartitionColumns.NONE, 1))
+                         : new Mutation("", new BufferDecoratedKey(new Murmur3Partitioner.LongToken(0), ByteBufferUtil.EMPTY_BYTE_BUFFER));
+
+        WriteCallbackInfo wcbi = new WriteCallbackInfo(InetAddress.getByName("192.168.1.1"), null, new MessageOut(verb, payload, null), null, cl, allowHints);
+        Assert.assertEquals(expectHint, wcbi.shouldHint());
+        if (expectHint)
+        {
+            Assert.assertNotNull(wcbi.mutation());
+        }
+        else
+        {
+            boolean fail = false;
+            try
+            {
+                wcbi.mutation();
+            }
+            catch (Throwable t)
+            {
+                fail = true;
+            }
+            Assert.assertTrue(fail);
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java b/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java
index 892ced1..b891296 100644
--- a/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java
+++ b/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java

@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.repair;
 
 import java.net.InetAddress;
@@ -25,19 +26,18 @@
 
 import org.junit.BeforeClass;
 import org.junit.Test;
-
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
 
 import static org.junit.Assert.assertEquals;
 
@@ -52,8 +52,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
     }
 
@@ -67,16 +66,19 @@
         final InetAddress ep2 = InetAddress.getByName("127.0.0.1");
 
         Range<Token> range = new Range<>(partirioner.getMinimumToken(), partirioner.getRandomToken());
-        RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), KEYSPACE1, "Standard1", range);
+        RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), KEYSPACE1, "Standard1", Arrays.asList(range));
 
-        MerkleTree tree1 = createInitialTree(desc);
-        MerkleTree tree2 = createInitialTree(desc);
+        MerkleTrees tree1 = createInitialTree(desc);
+
+        MerkleTrees tree2 = createInitialTree(desc);
 
         // difference the trees
         // note: we reuse the same endpoint which is bogus in theory but fine here
         TreeResponse r1 = new TreeResponse(ep1, tree1);
         TreeResponse r2 = new TreeResponse(ep2, tree2);
-        LocalSyncTask task = new LocalSyncTask(desc, r1, r2, ActiveRepairService.UNREPAIRED_SSTABLE);
+        LocalSyncTask task = new LocalSyncTask(desc, r1.endpoint, r2.endpoint,
+                                               MerkleTrees.difference(r1.trees, r2.trees),
+                                               ActiveRepairService.UNREPAIRED_SSTABLE);
         task.run();
 
         assertEquals(0, task.get().numberOfDifferences);
@@ -90,12 +92,13 @@
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
 
-        ActiveRepairService.instance.registerParentRepairSession(parentRepairSession, FBUtilities.getBroadcastAddress(), Arrays.asList(cfs), Arrays.asList(range), false, false);
+        ActiveRepairService.instance.registerParentRepairSession(parentRepairSession,  FBUtilities.getBroadcastAddress(), Arrays.asList(cfs), Arrays.asList(range), false, System.currentTimeMillis(), false);
 
-        RepairJobDesc desc = new RepairJobDesc(parentRepairSession, UUID.randomUUID(), KEYSPACE1, "Standard1", range);
+        RepairJobDesc desc = new RepairJobDesc(parentRepairSession, UUID.randomUUID(), KEYSPACE1, "Standard1", Arrays.asList(range));
 
-        MerkleTree tree1 = createInitialTree(desc);
-        MerkleTree tree2 = createInitialTree(desc);
+        MerkleTrees tree1 = createInitialTree(desc);
+
+        MerkleTrees tree2 = createInitialTree(desc);
 
         // change a range in one of the trees
         Token token = partirioner.midpoint(range.left, range.right);
@@ -110,16 +113,19 @@
         // note: we reuse the same endpoint which is bogus in theory but fine here
         TreeResponse r1 = new TreeResponse(InetAddress.getByName("127.0.0.1"), tree1);
         TreeResponse r2 = new TreeResponse(InetAddress.getByName("127.0.0.2"), tree2);
-        LocalSyncTask task = new LocalSyncTask(desc, r1, r2, ActiveRepairService.UNREPAIRED_SSTABLE);
+        LocalSyncTask task = new LocalSyncTask(desc, r1.endpoint, r2.endpoint,
+                                               MerkleTrees.difference(r1.trees, r2.trees),
+                                               ActiveRepairService.UNREPAIRED_SSTABLE);
         task.run();
 
         // ensure that the changed range was recorded
         assertEquals("Wrong differing ranges", interesting.size(), task.getCurrentStat().numberOfDifferences);
     }
 
-    private MerkleTree createInitialTree(RepairJobDesc desc)
+    private MerkleTrees createInitialTree(RepairJobDesc desc)
     {
-        MerkleTree tree = new MerkleTree(partirioner, desc.range, MerkleTree.RECOMMENDED_DEPTH, (int)Math.pow(2, 15));
+        MerkleTrees tree = new MerkleTrees(partirioner);
+        tree.addMerkleTrees((int) Math.pow(2, 15), desc.ranges);
         tree.init();
         for (MerkleTree.TreeRange r : tree.invalids())
         {

diff --git a/test/unit/org/apache/cassandra/repair/RepairJobTest.java b/test/unit/org/apache/cassandra/repair/RepairJobTest.java
new file mode 100644
index 0000000..2f77a34
--- /dev/null
+++ b/test/unit/org/apache/cassandra/repair/RepairJobTest.java

@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.repair;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.google.common.util.concurrent.AsyncFunction;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.net.IMessageSink;
+import org.apache.cassandra.net.MessageIn;
+import org.apache.cassandra.net.MessageOut;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.repair.messages.RepairMessage;
+import org.apache.cassandra.repair.messages.SyncRequest;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class RepairJobTest extends SchemaLoader
+{
+    private static final long TEST_TIMEOUT_S = 10;
+    private static final long THREAD_TIMEOUT_MILLIS = 100;
+    private static final IPartitioner MURMUR3_PARTITIONER = Murmur3Partitioner.instance;
+    private static final String KEYSPACE = "RepairJobTest";
+    private static final String CF = "Standard1";
+    private static final Object messageLock = new Object();
+
+    private static final List<Range<Token>> fullRange = Collections.singletonList(new Range<>(MURMUR3_PARTITIONER.getMinimumToken(),
+                                                                                              MURMUR3_PARTITIONER.getRandomToken()));
+    private static InetAddress addr1;
+    private static InetAddress addr2;
+    private static InetAddress addr3;
+    private static InetAddress addr4;
+    private RepairSession session;
+    private RepairJob job;
+    private RepairJobDesc sessionJobDesc;
+
+    // So that threads actually get recycled and we can have accurate memory accounting while testing
+    // memory retention from CASSANDRA-14096
+    private static class MeasureableRepairSession extends RepairSession
+    {
+        public MeasureableRepairSession(UUID parentRepairSession, UUID id, Collection<Range<Token>> ranges, String keyspace,
+                                        RepairParallelism parallelismDegree, Set<InetAddress> endpoints, long repairedAt, String... cfnames)
+        {
+            super(parentRepairSession, id, ranges, keyspace, parallelismDegree, endpoints, repairedAt, cfnames);
+        }
+
+        protected DebuggableThreadPoolExecutor createExecutor()
+        {
+            DebuggableThreadPoolExecutor executor = super.createExecutor();
+            executor.setKeepAliveTime(THREAD_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
+            return executor;
+        }
+    }
+
+    @BeforeClass
+    public static void setupClass() throws UnknownHostException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, CF));
+        addr1 = InetAddress.getByName("127.0.0.1");
+        addr2 = InetAddress.getByName("127.0.0.2");
+        addr3 = InetAddress.getByName("127.0.0.3");
+        addr4 = InetAddress.getByName("127.0.0.4");
+    }
+
+    @Before
+    public void setup()
+    {
+        Set<InetAddress> neighbors = new HashSet<>(Arrays.asList(addr2, addr3));
+
+        UUID parentRepairSession = UUID.randomUUID();
+        ActiveRepairService.instance.registerParentRepairSession(parentRepairSession, FBUtilities.getBroadcastAddress(),
+                                                                 Collections.singletonList(Keyspace.open(KEYSPACE).getColumnFamilyStore(CF)), fullRange, false,
+                                                                 ActiveRepairService.UNREPAIRED_SSTABLE, false);
+
+        this.session = new MeasureableRepairSession(parentRepairSession, UUIDGen.getTimeUUID(), fullRange,
+                                                    KEYSPACE, RepairParallelism.SEQUENTIAL, neighbors,
+                                                    ActiveRepairService.UNREPAIRED_SSTABLE, CF);
+
+        this.job = new RepairJob(session, CF);
+        this.sessionJobDesc = new RepairJobDesc(session.parentRepairSession, session.getId(),
+                                                session.keyspace, CF, session.getRanges());
+
+        DatabaseDescriptor.setBroadcastAddress(addr1);
+    }
+
+    @After
+    public void reset()
+    {
+        ActiveRepairService.instance.terminateSessions();
+        MessagingService.instance().clearMessageSinks();
+    }
+
+    /**
+     * Ensure we can do an end to end repair of consistent data and get the messages we expect
+     */
+    @Test
+    public void testEndToEndNoDifferences() throws Exception
+    {
+        Map<InetAddress, MerkleTrees> mockTrees = new HashMap<>();
+        mockTrees.put(FBUtilities.getBroadcastAddress(), createInitialTree(false));
+        mockTrees.put(addr2, createInitialTree(false));
+        mockTrees.put(addr3, createInitialTree(false));
+
+        List<MessageOut> observedMessages = new ArrayList<>();
+        interceptRepairMessages(mockTrees, observedMessages);
+
+        job.run();
+
+        RepairResult result = job.get(TEST_TIMEOUT_S, TimeUnit.SECONDS);
+
+        assertEquals(3, result.stats.size());
+        // Should be one RemoteSyncTask left behind (other two should be local)
+        assertExpectedDifferences(session.getSyncingTasks().values(), 0);
+
+        // RepairJob should send out SNAPSHOTS -> VALIDATIONS -> done
+        List<RepairMessage.Type> expectedTypes = new ArrayList<>();
+        for (int i = 0; i < 3; i++)
+            expectedTypes.add(RepairMessage.Type.SNAPSHOT);
+        for (int i = 0; i < 3; i++)
+            expectedTypes.add(RepairMessage.Type.VALIDATION_REQUEST);
+
+        assertEquals(expectedTypes, observedMessages.stream()
+                                                    .map(k -> ((RepairMessage) k.payload).messageType)
+                                                    .collect(Collectors.toList()));
+    }
+
+    /**
+     * Regression test for CASSANDRA-14096. We should not retain memory in the RepairSession once the
+     * ValidationTask -> SyncTask transform is done.
+     */
+    @Test
+    public void testNoTreesRetainedAfterDifference() throws Throwable
+    {
+        Map<InetAddress, MerkleTrees> mockTrees = new HashMap<>();
+        mockTrees.put(FBUtilities.getBroadcastAddress(), createInitialTree(false));
+        mockTrees.put(addr2, createInitialTree(true));
+        mockTrees.put(addr3, createInitialTree(false));
+
+        List<MessageOut> observedMessages = new ArrayList<>();
+        interceptRepairMessages(mockTrees, observedMessages);
+
+        List<TreeResponse> mockTreeResponses = mockTrees.entrySet().stream()
+                                                        .map(e -> new TreeResponse(e.getKey(), e.getValue()))
+                                                        .collect(Collectors.toList());
+
+        long singleTreeSize = ObjectSizes.measureDeep(mockTrees.get(addr2));
+
+        // Use a different local address so we get all RemoteSyncs (as LocalSyncs try to reach out over the network).
+        List<SyncTask> syncTasks = job.createSyncTasks(mockTreeResponses, addr4);
+
+        // SyncTasks themselves should not contain significant memory
+        assertTrue(ObjectSizes.measureDeep(syncTasks) < 0.8 * singleTreeSize);
+
+        ListenableFuture<List<SyncStat>> syncResults = Futures.transform(Futures.immediateFuture(mockTreeResponses), new AsyncFunction<List<TreeResponse>, List<SyncStat>>()
+        {
+            public ListenableFuture<List<SyncStat>> apply(List<TreeResponse> treeResponses)
+            {
+                return Futures.allAsList(syncTasks);
+            }
+        }, session.taskExecutor);
+
+        // The session can retain memory in the contained executor until the threads expire, so we wait for the threads
+        // that ran the Tree -> SyncTask conversions to die and release the memory
+        int millisUntilFreed;
+        for (millisUntilFreed = 0; millisUntilFreed < TEST_TIMEOUT_S * 1000; millisUntilFreed += THREAD_TIMEOUT_MILLIS)
+        {
+            // The measured size of the syncingTasks, and result of the computation should be much smaller
+            if (ObjectSizes.measureDeep(session) < 0.8 * singleTreeSize)
+                break;
+            TimeUnit.MILLISECONDS.sleep(THREAD_TIMEOUT_MILLIS);
+        }
+
+        assertTrue(millisUntilFreed < TEST_TIMEOUT_S * 1000);
+
+        List<SyncStat> results = syncResults.get(TEST_TIMEOUT_S, TimeUnit.SECONDS);
+
+        assertTrue(ObjectSizes.measureDeep(results) < 0.8 * singleTreeSize);
+
+        assertEquals(3, results.size());
+        // Should be two RemoteSyncTasks with ranges and one empty one
+        assertExpectedDifferences(new ArrayList<>(session.getSyncingTasks().values()), 1, 1, 0);
+
+        int numDifferent = 0;
+        for (SyncStat stat : results)
+        {
+            if (stat.nodes.endpoint1.equals(addr2) || stat.nodes.endpoint2.equals(addr2))
+            {
+                assertEquals(1, stat.numberOfDifferences);
+                numDifferent++;
+            }
+        }
+        assertEquals(2, numDifferent);
+    }
+
+    private void assertExpectedDifferences(Collection<RemoteSyncTask> tasks, Integer ... differences)
+    {
+        List<Integer> expectedDifferences = new ArrayList<>(Arrays.asList(differences));
+        List<Integer> observedDifferences = tasks.stream()
+                                                 .map(t -> (int) t.getCurrentStat().numberOfDifferences)
+                                                 .collect(Collectors.toList());
+        assertEquals(expectedDifferences.size(), observedDifferences.size());
+        assertTrue(expectedDifferences.containsAll(observedDifferences));
+    }
+
+    private MerkleTrees createInitialTree(boolean invalidate)
+    {
+        MerkleTrees tree = new MerkleTrees(MURMUR3_PARTITIONER);
+        tree.addMerkleTrees((int) Math.pow(2, 15), fullRange);
+        tree.init();
+        for (MerkleTree.TreeRange r : tree.invalids())
+        {
+            r.ensureHashInitialised();
+        }
+
+        if (invalidate)
+        {
+            // change a range in one of the trees
+            Token token = MURMUR3_PARTITIONER.midpoint(fullRange.get(0).left, fullRange.get(0).right);
+            tree.invalidate(token);
+            tree.get(token).hash("non-empty hash!".getBytes());
+        }
+
+        return tree;
+    }
+
+    private void interceptRepairMessages(Map<InetAddress, MerkleTrees> mockTrees,
+                                         List<MessageOut> messageCapture)
+    {
+        MessagingService.instance().addMessageSink(new IMessageSink()
+        {
+            public boolean allowOutgoingMessage(MessageOut message, int id, InetAddress to)
+            {
+                if (message == null || !(message.payload instanceof RepairMessage))
+                    return false;
+
+                // So different Thread's messages don't overwrite each other.
+                synchronized (messageLock)
+                {
+                    messageCapture.add(message);
+                }
+
+                RepairMessage rm = (RepairMessage) message.payload;
+                switch (rm.messageType)
+                {
+                    case SNAPSHOT:
+                        MessageIn<?> messageIn = MessageIn.create(to, null,
+                                                                  Collections.emptyMap(),
+                                                                  MessagingService.Verb.REQUEST_RESPONSE,
+                                                                  MessagingService.current_version);
+                        MessagingService.instance().receive(messageIn, id, System.currentTimeMillis(), false);
+                        break;
+                    case VALIDATION_REQUEST:
+                        session.validationComplete(sessionJobDesc, to, mockTrees.get(to));
+                        break;
+                    case SYNC_REQUEST:
+                        SyncRequest syncRequest = (SyncRequest) rm;
+                        session.syncComplete(sessionJobDesc, new NodePair(syncRequest.src, syncRequest.dst), true);
+                        break;
+                    default:
+                        break;
+                }
+                return false;
+            }
+
+            public boolean allowIncomingMessage(MessageIn message, int id)
+            {
+                return message.verb == MessagingService.Verb.REQUEST_RESPONSE;
+            }
+        });
+    }
+}

diff --git a/test/unit/org/apache/cassandra/repair/RepairSessionTest.java b/test/unit/org/apache/cassandra/repair/RepairSessionTest.java
index 8ea2bfa..d40982c 100644
--- a/test/unit/org/apache/cassandra/repair/RepairSessionTest.java
+++ b/test/unit/org/apache/cassandra/repair/RepairSessionTest.java

@@ -15,10 +15,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.repair;
 
 import java.io.IOException;
 import java.net.InetAddress;
+import java.util.Arrays;
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.ExecutionException;
@@ -52,7 +54,7 @@
         IPartitioner p = Murmur3Partitioner.instance;
         Range<Token> repairRange = new Range<>(p.getToken(ByteBufferUtil.bytes(0)), p.getToken(ByteBufferUtil.bytes(100)));
         Set<InetAddress> endpoints = Sets.newHashSet(remote);
-        RepairSession session = new RepairSession(parentSessionId, sessionId, repairRange, "Keyspace1", RepairParallelism.SEQUENTIAL, endpoints, ActiveRepairService.UNREPAIRED_SSTABLE, "Standard1");
+        RepairSession session = new RepairSession(parentSessionId, sessionId, Arrays.asList(repairRange), "Keyspace1", RepairParallelism.SEQUENTIAL, endpoints, ActiveRepairService.UNREPAIRED_SSTABLE, "Standard1");
 
         // perform convict
         session.convict(remote, Double.MAX_VALUE);

diff --git a/test/unit/org/apache/cassandra/repair/ValidatorTest.java b/test/unit/org/apache/cassandra/repair/ValidatorTest.java
index 61ab3da..9c32cef 100644
--- a/test/unit/org/apache/cassandra/repair/ValidatorTest.java
+++ b/test/unit/org/apache/cassandra/repair/ValidatorTest.java

@@ -17,11 +17,13 @@
  */
 package org.apache.cassandra.repair;
 
-import java.io.IOException;
 import java.net.InetAddress;
-import java.security.MessageDigest;
+import java.util.Arrays;
 import java.util.Collections;
+import java.util.Iterator;
+import java.util.Map;
 import java.util.UUID;
+import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.TimeUnit;
 
 import com.google.common.util.concurrent.ListenableFuture;
@@ -30,39 +32,38 @@
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.CompactionsTest;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.SequentialWriter;
 import org.junit.After;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.EmptyIterators;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.compaction.AbstractCompactedRow;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.ColumnStats;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.net.IMessageSink;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessageOut;
 import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.net.IMessageSink;
 import org.apache.cassandra.repair.messages.RepairMessage;
 import org.apache.cassandra.repair.messages.ValidationComplete;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
-import org.apache.cassandra.utils.concurrent.SimpleCondition;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 
 public class ValidatorTest
 {
@@ -70,16 +71,16 @@
 
     private static final String keyspace = "ValidatorTest";
     private static final String columnFamily = "Standard1";
-    private final IPartitioner partitioner = StorageService.getPartitioner();
+    private static IPartitioner partitioner;
 
     @BeforeClass
     public static void defineSchema() throws Exception
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(keyspace,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(keyspace, columnFamily));
+        partitioner = Schema.instance.getCFMetaData(keyspace, columnFamily).partitioner;
     }
 
     @After
@@ -92,16 +93,17 @@
     public void testValidatorComplete() throws Throwable
     {
         Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getRandomToken());
-        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspace, columnFamily, range);
+        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspace, columnFamily, Arrays.asList(range));
 
-        final ListenableFuture<MessageOut> outgoingMessageSink = registerOutgoingMessageSink();
+        final CompletableFuture<MessageOut> outgoingMessageSink = registerOutgoingMessageSink();
 
         InetAddress remote = InetAddress.getByName("127.0.0.2");
 
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(columnFamily);
 
         Validator validator = new Validator(desc, remote, 0);
-        MerkleTree tree = new MerkleTree(cfs.partitioner, validator.desc.range, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, 15));
+        MerkleTrees tree = new MerkleTrees(partitioner);
+        tree.addMerkleTrees((int) Math.pow(2, 15), validator.desc.ranges);
         validator.prepare(cfs, tree);
 
         // and confirm that the tree was split
@@ -109,7 +111,7 @@
 
         // add a row
         Token mid = partitioner.midpoint(range.left, range.right);
-        validator.add(new CompactedRowStub(new BufferDecoratedKey(mid, ByteBufferUtil.bytes("inconceivable!"))));
+        validator.add(EmptyIterators.unfilteredRow(cfs.metadata, new BufferDecoratedKey(mid, ByteBufferUtil.bytes("inconceivable!")), false));
         validator.complete();
 
         // confirm that the tree was validated
@@ -121,39 +123,18 @@
         RepairMessage m = (RepairMessage) message.payload;
         assertEquals(RepairMessage.Type.VALIDATION_COMPLETE, m.messageType);
         assertEquals(desc, m.desc);
-        assertTrue(((ValidationComplete) m).success);
-        assertNotNull(((ValidationComplete) m).tree);
+        assertTrue(((ValidationComplete) m).success());
+        assertNotNull(((ValidationComplete) m).trees);
     }
 
-    private static class CompactedRowStub extends AbstractCompactedRow
-    {
-        private CompactedRowStub(DecoratedKey key)
-        {
-            super(key);
-        }
-
-        public RowIndexEntry write(long currentPosition, SequentialWriter out) throws IOException
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        public void update(MessageDigest digest) { }
-
-        public ColumnStats columnStats()
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        public void close() throws IOException { }
-    }
 
     @Test
     public void testValidatorFailed() throws Throwable
     {
         Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getRandomToken());
-        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspace, columnFamily, range);
+        final RepairJobDesc desc = new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), keyspace, columnFamily, Arrays.asList(range));
 
-        final ListenableFuture<MessageOut> outgoingMessageSink = registerOutgoingMessageSink();
+        final CompletableFuture<MessageOut> outgoingMessageSink = registerOutgoingMessageSink();
 
         InetAddress remote = InetAddress.getByName("127.0.0.2");
 
@@ -165,8 +146,8 @@
         RepairMessage m = (RepairMessage) message.payload;
         assertEquals(RepairMessage.Type.VALIDATION_COMPLETE, m.messageType);
         assertEquals(desc, m.desc);
-        assertFalse(((ValidationComplete) m).success);
-        assertNull(((ValidationComplete) m).tree);
+        assertFalse(((ValidationComplete) m).success());
+        assertNull(((ValidationComplete) m).trees);
     }
 
     @Test
@@ -199,22 +180,22 @@
         CompactionsTest.populate(keyspace, columnFamily, 0, n, 0); //ttl=3s
 
         cfs.forceBlockingFlush();
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
         // wait enough to force single compaction
         TimeUnit.SECONDS.sleep(5);
 
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         UUID repairSessionId = UUIDGen.getTimeUUID();
         final RepairJobDesc desc = new RepairJobDesc(repairSessionId, UUIDGen.getTimeUUID(), cfs.keyspace.getName(),
-                                               cfs.getColumnFamilyName(), new Range<Token>(sstable.first.getToken(),
-                                                                                             sstable.last.getToken()));
+                                               cfs.getColumnFamilyName(), Collections.singletonList(new Range<>(sstable.first.getToken(),
+                                                                                                                sstable.last.getToken())));
 
         ActiveRepairService.instance.registerParentRepairSession(repairSessionId, FBUtilities.getBroadcastAddress(),
-                                                                 Collections.singletonList(cfs), Collections.singleton(desc.range),
-                                                                 false, false);
+                                                                 Collections.singletonList(cfs), desc.ranges, false, ActiveRepairService.UNREPAIRED_SSTABLE,
+                                                                 false);
 
-        final ListenableFuture<MessageOut> outgoingMessageSink = registerOutgoingMessageSink();
+        final CompletableFuture<MessageOut> outgoingMessageSink = registerOutgoingMessageSink();
         Validator validator = new Validator(desc, FBUtilities.getBroadcastAddress(), 0, true);
         CompactionManager.instance.submitValidation(cfs, validator);
 
@@ -223,21 +204,25 @@
         RepairMessage m = (RepairMessage) message.payload;
         assertEquals(RepairMessage.Type.VALIDATION_COMPLETE, m.messageType);
         assertEquals(desc, m.desc);
-        assertTrue(((ValidationComplete) m).success);
-        MerkleTree tree = ((ValidationComplete) m).tree;
+        assertTrue(((ValidationComplete) m).success());
+        MerkleTrees trees = ((ValidationComplete) m).trees;
 
-        assertEquals(Math.pow(2, Math.ceil(Math.log(n) / Math.log(2))), tree.size(), 0.0);
-        assertEquals(tree.rowCount(), n);
+        Iterator<Map.Entry<Range<Token>, MerkleTree>> iterator = trees.iterator();
+        while (iterator.hasNext())
+        {
+            assertEquals(Math.pow(2, Math.ceil(Math.log(n) / Math.log(2))), iterator.next().getValue().size(), 0.0);
+        }
+        assertEquals(trees.rowCount(), n);
     }
 
-    private ListenableFuture<MessageOut> registerOutgoingMessageSink()
+    private CompletableFuture<MessageOut> registerOutgoingMessageSink()
     {
-        final SettableFuture<MessageOut> future = SettableFuture.create();
+        final CompletableFuture<MessageOut> future = new CompletableFuture<>();
         MessagingService.instance().addMessageSink(new IMessageSink()
         {
             public boolean allowOutgoingMessage(MessageOut message, int id, InetAddress to)
             {
-                future.set(message);
+                future.complete(message);
                 return false;
             }
 

diff --git a/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java b/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java
new file mode 100644
index 0000000..5dbed3f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java

@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.repair.messages;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBufferFixed;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.repair.NodePair;
+import org.apache.cassandra.repair.RepairJobDesc;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.MerkleTrees;
+
+public class RepairMessageSerializationsTest
+{
+    private static final int PROTOCOL_VERSION = MessagingService.current_version;
+    private static final int GC_BEFORE = 1000000;
+
+    private static IPartitioner originalPartitioner;
+
+    @BeforeClass
+    public static void before()
+    {
+        originalPartitioner = StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance);
+    }
+
+    @AfterClass
+    public static void after()
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(originalPartitioner);
+    }
+
+    @Test
+    public void validationRequestMessage() throws IOException
+    {
+        RepairJobDesc jobDesc = buildRepairJobDesc();
+        ValidationRequest msg = new ValidationRequest(jobDesc, GC_BEFORE);
+        ValidationRequest deserialized = serializeRoundTrip(msg, ValidationRequest.serializer);
+        Assert.assertEquals(jobDesc, deserialized.desc);
+    }
+
+    private RepairJobDesc buildRepairJobDesc()
+    {
+        List<Range<Token>> tokenRanges = buildTokenRanges();
+        return new RepairJobDesc(UUID.randomUUID(), UUID.randomUUID(), "serializationsTestKeyspace", "repairMessages", tokenRanges);
+    }
+
+    private List<Range<Token>> buildTokenRanges()
+    {
+        List<Range<Token>> tokenRanges = new ArrayList<>(4);
+        tokenRanges.add(new Range<>(new LongToken(1000), new LongToken(1001)));
+        tokenRanges.add(new Range<>(new LongToken(2000), new LongToken(2001)));
+        tokenRanges.add(new Range<>(new LongToken(3000), new LongToken(3001)));
+        tokenRanges.add(new Range<>(new LongToken(4000), new LongToken(4001)));
+        return tokenRanges;
+    }
+
+    private <T extends RepairMessage> T serializeRoundTrip(T msg, IVersionedSerializer<T> serializer) throws IOException
+    {
+        long size = serializer.serializedSize(msg, PROTOCOL_VERSION);
+
+        ByteBuffer buf = ByteBuffer.allocate((int)size);
+        DataOutputPlus out = new DataOutputBufferFixed(buf);
+        serializer.serialize(msg, out, PROTOCOL_VERSION);
+        Assert.assertEquals(size, buf.position());
+
+        buf.flip();
+        DataInputPlus in = new DataInputBuffer(buf, false);
+        T deserialized = serializer.deserialize(in, PROTOCOL_VERSION);
+        Assert.assertEquals(msg, deserialized);
+        Assert.assertEquals(msg.hashCode(), deserialized.hashCode());
+        return deserialized;
+    }
+
+    @Test
+    public void validationCompleteMessage_NoMerkleTree() throws IOException
+    {
+        ValidationComplete deserialized = validationCompleteMessage(null);
+        Assert.assertNull(deserialized.trees);
+    }
+
+    @Test
+    public void validationCompleteMessage_WithMerkleTree() throws IOException
+    {
+        MerkleTrees trees = new MerkleTrees(Murmur3Partitioner.instance);
+        trees.addMerkleTree(256, new Range<>(new LongToken(1000), new LongToken(1001)));
+        ValidationComplete deserialized = validationCompleteMessage(trees);
+
+        // a simple check to make sure we got some merkle trees back.
+        Assert.assertEquals(trees.size(), deserialized.trees.size());
+    }
+
+    private ValidationComplete validationCompleteMessage(MerkleTrees trees) throws IOException
+    {
+        RepairJobDesc jobDesc = buildRepairJobDesc();
+        ValidationComplete msg = trees == null ?
+                                 new ValidationComplete(jobDesc) :
+                                 new ValidationComplete(jobDesc, trees);
+        ValidationComplete deserialized = serializeRoundTrip(msg, ValidationComplete.serializer);
+        return deserialized;
+    }
+
+    @Test
+    public void syncRequestMessage() throws IOException
+    {
+        InetAddress initiator = InetAddress.getByName("127.0.0.1");
+        InetAddress src = InetAddress.getByName("127.0.0.2");
+        InetAddress dst = InetAddress.getByName("127.0.0.3");
+
+        SyncRequest msg = new SyncRequest(buildRepairJobDesc(), initiator, src, dst, buildTokenRanges());
+        serializeRoundTrip(msg, SyncRequest.serializer);
+    }
+
+    @Test
+    public void syncCompleteMessage() throws IOException
+    {
+        InetAddress src = InetAddress.getByName("127.0.0.2");
+        InetAddress dst = InetAddress.getByName("127.0.0.3");
+        SyncComplete msg = new SyncComplete(buildRepairJobDesc(), new NodePair(src, dst), true);
+        serializeRoundTrip(msg, SyncComplete.serializer);
+    }
+
+    @Test
+    public void antiCompactionRequestMessage() throws IOException
+    {
+        AnticompactionRequest msg = new AnticompactionRequest(UUID.randomUUID(), buildTokenRanges());
+        serializeRoundTrip(msg, AnticompactionRequest.serializer);
+    }
+
+    @Test
+    public void prepareMessage() throws IOException
+    {
+        PrepareMessage msg = new PrepareMessage(UUID.randomUUID(), new ArrayList<UUID>() {{add(UUID.randomUUID());}},
+                                                buildTokenRanges(), true, 100000L, false);
+        serializeRoundTrip(msg, PrepareMessage.serializer);
+    }
+
+    @Test
+    public void snapshotMessage() throws IOException
+    {
+        SnapshotMessage msg = new SnapshotMessage(buildRepairJobDesc());
+        serializeRoundTrip(msg, SnapshotMessage.serializer);
+    }
+
+    @Test
+    public void cleanupMessage() throws IOException
+    {
+        CleanupMessage msg = new CleanupMessage(UUID.randomUUID());
+        serializeRoundTrip(msg, CleanupMessage.serializer);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java b/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java
index a0eea4e..b617e96 100644
--- a/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java
+++ b/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java

@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.repair.messages;
 
 import java.util.HashMap;
@@ -35,7 +36,9 @@
 import org.apache.cassandra.repair.RepairParallelism;
 import org.apache.cassandra.utils.FBUtilities;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 public class RepairOptionTest
 {
@@ -119,7 +122,7 @@
     @Test
     public void testIncrementalRepairWithSubrangesIsNotGlobal() throws Exception
     {
-        RepairOption ro = RepairOption.parse(ImmutableMap.of(RepairOption.INCREMENTAL_KEY, "true", RepairOption.RANGES_KEY, "42:42"),
+        RepairOption ro = RepairOption.parse(ImmutableMap.of(RepairOption.INCREMENTAL_KEY, "true", RepairOption.RANGES_KEY, "41:42"),
                            Murmur3Partitioner.instance);
         assertFalse(ro.isGlobal());
         ro = RepairOption.parse(ImmutableMap.of(RepairOption.INCREMENTAL_KEY, "true", RepairOption.RANGES_KEY, ""),

diff --git a/test/unit/org/apache/cassandra/schema/DefsTest.java b/test/unit/org/apache/cassandra/schema/DefsTest.java
index ecf6709..e9980f6 100644
--- a/test/unit/org/apache/cassandra/schema/DefsTest.java
+++ b/test/unit/org/apache/cassandra/schema/DefsTest.java

@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -20,47 +20,56 @@
 
 import java.io.File;
 import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+
+import com.google.common.collect.ImmutableMap;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.db.marshal.TimeUUIDType;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableDeletingTask;
 import org.apache.cassandra.locator.OldNetworkTopologyStrategy;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.schema.LegacySchemaTables;
 import org.apache.cassandra.service.MigrationManager;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 
-import static org.apache.cassandra.Util.cellname;
+import static org.apache.cassandra.Util.throwAssert;
+import static org.apache.cassandra.cql3.CQLTester.assertRows;
+import static org.apache.cassandra.cql3.CQLTester.row;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Ignore;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import com.google.common.base.Supplier;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class DefsTest
 {
-    private static final String KEYSPACE1 = "Keyspace1";
-    private static final String KEYSPACE3 = "Keyspace3";
-    private static final String KEYSPACE6 = "Keyspace6";
-    private static final String EMPTYKEYSPACE = "DefsTestEmptyKeyspace";
-    private static final String CF_STANDARD1 = "Standard1";
-    private static final String CF_STANDARD2 = "Standard2";
-    private static final String CF_INDEXED = "Indexed1";
+    private static final String KEYSPACE1 = "keyspace1";
+    private static final String KEYSPACE3 = "keyspace3";
+    private static final String KEYSPACE6 = "keyspace6";
+    private static final String EMPTY_KEYSPACE = "test_empty_keyspace";
+    private static final String TABLE1 = "standard1";
+    private static final String TABLE2 = "standard2";
+    private static final String TABLE1i = "indexed1";
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
@@ -68,62 +77,57 @@
         SchemaLoader.prepareServer();
         SchemaLoader.startGossiper();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2));
-        SchemaLoader.createKeyspace(KEYSPACE3, true, false,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(5),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
-                                    SchemaLoader.indexCFMD(KEYSPACE3, CF_INDEXED, true));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, TABLE1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, TABLE2));
+        SchemaLoader.createKeyspace(KEYSPACE3,
+                                    KeyspaceParams.simple(5),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, TABLE1),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE3, TABLE1i, true));
         SchemaLoader.createKeyspace(KEYSPACE6,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.indexCFMD(KEYSPACE6, CF_INDEXED, true));
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE6, TABLE1i, true));
     }
 
     @Test
     public void testCFMetaDataApply() throws ConfigurationException
     {
-        CFMetaData cfm = new CFMetaData(KEYSPACE1,
-                                        "TestApplyCFM_CF",
-                                        ColumnFamilyType.Standard,
-                                        new SimpleDenseCellNameType(BytesType.instance));
+        CFMetaData cfm = CFMetaData.Builder.create(KEYSPACE1, "TestApplyCFM_CF")
+                                           .addPartitionKey("keys", BytesType.instance)
+                                           .addClusteringColumn("col", BytesType.instance).build();
+
 
         for (int i = 0; i < 5; i++)
         {
             ByteBuffer name = ByteBuffer.wrap(new byte[] { (byte)i });
-            cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, name, BytesType.instance, null).setIndex(Integer.toString(i), IndexType.KEYS, null));
+            cfm.addColumnDefinition(ColumnDefinition.regularDef(cfm, name, BytesType.instance));
         }
 
         cfm.comment("No comment")
            .readRepairChance(0.5)
            .gcGraceSeconds(100000)
-           .minCompactionThreshold(500)
-           .maxCompactionThreshold(500);
+           .compaction(CompactionParams.scts(ImmutableMap.of("min_threshold", "500",
+                                                             "max_threshold", "500")));
 
         // we'll be adding this one later. make sure it's not already there.
-        Assert.assertNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 5 })));
+        assertNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[]{ 5 })));
 
         CFMetaData cfNew = cfm.copy();
 
         // add one.
-        ColumnDefinition addIndexDef = ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(new byte[] { 5 }), BytesType.instance, null)
-                                                       .setIndex("5", IndexType.KEYS, null);
+        ColumnDefinition addIndexDef = ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(new byte[] { 5 }), BytesType.instance);
         cfNew.addColumnDefinition(addIndexDef);
 
         // remove one.
-        ColumnDefinition removeIndexDef = ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(new byte[] { 0 }), BytesType.instance, null)
-                                                          .setIndex("0", IndexType.KEYS, null);
-        Assert.assertTrue(cfNew.removeColumnDefinition(removeIndexDef));
+        ColumnDefinition removeIndexDef = ColumnDefinition.regularDef(cfm, ByteBuffer.wrap(new byte[] { 0 }), BytesType.instance);
+        assertTrue(cfNew.removeColumnDefinition(removeIndexDef));
 
         cfm.apply(cfNew);
 
         for (int i = 1; i < cfm.allColumns().size(); i++)
-            Assert.assertNotNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 1 })));
-        Assert.assertNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 0 })));
-        Assert.assertNotNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[] { 5 })));
+            assertNotNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[]{ 1 })));
+        assertNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[]{ 0 })));
+        assertNotNull(cfm.getColumnDefinition(ByteBuffer.wrap(new byte[]{ 5 })));
     }
 
     @Test
@@ -131,37 +135,17 @@
     {
         String[] valid = {"1", "a", "_1", "b_", "__", "1_a"};
         for (String s : valid)
-            Assert.assertTrue(CFMetaData.isNameValid(s));
+            assertTrue(CFMetaData.isNameValid(s));
 
         String[] invalid = {"b@t", "dash-y", "", " ", "dot.s", ".hidden"};
         for (String s : invalid)
-            Assert.assertFalse(CFMetaData.isNameValid(s));
-    }
-
-    @Ignore
-    @Test
-    public void saveAndRestore()
-    {
-        /*
-        // verify dump and reload.
-        UUID first = UUIDGen.makeType1UUIDFromHost(FBUtilities.getBroadcastAddress());
-        DefsTables.dumpToStorage(first);
-        List<KSMetaData> defs = new ArrayList<KSMetaData>(DefsTables.loadFromStorage(first));
-
-        Assert.assertTrue(defs.size() > 0);
-        Assert.assertEquals(defs.size(), Schema.instance.getNonSystemKeyspaces().size());
-        for (KSMetaData loaded : defs)
-        {
-            KSMetaData defined = Schema.instance.getKeyspaceDefinition(loaded.name);
-            Assert.assertTrue(String.format("%s != %s", loaded, defined), defined.equals(loaded));
-        }
-        */
+            assertFalse(CFMetaData.isNameValid(s));
     }
 
     @Test
     public void addNewCfToBogusKeyspace()
     {
-        CFMetaData newCf = addTestCF("MadeUpKeyspace", "NewCF", "new cf");
+        CFMetaData newCf = addTestTable("MadeUpKeyspace", "NewCF", "new cf");
         try
         {
             MigrationManager.announceNewColumnFamily(newCf);
@@ -173,101 +157,80 @@
     }
 
     @Test
-    public void addNewCfWithNullComment() throws ConfigurationException
+    public void addNewTable() throws ConfigurationException
     {
-        final String ks = KEYSPACE1;
-        final String cf = "BrandNewCfWithNull";
-        KSMetaData original = Schema.instance.getKSMetaData(ks);
+        final String ksName = KEYSPACE1;
+        final String tableName = "anewtable";
+        KeyspaceMetadata original = Schema.instance.getKSMetaData(ksName);
 
-        CFMetaData newCf = addTestCF(original.name, cf, null);
+        CFMetaData cfm = addTestTable(original.name, tableName, "A New Table");
 
-        Assert.assertFalse(Schema.instance.getKSMetaData(ks).cfMetaData().containsKey(newCf.cfName));
-        MigrationManager.announceNewColumnFamily(newCf);
+        assertFalse(Schema.instance.getKSMetaData(ksName).tables.get(cfm.cfName).isPresent());
+        MigrationManager.announceNewColumnFamily(cfm);
 
-        Assert.assertTrue(Schema.instance.getKSMetaData(ks).cfMetaData().containsKey(newCf.cfName));
-        Assert.assertEquals(newCf, Schema.instance.getKSMetaData(ks).cfMetaData().get(newCf.cfName));
-    }
-
-    @Test
-    public void addNewCF() throws ConfigurationException
-    {
-        final String ks = KEYSPACE1;
-        final String cf = "BrandNewCf";
-        KSMetaData original = Schema.instance.getKSMetaData(ks);
-
-        CFMetaData newCf = addTestCF(original.name, cf, "A New Table");
-
-        Assert.assertFalse(Schema.instance.getKSMetaData(ks).cfMetaData().containsKey(newCf.cfName));
-        MigrationManager.announceNewColumnFamily(newCf);
-
-        Assert.assertTrue(Schema.instance.getKSMetaData(ks).cfMetaData().containsKey(newCf.cfName));
-        Assert.assertEquals(newCf, Schema.instance.getKSMetaData(ks).cfMetaData().get(newCf.cfName));
+        assertTrue(Schema.instance.getKSMetaData(ksName).tables.get(cfm.cfName).isPresent());
+        assertEquals(cfm, Schema.instance.getKSMetaData(ksName).tables.get(cfm.cfName).get());
 
         // now read and write to it.
-        CellName col0 = cellname("col0");
-        DecoratedKey dk = Util.dk("key0");
-        Mutation rm = new Mutation(ks, dk.getKey());
-        rm.add(cf, col0, ByteBufferUtil.bytes("value0"), 1L);
-        rm.applyUnsafe();
-        ColumnFamilyStore store = Keyspace.open(ks).getColumnFamilyStore(cf);
-        Assert.assertNotNull(store);
-        store.forceBlockingFlush();
+        QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, col, val) VALUES (?, ?, ?)",
+                                                     ksName, tableName),
+                                       "key0", "col0", "val0");
 
-        ColumnFamily cfam = store.getColumnFamily(Util.namesQueryFilter(store, dk, col0));
-        Assert.assertNotNull(cfam.getColumn(col0));
-        Cell col = cfam.getColumn(col0);
-        Assert.assertEquals(ByteBufferUtil.bytes("value0"), col.value());
+        // flush to exercise more than just hitting the memtable
+        ColumnFamilyStore cfs = Keyspace.open(ksName).getColumnFamilyStore(tableName);
+        assertNotNull(cfs);
+        cfs.forceBlockingFlush();
+
+        // and make sure we get out what we put in
+        UntypedResultSet rows = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s", ksName, tableName));
+        assertRows(rows, row("key0", "col0", "val0"));
     }
 
     @Test
     public void dropCf() throws ConfigurationException
     {
-        DecoratedKey dk = Util.dk("dropCf");
         // sanity
-        final KSMetaData ks = Schema.instance.getKSMetaData(KEYSPACE1);
-        Assert.assertNotNull(ks);
-        final CFMetaData cfm = ks.cfMetaData().get("Standard1");
-        Assert.assertNotNull(cfm);
+        final KeyspaceMetadata ks = Schema.instance.getKSMetaData(KEYSPACE1);
+        assertNotNull(ks);
+        final CFMetaData cfm = ks.tables.getNullable(TABLE1);
+        assertNotNull(cfm);
 
         // write some data, force a flush, then verify that files exist on disk.
-        Mutation rm = new Mutation(ks.name, dk.getKey());
         for (int i = 0; i < 100; i++)
-            rm.add(cfm.cfName, cellname("col" + i), ByteBufferUtil.bytes("anyvalue"), 1L);
-        rm.applyUnsafe();
-        final ColumnFamilyStore store = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
-        Assert.assertNotNull(store);
+            QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)",
+                                                         KEYSPACE1, TABLE1),
+                                           "dropCf", "col" + i, "anyvalue");
+        ColumnFamilyStore store = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
+        assertNotNull(store);
         store.forceBlockingFlush();
-        Assert.assertTrue(store.directories.sstableLister().list().size() > 0);
+        assertTrue(store.getDirectories().sstableLister(Directories.OnTxnErr.THROW).list().size() > 0);
 
         MigrationManager.announceColumnFamilyDrop(ks.name, cfm.cfName);
 
-        Assert.assertFalse(Schema.instance.getKSMetaData(ks.name).cfMetaData().containsKey(cfm.cfName));
+        assertFalse(Schema.instance.getKSMetaData(ks.name).tables.get(cfm.cfName).isPresent());
 
         // any write should fail.
-        rm = new Mutation(ks.name, dk.getKey());
         boolean success = true;
         try
         {
-            rm.add("Standard1", cellname("col0"), ByteBufferUtil.bytes("value0"), 1L);
-            rm.applyUnsafe();
+            QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)",
+                                                         KEYSPACE1, TABLE1),
+                                           "dropCf", "col0", "anyvalue");
         }
         catch (Throwable th)
         {
             success = false;
         }
-        Assert.assertFalse("This mutation should have failed since the CF no longer exists.", success);
+        assertFalse("This mutation should have failed since the CF no longer exists.", success);
 
         // verify that the files are gone.
-        Supplier<Object> lambda = new Supplier<Object>() {
-            @Override
-            public Boolean get() {
-                for (File file : store.directories.sstableLister().listFiles())
-                {
-                    if (file.getPath().endsWith("Data.db") && !new File(file.getPath().replace("Data.db", "Compacted")).exists())
-                        return false;
-                }
-                return true;
+        Supplier<Object> lambda = () -> {
+            for (File file : store.getDirectories().sstableLister(Directories.OnTxnErr.THROW).listFiles())
+            {
+                if (file.getPath().endsWith("Data.db") && !new File(file.getPath().replace("Data.db", "Compacted")).exists())
+                    return false;
             }
+            return true;
         };
         Util.spinAssertEquals(true, lambda, 30);
 
@@ -276,68 +239,60 @@
     @Test
     public void addNewKS() throws ConfigurationException
     {
-        DecoratedKey dk = Util.dk("key0");
-        CFMetaData newCf = addTestCF("NewKeyspace1", "AddedStandard1", "A new cf for a new ks");
-
-        KSMetaData newKs = KSMetaData.testMetadata(newCf.ksName, SimpleStrategy.class, KSMetaData.optsWithRF(5), newCf);
-
+        CFMetaData cfm = addTestTable("newkeyspace1", "newstandard1", "A new cf for a new ks");
+        KeyspaceMetadata newKs = KeyspaceMetadata.create(cfm.ksName, KeyspaceParams.simple(5), Tables.of(cfm));
         MigrationManager.announceNewKeyspace(newKs);
 
-        Assert.assertNotNull(Schema.instance.getKSMetaData(newCf.ksName));
-        Assert.assertEquals(Schema.instance.getKSMetaData(newCf.ksName), newKs);
+        assertNotNull(Schema.instance.getKSMetaData(cfm.ksName));
+        assertEquals(Schema.instance.getKSMetaData(cfm.ksName), newKs);
 
         // test reads and writes.
-        CellName col0 = cellname("col0");
-        Mutation rm = new Mutation(newCf.ksName, dk.getKey());
-        rm.add(newCf.cfName, col0, ByteBufferUtil.bytes("value0"), 1L);
-        rm.applyUnsafe();
-        ColumnFamilyStore store = Keyspace.open(newCf.ksName).getColumnFamilyStore(newCf.cfName);
-        Assert.assertNotNull(store);
+        QueryProcessor.executeInternal("INSERT INTO newkeyspace1.newstandard1 (key, col, val) VALUES (?, ?, ?)",
+                                       "key0", "col0", "val0");
+        ColumnFamilyStore store = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
+        assertNotNull(store);
         store.forceBlockingFlush();
 
-        ColumnFamily cfam = store.getColumnFamily(Util.namesQueryFilter(store, dk, col0));
-        Assert.assertNotNull(cfam.getColumn(col0));
-        Cell col = cfam.getColumn(col0);
-        Assert.assertEquals(ByteBufferUtil.bytes("value0"), col.value());
+        UntypedResultSet rows = QueryProcessor.executeInternal("SELECT * FROM newkeyspace1.newstandard1");
+        assertRows(rows, row("key0", "col0", "val0"));
     }
 
     @Test
     public void dropKS() throws ConfigurationException
     {
-        DecoratedKey dk = Util.dk("dropKs");
         // sanity
-        final KSMetaData ks = Schema.instance.getKSMetaData(KEYSPACE1);
-        Assert.assertNotNull(ks);
-        final CFMetaData cfm = ks.cfMetaData().get("Standard2");
-        Assert.assertNotNull(cfm);
+        final KeyspaceMetadata ks = Schema.instance.getKSMetaData(KEYSPACE1);
+        assertNotNull(ks);
+        final CFMetaData cfm = ks.tables.getNullable(TABLE2);
+        assertNotNull(cfm);
 
         // write some data, force a flush, then verify that files exist on disk.
-        Mutation rm = new Mutation(ks.name, dk.getKey());
         for (int i = 0; i < 100; i++)
-            rm.add(cfm.cfName, cellname("col" + i), ByteBufferUtil.bytes("anyvalue"), 1L);
-        rm.applyUnsafe();
-        ColumnFamilyStore store = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
-        Assert.assertNotNull(store);
-        store.forceBlockingFlush();
-        Assert.assertTrue(store.directories.sstableLister().list().size() > 0);
+            QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)",
+                                                         KEYSPACE1, TABLE2),
+                                           "dropKs", "col" + i, "anyvalue");
+        ColumnFamilyStore cfs = Keyspace.open(cfm.ksName).getColumnFamilyStore(cfm.cfName);
+        assertNotNull(cfs);
+        cfs.forceBlockingFlush();
+        assertTrue(!cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).list().isEmpty());
 
         MigrationManager.announceKeyspaceDrop(ks.name);
 
-        Assert.assertNull(Schema.instance.getKSMetaData(ks.name));
+        assertNull(Schema.instance.getKSMetaData(ks.name));
 
         // write should fail.
-        rm = new Mutation(ks.name, dk.getKey());
         boolean success = true;
         try
         {
-            rm.add("Standard1", cellname("col0"), ByteBufferUtil.bytes("value0"), 1L);
-            rm.applyUnsafe();
+            QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)",
+                                                         KEYSPACE1, TABLE2),
+                                           "dropKs", "col0", "anyvalue");
         }
         catch (Throwable th)
         {
             success = false;
         }
-        Assert.assertFalse("This mutation should have failed since the CF no longer exists.", success);
+        assertFalse("This mutation should have failed since the KS no longer exists.", success);
 
         // reads should fail too.
         boolean threw = false;
@@ -349,81 +304,76 @@
         {
             threw = true;
         }
-        Assert.assertTrue(threw);
+        assertTrue(threw);
     }
 
     @Test
     public void dropKSUnflushed() throws ConfigurationException
     {
-        DecoratedKey dk = Util.dk("dropKs");
         // sanity
-        final KSMetaData ks = Schema.instance.getKSMetaData(KEYSPACE3);
-        Assert.assertNotNull(ks);
-        final CFMetaData cfm = ks.cfMetaData().get("Standard1");
-        Assert.assertNotNull(cfm);
+        final KeyspaceMetadata ks = Schema.instance.getKSMetaData(KEYSPACE3);
+        assertNotNull(ks);
+        final CFMetaData cfm = ks.tables.getNullable(TABLE1);
+        assertNotNull(cfm);
 
         // write some data
-        Mutation rm = new Mutation(ks.name, dk.getKey());
         for (int i = 0; i < 100; i++)
-            rm.add(cfm.cfName, cellname("col" + i), ByteBufferUtil.bytes("anyvalue"), 1L);
-        rm.applyUnsafe();
+            QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, name, val) VALUES (?, ?, ?)",
+                                                         KEYSPACE3, TABLE1),
+                                           "dropKs", "col" + i, "anyvalue");
 
         MigrationManager.announceKeyspaceDrop(ks.name);
 
-        Assert.assertNull(Schema.instance.getKSMetaData(ks.name));
+        assertNull(Schema.instance.getKSMetaData(ks.name));
     }
 
     @Test
     public void createEmptyKsAddNewCf() throws ConfigurationException
     {
-        Assert.assertNull(Schema.instance.getKSMetaData(EMPTYKEYSPACE));
-
-        KSMetaData newKs = KSMetaData.testMetadata(EMPTYKEYSPACE, SimpleStrategy.class, KSMetaData.optsWithRF(5));
-
+        assertNull(Schema.instance.getKSMetaData(EMPTY_KEYSPACE));
+        KeyspaceMetadata newKs = KeyspaceMetadata.create(EMPTY_KEYSPACE, KeyspaceParams.simple(5));
         MigrationManager.announceNewKeyspace(newKs);
-        Assert.assertNotNull(Schema.instance.getKSMetaData(EMPTYKEYSPACE));
+        assertNotNull(Schema.instance.getKSMetaData(EMPTY_KEYSPACE));
 
-        CFMetaData newCf = addTestCF(EMPTYKEYSPACE, "AddedLater", "A new CF to add to an empty KS");
+        String tableName = "added_later";
+        CFMetaData newCf = addTestTable(EMPTY_KEYSPACE, tableName, "A new CF to add to an empty KS");
 
         //should not exist until apply
-        Assert.assertFalse(Schema.instance.getKSMetaData(newKs.name).cfMetaData().containsKey(newCf.cfName));
+        assertFalse(Schema.instance.getKSMetaData(newKs.name).tables.get(newCf.cfName).isPresent());
 
         //add the new CF to the empty space
         MigrationManager.announceNewColumnFamily(newCf);
 
-        Assert.assertTrue(Schema.instance.getKSMetaData(newKs.name).cfMetaData().containsKey(newCf.cfName));
-        Assert.assertEquals(Schema.instance.getKSMetaData(newKs.name).cfMetaData().get(newCf.cfName), newCf);
+        assertTrue(Schema.instance.getKSMetaData(newKs.name).tables.get(newCf.cfName).isPresent());
+        assertEquals(Schema.instance.getKSMetaData(newKs.name).tables.get(newCf.cfName).get(), newCf);
 
         // now read and write to it.
-        CellName col0 = cellname("col0");
-        DecoratedKey dk = Util.dk("key0");
-        Mutation rm = new Mutation(newKs.name, dk.getKey());
-        rm.add(newCf.cfName, col0, ByteBufferUtil.bytes("value0"), 1L);
-        rm.applyUnsafe();
-        ColumnFamilyStore store = Keyspace.open(newKs.name).getColumnFamilyStore(newCf.cfName);
-        Assert.assertNotNull(store);
-        store.forceBlockingFlush();
+        QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (key, col, val) VALUES (?, ?, ?)",
+                                                     EMPTY_KEYSPACE, tableName),
+                                       "key0", "col0", "val0");
 
-        ColumnFamily cfam = store.getColumnFamily(Util.namesQueryFilter(store, dk, col0));
-        Assert.assertNotNull(cfam.getColumn(col0));
-        Cell col = cfam.getColumn(col0);
-        Assert.assertEquals(ByteBufferUtil.bytes("value0"), col.value());
+        ColumnFamilyStore cfs = Keyspace.open(newKs.name).getColumnFamilyStore(newCf.cfName);
+        assertNotNull(cfs);
+        cfs.forceBlockingFlush();
+
+        UntypedResultSet rows = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s", EMPTY_KEYSPACE, tableName));
+        assertRows(rows, row("key0", "col0", "val0"));
     }
 
     @Test
     public void testUpdateKeyspace() throws ConfigurationException
     {
         // create a keyspace to serve as existing.
-        CFMetaData cf = addTestCF("UpdatedKeyspace", "AddedStandard1", "A new cf for a new ks");
-        KSMetaData oldKs = KSMetaData.testMetadata(cf.ksName, SimpleStrategy.class, KSMetaData.optsWithRF(5), cf);
+        CFMetaData cf = addTestTable("UpdatedKeyspace", "AddedStandard1", "A new cf for a new ks");
+        KeyspaceMetadata oldKs = KeyspaceMetadata.create(cf.ksName, KeyspaceParams.simple(5), Tables.of(cf));
 
         MigrationManager.announceNewKeyspace(oldKs);
 
-        Assert.assertNotNull(Schema.instance.getKSMetaData(cf.ksName));
-        Assert.assertEquals(Schema.instance.getKSMetaData(cf.ksName), oldKs);
+        assertNotNull(Schema.instance.getKSMetaData(cf.ksName));
+        assertEquals(Schema.instance.getKSMetaData(cf.ksName), oldKs);
 
         // names should match.
-        KSMetaData newBadKs2 = KSMetaData.testMetadata(cf.ksName + "trash", SimpleStrategy.class, KSMetaData.optsWithRF(4));
+        KeyspaceMetadata newBadKs2 = KeyspaceMetadata.create(cf.ksName + "trash", KeyspaceParams.simple(4));
         try
         {
             MigrationManager.announceKeyspaceUpdate(newBadKs2);
@@ -434,25 +384,30 @@
             // expected.
         }
 
-        KSMetaData newKs = KSMetaData.testMetadata(cf.ksName, OldNetworkTopologyStrategy.class, KSMetaData.optsWithRF(1));
+        Map<String, String> replicationMap = new HashMap<>();
+        replicationMap.put(ReplicationParams.CLASS, OldNetworkTopologyStrategy.class.getName());
+        replicationMap.put("replication_factor", "1");
+
+        KeyspaceMetadata newKs = KeyspaceMetadata.create(cf.ksName, KeyspaceParams.create(true, replicationMap));
         MigrationManager.announceKeyspaceUpdate(newKs);
 
-        KSMetaData newFetchedKs = Schema.instance.getKSMetaData(newKs.name);
-        Assert.assertEquals(newFetchedKs.strategyClass, newKs.strategyClass);
-        Assert.assertFalse(newFetchedKs.strategyClass.equals(oldKs.strategyClass));
+        KeyspaceMetadata newFetchedKs = Schema.instance.getKSMetaData(newKs.name);
+        assertEquals(newFetchedKs.params.replication.klass, newKs.params.replication.klass);
+        assertFalse(newFetchedKs.params.replication.klass.equals(oldKs.params.replication.klass));
     }
 
+    /*
     @Test
     public void testUpdateColumnFamilyNoIndexes() throws ConfigurationException
     {
         // create a keyspace with a cf to update.
-        CFMetaData cf = addTestCF("UpdatedCfKs", "Standard1added", "A new cf that will be updated");
+        CFMetaData cf = addTestTable("UpdatedCfKs", "Standard1added", "A new cf that will be updated");
         KSMetaData ksm = KSMetaData.testMetadata(cf.ksName, SimpleStrategy.class, KSMetaData.optsWithRF(1), cf);
         MigrationManager.announceNewKeyspace(ksm);
 
-        Assert.assertNotNull(Schema.instance.getKSMetaData(cf.ksName));
-        Assert.assertEquals(Schema.instance.getKSMetaData(cf.ksName), ksm);
-        Assert.assertNotNull(Schema.instance.getCFMetaData(cf.ksName, cf.cfName));
+        assertNotNull(Schema.instance.getKSMetaData(cf.ksName));
+        assertEquals(Schema.instance.getKSMetaData(cf.ksName), ksm);
+        assertNotNull(Schema.instance.getCFMetaData(cf.ksName, cf.cfName));
 
         // updating certain fields should fail.
         CFMetaData newCfm = cf.copy();
@@ -482,10 +437,10 @@
         // can't test changing the reconciler because there is only one impl.
 
         // check the cumulative affect.
-        Assert.assertEquals(Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getComment(), newCfm.getComment());
-        Assert.assertEquals(Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getReadRepairChance(), newCfm.getReadRepairChance(), 0.0001);
-        Assert.assertEquals(Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getGcGraceSeconds(), newCfm.getGcGraceSeconds());
-        Assert.assertEquals(UTF8Type.instance, Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getDefaultValidator());
+        assertEquals(Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getComment(), newCfm.getComment());
+        assertEquals(Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getReadRepairChance(), newCfm.getReadRepairChance(), 0.0001);
+        assertEquals(Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getGcGraceSeconds(), newCfm.getGcGraceSeconds());
+        assertEquals(UTF8Type.instance, Schema.instance.getCFMetaData(cf.ksName, cf.cfName).getDefaultValidator());
 
         // Change cfId
         newCfm = new CFMetaData(cf.ksName, cf.cfName, cf.cfType, cf.comparator);
@@ -537,39 +492,51 @@
         }
         catch (ConfigurationException expected) {}
     }
+    */
 
     @Test
     public void testDropIndex() throws ConfigurationException
     {
         // persist keyspace definition in the system keyspace
-        LegacySchemaTables.makeCreateKeyspaceMutation(Schema.instance.getKSMetaData(KEYSPACE6), FBUtilities.timestampMicros()).applyUnsafe();
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE6).getColumnFamilyStore("Indexed1");
+        SchemaKeyspace.makeCreateKeyspaceMutation(Schema.instance.getKSMetaData(KEYSPACE6), FBUtilities.timestampMicros()).applyUnsafe();
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE6).getColumnFamilyStore(TABLE1i);
+        String indexName = "birthdate_key_index";
 
         // insert some data.  save the sstable descriptor so we can make sure it's marked for delete after the drop
-        Mutation rm = new Mutation(KEYSPACE6, ByteBufferUtil.bytes("k1"));
-        rm.add("Indexed1", cellname("notbirthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.add("Indexed1", cellname("birthdate"), ByteBufferUtil.bytes(1L), 0);
-        rm.applyUnsafe();
+        QueryProcessor.executeInternal(String.format(
+                                                    "INSERT INTO %s.%s (key, c1, birthdate, notbirthdate) VALUES (?, ?, ?, ?)",
+                                                    KEYSPACE6,
+                                                    TABLE1i),
+                                       "key0", "col0", 1L, 1L);
+
         cfs.forceBlockingFlush();
-        ColumnFamilyStore indexedCfs = cfs.indexManager.getIndexForColumn(ByteBufferUtil.bytes("birthdate")).getIndexCfs();
-        Descriptor desc = indexedCfs.getSSTables().iterator().next().descriptor;
+        ColumnFamilyStore indexCfs = cfs.indexManager.getIndexByName(indexName)
+                                                     .getBackingTable()
+                                                     .orElseThrow(throwAssert("Cannot access index cfs"));
+        Descriptor desc = indexCfs.getLiveSSTables().iterator().next().descriptor;
 
         // drop the index
         CFMetaData meta = cfs.metadata.copy();
-        ColumnDefinition cdOld = meta.regularColumns().iterator().next();
-        ColumnDefinition cdNew = ColumnDefinition.regularDef(meta, cdOld.name.bytes, cdOld.type, null);
-        meta.addOrReplaceColumnDefinition(cdNew);
+        IndexMetadata existing = cfs.metadata.getIndexes()
+                                             .get(indexName)
+                                             .orElseThrow(throwAssert("Index not found"));
+
+        meta.indexes(meta.getIndexes().without(existing.name));
         MigrationManager.announceColumnFamilyUpdate(meta);
 
         // check
-        Assert.assertTrue(cfs.indexManager.getIndexes().isEmpty());
-        SSTableDeletingTask.waitForDeletions();
-        Assert.assertFalse(new File(desc.filenameFor(Component.DATA)).exists());
+        assertTrue(cfs.indexManager.listIndexes().isEmpty());
+        LifecycleTransaction.waitForDeletions();
+        assertFalse(new File(desc.filenameFor(Component.DATA)).exists());
     }
 
-    private CFMetaData addTestCF(String ks, String cf, String comment)
+    private CFMetaData addTestTable(String ks, String cf, String comment)
     {
-        CFMetaData newCFMD = new CFMetaData(ks, cf, ColumnFamilyType.Standard, new SimpleDenseCellNameType(UTF8Type.instance));
+        CFMetaData newCFMD = CFMetaData.Builder.create(ks, cf)
+                                               .addPartitionKey("key", UTF8Type.instance)
+                                               .addClusteringColumn("col", UTF8Type.instance)
+                                               .addRegularColumn("val", UTF8Type.instance).build();
+
         newCFMD.comment(comment)
                .readRepairChance(0.0);
 

diff --git a/test/unit/org/apache/cassandra/schema/LegacySchemaMigratorTest.java b/test/unit/org/apache/cassandra/schema/LegacySchemaMigratorTest.java
new file mode 100644
index 0000000..7643456
--- /dev/null
+++ b/test/unit/org/apache/cassandra/schema/LegacySchemaMigratorTest.java

@@ -0,0 +1,845 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.functions.*;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.thrift.ThriftConversion;
+import org.apache.cassandra.utils.*;
+
+import static java.lang.String.format;
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertFalse;
+import static junit.framework.Assert.assertTrue;
+import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
+import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.apache.cassandra.utils.FBUtilities.json;
+
+@SuppressWarnings("deprecation")
+public class LegacySchemaMigratorTest
+{
+    private static final long TIMESTAMP = 1435908994000000L;
+
+    private static final String KEYSPACE_PREFIX = "LegacySchemaMigratorTest";
+
+    /*
+     * 1. Write a variety of different keyspaces/tables/types/function in the legacy manner, using legacy schema tables
+     * 2. Run the migrator
+     * 3. Read all the keyspaces from the new schema tables
+     * 4. Make sure that we've read *exactly* the same set of keyspaces/tables/types/functions
+     * 5. Validate that the legacy schema tables are now empty
+     */
+    @Test
+    public void testMigrate() throws IOException
+    {
+        CQLTester.cleanupAndLeaveDirs();
+
+        Keyspaces expected = keyspacesToMigrate();
+
+        // write the keyspaces into the legacy tables
+        expected.forEach(LegacySchemaMigratorTest::legacySerializeKeyspace);
+
+        // run the migration
+        LegacySchemaMigrator.migrate();
+
+        // read back all the metadata from the new schema tables
+        Keyspaces actual = SchemaKeyspace.fetchNonSystemKeyspaces();
+
+        // need to load back CFMetaData of those tables (CFS instances will still be loaded)
+        loadLegacySchemaTables();
+
+        // verify that nothing's left in the old schema tables
+        for (CFMetaData table : LegacySchemaMigrator.LegacySchemaTables)
+        {
+            String query = format("SELECT * FROM %s.%s", SystemKeyspace.NAME, table.cfName);
+            //noinspection ConstantConditions
+            assertTrue(executeOnceInternal(query).isEmpty());
+        }
+
+        // make sure that we've read *exactly* the same set of keyspaces/tables/types/functions
+        assertEquals(expected, actual);
+
+        // check that the build status of all indexes has been updated to use the new
+        // format of index name: the index_name column of system.IndexInfo used to
+        // contain table_name.index_name. Now it should contain just the index_name.
+        expected.forEach(LegacySchemaMigratorTest::verifyIndexBuildStatus);
+    }
+
+    @Test
+    public void testMigrateLegacyCachingOptions() throws IOException
+    {
+        CQLTester.cleanupAndLeaveDirs();
+
+        assertEquals(CachingParams.CACHE_EVERYTHING, LegacySchemaMigrator.cachingFromRow("ALL"));
+        assertEquals(CachingParams.CACHE_NOTHING, LegacySchemaMigrator.cachingFromRow("NONE"));
+        assertEquals(CachingParams.CACHE_KEYS, LegacySchemaMigrator.cachingFromRow("KEYS_ONLY"));
+        assertEquals(new CachingParams(false, Integer.MAX_VALUE), LegacySchemaMigrator.cachingFromRow("ROWS_ONLY"));
+        assertEquals(CachingParams.CACHE_KEYS, LegacySchemaMigrator.cachingFromRow("{\"keys\" : \"ALL\", \"rows_per_partition\" : \"NONE\"}" ));
+        assertEquals(new CachingParams(false, Integer.MAX_VALUE), LegacySchemaMigrator.cachingFromRow("{\"keys\" : \"NONE\", \"rows_per_partition\" : \"ALL\"}" ));
+        assertEquals(new CachingParams(true, 100), LegacySchemaMigrator.cachingFromRow("{\"keys\" : \"ALL\", \"rows_per_partition\" : \"100\"}" ));
+
+        try
+        {
+            LegacySchemaMigrator.cachingFromRow("EXCEPTION");
+            Assert.fail();
+        }
+        catch(RuntimeException e)
+        {
+            // Expected passing path
+            assertTrue(true);
+        }
+    }
+
+    private static void loadLegacySchemaTables()
+    {
+        KeyspaceMetadata systemKeyspace = Schema.instance.getKSMetaData(SystemKeyspace.NAME);
+
+        Tables systemTables = systemKeyspace.tables;
+        for (CFMetaData table : LegacySchemaMigrator.LegacySchemaTables)
+            systemTables = systemTables.with(table);
+
+        LegacySchemaMigrator.LegacySchemaTables.forEach(Schema.instance::load);
+
+        Schema.instance.setKeyspaceMetadata(systemKeyspace.withSwapped(systemTables));
+    }
+
+    private static Keyspaces keyspacesToMigrate()
+    {
+        Keyspaces.Builder keyspaces = Keyspaces.builder();
+
+        // A whole bucket of shorthand
+        String ks1 = KEYSPACE_PREFIX + "Keyspace1";
+        String ks2 = KEYSPACE_PREFIX + "Keyspace2";
+        String ks3 = KEYSPACE_PREFIX + "Keyspace3";
+        String ks4 = KEYSPACE_PREFIX + "Keyspace4";
+        String ks5 = KEYSPACE_PREFIX + "Keyspace5";
+        String ks6 = KEYSPACE_PREFIX + "Keyspace6";
+        String ks_rcs = KEYSPACE_PREFIX + "RowCacheSpace";
+        String ks_nocommit = KEYSPACE_PREFIX + "NoCommitlogSpace";
+        String ks_prsi = KEYSPACE_PREFIX + "PerRowSecondaryIndex";
+        String ks_cql = KEYSPACE_PREFIX + "cql_keyspace";
+
+        // Make it easy to test compaction
+        Map<String, String> compactionOptions = new HashMap<>();
+        compactionOptions.put("tombstone_compaction_interval", "1");
+
+        Map<String, String> leveledOptions = new HashMap<>();
+        leveledOptions.put("sstable_size_in_mb", "1");
+
+        keyspaces.add(KeyspaceMetadata.create(ks1,
+                                              KeyspaceParams.simple(1),
+                                              Tables.of(SchemaLoader.standardCFMD(ks1, "Standard1")
+                                                                    .compaction(CompactionParams.scts(compactionOptions)),
+                                                        SchemaLoader.standardCFMD(ks1, "StandardGCGS0").gcGraceSeconds(0),
+                                                        SchemaLoader.standardCFMD(ks1, "StandardLong1"),
+                                                        SchemaLoader.keysIndexCFMD(ks1, "Indexed1", true),
+                                                        SchemaLoader.keysIndexCFMD(ks1, "Indexed2", false),
+                                                        SchemaLoader.jdbcCFMD(ks1, "JdbcUtf8", UTF8Type.instance)
+                                                                    .addColumnDefinition(SchemaLoader.utf8Column(ks1, "JdbcUtf8")),
+                                                        SchemaLoader.jdbcCFMD(ks1, "JdbcLong", LongType.instance),
+                                                        SchemaLoader.jdbcCFMD(ks1, "JdbcBytes", BytesType.instance),
+                                                        SchemaLoader.jdbcCFMD(ks1, "JdbcAscii", AsciiType.instance),
+                                                        SchemaLoader.standardCFMD(ks1, "StandardLeveled")
+                                                                    .compaction(CompactionParams.lcs(leveledOptions)),
+                                                        SchemaLoader.standardCFMD(ks1, "legacyleveled")
+                                                                    .compaction(CompactionParams.lcs(leveledOptions)),
+                                                        SchemaLoader.standardCFMD(ks1, "StandardLowIndexInterval")
+                                                                    .minIndexInterval(8)
+                                                                    .maxIndexInterval(256)
+                                                                    .caching(CachingParams.CACHE_NOTHING))));
+
+        // Keyspace 2
+        keyspaces.add(KeyspaceMetadata.create(ks2,
+                                              KeyspaceParams.simple(1),
+                                              Tables.of(SchemaLoader.standardCFMD(ks2, "Standard1"),
+                                                        SchemaLoader.keysIndexCFMD(ks2, "Indexed1", true),
+                                                        SchemaLoader.compositeIndexCFMD(ks2, "Indexed2", true),
+                                                        SchemaLoader.compositeIndexCFMD(ks2, "Indexed3", true)
+                                                                    .gcGraceSeconds(0))));
+
+        // Keyspace 3
+        keyspaces.add(KeyspaceMetadata.create(ks3,
+                                              KeyspaceParams.simple(5),
+                                              Tables.of(SchemaLoader.standardCFMD(ks3, "Standard1"),
+                                                        SchemaLoader.keysIndexCFMD(ks3, "Indexed1", true))));
+
+        // Keyspace 4
+        keyspaces.add(KeyspaceMetadata.create(ks4,
+                                              KeyspaceParams.simple(3),
+                                              Tables.of(SchemaLoader.standardCFMD(ks4, "Standard1"))));
+
+        // Keyspace 5
+        keyspaces.add(KeyspaceMetadata.create(ks5,
+                                              KeyspaceParams.simple(2),
+                                              Tables.of(SchemaLoader.standardCFMD(ks5, "Standard1"))));
+
+        // Keyspace 6
+        keyspaces.add(KeyspaceMetadata.create(ks6,
+                                              KeyspaceParams.simple(1),
+                                              Tables.of(SchemaLoader.keysIndexCFMD(ks6, "Indexed1", true))));
+
+        // RowCacheSpace
+        keyspaces.add(KeyspaceMetadata.create(ks_rcs,
+                                              KeyspaceParams.simple(1),
+                                              Tables.of(SchemaLoader.standardCFMD(ks_rcs, "CFWithoutCache")
+                                                                    .caching(CachingParams.CACHE_NOTHING),
+                                                        SchemaLoader.standardCFMD(ks_rcs, "CachedCF")
+                                                                    .caching(CachingParams.CACHE_EVERYTHING),
+                                                        SchemaLoader.standardCFMD(ks_rcs, "CachedIntCF")
+                                                                    .caching(new CachingParams(true, 100)))));
+
+        keyspaces.add(KeyspaceMetadata.create(ks_nocommit,
+                                              KeyspaceParams.simpleTransient(1),
+                                              Tables.of(SchemaLoader.standardCFMD(ks_nocommit, "Standard1"))));
+
+        // PerRowSecondaryIndexTest
+        keyspaces.add(KeyspaceMetadata.create(ks_prsi,
+                                              KeyspaceParams.simple(1),
+                                              Tables.of(SchemaLoader.perRowIndexedCFMD(ks_prsi, "Indexed1"))));
+
+        // CQLKeyspace
+        keyspaces.add(KeyspaceMetadata.create(ks_cql,
+                                              KeyspaceParams.simple(1),
+                                              Tables.of(CFMetaData.compile("CREATE TABLE table1 ("
+                                                                           + "k int PRIMARY KEY,"
+                                                                           + "v1 text,"
+                                                                           + "v2 int"
+                                                                           + ')', ks_cql),
+
+                                                        CFMetaData.compile("CREATE TABLE table2 ("
+                                                                           + "k text,"
+                                                                           + "c text,"
+                                                                           + "v text,"
+                                                                           + "PRIMARY KEY (k, c))", ks_cql),
+
+                                                        CFMetaData.compile("CREATE TABLE foo ("
+                                                                           + "bar text, "
+                                                                           + "baz text, "
+                                                                           + "qux text, "
+                                                                           + "PRIMARY KEY(bar, baz) ) "
+                                                                           + "WITH COMPACT STORAGE", ks_cql),
+
+                                                        CFMetaData.compile("CREATE TABLE compact_pkonly ("
+                                                                           + "k int, "
+                                                                           + "c int, "
+                                                                           + "PRIMARY KEY (k, c)) "
+                                                                           + "WITH COMPACT STORAGE",
+                                                                           ks_cql),
+
+                                                        CFMetaData.compile("CREATE TABLE foofoo ("
+                                                                           + "bar text, "
+                                                                           + "baz text, "
+                                                                           + "qux text, "
+                                                                           + "quz text, "
+                                                                           + "foo text, "
+                                                                           + "PRIMARY KEY((bar, baz), qux, quz) ) "
+                                                                           + "WITH COMPACT STORAGE", ks_cql))));
+
+        // NTS keyspace
+        keyspaces.add(KeyspaceMetadata.create("nts", KeyspaceParams.nts("dc1", 1, "dc2", 2)));
+
+        keyspaces.add(keyspaceWithDroppedCollections());
+        keyspaces.add(keyspaceWithTriggers());
+        keyspaces.add(keyspaceWithUDTs());
+        keyspaces.add(keyspaceWithUDFs());
+        keyspaces.add(keyspaceWithUDFsAndUDTs());
+        keyspaces.add(keyspaceWithUDAs());
+        keyspaces.add(keyspaceWithUDAsAndUDTs());
+
+        return keyspaces.build();
+    }
+
+    private static KeyspaceMetadata keyspaceWithDroppedCollections()
+    {
+        String keyspace = KEYSPACE_PREFIX + "DroppedCollections";
+
+        CFMetaData table =
+            CFMetaData.compile("CREATE TABLE dropped_columns ("
+                               + "foo text,"
+                               + "bar text,"
+                               + "map1 map<text, text>,"
+                               + "map2 map<int, int>,"
+                               + "set1 set<ascii>,"
+                               + "list1 list<blob>,"
+                               + "PRIMARY KEY ((foo), bar))",
+                               keyspace);
+
+        String[] collectionColumnNames = { "map1", "map2", "set1", "list1" };
+        for (String name : collectionColumnNames)
+        {
+            ColumnDefinition column = table.getColumnDefinition(bytes(name));
+            table.recordColumnDrop(column, FBUtilities.timestampMicros(), false);
+            table.removeColumnDefinition(column);
+        }
+
+        return KeyspaceMetadata.create(keyspace, KeyspaceParams.simple(1), Tables.of(table));
+    }
+
+    private static KeyspaceMetadata keyspaceWithTriggers()
+    {
+        String keyspace = KEYSPACE_PREFIX + "Triggers";
+
+        Triggers.Builder triggers = Triggers.builder();
+        CFMetaData table = SchemaLoader.standardCFMD(keyspace, "WithTriggers");
+        for (int i = 0; i < 10; i++)
+            triggers.add(new TriggerMetadata("trigger" + i, "DummyTrigger" + i));
+        table.triggers(triggers.build());
+
+        return KeyspaceMetadata.create(keyspace, KeyspaceParams.simple(1), Tables.of(table));
+    }
+
+    private static KeyspaceMetadata keyspaceWithUDTs()
+    {
+        String keyspace = KEYSPACE_PREFIX + "UDTs";
+
+        UserType udt1 = new UserType(keyspace,
+                                     bytes("udt1"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col1")); add(bytes("col2")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(UTF8Type.instance); add(Int32Type.instance); }});
+
+        UserType udt2 = new UserType(keyspace,
+                                     bytes("udt2"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col3")); add(bytes("col4")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(BytesType.instance); add(BooleanType.instance); }});
+
+        UserType udt3 = new UserType(keyspace,
+                                     bytes("udt3"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col5")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(AsciiType.instance); }});
+
+        return KeyspaceMetadata.create(keyspace,
+                                       KeyspaceParams.simple(1),
+                                       Tables.none(),
+                                       Views.none(),
+                                       Types.of(udt1, udt2, udt3),
+                                       Functions.none());
+    }
+
+    private static KeyspaceMetadata keyspaceWithUDFs()
+    {
+        String keyspace = KEYSPACE_PREFIX + "UDFs";
+
+        UDFunction udf1 = UDFunction.create(new FunctionName(keyspace, "udf"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false), new ColumnIdentifier("col2", false)),
+                                            ImmutableList.of(BytesType.instance, Int32Type.instance),
+                                            LongType.instance,
+                                            false,
+                                            "java",
+                                            "return 42L;");
+
+        // an overload with the same name, not a typo
+        UDFunction udf2 = UDFunction.create(new FunctionName(keyspace, "udf"),
+                                            ImmutableList.of(new ColumnIdentifier("col3", false), new ColumnIdentifier("col4", false)),
+                                            ImmutableList.of(AsciiType.instance, LongType.instance),
+                                            Int32Type.instance,
+                                            true,
+                                            "java",
+                                            "return 42;");
+
+        UDFunction udf3 = UDFunction.create(new FunctionName(keyspace, "udf3"),
+                                            ImmutableList.of(new ColumnIdentifier("col4", false)),
+                                            ImmutableList.of(UTF8Type.instance),
+                                            BooleanType.instance,
+                                            false,
+                                            "java",
+                                            "return true;");
+
+        return KeyspaceMetadata.create(keyspace,
+                                       KeyspaceParams.simple(1),
+                                       Tables.none(),
+                                       Views.none(),
+                                       Types.none(),
+                                       Functions.of(udf1, udf2, udf3));
+    }
+
+    private static KeyspaceMetadata keyspaceWithUDAs()
+    {
+        String keyspace = KEYSPACE_PREFIX + "UDAs";
+
+        UDFunction udf1 = UDFunction.create(new FunctionName(keyspace, "udf1"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false), new ColumnIdentifier("col2", false)),
+                                            ImmutableList.of(Int32Type.instance, Int32Type.instance),
+                                            Int32Type.instance,
+                                            false,
+                                            "java",
+                                            "return 42;");
+
+        UDFunction udf2 = UDFunction.create(new FunctionName(keyspace, "udf2"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false), new ColumnIdentifier("col2", false)),
+                                            ImmutableList.of(LongType.instance, Int32Type.instance),
+                                            LongType.instance,
+                                            false,
+                                            "java",
+                                            "return 42L;");
+
+        UDFunction udf3 = UDFunction.create(new FunctionName(keyspace, "udf3"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false)),
+                                            ImmutableList.of(LongType.instance),
+                                            DoubleType.instance,
+                                            false,
+                                            "java",
+                                            "return 42d;");
+
+        Functions udfs = Functions.builder().add(udf1).add(udf2).add(udf3).build();
+
+        UDAggregate uda1 = UDAggregate.create(udfs, new FunctionName(keyspace, "uda1"),
+                                              ImmutableList.of(udf1.argTypes().get(1)),
+                                              udf1.returnType(),
+                                              udf1.name(),
+                                              null,
+                                              udf1.argTypes().get(0),
+                                              null
+        );
+
+        UDAggregate uda2 = UDAggregate.create(udfs, new FunctionName(keyspace, "uda2"),
+                                              ImmutableList.of(udf2.argTypes().get(1)),
+                                              udf3.returnType(),
+                                              udf2.name(),
+                                              udf3.name(),
+                                              udf2.argTypes().get(0),
+                                              LongType.instance.decompose(0L)
+        );
+
+        return KeyspaceMetadata.create(keyspace,
+                                       KeyspaceParams.simple(1),
+                                       Tables.none(),
+                                       Views.none(),
+                                       Types.none(),
+                                       Functions.of(udf1, udf2, udf3, uda1, uda2));
+    }
+
+    private static KeyspaceMetadata keyspaceWithUDFsAndUDTs()
+    {
+        String keyspace = KEYSPACE_PREFIX + "UDFUDTs";
+
+        UserType udt1 = new UserType(keyspace,
+                                     bytes("udt1"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col1")); add(bytes("col2")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(UTF8Type.instance); add(Int32Type.instance); }});
+
+        UserType udt2 = new UserType(keyspace,
+                                     bytes("udt2"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col1")); add(bytes("col2")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(ListType.getInstance(udt1, false)); add(Int32Type.instance); }});
+
+        UDFunction udf1 = UDFunction.create(new FunctionName(keyspace, "udf"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false), new ColumnIdentifier("col2", false)),
+                                            ImmutableList.of(udt1, udt2),
+                                            LongType.instance,
+                                            false,
+                                            "java",
+                                            "return 42L;");
+
+        // an overload with the same name, not a typo
+        UDFunction udf2 = UDFunction.create(new FunctionName(keyspace, "udf"),
+                                            ImmutableList.of(new ColumnIdentifier("col3", false), new ColumnIdentifier("col4", false)),
+                                            ImmutableList.of(AsciiType.instance, LongType.instance),
+                                            Int32Type.instance,
+                                            true,
+                                            "java",
+                                            "return 42;");
+
+        UDFunction udf3 = UDFunction.create(new FunctionName(keyspace, "udf3"),
+                                            ImmutableList.of(new ColumnIdentifier("col4", false)),
+                                            ImmutableList.of(new TupleType(Arrays.asList(udt1, udt2))),
+                                            BooleanType.instance,
+                                            false,
+                                            "java",
+                                            "return true;");
+
+        return KeyspaceMetadata.create(keyspace,
+                                       KeyspaceParams.simple(1),
+                                       Tables.none(),
+                                       Views.none(),
+                                       Types.of(udt1, udt2),
+                                       Functions.of(udf1, udf2, udf3));
+    }
+
+    private static KeyspaceMetadata keyspaceWithUDAsAndUDTs()
+    {
+        String keyspace = KEYSPACE_PREFIX + "UDAUDTs";
+
+        UserType udt1 = new UserType(keyspace,
+                                     bytes("udt1"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col1")); add(bytes("col2")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(UTF8Type.instance); add(Int32Type.instance); }});
+
+        UserType udt2 = new UserType(keyspace,
+                                     bytes("udt2"),
+                                     new ArrayList<ByteBuffer>() {{ add(bytes("col1")); add(bytes("col2")); }},
+                                     new ArrayList<AbstractType<?>>() {{ add(ListType.getInstance(udt1, false)); add(Int32Type.instance); }});
+
+        UDFunction udf1 = UDFunction.create(new FunctionName(keyspace, "udf1"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false), new ColumnIdentifier("col2", false)),
+                                            ImmutableList.of(udt1, udt2),
+                                            udt1,
+                                            false,
+                                            "java",
+                                            "return null;");
+
+        UDFunction udf2 = UDFunction.create(new FunctionName(keyspace, "udf2"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false), new ColumnIdentifier("col2", false)),
+                                            ImmutableList.of(udt2, udt1),
+                                            udt2,
+                                            false,
+                                            "java",
+                                            "return null;");
+
+        UDFunction udf3 = UDFunction.create(new FunctionName(keyspace, "udf3"),
+                                            ImmutableList.of(new ColumnIdentifier("col1", false)),
+                                            ImmutableList.of(udt2),
+                                            DoubleType.instance,
+                                            false,
+                                            "java",
+                                            "return 42d;");
+
+        Functions udfs = Functions.builder().add(udf1).add(udf2).add(udf3).build();
+
+        UDAggregate uda1 = UDAggregate.create(udfs, new FunctionName(keyspace, "uda1"),
+                                              ImmutableList.of(udf1.argTypes().get(1)),
+                                              udf1.returnType(),
+                                              udf1.name(),
+                                              null,
+                                              udf1.argTypes().get(0),
+                                              null
+        );
+
+        ByteBuffer twoNullEntries = ByteBuffer.allocate(8);
+        twoNullEntries.putInt(-1);
+        twoNullEntries.putInt(-1);
+        twoNullEntries.flip();
+        UDAggregate uda2 = UDAggregate.create(udfs, new FunctionName(keyspace, "uda2"),
+                                              ImmutableList.of(udf2.argTypes().get(1)),
+                                              udf3.returnType(),
+                                              udf2.name(),
+                                              udf3.name(),
+                                              udf2.argTypes().get(0),
+                                              twoNullEntries
+        );
+
+        return KeyspaceMetadata.create(keyspace,
+                                       KeyspaceParams.simple(1),
+                                       Tables.none(),
+                                       Views.none(),
+                                       Types.of(udt1, udt2),
+                                       Functions.of(udf1, udf2, udf3, uda1, uda2));
+    }
+
+    /*
+     * Serializing keyspaces
+     */
+
+    private static void legacySerializeKeyspace(KeyspaceMetadata keyspace)
+    {
+        makeLegacyCreateKeyspaceMutation(keyspace, TIMESTAMP).apply();
+        setLegacyIndexStatus(keyspace);
+    }
+
+    private static Mutation makeLegacyCreateKeyspaceMutation(KeyspaceMetadata keyspace, long timestamp)
+    {
+        // Note that because Keyspaces is a COMPACT TABLE, we're really only setting static columns internally and shouldn't set any clustering.
+        RowUpdateBuilder adder = new RowUpdateBuilder(SystemKeyspace.LegacyKeyspaces, timestamp, keyspace.name);
+
+        adder.add("durable_writes", keyspace.params.durableWrites)
+             .add("strategy_class", keyspace.params.replication.klass.getName())
+             .add("strategy_options", json(keyspace.params.replication.options));
+
+        Mutation mutation = adder.build();
+
+        keyspace.tables.forEach(table -> addTableToSchemaMutation(table, timestamp, true, mutation));
+        keyspace.types.forEach(type -> addTypeToSchemaMutation(type, timestamp, mutation));
+        keyspace.functions.udfs().forEach(udf -> addFunctionToSchemaMutation(udf, timestamp, mutation));
+        keyspace.functions.udas().forEach(uda -> addAggregateToSchemaMutation(uda, timestamp, mutation));
+
+        return mutation;
+    }
+
+    /*
+     * Serializing tables
+     */
+
+    private static void addTableToSchemaMutation(CFMetaData table, long timestamp, boolean withColumnsAndTriggers, Mutation mutation)
+    {
+        // For property that can be null (and can be changed), we insert tombstones, to make sure
+        // we don't keep a property the user has removed
+        RowUpdateBuilder adder = new RowUpdateBuilder(SystemKeyspace.LegacyColumnfamilies, timestamp, mutation)
+                                 .clustering(table.cfName);
+
+        adder.add("cf_id", table.cfId)
+             .add("type", table.isSuper() ? "Super" : "Standard");
+
+        if (table.isSuper())
+        {
+            adder.add("comparator", table.comparator.subtype(0).toString())
+                 .add("subcomparator", ((MapType)table.compactValueColumn().type).getKeysType().toString());
+        }
+        else
+        {
+            adder.add("comparator", LegacyLayout.makeLegacyComparator(table).toString());
+        }
+
+        adder.add("bloom_filter_fp_chance", table.params.bloomFilterFpChance)
+             .add("caching", cachingToString(table.params.caching))
+             .add("comment", table.params.comment)
+             .add("compaction_strategy_class", table.params.compaction.klass().getName())
+             .add("compaction_strategy_options", json(table.params.compaction.options()))
+             .add("compression_parameters", json(ThriftConversion.compressionParametersToThrift(table.params.compression)))
+             .add("default_time_to_live", table.params.defaultTimeToLive)
+             .add("gc_grace_seconds", table.params.gcGraceSeconds)
+             .add("key_validator", table.getKeyValidator().toString())
+             .add("local_read_repair_chance", table.params.dcLocalReadRepairChance)
+             .add("max_compaction_threshold", table.params.compaction.maxCompactionThreshold())
+             .add("max_index_interval", table.params.maxIndexInterval)
+             .add("memtable_flush_period_in_ms", table.params.memtableFlushPeriodInMs)
+             .add("min_compaction_threshold", table.params.compaction.minCompactionThreshold())
+             .add("min_index_interval", table.params.minIndexInterval)
+             .add("read_repair_chance", table.params.readRepairChance)
+             .add("speculative_retry", table.params.speculativeRetry.toString());
+
+        for (Map.Entry<ByteBuffer, CFMetaData.DroppedColumn> entry : table.getDroppedColumns().entrySet())
+        {
+            String name = UTF8Type.instance.getString(entry.getKey());
+            CFMetaData.DroppedColumn column = entry.getValue();
+            adder.addMapEntry("dropped_columns", name, column.droppedTime);
+        }
+
+        adder.add("is_dense", table.isDense());
+
+        adder.add("default_validator", table.makeLegacyDefaultValidator().toString());
+
+        if (withColumnsAndTriggers)
+        {
+            for (ColumnDefinition column : table.allColumns())
+                addColumnToSchemaMutation(table, column, timestamp, mutation);
+
+            for (TriggerMetadata trigger : table.getTriggers())
+                addTriggerToSchemaMutation(table, trigger, timestamp, mutation);
+        }
+
+        adder.build();
+    }
+
+    private static String cachingToString(CachingParams caching)
+    {
+        return format("{\"keys\":\"%s\", \"rows_per_partition\":\"%s\"}",
+                      caching.keysAsString(),
+                      caching.rowsPerPartitionAsString());
+    }
+
+    private static void addColumnToSchemaMutation(CFMetaData table, ColumnDefinition column, long timestamp, Mutation mutation)
+    {
+        // We need to special case pk-only dense tables. See CASSANDRA-9874.
+        String name = table.isDense() && column.kind == ColumnDefinition.Kind.REGULAR && column.type instanceof EmptyType
+                    ? ""
+                    : column.name.toString();
+
+        final RowUpdateBuilder adder = new RowUpdateBuilder(SystemKeyspace.LegacyColumns, timestamp, mutation).clustering(table.cfName, name);
+
+        adder.add("validator", column.type.toString())
+             .add("type", serializeKind(column.kind, table.isDense()))
+             .add("component_index", column.position());
+
+        Optional<IndexMetadata> index = findIndexForColumn(table.getIndexes(), table, column);
+        if (index.isPresent())
+        {
+            IndexMetadata i = index.get();
+            adder.add("index_name", i.name);
+            adder.add("index_type", i.kind.toString());
+            adder.add("index_options", json(i.options));
+        }
+        else
+        {
+            adder.add("index_name", null);
+            adder.add("index_type", null);
+            adder.add("index_options", null);
+        }
+
+        adder.build();
+    }
+
+    private static Optional<IndexMetadata> findIndexForColumn(Indexes indexes,
+                                                              CFMetaData table,
+                                                              ColumnDefinition column)
+    {
+        // makes the assumptions that the string option denoting the
+        // index targets can be parsed by CassandraIndex.parseTarget
+        // which should be true for any pre-3.0 index
+        for (IndexMetadata index : indexes)
+          if (CassandraIndex.parseTarget(table, index).left.equals(column))
+                return Optional.of(index);
+
+        return Optional.empty();
+    }
+
+    private static String serializeKind(ColumnDefinition.Kind kind, boolean isDense)
+    {
+        // For backward compatibility, we special case CLUSTERING and the case where the table is dense.
+        if (kind == ColumnDefinition.Kind.CLUSTERING)
+            return "clustering_key";
+
+        if (kind == ColumnDefinition.Kind.REGULAR && isDense)
+            return "compact_value";
+
+        return kind.toString().toLowerCase();
+    }
+
+    private static void addTriggerToSchemaMutation(CFMetaData table, TriggerMetadata trigger, long timestamp, Mutation mutation)
+    {
+        new RowUpdateBuilder(SystemKeyspace.LegacyTriggers, timestamp, mutation)
+            .clustering(table.cfName, trigger.name)
+            .addMapEntry("trigger_options", "class", trigger.classOption)
+            .build();
+    }
+
+    /*
+     * Serializing types
+     */
+
+    private static void addTypeToSchemaMutation(UserType type, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(SystemKeyspace.LegacyUsertypes, timestamp, mutation)
+                                 .clustering(type.getNameAsString());
+
+        adder.resetCollection("field_names")
+             .resetCollection("field_types");
+
+        for (int i = 0; i < type.size(); i++)
+        {
+            adder.addListEntry("field_names", type.fieldName(i))
+                 .addListEntry("field_types", type.fieldType(i).toString());
+        }
+
+        adder.build();
+    }
+
+    /*
+     * Serializing functions
+     */
+
+    private static void addFunctionToSchemaMutation(UDFunction function, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(SystemKeyspace.LegacyFunctions, timestamp, mutation)
+                                 .clustering(function.name().name, functionSignatureWithTypes(function));
+
+        adder.add("body", function.body())
+             .add("language", function.language())
+             .add("return_type", function.returnType().toString())
+             .add("called_on_null_input", function.isCalledOnNullInput());
+
+        adder.resetCollection("argument_names")
+             .resetCollection("argument_types");
+
+        for (int i = 0; i < function.argNames().size(); i++)
+        {
+            adder.addListEntry("argument_names", function.argNames().get(i).bytes)
+                 .addListEntry("argument_types", function.argTypes().get(i).toString());
+        }
+
+        adder.build();
+    }
+
+    /*
+     * Serializing aggregates
+     */
+
+    private static void addAggregateToSchemaMutation(UDAggregate aggregate, long timestamp, Mutation mutation)
+    {
+        RowUpdateBuilder adder = new RowUpdateBuilder(SystemKeyspace.LegacyAggregates, timestamp, mutation)
+                                 .clustering(aggregate.name().name, functionSignatureWithTypes(aggregate));
+
+        adder.resetCollection("argument_types");
+
+        adder.add("return_type", aggregate.returnType().toString())
+             .add("state_func", aggregate.stateFunction().name().name);
+
+        if (aggregate.stateType() != null)
+            adder.add("state_type", aggregate.stateType().toString());
+        if (aggregate.finalFunction() != null)
+            adder.add("final_func", aggregate.finalFunction().name().name);
+        if (aggregate.initialCondition() != null)
+            adder.add("initcond", aggregate.initialCondition());
+
+        for (AbstractType<?> argType : aggregate.argTypes())
+            adder.addListEntry("argument_types", argType.toString());
+
+        adder.build();
+    }
+
+    // We allow method overloads, so a function is not uniquely identified by its name only, but
+    // also by its argument types. To distinguish overloads of given function name in the schema
+    // we use a "signature" which is just a list of it's CQL argument types.
+    public static ByteBuffer functionSignatureWithTypes(AbstractFunction fun)
+    {
+        List<String> arguments =
+            fun.argTypes()
+               .stream()
+               .map(argType -> argType.asCQL3Type().toString())
+               .collect(Collectors.toList());
+
+        return ListType.getInstance(UTF8Type.instance, false).decompose(arguments);
+    }
+
+    private static void setLegacyIndexStatus(KeyspaceMetadata keyspace)
+    {
+        keyspace.tables.forEach(LegacySchemaMigratorTest::setLegacyIndexStatus);
+    }
+
+    private static void setLegacyIndexStatus(CFMetaData table)
+    {
+        table.getIndexes().forEach((index) -> setLegacyIndexStatus(table.ksName, table.cfName, index));
+    }
+
+    private static void setLegacyIndexStatus(String keyspace, String table, IndexMetadata index)
+    {
+        SystemKeyspace.setIndexBuilt(keyspace, table + '.' + index.name);
+    }
+
+    private static void verifyIndexBuildStatus(KeyspaceMetadata keyspace)
+    {
+        keyspace.tables.forEach(LegacySchemaMigratorTest::verifyIndexBuildStatus);
+    }
+
+    private static void verifyIndexBuildStatus(CFMetaData table)
+    {
+        table.getIndexes().forEach(index -> verifyIndexBuildStatus(table.ksName, table.cfName, index));
+    }
+
+    private static void verifyIndexBuildStatus(String keyspace, String table, IndexMetadata index)
+    {
+        assertFalse(SystemKeyspace.isIndexBuilt(keyspace, table + '.' + index.name));
+        assertTrue(SystemKeyspace.isIndexBuilt(keyspace, index.name));
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java b/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java
new file mode 100644
index 0000000..f76fc4f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java

@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableMap;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.thrift.CfDef;
+import org.apache.cassandra.thrift.ColumnDef;
+import org.apache.cassandra.thrift.IndexType;
+import org.apache.cassandra.thrift.ThriftConversion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class SchemaKeyspaceTest
+{
+    private static final String KEYSPACE1 = "CFMetaDataTest1";
+    private static final String CF_STANDARD1 = "Standard1";
+
+    private static final List<ColumnDef> columnDefs = new ArrayList<>();
+
+    static
+    {
+        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col1"), AsciiType.class.getCanonicalName())
+                                    .setIndex_name("col1Index")
+                                    .setIndex_type(IndexType.KEYS));
+
+        columnDefs.add(new ColumnDef(ByteBufferUtil.bytes("col2"), UTF8Type.class.getCanonicalName())
+                                    .setIndex_name("col2Index")
+                                    .setIndex_type(IndexType.KEYS));
+    }
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+    }
+
+    @Test
+    public void testThriftConversion() throws Exception
+    {
+        CfDef cfDef = new CfDef().setDefault_validation_class(AsciiType.class.getCanonicalName())
+                                 .setComment("Test comment")
+                                 .setColumn_metadata(columnDefs)
+                                 .setKeyspace(KEYSPACE1)
+                                 .setName(CF_STANDARD1);
+
+        // convert Thrift to CFMetaData
+        CFMetaData cfMetaData = ThriftConversion.fromThrift(cfDef);
+
+        CfDef thriftCfDef = new CfDef();
+        thriftCfDef.keyspace = KEYSPACE1;
+        thriftCfDef.name = CF_STANDARD1;
+        thriftCfDef.default_validation_class = cfDef.default_validation_class;
+        thriftCfDef.comment = cfDef.comment;
+        thriftCfDef.column_metadata = new ArrayList<>();
+        for (ColumnDef columnDef : columnDefs)
+        {
+            ColumnDef c = new ColumnDef();
+            c.name = ByteBufferUtil.clone(columnDef.name);
+            c.validation_class = columnDef.getValidation_class();
+            c.index_name = columnDef.getIndex_name();
+            c.index_type = IndexType.KEYS;
+            thriftCfDef.column_metadata.add(c);
+        }
+
+        CfDef converted = ThriftConversion.toThrift(cfMetaData);
+
+        assertEquals(thriftCfDef.keyspace, converted.keyspace);
+        assertEquals(thriftCfDef.name, converted.name);
+        assertEquals(thriftCfDef.default_validation_class, converted.default_validation_class);
+        assertEquals(thriftCfDef.comment, converted.comment);
+        assertEquals(new HashSet<>(thriftCfDef.column_metadata), new HashSet<>(converted.column_metadata));
+    }
+
+    @Test
+    public void testConversionsInverses() throws Exception
+    {
+        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        {
+            for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+            {
+                CFMetaData cfm = cfs.metadata;
+                if (!cfm.isThriftCompatible())
+                    continue;
+
+                checkInverses(cfm);
+
+                // Testing with compression to catch #3558
+                CFMetaData withCompression = cfm.copy();
+                withCompression.compression(CompressionParams.snappy(32768));
+                checkInverses(withCompression);
+            }
+        }
+    }
+
+    @Test
+    public void testExtensions() throws IOException
+    {
+        String keyspace = "SandBox";
+
+        createTable(keyspace, "CREATE TABLE test (a text primary key, b int, c int)");
+
+        CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, "test");
+        assertTrue("extensions should be empty", metadata.params.extensions.isEmpty());
+
+        ImmutableMap<String, ByteBuffer> extensions = ImmutableMap.of("From ... with Love",
+                                                                      ByteBuffer.wrap(new byte[]{0, 0, 7}));
+
+        CFMetaData copy = metadata.copy().extensions(extensions);
+
+        updateTable(keyspace, metadata, copy);
+
+        metadata = Schema.instance.getCFMetaData(keyspace, "test");
+        assertEquals(extensions, metadata.params.extensions);
+    }
+
+    private static void updateTable(String keyspace, CFMetaData oldTable, CFMetaData newTable)
+    {
+        KeyspaceMetadata ksm = Schema.instance.getKeyspaceInstance(keyspace).getMetadata();
+        Mutation mutation = SchemaKeyspace.makeUpdateTableMutation(ksm, oldTable, newTable, FBUtilities.timestampMicros());
+        SchemaKeyspace.mergeSchema(Collections.singleton(mutation));
+    }
+
+    private static void createTable(String keyspace, String cql)
+    {
+        CFMetaData table = CFMetaData.compile(cql, keyspace);
+
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(keyspace, KeyspaceParams.simple(1), Tables.of(table));
+        Mutation mutation = SchemaKeyspace.makeCreateTableMutation(ksm, table, FBUtilities.timestampMicros());
+        SchemaKeyspace.mergeSchema(Collections.singleton(mutation));
+    }
+
+    private static void checkInverses(CFMetaData cfm) throws Exception
+    {
+        KeyspaceMetadata keyspace = Schema.instance.getKSMetaData(cfm.ksName);
+
+        // Test thrift conversion
+        CFMetaData before = cfm;
+        CFMetaData after = ThriftConversion.fromThriftForUpdate(ThriftConversion.toThrift(before), before);
+        assert before.equals(after) : String.format("%n%s%n!=%n%s", before, after);
+
+        // Test schema conversion
+        Mutation rm = SchemaKeyspace.makeCreateTableMutation(keyspace, cfm, FBUtilities.timestampMicros());
+        PartitionUpdate serializedCf = rm.getPartitionUpdate(Schema.instance.getId(SchemaKeyspace.NAME, SchemaKeyspace.TABLES));
+        PartitionUpdate serializedCD = rm.getPartitionUpdate(Schema.instance.getId(SchemaKeyspace.NAME, SchemaKeyspace.COLUMNS));
+
+        UntypedResultSet.Row tableRow = QueryProcessor.resultify(String.format("SELECT * FROM %s.%s", SchemaKeyspace.NAME, SchemaKeyspace.TABLES),
+                                                                 UnfilteredRowIterators.filter(serializedCf.unfilteredIterator(), FBUtilities.nowInSeconds()))
+                                                      .one();
+        TableParams params = SchemaKeyspace.createTableParamsFromRow(tableRow);
+
+        UntypedResultSet columnsRows = QueryProcessor.resultify(String.format("SELECT * FROM %s.%s", SchemaKeyspace.NAME, SchemaKeyspace.COLUMNS),
+                                                                UnfilteredRowIterators.filter(serializedCD.unfilteredIterator(), FBUtilities.nowInSeconds()));
+        Set<ColumnDefinition> columns = new HashSet<>();
+        for (UntypedResultSet.Row row : columnsRows)
+            columns.add(SchemaKeyspace.createColumnFromRow(row, Types.none()));
+
+        assertEquals(cfm.params, params);
+        assertEquals(new HashSet<>(cfm.allColumns()), columns);
+    }
+
+    @Test(expected = SchemaKeyspace.MissingColumns.class)
+    public void testSchemaNoPartition()
+    {
+        String testKS = "test_schema_no_partition";
+        String testTable = "invalid_table";
+        SchemaLoader.createKeyspace(testKS,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(testKS, testTable));
+        // Delete partition column in the schema
+        String query = String.format("DELETE FROM %s.%s WHERE keyspace_name=? and table_name=? and column_name=?", SchemaKeyspace.NAME, SchemaKeyspace.COLUMNS);
+        executeOnceInternal(query, testKS, testTable, "key");
+        SchemaKeyspace.fetchNonSystemKeyspaces();
+    }
+
+    @Test(expected = SchemaKeyspace.MissingColumns.class)
+    public void testSchemaNoColumn()
+    {
+        String testKS = "test_schema_no_Column";
+        String testTable = "invalid_table";
+        SchemaLoader.createKeyspace(testKS,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(testKS, testTable));
+        // Delete all colmns in the schema
+        String query = String.format("DELETE FROM %s.%s WHERE keyspace_name=? and table_name=?", SchemaKeyspace.NAME, SchemaKeyspace.COLUMNS);
+        executeOnceInternal(query, testKS, testTable);
+        SchemaKeyspace.fetchNonSystemKeyspaces();
+    }
+}

diff --git a/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java b/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java
index 03a25c6..adcd684 100644
--- a/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java
+++ b/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java

@@ -22,29 +22,26 @@
 import java.util.*;
 import java.util.concurrent.ExecutionException;
 
-import com.google.common.base.Predicate;
 import com.google.common.collect.Sets;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.Refs;
 
@@ -69,8 +66,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE5,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(2),
+                                    KeyspaceParams.simple(2),
                                     SchemaLoader.standardCFMD(KEYSPACE5, CF_COUNTER),
                                     SchemaLoader.standardCFMD(KEYSPACE5, CF_STANDARD1));
     }
@@ -90,8 +86,8 @@
 
         TokenMetadata tmd = StorageService.instance.getTokenMetadata();
         tmd.clearUnsafe();
-        StorageService.instance.setTokens(Collections.singleton(StorageService.getPartitioner().getRandomToken()));
-        tmd.updateNormalToken(StorageService.getPartitioner().getMinimumToken(), REMOTE);
+        StorageService.instance.setTokens(Collections.singleton(tmd.partitioner.getRandomToken()));
+        tmd.updateNormalToken(tmd.partitioner.getMinimumToken(), REMOTE);
         assert tmd.isMember(REMOTE);
     }
 
@@ -223,7 +219,7 @@
         for (int i = 1; i <= max; i++)
         {
             InetAddress endpoint = InetAddress.getByName("127.0.0." + i);
-            tmd.updateNormalToken(StorageService.getPartitioner().getRandomToken(), endpoint);
+            tmd.updateNormalToken(tmd.partitioner.getRandomToken(), endpoint);
             endpoints.add(endpoint);
         }
         return endpoints;
@@ -233,10 +229,10 @@
     public void testGetActiveRepairedSSTableRefs()
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
-        Set<SSTableReader> original = store.getUnrepairedSSTables();
+        Set<SSTableReader> original = store.getLiveSSTables();
 
         UUID prsId = UUID.randomUUID();
-        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), null, true, false);
+        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), null, true, 0, false);
         ActiveRepairService.ParentRepairSession prs = ActiveRepairService.instance.getParentRepairSession(prsId);
         prs.markSSTablesRepairing(store.metadata.cfId, prsId);
 
@@ -251,7 +247,7 @@
         Iterator<SSTableReader> it = newLiveSet.iterator();
         final SSTableReader removed = it.next();
         it.remove();
-        store.getTracker().dropSSTables(new Predicate<SSTableReader>()
+        store.getTracker().dropSSTables(new com.google.common.base.Predicate<SSTableReader>()
         {
             public boolean apply(SSTableReader reader)
             {
@@ -271,9 +267,9 @@
     public void testAddingMoreSSTables()
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
-        Set<SSTableReader> original = store.getUnrepairedSSTables();
+        Set<SSTableReader> original = Sets.newHashSet(store.select(View.select(SSTableSet.CANONICAL, (s) -> !s.isRepaired())).sstables);
         UUID prsId = UUID.randomUUID();
-        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), null, true, true);
+        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), null, true, System.currentTimeMillis(), true);
         ActiveRepairService.ParentRepairSession prs = ActiveRepairService.instance.getParentRepairSession(prsId);
         prs.markSSTablesRepairing(store.metadata.cfId, prsId);
         try (Refs<SSTableReader> refs = prs.getActiveRepairedSSTableRefsForAntiCompaction(store.metadata.cfId, prsId))
@@ -286,7 +282,7 @@
         try
         {
             UUID newPrsId = UUID.randomUUID();
-            ActiveRepairService.instance.registerParentRepairSession(newPrsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), null, true, true);
+            ActiveRepairService.instance.registerParentRepairSession(newPrsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), null, true, System.currentTimeMillis(), true);
             ActiveRepairService.instance.getParentRepairSession(newPrsId).markSSTablesRepairing(store.metadata.cfId, newPrsId);
         }
         catch (Throwable t)
@@ -307,12 +303,12 @@
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
         UUID prsId = UUID.randomUUID();
-        Set<SSTableReader> original = store.getUnrepairedSSTables();
-        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.partitioner.getMinimumToken(), store.partitioner.getMinimumToken())), true, true);
+        Set<SSTableReader> original = Sets.newHashSet(store.select(View.select(SSTableSet.CANONICAL, (s) -> !s.isRepaired())).sstables);
+        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.getPartitioner().getMinimumToken(), store.getPartitioner().getMinimumToken())), true, System.currentTimeMillis(), true);
         ActiveRepairService.instance.getParentRepairSession(prsId).maybeSnapshot(store.metadata.cfId, prsId);
 
         UUID prsId2 = UUID.randomUUID();
-        ActiveRepairService.instance.registerParentRepairSession(prsId2, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.partitioner.getMinimumToken(), store.partitioner.getMinimumToken())), true, true);
+        ActiveRepairService.instance.registerParentRepairSession(prsId2, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.getPartitioner().getMinimumToken(), store.getPartitioner().getMinimumToken())), true, System.currentTimeMillis(), true);
         createSSTables(store, 2);
         ActiveRepairService.instance.getParentRepairSession(prsId).maybeSnapshot(store.metadata.cfId, prsId);
         try (Refs<SSTableReader> refs = ActiveRepairService.instance.getParentRepairSession(prsId).getActiveRepairedSSTableRefsForAntiCompaction(store.metadata.cfId, prsId))
@@ -331,13 +327,13 @@
     public void testSnapshotMultipleRepairs()
     {
         ColumnFamilyStore store = prepareColumnFamilyStore();
-        Set<SSTableReader> original = store.getUnrepairedSSTables();
+        Set<SSTableReader> original = Sets.newHashSet(store.select(View.select(SSTableSet.CANONICAL, (s) -> !s.isRepaired())).sstables);
         UUID prsId = UUID.randomUUID();
-        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.partitioner.getMinimumToken(), store.partitioner.getMinimumToken())), true, true);
+        ActiveRepairService.instance.registerParentRepairSession(prsId, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.getPartitioner().getMinimumToken(), store.getPartitioner().getMinimumToken())), true, System.currentTimeMillis(), true);
         ActiveRepairService.instance.getParentRepairSession(prsId).maybeSnapshot(store.metadata.cfId, prsId);
 
         UUID prsId2 = UUID.randomUUID();
-        ActiveRepairService.instance.registerParentRepairSession(prsId2, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.partitioner.getMinimumToken(), store.partitioner.getMinimumToken())), true, true);
+        ActiveRepairService.instance.registerParentRepairSession(prsId2, FBUtilities.getBroadcastAddress(), Collections.singletonList(store), Collections.singleton(new Range<>(store.getPartitioner().getMinimumToken(), store.getPartitioner().getMinimumToken())), true, System.currentTimeMillis(), true);
         boolean exception = false;
         try
         {
@@ -369,14 +365,14 @@
         long timestamp = System.currentTimeMillis();
         for (int i = 0; i < count; i++)
         {
-            DecoratedKey key = Util.dk(Integer.toString(i));
-            Mutation rm = new Mutation(KEYSPACE5, key.getKey());
             for (int j = 0; j < 10; j++)
-                rm.add("Standard1", Util.cellname(Integer.toString(j)),
-                       ByteBufferUtil.EMPTY_BYTE_BUFFER,
-                       timestamp,
-                       0);
-            rm.apply();
+            {
+                new RowUpdateBuilder(cfs.metadata, timestamp, Integer.toString(j))
+                .clustering("c")
+                .add("val", "val")
+                .build()
+                .applyUnsafe();
+            }
             cfs.forceBlockingFlush();
         }
     }

diff --git a/test/unit/org/apache/cassandra/service/BatchlogEndpointFilterTest.java b/test/unit/org/apache/cassandra/service/BatchlogEndpointFilterTest.java
deleted file mode 100644
index 3a19b75..0000000
--- a/test/unit/org/apache/cassandra/service/BatchlogEndpointFilterTest.java
+++ /dev/null

@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service;
-
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.util.Collection;
-import java.util.HashSet;
-
-import org.junit.Test;
-import org.junit.matchers.JUnitMatchers;
-
-import com.google.common.collect.ImmutableMultimap;
-import com.google.common.collect.Multimap;
-
-import org.apache.cassandra.db.BatchlogManager;
-
-import static org.junit.Assert.assertThat;
-import static org.hamcrest.CoreMatchers.is;
-
-public class BatchlogEndpointFilterTest
-{
-    private static final String LOCAL = "local";
-
-    @Test
-    public void shouldSelect2hostsFromNonLocalRacks() throws UnknownHostException
-    {
-        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
-                .put(LOCAL, InetAddress.getByName("0"))
-                .put(LOCAL, InetAddress.getByName("00"))
-                .put("1", InetAddress.getByName("1"))
-                .put("1", InetAddress.getByName("11"))
-                .put("2", InetAddress.getByName("2"))
-                .put("2", InetAddress.getByName("22"))
-                .build();
-        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
-        assertThat(result.size(), is(2));
-        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("11")));
-        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("22")));
-    }
-
-    @Test
-    public void shouldSelectHostFromLocal() throws UnknownHostException
-    {
-        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
-                .put(LOCAL, InetAddress.getByName("0"))
-                .put(LOCAL, InetAddress.getByName("00"))
-                .put("1", InetAddress.getByName("1"))
-                .build();
-        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
-        assertThat(result.size(), is(2));
-        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("1")));
-        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("0")));
-    }
-
-    @Test
-    public void shouldReturnAsIsIfNoEnoughEndpoints() throws UnknownHostException
-    {
-        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
-                .put(LOCAL, InetAddress.getByName("0"))
-                .build();
-        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
-        assertThat(result.size(), is(1));
-        assertThat(result, JUnitMatchers.hasItem(InetAddress.getByName("0")));
-    }
-
-    @Test
-    public void shouldSelectTwoRandomHostsFromSingleOtherRack() throws UnknownHostException
-    {
-        Multimap<String, InetAddress> endpoints = ImmutableMultimap.<String, InetAddress> builder()
-                .put(LOCAL, InetAddress.getByName("0"))
-                .put(LOCAL, InetAddress.getByName("00"))
-                .put("1", InetAddress.getByName("1"))
-                .put("1", InetAddress.getByName("11"))
-                .put("1", InetAddress.getByName("111"))
-                .build();
-        Collection<InetAddress> result = new TestEndpointFilter(LOCAL, endpoints).filter();
-        // result should contain random two distinct values
-        assertThat(new HashSet<>(result).size(), is(2));
-    }
-
-    private static class TestEndpointFilter extends BatchlogManager.EndpointFilter
-    {
-        public TestEndpointFilter(String localRack, Multimap<String, InetAddress> endpoints)
-        {
-            super(localRack, endpoints);
-        }
-
-        @Override
-        protected boolean isValid(InetAddress input)
-        {
-            // We will use always alive non-localhost endpoints
-            return true;
-        }
-
-        @Override
-        protected int getRandomInt(int bound)
-        {
-            // We don't need random behavior here
-            return bound - 1;
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
index d22a8f6..cf14d55 100644
--- a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
+++ b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java

@@ -18,30 +18,28 @@
 package org.apache.cassandra.service;
 
 import org.apache.commons.lang3.StringUtils;
+
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.transport.Message;
 import org.apache.cassandra.transport.Server;
 import org.apache.cassandra.transport.SimpleClient;
 import org.apache.cassandra.transport.messages.QueryMessage;
 
 import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertNull;
+import static org.junit.Assert.assertNull;
 
 public class ClientWarningsTest extends CQLTester
 {
     @BeforeClass
     public static void setUp()
     {
-        DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
-
         requireNetwork();
         DatabaseDescriptor.setBatchSizeWarnThresholdInKB(1);
     }
@@ -126,7 +124,7 @@
     {
         createTable("CREATE TABLE %s (pk int PRIMARY KEY, v text)");
 
-        try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, Server.VERSION_2))
+        try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, Server.VERSION_3))
         {
             client.connect(false);
 

diff --git a/test/unit/org/apache/cassandra/service/DataResolverTest.java b/test/unit/org/apache/cassandra/service/DataResolverTest.java
new file mode 100644
index 0000000..65e18ce
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/DataResolverTest.java

@@ -0,0 +1,1112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.service;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Sets;
+import org.junit.*;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.marshal.ByteType;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.net.*;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.Util.assertClustering;
+import static org.apache.cassandra.Util.assertColumn;
+import static org.apache.cassandra.Util.assertColumns;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.apache.cassandra.db.RangeTombstone.Bound.Kind;
+
+public class DataResolverTest
+{
+    public static final String KEYSPACE1 = "DataResolverTest";
+    public static final String CF_STANDARD = "Standard1";
+    public static final String CF_COLLECTION = "Collection1";
+
+    // counter to generate the last byte of the respondent's address in a ReadResponse message
+    private int addressSuffix = 10;
+
+    private DecoratedKey dk;
+    private Keyspace ks;
+    private ColumnFamilyStore cfs;
+    private ColumnFamilyStore cfs2;
+    private CFMetaData cfm;
+    private CFMetaData cfm2;
+    private ColumnDefinition m;
+    private int nowInSec;
+    private ReadCommand command;
+    private MessageRecorder messageRecorder;
+
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        CFMetaData cfMetadata = CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD)
+                                                  .addPartitionKey("key", BytesType.instance)
+                                                  .addClusteringColumn("col1", AsciiType.instance)
+                                                  .addRegularColumn("c1", AsciiType.instance)
+                                                  .addRegularColumn("c2", AsciiType.instance)
+                                                  .addRegularColumn("one", AsciiType.instance)
+                                                  .addRegularColumn("two", AsciiType.instance)
+                                                  .build();
+
+        CFMetaData cfMetaData2 = CFMetaData.Builder.create(KEYSPACE1, CF_COLLECTION)
+                                                   .addPartitionKey("k", ByteType.instance)
+                                                   .addRegularColumn("m", MapType.getInstance(IntegerType.instance, IntegerType.instance, true))
+                                                   .build();
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    cfMetadata, cfMetaData2);
+    }
+
+    @Before
+    public void setup()
+    {
+        dk = Util.dk("key1");
+        ks = Keyspace.open(KEYSPACE1);
+        cfs = ks.getColumnFamilyStore(CF_STANDARD);
+        cfm = cfs.metadata;
+        cfs2 = ks.getColumnFamilyStore(CF_COLLECTION);
+        cfm2 = cfs2.metadata;
+        m = cfm2.getColumnDefinition(new ColumnIdentifier("m", false));
+
+        nowInSec = FBUtilities.nowInSeconds();
+        command = Util.cmd(cfs, dk).withNowInSeconds(nowInSec).build();
+    }
+
+    @Before
+    public void injectMessageSink()
+    {
+        // install an IMessageSink to capture all messages
+        // so we can inspect them during tests
+        messageRecorder = new MessageRecorder();
+        MessagingService.instance().addMessageSink(messageRecorder);
+    }
+
+    @After
+    public void removeMessageSink()
+    {
+        // should be unnecessary, but good housekeeping
+        MessagingService.instance().clearMessageSinks();
+    }
+
+    /**
+     * Checks that the provided data resolver has the expected number of repair futures created.
+     * This method also "release" those future by faking replica responses to those repair, which is necessary or
+     * every test would timeout when closing the result of resolver.resolve(), since it waits on those futures.
+     */
+    private void assertRepairFuture(DataResolver resolver, int expectedRepairs)
+    {
+        assertEquals(expectedRepairs, resolver.repairResults.size());
+
+        // Signal all future. We pass a completely fake response message, but it doesn't matter as we just want
+        // AsyncOneResponse to signal success, and it only cares about a non-null MessageIn (it collects the payload).
+        for (AsyncOneResponse<?> future : resolver.repairResults)
+            future.response(MessageIn.create(null, null, null, null, -1));
+    }
+
+    @Test
+    public void testResolveNewerSingleRow() throws UnknownHostException
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1")
+                                                                                                       .add("c1", "v1")
+                                                                                                       .buildUpdate())));
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).clustering("1")
+                                                                                                       .add("c1", "v2")
+                                                                                                       .buildUpdate())));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "c1");
+                assertColumn(cfm, row, "c1", "v2", 1);
+            }
+            assertRepairFuture(resolver, 1);
+        }
+
+        assertEquals(1, messageRecorder.sent.size());
+        // peer 1 just needs to repair with the row from peer 2
+        MessageOut msg = getSentMessage(peer1);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoDeletions(msg);
+        assertRepairContainsColumn(msg, "1", "c1", "v2", 1);
+    }
+
+    @Test
+    public void testResolveDisjointSingleRow()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1")
+                                                                                                       .add("c1", "v1")
+                                                                                                       .buildUpdate())));
+
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).clustering("1")
+                                                                                                       .add("c2", "v2")
+                                                                                                       .buildUpdate())));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "c1", "c2");
+                assertColumn(cfm, row, "c1", "v1", 0);
+                assertColumn(cfm, row, "c2", "v2", 1);
+            }
+            assertRepairFuture(resolver, 2);
+        }
+
+        assertEquals(2, messageRecorder.sent.size());
+        // each peer needs to repair with each other's column
+        MessageOut msg = getSentMessage(peer1);
+        assertRepairMetadata(msg);
+        assertRepairContainsColumn(msg, "1", "c2", "v2", 1);
+
+        msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsColumn(msg, "1", "c1", "v1", 0);
+    }
+
+    @Test
+    public void testResolveDisjointMultipleRows() throws UnknownHostException
+    {
+
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1")
+                                                                                                       .add("c1", "v1")
+                                                                                                       .buildUpdate())));
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).clustering("2")
+                                                                                                       .add("c2", "v2")
+                                                                                                       .buildUpdate())));
+
+        try (PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = data.next())
+            {
+                // We expect the resolved superset to contain both rows
+                Row row = rows.next();
+                assertClustering(cfm, row, "1");
+                assertColumns(row, "c1");
+                assertColumn(cfm, row, "c1", "v1", 0);
+
+                row = rows.next();
+                assertClustering(cfm, row, "2");
+                assertColumns(row, "c2");
+                assertColumn(cfm, row, "c2", "v2", 1);
+
+                assertFalse(rows.hasNext());
+                assertFalse(data.hasNext());
+            }
+            assertRepairFuture(resolver, 2);
+        }
+
+        assertEquals(2, messageRecorder.sent.size());
+        // each peer needs to repair the row from the other
+        MessageOut msg = getSentMessage(peer1);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoDeletions(msg);
+        assertRepairContainsColumn(msg, "2", "c2", "v2", 1);
+
+        msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoDeletions(msg);
+        assertRepairContainsColumn(msg, "1", "c1", "v1", 0);
+    }
+
+    @Test
+    public void testResolveDisjointMultipleRowsWithRangeTombstones()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 4);
+
+        RangeTombstone tombstone1 = tombstone("1", "11", 1, nowInSec);
+        RangeTombstone tombstone2 = tombstone("3", "31", 1, nowInSec);
+        PartitionUpdate update =new RowUpdateBuilder(cfm, nowInSec, 1L, dk).addRangeTombstone(tombstone1)
+                                                                                  .addRangeTombstone(tombstone2)
+                                                                                  .buildUpdate();
+
+        InetAddress peer1 = peer();
+        UnfilteredPartitionIterator iter1 = iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).addRangeTombstone(tombstone1)
+                                                                                  .addRangeTombstone(tombstone2)
+                                                                                  .buildUpdate());
+        resolver.preprocess(readResponseMessage(peer1, iter1));
+        // not covered by any range tombstone
+        InetAddress peer2 = peer();
+        UnfilteredPartitionIterator iter2 = iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("0")
+                                                                                  .add("c1", "v0")
+                                                                                  .buildUpdate());
+        resolver.preprocess(readResponseMessage(peer2, iter2));
+        // covered by a range tombstone
+        InetAddress peer3 = peer();
+        UnfilteredPartitionIterator iter3 = iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("10")
+                                                                                  .add("c2", "v1")
+                                                                                  .buildUpdate());
+        resolver.preprocess(readResponseMessage(peer3, iter3));
+        // range covered by rt, but newer
+        InetAddress peer4 = peer();
+        UnfilteredPartitionIterator iter4 = iter(new RowUpdateBuilder(cfm, nowInSec, 2L, dk).clustering("3")
+                                                                                  .add("one", "A")
+                                                                                  .buildUpdate());
+        resolver.preprocess(readResponseMessage(peer4, iter4));
+        try (PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = data.next())
+            {
+                Row row = rows.next();
+                assertClustering(cfm, row, "0");
+                assertColumns(row, "c1");
+                assertColumn(cfm, row, "c1", "v0", 0);
+
+                row = rows.next();
+                assertClustering(cfm, row, "3");
+                assertColumns(row, "one");
+                assertColumn(cfm, row, "one", "A", 2);
+
+                assertFalse(rows.hasNext());
+            }
+            assertRepairFuture(resolver, 4);
+        }
+
+        assertEquals(4, messageRecorder.sent.size());
+        // peer1 needs the rows from peers 2 and 4
+        MessageOut msg = getSentMessage(peer1);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoDeletions(msg);
+        assertRepairContainsColumn(msg, "0", "c1", "v0", 0);
+        assertRepairContainsColumn(msg, "3", "one", "A", 2);
+
+        // peer2 needs to get the row from peer4 and the RTs
+        msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, null, tombstone1, tombstone2);
+        assertRepairContainsColumn(msg, "3", "one", "A", 2);
+
+        // peer 3 needs both rows and the RTs
+        msg = getSentMessage(peer3);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, null, tombstone1, tombstone2);
+        assertRepairContainsColumn(msg, "0", "c1", "v0", 0);
+        assertRepairContainsColumn(msg, "3", "one", "A", 2);
+
+        // peer4 needs the row from peer2  and the RTs
+        msg = getSentMessage(peer4);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, null, tombstone1, tombstone2);
+        assertRepairContainsColumn(msg, "0", "c1", "v0", 0);
+    }
+
+    @Test
+    public void testResolveWithOneEmpty()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).clustering("1")
+                                                                                                       .add("c2", "v2")
+                                                                                                       .buildUpdate())));
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, EmptyIterators.unfilteredPartition(cfm, false)));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "c2");
+                assertColumn(cfm, row, "c2", "v2", 1);
+            }
+            assertRepairFuture(resolver, 1);
+        }
+
+        assertEquals(1, messageRecorder.sent.size());
+        // peer 2 needs the row from peer 1
+        MessageOut msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoDeletions(msg);
+        assertRepairContainsColumn(msg, "1", "c2", "v2", 1);
+    }
+
+    @Test
+    public void testResolveWithBothEmpty()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        resolver.preprocess(readResponseMessage(peer(), EmptyIterators.unfilteredPartition(cfm, false)));
+        resolver.preprocess(readResponseMessage(peer(), EmptyIterators.unfilteredPartition(cfm, false)));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            assertRepairFuture(resolver, 0);
+        }
+
+        assertTrue(messageRecorder.sent.isEmpty());
+    }
+
+    @Test
+    public void testResolveDeleted()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        // one response with columns timestamped before a delete in another response
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1")
+                                                                                                       .add("one", "A")
+                                                                                                       .buildUpdate())));
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, fullPartitionDelete(cfm, dk, 1, nowInSec)));
+
+        try (PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            assertRepairFuture(resolver, 1);
+        }
+
+        // peer1 should get the deletion from peer2
+        assertEquals(1, messageRecorder.sent.size());
+        MessageOut msg = getSentMessage(peer1);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, new DeletionTime(1, nowInSec));
+        assertRepairContainsNoColumns(msg);
+    }
+
+    @Test
+    public void testResolveMultipleDeleted()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 4);
+        // deletes and columns with interleaved timestamp, with out of order return sequence
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, fullPartitionDelete(cfm, dk, 0, nowInSec)));
+        // these columns created after the previous deletion
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).clustering("1")
+                                                                                                       .add("one", "A")
+                                                                                                       .add("two", "A")
+                                                                                                       .buildUpdate())));
+        //this column created after the next delete
+        InetAddress peer3 = peer();
+        resolver.preprocess(readResponseMessage(peer3, iter(new RowUpdateBuilder(cfm, nowInSec, 3L, dk).clustering("1")
+                                                                                                       .add("two", "B")
+                                                                                                       .buildUpdate())));
+        InetAddress peer4 = peer();
+        resolver.preprocess(readResponseMessage(peer4, fullPartitionDelete(cfm, dk, 2, nowInSec)));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "two");
+                assertColumn(cfm, row, "two", "B", 3);
+            }
+            assertRepairFuture(resolver, 4);
+        }
+
+        // peer 1 needs to get the partition delete from peer 4 and the row from peer 3
+        assertEquals(4, messageRecorder.sent.size());
+        MessageOut msg = getSentMessage(peer1);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, new DeletionTime(2, nowInSec));
+        assertRepairContainsColumn(msg, "1", "two", "B", 3);
+
+        // peer 2 needs the deletion from peer 4 and the row from peer 3
+        msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, new DeletionTime(2, nowInSec));
+        assertRepairContainsColumn(msg, "1", "two", "B", 3);
+
+        // peer 3 needs just the deletion from peer 4
+        msg = getSentMessage(peer3);
+        assertRepairMetadata(msg);
+        assertRepairContainsDeletions(msg, new DeletionTime(2, nowInSec));
+        assertRepairContainsNoColumns(msg);
+
+        // peer 4 needs just the row from peer 3
+        msg = getSentMessage(peer4);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoDeletions(msg);
+        assertRepairContainsColumn(msg, "1", "two", "B", 3);
+    }
+
+    @Test
+    public void testResolveRangeTombstonesOnBoundaryRightWins() throws UnknownHostException
+    {
+        resolveRangeTombstonesOnBoundary(1, 2);
+    }
+
+    @Test
+    public void testResolveRangeTombstonesOnBoundaryLeftWins() throws UnknownHostException
+    {
+        resolveRangeTombstonesOnBoundary(2, 1);
+    }
+
+    @Test
+    public void testResolveRangeTombstonesOnBoundarySameTimestamp() throws UnknownHostException
+    {
+        resolveRangeTombstonesOnBoundary(1, 1);
+    }
+
+    /*
+     * We want responses to merge on tombstone boundary. So we'll merge 2 "streams":
+     *   1: [1, 2)(3, 4](5, 6]  2
+     *   2:    [2, 3][4, 5)     1
+     * which tests all combination of open/close boundaries (open/close, close/open, open/open, close/close).
+     *
+     * Note that, because DataResolver returns a "filtered" iterator, it should resolve into an empty iterator.
+     * However, what should be sent to each source depends on the exact on the timestamps of each tombstones and we
+     * test a few combination.
+     */
+    private void resolveRangeTombstonesOnBoundary(long timestamp1, long timestamp2)
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        InetAddress peer2 = peer();
+
+        // 1st "stream"
+        RangeTombstone one_two    = tombstone("1", true , "2", false, timestamp1, nowInSec);
+        RangeTombstone three_four = tombstone("3", false, "4", true , timestamp1, nowInSec);
+        RangeTombstone five_six   = tombstone("5", false, "6", true , timestamp1, nowInSec);
+        UnfilteredPartitionIterator iter1 = iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).addRangeTombstone(one_two)
+                                                                                            .addRangeTombstone(three_four)
+                                                                                            .addRangeTombstone(five_six)
+                                                                                            .buildUpdate());
+
+        // 2nd "stream"
+        RangeTombstone two_three = tombstone("2", true, "3", true , timestamp2, nowInSec);
+        RangeTombstone four_five = tombstone("4", true, "5", false, timestamp2, nowInSec);
+        UnfilteredPartitionIterator iter2 = iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).addRangeTombstone(two_three)
+                                                                                            .addRangeTombstone(four_five)
+                                                                                            .buildUpdate());
+
+        resolver.preprocess(readResponseMessage(peer1, iter1));
+        resolver.preprocess(readResponseMessage(peer2, iter2));
+
+        // No results, we've only reconciled tombstones.
+        try (PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            assertRepairFuture(resolver, 2);
+        }
+
+        assertEquals(2, messageRecorder.sent.size());
+
+        MessageOut msg1 = getSentMessage(peer1);
+        assertRepairMetadata(msg1);
+        assertRepairContainsNoColumns(msg1);
+
+        MessageOut msg2 = getSentMessage(peer2);
+        assertRepairMetadata(msg2);
+        assertRepairContainsNoColumns(msg2);
+
+        // Both streams are mostly complementary, so they will roughly get the ranges of the other stream. One subtlety is
+        // around the value "4" however, as it's included by both stream.
+        // So for a given stream, unless the other stream has a strictly higher timestamp, the value 4 will be excluded
+        // from whatever range it receives as repair since the stream already covers it.
+
+        // Message to peer1 contains peer2 ranges
+        assertRepairContainsDeletions(msg1, null, two_three, withExclusiveStartIf(four_five, timestamp1 >= timestamp2));
+
+        // Message to peer2 contains peer1 ranges
+        assertRepairContainsDeletions(msg2, null, one_two, withExclusiveEndIf(three_four, timestamp2 >= timestamp1), five_six);
+    }
+
+    /**
+     * Test cases where a boundary of a source is covered by another source deletion and timestamp on one or both side
+     * of the boundary are equal to the "merged" deletion.
+     * This is a test for CASSANDRA-13237 to make sure we handle this case properly.
+     */
+    @Test
+    public void testRepairRangeTombstoneBoundary() throws UnknownHostException
+    {
+        testRepairRangeTombstoneBoundary(1, 0, 1);
+        messageRecorder.sent.clear();
+        testRepairRangeTombstoneBoundary(1, 1, 0);
+        messageRecorder.sent.clear();
+        testRepairRangeTombstoneBoundary(1, 1, 1);
+    }
+
+    /**
+     * Test for CASSANDRA-13237, checking we don't fail (and handle correctly) the case where a RT boundary has the
+     * same deletion on both side (while is useless but could be created by legacy code pre-CASSANDRA-13237 and could
+     * thus still be sent).
+     */
+    private void testRepairRangeTombstoneBoundary(int timestamp1, int timestamp2, int timestamp3) throws UnknownHostException
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        InetAddress peer2 = peer();
+
+        // 1st "stream"
+        RangeTombstone one_nine = tombstone("0", true , "9", true, timestamp1, nowInSec);
+        UnfilteredPartitionIterator iter1 = iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk)
+                                                 .addRangeTombstone(one_nine)
+                                                 .buildUpdate());
+
+        // 2nd "stream" (build more manually to ensure we have the boundary we want)
+        RangeTombstoneBoundMarker open_one = marker("0", true, true, timestamp2, nowInSec);
+        RangeTombstoneBoundaryMarker boundary_five = boundary("5", false, timestamp2, nowInSec, timestamp3, nowInSec);
+        RangeTombstoneBoundMarker close_nine = marker("9", false, true, timestamp3, nowInSec);
+        UnfilteredPartitionIterator iter2 = iter(dk, open_one, boundary_five, close_nine);
+
+        resolver.preprocess(readResponseMessage(peer1, iter1));
+        resolver.preprocess(readResponseMessage(peer2, iter2));
+
+        boolean shouldHaveRepair = timestamp1 != timestamp2 || timestamp1 != timestamp3;
+
+        // No results, we've only reconciled tombstones.
+        try (PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            assertRepairFuture(resolver, shouldHaveRepair ? 1 : 0);
+        }
+
+        assertEquals(shouldHaveRepair? 1 : 0, messageRecorder.sent.size());
+
+        if (!shouldHaveRepair)
+            return;
+
+        MessageOut msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoColumns(msg);
+
+        RangeTombstone expected = timestamp1 != timestamp2
+                                  // We've repaired the 1st part
+                                  ? tombstone("0", true, "5", false, timestamp1, nowInSec)
+                                  // We've repaired the 2nd part
+                                  : tombstone("5", true, "9", true, timestamp1, nowInSec);
+        assertRepairContainsDeletions(msg, null, expected);
+    }
+
+    /**
+     * Test for CASSANDRA-13719: tests that having a partition deletion shadow a range tombstone on another source
+     * doesn't trigger an assertion error.
+     */
+    @Test
+    public void testRepairRangeTombstoneWithPartitionDeletion()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        InetAddress peer2 = peer();
+
+        // 1st "stream": just a partition deletion
+        UnfilteredPartitionIterator iter1 = iter(PartitionUpdate.fullPartitionDelete(cfm, dk, 10, nowInSec));
+
+        // 2nd "stream": a range tombstone that is covered by the 1st stream
+        RangeTombstone rt = tombstone("0", true , "10", true, 5, nowInSec);
+        UnfilteredPartitionIterator iter2 = iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk)
+                                                 .addRangeTombstone(rt)
+                                                 .buildUpdate());
+
+        resolver.preprocess(readResponseMessage(peer1, iter1));
+        resolver.preprocess(readResponseMessage(peer2, iter2));
+
+        // No results, we've only reconciled tombstones.
+        try (PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            // 2nd stream should get repaired
+            assertRepairFuture(resolver, 1);
+        }
+
+        assertEquals(1, messageRecorder.sent.size());
+
+        MessageOut msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoColumns(msg);
+
+        assertRepairContainsDeletions(msg, new DeletionTime(10, nowInSec));
+    }
+
+    /**
+     * Additional test for CASSANDRA-13719: tests the case where a partition deletion doesn't shadow a range tombstone.
+     */
+    @Test
+    public void testRepairRangeTombstoneWithPartitionDeletion2()
+    {
+        DataResolver resolver = new DataResolver(ks, command, ConsistencyLevel.ALL, 2);
+        InetAddress peer1 = peer();
+        InetAddress peer2 = peer();
+
+        // 1st "stream": a partition deletion and a range tombstone
+        RangeTombstone rt1 = tombstone("0", true , "9", true, 11, nowInSec);
+        PartitionUpdate upd1 = new RowUpdateBuilder(cfm, nowInSec, 1L, dk)
+                                                 .addRangeTombstone(rt1)
+                                                 .buildUpdate();
+        ((MutableDeletionInfo)upd1.deletionInfo()).add(new DeletionTime(10, nowInSec));
+        UnfilteredPartitionIterator iter1 = iter(upd1);
+
+        // 2nd "stream": a range tombstone that is covered by the other stream rt
+        RangeTombstone rt2 = tombstone("2", true , "3", true, 11, nowInSec);
+        RangeTombstone rt3 = tombstone("4", true , "5", true, 10, nowInSec);
+        UnfilteredPartitionIterator iter2 = iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk)
+                                                 .addRangeTombstone(rt2)
+                                                 .addRangeTombstone(rt3)
+                                                 .buildUpdate());
+
+        resolver.preprocess(readResponseMessage(peer1, iter1));
+        resolver.preprocess(readResponseMessage(peer2, iter2));
+
+        // No results, we've only reconciled tombstones.
+        try (PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            // 2nd stream should get repaired
+            assertRepairFuture(resolver, 1);
+        }
+
+        assertEquals(1, messageRecorder.sent.size());
+
+        MessageOut msg = getSentMessage(peer2);
+        assertRepairMetadata(msg);
+        assertRepairContainsNoColumns(msg);
+
+        // 2nd stream should get both the partition deletion, as well as the part of the 1st stream RT that it misses
+        assertRepairContainsDeletions(msg, new DeletionTime(10, nowInSec),
+                                      tombstone("0", true, "2", false, 11, nowInSec),
+                                      tombstone("3", false, "9", true, 11, nowInSec));
+    }
+
+    // Forces the start to be exclusive if the condition holds
+    private static RangeTombstone withExclusiveStartIf(RangeTombstone rt, boolean condition)
+    {
+        Slice slice = rt.deletedSlice();
+        return condition
+             ? new RangeTombstone(Slice.make(slice.start().withNewKind(Kind.EXCL_START_BOUND), slice.end()), rt.deletionTime())
+             : rt;
+    }
+
+    // Forces the end to be exclusive if the condition holds
+    private static RangeTombstone withExclusiveEndIf(RangeTombstone rt, boolean condition)
+    {
+        Slice slice = rt.deletedSlice();
+        return condition
+             ? new RangeTombstone(Slice.make(slice.start(), slice.end().withNewKind(Kind.EXCL_END_BOUND)), rt.deletionTime())
+             : rt;
+    }
+
+    private static ByteBuffer bb(int b)
+    {
+        return ByteBufferUtil.bytes(b);
+    }
+
+    private Cell mapCell(int k, int v, long ts)
+    {
+        return BufferCell.live(cfm2, m, ts, bb(v), CellPath.create(bb(k)));
+    }
+
+    @Test
+    public void testResolveComplexDelete()
+    {
+        ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build();
+        DataResolver resolver = new DataResolver(ks, cmd, ConsistencyLevel.ALL, 2);
+
+        long[] ts = {100, 200};
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSec);
+        builder.newRow(Clustering.EMPTY);
+        builder.addComplexDeletion(m, new DeletionTime(ts[0] - 1, nowInSec));
+        builder.addCell(mapCell(0, 0, ts[0]));
+
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        builder.newRow(Clustering.EMPTY);
+        DeletionTime expectedCmplxDelete = new DeletionTime(ts[1] - 1, nowInSec);
+        builder.addComplexDeletion(m, expectedCmplxDelete);
+        Cell expectedCell = mapCell(1, 1, ts[1]);
+        builder.addCell(expectedCell);
+
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "m");
+                Assert.assertNull(row.getCell(m, CellPath.create(bb(0))));
+                Assert.assertNotNull(row.getCell(m, CellPath.create(bb(1))));
+            }
+            assertRepairFuture(resolver, 1);
+        }
+
+        MessageOut<Mutation> msg;
+        msg = getSentMessage(peer1);
+        Iterator<Row> rowIter = msg.payload.getPartitionUpdate(cfm2.cfId).iterator();
+        assertTrue(rowIter.hasNext());
+        Row row = rowIter.next();
+        assertFalse(rowIter.hasNext());
+
+        ComplexColumnData cd = row.getComplexColumnData(m);
+
+        assertEquals(Collections.singleton(expectedCell), Sets.newHashSet(cd));
+        assertEquals(expectedCmplxDelete, cd.complexDeletion());
+
+        Assert.assertNull(messageRecorder.sent.get(peer2));
+    }
+
+    @Test
+    public void testResolveDeletedCollection()
+    {
+
+        ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build();
+        DataResolver resolver = new DataResolver(ks, cmd, ConsistencyLevel.ALL, 2);
+
+        long[] ts = {100, 200};
+
+        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSec);
+        builder.newRow(Clustering.EMPTY);
+        builder.addComplexDeletion(m, new DeletionTime(ts[0] - 1, nowInSec));
+        builder.addCell(mapCell(0, 0, ts[0]));
+
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        builder.newRow(Clustering.EMPTY);
+        DeletionTime expectedCmplxDelete = new DeletionTime(ts[1] - 1, nowInSec);
+        builder.addComplexDeletion(m, expectedCmplxDelete);
+
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            assertFalse(data.hasNext());
+            assertRepairFuture(resolver, 1);
+        }
+
+        MessageOut<Mutation> msg;
+        msg = getSentMessage(peer1);
+        Iterator<Row> rowIter = msg.payload.getPartitionUpdate(cfm2.cfId).iterator();
+        assertTrue(rowIter.hasNext());
+        Row row = rowIter.next();
+        assertFalse(rowIter.hasNext());
+
+        ComplexColumnData cd = row.getComplexColumnData(m);
+
+        assertEquals(Collections.emptySet(), Sets.newHashSet(cd));
+        assertEquals(expectedCmplxDelete, cd.complexDeletion());
+
+        Assert.assertNull(messageRecorder.sent.get(peer2));
+    }
+
+    @Test
+    public void testResolveNewCollection()
+    {
+        ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build();
+        DataResolver resolver = new DataResolver(ks, cmd, ConsistencyLevel.ALL, 2);
+
+        long[] ts = {100, 200};
+
+        // map column
+        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSec);
+        builder.newRow(Clustering.EMPTY);
+        DeletionTime expectedCmplxDelete = new DeletionTime(ts[0] - 1, nowInSec);
+        builder.addComplexDeletion(m, expectedCmplxDelete);
+        Cell expectedCell = mapCell(0, 0, ts[0]);
+        builder.addCell(expectedCell);
+
+        // empty map column
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(PartitionUpdate.emptyUpdate(cfm2, dk))));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "m");
+                ComplexColumnData cd = row.getComplexColumnData(m);
+                assertEquals(Collections.singleton(expectedCell), Sets.newHashSet(cd));
+            }
+            assertRepairFuture(resolver, 1);
+        }
+
+        Assert.assertNull(messageRecorder.sent.get(peer1));
+
+        MessageOut<Mutation> msg;
+        msg = getSentMessage(peer2);
+        Iterator<Row> rowIter = msg.payload.getPartitionUpdate(cfm2.cfId).iterator();
+        assertTrue(rowIter.hasNext());
+        Row row = rowIter.next();
+        assertFalse(rowIter.hasNext());
+
+        ComplexColumnData cd = row.getComplexColumnData(m);
+
+        assertEquals(Sets.newHashSet(expectedCell), Sets.newHashSet(cd));
+        assertEquals(expectedCmplxDelete, cd.complexDeletion());
+    }
+
+    @Test
+    public void testResolveNewCollectionOverwritingDeleted()
+    {
+        ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build();
+        DataResolver resolver = new DataResolver(ks, cmd, ConsistencyLevel.ALL, 2);
+
+        long[] ts = {100, 200};
+
+        // cleared map column
+        Row.Builder builder = BTreeRow.unsortedBuilder(nowInSec);
+        builder.newRow(Clustering.EMPTY);
+        builder.addComplexDeletion(m, new DeletionTime(ts[0] - 1, nowInSec));
+
+        InetAddress peer1 = peer();
+        resolver.preprocess(readResponseMessage(peer1, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        // newer, overwritten map column
+        builder.newRow(Clustering.EMPTY);
+        DeletionTime expectedCmplxDelete = new DeletionTime(ts[1] - 1, nowInSec);
+        builder.addComplexDeletion(m, expectedCmplxDelete);
+        Cell expectedCell = mapCell(1, 1, ts[1]);
+        builder.addCell(expectedCell);
+
+        InetAddress peer2 = peer();
+        resolver.preprocess(readResponseMessage(peer2, iter(PartitionUpdate.singleRowUpdate(cfm2, dk, builder.build())), cmd));
+
+        try(PartitionIterator data = resolver.resolve())
+        {
+            try (RowIterator rows = Iterators.getOnlyElement(data))
+            {
+                Row row = Iterators.getOnlyElement(rows);
+                assertColumns(row, "m");
+                ComplexColumnData cd = row.getComplexColumnData(m);
+                assertEquals(Collections.singleton(expectedCell), Sets.newHashSet(cd));
+            }
+            assertRepairFuture(resolver, 1);
+        }
+
+        MessageOut<Mutation> msg;
+        msg = getSentMessage(peer1);
+        Row row = Iterators.getOnlyElement(msg.payload.getPartitionUpdate(cfm2.cfId).iterator());
+
+        ComplexColumnData cd = row.getComplexColumnData(m);
+
+        assertEquals(Collections.singleton(expectedCell), Sets.newHashSet(cd));
+        assertEquals(expectedCmplxDelete, cd.complexDeletion());
+
+        Assert.assertNull(messageRecorder.sent.get(peer2));
+    }
+
+    private InetAddress peer()
+    {
+        try
+        {
+            return InetAddress.getByAddress(new byte[]{ 127, 0, 0, (byte) addressSuffix++ });
+        }
+        catch (UnknownHostException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private MessageOut<Mutation> getSentMessage(InetAddress target)
+    {
+        MessageOut<Mutation> message = messageRecorder.sent.get(target);
+        assertNotNull(String.format("No repair message was sent to %s", target), message);
+        return message;
+    }
+
+    private void assertRepairContainsDeletions(MessageOut<Mutation> message,
+                                               DeletionTime deletionTime,
+                                               RangeTombstone...rangeTombstones)
+    {
+        PartitionUpdate update = ((Mutation)message.payload).getPartitionUpdates().iterator().next();
+        DeletionInfo deletionInfo = update.deletionInfo();
+        if (deletionTime != null)
+            assertEquals(deletionTime, deletionInfo.getPartitionDeletion());
+
+        assertEquals(rangeTombstones.length, deletionInfo.rangeCount());
+        Iterator<RangeTombstone> ranges = deletionInfo.rangeIterator(false);
+        int i = 0;
+        while (ranges.hasNext())
+        {
+            RangeTombstone expected = rangeTombstones[i++];
+            RangeTombstone actual = ranges.next();
+            String msg = String.format("Expected %s, but got %s", expected.toString(cfm.comparator), actual.toString(cfm.comparator));
+            assertEquals(msg, expected, actual);
+        }
+    }
+
+    private void assertRepairContainsNoDeletions(MessageOut<Mutation> message)
+    {
+        PartitionUpdate update = ((Mutation)message.payload).getPartitionUpdates().iterator().next();
+        assertTrue(update.deletionInfo().isLive());
+    }
+
+    private void assertRepairContainsColumn(MessageOut<Mutation> message,
+                                            String clustering,
+                                            String columnName,
+                                            String value,
+                                            long timestamp)
+    {
+        PartitionUpdate update = ((Mutation)message.payload).getPartitionUpdates().iterator().next();
+        Row row = update.getRow(update.metadata().comparator.make(clustering));
+        assertNotNull(row);
+        assertColumn(cfm, row, columnName, value, timestamp);
+    }
+
+    private void assertRepairContainsNoColumns(MessageOut<Mutation> message)
+    {
+        PartitionUpdate update = ((Mutation)message.payload).getPartitionUpdates().iterator().next();
+        assertFalse(update.iterator().hasNext());
+    }
+
+    private void assertRepairMetadata(MessageOut<Mutation> message)
+    {
+        assertEquals(MessagingService.Verb.READ_REPAIR, message.verb);
+        PartitionUpdate update = ((Mutation)message.payload).getPartitionUpdates().iterator().next();
+        assertEquals(update.metadata().ksName, cfm.ksName);
+        assertEquals(update.metadata().cfName, cfm.cfName);
+    }
+
+
+    public MessageIn<ReadResponse> readResponseMessage(InetAddress from, UnfilteredPartitionIterator partitionIterator)
+    {
+        return readResponseMessage(from, partitionIterator, command);
+
+    }
+    public MessageIn<ReadResponse> readResponseMessage(InetAddress from, UnfilteredPartitionIterator partitionIterator, ReadCommand cmd)
+    {
+        return MessageIn.create(from,
+                                ReadResponse.createRemoteDataResponse(partitionIterator, cmd),
+                                Collections.EMPTY_MAP,
+                                MessagingService.Verb.REQUEST_RESPONSE,
+                                MessagingService.current_version);
+    }
+
+    private RangeTombstone tombstone(Object start, Object end, long markedForDeleteAt, int localDeletionTime)
+    {
+        return tombstone(start, true, end, true, markedForDeleteAt, localDeletionTime);
+    }
+
+    private RangeTombstone tombstone(Object start, boolean inclusiveStart, Object end, boolean inclusiveEnd, long markedForDeleteAt, int localDeletionTime)
+    {
+        RangeTombstone.Bound startBound = rtBound(start, true, inclusiveStart);
+        RangeTombstone.Bound endBound = rtBound(end, false, inclusiveEnd);
+        return new RangeTombstone(Slice.make(startBound, endBound), new DeletionTime(markedForDeleteAt, localDeletionTime));
+    }
+
+    private RangeTombstone.Bound rtBound(Object value, boolean isStart, boolean inclusive)
+    {
+        RangeTombstone.Bound.Kind kind = isStart
+                                         ? (inclusive ? Kind.INCL_START_BOUND : Kind.EXCL_START_BOUND)
+                                         : (inclusive ? Kind.INCL_END_BOUND : Kind.EXCL_END_BOUND);
+
+        return new RangeTombstone.Bound(kind, cfm.comparator.make(value).getRawValues());
+    }
+
+    private RangeTombstone.Bound rtBoundary(Object value, boolean inclusiveOnEnd)
+    {
+        RangeTombstone.Bound.Kind kind = inclusiveOnEnd
+                                         ? Kind.INCL_END_EXCL_START_BOUNDARY
+                                         : Kind.EXCL_END_INCL_START_BOUNDARY;
+        return new RangeTombstone.Bound(kind, cfm.comparator.make(value).getRawValues());
+    }
+
+    private RangeTombstoneBoundMarker marker(Object value, boolean isStart, boolean inclusive, long markedForDeleteAt, int localDeletionTime)
+    {
+        return new RangeTombstoneBoundMarker(rtBound(value, isStart, inclusive), new DeletionTime(markedForDeleteAt, localDeletionTime));
+    }
+
+    private RangeTombstoneBoundaryMarker boundary(Object value, boolean inclusiveOnEnd, long markedForDeleteAt1, int localDeletionTime1, long markedForDeleteAt2, int localDeletionTime2)
+    {
+        return new RangeTombstoneBoundaryMarker(rtBoundary(value, inclusiveOnEnd),
+                                                new DeletionTime(markedForDeleteAt1, localDeletionTime1),
+                                                new DeletionTime(markedForDeleteAt2, localDeletionTime2));
+    }
+
+    private UnfilteredPartitionIterator fullPartitionDelete(CFMetaData cfm, DecoratedKey dk, long timestamp, int nowInSec)
+    {
+        return new SingletonUnfilteredPartitionIterator(PartitionUpdate.fullPartitionDelete(cfm, dk, timestamp, nowInSec).unfilteredIterator(), false);
+    }
+
+    private static class MessageRecorder implements IMessageSink
+    {
+        Map<InetAddress, MessageOut> sent = new HashMap<>();
+        public boolean allowOutgoingMessage(MessageOut message, int id, InetAddress to)
+        {
+            sent.put(to, message);
+            return false;
+        }
+
+        public boolean allowIncomingMessage(MessageIn message, int id)
+        {
+            return false;
+        }
+    }
+
+    private UnfilteredPartitionIterator iter(PartitionUpdate update)
+    {
+        return new SingletonUnfilteredPartitionIterator(update.unfilteredIterator(), false);
+    }
+
+    private UnfilteredPartitionIterator iter(DecoratedKey key, Unfiltered... unfiltereds)
+    {
+        SortedSet<Unfiltered> s = new TreeSet<>(cfm.comparator);
+        Collections.addAll(s, unfiltereds);
+        final Iterator<Unfiltered> iterator = s.iterator();
+
+        UnfilteredRowIterator rowIter = new AbstractUnfilteredRowIterator(cfm,
+                                                                          key,
+                                                                          DeletionTime.LIVE,
+                                                                          cfm.partitionColumns(),
+                                                                          Rows.EMPTY_STATIC_ROW,
+                                                                          false,
+                                                                          EncodingStats.NO_STATS)
+        {
+            protected Unfiltered computeNext()
+            {
+                return iterator.hasNext() ? iterator.next() : endOfData();
+            }
+        };
+        return new SingletonUnfilteredPartitionIterator(rowIter, false);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java b/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java
index ed0efee..b89f01d 100644
--- a/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java
+++ b/test/unit/org/apache/cassandra/service/EmbeddedCassandraServiceTest.java

@@ -26,9 +26,10 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.thrift.*;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.thrift.TException;
@@ -61,9 +62,12 @@
         SchemaLoader.prepareServer();
         setup();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
+                                    KeyspaceParams.simple(1),
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_STANDARD, true, false, false)
+                                                      .addPartitionKey("pk", AsciiType.instance)
+                                                      .addClusteringColumn("ck", AsciiType.instance)
+                                                      .addRegularColumn("val", AsciiType.instance)
+                                                      .build());
     }
 
     /**

diff --git a/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java b/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java
index 4a09b7a..91a7ab2 100644
--- a/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java
+++ b/test/unit/org/apache/cassandra/service/LeaveAndBootstrapTest.java

@@ -22,32 +22,42 @@
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
 
-import org.junit.Test;
-
-import static org.junit.Assert.*;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.Util.PartitionerSwitcher;
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.concurrent.StageManager;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.VersionedValue;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.SimpleSnitch;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.*;
 
 public class LeaveAndBootstrapTest
 {
     private static final IPartitioner partitioner = RandomPartitioner.instance;
-    private static IPartitioner oldPartitioner;
+    private static PartitionerSwitcher partitionerSwitcher;
     private static final String KEYSPACE1 = "LeaveAndBootstrapTestKeyspace1";
     private static final String KEYSPACE2 = "LeaveAndBootstrapTestKeyspace2";
     private static final String KEYSPACE3 = "LeaveAndBootstrapTestKeyspace3";
@@ -56,7 +66,7 @@
     @BeforeClass
     public static void defineSchema() throws Exception
     {
-        oldPartitioner = StorageService.instance.setPartitionerUnsafe(partitioner);
+        partitionerSwitcher = Util.switchPartitioner(partitioner);
         SchemaLoader.loadSchema();
         SchemaLoader.schemaDefinition("LeaveAndBootstrapTest");
     }
@@ -64,7 +74,7 @@
     @AfterClass
     public static void tearDown()
     {
-        StorageService.instance.setPartitionerUnsafe(oldPartitioner);
+        partitionerSwitcher.close();
     }
 
     /**
@@ -91,7 +101,7 @@
         Util.createInitialRing(ss, partitioner, endpointTokens, keyTokens, hosts, hostIds, RING_SIZE);
 
         Map<Token, List<InetAddress>> expectedEndpoints = new HashMap<Token, List<InetAddress>>();
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             for (Token token : keyTokens)
             {
@@ -115,7 +125,7 @@
         PendingRangeCalculatorService.instance.blockUntilFinished();
 
         AbstractReplicationStrategy strategy;
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             strategy = getStrategy(keyspaceName, tmd);
             for (Token token : keyTokens)
@@ -670,8 +680,9 @@
         Util.createInitialRing(ss, partitioner, endpointTokens, new ArrayList<Token>(), hosts, new ArrayList<UUID>(), 2);
 
         InetAddress toRemove = hosts.get(1);
-        SystemKeyspace.updatePeerInfo(toRemove, "data_center", "dc42");
-        SystemKeyspace.updatePeerInfo(toRemove, "rack", "rack42");
+        final ExecutorService executor = StageManager.getStage(Stage.MUTATION);
+        FBUtilities.waitOnFuture(SystemKeyspace.updatePeerInfo(toRemove, "data_center", "dc42", executor));
+        FBUtilities.waitOnFuture(SystemKeyspace.updatePeerInfo(toRemove, "rack", "rack42", executor));
         assertEquals("rack42", SystemKeyspace.loadDcRackInfo().get(toRemove).get("rack"));
 
         // mark the node as removed
@@ -691,7 +702,7 @@
         // create a ring of 1 node
         StorageService ss = StorageService.instance;
         VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(partitioner);
-        Util.createInitialRing(ss, partitioner, new ArrayList<Token>(), new ArrayList<Token>(),  new ArrayList<InetAddress>(), new ArrayList<UUID>(), 1);
+        Util.createInitialRing(ss, partitioner, new ArrayList<Token>(), new ArrayList<Token>(), new ArrayList<InetAddress>(), new ArrayList<UUID>(), 1);
 
         // make a REMOVING state change on a non-member endpoint; without the CASSANDRA-6564 fix, this
         // would result in an ArrayIndexOutOfBoundsException
@@ -708,13 +719,13 @@
 
     private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd)
     {
-        KSMetaData ksmd = Schema.instance.getKSMetaData(keyspaceName);
+        KeyspaceMetadata ksmd = Schema.instance.getKSMetaData(keyspaceName);
         return AbstractReplicationStrategy.createReplicationStrategy(
                 keyspaceName,
-                ksmd.strategyClass,
+                ksmd.params.replication.klass,
                 tmd,
                 new SimpleSnitch(),
-                ksmd.strategyOptions);
+                ksmd.params.replication.options);
     }
 
 }

diff --git a/test/unit/org/apache/cassandra/service/LegacyAuthFailTest.java b/test/unit/org/apache/cassandra/service/LegacyAuthFailTest.java
new file mode 100644
index 0000000..079543f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/LegacyAuthFailTest.java

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+import com.google.common.base.Joiner;
+import org.junit.Test;
+
+import org.apache.cassandra.auth.AuthKeyspace;
+import org.apache.cassandra.cql3.CQLTester;
+
+import static java.lang.String.format;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+public class LegacyAuthFailTest extends CQLTester
+{
+    @Test
+    public void testStartupChecks() throws Throwable
+    {
+        createKeyspace();
+
+        List<String> legacyTables = new ArrayList<>(StartupChecks.LEGACY_AUTH_TABLES);
+
+        // test reporting for individual tables
+        for (String legacyTable : legacyTables)
+        {
+            createLegacyTable(legacyTable);
+
+            Optional<String> errMsg = StartupChecks.checkLegacyAuthTablesMessage();
+            assertEquals(format("Legacy auth tables %s in keyspace %s still exist and have not been properly migrated.",
+                                legacyTable,
+                                AuthKeyspace.NAME), errMsg.get());
+            dropLegacyTable(legacyTable);
+        }
+
+        // test reporting of multiple existing tables
+        for (String legacyTable : legacyTables)
+            createLegacyTable(legacyTable);
+
+        while (!legacyTables.isEmpty())
+        {
+            Optional<String> errMsg = StartupChecks.checkLegacyAuthTablesMessage();
+            assertEquals(format("Legacy auth tables %s in keyspace %s still exist and have not been properly migrated.",
+                                Joiner.on(", ").join(legacyTables),
+                                AuthKeyspace.NAME), errMsg.get());
+
+            dropLegacyTable(legacyTables.remove(0));
+        }
+
+        // no legacy tables found
+        Optional<String> errMsg = StartupChecks.checkLegacyAuthTablesMessage();
+        assertFalse(errMsg.isPresent());
+    }
+
+    private void dropLegacyTable(String legacyTable) throws Throwable
+    {
+        execute(format("DROP TABLE %s.%s", AuthKeyspace.NAME, legacyTable));
+    }
+
+    private void createLegacyTable(String legacyTable) throws Throwable
+    {
+        execute(format("CREATE TABLE %s.%s (id int PRIMARY KEY, val text)", AuthKeyspace.NAME, legacyTable));
+    }
+
+    private void createKeyspace() throws Throwable
+    {
+        execute(format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}", AuthKeyspace.NAME));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/service/MigrationManagerTest.java b/test/unit/org/apache/cassandra/service/MigrationManagerTest.java
new file mode 100644
index 0000000..eefc640
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/MigrationManagerTest.java

@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service;
+
+import java.util.Optional;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.schema.Tables;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class MigrationManagerTest extends CQLTester
+{
+    @Test
+    public void testEvolveSystemKeyspaceNew()
+    {
+        CFMetaData table = CFMetaData.compile("CREATE TABLE t (id int PRIMARY KEY)", "ks0");
+        KeyspaceMetadata keyspace = KeyspaceMetadata.create("ks0", KeyspaceParams.simple(1), Tables.of(table));
+
+        Optional<Mutation> mutation = MigrationManager.evolveSystemKeyspace(keyspace, 0);
+        assertTrue(mutation.isPresent());
+
+        MigrationManager.announce(mutation.get(), true);
+        assertEquals(keyspace, Schema.instance.getKSMetaData("ks0"));
+    }
+
+    @Test
+    public void testEvolveSystemKeyspaceExistsUpToDate()
+    {
+        CFMetaData table = CFMetaData.compile("CREATE TABLE t (id int PRIMARY KEY)", "ks1");
+        KeyspaceMetadata keyspace = KeyspaceMetadata.create("ks1", KeyspaceParams.simple(1), Tables.of(table));
+
+        // create the keyspace, verify it's there
+        MigrationManager.announce(SchemaKeyspace.makeCreateKeyspaceMutation(keyspace, 0), true);
+        assertEquals(keyspace, Schema.instance.getKSMetaData("ks1"));
+
+        Optional<Mutation> mutation = MigrationManager.evolveSystemKeyspace(keyspace, 0);
+        assertFalse(mutation.isPresent());
+    }
+
+    @Test
+    public void testEvolveSystemKeyspaceChanged()
+    {
+        CFMetaData table0 = CFMetaData.compile("CREATE TABLE t (id int PRIMARY KEY)", "ks2");
+        KeyspaceMetadata keyspace0 = KeyspaceMetadata.create("ks2", KeyspaceParams.simple(1), Tables.of(table0));
+
+        // create the keyspace, verify it's there
+        MigrationManager.announce(SchemaKeyspace.makeCreateKeyspaceMutation(keyspace0, 0), true);
+        assertEquals(keyspace0, Schema.instance.getKSMetaData("ks2"));
+
+        CFMetaData table1 = table0.copy().comment("comment");
+        KeyspaceMetadata keyspace1 = KeyspaceMetadata.create("ks2", KeyspaceParams.simple(1), Tables.of(table1));
+
+        Optional<Mutation> mutation = MigrationManager.evolveSystemKeyspace(keyspace1, 1);
+        assertTrue(mutation.isPresent());
+
+        MigrationManager.announce(mutation.get(), true);
+        assertEquals(keyspace1, Schema.instance.getKSMetaData("ks2"));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/service/MoveTest.java b/test/unit/org/apache/cassandra/service/MoveTest.java
index bd4317d..bc6c6d2 100644
--- a/test/unit/org/apache/cassandra/service/MoveTest.java
+++ b/test/unit/org/apache/cassandra/service/MoveTest.java

@@ -25,32 +25,39 @@
 
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
-import static org.junit.Assert.*;
-
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.gms.Gossiper;
-import org.apache.cassandra.locator.AbstractNetworkTopologySnitch;
-import org.apache.cassandra.locator.NetworkTopologyStrategy;
-import org.apache.cassandra.locator.PendingRangeMaps;
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.config.Schema;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.gms.ApplicationState;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.locator.AbstractNetworkTopologySnitch;
+import org.apache.cassandra.locator.NetworkTopologyStrategy;
+import org.apache.cassandra.locator.PendingRangeMaps;
 import org.apache.cassandra.gms.VersionedValue;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.SimpleSnitch;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.Tables;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertNotNull;
 
 public class MoveTest
 {
@@ -132,21 +139,23 @@
             }
         });
 
-        Class<? extends AbstractReplicationStrategy> strategy = NetworkTopologyStrategy.class;
-        KSMetaData keyspace = KSMetaData.testMetadata(keyspaceName, strategy, configOptions(replicas),
-                CFMetaData.denseCFMetaData(keyspaceName, "CF1", BytesType.instance));
+        KeyspaceMetadata keyspace =  KeyspaceMetadata.create(keyspaceName,
+                                                             KeyspaceParams.nts(configOptions(replicas)),
+                                                             Tables.of(CFMetaData.Builder.create(keyspaceName, "CF1")
+                                                                                         .addPartitionKey("key", BytesType.instance).build()));
         MigrationManager.announceNewKeyspace(keyspace);
     }
 
-    private static Map<String, String> configOptions(Integer[] replicas)
+    private static Object[] configOptions(Integer[] replicas)
     {
-        Map<String, String> configOptions = new HashMap<>();
-        int i = 1;
+        Object[] configOptions = new Object[(replicas.length * 2)];
+        int i = 1, j=0;
         for(Integer replica : replicas)
         {
             if(replica == null)
                 continue;
-            configOptions.put("DC" + i++, String.valueOf(replica));
+            configOptions[j++] = "DC" + i++;
+            configOptions[j++] = replica;
         }
         return configOptions;
     }
@@ -470,7 +479,16 @@
     {
         tmd.removeFromMoving(host);
         assertTrue(!tmd.isMoving(host));
-        tmd.updateNormalToken(new BigIntegerToken(String.valueOf(token)), host);
+        Token newToken = new BigIntegerToken(String.valueOf(token));
+        tmd.updateNormalToken(newToken, host);
+        // As well as upating TMD, update the host's tokens in gossip. Since CASSANDRA-15120, status changing to MOVING
+        // ensures that TMD is up to date with token assignments according to gossip. So we need to make sure gossip has
+        // the correct new token, as the moving node itself would do upon successful completion of the move operation.
+        // Without this, the next movement for that host will set the token in TMD's back to the old value from gossip
+        // and incorrect range movements will follow
+        Gossiper.instance.injectApplicationState(host,
+                                                 ApplicationState.TOKENS,
+                                                 new VersionedValue.VersionedValueFactory(partitioner).tokens(Collections.singleton(newToken)));
     }
 
     private Map.Entry<Range<Token>, Collection<InetAddress>> generatePendingMapEntry(int start, int end, String... endpoints) throws UnknownHostException
@@ -493,7 +511,7 @@
     private void assertPendingRanges(TokenMetadata tmd, Map<Range<Token>,  Collection<InetAddress>> pendingRanges, String keyspaceName) throws ConfigurationException
     {
         boolean keyspaceFound = false;
-        for (String nonSystemKeyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String nonSystemKeyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             if(!keyspaceName.equals(nonSystemKeyspaceName))
                 continue;
@@ -562,7 +580,7 @@
         assertTrue(tmd.isMoving(hosts.get(MOVING_NODE)));
 
         AbstractReplicationStrategy strategy;
-        for (String keyspaceName : Schema.instance.getNonSystemKeyspaces())
+        for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
         {
             strategy = getStrategy(keyspaceName, tmd);
             if(strategy instanceof NetworkTopologyStrategy)
@@ -992,13 +1010,13 @@
 
     private AbstractReplicationStrategy getStrategy(String keyspaceName, TokenMetadata tmd)
     {
-        KSMetaData ksmd = Schema.instance.getKSMetaData(keyspaceName);
+        KeyspaceMetadata ksmd = Schema.instance.getKSMetaData(keyspaceName);
         return AbstractReplicationStrategy.createReplicationStrategy(
                 keyspaceName,
-                ksmd.strategyClass,
+                ksmd.params.replication.klass,
                 tmd,
                 new SimpleSnitch(),
-                ksmd.strategyOptions);
+                ksmd.params.replication.options);
     }
 
     private Token positionToken(int position)

diff --git a/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java b/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java
new file mode 100644
index 0000000..8f2689a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java

@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service;
+
+import java.util.Arrays;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import com.google.common.collect.Sets;
+import org.junit.After;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.transport.Server;
+import org.apache.cassandra.utils.Pair;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class NativeTransportServiceTest
+{
+
+    @After
+    public void resetConfig()
+    {
+        DatabaseDescriptor.getClientEncryptionOptions().enabled = false;
+        DatabaseDescriptor.setNativeTransportPortSSL(null);
+    }
+
+    @Test
+    public void testServiceCanBeStopped()
+    {
+        withService((NativeTransportService service) -> {
+            service.stop();
+            assertFalse(service.isRunning());
+        });
+    }
+
+    @Test
+    public void testIgnoresStartOnAlreadyStarted()
+    {
+        withService((NativeTransportService service) -> {
+            service.start();
+            service.start();
+            service.start();
+        });
+    }
+
+    @Test
+    public void testIgnoresStoppedOnAlreadyStopped()
+    {
+        withService((NativeTransportService service) -> {
+            service.stop();
+            service.stop();
+            service.stop();
+        });
+    }
+
+    @Test
+    public void testDestroy()
+    {
+        withService((NativeTransportService service) -> {
+            Supplier<Boolean> allTerminated = () ->
+                                              service.getWorkerGroup().isShutdown() && service.getWorkerGroup().isTerminated();
+            assertFalse(allTerminated.get());
+            service.destroy();
+            assertTrue(allTerminated.get());
+        });
+    }
+
+    @Test
+    public void testConcurrentStarts()
+    {
+        withService(NativeTransportService::start, false, 20);
+    }
+
+    @Test
+    public void testConcurrentStops()
+    {
+        withService(NativeTransportService::stop, true, 20);
+    }
+
+    @Test
+    public void testConcurrentDestroys()
+    {
+        withService(NativeTransportService::destroy, true, 20);
+    }
+
+    @Test
+    public void testPlainDefaultPort()
+    {
+        // default plain settings: client encryption disabled and default native transport port 
+        withService((NativeTransportService service) ->
+                    {
+                        assertEquals(1, service.getServers().size());
+                        Server server = service.getServers().iterator().next();
+                        assertFalse(server.useSSL);
+                        assertEquals(server.socket.getPort(), DatabaseDescriptor.getNativeTransportPort());
+                    });
+    }
+
+    @Test
+    public void testSSLOnly()
+    {
+        // default ssl settings: client encryption enabled and default native transport port used for ssl only
+        DatabaseDescriptor.getClientEncryptionOptions().enabled = true;
+        DatabaseDescriptor.getClientEncryptionOptions().optional = false;
+
+        withService((NativeTransportService service) ->
+                    {
+                        service.initialize();
+                        assertEquals(1, service.getServers().size());
+                        Server server = service.getServers().iterator().next();
+                        assertTrue(server.useSSL);
+                        assertEquals(server.socket.getPort(), DatabaseDescriptor.getNativeTransportPort());
+                    }, false, 1);
+    }
+
+    @Test
+    public void testSSLOptional()
+    {
+        // default ssl settings: client encryption enabled and default native transport port used for optional ssl
+        DatabaseDescriptor.getClientEncryptionOptions().enabled = true;
+        DatabaseDescriptor.getClientEncryptionOptions().optional = true;
+
+        withService((NativeTransportService service) ->
+                    {
+                        service.initialize();
+                        assertEquals(1, service.getServers().size());
+                        Server server = service.getServers().iterator().next();
+                        assertTrue(server.useSSL);
+                        assertEquals(server.socket.getPort(), DatabaseDescriptor.getNativeTransportPort());
+                    }, false, 1);
+    }
+
+    @Test
+    public void testSSLWithNonSSL()
+    {
+        // ssl+non-ssl settings: client encryption enabled and additional ssl port specified
+        DatabaseDescriptor.getClientEncryptionOptions().enabled = true;
+        DatabaseDescriptor.setNativeTransportPortSSL(8432);
+
+        withService((NativeTransportService service) ->
+                    {
+                        service.initialize();
+                        assertEquals(2, service.getServers().size());
+                        assertEquals(
+                                    Sets.newHashSet(Arrays.asList(
+                                                                 Pair.create(true, DatabaseDescriptor.getNativeTransportPortSSL()),
+                                                                 Pair.create(false, DatabaseDescriptor.getNativeTransportPort())
+                                                    )
+                                    ),
+                                    service.getServers().stream().map((Server s) ->
+                                                                      Pair.create(s.useSSL, s.socket.getPort())).collect(Collectors.toSet())
+                        );
+                    }, false, 1);
+    }
+
+    private static void withService(Consumer<NativeTransportService> f)
+    {
+        withService(f, true, 1);
+    }
+
+    private static void withService(Consumer<NativeTransportService> f, boolean start, int concurrently)
+    {
+        NativeTransportService service = new NativeTransportService();
+        assertFalse(service.isRunning());
+        if (start)
+        {
+            service.start();
+            assertTrue(service.isRunning());
+        }
+        try
+        {
+            if (concurrently == 1)
+            {
+                f.accept(service);
+            }
+            else
+            {
+                IntStream.range(0, concurrently).parallel().map((int i) -> {
+                    f.accept(service);
+                    return 1;
+                }).sum();
+            }
+        }
+        finally
+        {
+            service.stop();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/service/PaxosStateTest.java b/test/unit/org/apache/cassandra/service/PaxosStateTest.java
index 7f4bc49..9ee91dd 100644
--- a/test/unit/org/apache/cassandra/service/PaxosStateTest.java
+++ b/test/unit/org/apache/cassandra/service/PaxosStateTest.java

@@ -19,6 +19,7 @@
 
 import java.nio.ByteBuffer;
 
+import com.google.common.collect.Iterables;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -26,8 +27,9 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.filter.QueryFilter;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.service.paxos.Commit;
 import org.apache.cassandra.service.paxos.PaxosState;
@@ -35,9 +37,7 @@
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
+import static org.junit.Assert.*;
 
 public class PaxosStateTest
 {
@@ -58,48 +58,47 @@
     public void testCommittingAfterTruncation() throws Exception
     {
         ColumnFamilyStore cfs = Keyspace.open("PaxosStateTestKeyspace1").getColumnFamilyStore("Standard1");
-        DecoratedKey key = Util.dk("key" + System.nanoTime());
-        CellName name = Util.cellname("col");
+        String key = "key" + System.nanoTime();
         ByteBuffer value = ByteBufferUtil.bytes(0);
-        ColumnFamily update = ArrayBackedSortedColumns.factory.create(cfs.metadata);
-        update.addColumn(name, value, FBUtilities.timestampMicros());
+        RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros(), key);
+        builder.clustering("a").add("val", value);
+        PartitionUpdate update = Iterables.getOnlyElement(builder.build().getPartitionUpdates());
 
         // CFS should be empty initially
-        assertNoDataPresent(cfs, key);
+        assertNoDataPresent(cfs, Util.dk(key));
 
         // Commit the proposal & verify the data is present
-        Commit beforeTruncate = newProposal(0, key.getKey(), update);
+        Commit beforeTruncate = newProposal(0, update);
         PaxosState.commit(beforeTruncate);
-        assertDataPresent(cfs, key, name, value);
+        assertDataPresent(cfs, Util.dk(key), "val", value);
 
         // Truncate then attempt to commit again, mutation should
         // be ignored as the proposal predates the truncation
         cfs.truncateBlocking();
         PaxosState.commit(beforeTruncate);
-        assertNoDataPresent(cfs, key);
+        assertNoDataPresent(cfs, Util.dk(key));
 
         // Now try again with a ballot created after the truncation
         long timestamp = SystemKeyspace.getTruncatedAt(update.metadata().cfId) + 1;
-        Commit afterTruncate = newProposal(timestamp, key.getKey(), update);
+        Commit afterTruncate = newProposal(timestamp, update);
         PaxosState.commit(afterTruncate);
-        assertDataPresent(cfs, key, name, value);
+        assertDataPresent(cfs, Util.dk(key), "val", value);
     }
 
-    private Commit newProposal(long ballotMillis, ByteBuffer key, ColumnFamily update)
+    private Commit newProposal(long ballotMillis, PartitionUpdate update)
     {
-        return Commit.newProposal(key, UUIDGen.getTimeUUID(ballotMillis), update);
+        return Commit.newProposal(UUIDGen.getTimeUUID(ballotMillis), update);
     }
 
-    private void assertDataPresent(ColumnFamilyStore cfs, DecoratedKey key, CellName name, ByteBuffer value)
+    private void assertDataPresent(ColumnFamilyStore cfs, DecoratedKey key, String name, ByteBuffer value)
     {
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfs.name, System.currentTimeMillis()));
-        assertFalse(cf.isEmpty());
-        assertEquals(0, ByteBufferUtil.compareUnsigned(value, cf.getColumn(name).value()));
+        Row row = Util.getOnlyRowUnfiltered(Util.cmd(cfs, key).build());
+        assertEquals(0, ByteBufferUtil.compareUnsigned(value,
+                row.getCell(cfs.metadata.getColumnDefinition(ByteBufferUtil.bytes(name))).value()));
     }
 
     private void assertNoDataPresent(ColumnFamilyStore cfs, DecoratedKey key)
     {
-        ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(key, cfs.name, System.currentTimeMillis()));
-        assertNull(cf);
+        Util.assertEmpty(Util.cmd(cfs, key).build());
     }
 }

diff --git a/test/unit/org/apache/cassandra/service/QueryPagerTest.java b/test/unit/org/apache/cassandra/service/QueryPagerTest.java
index 33a7585..27c630d 100644
--- a/test/unit/org/apache/cassandra/service/QueryPagerTest.java
+++ b/test/unit/org/apache/cassandra/service/QueryPagerTest.java

@@ -18,33 +18,36 @@
 */
 package org.apache.cassandra.service;
 
-import java.util.*;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
+import java.util.*;
 
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
-import org.apache.cassandra.Util;
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.*;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.*;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
 import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.service.pager.*;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.transport.Server;
 
-import static org.junit.Assert.*;
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
-import static org.apache.cassandra.Util.range;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.*;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class QueryPagerTest
@@ -54,18 +57,17 @@
     public static final String KEYSPACE_CQL = "cql_keyspace";
     public static final String CF_CQL = "table2";
     public static final String CF_CQL_WITH_STATIC = "with_static";
+    public static final int nowInSec = FBUtilities.nowInSeconds();
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
         SchemaLoader.createKeyspace(KEYSPACE_CQL,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     CFMetaData.compile("CREATE TABLE " + CF_CQL + " ("
                                                      + "k text,"
                                                      + "c text,"
@@ -81,11 +83,6 @@
         addData();
     }
 
-    private static String string(CellName name)
-    {
-        return string(name.toByteBuffer());
-    }
-
     private static String string(ByteBuffer bb)
     {
         try
@@ -105,21 +102,19 @@
         int nbKeys = 10;
         int nbCols = 10;
 
-        /*
-         * Creates the following data:
-         *   k1: c1 ... cn
-         *   ...
-         *   ki: c1 ... cn
-         */
+        // *
+        // * Creates the following data:
+        // *   k1: c1 ... cn
+        // *   ...
+        // *   ki: c1 ... cn
+        // *
         for (int i = 0; i < nbKeys; i++)
         {
-            Mutation rm = new Mutation(KEYSPACE1, bytes("k" + i));
-            ColumnFamily cf = rm.addOrGet(CF_STANDARD);
-
             for (int j = 0; j < nbCols; j++)
-                cf.addColumn(Util.column("c" + j, "", 0));
-
-            rm.applyUnsafe();
+            {
+                RowUpdateBuilder builder = new RowUpdateBuilder(cfs().metadata, FBUtilities.timestampMicros(), "k" + i);
+                builder.clustering("c" + j).add("val", "").build().applyUnsafe();
+            }
         }
     }
 
@@ -128,60 +123,81 @@
         return Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD);
     }
 
-    private static String toString(List<Row> rows)
+    private static List<FilteredPartition> query(QueryPager pager, int expectedSize)
     {
-        StringBuilder sb = new StringBuilder();
-        for (Row row : rows)
-            sb.append(string(row.key.getKey())).append(":").append(toString(row.cf)).append("\n");
-        return sb.toString();
+        return query(pager, expectedSize, expectedSize);
     }
 
-    private static String toString(ColumnFamily cf)
+    private static List<FilteredPartition> query(QueryPager pager, int toQuery, int expectedSize)
     {
-        if (cf == null)
-            return "";
-
         StringBuilder sb = new StringBuilder();
-        for (Cell c : cf)
-            sb.append(" ").append(string(c.name()));
-        return sb.toString();
+        List<FilteredPartition> partitionList = new ArrayList<>();
+        int rows = 0;
+        try (ReadOrderGroup orderGroup = pager.startOrderGroup(); PartitionIterator iterator = pager.fetchPageInternal(toQuery, orderGroup))
+        {
+            while (iterator.hasNext())
+            {
+                try (RowIterator rowIter = iterator.next())
+                {
+                    FilteredPartition partition = FilteredPartition.create(rowIter);
+                    sb.append(partition);
+                    partitionList.add(partition);
+                    rows += partition.rowCount();
+                }
+            }
+        }
+        assertEquals(sb.toString(), expectedSize, rows);
+        return partitionList;
     }
 
     private static ReadCommand namesQuery(String key, String... names)
     {
-        SortedSet<CellName> s = new TreeSet<CellName>(cfs().metadata.comparator);
+        AbstractReadCommandBuilder builder = Util.cmd(cfs(), key);
         for (String name : names)
-            s.add(CellNames.simpleDense(bytes(name)));
-        return new SliceByNamesReadCommand(KEYSPACE1, bytes(key), CF_STANDARD, System.currentTimeMillis(), new NamesQueryFilter(s, true));
+            builder.includeRow(name);
+        return builder.withPagingLimit(100).build();
     }
 
-    private static ReadCommand sliceQuery(String key, String start, String end, int count)
+    private static SinglePartitionReadCommand sliceQuery(String key, String start, String end, int count)
     {
         return sliceQuery(key, start, end, false, count);
     }
 
-    private static ReadCommand sliceQuery(String key, String start, String end, boolean reversed, int count)
+    private static SinglePartitionReadCommand sliceQuery(String key, String start, String end, boolean reversed, int count)
     {
-        SliceQueryFilter filter = new SliceQueryFilter(CellNames.simpleDense(bytes(start)), CellNames.simpleDense(bytes(end)), reversed, count);
-        // Note: for MultiQueryTest, we need the same timestamp/expireBefore for all queries, so we just use 0 as it doesn't matter here.
-        return new SliceFromReadCommand(KEYSPACE1, bytes(key), CF_STANDARD, 0, filter);
+        ClusteringComparator cmp = cfs().getComparator();
+        CFMetaData metadata = cfs().metadata;
+
+        Slice slice = Slice.make(cmp.make(start), cmp.make(end));
+        ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(Slices.with(cmp, slice), reversed);
+
+        return SinglePartitionReadCommand.create(cfs().metadata, nowInSec, ColumnFilter.all(metadata), RowFilter.NONE, DataLimits.NONE, Util.dk(key), filter);
     }
 
-    private static RangeSliceCommand rangeNamesQuery(AbstractBounds<RowPosition> range, int count, String... names)
+    private static ReadCommand rangeNamesQuery(String keyStart, String keyEnd, int count, String... names)
     {
-        SortedSet<CellName> s = new TreeSet<CellName>(cfs().metadata.comparator);
+        AbstractReadCommandBuilder builder = Util.cmd(cfs())
+                                                 .fromKeyExcl(keyStart)
+                                                 .toKeyIncl(keyEnd)
+                                                 .withPagingLimit(count);
         for (String name : names)
-            s.add(CellNames.simpleDense(bytes(name)));
-        return new RangeSliceCommand(KEYSPACE1, CF_STANDARD, System.currentTimeMillis(), new NamesQueryFilter(s, true), range, count);
+            builder.includeRow(name);
+
+        return builder.build();
     }
 
-    private static RangeSliceCommand rangeSliceQuery(AbstractBounds<RowPosition> range, int count, String start, String end)
+    private static ReadCommand rangeSliceQuery(String keyStart, String keyEnd, int count, String start, String end)
     {
-        SliceQueryFilter filter = new SliceQueryFilter(CellNames.simpleDense(bytes(start)), CellNames.simpleDense(bytes(end)), false, Integer.MAX_VALUE);
-        return new RangeSliceCommand(KEYSPACE1, CF_STANDARD, System.currentTimeMillis(), filter, range, null, count, true, false);
+        return Util.cmd(cfs())
+                   .fromKeyExcl(keyStart)
+                   .toKeyIncl(keyEnd)
+                   .fromIncl(start)
+                   .toIncl(end)
+                   .withPagingLimit(count)
+                   .build();
     }
 
-    private static void assertRow(Row r, String key, String... names)
+    private static void assertRow(FilteredPartition r, String key, String... names)
     {
         ByteBuffer[] bbs = new ByteBuffer[names.length];
         for (int i = 0; i < names.length; i++)
@@ -189,31 +205,35 @@
         assertRow(r, key, bbs);
     }
 
-    private static void assertRow(Row r, String key, ByteBuffer... names)
+    private static void assertRow(FilteredPartition partition, String key, ByteBuffer... names)
     {
-        assertEquals(key, string(r.key.getKey()));
-        assertNotNull(r.cf);
+        assertEquals(key, string(partition.partitionKey().getKey()));
+        assertFalse(partition.isEmpty());
         int i = 0;
-        for (Cell c : r.cf)
+        for (Row row : Util.once(partition.iterator()))
         {
-            // Ignore deleted cells if we have them
-            if (!c.isLive())
-                continue;
-
             ByteBuffer expected = names[i++];
-            assertEquals("column " + i + " doesn't match: " + toString(r.cf), expected, c.name().toByteBuffer());
+            assertEquals("column " + i + " doesn't match "+string(expected)+" vs "+string(row.clustering().get(0)), expected, row.clustering().get(0));
         }
     }
 
+    private QueryPager maybeRecreate(QueryPager pager, ReadQuery command, boolean testPagingState, int protocolVersion)
+    {
+        if (!testPagingState)
+            return pager;
+
+        PagingState state = PagingState.deserialize(pager.state().serialize(protocolVersion), protocolVersion);
+        return command.getPager(state, protocolVersion);
+    }
+
     @Test
     public void namesQueryTest() throws Exception
     {
-        QueryPager pager = QueryPagers.localPager(namesQuery("k0", "c1", "c5", "c7", "c8"));
+        QueryPager pager = namesQuery("k0", "c1", "c5", "c7", "c8").getPager(null, Server.CURRENT_VERSION);
 
         assertFalse(pager.isExhausted());
-        List<Row> page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c1", "c5", "c7", "c8");
+        List<FilteredPartition> partition = query(pager, 5, 4);
+        assertRow(partition.get(0), "k0", "c1", "c5", "c7", "c8");
 
         assertTrue(pager.isExhausted());
     }
@@ -221,24 +241,33 @@
     @Test
     public void sliceQueryTest() throws Exception
     {
-        QueryPager pager = QueryPagers.localPager(sliceQuery("k0", "c1", "c8", 10));
+        sliceQueryTest(false, Server.VERSION_3);
+        sliceQueryTest(true,  Server.VERSION_3);
 
-        List<Row> page;
+        sliceQueryTest(false, Server.VERSION_4);
+        sliceQueryTest(true,  Server.VERSION_4);
+    }
+
+    public void sliceQueryTest(boolean testPagingState, int protocolVersion) throws Exception
+    {
+        ReadCommand command = sliceQuery("k0", "c1", "c8", 10);
+        QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c1", "c2", "c3");
-
+        List<FilteredPartition> partition = query(pager, 3);
+        assertRow(partition.get(0), "k0", "c1", "c2", "c3");
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c4", "c5", "c6");
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c7", "c8");
+        partition = query(pager, 3);
+        assertRow(partition.get(0), "k0", "c4", "c5", "c6");
+        assertFalse(pager.isExhausted());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        assertFalse(pager.isExhausted());
+        partition = query(pager, 3, 2);
+        assertRow(partition.get(0), "k0", "c7", "c8");
 
         assertTrue(pager.isExhausted());
     }
@@ -246,24 +275,33 @@
     @Test
     public void reversedSliceQueryTest() throws Exception
     {
-        QueryPager pager = QueryPagers.localPager(sliceQuery("k0", "c8", "c1", true, 10));
+        reversedSliceQueryTest(false, Server.VERSION_3);
+        reversedSliceQueryTest(true,  Server.VERSION_3);
 
-        List<Row> page;
+        reversedSliceQueryTest(false, Server.VERSION_4);
+        reversedSliceQueryTest(true,  Server.VERSION_4);
+    }
+
+    public void reversedSliceQueryTest(boolean testPagingState, int protocolVersion) throws Exception
+    {
+        ReadCommand command = sliceQuery("k0", "c1", "c8", true, 10);
+        QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c6", "c7", "c8");
-
+        List<FilteredPartition> partition = query(pager, 3);
+        assertRow(partition.get(0), "k0", "c6", "c7", "c8");
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c3", "c4", "c5");
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k0", "c1", "c2");
+        partition = query(pager, 3);
+        assertRow(partition.get(0), "k0", "c3", "c4", "c5");
+        assertFalse(pager.isExhausted());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        assertFalse(pager.isExhausted());
+        partition = query(pager, 3, 2);
+        assertRow(partition.get(0), "k0", "c1", "c2");
 
         assertTrue(pager.isExhausted());
     }
@@ -271,28 +309,38 @@
     @Test
     public void multiQueryTest() throws Exception
     {
-        QueryPager pager = QueryPagers.localPager(new Pageable.ReadCommands(new ArrayList<ReadCommand>() {{
+        multiQueryTest(false, Server.VERSION_3);
+        multiQueryTest(true,  Server.VERSION_3);
+
+        multiQueryTest(false, Server.VERSION_4);
+        multiQueryTest(true,  Server.VERSION_4);
+    }
+
+    public void multiQueryTest(boolean testPagingState, int protocolVersion) throws Exception
+    {
+        ReadQuery command = new SinglePartitionReadCommand.Group(new ArrayList<SinglePartitionReadCommand>()
+        {{
             add(sliceQuery("k1", "c2", "c6", 10));
             add(sliceQuery("k4", "c3", "c5", 10));
-        }}, 10));
-
-        List<Row> page;
+        }}, DataLimits.NONE);
+        QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k1", "c2", "c3", "c4");
-
+        List<FilteredPartition> partition = query(pager, 3);
+        assertRow(partition.get(0), "k1", "c2", "c3", "c4");
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(4);
-        assertEquals(toString(page), 2, page.size());
-        assertRow(page.get(0), "k1", "c5", "c6");
-        assertRow(page.get(1), "k4", "c3", "c4");
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k4", "c5");
+        partition = query(pager , 4);
+        assertRow(partition.get(0), "k1", "c5", "c6");
+        assertRow(partition.get(1), "k4", "c3", "c4");
+        assertFalse(pager.isExhausted());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        assertFalse(pager.isExhausted());
+        partition = query(pager, 3, 1);
+        assertRow(partition.get(0), "k4", "c5");
 
         assertTrue(pager.isExhausted());
     }
@@ -300,21 +348,29 @@
     @Test
     public void rangeNamesQueryTest() throws Exception
     {
-        QueryPager pager = QueryPagers.localPager(rangeNamesQuery(range("k0", "k5"), 100, "c1", "c4", "c8"));
+        rangeNamesQueryTest(false, Server.VERSION_3);
+        rangeNamesQueryTest(true,  Server.VERSION_3);
 
-        List<Row> page;
+        rangeNamesQueryTest(false, Server.VERSION_4);
+        rangeNamesQueryTest(true,  Server.VERSION_4);
+    }
+
+    public void rangeNamesQueryTest(boolean testPagingState, int protocolVersion) throws Exception
+    {
+        ReadCommand command = rangeNamesQuery("k0", "k5", 100, "c1", "c4", "c8");
+        QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 3, page.size());
+        List<FilteredPartition> partitions = query(pager, 3 * 3);
         for (int i = 1; i <= 3; i++)
-            assertRow(page.get(i-1), "k" + i, "c1", "c4", "c8");
-
+            assertRow(partitions.get(i-1), "k" + i, "c1", "c4", "c8");
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(3);
-        assertEquals(toString(page), 2, page.size());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        assertFalse(pager.isExhausted());
+        partitions = query(pager, 3 * 3, 2 * 3);
         for (int i = 4; i <= 5; i++)
-            assertRow(page.get(i-4), "k" + i, "c1", "c4", "c8");
+            assertRow(partitions.get(i-4), "k" + i, "c1", "c4", "c8");
 
         assertTrue(pager.isExhausted());
     }
@@ -322,42 +378,54 @@
     @Test
     public void rangeSliceQueryTest() throws Exception
     {
-        QueryPager pager = QueryPagers.localPager(rangeSliceQuery(range("k1", "k5"), 100, "c1", "c7"));
+        rangeSliceQueryTest(false, Server.VERSION_3);
+        rangeSliceQueryTest(true,  Server.VERSION_3);
 
-        List<Row> page;
+        rangeSliceQueryTest(false, Server.VERSION_4);
+        rangeSliceQueryTest(true,  Server.VERSION_4);
+    }
+
+    public void rangeSliceQueryTest(boolean testPagingState, int protocolVersion) throws Exception
+    {
+        ReadCommand command = rangeSliceQuery("k1", "k5", 100, "c1", "c7");
+        QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(5);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k2", "c1", "c2", "c3", "c4", "c5");
-
+        List<FilteredPartition> partitions = query(pager, 5);
+        assertRow(partitions.get(0), "k2", "c1", "c2", "c3", "c4", "c5");
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(4);
-        assertEquals(toString(page), 2, page.size());
-        assertRow(page.get(0), "k2", "c6", "c7");
-        assertRow(page.get(1), "k3", "c1", "c2");
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(6);
-        assertEquals(toString(page), 2, page.size());
-        assertRow(page.get(0), "k3", "c3", "c4", "c5", "c6", "c7");
-        assertRow(page.get(1), "k4", "c1");
+        partitions = query(pager, 4);
+        assertRow(partitions.get(0), "k2", "c6", "c7");
+        assertRow(partitions.get(1), "k3", "c1", "c2");
+        assertFalse(pager.isExhausted());
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(5);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k4", "c2", "c3", "c4", "c5", "c6");
+        partitions = query(pager, 6);
+        assertRow(partitions.get(0), "k3", "c3", "c4", "c5", "c6", "c7");
+        assertRow(partitions.get(1), "k4", "c1");
+        assertFalse(pager.isExhausted());
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(5);
-        assertEquals(toString(page), 2, page.size());
-        assertRow(page.get(0), "k4", "c7");
-        assertRow(page.get(1), "k5", "c1", "c2", "c3", "c4");
+        partitions = query(pager, 5);
+        assertRow(partitions.get(0), "k4", "c2", "c3", "c4", "c5", "c6");
+        assertFalse(pager.isExhausted());
 
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        page = pager.fetchPage(5);
-        assertEquals(toString(page), 1, page.size());
-        assertRow(page.get(0), "k5", "c5", "c6", "c7");
+        partitions = query(pager, 5);
+        assertRow(partitions.get(0), "k4", "c7");
+        assertRow(partitions.get(1), "k5", "c1", "c2", "c3", "c4");
+        assertFalse(pager.isExhausted());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        assertFalse(pager.isExhausted());
+        partitions = query(pager, 5, 3);
+        assertRow(partitions.get(0), "k5", "c5", "c6", "c7");
 
         assertTrue(pager.isExhausted());
     }
@@ -369,27 +437,30 @@
         String keyspace = "cql_keyspace";
         String table = "table2";
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
-        CompositeType ct = (CompositeType)cfs.metadata.comparator.asAbstractType();
 
         // Insert rows but with a tombstone as last cell
         for (int i = 0; i < 5; i++)
             executeInternal(String.format("INSERT INTO %s.%s (k, c, v) VALUES ('k%d', 'c%d', null)", keyspace, table, 0, i));
 
-        SliceQueryFilter filter = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 100);
-        QueryPager pager = QueryPagers.localPager(new SliceFromReadCommand(keyspace, bytes("k0"), table, 0, filter));
+        ReadCommand command = SinglePartitionReadCommand.create(cfs.metadata, nowInSec, Util.dk("k0"), Slice.ALL);
+
+        QueryPager pager = command.getPager(null, Server.CURRENT_VERSION);
 
         for (int i = 0; i < 5; i++)
         {
-            List<Row> page = pager.fetchPage(1);
-            assertEquals(toString(page), 1, page.size());
+            List<FilteredPartition> partitions = query(pager, 1);
             // The only live cell we should have each time is the row marker
-            assertRow(page.get(0), "k0", ct.decompose("c" + i, ""));
+            assertRow(partitions.get(0), "k0", "c" + i);
         }
     }
 
     @Test
     public void pagingReversedQueriesWithStaticColumnsTest() throws Exception
     {
+        // There was a bug in paging for reverse queries when the schema includes static columns in
+        // 2.1 & 2.2. This was never a problem in 3.0, this test just guards against regressions
+        // see CASSANDRA-13222
+
         // insert some rows into a single partition
         for (int i=0; i < 5; i++)
             executeInternal(String.format("INSERT INTO %s.%s (pk, ck, st, v1, v2) VALUES ('k0', %3$s, %3$s, %3$s, %3$s)",
@@ -402,29 +473,46 @@
 
     private void queryAndVerifyCells(CFMetaData cfm, boolean reversed, String key) throws Exception
     {
-        SliceQueryFilter filter = new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, reversed, 100, 1);
-        QueryPager pager = QueryPagers.localPager(new SliceFromReadCommand(cfm.ksName, bytes(key), cfm.cfName, 0, filter));
-        CellName staticCellName = cfm.comparator.create(cfm.comparator.staticPrefix(),
-                                                        cfm.staticColumns().iterator().next());
+        ClusteringIndexFilter rowfilter = new ClusteringIndexSliceFilter(Slices.ALL, reversed);
+        ReadCommand command = SinglePartitionReadCommand.create(cfm, nowInSec, Util.dk(key), ColumnFilter.all(cfm), rowfilter);
+        QueryPager pager = command.getPager(null, Server.CURRENT_VERSION);
+
+        ColumnDefinition staticColumn = cfm.partitionColumns().statics.getSimple(0);
+        assertEquals(staticColumn.name.toCQLString(), "st");
+
         for (int i=0; i<5; i++)
         {
-            List<Row> page = pager.fetchPage(1);
-            assertEquals(1, page.size());
-            Row row = page.get(0);
-            assertCell(row.cf, staticCellName, 4);
-            int cellIndex = !reversed ? i : 4 - i;
-            assertCell(row.cf, Util.cellname(ByteBufferUtil.bytes(cellIndex), ByteBufferUtil.bytes("v1")), cellIndex);
-            assertCell(row.cf, Util.cellname(ByteBufferUtil.bytes(cellIndex), ByteBufferUtil.bytes("v2")), cellIndex);
+            try (ReadOrderGroup orderGroup = pager.startOrderGroup();
+                 PartitionIterator partitions = pager.fetchPageInternal(1, orderGroup))
+            {
+                try (RowIterator partition = partitions.next())
+                {
+                    assertCell(partition.staticRow(), staticColumn, 4);
+
+                    Row row = partition.next();
+                    int cellIndex = !reversed ? i : 4 - i;
+
+                    assertEquals(row.clustering().get(0), ByteBufferUtil.bytes(cellIndex));
+                    assertCell(row, cfm.getColumnDefinition(new ColumnIdentifier("v1", false)), cellIndex);
+                    assertCell(row, cfm.getColumnDefinition(new ColumnIdentifier("v2", false)), cellIndex);
+
+                    // the partition/page should contain just a single regular row
+                    assertFalse(partition.hasNext());
+                }
+            }
         }
 
         // After processing the 5 rows there should be no more rows to return
-        List<Row> page = pager.fetchPage(1);
-        assertTrue(page.isEmpty());
+        try ( ReadOrderGroup orderGroup = pager.startOrderGroup();
+              PartitionIterator partitions = pager.fetchPageInternal(1, orderGroup))
+        {
+            assertFalse(partitions.hasNext());
+        }
     }
 
-    private void assertCell(ColumnFamily cf, CellName cellName, int value)
+    private void assertCell(Row row, ColumnDefinition column, int value)
     {
-        Cell cell = cf.getColumn(cellName);
+        Cell cell = row.getCell(column);
         assertNotNull(cell);
         assertEquals(value, ByteBufferUtil.toInt(cell.value()));
     }

diff --git a/test/unit/org/apache/cassandra/service/RMIServerSocketFactoryImplTest.java b/test/unit/org/apache/cassandra/service/RMIServerSocketFactoryImplTest.java
new file mode 100644
index 0000000..393dfe1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/RMIServerSocketFactoryImplTest.java

@@ -0,0 +1,44 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.service;
+
+import java.io.IOException;
+import java.net.ServerSocket;
+import java.rmi.server.RMIServerSocketFactory;
+
+import org.junit.Test;
+
+import org.apache.cassandra.utils.RMIServerSocketFactoryImpl;
+
+import static org.junit.Assert.assertTrue;
+
+
+public class RMIServerSocketFactoryImplTest
+{
+    @Test
+    public void testReusableAddrSocket() throws IOException
+    {
+        RMIServerSocketFactory serverFactory = new RMIServerSocketFactoryImpl();
+        ServerSocket socket = serverFactory.createServerSocket(7199);
+        assertTrue(socket.getReuseAddress());
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/service/ResponseResolverTest.java b/test/unit/org/apache/cassandra/service/ResponseResolverTest.java
deleted file mode 100644
index 7e42825..0000000
--- a/test/unit/org/apache/cassandra/service/ResponseResolverTest.java
+++ /dev/null

@@ -1,325 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.service;
-
-
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.ColumnSlice;
-import org.apache.cassandra.db.filter.SliceQueryFilter;
-import org.apache.cassandra.net.MessageIn;
-import org.apache.cassandra.net.MessagingService;
-
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.fail;
-
-public class ResponseResolverTest extends SchemaLoader
-{
-    private final static String KEYSPACE = "Keyspace1";
-    private final static String TABLE = "Standard1";
-    private final static int MAX_RESPONSE_COUNT = 3;
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(MAX_RESPONSE_COUNT),
-                                    SchemaLoader.standardCFMD(KEYSPACE, TABLE));
-    }
-    
-    @Test
-    public void testSingleMessage_RowDigestResolver() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-        Row row = new Row(key, cf);
-
-        testReadResponses(new RowDigestResolver(KEYSPACE, key, MAX_RESPONSE_COUNT), row, makeReadResponse("127.0.0.1", row));
-    }
-
-    @Test
-    public void testMultipleMessages_RowDigestResolver() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-        Row row = new Row(key, cf);
-
-        testReadResponses(new RowDigestResolver(KEYSPACE, key, MAX_RESPONSE_COUNT),
-                          row,
-                          makeReadResponse("127.0.0.1", row),
-                          makeReadResponse("127.0.0.2", row),
-                          makeReadResponse("127.0.0.3", row));
-    }
-
-    @Test(expected = DigestMismatchException.class)
-    public void testDigestMismatch() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf1.addColumn(column("c1", "v1", 0));
-        Row row1 = new Row(key, cf1);
-
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf2.addColumn(column("c1", "v2", 1));
-        Row row2 = new Row(key, cf2);
-
-        testReadResponses(new RowDigestResolver(KEYSPACE, key, MAX_RESPONSE_COUNT),
-                          row1,
-                          makeReadResponse("127.0.0.1", row1),
-                          makeReadResponse("127.0.0.2", row2),
-                          makeReadResponse("127.0.0.3", row1));
-    }
-
-    @Test
-    public void testMultipleThreads_RowDigestResolver() throws DigestMismatchException, UnknownHostException, InterruptedException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-        Row row = new Row(key, cf);
-
-        testReadResponsesMT(new RowDigestResolver(KEYSPACE, key, MAX_RESPONSE_COUNT),
-                            row,
-                            makeReadResponse("127.0.0.1", row),
-                            makeReadResponse("127.0.0.2", row),
-                            makeReadResponse("127.0.0.3", row));
-    }
-
-    @Test
-    public void testSingleMessage_RowDataResolver() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-        Row row = new Row(key, cf);
-
-        testReadResponses(new RowDataResolver(KEYSPACE,
-                                              key,
-                                              new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 10),
-                                              System.currentTimeMillis(),
-                                              MAX_RESPONSE_COUNT),
-                          row,
-                          makeReadResponse("127.0.0.1", row));
-    }
-
-    @Test
-    public void testMultipleMessages_RowDataResolver() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-        Row row = new Row(key, cf);
-
-        testReadResponses(new RowDataResolver(KEYSPACE,
-                                              key,
-                                              new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 10),
-                                              System.currentTimeMillis(),
-                                              MAX_RESPONSE_COUNT),
-                          row,
-                          makeReadResponse("127.0.0.1", row),
-                          makeReadResponse("127.0.0.2", row),
-                          makeReadResponse("127.0.0.3", row));
-    }
-
-    @Test
-    public void testMultipleThreads_RowDataResolver() throws DigestMismatchException, UnknownHostException, InterruptedException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-        Row row = new Row(key, cf);
-
-        testReadResponsesMT(new RowDataResolver(KEYSPACE,
-                                                key,
-                                                new SliceQueryFilter(ColumnSlice.ALL_COLUMNS_ARRAY, false, 10),
-                                                System.currentTimeMillis(),
-                                                MAX_RESPONSE_COUNT),
-                            row,
-                            makeReadResponse("127.0.0.1", row),
-                            makeReadResponse("127.0.0.2", row),
-                            makeReadResponse("127.0.0.3", row));
-    }
-
-    @Test
-    public void testSingleMessage_RangeSliceResolver() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-
-        Row[] expected = new Row[2];
-        for (int i = 0; i < expected.length; i++)
-            expected[i] = new Row(key, cf);
-
-        MessageIn<RangeSliceReply> message = makeRangeSlice("127.0.0.1", expected);
-
-        RangeSliceResponseResolver resolver = new RangeSliceResponseResolver(KEYSPACE, System.currentTimeMillis());
-        resolver.setSources(Collections.singletonList(message.from));
-
-        testRangeSlices(resolver, expected, message);
-    }
-
-    @Test
-    public void testMultipleMessages_RangeSliceResolver() throws DigestMismatchException, UnknownHostException
-    {
-        ByteBuffer key = bytes("key");
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE, TABLE);
-        cf.addColumn(column("c1", "v1", 0));
-
-        Row[] expected = new Row[2];
-        for (int i = 0; i < expected.length; i++)
-            expected[i] = new Row(key, cf);
-
-        List<InetAddress> sources = new ArrayList<>(3);
-        sources.add(InetAddress.getByName("127.0.0.1"));
-        sources.add(InetAddress.getByName("127.0.0.2"));
-        sources.add(InetAddress.getByName("127.0.0.3"));
-
-        RangeSliceResponseResolver resolver = new RangeSliceResponseResolver(KEYSPACE, System.currentTimeMillis());
-        resolver.setSources(sources);
-
-        testRangeSlices(resolver,
-                        expected,
-                        makeRangeSlice("127.0.0.1", expected),
-                        makeRangeSlice("127.0.0.2", expected),
-                        makeRangeSlice("127.0.0.3", expected));
-    }
-
-    private void testReadResponses(AbstractRowResolver resolver, Row expected, MessageIn<ReadResponse> ... messages) throws DigestMismatchException
-    {
-        for (MessageIn<ReadResponse> message : messages)
-        {
-            resolver.preprocess(message);
-
-            Row row = resolver.getData();
-            checkSame(expected, row);
-
-            row = resolver.resolve();
-            checkSame(expected, row);
-        }
-    }
-
-    private void testReadResponsesMT(final AbstractRowResolver resolver,
-                                     final Row expected,
-                                     final MessageIn<ReadResponse> ... messages) throws InterruptedException
-    {
-        for (MessageIn<ReadResponse> message : messages)
-            resolver.preprocess(message);
-
-        final int threadCount = 45;
-        ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
-        final CountDownLatch finished = new CountDownLatch(threadCount);
-
-        for (int i = 0; i < threadCount; i++)
-        {
-            executorService.submit(new Runnable()
-            {
-                public void run()
-                {
-                    try
-                    {
-                        Row row = resolver.getData();
-                        checkSame(expected, row);
-
-                        row = resolver.resolve();
-                        checkSame(expected, row);
-                    }
-                    catch (DigestMismatchException ex)
-                    {
-                        fail(ex.getMessage());
-                    }
-                    finally
-                    {
-                        finished.countDown();
-                    }
-                }
-            });
-        }
-
-        finished.await();
-        assertEquals(0, executorService.shutdownNow().size());
-
-    }
-
-    private void testRangeSlices(RangeSliceResponseResolver resolver, Row[] expected, MessageIn<RangeSliceReply> ... messages)
-    {
-        for (MessageIn<RangeSliceReply> message : messages)
-        {
-            resolver.preprocess(message);
-
-            List<Row> rows = resolver.getData();
-            assertNotNull(rows);
-
-            for (int i = 0; i < expected.length; i++)
-                checkSame(expected[i], rows.get(i));
-
-            Iterator<Row> rowIt = resolver.resolve().iterator();
-            assertNotNull(rowIt);
-
-            for (Row r : expected)
-                checkSame(r, rowIt.next());
-        }
-    }
-
-    private MessageIn<ReadResponse> makeReadResponse(String address, Row row) throws UnknownHostException
-    {
-        return MessageIn.create(InetAddress.getByName(address),
-                                new ReadResponse(row),
-                                Collections.<String, byte[]>emptyMap(),
-                                MessagingService.Verb.INTERNAL_RESPONSE,
-                                MessagingService.current_version);
-    }
-
-    private MessageIn<RangeSliceReply> makeRangeSlice(String address, Row ... rows) throws UnknownHostException
-    {
-        return MessageIn.create(InetAddress.getByName(address),
-                                new RangeSliceReply(Arrays.asList(rows)),
-                                Collections.<String, byte[]>emptyMap(),
-                                MessagingService.Verb.INTERNAL_RESPONSE,
-                                MessagingService.current_version);
-    }
-
-    private void checkSame(Row r1, Row r2)
-    {
-        assertEquals(r1.key, r2.key);
-        assertEquals(r1.cf, r2.cf);
-    }
-}

diff --git a/test/unit/org/apache/cassandra/service/RowResolverTest.java b/test/unit/org/apache/cassandra/service/RowResolverTest.java
deleted file mode 100644
index 825944c..0000000
--- a/test/unit/org/apache/cassandra/service/RowResolverTest.java
+++ /dev/null

@@ -1,160 +0,0 @@
-package org.apache.cassandra.service;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-
-import java.util.Arrays;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-
-import static org.junit.Assert.*;
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.db.KeyspaceTest.*;
-
-public class RowResolverTest
-{
-    public static final String KEYSPACE1 = "Keyspace1";
-    public static final String CF_STANDARD = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
-    }
-    
-    @Test
-    public void testResolveSupersetNewer()
-    {
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.addColumn(column("c1", "v1", 0));
-
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf2.addColumn(column("c1", "v2", 1));
-
-        ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());
-        assertColumns(resolved, "c1");
-        assertColumns(ColumnFamily.diff(cf1, resolved), "c1");
-        assertNull(ColumnFamily.diff(cf2, resolved));
-    }
-
-    @Test
-    public void testResolveSupersetDisjoint()
-    {
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.addColumn(column("c1", "v1", 0));
-
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf2.addColumn(column("c2", "v2", 1));
-
-        ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());
-        assertColumns(resolved, "c1", "c2");
-        assertColumns(ColumnFamily.diff(cf1, resolved), "c2");
-        assertColumns(ColumnFamily.diff(cf2, resolved), "c1");
-    }
-
-    @Test
-    public void testResolveSupersetNullOne()
-    {
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf2.addColumn(column("c2", "v2", 1));
-
-        ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(null, cf2), System.currentTimeMillis());
-        assertColumns(resolved, "c2");
-        assertColumns(ColumnFamily.diff(null, resolved), "c2");
-        assertNull(ColumnFamily.diff(cf2, resolved));
-    }
-
-    @Test
-    public void testResolveSupersetNullTwo()
-    {
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.addColumn(column("c1", "v1", 0));
-
-        ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, null), System.currentTimeMillis());
-        assertColumns(resolved, "c1");
-        assertNull(ColumnFamily.diff(cf1, resolved));
-        assertColumns(ColumnFamily.diff(null, resolved), "c1");
-    }
-
-    @Test
-    public void testResolveSupersetNullBoth()
-    {
-        assertNull(RowDataResolver.resolveSuperset(Arrays.<ColumnFamily>asList(null, null), System.currentTimeMillis()));
-    }
-
-    @Test
-    public void testResolveDeleted()
-    {
-        // one CF with columns timestamped before a delete in another cf
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.addColumn(column("one", "A", 0));
-
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf2.delete(new DeletionInfo(1L, (int) (System.currentTimeMillis() / 1000)));
-
-        ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2), System.currentTimeMillis());
-        // no columns in the cf
-        assertColumns(resolved);
-        assertTrue(resolved.isMarkedForDelete());
-        assertEquals(1, resolved.deletionInfo().getTopLevelDeletion().markedForDeleteAt);
-    }
-
-    @Test
-    public void testResolveMultipleDeleted()
-    {
-        // deletes and columns with interleaved timestamp, with out of order return sequence
-
-        ColumnFamily cf1 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf1.delete(new DeletionInfo(0L, (int) (System.currentTimeMillis() / 1000)));
-
-        // these columns created after the previous deletion
-        ColumnFamily cf2 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf2.addColumn(column("one", "A", 1));
-        cf2.addColumn(column("two", "A", 1));
-
-        //this column created after the next delete
-        ColumnFamily cf3 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf3.addColumn(column("two", "B", 3));
-
-        ColumnFamily cf4 = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        cf4.delete(new DeletionInfo(2L, (int) (System.currentTimeMillis() / 1000)));
-
-        ColumnFamily resolved = RowDataResolver.resolveSuperset(Arrays.asList(cf1, cf2, cf3, cf4), System.currentTimeMillis());
-        // will have deleted marker and one column
-        assertColumns(resolved, "two");
-        assertColumn(resolved, "two", "B", 3);
-        assertTrue(resolved.isMarkedForDelete());
-        assertEquals(2, resolved.deletionInfo().getTopLevelDeletion().markedForDeleteAt);
-    }
-}

diff --git a/test/unit/org/apache/cassandra/service/SerializationsTest.java b/test/unit/org/apache/cassandra/service/SerializationsTest.java
index 5d2b74d..847bcea 100644
--- a/test/unit/org/apache/cassandra/service/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/service/SerializationsTest.java

@@ -18,18 +18,24 @@
  */
 package org.apache.cassandra.service;
 
-import java.io.DataInputStream;
 import java.io.IOException;
 import java.net.InetAddress;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.UUID;
 
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
 import org.junit.Test;
+
 import org.apache.cassandra.AbstractSerializationsTester;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.Util.PartitionerSwitcher;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.net.MessageIn;
 import org.apache.cassandra.net.MessagingService;
@@ -38,19 +44,30 @@
 import org.apache.cassandra.repair.Validator;
 import org.apache.cassandra.repair.messages.*;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.MerkleTree;
+import org.apache.cassandra.utils.MerkleTrees;
 
 public class SerializationsTest extends AbstractSerializationsTester
 {
-    static
+    private static PartitionerSwitcher partitionerSwitcher;
+    private static UUID RANDOM_UUID;
+    private static Range<Token> FULL_RANGE;
+    private static RepairJobDesc DESC;
+
+    @BeforeClass
+    public static void defineSchema() throws Exception
     {
-        System.setProperty("cassandra.partitioner", "RandomPartitioner");
+        partitionerSwitcher = Util.switchPartitioner(RandomPartitioner.instance);
+        RANDOM_UUID = UUID.fromString("b5c3d033-75aa-4c2f-a819-947aac7a0c54");
+        FULL_RANGE = new Range<>(Util.testPartitioner().getMinimumToken(), Util.testPartitioner().getMinimumToken());
+        DESC = new RepairJobDesc(getVersion() < MessagingService.VERSION_21 ? null : RANDOM_UUID, RANDOM_UUID, "Keyspace1", "Standard1", Arrays.asList(FULL_RANGE));
     }
 
-    private static final UUID RANDOM_UUID = UUID.fromString("b5c3d033-75aa-4c2f-a819-947aac7a0c54");
-    private static final Range<Token> FULL_RANGE = new Range<>(StorageService.getPartitioner().getMinimumToken(), StorageService.getPartitioner().getMinimumToken());
-    private static final RepairJobDesc DESC = new RepairJobDesc(getVersion() < MessagingService.VERSION_21 ? null : RANDOM_UUID, RANDOM_UUID, "Keyspace1", "Standard1", FULL_RANGE);
-
+    @AfterClass
+    public static void tearDown()
+    {
+        partitionerSwitcher.close();
+    }
+    
     private void testRepairMessageWrite(String fileName, RepairMessage... messages) throws IOException
     {
         try (DataOutputStreamPlus out = getOutput(fileName))
@@ -78,7 +95,7 @@
         if (EXECUTE_WRITES)
             testValidationRequestWrite();
 
-        try (DataInputStream in = getInput("service.ValidationRequest.bin"))
+        try (DataInputStreamPlus in = getInput("service.ValidationRequest.bin"))
         {
             RepairMessage message = RepairMessage.serializer.deserialize(in, getVersion());
             assert message.messageType == RepairMessage.Type.VALIDATION_REQUEST;
@@ -92,13 +109,17 @@
     private void testValidationCompleteWrite() throws IOException
     {
         IPartitioner p = RandomPartitioner.instance;
+
+        MerkleTrees mt = new MerkleTrees(p);
+
         // empty validation
-        MerkleTree mt = new MerkleTree(p, FULL_RANGE, MerkleTree.RECOMMENDED_DEPTH, (int) Math.pow(2, 15));
+        mt.addMerkleTree((int) Math.pow(2, 15), FULL_RANGE);
         Validator v0 = new Validator(DESC, FBUtilities.getBroadcastAddress(),  -1);
         ValidationComplete c0 = new ValidationComplete(DESC, mt);
 
         // validation with a tree
-        mt = new MerkleTree(p, FULL_RANGE, MerkleTree.RECOMMENDED_DEPTH, Integer.MAX_VALUE);
+        mt = new MerkleTrees(p);
+        mt.addMerkleTree(Integer.MAX_VALUE, FULL_RANGE);
         for (int i = 0; i < 10; i++)
             mt.split(p.getRandomToken());
         Validator v1 = new Validator(DESC, FBUtilities.getBroadcastAddress(), -1);
@@ -116,31 +137,31 @@
         if (EXECUTE_WRITES)
             testValidationCompleteWrite();
 
-        try (DataInputStream in = getInput("service.ValidationComplete.bin"))
+        try (DataInputStreamPlus in = getInput("service.ValidationComplete.bin"))
         {
             // empty validation
             RepairMessage message = RepairMessage.serializer.deserialize(in, getVersion());
             assert message.messageType == RepairMessage.Type.VALIDATION_COMPLETE;
             assert DESC.equals(message.desc);
 
-            assert ((ValidationComplete) message).success;
-            assert ((ValidationComplete) message).tree != null;
+            assert ((ValidationComplete) message).success();
+            assert ((ValidationComplete) message).trees != null;
 
             // validation with a tree
             message = RepairMessage.serializer.deserialize(in, getVersion());
             assert message.messageType == RepairMessage.Type.VALIDATION_COMPLETE;
             assert DESC.equals(message.desc);
 
-            assert ((ValidationComplete) message).success;
-            assert ((ValidationComplete) message).tree != null;
+            assert ((ValidationComplete) message).success();
+            assert ((ValidationComplete) message).trees != null;
 
             // failed validation
             message = RepairMessage.serializer.deserialize(in, getVersion());
             assert message.messageType == RepairMessage.Type.VALIDATION_COMPLETE;
             assert DESC.equals(message.desc);
 
-            assert !((ValidationComplete) message).success;
-            assert ((ValidationComplete) message).tree == null;
+            assert !((ValidationComplete) message).success();
+            assert ((ValidationComplete) message).trees == null;
 
             // MessageOuts
             for (int i = 0; i < 3; i++)
@@ -168,7 +189,7 @@
         InetAddress src = InetAddress.getByAddress(new byte[]{127, 0, 0, 2});
         InetAddress dest = InetAddress.getByAddress(new byte[]{127, 0, 0, 3});
 
-        try (DataInputStream in = getInput("service.SyncRequest.bin"))
+        try (DataInputStreamPlus in = getInput("service.SyncRequest.bin"))
         {
             RepairMessage message = RepairMessage.serializer.deserialize(in, getVersion());
             assert message.messageType == RepairMessage.Type.SYNC_REQUEST;
@@ -204,7 +225,7 @@
         InetAddress dest = InetAddress.getByAddress(new byte[]{127, 0, 0, 3});
         NodePair nodes = new NodePair(src, dest);
 
-        try (DataInputStream in = getInput("service.SyncComplete.bin"))
+        try (DataInputStreamPlus in = getInput("service.SyncComplete.bin"))
         {
             // success
             RepairMessage message = RepairMessage.serializer.deserialize(in, getVersion());

diff --git a/test/unit/org/apache/cassandra/service/StartupChecksTest.java b/test/unit/org/apache/cassandra/service/StartupChecksTest.java
index d32b1b1..0f30d3c 100644
--- a/test/unit/org/apache/cassandra/service/StartupChecksTest.java
+++ b/test/unit/org/apache/cassandra/service/StartupChecksTest.java

@@ -102,6 +102,13 @@
         startupChecks.verify();
     }
 
+    @Test
+    public void maxMapCountCheck() throws Exception
+    {
+        startupChecks = startupChecks.withTest(StartupChecks.checkMaxMapCount);
+        startupChecks.verify();
+    }
+
     private void copyLegacyNonSSTableFiles(Path targetDir) throws IOException
     {
 

diff --git a/test/unit/org/apache/cassandra/service/StorageProxyTest.java b/test/unit/org/apache/cassandra/service/StorageProxyTest.java
index c8afac0..42eb1f5 100644
--- a/test/unit/org/apache/cassandra/service/StorageProxyTest.java
+++ b/test/unit/org/apache/cassandra/service/StorageProxyTest.java

@@ -23,51 +23,46 @@
 
 import org.junit.BeforeClass;
 import org.junit.Test;
-import static org.junit.Assert.assertEquals;
 
-import static org.apache.cassandra.Util.token;
-import static org.apache.cassandra.Util.rp;
-
-import org.apache.cassandra.db.RowPosition;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.dht.ExcludingBounds;
-import org.apache.cassandra.dht.IncludingExcludingBounds;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.dht.*;
 import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.Util.rp;
+import static org.apache.cassandra.Util.token;
+import static org.junit.Assert.assertEquals;
 
 public class StorageProxyTest
 {
-    private static Range<RowPosition> range(RowPosition left, RowPosition right)
+    private static Range<PartitionPosition> range(PartitionPosition left, PartitionPosition right)
     {
-        return new Range<RowPosition>(left, right);
+        return new Range<PartitionPosition>(left, right);
     }
 
-    private static Bounds<RowPosition> bounds(RowPosition left, RowPosition right)
+    private static Bounds<PartitionPosition> bounds(PartitionPosition left, PartitionPosition right)
     {
-        return new Bounds<RowPosition>(left, right);
+        return new Bounds<PartitionPosition>(left, right);
     }
 
-    private static ExcludingBounds<RowPosition> exBounds(RowPosition left, RowPosition right)
+    private static ExcludingBounds<PartitionPosition> exBounds(PartitionPosition left, PartitionPosition right)
     {
-        return new ExcludingBounds<RowPosition>(left, right);
+        return new ExcludingBounds<PartitionPosition>(left, right);
     }
 
-    private static IncludingExcludingBounds<RowPosition> incExBounds(RowPosition left, RowPosition right)
+    private static IncludingExcludingBounds<PartitionPosition> incExBounds(PartitionPosition left, PartitionPosition right)
     {
-        return new IncludingExcludingBounds<RowPosition>(left, right);
+        return new IncludingExcludingBounds<PartitionPosition>(left, right);
     }
 
-    private static RowPosition startOf(String key)
+    private static PartitionPosition startOf(String key)
     {
-        return StorageService.getPartitioner().getToken(ByteBufferUtil.bytes(key)).minKeyBound();
+        return token(key).minKeyBound();
     }
 
-    private static RowPosition endOf(String key)
+    private static PartitionPosition endOf(String key)
     {
-        return StorageService.getPartitioner().getToken(ByteBufferUtil.bytes(key)).maxKeyBound();
+        return token(key).maxKeyBound();
     }
 
     private static Range<Token> tokenRange(String left, String right)
@@ -83,6 +78,7 @@
     @BeforeClass
     public static void beforeClass() throws Throwable
     {
+        DatabaseDescriptor.getHintsDirectory().mkdir();
         TokenMetadata tmd = StorageService.instance.getTokenMetadata();
         tmd.updateNormalToken(token("1"), InetAddress.getByName("127.0.0.1"));
         tmd.updateNormalToken(token("6"), InetAddress.getByName("127.0.0.6"));
@@ -99,10 +95,10 @@
     }
 
     // test getRestrictedRanges for keys
-    private void testGRRKeys(AbstractBounds<RowPosition> queryRange, AbstractBounds<RowPosition>... expected)
+    private void testGRRKeys(AbstractBounds<PartitionPosition> queryRange, AbstractBounds<PartitionPosition>... expected)
     {
         // Testing for keys
-        List<AbstractBounds<RowPosition>> restrictedKeys = StorageProxy.getRestrictedRanges(queryRange);
+        List<AbstractBounds<PartitionPosition>> restrictedKeys = StorageProxy.getRestrictedRanges(queryRange);
         assertEquals(restrictedKeys.toString(), expected.length, restrictedKeys.size());
         for (int i = 0; i < expected.length; i++)
             assertEquals("Mismatch for index " + i + ": " + restrictedKeys, expected[i], restrictedKeys.get(i));

diff --git a/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java b/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
index a693a23..4c776ba 100644
--- a/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
+++ b/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java

@@ -28,7 +28,6 @@
 
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
-
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -36,21 +35,22 @@
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
+import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.config.Schema;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.WindowsFailedSnapshotTracker;
-import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
+import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.locator.IEndpointSnitch;
 import org.apache.cassandra.locator.PropertyFileSnitch;
 import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.schema.LegacySchemaTables;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.ReplicationParams;
+import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertEquals;
@@ -171,10 +171,10 @@
     }
 
     @Test
-    public void testColumnFamilySnapshot() throws IOException
+    public void testTableSnapshot() throws IOException
     {
         // no need to insert extra data, even an "empty" database will have a little information in the system keyspace
-        StorageService.instance.takeColumnFamilySnapshot(SystemKeyspace.NAME, LegacySchemaTables.KEYSPACES, "cf_snapshot");
+        StorageService.instance.takeTableSnapshot(SchemaKeyspace.NAME, SchemaKeyspace.KEYSPACES, "cf_snapshot");
     }
 
     @Test
@@ -194,10 +194,11 @@
         Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC1", "1");
         configOptions.put("DC2", "1");
+        configOptions.put(ReplicationParams.CLASS, "NetworkTopologyStrategy");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, configOptions));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name,
                                                                                                             InetAddress.getByName("127.0.0.1"));
@@ -236,10 +237,11 @@
         Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC1", "1");
         configOptions.put("DC2", "1");
+        configOptions.put(ReplicationParams.CLASS, "NetworkTopologyStrategy");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, configOptions));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangesForEndpoint(meta.name, InetAddress.getByName("127.0.0.1"));
         assert primaryRanges.size() == 1;
@@ -272,10 +274,11 @@
 
         Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC2", "2");
+        configOptions.put(ReplicationParams.CLASS, "NetworkTopologyStrategy");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, configOptions));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         // endpoints in DC1 should not have primary range
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangesForEndpoint(meta.name, InetAddress.getByName("127.0.0.1"));
@@ -310,10 +313,11 @@
 
         Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC2", "2");
+        configOptions.put(ReplicationParams.CLASS, "NetworkTopologyStrategy");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, configOptions));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         // endpoints in DC1 should not have primary range
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));
@@ -361,10 +365,11 @@
 
         Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC2", "2");
+        configOptions.put(ReplicationParams.CLASS, "NetworkTopologyStrategy");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, configOptions));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         // endpoints in DC1 should not have primary range
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangesForEndpoint(meta.name, InetAddress.getByName("127.0.0.1"));
@@ -427,10 +432,11 @@
         Map<String, String> configOptions = new HashMap<>();
         configOptions.put("DC1", "1");
         configOptions.put("DC2", "2");
+        configOptions.put(ReplicationParams.CLASS, "NetworkTopologyStrategy");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "NetworkTopologyStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.create(false, configOptions));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         // endpoints in DC1 should have primary ranges which also cover DC2
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));
@@ -487,12 +493,9 @@
         metadata.updateNormalToken(new StringToken("B"), InetAddress.getByName("127.0.0.2"));
         metadata.updateNormalToken(new StringToken("C"), InetAddress.getByName("127.0.0.3"));
 
-        Map<String, String> configOptions = new HashMap<>();
-        configOptions.put("replication_factor", "2");
-
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "SimpleStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.simpleTransient(2));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangesForEndpoint(meta.name, InetAddress.getByName("127.0.0.1"));
         assert primaryRanges.size() == 1;
@@ -522,8 +525,8 @@
         configOptions.put("replication_factor", "2");
 
         Keyspace.clear("Keyspace1");
-        KSMetaData meta = KSMetaData.newKeyspace("Keyspace1", "SimpleStrategy", configOptions, false);
-        Schema.instance.setKeyspaceDefinition(meta);
+        KeyspaceMetadata meta = KeyspaceMetadata.create("Keyspace1", KeyspaceParams.simpleTransient(2));
+        Schema.instance.setKeyspaceMetadata(meta);
 
         Collection<Range<Token>> primaryRanges = StorageService.instance.getPrimaryRangeForEndpointWithinDC(meta.name, InetAddress.getByName("127.0.0.1"));
         assert primaryRanges.size() == 1;

diff --git a/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java b/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java
deleted file mode 100644
index 00718b4..0000000
--- a/test/unit/org/apache/cassandra/service/pager/AbstractQueryPagerTest.java
+++ /dev/null

@@ -1,187 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.service.pager;
-
-import java.nio.ByteBuffer;
-import java.util.*;
-
-import org.junit.Test;
-import static org.junit.Assert.*;
-
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellNames;
-import org.apache.cassandra.db.filter.ColumnCounter;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-public class AbstractQueryPagerTest
-{
-    @Test
-    public void discardFirstTest()
-    {
-        TestPager pager = new TestPager();
-        List<Row> rows = Arrays.asList(createRow("r1", 1),
-                                       createRow("r2", 3),
-                                       createRow("r3", 2));
-
-        assertEquals(3, rows.size());
-        assertRow(rows.get(0), "r1", 0);
-        assertRow(rows.get(1), "r2", 0, 1, 2);
-        assertRow(rows.get(2), "r3", 0, 1);
-
-        rows = pager.discardFirst(rows, 1);
-
-        assertEquals(2, rows.size());
-        assertRow(rows.get(0), "r2", 0, 1, 2);
-        assertRow(rows.get(1), "r3", 0, 1);
-
-        rows = pager.discardFirst(rows, 1);
-
-        assertEquals(2, rows.size());
-        assertRow(rows.get(0), "r2", 1, 2);
-        assertRow(rows.get(1), "r3", 0, 1);
-
-        rows = pager.discardFirst(rows, 3);
-
-        assertEquals(1, rows.size());
-        assertRow(rows.get(0), "r3", 1);
-
-        rows = pager.discardFirst(rows, 1);
-
-        assertTrue(rows.isEmpty());
-    }
-
-    @Test
-    public void discardLastTest()
-    {
-        TestPager pager = new TestPager();
-        List<Row> rows = Arrays.asList(createRow("r1", 2),
-                                       createRow("r2", 3),
-                                       createRow("r3", 1));
-
-        assertEquals(3, rows.size());
-        assertRow(rows.get(0), "r1", 0, 1);
-        assertRow(rows.get(1), "r2", 0, 1, 2);
-        assertRow(rows.get(2), "r3", 0);
-
-        rows = pager.discardLast(rows, 1);
-
-        assertEquals(2, rows.size());
-        assertRow(rows.get(0), "r1", 0, 1);
-        assertRow(rows.get(1), "r2", 0, 1, 2);
-
-        rows = pager.discardLast(rows, 1);
-
-        assertEquals(2, rows.size());
-        assertRow(rows.get(0), "r1", 0, 1);
-        assertRow(rows.get(1), "r2", 0, 1);
-
-        rows = pager.discardLast(rows, 3);
-
-        assertEquals(1, rows.size());
-        assertRow(rows.get(0), "r1", 0);
-
-        rows = pager.discardLast(rows, 1);
-
-        assertTrue(rows.isEmpty());
-    }
-
-    private void assertRow(Row row, String name, int... values)
-    {
-        assertEquals(row.key.getKey(), ByteBufferUtil.bytes(name));
-        assertEquals(values.length, row.cf.getColumnCount());
-
-        int i = 0;
-        for (Cell c : row.cf)
-            assertEquals(values[i++], i(c.name().toByteBuffer()));
-    }
-
-    private Row createRow(String name, int nbCol)
-    {
-        return new Row(Util.dk(name), createCF(nbCol));
-    }
-
-    private ColumnFamily createCF(int nbCol)
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(createMetadata());
-        for (int i = 0; i < nbCol; i++)
-            cf.addColumn(CellNames.simpleDense(bb(i)), bb(i), 0);
-        return cf;
-    }
-
-    private static CFMetaData createMetadata()
-    {
-        CFMetaData cfm = new CFMetaData("ks", "cf", ColumnFamilyType.Standard, CellNames.fromAbstractType(Int32Type.instance, false));
-        cfm.rebuild();
-        return cfm;
-    }
-
-    private static ByteBuffer bb(int i)
-    {
-        return ByteBufferUtil.bytes(i);
-    }
-
-    private static int i(ByteBuffer bb)
-    {
-        return ByteBufferUtil.toInt(bb);
-    }
-
-    private static class TestPager extends AbstractQueryPager
-    {
-        public TestPager()
-        {
-            // We use this to test more thorougly DiscardFirst and DiscardLast (more generic pager behavior is tested in
-            // QueryPagerTest). The only thing those method use is the result of the columnCounter() method. So to keep
-            // it simple, we fake all actual parameters in the ctor below but just override the columnCounter() method.
-            super(null, 0, false, createMetadata(), null, 0);
-        }
-
-        @Override
-        public ColumnCounter columnCounter()
-        {
-            return new ColumnCounter(0);
-        }
-
-        public PagingState state()
-        {
-            return null;
-        }
-
-        protected List<Row> queryNextPage(int pageSize, ConsistencyLevel consistency, boolean localQuery)
-        {
-            return null;
-        }
-
-        protected boolean containsPreviousLast(Row first)
-        {
-            return false;
-        }
-
-        protected boolean recordLast(Row last)
-        {
-            return false;
-        }
-
-        protected boolean isReversed()
-        {
-            return false;
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java b/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java
new file mode 100644
index 0000000..8e48771
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/pager/PagingStateTest.java

@@ -0,0 +1,141 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.service.pager;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.ColumnDefinition;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.transport.Server.VERSION_3;
+import static org.apache.cassandra.transport.Server.VERSION_4;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class PagingStateTest
+{
+    private PagingState makeSomePagingState(int protocolVersion)
+    {
+        return makeSomePagingState(protocolVersion, 0);
+    }
+
+    private PagingState makeSomePagingState(int protocolVersion, int remainingInPartition)
+    {
+        CFMetaData metadata = CFMetaData.Builder.create("ks", "tbl")
+                                                .addPartitionKey("k", AsciiType.instance)
+                                                .addClusteringColumn("c1", AsciiType.instance)
+                                                .addClusteringColumn("c1", Int32Type.instance)
+                                                .addRegularColumn("myCol", AsciiType.instance)
+                                                .build();
+
+        ByteBuffer pk = ByteBufferUtil.bytes("someKey");
+
+        ColumnDefinition def = metadata.getColumnDefinition(new ColumnIdentifier("myCol", false));
+        Clustering c = new Clustering(ByteBufferUtil.bytes("c1"), ByteBufferUtil.bytes(42));
+        Row row = BTreeRow.singleCellRow(c, BufferCell.live(metadata, def, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER));
+        PagingState.RowMark mark = PagingState.RowMark.create(metadata, row, protocolVersion);
+        return new PagingState(pk, mark, 10, remainingInPartition);
+    }
+
+    @Test
+    public void testSerializationBackwardCompatibility()
+    {
+        /*
+         * Tests that the serialized paging state for the native protocol V3 is backward compatible
+         * with what old nodes generate. For that, it compares the serialized format to the hard-coded
+         * value of the same state generated on a 2.1. For the curious, said hardcoded value has been
+         * generated by the following code:
+         *     ByteBuffer pk = ByteBufferUtil.bytes("someKey");
+         *     CellName cn = CellNames.compositeSparse(new ByteBuffer[]{ ByteBufferUtil.bytes("c1"), ByteBufferUtil.bytes(42) },
+         *                                             new ColumnIdentifier("myCol", false),
+         *                                             false);
+         *     PagingState state = new PagingState(pk, cn.toByteBuffer(), 10);
+         *     System.out.println("PagingState = " + ByteBufferUtil.bytesToHex(state.serialize()));
+         */
+        PagingState state = makeSomePagingState(VERSION_3);
+
+        String serializedState = ByteBufferUtil.bytesToHex(state.serialize(VERSION_3));
+        // Note that we don't assert exact equality because we know 3.0 nodes include the "remainingInPartition" number
+        // that is not present on 2.1/2.2 nodes. We know this is ok however because we know that 2.1/2.2 nodes will ignore
+        // anything remaining once they have properly deserialized a paging state.
+        assertTrue(serializedState.startsWith("0007736f6d654b65790014000263310000040000002a0000056d79636f6c000000000a"));
+    }
+
+    @Test
+    public void testSerializeV3DeserializeV3()
+    {
+        PagingState state = makeSomePagingState(VERSION_3);
+        ByteBuffer serialized = state.serialize(VERSION_3);
+        assertEquals(serialized.remaining(), state.serializedSize(VERSION_3));
+        assertEquals(state, PagingState.deserialize(serialized, VERSION_3));
+    }
+
+    @Test
+    public void testSerializeV4DeserializeV4()
+    {
+        PagingState state = makeSomePagingState(VERSION_4);
+        ByteBuffer serialized = state.serialize(VERSION_4);
+        assertEquals(serialized.remaining(), state.serializedSize(VERSION_4));
+        assertEquals(state, PagingState.deserialize(serialized, VERSION_4));
+    }
+
+    @Test
+    public void testSerializeV3DeserializeV4()
+    {
+        PagingState state = makeSomePagingState(VERSION_3);
+        ByteBuffer serialized = state.serialize(VERSION_3);
+        assertEquals(serialized.remaining(), state.serializedSize(VERSION_3));
+        assertEquals(state, PagingState.deserialize(serialized, VERSION_4));
+    }
+
+    @Test
+    public void testSerializeV4DeserializeV3()
+    {
+        PagingState state = makeSomePagingState(VERSION_4);
+        ByteBuffer serialized = state.serialize(VERSION_4);
+        assertEquals(serialized.remaining(), state.serializedSize(VERSION_4));
+        assertEquals(state, PagingState.deserialize(serialized, VERSION_3));
+    }
+
+    @Test
+    public void testSerializeV3WithoutRemainingInPartitionDeserializeV3() throws IOException
+    {
+        PagingState state = makeSomePagingState(VERSION_3, Integer.MAX_VALUE);
+        ByteBuffer serialized = state.legacySerialize(false);
+        assertEquals(serialized.remaining(), state.legacySerializedSize(false));
+        assertEquals(state, PagingState.deserialize(serialized, VERSION_3));
+    }
+
+    @Test
+    public void testSerializeV3WithoutRemainingInPartitionDeserializeV4() throws IOException
+    {
+        PagingState state = makeSomePagingState(VERSION_3, Integer.MAX_VALUE);
+        ByteBuffer serialized = state.legacySerialize(false);
+        assertEquals(serialized.remaining(), state.legacySerializedSize(false));
+        assertEquals(state, PagingState.deserialize(serialized, VERSION_4));
+    }
+}

diff --git a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
index 02af9a7..185498f 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java

@@ -28,20 +28,19 @@
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
 import org.junit.After;
 import org.junit.Test;
 
 import junit.framework.Assert;
 import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.streaming.messages.OutgoingFileMessage;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.Ref;
@@ -59,8 +58,7 @@
     {
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
     }
 
@@ -87,7 +85,7 @@
 
         // create streaming task that streams those two sstables
         StreamTransferTask task = new StreamTransferTask(session, cfs.metadata.cfId);
-        for (SSTableReader sstable : cfs.getSSTables())
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
             List<Range<Token>> ranges = new ArrayList<>();
             ranges.add(new Range<>(sstable.first.getToken(), sstable.last.getToken()));
@@ -100,7 +98,7 @@
         f.get();
 
         // when timeout runs on second file, task should be completed
-        f = task.scheduleTimeout(1, 1, TimeUnit.MILLISECONDS);
+        f = task.scheduleTimeout(1, 10, TimeUnit.MILLISECONDS);
         task.complete(1);
         try
         {
@@ -110,6 +108,7 @@
         catch (CancellationException ex)
         {
         }
+
         assertEquals(StreamSession.State.WAIT_COMPLETE, session.state());
 
         // when all streaming are done, time out task should not be scheduled.
@@ -135,8 +134,8 @@
 
         // create streaming task that streams those two sstables
         StreamTransferTask task = new StreamTransferTask(session, cfs.metadata.cfId);
-        List<Ref<SSTableReader>> refs = new ArrayList<>(cfs.getSSTables().size());
-        for (SSTableReader sstable : cfs.getSSTables())
+        List<Ref<SSTableReader>> refs = new ArrayList<>(cfs.getLiveSSTables().size());
+        for (SSTableReader sstable : cfs.getLiveSSTables())
         {
             List<Range<Token>> ranges = new ArrayList<>();
             ranges.add(new Range<>(sstable.first.getToken(), sstable.last.getToken()));

diff --git a/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java b/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
index 0af76c2..fb5e03f 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java

@@ -19,57 +19,43 @@
 
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
-import java.sql.Date;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 
 import com.google.common.collect.Iterables;
 import com.google.common.util.concurrent.FutureCallback;
 import com.google.common.util.concurrent.Futures;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import junit.framework.Assert;
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.compaction.Scrubber;
-import org.apache.cassandra.db.compaction.Scrubber.ScrubResult;
-import org.apache.cassandra.db.context.CounterContext;
-import org.apache.cassandra.db.filter.IDiskAtomFilter;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.SSTableUtils;
-import org.apache.cassandra.locator.SimpleStrategy;
 import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.CounterId;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.OutputHandler;
 import org.apache.cassandra.utils.concurrent.Refs;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
-import static org.apache.cassandra.Util.cellname;
-import static org.apache.cassandra.Util.column;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class StreamingTransferTest
@@ -93,18 +79,21 @@
         SchemaLoader.prepareServer();
         StorageService.instance.initServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDINT, IntegerType.instance),
-                                    SchemaLoader.indexCFMD(KEYSPACE1, CF_INDEX, true));
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_COUNTER, false, true, true)
+                                                      .addPartitionKey("key", BytesType.instance)
+                                                      .build(),
+                                    CFMetaData.Builder.create(KEYSPACE1, CF_STANDARDINT)
+                                                      .addPartitionKey("key", AsciiType.instance)
+                                                      .addClusteringColumn("cols", Int32Type.instance)
+                                                      .addRegularColumn("val", BytesType.instance)
+                                                      .build(),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEX, true));
         SchemaLoader.createKeyspace(KEYSPACE2,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1));
+                                    KeyspaceParams.simple(1));
         SchemaLoader.createKeyspace(KEYSPACE_CACHEKEY,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
+                                    KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE_CACHEKEY, CF_STANDARD),
                                     SchemaLoader.standardCFMD(KEYSPACE_CACHEKEY, CF_STANDARD2),
                                     SchemaLoader.standardCFMD(KEYSPACE_CACHEKEY, CF_STANDARD3));
@@ -140,7 +129,7 @@
     public void testRequestEmpty() throws Exception
     {
         // requesting empty data should succeed
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = Util.testPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         ranges.add(new Range<>(p.getMinimumToken(), p.getToken(ByteBufferUtil.bytes("key1"))));
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("key2")), p.getMinimumToken()));
@@ -177,14 +166,14 @@
             mutator.mutate("key" + i, "col" + i, timestamp);
         cfs.forceBlockingFlush();
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
         // transfer the first and last key
         logger.debug("Transferring {}", cfs.name);
         int[] offs;
         if (transferSSTables)
         {
-            SSTableReader sstable = cfs.getSSTables().iterator().next();
+            SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
             cfs.clearUnsafe();
             transferSSTables(sstable);
             offs = new int[]{1, 3};
@@ -198,22 +187,24 @@
         }
 
         // confirm that a single SSTable was transferred and registered
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
         // and that the index and filter were properly recovered
-        List<Row> rows = Util.getRangeSlice(cfs);
-        assertEquals(offs.length, rows.size());
+        List<ImmutableBTreePartition> partitions = Util.getAllUnfiltered(Util.cmd(cfs).build());
+        assertEquals(offs.length, partitions.size());
         for (int i = 0; i < offs.length; i++)
         {
             String key = "key" + offs[i];
             String col = "col" + offs[i];
-            assert cfs.getColumnFamily(QueryFilter.getIdentityFilter(Util.dk(key), cfs.name, System.currentTimeMillis())) != null;
-            assert rows.get(i).key.getKey().equals(ByteBufferUtil.bytes(key));
-            assert rows.get(i).cf.getColumn(cellname(col)) != null;
+
+            assert !Util.getAll(Util.cmd(cfs, key).build()).isEmpty();
+            ImmutableBTreePartition partition = partitions.get(i);
+            assert ByteBufferUtil.compareUnsigned(partition.partitionKey().getKey(), ByteBufferUtil.bytes(key)) == 0;
+            assert ByteBufferUtil.compareUnsigned(partition.iterator().next().clustering().get(0), ByteBufferUtil.bytes(col)) == 0;
         }
 
         // and that the max timestamp for the file was rediscovered
-        assertEquals(timestamp, cfs.getSSTables().iterator().next().getMaxTimestamp());
+        assertEquals(timestamp, cfs.getLiveSSTables().iterator().next().getMaxTimestamp());
 
         List<String> keys = new ArrayList<>();
         for (int off : offs)
@@ -225,7 +216,7 @@
 
     private void transferSSTables(SSTableReader sstable) throws Exception
     {
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = sstable.getPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         ranges.add(new Range<>(p.getMinimumToken(), p.getToken(ByteBufferUtil.bytes("key1"))));
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("key2")), p.getMinimumToken()));
@@ -234,18 +225,42 @@
 
     private void transferRanges(ColumnFamilyStore cfs) throws Exception
     {
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = cfs.getPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         // wrapped range
         ranges.add(new Range<Token>(p.getToken(ByteBufferUtil.bytes("key1")), p.getToken(ByteBufferUtil.bytes("key0"))));
-        new StreamPlan("StreamingTransferTest").transferRanges(LOCAL, cfs.keyspace.getName(), ranges, cfs.getColumnFamilyName()).execute().get();
+        StreamPlan streamPlan = new StreamPlan("StreamingTransferTest").transferRanges(LOCAL, cfs.keyspace.getName(), ranges, cfs.getColumnFamilyName());
+        streamPlan.execute().get();
         verifyConnectionsAreClosed();
+
+        //cannot add ranges after stream session is finished
+        try
+        {
+            streamPlan.transferRanges(LOCAL, cfs.keyspace.getName(), ranges, cfs.getColumnFamilyName());
+            fail("Should have thrown exception");
+        }
+        catch (RuntimeException e)
+        {
+            //do nothing
+        }
     }
 
     private void transfer(SSTableReader sstable, List<Range<Token>> ranges) throws Exception
     {
-        new StreamPlan("StreamingTransferTest").transferFiles(LOCAL, makeStreamingDetails(ranges, Refs.tryRef(Arrays.asList(sstable)))).execute().get();
+        StreamPlan streamPlan = new StreamPlan("StreamingTransferTest").transferFiles(LOCAL, makeStreamingDetails(ranges, Refs.tryRef(Arrays.asList(sstable))));
+        streamPlan.execute().get();
         verifyConnectionsAreClosed();
+
+        //cannot add files after stream session is finished
+        try
+        {
+            streamPlan.transferFiles(LOCAL, makeStreamingDetails(ranges, Refs.tryRef(Arrays.asList(sstable))));
+            fail("Should have thrown exception");
+        }
+        catch (RuntimeException e)
+        {
+            //do nothing
+        }
     }
 
     /**
@@ -283,19 +298,17 @@
     private void doTransferTable(boolean transferSSTables) throws Exception
     {
         final Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Indexed1");
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_INDEX);
 
         List<String> keys = createAndTransfer(cfs, new Mutator()
         {
             public void mutate(String key, String col, long timestamp) throws Exception
             {
                 long val = key.hashCode();
-                ColumnFamily cf = ArrayBackedSortedColumns.factory.create(keyspace.getName(), cfs.name);
-                cf.addColumn(column(col, "v", timestamp));
-                cf.addColumn(new BufferCell(cellname("birthdate"), ByteBufferUtil.bytes(val), timestamp));
-                Mutation rm = new Mutation(KEYSPACE1, ByteBufferUtil.bytes(key), cf);
-                logger.debug("Applying row to transfer {}", rm);
-                rm.applyUnsafe();
+
+                RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata, timestamp, key);
+                builder.clustering(col).add("birthdate", ByteBufferUtil.bytes(val));
+                builder.build().applyUnsafe();
             }
         }, transferSSTables);
 
@@ -303,15 +316,13 @@
         for (String key : keys)
         {
             long val = key.hashCode();
-            IndexExpression expr = new IndexExpression(ByteBufferUtil.bytes("birthdate"),
-                                                       Operator.EQ,
-                                                       ByteBufferUtil.bytes(val));
-            List<IndexExpression> clause = Arrays.asList(expr);
-            IDiskAtomFilter filter = new IdentityQueryFilter();
-            Range<RowPosition> range = Util.range("", "");
-            List<Row> rows = cfs.search(range, clause, filter, 100);
-            assertEquals(1, rows.size());
-            assert rows.get(0).key.getKey().equals(ByteBufferUtil.bytes(key));
+
+            // test we can search:
+            UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM \"%s\".\"%s\" WHERE birthdate = %d",
+                    cfs.metadata.ksName, cfs.metadata.cfName, val));
+            assertEquals(1, result.size());
+
+            assert result.iterator().next().getBytes("key").equals(ByteBufferUtil.bytes(key));
         }
     }
 
@@ -325,62 +336,49 @@
         String cfname = "StandardInteger1";
         Keyspace keyspace = Keyspace.open(ks);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname);
+        ClusteringComparator comparator = cfs.getComparator();
 
-        String key = "key0";
-        Mutation rm = new Mutation(ks, ByteBufferUtil.bytes(key));
-        // add columns of size slightly less than column_index_size to force insert column index
-        rm.add(cfname, cellname(1), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize() - 64]), 2);
-        rm.add(cfname, cellname(6), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize()]), 2);
-        ColumnFamily cf = rm.addOrGet(cfname);
-        // add RangeTombstones
-        cf.delete(new DeletionInfo(cellname(2), cellname(3), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
-        cf.delete(new DeletionInfo(cellname(5), cellname(7), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
-        cf.delete(new DeletionInfo(cellname(8), cellname(10), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
-        rm.applyUnsafe();
+        String key = "key1";
 
-        key = "key1";
-        rm = new Mutation(ks, ByteBufferUtil.bytes(key));
+
+        RowUpdateBuilder updates = new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros(), key);
+
         // add columns of size slightly less than column_index_size to force insert column index
-        rm.add(cfname, cellname(1), ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize() - 64]), 2);
-        cf = rm.addOrGet(cfname);
+        updates.clustering(1)
+                .add("val", ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize() - 64]))
+                .build()
+                .apply();
+
+        updates = new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros(), key);
+        updates.clustering(6)
+                .add("val", ByteBuffer.wrap(new byte[DatabaseDescriptor.getColumnIndexSize()]))
+                .build()
+                .apply();
+
         // add RangeTombstones
-        cf.delete(new DeletionInfo(cellname(2), cellname(3), cf.getComparator(), 1, (int) (System.currentTimeMillis() / 1000)));
-        rm.applyUnsafe();
+        //updates = new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros() + 1 , key);
+        //updates.addRangeTombstone(Slice.make(comparator, comparator.make(2), comparator.make(4)))
+        //        .build()
+        //        .apply();
+
+
+        updates = new RowUpdateBuilder(cfs.metadata, FBUtilities.timestampMicros() + 1, key);
+        updates.addRangeTombstone(Slice.make(comparator.make(5), comparator.make(7)))
+                .build()
+                .apply();
 
         cfs.forceBlockingFlush();
 
-        int cellCount = countCells(cfs);
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         cfs.clearUnsafe();
         transferSSTables(sstable);
 
         // confirm that a single SSTable was transferred and registered
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
 
-        // Verify table
-        assertEquals(cellCount, countCells(cfs));
-
-        List<Row> rows = Util.getRangeSlice(cfs);
-        assertEquals(2, rows.size());
-    }
-
-    private int countCells(ColumnFamilyStore cfs)
-    {
-        int cellCount = 0;
-        for (SSTableReader sstable : cfs.getSSTables())
-        {
-            Iterator<OnDiskAtomIterator> it = sstable.getScanner();
-            while (it.hasNext())
-            {
-                Iterator<OnDiskAtom> itr = it.next();
-                while (itr.hasNext())
-                {
-                    ++cellCount;
-                    itr.next();
-                }
-            }
-        }
-        return cellCount;
+        Row r = Util.getOnlyRow(Util.cmd(cfs).build());
+        Assert.assertFalse(r.isEmpty());
+        Assert.assertTrue(1 == Int32Type.instance.compose(r.clustering().get(0)));
     }
 
     @Test
@@ -395,6 +393,7 @@
         doTransferTable(true);
     }
 
+    /*
     @Test
     public void testTransferTableCounter() throws Exception
     {
@@ -406,7 +405,7 @@
 
         List<String> keys = createAndTransfer(cfs, new Mutator()
         {
-            /** Creates a new SSTable per key: all will be merged before streaming. */
+            // Creates a new SSTable per key: all will be merged before streaming.
             public void mutate(String key, String col, long timestamp) throws Exception
             {
                 Map<String, ColumnFamily> entries = new HashMap<>();
@@ -437,13 +436,13 @@
             .cf(cfs.name)
             .generation(0)
             .write(cleanedEntries);
-        SSTableReader streamed = cfs.getSSTables().iterator().next();
+        SSTableReader streamed = cfs.getLiveSSTables().iterator().next();
         SSTableUtils.assertContentEquals(cleaned, streamed);
 
         // Retransfer the file, making sure it is now idempotent (see CASSANDRA-3481)
         cfs.clearUnsafe();
         transferSSTables(streamed);
-        SSTableReader restreamed = cfs.getSSTables().iterator().next();
+        SSTableReader restreamed = cfs.getLiveSSTables().iterator().next();
         SSTableUtils.assertContentEquals(streamed, restreamed);
     }
 
@@ -466,7 +465,7 @@
         SSTableReader sstable2 = SSTableUtils.prepare().write(content);
 
         // transfer the first and last key
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = Util.testPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         ranges.add(new Range<>(p.getMinimumToken(), p.getToken(ByteBufferUtil.bytes("test"))));
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("transfer2")), p.getMinimumToken()));
@@ -495,7 +494,7 @@
     public void testTransferOfMultipleColumnFamilies() throws Exception
     {
         String keyspace = KEYSPACE_CACHEKEY;
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = Util.testPartitioner();
         String[] columnFamilies = new String[] { "Standard1", "Standard2", "Standard3" };
         List<SSTableReader> ssTableReaders = new ArrayList<>();
 
@@ -562,19 +561,19 @@
             mutator.mutate("key" + i, "col" + i, System.currentTimeMillis());
         cfs.forceBlockingFlush();
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
-        SSTableReader sstable = cfs.getSSTables().iterator().next();
+        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         cfs.clearUnsafe();
 
-        IPartitioner p = StorageService.getPartitioner();
+        IPartitioner p = Util.testPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("key1")), p.getToken(ByteBufferUtil.bytes("key1000"))));
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("key5")), p.getToken(ByteBufferUtil.bytes("key500"))));
         ranges.add(new Range<>(p.getToken(ByteBufferUtil.bytes("key9")), p.getToken(ByteBufferUtil.bytes("key900"))));
         transfer(sstable, ranges);
-        assertEquals(1, cfs.getSSTables().size());
+        assertEquals(1, cfs.getLiveSSTables().size());
         assertEquals(7, Util.getRangeSlice(cfs).size());
     }
-
+    */
     public interface Mutator
     {
         public void mutate(String key, String col, long timestamp) throws Exception;

diff --git a/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java b/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java
deleted file mode 100644
index 87d93fd..0000000
--- a/test/unit/org/apache/cassandra/streaming/compress/CompressedInputStreamTest.java
+++ /dev/null

@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.streaming.compress;
-
-import java.io.*;
-import java.util.*;
-import java.util.concurrent.SynchronousQueue;
-import java.util.concurrent.TimeUnit;
-
-import org.junit.Test;
-
-import org.apache.cassandra.db.composites.*;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.io.compress.CompressedSequentialWriter;
-import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.io.compress.CompressionParameters;
-import org.apache.cassandra.io.compress.SnappyCompressor;
-import org.apache.cassandra.io.sstable.Component;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.utils.Pair;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
-
-/**
- */
-public class CompressedInputStreamTest
-{
-    @Test
-    public void testCompressedRead() throws Exception
-    {
-        testCompressedReadWith(new long[]{0L}, false, false);
-        testCompressedReadWith(new long[]{1L}, false, false);
-        testCompressedReadWith(new long[]{100L}, false, false);
-
-        testCompressedReadWith(new long[]{1L, 122L, 123L, 124L, 456L}, false, false);
-    }
-
-    @Test(expected = EOFException.class)
-    public void testTruncatedRead() throws Exception
-    {
-        testCompressedReadWith(new long[]{1L, 122L, 123L, 124L, 456L}, true, false);
-    }
-
-    /**
-     * Test that CompressedInputStream does not block if there's an exception while reading stream
-     */
-    @Test(timeout = 30000)
-    public void testException() throws Exception
-    {
-        testCompressedReadWith(new long[]{1L, 122L, 123L, 124L, 456L}, false, true);
-    }
-
-    /**
-     * @param valuesToCheck array of longs of range(0-999)
-     * @throws Exception
-     */
-    private void testCompressedReadWith(long[] valuesToCheck, boolean testTruncate, boolean testException) throws Exception
-    {
-        assert valuesToCheck != null && valuesToCheck.length > 0;
-
-        // write compressed data file of longs
-        File tmp = new File(File.createTempFile("cassandra", "unittest").getParent(), "ks-cf-ib-1-Data.db");
-        Descriptor desc = Descriptor.fromFilename(tmp.getAbsolutePath());
-        MetadataCollector collector = new MetadataCollector(new SimpleDenseCellNameType(BytesType.instance));
-        CompressionParameters param = new CompressionParameters(SnappyCompressor.instance, 32, Collections.<String, String>emptyMap());
-        Map<Long, Long> index = new HashMap<>();
-        try (CompressedSequentialWriter writer = new CompressedSequentialWriter(tmp, desc.filenameFor(Component.COMPRESSION_INFO), param, collector))
-        {
-            for (long l = 0L; l < 1000; l++)
-            {
-                index.put(l, writer.getFilePointer());
-                writer.stream.writeLong(l);
-            }
-            writer.finish();
-        }
-
-        CompressionMetadata comp = CompressionMetadata.create(tmp.getAbsolutePath());
-        List<Pair<Long, Long>> sections = new ArrayList<>();
-        for (long l : valuesToCheck)
-        {
-            long position = index.get(l);
-            sections.add(Pair.create(position, position + 8));
-        }
-        CompressionMetadata.Chunk[] chunks = comp.getChunksForSections(sections);
-        long totalSize = comp.getTotalSizeForSections(sections);
-        long expectedSize = 0;
-        for (CompressionMetadata.Chunk c : chunks)
-            expectedSize += c.length + 4;
-        assertEquals(expectedSize, totalSize);
-
-        // buffer up only relevant parts of file
-        int size = 0;
-        for (CompressionMetadata.Chunk c : chunks)
-            size += (c.length + 4); // 4bytes CRC
-        byte[] toRead = new byte[size];
-
-        try (RandomAccessFile f = new RandomAccessFile(tmp, "r"))
-        {
-            int pos = 0;
-            for (CompressionMetadata.Chunk c : chunks)
-            {
-                f.seek(c.offset);
-                pos += f.read(toRead, pos, c.length + 4);
-            }
-        }
-
-        if (testTruncate)
-        {
-            byte [] actuallyRead = new byte[50];
-            System.arraycopy(toRead, 0, actuallyRead, 0, 50);
-            toRead = actuallyRead;
-        }
-
-        // read buffer using CompressedInputStream
-        CompressionInfo info = new CompressionInfo(chunks, param);
-
-        if (testException)
-        {
-            testException(sections, info);
-            return;
-        }
-        CompressedInputStream input = new CompressedInputStream(new ByteArrayInputStream(toRead), info);
-
-        try (DataInputStream in = new DataInputStream(input))
-        {
-            for (int i = 0; i < sections.size(); i++)
-            {
-                input.position(sections.get(i).left);
-                long readValue = in.readLong();
-                assertEquals("expected " + valuesToCheck[i] + " but was " + readValue, valuesToCheck[i], readValue);
-            }
-        }
-    }
-
-    private static void testException(List<Pair<Long, Long>> sections, CompressionInfo info) throws IOException
-    {
-        CompressedInputStream input = new CompressedInputStream(new ByteArrayInputStream(new byte[0]), info);
-
-        try (DataInputStream in = new DataInputStream(input))
-        {
-            for (int i = 0; i < sections.size(); i++)
-            {
-                input.position(sections.get(i).left);
-                try {
-                    in.readLong();
-                    fail("Should have thrown IOException");
-                }
-                catch (IOException e)
-                {
-                    continue;
-                }
-            }
-        }
-    }
-}

diff --git a/test/unit/org/apache/cassandra/streaming/compression/CompressedInputStreamTest.java b/test/unit/org/apache/cassandra/streaming/compression/CompressedInputStreamTest.java
new file mode 100644
index 0000000..8512d8f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/streaming/compression/CompressedInputStreamTest.java

@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.streaming.compression;
+
+import java.io.*;
+import java.util.*;
+import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
+import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.streaming.compress.CompressedInputStream;
+import org.apache.cassandra.streaming.compress.CompressionInfo;
+import org.apache.cassandra.utils.ChecksumType;
+import org.apache.cassandra.utils.Pair;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+/**
+ */
+public class CompressedInputStreamTest
+{
+    @Test
+    public void testCompressedRead() throws Exception
+    {
+        testCompressedReadWith(new long[]{0L}, false, false);
+        testCompressedReadWith(new long[]{1L}, false, false);
+        testCompressedReadWith(new long[]{100L}, false, false);
+
+        testCompressedReadWith(new long[]{1L, 122L, 123L, 124L, 456L}, false, false);
+    }
+
+    @Test(expected = EOFException.class)
+    public void testTruncatedRead() throws Exception
+    {
+        testCompressedReadWith(new long[]{1L, 122L, 123L, 124L, 456L}, true, false);
+    }
+
+    /**
+     * Test that CompressedInputStream does not block if there's an exception while reading stream
+     */
+    @Test(timeout = 30000)
+    public void testException() throws Exception
+    {
+        testCompressedReadWith(new long[]{1L, 122L, 123L, 124L, 456L}, false, true);
+    }
+
+    /**
+     * @param valuesToCheck array of longs of range(0-999)
+     * @throws Exception
+     */
+    private void testCompressedReadWith(long[] valuesToCheck, boolean testTruncate, boolean testException) throws Exception
+    {
+        assert valuesToCheck != null && valuesToCheck.length > 0;
+
+        // write compressed data file of longs
+        File tmp = new File(File.createTempFile("cassandra", "unittest").getParent(), "ks-cf-ib-1-Data.db");
+        Descriptor desc = Descriptor.fromFilename(tmp.getAbsolutePath());
+        MetadataCollector collector = new MetadataCollector(new ClusteringComparator(BytesType.instance));
+        CompressionParams param = CompressionParams.snappy(32);
+        Map<Long, Long> index = new HashMap<Long, Long>();
+        try (CompressedSequentialWriter writer = new CompressedSequentialWriter(tmp, desc.filenameFor(Component.COMPRESSION_INFO), param, collector))
+        {
+            for (long l = 0L; l < 1000; l++)
+            {
+                index.put(l, writer.position());
+                writer.writeLong(l);
+            }
+            writer.finish();
+        }
+
+        CompressionMetadata comp = CompressionMetadata.create(tmp.getAbsolutePath());
+        List<Pair<Long, Long>> sections = new ArrayList<>();
+        for (long l : valuesToCheck)
+        {
+            long position = index.get(l);
+            sections.add(Pair.create(position, position + 8));
+        }
+        CompressionMetadata.Chunk[] chunks = comp.getChunksForSections(sections);
+        long totalSize = comp.getTotalSizeForSections(sections);
+        long expectedSize = 0;
+        for (CompressionMetadata.Chunk c : chunks)
+            expectedSize += c.length + 4;
+        assertEquals(expectedSize, totalSize);
+
+        // buffer up only relevant parts of file
+        int size = 0;
+        for (CompressionMetadata.Chunk c : chunks)
+            size += (c.length + 4); // 4bytes CRC
+        byte[] toRead = new byte[size];
+
+        try (RandomAccessFile f = new RandomAccessFile(tmp, "r"))
+        {
+            int pos = 0;
+            for (CompressionMetadata.Chunk c : chunks)
+            {
+                f.seek(c.offset);
+                pos += f.read(toRead, pos, c.length + 4);
+            }
+        }
+
+        if (testTruncate)
+        {
+            byte [] actuallyRead = new byte[50];
+            System.arraycopy(toRead, 0, actuallyRead, 0, 50);
+            toRead = actuallyRead;
+        }
+
+        // read buffer using CompressedInputStream
+        CompressionInfo info = new CompressionInfo(chunks, param);
+
+        if (testException)
+        {
+            testException(sections, info);
+            return;
+        }
+        CompressedInputStream input = new CompressedInputStream(new ByteArrayInputStream(toRead), info, ChecksumType.CRC32, () -> 1.0);
+
+        try (DataInputStream in = new DataInputStream(input))
+        {
+            for (int i = 0; i < sections.size(); i++)
+            {
+                input.position(sections.get(i).left);
+                long readValue = in.readLong();
+                assertEquals("expected " + valuesToCheck[i] + " but was " + readValue, valuesToCheck[i], readValue);
+            }
+        }
+    }
+
+    private static void testException(List<Pair<Long, Long>> sections, CompressionInfo info) throws IOException
+    {
+        CompressedInputStream input = new CompressedInputStream(new ByteArrayInputStream(new byte[0]), info, ChecksumType.CRC32, () -> 1.0);
+
+        try (DataInputStream in = new DataInputStream(input))
+        {
+            for (int i = 0; i < sections.size(); i++)
+            {
+                input.position(sections.get(i).left);
+                try {
+                    in.readLong();
+                    fail("Should have thrown IOException");
+                }
+                catch (IOException e)
+                {
+                    continue;
+                }
+            }
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/thrift/MultiSliceTest.java b/test/unit/org/apache/cassandra/thrift/MultiSliceTest.java
deleted file mode 100644
index 9716876..0000000
--- a/test/unit/org/apache/cassandra/thrift/MultiSliceTest.java
+++ /dev/null

@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.thrift;
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.nio.ByteBuffer;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import junit.framework.Assert;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.service.EmbeddedCassandraService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.thrift.TException;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class MultiSliceTest
-{
-    private static CassandraServer server;
-    public static final String KEYSPACE1 = "MultiSliceTest";
-    public static final String CF_STANDARD = "Standard1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException, IOException, TException
-    {
-        SchemaLoader.prepareServer();
-        new EmbeddedCassandraService().start();
-        ThriftSessionManager.instance.setCurrentSocket(new InetSocketAddress(9160));
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
-        server = new CassandraServer();
-        server.set_keyspace(KEYSPACE1);
-    }
-
-    private static MultiSliceRequest makeMultiSliceRequest(ByteBuffer key)
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        MultiSliceRequest req = new MultiSliceRequest();
-        req.setKey(key);
-        req.setCount(1000);
-        req.reversed = false;
-        req.setColumn_parent(cp);
-        return req;
-    }
-    
-    @Test
-    public void test_multi_slice_optional_column_slice() throws TException
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        ByteBuffer key = ByteBuffer.wrap("multi_slice".getBytes());
-        List<String> expected = new ArrayList<String>();
-        for (char a = 'a'; a <= 'z'; a++)
-            expected.add(a + "");
-
-        addTheAlphabetToRow(key, cp);
-        MultiSliceRequest req = makeMultiSliceRequest(key);
-        req.setColumn_slices(new ArrayList<ColumnSlice>());
-        req.getColumn_slices().add(new ColumnSlice());
-        List<ColumnOrSuperColumn> list = server.get_multi_slice(req);
-        assertColumnNameMatches(expected, list);
-    }
-    
-    @Test
-    public void test_multi_slice() throws TException
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        ByteBuffer key = ByteBuffer.wrap("multi_slice_two_slice".getBytes());
-        addTheAlphabetToRow(key, cp);
-        MultiSliceRequest req = makeMultiSliceRequest(key);
-        req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("i", "n")));
-        assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "i", "j", "k" , "l", "m" , "n"), server.get_multi_slice(req));
-    }
-    
-    @Test
-    public void test_with_overlap() throws TException
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        ByteBuffer key = ByteBuffer.wrap("overlap".getBytes());
-        addTheAlphabetToRow(key, cp);
-        MultiSliceRequest req = makeMultiSliceRequest(key);
-        req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("d", "g")));
-        assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "f", "g"), server.get_multi_slice(req));
-    }
-    
-    @Test
-    public void test_with_overlap_reversed() throws TException
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        ByteBuffer key = ByteBuffer.wrap("overlap_reversed".getBytes());
-        addTheAlphabetToRow(key, cp);
-        MultiSliceRequest req = makeMultiSliceRequest(key);
-        req.reversed = true;
-        req.setColumn_slices(Arrays.asList(columnSliceFrom("e", "a"), columnSliceFrom("g", "d")));
-        assertColumnNameMatches(Arrays.asList("g", "f", "e", "d", "c", "b", "a"), server.get_multi_slice(req));
-    }
-
-    @Test(expected=InvalidRequestException.class)
-    public void test_that_column_slice_is_proper() throws TException
-    {
-      ByteBuffer key = ByteBuffer.wrap("overlap".getBytes());
-      MultiSliceRequest req = makeMultiSliceRequest(key);
-      req.reversed = true;
-      req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("g", "d")));
-      assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "f", "g"), server.get_multi_slice(req));
-    }
-    
-    @Test
-    public void test_with_overlap_reversed_with_count() throws TException
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        ByteBuffer key = ByteBuffer.wrap("overlap_reversed_count".getBytes());
-        addTheAlphabetToRow(key, cp);
-        MultiSliceRequest req = makeMultiSliceRequest(key);
-        req.setCount(6);
-        req.reversed = true;
-        req.setColumn_slices(Arrays.asList(columnSliceFrom("e", "a"), columnSliceFrom("g", "d")));
-        assertColumnNameMatches(Arrays.asList("g", "f", "e", "d", "c", "b"), server.get_multi_slice(req));
-    }
-
-    @Test
-    public void test_with_overlap_with_count() throws TException
-    {
-        ColumnParent cp = new ColumnParent("Standard1");
-        ByteBuffer key = ByteBuffer.wrap("overlap_reversed_count".getBytes());
-        addTheAlphabetToRow(key, cp);
-        MultiSliceRequest req = makeMultiSliceRequest(key);
-        req.setCount(6);
-        req.setColumn_slices(Arrays.asList(columnSliceFrom("a", "e"), columnSliceFrom("d", "g"), columnSliceFrom("d", "g")));
-        assertColumnNameMatches(Arrays.asList("a", "b", "c", "d", "e", "f"), server.get_multi_slice(req));
-    }
-
-    private static void addTheAlphabetToRow(ByteBuffer key, ColumnParent parent) 
-            throws InvalidRequestException, UnavailableException, TimedOutException
-    {
-        for (char a = 'a'; a <= 'z'; a++) {
-            Column c1 = new Column();
-            c1.setName(ByteBufferUtil.bytes(String.valueOf(a)));
-            c1.setValue(new byte [0]);
-            c1.setTimestamp(System.nanoTime());
-            server.insert(key, parent, c1, ConsistencyLevel.ONE); 
-         }
-    }
-    
-    private static void assertColumnNameMatches(List<String> expected , List<ColumnOrSuperColumn> actual)
-    {
-        Assert.assertEquals(actual+" "+expected +" did not have same number of elements", actual.size(), expected.size());
-        for (int i = 0 ; i< expected.size() ; i++)
-        {
-            Assert.assertEquals(actual.get(i) +" did not equal "+ expected.get(i), 
-                    expected.get(i), new String(actual.get(i).getColumn().getName()));
-        }
-    }
-    
-    private ColumnSlice columnSliceFrom(String startInclusive, String endInclusive)
-    {
-        ColumnSlice cs = new ColumnSlice();
-        cs.setStart(ByteBufferUtil.bytes(startInclusive));
-        cs.setFinish(ByteBufferUtil.bytes(endInclusive));
-        return cs;
-    }
-}

diff --git a/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java b/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java
deleted file mode 100644
index c693f7c..0000000
--- a/test/unit/org/apache/cassandra/thrift/ThriftValidationTest.java
+++ /dev/null

@@ -1,205 +0,0 @@
-package org.apache.cassandra.thrift;
-/*
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- */
-
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.*;
-import org.apache.cassandra.db.marshal.*;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.LocalStrategy;
-import org.apache.cassandra.locator.NetworkTopologyStrategy;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.utils.ByteBufferUtil;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-public class ThriftValidationTest
-{
-    public static final String KEYSPACE1 = "MultiSliceTest";
-    public static final String CF_STANDARD = "Standard1";
-    public static final String CF_COUNTER = "Counter1";
-    public static final String CF_UUID = "UUIDKeys";
-    public static final String CF_STANDARDLONG3 = "StandardLong3";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_UUID).keyValidator(UUIDType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLONG3, IntegerType.instance));
-    }
-    
-    @Test(expected=org.apache.cassandra.exceptions.InvalidRequestException.class)
-    public void testValidateCommutativeWithStandard() throws org.apache.cassandra.exceptions.InvalidRequestException
-    {
-        ThriftValidation.validateColumnFamily(KEYSPACE1, "Standard1", true);
-    }
-
-    @Test
-    public void testValidateCommutativeWithCounter() throws org.apache.cassandra.exceptions.InvalidRequestException
-    {
-        ThriftValidation.validateColumnFamily(KEYSPACE1, "Counter1", true);
-    }
-
-    @Test
-    public void testColumnNameEqualToKeyAlias() throws org.apache.cassandra.exceptions.InvalidRequestException
-    {
-        CFMetaData metaData = Schema.instance.getCFMetaData(KEYSPACE1, "Standard1");
-        CFMetaData newMetadata = metaData.copy();
-
-        boolean gotException = false;
-
-        // add a key_alias = "id"
-        // should not throw IRE here
-        try
-        {
-            newMetadata.addColumnDefinition(ColumnDefinition.partitionKeyDef(metaData, AsciiType.instance.decompose("id"), LongType.instance, null));
-            newMetadata.validate();
-        }
-        catch (ConfigurationException e)
-        {
-            gotException = true;
-        }
-
-        assert !gotException : "got unexpected ConfigurationException";
-
-
-        gotException = false;
-
-        // add a column with name = "id"
-        try
-        {
-            newMetadata.addColumnDefinition(ColumnDefinition.regularDef(metaData, ByteBufferUtil.bytes("id"), LongType.instance, null));
-            newMetadata.validate();
-        }
-        catch (ConfigurationException e)
-        {
-            gotException = true;
-        }
-
-        assert gotException : "expected ConfigurationException but not received.";
-
-        // make sure the key alias does not affect validation of columns with the same name (CASSANDRA-6892)
-        Column column = new Column(ByteBufferUtil.bytes("id"));
-        column.setValue(ByteBufferUtil.bytes("not a long"));
-        column.setTimestamp(1234);
-        ByteBuffer key = ByteBufferUtil.bytes("key");
-        ThriftValidation.validateColumnData(newMetadata, key, null, column);
-    }
-
-    @Test
-    public void testColumnNameEqualToDefaultKeyAlias() throws org.apache.cassandra.exceptions.InvalidRequestException
-    {
-        CFMetaData metaData = Schema.instance.getCFMetaData(KEYSPACE1, "UUIDKeys");
-        ColumnDefinition definition = metaData.getColumnDefinition(ByteBufferUtil.bytes(CFMetaData.DEFAULT_KEY_ALIAS));
-        assertNotNull(definition);
-        assertEquals(ColumnDefinition.Kind.PARTITION_KEY, definition.kind);
-
-        // make sure the key alias does not affect validation of columns with the same name (CASSANDRA-6892)
-        Column column = new Column(ByteBufferUtil.bytes(CFMetaData.DEFAULT_KEY_ALIAS));
-        column.setValue(ByteBufferUtil.bytes("not a uuid"));
-        column.setTimestamp(1234);
-        ByteBuffer key = ByteBufferUtil.bytes("key");
-        ThriftValidation.validateColumnData(metaData, key, null, column);
-
-        IndexExpression expression = new IndexExpression(ByteBufferUtil.bytes(CFMetaData.DEFAULT_KEY_ALIAS), IndexOperator.EQ, ByteBufferUtil.bytes("a"));
-        ThriftValidation.validateFilterClauses(metaData, Arrays.asList(expression));
-    }
-
-    @Test
-    public void testColumnNameEqualToDefaultColumnAlias() throws org.apache.cassandra.exceptions.InvalidRequestException
-    {
-        CFMetaData metaData = Schema.instance.getCFMetaData(KEYSPACE1, "StandardLong3");
-        ColumnDefinition definition = metaData.getColumnDefinition(ByteBufferUtil.bytes(CFMetaData.DEFAULT_COLUMN_ALIAS + 1));
-        assertNotNull(definition);
-
-        // make sure the column alias does not affect validation of columns with the same name (CASSANDRA-6892)
-        Column column = new Column(ByteBufferUtil.bytes(CFMetaData.DEFAULT_COLUMN_ALIAS + 1));
-        column.setValue(ByteBufferUtil.bytes("not a long"));
-        column.setTimestamp(1234);
-        ByteBuffer key = ByteBufferUtil.bytes("key");
-        ThriftValidation.validateColumnData(metaData, key, null, column);
-    }
-
-    @Test
-    public void testValidateKsDef()
-    {
-        KsDef ks_def = new KsDef()
-                            .setName("keyspaceValid")
-                            .setStrategy_class(LocalStrategy.class.getSimpleName());
-
-
-        boolean gotException = false;
-
-        try
-        {
-            ThriftConversion.fromThrift(ks_def).validate();
-        }
-        catch (ConfigurationException e)
-        {
-            gotException = true;
-        }
-
-        assert gotException : "expected ConfigurationException but not received.";
-
-        ks_def.setStrategy_class(LocalStrategy.class.getName());
-
-        gotException = false;
-
-        try
-        {
-            ThriftConversion.fromThrift(ks_def).validate();
-        }
-        catch (ConfigurationException e)
-        {
-            gotException = true;
-        }
-
-        assert gotException : "expected ConfigurationException but not received.";
-
-        ks_def.setStrategy_class(NetworkTopologyStrategy.class.getName());
-
-        gotException = false;
-
-        try
-        {
-            ThriftConversion.fromThrift(ks_def).validate();
-        }
-        catch (ConfigurationException e)
-        {
-            gotException = true;
-        }
-
-        assert !gotException : "got unexpected ConfigurationException";
-    }
-}

diff --git a/test/unit/org/apache/cassandra/tools/SSTableExportTest.java b/test/unit/org/apache/cassandra/tools/SSTableExportTest.java
deleted file mode 100644
index bc73c83..0000000
--- a/test/unit/org/apache/cassandra/tools/SSTableExportTest.java
+++ /dev/null

@@ -1,405 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.tools;
-
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.PrintStream;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.BufferCell;
-import org.apache.cassandra.db.BufferCounterCell;
-import org.apache.cassandra.db.BufferExpiringCell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.AsciiType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableWriter;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.UUIDGen;
-import org.apache.thrift.TException;
-import org.json.simple.JSONArray;
-import org.json.simple.JSONObject;
-import org.json.simple.JSONValue;
-import org.json.simple.parser.ParseException;
-
-import static org.apache.cassandra.Util.column;
-import static org.apache.cassandra.io.sstable.SSTableUtils.tempSSTableFile;
-import static org.apache.cassandra.utils.ByteBufferUtil.bytesToHex;
-import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-public class SSTableExportTest
-{
-    public static final String KEYSPACE1 = "SSTableExportTest";
-    public static final String CF_STANDARD = "Standard1";
-    public static final String CF_COUNTER = "Counter1";
-    public static final String CF_UUID = "UUIDKeys";
-    public static final String CF_VALSWITHQUOTES = "ValuesWithQuotes";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_UUID).keyValidator(UUIDType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_VALSWITHQUOTES).defaultValidator(UTF8Type.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, "AsciiKeys").keyValidator(AsciiType.instance));
-    }
-
-    public String asHex(String str)
-    {
-        return bytesToHex(ByteBufferUtil.bytes(str));
-    }
-
-    @Test
-    public void testEnumeratekeys() throws IOException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE, 0);
-
-        // Add rowA
-        cfamily.addColumn(Util.cellname("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
-        writer.append(Util.dk("rowA"), cfamily);
-        cfamily.clear();
-
-        // Add rowB
-        cfamily.addColumn(Util.cellname("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
-        writer.append(Util.dk("rowB"), cfamily);
-        cfamily.clear();
-
-        writer.finish(true);
-
-        // Enumerate and verify
-        File temp = File.createTempFile("Standard1", ".txt");
-        final Descriptor descriptor = Descriptor.fromFilename(writer.getFilename());
-        SSTableExport.enumeratekeys(descriptor, new PrintStream(temp.getPath()),
-                CFMetaData.sparseCFMetaData(descriptor.ksname, descriptor.cfname, BytesType.instance));
-
-
-        try (FileReader file = new FileReader(temp))
-        {
-            char[] buf = new char[(int) temp.length()];
-            file.read(buf);
-            String output = new String(buf);
-
-            String sep = System.getProperty("line.separator");
-            assert output.equals(asHex("rowA") + sep + asHex("rowB") + sep) : output;
-        }
-    }
-
-    @Test
-    public void testExportSimpleCf() throws IOException, ParseException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE, 0);
-
-        int nowInSec = (int)(System.currentTimeMillis() / 1000) + 42; //live for 42 seconds
-        // Add rowA
-        cfamily.addColumn(Util.cellname("colA"), ByteBufferUtil.bytes("valA"), System.currentTimeMillis());
-        cfamily.addColumn(new BufferExpiringCell(Util.cellname("colExp"), ByteBufferUtil.bytes("valExp"), System.currentTimeMillis(), 42, nowInSec));
-        writer.append(Util.dk("rowA"), cfamily);
-        cfamily.clear();
-
-        // Add rowB
-        cfamily.addColumn(Util.cellname("colB"), ByteBufferUtil.bytes("valB"), System.currentTimeMillis());
-        writer.append(Util.dk("rowB"), cfamily);
-        cfamily.clear();
-
-        // Add rowExclude
-        cfamily.addColumn(Util.cellname("colX"), ByteBufferUtil.bytes("valX"), System.currentTimeMillis());
-        writer.append(Util.dk("rowExclude"), cfamily);
-        cfamily.clear();
-
-        SSTableReader reader = writer.finish(true);
-
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("Standard1", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[]{asHex("rowExclude")});
-
-        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        assertEquals("unexpected number of rows", 2, json.size());
-
-        JSONObject rowA = (JSONObject)json.get(0);
-        assertEquals("unexpected number of keys", 2, rowA.keySet().size());
-        assertEquals("unexpected row key",asHex("rowA"),rowA.get("key"));
-
-        JSONArray colsA = (JSONArray)rowA.get("cells");
-        JSONArray colA = (JSONArray)colsA.get(0);
-        assert hexToBytes((String)colA.get(1)).equals(ByteBufferUtil.bytes("valA"));
-
-        JSONArray colExp = (JSONArray)colsA.get(1);
-        assert ((Long)colExp.get(4)) == 42;
-        assert ((Long)colExp.get(5)) == nowInSec;
-
-        JSONObject rowB = (JSONObject)json.get(1);
-        assertEquals("unexpected number of keys", 2, rowB.keySet().size());
-        assertEquals("unexpected row key",asHex("rowB"),rowB.get("key"));
-
-        JSONArray colsB = (JSONArray)rowB.get("cells");
-        JSONArray colB = (JSONArray)colsB.get(0);
-        assert colB.size() == 3;
-
-    }
-
-    @Test
-    public void testRoundTripStandardCf() throws IOException
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore("Standard1");
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE, 0);
-
-        // Add rowA
-        cfamily.addColumn(Util.cellname("name"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
-        writer.append(Util.dk("rowA"), cfamily);
-        cfamily.clear();
-
-        // Add rowExclude
-        cfamily.addColumn(Util.cellname("name"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
-        writer.append(Util.dk("rowExclude"), cfamily);
-        cfamily.clear();
-
-        SSTableReader reader = writer.finish(true);
-
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("Standard1", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[]{asHex("rowExclude")});
-
-        // Import JSON to another SSTable file
-        File tempSS2 = tempSSTableFile(KEYSPACE1, "Standard1");
-        new SSTableImport().importJson(tempJson.getPath(), KEYSPACE1, "Standard1", tempSS2.getPath());
-
-        reader = SSTableReader.open(Descriptor.fromFilename(tempSS2.getPath()));
-        QueryFilter qf = Util.namesQueryFilter(cfs, Util.dk("rowA"), "name");
-        ColumnFamily cf = qf.getSSTableColumnIterator(reader).getColumnFamily();
-        qf.collateOnDiskAtom(cf, qf.getSSTableColumnIterator(reader), Integer.MIN_VALUE);
-        assertNotNull(cf);
-        assertEquals(hexToBytes("76616c"), cf.getColumn(Util.cellname("name")).value());
-
-        qf = Util.namesQueryFilter(cfs, Util.dk("rowExclude"), "name");
-        cf = qf.getSSTableColumnIterator(reader).getColumnFamily();
-        assert cf == null;
-        reader.selfRef().release();
-    }
-
-    @Test
-    public void testExportCounterCf() throws IOException, ParseException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "Counter1");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Counter1");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE, 0);
-
-        // Add rowA
-        cfamily.addColumn(BufferCounterCell.createLocal(Util.cellname("colA"), 42, System.currentTimeMillis(), Long.MIN_VALUE));
-        writer.append(Util.dk("rowA"), cfamily);
-        cfamily.clear();
-
-        SSTableReader reader = writer.finish(true);
-
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("Counter1", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
-        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        assertEquals("unexpected number of rows", 1, json.size());
-
-        JSONObject row = (JSONObject)json.get(0);
-        assertEquals("unexpected number of keys", 2, row.keySet().size());
-        assertEquals("unexpected row key",asHex("rowA"),row.get("key"));
-
-        JSONArray cols = (JSONArray)row.get("cells");
-        JSONArray colA = (JSONArray)cols.get(0);
-        assert hexToBytes((String)colA.get(0)).equals(ByteBufferUtil.bytes("colA"));
-        assert ((String) colA.get(3)).equals("c");
-        assert (Long) colA.get(4) == Long.MIN_VALUE;
-    }
-
-    @Test
-    public void testEscapingDoubleQuotes() throws IOException, ParseException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "ValuesWithQuotes");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "ValuesWithQuotes");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
-
-        // Add rowA
-        cfamily.addColumn(new BufferCell(Util.cellname("data"), UTF8Type.instance.fromString("{\"foo\":\"bar\"}")));
-        writer.append(Util.dk("rowA"), cfamily);
-        cfamily.clear();
-
-        SSTableReader reader = writer.finish(true);
-
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("ValuesWithQuotes", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
-
-        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        assertEquals("unexpected number of rows", 1, json.size());
-
-        JSONObject row = (JSONObject)json.get(0);
-        assertEquals("unexpected number of keys", 2, row.keySet().size());
-        assertEquals("unexpected row key",asHex("rowA"),row.get("key"));
-
-        JSONArray cols = (JSONArray)row.get("cells");
-        JSONArray colA = (JSONArray)cols.get(0);
-        assert hexToBytes((String)colA.get(0)).equals(ByteBufferUtil.bytes("data"));
-        assert colA.get(1).equals("{\"foo\":\"bar\"}");
-    }
-
-    @Test
-    public void testExportColumnsWithMetadata() throws IOException, ParseException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "Standard1");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
-
-        // Add rowA
-        cfamily.addColumn(Util.cellname("colName"), ByteBufferUtil.bytes("val"), System.currentTimeMillis());
-        cfamily.addColumn(Util.cellname("colName1"), ByteBufferUtil.bytes("val1"), System.currentTimeMillis());
-        cfamily.delete(new DeletionInfo(0, 0));
-        writer.append(Util.dk("rowA"), cfamily);
-
-        SSTableReader reader = writer.finish(true);
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("CFWithDeletionInfo", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
-
-        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        assertEquals("unexpected number of rows", 1, json.size());
-
-        JSONObject row = (JSONObject)json.get(0);
-        assertEquals("unexpected number of keys", 3, row.keySet().size());
-        assertEquals("unexpected row key",asHex("rowA"),row.get("key"));
-
-        // check that the row key is there and present
-        String rowKey = (String) row.get("key");
-        assertNotNull("expecing key to be present", rowKey);
-        assertEquals("key did not match", ByteBufferUtil.bytes("rowA"), hexToBytes(rowKey));
-
-        // check that there is metadata and that it contains deletionInfo
-        JSONObject meta = (JSONObject) row.get("metadata");
-        assertNotNull("expecing metadata to be present", meta);
-
-        assertEquals("unexpected number of metadata entries", 1, meta.keySet().size());
-
-        JSONObject serializedDeletionInfo = (JSONObject) meta.get("deletionInfo");
-        assertNotNull("expecing deletionInfo to be present", serializedDeletionInfo);
-
-        assertEquals(
-                "unexpected serialization format for topLevelDeletion",
-                JSONValue.parse("{\"markedForDeleteAt\":0,\"localDeletionTime\":0}"),
-                serializedDeletionInfo);
-
-        // check the colums are what we put in
-        JSONArray cols = (JSONArray) row.get("cells");
-        assertNotNull("expecing columns to be present", cols);
-        assertEquals("expecting two columns", 2, cols.size());
-
-        JSONArray col1 = (JSONArray) cols.get(0);
-        assertEquals("column name did not match", ByteBufferUtil.bytes("colName"), hexToBytes((String) col1.get(0)));
-        assertEquals("column value did not match", ByteBufferUtil.bytes("val"), hexToBytes((String) col1.get(1)));
-
-        JSONArray col2 = (JSONArray) cols.get(1);
-        assertEquals("column name did not match", ByteBufferUtil.bytes("colName1"), hexToBytes((String) col2.get(0)));
-        assertEquals("column value did not match", ByteBufferUtil.bytes("val1"), hexToBytes((String) col2.get(1)));
-    }
-
-    /**
-     * Tests CASSANDRA-6892 (key aliases being used improperly for validation)
-     */
-    @Test
-    public void testColumnNameEqualToDefaultKeyAlias() throws IOException, ParseException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "UUIDKeys");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "UUIDKeys");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE);
-
-        // Add a row
-        cfamily.addColumn(column(CFMetaData.DEFAULT_KEY_ALIAS, "not a uuid", 1L));
-        writer.append(Util.dk(ByteBufferUtil.bytes(UUIDGen.getTimeUUID())), cfamily);
-
-        SSTableReader reader = writer.finish(true);
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("CFWithColumnNameEqualToDefaultKeyAlias", ".json");
-        SSTableExport.export(reader, new PrintStream(tempJson.getPath()), new String[0]);
-
-        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        assertEquals(1, json.size());
-
-        JSONObject row = (JSONObject)json.get(0);
-        JSONArray cols = (JSONArray) row.get("cells");
-        assertEquals(1, cols.size());
-
-        // check column name and value
-        JSONArray col = (JSONArray) cols.get(0);
-        assertEquals(CFMetaData.DEFAULT_KEY_ALIAS, ByteBufferUtil.string(hexToBytes((String) col.get(0))));
-        assertEquals("not a uuid", ByteBufferUtil.string(hexToBytes((String) col.get(1))));
-    }
-
-    @Test
-    public void testAsciiKeyValidator() throws IOException, ParseException
-    {
-        File tempSS = tempSSTableFile(KEYSPACE1, "AsciiKeys");
-        ColumnFamily cfamily = ArrayBackedSortedColumns.factory.create(KEYSPACE1, "AsciiKeys");
-        SSTableWriter writer = SSTableWriter.create(tempSS.getPath(), 2, ActiveRepairService.UNREPAIRED_SSTABLE, 0);
-
-        // Add a row
-        cfamily.addColumn(column("column", "value", 1L));
-        writer.append(Util.dk("key", AsciiType.instance), cfamily);
-
-        SSTableReader reader = writer.finish(true);
-        // Export to JSON and verify
-        File tempJson = File.createTempFile("CFWithAsciiKeys", ".json");
-        SSTableExport.export(reader,
-                             new PrintStream(tempJson.getPath()),
-                             new String[0]);
-
-        JSONArray json = (JSONArray)JSONValue.parseWithException(new FileReader(tempJson));
-        assertEquals(1, json.size());
-
-        JSONObject row = (JSONObject)json.get(0);
-        // check row key
-        assertEquals("key", row.get("key"));
-    }
-}

diff --git a/test/unit/org/apache/cassandra/tools/SSTableImportTest.java b/test/unit/org/apache/cassandra/tools/SSTableImportTest.java
deleted file mode 100644
index 5eaf154..0000000
--- a/test/unit/org/apache/cassandra/tools/SSTableImportTest.java
+++ /dev/null

@@ -1,286 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
-package org.apache.cassandra.tools;
-
-import static org.junit.Assert.assertEquals;
-import static org.hamcrest.CoreMatchers.is;
-import static org.junit.Assert.assertThat;
-import static org.junit.matchers.JUnitMatchers.hasItem;
-
-import static org.apache.cassandra.io.sstable.SSTableUtils.tempSSTableFile;
-import static org.apache.cassandra.utils.ByteBufferUtil.hexToBytes;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.junit.BeforeClass;
-import org.hamcrest.Description;
-import org.hamcrest.Matcher;
-import org.junit.Test;
-import org.junit.internal.matchers.TypeSafeMatcher;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.Util;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.BufferDeletedCell;
-import org.apache.cassandra.db.Cell;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.CounterCell;
-import org.apache.cassandra.db.DeletionInfo;
-import org.apache.cassandra.db.ExpiringCell;
-import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.cql3.UntypedResultSet.Row;
-import org.apache.cassandra.db.columniterator.OnDiskAtomIterator;
-import org.apache.cassandra.db.filter.QueryFilter;
-import org.apache.cassandra.db.marshal.AsciiType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.locator.SimpleStrategy;
-
-public class SSTableImportTest
-{
-    public static final String KEYSPACE1 = "SSTableImportTest";
-    public static final String CF_STANDARD = "Standard1";
-    public static final String CF_COUNTER = "Counter1";
-    public static final String CQL_TABLE = "table1";
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-        SchemaLoader.prepareServer();
-        SchemaLoader.createKeyspace(KEYSPACE1,
-                                    SimpleStrategy.class,
-                                    KSMetaData.optsWithRF(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER).defaultValidator(CounterColumnType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, "AsciiKeys").keyValidator(AsciiType.instance),
-                                    CFMetaData.compile("CREATE TABLE table1 (k int PRIMARY KEY, v1 text, v2 int)", KEYSPACE1));
-    }
-
-    @Test(expected = IllegalArgumentException.class)
-    public void testImportUnknownCf() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("SimpleCF.json");
-        File tempSS = tempSSTableFile("Keyspace1", "Standard1");
-        new SSTableImport(true).importJson(jsonUrl, "UnknownKeyspace", "UnknownCF", tempSS.getPath());
-    }
-
-    @Test
-    public void testImportSimpleCf() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("SimpleCF.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, "Standard1", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "Standard1", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        ColumnFamily cf = cloneForAdditions(iter);
-        while (iter.hasNext()) cf.addAtom(iter.next());
-        assert cf.getColumn(Util.cellname("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(Util.cellname("colAA")) instanceof BufferDeletedCell);
-        Cell expCol = cf.getColumn(Util.cellname("colAC"));
-        assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringCell;
-        assert ((ExpiringCell)expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
-        reader.selfRef().release();
-    }
-
-    private ColumnFamily cloneForAdditions(OnDiskAtomIterator iter)
-    {
-        return iter.getColumnFamily().cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-    }
-
-    private String resourcePath(String name) throws URISyntaxException
-    {
-        // Naive resource.getPath fails on Windows in many cases, for example if there are spaces in the path
-        // which get encoded as %20 which Windows doesn't like. The trick is to create a URI first, which satisfies all platforms.
-        return new URI(getClass().getClassLoader().getResource(name).toString()).getPath();
-    }
-
-    @Test
-    public void testImportUnsortedMode() throws IOException, URISyntaxException
-    {
-        String jsonUrl = resourcePath("UnsortedCF.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-
-        new SSTableImport().importJson(jsonUrl, KEYSPACE1, "Standard1", tempSS.getPath());
-
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "Standard1", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        ColumnFamily cf = cloneForAdditions(iter);
-        while (iter.hasNext())
-            cf.addAtom(iter.next());
-        assert cf.getColumn(Util.cellname("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(Util.cellname("colAA")) instanceof BufferDeletedCell);
-        Cell expCol = cf.getColumn(Util.cellname("colAC"));
-        assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringCell;
-        assert ((ExpiringCell) expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
-        reader.selfRef().release();
-    }
-
-    @Test
-    public void testImportWithDeletionInfoMetadata() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("SimpleCFWithDeletionInfo.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, "Standard1");
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, "Standard1", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "Standard1", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        ColumnFamily cf = cloneForAdditions(iter);
-        assertEquals(cf.deletionInfo(), new DeletionInfo(0, 0));
-        while (iter.hasNext())
-            cf.addAtom(iter.next());
-        assert cf.getColumn(Util.cellname("colAA")).value().equals(hexToBytes("76616c4141"));
-        assert !(cf.getColumn(Util.cellname("colAA")) instanceof BufferDeletedCell);
-        Cell expCol = cf.getColumn(Util.cellname("colAC"));
-        assert expCol.value().equals(hexToBytes("76616c4143"));
-        assert expCol instanceof ExpiringCell;
-        assert ((ExpiringCell) expCol).getTimeToLive() == 42 && expCol.getLocalDeletionTime() == 2000000000;
-        reader.selfRef().release();
-    }
-
-    @Test
-    public void testImportCounterCf() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("CounterCF.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, "Counter1");
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, "Counter1", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "Counter1", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        ColumnFamily cf = cloneForAdditions(iter);
-        while (iter.hasNext()) cf.addAtom(iter.next());
-        Cell c = cf.getColumn(Util.cellname("colAA"));
-        assert c instanceof CounterCell : c;
-        assert ((CounterCell) c).total() == 42;
-        reader.selfRef().release();
-    }
-
-    @Test
-    public void testImportWithAsciiKeyValidator() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("SimpleCF.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, "AsciiKeys");
-        System.setProperty("skip.key.validator", "false");
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, "AsciiKeys", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        // check that keys are treated as ascii
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("726f7741", AsciiType.instance), "AsciiKeys", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        assert iter.hasNext(); // "ascii" key exists
-        QueryFilter qf2 = QueryFilter.getIdentityFilter(Util.dk("726f7741", BytesType.instance), "AsciiKeys", System.currentTimeMillis());
-        OnDiskAtomIterator iter2 = qf2.getSSTableColumnIterator(reader);
-        assert !iter2.hasNext(); // "bytes" key does not exist
-        reader.selfRef().release();
-    }
-
-    @Test
-    public void testBackwardCompatibilityOfImportWithAsciiKeyValidator() throws IOException, URISyntaxException
-    {
-        // Import JSON to temp SSTable file
-        String jsonUrl = resourcePath("SimpleCF.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, "AsciiKeys");
-        // To ignore current key validator
-        System.setProperty("skip.key.validator", "true");
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, "AsciiKeys", tempSS.getPath());
-
-        // Verify results
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        // check that keys are treated as bytes
-        QueryFilter qf = QueryFilter.getIdentityFilter(Util.dk("rowA"), "AsciiKeys", System.currentTimeMillis());
-        OnDiskAtomIterator iter = qf.getSSTableColumnIterator(reader);
-        assert iter.hasNext(); // "bytes" key exists
-        reader.selfRef().release();
-    }
-    
-    @Test
-    /* 
-     *  The schema is 
-     *      CREATE TABLE cql_keyspace.table1 (k int PRIMARY KEY, v1 text, v2 int)
-     * */
-    public void shouldImportCqlTable() throws IOException, URISyntaxException
-    {
-        String jsonUrl = resourcePath("CQLTable.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, CQL_TABLE);
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, CQL_TABLE, tempSS.getPath());
-        SSTableReader reader = SSTableReader.open(Descriptor.fromFilename(tempSS.getPath()));
-        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CQL_TABLE).addSSTable(reader);
-        
-        UntypedResultSet result = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM \"%s\".%s", KEYSPACE1, CQL_TABLE));
-        assertThat(result.size(), is(2));
-        assertThat(result, hasItem(withElements(1, "NY", 1980)));
-        assertThat(result, hasItem(withElements(2, "CA", 2014)));
-        reader.selfRef().release();
-    }
-
-    @Test(expected=AssertionError.class)
-    public void shouldRejectEmptyCellNamesForNonCqlTables() throws IOException, URISyntaxException
-    {
-        String jsonUrl = resourcePath("CQLTable.json");
-        File tempSS = tempSSTableFile(KEYSPACE1, CF_COUNTER);
-        new SSTableImport(true).importJson(jsonUrl, KEYSPACE1, CF_COUNTER, tempSS.getPath());
-    }
-    
-    private static Matcher<UntypedResultSet.Row> withElements(final int key, final String v1, final int v2) {
-        return new TypeSafeMatcher<UntypedResultSet.Row>()
-        {
-            @Override
-            public boolean matchesSafely(Row input)
-            {
-                if (!input.has("k") || !input.has("v1") || !input.has("v2"))
-                    return false;
-                return input.getInt("k") == key
-                        && input.getString("v1").equals(v1)
-                        && input.getInt("v2") == v2;
-            }
-
-            @Override
-            public void describeTo(Description description)
-            {
-                description.appendText(String.format("a row containing: %s, %s, %s", key, v1, v2));
-            }
-        };
-        
-    }
-}

diff --git a/test/unit/org/apache/cassandra/transport/DynamicLimitTest.java b/test/unit/org/apache/cassandra/transport/DynamicLimitTest.java
new file mode 100644
index 0000000..83a0dd9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/transport/DynamicLimitTest.java

@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.transport;
+
+import java.net.InetAddress;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+
+import static org.apache.cassandra.transport.ProtocolTestHelper.cleanupPeers;
+import static org.apache.cassandra.transport.ProtocolTestHelper.setStaticLimitInConfig;
+import static org.apache.cassandra.transport.ProtocolTestHelper.setupPeer;
+import static org.apache.cassandra.transport.ProtocolTestHelper.updatePeerInfo;
+import static org.junit.Assert.assertEquals;
+
+public class DynamicLimitTest
+{
+    @BeforeClass
+    public static void setup()
+    {
+        CQLTester.prepareServer();
+    }
+
+    @Test
+    public void disableDynamicLimitWithSystemProperty() throws Throwable
+    {
+        // Dynamic limiting of the max negotiable protocol version can be
+        // disabled with a system property
+
+        // ensure that no static limit is configured
+        setStaticLimitInConfig(null);
+
+        // set the property which disables dynamic limiting
+        System.setProperty(ConfiguredLimit.DISABLE_MAX_PROTOCOL_AUTO_OVERRIDE, "true");
+        // insert a legacy peer into system.peers and also
+        InetAddress peer = null;
+        try
+        {
+            peer = setupPeer("127.1.0.1", "2.2.0");
+            ConfiguredLimit limit = ConfiguredLimit.newLimit();
+            assertEquals(Server.CURRENT_VERSION, limit.getMaxVersion());
+
+            // clearing the property after the limit has been returned has no effect
+            System.clearProperty(ConfiguredLimit.DISABLE_MAX_PROTOCOL_AUTO_OVERRIDE);
+            limit.updateMaxSupportedVersion();
+            assertEquals(Server.CURRENT_VERSION, limit.getMaxVersion());
+
+            // a new limit should now be dynamic
+            limit = ConfiguredLimit.newLimit();
+            assertEquals(Server.VERSION_3, limit.getMaxVersion());
+        }
+        finally
+        {
+            System.clearProperty(ConfiguredLimit.DISABLE_MAX_PROTOCOL_AUTO_OVERRIDE);
+            cleanupPeers(peer);
+        }
+    }
+
+    @Test
+    public void disallowLoweringMaxVersion() throws Throwable
+    {
+        // Lowering the max version once connections have been established is a problem
+        // for some clients. So for a dynamic limit, if notifications of peer versions
+        // trigger a change to the max version, it's only allowed to increase the max
+        // negotiable version
+
+        InetAddress peer = null;
+        try
+        {
+            // ensure that no static limit is configured
+            setStaticLimitInConfig(null);
+            ConfiguredLimit limit = ConfiguredLimit.newLimit();
+            assertEquals(Server.CURRENT_VERSION, limit.getMaxVersion());
+
+            peer = setupPeer("127.1.0.1", "3.0.0");
+            limit.updateMaxSupportedVersion();
+            assertEquals(Server.CURRENT_VERSION, limit.getMaxVersion());
+
+            // learn that peer doesn't actually fully support V4, behaviour should remain the same
+            updatePeerInfo(peer, "2.2.0");
+            limit.updateMaxSupportedVersion();
+            assertEquals(Server.CURRENT_VERSION, limit.getMaxVersion());
+
+            // finally learn that peer2 has been upgraded, just for completeness
+            updatePeerInfo(peer, "3.3.0");
+            limit.updateMaxSupportedVersion();
+            assertEquals(Server.CURRENT_VERSION, limit.getMaxVersion());
+
+        } finally {
+            cleanupPeers(peer);
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/transport/InflightRequestPayloadTrackerTest.java b/test/unit/org/apache/cassandra/transport/InflightRequestPayloadTrackerTest.java
new file mode 100644
index 0000000..e4d335b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/transport/InflightRequestPayloadTrackerTest.java

@@ -0,0 +1,248 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.transport;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.EncryptionOptions;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.exceptions.OverloadedException;
+import org.apache.cassandra.transport.messages.QueryMessage;
+
+@RunWith(OrderedJUnit4ClassRunner.class)
+public class InflightRequestPayloadTrackerTest extends CQLTester
+{
+    @BeforeClass
+    public static void setUp()
+    {
+        DatabaseDescriptor.setNativeTransportMaxConcurrentRequestsInBytesPerIp(600);
+        DatabaseDescriptor.setNativeTransportMaxConcurrentRequestsInBytes(600);
+        requireNetwork();
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.setNativeTransportMaxConcurrentRequestsInBytesPerIp(3000000000L);
+        DatabaseDescriptor.setNativeTransportMaxConcurrentRequestsInBytes(5000000000L);
+    }
+
+    @After
+    public void dropCreatedTable()
+    {
+        try
+        {
+            QueryProcessor.executeOnceInternal("DROP TABLE " + KEYSPACE + ".atable");
+        }
+        catch (Throwable t)
+        {
+            // ignore
+        }
+    }
+
+    @Test
+    public void testQueryExecutionWithThrowOnOverload() throws Throwable
+    {
+        SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(),
+                                               nativePort,
+                                               Server.CURRENT_VERSION,
+                                               new EncryptionOptions.ClientEncryptionOptions());
+
+        try
+        {
+            client.connect(false, true);
+            QueryOptions queryOptions = QueryOptions.create(
+            QueryOptions.DEFAULT.getConsistency(),
+            QueryOptions.DEFAULT.getValues(),
+            QueryOptions.DEFAULT.skipMetadata(),
+            QueryOptions.DEFAULT.getPageSize(),
+            QueryOptions.DEFAULT.getPagingState(),
+            QueryOptions.DEFAULT.getSerialConsistency(),
+            Server.CURRENT_VERSION);
+
+            QueryMessage queryMessage = new QueryMessage(String.format("CREATE TABLE %s.atable (pk1 int PRIMARY KEY, v text)", KEYSPACE),
+                                                         queryOptions);
+            client.execute(queryMessage);
+        }
+        finally
+        {
+            client.close();
+        }
+    }
+
+    @Test
+    public void testQueryExecutionWithoutThrowOnOverload() throws Throwable
+    {
+        SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(),
+                                               nativePort,
+                                               Server.CURRENT_VERSION,
+                                               new EncryptionOptions.ClientEncryptionOptions());
+
+        try
+        {
+            client.connect(false, false);
+            QueryOptions queryOptions = QueryOptions.create(
+            QueryOptions.DEFAULT.getConsistency(),
+            QueryOptions.DEFAULT.getValues(),
+            QueryOptions.DEFAULT.skipMetadata(),
+            QueryOptions.DEFAULT.getPageSize(),
+            QueryOptions.DEFAULT.getPagingState(),
+            QueryOptions.DEFAULT.getSerialConsistency(),
+            Server.CURRENT_VERSION);
+
+            QueryMessage queryMessage = new QueryMessage(String.format("CREATE TABLE %s.atable (pk int PRIMARY KEY, v text)", KEYSPACE),
+                                                         queryOptions);
+            client.execute(queryMessage);
+            queryMessage = new QueryMessage(String.format("SELECT * FROM %s.atable", KEYSPACE),
+                                            queryOptions);
+            client.execute(queryMessage);
+        }
+        finally
+        {
+            client.close();
+        }
+    }
+
+    @Test
+    public void testQueryExecutionWithoutThrowOnOverloadAndInflightLimitedExceeded() throws Throwable
+    {
+        SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(),
+                                               nativePort,
+                                               Server.CURRENT_VERSION,
+                                               new EncryptionOptions.ClientEncryptionOptions());
+
+        try
+        {
+            client.connect(false, false);
+            QueryOptions queryOptions = QueryOptions.create(
+            QueryOptions.DEFAULT.getConsistency(),
+            QueryOptions.DEFAULT.getValues(),
+            QueryOptions.DEFAULT.skipMetadata(),
+            QueryOptions.DEFAULT.getPageSize(),
+            QueryOptions.DEFAULT.getPagingState(),
+            QueryOptions.DEFAULT.getSerialConsistency(),
+            Server.CURRENT_VERSION);
+
+            QueryMessage queryMessage = new QueryMessage(String.format("CREATE TABLE %s.atable (pk int PRIMARY KEY, v text)", KEYSPACE),
+                                                         queryOptions);
+            client.execute(queryMessage);
+
+            queryMessage = new QueryMessage(String.format("INSERT INTO %s.atable (pk, v) VALUES (1, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')", KEYSPACE),
+                                            queryOptions);
+            client.execute(queryMessage);
+        }
+        finally
+        {
+            client.close();
+        }
+    }
+
+    @Test
+    public void testOverloadedExceptionForEndpointInflightLimit() throws Throwable
+    {
+        SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(),
+                                               nativePort,
+                                               Server.CURRENT_VERSION,
+                                               new EncryptionOptions.ClientEncryptionOptions());
+
+        try
+        {
+            client.connect(false, true);
+            QueryOptions queryOptions = QueryOptions.create(
+            QueryOptions.DEFAULT.getConsistency(),
+            QueryOptions.DEFAULT.getValues(),
+            QueryOptions.DEFAULT.skipMetadata(),
+            QueryOptions.DEFAULT.getPageSize(),
+            QueryOptions.DEFAULT.getPagingState(),
+            QueryOptions.DEFAULT.getSerialConsistency(),
+            Server.CURRENT_VERSION);
+
+            QueryMessage queryMessage = new QueryMessage(String.format("CREATE TABLE %s.atable (pk int PRIMARY KEY, v text)", KEYSPACE),
+                                                         queryOptions);
+            client.execute(queryMessage);
+
+            queryMessage = new QueryMessage(String.format("INSERT INTO %s.atable (pk, v) VALUES (1, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')", KEYSPACE),
+                                            queryOptions);
+            try
+            {
+                client.execute(queryMessage);
+                Assert.fail();
+            }
+            catch (RuntimeException e)
+            {
+                Assert.assertTrue(e.getCause() instanceof OverloadedException);
+            }
+        }
+        finally
+        {
+            client.close();
+        }
+    }
+
+    @Test
+    public void testOverloadedExceptionForOverallInflightLimit() throws Throwable
+    {
+        SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(),
+                                               nativePort,
+                                               Server.CURRENT_VERSION,
+                                               new EncryptionOptions.ClientEncryptionOptions());
+
+        try
+        {
+            client.connect(false, true);
+            QueryOptions queryOptions = QueryOptions.create(
+            QueryOptions.DEFAULT.getConsistency(),
+            QueryOptions.DEFAULT.getValues(),
+            QueryOptions.DEFAULT.skipMetadata(),
+            QueryOptions.DEFAULT.getPageSize(),
+            QueryOptions.DEFAULT.getPagingState(),
+            QueryOptions.DEFAULT.getSerialConsistency(),
+            Server.CURRENT_VERSION);
+
+            QueryMessage queryMessage = new QueryMessage(String.format("CREATE TABLE %s.atable (pk int PRIMARY KEY, v text)", KEYSPACE),
+                                                         queryOptions);
+            client.execute(queryMessage);
+
+            queryMessage = new QueryMessage(String.format("INSERT INTO %s.atable (pk, v) VALUES (1, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')", KEYSPACE),
+                                            queryOptions);
+            try
+            {
+                client.execute(queryMessage);
+                Assert.fail();
+            }
+            catch (RuntimeException e)
+            {
+                Assert.assertTrue(e.getCause() instanceof OverloadedException);
+            }
+        }
+        finally
+        {
+            client.close();
+        }
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java b/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java
index 1dd3c5d..865a173 100644
--- a/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java
+++ b/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java

@@ -65,8 +65,6 @@
     {
         try
         {
-            DatabaseDescriptor.setPartitioner(ByteOrderedPartitioner.instance);
-
             cqlQueryHandlerField = ClientState.class.getDeclaredField("cqlQueryHandler");
             cqlQueryHandlerField.setAccessible(true);
 

diff --git a/test/unit/org/apache/cassandra/transport/ProtocolErrorTest.java b/test/unit/org/apache/cassandra/transport/ProtocolErrorTest.java
index fc8c41c..e212c4c 100644
--- a/test/unit/org/apache/cassandra/transport/ProtocolErrorTest.java
+++ b/test/unit/org/apache/cassandra/transport/ProtocolErrorTest.java

@@ -20,6 +20,7 @@
 import io.netty.buffer.ByteBuf;
 import io.netty.buffer.Unpooled;
 import org.apache.cassandra.transport.messages.ErrorMessage;
+
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -33,14 +34,22 @@
     @Test
     public void testInvalidProtocolVersion() throws Exception
     {
-        Frame.Decoder dec = new Frame.Decoder(null);
+        // test using a protocol version higher than the current version
+        testInvalidProtocolVersion(Server.CURRENT_VERSION + 1);
+     // test using a protocol version lower than the lowest version
+        testInvalidProtocolVersion(Server.MIN_SUPPORTED_VERSION - 1);
+
+    }
+
+    public void testInvalidProtocolVersion(int version) throws Exception
+    {
+        Frame.Decoder dec = new Frame.Decoder(null, ProtocolVersionLimit.SERVER_DEFAULT);
 
         List<Object> results = new ArrayList<>();
-        // should generate a protocol exception for using a protocol version higher than the current version
         byte[] frame = new byte[] {
-                (byte) ((Server.CURRENT_VERSION + 1) & Frame.PROTOCOL_VERSION_MASK),  // direction & version
+                (byte) REQUEST.addToVersion(version),  // direction & version
                 0x00,  // flags
-                0x01,  // stream ID
+                0x00, 0x01,  // stream ID
                 0x09,  // opcode
                 0x00, 0x00, 0x00, 0x21,  // body length
                 0x00, 0x00, 0x00, 0x1b, 0x00, 0x1b, 0x53, 0x45,
@@ -59,17 +68,40 @@
     }
 
     @Test
+    public void testInvalidProtocolVersionShortFrame() throws Exception
+    {
+        // test for CASSANDRA-11464
+        Frame.Decoder dec = new Frame.Decoder(null, ProtocolVersionLimit.SERVER_DEFAULT);
+
+        List<Object> results = new ArrayList<>();
+        byte[] frame = new byte[] {
+                (byte) REQUEST.addToVersion(1),  // direction & version
+                0x00,  // flags
+                0x01,  // stream ID
+                0x09,  // opcode
+                0x00, 0x00, 0x00, 0x21,  // body length
+        };
+        ByteBuf buf = Unpooled.wrappedBuffer(frame);
+        try {
+            dec.decode(null, buf, results);
+            Assert.fail("Expected protocol error");
+        } catch (ProtocolException e) {
+            Assert.assertTrue(e.getMessage().contains("Invalid or unsupported protocol version"));
+        }
+    }
+
+    @Test
     public void testInvalidDirection() throws Exception
     {
-        Frame.Decoder dec = new Frame.Decoder(null);
+        Frame.Decoder dec = new Frame.Decoder(null, ProtocolVersionLimit.SERVER_DEFAULT);
 
         List<Object> results = new ArrayList<>();
         // should generate a protocol exception for using a response frame with
         // a prepare op, ensure that it comes back with stream ID 1
         byte[] frame = new byte[] {
-                (byte) RESPONSE.addToVersion(Server.VERSION_2),  // direction & version
+                (byte) RESPONSE.addToVersion(Server.CURRENT_VERSION),  // direction & version
                 0x00,  // flags
-                0x01,  // stream ID
+                0x00, 0x01,  // stream ID
                 0x09,  // opcode
                 0x00, 0x00, 0x00, 0x21,  // body length
                 0x00, 0x00, 0x00, 0x1b, 0x00, 0x1b, 0x53, 0x45,
@@ -92,13 +124,13 @@
     @Test
     public void testBodyLengthOverLimit() throws Exception
     {
-        Frame.Decoder dec = new Frame.Decoder(null);
+        Frame.Decoder dec = new Frame.Decoder(null, ProtocolVersionLimit.SERVER_DEFAULT);
 
         List<Object> results = new ArrayList<>();
         byte[] frame = new byte[] {
-                (byte) REQUEST.addToVersion(Server.VERSION_2),  // direction & version
+                (byte) REQUEST.addToVersion(Server.CURRENT_VERSION),  // direction & version
                 0x00,  // flags
-                0x01,  // stream ID
+                0x00, 0x01,  // stream ID
                 0x09,  // opcode
                 0x10, (byte) 0x00, (byte) 0x00, (byte) 0x00,  // body length
         };

diff --git a/test/unit/org/apache/cassandra/transport/ProtocolNegotiationTest.java b/test/unit/org/apache/cassandra/transport/ProtocolNegotiationTest.java
new file mode 100644
index 0000000..91c1d6a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/transport/ProtocolNegotiationTest.java

@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.transport;
+
+import java.net.InetAddress;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.ProtocolVersion;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+
+import static org.apache.cassandra.transport.ProtocolTestHelper.cleanupPeers;
+import static org.apache.cassandra.transport.ProtocolTestHelper.setStaticLimitInConfig;
+import static org.apache.cassandra.transport.ProtocolTestHelper.setupPeer;
+import static org.apache.cassandra.transport.ProtocolTestHelper.updatePeerInfo;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class ProtocolNegotiationTest extends CQLTester
+{
+    // to avoid JMX naming clashes between cluster metrics
+    private int clusterId = 0;
+
+    @BeforeClass
+    public static void setup()
+    {
+        prepareNetwork();
+    }
+
+    @Before
+    public void clearConfig()
+    {
+        setStaticLimitInConfig(null);
+    }
+
+    @Test
+    public void serverSupportsV3AndV4ByDefault() throws Throwable
+    {
+        reinitializeNetwork();
+        // client can explicitly request either V3 or V4
+        testConnection(ProtocolVersion.V3, ProtocolVersion.V3);
+        testConnection(ProtocolVersion.V4, ProtocolVersion.V4);
+
+        // if not specified, V4 is the default
+        testConnection(null, ProtocolVersion.V4);
+    }
+
+    @Test
+    public void testStaticLimit() throws Throwable
+    {
+        try
+        {
+            reinitializeNetwork();
+            // No limit enforced to start
+            assertEquals(Integer.MIN_VALUE, DatabaseDescriptor.getNativeProtocolMaxVersionOverride());
+            testConnection(null, ProtocolVersion.V4);
+
+            // Update DatabaseDescriptor, then re-initialise the server to force it to read it
+            setStaticLimitInConfig(ProtocolVersion.V3.toInt());
+            reinitializeNetwork();
+            assertEquals(3, DatabaseDescriptor.getNativeProtocolMaxVersionOverride());
+            testConnection(ProtocolVersion.V4, ProtocolVersion.V3);
+            testConnection(ProtocolVersion.V3, ProtocolVersion.V3);
+            testConnection(null, ProtocolVersion.V3);
+        } finally {
+            setStaticLimitInConfig(null);
+        }
+    }
+
+    @Test
+    public void testDynamicLimit() throws Throwable
+    {
+        InetAddress peer1 = setupPeer("127.1.0.1", "2.2.0");
+        InetAddress peer2 = setupPeer("127.1.0.2", "2.2.0");
+        InetAddress peer3 = setupPeer("127.1.0.3", "2.2.0");
+        reinitializeNetwork();
+        try
+        {
+            // legacy peers means max negotiable version is V3
+            testConnection(ProtocolVersion.V4, ProtocolVersion.V3);
+            testConnection(ProtocolVersion.V3, ProtocolVersion.V3);
+            testConnection(null, ProtocolVersion.V3);
+
+            // receive notification that 2 peers have upgraded to a version that fully supports V4
+            updatePeerInfo(peer1, "3.0.0");
+            updatePeerInfo(peer2, "3.0.0");
+            updateMaxNegotiableProtocolVersion();
+            // version should still be capped
+            testConnection(ProtocolVersion.V4, ProtocolVersion.V3);
+            testConnection(ProtocolVersion.V3, ProtocolVersion.V3);
+            testConnection(null, ProtocolVersion.V3);
+
+            // no legacy peers so V4 is negotiable
+            // after the last peer upgrades, cap should be lifted
+            updatePeerInfo(peer3, "3.0.0");
+            updateMaxNegotiableProtocolVersion();
+            testConnection(ProtocolVersion.V4, ProtocolVersion.V4);
+            testConnection(ProtocolVersion.V3, ProtocolVersion.V3);
+            testConnection(null, ProtocolVersion.V4);
+        } finally {
+            cleanupPeers(peer1, peer2, peer3);
+        }
+    }
+
+    private void testConnection(com.datastax.driver.core.ProtocolVersion requestedVersion,
+                                com.datastax.driver.core.ProtocolVersion expectedVersion)
+    {
+        long start = System.nanoTime();
+        boolean expectError = requestedVersion != null && requestedVersion != expectedVersion;
+        Cluster.Builder builder = Cluster.builder()
+                                         .addContactPoints(nativeAddr)
+                                         .withClusterName("Test Cluster" + clusterId++)
+                                         .withPort(nativePort);
+
+        if (requestedVersion != null)
+            builder = builder.withProtocolVersion(requestedVersion) ;
+
+        Cluster cluster = builder.build();
+        logger.info("Setting up cluster took {}ms", TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS));
+        start = System.nanoTime();
+        try {
+            cluster.connect();
+            if (expectError)
+                fail("Expected a protocol exception");
+        }
+        catch (Exception e)
+        {
+            if (!expectError)
+            {
+                e.printStackTrace();
+                fail("Did not expect any exception");
+            }
+
+            assertTrue(e.getMessage().contains(String.format("Host does not support protocol version %s but %s", requestedVersion, expectedVersion)));
+        } finally {
+            logger.info("Testing connection took {}ms", TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS));
+            start = System.nanoTime();
+            cluster.closeAsync();
+            logger.info("Tearing down cluster connection took {}ms", TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS));
+
+        }
+    }
+
+}

diff --git a/test/unit/org/apache/cassandra/transport/ProtocolTestHelper.java b/test/unit/org/apache/cassandra/transport/ProtocolTestHelper.java
new file mode 100644
index 0000000..90a2801
--- /dev/null
+++ b/test/unit/org/apache/cassandra/transport/ProtocolTestHelper.java

@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.transport;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.concurrent.ExecutorService;
+
+import com.google.common.util.concurrent.MoreExecutors;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.gms.VersionedValue;
+import org.apache.cassandra.utils.FBUtilities;
+
+public class ProtocolTestHelper
+{
+    static ExecutorService executor = MoreExecutors.newDirectExecutorService();
+    static InetAddress setupPeer(String address, String version) throws Throwable
+    {
+        InetAddress peer = peer(address);
+        updatePeerInfo(peer, version);
+        return peer;
+    }
+
+    static void updatePeerInfo(InetAddress peer, String version) throws Throwable
+    {
+        SystemKeyspace.updatePeerInfo(peer, "release_version", version, executor);
+    }
+
+    static InetAddress peer(String address)
+    {
+        try
+        {
+            return InetAddress.getByName(address);
+        }
+        catch (UnknownHostException e)
+        {
+            throw new RuntimeException("Error creating peer", e);
+        }
+    }
+
+    static void cleanupPeers(InetAddress...peers) throws Throwable
+    {
+        for (InetAddress peer : peers)
+            if (peer != null)
+                SystemKeyspace.removeEndpoint(peer);
+    }
+
+    static void setStaticLimitInConfig(Integer version)
+    {
+        try
+        {
+            Field field = FBUtilities.getProtectedField(DatabaseDescriptor.class, "conf");
+            ((Config)field.get(null)).native_transport_max_negotiable_protocol_version = version == null ? Integer.MIN_VALUE : version;
+        }
+        catch (IllegalAccessException e)
+        {
+            throw new RuntimeException("Error setting native_transport_max_protocol_version on Config", e);
+        }
+    }
+
+    static VersionedValue releaseVersion(String versionString)
+    {
+        try
+        {
+            Constructor<VersionedValue> ctor = VersionedValue.class.getDeclaredConstructor(String.class);
+            ctor.setAccessible(true);
+            return ctor.newInstance(versionString);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException("Error constructing VersionedValue for release version", e);
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
index 352327e..fdb346e 100644
--- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java

@@ -25,7 +25,6 @@
 
 import org.junit.Test;
 import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.transport.Event.TopologyChange;
@@ -46,8 +45,8 @@
     @Test
     public void collectionSerDeserTest() throws Exception
     {
-        collectionSerDeserTest(2);
         collectionSerDeserTest(3);
+        collectionSerDeserTest(4);
     }
 
     public void collectionSerDeserTest(int version) throws Exception
@@ -93,7 +92,6 @@
     @Test
     public void eventSerDeserTest() throws Exception
     {
-        eventSerDeserTest(2);
         eventSerDeserTest(3);
         eventSerDeserTest(4);
     }
@@ -173,8 +171,8 @@
     @Test
     public void udtSerDeserTest() throws Exception
     {
-        udtSerDeserTest(2);
         udtSerDeserTest(3);
+        udtSerDeserTest(4);
     }
 
     public void udtSerDeserTest(int version) throws Exception
@@ -200,10 +198,6 @@
         Term t = u.prepare("ks", columnSpec("myValue", udt));
 
         QueryOptions options = QueryOptions.DEFAULT;
-        if (version == 2)
-            options = QueryOptions.fromProtocolV2(ConsistencyLevel.ONE, Collections.<ByteBuffer>emptyList());
-        else if (version != 3)
-            throw new AssertionError("Invalid protocol version for test");
 
         ByteBuffer serialized = t.bindAndGet(options);
 

diff --git a/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java b/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java
index 3d505c8..d3c6961 100644
--- a/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java
+++ b/test/unit/org/apache/cassandra/triggers/TriggerExecutorTest.java

@@ -17,248 +17,286 @@
  */
 package org.apache.cassandra.triggers;
 
-import java.nio.ByteBuffer;
 import java.util.*;
+
 import org.junit.Test;
 
+import org.apache.cassandra.Util;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.ColumnDefinition;
-import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.composites.CellName;
-import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-
-import static org.junit.Assert.*;
+import org.apache.cassandra.schema.TriggerMetadata;
+import org.apache.cassandra.utils.FBUtilities;
 
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 
 public class TriggerExecutorTest
 {
     @Test
     public void sameKeySameCfColumnFamilies() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeySameCfTrigger.class.getName()));
-        ColumnFamily mutated = TriggerExecutor.instance.execute(bytes("k1"), makeCf(metadata, "v1", null));
-        assertEquals(bytes("v1"), mutated.getColumn(getColumnName(metadata, "c1")).value());
-        assertEquals(bytes("trigger"), mutated.getColumn(getColumnName(metadata, "c2")).value());
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", SameKeySameCfTrigger.class.getName()));
+        // origin column 'c1' = "v1", augment extra column 'c2' = "trigger"
+        PartitionUpdate mutated = TriggerExecutor.instance.execute(makeCf(metadata, "k1", "v1", null));
+
+        List<Row> rows = new ArrayList<>();
+        try (RowIterator iterator = UnfilteredRowIterators.filter(mutated.unfilteredIterator(),
+                                                                  FBUtilities.nowInSeconds()))
+        {
+            iterator.forEachRemaining(rows::add);
+        }
+
+        // only 1 row
+        assertEquals(1, rows.size());
+
+        List<Cell> cells = new ArrayList<>();
+        rows.get(0).cells().forEach(cells::add);
+
+        // 2 columns
+        assertEquals(2, cells.size());
+
+        // check column 'c1'
+        assertEquals(bytes("v1"), cells.get(0).value());
+        // check column 'c2'
+        assertEquals(bytes("trigger"), cells.get(1).value());
     }
 
     @Test(expected = InvalidRequestException.class)
     public void sameKeyDifferentCfColumnFamilies() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeyDifferentCfTrigger.class.getName()));
-        TriggerExecutor.instance.execute(bytes("k1"), makeCf(metadata, "v1", null));
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", SameKeyDifferentCfTrigger.class.getName()));
+        TriggerExecutor.instance.execute(makeCf(metadata, "k1", "v1", null));
     }
 
     @Test(expected = InvalidRequestException.class)
     public void differentKeyColumnFamilies() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", DifferentKeyTrigger.class.getName()));
-        TriggerExecutor.instance.execute(bytes("k1"), makeCf(metadata, "v1", null));
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", DifferentKeyTrigger.class.getName()));
+        TriggerExecutor.instance.execute(makeCf(metadata, "k1", "v1", null));
     }
 
     @Test
     public void noTriggerMutations() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", NoOpTrigger.class.getName()));
-        Mutation rm = new Mutation(bytes("k1"), makeCf(metadata, "v1", null));
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", NoOpTrigger.class.getName()));
+        Mutation rm = new Mutation(makeCf(metadata, "k1", "v1", null));
         assertNull(TriggerExecutor.instance.execute(Collections.singletonList(rm)));
     }
 
     @Test
     public void sameKeySameCfRowMutations() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeySameCfTrigger.class.getName()));
-        ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
-        ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        Mutation rm1 = new Mutation(bytes("k1"), cf1);
-        Mutation rm2 = new Mutation(bytes("k2"), cf2);
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", SameKeySameCfTrigger.class.getName()));
+        PartitionUpdate cf1 = makeCf(metadata, "k1", "k1v1", null);
+        PartitionUpdate cf2 = makeCf(metadata, "k2", "k2v1", null);
+        Mutation rm1 = new Mutation("ks1", cf1.partitionKey()).add(cf1);
+        Mutation rm2 = new Mutation("ks1", cf2.partitionKey()).add(cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(2, tmutations.size());
         Collections.sort(tmutations, new RmComparator());
 
-        List<ColumnFamily> mutatedCFs = new ArrayList<>(tmutations.get(0).getColumnFamilies());
+        List<PartitionUpdate> mutatedCFs = new ArrayList<>(tmutations.get(0).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("k1v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertEquals(bytes("trigger"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")).value());
+        Row row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("k1v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
 
-        mutatedCFs = new ArrayList<>(tmutations.get(1).getColumnFamilies());
+        mutatedCFs = new ArrayList<>(tmutations.get(1).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("k2v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertEquals(bytes("trigger"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")).value());
+        row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("k2v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
     }
 
     @Test
     public void sameKeySameCfPartialRowMutations() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeySameCfPartialTrigger.class.getName()));
-        ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
-        ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        Mutation rm1 = new Mutation(bytes("k1"), cf1);
-        Mutation rm2 = new Mutation(bytes("k2"), cf2);
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", SameKeySameCfPartialTrigger.class.getName()));
+        PartitionUpdate cf1 = makeCf(metadata, "k1", "k1v1", null);
+        PartitionUpdate cf2 = makeCf(metadata, "k2", "k2v1", null);
+        Mutation rm1 = new Mutation("ks1", cf1.partitionKey()).add(cf1);
+        Mutation rm2 = new Mutation("ks1", cf2.partitionKey()).add(cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(2, tmutations.size());
         Collections.sort(tmutations, new RmComparator());
 
-        List<ColumnFamily> mutatedCFs = new ArrayList<>(tmutations.get(0).getColumnFamilies());
+        List<PartitionUpdate> mutatedCFs = new ArrayList<>(tmutations.get(0).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("k1v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")));
+        Row row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("k1v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c2"))));
 
-        mutatedCFs = new ArrayList<>(tmutations.get(1).getColumnFamilies());
+        mutatedCFs = new ArrayList<>(tmutations.get(1).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("k2v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertEquals(bytes("trigger"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")).value());
+        row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("k2v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
     }
 
     @Test
     public void sameKeyDifferentCfRowMutations() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeyDifferentCfTrigger.class.getName()));
-        ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
-        ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        Mutation rm1 = new Mutation(bytes("k1"), cf1);
-        Mutation rm2 = new Mutation(bytes("k2"), cf2);
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", SameKeyDifferentCfTrigger.class.getName()));
+        PartitionUpdate cf1 = makeCf(metadata, "k1", "k1v1", null);
+        PartitionUpdate cf2 = makeCf(metadata, "k2", "k2v1", null);
+        Mutation rm1 = new Mutation("ks1", cf1.partitionKey()).add(cf1);
+        Mutation rm2 = new Mutation("ks1", cf2.partitionKey()).add(cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(2, tmutations.size());
         Collections.sort(tmutations, new RmComparator());
 
-        List<ColumnFamily> mutatedCFs = new ArrayList<>(tmutations.get(0).getColumnFamilies());
+        List<PartitionUpdate> mutatedCFs = new ArrayList<>(tmutations.get(0).getPartitionUpdates());
+        assertEquals(2, mutatedCFs.size());
+        for (PartitionUpdate update : mutatedCFs)
+        {
+            if (update.metadata().cfName.equals("cf1"))
+            {
+                Row row = update.iterator().next();
+                assertEquals(bytes("k1v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+                assertNull(row.getCell(metadata.getColumnDefinition(bytes("c2"))));
+            }
+            else
+            {
+                Row row = update.iterator().next();
+                assertNull(row.getCell(metadata.getColumnDefinition(bytes("c1"))));
+                assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
+            }
+        }
+
+        mutatedCFs = new ArrayList<>(tmutations.get(1).getPartitionUpdates());
         assertEquals(2, mutatedCFs.size());
 
-        Collections.sort(mutatedCFs, new CfComparator());
-        assertEquals(bytes("k1v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")));
-        assertNull(mutatedCFs.get(1).getColumn(getColumnName(metadata, "c1")));
-        assertEquals(bytes("trigger"), mutatedCFs.get(1).getColumn(getColumnName(metadata, "c2")).value());
-
-        mutatedCFs = new ArrayList<>(tmutations.get(1).getColumnFamilies());
-        assertEquals(2, mutatedCFs.size());
-
-        Collections.sort(mutatedCFs, new CfComparator());
-        assertEquals(bytes("k2v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")));
-        assertNull(mutatedCFs.get(1).getColumn(getColumnName(metadata, "c1")));
-        assertEquals(bytes("trigger"), mutatedCFs.get(1).getColumn(getColumnName(metadata, "c2")).value());
+        for (PartitionUpdate update : mutatedCFs)
+        {
+            if (update.metadata().cfName.equals("cf1"))
+            {
+                Row row = update.iterator().next();
+                assertEquals(bytes("k2v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+                assertNull(row.getCell(metadata.getColumnDefinition(bytes("c2"))));
+            }
+            else
+            {
+                Row row = update.iterator().next();
+                assertNull(row.getCell(metadata.getColumnDefinition(bytes("c1"))));
+                assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
+            }
+        }
     }
 
     @Test
     public void sameKeyDifferentKsRowMutations() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", SameKeyDifferentKsTrigger.class.getName()));
-        ColumnFamily cf1 = makeCf(metadata, "k1v1", null);
-        ColumnFamily cf2 = makeCf(metadata, "k2v1", null);
-        Mutation rm1 = new Mutation(bytes("k1"), cf1);
-        Mutation rm2 = new Mutation(bytes("k2"), cf2);
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", SameKeyDifferentKsTrigger.class.getName()));
+        PartitionUpdate cf1 = makeCf(metadata, "k1", "k1v1", null);
+        PartitionUpdate cf2 = makeCf(metadata, "k2", "k2v1", null);
+        Mutation rm1 = new Mutation("ks1", cf1.partitionKey()).add(cf1);
+        Mutation rm2 = new Mutation("ks1", cf2.partitionKey()).add(cf2);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm1, rm2)));
         assertEquals(4, tmutations.size());
         Collections.sort(tmutations, new RmComparator());
 
-        List<ColumnFamily> mutatedCFs = new ArrayList<>(tmutations.get(0).getColumnFamilies());
+        List<PartitionUpdate> mutatedCFs = new ArrayList<>(tmutations.get(0).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("k1v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")));
+        Row row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("k1v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c2"))));
 
-        mutatedCFs = new ArrayList<>(tmutations.get(1).getColumnFamilies());
+        mutatedCFs = new ArrayList<>(tmutations.get(1).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("k2v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")));
+        row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("k2v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c2"))));
 
-        mutatedCFs = new ArrayList<>(tmutations.get(2).getColumnFamilies());
+        mutatedCFs = new ArrayList<>(tmutations.get(2).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")));
-        assertEquals(bytes("trigger"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")).value());
+        row = mutatedCFs.get(0).iterator().next();
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c1"))));
+        assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
 
-        mutatedCFs = new ArrayList<>(tmutations.get(3).getColumnFamilies());
+        mutatedCFs = new ArrayList<>(tmutations.get(3).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")));
-        assertEquals(bytes("trigger"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")).value());
+        row = mutatedCFs.get(0).iterator().next();
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c1"))));
+        assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
     }
 
     @Test
     public void differentKeyRowMutations() throws ConfigurationException, InvalidRequestException
     {
-        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerDefinition.create("test", DifferentKeyTrigger.class.getName()));
-        ColumnFamily cf = makeCf(metadata, "v1", null);
-        Mutation rm = new Mutation(UTF8Type.instance.fromString("k1"), cf);
+
+        CFMetaData metadata = makeCfMetaData("ks1", "cf1", TriggerMetadata.create("test", DifferentKeyTrigger.class.getName()));
+        PartitionUpdate cf1 = makeCf(metadata, "k1", "v1", null);
+        Mutation rm = new Mutation("ks1", cf1.partitionKey()).add(cf1);
 
         List<? extends IMutation> tmutations = new ArrayList<>(TriggerExecutor.instance.execute(Arrays.asList(rm)));
         assertEquals(2, tmutations.size());
         Collections.sort(tmutations, new RmComparator());
 
-        assertEquals(bytes("k1"), tmutations.get(0).key());
-        assertEquals(bytes("otherKey"), tmutations.get(1).key());
+        assertEquals(bytes("k1"), tmutations.get(0).key().getKey());
+        assertEquals(bytes("otherKey"), tmutations.get(1).key().getKey());
 
-        List<ColumnFamily> mutatedCFs = new ArrayList<>(tmutations.get(0).getColumnFamilies());
+        List<PartitionUpdate> mutatedCFs = new ArrayList<>(tmutations.get(0).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertEquals(bytes("v1"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")).value());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")));
+        Row row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("v1"), row.getCell(metadata.getColumnDefinition(bytes("c1"))).value());
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c2"))));
 
-        mutatedCFs = new ArrayList<>(tmutations.get(1).getColumnFamilies());
+        mutatedCFs = new ArrayList<>(tmutations.get(1).getPartitionUpdates());
         assertEquals(1, mutatedCFs.size());
-        assertNull(mutatedCFs.get(0).getColumn(getColumnName(metadata, "c1")));
-        assertEquals(bytes("trigger"), mutatedCFs.get(0).getColumn(getColumnName(metadata, "c2")).value());
+        row = mutatedCFs.get(0).iterator().next();
+        assertEquals(bytes("trigger"), row.getCell(metadata.getColumnDefinition(bytes("c2"))).value());
+        assertNull(row.getCell(metadata.getColumnDefinition(bytes("c1"))));
     }
 
-    private static CFMetaData makeCfMetaData(String ks, String cf, TriggerDefinition trigger)
+    private static CFMetaData makeCfMetaData(String ks, String cf, TriggerMetadata trigger)
     {
+        CFMetaData metadata = CFMetaData.Builder.create(ks, cf)
+                .addPartitionKey("pkey", UTF8Type.instance)
+                .addRegularColumn("c1", UTF8Type.instance)
+                .addRegularColumn("c2", UTF8Type.instance)
+                .build();
 
-        CFMetaData metadata = CFMetaData.sparseCFMetaData(ks, cf, CompositeType.getInstance(UTF8Type.instance));
-
-        metadata.keyValidator(UTF8Type.instance);
-        metadata.addOrReplaceColumnDefinition(ColumnDefinition.partitionKeyDef(metadata,
-                                                                               UTF8Type.instance.fromString("pkey"),
-                                                                               UTF8Type.instance,
-                                                                               null));
-        metadata.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(metadata,
-                                                                          UTF8Type.instance.fromString("c1"),
-                                                                          UTF8Type.instance,
-                                                                          0));
-        metadata.addOrReplaceColumnDefinition(ColumnDefinition.regularDef(metadata,
-                                                                          UTF8Type.instance.fromString("c2"),
-                                                                          UTF8Type.instance,
-                                                                          0));
         try
         {
             if (trigger != null)
-                metadata.addTriggerDefinition(trigger);
+                metadata.triggers(metadata.getTriggers().with(trigger));
         }
         catch (InvalidRequestException e)
         {
             throw new AssertionError(e);
         }
 
-        return metadata.rebuild();
+        return metadata;
     }
 
-    private static ColumnFamily makeCf(CFMetaData metadata, String columnValue1, String columnValue2)
+    private static PartitionUpdate makeCf(CFMetaData metadata, String key, String columnValue1, String columnValue2)
     {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(metadata);
-
+        Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
+        builder.newRow(Clustering.EMPTY);
+        long ts = FBUtilities.timestampMicros();
         if (columnValue1 != null)
-            cf.addColumn(new BufferCell(getColumnName(metadata, "c1"), bytes(columnValue1)));
-
+            builder.addCell(BufferCell.live(metadata, metadata.getColumnDefinition(bytes("c1")), ts, bytes(columnValue1)));
         if (columnValue2 != null)
-            cf.addColumn(new BufferCell(getColumnName(metadata, "c2"), bytes(columnValue2)));
+            builder.addCell(BufferCell.live(metadata, metadata.getColumnDefinition(bytes("c2")), ts, bytes(columnValue2)));
 
-        return cf;
-    }
-
-    private static CellName getColumnName(CFMetaData metadata, String stringName)
-    {
-        return metadata.comparator.makeCellName(stringName);
+        return PartitionUpdate.singleRowUpdate(metadata, Util.dk(key), builder.build());
     }
 
     public static class NoOpTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
             return null;
         }
@@ -266,54 +304,54 @@
 
     public static class SameKeySameCfTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(update.metadata());
-            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new Mutation(update.metadata().ksName, key, cf));
+            RowUpdateBuilder builder = new RowUpdateBuilder(partition.metadata(), FBUtilities.timestampMicros(), partition.partitionKey().getKey());
+            builder.add("c2", bytes("trigger"));
+            return Collections.singletonList(builder.build());
         }
     }
 
     public static class SameKeySameCfPartialTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            if (!key.equals(bytes("k2")))
+            if (!partition.partitionKey().getKey().equals(bytes("k2")))
                 return null;
 
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(update.metadata());
-            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new Mutation(update.metadata().ksName, key, cf));
+            RowUpdateBuilder builder = new RowUpdateBuilder(partition.metadata(), FBUtilities.timestampMicros(), partition.partitionKey().getKey());
+            builder.add("c2", bytes("trigger"));
+            return Collections.singletonList(builder.build());
         }
     }
 
     public static class SameKeyDifferentCfTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(makeCfMetaData(update.metadata().ksName, "otherCf", null));
-            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new Mutation(cf.metadata().ksName, key, cf));
+            RowUpdateBuilder builder = new RowUpdateBuilder(makeCfMetaData(partition.metadata().ksName, "otherCf", null), FBUtilities.timestampMicros(), partition.partitionKey().getKey());
+            builder.add("c2", bytes("trigger"));
+            return Collections.singletonList(builder.build());
         }
     }
 
     public static class SameKeyDifferentKsTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(makeCfMetaData("otherKs", "otherCf", null));
-            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new Mutation(cf.metadata().ksName, key, cf));
+            RowUpdateBuilder builder = new RowUpdateBuilder(makeCfMetaData("otherKs", "otherCf", null), FBUtilities.timestampMicros(), partition.partitionKey().getKey());
+            builder.add("c2", bytes("trigger"));
+            return Collections.singletonList(builder.build());
         }
     }
 
     public static class DifferentKeyTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily cf = ArrayBackedSortedColumns.factory.create(update.metadata());
-            cf.addColumn(new BufferCell(getColumnName(update.metadata(), "c2"), bytes("trigger")));
-            return Collections.singletonList(new Mutation(cf.metadata().ksName, bytes("otherKey"), cf));
+            RowUpdateBuilder builder = new RowUpdateBuilder(makeCfMetaData("otherKs", "otherCf", null), FBUtilities.timestampMicros(), "otherKey");
+            builder.add("c2", bytes("trigger"));
+            return Collections.singletonList(builder.build());
         }
     }
 
@@ -326,9 +364,9 @@
         }
     }
 
-    private static class CfComparator implements Comparator<ColumnFamily>
+    private static class CfComparator implements Comparator<Partition>
     {
-        public int compare(ColumnFamily cf1, ColumnFamily cf2)
+        public int compare(Partition cf1, Partition cf2)
         {
             return cf1.metadata().cfName.compareTo(cf2.metadata().cfName);
         }

diff --git a/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java b/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java
index 58f743e..b6549bb 100644
--- a/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java
+++ b/test/unit/org/apache/cassandra/triggers/TriggersSchemaTest.java

@@ -17,23 +17,20 @@
  */
 package org.apache.cassandra.triggers;
 
-import java.util.Collections;
-
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
 import org.apache.cassandra.config.Schema;
-import org.apache.cassandra.config.TriggerDefinition;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.Tables;
+import org.apache.cassandra.schema.TriggerMetadata;
 import org.apache.cassandra.service.MigrationManager;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 public class TriggersSchemaTest
 {
@@ -51,81 +48,65 @@
     @Test
     public void newKsContainsCfWithTrigger() throws Exception
     {
-        TriggerDefinition td = TriggerDefinition.create(triggerName, triggerClass);
+        TriggerMetadata td = TriggerMetadata.create(triggerName, triggerClass);
         CFMetaData cfm1 = CFMetaData.compile(String.format("CREATE TABLE %s (k int PRIMARY KEY, v int)", cfName), ksName);
-        cfm1.addTriggerDefinition(td);
-        KSMetaData ksm = KSMetaData.newKeyspace(ksName,
-                                                SimpleStrategy.class,
-                                                Collections.singletonMap("replication_factor", "1"),
-                                                true,
-                                                Collections.singletonList(cfm1));
+        cfm1.triggers(cfm1.getTriggers().with(td));
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(ksName, KeyspaceParams.simple(1), Tables.of(cfm1));
         MigrationManager.announceNewKeyspace(ksm);
 
         CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName);
         assertFalse(cfm2.getTriggers().isEmpty());
         assertEquals(1, cfm2.getTriggers().size());
-        assertEquals(td, cfm2.getTriggers().get(triggerName));
+        assertEquals(td, cfm2.getTriggers().get(triggerName).get());
     }
 
     @Test
     public void addNewCfWithTriggerToKs() throws Exception
     {
-        KSMetaData ksm = KSMetaData.newKeyspace(ksName,
-                                                SimpleStrategy.class,
-                                                Collections.singletonMap("replication_factor", "1"),
-                                                true,
-                                                Collections.EMPTY_LIST);
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(ksName, KeyspaceParams.simple(1));
         MigrationManager.announceNewKeyspace(ksm);
 
         CFMetaData cfm1 = CFMetaData.compile(String.format("CREATE TABLE %s (k int PRIMARY KEY, v int)", cfName), ksName);
-        TriggerDefinition td = TriggerDefinition.create(triggerName, triggerClass);
-        cfm1.addTriggerDefinition(td);
+        TriggerMetadata td = TriggerMetadata.create(triggerName, triggerClass);
+        cfm1.triggers(cfm1.getTriggers().with(td));
 
         MigrationManager.announceNewColumnFamily(cfm1);
 
         CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName);
         assertFalse(cfm2.getTriggers().isEmpty());
         assertEquals(1, cfm2.getTriggers().size());
-        assertEquals(td, cfm2.getTriggers().get(triggerName));
+        assertEquals(td, cfm2.getTriggers().get(triggerName).get());
     }
 
     @Test
     public void addTriggerToCf() throws Exception
     {
         CFMetaData cfm1 = CFMetaData.compile(String.format("CREATE TABLE %s (k int PRIMARY KEY, v int)", cfName), ksName);
-        KSMetaData ksm = KSMetaData.newKeyspace(ksName,
-                                                SimpleStrategy.class,
-                                                Collections.singletonMap("replication_factor", "1"),
-                                                true,
-                                                Collections.singletonList(cfm1));
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(ksName, KeyspaceParams.simple(1), Tables.of(cfm1));
         MigrationManager.announceNewKeyspace(ksm);
 
         CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName).copy();
-        TriggerDefinition td = TriggerDefinition.create(triggerName, triggerClass);
-        cfm2.addTriggerDefinition(td);
+        TriggerMetadata td = TriggerMetadata.create(triggerName, triggerClass);
+        cfm2.triggers(cfm2.getTriggers().with(td));
         MigrationManager.announceColumnFamilyUpdate(cfm2);
 
         CFMetaData cfm3 = Schema.instance.getCFMetaData(ksName, cfName);
         assertFalse(cfm3.getTriggers().isEmpty());
         assertEquals(1, cfm3.getTriggers().size());
-        assertEquals(td, cfm3.getTriggers().get(triggerName));
+        assertEquals(td, cfm3.getTriggers().get(triggerName).get());
     }
 
     @Test
     public void removeTriggerFromCf() throws Exception
     {
-        TriggerDefinition td = TriggerDefinition.create(triggerName, triggerClass);
+        TriggerMetadata td = TriggerMetadata.create(triggerName, triggerClass);
         CFMetaData cfm1 = CFMetaData.compile(String.format("CREATE TABLE %s (k int PRIMARY KEY, v int)", cfName), ksName);
-        cfm1.addTriggerDefinition(td);
-        KSMetaData ksm = KSMetaData.newKeyspace(ksName,
-                                                SimpleStrategy.class,
-                                                Collections.singletonMap("replication_factor", "1"),
-                                                true,
-                                                Collections.singletonList(cfm1));
+        cfm1.triggers(cfm1.getTriggers().with(td));
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(ksName, KeyspaceParams.simple(1), Tables.of(cfm1));
         MigrationManager.announceNewKeyspace(ksm);
 
         CFMetaData cfm2 = Schema.instance.getCFMetaData(ksName, cfName).copy();
-        cfm2.removeTrigger(triggerName);
+        cfm2.triggers(cfm2.getTriggers().without(triggerName));
         MigrationManager.announceColumnFamilyUpdate(cfm2);
 
         CFMetaData cfm3 = Schema.instance.getCFMetaData(ksName, cfName).copy();

diff --git a/test/unit/org/apache/cassandra/triggers/TriggersTest.java b/test/unit/org/apache/cassandra/triggers/TriggersTest.java
index 3cd0a2c..70e040b 100644
--- a/test/unit/org/apache/cassandra/triggers/TriggersTest.java
+++ b/test/unit/org/apache/cassandra/triggers/TriggersTest.java

@@ -21,7 +21,10 @@
 import java.util.Collection;
 import java.util.Collections;
 
-import org.junit.*;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.Schema;
@@ -30,6 +33,8 @@
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.service.StorageService;
@@ -55,12 +60,12 @@
     public static void beforeTest() throws ConfigurationException
     {
         SchemaLoader.loadSchema();
-        StorageService.instance.initServer(0);
     }
 
     @Before
     public void setup() throws Exception
     {
+        StorageService.instance.initServer(0);
         if (thriftServer == null || ! thriftServer.isRunning())
         {
             thriftServer = new ThriftServer(FBUtilities.getLocalAddress(), 9170, 50);
@@ -102,7 +107,7 @@
     {
         String cql = String.format("INSERT INTO %s.%s (k, v1) VALUES (0, 0)", ksName, cfName);
         QueryProcessor.process(cql, ConsistencyLevel.ONE);
-        assertUpdateIsAugmented(0);
+        assertUpdateIsAugmented(0, "v1", 0);
     }
 
     @Test
@@ -113,7 +118,7 @@
                                    "APPLY BATCH",
                                    ksName, cfName);
         QueryProcessor.process(cql, ConsistencyLevel.ONE);
-        assertUpdateIsAugmented(1);
+        assertUpdateIsAugmented(1, "v1", 1);
     }
 
     @Test
@@ -129,7 +134,7 @@
                       getColumnForInsert("v1", 2),
                       org.apache.cassandra.thrift.ConsistencyLevel.ONE);
 
-        assertUpdateIsAugmented(2);
+        assertUpdateIsAugmented(2, "v1", 2);
     }
 
     @Test
@@ -150,7 +155,7 @@
                                                               Collections.singletonList(mutation))),
             org.apache.cassandra.thrift.ConsistencyLevel.ONE);
 
-        assertUpdateIsAugmented(3);
+        assertUpdateIsAugmented(3, "v1", 3);
     }
 
     @Test
@@ -158,7 +163,7 @@
     {
         String cql = String.format("INSERT INTO %s.%s (k, v1) VALUES (4, 4) IF NOT EXISTS", ksName, cfName);
         QueryProcessor.process(cql, ConsistencyLevel.ONE);
-        assertUpdateIsAugmented(4);
+        assertUpdateIsAugmented(4, "v1", 4);
     }
 
     @Test
@@ -170,7 +175,7 @@
                                    "APPLY BATCH",
                                     ksName, cfName);
         QueryProcessor.process(cql, ConsistencyLevel.ONE);
-        assertUpdateIsAugmented(5);
+        assertUpdateIsAugmented(5, "v1", 5);
     }
 
     @Test
@@ -188,7 +193,7 @@
                    org.apache.cassandra.thrift.ConsistencyLevel.LOCAL_SERIAL,
                    org.apache.cassandra.thrift.ConsistencyLevel.ONE);
 
-        assertUpdateIsAugmented(6);
+        assertUpdateIsAugmented(6, "v1", 6);
     }
 
     // Unfortunately, an IRE thrown from StorageProxy.cas
@@ -277,29 +282,6 @@
         }
     }
 
-    @Test(expected=RuntimeException.class)
-    public void ifTriggerThrowsErrorNoMutationsAreApplied() throws Exception
-    {
-        String cf = "cf" + System.nanoTime();
-        try
-        {
-            setupTableWithTrigger(cf, ErrorTrigger.class);
-            String cql = String.format("INSERT INTO %s.%s (k, v1) VALUES (11, 11)", ksName, cf);
-            QueryProcessor.process(cql, ConsistencyLevel.ONE);
-        }
-        catch (Exception e)
-        {
-            Throwable cause = e.getCause();
-            assertTrue((cause instanceof org.apache.cassandra.exceptions.InvalidRequestException));
-            assertTrue(cause.getMessage().equals(ErrorTrigger.MESSAGE));
-            throw e;
-        }
-        finally
-        {
-            assertUpdateNotExecuted(cf, 11);
-        }
-    }
-
     private void setupTableWithTrigger(String cf, Class<? extends ITrigger> triggerImpl)
     throws RequestExecutionException
     {
@@ -312,12 +294,18 @@
         QueryProcessor.process(cql, ConsistencyLevel.ONE);
     }
 
-    private void assertUpdateIsAugmented(int key)
+    private void assertUpdateIsAugmented(int key, String originColumnName, Object originColumnValue)
     {
-        UntypedResultSet rs = QueryProcessor.executeInternal(
-                                String.format("SELECT * FROM %s.%s WHERE k=%s", ksName, cfName, key));
-        assertTrue(String.format("Expected value (%s) for augmented cell v2 was not found", key), rs.one().has("v2"));
-        assertEquals(999, rs.one().getInt("v2"));
+        UntypedResultSet rs = QueryProcessor.process(String.format("SELECT * FROM %s.%s WHERE k=%s", ksName, cfName, key), ConsistencyLevel.ONE);
+        assertRowValue(rs.one(), key, "v2", 999); // from trigger
+        assertRowValue(rs.one(), key, originColumnName, originColumnValue); // from original update
+    }
+    
+    private void assertRowValue(UntypedResultSet.Row row, int key, String columnName, Object columnValue)
+    {
+        assertTrue(String.format("Expected value (%s) for augmented cell %s was not found", key, columnName),
+                   row.has(columnName));
+        assertEquals(columnValue, row.getInt(columnName));
     }
 
     private void assertUpdateNotExecuted(String cf, int key)
@@ -330,7 +318,7 @@
     private org.apache.cassandra.thrift.Column getColumnForInsert(String columnName, int value)
     {
         org.apache.cassandra.thrift.Column column = new org.apache.cassandra.thrift.Column();
-        column.setName(Schema.instance.getCFMetaData(ksName, cfName).comparator.asAbstractType().fromString(columnName));
+        column.setName(LegacyLayout.makeLegacyComparator(Schema.instance.getCFMetaData(ksName, cfName)).fromString(columnName));
         column.setValue(bytes(value));
         column.setTimestamp(System.currentTimeMillis());
         return column;
@@ -338,43 +326,35 @@
 
     public static class TestTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily extraUpdate = update.cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-            extraUpdate.addColumn(new BufferCell(update.metadata().comparator.makeCellName(bytes("v2")), bytes(999)));
-            return Collections.singletonList(new Mutation(ksName, key, extraUpdate));
+            RowUpdateBuilder update = new RowUpdateBuilder(partition.metadata(), FBUtilities.timestampMicros(), partition.partitionKey().getKey());
+            update.add("v2", 999);
+
+            return Collections.singletonList(update.build());
         }
     }
 
     public static class CrossPartitionTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily extraUpdate = update.cloneMeShallow(ArrayBackedSortedColumns.factory, false);
-            extraUpdate.addColumn(new BufferCell(update.metadata().comparator.makeCellName(bytes("v2")), bytes(999)));
+            RowUpdateBuilder update = new RowUpdateBuilder(partition.metadata(), FBUtilities.timestampMicros(), toInt(partition.partitionKey().getKey()) + 1000);
+            update.add("v2", 999);
 
-            int newKey = toInt(key) + 1000;
-            return Collections.singletonList(new Mutation(ksName, bytes(newKey), extraUpdate));
+            return Collections.singletonList(update.build());
         }
     }
 
     public static class CrossTableTrigger implements ITrigger
     {
-        public Collection<Mutation> augment(ByteBuffer key, ColumnFamily update)
+        public Collection<Mutation> augment(Partition partition)
         {
-            ColumnFamily extraUpdate = ArrayBackedSortedColumns.factory.create(ksName, otherCf);
-            extraUpdate.addColumn(new BufferCell(extraUpdate.metadata().comparator.makeCellName(bytes("v2")), bytes(999)));
-            return Collections.singletonList(new Mutation(ksName, key, extraUpdate));
+
+            RowUpdateBuilder update = new RowUpdateBuilder(Schema.instance.getCFMetaData(ksName, otherCf), FBUtilities.timestampMicros(), partition.partitionKey().getKey());
+            update.add("v2", 999);
+
+            return Collections.singletonList(update.build());
         }
     }
-
-    public static class ErrorTrigger implements ITrigger
-    {
-        public static final String MESSAGE = "Thrown by ErrorTrigger";
-        public Collection<Mutation> augment(ByteBuffer partitionKey, ColumnFamily update)
-        {
-            throw new org.apache.cassandra.exceptions.InvalidRequestException(MESSAGE);
-        }
-    }
-
 }

diff --git a/test/unit/org/apache/cassandra/utils/AbstractIteratorTest.java b/test/unit/org/apache/cassandra/utils/AbstractIteratorTest.java
new file mode 100644
index 0000000..b2f9433
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/AbstractIteratorTest.java

@@ -0,0 +1,383 @@
+/*
+ * Copyright (C) 2007 The Guava Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import junit.framework.TestCase;
+
+import java.lang.ref.WeakReference;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+/**
+ * Unit test for {@code AbstractIterator}.
+ *
+ * @author Kevin Bourrillion
+ */
+@SuppressWarnings("serial") // No serialization is used in this test
+// TODO(cpovirk): why is this slow (>1m/test) under GWT when fully optimized?
+public class AbstractIteratorTest extends TestCase
+{
+
+    public void testDefaultBehaviorOfNextAndHasNext()
+    {
+
+        // This sample AbstractIterator returns 0 on the first call, 1 on the
+        // second, then signals that it's reached the end of the data
+        Iterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            private int rep;
+
+            @Override
+            public Integer computeNext()
+            {
+                switch (rep++)
+                {
+                    case 0:
+                        return 0;
+                    case 1:
+                        return 1;
+                    case 2:
+                        return endOfData();
+                    default:
+                        fail("Should not have been invoked again");
+                        return null;
+                }
+            }
+        };
+
+        assertTrue(iter.hasNext());
+        assertEquals(0, (int) iter.next());
+
+        // verify idempotence of hasNext()
+        assertTrue(iter.hasNext());
+        assertTrue(iter.hasNext());
+        assertTrue(iter.hasNext());
+        assertEquals(1, (int) iter.next());
+
+        assertFalse(iter.hasNext());
+
+        // Make sure computeNext() doesn't get invoked again
+        assertFalse(iter.hasNext());
+
+        try
+        {
+            iter.next();
+            fail("no exception thrown");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+    }
+
+    public void testDefaultBehaviorOfPeek()
+    {
+    /*
+     * This sample AbstractIterator returns 0 on the first call, 1 on the
+     * second, then signals that it's reached the end of the data
+     */
+        AbstractIterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            private int rep;
+
+            @Override
+            public Integer computeNext()
+            {
+                switch (rep++)
+                {
+                    case 0:
+                        return 0;
+                    case 1:
+                        return 1;
+                    case 2:
+                        return endOfData();
+                    default:
+                        fail("Should not have been invoked again");
+                        return null;
+                }
+            }
+        };
+
+        assertEquals(0, (int) iter.peek());
+        assertEquals(0, (int) iter.peek());
+        assertTrue(iter.hasNext());
+        assertEquals(0, (int) iter.peek());
+        assertEquals(0, (int) iter.next());
+
+        assertEquals(1, (int) iter.peek());
+        assertEquals(1, (int) iter.next());
+
+        try
+        {
+            iter.peek();
+            fail("peek() should throw NoSuchElementException at end");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+
+        try
+        {
+            iter.peek();
+            fail("peek() should continue to throw NoSuchElementException at end");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+
+        try
+        {
+            iter.next();
+            fail("next() should throw NoSuchElementException as usual");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+
+        try
+        {
+            iter.peek();
+            fail("peek() should still throw NoSuchElementException after next()");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+    }
+
+    public void testFreesNextReference() throws InterruptedException
+    {
+        Iterator<Object> itr = new AbstractIterator<Object>()
+        {
+            @Override
+            public Object computeNext()
+            {
+                return new Object();
+            }
+        };
+        WeakReference<Object> ref = new WeakReference<Object>(itr.next());
+        while (ref.get() != null)
+        {
+            System.gc();
+            Thread.sleep(1);
+        }
+    }
+
+    public void testDefaultBehaviorOfPeekForEmptyIteration()
+    {
+
+        AbstractIterator<Integer> empty = new AbstractIterator<Integer>()
+        {
+            private boolean alreadyCalledEndOfData;
+
+            @Override
+            public Integer computeNext()
+            {
+                if (alreadyCalledEndOfData)
+                {
+                    fail("Should not have been invoked again");
+                }
+                alreadyCalledEndOfData = true;
+                return endOfData();
+            }
+        };
+
+        try
+        {
+            empty.peek();
+            fail("peek() should throw NoSuchElementException at end");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+
+        try
+        {
+            empty.peek();
+            fail("peek() should continue to throw NoSuchElementException at end");
+        }
+        catch (NoSuchElementException expected)
+        {
+        }
+    }
+
+    public void testException()
+    {
+        final SomeUncheckedException exception = new SomeUncheckedException();
+        Iterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            @Override
+            public Integer computeNext()
+            {
+                throw exception;
+            }
+        };
+
+        // It should pass through untouched
+        try
+        {
+            iter.hasNext();
+            fail("No exception thrown");
+        }
+        catch (SomeUncheckedException e)
+        {
+            assertSame(exception, e);
+        }
+    }
+
+    public void testExceptionAfterEndOfData()
+    {
+        Iterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            @Override
+            public Integer computeNext()
+            {
+                endOfData();
+                throw new SomeUncheckedException();
+            }
+        };
+        try
+        {
+            iter.hasNext();
+            fail("No exception thrown");
+        }
+        catch (SomeUncheckedException expected)
+        {
+        }
+    }
+
+    public void testCantRemove()
+    {
+        Iterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            boolean haveBeenCalled;
+
+            @Override
+            public Integer computeNext()
+            {
+                if (haveBeenCalled)
+                {
+                    endOfData();
+                }
+                haveBeenCalled = true;
+                return 0;
+            }
+        };
+
+        assertEquals(0, (int) iter.next());
+
+        try
+        {
+            iter.remove();
+            fail("No exception thrown");
+        }
+        catch (UnsupportedOperationException expected)
+        {
+        }
+    }
+
+    public void testSneakyThrow() throws Exception
+    {
+        Iterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            boolean haveBeenCalled;
+
+            @Override
+            public Integer computeNext()
+            {
+                if (haveBeenCalled)
+                {
+                    fail("Should not have been called again");
+                }
+                else
+                {
+                    haveBeenCalled = true;
+                    sneakyThrow(new SomeCheckedException());
+                }
+                return null; // never reached
+            }
+        };
+
+        // The first time, the sneakily-thrown exception comes out
+        try
+        {
+            iter.hasNext();
+            fail("No exception thrown");
+        }
+        catch (Exception e)
+        {
+            if (!(e instanceof SomeCheckedException))
+            {
+                throw e;
+            }
+        }
+
+        // But the second time, AbstractIterator itself throws an ISE
+        try
+        {
+            iter.hasNext();
+            fail("No exception thrown");
+        }
+        catch (IllegalStateException expected)
+        {
+        }
+    }
+
+    public void testReentrantHasNext()
+    {
+        Iterator<Integer> iter = new AbstractIterator<Integer>()
+        {
+            @Override
+            protected Integer computeNext()
+            {
+                hasNext();
+                return null;
+            }
+        };
+        try
+        {
+            iter.hasNext();
+            fail();
+        }
+        catch (IllegalStateException expected)
+        {
+        }
+    }
+
+    /**
+     * Throws a undeclared checked exception.
+     */
+    private static void sneakyThrow(Throwable t)
+    {
+        class SneakyThrower<T extends Throwable>
+        {
+            @SuppressWarnings("unchecked")
+                // not really safe, but that's the point
+            void throwIt(Throwable t) throws T
+            {
+                throw (T) t;
+            }
+        }
+        new SneakyThrower<Error>().throwIt(t);
+    }
+
+    private static class SomeCheckedException extends Exception
+    {
+    }
+
+    private static class SomeUncheckedException extends RuntimeException
+    {
+    }
+}
\ No newline at end of file

diff --git a/test/unit/org/apache/cassandra/utils/BTreeTest.java b/test/unit/org/apache/cassandra/utils/BTreeTest.java
index e1bf388..ffd7315 100644
--- a/test/unit/org/apache/cassandra/utils/BTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/BTreeTest.java

@@ -20,15 +20,14 @@
 import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
 
+import com.google.common.collect.Iterables;
 import org.junit.Test;
 
+import junit.framework.Assert;
 import org.apache.cassandra.utils.btree.BTree;
-import org.apache.cassandra.utils.btree.BTreeSet;
 import org.apache.cassandra.utils.btree.UpdateFunction;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 public class BTreeTest
 {
@@ -40,7 +39,7 @@
             ints[i] = new Integer(i);
     }
 
-    static final UpdateFunction<Integer> updateF = new UpdateFunction<Integer>()
+    static final UpdateFunction<Integer, Integer> updateF = new UpdateFunction<Integer, Integer>()
     {
         public Integer apply(Integer replacing, Integer update)
         {
@@ -63,6 +62,28 @@
         }
     };
 
+    private static final UpdateFunction<Integer, Integer> noOp = new UpdateFunction<Integer, Integer>()
+    {
+        public Integer apply(Integer replacing, Integer update)
+        {
+            return update;
+        }
+
+        public boolean abortEarly()
+        {
+            return false;
+        }
+
+        public void allocated(long heapSize)
+        {
+        }
+
+        public Integer apply(Integer k)
+        {
+            return k;
+        }
+    };
+
     private static List<Integer> seq(int count)
     {
         List<Integer> r = new ArrayList<>();
@@ -97,20 +118,14 @@
     public void testBuilding_UpdateFunctionReplacement()
     {
         for (int i = 0; i < 20 ; i++)
-        {
-            checkResult(i, BTree.build(seq(i), CMP, true, updateF));
-            checkResult(i, BTree.build(rand(i), CMP, false, updateF));
-        }
+            checkResult(i, BTree.build(seq(i), updateF));
     }
 
     @Test
     public void testUpdate_UpdateFunctionReplacement()
     {
         for (int i = 0; i < 20 ; i++)
-        {
-            checkResult(i, BTree.update(BTree.build(seq(i), CMP, true, UpdateFunction.NoOp.<Integer>instance()), CMP, seq(i), true, updateF));
-            checkResult(i, BTree.update(BTree.build(rand(i), CMP, false, UpdateFunction.NoOp.<Integer>instance()), CMP, rand(i), false, updateF));
-        }
+            checkResult(i, BTree.update(BTree.build(seq(i), noOp), CMP, seq(i), updateF));
     }
 
     /**
@@ -120,35 +135,35 @@
     @Test
     public void testUpdate_UpdateFunctionCallBack()
     {
-        Object[] btree = new Object[0];
+        Object[] btree = new Object[1];
         CallsMonitor monitor = new CallsMonitor();
 
-        btree = BTree.update(btree, CMP, Arrays.asList(1), true, monitor);
-        assertArrayEquals(new Object[] {1, null}, btree);
+        btree = BTree.update(btree, CMP, Arrays.asList(1), monitor);
+        assertArrayEquals(new Object[] {1}, btree);
         assertEquals(1, monitor.getNumberOfCalls(1));
 
         monitor.clear();
-        btree = BTree.update(btree, CMP, Arrays.asList(2), true, monitor);
-        assertArrayEquals(new Object[] {1, 2}, btree);
+        btree = BTree.update(btree, CMP, Arrays.asList(2), monitor);
+        assertArrayEquals(new Object[] {1, 2, null}, btree);
         assertEquals(1, monitor.getNumberOfCalls(2));
 
         // with existing value
         monitor.clear();
-        btree = BTree.update(btree, CMP, Arrays.asList(1), true, monitor);
-        assertArrayEquals(new Object[] {1, 2}, btree);
+        btree = BTree.update(btree, CMP, Arrays.asList(1), monitor);
+        assertArrayEquals(new Object[] {1, 2, null}, btree);
         assertEquals(1, monitor.getNumberOfCalls(1));
 
         // with two non-existing values
         monitor.clear();
-        btree = BTree.update(btree, CMP, Arrays.asList(3, 4), true, monitor);
-        assertArrayEquals(new Object[] {1, 2, 3, 4}, btree);
+        btree = BTree.update(btree, CMP, Arrays.asList(3, 4), monitor);
+        assertArrayEquals(new Object[] {1, 2, 3, 4, null}, btree);
         assertEquals(1, monitor.getNumberOfCalls(3));
         assertEquals(1, monitor.getNumberOfCalls(4));
 
-        // with one existing value and one non existing value in disorder
+        // with one existing value and one non existing value
         monitor.clear();
-        btree = BTree.update(btree, CMP, Arrays.asList(5, 2), false, monitor);
-        assertArrayEquals(new Object[] {3, new Object[]{1, 2}, new Object[]{4, 5}}, btree);
+        btree = BTree.update(btree, CMP, Arrays.asList(2, 5), monitor);
+        assertArrayEquals(new Object[] {3, new Object[]{1, 2, null}, new Object[]{4, 5, null},  new int[]{2, 5}}, btree);
         assertEquals(1, monitor.getNumberOfCalls(2));
         assertEquals(1, monitor.getNumberOfCalls(5));
     }
@@ -160,51 +175,202 @@
     public void testBuilding_UpdateFunctionCallBack()
     {
         CallsMonitor monitor = new CallsMonitor();
-        Object[] btree = BTree.build(Arrays.asList(1), CMP, true, monitor);
-        assertArrayEquals(new Object[] {1, null}, btree);
+        Object[] btree = BTree.build(Arrays.asList(1), monitor);
+        assertArrayEquals(new Object[] {1}, btree);
         assertEquals(1, monitor.getNumberOfCalls(1));
 
         monitor.clear();
-        btree = BTree.build(Arrays.asList(1, 2), CMP, true, monitor);
-        assertArrayEquals(new Object[] {1, 2}, btree);
+        btree = BTree.build(Arrays.asList(1, 2), monitor);
+        assertArrayEquals(new Object[] {1, 2, null}, btree);
         assertEquals(1, monitor.getNumberOfCalls(1));
         assertEquals(1, monitor.getNumberOfCalls(2));
 
         monitor.clear();
-        btree = BTree.build(Arrays.asList(3, 1, 2), CMP, false, monitor);
-        assertArrayEquals(new Object[] {1, 2, 3, null}, btree);
+        btree = BTree.build(Arrays.asList(1, 2, 3), monitor);
+        assertArrayEquals(new Object[] {1, 2, 3}, btree);
         assertEquals(1, monitor.getNumberOfCalls(1));
         assertEquals(1, monitor.getNumberOfCalls(2));
         assertEquals(1, monitor.getNumberOfCalls(3));
     }
 
+    /**
+     * Tests that the apply method of the <code>QuickResolver</code> is called exactly once per duplicate value
+     */
+    @Test
+    public void testBuilder_QuickResolver()
+    {
+        // for numbers x in 1..N, we repeat x x times, and resolve values to their sum,
+        // so that the resulting tree is of square numbers
+        BTree.Builder.QuickResolver<Accumulator> resolver = (a, b) -> new Accumulator(a.base, a.sum + b.sum);
+
+        for (int count = 0 ; count < 10 ; count ++)
+        {
+            BTree.Builder<Accumulator> builder;
+            // first check we produce the right output for sorted input
+            List<Accumulator> sorted = resolverInput(count, false);
+            builder = BTree.builder(Comparator.naturalOrder());
+            builder.setQuickResolver(resolver);
+            for (Accumulator i : sorted)
+                builder.add(i);
+            // for sorted input, check non-resolve path works before checking resolution path
+            checkResolverOutput(count, builder.build(), BTree.Dir.ASC);
+            builder.reuse();
+            for (int i = 0 ; i < 10 ; i++)
+            {
+                // now do a few runs of randomized inputs
+                for (Accumulator j : resolverInput(count, true))
+                    builder.add(j);
+                checkResolverOutput(count, builder.build(), BTree.Dir.ASC);
+                builder.reuse();
+            }
+            for (List<Accumulator> add : splitResolverInput(count))
+            {
+                if (ThreadLocalRandom.current().nextBoolean())
+                    builder.addAll(add);
+                else
+                    builder.addAll(new TreeSet<>(add));
+            }
+            checkResolverOutput(count, builder.build(), BTree.Dir.ASC);
+            builder.reuse();
+        }
+    }
+
+    private static class Accumulator extends Number implements Comparable<Accumulator>
+    {
+        final int base;
+        final int sum;
+        private Accumulator(int base, int sum)
+        {
+            this.base = base;
+            this.sum = sum;
+        }
+
+        public int compareTo(Accumulator that) { return Integer.compare(base, that.base); }
+        public int intValue() { return sum; }
+        public long longValue() { return sum; }
+        public float floatValue() { return sum; }
+        public double doubleValue() { return sum; }
+    }
+
+    /**
+     * Tests that the apply method of the <code>Resolver</code> is called exactly once per unique value
+     */
+    @Test
+    public void testBuilder_ResolverAndReverse()
+    {
+        // for numbers x in 1..N, we repeat x x times, and resolve values to their sum,
+        // so that the resulting tree is of square numbers
+        BTree.Builder.Resolver resolver = (array, lb, ub) -> {
+            int sum = 0;
+            for (int i = lb ; i < ub ; i++)
+                sum += ((Accumulator) array[i]).sum;
+            return new Accumulator(((Accumulator) array[lb]).base, sum);
+        };
+
+        for (int count = 0 ; count < 10 ; count ++)
+        {
+            BTree.Builder<Accumulator> builder;
+            // first check we produce the right output for sorted input
+            List<Accumulator> sorted = resolverInput(count, false);
+            builder = BTree.builder(Comparator.naturalOrder());
+            builder.auto(false);
+            for (Accumulator i : sorted)
+                builder.add(i);
+            // for sorted input, check non-resolve path works before checking resolution path
+            Assert.assertTrue(Iterables.elementsEqual(sorted, BTree.iterable(builder.build())));
+            checkResolverOutput(count, builder.resolve(resolver).build(), BTree.Dir.ASC);
+            builder = BTree.builder(Comparator.naturalOrder());
+            builder.auto(false);
+            for (int i = 0 ; i < 10 ; i++)
+            {
+                // now do a few runs of randomized inputs
+                for (Accumulator j : resolverInput(count, true))
+                    builder.add(j);
+                checkResolverOutput(count, builder.sort().resolve(resolver).build(), BTree.Dir.ASC);
+                builder.reuse();
+                for (Accumulator j : resolverInput(count, true))
+                    builder.add(j);
+                checkResolverOutput(count, builder.sort().reverse().resolve(resolver).build(), BTree.Dir.DESC);
+                builder.reuse();
+            }
+        }
+    }
+
+    private static List<Accumulator> resolverInput(int count, boolean shuffled)
+    {
+        List<Accumulator> result = new ArrayList<>();
+        for (int i = 1 ; i <= count ; i++)
+            for (int j = 0 ; j < i ; j++)
+                result.add(new Accumulator(i, i));
+        if (shuffled)
+        {
+            ThreadLocalRandom random = ThreadLocalRandom.current();
+            for (int i = 0 ; i < result.size() ; i++)
+            {
+                int swapWith = random.nextInt(i, result.size());
+                Accumulator t = result.get(swapWith);
+                result.set(swapWith, result.get(i));
+                result.set(i, t);
+            }
+        }
+        return result;
+    }
+
+    private static List<List<Accumulator>> splitResolverInput(int count)
+    {
+        List<Accumulator> all = resolverInput(count, false);
+        List<List<Accumulator>> result = new ArrayList<>();
+        while (!all.isEmpty())
+        {
+            List<Accumulator> is = new ArrayList<>();
+            int prev = -1;
+            for (Accumulator i : new ArrayList<>(all))
+            {
+                if (i.base == prev)
+                    continue;
+                is.add(i);
+                all.remove(i);
+                prev = i.base;
+            }
+            result.add(is);
+        }
+        return result;
+    }
+
+    private static void checkResolverOutput(int count, Object[] btree, BTree.Dir dir)
+    {
+        int i = 1;
+        for (Accumulator current : BTree.<Accumulator>iterable(btree, dir))
+        {
+            Assert.assertEquals(i * i, current.sum);
+            i++;
+        }
+        Assert.assertEquals(i, count + 1);
+    }
+
     private static void checkResult(int count, Object[] btree)
     {
-        BTreeSet<Integer> vs = new BTreeSet<>(btree, CMP);
-        assert vs.size() == count;
+        Iterator<Integer> iter = BTree.slice(btree, CMP, BTree.Dir.ASC);
         int i = 0;
-        for (Integer j : vs)
-            assertEquals(j, ints[i++]);
+        while (iter.hasNext())
+            assertEquals(iter.next(), ints[i++]);
+        assertEquals(count, i);
     }
 
     @Test
     public void testClearOnAbort()
     {
-        final Comparator<String> cmp = new Comparator<String>()
-        {
-            public int compare(String o1, String o2)
-            {
-                return o1.compareTo(o2);
-            }
-        };
+        Object[] btree = BTree.build(seq(2), noOp);
+        Object[] copy = Arrays.copyOf(btree, btree.length);
+        BTree.update(btree, CMP, seq(94), new AbortAfterX(90));
 
-        Object[] btree = BTree.build(ranges(range(0, 8)), cmp, true, UpdateFunction.NoOp.<String>instance());
-        BTree.update(btree, cmp, ranges(range(0, 94)), false, new AbortAfterX(90));
-        btree = BTree.update(btree, cmp, ranges(range(0, 94)), false, UpdateFunction.NoOp.<String>instance());
-        assertTrue(BTree.isWellFormed(btree, cmp));
+        assertArrayEquals(copy, btree);
+
+        btree = BTree.update(btree, CMP, seq(94), noOp);
+        assertTrue(BTree.isWellFormed(btree, CMP));
     }
 
-    private static final class AbortAfterX implements UpdateFunction<String>
+    private static final class AbortAfterX implements UpdateFunction<Integer, Integer>
     {
         int counter;
         final int abortAfter;
@@ -212,7 +378,7 @@
         {
             this.abortAfter = abortAfter;
         }
-        public String apply(String replacing, String update)
+        public Integer apply(Integer replacing, Integer update)
         {
             return update;
         }
@@ -223,33 +389,16 @@
         public void allocated(long heapSize)
         {
         }
-        public String apply(String v)
+        public Integer apply(Integer v)
         {
             return v;
         }
     }
 
-    private static int[] range(int lb, int ub)
-    {
-        return new int[] { lb, ub };
-    }
-
-    private static List<String> ranges(int[] ... ranges)
-    {
-
-        List<String> r = new ArrayList<>();
-        for (int[] range : ranges)
-        {
-            for (int i = range[0] ; i < range[1] ; i+=1)
-                r.add(Integer.toString(i));
-        }
-        return r;
-    }
-
     /**
      * <code>UpdateFunction</code> that count the number of call made to apply for each value.
      */
-    public static final class CallsMonitor implements UpdateFunction<Integer>
+    public static final class CallsMonitor implements UpdateFunction<Integer, Integer>
     {
         private int[] numberOfCalls = new int[20];
 

diff --git a/test/unit/org/apache/cassandra/utils/BitSetTest.java b/test/unit/org/apache/cassandra/utils/BitSetTest.java
index 9d82edf..0f51531 100644
--- a/test/unit/org/apache/cassandra/utils/BitSetTest.java
+++ b/test/unit/org/apache/cassandra/utils/BitSetTest.java

@@ -24,9 +24,9 @@
 import java.util.Random;
 
 import com.google.common.collect.Lists;
-
-import org.junit.Test;
 import org.junit.Assert;
+import org.junit.Test;
+
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.utils.IFilter.FilterKey;
 import org.apache.cassandra.utils.KeyGenerator.RandomStringGenerator;
@@ -44,8 +44,13 @@
     @Test
     public void compareBitSets()
     {
-        BloomFilter bf2 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false);
-        BloomFilter bf3 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, true);
+        compareBitSets(false);
+        compareBitSets(true);
+    }
+    private static void compareBitSets(boolean oldBfHashOrder)
+    {
+        BloomFilter bf2 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, oldBfHashOrder);
+        BloomFilter bf3 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, true, oldBfHashOrder);
 
         RandomStringGenerator gen1 = new KeyGenerator.RandomStringGenerator(new Random().nextInt(), FilterTestHelper.ELEMENTS);
 
@@ -79,13 +84,13 @@
     @Test
     public void testOffHeapCompatibility() throws IOException
     {
-        try (OpenBitSet bs = new OpenBitSet(100000)) 
+        try (OpenBitSet bs = new OpenBitSet(100000))
         {
             populateAndReserialize(bs);
         }
     }
 
-    private void populateAndReserialize(IBitSet bs) throws IOException
+    private static void populateAndReserialize(IBitSet bs) throws IOException
     {
         for (long i = 0; i < bs.capacity(); i++)
             if (random.nextBoolean())

diff --git a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
index 0c8aec6..2e76e0e 100644
--- a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
+++ b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java

@@ -18,45 +18,45 @@
 */
 package org.apache.cassandra.utils;
 
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
+import java.io.*;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Random;
 import java.util.Set;
 
 import org.junit.*;
+
+import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.utils.IFilter.FilterKey;
 import org.apache.cassandra.utils.KeyGenerator.RandomStringGenerator;
 
 public class BloomFilterTest
 {
-    public IFilter bf;
+    public IFilter bfOldFormat;
+    public IFilter bfInvHashes;
 
     public BloomFilterTest()
     {
 
     }
 
-    public static IFilter testSerialize(IFilter f) throws IOException
+    public static IFilter testSerialize(IFilter f, boolean oldBfHashOrder) throws IOException
     {
         f.add(FilterTestHelper.bytes("a"));
         DataOutputBuffer out = new DataOutputBuffer();
         FilterFactory.serialize(f, out);
 
         ByteArrayInputStream in = new ByteArrayInputStream(out.getData(), 0, out.getLength());
-        IFilter f2 = FilterFactory.deserialize(new DataInputStream(in), true);
+        IFilter f2 = FilterFactory.deserialize(new DataInputStream(in), true, oldBfHashOrder);
 
         assert f2.isPresent(FilterTestHelper.bytes("a"));
         assert !f2.isPresent(FilterTestHelper.bytes("b"));
@@ -67,13 +67,15 @@
     @Before
     public void setup()
     {
-        bf = FilterFactory.getFilter(10000L, FilterTestHelper.MAX_FAILURE_RATE, true);
+        bfOldFormat = FilterFactory.getFilter(10000L, FilterTestHelper.MAX_FAILURE_RATE, true, true);
+        bfInvHashes = FilterFactory.getFilter(10000L, FilterTestHelper.MAX_FAILURE_RATE, true, false);
     }
 
     @After
     public void destroy()
     {
-        bf.close();
+        bfOldFormat.close();
+        bfInvHashes.close();
     }
 
     @Test(expected = UnsupportedOperationException.class)
@@ -92,21 +94,29 @@
     @Test
     public void testOne()
     {
-        bf.add(FilterTestHelper.bytes("a"));
-        assert bf.isPresent(FilterTestHelper.bytes("a"));
-        assert !bf.isPresent(FilterTestHelper.bytes("b"));
+        bfOldFormat.add(FilterTestHelper.bytes("a"));
+        assert bfOldFormat.isPresent(FilterTestHelper.bytes("a"));
+        assert !bfOldFormat.isPresent(FilterTestHelper.bytes("b"));
+
+        bfInvHashes.add(FilterTestHelper.bytes("a"));
+        assert bfInvHashes.isPresent(FilterTestHelper.bytes("a"));
+        assert !bfInvHashes.isPresent(FilterTestHelper.bytes("b"));
     }
 
     @Test
     public void testFalsePositivesInt()
     {
-        FilterTestHelper.testFalsePositives(bf, FilterTestHelper.intKeys(), FilterTestHelper.randomKeys2());
+        FilterTestHelper.testFalsePositives(bfOldFormat, FilterTestHelper.intKeys(), FilterTestHelper.randomKeys2());
+
+        FilterTestHelper.testFalsePositives(bfInvHashes, FilterTestHelper.intKeys(), FilterTestHelper.randomKeys2());
     }
 
     @Test
     public void testFalsePositivesRandom()
     {
-        FilterTestHelper.testFalsePositives(bf, FilterTestHelper.randomKeys(), FilterTestHelper.randomKeys2());
+        FilterTestHelper.testFalsePositives(bfOldFormat, FilterTestHelper.randomKeys(), FilterTestHelper.randomKeys2());
+
+        FilterTestHelper.testFalsePositives(bfInvHashes, FilterTestHelper.randomKeys(), FilterTestHelper.randomKeys2());
     }
 
     @Test
@@ -116,30 +126,48 @@
         {
             return;
         }
-        IFilter bf2 = FilterFactory.getFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTestHelper.MAX_FAILURE_RATE, true);
+        IFilter bf2 = FilterFactory.getFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTestHelper.MAX_FAILURE_RATE, true, false);
         int skipEven = KeyGenerator.WordGenerator.WORDS % 2 == 0 ? 0 : 2;
         FilterTestHelper.testFalsePositives(bf2,
                                             new KeyGenerator.WordGenerator(skipEven, 2),
                                             new KeyGenerator.WordGenerator(1, 2));
         bf2.close();
+
+        // new, swapped hash values bloom filter
+        bf2 = FilterFactory.getFilter(KeyGenerator.WordGenerator.WORDS / 2, FilterTestHelper.MAX_FAILURE_RATE, true, true);
+        FilterTestHelper.testFalsePositives(bf2,
+                                            new KeyGenerator.WordGenerator(skipEven, 2),
+                                            new KeyGenerator.WordGenerator(1, 2));
+        bf2.close();
     }
 
     @Test
     public void testSerialize() throws IOException
     {
-        BloomFilterTest.testSerialize(bf).close();
+        BloomFilterTest.testSerialize(bfOldFormat, true).close();
+
+        BloomFilterTest.testSerialize(bfInvHashes, false).close();
     }
 
-    public void testManyHashes(Iterator<ByteBuffer> keys)
+    @Test
+    @Ignore
+    public void testManyRandom()
+    {
+        testManyRandom(FilterTestHelper.randomKeys(), false);
+
+        testManyRandom(FilterTestHelper.randomKeys(), true);
+    }
+
+    private static void testManyRandom(Iterator<ByteBuffer> keys, boolean oldBfHashOrder)
     {
         int MAX_HASH_COUNT = 128;
-        Set<Long> hashes = new HashSet<Long>();
+        Set<Long> hashes = new HashSet<>();
         long collisions = 0;
         while (keys.hasNext())
         {
             hashes.clear();
             FilterKey buf = FilterTestHelper.wrap(keys.next());
-            BloomFilter bf = (BloomFilter) FilterFactory.getFilter(10, 1, false);
+            BloomFilter bf = (BloomFilter) FilterFactory.getFilter(10, 1, false, oldBfHashOrder);
             for (long hashIndex : bf.getHashBuckets(buf, MAX_HASH_COUNT, 1024 * 1024))
             {
                 hashes.add(hashIndex);
@@ -147,28 +175,22 @@
             collisions += (MAX_HASH_COUNT - hashes.size());
             bf.close();
         }
-        assert collisions <= 100;
-    }
-
-    @Test
-    public void testManyRandom()
-    {
-        testManyHashes(FilterTestHelper.randomKeys());
+        Assert.assertTrue("collisions=" + collisions, collisions <= 100);
     }
 
     @Test(expected = UnsupportedOperationException.class)
     public void testOffHeapException()
     {
-        long numKeys = (Integer.MAX_VALUE * 64) + 1; // approx 128 Billion
-        FilterFactory.getFilter(numKeys, 0.01d, true);
+        long numKeys = ((long)Integer.MAX_VALUE) * 64L + 1L; // approx 128 Billion
+        FilterFactory.getFilter(numKeys, 0.01d, true, true).close();
     }
 
     @Test
-    public void compareCachedKey()
+    public void compareCachedKeyOldHashOrder()
     {
-        BloomFilter bf1 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false);
-        BloomFilter bf2 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false);
-        BloomFilter bf3 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false);
+        BloomFilter bf1 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, true);
+        BloomFilter bf2 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, true);
+        BloomFilter bf3 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, true);
 
         RandomStringGenerator gen1 = new KeyGenerator.RandomStringGenerator(new Random().nextInt(), FilterTestHelper.ELEMENTS);
 
@@ -190,13 +212,46 @@
     }
 
     @Test
+    public void compareCachedKeyNewHashOrder()
+    {
+        try (BloomFilter bf1 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, false);
+             BloomFilter bf2 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, false);
+             BloomFilter bf3 = (BloomFilter) FilterFactory.getFilter(FilterTestHelper.ELEMENTS / 2, FilterTestHelper.MAX_FAILURE_RATE, false, false))
+        {
+            RandomStringGenerator gen1 = new KeyGenerator.RandomStringGenerator(new Random().nextInt(), FilterTestHelper.ELEMENTS);
+
+            // make sure all bitsets are empty.
+            BitSetTest.compare(bf1.bitset, bf2.bitset);
+            BitSetTest.compare(bf1.bitset, bf3.bitset);
+
+            while (gen1.hasNext())
+            {
+                ByteBuffer key = gen1.next();
+                FilterKey cached = FilterTestHelper.wrapCached(key);
+                bf1.add(FilterTestHelper.wrap(key));
+                bf2.add(cached);
+                bf3.add(cached);
+            }
+
+            BitSetTest.compare(bf1.bitset, bf2.bitset);
+            BitSetTest.compare(bf1.bitset, bf3.bitset);
+        }
+    }
+
+    @Test
     @Ignore
     public void testHugeBFSerialization() throws IOException
     {
+        hugeBFSerialization(false);
+        hugeBFSerialization(true);
+    }
+
+    static void hugeBFSerialization(boolean oldBfHashOrder) throws IOException
+    {
         ByteBuffer test = ByteBuffer.wrap(new byte[] {0, 1});
 
         File file = FileUtils.createTempFile("bloomFilterTest-", ".dat");
-        BloomFilter filter = (BloomFilter) FilterFactory.getFilter(((long)Integer.MAX_VALUE / 8) + 1, 0.01d, true);
+        BloomFilter filter = (BloomFilter) FilterFactory.getFilter(((long) Integer.MAX_VALUE / 8) + 1, 0.01d, true, oldBfHashOrder);
         filter.add(FilterTestHelper.wrap(test));
         DataOutputStreamPlus out = new BufferedDataOutputStreamPlus(new FileOutputStream(file));
         FilterFactory.serialize(filter, out);
@@ -205,9 +260,10 @@
         filter.close();
 
         DataInputStream in = new DataInputStream(new FileInputStream(file));
-        BloomFilter filter2 = (BloomFilter) FilterFactory.deserialize(in, true);
+        BloomFilter filter2 = (BloomFilter) FilterFactory.deserialize(in, true, oldBfHashOrder);
         Assert.assertTrue(filter2.isPresent(FilterTestHelper.wrap(test)));
         FileUtils.closeQuietly(in);
+        filter2.close();
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java b/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java
index 2cbac92..3f34102 100644
--- a/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java
+++ b/test/unit/org/apache/cassandra/utils/ByteBufferUtilTest.java

@@ -18,12 +18,9 @@
 
 package org.apache.cassandra.utils;
 
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.io.DataInputStream;
 import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.util.Arrays;
@@ -32,6 +29,9 @@
 
 import org.apache.cassandra.io.util.DataOutputBuffer;
 
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
 public class ByteBufferUtilTest
 {
     private static final String s = "cassandra";
@@ -101,11 +101,11 @@
 
     private void checkLastIndexOf(ByteBuffer bb)
     {
-        assert bb.position() + 8 == ByteBufferUtil.lastIndexOf(bb, (byte)'a', bb.position() + 8);
-        assert bb.position() + 4 == ByteBufferUtil.lastIndexOf(bb, (byte)'a', bb.position() + 7);
-        assert bb.position() + 3 == ByteBufferUtil.lastIndexOf(bb, (byte)'s', bb.position() + 8);
-        assert -1 == ByteBufferUtil.lastIndexOf(bb, (byte)'o', bb.position() + 8);
-        assert -1 == ByteBufferUtil.lastIndexOf(bb, (byte)'d', bb.position() + 5);
+        assert bb.position() + 8 == ByteBufferUtil.lastIndexOf(bb, (byte) 'a', bb.position() + 8);
+        assert bb.position() + 4 == ByteBufferUtil.lastIndexOf(bb, (byte) 'a', bb.position() + 7);
+        assert bb.position() + 3 == ByteBufferUtil.lastIndexOf(bb, (byte) 's', bb.position() + 8);
+        assert -1 == ByteBufferUtil.lastIndexOf(bb, (byte) 'o', bb.position() + 8);
+        assert -1 == ByteBufferUtil.lastIndexOf(bb, (byte) 'd', bb.position() + 5);
     }
 
     @Test

diff --git a/test/unit/org/apache/cassandra/utils/BytesReadTrackerTest.java b/test/unit/org/apache/cassandra/utils/BytesReadTrackerTest.java
index e2e0bf2..7693b45 100644
--- a/test/unit/org/apache/cassandra/utils/BytesReadTrackerTest.java
+++ b/test/unit/org/apache/cassandra/utils/BytesReadTrackerTest.java

@@ -20,6 +20,7 @@
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
@@ -28,12 +29,44 @@
 
 import org.junit.Test;
 
+import org.apache.cassandra.io.util.BytesReadTracker;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.TrackedDataInputPlus;
+import org.apache.cassandra.io.util.TrackedInputStream;
+
 public class BytesReadTrackerTest
 {
 
     @Test
     public void testBytesRead() throws Exception
     {
+        internalTestBytesRead(true);
+        internalTestBytesRead(false);
+    }
+
+    @Test
+    public void testUnsignedRead() throws Exception
+    {
+        internalTestUnsignedRead(true);
+        internalTestUnsignedRead(false);
+    }
+
+    @Test
+    public void testSkipBytesAndReadFully() throws Exception
+    {
+        internalTestSkipBytesAndReadFully(true);
+        internalTestSkipBytesAndReadFully(false);
+    }
+
+    @Test
+    public void testReadLine() throws Exception
+    {
+        internalTestReadLine(true);
+        internalTestReadLine(false);
+    }
+
+    public void internalTestBytesRead(boolean inputStream) throws Exception
+    {
         byte[] testData;
 
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
@@ -66,45 +99,46 @@
             out.close();
         }
 
-        DataInputStream in = new DataInputStream(new ByteArrayInputStream(testData));
-        BytesReadTracker tracker = new BytesReadTracker(in);
+        DataInputPlus.DataInputStreamPlus in = new DataInputPlus.DataInputStreamPlus(new ByteArrayInputStream(testData));
+        BytesReadTracker tracker = inputStream? new TrackedInputStream(in) : new TrackedDataInputPlus(in);
+        DataInputPlus reader = inputStream? new DataInputPlus.DataInputStreamPlus((TrackedInputStream)tracker) : (DataInputPlus) tracker;
 
         try
         {
             // boolean = 1byte
-            boolean bool = tracker.readBoolean();
+            boolean bool = reader.readBoolean();
             assertTrue(bool);
             assertEquals(1, tracker.getBytesRead());
             // byte = 1byte
-            byte b = tracker.readByte();
+            byte b = reader.readByte();
             assertEquals(b, 0x1);
             assertEquals(2, tracker.getBytesRead());
             // char = 2byte
-            char c = tracker.readChar();
+            char c = reader.readChar();
             assertEquals('a', c);
             assertEquals(4, tracker.getBytesRead());
             // short = 2bytes
-            short s = tracker.readShort();
+            short s = reader.readShort();
             assertEquals(1, s);
             assertEquals((short) 6, tracker.getBytesRead());
             // int = 4bytes
-            int i = tracker.readInt();
+            int i = reader.readInt();
             assertEquals(1, i);
             assertEquals(10, tracker.getBytesRead());
             // long = 8bytes
-            long l = tracker.readLong();
+            long l = reader.readLong();
             assertEquals(1L, l);
             assertEquals(18, tracker.getBytesRead());
             // float = 4bytes
-            float f = tracker.readFloat();
+            float f = reader.readFloat();
             assertEquals(1.0f, f, 0);
             assertEquals(22, tracker.getBytesRead());
             // double = 8bytes
-            double d = tracker.readDouble();
+            double d = reader.readDouble();
             assertEquals(1.0d, d, 0);
             assertEquals(30, tracker.getBytesRead());
             // String("abc") = 2(string size) + 3 = 5 bytes
-            String str = tracker.readUTF();
+            String str = reader.readUTF();
             assertEquals("abc", str);
             assertEquals(35, tracker.getBytesRead());
 
@@ -119,8 +153,7 @@
         assertEquals(0, tracker.getBytesRead());
     }
 
-    @Test
-    public void testUnsignedRead() throws Exception
+    public void internalTestUnsignedRead(boolean inputStream) throws Exception
     {
         byte[] testData;
 
@@ -139,17 +172,18 @@
             out.close();
         }
 
-        DataInputStream in = new DataInputStream(new ByteArrayInputStream(testData));
-        BytesReadTracker tracker = new BytesReadTracker(in);
+        DataInputPlus.DataInputStreamPlus in = new DataInputPlus.DataInputStreamPlus(new ByteArrayInputStream(testData));
+        BytesReadTracker tracker = inputStream? new TrackedInputStream(in) : new TrackedDataInputPlus(in);
+        DataInputPlus reader = inputStream? new DataInputPlus.DataInputStreamPlus((TrackedInputStream)tracker) : (DataInputPlus) tracker;
 
         try
         {
             // byte = 1byte
-            int b = tracker.readUnsignedByte();
+            int b = reader.readUnsignedByte();
             assertEquals(b, 1);
             assertEquals(1, tracker.getBytesRead());
             // short = 2bytes
-            int s = tracker.readUnsignedShort();
+            int s = reader.readUnsignedShort();
             assertEquals(1, s);
             assertEquals(3, tracker.getBytesRead());
 
@@ -161,30 +195,30 @@
         }
     }
 
-    @Test
-    public void testSkipBytesAndReadFully() throws Exception
+    public void internalTestSkipBytesAndReadFully(boolean inputStream) throws Exception
     {
         String testStr = "1234567890";
         byte[] testData = testStr.getBytes();
 
-        DataInputStream in = new DataInputStream(new ByteArrayInputStream(testData));
-        BytesReadTracker tracker = new BytesReadTracker(in);
+        DataInputPlus.DataInputStreamPlus in = new DataInputPlus.DataInputStreamPlus(new ByteArrayInputStream(testData));
+        BytesReadTracker tracker = inputStream? new TrackedInputStream(in) : new TrackedDataInputPlus(in);
+        DataInputPlus reader = inputStream? new DataInputPlus.DataInputStreamPlus((TrackedInputStream)tracker) : (DataInputPlus) tracker;
 
         try
         {
             // read first 5 bytes
             byte[] out = new byte[5];
-            tracker.readFully(out, 0, 5);
+            reader.readFully(out, 0, 5);
             assertEquals("12345", new String(out));
             assertEquals(5, tracker.getBytesRead());
 
             // then skip 2 bytes
-            tracker.skipBytes(2);
+            reader.skipBytes(2);
             assertEquals(7, tracker.getBytesRead());
 
             // and read the rest
             out = new byte[3];
-            tracker.readFully(out);
+            reader.readFully(out);
             assertEquals("890", new String(out));
             assertEquals(10, tracker.getBytesRead());
 
@@ -196,16 +230,24 @@
         }
     }
 
-    @Test(expected = UnsupportedOperationException.class)
-    public void testReadLine() throws Exception
+    public void internalTestReadLine(boolean inputStream) throws Exception
     {
         DataInputStream in = new DataInputStream(new ByteArrayInputStream("1".getBytes()));
-        BytesReadTracker tracker = new BytesReadTracker(in);
+        BytesReadTracker tracker = inputStream? new TrackedInputStream(in) : new TrackedDataInputPlus(in);
+        DataInputPlus reader = inputStream? new DataInputPlus.DataInputStreamPlus((TrackedInputStream)tracker) : (DataInputPlus) tracker;
 
         try
         {
-            // throws UnsupportedOperationException
-            tracker.readLine();
+            String line = reader.readLine();
+            if (inputStream)
+                assertEquals(line, "1");
+            else
+                fail("Should have thrown UnsupportedOperationException");
+        }
+        catch (UnsupportedOperationException e)
+        {
+            if (inputStream)
+                fail("Should have not thrown UnsupportedOperationException");
         }
         finally
         {

diff --git a/test/unit/org/apache/cassandra/utils/CLibraryTest.java b/test/unit/org/apache/cassandra/utils/CLibraryTest.java
deleted file mode 100644
index be52bed..0000000
--- a/test/unit/org/apache/cassandra/utils/CLibraryTest.java
+++ /dev/null

@@ -1,37 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.io.File;
-
-import org.junit.Test;
-
-import org.apache.cassandra.io.util.FileUtils;
-
-public class CLibraryTest
-{
-    @Test
-    public void testSkipCache()
-    {
-        File file = FileUtils.createTempFile("testSkipCache", "1");
-
-        int fd = CLibrary.getfd(file.getPath());
-        CLibrary.trySkipCache(fd, 0, 0);
-    }
-}

diff --git a/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java b/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java
deleted file mode 100644
index c23ef53..0000000
--- a/test/unit/org/apache/cassandra/utils/EncodedStreamsTest.java
+++ /dev/null

@@ -1,161 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import static org.apache.cassandra.Util.*;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-
-import org.apache.cassandra.SchemaLoader;
-import org.apache.cassandra.config.CFMetaData;
-import org.apache.cassandra.config.KSMetaData;
-import org.apache.cassandra.db.ArrayBackedSortedColumns;
-import org.apache.cassandra.db.ColumnFamily;
-import org.apache.cassandra.db.TypeSizes;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CounterColumnType;
-import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.locator.SimpleStrategy;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.utils.vint.EncodedDataInputStream;
-import org.apache.cassandra.utils.vint.EncodedDataOutputStream;
-
-import org.junit.Assert;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-public class EncodedStreamsTest
-{
-    private static final String KEYSPACE1 = "Keyspace1";
-    private static final String CF_STANDARD = "Standard1";
-    private static final String CF_COUNTER = "Counter1";
-    private int version = MessagingService.current_version;
-
-    @BeforeClass
-    public static void defineSchema() throws ConfigurationException
-    {
-    SchemaLoader.prepareServer();
-    SchemaLoader.createKeyspace(KEYSPACE1,
-                                SimpleStrategy.class,
-                                KSMetaData.optsWithRF(1),
-                                SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                SchemaLoader.standardCFMD(KEYSPACE1, CF_COUNTER)
-                                            .defaultValidator(CounterColumnType.instance));
-    }
-
-    @Test
-    public void testStreams() throws IOException
-    {
-        ByteArrayOutputStream byteArrayOStream1 = new ByteArrayOutputStream();
-        EncodedDataOutputStream odos = new EncodedDataOutputStream(byteArrayOStream1);
-
-        ByteArrayOutputStream byteArrayOStream2 = new ByteArrayOutputStream();
-        DataOutputStream out = new DataOutputStream(byteArrayOStream2);
-        
-        for (short i = 0; i < 10000; i++)
-        {
-            out.writeShort(i);
-            odos.writeShort(i);
-        }
-        out.flush();
-        odos.flush();
-
-        for (int i = Short.MAX_VALUE; i < ((int)Short.MAX_VALUE + 10000); i++)
-        {
-            out.writeInt(i);
-            odos.writeInt(i);
-        }
-        out.flush();
-        odos.flush();
-
-        for (long i = Integer.MAX_VALUE; i < ((long)Integer.MAX_VALUE + 10000);i++)
-        {
-            out.writeLong(i);
-            odos.writeLong(i);
-        }
-        out.flush();
-        odos.flush();
-        Assert.assertTrue(byteArrayOStream1.size() < byteArrayOStream2.size());
-
-        ByteArrayInputStream byteArrayIStream1 = new ByteArrayInputStream(byteArrayOStream1.toByteArray());
-        EncodedDataInputStream idis = new EncodedDataInputStream(new DataInputStream(byteArrayIStream1));
-
-        // assert reading Short
-        for (int i = 0; i < 10000; i++)
-            Assert.assertEquals(i, idis.readShort());
-
-        // assert reading Integer
-        for (int i = Short.MAX_VALUE; i < ((int)Short.MAX_VALUE + 10000); i++)
-            Assert.assertEquals(i, idis.readInt());
-
-        // assert reading Long
-        for (long i = Integer.MAX_VALUE; i < ((long)Integer.MAX_VALUE) + 1000; i++)
-            Assert.assertEquals(i, idis.readLong());
-    }
-
-    private ColumnFamily createCF()
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_STANDARD);
-        cf.addColumn(column("vijay", "try", 1));
-        cf.addColumn(column("to", "be_nice", 1));
-        return cf;
-    }
-
-    private ColumnFamily createCounterCF()
-    {
-        ColumnFamily cf = ArrayBackedSortedColumns.factory.create(KEYSPACE1, CF_COUNTER);
-        cf.addCounter(cellname("vijay"), 1);
-        cf.addCounter(cellname("wants"), 1000000);
-        return cf;
-    }
-
-    @Test
-    public void testCFSerialization() throws IOException
-    {
-        ByteArrayOutputStream byteArrayOStream1 = new ByteArrayOutputStream();
-        EncodedDataOutputStream odos = new EncodedDataOutputStream(byteArrayOStream1);
-        ColumnFamily.serializer.serialize(createCF(), odos, version);
-
-        ByteArrayInputStream byteArrayIStream1 = new ByteArrayInputStream(byteArrayOStream1.toByteArray());
-        EncodedDataInputStream odis = new EncodedDataInputStream(new DataInputStream(byteArrayIStream1));
-        ColumnFamily cf = ColumnFamily.serializer.deserialize(odis, version);
-        Assert.assertEquals(cf, createCF());
-        Assert.assertEquals(byteArrayOStream1.size(), (int) ColumnFamily.serializer.serializedSize(cf, TypeSizes.VINT, version));
-    }
-
-    @Test
-    public void testCounterCFSerialization() throws IOException
-    {
-        ColumnFamily counterCF = createCounterCF();
-
-        ByteArrayOutputStream byteArrayOStream1 = new ByteArrayOutputStream();
-        EncodedDataOutputStream odos = new EncodedDataOutputStream(byteArrayOStream1);
-        ColumnFamily.serializer.serialize(counterCF, odos, version);
-
-        ByteArrayInputStream byteArrayIStream1 = new ByteArrayInputStream(byteArrayOStream1.toByteArray());
-        EncodedDataInputStream odis = new EncodedDataInputStream(new DataInputStream(byteArrayIStream1));
-        ColumnFamily cf = ColumnFamily.serializer.deserialize(odis, version);
-        Assert.assertEquals(cf, counterCF);
-        Assert.assertEquals(byteArrayOStream1.size(), (int) ColumnFamily.serializer.serializedSize(cf, TypeSizes.VINT, version));
-    }
-}
-

diff --git a/test/unit/org/apache/cassandra/utils/EstimatedHistogramTest.java b/test/unit/org/apache/cassandra/utils/EstimatedHistogramTest.java
index b0e0e4f..b3fbfb6 100644
--- a/test/unit/org/apache/cassandra/utils/EstimatedHistogramTest.java
+++ b/test/unit/org/apache/cassandra/utils/EstimatedHistogramTest.java

@@ -20,7 +20,7 @@
 
 import org.junit.Test;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
 
 
 public class EstimatedHistogramTest

diff --git a/test/unit/org/apache/cassandra/utils/FBUtilitiesTest.java b/test/unit/org/apache/cassandra/utils/FBUtilitiesTest.java
index 5b86252..f13d076 100644
--- a/test/unit/org/apache/cassandra/utils/FBUtilitiesTest.java
+++ b/test/unit/org/apache/cassandra/utils/FBUtilitiesTest.java

@@ -18,13 +18,22 @@
 
 package org.apache.cassandra.utils;
 
-import static org.junit.Assert.fail;
-
 import java.io.IOException;
 import java.net.InetAddress;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
 import com.google.common.primitives.Ints;
 import org.junit.Test;
 
@@ -35,6 +44,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
 public class FBUtilitiesTest
 {
@@ -127,4 +137,45 @@
 
         FBUtilities.reset();
     }
+
+    @Test
+    public void testWaitFirstFuture() throws ExecutionException, InterruptedException
+    {
+        final int threadCount = 10;
+        ExecutorService executor = Executors.newFixedThreadPool(threadCount);
+        try
+        {
+            List<Future<?>> futures = new ArrayList<>(threadCount);
+            List<CountDownLatch> latches = new ArrayList<>(threadCount);
+
+            for (int i = 0; i < threadCount; i++)
+            {
+                CountDownLatch latch = new CountDownLatch(1);
+                latches.add(latch);
+                int finalI = i;
+                futures.add(executor.submit(() -> {
+                    latch.await(10, TimeUnit.SECONDS);
+                    // Sleep to emulate "work" done by the future to make it not return immediately
+                    // after counting down the latch in order to test for delay and spinning done
+                    // in FBUtilities#waitOnFirstFuture.
+                    TimeUnit.MILLISECONDS.sleep(10);
+                    return latch.getCount() == 0 ? finalI : -1;
+                }));
+            }
+
+            for (int i = 0; i < threadCount; i++)
+            {
+                latches.get(i).countDown();
+                Future<?> fut = FBUtilities.waitOnFirstFuture(futures, 3);
+                int futSleep = (Integer) fut.get();
+                assertEquals(futSleep, i);
+                futures.remove(fut);
+            }
+        }
+        finally
+        {
+            executor.shutdown();
+        }
+    }
+
 }

diff --git a/test/unit/org/apache/cassandra/utils/FreeRunningClock.java b/test/unit/org/apache/cassandra/utils/FreeRunningClock.java
new file mode 100644
index 0000000..83c8db7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/FreeRunningClock.java

@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * A freely adjustable clock that can be used for unit testing. See {@link Clock#instance} how to
+ * enable this class.
+ */
+public class FreeRunningClock extends Clock
+{
+    private long nanoTime = 0;
+
+    @Override
+    public long nanoTime()
+    {
+        return nanoTime;
+    }
+
+    @Override
+    public long currentTimeMillis()
+    {
+        return TimeUnit.NANOSECONDS.toMillis(nanoTime());
+    }
+
+    public void advance(long time, TimeUnit unit)
+    {
+        nanoTime += unit.toNanos(time);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/utils/HexTest.java b/test/unit/org/apache/cassandra/utils/HexTest.java
index ad64668..db93c08 100644
--- a/test/unit/org/apache/cassandra/utils/HexTest.java
+++ b/test/unit/org/apache/cassandra/utils/HexTest.java

@@ -18,11 +18,12 @@
 
 package org.apache.cassandra.utils;
 
-import static org.junit.Assert.assertArrayEquals;
-
 import java.util.Arrays;
+
 import org.junit.Test;
 
+import static org.junit.Assert.assertArrayEquals;
+
 public class HexTest
 {
     @Test

diff --git a/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
index dfceaf3..4e7c439 100644
--- a/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java
+++ b/test/unit/org/apache/cassandra/utils/HistogramBuilderTest.java

@@ -21,7 +21,7 @@
 
 import org.junit.Test;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertArrayEquals;
 
 public class HistogramBuilderTest
 {

diff --git a/test/unit/org/apache/cassandra/utils/IntegerIntervalsTest.java b/test/unit/org/apache/cassandra/utils/IntegerIntervalsTest.java
new file mode 100644
index 0000000..44843fd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/IntegerIntervalsTest.java

@@ -0,0 +1,326 @@
+package org.apache.cassandra.utils;
+
+import static org.junit.Assert.*;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.Futures;
+
+import org.junit.Test;
+
+import org.apache.cassandra.utils.IntegerInterval.Set;
+
+public class IntegerIntervalsTest
+{
+    int[] values = new int[] { Integer.MIN_VALUE, -2, -1, 0, 5, 9, 13, Integer.MAX_VALUE };
+
+    @Test
+    public void testMake()
+    {
+        IntegerInterval iv;
+        for (int i = 0; i < values.length; ++i)
+        {
+            for (int j = i; j < values.length; ++j)
+            {
+                iv = new IntegerInterval(values[i], values[j]);
+                assertEquals(values[i], iv.lower());
+                assertEquals(values[j], iv.upper());
+            }
+        }
+
+        for (int i = 0; i < values.length; ++i)
+        {
+            for (int j = 0; j < i; ++j)
+            {
+                try
+                {
+                    iv = new IntegerInterval(values[i], values[j]);
+                    fail("Assertion not thrown: " + values[i] + ", " + values[j]);
+                }
+                catch (AssertionError e)
+                {
+                    // expected
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testExpandToCoverSingleThread()
+    {
+        IntegerInterval iv;
+        for (int i = 0; i < values.length; ++i)
+        {
+            for (int j = i; j < values.length; ++j)
+            {
+                iv = new IntegerInterval(values[i], values[j]);
+                int k = 0;
+                for (; k < i; ++k)
+                {
+                    IntegerInterval v = new IntegerInterval(iv);
+                    v.expandToCover(values[k]);
+                    assertEquals(values[k], v.lower());
+                    assertEquals(values[j], v.upper());
+                }
+                for (; k < j; ++k)
+                {
+                    IntegerInterval v = new IntegerInterval(iv);
+                    v.expandToCover(values[k]);
+                    assertEquals(values[i], v.lower());
+                    assertEquals(values[j], v.upper());
+                }
+                for (; k < values.length; ++k)
+                {
+                    IntegerInterval v = new IntegerInterval(iv);
+                    v.expandToCover(values[k]);
+                    assertEquals(values[i], v.lower());
+                    assertEquals(values[k], v.upper());
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testExpandToCoverMultiThread() throws InterruptedException
+    {
+        Random r = new Random();
+        int threads = 16;
+        int streamSize = 1000000;
+        List<Callable<Void>> tasks = new ArrayList<>(threads);
+        final IntegerInterval interval = new IntegerInterval(0, 0);
+        int min = 0;
+        int max = 0;
+        for (int i = 0; i < threads; ++i)
+        {
+            final int seed = r.nextInt();
+            tasks.add(() -> {
+                new Random(seed).ints(streamSize).forEach(v -> interval.expandToCover(v));
+                return null;
+            });
+            min = Math.min(min, new Random(seed).ints(streamSize).min().getAsInt());
+            max = Math.max(max, new Random(seed).ints(streamSize).max().getAsInt());
+        }
+        for (Future<?> f : Executors.newFixedThreadPool(threads).invokeAll(tasks))
+            Futures.getUnchecked(f);
+        assertEquals(min, interval.lower());
+        assertEquals(max, interval.upper());
+    }
+
+    void testSetAdd(int l, int r, Integer... expected)
+    {
+        Set s = new Set();
+        s.add(-3, -1);
+        s.add(1, 3);
+        s.add(l, r);
+        assertArrayEquals(expected, s
+                                    .intervals()
+                                    .stream()
+                                    .flatMap(x -> ImmutableList.of(x.lower(), x.upper()).stream())
+                                    .toArray());
+    }
+
+    void testSetAdd(int l, int r, String expected)
+    {
+        Set s = new Set();
+        s.add(-3, -1);
+        s.add(1, 3);
+        s.add(l, r);
+        assertEquals(expected, s.toString());
+    }
+
+    @Test
+    public void testSetAdd()
+    {
+        testSetAdd(Integer.MIN_VALUE, -4, Integer.MIN_VALUE, -4, -3, -1, 1, 3);
+        testSetAdd(Integer.MIN_VALUE, -3, Integer.MIN_VALUE, -1, 1, 3);
+        testSetAdd(Integer.MIN_VALUE, -2, Integer.MIN_VALUE, -1, 1, 3);
+        testSetAdd(Integer.MIN_VALUE, -1, Integer.MIN_VALUE, -1, 1, 3);
+        testSetAdd(Integer.MIN_VALUE, 0, Integer.MIN_VALUE, 0, 1, 3);
+        testSetAdd(Integer.MIN_VALUE, 1, Integer.MIN_VALUE, 3);
+        testSetAdd(Integer.MIN_VALUE, 2, Integer.MIN_VALUE, 3);
+        testSetAdd(Integer.MIN_VALUE, 3, Integer.MIN_VALUE, 3);
+        testSetAdd(Integer.MIN_VALUE, Integer.MAX_VALUE, Integer.MIN_VALUE, Integer.MAX_VALUE);
+
+        testSetAdd(-5, -4, "[[-5,-4], [-3,-1], [1,3]]");
+        testSetAdd(-5, -3, -5, -1, 1, 3);
+        testSetAdd(-5, -2, -5, -1, 1, 3);
+        testSetAdd(-5, -1, -5, -1, 1, 3);
+        testSetAdd(-5, 0, -5, 0, 1, 3);
+        testSetAdd(-5, 1, -5, 3);
+        testSetAdd(-5, 2, -5, 3);
+        testSetAdd(-5, 3, -5, 3);
+        testSetAdd(-5, 4, -5, 4);
+        testSetAdd(-5, Integer.MAX_VALUE, -5, Integer.MAX_VALUE);
+
+        testSetAdd(-3, -3, -3, -1, 1, 3);
+        testSetAdd(-3, -2, -3, -1, 1, 3);
+        testSetAdd(-3, -1, -3, -1, 1, 3);
+        testSetAdd(-3, 0, -3, 0, 1, 3);
+        testSetAdd(-3, 1, "[[-3,3]]");
+        testSetAdd(-3, 2, -3, 3);
+        testSetAdd(-3, 3, -3, 3);
+        testSetAdd(-3, 4, -3, 4);
+        testSetAdd(-3, Integer.MAX_VALUE, -3, Integer.MAX_VALUE);
+
+        testSetAdd(-2, -2, -3, -1, 1, 3);
+        testSetAdd(-2, -1, -3, -1, 1, 3);
+        testSetAdd(-2, 0, "[[-3,0], [1,3]]");
+        testSetAdd(-2, 1, -3, 3);
+        testSetAdd(-2, 2, -3, 3);
+        testSetAdd(-2, 3, -3, 3);
+        testSetAdd(-2, 4, -3, 4);
+        testSetAdd(-2, Integer.MAX_VALUE, -3, Integer.MAX_VALUE);
+
+        testSetAdd(-1, -1, -3, -1, 1, 3);
+        testSetAdd(-1, 0, -3, 0, 1, 3);
+        testSetAdd(-1, 1, -3, 3);
+        testSetAdd(-1, 2, -3, 3);
+        testSetAdd(-1, 3, -3, 3);
+        testSetAdd(-1, 4, -3, 4);
+        testSetAdd(-1, Integer.MAX_VALUE, -3, Integer.MAX_VALUE);
+
+        testSetAdd(0, 0, -3, -1, 0, 0, 1, 3);
+        testSetAdd(0, 1, -3, -1, 0, 3);
+        testSetAdd(0, 2, -3, -1, 0, 3);
+        testSetAdd(0, 3, -3, -1, 0, 3);
+        testSetAdd(0, 4, -3, -1, 0, 4);
+        testSetAdd(0, Integer.MAX_VALUE, -3, -1, 0, Integer.MAX_VALUE);
+
+        testSetAdd(1, 1, -3, -1, 1, 3);
+        testSetAdd(1, 2, -3, -1, 1, 3);
+        testSetAdd(1, 3, -3, -1, 1, 3);
+        testSetAdd(1, 4, -3, -1, 1, 4);
+        testSetAdd(1, Integer.MAX_VALUE, -3, -1, 1, Integer.MAX_VALUE);
+
+        testSetAdd(2, 2, -3, -1, 1, 3);
+        testSetAdd(2, 3, -3, -1, 1, 3);
+        testSetAdd(2, 4, -3, -1, 1, 4);
+        testSetAdd(2, Integer.MAX_VALUE, -3, -1, 1, Integer.MAX_VALUE);
+
+        testSetAdd(3, 3, "[[-3,-1], [1,3]]");
+        testSetAdd(3, 4, -3, -1, 1, 4);
+        testSetAdd(3, Integer.MAX_VALUE, -3, -1, 1, Integer.MAX_VALUE);
+
+        testSetAdd(4, 5, -3, -1, 1, 3, 4, 5);
+        testSetAdd(4, Integer.MAX_VALUE, -3, -1, 1, 3, 4, Integer.MAX_VALUE);
+    }
+
+    @Test
+    public void testSetAddMultiThread() throws InterruptedException
+    {
+        Random r = new Random();
+        int threads = 16;
+        int streamSize = 10000;
+        List<Callable<Void>> tasks = new ArrayList<>(threads);
+        final IntegerInterval.Set st = new IntegerInterval.Set();
+        final IntegerInterval.Set mt = new IntegerInterval.Set();
+        for (int i = 0; i < threads; ++i)
+        {
+            final int seed = r.nextInt();
+            tasks.add(() -> {
+                new Random(seed)
+                    .ints(streamSize)
+                    .forEach(v -> mt.add(v, v + 5));
+                return null;
+            });
+            new Random(seed)
+                .ints(streamSize)
+                .forEach(v -> st.add(v, v + 5));
+        }
+        for (Future<?> f : Executors.newFixedThreadPool(threads).invokeAll(tasks))
+            Futures.getUnchecked(f);
+        assertEquals(st, mt);
+    }
+
+    void testSetCovers(int l, int r, boolean expected)
+    {
+        Set s = new Set();
+        s.add(-3, -1);
+        s.add(1, 3);
+        assertEquals(expected, s.covers(new IntegerInterval(l, r)));
+    }
+
+
+    @Test
+    public void testSetCovers()
+    {
+        testSetCovers(Integer.MIN_VALUE, -4, false);
+        testSetCovers(Integer.MIN_VALUE, -3, false);
+        testSetCovers(Integer.MIN_VALUE, -2, false);
+        testSetCovers(Integer.MIN_VALUE, -1, false);
+        testSetCovers(Integer.MIN_VALUE, 0, false);
+        testSetCovers(Integer.MIN_VALUE, 1, false);
+        testSetCovers(Integer.MIN_VALUE, 2, false);
+        testSetCovers(Integer.MIN_VALUE, 3, false);
+        testSetCovers(Integer.MIN_VALUE, Integer.MAX_VALUE, false);
+
+        testSetCovers(-5, -4, false);
+        testSetCovers(-5, -3, false);
+        testSetCovers(-5, -2, false);
+        testSetCovers(-5, -1, false);
+        testSetCovers(-5, 0, false);
+        testSetCovers(-5, 1, false);
+        testSetCovers(-5, 2, false);
+        testSetCovers(-5, 3, false);
+        testSetCovers(-5, 4, false);
+        testSetCovers(-5, Integer.MAX_VALUE, false);
+
+        testSetCovers(-3, -3, true);
+        testSetCovers(-3, -2, true);
+        testSetCovers(-3, -1, true);
+        testSetCovers(-3, 0, false);
+        testSetCovers(-3, 1, false);
+        testSetCovers(-3, 2, false);
+        testSetCovers(-3, 3, false);
+        testSetCovers(-3, 4, false);
+        testSetCovers(-3, Integer.MAX_VALUE, false);
+
+        testSetCovers(-2, -2, true);
+        testSetCovers(-2, -1, true);
+        testSetCovers(-2, 0, false);
+        testSetCovers(-2, 1, false);
+        testSetCovers(-2, 2, false);
+        testSetCovers(-2, 3, false);
+        testSetCovers(-2, 4, false);
+        testSetCovers(-2, Integer.MAX_VALUE, false);
+
+        testSetCovers(-1, -1, true);
+        testSetCovers(-1, 0, false);
+        testSetCovers(-1, 1, false);
+        testSetCovers(-1, 2, false);
+        testSetCovers(-1, 3, false);
+        testSetCovers(-1, 4, false);
+        testSetCovers(-1, Integer.MAX_VALUE, false);
+
+        testSetCovers(0, 0, false);
+        testSetCovers(0, 1, false);
+        testSetCovers(0, 2, false);
+        testSetCovers(0, 3, false);
+        testSetCovers(0, 4, false);
+        testSetCovers(0, Integer.MAX_VALUE, false);
+
+        testSetCovers(1, 1, true);
+        testSetCovers(1, 2, true);
+        testSetCovers(1, 3, true);
+        testSetCovers(1, 4, false);
+        testSetCovers(1, Integer.MAX_VALUE, false);
+
+        testSetCovers(2, 2, true);
+        testSetCovers(2, 3, true);
+        testSetCovers(2, 4, false);
+        testSetCovers(2, Integer.MAX_VALUE, false);
+
+        testSetCovers(3, 3, true);
+        testSetCovers(3, 4, false);
+        testSetCovers(3, Integer.MAX_VALUE, false);
+
+        testSetCovers(4, 5, false);
+        testSetCovers(4, Integer.MAX_VALUE, false);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java b/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java
index 1f66fb7..7e72098 100644
--- a/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java

@@ -21,22 +21,22 @@
  */
 
 
-import org.junit.Test;
-
+import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
-import java.io.*;
 
-import static org.junit.Assert.*;
-
-import org.apache.cassandra.db.TypeSizes;
+import org.junit.Test;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
+import static org.junit.Assert.assertEquals;
+
 public class IntervalTreeTest
 {
     @Test
@@ -46,27 +46,27 @@
 
         intervals.add(Interval.<Integer, Void>create(-300, -200));
         intervals.add(Interval.<Integer, Void>create(-3, -2));
-        intervals.add(Interval.<Integer, Void>create(1,2));
-        intervals.add(Interval.<Integer, Void>create(3,6));
-        intervals.add(Interval.<Integer, Void>create(2,4));
-        intervals.add(Interval.<Integer, Void>create(5,7));
-        intervals.add(Interval.<Integer, Void>create(1,3));
-        intervals.add(Interval.<Integer, Void>create(4,6));
-        intervals.add(Interval.<Integer, Void>create(8,9));
-        intervals.add(Interval.<Integer, Void>create(15,20));
-        intervals.add(Interval.<Integer, Void>create(40,50));
-        intervals.add(Interval.<Integer, Void>create(49,60));
+        intervals.add(Interval.<Integer, Void>create(1, 2));
+        intervals.add(Interval.<Integer, Void>create(3, 6));
+        intervals.add(Interval.<Integer, Void>create(2, 4));
+        intervals.add(Interval.<Integer, Void>create(5, 7));
+        intervals.add(Interval.<Integer, Void>create(1, 3));
+        intervals.add(Interval.<Integer, Void>create(4, 6));
+        intervals.add(Interval.<Integer, Void>create(8, 9));
+        intervals.add(Interval.<Integer, Void>create(15, 20));
+        intervals.add(Interval.<Integer, Void>create(40, 50));
+        intervals.add(Interval.<Integer, Void>create(49, 60));
 
 
         IntervalTree<Integer, Void, Interval<Integer, Void>> it = IntervalTree.build(intervals);
 
-        assertEquals(3, it.search(Interval.<Integer, Void>create(4,4)).size());
+        assertEquals(3, it.search(Interval.<Integer, Void>create(4, 4)).size());
         assertEquals(4, it.search(Interval.<Integer, Void>create(4, 5)).size());
-        assertEquals(7, it.search(Interval.<Integer, Void>create(-1,10)).size());
-        assertEquals(0, it.search(Interval.<Integer, Void>create(-1,-1)).size());
-        assertEquals(5, it.search(Interval.<Integer, Void>create(1,4)).size());
-        assertEquals(2, it.search(Interval.<Integer, Void>create(0,1)).size());
-        assertEquals(0, it.search(Interval.<Integer, Void>create(10,12)).size());
+        assertEquals(7, it.search(Interval.<Integer, Void>create(-1, 10)).size());
+        assertEquals(0, it.search(Interval.<Integer, Void>create(-1, -1)).size());
+        assertEquals(5, it.search(Interval.<Integer, Void>create(1, 4)).size());
+        assertEquals(2, it.search(Interval.<Integer, Void>create(0, 1)).size());
+        assertEquals(0, it.search(Interval.<Integer, Void>create(10, 12)).size());
 
         List<Interval<Integer, Void>> intervals2 = new ArrayList<Interval<Integer, Void>>();
 
@@ -102,16 +102,16 @@
 
         intervals.add(Interval.<Integer, Void>create(-300, -200));
         intervals.add(Interval.<Integer, Void>create(-3, -2));
-        intervals.add(Interval.<Integer, Void>create(1,2));
-        intervals.add(Interval.<Integer, Void>create(3,6));
-        intervals.add(Interval.<Integer, Void>create(2,4));
-        intervals.add(Interval.<Integer, Void>create(5,7));
-        intervals.add(Interval.<Integer, Void>create(1,3));
-        intervals.add(Interval.<Integer, Void>create(4,6));
-        intervals.add(Interval.<Integer, Void>create(8,9));
-        intervals.add(Interval.<Integer, Void>create(15,20));
-        intervals.add(Interval.<Integer, Void>create(40,50));
-        intervals.add(Interval.<Integer, Void>create(49,60));
+        intervals.add(Interval.<Integer, Void>create(1, 2));
+        intervals.add(Interval.<Integer, Void>create(3, 6));
+        intervals.add(Interval.<Integer, Void>create(2, 4));
+        intervals.add(Interval.<Integer, Void>create(5, 7));
+        intervals.add(Interval.<Integer, Void>create(1, 3));
+        intervals.add(Interval.<Integer, Void>create(4, 6));
+        intervals.add(Interval.<Integer, Void>create(8, 9));
+        intervals.add(Interval.<Integer, Void>create(15, 20));
+        intervals.add(Interval.<Integer, Void>create(40, 50));
+        intervals.add(Interval.<Integer, Void>create(49, 60));
 
         IntervalTree<Integer, Void, Interval<Integer, Void>> it = IntervalTree.build(intervals);
 
@@ -131,40 +131,62 @@
 
         intervals.add(Interval.<Integer, String>create(-300, -200, "a"));
         intervals.add(Interval.<Integer, String>create(-3, -2, "b"));
-        intervals.add(Interval.<Integer, String>create(1,2, "c"));
-        intervals.add(Interval.<Integer, String>create(1,3, "d"));
-        intervals.add(Interval.<Integer, String>create(2,4, "e"));
-        intervals.add(Interval.<Integer, String>create(3,6, "f"));
-        intervals.add(Interval.<Integer, String>create(4,6, "g"));
-        intervals.add(Interval.<Integer, String>create(5,7, "h"));
-        intervals.add(Interval.<Integer, String>create(8,9, "i"));
-        intervals.add(Interval.<Integer, String>create(15,20, "j"));
-        intervals.add(Interval.<Integer, String>create(40,50, "k"));
-        intervals.add(Interval.<Integer, String>create(49,60, "l"));
+        intervals.add(Interval.<Integer, String>create(1, 2, "c"));
+        intervals.add(Interval.<Integer, String>create(1, 3, "d"));
+        intervals.add(Interval.<Integer, String>create(2, 4, "e"));
+        intervals.add(Interval.<Integer, String>create(3, 6, "f"));
+        intervals.add(Interval.<Integer, String>create(4, 6, "g"));
+        intervals.add(Interval.<Integer, String>create(5, 7, "h"));
+        intervals.add(Interval.<Integer, String>create(8, 9, "i"));
+        intervals.add(Interval.<Integer, String>create(15, 20, "j"));
+        intervals.add(Interval.<Integer, String>create(40, 50, "k"));
+        intervals.add(Interval.<Integer, String>create(49, 60, "l"));
 
         IntervalTree<Integer, String, Interval<Integer, String>> it = IntervalTree.build(intervals);
 
         IVersionedSerializer<IntervalTree<Integer, String, Interval<Integer, String>>> serializer = IntervalTree.serializer(
-            new ISerializer<Integer>()
-            {
-                public void serialize(Integer i, DataOutputPlus out) throws IOException { out.writeInt(i); }
-                public Integer deserialize(DataInput in) throws IOException { return in.readInt(); }
-                public long serializedSize(Integer i, TypeSizes s) { return 4; }
-            },
-            new ISerializer<String>()
-            {
-                public void serialize(String v, DataOutputPlus out) throws IOException { out.writeUTF(v); }
-                public String deserialize(DataInput in) throws IOException { return in.readUTF(); }
-                public long serializedSize(String v, TypeSizes s) { return v.length(); }
-            },
-            (Constructor<Interval<Integer, String>>) (Constructor<?>) Interval.class.getConstructor(Object.class, Object.class, Object.class)
+                new ISerializer<Integer>()
+                {
+                    public void serialize(Integer i, DataOutputPlus out) throws IOException
+                    {
+                        out.writeInt(i);
+                    }
+
+                    public Integer deserialize(DataInputPlus in) throws IOException
+                    {
+                        return in.readInt();
+                    }
+
+                    public long serializedSize(Integer i)
+                    {
+                        return 4;
+                    }
+                },
+                new ISerializer<String>()
+                {
+                    public void serialize(String v, DataOutputPlus out) throws IOException
+                    {
+                        out.writeUTF(v);
+                    }
+
+                    public String deserialize(DataInputPlus in) throws IOException
+                    {
+                        return in.readUTF();
+                    }
+
+                    public long serializedSize(String v)
+                    {
+                        return v.length();
+                    }
+                },
+                (Constructor<Interval<Integer, String>>) (Object) Interval.class.getConstructor(Object.class, Object.class, Object.class)
         );
 
         DataOutputBuffer out = new DataOutputBuffer();
 
         serializer.serialize(it, out, 0);
 
-        DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.toByteArray()));
+        DataInputPlus in = new DataInputBuffer(out.toByteArray());
 
         IntervalTree<Integer, String, Interval<Integer, String>> it2 = serializer.deserialize(in, 0);
         List<Interval<Integer, String>> intervals2 = new ArrayList<Interval<Integer, String>>();

diff --git a/test/unit/org/apache/cassandra/utils/JVMStabilityInspectorTest.java b/test/unit/org/apache/cassandra/utils/JVMStabilityInspectorTest.java
index f96ac6e..00447da 100644
--- a/test/unit/org/apache/cassandra/utils/JVMStabilityInspectorTest.java
+++ b/test/unit/org/apache/cassandra/utils/JVMStabilityInspectorTest.java

@@ -17,19 +17,17 @@
  */
 package org.apache.cassandra.utils;
 
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.SocketException;
+
+import org.junit.Test;
+
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.FSReadError;
 
 import static java.util.Arrays.asList;
-
-import org.junit.Test;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.net.SocketException;
-import java.util.Arrays;
-
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;

diff --git a/test/unit/org/apache/cassandra/utils/KeyGenerator.java b/test/unit/org/apache/cassandra/utils/KeyGenerator.java
index 519c580..8a9d8b8 100644
--- a/test/unit/org/apache/cassandra/utils/KeyGenerator.java
+++ b/test/unit/org/apache/cassandra/utils/KeyGenerator.java

@@ -18,22 +18,20 @@
 */
 package org.apache.cassandra.utils;
 
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
 import java.nio.ByteBuffer;
 import java.util.Random;
 
-public class KeyGenerator {
+public class KeyGenerator
+{
     private static ByteBuffer randomKey(Random r) {
         byte[] bytes = new byte[48];
         r.nextBytes(bytes);
         return ByteBuffer.wrap(bytes);
     }
 
-    static class RandomStringGenerator implements ResetableIterator<ByteBuffer> {
+    static class RandomStringGenerator implements ResetableIterator<ByteBuffer>
+    {
         int i, n, seed;
         Random random;
 
@@ -66,7 +64,8 @@
         }
     }
 
-    static class IntGenerator implements ResetableIterator<ByteBuffer> {
+    static class IntGenerator implements ResetableIterator<ByteBuffer>
+    {
         private int i, start, n;
 
         IntGenerator(int n) {
@@ -100,7 +99,8 @@
         }
     }
 
-    static class WordGenerator implements ResetableIterator<ByteBuffer> {
+    static class WordGenerator implements ResetableIterator<ByteBuffer>
+    {
         static int WORDS;
 
         static {

diff --git a/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java b/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java
new file mode 100644
index 0000000..5f2de73
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java

@@ -0,0 +1,733 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.utils;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+
+import com.google.common.base.Function;
+import com.google.common.base.Objects;
+import org.apache.cassandra.utils.AbstractIterator;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.TimeUUIDType;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.utils.MergeIterator.Candidate;
+import org.apache.cassandra.utils.MergeIterator.Reducer;
+
+public class MergeIteratorComparisonTest
+{
+    private static class CountingComparator<T> implements Comparator<T>
+    {
+        final Comparator<T> wrapped;
+        int count = 0;
+
+        protected CountingComparator(Comparator<T> wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        public int compare(T o1, T o2)
+        {
+            count++;
+            return wrapped.compare(o1, o2);
+        }
+    }
+
+    static int ITERATOR_COUNT = 15;
+    static int LIST_LENGTH = 15000;
+    static boolean BENCHMARK = false;
+
+    @Test
+    public void testRandomInts()
+    {
+        System.out.println("testRandomInts");
+        final Random r = new Random();
+        Reducer<Integer, Counted<Integer>> reducer = new Counter<Integer>();
+
+        List<List<Integer>> lists = new NaturalListGenerator<Integer>(ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public Integer next()
+            {
+                return r.nextInt(5 * LIST_LENGTH);
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+    
+    @Test
+    public void testNonOverlapInts()
+    {
+        System.out.println("testNonOverlapInts");
+        Reducer<Integer, Counted<Integer>> reducer = new Counter<Integer>();
+
+        List<List<Integer>> lists = new NaturalListGenerator<Integer>(ITERATOR_COUNT, LIST_LENGTH) {
+            int next = 1;
+            @Override
+            public Integer next()
+            {
+                return next++;
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testCombinationInts()
+    {
+        System.out.println("testCombinationInts");
+        final Random r = new Random();
+        Reducer<Integer, Counted<Integer>> reducer = new Counter<Integer>();
+
+        List<List<Integer>> lists = new NaturalListGenerator<Integer>(ITERATOR_COUNT, LIST_LENGTH) {
+            int next = 1;
+            @Override
+            public Integer next()
+            {
+                return r.nextBoolean() ? r.nextInt(5 * LIST_LENGTH) : next++;
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testLCSTotalOverlap()
+    {
+        testLCS(2, LIST_LENGTH / 100, 1f);
+        testLCS(3, LIST_LENGTH / 100, 1f);
+        testLCS(3, LIST_LENGTH / 100, 1f, 10, LIST_LENGTH);
+        testLCS(4, LIST_LENGTH / 100, 1f);
+        testLCS(4, LIST_LENGTH / 100, 1f, 10, LIST_LENGTH);
+    }
+
+    @Test
+    public void testLCSPartialOverlap()
+    {
+        testLCS(2, LIST_LENGTH / 100, 0.5f);
+        testLCS(3, LIST_LENGTH / 100, 0.5f);
+        testLCS(3, LIST_LENGTH / 100, 0.5f, 10, LIST_LENGTH);
+        testLCS(4, LIST_LENGTH / 100, 0.5f);
+        testLCS(4, LIST_LENGTH / 100, 0.5f, 10, LIST_LENGTH);
+    }
+
+    @Test
+    public void testLCSNoOverlap()
+    {
+        testLCS(2, LIST_LENGTH / 100, 0f);
+        testLCS(3, LIST_LENGTH / 100, 0f);
+        testLCS(3, LIST_LENGTH / 100, 0f, 10, LIST_LENGTH);
+        testLCS(4, LIST_LENGTH / 100, 0f);
+        testLCS(4, LIST_LENGTH / 100, 0f, 10, LIST_LENGTH);
+    }
+
+    public void testLCS(int levelCount, int levelMultiplier, float levelOverlap)
+    {
+        testLCS(levelCount, levelMultiplier, levelOverlap, 0, 0);
+    }
+    public void testLCS(int levelCount, int levelMultiplier, float levelOverlap, int countOfL0, int sizeOfL0)
+    {
+        System.out.printf("testLCS(lc=%d,lm=%d,o=%.2f,L0=%d*%d)\n", levelCount, levelMultiplier, levelOverlap, countOfL0, countOfL0 == 0 ? 0 : sizeOfL0 / countOfL0);
+        final Random r = new Random();
+        Reducer<Integer, Counted<Integer>> reducer = new Counter<Integer>();
+        List<List<Integer>> lists = new LCSGenerator<Integer>(Ordering.<Integer>natural(), levelCount, levelMultiplier, levelOverlap) {
+            @Override
+            public Integer newItem()
+            {
+                return r.nextInt();
+            }
+        }.result;
+        if (sizeOfL0 > 0 && countOfL0 > 0)
+            lists.addAll(new NaturalListGenerator<Integer>(countOfL0, sizeOfL0 / countOfL0)
+            {
+                Integer next()
+                {
+                    return r.nextInt();
+                }
+            }.result);
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testRandomStrings()
+    {
+        System.out.println("testRandomStrings");
+        final Random r = new Random();
+        Reducer<String, Counted<String>> reducer = new Counter<String>();
+
+        List<List<String>> lists = new NaturalListGenerator<String>(ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public String next()
+            {
+                return "longish_prefix_" + r.nextInt(5 * LIST_LENGTH);
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+    
+    @Test
+    public void testNonOverlapStrings()
+    {
+        System.out.println("testNonOverlapStrings");
+        Reducer<String, Counted<String>> reducer = new Counter<String>();
+
+        List<List<String>> lists = new NaturalListGenerator<String>(ITERATOR_COUNT, LIST_LENGTH) {
+            int next = 1;
+            @Override
+            public String next()
+            {
+                return "longish_prefix_" + next++;
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testCombinationStrings()
+    {
+        System.out.println("testCombinationStrings");
+        final Random r = new Random();
+        Reducer<String, Counted<String>> reducer = new Counter<String>();
+
+        List<List<String>> lists = new NaturalListGenerator<String>(ITERATOR_COUNT, LIST_LENGTH) {
+            int next = 1;
+            public String next()
+            {
+                return "longish_prefix_" + (r.nextBoolean() ? r.nextInt(5 * LIST_LENGTH) : next++);
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testTimeUuids()
+    {
+        System.out.println("testTimeUuids");
+        Reducer<UUID, Counted<UUID>> reducer = new Counter<UUID>();
+
+        List<List<UUID>> lists = new NaturalListGenerator<UUID>(ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public UUID next()
+            {
+                return UUIDGen.getTimeUUID();
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testRandomUuids()
+    {
+        System.out.println("testRandomUuids");
+        Reducer<UUID, Counted<UUID>> reducer = new Counter<UUID>();
+
+        List<List<UUID>> lists = new NaturalListGenerator<UUID>(ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public UUID next()
+            {
+                return UUID.randomUUID();
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testTimeUuidType()
+    {
+        System.out.println("testTimeUuidType");
+        final AbstractType<UUID> type = TimeUUIDType.instance;
+        Reducer<ByteBuffer, Counted<ByteBuffer>> reducer = new Counter<ByteBuffer>();
+
+        List<List<ByteBuffer>> lists = new SimpleListGenerator<ByteBuffer>(type, ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public ByteBuffer next()
+            {
+                return type.decompose(UUIDGen.getTimeUUID());
+            }
+        }.result;
+        testMergeIterator(reducer, lists, type);
+    }
+
+    @Test
+    public void testUuidType()
+    {
+        System.out.println("testUuidType");
+        final AbstractType<UUID> type = UUIDType.instance;
+        Reducer<ByteBuffer, Counted<ByteBuffer>> reducer = new Counter<ByteBuffer>();
+
+        List<List<ByteBuffer>> lists = new SimpleListGenerator<ByteBuffer>(type, ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public ByteBuffer next()
+            {
+                return type.decompose(UUIDGen.getTimeUUID());
+            }
+        }.result;
+        testMergeIterator(reducer, lists, type);
+    }
+
+    
+    @Test
+    public void testSets()
+    {
+        System.out.println("testSets");
+        final Random r = new Random();
+
+        Reducer<KeyedSet<Integer, UUID>, KeyedSet<Integer, UUID>> reducer = new Union<Integer, UUID>();
+
+        List<List<KeyedSet<Integer, UUID>>> lists = new NaturalListGenerator<KeyedSet<Integer, UUID>>(ITERATOR_COUNT, LIST_LENGTH) {
+            @Override
+            public KeyedSet<Integer, UUID> next()
+            {
+                return new KeyedSet<>(r.nextInt(5 * LIST_LENGTH), UUIDGen.getTimeUUID());
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+    /* */
+
+    @Test
+    public void testLimitedOverlapStrings2()
+    {
+        System.out.println("testLimitedOverlapStrings2");
+        Reducer<String, Counted<String>> reducer = new Counter<String>();
+
+        List<List<String>> lists = new NaturalListGenerator<String>(ITERATOR_COUNT, LIST_LENGTH) {
+            int next = 0;
+            @Override
+            public String next()
+            {
+                ++next;
+                int list = next / LIST_LENGTH;
+                int id = next % LIST_LENGTH;
+                return "longish_prefix_" + (id + list * LIST_LENGTH / 2);
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    @Test
+    public void testLimitedOverlapStrings3()
+    {
+        System.out.println("testLimitedOverlapStrings3");
+        Reducer<String, Counted<String>> reducer = new Counter<String>();
+
+        List<List<String>> lists = new NaturalListGenerator<String>(ITERATOR_COUNT, LIST_LENGTH) {
+            int next = 0;
+            @Override
+            public String next()
+            {
+                ++next;
+                int list = next / LIST_LENGTH;
+                int id = next % LIST_LENGTH;
+                return "longish_prefix_" + (id + list * LIST_LENGTH / 3);
+            }
+        }.result;
+        testMergeIterator(reducer, lists);
+    }
+
+    private static abstract class ListGenerator<T>
+    {
+        abstract boolean hasMoreItems();
+        abstract boolean hasMoreLists();
+        abstract T next();
+
+        final Comparator<T> comparator;
+        final List<List<T>> result = Lists.newArrayList();
+
+        protected ListGenerator(Comparator<T> comparator)
+        {
+            this.comparator = comparator;
+        }
+
+        void build()
+        {
+            while (hasMoreLists())
+            {
+                List<T> l = Lists.newArrayList();
+                while (hasMoreItems())
+                    l.add(next());
+                Collections.sort(l, comparator);
+                result.add(l);
+            }
+        }
+    }
+
+    private static abstract class NaturalListGenerator<T extends Comparable<T>> extends SimpleListGenerator<T>
+    {
+        private NaturalListGenerator(int listCount, int perListCount)
+        {
+            super(Ordering.natural(), listCount, perListCount);
+        }
+    }
+    private static abstract class SimpleListGenerator<T> extends ListGenerator<T>
+    {
+        final int listCount;
+        final int perListCount;
+
+        int listIdx = 0, itemIdx = 0;
+
+        private SimpleListGenerator(Comparator<T> comparator, int listCount, int perListCount)
+        {
+            super(comparator);
+            this.listCount = listCount;
+            this.perListCount = perListCount;
+            build();
+        }
+
+        public boolean hasMoreItems()
+        {
+            return itemIdx++ < perListCount;
+        }
+
+        public boolean hasMoreLists()
+        {
+            itemIdx = 0;
+            return listIdx++ < listCount;
+        }
+    }
+
+    private static abstract class LCSGenerator<T> extends ListGenerator<T>
+    {
+        final int levelCount;
+        final int itemMultiplier;
+        final float levelOverlap;
+
+        int levelIdx, itemIdx;
+        int levelItems, overlapItems, runningTotalItems;
+        final Random random = new Random();
+
+        public LCSGenerator(Comparator<T> comparator, int levelCount, int l1Items, float levelOverlap)
+        {
+            super(comparator);
+            this.levelCount = levelCount;
+            this.itemMultiplier = l1Items;
+            this.levelOverlap = levelOverlap;
+            build();
+        }
+
+        public boolean hasMoreItems()
+        {
+            return itemIdx++ < levelItems;
+        }
+
+        public boolean hasMoreLists()
+        {
+            if (result.size() > 0)
+                runningTotalItems += result.get(result.size() - 1).size();
+            itemIdx = 0;
+            levelItems = itemMultiplier * (int)Math.pow(10, levelCount - levelIdx);
+            overlapItems = levelIdx == 0 ? 0 : (int) (levelItems * levelOverlap);
+            return levelIdx++ < levelCount;
+        }
+
+        abstract T newItem();
+
+        T next()
+        {
+            if (itemIdx < overlapItems)
+            {
+                int item = random.nextInt(runningTotalItems);
+                for (List<T> list : result)
+                {
+                    if (item < list.size()) return list.get(item);
+                    else item -= list.size();
+                }
+            }
+            return newItem();
+        }
+    }
+
+    public <T extends Comparable<T>> void testMergeIterator(Reducer<T, ?> reducer, List<List<T>> lists)
+    {
+        testMergeIterator(reducer, lists, Ordering.natural());
+    }
+    public <T> void testMergeIterator(Reducer<T, ?> reducer, List<List<T>> lists, Comparator<T> comparator)
+    {
+        {
+            IMergeIterator<T,?> tested = MergeIterator.get(closeableIterators(lists), comparator, reducer);
+            IMergeIterator<T,?> base = new MergeIteratorPQ<>(closeableIterators(lists), comparator, reducer);
+            // If test fails, try the version below for improved reporting:
+            Object[] basearr = Iterators.toArray(base, Object.class);
+            Assert.assertArrayEquals(basearr, Iterators.toArray(tested, Object.class));
+            //Assert.assertTrue(Iterators.elementsEqual(base, tested));
+            if (!BENCHMARK)
+                return;
+        }
+
+        CountingComparator<T> cmp, cmpb;
+        cmp = new CountingComparator<>(comparator); cmpb = new CountingComparator<>(comparator);
+        System.out.println();
+        for (int i=0; i<10; ++i) {
+            benchmarkIterator(MergeIterator.get(closeableIterators(lists), cmp, reducer), cmp);
+            benchmarkIterator(new MergeIteratorPQ<>(closeableIterators(lists), cmpb, reducer), cmpb);
+        }
+        System.out.format("MI: %.2f\n", cmp.count / (double) cmpb.count);
+    }
+    
+    public <T> void benchmarkIterator(IMergeIterator<T, ?> it, CountingComparator<T> comparator)
+    {
+        System.out.format("Testing %30s... ", it.getClass().getSimpleName());
+        long time = System.currentTimeMillis();
+        Object value = null;
+        while (it.hasNext())
+            value = it.next();
+        time = System.currentTimeMillis() - time;
+        String type = "";
+        if (value instanceof Counted<?>)
+        {
+            type = "type " + ((Counted<?>)value).item.getClass().getSimpleName();
+        }
+        System.out.format("%15s time %5dms; comparisons: %d\n", type, time, comparator.count);
+    }
+
+    public <T> List<CloseableIterator<T>> closeableIterators(List<List<T>> iterators)
+    {
+        return Lists.transform(iterators, new Function<List<T>, CloseableIterator<T>>() {
+
+            @Override
+            public CloseableIterator<T> apply(List<T> arg)
+            {
+                return new CLI<T>(arg.iterator());
+            }
+        });
+    }
+
+    static class Counted<T> {
+        T item;
+        int count;
+        
+        Counted(T item) {
+            this.item = item;
+            count = 0;
+        }
+
+        public boolean equals(Object obj)
+        {
+            if (obj == null || !(obj instanceof Counted))
+                return false;
+            Counted<?> c = (Counted<?>) obj;
+            return Objects.equal(item, c.item) && count == c.count;
+        }
+
+        @Override
+        public String toString()
+        {
+            return item.toString() + "x" + count;
+        }
+    }
+    
+    static class Counter<T> extends Reducer<T, Counted<T>> {
+        Counted<T> current = null;
+        boolean read = true;
+
+        @Override
+        public void reduce(int idx, T next)
+        {
+            if (current == null)
+                current = new Counted<T>(next);
+            assert current.item.equals(next);
+            ++current.count;
+        }
+
+        @Override
+        protected void onKeyChange()
+        {
+            assert read;
+            current = null;
+            read = false;
+        }
+
+        @Override
+        protected Counted<T> getReduced()
+        {
+            assert current != null;
+            read = true;
+            return current;
+        }
+    }
+    
+    static class KeyedSet<K extends Comparable<? super K>, V> extends Pair<K, Set<V>> implements Comparable<KeyedSet<K, V>>
+    {
+        protected KeyedSet(K left, V right)
+        {
+            super(left, ImmutableSet.of(right));
+        }
+        
+        protected KeyedSet(K left, Collection<V> right)
+        {
+            super(left, Sets.newHashSet(right));
+        }
+
+        @Override
+        public int compareTo(KeyedSet<K, V> o)
+        {
+            return left.compareTo(o.left);
+        }
+    }
+    
+    static class Union<K extends Comparable<K>, V> extends Reducer<KeyedSet<K, V>, KeyedSet<K, V>> {
+        KeyedSet<K, V> current = null;
+        boolean read = true;
+
+        @Override
+        public void reduce(int idx, KeyedSet<K, V> next)
+        {
+            if (current == null)
+                current = new KeyedSet<>(next.left, next.right);
+            else {
+                assert current.left.equals(next.left);
+                current.right.addAll(next.right);
+            }
+        }
+
+        @Override
+        protected void onKeyChange()
+        {
+            assert read;
+            current = null;
+            read = false;
+        }
+
+        @Override
+        protected KeyedSet<K, V> getReduced()
+        {
+            assert current != null;
+            read = true;
+            return current;
+        }
+    }
+    
+    // closeable list iterator
+    public static class CLI<E> extends AbstractIterator<E> implements CloseableIterator<E>
+    {
+        Iterator<E> iter;
+        boolean closed = false;
+        public CLI(Iterator<E> items)
+        {
+            this.iter = items;
+        }
+
+        protected E computeNext()
+        {
+            if (!iter.hasNext()) return endOfData();
+            return iter.next();
+        }
+
+        public void close()
+        {
+            assert !this.closed;
+            this.closed = true;
+        }
+    }
+
+    // Old MergeIterator implementation for comparison.
+    public class MergeIteratorPQ<In,Out> extends MergeIterator<In,Out> implements IMergeIterator<In, Out>
+    {
+        // a queue for return: all candidates must be open and have at least one item
+        protected final PriorityQueue<CandidatePQ<In>> queue;
+        // a stack of the last consumed candidates, so that we can lazily call 'advance()'
+        // TODO: if we had our own PriorityQueue implementation we could stash items
+        // at the end of its array, so we wouldn't need this storage
+        protected final ArrayDeque<CandidatePQ<In>> candidates;
+        public MergeIteratorPQ(List<? extends Iterator<In>> iters, Comparator<In> comp, Reducer<In, Out> reducer)
+        {
+            super(iters, reducer);
+            this.queue = new PriorityQueue<>(Math.max(1, iters.size()));
+            for (int i = 0; i < iters.size(); i++)
+            {
+                CandidatePQ<In> candidate = new CandidatePQ<>(i, iters.get(i), comp);
+                if (!candidate.advance())
+                    // was empty
+                    continue;
+                this.queue.add(candidate);
+            }
+            this.candidates = new ArrayDeque<>(queue.size());
+        }
+
+        protected final Out computeNext()
+        {
+            advance();
+            return consume();
+        }
+
+        /** Consume values by sending them to the reducer while they are equal. */
+        protected final Out consume()
+        {
+            CandidatePQ<In> candidate = queue.peek();
+            if (candidate == null)
+                return endOfData();
+            reducer.onKeyChange();
+            do
+            {
+                candidate = queue.poll();
+                candidates.push(candidate);
+                reducer.reduce(candidate.idx, candidate.item);
+            }
+            while (queue.peek() != null && queue.peek().compareTo(candidate) == 0);
+            return reducer.getReduced();
+        }
+
+        /** Advance and re-enqueue all items we consumed in the last iteration. */
+        protected final void advance()
+        {
+            CandidatePQ<In> candidate;
+            while ((candidate = candidates.pollFirst()) != null)
+                if (candidate.advance())
+                    queue.add(candidate);
+        }
+    }
+
+    // Holds and is comparable by the head item of an iterator it owns
+    protected static final class CandidatePQ<In> implements Comparable<CandidatePQ<In>>
+    {
+        private final Iterator<? extends In> iter;
+        private final Comparator<? super In> comp;
+        private final int idx;
+        private In item;
+        boolean equalParent;
+
+        public CandidatePQ(int idx, Iterator<? extends In> iter, Comparator<? super In> comp)
+        {
+            this.iter = iter;
+            this.comp = comp;
+            this.idx = idx;
+        }
+
+        /** @return true if our iterator had an item, and it is now available */
+        protected boolean advance()
+        {
+            if (!iter.hasNext())
+                return false;
+            item = iter.next();
+            return true;
+        }
+
+        public int compareTo(CandidatePQ<In> that)
+        {
+            return comp.compare(this.item, that.item);
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/utils/MergeIteratorTest.java b/test/unit/org/apache/cassandra/utils/MergeIteratorTest.java
index 3544955..fe2cecf 100644
--- a/test/unit/org/apache/cassandra/utils/MergeIteratorTest.java
+++ b/test/unit/org/apache/cassandra/utils/MergeIteratorTest.java

@@ -21,10 +21,9 @@
 import java.util.Arrays;
 import java.util.Iterator;
 
-import com.google.common.collect.AbstractIterator;
+import org.apache.cassandra.utils.AbstractIterator;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.Ordering;
-
 import org.junit.Before;
 import org.junit.Test;
 
@@ -51,9 +50,10 @@
         {
             String concatted = "";
 
-            public void reduce(String value)
+            @Override
+            public void reduce(int idx, String current)
             {
-                concatted += value;
+                concatted += current;
             }
 
             public String getReduced()
@@ -64,8 +64,8 @@
             }
         };
         IMergeIterator<String,String> smi = MergeIterator.get(Arrays.asList(a, b, c, d),
-                                                             Ordering.<String>natural(),
-                                                             reducer);
+                Ordering.<String>natural(),
+                reducer);
         assert Iterators.elementsEqual(cat, smi);
         smi.close();
         assert a.closed && b.closed && c.closed && d.closed;

diff --git a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
index 8d6e272..64aea24 100644
--- a/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java
+++ b/test/unit/org/apache/cassandra/utils/MerkleTreeTest.java

@@ -21,15 +21,18 @@
 import java.math.BigInteger;
 import java.util.*;
 
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.ByteArrayDataInput;
-import com.google.common.io.ByteStreams;
+import com.google.common.collect.Lists;
 
 import org.junit.Before;
 import org.junit.Test;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.dht.*;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.MerkleTree.Hashable;
@@ -64,7 +67,7 @@
         TOKEN_SCALE = new BigInteger("8");
         partitioner = RandomPartitioner.instance;
         // TODO need to trickle TokenSerializer
-        DatabaseDescriptor.setPartitioner(partitioner);
+        DatabaseDescriptor.setPartitionerUnsafe(partitioner);
         mt = new MerkleTree(partitioner, fullRange(), RECOMMENDED_DEPTH, Integer.MAX_VALUE);
     }
 
@@ -397,7 +400,7 @@
         MerkleTree.serializer.serialize(mt, out, MessagingService.current_version);
         byte[] serialized = out.toByteArray();
 
-        ByteArrayDataInput in = ByteStreams.newDataInput(serialized);
+        DataInputPlus in = new DataInputBuffer(serialized);
         MerkleTree restored = MerkleTree.serializer.deserialize(in, MessagingService.current_version);
 
         assertHashEquals(initialhash, restored.hash(full));
@@ -440,6 +443,72 @@
     }
 
     /**
+     * difference should behave as expected, even with extremely small ranges
+     */
+    @Test
+    public void differenceSmallRange()
+    {
+        Token start = new BigIntegerToken("9");
+        Token end = new BigIntegerToken("10");
+        Range<Token> range = new Range<>(start, end);
+
+        MerkleTree ltree = new MerkleTree(partitioner, range, RECOMMENDED_DEPTH, 16);
+        ltree.init();
+        MerkleTree rtree = new MerkleTree(partitioner, range, RECOMMENDED_DEPTH, 16);
+        rtree.init();
+
+        byte[] h1 = "asdf".getBytes();
+        byte[] h2 = "hjkl".getBytes();
+
+        // add dummy hashes to both trees
+        for (TreeRange tree : ltree.invalids())
+        {
+            tree.addHash(new RowHash(range.right, h1, h1.length));
+        }
+        for (TreeRange tree : rtree.invalids())
+        {
+            tree.addHash(new RowHash(range.right, h2, h2.length));
+        }
+
+        List<TreeRange> diffs = MerkleTree.difference(ltree, rtree);
+        assertEquals(Lists.newArrayList(range), diffs);
+        assertEquals(MerkleTree.FULLY_INCONSISTENT, MerkleTree.differenceHelper(ltree, rtree, new ArrayList<>(), new MerkleTree.TreeDifference(ltree.fullRange.left, ltree.fullRange.right, (byte)0)));
+    }
+
+    /**
+     * matching should behave as expected, even with extremely small ranges
+     */
+    @Test
+    public void matchingSmallRange()
+    {
+        Token start = new BigIntegerToken("9");
+        Token end = new BigIntegerToken("10");
+        Range<Token> range = new Range<>(start, end);
+
+        MerkleTree ltree = new MerkleTree(partitioner, range, RECOMMENDED_DEPTH, 16);
+        ltree.init();
+        MerkleTree rtree = new MerkleTree(partitioner, range, RECOMMENDED_DEPTH, 16);
+        rtree.init();
+
+        byte[] h1 = "asdf".getBytes();
+        byte[] h2 = "asdf".getBytes();
+
+
+        // add dummy hashes to both trees
+        for (TreeRange tree : ltree.invalids())
+        {
+            tree.addHash(new RowHash(range.right, h1, h1.length));
+        }
+        for (TreeRange tree : rtree.invalids())
+        {
+            tree.addHash(new RowHash(range.right, h2, h2.length));
+        }
+
+        // top level difference() should show no differences
+        assertEquals(MerkleTree.difference(ltree, rtree), Lists.newArrayList());
+    }
+
+    /**
      * Return the root hash of a binary tree with leaves at the given depths
      * and with the given hash val in each leaf.
      */

diff --git a/test/unit/org/apache/cassandra/utils/MerkleTreesTest.java b/test/unit/org/apache/cassandra/utils/MerkleTreesTest.java
new file mode 100644
index 0000000..ec8fd68
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/MerkleTreesTest.java

@@ -0,0 +1,538 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyten ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.utils;
+
+import java.math.BigInteger;
+import java.util.*;
+
+import com.google.common.collect.AbstractIterator;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.apache.cassandra.dht.*;
+import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.MerkleTree.Hashable;
+import org.apache.cassandra.utils.MerkleTree.RowHash;
+import org.apache.cassandra.utils.MerkleTree.TreeRange;
+import org.apache.cassandra.utils.MerkleTrees.TreeRangeIterator;
+
+import static org.junit.Assert.*;
+
+public class MerkleTreesTest
+{
+    public static byte[] DUMMY = "blah".getBytes();
+
+    /**
+     * If a test assumes that the tree is 8 units wide, then it should set this value
+     * to 8.
+     */
+    public static BigInteger TOKEN_SCALE = new BigInteger("8");
+
+    protected static final IPartitioner partitioner = RandomPartitioner.instance;
+    protected MerkleTrees mts;
+
+    private Range<Token> fullRange()
+    {
+        return new Range<>(partitioner.getMinimumToken(), partitioner.getMinimumToken());
+    }
+
+    @BeforeClass
+    public static void setUp()
+    {
+        StorageService.instance.setPartitionerUnsafe(partitioner);
+    }
+    @Before
+    public void clear()
+    {
+        TOKEN_SCALE = new BigInteger("8");
+        mts = new MerkleTrees(partitioner);
+        mts.addMerkleTree(Integer.MAX_VALUE, fullRange());
+    }
+
+    public static void assertHashEquals(final byte[] left, final byte[] right)
+    {
+        assertHashEquals("", left, right);
+    }
+
+    public static void assertHashEquals(String message, final byte[] left, final byte[] right)
+    {
+        String lstring = left == null ? "null" : Hex.bytesToHex(left);
+        String rstring = right == null ? "null" : Hex.bytesToHex(right);
+        assertEquals(message, lstring, rstring);
+    }
+
+    /**
+     * The value returned by this method is affected by TOKEN_SCALE: setting TOKEN_SCALE
+     * to 8 means that passing -1 through 8 for this method will return values mapped
+     * between -1 and Token.MAX_VALUE.
+     */
+    public static Token tok(int i)
+    {
+        if (i == -1)
+            return new BigIntegerToken(new BigInteger("-1"));
+        BigInteger bint = RandomPartitioner.MAXIMUM.divide(TOKEN_SCALE).multiply(new BigInteger(""+i));
+        return new BigIntegerToken(bint);
+    }
+
+    @Test
+    public void testIntersectingRanges()
+    {
+        mts = new MerkleTrees(partitioner);
+
+        boolean failure = true;
+        mts.addMerkleTree(1, new Range<>(tok(1), tok(3)));
+
+        try
+        {
+            mts.addMerkleTree(1, new Range<>(tok(2), tok(4)));
+        }
+        catch (AssertionError e)
+        {
+            failure = false;
+        }
+
+        assertFalse(failure);
+    }
+
+    @Test
+    public void testSplit()
+    {
+        // split the range  (zero, zero] into:
+        //  (zero,four], (four,six], (six,seven] and (seven, zero]
+        mts.split(tok(4));
+        mts.split(tok(6));
+        mts.split(tok(7));
+
+        assertEquals(4, mts.size());
+        assertEquals(new Range<>(tok(7), tok(-1)), mts.get(tok(-1)));
+        assertEquals(new Range<>(tok(-1), tok(4)), mts.get(tok(3)));
+        assertEquals(new Range<>(tok(-1), tok(4)), mts.get(tok(4)));
+        assertEquals(new Range<>(tok(4), tok(6)), mts.get(tok(6)));
+        assertEquals(new Range<>(tok(6), tok(7)), mts.get(tok(7)));
+
+        // check depths
+        assertEquals((byte) 1, mts.get(tok(4)).depth);
+        assertEquals((byte) 2, mts.get(tok(6)).depth);
+        assertEquals((byte) 3, mts.get(tok(7)).depth);
+        assertEquals((byte) 3, mts.get(tok(-1)).depth);
+
+        try
+        {
+            mts.split(tok(-1));
+            fail("Shouldn't be able to split outside the initial range.");
+        }
+        catch (AssertionError e)
+        {
+            // pass
+        }
+    }
+
+    @Test
+    public void testSplitLimitDepth()
+    {
+        mts = new MerkleTrees(partitioner);
+
+        mts.addMerkleTree(Integer.MAX_VALUE, (byte) 2, fullRange());
+
+        assertTrue(mts.split(tok(4)));
+        assertTrue(mts.split(tok(2)));
+        assertEquals(3, mts.size());
+
+        // should fail to split below hashdepth
+        assertFalse(mts.split(tok(1)));
+        assertEquals(3, mts.size());
+        assertEquals(new Range<>(tok(4), tok(-1)), mts.get(tok(-1)));
+        assertEquals(new Range<>(tok(-1), tok(2)), mts.get(tok(2)));
+        assertEquals(new Range<>(tok(2), tok(4)), mts.get(tok(4)));
+    }
+
+    @Test
+    public void testSplitLimitSize()
+    {
+        mts = new MerkleTrees(partitioner);
+
+        mts.addMerkleTree(2, fullRange());
+
+        assertTrue(mts.split(tok(4)));
+        assertEquals(2, mts.size());
+
+        // should fail to split above maxsize
+        assertFalse(mts.split(tok(2)));
+        assertEquals(2, mts.size());
+        assertEquals(new Range<>(tok(4), tok(-1)), mts.get(tok(-1)));
+        assertEquals(new Range<>(tok(-1), tok(4)), mts.get(tok(4)));
+    }
+
+    @Test
+    public void testInvalids()
+    {
+        Iterator<TreeRange> ranges;
+
+        // (zero, zero]
+        ranges = mts.invalids();
+        assertEquals(new Range<>(tok(-1), tok(-1)), ranges.next());
+        assertFalse(ranges.hasNext());
+
+        // all invalid
+        mts.split(tok(4));
+        mts.split(tok(2));
+        mts.split(tok(6));
+        mts.split(tok(3));
+        mts.split(tok(5));
+        ranges = mts.invalids();
+        assertEquals(new Range<>(tok(6), tok(-1)), ranges.next());
+        assertEquals(new Range<>(tok(-1), tok(2)), ranges.next());
+        assertEquals(new Range<>(tok(2), tok(3)), ranges.next());
+        assertEquals(new Range<>(tok(3), tok(4)), ranges.next());
+        assertEquals(new Range<>(tok(4), tok(5)), ranges.next());
+        assertEquals(new Range<>(tok(5), tok(6)), ranges.next());
+        assertEquals(new Range<>(tok(6), tok(-1)), ranges.next());
+        assertFalse(ranges.hasNext());
+    }
+
+
+    @Test
+    public void testHashFull()
+    {
+        byte[] val = DUMMY;
+        Range<Token> range = new Range<>(tok(-1), tok(-1));
+
+        // (zero, zero]
+        assertNull(mts.hash(range));
+
+        // validate the range
+        mts.get(tok(-1)).hash(val);
+
+        assertHashEquals(val, mts.hash(range));
+    }
+
+    @Test
+    public void testHashPartial()
+    {
+        byte[] val = DUMMY;
+        byte[] leftval = hashed(val, 1, 1);
+        byte[] partialval = hashed(val, 1);
+        Range<Token> left = new Range<>(tok(-1), tok(4));
+        Range<Token> partial = new Range<>(tok(2), tok(4));
+        Range<Token> right = new Range<>(tok(4), tok(-1));
+        Range<Token> linvalid = new Range<>(tok(1), tok(4));
+        Range<Token> rinvalid = new Range<>(tok(4), tok(6));
+
+        // (zero,two] (two,four] (four, zero]
+        mts.split(tok(4));
+        mts.split(tok(2));
+        assertNull(mts.hash(left));
+        assertNull(mts.hash(partial));
+        assertNull(mts.hash(right));
+        assertNull(mts.hash(linvalid));
+        assertNull(mts.hash(rinvalid));
+
+        // validate the range
+        mts.get(tok(2)).hash(val);
+        mts.get(tok(4)).hash(val);
+        mts.get(tok(-1)).hash(val);
+
+        assertHashEquals(leftval, mts.hash(left));
+        assertHashEquals(partialval, mts.hash(partial));
+        assertHashEquals(val, mts.hash(right));
+        assertNull(mts.hash(linvalid));
+        assertNull(mts.hash(rinvalid));
+    }
+
+    @Test
+    public void testHashInner()
+    {
+        byte[] val = DUMMY;
+        byte[] lchildval = hashed(val, 3, 3, 2);
+        byte[] rchildval = hashed(val, 2, 2);
+        byte[] fullval = hashed(val, 3, 3, 2, 2, 2);
+        Range<Token> full = new Range<>(tok(-1), tok(-1));
+        Range<Token> lchild = new Range<>(tok(-1), tok(4));
+        Range<Token> rchild = new Range<>(tok(4), tok(-1));
+        Range<Token> invalid = new Range<>(tok(1), tok(-1));
+
+        // (zero,one] (one, two] (two,four] (four, six] (six, zero]
+        mts.split(tok(4));
+        mts.split(tok(2));
+        mts.split(tok(6));
+        mts.split(tok(1));
+        assertNull(mts.hash(full));
+        assertNull(mts.hash(lchild));
+        assertNull(mts.hash(rchild));
+        assertNull(mts.hash(invalid));
+
+        // validate the range
+        mts.get(tok(1)).hash(val);
+        mts.get(tok(2)).hash(val);
+        mts.get(tok(4)).hash(val);
+        mts.get(tok(6)).hash(val);
+        mts.get(tok(-1)).hash(val);
+
+        assertHashEquals(fullval, mts.hash(full));
+        assertHashEquals(lchildval, mts.hash(lchild));
+        assertHashEquals(rchildval, mts.hash(rchild));
+        assertNull(mts.hash(invalid));
+    }
+
+    @Test
+    public void testHashDegenerate()
+    {
+        TOKEN_SCALE = new BigInteger("32");
+
+        byte[] val = DUMMY;
+        byte[] childfullval = hashed(val, 5, 5, 4);
+        byte[] fullval = hashed(val, 5, 5, 4, 3, 2, 1);
+        Range<Token> childfull = new Range<>(tok(-1), tok(4));
+        Range<Token> full = new Range<>(tok(-1), tok(-1));
+        Range<Token> invalid = new Range<>(tok(4), tok(-1));
+
+        mts.split(tok(16));
+        mts.split(tok(8));
+        mts.split(tok(4));
+        mts.split(tok(2));
+        mts.split(tok(1));
+        assertNull(mts.hash(full));
+        assertNull(mts.hash(childfull));
+        assertNull(mts.hash(invalid));
+
+        // validate the range
+        mts.get(tok(1)).hash(val);
+        mts.get(tok(2)).hash(val);
+        mts.get(tok(4)).hash(val);
+        mts.get(tok(8)).hash(val);
+        mts.get(tok(16)).hash(val);
+        mts.get(tok(-1)).hash(val);
+
+        assertHashEquals(fullval, mts.hash(full));
+        assertHashEquals(childfullval, mts.hash(childfull));
+        assertNull(mts.hash(invalid));
+    }
+
+    @Test
+    public void testHashRandom()
+    {
+        int max = 1000000;
+        TOKEN_SCALE = new BigInteger("" + max);
+
+        mts = new MerkleTrees(partitioner);
+        mts.addMerkleTree(32, fullRange());
+
+        Random random = new Random();
+        while (true)
+        {
+            if (!mts.split(tok(random.nextInt(max))))
+                break;
+        }
+
+        // validate the tree
+        TreeRangeIterator ranges = mts.invalids();
+        for (TreeRange range : ranges)
+            range.addHash(new RowHash(range.right, new byte[0], 0));
+
+        assert mts.hash(new Range<>(tok(-1), tok(-1))) != null : "Could not hash tree " + mts;
+    }
+
+    /**
+     * Generate two trees with different splits, but containing the same keys, and
+     * check that they compare equally.
+     *
+     * The set of keys used in this test is: #{2,4,6,8,12,14,0}
+     */
+    @Test
+    public void testValidateTree()
+    {
+        TOKEN_SCALE = new BigInteger("16"); // this test needs slightly more resolution
+
+        Range<Token> full = new Range<>(tok(-1), tok(-1));
+        Iterator<TreeRange> ranges;
+        MerkleTrees mts2 = new MerkleTrees(partitioner);
+        mts2.addMerkleTree(Integer.MAX_VALUE, fullRange());
+
+        mts.split(tok(8));
+        mts.split(tok(4));
+        mts.split(tok(12));
+        mts.split(tok(6));
+        mts.split(tok(10));
+
+        ranges = mts.invalids();
+        ranges.next().addAll(new HIterator(2, 4)); // (-1,4]: depth 2
+        ranges.next().addAll(new HIterator(6)); // (4,6]
+        ranges.next().addAll(new HIterator(8)); // (6,8]
+        ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (8,10]
+        ranges.next().addAll(new HIterator(12)); // (10,12]
+        ranges.next().addAll(new HIterator(14, -1)); // (12,-1]: depth 2
+
+
+        mts2.split(tok(8));
+        mts2.split(tok(4));
+        mts2.split(tok(12));
+        mts2.split(tok(2));
+        mts2.split(tok(10));
+        mts2.split(tok(9));
+        mts2.split(tok(11));
+
+        ranges = mts2.invalids();
+        ranges.next().addAll(new HIterator(2)); // (-1,2]
+        ranges.next().addAll(new HIterator(4)); // (2,4]
+        ranges.next().addAll(new HIterator(6, 8)); // (4,8]: depth 2
+        ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (8,9]
+        ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (9,10]
+        ranges.next().addAll(new HIterator(/*empty*/ new int[0])); // (10,11]: depth 4
+        ranges.next().addAll(new HIterator(12)); // (11,12]: depth 4
+        ranges.next().addAll(new HIterator(14, -1)); // (12,-1]: depth 2
+
+        byte[] mthash = mts.hash(full);
+        byte[] mt2hash = mts2.hash(full);
+        assertHashEquals("Tree hashes did not match: " + mts + " && " + mts2, mthash, mt2hash);
+    }
+
+    @Test
+    public void testSerialization() throws Exception
+    {
+        Range<Token> first = new Range<>(tok(3), tok(4));
+
+        Collection<Range<Token>> ranges = new ArrayList<>();
+
+        ranges.add(first);
+        ranges.add(new Range<Token>(tok(5), tok(2)));
+
+        mts = new MerkleTrees(partitioner);
+        mts.addMerkleTrees(256, ranges);
+
+        // populate and validate the tree
+        mts.init();
+        for (TreeRange range : mts.invalids())
+            range.addAll(new HIterator(range.right));
+
+        byte[] initialhash = mts.hash(first);
+
+        long serializedSize = MerkleTrees.serializer.serializedSize(mts, MessagingService.current_version);
+        DataOutputBuffer out = new DataOutputBuffer();
+        MerkleTrees.serializer.serialize(mts, out, MessagingService.current_version);
+        byte[] serialized = out.toByteArray();
+
+        assertEquals(serializedSize, serialized.length);
+
+        DataInputBuffer in = new DataInputBuffer(serialized);
+        MerkleTrees restored = MerkleTrees.serializer.deserialize(in, MessagingService.current_version);
+
+        assertHashEquals(initialhash, restored.hash(first));
+    }
+
+    @Test
+    public void testDifference()
+    {
+        int maxsize = 16;
+        mts = new MerkleTrees(partitioner);
+        mts.addMerkleTree(32, fullRange());
+
+        MerkleTrees mts2 = new MerkleTrees(partitioner);
+        mts2.addMerkleTree(32, fullRange());
+
+        mts.init();
+        mts2.init();
+
+        // add dummy hashes to both trees
+        for (TreeRange range : mts.invalids())
+            range.addAll(new HIterator(range.right));
+        for (TreeRange range : mts2.invalids())
+            range.addAll(new HIterator(range.right));
+
+        TreeRange leftmost = null;
+        TreeRange middle = null;
+
+        mts.maxsize(fullRange(), maxsize + 2); // give some room for splitting
+
+        // split the leftmost
+        Iterator<TreeRange> ranges = mts.invalids();
+        leftmost = ranges.next();
+        mts.split(leftmost.right);
+
+        // set the hashes for the leaf of the created split
+        middle = mts.get(leftmost.right);
+        middle.hash("arbitrary!".getBytes());
+        mts.get(partitioner.midpoint(leftmost.left, leftmost.right)).hash("even more arbitrary!".getBytes());
+
+        // trees should disagree for (leftmost.left, middle.right]
+        List<Range<Token>> diffs = MerkleTrees.difference(mts, mts2);
+        assertEquals(diffs + " contains wrong number of differences:", 1, diffs.size());
+        assertTrue(diffs.contains(new Range<>(leftmost.left, middle.right)));
+    }
+
+    /**
+     * Return the root hash of a binary tree with leaves at the given depths
+     * and with the given hash val in each leaf.
+     */
+    byte[] hashed(byte[] val, Integer... depths)
+    {
+        ArrayDeque<Integer> dstack = new ArrayDeque<Integer>();
+        ArrayDeque<byte[]> hstack = new ArrayDeque<byte[]>();
+        Iterator<Integer> depthiter = Arrays.asList(depths).iterator();
+        if (depthiter.hasNext())
+        {
+            dstack.push(depthiter.next());
+            hstack.push(val);
+        }
+        while (depthiter.hasNext())
+        {
+            Integer depth = depthiter.next();
+            byte[] hash = val;
+            while (depth.equals(dstack.peek()))
+            {
+                // consume the stack
+                hash = Hashable.binaryHash(hstack.pop(), hash);
+                depth = dstack.pop()-1;
+            }
+            dstack.push(depth);
+            hstack.push(hash);
+        }
+        assert hstack.size() == 1;
+        return hstack.pop();
+    }
+
+    static class HIterator extends AbstractIterator<RowHash>
+    {
+        private Iterator<Token> tokens;
+
+        public HIterator(int... tokens)
+        {
+            List<Token> tlist = new LinkedList<Token>();
+            for (int token : tokens)
+                tlist.add(tok(token));
+            this.tokens = tlist.iterator();
+        }
+
+        public HIterator(Token... tokens)
+        {
+            this.tokens = Arrays.asList(tokens).iterator();
+        }
+
+        public RowHash computeNext()
+        {
+            if (tokens.hasNext())
+                return new RowHash(tokens.next(), DUMMY, DUMMY.length);
+            return endOfData();
+        }
+    }
+}

diff --git a/test/unit/org/apache/cassandra/utils/NativeLibraryTest.java b/test/unit/org/apache/cassandra/utils/NativeLibraryTest.java
new file mode 100644
index 0000000..226653e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/NativeLibraryTest.java

@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.io.File;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.FileUtils;
+
+public class NativeLibraryTest
+{
+    @Test
+    public void testSkipCache()
+    {
+        File file = FileUtils.createTempFile("testSkipCache", "1");
+
+        NativeLibrary.trySkipCache(file.getPath(), 0, 0);
+    }
+
+    @Test
+    public void getPid()
+    {
+        long pid = NativeLibrary.getProcessID();
+        Assert.assertTrue(pid > 0);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/utils/NoSpamLoggerTest.java b/test/unit/org/apache/cassandra/utils/NoSpamLoggerTest.java
index 0a5a005..702fa98 100644
--- a/test/unit/org/apache/cassandra/utils/NoSpamLoggerTest.java
+++ b/test/unit/org/apache/cassandra/utils/NoSpamLoggerTest.java

@@ -113,19 +113,27 @@
        setUp();
        now = 5;
 
-       NoSpamLogger.log( mock, l, 5,  TimeUnit.NANOSECONDS, statement, param);
+       assertTrue(NoSpamLogger.log( mock, l, 5,  TimeUnit.NANOSECONDS, statement, param));
 
        assertEquals(1, logged.get(l).size());
 
-       NoSpamLogger.log( mock, l, 5,  TimeUnit.NANOSECONDS, statement, param);
+       assertFalse(NoSpamLogger.log( mock, l, 5,  TimeUnit.NANOSECONDS, statement, param));
 
        assertEquals(1, logged.get(l).size());
 
        now += 5;
 
-       NoSpamLogger.log( mock, l, 5,  TimeUnit.NANOSECONDS, statement, param);
+       assertTrue(NoSpamLogger.log( mock, l, 5,  TimeUnit.NANOSECONDS, statement, param));
 
        assertEquals(2, logged.get(l).size());
+
+       assertTrue(NoSpamLogger.log( mock, l, "key", 5,  TimeUnit.NANOSECONDS, statement, param));
+
+       assertEquals(3, logged.get(l).size());
+
+       assertFalse(NoSpamLogger.log( mock, l, "key", 5,  TimeUnit.NANOSECONDS, statement, param));
+
+       assertEquals(3, logged.get(l).size());
    }
 
    private void assertLoggedSizes(int info, int warn, int error)
@@ -141,20 +149,20 @@
        now = 5;
        NoSpamLogger logger = NoSpamLogger.getLogger( mock, 5, TimeUnit.NANOSECONDS);
 
-       logger.info(statement, param);
-       logger.info(statement, param);
-       logger.warn(statement, param);
-       logger.error(statement, param);
+       assertTrue(logger.info(statement, param));
+       assertFalse(logger.info(statement, param));
+       assertFalse(logger.warn(statement, param));
+       assertFalse(logger.error(statement, param));
 
        assertLoggedSizes(1, 0, 0);
 
        NoSpamLogStatement statement = logger.getStatement("swizzle2{}", 10, TimeUnit.NANOSECONDS);
-       statement.warn(param);
+       assertFalse(statement.warn(param));
        //now is 5 so it won't log
        assertLoggedSizes(1, 0, 0);
 
        now = 10;
-       statement.warn(param);
+       assertTrue(statement.warn(param));
        assertLoggedSizes(1, 1, 0);
 
    }
@@ -166,10 +174,10 @@
 
        now = 5;
 
-       nospam.info(statement, param);
-       nospam.info(statement, param);
-       nospam.warn(statement, param);
-       nospam.error(statement, param);
+       assertTrue(nospam.info(statement, param));
+       assertFalse(nospam.info(statement, param));
+       assertFalse(nospam.warn(statement, param));
+       assertFalse(nospam.error(statement, param));
 
        assertLoggedSizes(1, 0, 0);
    }
@@ -193,51 +201,51 @@
    {
        now = 5;
 
-       NoSpamLogger.log( mock, Level.INFO, 5,  TimeUnit.NANOSECONDS, statement, param);
+       assertTrue(NoSpamLogger.log( mock, Level.INFO, 5,  TimeUnit.NANOSECONDS, statement, param));
        checkMock(Level.INFO);
 
        now = 10;
 
-       NoSpamLogger.log( mock, Level.WARN, 5,  TimeUnit.NANOSECONDS, statement, param);
+       assertTrue(NoSpamLogger.log( mock, Level.WARN, 5,  TimeUnit.NANOSECONDS, statement, param));
        checkMock(Level.WARN);
 
        now = 15;
 
-       NoSpamLogger.log( mock, Level.ERROR, 5,  TimeUnit.NANOSECONDS, statement, param);
+       assertTrue(NoSpamLogger.log( mock, Level.ERROR, 5,  TimeUnit.NANOSECONDS, statement, param));
        checkMock(Level.ERROR);
 
        now = 20;
 
        NoSpamLogger logger = NoSpamLogger.getLogger(mock, 5, TimeUnit.NANOSECONDS);
 
-       logger.info(statement, param);
+       assertTrue(logger.info(statement, param));
        checkMock(Level.INFO);
 
        now = 25;
 
-       logger.warn(statement, param);
+       assertTrue(logger.warn(statement, param));
        checkMock(Level.WARN);
 
        now = 30;
 
-       logger.error(statement, param);
+       assertTrue(logger.error(statement, param));
        checkMock(Level.ERROR);
 
        NoSpamLogger.NoSpamLogStatement nospamStatement = logger.getStatement(statement);
 
        now = 35;
 
-       nospamStatement.info(param);
+       assertTrue(nospamStatement.info(param));
        checkMock(Level.INFO);
 
        now = 40;
 
-       nospamStatement.warn(param);
+       assertTrue(nospamStatement.warn(param));
        checkMock(Level.WARN);
 
        now = 45;
 
-       nospamStatement.error(param);
+       assertTrue(nospamStatement.error(param));
        checkMock(Level.ERROR);
    }
 }

diff --git a/test/unit/org/apache/cassandra/utils/SerializationsTest.java b/test/unit/org/apache/cassandra/utils/SerializationsTest.java
index 497b16d..cf50769 100644
--- a/test/unit/org/apache/cassandra/utils/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/utils/SerializationsTest.java

@@ -18,27 +18,47 @@
  */
 package org.apache.cassandra.utils;
 
-import org.apache.cassandra.AbstractSerializationsTester;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.util.DataOutputStreamPlus;
-import org.apache.cassandra.service.StorageService;
-import org.junit.Assert;
-import org.junit.Test;
-
 import java.io.DataInputStream;
 import java.io.IOException;
 
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.AbstractSerializationsTester;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.io.util.DataInputPlus.DataInputStreamPlus;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+
+import java.io.File;
+import java.io.FileInputStream;
+
 public class SerializationsTest extends AbstractSerializationsTester
 {
-
-    private void testBloomFilterWrite(boolean offheap) throws IOException
+    private static void testBloomFilterWrite(boolean offheap, boolean oldBfHashOrder) throws IOException
     {
-        IPartitioner partitioner = StorageService.getPartitioner();
-        try (IFilter bf = FilterFactory.getFilter(1000000, 0.0001, offheap))
+        IPartitioner partitioner = Util.testPartitioner();
+        try (IFilter bf = FilterFactory.getFilter(1000000, 0.0001, offheap, oldBfHashOrder))
         {
             for (int i = 0; i < 100; i++)
                 bf.add(partitioner.decorateKey(partitioner.getTokenFactory().toByteArray(partitioner.getRandomToken())));
-            try (DataOutputStreamPlus out = getOutput("utils.BloomFilter.bin"))
+            try (DataOutputStreamPlus out = getOutput(oldBfHashOrder ? "2.1" : "3.0", "utils.BloomFilter.bin"))
+            {
+                FilterFactory.serialize(bf, out);
+            }
+        }
+    }
+
+    private static void testBloomFilterWrite1000(boolean offheap, boolean oldBfHashOrder) throws IOException
+    {
+        try (IFilter bf = FilterFactory.getFilter(1000000, 0.0001, offheap, oldBfHashOrder))
+        {
+            for (int i = 0; i < 1000; i++)
+                bf.add(Util.dk(Int32Type.instance.decompose(i)));
+            try (DataOutputStreamPlus out = getOutput(oldBfHashOrder ? "2.1" : "3.0", "utils.BloomFilter1000.bin"))
             {
                 FilterFactory.serialize(bf, out);
             }
@@ -46,19 +66,132 @@
     }
 
     @Test
+    public void testBloomFilterRead1000() throws IOException
+    {
+        if (EXECUTE_WRITES)
+        {
+            testBloomFilterWrite1000(true, false);
+            testBloomFilterWrite1000(true, true);
+        }
+
+        try (DataInputStream in = getInput("3.0", "utils.BloomFilter1000.bin");
+             IFilter filter = FilterFactory.deserialize(in, true, false))
+        {
+            boolean present;
+            for (int i = 0 ; i < 1000 ; i++)
+            {
+                present = filter.isPresent(Util.dk(Int32Type.instance.decompose(i)));
+                Assert.assertTrue(present);
+            }
+            for (int i = 1000 ; i < 2000 ; i++)
+            {
+                present = filter.isPresent(Util.dk(Int32Type.instance.decompose(i)));
+                Assert.assertFalse(present);
+            }
+        }
+
+        try (DataInputStream in = getInput("2.1", "utils.BloomFilter1000.bin");
+             IFilter filter = FilterFactory.deserialize(in, true, true))
+        {
+            boolean present;
+            for (int i = 0 ; i < 1000 ; i++)
+            {
+                present = filter.isPresent(Util.dk(Int32Type.instance.decompose(i)));
+                Assert.assertTrue(present);
+            }
+            for (int i = 1000 ; i < 2000 ; i++)
+            {
+                present = filter.isPresent(Util.dk(Int32Type.instance.decompose(i)));
+                Assert.assertFalse(present);
+            }
+        }
+
+        // eh - reading version version 'ka' (2.1) with 3.0 BloomFilter
+        int falsePositive = 0;
+        int falseNegative = 0;
+        try (DataInputStream in = getInput("2.1", "utils.BloomFilter1000.bin");
+             IFilter filter = FilterFactory.deserialize(in, true, false))
+        {
+            boolean present;
+            for (int i = 0 ; i < 1000 ; i++)
+            {
+                present = filter.isPresent(Util.dk(Int32Type.instance.decompose(i)));
+                if (!present)
+                    falseNegative ++;
+            }
+            for (int i = 1000 ; i < 2000 ; i++)
+            {
+                present = filter.isPresent(Util.dk(Int32Type.instance.decompose(i)));
+                if (present)
+                    falsePositive ++;
+            }
+        }
+        Assert.assertEquals(1000, falseNegative);
+        Assert.assertEquals(0, falsePositive);
+    }
+
+    @Test
+    public void testBloomFilterTable() throws Exception
+    {
+        testBloomFilterTable("test/data/bloom-filter/ka/foo/foo-atable-ka-1-Filter.db", true);
+        testBloomFilterTable("test/data/bloom-filter/la/foo/la-1-big-Filter.db", false);
+    }
+
+    private static void testBloomFilterTable(String file, boolean oldBfHashOrder) throws Exception
+    {
+        Murmur3Partitioner partitioner = new Murmur3Partitioner();
+
+        try (DataInputStream in = new DataInputStream(new FileInputStream(new File(file)));
+             IFilter filter = FilterFactory.deserialize(in, true, oldBfHashOrder))
+        {
+            for (int i = 1; i <= 10; i++)
+            {
+                DecoratedKey decoratedKey = partitioner.decorateKey(Int32Type.instance.decompose(i));
+                boolean present = filter.isPresent(decoratedKey);
+                Assert.assertTrue(present);
+            }
+
+            int positives = 0;
+            for (int i = 11; i <= 1000010; i++)
+            {
+                DecoratedKey decoratedKey = partitioner.decorateKey(Int32Type.instance.decompose(i));
+                boolean present = filter.isPresent(decoratedKey);
+                if (present)
+                    positives++;
+            }
+            double fpr = positives;
+            fpr /= 1000000;
+            Assert.assertTrue(fpr <= 0.011d);
+        }
+    }
+
+    @Test
     public void testBloomFilterReadMURMUR3() throws IOException
     {
         if (EXECUTE_WRITES)
-            testBloomFilterWrite(true);
+            testBloomFilterWrite(true, true);
 
-        try (DataInputStream in = getInput("utils.BloomFilter.bin");
-             IFilter filter = FilterFactory.deserialize(in, true))
+        try (DataInputStream in = getInput("3.0", "utils.BloomFilter.bin");
+             IFilter filter = FilterFactory.deserialize(in, true, true))
         {
             Assert.assertNotNull(filter);
         }
     }
 
-    private void testEstimatedHistogramWrite() throws IOException
+    @Test
+    public void testBloomFilterReadMURMUR3pre30() throws IOException
+    {
+        if (EXECUTE_WRITES)
+            testBloomFilterWrite(true, false);
+
+        try (DataInputStream in = getInput("2.1", "utils.BloomFilter.bin");
+             IFilter filter = FilterFactory.deserialize(in, true, false))
+        {
+            Assert.assertNotNull(filter);
+        }
+    }
+
+    private static void testEstimatedHistogramWrite() throws IOException
     {
         EstimatedHistogram hist0 = new EstimatedHistogram();
         EstimatedHistogram hist1 = new EstimatedHistogram(5000);
@@ -86,7 +219,7 @@
         if (EXECUTE_WRITES)
             testEstimatedHistogramWrite();
 
-        try (DataInputStream in = getInput("utils.EstimatedHistogram.bin"))
+        try (DataInputStreamPlus in = getInput("utils.EstimatedHistogram.bin"))
         {
             Assert.assertNotNull(EstimatedHistogram.serializer.deserialize(in));
             Assert.assertNotNull(EstimatedHistogram.serializer.deserialize(in));

diff --git a/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java b/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java
index 0e9b90b..b107600 100644
--- a/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java
+++ b/test/unit/org/apache/cassandra/utils/StreamingHistogramTest.java

@@ -17,12 +17,12 @@
  */
 package org.apache.cassandra.utils;
 
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
 import org.junit.Test;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.util.*;
-
+import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 
 import static org.junit.Assert.assertEquals;
@@ -32,7 +32,7 @@
     @Test
     public void testFunction() throws Exception
     {
-        StreamingHistogram hist = new StreamingHistogram(5);
+        StreamingHistogram.StreamingHistogramBuilder hist = new StreamingHistogram.StreamingHistogramBuilder(5, 5, 1);
         long[] samples = new long[]{23, 19, 10, 16, 36, 2, 9, 32, 30, 45};
 
         // add 7 points to histogram of 5 bins
@@ -50,7 +50,7 @@
         expected1.put(36.0, 1L);
 
         Iterator<Map.Entry<Double, Long>> expectedItr = expected1.entrySet().iterator();
-        for (Map.Entry<Double, Long> actual : hist.getAsMap().entrySet())
+        for (Map.Entry<Double, Long> actual : hist.build().getAsMap().entrySet())
         {
             Map.Entry<Double, Long> entry = expectedItr.next();
             assertEquals(entry.getKey(), actual.getKey(), 0.01);
@@ -58,12 +58,13 @@
         }
 
         // merge test
-        StreamingHistogram hist2 = new StreamingHistogram(3);
+        StreamingHistogram.StreamingHistogramBuilder hist2 = new StreamingHistogram.StreamingHistogramBuilder(3, 0, 1);
         for (int i = 7; i < samples.length; i++)
         {
             hist2.update(samples[i]);
         }
-        hist.merge(hist2);
+        hist.merge(hist2.build());
+        StreamingHistogram histBuilt = hist.build();
         // should end up (2,1),(9.5,2),(19.33,3),(32.67,3),(45,1)
         Map<Double, Long> expected2 = new LinkedHashMap<Double, Long>(5);
         expected2.put(2.0, 1L);
@@ -72,7 +73,7 @@
         expected2.put(32.67, 3L);
         expected2.put(45.0, 1L);
         expectedItr = expected2.entrySet().iterator();
-        for (Map.Entry<Double, Long> actual : hist.getAsMap().entrySet())
+        for (Map.Entry<Double, Long> actual : histBuilt.getAsMap().entrySet())
         {
             Map.Entry<Double, Long> entry = expectedItr.next();
             assertEquals(entry.getKey(), actual.getKey(), 0.01);
@@ -80,28 +81,29 @@
         }
 
         // sum test
-        assertEquals(3.28, hist.sum(15), 0.01);
+        assertEquals(3.28, histBuilt.sum(15), 0.01);
         // sum test (b > max(hist))
-        assertEquals(10.0, hist.sum(50), 0.01);
+        assertEquals(10.0, histBuilt.sum(50), 0.01);
     }
 
     @Test
     public void testSerDe() throws Exception
     {
-        StreamingHistogram hist = new StreamingHistogram(5);
+        StreamingHistogram.StreamingHistogramBuilder histogramBuilder = new StreamingHistogram.StreamingHistogramBuilder(5, 0, 1);
         long[] samples = new long[]{23, 19, 10, 16, 36, 2, 9};
 
         // add 7 points to histogram of 5 bins
         for (int i = 0; i < samples.length; i++)
         {
-            hist.update(samples[i]);
+            histogramBuilder.update(samples[i]);
         }
+        StreamingHistogram hist = histogramBuilder.build();
 
         DataOutputBuffer out = new DataOutputBuffer();
         StreamingHistogram.serializer.serialize(hist, out);
         byte[] bytes = out.toByteArray();
 
-        StreamingHistogram deserialized = StreamingHistogram.serializer.deserialize(new DataInputStream(new ByteArrayInputStream(bytes)));
+        StreamingHistogram deserialized = StreamingHistogram.serializer.deserialize(new DataInputBuffer(bytes));
 
         // deserialized histogram should have following values
         Map<Double, Long> expected1 = new LinkedHashMap<Double, Long>(5);
@@ -119,4 +121,38 @@
             assertEquals(entry.getValue(), actual.getValue());
         }
     }
+
+    @Test
+    public void testOverflow() throws Exception
+    {
+        StreamingHistogram.StreamingHistogramBuilder histogramBuilder = new StreamingHistogram.StreamingHistogramBuilder(5, 10, 1);
+        long[] samples = new long[]{23, 19, 10, 16, 36, 2, 9, 32, 30, 45, 31,
+                32, 32, 33, 34, 35, 70, 78, 80, 90, 100,
+                32, 32, 33, 34, 35, 70, 78, 80, 90, 100
+        };
+
+        // Hit the spool cap, force it to make bins
+        for (int i = 0; i < samples.length; i++)
+        {
+            histogramBuilder.update(samples[i]);
+        }
+        assertEquals(5, histogramBuilder.build().getAsMap().keySet().size());
+
+    }
+
+    @Test
+    public void testRounding() throws Exception
+    {
+        StreamingHistogram.StreamingHistogramBuilder histogramBuilder = new StreamingHistogram.StreamingHistogramBuilder(5, 10, 60);
+        long[] samples = new long[] { 59, 60, 119, 180, 181, 300 }; // 60, 60, 120, 180, 240, 300
+        for (int i = 0 ; i < samples.length ; i++)
+            histogramBuilder.update(samples[i]);
+
+        StreamingHistogram hist = histogramBuilder.build();
+        assertEquals(hist.getAsMap().keySet().size(), (int) 5);
+        assertEquals((long) hist.getAsMap().get(Double.valueOf(60)), (long) 2L);
+        assertEquals((long) hist.getAsMap().get(Double.valueOf(120)), (long) 1L);
+
+    }
+
 }

diff --git a/test/unit/org/apache/cassandra/utils/TopKSamplerTest.java b/test/unit/org/apache/cassandra/utils/TopKSamplerTest.java
index 61e4c52..bb6e3a8 100644
--- a/test/unit/org/apache/cassandra/utils/TopKSamplerTest.java
+++ b/test/unit/org/apache/cassandra/utils/TopKSamplerTest.java

@@ -22,18 +22,19 @@
 
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.*;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 
-import junit.framework.Assert;
-
-import org.apache.cassandra.utils.TopKSampler.SamplerResult;
+import com.google.common.collect.Maps;
+import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.Test;
 
 import com.clearspring.analytics.hash.MurmurHash;
 import com.clearspring.analytics.stream.Counter;
-import com.google.common.collect.Maps;
-import com.google.common.util.concurrent.Uninterruptibles;
+import junit.framework.Assert;
+import org.apache.cassandra.utils.TopKSampler.SamplerResult;
 
 public class TopKSamplerTest
 {

diff --git a/test/unit/org/apache/cassandra/utils/UUIDTests.java b/test/unit/org/apache/cassandra/utils/UUIDTests.java
index 99cd5ae..83e421a 100644
--- a/test/unit/org/apache/cassandra/utils/UUIDTests.java
+++ b/test/unit/org/apache/cassandra/utils/UUIDTests.java

@@ -21,12 +21,14 @@
  */
 
 
-import org.apache.cassandra.db.marshal.TimeUUIDType;
-import org.junit.Test;
-
 import java.nio.ByteBuffer;
 import java.util.UUID;
 
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.TimeUUIDType;
+import org.apache.cassandra.utils.UUIDGen;
+
 
 public class UUIDTests
 {

diff --git a/test/unit/org/apache/cassandra/utils/concurrent/AbstractTransactionalTest.java b/test/unit/org/apache/cassandra/utils/concurrent/AbstractTransactionalTest.java
index 4e160c2..bb2b9b0 100644
--- a/test/unit/org/apache/cassandra/utils/concurrent/AbstractTransactionalTest.java
+++ b/test/unit/org/apache/cassandra/utils/concurrent/AbstractTransactionalTest.java

@@ -18,14 +18,21 @@
 */
 package org.apache.cassandra.utils.concurrent;
 
+import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
 
 import junit.framework.Assert;
+import org.apache.cassandra.config.DatabaseDescriptor;
 
 @Ignore
 public abstract class AbstractTransactionalTest
 {
+    @BeforeClass
+    public static void setupDD()
+    {
+        DatabaseDescriptor.setDaemonInitialized();
+    }
 
     protected abstract TestableTransaction newTest() throws Exception;
 
@@ -87,9 +94,29 @@
         txn = newTest();
         Throwable t = new RuntimeException();
         txn.testing.prepareToCommit();
-        Assert.assertEquals(t, txn.testing.commit(t));
-        Assert.assertEquals(t, txn.testing.abort(t));
-        Assert.assertTrue(t.getSuppressed()[0] instanceof IllegalStateException);
+
+        if (txn.commitCanThrow())
+        {
+            try
+            {
+                txn.testing.commit(t);
+            }
+            catch (Throwable tt)
+            {
+                Assert.assertEquals(t, tt);
+            }
+
+            Assert.assertEquals(t, txn.testing.abort(t));
+            Assert.assertEquals(0, t.getSuppressed().length);
+        }
+        else
+        {
+            Assert.assertEquals(t, txn.testing.commit(t));
+            Assert.assertEquals(t, txn.testing.abort(t));
+            Assert.assertTrue(t.getSuppressed()[0] instanceof IllegalStateException);
+        }
+
+
     }
 
     @Test
@@ -132,5 +159,10 @@
         protected abstract void assertPrepared() throws Exception;
         protected abstract void assertAborted() throws Exception;
         protected abstract void assertCommitted() throws Exception;
+
+        protected boolean commitCanThrow()
+        {
+            return false;
+        }
     }
 }

diff --git a/test/unit/org/apache/cassandra/utils/concurrent/AccumulatorTest.java b/test/unit/org/apache/cassandra/utils/concurrent/AccumulatorTest.java
index 2842374..33daca7 100644
--- a/test/unit/org/apache/cassandra/utils/concurrent/AccumulatorTest.java
+++ b/test/unit/org/apache/cassandra/utils/concurrent/AccumulatorTest.java

@@ -81,15 +81,7 @@
         assertEquals("2", accu.get(1));
         assertEquals("4", accu.get(2));
 
-        try
-        {
-            assertEquals(null, accu.get(3));
-            fail();
-        }
-        catch (IndexOutOfBoundsException e)
-        {
-            // Expected
-        }
+        assertOutOfBonds(accu, 3);
 
         accu.add("0");
 
@@ -103,4 +95,48 @@
         assertEquals("0", iter.next());
         assertFalse(iter.hasNext());
     }
+
+    @Test
+    public void testClearUnsafe()
+    {
+        Accumulator<String> accu = new Accumulator<>(3);
+
+        accu.add("1");
+        accu.add("2");
+        accu.add("3");
+
+        accu.clearUnsafe();
+
+        assertEquals(0, accu.size());
+        assertFalse(accu.iterator().hasNext());
+        assertOutOfBonds(accu, 0);
+
+        accu.add("4");
+        accu.add("5");
+
+        assertEquals(2, accu.size());
+
+        assertEquals("4", accu.get(0));
+        assertEquals("5", accu.get(1));
+        assertOutOfBonds(accu, 2);
+
+        Iterator<String> iter = accu.iterator();
+        assertTrue(iter.hasNext());
+        assertEquals("4", iter.next());
+        assertEquals("5", iter.next());
+        assertFalse(iter.hasNext());
+    }
+
+    private static void assertOutOfBonds(Accumulator<String> accumulator, int index)
+    {
+        try
+        {
+            assertNull(accumulator.get(index));
+            fail();
+        }
+        catch (IndexOutOfBoundsException e)
+        {
+            // Expected
+        }
+    }
 }

diff --git a/test/unit/org/apache/cassandra/utils/concurrent/RefCountedTest.java b/test/unit/org/apache/cassandra/utils/concurrent/RefCountedTest.java
index bb173fe..0582ad4 100644
--- a/test/unit/org/apache/cassandra/utils/concurrent/RefCountedTest.java
+++ b/test/unit/org/apache/cassandra/utils/concurrent/RefCountedTest.java

@@ -21,10 +21,33 @@
 import org.junit.Test;
 
 import junit.framework.Assert;
-import org.apache.cassandra.utils.ObjectSizes;
 
+import java.io.File;
+import java.lang.ref.WeakReference;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.Ref.Visitor;
+
+@SuppressWarnings({"unused", "unchecked", "rawtypes"})
 public class RefCountedTest
 {
+    static
+    {
+        if (Ref.STRONG_LEAK_DETECTOR != null)
+            Ref.STRONG_LEAK_DETECTOR.submit(() -> { Thread.sleep(Integer.MAX_VALUE); return null; });
+    }
 
     private static final class Tidier implements RefCounted.Tidy
     {
@@ -97,4 +120,289 @@
             throw new AssertionError();
         ref.release();
     }
+
+    static final int entryCount = 1000000;
+    static final int fudgeFactor = 20;
+
+    @Test
+    public void testLinkedList()
+    {
+        final List<Object> iterable = new LinkedList<Object>();
+        Pair<Object, Object> p = Pair.create(iterable, iterable);
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            Object ref = iterable;
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+        Ref<Object> ref = new Ref(new AtomicReference<List<Object>>(iterable), tidier);
+        for (int i = 0; i < entryCount; i++)
+        {
+            iterable.add(p);
+        }
+        Visitor visitor = new Visitor();
+        visitor.run();
+        ref.close();
+
+        System.out.println("LinkedList visited " + visitor.lastVisitedCount + " iterations " + visitor.iterations);
+        //Should visit a lot of list nodes, but no more since there is only one object stored in the list
+        Assert.assertTrue(visitor.lastVisitedCount > entryCount && visitor.lastVisitedCount < entryCount + fudgeFactor);
+        //Should have a lot of iterations to walk the list, but linear to the number of entries
+        Assert.assertTrue(visitor.iterations > (entryCount * 3) && visitor.iterations < (entryCount * 3) + fudgeFactor);
+    }
+
+    /*
+     * There was a traversal error terminating traversal for an object upon encountering a null
+     * field. Test for the bug here using CLQ.
+     */
+    @Test
+    public void testCLQBug()
+    {
+        Ref.concurrentIterables.remove(ConcurrentLinkedQueue.class);
+        try
+        {
+            testConcurrentLinkedQueueImpl(true);
+        }
+        finally
+        {
+            Ref.concurrentIterables.add(ConcurrentLinkedQueue.class);
+        }
+    }
+
+    private void testConcurrentLinkedQueueImpl(boolean bugTest)
+    {
+        final Queue<Object> iterable = new ConcurrentLinkedQueue<Object>();
+        Pair<Object, Object> p = Pair.create(iterable, iterable);
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            Object ref = iterable;
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+        Ref<Object> ref = new Ref(new AtomicReference<Queue<Object>>(iterable), tidier);
+        for (int i = 0; i < entryCount; i++)
+        {
+            iterable.add(p);
+        }
+        Visitor visitor = new Visitor();
+        visitor.run();
+        ref.close();
+
+        System.out.println("ConcurrentLinkedQueue visited " + visitor.lastVisitedCount + " iterations " + visitor.iterations + " bug test " + bugTest);
+
+        if (bugTest)
+        {
+            //Should have to visit a lot of queue nodes
+            Assert.assertTrue(visitor.lastVisitedCount > entryCount && visitor.lastVisitedCount < entryCount + fudgeFactor);
+            //Should have a lot of iterations to walk the queue, but linear to the number of entries
+            Assert.assertTrue(visitor.iterations > (entryCount * 2) && visitor.iterations < (entryCount * 2) + fudgeFactor);
+        }
+        else
+        {
+            //There are almost no objects in this linked list once it's iterated as a collection so visited count
+            //should be small
+            Assert.assertTrue(visitor.lastVisitedCount < 10);
+            //Should have a lot of iterations to walk the collection, but linear to the number of entries
+            Assert.assertTrue(visitor.iterations > entryCount && visitor.iterations < entryCount + fudgeFactor);
+        }
+    }
+
+    @Test
+    public void testConcurrentLinkedQueue()
+    {
+        testConcurrentLinkedQueueImpl(false);
+    }
+
+    @Test
+    public void testBlockingQueue()
+    {
+        final BlockingQueue<Object> iterable = new LinkedBlockingQueue<Object>();
+        Pair<Object, Object> p = Pair.create(iterable, iterable);
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            Object ref = iterable;
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+        Ref<Object> ref = new Ref(new AtomicReference<BlockingQueue<Object>>(iterable), tidier);
+        for (int i = 0; i < entryCount; i++)
+        {
+            iterable.add(p);
+        }
+        Visitor visitor = new Visitor();
+        visitor.run();
+        ref.close();
+
+        System.out.println("BlockingQueue visited " + visitor.lastVisitedCount + " iterations " + visitor.iterations);
+        //There are almost no objects in this queue once it's iterated as a collection so visited count
+        //should be small
+        Assert.assertTrue(visitor.lastVisitedCount < 10);
+        //Should have a lot of iterations to walk the collection, but linear to the number of entries
+        Assert.assertTrue(visitor.iterations > entryCount && visitor.iterations < entryCount + fudgeFactor);
+    }
+
+    @Test
+    public void testConcurrentMap()
+    {
+        final Map<Object, Object> map = new ConcurrentHashMap<Object, Object>();
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            Object ref = map;
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+        Ref<Object> ref = new Ref(new AtomicReference<Map<Object, Object>>(map), tidier);
+
+        Object o = new Object();
+        for (int i = 0; i < entryCount; i++)
+        {
+            map.put(new Object(), o);
+        }
+        Visitor visitor = new Visitor();
+        visitor.run();
+        ref.close();
+
+        System.out.println("ConcurrentHashMap visited " + visitor.lastVisitedCount + " iterations " + visitor.iterations);
+
+        //Should visit roughly the same number of objects as entries because the value object is constant
+        //Map.Entry objects shouldn't be counted since it is iterated as a collection
+        Assert.assertTrue(visitor.lastVisitedCount > entryCount && visitor.lastVisitedCount < entryCount + fudgeFactor);
+        //Should visit 2x the number of entries since we have to traverse the key and value separately
+        Assert.assertTrue(visitor.iterations > entryCount * 2 && visitor.iterations < entryCount * 2 + fudgeFactor);
+    }
+
+    @Test
+    public void testHashMap()
+    {
+        final Map<Object, Object> map = new HashMap<Object, Object>();
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            Object ref = map;
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+        Ref<Object> ref = new Ref(new AtomicReference<Map<Object, Object>>(map), tidier);
+
+        Object o = new Object();
+        for (int i = 0; i < entryCount; i++)
+        {
+            map.put(new Object(), o);
+        }
+        Visitor visitor = new Visitor();
+        visitor.run();
+        ref.close();
+
+        System.out.println("HashMap visited " + visitor.lastVisitedCount + " iterations " + visitor.iterations);
+
+        //Should visit 2x  the number of entries because of the wrapper Map.Entry objects
+        Assert.assertTrue(visitor.lastVisitedCount > (entryCount * 2) && visitor.lastVisitedCount < (entryCount * 2) + fudgeFactor);
+        //Should iterate 3x the number of entries since we have to traverse the key and value separately
+        Assert.assertTrue(visitor.iterations > (entryCount * 3) && visitor.iterations < (entryCount * 3) + fudgeFactor);
+    }
+
+    @Test
+    public void testArray() throws Exception
+    {
+        final Object objects[] = new Object[entryCount];
+        for (int i = 0; i < entryCount; i += 2)
+            objects[i] = new Object();
+
+        File f = File.createTempFile("foo", "bar");
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            Object ref = objects;
+            //Checking we don't get an infinite loop out of traversing file refs
+            File fileRef = f;
+
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+        Ref<Object> ref = new Ref(new AtomicReference<Object[]>(objects), tidier);
+
+        Visitor visitor = new Visitor();
+        visitor.run();
+        ref.close();
+
+        System.out.println("Array visited " + visitor.lastVisitedCount + " iterations " + visitor.iterations);
+        //Should iterate the elements in the array and get a unique object from every other one
+        Assert.assertTrue(visitor.lastVisitedCount > (entryCount / 2) && visitor.lastVisitedCount < (entryCount / 2) + fudgeFactor);
+        //Should iterate over the array touching roughly the same number of objects as entries
+        Assert.assertTrue(visitor.iterations > (entryCount / 2) && visitor.iterations < (entryCount / 2) + fudgeFactor);
+    }
+
+    //Make sure a weak ref is ignored by the visitor looking for strong ref leaks
+    @Test
+    public void testWeakRef() throws Exception
+    {
+        AtomicReference dontRefMe = new AtomicReference();
+
+        WeakReference<Object> weakRef = new WeakReference(dontRefMe);
+
+        RefCounted.Tidy tidier = new RefCounted.Tidy() {
+            WeakReference<Object> ref = weakRef;
+
+            @Override
+            public void tidy() throws Exception
+            {
+            }
+
+            @Override
+            public String name()
+            {
+                return "42";
+            }
+        };
+
+        Ref<Object> ref = new Ref(dontRefMe, tidier);
+        dontRefMe.set(ref);
+
+        Visitor visitor = new Visitor();
+        visitor.haveLoops = new HashSet<>();
+        visitor.run();
+        ref.close();
+
+        Assert.assertTrue(visitor.haveLoops.isEmpty());
+    }
 }

diff --git a/test/unit/org/apache/cassandra/utils/memory/BufferPoolTest.java b/test/unit/org/apache/cassandra/utils/memory/BufferPoolTest.java
new file mode 100644
index 0000000..208cd32
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/memory/BufferPoolTest.java

@@ -0,0 +1,852 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.utils.memory;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.io.util.RandomAccessReader;
+
+import static org.junit.Assert.*;
+
+public class BufferPoolTest
+{
+    @Before
+    public void setUp()
+    {
+        BufferPool.MEMORY_USAGE_THRESHOLD = 8 * 1024L * 1024L;
+        BufferPool.DISABLED = false;
+    }
+
+    @After
+    public void cleanUp()
+    {
+        BufferPool.reset();
+    }
+
+    @Test
+    public void testGetPut() throws InterruptedException
+    {
+        final int size = RandomAccessReader.DEFAULT_BUFFER_SIZE;
+
+        ByteBuffer buffer = BufferPool.get(size);
+        assertNotNull(buffer);
+        assertEquals(size, buffer.capacity());
+        assertEquals(true, buffer.isDirect());
+
+        BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertNotNull(chunk);
+        assertEquals(BufferPool.GlobalPool.MACRO_CHUNK_SIZE, BufferPool.sizeInBytes());
+
+        BufferPool.put(buffer);
+        assertEquals(null, BufferPool.currentChunk());
+        assertEquals(BufferPool.GlobalPool.MACRO_CHUNK_SIZE, BufferPool.sizeInBytes());
+    }
+
+
+    @Test
+    public void testPageAligned()
+    {
+        final int size = 1024;
+        for (int i = size;
+                 i <= BufferPool.CHUNK_SIZE;
+                 i += size)
+        {
+            checkPageAligned(i);
+        }
+    }
+
+    private void checkPageAligned(int size)
+    {
+        ByteBuffer buffer = BufferPool.get(size);
+        assertNotNull(buffer);
+        assertEquals(size, buffer.capacity());
+        assertTrue(buffer.isDirect());
+
+        long address = MemoryUtil.getAddress(buffer);
+        assertTrue((address % MemoryUtil.pageSize()) == 0);
+
+        BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testDifferentSizes() throws InterruptedException
+    {
+        final int size1 = 1024;
+        final int size2 = 2048;
+
+        ByteBuffer buffer1 = BufferPool.get(size1);
+        assertNotNull(buffer1);
+        assertEquals(size1, buffer1.capacity());
+
+        ByteBuffer buffer2 = BufferPool.get(size2);
+        assertNotNull(buffer2);
+        assertEquals(size2, buffer2.capacity());
+
+        BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertNotNull(chunk);
+        assertEquals(BufferPool.GlobalPool.MACRO_CHUNK_SIZE, BufferPool.sizeInBytes());
+
+        BufferPool.put(buffer1);
+        BufferPool.put(buffer2);
+
+        assertEquals(null, BufferPool.currentChunk());
+        assertEquals(BufferPool.GlobalPool.MACRO_CHUNK_SIZE, BufferPool.sizeInBytes());
+    }
+
+    @Test
+    public void testMaxMemoryExceededDirect()
+    {
+        boolean cur = BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED;
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = false;
+
+        requestDoubleMaxMemory();
+
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = cur;
+    }
+
+    @Test
+    public void testMaxMemoryExceededHeap()
+    {
+        boolean cur = BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED;
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = true;
+
+        requestDoubleMaxMemory();
+
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = cur;
+    }
+
+    @Test
+    public void testMaxMemoryExceeded_SameAsChunkSize()
+    {
+        BufferPool.MEMORY_USAGE_THRESHOLD = BufferPool.GlobalPool.MACRO_CHUNK_SIZE;
+        requestDoubleMaxMemory();
+    }
+
+    @Test
+    public void testMaxMemoryExceeded_SmallerThanChunkSize()
+    {
+        BufferPool.MEMORY_USAGE_THRESHOLD = BufferPool.GlobalPool.MACRO_CHUNK_SIZE / 2;
+        requestDoubleMaxMemory();
+    }
+
+    @Test
+    public void testRecycle()
+    {
+        requestUpToSize(RandomAccessReader.DEFAULT_BUFFER_SIZE, 3 * BufferPool.CHUNK_SIZE);
+    }
+
+    private void requestDoubleMaxMemory()
+    {
+        requestUpToSize(RandomAccessReader.DEFAULT_BUFFER_SIZE, (int)(2 * BufferPool.MEMORY_USAGE_THRESHOLD));
+    }
+
+    private void requestUpToSize(int bufferSize, int totalSize)
+    {
+        final int numBuffers = totalSize / bufferSize;
+
+        List<ByteBuffer> buffers = new ArrayList<>(numBuffers);
+        for (int i = 0; i < numBuffers; i++)
+        {
+            ByteBuffer buffer = BufferPool.get(bufferSize);
+            assertNotNull(buffer);
+            assertEquals(bufferSize, buffer.capacity());
+
+            if (BufferPool.sizeInBytes() > BufferPool.MEMORY_USAGE_THRESHOLD)
+                assertEquals(BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED, !buffer.isDirect());
+
+            buffers.add(buffer);
+        }
+
+        for (ByteBuffer buffer : buffers)
+            BufferPool.put(buffer);
+
+    }
+
+    @Test
+    public void testBigRequest()
+    {
+        final int size = BufferPool.CHUNK_SIZE + 1;
+
+        ByteBuffer buffer = BufferPool.get(size);
+        assertNotNull(buffer);
+        assertEquals(size, buffer.capacity());
+        BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testFillUpChunks()
+    {
+        final int size = RandomAccessReader.DEFAULT_BUFFER_SIZE;
+        final int numBuffers = BufferPool.CHUNK_SIZE / size;
+
+        List<ByteBuffer> buffers1 = new ArrayList<>(numBuffers);
+        List<ByteBuffer> buffers2 = new ArrayList<>(numBuffers);
+        for (int i = 0; i < numBuffers; i++)
+            buffers1.add(BufferPool.get(size));
+
+        BufferPool.Chunk chunk1 = BufferPool.currentChunk();
+        assertNotNull(chunk1);
+
+        for (int i = 0; i < numBuffers; i++)
+            buffers2.add(BufferPool.get(size));
+
+        assertEquals(2, BufferPool.numChunks());
+
+        for (ByteBuffer buffer : buffers1)
+            BufferPool.put(buffer);
+
+        assertEquals(1, BufferPool.numChunks());
+
+        for (ByteBuffer buffer : buffers2)
+            BufferPool.put(buffer);
+
+        assertEquals(0, BufferPool.numChunks());
+
+        buffers2.clear();
+    }
+
+    @Test
+    public void testOutOfOrderFrees()
+    {
+        final int size = 4096;
+        final int maxFreeSlots = BufferPool.CHUNK_SIZE / size;
+
+        final int[] idxs = new int[maxFreeSlots];
+        for (int i = 0; i < maxFreeSlots; i++)
+            idxs[i] = i;
+
+        doTestFrees(size, maxFreeSlots, idxs);
+    }
+
+    @Test
+    public void testInOrderFrees()
+    {
+        final int size = 4096;
+        final int maxFreeSlots = BufferPool.CHUNK_SIZE / size;
+
+        final int[] idxs = new int[maxFreeSlots];
+        for (int i = 0; i < maxFreeSlots; i++)
+            idxs[i] = maxFreeSlots - 1 - i;
+
+        doTestFrees(size, maxFreeSlots, idxs);
+    }
+
+    @Test
+    public void testRandomFrees()
+    {
+        doTestRandomFrees(12345567878L);
+
+        BufferPool.reset();
+        doTestRandomFrees(20452249587L);
+
+        BufferPool.reset();
+        doTestRandomFrees(82457252948L);
+
+        BufferPool.reset();
+        doTestRandomFrees(98759284579L);
+
+        BufferPool.reset();
+        doTestRandomFrees(19475257244L);
+    }
+
+    private void doTestRandomFrees(long seed)
+    {
+        final int size = 4096;
+        final int maxFreeSlots = BufferPool.CHUNK_SIZE / size;
+
+        final int[] idxs = new int[maxFreeSlots];
+        for (int i = 0; i < maxFreeSlots; i++)
+            idxs[i] = maxFreeSlots - 1 - i;
+
+        Random rnd = new Random();
+        rnd.setSeed(seed);
+        for (int i = idxs.length - 1; i > 0; i--)
+        {
+            int idx = rnd.nextInt(i+1);
+            int v = idxs[idx];
+            idxs[idx] = idxs[i];
+            idxs[i] = v;
+        }
+
+        doTestFrees(size, maxFreeSlots, idxs);
+    }
+
+    private void doTestFrees(final int size, final int maxFreeSlots, final int[] toReleaseIdxs)
+    {
+        List<ByteBuffer> buffers = new ArrayList<>(maxFreeSlots);
+        for (int i = 0; i < maxFreeSlots; i++)
+        {
+            buffers.add(BufferPool.get(size));
+        }
+
+        BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertFalse(chunk.isFree());
+
+        int freeSize = BufferPool.CHUNK_SIZE - maxFreeSlots * size;
+        assertEquals(freeSize, chunk.free());
+
+        for (int i : toReleaseIdxs)
+        {
+            ByteBuffer buffer = buffers.get(i);
+            assertNotNull(buffer);
+            assertEquals(size, buffer.capacity());
+
+            BufferPool.put(buffer);
+
+            freeSize += size;
+            if (freeSize == chunk.capacity())
+                assertEquals(0, chunk.free());
+            else
+                assertEquals(freeSize, chunk.free());
+        }
+
+        assertFalse(chunk.isFree());
+    }
+
+    @Test
+    public void testDifferentSizeBuffersOnOneChunk()
+    {
+        int[] sizes = new int[] {
+            5, 1024, 4096, 8, 16000, 78, 512, 256, 63, 55, 89, 90, 255, 32, 2048, 128
+        };
+
+        int sum = 0;
+        List<ByteBuffer> buffers = new ArrayList<>(sizes.length);
+        for (int i = 0; i < sizes.length; i++)
+        {
+            ByteBuffer buffer = BufferPool.get(sizes[i]);
+            assertNotNull(buffer);
+            assertTrue(buffer.capacity() >= sizes[i]);
+            buffers.add(buffer);
+
+            sum += BufferPool.currentChunk().roundUp(buffer.capacity());
+        }
+
+        // else the test will fail, adjust sizes as required
+        assertTrue(sum <= BufferPool.GlobalPool.MACRO_CHUNK_SIZE);
+
+        BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertNotNull(chunk);
+
+        Random rnd = new Random();
+        rnd.setSeed(298347529L);
+        while (!buffers.isEmpty())
+        {
+            int index = rnd.nextInt(buffers.size());
+            ByteBuffer buffer = buffers.remove(index);
+
+            BufferPool.put(buffer);
+        }
+
+        assertEquals(null, BufferPool.currentChunk());
+        assertEquals(0, chunk.free());
+    }
+
+    @Test
+    public void testChunkExhausted()
+    {
+        final int size = BufferPool.CHUNK_SIZE / 64; // 1kbit
+        int[] sizes = new int[128];
+        Arrays.fill(sizes, size);
+
+        int sum = 0;
+        List<ByteBuffer> buffers = new ArrayList<>(sizes.length);
+        for (int i = 0; i < sizes.length; i++)
+        {
+            ByteBuffer buffer = BufferPool.get(sizes[i]);
+            assertNotNull(buffer);
+            assertTrue(buffer.capacity() >= sizes[i]);
+            buffers.add(buffer);
+
+            sum += buffer.capacity();
+        }
+
+        // else the test will fail, adjust sizes as required
+        assertTrue(sum <= BufferPool.GlobalPool.MACRO_CHUNK_SIZE);
+
+        BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertNotNull(chunk);
+
+        for (int i = 0; i < sizes.length; i++)
+        {
+            BufferPool.put(buffers.get(i));
+        }
+
+        assertEquals(null, BufferPool.currentChunk());
+        assertEquals(0, chunk.free());
+    }
+
+    @Test
+    public void testCompactIfOutOfCapacity()
+    {
+        final int size = 4096;
+        final int numBuffersInChunk = BufferPool.GlobalPool.MACRO_CHUNK_SIZE / size;
+
+        List<ByteBuffer> buffers = new ArrayList<>(numBuffersInChunk);
+        Set<Long> addresses = new HashSet<>(numBuffersInChunk);
+
+        for (int i = 0; i < numBuffersInChunk; i++)
+        {
+            ByteBuffer buffer = BufferPool.get(size);
+            buffers.add(buffer);
+            addresses.add(MemoryUtil.getAddress(buffer));
+        }
+
+        for (int i = numBuffersInChunk - 1; i >= 0; i--)
+            BufferPool.put(buffers.get(i));
+
+        buffers.clear();
+
+        for (int i = 0; i < numBuffersInChunk; i++)
+        {
+            ByteBuffer buffer = BufferPool.get(size);
+            assertNotNull(buffer);
+            assertEquals(size, buffer.capacity());
+            addresses.remove(MemoryUtil.getAddress(buffer));
+
+            buffers.add(buffer);
+        }
+
+        assertTrue(addresses.isEmpty()); // all 5 released buffers were used
+
+        for (ByteBuffer buffer : buffers)
+            BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testHeapBuffer()
+    {
+        ByteBuffer buffer = BufferPool.get(1024, BufferType.ON_HEAP);
+        assertNotNull(buffer);
+        assertEquals(1024, buffer.capacity());
+        assertFalse(buffer.isDirect());
+        assertNotNull(buffer.array());
+        BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testSingleBufferOneChunk()
+    {
+        checkBuffer(0);
+
+        checkBuffer(1);
+        checkBuffer(2);
+        checkBuffer(4);
+        checkBuffer(5);
+        checkBuffer(8);
+        checkBuffer(16);
+        checkBuffer(32);
+        checkBuffer(64);
+
+        checkBuffer(65);
+        checkBuffer(127);
+        checkBuffer(128);
+
+        checkBuffer(129);
+        checkBuffer(255);
+        checkBuffer(256);
+
+        checkBuffer(512);
+        checkBuffer(1024);
+        checkBuffer(2048);
+        checkBuffer(4096);
+        checkBuffer(8192);
+        checkBuffer(16384);
+
+        checkBuffer(16385);
+        checkBuffer(32767);
+        checkBuffer(32768);
+
+        checkBuffer(32769);
+        checkBuffer(33172);
+        checkBuffer(33553);
+        checkBuffer(36000);
+        checkBuffer(65535);
+        checkBuffer(65536);
+
+        checkBuffer(65537);
+    }
+
+    private void checkBuffer(int size)
+    {
+        ByteBuffer buffer = BufferPool.get(size);
+        assertEquals(size, buffer.capacity());
+
+        if (size > 0 && size < BufferPool.CHUNK_SIZE)
+        {
+            BufferPool.Chunk chunk = BufferPool.currentChunk();
+            assertNotNull(chunk);
+            assertEquals(chunk.capacity(), chunk.free() + chunk.roundUp(size));
+        }
+
+        BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testMultipleBuffersOneChunk()
+    {
+        checkBuffers(32768, 33553);
+        checkBuffers(32768, 32768);
+        checkBuffers(48450, 33172);
+        checkBuffers(32768, 15682, 33172);
+    }
+
+    private void checkBuffers(int ... sizes)
+    {
+        List<ByteBuffer> buffers = new ArrayList<>(sizes.length);
+
+        for (int size : sizes)
+        {
+            ByteBuffer buffer = BufferPool.get(size);
+            assertEquals(size, buffer.capacity());
+
+            buffers.add(buffer);
+        }
+
+        for (ByteBuffer buffer : buffers)
+            BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testBuffersWithGivenSlots()
+    {
+        checkBufferWithGivenSlots(21241, (-1L << 27) ^ (1L << 40));
+    }
+
+    private void checkBufferWithGivenSlots(int size, long freeSlots)
+    {
+        //first allocate to make sure there is a chunk
+        ByteBuffer buffer = BufferPool.get(size);
+
+        // now get the current chunk and override the free slots mask
+        BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertNotNull(chunk);
+        long oldFreeSlots = chunk.setFreeSlots(freeSlots);
+
+        // now check we can still get the buffer with the free slots mask changed
+        ByteBuffer buffer2 = BufferPool.get(size);
+        assertEquals(size, buffer.capacity());
+        BufferPool.put(buffer2);
+
+        // reset the free slots
+        chunk.setFreeSlots(oldFreeSlots);
+        BufferPool.put(buffer);
+    }
+
+    @Test
+    public void testZeroSizeRequest()
+    {
+        ByteBuffer buffer = BufferPool.get(0);
+        assertNotNull(buffer);
+        assertEquals(0, buffer.capacity());
+        BufferPool.put(buffer);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testNegativeSizeRequest()
+    {
+        BufferPool.get(-1);
+    }
+
+    @Test
+    public void testBufferPoolDisabled()
+    {
+        BufferPool.DISABLED = true;
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = true;
+        ByteBuffer buffer = BufferPool.get(1024);
+        assertEquals(0, BufferPool.numChunks());
+        assertNotNull(buffer);
+        assertEquals(1024, buffer.capacity());
+        assertFalse(buffer.isDirect());
+        assertNotNull(buffer.array());
+        BufferPool.put(buffer);
+        assertEquals(0, BufferPool.numChunks());
+
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = false;
+        buffer = BufferPool.get(1024);
+        assertEquals(0, BufferPool.numChunks());
+        assertNotNull(buffer);
+        assertEquals(1024, buffer.capacity());
+        assertTrue(buffer.isDirect());
+        BufferPool.put(buffer);
+        assertEquals(0, BufferPool.numChunks());
+
+        // clean-up
+        BufferPool.DISABLED = false;
+        BufferPool.ALLOCATE_ON_HEAP_WHEN_EXAHUSTED = true;
+    }
+
+    @Test
+    public void testMT_SameSizeImmediateReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40, 1, true, RandomAccessReader.DEFAULT_BUFFER_SIZE);
+    }
+
+    @Test
+    public void testMT_SameSizePostponedReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40, 1, false, RandomAccessReader.DEFAULT_BUFFER_SIZE);
+    }
+
+    @Test
+    public void testMT_TwoSizesOneBufferImmediateReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40, 1, true, 1024, 2048);
+    }
+
+    @Test
+    public void testMT_TwoSizesOneBufferPostponedReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40, 1, false, 1024, 2048);
+    }
+
+    @Test
+    public void testMT_TwoSizesTwoBuffersImmediateReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40, 2, true, 1024, 2048);
+    }
+
+    @Test
+    public void testMT_TwoSizesTwoBuffersPostponedReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40, 2, false, 1024, 2048);
+    }
+
+    @Test
+    public void testMT_MultipleSizesOneBufferImmediateReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40,
+                             1,
+                             true,
+                             1024,
+                             2048,
+                             3072,
+                             4096,
+                             5120);
+    }
+
+    @Test
+    public void testMT_MultipleSizesOneBufferPostponedReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40,
+                             1,
+                             false,
+                             1024,
+                             2048,
+                             3072,
+                             4096,
+                             5120);
+    }
+
+    @Test
+    public void testMT_MultipleSizesMultipleBuffersImmediateReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40,
+                             4,
+                             true,
+                             1024,
+                             2048,
+                             3072,
+                             4096,
+                             5120);
+    }
+
+    @Test
+    public void testMT_MultipleSizesMultipleBuffersPostponedReturn() throws InterruptedException
+    {
+        checkMultipleThreads(40,
+                             3,
+                             false,
+                             1024,
+                             2048,
+                             3072,
+                             4096,
+                             5120);
+    }
+
+    private void checkMultipleThreads(int threadCount, int numBuffersPerThread, final boolean returnImmediately, final int ... sizes) throws InterruptedException
+    {
+        ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
+        final CountDownLatch finished = new CountDownLatch(threadCount);
+
+        for (int i = 0; i < threadCount; i++)
+        {
+            final int[] threadSizes = new int[numBuffersPerThread];
+            for (int j = 0; j < threadSizes.length; j++)
+                threadSizes[j] = sizes[(i * numBuffersPerThread + j) % sizes.length];
+
+            final Random rand = new Random();
+            executorService.submit(new Runnable()
+            {
+                @Override
+                public void run()
+                {
+                    try
+                    {
+                        Thread.sleep(rand.nextInt(3));
+
+                        List<ByteBuffer> toBeReturned = new ArrayList<ByteBuffer>(threadSizes.length);
+
+                        for (int j = 0; j < threadSizes.length; j++)
+                        {
+                            ByteBuffer buffer = BufferPool.get(threadSizes[j]);
+                            assertNotNull(buffer);
+                            assertEquals(threadSizes[j], buffer.capacity());
+
+                            for (int i = 0; i < 10; i++)
+                                buffer.putInt(i);
+
+                            buffer.rewind();
+
+                            Thread.sleep(rand.nextInt(3));
+
+                            for (int i = 0; i < 10; i++)
+                                assertEquals(i, buffer.getInt());
+
+                            if (returnImmediately)
+                                BufferPool.put(buffer);
+                            else
+                                toBeReturned.add(buffer);
+
+                            assertTrue(BufferPool.sizeInBytes() > 0);
+                        }
+
+                        Thread.sleep(rand.nextInt(3));
+
+                        for (ByteBuffer buffer : toBeReturned)
+                            BufferPool.put(buffer);
+                    }
+                    catch (Exception ex)
+                    {
+                        ex.printStackTrace();
+                        fail(ex.getMessage());
+                    }
+                    finally
+                    {
+                        finished.countDown();
+                    }
+                }
+            });
+        }
+
+        finished.await();
+        assertEquals(0, executorService.shutdownNow().size());
+
+        // Make sure thread local storage gets GC-ed
+        for (int i = 0; i < 5; i++)
+        {
+            System.gc();
+            Thread.sleep(100);
+        }
+    }
+
+    @Ignore
+    public void testMultipleThreadsReleaseSameBuffer() throws InterruptedException
+    {
+        doMultipleThreadsReleaseBuffers(45, 4096);
+    }
+
+    @Ignore
+    public void testMultipleThreadsReleaseDifferentBuffer() throws InterruptedException
+    {
+        doMultipleThreadsReleaseBuffers(45, 4096, 8192);
+    }
+
+    private void doMultipleThreadsReleaseBuffers(final int threadCount, final int ... sizes) throws InterruptedException
+    {
+        final ByteBuffer[] buffers = new ByteBuffer[sizes.length];
+        int sum = 0;
+        for (int i = 0; i < sizes.length; i++)
+        {
+            buffers[i] = BufferPool.get(sizes[i]);
+            assertNotNull(buffers[i]);
+            assertEquals(sizes[i], buffers[i].capacity());
+            sum += BufferPool.currentChunk().roundUp(buffers[i].capacity());
+        }
+
+        final BufferPool.Chunk chunk = BufferPool.currentChunk();
+        assertNotNull(chunk);
+        assertFalse(chunk.isFree());
+
+        // if we use multiple chunks the test will fail, adjust sizes accordingly
+        assertTrue(sum < BufferPool.GlobalPool.MACRO_CHUNK_SIZE);
+
+        ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
+        final CountDownLatch finished = new CountDownLatch(threadCount);
+
+        for (int i = 0; i < threadCount; i++)
+        {
+            final int idx = i % sizes.length;
+            final ByteBuffer buffer = buffers[idx];
+
+            executorService.submit(new Runnable()
+            {
+                @Override
+                public void run()
+                {
+                    try
+                    {
+                        assertNotSame(chunk, BufferPool.currentChunk());
+                        BufferPool.put(buffer);
+                    }
+                    catch (AssertionError ex)
+                    { //this is expected if we release a buffer more than once
+                        ex.printStackTrace();
+                    }
+                    catch (Throwable t)
+                    {
+                        t.printStackTrace();
+                        fail(t.getMessage());
+                    }
+                    finally
+                    {
+                        finished.countDown();
+                    }
+                }
+            });
+        }
+
+        finished.await();
+        assertEquals(0, executorService.shutdownNow().size());
+
+        executorService = null;
+
+        // Make sure thread local storage gets GC-ed
+        System.gc();
+        System.gc();
+        System.gc();
+
+        assertTrue(BufferPool.currentChunk().isFree());
+
+        //make sure the main thread can still allocate buffers
+        ByteBuffer buffer = BufferPool.get(sizes[0]);
+        assertNotNull(buffer);
+        assertEquals(sizes[0], buffer.capacity());
+        BufferPool.put(buffer);
+    }
+}

diff --git a/test/unit/org/apache/cassandra/utils/memory/NativeAllocatorTest.java b/test/unit/org/apache/cassandra/utils/memory/NativeAllocatorTest.java
index 7f87fcd..b636bf7 100644
--- a/test/unit/org/apache/cassandra/utils/memory/NativeAllocatorTest.java
+++ b/test/unit/org/apache/cassandra/utils/memory/NativeAllocatorTest.java

@@ -22,7 +22,6 @@
 import java.util.concurrent.atomic.AtomicReference;
 
 import com.google.common.util.concurrent.Uninterruptibles;
-
 import org.junit.Test;
 
 import junit.framework.Assert;

diff --git a/test/unit/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupportTest.java b/test/unit/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupportTest.java
index efa4a27..70fb5cc 100644
--- a/test/unit/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupportTest.java
+++ b/test/unit/org/apache/cassandra/utils/progress/jmx/LegacyJMXProgressSupportTest.java

@@ -18,9 +18,9 @@
 
 package org.apache.cassandra.utils.progress.jmx;
 
+import java.util.Optional;
 import java.util.UUID;
 
-import com.google.common.base.Optional;
 import org.junit.Test;
 
 import org.apache.cassandra.dht.Murmur3Partitioner;

diff --git a/test/unit/org/apache/cassandra/utils/vint/VIntCodingTest.java b/test/unit/org/apache/cassandra/utils/vint/VIntCodingTest.java
new file mode 100644
index 0000000..f08b181
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/vint/VIntCodingTest.java

@@ -0,0 +1,85 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.cassandra.utils.vint;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+public class VIntCodingTest
+{
+
+    @Test
+    public void testComputeSize() throws Exception
+    {
+        assertEncodedAtExpectedSize(0L, 1);
+
+        for (int size = 1 ; size < 8 ; size++)
+        {
+            assertEncodedAtExpectedSize((1L << 7 * size) - 1, size);
+            assertEncodedAtExpectedSize(1L << 7 * size, size + 1);
+        }
+        Assert.assertEquals(9, VIntCoding.computeUnsignedVIntSize(Long.MAX_VALUE));
+    }
+
+    private void assertEncodedAtExpectedSize(long value, int expectedSize) throws Exception
+    {
+        Assert.assertEquals(expectedSize, VIntCoding.computeUnsignedVIntSize(value));
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(baos);
+        VIntCoding.writeUnsignedVInt(value, dos);
+        dos.flush();
+        Assert.assertEquals( expectedSize, baos.toByteArray().length);
+
+        DataOutputBuffer dob = new DataOutputBuffer();
+        dob.writeUnsignedVInt(value);
+        Assert.assertEquals( expectedSize, dob.buffer().remaining());
+        dob.close();
+    }
+
+    @Test
+    public void testReadExtraBytesCount()
+    {
+        for (int i = 1 ; i < 8 ; i++)
+            Assert.assertEquals(i, VIntCoding.numberOfExtraBytesToRead((byte) ((0xFF << (8 - i)) & 0xFF)));
+    }
+
+    /*
+     * Quick sanity check that 1 byte encodes up to 127 as expected
+     */
+    @Test
+    public void testOneByteCapacity() throws Exception {
+        int biggestOneByte = 127;
+
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(baos);
+        VIntCoding.writeUnsignedVInt(biggestOneByte, dos);
+        dos.flush();
+        Assert.assertEquals( 1, baos.toByteArray().length);
+
+        DataOutputBuffer dob = new DataOutputBuffer();
+        dob.writeUnsignedVInt(biggestOneByte);
+        Assert.assertEquals( 1, dob.buffer().remaining());
+        dob.close();
+    }
+}

diff --git a/tools/bin/json2sstable b/tools/bin/json2sstable
deleted file mode 100755
index 1bed544..0000000
--- a/tools/bin/json2sstable
+++ /dev/null

@@ -1,61 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if [ "x$CASSANDRA_INCLUDE" = "x" ]; then
-    # Locations (in order) to use when searching for an include file.
-    for include in "`dirname "$0"`/cassandra.in.sh" \
-                   "$HOME/.cassandra.in.sh" \
-                   /usr/share/cassandra/cassandra.in.sh \
-                   /usr/local/share/cassandra/cassandra.in.sh \
-                   /opt/cassandra/cassandra.in.sh; do
-        if [ -r "$include" ]; then
-            . "$include"
-            break
-        fi
-    done
-elif [ -r "$CASSANDRA_INCLUDE" ]; then
-    . "$CASSANDRA_INCLUDE"
-fi
-
-# Use JAVA_HOME if set, otherwise look for java in PATH
-if [ -x "$JAVA_HOME/bin/java" ]; then
-    JAVA="$JAVA_HOME/bin/java"
-else
-    JAVA="`which java`"
-fi
-
-if [ "x$JAVA" = "x" ]; then
-    echo "Java executable not found (hint: set JAVA_HOME)" >&2
-    exit 1
-fi
-
-if [ -z "$CLASSPATH" ]; then
-    echo "You must set the CLASSPATH var" >&2
-    exit 1
-fi
-
-if [ "x$MAX_HEAP_SIZE" = "x" ]; then
-    MAX_HEAP_SIZE="256M"
-fi
-
-"$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" $JVM_OPTS -Xmx$MAX_HEAP_SIZE \
-        -Dcassandra.storagedir="$cassandra_storagedir" \
-        -Dlogback.configurationFile=logback-tools.xml \
-        org.apache.cassandra.tools.SSTableImport "$@"
-
-# vi:ai sw=4 ts=4 tw=0 et

diff --git a/tools/bin/json2sstable.bat b/tools/bin/json2sstable.bat
deleted file mode 100644
index db0fa91..0000000
--- a/tools/bin/json2sstable.bat
+++ /dev/null

@@ -1,48 +0,0 @@
-@REM

-@REM  Licensed to the Apache Software Foundation (ASF) under one or more

-@REM  contributor license agreements.  See the NOTICE file distributed with

-@REM  this work for additional information regarding copyright ownership.

-@REM  The ASF licenses this file to You under the Apache License, Version 2.0

-@REM  (the "License"); you may not use this file except in compliance with

-@REM  the License.  You may obtain a copy of the License at

-@REM

-@REM      http://www.apache.org/licenses/LICENSE-2.0

-@REM

-@REM  Unless required by applicable law or agreed to in writing, software

-@REM  distributed under the License is distributed on an "AS IS" BASIS,

-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-@REM  See the License for the specific language governing permissions and

-@REM  limitations under the License.

-

-@echo off

-if "%OS%" == "Windows_NT" setlocal

-

-pushd "%~dp0"

-call cassandra.in.bat

-

-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableImport

-if NOT DEFINED JAVA_HOME goto :err

-

-REM ***** JAVA options *****

-set JAVA_OPTS=^

- -Dlogback.configurationFile=logback-tools.xml

-

-set TOOLS_PARAMS=

-FOR %%A IN (%*) DO call :appendToolsParams %%A

-goto runTool

-

-:appendToolsParams

-set TOOLS_PARAMS=%TOOLS_PARAMS% %1

-goto :eof

-

-:runTool

-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %TOOLS_PARAMS%

-goto finally

-

-:err

-echo JAVA_HOME environment variable must be set!

-pause

-

-:finally

-

-ENDLOCAL


diff --git a/tools/bin/sstable2json.bat b/tools/bin/sstable2json.bat
deleted file mode 100644
index 17669c0..0000000
--- a/tools/bin/sstable2json.bat
+++ /dev/null

@@ -1,48 +0,0 @@
-@REM

-@REM  Licensed to the Apache Software Foundation (ASF) under one or more

-@REM  contributor license agreements.  See the NOTICE file distributed with

-@REM  this work for additional information regarding copyright ownership.

-@REM  The ASF licenses this file to You under the Apache License, Version 2.0

-@REM  (the "License"); you may not use this file except in compliance with

-@REM  the License.  You may obtain a copy of the License at

-@REM

-@REM      http://www.apache.org/licenses/LICENSE-2.0

-@REM

-@REM  Unless required by applicable law or agreed to in writing, software

-@REM  distributed under the License is distributed on an "AS IS" BASIS,

-@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-@REM  See the License for the specific language governing permissions and

-@REM  limitations under the License.

-

-@echo off

-if "%OS%" == "Windows_NT" setlocal

-

-pushd "%~dp0"

-call cassandra.in.bat

-

-if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableExport

-if NOT DEFINED JAVA_HOME goto :err

-

-REM ***** JAVA options *****

-set JAVA_OPTS=^

- -Dlogback.configurationFile=logback-tools.xml

-

-set TOOLS_PARAMS=

-FOR %%A IN (%*) DO call :appendToolsParams %%A

-goto runTool

-

-:appendToolsParams

-set TOOLS_PARAMS=%TOOLS_PARAMS% %1

-goto :eof

-

-:runTool

-"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %TOOLS_PARAMS%

-goto finally

-

-:err

-echo JAVA_HOME environment variable must be set!

-pause

-

-:finally

-

-ENDLOCAL


diff --git a/tools/bin/sstable2json b/tools/bin/sstabledump
similarity index 100%
rename from tools/bin/sstable2json
rename to tools/bin/sstabledump


diff --git a/tools/bin/sstabledump.bat b/tools/bin/sstabledump.bat
new file mode 100644
index 0000000..0a3a380
--- /dev/null
+++ b/tools/bin/sstabledump.bat

@@ -0,0 +1,48 @@
+@REM
+@REM  Licensed to the Apache Software Foundation (ASF) under one or more
+@REM  contributor license agreements.  See the NOTICE file distributed with
+@REM  this work for additional information regarding copyright ownership.
+@REM  The ASF licenses this file to You under the Apache License, Version 2.0
+@REM  (the "License"); you may not use this file except in compliance with
+@REM  the License.  You may obtain a copy of the License at
+@REM
+@REM      http://www.apache.org/licenses/LICENSE-2.0
+@REM
+@REM  Unless required by applicable law or agreed to in writing, software
+@REM  distributed under the License is distributed on an "AS IS" BASIS,
+@REM  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@REM  See the License for the specific language governing permissions and
+@REM  limitations under the License.
+
+@echo off
+if "%OS%" == "Windows_NT" setlocal
+
+pushd "%~dp0"
+call cassandra.in.bat
+
+if NOT DEFINED CASSANDRA_MAIN set CASSANDRA_MAIN=org.apache.cassandra.tools.SSTableExport
+if NOT DEFINED JAVA_HOME goto :err
+
+REM ***** JAVA options *****
+set JAVA_OPTS=^
+ -Dlogback.configurationFile=logback-tools.xml
+
+set TOOLS_PARAMS=
+FOR %%A IN (%*) DO call :appendToolsParams %%A
+goto runTool
+
+:appendToolsParams
+set TOOLS_PARAMS=%TOOLS_PARAMS% %1
+goto :eof
+
+:runTool
+"%JAVA_HOME%\bin\java" %JAVA_OPTS% %CASSANDRA_PARAMS% -cp %CASSANDRA_CLASSPATH% "%CASSANDRA_MAIN%" %TOOLS_PARAMS%
+goto finally
+
+:err
+echo JAVA_HOME environment variable must be set!
+pause
+
+:finally
+
+ENDLOCAL

diff --git a/tools/bin/token-generator b/tools/bin/token-generator
deleted file mode 100755
index 9ebef54..0000000
--- a/tools/bin/token-generator
+++ /dev/null

@@ -1,367 +0,0 @@
-#!/usr/bin/env python
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import with_statement
-
-# Py2/3 input compatibility
-try:
-    user_input = raw_input
-except NameError:
-    user_input = input
-import os
-import sys
-import math
-import optparse
-import webbrowser
-# Py2/3 quote compat
-try:
-    from urllib import quote
-except ImportError:
-    from urllib.parse import quote
-
-from time import sleep
-from itertools import cycle
-from tempfile import NamedTemporaryFile
-
-description = '''Given a list of numbers indicating the number of nodes
-in each separate datacenter, outputs a recommended list of tokens to use
-with Murmur3Partitioner (by default) or with RandomPartitioner.
-The list contains one token for each node in each datacenter.
-'''
-
-usage = "%prog <nodes_in_dc1> [<nodes_in_dc2> [...]]"
-
-parser = optparse.OptionParser(description=description, usage=usage)
-
-def part_murmur3(option, opt, value, parser):
-    parser.values.ringoffset=-(1<<63)
-    parser.values.ringrange=(1<<64)
-    return
-def part_random(option, opt, value, parser):
-    parser.values.ringoffset=0
-    parser.values.ringrange=(1<<127)
-    return
-parser.add_option('--murmur3', action='callback', callback=part_murmur3,
-                  help='Generate tokens for Murmur3Partitioner (default).')
-parser.add_option('--random', action='callback', callback=part_random,
-                  help='Generate tokens for RandomPartitioner.')
-parser.add_option('--ringoffset', type='int',
-                  help=optparse.SUPPRESS_HELP)
-parser.add_option('--ringrange', type='int',
-                  help=optparse.SUPPRESS_HELP)
-
-parser.add_option('--graph', action='store_true',
-                  help='Show a rendering of the generated tokens as line '
-                       'segments in a circle, colored according to datacenter')
-parser.add_option('-n', '--nts', action='store_const', dest='strat', const='nts',
-                  help=optparse.SUPPRESS_HELP)
-parser.add_option('-o', '--onts', action='store_const', dest='strat', const='onts',
-                  help=optparse.SUPPRESS_HELP)
-
-parser.add_option('--test', action='store_true',
-                  help='Run in test mode, outputting an HTML file to display '
-                       'various generated ring arrangements.')
-
-parser.add_option('--html-output', help=optparse.SUPPRESS_HELP)
-parser.add_option('--browser-wait-time', type='float', help=optparse.SUPPRESS_HELP)
-parser.add_option('--test-colors', help=optparse.SUPPRESS_HELP)
-parser.add_option('--test-graphsize', type='int', help=optparse.SUPPRESS_HELP)
-
-
-parser.set_defaults(
-    # default is Murmur3
-    ringoffset=-(1<<63),
-    ringrange=(1<<64),
-
-    # whether to create (and try to display) graph output
-    graph=False,
-
-    # 'nts' or 'onts'; the replication strategy for which to optimize
-    strat='nts',
-
-    # durr
-    test=False,
-
-    # size of the generated graph
-    graphsize=600,
-
-    # where to write generated graph (HTML) output, or '*tmp*' to write a
-    # temporary file, and remove it after telling a browser to open it. '-'
-    # to write to stdout.
-    html_output='*tmp*',
-
-    # how long, in seconds, to wait before cleaning up a temporary html file
-    # after telling the browser to open it
-    browser_wait_time=5.0,
-
-    # comma-separated list of HTML color codes, used in order to represent
-    # respective datacenter nodes
-    test_colors='#000,#00F,#0F0,#F00,#0FF,#FF0,#F0F',
-
-    # size of the per-test graphs
-    test_graphsize=200,
-)
-
-class Ring:
-    MIN_DC_OFFSET_DIVIDER = 235
-    offset_spacer = 2
-
-    def __init__(self, dc_counts, ringoffset, ringrange, strategy='nts'):
-        self.ringoffset = ringoffset
-        self.ringrange = ringrange
-        self.dc_counts = dc_counts
-        self.calculate_offset_tokens = getattr(self, 'calc_offset_tokens_' + strategy)
-
-    def best_per_dc_offset(self):
-        """
-        Calculate a per-dc offset for NTS DC spacing, such that there is a little
-        bit of room between nodes which would otherwise have been at the same token;
-        (hopefully) large enough that the difference can show when --graph is used,
-        but small enough that it there's no chance of the relative ordering changing.
-        """
-        lowest_division = len(self.dc_counts) * max(self.dc_counts) * self.offset_spacer
-        division = max(lowest_division, self.MIN_DC_OFFSET_DIVIDER)
-        return -self.ringrange // division
-
-    def bound_token(self, tok):
-        if tok < self.ringoffset:
-            tok += self.ringrange
-        return tok
-
-    def calc_offset_tokens_nts(self):
-        dc_offset = self.best_per_dc_offset()
-        dcs = []
-        for (dcnum, dccount) in enumerate(self.dc_counts):
-            offset = dcnum * dc_offset
-            arcsize = self.ringrange // (dccount or 1)
-            dcs.append(sorted([self.bound_token((n * arcsize + offset) - self.ringoffset % self.ringrange) for n in range(dccount)]))
-        return dcs
-
-    def calc_offset_tokens_onts(self):
-        dcs_by_count = sorted(enumerate(self.dc_counts), key=lambda d:d[1], reverse=True)
-        biggest = dcs_by_count[0][1]
-        nodes = [dcnum for (dcnum, dccount) in dcs_by_count for x in range(dccount)]
-        layout = [nodes[n] for i in range(biggest) for n in range(i, len(nodes), biggest)]
-
-        final = [[] for x in dcs_by_count]
-        for pos, dc in enumerate(layout):
-            final[dc].append(self.ringoffset + pos * self.ringrange // len(layout))
-        return final
-
-
-def print_tokens(tokens, tokenwidth, indent=0):
-    indentstr = ' ' * indent
-    for dcnum, toklist in enumerate(tokens):
-        print("%sDC #%d:" % (indentstr, dcnum + 1))
-        nwidth = len(str(len(toklist)))
-        for tnum, tok in enumerate(toklist):
-            print("%s  Node #%0*d: % *d" % (indentstr, nwidth, tnum + 1, tokenwidth, tok))
-
-def calculate_ideal_tokens(datacenters, ringoffset, ringrange, strategy):
-    return Ring(datacenters, ringoffset, ringrange, strategy).calculate_offset_tokens()
-
-def file_to_url(path):
-    path = os.path.abspath(path)
-    if os.name == 'nt':
-        host, path = os.path.splitunc(path)
-        drive, path = os.path.splitdrive(path)
-        path = (host or (drive + '|')) + path.replace(os.sep, '/')
-    return 'file://' + quote(path, safe='/')
-
-html_template = """<!DOCTYPE html>
-<html>
-<body>
-
-%(generated_body)s
-
-</body>
-</html>
-"""
-
-chart_template = """
-<canvas id="%(id)s" width="%(size)s" height="%(size)s" style="border:1px solid #c3c3c3;">
-    Your browser does not support the canvas element.
-</canvas>
-<script type="text/javascript">
-    var c=document.getElementById("%(id)s");
-    var ctx=c.getContext("2d");
-%(generated_script)s
-</script>
-"""
-
-chart_js_template = """
-    ctx.beginPath();
-    ctx.strokeStyle = "%(color)s";
-    ctx.moveTo(%(center)s,%(center)s);
-    ctx.lineTo(%(x)s,%(y)s);
-    ctx.stroke();
-    ctx.closePath();
-"""
-
-class RingRenderer:
-    border_fraction = 0.08
-
-    def __init__(self, ringrange, graphsize, colors):
-        self.ringrange = ringrange
-        self.graphsize = graphsize
-        self.colors = colors
-        self.anglefactor = 2 * math.pi / ringrange
-        self.linelength = graphsize * (1 - self.border_fraction) / 2
-        self.center = graphsize / 2
-
-    def calc_coords(self, tokens):
-        these_calcs = []
-
-        for toklist in tokens:
-            coordlist = []
-            for tok in toklist:
-                angle = tok * self.anglefactor
-                x2 = self.center + self.linelength * math.sin(angle)
-                y2 = self.center - self.linelength * math.cos(angle)
-                coordlist.append((x2, y2))
-            these_calcs.append(coordlist)
-
-        return these_calcs
-
-    def make_html(self, tokensets):
-        coordinate_sets = map(self.calc_coords, tokensets)
-        all_charts = []
-        for chart_index, chart_set in enumerate(coordinate_sets):
-            chart_code = []
-            for coordlist, color in zip(chart_set, cycle(self.colors)):
-                for x, y in coordlist:
-                    chart_code.append(chart_js_template
-                                          % dict(color=color, x=x, y=y,
-                                                 center=(self.graphsize / 2)))
-            this_chart = chart_template % dict(generated_script=''.join(chart_code),
-                                               id=chart_index, size=self.graphsize)
-            all_charts.append(this_chart)
-        return html_template % dict(generated_body=''.join(all_charts))
-
-# ===========================
-# Tests
-
-def run_tests(opts):
-    tests = [
-        [1],
-        [1, 1],
-        [2, 2],
-        [1, 2, 2],
-        [2, 2, 2],
-        [2, 0, 0],
-        [0, 2, 0],
-        [0, 0, 2],
-        [2, 2, 0],
-        [2, 0, 2],
-        [0, 2, 2],
-        [0, 0, 1, 1, 0, 1, 1],
-        [6],
-        [3, 3, 3],
-        [9],
-        [1,1,1,1],
-        [4],
-        [3,3,6,4,2]
-    ]
-
-    tokensets = []
-    for test in tests:
-        print("Test %r" % (test,))
-        tokens = calculate_ideal_tokens(test, opts.ringoffset, opts.ringrange, opts.strat)
-        print_tokens(tokens, len(str(opts.ringrange)) + 1, indent=2)
-        tokensets.append(tokens)
-    return tokensets
-
-# ===========================
-
-def display_html(html, wait_time):
-    with NamedTemporaryFile('wt', suffix='.html') as f:
-        f.write(html)
-        f.flush()
-        webbrowser.open(file_to_url(f.name), new=2)
-        # this is stupid. webbrowser.open really can't wait until the
-        # browser has said "yes I've got it"?
-        sleep(wait_time)
-
-def write_output(html, opts):
-    if opts.html_output == '-':
-        sys.stdout.write(html)
-    elif opts.html_output == '*tmp*':
-        display_html(html, opts.browser_wait_time)
-    else:
-        with open(opts.html_output, 'w') as f:
-            f.write(html)
-
-def readnum(prompt, min=None, max=None):
-    while True:
-        x = input(prompt + ' ')
-        try:
-            val = int(x)
-        except ValueError:
-            print("Oops, %r is not an integer. Try again.\n" % (x,))
-            continue
-        if min is not None and val < min:
-            print("Oops, the answer must be at least %d. Try again.\n" % (min,))
-        elif max is not None and val > max:
-            print("Oops, the answer must be at most %d. Try again.\n" % (max,))
-        else:
-            return val
-
-def get_dc_sizes_interactive():
-    print("Token Generator Interactive Mode")
-    print("--------------------------------")
-    print()
-    dcs = readnum(" How many datacenters will participate in this Cassandra cluster?", min=1)
-    sizes = []
-    for n in range(dcs):
-        sizes.append(readnum(" How many nodes are in datacenter #%d?" % (n + 1), min=0))
-    print()
-    return sizes
-
-def main(opts, args):
-    opts.colorlist = [s.strip() for s in opts.test_colors.split(',')]
-    if opts.test:
-        opts.graph = True
-        tokensets = run_tests(opts)
-        renderer = RingRenderer(ringrange=opts.ringrange, graphsize=opts.test_graphsize,
-                                colors=opts.colorlist)
-    else:
-        if len(args) == 0:
-            args = get_dc_sizes_interactive()
-        try:
-            datacenters = [int(arg) for arg in args]
-        except ValueError as e:
-            parser.error('Arguments should be integers.')
-        renderer = RingRenderer(ringrange=opts.ringrange, graphsize=opts.graphsize,
-                                colors=opts.colorlist)
-        tokens = calculate_ideal_tokens(datacenters, opts.ringoffset, opts.ringrange, opts.strat)
-        print_tokens(tokens, len(str(opts.ringrange)) + 1)
-        tokensets = [tokens]
-
-    if opts.graph:
-        html = renderer.make_html(tokensets)
-        write_output(html, opts)
-    return 0
-
-if __name__ == '__main__':
-    opts, args = parser.parse_args()
-    try:
-        res = main(opts, args)
-    except KeyboardInterrupt:
-        res = -128
-    sys.exit(res)

diff --git a/tools/bin/token-generator.bat b/tools/bin/token-generator.bat
deleted file mode 100644
index a7188db..0000000
--- a/tools/bin/token-generator.bat
+++ /dev/null

@@ -1,34 +0,0 @@
-@ECHO OFF

-@REM

-@REM Licensed to the Apache Software Foundation (ASF) under one or more

-@REM contributor license agreements. See the NOTICE file distributed with

-@REM this work for additional information regarding copyright ownership.

-@REM The ASF licenses this file to You under the Apache License, Version 2.0

-@REM (the "License"); you may not use this file except in compliance with

-@REM the License. You may obtain a copy of the License at

-@REM

-@REM http://www.apache.org/licenses/LICENSE-2.0

-@REM

-@REM Unless required by applicable law or agreed to in writing, software

-@REM distributed under the License is distributed on an "AS IS" BASIS,

-@REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-@REM See the License for the specific language governing permissions and

-@REM limitations under the License.

-

-@echo off

-

-if "%OS%" == "Windows_NT" setlocal

-

-python -V >nul 2>&1

-if ERRORLEVEL 1 goto err

-

-python "%~dp0\token-generator" %*

-goto finally

-

-:err

-echo Can't detect Python version!

-

-:finally

-

-ENDLOCAL

-


diff --git a/tools/cqlstress-example.yaml b/tools/cqlstress-example.yaml
index 3c60c32..835a4cb 100644
--- a/tools/cqlstress-example.yaml
+++ b/tools/cqlstress-example.yaml

@@ -93,3 +93,16 @@
    range1:
       cql: select * from typestest where name = ? and choice = ? and date >= ? LIMIT 100
       fields: multirow            # samerow or multirow (select arguments from the same row, or randomly from all rows in the partition)
+
+
+#
+# A list of bulk read queries that analytics tools may perform against the schema
+# Each query will sweep an entire token range, page by page.
+#
+token_range_queries:
+  all_columns_tr_query:
+    columns: '*'
+    page_size: 5000
+
+  value_tr_query:
+    columns: value

diff --git a/tools/stress/README.txt b/tools/stress/README.txt
index 0046b25..e560c08 100644
--- a/tools/stress/README.txt
+++ b/tools/stress/README.txt

@@ -72,6 +72,9 @@
         The port to connect to cassandra nodes on
     -sendto:
         Specify a stress server to send this command to
+    -tokenrange:
+        Token range settings
+
 
 Suboptions:
     Every command and primary option has its own collection of suboptions. These are too numerous to list here.

diff --git a/tools/stress/src/org/apache/cassandra/stress/Operation.java b/tools/stress/src/org/apache/cassandra/stress/Operation.java
index 1179f71..8054482 100644
--- a/tools/stress/src/org/apache/cassandra/stress/Operation.java
+++ b/tools/stress/src/org/apache/cassandra/stress/Operation.java

@@ -18,12 +18,9 @@
 package org.apache.cassandra.stress;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
 
 import com.google.common.util.concurrent.RateLimiter;
 
-import org.apache.cassandra.stress.generate.*;
 import org.apache.cassandra.stress.settings.SettingsLog;
 import org.apache.cassandra.stress.settings.StressSettings;
 import org.apache.cassandra.stress.util.JavaDriverClient;
@@ -36,42 +33,11 @@
 {
     public final StressSettings settings;
     public final Timer timer;
-    protected final DataSpec spec;
 
-    private final List<PartitionIterator> partitionCache = new ArrayList<>();
-    protected List<PartitionIterator> partitions;
-
-    public static final class DataSpec
-    {
-        public final PartitionGenerator partitionGenerator;
-        final SeedManager seedManager;
-        final Distribution partitionCount;
-        final RatioDistribution useRatio;
-        final Integer targetCount;
-
-        public DataSpec(PartitionGenerator partitionGenerator, SeedManager seedManager, Distribution partitionCount, Integer targetCount)
-        {
-            this(partitionGenerator, seedManager, partitionCount, null, targetCount);
-        }
-        public DataSpec(PartitionGenerator partitionGenerator, SeedManager seedManager, Distribution partitionCount, RatioDistribution useRatio)
-        {
-            this(partitionGenerator, seedManager, partitionCount, useRatio, null);
-        }
-        private DataSpec(PartitionGenerator partitionGenerator, SeedManager seedManager, Distribution partitionCount, RatioDistribution useRatio, Integer targetCount)
-        {
-            this.partitionGenerator = partitionGenerator;
-            this.seedManager = seedManager;
-            this.partitionCount = partitionCount;
-            this.useRatio = useRatio;
-            this.targetCount = targetCount;
-        }
-    }
-
-    public Operation(Timer timer, StressSettings settings, DataSpec spec)
+    public Operation(Timer timer, StressSettings settings)
     {
         this.timer = timer;
         this.settings = settings;
-        this.spec = spec;
     }
 
     public static interface RunOp
@@ -81,48 +47,7 @@
         public int rowCount();
     }
 
-    boolean ready(WorkManager permits, RateLimiter rateLimiter)
-    {
-        int partitionCount = (int) spec.partitionCount.next();
-        if (partitionCount <= 0)
-            return false;
-        partitionCount = permits.takePermits(partitionCount);
-        if (partitionCount <= 0)
-            return false;
-
-        int i = 0;
-        boolean success = true;
-        for (; i < partitionCount && success ; i++)
-        {
-            if (i >= partitionCache.size())
-                partitionCache.add(PartitionIterator.get(spec.partitionGenerator, spec.seedManager));
-
-            success = false;
-            while (!success)
-            {
-                Seed seed = spec.seedManager.next(this);
-                if (seed == null)
-                    break;
-
-                success = reset(seed, partitionCache.get(i));
-            }
-        }
-        partitionCount = i;
-
-        if (rateLimiter != null)
-            rateLimiter.acquire(partitionCount);
-
-        partitions = partitionCache.subList(0, partitionCount);
-        return !partitions.isEmpty();
-    }
-
-    protected boolean reset(Seed seed, PartitionIterator iterator)
-    {
-        if (spec.useRatio == null)
-            return iterator.reset(seed, spec.targetCount, isWrite());
-        else
-            return iterator.reset(seed, spec.useRatio.next(), isWrite());
-    }
+    public abstract boolean ready(WorkManager permits, RateLimiter rateLimiter);
 
     public boolean isWrite()
     {
@@ -195,13 +120,7 @@
 
     }
 
-    private String key()
-    {
-        List<String> keys = new ArrayList<>();
-        for (PartitionIterator partition : partitions)
-            keys.add(partition.getKeyAsString());
-        return keys.toString();
-    }
+    public abstract String key();
 
     protected String getExceptionMessage(Exception e)
     {

diff --git a/tools/stress/src/org/apache/cassandra/stress/Stress.java b/tools/stress/src/org/apache/cassandra/stress/Stress.java
index a4ec8a0..bc6d027 100644
--- a/tools/stress/src/org/apache/cassandra/stress/Stress.java
+++ b/tools/stress/src/org/apache/cassandra/stress/Stress.java

@@ -57,66 +57,78 @@
         if (FBUtilities.isWindows())
             WindowsTimer.startTimerPeriod(1);
 
-        final StressSettings settings;
         try
         {
-            settings = StressSettings.parse(arguments);
-        }
-        catch (IllegalArgumentException e)
-        {
-            printHelpMessage();
-            e.printStackTrace();
-            return;
-        }
 
-        PrintStream logout = settings.log.getOutput();
-
-        if (settings.sendToDaemon != null)
-        {
-            Socket socket = new Socket(settings.sendToDaemon, 2159);
-
-            ObjectOutputStream out = new ObjectOutputStream(socket.getOutputStream());
-            BufferedReader inp = new BufferedReader(new InputStreamReader(socket.getInputStream()));
-
-            Runtime.getRuntime().addShutdownHook(new ShutDown(socket, out));
-
-            out.writeObject(settings);
-
-            String line;
-
+            final StressSettings settings;
             try
             {
-                while (!socket.isClosed() && (line = inp.readLine()) != null)
-                {
-                    if (line.equals("END") || line.equals("FAILURE"))
-                    {
-                        out.writeInt(1);
-                        break;
-                    }
-
-                    logout.println(line);
-                }
+                settings = StressSettings.parse(arguments);
             }
-            catch (SocketException e)
+            catch (IllegalArgumentException e)
             {
-                if (!stopped)
-                    e.printStackTrace();
+                printHelpMessage();
+                e.printStackTrace();
+                return;
             }
 
-            out.close();
-            inp.close();
+            PrintStream logout = settings.log.getOutput();
 
-            socket.close();
+            if (settings.sendToDaemon != null)
+            {
+                Socket socket = new Socket(settings.sendToDaemon, 2159);
+
+                ObjectOutputStream out = new ObjectOutputStream(socket.getOutputStream());
+                BufferedReader inp = new BufferedReader(new InputStreamReader(socket.getInputStream()));
+
+                Runtime.getRuntime().addShutdownHook(new ShutDown(socket, out));
+
+                out.writeObject(settings);
+
+                String line;
+
+                try
+                {
+                    while (!socket.isClosed() && (line = inp.readLine()) != null)
+                    {
+                        if (line.equals("END") || line.equals("FAILURE"))
+                        {
+                            out.writeInt(1);
+                            break;
+                        }
+
+                        logout.println(line);
+                    }
+                }
+                catch (SocketException e)
+                {
+                    if (!stopped)
+                        e.printStackTrace();
+                }
+
+                out.close();
+                inp.close();
+
+                socket.close();
+            }
+            else
+            {
+                StressAction stressAction = new StressAction(settings, logout);
+                stressAction.run();
+            }
+
         }
-        else
+        catch (Throwable t)
         {
-            StressAction stressAction = new StressAction(settings, logout);
-            stressAction.run();
+            t.printStackTrace();
+        }
+        finally
+        {
+            if (FBUtilities.isWindows())
+                WindowsTimer.endTimerPeriod(1);
+            System.exit(0);
         }
 
-        if (FBUtilities.isWindows())
-            WindowsTimer.endTimerPeriod(1);
-        System.exit(0);
     }
 
     /**

diff --git a/tools/stress/src/org/apache/cassandra/stress/StressAction.java b/tools/stress/src/org/apache/cassandra/stress/StressAction.java
index 158a278..657117c 100644
--- a/tools/stress/src/org/apache/cassandra/stress/StressAction.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressAction.java

@@ -54,6 +54,13 @@
         // creating keyspace and column families
         settings.maybeCreateKeyspaces();
 
+        if (settings.command.count == 0)
+        {
+            output.println("N=0: SCHEMA CREATED, NOTHING ELSE DONE.");
+            settings.disconnect();
+            return;
+        }
+
         output.println("Sleeping 2s...");
         Uninterruptibles.sleepUninterruptibly(2, TimeUnit.SECONDS);
 
@@ -72,7 +79,7 @@
             success = runMulti(settings.rate.auto, rateLimiter);
         else
             success = null != run(settings.command.getFactory(settings), settings.rate.threadCount, settings.command.count,
-                                  settings.command.duration, rateLimiter, settings.command.durationUnits, output);
+                                  settings.command.duration, rateLimiter, settings.command.durationUnits, output, false);
 
         if (success)
             output.println("END");
@@ -85,12 +92,14 @@
     // type provided separately to support recursive call for mixed command with each command type it is performing
     private void warmup(OpDistributionFactory operations)
     {
-        // warmup - do 50k iterations; by default hotspot compiles methods after 10k invocations
         PrintStream warmupOutput = new PrintStream(new OutputStream() { @Override public void write(int b) throws IOException { } } );
-        int iterations = 50000 * settings.node.nodes.size();
+        // do 25% of iterations as warmup but no more than 50k (by default hotspot compiles methods after 10k invocations)
+        int iterations = (settings.command.count >= 0
+                          ? Math.min(50000, (int)(settings.command.count * 0.25))
+                          : 50000) * settings.node.nodes.size();
+        if (iterations <= 0) return;
+
         int threads = 100;
-        if (iterations > settings.command.count && settings.command.count > 0)
-            return;
 
         if (settings.rate.maxThreads > 0)
             threads = Math.min(threads, settings.rate.maxThreads);
@@ -102,7 +111,7 @@
             // we need to warm up all the nodes in the cluster ideally, but we may not be the only stress instance;
             // so warm up all the nodes we're speaking to only.
             output.println(String.format("Warming up %s with %d iterations...", single.desc(), iterations));
-            run(single, threads, iterations, 0, null, null, warmupOutput);
+            run(single, threads, iterations, 0, null, null, warmupOutput, true);
         }
     }
 
@@ -124,7 +133,7 @@
                 settings.command.truncateTables(settings);
 
             StressMetrics result = run(settings.command.getFactory(settings), threadCount, settings.command.count,
-                                       settings.command.duration, rateLimiter, settings.command.durationUnits, output);
+                                       settings.command.duration, rateLimiter, settings.command.durationUnits, output, false);
             if (result == null)
                 return false;
             results.add(result);
@@ -181,7 +190,14 @@
         return improvement / count;
     }
 
-    private StressMetrics run(OpDistributionFactory operations, int threadCount, long opCount, long duration, RateLimiter rateLimiter, TimeUnit durationUnits, PrintStream output)
+    private StressMetrics run(OpDistributionFactory operations,
+                              int threadCount,
+                              long opCount,
+                              long duration,
+                              RateLimiter rateLimiter,
+                              TimeUnit durationUnits,
+                              PrintStream output,
+                              boolean isWarmup)
     {
         output.println(String.format("Running %s with %d threads %s",
                                      operations.desc(),
@@ -199,10 +215,12 @@
 
         final CountDownLatch done = new CountDownLatch(threadCount);
         final Consumer[] consumers = new Consumer[threadCount];
+        int sampleCount = settings.samples.liveCount / threadCount;
         for (int i = 0; i < threadCount; i++)
         {
-            consumers[i] = new Consumer(operations, done, workManager, metrics, rateLimiter,
-                                        settings.samples.liveCount / threadCount);
+
+            consumers[i] = new Consumer(operations.get(metrics.getTiming(), sampleCount, isWarmup),
+                                        done, workManager, metrics, rateLimiter);
         }
 
         // starting worker threadCount
@@ -259,14 +277,17 @@
         private final WorkManager workManager;
         private final CountDownLatch done;
 
-        public Consumer(OpDistributionFactory operations, CountDownLatch done, WorkManager workManager, StressMetrics metrics,
-                        RateLimiter rateLimiter, int sampleCount)
+        public Consumer(OpDistribution operations,
+                        CountDownLatch done,
+                        WorkManager workManager,
+                        StressMetrics metrics,
+                        RateLimiter rateLimiter)
         {
             this.done = done;
             this.rateLimiter = rateLimiter;
             this.workManager = workManager;
             this.metrics = metrics;
-            this.operations = operations.get(metrics.getTiming(), sampleCount);
+            this.operations = operations;
         }
 
         public void run()

diff --git a/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java b/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java
index a640058..a4f280e 100644
--- a/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressMetrics.java

@@ -158,7 +158,7 @@
         TimingInterval current = result.intervals.combine(settings.samples.reportCount);
         TimingInterval history = timing.getHistory().combine(settings.samples.historyCount);
         rowRateUncertainty.update(current.adjustedRowRate());
-        if (current.partitionCount != 0)
+        if (current.operationCount != 0)
         {
             if (result.intervals.intervals().size() > 1)
             {

diff --git a/tools/stress/src/org/apache/cassandra/stress/StressProfile.java b/tools/stress/src/org/apache/cassandra/stress/StressProfile.java
index 410f666..0b0d4e9 100644
--- a/tools/stress/src/org/apache/cassandra/stress/StressProfile.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressProfile.java

@@ -35,12 +35,15 @@
 import com.datastax.driver.core.*;
 import com.datastax.driver.core.exceptions.AlreadyExistsException;
 import org.apache.cassandra.config.CFMetaData;
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.statements.CreateKeyspaceStatement;
+import org.apache.cassandra.cql3.statements.CreateTableStatement;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.stress.generate.*;
 import org.apache.cassandra.stress.generate.values.*;
+import org.apache.cassandra.stress.operations.userdefined.TokenRangeQuery;
 import org.apache.cassandra.stress.operations.userdefined.SchemaInsert;
 import org.apache.cassandra.stress.operations.userdefined.SchemaQuery;
 import org.apache.cassandra.stress.operations.userdefined.ValidatingSchemaQuery;
@@ -59,21 +62,25 @@
 {
     private String keyspaceCql;
     private String tableCql;
+    private List<String> extraSchemaDefinitions;
     private String seedStr;
 
     public String keyspaceName;
     public String tableName;
     private Map<String, GeneratorConfig> columnConfigs;
     private Map<String, StressYaml.QueryDef> queries;
+    public Map<String, StressYaml.TokenRangeQueryDef> tokenRangeQueries;
     private Map<String, String> insert;
 
     transient volatile TableMetadata tableMetaData;
+    transient volatile Set<TokenRange> tokenRanges;
 
     transient volatile GeneratorFactory generatorFactory;
 
     transient volatile BatchStatement.Type batchType;
     transient volatile DistributionFactory partitions;
     transient volatile RatioDistributionFactory selectchance;
+    transient volatile RatioDistributionFactory rowPopulation;
     transient volatile PreparedStatement insertStatement;
     transient volatile Integer thriftInsertId;
     transient volatile List<ValidatingSchemaQuery.Factory> validationFactories;
@@ -84,18 +91,29 @@
 
     private void init(StressYaml yaml) throws RequestValidationException
     {
+        // Use client mode. Otherwise, users with no read permission on /var/lib/commitlog won't be able to
+        // use cassandra-stress...
+        Config.setClientMode(true);
+
         keyspaceName = yaml.keyspace;
         keyspaceCql = yaml.keyspace_definition;
         tableName = yaml.table;
         tableCql = yaml.table_definition;
         seedStr = "seed for stress";
         queries = yaml.queries;
+        tokenRangeQueries = yaml.token_range_queries;
         insert = yaml.insert;
 
+        extraSchemaDefinitions = yaml.extra_definitions;
+
         assert keyspaceName != null : "keyspace name is required in yaml file";
         assert tableName != null : "table name is required in yaml file";
         assert queries != null : "queries map is required in yaml file";
 
+        for (String query : queries.keySet())
+            assert !tokenRangeQueries.containsKey(query) :
+                String.format("Found %s in both queries and token_range_queries, please use different names", query);
+
         if (keyspaceCql != null && keyspaceCql.length() > 0)
         {
             try
@@ -117,12 +135,12 @@
         {
             try
             {
-                String name = CFMetaData.compile(tableCql, keyspaceName).cfName;
+                String name = ((CreateTableStatement.RawStatement) QueryProcessor.parseStatement(tableCql)).columnFamily();
                 assert name.equalsIgnoreCase(tableName) : "Name in table_definition doesn't match table property: '" + name + "' != '" + tableName + "'";
             }
             catch (RuntimeException e)
             {
-                throw new IllegalArgumentException("There was a problem parsing the table cql: " + e.getCause().getMessage());
+                throw new IllegalArgumentException("There was a problem parsing the table cql: " + e.getMessage());
             }
         }
         else
@@ -168,7 +186,7 @@
             }
         }
 
-        client.execute("use "+keyspaceName, org.apache.cassandra.db.ConsistencyLevel.ONE);
+        client.execute("use " + keyspaceName, org.apache.cassandra.db.ConsistencyLevel.ONE);
 
         if (tableCql != null)
         {
@@ -184,6 +202,24 @@
             Uninterruptibles.sleepUninterruptibly(settings.node.nodes.size(), TimeUnit.SECONDS);
         }
 
+        if (extraSchemaDefinitions != null)
+        {
+            for (String extraCql : extraSchemaDefinitions)
+            {
+
+                try
+                {
+                    client.execute(extraCql, org.apache.cassandra.db.ConsistencyLevel.ONE);
+                }
+                catch (AlreadyExistsException e)
+                {
+                }
+            }
+
+            System.out.println(String.format("Created extra schema. Sleeping %ss for propagation.", settings.node.nodes.size()));
+            Uninterruptibles.sleepUninterruptibly(settings.node.nodes.size(), TimeUnit.SECONDS);
+        }
+
         maybeLoadSchemaInfo(settings);
     }
 
@@ -232,8 +268,48 @@
         }
     }
 
-    public SchemaQuery getQuery(String name, Timer timer, PartitionGenerator generator, SeedManager seeds, StressSettings settings)
+    public Set<TokenRange> maybeLoadTokenRanges(StressSettings settings)
     {
+        maybeLoadSchemaInfo(settings); // ensure table metadata is available
+
+        JavaDriverClient client = settings.getJavaDriverClient();
+        synchronized (client)
+        {
+            if (tokenRanges != null)
+                return tokenRanges;
+
+            Cluster cluster = client.getCluster();
+            Metadata metadata = cluster.getMetadata();
+            if (metadata == null)
+                throw new RuntimeException("Unable to get metadata");
+
+            List<TokenRange> sortedRanges = new ArrayList<>(metadata.getTokenRanges().size() + 1);
+            for (TokenRange range : metadata.getTokenRanges())
+            {
+                //if we don't unwrap we miss the partitions between ring min and smallest range start value
+                if (range.isWrappedAround())
+                    sortedRanges.addAll(range.unwrap());
+                else
+                    sortedRanges.add(range);
+            }
+
+            Collections.sort(sortedRanges);
+            tokenRanges = new LinkedHashSet<>(sortedRanges);
+            return tokenRanges;
+        }
+    }
+
+    public Operation getQuery(String name,
+                              Timer timer,
+                              PartitionGenerator generator,
+                              SeedManager seeds,
+                              StressSettings settings,
+                              boolean isWarmup)
+    {
+        name = name.toLowerCase();
+        if (!queries.containsKey(name))
+            throw new IllegalArgumentException("No query defined with name " + name);
+
         if (queryStatements == null)
         {
             synchronized (this)
@@ -274,13 +350,19 @@
             }
         }
 
-        name = name.toLowerCase();
-        if (!queryStatements.containsKey(name))
-            throw new IllegalArgumentException("No query defined with name " + name);
         return new SchemaQuery(timer, settings, generator, seeds, thriftQueryIds.get(name), queryStatements.get(name),
                                ThriftConversion.fromThrift(settings.command.consistencyLevel), argSelects.get(name));
     }
 
+    public Operation getBulkReadQueries(String name, Timer timer, StressSettings settings, TokenRangeIterator tokenRangeIterator, boolean isWarmup)
+    {
+        StressYaml.TokenRangeQueryDef def = tokenRangeQueries.get(name);
+        if (def == null)
+            throw new IllegalArgumentException("No bulk read query defined with name " + name);
+
+        return new TokenRangeQuery(timer, settings, tableMetaData, tokenRangeIterator, def, isWarmup);
+    }
+
     public SchemaInsert getInsert(Timer timer, PartitionGenerator generator, SeedManager seedManager, StressSettings settings)
     {
         if (insertStatement == null)
@@ -329,6 +411,11 @@
                             {
                                 case SET:
                                 case LIST:
+                                    if (c.getType().isFrozen())
+                                    {
+                                        sb.append("?");
+                                        break;
+                                    }
                                 case COUNTER:
                                     sb.append(c.getName()).append(" + ?");
                                     break;
@@ -348,6 +435,7 @@
 
                     partitions = select(settings.insert.batchsize, "partitions", "fixed(1)", insert, OptionDistribution.BUILDER);
                     selectchance = select(settings.insert.selectRatio, "select", "fixed(1)/1", insert, OptionRatioDistribution.BUILDER);
+                    rowPopulation = select(settings.insert.rowPopulationRatio, "row-population", "fixed(1)/1", insert, OptionRatioDistribution.BUILDER);
                     batchType = settings.insert.batchType != null
                                 ? settings.insert.batchType
                                 : !insert.containsKey("batchtype")
@@ -398,7 +486,7 @@
             }
         }
 
-        return new SchemaInsert(timer, settings, generator, seedManager, partitions.get(), selectchance.get(), thriftInsertId, insertStatement, ThriftConversion.fromThrift(settings.command.consistencyLevel), batchType);
+        return new SchemaInsert(timer, settings, generator, seedManager, partitions.get(), selectchance.get(), rowPopulation.get(), thriftInsertId, insertStatement, ThriftConversion.fromThrift(settings.command.consistencyLevel), batchType);
     }
 
     public List<ValidatingSchemaQuery> getValidate(Timer timer, PartitionGenerator generator, SeedManager seedManager, StressSettings settings)
@@ -429,6 +517,7 @@
             return first;
         if (val != null && val.trim().length() > 0)
             return builder.apply(val);
+        
         return builder.apply(defValue);
     }
 

diff --git a/tools/stress/src/org/apache/cassandra/stress/StressYaml.java b/tools/stress/src/org/apache/cassandra/stress/StressYaml.java
index b6efc5e..214e56a 100644
--- a/tools/stress/src/org/apache/cassandra/stress/StressYaml.java
+++ b/tools/stress/src/org/apache/cassandra/stress/StressYaml.java

@@ -20,6 +20,7 @@
  */
 package org.apache.cassandra.stress;
 
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -30,9 +31,12 @@
     public String table;
     public String table_definition;
 
+    public List<String> extra_definitions;
+
     public List<Map<String, Object>> columnspec;
     public Map<String, QueryDef> queries;
     public Map<String, String> insert;
+    public Map<String, TokenRangeQueryDef> token_range_queries = new HashMap<>();
 
     public static class QueryDef
     {
@@ -40,4 +44,10 @@
         public String fields;
     }
 
+    public static class TokenRangeQueryDef
+    {
+        public String columns;
+        public int page_size = 5000;
+    }
+
 }

diff --git a/tools/stress/src/org/apache/cassandra/stress/WorkManager.java b/tools/stress/src/org/apache/cassandra/stress/WorkManager.java
index c6a3eee..78d4176 100644
--- a/tools/stress/src/org/apache/cassandra/stress/WorkManager.java
+++ b/tools/stress/src/org/apache/cassandra/stress/WorkManager.java

@@ -2,7 +2,7 @@
 
 import java.util.concurrent.atomic.AtomicLong;
 
-interface WorkManager
+public interface WorkManager
 {
     // -1 indicates consumer should terminate
     int takePermits(int count);

diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/PartitionIterator.java b/tools/stress/src/org/apache/cassandra/stress/generate/PartitionIterator.java
index 4906b95..2157214 100644
--- a/tools/stress/src/org/apache/cassandra/stress/generate/PartitionIterator.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/PartitionIterator.java

@@ -52,7 +52,7 @@
 public abstract class PartitionIterator implements Iterator<Row>
 {
 
-    abstract boolean reset(double useChance, int targetCount, boolean isWrite, PartitionGenerator.Order order);
+    abstract boolean reset(double useChance, double rowPopulationRatio, int targetCount, boolean isWrite, PartitionGenerator.Order order);
     // picks random (inclusive) bounds to iterate, and returns them
     public abstract Pair<Row, Row> resetToBounds(Seed seed, int clusteringComponentDepth);
 
@@ -100,42 +100,47 @@
         this.idseed = idseed;
     }
 
-    public boolean reset(Seed seed, double useChance, boolean isWrite)
+    public boolean reset(Seed seed, double useChance, double rowPopulationRatio, boolean isWrite)
     {
         setSeed(seed);
         this.order = generator.order;
-        return reset(useChance, 0, isWrite, PartitionIterator.this.order);
+        return reset(useChance, rowPopulationRatio, 0, isWrite, PartitionIterator.this.order);
     }
 
-    public boolean reset(Seed seed, int targetCount, boolean isWrite)
+    public boolean reset(Seed seed, int targetCount, double rowPopulationRatio,  boolean isWrite)
     {
         setSeed(seed);
         this.order = generator.order;
-        return reset(Double.NaN, targetCount, isWrite, PartitionIterator.this.order);
+        return reset(Double.NaN, rowPopulationRatio,targetCount, isWrite,PartitionIterator.this.order);
     }
 
     static class SingleRowIterator extends PartitionIterator
     {
         boolean done;
         boolean isWrite;
+        double rowPopulationRatio;
+        final double totalValueColumns;
 
         private SingleRowIterator(PartitionGenerator generator, SeedManager seedManager)
         {
             super(generator, seedManager);
+
+            this.totalValueColumns = generator.valueComponents.size();
         }
 
         public Pair<Row, Row> resetToBounds(Seed seed, int clusteringComponentDepth)
         {
             assert clusteringComponentDepth == 0;
             setSeed(seed);
-            reset(1d, 1, false, PartitionGenerator.Order.SORTED);
+            reset(1d, 1d, 1, false, PartitionGenerator.Order.SORTED);
             return Pair.create(new Row(partitionKey), new Row(partitionKey));
         }
 
-        boolean reset(double useChance, int targetCount, boolean isWrite, PartitionGenerator.Order order)
+        boolean reset(double useChance, double rowPopulationRatio, int targetCount, boolean isWrite, PartitionGenerator.Order order)
         {
             done = false;
             this.isWrite = isWrite;
+            this.rowPopulationRatio = rowPopulationRatio;
             return true;
         }
 
@@ -148,11 +153,20 @@
         {
             if (done)
                 throw new NoSuchElementException();
+
+            double valueColumn = 0.0;
             for (int i = 0 ; i < row.row.length ; i++)
             {
-                Generator gen = generator.valueComponents.get(i);
-                gen.setSeed(idseed);
-                row.row[i] = gen.generate();
+                if (generator.permitNulls(i) && (++valueColumn/totalValueColumns) > rowPopulationRatio)
+                {
+                    row.row[i] = null;
+                }
+                else
+                {
+                    Generator gen = generator.valueComponents.get(i);
+                    gen.setSeed(idseed);
+                    row.row[i] = gen.generate();
+                }
             }
             done = true;
             if (isWrite)
@@ -180,6 +194,8 @@
 
         // probability any single row will be generated in this iteration
         double useChance;
+        double rowPopulationRatio;
+        final double totalValueColumns;
         // we want our chance of selection to be applied uniformly, so we compound the roll we make at each level
         // so that we know with what chance we reached there, and we adjust our roll at that level by that amount
         final double[] chancemodifier = new double[generator.clusteringComponents.size()];
@@ -201,6 +217,7 @@
                 clusteringComponents[i] = new ArrayDeque<>();
             rollmodifier[0] = 1f;
             chancemodifier[0] = generator.clusteringDescendantAverages[0];
+            this.totalValueColumns = generator.valueComponents.size();
         }
 
         /**
@@ -216,9 +233,10 @@
          *
          * @return true if there is data to return, false otherwise
          */
-        boolean reset(double useChance, int targetCount, boolean isWrite, PartitionGenerator.Order order)
+        boolean reset(double useChance, double rowPopulationRatio, int targetCount, boolean isWrite, PartitionGenerator.Order order)
         {
             this.isWrite = isWrite;
+            this.rowPopulationRatio = rowPopulationRatio;
 
             this.order = order;
             // set the seed for the first clustering component
@@ -291,7 +309,7 @@
             setUseChance(1d);
             if (clusteringComponentDepth == 0)
             {
-                reset(1d, -1, false, PartitionGenerator.Order.SORTED);
+                reset(1d, 1d, -1, false, PartitionGenerator.Order.SORTED);
                 return Pair.create(new Row(partitionKey), new Row(partitionKey));
             }
 
@@ -493,11 +511,20 @@
 
             Row result = row.copy();
             // and then fill the row with the _non-clustering_ values for the position we _were_ at, as this is what we'll deliver
+            double valueColumn = 0.0;
+
             for (int i = clusteringSeeds.length ; i < row.row.length ; i++)
             {
                 Generator gen = generator.valueComponents.get(i - clusteringSeeds.length);
-                gen.setSeed(rowSeed);
-                result.row[i] = gen.generate();
+                if (++valueColumn / totalValueColumns > rowPopulationRatio)
+                {
+                    result.row[i] = null;
+                }
+                else
+                {
+                    gen.setSeed(rowSeed);
+                    result.row[i] = gen.generate();
+                }
             }
 
             // then we advance the leaf level

diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java b/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java
index 9e2e65b..243ac30 100644
--- a/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/Seed.java

@@ -20,7 +20,7 @@
 
 import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 
-import org.apache.cassandra.stress.util.DynamicList;
+import org.apache.cassandra.utils.DynamicList;
 
 public class Seed implements Comparable<Seed>
 {

diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java b/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java
index 071d888..5020b45 100644
--- a/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/SeedManager.java

@@ -25,7 +25,7 @@
 
 import org.apache.cassandra.stress.Operation;
 import org.apache.cassandra.stress.settings.StressSettings;
-import org.apache.cassandra.stress.util.DynamicList;
+import org.apache.cassandra.utils.LockedDynamicList;
 
 public class SeedManager
 {
@@ -34,7 +34,7 @@
     final Generator writes;
     final Generator reads;
     final ConcurrentHashMap<Long, Seed> managing = new ConcurrentHashMap<>();
-    final DynamicList<Seed> sampleFrom;
+    final LockedDynamicList<Seed> sampleFrom;
     final Distribution sample;
     final long sampleOffset;
     final int sampleSize;
@@ -69,7 +69,7 @@
         long sampleSize = 1 + Math.max(sample.minValue(), sample.maxValue()) - sampleOffset;
         if (sampleOffset < 0 || sampleSize > Integer.MAX_VALUE)
             throw new IllegalArgumentException("sample range is invalid");
-        this.sampleFrom = new DynamicList<>((int) sampleSize);
+        this.sampleFrom = new LockedDynamicList<>((int) sampleSize);
         this.sample = DistributionInverted.invert(sample);
         this.sampleSize = (int) sampleSize;
         this.updateSampleImmediately = visits.average() > 1;

diff --git a/tools/stress/src/org/apache/cassandra/stress/generate/TokenRangeIterator.java b/tools/stress/src/org/apache/cassandra/stress/generate/TokenRangeIterator.java
new file mode 100644
index 0000000..5ddac61
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/generate/TokenRangeIterator.java

@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.stress.generate;
+
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import com.datastax.driver.core.TokenRange;
+import org.apache.cassandra.stress.settings.StressSettings;
+
+public class TokenRangeIterator
+{
+    private final Set<TokenRange> tokenRanges;
+    private final ConcurrentLinkedQueue<TokenRange> pendingRanges;
+    private final boolean wrap;
+
+    public TokenRangeIterator(StressSettings settings, Set<TokenRange> tokenRanges)
+    {
+        this.tokenRanges = maybeSplitRanges(tokenRanges, settings.tokenRange.splitFactor);
+        this.pendingRanges = new ConcurrentLinkedQueue<>(this.tokenRanges);
+        this.wrap = settings.tokenRange.wrap;
+    }
+
+    private static Set<TokenRange> maybeSplitRanges(Set<TokenRange> tokenRanges, int splitFactor)
+    {
+        if (splitFactor <= 1)
+            return tokenRanges;
+
+        Set<TokenRange> ret = new TreeSet<>();
+        for (TokenRange range : tokenRanges)
+            ret.addAll(range.splitEvenly(splitFactor));
+
+        return ret;
+    }
+
+    public void update()
+    {
+        // we may race and add to the queue twice but no bad consequence so it's fine if that happens
+        // as ultimately only the permits determine when to stop if wrap is true
+        if (wrap && pendingRanges.isEmpty())
+            pendingRanges.addAll(tokenRanges);
+    }
+
+    public TokenRange next()
+    {
+        return pendingRanges.poll();
+    }
+
+    public boolean exhausted()
+    {
+        return pendingRanges.isEmpty();
+    }
+}

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java b/tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java
index 7e13fcd..5fbb0f9 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/OpDistributionFactory.java

@@ -25,7 +25,7 @@
 
 public interface OpDistributionFactory
 {
-    public OpDistribution get(Timing timing, int sampleCount);
+    public OpDistribution get(Timing timing, int sampleCount, boolean isWarmup);
     public String desc();
     Iterable<OpDistributionFactory> each();
 }

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/PartitionOperation.java b/tools/stress/src/org/apache/cassandra/stress/operations/PartitionOperation.java
new file mode 100644
index 0000000..45c36f2
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/PartitionOperation.java

@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.stress.operations;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.util.concurrent.RateLimiter;
+
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.WorkManager;
+import org.apache.cassandra.stress.generate.Distribution;
+import org.apache.cassandra.stress.generate.PartitionGenerator;
+import org.apache.cassandra.stress.generate.PartitionIterator;
+import org.apache.cassandra.stress.generate.RatioDistribution;
+import org.apache.cassandra.stress.generate.Seed;
+import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.settings.OptionRatioDistribution;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.Timer;
+
+public abstract class PartitionOperation extends Operation
+{
+    protected final DataSpec spec;
+    private final static RatioDistribution defaultRowPopulationRatio = OptionRatioDistribution.BUILDER.apply("fixed(1)/1").get();
+
+    private final List<PartitionIterator> partitionCache = new ArrayList<>();
+    protected List<PartitionIterator> partitions;
+
+    public static final class DataSpec
+    {
+        public final PartitionGenerator partitionGenerator;
+        final SeedManager seedManager;
+        final Distribution partitionCount;
+        final RatioDistribution useRatio;
+        final RatioDistribution rowPopulationRatio;
+        final Integer targetCount;
+
+        public DataSpec(PartitionGenerator partitionGenerator, SeedManager seedManager, Distribution partitionCount, RatioDistribution rowPopulationRatio, Integer targetCount)
+        {
+            this(partitionGenerator, seedManager, partitionCount, null, rowPopulationRatio, targetCount);
+        }
+        public DataSpec(PartitionGenerator partitionGenerator, SeedManager seedManager, Distribution partitionCount, RatioDistribution useRatio, RatioDistribution rowPopulationRatio)
+        {
+            this(partitionGenerator, seedManager, partitionCount, useRatio, rowPopulationRatio, null);
+        }
+        private DataSpec(PartitionGenerator partitionGenerator, SeedManager seedManager, Distribution partitionCount, RatioDistribution useRatio, RatioDistribution rowPopulationRatio, Integer targetCount)
+        {
+            this.partitionGenerator = partitionGenerator;
+            this.seedManager = seedManager;
+            this.partitionCount = partitionCount;
+            this.useRatio = useRatio;
+            this.rowPopulationRatio = rowPopulationRatio == null ? defaultRowPopulationRatio : rowPopulationRatio;
+            this.targetCount = targetCount;
+        }
+    }
+
+    public PartitionOperation(Timer timer, StressSettings settings, DataSpec spec)
+    {
+        super(timer, settings);
+        this.spec = spec;
+    }
+
+    public boolean ready(WorkManager permits, RateLimiter rateLimiter)
+    {
+        int partitionCount = (int) spec.partitionCount.next();
+        if (partitionCount <= 0)
+            return false;
+        partitionCount = permits.takePermits(partitionCount);
+        if (partitionCount <= 0)
+            return false;
+
+        int i = 0;
+        boolean success = true;
+        for (; i < partitionCount && success ; i++)
+        {
+            if (i >= partitionCache.size())
+                partitionCache.add(PartitionIterator.get(spec.partitionGenerator, spec.seedManager));
+
+            success = false;
+            while (!success)
+            {
+                Seed seed = spec.seedManager.next(this);
+                if (seed == null)
+                    break;
+
+                success = reset(seed, partitionCache.get(i));
+            }
+        }
+        partitionCount = i;
+
+        if (rateLimiter != null)
+            rateLimiter.acquire(partitionCount);
+
+        partitions = partitionCache.subList(0, partitionCount);
+        return !partitions.isEmpty();
+    }
+
+    protected boolean reset(Seed seed, PartitionIterator iterator)
+    {
+        if (spec.useRatio == null)
+            return iterator.reset(seed, spec.targetCount, spec.rowPopulationRatio.next(), isWrite());
+        else
+            return iterator.reset(seed, spec.useRatio.next(), spec.rowPopulationRatio.next(), isWrite());
+    }
+
+    public String key()
+    {
+        List<String> keys = new ArrayList<>();
+        for (PartitionIterator partition : partitions)
+            keys.add(partition.getKeyAsString());
+        return keys.toString();
+    }
+}

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java b/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java
index 194f84f..a10585d 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/SampledOpDistributionFactory.java

@@ -44,16 +44,17 @@
         this.clustering = clustering;
     }
 
-    protected abstract List<? extends Operation> get(Timer timer, PartitionGenerator generator, T key);
+    protected abstract List<? extends Operation> get(Timer timer, PartitionGenerator generator, T key, boolean isWarmup);
     protected abstract PartitionGenerator newGenerator();
 
-    public OpDistribution get(Timing timing, int sampleCount)
+    public OpDistribution get(Timing timing, int sampleCount, boolean isWarmup)
     {
         PartitionGenerator generator = newGenerator();
         List<Pair<Operation, Double>> operations = new ArrayList<>();
         for (Map.Entry<T, Double> ratio : ratios.entrySet())
         {
-            List<? extends Operation> ops = get(timing.newTimer(ratio.getKey().toString(), sampleCount), generator, ratio.getKey());
+            List<? extends Operation> ops = get(timing.newTimer(ratio.getKey().toString(), sampleCount),
+                                                generator, ratio.getKey(), isWarmup);
             for (Operation op : ops)
                 operations.add(new Pair<>(op, ratio.getValue() / ops.size()));
         }
@@ -75,15 +76,18 @@
         {
             out.add(new OpDistributionFactory()
             {
-                public OpDistribution get(Timing timing, int sampleCount)
+                public OpDistribution get(Timing timing, int sampleCount, boolean isWarmup)
                 {
-                    List<? extends Operation> ops = SampledOpDistributionFactory.this.get(timing.newTimer(ratio.getKey().toString(), sampleCount), newGenerator(), ratio.getKey());
+                    List<? extends Operation> ops = SampledOpDistributionFactory.this.get(timing.newTimer(ratio.getKey().toString(), sampleCount),
+                                                                                          newGenerator(),
+                                                                                          ratio.getKey(),
+                                                                                          isWarmup);
                     if (ops.size() == 1)
                         return new FixedOpDistribution(ops.get(0));
                     List<Pair<Operation, Double>> ratios = new ArrayList<>();
                     for (Operation op : ops)
                         ratios.add(new Pair<>(op, 1d / ops.size()));
-                    return new SampledOpDistribution(new EnumeratedDistribution<Operation>(ratios), new DistributionFixed(1));
+                    return new SampledOpDistribution(new EnumeratedDistribution<>(ratios), new DistributionFixed(1));
                 }
 
                 public String desc()

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java
index 89298ab..3767401 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java

@@ -25,6 +25,7 @@
 
 import org.apache.cassandra.stress.Operation;
 import org.apache.cassandra.stress.generate.*;
+import org.apache.cassandra.stress.operations.PartitionOperation;
 import org.apache.cassandra.stress.settings.Command;
 import org.apache.cassandra.stress.settings.CqlVersion;
 import org.apache.cassandra.stress.settings.StressSettings;
@@ -32,7 +33,7 @@
 import org.apache.cassandra.thrift.SlicePredicate;
 import org.apache.cassandra.thrift.SliceRange;
 
-public abstract class PredefinedOperation extends Operation
+public abstract class PredefinedOperation extends PartitionOperation
 {
     public final Command type;
     private final Distribution columnCount;
@@ -40,14 +41,14 @@
 
     public PredefinedOperation(Command type, Timer timer, PartitionGenerator generator, SeedManager seedManager, StressSettings settings)
     {
-        super(timer, settings, spec(generator, seedManager));
+        super(timer, settings, spec(generator, seedManager, settings.insert.rowPopulationRatio.get()));
         this.type = type;
         this.columnCount = settings.columns.countDistribution.get();
     }
 
-    private static DataSpec spec(PartitionGenerator generator, SeedManager seedManager)
+    private static DataSpec spec(PartitionGenerator generator, SeedManager seedManager, RatioDistribution rowPopulationCount)
     {
-        return new DataSpec(generator, seedManager, new DistributionFixed(1), 1);
+        return new DataSpec(generator, seedManager, new DistributionFixed(1), rowPopulationCount, 1);
     }
 
     public boolean isCql3()

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java
index ef4d53f..d9fcac8 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaInsert.java

@@ -41,9 +41,9 @@
 
     private final BatchStatement.Type batchType;
 
-    public SchemaInsert(Timer timer, StressSettings settings, PartitionGenerator generator, SeedManager seedManager, Distribution batchSize, RatioDistribution useRatio, Integer thriftId, PreparedStatement statement, ConsistencyLevel cl, BatchStatement.Type batchType)
+    public SchemaInsert(Timer timer, StressSettings settings, PartitionGenerator generator, SeedManager seedManager, Distribution batchSize, RatioDistribution useRatio, RatioDistribution rowPopulation, Integer thriftId, PreparedStatement statement, ConsistencyLevel cl, BatchStatement.Type batchType)
     {
-        super(timer, settings, new DataSpec(generator, seedManager, batchSize, useRatio), statement, thriftId, cl);
+        super(timer, settings, new DataSpec(generator, seedManager, batchSize, useRatio, rowPopulation), statement, thriftId, cl);
         this.batchType = batchType;
     }
 

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java
index 58f5307..9b5c4ae 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaQuery.java

@@ -53,7 +53,7 @@
 
     public SchemaQuery(Timer timer, StressSettings settings, PartitionGenerator generator, SeedManager seedManager, Integer thriftId, PreparedStatement statement, ConsistencyLevel cl, ArgSelect argSelect)
     {
-        super(timer, settings, new DataSpec(generator, seedManager, new DistributionFixed(1), argSelect == ArgSelect.MULTIROW ? statement.getVariables().size() : 1), statement, thriftId, cl);
+        super(timer, settings, new DataSpec(generator, seedManager, new DistributionFixed(1), settings.insert.rowPopulationRatio.get(), argSelect == ArgSelect.MULTIROW ? statement.getVariables().size() : 1), statement, thriftId, cl);
         this.argSelect = argSelect;
         randomBuffer = new Object[argumentIndex.length][argumentIndex.length];
     }

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java
index 49891ec..c9ead12 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/SchemaStatement.java

@@ -34,12 +34,13 @@
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.stress.Operation;
 import org.apache.cassandra.stress.generate.Row;
+import org.apache.cassandra.stress.operations.PartitionOperation;
 import org.apache.cassandra.stress.settings.StressSettings;
 import org.apache.cassandra.stress.util.JavaDriverClient;
 import org.apache.cassandra.stress.util.Timer;
 import org.apache.cassandra.transport.SimpleClient;
 
-public abstract class SchemaStatement extends Operation
+public abstract class SchemaStatement extends PartitionOperation
 {
 
     final PreparedStatement statement;

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/TokenRangeQuery.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/TokenRangeQuery.java
new file mode 100644
index 0000000..60a6c48
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/TokenRangeQuery.java

@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.stress.operations.userdefined;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import javax.naming.OperationNotSupportedException;
+
+import com.google.common.util.concurrent.RateLimiter;
+
+import com.datastax.driver.core.ColumnMetadata;
+import com.datastax.driver.core.PagingState;
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import com.datastax.driver.core.SimpleStatement;
+import com.datastax.driver.core.Statement;
+import com.datastax.driver.core.TableMetadata;
+import com.datastax.driver.core.Token;
+import com.datastax.driver.core.TokenRange;
+import org.apache.cassandra.stress.Operation;
+import org.apache.cassandra.stress.StressYaml;
+import org.apache.cassandra.stress.WorkManager;
+import org.apache.cassandra.stress.generate.TokenRangeIterator;
+import org.apache.cassandra.stress.settings.StressSettings;
+import org.apache.cassandra.stress.util.JavaDriverClient;
+import org.apache.cassandra.stress.util.ThriftClient;
+import org.apache.cassandra.stress.util.Timer;
+
+public class TokenRangeQuery extends Operation
+{
+    private final ThreadLocal<State> currentState = new ThreadLocal<>();
+
+    private final TableMetadata tableMetadata;
+    private final TokenRangeIterator tokenRangeIterator;
+    private final String columns;
+    private final int pageSize;
+    private final boolean isWarmup;
+
+    public TokenRangeQuery(Timer timer,
+                           StressSettings settings,
+                           TableMetadata tableMetadata,
+                           TokenRangeIterator tokenRangeIterator,
+                           StressYaml.TokenRangeQueryDef def,
+                           boolean isWarmup)
+    {
+        super(timer, settings);
+        this.tableMetadata = tableMetadata;
+        this.tokenRangeIterator = tokenRangeIterator;
+        this.columns = sanitizeColumns(def.columns, tableMetadata);
+        this.pageSize = isWarmup ? Math.min(100, def.page_size) : def.page_size;
+        this.isWarmup = isWarmup;
+    }
+
+    /**
+     * We need to specify the columns by name because we need to add token(partition_keys) in order to count
+     * partitions. So if the user specifies '*' then replace it with a list of all columns.
+     */
+    private static String sanitizeColumns(String columns, TableMetadata tableMetadata)
+    {
+        if (!columns.equals("*"))
+            return columns;
+
+        return String.join(", ", tableMetadata.getColumns().stream().map(ColumnMetadata::getName).collect(Collectors.toList()));
+    }
+
+    /**
+     * The state of a token range currently being retrieved.
+     * Here we store the paging state to retrieve more pages
+     * and we keep track of which partitions have already been retrieved,
+     */
+    private final static class State
+    {
+        public final TokenRange tokenRange;
+        public final String query;
+        public PagingState pagingState;
+        public Set<Token> partitions = new HashSet<>();
+
+        public State(TokenRange tokenRange, String query)
+        {
+            this.tokenRange = tokenRange;
+            this.query = query;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("[%s, %s]", tokenRange.getStart(), tokenRange.getEnd());
+        }
+    }
+
+    abstract static class Runner implements RunOp
+    {
+        int partitionCount;
+        int rowCount;
+
+        @Override
+        public int partitionCount()
+        {
+            return partitionCount;
+        }
+
+        @Override
+        public int rowCount()
+        {
+            return rowCount;
+        }
+    }
+
+    private class JavaDriverRun extends Runner
+    {
+        final JavaDriverClient client;
+
+        private JavaDriverRun(JavaDriverClient client)
+        {
+            this.client = client;
+        }
+
+        public boolean run() throws Exception
+        {
+            State state = currentState.get();
+            if (state == null)
+            { // start processing a new token range
+                TokenRange range = tokenRangeIterator.next();
+                if (range == null)
+                    return true; // no more token ranges to process
+
+                state = new State(range, buildQuery(range));
+                currentState.set(state);
+            }
+
+            ResultSet results;
+            Statement statement = new SimpleStatement(state.query);
+            statement.setFetchSize(pageSize);
+
+            if (state.pagingState != null)
+                statement.setPagingState(state.pagingState);
+
+            results = client.getSession().execute(statement);
+            state.pagingState = results.getExecutionInfo().getPagingState();
+
+            int remaining = results.getAvailableWithoutFetching();
+            rowCount += remaining;
+
+            for (Row row : results)
+            {
+                // this call will only succeed if we've added token(partition keys) to the query
+                Token partition = row.getPartitionKeyToken();
+                if (!state.partitions.contains(partition))
+                {
+                    partitionCount += 1;
+                    state.partitions.add(partition);
+                }
+
+                if (--remaining == 0)
+                    break;
+            }
+
+            if (results.isExhausted() || isWarmup)
+            { // no more pages to fetch or just warming up, ready to move on to another token range
+                currentState.set(null);
+            }
+
+            return true;
+        }
+    }
+
+    private String buildQuery(TokenRange tokenRange)
+    {
+        Token start = tokenRange.getStart();
+        Token end = tokenRange.getEnd();
+        List<String> pkColumns = tableMetadata.getPartitionKey().stream().map(ColumnMetadata::getName).collect(Collectors.toList());
+        String tokenStatement = String.format("token(%s)", String.join(", ", pkColumns));
+
+        StringBuilder ret = new StringBuilder();
+        ret.append("SELECT ");
+        ret.append(tokenStatement); // add the token(pk) statement so that we can count partitions
+        ret.append(", ");
+        ret.append(columns);
+        ret.append(" FROM ");
+        ret.append(tableMetadata.getName());
+        if (start != null || end != null)
+            ret.append(" WHERE ");
+        if (start != null)
+        {
+            ret.append(tokenStatement);
+            ret.append(" > ");
+            ret.append(start.toString());
+        }
+
+        if (start != null && end != null)
+            ret.append(" AND ");
+
+        if (end != null)
+        {
+            ret.append(tokenStatement);
+            ret.append(" <= ");
+            ret.append(end.toString());
+        }
+
+        return ret.toString();
+    }
+
+    private static class ThriftRun extends Runner
+    {
+        final ThriftClient client;
+
+        private ThriftRun(ThriftClient client)
+        {
+            this.client = client;
+        }
+
+        public boolean run() throws Exception
+        {
+            throw new OperationNotSupportedException("Bulk read over thrift not supported");
+        }
+    }
+
+
+    @Override
+    public void run(JavaDriverClient client) throws IOException
+    {
+        timeWithRetry(new JavaDriverRun(client));
+    }
+
+    @Override
+    public void run(ThriftClient client) throws IOException
+    {
+        timeWithRetry(new ThriftRun(client));
+    }
+
+    public boolean ready(WorkManager workManager, RateLimiter rateLimiter)
+    {
+        tokenRangeIterator.update();
+
+        if (tokenRangeIterator.exhausted() && currentState.get() == null)
+            return false;
+
+        int numLeft = workManager.takePermits(1);
+        if (rateLimiter != null && numLeft > 0 )
+            rateLimiter.acquire(numLeft);
+
+        return numLeft > 0;
+    }
+
+    public String key()
+    {
+        State state = currentState.get();
+        return state == null ? "-" : state.toString();
+    }
+}

diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/ValidatingSchemaQuery.java b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/ValidatingSchemaQuery.java
index 1b10fcf..02a9ca8 100644
--- a/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/ValidatingSchemaQuery.java
+++ b/tools/stress/src/org/apache/cassandra/stress/operations/userdefined/ValidatingSchemaQuery.java

@@ -34,6 +34,7 @@
 import org.apache.cassandra.stress.Operation;
 import org.apache.cassandra.stress.generate.*;
 import org.apache.cassandra.stress.generate.Row;
+import org.apache.cassandra.stress.operations.PartitionOperation;
 import org.apache.cassandra.stress.settings.StressSettings;
 import org.apache.cassandra.stress.util.JavaDriverClient;
 import org.apache.cassandra.stress.util.ThriftClient;
@@ -46,7 +47,7 @@
 import org.apache.cassandra.utils.Pair;
 import org.apache.thrift.TException;
 
-public class ValidatingSchemaQuery extends Operation
+public class ValidatingSchemaQuery extends PartitionOperation
 {
     final Random random = new Random();
     private Pair<Row, Row> bounds;
@@ -65,7 +66,7 @@
 
     private ValidatingSchemaQuery(Timer timer, StressSettings settings, PartitionGenerator generator, SeedManager seedManager, ValidatingStatement[] statements, ConsistencyLevel cl, int clusteringComponents)
     {
-        super(timer, settings, new DataSpec(generator, seedManager, new DistributionFixed(1), 1));
+        super(timer, settings, new DataSpec(generator, seedManager, new DistributionFixed(1), settings.insert.rowPopulationRatio.get(), 1));
         this.statements = statements;
         this.cl = cl;
         argumentIndex = new int[statements[0].statement.getVariables().size()];

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java b/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java
index 5ec56dc..eb286ee 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/CliOption.java

@@ -38,7 +38,8 @@
     LOG("Where to log progress to, and the interval at which to do it", SettingsLog.helpPrinter()),
     TRANSPORT("Custom transport factories", SettingsTransport.helpPrinter()),
     PORT("The port to connect to cassandra nodes on", SettingsPort.helpPrinter()),
-    SENDTO("-send-to", "Specify a stress server to send this command to", SettingsMisc.sendToDaemonHelpPrinter())
+    SENDTO("-send-to", "Specify a stress server to send this command to", SettingsMisc.sendToDaemonHelpPrinter()),
+    TOKENRANGE("Token range settings", SettingsTokenRange.helpPrinter())
     ;
 
     private static final Map<String, CliOption> LOOKUP;

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java b/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java
index 67cd1a9..0bd1bfa 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/OptionRatioDistribution.java

@@ -87,7 +87,7 @@
             return new DelegateFactory(delegate.get(), divisor);
         if (defaultSpec == null)
             return null;
-        OptionRatioDistribution sub = new OptionRatioDistribution(delegate.prefix, null, null, true);
+        OptionRatioDistribution sub = new OptionRatioDistribution("", null, null, true);
         if (!sub.accept(defaultSpec))
             throw new IllegalStateException("Invalid default spec: " + defaultSpec);
         return sub.get();

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java
index 83f444c..c2f2591 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefined.java

@@ -51,7 +51,7 @@
         final SeedManager seeds = new SeedManager(settings);
         return new OpDistributionFactory()
         {
-            public OpDistribution get(Timing timing, int sampleCount)
+            public OpDistribution get(Timing timing, int sampleCount, boolean isWarmup)
             {
                 return new FixedOpDistribution(PredefinedOperation.operation(type, timing.newTimer(type.toString(), sampleCount),
                                                newGenerator(settings), seeds, settings, add));

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java
index 861b1a4..dd11452 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandPreDefinedMixed.java

@@ -55,7 +55,7 @@
         final SeedManager seeds = new SeedManager(settings);
         return new SampledOpDistributionFactory<Command>(ratios, clustering)
         {
-            protected List<? extends Operation> get(Timer timer, PartitionGenerator generator, Command key)
+            protected List<? extends Operation> get(Timer timer, PartitionGenerator generator, Command key, boolean isWarmup)
             {
                 return Collections.singletonList(PredefinedOperation.operation(key, timer, generator, seeds, settings, add));
             }

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java
index 8440e8e..36cbefe 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsCommandUser.java

@@ -33,6 +33,7 @@
 import org.apache.cassandra.stress.generate.DistributionFactory;
 import org.apache.cassandra.stress.generate.PartitionGenerator;
 import org.apache.cassandra.stress.generate.SeedManager;
+import org.apache.cassandra.stress.generate.TokenRangeIterator;
 import org.apache.cassandra.stress.operations.OpDistributionFactory;
 import org.apache.cassandra.stress.operations.SampledOpDistributionFactory;
 import org.apache.cassandra.stress.util.Timer;
@@ -69,15 +70,24 @@
     public OpDistributionFactory getFactory(final StressSettings settings)
     {
         final SeedManager seeds = new SeedManager(settings);
+        final TokenRangeIterator tokenRangeIterator = profile.tokenRangeQueries.isEmpty()
+                                                      ? null
+                                                      : new TokenRangeIterator(settings,
+                                                                               profile.maybeLoadTokenRanges(settings));
+
         return new SampledOpDistributionFactory<String>(ratios, clustering)
         {
-            protected List<? extends Operation> get(Timer timer, PartitionGenerator generator, String key)
+            protected List<? extends Operation> get(Timer timer, PartitionGenerator generator, String key, boolean isWarmup)
             {
                 if (key.equalsIgnoreCase("insert"))
                     return Collections.singletonList(profile.getInsert(timer, generator, seeds, settings));
                 if (key.equalsIgnoreCase("validate"))
                     return profile.getValidate(timer, generator, seeds, settings);
-                return Collections.singletonList(profile.getQuery(key, timer, generator, seeds, settings));
+
+                if (profile.tokenRangeQueries.containsKey(key))
+                    return Collections.singletonList(profile.getBulkReadQueries(key, timer, settings, tokenRangeIterator, isWarmup));
+
+                return Collections.singletonList(profile.getQuery(key, timer, generator, seeds, settings, isWarmup));
             }
 
             protected PartitionGenerator newGenerator()

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java
index a6c298b..e999e4b 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsInsert.java

@@ -37,6 +37,7 @@
     public final DistributionFactory visits;
     public final DistributionFactory batchsize;
     public final RatioDistributionFactory selectRatio;
+    public final RatioDistributionFactory rowPopulationRatio;
     public final BatchStatement.Type batchType;
 
     private SettingsInsert(InsertOptions options)
@@ -45,6 +46,10 @@
         this.revisit = options.revisit.get();
         this.batchsize = options.partitions.get();
         this.selectRatio = options.selectRatio.get();
+        this.rowPopulationRatio = options.rowPopulationRatio.get();
+
+
+
         this.batchType = !options.batchType.setByUser() ? null : BatchStatement.Type.valueOf(options.batchType.value());
     }
 
@@ -57,11 +62,12 @@
         final OptionDistribution partitions = new OptionDistribution("partitions=", null, "The number of partitions to update in a single batch", false);
         final OptionSimple batchType = new OptionSimple("batchtype=", "unlogged|logged|counter", null, "Specify the type of batch statement (LOGGED, UNLOGGED or COUNTER)", false);
         final OptionRatioDistribution selectRatio = new OptionRatioDistribution("select-ratio=", null, "The uniform probability of visiting any CQL row in the generated partition", false);
+        final OptionRatioDistribution rowPopulationRatio = new OptionRatioDistribution("row-population-ratio=", "fixed(1)/1", "The percent of a given rows columns to populate", false);
 
         @Override
         public List<? extends Option> options()
         {
-            return Arrays.asList(revisit, visits, partitions, batchType, selectRatio);
+            return Arrays.asList(revisit, visits, partitions, batchType, selectRatio, rowPopulationRatio);
         }
     }
 

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java
index 5735f9d..5334f25 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMisc.java

@@ -153,52 +153,35 @@
 
     public static Runnable helpHelpPrinter()
     {
-        return new Runnable()
-        {
-            @Override
-            public void run()
-            {
-                System.out.println("Usage: ./bin/cassandra-stress help <command|option>");
-                System.out.println("Commands:");
-                for (Command cmd : Command.values())
-                    System.out.println("    " + cmd.names.toString().replaceAll("\\[|\\]", ""));
-                System.out.println("Options:");
-                for (CliOption op : CliOption.values())
-                    System.out.println("    -" + op.toString().toLowerCase() + (op.extraName != null ? ", " + op.extraName : ""));
-            }
+        return () -> {
+            System.out.println("Usage: ./bin/cassandra-stress help <command|option>");
+            System.out.println("Commands:");
+            for (Command cmd : Command.values())
+                System.out.println("    " + cmd.names.toString().replaceAll("\\[|\\]", ""));
+            System.out.println("Options:");
+            for (CliOption op : CliOption.values())
+                System.out.println("    -" + op.toString().toLowerCase() + (op.extraName != null ? ", " + op.extraName : ""));
         };
     }
 
     public static Runnable printHelpPrinter()
     {
-        return new Runnable()
+        return () -> GroupedOptions.printOptions(System.out, "print", new GroupedOptions()
         {
             @Override
-            public void run()
+            public List<? extends Option> options()
             {
-                GroupedOptions.printOptions(System.out, "print", new GroupedOptions()
-                {
-                    @Override
-                    public List<? extends Option> options()
-                    {
-                        return Arrays.asList(new OptionDistribution("dist=", null, "A mathematical distribution"));
-                    }
-                });
+                return Arrays.asList(new OptionDistribution("dist=", null, "A mathematical distribution"));
             }
-        };
+        });
     }
 
     public static Runnable sendToDaemonHelpPrinter()
     {
-        return new Runnable()
-        {
-            @Override
-            public void run()
-            {
-                System.out.println("Usage: -sendToDaemon <host>");
-                System.out.println();
-                System.out.println("Specify a host running the stress server to send this stress command to");
-            }
+        return () -> {
+            System.out.println("Usage: -sendto <host>");
+            System.out.println();
+            System.out.println("Specify a host running the stress server to send this stress command to");
         };
     }
 
@@ -212,7 +195,7 @@
         if (params.length != 1)
         {
             sendToDaemonHelpPrinter().run();
-            System.out.println("Invalid -send-to specifier: " + Arrays.toString(params));
+            System.out.println("Invalid -sendto specifier: " + Arrays.toString(params));
             System.exit(1);
         }
         return params[0];

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java
index 2c91b6d..8f0ab25 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsMode.java

@@ -1,6 +1,6 @@
 package org.apache.cassandra.stress.settings;
 /*
- * 
+ *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -8,16 +8,16 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
- * 
+ *
  */
 
 
@@ -29,6 +29,7 @@
 import com.datastax.driver.core.AuthProvider;
 import com.datastax.driver.core.PlainTextAuthProvider;
 import com.datastax.driver.core.ProtocolOptions;
+import com.datastax.driver.core.ProtocolVersion;
 
 public class SettingsMode implements Serializable
 {
@@ -36,12 +37,16 @@
     public final ConnectionAPI api;
     public final ConnectionStyle style;
     public final CqlVersion cqlVersion;
+    public final ProtocolVersion protocolVersion;
 
     public final String username;
     public final String password;
     public final String authProviderClassname;
     public final AuthProvider authProvider;
 
+    public final Integer maxPendingPerConnection;
+    public final Integer connectionsPerHost;
+
     private final String compression;
 
     public SettingsMode(GroupedOptions options)
@@ -50,11 +55,16 @@
         {
             cqlVersion = CqlVersion.CQL3;
             Cql3Options opts = (Cql3Options) options;
+            protocolVersion = "NEWEST_SUPPORTED".equals(opts.protocolVersion.value())
+                    ? ProtocolVersion.NEWEST_SUPPORTED
+                    : ProtocolVersion.fromInt(Integer.parseInt(opts.protocolVersion.value()));
             api = opts.mode().displayPrefix.equals("native") ? ConnectionAPI.JAVA_DRIVER_NATIVE : ConnectionAPI.THRIFT;
             style = opts.useUnPrepared.setByUser() ? ConnectionStyle.CQL :  ConnectionStyle.CQL_PREPARED;
             compression = ProtocolOptions.Compression.valueOf(opts.useCompression.value().toUpperCase()).name();
             username = opts.user.value();
             password = opts.password.value();
+            maxPendingPerConnection = opts.maxPendingPerConnection.value().isEmpty() ? null : Integer.valueOf(opts.maxPendingPerConnection.value());
+            connectionsPerHost = opts.connectionsPerHost.value().isEmpty() ? null : Integer.valueOf(opts.connectionsPerHost.value());
             authProviderClassname = opts.authProvider.value();
             if (authProviderClassname != null)
             {
@@ -87,6 +97,7 @@
         {
             cqlVersion = CqlVersion.CQL3;
             Cql3SimpleNativeOptions opts = (Cql3SimpleNativeOptions) options;
+            protocolVersion = ProtocolVersion.NEWEST_SUPPORTED;
             api = ConnectionAPI.SIMPLE_NATIVE;
             style = opts.usePrepared.setByUser() ? ConnectionStyle.CQL_PREPARED : ConnectionStyle.CQL;
             compression = ProtocolOptions.Compression.NONE.name();
@@ -94,10 +105,13 @@
             password = null;
             authProvider = null;
             authProviderClassname = null;
+            maxPendingPerConnection = null;
+            connectionsPerHost = null;
         }
         else if (options instanceof ThriftOptions)
         {
             ThriftOptions opts = (ThriftOptions) options;
+            protocolVersion = ProtocolVersion.NEWEST_SUPPORTED;
             cqlVersion = CqlVersion.NOCQL;
             api = opts.smart.setByUser() ? ConnectionAPI.THRIFT_SMART : ConnectionAPI.THRIFT;
             style = ConnectionStyle.THRIFT;
@@ -106,6 +120,8 @@
             password = opts.password.value();
             authProviderClassname = null;
             authProvider = null;
+            maxPendingPerConnection = null;
+            connectionsPerHost = null;
         }
         else
             throw new IllegalStateException();
@@ -139,18 +155,22 @@
     private static abstract class Cql3Options extends GroupedOptions
     {
         final OptionSimple api = new OptionSimple("cql3", "", null, "", true);
+        final OptionSimple protocolVersion = new OptionSimple("protocolVersion=", "[2-4]+", "NEWEST_SUPPORTED", "CQL Protocol Version", false);
         final OptionSimple useUnPrepared = new OptionSimple("unprepared", "", null, "force use of unprepared statements", false);
         final OptionSimple useCompression = new OptionSimple("compression=", "none|lz4|snappy", "none", "", false);
         final OptionSimple port = new OptionSimple("port=", "[0-9]+", "9046", "", false);
         final OptionSimple user = new OptionSimple("user=", ".+", null, "username", false);
         final OptionSimple password = new OptionSimple("password=", ".+", null, "password", false);
         final OptionSimple authProvider = new OptionSimple("auth-provider=", ".*", null, "Fully qualified implementation of com.datastax.driver.core.AuthProvider", false);
+        final OptionSimple maxPendingPerConnection = new OptionSimple("maxPending=", "[0-9]+", "", "Maximum pending requests per connection", false);
+        final OptionSimple connectionsPerHost = new OptionSimple("connectionsPerHost=", "[0-9]+", "", "Number of connections per host", false);
 
         abstract OptionSimple mode();
         @Override
         public List<? extends Option> options()
         {
-            return Arrays.asList(mode(), useUnPrepared, api, useCompression, port, user, password, authProvider);
+            return Arrays.asList(mode(), useUnPrepared, api, useCompression, port, user, password, authProvider,
+                                 maxPendingPerConnection, connectionsPerHost, protocolVersion);
         }
     }
 

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTokenRange.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTokenRange.java
new file mode 100644
index 0000000..8fb0048
--- /dev/null
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTokenRange.java

@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.stress.settings;
+
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.primitives.Ints;
+
+public class SettingsTokenRange
+{
+    public final boolean wrap;
+    public final int splitFactor;
+
+    public SettingsTokenRange(TokenRangeOptions options)
+    {
+        this.wrap = options.wrap.setByUser();
+        this.splitFactor = Ints.checkedCast(OptionDistribution.parseLong(options.splitFactor.value()));
+    }
+
+    private static final class TokenRangeOptions extends GroupedOptions
+    {
+        final OptionSimple wrap = new OptionSimple("wrap", "", null, "Re-use token ranges in order to terminate stress iterations", false);
+        final OptionSimple splitFactor = new OptionSimple("split-factor=", "[0-9]+[bmk]?", "1", "Split every token range by this factor", false);
+
+
+        @Override
+        public List<? extends Option> options()
+        {
+            return ImmutableList.<Option>builder().add(wrap, splitFactor).build();
+        }
+    }
+
+    public static SettingsTokenRange get(Map<String, String[]> clArgs)
+    {
+        String[] params = clArgs.remove("-tokenrange");
+        if (params == null)
+        {
+            return new SettingsTokenRange(new TokenRangeOptions());
+        }
+        TokenRangeOptions options = GroupedOptions.select(params, new TokenRangeOptions());
+        if (options == null)
+        {
+            printHelp();
+            System.out.println("Invalid -tokenrange options provided, see output for valid options");
+            System.exit(1);
+        }
+        return new SettingsTokenRange(options);
+    }
+
+    public static void printHelp()
+    {
+        GroupedOptions.printOptions(System.out, "-tokenrange", new TokenRangeOptions());
+    }
+
+    public static Runnable helpPrinter()
+    {
+        return SettingsTokenRange::printHelp;
+    }
+}

diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java b/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java
index 335ca92..5b1f861 100644
--- a/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java
+++ b/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java

@@ -55,7 +55,8 @@
     public final SettingsPort port;
     public final String sendToDaemon;
 
-    public StressSettings(SettingsCommand command, SettingsRate rate, SettingsPopulation generate, SettingsInsert insert, SettingsColumn columns, SettingsSamples samples, SettingsErrors errors, SettingsLog log, SettingsMode mode, SettingsNode node, SettingsSchema schema, SettingsTransport transport, SettingsPort port, String sendToDaemon)
+    public final SettingsTokenRange tokenRange;
+    public StressSettings(SettingsCommand command, SettingsRate rate, SettingsPopulation generate, SettingsInsert insert, SettingsColumn columns, SettingsSamples samples, SettingsErrors errors, SettingsLog log, SettingsMode mode, SettingsNode node, SettingsSchema schema, SettingsTransport transport, SettingsPort port, String sendToDaemon, SettingsTokenRange tokenRange)
     {
         this.command = command;
         this.rate = rate;
@@ -71,6 +72,7 @@
         this.transport = transport;
         this.port = port;
         this.sendToDaemon = sendToDaemon;
+        this.tokenRange = tokenRange;
     }
 
     private SmartThriftClient tclient;
@@ -253,6 +255,7 @@
         SettingsPort port = SettingsPort.get(clArgs);
         SettingsRate rate = SettingsRate.get(clArgs, command);
         SettingsPopulation generate = SettingsPopulation.get(clArgs, command);
+        SettingsTokenRange tokenRange = SettingsTokenRange.get(clArgs);
         SettingsInsert insert = SettingsInsert.get(clArgs);
         SettingsColumn columns = SettingsColumn.get(clArgs);
         SettingsSamples samples = SettingsSamples.get(clArgs);
@@ -278,7 +281,7 @@
             }
             System.exit(1);
         }
-        return new StressSettings(command, rate, generate, insert, columns, samples, errors, log, mode, node, schema, transport, port, sendToDaemon);
+        return new StressSettings(command, rate, generate, insert, columns, samples, errors, log, mode, node, schema, transport, port, sendToDaemon, tokenRange);
     }
 
     private static Map<String, String[]> parseMap(String[] args)

diff --git a/tools/stress/src/org/apache/cassandra/stress/util/DynamicList.java b/tools/stress/src/org/apache/cassandra/stress/util/DynamicList.java
deleted file mode 100644
index ee04063..0000000
--- a/tools/stress/src/org/apache/cassandra/stress/util/DynamicList.java
+++ /dev/null

@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.stress.util;
-
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.TreeSet;
-import java.util.concurrent.ThreadLocalRandom;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-
-import org.apache.cassandra.stress.generate.FasterRandom;
-
-// simple thread-unsafe skiplist that permits indexing/removal by position, insertion at the end
-// (though easily extended to insertion at any position, not necessary here)
-// we use it for sampling items by position for visiting writes in the pool of pending writes
-public class DynamicList<E>
-{
-
-    // represents a value and an index simultaneously; each node maintains a list
-    // of next pointers for each height in the skip-list this node participates in
-    // (a contiguous range from [0..height))
-    public static class Node<E>
-    {
-        // stores the size of each descendant
-        private final int[] size;
-        // TODO: alternate links to save space
-        private final Node<E>[] links;
-        private E value;
-
-        private Node(int height, E value)
-        {
-            this.value = value;
-            links = new Node[height * 2];
-            size = new int[height];
-            Arrays.fill(size, 1);
-        }
-
-        private int height()
-        {
-            return size.length;
-        }
-
-        private Node<E> next(int i)
-        {
-            return links[i * 2];
-        }
-
-        private Node<E> prev(int i)
-        {
-            return links[1 + i * 2];
-        }
-
-        private void setNext(int i, Node<E> next)
-        {
-            links[i * 2] = next;
-        }
-
-        private void setPrev(int i, Node<E> prev)
-        {
-            links[1 + i * 2] = prev;
-        }
-
-        private Node parent(int parentHeight)
-        {
-            Node prev = this;
-            while (true)
-            {
-                int height = prev.height();
-                if (parentHeight < height)
-                    return prev;
-                prev = prev.prev(height - 1);
-            }
-        }
-    }
-
-    private final ReadWriteLock lock = new ReentrantReadWriteLock();
-    private final int maxHeight;
-    private final Node<E> head;
-    private int size;
-
-    public DynamicList(int maxExpectedSize)
-    {
-        this.maxHeight = 3 + Math.max(0, (int) Math.ceil(Math.log(maxExpectedSize) / Math.log(2)));
-        head = new Node<>(maxHeight, null);
-    }
-
-    private int randomLevel()
-    {
-        return 1 + Integer.bitCount(ThreadLocalRandom.current().nextInt() & ((1 << (maxHeight - 1)) - 1));
-    }
-
-    public Node<E> append(E value)
-    {
-        return append(value, Integer.MAX_VALUE);
-    }
-
-    // add the value to the end of the list, and return the associated Node that permits efficient removal
-    // regardless of its future position in the list from other modifications
-    public Node<E> append(E value, int maxSize)
-    {
-        Node<E> newTail = new Node<>(randomLevel(), value);
-
-        lock.writeLock().lock();
-        try
-        {
-            if (size >= maxSize)
-                return null;
-            size++;
-
-            Node<E> tail = head;
-            for (int i = maxHeight - 1 ; i >= newTail.height() ; i--)
-            {
-                Node<E> next;
-                while ((next = tail.next(i)) != null)
-                    tail = next;
-                tail.size[i]++;
-            }
-
-            for (int i = newTail.height() - 1 ; i >= 0 ; i--)
-            {
-                Node<E> next;
-                while ((next = tail.next(i)) != null)
-                    tail = next;
-                tail.setNext(i, newTail);
-                newTail.setPrev(i, tail);
-            }
-
-            return newTail;
-        }
-        finally
-        {
-            lock.writeLock().unlock();
-        }
-    }
-
-    // remove the provided node and its associated value from the list
-    public void remove(Node<E> node)
-    {
-        lock.writeLock().lock();
-        try
-        {
-            assert node.value != null;
-            node.value = null;
-
-            size--;
-
-            // go up through each level in the skip list, unlinking this node; this entails
-            // simply linking each neighbour to each other, and appending the size of the
-            // current level owned by this node's index to the preceding neighbour (since
-            // ownership is defined as any node that you must visit through the index,
-            // removal of ourselves from a level means the preceding index entry is the
-            // entry point to all of the removed node's descendants)
-            for (int i = 0 ; i < node.height() ; i++)
-            {
-                Node<E> prev = node.prev(i);
-                Node<E> next = node.next(i);
-                assert prev != null;
-                prev.setNext(i, next);
-                if (next != null)
-                    next.setPrev(i, prev);
-                prev.size[i] += node.size[i] - 1;
-            }
-
-            // then go up the levels, removing 1 from the size at each height above ours
-            for (int i = node.height() ; i < maxHeight ; i++)
-            {
-                // if we're at our height limit, we backtrack at our top level until we
-                // hit a neighbour with a greater height
-                while (i == node.height())
-                    node = node.prev(i - 1);
-                node.size[i]--;
-            }
-        }
-        finally
-        {
-            lock.writeLock().unlock();
-        }
-    }
-
-    // retrieve the item at the provided index, or return null if the index is past the end of the list
-    public E get(int index)
-    {
-        lock.readLock().lock();
-        try
-        {
-            if (index >= size)
-                return null;
-
-            index++;
-            int c = 0;
-            Node<E> finger = head;
-            for (int i = maxHeight - 1 ; i >= 0 ; i--)
-            {
-                while (c + finger.size[i] <= index)
-                {
-                    c += finger.size[i];
-                    finger = finger.next(i);
-                }
-            }
-
-            assert c == index;
-            return finger.value;
-        }
-        finally
-        {
-            lock.readLock().unlock();
-        }
-    }
-
-    // some quick and dirty tests to confirm the skiplist works as intended
-    // don't create a separate unit test - tools tree doesn't currently warrant them
-
-    private boolean isWellFormed()
-    {
-        for (int i = 0 ; i < maxHeight ; i++)
-        {
-            int c = 0;
-            for (Node node = head ; node != null ; node = node.next(i))
-            {
-                if (node.prev(i) != null && node.prev(i).next(i) != node)
-                    return false;
-                if (node.next(i) != null && node.next(i).prev(i) != node)
-                    return false;
-                c += node.size[i];
-                if (i + 1 < maxHeight && node.parent(i + 1).next(i + 1) == node.next(i))
-                {
-                    if (node.parent(i + 1).size[i + 1] != c)
-                        return false;
-                    c = 0;
-                }
-            }
-            if (i == maxHeight - 1 && c != size + 1)
-                return false;
-        }
-        return true;
-    }
-
-    public static void main(String[] args)
-    {
-        DynamicList<Integer> list = new DynamicList<>(20);
-        TreeSet<Integer> canon = new TreeSet<>();
-        HashMap<Integer, Node> nodes = new HashMap<>();
-        int c = 0;
-        for (int i = 0 ; i < 100000 ; i++)
-        {
-            nodes.put(c, list.append(c));
-            canon.add(c);
-            c++;
-        }
-        FasterRandom rand = new FasterRandom();
-        assert list.isWellFormed();
-        for (int loop = 0 ; loop < 100 ; loop++)
-        {
-            System.out.println(loop);
-            for (int i = 0 ; i < 100000 ; i++)
-            {
-                int index = rand.nextInt(100000);
-                Integer seed = list.get(index);
-//                assert canon.headSet(seed, false).size() == index;
-                list.remove(nodes.remove(seed));
-                canon.remove(seed);
-                nodes.put(c, list.append(c));
-                canon.add(c);
-                c++;
-            }
-            assert list.isWellFormed();
-        }
-    }
-
-}

diff --git a/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java b/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java
index 30d0d4a..4f173b4 100644
--- a/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java
+++ b/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java

@@ -44,7 +44,10 @@
     public final String username;
     public final String password;
     public final AuthProvider authProvider;
+    public final int maxPendingPerConnection;
+    public final int connectionsPerHost;
 
+    private final ProtocolVersion protocolVersion;
     private final EncryptionOptions.ClientEncryptionOptions encryptionOptions;
     private Cluster cluster;
     private Session session;
@@ -59,6 +62,7 @@
 
     public JavaDriverClient(StressSettings settings, String host, int port, EncryptionOptions.ClientEncryptionOptions encryptionOptions)
     {
+        this.protocolVersion = settings.mode.protocolVersion;
         this.host = host;
         this.port = port;
         this.username = settings.mode.username;
@@ -66,9 +70,22 @@
         this.authProvider = settings.mode.authProvider;
         this.encryptionOptions = encryptionOptions;
         if (settings.node.isWhiteList)
-            whitelist = new WhiteListPolicy(new DCAwareRoundRobinPolicy(), settings.node.resolveAll(settings.port.nativePort));
+            whitelist = new WhiteListPolicy(DCAwareRoundRobinPolicy.builder().build(), settings.node.resolveAll(settings.port.nativePort));
         else
             whitelist = null;
+        connectionsPerHost = settings.mode.connectionsPerHost == null ? 8 : settings.mode.connectionsPerHost;
+
+        int maxThreadCount = 0;
+        if (settings.rate.auto)
+            maxThreadCount = settings.rate.maxThreads;
+        else
+            maxThreadCount = settings.rate.threadCount;
+
+        //Always allow enough pending requests so every thread can have a request pending
+        //See https://issues.apache.org/jira/browse/CASSANDRA-7217
+        int requestsPerConnection = (maxThreadCount / connectionsPerHost) + connectionsPerHost;
+
+        maxPendingPerConnection = settings.mode.maxPendingPerConnection == null ? Math.max(128, requestsPerConnection ) : settings.mode.maxPendingPerConnection;
     }
 
     public PreparedStatement prepare(String query)
@@ -89,13 +106,18 @@
 
     public void connect(ProtocolOptions.Compression compression) throws Exception
     {
-        PoolingOptions poolingOpts = new PoolingOptions();
-        poolingOpts.setCoreConnectionsPerHost(HostDistance.LOCAL, 8);
+
+        PoolingOptions poolingOpts = new PoolingOptions()
+                                     .setConnectionsPerHost(HostDistance.LOCAL, connectionsPerHost, connectionsPerHost)
+                                     .setMaxRequestsPerConnection(HostDistance.LOCAL, maxPendingPerConnection)
+                                     .setNewConnectionThreshold(HostDistance.LOCAL, 100);
+
         Cluster.Builder clusterBuilder = Cluster.builder()
                                                 .addContactPoint(host)
                                                 .withPort(port)
                                                 .withPoolingOptions(poolingOpts)
-                                                .withProtocolVersion(ProtocolVersion.NEWEST_SUPPORTED)
+                                                .withoutJMXReporting()
+                                                .withProtocolVersion(protocolVersion)
                                                 .withoutMetrics(); // The driver uses metrics 3 with conflict with our version
         if (whitelist != null)
             clusterBuilder.withLoadBalancingPolicy(whitelist);
@@ -104,7 +126,9 @@
         {
             SSLContext sslContext;
             sslContext = SSLFactory.createSSLContext(encryptionOptions, true);
-            SSLOptions sslOptions = new SSLOptions(sslContext, encryptionOptions.cipher_suites);
+            SSLOptions sslOptions = JdkSSLOptions.builder()
+                                                 .withSSLContext(sslContext)
+                                                 .withCipherSuites(encryptionOptions.cipher_suites).build();
             clusterBuilder.withSSL(sslOptions);
         }
 
@@ -119,8 +143,11 @@
 
         cluster = clusterBuilder.build();
         Metadata metadata = cluster.getMetadata();
-        System.out.printf("Connected to cluster: %s%n",
-                metadata.getClusterName());
+        System.out.printf(
+                "Connected to cluster: %s, max pending requests per connection %d, max connections per host %d%n",
+                metadata.getClusterName(),
+                maxPendingPerConnection,
+                connectionsPerHost);
         for (Host host : metadata.getAllHosts())
         {
             System.out.printf("Datatacenter: %s; Host: %s; Rack: %s%n",

diff --git a/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java b/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java
index f813e93..bb5f4c0 100644
--- a/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java
+++ b/tools/stress/src/org/apache/cassandra/stress/util/SimpleThriftClient.java

@@ -44,43 +44,43 @@
     }
 
     @Override
-    public List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws TException
     {
         return client.get_slice(key, column_parent, predicate, consistency_level);
     }
 
     @Override
-    public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws TException
     {
         return client.get_indexed_slices(column_parent, index_clause, column_predicate, consistency_level);
     }
 
     @Override
-    public List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws TException
     {
         return client.get_range_slices(column_parent, predicate, range, consistency_level);
     }
 
     @Override
-    public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws TException
     {
         return client.multiget_slice(keys, column_parent, predicate, consistency_level);
     }
 
     @Override
-    public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws TException
     {
         client.insert(key, column_parent, column, consistency_level);
     }
 
     @Override
-    public Integer prepare_cql3_query(String query, Compression compression) throws InvalidRequestException, TException
+    public Integer prepare_cql3_query(String query, Compression compression) throws TException
     {
         return client.prepare_cql3_query(ByteBufferUtil.bytes(query), compression).itemId;
     }
 
     @Override
-    public CqlResult execute_prepared_cql_query(int itemId, ByteBuffer key, List<ByteBuffer> values) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_prepared_cql_query(int itemId, ByteBuffer key, List<ByteBuffer> values) throws TException
     {
         return client.execute_prepared_cql_query(itemId, values);
     }
@@ -92,19 +92,19 @@
     }
 
     @Override
-    public CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws TException
     {
         return client.execute_cql3_query(ByteBufferUtil.bytes(query), compression, consistency);
     }
 
     @Override
-    public CqlResult execute_prepared_cql3_query(int itemId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_prepared_cql3_query(int itemId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws TException
     {
         return client.execute_prepared_cql3_query(itemId, values, consistency);
     }
 
     @Override
-    public CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws TException
     {
         return client.execute_cql_query(ByteBufferUtil.bytes(query), compression);
     }

diff --git a/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java b/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java
index 178da64..babbd7a 100644
--- a/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java
+++ b/tools/stress/src/org/apache/cassandra/stress/util/SmartThriftClient.java

@@ -172,7 +172,7 @@
     }
 
     @Override
-    public List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent parent, SlicePredicate predicate, ConsistencyLevel consistencyLevel) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public List<ColumnOrSuperColumn> get_slice(ByteBuffer key, ColumnParent parent, SlicePredicate predicate, ConsistencyLevel consistencyLevel) throws TException
     {
         Client client = get(key);
         try
@@ -185,7 +185,7 @@
     }
 
     @Override
-    public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public void insert(ByteBuffer key, ColumnParent column_parent, Column column, ConsistencyLevel consistency_level) throws TException
     {
         Client client = get(key);
         try
@@ -198,7 +198,7 @@
     }
 
     @Override
-    public CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_cql_query(String query, ByteBuffer key, Compression compression) throws TException
     {
         Client client = get(key);
         try
@@ -211,7 +211,7 @@
     }
 
     @Override
-    public CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_cql3_query(String query, ByteBuffer key, Compression compression, ConsistencyLevel consistency) throws TException
     {
         Client client = get(key);
         try
@@ -224,13 +224,13 @@
     }
 
     @Override
-    public Integer prepare_cql3_query(String query, Compression compression) throws InvalidRequestException, TException
+    public Integer prepare_cql3_query(String query, Compression compression) throws TException
     {
         return getId(query);
     }
 
     @Override
-    public CqlResult execute_prepared_cql3_query(int queryId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_prepared_cql3_query(int queryId, ByteBuffer key, List<ByteBuffer> values, ConsistencyLevel consistency) throws TException
     {
         Client client = get(key);
         try
@@ -243,13 +243,13 @@
     }
 
     @Override
-    public Integer prepare_cql_query(String query, Compression compression) throws InvalidRequestException, TException
+    public Integer prepare_cql_query(String query, Compression compression) throws TException
     {
         return getId(query);
     }
 
     @Override
-    public CqlResult execute_prepared_cql_query(int queryId, ByteBuffer key, List<ByteBuffer> values) throws InvalidRequestException, UnavailableException, TimedOutException, SchemaDisagreementException, TException
+    public CqlResult execute_prepared_cql_query(int queryId, ByteBuffer key, List<ByteBuffer> values) throws TException
     {
         Client client = get(key);
         try
@@ -262,19 +262,19 @@
     }
 
     @Override
-    public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public Map<ByteBuffer, List<ColumnOrSuperColumn>> multiget_slice(List<ByteBuffer> keys, ColumnParent column_parent, SlicePredicate predicate, ConsistencyLevel consistency_level) throws TException
     {
         throw new UnsupportedOperationException();
     }
 
     @Override
-    public List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public List<KeySlice> get_range_slices(ColumnParent column_parent, SlicePredicate predicate, KeyRange range, ConsistencyLevel consistency_level) throws TException
     {
         throw new UnsupportedOperationException();
     }
 
     @Override
-    public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws InvalidRequestException, UnavailableException, TimedOutException, TException
+    public List<KeySlice> get_indexed_slices(ColumnParent column_parent, IndexClause index_clause, SlicePredicate column_predicate, ConsistencyLevel consistency_level) throws TException
     {
         throw new UnsupportedOperationException();
     }
commit	e8745effa0140202684f6c7f4fc6e5c18d96b295	[log] [tgz]
author	Mick Semb Wever <mck@apache.org>	Tue Jul 14 22:06:20 2020 +0200
committer	Mick Semb Wever <mck@apache.org>	Tue Jul 14 22:06:20 2020 +0200
tree	e69733855b6f8a82eb7c3ceec294772ae4d51a30
parent	4f11cb9feebdb0017de1e1063cbf72b840567518 [diff]
parent	cd006d275aa9b6e937c6ebd036d4d27c4ed18dbe [diff]